diff --git "a/plbart_fmft_official_1e-05/checkpoint-117744/trainer_state.json" "b/plbart_fmft_official_1e-05/checkpoint-117744/trainer_state.json" new file mode 100644--- /dev/null +++ "b/plbart_fmft_official_1e-05/checkpoint-117744/trainer_state.json" @@ -0,0 +1,165045 @@ +{ + "best_metric": 0.017746460291639673, + "best_model_checkpoint": "./results-cc/plbart/plbart_fmft_official_1e-05/checkpoint-103026", + "epoch": 8.0, + "eval_steps": 500, + "global_step": 117744, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003397200706617747, + "grad_norm": 148.53599548339844, + "learning_rate": 9.99983013996467e-06, + "loss": 12.9902, + "step": 5 + }, + { + "epoch": 0.0006794401413235494, + "grad_norm": 28.510236740112305, + "learning_rate": 9.999405489876343e-06, + "loss": 8.4051, + "step": 10 + }, + { + "epoch": 0.0010191602119853241, + "grad_norm": 14.12374496459961, + "learning_rate": 9.998980839788016e-06, + "loss": 7.2869, + "step": 15 + }, + { + "epoch": 0.001358880282647099, + "grad_norm": 15.21195125579834, + "learning_rate": 9.998556189699689e-06, + "loss": 6.4182, + "step": 20 + }, + { + "epoch": 0.0016986003533088735, + "grad_norm": 20.368452072143555, + "learning_rate": 9.998131539611361e-06, + "loss": 6.092, + "step": 25 + }, + { + "epoch": 0.0020383204239706482, + "grad_norm": 16.919343948364258, + "learning_rate": 9.997706889523033e-06, + "loss": 5.5361, + "step": 30 + }, + { + "epoch": 0.002378040494632423, + "grad_norm": 18.563220977783203, + "learning_rate": 9.997282239434707e-06, + "loss": 5.0504, + "step": 35 + }, + { + "epoch": 0.002717760565294198, + "grad_norm": 11.689538955688477, + "learning_rate": 9.99685758934638e-06, + "loss": 4.7017, + "step": 40 + }, + { + "epoch": 0.0030574806359559724, + "grad_norm": 13.993334770202637, + "learning_rate": 9.996432939258051e-06, + "loss": 4.5594, + "step": 45 + }, + { + "epoch": 0.003397200706617747, + "grad_norm": 33.61357498168945, + "learning_rate": 9.996008289169725e-06, + "loss": 4.5109, + "step": 50 + }, + { + "epoch": 0.0037369207772795215, + "grad_norm": 12.810786247253418, + "learning_rate": 9.995583639081398e-06, + "loss": 4.2455, + "step": 55 + }, + { + "epoch": 0.0040766408479412965, + "grad_norm": 19.113449096679688, + "learning_rate": 9.99515898899307e-06, + "loss": 4.1021, + "step": 60 + }, + { + "epoch": 0.0044163609186030715, + "grad_norm": 8.351461410522461, + "learning_rate": 9.994734338904744e-06, + "loss": 4.062, + "step": 65 + }, + { + "epoch": 0.004756080989264846, + "grad_norm": 8.440468788146973, + "learning_rate": 9.994309688816417e-06, + "loss": 4.222, + "step": 70 + }, + { + "epoch": 0.005095801059926621, + "grad_norm": 9.2280855178833, + "learning_rate": 9.993885038728088e-06, + "loss": 3.8221, + "step": 75 + }, + { + "epoch": 0.005435521130588396, + "grad_norm": 9.78824234008789, + "learning_rate": 9.993460388639762e-06, + "loss": 3.7984, + "step": 80 + }, + { + "epoch": 0.00577524120125017, + "grad_norm": 7.302383899688721, + "learning_rate": 9.993035738551435e-06, + "loss": 3.7646, + "step": 85 + }, + { + "epoch": 0.006114961271911945, + "grad_norm": 10.201916694641113, + "learning_rate": 9.992611088463106e-06, + "loss": 3.8794, + "step": 90 + }, + { + "epoch": 0.006454681342573719, + "grad_norm": 7.366273403167725, + "learning_rate": 9.99218643837478e-06, + "loss": 4.0858, + "step": 95 + }, + { + "epoch": 0.006794401413235494, + "grad_norm": 9.170772552490234, + "learning_rate": 9.991761788286452e-06, + "loss": 3.9636, + "step": 100 + }, + { + "epoch": 0.007134121483897269, + "grad_norm": 6.835070610046387, + "learning_rate": 9.991337138198125e-06, + "loss": 3.6305, + "step": 105 + }, + { + "epoch": 0.007473841554559043, + "grad_norm": 11.587068557739258, + "learning_rate": 9.990912488109799e-06, + "loss": 3.9685, + "step": 110 + }, + { + "epoch": 0.007813561625220818, + "grad_norm": 6.694929122924805, + "learning_rate": 9.99048783802147e-06, + "loss": 3.9153, + "step": 115 + }, + { + "epoch": 0.008153281695882593, + "grad_norm": 10.825115203857422, + "learning_rate": 9.990063187933145e-06, + "loss": 4.1538, + "step": 120 + }, + { + "epoch": 0.008493001766544368, + "grad_norm": 7.728686332702637, + "learning_rate": 9.989638537844817e-06, + "loss": 3.9094, + "step": 125 + }, + { + "epoch": 0.008832721837206143, + "grad_norm": 10.31328296661377, + "learning_rate": 9.989213887756489e-06, + "loss": 3.8337, + "step": 130 + }, + { + "epoch": 0.009172441907867916, + "grad_norm": 10.25274658203125, + "learning_rate": 9.988789237668163e-06, + "loss": 3.7996, + "step": 135 + }, + { + "epoch": 0.009512161978529691, + "grad_norm": 19.4798641204834, + "learning_rate": 9.988364587579836e-06, + "loss": 3.7268, + "step": 140 + }, + { + "epoch": 0.009851882049191466, + "grad_norm": 7.912703037261963, + "learning_rate": 9.987939937491507e-06, + "loss": 3.6443, + "step": 145 + }, + { + "epoch": 0.010191602119853241, + "grad_norm": 9.14289665222168, + "learning_rate": 9.987515287403181e-06, + "loss": 3.8804, + "step": 150 + }, + { + "epoch": 0.010531322190515016, + "grad_norm": 13.008984565734863, + "learning_rate": 9.987090637314854e-06, + "loss": 3.8672, + "step": 155 + }, + { + "epoch": 0.010871042261176791, + "grad_norm": 10.258288383483887, + "learning_rate": 9.986665987226525e-06, + "loss": 3.8951, + "step": 160 + }, + { + "epoch": 0.011210762331838564, + "grad_norm": 7.645933151245117, + "learning_rate": 9.9862413371382e-06, + "loss": 3.7634, + "step": 165 + }, + { + "epoch": 0.01155048240250034, + "grad_norm": 8.420952796936035, + "learning_rate": 9.985816687049871e-06, + "loss": 3.8743, + "step": 170 + }, + { + "epoch": 0.011890202473162114, + "grad_norm": 7.853847026824951, + "learning_rate": 9.985392036961544e-06, + "loss": 3.7343, + "step": 175 + }, + { + "epoch": 0.01222992254382389, + "grad_norm": 9.838406562805176, + "learning_rate": 9.984967386873218e-06, + "loss": 3.9157, + "step": 180 + }, + { + "epoch": 0.012569642614485664, + "grad_norm": 13.414801597595215, + "learning_rate": 9.98454273678489e-06, + "loss": 3.8457, + "step": 185 + }, + { + "epoch": 0.012909362685147438, + "grad_norm": 7.718922138214111, + "learning_rate": 9.984118086696562e-06, + "loss": 3.7401, + "step": 190 + }, + { + "epoch": 0.013249082755809213, + "grad_norm": 8.71448040008545, + "learning_rate": 9.983693436608237e-06, + "loss": 3.9705, + "step": 195 + }, + { + "epoch": 0.013588802826470988, + "grad_norm": 8.514265060424805, + "learning_rate": 9.983268786519908e-06, + "loss": 3.8351, + "step": 200 + }, + { + "epoch": 0.013928522897132763, + "grad_norm": 7.789298057556152, + "learning_rate": 9.98284413643158e-06, + "loss": 3.8753, + "step": 205 + }, + { + "epoch": 0.014268242967794538, + "grad_norm": 7.6094746589660645, + "learning_rate": 9.982419486343255e-06, + "loss": 3.6475, + "step": 210 + }, + { + "epoch": 0.014607963038456313, + "grad_norm": 6.159725189208984, + "learning_rate": 9.981994836254926e-06, + "loss": 3.6997, + "step": 215 + }, + { + "epoch": 0.014947683109118086, + "grad_norm": 6.749081611633301, + "learning_rate": 9.981570186166599e-06, + "loss": 3.84, + "step": 220 + }, + { + "epoch": 0.015287403179779861, + "grad_norm": 7.618193626403809, + "learning_rate": 9.981145536078274e-06, + "loss": 3.686, + "step": 225 + }, + { + "epoch": 0.015627123250441636, + "grad_norm": 6.524477005004883, + "learning_rate": 9.980720885989945e-06, + "loss": 3.739, + "step": 230 + }, + { + "epoch": 0.01596684332110341, + "grad_norm": 7.232755661010742, + "learning_rate": 9.980296235901617e-06, + "loss": 3.7328, + "step": 235 + }, + { + "epoch": 0.016306563391765186, + "grad_norm": 8.607287406921387, + "learning_rate": 9.979871585813292e-06, + "loss": 3.5291, + "step": 240 + }, + { + "epoch": 0.01664628346242696, + "grad_norm": 7.676520347595215, + "learning_rate": 9.979446935724963e-06, + "loss": 3.6354, + "step": 245 + }, + { + "epoch": 0.016986003533088736, + "grad_norm": 6.363873481750488, + "learning_rate": 9.979022285636636e-06, + "loss": 3.78, + "step": 250 + }, + { + "epoch": 0.01732572360375051, + "grad_norm": 9.055436134338379, + "learning_rate": 9.978597635548309e-06, + "loss": 3.6249, + "step": 255 + }, + { + "epoch": 0.017665443674412286, + "grad_norm": 10.74278450012207, + "learning_rate": 9.978172985459981e-06, + "loss": 3.9546, + "step": 260 + }, + { + "epoch": 0.01800516374507406, + "grad_norm": 7.796395778656006, + "learning_rate": 9.977748335371654e-06, + "loss": 3.9823, + "step": 265 + }, + { + "epoch": 0.018344883815735832, + "grad_norm": 9.671547889709473, + "learning_rate": 9.977323685283327e-06, + "loss": 3.6681, + "step": 270 + }, + { + "epoch": 0.01868460388639761, + "grad_norm": 6.77473783493042, + "learning_rate": 9.976899035195e-06, + "loss": 3.5885, + "step": 275 + }, + { + "epoch": 0.019024323957059382, + "grad_norm": 6.752045631408691, + "learning_rate": 9.976474385106673e-06, + "loss": 3.6998, + "step": 280 + }, + { + "epoch": 0.01936404402772116, + "grad_norm": 5.650535583496094, + "learning_rate": 9.976049735018345e-06, + "loss": 3.7031, + "step": 285 + }, + { + "epoch": 0.019703764098382932, + "grad_norm": 6.8793182373046875, + "learning_rate": 9.975625084930018e-06, + "loss": 3.8127, + "step": 290 + }, + { + "epoch": 0.020043484169044706, + "grad_norm": 6.416210174560547, + "learning_rate": 9.975200434841691e-06, + "loss": 3.6807, + "step": 295 + }, + { + "epoch": 0.020383204239706482, + "grad_norm": 7.304996013641357, + "learning_rate": 9.974775784753364e-06, + "loss": 3.4921, + "step": 300 + }, + { + "epoch": 0.020722924310368256, + "grad_norm": 10.484519004821777, + "learning_rate": 9.974351134665037e-06, + "loss": 3.721, + "step": 305 + }, + { + "epoch": 0.021062644381030032, + "grad_norm": 7.492562294006348, + "learning_rate": 9.97392648457671e-06, + "loss": 3.4944, + "step": 310 + }, + { + "epoch": 0.021402364451691806, + "grad_norm": 7.586320400238037, + "learning_rate": 9.973501834488382e-06, + "loss": 3.9201, + "step": 315 + }, + { + "epoch": 0.021742084522353582, + "grad_norm": 9.695868492126465, + "learning_rate": 9.973077184400055e-06, + "loss": 3.6991, + "step": 320 + }, + { + "epoch": 0.022081804593015356, + "grad_norm": 7.968296527862549, + "learning_rate": 9.972652534311728e-06, + "loss": 3.5408, + "step": 325 + }, + { + "epoch": 0.02242152466367713, + "grad_norm": 6.526900768280029, + "learning_rate": 9.9722278842234e-06, + "loss": 3.3796, + "step": 330 + }, + { + "epoch": 0.022761244734338906, + "grad_norm": 6.392164707183838, + "learning_rate": 9.971803234135073e-06, + "loss": 3.8195, + "step": 335 + }, + { + "epoch": 0.02310096480500068, + "grad_norm": 9.041703224182129, + "learning_rate": 9.971378584046746e-06, + "loss": 3.7773, + "step": 340 + }, + { + "epoch": 0.023440684875662456, + "grad_norm": 9.754754066467285, + "learning_rate": 9.970953933958419e-06, + "loss": 3.7583, + "step": 345 + }, + { + "epoch": 0.02378040494632423, + "grad_norm": 10.969034194946289, + "learning_rate": 9.970529283870092e-06, + "loss": 3.5839, + "step": 350 + }, + { + "epoch": 0.024120125016986002, + "grad_norm": 8.523895263671875, + "learning_rate": 9.970104633781765e-06, + "loss": 3.9539, + "step": 355 + }, + { + "epoch": 0.02445984508764778, + "grad_norm": 7.236157417297363, + "learning_rate": 9.969679983693437e-06, + "loss": 3.7084, + "step": 360 + }, + { + "epoch": 0.024799565158309552, + "grad_norm": 8.141843795776367, + "learning_rate": 9.96925533360511e-06, + "loss": 3.741, + "step": 365 + }, + { + "epoch": 0.02513928522897133, + "grad_norm": 7.401703834533691, + "learning_rate": 9.968830683516783e-06, + "loss": 3.8019, + "step": 370 + }, + { + "epoch": 0.025479005299633102, + "grad_norm": 8.310694694519043, + "learning_rate": 9.968406033428456e-06, + "loss": 3.6032, + "step": 375 + }, + { + "epoch": 0.025818725370294875, + "grad_norm": 7.512619972229004, + "learning_rate": 9.967981383340129e-06, + "loss": 3.8114, + "step": 380 + }, + { + "epoch": 0.026158445440956652, + "grad_norm": 8.614784240722656, + "learning_rate": 9.967556733251801e-06, + "loss": 3.8398, + "step": 385 + }, + { + "epoch": 0.026498165511618425, + "grad_norm": 6.873485088348389, + "learning_rate": 9.967132083163474e-06, + "loss": 3.4916, + "step": 390 + }, + { + "epoch": 0.026837885582280202, + "grad_norm": 7.542867183685303, + "learning_rate": 9.966707433075147e-06, + "loss": 3.3274, + "step": 395 + }, + { + "epoch": 0.027177605652941975, + "grad_norm": 7.508244037628174, + "learning_rate": 9.96628278298682e-06, + "loss": 3.5875, + "step": 400 + }, + { + "epoch": 0.027517325723603752, + "grad_norm": 8.974902153015137, + "learning_rate": 9.965858132898493e-06, + "loss": 3.5749, + "step": 405 + }, + { + "epoch": 0.027857045794265525, + "grad_norm": 7.6895551681518555, + "learning_rate": 9.965433482810165e-06, + "loss": 3.8554, + "step": 410 + }, + { + "epoch": 0.0281967658649273, + "grad_norm": 8.112239837646484, + "learning_rate": 9.965008832721838e-06, + "loss": 3.7929, + "step": 415 + }, + { + "epoch": 0.028536485935589075, + "grad_norm": 10.58179759979248, + "learning_rate": 9.964584182633511e-06, + "loss": 3.6308, + "step": 420 + }, + { + "epoch": 0.02887620600625085, + "grad_norm": 8.093683242797852, + "learning_rate": 9.964159532545184e-06, + "loss": 3.8292, + "step": 425 + }, + { + "epoch": 0.029215926076912625, + "grad_norm": 8.273056983947754, + "learning_rate": 9.963734882456857e-06, + "loss": 3.8019, + "step": 430 + }, + { + "epoch": 0.0295556461475744, + "grad_norm": 7.379275321960449, + "learning_rate": 9.96331023236853e-06, + "loss": 3.6728, + "step": 435 + }, + { + "epoch": 0.029895366218236172, + "grad_norm": 6.773268699645996, + "learning_rate": 9.962885582280202e-06, + "loss": 3.7346, + "step": 440 + }, + { + "epoch": 0.03023508628889795, + "grad_norm": 7.728012561798096, + "learning_rate": 9.962460932191873e-06, + "loss": 3.5269, + "step": 445 + }, + { + "epoch": 0.030574806359559722, + "grad_norm": 7.065719127655029, + "learning_rate": 9.962036282103548e-06, + "loss": 3.3809, + "step": 450 + }, + { + "epoch": 0.0309145264302215, + "grad_norm": 6.762896537780762, + "learning_rate": 9.96161163201522e-06, + "loss": 3.6522, + "step": 455 + }, + { + "epoch": 0.03125424650088327, + "grad_norm": 6.95136833190918, + "learning_rate": 9.961186981926893e-06, + "loss": 4.1289, + "step": 460 + }, + { + "epoch": 0.03159396657154505, + "grad_norm": 9.940567016601562, + "learning_rate": 9.960762331838566e-06, + "loss": 3.7242, + "step": 465 + }, + { + "epoch": 0.03193368664220682, + "grad_norm": 7.9607672691345215, + "learning_rate": 9.960337681750239e-06, + "loss": 3.6988, + "step": 470 + }, + { + "epoch": 0.032273406712868595, + "grad_norm": 8.269115447998047, + "learning_rate": 9.959913031661912e-06, + "loss": 3.7966, + "step": 475 + }, + { + "epoch": 0.03261312678353037, + "grad_norm": 9.283363342285156, + "learning_rate": 9.959488381573585e-06, + "loss": 3.4665, + "step": 480 + }, + { + "epoch": 0.03295284685419215, + "grad_norm": 8.101306915283203, + "learning_rate": 9.959063731485257e-06, + "loss": 3.5232, + "step": 485 + }, + { + "epoch": 0.03329256692485392, + "grad_norm": 8.232202529907227, + "learning_rate": 9.95863908139693e-06, + "loss": 3.584, + "step": 490 + }, + { + "epoch": 0.033632286995515695, + "grad_norm": 8.470726013183594, + "learning_rate": 9.958214431308603e-06, + "loss": 3.6574, + "step": 495 + }, + { + "epoch": 0.03397200706617747, + "grad_norm": 7.896862030029297, + "learning_rate": 9.957789781220276e-06, + "loss": 3.5144, + "step": 500 + }, + { + "epoch": 0.03431172713683924, + "grad_norm": 6.869927883148193, + "learning_rate": 9.957365131131949e-06, + "loss": 3.5886, + "step": 505 + }, + { + "epoch": 0.03465144720750102, + "grad_norm": 8.847620964050293, + "learning_rate": 9.956940481043621e-06, + "loss": 3.7837, + "step": 510 + }, + { + "epoch": 0.034991167278162795, + "grad_norm": 8.453719139099121, + "learning_rate": 9.956515830955293e-06, + "loss": 3.5518, + "step": 515 + }, + { + "epoch": 0.03533088734882457, + "grad_norm": 9.331916809082031, + "learning_rate": 9.956091180866967e-06, + "loss": 3.7184, + "step": 520 + }, + { + "epoch": 0.03567060741948634, + "grad_norm": 6.707831859588623, + "learning_rate": 9.95566653077864e-06, + "loss": 3.5653, + "step": 525 + }, + { + "epoch": 0.03601032749014812, + "grad_norm": 7.192928314208984, + "learning_rate": 9.955241880690311e-06, + "loss": 3.3545, + "step": 530 + }, + { + "epoch": 0.036350047560809895, + "grad_norm": 8.409167289733887, + "learning_rate": 9.954817230601985e-06, + "loss": 3.6337, + "step": 535 + }, + { + "epoch": 0.036689767631471665, + "grad_norm": 7.470005035400391, + "learning_rate": 9.954392580513658e-06, + "loss": 3.5696, + "step": 540 + }, + { + "epoch": 0.03702948770213344, + "grad_norm": 10.062429428100586, + "learning_rate": 9.95396793042533e-06, + "loss": 3.633, + "step": 545 + }, + { + "epoch": 0.03736920777279522, + "grad_norm": 7.284005641937256, + "learning_rate": 9.953543280337004e-06, + "loss": 3.8256, + "step": 550 + }, + { + "epoch": 0.03770892784345699, + "grad_norm": 7.462994575500488, + "learning_rate": 9.953118630248677e-06, + "loss": 3.6268, + "step": 555 + }, + { + "epoch": 0.038048647914118765, + "grad_norm": 7.898111343383789, + "learning_rate": 9.952693980160348e-06, + "loss": 3.6721, + "step": 560 + }, + { + "epoch": 0.03838836798478054, + "grad_norm": 8.012699127197266, + "learning_rate": 9.952269330072022e-06, + "loss": 3.6547, + "step": 565 + }, + { + "epoch": 0.03872808805544232, + "grad_norm": 7.422359466552734, + "learning_rate": 9.951844679983695e-06, + "loss": 3.8251, + "step": 570 + }, + { + "epoch": 0.03906780812610409, + "grad_norm": 7.375709056854248, + "learning_rate": 9.951420029895366e-06, + "loss": 3.8018, + "step": 575 + }, + { + "epoch": 0.039407528196765865, + "grad_norm": 6.857296466827393, + "learning_rate": 9.95099537980704e-06, + "loss": 3.4916, + "step": 580 + }, + { + "epoch": 0.03974724826742764, + "grad_norm": 7.8817949295043945, + "learning_rate": 9.950570729718713e-06, + "loss": 3.4789, + "step": 585 + }, + { + "epoch": 0.04008696833808941, + "grad_norm": 8.200421333312988, + "learning_rate": 9.950146079630385e-06, + "loss": 3.481, + "step": 590 + }, + { + "epoch": 0.04042668840875119, + "grad_norm": 7.094868183135986, + "learning_rate": 9.949721429542059e-06, + "loss": 3.4817, + "step": 595 + }, + { + "epoch": 0.040766408479412965, + "grad_norm": 7.142947196960449, + "learning_rate": 9.94929677945373e-06, + "loss": 3.5454, + "step": 600 + }, + { + "epoch": 0.04110612855007474, + "grad_norm": 8.945012092590332, + "learning_rate": 9.948872129365403e-06, + "loss": 3.6543, + "step": 605 + }, + { + "epoch": 0.04144584862073651, + "grad_norm": 7.728625297546387, + "learning_rate": 9.948447479277077e-06, + "loss": 3.6365, + "step": 610 + }, + { + "epoch": 0.04178556869139829, + "grad_norm": 7.790318965911865, + "learning_rate": 9.948022829188749e-06, + "loss": 3.6342, + "step": 615 + }, + { + "epoch": 0.042125288762060065, + "grad_norm": 6.19236421585083, + "learning_rate": 9.947598179100421e-06, + "loss": 3.4286, + "step": 620 + }, + { + "epoch": 0.042465008832721834, + "grad_norm": 6.038117408752441, + "learning_rate": 9.947173529012096e-06, + "loss": 3.2745, + "step": 625 + }, + { + "epoch": 0.04280472890338361, + "grad_norm": 7.0737504959106445, + "learning_rate": 9.946748878923767e-06, + "loss": 3.4183, + "step": 630 + }, + { + "epoch": 0.04314444897404539, + "grad_norm": 7.456831455230713, + "learning_rate": 9.94632422883544e-06, + "loss": 3.5345, + "step": 635 + }, + { + "epoch": 0.043484169044707165, + "grad_norm": 7.028069972991943, + "learning_rate": 9.945899578747114e-06, + "loss": 3.5648, + "step": 640 + }, + { + "epoch": 0.043823889115368934, + "grad_norm": 7.766293048858643, + "learning_rate": 9.945474928658785e-06, + "loss": 3.8444, + "step": 645 + }, + { + "epoch": 0.04416360918603071, + "grad_norm": 6.770590782165527, + "learning_rate": 9.945050278570458e-06, + "loss": 3.6839, + "step": 650 + }, + { + "epoch": 0.04450332925669249, + "grad_norm": 8.245901107788086, + "learning_rate": 9.944625628482133e-06, + "loss": 3.5106, + "step": 655 + }, + { + "epoch": 0.04484304932735426, + "grad_norm": 7.901270389556885, + "learning_rate": 9.944200978393804e-06, + "loss": 3.5432, + "step": 660 + }, + { + "epoch": 0.045182769398016034, + "grad_norm": 6.151656627655029, + "learning_rate": 9.943776328305477e-06, + "loss": 3.5115, + "step": 665 + }, + { + "epoch": 0.04552248946867781, + "grad_norm": 7.359994888305664, + "learning_rate": 9.94335167821715e-06, + "loss": 3.8098, + "step": 670 + }, + { + "epoch": 0.04586220953933958, + "grad_norm": 8.968863487243652, + "learning_rate": 9.942927028128822e-06, + "loss": 3.6669, + "step": 675 + }, + { + "epoch": 0.04620192961000136, + "grad_norm": 8.388439178466797, + "learning_rate": 9.942502378040495e-06, + "loss": 3.5548, + "step": 680 + }, + { + "epoch": 0.046541649680663134, + "grad_norm": 7.189079284667969, + "learning_rate": 9.942077727952168e-06, + "loss": 3.7556, + "step": 685 + }, + { + "epoch": 0.04688136975132491, + "grad_norm": 7.795598030090332, + "learning_rate": 9.94165307786384e-06, + "loss": 3.6133, + "step": 690 + }, + { + "epoch": 0.04722108982198668, + "grad_norm": 6.8026251792907715, + "learning_rate": 9.941228427775513e-06, + "loss": 3.7131, + "step": 695 + }, + { + "epoch": 0.04756080989264846, + "grad_norm": 5.7751946449279785, + "learning_rate": 9.940803777687186e-06, + "loss": 3.6663, + "step": 700 + }, + { + "epoch": 0.047900529963310234, + "grad_norm": 7.476159572601318, + "learning_rate": 9.940379127598859e-06, + "loss": 3.5967, + "step": 705 + }, + { + "epoch": 0.048240250033972004, + "grad_norm": 6.295339584350586, + "learning_rate": 9.939954477510532e-06, + "loss": 3.5975, + "step": 710 + }, + { + "epoch": 0.04857997010463378, + "grad_norm": 6.456781387329102, + "learning_rate": 9.939529827422205e-06, + "loss": 3.5406, + "step": 715 + }, + { + "epoch": 0.04891969017529556, + "grad_norm": 6.793482780456543, + "learning_rate": 9.939105177333877e-06, + "loss": 3.6537, + "step": 720 + }, + { + "epoch": 0.049259410245957334, + "grad_norm": 7.70743465423584, + "learning_rate": 9.93868052724555e-06, + "loss": 3.4518, + "step": 725 + }, + { + "epoch": 0.049599130316619104, + "grad_norm": 8.921398162841797, + "learning_rate": 9.938255877157223e-06, + "loss": 3.45, + "step": 730 + }, + { + "epoch": 0.04993885038728088, + "grad_norm": 8.636183738708496, + "learning_rate": 9.937831227068896e-06, + "loss": 3.552, + "step": 735 + }, + { + "epoch": 0.05027857045794266, + "grad_norm": 9.054503440856934, + "learning_rate": 9.937406576980569e-06, + "loss": 3.7861, + "step": 740 + }, + { + "epoch": 0.05061829052860443, + "grad_norm": 7.526261329650879, + "learning_rate": 9.936981926892241e-06, + "loss": 3.3784, + "step": 745 + }, + { + "epoch": 0.050958010599266204, + "grad_norm": 8.562353134155273, + "learning_rate": 9.936557276803914e-06, + "loss": 3.5638, + "step": 750 + }, + { + "epoch": 0.05129773066992798, + "grad_norm": 7.575862884521484, + "learning_rate": 9.936132626715587e-06, + "loss": 3.5763, + "step": 755 + }, + { + "epoch": 0.05163745074058975, + "grad_norm": 9.033392906188965, + "learning_rate": 9.93570797662726e-06, + "loss": 3.7396, + "step": 760 + }, + { + "epoch": 0.05197717081125153, + "grad_norm": 7.3270440101623535, + "learning_rate": 9.935283326538933e-06, + "loss": 3.4951, + "step": 765 + }, + { + "epoch": 0.052316890881913304, + "grad_norm": 8.666138648986816, + "learning_rate": 9.934858676450605e-06, + "loss": 3.4739, + "step": 770 + }, + { + "epoch": 0.05265661095257508, + "grad_norm": 7.660459041595459, + "learning_rate": 9.934434026362278e-06, + "loss": 3.7187, + "step": 775 + }, + { + "epoch": 0.05299633102323685, + "grad_norm": 7.0125250816345215, + "learning_rate": 9.934009376273951e-06, + "loss": 3.7187, + "step": 780 + }, + { + "epoch": 0.05333605109389863, + "grad_norm": 7.56631326675415, + "learning_rate": 9.933584726185624e-06, + "loss": 3.5219, + "step": 785 + }, + { + "epoch": 0.053675771164560404, + "grad_norm": 7.230415344238281, + "learning_rate": 9.933160076097297e-06, + "loss": 3.6075, + "step": 790 + }, + { + "epoch": 0.054015491235222174, + "grad_norm": 6.540461540222168, + "learning_rate": 9.93273542600897e-06, + "loss": 3.4571, + "step": 795 + }, + { + "epoch": 0.05435521130588395, + "grad_norm": 8.224103927612305, + "learning_rate": 9.932310775920642e-06, + "loss": 3.8027, + "step": 800 + }, + { + "epoch": 0.05469493137654573, + "grad_norm": 7.055105686187744, + "learning_rate": 9.931886125832315e-06, + "loss": 3.8336, + "step": 805 + }, + { + "epoch": 0.055034651447207504, + "grad_norm": 7.134389400482178, + "learning_rate": 9.931461475743988e-06, + "loss": 3.5769, + "step": 810 + }, + { + "epoch": 0.055374371517869274, + "grad_norm": 9.121651649475098, + "learning_rate": 9.93103682565566e-06, + "loss": 3.632, + "step": 815 + }, + { + "epoch": 0.05571409158853105, + "grad_norm": 9.066234588623047, + "learning_rate": 9.930612175567333e-06, + "loss": 3.4632, + "step": 820 + }, + { + "epoch": 0.05605381165919283, + "grad_norm": 5.973104476928711, + "learning_rate": 9.930187525479006e-06, + "loss": 3.4888, + "step": 825 + }, + { + "epoch": 0.0563935317298546, + "grad_norm": 7.75482702255249, + "learning_rate": 9.929762875390679e-06, + "loss": 3.4572, + "step": 830 + }, + { + "epoch": 0.056733251800516374, + "grad_norm": 7.779238700866699, + "learning_rate": 9.929338225302352e-06, + "loss": 3.519, + "step": 835 + }, + { + "epoch": 0.05707297187117815, + "grad_norm": 6.7241106033325195, + "learning_rate": 9.928913575214025e-06, + "loss": 3.7912, + "step": 840 + }, + { + "epoch": 0.05741269194183993, + "grad_norm": 7.29032564163208, + "learning_rate": 9.928488925125697e-06, + "loss": 3.7518, + "step": 845 + }, + { + "epoch": 0.0577524120125017, + "grad_norm": 8.265213966369629, + "learning_rate": 9.92806427503737e-06, + "loss": 3.5763, + "step": 850 + }, + { + "epoch": 0.058092132083163474, + "grad_norm": 6.878722667694092, + "learning_rate": 9.927639624949043e-06, + "loss": 3.6057, + "step": 855 + }, + { + "epoch": 0.05843185215382525, + "grad_norm": 6.768926620483398, + "learning_rate": 9.927214974860716e-06, + "loss": 3.2384, + "step": 860 + }, + { + "epoch": 0.05877157222448702, + "grad_norm": 8.497178077697754, + "learning_rate": 9.926790324772389e-06, + "loss": 3.601, + "step": 865 + }, + { + "epoch": 0.0591112922951488, + "grad_norm": 7.753541469573975, + "learning_rate": 9.926365674684061e-06, + "loss": 3.6013, + "step": 870 + }, + { + "epoch": 0.059451012365810574, + "grad_norm": 6.495034694671631, + "learning_rate": 9.925941024595734e-06, + "loss": 3.3042, + "step": 875 + }, + { + "epoch": 0.059790732436472344, + "grad_norm": 7.493185997009277, + "learning_rate": 9.925516374507407e-06, + "loss": 3.5086, + "step": 880 + }, + { + "epoch": 0.06013045250713412, + "grad_norm": 8.698945999145508, + "learning_rate": 9.92509172441908e-06, + "loss": 3.4495, + "step": 885 + }, + { + "epoch": 0.0604701725777959, + "grad_norm": 10.50797176361084, + "learning_rate": 9.924667074330753e-06, + "loss": 3.6783, + "step": 890 + }, + { + "epoch": 0.060809892648457674, + "grad_norm": 6.78216552734375, + "learning_rate": 9.924242424242425e-06, + "loss": 3.6426, + "step": 895 + }, + { + "epoch": 0.061149612719119444, + "grad_norm": 9.329264640808105, + "learning_rate": 9.923817774154098e-06, + "loss": 3.6783, + "step": 900 + }, + { + "epoch": 0.06148933278978122, + "grad_norm": 8.175287246704102, + "learning_rate": 9.923393124065771e-06, + "loss": 3.5594, + "step": 905 + }, + { + "epoch": 0.061829052860443, + "grad_norm": 7.199501037597656, + "learning_rate": 9.922968473977444e-06, + "loss": 3.4175, + "step": 910 + }, + { + "epoch": 0.06216877293110477, + "grad_norm": 8.06881332397461, + "learning_rate": 9.922543823889117e-06, + "loss": 3.7222, + "step": 915 + }, + { + "epoch": 0.06250849300176654, + "grad_norm": 7.477462291717529, + "learning_rate": 9.92211917380079e-06, + "loss": 3.6295, + "step": 920 + }, + { + "epoch": 0.06284821307242831, + "grad_norm": 7.969459533691406, + "learning_rate": 9.921694523712462e-06, + "loss": 3.5423, + "step": 925 + }, + { + "epoch": 0.0631879331430901, + "grad_norm": 7.432893753051758, + "learning_rate": 9.921269873624133e-06, + "loss": 3.6635, + "step": 930 + }, + { + "epoch": 0.06352765321375187, + "grad_norm": 6.892743110656738, + "learning_rate": 9.920845223535808e-06, + "loss": 3.7451, + "step": 935 + }, + { + "epoch": 0.06386737328441364, + "grad_norm": 6.048975467681885, + "learning_rate": 9.92042057344748e-06, + "loss": 3.3968, + "step": 940 + }, + { + "epoch": 0.06420709335507542, + "grad_norm": 7.135468006134033, + "learning_rate": 9.919995923359152e-06, + "loss": 3.5354, + "step": 945 + }, + { + "epoch": 0.06454681342573719, + "grad_norm": 7.055885314941406, + "learning_rate": 9.919571273270826e-06, + "loss": 3.5314, + "step": 950 + }, + { + "epoch": 0.06488653349639897, + "grad_norm": 7.901118755340576, + "learning_rate": 9.919146623182499e-06, + "loss": 3.4942, + "step": 955 + }, + { + "epoch": 0.06522625356706074, + "grad_norm": 7.503381729125977, + "learning_rate": 9.91872197309417e-06, + "loss": 3.6872, + "step": 960 + }, + { + "epoch": 0.06556597363772251, + "grad_norm": 7.075658321380615, + "learning_rate": 9.918297323005845e-06, + "loss": 3.6314, + "step": 965 + }, + { + "epoch": 0.0659056937083843, + "grad_norm": 11.341386795043945, + "learning_rate": 9.917872672917517e-06, + "loss": 3.7775, + "step": 970 + }, + { + "epoch": 0.06624541377904607, + "grad_norm": 7.840639591217041, + "learning_rate": 9.917448022829189e-06, + "loss": 3.762, + "step": 975 + }, + { + "epoch": 0.06658513384970784, + "grad_norm": 7.495547771453857, + "learning_rate": 9.917023372740863e-06, + "loss": 3.6216, + "step": 980 + }, + { + "epoch": 0.06692485392036962, + "grad_norm": 8.108582496643066, + "learning_rate": 9.916598722652536e-06, + "loss": 3.6406, + "step": 985 + }, + { + "epoch": 0.06726457399103139, + "grad_norm": 6.949503421783447, + "learning_rate": 9.916174072564207e-06, + "loss": 3.5797, + "step": 990 + }, + { + "epoch": 0.06760429406169316, + "grad_norm": 6.053611755371094, + "learning_rate": 9.915749422475881e-06, + "loss": 3.4359, + "step": 995 + }, + { + "epoch": 0.06794401413235494, + "grad_norm": 9.337986946105957, + "learning_rate": 9.915324772387554e-06, + "loss": 3.5884, + "step": 1000 + }, + { + "epoch": 0.06828373420301671, + "grad_norm": 7.822343826293945, + "learning_rate": 9.914900122299225e-06, + "loss": 3.5145, + "step": 1005 + }, + { + "epoch": 0.06862345427367848, + "grad_norm": 6.915095329284668, + "learning_rate": 9.9144754722109e-06, + "loss": 3.2972, + "step": 1010 + }, + { + "epoch": 0.06896317434434027, + "grad_norm": 7.765141487121582, + "learning_rate": 9.914050822122571e-06, + "loss": 3.6676, + "step": 1015 + }, + { + "epoch": 0.06930289441500204, + "grad_norm": 6.179137229919434, + "learning_rate": 9.913626172034244e-06, + "loss": 3.4382, + "step": 1020 + }, + { + "epoch": 0.0696426144856638, + "grad_norm": 6.534022331237793, + "learning_rate": 9.913201521945918e-06, + "loss": 3.4409, + "step": 1025 + }, + { + "epoch": 0.06998233455632559, + "grad_norm": 7.097350597381592, + "learning_rate": 9.91277687185759e-06, + "loss": 3.6009, + "step": 1030 + }, + { + "epoch": 0.07032205462698736, + "grad_norm": 8.477814674377441, + "learning_rate": 9.912352221769262e-06, + "loss": 3.4949, + "step": 1035 + }, + { + "epoch": 0.07066177469764914, + "grad_norm": 9.299301147460938, + "learning_rate": 9.911927571680937e-06, + "loss": 3.5248, + "step": 1040 + }, + { + "epoch": 0.07100149476831091, + "grad_norm": 10.176789283752441, + "learning_rate": 9.911502921592608e-06, + "loss": 3.5317, + "step": 1045 + }, + { + "epoch": 0.07134121483897268, + "grad_norm": 8.405863761901855, + "learning_rate": 9.91107827150428e-06, + "loss": 3.8478, + "step": 1050 + }, + { + "epoch": 0.07168093490963447, + "grad_norm": 8.814489364624023, + "learning_rate": 9.910653621415955e-06, + "loss": 3.369, + "step": 1055 + }, + { + "epoch": 0.07202065498029624, + "grad_norm": 7.475778102874756, + "learning_rate": 9.910228971327626e-06, + "loss": 3.3845, + "step": 1060 + }, + { + "epoch": 0.072360375050958, + "grad_norm": 6.592101097106934, + "learning_rate": 9.909804321239299e-06, + "loss": 3.248, + "step": 1065 + }, + { + "epoch": 0.07270009512161979, + "grad_norm": 8.98584270477295, + "learning_rate": 9.909379671150973e-06, + "loss": 3.8559, + "step": 1070 + }, + { + "epoch": 0.07303981519228156, + "grad_norm": 8.415946960449219, + "learning_rate": 9.908955021062645e-06, + "loss": 3.4053, + "step": 1075 + }, + { + "epoch": 0.07337953526294333, + "grad_norm": 8.585652351379395, + "learning_rate": 9.908530370974317e-06, + "loss": 3.7706, + "step": 1080 + }, + { + "epoch": 0.07371925533360511, + "grad_norm": 7.560981273651123, + "learning_rate": 9.90810572088599e-06, + "loss": 3.4782, + "step": 1085 + }, + { + "epoch": 0.07405897540426688, + "grad_norm": 8.4655179977417, + "learning_rate": 9.907681070797663e-06, + "loss": 3.6683, + "step": 1090 + }, + { + "epoch": 0.07439869547492865, + "grad_norm": 7.878279685974121, + "learning_rate": 9.907256420709336e-06, + "loss": 3.5848, + "step": 1095 + }, + { + "epoch": 0.07473841554559044, + "grad_norm": 7.0207953453063965, + "learning_rate": 9.906831770621009e-06, + "loss": 3.2274, + "step": 1100 + }, + { + "epoch": 0.0750781356162522, + "grad_norm": 7.524562835693359, + "learning_rate": 9.906407120532681e-06, + "loss": 3.7739, + "step": 1105 + }, + { + "epoch": 0.07541785568691398, + "grad_norm": 7.296402454376221, + "learning_rate": 9.905982470444354e-06, + "loss": 3.5069, + "step": 1110 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 6.728298664093018, + "learning_rate": 9.905557820356027e-06, + "loss": 3.6449, + "step": 1115 + }, + { + "epoch": 0.07609729582823753, + "grad_norm": 6.113227367401123, + "learning_rate": 9.9051331702677e-06, + "loss": 3.4029, + "step": 1120 + }, + { + "epoch": 0.07643701589889931, + "grad_norm": 7.012844085693359, + "learning_rate": 9.904708520179373e-06, + "loss": 3.5347, + "step": 1125 + }, + { + "epoch": 0.07677673596956108, + "grad_norm": 8.624529838562012, + "learning_rate": 9.904283870091045e-06, + "loss": 3.4456, + "step": 1130 + }, + { + "epoch": 0.07711645604022285, + "grad_norm": 7.858013153076172, + "learning_rate": 9.903859220002718e-06, + "loss": 3.5707, + "step": 1135 + }, + { + "epoch": 0.07745617611088464, + "grad_norm": 8.86461067199707, + "learning_rate": 9.903434569914393e-06, + "loss": 3.2757, + "step": 1140 + }, + { + "epoch": 0.0777958961815464, + "grad_norm": 9.084182739257812, + "learning_rate": 9.903009919826064e-06, + "loss": 3.4652, + "step": 1145 + }, + { + "epoch": 0.07813561625220818, + "grad_norm": 8.807682991027832, + "learning_rate": 9.902585269737737e-06, + "loss": 3.4805, + "step": 1150 + }, + { + "epoch": 0.07847533632286996, + "grad_norm": 8.30797290802002, + "learning_rate": 9.902160619649411e-06, + "loss": 3.5762, + "step": 1155 + }, + { + "epoch": 0.07881505639353173, + "grad_norm": 6.87647008895874, + "learning_rate": 9.901735969561082e-06, + "loss": 3.3388, + "step": 1160 + }, + { + "epoch": 0.0791547764641935, + "grad_norm": 8.965385437011719, + "learning_rate": 9.901311319472755e-06, + "loss": 3.6527, + "step": 1165 + }, + { + "epoch": 0.07949449653485528, + "grad_norm": 6.975662708282471, + "learning_rate": 9.900886669384428e-06, + "loss": 3.5676, + "step": 1170 + }, + { + "epoch": 0.07983421660551705, + "grad_norm": 7.536172389984131, + "learning_rate": 9.9004620192961e-06, + "loss": 3.3704, + "step": 1175 + }, + { + "epoch": 0.08017393667617882, + "grad_norm": 8.21885871887207, + "learning_rate": 9.900037369207773e-06, + "loss": 3.8638, + "step": 1180 + }, + { + "epoch": 0.0805136567468406, + "grad_norm": 5.672938346862793, + "learning_rate": 9.899612719119446e-06, + "loss": 3.5109, + "step": 1185 + }, + { + "epoch": 0.08085337681750238, + "grad_norm": 7.099566459655762, + "learning_rate": 9.899188069031119e-06, + "loss": 3.6583, + "step": 1190 + }, + { + "epoch": 0.08119309688816416, + "grad_norm": 6.97761869430542, + "learning_rate": 9.898763418942792e-06, + "loss": 3.2171, + "step": 1195 + }, + { + "epoch": 0.08153281695882593, + "grad_norm": 7.999174118041992, + "learning_rate": 9.898338768854465e-06, + "loss": 3.6525, + "step": 1200 + }, + { + "epoch": 0.0818725370294877, + "grad_norm": 10.474752426147461, + "learning_rate": 9.897914118766137e-06, + "loss": 3.7682, + "step": 1205 + }, + { + "epoch": 0.08221225710014948, + "grad_norm": 7.5210418701171875, + "learning_rate": 9.89748946867781e-06, + "loss": 3.3628, + "step": 1210 + }, + { + "epoch": 0.08255197717081125, + "grad_norm": 7.81768798828125, + "learning_rate": 9.897064818589483e-06, + "loss": 3.3506, + "step": 1215 + }, + { + "epoch": 0.08289169724147302, + "grad_norm": 7.772113800048828, + "learning_rate": 9.896640168501156e-06, + "loss": 3.4306, + "step": 1220 + }, + { + "epoch": 0.0832314173121348, + "grad_norm": 6.869964599609375, + "learning_rate": 9.896215518412829e-06, + "loss": 3.4645, + "step": 1225 + }, + { + "epoch": 0.08357113738279658, + "grad_norm": 7.957749843597412, + "learning_rate": 9.895790868324501e-06, + "loss": 3.7118, + "step": 1230 + }, + { + "epoch": 0.08391085745345835, + "grad_norm": 7.574612617492676, + "learning_rate": 9.895366218236174e-06, + "loss": 3.4821, + "step": 1235 + }, + { + "epoch": 0.08425057752412013, + "grad_norm": 8.408559799194336, + "learning_rate": 9.894941568147847e-06, + "loss": 3.6089, + "step": 1240 + }, + { + "epoch": 0.0845902975947819, + "grad_norm": 7.396973133087158, + "learning_rate": 9.89451691805952e-06, + "loss": 3.3598, + "step": 1245 + }, + { + "epoch": 0.08493001766544367, + "grad_norm": 9.272367477416992, + "learning_rate": 9.894092267971193e-06, + "loss": 3.6434, + "step": 1250 + }, + { + "epoch": 0.08526973773610545, + "grad_norm": 6.924034118652344, + "learning_rate": 9.893667617882865e-06, + "loss": 3.6805, + "step": 1255 + }, + { + "epoch": 0.08560945780676722, + "grad_norm": 7.4177656173706055, + "learning_rate": 9.893242967794538e-06, + "loss": 3.3404, + "step": 1260 + }, + { + "epoch": 0.08594917787742899, + "grad_norm": 7.627190589904785, + "learning_rate": 9.892818317706211e-06, + "loss": 3.5014, + "step": 1265 + }, + { + "epoch": 0.08628889794809078, + "grad_norm": 6.803024768829346, + "learning_rate": 9.892393667617884e-06, + "loss": 3.5134, + "step": 1270 + }, + { + "epoch": 0.08662861801875255, + "grad_norm": 8.395498275756836, + "learning_rate": 9.891969017529557e-06, + "loss": 3.5064, + "step": 1275 + }, + { + "epoch": 0.08696833808941433, + "grad_norm": 6.635056495666504, + "learning_rate": 9.89154436744123e-06, + "loss": 3.6225, + "step": 1280 + }, + { + "epoch": 0.0873080581600761, + "grad_norm": 7.680435657501221, + "learning_rate": 9.891119717352902e-06, + "loss": 3.9075, + "step": 1285 + }, + { + "epoch": 0.08764777823073787, + "grad_norm": 6.157256126403809, + "learning_rate": 9.890695067264575e-06, + "loss": 3.4423, + "step": 1290 + }, + { + "epoch": 0.08798749830139965, + "grad_norm": 7.211416721343994, + "learning_rate": 9.890270417176248e-06, + "loss": 3.5382, + "step": 1295 + }, + { + "epoch": 0.08832721837206142, + "grad_norm": 7.6889495849609375, + "learning_rate": 9.88984576708792e-06, + "loss": 3.6967, + "step": 1300 + }, + { + "epoch": 0.08866693844272319, + "grad_norm": 7.798202037811279, + "learning_rate": 9.889421116999593e-06, + "loss": 3.4808, + "step": 1305 + }, + { + "epoch": 0.08900665851338498, + "grad_norm": 5.888078689575195, + "learning_rate": 9.888996466911266e-06, + "loss": 3.6108, + "step": 1310 + }, + { + "epoch": 0.08934637858404675, + "grad_norm": 9.174164772033691, + "learning_rate": 9.888571816822939e-06, + "loss": 3.4455, + "step": 1315 + }, + { + "epoch": 0.08968609865470852, + "grad_norm": 7.071383953094482, + "learning_rate": 9.888147166734612e-06, + "loss": 3.2512, + "step": 1320 + }, + { + "epoch": 0.0900258187253703, + "grad_norm": 7.3101348876953125, + "learning_rate": 9.887722516646285e-06, + "loss": 3.2549, + "step": 1325 + }, + { + "epoch": 0.09036553879603207, + "grad_norm": 8.513507843017578, + "learning_rate": 9.887297866557957e-06, + "loss": 3.6533, + "step": 1330 + }, + { + "epoch": 0.09070525886669384, + "grad_norm": 9.133475303649902, + "learning_rate": 9.88687321646963e-06, + "loss": 3.6299, + "step": 1335 + }, + { + "epoch": 0.09104497893735562, + "grad_norm": 7.597873210906982, + "learning_rate": 9.886448566381303e-06, + "loss": 3.6525, + "step": 1340 + }, + { + "epoch": 0.09138469900801739, + "grad_norm": 6.511581897735596, + "learning_rate": 9.886023916292976e-06, + "loss": 3.1781, + "step": 1345 + }, + { + "epoch": 0.09172441907867916, + "grad_norm": 7.5529985427856445, + "learning_rate": 9.885599266204649e-06, + "loss": 3.3991, + "step": 1350 + }, + { + "epoch": 0.09206413914934095, + "grad_norm": 5.907275676727295, + "learning_rate": 9.885174616116321e-06, + "loss": 3.5921, + "step": 1355 + }, + { + "epoch": 0.09240385922000272, + "grad_norm": 9.74201488494873, + "learning_rate": 9.884749966027993e-06, + "loss": 3.4781, + "step": 1360 + }, + { + "epoch": 0.0927435792906645, + "grad_norm": 7.092123031616211, + "learning_rate": 9.884325315939667e-06, + "loss": 3.5148, + "step": 1365 + }, + { + "epoch": 0.09308329936132627, + "grad_norm": 6.33518648147583, + "learning_rate": 9.88390066585134e-06, + "loss": 3.5938, + "step": 1370 + }, + { + "epoch": 0.09342301943198804, + "grad_norm": 7.05945348739624, + "learning_rate": 9.883476015763011e-06, + "loss": 3.6563, + "step": 1375 + }, + { + "epoch": 0.09376273950264982, + "grad_norm": 8.032682418823242, + "learning_rate": 9.883051365674685e-06, + "loss": 3.4037, + "step": 1380 + }, + { + "epoch": 0.09410245957331159, + "grad_norm": 8.485739707946777, + "learning_rate": 9.882626715586358e-06, + "loss": 3.5726, + "step": 1385 + }, + { + "epoch": 0.09444217964397336, + "grad_norm": 6.818784236907959, + "learning_rate": 9.88220206549803e-06, + "loss": 3.4866, + "step": 1390 + }, + { + "epoch": 0.09478189971463515, + "grad_norm": 6.609715461730957, + "learning_rate": 9.881777415409704e-06, + "loss": 3.4708, + "step": 1395 + }, + { + "epoch": 0.09512161978529692, + "grad_norm": 6.724889755249023, + "learning_rate": 9.881352765321377e-06, + "loss": 3.5161, + "step": 1400 + }, + { + "epoch": 0.09546133985595869, + "grad_norm": 6.192169666290283, + "learning_rate": 9.880928115233048e-06, + "loss": 3.5141, + "step": 1405 + }, + { + "epoch": 0.09580105992662047, + "grad_norm": 6.147939205169678, + "learning_rate": 9.880503465144722e-06, + "loss": 3.5424, + "step": 1410 + }, + { + "epoch": 0.09614077999728224, + "grad_norm": 7.450833797454834, + "learning_rate": 9.880078815056395e-06, + "loss": 3.5658, + "step": 1415 + }, + { + "epoch": 0.09648050006794401, + "grad_norm": 6.084408760070801, + "learning_rate": 9.879654164968066e-06, + "loss": 3.6178, + "step": 1420 + }, + { + "epoch": 0.09682022013860579, + "grad_norm": 8.488066673278809, + "learning_rate": 9.87922951487974e-06, + "loss": 3.6126, + "step": 1425 + }, + { + "epoch": 0.09715994020926756, + "grad_norm": 6.148119926452637, + "learning_rate": 9.878804864791412e-06, + "loss": 3.4282, + "step": 1430 + }, + { + "epoch": 0.09749966027992933, + "grad_norm": 7.422104358673096, + "learning_rate": 9.878380214703085e-06, + "loss": 3.4782, + "step": 1435 + }, + { + "epoch": 0.09783938035059112, + "grad_norm": 8.7265043258667, + "learning_rate": 9.877955564614759e-06, + "loss": 3.4976, + "step": 1440 + }, + { + "epoch": 0.09817910042125289, + "grad_norm": 6.2633376121521, + "learning_rate": 9.87753091452643e-06, + "loss": 3.2922, + "step": 1445 + }, + { + "epoch": 0.09851882049191467, + "grad_norm": 6.67805814743042, + "learning_rate": 9.877106264438103e-06, + "loss": 3.4982, + "step": 1450 + }, + { + "epoch": 0.09885854056257644, + "grad_norm": 6.542866230010986, + "learning_rate": 9.876681614349777e-06, + "loss": 3.5515, + "step": 1455 + }, + { + "epoch": 0.09919826063323821, + "grad_norm": 6.787693977355957, + "learning_rate": 9.876256964261449e-06, + "loss": 3.6178, + "step": 1460 + }, + { + "epoch": 0.09953798070389999, + "grad_norm": 10.402273178100586, + "learning_rate": 9.875832314173121e-06, + "loss": 3.5537, + "step": 1465 + }, + { + "epoch": 0.09987770077456176, + "grad_norm": 7.573434829711914, + "learning_rate": 9.875407664084796e-06, + "loss": 3.7112, + "step": 1470 + }, + { + "epoch": 0.10021742084522353, + "grad_norm": 6.854200839996338, + "learning_rate": 9.874983013996467e-06, + "loss": 3.2897, + "step": 1475 + }, + { + "epoch": 0.10055714091588532, + "grad_norm": 9.077394485473633, + "learning_rate": 9.874558363908141e-06, + "loss": 3.5578, + "step": 1480 + }, + { + "epoch": 0.10089686098654709, + "grad_norm": 6.590576171875, + "learning_rate": 9.874133713819814e-06, + "loss": 3.4848, + "step": 1485 + }, + { + "epoch": 0.10123658105720885, + "grad_norm": 6.844799995422363, + "learning_rate": 9.873709063731485e-06, + "loss": 3.5055, + "step": 1490 + }, + { + "epoch": 0.10157630112787064, + "grad_norm": 6.457330226898193, + "learning_rate": 9.87328441364316e-06, + "loss": 3.4994, + "step": 1495 + }, + { + "epoch": 0.10191602119853241, + "grad_norm": 6.89069938659668, + "learning_rate": 9.872859763554831e-06, + "loss": 3.7267, + "step": 1500 + }, + { + "epoch": 0.10225574126919418, + "grad_norm": 7.917197227478027, + "learning_rate": 9.872435113466504e-06, + "loss": 3.6511, + "step": 1505 + }, + { + "epoch": 0.10259546133985596, + "grad_norm": 6.821871280670166, + "learning_rate": 9.872010463378178e-06, + "loss": 3.3794, + "step": 1510 + }, + { + "epoch": 0.10293518141051773, + "grad_norm": 6.53437614440918, + "learning_rate": 9.87158581328985e-06, + "loss": 3.4173, + "step": 1515 + }, + { + "epoch": 0.1032749014811795, + "grad_norm": 6.441720485687256, + "learning_rate": 9.871161163201522e-06, + "loss": 3.4189, + "step": 1520 + }, + { + "epoch": 0.10361462155184128, + "grad_norm": 7.781720161437988, + "learning_rate": 9.870736513113197e-06, + "loss": 3.6685, + "step": 1525 + }, + { + "epoch": 0.10395434162250305, + "grad_norm": 6.935204982757568, + "learning_rate": 9.870311863024868e-06, + "loss": 3.4905, + "step": 1530 + }, + { + "epoch": 0.10429406169316484, + "grad_norm": 6.280497074127197, + "learning_rate": 9.86988721293654e-06, + "loss": 3.6955, + "step": 1535 + }, + { + "epoch": 0.10463378176382661, + "grad_norm": 8.084494590759277, + "learning_rate": 9.869462562848215e-06, + "loss": 3.6325, + "step": 1540 + }, + { + "epoch": 0.10497350183448838, + "grad_norm": 5.829450607299805, + "learning_rate": 9.869037912759886e-06, + "loss": 3.4738, + "step": 1545 + }, + { + "epoch": 0.10531322190515016, + "grad_norm": 7.751105308532715, + "learning_rate": 9.868613262671559e-06, + "loss": 3.6126, + "step": 1550 + }, + { + "epoch": 0.10565294197581193, + "grad_norm": 7.374814987182617, + "learning_rate": 9.868188612583233e-06, + "loss": 3.4422, + "step": 1555 + }, + { + "epoch": 0.1059926620464737, + "grad_norm": 7.969167709350586, + "learning_rate": 9.867763962494905e-06, + "loss": 3.549, + "step": 1560 + }, + { + "epoch": 0.10633238211713548, + "grad_norm": 9.76917552947998, + "learning_rate": 9.867339312406577e-06, + "loss": 3.5591, + "step": 1565 + }, + { + "epoch": 0.10667210218779725, + "grad_norm": 7.4201812744140625, + "learning_rate": 9.866914662318252e-06, + "loss": 3.4207, + "step": 1570 + }, + { + "epoch": 0.10701182225845902, + "grad_norm": 7.576995372772217, + "learning_rate": 9.866490012229923e-06, + "loss": 3.6246, + "step": 1575 + }, + { + "epoch": 0.10735154232912081, + "grad_norm": 6.365749835968018, + "learning_rate": 9.866065362141596e-06, + "loss": 3.5035, + "step": 1580 + }, + { + "epoch": 0.10769126239978258, + "grad_norm": 6.779904365539551, + "learning_rate": 9.865640712053269e-06, + "loss": 3.5599, + "step": 1585 + }, + { + "epoch": 0.10803098247044435, + "grad_norm": 7.051429271697998, + "learning_rate": 9.865216061964941e-06, + "loss": 3.4008, + "step": 1590 + }, + { + "epoch": 0.10837070254110613, + "grad_norm": 6.140173435211182, + "learning_rate": 9.864791411876614e-06, + "loss": 3.6071, + "step": 1595 + }, + { + "epoch": 0.1087104226117679, + "grad_norm": 5.702284336090088, + "learning_rate": 9.864366761788287e-06, + "loss": 3.4672, + "step": 1600 + }, + { + "epoch": 0.10905014268242967, + "grad_norm": 7.401752471923828, + "learning_rate": 9.86394211169996e-06, + "loss": 3.5273, + "step": 1605 + }, + { + "epoch": 0.10938986275309145, + "grad_norm": 9.138490676879883, + "learning_rate": 9.863517461611633e-06, + "loss": 3.6793, + "step": 1610 + }, + { + "epoch": 0.10972958282375322, + "grad_norm": 7.003054141998291, + "learning_rate": 9.863092811523305e-06, + "loss": 3.4808, + "step": 1615 + }, + { + "epoch": 0.11006930289441501, + "grad_norm": 7.09920072555542, + "learning_rate": 9.862668161434978e-06, + "loss": 3.379, + "step": 1620 + }, + { + "epoch": 0.11040902296507678, + "grad_norm": 7.027630805969238, + "learning_rate": 9.862243511346651e-06, + "loss": 3.3791, + "step": 1625 + }, + { + "epoch": 0.11074874303573855, + "grad_norm": 8.04013442993164, + "learning_rate": 9.861818861258324e-06, + "loss": 3.5583, + "step": 1630 + }, + { + "epoch": 0.11108846310640033, + "grad_norm": 8.106400489807129, + "learning_rate": 9.861394211169997e-06, + "loss": 3.5482, + "step": 1635 + }, + { + "epoch": 0.1114281831770621, + "grad_norm": 7.295941352844238, + "learning_rate": 9.86096956108167e-06, + "loss": 3.6439, + "step": 1640 + }, + { + "epoch": 0.11176790324772387, + "grad_norm": 6.340036392211914, + "learning_rate": 9.860544910993342e-06, + "loss": 3.4859, + "step": 1645 + }, + { + "epoch": 0.11210762331838565, + "grad_norm": 6.1852545738220215, + "learning_rate": 9.860120260905015e-06, + "loss": 3.3276, + "step": 1650 + }, + { + "epoch": 0.11244734338904742, + "grad_norm": 8.265686988830566, + "learning_rate": 9.859695610816688e-06, + "loss": 3.5552, + "step": 1655 + }, + { + "epoch": 0.1127870634597092, + "grad_norm": 7.727878093719482, + "learning_rate": 9.85927096072836e-06, + "loss": 3.5642, + "step": 1660 + }, + { + "epoch": 0.11312678353037098, + "grad_norm": 8.827818870544434, + "learning_rate": 9.858846310640033e-06, + "loss": 3.6152, + "step": 1665 + }, + { + "epoch": 0.11346650360103275, + "grad_norm": 8.304938316345215, + "learning_rate": 9.858421660551706e-06, + "loss": 3.5841, + "step": 1670 + }, + { + "epoch": 0.11380622367169452, + "grad_norm": 7.142397880554199, + "learning_rate": 9.857997010463379e-06, + "loss": 3.515, + "step": 1675 + }, + { + "epoch": 0.1141459437423563, + "grad_norm": 7.158762454986572, + "learning_rate": 9.857572360375052e-06, + "loss": 3.4493, + "step": 1680 + }, + { + "epoch": 0.11448566381301807, + "grad_norm": 6.5224504470825195, + "learning_rate": 9.857147710286725e-06, + "loss": 3.4371, + "step": 1685 + }, + { + "epoch": 0.11482538388367985, + "grad_norm": 6.542701721191406, + "learning_rate": 9.856723060198397e-06, + "loss": 3.3099, + "step": 1690 + }, + { + "epoch": 0.11516510395434162, + "grad_norm": 6.155130863189697, + "learning_rate": 9.85629841011007e-06, + "loss": 3.399, + "step": 1695 + }, + { + "epoch": 0.1155048240250034, + "grad_norm": 8.402070999145508, + "learning_rate": 9.855873760021743e-06, + "loss": 3.562, + "step": 1700 + }, + { + "epoch": 0.11584454409566518, + "grad_norm": 6.014597415924072, + "learning_rate": 9.855449109933416e-06, + "loss": 3.2836, + "step": 1705 + }, + { + "epoch": 0.11618426416632695, + "grad_norm": 6.035742282867432, + "learning_rate": 9.855024459845089e-06, + "loss": 3.6433, + "step": 1710 + }, + { + "epoch": 0.11652398423698872, + "grad_norm": 8.158051490783691, + "learning_rate": 9.854599809756761e-06, + "loss": 3.2344, + "step": 1715 + }, + { + "epoch": 0.1168637043076505, + "grad_norm": 7.680637836456299, + "learning_rate": 9.854175159668434e-06, + "loss": 3.6643, + "step": 1720 + }, + { + "epoch": 0.11720342437831227, + "grad_norm": 8.073476791381836, + "learning_rate": 9.853750509580107e-06, + "loss": 3.32, + "step": 1725 + }, + { + "epoch": 0.11754314444897404, + "grad_norm": 7.5266923904418945, + "learning_rate": 9.85332585949178e-06, + "loss": 3.3746, + "step": 1730 + }, + { + "epoch": 0.11788286451963582, + "grad_norm": 11.649001121520996, + "learning_rate": 9.852901209403453e-06, + "loss": 3.4758, + "step": 1735 + }, + { + "epoch": 0.1182225845902976, + "grad_norm": 8.453317642211914, + "learning_rate": 9.852476559315125e-06, + "loss": 3.5822, + "step": 1740 + }, + { + "epoch": 0.11856230466095936, + "grad_norm": 6.846125602722168, + "learning_rate": 9.852051909226798e-06, + "loss": 3.687, + "step": 1745 + }, + { + "epoch": 0.11890202473162115, + "grad_norm": 7.810622215270996, + "learning_rate": 9.851627259138471e-06, + "loss": 3.5302, + "step": 1750 + }, + { + "epoch": 0.11924174480228292, + "grad_norm": 7.491157531738281, + "learning_rate": 9.851202609050144e-06, + "loss": 3.6161, + "step": 1755 + }, + { + "epoch": 0.11958146487294469, + "grad_norm": 6.940120220184326, + "learning_rate": 9.850777958961817e-06, + "loss": 3.7091, + "step": 1760 + }, + { + "epoch": 0.11992118494360647, + "grad_norm": 7.195255279541016, + "learning_rate": 9.85035330887349e-06, + "loss": 3.4858, + "step": 1765 + }, + { + "epoch": 0.12026090501426824, + "grad_norm": 6.500730037689209, + "learning_rate": 9.849928658785162e-06, + "loss": 3.5275, + "step": 1770 + }, + { + "epoch": 0.12060062508493002, + "grad_norm": 7.429586887359619, + "learning_rate": 9.849504008696833e-06, + "loss": 3.41, + "step": 1775 + }, + { + "epoch": 0.1209403451555918, + "grad_norm": 7.0954084396362305, + "learning_rate": 9.849079358608508e-06, + "loss": 3.5714, + "step": 1780 + }, + { + "epoch": 0.12128006522625356, + "grad_norm": 7.723840713500977, + "learning_rate": 9.84865470852018e-06, + "loss": 3.6565, + "step": 1785 + }, + { + "epoch": 0.12161978529691535, + "grad_norm": 8.310978889465332, + "learning_rate": 9.848230058431852e-06, + "loss": 3.6318, + "step": 1790 + }, + { + "epoch": 0.12195950536757712, + "grad_norm": 7.451302528381348, + "learning_rate": 9.847805408343526e-06, + "loss": 3.3406, + "step": 1795 + }, + { + "epoch": 0.12229922543823889, + "grad_norm": 6.936929702758789, + "learning_rate": 9.847380758255199e-06, + "loss": 3.4152, + "step": 1800 + }, + { + "epoch": 0.12263894550890067, + "grad_norm": 7.109097003936768, + "learning_rate": 9.84695610816687e-06, + "loss": 3.5129, + "step": 1805 + }, + { + "epoch": 0.12297866557956244, + "grad_norm": 6.065667629241943, + "learning_rate": 9.846531458078545e-06, + "loss": 3.4208, + "step": 1810 + }, + { + "epoch": 0.12331838565022421, + "grad_norm": 8.6099214553833, + "learning_rate": 9.846106807990217e-06, + "loss": 3.289, + "step": 1815 + }, + { + "epoch": 0.123658105720886, + "grad_norm": 7.512652397155762, + "learning_rate": 9.84568215790189e-06, + "loss": 3.4426, + "step": 1820 + }, + { + "epoch": 0.12399782579154776, + "grad_norm": 8.682568550109863, + "learning_rate": 9.845257507813563e-06, + "loss": 3.5308, + "step": 1825 + }, + { + "epoch": 0.12433754586220953, + "grad_norm": 6.264684677124023, + "learning_rate": 9.844832857725236e-06, + "loss": 3.529, + "step": 1830 + }, + { + "epoch": 0.12467726593287132, + "grad_norm": 7.9210357666015625, + "learning_rate": 9.844408207636909e-06, + "loss": 3.4941, + "step": 1835 + }, + { + "epoch": 0.1250169860035331, + "grad_norm": 8.201410293579102, + "learning_rate": 9.843983557548581e-06, + "loss": 3.5731, + "step": 1840 + }, + { + "epoch": 0.12535670607419486, + "grad_norm": 8.870905876159668, + "learning_rate": 9.843558907460252e-06, + "loss": 3.4479, + "step": 1845 + }, + { + "epoch": 0.12569642614485663, + "grad_norm": 9.326218605041504, + "learning_rate": 9.843134257371927e-06, + "loss": 3.6382, + "step": 1850 + }, + { + "epoch": 0.12603614621551842, + "grad_norm": 7.463184833526611, + "learning_rate": 9.8427096072836e-06, + "loss": 3.4977, + "step": 1855 + }, + { + "epoch": 0.1263758662861802, + "grad_norm": 7.693656921386719, + "learning_rate": 9.842284957195271e-06, + "loss": 3.5777, + "step": 1860 + }, + { + "epoch": 0.12671558635684196, + "grad_norm": 7.773869514465332, + "learning_rate": 9.841860307106945e-06, + "loss": 3.4488, + "step": 1865 + }, + { + "epoch": 0.12705530642750373, + "grad_norm": 6.571211814880371, + "learning_rate": 9.841435657018618e-06, + "loss": 3.5626, + "step": 1870 + }, + { + "epoch": 0.1273950264981655, + "grad_norm": 6.249183654785156, + "learning_rate": 9.84101100693029e-06, + "loss": 3.4444, + "step": 1875 + }, + { + "epoch": 0.12773474656882727, + "grad_norm": 8.222867965698242, + "learning_rate": 9.840586356841964e-06, + "loss": 3.4741, + "step": 1880 + }, + { + "epoch": 0.12807446663948907, + "grad_norm": 6.250969886779785, + "learning_rate": 9.840161706753637e-06, + "loss": 3.607, + "step": 1885 + }, + { + "epoch": 0.12841418671015084, + "grad_norm": 6.801116943359375, + "learning_rate": 9.839737056665308e-06, + "loss": 3.0078, + "step": 1890 + }, + { + "epoch": 0.1287539067808126, + "grad_norm": 7.946317195892334, + "learning_rate": 9.839312406576982e-06, + "loss": 3.577, + "step": 1895 + }, + { + "epoch": 0.12909362685147438, + "grad_norm": 7.179463863372803, + "learning_rate": 9.838887756488655e-06, + "loss": 3.414, + "step": 1900 + }, + { + "epoch": 0.12943334692213615, + "grad_norm": 6.54093599319458, + "learning_rate": 9.838463106400326e-06, + "loss": 3.2509, + "step": 1905 + }, + { + "epoch": 0.12977306699279795, + "grad_norm": 7.216185092926025, + "learning_rate": 9.838038456312e-06, + "loss": 3.521, + "step": 1910 + }, + { + "epoch": 0.13011278706345972, + "grad_norm": 5.821789741516113, + "learning_rate": 9.837613806223673e-06, + "loss": 3.5074, + "step": 1915 + }, + { + "epoch": 0.1304525071341215, + "grad_norm": 7.868536472320557, + "learning_rate": 9.837189156135345e-06, + "loss": 3.3188, + "step": 1920 + }, + { + "epoch": 0.13079222720478326, + "grad_norm": 7.602503299713135, + "learning_rate": 9.836764506047019e-06, + "loss": 3.6187, + "step": 1925 + }, + { + "epoch": 0.13113194727544503, + "grad_norm": 8.793668746948242, + "learning_rate": 9.83633985595869e-06, + "loss": 3.594, + "step": 1930 + }, + { + "epoch": 0.1314716673461068, + "grad_norm": 9.14974308013916, + "learning_rate": 9.835915205870363e-06, + "loss": 3.4118, + "step": 1935 + }, + { + "epoch": 0.1318113874167686, + "grad_norm": 6.320901870727539, + "learning_rate": 9.835490555782037e-06, + "loss": 3.0672, + "step": 1940 + }, + { + "epoch": 0.13215110748743036, + "grad_norm": 10.093212127685547, + "learning_rate": 9.835065905693709e-06, + "loss": 3.4336, + "step": 1945 + }, + { + "epoch": 0.13249082755809213, + "grad_norm": 6.433370590209961, + "learning_rate": 9.834641255605381e-06, + "loss": 3.5584, + "step": 1950 + }, + { + "epoch": 0.1328305476287539, + "grad_norm": 6.673224449157715, + "learning_rate": 9.834216605517056e-06, + "loss": 3.4698, + "step": 1955 + }, + { + "epoch": 0.13317026769941567, + "grad_norm": 8.246062278747559, + "learning_rate": 9.833791955428727e-06, + "loss": 3.5393, + "step": 1960 + }, + { + "epoch": 0.13350998777007744, + "grad_norm": 5.872581481933594, + "learning_rate": 9.8333673053404e-06, + "loss": 3.1358, + "step": 1965 + }, + { + "epoch": 0.13384970784073924, + "grad_norm": 7.793081283569336, + "learning_rate": 9.832942655252074e-06, + "loss": 3.3839, + "step": 1970 + }, + { + "epoch": 0.134189427911401, + "grad_norm": 7.282268047332764, + "learning_rate": 9.832518005163745e-06, + "loss": 3.6166, + "step": 1975 + }, + { + "epoch": 0.13452914798206278, + "grad_norm": 8.259596824645996, + "learning_rate": 9.832093355075418e-06, + "loss": 3.4106, + "step": 1980 + }, + { + "epoch": 0.13486886805272455, + "grad_norm": 6.8006367683410645, + "learning_rate": 9.831668704987093e-06, + "loss": 3.5882, + "step": 1985 + }, + { + "epoch": 0.13520858812338632, + "grad_norm": 8.181178092956543, + "learning_rate": 9.831244054898764e-06, + "loss": 3.4914, + "step": 1990 + }, + { + "epoch": 0.13554830819404812, + "grad_norm": 6.864015579223633, + "learning_rate": 9.830819404810437e-06, + "loss": 3.5454, + "step": 1995 + }, + { + "epoch": 0.1358880282647099, + "grad_norm": 7.033741474151611, + "learning_rate": 9.83039475472211e-06, + "loss": 3.3774, + "step": 2000 + }, + { + "epoch": 0.13622774833537166, + "grad_norm": 8.049239158630371, + "learning_rate": 9.829970104633782e-06, + "loss": 3.3876, + "step": 2005 + }, + { + "epoch": 0.13656746840603343, + "grad_norm": 7.010183811187744, + "learning_rate": 9.829545454545455e-06, + "loss": 3.7103, + "step": 2010 + }, + { + "epoch": 0.1369071884766952, + "grad_norm": 5.812173366546631, + "learning_rate": 9.829120804457128e-06, + "loss": 3.4645, + "step": 2015 + }, + { + "epoch": 0.13724690854735697, + "grad_norm": 8.208044052124023, + "learning_rate": 9.8286961543688e-06, + "loss": 3.6919, + "step": 2020 + }, + { + "epoch": 0.13758662861801876, + "grad_norm": 6.561134338378906, + "learning_rate": 9.828271504280473e-06, + "loss": 3.3678, + "step": 2025 + }, + { + "epoch": 0.13792634868868053, + "grad_norm": 10.61146068572998, + "learning_rate": 9.827846854192146e-06, + "loss": 3.385, + "step": 2030 + }, + { + "epoch": 0.1382660687593423, + "grad_norm": 7.285191535949707, + "learning_rate": 9.827422204103819e-06, + "loss": 3.7104, + "step": 2035 + }, + { + "epoch": 0.13860578883000407, + "grad_norm": 5.590353012084961, + "learning_rate": 9.826997554015492e-06, + "loss": 3.5673, + "step": 2040 + }, + { + "epoch": 0.13894550890066584, + "grad_norm": 5.936677932739258, + "learning_rate": 9.826572903927165e-06, + "loss": 3.4752, + "step": 2045 + }, + { + "epoch": 0.1392852289713276, + "grad_norm": 6.939801216125488, + "learning_rate": 9.826148253838837e-06, + "loss": 3.4617, + "step": 2050 + }, + { + "epoch": 0.1396249490419894, + "grad_norm": 7.499307632446289, + "learning_rate": 9.82572360375051e-06, + "loss": 3.5916, + "step": 2055 + }, + { + "epoch": 0.13996466911265118, + "grad_norm": 6.273457050323486, + "learning_rate": 9.825298953662183e-06, + "loss": 3.6184, + "step": 2060 + }, + { + "epoch": 0.14030438918331295, + "grad_norm": 8.047819137573242, + "learning_rate": 9.824874303573856e-06, + "loss": 3.6126, + "step": 2065 + }, + { + "epoch": 0.14064410925397472, + "grad_norm": 7.994832515716553, + "learning_rate": 9.824449653485529e-06, + "loss": 3.4483, + "step": 2070 + }, + { + "epoch": 0.1409838293246365, + "grad_norm": 7.81788969039917, + "learning_rate": 9.824025003397201e-06, + "loss": 3.3324, + "step": 2075 + }, + { + "epoch": 0.1413235493952983, + "grad_norm": 5.837959289550781, + "learning_rate": 9.823600353308874e-06, + "loss": 3.5176, + "step": 2080 + }, + { + "epoch": 0.14166326946596006, + "grad_norm": 7.596682548522949, + "learning_rate": 9.823175703220547e-06, + "loss": 3.5744, + "step": 2085 + }, + { + "epoch": 0.14200298953662183, + "grad_norm": 7.238300323486328, + "learning_rate": 9.82275105313222e-06, + "loss": 3.4583, + "step": 2090 + }, + { + "epoch": 0.1423427096072836, + "grad_norm": 6.0559797286987305, + "learning_rate": 9.822326403043893e-06, + "loss": 3.5692, + "step": 2095 + }, + { + "epoch": 0.14268242967794537, + "grad_norm": 6.484986305236816, + "learning_rate": 9.821901752955565e-06, + "loss": 3.4298, + "step": 2100 + }, + { + "epoch": 0.14302214974860714, + "grad_norm": 7.064390182495117, + "learning_rate": 9.821477102867238e-06, + "loss": 3.5576, + "step": 2105 + }, + { + "epoch": 0.14336186981926893, + "grad_norm": 6.44007682800293, + "learning_rate": 9.821052452778911e-06, + "loss": 3.4566, + "step": 2110 + }, + { + "epoch": 0.1437015898899307, + "grad_norm": 6.2416510581970215, + "learning_rate": 9.820627802690584e-06, + "loss": 3.134, + "step": 2115 + }, + { + "epoch": 0.14404130996059247, + "grad_norm": 6.978240013122559, + "learning_rate": 9.820203152602257e-06, + "loss": 3.6185, + "step": 2120 + }, + { + "epoch": 0.14438103003125424, + "grad_norm": 8.49980640411377, + "learning_rate": 9.81977850251393e-06, + "loss": 3.512, + "step": 2125 + }, + { + "epoch": 0.144720750101916, + "grad_norm": 8.492128372192383, + "learning_rate": 9.819353852425602e-06, + "loss": 3.4927, + "step": 2130 + }, + { + "epoch": 0.14506047017257778, + "grad_norm": 9.024857521057129, + "learning_rate": 9.818929202337275e-06, + "loss": 3.4733, + "step": 2135 + }, + { + "epoch": 0.14540019024323958, + "grad_norm": 6.397353172302246, + "learning_rate": 9.818504552248948e-06, + "loss": 3.4296, + "step": 2140 + }, + { + "epoch": 0.14573991031390135, + "grad_norm": 6.261580944061279, + "learning_rate": 9.81807990216062e-06, + "loss": 3.5087, + "step": 2145 + }, + { + "epoch": 0.14607963038456312, + "grad_norm": 7.803206443786621, + "learning_rate": 9.817655252072293e-06, + "loss": 3.3146, + "step": 2150 + }, + { + "epoch": 0.1464193504552249, + "grad_norm": 7.003575801849365, + "learning_rate": 9.817230601983966e-06, + "loss": 3.9774, + "step": 2155 + }, + { + "epoch": 0.14675907052588666, + "grad_norm": 6.811065196990967, + "learning_rate": 9.816805951895639e-06, + "loss": 3.4883, + "step": 2160 + }, + { + "epoch": 0.14709879059654846, + "grad_norm": 9.689181327819824, + "learning_rate": 9.816381301807312e-06, + "loss": 3.4139, + "step": 2165 + }, + { + "epoch": 0.14743851066721023, + "grad_norm": 7.833205223083496, + "learning_rate": 9.815956651718985e-06, + "loss": 3.3531, + "step": 2170 + }, + { + "epoch": 0.147778230737872, + "grad_norm": 7.642338275909424, + "learning_rate": 9.815532001630657e-06, + "loss": 3.6652, + "step": 2175 + }, + { + "epoch": 0.14811795080853377, + "grad_norm": 6.633028030395508, + "learning_rate": 9.81510735154233e-06, + "loss": 3.4481, + "step": 2180 + }, + { + "epoch": 0.14845767087919554, + "grad_norm": 7.202051639556885, + "learning_rate": 9.814682701454003e-06, + "loss": 3.4946, + "step": 2185 + }, + { + "epoch": 0.1487973909498573, + "grad_norm": 7.616193771362305, + "learning_rate": 9.814258051365676e-06, + "loss": 3.5139, + "step": 2190 + }, + { + "epoch": 0.1491371110205191, + "grad_norm": 7.073561191558838, + "learning_rate": 9.813833401277349e-06, + "loss": 3.4047, + "step": 2195 + }, + { + "epoch": 0.14947683109118087, + "grad_norm": 8.187318801879883, + "learning_rate": 9.813408751189021e-06, + "loss": 3.5337, + "step": 2200 + }, + { + "epoch": 0.14981655116184264, + "grad_norm": 6.957797050476074, + "learning_rate": 9.812984101100694e-06, + "loss": 3.5097, + "step": 2205 + }, + { + "epoch": 0.1501562712325044, + "grad_norm": 6.667513370513916, + "learning_rate": 9.812559451012367e-06, + "loss": 3.6616, + "step": 2210 + }, + { + "epoch": 0.15049599130316618, + "grad_norm": 6.668028354644775, + "learning_rate": 9.81213480092404e-06, + "loss": 3.5718, + "step": 2215 + }, + { + "epoch": 0.15083571137382795, + "grad_norm": 7.337394714355469, + "learning_rate": 9.811710150835713e-06, + "loss": 3.5353, + "step": 2220 + }, + { + "epoch": 0.15117543144448975, + "grad_norm": 6.892975807189941, + "learning_rate": 9.811285500747385e-06, + "loss": 3.7648, + "step": 2225 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 8.148287773132324, + "learning_rate": 9.810860850659058e-06, + "loss": 3.3444, + "step": 2230 + }, + { + "epoch": 0.1518548715858133, + "grad_norm": 6.57051420211792, + "learning_rate": 9.810436200570731e-06, + "loss": 3.6379, + "step": 2235 + }, + { + "epoch": 0.15219459165647506, + "grad_norm": 6.102906703948975, + "learning_rate": 9.810011550482404e-06, + "loss": 3.6068, + "step": 2240 + }, + { + "epoch": 0.15253431172713683, + "grad_norm": 5.345434665679932, + "learning_rate": 9.809586900394077e-06, + "loss": 3.6004, + "step": 2245 + }, + { + "epoch": 0.15287403179779863, + "grad_norm": 7.73962926864624, + "learning_rate": 9.80916225030575e-06, + "loss": 3.2818, + "step": 2250 + }, + { + "epoch": 0.1532137518684604, + "grad_norm": 6.963854789733887, + "learning_rate": 9.808737600217422e-06, + "loss": 3.5311, + "step": 2255 + }, + { + "epoch": 0.15355347193912217, + "grad_norm": 9.03022575378418, + "learning_rate": 9.808312950129095e-06, + "loss": 3.7257, + "step": 2260 + }, + { + "epoch": 0.15389319200978394, + "grad_norm": 5.974201202392578, + "learning_rate": 9.807888300040768e-06, + "loss": 3.4682, + "step": 2265 + }, + { + "epoch": 0.1542329120804457, + "grad_norm": 6.743483066558838, + "learning_rate": 9.80746364995244e-06, + "loss": 3.5528, + "step": 2270 + }, + { + "epoch": 0.15457263215110748, + "grad_norm": 8.01964282989502, + "learning_rate": 9.807038999864112e-06, + "loss": 3.4091, + "step": 2275 + }, + { + "epoch": 0.15491235222176927, + "grad_norm": 8.52920913696289, + "learning_rate": 9.806614349775786e-06, + "loss": 3.3017, + "step": 2280 + }, + { + "epoch": 0.15525207229243104, + "grad_norm": 7.722394943237305, + "learning_rate": 9.806189699687459e-06, + "loss": 3.3579, + "step": 2285 + }, + { + "epoch": 0.1555917923630928, + "grad_norm": 7.064719200134277, + "learning_rate": 9.80576504959913e-06, + "loss": 3.4021, + "step": 2290 + }, + { + "epoch": 0.15593151243375458, + "grad_norm": 8.010902404785156, + "learning_rate": 9.805340399510805e-06, + "loss": 3.4062, + "step": 2295 + }, + { + "epoch": 0.15627123250441635, + "grad_norm": 8.702544212341309, + "learning_rate": 9.804915749422477e-06, + "loss": 3.4079, + "step": 2300 + }, + { + "epoch": 0.15661095257507815, + "grad_norm": 7.2567667961120605, + "learning_rate": 9.804491099334148e-06, + "loss": 3.6028, + "step": 2305 + }, + { + "epoch": 0.15695067264573992, + "grad_norm": 7.457847595214844, + "learning_rate": 9.804066449245823e-06, + "loss": 3.2941, + "step": 2310 + }, + { + "epoch": 0.1572903927164017, + "grad_norm": 6.674655914306641, + "learning_rate": 9.803641799157496e-06, + "loss": 3.5489, + "step": 2315 + }, + { + "epoch": 0.15763011278706346, + "grad_norm": 7.362298488616943, + "learning_rate": 9.803217149069167e-06, + "loss": 3.4461, + "step": 2320 + }, + { + "epoch": 0.15796983285772523, + "grad_norm": 7.722926616668701, + "learning_rate": 9.802792498980841e-06, + "loss": 3.598, + "step": 2325 + }, + { + "epoch": 0.158309552928387, + "grad_norm": 5.6898651123046875, + "learning_rate": 9.802367848892514e-06, + "loss": 3.3707, + "step": 2330 + }, + { + "epoch": 0.1586492729990488, + "grad_norm": 6.514618873596191, + "learning_rate": 9.801943198804185e-06, + "loss": 3.4818, + "step": 2335 + }, + { + "epoch": 0.15898899306971057, + "grad_norm": 6.611716270446777, + "learning_rate": 9.80151854871586e-06, + "loss": 3.5426, + "step": 2340 + }, + { + "epoch": 0.15932871314037234, + "grad_norm": 6.740464210510254, + "learning_rate": 9.801093898627531e-06, + "loss": 3.4804, + "step": 2345 + }, + { + "epoch": 0.1596684332110341, + "grad_norm": 8.664388656616211, + "learning_rate": 9.800669248539204e-06, + "loss": 3.7327, + "step": 2350 + }, + { + "epoch": 0.16000815328169588, + "grad_norm": 5.999797344207764, + "learning_rate": 9.800244598450878e-06, + "loss": 3.5553, + "step": 2355 + }, + { + "epoch": 0.16034787335235764, + "grad_norm": 6.7360734939575195, + "learning_rate": 9.79981994836255e-06, + "loss": 3.5778, + "step": 2360 + }, + { + "epoch": 0.16068759342301944, + "grad_norm": 6.086040496826172, + "learning_rate": 9.799395298274222e-06, + "loss": 3.2117, + "step": 2365 + }, + { + "epoch": 0.1610273134936812, + "grad_norm": 7.762253761291504, + "learning_rate": 9.798970648185897e-06, + "loss": 3.4565, + "step": 2370 + }, + { + "epoch": 0.16136703356434298, + "grad_norm": 6.458375453948975, + "learning_rate": 9.798545998097568e-06, + "loss": 3.5104, + "step": 2375 + }, + { + "epoch": 0.16170675363500475, + "grad_norm": 8.345690727233887, + "learning_rate": 9.79812134800924e-06, + "loss": 3.4432, + "step": 2380 + }, + { + "epoch": 0.16204647370566652, + "grad_norm": 6.4412360191345215, + "learning_rate": 9.797696697920915e-06, + "loss": 3.4182, + "step": 2385 + }, + { + "epoch": 0.16238619377632832, + "grad_norm": 7.601959228515625, + "learning_rate": 9.797272047832586e-06, + "loss": 3.5197, + "step": 2390 + }, + { + "epoch": 0.1627259138469901, + "grad_norm": 9.051590919494629, + "learning_rate": 9.796847397744259e-06, + "loss": 3.3421, + "step": 2395 + }, + { + "epoch": 0.16306563391765186, + "grad_norm": 6.561274528503418, + "learning_rate": 9.796422747655933e-06, + "loss": 3.5441, + "step": 2400 + }, + { + "epoch": 0.16340535398831363, + "grad_norm": 6.0623016357421875, + "learning_rate": 9.795998097567604e-06, + "loss": 3.2551, + "step": 2405 + }, + { + "epoch": 0.1637450740589754, + "grad_norm": 5.422914981842041, + "learning_rate": 9.795573447479277e-06, + "loss": 3.6544, + "step": 2410 + }, + { + "epoch": 0.16408479412963717, + "grad_norm": 7.116767406463623, + "learning_rate": 9.79514879739095e-06, + "loss": 3.6124, + "step": 2415 + }, + { + "epoch": 0.16442451420029897, + "grad_norm": 10.289285659790039, + "learning_rate": 9.794724147302623e-06, + "loss": 3.4473, + "step": 2420 + }, + { + "epoch": 0.16476423427096074, + "grad_norm": 6.803409099578857, + "learning_rate": 9.794299497214296e-06, + "loss": 3.3722, + "step": 2425 + }, + { + "epoch": 0.1651039543416225, + "grad_norm": 8.176136016845703, + "learning_rate": 9.793874847125968e-06, + "loss": 3.5901, + "step": 2430 + }, + { + "epoch": 0.16544367441228428, + "grad_norm": 6.310689449310303, + "learning_rate": 9.793450197037641e-06, + "loss": 3.6228, + "step": 2435 + }, + { + "epoch": 0.16578339448294604, + "grad_norm": 6.9548797607421875, + "learning_rate": 9.793025546949314e-06, + "loss": 3.528, + "step": 2440 + }, + { + "epoch": 0.16612311455360781, + "grad_norm": 8.800976753234863, + "learning_rate": 9.792600896860987e-06, + "loss": 3.5231, + "step": 2445 + }, + { + "epoch": 0.1664628346242696, + "grad_norm": 6.720234394073486, + "learning_rate": 9.79217624677266e-06, + "loss": 3.4355, + "step": 2450 + }, + { + "epoch": 0.16680255469493138, + "grad_norm": 9.701380729675293, + "learning_rate": 9.791751596684332e-06, + "loss": 3.5312, + "step": 2455 + }, + { + "epoch": 0.16714227476559315, + "grad_norm": 7.8971476554870605, + "learning_rate": 9.791326946596005e-06, + "loss": 3.4903, + "step": 2460 + }, + { + "epoch": 0.16748199483625492, + "grad_norm": 7.191115379333496, + "learning_rate": 9.790902296507678e-06, + "loss": 3.455, + "step": 2465 + }, + { + "epoch": 0.1678217149069167, + "grad_norm": 9.365320205688477, + "learning_rate": 9.790477646419351e-06, + "loss": 3.6308, + "step": 2470 + }, + { + "epoch": 0.1681614349775785, + "grad_norm": 10.12504768371582, + "learning_rate": 9.790052996331024e-06, + "loss": 3.5636, + "step": 2475 + }, + { + "epoch": 0.16850115504824026, + "grad_norm": 6.994856357574463, + "learning_rate": 9.789628346242696e-06, + "loss": 3.5596, + "step": 2480 + }, + { + "epoch": 0.16884087511890203, + "grad_norm": 6.49503231048584, + "learning_rate": 9.78920369615437e-06, + "loss": 3.5399, + "step": 2485 + }, + { + "epoch": 0.1691805951895638, + "grad_norm": 5.578256607055664, + "learning_rate": 9.788779046066042e-06, + "loss": 3.4269, + "step": 2490 + }, + { + "epoch": 0.16952031526022557, + "grad_norm": 7.47146463394165, + "learning_rate": 9.788354395977715e-06, + "loss": 3.5435, + "step": 2495 + }, + { + "epoch": 0.16986003533088734, + "grad_norm": 5.874685764312744, + "learning_rate": 9.787929745889388e-06, + "loss": 3.6482, + "step": 2500 + }, + { + "epoch": 0.17019975540154914, + "grad_norm": 6.944308757781982, + "learning_rate": 9.78750509580106e-06, + "loss": 3.6272, + "step": 2505 + }, + { + "epoch": 0.1705394754722109, + "grad_norm": 6.173206806182861, + "learning_rate": 9.787080445712733e-06, + "loss": 3.665, + "step": 2510 + }, + { + "epoch": 0.17087919554287267, + "grad_norm": 6.115314960479736, + "learning_rate": 9.786655795624406e-06, + "loss": 3.5507, + "step": 2515 + }, + { + "epoch": 0.17121891561353444, + "grad_norm": 5.7783942222595215, + "learning_rate": 9.786231145536079e-06, + "loss": 3.6776, + "step": 2520 + }, + { + "epoch": 0.17155863568419621, + "grad_norm": 6.357859134674072, + "learning_rate": 9.785806495447752e-06, + "loss": 3.6286, + "step": 2525 + }, + { + "epoch": 0.17189835575485798, + "grad_norm": 6.963959217071533, + "learning_rate": 9.785381845359425e-06, + "loss": 3.4406, + "step": 2530 + }, + { + "epoch": 0.17223807582551978, + "grad_norm": 7.687249660491943, + "learning_rate": 9.784957195271097e-06, + "loss": 3.6807, + "step": 2535 + }, + { + "epoch": 0.17257779589618155, + "grad_norm": 7.113305568695068, + "learning_rate": 9.78453254518277e-06, + "loss": 3.3087, + "step": 2540 + }, + { + "epoch": 0.17291751596684332, + "grad_norm": 8.57931900024414, + "learning_rate": 9.784107895094443e-06, + "loss": 3.4052, + "step": 2545 + }, + { + "epoch": 0.1732572360375051, + "grad_norm": 6.150431156158447, + "learning_rate": 9.783683245006116e-06, + "loss": 3.6067, + "step": 2550 + }, + { + "epoch": 0.17359695610816686, + "grad_norm": 6.489128589630127, + "learning_rate": 9.783258594917789e-06, + "loss": 3.3257, + "step": 2555 + }, + { + "epoch": 0.17393667617882866, + "grad_norm": 6.683917999267578, + "learning_rate": 9.782833944829461e-06, + "loss": 3.2706, + "step": 2560 + }, + { + "epoch": 0.17427639624949043, + "grad_norm": 9.306288719177246, + "learning_rate": 9.782409294741134e-06, + "loss": 3.4414, + "step": 2565 + }, + { + "epoch": 0.1746161163201522, + "grad_norm": 7.766913890838623, + "learning_rate": 9.781984644652807e-06, + "loss": 3.5135, + "step": 2570 + }, + { + "epoch": 0.17495583639081397, + "grad_norm": 7.26235294342041, + "learning_rate": 9.78155999456448e-06, + "loss": 3.4868, + "step": 2575 + }, + { + "epoch": 0.17529555646147574, + "grad_norm": 8.546307563781738, + "learning_rate": 9.781135344476153e-06, + "loss": 3.6152, + "step": 2580 + }, + { + "epoch": 0.1756352765321375, + "grad_norm": 6.145389080047607, + "learning_rate": 9.780710694387825e-06, + "loss": 3.3074, + "step": 2585 + }, + { + "epoch": 0.1759749966027993, + "grad_norm": 6.230631351470947, + "learning_rate": 9.780286044299498e-06, + "loss": 3.326, + "step": 2590 + }, + { + "epoch": 0.17631471667346107, + "grad_norm": 7.887233734130859, + "learning_rate": 9.779861394211171e-06, + "loss": 3.4685, + "step": 2595 + }, + { + "epoch": 0.17665443674412284, + "grad_norm": 7.809077262878418, + "learning_rate": 9.779436744122844e-06, + "loss": 3.3038, + "step": 2600 + }, + { + "epoch": 0.17699415681478461, + "grad_norm": 8.763053894042969, + "learning_rate": 9.779012094034517e-06, + "loss": 3.7587, + "step": 2605 + }, + { + "epoch": 0.17733387688544638, + "grad_norm": 7.339809894561768, + "learning_rate": 9.77858744394619e-06, + "loss": 3.5308, + "step": 2610 + }, + { + "epoch": 0.17767359695610815, + "grad_norm": 6.3528642654418945, + "learning_rate": 9.778162793857862e-06, + "loss": 3.1666, + "step": 2615 + }, + { + "epoch": 0.17801331702676995, + "grad_norm": 6.8632097244262695, + "learning_rate": 9.777738143769535e-06, + "loss": 3.5021, + "step": 2620 + }, + { + "epoch": 0.17835303709743172, + "grad_norm": 8.166184425354004, + "learning_rate": 9.777313493681208e-06, + "loss": 3.4387, + "step": 2625 + }, + { + "epoch": 0.1786927571680935, + "grad_norm": 6.770681381225586, + "learning_rate": 9.77688884359288e-06, + "loss": 3.3337, + "step": 2630 + }, + { + "epoch": 0.17903247723875526, + "grad_norm": 6.252292633056641, + "learning_rate": 9.776464193504553e-06, + "loss": 3.3185, + "step": 2635 + }, + { + "epoch": 0.17937219730941703, + "grad_norm": 6.807849884033203, + "learning_rate": 9.776039543416226e-06, + "loss": 3.5203, + "step": 2640 + }, + { + "epoch": 0.17971191738007883, + "grad_norm": 6.1453704833984375, + "learning_rate": 9.775614893327899e-06, + "loss": 3.5747, + "step": 2645 + }, + { + "epoch": 0.1800516374507406, + "grad_norm": 7.057030200958252, + "learning_rate": 9.775190243239572e-06, + "loss": 3.7238, + "step": 2650 + }, + { + "epoch": 0.18039135752140237, + "grad_norm": 7.492882251739502, + "learning_rate": 9.774765593151245e-06, + "loss": 3.5132, + "step": 2655 + }, + { + "epoch": 0.18073107759206414, + "grad_norm": 6.075038909912109, + "learning_rate": 9.774340943062917e-06, + "loss": 3.3747, + "step": 2660 + }, + { + "epoch": 0.1810707976627259, + "grad_norm": 7.322343826293945, + "learning_rate": 9.77391629297459e-06, + "loss": 3.5743, + "step": 2665 + }, + { + "epoch": 0.18141051773338768, + "grad_norm": 7.301365852355957, + "learning_rate": 9.773491642886263e-06, + "loss": 3.4226, + "step": 2670 + }, + { + "epoch": 0.18175023780404947, + "grad_norm": 8.92799186706543, + "learning_rate": 9.773066992797936e-06, + "loss": 3.4108, + "step": 2675 + }, + { + "epoch": 0.18208995787471124, + "grad_norm": 7.992076873779297, + "learning_rate": 9.772642342709609e-06, + "loss": 3.5573, + "step": 2680 + }, + { + "epoch": 0.18242967794537301, + "grad_norm": 5.345090866088867, + "learning_rate": 9.772217692621281e-06, + "loss": 3.3598, + "step": 2685 + }, + { + "epoch": 0.18276939801603478, + "grad_norm": 6.904548168182373, + "learning_rate": 9.771793042532952e-06, + "loss": 3.4978, + "step": 2690 + }, + { + "epoch": 0.18310911808669655, + "grad_norm": 6.71835994720459, + "learning_rate": 9.771368392444627e-06, + "loss": 3.1183, + "step": 2695 + }, + { + "epoch": 0.18344883815735832, + "grad_norm": 7.514228820800781, + "learning_rate": 9.7709437423563e-06, + "loss": 3.5857, + "step": 2700 + }, + { + "epoch": 0.18378855822802012, + "grad_norm": 7.541950702667236, + "learning_rate": 9.77051909226797e-06, + "loss": 3.4695, + "step": 2705 + }, + { + "epoch": 0.1841282782986819, + "grad_norm": 7.119455337524414, + "learning_rate": 9.770094442179645e-06, + "loss": 3.3707, + "step": 2710 + }, + { + "epoch": 0.18446799836934366, + "grad_norm": 7.159289360046387, + "learning_rate": 9.769669792091318e-06, + "loss": 3.5713, + "step": 2715 + }, + { + "epoch": 0.18480771844000543, + "grad_norm": 8.385406494140625, + "learning_rate": 9.76924514200299e-06, + "loss": 3.4378, + "step": 2720 + }, + { + "epoch": 0.1851474385106672, + "grad_norm": 7.156319618225098, + "learning_rate": 9.768820491914664e-06, + "loss": 3.5694, + "step": 2725 + }, + { + "epoch": 0.185487158581329, + "grad_norm": 7.021224498748779, + "learning_rate": 9.768395841826337e-06, + "loss": 3.6473, + "step": 2730 + }, + { + "epoch": 0.18582687865199077, + "grad_norm": 7.289370059967041, + "learning_rate": 9.767971191738008e-06, + "loss": 3.6197, + "step": 2735 + }, + { + "epoch": 0.18616659872265254, + "grad_norm": 5.963705062866211, + "learning_rate": 9.767546541649682e-06, + "loss": 3.2909, + "step": 2740 + }, + { + "epoch": 0.1865063187933143, + "grad_norm": 5.463754177093506, + "learning_rate": 9.767121891561355e-06, + "loss": 3.3915, + "step": 2745 + }, + { + "epoch": 0.18684603886397608, + "grad_norm": 8.008318901062012, + "learning_rate": 9.766697241473026e-06, + "loss": 3.451, + "step": 2750 + }, + { + "epoch": 0.18718575893463785, + "grad_norm": 7.375139236450195, + "learning_rate": 9.7662725913847e-06, + "loss": 3.6826, + "step": 2755 + }, + { + "epoch": 0.18752547900529964, + "grad_norm": 6.712100982666016, + "learning_rate": 9.765847941296372e-06, + "loss": 3.4852, + "step": 2760 + }, + { + "epoch": 0.18786519907596141, + "grad_norm": 5.8798604011535645, + "learning_rate": 9.765423291208044e-06, + "loss": 3.5413, + "step": 2765 + }, + { + "epoch": 0.18820491914662318, + "grad_norm": 5.199516296386719, + "learning_rate": 9.764998641119719e-06, + "loss": 3.4522, + "step": 2770 + }, + { + "epoch": 0.18854463921728495, + "grad_norm": 7.559567928314209, + "learning_rate": 9.76457399103139e-06, + "loss": 3.57, + "step": 2775 + }, + { + "epoch": 0.18888435928794672, + "grad_norm": 5.8801164627075195, + "learning_rate": 9.764149340943063e-06, + "loss": 3.4093, + "step": 2780 + }, + { + "epoch": 0.1892240793586085, + "grad_norm": 7.029899597167969, + "learning_rate": 9.763724690854737e-06, + "loss": 3.2811, + "step": 2785 + }, + { + "epoch": 0.1895637994292703, + "grad_norm": 7.359169960021973, + "learning_rate": 9.763300040766408e-06, + "loss": 3.4624, + "step": 2790 + }, + { + "epoch": 0.18990351949993206, + "grad_norm": 7.547095775604248, + "learning_rate": 9.762875390678081e-06, + "loss": 3.3824, + "step": 2795 + }, + { + "epoch": 0.19024323957059383, + "grad_norm": 8.064611434936523, + "learning_rate": 9.762450740589756e-06, + "loss": 3.4014, + "step": 2800 + }, + { + "epoch": 0.1905829596412556, + "grad_norm": 8.17822265625, + "learning_rate": 9.762026090501427e-06, + "loss": 3.4902, + "step": 2805 + }, + { + "epoch": 0.19092267971191737, + "grad_norm": 6.4243903160095215, + "learning_rate": 9.7616014404131e-06, + "loss": 3.0667, + "step": 2810 + }, + { + "epoch": 0.19126239978257917, + "grad_norm": 9.075425148010254, + "learning_rate": 9.761176790324774e-06, + "loss": 3.499, + "step": 2815 + }, + { + "epoch": 0.19160211985324094, + "grad_norm": 7.238821029663086, + "learning_rate": 9.760752140236445e-06, + "loss": 3.6799, + "step": 2820 + }, + { + "epoch": 0.1919418399239027, + "grad_norm": 5.458573341369629, + "learning_rate": 9.760327490148118e-06, + "loss": 3.3564, + "step": 2825 + }, + { + "epoch": 0.19228155999456448, + "grad_norm": 5.926985740661621, + "learning_rate": 9.759902840059793e-06, + "loss": 3.6948, + "step": 2830 + }, + { + "epoch": 0.19262128006522625, + "grad_norm": 8.078461647033691, + "learning_rate": 9.759478189971464e-06, + "loss": 3.4176, + "step": 2835 + }, + { + "epoch": 0.19296100013588802, + "grad_norm": 7.210725784301758, + "learning_rate": 9.759053539883138e-06, + "loss": 3.5572, + "step": 2840 + }, + { + "epoch": 0.19330072020654981, + "grad_norm": 5.1557536125183105, + "learning_rate": 9.75862888979481e-06, + "loss": 3.5538, + "step": 2845 + }, + { + "epoch": 0.19364044027721158, + "grad_norm": 5.758269309997559, + "learning_rate": 9.758204239706482e-06, + "loss": 3.5288, + "step": 2850 + }, + { + "epoch": 0.19398016034787335, + "grad_norm": 8.442049980163574, + "learning_rate": 9.757779589618157e-06, + "loss": 3.6659, + "step": 2855 + }, + { + "epoch": 0.19431988041853512, + "grad_norm": 6.890397548675537, + "learning_rate": 9.757354939529828e-06, + "loss": 3.3607, + "step": 2860 + }, + { + "epoch": 0.1946596004891969, + "grad_norm": 9.447219848632812, + "learning_rate": 9.7569302894415e-06, + "loss": 3.4957, + "step": 2865 + }, + { + "epoch": 0.19499932055985866, + "grad_norm": 8.068108558654785, + "learning_rate": 9.756505639353175e-06, + "loss": 3.3596, + "step": 2870 + }, + { + "epoch": 0.19533904063052046, + "grad_norm": 9.034869194030762, + "learning_rate": 9.756080989264846e-06, + "loss": 3.3913, + "step": 2875 + }, + { + "epoch": 0.19567876070118223, + "grad_norm": 6.527540683746338, + "learning_rate": 9.755656339176519e-06, + "loss": 3.5211, + "step": 2880 + }, + { + "epoch": 0.196018480771844, + "grad_norm": 7.065912246704102, + "learning_rate": 9.755231689088193e-06, + "loss": 3.5018, + "step": 2885 + }, + { + "epoch": 0.19635820084250577, + "grad_norm": 5.76986837387085, + "learning_rate": 9.754807038999864e-06, + "loss": 2.9263, + "step": 2890 + }, + { + "epoch": 0.19669792091316754, + "grad_norm": 7.37658166885376, + "learning_rate": 9.754382388911537e-06, + "loss": 3.1762, + "step": 2895 + }, + { + "epoch": 0.19703764098382934, + "grad_norm": 8.46464729309082, + "learning_rate": 9.753957738823212e-06, + "loss": 3.4849, + "step": 2900 + }, + { + "epoch": 0.1973773610544911, + "grad_norm": 7.717998504638672, + "learning_rate": 9.753533088734883e-06, + "loss": 3.2757, + "step": 2905 + }, + { + "epoch": 0.19771708112515288, + "grad_norm": 9.244932174682617, + "learning_rate": 9.753108438646556e-06, + "loss": 3.4737, + "step": 2910 + }, + { + "epoch": 0.19805680119581465, + "grad_norm": 7.359454154968262, + "learning_rate": 9.752683788558228e-06, + "loss": 3.4533, + "step": 2915 + }, + { + "epoch": 0.19839652126647642, + "grad_norm": 7.558138370513916, + "learning_rate": 9.752259138469901e-06, + "loss": 3.4545, + "step": 2920 + }, + { + "epoch": 0.1987362413371382, + "grad_norm": 6.732647895812988, + "learning_rate": 9.751834488381574e-06, + "loss": 3.4461, + "step": 2925 + }, + { + "epoch": 0.19907596140779998, + "grad_norm": 8.631799697875977, + "learning_rate": 9.751409838293247e-06, + "loss": 3.2901, + "step": 2930 + }, + { + "epoch": 0.19941568147846175, + "grad_norm": 6.7108283042907715, + "learning_rate": 9.75098518820492e-06, + "loss": 3.6038, + "step": 2935 + }, + { + "epoch": 0.19975540154912352, + "grad_norm": 8.52261734008789, + "learning_rate": 9.750560538116592e-06, + "loss": 3.1894, + "step": 2940 + }, + { + "epoch": 0.2000951216197853, + "grad_norm": 7.784303665161133, + "learning_rate": 9.750135888028265e-06, + "loss": 3.5429, + "step": 2945 + }, + { + "epoch": 0.20043484169044706, + "grad_norm": 7.456640243530273, + "learning_rate": 9.749711237939938e-06, + "loss": 3.5781, + "step": 2950 + }, + { + "epoch": 0.20077456176110883, + "grad_norm": 6.3843770027160645, + "learning_rate": 9.749286587851611e-06, + "loss": 3.7564, + "step": 2955 + }, + { + "epoch": 0.20111428183177063, + "grad_norm": 7.8352766036987305, + "learning_rate": 9.748861937763284e-06, + "loss": 3.5891, + "step": 2960 + }, + { + "epoch": 0.2014540019024324, + "grad_norm": 5.513573169708252, + "learning_rate": 9.748437287674956e-06, + "loss": 3.4392, + "step": 2965 + }, + { + "epoch": 0.20179372197309417, + "grad_norm": 6.436010837554932, + "learning_rate": 9.74801263758663e-06, + "loss": 3.4841, + "step": 2970 + }, + { + "epoch": 0.20213344204375594, + "grad_norm": 7.682677745819092, + "learning_rate": 9.747587987498302e-06, + "loss": 3.4323, + "step": 2975 + }, + { + "epoch": 0.2024731621144177, + "grad_norm": 7.192296504974365, + "learning_rate": 9.747163337409975e-06, + "loss": 3.4905, + "step": 2980 + }, + { + "epoch": 0.2028128821850795, + "grad_norm": 7.386812686920166, + "learning_rate": 9.746738687321648e-06, + "loss": 3.3009, + "step": 2985 + }, + { + "epoch": 0.20315260225574128, + "grad_norm": 5.3180108070373535, + "learning_rate": 9.74631403723332e-06, + "loss": 3.3791, + "step": 2990 + }, + { + "epoch": 0.20349232232640305, + "grad_norm": 18.92714500427246, + "learning_rate": 9.745889387144993e-06, + "loss": 3.4879, + "step": 2995 + }, + { + "epoch": 0.20383204239706482, + "grad_norm": 5.708279609680176, + "learning_rate": 9.745464737056666e-06, + "loss": 3.4326, + "step": 3000 + }, + { + "epoch": 0.2041717624677266, + "grad_norm": 8.887340545654297, + "learning_rate": 9.745040086968339e-06, + "loss": 3.4798, + "step": 3005 + }, + { + "epoch": 0.20451148253838836, + "grad_norm": 6.230954170227051, + "learning_rate": 9.744615436880012e-06, + "loss": 3.549, + "step": 3010 + }, + { + "epoch": 0.20485120260905015, + "grad_norm": 6.709387302398682, + "learning_rate": 9.744190786791684e-06, + "loss": 3.5059, + "step": 3015 + }, + { + "epoch": 0.20519092267971192, + "grad_norm": 6.501018524169922, + "learning_rate": 9.743766136703357e-06, + "loss": 3.5379, + "step": 3020 + }, + { + "epoch": 0.2055306427503737, + "grad_norm": 7.153782844543457, + "learning_rate": 9.74334148661503e-06, + "loss": 3.5207, + "step": 3025 + }, + { + "epoch": 0.20587036282103546, + "grad_norm": 8.099502563476562, + "learning_rate": 9.742916836526703e-06, + "loss": 3.4858, + "step": 3030 + }, + { + "epoch": 0.20621008289169723, + "grad_norm": 5.976144790649414, + "learning_rate": 9.742492186438376e-06, + "loss": 3.3765, + "step": 3035 + }, + { + "epoch": 0.206549802962359, + "grad_norm": 7.955788612365723, + "learning_rate": 9.742067536350048e-06, + "loss": 3.5036, + "step": 3040 + }, + { + "epoch": 0.2068895230330208, + "grad_norm": 7.954986095428467, + "learning_rate": 9.741642886261721e-06, + "loss": 3.3945, + "step": 3045 + }, + { + "epoch": 0.20722924310368257, + "grad_norm": 6.915881633758545, + "learning_rate": 9.741218236173394e-06, + "loss": 3.4543, + "step": 3050 + }, + { + "epoch": 0.20756896317434434, + "grad_norm": 7.194121837615967, + "learning_rate": 9.740793586085067e-06, + "loss": 3.2191, + "step": 3055 + }, + { + "epoch": 0.2079086832450061, + "grad_norm": 8.101570129394531, + "learning_rate": 9.74036893599674e-06, + "loss": 3.4954, + "step": 3060 + }, + { + "epoch": 0.20824840331566788, + "grad_norm": 5.7727742195129395, + "learning_rate": 9.739944285908412e-06, + "loss": 3.2957, + "step": 3065 + }, + { + "epoch": 0.20858812338632968, + "grad_norm": 7.264947891235352, + "learning_rate": 9.739519635820085e-06, + "loss": 3.5736, + "step": 3070 + }, + { + "epoch": 0.20892784345699145, + "grad_norm": 6.088995933532715, + "learning_rate": 9.739094985731758e-06, + "loss": 3.3096, + "step": 3075 + }, + { + "epoch": 0.20926756352765322, + "grad_norm": 7.243954658508301, + "learning_rate": 9.738670335643431e-06, + "loss": 3.336, + "step": 3080 + }, + { + "epoch": 0.209607283598315, + "grad_norm": 8.162642478942871, + "learning_rate": 9.738245685555104e-06, + "loss": 3.6199, + "step": 3085 + }, + { + "epoch": 0.20994700366897676, + "grad_norm": 9.551087379455566, + "learning_rate": 9.737821035466776e-06, + "loss": 3.4686, + "step": 3090 + }, + { + "epoch": 0.21028672373963853, + "grad_norm": 6.85495138168335, + "learning_rate": 9.73739638537845e-06, + "loss": 3.601, + "step": 3095 + }, + { + "epoch": 0.21062644381030032, + "grad_norm": 7.865480422973633, + "learning_rate": 9.736971735290122e-06, + "loss": 3.5754, + "step": 3100 + }, + { + "epoch": 0.2109661638809621, + "grad_norm": 7.951822280883789, + "learning_rate": 9.736547085201793e-06, + "loss": 3.4274, + "step": 3105 + }, + { + "epoch": 0.21130588395162386, + "grad_norm": 6.941093921661377, + "learning_rate": 9.736122435113468e-06, + "loss": 3.6175, + "step": 3110 + }, + { + "epoch": 0.21164560402228563, + "grad_norm": 6.5775628089904785, + "learning_rate": 9.73569778502514e-06, + "loss": 3.5429, + "step": 3115 + }, + { + "epoch": 0.2119853240929474, + "grad_norm": 13.433586120605469, + "learning_rate": 9.735273134936812e-06, + "loss": 3.6244, + "step": 3120 + }, + { + "epoch": 0.21232504416360917, + "grad_norm": 7.790881633758545, + "learning_rate": 9.734848484848486e-06, + "loss": 3.1834, + "step": 3125 + }, + { + "epoch": 0.21266476423427097, + "grad_norm": 8.507579803466797, + "learning_rate": 9.734423834760159e-06, + "loss": 3.6328, + "step": 3130 + }, + { + "epoch": 0.21300448430493274, + "grad_norm": 7.313068389892578, + "learning_rate": 9.73399918467183e-06, + "loss": 3.4428, + "step": 3135 + }, + { + "epoch": 0.2133442043755945, + "grad_norm": 6.455645561218262, + "learning_rate": 9.733574534583505e-06, + "loss": 3.4054, + "step": 3140 + }, + { + "epoch": 0.21368392444625628, + "grad_norm": 8.315359115600586, + "learning_rate": 9.733149884495177e-06, + "loss": 3.5563, + "step": 3145 + }, + { + "epoch": 0.21402364451691805, + "grad_norm": 6.728188514709473, + "learning_rate": 9.732725234406848e-06, + "loss": 3.1374, + "step": 3150 + }, + { + "epoch": 0.21436336458757985, + "grad_norm": 8.63435173034668, + "learning_rate": 9.732300584318523e-06, + "loss": 3.5224, + "step": 3155 + }, + { + "epoch": 0.21470308465824162, + "grad_norm": 6.376506328582764, + "learning_rate": 9.731875934230196e-06, + "loss": 3.4412, + "step": 3160 + }, + { + "epoch": 0.2150428047289034, + "grad_norm": 5.453405857086182, + "learning_rate": 9.731451284141867e-06, + "loss": 3.4148, + "step": 3165 + }, + { + "epoch": 0.21538252479956516, + "grad_norm": 8.22853946685791, + "learning_rate": 9.731026634053541e-06, + "loss": 3.3538, + "step": 3170 + }, + { + "epoch": 0.21572224487022693, + "grad_norm": 6.458868980407715, + "learning_rate": 9.730601983965214e-06, + "loss": 3.3647, + "step": 3175 + }, + { + "epoch": 0.2160619649408887, + "grad_norm": 6.088205337524414, + "learning_rate": 9.730177333876887e-06, + "loss": 3.4268, + "step": 3180 + }, + { + "epoch": 0.2164016850115505, + "grad_norm": 7.250519752502441, + "learning_rate": 9.72975268378856e-06, + "loss": 3.5028, + "step": 3185 + }, + { + "epoch": 0.21674140508221226, + "grad_norm": 9.875602722167969, + "learning_rate": 9.72932803370023e-06, + "loss": 3.6987, + "step": 3190 + }, + { + "epoch": 0.21708112515287403, + "grad_norm": 6.27207612991333, + "learning_rate": 9.728903383611905e-06, + "loss": 3.344, + "step": 3195 + }, + { + "epoch": 0.2174208452235358, + "grad_norm": 7.279569625854492, + "learning_rate": 9.728478733523578e-06, + "loss": 3.5035, + "step": 3200 + }, + { + "epoch": 0.21776056529419757, + "grad_norm": 8.677027702331543, + "learning_rate": 9.72805408343525e-06, + "loss": 3.8383, + "step": 3205 + }, + { + "epoch": 0.21810028536485934, + "grad_norm": 7.225039958953857, + "learning_rate": 9.727629433346924e-06, + "loss": 3.5324, + "step": 3210 + }, + { + "epoch": 0.21844000543552114, + "grad_norm": 7.316394805908203, + "learning_rate": 9.727204783258597e-06, + "loss": 3.4971, + "step": 3215 + }, + { + "epoch": 0.2187797255061829, + "grad_norm": 6.9767374992370605, + "learning_rate": 9.726780133170268e-06, + "loss": 3.4568, + "step": 3220 + }, + { + "epoch": 0.21911944557684468, + "grad_norm": 6.876035690307617, + "learning_rate": 9.726355483081942e-06, + "loss": 3.2628, + "step": 3225 + }, + { + "epoch": 0.21945916564750645, + "grad_norm": 7.061302661895752, + "learning_rate": 9.725930832993615e-06, + "loss": 3.556, + "step": 3230 + }, + { + "epoch": 0.21979888571816822, + "grad_norm": 6.911355018615723, + "learning_rate": 9.725506182905286e-06, + "loss": 3.4594, + "step": 3235 + }, + { + "epoch": 0.22013860578883002, + "grad_norm": 7.234079360961914, + "learning_rate": 9.72508153281696e-06, + "loss": 3.4012, + "step": 3240 + }, + { + "epoch": 0.22047832585949179, + "grad_norm": 6.792803764343262, + "learning_rate": 9.724656882728633e-06, + "loss": 3.3563, + "step": 3245 + }, + { + "epoch": 0.22081804593015356, + "grad_norm": 6.9311347007751465, + "learning_rate": 9.724232232640304e-06, + "loss": 3.505, + "step": 3250 + }, + { + "epoch": 0.22115776600081533, + "grad_norm": 6.099791049957275, + "learning_rate": 9.723807582551979e-06, + "loss": 3.3817, + "step": 3255 + }, + { + "epoch": 0.2214974860714771, + "grad_norm": 8.093216896057129, + "learning_rate": 9.72338293246365e-06, + "loss": 3.4618, + "step": 3260 + }, + { + "epoch": 0.22183720614213887, + "grad_norm": 6.302989482879639, + "learning_rate": 9.722958282375323e-06, + "loss": 3.4407, + "step": 3265 + }, + { + "epoch": 0.22217692621280066, + "grad_norm": 8.674922943115234, + "learning_rate": 9.722533632286997e-06, + "loss": 3.5655, + "step": 3270 + }, + { + "epoch": 0.22251664628346243, + "grad_norm": 7.521174907684326, + "learning_rate": 9.722108982198668e-06, + "loss": 3.5689, + "step": 3275 + }, + { + "epoch": 0.2228563663541242, + "grad_norm": 5.650582313537598, + "learning_rate": 9.721684332110341e-06, + "loss": 3.2272, + "step": 3280 + }, + { + "epoch": 0.22319608642478597, + "grad_norm": 6.845421314239502, + "learning_rate": 9.721259682022016e-06, + "loss": 3.7068, + "step": 3285 + }, + { + "epoch": 0.22353580649544774, + "grad_norm": 9.42795467376709, + "learning_rate": 9.720835031933687e-06, + "loss": 3.4102, + "step": 3290 + }, + { + "epoch": 0.22387552656610954, + "grad_norm": 7.929673194885254, + "learning_rate": 9.72041038184536e-06, + "loss": 3.2234, + "step": 3295 + }, + { + "epoch": 0.2242152466367713, + "grad_norm": 6.419642925262451, + "learning_rate": 9.719985731757034e-06, + "loss": 3.3132, + "step": 3300 + }, + { + "epoch": 0.22455496670743308, + "grad_norm": 6.427025318145752, + "learning_rate": 9.719561081668705e-06, + "loss": 3.6293, + "step": 3305 + }, + { + "epoch": 0.22489468677809485, + "grad_norm": 6.57861328125, + "learning_rate": 9.719136431580378e-06, + "loss": 3.0758, + "step": 3310 + }, + { + "epoch": 0.22523440684875662, + "grad_norm": 7.92677640914917, + "learning_rate": 9.718711781492053e-06, + "loss": 3.3192, + "step": 3315 + }, + { + "epoch": 0.2255741269194184, + "grad_norm": 7.046933650970459, + "learning_rate": 9.718287131403724e-06, + "loss": 3.3592, + "step": 3320 + }, + { + "epoch": 0.22591384699008019, + "grad_norm": 6.839446067810059, + "learning_rate": 9.717862481315396e-06, + "loss": 3.199, + "step": 3325 + }, + { + "epoch": 0.22625356706074196, + "grad_norm": 6.0227837562561035, + "learning_rate": 9.71743783122707e-06, + "loss": 3.5183, + "step": 3330 + }, + { + "epoch": 0.22659328713140373, + "grad_norm": 5.918550968170166, + "learning_rate": 9.717013181138742e-06, + "loss": 3.3349, + "step": 3335 + }, + { + "epoch": 0.2269330072020655, + "grad_norm": 13.407699584960938, + "learning_rate": 9.716588531050415e-06, + "loss": 3.241, + "step": 3340 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 7.4301300048828125, + "learning_rate": 9.716163880962088e-06, + "loss": 3.4607, + "step": 3345 + }, + { + "epoch": 0.22761244734338903, + "grad_norm": 6.462761402130127, + "learning_rate": 9.71573923087376e-06, + "loss": 3.5148, + "step": 3350 + }, + { + "epoch": 0.22795216741405083, + "grad_norm": 6.5533623695373535, + "learning_rate": 9.715314580785433e-06, + "loss": 3.3934, + "step": 3355 + }, + { + "epoch": 0.2282918874847126, + "grad_norm": 5.383028030395508, + "learning_rate": 9.714889930697106e-06, + "loss": 3.4193, + "step": 3360 + }, + { + "epoch": 0.22863160755537437, + "grad_norm": 7.241731643676758, + "learning_rate": 9.714465280608779e-06, + "loss": 3.5986, + "step": 3365 + }, + { + "epoch": 0.22897132762603614, + "grad_norm": 7.838988304138184, + "learning_rate": 9.714040630520452e-06, + "loss": 3.4991, + "step": 3370 + }, + { + "epoch": 0.2293110476966979, + "grad_norm": 6.560708522796631, + "learning_rate": 9.713615980432124e-06, + "loss": 3.5743, + "step": 3375 + }, + { + "epoch": 0.2296507677673597, + "grad_norm": 7.450181007385254, + "learning_rate": 9.713191330343797e-06, + "loss": 3.3903, + "step": 3380 + }, + { + "epoch": 0.22999048783802148, + "grad_norm": 6.624258041381836, + "learning_rate": 9.71276668025547e-06, + "loss": 3.4454, + "step": 3385 + }, + { + "epoch": 0.23033020790868325, + "grad_norm": 5.97130823135376, + "learning_rate": 9.712342030167143e-06, + "loss": 3.3082, + "step": 3390 + }, + { + "epoch": 0.23066992797934502, + "grad_norm": 6.3514227867126465, + "learning_rate": 9.711917380078816e-06, + "loss": 3.4449, + "step": 3395 + }, + { + "epoch": 0.2310096480500068, + "grad_norm": 8.4389066696167, + "learning_rate": 9.711492729990488e-06, + "loss": 3.3538, + "step": 3400 + }, + { + "epoch": 0.23134936812066856, + "grad_norm": 7.235487937927246, + "learning_rate": 9.711068079902161e-06, + "loss": 3.5427, + "step": 3405 + }, + { + "epoch": 0.23168908819133036, + "grad_norm": 6.877317905426025, + "learning_rate": 9.710643429813834e-06, + "loss": 3.4258, + "step": 3410 + }, + { + "epoch": 0.23202880826199213, + "grad_norm": 8.087273597717285, + "learning_rate": 9.710218779725507e-06, + "loss": 3.4853, + "step": 3415 + }, + { + "epoch": 0.2323685283326539, + "grad_norm": 5.905575275421143, + "learning_rate": 9.70979412963718e-06, + "loss": 3.5596, + "step": 3420 + }, + { + "epoch": 0.23270824840331567, + "grad_norm": 7.127460479736328, + "learning_rate": 9.709369479548852e-06, + "loss": 3.4953, + "step": 3425 + }, + { + "epoch": 0.23304796847397743, + "grad_norm": 7.383110046386719, + "learning_rate": 9.708944829460525e-06, + "loss": 3.3763, + "step": 3430 + }, + { + "epoch": 0.2333876885446392, + "grad_norm": 5.464799880981445, + "learning_rate": 9.708520179372198e-06, + "loss": 3.5297, + "step": 3435 + }, + { + "epoch": 0.233727408615301, + "grad_norm": 8.000072479248047, + "learning_rate": 9.708095529283871e-06, + "loss": 3.2376, + "step": 3440 + }, + { + "epoch": 0.23406712868596277, + "grad_norm": 10.77569580078125, + "learning_rate": 9.707670879195544e-06, + "loss": 3.3376, + "step": 3445 + }, + { + "epoch": 0.23440684875662454, + "grad_norm": 7.124791145324707, + "learning_rate": 9.707246229107216e-06, + "loss": 3.3448, + "step": 3450 + }, + { + "epoch": 0.2347465688272863, + "grad_norm": 9.195862770080566, + "learning_rate": 9.70682157901889e-06, + "loss": 3.5044, + "step": 3455 + }, + { + "epoch": 0.23508628889794808, + "grad_norm": 7.608007907867432, + "learning_rate": 9.706396928930562e-06, + "loss": 3.462, + "step": 3460 + }, + { + "epoch": 0.23542600896860988, + "grad_norm": 6.174948215484619, + "learning_rate": 9.705972278842235e-06, + "loss": 3.6074, + "step": 3465 + }, + { + "epoch": 0.23576572903927165, + "grad_norm": 7.246816158294678, + "learning_rate": 9.705547628753908e-06, + "loss": 3.392, + "step": 3470 + }, + { + "epoch": 0.23610544910993342, + "grad_norm": 7.725343704223633, + "learning_rate": 9.70512297866558e-06, + "loss": 3.3809, + "step": 3475 + }, + { + "epoch": 0.2364451691805952, + "grad_norm": 9.242006301879883, + "learning_rate": 9.704698328577253e-06, + "loss": 3.5953, + "step": 3480 + }, + { + "epoch": 0.23678488925125696, + "grad_norm": 6.883312225341797, + "learning_rate": 9.704273678488926e-06, + "loss": 3.5185, + "step": 3485 + }, + { + "epoch": 0.23712460932191873, + "grad_norm": 6.610252857208252, + "learning_rate": 9.703849028400599e-06, + "loss": 3.2351, + "step": 3490 + }, + { + "epoch": 0.23746432939258053, + "grad_norm": 6.339471340179443, + "learning_rate": 9.703424378312272e-06, + "loss": 3.3279, + "step": 3495 + }, + { + "epoch": 0.2378040494632423, + "grad_norm": 7.4543023109436035, + "learning_rate": 9.702999728223944e-06, + "loss": 3.4781, + "step": 3500 + }, + { + "epoch": 0.23814376953390406, + "grad_norm": 8.208173751831055, + "learning_rate": 9.702575078135617e-06, + "loss": 3.5089, + "step": 3505 + }, + { + "epoch": 0.23848348960456583, + "grad_norm": 9.14156436920166, + "learning_rate": 9.70215042804729e-06, + "loss": 3.3799, + "step": 3510 + }, + { + "epoch": 0.2388232096752276, + "grad_norm": 7.22252082824707, + "learning_rate": 9.701725777958963e-06, + "loss": 3.44, + "step": 3515 + }, + { + "epoch": 0.23916292974588937, + "grad_norm": 6.6338791847229, + "learning_rate": 9.701301127870636e-06, + "loss": 3.3442, + "step": 3520 + }, + { + "epoch": 0.23950264981655117, + "grad_norm": 6.296628952026367, + "learning_rate": 9.700876477782308e-06, + "loss": 3.5242, + "step": 3525 + }, + { + "epoch": 0.23984236988721294, + "grad_norm": 7.077075004577637, + "learning_rate": 9.700451827693981e-06, + "loss": 3.4448, + "step": 3530 + }, + { + "epoch": 0.2401820899578747, + "grad_norm": 5.963624000549316, + "learning_rate": 9.700027177605654e-06, + "loss": 3.2535, + "step": 3535 + }, + { + "epoch": 0.24052181002853648, + "grad_norm": 6.92118501663208, + "learning_rate": 9.699602527517327e-06, + "loss": 3.4297, + "step": 3540 + }, + { + "epoch": 0.24086153009919825, + "grad_norm": 8.039560317993164, + "learning_rate": 9.699177877429e-06, + "loss": 3.5201, + "step": 3545 + }, + { + "epoch": 0.24120125016986005, + "grad_norm": 8.276659965515137, + "learning_rate": 9.698753227340672e-06, + "loss": 3.6389, + "step": 3550 + }, + { + "epoch": 0.24154097024052182, + "grad_norm": 6.893321514129639, + "learning_rate": 9.698328577252345e-06, + "loss": 3.6281, + "step": 3555 + }, + { + "epoch": 0.2418806903111836, + "grad_norm": 9.091200828552246, + "learning_rate": 9.697903927164018e-06, + "loss": 3.4077, + "step": 3560 + }, + { + "epoch": 0.24222041038184536, + "grad_norm": 6.61489200592041, + "learning_rate": 9.697479277075691e-06, + "loss": 3.3335, + "step": 3565 + }, + { + "epoch": 0.24256013045250713, + "grad_norm": 8.034381866455078, + "learning_rate": 9.697054626987364e-06, + "loss": 3.277, + "step": 3570 + }, + { + "epoch": 0.2428998505231689, + "grad_norm": 6.878505229949951, + "learning_rate": 9.696629976899036e-06, + "loss": 3.5193, + "step": 3575 + }, + { + "epoch": 0.2432395705938307, + "grad_norm": 6.8077497482299805, + "learning_rate": 9.69620532681071e-06, + "loss": 3.5544, + "step": 3580 + }, + { + "epoch": 0.24357929066449246, + "grad_norm": 6.321957111358643, + "learning_rate": 9.695780676722382e-06, + "loss": 3.5707, + "step": 3585 + }, + { + "epoch": 0.24391901073515423, + "grad_norm": 7.787632465362549, + "learning_rate": 9.695356026634055e-06, + "loss": 3.2599, + "step": 3590 + }, + { + "epoch": 0.244258730805816, + "grad_norm": 7.533718585968018, + "learning_rate": 9.694931376545728e-06, + "loss": 3.448, + "step": 3595 + }, + { + "epoch": 0.24459845087647777, + "grad_norm": 7.405473232269287, + "learning_rate": 9.6945067264574e-06, + "loss": 3.3395, + "step": 3600 + }, + { + "epoch": 0.24493817094713954, + "grad_norm": 7.8113484382629395, + "learning_rate": 9.694082076369072e-06, + "loss": 3.4167, + "step": 3605 + }, + { + "epoch": 0.24527789101780134, + "grad_norm": 8.911964416503906, + "learning_rate": 9.693657426280746e-06, + "loss": 3.4817, + "step": 3610 + }, + { + "epoch": 0.2456176110884631, + "grad_norm": 6.253965854644775, + "learning_rate": 9.693232776192419e-06, + "loss": 3.6465, + "step": 3615 + }, + { + "epoch": 0.24595733115912488, + "grad_norm": 6.061107158660889, + "learning_rate": 9.69280812610409e-06, + "loss": 3.2372, + "step": 3620 + }, + { + "epoch": 0.24629705122978665, + "grad_norm": 8.153894424438477, + "learning_rate": 9.692383476015764e-06, + "loss": 3.3647, + "step": 3625 + }, + { + "epoch": 0.24663677130044842, + "grad_norm": 6.801196098327637, + "learning_rate": 9.691958825927437e-06, + "loss": 3.3508, + "step": 3630 + }, + { + "epoch": 0.24697649137111022, + "grad_norm": 7.4579644203186035, + "learning_rate": 9.691534175839108e-06, + "loss": 3.5254, + "step": 3635 + }, + { + "epoch": 0.247316211441772, + "grad_norm": 7.39466667175293, + "learning_rate": 9.691109525750783e-06, + "loss": 3.5985, + "step": 3640 + }, + { + "epoch": 0.24765593151243376, + "grad_norm": 7.541428565979004, + "learning_rate": 9.690684875662456e-06, + "loss": 3.6556, + "step": 3645 + }, + { + "epoch": 0.24799565158309553, + "grad_norm": 6.450454235076904, + "learning_rate": 9.690260225574127e-06, + "loss": 3.4496, + "step": 3650 + }, + { + "epoch": 0.2483353716537573, + "grad_norm": 7.236602783203125, + "learning_rate": 9.689835575485801e-06, + "loss": 3.2354, + "step": 3655 + }, + { + "epoch": 0.24867509172441907, + "grad_norm": 7.249136447906494, + "learning_rate": 9.689410925397474e-06, + "loss": 3.3415, + "step": 3660 + }, + { + "epoch": 0.24901481179508086, + "grad_norm": 6.665775299072266, + "learning_rate": 9.688986275309145e-06, + "loss": 3.2683, + "step": 3665 + }, + { + "epoch": 0.24935453186574263, + "grad_norm": 7.980654716491699, + "learning_rate": 9.68856162522082e-06, + "loss": 3.5729, + "step": 3670 + }, + { + "epoch": 0.2496942519364044, + "grad_norm": 9.812528610229492, + "learning_rate": 9.68813697513249e-06, + "loss": 3.7454, + "step": 3675 + }, + { + "epoch": 0.2500339720070662, + "grad_norm": 6.531294822692871, + "learning_rate": 9.687712325044164e-06, + "loss": 3.4363, + "step": 3680 + }, + { + "epoch": 0.25037369207772797, + "grad_norm": 7.167244911193848, + "learning_rate": 9.687287674955838e-06, + "loss": 3.3577, + "step": 3685 + }, + { + "epoch": 0.2507134121483897, + "grad_norm": 6.28443717956543, + "learning_rate": 9.68686302486751e-06, + "loss": 3.3726, + "step": 3690 + }, + { + "epoch": 0.2510531322190515, + "grad_norm": 6.548681259155273, + "learning_rate": 9.686438374779182e-06, + "loss": 3.5489, + "step": 3695 + }, + { + "epoch": 0.25139285228971325, + "grad_norm": 8.7153902053833, + "learning_rate": 9.686013724690856e-06, + "loss": 3.5419, + "step": 3700 + }, + { + "epoch": 0.25173257236037505, + "grad_norm": 6.61867618560791, + "learning_rate": 9.685589074602528e-06, + "loss": 3.4761, + "step": 3705 + }, + { + "epoch": 0.25207229243103685, + "grad_norm": 7.908356666564941, + "learning_rate": 9.6851644245142e-06, + "loss": 3.4401, + "step": 3710 + }, + { + "epoch": 0.2524120125016986, + "grad_norm": 8.824913024902344, + "learning_rate": 9.684739774425875e-06, + "loss": 3.4308, + "step": 3715 + }, + { + "epoch": 0.2527517325723604, + "grad_norm": 5.812005043029785, + "learning_rate": 9.684315124337546e-06, + "loss": 3.4023, + "step": 3720 + }, + { + "epoch": 0.25309145264302213, + "grad_norm": 5.79753303527832, + "learning_rate": 9.683890474249219e-06, + "loss": 3.4072, + "step": 3725 + }, + { + "epoch": 0.2534311727136839, + "grad_norm": 8.313252449035645, + "learning_rate": 9.683465824160893e-06, + "loss": 3.6562, + "step": 3730 + }, + { + "epoch": 0.2537708927843457, + "grad_norm": 6.970767498016357, + "learning_rate": 9.683041174072564e-06, + "loss": 3.4079, + "step": 3735 + }, + { + "epoch": 0.25411061285500747, + "grad_norm": 7.474125385284424, + "learning_rate": 9.682616523984237e-06, + "loss": 3.4638, + "step": 3740 + }, + { + "epoch": 0.25445033292566926, + "grad_norm": 7.791403770446777, + "learning_rate": 9.682191873895912e-06, + "loss": 3.3152, + "step": 3745 + }, + { + "epoch": 0.254790052996331, + "grad_norm": 5.929944038391113, + "learning_rate": 9.681767223807583e-06, + "loss": 3.1769, + "step": 3750 + }, + { + "epoch": 0.2551297730669928, + "grad_norm": 8.052458763122559, + "learning_rate": 9.681342573719256e-06, + "loss": 3.549, + "step": 3755 + }, + { + "epoch": 0.25546949313765455, + "grad_norm": 6.957122325897217, + "learning_rate": 9.680917923630928e-06, + "loss": 3.443, + "step": 3760 + }, + { + "epoch": 0.25580921320831634, + "grad_norm": 5.341514587402344, + "learning_rate": 9.680493273542601e-06, + "loss": 3.3576, + "step": 3765 + }, + { + "epoch": 0.25614893327897814, + "grad_norm": 7.811539173126221, + "learning_rate": 9.680068623454274e-06, + "loss": 3.4194, + "step": 3770 + }, + { + "epoch": 0.2564886533496399, + "grad_norm": 6.209771156311035, + "learning_rate": 9.679643973365947e-06, + "loss": 3.3244, + "step": 3775 + }, + { + "epoch": 0.2568283734203017, + "grad_norm": 7.531528949737549, + "learning_rate": 9.67921932327762e-06, + "loss": 3.5785, + "step": 3780 + }, + { + "epoch": 0.2571680934909634, + "grad_norm": 6.3796491622924805, + "learning_rate": 9.678794673189292e-06, + "loss": 3.4424, + "step": 3785 + }, + { + "epoch": 0.2575078135616252, + "grad_norm": 6.597477436065674, + "learning_rate": 9.678370023100965e-06, + "loss": 3.5792, + "step": 3790 + }, + { + "epoch": 0.257847533632287, + "grad_norm": 7.920498371124268, + "learning_rate": 9.677945373012638e-06, + "loss": 3.4151, + "step": 3795 + }, + { + "epoch": 0.25818725370294876, + "grad_norm": 6.1240458488464355, + "learning_rate": 9.67752072292431e-06, + "loss": 3.3523, + "step": 3800 + }, + { + "epoch": 0.25852697377361056, + "grad_norm": 9.07082748413086, + "learning_rate": 9.677096072835984e-06, + "loss": 3.4051, + "step": 3805 + }, + { + "epoch": 0.2588666938442723, + "grad_norm": 8.085282325744629, + "learning_rate": 9.676671422747656e-06, + "loss": 3.3552, + "step": 3810 + }, + { + "epoch": 0.2592064139149341, + "grad_norm": 6.063992500305176, + "learning_rate": 9.67624677265933e-06, + "loss": 3.4305, + "step": 3815 + }, + { + "epoch": 0.2595461339855959, + "grad_norm": 8.446815490722656, + "learning_rate": 9.675822122571002e-06, + "loss": 3.2574, + "step": 3820 + }, + { + "epoch": 0.25988585405625764, + "grad_norm": 7.001699447631836, + "learning_rate": 9.675397472482675e-06, + "loss": 3.5829, + "step": 3825 + }, + { + "epoch": 0.26022557412691943, + "grad_norm": 9.10374927520752, + "learning_rate": 9.674972822394348e-06, + "loss": 3.2446, + "step": 3830 + }, + { + "epoch": 0.2605652941975812, + "grad_norm": 7.350986480712891, + "learning_rate": 9.67454817230602e-06, + "loss": 3.3826, + "step": 3835 + }, + { + "epoch": 0.260905014268243, + "grad_norm": 6.786753177642822, + "learning_rate": 9.674123522217693e-06, + "loss": 3.4599, + "step": 3840 + }, + { + "epoch": 0.2612447343389047, + "grad_norm": 6.64835786819458, + "learning_rate": 9.673698872129366e-06, + "loss": 3.6414, + "step": 3845 + }, + { + "epoch": 0.2615844544095665, + "grad_norm": 8.642499923706055, + "learning_rate": 9.673274222041039e-06, + "loss": 3.5445, + "step": 3850 + }, + { + "epoch": 0.2619241744802283, + "grad_norm": 7.521528720855713, + "learning_rate": 9.672849571952712e-06, + "loss": 3.3876, + "step": 3855 + }, + { + "epoch": 0.26226389455089005, + "grad_norm": 7.97447395324707, + "learning_rate": 9.672424921864384e-06, + "loss": 3.4831, + "step": 3860 + }, + { + "epoch": 0.26260361462155185, + "grad_norm": 6.565851211547852, + "learning_rate": 9.672000271776057e-06, + "loss": 3.333, + "step": 3865 + }, + { + "epoch": 0.2629433346922136, + "grad_norm": 6.232231140136719, + "learning_rate": 9.67157562168773e-06, + "loss": 3.2462, + "step": 3870 + }, + { + "epoch": 0.2632830547628754, + "grad_norm": 6.923056125640869, + "learning_rate": 9.671150971599403e-06, + "loss": 3.5307, + "step": 3875 + }, + { + "epoch": 0.2636227748335372, + "grad_norm": 6.440350532531738, + "learning_rate": 9.670726321511076e-06, + "loss": 3.5914, + "step": 3880 + }, + { + "epoch": 0.26396249490419893, + "grad_norm": 7.596096038818359, + "learning_rate": 9.670301671422748e-06, + "loss": 3.5259, + "step": 3885 + }, + { + "epoch": 0.2643022149748607, + "grad_norm": 5.610416889190674, + "learning_rate": 9.669877021334421e-06, + "loss": 3.5099, + "step": 3890 + }, + { + "epoch": 0.26464193504552247, + "grad_norm": 7.306087493896484, + "learning_rate": 9.669452371246094e-06, + "loss": 3.5071, + "step": 3895 + }, + { + "epoch": 0.26498165511618427, + "grad_norm": 7.042050838470459, + "learning_rate": 9.669027721157767e-06, + "loss": 3.2606, + "step": 3900 + }, + { + "epoch": 0.26532137518684606, + "grad_norm": 7.678199291229248, + "learning_rate": 9.66860307106944e-06, + "loss": 3.6936, + "step": 3905 + }, + { + "epoch": 0.2656610952575078, + "grad_norm": 7.966768264770508, + "learning_rate": 9.668178420981112e-06, + "loss": 3.2876, + "step": 3910 + }, + { + "epoch": 0.2660008153281696, + "grad_norm": 7.044416904449463, + "learning_rate": 9.667753770892785e-06, + "loss": 3.6569, + "step": 3915 + }, + { + "epoch": 0.26634053539883135, + "grad_norm": 7.931710720062256, + "learning_rate": 9.667329120804458e-06, + "loss": 3.5366, + "step": 3920 + }, + { + "epoch": 0.26668025546949314, + "grad_norm": 7.012920379638672, + "learning_rate": 9.66690447071613e-06, + "loss": 3.4442, + "step": 3925 + }, + { + "epoch": 0.2670199755401549, + "grad_norm": 6.855907917022705, + "learning_rate": 9.666479820627804e-06, + "loss": 3.2538, + "step": 3930 + }, + { + "epoch": 0.2673596956108167, + "grad_norm": 6.779373645782471, + "learning_rate": 9.666055170539476e-06, + "loss": 3.6343, + "step": 3935 + }, + { + "epoch": 0.2676994156814785, + "grad_norm": 5.748664379119873, + "learning_rate": 9.66563052045115e-06, + "loss": 3.4987, + "step": 3940 + }, + { + "epoch": 0.2680391357521402, + "grad_norm": 8.221884727478027, + "learning_rate": 9.665205870362822e-06, + "loss": 3.4342, + "step": 3945 + }, + { + "epoch": 0.268378855822802, + "grad_norm": 6.894197940826416, + "learning_rate": 9.664781220274495e-06, + "loss": 3.6356, + "step": 3950 + }, + { + "epoch": 0.26871857589346376, + "grad_norm": 7.40133810043335, + "learning_rate": 9.664356570186168e-06, + "loss": 3.2329, + "step": 3955 + }, + { + "epoch": 0.26905829596412556, + "grad_norm": 6.206688404083252, + "learning_rate": 9.66393192009784e-06, + "loss": 3.5435, + "step": 3960 + }, + { + "epoch": 0.26939801603478736, + "grad_norm": 7.822265148162842, + "learning_rate": 9.663507270009513e-06, + "loss": 3.6645, + "step": 3965 + }, + { + "epoch": 0.2697377361054491, + "grad_norm": 7.279390811920166, + "learning_rate": 9.663082619921186e-06, + "loss": 3.2919, + "step": 3970 + }, + { + "epoch": 0.2700774561761109, + "grad_norm": 7.763699054718018, + "learning_rate": 9.662657969832859e-06, + "loss": 3.7092, + "step": 3975 + }, + { + "epoch": 0.27041717624677264, + "grad_norm": 6.790695667266846, + "learning_rate": 9.662233319744532e-06, + "loss": 3.4297, + "step": 3980 + }, + { + "epoch": 0.27075689631743444, + "grad_norm": 5.526285648345947, + "learning_rate": 9.661808669656204e-06, + "loss": 3.6503, + "step": 3985 + }, + { + "epoch": 0.27109661638809623, + "grad_norm": 5.396424293518066, + "learning_rate": 9.661384019567877e-06, + "loss": 3.4411, + "step": 3990 + }, + { + "epoch": 0.271436336458758, + "grad_norm": 6.951815128326416, + "learning_rate": 9.66095936947955e-06, + "loss": 3.3832, + "step": 3995 + }, + { + "epoch": 0.2717760565294198, + "grad_norm": 8.104421615600586, + "learning_rate": 9.660534719391223e-06, + "loss": 3.7421, + "step": 4000 + }, + { + "epoch": 0.2721157766000815, + "grad_norm": 6.183870792388916, + "learning_rate": 9.660110069302896e-06, + "loss": 3.4491, + "step": 4005 + }, + { + "epoch": 0.2724554966707433, + "grad_norm": 5.737735271453857, + "learning_rate": 9.659685419214568e-06, + "loss": 3.3282, + "step": 4010 + }, + { + "epoch": 0.27279521674140506, + "grad_norm": 7.368603706359863, + "learning_rate": 9.659260769126241e-06, + "loss": 3.6476, + "step": 4015 + }, + { + "epoch": 0.27313493681206685, + "grad_norm": 6.911849498748779, + "learning_rate": 9.658836119037912e-06, + "loss": 3.3549, + "step": 4020 + }, + { + "epoch": 0.27347465688272865, + "grad_norm": 6.688433647155762, + "learning_rate": 9.658411468949587e-06, + "loss": 3.3216, + "step": 4025 + }, + { + "epoch": 0.2738143769533904, + "grad_norm": 6.799325942993164, + "learning_rate": 9.65798681886126e-06, + "loss": 3.2199, + "step": 4030 + }, + { + "epoch": 0.2741540970240522, + "grad_norm": 6.181464672088623, + "learning_rate": 9.65756216877293e-06, + "loss": 3.4924, + "step": 4035 + }, + { + "epoch": 0.27449381709471393, + "grad_norm": 6.212704181671143, + "learning_rate": 9.657137518684605e-06, + "loss": 3.3328, + "step": 4040 + }, + { + "epoch": 0.27483353716537573, + "grad_norm": 7.790589332580566, + "learning_rate": 9.656712868596278e-06, + "loss": 3.6581, + "step": 4045 + }, + { + "epoch": 0.2751732572360375, + "grad_norm": 8.463546752929688, + "learning_rate": 9.65628821850795e-06, + "loss": 3.5107, + "step": 4050 + }, + { + "epoch": 0.27551297730669927, + "grad_norm": 5.519874095916748, + "learning_rate": 9.655863568419624e-06, + "loss": 3.477, + "step": 4055 + }, + { + "epoch": 0.27585269737736107, + "grad_norm": 7.751538276672363, + "learning_rate": 9.655438918331296e-06, + "loss": 3.4484, + "step": 4060 + }, + { + "epoch": 0.2761924174480228, + "grad_norm": 7.558211326599121, + "learning_rate": 9.655014268242968e-06, + "loss": 3.3963, + "step": 4065 + }, + { + "epoch": 0.2765321375186846, + "grad_norm": 6.637917518615723, + "learning_rate": 9.654589618154642e-06, + "loss": 3.3413, + "step": 4070 + }, + { + "epoch": 0.2768718575893464, + "grad_norm": 6.168222904205322, + "learning_rate": 9.654164968066315e-06, + "loss": 3.543, + "step": 4075 + }, + { + "epoch": 0.27721157766000815, + "grad_norm": 6.1014885902404785, + "learning_rate": 9.653740317977986e-06, + "loss": 3.4557, + "step": 4080 + }, + { + "epoch": 0.27755129773066994, + "grad_norm": 6.8894877433776855, + "learning_rate": 9.65331566788966e-06, + "loss": 3.2307, + "step": 4085 + }, + { + "epoch": 0.2778910178013317, + "grad_norm": 6.6876420974731445, + "learning_rate": 9.652891017801333e-06, + "loss": 3.403, + "step": 4090 + }, + { + "epoch": 0.2782307378719935, + "grad_norm": 8.181442260742188, + "learning_rate": 9.652466367713004e-06, + "loss": 3.4706, + "step": 4095 + }, + { + "epoch": 0.2785704579426552, + "grad_norm": 6.777012825012207, + "learning_rate": 9.652041717624679e-06, + "loss": 3.4894, + "step": 4100 + }, + { + "epoch": 0.278910178013317, + "grad_norm": 5.852108001708984, + "learning_rate": 9.65161706753635e-06, + "loss": 3.3298, + "step": 4105 + }, + { + "epoch": 0.2792498980839788, + "grad_norm": 6.445892333984375, + "learning_rate": 9.651192417448023e-06, + "loss": 3.4952, + "step": 4110 + }, + { + "epoch": 0.27958961815464056, + "grad_norm": 6.517755508422852, + "learning_rate": 9.650767767359697e-06, + "loss": 3.5267, + "step": 4115 + }, + { + "epoch": 0.27992933822530236, + "grad_norm": 5.369908809661865, + "learning_rate": 9.650343117271368e-06, + "loss": 3.4412, + "step": 4120 + }, + { + "epoch": 0.2802690582959641, + "grad_norm": 6.771788120269775, + "learning_rate": 9.649918467183041e-06, + "loss": 3.3534, + "step": 4125 + }, + { + "epoch": 0.2806087783666259, + "grad_norm": 7.529616832733154, + "learning_rate": 9.649493817094716e-06, + "loss": 3.4332, + "step": 4130 + }, + { + "epoch": 0.2809484984372877, + "grad_norm": 5.792358875274658, + "learning_rate": 9.649069167006387e-06, + "loss": 3.3554, + "step": 4135 + }, + { + "epoch": 0.28128821850794944, + "grad_norm": 5.845894813537598, + "learning_rate": 9.64864451691806e-06, + "loss": 3.3522, + "step": 4140 + }, + { + "epoch": 0.28162793857861124, + "grad_norm": 6.903589725494385, + "learning_rate": 9.648219866829734e-06, + "loss": 3.3894, + "step": 4145 + }, + { + "epoch": 0.281967658649273, + "grad_norm": 7.6048264503479, + "learning_rate": 9.647795216741405e-06, + "loss": 3.6747, + "step": 4150 + }, + { + "epoch": 0.2823073787199348, + "grad_norm": 6.593227863311768, + "learning_rate": 9.647370566653078e-06, + "loss": 3.5635, + "step": 4155 + }, + { + "epoch": 0.2826470987905966, + "grad_norm": 6.899693012237549, + "learning_rate": 9.646945916564752e-06, + "loss": 3.4886, + "step": 4160 + }, + { + "epoch": 0.2829868188612583, + "grad_norm": 7.098773956298828, + "learning_rate": 9.646521266476424e-06, + "loss": 3.2181, + "step": 4165 + }, + { + "epoch": 0.2833265389319201, + "grad_norm": 5.718012809753418, + "learning_rate": 9.646096616388096e-06, + "loss": 3.3427, + "step": 4170 + }, + { + "epoch": 0.28366625900258186, + "grad_norm": 7.345568656921387, + "learning_rate": 9.64567196629977e-06, + "loss": 3.4426, + "step": 4175 + }, + { + "epoch": 0.28400597907324365, + "grad_norm": 8.046019554138184, + "learning_rate": 9.645247316211442e-06, + "loss": 2.9764, + "step": 4180 + }, + { + "epoch": 0.2843456991439054, + "grad_norm": 7.696800231933594, + "learning_rate": 9.644822666123115e-06, + "loss": 3.4684, + "step": 4185 + }, + { + "epoch": 0.2846854192145672, + "grad_norm": 8.253087043762207, + "learning_rate": 9.644398016034788e-06, + "loss": 3.2619, + "step": 4190 + }, + { + "epoch": 0.285025139285229, + "grad_norm": 8.1162748336792, + "learning_rate": 9.64397336594646e-06, + "loss": 3.4359, + "step": 4195 + }, + { + "epoch": 0.28536485935589073, + "grad_norm": 6.463571071624756, + "learning_rate": 9.643548715858135e-06, + "loss": 3.4133, + "step": 4200 + }, + { + "epoch": 0.28570457942655253, + "grad_norm": 5.660048484802246, + "learning_rate": 9.643124065769806e-06, + "loss": 3.5709, + "step": 4205 + }, + { + "epoch": 0.28604429949721427, + "grad_norm": 7.495563507080078, + "learning_rate": 9.642699415681479e-06, + "loss": 3.4086, + "step": 4210 + }, + { + "epoch": 0.28638401956787607, + "grad_norm": 7.104689121246338, + "learning_rate": 9.642274765593153e-06, + "loss": 3.5034, + "step": 4215 + }, + { + "epoch": 0.28672373963853787, + "grad_norm": 7.414596080780029, + "learning_rate": 9.641850115504824e-06, + "loss": 3.6469, + "step": 4220 + }, + { + "epoch": 0.2870634597091996, + "grad_norm": 7.689900875091553, + "learning_rate": 9.641425465416497e-06, + "loss": 3.5334, + "step": 4225 + }, + { + "epoch": 0.2874031797798614, + "grad_norm": 7.017579555511475, + "learning_rate": 9.641000815328172e-06, + "loss": 3.4411, + "step": 4230 + }, + { + "epoch": 0.28774289985052315, + "grad_norm": 5.2954277992248535, + "learning_rate": 9.640576165239843e-06, + "loss": 3.2189, + "step": 4235 + }, + { + "epoch": 0.28808261992118495, + "grad_norm": 6.957217216491699, + "learning_rate": 9.640151515151516e-06, + "loss": 3.681, + "step": 4240 + }, + { + "epoch": 0.28842233999184674, + "grad_norm": 5.804520606994629, + "learning_rate": 9.639726865063188e-06, + "loss": 3.4896, + "step": 4245 + }, + { + "epoch": 0.2887620600625085, + "grad_norm": 7.608871936798096, + "learning_rate": 9.639302214974861e-06, + "loss": 3.3166, + "step": 4250 + }, + { + "epoch": 0.2891017801331703, + "grad_norm": 5.647308349609375, + "learning_rate": 9.638877564886534e-06, + "loss": 3.3232, + "step": 4255 + }, + { + "epoch": 0.289441500203832, + "grad_norm": 8.484602928161621, + "learning_rate": 9.638452914798207e-06, + "loss": 3.7341, + "step": 4260 + }, + { + "epoch": 0.2897812202744938, + "grad_norm": 6.967067718505859, + "learning_rate": 9.63802826470988e-06, + "loss": 3.3329, + "step": 4265 + }, + { + "epoch": 0.29012094034515556, + "grad_norm": 7.27335786819458, + "learning_rate": 9.637603614621552e-06, + "loss": 3.4835, + "step": 4270 + }, + { + "epoch": 0.29046066041581736, + "grad_norm": 7.352554798126221, + "learning_rate": 9.637178964533225e-06, + "loss": 3.5435, + "step": 4275 + }, + { + "epoch": 0.29080038048647916, + "grad_norm": 8.713628768920898, + "learning_rate": 9.636754314444898e-06, + "loss": 3.3678, + "step": 4280 + }, + { + "epoch": 0.2911401005571409, + "grad_norm": 6.072607517242432, + "learning_rate": 9.63632966435657e-06, + "loss": 3.5808, + "step": 4285 + }, + { + "epoch": 0.2914798206278027, + "grad_norm": 7.143949508666992, + "learning_rate": 9.635905014268244e-06, + "loss": 3.3469, + "step": 4290 + }, + { + "epoch": 0.29181954069846444, + "grad_norm": 7.116053104400635, + "learning_rate": 9.635480364179916e-06, + "loss": 3.7016, + "step": 4295 + }, + { + "epoch": 0.29215926076912624, + "grad_norm": 6.67861795425415, + "learning_rate": 9.63505571409159e-06, + "loss": 3.1809, + "step": 4300 + }, + { + "epoch": 0.29249898083978804, + "grad_norm": 8.922670364379883, + "learning_rate": 9.634631064003262e-06, + "loss": 3.2261, + "step": 4305 + }, + { + "epoch": 0.2928387009104498, + "grad_norm": 7.162497520446777, + "learning_rate": 9.634206413914935e-06, + "loss": 3.5893, + "step": 4310 + }, + { + "epoch": 0.2931784209811116, + "grad_norm": 8.328531265258789, + "learning_rate": 9.633781763826608e-06, + "loss": 3.2397, + "step": 4315 + }, + { + "epoch": 0.2935181410517733, + "grad_norm": 8.537714004516602, + "learning_rate": 9.63335711373828e-06, + "loss": 3.4969, + "step": 4320 + }, + { + "epoch": 0.2938578611224351, + "grad_norm": 6.951941013336182, + "learning_rate": 9.632932463649953e-06, + "loss": 3.3714, + "step": 4325 + }, + { + "epoch": 0.2941975811930969, + "grad_norm": 6.8032965660095215, + "learning_rate": 9.632507813561626e-06, + "loss": 3.3564, + "step": 4330 + }, + { + "epoch": 0.29453730126375866, + "grad_norm": 6.858579158782959, + "learning_rate": 9.632083163473299e-06, + "loss": 3.6291, + "step": 4335 + }, + { + "epoch": 0.29487702133442045, + "grad_norm": 8.279915809631348, + "learning_rate": 9.631658513384972e-06, + "loss": 3.4306, + "step": 4340 + }, + { + "epoch": 0.2952167414050822, + "grad_norm": 5.95363712310791, + "learning_rate": 9.631233863296644e-06, + "loss": 2.9672, + "step": 4345 + }, + { + "epoch": 0.295556461475744, + "grad_norm": 6.917148590087891, + "learning_rate": 9.630809213208317e-06, + "loss": 3.2277, + "step": 4350 + }, + { + "epoch": 0.29589618154640573, + "grad_norm": 6.971553325653076, + "learning_rate": 9.63038456311999e-06, + "loss": 3.4764, + "step": 4355 + }, + { + "epoch": 0.29623590161706753, + "grad_norm": 6.271878719329834, + "learning_rate": 9.629959913031663e-06, + "loss": 3.296, + "step": 4360 + }, + { + "epoch": 0.29657562168772933, + "grad_norm": 6.293243408203125, + "learning_rate": 9.629535262943336e-06, + "loss": 3.5792, + "step": 4365 + }, + { + "epoch": 0.29691534175839107, + "grad_norm": 5.756286144256592, + "learning_rate": 9.629110612855008e-06, + "loss": 3.5376, + "step": 4370 + }, + { + "epoch": 0.29725506182905287, + "grad_norm": 7.469455242156982, + "learning_rate": 9.628685962766681e-06, + "loss": 3.3735, + "step": 4375 + }, + { + "epoch": 0.2975947818997146, + "grad_norm": 7.618985176086426, + "learning_rate": 9.628261312678354e-06, + "loss": 3.4213, + "step": 4380 + }, + { + "epoch": 0.2979345019703764, + "grad_norm": 5.838161945343018, + "learning_rate": 9.627836662590027e-06, + "loss": 3.493, + "step": 4385 + }, + { + "epoch": 0.2982742220410382, + "grad_norm": 7.254555702209473, + "learning_rate": 9.6274120125017e-06, + "loss": 3.6391, + "step": 4390 + }, + { + "epoch": 0.29861394211169995, + "grad_norm": 7.390652656555176, + "learning_rate": 9.626987362413372e-06, + "loss": 3.5515, + "step": 4395 + }, + { + "epoch": 0.29895366218236175, + "grad_norm": 7.5107951164245605, + "learning_rate": 9.626562712325045e-06, + "loss": 3.4456, + "step": 4400 + }, + { + "epoch": 0.2992933822530235, + "grad_norm": 8.447529792785645, + "learning_rate": 9.626138062236718e-06, + "loss": 3.448, + "step": 4405 + }, + { + "epoch": 0.2996331023236853, + "grad_norm": 7.32254695892334, + "learning_rate": 9.62571341214839e-06, + "loss": 3.6443, + "step": 4410 + }, + { + "epoch": 0.2999728223943471, + "grad_norm": 8.116665840148926, + "learning_rate": 9.625288762060064e-06, + "loss": 3.5948, + "step": 4415 + }, + { + "epoch": 0.3003125424650088, + "grad_norm": 9.295966148376465, + "learning_rate": 9.624864111971736e-06, + "loss": 3.4259, + "step": 4420 + }, + { + "epoch": 0.3006522625356706, + "grad_norm": 5.56254768371582, + "learning_rate": 9.62443946188341e-06, + "loss": 3.612, + "step": 4425 + }, + { + "epoch": 0.30099198260633236, + "grad_norm": 7.211649417877197, + "learning_rate": 9.624014811795082e-06, + "loss": 3.5922, + "step": 4430 + }, + { + "epoch": 0.30133170267699416, + "grad_norm": 9.755934715270996, + "learning_rate": 9.623590161706755e-06, + "loss": 3.6313, + "step": 4435 + }, + { + "epoch": 0.3016714227476559, + "grad_norm": 7.19437313079834, + "learning_rate": 9.623165511618428e-06, + "loss": 3.319, + "step": 4440 + }, + { + "epoch": 0.3020111428183177, + "grad_norm": 6.887825965881348, + "learning_rate": 9.6227408615301e-06, + "loss": 3.4588, + "step": 4445 + }, + { + "epoch": 0.3023508628889795, + "grad_norm": 7.173717021942139, + "learning_rate": 9.622316211441772e-06, + "loss": 3.5654, + "step": 4450 + }, + { + "epoch": 0.30269058295964124, + "grad_norm": 5.676904678344727, + "learning_rate": 9.621891561353446e-06, + "loss": 3.1275, + "step": 4455 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 6.730070114135742, + "learning_rate": 9.621466911265119e-06, + "loss": 3.4637, + "step": 4460 + }, + { + "epoch": 0.3033700231009648, + "grad_norm": 6.3370041847229, + "learning_rate": 9.62104226117679e-06, + "loss": 3.3901, + "step": 4465 + }, + { + "epoch": 0.3037097431716266, + "grad_norm": 7.012800693511963, + "learning_rate": 9.620617611088464e-06, + "loss": 3.2524, + "step": 4470 + }, + { + "epoch": 0.3040494632422884, + "grad_norm": 7.369883060455322, + "learning_rate": 9.620192961000137e-06, + "loss": 3.5464, + "step": 4475 + }, + { + "epoch": 0.3043891833129501, + "grad_norm": 7.487855911254883, + "learning_rate": 9.619768310911808e-06, + "loss": 3.5631, + "step": 4480 + }, + { + "epoch": 0.3047289033836119, + "grad_norm": 6.363354206085205, + "learning_rate": 9.619343660823483e-06, + "loss": 3.3577, + "step": 4485 + }, + { + "epoch": 0.30506862345427366, + "grad_norm": 6.459602355957031, + "learning_rate": 9.618919010735156e-06, + "loss": 3.6907, + "step": 4490 + }, + { + "epoch": 0.30540834352493546, + "grad_norm": 8.389426231384277, + "learning_rate": 9.618494360646827e-06, + "loss": 3.4157, + "step": 4495 + }, + { + "epoch": 0.30574806359559725, + "grad_norm": 7.058099269866943, + "learning_rate": 9.618069710558501e-06, + "loss": 3.4753, + "step": 4500 + }, + { + "epoch": 0.306087783666259, + "grad_norm": 6.586700916290283, + "learning_rate": 9.617645060470174e-06, + "loss": 3.3632, + "step": 4505 + }, + { + "epoch": 0.3064275037369208, + "grad_norm": 7.327359676361084, + "learning_rate": 9.617220410381845e-06, + "loss": 3.6649, + "step": 4510 + }, + { + "epoch": 0.30676722380758253, + "grad_norm": 5.109470367431641, + "learning_rate": 9.61679576029352e-06, + "loss": 3.4831, + "step": 4515 + }, + { + "epoch": 0.30710694387824433, + "grad_norm": 7.803720474243164, + "learning_rate": 9.61637111020519e-06, + "loss": 3.5512, + "step": 4520 + }, + { + "epoch": 0.3074466639489061, + "grad_norm": 7.322420120239258, + "learning_rate": 9.615946460116864e-06, + "loss": 3.2476, + "step": 4525 + }, + { + "epoch": 0.30778638401956787, + "grad_norm": 9.917129516601562, + "learning_rate": 9.615521810028538e-06, + "loss": 3.5656, + "step": 4530 + }, + { + "epoch": 0.30812610409022967, + "grad_norm": 7.351716995239258, + "learning_rate": 9.615097159940209e-06, + "loss": 3.2842, + "step": 4535 + }, + { + "epoch": 0.3084658241608914, + "grad_norm": 6.839828014373779, + "learning_rate": 9.614672509851884e-06, + "loss": 3.3413, + "step": 4540 + }, + { + "epoch": 0.3088055442315532, + "grad_norm": 9.112578392028809, + "learning_rate": 9.614247859763556e-06, + "loss": 3.2281, + "step": 4545 + }, + { + "epoch": 0.30914526430221495, + "grad_norm": 6.6876749992370605, + "learning_rate": 9.613823209675228e-06, + "loss": 3.3095, + "step": 4550 + }, + { + "epoch": 0.30948498437287675, + "grad_norm": 5.77860689163208, + "learning_rate": 9.613398559586902e-06, + "loss": 3.4368, + "step": 4555 + }, + { + "epoch": 0.30982470444353855, + "grad_norm": 6.215092658996582, + "learning_rate": 9.612973909498575e-06, + "loss": 3.4416, + "step": 4560 + }, + { + "epoch": 0.3101644245142003, + "grad_norm": 7.767589092254639, + "learning_rate": 9.612549259410246e-06, + "loss": 3.4591, + "step": 4565 + }, + { + "epoch": 0.3105041445848621, + "grad_norm": 7.48415994644165, + "learning_rate": 9.61212460932192e-06, + "loss": 3.5531, + "step": 4570 + }, + { + "epoch": 0.3108438646555238, + "grad_norm": 6.554416656494141, + "learning_rate": 9.611699959233593e-06, + "loss": 3.2178, + "step": 4575 + }, + { + "epoch": 0.3111835847261856, + "grad_norm": 8.496367454528809, + "learning_rate": 9.611275309145264e-06, + "loss": 3.741, + "step": 4580 + }, + { + "epoch": 0.3115233047968474, + "grad_norm": 6.565785884857178, + "learning_rate": 9.610850659056939e-06, + "loss": 3.2777, + "step": 4585 + }, + { + "epoch": 0.31186302486750916, + "grad_norm": 8.484699249267578, + "learning_rate": 9.61042600896861e-06, + "loss": 3.4418, + "step": 4590 + }, + { + "epoch": 0.31220274493817096, + "grad_norm": 7.029791831970215, + "learning_rate": 9.610001358880283e-06, + "loss": 3.4465, + "step": 4595 + }, + { + "epoch": 0.3125424650088327, + "grad_norm": 6.824605464935303, + "learning_rate": 9.609576708791957e-06, + "loss": 3.5221, + "step": 4600 + }, + { + "epoch": 0.3128821850794945, + "grad_norm": 5.961697578430176, + "learning_rate": 9.609152058703628e-06, + "loss": 3.49, + "step": 4605 + }, + { + "epoch": 0.3132219051501563, + "grad_norm": 8.269464492797852, + "learning_rate": 9.608727408615301e-06, + "loss": 3.4548, + "step": 4610 + }, + { + "epoch": 0.31356162522081804, + "grad_norm": 5.944865703582764, + "learning_rate": 9.608302758526976e-06, + "loss": 3.3438, + "step": 4615 + }, + { + "epoch": 0.31390134529147984, + "grad_norm": 7.233912944793701, + "learning_rate": 9.607878108438647e-06, + "loss": 3.5645, + "step": 4620 + }, + { + "epoch": 0.3142410653621416, + "grad_norm": 6.3583526611328125, + "learning_rate": 9.60745345835032e-06, + "loss": 3.462, + "step": 4625 + }, + { + "epoch": 0.3145807854328034, + "grad_norm": 5.620872497558594, + "learning_rate": 9.607028808261994e-06, + "loss": 3.255, + "step": 4630 + }, + { + "epoch": 0.3149205055034651, + "grad_norm": 8.245948791503906, + "learning_rate": 9.606604158173665e-06, + "loss": 3.4021, + "step": 4635 + }, + { + "epoch": 0.3152602255741269, + "grad_norm": 6.411770820617676, + "learning_rate": 9.606179508085338e-06, + "loss": 3.5362, + "step": 4640 + }, + { + "epoch": 0.3155999456447887, + "grad_norm": 6.60984992980957, + "learning_rate": 9.605754857997012e-06, + "loss": 3.3718, + "step": 4645 + }, + { + "epoch": 0.31593966571545046, + "grad_norm": 5.670566558837891, + "learning_rate": 9.605330207908684e-06, + "loss": 3.4125, + "step": 4650 + }, + { + "epoch": 0.31627938578611225, + "grad_norm": 7.767109394073486, + "learning_rate": 9.604905557820356e-06, + "loss": 3.3258, + "step": 4655 + }, + { + "epoch": 0.316619105856774, + "grad_norm": 6.021259784698486, + "learning_rate": 9.604480907732031e-06, + "loss": 3.6887, + "step": 4660 + }, + { + "epoch": 0.3169588259274358, + "grad_norm": 7.536830425262451, + "learning_rate": 9.604056257643702e-06, + "loss": 3.3619, + "step": 4665 + }, + { + "epoch": 0.3172985459980976, + "grad_norm": 7.340490818023682, + "learning_rate": 9.603631607555375e-06, + "loss": 3.5509, + "step": 4670 + }, + { + "epoch": 0.31763826606875933, + "grad_norm": 6.561247825622559, + "learning_rate": 9.603206957467048e-06, + "loss": 3.4638, + "step": 4675 + }, + { + "epoch": 0.31797798613942113, + "grad_norm": 6.28687858581543, + "learning_rate": 9.60278230737872e-06, + "loss": 3.4298, + "step": 4680 + }, + { + "epoch": 0.3183177062100829, + "grad_norm": 4.719228744506836, + "learning_rate": 9.602357657290393e-06, + "loss": 3.5301, + "step": 4685 + }, + { + "epoch": 0.31865742628074467, + "grad_norm": 6.321611404418945, + "learning_rate": 9.601933007202066e-06, + "loss": 3.5052, + "step": 4690 + }, + { + "epoch": 0.31899714635140647, + "grad_norm": 6.369035720825195, + "learning_rate": 9.601508357113739e-06, + "loss": 3.5225, + "step": 4695 + }, + { + "epoch": 0.3193368664220682, + "grad_norm": 5.776918411254883, + "learning_rate": 9.601083707025412e-06, + "loss": 3.4483, + "step": 4700 + }, + { + "epoch": 0.31967658649273, + "grad_norm": 5.813501834869385, + "learning_rate": 9.600659056937084e-06, + "loss": 3.4474, + "step": 4705 + }, + { + "epoch": 0.32001630656339175, + "grad_norm": 6.8820366859436035, + "learning_rate": 9.600234406848757e-06, + "loss": 3.2651, + "step": 4710 + }, + { + "epoch": 0.32035602663405355, + "grad_norm": 6.268302917480469, + "learning_rate": 9.59980975676043e-06, + "loss": 3.4607, + "step": 4715 + }, + { + "epoch": 0.3206957467047153, + "grad_norm": 5.896376609802246, + "learning_rate": 9.599385106672103e-06, + "loss": 3.5434, + "step": 4720 + }, + { + "epoch": 0.3210354667753771, + "grad_norm": 5.961143970489502, + "learning_rate": 9.598960456583776e-06, + "loss": 3.4831, + "step": 4725 + }, + { + "epoch": 0.3213751868460389, + "grad_norm": 6.45692777633667, + "learning_rate": 9.598535806495448e-06, + "loss": 3.3031, + "step": 4730 + }, + { + "epoch": 0.3217149069167006, + "grad_norm": 6.18162202835083, + "learning_rate": 9.598111156407121e-06, + "loss": 3.2095, + "step": 4735 + }, + { + "epoch": 0.3220546269873624, + "grad_norm": 7.858071327209473, + "learning_rate": 9.597686506318794e-06, + "loss": 3.2408, + "step": 4740 + }, + { + "epoch": 0.32239434705802417, + "grad_norm": 6.481049537658691, + "learning_rate": 9.597261856230467e-06, + "loss": 3.2499, + "step": 4745 + }, + { + "epoch": 0.32273406712868596, + "grad_norm": 6.49184513092041, + "learning_rate": 9.59683720614214e-06, + "loss": 3.3578, + "step": 4750 + }, + { + "epoch": 0.32307378719934776, + "grad_norm": 8.163735389709473, + "learning_rate": 9.596412556053812e-06, + "loss": 3.5355, + "step": 4755 + }, + { + "epoch": 0.3234135072700095, + "grad_norm": 6.679351329803467, + "learning_rate": 9.595987905965485e-06, + "loss": 3.3055, + "step": 4760 + }, + { + "epoch": 0.3237532273406713, + "grad_norm": 6.921202182769775, + "learning_rate": 9.595563255877158e-06, + "loss": 3.5477, + "step": 4765 + }, + { + "epoch": 0.32409294741133304, + "grad_norm": 7.951940059661865, + "learning_rate": 9.59513860578883e-06, + "loss": 3.409, + "step": 4770 + }, + { + "epoch": 0.32443266748199484, + "grad_norm": 5.950642108917236, + "learning_rate": 9.594713955700504e-06, + "loss": 3.2844, + "step": 4775 + }, + { + "epoch": 0.32477238755265664, + "grad_norm": 6.414890766143799, + "learning_rate": 9.594289305612176e-06, + "loss": 3.4931, + "step": 4780 + }, + { + "epoch": 0.3251121076233184, + "grad_norm": 6.617333889007568, + "learning_rate": 9.59386465552385e-06, + "loss": 3.3606, + "step": 4785 + }, + { + "epoch": 0.3254518276939802, + "grad_norm": 7.059715747833252, + "learning_rate": 9.593440005435522e-06, + "loss": 3.2644, + "step": 4790 + }, + { + "epoch": 0.3257915477646419, + "grad_norm": 6.907299995422363, + "learning_rate": 9.593015355347195e-06, + "loss": 3.4167, + "step": 4795 + }, + { + "epoch": 0.3261312678353037, + "grad_norm": 7.201488971710205, + "learning_rate": 9.592590705258868e-06, + "loss": 3.2507, + "step": 4800 + }, + { + "epoch": 0.32647098790596546, + "grad_norm": 9.147533416748047, + "learning_rate": 9.59216605517054e-06, + "loss": 3.359, + "step": 4805 + }, + { + "epoch": 0.32681070797662726, + "grad_norm": 6.2896318435668945, + "learning_rate": 9.591741405082213e-06, + "loss": 3.5983, + "step": 4810 + }, + { + "epoch": 0.32715042804728905, + "grad_norm": 8.147607803344727, + "learning_rate": 9.591316754993886e-06, + "loss": 3.2005, + "step": 4815 + }, + { + "epoch": 0.3274901481179508, + "grad_norm": 8.20680046081543, + "learning_rate": 9.590892104905559e-06, + "loss": 3.3734, + "step": 4820 + }, + { + "epoch": 0.3278298681886126, + "grad_norm": 4.974747180938721, + "learning_rate": 9.590467454817232e-06, + "loss": 3.4815, + "step": 4825 + }, + { + "epoch": 0.32816958825927434, + "grad_norm": 6.940661430358887, + "learning_rate": 9.590042804728904e-06, + "loss": 3.4896, + "step": 4830 + }, + { + "epoch": 0.32850930832993613, + "grad_norm": 7.522816181182861, + "learning_rate": 9.589618154640577e-06, + "loss": 3.4543, + "step": 4835 + }, + { + "epoch": 0.32884902840059793, + "grad_norm": 6.329257011413574, + "learning_rate": 9.58919350455225e-06, + "loss": 3.6359, + "step": 4840 + }, + { + "epoch": 0.3291887484712597, + "grad_norm": 7.739781379699707, + "learning_rate": 9.588768854463923e-06, + "loss": 3.4519, + "step": 4845 + }, + { + "epoch": 0.32952846854192147, + "grad_norm": 8.041434288024902, + "learning_rate": 9.588344204375596e-06, + "loss": 3.4636, + "step": 4850 + }, + { + "epoch": 0.3298681886125832, + "grad_norm": 7.169039726257324, + "learning_rate": 9.587919554287268e-06, + "loss": 3.5242, + "step": 4855 + }, + { + "epoch": 0.330207908683245, + "grad_norm": 7.932892322540283, + "learning_rate": 9.587494904198941e-06, + "loss": 3.184, + "step": 4860 + }, + { + "epoch": 0.3305476287539068, + "grad_norm": 7.6538405418396, + "learning_rate": 9.587070254110612e-06, + "loss": 3.4971, + "step": 4865 + }, + { + "epoch": 0.33088734882456855, + "grad_norm": 6.824309349060059, + "learning_rate": 9.586645604022287e-06, + "loss": 3.6947, + "step": 4870 + }, + { + "epoch": 0.33122706889523035, + "grad_norm": 7.824872016906738, + "learning_rate": 9.58622095393396e-06, + "loss": 3.2406, + "step": 4875 + }, + { + "epoch": 0.3315667889658921, + "grad_norm": 7.3087568283081055, + "learning_rate": 9.585796303845632e-06, + "loss": 3.1406, + "step": 4880 + }, + { + "epoch": 0.3319065090365539, + "grad_norm": 4.902907371520996, + "learning_rate": 9.585371653757305e-06, + "loss": 3.523, + "step": 4885 + }, + { + "epoch": 0.33224622910721563, + "grad_norm": 8.630228996276855, + "learning_rate": 9.584947003668978e-06, + "loss": 3.482, + "step": 4890 + }, + { + "epoch": 0.3325859491778774, + "grad_norm": 6.545489311218262, + "learning_rate": 9.58452235358065e-06, + "loss": 3.6721, + "step": 4895 + }, + { + "epoch": 0.3329256692485392, + "grad_norm": 7.52313232421875, + "learning_rate": 9.584097703492324e-06, + "loss": 3.5088, + "step": 4900 + }, + { + "epoch": 0.33326538931920097, + "grad_norm": 8.359875679016113, + "learning_rate": 9.583673053403996e-06, + "loss": 3.3919, + "step": 4905 + }, + { + "epoch": 0.33360510938986276, + "grad_norm": 5.930079460144043, + "learning_rate": 9.58324840331567e-06, + "loss": 3.5669, + "step": 4910 + }, + { + "epoch": 0.3339448294605245, + "grad_norm": 8.090261459350586, + "learning_rate": 9.582823753227342e-06, + "loss": 3.505, + "step": 4915 + }, + { + "epoch": 0.3342845495311863, + "grad_norm": 6.742050647735596, + "learning_rate": 9.582399103139015e-06, + "loss": 3.3254, + "step": 4920 + }, + { + "epoch": 0.3346242696018481, + "grad_norm": 6.595394611358643, + "learning_rate": 9.581974453050688e-06, + "loss": 3.6359, + "step": 4925 + }, + { + "epoch": 0.33496398967250984, + "grad_norm": 7.1515631675720215, + "learning_rate": 9.58154980296236e-06, + "loss": 3.9308, + "step": 4930 + }, + { + "epoch": 0.33530370974317164, + "grad_norm": 7.743899345397949, + "learning_rate": 9.581125152874032e-06, + "loss": 3.4693, + "step": 4935 + }, + { + "epoch": 0.3356434298138334, + "grad_norm": 5.387275695800781, + "learning_rate": 9.580700502785706e-06, + "loss": 3.2576, + "step": 4940 + }, + { + "epoch": 0.3359831498844952, + "grad_norm": 5.428455352783203, + "learning_rate": 9.580275852697379e-06, + "loss": 3.526, + "step": 4945 + }, + { + "epoch": 0.336322869955157, + "grad_norm": 6.963418006896973, + "learning_rate": 9.57985120260905e-06, + "loss": 3.4135, + "step": 4950 + }, + { + "epoch": 0.3366625900258187, + "grad_norm": 5.40343713760376, + "learning_rate": 9.579426552520724e-06, + "loss": 3.5332, + "step": 4955 + }, + { + "epoch": 0.3370023100964805, + "grad_norm": 5.786355972290039, + "learning_rate": 9.579001902432397e-06, + "loss": 3.3905, + "step": 4960 + }, + { + "epoch": 0.33734203016714226, + "grad_norm": 5.111032485961914, + "learning_rate": 9.578577252344068e-06, + "loss": 3.4006, + "step": 4965 + }, + { + "epoch": 0.33768175023780406, + "grad_norm": 9.65556526184082, + "learning_rate": 9.578152602255743e-06, + "loss": 3.2101, + "step": 4970 + }, + { + "epoch": 0.3380214703084658, + "grad_norm": 8.492072105407715, + "learning_rate": 9.577727952167416e-06, + "loss": 3.435, + "step": 4975 + }, + { + "epoch": 0.3383611903791276, + "grad_norm": 7.219313621520996, + "learning_rate": 9.577303302079087e-06, + "loss": 3.4491, + "step": 4980 + }, + { + "epoch": 0.3387009104497894, + "grad_norm": 6.196358680725098, + "learning_rate": 9.576878651990761e-06, + "loss": 3.3299, + "step": 4985 + }, + { + "epoch": 0.33904063052045114, + "grad_norm": 7.945899963378906, + "learning_rate": 9.576454001902434e-06, + "loss": 3.4536, + "step": 4990 + }, + { + "epoch": 0.33938035059111293, + "grad_norm": 5.739086151123047, + "learning_rate": 9.576029351814105e-06, + "loss": 3.5129, + "step": 4995 + }, + { + "epoch": 0.3397200706617747, + "grad_norm": 7.088822364807129, + "learning_rate": 9.57560470172578e-06, + "loss": 3.3907, + "step": 5000 + }, + { + "epoch": 0.3400597907324365, + "grad_norm": 5.859934329986572, + "learning_rate": 9.575180051637452e-06, + "loss": 3.5304, + "step": 5005 + }, + { + "epoch": 0.34039951080309827, + "grad_norm": 7.154534339904785, + "learning_rate": 9.574755401549124e-06, + "loss": 3.3891, + "step": 5010 + }, + { + "epoch": 0.34073923087376, + "grad_norm": 5.997685432434082, + "learning_rate": 9.574330751460798e-06, + "loss": 3.1822, + "step": 5015 + }, + { + "epoch": 0.3410789509444218, + "grad_norm": 6.972036838531494, + "learning_rate": 9.573906101372469e-06, + "loss": 3.1955, + "step": 5020 + }, + { + "epoch": 0.34141867101508355, + "grad_norm": 6.520454406738281, + "learning_rate": 9.573481451284142e-06, + "loss": 3.1988, + "step": 5025 + }, + { + "epoch": 0.34175839108574535, + "grad_norm": 8.895792961120605, + "learning_rate": 9.573056801195816e-06, + "loss": 3.3682, + "step": 5030 + }, + { + "epoch": 0.34209811115640715, + "grad_norm": 8.275264739990234, + "learning_rate": 9.572632151107488e-06, + "loss": 3.2168, + "step": 5035 + }, + { + "epoch": 0.3424378312270689, + "grad_norm": 8.493302345275879, + "learning_rate": 9.57220750101916e-06, + "loss": 3.2912, + "step": 5040 + }, + { + "epoch": 0.3427775512977307, + "grad_norm": 6.803069591522217, + "learning_rate": 9.571782850930835e-06, + "loss": 3.4545, + "step": 5045 + }, + { + "epoch": 0.34311727136839243, + "grad_norm": 8.539693832397461, + "learning_rate": 9.571358200842506e-06, + "loss": 3.5111, + "step": 5050 + }, + { + "epoch": 0.3434569914390542, + "grad_norm": 5.114822864532471, + "learning_rate": 9.570933550754179e-06, + "loss": 3.5891, + "step": 5055 + }, + { + "epoch": 0.34379671150971597, + "grad_norm": 7.768092632293701, + "learning_rate": 9.570508900665853e-06, + "loss": 3.1976, + "step": 5060 + }, + { + "epoch": 0.34413643158037777, + "grad_norm": 6.24239444732666, + "learning_rate": 9.570084250577524e-06, + "loss": 3.0208, + "step": 5065 + }, + { + "epoch": 0.34447615165103956, + "grad_norm": 5.3542399406433105, + "learning_rate": 9.569659600489197e-06, + "loss": 3.3502, + "step": 5070 + }, + { + "epoch": 0.3448158717217013, + "grad_norm": 6.526214122772217, + "learning_rate": 9.569234950400872e-06, + "loss": 3.306, + "step": 5075 + }, + { + "epoch": 0.3451555917923631, + "grad_norm": 8.541136741638184, + "learning_rate": 9.568810300312543e-06, + "loss": 3.3816, + "step": 5080 + }, + { + "epoch": 0.34549531186302485, + "grad_norm": 8.999726295471191, + "learning_rate": 9.568385650224216e-06, + "loss": 3.3367, + "step": 5085 + }, + { + "epoch": 0.34583503193368664, + "grad_norm": 8.332294464111328, + "learning_rate": 9.567961000135888e-06, + "loss": 3.2996, + "step": 5090 + }, + { + "epoch": 0.34617475200434844, + "grad_norm": 6.061913013458252, + "learning_rate": 9.567536350047561e-06, + "loss": 3.4941, + "step": 5095 + }, + { + "epoch": 0.3465144720750102, + "grad_norm": 7.116024971008301, + "learning_rate": 9.567111699959234e-06, + "loss": 3.1119, + "step": 5100 + }, + { + "epoch": 0.346854192145672, + "grad_norm": 6.016885757446289, + "learning_rate": 9.566687049870907e-06, + "loss": 3.313, + "step": 5105 + }, + { + "epoch": 0.3471939122163337, + "grad_norm": 5.801341533660889, + "learning_rate": 9.56626239978258e-06, + "loss": 3.4908, + "step": 5110 + }, + { + "epoch": 0.3475336322869955, + "grad_norm": 7.009602069854736, + "learning_rate": 9.565837749694252e-06, + "loss": 3.4765, + "step": 5115 + }, + { + "epoch": 0.3478733523576573, + "grad_norm": 7.215595722198486, + "learning_rate": 9.565413099605925e-06, + "loss": 3.394, + "step": 5120 + }, + { + "epoch": 0.34821307242831906, + "grad_norm": 6.719605445861816, + "learning_rate": 9.564988449517598e-06, + "loss": 3.4529, + "step": 5125 + }, + { + "epoch": 0.34855279249898086, + "grad_norm": 6.072567939758301, + "learning_rate": 9.56456379942927e-06, + "loss": 3.2308, + "step": 5130 + }, + { + "epoch": 0.3488925125696426, + "grad_norm": 6.094821453094482, + "learning_rate": 9.564139149340944e-06, + "loss": 3.2801, + "step": 5135 + }, + { + "epoch": 0.3492322326403044, + "grad_norm": 5.280786037445068, + "learning_rate": 9.563714499252616e-06, + "loss": 3.3348, + "step": 5140 + }, + { + "epoch": 0.34957195271096614, + "grad_norm": 7.118689060211182, + "learning_rate": 9.563289849164289e-06, + "loss": 3.2551, + "step": 5145 + }, + { + "epoch": 0.34991167278162794, + "grad_norm": 6.7676005363464355, + "learning_rate": 9.562865199075962e-06, + "loss": 3.6235, + "step": 5150 + }, + { + "epoch": 0.35025139285228973, + "grad_norm": 6.088188648223877, + "learning_rate": 9.562440548987635e-06, + "loss": 3.3048, + "step": 5155 + }, + { + "epoch": 0.3505911129229515, + "grad_norm": 6.528585910797119, + "learning_rate": 9.562015898899308e-06, + "loss": 3.5243, + "step": 5160 + }, + { + "epoch": 0.3509308329936133, + "grad_norm": 6.424222469329834, + "learning_rate": 9.56159124881098e-06, + "loss": 3.4195, + "step": 5165 + }, + { + "epoch": 0.351270553064275, + "grad_norm": 5.828665733337402, + "learning_rate": 9.561166598722653e-06, + "loss": 3.326, + "step": 5170 + }, + { + "epoch": 0.3516102731349368, + "grad_norm": 7.858094215393066, + "learning_rate": 9.560741948634326e-06, + "loss": 3.2006, + "step": 5175 + }, + { + "epoch": 0.3519499932055986, + "grad_norm": 9.512784957885742, + "learning_rate": 9.560317298545999e-06, + "loss": 3.5244, + "step": 5180 + }, + { + "epoch": 0.35228971327626035, + "grad_norm": 8.733100891113281, + "learning_rate": 9.559892648457672e-06, + "loss": 3.4165, + "step": 5185 + }, + { + "epoch": 0.35262943334692215, + "grad_norm": 6.614064693450928, + "learning_rate": 9.559467998369344e-06, + "loss": 3.4899, + "step": 5190 + }, + { + "epoch": 0.3529691534175839, + "grad_norm": 7.070528984069824, + "learning_rate": 9.559043348281017e-06, + "loss": 3.3317, + "step": 5195 + }, + { + "epoch": 0.3533088734882457, + "grad_norm": 7.854245662689209, + "learning_rate": 9.55861869819269e-06, + "loss": 3.5253, + "step": 5200 + }, + { + "epoch": 0.3536485935589075, + "grad_norm": 7.407670497894287, + "learning_rate": 9.558194048104363e-06, + "loss": 3.3489, + "step": 5205 + }, + { + "epoch": 0.35398831362956923, + "grad_norm": 7.857064723968506, + "learning_rate": 9.557769398016036e-06, + "loss": 3.4693, + "step": 5210 + }, + { + "epoch": 0.354328033700231, + "grad_norm": 5.917909622192383, + "learning_rate": 9.557344747927708e-06, + "loss": 3.5482, + "step": 5215 + }, + { + "epoch": 0.35466775377089277, + "grad_norm": 7.96218204498291, + "learning_rate": 9.556920097839381e-06, + "loss": 3.5794, + "step": 5220 + }, + { + "epoch": 0.35500747384155457, + "grad_norm": 6.912331581115723, + "learning_rate": 9.556495447751054e-06, + "loss": 3.3257, + "step": 5225 + }, + { + "epoch": 0.3553471939122163, + "grad_norm": 6.01753568649292, + "learning_rate": 9.556070797662727e-06, + "loss": 3.4189, + "step": 5230 + }, + { + "epoch": 0.3556869139828781, + "grad_norm": 6.097118377685547, + "learning_rate": 9.5556461475744e-06, + "loss": 3.1128, + "step": 5235 + }, + { + "epoch": 0.3560266340535399, + "grad_norm": 5.214508056640625, + "learning_rate": 9.555221497486072e-06, + "loss": 3.4392, + "step": 5240 + }, + { + "epoch": 0.35636635412420165, + "grad_norm": 7.466480731964111, + "learning_rate": 9.554796847397745e-06, + "loss": 3.3032, + "step": 5245 + }, + { + "epoch": 0.35670607419486344, + "grad_norm": 6.448169708251953, + "learning_rate": 9.554372197309418e-06, + "loss": 3.6827, + "step": 5250 + }, + { + "epoch": 0.3570457942655252, + "grad_norm": 5.419131278991699, + "learning_rate": 9.55394754722109e-06, + "loss": 3.3082, + "step": 5255 + }, + { + "epoch": 0.357385514336187, + "grad_norm": 5.662751197814941, + "learning_rate": 9.553522897132764e-06, + "loss": 3.3002, + "step": 5260 + }, + { + "epoch": 0.3577252344068488, + "grad_norm": 6.822617530822754, + "learning_rate": 9.553098247044436e-06, + "loss": 3.0453, + "step": 5265 + }, + { + "epoch": 0.3580649544775105, + "grad_norm": 6.143743991851807, + "learning_rate": 9.55267359695611e-06, + "loss": 3.3743, + "step": 5270 + }, + { + "epoch": 0.3584046745481723, + "grad_norm": 6.704483509063721, + "learning_rate": 9.552248946867782e-06, + "loss": 3.6149, + "step": 5275 + }, + { + "epoch": 0.35874439461883406, + "grad_norm": 5.839700222015381, + "learning_rate": 9.551824296779455e-06, + "loss": 3.4473, + "step": 5280 + }, + { + "epoch": 0.35908411468949586, + "grad_norm": 7.590534687042236, + "learning_rate": 9.551399646691128e-06, + "loss": 3.4147, + "step": 5285 + }, + { + "epoch": 0.35942383476015766, + "grad_norm": 7.7614970207214355, + "learning_rate": 9.5509749966028e-06, + "loss": 3.5595, + "step": 5290 + }, + { + "epoch": 0.3597635548308194, + "grad_norm": 8.171051025390625, + "learning_rate": 9.550550346514473e-06, + "loss": 3.5188, + "step": 5295 + }, + { + "epoch": 0.3601032749014812, + "grad_norm": 6.104159832000732, + "learning_rate": 9.550125696426146e-06, + "loss": 3.6047, + "step": 5300 + }, + { + "epoch": 0.36044299497214294, + "grad_norm": 5.664949417114258, + "learning_rate": 9.549701046337819e-06, + "loss": 3.5932, + "step": 5305 + }, + { + "epoch": 0.36078271504280474, + "grad_norm": 5.997740745544434, + "learning_rate": 9.549276396249492e-06, + "loss": 3.5611, + "step": 5310 + }, + { + "epoch": 0.3611224351134665, + "grad_norm": 5.761636734008789, + "learning_rate": 9.548851746161164e-06, + "loss": 3.6768, + "step": 5315 + }, + { + "epoch": 0.3614621551841283, + "grad_norm": 6.9144792556762695, + "learning_rate": 9.548427096072837e-06, + "loss": 3.291, + "step": 5320 + }, + { + "epoch": 0.3618018752547901, + "grad_norm": 7.126469612121582, + "learning_rate": 9.54800244598451e-06, + "loss": 3.6457, + "step": 5325 + }, + { + "epoch": 0.3621415953254518, + "grad_norm": 5.638972759246826, + "learning_rate": 9.547577795896183e-06, + "loss": 3.4357, + "step": 5330 + }, + { + "epoch": 0.3624813153961136, + "grad_norm": 9.23878002166748, + "learning_rate": 9.547153145807856e-06, + "loss": 3.6782, + "step": 5335 + }, + { + "epoch": 0.36282103546677535, + "grad_norm": 8.16588020324707, + "learning_rate": 9.546728495719528e-06, + "loss": 3.3304, + "step": 5340 + }, + { + "epoch": 0.36316075553743715, + "grad_norm": 8.015191078186035, + "learning_rate": 9.546303845631201e-06, + "loss": 3.2235, + "step": 5345 + }, + { + "epoch": 0.36350047560809895, + "grad_norm": 8.046871185302734, + "learning_rate": 9.545879195542872e-06, + "loss": 3.4048, + "step": 5350 + }, + { + "epoch": 0.3638401956787607, + "grad_norm": 5.374883651733398, + "learning_rate": 9.545454545454547e-06, + "loss": 3.0742, + "step": 5355 + }, + { + "epoch": 0.3641799157494225, + "grad_norm": 6.683962345123291, + "learning_rate": 9.54502989536622e-06, + "loss": 3.2619, + "step": 5360 + }, + { + "epoch": 0.36451963582008423, + "grad_norm": 9.141363143920898, + "learning_rate": 9.54460524527789e-06, + "loss": 3.5202, + "step": 5365 + }, + { + "epoch": 0.36485935589074603, + "grad_norm": 7.288174152374268, + "learning_rate": 9.544180595189565e-06, + "loss": 3.1614, + "step": 5370 + }, + { + "epoch": 0.3651990759614078, + "grad_norm": 6.856041431427002, + "learning_rate": 9.543755945101238e-06, + "loss": 3.3129, + "step": 5375 + }, + { + "epoch": 0.36553879603206957, + "grad_norm": 7.10647439956665, + "learning_rate": 9.543331295012909e-06, + "loss": 3.2746, + "step": 5380 + }, + { + "epoch": 0.36587851610273137, + "grad_norm": 5.649776458740234, + "learning_rate": 9.542906644924584e-06, + "loss": 3.4645, + "step": 5385 + }, + { + "epoch": 0.3662182361733931, + "grad_norm": 6.652994632720947, + "learning_rate": 9.542481994836256e-06, + "loss": 3.3908, + "step": 5390 + }, + { + "epoch": 0.3665579562440549, + "grad_norm": 6.471915245056152, + "learning_rate": 9.542057344747927e-06, + "loss": 3.2642, + "step": 5395 + }, + { + "epoch": 0.36689767631471665, + "grad_norm": 7.027013778686523, + "learning_rate": 9.541632694659602e-06, + "loss": 3.3797, + "step": 5400 + }, + { + "epoch": 0.36723739638537845, + "grad_norm": 6.677957534790039, + "learning_rate": 9.541208044571275e-06, + "loss": 3.245, + "step": 5405 + }, + { + "epoch": 0.36757711645604024, + "grad_norm": 6.817892551422119, + "learning_rate": 9.540783394482946e-06, + "loss": 3.3453, + "step": 5410 + }, + { + "epoch": 0.367916836526702, + "grad_norm": 6.059580326080322, + "learning_rate": 9.54035874439462e-06, + "loss": 3.3633, + "step": 5415 + }, + { + "epoch": 0.3682565565973638, + "grad_norm": 6.174221515655518, + "learning_rate": 9.539934094306293e-06, + "loss": 3.3451, + "step": 5420 + }, + { + "epoch": 0.3685962766680255, + "grad_norm": 8.009212493896484, + "learning_rate": 9.539509444217964e-06, + "loss": 3.1706, + "step": 5425 + }, + { + "epoch": 0.3689359967386873, + "grad_norm": 6.4231858253479, + "learning_rate": 9.539084794129639e-06, + "loss": 3.3845, + "step": 5430 + }, + { + "epoch": 0.3692757168093491, + "grad_norm": 7.033600807189941, + "learning_rate": 9.53866014404131e-06, + "loss": 3.3745, + "step": 5435 + }, + { + "epoch": 0.36961543688001086, + "grad_norm": 7.262562274932861, + "learning_rate": 9.538235493952983e-06, + "loss": 3.2104, + "step": 5440 + }, + { + "epoch": 0.36995515695067266, + "grad_norm": 6.597311019897461, + "learning_rate": 9.537810843864657e-06, + "loss": 3.3878, + "step": 5445 + }, + { + "epoch": 0.3702948770213344, + "grad_norm": 7.472986698150635, + "learning_rate": 9.537386193776328e-06, + "loss": 3.3607, + "step": 5450 + }, + { + "epoch": 0.3706345970919962, + "grad_norm": 6.671315670013428, + "learning_rate": 9.536961543688001e-06, + "loss": 3.6341, + "step": 5455 + }, + { + "epoch": 0.370974317162658, + "grad_norm": 5.867924690246582, + "learning_rate": 9.536536893599676e-06, + "loss": 3.4102, + "step": 5460 + }, + { + "epoch": 0.37131403723331974, + "grad_norm": 7.664874076843262, + "learning_rate": 9.536112243511347e-06, + "loss": 3.231, + "step": 5465 + }, + { + "epoch": 0.37165375730398154, + "grad_norm": 9.557147026062012, + "learning_rate": 9.53568759342302e-06, + "loss": 3.4505, + "step": 5470 + }, + { + "epoch": 0.3719934773746433, + "grad_norm": 7.588325023651123, + "learning_rate": 9.535262943334694e-06, + "loss": 3.6015, + "step": 5475 + }, + { + "epoch": 0.3723331974453051, + "grad_norm": 6.409093856811523, + "learning_rate": 9.534838293246365e-06, + "loss": 3.5358, + "step": 5480 + }, + { + "epoch": 0.3726729175159668, + "grad_norm": 7.807512283325195, + "learning_rate": 9.534413643158038e-06, + "loss": 3.683, + "step": 5485 + }, + { + "epoch": 0.3730126375866286, + "grad_norm": 8.072503089904785, + "learning_rate": 9.533988993069712e-06, + "loss": 3.4526, + "step": 5490 + }, + { + "epoch": 0.3733523576572904, + "grad_norm": 6.537031650543213, + "learning_rate": 9.533564342981384e-06, + "loss": 3.4263, + "step": 5495 + }, + { + "epoch": 0.37369207772795215, + "grad_norm": 6.469498157501221, + "learning_rate": 9.533139692893056e-06, + "loss": 3.3511, + "step": 5500 + }, + { + "epoch": 0.37403179779861395, + "grad_norm": 6.622940540313721, + "learning_rate": 9.532715042804729e-06, + "loss": 3.6646, + "step": 5505 + }, + { + "epoch": 0.3743715178692757, + "grad_norm": 5.963841438293457, + "learning_rate": 9.532290392716402e-06, + "loss": 3.3126, + "step": 5510 + }, + { + "epoch": 0.3747112379399375, + "grad_norm": 7.084362030029297, + "learning_rate": 9.531865742628075e-06, + "loss": 3.5099, + "step": 5515 + }, + { + "epoch": 0.3750509580105993, + "grad_norm": 6.0330891609191895, + "learning_rate": 9.531441092539748e-06, + "loss": 3.1549, + "step": 5520 + }, + { + "epoch": 0.37539067808126103, + "grad_norm": 6.228006839752197, + "learning_rate": 9.53101644245142e-06, + "loss": 3.1666, + "step": 5525 + }, + { + "epoch": 0.37573039815192283, + "grad_norm": 8.06070327758789, + "learning_rate": 9.530591792363093e-06, + "loss": 3.3083, + "step": 5530 + }, + { + "epoch": 0.37607011822258457, + "grad_norm": 5.80387020111084, + "learning_rate": 9.530167142274766e-06, + "loss": 3.6638, + "step": 5535 + }, + { + "epoch": 0.37640983829324637, + "grad_norm": 7.130016326904297, + "learning_rate": 9.529742492186439e-06, + "loss": 3.4508, + "step": 5540 + }, + { + "epoch": 0.37674955836390817, + "grad_norm": 5.89362096786499, + "learning_rate": 9.529317842098112e-06, + "loss": 3.6863, + "step": 5545 + }, + { + "epoch": 0.3770892784345699, + "grad_norm": 5.279539108276367, + "learning_rate": 9.528893192009784e-06, + "loss": 3.7858, + "step": 5550 + }, + { + "epoch": 0.3774289985052317, + "grad_norm": 6.982321739196777, + "learning_rate": 9.528468541921457e-06, + "loss": 3.5996, + "step": 5555 + }, + { + "epoch": 0.37776871857589345, + "grad_norm": 9.375544548034668, + "learning_rate": 9.528043891833132e-06, + "loss": 3.5554, + "step": 5560 + }, + { + "epoch": 0.37810843864655524, + "grad_norm": 6.756528854370117, + "learning_rate": 9.527619241744803e-06, + "loss": 3.3056, + "step": 5565 + }, + { + "epoch": 0.378448158717217, + "grad_norm": 6.9144511222839355, + "learning_rate": 9.527194591656476e-06, + "loss": 3.4468, + "step": 5570 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 6.46054744720459, + "learning_rate": 9.526769941568148e-06, + "loss": 3.2367, + "step": 5575 + }, + { + "epoch": 0.3791275988585406, + "grad_norm": 7.5032958984375, + "learning_rate": 9.526345291479821e-06, + "loss": 3.327, + "step": 5580 + }, + { + "epoch": 0.3794673189292023, + "grad_norm": 7.416750431060791, + "learning_rate": 9.525920641391494e-06, + "loss": 3.3336, + "step": 5585 + }, + { + "epoch": 0.3798070389998641, + "grad_norm": 6.300477504730225, + "learning_rate": 9.525495991303167e-06, + "loss": 3.2232, + "step": 5590 + }, + { + "epoch": 0.38014675907052586, + "grad_norm": 7.246498107910156, + "learning_rate": 9.52507134121484e-06, + "loss": 3.2652, + "step": 5595 + }, + { + "epoch": 0.38048647914118766, + "grad_norm": 5.818582534790039, + "learning_rate": 9.524646691126512e-06, + "loss": 3.5046, + "step": 5600 + }, + { + "epoch": 0.38082619921184946, + "grad_norm": 6.982439994812012, + "learning_rate": 9.524222041038185e-06, + "loss": 3.4227, + "step": 5605 + }, + { + "epoch": 0.3811659192825112, + "grad_norm": 6.950711250305176, + "learning_rate": 9.523797390949858e-06, + "loss": 3.3013, + "step": 5610 + }, + { + "epoch": 0.381505639353173, + "grad_norm": 6.144259452819824, + "learning_rate": 9.52337274086153e-06, + "loss": 3.3559, + "step": 5615 + }, + { + "epoch": 0.38184535942383474, + "grad_norm": 5.517207622528076, + "learning_rate": 9.522948090773204e-06, + "loss": 3.4889, + "step": 5620 + }, + { + "epoch": 0.38218507949449654, + "grad_norm": 8.506850242614746, + "learning_rate": 9.522523440684876e-06, + "loss": 3.3662, + "step": 5625 + }, + { + "epoch": 0.38252479956515834, + "grad_norm": 8.213678359985352, + "learning_rate": 9.522098790596549e-06, + "loss": 3.4891, + "step": 5630 + }, + { + "epoch": 0.3828645196358201, + "grad_norm": 7.085252285003662, + "learning_rate": 9.521674140508222e-06, + "loss": 3.2767, + "step": 5635 + }, + { + "epoch": 0.3832042397064819, + "grad_norm": 6.704138278961182, + "learning_rate": 9.521249490419895e-06, + "loss": 3.4298, + "step": 5640 + }, + { + "epoch": 0.3835439597771436, + "grad_norm": 7.638959884643555, + "learning_rate": 9.520824840331568e-06, + "loss": 3.4646, + "step": 5645 + }, + { + "epoch": 0.3838836798478054, + "grad_norm": 7.942511558532715, + "learning_rate": 9.52040019024324e-06, + "loss": 3.3275, + "step": 5650 + }, + { + "epoch": 0.38422339991846716, + "grad_norm": 6.270159721374512, + "learning_rate": 9.519975540154913e-06, + "loss": 3.469, + "step": 5655 + }, + { + "epoch": 0.38456311998912895, + "grad_norm": 5.687293529510498, + "learning_rate": 9.519550890066586e-06, + "loss": 3.3189, + "step": 5660 + }, + { + "epoch": 0.38490284005979075, + "grad_norm": 6.125545501708984, + "learning_rate": 9.519126239978259e-06, + "loss": 3.3565, + "step": 5665 + }, + { + "epoch": 0.3852425601304525, + "grad_norm": 6.526694297790527, + "learning_rate": 9.518701589889932e-06, + "loss": 3.2691, + "step": 5670 + }, + { + "epoch": 0.3855822802011143, + "grad_norm": 6.728771686553955, + "learning_rate": 9.518276939801604e-06, + "loss": 3.1971, + "step": 5675 + }, + { + "epoch": 0.38592200027177603, + "grad_norm": 7.100254058837891, + "learning_rate": 9.517852289713277e-06, + "loss": 3.2912, + "step": 5680 + }, + { + "epoch": 0.38626172034243783, + "grad_norm": 6.287652492523193, + "learning_rate": 9.51742763962495e-06, + "loss": 3.2375, + "step": 5685 + }, + { + "epoch": 0.38660144041309963, + "grad_norm": 7.502579689025879, + "learning_rate": 9.517002989536623e-06, + "loss": 3.483, + "step": 5690 + }, + { + "epoch": 0.38694116048376137, + "grad_norm": 6.308753967285156, + "learning_rate": 9.516578339448296e-06, + "loss": 3.3475, + "step": 5695 + }, + { + "epoch": 0.38728088055442317, + "grad_norm": 7.709463596343994, + "learning_rate": 9.516153689359968e-06, + "loss": 3.3861, + "step": 5700 + }, + { + "epoch": 0.3876206006250849, + "grad_norm": 6.528424263000488, + "learning_rate": 9.515729039271641e-06, + "loss": 3.1853, + "step": 5705 + }, + { + "epoch": 0.3879603206957467, + "grad_norm": 5.87557315826416, + "learning_rate": 9.515304389183314e-06, + "loss": 3.4249, + "step": 5710 + }, + { + "epoch": 0.3883000407664085, + "grad_norm": 6.590567111968994, + "learning_rate": 9.514879739094987e-06, + "loss": 3.4805, + "step": 5715 + }, + { + "epoch": 0.38863976083707025, + "grad_norm": 7.304425239562988, + "learning_rate": 9.51445508900666e-06, + "loss": 3.3455, + "step": 5720 + }, + { + "epoch": 0.38897948090773204, + "grad_norm": 6.849177837371826, + "learning_rate": 9.514030438918332e-06, + "loss": 3.5822, + "step": 5725 + }, + { + "epoch": 0.3893192009783938, + "grad_norm": 6.86956787109375, + "learning_rate": 9.513605788830005e-06, + "loss": 3.4146, + "step": 5730 + }, + { + "epoch": 0.3896589210490556, + "grad_norm": 8.572770118713379, + "learning_rate": 9.513181138741678e-06, + "loss": 3.4605, + "step": 5735 + }, + { + "epoch": 0.3899986411197173, + "grad_norm": 6.112575531005859, + "learning_rate": 9.51275648865335e-06, + "loss": 3.6508, + "step": 5740 + }, + { + "epoch": 0.3903383611903791, + "grad_norm": 5.32134485244751, + "learning_rate": 9.512331838565024e-06, + "loss": 3.3746, + "step": 5745 + }, + { + "epoch": 0.3906780812610409, + "grad_norm": 6.611032485961914, + "learning_rate": 9.511907188476696e-06, + "loss": 3.385, + "step": 5750 + }, + { + "epoch": 0.39101780133170266, + "grad_norm": 6.939200401306152, + "learning_rate": 9.511482538388369e-06, + "loss": 3.5072, + "step": 5755 + }, + { + "epoch": 0.39135752140236446, + "grad_norm": 8.3921537399292, + "learning_rate": 9.511057888300042e-06, + "loss": 3.444, + "step": 5760 + }, + { + "epoch": 0.3916972414730262, + "grad_norm": 8.26220417022705, + "learning_rate": 9.510633238211715e-06, + "loss": 3.2129, + "step": 5765 + }, + { + "epoch": 0.392036961543688, + "grad_norm": 6.8053998947143555, + "learning_rate": 9.510208588123388e-06, + "loss": 3.4665, + "step": 5770 + }, + { + "epoch": 0.3923766816143498, + "grad_norm": 6.714470386505127, + "learning_rate": 9.50978393803506e-06, + "loss": 3.4554, + "step": 5775 + }, + { + "epoch": 0.39271640168501154, + "grad_norm": 7.200636863708496, + "learning_rate": 9.509359287946731e-06, + "loss": 3.2287, + "step": 5780 + }, + { + "epoch": 0.39305612175567334, + "grad_norm": 7.100858211517334, + "learning_rate": 9.508934637858406e-06, + "loss": 3.4534, + "step": 5785 + }, + { + "epoch": 0.3933958418263351, + "grad_norm": 8.184640884399414, + "learning_rate": 9.508509987770079e-06, + "loss": 3.6462, + "step": 5790 + }, + { + "epoch": 0.3937355618969969, + "grad_norm": 6.081997871398926, + "learning_rate": 9.50808533768175e-06, + "loss": 3.388, + "step": 5795 + }, + { + "epoch": 0.3940752819676587, + "grad_norm": 8.927611351013184, + "learning_rate": 9.507660687593424e-06, + "loss": 3.5702, + "step": 5800 + }, + { + "epoch": 0.3944150020383204, + "grad_norm": 7.669685363769531, + "learning_rate": 9.507236037505097e-06, + "loss": 3.5504, + "step": 5805 + }, + { + "epoch": 0.3947547221089822, + "grad_norm": 6.5881428718566895, + "learning_rate": 9.506811387416768e-06, + "loss": 3.3956, + "step": 5810 + }, + { + "epoch": 0.39509444217964396, + "grad_norm": 7.645932197570801, + "learning_rate": 9.506386737328443e-06, + "loss": 3.0199, + "step": 5815 + }, + { + "epoch": 0.39543416225030575, + "grad_norm": 6.739983558654785, + "learning_rate": 9.505962087240116e-06, + "loss": 3.6069, + "step": 5820 + }, + { + "epoch": 0.3957738823209675, + "grad_norm": 7.312482833862305, + "learning_rate": 9.505537437151787e-06, + "loss": 3.4807, + "step": 5825 + }, + { + "epoch": 0.3961136023916293, + "grad_norm": 7.49228048324585, + "learning_rate": 9.505112787063461e-06, + "loss": 3.2293, + "step": 5830 + }, + { + "epoch": 0.3964533224622911, + "grad_norm": 7.029796600341797, + "learning_rate": 9.504688136975134e-06, + "loss": 3.6218, + "step": 5835 + }, + { + "epoch": 0.39679304253295283, + "grad_norm": 6.776014804840088, + "learning_rate": 9.504263486886805e-06, + "loss": 3.3781, + "step": 5840 + }, + { + "epoch": 0.39713276260361463, + "grad_norm": 6.212076663970947, + "learning_rate": 9.50383883679848e-06, + "loss": 3.279, + "step": 5845 + }, + { + "epoch": 0.3974724826742764, + "grad_norm": 7.190142631530762, + "learning_rate": 9.50341418671015e-06, + "loss": 3.3042, + "step": 5850 + }, + { + "epoch": 0.39781220274493817, + "grad_norm": 6.754173755645752, + "learning_rate": 9.502989536621823e-06, + "loss": 3.4045, + "step": 5855 + }, + { + "epoch": 0.39815192281559997, + "grad_norm": 5.659743309020996, + "learning_rate": 9.502564886533498e-06, + "loss": 3.4487, + "step": 5860 + }, + { + "epoch": 0.3984916428862617, + "grad_norm": 5.435281276702881, + "learning_rate": 9.502140236445169e-06, + "loss": 3.5426, + "step": 5865 + }, + { + "epoch": 0.3988313629569235, + "grad_norm": 7.711711406707764, + "learning_rate": 9.501715586356842e-06, + "loss": 3.3312, + "step": 5870 + }, + { + "epoch": 0.39917108302758525, + "grad_norm": 5.220008373260498, + "learning_rate": 9.501290936268516e-06, + "loss": 3.0917, + "step": 5875 + }, + { + "epoch": 0.39951080309824705, + "grad_norm": 7.940798759460449, + "learning_rate": 9.500866286180187e-06, + "loss": 3.4926, + "step": 5880 + }, + { + "epoch": 0.39985052316890884, + "grad_norm": 6.372572898864746, + "learning_rate": 9.50044163609186e-06, + "loss": 3.6518, + "step": 5885 + }, + { + "epoch": 0.4001902432395706, + "grad_norm": 6.033905982971191, + "learning_rate": 9.500016986003535e-06, + "loss": 3.4549, + "step": 5890 + }, + { + "epoch": 0.4005299633102324, + "grad_norm": 7.11289119720459, + "learning_rate": 9.499592335915206e-06, + "loss": 3.0925, + "step": 5895 + }, + { + "epoch": 0.4008696833808941, + "grad_norm": 8.02014446258545, + "learning_rate": 9.49916768582688e-06, + "loss": 3.4115, + "step": 5900 + }, + { + "epoch": 0.4012094034515559, + "grad_norm": 5.697676181793213, + "learning_rate": 9.498743035738553e-06, + "loss": 3.4208, + "step": 5905 + }, + { + "epoch": 0.40154912352221767, + "grad_norm": 8.30915641784668, + "learning_rate": 9.498318385650224e-06, + "loss": 3.4765, + "step": 5910 + }, + { + "epoch": 0.40188884359287946, + "grad_norm": 7.23581075668335, + "learning_rate": 9.497893735561899e-06, + "loss": 3.4732, + "step": 5915 + }, + { + "epoch": 0.40222856366354126, + "grad_norm": 7.821160316467285, + "learning_rate": 9.49746908547357e-06, + "loss": 3.5932, + "step": 5920 + }, + { + "epoch": 0.402568283734203, + "grad_norm": 6.7192864418029785, + "learning_rate": 9.497044435385243e-06, + "loss": 3.3461, + "step": 5925 + }, + { + "epoch": 0.4029080038048648, + "grad_norm": 7.260104656219482, + "learning_rate": 9.496619785296917e-06, + "loss": 3.4467, + "step": 5930 + }, + { + "epoch": 0.40324772387552654, + "grad_norm": 6.08171272277832, + "learning_rate": 9.496195135208588e-06, + "loss": 3.5909, + "step": 5935 + }, + { + "epoch": 0.40358744394618834, + "grad_norm": 6.6215057373046875, + "learning_rate": 9.495770485120261e-06, + "loss": 3.3109, + "step": 5940 + }, + { + "epoch": 0.40392716401685014, + "grad_norm": 7.297423839569092, + "learning_rate": 9.495345835031936e-06, + "loss": 3.4017, + "step": 5945 + }, + { + "epoch": 0.4042668840875119, + "grad_norm": 5.683389663696289, + "learning_rate": 9.494921184943607e-06, + "loss": 3.6158, + "step": 5950 + }, + { + "epoch": 0.4046066041581737, + "grad_norm": 6.348202705383301, + "learning_rate": 9.49449653485528e-06, + "loss": 3.4864, + "step": 5955 + }, + { + "epoch": 0.4049463242288354, + "grad_norm": 7.194920539855957, + "learning_rate": 9.494071884766954e-06, + "loss": 3.3765, + "step": 5960 + }, + { + "epoch": 0.4052860442994972, + "grad_norm": 6.812239170074463, + "learning_rate": 9.493647234678625e-06, + "loss": 3.4716, + "step": 5965 + }, + { + "epoch": 0.405625764370159, + "grad_norm": 7.622183799743652, + "learning_rate": 9.493222584590298e-06, + "loss": 3.7238, + "step": 5970 + }, + { + "epoch": 0.40596548444082076, + "grad_norm": 6.37131404876709, + "learning_rate": 9.492797934501972e-06, + "loss": 3.2517, + "step": 5975 + }, + { + "epoch": 0.40630520451148255, + "grad_norm": 7.829806804656982, + "learning_rate": 9.492373284413643e-06, + "loss": 3.3991, + "step": 5980 + }, + { + "epoch": 0.4066449245821443, + "grad_norm": 5.652291297912598, + "learning_rate": 9.491948634325316e-06, + "loss": 3.3371, + "step": 5985 + }, + { + "epoch": 0.4069846446528061, + "grad_norm": 7.8592915534973145, + "learning_rate": 9.49152398423699e-06, + "loss": 3.7193, + "step": 5990 + }, + { + "epoch": 0.40732436472346784, + "grad_norm": 5.926516532897949, + "learning_rate": 9.491099334148662e-06, + "loss": 3.4444, + "step": 5995 + }, + { + "epoch": 0.40766408479412963, + "grad_norm": 5.829591751098633, + "learning_rate": 9.490674684060335e-06, + "loss": 3.2887, + "step": 6000 + }, + { + "epoch": 0.40800380486479143, + "grad_norm": 7.3568010330200195, + "learning_rate": 9.490250033972007e-06, + "loss": 3.4245, + "step": 6005 + }, + { + "epoch": 0.4083435249354532, + "grad_norm": 7.363008499145508, + "learning_rate": 9.48982538388368e-06, + "loss": 3.3752, + "step": 6010 + }, + { + "epoch": 0.40868324500611497, + "grad_norm": 6.110202789306641, + "learning_rate": 9.489400733795353e-06, + "loss": 3.3905, + "step": 6015 + }, + { + "epoch": 0.4090229650767767, + "grad_norm": 7.4788737297058105, + "learning_rate": 9.488976083707026e-06, + "loss": 3.4624, + "step": 6020 + }, + { + "epoch": 0.4093626851474385, + "grad_norm": 7.106937408447266, + "learning_rate": 9.488551433618699e-06, + "loss": 3.2335, + "step": 6025 + }, + { + "epoch": 0.4097024052181003, + "grad_norm": 7.158503532409668, + "learning_rate": 9.488126783530371e-06, + "loss": 3.1872, + "step": 6030 + }, + { + "epoch": 0.41004212528876205, + "grad_norm": 7.065827369689941, + "learning_rate": 9.487702133442044e-06, + "loss": 3.3792, + "step": 6035 + }, + { + "epoch": 0.41038184535942385, + "grad_norm": 9.904696464538574, + "learning_rate": 9.487277483353717e-06, + "loss": 3.5256, + "step": 6040 + }, + { + "epoch": 0.4107215654300856, + "grad_norm": 6.784527778625488, + "learning_rate": 9.48685283326539e-06, + "loss": 3.3128, + "step": 6045 + }, + { + "epoch": 0.4110612855007474, + "grad_norm": 6.896822929382324, + "learning_rate": 9.486428183177063e-06, + "loss": 3.3899, + "step": 6050 + }, + { + "epoch": 0.4114010055714092, + "grad_norm": 6.261762619018555, + "learning_rate": 9.486003533088735e-06, + "loss": 3.2483, + "step": 6055 + }, + { + "epoch": 0.4117407256420709, + "grad_norm": 7.196678638458252, + "learning_rate": 9.485578883000408e-06, + "loss": 3.2504, + "step": 6060 + }, + { + "epoch": 0.4120804457127327, + "grad_norm": 9.269731521606445, + "learning_rate": 9.485154232912081e-06, + "loss": 3.4569, + "step": 6065 + }, + { + "epoch": 0.41242016578339447, + "grad_norm": 8.222381591796875, + "learning_rate": 9.484729582823754e-06, + "loss": 3.6045, + "step": 6070 + }, + { + "epoch": 0.41275988585405626, + "grad_norm": 7.385223865509033, + "learning_rate": 9.484304932735427e-06, + "loss": 3.025, + "step": 6075 + }, + { + "epoch": 0.413099605924718, + "grad_norm": 8.648031234741211, + "learning_rate": 9.4838802826471e-06, + "loss": 3.5673, + "step": 6080 + }, + { + "epoch": 0.4134393259953798, + "grad_norm": 9.593182563781738, + "learning_rate": 9.483455632558772e-06, + "loss": 3.4453, + "step": 6085 + }, + { + "epoch": 0.4137790460660416, + "grad_norm": 5.965849876403809, + "learning_rate": 9.483030982470445e-06, + "loss": 3.5455, + "step": 6090 + }, + { + "epoch": 0.41411876613670334, + "grad_norm": 7.52025032043457, + "learning_rate": 9.482606332382118e-06, + "loss": 3.3278, + "step": 6095 + }, + { + "epoch": 0.41445848620736514, + "grad_norm": 6.752696990966797, + "learning_rate": 9.48218168229379e-06, + "loss": 3.3361, + "step": 6100 + }, + { + "epoch": 0.4147982062780269, + "grad_norm": 6.503914833068848, + "learning_rate": 9.481757032205464e-06, + "loss": 3.2161, + "step": 6105 + }, + { + "epoch": 0.4151379263486887, + "grad_norm": 6.2747297286987305, + "learning_rate": 9.481332382117136e-06, + "loss": 3.358, + "step": 6110 + }, + { + "epoch": 0.4154776464193505, + "grad_norm": 8.702648162841797, + "learning_rate": 9.480907732028809e-06, + "loss": 3.2973, + "step": 6115 + }, + { + "epoch": 0.4158173664900122, + "grad_norm": 7.033487796783447, + "learning_rate": 9.480483081940482e-06, + "loss": 3.7077, + "step": 6120 + }, + { + "epoch": 0.416157086560674, + "grad_norm": 6.071026802062988, + "learning_rate": 9.480058431852155e-06, + "loss": 3.4768, + "step": 6125 + }, + { + "epoch": 0.41649680663133576, + "grad_norm": 5.77849006652832, + "learning_rate": 9.479633781763828e-06, + "loss": 3.1615, + "step": 6130 + }, + { + "epoch": 0.41683652670199756, + "grad_norm": 7.835556983947754, + "learning_rate": 9.4792091316755e-06, + "loss": 3.324, + "step": 6135 + }, + { + "epoch": 0.41717624677265935, + "grad_norm": 6.497552394866943, + "learning_rate": 9.478784481587173e-06, + "loss": 3.4256, + "step": 6140 + }, + { + "epoch": 0.4175159668433211, + "grad_norm": 7.5304718017578125, + "learning_rate": 9.478359831498846e-06, + "loss": 3.435, + "step": 6145 + }, + { + "epoch": 0.4178556869139829, + "grad_norm": 9.641617774963379, + "learning_rate": 9.477935181410519e-06, + "loss": 3.5135, + "step": 6150 + }, + { + "epoch": 0.41819540698464464, + "grad_norm": 6.725968837738037, + "learning_rate": 9.477510531322192e-06, + "loss": 3.3359, + "step": 6155 + }, + { + "epoch": 0.41853512705530643, + "grad_norm": 5.792344570159912, + "learning_rate": 9.477085881233864e-06, + "loss": 3.4444, + "step": 6160 + }, + { + "epoch": 0.4188748471259682, + "grad_norm": 6.876856327056885, + "learning_rate": 9.476661231145537e-06, + "loss": 3.7058, + "step": 6165 + }, + { + "epoch": 0.41921456719663, + "grad_norm": 7.8057050704956055, + "learning_rate": 9.47623658105721e-06, + "loss": 3.4633, + "step": 6170 + }, + { + "epoch": 0.41955428726729177, + "grad_norm": 7.6334943771362305, + "learning_rate": 9.475811930968883e-06, + "loss": 3.5169, + "step": 6175 + }, + { + "epoch": 0.4198940073379535, + "grad_norm": 6.237453460693359, + "learning_rate": 9.475387280880556e-06, + "loss": 3.3575, + "step": 6180 + }, + { + "epoch": 0.4202337274086153, + "grad_norm": 5.981119155883789, + "learning_rate": 9.474962630792228e-06, + "loss": 3.2776, + "step": 6185 + }, + { + "epoch": 0.42057344747927705, + "grad_norm": 6.211704254150391, + "learning_rate": 9.474537980703901e-06, + "loss": 3.1882, + "step": 6190 + }, + { + "epoch": 0.42091316754993885, + "grad_norm": 4.898773193359375, + "learning_rate": 9.474113330615572e-06, + "loss": 3.4555, + "step": 6195 + }, + { + "epoch": 0.42125288762060065, + "grad_norm": 6.481904983520508, + "learning_rate": 9.473688680527247e-06, + "loss": 3.3964, + "step": 6200 + }, + { + "epoch": 0.4215926076912624, + "grad_norm": 7.897691249847412, + "learning_rate": 9.47326403043892e-06, + "loss": 3.28, + "step": 6205 + }, + { + "epoch": 0.4219323277619242, + "grad_norm": 6.409961700439453, + "learning_rate": 9.47283938035059e-06, + "loss": 3.3193, + "step": 6210 + }, + { + "epoch": 0.42227204783258593, + "grad_norm": 8.01500415802002, + "learning_rate": 9.472414730262265e-06, + "loss": 3.2538, + "step": 6215 + }, + { + "epoch": 0.4226117679032477, + "grad_norm": 6.584757328033447, + "learning_rate": 9.471990080173938e-06, + "loss": 3.3306, + "step": 6220 + }, + { + "epoch": 0.4229514879739095, + "grad_norm": 6.43218469619751, + "learning_rate": 9.471565430085609e-06, + "loss": 3.2836, + "step": 6225 + }, + { + "epoch": 0.42329120804457127, + "grad_norm": 5.731999397277832, + "learning_rate": 9.471140779997284e-06, + "loss": 3.5014, + "step": 6230 + }, + { + "epoch": 0.42363092811523306, + "grad_norm": 5.836027145385742, + "learning_rate": 9.470716129908956e-06, + "loss": 3.516, + "step": 6235 + }, + { + "epoch": 0.4239706481858948, + "grad_norm": 5.769575595855713, + "learning_rate": 9.470291479820629e-06, + "loss": 3.2894, + "step": 6240 + }, + { + "epoch": 0.4243103682565566, + "grad_norm": 7.9777655601501465, + "learning_rate": 9.469866829732302e-06, + "loss": 3.3641, + "step": 6245 + }, + { + "epoch": 0.42465008832721834, + "grad_norm": 6.540637969970703, + "learning_rate": 9.469442179643975e-06, + "loss": 3.3969, + "step": 6250 + }, + { + "epoch": 0.42498980839788014, + "grad_norm": 6.216270923614502, + "learning_rate": 9.469017529555648e-06, + "loss": 3.4189, + "step": 6255 + }, + { + "epoch": 0.42532952846854194, + "grad_norm": 7.564327716827393, + "learning_rate": 9.46859287946732e-06, + "loss": 3.1743, + "step": 6260 + }, + { + "epoch": 0.4256692485392037, + "grad_norm": 5.288341522216797, + "learning_rate": 9.468168229378991e-06, + "loss": 3.2104, + "step": 6265 + }, + { + "epoch": 0.4260089686098655, + "grad_norm": 8.07434368133545, + "learning_rate": 9.467743579290666e-06, + "loss": 3.5471, + "step": 6270 + }, + { + "epoch": 0.4263486886805272, + "grad_norm": 6.595783233642578, + "learning_rate": 9.467318929202339e-06, + "loss": 3.314, + "step": 6275 + }, + { + "epoch": 0.426688408751189, + "grad_norm": 9.31948184967041, + "learning_rate": 9.46689427911401e-06, + "loss": 3.3296, + "step": 6280 + }, + { + "epoch": 0.4270281288218508, + "grad_norm": 8.399971008300781, + "learning_rate": 9.466469629025684e-06, + "loss": 3.5627, + "step": 6285 + }, + { + "epoch": 0.42736784889251256, + "grad_norm": 8.560869216918945, + "learning_rate": 9.466044978937357e-06, + "loss": 3.4209, + "step": 6290 + }, + { + "epoch": 0.42770756896317436, + "grad_norm": 5.91993522644043, + "learning_rate": 9.465620328849028e-06, + "loss": 3.5633, + "step": 6295 + }, + { + "epoch": 0.4280472890338361, + "grad_norm": 5.0648193359375, + "learning_rate": 9.465195678760703e-06, + "loss": 3.3204, + "step": 6300 + }, + { + "epoch": 0.4283870091044979, + "grad_norm": 5.8893842697143555, + "learning_rate": 9.464771028672376e-06, + "loss": 3.0649, + "step": 6305 + }, + { + "epoch": 0.4287267291751597, + "grad_norm": 6.3182501792907715, + "learning_rate": 9.464346378584047e-06, + "loss": 3.6714, + "step": 6310 + }, + { + "epoch": 0.42906644924582144, + "grad_norm": 7.0247578620910645, + "learning_rate": 9.463921728495721e-06, + "loss": 3.4565, + "step": 6315 + }, + { + "epoch": 0.42940616931648323, + "grad_norm": 7.556800365447998, + "learning_rate": 9.463497078407394e-06, + "loss": 3.3952, + "step": 6320 + }, + { + "epoch": 0.429745889387145, + "grad_norm": 7.505949974060059, + "learning_rate": 9.463072428319065e-06, + "loss": 3.2417, + "step": 6325 + }, + { + "epoch": 0.4300856094578068, + "grad_norm": 9.284164428710938, + "learning_rate": 9.46264777823074e-06, + "loss": 3.3475, + "step": 6330 + }, + { + "epoch": 0.4304253295284685, + "grad_norm": 6.8819169998168945, + "learning_rate": 9.462223128142412e-06, + "loss": 3.4019, + "step": 6335 + }, + { + "epoch": 0.4307650495991303, + "grad_norm": 6.666365623474121, + "learning_rate": 9.461798478054083e-06, + "loss": 3.2892, + "step": 6340 + }, + { + "epoch": 0.4311047696697921, + "grad_norm": 6.657983303070068, + "learning_rate": 9.461373827965758e-06, + "loss": 3.2665, + "step": 6345 + }, + { + "epoch": 0.43144448974045385, + "grad_norm": 7.782900810241699, + "learning_rate": 9.460949177877429e-06, + "loss": 3.3023, + "step": 6350 + }, + { + "epoch": 0.43178420981111565, + "grad_norm": 7.746505260467529, + "learning_rate": 9.460524527789102e-06, + "loss": 3.3917, + "step": 6355 + }, + { + "epoch": 0.4321239298817774, + "grad_norm": 6.757083892822266, + "learning_rate": 9.460099877700776e-06, + "loss": 3.2114, + "step": 6360 + }, + { + "epoch": 0.4324636499524392, + "grad_norm": 6.307335376739502, + "learning_rate": 9.459675227612447e-06, + "loss": 3.3192, + "step": 6365 + }, + { + "epoch": 0.432803370023101, + "grad_norm": 8.651726722717285, + "learning_rate": 9.45925057752412e-06, + "loss": 3.1487, + "step": 6370 + }, + { + "epoch": 0.43314309009376273, + "grad_norm": 8.956656455993652, + "learning_rate": 9.458825927435795e-06, + "loss": 3.3172, + "step": 6375 + }, + { + "epoch": 0.4334828101644245, + "grad_norm": 6.394066333770752, + "learning_rate": 9.458401277347466e-06, + "loss": 3.4572, + "step": 6380 + }, + { + "epoch": 0.43382253023508627, + "grad_norm": 9.836201667785645, + "learning_rate": 9.457976627259139e-06, + "loss": 3.512, + "step": 6385 + }, + { + "epoch": 0.43416225030574807, + "grad_norm": 8.087204933166504, + "learning_rate": 9.457551977170813e-06, + "loss": 3.3858, + "step": 6390 + }, + { + "epoch": 0.43450197037640986, + "grad_norm": 6.261463642120361, + "learning_rate": 9.457127327082484e-06, + "loss": 3.5554, + "step": 6395 + }, + { + "epoch": 0.4348416904470716, + "grad_norm": 8.014594078063965, + "learning_rate": 9.456702676994157e-06, + "loss": 3.3493, + "step": 6400 + }, + { + "epoch": 0.4351814105177334, + "grad_norm": 5.856029510498047, + "learning_rate": 9.456278026905832e-06, + "loss": 3.3742, + "step": 6405 + }, + { + "epoch": 0.43552113058839514, + "grad_norm": 6.555278778076172, + "learning_rate": 9.455853376817503e-06, + "loss": 3.274, + "step": 6410 + }, + { + "epoch": 0.43586085065905694, + "grad_norm": 7.048019886016846, + "learning_rate": 9.455428726729175e-06, + "loss": 3.2902, + "step": 6415 + }, + { + "epoch": 0.4362005707297187, + "grad_norm": 6.30293607711792, + "learning_rate": 9.455004076640848e-06, + "loss": 3.2779, + "step": 6420 + }, + { + "epoch": 0.4365402908003805, + "grad_norm": 6.751884937286377, + "learning_rate": 9.454579426552521e-06, + "loss": 3.2178, + "step": 6425 + }, + { + "epoch": 0.4368800108710423, + "grad_norm": 6.375414848327637, + "learning_rate": 9.454154776464194e-06, + "loss": 3.5986, + "step": 6430 + }, + { + "epoch": 0.437219730941704, + "grad_norm": 7.0296478271484375, + "learning_rate": 9.453730126375867e-06, + "loss": 3.4101, + "step": 6435 + }, + { + "epoch": 0.4375594510123658, + "grad_norm": 7.793421268463135, + "learning_rate": 9.45330547628754e-06, + "loss": 3.3192, + "step": 6440 + }, + { + "epoch": 0.43789917108302756, + "grad_norm": 9.065211296081543, + "learning_rate": 9.452880826199212e-06, + "loss": 3.4084, + "step": 6445 + }, + { + "epoch": 0.43823889115368936, + "grad_norm": 7.5156941413879395, + "learning_rate": 9.452456176110885e-06, + "loss": 3.4599, + "step": 6450 + }, + { + "epoch": 0.43857861122435116, + "grad_norm": 7.054537773132324, + "learning_rate": 9.452031526022558e-06, + "loss": 3.5131, + "step": 6455 + }, + { + "epoch": 0.4389183312950129, + "grad_norm": 6.726709842681885, + "learning_rate": 9.45160687593423e-06, + "loss": 3.2617, + "step": 6460 + }, + { + "epoch": 0.4392580513656747, + "grad_norm": 7.466663360595703, + "learning_rate": 9.451182225845903e-06, + "loss": 3.3703, + "step": 6465 + }, + { + "epoch": 0.43959777143633644, + "grad_norm": 6.975412368774414, + "learning_rate": 9.450757575757576e-06, + "loss": 3.1778, + "step": 6470 + }, + { + "epoch": 0.43993749150699824, + "grad_norm": 7.012231349945068, + "learning_rate": 9.450332925669249e-06, + "loss": 3.5913, + "step": 6475 + }, + { + "epoch": 0.44027721157766003, + "grad_norm": 6.149038791656494, + "learning_rate": 9.449908275580922e-06, + "loss": 3.57, + "step": 6480 + }, + { + "epoch": 0.4406169316483218, + "grad_norm": 7.091043949127197, + "learning_rate": 9.449483625492595e-06, + "loss": 3.3046, + "step": 6485 + }, + { + "epoch": 0.44095665171898357, + "grad_norm": 6.876466274261475, + "learning_rate": 9.449058975404267e-06, + "loss": 3.5998, + "step": 6490 + }, + { + "epoch": 0.4412963717896453, + "grad_norm": 7.152483940124512, + "learning_rate": 9.44863432531594e-06, + "loss": 3.353, + "step": 6495 + }, + { + "epoch": 0.4416360918603071, + "grad_norm": 7.677128791809082, + "learning_rate": 9.448209675227613e-06, + "loss": 3.6069, + "step": 6500 + }, + { + "epoch": 0.4419758119309689, + "grad_norm": 7.035632133483887, + "learning_rate": 9.447785025139286e-06, + "loss": 3.2464, + "step": 6505 + }, + { + "epoch": 0.44231553200163065, + "grad_norm": 5.8423357009887695, + "learning_rate": 9.447360375050959e-06, + "loss": 3.449, + "step": 6510 + }, + { + "epoch": 0.44265525207229245, + "grad_norm": 7.325884819030762, + "learning_rate": 9.446935724962631e-06, + "loss": 3.371, + "step": 6515 + }, + { + "epoch": 0.4429949721429542, + "grad_norm": 6.766622543334961, + "learning_rate": 9.446511074874304e-06, + "loss": 3.1645, + "step": 6520 + }, + { + "epoch": 0.443334692213616, + "grad_norm": 8.606966018676758, + "learning_rate": 9.446086424785977e-06, + "loss": 3.4307, + "step": 6525 + }, + { + "epoch": 0.44367441228427773, + "grad_norm": 6.483682155609131, + "learning_rate": 9.44566177469765e-06, + "loss": 3.3225, + "step": 6530 + }, + { + "epoch": 0.44401413235493953, + "grad_norm": 7.299102783203125, + "learning_rate": 9.445237124609323e-06, + "loss": 3.2805, + "step": 6535 + }, + { + "epoch": 0.4443538524256013, + "grad_norm": 6.785515308380127, + "learning_rate": 9.444812474520995e-06, + "loss": 3.3083, + "step": 6540 + }, + { + "epoch": 0.44469357249626307, + "grad_norm": 5.95014762878418, + "learning_rate": 9.444387824432668e-06, + "loss": 3.4051, + "step": 6545 + }, + { + "epoch": 0.44503329256692487, + "grad_norm": 7.265538215637207, + "learning_rate": 9.443963174344341e-06, + "loss": 3.11, + "step": 6550 + }, + { + "epoch": 0.4453730126375866, + "grad_norm": 7.993046283721924, + "learning_rate": 9.443538524256014e-06, + "loss": 3.487, + "step": 6555 + }, + { + "epoch": 0.4457127327082484, + "grad_norm": 6.646249294281006, + "learning_rate": 9.443113874167687e-06, + "loss": 3.6603, + "step": 6560 + }, + { + "epoch": 0.4460524527789102, + "grad_norm": 7.089112281799316, + "learning_rate": 9.44268922407936e-06, + "loss": 3.0153, + "step": 6565 + }, + { + "epoch": 0.44639217284957194, + "grad_norm": 6.964341640472412, + "learning_rate": 9.442264573991032e-06, + "loss": 3.2037, + "step": 6570 + }, + { + "epoch": 0.44673189292023374, + "grad_norm": 8.274697303771973, + "learning_rate": 9.441839923902705e-06, + "loss": 3.5161, + "step": 6575 + }, + { + "epoch": 0.4470716129908955, + "grad_norm": 7.015497207641602, + "learning_rate": 9.441415273814378e-06, + "loss": 3.2903, + "step": 6580 + }, + { + "epoch": 0.4474113330615573, + "grad_norm": 7.444988250732422, + "learning_rate": 9.44099062372605e-06, + "loss": 3.349, + "step": 6585 + }, + { + "epoch": 0.4477510531322191, + "grad_norm": 6.515074253082275, + "learning_rate": 9.440565973637723e-06, + "loss": 3.3423, + "step": 6590 + }, + { + "epoch": 0.4480907732028808, + "grad_norm": 6.034765720367432, + "learning_rate": 9.440141323549396e-06, + "loss": 3.3141, + "step": 6595 + }, + { + "epoch": 0.4484304932735426, + "grad_norm": 5.9993205070495605, + "learning_rate": 9.439716673461069e-06, + "loss": 3.554, + "step": 6600 + }, + { + "epoch": 0.44877021334420436, + "grad_norm": 5.774402618408203, + "learning_rate": 9.439292023372742e-06, + "loss": 3.2323, + "step": 6605 + }, + { + "epoch": 0.44910993341486616, + "grad_norm": 6.435078144073486, + "learning_rate": 9.438867373284415e-06, + "loss": 3.2468, + "step": 6610 + }, + { + "epoch": 0.4494496534855279, + "grad_norm": 6.179295063018799, + "learning_rate": 9.438442723196087e-06, + "loss": 3.5801, + "step": 6615 + }, + { + "epoch": 0.4497893735561897, + "grad_norm": 6.027328968048096, + "learning_rate": 9.43801807310776e-06, + "loss": 3.3609, + "step": 6620 + }, + { + "epoch": 0.4501290936268515, + "grad_norm": 7.189925193786621, + "learning_rate": 9.437593423019433e-06, + "loss": 3.4491, + "step": 6625 + }, + { + "epoch": 0.45046881369751324, + "grad_norm": 6.60461950302124, + "learning_rate": 9.437168772931106e-06, + "loss": 3.6333, + "step": 6630 + }, + { + "epoch": 0.45080853376817503, + "grad_norm": 7.849706172943115, + "learning_rate": 9.436744122842779e-06, + "loss": 3.4124, + "step": 6635 + }, + { + "epoch": 0.4511482538388368, + "grad_norm": 6.961631774902344, + "learning_rate": 9.436319472754451e-06, + "loss": 3.5023, + "step": 6640 + }, + { + "epoch": 0.4514879739094986, + "grad_norm": 7.081550121307373, + "learning_rate": 9.435894822666124e-06, + "loss": 3.1249, + "step": 6645 + }, + { + "epoch": 0.45182769398016037, + "grad_norm": 7.405824661254883, + "learning_rate": 9.435470172577797e-06, + "loss": 3.5395, + "step": 6650 + }, + { + "epoch": 0.4521674140508221, + "grad_norm": 8.105003356933594, + "learning_rate": 9.43504552248947e-06, + "loss": 3.376, + "step": 6655 + }, + { + "epoch": 0.4525071341214839, + "grad_norm": 6.291880130767822, + "learning_rate": 9.434620872401143e-06, + "loss": 3.2472, + "step": 6660 + }, + { + "epoch": 0.45284685419214565, + "grad_norm": 6.225637912750244, + "learning_rate": 9.434196222312815e-06, + "loss": 3.1518, + "step": 6665 + }, + { + "epoch": 0.45318657426280745, + "grad_norm": 7.307924270629883, + "learning_rate": 9.433771572224488e-06, + "loss": 3.5565, + "step": 6670 + }, + { + "epoch": 0.45352629433346925, + "grad_norm": 6.968791484832764, + "learning_rate": 9.433346922136161e-06, + "loss": 3.7317, + "step": 6675 + }, + { + "epoch": 0.453866014404131, + "grad_norm": 7.671453952789307, + "learning_rate": 9.432922272047834e-06, + "loss": 3.3902, + "step": 6680 + }, + { + "epoch": 0.4542057344747928, + "grad_norm": 6.705921649932861, + "learning_rate": 9.432497621959507e-06, + "loss": 3.2603, + "step": 6685 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 6.408170700073242, + "learning_rate": 9.43207297187118e-06, + "loss": 3.2427, + "step": 6690 + }, + { + "epoch": 0.45488517461611633, + "grad_norm": 8.185742378234863, + "learning_rate": 9.43164832178285e-06, + "loss": 3.4434, + "step": 6695 + }, + { + "epoch": 0.45522489468677807, + "grad_norm": 8.793863296508789, + "learning_rate": 9.431223671694525e-06, + "loss": 3.1341, + "step": 6700 + }, + { + "epoch": 0.45556461475743987, + "grad_norm": 7.747708320617676, + "learning_rate": 9.430799021606198e-06, + "loss": 3.2761, + "step": 6705 + }, + { + "epoch": 0.45590433482810166, + "grad_norm": 9.14657211303711, + "learning_rate": 9.430374371517869e-06, + "loss": 3.5234, + "step": 6710 + }, + { + "epoch": 0.4562440548987634, + "grad_norm": 6.330729007720947, + "learning_rate": 9.429949721429544e-06, + "loss": 3.5023, + "step": 6715 + }, + { + "epoch": 0.4565837749694252, + "grad_norm": 6.313687801361084, + "learning_rate": 9.429525071341216e-06, + "loss": 3.2848, + "step": 6720 + }, + { + "epoch": 0.45692349504008695, + "grad_norm": 7.5239763259887695, + "learning_rate": 9.429100421252887e-06, + "loss": 3.3212, + "step": 6725 + }, + { + "epoch": 0.45726321511074874, + "grad_norm": 6.254762172698975, + "learning_rate": 9.428675771164562e-06, + "loss": 3.3868, + "step": 6730 + }, + { + "epoch": 0.45760293518141054, + "grad_norm": 9.261787414550781, + "learning_rate": 9.428251121076235e-06, + "loss": 3.3956, + "step": 6735 + }, + { + "epoch": 0.4579426552520723, + "grad_norm": 7.3875226974487305, + "learning_rate": 9.427826470987906e-06, + "loss": 3.3962, + "step": 6740 + }, + { + "epoch": 0.4582823753227341, + "grad_norm": 7.693233489990234, + "learning_rate": 9.42740182089958e-06, + "loss": 3.3033, + "step": 6745 + }, + { + "epoch": 0.4586220953933958, + "grad_norm": 6.197378635406494, + "learning_rate": 9.426977170811253e-06, + "loss": 3.4905, + "step": 6750 + }, + { + "epoch": 0.4589618154640576, + "grad_norm": 6.383137226104736, + "learning_rate": 9.426552520722924e-06, + "loss": 3.4802, + "step": 6755 + }, + { + "epoch": 0.4593015355347194, + "grad_norm": 6.648200511932373, + "learning_rate": 9.426127870634599e-06, + "loss": 3.1047, + "step": 6760 + }, + { + "epoch": 0.45964125560538116, + "grad_norm": 8.228150367736816, + "learning_rate": 9.42570322054627e-06, + "loss": 3.4722, + "step": 6765 + }, + { + "epoch": 0.45998097567604296, + "grad_norm": 7.151434898376465, + "learning_rate": 9.425278570457943e-06, + "loss": 3.4675, + "step": 6770 + }, + { + "epoch": 0.4603206957467047, + "grad_norm": 6.5327067375183105, + "learning_rate": 9.424853920369617e-06, + "loss": 3.4377, + "step": 6775 + }, + { + "epoch": 0.4606604158173665, + "grad_norm": 7.447115421295166, + "learning_rate": 9.424429270281288e-06, + "loss": 3.5227, + "step": 6780 + }, + { + "epoch": 0.46100013588802824, + "grad_norm": 6.768777847290039, + "learning_rate": 9.424004620192961e-06, + "loss": 3.4825, + "step": 6785 + }, + { + "epoch": 0.46133985595869004, + "grad_norm": 7.439245700836182, + "learning_rate": 9.423579970104636e-06, + "loss": 3.2827, + "step": 6790 + }, + { + "epoch": 0.46167957602935183, + "grad_norm": 7.945503234863281, + "learning_rate": 9.423155320016307e-06, + "loss": 3.2529, + "step": 6795 + }, + { + "epoch": 0.4620192961000136, + "grad_norm": 7.008657455444336, + "learning_rate": 9.42273066992798e-06, + "loss": 3.468, + "step": 6800 + }, + { + "epoch": 0.4623590161706754, + "grad_norm": 8.605535507202148, + "learning_rate": 9.422306019839654e-06, + "loss": 3.3374, + "step": 6805 + }, + { + "epoch": 0.4626987362413371, + "grad_norm": 9.60008716583252, + "learning_rate": 9.421881369751325e-06, + "loss": 3.365, + "step": 6810 + }, + { + "epoch": 0.4630384563119989, + "grad_norm": 6.343447685241699, + "learning_rate": 9.421456719662998e-06, + "loss": 3.2304, + "step": 6815 + }, + { + "epoch": 0.4633781763826607, + "grad_norm": 6.772511959075928, + "learning_rate": 9.421032069574672e-06, + "loss": 3.6629, + "step": 6820 + }, + { + "epoch": 0.46371789645332245, + "grad_norm": 7.488915920257568, + "learning_rate": 9.420607419486343e-06, + "loss": 3.438, + "step": 6825 + }, + { + "epoch": 0.46405761652398425, + "grad_norm": 7.663504600524902, + "learning_rate": 9.420182769398016e-06, + "loss": 3.2869, + "step": 6830 + }, + { + "epoch": 0.464397336594646, + "grad_norm": 7.7402873039245605, + "learning_rate": 9.419758119309689e-06, + "loss": 3.2503, + "step": 6835 + }, + { + "epoch": 0.4647370566653078, + "grad_norm": 5.998109340667725, + "learning_rate": 9.419333469221362e-06, + "loss": 3.5155, + "step": 6840 + }, + { + "epoch": 0.4650767767359696, + "grad_norm": 6.350292205810547, + "learning_rate": 9.418908819133035e-06, + "loss": 3.2587, + "step": 6845 + }, + { + "epoch": 0.46541649680663133, + "grad_norm": 7.180111885070801, + "learning_rate": 9.418484169044707e-06, + "loss": 3.4311, + "step": 6850 + }, + { + "epoch": 0.4657562168772931, + "grad_norm": 7.717229843139648, + "learning_rate": 9.41805951895638e-06, + "loss": 3.3107, + "step": 6855 + }, + { + "epoch": 0.46609593694795487, + "grad_norm": 6.982679843902588, + "learning_rate": 9.417634868868053e-06, + "loss": 3.4592, + "step": 6860 + }, + { + "epoch": 0.46643565701861667, + "grad_norm": 5.959148406982422, + "learning_rate": 9.417210218779726e-06, + "loss": 3.2011, + "step": 6865 + }, + { + "epoch": 0.4667753770892784, + "grad_norm": 6.555063247680664, + "learning_rate": 9.416785568691399e-06, + "loss": 3.3378, + "step": 6870 + }, + { + "epoch": 0.4671150971599402, + "grad_norm": 8.067090034484863, + "learning_rate": 9.416360918603071e-06, + "loss": 3.1938, + "step": 6875 + }, + { + "epoch": 0.467454817230602, + "grad_norm": 7.8034234046936035, + "learning_rate": 9.415936268514744e-06, + "loss": 3.1937, + "step": 6880 + }, + { + "epoch": 0.46779453730126375, + "grad_norm": 7.369071960449219, + "learning_rate": 9.415511618426417e-06, + "loss": 3.3791, + "step": 6885 + }, + { + "epoch": 0.46813425737192554, + "grad_norm": 6.244277000427246, + "learning_rate": 9.41508696833809e-06, + "loss": 3.4599, + "step": 6890 + }, + { + "epoch": 0.4684739774425873, + "grad_norm": 7.8628082275390625, + "learning_rate": 9.414662318249763e-06, + "loss": 3.4814, + "step": 6895 + }, + { + "epoch": 0.4688136975132491, + "grad_norm": 6.655680179595947, + "learning_rate": 9.414237668161435e-06, + "loss": 3.4942, + "step": 6900 + }, + { + "epoch": 0.4691534175839109, + "grad_norm": 7.901004791259766, + "learning_rate": 9.413813018073108e-06, + "loss": 3.548, + "step": 6905 + }, + { + "epoch": 0.4694931376545726, + "grad_norm": 9.881803512573242, + "learning_rate": 9.413388367984781e-06, + "loss": 3.4923, + "step": 6910 + }, + { + "epoch": 0.4698328577252344, + "grad_norm": 6.451216697692871, + "learning_rate": 9.412963717896454e-06, + "loss": 3.529, + "step": 6915 + }, + { + "epoch": 0.47017257779589616, + "grad_norm": 6.979562759399414, + "learning_rate": 9.412539067808127e-06, + "loss": 3.1936, + "step": 6920 + }, + { + "epoch": 0.47051229786655796, + "grad_norm": 6.0294928550720215, + "learning_rate": 9.4121144177198e-06, + "loss": 3.3145, + "step": 6925 + }, + { + "epoch": 0.47085201793721976, + "grad_norm": 9.155203819274902, + "learning_rate": 9.411689767631472e-06, + "loss": 3.4322, + "step": 6930 + }, + { + "epoch": 0.4711917380078815, + "grad_norm": 7.409775257110596, + "learning_rate": 9.411265117543145e-06, + "loss": 3.3721, + "step": 6935 + }, + { + "epoch": 0.4715314580785433, + "grad_norm": 6.384495258331299, + "learning_rate": 9.410840467454818e-06, + "loss": 3.3314, + "step": 6940 + }, + { + "epoch": 0.47187117814920504, + "grad_norm": 7.332393646240234, + "learning_rate": 9.41041581736649e-06, + "loss": 3.5526, + "step": 6945 + }, + { + "epoch": 0.47221089821986684, + "grad_norm": 6.876650333404541, + "learning_rate": 9.409991167278163e-06, + "loss": 3.3288, + "step": 6950 + }, + { + "epoch": 0.4725506182905286, + "grad_norm": 7.057038307189941, + "learning_rate": 9.409566517189836e-06, + "loss": 3.5793, + "step": 6955 + }, + { + "epoch": 0.4728903383611904, + "grad_norm": 7.850484848022461, + "learning_rate": 9.409141867101509e-06, + "loss": 3.5475, + "step": 6960 + }, + { + "epoch": 0.4732300584318522, + "grad_norm": 9.761149406433105, + "learning_rate": 9.408717217013182e-06, + "loss": 3.5382, + "step": 6965 + }, + { + "epoch": 0.4735697785025139, + "grad_norm": 5.760624408721924, + "learning_rate": 9.408292566924855e-06, + "loss": 3.1598, + "step": 6970 + }, + { + "epoch": 0.4739094985731757, + "grad_norm": 6.332986831665039, + "learning_rate": 9.407867916836527e-06, + "loss": 3.4259, + "step": 6975 + }, + { + "epoch": 0.47424921864383746, + "grad_norm": 5.592994213104248, + "learning_rate": 9.4074432667482e-06, + "loss": 3.493, + "step": 6980 + }, + { + "epoch": 0.47458893871449925, + "grad_norm": 6.417639255523682, + "learning_rate": 9.407018616659873e-06, + "loss": 3.353, + "step": 6985 + }, + { + "epoch": 0.47492865878516105, + "grad_norm": 6.00710391998291, + "learning_rate": 9.406593966571546e-06, + "loss": 3.3287, + "step": 6990 + }, + { + "epoch": 0.4752683788558228, + "grad_norm": 6.539790630340576, + "learning_rate": 9.406169316483219e-06, + "loss": 3.0641, + "step": 6995 + }, + { + "epoch": 0.4756080989264846, + "grad_norm": 5.795100212097168, + "learning_rate": 9.405744666394891e-06, + "loss": 3.0868, + "step": 7000 + }, + { + "epoch": 0.47594781899714633, + "grad_norm": 8.23336124420166, + "learning_rate": 9.405320016306564e-06, + "loss": 3.7075, + "step": 7005 + }, + { + "epoch": 0.47628753906780813, + "grad_norm": 7.666558265686035, + "learning_rate": 9.404895366218237e-06, + "loss": 3.5261, + "step": 7010 + }, + { + "epoch": 0.4766272591384699, + "grad_norm": 7.672287464141846, + "learning_rate": 9.40447071612991e-06, + "loss": 3.3874, + "step": 7015 + }, + { + "epoch": 0.47696697920913167, + "grad_norm": 6.285202980041504, + "learning_rate": 9.404046066041583e-06, + "loss": 3.1153, + "step": 7020 + }, + { + "epoch": 0.47730669927979347, + "grad_norm": 6.376352310180664, + "learning_rate": 9.403621415953255e-06, + "loss": 3.5415, + "step": 7025 + }, + { + "epoch": 0.4776464193504552, + "grad_norm": 7.8136115074157715, + "learning_rate": 9.403196765864928e-06, + "loss": 3.4061, + "step": 7030 + }, + { + "epoch": 0.477986139421117, + "grad_norm": 7.370520114898682, + "learning_rate": 9.402772115776601e-06, + "loss": 3.2768, + "step": 7035 + }, + { + "epoch": 0.47832585949177875, + "grad_norm": 6.496222496032715, + "learning_rate": 9.402347465688274e-06, + "loss": 3.3588, + "step": 7040 + }, + { + "epoch": 0.47866557956244055, + "grad_norm": 6.890233516693115, + "learning_rate": 9.401922815599947e-06, + "loss": 3.4424, + "step": 7045 + }, + { + "epoch": 0.47900529963310234, + "grad_norm": 6.124893665313721, + "learning_rate": 9.40149816551162e-06, + "loss": 3.315, + "step": 7050 + }, + { + "epoch": 0.4793450197037641, + "grad_norm": 7.815262794494629, + "learning_rate": 9.401073515423292e-06, + "loss": 3.561, + "step": 7055 + }, + { + "epoch": 0.4796847397744259, + "grad_norm": 6.334140777587891, + "learning_rate": 9.400648865334965e-06, + "loss": 3.4631, + "step": 7060 + }, + { + "epoch": 0.4800244598450876, + "grad_norm": 6.54144811630249, + "learning_rate": 9.400224215246638e-06, + "loss": 3.3043, + "step": 7065 + }, + { + "epoch": 0.4803641799157494, + "grad_norm": 8.12617015838623, + "learning_rate": 9.39979956515831e-06, + "loss": 3.2781, + "step": 7070 + }, + { + "epoch": 0.4807038999864112, + "grad_norm": 5.572495937347412, + "learning_rate": 9.399374915069983e-06, + "loss": 3.4869, + "step": 7075 + }, + { + "epoch": 0.48104362005707296, + "grad_norm": 6.127620697021484, + "learning_rate": 9.398950264981656e-06, + "loss": 3.4742, + "step": 7080 + }, + { + "epoch": 0.48138334012773476, + "grad_norm": 7.141599178314209, + "learning_rate": 9.398525614893329e-06, + "loss": 3.4525, + "step": 7085 + }, + { + "epoch": 0.4817230601983965, + "grad_norm": 8.135917663574219, + "learning_rate": 9.398100964805002e-06, + "loss": 3.2485, + "step": 7090 + }, + { + "epoch": 0.4820627802690583, + "grad_norm": 6.595906734466553, + "learning_rate": 9.397676314716675e-06, + "loss": 3.5151, + "step": 7095 + }, + { + "epoch": 0.4824025003397201, + "grad_norm": 5.010725975036621, + "learning_rate": 9.397251664628347e-06, + "loss": 3.2521, + "step": 7100 + }, + { + "epoch": 0.48274222041038184, + "grad_norm": 7.239065647125244, + "learning_rate": 9.39682701454002e-06, + "loss": 3.3404, + "step": 7105 + }, + { + "epoch": 0.48308194048104364, + "grad_norm": 7.274843692779541, + "learning_rate": 9.396402364451691e-06, + "loss": 3.6182, + "step": 7110 + }, + { + "epoch": 0.4834216605517054, + "grad_norm": 7.410704612731934, + "learning_rate": 9.395977714363366e-06, + "loss": 3.3919, + "step": 7115 + }, + { + "epoch": 0.4837613806223672, + "grad_norm": 5.803159713745117, + "learning_rate": 9.395553064275039e-06, + "loss": 3.3721, + "step": 7120 + }, + { + "epoch": 0.4841011006930289, + "grad_norm": 7.961521148681641, + "learning_rate": 9.39512841418671e-06, + "loss": 3.4092, + "step": 7125 + }, + { + "epoch": 0.4844408207636907, + "grad_norm": 7.490447521209717, + "learning_rate": 9.394703764098384e-06, + "loss": 3.2968, + "step": 7130 + }, + { + "epoch": 0.4847805408343525, + "grad_norm": 7.452274322509766, + "learning_rate": 9.394279114010057e-06, + "loss": 3.2306, + "step": 7135 + }, + { + "epoch": 0.48512026090501426, + "grad_norm": 7.7036213874816895, + "learning_rate": 9.393854463921728e-06, + "loss": 3.5495, + "step": 7140 + }, + { + "epoch": 0.48545998097567605, + "grad_norm": 7.923842906951904, + "learning_rate": 9.393429813833403e-06, + "loss": 3.562, + "step": 7145 + }, + { + "epoch": 0.4857997010463378, + "grad_norm": 7.114926338195801, + "learning_rate": 9.393005163745075e-06, + "loss": 3.4577, + "step": 7150 + }, + { + "epoch": 0.4861394211169996, + "grad_norm": 6.626836776733398, + "learning_rate": 9.392580513656747e-06, + "loss": 3.4408, + "step": 7155 + }, + { + "epoch": 0.4864791411876614, + "grad_norm": 7.2074503898620605, + "learning_rate": 9.392155863568421e-06, + "loss": 3.4742, + "step": 7160 + }, + { + "epoch": 0.48681886125832313, + "grad_norm": 7.22507381439209, + "learning_rate": 9.391731213480094e-06, + "loss": 3.4014, + "step": 7165 + }, + { + "epoch": 0.48715858132898493, + "grad_norm": 7.493715286254883, + "learning_rate": 9.391306563391765e-06, + "loss": 3.0875, + "step": 7170 + }, + { + "epoch": 0.48749830139964667, + "grad_norm": 8.374188423156738, + "learning_rate": 9.39088191330344e-06, + "loss": 3.4697, + "step": 7175 + }, + { + "epoch": 0.48783802147030847, + "grad_norm": 6.5383076667785645, + "learning_rate": 9.39045726321511e-06, + "loss": 3.4748, + "step": 7180 + }, + { + "epoch": 0.48817774154097027, + "grad_norm": 8.158641815185547, + "learning_rate": 9.390032613126783e-06, + "loss": 3.2922, + "step": 7185 + }, + { + "epoch": 0.488517461611632, + "grad_norm": 5.357043743133545, + "learning_rate": 9.389607963038458e-06, + "loss": 3.417, + "step": 7190 + }, + { + "epoch": 0.4888571816822938, + "grad_norm": 7.227246284484863, + "learning_rate": 9.389183312950129e-06, + "loss": 3.5247, + "step": 7195 + }, + { + "epoch": 0.48919690175295555, + "grad_norm": 6.697081565856934, + "learning_rate": 9.388758662861802e-06, + "loss": 3.2873, + "step": 7200 + }, + { + "epoch": 0.48953662182361735, + "grad_norm": 6.7383341789245605, + "learning_rate": 9.388334012773476e-06, + "loss": 3.3063, + "step": 7205 + }, + { + "epoch": 0.4898763418942791, + "grad_norm": 7.659915447235107, + "learning_rate": 9.387909362685147e-06, + "loss": 3.5364, + "step": 7210 + }, + { + "epoch": 0.4902160619649409, + "grad_norm": 6.319233417510986, + "learning_rate": 9.38748471259682e-06, + "loss": 3.5001, + "step": 7215 + }, + { + "epoch": 0.4905557820356027, + "grad_norm": 7.161003589630127, + "learning_rate": 9.387060062508495e-06, + "loss": 3.3117, + "step": 7220 + }, + { + "epoch": 0.4908955021062644, + "grad_norm": 7.926325798034668, + "learning_rate": 9.386635412420166e-06, + "loss": 3.4311, + "step": 7225 + }, + { + "epoch": 0.4912352221769262, + "grad_norm": 7.258816242218018, + "learning_rate": 9.386210762331839e-06, + "loss": 3.1636, + "step": 7230 + }, + { + "epoch": 0.49157494224758796, + "grad_norm": 5.813920497894287, + "learning_rate": 9.385786112243513e-06, + "loss": 3.3434, + "step": 7235 + }, + { + "epoch": 0.49191466231824976, + "grad_norm": 8.334423065185547, + "learning_rate": 9.385361462155184e-06, + "loss": 3.458, + "step": 7240 + }, + { + "epoch": 0.49225438238891156, + "grad_norm": 7.1705827713012695, + "learning_rate": 9.384936812066857e-06, + "loss": 3.4779, + "step": 7245 + }, + { + "epoch": 0.4925941024595733, + "grad_norm": 6.935011863708496, + "learning_rate": 9.384512161978531e-06, + "loss": 3.5104, + "step": 7250 + }, + { + "epoch": 0.4929338225302351, + "grad_norm": 6.015654563903809, + "learning_rate": 9.384087511890203e-06, + "loss": 2.9445, + "step": 7255 + }, + { + "epoch": 0.49327354260089684, + "grad_norm": 7.412994861602783, + "learning_rate": 9.383662861801877e-06, + "loss": 3.4138, + "step": 7260 + }, + { + "epoch": 0.49361326267155864, + "grad_norm": 6.6737751960754395, + "learning_rate": 9.383238211713548e-06, + "loss": 3.4612, + "step": 7265 + }, + { + "epoch": 0.49395298274222044, + "grad_norm": 7.233213901519775, + "learning_rate": 9.382813561625221e-06, + "loss": 3.4172, + "step": 7270 + }, + { + "epoch": 0.4942927028128822, + "grad_norm": 7.591920375823975, + "learning_rate": 9.382388911536895e-06, + "loss": 3.2393, + "step": 7275 + }, + { + "epoch": 0.494632422883544, + "grad_norm": 7.387029647827148, + "learning_rate": 9.381964261448567e-06, + "loss": 3.4557, + "step": 7280 + }, + { + "epoch": 0.4949721429542057, + "grad_norm": 6.500645637512207, + "learning_rate": 9.38153961136024e-06, + "loss": 3.6464, + "step": 7285 + }, + { + "epoch": 0.4953118630248675, + "grad_norm": 7.398702144622803, + "learning_rate": 9.381114961271914e-06, + "loss": 3.2854, + "step": 7290 + }, + { + "epoch": 0.49565158309552926, + "grad_norm": 8.451618194580078, + "learning_rate": 9.380690311183585e-06, + "loss": 3.3733, + "step": 7295 + }, + { + "epoch": 0.49599130316619106, + "grad_norm": 6.152918338775635, + "learning_rate": 9.380265661095258e-06, + "loss": 3.4892, + "step": 7300 + }, + { + "epoch": 0.49633102323685285, + "grad_norm": 6.787991523742676, + "learning_rate": 9.379841011006932e-06, + "loss": 3.2919, + "step": 7305 + }, + { + "epoch": 0.4966707433075146, + "grad_norm": 6.847397327423096, + "learning_rate": 9.379416360918603e-06, + "loss": 3.3536, + "step": 7310 + }, + { + "epoch": 0.4970104633781764, + "grad_norm": 6.682425498962402, + "learning_rate": 9.378991710830276e-06, + "loss": 3.3058, + "step": 7315 + }, + { + "epoch": 0.49735018344883813, + "grad_norm": 7.112499713897705, + "learning_rate": 9.37856706074195e-06, + "loss": 3.6327, + "step": 7320 + }, + { + "epoch": 0.49768990351949993, + "grad_norm": 7.604178428649902, + "learning_rate": 9.378142410653622e-06, + "loss": 3.3378, + "step": 7325 + }, + { + "epoch": 0.49802962359016173, + "grad_norm": 5.767140865325928, + "learning_rate": 9.377717760565295e-06, + "loss": 3.3526, + "step": 7330 + }, + { + "epoch": 0.49836934366082347, + "grad_norm": 6.17573881149292, + "learning_rate": 9.377293110476967e-06, + "loss": 3.4636, + "step": 7335 + }, + { + "epoch": 0.49870906373148527, + "grad_norm": 6.612054824829102, + "learning_rate": 9.37686846038864e-06, + "loss": 3.6099, + "step": 7340 + }, + { + "epoch": 0.499048783802147, + "grad_norm": 6.580817699432373, + "learning_rate": 9.376443810300313e-06, + "loss": 3.4624, + "step": 7345 + }, + { + "epoch": 0.4993885038728088, + "grad_norm": 6.728540420532227, + "learning_rate": 9.376019160211986e-06, + "loss": 3.1017, + "step": 7350 + }, + { + "epoch": 0.4997282239434706, + "grad_norm": 6.474740028381348, + "learning_rate": 9.375594510123659e-06, + "loss": 3.3618, + "step": 7355 + }, + { + "epoch": 0.5000679440141323, + "grad_norm": 6.742430210113525, + "learning_rate": 9.375169860035331e-06, + "loss": 3.4826, + "step": 7360 + }, + { + "epoch": 0.5004076640847941, + "grad_norm": 6.535891532897949, + "learning_rate": 9.374745209947004e-06, + "loss": 3.4109, + "step": 7365 + }, + { + "epoch": 0.5007473841554559, + "grad_norm": 8.641221046447754, + "learning_rate": 9.374320559858677e-06, + "loss": 3.6495, + "step": 7370 + }, + { + "epoch": 0.5010871042261177, + "grad_norm": 8.143317222595215, + "learning_rate": 9.37389590977035e-06, + "loss": 3.4792, + "step": 7375 + }, + { + "epoch": 0.5014268242967794, + "grad_norm": 6.147731304168701, + "learning_rate": 9.373471259682023e-06, + "loss": 3.6578, + "step": 7380 + }, + { + "epoch": 0.5017665443674413, + "grad_norm": 6.1206769943237305, + "learning_rate": 9.373046609593695e-06, + "loss": 3.5624, + "step": 7385 + }, + { + "epoch": 0.502106264438103, + "grad_norm": 6.178816318511963, + "learning_rate": 9.372621959505368e-06, + "loss": 3.1511, + "step": 7390 + }, + { + "epoch": 0.5024459845087648, + "grad_norm": 7.825201034545898, + "learning_rate": 9.372197309417041e-06, + "loss": 3.4064, + "step": 7395 + }, + { + "epoch": 0.5027857045794265, + "grad_norm": 7.920228481292725, + "learning_rate": 9.371772659328714e-06, + "loss": 3.5066, + "step": 7400 + }, + { + "epoch": 0.5031254246500884, + "grad_norm": 7.817113399505615, + "learning_rate": 9.371348009240387e-06, + "loss": 3.3319, + "step": 7405 + }, + { + "epoch": 0.5034651447207501, + "grad_norm": 6.805710792541504, + "learning_rate": 9.37092335915206e-06, + "loss": 3.3858, + "step": 7410 + }, + { + "epoch": 0.5038048647914118, + "grad_norm": 6.846779823303223, + "learning_rate": 9.370498709063732e-06, + "loss": 3.3534, + "step": 7415 + }, + { + "epoch": 0.5041445848620737, + "grad_norm": 7.099699020385742, + "learning_rate": 9.370074058975405e-06, + "loss": 3.3283, + "step": 7420 + }, + { + "epoch": 0.5044843049327354, + "grad_norm": 7.774317741394043, + "learning_rate": 9.369649408887078e-06, + "loss": 3.2812, + "step": 7425 + }, + { + "epoch": 0.5048240250033972, + "grad_norm": 6.470667362213135, + "learning_rate": 9.36922475879875e-06, + "loss": 3.3763, + "step": 7430 + }, + { + "epoch": 0.5051637450740589, + "grad_norm": 4.811521530151367, + "learning_rate": 9.368800108710423e-06, + "loss": 3.3973, + "step": 7435 + }, + { + "epoch": 0.5055034651447208, + "grad_norm": 5.934732437133789, + "learning_rate": 9.368375458622096e-06, + "loss": 3.2084, + "step": 7440 + }, + { + "epoch": 0.5058431852153825, + "grad_norm": 5.877926349639893, + "learning_rate": 9.367950808533769e-06, + "loss": 3.3451, + "step": 7445 + }, + { + "epoch": 0.5061829052860443, + "grad_norm": 5.519674301147461, + "learning_rate": 9.367526158445442e-06, + "loss": 3.2462, + "step": 7450 + }, + { + "epoch": 0.5065226253567061, + "grad_norm": 8.251991271972656, + "learning_rate": 9.367101508357115e-06, + "loss": 3.3576, + "step": 7455 + }, + { + "epoch": 0.5068623454273679, + "grad_norm": 7.0833659172058105, + "learning_rate": 9.366676858268787e-06, + "loss": 3.3319, + "step": 7460 + }, + { + "epoch": 0.5072020654980296, + "grad_norm": 7.399389743804932, + "learning_rate": 9.36625220818046e-06, + "loss": 3.4388, + "step": 7465 + }, + { + "epoch": 0.5075417855686915, + "grad_norm": 6.919555187225342, + "learning_rate": 9.365827558092133e-06, + "loss": 3.1627, + "step": 7470 + }, + { + "epoch": 0.5078815056393532, + "grad_norm": 5.994718551635742, + "learning_rate": 9.365402908003806e-06, + "loss": 3.5365, + "step": 7475 + }, + { + "epoch": 0.5082212257100149, + "grad_norm": 6.096606254577637, + "learning_rate": 9.364978257915479e-06, + "loss": 3.1135, + "step": 7480 + }, + { + "epoch": 0.5085609457806767, + "grad_norm": 7.12670373916626, + "learning_rate": 9.364553607827151e-06, + "loss": 3.6206, + "step": 7485 + }, + { + "epoch": 0.5089006658513385, + "grad_norm": 5.544092655181885, + "learning_rate": 9.364128957738824e-06, + "loss": 3.2138, + "step": 7490 + }, + { + "epoch": 0.5092403859220003, + "grad_norm": 8.024269104003906, + "learning_rate": 9.363704307650497e-06, + "loss": 3.5719, + "step": 7495 + }, + { + "epoch": 0.509580105992662, + "grad_norm": 6.777406692504883, + "learning_rate": 9.36327965756217e-06, + "loss": 3.474, + "step": 7500 + }, + { + "epoch": 0.5099198260633239, + "grad_norm": 5.997620105743408, + "learning_rate": 9.362855007473843e-06, + "loss": 3.4139, + "step": 7505 + }, + { + "epoch": 0.5102595461339856, + "grad_norm": 6.72051477432251, + "learning_rate": 9.362430357385515e-06, + "loss": 3.5625, + "step": 7510 + }, + { + "epoch": 0.5105992662046474, + "grad_norm": 7.837960243225098, + "learning_rate": 9.362005707297188e-06, + "loss": 3.498, + "step": 7515 + }, + { + "epoch": 0.5109389862753091, + "grad_norm": 6.252409934997559, + "learning_rate": 9.361581057208861e-06, + "loss": 3.5133, + "step": 7520 + }, + { + "epoch": 0.511278706345971, + "grad_norm": 8.286930084228516, + "learning_rate": 9.361156407120532e-06, + "loss": 3.4269, + "step": 7525 + }, + { + "epoch": 0.5116184264166327, + "grad_norm": 8.523837089538574, + "learning_rate": 9.360731757032207e-06, + "loss": 3.5207, + "step": 7530 + }, + { + "epoch": 0.5119581464872944, + "grad_norm": 8.885704040527344, + "learning_rate": 9.36030710694388e-06, + "loss": 3.4524, + "step": 7535 + }, + { + "epoch": 0.5122978665579563, + "grad_norm": 5.4131999015808105, + "learning_rate": 9.35988245685555e-06, + "loss": 3.4314, + "step": 7540 + }, + { + "epoch": 0.512637586628618, + "grad_norm": 6.339590549468994, + "learning_rate": 9.359457806767225e-06, + "loss": 3.1926, + "step": 7545 + }, + { + "epoch": 0.5129773066992798, + "grad_norm": 6.363923072814941, + "learning_rate": 9.359033156678898e-06, + "loss": 3.3553, + "step": 7550 + }, + { + "epoch": 0.5133170267699416, + "grad_norm": 5.820192337036133, + "learning_rate": 9.358608506590569e-06, + "loss": 3.4409, + "step": 7555 + }, + { + "epoch": 0.5136567468406034, + "grad_norm": 7.312633037567139, + "learning_rate": 9.358183856502243e-06, + "loss": 3.4495, + "step": 7560 + }, + { + "epoch": 0.5139964669112651, + "grad_norm": 7.13507604598999, + "learning_rate": 9.357759206413916e-06, + "loss": 3.5039, + "step": 7565 + }, + { + "epoch": 0.5143361869819268, + "grad_norm": 6.476308345794678, + "learning_rate": 9.357334556325587e-06, + "loss": 3.4296, + "step": 7570 + }, + { + "epoch": 0.5146759070525887, + "grad_norm": 7.480803489685059, + "learning_rate": 9.356909906237262e-06, + "loss": 3.4459, + "step": 7575 + }, + { + "epoch": 0.5150156271232504, + "grad_norm": 8.33530044555664, + "learning_rate": 9.356485256148935e-06, + "loss": 3.4994, + "step": 7580 + }, + { + "epoch": 0.5153553471939122, + "grad_norm": 6.516520023345947, + "learning_rate": 9.356060606060606e-06, + "loss": 3.4798, + "step": 7585 + }, + { + "epoch": 0.515695067264574, + "grad_norm": 6.517733097076416, + "learning_rate": 9.35563595597228e-06, + "loss": 3.4411, + "step": 7590 + }, + { + "epoch": 0.5160347873352358, + "grad_norm": 7.351516246795654, + "learning_rate": 9.355211305883953e-06, + "loss": 3.3156, + "step": 7595 + }, + { + "epoch": 0.5163745074058975, + "grad_norm": 5.656729221343994, + "learning_rate": 9.354786655795624e-06, + "loss": 3.4662, + "step": 7600 + }, + { + "epoch": 0.5167142274765593, + "grad_norm": 5.741865634918213, + "learning_rate": 9.354362005707299e-06, + "loss": 3.1674, + "step": 7605 + }, + { + "epoch": 0.5170539475472211, + "grad_norm": 9.694225311279297, + "learning_rate": 9.35393735561897e-06, + "loss": 3.2993, + "step": 7610 + }, + { + "epoch": 0.5173936676178829, + "grad_norm": 8.349076271057129, + "learning_rate": 9.353512705530644e-06, + "loss": 3.052, + "step": 7615 + }, + { + "epoch": 0.5177333876885446, + "grad_norm": 5.224138259887695, + "learning_rate": 9.353088055442317e-06, + "loss": 3.4544, + "step": 7620 + }, + { + "epoch": 0.5180731077592065, + "grad_norm": 6.035685062408447, + "learning_rate": 9.352663405353988e-06, + "loss": 3.4997, + "step": 7625 + }, + { + "epoch": 0.5184128278298682, + "grad_norm": 7.801394462585449, + "learning_rate": 9.352238755265663e-06, + "loss": 3.26, + "step": 7630 + }, + { + "epoch": 0.5187525479005299, + "grad_norm": 8.756573677062988, + "learning_rate": 9.351814105177335e-06, + "loss": 3.5278, + "step": 7635 + }, + { + "epoch": 0.5190922679711918, + "grad_norm": 5.837248802185059, + "learning_rate": 9.351389455089007e-06, + "loss": 3.4161, + "step": 7640 + }, + { + "epoch": 0.5194319880418535, + "grad_norm": 7.832045078277588, + "learning_rate": 9.350964805000681e-06, + "loss": 3.3544, + "step": 7645 + }, + { + "epoch": 0.5197717081125153, + "grad_norm": 7.436858654022217, + "learning_rate": 9.350540154912354e-06, + "loss": 3.3799, + "step": 7650 + }, + { + "epoch": 0.520111428183177, + "grad_norm": 6.888467311859131, + "learning_rate": 9.350115504824025e-06, + "loss": 3.2185, + "step": 7655 + }, + { + "epoch": 0.5204511482538389, + "grad_norm": 6.692691802978516, + "learning_rate": 9.3496908547357e-06, + "loss": 3.3484, + "step": 7660 + }, + { + "epoch": 0.5207908683245006, + "grad_norm": 7.221436977386475, + "learning_rate": 9.349266204647372e-06, + "loss": 3.2238, + "step": 7665 + }, + { + "epoch": 0.5211305883951624, + "grad_norm": 6.301865100860596, + "learning_rate": 9.348841554559043e-06, + "loss": 3.3877, + "step": 7670 + }, + { + "epoch": 0.5214703084658242, + "grad_norm": 6.58628511428833, + "learning_rate": 9.348416904470718e-06, + "loss": 3.2199, + "step": 7675 + }, + { + "epoch": 0.521810028536486, + "grad_norm": 6.124441146850586, + "learning_rate": 9.347992254382389e-06, + "loss": 3.4539, + "step": 7680 + }, + { + "epoch": 0.5221497486071477, + "grad_norm": 6.732500076293945, + "learning_rate": 9.347567604294062e-06, + "loss": 3.4041, + "step": 7685 + }, + { + "epoch": 0.5224894686778094, + "grad_norm": 6.204999923706055, + "learning_rate": 9.347142954205736e-06, + "loss": 3.509, + "step": 7690 + }, + { + "epoch": 0.5228291887484713, + "grad_norm": 7.059607982635498, + "learning_rate": 9.346718304117407e-06, + "loss": 3.1685, + "step": 7695 + }, + { + "epoch": 0.523168908819133, + "grad_norm": 6.126708030700684, + "learning_rate": 9.34629365402908e-06, + "loss": 3.3886, + "step": 7700 + }, + { + "epoch": 0.5235086288897948, + "grad_norm": 6.06515645980835, + "learning_rate": 9.345869003940755e-06, + "loss": 3.5852, + "step": 7705 + }, + { + "epoch": 0.5238483489604566, + "grad_norm": 6.954622745513916, + "learning_rate": 9.345444353852426e-06, + "loss": 3.2458, + "step": 7710 + }, + { + "epoch": 0.5241880690311184, + "grad_norm": 9.34189224243164, + "learning_rate": 9.345019703764099e-06, + "loss": 3.4771, + "step": 7715 + }, + { + "epoch": 0.5245277891017801, + "grad_norm": 7.545896530151367, + "learning_rate": 9.344595053675773e-06, + "loss": 3.3761, + "step": 7720 + }, + { + "epoch": 0.524867509172442, + "grad_norm": 6.290584564208984, + "learning_rate": 9.344170403587444e-06, + "loss": 3.414, + "step": 7725 + }, + { + "epoch": 0.5252072292431037, + "grad_norm": 6.420226097106934, + "learning_rate": 9.343745753499117e-06, + "loss": 3.3311, + "step": 7730 + }, + { + "epoch": 0.5255469493137654, + "grad_norm": 6.24008846282959, + "learning_rate": 9.343321103410791e-06, + "loss": 3.3017, + "step": 7735 + }, + { + "epoch": 0.5258866693844272, + "grad_norm": 6.348813533782959, + "learning_rate": 9.342896453322463e-06, + "loss": 3.4192, + "step": 7740 + }, + { + "epoch": 0.526226389455089, + "grad_norm": 6.367891311645508, + "learning_rate": 9.342471803234135e-06, + "loss": 3.3278, + "step": 7745 + }, + { + "epoch": 0.5265661095257508, + "grad_norm": 6.280129432678223, + "learning_rate": 9.342047153145808e-06, + "loss": 3.5682, + "step": 7750 + }, + { + "epoch": 0.5269058295964125, + "grad_norm": 7.82330322265625, + "learning_rate": 9.341622503057481e-06, + "loss": 3.5285, + "step": 7755 + }, + { + "epoch": 0.5272455496670744, + "grad_norm": 8.396161079406738, + "learning_rate": 9.341197852969154e-06, + "loss": 2.9091, + "step": 7760 + }, + { + "epoch": 0.5275852697377361, + "grad_norm": 7.5630717277526855, + "learning_rate": 9.340773202880827e-06, + "loss": 3.5659, + "step": 7765 + }, + { + "epoch": 0.5279249898083979, + "grad_norm": 9.35262680053711, + "learning_rate": 9.3403485527925e-06, + "loss": 3.3175, + "step": 7770 + }, + { + "epoch": 0.5282647098790596, + "grad_norm": 7.080179214477539, + "learning_rate": 9.339923902704172e-06, + "loss": 3.5717, + "step": 7775 + }, + { + "epoch": 0.5286044299497215, + "grad_norm": 7.0869035720825195, + "learning_rate": 9.339499252615845e-06, + "loss": 3.0567, + "step": 7780 + }, + { + "epoch": 0.5289441500203832, + "grad_norm": 6.721363067626953, + "learning_rate": 9.339074602527518e-06, + "loss": 3.6074, + "step": 7785 + }, + { + "epoch": 0.5292838700910449, + "grad_norm": 7.014103412628174, + "learning_rate": 9.33864995243919e-06, + "loss": 3.616, + "step": 7790 + }, + { + "epoch": 0.5296235901617068, + "grad_norm": 7.669628620147705, + "learning_rate": 9.338225302350863e-06, + "loss": 3.5116, + "step": 7795 + }, + { + "epoch": 0.5299633102323685, + "grad_norm": 7.1859660148620605, + "learning_rate": 9.337800652262536e-06, + "loss": 3.2397, + "step": 7800 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 5.801093101501465, + "learning_rate": 9.337376002174209e-06, + "loss": 3.3143, + "step": 7805 + }, + { + "epoch": 0.5306427503736921, + "grad_norm": 8.40312385559082, + "learning_rate": 9.336951352085882e-06, + "loss": 3.3079, + "step": 7810 + }, + { + "epoch": 0.5309824704443539, + "grad_norm": 7.78193473815918, + "learning_rate": 9.336526701997555e-06, + "loss": 3.3981, + "step": 7815 + }, + { + "epoch": 0.5313221905150156, + "grad_norm": 6.677841663360596, + "learning_rate": 9.336102051909227e-06, + "loss": 3.488, + "step": 7820 + }, + { + "epoch": 0.5316619105856774, + "grad_norm": 6.864682674407959, + "learning_rate": 9.3356774018209e-06, + "loss": 3.5403, + "step": 7825 + }, + { + "epoch": 0.5320016306563392, + "grad_norm": 8.168366432189941, + "learning_rate": 9.335252751732573e-06, + "loss": 3.33, + "step": 7830 + }, + { + "epoch": 0.532341350727001, + "grad_norm": 7.493493556976318, + "learning_rate": 9.334828101644246e-06, + "loss": 3.4794, + "step": 7835 + }, + { + "epoch": 0.5326810707976627, + "grad_norm": 7.945013046264648, + "learning_rate": 9.334403451555919e-06, + "loss": 3.6686, + "step": 7840 + }, + { + "epoch": 0.5330207908683245, + "grad_norm": 6.544580459594727, + "learning_rate": 9.333978801467591e-06, + "loss": 3.3371, + "step": 7845 + }, + { + "epoch": 0.5333605109389863, + "grad_norm": 5.540278911590576, + "learning_rate": 9.333554151379264e-06, + "loss": 3.4552, + "step": 7850 + }, + { + "epoch": 0.533700231009648, + "grad_norm": 5.4939045906066895, + "learning_rate": 9.333129501290937e-06, + "loss": 3.4968, + "step": 7855 + }, + { + "epoch": 0.5340399510803098, + "grad_norm": 6.348739147186279, + "learning_rate": 9.33270485120261e-06, + "loss": 3.3768, + "step": 7860 + }, + { + "epoch": 0.5343796711509716, + "grad_norm": 8.081729888916016, + "learning_rate": 9.332280201114283e-06, + "loss": 3.4036, + "step": 7865 + }, + { + "epoch": 0.5347193912216334, + "grad_norm": 5.288845062255859, + "learning_rate": 9.331855551025955e-06, + "loss": 3.3848, + "step": 7870 + }, + { + "epoch": 0.5350591112922951, + "grad_norm": 6.137636661529541, + "learning_rate": 9.331430900937628e-06, + "loss": 3.3617, + "step": 7875 + }, + { + "epoch": 0.535398831362957, + "grad_norm": 8.561034202575684, + "learning_rate": 9.331006250849301e-06, + "loss": 3.5479, + "step": 7880 + }, + { + "epoch": 0.5357385514336187, + "grad_norm": 7.462732315063477, + "learning_rate": 9.330581600760974e-06, + "loss": 3.2971, + "step": 7885 + }, + { + "epoch": 0.5360782715042804, + "grad_norm": 6.412789821624756, + "learning_rate": 9.330156950672647e-06, + "loss": 3.566, + "step": 7890 + }, + { + "epoch": 0.5364179915749423, + "grad_norm": 6.785240173339844, + "learning_rate": 9.32973230058432e-06, + "loss": 3.2497, + "step": 7895 + }, + { + "epoch": 0.536757711645604, + "grad_norm": 6.620786190032959, + "learning_rate": 9.329307650495992e-06, + "loss": 3.1655, + "step": 7900 + }, + { + "epoch": 0.5370974317162658, + "grad_norm": 6.475356578826904, + "learning_rate": 9.328883000407665e-06, + "loss": 3.5459, + "step": 7905 + }, + { + "epoch": 0.5374371517869275, + "grad_norm": 6.397312164306641, + "learning_rate": 9.328458350319338e-06, + "loss": 3.4761, + "step": 7910 + }, + { + "epoch": 0.5377768718575894, + "grad_norm": 5.761232852935791, + "learning_rate": 9.32803370023101e-06, + "loss": 3.5885, + "step": 7915 + }, + { + "epoch": 0.5381165919282511, + "grad_norm": 7.225599765777588, + "learning_rate": 9.327609050142683e-06, + "loss": 3.2527, + "step": 7920 + }, + { + "epoch": 0.5384563119989129, + "grad_norm": 7.209740161895752, + "learning_rate": 9.327184400054356e-06, + "loss": 3.4026, + "step": 7925 + }, + { + "epoch": 0.5387960320695747, + "grad_norm": 6.405486106872559, + "learning_rate": 9.326759749966029e-06, + "loss": 3.6202, + "step": 7930 + }, + { + "epoch": 0.5391357521402365, + "grad_norm": 5.759757995605469, + "learning_rate": 9.326335099877702e-06, + "loss": 3.4918, + "step": 7935 + }, + { + "epoch": 0.5394754722108982, + "grad_norm": 9.131099700927734, + "learning_rate": 9.325910449789375e-06, + "loss": 3.2735, + "step": 7940 + }, + { + "epoch": 0.5398151922815599, + "grad_norm": 7.626204490661621, + "learning_rate": 9.325485799701047e-06, + "loss": 3.4073, + "step": 7945 + }, + { + "epoch": 0.5401549123522218, + "grad_norm": 8.018057823181152, + "learning_rate": 9.32506114961272e-06, + "loss": 3.5499, + "step": 7950 + }, + { + "epoch": 0.5404946324228835, + "grad_norm": 5.869449138641357, + "learning_rate": 9.324636499524393e-06, + "loss": 3.0962, + "step": 7955 + }, + { + "epoch": 0.5408343524935453, + "grad_norm": 6.451284408569336, + "learning_rate": 9.324211849436066e-06, + "loss": 3.3414, + "step": 7960 + }, + { + "epoch": 0.5411740725642071, + "grad_norm": 5.266515254974365, + "learning_rate": 9.323787199347739e-06, + "loss": 3.4244, + "step": 7965 + }, + { + "epoch": 0.5415137926348689, + "grad_norm": 7.418032646179199, + "learning_rate": 9.323362549259411e-06, + "loss": 3.3044, + "step": 7970 + }, + { + "epoch": 0.5418535127055306, + "grad_norm": 6.171967506408691, + "learning_rate": 9.322937899171084e-06, + "loss": 3.4061, + "step": 7975 + }, + { + "epoch": 0.5421932327761925, + "grad_norm": 6.938091278076172, + "learning_rate": 9.322513249082757e-06, + "loss": 3.4179, + "step": 7980 + }, + { + "epoch": 0.5425329528468542, + "grad_norm": 6.59197473526001, + "learning_rate": 9.32208859899443e-06, + "loss": 3.4811, + "step": 7985 + }, + { + "epoch": 0.542872672917516, + "grad_norm": 7.022855281829834, + "learning_rate": 9.321663948906103e-06, + "loss": 3.4263, + "step": 7990 + }, + { + "epoch": 0.5432123929881777, + "grad_norm": 8.482316017150879, + "learning_rate": 9.321239298817775e-06, + "loss": 3.4797, + "step": 7995 + }, + { + "epoch": 0.5435521130588395, + "grad_norm": 6.674006938934326, + "learning_rate": 9.320814648729448e-06, + "loss": 3.503, + "step": 8000 + }, + { + "epoch": 0.5438918331295013, + "grad_norm": 6.553414344787598, + "learning_rate": 9.320389998641121e-06, + "loss": 3.3266, + "step": 8005 + }, + { + "epoch": 0.544231553200163, + "grad_norm": 6.808204174041748, + "learning_rate": 9.319965348552794e-06, + "loss": 3.4003, + "step": 8010 + }, + { + "epoch": 0.5445712732708249, + "grad_norm": 5.3096232414245605, + "learning_rate": 9.319540698464467e-06, + "loss": 3.3754, + "step": 8015 + }, + { + "epoch": 0.5449109933414866, + "grad_norm": 7.325875759124756, + "learning_rate": 9.31911604837614e-06, + "loss": 3.3903, + "step": 8020 + }, + { + "epoch": 0.5452507134121484, + "grad_norm": 7.258479118347168, + "learning_rate": 9.31869139828781e-06, + "loss": 3.5423, + "step": 8025 + }, + { + "epoch": 0.5455904334828101, + "grad_norm": 7.174134731292725, + "learning_rate": 9.318266748199485e-06, + "loss": 3.3527, + "step": 8030 + }, + { + "epoch": 0.545930153553472, + "grad_norm": 5.372439861297607, + "learning_rate": 9.317842098111158e-06, + "loss": 3.3936, + "step": 8035 + }, + { + "epoch": 0.5462698736241337, + "grad_norm": 6.655653476715088, + "learning_rate": 9.317417448022829e-06, + "loss": 3.1479, + "step": 8040 + }, + { + "epoch": 0.5466095936947954, + "grad_norm": 7.111118316650391, + "learning_rate": 9.316992797934503e-06, + "loss": 3.2589, + "step": 8045 + }, + { + "epoch": 0.5469493137654573, + "grad_norm": 7.546431541442871, + "learning_rate": 9.316568147846176e-06, + "loss": 3.245, + "step": 8050 + }, + { + "epoch": 0.547289033836119, + "grad_norm": 6.469536304473877, + "learning_rate": 9.316143497757847e-06, + "loss": 3.3232, + "step": 8055 + }, + { + "epoch": 0.5476287539067808, + "grad_norm": 7.480095863342285, + "learning_rate": 9.315718847669522e-06, + "loss": 3.3633, + "step": 8060 + }, + { + "epoch": 0.5479684739774426, + "grad_norm": 7.097555637359619, + "learning_rate": 9.315294197581195e-06, + "loss": 3.3386, + "step": 8065 + }, + { + "epoch": 0.5483081940481044, + "grad_norm": 7.520008563995361, + "learning_rate": 9.314869547492866e-06, + "loss": 3.4616, + "step": 8070 + }, + { + "epoch": 0.5486479141187661, + "grad_norm": 6.272017955780029, + "learning_rate": 9.31444489740454e-06, + "loss": 3.3306, + "step": 8075 + }, + { + "epoch": 0.5489876341894279, + "grad_norm": 5.856510162353516, + "learning_rate": 9.314020247316213e-06, + "loss": 3.3784, + "step": 8080 + }, + { + "epoch": 0.5493273542600897, + "grad_norm": 7.328276634216309, + "learning_rate": 9.313595597227884e-06, + "loss": 3.3876, + "step": 8085 + }, + { + "epoch": 0.5496670743307515, + "grad_norm": 7.992668151855469, + "learning_rate": 9.313170947139559e-06, + "loss": 3.4603, + "step": 8090 + }, + { + "epoch": 0.5500067944014132, + "grad_norm": 7.8176960945129395, + "learning_rate": 9.31274629705123e-06, + "loss": 3.333, + "step": 8095 + }, + { + "epoch": 0.550346514472075, + "grad_norm": 7.3568572998046875, + "learning_rate": 9.312321646962903e-06, + "loss": 3.4478, + "step": 8100 + }, + { + "epoch": 0.5506862345427368, + "grad_norm": Infinity, + "learning_rate": 9.311981926892241e-06, + "loss": 3.2717, + "step": 8105 + }, + { + "epoch": 0.5510259546133985, + "grad_norm": 7.193819999694824, + "learning_rate": 9.311557276803914e-06, + "loss": 3.2629, + "step": 8110 + }, + { + "epoch": 0.5513656746840603, + "grad_norm": 5.965134620666504, + "learning_rate": 9.311132626715587e-06, + "loss": 3.2782, + "step": 8115 + }, + { + "epoch": 0.5517053947547221, + "grad_norm": 5.9168243408203125, + "learning_rate": 9.31070797662726e-06, + "loss": 3.2924, + "step": 8120 + }, + { + "epoch": 0.5520451148253839, + "grad_norm": 6.662998676300049, + "learning_rate": 9.310283326538933e-06, + "loss": 3.494, + "step": 8125 + }, + { + "epoch": 0.5523848348960456, + "grad_norm": 9.007112503051758, + "learning_rate": 9.309858676450605e-06, + "loss": 3.5645, + "step": 8130 + }, + { + "epoch": 0.5527245549667075, + "grad_norm": 7.041459560394287, + "learning_rate": 9.309434026362278e-06, + "loss": 3.2375, + "step": 8135 + }, + { + "epoch": 0.5530642750373692, + "grad_norm": 7.4902801513671875, + "learning_rate": 9.309009376273951e-06, + "loss": 3.3743, + "step": 8140 + }, + { + "epoch": 0.553403995108031, + "grad_norm": 6.943088054656982, + "learning_rate": 9.308584726185624e-06, + "loss": 3.4636, + "step": 8145 + }, + { + "epoch": 0.5537437151786928, + "grad_norm": 8.274903297424316, + "learning_rate": 9.308160076097297e-06, + "loss": 3.2984, + "step": 8150 + }, + { + "epoch": 0.5540834352493546, + "grad_norm": 6.481700420379639, + "learning_rate": 9.30773542600897e-06, + "loss": 3.3045, + "step": 8155 + }, + { + "epoch": 0.5544231553200163, + "grad_norm": 6.712331771850586, + "learning_rate": 9.307310775920642e-06, + "loss": 3.2522, + "step": 8160 + }, + { + "epoch": 0.554762875390678, + "grad_norm": 7.216068267822266, + "learning_rate": 9.306886125832315e-06, + "loss": 3.4856, + "step": 8165 + }, + { + "epoch": 0.5551025954613399, + "grad_norm": 5.190288543701172, + "learning_rate": 9.306461475743988e-06, + "loss": 3.4801, + "step": 8170 + }, + { + "epoch": 0.5554423155320016, + "grad_norm": 5.992360591888428, + "learning_rate": 9.30603682565566e-06, + "loss": 3.4657, + "step": 8175 + }, + { + "epoch": 0.5557820356026634, + "grad_norm": 6.2899675369262695, + "learning_rate": 9.305612175567333e-06, + "loss": 3.3275, + "step": 8180 + }, + { + "epoch": 0.5561217556733252, + "grad_norm": 6.613733291625977, + "learning_rate": 9.305187525479006e-06, + "loss": 3.3368, + "step": 8185 + }, + { + "epoch": 0.556461475743987, + "grad_norm": 6.475748062133789, + "learning_rate": 9.304762875390679e-06, + "loss": 3.3852, + "step": 8190 + }, + { + "epoch": 0.5568011958146487, + "grad_norm": 7.680159091949463, + "learning_rate": 9.304338225302352e-06, + "loss": 3.417, + "step": 8195 + }, + { + "epoch": 0.5571409158853105, + "grad_norm": 5.244831085205078, + "learning_rate": 9.303913575214025e-06, + "loss": 3.3769, + "step": 8200 + }, + { + "epoch": 0.5574806359559723, + "grad_norm": 8.119514465332031, + "learning_rate": 9.303488925125697e-06, + "loss": 3.3053, + "step": 8205 + }, + { + "epoch": 0.557820356026634, + "grad_norm": 7.276259422302246, + "learning_rate": 9.30306427503737e-06, + "loss": 3.3762, + "step": 8210 + }, + { + "epoch": 0.5581600760972958, + "grad_norm": 7.0549774169921875, + "learning_rate": 9.302639624949043e-06, + "loss": 3.312, + "step": 8215 + }, + { + "epoch": 0.5584997961679576, + "grad_norm": 5.956245422363281, + "learning_rate": 9.302214974860716e-06, + "loss": 3.539, + "step": 8220 + }, + { + "epoch": 0.5588395162386194, + "grad_norm": 8.094970703125, + "learning_rate": 9.301790324772389e-06, + "loss": 3.2562, + "step": 8225 + }, + { + "epoch": 0.5591792363092811, + "grad_norm": 6.116145133972168, + "learning_rate": 9.301365674684061e-06, + "loss": 3.4887, + "step": 8230 + }, + { + "epoch": 0.559518956379943, + "grad_norm": 6.619522571563721, + "learning_rate": 9.300941024595734e-06, + "loss": 3.3733, + "step": 8235 + }, + { + "epoch": 0.5598586764506047, + "grad_norm": 5.5799055099487305, + "learning_rate": 9.300516374507407e-06, + "loss": 3.6418, + "step": 8240 + }, + { + "epoch": 0.5601983965212665, + "grad_norm": 6.51112699508667, + "learning_rate": 9.30009172441908e-06, + "loss": 3.1834, + "step": 8245 + }, + { + "epoch": 0.5605381165919282, + "grad_norm": 7.41110897064209, + "learning_rate": 9.299667074330753e-06, + "loss": 3.3948, + "step": 8250 + }, + { + "epoch": 0.5608778366625901, + "grad_norm": 6.574060440063477, + "learning_rate": 9.299242424242425e-06, + "loss": 3.1843, + "step": 8255 + }, + { + "epoch": 0.5612175567332518, + "grad_norm": 6.173676490783691, + "learning_rate": 9.298817774154098e-06, + "loss": 3.384, + "step": 8260 + }, + { + "epoch": 0.5615572768039135, + "grad_norm": 5.4675703048706055, + "learning_rate": 9.298393124065771e-06, + "loss": 3.3663, + "step": 8265 + }, + { + "epoch": 0.5618969968745754, + "grad_norm": 5.699326038360596, + "learning_rate": 9.297968473977444e-06, + "loss": 3.2303, + "step": 8270 + }, + { + "epoch": 0.5622367169452371, + "grad_norm": 8.059255599975586, + "learning_rate": 9.297543823889115e-06, + "loss": 3.1974, + "step": 8275 + }, + { + "epoch": 0.5625764370158989, + "grad_norm": 8.429980278015137, + "learning_rate": 9.29711917380079e-06, + "loss": 3.4901, + "step": 8280 + }, + { + "epoch": 0.5629161570865606, + "grad_norm": 6.46212100982666, + "learning_rate": 9.296694523712462e-06, + "loss": 3.3513, + "step": 8285 + }, + { + "epoch": 0.5632558771572225, + "grad_norm": 6.972095489501953, + "learning_rate": 9.296269873624133e-06, + "loss": 3.5237, + "step": 8290 + }, + { + "epoch": 0.5635955972278842, + "grad_norm": 7.3061089515686035, + "learning_rate": 9.295845223535808e-06, + "loss": 3.2412, + "step": 8295 + }, + { + "epoch": 0.563935317298546, + "grad_norm": 5.837936878204346, + "learning_rate": 9.29542057344748e-06, + "loss": 3.5018, + "step": 8300 + }, + { + "epoch": 0.5642750373692078, + "grad_norm": 5.850408554077148, + "learning_rate": 9.294995923359152e-06, + "loss": 3.309, + "step": 8305 + }, + { + "epoch": 0.5646147574398696, + "grad_norm": 6.404292583465576, + "learning_rate": 9.294571273270826e-06, + "loss": 3.3537, + "step": 8310 + }, + { + "epoch": 0.5649544775105313, + "grad_norm": 6.468895435333252, + "learning_rate": 9.294146623182499e-06, + "loss": 3.1937, + "step": 8315 + }, + { + "epoch": 0.5652941975811931, + "grad_norm": 6.2834930419921875, + "learning_rate": 9.29372197309417e-06, + "loss": 3.2112, + "step": 8320 + }, + { + "epoch": 0.5656339176518549, + "grad_norm": 7.2141499519348145, + "learning_rate": 9.293297323005845e-06, + "loss": 3.5268, + "step": 8325 + }, + { + "epoch": 0.5659736377225166, + "grad_norm": 7.148204326629639, + "learning_rate": 9.292872672917517e-06, + "loss": 3.4706, + "step": 8330 + }, + { + "epoch": 0.5663133577931784, + "grad_norm": 6.6410369873046875, + "learning_rate": 9.292448022829189e-06, + "loss": 3.5455, + "step": 8335 + }, + { + "epoch": 0.5666530778638402, + "grad_norm": 7.461374759674072, + "learning_rate": 9.292023372740863e-06, + "loss": 3.4243, + "step": 8340 + }, + { + "epoch": 0.566992797934502, + "grad_norm": 6.483984470367432, + "learning_rate": 9.291598722652534e-06, + "loss": 3.318, + "step": 8345 + }, + { + "epoch": 0.5673325180051637, + "grad_norm": 9.014633178710938, + "learning_rate": 9.291174072564207e-06, + "loss": 3.6542, + "step": 8350 + }, + { + "epoch": 0.5676722380758256, + "grad_norm": 6.34745979309082, + "learning_rate": 9.290749422475881e-06, + "loss": 3.5048, + "step": 8355 + }, + { + "epoch": 0.5680119581464873, + "grad_norm": 5.3658528327941895, + "learning_rate": 9.290324772387553e-06, + "loss": 3.2624, + "step": 8360 + }, + { + "epoch": 0.568351678217149, + "grad_norm": 6.258679389953613, + "learning_rate": 9.289900122299225e-06, + "loss": 3.2704, + "step": 8365 + }, + { + "epoch": 0.5686913982878108, + "grad_norm": 6.253342151641846, + "learning_rate": 9.2894754722109e-06, + "loss": 3.3718, + "step": 8370 + }, + { + "epoch": 0.5690311183584726, + "grad_norm": 5.813457489013672, + "learning_rate": 9.289050822122571e-06, + "loss": 3.5467, + "step": 8375 + }, + { + "epoch": 0.5693708384291344, + "grad_norm": 6.0005106925964355, + "learning_rate": 9.288626172034244e-06, + "loss": 3.6071, + "step": 8380 + }, + { + "epoch": 0.5697105584997961, + "grad_norm": 5.919152736663818, + "learning_rate": 9.288201521945918e-06, + "loss": 3.3857, + "step": 8385 + }, + { + "epoch": 0.570050278570458, + "grad_norm": 6.4245710372924805, + "learning_rate": 9.28777687185759e-06, + "loss": 3.3234, + "step": 8390 + }, + { + "epoch": 0.5703899986411197, + "grad_norm": 6.903382301330566, + "learning_rate": 9.287352221769262e-06, + "loss": 3.6726, + "step": 8395 + }, + { + "epoch": 0.5707297187117815, + "grad_norm": 6.507415294647217, + "learning_rate": 9.286927571680937e-06, + "loss": 3.2626, + "step": 8400 + }, + { + "epoch": 0.5710694387824433, + "grad_norm": 7.314284324645996, + "learning_rate": 9.286502921592608e-06, + "loss": 3.4154, + "step": 8405 + }, + { + "epoch": 0.5714091588531051, + "grad_norm": 8.711563110351562, + "learning_rate": 9.28607827150428e-06, + "loss": 3.4309, + "step": 8410 + }, + { + "epoch": 0.5717488789237668, + "grad_norm": 7.386160373687744, + "learning_rate": 9.285653621415955e-06, + "loss": 2.9839, + "step": 8415 + }, + { + "epoch": 0.5720885989944285, + "grad_norm": 6.995179176330566, + "learning_rate": 9.285228971327626e-06, + "loss": 3.4549, + "step": 8420 + }, + { + "epoch": 0.5724283190650904, + "grad_norm": 5.555901050567627, + "learning_rate": 9.284804321239299e-06, + "loss": 3.414, + "step": 8425 + }, + { + "epoch": 0.5727680391357521, + "grad_norm": 5.727787971496582, + "learning_rate": 9.284379671150972e-06, + "loss": 3.4298, + "step": 8430 + }, + { + "epoch": 0.5731077592064139, + "grad_norm": 9.741089820861816, + "learning_rate": 9.283955021062645e-06, + "loss": 3.1975, + "step": 8435 + }, + { + "epoch": 0.5734474792770757, + "grad_norm": 7.804315090179443, + "learning_rate": 9.283530370974317e-06, + "loss": 3.3393, + "step": 8440 + }, + { + "epoch": 0.5737871993477375, + "grad_norm": 8.198302268981934, + "learning_rate": 9.28310572088599e-06, + "loss": 3.2248, + "step": 8445 + }, + { + "epoch": 0.5741269194183992, + "grad_norm": 7.356081962585449, + "learning_rate": 9.282681070797663e-06, + "loss": 3.3977, + "step": 8450 + }, + { + "epoch": 0.574466639489061, + "grad_norm": 6.550528049468994, + "learning_rate": 9.282256420709336e-06, + "loss": 3.0529, + "step": 8455 + }, + { + "epoch": 0.5748063595597228, + "grad_norm": 6.847325801849365, + "learning_rate": 9.281831770621009e-06, + "loss": 3.5421, + "step": 8460 + }, + { + "epoch": 0.5751460796303846, + "grad_norm": 6.694525718688965, + "learning_rate": 9.281407120532681e-06, + "loss": 3.4677, + "step": 8465 + }, + { + "epoch": 0.5754857997010463, + "grad_norm": 6.596917152404785, + "learning_rate": 9.280982470444354e-06, + "loss": 3.321, + "step": 8470 + }, + { + "epoch": 0.5758255197717081, + "grad_norm": 6.758399486541748, + "learning_rate": 9.280557820356027e-06, + "loss": 3.3704, + "step": 8475 + }, + { + "epoch": 0.5761652398423699, + "grad_norm": 6.7966532707214355, + "learning_rate": 9.2801331702677e-06, + "loss": 3.2314, + "step": 8480 + }, + { + "epoch": 0.5765049599130316, + "grad_norm": 6.9139509201049805, + "learning_rate": 9.279708520179373e-06, + "loss": 3.3845, + "step": 8485 + }, + { + "epoch": 0.5768446799836935, + "grad_norm": 7.762287139892578, + "learning_rate": 9.279283870091045e-06, + "loss": 3.1948, + "step": 8490 + }, + { + "epoch": 0.5771844000543552, + "grad_norm": 8.866230964660645, + "learning_rate": 9.278859220002718e-06, + "loss": 3.4243, + "step": 8495 + }, + { + "epoch": 0.577524120125017, + "grad_norm": 5.754431247711182, + "learning_rate": 9.278434569914391e-06, + "loss": 3.4308, + "step": 8500 + }, + { + "epoch": 0.5778638401956787, + "grad_norm": 7.171598434448242, + "learning_rate": 9.278009919826064e-06, + "loss": 3.2602, + "step": 8505 + }, + { + "epoch": 0.5782035602663406, + "grad_norm": 6.816784381866455, + "learning_rate": 9.277585269737737e-06, + "loss": 3.3793, + "step": 8510 + }, + { + "epoch": 0.5785432803370023, + "grad_norm": 6.944392681121826, + "learning_rate": 9.27716061964941e-06, + "loss": 3.327, + "step": 8515 + }, + { + "epoch": 0.578883000407664, + "grad_norm": 7.285239219665527, + "learning_rate": 9.276735969561082e-06, + "loss": 3.1713, + "step": 8520 + }, + { + "epoch": 0.5792227204783259, + "grad_norm": 5.771254062652588, + "learning_rate": 9.276311319472755e-06, + "loss": 3.2834, + "step": 8525 + }, + { + "epoch": 0.5795624405489876, + "grad_norm": 5.028548717498779, + "learning_rate": 9.275886669384428e-06, + "loss": 3.2462, + "step": 8530 + }, + { + "epoch": 0.5799021606196494, + "grad_norm": 7.399678707122803, + "learning_rate": 9.2754620192961e-06, + "loss": 3.26, + "step": 8535 + }, + { + "epoch": 0.5802418806903111, + "grad_norm": 10.384771347045898, + "learning_rate": 9.275037369207773e-06, + "loss": 3.3261, + "step": 8540 + }, + { + "epoch": 0.580581600760973, + "grad_norm": 8.299656867980957, + "learning_rate": 9.274612719119446e-06, + "loss": 3.5193, + "step": 8545 + }, + { + "epoch": 0.5809213208316347, + "grad_norm": 6.682275772094727, + "learning_rate": 9.274188069031119e-06, + "loss": 3.2, + "step": 8550 + }, + { + "epoch": 0.5812610409022965, + "grad_norm": 6.191930770874023, + "learning_rate": 9.273763418942792e-06, + "loss": 3.4001, + "step": 8555 + }, + { + "epoch": 0.5816007609729583, + "grad_norm": 6.688408374786377, + "learning_rate": 9.273338768854465e-06, + "loss": 3.3978, + "step": 8560 + }, + { + "epoch": 0.5819404810436201, + "grad_norm": 6.6615071296691895, + "learning_rate": 9.272914118766137e-06, + "loss": 3.3249, + "step": 8565 + }, + { + "epoch": 0.5822802011142818, + "grad_norm": 6.803758144378662, + "learning_rate": 9.27248946867781e-06, + "loss": 3.3739, + "step": 8570 + }, + { + "epoch": 0.5826199211849437, + "grad_norm": 7.563642978668213, + "learning_rate": 9.272064818589483e-06, + "loss": 3.3688, + "step": 8575 + }, + { + "epoch": 0.5829596412556054, + "grad_norm": 6.712185859680176, + "learning_rate": 9.271640168501156e-06, + "loss": 3.2174, + "step": 8580 + }, + { + "epoch": 0.5832993613262671, + "grad_norm": 7.01743745803833, + "learning_rate": 9.271215518412829e-06, + "loss": 3.4103, + "step": 8585 + }, + { + "epoch": 0.5836390813969289, + "grad_norm": 7.476060390472412, + "learning_rate": 9.270790868324501e-06, + "loss": 3.4549, + "step": 8590 + }, + { + "epoch": 0.5839788014675907, + "grad_norm": 6.209798812866211, + "learning_rate": 9.270366218236174e-06, + "loss": 3.6892, + "step": 8595 + }, + { + "epoch": 0.5843185215382525, + "grad_norm": 6.874569416046143, + "learning_rate": 9.269941568147847e-06, + "loss": 3.2526, + "step": 8600 + }, + { + "epoch": 0.5846582416089142, + "grad_norm": 7.830489158630371, + "learning_rate": 9.26951691805952e-06, + "loss": 3.5704, + "step": 8605 + }, + { + "epoch": 0.5849979616795761, + "grad_norm": 5.2021589279174805, + "learning_rate": 9.269092267971193e-06, + "loss": 3.2178, + "step": 8610 + }, + { + "epoch": 0.5853376817502378, + "grad_norm": 6.143906116485596, + "learning_rate": 9.268667617882865e-06, + "loss": 3.398, + "step": 8615 + }, + { + "epoch": 0.5856774018208996, + "grad_norm": 9.865966796875, + "learning_rate": 9.268242967794538e-06, + "loss": 3.2905, + "step": 8620 + }, + { + "epoch": 0.5860171218915613, + "grad_norm": 8.576623916625977, + "learning_rate": 9.267818317706211e-06, + "loss": 3.3945, + "step": 8625 + }, + { + "epoch": 0.5863568419622232, + "grad_norm": 6.971482753753662, + "learning_rate": 9.267393667617884e-06, + "loss": 3.5347, + "step": 8630 + }, + { + "epoch": 0.5866965620328849, + "grad_norm": 5.542952060699463, + "learning_rate": 9.266969017529557e-06, + "loss": 3.3746, + "step": 8635 + }, + { + "epoch": 0.5870362821035466, + "grad_norm": 7.623120307922363, + "learning_rate": 9.26654436744123e-06, + "loss": 3.0451, + "step": 8640 + }, + { + "epoch": 0.5873760021742085, + "grad_norm": 6.1923346519470215, + "learning_rate": 9.266119717352902e-06, + "loss": 3.6598, + "step": 8645 + }, + { + "epoch": 0.5877157222448702, + "grad_norm": 6.067724227905273, + "learning_rate": 9.265695067264575e-06, + "loss": 3.6276, + "step": 8650 + }, + { + "epoch": 0.588055442315532, + "grad_norm": 5.874804973602295, + "learning_rate": 9.265270417176248e-06, + "loss": 3.1694, + "step": 8655 + }, + { + "epoch": 0.5883951623861938, + "grad_norm": 8.410348892211914, + "learning_rate": 9.26484576708792e-06, + "loss": 3.2313, + "step": 8660 + }, + { + "epoch": 0.5887348824568556, + "grad_norm": 6.159420967102051, + "learning_rate": 9.264421116999593e-06, + "loss": 3.3145, + "step": 8665 + }, + { + "epoch": 0.5890746025275173, + "grad_norm": 4.78861665725708, + "learning_rate": 9.263996466911266e-06, + "loss": 3.4636, + "step": 8670 + }, + { + "epoch": 0.589414322598179, + "grad_norm": 6.398593902587891, + "learning_rate": 9.263571816822939e-06, + "loss": 3.3603, + "step": 8675 + }, + { + "epoch": 0.5897540426688409, + "grad_norm": 5.9349799156188965, + "learning_rate": 9.263147166734612e-06, + "loss": 3.3726, + "step": 8680 + }, + { + "epoch": 0.5900937627395026, + "grad_norm": 6.5928239822387695, + "learning_rate": 9.262722516646285e-06, + "loss": 3.2315, + "step": 8685 + }, + { + "epoch": 0.5904334828101644, + "grad_norm": 6.351653575897217, + "learning_rate": 9.262297866557956e-06, + "loss": 3.1173, + "step": 8690 + }, + { + "epoch": 0.5907732028808262, + "grad_norm": 6.069891929626465, + "learning_rate": 9.26187321646963e-06, + "loss": 3.3511, + "step": 8695 + }, + { + "epoch": 0.591112922951488, + "grad_norm": 6.8769121170043945, + "learning_rate": 9.261448566381303e-06, + "loss": 3.4773, + "step": 8700 + }, + { + "epoch": 0.5914526430221497, + "grad_norm": 6.9748382568359375, + "learning_rate": 9.261023916292974e-06, + "loss": 3.4381, + "step": 8705 + }, + { + "epoch": 0.5917923630928115, + "grad_norm": 7.272889137268066, + "learning_rate": 9.260599266204649e-06, + "loss": 3.3625, + "step": 8710 + }, + { + "epoch": 0.5921320831634733, + "grad_norm": 7.303472518920898, + "learning_rate": 9.260174616116321e-06, + "loss": 3.1574, + "step": 8715 + }, + { + "epoch": 0.5924718032341351, + "grad_norm": 7.133380889892578, + "learning_rate": 9.259749966027993e-06, + "loss": 3.3997, + "step": 8720 + }, + { + "epoch": 0.5928115233047968, + "grad_norm": 7.370189666748047, + "learning_rate": 9.259325315939667e-06, + "loss": 3.4633, + "step": 8725 + }, + { + "epoch": 0.5931512433754587, + "grad_norm": 7.532242298126221, + "learning_rate": 9.25890066585134e-06, + "loss": 3.5474, + "step": 8730 + }, + { + "epoch": 0.5934909634461204, + "grad_norm": 10.36549186706543, + "learning_rate": 9.258476015763011e-06, + "loss": 3.4434, + "step": 8735 + }, + { + "epoch": 0.5938306835167821, + "grad_norm": 7.053276538848877, + "learning_rate": 9.258051365674685e-06, + "loss": 3.49, + "step": 8740 + }, + { + "epoch": 0.594170403587444, + "grad_norm": 7.049251556396484, + "learning_rate": 9.257626715586358e-06, + "loss": 3.3053, + "step": 8745 + }, + { + "epoch": 0.5945101236581057, + "grad_norm": 6.833352088928223, + "learning_rate": 9.25720206549803e-06, + "loss": 3.4003, + "step": 8750 + }, + { + "epoch": 0.5948498437287675, + "grad_norm": 5.992280960083008, + "learning_rate": 9.256777415409704e-06, + "loss": 3.1842, + "step": 8755 + }, + { + "epoch": 0.5951895637994292, + "grad_norm": 9.174123764038086, + "learning_rate": 9.256352765321377e-06, + "loss": 3.4534, + "step": 8760 + }, + { + "epoch": 0.5955292838700911, + "grad_norm": 7.674689769744873, + "learning_rate": 9.255928115233048e-06, + "loss": 3.3342, + "step": 8765 + }, + { + "epoch": 0.5958690039407528, + "grad_norm": 5.946090221405029, + "learning_rate": 9.255503465144722e-06, + "loss": 3.3406, + "step": 8770 + }, + { + "epoch": 0.5962087240114146, + "grad_norm": 8.098655700683594, + "learning_rate": 9.255078815056393e-06, + "loss": 3.3565, + "step": 8775 + }, + { + "epoch": 0.5965484440820764, + "grad_norm": 6.875548362731934, + "learning_rate": 9.254654164968066e-06, + "loss": 3.3447, + "step": 8780 + }, + { + "epoch": 0.5968881641527382, + "grad_norm": 7.318806171417236, + "learning_rate": 9.25422951487974e-06, + "loss": 3.4176, + "step": 8785 + }, + { + "epoch": 0.5972278842233999, + "grad_norm": 7.070799350738525, + "learning_rate": 9.253804864791412e-06, + "loss": 3.4313, + "step": 8790 + }, + { + "epoch": 0.5975676042940616, + "grad_norm": 5.602840900421143, + "learning_rate": 9.253380214703085e-06, + "loss": 3.4038, + "step": 8795 + }, + { + "epoch": 0.5979073243647235, + "grad_norm": 5.987039566040039, + "learning_rate": 9.252955564614759e-06, + "loss": 3.265, + "step": 8800 + }, + { + "epoch": 0.5982470444353852, + "grad_norm": 7.727420806884766, + "learning_rate": 9.25253091452643e-06, + "loss": 3.4108, + "step": 8805 + }, + { + "epoch": 0.598586764506047, + "grad_norm": 6.699317455291748, + "learning_rate": 9.252106264438103e-06, + "loss": 3.4698, + "step": 8810 + }, + { + "epoch": 0.5989264845767088, + "grad_norm": 8.905025482177734, + "learning_rate": 9.251681614349777e-06, + "loss": 3.3965, + "step": 8815 + }, + { + "epoch": 0.5992662046473706, + "grad_norm": 7.062687873840332, + "learning_rate": 9.251256964261449e-06, + "loss": 3.2305, + "step": 8820 + }, + { + "epoch": 0.5996059247180323, + "grad_norm": 7.305023670196533, + "learning_rate": 9.250832314173121e-06, + "loss": 3.3205, + "step": 8825 + }, + { + "epoch": 0.5999456447886942, + "grad_norm": 5.547568321228027, + "learning_rate": 9.250407664084796e-06, + "loss": 3.311, + "step": 8830 + }, + { + "epoch": 0.6002853648593559, + "grad_norm": 6.632659912109375, + "learning_rate": 9.249983013996467e-06, + "loss": 3.3943, + "step": 8835 + }, + { + "epoch": 0.6006250849300176, + "grad_norm": 6.001976490020752, + "learning_rate": 9.249558363908141e-06, + "loss": 3.4456, + "step": 8840 + }, + { + "epoch": 0.6009648050006794, + "grad_norm": 6.071115016937256, + "learning_rate": 9.249133713819813e-06, + "loss": 3.3695, + "step": 8845 + }, + { + "epoch": 0.6013045250713412, + "grad_norm": 6.646505355834961, + "learning_rate": 9.248709063731485e-06, + "loss": 3.5553, + "step": 8850 + }, + { + "epoch": 0.601644245142003, + "grad_norm": 6.906591415405273, + "learning_rate": 9.24828441364316e-06, + "loss": 3.4239, + "step": 8855 + }, + { + "epoch": 0.6019839652126647, + "grad_norm": 6.531507968902588, + "learning_rate": 9.247859763554831e-06, + "loss": 3.3153, + "step": 8860 + }, + { + "epoch": 0.6023236852833266, + "grad_norm": 5.961309432983398, + "learning_rate": 9.247435113466504e-06, + "loss": 3.3469, + "step": 8865 + }, + { + "epoch": 0.6026634053539883, + "grad_norm": 6.293231010437012, + "learning_rate": 9.247010463378178e-06, + "loss": 3.2405, + "step": 8870 + }, + { + "epoch": 0.6030031254246501, + "grad_norm": 7.373183250427246, + "learning_rate": 9.24658581328985e-06, + "loss": 3.2843, + "step": 8875 + }, + { + "epoch": 0.6033428454953118, + "grad_norm": 8.089350700378418, + "learning_rate": 9.246161163201522e-06, + "loss": 3.2972, + "step": 8880 + }, + { + "epoch": 0.6036825655659737, + "grad_norm": 7.571255207061768, + "learning_rate": 9.245736513113197e-06, + "loss": 3.5711, + "step": 8885 + }, + { + "epoch": 0.6040222856366354, + "grad_norm": 5.1000261306762695, + "learning_rate": 9.245311863024868e-06, + "loss": 3.3356, + "step": 8890 + }, + { + "epoch": 0.6043620057072971, + "grad_norm": 7.17865514755249, + "learning_rate": 9.24488721293654e-06, + "loss": 3.4092, + "step": 8895 + }, + { + "epoch": 0.604701725777959, + "grad_norm": 7.068516731262207, + "learning_rate": 9.244462562848215e-06, + "loss": 3.5261, + "step": 8900 + }, + { + "epoch": 0.6050414458486207, + "grad_norm": 7.608928203582764, + "learning_rate": 9.244037912759886e-06, + "loss": 3.3213, + "step": 8905 + }, + { + "epoch": 0.6053811659192825, + "grad_norm": 5.999921798706055, + "learning_rate": 9.243613262671559e-06, + "loss": 3.3375, + "step": 8910 + }, + { + "epoch": 0.6057208859899443, + "grad_norm": 8.357889175415039, + "learning_rate": 9.243188612583232e-06, + "loss": 3.2392, + "step": 8915 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 5.521888732910156, + "learning_rate": 9.242763962494905e-06, + "loss": 3.305, + "step": 8920 + }, + { + "epoch": 0.6064003261312678, + "grad_norm": 7.099625587463379, + "learning_rate": 9.242339312406577e-06, + "loss": 3.2265, + "step": 8925 + }, + { + "epoch": 0.6067400462019296, + "grad_norm": 7.6956610679626465, + "learning_rate": 9.24191466231825e-06, + "loss": 3.5012, + "step": 8930 + }, + { + "epoch": 0.6070797662725914, + "grad_norm": 8.751075744628906, + "learning_rate": 9.241490012229923e-06, + "loss": 3.4402, + "step": 8935 + }, + { + "epoch": 0.6074194863432532, + "grad_norm": 6.248912334442139, + "learning_rate": 9.241065362141596e-06, + "loss": 3.3512, + "step": 8940 + }, + { + "epoch": 0.6077592064139149, + "grad_norm": 7.358643531799316, + "learning_rate": 9.240640712053269e-06, + "loss": 3.4619, + "step": 8945 + }, + { + "epoch": 0.6080989264845768, + "grad_norm": 6.816802024841309, + "learning_rate": 9.240216061964941e-06, + "loss": 3.4637, + "step": 8950 + }, + { + "epoch": 0.6084386465552385, + "grad_norm": 7.022294521331787, + "learning_rate": 9.239791411876614e-06, + "loss": 3.3155, + "step": 8955 + }, + { + "epoch": 0.6087783666259002, + "grad_norm": 6.480865001678467, + "learning_rate": 9.239366761788287e-06, + "loss": 3.3877, + "step": 8960 + }, + { + "epoch": 0.609118086696562, + "grad_norm": 6.937707424163818, + "learning_rate": 9.23894211169996e-06, + "loss": 3.2298, + "step": 8965 + }, + { + "epoch": 0.6094578067672238, + "grad_norm": 5.383481025695801, + "learning_rate": 9.238517461611633e-06, + "loss": 3.3829, + "step": 8970 + }, + { + "epoch": 0.6097975268378856, + "grad_norm": 7.1855082511901855, + "learning_rate": 9.238092811523305e-06, + "loss": 3.2644, + "step": 8975 + }, + { + "epoch": 0.6101372469085473, + "grad_norm": 8.146191596984863, + "learning_rate": 9.237668161434978e-06, + "loss": 3.1736, + "step": 8980 + }, + { + "epoch": 0.6104769669792092, + "grad_norm": 6.066573619842529, + "learning_rate": 9.237243511346651e-06, + "loss": 3.1309, + "step": 8985 + }, + { + "epoch": 0.6108166870498709, + "grad_norm": 5.6682233810424805, + "learning_rate": 9.236818861258324e-06, + "loss": 3.3837, + "step": 8990 + }, + { + "epoch": 0.6111564071205327, + "grad_norm": 7.52933931350708, + "learning_rate": 9.236394211169997e-06, + "loss": 3.6544, + "step": 8995 + }, + { + "epoch": 0.6114961271911945, + "grad_norm": 5.405888557434082, + "learning_rate": 9.23596956108167e-06, + "loss": 3.3893, + "step": 9000 + }, + { + "epoch": 0.6118358472618562, + "grad_norm": 7.794858455657959, + "learning_rate": 9.235544910993342e-06, + "loss": 3.4936, + "step": 9005 + }, + { + "epoch": 0.612175567332518, + "grad_norm": 6.735956192016602, + "learning_rate": 9.235120260905015e-06, + "loss": 3.3357, + "step": 9010 + }, + { + "epoch": 0.6125152874031797, + "grad_norm": 5.6659393310546875, + "learning_rate": 9.234695610816688e-06, + "loss": 3.2402, + "step": 9015 + }, + { + "epoch": 0.6128550074738416, + "grad_norm": 6.979365825653076, + "learning_rate": 9.23427096072836e-06, + "loss": 3.4921, + "step": 9020 + }, + { + "epoch": 0.6131947275445033, + "grad_norm": 7.736665725708008, + "learning_rate": 9.233846310640033e-06, + "loss": 3.1781, + "step": 9025 + }, + { + "epoch": 0.6135344476151651, + "grad_norm": 7.69908332824707, + "learning_rate": 9.233421660551706e-06, + "loss": 3.2984, + "step": 9030 + }, + { + "epoch": 0.6138741676858269, + "grad_norm": 7.489945888519287, + "learning_rate": 9.232997010463379e-06, + "loss": 3.2022, + "step": 9035 + }, + { + "epoch": 0.6142138877564887, + "grad_norm": 5.5901265144348145, + "learning_rate": 9.232572360375052e-06, + "loss": 3.5374, + "step": 9040 + }, + { + "epoch": 0.6145536078271504, + "grad_norm": 7.34580659866333, + "learning_rate": 9.232147710286725e-06, + "loss": 3.4818, + "step": 9045 + }, + { + "epoch": 0.6148933278978121, + "grad_norm": 6.94255256652832, + "learning_rate": 9.231723060198397e-06, + "loss": 3.4305, + "step": 9050 + }, + { + "epoch": 0.615233047968474, + "grad_norm": 7.576052665710449, + "learning_rate": 9.23129841011007e-06, + "loss": 3.5302, + "step": 9055 + }, + { + "epoch": 0.6155727680391357, + "grad_norm": 6.735146999359131, + "learning_rate": 9.230873760021743e-06, + "loss": 3.3414, + "step": 9060 + }, + { + "epoch": 0.6159124881097975, + "grad_norm": 6.241534233093262, + "learning_rate": 9.230449109933416e-06, + "loss": 3.5613, + "step": 9065 + }, + { + "epoch": 0.6162522081804593, + "grad_norm": 7.203308582305908, + "learning_rate": 9.230024459845089e-06, + "loss": 3.1676, + "step": 9070 + }, + { + "epoch": 0.6165919282511211, + "grad_norm": 6.766077518463135, + "learning_rate": 9.229599809756761e-06, + "loss": 3.1936, + "step": 9075 + }, + { + "epoch": 0.6169316483217828, + "grad_norm": 5.647199630737305, + "learning_rate": 9.229175159668434e-06, + "loss": 3.4046, + "step": 9080 + }, + { + "epoch": 0.6172713683924447, + "grad_norm": 6.742757797241211, + "learning_rate": 9.228750509580107e-06, + "loss": 3.4442, + "step": 9085 + }, + { + "epoch": 0.6176110884631064, + "grad_norm": 7.170332431793213, + "learning_rate": 9.22832585949178e-06, + "loss": 3.253, + "step": 9090 + }, + { + "epoch": 0.6179508085337682, + "grad_norm": 5.90661096572876, + "learning_rate": 9.227901209403453e-06, + "loss": 3.3006, + "step": 9095 + }, + { + "epoch": 0.6182905286044299, + "grad_norm": 6.9584574699401855, + "learning_rate": 9.227476559315125e-06, + "loss": 3.6072, + "step": 9100 + }, + { + "epoch": 0.6186302486750918, + "grad_norm": 6.798515319824219, + "learning_rate": 9.227051909226798e-06, + "loss": 3.6067, + "step": 9105 + }, + { + "epoch": 0.6189699687457535, + "grad_norm": 7.56443977355957, + "learning_rate": 9.226627259138471e-06, + "loss": 3.0153, + "step": 9110 + }, + { + "epoch": 0.6193096888164152, + "grad_norm": 6.92351770401001, + "learning_rate": 9.226202609050144e-06, + "loss": 3.4946, + "step": 9115 + }, + { + "epoch": 0.6196494088870771, + "grad_norm": 5.945494651794434, + "learning_rate": 9.225777958961815e-06, + "loss": 3.3463, + "step": 9120 + }, + { + "epoch": 0.6199891289577388, + "grad_norm": 6.738556385040283, + "learning_rate": 9.22535330887349e-06, + "loss": 3.5023, + "step": 9125 + }, + { + "epoch": 0.6203288490284006, + "grad_norm": 7.389678001403809, + "learning_rate": 9.224928658785162e-06, + "loss": 3.3434, + "step": 9130 + }, + { + "epoch": 0.6206685690990623, + "grad_norm": 6.1889472007751465, + "learning_rate": 9.224504008696833e-06, + "loss": 3.4215, + "step": 9135 + }, + { + "epoch": 0.6210082891697242, + "grad_norm": 5.400968074798584, + "learning_rate": 9.224079358608508e-06, + "loss": 3.2544, + "step": 9140 + }, + { + "epoch": 0.6213480092403859, + "grad_norm": 7.5719122886657715, + "learning_rate": 9.22365470852018e-06, + "loss": 3.4626, + "step": 9145 + }, + { + "epoch": 0.6216877293110477, + "grad_norm": 4.654695510864258, + "learning_rate": 9.223230058431852e-06, + "loss": 3.1458, + "step": 9150 + }, + { + "epoch": 0.6220274493817095, + "grad_norm": 5.403666019439697, + "learning_rate": 9.222805408343526e-06, + "loss": 3.3495, + "step": 9155 + }, + { + "epoch": 0.6223671694523712, + "grad_norm": 6.267913341522217, + "learning_rate": 9.222380758255199e-06, + "loss": 3.3873, + "step": 9160 + }, + { + "epoch": 0.622706889523033, + "grad_norm": 7.369998455047607, + "learning_rate": 9.22195610816687e-06, + "loss": 3.3472, + "step": 9165 + }, + { + "epoch": 0.6230466095936948, + "grad_norm": 7.418186187744141, + "learning_rate": 9.221531458078545e-06, + "loss": 3.4355, + "step": 9170 + }, + { + "epoch": 0.6233863296643566, + "grad_norm": 7.0693511962890625, + "learning_rate": 9.221106807990217e-06, + "loss": 3.3923, + "step": 9175 + }, + { + "epoch": 0.6237260497350183, + "grad_norm": 8.852241516113281, + "learning_rate": 9.22068215790189e-06, + "loss": 3.0571, + "step": 9180 + }, + { + "epoch": 0.6240657698056801, + "grad_norm": 5.4957122802734375, + "learning_rate": 9.220257507813563e-06, + "loss": 3.5914, + "step": 9185 + }, + { + "epoch": 0.6244054898763419, + "grad_norm": 7.008285045623779, + "learning_rate": 9.219832857725234e-06, + "loss": 3.488, + "step": 9190 + }, + { + "epoch": 0.6247452099470037, + "grad_norm": 6.707099437713623, + "learning_rate": 9.219408207636909e-06, + "loss": 3.5046, + "step": 9195 + }, + { + "epoch": 0.6250849300176654, + "grad_norm": 7.102568626403809, + "learning_rate": 9.218983557548581e-06, + "loss": 3.2988, + "step": 9200 + }, + { + "epoch": 0.6254246500883273, + "grad_norm": 6.322928428649902, + "learning_rate": 9.218558907460253e-06, + "loss": 3.4406, + "step": 9205 + }, + { + "epoch": 0.625764370158989, + "grad_norm": 8.004341125488281, + "learning_rate": 9.218134257371927e-06, + "loss": 3.5923, + "step": 9210 + }, + { + "epoch": 0.6261040902296507, + "grad_norm": 8.493672370910645, + "learning_rate": 9.2177096072836e-06, + "loss": 3.5617, + "step": 9215 + }, + { + "epoch": 0.6264438103003126, + "grad_norm": 7.463306427001953, + "learning_rate": 9.217284957195271e-06, + "loss": 3.4076, + "step": 9220 + }, + { + "epoch": 0.6267835303709743, + "grad_norm": 7.620706081390381, + "learning_rate": 9.216860307106945e-06, + "loss": 3.7891, + "step": 9225 + }, + { + "epoch": 0.6271232504416361, + "grad_norm": 8.87363338470459, + "learning_rate": 9.216435657018618e-06, + "loss": 3.3481, + "step": 9230 + }, + { + "epoch": 0.6274629705122978, + "grad_norm": 6.527007102966309, + "learning_rate": 9.21601100693029e-06, + "loss": 3.2676, + "step": 9235 + }, + { + "epoch": 0.6278026905829597, + "grad_norm": 7.0613484382629395, + "learning_rate": 9.215586356841964e-06, + "loss": 3.3563, + "step": 9240 + }, + { + "epoch": 0.6281424106536214, + "grad_norm": 7.5915703773498535, + "learning_rate": 9.215161706753637e-06, + "loss": 3.3373, + "step": 9245 + }, + { + "epoch": 0.6284821307242832, + "grad_norm": 8.95241641998291, + "learning_rate": 9.214737056665308e-06, + "loss": 3.2485, + "step": 9250 + }, + { + "epoch": 0.628821850794945, + "grad_norm": 8.659767150878906, + "learning_rate": 9.214312406576982e-06, + "loss": 3.1841, + "step": 9255 + }, + { + "epoch": 0.6291615708656068, + "grad_norm": 5.632987976074219, + "learning_rate": 9.213887756488653e-06, + "loss": 3.3372, + "step": 9260 + }, + { + "epoch": 0.6295012909362685, + "grad_norm": 6.218563079833984, + "learning_rate": 9.213463106400326e-06, + "loss": 3.2366, + "step": 9265 + }, + { + "epoch": 0.6298410110069302, + "grad_norm": 6.8418869972229, + "learning_rate": 9.213038456312e-06, + "loss": 3.1856, + "step": 9270 + }, + { + "epoch": 0.6301807310775921, + "grad_norm": 6.8494696617126465, + "learning_rate": 9.212613806223672e-06, + "loss": 3.2453, + "step": 9275 + }, + { + "epoch": 0.6305204511482538, + "grad_norm": 6.510276794433594, + "learning_rate": 9.212189156135345e-06, + "loss": 3.4692, + "step": 9280 + }, + { + "epoch": 0.6308601712189156, + "grad_norm": 8.00693130493164, + "learning_rate": 9.211764506047019e-06, + "loss": 3.5843, + "step": 9285 + }, + { + "epoch": 0.6311998912895774, + "grad_norm": 8.039600372314453, + "learning_rate": 9.21133985595869e-06, + "loss": 3.0919, + "step": 9290 + }, + { + "epoch": 0.6315396113602392, + "grad_norm": 5.056745529174805, + "learning_rate": 9.210915205870363e-06, + "loss": 3.1676, + "step": 9295 + }, + { + "epoch": 0.6318793314309009, + "grad_norm": 8.188335418701172, + "learning_rate": 9.210490555782037e-06, + "loss": 3.4823, + "step": 9300 + }, + { + "epoch": 0.6322190515015628, + "grad_norm": 6.322336673736572, + "learning_rate": 9.210065905693709e-06, + "loss": 3.317, + "step": 9305 + }, + { + "epoch": 0.6325587715722245, + "grad_norm": 5.790127754211426, + "learning_rate": 9.209641255605381e-06, + "loss": 3.184, + "step": 9310 + }, + { + "epoch": 0.6328984916428863, + "grad_norm": 5.183618068695068, + "learning_rate": 9.209216605517056e-06, + "loss": 3.2031, + "step": 9315 + }, + { + "epoch": 0.633238211713548, + "grad_norm": 6.111123561859131, + "learning_rate": 9.208791955428727e-06, + "loss": 3.2468, + "step": 9320 + }, + { + "epoch": 0.6335779317842098, + "grad_norm": 7.5627055168151855, + "learning_rate": 9.2083673053404e-06, + "loss": 3.5205, + "step": 9325 + }, + { + "epoch": 0.6339176518548716, + "grad_norm": 6.484257221221924, + "learning_rate": 9.207942655252074e-06, + "loss": 3.4386, + "step": 9330 + }, + { + "epoch": 0.6342573719255333, + "grad_norm": 7.280816555023193, + "learning_rate": 9.207518005163745e-06, + "loss": 3.5404, + "step": 9335 + }, + { + "epoch": 0.6345970919961952, + "grad_norm": 6.421640396118164, + "learning_rate": 9.207093355075418e-06, + "loss": 3.495, + "step": 9340 + }, + { + "epoch": 0.6349368120668569, + "grad_norm": 6.849699020385742, + "learning_rate": 9.206668704987091e-06, + "loss": 3.3241, + "step": 9345 + }, + { + "epoch": 0.6352765321375187, + "grad_norm": 5.997524261474609, + "learning_rate": 9.206244054898764e-06, + "loss": 3.356, + "step": 9350 + }, + { + "epoch": 0.6356162522081804, + "grad_norm": 5.247689247131348, + "learning_rate": 9.205819404810437e-06, + "loss": 3.3017, + "step": 9355 + }, + { + "epoch": 0.6359559722788423, + "grad_norm": 6.344562530517578, + "learning_rate": 9.20539475472211e-06, + "loss": 3.3522, + "step": 9360 + }, + { + "epoch": 0.636295692349504, + "grad_norm": 7.483940124511719, + "learning_rate": 9.204970104633782e-06, + "loss": 3.4545, + "step": 9365 + }, + { + "epoch": 0.6366354124201657, + "grad_norm": 6.010200500488281, + "learning_rate": 9.204545454545455e-06, + "loss": 3.5454, + "step": 9370 + }, + { + "epoch": 0.6369751324908276, + "grad_norm": 9.156431198120117, + "learning_rate": 9.204120804457128e-06, + "loss": 3.4277, + "step": 9375 + }, + { + "epoch": 0.6373148525614893, + "grad_norm": 9.698859214782715, + "learning_rate": 9.2036961543688e-06, + "loss": 3.1173, + "step": 9380 + }, + { + "epoch": 0.6376545726321511, + "grad_norm": 5.2595601081848145, + "learning_rate": 9.203271504280473e-06, + "loss": 3.312, + "step": 9385 + }, + { + "epoch": 0.6379942927028129, + "grad_norm": 6.4138102531433105, + "learning_rate": 9.202846854192146e-06, + "loss": 3.3828, + "step": 9390 + }, + { + "epoch": 0.6383340127734747, + "grad_norm": 6.949238300323486, + "learning_rate": 9.202422204103819e-06, + "loss": 3.4085, + "step": 9395 + }, + { + "epoch": 0.6386737328441364, + "grad_norm": 6.487392425537109, + "learning_rate": 9.201997554015492e-06, + "loss": 3.3404, + "step": 9400 + }, + { + "epoch": 0.6390134529147982, + "grad_norm": 8.05395793914795, + "learning_rate": 9.201572903927165e-06, + "loss": 3.2749, + "step": 9405 + }, + { + "epoch": 0.63935317298546, + "grad_norm": 6.814947605133057, + "learning_rate": 9.201148253838837e-06, + "loss": 3.3529, + "step": 9410 + }, + { + "epoch": 0.6396928930561218, + "grad_norm": 7.008006572723389, + "learning_rate": 9.20072360375051e-06, + "loss": 3.1487, + "step": 9415 + }, + { + "epoch": 0.6400326131267835, + "grad_norm": 6.78053092956543, + "learning_rate": 9.200298953662183e-06, + "loss": 3.353, + "step": 9420 + }, + { + "epoch": 0.6403723331974454, + "grad_norm": 5.667815208435059, + "learning_rate": 9.199874303573856e-06, + "loss": 3.3956, + "step": 9425 + }, + { + "epoch": 0.6407120532681071, + "grad_norm": 6.908779144287109, + "learning_rate": 9.199449653485529e-06, + "loss": 3.3954, + "step": 9430 + }, + { + "epoch": 0.6410517733387688, + "grad_norm": 7.409111976623535, + "learning_rate": 9.199025003397201e-06, + "loss": 3.1881, + "step": 9435 + }, + { + "epoch": 0.6413914934094306, + "grad_norm": 6.330677509307861, + "learning_rate": 9.198600353308874e-06, + "loss": 3.4072, + "step": 9440 + }, + { + "epoch": 0.6417312134800924, + "grad_norm": 7.391622543334961, + "learning_rate": 9.198175703220547e-06, + "loss": 3.3565, + "step": 9445 + }, + { + "epoch": 0.6420709335507542, + "grad_norm": 7.223138809204102, + "learning_rate": 9.19775105313222e-06, + "loss": 3.3416, + "step": 9450 + }, + { + "epoch": 0.6424106536214159, + "grad_norm": 7.858185291290283, + "learning_rate": 9.197326403043893e-06, + "loss": 3.3149, + "step": 9455 + }, + { + "epoch": 0.6427503736920778, + "grad_norm": 7.321844100952148, + "learning_rate": 9.196901752955565e-06, + "loss": 3.4033, + "step": 9460 + }, + { + "epoch": 0.6430900937627395, + "grad_norm": 7.143193244934082, + "learning_rate": 9.196477102867238e-06, + "loss": 3.2512, + "step": 9465 + }, + { + "epoch": 0.6434298138334013, + "grad_norm": 8.199613571166992, + "learning_rate": 9.196052452778911e-06, + "loss": 3.381, + "step": 9470 + }, + { + "epoch": 0.6437695339040631, + "grad_norm": 6.467016220092773, + "learning_rate": 9.195627802690584e-06, + "loss": 3.4879, + "step": 9475 + }, + { + "epoch": 0.6441092539747248, + "grad_norm": 5.810321807861328, + "learning_rate": 9.195203152602257e-06, + "loss": 3.1424, + "step": 9480 + }, + { + "epoch": 0.6444489740453866, + "grad_norm": 6.119264602661133, + "learning_rate": 9.19477850251393e-06, + "loss": 3.756, + "step": 9485 + }, + { + "epoch": 0.6447886941160483, + "grad_norm": 6.773740291595459, + "learning_rate": 9.194353852425602e-06, + "loss": 3.3515, + "step": 9490 + }, + { + "epoch": 0.6451284141867102, + "grad_norm": 6.728276252746582, + "learning_rate": 9.193929202337275e-06, + "loss": 3.294, + "step": 9495 + }, + { + "epoch": 0.6454681342573719, + "grad_norm": 6.523977756500244, + "learning_rate": 9.193504552248948e-06, + "loss": 3.3163, + "step": 9500 + }, + { + "epoch": 0.6458078543280337, + "grad_norm": 5.77198600769043, + "learning_rate": 9.19307990216062e-06, + "loss": 3.1743, + "step": 9505 + }, + { + "epoch": 0.6461475743986955, + "grad_norm": 6.431521892547607, + "learning_rate": 9.192655252072293e-06, + "loss": 3.2978, + "step": 9510 + }, + { + "epoch": 0.6464872944693573, + "grad_norm": 5.9026031494140625, + "learning_rate": 9.192230601983966e-06, + "loss": 3.3766, + "step": 9515 + }, + { + "epoch": 0.646827014540019, + "grad_norm": 7.906280517578125, + "learning_rate": 9.191805951895639e-06, + "loss": 3.4759, + "step": 9520 + }, + { + "epoch": 0.6471667346106807, + "grad_norm": 7.4844069480896, + "learning_rate": 9.191381301807312e-06, + "loss": 3.4103, + "step": 9525 + }, + { + "epoch": 0.6475064546813426, + "grad_norm": 5.446690559387207, + "learning_rate": 9.190956651718985e-06, + "loss": 3.3758, + "step": 9530 + }, + { + "epoch": 0.6478461747520043, + "grad_norm": 6.2910003662109375, + "learning_rate": 9.190532001630657e-06, + "loss": 3.3523, + "step": 9535 + }, + { + "epoch": 0.6481858948226661, + "grad_norm": 8.325623512268066, + "learning_rate": 9.19010735154233e-06, + "loss": 3.5572, + "step": 9540 + }, + { + "epoch": 0.6485256148933279, + "grad_norm": 8.34815788269043, + "learning_rate": 9.189682701454003e-06, + "loss": 3.4539, + "step": 9545 + }, + { + "epoch": 0.6488653349639897, + "grad_norm": 6.797037601470947, + "learning_rate": 9.189258051365676e-06, + "loss": 3.5179, + "step": 9550 + }, + { + "epoch": 0.6492050550346514, + "grad_norm": 5.321785926818848, + "learning_rate": 9.188833401277349e-06, + "loss": 3.2967, + "step": 9555 + }, + { + "epoch": 0.6495447751053133, + "grad_norm": 6.538219928741455, + "learning_rate": 9.188408751189021e-06, + "loss": 3.4426, + "step": 9560 + }, + { + "epoch": 0.649884495175975, + "grad_norm": 5.648243427276611, + "learning_rate": 9.187984101100694e-06, + "loss": 3.4886, + "step": 9565 + }, + { + "epoch": 0.6502242152466368, + "grad_norm": 6.298815727233887, + "learning_rate": 9.187559451012367e-06, + "loss": 3.2558, + "step": 9570 + }, + { + "epoch": 0.6505639353172985, + "grad_norm": 7.462907314300537, + "learning_rate": 9.18713480092404e-06, + "loss": 3.1901, + "step": 9575 + }, + { + "epoch": 0.6509036553879604, + "grad_norm": 6.858093738555908, + "learning_rate": 9.186710150835713e-06, + "loss": 3.4378, + "step": 9580 + }, + { + "epoch": 0.6512433754586221, + "grad_norm": 8.394076347351074, + "learning_rate": 9.186285500747385e-06, + "loss": 3.3504, + "step": 9585 + }, + { + "epoch": 0.6515830955292838, + "grad_norm": 5.103447437286377, + "learning_rate": 9.185860850659058e-06, + "loss": 3.3, + "step": 9590 + }, + { + "epoch": 0.6519228155999457, + "grad_norm": 7.101974964141846, + "learning_rate": 9.185436200570731e-06, + "loss": 3.3984, + "step": 9595 + }, + { + "epoch": 0.6522625356706074, + "grad_norm": 6.519262313842773, + "learning_rate": 9.185011550482404e-06, + "loss": 3.2096, + "step": 9600 + }, + { + "epoch": 0.6526022557412692, + "grad_norm": 6.265388011932373, + "learning_rate": 9.184586900394075e-06, + "loss": 3.4524, + "step": 9605 + }, + { + "epoch": 0.6529419758119309, + "grad_norm": 7.498419761657715, + "learning_rate": 9.18416225030575e-06, + "loss": 3.4142, + "step": 9610 + }, + { + "epoch": 0.6532816958825928, + "grad_norm": 7.436209201812744, + "learning_rate": 9.183737600217422e-06, + "loss": 3.5159, + "step": 9615 + }, + { + "epoch": 0.6536214159532545, + "grad_norm": 6.619922161102295, + "learning_rate": 9.183312950129093e-06, + "loss": 3.2498, + "step": 9620 + }, + { + "epoch": 0.6539611360239163, + "grad_norm": 7.372441291809082, + "learning_rate": 9.182888300040768e-06, + "loss": 3.3889, + "step": 9625 + }, + { + "epoch": 0.6543008560945781, + "grad_norm": 7.012074947357178, + "learning_rate": 9.18246364995244e-06, + "loss": 3.3356, + "step": 9630 + }, + { + "epoch": 0.6546405761652399, + "grad_norm": 6.163271903991699, + "learning_rate": 9.182038999864112e-06, + "loss": 3.2962, + "step": 9635 + }, + { + "epoch": 0.6549802962359016, + "grad_norm": 7.4979095458984375, + "learning_rate": 9.181614349775786e-06, + "loss": 3.3225, + "step": 9640 + }, + { + "epoch": 0.6553200163065634, + "grad_norm": 6.199854373931885, + "learning_rate": 9.181189699687459e-06, + "loss": 3.1607, + "step": 9645 + }, + { + "epoch": 0.6556597363772252, + "grad_norm": 7.758447170257568, + "learning_rate": 9.18076504959913e-06, + "loss": 2.9487, + "step": 9650 + }, + { + "epoch": 0.6559994564478869, + "grad_norm": 7.079701900482178, + "learning_rate": 9.180340399510805e-06, + "loss": 3.3867, + "step": 9655 + }, + { + "epoch": 0.6563391765185487, + "grad_norm": 6.536162376403809, + "learning_rate": 9.179915749422477e-06, + "loss": 3.6707, + "step": 9660 + }, + { + "epoch": 0.6566788965892105, + "grad_norm": 7.809629440307617, + "learning_rate": 9.179491099334149e-06, + "loss": 3.2544, + "step": 9665 + }, + { + "epoch": 0.6570186166598723, + "grad_norm": 8.193182945251465, + "learning_rate": 9.179066449245823e-06, + "loss": 2.891, + "step": 9670 + }, + { + "epoch": 0.657358336730534, + "grad_norm": 7.182677268981934, + "learning_rate": 9.178641799157496e-06, + "loss": 3.1731, + "step": 9675 + }, + { + "epoch": 0.6576980568011959, + "grad_norm": 5.764720916748047, + "learning_rate": 9.178217149069167e-06, + "loss": 3.5393, + "step": 9680 + }, + { + "epoch": 0.6580377768718576, + "grad_norm": 7.761220932006836, + "learning_rate": 9.177792498980841e-06, + "loss": 3.1784, + "step": 9685 + }, + { + "epoch": 0.6583774969425193, + "grad_norm": 7.8854756355285645, + "learning_rate": 9.177367848892513e-06, + "loss": 3.086, + "step": 9690 + }, + { + "epoch": 0.6587172170131811, + "grad_norm": 7.800393581390381, + "learning_rate": 9.176943198804185e-06, + "loss": 3.4193, + "step": 9695 + }, + { + "epoch": 0.6590569370838429, + "grad_norm": 6.436568260192871, + "learning_rate": 9.17651854871586e-06, + "loss": 3.3094, + "step": 9700 + }, + { + "epoch": 0.6593966571545047, + "grad_norm": 7.493227958679199, + "learning_rate": 9.176093898627531e-06, + "loss": 3.3485, + "step": 9705 + }, + { + "epoch": 0.6597363772251664, + "grad_norm": 5.889840126037598, + "learning_rate": 9.175669248539204e-06, + "loss": 3.3043, + "step": 9710 + }, + { + "epoch": 0.6600760972958283, + "grad_norm": 6.407329559326172, + "learning_rate": 9.175244598450878e-06, + "loss": 3.4204, + "step": 9715 + }, + { + "epoch": 0.66041581736649, + "grad_norm": 6.029567241668701, + "learning_rate": 9.17481994836255e-06, + "loss": 3.7932, + "step": 9720 + }, + { + "epoch": 0.6607555374371518, + "grad_norm": 6.793412208557129, + "learning_rate": 9.174395298274222e-06, + "loss": 3.5242, + "step": 9725 + }, + { + "epoch": 0.6610952575078136, + "grad_norm": 7.588111400604248, + "learning_rate": 9.173970648185897e-06, + "loss": 3.0798, + "step": 9730 + }, + { + "epoch": 0.6614349775784754, + "grad_norm": 6.080990791320801, + "learning_rate": 9.173545998097568e-06, + "loss": 3.2967, + "step": 9735 + }, + { + "epoch": 0.6617746976491371, + "grad_norm": 7.849877834320068, + "learning_rate": 9.17312134800924e-06, + "loss": 3.2942, + "step": 9740 + }, + { + "epoch": 0.6621144177197988, + "grad_norm": 6.07185173034668, + "learning_rate": 9.172696697920915e-06, + "loss": 3.271, + "step": 9745 + }, + { + "epoch": 0.6624541377904607, + "grad_norm": 7.858744144439697, + "learning_rate": 9.172272047832586e-06, + "loss": 3.4186, + "step": 9750 + }, + { + "epoch": 0.6627938578611224, + "grad_norm": 8.102896690368652, + "learning_rate": 9.171847397744259e-06, + "loss": 3.5074, + "step": 9755 + }, + { + "epoch": 0.6631335779317842, + "grad_norm": 8.035725593566895, + "learning_rate": 9.171422747655932e-06, + "loss": 3.2762, + "step": 9760 + }, + { + "epoch": 0.663473298002446, + "grad_norm": 7.681070327758789, + "learning_rate": 9.170998097567605e-06, + "loss": 3.1218, + "step": 9765 + }, + { + "epoch": 0.6638130180731078, + "grad_norm": 7.024676322937012, + "learning_rate": 9.170573447479277e-06, + "loss": 3.3877, + "step": 9770 + }, + { + "epoch": 0.6641527381437695, + "grad_norm": 8.908258438110352, + "learning_rate": 9.17014879739095e-06, + "loss": 3.2604, + "step": 9775 + }, + { + "epoch": 0.6644924582144313, + "grad_norm": 5.517627716064453, + "learning_rate": 9.169724147302623e-06, + "loss": 3.5196, + "step": 9780 + }, + { + "epoch": 0.6648321782850931, + "grad_norm": 6.126286029815674, + "learning_rate": 9.169299497214296e-06, + "loss": 3.5277, + "step": 9785 + }, + { + "epoch": 0.6651718983557549, + "grad_norm": 5.607492923736572, + "learning_rate": 9.168874847125969e-06, + "loss": 3.22, + "step": 9790 + }, + { + "epoch": 0.6655116184264166, + "grad_norm": 6.567805767059326, + "learning_rate": 9.168450197037641e-06, + "loss": 3.1735, + "step": 9795 + }, + { + "epoch": 0.6658513384970784, + "grad_norm": 7.655783176422119, + "learning_rate": 9.168025546949314e-06, + "loss": 3.2358, + "step": 9800 + }, + { + "epoch": 0.6661910585677402, + "grad_norm": 5.229218482971191, + "learning_rate": 9.167600896860987e-06, + "loss": 3.2799, + "step": 9805 + }, + { + "epoch": 0.6665307786384019, + "grad_norm": 6.202256202697754, + "learning_rate": 9.16717624677266e-06, + "loss": 3.3192, + "step": 9810 + }, + { + "epoch": 0.6668704987090638, + "grad_norm": 7.563780307769775, + "learning_rate": 9.166751596684333e-06, + "loss": 3.6056, + "step": 9815 + }, + { + "epoch": 0.6672102187797255, + "grad_norm": 7.7346320152282715, + "learning_rate": 9.166326946596005e-06, + "loss": 3.5902, + "step": 9820 + }, + { + "epoch": 0.6675499388503873, + "grad_norm": 6.877519130706787, + "learning_rate": 9.165902296507678e-06, + "loss": 3.4816, + "step": 9825 + }, + { + "epoch": 0.667889658921049, + "grad_norm": 6.908212661743164, + "learning_rate": 9.165477646419351e-06, + "loss": 3.4175, + "step": 9830 + }, + { + "epoch": 0.6682293789917109, + "grad_norm": 7.143728733062744, + "learning_rate": 9.165052996331024e-06, + "loss": 3.3132, + "step": 9835 + }, + { + "epoch": 0.6685690990623726, + "grad_norm": 6.996997833251953, + "learning_rate": 9.164628346242697e-06, + "loss": 3.2593, + "step": 9840 + }, + { + "epoch": 0.6689088191330343, + "grad_norm": 7.0160346031188965, + "learning_rate": 9.16420369615437e-06, + "loss": 3.5376, + "step": 9845 + }, + { + "epoch": 0.6692485392036962, + "grad_norm": 7.260173797607422, + "learning_rate": 9.163779046066042e-06, + "loss": 3.1979, + "step": 9850 + }, + { + "epoch": 0.6695882592743579, + "grad_norm": 7.640297889709473, + "learning_rate": 9.163354395977715e-06, + "loss": 3.5806, + "step": 9855 + }, + { + "epoch": 0.6699279793450197, + "grad_norm": 5.11570405960083, + "learning_rate": 9.162929745889388e-06, + "loss": 3.5098, + "step": 9860 + }, + { + "epoch": 0.6702676994156814, + "grad_norm": 5.711785793304443, + "learning_rate": 9.16250509580106e-06, + "loss": 3.4783, + "step": 9865 + }, + { + "epoch": 0.6706074194863433, + "grad_norm": 7.050374984741211, + "learning_rate": 9.162080445712733e-06, + "loss": 3.2043, + "step": 9870 + }, + { + "epoch": 0.670947139557005, + "grad_norm": 5.966212749481201, + "learning_rate": 9.161655795624406e-06, + "loss": 3.6432, + "step": 9875 + }, + { + "epoch": 0.6712868596276668, + "grad_norm": 6.753113269805908, + "learning_rate": 9.161231145536079e-06, + "loss": 3.3876, + "step": 9880 + }, + { + "epoch": 0.6716265796983286, + "grad_norm": 6.263952255249023, + "learning_rate": 9.160806495447752e-06, + "loss": 3.4729, + "step": 9885 + }, + { + "epoch": 0.6719662997689904, + "grad_norm": 5.141692161560059, + "learning_rate": 9.160381845359425e-06, + "loss": 3.2542, + "step": 9890 + }, + { + "epoch": 0.6723060198396521, + "grad_norm": 6.564356327056885, + "learning_rate": 9.160042125288763e-06, + "loss": 3.2614, + "step": 9895 + }, + { + "epoch": 0.672645739910314, + "grad_norm": 5.758349895477295, + "learning_rate": 9.159617475200435e-06, + "loss": 3.4018, + "step": 9900 + }, + { + "epoch": 0.6729854599809757, + "grad_norm": 6.299472332000732, + "learning_rate": 9.159192825112109e-06, + "loss": 3.149, + "step": 9905 + }, + { + "epoch": 0.6733251800516374, + "grad_norm": 5.472528457641602, + "learning_rate": 9.158768175023782e-06, + "loss": 3.3594, + "step": 9910 + }, + { + "epoch": 0.6736649001222992, + "grad_norm": 6.112292766571045, + "learning_rate": 9.158343524935453e-06, + "loss": 3.448, + "step": 9915 + }, + { + "epoch": 0.674004620192961, + "grad_norm": 6.517717361450195, + "learning_rate": 9.157918874847127e-06, + "loss": 3.4099, + "step": 9920 + }, + { + "epoch": 0.6743443402636228, + "grad_norm": 6.788369655609131, + "learning_rate": 9.1574942247588e-06, + "loss": 3.2174, + "step": 9925 + }, + { + "epoch": 0.6746840603342845, + "grad_norm": 5.461520195007324, + "learning_rate": 9.157069574670471e-06, + "loss": 3.2777, + "step": 9930 + }, + { + "epoch": 0.6750237804049464, + "grad_norm": 6.996174335479736, + "learning_rate": 9.156644924582146e-06, + "loss": 3.2986, + "step": 9935 + }, + { + "epoch": 0.6753635004756081, + "grad_norm": 7.641750335693359, + "learning_rate": 9.156220274493817e-06, + "loss": 3.2979, + "step": 9940 + }, + { + "epoch": 0.6757032205462699, + "grad_norm": 6.440120697021484, + "learning_rate": 9.15579562440549e-06, + "loss": 3.3766, + "step": 9945 + }, + { + "epoch": 0.6760429406169316, + "grad_norm": 6.110731601715088, + "learning_rate": 9.155370974317164e-06, + "loss": 3.439, + "step": 9950 + }, + { + "epoch": 0.6763826606875935, + "grad_norm": 6.841864109039307, + "learning_rate": 9.154946324228835e-06, + "loss": 3.2914, + "step": 9955 + }, + { + "epoch": 0.6767223807582552, + "grad_norm": 7.001214981079102, + "learning_rate": 9.154521674140508e-06, + "loss": 3.3552, + "step": 9960 + }, + { + "epoch": 0.6770621008289169, + "grad_norm": 7.16904878616333, + "learning_rate": 9.154097024052183e-06, + "loss": 3.4212, + "step": 9965 + }, + { + "epoch": 0.6774018208995788, + "grad_norm": 6.948495388031006, + "learning_rate": 9.153672373963854e-06, + "loss": 3.318, + "step": 9970 + }, + { + "epoch": 0.6777415409702405, + "grad_norm": 7.268731117248535, + "learning_rate": 9.153247723875527e-06, + "loss": 3.2816, + "step": 9975 + }, + { + "epoch": 0.6780812610409023, + "grad_norm": 6.0193657875061035, + "learning_rate": 9.152823073787201e-06, + "loss": 3.4059, + "step": 9980 + }, + { + "epoch": 0.6784209811115641, + "grad_norm": 7.023170471191406, + "learning_rate": 9.152398423698872e-06, + "loss": 3.2861, + "step": 9985 + }, + { + "epoch": 0.6787607011822259, + "grad_norm": 5.609365463256836, + "learning_rate": 9.151973773610545e-06, + "loss": 3.3405, + "step": 9990 + }, + { + "epoch": 0.6791004212528876, + "grad_norm": 5.942836761474609, + "learning_rate": 9.15154912352222e-06, + "loss": 3.1897, + "step": 9995 + }, + { + "epoch": 0.6794401413235494, + "grad_norm": 6.865691184997559, + "learning_rate": 9.15112447343389e-06, + "loss": 3.2608, + "step": 10000 + }, + { + "epoch": 0.6797798613942112, + "grad_norm": 8.353499412536621, + "learning_rate": 9.150699823345563e-06, + "loss": 3.6113, + "step": 10005 + }, + { + "epoch": 0.680119581464873, + "grad_norm": 7.2677693367004395, + "learning_rate": 9.150275173257236e-06, + "loss": 3.4062, + "step": 10010 + }, + { + "epoch": 0.6804593015355347, + "grad_norm": 6.7417216300964355, + "learning_rate": 9.149850523168909e-06, + "loss": 3.3524, + "step": 10015 + }, + { + "epoch": 0.6807990216061965, + "grad_norm": 6.267829895019531, + "learning_rate": 9.149425873080582e-06, + "loss": 3.2312, + "step": 10020 + }, + { + "epoch": 0.6811387416768583, + "grad_norm": 6.738746166229248, + "learning_rate": 9.149001222992255e-06, + "loss": 3.4642, + "step": 10025 + }, + { + "epoch": 0.68147846174752, + "grad_norm": 6.4491286277771, + "learning_rate": 9.148576572903927e-06, + "loss": 3.2689, + "step": 10030 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 6.51322078704834, + "learning_rate": 9.1481519228156e-06, + "loss": 3.5685, + "step": 10035 + }, + { + "epoch": 0.6821579018888436, + "grad_norm": 6.465193748474121, + "learning_rate": 9.147727272727273e-06, + "loss": 3.296, + "step": 10040 + }, + { + "epoch": 0.6824976219595054, + "grad_norm": 6.707819938659668, + "learning_rate": 9.147302622638946e-06, + "loss": 3.3152, + "step": 10045 + }, + { + "epoch": 0.6828373420301671, + "grad_norm": 7.404690265655518, + "learning_rate": 9.146877972550619e-06, + "loss": 3.1361, + "step": 10050 + }, + { + "epoch": 0.683177062100829, + "grad_norm": 6.363169193267822, + "learning_rate": 9.146453322462291e-06, + "loss": 3.397, + "step": 10055 + }, + { + "epoch": 0.6835167821714907, + "grad_norm": 6.450015068054199, + "learning_rate": 9.146028672373964e-06, + "loss": 3.5719, + "step": 10060 + }, + { + "epoch": 0.6838565022421524, + "grad_norm": 7.216011047363281, + "learning_rate": 9.145604022285639e-06, + "loss": 3.4785, + "step": 10065 + }, + { + "epoch": 0.6841962223128143, + "grad_norm": 5.993366718292236, + "learning_rate": 9.14517937219731e-06, + "loss": 3.6518, + "step": 10070 + }, + { + "epoch": 0.684535942383476, + "grad_norm": 7.40217924118042, + "learning_rate": 9.144754722108983e-06, + "loss": 3.0748, + "step": 10075 + }, + { + "epoch": 0.6848756624541378, + "grad_norm": 6.785897254943848, + "learning_rate": 9.144330072020655e-06, + "loss": 3.2799, + "step": 10080 + }, + { + "epoch": 0.6852153825247995, + "grad_norm": 5.766666412353516, + "learning_rate": 9.143905421932328e-06, + "loss": 3.2428, + "step": 10085 + }, + { + "epoch": 0.6855551025954614, + "grad_norm": 6.347005844116211, + "learning_rate": 9.143480771844001e-06, + "loss": 3.2419, + "step": 10090 + }, + { + "epoch": 0.6858948226661231, + "grad_norm": 7.9424357414245605, + "learning_rate": 9.143056121755674e-06, + "loss": 3.3781, + "step": 10095 + }, + { + "epoch": 0.6862345427367849, + "grad_norm": 8.070687294006348, + "learning_rate": 9.142631471667347e-06, + "loss": 3.5282, + "step": 10100 + }, + { + "epoch": 0.6865742628074467, + "grad_norm": 6.514300346374512, + "learning_rate": 9.14220682157902e-06, + "loss": 3.1643, + "step": 10105 + }, + { + "epoch": 0.6869139828781085, + "grad_norm": 6.299792289733887, + "learning_rate": 9.141782171490692e-06, + "loss": 3.2264, + "step": 10110 + }, + { + "epoch": 0.6872537029487702, + "grad_norm": 6.103789806365967, + "learning_rate": 9.141357521402365e-06, + "loss": 3.4704, + "step": 10115 + }, + { + "epoch": 0.6875934230194319, + "grad_norm": 9.468796730041504, + "learning_rate": 9.140932871314038e-06, + "loss": 3.0802, + "step": 10120 + }, + { + "epoch": 0.6879331430900938, + "grad_norm": 6.502553462982178, + "learning_rate": 9.14050822122571e-06, + "loss": 3.4219, + "step": 10125 + }, + { + "epoch": 0.6882728631607555, + "grad_norm": 6.500614166259766, + "learning_rate": 9.140083571137383e-06, + "loss": 3.2047, + "step": 10130 + }, + { + "epoch": 0.6886125832314173, + "grad_norm": 5.216808319091797, + "learning_rate": 9.139658921049056e-06, + "loss": 3.2854, + "step": 10135 + }, + { + "epoch": 0.6889523033020791, + "grad_norm": 7.73716402053833, + "learning_rate": 9.139234270960729e-06, + "loss": 3.1654, + "step": 10140 + }, + { + "epoch": 0.6892920233727409, + "grad_norm": 6.248583793640137, + "learning_rate": 9.138809620872402e-06, + "loss": 3.4229, + "step": 10145 + }, + { + "epoch": 0.6896317434434026, + "grad_norm": 6.660424709320068, + "learning_rate": 9.138384970784075e-06, + "loss": 3.4213, + "step": 10150 + }, + { + "epoch": 0.6899714635140645, + "grad_norm": 7.057060718536377, + "learning_rate": 9.137960320695747e-06, + "loss": 3.3551, + "step": 10155 + }, + { + "epoch": 0.6903111835847262, + "grad_norm": 7.326634407043457, + "learning_rate": 9.13753567060742e-06, + "loss": 3.2913, + "step": 10160 + }, + { + "epoch": 0.690650903655388, + "grad_norm": 7.048072338104248, + "learning_rate": 9.137111020519093e-06, + "loss": 3.3518, + "step": 10165 + }, + { + "epoch": 0.6909906237260497, + "grad_norm": 6.458706378936768, + "learning_rate": 9.136686370430766e-06, + "loss": 3.3268, + "step": 10170 + }, + { + "epoch": 0.6913303437967115, + "grad_norm": 6.828353404998779, + "learning_rate": 9.136261720342439e-06, + "loss": 3.2029, + "step": 10175 + }, + { + "epoch": 0.6916700638673733, + "grad_norm": 8.272785186767578, + "learning_rate": 9.135837070254111e-06, + "loss": 3.4956, + "step": 10180 + }, + { + "epoch": 0.692009783938035, + "grad_norm": 6.0541510581970215, + "learning_rate": 9.135412420165784e-06, + "loss": 2.982, + "step": 10185 + }, + { + "epoch": 0.6923495040086969, + "grad_norm": 7.145397186279297, + "learning_rate": 9.134987770077457e-06, + "loss": 3.2942, + "step": 10190 + }, + { + "epoch": 0.6926892240793586, + "grad_norm": 6.137980937957764, + "learning_rate": 9.13456311998913e-06, + "loss": 3.4207, + "step": 10195 + }, + { + "epoch": 0.6930289441500204, + "grad_norm": 6.631294250488281, + "learning_rate": 9.134138469900803e-06, + "loss": 3.3004, + "step": 10200 + }, + { + "epoch": 0.6933686642206821, + "grad_norm": 8.32862663269043, + "learning_rate": 9.133713819812475e-06, + "loss": 3.1401, + "step": 10205 + }, + { + "epoch": 0.693708384291344, + "grad_norm": 6.124314785003662, + "learning_rate": 9.133289169724148e-06, + "loss": 3.3273, + "step": 10210 + }, + { + "epoch": 0.6940481043620057, + "grad_norm": 6.564631938934326, + "learning_rate": 9.132864519635821e-06, + "loss": 3.4778, + "step": 10215 + }, + { + "epoch": 0.6943878244326674, + "grad_norm": 8.816319465637207, + "learning_rate": 9.132439869547494e-06, + "loss": 3.4484, + "step": 10220 + }, + { + "epoch": 0.6947275445033293, + "grad_norm": 9.070428848266602, + "learning_rate": 9.132015219459167e-06, + "loss": 3.2818, + "step": 10225 + }, + { + "epoch": 0.695067264573991, + "grad_norm": 6.857361316680908, + "learning_rate": 9.13159056937084e-06, + "loss": 3.3826, + "step": 10230 + }, + { + "epoch": 0.6954069846446528, + "grad_norm": 7.296266555786133, + "learning_rate": 9.131165919282512e-06, + "loss": 3.4138, + "step": 10235 + }, + { + "epoch": 0.6957467047153146, + "grad_norm": 5.895049571990967, + "learning_rate": 9.130741269194185e-06, + "loss": 3.3705, + "step": 10240 + }, + { + "epoch": 0.6960864247859764, + "grad_norm": 6.136512756347656, + "learning_rate": 9.130316619105858e-06, + "loss": 3.4482, + "step": 10245 + }, + { + "epoch": 0.6964261448566381, + "grad_norm": 6.518797397613525, + "learning_rate": 9.12989196901753e-06, + "loss": 3.3483, + "step": 10250 + }, + { + "epoch": 0.6967658649272999, + "grad_norm": 8.367196083068848, + "learning_rate": 9.129467318929203e-06, + "loss": 3.3283, + "step": 10255 + }, + { + "epoch": 0.6971055849979617, + "grad_norm": 7.974131107330322, + "learning_rate": 9.129042668840876e-06, + "loss": 3.3643, + "step": 10260 + }, + { + "epoch": 0.6974453050686235, + "grad_norm": 7.713794231414795, + "learning_rate": 9.128618018752549e-06, + "loss": 3.3464, + "step": 10265 + }, + { + "epoch": 0.6977850251392852, + "grad_norm": 6.898194789886475, + "learning_rate": 9.128193368664222e-06, + "loss": 3.5493, + "step": 10270 + }, + { + "epoch": 0.698124745209947, + "grad_norm": 6.448472023010254, + "learning_rate": 9.127768718575895e-06, + "loss": 3.19, + "step": 10275 + }, + { + "epoch": 0.6984644652806088, + "grad_norm": 7.972611904144287, + "learning_rate": 9.127344068487567e-06, + "loss": 3.4118, + "step": 10280 + }, + { + "epoch": 0.6988041853512705, + "grad_norm": 5.7603230476379395, + "learning_rate": 9.126919418399239e-06, + "loss": 3.2598, + "step": 10285 + }, + { + "epoch": 0.6991439054219323, + "grad_norm": 7.086019039154053, + "learning_rate": 9.126494768310913e-06, + "loss": 3.1904, + "step": 10290 + }, + { + "epoch": 0.6994836254925941, + "grad_norm": 6.571238994598389, + "learning_rate": 9.126070118222586e-06, + "loss": 3.0548, + "step": 10295 + }, + { + "epoch": 0.6998233455632559, + "grad_norm": 6.105621337890625, + "learning_rate": 9.125645468134257e-06, + "loss": 3.3358, + "step": 10300 + }, + { + "epoch": 0.7001630656339176, + "grad_norm": 8.357881546020508, + "learning_rate": 9.125220818045931e-06, + "loss": 3.3396, + "step": 10305 + }, + { + "epoch": 0.7005027857045795, + "grad_norm": 7.789773464202881, + "learning_rate": 9.124796167957604e-06, + "loss": 3.3131, + "step": 10310 + }, + { + "epoch": 0.7008425057752412, + "grad_norm": 7.678349018096924, + "learning_rate": 9.124371517869275e-06, + "loss": 3.394, + "step": 10315 + }, + { + "epoch": 0.701182225845903, + "grad_norm": 7.689753532409668, + "learning_rate": 9.12394686778095e-06, + "loss": 3.2829, + "step": 10320 + }, + { + "epoch": 0.7015219459165648, + "grad_norm": 6.215521812438965, + "learning_rate": 9.123522217692623e-06, + "loss": 3.3153, + "step": 10325 + }, + { + "epoch": 0.7018616659872265, + "grad_norm": 8.194893836975098, + "learning_rate": 9.123097567604294e-06, + "loss": 3.4707, + "step": 10330 + }, + { + "epoch": 0.7022013860578883, + "grad_norm": 5.758842945098877, + "learning_rate": 9.122672917515968e-06, + "loss": 3.2456, + "step": 10335 + }, + { + "epoch": 0.70254110612855, + "grad_norm": 6.191925525665283, + "learning_rate": 9.122248267427641e-06, + "loss": 3.113, + "step": 10340 + }, + { + "epoch": 0.7028808261992119, + "grad_norm": 7.677281856536865, + "learning_rate": 9.121823617339312e-06, + "loss": 3.499, + "step": 10345 + }, + { + "epoch": 0.7032205462698736, + "grad_norm": 6.1066365242004395, + "learning_rate": 9.121398967250987e-06, + "loss": 3.2298, + "step": 10350 + }, + { + "epoch": 0.7035602663405354, + "grad_norm": 7.123300075531006, + "learning_rate": 9.120974317162658e-06, + "loss": 3.422, + "step": 10355 + }, + { + "epoch": 0.7038999864111972, + "grad_norm": 7.030430316925049, + "learning_rate": 9.12054966707433e-06, + "loss": 3.4934, + "step": 10360 + }, + { + "epoch": 0.704239706481859, + "grad_norm": 9.024046897888184, + "learning_rate": 9.120125016986005e-06, + "loss": 3.1162, + "step": 10365 + }, + { + "epoch": 0.7045794265525207, + "grad_norm": 9.209921836853027, + "learning_rate": 9.119700366897676e-06, + "loss": 3.4527, + "step": 10370 + }, + { + "epoch": 0.7049191466231824, + "grad_norm": 6.5018696784973145, + "learning_rate": 9.119275716809349e-06, + "loss": 3.269, + "step": 10375 + }, + { + "epoch": 0.7052588666938443, + "grad_norm": 5.964661598205566, + "learning_rate": 9.118851066721023e-06, + "loss": 3.46, + "step": 10380 + }, + { + "epoch": 0.705598586764506, + "grad_norm": 7.048312664031982, + "learning_rate": 9.118426416632695e-06, + "loss": 3.4049, + "step": 10385 + }, + { + "epoch": 0.7059383068351678, + "grad_norm": 6.988325595855713, + "learning_rate": 9.118001766544367e-06, + "loss": 3.3965, + "step": 10390 + }, + { + "epoch": 0.7062780269058296, + "grad_norm": 6.578484535217285, + "learning_rate": 9.117577116456042e-06, + "loss": 3.437, + "step": 10395 + }, + { + "epoch": 0.7066177469764914, + "grad_norm": 6.138343334197998, + "learning_rate": 9.117152466367713e-06, + "loss": 3.3577, + "step": 10400 + }, + { + "epoch": 0.7069574670471531, + "grad_norm": 6.102237701416016, + "learning_rate": 9.116727816279387e-06, + "loss": 3.1759, + "step": 10405 + }, + { + "epoch": 0.707297187117815, + "grad_norm": 6.3202314376831055, + "learning_rate": 9.11630316619106e-06, + "loss": 3.463, + "step": 10410 + }, + { + "epoch": 0.7076369071884767, + "grad_norm": 6.105660438537598, + "learning_rate": 9.115878516102731e-06, + "loss": 3.4287, + "step": 10415 + }, + { + "epoch": 0.7079766272591385, + "grad_norm": 7.0063042640686035, + "learning_rate": 9.115453866014406e-06, + "loss": 3.4942, + "step": 10420 + }, + { + "epoch": 0.7083163473298002, + "grad_norm": 5.922314643859863, + "learning_rate": 9.115029215926077e-06, + "loss": 3.4929, + "step": 10425 + }, + { + "epoch": 0.708656067400462, + "grad_norm": 6.226991176605225, + "learning_rate": 9.11460456583775e-06, + "loss": 3.501, + "step": 10430 + }, + { + "epoch": 0.7089957874711238, + "grad_norm": 5.849566459655762, + "learning_rate": 9.114179915749424e-06, + "loss": 3.476, + "step": 10435 + }, + { + "epoch": 0.7093355075417855, + "grad_norm": 6.167099952697754, + "learning_rate": 9.113755265661095e-06, + "loss": 3.4606, + "step": 10440 + }, + { + "epoch": 0.7096752276124474, + "grad_norm": 5.697735786437988, + "learning_rate": 9.113330615572768e-06, + "loss": 3.4711, + "step": 10445 + }, + { + "epoch": 0.7100149476831091, + "grad_norm": 6.431954860687256, + "learning_rate": 9.112905965484443e-06, + "loss": 3.3144, + "step": 10450 + }, + { + "epoch": 0.7103546677537709, + "grad_norm": 6.629333019256592, + "learning_rate": 9.112481315396114e-06, + "loss": 3.5879, + "step": 10455 + }, + { + "epoch": 0.7106943878244326, + "grad_norm": 6.7980546951293945, + "learning_rate": 9.112056665307787e-06, + "loss": 3.341, + "step": 10460 + }, + { + "epoch": 0.7110341078950945, + "grad_norm": 6.984792709350586, + "learning_rate": 9.111632015219461e-06, + "loss": 3.3256, + "step": 10465 + }, + { + "epoch": 0.7113738279657562, + "grad_norm": 5.602013111114502, + "learning_rate": 9.111207365131132e-06, + "loss": 3.5263, + "step": 10470 + }, + { + "epoch": 0.711713548036418, + "grad_norm": 7.637757778167725, + "learning_rate": 9.110782715042805e-06, + "loss": 3.2689, + "step": 10475 + }, + { + "epoch": 0.7120532681070798, + "grad_norm": 6.264740943908691, + "learning_rate": 9.11035806495448e-06, + "loss": 3.4133, + "step": 10480 + }, + { + "epoch": 0.7123929881777415, + "grad_norm": 7.3508100509643555, + "learning_rate": 9.10993341486615e-06, + "loss": 3.0491, + "step": 10485 + }, + { + "epoch": 0.7127327082484033, + "grad_norm": 5.785887718200684, + "learning_rate": 9.109508764777823e-06, + "loss": 3.3144, + "step": 10490 + }, + { + "epoch": 0.7130724283190651, + "grad_norm": 7.767294406890869, + "learning_rate": 9.109084114689498e-06, + "loss": 3.3303, + "step": 10495 + }, + { + "epoch": 0.7134121483897269, + "grad_norm": 7.2804036140441895, + "learning_rate": 9.108659464601169e-06, + "loss": 3.3374, + "step": 10500 + }, + { + "epoch": 0.7137518684603886, + "grad_norm": 8.150713920593262, + "learning_rate": 9.108234814512842e-06, + "loss": 3.4737, + "step": 10505 + }, + { + "epoch": 0.7140915885310504, + "grad_norm": 5.9641337394714355, + "learning_rate": 9.107810164424515e-06, + "loss": 3.597, + "step": 10510 + }, + { + "epoch": 0.7144313086017122, + "grad_norm": 6.13884973526001, + "learning_rate": 9.107385514336187e-06, + "loss": 3.2125, + "step": 10515 + }, + { + "epoch": 0.714771028672374, + "grad_norm": 8.325356483459473, + "learning_rate": 9.10696086424786e-06, + "loss": 3.2717, + "step": 10520 + }, + { + "epoch": 0.7151107487430357, + "grad_norm": 7.4428324699401855, + "learning_rate": 9.106536214159533e-06, + "loss": 3.4653, + "step": 10525 + }, + { + "epoch": 0.7154504688136976, + "grad_norm": 5.791220664978027, + "learning_rate": 9.106111564071206e-06, + "loss": 3.1813, + "step": 10530 + }, + { + "epoch": 0.7157901888843593, + "grad_norm": 5.610538959503174, + "learning_rate": 9.105686913982879e-06, + "loss": 3.4962, + "step": 10535 + }, + { + "epoch": 0.716129908955021, + "grad_norm": 7.115026950836182, + "learning_rate": 9.105262263894551e-06, + "loss": 3.4811, + "step": 10540 + }, + { + "epoch": 0.7164696290256828, + "grad_norm": 7.928124904632568, + "learning_rate": 9.104837613806224e-06, + "loss": 3.2171, + "step": 10545 + }, + { + "epoch": 0.7168093490963446, + "grad_norm": 7.934271812438965, + "learning_rate": 9.104412963717897e-06, + "loss": 3.4772, + "step": 10550 + }, + { + "epoch": 0.7171490691670064, + "grad_norm": 5.723385334014893, + "learning_rate": 9.10398831362957e-06, + "loss": 3.3897, + "step": 10555 + }, + { + "epoch": 0.7174887892376681, + "grad_norm": 5.650809288024902, + "learning_rate": 9.103563663541243e-06, + "loss": 3.4196, + "step": 10560 + }, + { + "epoch": 0.71782850930833, + "grad_norm": 5.096344470977783, + "learning_rate": 9.103139013452915e-06, + "loss": 3.4545, + "step": 10565 + }, + { + "epoch": 0.7181682293789917, + "grad_norm": 6.317584037780762, + "learning_rate": 9.102714363364588e-06, + "loss": 3.4733, + "step": 10570 + }, + { + "epoch": 0.7185079494496535, + "grad_norm": 8.19660472869873, + "learning_rate": 9.102289713276261e-06, + "loss": 3.254, + "step": 10575 + }, + { + "epoch": 0.7188476695203153, + "grad_norm": 8.06600284576416, + "learning_rate": 9.101865063187934e-06, + "loss": 3.4875, + "step": 10580 + }, + { + "epoch": 0.719187389590977, + "grad_norm": 6.5635528564453125, + "learning_rate": 9.101440413099607e-06, + "loss": 3.2094, + "step": 10585 + }, + { + "epoch": 0.7195271096616388, + "grad_norm": 7.5610175132751465, + "learning_rate": 9.10101576301128e-06, + "loss": 3.4999, + "step": 10590 + }, + { + "epoch": 0.7198668297323005, + "grad_norm": 6.521462917327881, + "learning_rate": 9.100591112922952e-06, + "loss": 3.1708, + "step": 10595 + }, + { + "epoch": 0.7202065498029624, + "grad_norm": 6.139681339263916, + "learning_rate": 9.100166462834625e-06, + "loss": 3.1918, + "step": 10600 + }, + { + "epoch": 0.7205462698736241, + "grad_norm": 6.748563289642334, + "learning_rate": 9.099741812746298e-06, + "loss": 3.4696, + "step": 10605 + }, + { + "epoch": 0.7208859899442859, + "grad_norm": 5.685897350311279, + "learning_rate": 9.09931716265797e-06, + "loss": 3.373, + "step": 10610 + }, + { + "epoch": 0.7212257100149477, + "grad_norm": 5.658117294311523, + "learning_rate": 9.098892512569643e-06, + "loss": 3.3023, + "step": 10615 + }, + { + "epoch": 0.7215654300856095, + "grad_norm": 6.914041042327881, + "learning_rate": 9.098467862481316e-06, + "loss": 3.2828, + "step": 10620 + }, + { + "epoch": 0.7219051501562712, + "grad_norm": 6.790530204772949, + "learning_rate": 9.098043212392989e-06, + "loss": 3.5803, + "step": 10625 + }, + { + "epoch": 0.722244870226933, + "grad_norm": 5.843021869659424, + "learning_rate": 9.097618562304662e-06, + "loss": 3.3042, + "step": 10630 + }, + { + "epoch": 0.7225845902975948, + "grad_norm": 6.1854567527771, + "learning_rate": 9.097193912216335e-06, + "loss": 3.236, + "step": 10635 + }, + { + "epoch": 0.7229243103682566, + "grad_norm": 6.218750476837158, + "learning_rate": 9.096769262128007e-06, + "loss": 3.248, + "step": 10640 + }, + { + "epoch": 0.7232640304389183, + "grad_norm": 7.816906929016113, + "learning_rate": 9.09634461203968e-06, + "loss": 3.4073, + "step": 10645 + }, + { + "epoch": 0.7236037505095801, + "grad_norm": 9.037160873413086, + "learning_rate": 9.095919961951353e-06, + "loss": 3.324, + "step": 10650 + }, + { + "epoch": 0.7239434705802419, + "grad_norm": 6.84898567199707, + "learning_rate": 9.095495311863026e-06, + "loss": 3.3179, + "step": 10655 + }, + { + "epoch": 0.7242831906509036, + "grad_norm": 7.005622386932373, + "learning_rate": 9.095070661774699e-06, + "loss": 3.1033, + "step": 10660 + }, + { + "epoch": 0.7246229107215655, + "grad_norm": 7.4242119789123535, + "learning_rate": 9.094646011686371e-06, + "loss": 3.3591, + "step": 10665 + }, + { + "epoch": 0.7249626307922272, + "grad_norm": 7.131082534790039, + "learning_rate": 9.094221361598044e-06, + "loss": 3.305, + "step": 10670 + }, + { + "epoch": 0.725302350862889, + "grad_norm": 7.142106056213379, + "learning_rate": 9.093796711509717e-06, + "loss": 3.5533, + "step": 10675 + }, + { + "epoch": 0.7256420709335507, + "grad_norm": 6.501027584075928, + "learning_rate": 9.09337206142139e-06, + "loss": 3.496, + "step": 10680 + }, + { + "epoch": 0.7259817910042126, + "grad_norm": 8.047861099243164, + "learning_rate": 9.092947411333063e-06, + "loss": 3.3971, + "step": 10685 + }, + { + "epoch": 0.7263215110748743, + "grad_norm": 6.309912204742432, + "learning_rate": 9.092522761244735e-06, + "loss": 3.383, + "step": 10690 + }, + { + "epoch": 0.726661231145536, + "grad_norm": 6.087305545806885, + "learning_rate": 9.092098111156408e-06, + "loss": 3.4716, + "step": 10695 + }, + { + "epoch": 0.7270009512161979, + "grad_norm": 7.303882122039795, + "learning_rate": 9.09167346106808e-06, + "loss": 3.6055, + "step": 10700 + }, + { + "epoch": 0.7273406712868596, + "grad_norm": 6.290319919586182, + "learning_rate": 9.091248810979754e-06, + "loss": 3.5144, + "step": 10705 + }, + { + "epoch": 0.7276803913575214, + "grad_norm": 7.902047634124756, + "learning_rate": 9.090824160891427e-06, + "loss": 3.5429, + "step": 10710 + }, + { + "epoch": 0.7280201114281831, + "grad_norm": 6.0104451179504395, + "learning_rate": 9.090399510803098e-06, + "loss": 3.5718, + "step": 10715 + }, + { + "epoch": 0.728359831498845, + "grad_norm": 8.638808250427246, + "learning_rate": 9.089974860714772e-06, + "loss": 3.1847, + "step": 10720 + }, + { + "epoch": 0.7286995515695067, + "grad_norm": 5.401157855987549, + "learning_rate": 9.089550210626445e-06, + "loss": 3.3464, + "step": 10725 + }, + { + "epoch": 0.7290392716401685, + "grad_norm": 5.971068382263184, + "learning_rate": 9.089125560538116e-06, + "loss": 3.1691, + "step": 10730 + }, + { + "epoch": 0.7293789917108303, + "grad_norm": 7.160252094268799, + "learning_rate": 9.08870091044979e-06, + "loss": 3.5031, + "step": 10735 + }, + { + "epoch": 0.7297187117814921, + "grad_norm": 5.884240627288818, + "learning_rate": 9.088276260361463e-06, + "loss": 3.4018, + "step": 10740 + }, + { + "epoch": 0.7300584318521538, + "grad_norm": 5.5285563468933105, + "learning_rate": 9.087851610273136e-06, + "loss": 3.2083, + "step": 10745 + }, + { + "epoch": 0.7303981519228157, + "grad_norm": 8.916622161865234, + "learning_rate": 9.087426960184809e-06, + "loss": 3.4323, + "step": 10750 + }, + { + "epoch": 0.7307378719934774, + "grad_norm": 6.908820629119873, + "learning_rate": 9.087002310096482e-06, + "loss": 3.3546, + "step": 10755 + }, + { + "epoch": 0.7310775920641391, + "grad_norm": 6.308446884155273, + "learning_rate": 9.086577660008155e-06, + "loss": 3.5297, + "step": 10760 + }, + { + "epoch": 0.7314173121348009, + "grad_norm": 6.804640293121338, + "learning_rate": 9.086153009919827e-06, + "loss": 3.3836, + "step": 10765 + }, + { + "epoch": 0.7317570322054627, + "grad_norm": 6.7476487159729, + "learning_rate": 9.085728359831499e-06, + "loss": 3.2516, + "step": 10770 + }, + { + "epoch": 0.7320967522761245, + "grad_norm": 6.20426082611084, + "learning_rate": 9.085303709743173e-06, + "loss": 3.5853, + "step": 10775 + }, + { + "epoch": 0.7324364723467862, + "grad_norm": 6.133166313171387, + "learning_rate": 9.084879059654846e-06, + "loss": 3.3625, + "step": 10780 + }, + { + "epoch": 0.7327761924174481, + "grad_norm": 6.875912666320801, + "learning_rate": 9.084454409566517e-06, + "loss": 3.4409, + "step": 10785 + }, + { + "epoch": 0.7331159124881098, + "grad_norm": 6.5460309982299805, + "learning_rate": 9.084029759478191e-06, + "loss": 3.2552, + "step": 10790 + }, + { + "epoch": 0.7334556325587716, + "grad_norm": 6.8802385330200195, + "learning_rate": 9.083605109389864e-06, + "loss": 3.3395, + "step": 10795 + }, + { + "epoch": 0.7337953526294333, + "grad_norm": 7.280309200286865, + "learning_rate": 9.083180459301535e-06, + "loss": 3.2878, + "step": 10800 + }, + { + "epoch": 0.7341350727000951, + "grad_norm": 8.607853889465332, + "learning_rate": 9.08275580921321e-06, + "loss": 3.4498, + "step": 10805 + }, + { + "epoch": 0.7344747927707569, + "grad_norm": 6.382808208465576, + "learning_rate": 9.082331159124883e-06, + "loss": 3.4806, + "step": 10810 + }, + { + "epoch": 0.7348145128414186, + "grad_norm": 7.033539295196533, + "learning_rate": 9.081906509036554e-06, + "loss": 3.3129, + "step": 10815 + }, + { + "epoch": 0.7351542329120805, + "grad_norm": 5.814842700958252, + "learning_rate": 9.081481858948228e-06, + "loss": 3.4178, + "step": 10820 + }, + { + "epoch": 0.7354939529827422, + "grad_norm": 5.278096675872803, + "learning_rate": 9.081057208859901e-06, + "loss": 3.358, + "step": 10825 + }, + { + "epoch": 0.735833673053404, + "grad_norm": 6.17524528503418, + "learning_rate": 9.080632558771572e-06, + "loss": 3.2767, + "step": 10830 + }, + { + "epoch": 0.7361733931240658, + "grad_norm": 6.849056720733643, + "learning_rate": 9.080207908683247e-06, + "loss": 3.4274, + "step": 10835 + }, + { + "epoch": 0.7365131131947276, + "grad_norm": 6.402642726898193, + "learning_rate": 9.07978325859492e-06, + "loss": 3.2754, + "step": 10840 + }, + { + "epoch": 0.7368528332653893, + "grad_norm": 6.3077168464660645, + "learning_rate": 9.07935860850659e-06, + "loss": 3.2502, + "step": 10845 + }, + { + "epoch": 0.737192553336051, + "grad_norm": 6.923896789550781, + "learning_rate": 9.078933958418265e-06, + "loss": 3.4057, + "step": 10850 + }, + { + "epoch": 0.7375322734067129, + "grad_norm": 5.681328296661377, + "learning_rate": 9.078509308329936e-06, + "loss": 3.3288, + "step": 10855 + }, + { + "epoch": 0.7378719934773746, + "grad_norm": 6.3904314041137695, + "learning_rate": 9.078084658241609e-06, + "loss": 3.4944, + "step": 10860 + }, + { + "epoch": 0.7382117135480364, + "grad_norm": 5.313968658447266, + "learning_rate": 9.077660008153283e-06, + "loss": 3.4584, + "step": 10865 + }, + { + "epoch": 0.7385514336186982, + "grad_norm": 6.138406753540039, + "learning_rate": 9.077235358064955e-06, + "loss": 3.2662, + "step": 10870 + }, + { + "epoch": 0.73889115368936, + "grad_norm": 7.310222625732422, + "learning_rate": 9.076810707976627e-06, + "loss": 3.4065, + "step": 10875 + }, + { + "epoch": 0.7392308737600217, + "grad_norm": 6.962902545928955, + "learning_rate": 9.076386057888302e-06, + "loss": 3.352, + "step": 10880 + }, + { + "epoch": 0.7395705938306835, + "grad_norm": 6.464142799377441, + "learning_rate": 9.075961407799973e-06, + "loss": 3.3182, + "step": 10885 + }, + { + "epoch": 0.7399103139013453, + "grad_norm": 7.227391242980957, + "learning_rate": 9.075536757711646e-06, + "loss": 3.3188, + "step": 10890 + }, + { + "epoch": 0.7402500339720071, + "grad_norm": 7.061652660369873, + "learning_rate": 9.07511210762332e-06, + "loss": 3.3624, + "step": 10895 + }, + { + "epoch": 0.7405897540426688, + "grad_norm": 6.788956165313721, + "learning_rate": 9.074687457534991e-06, + "loss": 3.3271, + "step": 10900 + }, + { + "epoch": 0.7409294741133307, + "grad_norm": 6.292221546173096, + "learning_rate": 9.074262807446664e-06, + "loss": 3.4522, + "step": 10905 + }, + { + "epoch": 0.7412691941839924, + "grad_norm": 6.7370171546936035, + "learning_rate": 9.073838157358339e-06, + "loss": 3.3746, + "step": 10910 + }, + { + "epoch": 0.7416089142546541, + "grad_norm": 5.511499881744385, + "learning_rate": 9.07341350727001e-06, + "loss": 3.5618, + "step": 10915 + }, + { + "epoch": 0.741948634325316, + "grad_norm": 8.416470527648926, + "learning_rate": 9.072988857181683e-06, + "loss": 3.3462, + "step": 10920 + }, + { + "epoch": 0.7422883543959777, + "grad_norm": 7.0192036628723145, + "learning_rate": 9.072564207093355e-06, + "loss": 3.2669, + "step": 10925 + }, + { + "epoch": 0.7426280744666395, + "grad_norm": 7.924942493438721, + "learning_rate": 9.072139557005028e-06, + "loss": 3.5451, + "step": 10930 + }, + { + "epoch": 0.7429677945373012, + "grad_norm": 8.660205841064453, + "learning_rate": 9.071714906916701e-06, + "loss": 3.2299, + "step": 10935 + }, + { + "epoch": 0.7433075146079631, + "grad_norm": 8.170196533203125, + "learning_rate": 9.071290256828374e-06, + "loss": 3.2641, + "step": 10940 + }, + { + "epoch": 0.7436472346786248, + "grad_norm": 5.561654567718506, + "learning_rate": 9.070865606740047e-06, + "loss": 3.2767, + "step": 10945 + }, + { + "epoch": 0.7439869547492866, + "grad_norm": 6.219527244567871, + "learning_rate": 9.07044095665172e-06, + "loss": 3.7265, + "step": 10950 + }, + { + "epoch": 0.7443266748199484, + "grad_norm": 6.296635627746582, + "learning_rate": 9.070016306563392e-06, + "loss": 3.2196, + "step": 10955 + }, + { + "epoch": 0.7446663948906102, + "grad_norm": 5.0517258644104, + "learning_rate": 9.069591656475065e-06, + "loss": 3.2012, + "step": 10960 + }, + { + "epoch": 0.7450061149612719, + "grad_norm": 6.310434341430664, + "learning_rate": 9.069167006386738e-06, + "loss": 3.493, + "step": 10965 + }, + { + "epoch": 0.7453458350319336, + "grad_norm": 8.036003112792969, + "learning_rate": 9.06874235629841e-06, + "loss": 3.3459, + "step": 10970 + }, + { + "epoch": 0.7456855551025955, + "grad_norm": 6.895312786102295, + "learning_rate": 9.068317706210083e-06, + "loss": 3.3818, + "step": 10975 + }, + { + "epoch": 0.7460252751732572, + "grad_norm": 7.9100422859191895, + "learning_rate": 9.067893056121756e-06, + "loss": 3.2132, + "step": 10980 + }, + { + "epoch": 0.746364995243919, + "grad_norm": 6.720648765563965, + "learning_rate": 9.067468406033429e-06, + "loss": 3.3535, + "step": 10985 + }, + { + "epoch": 0.7467047153145808, + "grad_norm": 6.2077860832214355, + "learning_rate": 9.067043755945102e-06, + "loss": 3.2997, + "step": 10990 + }, + { + "epoch": 0.7470444353852426, + "grad_norm": 5.858314037322998, + "learning_rate": 9.066619105856775e-06, + "loss": 3.1462, + "step": 10995 + }, + { + "epoch": 0.7473841554559043, + "grad_norm": 5.477288246154785, + "learning_rate": 9.066194455768447e-06, + "loss": 3.125, + "step": 11000 + }, + { + "epoch": 0.7477238755265662, + "grad_norm": 7.213051795959473, + "learning_rate": 9.06576980568012e-06, + "loss": 3.3339, + "step": 11005 + }, + { + "epoch": 0.7480635955972279, + "grad_norm": 7.101606369018555, + "learning_rate": 9.065345155591793e-06, + "loss": 3.2629, + "step": 11010 + }, + { + "epoch": 0.7484033156678896, + "grad_norm": 6.831035137176514, + "learning_rate": 9.064920505503466e-06, + "loss": 3.0767, + "step": 11015 + }, + { + "epoch": 0.7487430357385514, + "grad_norm": 7.355017185211182, + "learning_rate": 9.064495855415139e-06, + "loss": 3.3636, + "step": 11020 + }, + { + "epoch": 0.7490827558092132, + "grad_norm": 6.2029242515563965, + "learning_rate": 9.064071205326811e-06, + "loss": 3.3307, + "step": 11025 + }, + { + "epoch": 0.749422475879875, + "grad_norm": 7.996400833129883, + "learning_rate": 9.063646555238484e-06, + "loss": 3.4064, + "step": 11030 + }, + { + "epoch": 0.7497621959505367, + "grad_norm": 6.444186210632324, + "learning_rate": 9.063221905150157e-06, + "loss": 3.4266, + "step": 11035 + }, + { + "epoch": 0.7501019160211986, + "grad_norm": 6.211459159851074, + "learning_rate": 9.06279725506183e-06, + "loss": 3.4582, + "step": 11040 + }, + { + "epoch": 0.7504416360918603, + "grad_norm": 5.623589992523193, + "learning_rate": 9.062372604973503e-06, + "loss": 3.3474, + "step": 11045 + }, + { + "epoch": 0.7507813561625221, + "grad_norm": 6.598448276519775, + "learning_rate": 9.061947954885175e-06, + "loss": 3.3688, + "step": 11050 + }, + { + "epoch": 0.7511210762331838, + "grad_norm": 6.7818474769592285, + "learning_rate": 9.061523304796848e-06, + "loss": 3.0219, + "step": 11055 + }, + { + "epoch": 0.7514607963038457, + "grad_norm": 6.31464958190918, + "learning_rate": 9.061098654708521e-06, + "loss": 3.4427, + "step": 11060 + }, + { + "epoch": 0.7518005163745074, + "grad_norm": 7.2516984939575195, + "learning_rate": 9.060674004620194e-06, + "loss": 3.4319, + "step": 11065 + }, + { + "epoch": 0.7521402364451691, + "grad_norm": 7.6485748291015625, + "learning_rate": 9.060249354531867e-06, + "loss": 3.2939, + "step": 11070 + }, + { + "epoch": 0.752479956515831, + "grad_norm": 6.254462718963623, + "learning_rate": 9.05982470444354e-06, + "loss": 3.4747, + "step": 11075 + }, + { + "epoch": 0.7528196765864927, + "grad_norm": 5.738269805908203, + "learning_rate": 9.059400054355212e-06, + "loss": 3.2146, + "step": 11080 + }, + { + "epoch": 0.7531593966571545, + "grad_norm": 7.990801811218262, + "learning_rate": 9.058975404266885e-06, + "loss": 3.1465, + "step": 11085 + }, + { + "epoch": 0.7534991167278163, + "grad_norm": 9.052546501159668, + "learning_rate": 9.058550754178558e-06, + "loss": 3.4435, + "step": 11090 + }, + { + "epoch": 0.7538388367984781, + "grad_norm": 5.406906604766846, + "learning_rate": 9.05812610409023e-06, + "loss": 3.4983, + "step": 11095 + }, + { + "epoch": 0.7541785568691398, + "grad_norm": 6.15324592590332, + "learning_rate": 9.057701454001903e-06, + "loss": 3.1687, + "step": 11100 + }, + { + "epoch": 0.7545182769398016, + "grad_norm": 5.771951198577881, + "learning_rate": 9.057276803913576e-06, + "loss": 3.3552, + "step": 11105 + }, + { + "epoch": 0.7548579970104634, + "grad_norm": 6.443511009216309, + "learning_rate": 9.056852153825249e-06, + "loss": 3.3062, + "step": 11110 + }, + { + "epoch": 0.7551977170811252, + "grad_norm": 7.192091464996338, + "learning_rate": 9.056427503736922e-06, + "loss": 3.3062, + "step": 11115 + }, + { + "epoch": 0.7555374371517869, + "grad_norm": 6.903396129608154, + "learning_rate": 9.056002853648595e-06, + "loss": 3.416, + "step": 11120 + }, + { + "epoch": 0.7558771572224487, + "grad_norm": 6.692253112792969, + "learning_rate": 9.055578203560267e-06, + "loss": 3.4799, + "step": 11125 + }, + { + "epoch": 0.7562168772931105, + "grad_norm": 6.832904815673828, + "learning_rate": 9.05515355347194e-06, + "loss": 3.4047, + "step": 11130 + }, + { + "epoch": 0.7565565973637722, + "grad_norm": 7.0822014808654785, + "learning_rate": 9.054728903383613e-06, + "loss": 3.4776, + "step": 11135 + }, + { + "epoch": 0.756896317434434, + "grad_norm": 6.474172115325928, + "learning_rate": 9.054304253295286e-06, + "loss": 3.4473, + "step": 11140 + }, + { + "epoch": 0.7572360375050958, + "grad_norm": 7.287619113922119, + "learning_rate": 9.053879603206959e-06, + "loss": 3.1248, + "step": 11145 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 6.995494365692139, + "learning_rate": 9.053454953118631e-06, + "loss": 3.6491, + "step": 11150 + }, + { + "epoch": 0.7579154776464193, + "grad_norm": 7.6708221435546875, + "learning_rate": 9.053030303030304e-06, + "loss": 3.3432, + "step": 11155 + }, + { + "epoch": 0.7582551977170812, + "grad_norm": 6.389604568481445, + "learning_rate": 9.052605652941977e-06, + "loss": 3.3218, + "step": 11160 + }, + { + "epoch": 0.7585949177877429, + "grad_norm": 7.958615779876709, + "learning_rate": 9.05218100285365e-06, + "loss": 3.1263, + "step": 11165 + }, + { + "epoch": 0.7589346378584046, + "grad_norm": 5.807221412658691, + "learning_rate": 9.051756352765323e-06, + "loss": 3.2225, + "step": 11170 + }, + { + "epoch": 0.7592743579290665, + "grad_norm": 6.81330680847168, + "learning_rate": 9.051331702676995e-06, + "loss": 3.2937, + "step": 11175 + }, + { + "epoch": 0.7596140779997282, + "grad_norm": 7.816102981567383, + "learning_rate": 9.050907052588668e-06, + "loss": 3.1817, + "step": 11180 + }, + { + "epoch": 0.75995379807039, + "grad_norm": 6.370071887969971, + "learning_rate": 9.050482402500341e-06, + "loss": 3.2777, + "step": 11185 + }, + { + "epoch": 0.7602935181410517, + "grad_norm": 5.8073883056640625, + "learning_rate": 9.050057752412014e-06, + "loss": 3.3402, + "step": 11190 + }, + { + "epoch": 0.7606332382117136, + "grad_norm": 5.764433860778809, + "learning_rate": 9.049633102323687e-06, + "loss": 3.3821, + "step": 11195 + }, + { + "epoch": 0.7609729582823753, + "grad_norm": 7.790317058563232, + "learning_rate": 9.049208452235358e-06, + "loss": 3.2631, + "step": 11200 + }, + { + "epoch": 0.7613126783530371, + "grad_norm": 7.1733717918396, + "learning_rate": 9.048783802147032e-06, + "loss": 3.4476, + "step": 11205 + }, + { + "epoch": 0.7616523984236989, + "grad_norm": 7.43806266784668, + "learning_rate": 9.048359152058705e-06, + "loss": 3.3904, + "step": 11210 + }, + { + "epoch": 0.7619921184943607, + "grad_norm": 7.632862567901611, + "learning_rate": 9.047934501970376e-06, + "loss": 3.264, + "step": 11215 + }, + { + "epoch": 0.7623318385650224, + "grad_norm": 6.999017715454102, + "learning_rate": 9.04750985188205e-06, + "loss": 3.4248, + "step": 11220 + }, + { + "epoch": 0.7626715586356841, + "grad_norm": 6.298268795013428, + "learning_rate": 9.047085201793723e-06, + "loss": 3.2464, + "step": 11225 + }, + { + "epoch": 0.763011278706346, + "grad_norm": 5.587419509887695, + "learning_rate": 9.046660551705395e-06, + "loss": 3.4625, + "step": 11230 + }, + { + "epoch": 0.7633509987770077, + "grad_norm": 6.013851165771484, + "learning_rate": 9.046235901617069e-06, + "loss": 3.4819, + "step": 11235 + }, + { + "epoch": 0.7636907188476695, + "grad_norm": 6.126076698303223, + "learning_rate": 9.045811251528742e-06, + "loss": 3.2569, + "step": 11240 + }, + { + "epoch": 0.7640304389183313, + "grad_norm": 7.115164756774902, + "learning_rate": 9.045386601440413e-06, + "loss": 3.0978, + "step": 11245 + }, + { + "epoch": 0.7643701589889931, + "grad_norm": 6.382407188415527, + "learning_rate": 9.044961951352087e-06, + "loss": 3.2764, + "step": 11250 + }, + { + "epoch": 0.7647098790596548, + "grad_norm": 5.556023597717285, + "learning_rate": 9.04453730126376e-06, + "loss": 3.4675, + "step": 11255 + }, + { + "epoch": 0.7650495991303167, + "grad_norm": 5.365880489349365, + "learning_rate": 9.044112651175431e-06, + "loss": 3.3798, + "step": 11260 + }, + { + "epoch": 0.7653893192009784, + "grad_norm": 5.680520057678223, + "learning_rate": 9.043688001087106e-06, + "loss": 3.3379, + "step": 11265 + }, + { + "epoch": 0.7657290392716402, + "grad_norm": 7.783652305603027, + "learning_rate": 9.043263350998777e-06, + "loss": 3.3513, + "step": 11270 + }, + { + "epoch": 0.7660687593423019, + "grad_norm": 7.57780647277832, + "learning_rate": 9.04283870091045e-06, + "loss": 3.3819, + "step": 11275 + }, + { + "epoch": 0.7664084794129638, + "grad_norm": 7.174657344818115, + "learning_rate": 9.042414050822124e-06, + "loss": 3.3262, + "step": 11280 + }, + { + "epoch": 0.7667481994836255, + "grad_norm": 7.273410320281982, + "learning_rate": 9.041989400733795e-06, + "loss": 3.2822, + "step": 11285 + }, + { + "epoch": 0.7670879195542872, + "grad_norm": 6.285484790802002, + "learning_rate": 9.041564750645468e-06, + "loss": 3.2321, + "step": 11290 + }, + { + "epoch": 0.7674276396249491, + "grad_norm": 6.124421119689941, + "learning_rate": 9.041140100557143e-06, + "loss": 3.1675, + "step": 11295 + }, + { + "epoch": 0.7677673596956108, + "grad_norm": 5.150004863739014, + "learning_rate": 9.040715450468814e-06, + "loss": 3.3001, + "step": 11300 + }, + { + "epoch": 0.7681070797662726, + "grad_norm": 6.640211582183838, + "learning_rate": 9.040290800380487e-06, + "loss": 3.2171, + "step": 11305 + }, + { + "epoch": 0.7684467998369343, + "grad_norm": 7.7508225440979, + "learning_rate": 9.039866150292161e-06, + "loss": 3.4643, + "step": 11310 + }, + { + "epoch": 0.7687865199075962, + "grad_norm": 8.165081024169922, + "learning_rate": 9.039441500203832e-06, + "loss": 3.3274, + "step": 11315 + }, + { + "epoch": 0.7691262399782579, + "grad_norm": 6.707429885864258, + "learning_rate": 9.039016850115505e-06, + "loss": 3.4791, + "step": 11320 + }, + { + "epoch": 0.7694659600489197, + "grad_norm": 7.629330635070801, + "learning_rate": 9.03859220002718e-06, + "loss": 3.2925, + "step": 11325 + }, + { + "epoch": 0.7698056801195815, + "grad_norm": 8.021346092224121, + "learning_rate": 9.03816754993885e-06, + "loss": 3.2498, + "step": 11330 + }, + { + "epoch": 0.7701454001902432, + "grad_norm": 7.489944934844971, + "learning_rate": 9.037742899850523e-06, + "loss": 3.2283, + "step": 11335 + }, + { + "epoch": 0.770485120260905, + "grad_norm": 7.260774612426758, + "learning_rate": 9.037318249762196e-06, + "loss": 3.0746, + "step": 11340 + }, + { + "epoch": 0.7708248403315668, + "grad_norm": 8.26562213897705, + "learning_rate": 9.036893599673869e-06, + "loss": 3.3535, + "step": 11345 + }, + { + "epoch": 0.7711645604022286, + "grad_norm": 5.579193592071533, + "learning_rate": 9.036468949585542e-06, + "loss": 3.378, + "step": 11350 + }, + { + "epoch": 0.7715042804728903, + "grad_norm": 7.512802600860596, + "learning_rate": 9.036044299497215e-06, + "loss": 3.2059, + "step": 11355 + }, + { + "epoch": 0.7718440005435521, + "grad_norm": 8.660296440124512, + "learning_rate": 9.035619649408887e-06, + "loss": 3.6505, + "step": 11360 + }, + { + "epoch": 0.7721837206142139, + "grad_norm": 5.599608898162842, + "learning_rate": 9.03519499932056e-06, + "loss": 3.3848, + "step": 11365 + }, + { + "epoch": 0.7725234406848757, + "grad_norm": 7.171607971191406, + "learning_rate": 9.034770349232233e-06, + "loss": 3.5687, + "step": 11370 + }, + { + "epoch": 0.7728631607555374, + "grad_norm": 7.771046161651611, + "learning_rate": 9.034345699143906e-06, + "loss": 3.287, + "step": 11375 + }, + { + "epoch": 0.7732028808261993, + "grad_norm": 6.906393051147461, + "learning_rate": 9.033921049055579e-06, + "loss": 3.2056, + "step": 11380 + }, + { + "epoch": 0.773542600896861, + "grad_norm": 6.912164688110352, + "learning_rate": 9.033496398967251e-06, + "loss": 3.3775, + "step": 11385 + }, + { + "epoch": 0.7738823209675227, + "grad_norm": 7.242130279541016, + "learning_rate": 9.033071748878924e-06, + "loss": 3.4005, + "step": 11390 + }, + { + "epoch": 0.7742220410381845, + "grad_norm": 7.561464309692383, + "learning_rate": 9.032647098790597e-06, + "loss": 3.37, + "step": 11395 + }, + { + "epoch": 0.7745617611088463, + "grad_norm": 5.701076984405518, + "learning_rate": 9.03222244870227e-06, + "loss": 3.2946, + "step": 11400 + }, + { + "epoch": 0.7749014811795081, + "grad_norm": 6.897267818450928, + "learning_rate": 9.031797798613943e-06, + "loss": 3.4513, + "step": 11405 + }, + { + "epoch": 0.7752412012501698, + "grad_norm": 5.6706414222717285, + "learning_rate": 9.031373148525615e-06, + "loss": 3.546, + "step": 11410 + }, + { + "epoch": 0.7755809213208317, + "grad_norm": 9.436697959899902, + "learning_rate": 9.030948498437288e-06, + "loss": 3.5395, + "step": 11415 + }, + { + "epoch": 0.7759206413914934, + "grad_norm": 8.605462074279785, + "learning_rate": 9.030523848348961e-06, + "loss": 3.4601, + "step": 11420 + }, + { + "epoch": 0.7762603614621552, + "grad_norm": 8.912337303161621, + "learning_rate": 9.030099198260634e-06, + "loss": 3.1077, + "step": 11425 + }, + { + "epoch": 0.776600081532817, + "grad_norm": 6.435003280639648, + "learning_rate": 9.029674548172307e-06, + "loss": 3.1108, + "step": 11430 + }, + { + "epoch": 0.7769398016034788, + "grad_norm": 5.4626336097717285, + "learning_rate": 9.02924989808398e-06, + "loss": 3.2344, + "step": 11435 + }, + { + "epoch": 0.7772795216741405, + "grad_norm": 6.659729480743408, + "learning_rate": 9.028825247995652e-06, + "loss": 3.4432, + "step": 11440 + }, + { + "epoch": 0.7776192417448022, + "grad_norm": 6.760040760040283, + "learning_rate": 9.028400597907325e-06, + "loss": 3.2268, + "step": 11445 + }, + { + "epoch": 0.7779589618154641, + "grad_norm": 8.330668449401855, + "learning_rate": 9.027975947818998e-06, + "loss": 3.3427, + "step": 11450 + }, + { + "epoch": 0.7782986818861258, + "grad_norm": 6.806839942932129, + "learning_rate": 9.02755129773067e-06, + "loss": 3.4715, + "step": 11455 + }, + { + "epoch": 0.7786384019567876, + "grad_norm": 5.965996742248535, + "learning_rate": 9.027126647642343e-06, + "loss": 3.1228, + "step": 11460 + }, + { + "epoch": 0.7789781220274494, + "grad_norm": 5.7542948722839355, + "learning_rate": 9.026701997554016e-06, + "loss": 3.4248, + "step": 11465 + }, + { + "epoch": 0.7793178420981112, + "grad_norm": 7.1560587882995605, + "learning_rate": 9.026277347465689e-06, + "loss": 3.3679, + "step": 11470 + }, + { + "epoch": 0.7796575621687729, + "grad_norm": 6.984847545623779, + "learning_rate": 9.025852697377362e-06, + "loss": 3.4163, + "step": 11475 + }, + { + "epoch": 0.7799972822394347, + "grad_norm": 6.6082940101623535, + "learning_rate": 9.025428047289035e-06, + "loss": 3.3613, + "step": 11480 + }, + { + "epoch": 0.7803370023100965, + "grad_norm": 6.498472690582275, + "learning_rate": 9.025003397200707e-06, + "loss": 3.3505, + "step": 11485 + }, + { + "epoch": 0.7806767223807582, + "grad_norm": 8.689260482788086, + "learning_rate": 9.02457874711238e-06, + "loss": 3.2777, + "step": 11490 + }, + { + "epoch": 0.78101644245142, + "grad_norm": 5.991199016571045, + "learning_rate": 9.024154097024053e-06, + "loss": 3.3587, + "step": 11495 + }, + { + "epoch": 0.7813561625220818, + "grad_norm": 8.361907005310059, + "learning_rate": 9.023729446935726e-06, + "loss": 2.948, + "step": 11500 + }, + { + "epoch": 0.7816958825927436, + "grad_norm": 5.933726787567139, + "learning_rate": 9.023304796847399e-06, + "loss": 3.2779, + "step": 11505 + }, + { + "epoch": 0.7820356026634053, + "grad_norm": 5.892867565155029, + "learning_rate": 9.022880146759071e-06, + "loss": 3.6019, + "step": 11510 + }, + { + "epoch": 0.7823753227340672, + "grad_norm": 6.176912307739258, + "learning_rate": 9.022455496670744e-06, + "loss": 3.2576, + "step": 11515 + }, + { + "epoch": 0.7827150428047289, + "grad_norm": 6.560268402099609, + "learning_rate": 9.022030846582417e-06, + "loss": 3.4314, + "step": 11520 + }, + { + "epoch": 0.7830547628753907, + "grad_norm": 7.095328330993652, + "learning_rate": 9.02160619649409e-06, + "loss": 3.4949, + "step": 11525 + }, + { + "epoch": 0.7833944829460524, + "grad_norm": 6.419678211212158, + "learning_rate": 9.021181546405763e-06, + "loss": 3.2398, + "step": 11530 + }, + { + "epoch": 0.7837342030167143, + "grad_norm": 5.724656105041504, + "learning_rate": 9.020756896317435e-06, + "loss": 3.2211, + "step": 11535 + }, + { + "epoch": 0.784073923087376, + "grad_norm": 5.560624599456787, + "learning_rate": 9.020332246229108e-06, + "loss": 3.5516, + "step": 11540 + }, + { + "epoch": 0.7844136431580377, + "grad_norm": 5.972039699554443, + "learning_rate": 9.019907596140781e-06, + "loss": 3.3151, + "step": 11545 + }, + { + "epoch": 0.7847533632286996, + "grad_norm": 5.065945625305176, + "learning_rate": 9.019482946052454e-06, + "loss": 3.3813, + "step": 11550 + }, + { + "epoch": 0.7850930832993613, + "grad_norm": 6.112202167510986, + "learning_rate": 9.019058295964127e-06, + "loss": 3.3601, + "step": 11555 + }, + { + "epoch": 0.7854328033700231, + "grad_norm": 6.283032417297363, + "learning_rate": 9.0186336458758e-06, + "loss": 3.4688, + "step": 11560 + }, + { + "epoch": 0.7857725234406848, + "grad_norm": 6.994748115539551, + "learning_rate": 9.018208995787472e-06, + "loss": 3.3159, + "step": 11565 + }, + { + "epoch": 0.7861122435113467, + "grad_norm": 9.283740043640137, + "learning_rate": 9.017784345699145e-06, + "loss": 3.1274, + "step": 11570 + }, + { + "epoch": 0.7864519635820084, + "grad_norm": 7.809957504272461, + "learning_rate": 9.017359695610818e-06, + "loss": 3.2704, + "step": 11575 + }, + { + "epoch": 0.7867916836526702, + "grad_norm": 7.085536956787109, + "learning_rate": 9.01693504552249e-06, + "loss": 3.629, + "step": 11580 + }, + { + "epoch": 0.787131403723332, + "grad_norm": 5.6845479011535645, + "learning_rate": 9.016510395434163e-06, + "loss": 3.2526, + "step": 11585 + }, + { + "epoch": 0.7874711237939938, + "grad_norm": 4.656717300415039, + "learning_rate": 9.016085745345836e-06, + "loss": 3.2663, + "step": 11590 + }, + { + "epoch": 0.7878108438646555, + "grad_norm": 5.812920093536377, + "learning_rate": 9.015661095257509e-06, + "loss": 3.218, + "step": 11595 + }, + { + "epoch": 0.7881505639353173, + "grad_norm": 6.0390238761901855, + "learning_rate": 9.015236445169182e-06, + "loss": 3.2837, + "step": 11600 + }, + { + "epoch": 0.7884902840059791, + "grad_norm": 6.282290458679199, + "learning_rate": 9.014811795080855e-06, + "loss": 3.4869, + "step": 11605 + }, + { + "epoch": 0.7888300040766408, + "grad_norm": 6.97031307220459, + "learning_rate": 9.014387144992527e-06, + "loss": 3.5242, + "step": 11610 + }, + { + "epoch": 0.7891697241473026, + "grad_norm": 6.857149124145508, + "learning_rate": 9.013962494904198e-06, + "loss": 3.2499, + "step": 11615 + }, + { + "epoch": 0.7895094442179644, + "grad_norm": 7.041757583618164, + "learning_rate": 9.013537844815873e-06, + "loss": 3.2995, + "step": 11620 + }, + { + "epoch": 0.7898491642886262, + "grad_norm": 6.340093612670898, + "learning_rate": 9.013113194727546e-06, + "loss": 3.2271, + "step": 11625 + }, + { + "epoch": 0.7901888843592879, + "grad_norm": 6.567028045654297, + "learning_rate": 9.012688544639217e-06, + "loss": 3.247, + "step": 11630 + }, + { + "epoch": 0.7905286044299498, + "grad_norm": 6.5978007316589355, + "learning_rate": 9.012263894550891e-06, + "loss": 3.292, + "step": 11635 + }, + { + "epoch": 0.7908683245006115, + "grad_norm": 6.409482479095459, + "learning_rate": 9.011839244462564e-06, + "loss": 3.5359, + "step": 11640 + }, + { + "epoch": 0.7912080445712733, + "grad_norm": 7.108513832092285, + "learning_rate": 9.011414594374235e-06, + "loss": 3.1918, + "step": 11645 + }, + { + "epoch": 0.791547764641935, + "grad_norm": 5.8510260581970215, + "learning_rate": 9.01098994428591e-06, + "loss": 3.403, + "step": 11650 + }, + { + "epoch": 0.7918874847125968, + "grad_norm": 8.396463394165039, + "learning_rate": 9.010565294197583e-06, + "loss": 3.4019, + "step": 11655 + }, + { + "epoch": 0.7922272047832586, + "grad_norm": 7.120832920074463, + "learning_rate": 9.010140644109254e-06, + "loss": 3.4129, + "step": 11660 + }, + { + "epoch": 0.7925669248539203, + "grad_norm": 7.336734294891357, + "learning_rate": 9.009715994020928e-06, + "loss": 3.2565, + "step": 11665 + }, + { + "epoch": 0.7929066449245822, + "grad_norm": 5.325462818145752, + "learning_rate": 9.009291343932601e-06, + "loss": 3.3299, + "step": 11670 + }, + { + "epoch": 0.7932463649952439, + "grad_norm": 6.6922760009765625, + "learning_rate": 9.008866693844272e-06, + "loss": 3.3336, + "step": 11675 + }, + { + "epoch": 0.7935860850659057, + "grad_norm": 6.055711269378662, + "learning_rate": 9.008442043755947e-06, + "loss": 3.3923, + "step": 11680 + }, + { + "epoch": 0.7939258051365675, + "grad_norm": 6.305657863616943, + "learning_rate": 9.008017393667618e-06, + "loss": 3.536, + "step": 11685 + }, + { + "epoch": 0.7942655252072293, + "grad_norm": 6.9141845703125, + "learning_rate": 9.00759274357929e-06, + "loss": 3.1711, + "step": 11690 + }, + { + "epoch": 0.794605245277891, + "grad_norm": 7.026118278503418, + "learning_rate": 9.007168093490965e-06, + "loss": 3.4173, + "step": 11695 + }, + { + "epoch": 0.7949449653485527, + "grad_norm": 7.070748805999756, + "learning_rate": 9.006743443402636e-06, + "loss": 3.3503, + "step": 11700 + }, + { + "epoch": 0.7952846854192146, + "grad_norm": 6.7805023193359375, + "learning_rate": 9.006318793314309e-06, + "loss": 3.1776, + "step": 11705 + }, + { + "epoch": 0.7956244054898763, + "grad_norm": 6.814888954162598, + "learning_rate": 9.005894143225983e-06, + "loss": 3.1621, + "step": 11710 + }, + { + "epoch": 0.7959641255605381, + "grad_norm": 6.216871738433838, + "learning_rate": 9.005469493137654e-06, + "loss": 3.4188, + "step": 11715 + }, + { + "epoch": 0.7963038456311999, + "grad_norm": 6.7159223556518555, + "learning_rate": 9.005044843049327e-06, + "loss": 3.4569, + "step": 11720 + }, + { + "epoch": 0.7966435657018617, + "grad_norm": 6.696798801422119, + "learning_rate": 9.004620192961002e-06, + "loss": 3.2659, + "step": 11725 + }, + { + "epoch": 0.7969832857725234, + "grad_norm": 7.579565525054932, + "learning_rate": 9.004195542872673e-06, + "loss": 3.5001, + "step": 11730 + }, + { + "epoch": 0.7973230058431852, + "grad_norm": 8.009488105773926, + "learning_rate": 9.003770892784346e-06, + "loss": 3.3375, + "step": 11735 + }, + { + "epoch": 0.797662725913847, + "grad_norm": 7.012862205505371, + "learning_rate": 9.00334624269602e-06, + "loss": 3.2356, + "step": 11740 + }, + { + "epoch": 0.7980024459845088, + "grad_norm": 7.086427211761475, + "learning_rate": 9.002921592607691e-06, + "loss": 3.377, + "step": 11745 + }, + { + "epoch": 0.7983421660551705, + "grad_norm": 8.627531051635742, + "learning_rate": 9.002496942519364e-06, + "loss": 3.546, + "step": 11750 + }, + { + "epoch": 0.7986818861258324, + "grad_norm": 6.196637153625488, + "learning_rate": 9.002072292431039e-06, + "loss": 3.0628, + "step": 11755 + }, + { + "epoch": 0.7990216061964941, + "grad_norm": 10.306758880615234, + "learning_rate": 9.00164764234271e-06, + "loss": 3.488, + "step": 11760 + }, + { + "epoch": 0.7993613262671558, + "grad_norm": 6.629557132720947, + "learning_rate": 9.001222992254384e-06, + "loss": 3.4052, + "step": 11765 + }, + { + "epoch": 0.7997010463378177, + "grad_norm": 5.126906871795654, + "learning_rate": 9.000798342166055e-06, + "loss": 3.3363, + "step": 11770 + }, + { + "epoch": 0.8000407664084794, + "grad_norm": 5.377384662628174, + "learning_rate": 9.000373692077728e-06, + "loss": 3.2248, + "step": 11775 + }, + { + "epoch": 0.8003804864791412, + "grad_norm": 6.608932971954346, + "learning_rate": 8.999949041989403e-06, + "loss": 3.5591, + "step": 11780 + }, + { + "epoch": 0.8007202065498029, + "grad_norm": 6.544389724731445, + "learning_rate": 8.999524391901074e-06, + "loss": 3.4786, + "step": 11785 + }, + { + "epoch": 0.8010599266204648, + "grad_norm": 5.378861904144287, + "learning_rate": 8.999099741812746e-06, + "loss": 3.06, + "step": 11790 + }, + { + "epoch": 0.8013996466911265, + "grad_norm": 5.817331790924072, + "learning_rate": 8.998675091724421e-06, + "loss": 3.3282, + "step": 11795 + }, + { + "epoch": 0.8017393667617883, + "grad_norm": 6.565239429473877, + "learning_rate": 8.998250441636092e-06, + "loss": 3.4894, + "step": 11800 + }, + { + "epoch": 0.8020790868324501, + "grad_norm": 7.962750434875488, + "learning_rate": 8.997825791547765e-06, + "loss": 3.4915, + "step": 11805 + }, + { + "epoch": 0.8024188069031118, + "grad_norm": 8.244783401489258, + "learning_rate": 8.99740114145944e-06, + "loss": 3.1554, + "step": 11810 + }, + { + "epoch": 0.8027585269737736, + "grad_norm": 6.5549798011779785, + "learning_rate": 8.99697649137111e-06, + "loss": 3.1292, + "step": 11815 + }, + { + "epoch": 0.8030982470444353, + "grad_norm": 6.33634090423584, + "learning_rate": 8.996551841282783e-06, + "loss": 3.3359, + "step": 11820 + }, + { + "epoch": 0.8034379671150972, + "grad_norm": 6.782951831817627, + "learning_rate": 8.996127191194458e-06, + "loss": 3.5208, + "step": 11825 + }, + { + "epoch": 0.8037776871857589, + "grad_norm": 6.162242889404297, + "learning_rate": 8.995702541106129e-06, + "loss": 3.46, + "step": 11830 + }, + { + "epoch": 0.8041174072564207, + "grad_norm": 5.538574695587158, + "learning_rate": 8.995277891017802e-06, + "loss": 3.4685, + "step": 11835 + }, + { + "epoch": 0.8044571273270825, + "grad_norm": 8.150626182556152, + "learning_rate": 8.994853240929475e-06, + "loss": 3.4472, + "step": 11840 + }, + { + "epoch": 0.8047968473977443, + "grad_norm": 6.91654109954834, + "learning_rate": 8.994428590841147e-06, + "loss": 3.3026, + "step": 11845 + }, + { + "epoch": 0.805136567468406, + "grad_norm": 9.103206634521484, + "learning_rate": 8.99400394075282e-06, + "loss": 3.3479, + "step": 11850 + }, + { + "epoch": 0.8054762875390679, + "grad_norm": 7.137856960296631, + "learning_rate": 8.993579290664493e-06, + "loss": 3.1661, + "step": 11855 + }, + { + "epoch": 0.8058160076097296, + "grad_norm": 7.957926273345947, + "learning_rate": 8.993154640576166e-06, + "loss": 3.1284, + "step": 11860 + }, + { + "epoch": 0.8061557276803913, + "grad_norm": 7.162635803222656, + "learning_rate": 8.992729990487839e-06, + "loss": 3.2918, + "step": 11865 + }, + { + "epoch": 0.8064954477510531, + "grad_norm": 5.3118133544921875, + "learning_rate": 8.992305340399511e-06, + "loss": 3.3358, + "step": 11870 + }, + { + "epoch": 0.8068351678217149, + "grad_norm": 6.563787937164307, + "learning_rate": 8.991880690311184e-06, + "loss": 3.4868, + "step": 11875 + }, + { + "epoch": 0.8071748878923767, + "grad_norm": 6.726981163024902, + "learning_rate": 8.991456040222857e-06, + "loss": 2.9126, + "step": 11880 + }, + { + "epoch": 0.8075146079630384, + "grad_norm": 6.081318378448486, + "learning_rate": 8.99103139013453e-06, + "loss": 3.4181, + "step": 11885 + }, + { + "epoch": 0.8078543280337003, + "grad_norm": 6.261704444885254, + "learning_rate": 8.990606740046203e-06, + "loss": 3.1259, + "step": 11890 + }, + { + "epoch": 0.808194048104362, + "grad_norm": 7.0475006103515625, + "learning_rate": 8.990182089957875e-06, + "loss": 3.267, + "step": 11895 + }, + { + "epoch": 0.8085337681750238, + "grad_norm": 8.137659072875977, + "learning_rate": 8.989757439869548e-06, + "loss": 3.2746, + "step": 11900 + }, + { + "epoch": 0.8088734882456855, + "grad_norm": 6.464759826660156, + "learning_rate": 8.989332789781221e-06, + "loss": 3.239, + "step": 11905 + }, + { + "epoch": 0.8092132083163474, + "grad_norm": 5.835093021392822, + "learning_rate": 8.988908139692894e-06, + "loss": 3.3922, + "step": 11910 + }, + { + "epoch": 0.8095529283870091, + "grad_norm": 5.669806957244873, + "learning_rate": 8.988483489604567e-06, + "loss": 3.3136, + "step": 11915 + }, + { + "epoch": 0.8098926484576708, + "grad_norm": 5.617775917053223, + "learning_rate": 8.98805883951624e-06, + "loss": 3.0792, + "step": 11920 + }, + { + "epoch": 0.8102323685283327, + "grad_norm": 5.638987064361572, + "learning_rate": 8.987634189427912e-06, + "loss": 3.3111, + "step": 11925 + }, + { + "epoch": 0.8105720885989944, + "grad_norm": 6.137881755828857, + "learning_rate": 8.987209539339585e-06, + "loss": 3.3579, + "step": 11930 + }, + { + "epoch": 0.8109118086696562, + "grad_norm": 9.60262680053711, + "learning_rate": 8.986784889251258e-06, + "loss": 3.4507, + "step": 11935 + }, + { + "epoch": 0.811251528740318, + "grad_norm": 5.359809875488281, + "learning_rate": 8.98636023916293e-06, + "loss": 3.1342, + "step": 11940 + }, + { + "epoch": 0.8115912488109798, + "grad_norm": 6.559675693511963, + "learning_rate": 8.985935589074603e-06, + "loss": 3.1041, + "step": 11945 + }, + { + "epoch": 0.8119309688816415, + "grad_norm": 6.048555850982666, + "learning_rate": 8.985510938986276e-06, + "loss": 3.3184, + "step": 11950 + }, + { + "epoch": 0.8122706889523033, + "grad_norm": 10.775065422058105, + "learning_rate": 8.985086288897949e-06, + "loss": 3.3741, + "step": 11955 + }, + { + "epoch": 0.8126104090229651, + "grad_norm": 6.706980228424072, + "learning_rate": 8.984661638809622e-06, + "loss": 3.3145, + "step": 11960 + }, + { + "epoch": 0.8129501290936268, + "grad_norm": 6.171966552734375, + "learning_rate": 8.984236988721295e-06, + "loss": 3.3485, + "step": 11965 + }, + { + "epoch": 0.8132898491642886, + "grad_norm": 7.979349613189697, + "learning_rate": 8.983812338632967e-06, + "loss": 3.4086, + "step": 11970 + }, + { + "epoch": 0.8136295692349504, + "grad_norm": 7.971080303192139, + "learning_rate": 8.98338768854464e-06, + "loss": 3.3929, + "step": 11975 + }, + { + "epoch": 0.8139692893056122, + "grad_norm": 8.037186622619629, + "learning_rate": 8.982963038456313e-06, + "loss": 3.1609, + "step": 11980 + }, + { + "epoch": 0.8143090093762739, + "grad_norm": 5.557927131652832, + "learning_rate": 8.982538388367986e-06, + "loss": 3.2005, + "step": 11985 + }, + { + "epoch": 0.8146487294469357, + "grad_norm": 7.352094650268555, + "learning_rate": 8.982113738279659e-06, + "loss": 3.4294, + "step": 11990 + }, + { + "epoch": 0.8149884495175975, + "grad_norm": 7.875720500946045, + "learning_rate": 8.981689088191331e-06, + "loss": 3.234, + "step": 11995 + }, + { + "epoch": 0.8153281695882593, + "grad_norm": 7.393388271331787, + "learning_rate": 8.981264438103004e-06, + "loss": 3.2398, + "step": 12000 + }, + { + "epoch": 0.815667889658921, + "grad_norm": 7.962485313415527, + "learning_rate": 8.980839788014677e-06, + "loss": 3.2682, + "step": 12005 + }, + { + "epoch": 0.8160076097295829, + "grad_norm": 7.226648330688477, + "learning_rate": 8.98041513792635e-06, + "loss": 3.4351, + "step": 12010 + }, + { + "epoch": 0.8163473298002446, + "grad_norm": 6.877664089202881, + "learning_rate": 8.979990487838023e-06, + "loss": 3.4758, + "step": 12015 + }, + { + "epoch": 0.8166870498709063, + "grad_norm": 7.081029891967773, + "learning_rate": 8.979565837749695e-06, + "loss": 3.3639, + "step": 12020 + }, + { + "epoch": 0.8170267699415682, + "grad_norm": 6.794394493103027, + "learning_rate": 8.979141187661368e-06, + "loss": 2.9336, + "step": 12025 + }, + { + "epoch": 0.8173664900122299, + "grad_norm": 5.964876174926758, + "learning_rate": 8.97871653757304e-06, + "loss": 3.3484, + "step": 12030 + }, + { + "epoch": 0.8177062100828917, + "grad_norm": 8.334808349609375, + "learning_rate": 8.978291887484714e-06, + "loss": 3.3158, + "step": 12035 + }, + { + "epoch": 0.8180459301535534, + "grad_norm": 6.015440464019775, + "learning_rate": 8.977867237396387e-06, + "loss": 3.1429, + "step": 12040 + }, + { + "epoch": 0.8183856502242153, + "grad_norm": 4.696622848510742, + "learning_rate": 8.977442587308058e-06, + "loss": 3.5459, + "step": 12045 + }, + { + "epoch": 0.818725370294877, + "grad_norm": 5.1512651443481445, + "learning_rate": 8.977017937219732e-06, + "loss": 3.3582, + "step": 12050 + }, + { + "epoch": 0.8190650903655388, + "grad_norm": 7.951510906219482, + "learning_rate": 8.976593287131405e-06, + "loss": 3.3547, + "step": 12055 + }, + { + "epoch": 0.8194048104362006, + "grad_norm": 6.843975067138672, + "learning_rate": 8.976168637043076e-06, + "loss": 3.2639, + "step": 12060 + }, + { + "epoch": 0.8197445305068624, + "grad_norm": 6.190030574798584, + "learning_rate": 8.97574398695475e-06, + "loss": 3.3703, + "step": 12065 + }, + { + "epoch": 0.8200842505775241, + "grad_norm": 8.270796775817871, + "learning_rate": 8.975319336866423e-06, + "loss": 3.2964, + "step": 12070 + }, + { + "epoch": 0.8204239706481858, + "grad_norm": 5.763399124145508, + "learning_rate": 8.974894686778094e-06, + "loss": 3.455, + "step": 12075 + }, + { + "epoch": 0.8207636907188477, + "grad_norm": 7.061771392822266, + "learning_rate": 8.974470036689769e-06, + "loss": 3.0291, + "step": 12080 + }, + { + "epoch": 0.8211034107895094, + "grad_norm": 6.504642486572266, + "learning_rate": 8.974045386601442e-06, + "loss": 3.436, + "step": 12085 + }, + { + "epoch": 0.8214431308601712, + "grad_norm": 7.714406967163086, + "learning_rate": 8.973620736513113e-06, + "loss": 3.4354, + "step": 12090 + }, + { + "epoch": 0.821782850930833, + "grad_norm": 6.873347759246826, + "learning_rate": 8.973196086424787e-06, + "loss": 3.4434, + "step": 12095 + }, + { + "epoch": 0.8221225710014948, + "grad_norm": 7.350863456726074, + "learning_rate": 8.97277143633646e-06, + "loss": 3.3221, + "step": 12100 + }, + { + "epoch": 0.8224622910721565, + "grad_norm": 7.085488319396973, + "learning_rate": 8.972346786248133e-06, + "loss": 3.2091, + "step": 12105 + }, + { + "epoch": 0.8228020111428184, + "grad_norm": 6.612829685211182, + "learning_rate": 8.971922136159806e-06, + "loss": 3.3867, + "step": 12110 + }, + { + "epoch": 0.8231417312134801, + "grad_norm": 6.76235294342041, + "learning_rate": 8.971497486071477e-06, + "loss": 3.3457, + "step": 12115 + }, + { + "epoch": 0.8234814512841419, + "grad_norm": 6.449110507965088, + "learning_rate": 8.971072835983151e-06, + "loss": 3.4213, + "step": 12120 + }, + { + "epoch": 0.8238211713548036, + "grad_norm": 8.078417778015137, + "learning_rate": 8.970648185894824e-06, + "loss": 3.3515, + "step": 12125 + }, + { + "epoch": 0.8241608914254654, + "grad_norm": 6.2821245193481445, + "learning_rate": 8.970223535806495e-06, + "loss": 3.4425, + "step": 12130 + }, + { + "epoch": 0.8245006114961272, + "grad_norm": 7.032104015350342, + "learning_rate": 8.96979888571817e-06, + "loss": 3.0678, + "step": 12135 + }, + { + "epoch": 0.8248403315667889, + "grad_norm": 5.846761703491211, + "learning_rate": 8.969374235629843e-06, + "loss": 3.2959, + "step": 12140 + }, + { + "epoch": 0.8251800516374508, + "grad_norm": 6.272344589233398, + "learning_rate": 8.968949585541514e-06, + "loss": 3.3629, + "step": 12145 + }, + { + "epoch": 0.8255197717081125, + "grad_norm": 6.802596569061279, + "learning_rate": 8.968524935453188e-06, + "loss": 3.2985, + "step": 12150 + }, + { + "epoch": 0.8258594917787743, + "grad_norm": 6.201138496398926, + "learning_rate": 8.968100285364861e-06, + "loss": 3.4549, + "step": 12155 + }, + { + "epoch": 0.826199211849436, + "grad_norm": 6.883821487426758, + "learning_rate": 8.967675635276532e-06, + "loss": 3.3742, + "step": 12160 + }, + { + "epoch": 0.8265389319200979, + "grad_norm": 7.236270904541016, + "learning_rate": 8.967250985188207e-06, + "loss": 3.254, + "step": 12165 + }, + { + "epoch": 0.8268786519907596, + "grad_norm": 7.870363712310791, + "learning_rate": 8.96682633509988e-06, + "loss": 3.4532, + "step": 12170 + }, + { + "epoch": 0.8272183720614213, + "grad_norm": 9.698821067810059, + "learning_rate": 8.96640168501155e-06, + "loss": 3.2053, + "step": 12175 + }, + { + "epoch": 0.8275580921320832, + "grad_norm": 5.333906650543213, + "learning_rate": 8.965977034923225e-06, + "loss": 3.4055, + "step": 12180 + }, + { + "epoch": 0.8278978122027449, + "grad_norm": 5.652205944061279, + "learning_rate": 8.965552384834896e-06, + "loss": 3.3068, + "step": 12185 + }, + { + "epoch": 0.8282375322734067, + "grad_norm": 7.801706790924072, + "learning_rate": 8.965127734746569e-06, + "loss": 3.4701, + "step": 12190 + }, + { + "epoch": 0.8285772523440685, + "grad_norm": 5.5337114334106445, + "learning_rate": 8.964703084658243e-06, + "loss": 3.5473, + "step": 12195 + }, + { + "epoch": 0.8289169724147303, + "grad_norm": 7.301129341125488, + "learning_rate": 8.964278434569914e-06, + "loss": 3.4401, + "step": 12200 + }, + { + "epoch": 0.829256692485392, + "grad_norm": 6.864589691162109, + "learning_rate": 8.963853784481587e-06, + "loss": 3.4922, + "step": 12205 + }, + { + "epoch": 0.8295964125560538, + "grad_norm": 6.734142303466797, + "learning_rate": 8.963429134393262e-06, + "loss": 3.1875, + "step": 12210 + }, + { + "epoch": 0.8299361326267156, + "grad_norm": 8.271151542663574, + "learning_rate": 8.963004484304933e-06, + "loss": 3.3426, + "step": 12215 + }, + { + "epoch": 0.8302758526973774, + "grad_norm": 6.554001331329346, + "learning_rate": 8.962579834216606e-06, + "loss": 3.2638, + "step": 12220 + }, + { + "epoch": 0.8306155727680391, + "grad_norm": 6.730641841888428, + "learning_rate": 8.96215518412828e-06, + "loss": 3.3591, + "step": 12225 + }, + { + "epoch": 0.830955292838701, + "grad_norm": 6.8074951171875, + "learning_rate": 8.961730534039951e-06, + "loss": 3.2377, + "step": 12230 + }, + { + "epoch": 0.8312950129093627, + "grad_norm": 6.632007122039795, + "learning_rate": 8.961305883951624e-06, + "loss": 2.9578, + "step": 12235 + }, + { + "epoch": 0.8316347329800244, + "grad_norm": 6.2069525718688965, + "learning_rate": 8.960881233863299e-06, + "loss": 3.2393, + "step": 12240 + }, + { + "epoch": 0.8319744530506862, + "grad_norm": 5.901941299438477, + "learning_rate": 8.96045658377497e-06, + "loss": 2.926, + "step": 12245 + }, + { + "epoch": 0.832314173121348, + "grad_norm": 5.881171703338623, + "learning_rate": 8.960031933686642e-06, + "loss": 3.2836, + "step": 12250 + }, + { + "epoch": 0.8326538931920098, + "grad_norm": 7.382627010345459, + "learning_rate": 8.959607283598315e-06, + "loss": 3.1702, + "step": 12255 + }, + { + "epoch": 0.8329936132626715, + "grad_norm": 8.117081642150879, + "learning_rate": 8.959182633509988e-06, + "loss": 3.3011, + "step": 12260 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 8.451509475708008, + "learning_rate": 8.958757983421661e-06, + "loss": 3.4666, + "step": 12265 + }, + { + "epoch": 0.8336730534039951, + "grad_norm": 7.728665351867676, + "learning_rate": 8.958333333333334e-06, + "loss": 3.0233, + "step": 12270 + }, + { + "epoch": 0.8340127734746569, + "grad_norm": 6.134167671203613, + "learning_rate": 8.957908683245006e-06, + "loss": 3.4225, + "step": 12275 + }, + { + "epoch": 0.8343524935453187, + "grad_norm": 7.797046661376953, + "learning_rate": 8.95748403315668e-06, + "loss": 3.4827, + "step": 12280 + }, + { + "epoch": 0.8346922136159804, + "grad_norm": 7.284855365753174, + "learning_rate": 8.957059383068352e-06, + "loss": 3.2061, + "step": 12285 + }, + { + "epoch": 0.8350319336866422, + "grad_norm": 7.186800003051758, + "learning_rate": 8.956634732980025e-06, + "loss": 3.2917, + "step": 12290 + }, + { + "epoch": 0.8353716537573039, + "grad_norm": 6.560112476348877, + "learning_rate": 8.956210082891698e-06, + "loss": 3.176, + "step": 12295 + }, + { + "epoch": 0.8357113738279658, + "grad_norm": 7.454448699951172, + "learning_rate": 8.95578543280337e-06, + "loss": 3.4266, + "step": 12300 + }, + { + "epoch": 0.8360510938986275, + "grad_norm": 9.211386680603027, + "learning_rate": 8.955360782715043e-06, + "loss": 3.1138, + "step": 12305 + }, + { + "epoch": 0.8363908139692893, + "grad_norm": 7.409143447875977, + "learning_rate": 8.954936132626716e-06, + "loss": 3.5137, + "step": 12310 + }, + { + "epoch": 0.8367305340399511, + "grad_norm": 7.66379976272583, + "learning_rate": 8.954511482538389e-06, + "loss": 3.3123, + "step": 12315 + }, + { + "epoch": 0.8370702541106129, + "grad_norm": 6.689318656921387, + "learning_rate": 8.954086832450062e-06, + "loss": 3.4277, + "step": 12320 + }, + { + "epoch": 0.8374099741812746, + "grad_norm": 6.561331272125244, + "learning_rate": 8.953662182361734e-06, + "loss": 3.4017, + "step": 12325 + }, + { + "epoch": 0.8377496942519363, + "grad_norm": 6.6735711097717285, + "learning_rate": 8.953237532273407e-06, + "loss": 3.0249, + "step": 12330 + }, + { + "epoch": 0.8380894143225982, + "grad_norm": 5.96565055847168, + "learning_rate": 8.95281288218508e-06, + "loss": 3.4899, + "step": 12335 + }, + { + "epoch": 0.83842913439326, + "grad_norm": 5.594418048858643, + "learning_rate": 8.952388232096753e-06, + "loss": 3.337, + "step": 12340 + }, + { + "epoch": 0.8387688544639217, + "grad_norm": 6.79469108581543, + "learning_rate": 8.951963582008426e-06, + "loss": 3.3441, + "step": 12345 + }, + { + "epoch": 0.8391085745345835, + "grad_norm": 6.4760613441467285, + "learning_rate": 8.951538931920098e-06, + "loss": 3.2458, + "step": 12350 + }, + { + "epoch": 0.8394482946052453, + "grad_norm": 5.994215488433838, + "learning_rate": 8.951114281831771e-06, + "loss": 3.3422, + "step": 12355 + }, + { + "epoch": 0.839788014675907, + "grad_norm": 7.002240180969238, + "learning_rate": 8.950689631743444e-06, + "loss": 3.2785, + "step": 12360 + }, + { + "epoch": 0.8401277347465689, + "grad_norm": 5.228823661804199, + "learning_rate": 8.950264981655117e-06, + "loss": 3.5019, + "step": 12365 + }, + { + "epoch": 0.8404674548172306, + "grad_norm": 8.782246589660645, + "learning_rate": 8.94984033156679e-06, + "loss": 3.5277, + "step": 12370 + }, + { + "epoch": 0.8408071748878924, + "grad_norm": 7.475368976593018, + "learning_rate": 8.949415681478462e-06, + "loss": 3.2994, + "step": 12375 + }, + { + "epoch": 0.8411468949585541, + "grad_norm": 7.014561176300049, + "learning_rate": 8.948991031390135e-06, + "loss": 3.3889, + "step": 12380 + }, + { + "epoch": 0.841486615029216, + "grad_norm": 7.870868682861328, + "learning_rate": 8.948566381301808e-06, + "loss": 3.5135, + "step": 12385 + }, + { + "epoch": 0.8418263350998777, + "grad_norm": 6.60628080368042, + "learning_rate": 8.948141731213481e-06, + "loss": 3.148, + "step": 12390 + }, + { + "epoch": 0.8421660551705394, + "grad_norm": 7.195352554321289, + "learning_rate": 8.947717081125154e-06, + "loss": 3.2557, + "step": 12395 + }, + { + "epoch": 0.8425057752412013, + "grad_norm": 7.36768913269043, + "learning_rate": 8.947292431036826e-06, + "loss": 3.4093, + "step": 12400 + }, + { + "epoch": 0.842845495311863, + "grad_norm": 6.081052780151367, + "learning_rate": 8.9468677809485e-06, + "loss": 3.3128, + "step": 12405 + }, + { + "epoch": 0.8431852153825248, + "grad_norm": 8.285329818725586, + "learning_rate": 8.946443130860172e-06, + "loss": 3.521, + "step": 12410 + }, + { + "epoch": 0.8435249354531865, + "grad_norm": 6.648558616638184, + "learning_rate": 8.946018480771845e-06, + "loss": 3.4165, + "step": 12415 + }, + { + "epoch": 0.8438646555238484, + "grad_norm": 5.638489723205566, + "learning_rate": 8.945593830683518e-06, + "loss": 3.5493, + "step": 12420 + }, + { + "epoch": 0.8442043755945101, + "grad_norm": 7.404422283172607, + "learning_rate": 8.94516918059519e-06, + "loss": 3.4514, + "step": 12425 + }, + { + "epoch": 0.8445440956651719, + "grad_norm": 6.265591144561768, + "learning_rate": 8.944744530506863e-06, + "loss": 3.283, + "step": 12430 + }, + { + "epoch": 0.8448838157358337, + "grad_norm": 5.941522121429443, + "learning_rate": 8.944319880418536e-06, + "loss": 3.4272, + "step": 12435 + }, + { + "epoch": 0.8452235358064955, + "grad_norm": 8.841819763183594, + "learning_rate": 8.943895230330209e-06, + "loss": 3.3176, + "step": 12440 + }, + { + "epoch": 0.8455632558771572, + "grad_norm": 6.318163871765137, + "learning_rate": 8.943470580241882e-06, + "loss": 3.2789, + "step": 12445 + }, + { + "epoch": 0.845902975947819, + "grad_norm": 6.32560396194458, + "learning_rate": 8.943045930153555e-06, + "loss": 3.323, + "step": 12450 + }, + { + "epoch": 0.8462426960184808, + "grad_norm": 8.141730308532715, + "learning_rate": 8.942621280065227e-06, + "loss": 3.6213, + "step": 12455 + }, + { + "epoch": 0.8465824160891425, + "grad_norm": 7.563545227050781, + "learning_rate": 8.9421966299769e-06, + "loss": 3.2831, + "step": 12460 + }, + { + "epoch": 0.8469221361598043, + "grad_norm": 5.476590633392334, + "learning_rate": 8.941771979888573e-06, + "loss": 3.419, + "step": 12465 + }, + { + "epoch": 0.8472618562304661, + "grad_norm": 6.978398323059082, + "learning_rate": 8.941347329800246e-06, + "loss": 3.3245, + "step": 12470 + }, + { + "epoch": 0.8476015763011279, + "grad_norm": 5.836533069610596, + "learning_rate": 8.940922679711919e-06, + "loss": 2.9511, + "step": 12475 + }, + { + "epoch": 0.8479412963717896, + "grad_norm": 7.979818344116211, + "learning_rate": 8.940498029623591e-06, + "loss": 3.1995, + "step": 12480 + }, + { + "epoch": 0.8482810164424515, + "grad_norm": 6.5490498542785645, + "learning_rate": 8.940073379535264e-06, + "loss": 3.3459, + "step": 12485 + }, + { + "epoch": 0.8486207365131132, + "grad_norm": 6.517265796661377, + "learning_rate": 8.939648729446937e-06, + "loss": 3.3395, + "step": 12490 + }, + { + "epoch": 0.848960456583775, + "grad_norm": 6.4955549240112305, + "learning_rate": 8.93922407935861e-06, + "loss": 3.4128, + "step": 12495 + }, + { + "epoch": 0.8493001766544367, + "grad_norm": 7.5016632080078125, + "learning_rate": 8.938799429270283e-06, + "loss": 3.3485, + "step": 12500 + }, + { + "epoch": 0.8496398967250985, + "grad_norm": 6.329983711242676, + "learning_rate": 8.938374779181955e-06, + "loss": 3.4006, + "step": 12505 + }, + { + "epoch": 0.8499796167957603, + "grad_norm": 7.688725471496582, + "learning_rate": 8.937950129093628e-06, + "loss": 3.1797, + "step": 12510 + }, + { + "epoch": 0.850319336866422, + "grad_norm": 5.9614410400390625, + "learning_rate": 8.937525479005301e-06, + "loss": 3.2249, + "step": 12515 + }, + { + "epoch": 0.8506590569370839, + "grad_norm": 6.198179244995117, + "learning_rate": 8.937100828916974e-06, + "loss": 3.3745, + "step": 12520 + }, + { + "epoch": 0.8509987770077456, + "grad_norm": 8.117182731628418, + "learning_rate": 8.936676178828647e-06, + "loss": 3.3645, + "step": 12525 + }, + { + "epoch": 0.8513384970784074, + "grad_norm": 7.716073036193848, + "learning_rate": 8.936251528740318e-06, + "loss": 3.528, + "step": 12530 + }, + { + "epoch": 0.8516782171490692, + "grad_norm": 7.479373455047607, + "learning_rate": 8.935826878651992e-06, + "loss": 3.4886, + "step": 12535 + }, + { + "epoch": 0.852017937219731, + "grad_norm": 7.09079647064209, + "learning_rate": 8.935402228563665e-06, + "loss": 3.4355, + "step": 12540 + }, + { + "epoch": 0.8523576572903927, + "grad_norm": 6.662778854370117, + "learning_rate": 8.934977578475336e-06, + "loss": 3.4432, + "step": 12545 + }, + { + "epoch": 0.8526973773610544, + "grad_norm": 6.953200817108154, + "learning_rate": 8.93455292838701e-06, + "loss": 3.4064, + "step": 12550 + }, + { + "epoch": 0.8530370974317163, + "grad_norm": 7.862915515899658, + "learning_rate": 8.934128278298683e-06, + "loss": 3.345, + "step": 12555 + }, + { + "epoch": 0.853376817502378, + "grad_norm": 7.532567024230957, + "learning_rate": 8.933703628210354e-06, + "loss": 3.2216, + "step": 12560 + }, + { + "epoch": 0.8537165375730398, + "grad_norm": 5.785308361053467, + "learning_rate": 8.933278978122029e-06, + "loss": 3.5501, + "step": 12565 + }, + { + "epoch": 0.8540562576437016, + "grad_norm": 7.726802825927734, + "learning_rate": 8.932854328033702e-06, + "loss": 3.1694, + "step": 12570 + }, + { + "epoch": 0.8543959777143634, + "grad_norm": 6.4960103034973145, + "learning_rate": 8.932429677945373e-06, + "loss": 3.3955, + "step": 12575 + }, + { + "epoch": 0.8547356977850251, + "grad_norm": 6.507502555847168, + "learning_rate": 8.932005027857047e-06, + "loss": 3.3417, + "step": 12580 + }, + { + "epoch": 0.8550754178556869, + "grad_norm": 6.54812479019165, + "learning_rate": 8.93158037776872e-06, + "loss": 3.0833, + "step": 12585 + }, + { + "epoch": 0.8554151379263487, + "grad_norm": 5.85428524017334, + "learning_rate": 8.931155727680391e-06, + "loss": 3.2983, + "step": 12590 + }, + { + "epoch": 0.8557548579970105, + "grad_norm": 6.954521179199219, + "learning_rate": 8.930731077592066e-06, + "loss": 3.457, + "step": 12595 + }, + { + "epoch": 0.8560945780676722, + "grad_norm": 7.023073196411133, + "learning_rate": 8.930306427503737e-06, + "loss": 3.4233, + "step": 12600 + }, + { + "epoch": 0.856434298138334, + "grad_norm": 7.992888450622559, + "learning_rate": 8.92988177741541e-06, + "loss": 3.0874, + "step": 12605 + }, + { + "epoch": 0.8567740182089958, + "grad_norm": 5.769635200500488, + "learning_rate": 8.929457127327084e-06, + "loss": 3.299, + "step": 12610 + }, + { + "epoch": 0.8571137382796575, + "grad_norm": 6.166844844818115, + "learning_rate": 8.929032477238755e-06, + "loss": 3.2689, + "step": 12615 + }, + { + "epoch": 0.8574534583503194, + "grad_norm": 8.936781883239746, + "learning_rate": 8.928607827150428e-06, + "loss": 3.6308, + "step": 12620 + }, + { + "epoch": 0.8577931784209811, + "grad_norm": 6.839977741241455, + "learning_rate": 8.928183177062103e-06, + "loss": 3.4385, + "step": 12625 + }, + { + "epoch": 0.8581328984916429, + "grad_norm": 6.638980388641357, + "learning_rate": 8.927758526973774e-06, + "loss": 3.4776, + "step": 12630 + }, + { + "epoch": 0.8584726185623046, + "grad_norm": 5.422125339508057, + "learning_rate": 8.927333876885446e-06, + "loss": 3.081, + "step": 12635 + }, + { + "epoch": 0.8588123386329665, + "grad_norm": 7.126796245574951, + "learning_rate": 8.926909226797121e-06, + "loss": 3.454, + "step": 12640 + }, + { + "epoch": 0.8591520587036282, + "grad_norm": 5.4309773445129395, + "learning_rate": 8.926484576708792e-06, + "loss": 3.3984, + "step": 12645 + }, + { + "epoch": 0.85949177877429, + "grad_norm": 6.46823787689209, + "learning_rate": 8.926059926620465e-06, + "loss": 3.2854, + "step": 12650 + }, + { + "epoch": 0.8598314988449518, + "grad_norm": 6.015686988830566, + "learning_rate": 8.92563527653214e-06, + "loss": 3.3514, + "step": 12655 + }, + { + "epoch": 0.8601712189156135, + "grad_norm": 6.788808345794678, + "learning_rate": 8.92521062644381e-06, + "loss": 3.312, + "step": 12660 + }, + { + "epoch": 0.8605109389862753, + "grad_norm": 6.841235160827637, + "learning_rate": 8.924785976355483e-06, + "loss": 3.3041, + "step": 12665 + }, + { + "epoch": 0.860850659056937, + "grad_norm": 7.140774250030518, + "learning_rate": 8.924361326267158e-06, + "loss": 3.2709, + "step": 12670 + }, + { + "epoch": 0.8611903791275989, + "grad_norm": 5.114997386932373, + "learning_rate": 8.923936676178829e-06, + "loss": 3.1737, + "step": 12675 + }, + { + "epoch": 0.8615300991982606, + "grad_norm": 7.623608589172363, + "learning_rate": 8.923512026090502e-06, + "loss": 3.2879, + "step": 12680 + }, + { + "epoch": 0.8618698192689224, + "grad_norm": 6.110797882080078, + "learning_rate": 8.923087376002174e-06, + "loss": 3.4053, + "step": 12685 + }, + { + "epoch": 0.8622095393395842, + "grad_norm": 7.6401286125183105, + "learning_rate": 8.922662725913847e-06, + "loss": 3.1676, + "step": 12690 + }, + { + "epoch": 0.862549259410246, + "grad_norm": 6.050968647003174, + "learning_rate": 8.92223807582552e-06, + "loss": 3.4366, + "step": 12695 + }, + { + "epoch": 0.8628889794809077, + "grad_norm": 8.30381965637207, + "learning_rate": 8.921813425737193e-06, + "loss": 3.3286, + "step": 12700 + }, + { + "epoch": 0.8632286995515696, + "grad_norm": 6.199645042419434, + "learning_rate": 8.921388775648866e-06, + "loss": 3.1662, + "step": 12705 + }, + { + "epoch": 0.8635684196222313, + "grad_norm": 7.964922904968262, + "learning_rate": 8.920964125560538e-06, + "loss": 3.1406, + "step": 12710 + }, + { + "epoch": 0.863908139692893, + "grad_norm": 7.921074390411377, + "learning_rate": 8.920539475472211e-06, + "loss": 3.4337, + "step": 12715 + }, + { + "epoch": 0.8642478597635548, + "grad_norm": 5.880890846252441, + "learning_rate": 8.920114825383884e-06, + "loss": 3.3212, + "step": 12720 + }, + { + "epoch": 0.8645875798342166, + "grad_norm": 6.141681671142578, + "learning_rate": 8.919690175295557e-06, + "loss": 3.4935, + "step": 12725 + }, + { + "epoch": 0.8649272999048784, + "grad_norm": 5.707877159118652, + "learning_rate": 8.91926552520723e-06, + "loss": 3.2585, + "step": 12730 + }, + { + "epoch": 0.8652670199755401, + "grad_norm": 7.017467975616455, + "learning_rate": 8.918840875118902e-06, + "loss": 3.5787, + "step": 12735 + }, + { + "epoch": 0.865606740046202, + "grad_norm": 7.979851245880127, + "learning_rate": 8.918416225030575e-06, + "loss": 3.477, + "step": 12740 + }, + { + "epoch": 0.8659464601168637, + "grad_norm": 7.853791236877441, + "learning_rate": 8.917991574942248e-06, + "loss": 3.4511, + "step": 12745 + }, + { + "epoch": 0.8662861801875255, + "grad_norm": 5.684115409851074, + "learning_rate": 8.917566924853921e-06, + "loss": 3.3667, + "step": 12750 + }, + { + "epoch": 0.8666259002581872, + "grad_norm": 5.512235164642334, + "learning_rate": 8.917142274765594e-06, + "loss": 3.2864, + "step": 12755 + }, + { + "epoch": 0.866965620328849, + "grad_norm": 6.132421493530273, + "learning_rate": 8.916717624677266e-06, + "loss": 3.2214, + "step": 12760 + }, + { + "epoch": 0.8673053403995108, + "grad_norm": 6.391918659210205, + "learning_rate": 8.91629297458894e-06, + "loss": 3.3446, + "step": 12765 + }, + { + "epoch": 0.8676450604701725, + "grad_norm": 6.854994297027588, + "learning_rate": 8.915868324500612e-06, + "loss": 3.4508, + "step": 12770 + }, + { + "epoch": 0.8679847805408344, + "grad_norm": 6.366187572479248, + "learning_rate": 8.915443674412285e-06, + "loss": 3.4796, + "step": 12775 + }, + { + "epoch": 0.8683245006114961, + "grad_norm": 5.94672966003418, + "learning_rate": 8.915019024323958e-06, + "loss": 3.2979, + "step": 12780 + }, + { + "epoch": 0.8686642206821579, + "grad_norm": 7.204557418823242, + "learning_rate": 8.91459437423563e-06, + "loss": 3.5246, + "step": 12785 + }, + { + "epoch": 0.8690039407528197, + "grad_norm": 7.451384544372559, + "learning_rate": 8.914169724147303e-06, + "loss": 3.2447, + "step": 12790 + }, + { + "epoch": 0.8693436608234815, + "grad_norm": 5.287717342376709, + "learning_rate": 8.913745074058976e-06, + "loss": 3.3112, + "step": 12795 + }, + { + "epoch": 0.8696833808941432, + "grad_norm": 6.151953220367432, + "learning_rate": 8.913320423970649e-06, + "loss": 3.4357, + "step": 12800 + }, + { + "epoch": 0.870023100964805, + "grad_norm": 5.998720169067383, + "learning_rate": 8.912895773882322e-06, + "loss": 3.5116, + "step": 12805 + }, + { + "epoch": 0.8703628210354668, + "grad_norm": 6.992258548736572, + "learning_rate": 8.912471123793994e-06, + "loss": 3.5546, + "step": 12810 + }, + { + "epoch": 0.8707025411061285, + "grad_norm": 8.062926292419434, + "learning_rate": 8.912046473705667e-06, + "loss": 3.3262, + "step": 12815 + }, + { + "epoch": 0.8710422611767903, + "grad_norm": 5.8680009841918945, + "learning_rate": 8.91162182361734e-06, + "loss": 3.4173, + "step": 12820 + }, + { + "epoch": 0.8713819812474521, + "grad_norm": 7.983694553375244, + "learning_rate": 8.911197173529013e-06, + "loss": 3.5533, + "step": 12825 + }, + { + "epoch": 0.8717217013181139, + "grad_norm": 5.72251558303833, + "learning_rate": 8.910772523440686e-06, + "loss": 3.3346, + "step": 12830 + }, + { + "epoch": 0.8720614213887756, + "grad_norm": 5.362501621246338, + "learning_rate": 8.910347873352358e-06, + "loss": 3.1878, + "step": 12835 + }, + { + "epoch": 0.8724011414594374, + "grad_norm": 7.950568675994873, + "learning_rate": 8.909923223264031e-06, + "loss": 3.2968, + "step": 12840 + }, + { + "epoch": 0.8727408615300992, + "grad_norm": 6.358529567718506, + "learning_rate": 8.909498573175704e-06, + "loss": 3.2998, + "step": 12845 + }, + { + "epoch": 0.873080581600761, + "grad_norm": 7.007701873779297, + "learning_rate": 8.909073923087377e-06, + "loss": 3.5433, + "step": 12850 + }, + { + "epoch": 0.8734203016714227, + "grad_norm": 6.246647834777832, + "learning_rate": 8.90864927299905e-06, + "loss": 3.437, + "step": 12855 + }, + { + "epoch": 0.8737600217420846, + "grad_norm": 6.029445171356201, + "learning_rate": 8.908224622910722e-06, + "loss": 3.2314, + "step": 12860 + }, + { + "epoch": 0.8740997418127463, + "grad_norm": 6.226210594177246, + "learning_rate": 8.907799972822395e-06, + "loss": 3.2933, + "step": 12865 + }, + { + "epoch": 0.874439461883408, + "grad_norm": 10.26089859008789, + "learning_rate": 8.907375322734068e-06, + "loss": 3.1367, + "step": 12870 + }, + { + "epoch": 0.8747791819540699, + "grad_norm": 6.311893939971924, + "learning_rate": 8.906950672645741e-06, + "loss": 3.3759, + "step": 12875 + }, + { + "epoch": 0.8751189020247316, + "grad_norm": 6.253583908081055, + "learning_rate": 8.906526022557414e-06, + "loss": 3.115, + "step": 12880 + }, + { + "epoch": 0.8754586220953934, + "grad_norm": 6.768709659576416, + "learning_rate": 8.906101372469086e-06, + "loss": 3.3492, + "step": 12885 + }, + { + "epoch": 0.8757983421660551, + "grad_norm": 5.997818946838379, + "learning_rate": 8.90567672238076e-06, + "loss": 3.2666, + "step": 12890 + }, + { + "epoch": 0.876138062236717, + "grad_norm": 6.924018383026123, + "learning_rate": 8.905252072292432e-06, + "loss": 3.4461, + "step": 12895 + }, + { + "epoch": 0.8764777823073787, + "grad_norm": 6.45501708984375, + "learning_rate": 8.904827422204105e-06, + "loss": 3.4733, + "step": 12900 + }, + { + "epoch": 0.8768175023780405, + "grad_norm": 7.263872146606445, + "learning_rate": 8.904402772115778e-06, + "loss": 3.2448, + "step": 12905 + }, + { + "epoch": 0.8771572224487023, + "grad_norm": 9.777731895446777, + "learning_rate": 8.90397812202745e-06, + "loss": 3.3952, + "step": 12910 + }, + { + "epoch": 0.877496942519364, + "grad_norm": 5.9612016677856445, + "learning_rate": 8.903553471939123e-06, + "loss": 3.0465, + "step": 12915 + }, + { + "epoch": 0.8778366625900258, + "grad_norm": 7.303951263427734, + "learning_rate": 8.903128821850796e-06, + "loss": 3.3021, + "step": 12920 + }, + { + "epoch": 0.8781763826606876, + "grad_norm": 7.072498798370361, + "learning_rate": 8.902704171762469e-06, + "loss": 3.0901, + "step": 12925 + }, + { + "epoch": 0.8785161027313494, + "grad_norm": 8.6417875289917, + "learning_rate": 8.902279521674142e-06, + "loss": 3.3085, + "step": 12930 + }, + { + "epoch": 0.8788558228020111, + "grad_norm": 6.515583038330078, + "learning_rate": 8.901854871585814e-06, + "loss": 3.3254, + "step": 12935 + }, + { + "epoch": 0.8791955428726729, + "grad_norm": 8.221056938171387, + "learning_rate": 8.901430221497487e-06, + "loss": 3.2952, + "step": 12940 + }, + { + "epoch": 0.8795352629433347, + "grad_norm": 6.139904975891113, + "learning_rate": 8.901005571409158e-06, + "loss": 3.3151, + "step": 12945 + }, + { + "epoch": 0.8798749830139965, + "grad_norm": 5.9420623779296875, + "learning_rate": 8.900580921320833e-06, + "loss": 3.0484, + "step": 12950 + }, + { + "epoch": 0.8802147030846582, + "grad_norm": 7.9142584800720215, + "learning_rate": 8.900156271232506e-06, + "loss": 3.4706, + "step": 12955 + }, + { + "epoch": 0.8805544231553201, + "grad_norm": 6.196759223937988, + "learning_rate": 8.899731621144177e-06, + "loss": 3.3427, + "step": 12960 + }, + { + "epoch": 0.8808941432259818, + "grad_norm": 5.946737766265869, + "learning_rate": 8.899306971055851e-06, + "loss": 3.3979, + "step": 12965 + }, + { + "epoch": 0.8812338632966435, + "grad_norm": 6.913461685180664, + "learning_rate": 8.898882320967524e-06, + "loss": 3.4361, + "step": 12970 + }, + { + "epoch": 0.8815735833673053, + "grad_norm": 6.451849937438965, + "learning_rate": 8.898457670879195e-06, + "loss": 3.2809, + "step": 12975 + }, + { + "epoch": 0.8819133034379671, + "grad_norm": 7.117389678955078, + "learning_rate": 8.89803302079087e-06, + "loss": 3.5061, + "step": 12980 + }, + { + "epoch": 0.8822530235086289, + "grad_norm": 7.778730869293213, + "learning_rate": 8.897608370702542e-06, + "loss": 3.2964, + "step": 12985 + }, + { + "epoch": 0.8825927435792906, + "grad_norm": 8.75714111328125, + "learning_rate": 8.897183720614214e-06, + "loss": 3.4384, + "step": 12990 + }, + { + "epoch": 0.8829324636499525, + "grad_norm": 6.776620388031006, + "learning_rate": 8.896759070525888e-06, + "loss": 3.3866, + "step": 12995 + }, + { + "epoch": 0.8832721837206142, + "grad_norm": 6.256683349609375, + "learning_rate": 8.896334420437561e-06, + "loss": 3.3589, + "step": 13000 + }, + { + "epoch": 0.883611903791276, + "grad_norm": 9.486576080322266, + "learning_rate": 8.895909770349232e-06, + "loss": 3.4276, + "step": 13005 + }, + { + "epoch": 0.8839516238619378, + "grad_norm": 5.233585357666016, + "learning_rate": 8.895485120260906e-06, + "loss": 3.1512, + "step": 13010 + }, + { + "epoch": 0.8842913439325996, + "grad_norm": 6.04524040222168, + "learning_rate": 8.89506047017258e-06, + "loss": 3.1665, + "step": 13015 + }, + { + "epoch": 0.8846310640032613, + "grad_norm": 5.702670574188232, + "learning_rate": 8.89463582008425e-06, + "loss": 3.1617, + "step": 13020 + }, + { + "epoch": 0.884970784073923, + "grad_norm": 7.40413236618042, + "learning_rate": 8.894211169995925e-06, + "loss": 3.2337, + "step": 13025 + }, + { + "epoch": 0.8853105041445849, + "grad_norm": 7.733927249908447, + "learning_rate": 8.893786519907596e-06, + "loss": 3.2779, + "step": 13030 + }, + { + "epoch": 0.8856502242152466, + "grad_norm": 7.52202844619751, + "learning_rate": 8.893361869819269e-06, + "loss": 3.4096, + "step": 13035 + }, + { + "epoch": 0.8859899442859084, + "grad_norm": 6.604290962219238, + "learning_rate": 8.892937219730943e-06, + "loss": 3.142, + "step": 13040 + }, + { + "epoch": 0.8863296643565702, + "grad_norm": 5.448315620422363, + "learning_rate": 8.892512569642614e-06, + "loss": 3.3013, + "step": 13045 + }, + { + "epoch": 0.886669384427232, + "grad_norm": 7.12876033782959, + "learning_rate": 8.892087919554287e-06, + "loss": 3.4581, + "step": 13050 + }, + { + "epoch": 0.8870091044978937, + "grad_norm": 6.9550371170043945, + "learning_rate": 8.891663269465962e-06, + "loss": 3.242, + "step": 13055 + }, + { + "epoch": 0.8873488245685555, + "grad_norm": 8.747074127197266, + "learning_rate": 8.891238619377633e-06, + "loss": 3.2816, + "step": 13060 + }, + { + "epoch": 0.8876885446392173, + "grad_norm": 6.191891193389893, + "learning_rate": 8.890813969289306e-06, + "loss": 3.3406, + "step": 13065 + }, + { + "epoch": 0.8880282647098791, + "grad_norm": 6.948544979095459, + "learning_rate": 8.89038931920098e-06, + "loss": 3.2485, + "step": 13070 + }, + { + "epoch": 0.8883679847805408, + "grad_norm": 6.862801551818848, + "learning_rate": 8.889964669112651e-06, + "loss": 3.3999, + "step": 13075 + }, + { + "epoch": 0.8887077048512027, + "grad_norm": 6.45248556137085, + "learning_rate": 8.889540019024324e-06, + "loss": 3.4102, + "step": 13080 + }, + { + "epoch": 0.8890474249218644, + "grad_norm": 7.3049635887146, + "learning_rate": 8.889115368935999e-06, + "loss": 3.3001, + "step": 13085 + }, + { + "epoch": 0.8893871449925261, + "grad_norm": 6.296259880065918, + "learning_rate": 8.88869071884767e-06, + "loss": 3.3097, + "step": 13090 + }, + { + "epoch": 0.889726865063188, + "grad_norm": 7.0207839012146, + "learning_rate": 8.888266068759342e-06, + "loss": 3.328, + "step": 13095 + }, + { + "epoch": 0.8900665851338497, + "grad_norm": 6.265158176422119, + "learning_rate": 8.887841418671015e-06, + "loss": 3.3101, + "step": 13100 + }, + { + "epoch": 0.8904063052045115, + "grad_norm": 7.338021755218506, + "learning_rate": 8.887416768582688e-06, + "loss": 3.3817, + "step": 13105 + }, + { + "epoch": 0.8907460252751732, + "grad_norm": 7.969470500946045, + "learning_rate": 8.88699211849436e-06, + "loss": 3.4148, + "step": 13110 + }, + { + "epoch": 0.8910857453458351, + "grad_norm": 6.438961982727051, + "learning_rate": 8.886567468406034e-06, + "loss": 3.2966, + "step": 13115 + }, + { + "epoch": 0.8914254654164968, + "grad_norm": 6.574192047119141, + "learning_rate": 8.886142818317706e-06, + "loss": 3.4289, + "step": 13120 + }, + { + "epoch": 0.8917651854871586, + "grad_norm": 6.345705032348633, + "learning_rate": 8.885718168229381e-06, + "loss": 3.0624, + "step": 13125 + }, + { + "epoch": 0.8921049055578204, + "grad_norm": 6.733394622802734, + "learning_rate": 8.885293518141052e-06, + "loss": 3.1702, + "step": 13130 + }, + { + "epoch": 0.8924446256284821, + "grad_norm": 8.232789039611816, + "learning_rate": 8.884868868052725e-06, + "loss": 3.2705, + "step": 13135 + }, + { + "epoch": 0.8927843456991439, + "grad_norm": 5.452823162078857, + "learning_rate": 8.8844442179644e-06, + "loss": 3.3195, + "step": 13140 + }, + { + "epoch": 0.8931240657698056, + "grad_norm": 6.452500820159912, + "learning_rate": 8.88401956787607e-06, + "loss": 3.4963, + "step": 13145 + }, + { + "epoch": 0.8934637858404675, + "grad_norm": 8.424601554870605, + "learning_rate": 8.883594917787743e-06, + "loss": 3.3458, + "step": 13150 + }, + { + "epoch": 0.8938035059111292, + "grad_norm": 5.662675380706787, + "learning_rate": 8.883170267699418e-06, + "loss": 3.5944, + "step": 13155 + }, + { + "epoch": 0.894143225981791, + "grad_norm": 7.308087348937988, + "learning_rate": 8.882745617611089e-06, + "loss": 3.0318, + "step": 13160 + }, + { + "epoch": 0.8944829460524528, + "grad_norm": 5.693304061889648, + "learning_rate": 8.882320967522762e-06, + "loss": 3.1652, + "step": 13165 + }, + { + "epoch": 0.8948226661231146, + "grad_norm": 6.163827896118164, + "learning_rate": 8.881896317434434e-06, + "loss": 3.1661, + "step": 13170 + }, + { + "epoch": 0.8951623861937763, + "grad_norm": 4.714858531951904, + "learning_rate": 8.881471667346107e-06, + "loss": 3.3642, + "step": 13175 + }, + { + "epoch": 0.8955021062644382, + "grad_norm": 8.191203117370605, + "learning_rate": 8.88104701725778e-06, + "loss": 3.3635, + "step": 13180 + }, + { + "epoch": 0.8958418263350999, + "grad_norm": 6.938210964202881, + "learning_rate": 8.880622367169453e-06, + "loss": 3.6228, + "step": 13185 + }, + { + "epoch": 0.8961815464057616, + "grad_norm": 5.973705768585205, + "learning_rate": 8.880197717081126e-06, + "loss": 3.2285, + "step": 13190 + }, + { + "epoch": 0.8965212664764234, + "grad_norm": 7.444724082946777, + "learning_rate": 8.879773066992798e-06, + "loss": 3.5423, + "step": 13195 + }, + { + "epoch": 0.8968609865470852, + "grad_norm": 6.640089988708496, + "learning_rate": 8.879348416904471e-06, + "loss": 3.2769, + "step": 13200 + }, + { + "epoch": 0.897200706617747, + "grad_norm": 6.334479331970215, + "learning_rate": 8.878923766816144e-06, + "loss": 3.2264, + "step": 13205 + }, + { + "epoch": 0.8975404266884087, + "grad_norm": 6.037014007568359, + "learning_rate": 8.878499116727817e-06, + "loss": 3.2506, + "step": 13210 + }, + { + "epoch": 0.8978801467590706, + "grad_norm": 8.240159034729004, + "learning_rate": 8.87807446663949e-06, + "loss": 3.2091, + "step": 13215 + }, + { + "epoch": 0.8982198668297323, + "grad_norm": 8.196845054626465, + "learning_rate": 8.877649816551162e-06, + "loss": 3.3278, + "step": 13220 + }, + { + "epoch": 0.8985595869003941, + "grad_norm": 8.163293838500977, + "learning_rate": 8.877225166462835e-06, + "loss": 3.5422, + "step": 13225 + }, + { + "epoch": 0.8988993069710558, + "grad_norm": 7.0655198097229, + "learning_rate": 8.876800516374508e-06, + "loss": 3.563, + "step": 13230 + }, + { + "epoch": 0.8992390270417177, + "grad_norm": 7.227688312530518, + "learning_rate": 8.87637586628618e-06, + "loss": 3.2618, + "step": 13235 + }, + { + "epoch": 0.8995787471123794, + "grad_norm": 5.8480095863342285, + "learning_rate": 8.875951216197854e-06, + "loss": 3.4482, + "step": 13240 + }, + { + "epoch": 0.8999184671830411, + "grad_norm": 5.614614963531494, + "learning_rate": 8.875526566109526e-06, + "loss": 3.0914, + "step": 13245 + }, + { + "epoch": 0.900258187253703, + "grad_norm": 6.735170364379883, + "learning_rate": 8.8751019160212e-06, + "loss": 3.2953, + "step": 13250 + }, + { + "epoch": 0.9005979073243647, + "grad_norm": 6.056962966918945, + "learning_rate": 8.874677265932872e-06, + "loss": 3.2422, + "step": 13255 + }, + { + "epoch": 0.9009376273950265, + "grad_norm": 6.682920932769775, + "learning_rate": 8.874252615844545e-06, + "loss": 3.2701, + "step": 13260 + }, + { + "epoch": 0.9012773474656883, + "grad_norm": 5.777838230133057, + "learning_rate": 8.873827965756218e-06, + "loss": 3.2898, + "step": 13265 + }, + { + "epoch": 0.9016170675363501, + "grad_norm": 7.545052528381348, + "learning_rate": 8.87340331566789e-06, + "loss": 3.3563, + "step": 13270 + }, + { + "epoch": 0.9019567876070118, + "grad_norm": 7.674961566925049, + "learning_rate": 8.872978665579563e-06, + "loss": 3.4486, + "step": 13275 + }, + { + "epoch": 0.9022965076776736, + "grad_norm": 7.682774543762207, + "learning_rate": 8.872554015491236e-06, + "loss": 3.692, + "step": 13280 + }, + { + "epoch": 0.9026362277483354, + "grad_norm": 5.458476543426514, + "learning_rate": 8.872129365402909e-06, + "loss": 3.3477, + "step": 13285 + }, + { + "epoch": 0.9029759478189971, + "grad_norm": 7.505293846130371, + "learning_rate": 8.871704715314582e-06, + "loss": 3.3103, + "step": 13290 + }, + { + "epoch": 0.9033156678896589, + "grad_norm": 6.461740493774414, + "learning_rate": 8.871280065226254e-06, + "loss": 3.6056, + "step": 13295 + }, + { + "epoch": 0.9036553879603207, + "grad_norm": 6.571490287780762, + "learning_rate": 8.870855415137927e-06, + "loss": 3.3525, + "step": 13300 + }, + { + "epoch": 0.9039951080309825, + "grad_norm": 6.846792697906494, + "learning_rate": 8.8704307650496e-06, + "loss": 3.1307, + "step": 13305 + }, + { + "epoch": 0.9043348281016442, + "grad_norm": 6.2226738929748535, + "learning_rate": 8.870006114961273e-06, + "loss": 3.5022, + "step": 13310 + }, + { + "epoch": 0.904674548172306, + "grad_norm": 5.929004669189453, + "learning_rate": 8.869581464872946e-06, + "loss": 3.5593, + "step": 13315 + }, + { + "epoch": 0.9050142682429678, + "grad_norm": 6.00783109664917, + "learning_rate": 8.869156814784618e-06, + "loss": 3.1303, + "step": 13320 + }, + { + "epoch": 0.9053539883136296, + "grad_norm": 6.57087516784668, + "learning_rate": 8.868732164696291e-06, + "loss": 3.2055, + "step": 13325 + }, + { + "epoch": 0.9056937083842913, + "grad_norm": 6.91628360748291, + "learning_rate": 8.868307514607964e-06, + "loss": 3.2542, + "step": 13330 + }, + { + "epoch": 0.9060334284549532, + "grad_norm": 7.02885103225708, + "learning_rate": 8.867882864519637e-06, + "loss": 3.4904, + "step": 13335 + }, + { + "epoch": 0.9063731485256149, + "grad_norm": 6.963503360748291, + "learning_rate": 8.86745821443131e-06, + "loss": 3.2225, + "step": 13340 + }, + { + "epoch": 0.9067128685962766, + "grad_norm": 5.870925426483154, + "learning_rate": 8.867033564342982e-06, + "loss": 3.3815, + "step": 13345 + }, + { + "epoch": 0.9070525886669385, + "grad_norm": 8.078207015991211, + "learning_rate": 8.866608914254655e-06, + "loss": 3.3865, + "step": 13350 + }, + { + "epoch": 0.9073923087376002, + "grad_norm": 5.988964557647705, + "learning_rate": 8.866184264166328e-06, + "loss": 3.3006, + "step": 13355 + }, + { + "epoch": 0.907732028808262, + "grad_norm": 7.216220855712891, + "learning_rate": 8.865759614078e-06, + "loss": 3.297, + "step": 13360 + }, + { + "epoch": 0.9080717488789237, + "grad_norm": 5.835912227630615, + "learning_rate": 8.865334963989674e-06, + "loss": 3.1239, + "step": 13365 + }, + { + "epoch": 0.9084114689495856, + "grad_norm": 7.316339015960693, + "learning_rate": 8.864910313901346e-06, + "loss": 3.3273, + "step": 13370 + }, + { + "epoch": 0.9087511890202473, + "grad_norm": 6.498430252075195, + "learning_rate": 8.864485663813018e-06, + "loss": 2.9689, + "step": 13375 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 6.59779691696167, + "learning_rate": 8.864061013724692e-06, + "loss": 3.4678, + "step": 13380 + }, + { + "epoch": 0.9094306291615709, + "grad_norm": 6.432597637176514, + "learning_rate": 8.863636363636365e-06, + "loss": 3.4694, + "step": 13385 + }, + { + "epoch": 0.9097703492322327, + "grad_norm": 8.79452896118164, + "learning_rate": 8.863211713548036e-06, + "loss": 3.5122, + "step": 13390 + }, + { + "epoch": 0.9101100693028944, + "grad_norm": 5.904415130615234, + "learning_rate": 8.86278706345971e-06, + "loss": 3.1203, + "step": 13395 + }, + { + "epoch": 0.9104497893735561, + "grad_norm": 6.891711711883545, + "learning_rate": 8.862362413371383e-06, + "loss": 3.1464, + "step": 13400 + }, + { + "epoch": 0.910789509444218, + "grad_norm": 7.71150016784668, + "learning_rate": 8.861937763283054e-06, + "loss": 3.666, + "step": 13405 + }, + { + "epoch": 0.9111292295148797, + "grad_norm": 5.617831230163574, + "learning_rate": 8.861513113194729e-06, + "loss": 3.0955, + "step": 13410 + }, + { + "epoch": 0.9114689495855415, + "grad_norm": 5.302960395812988, + "learning_rate": 8.861088463106402e-06, + "loss": 3.3881, + "step": 13415 + }, + { + "epoch": 0.9118086696562033, + "grad_norm": 6.535092830657959, + "learning_rate": 8.860663813018073e-06, + "loss": 3.501, + "step": 13420 + }, + { + "epoch": 0.9121483897268651, + "grad_norm": 7.458796977996826, + "learning_rate": 8.860239162929747e-06, + "loss": 3.3395, + "step": 13425 + }, + { + "epoch": 0.9124881097975268, + "grad_norm": 5.888338565826416, + "learning_rate": 8.85981451284142e-06, + "loss": 3.0924, + "step": 13430 + }, + { + "epoch": 0.9128278298681887, + "grad_norm": 6.837749481201172, + "learning_rate": 8.859389862753091e-06, + "loss": 3.3662, + "step": 13435 + }, + { + "epoch": 0.9131675499388504, + "grad_norm": 7.950228214263916, + "learning_rate": 8.858965212664766e-06, + "loss": 3.2454, + "step": 13440 + }, + { + "epoch": 0.9135072700095122, + "grad_norm": 6.496403217315674, + "learning_rate": 8.858540562576437e-06, + "loss": 3.2061, + "step": 13445 + }, + { + "epoch": 0.9138469900801739, + "grad_norm": 7.130161762237549, + "learning_rate": 8.85811591248811e-06, + "loss": 3.1252, + "step": 13450 + }, + { + "epoch": 0.9141867101508357, + "grad_norm": 5.815441131591797, + "learning_rate": 8.857691262399784e-06, + "loss": 3.2989, + "step": 13455 + }, + { + "epoch": 0.9145264302214975, + "grad_norm": 7.5626301765441895, + "learning_rate": 8.857266612311455e-06, + "loss": 3.5223, + "step": 13460 + }, + { + "epoch": 0.9148661502921592, + "grad_norm": 6.723893642425537, + "learning_rate": 8.85684196222313e-06, + "loss": 3.3576, + "step": 13465 + }, + { + "epoch": 0.9152058703628211, + "grad_norm": 6.412136554718018, + "learning_rate": 8.856417312134802e-06, + "loss": 3.4848, + "step": 13470 + }, + { + "epoch": 0.9155455904334828, + "grad_norm": 6.2787275314331055, + "learning_rate": 8.855992662046474e-06, + "loss": 3.2943, + "step": 13475 + }, + { + "epoch": 0.9158853105041446, + "grad_norm": 7.0774922370910645, + "learning_rate": 8.855568011958148e-06, + "loss": 3.2368, + "step": 13480 + }, + { + "epoch": 0.9162250305748063, + "grad_norm": 5.875144958496094, + "learning_rate": 8.855143361869821e-06, + "loss": 3.3013, + "step": 13485 + }, + { + "epoch": 0.9165647506454682, + "grad_norm": 6.674097061157227, + "learning_rate": 8.854718711781492e-06, + "loss": 3.3759, + "step": 13490 + }, + { + "epoch": 0.9169044707161299, + "grad_norm": 8.254383087158203, + "learning_rate": 8.854294061693166e-06, + "loss": 3.3381, + "step": 13495 + }, + { + "epoch": 0.9172441907867916, + "grad_norm": 6.587164878845215, + "learning_rate": 8.85386941160484e-06, + "loss": 3.2535, + "step": 13500 + }, + { + "epoch": 0.9175839108574535, + "grad_norm": 6.401578426361084, + "learning_rate": 8.85344476151651e-06, + "loss": 3.41, + "step": 13505 + }, + { + "epoch": 0.9179236309281152, + "grad_norm": 6.055038928985596, + "learning_rate": 8.853020111428185e-06, + "loss": 3.5956, + "step": 13510 + }, + { + "epoch": 0.918263350998777, + "grad_norm": 6.651979446411133, + "learning_rate": 8.852595461339856e-06, + "loss": 2.8837, + "step": 13515 + }, + { + "epoch": 0.9186030710694388, + "grad_norm": 8.957976341247559, + "learning_rate": 8.852170811251529e-06, + "loss": 3.4016, + "step": 13520 + }, + { + "epoch": 0.9189427911401006, + "grad_norm": 6.4627299308776855, + "learning_rate": 8.851746161163203e-06, + "loss": 3.3242, + "step": 13525 + }, + { + "epoch": 0.9192825112107623, + "grad_norm": 6.953244209289551, + "learning_rate": 8.851321511074874e-06, + "loss": 3.0527, + "step": 13530 + }, + { + "epoch": 0.9196222312814241, + "grad_norm": 6.025221824645996, + "learning_rate": 8.850896860986547e-06, + "loss": 3.4776, + "step": 13535 + }, + { + "epoch": 0.9199619513520859, + "grad_norm": 6.311152935028076, + "learning_rate": 8.850472210898222e-06, + "loss": 3.5806, + "step": 13540 + }, + { + "epoch": 0.9203016714227477, + "grad_norm": 7.300408363342285, + "learning_rate": 8.850047560809893e-06, + "loss": 3.1891, + "step": 13545 + }, + { + "epoch": 0.9206413914934094, + "grad_norm": 6.835318565368652, + "learning_rate": 8.849622910721566e-06, + "loss": 3.4589, + "step": 13550 + }, + { + "epoch": 0.9209811115640713, + "grad_norm": 7.7573041915893555, + "learning_rate": 8.84919826063324e-06, + "loss": 3.4296, + "step": 13555 + }, + { + "epoch": 0.921320831634733, + "grad_norm": 7.197726249694824, + "learning_rate": 8.848773610544911e-06, + "loss": 3.3143, + "step": 13560 + }, + { + "epoch": 0.9216605517053947, + "grad_norm": 7.95094108581543, + "learning_rate": 8.848348960456584e-06, + "loss": 3.4004, + "step": 13565 + }, + { + "epoch": 0.9220002717760565, + "grad_norm": 6.085048198699951, + "learning_rate": 8.847924310368258e-06, + "loss": 3.3775, + "step": 13570 + }, + { + "epoch": 0.9223399918467183, + "grad_norm": 5.830416679382324, + "learning_rate": 8.84749966027993e-06, + "loss": 3.2841, + "step": 13575 + }, + { + "epoch": 0.9226797119173801, + "grad_norm": 6.464962959289551, + "learning_rate": 8.847075010191602e-06, + "loss": 3.2207, + "step": 13580 + }, + { + "epoch": 0.9230194319880418, + "grad_norm": 6.663308620452881, + "learning_rate": 8.846650360103277e-06, + "loss": 3.1243, + "step": 13585 + }, + { + "epoch": 0.9233591520587037, + "grad_norm": 6.360827922821045, + "learning_rate": 8.846225710014948e-06, + "loss": 3.2796, + "step": 13590 + }, + { + "epoch": 0.9236988721293654, + "grad_norm": 6.624440670013428, + "learning_rate": 8.84580105992662e-06, + "loss": 3.2604, + "step": 13595 + }, + { + "epoch": 0.9240385922000272, + "grad_norm": 6.170647144317627, + "learning_rate": 8.845376409838294e-06, + "loss": 3.2133, + "step": 13600 + }, + { + "epoch": 0.924378312270689, + "grad_norm": 6.856496810913086, + "learning_rate": 8.844951759749966e-06, + "loss": 3.4498, + "step": 13605 + }, + { + "epoch": 0.9247180323413507, + "grad_norm": 6.296213150024414, + "learning_rate": 8.84452710966164e-06, + "loss": 3.4225, + "step": 13610 + }, + { + "epoch": 0.9250577524120125, + "grad_norm": 7.534445762634277, + "learning_rate": 8.844102459573312e-06, + "loss": 3.3842, + "step": 13615 + }, + { + "epoch": 0.9253974724826742, + "grad_norm": 6.811347007751465, + "learning_rate": 8.843677809484985e-06, + "loss": 3.379, + "step": 13620 + }, + { + "epoch": 0.9257371925533361, + "grad_norm": 5.260218143463135, + "learning_rate": 8.843253159396658e-06, + "loss": 3.4408, + "step": 13625 + }, + { + "epoch": 0.9260769126239978, + "grad_norm": 4.666255474090576, + "learning_rate": 8.84282850930833e-06, + "loss": 3.1642, + "step": 13630 + }, + { + "epoch": 0.9264166326946596, + "grad_norm": 6.763018608093262, + "learning_rate": 8.842403859220003e-06, + "loss": 3.4071, + "step": 13635 + }, + { + "epoch": 0.9267563527653214, + "grad_norm": 8.026787757873535, + "learning_rate": 8.841979209131676e-06, + "loss": 3.2595, + "step": 13640 + }, + { + "epoch": 0.9270960728359832, + "grad_norm": 6.171877861022949, + "learning_rate": 8.841554559043349e-06, + "loss": 3.1111, + "step": 13645 + }, + { + "epoch": 0.9274357929066449, + "grad_norm": 7.280976295471191, + "learning_rate": 8.841129908955022e-06, + "loss": 3.493, + "step": 13650 + }, + { + "epoch": 0.9277755129773066, + "grad_norm": 7.447112560272217, + "learning_rate": 8.840705258866694e-06, + "loss": 3.3913, + "step": 13655 + }, + { + "epoch": 0.9281152330479685, + "grad_norm": 6.177844524383545, + "learning_rate": 8.840280608778367e-06, + "loss": 3.3931, + "step": 13660 + }, + { + "epoch": 0.9284549531186302, + "grad_norm": 6.131439685821533, + "learning_rate": 8.83985595869004e-06, + "loss": 3.2689, + "step": 13665 + }, + { + "epoch": 0.928794673189292, + "grad_norm": 6.2535014152526855, + "learning_rate": 8.839431308601713e-06, + "loss": 3.3907, + "step": 13670 + }, + { + "epoch": 0.9291343932599538, + "grad_norm": 7.424090385437012, + "learning_rate": 8.839006658513386e-06, + "loss": 3.4224, + "step": 13675 + }, + { + "epoch": 0.9294741133306156, + "grad_norm": 7.201848983764648, + "learning_rate": 8.838582008425058e-06, + "loss": 3.3785, + "step": 13680 + }, + { + "epoch": 0.9298138334012773, + "grad_norm": 7.486930847167969, + "learning_rate": 8.838157358336731e-06, + "loss": 3.1377, + "step": 13685 + }, + { + "epoch": 0.9301535534719392, + "grad_norm": 6.59567403793335, + "learning_rate": 8.837732708248404e-06, + "loss": 3.1556, + "step": 13690 + }, + { + "epoch": 0.9304932735426009, + "grad_norm": 8.190057754516602, + "learning_rate": 8.837308058160077e-06, + "loss": 3.3478, + "step": 13695 + }, + { + "epoch": 0.9308329936132627, + "grad_norm": 6.33314847946167, + "learning_rate": 8.83688340807175e-06, + "loss": 3.1882, + "step": 13700 + }, + { + "epoch": 0.9311727136839244, + "grad_norm": 7.009542465209961, + "learning_rate": 8.836458757983422e-06, + "loss": 3.3325, + "step": 13705 + }, + { + "epoch": 0.9315124337545863, + "grad_norm": 5.874746322631836, + "learning_rate": 8.836034107895095e-06, + "loss": 3.2852, + "step": 13710 + }, + { + "epoch": 0.931852153825248, + "grad_norm": 6.586053371429443, + "learning_rate": 8.835609457806768e-06, + "loss": 3.3709, + "step": 13715 + }, + { + "epoch": 0.9321918738959097, + "grad_norm": 6.893450736999512, + "learning_rate": 8.83518480771844e-06, + "loss": 3.2978, + "step": 13720 + }, + { + "epoch": 0.9325315939665716, + "grad_norm": 6.895447254180908, + "learning_rate": 8.834760157630114e-06, + "loss": 3.0738, + "step": 13725 + }, + { + "epoch": 0.9328713140372333, + "grad_norm": 6.037014961242676, + "learning_rate": 8.834335507541786e-06, + "loss": 3.246, + "step": 13730 + }, + { + "epoch": 0.9332110341078951, + "grad_norm": 7.375148296356201, + "learning_rate": 8.83391085745346e-06, + "loss": 3.5107, + "step": 13735 + }, + { + "epoch": 0.9335507541785568, + "grad_norm": 6.2194108963012695, + "learning_rate": 8.833486207365132e-06, + "loss": 3.2683, + "step": 13740 + }, + { + "epoch": 0.9338904742492187, + "grad_norm": 7.081309795379639, + "learning_rate": 8.833061557276805e-06, + "loss": 3.4954, + "step": 13745 + }, + { + "epoch": 0.9342301943198804, + "grad_norm": 7.698179721832275, + "learning_rate": 8.832636907188478e-06, + "loss": 3.0386, + "step": 13750 + }, + { + "epoch": 0.9345699143905422, + "grad_norm": 5.752802848815918, + "learning_rate": 8.83221225710015e-06, + "loss": 3.5363, + "step": 13755 + }, + { + "epoch": 0.934909634461204, + "grad_norm": 8.244654655456543, + "learning_rate": 8.831787607011823e-06, + "loss": 3.3323, + "step": 13760 + }, + { + "epoch": 0.9352493545318658, + "grad_norm": 6.584081649780273, + "learning_rate": 8.831362956923496e-06, + "loss": 3.3449, + "step": 13765 + }, + { + "epoch": 0.9355890746025275, + "grad_norm": 5.325397491455078, + "learning_rate": 8.830938306835169e-06, + "loss": 3.322, + "step": 13770 + }, + { + "epoch": 0.9359287946731893, + "grad_norm": 7.0776777267456055, + "learning_rate": 8.830513656746842e-06, + "loss": 3.1451, + "step": 13775 + }, + { + "epoch": 0.9362685147438511, + "grad_norm": 7.346451282501221, + "learning_rate": 8.830089006658514e-06, + "loss": 3.2007, + "step": 13780 + }, + { + "epoch": 0.9366082348145128, + "grad_norm": 5.324946880340576, + "learning_rate": 8.829664356570187e-06, + "loss": 3.3463, + "step": 13785 + }, + { + "epoch": 0.9369479548851746, + "grad_norm": 5.642464637756348, + "learning_rate": 8.829239706481858e-06, + "loss": 3.2487, + "step": 13790 + }, + { + "epoch": 0.9372876749558364, + "grad_norm": 7.979766845703125, + "learning_rate": 8.828815056393533e-06, + "loss": 3.383, + "step": 13795 + }, + { + "epoch": 0.9376273950264982, + "grad_norm": 5.425391674041748, + "learning_rate": 8.828390406305206e-06, + "loss": 3.0309, + "step": 13800 + }, + { + "epoch": 0.9379671150971599, + "grad_norm": 5.308307647705078, + "learning_rate": 8.827965756216878e-06, + "loss": 3.3161, + "step": 13805 + }, + { + "epoch": 0.9383068351678218, + "grad_norm": 7.118289470672607, + "learning_rate": 8.827541106128551e-06, + "loss": 3.11, + "step": 13810 + }, + { + "epoch": 0.9386465552384835, + "grad_norm": 6.741113662719727, + "learning_rate": 8.827116456040224e-06, + "loss": 3.2964, + "step": 13815 + }, + { + "epoch": 0.9389862753091452, + "grad_norm": 5.685742378234863, + "learning_rate": 8.826691805951897e-06, + "loss": 3.4348, + "step": 13820 + }, + { + "epoch": 0.939325995379807, + "grad_norm": 6.356738567352295, + "learning_rate": 8.82626715586357e-06, + "loss": 3.0397, + "step": 13825 + }, + { + "epoch": 0.9396657154504688, + "grad_norm": 7.320123672485352, + "learning_rate": 8.825842505775242e-06, + "loss": 3.4294, + "step": 13830 + }, + { + "epoch": 0.9400054355211306, + "grad_norm": 6.392636775970459, + "learning_rate": 8.825417855686915e-06, + "loss": 3.2283, + "step": 13835 + }, + { + "epoch": 0.9403451555917923, + "grad_norm": 5.583836555480957, + "learning_rate": 8.824993205598588e-06, + "loss": 3.2384, + "step": 13840 + }, + { + "epoch": 0.9406848756624542, + "grad_norm": 7.84402322769165, + "learning_rate": 8.82456855551026e-06, + "loss": 3.1388, + "step": 13845 + }, + { + "epoch": 0.9410245957331159, + "grad_norm": 5.915610313415527, + "learning_rate": 8.824143905421934e-06, + "loss": 3.3624, + "step": 13850 + }, + { + "epoch": 0.9413643158037777, + "grad_norm": 5.2784929275512695, + "learning_rate": 8.823719255333606e-06, + "loss": 3.4005, + "step": 13855 + }, + { + "epoch": 0.9417040358744395, + "grad_norm": 6.06076192855835, + "learning_rate": 8.823294605245278e-06, + "loss": 3.2427, + "step": 13860 + }, + { + "epoch": 0.9420437559451013, + "grad_norm": 8.506467819213867, + "learning_rate": 8.822869955156952e-06, + "loss": 3.0584, + "step": 13865 + }, + { + "epoch": 0.942383476015763, + "grad_norm": 7.913955211639404, + "learning_rate": 8.822445305068625e-06, + "loss": 3.2774, + "step": 13870 + }, + { + "epoch": 0.9427231960864247, + "grad_norm": 6.723132133483887, + "learning_rate": 8.822020654980296e-06, + "loss": 3.2846, + "step": 13875 + }, + { + "epoch": 0.9430629161570866, + "grad_norm": 7.857599258422852, + "learning_rate": 8.82159600489197e-06, + "loss": 3.277, + "step": 13880 + }, + { + "epoch": 0.9434026362277483, + "grad_norm": 6.601655960083008, + "learning_rate": 8.821171354803643e-06, + "loss": 3.1388, + "step": 13885 + }, + { + "epoch": 0.9437423562984101, + "grad_norm": 6.4948930740356445, + "learning_rate": 8.820746704715314e-06, + "loss": 3.5017, + "step": 13890 + }, + { + "epoch": 0.9440820763690719, + "grad_norm": 6.484705448150635, + "learning_rate": 8.820322054626989e-06, + "loss": 3.4911, + "step": 13895 + }, + { + "epoch": 0.9444217964397337, + "grad_norm": 7.678147315979004, + "learning_rate": 8.819897404538662e-06, + "loss": 3.2107, + "step": 13900 + }, + { + "epoch": 0.9447615165103954, + "grad_norm": 6.247237682342529, + "learning_rate": 8.819472754450333e-06, + "loss": 3.3186, + "step": 13905 + }, + { + "epoch": 0.9451012365810572, + "grad_norm": 6.924861431121826, + "learning_rate": 8.819048104362007e-06, + "loss": 3.552, + "step": 13910 + }, + { + "epoch": 0.945440956651719, + "grad_norm": 7.816624164581299, + "learning_rate": 8.81862345427368e-06, + "loss": 3.2872, + "step": 13915 + }, + { + "epoch": 0.9457806767223808, + "grad_norm": 6.2653021812438965, + "learning_rate": 8.818198804185351e-06, + "loss": 3.1191, + "step": 13920 + }, + { + "epoch": 0.9461203967930425, + "grad_norm": 5.9118852615356445, + "learning_rate": 8.817774154097026e-06, + "loss": 3.4544, + "step": 13925 + }, + { + "epoch": 0.9464601168637043, + "grad_norm": 7.161766052246094, + "learning_rate": 8.817349504008697e-06, + "loss": 3.304, + "step": 13930 + }, + { + "epoch": 0.9467998369343661, + "grad_norm": 7.188033580780029, + "learning_rate": 8.81692485392037e-06, + "loss": 3.2305, + "step": 13935 + }, + { + "epoch": 0.9471395570050278, + "grad_norm": 9.439506530761719, + "learning_rate": 8.816500203832044e-06, + "loss": 3.5701, + "step": 13940 + }, + { + "epoch": 0.9474792770756897, + "grad_norm": 6.838264465332031, + "learning_rate": 8.816075553743715e-06, + "loss": 3.327, + "step": 13945 + }, + { + "epoch": 0.9478189971463514, + "grad_norm": 7.104968547821045, + "learning_rate": 8.815650903655388e-06, + "loss": 3.3464, + "step": 13950 + }, + { + "epoch": 0.9481587172170132, + "grad_norm": 5.336981296539307, + "learning_rate": 8.815226253567062e-06, + "loss": 3.1871, + "step": 13955 + }, + { + "epoch": 0.9484984372876749, + "grad_norm": 6.149016380310059, + "learning_rate": 8.814801603478734e-06, + "loss": 3.2114, + "step": 13960 + }, + { + "epoch": 0.9488381573583368, + "grad_norm": 6.59226655960083, + "learning_rate": 8.814376953390406e-06, + "loss": 3.1362, + "step": 13965 + }, + { + "epoch": 0.9491778774289985, + "grad_norm": 9.111952781677246, + "learning_rate": 8.813952303302081e-06, + "loss": 3.5084, + "step": 13970 + }, + { + "epoch": 0.9495175974996602, + "grad_norm": 6.648433208465576, + "learning_rate": 8.813527653213752e-06, + "loss": 3.3602, + "step": 13975 + }, + { + "epoch": 0.9498573175703221, + "grad_norm": 5.474483489990234, + "learning_rate": 8.813103003125425e-06, + "loss": 3.4505, + "step": 13980 + }, + { + "epoch": 0.9501970376409838, + "grad_norm": 6.133967399597168, + "learning_rate": 8.8126783530371e-06, + "loss": 3.21, + "step": 13985 + }, + { + "epoch": 0.9505367577116456, + "grad_norm": 6.1313557624816895, + "learning_rate": 8.81225370294877e-06, + "loss": 3.2717, + "step": 13990 + }, + { + "epoch": 0.9508764777823073, + "grad_norm": 7.475008487701416, + "learning_rate": 8.811829052860443e-06, + "loss": 3.4232, + "step": 13995 + }, + { + "epoch": 0.9512161978529692, + "grad_norm": 6.123263835906982, + "learning_rate": 8.811404402772118e-06, + "loss": 3.369, + "step": 14000 + }, + { + "epoch": 0.9515559179236309, + "grad_norm": 4.874521255493164, + "learning_rate": 8.810979752683789e-06, + "loss": 2.9688, + "step": 14005 + }, + { + "epoch": 0.9518956379942927, + "grad_norm": 6.006505489349365, + "learning_rate": 8.810555102595462e-06, + "loss": 3.1379, + "step": 14010 + }, + { + "epoch": 0.9522353580649545, + "grad_norm": 6.928956508636475, + "learning_rate": 8.810130452507134e-06, + "loss": 3.3203, + "step": 14015 + }, + { + "epoch": 0.9525750781356163, + "grad_norm": 6.069747447967529, + "learning_rate": 8.809705802418807e-06, + "loss": 3.372, + "step": 14020 + }, + { + "epoch": 0.952914798206278, + "grad_norm": 7.280343532562256, + "learning_rate": 8.80928115233048e-06, + "loss": 3.395, + "step": 14025 + }, + { + "epoch": 0.9532545182769399, + "grad_norm": 5.628610610961914, + "learning_rate": 8.808856502242153e-06, + "loss": 3.2866, + "step": 14030 + }, + { + "epoch": 0.9535942383476016, + "grad_norm": 5.965348720550537, + "learning_rate": 8.808431852153826e-06, + "loss": 3.2667, + "step": 14035 + }, + { + "epoch": 0.9539339584182633, + "grad_norm": 6.106235504150391, + "learning_rate": 8.808007202065498e-06, + "loss": 3.3398, + "step": 14040 + }, + { + "epoch": 0.9542736784889251, + "grad_norm": 8.059491157531738, + "learning_rate": 8.807582551977171e-06, + "loss": 3.3904, + "step": 14045 + }, + { + "epoch": 0.9546133985595869, + "grad_norm": 6.679736137390137, + "learning_rate": 8.807157901888844e-06, + "loss": 3.6013, + "step": 14050 + }, + { + "epoch": 0.9549531186302487, + "grad_norm": 6.093998432159424, + "learning_rate": 8.806733251800517e-06, + "loss": 3.321, + "step": 14055 + }, + { + "epoch": 0.9552928387009104, + "grad_norm": 6.823462009429932, + "learning_rate": 8.80630860171219e-06, + "loss": 3.172, + "step": 14060 + }, + { + "epoch": 0.9556325587715723, + "grad_norm": 6.591785430908203, + "learning_rate": 8.805883951623862e-06, + "loss": 3.3746, + "step": 14065 + }, + { + "epoch": 0.955972278842234, + "grad_norm": 6.7195916175842285, + "learning_rate": 8.805459301535535e-06, + "loss": 3.307, + "step": 14070 + }, + { + "epoch": 0.9563119989128958, + "grad_norm": 6.397758483886719, + "learning_rate": 8.805034651447208e-06, + "loss": 3.3989, + "step": 14075 + }, + { + "epoch": 0.9566517189835575, + "grad_norm": 8.413322448730469, + "learning_rate": 8.80461000135888e-06, + "loss": 3.3911, + "step": 14080 + }, + { + "epoch": 0.9569914390542194, + "grad_norm": 6.019286632537842, + "learning_rate": 8.804185351270554e-06, + "loss": 3.238, + "step": 14085 + }, + { + "epoch": 0.9573311591248811, + "grad_norm": 7.3144121170043945, + "learning_rate": 8.803760701182226e-06, + "loss": 3.3115, + "step": 14090 + }, + { + "epoch": 0.9576708791955428, + "grad_norm": 5.526050567626953, + "learning_rate": 8.8033360510939e-06, + "loss": 3.4, + "step": 14095 + }, + { + "epoch": 0.9580105992662047, + "grad_norm": 7.618831634521484, + "learning_rate": 8.802911401005572e-06, + "loss": 3.1832, + "step": 14100 + }, + { + "epoch": 0.9583503193368664, + "grad_norm": 8.065630912780762, + "learning_rate": 8.802486750917245e-06, + "loss": 3.1888, + "step": 14105 + }, + { + "epoch": 0.9586900394075282, + "grad_norm": 6.749079704284668, + "learning_rate": 8.802062100828918e-06, + "loss": 3.1837, + "step": 14110 + }, + { + "epoch": 0.95902975947819, + "grad_norm": 6.832324504852295, + "learning_rate": 8.80163745074059e-06, + "loss": 3.2756, + "step": 14115 + }, + { + "epoch": 0.9593694795488518, + "grad_norm": 6.678717136383057, + "learning_rate": 8.801212800652263e-06, + "loss": 3.6117, + "step": 14120 + }, + { + "epoch": 0.9597091996195135, + "grad_norm": 7.816190242767334, + "learning_rate": 8.800788150563936e-06, + "loss": 3.4896, + "step": 14125 + }, + { + "epoch": 0.9600489196901753, + "grad_norm": 6.738938331604004, + "learning_rate": 8.800363500475609e-06, + "loss": 3.3394, + "step": 14130 + }, + { + "epoch": 0.9603886397608371, + "grad_norm": 7.597910404205322, + "learning_rate": 8.799938850387282e-06, + "loss": 3.2588, + "step": 14135 + }, + { + "epoch": 0.9607283598314988, + "grad_norm": 6.742018699645996, + "learning_rate": 8.799514200298954e-06, + "loss": 3.4208, + "step": 14140 + }, + { + "epoch": 0.9610680799021606, + "grad_norm": 5.375246047973633, + "learning_rate": 8.799089550210627e-06, + "loss": 3.2268, + "step": 14145 + }, + { + "epoch": 0.9614077999728224, + "grad_norm": 6.06875467300415, + "learning_rate": 8.7986649001223e-06, + "loss": 3.4313, + "step": 14150 + }, + { + "epoch": 0.9617475200434842, + "grad_norm": 7.636511325836182, + "learning_rate": 8.798240250033973e-06, + "loss": 3.3838, + "step": 14155 + }, + { + "epoch": 0.9620872401141459, + "grad_norm": 6.774612903594971, + "learning_rate": 8.797815599945646e-06, + "loss": 3.3521, + "step": 14160 + }, + { + "epoch": 0.9624269601848077, + "grad_norm": 6.609130382537842, + "learning_rate": 8.797390949857318e-06, + "loss": 3.3964, + "step": 14165 + }, + { + "epoch": 0.9627666802554695, + "grad_norm": 6.841860771179199, + "learning_rate": 8.796966299768991e-06, + "loss": 3.3503, + "step": 14170 + }, + { + "epoch": 0.9631064003261313, + "grad_norm": 7.518406867980957, + "learning_rate": 8.796541649680664e-06, + "loss": 3.3237, + "step": 14175 + }, + { + "epoch": 0.963446120396793, + "grad_norm": 6.940134525299072, + "learning_rate": 8.796116999592337e-06, + "loss": 3.2736, + "step": 14180 + }, + { + "epoch": 0.9637858404674549, + "grad_norm": 6.9153876304626465, + "learning_rate": 8.79569234950401e-06, + "loss": 3.494, + "step": 14185 + }, + { + "epoch": 0.9641255605381166, + "grad_norm": 6.604236602783203, + "learning_rate": 8.795267699415682e-06, + "loss": 3.2467, + "step": 14190 + }, + { + "epoch": 0.9644652806087783, + "grad_norm": 6.997333526611328, + "learning_rate": 8.794843049327355e-06, + "loss": 3.4338, + "step": 14195 + }, + { + "epoch": 0.9648050006794402, + "grad_norm": 5.457606315612793, + "learning_rate": 8.794418399239028e-06, + "loss": 3.2287, + "step": 14200 + }, + { + "epoch": 0.9651447207501019, + "grad_norm": 7.647720813751221, + "learning_rate": 8.7939937491507e-06, + "loss": 3.2251, + "step": 14205 + }, + { + "epoch": 0.9654844408207637, + "grad_norm": 6.212101936340332, + "learning_rate": 8.793569099062374e-06, + "loss": 3.2158, + "step": 14210 + }, + { + "epoch": 0.9658241608914254, + "grad_norm": 7.12466287612915, + "learning_rate": 8.793144448974046e-06, + "loss": 3.3133, + "step": 14215 + }, + { + "epoch": 0.9661638809620873, + "grad_norm": 7.8672871589660645, + "learning_rate": 8.79271979888572e-06, + "loss": 3.4241, + "step": 14220 + }, + { + "epoch": 0.966503601032749, + "grad_norm": 7.218077659606934, + "learning_rate": 8.792295148797392e-06, + "loss": 3.3386, + "step": 14225 + }, + { + "epoch": 0.9668433211034108, + "grad_norm": 7.255960464477539, + "learning_rate": 8.791870498709065e-06, + "loss": 3.3128, + "step": 14230 + }, + { + "epoch": 0.9671830411740726, + "grad_norm": 5.947029113769531, + "learning_rate": 8.791445848620738e-06, + "loss": 3.3961, + "step": 14235 + }, + { + "epoch": 0.9675227612447344, + "grad_norm": 7.1058807373046875, + "learning_rate": 8.79102119853241e-06, + "loss": 3.1491, + "step": 14240 + }, + { + "epoch": 0.9678624813153961, + "grad_norm": 6.048633098602295, + "learning_rate": 8.790596548444083e-06, + "loss": 3.1329, + "step": 14245 + }, + { + "epoch": 0.9682022013860578, + "grad_norm": 5.677624225616455, + "learning_rate": 8.790171898355756e-06, + "loss": 3.4222, + "step": 14250 + }, + { + "epoch": 0.9685419214567197, + "grad_norm": 5.600594997406006, + "learning_rate": 8.789747248267429e-06, + "loss": 3.3676, + "step": 14255 + }, + { + "epoch": 0.9688816415273814, + "grad_norm": 5.791308879852295, + "learning_rate": 8.789322598179102e-06, + "loss": 3.321, + "step": 14260 + }, + { + "epoch": 0.9692213615980432, + "grad_norm": 5.158947944641113, + "learning_rate": 8.788897948090774e-06, + "loss": 3.1534, + "step": 14265 + }, + { + "epoch": 0.969561081668705, + "grad_norm": 7.112443923950195, + "learning_rate": 8.788473298002447e-06, + "loss": 3.2812, + "step": 14270 + }, + { + "epoch": 0.9699008017393668, + "grad_norm": 7.025630950927734, + "learning_rate": 8.788048647914118e-06, + "loss": 3.31, + "step": 14275 + }, + { + "epoch": 0.9702405218100285, + "grad_norm": 7.25236177444458, + "learning_rate": 8.787623997825793e-06, + "loss": 3.2611, + "step": 14280 + }, + { + "epoch": 0.9705802418806904, + "grad_norm": 5.442069053649902, + "learning_rate": 8.787199347737466e-06, + "loss": 3.3712, + "step": 14285 + }, + { + "epoch": 0.9709199619513521, + "grad_norm": 5.397100925445557, + "learning_rate": 8.786774697649137e-06, + "loss": 3.3061, + "step": 14290 + }, + { + "epoch": 0.9712596820220138, + "grad_norm": 6.483907222747803, + "learning_rate": 8.786350047560811e-06, + "loss": 3.2963, + "step": 14295 + }, + { + "epoch": 0.9715994020926756, + "grad_norm": 6.515300750732422, + "learning_rate": 8.785925397472484e-06, + "loss": 3.2894, + "step": 14300 + }, + { + "epoch": 0.9719391221633374, + "grad_norm": 6.5279340744018555, + "learning_rate": 8.785500747384155e-06, + "loss": 3.4353, + "step": 14305 + }, + { + "epoch": 0.9722788422339992, + "grad_norm": 7.243682384490967, + "learning_rate": 8.78507609729583e-06, + "loss": 3.298, + "step": 14310 + }, + { + "epoch": 0.9726185623046609, + "grad_norm": 7.1268510818481445, + "learning_rate": 8.784651447207502e-06, + "loss": 3.1473, + "step": 14315 + }, + { + "epoch": 0.9729582823753228, + "grad_norm": 5.734854698181152, + "learning_rate": 8.784226797119174e-06, + "loss": 3.2173, + "step": 14320 + }, + { + "epoch": 0.9732980024459845, + "grad_norm": 7.626285552978516, + "learning_rate": 8.783802147030848e-06, + "loss": 2.8986, + "step": 14325 + }, + { + "epoch": 0.9736377225166463, + "grad_norm": 5.9669365882873535, + "learning_rate": 8.78337749694252e-06, + "loss": 3.3482, + "step": 14330 + }, + { + "epoch": 0.973977442587308, + "grad_norm": 5.078127861022949, + "learning_rate": 8.782952846854192e-06, + "loss": 3.3511, + "step": 14335 + }, + { + "epoch": 0.9743171626579699, + "grad_norm": 4.899078845977783, + "learning_rate": 8.782528196765866e-06, + "loss": 3.2643, + "step": 14340 + }, + { + "epoch": 0.9746568827286316, + "grad_norm": 6.099704265594482, + "learning_rate": 8.78210354667754e-06, + "loss": 3.4114, + "step": 14345 + }, + { + "epoch": 0.9749966027992933, + "grad_norm": 7.3979291915893555, + "learning_rate": 8.78167889658921e-06, + "loss": 3.3863, + "step": 14350 + }, + { + "epoch": 0.9753363228699552, + "grad_norm": 6.512387752532959, + "learning_rate": 8.781254246500885e-06, + "loss": 2.9189, + "step": 14355 + }, + { + "epoch": 0.9756760429406169, + "grad_norm": 6.496485233306885, + "learning_rate": 8.780829596412556e-06, + "loss": 3.5122, + "step": 14360 + }, + { + "epoch": 0.9760157630112787, + "grad_norm": 5.429251670837402, + "learning_rate": 8.780404946324229e-06, + "loss": 3.2423, + "step": 14365 + }, + { + "epoch": 0.9763554830819405, + "grad_norm": 6.253954887390137, + "learning_rate": 8.779980296235903e-06, + "loss": 3.4663, + "step": 14370 + }, + { + "epoch": 0.9766952031526023, + "grad_norm": 6.468486785888672, + "learning_rate": 8.779555646147574e-06, + "loss": 2.9966, + "step": 14375 + }, + { + "epoch": 0.977034923223264, + "grad_norm": 6.400164604187012, + "learning_rate": 8.779130996059247e-06, + "loss": 3.4618, + "step": 14380 + }, + { + "epoch": 0.9773746432939258, + "grad_norm": 5.40606164932251, + "learning_rate": 8.778706345970922e-06, + "loss": 3.0382, + "step": 14385 + }, + { + "epoch": 0.9777143633645876, + "grad_norm": 7.504058837890625, + "learning_rate": 8.778281695882593e-06, + "loss": 3.093, + "step": 14390 + }, + { + "epoch": 0.9780540834352494, + "grad_norm": 6.646205902099609, + "learning_rate": 8.777857045794266e-06, + "loss": 3.4537, + "step": 14395 + }, + { + "epoch": 0.9783938035059111, + "grad_norm": 6.966532230377197, + "learning_rate": 8.77743239570594e-06, + "loss": 3.2484, + "step": 14400 + }, + { + "epoch": 0.978733523576573, + "grad_norm": 6.214527130126953, + "learning_rate": 8.777007745617611e-06, + "loss": 3.1773, + "step": 14405 + }, + { + "epoch": 0.9790732436472347, + "grad_norm": 6.728498458862305, + "learning_rate": 8.776583095529284e-06, + "loss": 3.1568, + "step": 14410 + }, + { + "epoch": 0.9794129637178964, + "grad_norm": 5.6684346199035645, + "learning_rate": 8.776158445440958e-06, + "loss": 3.4057, + "step": 14415 + }, + { + "epoch": 0.9797526837885582, + "grad_norm": 7.645877838134766, + "learning_rate": 8.77573379535263e-06, + "loss": 3.0448, + "step": 14420 + }, + { + "epoch": 0.98009240385922, + "grad_norm": 5.891788005828857, + "learning_rate": 8.775309145264302e-06, + "loss": 3.0925, + "step": 14425 + }, + { + "epoch": 0.9804321239298818, + "grad_norm": 6.817049980163574, + "learning_rate": 8.774884495175975e-06, + "loss": 3.3309, + "step": 14430 + }, + { + "epoch": 0.9807718440005435, + "grad_norm": 8.016084671020508, + "learning_rate": 8.774544775105314e-06, + "loss": 2.9367, + "step": 14435 + }, + { + "epoch": 0.9811115640712054, + "grad_norm": 7.154316425323486, + "learning_rate": 8.774120125016987e-06, + "loss": 3.3847, + "step": 14440 + }, + { + "epoch": 0.9814512841418671, + "grad_norm": 6.64216423034668, + "learning_rate": 8.77369547492866e-06, + "loss": 3.4078, + "step": 14445 + }, + { + "epoch": 0.9817910042125289, + "grad_norm": 7.412374496459961, + "learning_rate": 8.773270824840332e-06, + "loss": 3.0064, + "step": 14450 + }, + { + "epoch": 0.9821307242831907, + "grad_norm": 4.879425048828125, + "learning_rate": 8.772846174752005e-06, + "loss": 3.3984, + "step": 14455 + }, + { + "epoch": 0.9824704443538524, + "grad_norm": 7.088574409484863, + "learning_rate": 8.772421524663678e-06, + "loss": 3.1997, + "step": 14460 + }, + { + "epoch": 0.9828101644245142, + "grad_norm": 5.845264434814453, + "learning_rate": 8.771996874575351e-06, + "loss": 3.3394, + "step": 14465 + }, + { + "epoch": 0.9831498844951759, + "grad_norm": 7.70010232925415, + "learning_rate": 8.771572224487024e-06, + "loss": 3.3113, + "step": 14470 + }, + { + "epoch": 0.9834896045658378, + "grad_norm": 8.025528907775879, + "learning_rate": 8.771147574398696e-06, + "loss": 3.0258, + "step": 14475 + }, + { + "epoch": 0.9838293246364995, + "grad_norm": 7.85750675201416, + "learning_rate": 8.77072292431037e-06, + "loss": 3.2147, + "step": 14480 + }, + { + "epoch": 0.9841690447071613, + "grad_norm": 6.041637420654297, + "learning_rate": 8.770298274222042e-06, + "loss": 3.4089, + "step": 14485 + }, + { + "epoch": 0.9845087647778231, + "grad_norm": 6.631992340087891, + "learning_rate": 8.769873624133715e-06, + "loss": 3.3145, + "step": 14490 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 8.119039535522461, + "learning_rate": 8.769448974045388e-06, + "loss": 3.5053, + "step": 14495 + }, + { + "epoch": 0.9851882049191466, + "grad_norm": 6.473926544189453, + "learning_rate": 8.76902432395706e-06, + "loss": 3.3294, + "step": 14500 + }, + { + "epoch": 0.9855279249898083, + "grad_norm": 7.295680522918701, + "learning_rate": 8.768599673868733e-06, + "loss": 3.2622, + "step": 14505 + }, + { + "epoch": 0.9858676450604702, + "grad_norm": 7.837872505187988, + "learning_rate": 8.768175023780406e-06, + "loss": 3.1246, + "step": 14510 + }, + { + "epoch": 0.9862073651311319, + "grad_norm": 7.707462310791016, + "learning_rate": 8.767750373692079e-06, + "loss": 3.4414, + "step": 14515 + }, + { + "epoch": 0.9865470852017937, + "grad_norm": 6.760806560516357, + "learning_rate": 8.767325723603752e-06, + "loss": 3.1205, + "step": 14520 + }, + { + "epoch": 0.9868868052724555, + "grad_norm": 5.0304341316223145, + "learning_rate": 8.766901073515424e-06, + "loss": 3.0871, + "step": 14525 + }, + { + "epoch": 0.9872265253431173, + "grad_norm": 5.952856540679932, + "learning_rate": 8.766476423427097e-06, + "loss": 3.2736, + "step": 14530 + }, + { + "epoch": 0.987566245413779, + "grad_norm": 6.520649433135986, + "learning_rate": 8.76605177333877e-06, + "loss": 3.302, + "step": 14535 + }, + { + "epoch": 0.9879059654844409, + "grad_norm": 7.859011173248291, + "learning_rate": 8.765627123250441e-06, + "loss": 3.4216, + "step": 14540 + }, + { + "epoch": 0.9882456855551026, + "grad_norm": 5.523493766784668, + "learning_rate": 8.765202473162116e-06, + "loss": 3.2116, + "step": 14545 + }, + { + "epoch": 0.9885854056257644, + "grad_norm": 6.951886177062988, + "learning_rate": 8.764777823073788e-06, + "loss": 3.2326, + "step": 14550 + }, + { + "epoch": 0.9889251256964261, + "grad_norm": 8.22221851348877, + "learning_rate": 8.76435317298546e-06, + "loss": 3.1871, + "step": 14555 + }, + { + "epoch": 0.989264845767088, + "grad_norm": 6.328803062438965, + "learning_rate": 8.763928522897134e-06, + "loss": 3.3008, + "step": 14560 + }, + { + "epoch": 0.9896045658377497, + "grad_norm": 6.34280252456665, + "learning_rate": 8.763503872808807e-06, + "loss": 3.3759, + "step": 14565 + }, + { + "epoch": 0.9899442859084114, + "grad_norm": 4.668499946594238, + "learning_rate": 8.763079222720478e-06, + "loss": 3.1682, + "step": 14570 + }, + { + "epoch": 0.9902840059790733, + "grad_norm": 8.418146133422852, + "learning_rate": 8.762654572632152e-06, + "loss": 3.4883, + "step": 14575 + }, + { + "epoch": 0.990623726049735, + "grad_norm": 6.929956912994385, + "learning_rate": 8.762229922543825e-06, + "loss": 3.2483, + "step": 14580 + }, + { + "epoch": 0.9909634461203968, + "grad_norm": 6.474230766296387, + "learning_rate": 8.761805272455496e-06, + "loss": 3.2173, + "step": 14585 + }, + { + "epoch": 0.9913031661910585, + "grad_norm": 5.541803359985352, + "learning_rate": 8.761380622367171e-06, + "loss": 3.4474, + "step": 14590 + }, + { + "epoch": 0.9916428862617204, + "grad_norm": 6.539732456207275, + "learning_rate": 8.760955972278844e-06, + "loss": 3.2657, + "step": 14595 + }, + { + "epoch": 0.9919826063323821, + "grad_norm": 5.774468898773193, + "learning_rate": 8.760531322190515e-06, + "loss": 3.3722, + "step": 14600 + }, + { + "epoch": 0.9923223264030439, + "grad_norm": 7.135651111602783, + "learning_rate": 8.76010667210219e-06, + "loss": 3.4967, + "step": 14605 + }, + { + "epoch": 0.9926620464737057, + "grad_norm": 6.887558937072754, + "learning_rate": 8.75968202201386e-06, + "loss": 3.1569, + "step": 14610 + }, + { + "epoch": 0.9930017665443674, + "grad_norm": 6.428689479827881, + "learning_rate": 8.759257371925533e-06, + "loss": 3.351, + "step": 14615 + }, + { + "epoch": 0.9933414866150292, + "grad_norm": 6.47819709777832, + "learning_rate": 8.758832721837208e-06, + "loss": 3.2323, + "step": 14620 + }, + { + "epoch": 0.993681206685691, + "grad_norm": 6.350615501403809, + "learning_rate": 8.758408071748879e-06, + "loss": 3.306, + "step": 14625 + }, + { + "epoch": 0.9940209267563528, + "grad_norm": 6.358270645141602, + "learning_rate": 8.757983421660552e-06, + "loss": 3.5187, + "step": 14630 + }, + { + "epoch": 0.9943606468270145, + "grad_norm": 9.08192253112793, + "learning_rate": 8.757558771572226e-06, + "loss": 3.2074, + "step": 14635 + }, + { + "epoch": 0.9947003668976763, + "grad_norm": 5.298459053039551, + "learning_rate": 8.757134121483897e-06, + "loss": 3.3333, + "step": 14640 + }, + { + "epoch": 0.9950400869683381, + "grad_norm": 6.722025394439697, + "learning_rate": 8.75670947139557e-06, + "loss": 3.1853, + "step": 14645 + }, + { + "epoch": 0.9953798070389999, + "grad_norm": 6.464664459228516, + "learning_rate": 8.756284821307244e-06, + "loss": 3.2463, + "step": 14650 + }, + { + "epoch": 0.9957195271096616, + "grad_norm": 7.829146385192871, + "learning_rate": 8.755860171218916e-06, + "loss": 3.1722, + "step": 14655 + }, + { + "epoch": 0.9960592471803235, + "grad_norm": 6.449219703674316, + "learning_rate": 8.755435521130588e-06, + "loss": 3.2413, + "step": 14660 + }, + { + "epoch": 0.9963989672509852, + "grad_norm": 5.748312473297119, + "learning_rate": 8.755010871042263e-06, + "loss": 3.4984, + "step": 14665 + }, + { + "epoch": 0.9967386873216469, + "grad_norm": 7.214095592498779, + "learning_rate": 8.754586220953934e-06, + "loss": 3.4705, + "step": 14670 + }, + { + "epoch": 0.9970784073923087, + "grad_norm": 5.966444492340088, + "learning_rate": 8.754161570865607e-06, + "loss": 3.304, + "step": 14675 + }, + { + "epoch": 0.9974181274629705, + "grad_norm": 7.243247032165527, + "learning_rate": 8.75373692077728e-06, + "loss": 3.4736, + "step": 14680 + }, + { + "epoch": 0.9977578475336323, + "grad_norm": 6.71095085144043, + "learning_rate": 8.753312270688952e-06, + "loss": 3.2783, + "step": 14685 + }, + { + "epoch": 0.998097567604294, + "grad_norm": 4.515254497528076, + "learning_rate": 8.752887620600627e-06, + "loss": 3.276, + "step": 14690 + }, + { + "epoch": 0.9984372876749559, + "grad_norm": 6.955607891082764, + "learning_rate": 8.752462970512298e-06, + "loss": 3.6246, + "step": 14695 + }, + { + "epoch": 0.9987770077456176, + "grad_norm": 6.351483345031738, + "learning_rate": 8.75203832042397e-06, + "loss": 3.5232, + "step": 14700 + }, + { + "epoch": 0.9991167278162794, + "grad_norm": 7.838197231292725, + "learning_rate": 8.751613670335645e-06, + "loss": 3.2146, + "step": 14705 + }, + { + "epoch": 0.9994564478869412, + "grad_norm": 8.825458526611328, + "learning_rate": 8.751189020247316e-06, + "loss": 3.3545, + "step": 14710 + }, + { + "epoch": 0.999796167957603, + "grad_norm": 6.859269142150879, + "learning_rate": 8.75076437015899e-06, + "loss": 3.4822, + "step": 14715 + }, + { + "epoch": 1.0, + "eval_bertscore": { + "f1": 0.8367397996422533, + "precision": 0.8395578920327135, + "recall": 0.8346374581945833 + }, + "eval_bleu_4": 0.010298228728202682, + "eval_exact_match": 0.0, + "eval_loss": 3.311650276184082, + "eval_meteor": 0.09766592763164576, + "eval_rouge": { + "rouge1": 0.13370039867558905, + "rouge2": 0.012780770210280549, + "rougeL": 0.11135498586675158, + "rougeLsum": 0.11133230102737238 + }, + "eval_runtime": 1178.4215, + "eval_samples_per_second": 8.757, + "eval_steps_per_second": 1.095, + "step": 14718 + }, + { + "epoch": 1.0001358880282647, + "grad_norm": 6.685173511505127, + "learning_rate": 8.750339720070664e-06, + "loss": 3.1569, + "step": 14720 + }, + { + "epoch": 1.0004756080989265, + "grad_norm": 6.011618614196777, + "learning_rate": 8.749915069982335e-06, + "loss": 3.1055, + "step": 14725 + }, + { + "epoch": 1.0008153281695882, + "grad_norm": 8.452324867248535, + "learning_rate": 8.749490419894008e-06, + "loss": 3.1016, + "step": 14730 + }, + { + "epoch": 1.00115504824025, + "grad_norm": 6.488195419311523, + "learning_rate": 8.749065769805682e-06, + "loss": 3.1577, + "step": 14735 + }, + { + "epoch": 1.0014947683109119, + "grad_norm": 6.734018802642822, + "learning_rate": 8.748641119717353e-06, + "loss": 3.1594, + "step": 14740 + }, + { + "epoch": 1.0018344883815735, + "grad_norm": 9.259119987487793, + "learning_rate": 8.748216469629026e-06, + "loss": 3.2975, + "step": 14745 + }, + { + "epoch": 1.0021742084522354, + "grad_norm": 6.362176895141602, + "learning_rate": 8.7477918195407e-06, + "loss": 3.0002, + "step": 14750 + }, + { + "epoch": 1.0025139285228972, + "grad_norm": 6.777210712432861, + "learning_rate": 8.747367169452372e-06, + "loss": 2.9064, + "step": 14755 + }, + { + "epoch": 1.0028536485935589, + "grad_norm": 9.050280570983887, + "learning_rate": 8.746942519364044e-06, + "loss": 3.2638, + "step": 14760 + }, + { + "epoch": 1.0031933686642207, + "grad_norm": 6.237172603607178, + "learning_rate": 8.746517869275717e-06, + "loss": 3.1876, + "step": 14765 + }, + { + "epoch": 1.0035330887348826, + "grad_norm": 6.08978796005249, + "learning_rate": 8.74609321918739e-06, + "loss": 3.0706, + "step": 14770 + }, + { + "epoch": 1.0038728088055442, + "grad_norm": 7.5270915031433105, + "learning_rate": 8.745668569099063e-06, + "loss": 3.2528, + "step": 14775 + }, + { + "epoch": 1.004212528876206, + "grad_norm": 8.041753768920898, + "learning_rate": 8.745243919010736e-06, + "loss": 3.2889, + "step": 14780 + }, + { + "epoch": 1.0045522489468677, + "grad_norm": 7.839522838592529, + "learning_rate": 8.744819268922408e-06, + "loss": 3.2524, + "step": 14785 + }, + { + "epoch": 1.0048919690175295, + "grad_norm": 8.595640182495117, + "learning_rate": 8.744394618834081e-06, + "loss": 3.2909, + "step": 14790 + }, + { + "epoch": 1.0052316890881914, + "grad_norm": 7.924987316131592, + "learning_rate": 8.743969968745754e-06, + "loss": 3.0663, + "step": 14795 + }, + { + "epoch": 1.005571409158853, + "grad_norm": 6.890618324279785, + "learning_rate": 8.743545318657427e-06, + "loss": 3.1557, + "step": 14800 + }, + { + "epoch": 1.0059111292295149, + "grad_norm": 6.551941394805908, + "learning_rate": 8.7431206685691e-06, + "loss": 3.1689, + "step": 14805 + }, + { + "epoch": 1.0062508493001767, + "grad_norm": 6.975053787231445, + "learning_rate": 8.742696018480772e-06, + "loss": 3.1322, + "step": 14810 + }, + { + "epoch": 1.0065905693708384, + "grad_norm": 5.907357692718506, + "learning_rate": 8.742271368392445e-06, + "loss": 2.9359, + "step": 14815 + }, + { + "epoch": 1.0069302894415002, + "grad_norm": 8.124777793884277, + "learning_rate": 8.741846718304118e-06, + "loss": 3.5015, + "step": 14820 + }, + { + "epoch": 1.007270009512162, + "grad_norm": 8.682503700256348, + "learning_rate": 8.74142206821579e-06, + "loss": 3.2377, + "step": 14825 + }, + { + "epoch": 1.0076097295828237, + "grad_norm": 7.555274963378906, + "learning_rate": 8.740997418127464e-06, + "loss": 3.242, + "step": 14830 + }, + { + "epoch": 1.0079494496534855, + "grad_norm": 7.792573928833008, + "learning_rate": 8.740572768039136e-06, + "loss": 3.1915, + "step": 14835 + }, + { + "epoch": 1.0082891697241474, + "grad_norm": 6.304718971252441, + "learning_rate": 8.74014811795081e-06, + "loss": 3.1797, + "step": 14840 + }, + { + "epoch": 1.008628889794809, + "grad_norm": 6.474432945251465, + "learning_rate": 8.739723467862482e-06, + "loss": 3.5038, + "step": 14845 + }, + { + "epoch": 1.0089686098654709, + "grad_norm": 7.490272045135498, + "learning_rate": 8.739298817774155e-06, + "loss": 3.4436, + "step": 14850 + }, + { + "epoch": 1.0093083299361327, + "grad_norm": 5.960350513458252, + "learning_rate": 8.738874167685828e-06, + "loss": 3.5094, + "step": 14855 + }, + { + "epoch": 1.0096480500067944, + "grad_norm": 7.728791236877441, + "learning_rate": 8.7384495175975e-06, + "loss": 3.1804, + "step": 14860 + }, + { + "epoch": 1.0099877700774562, + "grad_norm": 6.361643314361572, + "learning_rate": 8.738024867509173e-06, + "loss": 3.0643, + "step": 14865 + }, + { + "epoch": 1.0103274901481178, + "grad_norm": 6.931436538696289, + "learning_rate": 8.737600217420846e-06, + "loss": 3.1798, + "step": 14870 + }, + { + "epoch": 1.0106672102187797, + "grad_norm": 6.912081241607666, + "learning_rate": 8.737175567332519e-06, + "loss": 3.1607, + "step": 14875 + }, + { + "epoch": 1.0110069302894416, + "grad_norm": 7.735078811645508, + "learning_rate": 8.736750917244192e-06, + "loss": 3.0971, + "step": 14880 + }, + { + "epoch": 1.0113466503601032, + "grad_norm": 5.932215690612793, + "learning_rate": 8.736326267155864e-06, + "loss": 3.325, + "step": 14885 + }, + { + "epoch": 1.011686370430765, + "grad_norm": 8.334389686584473, + "learning_rate": 8.735901617067537e-06, + "loss": 3.4049, + "step": 14890 + }, + { + "epoch": 1.012026090501427, + "grad_norm": 6.140349388122559, + "learning_rate": 8.73547696697921e-06, + "loss": 3.0259, + "step": 14895 + }, + { + "epoch": 1.0123658105720885, + "grad_norm": 6.660938262939453, + "learning_rate": 8.735052316890883e-06, + "loss": 3.0261, + "step": 14900 + }, + { + "epoch": 1.0127055306427504, + "grad_norm": 6.240072727203369, + "learning_rate": 8.734627666802556e-06, + "loss": 3.4607, + "step": 14905 + }, + { + "epoch": 1.0130452507134122, + "grad_norm": 5.273401737213135, + "learning_rate": 8.734203016714228e-06, + "loss": 3.3565, + "step": 14910 + }, + { + "epoch": 1.0133849707840739, + "grad_norm": 7.9375739097595215, + "learning_rate": 8.733778366625901e-06, + "loss": 3.1597, + "step": 14915 + }, + { + "epoch": 1.0137246908547357, + "grad_norm": 6.915647983551025, + "learning_rate": 8.733353716537574e-06, + "loss": 3.0627, + "step": 14920 + }, + { + "epoch": 1.0140644109253976, + "grad_norm": 6.206578731536865, + "learning_rate": 8.732929066449247e-06, + "loss": 3.2129, + "step": 14925 + }, + { + "epoch": 1.0144041309960592, + "grad_norm": 7.760478973388672, + "learning_rate": 8.73250441636092e-06, + "loss": 3.0775, + "step": 14930 + }, + { + "epoch": 1.014743851066721, + "grad_norm": 7.085804462432861, + "learning_rate": 8.732079766272592e-06, + "loss": 3.0035, + "step": 14935 + }, + { + "epoch": 1.015083571137383, + "grad_norm": 6.143313884735107, + "learning_rate": 8.731655116184265e-06, + "loss": 3.0228, + "step": 14940 + }, + { + "epoch": 1.0154232912080445, + "grad_norm": 6.859093189239502, + "learning_rate": 8.731230466095938e-06, + "loss": 3.4184, + "step": 14945 + }, + { + "epoch": 1.0157630112787064, + "grad_norm": 6.648591041564941, + "learning_rate": 8.73080581600761e-06, + "loss": 3.4158, + "step": 14950 + }, + { + "epoch": 1.016102731349368, + "grad_norm": 6.38861608505249, + "learning_rate": 8.730381165919282e-06, + "loss": 3.1612, + "step": 14955 + }, + { + "epoch": 1.0164424514200299, + "grad_norm": 5.80001974105835, + "learning_rate": 8.729956515830956e-06, + "loss": 3.2256, + "step": 14960 + }, + { + "epoch": 1.0167821714906917, + "grad_norm": 7.658553123474121, + "learning_rate": 8.72953186574263e-06, + "loss": 3.129, + "step": 14965 + }, + { + "epoch": 1.0171218915613534, + "grad_norm": 6.737097263336182, + "learning_rate": 8.7291072156543e-06, + "loss": 2.9417, + "step": 14970 + }, + { + "epoch": 1.0174616116320152, + "grad_norm": 6.952664852142334, + "learning_rate": 8.728682565565975e-06, + "loss": 3.1744, + "step": 14975 + }, + { + "epoch": 1.017801331702677, + "grad_norm": 6.3667755126953125, + "learning_rate": 8.728257915477648e-06, + "loss": 3.0758, + "step": 14980 + }, + { + "epoch": 1.0181410517733387, + "grad_norm": 8.179152488708496, + "learning_rate": 8.727833265389319e-06, + "loss": 3.4284, + "step": 14985 + }, + { + "epoch": 1.0184807718440005, + "grad_norm": 6.464476585388184, + "learning_rate": 8.727408615300993e-06, + "loss": 3.1582, + "step": 14990 + }, + { + "epoch": 1.0188204919146624, + "grad_norm": 7.806763172149658, + "learning_rate": 8.726983965212666e-06, + "loss": 3.2375, + "step": 14995 + }, + { + "epoch": 1.019160211985324, + "grad_norm": 6.3943095207214355, + "learning_rate": 8.726559315124337e-06, + "loss": 3.2441, + "step": 15000 + }, + { + "epoch": 1.0194999320559859, + "grad_norm": 5.991319179534912, + "learning_rate": 8.726134665036012e-06, + "loss": 3.0772, + "step": 15005 + }, + { + "epoch": 1.0198396521266477, + "grad_norm": 5.927591323852539, + "learning_rate": 8.725710014947684e-06, + "loss": 3.264, + "step": 15010 + }, + { + "epoch": 1.0201793721973094, + "grad_norm": 4.97849702835083, + "learning_rate": 8.725285364859356e-06, + "loss": 3.1682, + "step": 15015 + }, + { + "epoch": 1.0205190922679712, + "grad_norm": 7.927879333496094, + "learning_rate": 8.72486071477103e-06, + "loss": 2.9239, + "step": 15020 + }, + { + "epoch": 1.020858812338633, + "grad_norm": 7.134195804595947, + "learning_rate": 8.724436064682701e-06, + "loss": 3.1203, + "step": 15025 + }, + { + "epoch": 1.0211985324092947, + "grad_norm": 7.889961242675781, + "learning_rate": 8.724011414594374e-06, + "loss": 2.9544, + "step": 15030 + }, + { + "epoch": 1.0215382524799566, + "grad_norm": 5.879467010498047, + "learning_rate": 8.723586764506048e-06, + "loss": 3.4275, + "step": 15035 + }, + { + "epoch": 1.0218779725506182, + "grad_norm": 6.887064456939697, + "learning_rate": 8.72316211441772e-06, + "loss": 3.1998, + "step": 15040 + }, + { + "epoch": 1.02221769262128, + "grad_norm": 6.85420036315918, + "learning_rate": 8.722737464329394e-06, + "loss": 3.1207, + "step": 15045 + }, + { + "epoch": 1.022557412691942, + "grad_norm": 8.5359468460083, + "learning_rate": 8.722312814241067e-06, + "loss": 3.2289, + "step": 15050 + }, + { + "epoch": 1.0228971327626035, + "grad_norm": 6.673225402832031, + "learning_rate": 8.721888164152738e-06, + "loss": 3.1491, + "step": 15055 + }, + { + "epoch": 1.0232368528332654, + "grad_norm": 7.700650691986084, + "learning_rate": 8.721463514064412e-06, + "loss": 3.1847, + "step": 15060 + }, + { + "epoch": 1.0235765729039272, + "grad_norm": 7.502573013305664, + "learning_rate": 8.721038863976085e-06, + "loss": 3.0704, + "step": 15065 + }, + { + "epoch": 1.0239162929745889, + "grad_norm": 7.013248920440674, + "learning_rate": 8.720614213887756e-06, + "loss": 3.0584, + "step": 15070 + }, + { + "epoch": 1.0242560130452507, + "grad_norm": 8.12600326538086, + "learning_rate": 8.720189563799431e-06, + "loss": 3.2571, + "step": 15075 + }, + { + "epoch": 1.0245957331159126, + "grad_norm": 6.657962322235107, + "learning_rate": 8.719764913711104e-06, + "loss": 3.3591, + "step": 15080 + }, + { + "epoch": 1.0249354531865742, + "grad_norm": 7.069977760314941, + "learning_rate": 8.719340263622775e-06, + "loss": 3.1634, + "step": 15085 + }, + { + "epoch": 1.025275173257236, + "grad_norm": 6.245774269104004, + "learning_rate": 8.71891561353445e-06, + "loss": 3.2107, + "step": 15090 + }, + { + "epoch": 1.025614893327898, + "grad_norm": 6.383551597595215, + "learning_rate": 8.718490963446122e-06, + "loss": 3.3849, + "step": 15095 + }, + { + "epoch": 1.0259546133985595, + "grad_norm": 6.282948017120361, + "learning_rate": 8.718066313357793e-06, + "loss": 3.1943, + "step": 15100 + }, + { + "epoch": 1.0262943334692214, + "grad_norm": 6.677504062652588, + "learning_rate": 8.717641663269468e-06, + "loss": 3.1754, + "step": 15105 + }, + { + "epoch": 1.0266340535398832, + "grad_norm": 6.885970592498779, + "learning_rate": 8.717217013181139e-06, + "loss": 3.1307, + "step": 15110 + }, + { + "epoch": 1.0269737736105449, + "grad_norm": 8.505400657653809, + "learning_rate": 8.716792363092812e-06, + "loss": 3.1348, + "step": 15115 + }, + { + "epoch": 1.0273134936812067, + "grad_norm": 6.222141265869141, + "learning_rate": 8.716367713004486e-06, + "loss": 3.29, + "step": 15120 + }, + { + "epoch": 1.0276532137518684, + "grad_norm": 6.914943218231201, + "learning_rate": 8.715943062916157e-06, + "loss": 3.0623, + "step": 15125 + }, + { + "epoch": 1.0279929338225302, + "grad_norm": 6.481078147888184, + "learning_rate": 8.71551841282783e-06, + "loss": 3.1279, + "step": 15130 + }, + { + "epoch": 1.028332653893192, + "grad_norm": 7.816770553588867, + "learning_rate": 8.715093762739504e-06, + "loss": 3.1787, + "step": 15135 + }, + { + "epoch": 1.0286723739638537, + "grad_norm": 5.885744571685791, + "learning_rate": 8.714669112651176e-06, + "loss": 3.3635, + "step": 15140 + }, + { + "epoch": 1.0290120940345155, + "grad_norm": 6.535862445831299, + "learning_rate": 8.714244462562848e-06, + "loss": 3.0877, + "step": 15145 + }, + { + "epoch": 1.0293518141051774, + "grad_norm": 7.150203704833984, + "learning_rate": 8.713819812474523e-06, + "loss": 3.2852, + "step": 15150 + }, + { + "epoch": 1.029691534175839, + "grad_norm": 6.408694744110107, + "learning_rate": 8.713395162386194e-06, + "loss": 3.1786, + "step": 15155 + }, + { + "epoch": 1.0300312542465009, + "grad_norm": 6.023165702819824, + "learning_rate": 8.712970512297867e-06, + "loss": 3.2556, + "step": 15160 + }, + { + "epoch": 1.0303709743171627, + "grad_norm": 6.306533336639404, + "learning_rate": 8.712545862209541e-06, + "loss": 3.1332, + "step": 15165 + }, + { + "epoch": 1.0307106943878244, + "grad_norm": 5.446987152099609, + "learning_rate": 8.712121212121212e-06, + "loss": 3.3556, + "step": 15170 + }, + { + "epoch": 1.0310504144584862, + "grad_norm": 7.797298431396484, + "learning_rate": 8.711696562032885e-06, + "loss": 2.9757, + "step": 15175 + }, + { + "epoch": 1.031390134529148, + "grad_norm": 8.676726341247559, + "learning_rate": 8.711271911944558e-06, + "loss": 3.3609, + "step": 15180 + }, + { + "epoch": 1.0317298545998097, + "grad_norm": 5.299256801605225, + "learning_rate": 8.71084726185623e-06, + "loss": 3.233, + "step": 15185 + }, + { + "epoch": 1.0320695746704716, + "grad_norm": 7.1797099113464355, + "learning_rate": 8.710422611767904e-06, + "loss": 3.3699, + "step": 15190 + }, + { + "epoch": 1.0324092947411334, + "grad_norm": 6.739312171936035, + "learning_rate": 8.709997961679576e-06, + "loss": 3.2144, + "step": 15195 + }, + { + "epoch": 1.032749014811795, + "grad_norm": 6.272330284118652, + "learning_rate": 8.70957331159125e-06, + "loss": 3.2261, + "step": 15200 + }, + { + "epoch": 1.033088734882457, + "grad_norm": 7.391417980194092, + "learning_rate": 8.709148661502922e-06, + "loss": 3.2062, + "step": 15205 + }, + { + "epoch": 1.0334284549531185, + "grad_norm": 7.95960807800293, + "learning_rate": 8.708724011414595e-06, + "loss": 3.2422, + "step": 15210 + }, + { + "epoch": 1.0337681750237804, + "grad_norm": 5.746967792510986, + "learning_rate": 8.708299361326268e-06, + "loss": 3.2854, + "step": 15215 + }, + { + "epoch": 1.0341078950944422, + "grad_norm": 6.8498992919921875, + "learning_rate": 8.70787471123794e-06, + "loss": 3.3467, + "step": 15220 + }, + { + "epoch": 1.0344476151651039, + "grad_norm": 7.573444366455078, + "learning_rate": 8.707450061149613e-06, + "loss": 3.0064, + "step": 15225 + }, + { + "epoch": 1.0347873352357657, + "grad_norm": 7.081925868988037, + "learning_rate": 8.707025411061286e-06, + "loss": 3.1571, + "step": 15230 + }, + { + "epoch": 1.0351270553064276, + "grad_norm": 5.4306111335754395, + "learning_rate": 8.706600760972959e-06, + "loss": 3.4269, + "step": 15235 + }, + { + "epoch": 1.0354667753770892, + "grad_norm": 7.559370994567871, + "learning_rate": 8.706176110884632e-06, + "loss": 3.1783, + "step": 15240 + }, + { + "epoch": 1.035806495447751, + "grad_norm": 6.852243423461914, + "learning_rate": 8.705751460796304e-06, + "loss": 3.3176, + "step": 15245 + }, + { + "epoch": 1.036146215518413, + "grad_norm": 5.645192623138428, + "learning_rate": 8.705326810707977e-06, + "loss": 3.2123, + "step": 15250 + }, + { + "epoch": 1.0364859355890745, + "grad_norm": 6.556341648101807, + "learning_rate": 8.70490216061965e-06, + "loss": 3.2838, + "step": 15255 + }, + { + "epoch": 1.0368256556597364, + "grad_norm": 6.10467004776001, + "learning_rate": 8.704477510531323e-06, + "loss": 3.3117, + "step": 15260 + }, + { + "epoch": 1.0371653757303982, + "grad_norm": 6.228422164916992, + "learning_rate": 8.704052860442996e-06, + "loss": 3.1888, + "step": 15265 + }, + { + "epoch": 1.0375050958010599, + "grad_norm": 5.497036933898926, + "learning_rate": 8.703628210354668e-06, + "loss": 3.2056, + "step": 15270 + }, + { + "epoch": 1.0378448158717217, + "grad_norm": 7.867119789123535, + "learning_rate": 8.703203560266341e-06, + "loss": 3.0517, + "step": 15275 + }, + { + "epoch": 1.0381845359423836, + "grad_norm": 7.3814778327941895, + "learning_rate": 8.702778910178014e-06, + "loss": 3.2624, + "step": 15280 + }, + { + "epoch": 1.0385242560130452, + "grad_norm": 7.103148937225342, + "learning_rate": 8.702354260089687e-06, + "loss": 3.2959, + "step": 15285 + }, + { + "epoch": 1.038863976083707, + "grad_norm": 5.894172191619873, + "learning_rate": 8.70192961000136e-06, + "loss": 3.5881, + "step": 15290 + }, + { + "epoch": 1.0392036961543687, + "grad_norm": 7.583561420440674, + "learning_rate": 8.701504959913032e-06, + "loss": 3.179, + "step": 15295 + }, + { + "epoch": 1.0395434162250305, + "grad_norm": 6.296192646026611, + "learning_rate": 8.701080309824705e-06, + "loss": 3.2121, + "step": 15300 + }, + { + "epoch": 1.0398831362956924, + "grad_norm": 7.512871742248535, + "learning_rate": 8.700655659736378e-06, + "loss": 3.2136, + "step": 15305 + }, + { + "epoch": 1.040222856366354, + "grad_norm": 7.180334568023682, + "learning_rate": 8.70023100964805e-06, + "loss": 2.8589, + "step": 15310 + }, + { + "epoch": 1.0405625764370159, + "grad_norm": 8.137789726257324, + "learning_rate": 8.699806359559724e-06, + "loss": 3.3317, + "step": 15315 + }, + { + "epoch": 1.0409022965076777, + "grad_norm": 7.818446159362793, + "learning_rate": 8.699381709471396e-06, + "loss": 3.1297, + "step": 15320 + }, + { + "epoch": 1.0412420165783394, + "grad_norm": 6.809744834899902, + "learning_rate": 8.69895705938307e-06, + "loss": 3.0957, + "step": 15325 + }, + { + "epoch": 1.0415817366490012, + "grad_norm": 6.498271942138672, + "learning_rate": 8.698532409294742e-06, + "loss": 3.2061, + "step": 15330 + }, + { + "epoch": 1.041921456719663, + "grad_norm": 6.831043243408203, + "learning_rate": 8.698107759206415e-06, + "loss": 3.26, + "step": 15335 + }, + { + "epoch": 1.0422611767903247, + "grad_norm": 6.895270824432373, + "learning_rate": 8.697683109118088e-06, + "loss": 3.4251, + "step": 15340 + }, + { + "epoch": 1.0426008968609866, + "grad_norm": 6.98019552230835, + "learning_rate": 8.69725845902976e-06, + "loss": 3.162, + "step": 15345 + }, + { + "epoch": 1.0429406169316484, + "grad_norm": 5.7384467124938965, + "learning_rate": 8.696833808941433e-06, + "loss": 3.013, + "step": 15350 + }, + { + "epoch": 1.04328033700231, + "grad_norm": 7.022907733917236, + "learning_rate": 8.696409158853106e-06, + "loss": 2.968, + "step": 15355 + }, + { + "epoch": 1.043620057072972, + "grad_norm": 6.080611705780029, + "learning_rate": 8.695984508764779e-06, + "loss": 3.221, + "step": 15360 + }, + { + "epoch": 1.0439597771436337, + "grad_norm": 7.423184394836426, + "learning_rate": 8.695559858676452e-06, + "loss": 2.9518, + "step": 15365 + }, + { + "epoch": 1.0442994972142954, + "grad_norm": 7.410548210144043, + "learning_rate": 8.695135208588123e-06, + "loss": 3.3578, + "step": 15370 + }, + { + "epoch": 1.0446392172849572, + "grad_norm": 6.49241828918457, + "learning_rate": 8.694710558499797e-06, + "loss": 3.0866, + "step": 15375 + }, + { + "epoch": 1.0449789373556189, + "grad_norm": 6.26837682723999, + "learning_rate": 8.69428590841147e-06, + "loss": 3.0149, + "step": 15380 + }, + { + "epoch": 1.0453186574262807, + "grad_norm": 8.203568458557129, + "learning_rate": 8.693861258323143e-06, + "loss": 3.0194, + "step": 15385 + }, + { + "epoch": 1.0456583774969426, + "grad_norm": 7.321976661682129, + "learning_rate": 8.693436608234816e-06, + "loss": 3.1234, + "step": 15390 + }, + { + "epoch": 1.0459980975676042, + "grad_norm": 6.098819255828857, + "learning_rate": 8.693011958146488e-06, + "loss": 2.9341, + "step": 15395 + }, + { + "epoch": 1.046337817638266, + "grad_norm": 8.06982135772705, + "learning_rate": 8.692587308058161e-06, + "loss": 3.1581, + "step": 15400 + }, + { + "epoch": 1.046677537708928, + "grad_norm": 8.614377975463867, + "learning_rate": 8.692162657969834e-06, + "loss": 3.1116, + "step": 15405 + }, + { + "epoch": 1.0470172577795895, + "grad_norm": 9.290026664733887, + "learning_rate": 8.691738007881507e-06, + "loss": 3.1658, + "step": 15410 + }, + { + "epoch": 1.0473569778502514, + "grad_norm": 7.261781692504883, + "learning_rate": 8.69131335779318e-06, + "loss": 3.4145, + "step": 15415 + }, + { + "epoch": 1.0476966979209132, + "grad_norm": 8.120779991149902, + "learning_rate": 8.690888707704852e-06, + "loss": 3.0877, + "step": 15420 + }, + { + "epoch": 1.0480364179915749, + "grad_norm": 5.702973365783691, + "learning_rate": 8.690464057616525e-06, + "loss": 3.0562, + "step": 15425 + }, + { + "epoch": 1.0483761380622367, + "grad_norm": 7.730495452880859, + "learning_rate": 8.690039407528198e-06, + "loss": 3.0314, + "step": 15430 + }, + { + "epoch": 1.0487158581328986, + "grad_norm": 6.485436916351318, + "learning_rate": 8.68961475743987e-06, + "loss": 3.201, + "step": 15435 + }, + { + "epoch": 1.0490555782035602, + "grad_norm": 5.112759590148926, + "learning_rate": 8.689190107351544e-06, + "loss": 3.0821, + "step": 15440 + }, + { + "epoch": 1.049395298274222, + "grad_norm": 7.856513500213623, + "learning_rate": 8.688765457263216e-06, + "loss": 3.0113, + "step": 15445 + }, + { + "epoch": 1.049735018344884, + "grad_norm": 6.66057014465332, + "learning_rate": 8.68834080717489e-06, + "loss": 3.3595, + "step": 15450 + }, + { + "epoch": 1.0500747384155455, + "grad_norm": 6.079074382781982, + "learning_rate": 8.68791615708656e-06, + "loss": 3.1934, + "step": 15455 + }, + { + "epoch": 1.0504144584862074, + "grad_norm": 10.08078670501709, + "learning_rate": 8.687491506998235e-06, + "loss": 3.0538, + "step": 15460 + }, + { + "epoch": 1.050754178556869, + "grad_norm": 7.195629596710205, + "learning_rate": 8.687066856909908e-06, + "loss": 3.4093, + "step": 15465 + }, + { + "epoch": 1.0510938986275309, + "grad_norm": 7.020378112792969, + "learning_rate": 8.686642206821579e-06, + "loss": 3.3857, + "step": 15470 + }, + { + "epoch": 1.0514336186981927, + "grad_norm": 6.368786811828613, + "learning_rate": 8.686217556733253e-06, + "loss": 3.0448, + "step": 15475 + }, + { + "epoch": 1.0517733387688544, + "grad_norm": 7.212168216705322, + "learning_rate": 8.685792906644926e-06, + "loss": 3.2237, + "step": 15480 + }, + { + "epoch": 1.0521130588395162, + "grad_norm": 6.649903297424316, + "learning_rate": 8.685368256556597e-06, + "loss": 3.3767, + "step": 15485 + }, + { + "epoch": 1.052452778910178, + "grad_norm": 5.575019836425781, + "learning_rate": 8.684943606468272e-06, + "loss": 3.2754, + "step": 15490 + }, + { + "epoch": 1.0527924989808397, + "grad_norm": 6.335310935974121, + "learning_rate": 8.684518956379944e-06, + "loss": 3.0093, + "step": 15495 + }, + { + "epoch": 1.0531322190515016, + "grad_norm": 5.971419811248779, + "learning_rate": 8.684094306291616e-06, + "loss": 3.0446, + "step": 15500 + }, + { + "epoch": 1.0534719391221634, + "grad_norm": 7.395442008972168, + "learning_rate": 8.68366965620329e-06, + "loss": 3.271, + "step": 15505 + }, + { + "epoch": 1.053811659192825, + "grad_norm": 5.778542518615723, + "learning_rate": 8.683245006114963e-06, + "loss": 3.3816, + "step": 15510 + }, + { + "epoch": 1.054151379263487, + "grad_norm": 7.79568338394165, + "learning_rate": 8.682820356026634e-06, + "loss": 3.0488, + "step": 15515 + }, + { + "epoch": 1.0544910993341488, + "grad_norm": 6.853940010070801, + "learning_rate": 8.682395705938308e-06, + "loss": 3.4105, + "step": 15520 + }, + { + "epoch": 1.0548308194048104, + "grad_norm": 8.840965270996094, + "learning_rate": 8.68197105584998e-06, + "loss": 3.1396, + "step": 15525 + }, + { + "epoch": 1.0551705394754722, + "grad_norm": 5.716559410095215, + "learning_rate": 8.681546405761652e-06, + "loss": 3.1264, + "step": 15530 + }, + { + "epoch": 1.055510259546134, + "grad_norm": 8.066107749938965, + "learning_rate": 8.681121755673327e-06, + "loss": 3.3801, + "step": 15535 + }, + { + "epoch": 1.0558499796167957, + "grad_norm": 7.236964702606201, + "learning_rate": 8.680697105584998e-06, + "loss": 3.2567, + "step": 15540 + }, + { + "epoch": 1.0561896996874576, + "grad_norm": 5.185129642486572, + "learning_rate": 8.68027245549667e-06, + "loss": 3.3633, + "step": 15545 + }, + { + "epoch": 1.0565294197581192, + "grad_norm": 6.0093817710876465, + "learning_rate": 8.679847805408345e-06, + "loss": 3.2045, + "step": 15550 + }, + { + "epoch": 1.056869139828781, + "grad_norm": 6.189488410949707, + "learning_rate": 8.679423155320016e-06, + "loss": 3.1184, + "step": 15555 + }, + { + "epoch": 1.057208859899443, + "grad_norm": 7.014467716217041, + "learning_rate": 8.678998505231689e-06, + "loss": 2.9058, + "step": 15560 + }, + { + "epoch": 1.0575485799701045, + "grad_norm": 7.619813442230225, + "learning_rate": 8.678573855143364e-06, + "loss": 3.3285, + "step": 15565 + }, + { + "epoch": 1.0578883000407664, + "grad_norm": 9.591767311096191, + "learning_rate": 8.678149205055035e-06, + "loss": 3.2119, + "step": 15570 + }, + { + "epoch": 1.0582280201114282, + "grad_norm": 7.289506912231445, + "learning_rate": 8.677724554966708e-06, + "loss": 3.0844, + "step": 15575 + }, + { + "epoch": 1.0585677401820899, + "grad_norm": 6.5983805656433105, + "learning_rate": 8.677299904878382e-06, + "loss": 3.2482, + "step": 15580 + }, + { + "epoch": 1.0589074602527517, + "grad_norm": 6.620102882385254, + "learning_rate": 8.676875254790053e-06, + "loss": 3.2039, + "step": 15585 + }, + { + "epoch": 1.0592471803234136, + "grad_norm": 6.120856761932373, + "learning_rate": 8.676450604701726e-06, + "loss": 3.3863, + "step": 15590 + }, + { + "epoch": 1.0595869003940752, + "grad_norm": 7.005153179168701, + "learning_rate": 8.676025954613399e-06, + "loss": 3.2711, + "step": 15595 + }, + { + "epoch": 1.059926620464737, + "grad_norm": 5.049185752868652, + "learning_rate": 8.675601304525072e-06, + "loss": 3.2937, + "step": 15600 + }, + { + "epoch": 1.060266340535399, + "grad_norm": 6.338646411895752, + "learning_rate": 8.675176654436744e-06, + "loss": 3.0742, + "step": 15605 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 8.123196601867676, + "learning_rate": 8.674752004348417e-06, + "loss": 3.1702, + "step": 15610 + }, + { + "epoch": 1.0609457806767224, + "grad_norm": 6.505966663360596, + "learning_rate": 8.67432735426009e-06, + "loss": 3.2465, + "step": 15615 + }, + { + "epoch": 1.0612855007473843, + "grad_norm": 6.046250343322754, + "learning_rate": 8.673902704171763e-06, + "loss": 3.5145, + "step": 15620 + }, + { + "epoch": 1.061625220818046, + "grad_norm": 6.571189880371094, + "learning_rate": 8.673478054083436e-06, + "loss": 3.2206, + "step": 15625 + }, + { + "epoch": 1.0619649408887077, + "grad_norm": 5.4217681884765625, + "learning_rate": 8.673053403995108e-06, + "loss": 3.1636, + "step": 15630 + }, + { + "epoch": 1.0623046609593694, + "grad_norm": 7.679835319519043, + "learning_rate": 8.672628753906781e-06, + "loss": 3.4113, + "step": 15635 + }, + { + "epoch": 1.0626443810300312, + "grad_norm": 9.076969146728516, + "learning_rate": 8.672204103818454e-06, + "loss": 3.0778, + "step": 15640 + }, + { + "epoch": 1.062984101100693, + "grad_norm": 6.463776588439941, + "learning_rate": 8.671779453730127e-06, + "loss": 3.289, + "step": 15645 + }, + { + "epoch": 1.0633238211713547, + "grad_norm": 6.622988700866699, + "learning_rate": 8.6713548036418e-06, + "loss": 3.137, + "step": 15650 + }, + { + "epoch": 1.0636635412420166, + "grad_norm": 8.241394996643066, + "learning_rate": 8.670930153553472e-06, + "loss": 3.2457, + "step": 15655 + }, + { + "epoch": 1.0640032613126784, + "grad_norm": 7.659491062164307, + "learning_rate": 8.670505503465145e-06, + "loss": 3.1727, + "step": 15660 + }, + { + "epoch": 1.06434298138334, + "grad_norm": 5.646787643432617, + "learning_rate": 8.670080853376818e-06, + "loss": 3.0911, + "step": 15665 + }, + { + "epoch": 1.064682701454002, + "grad_norm": 5.699162006378174, + "learning_rate": 8.66965620328849e-06, + "loss": 3.0648, + "step": 15670 + }, + { + "epoch": 1.0650224215246638, + "grad_norm": 9.035747528076172, + "learning_rate": 8.669231553200164e-06, + "loss": 3.0608, + "step": 15675 + }, + { + "epoch": 1.0653621415953254, + "grad_norm": 7.2738800048828125, + "learning_rate": 8.668806903111836e-06, + "loss": 3.3687, + "step": 15680 + }, + { + "epoch": 1.0657018616659872, + "grad_norm": 7.5158185958862305, + "learning_rate": 8.66838225302351e-06, + "loss": 3.1087, + "step": 15685 + }, + { + "epoch": 1.066041581736649, + "grad_norm": 6.150826930999756, + "learning_rate": 8.667957602935182e-06, + "loss": 3.2455, + "step": 15690 + }, + { + "epoch": 1.0663813018073107, + "grad_norm": 7.798165798187256, + "learning_rate": 8.667532952846855e-06, + "loss": 3.2546, + "step": 15695 + }, + { + "epoch": 1.0667210218779726, + "grad_norm": 7.137589931488037, + "learning_rate": 8.667108302758528e-06, + "loss": 3.2318, + "step": 15700 + }, + { + "epoch": 1.0670607419486344, + "grad_norm": 7.100584506988525, + "learning_rate": 8.6666836526702e-06, + "loss": 2.9023, + "step": 15705 + }, + { + "epoch": 1.067400462019296, + "grad_norm": 5.764091968536377, + "learning_rate": 8.666259002581873e-06, + "loss": 3.1718, + "step": 15710 + }, + { + "epoch": 1.067740182089958, + "grad_norm": 7.396551609039307, + "learning_rate": 8.665834352493546e-06, + "loss": 3.1764, + "step": 15715 + }, + { + "epoch": 1.0680799021606195, + "grad_norm": 7.1156463623046875, + "learning_rate": 8.665409702405219e-06, + "loss": 3.181, + "step": 15720 + }, + { + "epoch": 1.0684196222312814, + "grad_norm": 6.132216453552246, + "learning_rate": 8.664985052316892e-06, + "loss": 3.1855, + "step": 15725 + }, + { + "epoch": 1.0687593423019432, + "grad_norm": 7.242958068847656, + "learning_rate": 8.664560402228564e-06, + "loss": 3.122, + "step": 15730 + }, + { + "epoch": 1.0690990623726049, + "grad_norm": 5.905008316040039, + "learning_rate": 8.664135752140237e-06, + "loss": 3.3272, + "step": 15735 + }, + { + "epoch": 1.0694387824432667, + "grad_norm": 6.189210414886475, + "learning_rate": 8.66371110205191e-06, + "loss": 3.2732, + "step": 15740 + }, + { + "epoch": 1.0697785025139286, + "grad_norm": 6.370070934295654, + "learning_rate": 8.663286451963583e-06, + "loss": 3.0454, + "step": 15745 + }, + { + "epoch": 1.0701182225845902, + "grad_norm": 5.605490684509277, + "learning_rate": 8.662861801875256e-06, + "loss": 2.8631, + "step": 15750 + }, + { + "epoch": 1.070457942655252, + "grad_norm": 7.384043216705322, + "learning_rate": 8.662437151786928e-06, + "loss": 3.5346, + "step": 15755 + }, + { + "epoch": 1.070797662725914, + "grad_norm": 7.736724376678467, + "learning_rate": 8.662012501698601e-06, + "loss": 3.1875, + "step": 15760 + }, + { + "epoch": 1.0711373827965756, + "grad_norm": 6.9678449630737305, + "learning_rate": 8.661587851610274e-06, + "loss": 3.3374, + "step": 15765 + }, + { + "epoch": 1.0714771028672374, + "grad_norm": 7.120771884918213, + "learning_rate": 8.661163201521947e-06, + "loss": 2.99, + "step": 15770 + }, + { + "epoch": 1.0718168229378993, + "grad_norm": 6.910594463348389, + "learning_rate": 8.66073855143362e-06, + "loss": 3.2512, + "step": 15775 + }, + { + "epoch": 1.072156543008561, + "grad_norm": 6.89998197555542, + "learning_rate": 8.660313901345292e-06, + "loss": 3.2796, + "step": 15780 + }, + { + "epoch": 1.0724962630792227, + "grad_norm": 6.511589050292969, + "learning_rate": 8.659889251256965e-06, + "loss": 3.0296, + "step": 15785 + }, + { + "epoch": 1.0728359831498846, + "grad_norm": 6.421206474304199, + "learning_rate": 8.659464601168638e-06, + "loss": 3.2218, + "step": 15790 + }, + { + "epoch": 1.0731757032205462, + "grad_norm": 7.386100769042969, + "learning_rate": 8.65903995108031e-06, + "loss": 3.3732, + "step": 15795 + }, + { + "epoch": 1.073515423291208, + "grad_norm": 8.067360877990723, + "learning_rate": 8.658615300991984e-06, + "loss": 3.2441, + "step": 15800 + }, + { + "epoch": 1.07385514336187, + "grad_norm": 6.379727363586426, + "learning_rate": 8.658190650903656e-06, + "loss": 3.3284, + "step": 15805 + }, + { + "epoch": 1.0741948634325316, + "grad_norm": 5.142232418060303, + "learning_rate": 8.65776600081533e-06, + "loss": 3.0524, + "step": 15810 + }, + { + "epoch": 1.0745345835031934, + "grad_norm": 6.438656330108643, + "learning_rate": 8.657341350727002e-06, + "loss": 3.3636, + "step": 15815 + }, + { + "epoch": 1.074874303573855, + "grad_norm": 5.899439811706543, + "learning_rate": 8.656916700638675e-06, + "loss": 3.1606, + "step": 15820 + }, + { + "epoch": 1.075214023644517, + "grad_norm": 7.432394027709961, + "learning_rate": 8.656492050550348e-06, + "loss": 3.2469, + "step": 15825 + }, + { + "epoch": 1.0755537437151788, + "grad_norm": 6.646400451660156, + "learning_rate": 8.65606740046202e-06, + "loss": 3.2424, + "step": 15830 + }, + { + "epoch": 1.0758934637858404, + "grad_norm": 6.861017227172852, + "learning_rate": 8.655642750373693e-06, + "loss": 2.9245, + "step": 15835 + }, + { + "epoch": 1.0762331838565022, + "grad_norm": 6.643834590911865, + "learning_rate": 8.655218100285366e-06, + "loss": 3.1064, + "step": 15840 + }, + { + "epoch": 1.076572903927164, + "grad_norm": 6.79772424697876, + "learning_rate": 8.654793450197039e-06, + "loss": 2.8296, + "step": 15845 + }, + { + "epoch": 1.0769126239978257, + "grad_norm": 6.913973808288574, + "learning_rate": 8.654368800108712e-06, + "loss": 2.9594, + "step": 15850 + }, + { + "epoch": 1.0772523440684876, + "grad_norm": 5.10388708114624, + "learning_rate": 8.653944150020384e-06, + "loss": 3.1407, + "step": 15855 + }, + { + "epoch": 1.0775920641391494, + "grad_norm": 7.30620813369751, + "learning_rate": 8.653519499932057e-06, + "loss": 3.3971, + "step": 15860 + }, + { + "epoch": 1.077931784209811, + "grad_norm": 6.9626054763793945, + "learning_rate": 8.65309484984373e-06, + "loss": 2.9451, + "step": 15865 + }, + { + "epoch": 1.078271504280473, + "grad_norm": 6.137735843658447, + "learning_rate": 8.652670199755401e-06, + "loss": 3.1607, + "step": 15870 + }, + { + "epoch": 1.0786112243511348, + "grad_norm": 6.289219856262207, + "learning_rate": 8.652245549667076e-06, + "loss": 3.1212, + "step": 15875 + }, + { + "epoch": 1.0789509444217964, + "grad_norm": 7.770288944244385, + "learning_rate": 8.651820899578748e-06, + "loss": 3.1397, + "step": 15880 + }, + { + "epoch": 1.0792906644924583, + "grad_norm": 5.813635349273682, + "learning_rate": 8.65139624949042e-06, + "loss": 3.2674, + "step": 15885 + }, + { + "epoch": 1.0796303845631199, + "grad_norm": 5.088466644287109, + "learning_rate": 8.650971599402094e-06, + "loss": 3.0915, + "step": 15890 + }, + { + "epoch": 1.0799701046337817, + "grad_norm": 5.843857765197754, + "learning_rate": 8.650546949313767e-06, + "loss": 3.229, + "step": 15895 + }, + { + "epoch": 1.0803098247044436, + "grad_norm": 8.093461036682129, + "learning_rate": 8.650122299225438e-06, + "loss": 2.8466, + "step": 15900 + }, + { + "epoch": 1.0806495447751052, + "grad_norm": 6.422791481018066, + "learning_rate": 8.649697649137112e-06, + "loss": 3.3018, + "step": 15905 + }, + { + "epoch": 1.080989264845767, + "grad_norm": 7.453834056854248, + "learning_rate": 8.649272999048785e-06, + "loss": 3.4397, + "step": 15910 + }, + { + "epoch": 1.081328984916429, + "grad_norm": 6.883430480957031, + "learning_rate": 8.648848348960456e-06, + "loss": 3.2715, + "step": 15915 + }, + { + "epoch": 1.0816687049870906, + "grad_norm": 8.296605110168457, + "learning_rate": 8.64842369887213e-06, + "loss": 3.1417, + "step": 15920 + }, + { + "epoch": 1.0820084250577524, + "grad_norm": 7.057784557342529, + "learning_rate": 8.647999048783804e-06, + "loss": 3.154, + "step": 15925 + }, + { + "epoch": 1.0823481451284143, + "grad_norm": 5.661929130554199, + "learning_rate": 8.647574398695475e-06, + "loss": 3.1023, + "step": 15930 + }, + { + "epoch": 1.082687865199076, + "grad_norm": 6.574446678161621, + "learning_rate": 8.64714974860715e-06, + "loss": 3.1096, + "step": 15935 + }, + { + "epoch": 1.0830275852697377, + "grad_norm": 9.693873405456543, + "learning_rate": 8.64672509851882e-06, + "loss": 3.221, + "step": 15940 + }, + { + "epoch": 1.0833673053403996, + "grad_norm": 6.680211544036865, + "learning_rate": 8.646300448430493e-06, + "loss": 3.2147, + "step": 15945 + }, + { + "epoch": 1.0837070254110612, + "grad_norm": 6.242702484130859, + "learning_rate": 8.645875798342168e-06, + "loss": 3.0392, + "step": 15950 + }, + { + "epoch": 1.084046745481723, + "grad_norm": 6.847929000854492, + "learning_rate": 8.645451148253839e-06, + "loss": 3.3229, + "step": 15955 + }, + { + "epoch": 1.084386465552385, + "grad_norm": 6.057990550994873, + "learning_rate": 8.645026498165512e-06, + "loss": 3.1169, + "step": 15960 + }, + { + "epoch": 1.0847261856230466, + "grad_norm": 7.077889442443848, + "learning_rate": 8.644601848077186e-06, + "loss": 3.2954, + "step": 15965 + }, + { + "epoch": 1.0850659056937084, + "grad_norm": 6.000818252563477, + "learning_rate": 8.644177197988857e-06, + "loss": 3.168, + "step": 15970 + }, + { + "epoch": 1.0854056257643703, + "grad_norm": 6.4675726890563965, + "learning_rate": 8.64375254790053e-06, + "loss": 3.3367, + "step": 15975 + }, + { + "epoch": 1.085745345835032, + "grad_norm": 6.781857490539551, + "learning_rate": 8.643327897812204e-06, + "loss": 3.3438, + "step": 15980 + }, + { + "epoch": 1.0860850659056938, + "grad_norm": 7.1982598304748535, + "learning_rate": 8.642903247723876e-06, + "loss": 3.204, + "step": 15985 + }, + { + "epoch": 1.0864247859763554, + "grad_norm": 5.955109596252441, + "learning_rate": 8.642478597635548e-06, + "loss": 3.1111, + "step": 15990 + }, + { + "epoch": 1.0867645060470172, + "grad_norm": 6.423818111419678, + "learning_rate": 8.642053947547223e-06, + "loss": 3.2865, + "step": 15995 + }, + { + "epoch": 1.087104226117679, + "grad_norm": 7.753147125244141, + "learning_rate": 8.641629297458894e-06, + "loss": 3.4143, + "step": 16000 + }, + { + "epoch": 1.0874439461883407, + "grad_norm": 5.981224536895752, + "learning_rate": 8.641204647370567e-06, + "loss": 3.1666, + "step": 16005 + }, + { + "epoch": 1.0877836662590026, + "grad_norm": 6.530725479125977, + "learning_rate": 8.64077999728224e-06, + "loss": 3.3796, + "step": 16010 + }, + { + "epoch": 1.0881233863296644, + "grad_norm": 5.260318279266357, + "learning_rate": 8.640355347193912e-06, + "loss": 3.1511, + "step": 16015 + }, + { + "epoch": 1.088463106400326, + "grad_norm": 5.438953399658203, + "learning_rate": 8.639930697105585e-06, + "loss": 3.0971, + "step": 16020 + }, + { + "epoch": 1.088802826470988, + "grad_norm": 5.752909183502197, + "learning_rate": 8.639506047017258e-06, + "loss": 3.1098, + "step": 16025 + }, + { + "epoch": 1.0891425465416498, + "grad_norm": 6.373091220855713, + "learning_rate": 8.63908139692893e-06, + "loss": 3.1576, + "step": 16030 + }, + { + "epoch": 1.0894822666123114, + "grad_norm": 6.975641250610352, + "learning_rate": 8.638656746840604e-06, + "loss": 3.0609, + "step": 16035 + }, + { + "epoch": 1.0898219866829733, + "grad_norm": 6.629700183868408, + "learning_rate": 8.638232096752276e-06, + "loss": 3.0057, + "step": 16040 + }, + { + "epoch": 1.090161706753635, + "grad_norm": 6.831123352050781, + "learning_rate": 8.637807446663949e-06, + "loss": 3.0621, + "step": 16045 + }, + { + "epoch": 1.0905014268242967, + "grad_norm": 5.100019931793213, + "learning_rate": 8.637382796575622e-06, + "loss": 3.4141, + "step": 16050 + }, + { + "epoch": 1.0908411468949586, + "grad_norm": 5.581820964813232, + "learning_rate": 8.636958146487295e-06, + "loss": 3.1324, + "step": 16055 + }, + { + "epoch": 1.0911808669656202, + "grad_norm": 6.833179473876953, + "learning_rate": 8.636533496398968e-06, + "loss": 3.0044, + "step": 16060 + }, + { + "epoch": 1.091520587036282, + "grad_norm": 7.514232158660889, + "learning_rate": 8.636108846310642e-06, + "loss": 3.1572, + "step": 16065 + }, + { + "epoch": 1.091860307106944, + "grad_norm": 8.570684432983398, + "learning_rate": 8.635684196222313e-06, + "loss": 3.1696, + "step": 16070 + }, + { + "epoch": 1.0922000271776056, + "grad_norm": 6.394261837005615, + "learning_rate": 8.635259546133986e-06, + "loss": 3.232, + "step": 16075 + }, + { + "epoch": 1.0925397472482674, + "grad_norm": 6.456424713134766, + "learning_rate": 8.63483489604566e-06, + "loss": 3.2311, + "step": 16080 + }, + { + "epoch": 1.0928794673189293, + "grad_norm": 7.2502217292785645, + "learning_rate": 8.634410245957332e-06, + "loss": 3.1466, + "step": 16085 + }, + { + "epoch": 1.093219187389591, + "grad_norm": 7.286230087280273, + "learning_rate": 8.633985595869004e-06, + "loss": 2.952, + "step": 16090 + }, + { + "epoch": 1.0935589074602527, + "grad_norm": 8.416799545288086, + "learning_rate": 8.633560945780677e-06, + "loss": 3.323, + "step": 16095 + }, + { + "epoch": 1.0938986275309146, + "grad_norm": 6.385899066925049, + "learning_rate": 8.63313629569235e-06, + "loss": 3.3215, + "step": 16100 + }, + { + "epoch": 1.0942383476015762, + "grad_norm": 6.855759620666504, + "learning_rate": 8.632711645604023e-06, + "loss": 3.1442, + "step": 16105 + }, + { + "epoch": 1.094578067672238, + "grad_norm": 7.181530952453613, + "learning_rate": 8.632286995515696e-06, + "loss": 3.1836, + "step": 16110 + }, + { + "epoch": 1.0949177877429, + "grad_norm": 7.720142841339111, + "learning_rate": 8.631862345427368e-06, + "loss": 3.2745, + "step": 16115 + }, + { + "epoch": 1.0952575078135616, + "grad_norm": 5.949118614196777, + "learning_rate": 8.631437695339041e-06, + "loss": 3.2597, + "step": 16120 + }, + { + "epoch": 1.0955972278842234, + "grad_norm": 8.850571632385254, + "learning_rate": 8.631013045250714e-06, + "loss": 3.112, + "step": 16125 + }, + { + "epoch": 1.0959369479548853, + "grad_norm": 6.579134941101074, + "learning_rate": 8.630588395162387e-06, + "loss": 3.3846, + "step": 16130 + }, + { + "epoch": 1.096276668025547, + "grad_norm": 7.4462432861328125, + "learning_rate": 8.63016374507406e-06, + "loss": 3.0012, + "step": 16135 + }, + { + "epoch": 1.0966163880962088, + "grad_norm": 5.273566722869873, + "learning_rate": 8.629739094985732e-06, + "loss": 3.3209, + "step": 16140 + }, + { + "epoch": 1.0969561081668706, + "grad_norm": 5.783092498779297, + "learning_rate": 8.629314444897405e-06, + "loss": 3.1204, + "step": 16145 + }, + { + "epoch": 1.0972958282375322, + "grad_norm": 5.759192943572998, + "learning_rate": 8.628889794809078e-06, + "loss": 3.1353, + "step": 16150 + }, + { + "epoch": 1.097635548308194, + "grad_norm": 6.572216510772705, + "learning_rate": 8.62846514472075e-06, + "loss": 3.1796, + "step": 16155 + }, + { + "epoch": 1.0979752683788557, + "grad_norm": 6.180042266845703, + "learning_rate": 8.628040494632424e-06, + "loss": 3.1544, + "step": 16160 + }, + { + "epoch": 1.0983149884495176, + "grad_norm": 7.915735721588135, + "learning_rate": 8.627615844544096e-06, + "loss": 3.0476, + "step": 16165 + }, + { + "epoch": 1.0986547085201794, + "grad_norm": 8.056050300598145, + "learning_rate": 8.627191194455769e-06, + "loss": 3.1721, + "step": 16170 + }, + { + "epoch": 1.098994428590841, + "grad_norm": 8.109063148498535, + "learning_rate": 8.626766544367442e-06, + "loss": 3.2412, + "step": 16175 + }, + { + "epoch": 1.099334148661503, + "grad_norm": 7.512217044830322, + "learning_rate": 8.626341894279115e-06, + "loss": 3.1984, + "step": 16180 + }, + { + "epoch": 1.0996738687321648, + "grad_norm": 6.046035289764404, + "learning_rate": 8.625917244190788e-06, + "loss": 3.3092, + "step": 16185 + }, + { + "epoch": 1.1000135888028264, + "grad_norm": 6.150191307067871, + "learning_rate": 8.62549259410246e-06, + "loss": 2.866, + "step": 16190 + }, + { + "epoch": 1.1003533088734883, + "grad_norm": 6.631065845489502, + "learning_rate": 8.625067944014133e-06, + "loss": 3.2484, + "step": 16195 + }, + { + "epoch": 1.10069302894415, + "grad_norm": 5.824328899383545, + "learning_rate": 8.624643293925806e-06, + "loss": 3.0639, + "step": 16200 + }, + { + "epoch": 1.1010327490148117, + "grad_norm": 5.943797588348389, + "learning_rate": 8.624218643837479e-06, + "loss": 3.2816, + "step": 16205 + }, + { + "epoch": 1.1013724690854736, + "grad_norm": 5.792860984802246, + "learning_rate": 8.623793993749152e-06, + "loss": 2.9896, + "step": 16210 + }, + { + "epoch": 1.1017121891561354, + "grad_norm": 4.9612836837768555, + "learning_rate": 8.623369343660824e-06, + "loss": 2.9969, + "step": 16215 + }, + { + "epoch": 1.102051909226797, + "grad_norm": 5.777973175048828, + "learning_rate": 8.622944693572497e-06, + "loss": 3.1391, + "step": 16220 + }, + { + "epoch": 1.102391629297459, + "grad_norm": 6.6431050300598145, + "learning_rate": 8.62252004348417e-06, + "loss": 3.3523, + "step": 16225 + }, + { + "epoch": 1.1027313493681206, + "grad_norm": 5.559348106384277, + "learning_rate": 8.622095393395843e-06, + "loss": 3.2212, + "step": 16230 + }, + { + "epoch": 1.1030710694387824, + "grad_norm": 7.961246967315674, + "learning_rate": 8.621670743307516e-06, + "loss": 3.2974, + "step": 16235 + }, + { + "epoch": 1.1034107895094443, + "grad_norm": 5.7380499839782715, + "learning_rate": 8.621246093219188e-06, + "loss": 3.3315, + "step": 16240 + }, + { + "epoch": 1.103750509580106, + "grad_norm": 6.256002426147461, + "learning_rate": 8.620821443130861e-06, + "loss": 3.2875, + "step": 16245 + }, + { + "epoch": 1.1040902296507678, + "grad_norm": 7.3245391845703125, + "learning_rate": 8.620396793042534e-06, + "loss": 3.2146, + "step": 16250 + }, + { + "epoch": 1.1044299497214296, + "grad_norm": 5.694943904876709, + "learning_rate": 8.619972142954207e-06, + "loss": 3.1354, + "step": 16255 + }, + { + "epoch": 1.1047696697920912, + "grad_norm": 5.515941143035889, + "learning_rate": 8.61954749286588e-06, + "loss": 3.0355, + "step": 16260 + }, + { + "epoch": 1.105109389862753, + "grad_norm": 7.344152927398682, + "learning_rate": 8.619122842777552e-06, + "loss": 3.061, + "step": 16265 + }, + { + "epoch": 1.105449109933415, + "grad_norm": 8.492867469787598, + "learning_rate": 8.618698192689225e-06, + "loss": 3.0521, + "step": 16270 + }, + { + "epoch": 1.1057888300040766, + "grad_norm": 6.2526140213012695, + "learning_rate": 8.618273542600898e-06, + "loss": 3.2645, + "step": 16275 + }, + { + "epoch": 1.1061285500747384, + "grad_norm": 7.451211452484131, + "learning_rate": 8.61784889251257e-06, + "loss": 3.1114, + "step": 16280 + }, + { + "epoch": 1.1064682701454003, + "grad_norm": 5.666543006896973, + "learning_rate": 8.617424242424242e-06, + "loss": 3.1848, + "step": 16285 + }, + { + "epoch": 1.106807990216062, + "grad_norm": 6.544435977935791, + "learning_rate": 8.616999592335916e-06, + "loss": 3.0664, + "step": 16290 + }, + { + "epoch": 1.1071477102867238, + "grad_norm": 6.979064464569092, + "learning_rate": 8.61657494224759e-06, + "loss": 2.9895, + "step": 16295 + }, + { + "epoch": 1.1074874303573856, + "grad_norm": 7.0885796546936035, + "learning_rate": 8.61615029215926e-06, + "loss": 2.8691, + "step": 16300 + }, + { + "epoch": 1.1078271504280472, + "grad_norm": 5.923239231109619, + "learning_rate": 8.615725642070935e-06, + "loss": 3.1239, + "step": 16305 + }, + { + "epoch": 1.108166870498709, + "grad_norm": 6.995856761932373, + "learning_rate": 8.615300991982608e-06, + "loss": 3.3867, + "step": 16310 + }, + { + "epoch": 1.108506590569371, + "grad_norm": 7.02217435836792, + "learning_rate": 8.614876341894279e-06, + "loss": 2.9697, + "step": 16315 + }, + { + "epoch": 1.1088463106400326, + "grad_norm": 6.929929256439209, + "learning_rate": 8.614451691805953e-06, + "loss": 3.3525, + "step": 16320 + }, + { + "epoch": 1.1091860307106944, + "grad_norm": 5.730223178863525, + "learning_rate": 8.614027041717626e-06, + "loss": 3.1749, + "step": 16325 + }, + { + "epoch": 1.109525750781356, + "grad_norm": 8.69602108001709, + "learning_rate": 8.613602391629297e-06, + "loss": 3.1551, + "step": 16330 + }, + { + "epoch": 1.109865470852018, + "grad_norm": 4.9134697914123535, + "learning_rate": 8.613177741540972e-06, + "loss": 3.3431, + "step": 16335 + }, + { + "epoch": 1.1102051909226798, + "grad_norm": 5.8416948318481445, + "learning_rate": 8.612753091452644e-06, + "loss": 3.084, + "step": 16340 + }, + { + "epoch": 1.1105449109933414, + "grad_norm": 6.6074538230896, + "learning_rate": 8.612328441364315e-06, + "loss": 3.336, + "step": 16345 + }, + { + "epoch": 1.1108846310640033, + "grad_norm": 5.414676189422607, + "learning_rate": 8.61190379127599e-06, + "loss": 3.181, + "step": 16350 + }, + { + "epoch": 1.1112243511346651, + "grad_norm": 8.097134590148926, + "learning_rate": 8.611479141187661e-06, + "loss": 3.2284, + "step": 16355 + }, + { + "epoch": 1.1115640712053267, + "grad_norm": 6.196414470672607, + "learning_rate": 8.611054491099334e-06, + "loss": 2.9329, + "step": 16360 + }, + { + "epoch": 1.1119037912759886, + "grad_norm": 5.413163661956787, + "learning_rate": 8.610629841011008e-06, + "loss": 3.1417, + "step": 16365 + }, + { + "epoch": 1.1122435113466504, + "grad_norm": 6.4571533203125, + "learning_rate": 8.61020519092268e-06, + "loss": 3.1311, + "step": 16370 + }, + { + "epoch": 1.112583231417312, + "grad_norm": 5.679881572723389, + "learning_rate": 8.609780540834352e-06, + "loss": 2.9621, + "step": 16375 + }, + { + "epoch": 1.112922951487974, + "grad_norm": 6.8964433670043945, + "learning_rate": 8.609355890746027e-06, + "loss": 3.3072, + "step": 16380 + }, + { + "epoch": 1.1132626715586358, + "grad_norm": 6.687068939208984, + "learning_rate": 8.608931240657698e-06, + "loss": 3.4132, + "step": 16385 + }, + { + "epoch": 1.1136023916292974, + "grad_norm": 6.200169563293457, + "learning_rate": 8.60850659056937e-06, + "loss": 3.4824, + "step": 16390 + }, + { + "epoch": 1.1139421116999593, + "grad_norm": 5.8250956535339355, + "learning_rate": 8.608081940481045e-06, + "loss": 3.3503, + "step": 16395 + }, + { + "epoch": 1.114281831770621, + "grad_norm": 6.897777080535889, + "learning_rate": 8.607657290392716e-06, + "loss": 3.131, + "step": 16400 + }, + { + "epoch": 1.1146215518412828, + "grad_norm": 7.398114204406738, + "learning_rate": 8.60723264030439e-06, + "loss": 3.1363, + "step": 16405 + }, + { + "epoch": 1.1149612719119446, + "grad_norm": 7.012904167175293, + "learning_rate": 8.606807990216064e-06, + "loss": 3.259, + "step": 16410 + }, + { + "epoch": 1.1153009919826062, + "grad_norm": 8.337095260620117, + "learning_rate": 8.606383340127735e-06, + "loss": 3.2272, + "step": 16415 + }, + { + "epoch": 1.115640712053268, + "grad_norm": 6.021138668060303, + "learning_rate": 8.60595869003941e-06, + "loss": 3.2009, + "step": 16420 + }, + { + "epoch": 1.11598043212393, + "grad_norm": 5.377895355224609, + "learning_rate": 8.605534039951082e-06, + "loss": 2.9995, + "step": 16425 + }, + { + "epoch": 1.1163201521945916, + "grad_norm": 7.166087627410889, + "learning_rate": 8.605109389862753e-06, + "loss": 3.3187, + "step": 16430 + }, + { + "epoch": 1.1166598722652534, + "grad_norm": 8.630145072937012, + "learning_rate": 8.604684739774428e-06, + "loss": 3.1314, + "step": 16435 + }, + { + "epoch": 1.1169995923359153, + "grad_norm": 7.169849872589111, + "learning_rate": 8.604260089686099e-06, + "loss": 3.1375, + "step": 16440 + }, + { + "epoch": 1.117339312406577, + "grad_norm": 7.572669982910156, + "learning_rate": 8.603835439597771e-06, + "loss": 3.1527, + "step": 16445 + }, + { + "epoch": 1.1176790324772388, + "grad_norm": 7.3418474197387695, + "learning_rate": 8.603410789509446e-06, + "loss": 3.46, + "step": 16450 + }, + { + "epoch": 1.1180187525479006, + "grad_norm": 6.971317768096924, + "learning_rate": 8.602986139421117e-06, + "loss": 3.242, + "step": 16455 + }, + { + "epoch": 1.1183584726185622, + "grad_norm": 9.341131210327148, + "learning_rate": 8.60256148933279e-06, + "loss": 3.2244, + "step": 16460 + }, + { + "epoch": 1.118698192689224, + "grad_norm": 6.6447672843933105, + "learning_rate": 8.602136839244464e-06, + "loss": 3.3892, + "step": 16465 + }, + { + "epoch": 1.119037912759886, + "grad_norm": 8.699872016906738, + "learning_rate": 8.601712189156136e-06, + "loss": 2.9586, + "step": 16470 + }, + { + "epoch": 1.1193776328305476, + "grad_norm": 4.790816307067871, + "learning_rate": 8.601287539067808e-06, + "loss": 3.2427, + "step": 16475 + }, + { + "epoch": 1.1197173529012094, + "grad_norm": 6.541463375091553, + "learning_rate": 8.600862888979483e-06, + "loss": 3.2605, + "step": 16480 + }, + { + "epoch": 1.1200570729718713, + "grad_norm": 7.209291934967041, + "learning_rate": 8.600438238891154e-06, + "loss": 3.1164, + "step": 16485 + }, + { + "epoch": 1.120396793042533, + "grad_norm": 6.27357816696167, + "learning_rate": 8.600013588802827e-06, + "loss": 3.2098, + "step": 16490 + }, + { + "epoch": 1.1207365131131948, + "grad_norm": 5.926974773406982, + "learning_rate": 8.599588938714501e-06, + "loss": 3.1827, + "step": 16495 + }, + { + "epoch": 1.1210762331838564, + "grad_norm": 6.859259128570557, + "learning_rate": 8.599164288626172e-06, + "loss": 3.2403, + "step": 16500 + }, + { + "epoch": 1.1214159532545183, + "grad_norm": 5.920802116394043, + "learning_rate": 8.598739638537845e-06, + "loss": 3.1659, + "step": 16505 + }, + { + "epoch": 1.1217556733251801, + "grad_norm": 6.482490539550781, + "learning_rate": 8.598314988449518e-06, + "loss": 3.1409, + "step": 16510 + }, + { + "epoch": 1.1220953933958417, + "grad_norm": 6.3046441078186035, + "learning_rate": 8.59789033836119e-06, + "loss": 3.1557, + "step": 16515 + }, + { + "epoch": 1.1224351134665036, + "grad_norm": 6.8752522468566895, + "learning_rate": 8.597465688272864e-06, + "loss": 3.2991, + "step": 16520 + }, + { + "epoch": 1.1227748335371655, + "grad_norm": 6.2004570960998535, + "learning_rate": 8.597041038184536e-06, + "loss": 3.0616, + "step": 16525 + }, + { + "epoch": 1.123114553607827, + "grad_norm": 6.604061126708984, + "learning_rate": 8.596616388096209e-06, + "loss": 3.1758, + "step": 16530 + }, + { + "epoch": 1.123454273678489, + "grad_norm": 6.52665901184082, + "learning_rate": 8.596191738007882e-06, + "loss": 3.0624, + "step": 16535 + }, + { + "epoch": 1.1237939937491508, + "grad_norm": 7.087593078613281, + "learning_rate": 8.595767087919555e-06, + "loss": 3.3223, + "step": 16540 + }, + { + "epoch": 1.1241337138198124, + "grad_norm": 6.7145256996154785, + "learning_rate": 8.595342437831228e-06, + "loss": 3.2255, + "step": 16545 + }, + { + "epoch": 1.1244734338904743, + "grad_norm": 6.721302032470703, + "learning_rate": 8.595002717760565e-06, + "loss": 3.1618, + "step": 16550 + }, + { + "epoch": 1.1248131539611361, + "grad_norm": 6.786927223205566, + "learning_rate": 8.59457806767224e-06, + "loss": 3.136, + "step": 16555 + }, + { + "epoch": 1.1251528740317978, + "grad_norm": 7.417334079742432, + "learning_rate": 8.594153417583912e-06, + "loss": 3.2495, + "step": 16560 + }, + { + "epoch": 1.1254925941024596, + "grad_norm": 7.9601287841796875, + "learning_rate": 8.593728767495583e-06, + "loss": 3.265, + "step": 16565 + }, + { + "epoch": 1.1258323141731212, + "grad_norm": 6.622315406799316, + "learning_rate": 8.593304117407258e-06, + "loss": 3.0824, + "step": 16570 + }, + { + "epoch": 1.126172034243783, + "grad_norm": 8.279936790466309, + "learning_rate": 8.59287946731893e-06, + "loss": 3.2651, + "step": 16575 + }, + { + "epoch": 1.126511754314445, + "grad_norm": 7.426762580871582, + "learning_rate": 8.592454817230602e-06, + "loss": 3.3403, + "step": 16580 + }, + { + "epoch": 1.1268514743851066, + "grad_norm": 4.8542070388793945, + "learning_rate": 8.592030167142276e-06, + "loss": 3.2991, + "step": 16585 + }, + { + "epoch": 1.1271911944557684, + "grad_norm": 6.657066822052002, + "learning_rate": 8.591605517053949e-06, + "loss": 3.1115, + "step": 16590 + }, + { + "epoch": 1.1275309145264303, + "grad_norm": 6.747493267059326, + "learning_rate": 8.59118086696562e-06, + "loss": 3.4656, + "step": 16595 + }, + { + "epoch": 1.127870634597092, + "grad_norm": 9.694433212280273, + "learning_rate": 8.590756216877294e-06, + "loss": 3.1946, + "step": 16600 + }, + { + "epoch": 1.1282103546677538, + "grad_norm": 6.723586082458496, + "learning_rate": 8.590331566788967e-06, + "loss": 3.1654, + "step": 16605 + }, + { + "epoch": 1.1285500747384156, + "grad_norm": 6.316267490386963, + "learning_rate": 8.58990691670064e-06, + "loss": 3.2618, + "step": 16610 + }, + { + "epoch": 1.1288897948090773, + "grad_norm": 6.723963260650635, + "learning_rate": 8.589482266612313e-06, + "loss": 3.2504, + "step": 16615 + }, + { + "epoch": 1.129229514879739, + "grad_norm": 6.9316630363464355, + "learning_rate": 8.589057616523984e-06, + "loss": 3.0778, + "step": 16620 + }, + { + "epoch": 1.129569234950401, + "grad_norm": 7.38731050491333, + "learning_rate": 8.588632966435658e-06, + "loss": 3.176, + "step": 16625 + }, + { + "epoch": 1.1299089550210626, + "grad_norm": 7.49238395690918, + "learning_rate": 8.588208316347331e-06, + "loss": 2.9997, + "step": 16630 + }, + { + "epoch": 1.1302486750917244, + "grad_norm": 6.330277919769287, + "learning_rate": 8.587783666259002e-06, + "loss": 3.2873, + "step": 16635 + }, + { + "epoch": 1.1305883951623863, + "grad_norm": 7.552095413208008, + "learning_rate": 8.587359016170677e-06, + "loss": 3.3662, + "step": 16640 + }, + { + "epoch": 1.130928115233048, + "grad_norm": 6.899776458740234, + "learning_rate": 8.58693436608235e-06, + "loss": 3.1112, + "step": 16645 + }, + { + "epoch": 1.1312678353037098, + "grad_norm": 7.817601203918457, + "learning_rate": 8.58650971599402e-06, + "loss": 3.496, + "step": 16650 + }, + { + "epoch": 1.1316075553743716, + "grad_norm": 5.936111927032471, + "learning_rate": 8.586085065905695e-06, + "loss": 2.9751, + "step": 16655 + }, + { + "epoch": 1.1319472754450333, + "grad_norm": 6.400041580200195, + "learning_rate": 8.585660415817368e-06, + "loss": 3.2339, + "step": 16660 + }, + { + "epoch": 1.1322869955156951, + "grad_norm": 5.472104549407959, + "learning_rate": 8.58523576572904e-06, + "loss": 3.0848, + "step": 16665 + }, + { + "epoch": 1.1326267155863567, + "grad_norm": 7.11216926574707, + "learning_rate": 8.584811115640714e-06, + "loss": 3.2714, + "step": 16670 + }, + { + "epoch": 1.1329664356570186, + "grad_norm": 6.753519535064697, + "learning_rate": 8.584386465552386e-06, + "loss": 3.2205, + "step": 16675 + }, + { + "epoch": 1.1333061557276805, + "grad_norm": 7.458407878875732, + "learning_rate": 8.583961815464058e-06, + "loss": 3.1767, + "step": 16680 + }, + { + "epoch": 1.133645875798342, + "grad_norm": 6.213800430297852, + "learning_rate": 8.583537165375732e-06, + "loss": 3.2414, + "step": 16685 + }, + { + "epoch": 1.133985595869004, + "grad_norm": 8.79051685333252, + "learning_rate": 8.583112515287403e-06, + "loss": 3.2938, + "step": 16690 + }, + { + "epoch": 1.1343253159396658, + "grad_norm": 6.862942695617676, + "learning_rate": 8.582687865199076e-06, + "loss": 3.1654, + "step": 16695 + }, + { + "epoch": 1.1346650360103274, + "grad_norm": 7.467367649078369, + "learning_rate": 8.58226321511075e-06, + "loss": 3.0836, + "step": 16700 + }, + { + "epoch": 1.1350047560809893, + "grad_norm": 6.137763500213623, + "learning_rate": 8.581838565022422e-06, + "loss": 3.3088, + "step": 16705 + }, + { + "epoch": 1.1353444761516511, + "grad_norm": 8.128146171569824, + "learning_rate": 8.581413914934094e-06, + "loss": 3.2579, + "step": 16710 + }, + { + "epoch": 1.1356841962223128, + "grad_norm": 6.150387763977051, + "learning_rate": 8.580989264845769e-06, + "loss": 3.2078, + "step": 16715 + }, + { + "epoch": 1.1360239162929746, + "grad_norm": 6.834339618682861, + "learning_rate": 8.58056461475744e-06, + "loss": 3.3934, + "step": 16720 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 6.752582550048828, + "learning_rate": 8.580139964669113e-06, + "loss": 3.2635, + "step": 16725 + }, + { + "epoch": 1.136703356434298, + "grad_norm": 6.1579270362854, + "learning_rate": 8.579715314580787e-06, + "loss": 3.3681, + "step": 16730 + }, + { + "epoch": 1.13704307650496, + "grad_norm": 6.543057918548584, + "learning_rate": 8.579290664492458e-06, + "loss": 3.3121, + "step": 16735 + }, + { + "epoch": 1.1373827965756216, + "grad_norm": 6.666302680969238, + "learning_rate": 8.578866014404131e-06, + "loss": 3.1251, + "step": 16740 + }, + { + "epoch": 1.1377225166462834, + "grad_norm": 6.883190631866455, + "learning_rate": 8.578441364315806e-06, + "loss": 3.1925, + "step": 16745 + }, + { + "epoch": 1.1380622367169453, + "grad_norm": 7.656110763549805, + "learning_rate": 8.578016714227477e-06, + "loss": 3.1822, + "step": 16750 + }, + { + "epoch": 1.138401956787607, + "grad_norm": 9.218098640441895, + "learning_rate": 8.57759206413915e-06, + "loss": 3.4359, + "step": 16755 + }, + { + "epoch": 1.1387416768582688, + "grad_norm": 7.853702068328857, + "learning_rate": 8.577167414050822e-06, + "loss": 3.1061, + "step": 16760 + }, + { + "epoch": 1.1390813969289306, + "grad_norm": 5.789314270019531, + "learning_rate": 8.576742763962495e-06, + "loss": 3.3801, + "step": 16765 + }, + { + "epoch": 1.1394211169995923, + "grad_norm": 7.662946701049805, + "learning_rate": 8.576318113874168e-06, + "loss": 3.1438, + "step": 16770 + }, + { + "epoch": 1.139760837070254, + "grad_norm": 6.989950656890869, + "learning_rate": 8.57589346378584e-06, + "loss": 3.1121, + "step": 16775 + }, + { + "epoch": 1.140100557140916, + "grad_norm": 5.8362226486206055, + "learning_rate": 8.575468813697514e-06, + "loss": 3.0875, + "step": 16780 + }, + { + "epoch": 1.1404402772115776, + "grad_norm": 7.6342010498046875, + "learning_rate": 8.575044163609186e-06, + "loss": 3.222, + "step": 16785 + }, + { + "epoch": 1.1407799972822394, + "grad_norm": 6.425926208496094, + "learning_rate": 8.57461951352086e-06, + "loss": 3.4547, + "step": 16790 + }, + { + "epoch": 1.1411197173529013, + "grad_norm": 7.2010979652404785, + "learning_rate": 8.574194863432532e-06, + "loss": 3.0132, + "step": 16795 + }, + { + "epoch": 1.141459437423563, + "grad_norm": 7.416762828826904, + "learning_rate": 8.573770213344205e-06, + "loss": 3.2072, + "step": 16800 + }, + { + "epoch": 1.1417991574942248, + "grad_norm": 6.862436294555664, + "learning_rate": 8.573345563255878e-06, + "loss": 3.1935, + "step": 16805 + }, + { + "epoch": 1.1421388775648866, + "grad_norm": 7.008119583129883, + "learning_rate": 8.57292091316755e-06, + "loss": 3.2695, + "step": 16810 + }, + { + "epoch": 1.1424785976355483, + "grad_norm": 8.954910278320312, + "learning_rate": 8.572496263079223e-06, + "loss": 3.2221, + "step": 16815 + }, + { + "epoch": 1.1428183177062101, + "grad_norm": 6.7774977684021, + "learning_rate": 8.572071612990896e-06, + "loss": 3.0145, + "step": 16820 + }, + { + "epoch": 1.143158037776872, + "grad_norm": 7.745080471038818, + "learning_rate": 8.571646962902569e-06, + "loss": 3.0849, + "step": 16825 + }, + { + "epoch": 1.1434977578475336, + "grad_norm": 6.7285637855529785, + "learning_rate": 8.571222312814242e-06, + "loss": 3.2185, + "step": 16830 + }, + { + "epoch": 1.1438374779181955, + "grad_norm": 6.2319865226745605, + "learning_rate": 8.570797662725914e-06, + "loss": 3.3038, + "step": 16835 + }, + { + "epoch": 1.144177197988857, + "grad_norm": 6.627745628356934, + "learning_rate": 8.570373012637587e-06, + "loss": 3.324, + "step": 16840 + }, + { + "epoch": 1.144516918059519, + "grad_norm": 5.929566860198975, + "learning_rate": 8.56994836254926e-06, + "loss": 3.1845, + "step": 16845 + }, + { + "epoch": 1.1448566381301808, + "grad_norm": 7.235621929168701, + "learning_rate": 8.569523712460933e-06, + "loss": 3.2195, + "step": 16850 + }, + { + "epoch": 1.1451963582008424, + "grad_norm": 7.747852325439453, + "learning_rate": 8.569099062372606e-06, + "loss": 3.0558, + "step": 16855 + }, + { + "epoch": 1.1455360782715043, + "grad_norm": 5.673890113830566, + "learning_rate": 8.568674412284278e-06, + "loss": 3.2391, + "step": 16860 + }, + { + "epoch": 1.1458757983421661, + "grad_norm": 6.678596496582031, + "learning_rate": 8.568249762195951e-06, + "loss": 3.031, + "step": 16865 + }, + { + "epoch": 1.1462155184128278, + "grad_norm": 6.979052543640137, + "learning_rate": 8.567825112107624e-06, + "loss": 3.1672, + "step": 16870 + }, + { + "epoch": 1.1465552384834896, + "grad_norm": 7.705677509307861, + "learning_rate": 8.567400462019297e-06, + "loss": 3.3974, + "step": 16875 + }, + { + "epoch": 1.1468949585541515, + "grad_norm": 6.816099166870117, + "learning_rate": 8.56697581193097e-06, + "loss": 3.0999, + "step": 16880 + }, + { + "epoch": 1.147234678624813, + "grad_norm": 7.178157329559326, + "learning_rate": 8.566551161842642e-06, + "loss": 3.2319, + "step": 16885 + }, + { + "epoch": 1.147574398695475, + "grad_norm": 7.334706783294678, + "learning_rate": 8.566126511754315e-06, + "loss": 3.2275, + "step": 16890 + }, + { + "epoch": 1.1479141187661366, + "grad_norm": 7.460343837738037, + "learning_rate": 8.565701861665988e-06, + "loss": 3.2713, + "step": 16895 + }, + { + "epoch": 1.1482538388367984, + "grad_norm": 6.739879131317139, + "learning_rate": 8.56527721157766e-06, + "loss": 3.1223, + "step": 16900 + }, + { + "epoch": 1.1485935589074603, + "grad_norm": 5.852973461151123, + "learning_rate": 8.564852561489334e-06, + "loss": 3.3085, + "step": 16905 + }, + { + "epoch": 1.148933278978122, + "grad_norm": 6.614869117736816, + "learning_rate": 8.564427911401006e-06, + "loss": 3.0299, + "step": 16910 + }, + { + "epoch": 1.1492729990487838, + "grad_norm": 6.384291648864746, + "learning_rate": 8.56400326131268e-06, + "loss": 3.0798, + "step": 16915 + }, + { + "epoch": 1.1496127191194456, + "grad_norm": 7.013394355773926, + "learning_rate": 8.563578611224352e-06, + "loss": 3.0985, + "step": 16920 + }, + { + "epoch": 1.1499524391901073, + "grad_norm": 7.218706130981445, + "learning_rate": 8.563153961136025e-06, + "loss": 3.0891, + "step": 16925 + }, + { + "epoch": 1.150292159260769, + "grad_norm": 7.837388515472412, + "learning_rate": 8.562729311047698e-06, + "loss": 3.2492, + "step": 16930 + }, + { + "epoch": 1.150631879331431, + "grad_norm": 6.4668707847595215, + "learning_rate": 8.56230466095937e-06, + "loss": 3.1633, + "step": 16935 + }, + { + "epoch": 1.1509715994020926, + "grad_norm": 5.989139556884766, + "learning_rate": 8.561880010871043e-06, + "loss": 3.3353, + "step": 16940 + }, + { + "epoch": 1.1513113194727544, + "grad_norm": 6.161069869995117, + "learning_rate": 8.561455360782716e-06, + "loss": 3.0285, + "step": 16945 + }, + { + "epoch": 1.1516510395434163, + "grad_norm": 5.335538864135742, + "learning_rate": 8.561030710694389e-06, + "loss": 3.1644, + "step": 16950 + }, + { + "epoch": 1.151990759614078, + "grad_norm": 5.906455993652344, + "learning_rate": 8.560606060606062e-06, + "loss": 2.9867, + "step": 16955 + }, + { + "epoch": 1.1523304796847398, + "grad_norm": 6.421401023864746, + "learning_rate": 8.560181410517734e-06, + "loss": 3.0225, + "step": 16960 + }, + { + "epoch": 1.1526701997554016, + "grad_norm": 5.832885265350342, + "learning_rate": 8.559756760429407e-06, + "loss": 3.1193, + "step": 16965 + }, + { + "epoch": 1.1530099198260633, + "grad_norm": 8.049269676208496, + "learning_rate": 8.55933211034108e-06, + "loss": 3.1999, + "step": 16970 + }, + { + "epoch": 1.1533496398967251, + "grad_norm": 6.560512065887451, + "learning_rate": 8.558907460252753e-06, + "loss": 3.0117, + "step": 16975 + }, + { + "epoch": 1.153689359967387, + "grad_norm": 8.375260353088379, + "learning_rate": 8.558482810164426e-06, + "loss": 3.1536, + "step": 16980 + }, + { + "epoch": 1.1540290800380486, + "grad_norm": 8.038991928100586, + "learning_rate": 8.558058160076098e-06, + "loss": 3.3143, + "step": 16985 + }, + { + "epoch": 1.1543688001087105, + "grad_norm": 6.207976341247559, + "learning_rate": 8.557633509987771e-06, + "loss": 3.0055, + "step": 16990 + }, + { + "epoch": 1.1547085201793723, + "grad_norm": 6.437304973602295, + "learning_rate": 8.557208859899444e-06, + "loss": 3.1879, + "step": 16995 + }, + { + "epoch": 1.155048240250034, + "grad_norm": 6.987186908721924, + "learning_rate": 8.556784209811117e-06, + "loss": 3.2233, + "step": 17000 + }, + { + "epoch": 1.1553879603206958, + "grad_norm": 8.22426700592041, + "learning_rate": 8.55635955972279e-06, + "loss": 3.1969, + "step": 17005 + }, + { + "epoch": 1.1557276803913574, + "grad_norm": 7.039334297180176, + "learning_rate": 8.555934909634462e-06, + "loss": 3.2993, + "step": 17010 + }, + { + "epoch": 1.1560674004620193, + "grad_norm": 6.061263561248779, + "learning_rate": 8.555510259546135e-06, + "loss": 3.2712, + "step": 17015 + }, + { + "epoch": 1.1564071205326811, + "grad_norm": 5.230696678161621, + "learning_rate": 8.555085609457808e-06, + "loss": 3.2701, + "step": 17020 + }, + { + "epoch": 1.1567468406033428, + "grad_norm": 9.176583290100098, + "learning_rate": 8.55466095936948e-06, + "loss": 3.4886, + "step": 17025 + }, + { + "epoch": 1.1570865606740046, + "grad_norm": 7.433403015136719, + "learning_rate": 8.554236309281154e-06, + "loss": 3.2458, + "step": 17030 + }, + { + "epoch": 1.1574262807446665, + "grad_norm": 7.45168924331665, + "learning_rate": 8.553811659192825e-06, + "loss": 3.1021, + "step": 17035 + }, + { + "epoch": 1.157766000815328, + "grad_norm": 7.76920223236084, + "learning_rate": 8.5533870091045e-06, + "loss": 2.9402, + "step": 17040 + }, + { + "epoch": 1.15810572088599, + "grad_norm": 5.547115802764893, + "learning_rate": 8.552962359016172e-06, + "loss": 3.4495, + "step": 17045 + }, + { + "epoch": 1.1584454409566518, + "grad_norm": 7.063078880310059, + "learning_rate": 8.552537708927843e-06, + "loss": 3.0021, + "step": 17050 + }, + { + "epoch": 1.1587851610273134, + "grad_norm": 5.606100559234619, + "learning_rate": 8.552113058839518e-06, + "loss": 3.3074, + "step": 17055 + }, + { + "epoch": 1.1591248810979753, + "grad_norm": 5.788193702697754, + "learning_rate": 8.55168840875119e-06, + "loss": 3.1649, + "step": 17060 + }, + { + "epoch": 1.159464601168637, + "grad_norm": 5.448424339294434, + "learning_rate": 8.551263758662862e-06, + "loss": 3.2438, + "step": 17065 + }, + { + "epoch": 1.1598043212392988, + "grad_norm": 6.554318428039551, + "learning_rate": 8.550839108574536e-06, + "loss": 3.2143, + "step": 17070 + }, + { + "epoch": 1.1601440413099606, + "grad_norm": 7.065948486328125, + "learning_rate": 8.550414458486209e-06, + "loss": 3.2596, + "step": 17075 + }, + { + "epoch": 1.1604837613806223, + "grad_norm": 7.982277870178223, + "learning_rate": 8.54998980839788e-06, + "loss": 2.9548, + "step": 17080 + }, + { + "epoch": 1.1608234814512841, + "grad_norm": 7.054662227630615, + "learning_rate": 8.549565158309554e-06, + "loss": 3.1471, + "step": 17085 + }, + { + "epoch": 1.161163201521946, + "grad_norm": 6.567215919494629, + "learning_rate": 8.549140508221227e-06, + "loss": 3.1236, + "step": 17090 + }, + { + "epoch": 1.1615029215926076, + "grad_norm": 6.386531352996826, + "learning_rate": 8.548715858132898e-06, + "loss": 3.1809, + "step": 17095 + }, + { + "epoch": 1.1618426416632694, + "grad_norm": 6.646076679229736, + "learning_rate": 8.548291208044573e-06, + "loss": 3.0587, + "step": 17100 + }, + { + "epoch": 1.1621823617339313, + "grad_norm": 6.849241733551025, + "learning_rate": 8.547866557956244e-06, + "loss": 3.1956, + "step": 17105 + }, + { + "epoch": 1.162522081804593, + "grad_norm": 6.0146989822387695, + "learning_rate": 8.547441907867917e-06, + "loss": 3.0579, + "step": 17110 + }, + { + "epoch": 1.1628618018752548, + "grad_norm": 7.030190467834473, + "learning_rate": 8.547017257779591e-06, + "loss": 3.1017, + "step": 17115 + }, + { + "epoch": 1.1632015219459166, + "grad_norm": 7.166305065155029, + "learning_rate": 8.546592607691262e-06, + "loss": 3.0944, + "step": 17120 + }, + { + "epoch": 1.1635412420165783, + "grad_norm": 5.666445732116699, + "learning_rate": 8.546167957602935e-06, + "loss": 3.0511, + "step": 17125 + }, + { + "epoch": 1.1638809620872401, + "grad_norm": 5.187930107116699, + "learning_rate": 8.54574330751461e-06, + "loss": 3.4681, + "step": 17130 + }, + { + "epoch": 1.164220682157902, + "grad_norm": 7.08839225769043, + "learning_rate": 8.54531865742628e-06, + "loss": 3.0557, + "step": 17135 + }, + { + "epoch": 1.1645604022285636, + "grad_norm": 6.604378700256348, + "learning_rate": 8.544894007337954e-06, + "loss": 3.3511, + "step": 17140 + }, + { + "epoch": 1.1649001222992255, + "grad_norm": 6.628900051116943, + "learning_rate": 8.544469357249628e-06, + "loss": 3.1602, + "step": 17145 + }, + { + "epoch": 1.1652398423698873, + "grad_norm": 8.612101554870605, + "learning_rate": 8.544044707161299e-06, + "loss": 3.2483, + "step": 17150 + }, + { + "epoch": 1.165579562440549, + "grad_norm": 5.598548412322998, + "learning_rate": 8.543620057072972e-06, + "loss": 3.1275, + "step": 17155 + }, + { + "epoch": 1.1659192825112108, + "grad_norm": 6.15999174118042, + "learning_rate": 8.543195406984646e-06, + "loss": 3.3335, + "step": 17160 + }, + { + "epoch": 1.1662590025818727, + "grad_norm": 7.5597825050354, + "learning_rate": 8.542770756896318e-06, + "loss": 3.2668, + "step": 17165 + }, + { + "epoch": 1.1665987226525343, + "grad_norm": 5.101920127868652, + "learning_rate": 8.54234610680799e-06, + "loss": 3.0301, + "step": 17170 + }, + { + "epoch": 1.1669384427231961, + "grad_norm": 7.8180012702941895, + "learning_rate": 8.541921456719665e-06, + "loss": 3.3671, + "step": 17175 + }, + { + "epoch": 1.1672781627938578, + "grad_norm": 6.372391223907471, + "learning_rate": 8.541496806631336e-06, + "loss": 3.3156, + "step": 17180 + }, + { + "epoch": 1.1676178828645196, + "grad_norm": 5.92647123336792, + "learning_rate": 8.541072156543009e-06, + "loss": 3.3344, + "step": 17185 + }, + { + "epoch": 1.1679576029351815, + "grad_norm": 7.649982929229736, + "learning_rate": 8.540647506454682e-06, + "loss": 3.0628, + "step": 17190 + }, + { + "epoch": 1.168297323005843, + "grad_norm": 9.218098640441895, + "learning_rate": 8.540222856366354e-06, + "loss": 3.1404, + "step": 17195 + }, + { + "epoch": 1.168637043076505, + "grad_norm": 7.353550434112549, + "learning_rate": 8.539798206278027e-06, + "loss": 3.3241, + "step": 17200 + }, + { + "epoch": 1.1689767631471668, + "grad_norm": 5.649467468261719, + "learning_rate": 8.5393735561897e-06, + "loss": 3.1797, + "step": 17205 + }, + { + "epoch": 1.1693164832178284, + "grad_norm": 6.6558332443237305, + "learning_rate": 8.538948906101373e-06, + "loss": 3.1938, + "step": 17210 + }, + { + "epoch": 1.1696562032884903, + "grad_norm": 7.07541036605835, + "learning_rate": 8.538524256013046e-06, + "loss": 3.2648, + "step": 17215 + }, + { + "epoch": 1.1699959233591521, + "grad_norm": 5.2271504402160645, + "learning_rate": 8.538099605924718e-06, + "loss": 3.2526, + "step": 17220 + }, + { + "epoch": 1.1703356434298138, + "grad_norm": 6.226665019989014, + "learning_rate": 8.537674955836391e-06, + "loss": 3.0679, + "step": 17225 + }, + { + "epoch": 1.1706753635004756, + "grad_norm": 6.890496730804443, + "learning_rate": 8.537250305748064e-06, + "loss": 3.2028, + "step": 17230 + }, + { + "epoch": 1.1710150835711373, + "grad_norm": 6.376018524169922, + "learning_rate": 8.536825655659737e-06, + "loss": 2.9276, + "step": 17235 + }, + { + "epoch": 1.1713548036417991, + "grad_norm": 8.50093936920166, + "learning_rate": 8.53640100557141e-06, + "loss": 2.9508, + "step": 17240 + }, + { + "epoch": 1.171694523712461, + "grad_norm": 7.283524513244629, + "learning_rate": 8.535976355483082e-06, + "loss": 3.207, + "step": 17245 + }, + { + "epoch": 1.1720342437831226, + "grad_norm": 7.0892014503479, + "learning_rate": 8.535551705394755e-06, + "loss": 3.1469, + "step": 17250 + }, + { + "epoch": 1.1723739638537845, + "grad_norm": 7.1157355308532715, + "learning_rate": 8.535127055306428e-06, + "loss": 3.324, + "step": 17255 + }, + { + "epoch": 1.1727136839244463, + "grad_norm": 6.675520896911621, + "learning_rate": 8.5347024052181e-06, + "loss": 3.2174, + "step": 17260 + }, + { + "epoch": 1.173053403995108, + "grad_norm": 6.04493522644043, + "learning_rate": 8.534277755129774e-06, + "loss": 3.1831, + "step": 17265 + }, + { + "epoch": 1.1733931240657698, + "grad_norm": 7.384869575500488, + "learning_rate": 8.533853105041446e-06, + "loss": 3.122, + "step": 17270 + }, + { + "epoch": 1.1737328441364316, + "grad_norm": 6.948855400085449, + "learning_rate": 8.53342845495312e-06, + "loss": 2.8183, + "step": 17275 + }, + { + "epoch": 1.1740725642070933, + "grad_norm": 7.755239009857178, + "learning_rate": 8.533003804864792e-06, + "loss": 3.4393, + "step": 17280 + }, + { + "epoch": 1.1744122842777551, + "grad_norm": 6.929354667663574, + "learning_rate": 8.532579154776465e-06, + "loss": 3.159, + "step": 17285 + }, + { + "epoch": 1.174752004348417, + "grad_norm": 6.88331413269043, + "learning_rate": 8.532154504688138e-06, + "loss": 3.2855, + "step": 17290 + }, + { + "epoch": 1.1750917244190786, + "grad_norm": 6.481498718261719, + "learning_rate": 8.53172985459981e-06, + "loss": 3.3706, + "step": 17295 + }, + { + "epoch": 1.1754314444897405, + "grad_norm": 6.622309684753418, + "learning_rate": 8.531305204511483e-06, + "loss": 3.4015, + "step": 17300 + }, + { + "epoch": 1.1757711645604023, + "grad_norm": 6.866086483001709, + "learning_rate": 8.530880554423156e-06, + "loss": 3.23, + "step": 17305 + }, + { + "epoch": 1.176110884631064, + "grad_norm": 7.246187686920166, + "learning_rate": 8.530455904334829e-06, + "loss": 3.2655, + "step": 17310 + }, + { + "epoch": 1.1764506047017258, + "grad_norm": 6.3059587478637695, + "learning_rate": 8.530031254246502e-06, + "loss": 3.1464, + "step": 17315 + }, + { + "epoch": 1.1767903247723877, + "grad_norm": 6.139138698577881, + "learning_rate": 8.529606604158174e-06, + "loss": 3.3108, + "step": 17320 + }, + { + "epoch": 1.1771300448430493, + "grad_norm": 6.439684867858887, + "learning_rate": 8.529181954069847e-06, + "loss": 3.1135, + "step": 17325 + }, + { + "epoch": 1.1774697649137111, + "grad_norm": 6.1916728019714355, + "learning_rate": 8.52875730398152e-06, + "loss": 3.4854, + "step": 17330 + }, + { + "epoch": 1.177809484984373, + "grad_norm": 4.619826316833496, + "learning_rate": 8.528332653893193e-06, + "loss": 3.2904, + "step": 17335 + }, + { + "epoch": 1.1781492050550346, + "grad_norm": 6.827152729034424, + "learning_rate": 8.527908003804866e-06, + "loss": 3.25, + "step": 17340 + }, + { + "epoch": 1.1784889251256965, + "grad_norm": 6.66576623916626, + "learning_rate": 8.527483353716538e-06, + "loss": 3.2011, + "step": 17345 + }, + { + "epoch": 1.178828645196358, + "grad_norm": 5.709493637084961, + "learning_rate": 8.527058703628211e-06, + "loss": 3.1582, + "step": 17350 + }, + { + "epoch": 1.17916836526702, + "grad_norm": 6.463400363922119, + "learning_rate": 8.526634053539884e-06, + "loss": 3.0431, + "step": 17355 + }, + { + "epoch": 1.1795080853376818, + "grad_norm": 7.165821552276611, + "learning_rate": 8.526209403451557e-06, + "loss": 3.3142, + "step": 17360 + }, + { + "epoch": 1.1798478054083434, + "grad_norm": 7.183704853057861, + "learning_rate": 8.52578475336323e-06, + "loss": 2.924, + "step": 17365 + }, + { + "epoch": 1.1801875254790053, + "grad_norm": 6.512821197509766, + "learning_rate": 8.525360103274902e-06, + "loss": 3.1082, + "step": 17370 + }, + { + "epoch": 1.1805272455496671, + "grad_norm": 6.2424540519714355, + "learning_rate": 8.524935453186575e-06, + "loss": 3.0677, + "step": 17375 + }, + { + "epoch": 1.1808669656203288, + "grad_norm": 6.800835132598877, + "learning_rate": 8.524510803098248e-06, + "loss": 3.2038, + "step": 17380 + }, + { + "epoch": 1.1812066856909906, + "grad_norm": 6.809747219085693, + "learning_rate": 8.52408615300992e-06, + "loss": 2.9966, + "step": 17385 + }, + { + "epoch": 1.1815464057616525, + "grad_norm": 8.102014541625977, + "learning_rate": 8.523661502921594e-06, + "loss": 3.2058, + "step": 17390 + }, + { + "epoch": 1.1818861258323141, + "grad_norm": 8.593916893005371, + "learning_rate": 8.523236852833266e-06, + "loss": 3.0768, + "step": 17395 + }, + { + "epoch": 1.182225845902976, + "grad_norm": 6.607616901397705, + "learning_rate": 8.52281220274494e-06, + "loss": 3.0383, + "step": 17400 + }, + { + "epoch": 1.1825655659736376, + "grad_norm": 7.890481472015381, + "learning_rate": 8.522387552656612e-06, + "loss": 3.1473, + "step": 17405 + }, + { + "epoch": 1.1829052860442995, + "grad_norm": 6.104465007781982, + "learning_rate": 8.521962902568285e-06, + "loss": 3.3494, + "step": 17410 + }, + { + "epoch": 1.1832450061149613, + "grad_norm": 6.472104549407959, + "learning_rate": 8.521538252479958e-06, + "loss": 3.087, + "step": 17415 + }, + { + "epoch": 1.183584726185623, + "grad_norm": 6.760950565338135, + "learning_rate": 8.52111360239163e-06, + "loss": 3.0477, + "step": 17420 + }, + { + "epoch": 1.1839244462562848, + "grad_norm": 6.5563201904296875, + "learning_rate": 8.520688952303303e-06, + "loss": 3.1291, + "step": 17425 + }, + { + "epoch": 1.1842641663269466, + "grad_norm": 6.928585529327393, + "learning_rate": 8.520264302214976e-06, + "loss": 3.2819, + "step": 17430 + }, + { + "epoch": 1.1846038863976083, + "grad_norm": 5.3912787437438965, + "learning_rate": 8.519839652126649e-06, + "loss": 3.4335, + "step": 17435 + }, + { + "epoch": 1.1849436064682701, + "grad_norm": 7.412026882171631, + "learning_rate": 8.519415002038322e-06, + "loss": 3.096, + "step": 17440 + }, + { + "epoch": 1.185283326538932, + "grad_norm": 7.1557841300964355, + "learning_rate": 8.518990351949994e-06, + "loss": 3.0036, + "step": 17445 + }, + { + "epoch": 1.1856230466095936, + "grad_norm": 6.894718647003174, + "learning_rate": 8.518565701861665e-06, + "loss": 3.0501, + "step": 17450 + }, + { + "epoch": 1.1859627666802555, + "grad_norm": 7.439157962799072, + "learning_rate": 8.51814105177334e-06, + "loss": 3.3708, + "step": 17455 + }, + { + "epoch": 1.1863024867509173, + "grad_norm": 5.156220436096191, + "learning_rate": 8.517716401685013e-06, + "loss": 3.2328, + "step": 17460 + }, + { + "epoch": 1.186642206821579, + "grad_norm": 7.802455425262451, + "learning_rate": 8.517291751596684e-06, + "loss": 3.1395, + "step": 17465 + }, + { + "epoch": 1.1869819268922408, + "grad_norm": 6.2932963371276855, + "learning_rate": 8.516867101508358e-06, + "loss": 3.3458, + "step": 17470 + }, + { + "epoch": 1.1873216469629027, + "grad_norm": 7.877438068389893, + "learning_rate": 8.516442451420031e-06, + "loss": 2.8657, + "step": 17475 + }, + { + "epoch": 1.1876613670335643, + "grad_norm": 7.1557536125183105, + "learning_rate": 8.516017801331702e-06, + "loss": 3.3105, + "step": 17480 + }, + { + "epoch": 1.1880010871042261, + "grad_norm": 7.511354446411133, + "learning_rate": 8.515593151243377e-06, + "loss": 3.2747, + "step": 17485 + }, + { + "epoch": 1.188340807174888, + "grad_norm": 6.775200366973877, + "learning_rate": 8.51516850115505e-06, + "loss": 2.8856, + "step": 17490 + }, + { + "epoch": 1.1886805272455496, + "grad_norm": 6.462643146514893, + "learning_rate": 8.51474385106672e-06, + "loss": 2.9654, + "step": 17495 + }, + { + "epoch": 1.1890202473162115, + "grad_norm": 5.996549129486084, + "learning_rate": 8.514319200978395e-06, + "loss": 3.0845, + "step": 17500 + }, + { + "epoch": 1.1893599673868733, + "grad_norm": 6.388870716094971, + "learning_rate": 8.513894550890068e-06, + "loss": 3.0759, + "step": 17505 + }, + { + "epoch": 1.189699687457535, + "grad_norm": 5.166457176208496, + "learning_rate": 8.513469900801739e-06, + "loss": 3.0574, + "step": 17510 + }, + { + "epoch": 1.1900394075281968, + "grad_norm": 6.201503276824951, + "learning_rate": 8.513045250713414e-06, + "loss": 3.3507, + "step": 17515 + }, + { + "epoch": 1.1903791275988584, + "grad_norm": 6.7793731689453125, + "learning_rate": 8.512620600625086e-06, + "loss": 3.0241, + "step": 17520 + }, + { + "epoch": 1.1907188476695203, + "grad_norm": 7.28551721572876, + "learning_rate": 8.512195950536758e-06, + "loss": 3.117, + "step": 17525 + }, + { + "epoch": 1.1910585677401822, + "grad_norm": 6.684642314910889, + "learning_rate": 8.511771300448432e-06, + "loss": 3.181, + "step": 17530 + }, + { + "epoch": 1.1913982878108438, + "grad_norm": 7.050947189331055, + "learning_rate": 8.511346650360103e-06, + "loss": 3.3902, + "step": 17535 + }, + { + "epoch": 1.1917380078815056, + "grad_norm": 8.251884460449219, + "learning_rate": 8.510922000271776e-06, + "loss": 3.1533, + "step": 17540 + }, + { + "epoch": 1.1920777279521675, + "grad_norm": 7.742501735687256, + "learning_rate": 8.51049735018345e-06, + "loss": 3.2459, + "step": 17545 + }, + { + "epoch": 1.1924174480228291, + "grad_norm": 6.5316314697265625, + "learning_rate": 8.510072700095122e-06, + "loss": 3.225, + "step": 17550 + }, + { + "epoch": 1.192757168093491, + "grad_norm": 8.519659042358398, + "learning_rate": 8.509648050006794e-06, + "loss": 3.2068, + "step": 17555 + }, + { + "epoch": 1.1930968881641528, + "grad_norm": 6.892725944519043, + "learning_rate": 8.509223399918469e-06, + "loss": 3.1993, + "step": 17560 + }, + { + "epoch": 1.1934366082348145, + "grad_norm": 6.644540309906006, + "learning_rate": 8.50879874983014e-06, + "loss": 2.8595, + "step": 17565 + }, + { + "epoch": 1.1937763283054763, + "grad_norm": 6.541611194610596, + "learning_rate": 8.508374099741813e-06, + "loss": 3.2435, + "step": 17570 + }, + { + "epoch": 1.194116048376138, + "grad_norm": 8.780521392822266, + "learning_rate": 8.507949449653487e-06, + "loss": 2.94, + "step": 17575 + }, + { + "epoch": 1.1944557684467998, + "grad_norm": 6.898624420166016, + "learning_rate": 8.507524799565158e-06, + "loss": 3.2229, + "step": 17580 + }, + { + "epoch": 1.1947954885174616, + "grad_norm": 7.644357204437256, + "learning_rate": 8.507100149476831e-06, + "loss": 3.3128, + "step": 17585 + }, + { + "epoch": 1.1951352085881233, + "grad_norm": 6.956271648406982, + "learning_rate": 8.506675499388506e-06, + "loss": 3.3176, + "step": 17590 + }, + { + "epoch": 1.1954749286587851, + "grad_norm": 6.497437000274658, + "learning_rate": 8.506250849300177e-06, + "loss": 3.3648, + "step": 17595 + }, + { + "epoch": 1.195814648729447, + "grad_norm": 6.148984909057617, + "learning_rate": 8.50582619921185e-06, + "loss": 3.151, + "step": 17600 + }, + { + "epoch": 1.1961543688001086, + "grad_norm": 7.020702838897705, + "learning_rate": 8.505401549123522e-06, + "loss": 3.0009, + "step": 17605 + }, + { + "epoch": 1.1964940888707705, + "grad_norm": 6.719621181488037, + "learning_rate": 8.504976899035195e-06, + "loss": 2.7564, + "step": 17610 + }, + { + "epoch": 1.1968338089414323, + "grad_norm": 7.8429741859436035, + "learning_rate": 8.504552248946868e-06, + "loss": 3.3029, + "step": 17615 + }, + { + "epoch": 1.197173529012094, + "grad_norm": 6.8234782218933105, + "learning_rate": 8.50412759885854e-06, + "loss": 3.1766, + "step": 17620 + }, + { + "epoch": 1.1975132490827558, + "grad_norm": 6.828193664550781, + "learning_rate": 8.503702948770214e-06, + "loss": 2.9056, + "step": 17625 + }, + { + "epoch": 1.1978529691534177, + "grad_norm": 5.637232780456543, + "learning_rate": 8.503278298681888e-06, + "loss": 3.1454, + "step": 17630 + }, + { + "epoch": 1.1981926892240793, + "grad_norm": 5.891693592071533, + "learning_rate": 8.502853648593559e-06, + "loss": 3.0498, + "step": 17635 + }, + { + "epoch": 1.1985324092947411, + "grad_norm": 7.58267879486084, + "learning_rate": 8.502428998505232e-06, + "loss": 3.3852, + "step": 17640 + }, + { + "epoch": 1.198872129365403, + "grad_norm": 6.677652835845947, + "learning_rate": 8.502004348416906e-06, + "loss": 3.0659, + "step": 17645 + }, + { + "epoch": 1.1992118494360646, + "grad_norm": 7.6666131019592285, + "learning_rate": 8.501579698328578e-06, + "loss": 3.3065, + "step": 17650 + }, + { + "epoch": 1.1995515695067265, + "grad_norm": 7.379983901977539, + "learning_rate": 8.50115504824025e-06, + "loss": 3.2931, + "step": 17655 + }, + { + "epoch": 1.1998912895773883, + "grad_norm": 6.702931880950928, + "learning_rate": 8.500730398151925e-06, + "loss": 3.2815, + "step": 17660 + }, + { + "epoch": 1.20023100964805, + "grad_norm": 7.152735233306885, + "learning_rate": 8.500305748063596e-06, + "loss": 3.2045, + "step": 17665 + }, + { + "epoch": 1.2005707297187118, + "grad_norm": 5.963703155517578, + "learning_rate": 8.499881097975269e-06, + "loss": 3.1111, + "step": 17670 + }, + { + "epoch": 1.2009104497893737, + "grad_norm": 5.769615173339844, + "learning_rate": 8.499456447886942e-06, + "loss": 3.1262, + "step": 17675 + }, + { + "epoch": 1.2012501698600353, + "grad_norm": 5.604719638824463, + "learning_rate": 8.499031797798614e-06, + "loss": 2.9242, + "step": 17680 + }, + { + "epoch": 1.2015898899306972, + "grad_norm": 7.812938690185547, + "learning_rate": 8.498607147710287e-06, + "loss": 3.2698, + "step": 17685 + }, + { + "epoch": 1.2019296100013588, + "grad_norm": 5.410299301147461, + "learning_rate": 8.49818249762196e-06, + "loss": 3.371, + "step": 17690 + }, + { + "epoch": 1.2022693300720206, + "grad_norm": 6.369499206542969, + "learning_rate": 8.497757847533633e-06, + "loss": 2.8412, + "step": 17695 + }, + { + "epoch": 1.2026090501426825, + "grad_norm": 5.855342388153076, + "learning_rate": 8.497333197445306e-06, + "loss": 3.0188, + "step": 17700 + }, + { + "epoch": 1.2029487702133441, + "grad_norm": 6.682077884674072, + "learning_rate": 8.496908547356978e-06, + "loss": 3.0316, + "step": 17705 + }, + { + "epoch": 1.203288490284006, + "grad_norm": 7.229621410369873, + "learning_rate": 8.496483897268651e-06, + "loss": 3.482, + "step": 17710 + }, + { + "epoch": 1.2036282103546678, + "grad_norm": 7.009416103363037, + "learning_rate": 8.496059247180324e-06, + "loss": 2.9211, + "step": 17715 + }, + { + "epoch": 1.2039679304253295, + "grad_norm": 5.452607154846191, + "learning_rate": 8.495634597091997e-06, + "loss": 3.2586, + "step": 17720 + }, + { + "epoch": 1.2043076504959913, + "grad_norm": 6.428520679473877, + "learning_rate": 8.49520994700367e-06, + "loss": 3.3126, + "step": 17725 + }, + { + "epoch": 1.2046473705666532, + "grad_norm": 8.196589469909668, + "learning_rate": 8.494785296915342e-06, + "loss": 3.1806, + "step": 17730 + }, + { + "epoch": 1.2049870906373148, + "grad_norm": 7.410258769989014, + "learning_rate": 8.494360646827015e-06, + "loss": 3.1382, + "step": 17735 + }, + { + "epoch": 1.2053268107079766, + "grad_norm": 5.899505138397217, + "learning_rate": 8.493935996738688e-06, + "loss": 3.049, + "step": 17740 + }, + { + "epoch": 1.2056665307786383, + "grad_norm": 5.818045139312744, + "learning_rate": 8.49351134665036e-06, + "loss": 3.1198, + "step": 17745 + }, + { + "epoch": 1.2060062508493001, + "grad_norm": 5.958587646484375, + "learning_rate": 8.493086696562034e-06, + "loss": 3.2741, + "step": 17750 + }, + { + "epoch": 1.206345970919962, + "grad_norm": 8.335936546325684, + "learning_rate": 8.492662046473706e-06, + "loss": 3.0128, + "step": 17755 + }, + { + "epoch": 1.2066856909906236, + "grad_norm": 5.916487216949463, + "learning_rate": 8.492237396385379e-06, + "loss": 3.2089, + "step": 17760 + }, + { + "epoch": 1.2070254110612855, + "grad_norm": 6.021685600280762, + "learning_rate": 8.491812746297052e-06, + "loss": 3.2849, + "step": 17765 + }, + { + "epoch": 1.2073651311319473, + "grad_norm": 7.057094573974609, + "learning_rate": 8.491388096208725e-06, + "loss": 3.0258, + "step": 17770 + }, + { + "epoch": 1.207704851202609, + "grad_norm": 6.801000118255615, + "learning_rate": 8.490963446120398e-06, + "loss": 3.3372, + "step": 17775 + }, + { + "epoch": 1.2080445712732708, + "grad_norm": 7.368900775909424, + "learning_rate": 8.49053879603207e-06, + "loss": 3.3147, + "step": 17780 + }, + { + "epoch": 1.2083842913439327, + "grad_norm": 7.184224605560303, + "learning_rate": 8.490114145943743e-06, + "loss": 3.0219, + "step": 17785 + }, + { + "epoch": 1.2087240114145943, + "grad_norm": 7.230550289154053, + "learning_rate": 8.489689495855416e-06, + "loss": 3.4849, + "step": 17790 + }, + { + "epoch": 1.2090637314852561, + "grad_norm": 6.297170162200928, + "learning_rate": 8.489264845767089e-06, + "loss": 3.2332, + "step": 17795 + }, + { + "epoch": 1.209403451555918, + "grad_norm": 7.038989067077637, + "learning_rate": 8.488840195678762e-06, + "loss": 3.3207, + "step": 17800 + }, + { + "epoch": 1.2097431716265796, + "grad_norm": 7.682188510894775, + "learning_rate": 8.488415545590434e-06, + "loss": 3.0286, + "step": 17805 + }, + { + "epoch": 1.2100828916972415, + "grad_norm": 8.825727462768555, + "learning_rate": 8.487990895502107e-06, + "loss": 3.1753, + "step": 17810 + }, + { + "epoch": 1.2104226117679033, + "grad_norm": 7.47607421875, + "learning_rate": 8.48756624541378e-06, + "loss": 3.2586, + "step": 17815 + }, + { + "epoch": 1.210762331838565, + "grad_norm": 6.096461772918701, + "learning_rate": 8.487141595325453e-06, + "loss": 3.1731, + "step": 17820 + }, + { + "epoch": 1.2111020519092268, + "grad_norm": 10.352439880371094, + "learning_rate": 8.486716945237126e-06, + "loss": 3.307, + "step": 17825 + }, + { + "epoch": 1.2114417719798887, + "grad_norm": 5.818789005279541, + "learning_rate": 8.486292295148798e-06, + "loss": 3.3579, + "step": 17830 + }, + { + "epoch": 1.2117814920505503, + "grad_norm": 7.308349132537842, + "learning_rate": 8.485867645060471e-06, + "loss": 3.242, + "step": 17835 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 7.655679225921631, + "learning_rate": 8.485442994972144e-06, + "loss": 3.4125, + "step": 17840 + }, + { + "epoch": 1.212460932191874, + "grad_norm": 6.67854642868042, + "learning_rate": 8.485018344883817e-06, + "loss": 3.3158, + "step": 17845 + }, + { + "epoch": 1.2128006522625356, + "grad_norm": 7.049806594848633, + "learning_rate": 8.48459369479549e-06, + "loss": 3.1764, + "step": 17850 + }, + { + "epoch": 1.2131403723331975, + "grad_norm": 7.455491542816162, + "learning_rate": 8.484169044707162e-06, + "loss": 3.1834, + "step": 17855 + }, + { + "epoch": 1.2134800924038591, + "grad_norm": 5.443549633026123, + "learning_rate": 8.483744394618835e-06, + "loss": 3.0346, + "step": 17860 + }, + { + "epoch": 1.213819812474521, + "grad_norm": 6.0855231285095215, + "learning_rate": 8.483319744530506e-06, + "loss": 3.3604, + "step": 17865 + }, + { + "epoch": 1.2141595325451828, + "grad_norm": 6.830227851867676, + "learning_rate": 8.48289509444218e-06, + "loss": 3.0544, + "step": 17870 + }, + { + "epoch": 1.2144992526158445, + "grad_norm": 6.309613227844238, + "learning_rate": 8.482470444353854e-06, + "loss": 3.3392, + "step": 17875 + }, + { + "epoch": 1.2148389726865063, + "grad_norm": 6.147001266479492, + "learning_rate": 8.482045794265525e-06, + "loss": 3.2991, + "step": 17880 + }, + { + "epoch": 1.2151786927571682, + "grad_norm": 6.9174933433532715, + "learning_rate": 8.4816211441772e-06, + "loss": 3.0253, + "step": 17885 + }, + { + "epoch": 1.2155184128278298, + "grad_norm": 5.180703639984131, + "learning_rate": 8.481196494088872e-06, + "loss": 3.4079, + "step": 17890 + }, + { + "epoch": 1.2158581328984917, + "grad_norm": 5.827797889709473, + "learning_rate": 8.480771844000543e-06, + "loss": 3.3608, + "step": 17895 + }, + { + "epoch": 1.2161978529691535, + "grad_norm": 5.933509349822998, + "learning_rate": 8.480347193912218e-06, + "loss": 3.2925, + "step": 17900 + }, + { + "epoch": 1.2165375730398151, + "grad_norm": 7.657311916351318, + "learning_rate": 8.47992254382389e-06, + "loss": 3.1725, + "step": 17905 + }, + { + "epoch": 1.216877293110477, + "grad_norm": 7.138423442840576, + "learning_rate": 8.479497893735561e-06, + "loss": 3.1124, + "step": 17910 + }, + { + "epoch": 1.2172170131811386, + "grad_norm": 8.807454109191895, + "learning_rate": 8.479073243647236e-06, + "loss": 3.2244, + "step": 17915 + }, + { + "epoch": 1.2175567332518005, + "grad_norm": 6.5886735916137695, + "learning_rate": 8.478648593558909e-06, + "loss": 3.1931, + "step": 17920 + }, + { + "epoch": 1.2178964533224623, + "grad_norm": 7.300242900848389, + "learning_rate": 8.47822394347058e-06, + "loss": 3.2616, + "step": 17925 + }, + { + "epoch": 1.218236173393124, + "grad_norm": 7.1689887046813965, + "learning_rate": 8.477799293382254e-06, + "loss": 2.959, + "step": 17930 + }, + { + "epoch": 1.2185758934637858, + "grad_norm": 6.564547061920166, + "learning_rate": 8.477374643293927e-06, + "loss": 3.3427, + "step": 17935 + }, + { + "epoch": 1.2189156135344477, + "grad_norm": 6.540771961212158, + "learning_rate": 8.476949993205598e-06, + "loss": 3.0254, + "step": 17940 + }, + { + "epoch": 1.2192553336051093, + "grad_norm": 5.740583419799805, + "learning_rate": 8.476525343117273e-06, + "loss": 3.5086, + "step": 17945 + }, + { + "epoch": 1.2195950536757711, + "grad_norm": 6.649125576019287, + "learning_rate": 8.476100693028944e-06, + "loss": 3.1622, + "step": 17950 + }, + { + "epoch": 1.219934773746433, + "grad_norm": 7.591765880584717, + "learning_rate": 8.475676042940617e-06, + "loss": 3.0813, + "step": 17955 + }, + { + "epoch": 1.2202744938170946, + "grad_norm": 7.142739295959473, + "learning_rate": 8.475251392852291e-06, + "loss": 3.2266, + "step": 17960 + }, + { + "epoch": 1.2206142138877565, + "grad_norm": 5.810253620147705, + "learning_rate": 8.474826742763962e-06, + "loss": 3.1493, + "step": 17965 + }, + { + "epoch": 1.2209539339584183, + "grad_norm": 5.581496238708496, + "learning_rate": 8.474402092675637e-06, + "loss": 3.1929, + "step": 17970 + }, + { + "epoch": 1.22129365402908, + "grad_norm": 7.379402160644531, + "learning_rate": 8.47397744258731e-06, + "loss": 3.2532, + "step": 17975 + }, + { + "epoch": 1.2216333740997418, + "grad_norm": 5.322261810302734, + "learning_rate": 8.47355279249898e-06, + "loss": 3.22, + "step": 17980 + }, + { + "epoch": 1.2219730941704037, + "grad_norm": 6.82430362701416, + "learning_rate": 8.473128142410655e-06, + "loss": 3.0022, + "step": 17985 + }, + { + "epoch": 1.2223128142410653, + "grad_norm": 8.016586303710938, + "learning_rate": 8.472703492322328e-06, + "loss": 3.3483, + "step": 17990 + }, + { + "epoch": 1.2226525343117272, + "grad_norm": 6.465390682220459, + "learning_rate": 8.472278842233999e-06, + "loss": 3.3084, + "step": 17995 + }, + { + "epoch": 1.222992254382389, + "grad_norm": 10.594207763671875, + "learning_rate": 8.471854192145674e-06, + "loss": 3.2958, + "step": 18000 + }, + { + "epoch": 1.2233319744530506, + "grad_norm": 6.933552265167236, + "learning_rate": 8.471429542057346e-06, + "loss": 3.1095, + "step": 18005 + }, + { + "epoch": 1.2236716945237125, + "grad_norm": 7.620056629180908, + "learning_rate": 8.471004891969017e-06, + "loss": 3.2212, + "step": 18010 + }, + { + "epoch": 1.2240114145943743, + "grad_norm": 5.444890022277832, + "learning_rate": 8.470580241880692e-06, + "loss": 3.0585, + "step": 18015 + }, + { + "epoch": 1.224351134665036, + "grad_norm": 6.175193786621094, + "learning_rate": 8.470155591792363e-06, + "loss": 3.4302, + "step": 18020 + }, + { + "epoch": 1.2246908547356978, + "grad_norm": 6.1694254875183105, + "learning_rate": 8.469730941704036e-06, + "loss": 3.0256, + "step": 18025 + }, + { + "epoch": 1.2250305748063597, + "grad_norm": 6.466031074523926, + "learning_rate": 8.46930629161571e-06, + "loss": 2.9155, + "step": 18030 + }, + { + "epoch": 1.2253702948770213, + "grad_norm": 6.446869373321533, + "learning_rate": 8.468881641527381e-06, + "loss": 3.5652, + "step": 18035 + }, + { + "epoch": 1.2257100149476832, + "grad_norm": 8.170638084411621, + "learning_rate": 8.468456991439054e-06, + "loss": 3.3529, + "step": 18040 + }, + { + "epoch": 1.2260497350183448, + "grad_norm": 6.50553560256958, + "learning_rate": 8.468032341350729e-06, + "loss": 3.3433, + "step": 18045 + }, + { + "epoch": 1.2263894550890067, + "grad_norm": 6.336504936218262, + "learning_rate": 8.4676076912624e-06, + "loss": 3.0491, + "step": 18050 + }, + { + "epoch": 1.2267291751596685, + "grad_norm": 7.2816290855407715, + "learning_rate": 8.467183041174073e-06, + "loss": 3.2396, + "step": 18055 + }, + { + "epoch": 1.2270688952303301, + "grad_norm": 8.416263580322266, + "learning_rate": 8.466758391085747e-06, + "loss": 2.995, + "step": 18060 + }, + { + "epoch": 1.227408615300992, + "grad_norm": 6.695535182952881, + "learning_rate": 8.466333740997418e-06, + "loss": 3.1629, + "step": 18065 + }, + { + "epoch": 1.2277483353716538, + "grad_norm": 5.314905643463135, + "learning_rate": 8.465909090909091e-06, + "loss": 3.2349, + "step": 18070 + }, + { + "epoch": 1.2280880554423155, + "grad_norm": 5.613912582397461, + "learning_rate": 8.465484440820766e-06, + "loss": 3.2567, + "step": 18075 + }, + { + "epoch": 1.2284277755129773, + "grad_norm": 7.647927761077881, + "learning_rate": 8.465059790732437e-06, + "loss": 3.109, + "step": 18080 + }, + { + "epoch": 1.228767495583639, + "grad_norm": 6.863351345062256, + "learning_rate": 8.46463514064411e-06, + "loss": 3.1811, + "step": 18085 + }, + { + "epoch": 1.2291072156543008, + "grad_norm": 7.432269096374512, + "learning_rate": 8.464210490555784e-06, + "loss": 3.1854, + "step": 18090 + }, + { + "epoch": 1.2294469357249627, + "grad_norm": 8.482829093933105, + "learning_rate": 8.463785840467455e-06, + "loss": 3.1464, + "step": 18095 + }, + { + "epoch": 1.2297866557956243, + "grad_norm": 5.548416614532471, + "learning_rate": 8.463361190379128e-06, + "loss": 3.3742, + "step": 18100 + }, + { + "epoch": 1.2301263758662861, + "grad_norm": 6.146251201629639, + "learning_rate": 8.4629365402908e-06, + "loss": 3.2663, + "step": 18105 + }, + { + "epoch": 1.230466095936948, + "grad_norm": 6.317619323730469, + "learning_rate": 8.462511890202473e-06, + "loss": 3.1799, + "step": 18110 + }, + { + "epoch": 1.2308058160076096, + "grad_norm": 6.03815221786499, + "learning_rate": 8.462087240114146e-06, + "loss": 3.2175, + "step": 18115 + }, + { + "epoch": 1.2311455360782715, + "grad_norm": 7.078567028045654, + "learning_rate": 8.461662590025819e-06, + "loss": 3.429, + "step": 18120 + }, + { + "epoch": 1.2314852561489333, + "grad_norm": 6.725367069244385, + "learning_rate": 8.461237939937492e-06, + "loss": 3.0458, + "step": 18125 + }, + { + "epoch": 1.231824976219595, + "grad_norm": 5.981473445892334, + "learning_rate": 8.460813289849165e-06, + "loss": 3.1425, + "step": 18130 + }, + { + "epoch": 1.2321646962902568, + "grad_norm": 6.330872535705566, + "learning_rate": 8.460388639760838e-06, + "loss": 3.2068, + "step": 18135 + }, + { + "epoch": 1.2325044163609187, + "grad_norm": 6.16633415222168, + "learning_rate": 8.45996398967251e-06, + "loss": 3.1559, + "step": 18140 + }, + { + "epoch": 1.2328441364315803, + "grad_norm": 9.01767635345459, + "learning_rate": 8.459539339584183e-06, + "loss": 3.1552, + "step": 18145 + }, + { + "epoch": 1.2331838565022422, + "grad_norm": 6.676793098449707, + "learning_rate": 8.459114689495856e-06, + "loss": 3.2004, + "step": 18150 + }, + { + "epoch": 1.233523576572904, + "grad_norm": 5.354626178741455, + "learning_rate": 8.458690039407529e-06, + "loss": 3.0226, + "step": 18155 + }, + { + "epoch": 1.2338632966435656, + "grad_norm": 6.880114555358887, + "learning_rate": 8.458265389319202e-06, + "loss": 3.1697, + "step": 18160 + }, + { + "epoch": 1.2342030167142275, + "grad_norm": 8.317505836486816, + "learning_rate": 8.457840739230874e-06, + "loss": 3.0176, + "step": 18165 + }, + { + "epoch": 1.2345427367848893, + "grad_norm": 7.971860408782959, + "learning_rate": 8.457416089142547e-06, + "loss": 3.1477, + "step": 18170 + }, + { + "epoch": 1.234882456855551, + "grad_norm": 7.8256683349609375, + "learning_rate": 8.45699143905422e-06, + "loss": 3.1232, + "step": 18175 + }, + { + "epoch": 1.2352221769262128, + "grad_norm": 7.867437839508057, + "learning_rate": 8.456566788965893e-06, + "loss": 3.1554, + "step": 18180 + }, + { + "epoch": 1.2355618969968747, + "grad_norm": 6.801882743835449, + "learning_rate": 8.456142138877566e-06, + "loss": 3.1788, + "step": 18185 + }, + { + "epoch": 1.2359016170675363, + "grad_norm": 6.200046062469482, + "learning_rate": 8.455717488789238e-06, + "loss": 3.1726, + "step": 18190 + }, + { + "epoch": 1.2362413371381982, + "grad_norm": 6.452198505401611, + "learning_rate": 8.455292838700911e-06, + "loss": 3.1026, + "step": 18195 + }, + { + "epoch": 1.23658105720886, + "grad_norm": 6.717223644256592, + "learning_rate": 8.454868188612584e-06, + "loss": 3.1225, + "step": 18200 + }, + { + "epoch": 1.2369207772795217, + "grad_norm": 6.3696465492248535, + "learning_rate": 8.454443538524257e-06, + "loss": 3.0238, + "step": 18205 + }, + { + "epoch": 1.2372604973501835, + "grad_norm": 7.909265041351318, + "learning_rate": 8.45401888843593e-06, + "loss": 3.168, + "step": 18210 + }, + { + "epoch": 1.2376002174208451, + "grad_norm": 5.9409074783325195, + "learning_rate": 8.453594238347602e-06, + "loss": 3.1765, + "step": 18215 + }, + { + "epoch": 1.237939937491507, + "grad_norm": 6.709949016571045, + "learning_rate": 8.453169588259275e-06, + "loss": 2.99, + "step": 18220 + }, + { + "epoch": 1.2382796575621688, + "grad_norm": 7.587391376495361, + "learning_rate": 8.452744938170948e-06, + "loss": 3.4616, + "step": 18225 + }, + { + "epoch": 1.2386193776328305, + "grad_norm": 7.824438095092773, + "learning_rate": 8.45232028808262e-06, + "loss": 3.2107, + "step": 18230 + }, + { + "epoch": 1.2389590977034923, + "grad_norm": 4.482739448547363, + "learning_rate": 8.451895637994294e-06, + "loss": 3.2262, + "step": 18235 + }, + { + "epoch": 1.2392988177741542, + "grad_norm": 5.043693542480469, + "learning_rate": 8.451470987905966e-06, + "loss": 2.9922, + "step": 18240 + }, + { + "epoch": 1.2396385378448158, + "grad_norm": 8.432146072387695, + "learning_rate": 8.451046337817639e-06, + "loss": 3.1753, + "step": 18245 + }, + { + "epoch": 1.2399782579154777, + "grad_norm": 6.132830619812012, + "learning_rate": 8.450621687729312e-06, + "loss": 3.2033, + "step": 18250 + }, + { + "epoch": 1.2403179779861393, + "grad_norm": 6.828149795532227, + "learning_rate": 8.450197037640985e-06, + "loss": 3.1956, + "step": 18255 + }, + { + "epoch": 1.2406576980568012, + "grad_norm": 7.821610450744629, + "learning_rate": 8.449772387552658e-06, + "loss": 3.0273, + "step": 18260 + }, + { + "epoch": 1.240997418127463, + "grad_norm": 6.916013240814209, + "learning_rate": 8.44934773746433e-06, + "loss": 3.4083, + "step": 18265 + }, + { + "epoch": 1.2413371381981246, + "grad_norm": 7.934512615203857, + "learning_rate": 8.448923087376003e-06, + "loss": 2.8822, + "step": 18270 + }, + { + "epoch": 1.2416768582687865, + "grad_norm": 7.286118507385254, + "learning_rate": 8.448498437287676e-06, + "loss": 3.0402, + "step": 18275 + }, + { + "epoch": 1.2420165783394483, + "grad_norm": 6.324624061584473, + "learning_rate": 8.448073787199349e-06, + "loss": 3.2281, + "step": 18280 + }, + { + "epoch": 1.24235629841011, + "grad_norm": 6.653368949890137, + "learning_rate": 8.447649137111022e-06, + "loss": 3.4939, + "step": 18285 + }, + { + "epoch": 1.2426960184807718, + "grad_norm": 7.797722339630127, + "learning_rate": 8.447224487022694e-06, + "loss": 3.17, + "step": 18290 + }, + { + "epoch": 1.2430357385514337, + "grad_norm": 6.408388137817383, + "learning_rate": 8.446799836934365e-06, + "loss": 3.085, + "step": 18295 + }, + { + "epoch": 1.2433754586220953, + "grad_norm": 8.972452163696289, + "learning_rate": 8.44637518684604e-06, + "loss": 3.3668, + "step": 18300 + }, + { + "epoch": 1.2437151786927572, + "grad_norm": 5.579138278961182, + "learning_rate": 8.445950536757713e-06, + "loss": 3.2423, + "step": 18305 + }, + { + "epoch": 1.244054898763419, + "grad_norm": 6.797225475311279, + "learning_rate": 8.445525886669386e-06, + "loss": 3.3784, + "step": 18310 + }, + { + "epoch": 1.2443946188340806, + "grad_norm": 6.87531852722168, + "learning_rate": 8.445101236581058e-06, + "loss": 3.2894, + "step": 18315 + }, + { + "epoch": 1.2447343389047425, + "grad_norm": 8.145861625671387, + "learning_rate": 8.444676586492731e-06, + "loss": 3.2684, + "step": 18320 + }, + { + "epoch": 1.2450740589754044, + "grad_norm": 6.677893161773682, + "learning_rate": 8.444251936404404e-06, + "loss": 2.9994, + "step": 18325 + }, + { + "epoch": 1.245413779046066, + "grad_norm": 7.29204797744751, + "learning_rate": 8.443827286316077e-06, + "loss": 3.0449, + "step": 18330 + }, + { + "epoch": 1.2457534991167278, + "grad_norm": 7.461769104003906, + "learning_rate": 8.44340263622775e-06, + "loss": 3.1247, + "step": 18335 + }, + { + "epoch": 1.2460932191873897, + "grad_norm": 6.788158416748047, + "learning_rate": 8.442977986139422e-06, + "loss": 3.1124, + "step": 18340 + }, + { + "epoch": 1.2464329392580513, + "grad_norm": 6.64309549331665, + "learning_rate": 8.442553336051095e-06, + "loss": 3.2149, + "step": 18345 + }, + { + "epoch": 1.2467726593287132, + "grad_norm": 6.493622303009033, + "learning_rate": 8.442128685962768e-06, + "loss": 3.0065, + "step": 18350 + }, + { + "epoch": 1.247112379399375, + "grad_norm": 5.841716289520264, + "learning_rate": 8.44170403587444e-06, + "loss": 3.4021, + "step": 18355 + }, + { + "epoch": 1.2474520994700367, + "grad_norm": 5.710478782653809, + "learning_rate": 8.441279385786114e-06, + "loss": 3.2625, + "step": 18360 + }, + { + "epoch": 1.2477918195406985, + "grad_norm": 5.274589538574219, + "learning_rate": 8.440854735697785e-06, + "loss": 3.1694, + "step": 18365 + }, + { + "epoch": 1.2481315396113604, + "grad_norm": 7.412038803100586, + "learning_rate": 8.440430085609459e-06, + "loss": 3.0047, + "step": 18370 + }, + { + "epoch": 1.248471259682022, + "grad_norm": 5.923502445220947, + "learning_rate": 8.440005435521132e-06, + "loss": 3.3368, + "step": 18375 + }, + { + "epoch": 1.2488109797526838, + "grad_norm": 6.456714630126953, + "learning_rate": 8.439580785432803e-06, + "loss": 2.9311, + "step": 18380 + }, + { + "epoch": 1.2491506998233455, + "grad_norm": 6.4184441566467285, + "learning_rate": 8.439156135344478e-06, + "loss": 3.4969, + "step": 18385 + }, + { + "epoch": 1.2494904198940073, + "grad_norm": 6.045565605163574, + "learning_rate": 8.43873148525615e-06, + "loss": 3.0489, + "step": 18390 + }, + { + "epoch": 1.2498301399646692, + "grad_norm": 5.497725009918213, + "learning_rate": 8.438306835167821e-06, + "loss": 3.105, + "step": 18395 + }, + { + "epoch": 1.2501698600353308, + "grad_norm": 5.6546149253845215, + "learning_rate": 8.437882185079496e-06, + "loss": 3.3164, + "step": 18400 + }, + { + "epoch": 1.2505095801059927, + "grad_norm": 6.778075695037842, + "learning_rate": 8.437457534991169e-06, + "loss": 3.3608, + "step": 18405 + }, + { + "epoch": 1.2508493001766543, + "grad_norm": 5.720085144042969, + "learning_rate": 8.43703288490284e-06, + "loss": 3.2297, + "step": 18410 + }, + { + "epoch": 1.2511890202473162, + "grad_norm": 5.6127495765686035, + "learning_rate": 8.436608234814514e-06, + "loss": 2.8875, + "step": 18415 + }, + { + "epoch": 1.251528740317978, + "grad_norm": 7.213126182556152, + "learning_rate": 8.436183584726187e-06, + "loss": 3.0765, + "step": 18420 + }, + { + "epoch": 1.2518684603886396, + "grad_norm": 7.075490474700928, + "learning_rate": 8.435758934637858e-06, + "loss": 3.1825, + "step": 18425 + }, + { + "epoch": 1.2522081804593015, + "grad_norm": 7.095405578613281, + "learning_rate": 8.435334284549533e-06, + "loss": 3.0882, + "step": 18430 + }, + { + "epoch": 1.2525479005299633, + "grad_norm": 6.872604846954346, + "learning_rate": 8.434909634461204e-06, + "loss": 3.3064, + "step": 18435 + }, + { + "epoch": 1.252887620600625, + "grad_norm": 7.074413776397705, + "learning_rate": 8.434484984372877e-06, + "loss": 3.2728, + "step": 18440 + }, + { + "epoch": 1.2532273406712868, + "grad_norm": 7.511848449707031, + "learning_rate": 8.434060334284551e-06, + "loss": 3.158, + "step": 18445 + }, + { + "epoch": 1.2535670607419487, + "grad_norm": 5.97578239440918, + "learning_rate": 8.433635684196222e-06, + "loss": 3.2817, + "step": 18450 + }, + { + "epoch": 1.2539067808126103, + "grad_norm": 6.254410743713379, + "learning_rate": 8.433211034107895e-06, + "loss": 3.0225, + "step": 18455 + }, + { + "epoch": 1.2542465008832722, + "grad_norm": 7.071788787841797, + "learning_rate": 8.43278638401957e-06, + "loss": 3.2812, + "step": 18460 + }, + { + "epoch": 1.254586220953934, + "grad_norm": 8.788405418395996, + "learning_rate": 8.43236173393124e-06, + "loss": 3.1036, + "step": 18465 + }, + { + "epoch": 1.2549259410245956, + "grad_norm": 6.544928073883057, + "learning_rate": 8.431937083842913e-06, + "loss": 2.9072, + "step": 18470 + }, + { + "epoch": 1.2552656610952575, + "grad_norm": 7.079372882843018, + "learning_rate": 8.431512433754588e-06, + "loss": 3.1398, + "step": 18475 + }, + { + "epoch": 1.2556053811659194, + "grad_norm": 6.245556831359863, + "learning_rate": 8.431087783666259e-06, + "loss": 3.0368, + "step": 18480 + }, + { + "epoch": 1.255945101236581, + "grad_norm": 6.5753960609436035, + "learning_rate": 8.430663133577932e-06, + "loss": 2.9838, + "step": 18485 + }, + { + "epoch": 1.2562848213072428, + "grad_norm": 8.075159072875977, + "learning_rate": 8.430238483489606e-06, + "loss": 3.164, + "step": 18490 + }, + { + "epoch": 1.2566245413779047, + "grad_norm": 7.392457485198975, + "learning_rate": 8.429813833401277e-06, + "loss": 3.1998, + "step": 18495 + }, + { + "epoch": 1.2569642614485663, + "grad_norm": 6.586413383483887, + "learning_rate": 8.42938918331295e-06, + "loss": 3.0587, + "step": 18500 + }, + { + "epoch": 1.2573039815192282, + "grad_norm": 6.847851276397705, + "learning_rate": 8.428964533224625e-06, + "loss": 3.1853, + "step": 18505 + }, + { + "epoch": 1.25764370158989, + "grad_norm": 6.663017272949219, + "learning_rate": 8.428539883136296e-06, + "loss": 3.1664, + "step": 18510 + }, + { + "epoch": 1.2579834216605517, + "grad_norm": 6.595666408538818, + "learning_rate": 8.428115233047969e-06, + "loss": 3.4235, + "step": 18515 + }, + { + "epoch": 1.2583231417312135, + "grad_norm": 9.40002727508545, + "learning_rate": 8.427690582959641e-06, + "loss": 3.2855, + "step": 18520 + }, + { + "epoch": 1.2586628618018754, + "grad_norm": 6.734963893890381, + "learning_rate": 8.427265932871314e-06, + "loss": 3.469, + "step": 18525 + }, + { + "epoch": 1.259002581872537, + "grad_norm": 5.9096503257751465, + "learning_rate": 8.426841282782987e-06, + "loss": 3.2144, + "step": 18530 + }, + { + "epoch": 1.2593423019431988, + "grad_norm": 6.523425579071045, + "learning_rate": 8.42641663269466e-06, + "loss": 3.3814, + "step": 18535 + }, + { + "epoch": 1.2596820220138607, + "grad_norm": 6.34055757522583, + "learning_rate": 8.425991982606333e-06, + "loss": 3.0899, + "step": 18540 + }, + { + "epoch": 1.2600217420845223, + "grad_norm": 7.313846111297607, + "learning_rate": 8.425567332518005e-06, + "loss": 3.3954, + "step": 18545 + }, + { + "epoch": 1.2603614621551842, + "grad_norm": 6.1291022300720215, + "learning_rate": 8.425142682429678e-06, + "loss": 3.5074, + "step": 18550 + }, + { + "epoch": 1.260701182225846, + "grad_norm": 7.33896541595459, + "learning_rate": 8.424718032341351e-06, + "loss": 3.1021, + "step": 18555 + }, + { + "epoch": 1.2610409022965077, + "grad_norm": 6.0079803466796875, + "learning_rate": 8.424293382253024e-06, + "loss": 3.5931, + "step": 18560 + }, + { + "epoch": 1.2613806223671695, + "grad_norm": 7.994865894317627, + "learning_rate": 8.423868732164697e-06, + "loss": 3.197, + "step": 18565 + }, + { + "epoch": 1.2617203424378312, + "grad_norm": 6.901488780975342, + "learning_rate": 8.42344408207637e-06, + "loss": 3.3617, + "step": 18570 + }, + { + "epoch": 1.262060062508493, + "grad_norm": 6.726329326629639, + "learning_rate": 8.423019431988042e-06, + "loss": 3.0207, + "step": 18575 + }, + { + "epoch": 1.2623997825791546, + "grad_norm": 7.754878997802734, + "learning_rate": 8.422594781899715e-06, + "loss": 3.0667, + "step": 18580 + }, + { + "epoch": 1.2627395026498165, + "grad_norm": 8.578836441040039, + "learning_rate": 8.422170131811388e-06, + "loss": 3.2965, + "step": 18585 + }, + { + "epoch": 1.2630792227204783, + "grad_norm": 7.312583923339844, + "learning_rate": 8.42174548172306e-06, + "loss": 3.3269, + "step": 18590 + }, + { + "epoch": 1.26341894279114, + "grad_norm": 6.544732093811035, + "learning_rate": 8.421320831634733e-06, + "loss": 3.1983, + "step": 18595 + }, + { + "epoch": 1.2637586628618018, + "grad_norm": 6.823315143585205, + "learning_rate": 8.420896181546406e-06, + "loss": 3.3296, + "step": 18600 + }, + { + "epoch": 1.2640983829324637, + "grad_norm": 7.002361297607422, + "learning_rate": 8.420471531458079e-06, + "loss": 3.0872, + "step": 18605 + }, + { + "epoch": 1.2644381030031253, + "grad_norm": 5.963258266448975, + "learning_rate": 8.420046881369752e-06, + "loss": 2.9, + "step": 18610 + }, + { + "epoch": 1.2647778230737872, + "grad_norm": 6.051836967468262, + "learning_rate": 8.419622231281425e-06, + "loss": 3.3182, + "step": 18615 + }, + { + "epoch": 1.265117543144449, + "grad_norm": 7.321277618408203, + "learning_rate": 8.419197581193097e-06, + "loss": 3.4998, + "step": 18620 + }, + { + "epoch": 1.2654572632151107, + "grad_norm": 7.071592330932617, + "learning_rate": 8.41877293110477e-06, + "loss": 2.9969, + "step": 18625 + }, + { + "epoch": 1.2657969832857725, + "grad_norm": 6.947005748748779, + "learning_rate": 8.418348281016443e-06, + "loss": 3.0064, + "step": 18630 + }, + { + "epoch": 1.2661367033564344, + "grad_norm": 7.108137130737305, + "learning_rate": 8.417923630928116e-06, + "loss": 3.4559, + "step": 18635 + }, + { + "epoch": 1.266476423427096, + "grad_norm": 5.940869331359863, + "learning_rate": 8.417498980839789e-06, + "loss": 2.839, + "step": 18640 + }, + { + "epoch": 1.2668161434977578, + "grad_norm": 5.740029335021973, + "learning_rate": 8.417074330751461e-06, + "loss": 2.9742, + "step": 18645 + }, + { + "epoch": 1.2671558635684197, + "grad_norm": 7.917629718780518, + "learning_rate": 8.416649680663134e-06, + "loss": 3.1817, + "step": 18650 + }, + { + "epoch": 1.2674955836390813, + "grad_norm": 5.353533744812012, + "learning_rate": 8.416225030574807e-06, + "loss": 3.3375, + "step": 18655 + }, + { + "epoch": 1.2678353037097432, + "grad_norm": 7.873083114624023, + "learning_rate": 8.41580038048648e-06, + "loss": 3.3088, + "step": 18660 + }, + { + "epoch": 1.268175023780405, + "grad_norm": 7.119892597198486, + "learning_rate": 8.415375730398153e-06, + "loss": 3.394, + "step": 18665 + }, + { + "epoch": 1.2685147438510667, + "grad_norm": NaN, + "learning_rate": 8.415036010327492e-06, + "loss": 3.0722, + "step": 18670 + }, + { + "epoch": 1.2688544639217285, + "grad_norm": 7.17415714263916, + "learning_rate": 8.414611360239163e-06, + "loss": 3.0478, + "step": 18675 + }, + { + "epoch": 1.2691941839923904, + "grad_norm": 6.3612236976623535, + "learning_rate": 8.414186710150837e-06, + "loss": 3.0907, + "step": 18680 + }, + { + "epoch": 1.269533904063052, + "grad_norm": 6.729541301727295, + "learning_rate": 8.41376206006251e-06, + "loss": 3.3899, + "step": 18685 + }, + { + "epoch": 1.2698736241337139, + "grad_norm": 7.480048179626465, + "learning_rate": 8.413337409974181e-06, + "loss": 3.4284, + "step": 18690 + }, + { + "epoch": 1.2702133442043757, + "grad_norm": 8.478038787841797, + "learning_rate": 8.412912759885856e-06, + "loss": 3.2301, + "step": 18695 + }, + { + "epoch": 1.2705530642750373, + "grad_norm": 7.83911657333374, + "learning_rate": 8.412488109797527e-06, + "loss": 3.42, + "step": 18700 + }, + { + "epoch": 1.2708927843456992, + "grad_norm": 5.740550994873047, + "learning_rate": 8.4120634597092e-06, + "loss": 3.1906, + "step": 18705 + }, + { + "epoch": 1.271232504416361, + "grad_norm": 6.774436950683594, + "learning_rate": 8.411638809620874e-06, + "loss": 3.217, + "step": 18710 + }, + { + "epoch": 1.2715722244870227, + "grad_norm": 8.249251365661621, + "learning_rate": 8.411214159532545e-06, + "loss": 3.1002, + "step": 18715 + }, + { + "epoch": 1.2719119445576845, + "grad_norm": 7.61782169342041, + "learning_rate": 8.410789509444218e-06, + "loss": 2.9961, + "step": 18720 + }, + { + "epoch": 1.2722516646283464, + "grad_norm": 5.6876068115234375, + "learning_rate": 8.410364859355892e-06, + "loss": 3.2244, + "step": 18725 + }, + { + "epoch": 1.272591384699008, + "grad_norm": 6.8059306144714355, + "learning_rate": 8.409940209267564e-06, + "loss": 3.0207, + "step": 18730 + }, + { + "epoch": 1.2729311047696699, + "grad_norm": 6.281402587890625, + "learning_rate": 8.409515559179236e-06, + "loss": 3.1414, + "step": 18735 + }, + { + "epoch": 1.2732708248403315, + "grad_norm": 6.145198822021484, + "learning_rate": 8.40909090909091e-06, + "loss": 3.0109, + "step": 18740 + }, + { + "epoch": 1.2736105449109933, + "grad_norm": 7.394955158233643, + "learning_rate": 8.408666259002582e-06, + "loss": 3.0899, + "step": 18745 + }, + { + "epoch": 1.273950264981655, + "grad_norm": 7.090727806091309, + "learning_rate": 8.408241608914255e-06, + "loss": 3.2771, + "step": 18750 + }, + { + "epoch": 1.2742899850523168, + "grad_norm": 9.662922859191895, + "learning_rate": 8.40781695882593e-06, + "loss": 3.3119, + "step": 18755 + }, + { + "epoch": 1.2746297051229787, + "grad_norm": 5.539370536804199, + "learning_rate": 8.4073923087376e-06, + "loss": 3.2027, + "step": 18760 + }, + { + "epoch": 1.2749694251936403, + "grad_norm": 6.783092975616455, + "learning_rate": 8.406967658649273e-06, + "loss": 3.0086, + "step": 18765 + }, + { + "epoch": 1.2753091452643022, + "grad_norm": 5.988852500915527, + "learning_rate": 8.406543008560946e-06, + "loss": 3.0969, + "step": 18770 + }, + { + "epoch": 1.275648865334964, + "grad_norm": 7.134854316711426, + "learning_rate": 8.406118358472619e-06, + "loss": 3.2157, + "step": 18775 + }, + { + "epoch": 1.2759885854056257, + "grad_norm": 6.439280033111572, + "learning_rate": 8.405693708384292e-06, + "loss": 3.4115, + "step": 18780 + }, + { + "epoch": 1.2763283054762875, + "grad_norm": 8.359282493591309, + "learning_rate": 8.405269058295964e-06, + "loss": 3.3087, + "step": 18785 + }, + { + "epoch": 1.2766680255469494, + "grad_norm": 7.647959232330322, + "learning_rate": 8.404844408207637e-06, + "loss": 3.3575, + "step": 18790 + }, + { + "epoch": 1.277007745617611, + "grad_norm": 6.304222583770752, + "learning_rate": 8.40441975811931e-06, + "loss": 3.2447, + "step": 18795 + }, + { + "epoch": 1.2773474656882728, + "grad_norm": 7.307572841644287, + "learning_rate": 8.403995108030983e-06, + "loss": 3.1065, + "step": 18800 + }, + { + "epoch": 1.2776871857589347, + "grad_norm": 5.608338832855225, + "learning_rate": 8.403570457942656e-06, + "loss": 3.278, + "step": 18805 + }, + { + "epoch": 1.2780269058295963, + "grad_norm": 5.935089588165283, + "learning_rate": 8.403145807854328e-06, + "loss": 3.2416, + "step": 18810 + }, + { + "epoch": 1.2783666259002582, + "grad_norm": 6.934607028961182, + "learning_rate": 8.402721157766001e-06, + "loss": 3.181, + "step": 18815 + }, + { + "epoch": 1.27870634597092, + "grad_norm": 8.402826309204102, + "learning_rate": 8.402296507677674e-06, + "loss": 3.2795, + "step": 18820 + }, + { + "epoch": 1.2790460660415817, + "grad_norm": 5.935617923736572, + "learning_rate": 8.401871857589347e-06, + "loss": 3.2514, + "step": 18825 + }, + { + "epoch": 1.2793857861122435, + "grad_norm": 6.683816432952881, + "learning_rate": 8.40144720750102e-06, + "loss": 3.1618, + "step": 18830 + }, + { + "epoch": 1.2797255061829054, + "grad_norm": 7.837291717529297, + "learning_rate": 8.401022557412692e-06, + "loss": 3.4259, + "step": 18835 + }, + { + "epoch": 1.280065226253567, + "grad_norm": 7.296358585357666, + "learning_rate": 8.400597907324365e-06, + "loss": 3.2712, + "step": 18840 + }, + { + "epoch": 1.2804049463242289, + "grad_norm": 6.442309379577637, + "learning_rate": 8.400173257236038e-06, + "loss": 3.1182, + "step": 18845 + }, + { + "epoch": 1.2807446663948907, + "grad_norm": 7.9870686531066895, + "learning_rate": 8.39974860714771e-06, + "loss": 3.0255, + "step": 18850 + }, + { + "epoch": 1.2810843864655523, + "grad_norm": 6.622733116149902, + "learning_rate": 8.399323957059384e-06, + "loss": 3.224, + "step": 18855 + }, + { + "epoch": 1.2814241065362142, + "grad_norm": 6.649055004119873, + "learning_rate": 8.398899306971056e-06, + "loss": 3.1783, + "step": 18860 + }, + { + "epoch": 1.281763826606876, + "grad_norm": 5.784315586090088, + "learning_rate": 8.398474656882729e-06, + "loss": 3.0412, + "step": 18865 + }, + { + "epoch": 1.2821035466775377, + "grad_norm": 6.257731914520264, + "learning_rate": 8.398050006794402e-06, + "loss": 3.2893, + "step": 18870 + }, + { + "epoch": 1.2824432667481995, + "grad_norm": 6.242255210876465, + "learning_rate": 8.397625356706075e-06, + "loss": 3.1796, + "step": 18875 + }, + { + "epoch": 1.2827829868188614, + "grad_norm": 6.990503311157227, + "learning_rate": 8.397200706617748e-06, + "loss": 3.0051, + "step": 18880 + }, + { + "epoch": 1.283122706889523, + "grad_norm": 6.883703231811523, + "learning_rate": 8.39677605652942e-06, + "loss": 3.1534, + "step": 18885 + }, + { + "epoch": 1.2834624269601849, + "grad_norm": 6.645342826843262, + "learning_rate": 8.396351406441093e-06, + "loss": 2.969, + "step": 18890 + }, + { + "epoch": 1.2838021470308467, + "grad_norm": 6.420872688293457, + "learning_rate": 8.395926756352766e-06, + "loss": 3.1979, + "step": 18895 + }, + { + "epoch": 1.2841418671015083, + "grad_norm": 5.657236576080322, + "learning_rate": 8.395502106264439e-06, + "loss": 3.1648, + "step": 18900 + }, + { + "epoch": 1.2844815871721702, + "grad_norm": 7.49269962310791, + "learning_rate": 8.395077456176112e-06, + "loss": 3.117, + "step": 18905 + }, + { + "epoch": 1.2848213072428318, + "grad_norm": 6.237244606018066, + "learning_rate": 8.394652806087784e-06, + "loss": 3.3097, + "step": 18910 + }, + { + "epoch": 1.2851610273134937, + "grad_norm": 6.801767826080322, + "learning_rate": 8.394228155999457e-06, + "loss": 2.814, + "step": 18915 + }, + { + "epoch": 1.2855007473841553, + "grad_norm": 6.159125328063965, + "learning_rate": 8.39380350591113e-06, + "loss": 3.1592, + "step": 18920 + }, + { + "epoch": 1.2858404674548172, + "grad_norm": 5.6173176765441895, + "learning_rate": 8.393378855822803e-06, + "loss": 3.0641, + "step": 18925 + }, + { + "epoch": 1.286180187525479, + "grad_norm": 6.441351890563965, + "learning_rate": 8.392954205734476e-06, + "loss": 3.1993, + "step": 18930 + }, + { + "epoch": 1.2865199075961407, + "grad_norm": 6.474668979644775, + "learning_rate": 8.392529555646148e-06, + "loss": 3.0473, + "step": 18935 + }, + { + "epoch": 1.2868596276668025, + "grad_norm": 7.992763519287109, + "learning_rate": 8.392104905557821e-06, + "loss": 3.2821, + "step": 18940 + }, + { + "epoch": 1.2871993477374644, + "grad_norm": 7.145872592926025, + "learning_rate": 8.391680255469494e-06, + "loss": 3.079, + "step": 18945 + }, + { + "epoch": 1.287539067808126, + "grad_norm": 6.528059959411621, + "learning_rate": 8.391255605381167e-06, + "loss": 3.4117, + "step": 18950 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 5.860785484313965, + "learning_rate": 8.39083095529284e-06, + "loss": 3.2399, + "step": 18955 + }, + { + "epoch": 1.2882185079494497, + "grad_norm": 6.9936909675598145, + "learning_rate": 8.390406305204512e-06, + "loss": 2.9482, + "step": 18960 + }, + { + "epoch": 1.2885582280201113, + "grad_norm": 7.116635322570801, + "learning_rate": 8.389981655116185e-06, + "loss": 3.1112, + "step": 18965 + }, + { + "epoch": 1.2888979480907732, + "grad_norm": 7.252281665802002, + "learning_rate": 8.389557005027858e-06, + "loss": 3.2437, + "step": 18970 + }, + { + "epoch": 1.289237668161435, + "grad_norm": 5.950170040130615, + "learning_rate": 8.38913235493953e-06, + "loss": 3.2583, + "step": 18975 + }, + { + "epoch": 1.2895773882320967, + "grad_norm": 6.028046607971191, + "learning_rate": 8.388707704851204e-06, + "loss": 3.1901, + "step": 18980 + }, + { + "epoch": 1.2899171083027585, + "grad_norm": 5.810698986053467, + "learning_rate": 8.388283054762876e-06, + "loss": 3.2948, + "step": 18985 + }, + { + "epoch": 1.2902568283734204, + "grad_norm": 5.576903820037842, + "learning_rate": 8.38785840467455e-06, + "loss": 3.0526, + "step": 18990 + }, + { + "epoch": 1.290596548444082, + "grad_norm": 9.017126083374023, + "learning_rate": 8.387433754586222e-06, + "loss": 3.5939, + "step": 18995 + }, + { + "epoch": 1.2909362685147439, + "grad_norm": 6.148862361907959, + "learning_rate": 8.387009104497895e-06, + "loss": 3.0914, + "step": 19000 + }, + { + "epoch": 1.2912759885854057, + "grad_norm": 7.3319830894470215, + "learning_rate": 8.386584454409568e-06, + "loss": 3.1803, + "step": 19005 + }, + { + "epoch": 1.2916157086560673, + "grad_norm": 6.064766883850098, + "learning_rate": 8.38615980432124e-06, + "loss": 3.2413, + "step": 19010 + }, + { + "epoch": 1.2919554287267292, + "grad_norm": 7.72840690612793, + "learning_rate": 8.385735154232913e-06, + "loss": 3.2733, + "step": 19015 + }, + { + "epoch": 1.292295148797391, + "grad_norm": 8.483951568603516, + "learning_rate": 8.385310504144586e-06, + "loss": 3.2414, + "step": 19020 + }, + { + "epoch": 1.2926348688680527, + "grad_norm": 5.911952495574951, + "learning_rate": 8.384885854056259e-06, + "loss": 3.2575, + "step": 19025 + }, + { + "epoch": 1.2929745889387145, + "grad_norm": 7.489439487457275, + "learning_rate": 8.384461203967932e-06, + "loss": 3.2001, + "step": 19030 + }, + { + "epoch": 1.2933143090093764, + "grad_norm": 6.282622814178467, + "learning_rate": 8.384036553879604e-06, + "loss": 3.3626, + "step": 19035 + }, + { + "epoch": 1.293654029080038, + "grad_norm": 6.3597731590271, + "learning_rate": 8.383611903791277e-06, + "loss": 3.1794, + "step": 19040 + }, + { + "epoch": 1.2939937491506999, + "grad_norm": 6.775139808654785, + "learning_rate": 8.383187253702948e-06, + "loss": 3.2085, + "step": 19045 + }, + { + "epoch": 1.2943334692213617, + "grad_norm": 6.086141109466553, + "learning_rate": 8.382762603614623e-06, + "loss": 2.9277, + "step": 19050 + }, + { + "epoch": 1.2946731892920234, + "grad_norm": 5.864109992980957, + "learning_rate": 8.382337953526296e-06, + "loss": 3.3096, + "step": 19055 + }, + { + "epoch": 1.2950129093626852, + "grad_norm": 6.292870998382568, + "learning_rate": 8.381913303437967e-06, + "loss": 3.3538, + "step": 19060 + }, + { + "epoch": 1.295352629433347, + "grad_norm": 6.065367698669434, + "learning_rate": 8.381488653349641e-06, + "loss": 3.0079, + "step": 19065 + }, + { + "epoch": 1.2956923495040087, + "grad_norm": 7.654036998748779, + "learning_rate": 8.381064003261314e-06, + "loss": 2.9769, + "step": 19070 + }, + { + "epoch": 1.2960320695746705, + "grad_norm": 8.174041748046875, + "learning_rate": 8.380639353172985e-06, + "loss": 3.1046, + "step": 19075 + }, + { + "epoch": 1.2963717896453322, + "grad_norm": 6.325058937072754, + "learning_rate": 8.38021470308466e-06, + "loss": 3.4376, + "step": 19080 + }, + { + "epoch": 1.296711509715994, + "grad_norm": 7.669739723205566, + "learning_rate": 8.379790052996332e-06, + "loss": 3.3836, + "step": 19085 + }, + { + "epoch": 1.2970512297866557, + "grad_norm": 6.269105911254883, + "learning_rate": 8.379365402908003e-06, + "loss": 3.254, + "step": 19090 + }, + { + "epoch": 1.2973909498573175, + "grad_norm": 6.209226131439209, + "learning_rate": 8.378940752819678e-06, + "loss": 3.0058, + "step": 19095 + }, + { + "epoch": 1.2977306699279794, + "grad_norm": 8.52430248260498, + "learning_rate": 8.37851610273135e-06, + "loss": 3.1168, + "step": 19100 + }, + { + "epoch": 1.298070389998641, + "grad_norm": 7.442807674407959, + "learning_rate": 8.378091452643022e-06, + "loss": 3.1134, + "step": 19105 + }, + { + "epoch": 1.2984101100693028, + "grad_norm": 7.499734878540039, + "learning_rate": 8.377666802554696e-06, + "loss": 3.155, + "step": 19110 + }, + { + "epoch": 1.2987498301399647, + "grad_norm": 7.293271541595459, + "learning_rate": 8.377242152466367e-06, + "loss": 3.1567, + "step": 19115 + }, + { + "epoch": 1.2990895502106263, + "grad_norm": 6.478341102600098, + "learning_rate": 8.37681750237804e-06, + "loss": 3.2365, + "step": 19120 + }, + { + "epoch": 1.2994292702812882, + "grad_norm": 6.777817249298096, + "learning_rate": 8.376392852289715e-06, + "loss": 3.2395, + "step": 19125 + }, + { + "epoch": 1.29976899035195, + "grad_norm": 6.47390604019165, + "learning_rate": 8.375968202201386e-06, + "loss": 3.2304, + "step": 19130 + }, + { + "epoch": 1.3001087104226117, + "grad_norm": 7.717251777648926, + "learning_rate": 8.375543552113059e-06, + "loss": 3.1551, + "step": 19135 + }, + { + "epoch": 1.3004484304932735, + "grad_norm": 7.267390251159668, + "learning_rate": 8.375118902024733e-06, + "loss": 3.0965, + "step": 19140 + }, + { + "epoch": 1.3007881505639354, + "grad_norm": 7.695171356201172, + "learning_rate": 8.374694251936404e-06, + "loss": 3.1527, + "step": 19145 + }, + { + "epoch": 1.301127870634597, + "grad_norm": 5.65108060836792, + "learning_rate": 8.374269601848077e-06, + "loss": 3.2778, + "step": 19150 + }, + { + "epoch": 1.3014675907052589, + "grad_norm": 5.950771331787109, + "learning_rate": 8.373844951759752e-06, + "loss": 3.0786, + "step": 19155 + }, + { + "epoch": 1.3018073107759207, + "grad_norm": 7.2466607093811035, + "learning_rate": 8.373420301671423e-06, + "loss": 3.2549, + "step": 19160 + }, + { + "epoch": 1.3021470308465823, + "grad_norm": 6.049253940582275, + "learning_rate": 8.372995651583095e-06, + "loss": 3.085, + "step": 19165 + }, + { + "epoch": 1.3024867509172442, + "grad_norm": 6.4775071144104, + "learning_rate": 8.37257100149477e-06, + "loss": 3.4594, + "step": 19170 + }, + { + "epoch": 1.302826470987906, + "grad_norm": 6.6653571128845215, + "learning_rate": 8.372146351406441e-06, + "loss": 2.9946, + "step": 19175 + }, + { + "epoch": 1.3031661910585677, + "grad_norm": 9.162681579589844, + "learning_rate": 8.371721701318114e-06, + "loss": 3.1415, + "step": 19180 + }, + { + "epoch": 1.3035059111292295, + "grad_norm": 5.466099262237549, + "learning_rate": 8.371297051229787e-06, + "loss": 3.0897, + "step": 19185 + }, + { + "epoch": 1.3038456311998914, + "grad_norm": 5.5643086433410645, + "learning_rate": 8.37087240114146e-06, + "loss": 3.4182, + "step": 19190 + }, + { + "epoch": 1.304185351270553, + "grad_norm": 5.790210247039795, + "learning_rate": 8.370447751053134e-06, + "loss": 3.1199, + "step": 19195 + }, + { + "epoch": 1.3045250713412149, + "grad_norm": 7.517702102661133, + "learning_rate": 8.370023100964805e-06, + "loss": 3.5079, + "step": 19200 + }, + { + "epoch": 1.3048647914118767, + "grad_norm": 8.230661392211914, + "learning_rate": 8.369598450876478e-06, + "loss": 3.141, + "step": 19205 + }, + { + "epoch": 1.3052045114825384, + "grad_norm": 5.586494445800781, + "learning_rate": 8.369173800788152e-06, + "loss": 3.1542, + "step": 19210 + }, + { + "epoch": 1.3055442315532002, + "grad_norm": 7.148723125457764, + "learning_rate": 8.368749150699824e-06, + "loss": 3.3981, + "step": 19215 + }, + { + "epoch": 1.305883951623862, + "grad_norm": 5.89068603515625, + "learning_rate": 8.368324500611496e-06, + "loss": 3.1062, + "step": 19220 + }, + { + "epoch": 1.3062236716945237, + "grad_norm": 6.9835734367370605, + "learning_rate": 8.36789985052317e-06, + "loss": 3.3024, + "step": 19225 + }, + { + "epoch": 1.3065633917651855, + "grad_norm": 7.737909317016602, + "learning_rate": 8.367475200434842e-06, + "loss": 3.2222, + "step": 19230 + }, + { + "epoch": 1.3069031118358474, + "grad_norm": 8.149100303649902, + "learning_rate": 8.367050550346515e-06, + "loss": 2.9715, + "step": 19235 + }, + { + "epoch": 1.307242831906509, + "grad_norm": 6.270033836364746, + "learning_rate": 8.36662590025819e-06, + "loss": 3.1165, + "step": 19240 + }, + { + "epoch": 1.3075825519771709, + "grad_norm": 6.056464672088623, + "learning_rate": 8.36620125016986e-06, + "loss": 3.3726, + "step": 19245 + }, + { + "epoch": 1.3079222720478325, + "grad_norm": 7.831257343292236, + "learning_rate": 8.365776600081533e-06, + "loss": 3.4169, + "step": 19250 + }, + { + "epoch": 1.3082619921184944, + "grad_norm": 5.7067413330078125, + "learning_rate": 8.365351949993208e-06, + "loss": 3.4883, + "step": 19255 + }, + { + "epoch": 1.308601712189156, + "grad_norm": 7.2321577072143555, + "learning_rate": 8.364927299904879e-06, + "loss": 3.315, + "step": 19260 + }, + { + "epoch": 1.3089414322598178, + "grad_norm": 5.994912624359131, + "learning_rate": 8.364502649816552e-06, + "loss": 2.6979, + "step": 19265 + }, + { + "epoch": 1.3092811523304797, + "grad_norm": 9.311325073242188, + "learning_rate": 8.364077999728224e-06, + "loss": 3.209, + "step": 19270 + }, + { + "epoch": 1.3096208724011413, + "grad_norm": 5.196181297302246, + "learning_rate": 8.363653349639897e-06, + "loss": 3.0593, + "step": 19275 + }, + { + "epoch": 1.3099605924718032, + "grad_norm": 6.7414140701293945, + "learning_rate": 8.36322869955157e-06, + "loss": 2.7671, + "step": 19280 + }, + { + "epoch": 1.310300312542465, + "grad_norm": 6.174583435058594, + "learning_rate": 8.362804049463243e-06, + "loss": 3.1643, + "step": 19285 + }, + { + "epoch": 1.3106400326131267, + "grad_norm": 5.524929046630859, + "learning_rate": 8.362379399374916e-06, + "loss": 3.0774, + "step": 19290 + }, + { + "epoch": 1.3109797526837885, + "grad_norm": 6.122289180755615, + "learning_rate": 8.361954749286588e-06, + "loss": 3.2135, + "step": 19295 + }, + { + "epoch": 1.3113194727544504, + "grad_norm": 5.983076572418213, + "learning_rate": 8.361530099198261e-06, + "loss": 3.2013, + "step": 19300 + }, + { + "epoch": 1.311659192825112, + "grad_norm": 5.638158321380615, + "learning_rate": 8.361105449109934e-06, + "loss": 3.2798, + "step": 19305 + }, + { + "epoch": 1.3119989128957739, + "grad_norm": 6.106377124786377, + "learning_rate": 8.360680799021607e-06, + "loss": 2.9773, + "step": 19310 + }, + { + "epoch": 1.3123386329664357, + "grad_norm": 6.530561923980713, + "learning_rate": 8.36025614893328e-06, + "loss": 3.0637, + "step": 19315 + }, + { + "epoch": 1.3126783530370973, + "grad_norm": 5.066561222076416, + "learning_rate": 8.359831498844952e-06, + "loss": 3.0977, + "step": 19320 + }, + { + "epoch": 1.3130180731077592, + "grad_norm": 6.4318366050720215, + "learning_rate": 8.359406848756625e-06, + "loss": 3.2356, + "step": 19325 + }, + { + "epoch": 1.313357793178421, + "grad_norm": 7.358024597167969, + "learning_rate": 8.358982198668298e-06, + "loss": 3.2132, + "step": 19330 + }, + { + "epoch": 1.3136975132490827, + "grad_norm": 7.528659343719482, + "learning_rate": 8.35855754857997e-06, + "loss": 3.0566, + "step": 19335 + }, + { + "epoch": 1.3140372333197445, + "grad_norm": 7.123202800750732, + "learning_rate": 8.358132898491644e-06, + "loss": 3.2515, + "step": 19340 + }, + { + "epoch": 1.3143769533904064, + "grad_norm": 6.772909164428711, + "learning_rate": 8.357708248403316e-06, + "loss": 3.1662, + "step": 19345 + }, + { + "epoch": 1.314716673461068, + "grad_norm": 7.260865211486816, + "learning_rate": 8.357283598314989e-06, + "loss": 3.1784, + "step": 19350 + }, + { + "epoch": 1.3150563935317299, + "grad_norm": 5.88129997253418, + "learning_rate": 8.356858948226662e-06, + "loss": 2.9872, + "step": 19355 + }, + { + "epoch": 1.3153961136023917, + "grad_norm": 8.03112506866455, + "learning_rate": 8.356434298138335e-06, + "loss": 3.0345, + "step": 19360 + }, + { + "epoch": 1.3157358336730534, + "grad_norm": 6.53188419342041, + "learning_rate": 8.356009648050008e-06, + "loss": 3.2586, + "step": 19365 + }, + { + "epoch": 1.3160755537437152, + "grad_norm": 5.944188117980957, + "learning_rate": 8.35558499796168e-06, + "loss": 3.2605, + "step": 19370 + }, + { + "epoch": 1.316415273814377, + "grad_norm": 5.594062805175781, + "learning_rate": 8.355160347873353e-06, + "loss": 3.4771, + "step": 19375 + }, + { + "epoch": 1.3167549938850387, + "grad_norm": 6.361100673675537, + "learning_rate": 8.354735697785026e-06, + "loss": 3.1591, + "step": 19380 + }, + { + "epoch": 1.3170947139557005, + "grad_norm": 6.456884860992432, + "learning_rate": 8.354311047696699e-06, + "loss": 3.2745, + "step": 19385 + }, + { + "epoch": 1.3174344340263624, + "grad_norm": 7.67694091796875, + "learning_rate": 8.353886397608372e-06, + "loss": 3.3724, + "step": 19390 + }, + { + "epoch": 1.317774154097024, + "grad_norm": 9.220251083374023, + "learning_rate": 8.353461747520044e-06, + "loss": 2.8428, + "step": 19395 + }, + { + "epoch": 1.3181138741676859, + "grad_norm": 6.029196739196777, + "learning_rate": 8.353037097431717e-06, + "loss": 3.3172, + "step": 19400 + }, + { + "epoch": 1.3184535942383477, + "grad_norm": 6.595646381378174, + "learning_rate": 8.35261244734339e-06, + "loss": 3.1548, + "step": 19405 + }, + { + "epoch": 1.3187933143090094, + "grad_norm": 6.325791835784912, + "learning_rate": 8.352187797255063e-06, + "loss": 3.1432, + "step": 19410 + }, + { + "epoch": 1.3191330343796712, + "grad_norm": 6.849134922027588, + "learning_rate": 8.351763147166736e-06, + "loss": 3.1118, + "step": 19415 + }, + { + "epoch": 1.3194727544503329, + "grad_norm": 6.595311641693115, + "learning_rate": 8.351338497078408e-06, + "loss": 3.4271, + "step": 19420 + }, + { + "epoch": 1.3198124745209947, + "grad_norm": 6.0847296714782715, + "learning_rate": 8.350913846990081e-06, + "loss": 3.0742, + "step": 19425 + }, + { + "epoch": 1.3201521945916566, + "grad_norm": 6.516181945800781, + "learning_rate": 8.350489196901754e-06, + "loss": 3.2298, + "step": 19430 + }, + { + "epoch": 1.3204919146623182, + "grad_norm": 5.87290096282959, + "learning_rate": 8.350064546813427e-06, + "loss": 3.21, + "step": 19435 + }, + { + "epoch": 1.32083163473298, + "grad_norm": 7.205922603607178, + "learning_rate": 8.3496398967251e-06, + "loss": 3.1224, + "step": 19440 + }, + { + "epoch": 1.3211713548036417, + "grad_norm": 8.140690803527832, + "learning_rate": 8.349215246636772e-06, + "loss": 3.2741, + "step": 19445 + }, + { + "epoch": 1.3215110748743035, + "grad_norm": 9.143299102783203, + "learning_rate": 8.348790596548445e-06, + "loss": 3.1911, + "step": 19450 + }, + { + "epoch": 1.3218507949449654, + "grad_norm": 6.351974010467529, + "learning_rate": 8.348365946460118e-06, + "loss": 3.3637, + "step": 19455 + }, + { + "epoch": 1.322190515015627, + "grad_norm": 6.682953357696533, + "learning_rate": 8.347941296371789e-06, + "loss": 3.4374, + "step": 19460 + }, + { + "epoch": 1.3225302350862889, + "grad_norm": 5.674413681030273, + "learning_rate": 8.347516646283464e-06, + "loss": 3.0464, + "step": 19465 + }, + { + "epoch": 1.3228699551569507, + "grad_norm": 6.8382134437561035, + "learning_rate": 8.347091996195136e-06, + "loss": 3.2002, + "step": 19470 + }, + { + "epoch": 1.3232096752276123, + "grad_norm": 7.451681137084961, + "learning_rate": 8.346667346106807e-06, + "loss": 3.2134, + "step": 19475 + }, + { + "epoch": 1.3235493952982742, + "grad_norm": 6.842391014099121, + "learning_rate": 8.346242696018482e-06, + "loss": 3.0829, + "step": 19480 + }, + { + "epoch": 1.323889115368936, + "grad_norm": 7.553800582885742, + "learning_rate": 8.345818045930155e-06, + "loss": 3.2331, + "step": 19485 + }, + { + "epoch": 1.3242288354395977, + "grad_norm": 5.165170192718506, + "learning_rate": 8.345393395841826e-06, + "loss": 2.9704, + "step": 19490 + }, + { + "epoch": 1.3245685555102595, + "grad_norm": 7.003549098968506, + "learning_rate": 8.3449687457535e-06, + "loss": 3.2779, + "step": 19495 + }, + { + "epoch": 1.3249082755809214, + "grad_norm": 5.291104316711426, + "learning_rate": 8.344544095665173e-06, + "loss": 3.176, + "step": 19500 + }, + { + "epoch": 1.325247995651583, + "grad_norm": 7.776943206787109, + "learning_rate": 8.344119445576844e-06, + "loss": 2.9847, + "step": 19505 + }, + { + "epoch": 1.3255877157222449, + "grad_norm": 11.12006950378418, + "learning_rate": 8.343694795488519e-06, + "loss": 3.2002, + "step": 19510 + }, + { + "epoch": 1.3259274357929067, + "grad_norm": 6.223942279815674, + "learning_rate": 8.343270145400192e-06, + "loss": 3.2316, + "step": 19515 + }, + { + "epoch": 1.3262671558635684, + "grad_norm": 7.327055931091309, + "learning_rate": 8.342845495311863e-06, + "loss": 3.0784, + "step": 19520 + }, + { + "epoch": 1.3266068759342302, + "grad_norm": 6.280395984649658, + "learning_rate": 8.342420845223537e-06, + "loss": 3.1201, + "step": 19525 + }, + { + "epoch": 1.326946596004892, + "grad_norm": 7.328942775726318, + "learning_rate": 8.341996195135208e-06, + "loss": 2.9462, + "step": 19530 + }, + { + "epoch": 1.3272863160755537, + "grad_norm": 6.825775146484375, + "learning_rate": 8.341571545046883e-06, + "loss": 3.2945, + "step": 19535 + }, + { + "epoch": 1.3276260361462155, + "grad_norm": 7.653871536254883, + "learning_rate": 8.341146894958556e-06, + "loss": 3.2916, + "step": 19540 + }, + { + "epoch": 1.3279657562168774, + "grad_norm": 4.754966735839844, + "learning_rate": 8.340722244870227e-06, + "loss": 3.1044, + "step": 19545 + }, + { + "epoch": 1.328305476287539, + "grad_norm": 5.4903340339660645, + "learning_rate": 8.340297594781901e-06, + "loss": 3.0674, + "step": 19550 + }, + { + "epoch": 1.3286451963582009, + "grad_norm": 8.782718658447266, + "learning_rate": 8.339872944693574e-06, + "loss": 3.3843, + "step": 19555 + }, + { + "epoch": 1.3289849164288627, + "grad_norm": 5.704433917999268, + "learning_rate": 8.339448294605245e-06, + "loss": 3.2289, + "step": 19560 + }, + { + "epoch": 1.3293246364995244, + "grad_norm": 7.910236358642578, + "learning_rate": 8.33902364451692e-06, + "loss": 3.0763, + "step": 19565 + }, + { + "epoch": 1.3296643565701862, + "grad_norm": 6.975687503814697, + "learning_rate": 8.338598994428592e-06, + "loss": 3.1194, + "step": 19570 + }, + { + "epoch": 1.330004076640848, + "grad_norm": 7.450704574584961, + "learning_rate": 8.338174344340263e-06, + "loss": 3.3642, + "step": 19575 + }, + { + "epoch": 1.3303437967115097, + "grad_norm": 6.194193363189697, + "learning_rate": 8.337749694251938e-06, + "loss": 3.1061, + "step": 19580 + }, + { + "epoch": 1.3306835167821716, + "grad_norm": 6.44516134262085, + "learning_rate": 8.33732504416361e-06, + "loss": 3.1879, + "step": 19585 + }, + { + "epoch": 1.3310232368528332, + "grad_norm": 6.84967041015625, + "learning_rate": 8.336900394075282e-06, + "loss": 3.1064, + "step": 19590 + }, + { + "epoch": 1.331362956923495, + "grad_norm": 6.718817234039307, + "learning_rate": 8.336475743986956e-06, + "loss": 3.2975, + "step": 19595 + }, + { + "epoch": 1.331702676994157, + "grad_norm": 7.900121212005615, + "learning_rate": 8.33605109389863e-06, + "loss": 3.3786, + "step": 19600 + }, + { + "epoch": 1.3320423970648185, + "grad_norm": 6.682680606842041, + "learning_rate": 8.3356264438103e-06, + "loss": 3.2681, + "step": 19605 + }, + { + "epoch": 1.3323821171354804, + "grad_norm": 6.9952239990234375, + "learning_rate": 8.335201793721975e-06, + "loss": 3.0926, + "step": 19610 + }, + { + "epoch": 1.332721837206142, + "grad_norm": 8.210883140563965, + "learning_rate": 8.334777143633646e-06, + "loss": 2.9614, + "step": 19615 + }, + { + "epoch": 1.3330615572768039, + "grad_norm": 7.508040428161621, + "learning_rate": 8.334352493545319e-06, + "loss": 3.5173, + "step": 19620 + }, + { + "epoch": 1.3334012773474657, + "grad_norm": 7.450888156890869, + "learning_rate": 8.333927843456993e-06, + "loss": 3.2364, + "step": 19625 + }, + { + "epoch": 1.3337409974181273, + "grad_norm": 7.617779731750488, + "learning_rate": 8.333503193368664e-06, + "loss": 3.5648, + "step": 19630 + }, + { + "epoch": 1.3340807174887892, + "grad_norm": 6.939319133758545, + "learning_rate": 8.333078543280337e-06, + "loss": 3.197, + "step": 19635 + }, + { + "epoch": 1.334420437559451, + "grad_norm": 6.768627166748047, + "learning_rate": 8.332653893192012e-06, + "loss": 3.0926, + "step": 19640 + }, + { + "epoch": 1.3347601576301127, + "grad_norm": 5.774292945861816, + "learning_rate": 8.332229243103683e-06, + "loss": 3.0821, + "step": 19645 + }, + { + "epoch": 1.3350998777007745, + "grad_norm": 6.215695858001709, + "learning_rate": 8.331804593015355e-06, + "loss": 2.9089, + "step": 19650 + }, + { + "epoch": 1.3354395977714364, + "grad_norm": 5.771040439605713, + "learning_rate": 8.33137994292703e-06, + "loss": 2.9974, + "step": 19655 + }, + { + "epoch": 1.335779317842098, + "grad_norm": 6.505273818969727, + "learning_rate": 8.330955292838701e-06, + "loss": 3.2638, + "step": 19660 + }, + { + "epoch": 1.3361190379127599, + "grad_norm": 7.166094779968262, + "learning_rate": 8.330530642750374e-06, + "loss": 3.3849, + "step": 19665 + }, + { + "epoch": 1.3364587579834217, + "grad_norm": 6.752564430236816, + "learning_rate": 8.330105992662048e-06, + "loss": 3.2852, + "step": 19670 + }, + { + "epoch": 1.3367984780540834, + "grad_norm": 7.420102596282959, + "learning_rate": 8.32968134257372e-06, + "loss": 3.3329, + "step": 19675 + }, + { + "epoch": 1.3371381981247452, + "grad_norm": 8.453338623046875, + "learning_rate": 8.329256692485392e-06, + "loss": 3.2552, + "step": 19680 + }, + { + "epoch": 1.337477918195407, + "grad_norm": 7.14545202255249, + "learning_rate": 8.328832042397065e-06, + "loss": 3.3351, + "step": 19685 + }, + { + "epoch": 1.3378176382660687, + "grad_norm": 7.012853622436523, + "learning_rate": 8.328407392308738e-06, + "loss": 3.1287, + "step": 19690 + }, + { + "epoch": 1.3381573583367306, + "grad_norm": 5.680570125579834, + "learning_rate": 8.32798274222041e-06, + "loss": 3.228, + "step": 19695 + }, + { + "epoch": 1.3384970784073924, + "grad_norm": 7.15523624420166, + "learning_rate": 8.327558092132083e-06, + "loss": 3.1025, + "step": 19700 + }, + { + "epoch": 1.338836798478054, + "grad_norm": 6.431162357330322, + "learning_rate": 8.327133442043756e-06, + "loss": 3.1477, + "step": 19705 + }, + { + "epoch": 1.3391765185487159, + "grad_norm": 7.9767680168151855, + "learning_rate": 8.326708791955429e-06, + "loss": 3.1083, + "step": 19710 + }, + { + "epoch": 1.3395162386193777, + "grad_norm": 6.172760009765625, + "learning_rate": 8.326284141867102e-06, + "loss": 3.0188, + "step": 19715 + }, + { + "epoch": 1.3398559586900394, + "grad_norm": 8.889629364013672, + "learning_rate": 8.325859491778775e-06, + "loss": 3.3188, + "step": 19720 + }, + { + "epoch": 1.3401956787607012, + "grad_norm": 5.882484436035156, + "learning_rate": 8.325434841690447e-06, + "loss": 3.1842, + "step": 19725 + }, + { + "epoch": 1.340535398831363, + "grad_norm": 5.757203578948975, + "learning_rate": 8.32501019160212e-06, + "loss": 3.1787, + "step": 19730 + }, + { + "epoch": 1.3408751189020247, + "grad_norm": 5.941006660461426, + "learning_rate": 8.324585541513793e-06, + "loss": 3.2041, + "step": 19735 + }, + { + "epoch": 1.3412148389726866, + "grad_norm": 6.554311752319336, + "learning_rate": 8.324160891425466e-06, + "loss": 2.9435, + "step": 19740 + }, + { + "epoch": 1.3415545590433484, + "grad_norm": 6.054712295532227, + "learning_rate": 8.323736241337139e-06, + "loss": 3.2462, + "step": 19745 + }, + { + "epoch": 1.34189427911401, + "grad_norm": 7.657281398773193, + "learning_rate": 8.323311591248811e-06, + "loss": 3.0685, + "step": 19750 + }, + { + "epoch": 1.342233999184672, + "grad_norm": 7.230365753173828, + "learning_rate": 8.322886941160484e-06, + "loss": 3.2472, + "step": 19755 + }, + { + "epoch": 1.3425737192553335, + "grad_norm": 6.827581405639648, + "learning_rate": 8.322462291072157e-06, + "loss": 3.0997, + "step": 19760 + }, + { + "epoch": 1.3429134393259954, + "grad_norm": 7.206411361694336, + "learning_rate": 8.32203764098383e-06, + "loss": 2.9735, + "step": 19765 + }, + { + "epoch": 1.3432531593966572, + "grad_norm": 9.182312965393066, + "learning_rate": 8.321612990895503e-06, + "loss": 3.1701, + "step": 19770 + }, + { + "epoch": 1.3435928794673189, + "grad_norm": 9.972248077392578, + "learning_rate": 8.321188340807175e-06, + "loss": 3.0559, + "step": 19775 + }, + { + "epoch": 1.3439325995379807, + "grad_norm": 5.810710430145264, + "learning_rate": 8.320763690718848e-06, + "loss": 3.2992, + "step": 19780 + }, + { + "epoch": 1.3442723196086424, + "grad_norm": 7.799502849578857, + "learning_rate": 8.320339040630521e-06, + "loss": 3.2102, + "step": 19785 + }, + { + "epoch": 1.3446120396793042, + "grad_norm": 6.0126729011535645, + "learning_rate": 8.319914390542194e-06, + "loss": 3.293, + "step": 19790 + }, + { + "epoch": 1.344951759749966, + "grad_norm": 6.201000690460205, + "learning_rate": 8.319489740453867e-06, + "loss": 3.2855, + "step": 19795 + }, + { + "epoch": 1.3452914798206277, + "grad_norm": 8.71015739440918, + "learning_rate": 8.31906509036554e-06, + "loss": 3.1096, + "step": 19800 + }, + { + "epoch": 1.3456311998912895, + "grad_norm": 7.407625198364258, + "learning_rate": 8.318640440277212e-06, + "loss": 3.2122, + "step": 19805 + }, + { + "epoch": 1.3459709199619514, + "grad_norm": 5.852577209472656, + "learning_rate": 8.318215790188885e-06, + "loss": 2.9349, + "step": 19810 + }, + { + "epoch": 1.346310640032613, + "grad_norm": 7.332413673400879, + "learning_rate": 8.317791140100558e-06, + "loss": 3.1347, + "step": 19815 + }, + { + "epoch": 1.3466503601032749, + "grad_norm": 6.10461950302124, + "learning_rate": 8.31736649001223e-06, + "loss": 3.0971, + "step": 19820 + }, + { + "epoch": 1.3469900801739367, + "grad_norm": 6.0449442863464355, + "learning_rate": 8.316941839923904e-06, + "loss": 3.1075, + "step": 19825 + }, + { + "epoch": 1.3473298002445984, + "grad_norm": 5.648074150085449, + "learning_rate": 8.316517189835576e-06, + "loss": 3.1576, + "step": 19830 + }, + { + "epoch": 1.3476695203152602, + "grad_norm": 4.981823444366455, + "learning_rate": 8.316092539747249e-06, + "loss": 3.3008, + "step": 19835 + }, + { + "epoch": 1.348009240385922, + "grad_norm": 7.10968017578125, + "learning_rate": 8.315667889658922e-06, + "loss": 3.0995, + "step": 19840 + }, + { + "epoch": 1.3483489604565837, + "grad_norm": 7.15934419631958, + "learning_rate": 8.315243239570595e-06, + "loss": 3.1013, + "step": 19845 + }, + { + "epoch": 1.3486886805272456, + "grad_norm": 7.996486186981201, + "learning_rate": 8.314818589482268e-06, + "loss": 3.3279, + "step": 19850 + }, + { + "epoch": 1.3490284005979074, + "grad_norm": 5.352023601531982, + "learning_rate": 8.31439393939394e-06, + "loss": 3.2385, + "step": 19855 + }, + { + "epoch": 1.349368120668569, + "grad_norm": 7.239404201507568, + "learning_rate": 8.313969289305613e-06, + "loss": 3.1183, + "step": 19860 + }, + { + "epoch": 1.349707840739231, + "grad_norm": 7.80229377746582, + "learning_rate": 8.313544639217286e-06, + "loss": 3.306, + "step": 19865 + }, + { + "epoch": 1.3500475608098927, + "grad_norm": 6.34010648727417, + "learning_rate": 8.313119989128959e-06, + "loss": 3.301, + "step": 19870 + }, + { + "epoch": 1.3503872808805544, + "grad_norm": 7.470625400543213, + "learning_rate": 8.312695339040632e-06, + "loss": 3.4359, + "step": 19875 + }, + { + "epoch": 1.3507270009512162, + "grad_norm": 6.833315372467041, + "learning_rate": 8.312270688952304e-06, + "loss": 3.3628, + "step": 19880 + }, + { + "epoch": 1.351066721021878, + "grad_norm": 6.155881404876709, + "learning_rate": 8.311846038863977e-06, + "loss": 3.3, + "step": 19885 + }, + { + "epoch": 1.3514064410925397, + "grad_norm": 7.4692182540893555, + "learning_rate": 8.31142138877565e-06, + "loss": 3.1821, + "step": 19890 + }, + { + "epoch": 1.3517461611632016, + "grad_norm": 6.425995826721191, + "learning_rate": 8.310996738687323e-06, + "loss": 3.2195, + "step": 19895 + }, + { + "epoch": 1.3520858812338634, + "grad_norm": 7.365162372589111, + "learning_rate": 8.310572088598996e-06, + "loss": 3.1467, + "step": 19900 + }, + { + "epoch": 1.352425601304525, + "grad_norm": 5.932562351226807, + "learning_rate": 8.310147438510668e-06, + "loss": 3.1246, + "step": 19905 + }, + { + "epoch": 1.352765321375187, + "grad_norm": 7.492823123931885, + "learning_rate": 8.309722788422341e-06, + "loss": 3.1921, + "step": 19910 + }, + { + "epoch": 1.3531050414458488, + "grad_norm": 5.982995510101318, + "learning_rate": 8.309298138334014e-06, + "loss": 3.2296, + "step": 19915 + }, + { + "epoch": 1.3534447615165104, + "grad_norm": 6.377257823944092, + "learning_rate": 8.308873488245687e-06, + "loss": 3.2332, + "step": 19920 + }, + { + "epoch": 1.3537844815871722, + "grad_norm": 6.247061729431152, + "learning_rate": 8.30844883815736e-06, + "loss": 3.2057, + "step": 19925 + }, + { + "epoch": 1.3541242016578339, + "grad_norm": 6.991499423980713, + "learning_rate": 8.308024188069032e-06, + "loss": 3.4028, + "step": 19930 + }, + { + "epoch": 1.3544639217284957, + "grad_norm": 6.482231616973877, + "learning_rate": 8.307599537980705e-06, + "loss": 3.2856, + "step": 19935 + }, + { + "epoch": 1.3548036417991576, + "grad_norm": 6.789702892303467, + "learning_rate": 8.307174887892378e-06, + "loss": 3.2599, + "step": 19940 + }, + { + "epoch": 1.3551433618698192, + "grad_norm": 5.924798965454102, + "learning_rate": 8.30675023780405e-06, + "loss": 3.2742, + "step": 19945 + }, + { + "epoch": 1.355483081940481, + "grad_norm": 6.23853874206543, + "learning_rate": 8.306325587715724e-06, + "loss": 3.2709, + "step": 19950 + }, + { + "epoch": 1.3558228020111427, + "grad_norm": 6.315508842468262, + "learning_rate": 8.305900937627396e-06, + "loss": 3.2808, + "step": 19955 + }, + { + "epoch": 1.3561625220818045, + "grad_norm": 6.114943504333496, + "learning_rate": 8.305476287539067e-06, + "loss": 3.3518, + "step": 19960 + }, + { + "epoch": 1.3565022421524664, + "grad_norm": 6.838662624359131, + "learning_rate": 8.305051637450742e-06, + "loss": 3.3012, + "step": 19965 + }, + { + "epoch": 1.356841962223128, + "grad_norm": 7.027342796325684, + "learning_rate": 8.304626987362415e-06, + "loss": 3.1737, + "step": 19970 + }, + { + "epoch": 1.3571816822937899, + "grad_norm": 6.140130996704102, + "learning_rate": 8.304202337274086e-06, + "loss": 3.1138, + "step": 19975 + }, + { + "epoch": 1.3575214023644517, + "grad_norm": 5.8609185218811035, + "learning_rate": 8.30377768718576e-06, + "loss": 3.1898, + "step": 19980 + }, + { + "epoch": 1.3578611224351134, + "grad_norm": 5.996417999267578, + "learning_rate": 8.303353037097433e-06, + "loss": 3.1362, + "step": 19985 + }, + { + "epoch": 1.3582008425057752, + "grad_norm": 5.453254699707031, + "learning_rate": 8.302928387009104e-06, + "loss": 3.2013, + "step": 19990 + }, + { + "epoch": 1.358540562576437, + "grad_norm": 5.849050045013428, + "learning_rate": 8.302503736920779e-06, + "loss": 3.0108, + "step": 19995 + }, + { + "epoch": 1.3588802826470987, + "grad_norm": 6.395934104919434, + "learning_rate": 8.302079086832452e-06, + "loss": 3.3907, + "step": 20000 + }, + { + "epoch": 1.3592200027177606, + "grad_norm": 6.651646614074707, + "learning_rate": 8.301654436744123e-06, + "loss": 3.4401, + "step": 20005 + }, + { + "epoch": 1.3595597227884224, + "grad_norm": 7.57117223739624, + "learning_rate": 8.301229786655797e-06, + "loss": 3.29, + "step": 20010 + }, + { + "epoch": 1.359899442859084, + "grad_norm": 6.21440315246582, + "learning_rate": 8.30080513656747e-06, + "loss": 3.3808, + "step": 20015 + }, + { + "epoch": 1.360239162929746, + "grad_norm": 5.864458084106445, + "learning_rate": 8.300380486479141e-06, + "loss": 3.3253, + "step": 20020 + }, + { + "epoch": 1.3605788830004077, + "grad_norm": 7.215778827667236, + "learning_rate": 8.299955836390816e-06, + "loss": 3.0554, + "step": 20025 + }, + { + "epoch": 1.3609186030710694, + "grad_norm": 5.718527317047119, + "learning_rate": 8.299531186302487e-06, + "loss": 3.1904, + "step": 20030 + }, + { + "epoch": 1.3612583231417312, + "grad_norm": 6.578711986541748, + "learning_rate": 8.29910653621416e-06, + "loss": 3.323, + "step": 20035 + }, + { + "epoch": 1.361598043212393, + "grad_norm": 6.553785800933838, + "learning_rate": 8.298681886125834e-06, + "loss": 3.3691, + "step": 20040 + }, + { + "epoch": 1.3619377632830547, + "grad_norm": 7.395156383514404, + "learning_rate": 8.298257236037505e-06, + "loss": 3.14, + "step": 20045 + }, + { + "epoch": 1.3622774833537166, + "grad_norm": 7.395801067352295, + "learning_rate": 8.297832585949178e-06, + "loss": 3.2924, + "step": 20050 + }, + { + "epoch": 1.3626172034243784, + "grad_norm": 6.735192775726318, + "learning_rate": 8.297407935860852e-06, + "loss": 3.2806, + "step": 20055 + }, + { + "epoch": 1.36295692349504, + "grad_norm": 5.077358722686768, + "learning_rate": 8.296983285772523e-06, + "loss": 3.341, + "step": 20060 + }, + { + "epoch": 1.363296643565702, + "grad_norm": 6.633719444274902, + "learning_rate": 8.296558635684196e-06, + "loss": 3.201, + "step": 20065 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 7.555473327636719, + "learning_rate": 8.29613398559587e-06, + "loss": 3.1532, + "step": 20070 + }, + { + "epoch": 1.3639760837070254, + "grad_norm": 7.257934093475342, + "learning_rate": 8.295709335507542e-06, + "loss": 3.2633, + "step": 20075 + }, + { + "epoch": 1.3643158037776872, + "grad_norm": 6.493884086608887, + "learning_rate": 8.295284685419215e-06, + "loss": 3.1221, + "step": 20080 + }, + { + "epoch": 1.364655523848349, + "grad_norm": 7.447437286376953, + "learning_rate": 8.294860035330889e-06, + "loss": 3.0402, + "step": 20085 + }, + { + "epoch": 1.3649952439190107, + "grad_norm": 6.911088466644287, + "learning_rate": 8.29443538524256e-06, + "loss": 3.2531, + "step": 20090 + }, + { + "epoch": 1.3653349639896726, + "grad_norm": 6.082359313964844, + "learning_rate": 8.294010735154233e-06, + "loss": 3.1033, + "step": 20095 + }, + { + "epoch": 1.3656746840603342, + "grad_norm": 6.114701271057129, + "learning_rate": 8.293586085065906e-06, + "loss": 3.157, + "step": 20100 + }, + { + "epoch": 1.366014404130996, + "grad_norm": 6.880590915679932, + "learning_rate": 8.293161434977579e-06, + "loss": 3.2896, + "step": 20105 + }, + { + "epoch": 1.366354124201658, + "grad_norm": 8.36473560333252, + "learning_rate": 8.292736784889251e-06, + "loss": 3.098, + "step": 20110 + }, + { + "epoch": 1.3666938442723195, + "grad_norm": 7.039590358734131, + "learning_rate": 8.292312134800924e-06, + "loss": 3.359, + "step": 20115 + }, + { + "epoch": 1.3670335643429814, + "grad_norm": 6.374740123748779, + "learning_rate": 8.291887484712597e-06, + "loss": 2.7423, + "step": 20120 + }, + { + "epoch": 1.367373284413643, + "grad_norm": 5.510599613189697, + "learning_rate": 8.29146283462427e-06, + "loss": 2.9991, + "step": 20125 + }, + { + "epoch": 1.3677130044843049, + "grad_norm": 5.722341060638428, + "learning_rate": 8.291038184535943e-06, + "loss": 3.121, + "step": 20130 + }, + { + "epoch": 1.3680527245549667, + "grad_norm": 7.913478374481201, + "learning_rate": 8.290613534447615e-06, + "loss": 3.3241, + "step": 20135 + }, + { + "epoch": 1.3683924446256284, + "grad_norm": 6.322983741760254, + "learning_rate": 8.290188884359288e-06, + "loss": 2.8212, + "step": 20140 + }, + { + "epoch": 1.3687321646962902, + "grad_norm": 6.935922622680664, + "learning_rate": 8.289764234270961e-06, + "loss": 3.1328, + "step": 20145 + }, + { + "epoch": 1.369071884766952, + "grad_norm": 8.514531135559082, + "learning_rate": 8.289339584182634e-06, + "loss": 2.9664, + "step": 20150 + }, + { + "epoch": 1.3694116048376137, + "grad_norm": 8.38914966583252, + "learning_rate": 8.288914934094307e-06, + "loss": 3.3487, + "step": 20155 + }, + { + "epoch": 1.3697513249082756, + "grad_norm": 7.48569393157959, + "learning_rate": 8.28849028400598e-06, + "loss": 3.1853, + "step": 20160 + }, + { + "epoch": 1.3700910449789374, + "grad_norm": 6.586266040802002, + "learning_rate": 8.288065633917652e-06, + "loss": 3.1891, + "step": 20165 + }, + { + "epoch": 1.370430765049599, + "grad_norm": 6.744504928588867, + "learning_rate": 8.287640983829325e-06, + "loss": 3.066, + "step": 20170 + }, + { + "epoch": 1.370770485120261, + "grad_norm": 5.588136196136475, + "learning_rate": 8.287216333740998e-06, + "loss": 3.0195, + "step": 20175 + }, + { + "epoch": 1.3711102051909227, + "grad_norm": 6.160069465637207, + "learning_rate": 8.28679168365267e-06, + "loss": 3.2699, + "step": 20180 + }, + { + "epoch": 1.3714499252615844, + "grad_norm": 5.6150126457214355, + "learning_rate": 8.286367033564343e-06, + "loss": 2.8183, + "step": 20185 + }, + { + "epoch": 1.3717896453322462, + "grad_norm": 8.105490684509277, + "learning_rate": 8.285942383476016e-06, + "loss": 3.3508, + "step": 20190 + }, + { + "epoch": 1.372129365402908, + "grad_norm": 8.174959182739258, + "learning_rate": 8.285517733387689e-06, + "loss": 2.9411, + "step": 20195 + }, + { + "epoch": 1.3724690854735697, + "grad_norm": 7.8685078620910645, + "learning_rate": 8.285093083299362e-06, + "loss": 3.0201, + "step": 20200 + }, + { + "epoch": 1.3728088055442316, + "grad_norm": 5.499654293060303, + "learning_rate": 8.284668433211035e-06, + "loss": 2.9884, + "step": 20205 + }, + { + "epoch": 1.3731485256148934, + "grad_norm": 6.214478969573975, + "learning_rate": 8.284243783122707e-06, + "loss": 3.04, + "step": 20210 + }, + { + "epoch": 1.373488245685555, + "grad_norm": 6.733551025390625, + "learning_rate": 8.28381913303438e-06, + "loss": 3.0929, + "step": 20215 + }, + { + "epoch": 1.373827965756217, + "grad_norm": 6.808114528656006, + "learning_rate": 8.283394482946053e-06, + "loss": 3.0058, + "step": 20220 + }, + { + "epoch": 1.3741676858268788, + "grad_norm": 6.8255133628845215, + "learning_rate": 8.282969832857726e-06, + "loss": 3.254, + "step": 20225 + }, + { + "epoch": 1.3745074058975404, + "grad_norm": 4.972942352294922, + "learning_rate": 8.282545182769399e-06, + "loss": 2.8609, + "step": 20230 + }, + { + "epoch": 1.3748471259682022, + "grad_norm": 8.266292572021484, + "learning_rate": 8.282120532681071e-06, + "loss": 3.2159, + "step": 20235 + }, + { + "epoch": 1.375186846038864, + "grad_norm": 6.793859004974365, + "learning_rate": 8.281695882592744e-06, + "loss": 3.0439, + "step": 20240 + }, + { + "epoch": 1.3755265661095257, + "grad_norm": 6.447922229766846, + "learning_rate": 8.281271232504417e-06, + "loss": 3.0741, + "step": 20245 + }, + { + "epoch": 1.3758662861801876, + "grad_norm": 7.164614200592041, + "learning_rate": 8.28084658241609e-06, + "loss": 3.0678, + "step": 20250 + }, + { + "epoch": 1.3762060062508494, + "grad_norm": 6.327066421508789, + "learning_rate": 8.280421932327763e-06, + "loss": 3.0376, + "step": 20255 + }, + { + "epoch": 1.376545726321511, + "grad_norm": 7.888425350189209, + "learning_rate": 8.279997282239435e-06, + "loss": 3.0893, + "step": 20260 + }, + { + "epoch": 1.376885446392173, + "grad_norm": 9.258403778076172, + "learning_rate": 8.279572632151108e-06, + "loss": 3.1955, + "step": 20265 + }, + { + "epoch": 1.3772251664628345, + "grad_norm": 7.344465255737305, + "learning_rate": 8.279147982062781e-06, + "loss": 3.4413, + "step": 20270 + }, + { + "epoch": 1.3775648865334964, + "grad_norm": 8.619365692138672, + "learning_rate": 8.278723331974454e-06, + "loss": 3.233, + "step": 20275 + }, + { + "epoch": 1.3779046066041583, + "grad_norm": 6.847774505615234, + "learning_rate": 8.278298681886127e-06, + "loss": 3.2011, + "step": 20280 + }, + { + "epoch": 1.3782443266748199, + "grad_norm": 8.585705757141113, + "learning_rate": 8.2778740317978e-06, + "loss": 3.082, + "step": 20285 + }, + { + "epoch": 1.3785840467454817, + "grad_norm": 6.069488048553467, + "learning_rate": 8.277449381709472e-06, + "loss": 3.2703, + "step": 20290 + }, + { + "epoch": 1.3789237668161434, + "grad_norm": 6.568089962005615, + "learning_rate": 8.277024731621145e-06, + "loss": 3.3483, + "step": 20295 + }, + { + "epoch": 1.3792634868868052, + "grad_norm": 5.737105846405029, + "learning_rate": 8.276600081532818e-06, + "loss": 3.2262, + "step": 20300 + }, + { + "epoch": 1.379603206957467, + "grad_norm": 7.2997026443481445, + "learning_rate": 8.27617543144449e-06, + "loss": 2.9837, + "step": 20305 + }, + { + "epoch": 1.3799429270281287, + "grad_norm": 5.876772403717041, + "learning_rate": 8.275750781356163e-06, + "loss": 3.1769, + "step": 20310 + }, + { + "epoch": 1.3802826470987906, + "grad_norm": 6.271533966064453, + "learning_rate": 8.275326131267836e-06, + "loss": 3.272, + "step": 20315 + }, + { + "epoch": 1.3806223671694524, + "grad_norm": 7.924236297607422, + "learning_rate": 8.274901481179509e-06, + "loss": 3.2816, + "step": 20320 + }, + { + "epoch": 1.380962087240114, + "grad_norm": 9.545585632324219, + "learning_rate": 8.274476831091182e-06, + "loss": 3.0874, + "step": 20325 + }, + { + "epoch": 1.381301807310776, + "grad_norm": 6.19189453125, + "learning_rate": 8.274052181002855e-06, + "loss": 3.2961, + "step": 20330 + }, + { + "epoch": 1.3816415273814378, + "grad_norm": 6.384284973144531, + "learning_rate": 8.273627530914527e-06, + "loss": 3.045, + "step": 20335 + }, + { + "epoch": 1.3819812474520994, + "grad_norm": 5.592685222625732, + "learning_rate": 8.2732028808262e-06, + "loss": 3.2522, + "step": 20340 + }, + { + "epoch": 1.3823209675227612, + "grad_norm": 6.603451251983643, + "learning_rate": 8.272778230737873e-06, + "loss": 3.1452, + "step": 20345 + }, + { + "epoch": 1.382660687593423, + "grad_norm": 6.2502546310424805, + "learning_rate": 8.272353580649546e-06, + "loss": 3.1511, + "step": 20350 + }, + { + "epoch": 1.3830004076640847, + "grad_norm": 6.8513641357421875, + "learning_rate": 8.271928930561219e-06, + "loss": 3.1291, + "step": 20355 + }, + { + "epoch": 1.3833401277347466, + "grad_norm": 6.753864765167236, + "learning_rate": 8.271504280472891e-06, + "loss": 3.002, + "step": 20360 + }, + { + "epoch": 1.3836798478054084, + "grad_norm": 6.528906345367432, + "learning_rate": 8.271079630384564e-06, + "loss": 3.3213, + "step": 20365 + }, + { + "epoch": 1.38401956787607, + "grad_norm": 6.263979911804199, + "learning_rate": 8.270654980296237e-06, + "loss": 3.2264, + "step": 20370 + }, + { + "epoch": 1.384359287946732, + "grad_norm": 6.285154342651367, + "learning_rate": 8.270230330207908e-06, + "loss": 3.1795, + "step": 20375 + }, + { + "epoch": 1.3846990080173938, + "grad_norm": 6.148340702056885, + "learning_rate": 8.269805680119583e-06, + "loss": 3.4603, + "step": 20380 + }, + { + "epoch": 1.3850387280880554, + "grad_norm": 6.29748010635376, + "learning_rate": 8.269381030031255e-06, + "loss": 3.4031, + "step": 20385 + }, + { + "epoch": 1.3853784481587172, + "grad_norm": 5.809183597564697, + "learning_rate": 8.268956379942927e-06, + "loss": 3.0266, + "step": 20390 + }, + { + "epoch": 1.385718168229379, + "grad_norm": 6.878056526184082, + "learning_rate": 8.268531729854601e-06, + "loss": 3.1908, + "step": 20395 + }, + { + "epoch": 1.3860578883000407, + "grad_norm": 6.278916358947754, + "learning_rate": 8.268107079766274e-06, + "loss": 3.121, + "step": 20400 + }, + { + "epoch": 1.3863976083707026, + "grad_norm": 8.537321090698242, + "learning_rate": 8.267682429677945e-06, + "loss": 3.0989, + "step": 20405 + }, + { + "epoch": 1.3867373284413644, + "grad_norm": 6.068114757537842, + "learning_rate": 8.26725777958962e-06, + "loss": 3.0714, + "step": 20410 + }, + { + "epoch": 1.387077048512026, + "grad_norm": 7.816760063171387, + "learning_rate": 8.266833129501292e-06, + "loss": 3.2556, + "step": 20415 + }, + { + "epoch": 1.387416768582688, + "grad_norm": 5.828388690948486, + "learning_rate": 8.266408479412963e-06, + "loss": 3.2779, + "step": 20420 + }, + { + "epoch": 1.3877564886533498, + "grad_norm": 5.998442649841309, + "learning_rate": 8.265983829324638e-06, + "loss": 3.1471, + "step": 20425 + }, + { + "epoch": 1.3880962087240114, + "grad_norm": 5.983120918273926, + "learning_rate": 8.26555917923631e-06, + "loss": 3.1334, + "step": 20430 + }, + { + "epoch": 1.3884359287946733, + "grad_norm": 6.81999397277832, + "learning_rate": 8.265134529147982e-06, + "loss": 3.2308, + "step": 20435 + }, + { + "epoch": 1.3887756488653349, + "grad_norm": 7.32501745223999, + "learning_rate": 8.264709879059656e-06, + "loss": 3.1744, + "step": 20440 + }, + { + "epoch": 1.3891153689359967, + "grad_norm": 6.617311954498291, + "learning_rate": 8.264285228971327e-06, + "loss": 3.3675, + "step": 20445 + }, + { + "epoch": 1.3894550890066586, + "grad_norm": 9.035505294799805, + "learning_rate": 8.263860578883e-06, + "loss": 3.2516, + "step": 20450 + }, + { + "epoch": 1.3897948090773202, + "grad_norm": 6.405926704406738, + "learning_rate": 8.263435928794675e-06, + "loss": 3.0148, + "step": 20455 + }, + { + "epoch": 1.390134529147982, + "grad_norm": 6.374638080596924, + "learning_rate": 8.263011278706346e-06, + "loss": 3.4024, + "step": 20460 + }, + { + "epoch": 1.3904742492186437, + "grad_norm": 5.420829772949219, + "learning_rate": 8.262586628618019e-06, + "loss": 3.1714, + "step": 20465 + }, + { + "epoch": 1.3908139692893056, + "grad_norm": 7.281968593597412, + "learning_rate": 8.262161978529693e-06, + "loss": 3.0297, + "step": 20470 + }, + { + "epoch": 1.3911536893599674, + "grad_norm": 6.992068767547607, + "learning_rate": 8.261737328441364e-06, + "loss": 3.1444, + "step": 20475 + }, + { + "epoch": 1.391493409430629, + "grad_norm": 7.359330654144287, + "learning_rate": 8.261312678353037e-06, + "loss": 3.3399, + "step": 20480 + }, + { + "epoch": 1.391833129501291, + "grad_norm": 7.134758472442627, + "learning_rate": 8.260888028264712e-06, + "loss": 2.9868, + "step": 20485 + }, + { + "epoch": 1.3921728495719528, + "grad_norm": 6.778067588806152, + "learning_rate": 8.260463378176383e-06, + "loss": 3.1706, + "step": 20490 + }, + { + "epoch": 1.3925125696426144, + "grad_norm": 7.757146835327148, + "learning_rate": 8.260038728088055e-06, + "loss": 3.0289, + "step": 20495 + }, + { + "epoch": 1.3928522897132762, + "grad_norm": 9.93943977355957, + "learning_rate": 8.25961407799973e-06, + "loss": 3.2112, + "step": 20500 + }, + { + "epoch": 1.393192009783938, + "grad_norm": 7.291049480438232, + "learning_rate": 8.259189427911401e-06, + "loss": 3.2131, + "step": 20505 + }, + { + "epoch": 1.3935317298545997, + "grad_norm": 7.419704914093018, + "learning_rate": 8.258764777823074e-06, + "loss": 3.0491, + "step": 20510 + }, + { + "epoch": 1.3938714499252616, + "grad_norm": 6.185955047607422, + "learning_rate": 8.258340127734748e-06, + "loss": 3.0594, + "step": 20515 + }, + { + "epoch": 1.3942111699959234, + "grad_norm": 8.30728530883789, + "learning_rate": 8.25791547764642e-06, + "loss": 3.3648, + "step": 20520 + }, + { + "epoch": 1.394550890066585, + "grad_norm": 7.515995025634766, + "learning_rate": 8.257490827558092e-06, + "loss": 3.2526, + "step": 20525 + }, + { + "epoch": 1.394890610137247, + "grad_norm": 7.990439414978027, + "learning_rate": 8.257066177469765e-06, + "loss": 3.0313, + "step": 20530 + }, + { + "epoch": 1.3952303302079088, + "grad_norm": 6.90913200378418, + "learning_rate": 8.256641527381438e-06, + "loss": 3.2563, + "step": 20535 + }, + { + "epoch": 1.3955700502785704, + "grad_norm": 5.493824481964111, + "learning_rate": 8.25621687729311e-06, + "loss": 3.2585, + "step": 20540 + }, + { + "epoch": 1.3959097703492322, + "grad_norm": 6.8605756759643555, + "learning_rate": 8.255792227204783e-06, + "loss": 3.1394, + "step": 20545 + }, + { + "epoch": 1.396249490419894, + "grad_norm": 7.277971267700195, + "learning_rate": 8.255367577116456e-06, + "loss": 2.9405, + "step": 20550 + }, + { + "epoch": 1.3965892104905557, + "grad_norm": 5.240039825439453, + "learning_rate": 8.25494292702813e-06, + "loss": 2.9951, + "step": 20555 + }, + { + "epoch": 1.3969289305612176, + "grad_norm": 5.781737804412842, + "learning_rate": 8.254518276939802e-06, + "loss": 2.89, + "step": 20560 + }, + { + "epoch": 1.3972686506318794, + "grad_norm": 6.127159118652344, + "learning_rate": 8.254093626851475e-06, + "loss": 3.172, + "step": 20565 + }, + { + "epoch": 1.397608370702541, + "grad_norm": 6.498159408569336, + "learning_rate": 8.253668976763149e-06, + "loss": 3.0632, + "step": 20570 + }, + { + "epoch": 1.397948090773203, + "grad_norm": 6.408636093139648, + "learning_rate": 8.25324432667482e-06, + "loss": 2.8637, + "step": 20575 + }, + { + "epoch": 1.3982878108438648, + "grad_norm": 7.784261703491211, + "learning_rate": 8.252819676586493e-06, + "loss": 3.2219, + "step": 20580 + }, + { + "epoch": 1.3986275309145264, + "grad_norm": 7.918929100036621, + "learning_rate": 8.252395026498168e-06, + "loss": 3.0816, + "step": 20585 + }, + { + "epoch": 1.3989672509851883, + "grad_norm": 6.908321380615234, + "learning_rate": 8.251970376409839e-06, + "loss": 3.2648, + "step": 20590 + }, + { + "epoch": 1.3993069710558501, + "grad_norm": 7.361175060272217, + "learning_rate": 8.251545726321511e-06, + "loss": 3.0516, + "step": 20595 + }, + { + "epoch": 1.3996466911265117, + "grad_norm": 8.527466773986816, + "learning_rate": 8.251121076233184e-06, + "loss": 3.1679, + "step": 20600 + }, + { + "epoch": 1.3999864111971736, + "grad_norm": 6.913378715515137, + "learning_rate": 8.250696426144857e-06, + "loss": 3.2356, + "step": 20605 + }, + { + "epoch": 1.4003261312678352, + "grad_norm": 6.855132579803467, + "learning_rate": 8.25027177605653e-06, + "loss": 3.3026, + "step": 20610 + }, + { + "epoch": 1.400665851338497, + "grad_norm": 5.346390724182129, + "learning_rate": 8.249847125968203e-06, + "loss": 3.3396, + "step": 20615 + }, + { + "epoch": 1.401005571409159, + "grad_norm": 6.196659088134766, + "learning_rate": 8.249422475879875e-06, + "loss": 3.1363, + "step": 20620 + }, + { + "epoch": 1.4013452914798206, + "grad_norm": 7.508933067321777, + "learning_rate": 8.248997825791548e-06, + "loss": 3.0087, + "step": 20625 + }, + { + "epoch": 1.4016850115504824, + "grad_norm": 4.6154069900512695, + "learning_rate": 8.248573175703221e-06, + "loss": 3.1816, + "step": 20630 + }, + { + "epoch": 1.402024731621144, + "grad_norm": 6.937403202056885, + "learning_rate": 8.248148525614894e-06, + "loss": 2.9829, + "step": 20635 + }, + { + "epoch": 1.402364451691806, + "grad_norm": 6.158953666687012, + "learning_rate": 8.247723875526567e-06, + "loss": 3.5507, + "step": 20640 + }, + { + "epoch": 1.4027041717624678, + "grad_norm": 8.175652503967285, + "learning_rate": 8.24729922543824e-06, + "loss": 3.5045, + "step": 20645 + }, + { + "epoch": 1.4030438918331294, + "grad_norm": 8.258753776550293, + "learning_rate": 8.246874575349912e-06, + "loss": 3.0891, + "step": 20650 + }, + { + "epoch": 1.4033836119037912, + "grad_norm": 6.101559162139893, + "learning_rate": 8.246449925261585e-06, + "loss": 2.9553, + "step": 20655 + }, + { + "epoch": 1.403723331974453, + "grad_norm": 6.527842044830322, + "learning_rate": 8.246025275173258e-06, + "loss": 3.2158, + "step": 20660 + }, + { + "epoch": 1.4040630520451147, + "grad_norm": 6.259500026702881, + "learning_rate": 8.24560062508493e-06, + "loss": 3.3502, + "step": 20665 + }, + { + "epoch": 1.4044027721157766, + "grad_norm": 6.966640949249268, + "learning_rate": 8.245175974996603e-06, + "loss": 3.1231, + "step": 20670 + }, + { + "epoch": 1.4047424921864384, + "grad_norm": 6.75847053527832, + "learning_rate": 8.244751324908276e-06, + "loss": 3.0009, + "step": 20675 + }, + { + "epoch": 1.4050822122571, + "grad_norm": 6.109805583953857, + "learning_rate": 8.244326674819949e-06, + "loss": 3.2919, + "step": 20680 + }, + { + "epoch": 1.405421932327762, + "grad_norm": 5.371834754943848, + "learning_rate": 8.243902024731622e-06, + "loss": 3.1082, + "step": 20685 + }, + { + "epoch": 1.4057616523984238, + "grad_norm": 5.594232082366943, + "learning_rate": 8.243477374643295e-06, + "loss": 3.0117, + "step": 20690 + }, + { + "epoch": 1.4061013724690854, + "grad_norm": 7.019782543182373, + "learning_rate": 8.243052724554967e-06, + "loss": 3.1757, + "step": 20695 + }, + { + "epoch": 1.4064410925397473, + "grad_norm": 7.222899913787842, + "learning_rate": 8.24262807446664e-06, + "loss": 3.3428, + "step": 20700 + }, + { + "epoch": 1.406780812610409, + "grad_norm": 6.738530158996582, + "learning_rate": 8.242203424378313e-06, + "loss": 3.1737, + "step": 20705 + }, + { + "epoch": 1.4071205326810707, + "grad_norm": 6.940562725067139, + "learning_rate": 8.241778774289986e-06, + "loss": 3.3808, + "step": 20710 + }, + { + "epoch": 1.4074602527517326, + "grad_norm": 6.3318023681640625, + "learning_rate": 8.241354124201659e-06, + "loss": 3.0912, + "step": 20715 + }, + { + "epoch": 1.4077999728223944, + "grad_norm": 5.157370090484619, + "learning_rate": 8.240929474113331e-06, + "loss": 2.8742, + "step": 20720 + }, + { + "epoch": 1.408139692893056, + "grad_norm": 6.372654914855957, + "learning_rate": 8.240504824025004e-06, + "loss": 3.0483, + "step": 20725 + }, + { + "epoch": 1.408479412963718, + "grad_norm": 6.704179763793945, + "learning_rate": 8.240080173936677e-06, + "loss": 3.0288, + "step": 20730 + }, + { + "epoch": 1.4088191330343798, + "grad_norm": 6.504441738128662, + "learning_rate": 8.23965552384835e-06, + "loss": 3.2623, + "step": 20735 + }, + { + "epoch": 1.4091588531050414, + "grad_norm": 8.10242748260498, + "learning_rate": 8.239230873760023e-06, + "loss": 3.0379, + "step": 20740 + }, + { + "epoch": 1.4094985731757033, + "grad_norm": 6.756860256195068, + "learning_rate": 8.238806223671695e-06, + "loss": 3.0178, + "step": 20745 + }, + { + "epoch": 1.4098382932463651, + "grad_norm": 8.377742767333984, + "learning_rate": 8.238381573583368e-06, + "loss": 3.1906, + "step": 20750 + }, + { + "epoch": 1.4101780133170267, + "grad_norm": 7.1861348152160645, + "learning_rate": 8.237956923495041e-06, + "loss": 2.9727, + "step": 20755 + }, + { + "epoch": 1.4105177333876886, + "grad_norm": 5.827033042907715, + "learning_rate": 8.237532273406714e-06, + "loss": 3.0506, + "step": 20760 + }, + { + "epoch": 1.4108574534583505, + "grad_norm": 7.064783096313477, + "learning_rate": 8.237107623318387e-06, + "loss": 3.275, + "step": 20765 + }, + { + "epoch": 1.411197173529012, + "grad_norm": 6.609522819519043, + "learning_rate": 8.23668297323006e-06, + "loss": 3.2051, + "step": 20770 + }, + { + "epoch": 1.411536893599674, + "grad_norm": 8.418006896972656, + "learning_rate": 8.236258323141732e-06, + "loss": 3.0262, + "step": 20775 + }, + { + "epoch": 1.4118766136703356, + "grad_norm": 6.009592533111572, + "learning_rate": 8.235833673053405e-06, + "loss": 3.3246, + "step": 20780 + }, + { + "epoch": 1.4122163337409974, + "grad_norm": 7.7142333984375, + "learning_rate": 8.235409022965078e-06, + "loss": 3.2263, + "step": 20785 + }, + { + "epoch": 1.4125560538116593, + "grad_norm": 8.865796089172363, + "learning_rate": 8.234984372876749e-06, + "loss": 3.1012, + "step": 20790 + }, + { + "epoch": 1.412895773882321, + "grad_norm": 5.972334861755371, + "learning_rate": 8.234559722788423e-06, + "loss": 3.2392, + "step": 20795 + }, + { + "epoch": 1.4132354939529828, + "grad_norm": 7.1775054931640625, + "learning_rate": 8.234135072700096e-06, + "loss": 3.1509, + "step": 20800 + }, + { + "epoch": 1.4135752140236444, + "grad_norm": 6.123889923095703, + "learning_rate": 8.233710422611767e-06, + "loss": 3.1837, + "step": 20805 + }, + { + "epoch": 1.4139149340943062, + "grad_norm": 6.056642532348633, + "learning_rate": 8.233285772523442e-06, + "loss": 3.0091, + "step": 20810 + }, + { + "epoch": 1.414254654164968, + "grad_norm": 7.1061859130859375, + "learning_rate": 8.232861122435115e-06, + "loss": 3.322, + "step": 20815 + }, + { + "epoch": 1.4145943742356297, + "grad_norm": 8.024557113647461, + "learning_rate": 8.232436472346786e-06, + "loss": 3.2825, + "step": 20820 + }, + { + "epoch": 1.4149340943062916, + "grad_norm": 6.314208984375, + "learning_rate": 8.23201182225846e-06, + "loss": 3.2389, + "step": 20825 + }, + { + "epoch": 1.4152738143769534, + "grad_norm": 7.541729927062988, + "learning_rate": 8.231587172170133e-06, + "loss": 3.2232, + "step": 20830 + }, + { + "epoch": 1.415613534447615, + "grad_norm": 6.486074924468994, + "learning_rate": 8.231162522081804e-06, + "loss": 3.1316, + "step": 20835 + }, + { + "epoch": 1.415953254518277, + "grad_norm": 5.649775505065918, + "learning_rate": 8.230737871993479e-06, + "loss": 3.2108, + "step": 20840 + }, + { + "epoch": 1.4162929745889388, + "grad_norm": 9.04045295715332, + "learning_rate": 8.230313221905151e-06, + "loss": 3.4285, + "step": 20845 + }, + { + "epoch": 1.4166326946596004, + "grad_norm": 6.820240497589111, + "learning_rate": 8.229888571816823e-06, + "loss": 2.9823, + "step": 20850 + }, + { + "epoch": 1.4169724147302623, + "grad_norm": 6.109437942504883, + "learning_rate": 8.229463921728497e-06, + "loss": 2.9921, + "step": 20855 + }, + { + "epoch": 1.417312134800924, + "grad_norm": 6.995706558227539, + "learning_rate": 8.229039271640168e-06, + "loss": 3.1183, + "step": 20860 + }, + { + "epoch": 1.4176518548715857, + "grad_norm": 6.764227867126465, + "learning_rate": 8.228614621551841e-06, + "loss": 3.2003, + "step": 20865 + }, + { + "epoch": 1.4179915749422476, + "grad_norm": 6.848831653594971, + "learning_rate": 8.228189971463515e-06, + "loss": 3.224, + "step": 20870 + }, + { + "epoch": 1.4183312950129094, + "grad_norm": 6.052220344543457, + "learning_rate": 8.227765321375187e-06, + "loss": 3.1938, + "step": 20875 + }, + { + "epoch": 1.418671015083571, + "grad_norm": 7.271588325500488, + "learning_rate": 8.22734067128686e-06, + "loss": 3.3267, + "step": 20880 + }, + { + "epoch": 1.419010735154233, + "grad_norm": 5.925724029541016, + "learning_rate": 8.226916021198534e-06, + "loss": 3.1474, + "step": 20885 + }, + { + "epoch": 1.4193504552248948, + "grad_norm": 6.918968200683594, + "learning_rate": 8.226491371110205e-06, + "loss": 3.2804, + "step": 20890 + }, + { + "epoch": 1.4196901752955564, + "grad_norm": 7.143500328063965, + "learning_rate": 8.22606672102188e-06, + "loss": 3.399, + "step": 20895 + }, + { + "epoch": 1.4200298953662183, + "grad_norm": 6.820494651794434, + "learning_rate": 8.225642070933552e-06, + "loss": 3.2288, + "step": 20900 + }, + { + "epoch": 1.4203696154368801, + "grad_norm": 7.970963478088379, + "learning_rate": 8.225217420845223e-06, + "loss": 2.9291, + "step": 20905 + }, + { + "epoch": 1.4207093355075417, + "grad_norm": 7.639498233795166, + "learning_rate": 8.224792770756898e-06, + "loss": 3.4283, + "step": 20910 + }, + { + "epoch": 1.4210490555782036, + "grad_norm": 6.350168228149414, + "learning_rate": 8.22436812066857e-06, + "loss": 3.1713, + "step": 20915 + }, + { + "epoch": 1.4213887756488655, + "grad_norm": 7.544386386871338, + "learning_rate": 8.223943470580242e-06, + "loss": 3.41, + "step": 20920 + }, + { + "epoch": 1.421728495719527, + "grad_norm": 7.768373012542725, + "learning_rate": 8.223518820491916e-06, + "loss": 3.2097, + "step": 20925 + }, + { + "epoch": 1.422068215790189, + "grad_norm": 5.924417018890381, + "learning_rate": 8.223094170403589e-06, + "loss": 3.1622, + "step": 20930 + }, + { + "epoch": 1.4224079358608508, + "grad_norm": 7.282247066497803, + "learning_rate": 8.22266952031526e-06, + "loss": 3.1811, + "step": 20935 + }, + { + "epoch": 1.4227476559315124, + "grad_norm": 6.029991149902344, + "learning_rate": 8.222244870226935e-06, + "loss": 2.9723, + "step": 20940 + }, + { + "epoch": 1.4230873760021743, + "grad_norm": 6.148096561431885, + "learning_rate": 8.221820220138606e-06, + "loss": 3.0728, + "step": 20945 + }, + { + "epoch": 1.423427096072836, + "grad_norm": 8.750913619995117, + "learning_rate": 8.221395570050279e-06, + "loss": 3.0585, + "step": 20950 + }, + { + "epoch": 1.4237668161434978, + "grad_norm": 5.993468761444092, + "learning_rate": 8.220970919961953e-06, + "loss": 3.4431, + "step": 20955 + }, + { + "epoch": 1.4241065362141596, + "grad_norm": 7.998112201690674, + "learning_rate": 8.220546269873624e-06, + "loss": 3.3095, + "step": 20960 + }, + { + "epoch": 1.4244462562848212, + "grad_norm": 5.826508045196533, + "learning_rate": 8.220121619785297e-06, + "loss": 3.0602, + "step": 20965 + }, + { + "epoch": 1.424785976355483, + "grad_norm": 7.7321062088012695, + "learning_rate": 8.219696969696971e-06, + "loss": 3.1975, + "step": 20970 + }, + { + "epoch": 1.4251256964261447, + "grad_norm": 5.814226150512695, + "learning_rate": 8.219272319608643e-06, + "loss": 3.0986, + "step": 20975 + }, + { + "epoch": 1.4254654164968066, + "grad_norm": 6.062782287597656, + "learning_rate": 8.218847669520315e-06, + "loss": 3.3451, + "step": 20980 + }, + { + "epoch": 1.4258051365674684, + "grad_norm": 7.863983631134033, + "learning_rate": 8.21842301943199e-06, + "loss": 3.147, + "step": 20985 + }, + { + "epoch": 1.42614485663813, + "grad_norm": 9.432933807373047, + "learning_rate": 8.217998369343661e-06, + "loss": 3.0362, + "step": 20990 + }, + { + "epoch": 1.426484576708792, + "grad_norm": 6.077063083648682, + "learning_rate": 8.217573719255334e-06, + "loss": 2.9158, + "step": 20995 + }, + { + "epoch": 1.4268242967794538, + "grad_norm": 5.821117401123047, + "learning_rate": 8.217149069167008e-06, + "loss": 3.0664, + "step": 21000 + }, + { + "epoch": 1.4271640168501154, + "grad_norm": 7.093871116638184, + "learning_rate": 8.21672441907868e-06, + "loss": 3.3664, + "step": 21005 + }, + { + "epoch": 1.4275037369207773, + "grad_norm": 7.149366855621338, + "learning_rate": 8.216299768990352e-06, + "loss": 3.2868, + "step": 21010 + }, + { + "epoch": 1.427843456991439, + "grad_norm": 6.96628999710083, + "learning_rate": 8.215875118902025e-06, + "loss": 3.3728, + "step": 21015 + }, + { + "epoch": 1.4281831770621007, + "grad_norm": 6.868318557739258, + "learning_rate": 8.215450468813698e-06, + "loss": 2.9685, + "step": 21020 + }, + { + "epoch": 1.4285228971327626, + "grad_norm": 6.757184982299805, + "learning_rate": 8.21502581872537e-06, + "loss": 3.1735, + "step": 21025 + }, + { + "epoch": 1.4288626172034244, + "grad_norm": 8.223414421081543, + "learning_rate": 8.214601168637043e-06, + "loss": 3.3009, + "step": 21030 + }, + { + "epoch": 1.429202337274086, + "grad_norm": 6.033567428588867, + "learning_rate": 8.214176518548716e-06, + "loss": 3.2175, + "step": 21035 + }, + { + "epoch": 1.429542057344748, + "grad_norm": 6.400125503540039, + "learning_rate": 8.213751868460389e-06, + "loss": 3.3344, + "step": 21040 + }, + { + "epoch": 1.4298817774154098, + "grad_norm": 5.267225742340088, + "learning_rate": 8.213327218372062e-06, + "loss": 3.1127, + "step": 21045 + }, + { + "epoch": 1.4302214974860714, + "grad_norm": 6.111664295196533, + "learning_rate": 8.212902568283735e-06, + "loss": 3.2933, + "step": 21050 + }, + { + "epoch": 1.4305612175567333, + "grad_norm": 6.9425740242004395, + "learning_rate": 8.212477918195407e-06, + "loss": 3.3291, + "step": 21055 + }, + { + "epoch": 1.4309009376273951, + "grad_norm": 6.453843116760254, + "learning_rate": 8.21205326810708e-06, + "loss": 3.1667, + "step": 21060 + }, + { + "epoch": 1.4312406576980568, + "grad_norm": 5.795014381408691, + "learning_rate": 8.211628618018753e-06, + "loss": 3.2685, + "step": 21065 + }, + { + "epoch": 1.4315803777687186, + "grad_norm": 6.906888961791992, + "learning_rate": 8.211203967930426e-06, + "loss": 3.0714, + "step": 21070 + }, + { + "epoch": 1.4319200978393805, + "grad_norm": 6.376998424530029, + "learning_rate": 8.210779317842099e-06, + "loss": 3.018, + "step": 21075 + }, + { + "epoch": 1.432259817910042, + "grad_norm": 7.046078205108643, + "learning_rate": 8.210354667753771e-06, + "loss": 3.0549, + "step": 21080 + }, + { + "epoch": 1.432599537980704, + "grad_norm": 7.507627010345459, + "learning_rate": 8.209930017665444e-06, + "loss": 3.3485, + "step": 21085 + }, + { + "epoch": 1.4329392580513658, + "grad_norm": 6.989729404449463, + "learning_rate": 8.209505367577117e-06, + "loss": 3.1248, + "step": 21090 + }, + { + "epoch": 1.4332789781220274, + "grad_norm": 6.919845104217529, + "learning_rate": 8.20908071748879e-06, + "loss": 3.107, + "step": 21095 + }, + { + "epoch": 1.4336186981926893, + "grad_norm": 6.923623561859131, + "learning_rate": 8.208656067400463e-06, + "loss": 2.7782, + "step": 21100 + }, + { + "epoch": 1.4339584182633511, + "grad_norm": 7.207547187805176, + "learning_rate": 8.208231417312135e-06, + "loss": 3.1022, + "step": 21105 + }, + { + "epoch": 1.4342981383340128, + "grad_norm": 5.997544288635254, + "learning_rate": 8.207806767223808e-06, + "loss": 3.1568, + "step": 21110 + }, + { + "epoch": 1.4346378584046746, + "grad_norm": 5.709188938140869, + "learning_rate": 8.207382117135481e-06, + "loss": 3.0965, + "step": 21115 + }, + { + "epoch": 1.4349775784753362, + "grad_norm": 6.865525722503662, + "learning_rate": 8.206957467047154e-06, + "loss": 3.2366, + "step": 21120 + }, + { + "epoch": 1.435317298545998, + "grad_norm": 5.406304836273193, + "learning_rate": 8.206532816958827e-06, + "loss": 3.1557, + "step": 21125 + }, + { + "epoch": 1.43565701861666, + "grad_norm": 7.671781539916992, + "learning_rate": 8.2061081668705e-06, + "loss": 3.2646, + "step": 21130 + }, + { + "epoch": 1.4359967386873216, + "grad_norm": 5.629626750946045, + "learning_rate": 8.205683516782172e-06, + "loss": 3.2285, + "step": 21135 + }, + { + "epoch": 1.4363364587579834, + "grad_norm": 6.757355690002441, + "learning_rate": 8.205258866693845e-06, + "loss": 3.1138, + "step": 21140 + }, + { + "epoch": 1.436676178828645, + "grad_norm": 7.515583515167236, + "learning_rate": 8.204834216605518e-06, + "loss": 3.302, + "step": 21145 + }, + { + "epoch": 1.437015898899307, + "grad_norm": 7.368851184844971, + "learning_rate": 8.20440956651719e-06, + "loss": 3.176, + "step": 21150 + }, + { + "epoch": 1.4373556189699688, + "grad_norm": 6.459756374359131, + "learning_rate": 8.203984916428863e-06, + "loss": 3.0601, + "step": 21155 + }, + { + "epoch": 1.4376953390406304, + "grad_norm": 7.170598983764648, + "learning_rate": 8.203560266340536e-06, + "loss": 2.877, + "step": 21160 + }, + { + "epoch": 1.4380350591112923, + "grad_norm": 7.8587727546691895, + "learning_rate": 8.203135616252209e-06, + "loss": 3.1994, + "step": 21165 + }, + { + "epoch": 1.438374779181954, + "grad_norm": 6.155292510986328, + "learning_rate": 8.202710966163882e-06, + "loss": 3.526, + "step": 21170 + }, + { + "epoch": 1.4387144992526157, + "grad_norm": 8.848860740661621, + "learning_rate": 8.202286316075555e-06, + "loss": 3.1796, + "step": 21175 + }, + { + "epoch": 1.4390542193232776, + "grad_norm": 5.641871452331543, + "learning_rate": 8.201861665987227e-06, + "loss": 3.0577, + "step": 21180 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 6.050562381744385, + "learning_rate": 8.2014370158989e-06, + "loss": 3.3142, + "step": 21185 + }, + { + "epoch": 1.439733659464601, + "grad_norm": 7.338657855987549, + "learning_rate": 8.201012365810573e-06, + "loss": 3.1854, + "step": 21190 + }, + { + "epoch": 1.440073379535263, + "grad_norm": 6.231538772583008, + "learning_rate": 8.200587715722246e-06, + "loss": 3.3057, + "step": 21195 + }, + { + "epoch": 1.4404130996059248, + "grad_norm": 6.494061470031738, + "learning_rate": 8.200163065633919e-06, + "loss": 3.141, + "step": 21200 + }, + { + "epoch": 1.4407528196765864, + "grad_norm": 7.17828369140625, + "learning_rate": 8.19973841554559e-06, + "loss": 3.5898, + "step": 21205 + }, + { + "epoch": 1.4410925397472483, + "grad_norm": 6.297820091247559, + "learning_rate": 8.199313765457264e-06, + "loss": 3.1942, + "step": 21210 + }, + { + "epoch": 1.4414322598179101, + "grad_norm": 5.736927032470703, + "learning_rate": 8.198889115368937e-06, + "loss": 3.3422, + "step": 21215 + }, + { + "epoch": 1.4417719798885718, + "grad_norm": 7.111210823059082, + "learning_rate": 8.198464465280608e-06, + "loss": 3.194, + "step": 21220 + }, + { + "epoch": 1.4421116999592336, + "grad_norm": 6.909523963928223, + "learning_rate": 8.198039815192283e-06, + "loss": 3.3683, + "step": 21225 + }, + { + "epoch": 1.4424514200298955, + "grad_norm": 6.0498738288879395, + "learning_rate": 8.197615165103955e-06, + "loss": 3.1777, + "step": 21230 + }, + { + "epoch": 1.442791140100557, + "grad_norm": 9.767814636230469, + "learning_rate": 8.197190515015628e-06, + "loss": 3.2041, + "step": 21235 + }, + { + "epoch": 1.443130860171219, + "grad_norm": 6.289534568786621, + "learning_rate": 8.196765864927301e-06, + "loss": 3.0875, + "step": 21240 + }, + { + "epoch": 1.4434705802418808, + "grad_norm": 7.323869705200195, + "learning_rate": 8.196426144856638e-06, + "loss": 3.2866, + "step": 21245 + }, + { + "epoch": 1.4438103003125424, + "grad_norm": 7.329625129699707, + "learning_rate": 8.196001494768313e-06, + "loss": 3.0178, + "step": 21250 + }, + { + "epoch": 1.4441500203832043, + "grad_norm": 7.76638126373291, + "learning_rate": 8.195576844679984e-06, + "loss": 3.298, + "step": 21255 + }, + { + "epoch": 1.4444897404538661, + "grad_norm": 6.996373653411865, + "learning_rate": 8.195152194591657e-06, + "loss": 3.07, + "step": 21260 + }, + { + "epoch": 1.4448294605245278, + "grad_norm": 6.534031391143799, + "learning_rate": 8.19472754450333e-06, + "loss": 2.8835, + "step": 21265 + }, + { + "epoch": 1.4451691805951896, + "grad_norm": 5.721751689910889, + "learning_rate": 8.194302894415002e-06, + "loss": 3.1406, + "step": 21270 + }, + { + "epoch": 1.4455089006658515, + "grad_norm": 8.832011222839355, + "learning_rate": 8.193878244326675e-06, + "loss": 3.0466, + "step": 21275 + }, + { + "epoch": 1.445848620736513, + "grad_norm": 5.337122440338135, + "learning_rate": 8.193453594238348e-06, + "loss": 3.0104, + "step": 21280 + }, + { + "epoch": 1.446188340807175, + "grad_norm": 7.290724754333496, + "learning_rate": 8.19302894415002e-06, + "loss": 3.3409, + "step": 21285 + }, + { + "epoch": 1.4465280608778366, + "grad_norm": 5.600089073181152, + "learning_rate": 8.192604294061693e-06, + "loss": 3.4503, + "step": 21290 + }, + { + "epoch": 1.4468677809484984, + "grad_norm": 7.556376934051514, + "learning_rate": 8.192179643973366e-06, + "loss": 3.1766, + "step": 21295 + }, + { + "epoch": 1.4472075010191603, + "grad_norm": 7.227556228637695, + "learning_rate": 8.191754993885039e-06, + "loss": 3.1881, + "step": 21300 + }, + { + "epoch": 1.447547221089822, + "grad_norm": 7.187263488769531, + "learning_rate": 8.191330343796712e-06, + "loss": 3.1477, + "step": 21305 + }, + { + "epoch": 1.4478869411604838, + "grad_norm": 7.4763336181640625, + "learning_rate": 8.190905693708385e-06, + "loss": 3.1838, + "step": 21310 + }, + { + "epoch": 1.4482266612311454, + "grad_norm": 8.041691780090332, + "learning_rate": 8.190481043620057e-06, + "loss": 3.1333, + "step": 21315 + }, + { + "epoch": 1.4485663813018073, + "grad_norm": 7.0004143714904785, + "learning_rate": 8.19005639353173e-06, + "loss": 3.0906, + "step": 21320 + }, + { + "epoch": 1.4489061013724691, + "grad_norm": 7.479894161224365, + "learning_rate": 8.189631743443403e-06, + "loss": 3.299, + "step": 21325 + }, + { + "epoch": 1.4492458214431307, + "grad_norm": 6.419628143310547, + "learning_rate": 8.189207093355076e-06, + "loss": 3.1924, + "step": 21330 + }, + { + "epoch": 1.4495855415137926, + "grad_norm": 6.594597339630127, + "learning_rate": 8.188782443266749e-06, + "loss": 3.3841, + "step": 21335 + }, + { + "epoch": 1.4499252615844545, + "grad_norm": 6.255782127380371, + "learning_rate": 8.188357793178421e-06, + "loss": 3.1639, + "step": 21340 + }, + { + "epoch": 1.450264981655116, + "grad_norm": 6.319358825683594, + "learning_rate": 8.187933143090094e-06, + "loss": 3.327, + "step": 21345 + }, + { + "epoch": 1.450604701725778, + "grad_norm": 10.361949920654297, + "learning_rate": 8.187508493001767e-06, + "loss": 3.156, + "step": 21350 + }, + { + "epoch": 1.4509444217964398, + "grad_norm": 7.392336845397949, + "learning_rate": 8.18708384291344e-06, + "loss": 3.3018, + "step": 21355 + }, + { + "epoch": 1.4512841418671014, + "grad_norm": 7.128921985626221, + "learning_rate": 8.186659192825113e-06, + "loss": 3.1638, + "step": 21360 + }, + { + "epoch": 1.4516238619377633, + "grad_norm": 6.284449577331543, + "learning_rate": 8.186234542736785e-06, + "loss": 2.9944, + "step": 21365 + }, + { + "epoch": 1.4519635820084251, + "grad_norm": 5.64547872543335, + "learning_rate": 8.185809892648458e-06, + "loss": 3.4539, + "step": 21370 + }, + { + "epoch": 1.4523033020790868, + "grad_norm": 5.943681716918945, + "learning_rate": 8.185385242560131e-06, + "loss": 3.2044, + "step": 21375 + }, + { + "epoch": 1.4526430221497486, + "grad_norm": 7.673740863800049, + "learning_rate": 8.184960592471804e-06, + "loss": 2.9284, + "step": 21380 + }, + { + "epoch": 1.4529827422204105, + "grad_norm": 7.100234031677246, + "learning_rate": 8.184535942383477e-06, + "loss": 3.3553, + "step": 21385 + }, + { + "epoch": 1.453322462291072, + "grad_norm": 7.201615810394287, + "learning_rate": 8.18411129229515e-06, + "loss": 3.2202, + "step": 21390 + }, + { + "epoch": 1.453662182361734, + "grad_norm": 7.886471748352051, + "learning_rate": 8.183686642206822e-06, + "loss": 2.88, + "step": 21395 + }, + { + "epoch": 1.4540019024323958, + "grad_norm": 6.863929748535156, + "learning_rate": 8.183261992118495e-06, + "loss": 3.0749, + "step": 21400 + }, + { + "epoch": 1.4543416225030574, + "grad_norm": 4.985477924346924, + "learning_rate": 8.182837342030168e-06, + "loss": 3.2586, + "step": 21405 + }, + { + "epoch": 1.4546813425737193, + "grad_norm": 8.114705085754395, + "learning_rate": 8.18241269194184e-06, + "loss": 3.3044, + "step": 21410 + }, + { + "epoch": 1.4550210626443811, + "grad_norm": 7.861844062805176, + "learning_rate": 8.181988041853513e-06, + "loss": 3.265, + "step": 21415 + }, + { + "epoch": 1.4553607827150428, + "grad_norm": 8.53042221069336, + "learning_rate": 8.181563391765186e-06, + "loss": 3.0088, + "step": 21420 + }, + { + "epoch": 1.4557005027857046, + "grad_norm": 5.879409313201904, + "learning_rate": 8.181138741676859e-06, + "loss": 3.1141, + "step": 21425 + }, + { + "epoch": 1.4560402228563665, + "grad_norm": 7.971463680267334, + "learning_rate": 8.180714091588532e-06, + "loss": 3.1407, + "step": 21430 + }, + { + "epoch": 1.456379942927028, + "grad_norm": 5.920498371124268, + "learning_rate": 8.180289441500205e-06, + "loss": 3.0305, + "step": 21435 + }, + { + "epoch": 1.45671966299769, + "grad_norm": 8.129977226257324, + "learning_rate": 8.179864791411877e-06, + "loss": 3.294, + "step": 21440 + }, + { + "epoch": 1.4570593830683518, + "grad_norm": 7.096366882324219, + "learning_rate": 8.17944014132355e-06, + "loss": 3.1818, + "step": 21445 + }, + { + "epoch": 1.4573991031390134, + "grad_norm": 6.674953937530518, + "learning_rate": 8.179015491235223e-06, + "loss": 3.49, + "step": 21450 + }, + { + "epoch": 1.4577388232096753, + "grad_norm": 6.339662075042725, + "learning_rate": 8.178590841146896e-06, + "loss": 3.2208, + "step": 21455 + }, + { + "epoch": 1.458078543280337, + "grad_norm": 7.692011833190918, + "learning_rate": 8.178166191058569e-06, + "loss": 3.0242, + "step": 21460 + }, + { + "epoch": 1.4584182633509988, + "grad_norm": 7.070464611053467, + "learning_rate": 8.177741540970241e-06, + "loss": 3.321, + "step": 21465 + }, + { + "epoch": 1.4587579834216606, + "grad_norm": 5.503876209259033, + "learning_rate": 8.177316890881914e-06, + "loss": 3.0197, + "step": 21470 + }, + { + "epoch": 1.4590977034923223, + "grad_norm": 6.975700378417969, + "learning_rate": 8.176892240793587e-06, + "loss": 3.069, + "step": 21475 + }, + { + "epoch": 1.4594374235629841, + "grad_norm": 7.086871147155762, + "learning_rate": 8.17646759070526e-06, + "loss": 3.4131, + "step": 21480 + }, + { + "epoch": 1.4597771436336457, + "grad_norm": 7.42788553237915, + "learning_rate": 8.176042940616933e-06, + "loss": 3.2295, + "step": 21485 + }, + { + "epoch": 1.4601168637043076, + "grad_norm": 9.579109191894531, + "learning_rate": 8.175618290528606e-06, + "loss": 3.522, + "step": 21490 + }, + { + "epoch": 1.4604565837749695, + "grad_norm": 7.232555866241455, + "learning_rate": 8.175193640440278e-06, + "loss": 3.4003, + "step": 21495 + }, + { + "epoch": 1.460796303845631, + "grad_norm": 6.797307014465332, + "learning_rate": 8.174768990351951e-06, + "loss": 2.9254, + "step": 21500 + }, + { + "epoch": 1.461136023916293, + "grad_norm": 6.815131187438965, + "learning_rate": 8.174344340263624e-06, + "loss": 3.1659, + "step": 21505 + }, + { + "epoch": 1.4614757439869548, + "grad_norm": 5.66281795501709, + "learning_rate": 8.173919690175297e-06, + "loss": 3.3865, + "step": 21510 + }, + { + "epoch": 1.4618154640576164, + "grad_norm": 7.944351673126221, + "learning_rate": 8.17349504008697e-06, + "loss": 3.3267, + "step": 21515 + }, + { + "epoch": 1.4621551841282783, + "grad_norm": 6.657196044921875, + "learning_rate": 8.173070389998642e-06, + "loss": 2.8387, + "step": 21520 + }, + { + "epoch": 1.4624949041989401, + "grad_norm": 6.783127307891846, + "learning_rate": 8.172645739910315e-06, + "loss": 3.3967, + "step": 21525 + }, + { + "epoch": 1.4628346242696018, + "grad_norm": 7.213234901428223, + "learning_rate": 8.172221089821988e-06, + "loss": 3.1153, + "step": 21530 + }, + { + "epoch": 1.4631743443402636, + "grad_norm": 8.063642501831055, + "learning_rate": 8.17179643973366e-06, + "loss": 3.2916, + "step": 21535 + }, + { + "epoch": 1.4635140644109255, + "grad_norm": 8.666719436645508, + "learning_rate": 8.171371789645332e-06, + "loss": 3.2252, + "step": 21540 + }, + { + "epoch": 1.463853784481587, + "grad_norm": 6.816811561584473, + "learning_rate": 8.170947139557006e-06, + "loss": 3.0394, + "step": 21545 + }, + { + "epoch": 1.464193504552249, + "grad_norm": 7.613834381103516, + "learning_rate": 8.170522489468679e-06, + "loss": 3.3039, + "step": 21550 + }, + { + "epoch": 1.4645332246229108, + "grad_norm": 8.600113868713379, + "learning_rate": 8.17009783938035e-06, + "loss": 3.0942, + "step": 21555 + }, + { + "epoch": 1.4648729446935724, + "grad_norm": 5.906998634338379, + "learning_rate": 8.169673189292025e-06, + "loss": 3.1343, + "step": 21560 + }, + { + "epoch": 1.4652126647642343, + "grad_norm": 5.5703840255737305, + "learning_rate": 8.169248539203698e-06, + "loss": 3.3637, + "step": 21565 + }, + { + "epoch": 1.4655523848348961, + "grad_norm": 7.111658096313477, + "learning_rate": 8.168823889115369e-06, + "loss": 3.3571, + "step": 21570 + }, + { + "epoch": 1.4658921049055578, + "grad_norm": 6.61870002746582, + "learning_rate": 8.168399239027043e-06, + "loss": 3.3293, + "step": 21575 + }, + { + "epoch": 1.4662318249762196, + "grad_norm": 7.8056159019470215, + "learning_rate": 8.167974588938716e-06, + "loss": 3.5388, + "step": 21580 + }, + { + "epoch": 1.4665715450468815, + "grad_norm": 6.554567337036133, + "learning_rate": 8.167549938850387e-06, + "loss": 3.0659, + "step": 21585 + }, + { + "epoch": 1.466911265117543, + "grad_norm": 5.480721473693848, + "learning_rate": 8.167125288762062e-06, + "loss": 3.0992, + "step": 21590 + }, + { + "epoch": 1.467250985188205, + "grad_norm": 7.6247124671936035, + "learning_rate": 8.166700638673734e-06, + "loss": 3.2207, + "step": 21595 + }, + { + "epoch": 1.4675907052588668, + "grad_norm": 7.388587951660156, + "learning_rate": 8.166275988585405e-06, + "loss": 3.247, + "step": 21600 + }, + { + "epoch": 1.4679304253295284, + "grad_norm": 6.047605514526367, + "learning_rate": 8.16585133849708e-06, + "loss": 3.2377, + "step": 21605 + }, + { + "epoch": 1.4682701454001903, + "grad_norm": 6.943005084991455, + "learning_rate": 8.165426688408751e-06, + "loss": 3.2435, + "step": 21610 + }, + { + "epoch": 1.4686098654708521, + "grad_norm": 6.2294769287109375, + "learning_rate": 8.165002038320424e-06, + "loss": 3.0401, + "step": 21615 + }, + { + "epoch": 1.4689495855415138, + "grad_norm": 6.997790813446045, + "learning_rate": 8.164577388232098e-06, + "loss": 3.1196, + "step": 21620 + }, + { + "epoch": 1.4692893056121756, + "grad_norm": 6.743719100952148, + "learning_rate": 8.16415273814377e-06, + "loss": 2.9023, + "step": 21625 + }, + { + "epoch": 1.4696290256828373, + "grad_norm": 6.599986553192139, + "learning_rate": 8.163728088055442e-06, + "loss": 2.9247, + "step": 21630 + }, + { + "epoch": 1.4699687457534991, + "grad_norm": 7.977846622467041, + "learning_rate": 8.163303437967117e-06, + "loss": 3.3289, + "step": 21635 + }, + { + "epoch": 1.470308465824161, + "grad_norm": 8.502054214477539, + "learning_rate": 8.162878787878788e-06, + "loss": 2.9813, + "step": 21640 + }, + { + "epoch": 1.4706481858948226, + "grad_norm": 6.693021774291992, + "learning_rate": 8.16245413779046e-06, + "loss": 3.1289, + "step": 21645 + }, + { + "epoch": 1.4709879059654845, + "grad_norm": 7.597743034362793, + "learning_rate": 8.162029487702135e-06, + "loss": 3.1776, + "step": 21650 + }, + { + "epoch": 1.471327626036146, + "grad_norm": 6.97343111038208, + "learning_rate": 8.161604837613806e-06, + "loss": 3.1678, + "step": 21655 + }, + { + "epoch": 1.471667346106808, + "grad_norm": 5.598223686218262, + "learning_rate": 8.161180187525479e-06, + "loss": 3.0013, + "step": 21660 + }, + { + "epoch": 1.4720070661774698, + "grad_norm": 8.444082260131836, + "learning_rate": 8.160755537437154e-06, + "loss": 3.3003, + "step": 21665 + }, + { + "epoch": 1.4723467862481314, + "grad_norm": 6.577394962310791, + "learning_rate": 8.160330887348825e-06, + "loss": 3.3954, + "step": 21670 + }, + { + "epoch": 1.4726865063187933, + "grad_norm": 6.471332550048828, + "learning_rate": 8.159906237260497e-06, + "loss": 3.1301, + "step": 21675 + }, + { + "epoch": 1.4730262263894551, + "grad_norm": 5.776363849639893, + "learning_rate": 8.159481587172172e-06, + "loss": 3.1178, + "step": 21680 + }, + { + "epoch": 1.4733659464601168, + "grad_norm": 5.828070163726807, + "learning_rate": 8.159056937083843e-06, + "loss": 3.1431, + "step": 21685 + }, + { + "epoch": 1.4737056665307786, + "grad_norm": 6.256012916564941, + "learning_rate": 8.158632286995516e-06, + "loss": 3.2124, + "step": 21690 + }, + { + "epoch": 1.4740453866014405, + "grad_norm": 7.973407745361328, + "learning_rate": 8.158207636907189e-06, + "loss": 3.3843, + "step": 21695 + }, + { + "epoch": 1.474385106672102, + "grad_norm": 6.473258972167969, + "learning_rate": 8.157782986818861e-06, + "loss": 3.0637, + "step": 21700 + }, + { + "epoch": 1.474724826742764, + "grad_norm": 6.4484405517578125, + "learning_rate": 8.157358336730534e-06, + "loss": 3.1413, + "step": 21705 + }, + { + "epoch": 1.4750645468134258, + "grad_norm": 5.809981346130371, + "learning_rate": 8.156933686642207e-06, + "loss": 3.1606, + "step": 21710 + }, + { + "epoch": 1.4754042668840874, + "grad_norm": 5.581980228424072, + "learning_rate": 8.15650903655388e-06, + "loss": 3.2601, + "step": 21715 + }, + { + "epoch": 1.4757439869547493, + "grad_norm": 5.982985973358154, + "learning_rate": 8.156084386465553e-06, + "loss": 3.134, + "step": 21720 + }, + { + "epoch": 1.4760837070254111, + "grad_norm": 7.046659469604492, + "learning_rate": 8.155659736377225e-06, + "loss": 3.2207, + "step": 21725 + }, + { + "epoch": 1.4764234270960728, + "grad_norm": 5.434577941894531, + "learning_rate": 8.155235086288898e-06, + "loss": 3.1371, + "step": 21730 + }, + { + "epoch": 1.4767631471667346, + "grad_norm": 6.677894115447998, + "learning_rate": 8.154810436200571e-06, + "loss": 3.0206, + "step": 21735 + }, + { + "epoch": 1.4771028672373965, + "grad_norm": 6.770547389984131, + "learning_rate": 8.154385786112244e-06, + "loss": 3.106, + "step": 21740 + }, + { + "epoch": 1.477442587308058, + "grad_norm": 6.373946666717529, + "learning_rate": 8.153961136023917e-06, + "loss": 2.9825, + "step": 21745 + }, + { + "epoch": 1.47778230737872, + "grad_norm": 6.619965553283691, + "learning_rate": 8.15353648593559e-06, + "loss": 3.084, + "step": 21750 + }, + { + "epoch": 1.4781220274493818, + "grad_norm": 5.353378772735596, + "learning_rate": 8.153111835847262e-06, + "loss": 3.1569, + "step": 21755 + }, + { + "epoch": 1.4784617475200434, + "grad_norm": 5.766949653625488, + "learning_rate": 8.152687185758935e-06, + "loss": 3.137, + "step": 21760 + }, + { + "epoch": 1.4788014675907053, + "grad_norm": 8.287833213806152, + "learning_rate": 8.152262535670608e-06, + "loss": 3.4439, + "step": 21765 + }, + { + "epoch": 1.4791411876613672, + "grad_norm": 7.018538475036621, + "learning_rate": 8.15183788558228e-06, + "loss": 3.1527, + "step": 21770 + }, + { + "epoch": 1.4794809077320288, + "grad_norm": 7.220016956329346, + "learning_rate": 8.151413235493953e-06, + "loss": 3.2889, + "step": 21775 + }, + { + "epoch": 1.4798206278026906, + "grad_norm": 6.492581844329834, + "learning_rate": 8.150988585405626e-06, + "loss": 3.0471, + "step": 21780 + }, + { + "epoch": 1.4801603478733525, + "grad_norm": 5.748755931854248, + "learning_rate": 8.150563935317299e-06, + "loss": 3.4113, + "step": 21785 + }, + { + "epoch": 1.4805000679440141, + "grad_norm": 7.553306579589844, + "learning_rate": 8.150139285228972e-06, + "loss": 3.2719, + "step": 21790 + }, + { + "epoch": 1.480839788014676, + "grad_norm": 6.454686641693115, + "learning_rate": 8.149714635140645e-06, + "loss": 3.2593, + "step": 21795 + }, + { + "epoch": 1.4811795080853376, + "grad_norm": 8.55599308013916, + "learning_rate": 8.149289985052317e-06, + "loss": 3.277, + "step": 21800 + }, + { + "epoch": 1.4815192281559995, + "grad_norm": 5.0937933921813965, + "learning_rate": 8.14886533496399e-06, + "loss": 3.4585, + "step": 21805 + }, + { + "epoch": 1.4818589482266613, + "grad_norm": 4.979554176330566, + "learning_rate": 8.148440684875663e-06, + "loss": 3.2193, + "step": 21810 + }, + { + "epoch": 1.482198668297323, + "grad_norm": 8.331725120544434, + "learning_rate": 8.148016034787336e-06, + "loss": 3.3498, + "step": 21815 + }, + { + "epoch": 1.4825383883679848, + "grad_norm": 5.503475666046143, + "learning_rate": 8.147591384699009e-06, + "loss": 2.9525, + "step": 21820 + }, + { + "epoch": 1.4828781084386464, + "grad_norm": 5.9604926109313965, + "learning_rate": 8.147166734610681e-06, + "loss": 3.4883, + "step": 21825 + }, + { + "epoch": 1.4832178285093083, + "grad_norm": 6.757287502288818, + "learning_rate": 8.146742084522354e-06, + "loss": 3.1342, + "step": 21830 + }, + { + "epoch": 1.4835575485799701, + "grad_norm": 8.343624114990234, + "learning_rate": 8.146317434434027e-06, + "loss": 3.3727, + "step": 21835 + }, + { + "epoch": 1.4838972686506318, + "grad_norm": 5.931246757507324, + "learning_rate": 8.1458927843457e-06, + "loss": 3.134, + "step": 21840 + }, + { + "epoch": 1.4842369887212936, + "grad_norm": 6.752651214599609, + "learning_rate": 8.145468134257373e-06, + "loss": 3.1503, + "step": 21845 + }, + { + "epoch": 1.4845767087919555, + "grad_norm": 7.595345973968506, + "learning_rate": 8.145043484169045e-06, + "loss": 3.1663, + "step": 21850 + }, + { + "epoch": 1.484916428862617, + "grad_norm": 9.212642669677734, + "learning_rate": 8.144618834080718e-06, + "loss": 3.3693, + "step": 21855 + }, + { + "epoch": 1.485256148933279, + "grad_norm": 7.257197856903076, + "learning_rate": 8.144194183992391e-06, + "loss": 3.1461, + "step": 21860 + }, + { + "epoch": 1.4855958690039408, + "grad_norm": 6.520511150360107, + "learning_rate": 8.143769533904064e-06, + "loss": 3.3814, + "step": 21865 + }, + { + "epoch": 1.4859355890746024, + "grad_norm": 7.823672771453857, + "learning_rate": 8.143344883815737e-06, + "loss": 3.0345, + "step": 21870 + }, + { + "epoch": 1.4862753091452643, + "grad_norm": 6.1994524002075195, + "learning_rate": 8.14292023372741e-06, + "loss": 3.1211, + "step": 21875 + }, + { + "epoch": 1.4866150292159261, + "grad_norm": 7.534433841705322, + "learning_rate": 8.142495583639082e-06, + "loss": 3.2403, + "step": 21880 + }, + { + "epoch": 1.4869547492865878, + "grad_norm": 5.192112445831299, + "learning_rate": 8.142070933550755e-06, + "loss": 2.8697, + "step": 21885 + }, + { + "epoch": 1.4872944693572496, + "grad_norm": 8.034311294555664, + "learning_rate": 8.141646283462428e-06, + "loss": 3.0652, + "step": 21890 + }, + { + "epoch": 1.4876341894279115, + "grad_norm": 6.968142509460449, + "learning_rate": 8.1412216333741e-06, + "loss": 3.2204, + "step": 21895 + }, + { + "epoch": 1.487973909498573, + "grad_norm": 7.702812671661377, + "learning_rate": 8.140796983285773e-06, + "loss": 3.0653, + "step": 21900 + }, + { + "epoch": 1.488313629569235, + "grad_norm": 7.1233906745910645, + "learning_rate": 8.140372333197446e-06, + "loss": 3.3583, + "step": 21905 + }, + { + "epoch": 1.4886533496398968, + "grad_norm": 6.245172500610352, + "learning_rate": 8.139947683109119e-06, + "loss": 2.8736, + "step": 21910 + }, + { + "epoch": 1.4889930697105584, + "grad_norm": 7.930967330932617, + "learning_rate": 8.139523033020792e-06, + "loss": 3.0343, + "step": 21915 + }, + { + "epoch": 1.4893327897812203, + "grad_norm": 8.27612018585205, + "learning_rate": 8.139098382932465e-06, + "loss": 3.0628, + "step": 21920 + }, + { + "epoch": 1.4896725098518822, + "grad_norm": 8.0164155960083, + "learning_rate": 8.138673732844137e-06, + "loss": 3.0517, + "step": 21925 + }, + { + "epoch": 1.4900122299225438, + "grad_norm": 6.9432244300842285, + "learning_rate": 8.13824908275581e-06, + "loss": 3.2988, + "step": 21930 + }, + { + "epoch": 1.4903519499932056, + "grad_norm": 5.516629695892334, + "learning_rate": 8.137824432667483e-06, + "loss": 3.1134, + "step": 21935 + }, + { + "epoch": 1.4906916700638675, + "grad_norm": 7.090358734130859, + "learning_rate": 8.137399782579156e-06, + "loss": 3.2693, + "step": 21940 + }, + { + "epoch": 1.4910313901345291, + "grad_norm": 7.4875593185424805, + "learning_rate": 8.136975132490829e-06, + "loss": 3.2411, + "step": 21945 + }, + { + "epoch": 1.491371110205191, + "grad_norm": 8.021028518676758, + "learning_rate": 8.136550482402501e-06, + "loss": 3.3195, + "step": 21950 + }, + { + "epoch": 1.4917108302758528, + "grad_norm": 6.402764797210693, + "learning_rate": 8.136125832314173e-06, + "loss": 3.1343, + "step": 21955 + }, + { + "epoch": 1.4920505503465145, + "grad_norm": 6.367369174957275, + "learning_rate": 8.135701182225847e-06, + "loss": 3.0562, + "step": 21960 + }, + { + "epoch": 1.4923902704171763, + "grad_norm": 7.194770812988281, + "learning_rate": 8.13527653213752e-06, + "loss": 3.1036, + "step": 21965 + }, + { + "epoch": 1.492729990487838, + "grad_norm": 6.0172834396362305, + "learning_rate": 8.134851882049191e-06, + "loss": 2.953, + "step": 21970 + }, + { + "epoch": 1.4930697105584998, + "grad_norm": 6.858339309692383, + "learning_rate": 8.134427231960865e-06, + "loss": 3.2848, + "step": 21975 + }, + { + "epoch": 1.4934094306291616, + "grad_norm": 6.520463943481445, + "learning_rate": 8.134002581872538e-06, + "loss": 3.232, + "step": 21980 + }, + { + "epoch": 1.4937491506998233, + "grad_norm": 8.592901229858398, + "learning_rate": 8.13357793178421e-06, + "loss": 2.8977, + "step": 21985 + }, + { + "epoch": 1.4940888707704851, + "grad_norm": 5.466273784637451, + "learning_rate": 8.133153281695884e-06, + "loss": 3.0593, + "step": 21990 + }, + { + "epoch": 1.4944285908411468, + "grad_norm": 6.051250457763672, + "learning_rate": 8.132728631607557e-06, + "loss": 3.1946, + "step": 21995 + }, + { + "epoch": 1.4947683109118086, + "grad_norm": 6.011348247528076, + "learning_rate": 8.132303981519228e-06, + "loss": 3.2216, + "step": 22000 + }, + { + "epoch": 1.4951080309824705, + "grad_norm": 7.992321491241455, + "learning_rate": 8.131879331430902e-06, + "loss": 3.4371, + "step": 22005 + }, + { + "epoch": 1.495447751053132, + "grad_norm": 7.342377662658691, + "learning_rate": 8.131454681342575e-06, + "loss": 3.1039, + "step": 22010 + }, + { + "epoch": 1.495787471123794, + "grad_norm": 7.68485689163208, + "learning_rate": 8.131030031254246e-06, + "loss": 3.2973, + "step": 22015 + }, + { + "epoch": 1.4961271911944558, + "grad_norm": 6.609769344329834, + "learning_rate": 8.13060538116592e-06, + "loss": 3.2304, + "step": 22020 + }, + { + "epoch": 1.4964669112651174, + "grad_norm": 6.227417945861816, + "learning_rate": 8.130180731077593e-06, + "loss": 3.069, + "step": 22025 + }, + { + "epoch": 1.4968066313357793, + "grad_norm": 6.680327892303467, + "learning_rate": 8.129756080989265e-06, + "loss": 2.9633, + "step": 22030 + }, + { + "epoch": 1.4971463514064411, + "grad_norm": 7.132140636444092, + "learning_rate": 8.129331430900939e-06, + "loss": 3.3683, + "step": 22035 + }, + { + "epoch": 1.4974860714771028, + "grad_norm": 6.525623798370361, + "learning_rate": 8.12890678081261e-06, + "loss": 3.177, + "step": 22040 + }, + { + "epoch": 1.4978257915477646, + "grad_norm": 6.139394283294678, + "learning_rate": 8.128482130724283e-06, + "loss": 3.188, + "step": 22045 + }, + { + "epoch": 1.4981655116184265, + "grad_norm": 6.937084197998047, + "learning_rate": 8.128057480635957e-06, + "loss": 3.2018, + "step": 22050 + }, + { + "epoch": 1.4985052316890881, + "grad_norm": 5.81224250793457, + "learning_rate": 8.127632830547629e-06, + "loss": 3.0874, + "step": 22055 + }, + { + "epoch": 1.49884495175975, + "grad_norm": 6.331086158752441, + "learning_rate": 8.127208180459301e-06, + "loss": 3.2905, + "step": 22060 + }, + { + "epoch": 1.4991846718304118, + "grad_norm": 6.523308753967285, + "learning_rate": 8.126783530370976e-06, + "loss": 3.5083, + "step": 22065 + }, + { + "epoch": 1.4995243919010735, + "grad_norm": 6.436559677124023, + "learning_rate": 8.126358880282647e-06, + "loss": 3.1218, + "step": 22070 + }, + { + "epoch": 1.4998641119717353, + "grad_norm": 6.367175579071045, + "learning_rate": 8.12593423019432e-06, + "loss": 3.2387, + "step": 22075 + }, + { + "epoch": 1.5002038320423972, + "grad_norm": 6.687195301055908, + "learning_rate": 8.125509580105994e-06, + "loss": 3.2567, + "step": 22080 + }, + { + "epoch": 1.5005435521130588, + "grad_norm": 6.1665239334106445, + "learning_rate": 8.125084930017665e-06, + "loss": 3.0118, + "step": 22085 + }, + { + "epoch": 1.5008832721837206, + "grad_norm": 6.939234256744385, + "learning_rate": 8.124660279929338e-06, + "loss": 3.0135, + "step": 22090 + }, + { + "epoch": 1.5012229922543825, + "grad_norm": 8.80126667022705, + "learning_rate": 8.124235629841013e-06, + "loss": 3.1467, + "step": 22095 + }, + { + "epoch": 1.5015627123250441, + "grad_norm": 6.693106174468994, + "learning_rate": 8.123810979752684e-06, + "loss": 3.043, + "step": 22100 + }, + { + "epoch": 1.501902432395706, + "grad_norm": 7.319981098175049, + "learning_rate": 8.123386329664357e-06, + "loss": 3.1986, + "step": 22105 + }, + { + "epoch": 1.5022421524663678, + "grad_norm": 6.161500453948975, + "learning_rate": 8.12296167957603e-06, + "loss": 3.2625, + "step": 22110 + }, + { + "epoch": 1.5025818725370295, + "grad_norm": 6.908188343048096, + "learning_rate": 8.122537029487702e-06, + "loss": 3.1979, + "step": 22115 + }, + { + "epoch": 1.5029215926076913, + "grad_norm": 5.824929237365723, + "learning_rate": 8.122112379399375e-06, + "loss": 3.0542, + "step": 22120 + }, + { + "epoch": 1.5032613126783532, + "grad_norm": 6.380246639251709, + "learning_rate": 8.121687729311048e-06, + "loss": 3.1485, + "step": 22125 + }, + { + "epoch": 1.5036010327490148, + "grad_norm": 6.152600288391113, + "learning_rate": 8.12126307922272e-06, + "loss": 3.1268, + "step": 22130 + }, + { + "epoch": 1.5039407528196764, + "grad_norm": 8.797917366027832, + "learning_rate": 8.120838429134395e-06, + "loss": 3.1944, + "step": 22135 + }, + { + "epoch": 1.5042804728903385, + "grad_norm": 7.840415954589844, + "learning_rate": 8.120413779046066e-06, + "loss": 3.1774, + "step": 22140 + }, + { + "epoch": 1.5046201929610001, + "grad_norm": 6.758212089538574, + "learning_rate": 8.119989128957739e-06, + "loss": 3.2384, + "step": 22145 + }, + { + "epoch": 1.5049599130316618, + "grad_norm": 7.963546276092529, + "learning_rate": 8.119564478869414e-06, + "loss": 3.365, + "step": 22150 + }, + { + "epoch": 1.5052996331023238, + "grad_norm": 7.234251499176025, + "learning_rate": 8.119139828781085e-06, + "loss": 3.2055, + "step": 22155 + }, + { + "epoch": 1.5056393531729855, + "grad_norm": 7.589718818664551, + "learning_rate": 8.118715178692757e-06, + "loss": 3.3265, + "step": 22160 + }, + { + "epoch": 1.505979073243647, + "grad_norm": 7.054195880889893, + "learning_rate": 8.118290528604432e-06, + "loss": 3.3129, + "step": 22165 + }, + { + "epoch": 1.506318793314309, + "grad_norm": 7.317948818206787, + "learning_rate": 8.117865878516103e-06, + "loss": 3.0164, + "step": 22170 + }, + { + "epoch": 1.5066585133849708, + "grad_norm": 7.831077575683594, + "learning_rate": 8.117441228427776e-06, + "loss": 3.0803, + "step": 22175 + }, + { + "epoch": 1.5069982334556324, + "grad_norm": 7.181831359863281, + "learning_rate": 8.117016578339449e-06, + "loss": 3.19, + "step": 22180 + }, + { + "epoch": 1.5073379535262943, + "grad_norm": 6.789498329162598, + "learning_rate": 8.116591928251121e-06, + "loss": 3.4128, + "step": 22185 + }, + { + "epoch": 1.5076776735969561, + "grad_norm": 7.701596736907959, + "learning_rate": 8.116167278162794e-06, + "loss": 2.8799, + "step": 22190 + }, + { + "epoch": 1.5080173936676178, + "grad_norm": 6.596227645874023, + "learning_rate": 8.115742628074467e-06, + "loss": 3.1943, + "step": 22195 + }, + { + "epoch": 1.5083571137382796, + "grad_norm": 5.970792770385742, + "learning_rate": 8.11531797798614e-06, + "loss": 3.1746, + "step": 22200 + }, + { + "epoch": 1.5086968338089415, + "grad_norm": 8.401588439941406, + "learning_rate": 8.114893327897813e-06, + "loss": 3.3755, + "step": 22205 + }, + { + "epoch": 1.5090365538796031, + "grad_norm": 7.525246620178223, + "learning_rate": 8.114468677809485e-06, + "loss": 3.0885, + "step": 22210 + }, + { + "epoch": 1.509376273950265, + "grad_norm": 5.9187912940979, + "learning_rate": 8.114044027721158e-06, + "loss": 3.2527, + "step": 22215 + }, + { + "epoch": 1.5097159940209268, + "grad_norm": 6.698221683502197, + "learning_rate": 8.113619377632831e-06, + "loss": 3.0555, + "step": 22220 + }, + { + "epoch": 1.5100557140915885, + "grad_norm": 7.073246479034424, + "learning_rate": 8.113194727544504e-06, + "loss": 3.1327, + "step": 22225 + }, + { + "epoch": 1.5103954341622503, + "grad_norm": 7.302006721496582, + "learning_rate": 8.112770077456177e-06, + "loss": 3.0589, + "step": 22230 + }, + { + "epoch": 1.5107351542329122, + "grad_norm": 6.575310230255127, + "learning_rate": 8.11234542736785e-06, + "loss": 3.0612, + "step": 22235 + }, + { + "epoch": 1.5110748743035738, + "grad_norm": 6.368984699249268, + "learning_rate": 8.111920777279522e-06, + "loss": 3.2063, + "step": 22240 + }, + { + "epoch": 1.5114145943742356, + "grad_norm": 5.162621974945068, + "learning_rate": 8.111496127191195e-06, + "loss": 3.007, + "step": 22245 + }, + { + "epoch": 1.5117543144448975, + "grad_norm": 6.861945629119873, + "learning_rate": 8.111071477102868e-06, + "loss": 3.2918, + "step": 22250 + }, + { + "epoch": 1.5120940345155591, + "grad_norm": 7.907005310058594, + "learning_rate": 8.11064682701454e-06, + "loss": 3.3201, + "step": 22255 + }, + { + "epoch": 1.512433754586221, + "grad_norm": 6.505929946899414, + "learning_rate": 8.110222176926213e-06, + "loss": 2.9509, + "step": 22260 + }, + { + "epoch": 1.5127734746568828, + "grad_norm": 5.802111625671387, + "learning_rate": 8.109797526837886e-06, + "loss": 3.1228, + "step": 22265 + }, + { + "epoch": 1.5131131947275445, + "grad_norm": 6.02520751953125, + "learning_rate": 8.109372876749559e-06, + "loss": 3.1065, + "step": 22270 + }, + { + "epoch": 1.5134529147982063, + "grad_norm": 6.327506065368652, + "learning_rate": 8.108948226661232e-06, + "loss": 3.0509, + "step": 22275 + }, + { + "epoch": 1.5137926348688682, + "grad_norm": 10.041318893432617, + "learning_rate": 8.108523576572905e-06, + "loss": 3.1352, + "step": 22280 + }, + { + "epoch": 1.5141323549395298, + "grad_norm": 7.028964519500732, + "learning_rate": 8.108098926484577e-06, + "loss": 3.239, + "step": 22285 + }, + { + "epoch": 1.5144720750101917, + "grad_norm": 6.743167400360107, + "learning_rate": 8.10767427639625e-06, + "loss": 3.261, + "step": 22290 + }, + { + "epoch": 1.5148117950808535, + "grad_norm": 8.447031021118164, + "learning_rate": 8.107249626307923e-06, + "loss": 3.0659, + "step": 22295 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 6.391541004180908, + "learning_rate": 8.106824976219596e-06, + "loss": 3.2404, + "step": 22300 + }, + { + "epoch": 1.5154912352221768, + "grad_norm": 6.442255020141602, + "learning_rate": 8.106400326131269e-06, + "loss": 3.4473, + "step": 22305 + }, + { + "epoch": 1.5158309552928388, + "grad_norm": 6.977697849273682, + "learning_rate": 8.105975676042941e-06, + "loss": 3.1208, + "step": 22310 + }, + { + "epoch": 1.5161706753635005, + "grad_norm": 7.380660057067871, + "learning_rate": 8.105551025954614e-06, + "loss": 3.3597, + "step": 22315 + }, + { + "epoch": 1.516510395434162, + "grad_norm": 6.869278430938721, + "learning_rate": 8.105126375866287e-06, + "loss": 3.2478, + "step": 22320 + }, + { + "epoch": 1.5168501155048242, + "grad_norm": 8.96591567993164, + "learning_rate": 8.10470172577796e-06, + "loss": 3.1098, + "step": 22325 + }, + { + "epoch": 1.5171898355754858, + "grad_norm": 7.490819454193115, + "learning_rate": 8.104277075689633e-06, + "loss": 3.1828, + "step": 22330 + }, + { + "epoch": 1.5175295556461474, + "grad_norm": 5.749340057373047, + "learning_rate": 8.103852425601305e-06, + "loss": 3.202, + "step": 22335 + }, + { + "epoch": 1.5178692757168093, + "grad_norm": 7.288333892822266, + "learning_rate": 8.103427775512978e-06, + "loss": 3.0877, + "step": 22340 + }, + { + "epoch": 1.5182089957874711, + "grad_norm": 6.355310916900635, + "learning_rate": 8.103003125424651e-06, + "loss": 3.0849, + "step": 22345 + }, + { + "epoch": 1.5185487158581328, + "grad_norm": 6.35358190536499, + "learning_rate": 8.102578475336324e-06, + "loss": 3.0489, + "step": 22350 + }, + { + "epoch": 1.5188884359287946, + "grad_norm": 6.503847122192383, + "learning_rate": 8.102153825247997e-06, + "loss": 3.024, + "step": 22355 + }, + { + "epoch": 1.5192281559994565, + "grad_norm": 5.7171220779418945, + "learning_rate": 8.10172917515967e-06, + "loss": 3.295, + "step": 22360 + }, + { + "epoch": 1.5195678760701181, + "grad_norm": 7.007888317108154, + "learning_rate": 8.101304525071342e-06, + "loss": 2.9366, + "step": 22365 + }, + { + "epoch": 1.51990759614078, + "grad_norm": 6.791631698608398, + "learning_rate": 8.100879874983013e-06, + "loss": 2.95, + "step": 22370 + }, + { + "epoch": 1.5202473162114418, + "grad_norm": 5.981745719909668, + "learning_rate": 8.100455224894688e-06, + "loss": 3.2547, + "step": 22375 + }, + { + "epoch": 1.5205870362821035, + "grad_norm": 6.111562252044678, + "learning_rate": 8.10003057480636e-06, + "loss": 3.3031, + "step": 22380 + }, + { + "epoch": 1.5209267563527653, + "grad_norm": 6.478964805603027, + "learning_rate": 8.099605924718032e-06, + "loss": 3.2147, + "step": 22385 + }, + { + "epoch": 1.5212664764234272, + "grad_norm": 7.331483364105225, + "learning_rate": 8.099181274629706e-06, + "loss": 3.1804, + "step": 22390 + }, + { + "epoch": 1.5216061964940888, + "grad_norm": 7.492217063903809, + "learning_rate": 8.098756624541379e-06, + "loss": 3.2718, + "step": 22395 + }, + { + "epoch": 1.5219459165647506, + "grad_norm": 6.631264686584473, + "learning_rate": 8.09833197445305e-06, + "loss": 3.3451, + "step": 22400 + }, + { + "epoch": 1.5222856366354125, + "grad_norm": 8.213099479675293, + "learning_rate": 8.097907324364725e-06, + "loss": 3.2137, + "step": 22405 + }, + { + "epoch": 1.5226253567060741, + "grad_norm": 5.9799394607543945, + "learning_rate": 8.097482674276397e-06, + "loss": 3.4933, + "step": 22410 + }, + { + "epoch": 1.522965076776736, + "grad_norm": 6.0805816650390625, + "learning_rate": 8.097058024188069e-06, + "loss": 3.3328, + "step": 22415 + }, + { + "epoch": 1.5233047968473978, + "grad_norm": 8.253693580627441, + "learning_rate": 8.096633374099743e-06, + "loss": 3.1692, + "step": 22420 + }, + { + "epoch": 1.5236445169180595, + "grad_norm": 5.192126274108887, + "learning_rate": 8.096208724011416e-06, + "loss": 3.2775, + "step": 22425 + }, + { + "epoch": 1.5239842369887213, + "grad_norm": 5.691368579864502, + "learning_rate": 8.095784073923087e-06, + "loss": 2.9628, + "step": 22430 + }, + { + "epoch": 1.5243239570593832, + "grad_norm": 7.6490044593811035, + "learning_rate": 8.095359423834761e-06, + "loss": 3.0742, + "step": 22435 + }, + { + "epoch": 1.5246636771300448, + "grad_norm": 7.043272018432617, + "learning_rate": 8.094934773746434e-06, + "loss": 3.2034, + "step": 22440 + }, + { + "epoch": 1.5250033972007067, + "grad_norm": 7.19939661026001, + "learning_rate": 8.094510123658105e-06, + "loss": 3.2814, + "step": 22445 + }, + { + "epoch": 1.5253431172713685, + "grad_norm": 9.595939636230469, + "learning_rate": 8.09408547356978e-06, + "loss": 3.0702, + "step": 22450 + }, + { + "epoch": 1.5256828373420301, + "grad_norm": 6.8135199546813965, + "learning_rate": 8.093660823481451e-06, + "loss": 3.1719, + "step": 22455 + }, + { + "epoch": 1.526022557412692, + "grad_norm": 7.379150390625, + "learning_rate": 8.093236173393124e-06, + "loss": 3.3277, + "step": 22460 + }, + { + "epoch": 1.5263622774833538, + "grad_norm": 6.0805864334106445, + "learning_rate": 8.092811523304798e-06, + "loss": 3.2092, + "step": 22465 + }, + { + "epoch": 1.5267019975540155, + "grad_norm": 6.396120071411133, + "learning_rate": 8.09238687321647e-06, + "loss": 3.1529, + "step": 22470 + }, + { + "epoch": 1.527041717624677, + "grad_norm": 7.12426233291626, + "learning_rate": 8.091962223128144e-06, + "loss": 3.4132, + "step": 22475 + }, + { + "epoch": 1.5273814376953392, + "grad_norm": 7.27792501449585, + "learning_rate": 8.091537573039817e-06, + "loss": 3.1249, + "step": 22480 + }, + { + "epoch": 1.5277211577660008, + "grad_norm": 5.133693695068359, + "learning_rate": 8.091112922951488e-06, + "loss": 3.2784, + "step": 22485 + }, + { + "epoch": 1.5280608778366624, + "grad_norm": 7.929385662078857, + "learning_rate": 8.090688272863162e-06, + "loss": 3.2024, + "step": 22490 + }, + { + "epoch": 1.5284005979073245, + "grad_norm": 7.116937160491943, + "learning_rate": 8.090263622774835e-06, + "loss": 3.2729, + "step": 22495 + }, + { + "epoch": 1.5287403179779862, + "grad_norm": 8.473075866699219, + "learning_rate": 8.089838972686506e-06, + "loss": 3.0639, + "step": 22500 + }, + { + "epoch": 1.5290800380486478, + "grad_norm": 6.638252258300781, + "learning_rate": 8.08941432259818e-06, + "loss": 3.0204, + "step": 22505 + }, + { + "epoch": 1.5294197581193096, + "grad_norm": 6.712108612060547, + "learning_rate": 8.089074602527518e-06, + "loss": 3.2094, + "step": 22510 + }, + { + "epoch": 1.5297594781899715, + "grad_norm": 6.531246662139893, + "learning_rate": 8.08864995243919e-06, + "loss": 3.0462, + "step": 22515 + }, + { + "epoch": 1.5300991982606331, + "grad_norm": 8.504571914672852, + "learning_rate": 8.088225302350863e-06, + "loss": 2.9844, + "step": 22520 + }, + { + "epoch": 1.530438918331295, + "grad_norm": 6.753965377807617, + "learning_rate": 8.087800652262536e-06, + "loss": 3.2805, + "step": 22525 + }, + { + "epoch": 1.5307786384019568, + "grad_norm": 6.487996578216553, + "learning_rate": 8.087376002174209e-06, + "loss": 3.0724, + "step": 22530 + }, + { + "epoch": 1.5311183584726185, + "grad_norm": 6.58884334564209, + "learning_rate": 8.086951352085882e-06, + "loss": 3.3761, + "step": 22535 + }, + { + "epoch": 1.5314580785432803, + "grad_norm": 8.287710189819336, + "learning_rate": 8.086526701997555e-06, + "loss": 3.1997, + "step": 22540 + }, + { + "epoch": 1.5317977986139422, + "grad_norm": 6.04194450378418, + "learning_rate": 8.086102051909228e-06, + "loss": 3.0974, + "step": 22545 + }, + { + "epoch": 1.5321375186846038, + "grad_norm": 6.919079780578613, + "learning_rate": 8.0856774018209e-06, + "loss": 3.1206, + "step": 22550 + }, + { + "epoch": 1.5324772387552656, + "grad_norm": 6.70133113861084, + "learning_rate": 8.085252751732573e-06, + "loss": 3.2592, + "step": 22555 + }, + { + "epoch": 1.5328169588259275, + "grad_norm": 5.629627227783203, + "learning_rate": 8.084828101644246e-06, + "loss": 3.2079, + "step": 22560 + }, + { + "epoch": 1.5331566788965891, + "grad_norm": 8.92464828491211, + "learning_rate": 8.084403451555919e-06, + "loss": 3.1792, + "step": 22565 + }, + { + "epoch": 1.533496398967251, + "grad_norm": 6.707019329071045, + "learning_rate": 8.083978801467592e-06, + "loss": 3.3018, + "step": 22570 + }, + { + "epoch": 1.5338361190379128, + "grad_norm": 6.848013877868652, + "learning_rate": 8.083554151379264e-06, + "loss": 3.174, + "step": 22575 + }, + { + "epoch": 1.5341758391085745, + "grad_norm": 8.654877662658691, + "learning_rate": 8.083129501290937e-06, + "loss": 3.2353, + "step": 22580 + }, + { + "epoch": 1.5345155591792363, + "grad_norm": 9.180298805236816, + "learning_rate": 8.08270485120261e-06, + "loss": 3.2663, + "step": 22585 + }, + { + "epoch": 1.5348552792498982, + "grad_norm": 7.7465972900390625, + "learning_rate": 8.082280201114283e-06, + "loss": 3.4627, + "step": 22590 + }, + { + "epoch": 1.5351949993205598, + "grad_norm": 6.627148628234863, + "learning_rate": 8.081855551025956e-06, + "loss": 3.4808, + "step": 22595 + }, + { + "epoch": 1.5355347193912217, + "grad_norm": 6.798698902130127, + "learning_rate": 8.081430900937628e-06, + "loss": 3.1765, + "step": 22600 + }, + { + "epoch": 1.5358744394618835, + "grad_norm": 5.587130546569824, + "learning_rate": 8.081006250849301e-06, + "loss": 3.15, + "step": 22605 + }, + { + "epoch": 1.5362141595325451, + "grad_norm": 7.7114763259887695, + "learning_rate": 8.080581600760974e-06, + "loss": 3.168, + "step": 22610 + }, + { + "epoch": 1.536553879603207, + "grad_norm": 5.52433967590332, + "learning_rate": 8.080156950672647e-06, + "loss": 3.3982, + "step": 22615 + }, + { + "epoch": 1.5368935996738688, + "grad_norm": 6.045597553253174, + "learning_rate": 8.07973230058432e-06, + "loss": 3.166, + "step": 22620 + }, + { + "epoch": 1.5372333197445305, + "grad_norm": 6.928520202636719, + "learning_rate": 8.079307650495992e-06, + "loss": 3.2146, + "step": 22625 + }, + { + "epoch": 1.5375730398151923, + "grad_norm": 6.718807220458984, + "learning_rate": 8.078883000407665e-06, + "loss": 3.1476, + "step": 22630 + }, + { + "epoch": 1.5379127598858542, + "grad_norm": 6.481622219085693, + "learning_rate": 8.078458350319336e-06, + "loss": 3.307, + "step": 22635 + }, + { + "epoch": 1.5382524799565158, + "grad_norm": 5.878395080566406, + "learning_rate": 8.07803370023101e-06, + "loss": 3.1745, + "step": 22640 + }, + { + "epoch": 1.5385922000271774, + "grad_norm": 6.230693340301514, + "learning_rate": 8.077609050142684e-06, + "loss": 3.1906, + "step": 22645 + }, + { + "epoch": 1.5389319200978395, + "grad_norm": 6.228152275085449, + "learning_rate": 8.077184400054355e-06, + "loss": 3.174, + "step": 22650 + }, + { + "epoch": 1.5392716401685012, + "grad_norm": 6.076822757720947, + "learning_rate": 8.076759749966029e-06, + "loss": 3.0471, + "step": 22655 + }, + { + "epoch": 1.5396113602391628, + "grad_norm": 5.986904621124268, + "learning_rate": 8.076335099877702e-06, + "loss": 3.179, + "step": 22660 + }, + { + "epoch": 1.5399510803098249, + "grad_norm": 7.376126289367676, + "learning_rate": 8.075910449789373e-06, + "loss": 3.072, + "step": 22665 + }, + { + "epoch": 1.5402908003804865, + "grad_norm": 5.596498489379883, + "learning_rate": 8.075485799701048e-06, + "loss": 3.2258, + "step": 22670 + }, + { + "epoch": 1.5406305204511481, + "grad_norm": 6.37067174911499, + "learning_rate": 8.07506114961272e-06, + "loss": 3.228, + "step": 22675 + }, + { + "epoch": 1.54097024052181, + "grad_norm": 6.998894214630127, + "learning_rate": 8.074636499524393e-06, + "loss": 3.2027, + "step": 22680 + }, + { + "epoch": 1.5413099605924718, + "grad_norm": 6.032708168029785, + "learning_rate": 8.074211849436066e-06, + "loss": 3.2771, + "step": 22685 + }, + { + "epoch": 1.5416496806631335, + "grad_norm": 6.290567398071289, + "learning_rate": 8.073787199347739e-06, + "loss": 2.9777, + "step": 22690 + }, + { + "epoch": 1.5419894007337953, + "grad_norm": 6.167185306549072, + "learning_rate": 8.073362549259412e-06, + "loss": 3.3073, + "step": 22695 + }, + { + "epoch": 1.5423291208044572, + "grad_norm": 7.609457015991211, + "learning_rate": 8.072937899171084e-06, + "loss": 3.0275, + "step": 22700 + }, + { + "epoch": 1.5426688408751188, + "grad_norm": 4.917629718780518, + "learning_rate": 8.072513249082755e-06, + "loss": 3.1716, + "step": 22705 + }, + { + "epoch": 1.5430085609457806, + "grad_norm": 8.586840629577637, + "learning_rate": 8.07208859899443e-06, + "loss": 3.151, + "step": 22710 + }, + { + "epoch": 1.5433482810164425, + "grad_norm": 7.389169692993164, + "learning_rate": 8.071663948906103e-06, + "loss": 3.1783, + "step": 22715 + }, + { + "epoch": 1.5436880010871041, + "grad_norm": 6.6251654624938965, + "learning_rate": 8.071239298817774e-06, + "loss": 2.8378, + "step": 22720 + }, + { + "epoch": 1.544027721157766, + "grad_norm": 5.990330219268799, + "learning_rate": 8.070814648729448e-06, + "loss": 3.1739, + "step": 22725 + }, + { + "epoch": 1.5443674412284278, + "grad_norm": 8.562126159667969, + "learning_rate": 8.070389998641121e-06, + "loss": 3.4089, + "step": 22730 + }, + { + "epoch": 1.5447071612990895, + "grad_norm": 6.080057621002197, + "learning_rate": 8.069965348552792e-06, + "loss": 3.0923, + "step": 22735 + }, + { + "epoch": 1.5450468813697513, + "grad_norm": 5.703962326049805, + "learning_rate": 8.069540698464467e-06, + "loss": 3.2082, + "step": 22740 + }, + { + "epoch": 1.5453866014404132, + "grad_norm": 6.43034029006958, + "learning_rate": 8.06911604837614e-06, + "loss": 3.0945, + "step": 22745 + }, + { + "epoch": 1.5457263215110748, + "grad_norm": 6.474040508270264, + "learning_rate": 8.06869139828781e-06, + "loss": 3.3535, + "step": 22750 + }, + { + "epoch": 1.5460660415817367, + "grad_norm": 6.681677341461182, + "learning_rate": 8.068266748199485e-06, + "loss": 3.1371, + "step": 22755 + }, + { + "epoch": 1.5464057616523985, + "grad_norm": 7.46764612197876, + "learning_rate": 8.067842098111158e-06, + "loss": 2.9469, + "step": 22760 + }, + { + "epoch": 1.5467454817230601, + "grad_norm": 6.368871212005615, + "learning_rate": 8.067417448022829e-06, + "loss": 3.1469, + "step": 22765 + }, + { + "epoch": 1.547085201793722, + "grad_norm": 7.500199794769287, + "learning_rate": 8.066992797934504e-06, + "loss": 3.0857, + "step": 22770 + }, + { + "epoch": 1.5474249218643839, + "grad_norm": 7.1610846519470215, + "learning_rate": 8.066568147846175e-06, + "loss": 3.3245, + "step": 22775 + }, + { + "epoch": 1.5477646419350455, + "grad_norm": 8.425518989562988, + "learning_rate": 8.066143497757847e-06, + "loss": 3.141, + "step": 22780 + }, + { + "epoch": 1.5481043620057073, + "grad_norm": 7.059214115142822, + "learning_rate": 8.065718847669522e-06, + "loss": 3.3183, + "step": 22785 + }, + { + "epoch": 1.5484440820763692, + "grad_norm": 7.266088008880615, + "learning_rate": 8.065294197581193e-06, + "loss": 3.202, + "step": 22790 + }, + { + "epoch": 1.5487838021470308, + "grad_norm": 7.914792060852051, + "learning_rate": 8.064869547492866e-06, + "loss": 3.4883, + "step": 22795 + }, + { + "epoch": 1.5491235222176927, + "grad_norm": 6.646385192871094, + "learning_rate": 8.06444489740454e-06, + "loss": 2.9628, + "step": 22800 + }, + { + "epoch": 1.5494632422883545, + "grad_norm": 6.530134201049805, + "learning_rate": 8.064020247316211e-06, + "loss": 2.934, + "step": 22805 + }, + { + "epoch": 1.5498029623590162, + "grad_norm": 6.8814005851745605, + "learning_rate": 8.063595597227884e-06, + "loss": 3.055, + "step": 22810 + }, + { + "epoch": 1.5501426824296778, + "grad_norm": 6.36423921585083, + "learning_rate": 8.063170947139559e-06, + "loss": 3.2645, + "step": 22815 + }, + { + "epoch": 1.5504824025003399, + "grad_norm": 5.391768455505371, + "learning_rate": 8.06274629705123e-06, + "loss": 3.0757, + "step": 22820 + }, + { + "epoch": 1.5508221225710015, + "grad_norm": 8.078798294067383, + "learning_rate": 8.062321646962903e-06, + "loss": 3.1856, + "step": 22825 + }, + { + "epoch": 1.5511618426416631, + "grad_norm": 6.929506778717041, + "learning_rate": 8.061896996874577e-06, + "loss": 3.3116, + "step": 22830 + }, + { + "epoch": 1.5515015627123252, + "grad_norm": 7.3810930252075195, + "learning_rate": 8.061472346786248e-06, + "loss": 3.3081, + "step": 22835 + }, + { + "epoch": 1.5518412827829868, + "grad_norm": 7.279691219329834, + "learning_rate": 8.061047696697921e-06, + "loss": 3.192, + "step": 22840 + }, + { + "epoch": 1.5521810028536485, + "grad_norm": 6.665369987487793, + "learning_rate": 8.060623046609596e-06, + "loss": 3.3179, + "step": 22845 + }, + { + "epoch": 1.5525207229243103, + "grad_norm": 6.167862415313721, + "learning_rate": 8.060198396521267e-06, + "loss": 3.1415, + "step": 22850 + }, + { + "epoch": 1.5528604429949722, + "grad_norm": 8.643240928649902, + "learning_rate": 8.05977374643294e-06, + "loss": 3.0019, + "step": 22855 + }, + { + "epoch": 1.5532001630656338, + "grad_norm": 7.710629463195801, + "learning_rate": 8.059349096344612e-06, + "loss": 3.2047, + "step": 22860 + }, + { + "epoch": 1.5535398831362957, + "grad_norm": 6.214609146118164, + "learning_rate": 8.058924446256285e-06, + "loss": 3.2952, + "step": 22865 + }, + { + "epoch": 1.5538796032069575, + "grad_norm": 7.824427604675293, + "learning_rate": 8.058499796167958e-06, + "loss": 3.1732, + "step": 22870 + }, + { + "epoch": 1.5542193232776191, + "grad_norm": 6.128389358520508, + "learning_rate": 8.05807514607963e-06, + "loss": 3.1104, + "step": 22875 + }, + { + "epoch": 1.554559043348281, + "grad_norm": 5.291926860809326, + "learning_rate": 8.057650495991303e-06, + "loss": 3.1605, + "step": 22880 + }, + { + "epoch": 1.5548987634189428, + "grad_norm": 6.422240257263184, + "learning_rate": 8.057225845902976e-06, + "loss": 3.155, + "step": 22885 + }, + { + "epoch": 1.5552384834896045, + "grad_norm": 8.921157836914062, + "learning_rate": 8.056801195814649e-06, + "loss": 3.3002, + "step": 22890 + }, + { + "epoch": 1.5555782035602663, + "grad_norm": 6.741271018981934, + "learning_rate": 8.056376545726322e-06, + "loss": 3.2753, + "step": 22895 + }, + { + "epoch": 1.5559179236309282, + "grad_norm": 7.249508380889893, + "learning_rate": 8.055951895637995e-06, + "loss": 3.1886, + "step": 22900 + }, + { + "epoch": 1.5562576437015898, + "grad_norm": 5.606007099151611, + "learning_rate": 8.055527245549667e-06, + "loss": 3.0991, + "step": 22905 + }, + { + "epoch": 1.5565973637722517, + "grad_norm": 6.320374011993408, + "learning_rate": 8.05510259546134e-06, + "loss": 3.1749, + "step": 22910 + }, + { + "epoch": 1.5569370838429135, + "grad_norm": 6.539905071258545, + "learning_rate": 8.054677945373013e-06, + "loss": 3.2508, + "step": 22915 + }, + { + "epoch": 1.5572768039135751, + "grad_norm": 5.521591663360596, + "learning_rate": 8.054253295284686e-06, + "loss": 3.127, + "step": 22920 + }, + { + "epoch": 1.557616523984237, + "grad_norm": 5.993317604064941, + "learning_rate": 8.053828645196359e-06, + "loss": 3.1851, + "step": 22925 + }, + { + "epoch": 1.5579562440548989, + "grad_norm": 6.361711502075195, + "learning_rate": 8.053403995108031e-06, + "loss": 3.1281, + "step": 22930 + }, + { + "epoch": 1.5582959641255605, + "grad_norm": 5.733673095703125, + "learning_rate": 8.052979345019704e-06, + "loss": 3.0783, + "step": 22935 + }, + { + "epoch": 1.5586356841962223, + "grad_norm": 5.620368480682373, + "learning_rate": 8.052554694931377e-06, + "loss": 3.2855, + "step": 22940 + }, + { + "epoch": 1.5589754042668842, + "grad_norm": 7.080999851226807, + "learning_rate": 8.05213004484305e-06, + "loss": 2.9597, + "step": 22945 + }, + { + "epoch": 1.5593151243375458, + "grad_norm": 5.575887680053711, + "learning_rate": 8.051705394754723e-06, + "loss": 3.0748, + "step": 22950 + }, + { + "epoch": 1.5596548444082077, + "grad_norm": 6.323554515838623, + "learning_rate": 8.051280744666395e-06, + "loss": 3.1889, + "step": 22955 + }, + { + "epoch": 1.5599945644788695, + "grad_norm": 6.778013706207275, + "learning_rate": 8.050856094578068e-06, + "loss": 3.2239, + "step": 22960 + }, + { + "epoch": 1.5603342845495312, + "grad_norm": 8.599899291992188, + "learning_rate": 8.050431444489741e-06, + "loss": 3.1644, + "step": 22965 + }, + { + "epoch": 1.560674004620193, + "grad_norm": 7.582270622253418, + "learning_rate": 8.050006794401414e-06, + "loss": 3.3753, + "step": 22970 + }, + { + "epoch": 1.5610137246908549, + "grad_norm": 5.1099348068237305, + "learning_rate": 8.049582144313087e-06, + "loss": 2.941, + "step": 22975 + }, + { + "epoch": 1.5613534447615165, + "grad_norm": 7.146424770355225, + "learning_rate": 8.04915749422476e-06, + "loss": 3.0319, + "step": 22980 + }, + { + "epoch": 1.5616931648321781, + "grad_norm": 7.879884719848633, + "learning_rate": 8.048732844136432e-06, + "loss": 3.1148, + "step": 22985 + }, + { + "epoch": 1.5620328849028402, + "grad_norm": 6.326765060424805, + "learning_rate": 8.048308194048105e-06, + "loss": 3.4378, + "step": 22990 + }, + { + "epoch": 1.5623726049735018, + "grad_norm": 5.479720592498779, + "learning_rate": 8.047883543959778e-06, + "loss": 3.1255, + "step": 22995 + }, + { + "epoch": 1.5627123250441635, + "grad_norm": 6.394311904907227, + "learning_rate": 8.04745889387145e-06, + "loss": 3.117, + "step": 23000 + }, + { + "epoch": 1.5630520451148255, + "grad_norm": 7.865484237670898, + "learning_rate": 8.047034243783123e-06, + "loss": 3.142, + "step": 23005 + }, + { + "epoch": 1.5633917651854872, + "grad_norm": 6.500702857971191, + "learning_rate": 8.046609593694796e-06, + "loss": 2.9881, + "step": 23010 + }, + { + "epoch": 1.5637314852561488, + "grad_norm": 6.905800819396973, + "learning_rate": 8.046184943606469e-06, + "loss": 3.4895, + "step": 23015 + }, + { + "epoch": 1.5640712053268107, + "grad_norm": 6.066555023193359, + "learning_rate": 8.045760293518142e-06, + "loss": 3.3505, + "step": 23020 + }, + { + "epoch": 1.5644109253974725, + "grad_norm": 7.7547221183776855, + "learning_rate": 8.045335643429815e-06, + "loss": 3.1214, + "step": 23025 + }, + { + "epoch": 1.5647506454681341, + "grad_norm": 9.60391616821289, + "learning_rate": 8.044910993341487e-06, + "loss": 3.0594, + "step": 23030 + }, + { + "epoch": 1.565090365538796, + "grad_norm": 6.975337505340576, + "learning_rate": 8.04448634325316e-06, + "loss": 3.4191, + "step": 23035 + }, + { + "epoch": 1.5654300856094578, + "grad_norm": 6.874581813812256, + "learning_rate": 8.044061693164833e-06, + "loss": 3.2092, + "step": 23040 + }, + { + "epoch": 1.5657698056801195, + "grad_norm": 6.019896030426025, + "learning_rate": 8.043637043076506e-06, + "loss": 3.0459, + "step": 23045 + }, + { + "epoch": 1.5661095257507813, + "grad_norm": 5.936823844909668, + "learning_rate": 8.043212392988179e-06, + "loss": 3.1539, + "step": 23050 + }, + { + "epoch": 1.5664492458214432, + "grad_norm": 8.247655868530273, + "learning_rate": 8.042787742899851e-06, + "loss": 3.2533, + "step": 23055 + }, + { + "epoch": 1.5667889658921048, + "grad_norm": 5.915541648864746, + "learning_rate": 8.042363092811524e-06, + "loss": 3.4072, + "step": 23060 + }, + { + "epoch": 1.5671286859627667, + "grad_norm": 7.795478343963623, + "learning_rate": 8.041938442723197e-06, + "loss": 3.4099, + "step": 23065 + }, + { + "epoch": 1.5674684060334285, + "grad_norm": 7.26991081237793, + "learning_rate": 8.04151379263487e-06, + "loss": 3.4532, + "step": 23070 + }, + { + "epoch": 1.5678081261040901, + "grad_norm": 7.255702972412109, + "learning_rate": 8.041089142546543e-06, + "loss": 3.1001, + "step": 23075 + }, + { + "epoch": 1.568147846174752, + "grad_norm": 7.226129531860352, + "learning_rate": 8.040664492458215e-06, + "loss": 3.3293, + "step": 23080 + }, + { + "epoch": 1.5684875662454139, + "grad_norm": 6.925755500793457, + "learning_rate": 8.040239842369888e-06, + "loss": 3.2238, + "step": 23085 + }, + { + "epoch": 1.5688272863160755, + "grad_norm": 7.004758358001709, + "learning_rate": 8.039815192281561e-06, + "loss": 3.1143, + "step": 23090 + }, + { + "epoch": 1.5691670063867373, + "grad_norm": 7.486815452575684, + "learning_rate": 8.039390542193234e-06, + "loss": 3.0374, + "step": 23095 + }, + { + "epoch": 1.5695067264573992, + "grad_norm": 6.328569412231445, + "learning_rate": 8.038965892104907e-06, + "loss": 3.1553, + "step": 23100 + }, + { + "epoch": 1.5698464465280608, + "grad_norm": 7.4862446784973145, + "learning_rate": 8.03854124201658e-06, + "loss": 3.2809, + "step": 23105 + }, + { + "epoch": 1.5701861665987227, + "grad_norm": 5.23076868057251, + "learning_rate": 8.038116591928252e-06, + "loss": 3.0947, + "step": 23110 + }, + { + "epoch": 1.5705258866693845, + "grad_norm": 8.566861152648926, + "learning_rate": 8.037691941839925e-06, + "loss": 3.2286, + "step": 23115 + }, + { + "epoch": 1.5708656067400462, + "grad_norm": 6.320465564727783, + "learning_rate": 8.037267291751596e-06, + "loss": 3.0882, + "step": 23120 + }, + { + "epoch": 1.571205326810708, + "grad_norm": 6.076781272888184, + "learning_rate": 8.03684264166327e-06, + "loss": 3.2327, + "step": 23125 + }, + { + "epoch": 1.5715450468813699, + "grad_norm": 6.10314416885376, + "learning_rate": 8.036417991574943e-06, + "loss": 3.1336, + "step": 23130 + }, + { + "epoch": 1.5718847669520315, + "grad_norm": 5.848506450653076, + "learning_rate": 8.035993341486615e-06, + "loss": 3.3765, + "step": 23135 + }, + { + "epoch": 1.5722244870226934, + "grad_norm": 7.2057785987854, + "learning_rate": 8.035568691398289e-06, + "loss": 3.1933, + "step": 23140 + }, + { + "epoch": 1.5725642070933552, + "grad_norm": 6.5290446281433105, + "learning_rate": 8.035144041309962e-06, + "loss": 3.2486, + "step": 23145 + }, + { + "epoch": 1.5729039271640168, + "grad_norm": 6.133744716644287, + "learning_rate": 8.034719391221633e-06, + "loss": 3.2325, + "step": 23150 + }, + { + "epoch": 1.5732436472346785, + "grad_norm": 9.76150894165039, + "learning_rate": 8.034294741133308e-06, + "loss": 3.3036, + "step": 23155 + }, + { + "epoch": 1.5735833673053405, + "grad_norm": 6.090497016906738, + "learning_rate": 8.03387009104498e-06, + "loss": 2.998, + "step": 23160 + }, + { + "epoch": 1.5739230873760022, + "grad_norm": 5.697354793548584, + "learning_rate": 8.033445440956651e-06, + "loss": 3.1192, + "step": 23165 + }, + { + "epoch": 1.5742628074466638, + "grad_norm": 6.207306385040283, + "learning_rate": 8.033020790868326e-06, + "loss": 3.1638, + "step": 23170 + }, + { + "epoch": 1.5746025275173259, + "grad_norm": 8.339762687683105, + "learning_rate": 8.032596140779999e-06, + "loss": 3.2838, + "step": 23175 + }, + { + "epoch": 1.5749422475879875, + "grad_norm": 7.491079330444336, + "learning_rate": 8.03217149069167e-06, + "loss": 3.3657, + "step": 23180 + }, + { + "epoch": 1.5752819676586491, + "grad_norm": 6.912400245666504, + "learning_rate": 8.031746840603344e-06, + "loss": 3.1824, + "step": 23185 + }, + { + "epoch": 1.575621687729311, + "grad_norm": 7.980975151062012, + "learning_rate": 8.031322190515017e-06, + "loss": 3.3281, + "step": 23190 + }, + { + "epoch": 1.5759614077999728, + "grad_norm": 6.724276542663574, + "learning_rate": 8.030897540426688e-06, + "loss": 3.4427, + "step": 23195 + }, + { + "epoch": 1.5763011278706345, + "grad_norm": 6.54376745223999, + "learning_rate": 8.030472890338363e-06, + "loss": 2.9501, + "step": 23200 + }, + { + "epoch": 1.5766408479412963, + "grad_norm": 7.444710731506348, + "learning_rate": 8.030048240250034e-06, + "loss": 3.1399, + "step": 23205 + }, + { + "epoch": 1.5769805680119582, + "grad_norm": 8.453093528747559, + "learning_rate": 8.029623590161707e-06, + "loss": 3.1202, + "step": 23210 + }, + { + "epoch": 1.5773202880826198, + "grad_norm": 5.974777698516846, + "learning_rate": 8.029198940073381e-06, + "loss": 3.1346, + "step": 23215 + }, + { + "epoch": 1.5776600081532817, + "grad_norm": 6.946164131164551, + "learning_rate": 8.028774289985052e-06, + "loss": 3.1864, + "step": 23220 + }, + { + "epoch": 1.5779997282239435, + "grad_norm": 6.385114669799805, + "learning_rate": 8.028349639896725e-06, + "loss": 3.2612, + "step": 23225 + }, + { + "epoch": 1.5783394482946052, + "grad_norm": 6.953195095062256, + "learning_rate": 8.0279249898084e-06, + "loss": 2.9996, + "step": 23230 + }, + { + "epoch": 1.578679168365267, + "grad_norm": 7.747397422790527, + "learning_rate": 8.02750033972007e-06, + "loss": 3.1567, + "step": 23235 + }, + { + "epoch": 1.5790188884359289, + "grad_norm": 7.334286212921143, + "learning_rate": 8.027075689631743e-06, + "loss": 3.2002, + "step": 23240 + }, + { + "epoch": 1.5793586085065905, + "grad_norm": 7.370994567871094, + "learning_rate": 8.026651039543418e-06, + "loss": 2.9323, + "step": 23245 + }, + { + "epoch": 1.5796983285772523, + "grad_norm": 6.395468235015869, + "learning_rate": 8.026226389455089e-06, + "loss": 3.1898, + "step": 23250 + }, + { + "epoch": 1.5800380486479142, + "grad_norm": 7.951178550720215, + "learning_rate": 8.025801739366762e-06, + "loss": 3.4016, + "step": 23255 + }, + { + "epoch": 1.5803777687185758, + "grad_norm": 7.022189140319824, + "learning_rate": 8.025377089278436e-06, + "loss": 3.2089, + "step": 23260 + }, + { + "epoch": 1.5807174887892377, + "grad_norm": 7.973498344421387, + "learning_rate": 8.024952439190107e-06, + "loss": 3.1653, + "step": 23265 + }, + { + "epoch": 1.5810572088598995, + "grad_norm": 6.656071662902832, + "learning_rate": 8.02452778910178e-06, + "loss": 3.1986, + "step": 23270 + }, + { + "epoch": 1.5813969289305612, + "grad_norm": 6.304806232452393, + "learning_rate": 8.024103139013453e-06, + "loss": 3.4434, + "step": 23275 + }, + { + "epoch": 1.581736649001223, + "grad_norm": 7.152355194091797, + "learning_rate": 8.023678488925126e-06, + "loss": 3.4152, + "step": 23280 + }, + { + "epoch": 1.5820763690718849, + "grad_norm": 6.939756393432617, + "learning_rate": 8.023253838836799e-06, + "loss": 3.1061, + "step": 23285 + }, + { + "epoch": 1.5824160891425465, + "grad_norm": 8.240216255187988, + "learning_rate": 8.022829188748471e-06, + "loss": 2.8727, + "step": 23290 + }, + { + "epoch": 1.5827558092132084, + "grad_norm": 7.765799045562744, + "learning_rate": 8.022404538660144e-06, + "loss": 3.0912, + "step": 23295 + }, + { + "epoch": 1.5830955292838702, + "grad_norm": 7.320083141326904, + "learning_rate": 8.021979888571817e-06, + "loss": 3.1721, + "step": 23300 + }, + { + "epoch": 1.5834352493545318, + "grad_norm": 7.692086219787598, + "learning_rate": 8.02155523848349e-06, + "loss": 3.2443, + "step": 23305 + }, + { + "epoch": 1.5837749694251937, + "grad_norm": 7.894920349121094, + "learning_rate": 8.021130588395163e-06, + "loss": 3.0804, + "step": 23310 + }, + { + "epoch": 1.5841146894958555, + "grad_norm": 6.036900997161865, + "learning_rate": 8.020705938306835e-06, + "loss": 2.8915, + "step": 23315 + }, + { + "epoch": 1.5844544095665172, + "grad_norm": 6.141956329345703, + "learning_rate": 8.020281288218508e-06, + "loss": 2.9702, + "step": 23320 + }, + { + "epoch": 1.5847941296371788, + "grad_norm": 7.1070027351379395, + "learning_rate": 8.019856638130181e-06, + "loss": 3.1489, + "step": 23325 + }, + { + "epoch": 1.5851338497078409, + "grad_norm": 5.801689147949219, + "learning_rate": 8.019431988041854e-06, + "loss": 3.1973, + "step": 23330 + }, + { + "epoch": 1.5854735697785025, + "grad_norm": 6.657332897186279, + "learning_rate": 8.019007337953527e-06, + "loss": 3.2456, + "step": 23335 + }, + { + "epoch": 1.5858132898491641, + "grad_norm": 7.290010929107666, + "learning_rate": 8.0185826878652e-06, + "loss": 2.9558, + "step": 23340 + }, + { + "epoch": 1.5861530099198262, + "grad_norm": 7.4278082847595215, + "learning_rate": 8.018158037776872e-06, + "loss": 3.1995, + "step": 23345 + }, + { + "epoch": 1.5864927299904878, + "grad_norm": 6.515374660491943, + "learning_rate": 8.017733387688545e-06, + "loss": 3.4571, + "step": 23350 + }, + { + "epoch": 1.5868324500611495, + "grad_norm": 7.963770866394043, + "learning_rate": 8.017308737600218e-06, + "loss": 3.237, + "step": 23355 + }, + { + "epoch": 1.5871721701318113, + "grad_norm": 7.730865955352783, + "learning_rate": 8.01688408751189e-06, + "loss": 3.1161, + "step": 23360 + }, + { + "epoch": 1.5875118902024732, + "grad_norm": 5.937524318695068, + "learning_rate": 8.016459437423563e-06, + "loss": 3.3341, + "step": 23365 + }, + { + "epoch": 1.5878516102731348, + "grad_norm": 6.680533409118652, + "learning_rate": 8.016034787335236e-06, + "loss": 3.2155, + "step": 23370 + }, + { + "epoch": 1.5881913303437967, + "grad_norm": 5.432262420654297, + "learning_rate": 8.015610137246909e-06, + "loss": 3.1517, + "step": 23375 + }, + { + "epoch": 1.5885310504144585, + "grad_norm": 6.553748607635498, + "learning_rate": 8.015185487158582e-06, + "loss": 3.218, + "step": 23380 + }, + { + "epoch": 1.5888707704851202, + "grad_norm": 7.380005359649658, + "learning_rate": 8.014760837070255e-06, + "loss": 3.2225, + "step": 23385 + }, + { + "epoch": 1.589210490555782, + "grad_norm": 9.541501998901367, + "learning_rate": 8.014336186981927e-06, + "loss": 3.0567, + "step": 23390 + }, + { + "epoch": 1.5895502106264439, + "grad_norm": 6.324533939361572, + "learning_rate": 8.0139115368936e-06, + "loss": 3.2778, + "step": 23395 + }, + { + "epoch": 1.5898899306971055, + "grad_norm": 7.972381591796875, + "learning_rate": 8.013486886805273e-06, + "loss": 3.1508, + "step": 23400 + }, + { + "epoch": 1.5902296507677673, + "grad_norm": 6.594654560089111, + "learning_rate": 8.013062236716946e-06, + "loss": 3.4155, + "step": 23405 + }, + { + "epoch": 1.5905693708384292, + "grad_norm": 7.261844635009766, + "learning_rate": 8.012637586628619e-06, + "loss": 3.3724, + "step": 23410 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 5.467884540557861, + "learning_rate": 8.012212936540291e-06, + "loss": 3.2539, + "step": 23415 + }, + { + "epoch": 1.5912488109797527, + "grad_norm": 7.619297027587891, + "learning_rate": 8.011788286451964e-06, + "loss": 3.0098, + "step": 23420 + }, + { + "epoch": 1.5915885310504145, + "grad_norm": 7.154738426208496, + "learning_rate": 8.011363636363637e-06, + "loss": 3.4054, + "step": 23425 + }, + { + "epoch": 1.5919282511210762, + "grad_norm": 5.926559925079346, + "learning_rate": 8.01093898627531e-06, + "loss": 3.0961, + "step": 23430 + }, + { + "epoch": 1.592267971191738, + "grad_norm": 7.680482387542725, + "learning_rate": 8.010514336186983e-06, + "loss": 2.8946, + "step": 23435 + }, + { + "epoch": 1.5926076912623999, + "grad_norm": 8.688632011413574, + "learning_rate": 8.010089686098655e-06, + "loss": 2.8358, + "step": 23440 + }, + { + "epoch": 1.5929474113330615, + "grad_norm": 7.049492359161377, + "learning_rate": 8.009665036010328e-06, + "loss": 3.0938, + "step": 23445 + }, + { + "epoch": 1.5932871314037234, + "grad_norm": 6.548642158508301, + "learning_rate": 8.009240385922001e-06, + "loss": 3.5313, + "step": 23450 + }, + { + "epoch": 1.5936268514743852, + "grad_norm": 7.793388843536377, + "learning_rate": 8.008815735833674e-06, + "loss": 2.9782, + "step": 23455 + }, + { + "epoch": 1.5939665715450468, + "grad_norm": 6.787985801696777, + "learning_rate": 8.008391085745347e-06, + "loss": 3.3035, + "step": 23460 + }, + { + "epoch": 1.5943062916157087, + "grad_norm": 5.543227672576904, + "learning_rate": 8.00796643565702e-06, + "loss": 3.0678, + "step": 23465 + }, + { + "epoch": 1.5946460116863705, + "grad_norm": 7.808091640472412, + "learning_rate": 8.007541785568692e-06, + "loss": 3.3463, + "step": 23470 + }, + { + "epoch": 1.5949857317570322, + "grad_norm": 6.9321794509887695, + "learning_rate": 8.007117135480365e-06, + "loss": 2.8135, + "step": 23475 + }, + { + "epoch": 1.595325451827694, + "grad_norm": 6.358381748199463, + "learning_rate": 8.006692485392038e-06, + "loss": 3.1574, + "step": 23480 + }, + { + "epoch": 1.5956651718983559, + "grad_norm": 8.655670166015625, + "learning_rate": 8.00626783530371e-06, + "loss": 3.0468, + "step": 23485 + }, + { + "epoch": 1.5960048919690175, + "grad_norm": 7.081387996673584, + "learning_rate": 8.005843185215383e-06, + "loss": 3.0636, + "step": 23490 + }, + { + "epoch": 1.5963446120396791, + "grad_norm": 6.815731525421143, + "learning_rate": 8.005418535127056e-06, + "loss": 3.0585, + "step": 23495 + }, + { + "epoch": 1.5966843321103412, + "grad_norm": 8.234298706054688, + "learning_rate": 8.004993885038729e-06, + "loss": 3.2876, + "step": 23500 + }, + { + "epoch": 1.5970240521810029, + "grad_norm": 8.053092956542969, + "learning_rate": 8.004569234950402e-06, + "loss": 3.0055, + "step": 23505 + }, + { + "epoch": 1.5973637722516645, + "grad_norm": 7.501079082489014, + "learning_rate": 8.004144584862075e-06, + "loss": 3.2736, + "step": 23510 + }, + { + "epoch": 1.5977034923223266, + "grad_norm": 7.206111431121826, + "learning_rate": 8.003719934773747e-06, + "loss": 3.3275, + "step": 23515 + }, + { + "epoch": 1.5980432123929882, + "grad_norm": 7.157451152801514, + "learning_rate": 8.00329528468542e-06, + "loss": 3.0848, + "step": 23520 + }, + { + "epoch": 1.5983829324636498, + "grad_norm": 7.0425896644592285, + "learning_rate": 8.002870634597093e-06, + "loss": 3.404, + "step": 23525 + }, + { + "epoch": 1.5987226525343117, + "grad_norm": 6.248306751251221, + "learning_rate": 8.002445984508766e-06, + "loss": 3.0445, + "step": 23530 + }, + { + "epoch": 1.5990623726049735, + "grad_norm": 6.5537567138671875, + "learning_rate": 8.002021334420439e-06, + "loss": 2.9157, + "step": 23535 + }, + { + "epoch": 1.5994020926756352, + "grad_norm": 7.503211975097656, + "learning_rate": 8.001596684332111e-06, + "loss": 3.3554, + "step": 23540 + }, + { + "epoch": 1.599741812746297, + "grad_norm": 7.352776050567627, + "learning_rate": 8.001172034243784e-06, + "loss": 3.0839, + "step": 23545 + }, + { + "epoch": 1.6000815328169589, + "grad_norm": 6.550344944000244, + "learning_rate": 8.000747384155455e-06, + "loss": 3.3648, + "step": 23550 + }, + { + "epoch": 1.6004212528876205, + "grad_norm": 6.2093281745910645, + "learning_rate": 8.00032273406713e-06, + "loss": 3.3023, + "step": 23555 + }, + { + "epoch": 1.6007609729582823, + "grad_norm": 6.015896320343018, + "learning_rate": 7.999898083978803e-06, + "loss": 3.2649, + "step": 23560 + }, + { + "epoch": 1.6011006930289442, + "grad_norm": 7.085857391357422, + "learning_rate": 7.999473433890474e-06, + "loss": 3.0564, + "step": 23565 + }, + { + "epoch": 1.6014404130996058, + "grad_norm": 7.014346599578857, + "learning_rate": 7.999048783802148e-06, + "loss": 3.1833, + "step": 23570 + }, + { + "epoch": 1.6017801331702677, + "grad_norm": 7.053066730499268, + "learning_rate": 7.998624133713821e-06, + "loss": 3.1279, + "step": 23575 + }, + { + "epoch": 1.6021198532409295, + "grad_norm": 10.431622505187988, + "learning_rate": 7.998199483625492e-06, + "loss": 3.0308, + "step": 23580 + }, + { + "epoch": 1.6024595733115912, + "grad_norm": 7.544126987457275, + "learning_rate": 7.997774833537167e-06, + "loss": 3.4721, + "step": 23585 + }, + { + "epoch": 1.602799293382253, + "grad_norm": 6.398107528686523, + "learning_rate": 7.99735018344884e-06, + "loss": 3.2739, + "step": 23590 + }, + { + "epoch": 1.6031390134529149, + "grad_norm": 6.098026752471924, + "learning_rate": 7.99692553336051e-06, + "loss": 3.0194, + "step": 23595 + }, + { + "epoch": 1.6034787335235765, + "grad_norm": 7.754726886749268, + "learning_rate": 7.996500883272185e-06, + "loss": 3.3237, + "step": 23600 + }, + { + "epoch": 1.6038184535942384, + "grad_norm": 6.2962517738342285, + "learning_rate": 7.996076233183858e-06, + "loss": 3.3409, + "step": 23605 + }, + { + "epoch": 1.6041581736649002, + "grad_norm": 6.213929176330566, + "learning_rate": 7.995651583095529e-06, + "loss": 3.1376, + "step": 23610 + }, + { + "epoch": 1.6044978937355618, + "grad_norm": 9.261594772338867, + "learning_rate": 7.995226933007203e-06, + "loss": 2.8804, + "step": 23615 + }, + { + "epoch": 1.6048376138062237, + "grad_norm": 7.014772891998291, + "learning_rate": 7.994802282918875e-06, + "loss": 3.284, + "step": 23620 + }, + { + "epoch": 1.6051773338768855, + "grad_norm": 6.034864902496338, + "learning_rate": 7.994377632830547e-06, + "loss": 3.2648, + "step": 23625 + }, + { + "epoch": 1.6055170539475472, + "grad_norm": 5.530580520629883, + "learning_rate": 7.993952982742222e-06, + "loss": 3.1669, + "step": 23630 + }, + { + "epoch": 1.605856774018209, + "grad_norm": 7.484724044799805, + "learning_rate": 7.993528332653893e-06, + "loss": 3.1204, + "step": 23635 + }, + { + "epoch": 1.6061964940888709, + "grad_norm": 6.66617488861084, + "learning_rate": 7.993103682565566e-06, + "loss": 3.2666, + "step": 23640 + }, + { + "epoch": 1.6065362141595325, + "grad_norm": 5.291116714477539, + "learning_rate": 7.99267903247724e-06, + "loss": 3.0402, + "step": 23645 + }, + { + "epoch": 1.6068759342301944, + "grad_norm": 6.2718424797058105, + "learning_rate": 7.992254382388911e-06, + "loss": 3.1088, + "step": 23650 + }, + { + "epoch": 1.6072156543008562, + "grad_norm": 7.675546169281006, + "learning_rate": 7.991829732300584e-06, + "loss": 3.2069, + "step": 23655 + }, + { + "epoch": 1.6075553743715179, + "grad_norm": 5.807267189025879, + "learning_rate": 7.991405082212259e-06, + "loss": 3.2277, + "step": 23660 + }, + { + "epoch": 1.6078950944421795, + "grad_norm": 5.819814205169678, + "learning_rate": 7.99098043212393e-06, + "loss": 3.0025, + "step": 23665 + }, + { + "epoch": 1.6082348145128416, + "grad_norm": 6.958734512329102, + "learning_rate": 7.990555782035603e-06, + "loss": 3.0472, + "step": 23670 + }, + { + "epoch": 1.6085745345835032, + "grad_norm": 7.367216110229492, + "learning_rate": 7.990131131947277e-06, + "loss": 3.2153, + "step": 23675 + }, + { + "epoch": 1.6089142546541648, + "grad_norm": 6.318203449249268, + "learning_rate": 7.989706481858948e-06, + "loss": 3.1246, + "step": 23680 + }, + { + "epoch": 1.609253974724827, + "grad_norm": 6.227957725524902, + "learning_rate": 7.989281831770621e-06, + "loss": 3.1545, + "step": 23685 + }, + { + "epoch": 1.6095936947954885, + "grad_norm": 5.546966075897217, + "learning_rate": 7.988857181682294e-06, + "loss": 3.3144, + "step": 23690 + }, + { + "epoch": 1.6099334148661502, + "grad_norm": 8.057475090026855, + "learning_rate": 7.988432531593967e-06, + "loss": 2.8745, + "step": 23695 + }, + { + "epoch": 1.610273134936812, + "grad_norm": 6.725799560546875, + "learning_rate": 7.988007881505641e-06, + "loss": 3.1515, + "step": 23700 + }, + { + "epoch": 1.6106128550074739, + "grad_norm": 5.109588623046875, + "learning_rate": 7.987583231417312e-06, + "loss": 3.2072, + "step": 23705 + }, + { + "epoch": 1.6109525750781355, + "grad_norm": 6.316041946411133, + "learning_rate": 7.987158581328985e-06, + "loss": 3.1924, + "step": 23710 + }, + { + "epoch": 1.6112922951487973, + "grad_norm": 6.674480438232422, + "learning_rate": 7.98673393124066e-06, + "loss": 3.2616, + "step": 23715 + }, + { + "epoch": 1.6116320152194592, + "grad_norm": 6.783422946929932, + "learning_rate": 7.98630928115233e-06, + "loss": 3.468, + "step": 23720 + }, + { + "epoch": 1.6119717352901208, + "grad_norm": 6.731157302856445, + "learning_rate": 7.985884631064003e-06, + "loss": 3.1184, + "step": 23725 + }, + { + "epoch": 1.6123114553607827, + "grad_norm": 8.969887733459473, + "learning_rate": 7.985459980975678e-06, + "loss": 3.4127, + "step": 23730 + }, + { + "epoch": 1.6126511754314445, + "grad_norm": 7.991096496582031, + "learning_rate": 7.985035330887349e-06, + "loss": 3.2276, + "step": 23735 + }, + { + "epoch": 1.6129908955021062, + "grad_norm": 5.918176174163818, + "learning_rate": 7.984610680799022e-06, + "loss": 2.8453, + "step": 23740 + }, + { + "epoch": 1.613330615572768, + "grad_norm": 6.3426947593688965, + "learning_rate": 7.984186030710696e-06, + "loss": 3.2138, + "step": 23745 + }, + { + "epoch": 1.6136703356434299, + "grad_norm": 6.554202079772949, + "learning_rate": 7.983761380622367e-06, + "loss": 3.1916, + "step": 23750 + }, + { + "epoch": 1.6140100557140915, + "grad_norm": 7.179882049560547, + "learning_rate": 7.98333673053404e-06, + "loss": 3.3399, + "step": 23755 + }, + { + "epoch": 1.6143497757847534, + "grad_norm": 5.936587810516357, + "learning_rate": 7.982912080445715e-06, + "loss": 3.2165, + "step": 23760 + }, + { + "epoch": 1.6146894958554152, + "grad_norm": 7.156703948974609, + "learning_rate": 7.982487430357386e-06, + "loss": 3.1314, + "step": 23765 + }, + { + "epoch": 1.6150292159260768, + "grad_norm": 6.821993350982666, + "learning_rate": 7.982062780269059e-06, + "loss": 2.9896, + "step": 23770 + }, + { + "epoch": 1.6153689359967387, + "grad_norm": 5.967657089233398, + "learning_rate": 7.981638130180731e-06, + "loss": 3.1694, + "step": 23775 + }, + { + "epoch": 1.6157086560674006, + "grad_norm": 6.344442844390869, + "learning_rate": 7.981213480092404e-06, + "loss": 3.0807, + "step": 23780 + }, + { + "epoch": 1.6160483761380622, + "grad_norm": 6.091097831726074, + "learning_rate": 7.980788830004077e-06, + "loss": 3.0904, + "step": 23785 + }, + { + "epoch": 1.616388096208724, + "grad_norm": 5.997616767883301, + "learning_rate": 7.98036417991575e-06, + "loss": 2.9713, + "step": 23790 + }, + { + "epoch": 1.6167278162793859, + "grad_norm": 7.294738292694092, + "learning_rate": 7.979939529827423e-06, + "loss": 2.961, + "step": 23795 + }, + { + "epoch": 1.6170675363500475, + "grad_norm": 6.22149133682251, + "learning_rate": 7.979514879739095e-06, + "loss": 3.1427, + "step": 23800 + }, + { + "epoch": 1.6174072564207094, + "grad_norm": 5.684275150299072, + "learning_rate": 7.979090229650768e-06, + "loss": 2.9403, + "step": 23805 + }, + { + "epoch": 1.6177469764913712, + "grad_norm": 7.571439743041992, + "learning_rate": 7.978665579562441e-06, + "loss": 3.2184, + "step": 23810 + }, + { + "epoch": 1.6180866965620329, + "grad_norm": 6.598820686340332, + "learning_rate": 7.978240929474114e-06, + "loss": 3.511, + "step": 23815 + }, + { + "epoch": 1.6184264166326947, + "grad_norm": 6.159831523895264, + "learning_rate": 7.977816279385787e-06, + "loss": 2.9626, + "step": 23820 + }, + { + "epoch": 1.6187661367033566, + "grad_norm": 8.576926231384277, + "learning_rate": 7.97739162929746e-06, + "loss": 3.2064, + "step": 23825 + }, + { + "epoch": 1.6191058567740182, + "grad_norm": 6.147668838500977, + "learning_rate": 7.976966979209132e-06, + "loss": 3.1897, + "step": 23830 + }, + { + "epoch": 1.6194455768446798, + "grad_norm": 6.645761966705322, + "learning_rate": 7.976542329120805e-06, + "loss": 3.1869, + "step": 23835 + }, + { + "epoch": 1.619785296915342, + "grad_norm": 5.374839782714844, + "learning_rate": 7.976117679032478e-06, + "loss": 2.9474, + "step": 23840 + }, + { + "epoch": 1.6201250169860035, + "grad_norm": 6.362802028656006, + "learning_rate": 7.97569302894415e-06, + "loss": 2.8356, + "step": 23845 + }, + { + "epoch": 1.6204647370566652, + "grad_norm": 6.525477409362793, + "learning_rate": 7.975268378855823e-06, + "loss": 3.3167, + "step": 23850 + }, + { + "epoch": 1.6208044571273272, + "grad_norm": 6.246317386627197, + "learning_rate": 7.974843728767496e-06, + "loss": 3.3174, + "step": 23855 + }, + { + "epoch": 1.6211441771979889, + "grad_norm": 6.823639392852783, + "learning_rate": 7.974419078679169e-06, + "loss": 3.0794, + "step": 23860 + }, + { + "epoch": 1.6214838972686505, + "grad_norm": 5.8109283447265625, + "learning_rate": 7.973994428590842e-06, + "loss": 3.0162, + "step": 23865 + }, + { + "epoch": 1.6218236173393124, + "grad_norm": 6.4162068367004395, + "learning_rate": 7.973569778502515e-06, + "loss": 3.0166, + "step": 23870 + }, + { + "epoch": 1.6221633374099742, + "grad_norm": 7.863381385803223, + "learning_rate": 7.973145128414187e-06, + "loss": 3.1501, + "step": 23875 + }, + { + "epoch": 1.6225030574806358, + "grad_norm": 6.801530361175537, + "learning_rate": 7.97272047832586e-06, + "loss": 3.2657, + "step": 23880 + }, + { + "epoch": 1.6228427775512977, + "grad_norm": 6.406929969787598, + "learning_rate": 7.972295828237533e-06, + "loss": 3.1714, + "step": 23885 + }, + { + "epoch": 1.6231824976219595, + "grad_norm": 6.697259426116943, + "learning_rate": 7.971871178149206e-06, + "loss": 3.2302, + "step": 23890 + }, + { + "epoch": 1.6235222176926212, + "grad_norm": 7.70324182510376, + "learning_rate": 7.971446528060879e-06, + "loss": 3.0477, + "step": 23895 + }, + { + "epoch": 1.623861937763283, + "grad_norm": 6.335871696472168, + "learning_rate": 7.971021877972551e-06, + "loss": 3.2053, + "step": 23900 + }, + { + "epoch": 1.6242016578339449, + "grad_norm": 6.450273513793945, + "learning_rate": 7.970597227884224e-06, + "loss": 3.1971, + "step": 23905 + }, + { + "epoch": 1.6245413779046065, + "grad_norm": 6.68673849105835, + "learning_rate": 7.970172577795897e-06, + "loss": 3.2086, + "step": 23910 + }, + { + "epoch": 1.6248810979752684, + "grad_norm": 7.19473123550415, + "learning_rate": 7.96974792770757e-06, + "loss": 3.1635, + "step": 23915 + }, + { + "epoch": 1.6252208180459302, + "grad_norm": 6.793288230895996, + "learning_rate": 7.969323277619243e-06, + "loss": 3.2365, + "step": 23920 + }, + { + "epoch": 1.6255605381165918, + "grad_norm": 5.199896812438965, + "learning_rate": 7.968898627530915e-06, + "loss": 3.1679, + "step": 23925 + }, + { + "epoch": 1.6259002581872537, + "grad_norm": 5.413940906524658, + "learning_rate": 7.968473977442588e-06, + "loss": 3.2266, + "step": 23930 + }, + { + "epoch": 1.6262399782579156, + "grad_norm": 7.658891677856445, + "learning_rate": 7.968049327354261e-06, + "loss": 2.9957, + "step": 23935 + }, + { + "epoch": 1.6265796983285772, + "grad_norm": 5.1363325119018555, + "learning_rate": 7.967624677265934e-06, + "loss": 3.2032, + "step": 23940 + }, + { + "epoch": 1.626919418399239, + "grad_norm": 7.946913242340088, + "learning_rate": 7.967200027177607e-06, + "loss": 3.1543, + "step": 23945 + }, + { + "epoch": 1.627259138469901, + "grad_norm": 6.549281120300293, + "learning_rate": 7.96677537708928e-06, + "loss": 2.872, + "step": 23950 + }, + { + "epoch": 1.6275988585405625, + "grad_norm": 7.048633098602295, + "learning_rate": 7.966350727000952e-06, + "loss": 3.1847, + "step": 23955 + }, + { + "epoch": 1.6279385786112244, + "grad_norm": 5.969107627868652, + "learning_rate": 7.965926076912625e-06, + "loss": 3.317, + "step": 23960 + }, + { + "epoch": 1.6282782986818862, + "grad_norm": 7.691532611846924, + "learning_rate": 7.965501426824296e-06, + "loss": 3.2064, + "step": 23965 + }, + { + "epoch": 1.6286180187525479, + "grad_norm": 6.9124627113342285, + "learning_rate": 7.96507677673597e-06, + "loss": 3.3282, + "step": 23970 + }, + { + "epoch": 1.6289577388232097, + "grad_norm": 6.4560465812683105, + "learning_rate": 7.964652126647643e-06, + "loss": 3.1873, + "step": 23975 + }, + { + "epoch": 1.6292974588938716, + "grad_norm": 7.906497478485107, + "learning_rate": 7.964227476559315e-06, + "loss": 3.14, + "step": 23980 + }, + { + "epoch": 1.6296371789645332, + "grad_norm": 6.838839054107666, + "learning_rate": 7.963802826470989e-06, + "loss": 3.344, + "step": 23985 + }, + { + "epoch": 1.629976899035195, + "grad_norm": 7.95068359375, + "learning_rate": 7.963378176382662e-06, + "loss": 3.2439, + "step": 23990 + }, + { + "epoch": 1.630316619105857, + "grad_norm": 6.498924732208252, + "learning_rate": 7.962953526294333e-06, + "loss": 3.2016, + "step": 23995 + }, + { + "epoch": 1.6306563391765185, + "grad_norm": 6.295480728149414, + "learning_rate": 7.962528876206007e-06, + "loss": 3.2101, + "step": 24000 + }, + { + "epoch": 1.6309960592471802, + "grad_norm": 7.201601028442383, + "learning_rate": 7.96210422611768e-06, + "loss": 3.1621, + "step": 24005 + }, + { + "epoch": 1.6313357793178422, + "grad_norm": 7.133662223815918, + "learning_rate": 7.961679576029351e-06, + "loss": 3.0433, + "step": 24010 + }, + { + "epoch": 1.6316754993885039, + "grad_norm": 5.341712951660156, + "learning_rate": 7.961254925941026e-06, + "loss": 3.0235, + "step": 24015 + }, + { + "epoch": 1.6320152194591655, + "grad_norm": 6.871660232543945, + "learning_rate": 7.960830275852699e-06, + "loss": 3.388, + "step": 24020 + }, + { + "epoch": 1.6323549395298276, + "grad_norm": 6.8448662757873535, + "learning_rate": 7.96040562576437e-06, + "loss": 3.3024, + "step": 24025 + }, + { + "epoch": 1.6326946596004892, + "grad_norm": 6.257900714874268, + "learning_rate": 7.959980975676044e-06, + "loss": 3.2471, + "step": 24030 + }, + { + "epoch": 1.6330343796711508, + "grad_norm": 5.962330341339111, + "learning_rate": 7.959556325587715e-06, + "loss": 3.2451, + "step": 24035 + }, + { + "epoch": 1.633374099741813, + "grad_norm": 6.40329647064209, + "learning_rate": 7.95913167549939e-06, + "loss": 3.1805, + "step": 24040 + }, + { + "epoch": 1.6337138198124745, + "grad_norm": 5.469931125640869, + "learning_rate": 7.958707025411063e-06, + "loss": 2.9142, + "step": 24045 + }, + { + "epoch": 1.6340535398831362, + "grad_norm": 8.1489896774292, + "learning_rate": 7.958282375322734e-06, + "loss": 2.9736, + "step": 24050 + }, + { + "epoch": 1.634393259953798, + "grad_norm": 7.470694065093994, + "learning_rate": 7.957857725234408e-06, + "loss": 2.8517, + "step": 24055 + }, + { + "epoch": 1.6347329800244599, + "grad_norm": 5.943249702453613, + "learning_rate": 7.957433075146081e-06, + "loss": 3.1584, + "step": 24060 + }, + { + "epoch": 1.6350727000951215, + "grad_norm": 6.3284430503845215, + "learning_rate": 7.957008425057752e-06, + "loss": 3.295, + "step": 24065 + }, + { + "epoch": 1.6354124201657834, + "grad_norm": 8.717845916748047, + "learning_rate": 7.956583774969427e-06, + "loss": 3.3184, + "step": 24070 + }, + { + "epoch": 1.6357521402364452, + "grad_norm": 5.642142295837402, + "learning_rate": 7.9561591248811e-06, + "loss": 3.1696, + "step": 24075 + }, + { + "epoch": 1.6360918603071068, + "grad_norm": 6.152289390563965, + "learning_rate": 7.95573447479277e-06, + "loss": 3.3177, + "step": 24080 + }, + { + "epoch": 1.6364315803777687, + "grad_norm": 7.442540645599365, + "learning_rate": 7.955309824704445e-06, + "loss": 3.5198, + "step": 24085 + }, + { + "epoch": 1.6367713004484306, + "grad_norm": 8.58493709564209, + "learning_rate": 7.954885174616118e-06, + "loss": 3.198, + "step": 24090 + }, + { + "epoch": 1.6371110205190922, + "grad_norm": 6.828758716583252, + "learning_rate": 7.954460524527789e-06, + "loss": 3.0855, + "step": 24095 + }, + { + "epoch": 1.637450740589754, + "grad_norm": 6.907014846801758, + "learning_rate": 7.954035874439463e-06, + "loss": 3.1841, + "step": 24100 + }, + { + "epoch": 1.637790460660416, + "grad_norm": 6.915493011474609, + "learning_rate": 7.953611224351136e-06, + "loss": 3.0572, + "step": 24105 + }, + { + "epoch": 1.6381301807310775, + "grad_norm": 7.136019706726074, + "learning_rate": 7.953186574262807e-06, + "loss": 3.04, + "step": 24110 + }, + { + "epoch": 1.6384699008017394, + "grad_norm": 7.59631872177124, + "learning_rate": 7.952761924174482e-06, + "loss": 3.1129, + "step": 24115 + }, + { + "epoch": 1.6388096208724012, + "grad_norm": 7.144615173339844, + "learning_rate": 7.952337274086153e-06, + "loss": 3.001, + "step": 24120 + }, + { + "epoch": 1.6391493409430629, + "grad_norm": 6.972914218902588, + "learning_rate": 7.951912623997826e-06, + "loss": 3.0937, + "step": 24125 + }, + { + "epoch": 1.6394890610137247, + "grad_norm": 6.506470680236816, + "learning_rate": 7.9514879739095e-06, + "loss": 3.19, + "step": 24130 + }, + { + "epoch": 1.6398287810843866, + "grad_norm": 5.459421634674072, + "learning_rate": 7.951063323821171e-06, + "loss": 3.1201, + "step": 24135 + }, + { + "epoch": 1.6401685011550482, + "grad_norm": 7.764454364776611, + "learning_rate": 7.950638673732844e-06, + "loss": 3.2671, + "step": 24140 + }, + { + "epoch": 1.64050822122571, + "grad_norm": 7.343043327331543, + "learning_rate": 7.950214023644519e-06, + "loss": 3.0206, + "step": 24145 + }, + { + "epoch": 1.640847941296372, + "grad_norm": 6.371550559997559, + "learning_rate": 7.94978937355619e-06, + "loss": 3.3264, + "step": 24150 + }, + { + "epoch": 1.6411876613670335, + "grad_norm": 7.214774131774902, + "learning_rate": 7.949364723467863e-06, + "loss": 3.0369, + "step": 24155 + }, + { + "epoch": 1.6415273814376954, + "grad_norm": 5.051723957061768, + "learning_rate": 7.948940073379537e-06, + "loss": 2.829, + "step": 24160 + }, + { + "epoch": 1.6418671015083572, + "grad_norm": 6.537191390991211, + "learning_rate": 7.948515423291208e-06, + "loss": 3.3191, + "step": 24165 + }, + { + "epoch": 1.6422068215790189, + "grad_norm": 7.358672618865967, + "learning_rate": 7.948090773202881e-06, + "loss": 2.9028, + "step": 24170 + }, + { + "epoch": 1.6425465416496805, + "grad_norm": 6.889394760131836, + "learning_rate": 7.947666123114555e-06, + "loss": 3.1936, + "step": 24175 + }, + { + "epoch": 1.6428862617203426, + "grad_norm": 6.765139579772949, + "learning_rate": 7.947241473026227e-06, + "loss": 3.1329, + "step": 24180 + }, + { + "epoch": 1.6432259817910042, + "grad_norm": 8.614166259765625, + "learning_rate": 7.9468168229379e-06, + "loss": 3.2792, + "step": 24185 + }, + { + "epoch": 1.6435657018616658, + "grad_norm": 6.922491550445557, + "learning_rate": 7.946392172849572e-06, + "loss": 2.9192, + "step": 24190 + }, + { + "epoch": 1.643905421932328, + "grad_norm": 6.405961513519287, + "learning_rate": 7.945967522761245e-06, + "loss": 3.1523, + "step": 24195 + }, + { + "epoch": 1.6442451420029895, + "grad_norm": 7.145602703094482, + "learning_rate": 7.945542872672918e-06, + "loss": 3.3645, + "step": 24200 + }, + { + "epoch": 1.6445848620736512, + "grad_norm": 7.508849620819092, + "learning_rate": 7.94511822258459e-06, + "loss": 3.2713, + "step": 24205 + }, + { + "epoch": 1.6449245821443133, + "grad_norm": 5.21617317199707, + "learning_rate": 7.944693572496263e-06, + "loss": 3.052, + "step": 24210 + }, + { + "epoch": 1.6452643022149749, + "grad_norm": 7.435060024261475, + "learning_rate": 7.944268922407936e-06, + "loss": 3.1616, + "step": 24215 + }, + { + "epoch": 1.6456040222856365, + "grad_norm": 6.896716594696045, + "learning_rate": 7.943844272319609e-06, + "loss": 3.137, + "step": 24220 + }, + { + "epoch": 1.6459437423562984, + "grad_norm": 5.607396125793457, + "learning_rate": 7.943419622231282e-06, + "loss": 3.15, + "step": 24225 + }, + { + "epoch": 1.6462834624269602, + "grad_norm": 9.100713729858398, + "learning_rate": 7.942994972142955e-06, + "loss": 3.1818, + "step": 24230 + }, + { + "epoch": 1.6466231824976219, + "grad_norm": 7.410979270935059, + "learning_rate": 7.942570322054627e-06, + "loss": 3.0551, + "step": 24235 + }, + { + "epoch": 1.6469629025682837, + "grad_norm": 5.770133018493652, + "learning_rate": 7.9421456719663e-06, + "loss": 3.2989, + "step": 24240 + }, + { + "epoch": 1.6473026226389456, + "grad_norm": 6.969286918640137, + "learning_rate": 7.941721021877973e-06, + "loss": 3.0771, + "step": 24245 + }, + { + "epoch": 1.6476423427096072, + "grad_norm": 6.168772220611572, + "learning_rate": 7.941296371789646e-06, + "loss": 3.2736, + "step": 24250 + }, + { + "epoch": 1.647982062780269, + "grad_norm": 7.236126899719238, + "learning_rate": 7.940871721701319e-06, + "loss": 3.0529, + "step": 24255 + }, + { + "epoch": 1.648321782850931, + "grad_norm": 7.260137557983398, + "learning_rate": 7.940447071612991e-06, + "loss": 3.2731, + "step": 24260 + }, + { + "epoch": 1.6486615029215925, + "grad_norm": 5.929930210113525, + "learning_rate": 7.940022421524664e-06, + "loss": 3.3057, + "step": 24265 + }, + { + "epoch": 1.6490012229922544, + "grad_norm": 9.884267807006836, + "learning_rate": 7.939597771436337e-06, + "loss": 3.0882, + "step": 24270 + }, + { + "epoch": 1.6493409430629162, + "grad_norm": 6.609869480133057, + "learning_rate": 7.93917312134801e-06, + "loss": 3.1534, + "step": 24275 + }, + { + "epoch": 1.6496806631335779, + "grad_norm": 7.919809341430664, + "learning_rate": 7.938748471259683e-06, + "loss": 3.2782, + "step": 24280 + }, + { + "epoch": 1.6500203832042397, + "grad_norm": 6.050290107727051, + "learning_rate": 7.938323821171355e-06, + "loss": 3.0587, + "step": 24285 + }, + { + "epoch": 1.6503601032749016, + "grad_norm": 8.94149398803711, + "learning_rate": 7.937899171083028e-06, + "loss": 3.0746, + "step": 24290 + }, + { + "epoch": 1.6506998233455632, + "grad_norm": 7.356546401977539, + "learning_rate": 7.937474520994701e-06, + "loss": 3.3658, + "step": 24295 + }, + { + "epoch": 1.651039543416225, + "grad_norm": 5.267816543579102, + "learning_rate": 7.937049870906374e-06, + "loss": 2.9957, + "step": 24300 + }, + { + "epoch": 1.651379263486887, + "grad_norm": 6.800343036651611, + "learning_rate": 7.936625220818047e-06, + "loss": 3.2235, + "step": 24305 + }, + { + "epoch": 1.6517189835575485, + "grad_norm": 8.182849884033203, + "learning_rate": 7.93620057072972e-06, + "loss": 3.2775, + "step": 24310 + }, + { + "epoch": 1.6520587036282104, + "grad_norm": 6.575831413269043, + "learning_rate": 7.935775920641392e-06, + "loss": 3.2452, + "step": 24315 + }, + { + "epoch": 1.6523984236988722, + "grad_norm": 8.869951248168945, + "learning_rate": 7.935351270553065e-06, + "loss": 3.0695, + "step": 24320 + }, + { + "epoch": 1.6527381437695339, + "grad_norm": 6.813230514526367, + "learning_rate": 7.934926620464738e-06, + "loss": 3.0791, + "step": 24325 + }, + { + "epoch": 1.6530778638401957, + "grad_norm": 8.392790794372559, + "learning_rate": 7.93450197037641e-06, + "loss": 3.0909, + "step": 24330 + }, + { + "epoch": 1.6534175839108576, + "grad_norm": 6.368007183074951, + "learning_rate": 7.934077320288083e-06, + "loss": 3.2268, + "step": 24335 + }, + { + "epoch": 1.6537573039815192, + "grad_norm": 6.154805660247803, + "learning_rate": 7.933652670199756e-06, + "loss": 2.9118, + "step": 24340 + }, + { + "epoch": 1.6540970240521808, + "grad_norm": 6.222280025482178, + "learning_rate": 7.933228020111429e-06, + "loss": 3.2308, + "step": 24345 + }, + { + "epoch": 1.654436744122843, + "grad_norm": 7.985805034637451, + "learning_rate": 7.932803370023102e-06, + "loss": 3.215, + "step": 24350 + }, + { + "epoch": 1.6547764641935045, + "grad_norm": 7.437739372253418, + "learning_rate": 7.932378719934775e-06, + "loss": 3.2139, + "step": 24355 + }, + { + "epoch": 1.6551161842641662, + "grad_norm": 6.462048530578613, + "learning_rate": 7.931954069846447e-06, + "loss": 3.1913, + "step": 24360 + }, + { + "epoch": 1.6554559043348283, + "grad_norm": 6.886509418487549, + "learning_rate": 7.93152941975812e-06, + "loss": 3.1365, + "step": 24365 + }, + { + "epoch": 1.6557956244054899, + "grad_norm": 6.735101222991943, + "learning_rate": 7.931104769669793e-06, + "loss": 3.2299, + "step": 24370 + }, + { + "epoch": 1.6561353444761515, + "grad_norm": 6.081961154937744, + "learning_rate": 7.930680119581466e-06, + "loss": 2.9969, + "step": 24375 + }, + { + "epoch": 1.6564750645468136, + "grad_norm": 5.628035545349121, + "learning_rate": 7.930255469493139e-06, + "loss": 2.9682, + "step": 24380 + }, + { + "epoch": 1.6568147846174752, + "grad_norm": 7.124545097351074, + "learning_rate": 7.929830819404811e-06, + "loss": 3.4189, + "step": 24385 + }, + { + "epoch": 1.6571545046881369, + "grad_norm": 7.313871383666992, + "learning_rate": 7.929406169316484e-06, + "loss": 3.0946, + "step": 24390 + }, + { + "epoch": 1.6574942247587987, + "grad_norm": 7.344930648803711, + "learning_rate": 7.928981519228157e-06, + "loss": 3.3231, + "step": 24395 + }, + { + "epoch": 1.6578339448294606, + "grad_norm": 6.134850978851318, + "learning_rate": 7.92855686913983e-06, + "loss": 3.3261, + "step": 24400 + }, + { + "epoch": 1.6581736649001222, + "grad_norm": 7.111206531524658, + "learning_rate": 7.928132219051503e-06, + "loss": 3.1168, + "step": 24405 + }, + { + "epoch": 1.658513384970784, + "grad_norm": 7.83414363861084, + "learning_rate": 7.927707568963175e-06, + "loss": 3.2747, + "step": 24410 + }, + { + "epoch": 1.658853105041446, + "grad_norm": 7.581506252288818, + "learning_rate": 7.927282918874848e-06, + "loss": 3.2161, + "step": 24415 + }, + { + "epoch": 1.6591928251121075, + "grad_norm": 7.298463821411133, + "learning_rate": 7.926858268786521e-06, + "loss": 3.0526, + "step": 24420 + }, + { + "epoch": 1.6595325451827694, + "grad_norm": 6.4980340003967285, + "learning_rate": 7.926433618698194e-06, + "loss": 3.1516, + "step": 24425 + }, + { + "epoch": 1.6598722652534312, + "grad_norm": 6.377585411071777, + "learning_rate": 7.926008968609867e-06, + "loss": 3.3342, + "step": 24430 + }, + { + "epoch": 1.6602119853240929, + "grad_norm": 6.681432723999023, + "learning_rate": 7.92558431852154e-06, + "loss": 3.1872, + "step": 24435 + }, + { + "epoch": 1.6605517053947547, + "grad_norm": 9.409080505371094, + "learning_rate": 7.925159668433212e-06, + "loss": 3.2245, + "step": 24440 + }, + { + "epoch": 1.6608914254654166, + "grad_norm": 5.642204761505127, + "learning_rate": 7.924735018344885e-06, + "loss": 3.0474, + "step": 24445 + }, + { + "epoch": 1.6612311455360782, + "grad_norm": 6.448130130767822, + "learning_rate": 7.924310368256558e-06, + "loss": 3.3122, + "step": 24450 + }, + { + "epoch": 1.66157086560674, + "grad_norm": 7.271479606628418, + "learning_rate": 7.92388571816823e-06, + "loss": 3.053, + "step": 24455 + }, + { + "epoch": 1.661910585677402, + "grad_norm": 6.252056121826172, + "learning_rate": 7.923461068079903e-06, + "loss": 3.2443, + "step": 24460 + }, + { + "epoch": 1.6622503057480635, + "grad_norm": 8.184598922729492, + "learning_rate": 7.923036417991575e-06, + "loss": 3.1415, + "step": 24465 + }, + { + "epoch": 1.6625900258187254, + "grad_norm": 8.948588371276855, + "learning_rate": 7.922611767903249e-06, + "loss": 3.2888, + "step": 24470 + }, + { + "epoch": 1.6629297458893872, + "grad_norm": 7.117583751678467, + "learning_rate": 7.922187117814922e-06, + "loss": 3.3094, + "step": 24475 + }, + { + "epoch": 1.6632694659600489, + "grad_norm": 6.17196798324585, + "learning_rate": 7.921762467726593e-06, + "loss": 3.2988, + "step": 24480 + }, + { + "epoch": 1.6636091860307107, + "grad_norm": 8.362027168273926, + "learning_rate": 7.921337817638267e-06, + "loss": 3.0262, + "step": 24485 + }, + { + "epoch": 1.6639489061013726, + "grad_norm": 6.914194583892822, + "learning_rate": 7.92091316754994e-06, + "loss": 3.2174, + "step": 24490 + }, + { + "epoch": 1.6642886261720342, + "grad_norm": 8.273885726928711, + "learning_rate": 7.920488517461611e-06, + "loss": 3.0724, + "step": 24495 + }, + { + "epoch": 1.664628346242696, + "grad_norm": 6.621171474456787, + "learning_rate": 7.920063867373286e-06, + "loss": 3.3734, + "step": 24500 + }, + { + "epoch": 1.664968066313358, + "grad_norm": 6.36458158493042, + "learning_rate": 7.919639217284959e-06, + "loss": 3.058, + "step": 24505 + }, + { + "epoch": 1.6653077863840196, + "grad_norm": 6.466413497924805, + "learning_rate": 7.91921456719663e-06, + "loss": 3.1101, + "step": 24510 + }, + { + "epoch": 1.6656475064546812, + "grad_norm": 6.025920391082764, + "learning_rate": 7.918789917108304e-06, + "loss": 2.9796, + "step": 24515 + }, + { + "epoch": 1.6659872265253433, + "grad_norm": 7.144632339477539, + "learning_rate": 7.918365267019977e-06, + "loss": 3.1099, + "step": 24520 + }, + { + "epoch": 1.6663269465960049, + "grad_norm": 5.520550727844238, + "learning_rate": 7.917940616931648e-06, + "loss": 3.2242, + "step": 24525 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 6.904483795166016, + "learning_rate": 7.917515966843323e-06, + "loss": 3.2239, + "step": 24530 + }, + { + "epoch": 1.6670063867373286, + "grad_norm": 7.849203109741211, + "learning_rate": 7.917091316754994e-06, + "loss": 3.336, + "step": 24535 + }, + { + "epoch": 1.6673461068079902, + "grad_norm": 7.025584697723389, + "learning_rate": 7.916666666666667e-06, + "loss": 3.1435, + "step": 24540 + }, + { + "epoch": 1.6676858268786519, + "grad_norm": 8.100563049316406, + "learning_rate": 7.916242016578341e-06, + "loss": 3.1815, + "step": 24545 + }, + { + "epoch": 1.668025546949314, + "grad_norm": 8.628182411193848, + "learning_rate": 7.915817366490012e-06, + "loss": 3.2363, + "step": 24550 + }, + { + "epoch": 1.6683652670199756, + "grad_norm": 7.150721549987793, + "learning_rate": 7.915392716401685e-06, + "loss": 3.4142, + "step": 24555 + }, + { + "epoch": 1.6687049870906372, + "grad_norm": 6.300673961639404, + "learning_rate": 7.91496806631336e-06, + "loss": 3.2036, + "step": 24560 + }, + { + "epoch": 1.669044707161299, + "grad_norm": 5.330132961273193, + "learning_rate": 7.91454341622503e-06, + "loss": 3.2237, + "step": 24565 + }, + { + "epoch": 1.669384427231961, + "grad_norm": 5.5084757804870605, + "learning_rate": 7.914118766136703e-06, + "loss": 3.403, + "step": 24570 + }, + { + "epoch": 1.6697241473026225, + "grad_norm": 6.2711873054504395, + "learning_rate": 7.913694116048378e-06, + "loss": 3.049, + "step": 24575 + }, + { + "epoch": 1.6700638673732844, + "grad_norm": 8.211346626281738, + "learning_rate": 7.913269465960049e-06, + "loss": 3.2064, + "step": 24580 + }, + { + "epoch": 1.6704035874439462, + "grad_norm": 6.698671340942383, + "learning_rate": 7.912844815871722e-06, + "loss": 3.0305, + "step": 24585 + }, + { + "epoch": 1.6707433075146079, + "grad_norm": 6.721352577209473, + "learning_rate": 7.912420165783396e-06, + "loss": 2.9809, + "step": 24590 + }, + { + "epoch": 1.6710830275852697, + "grad_norm": 5.114359378814697, + "learning_rate": 7.911995515695067e-06, + "loss": 3.0619, + "step": 24595 + }, + { + "epoch": 1.6714227476559316, + "grad_norm": 6.251538276672363, + "learning_rate": 7.91157086560674e-06, + "loss": 3.0338, + "step": 24600 + }, + { + "epoch": 1.6717624677265932, + "grad_norm": 7.249123573303223, + "learning_rate": 7.911146215518413e-06, + "loss": 3.1999, + "step": 24605 + }, + { + "epoch": 1.672102187797255, + "grad_norm": 6.955320358276367, + "learning_rate": 7.910721565430086e-06, + "loss": 3.2683, + "step": 24610 + }, + { + "epoch": 1.672441907867917, + "grad_norm": 7.566341876983643, + "learning_rate": 7.910296915341759e-06, + "loss": 3.4241, + "step": 24615 + }, + { + "epoch": 1.6727816279385785, + "grad_norm": 7.786848545074463, + "learning_rate": 7.909872265253431e-06, + "loss": 2.9319, + "step": 24620 + }, + { + "epoch": 1.6731213480092404, + "grad_norm": 8.293450355529785, + "learning_rate": 7.909447615165104e-06, + "loss": 3.2584, + "step": 24625 + }, + { + "epoch": 1.6734610680799022, + "grad_norm": 5.8620500564575195, + "learning_rate": 7.909022965076777e-06, + "loss": 3.2683, + "step": 24630 + }, + { + "epoch": 1.6738007881505639, + "grad_norm": 6.0824055671691895, + "learning_rate": 7.90859831498845e-06, + "loss": 3.1872, + "step": 24635 + }, + { + "epoch": 1.6741405082212257, + "grad_norm": 6.719066143035889, + "learning_rate": 7.908173664900123e-06, + "loss": 3.2091, + "step": 24640 + }, + { + "epoch": 1.6744802282918876, + "grad_norm": 6.172815799713135, + "learning_rate": 7.907749014811795e-06, + "loss": 3.2005, + "step": 24645 + }, + { + "epoch": 1.6748199483625492, + "grad_norm": 5.804276943206787, + "learning_rate": 7.907324364723468e-06, + "loss": 2.9846, + "step": 24650 + }, + { + "epoch": 1.675159668433211, + "grad_norm": 6.536708831787109, + "learning_rate": 7.906899714635141e-06, + "loss": 3.2006, + "step": 24655 + }, + { + "epoch": 1.675499388503873, + "grad_norm": 6.014743328094482, + "learning_rate": 7.906475064546814e-06, + "loss": 3.1048, + "step": 24660 + }, + { + "epoch": 1.6758391085745346, + "grad_norm": 7.966916084289551, + "learning_rate": 7.906050414458487e-06, + "loss": 3.2707, + "step": 24665 + }, + { + "epoch": 1.6761788286451964, + "grad_norm": 9.642778396606445, + "learning_rate": 7.90562576437016e-06, + "loss": 3.1984, + "step": 24670 + }, + { + "epoch": 1.6765185487158583, + "grad_norm": 6.506056785583496, + "learning_rate": 7.905201114281832e-06, + "loss": 3.2277, + "step": 24675 + }, + { + "epoch": 1.67685826878652, + "grad_norm": 5.977292537689209, + "learning_rate": 7.904776464193505e-06, + "loss": 3.0704, + "step": 24680 + }, + { + "epoch": 1.6771979888571815, + "grad_norm": 7.6394362449646, + "learning_rate": 7.904351814105178e-06, + "loss": 3.4013, + "step": 24685 + }, + { + "epoch": 1.6775377089278436, + "grad_norm": 7.024845123291016, + "learning_rate": 7.90392716401685e-06, + "loss": 3.1708, + "step": 24690 + }, + { + "epoch": 1.6778774289985052, + "grad_norm": 8.183586120605469, + "learning_rate": 7.903502513928523e-06, + "loss": 3.2648, + "step": 24695 + }, + { + "epoch": 1.6782171490691669, + "grad_norm": 6.932861804962158, + "learning_rate": 7.903077863840196e-06, + "loss": 3.1622, + "step": 24700 + }, + { + "epoch": 1.678556869139829, + "grad_norm": 7.588301658630371, + "learning_rate": 7.902653213751869e-06, + "loss": 3.2219, + "step": 24705 + }, + { + "epoch": 1.6788965892104906, + "grad_norm": 5.871506214141846, + "learning_rate": 7.902228563663542e-06, + "loss": 3.1325, + "step": 24710 + }, + { + "epoch": 1.6792363092811522, + "grad_norm": 6.468436241149902, + "learning_rate": 7.901803913575215e-06, + "loss": 3.3399, + "step": 24715 + }, + { + "epoch": 1.6795760293518143, + "grad_norm": 6.506476402282715, + "learning_rate": 7.901379263486887e-06, + "loss": 3.1181, + "step": 24720 + }, + { + "epoch": 1.679915749422476, + "grad_norm": 6.498031139373779, + "learning_rate": 7.90095461339856e-06, + "loss": 3.1124, + "step": 24725 + }, + { + "epoch": 1.6802554694931375, + "grad_norm": 8.274836540222168, + "learning_rate": 7.900529963310233e-06, + "loss": 3.1767, + "step": 24730 + }, + { + "epoch": 1.6805951895637994, + "grad_norm": 7.886473178863525, + "learning_rate": 7.900105313221906e-06, + "loss": 3.1741, + "step": 24735 + }, + { + "epoch": 1.6809349096344612, + "grad_norm": 5.626759052276611, + "learning_rate": 7.899680663133579e-06, + "loss": 3.1039, + "step": 24740 + }, + { + "epoch": 1.6812746297051229, + "grad_norm": 6.803172588348389, + "learning_rate": 7.899256013045251e-06, + "loss": 3.3883, + "step": 24745 + }, + { + "epoch": 1.6816143497757847, + "grad_norm": 7.126495838165283, + "learning_rate": 7.898831362956924e-06, + "loss": 3.2004, + "step": 24750 + }, + { + "epoch": 1.6819540698464466, + "grad_norm": 7.747750759124756, + "learning_rate": 7.898406712868597e-06, + "loss": 3.1473, + "step": 24755 + }, + { + "epoch": 1.6822937899171082, + "grad_norm": 6.757997989654541, + "learning_rate": 7.89798206278027e-06, + "loss": 3.0531, + "step": 24760 + }, + { + "epoch": 1.68263350998777, + "grad_norm": 6.359602928161621, + "learning_rate": 7.897557412691943e-06, + "loss": 2.7592, + "step": 24765 + }, + { + "epoch": 1.682973230058432, + "grad_norm": 6.964632987976074, + "learning_rate": 7.897132762603615e-06, + "loss": 3.0694, + "step": 24770 + }, + { + "epoch": 1.6833129501290935, + "grad_norm": 6.96670389175415, + "learning_rate": 7.896708112515288e-06, + "loss": 3.1821, + "step": 24775 + }, + { + "epoch": 1.6836526701997554, + "grad_norm": 6.800308704376221, + "learning_rate": 7.896283462426961e-06, + "loss": 3.059, + "step": 24780 + }, + { + "epoch": 1.6839923902704172, + "grad_norm": 7.354246616363525, + "learning_rate": 7.895858812338634e-06, + "loss": 3.0935, + "step": 24785 + }, + { + "epoch": 1.6843321103410789, + "grad_norm": 5.980794906616211, + "learning_rate": 7.895434162250307e-06, + "loss": 2.9999, + "step": 24790 + }, + { + "epoch": 1.6846718304117407, + "grad_norm": 6.246969699859619, + "learning_rate": 7.89500951216198e-06, + "loss": 3.2626, + "step": 24795 + }, + { + "epoch": 1.6850115504824026, + "grad_norm": 6.262210369110107, + "learning_rate": 7.894584862073652e-06, + "loss": 3.0582, + "step": 24800 + }, + { + "epoch": 1.6853512705530642, + "grad_norm": 8.603456497192383, + "learning_rate": 7.894160211985325e-06, + "loss": 2.9888, + "step": 24805 + }, + { + "epoch": 1.685690990623726, + "grad_norm": 9.48288345336914, + "learning_rate": 7.893735561896998e-06, + "loss": 2.8337, + "step": 24810 + }, + { + "epoch": 1.686030710694388, + "grad_norm": 5.990634918212891, + "learning_rate": 7.89331091180867e-06, + "loss": 3.218, + "step": 24815 + }, + { + "epoch": 1.6863704307650496, + "grad_norm": 6.321527481079102, + "learning_rate": 7.892886261720343e-06, + "loss": 3.1071, + "step": 24820 + }, + { + "epoch": 1.6867101508357114, + "grad_norm": 6.377739906311035, + "learning_rate": 7.892461611632016e-06, + "loss": 3.1633, + "step": 24825 + }, + { + "epoch": 1.6870498709063733, + "grad_norm": 7.334214210510254, + "learning_rate": 7.892036961543689e-06, + "loss": 3.3779, + "step": 24830 + }, + { + "epoch": 1.687389590977035, + "grad_norm": 5.705536365509033, + "learning_rate": 7.891612311455362e-06, + "loss": 3.0893, + "step": 24835 + }, + { + "epoch": 1.6877293110476967, + "grad_norm": 6.339046955108643, + "learning_rate": 7.891187661367035e-06, + "loss": 3.218, + "step": 24840 + }, + { + "epoch": 1.6880690311183586, + "grad_norm": 7.118504524230957, + "learning_rate": 7.890763011278707e-06, + "loss": 3.2192, + "step": 24845 + }, + { + "epoch": 1.6884087511890202, + "grad_norm": 8.709249496459961, + "learning_rate": 7.89033836119038e-06, + "loss": 3.0232, + "step": 24850 + }, + { + "epoch": 1.6887484712596819, + "grad_norm": 6.205137252807617, + "learning_rate": 7.889913711102053e-06, + "loss": 3.1655, + "step": 24855 + }, + { + "epoch": 1.689088191330344, + "grad_norm": 7.072819709777832, + "learning_rate": 7.889489061013726e-06, + "loss": 3.2074, + "step": 24860 + }, + { + "epoch": 1.6894279114010056, + "grad_norm": 5.966200828552246, + "learning_rate": 7.889064410925399e-06, + "loss": 2.9976, + "step": 24865 + }, + { + "epoch": 1.6897676314716672, + "grad_norm": 6.486223220825195, + "learning_rate": 7.888639760837071e-06, + "loss": 2.9914, + "step": 24870 + }, + { + "epoch": 1.6901073515423293, + "grad_norm": 5.427412986755371, + "learning_rate": 7.888215110748744e-06, + "loss": 3.2371, + "step": 24875 + }, + { + "epoch": 1.690447071612991, + "grad_norm": 5.545830726623535, + "learning_rate": 7.887790460660415e-06, + "loss": 3.0668, + "step": 24880 + }, + { + "epoch": 1.6907867916836525, + "grad_norm": 6.849703311920166, + "learning_rate": 7.88736581057209e-06, + "loss": 2.9936, + "step": 24885 + }, + { + "epoch": 1.6911265117543146, + "grad_norm": 6.792960166931152, + "learning_rate": 7.886941160483763e-06, + "loss": 3.1692, + "step": 24890 + }, + { + "epoch": 1.6914662318249762, + "grad_norm": 8.368699073791504, + "learning_rate": 7.886516510395434e-06, + "loss": 3.211, + "step": 24895 + }, + { + "epoch": 1.6918059518956379, + "grad_norm": 6.6526288986206055, + "learning_rate": 7.886091860307108e-06, + "loss": 3.0152, + "step": 24900 + }, + { + "epoch": 1.6921456719662997, + "grad_norm": 7.201880931854248, + "learning_rate": 7.885667210218781e-06, + "loss": 3.0448, + "step": 24905 + }, + { + "epoch": 1.6924853920369616, + "grad_norm": 6.462223052978516, + "learning_rate": 7.885242560130452e-06, + "loss": 3.1846, + "step": 24910 + }, + { + "epoch": 1.6928251121076232, + "grad_norm": 8.105114936828613, + "learning_rate": 7.884817910042127e-06, + "loss": 3.1694, + "step": 24915 + }, + { + "epoch": 1.693164832178285, + "grad_norm": 6.158847808837891, + "learning_rate": 7.8843932599538e-06, + "loss": 3.1391, + "step": 24920 + }, + { + "epoch": 1.693504552248947, + "grad_norm": 6.209299087524414, + "learning_rate": 7.88396860986547e-06, + "loss": 3.4261, + "step": 24925 + }, + { + "epoch": 1.6938442723196085, + "grad_norm": 6.399814128875732, + "learning_rate": 7.883543959777145e-06, + "loss": 2.8092, + "step": 24930 + }, + { + "epoch": 1.6941839923902704, + "grad_norm": 5.9130682945251465, + "learning_rate": 7.883119309688818e-06, + "loss": 3.3418, + "step": 24935 + }, + { + "epoch": 1.6945237124609323, + "grad_norm": 7.215877532958984, + "learning_rate": 7.882694659600489e-06, + "loss": 3.0372, + "step": 24940 + }, + { + "epoch": 1.6948634325315939, + "grad_norm": 6.797908306121826, + "learning_rate": 7.882270009512163e-06, + "loss": 3.1199, + "step": 24945 + }, + { + "epoch": 1.6952031526022557, + "grad_norm": 8.87006950378418, + "learning_rate": 7.881845359423835e-06, + "loss": 3.2758, + "step": 24950 + }, + { + "epoch": 1.6955428726729176, + "grad_norm": 5.536745071411133, + "learning_rate": 7.881420709335507e-06, + "loss": 3.2166, + "step": 24955 + }, + { + "epoch": 1.6958825927435792, + "grad_norm": 6.985620498657227, + "learning_rate": 7.880996059247182e-06, + "loss": 2.9684, + "step": 24960 + }, + { + "epoch": 1.696222312814241, + "grad_norm": 5.990126132965088, + "learning_rate": 7.880571409158853e-06, + "loss": 3.0564, + "step": 24965 + }, + { + "epoch": 1.696562032884903, + "grad_norm": 5.475201606750488, + "learning_rate": 7.880146759070526e-06, + "loss": 3.2227, + "step": 24970 + }, + { + "epoch": 1.6969017529555646, + "grad_norm": 5.6128458976745605, + "learning_rate": 7.8797221089822e-06, + "loss": 3.1563, + "step": 24975 + }, + { + "epoch": 1.6972414730262264, + "grad_norm": 7.765176296234131, + "learning_rate": 7.879297458893871e-06, + "loss": 3.202, + "step": 24980 + }, + { + "epoch": 1.6975811930968883, + "grad_norm": 7.721471786499023, + "learning_rate": 7.878872808805544e-06, + "loss": 2.9419, + "step": 24985 + }, + { + "epoch": 1.69792091316755, + "grad_norm": 7.175419807434082, + "learning_rate": 7.878448158717219e-06, + "loss": 3.0333, + "step": 24990 + }, + { + "epoch": 1.6982606332382117, + "grad_norm": 7.424148082733154, + "learning_rate": 7.87802350862889e-06, + "loss": 3.3824, + "step": 24995 + }, + { + "epoch": 1.6986003533088736, + "grad_norm": 6.4961323738098145, + "learning_rate": 7.877598858540563e-06, + "loss": 3.2571, + "step": 25000 + }, + { + "epoch": 1.6989400733795352, + "grad_norm": 6.000683784484863, + "learning_rate": 7.877174208452237e-06, + "loss": 3.1221, + "step": 25005 + }, + { + "epoch": 1.699279793450197, + "grad_norm": 8.178991317749023, + "learning_rate": 7.876749558363908e-06, + "loss": 3.3213, + "step": 25010 + }, + { + "epoch": 1.699619513520859, + "grad_norm": 7.763392925262451, + "learning_rate": 7.876324908275581e-06, + "loss": 2.8575, + "step": 25015 + }, + { + "epoch": 1.6999592335915206, + "grad_norm": 7.659182548522949, + "learning_rate": 7.875900258187255e-06, + "loss": 3.3223, + "step": 25020 + }, + { + "epoch": 1.7002989536621822, + "grad_norm": 8.78338623046875, + "learning_rate": 7.875475608098927e-06, + "loss": 3.2074, + "step": 25025 + }, + { + "epoch": 1.7006386737328443, + "grad_norm": 6.046013832092285, + "learning_rate": 7.8750509580106e-06, + "loss": 3.0881, + "step": 25030 + }, + { + "epoch": 1.700978393803506, + "grad_norm": 6.233828067779541, + "learning_rate": 7.874626307922272e-06, + "loss": 2.8115, + "step": 25035 + }, + { + "epoch": 1.7013181138741675, + "grad_norm": 6.90580940246582, + "learning_rate": 7.874201657833945e-06, + "loss": 3.261, + "step": 25040 + }, + { + "epoch": 1.7016578339448296, + "grad_norm": 7.760556697845459, + "learning_rate": 7.873777007745618e-06, + "loss": 3.0649, + "step": 25045 + }, + { + "epoch": 1.7019975540154912, + "grad_norm": 7.465638637542725, + "learning_rate": 7.87335235765729e-06, + "loss": 3.0662, + "step": 25050 + }, + { + "epoch": 1.7023372740861529, + "grad_norm": 8.243617057800293, + "learning_rate": 7.872927707568963e-06, + "loss": 3.1735, + "step": 25055 + }, + { + "epoch": 1.702676994156815, + "grad_norm": 7.093069076538086, + "learning_rate": 7.872503057480638e-06, + "loss": 3.0435, + "step": 25060 + }, + { + "epoch": 1.7030167142274766, + "grad_norm": 5.938830852508545, + "learning_rate": 7.872078407392309e-06, + "loss": 2.8668, + "step": 25065 + }, + { + "epoch": 1.7033564342981382, + "grad_norm": 6.248878002166748, + "learning_rate": 7.871653757303982e-06, + "loss": 3.2415, + "step": 25070 + }, + { + "epoch": 1.7036961543688, + "grad_norm": 5.989567756652832, + "learning_rate": 7.871229107215656e-06, + "loss": 2.9015, + "step": 25075 + }, + { + "epoch": 1.704035874439462, + "grad_norm": 6.278524875640869, + "learning_rate": 7.870804457127327e-06, + "loss": 3.2009, + "step": 25080 + }, + { + "epoch": 1.7043755945101235, + "grad_norm": 7.736875534057617, + "learning_rate": 7.870379807039e-06, + "loss": 3.3592, + "step": 25085 + }, + { + "epoch": 1.7047153145807854, + "grad_norm": 5.819395542144775, + "learning_rate": 7.869955156950675e-06, + "loss": 3.1384, + "step": 25090 + }, + { + "epoch": 1.7050550346514473, + "grad_norm": 6.494046211242676, + "learning_rate": 7.869530506862346e-06, + "loss": 3.2941, + "step": 25095 + }, + { + "epoch": 1.7053947547221089, + "grad_norm": 5.624703407287598, + "learning_rate": 7.869105856774019e-06, + "loss": 3.2051, + "step": 25100 + }, + { + "epoch": 1.7057344747927707, + "grad_norm": 7.00847864151001, + "learning_rate": 7.868681206685691e-06, + "loss": 3.2407, + "step": 25105 + }, + { + "epoch": 1.7060741948634326, + "grad_norm": 4.881647109985352, + "learning_rate": 7.868256556597364e-06, + "loss": 3.0623, + "step": 25110 + }, + { + "epoch": 1.7064139149340942, + "grad_norm": 5.8037333488464355, + "learning_rate": 7.867831906509037e-06, + "loss": 3.0725, + "step": 25115 + }, + { + "epoch": 1.706753635004756, + "grad_norm": 7.623387336730957, + "learning_rate": 7.86740725642071e-06, + "loss": 3.0237, + "step": 25120 + }, + { + "epoch": 1.707093355075418, + "grad_norm": 8.616981506347656, + "learning_rate": 7.866982606332383e-06, + "loss": 3.1399, + "step": 25125 + }, + { + "epoch": 1.7074330751460796, + "grad_norm": 6.15738582611084, + "learning_rate": 7.866557956244055e-06, + "loss": 3.3832, + "step": 25130 + }, + { + "epoch": 1.7077727952167414, + "grad_norm": 6.350391864776611, + "learning_rate": 7.866133306155728e-06, + "loss": 3.0433, + "step": 25135 + }, + { + "epoch": 1.7081125152874033, + "grad_norm": 6.646370887756348, + "learning_rate": 7.865708656067401e-06, + "loss": 3.4085, + "step": 25140 + }, + { + "epoch": 1.708452235358065, + "grad_norm": 10.820255279541016, + "learning_rate": 7.865284005979074e-06, + "loss": 3.1268, + "step": 25145 + }, + { + "epoch": 1.7087919554287267, + "grad_norm": 7.898733139038086, + "learning_rate": 7.864859355890747e-06, + "loss": 3.5302, + "step": 25150 + }, + { + "epoch": 1.7091316754993886, + "grad_norm": 6.782475471496582, + "learning_rate": 7.86443470580242e-06, + "loss": 3.2812, + "step": 25155 + }, + { + "epoch": 1.7094713955700502, + "grad_norm": 7.2061944007873535, + "learning_rate": 7.864010055714092e-06, + "loss": 3.1311, + "step": 25160 + }, + { + "epoch": 1.709811115640712, + "grad_norm": 6.733331680297852, + "learning_rate": 7.863585405625765e-06, + "loss": 3.0615, + "step": 25165 + }, + { + "epoch": 1.710150835711374, + "grad_norm": 7.304823875427246, + "learning_rate": 7.863160755537438e-06, + "loss": 3.1773, + "step": 25170 + }, + { + "epoch": 1.7104905557820356, + "grad_norm": 7.20900821685791, + "learning_rate": 7.86273610544911e-06, + "loss": 3.3176, + "step": 25175 + }, + { + "epoch": 1.7108302758526974, + "grad_norm": 7.147749900817871, + "learning_rate": 7.862311455360783e-06, + "loss": 3.2314, + "step": 25180 + }, + { + "epoch": 1.7111699959233593, + "grad_norm": 6.453732490539551, + "learning_rate": 7.861886805272456e-06, + "loss": 3.3173, + "step": 25185 + }, + { + "epoch": 1.711509715994021, + "grad_norm": 10.46760082244873, + "learning_rate": 7.861462155184129e-06, + "loss": 3.2948, + "step": 25190 + }, + { + "epoch": 1.7118494360646825, + "grad_norm": 7.630192279815674, + "learning_rate": 7.861037505095802e-06, + "loss": 3.3445, + "step": 25195 + }, + { + "epoch": 1.7121891561353446, + "grad_norm": 6.658563613891602, + "learning_rate": 7.860612855007475e-06, + "loss": 3.0898, + "step": 25200 + }, + { + "epoch": 1.7125288762060062, + "grad_norm": 6.829784393310547, + "learning_rate": 7.860188204919147e-06, + "loss": 3.0766, + "step": 25205 + }, + { + "epoch": 1.7128685962766679, + "grad_norm": 7.109785079956055, + "learning_rate": 7.85976355483082e-06, + "loss": 3.1906, + "step": 25210 + }, + { + "epoch": 1.71320831634733, + "grad_norm": 6.126891613006592, + "learning_rate": 7.859338904742493e-06, + "loss": 3.071, + "step": 25215 + }, + { + "epoch": 1.7135480364179916, + "grad_norm": 6.826797962188721, + "learning_rate": 7.858914254654166e-06, + "loss": 3.2875, + "step": 25220 + }, + { + "epoch": 1.7138877564886532, + "grad_norm": 6.855309963226318, + "learning_rate": 7.858489604565839e-06, + "loss": 3.16, + "step": 25225 + }, + { + "epoch": 1.7142274765593153, + "grad_norm": 6.213575839996338, + "learning_rate": 7.858064954477511e-06, + "loss": 2.8737, + "step": 25230 + }, + { + "epoch": 1.714567196629977, + "grad_norm": 6.341024875640869, + "learning_rate": 7.857640304389184e-06, + "loss": 3.093, + "step": 25235 + }, + { + "epoch": 1.7149069167006386, + "grad_norm": 8.235767364501953, + "learning_rate": 7.857215654300857e-06, + "loss": 3.6193, + "step": 25240 + }, + { + "epoch": 1.7152466367713004, + "grad_norm": 5.695298671722412, + "learning_rate": 7.85679100421253e-06, + "loss": 3.0813, + "step": 25245 + }, + { + "epoch": 1.7155863568419623, + "grad_norm": 6.6504645347595215, + "learning_rate": 7.856366354124203e-06, + "loss": 3.2994, + "step": 25250 + }, + { + "epoch": 1.7159260769126239, + "grad_norm": 6.547455787658691, + "learning_rate": 7.855941704035875e-06, + "loss": 3.1078, + "step": 25255 + }, + { + "epoch": 1.7162657969832857, + "grad_norm": 7.100152492523193, + "learning_rate": 7.855517053947548e-06, + "loss": 3.1059, + "step": 25260 + }, + { + "epoch": 1.7166055170539476, + "grad_norm": 6.841859340667725, + "learning_rate": 7.855092403859221e-06, + "loss": 3.3478, + "step": 25265 + }, + { + "epoch": 1.7169452371246092, + "grad_norm": 6.269033432006836, + "learning_rate": 7.854667753770894e-06, + "loss": 3.1504, + "step": 25270 + }, + { + "epoch": 1.717284957195271, + "grad_norm": 5.665428638458252, + "learning_rate": 7.854243103682567e-06, + "loss": 3.0554, + "step": 25275 + }, + { + "epoch": 1.717624677265933, + "grad_norm": 8.938471794128418, + "learning_rate": 7.85381845359424e-06, + "loss": 3.2725, + "step": 25280 + }, + { + "epoch": 1.7179643973365946, + "grad_norm": 6.372951507568359, + "learning_rate": 7.853393803505912e-06, + "loss": 3.3894, + "step": 25285 + }, + { + "epoch": 1.7183041174072564, + "grad_norm": 6.074949741363525, + "learning_rate": 7.852969153417585e-06, + "loss": 3.3082, + "step": 25290 + }, + { + "epoch": 1.7186438374779183, + "grad_norm": 7.577573776245117, + "learning_rate": 7.852544503329256e-06, + "loss": 2.9056, + "step": 25295 + }, + { + "epoch": 1.71898355754858, + "grad_norm": 5.7255167961120605, + "learning_rate": 7.85211985324093e-06, + "loss": 3.2384, + "step": 25300 + }, + { + "epoch": 1.7193232776192418, + "grad_norm": 6.813014507293701, + "learning_rate": 7.851695203152603e-06, + "loss": 2.9968, + "step": 25305 + }, + { + "epoch": 1.7196629976899036, + "grad_norm": 7.263901233673096, + "learning_rate": 7.851270553064274e-06, + "loss": 3.3429, + "step": 25310 + }, + { + "epoch": 1.7200027177605652, + "grad_norm": 7.832527160644531, + "learning_rate": 7.850845902975949e-06, + "loss": 3.2883, + "step": 25315 + }, + { + "epoch": 1.720342437831227, + "grad_norm": 6.536856174468994, + "learning_rate": 7.850421252887622e-06, + "loss": 3.2117, + "step": 25320 + }, + { + "epoch": 1.720682157901889, + "grad_norm": 6.780724048614502, + "learning_rate": 7.849996602799293e-06, + "loss": 3.1357, + "step": 25325 + }, + { + "epoch": 1.7210218779725506, + "grad_norm": 6.199527263641357, + "learning_rate": 7.849571952710967e-06, + "loss": 3.2828, + "step": 25330 + }, + { + "epoch": 1.7213615980432124, + "grad_norm": 7.324316024780273, + "learning_rate": 7.84914730262264e-06, + "loss": 3.0299, + "step": 25335 + }, + { + "epoch": 1.7217013181138743, + "grad_norm": 5.947216987609863, + "learning_rate": 7.848722652534311e-06, + "loss": 3.1841, + "step": 25340 + }, + { + "epoch": 1.722041038184536, + "grad_norm": 6.7080793380737305, + "learning_rate": 7.848298002445986e-06, + "loss": 3.004, + "step": 25345 + }, + { + "epoch": 1.7223807582551978, + "grad_norm": 7.303158760070801, + "learning_rate": 7.847873352357659e-06, + "loss": 2.9336, + "step": 25350 + }, + { + "epoch": 1.7227204783258596, + "grad_norm": 7.6895670890808105, + "learning_rate": 7.84744870226933e-06, + "loss": 3.0453, + "step": 25355 + }, + { + "epoch": 1.7230601983965212, + "grad_norm": 5.995151996612549, + "learning_rate": 7.847024052181004e-06, + "loss": 2.9417, + "step": 25360 + }, + { + "epoch": 1.7233999184671829, + "grad_norm": 7.03848934173584, + "learning_rate": 7.846599402092675e-06, + "loss": 2.9855, + "step": 25365 + }, + { + "epoch": 1.723739638537845, + "grad_norm": 7.229310035705566, + "learning_rate": 7.846174752004348e-06, + "loss": 3.382, + "step": 25370 + }, + { + "epoch": 1.7240793586085066, + "grad_norm": 7.213864803314209, + "learning_rate": 7.845750101916023e-06, + "loss": 3.0857, + "step": 25375 + }, + { + "epoch": 1.7244190786791682, + "grad_norm": 8.378355026245117, + "learning_rate": 7.845325451827694e-06, + "loss": 3.0546, + "step": 25380 + }, + { + "epoch": 1.7247587987498303, + "grad_norm": 8.178887367248535, + "learning_rate": 7.844900801739366e-06, + "loss": 3.2586, + "step": 25385 + }, + { + "epoch": 1.725098518820492, + "grad_norm": 6.091599464416504, + "learning_rate": 7.844476151651041e-06, + "loss": 3.3046, + "step": 25390 + }, + { + "epoch": 1.7254382388911536, + "grad_norm": 7.702304363250732, + "learning_rate": 7.844051501562712e-06, + "loss": 3.1142, + "step": 25395 + }, + { + "epoch": 1.7257779589618156, + "grad_norm": 6.96030330657959, + "learning_rate": 7.843626851474387e-06, + "loss": 3.0733, + "step": 25400 + }, + { + "epoch": 1.7261176790324773, + "grad_norm": 6.488358497619629, + "learning_rate": 7.84320220138606e-06, + "loss": 3.09, + "step": 25405 + }, + { + "epoch": 1.726457399103139, + "grad_norm": 7.324504375457764, + "learning_rate": 7.84277755129773e-06, + "loss": 3.2139, + "step": 25410 + }, + { + "epoch": 1.7267971191738007, + "grad_norm": 6.011954307556152, + "learning_rate": 7.842352901209405e-06, + "loss": 3.2796, + "step": 25415 + }, + { + "epoch": 1.7271368392444626, + "grad_norm": 7.1123576164245605, + "learning_rate": 7.841928251121078e-06, + "loss": 3.3947, + "step": 25420 + }, + { + "epoch": 1.7274765593151242, + "grad_norm": 6.744057655334473, + "learning_rate": 7.841503601032749e-06, + "loss": 3.0543, + "step": 25425 + }, + { + "epoch": 1.727816279385786, + "grad_norm": 7.675909042358398, + "learning_rate": 7.841078950944423e-06, + "loss": 3.0419, + "step": 25430 + }, + { + "epoch": 1.728155999456448, + "grad_norm": 7.854129791259766, + "learning_rate": 7.840654300856096e-06, + "loss": 3.2121, + "step": 25435 + }, + { + "epoch": 1.7284957195271096, + "grad_norm": 5.956796646118164, + "learning_rate": 7.840229650767767e-06, + "loss": 3.2037, + "step": 25440 + }, + { + "epoch": 1.7288354395977714, + "grad_norm": 6.8328046798706055, + "learning_rate": 7.839805000679442e-06, + "loss": 3.1499, + "step": 25445 + }, + { + "epoch": 1.7291751596684333, + "grad_norm": 7.885838508605957, + "learning_rate": 7.839380350591113e-06, + "loss": 3.1947, + "step": 25450 + }, + { + "epoch": 1.729514879739095, + "grad_norm": 7.3630900382995605, + "learning_rate": 7.838955700502786e-06, + "loss": 3.2039, + "step": 25455 + }, + { + "epoch": 1.7298545998097568, + "grad_norm": 6.642541885375977, + "learning_rate": 7.83853105041446e-06, + "loss": 3.2939, + "step": 25460 + }, + { + "epoch": 1.7301943198804186, + "grad_norm": 6.668527126312256, + "learning_rate": 7.838106400326131e-06, + "loss": 3.3134, + "step": 25465 + }, + { + "epoch": 1.7305340399510802, + "grad_norm": 8.034283638000488, + "learning_rate": 7.837681750237804e-06, + "loss": 2.9999, + "step": 25470 + }, + { + "epoch": 1.730873760021742, + "grad_norm": 6.659444332122803, + "learning_rate": 7.837257100149479e-06, + "loss": 3.1987, + "step": 25475 + }, + { + "epoch": 1.731213480092404, + "grad_norm": 7.334278583526611, + "learning_rate": 7.83683245006115e-06, + "loss": 3.0019, + "step": 25480 + }, + { + "epoch": 1.7315532001630656, + "grad_norm": 8.10038948059082, + "learning_rate": 7.836407799972822e-06, + "loss": 3.1969, + "step": 25485 + }, + { + "epoch": 1.7318929202337274, + "grad_norm": 7.731234550476074, + "learning_rate": 7.835983149884497e-06, + "loss": 3.1376, + "step": 25490 + }, + { + "epoch": 1.7322326403043893, + "grad_norm": 8.063458442687988, + "learning_rate": 7.835558499796168e-06, + "loss": 3.0071, + "step": 25495 + }, + { + "epoch": 1.732572360375051, + "grad_norm": 8.416071891784668, + "learning_rate": 7.835133849707841e-06, + "loss": 3.1792, + "step": 25500 + }, + { + "epoch": 1.7329120804457128, + "grad_norm": 7.28736686706543, + "learning_rate": 7.834709199619515e-06, + "loss": 3.2099, + "step": 25505 + }, + { + "epoch": 1.7332518005163746, + "grad_norm": 6.66664457321167, + "learning_rate": 7.834284549531186e-06, + "loss": 3.4317, + "step": 25510 + }, + { + "epoch": 1.7335915205870362, + "grad_norm": 6.769410610198975, + "learning_rate": 7.83385989944286e-06, + "loss": 2.9525, + "step": 25515 + }, + { + "epoch": 1.733931240657698, + "grad_norm": 6.029792308807373, + "learning_rate": 7.833435249354532e-06, + "loss": 3.1861, + "step": 25520 + }, + { + "epoch": 1.73427096072836, + "grad_norm": 6.93290901184082, + "learning_rate": 7.833010599266205e-06, + "loss": 2.9051, + "step": 25525 + }, + { + "epoch": 1.7346106807990216, + "grad_norm": 6.6620941162109375, + "learning_rate": 7.832585949177878e-06, + "loss": 2.9609, + "step": 25530 + }, + { + "epoch": 1.7349504008696832, + "grad_norm": 6.034198760986328, + "learning_rate": 7.83216129908955e-06, + "loss": 3.0221, + "step": 25535 + }, + { + "epoch": 1.7352901209403453, + "grad_norm": 6.278834342956543, + "learning_rate": 7.831736649001223e-06, + "loss": 3.0367, + "step": 25540 + }, + { + "epoch": 1.735629841011007, + "grad_norm": 7.41230583190918, + "learning_rate": 7.831311998912896e-06, + "loss": 3.1297, + "step": 25545 + }, + { + "epoch": 1.7359695610816686, + "grad_norm": 7.856441497802734, + "learning_rate": 7.830887348824569e-06, + "loss": 3.2768, + "step": 25550 + }, + { + "epoch": 1.7363092811523306, + "grad_norm": 6.246037483215332, + "learning_rate": 7.830462698736242e-06, + "loss": 3.2638, + "step": 25555 + }, + { + "epoch": 1.7366490012229923, + "grad_norm": 5.06129789352417, + "learning_rate": 7.830038048647915e-06, + "loss": 3.1399, + "step": 25560 + }, + { + "epoch": 1.736988721293654, + "grad_norm": 7.417859077453613, + "learning_rate": 7.829613398559587e-06, + "loss": 3.1002, + "step": 25565 + }, + { + "epoch": 1.737328441364316, + "grad_norm": 7.942375183105469, + "learning_rate": 7.82918874847126e-06, + "loss": 2.8658, + "step": 25570 + }, + { + "epoch": 1.7376681614349776, + "grad_norm": 7.033936023712158, + "learning_rate": 7.828764098382933e-06, + "loss": 3.1257, + "step": 25575 + }, + { + "epoch": 1.7380078815056392, + "grad_norm": 7.540808200836182, + "learning_rate": 7.828339448294606e-06, + "loss": 3.2889, + "step": 25580 + }, + { + "epoch": 1.738347601576301, + "grad_norm": 6.6662421226501465, + "learning_rate": 7.827914798206279e-06, + "loss": 2.9994, + "step": 25585 + }, + { + "epoch": 1.738687321646963, + "grad_norm": 6.294602394104004, + "learning_rate": 7.827490148117951e-06, + "loss": 3.1251, + "step": 25590 + }, + { + "epoch": 1.7390270417176246, + "grad_norm": 6.602039337158203, + "learning_rate": 7.827065498029624e-06, + "loss": 3.2669, + "step": 25595 + }, + { + "epoch": 1.7393667617882864, + "grad_norm": 7.8123087882995605, + "learning_rate": 7.826640847941297e-06, + "loss": 3.3758, + "step": 25600 + }, + { + "epoch": 1.7397064818589483, + "grad_norm": 7.488753795623779, + "learning_rate": 7.82621619785297e-06, + "loss": 3.2064, + "step": 25605 + }, + { + "epoch": 1.74004620192961, + "grad_norm": 7.736843109130859, + "learning_rate": 7.825791547764643e-06, + "loss": 3.2096, + "step": 25610 + }, + { + "epoch": 1.7403859220002718, + "grad_norm": 8.14976978302002, + "learning_rate": 7.825366897676315e-06, + "loss": 3.3849, + "step": 25615 + }, + { + "epoch": 1.7407256420709336, + "grad_norm": 7.684345245361328, + "learning_rate": 7.824942247587988e-06, + "loss": 3.1321, + "step": 25620 + }, + { + "epoch": 1.7410653621415952, + "grad_norm": 7.117124557495117, + "learning_rate": 7.824517597499661e-06, + "loss": 3.1611, + "step": 25625 + }, + { + "epoch": 1.741405082212257, + "grad_norm": 8.896442413330078, + "learning_rate": 7.824092947411334e-06, + "loss": 3.235, + "step": 25630 + }, + { + "epoch": 1.741744802282919, + "grad_norm": 6.596230506896973, + "learning_rate": 7.823668297323007e-06, + "loss": 3.1361, + "step": 25635 + }, + { + "epoch": 1.7420845223535806, + "grad_norm": 6.069308757781982, + "learning_rate": 7.82324364723468e-06, + "loss": 3.0227, + "step": 25640 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 7.967853546142578, + "learning_rate": 7.822818997146352e-06, + "loss": 3.0143, + "step": 25645 + }, + { + "epoch": 1.7427639624949043, + "grad_norm": 7.562276363372803, + "learning_rate": 7.822394347058025e-06, + "loss": 3.0078, + "step": 25650 + }, + { + "epoch": 1.743103682565566, + "grad_norm": 6.195555210113525, + "learning_rate": 7.821969696969698e-06, + "loss": 3.1805, + "step": 25655 + }, + { + "epoch": 1.7434434026362278, + "grad_norm": 6.590412139892578, + "learning_rate": 7.82154504688137e-06, + "loss": 2.9883, + "step": 25660 + }, + { + "epoch": 1.7437831227068896, + "grad_norm": 8.507209777832031, + "learning_rate": 7.821120396793043e-06, + "loss": 3.151, + "step": 25665 + }, + { + "epoch": 1.7441228427775513, + "grad_norm": 6.170257091522217, + "learning_rate": 7.820695746704716e-06, + "loss": 3.4604, + "step": 25670 + }, + { + "epoch": 1.744462562848213, + "grad_norm": 7.385732650756836, + "learning_rate": 7.820271096616389e-06, + "loss": 2.9866, + "step": 25675 + }, + { + "epoch": 1.744802282918875, + "grad_norm": 6.324735164642334, + "learning_rate": 7.819846446528062e-06, + "loss": 3.2125, + "step": 25680 + }, + { + "epoch": 1.7451420029895366, + "grad_norm": 6.852725028991699, + "learning_rate": 7.819421796439735e-06, + "loss": 3.082, + "step": 25685 + }, + { + "epoch": 1.7454817230601984, + "grad_norm": 7.492153167724609, + "learning_rate": 7.818997146351407e-06, + "loss": 3.2348, + "step": 25690 + }, + { + "epoch": 1.7458214431308603, + "grad_norm": 6.141279220581055, + "learning_rate": 7.81857249626308e-06, + "loss": 3.2305, + "step": 25695 + }, + { + "epoch": 1.746161163201522, + "grad_norm": 6.440272808074951, + "learning_rate": 7.818147846174753e-06, + "loss": 3.0299, + "step": 25700 + }, + { + "epoch": 1.7465008832721836, + "grad_norm": 8.619428634643555, + "learning_rate": 7.817723196086426e-06, + "loss": 3.4009, + "step": 25705 + }, + { + "epoch": 1.7468406033428456, + "grad_norm": 7.914241790771484, + "learning_rate": 7.817298545998097e-06, + "loss": 3.0224, + "step": 25710 + }, + { + "epoch": 1.7471803234135073, + "grad_norm": 8.550457000732422, + "learning_rate": 7.816873895909771e-06, + "loss": 3.0482, + "step": 25715 + }, + { + "epoch": 1.747520043484169, + "grad_norm": 7.460422992706299, + "learning_rate": 7.816449245821444e-06, + "loss": 3.2262, + "step": 25720 + }, + { + "epoch": 1.747859763554831, + "grad_norm": 6.535146713256836, + "learning_rate": 7.816024595733115e-06, + "loss": 3.0512, + "step": 25725 + }, + { + "epoch": 1.7481994836254926, + "grad_norm": 5.630773067474365, + "learning_rate": 7.81559994564479e-06, + "loss": 3.1919, + "step": 25730 + }, + { + "epoch": 1.7485392036961542, + "grad_norm": 5.789183616638184, + "learning_rate": 7.815175295556463e-06, + "loss": 2.8832, + "step": 25735 + }, + { + "epoch": 1.7488789237668163, + "grad_norm": 5.900900363922119, + "learning_rate": 7.814750645468135e-06, + "loss": 3.2, + "step": 25740 + }, + { + "epoch": 1.749218643837478, + "grad_norm": 5.170674800872803, + "learning_rate": 7.814325995379808e-06, + "loss": 2.8998, + "step": 25745 + }, + { + "epoch": 1.7495583639081396, + "grad_norm": 7.603888988494873, + "learning_rate": 7.813901345291481e-06, + "loss": 3.2957, + "step": 25750 + }, + { + "epoch": 1.7498980839788014, + "grad_norm": 8.568167686462402, + "learning_rate": 7.813476695203154e-06, + "loss": 3.1608, + "step": 25755 + }, + { + "epoch": 1.7502378040494633, + "grad_norm": 8.016767501831055, + "learning_rate": 7.813052045114827e-06, + "loss": 3.1307, + "step": 25760 + }, + { + "epoch": 1.750577524120125, + "grad_norm": 7.133134365081787, + "learning_rate": 7.8126273950265e-06, + "loss": 3.08, + "step": 25765 + }, + { + "epoch": 1.7509172441907868, + "grad_norm": 5.380340576171875, + "learning_rate": 7.812202744938172e-06, + "loss": 2.989, + "step": 25770 + }, + { + "epoch": 1.7512569642614486, + "grad_norm": 6.436545372009277, + "learning_rate": 7.811778094849845e-06, + "loss": 3.2111, + "step": 25775 + }, + { + "epoch": 1.7515966843321102, + "grad_norm": 6.817014217376709, + "learning_rate": 7.811353444761518e-06, + "loss": 3.1606, + "step": 25780 + }, + { + "epoch": 1.751936404402772, + "grad_norm": 7.1606550216674805, + "learning_rate": 7.81092879467319e-06, + "loss": 2.9634, + "step": 25785 + }, + { + "epoch": 1.752276124473434, + "grad_norm": 6.945859432220459, + "learning_rate": 7.810504144584863e-06, + "loss": 3.3601, + "step": 25790 + }, + { + "epoch": 1.7526158445440956, + "grad_norm": 5.510765552520752, + "learning_rate": 7.810079494496534e-06, + "loss": 3.0607, + "step": 25795 + }, + { + "epoch": 1.7529555646147574, + "grad_norm": 6.854775428771973, + "learning_rate": 7.809654844408209e-06, + "loss": 3.2323, + "step": 25800 + }, + { + "epoch": 1.7532952846854193, + "grad_norm": 8.08820629119873, + "learning_rate": 7.809230194319882e-06, + "loss": 3.1439, + "step": 25805 + }, + { + "epoch": 1.753635004756081, + "grad_norm": 5.872491359710693, + "learning_rate": 7.808805544231553e-06, + "loss": 3.3303, + "step": 25810 + }, + { + "epoch": 1.7539747248267428, + "grad_norm": 6.996467590332031, + "learning_rate": 7.808380894143227e-06, + "loss": 3.3837, + "step": 25815 + }, + { + "epoch": 1.7543144448974046, + "grad_norm": 9.554566383361816, + "learning_rate": 7.8079562440549e-06, + "loss": 3.3987, + "step": 25820 + }, + { + "epoch": 1.7546541649680663, + "grad_norm": 7.316112995147705, + "learning_rate": 7.807531593966571e-06, + "loss": 3.0785, + "step": 25825 + }, + { + "epoch": 1.754993885038728, + "grad_norm": 8.209728240966797, + "learning_rate": 7.807106943878246e-06, + "loss": 3.2848, + "step": 25830 + }, + { + "epoch": 1.75533360510939, + "grad_norm": 6.928753852844238, + "learning_rate": 7.806682293789919e-06, + "loss": 3.2679, + "step": 25835 + }, + { + "epoch": 1.7556733251800516, + "grad_norm": 6.456140518188477, + "learning_rate": 7.80625764370159e-06, + "loss": 3.1823, + "step": 25840 + }, + { + "epoch": 1.7560130452507134, + "grad_norm": 6.508700370788574, + "learning_rate": 7.805832993613264e-06, + "loss": 3.2286, + "step": 25845 + }, + { + "epoch": 1.7563527653213753, + "grad_norm": 7.844232082366943, + "learning_rate": 7.805408343524937e-06, + "loss": 3.2692, + "step": 25850 + }, + { + "epoch": 1.756692485392037, + "grad_norm": 6.463357925415039, + "learning_rate": 7.804983693436608e-06, + "loss": 3.1516, + "step": 25855 + }, + { + "epoch": 1.7570322054626988, + "grad_norm": 8.433822631835938, + "learning_rate": 7.804559043348283e-06, + "loss": 3.1786, + "step": 25860 + }, + { + "epoch": 1.7573719255333606, + "grad_norm": 7.994987964630127, + "learning_rate": 7.804134393259954e-06, + "loss": 3.1882, + "step": 25865 + }, + { + "epoch": 1.7577116456040223, + "grad_norm": 5.15543794631958, + "learning_rate": 7.803709743171626e-06, + "loss": 2.9743, + "step": 25870 + }, + { + "epoch": 1.758051365674684, + "grad_norm": 7.444790363311768, + "learning_rate": 7.803285093083301e-06, + "loss": 2.9396, + "step": 25875 + }, + { + "epoch": 1.758391085745346, + "grad_norm": 6.685849666595459, + "learning_rate": 7.802860442994972e-06, + "loss": 2.9193, + "step": 25880 + }, + { + "epoch": 1.7587308058160076, + "grad_norm": 6.108608722686768, + "learning_rate": 7.802435792906645e-06, + "loss": 3.1811, + "step": 25885 + }, + { + "epoch": 1.7590705258866692, + "grad_norm": 5.95236349105835, + "learning_rate": 7.80201114281832e-06, + "loss": 2.992, + "step": 25890 + }, + { + "epoch": 1.7594102459573313, + "grad_norm": 6.525211334228516, + "learning_rate": 7.80158649272999e-06, + "loss": 3.1956, + "step": 25895 + }, + { + "epoch": 1.759749966027993, + "grad_norm": 6.4323554039001465, + "learning_rate": 7.801161842641663e-06, + "loss": 2.7979, + "step": 25900 + }, + { + "epoch": 1.7600896860986546, + "grad_norm": 8.768511772155762, + "learning_rate": 7.800737192553338e-06, + "loss": 3.0902, + "step": 25905 + }, + { + "epoch": 1.7604294061693166, + "grad_norm": 7.334231853485107, + "learning_rate": 7.800312542465009e-06, + "loss": 3.2502, + "step": 25910 + }, + { + "epoch": 1.7607691262399783, + "grad_norm": 5.9333062171936035, + "learning_rate": 7.799887892376682e-06, + "loss": 3.1911, + "step": 25915 + }, + { + "epoch": 1.76110884631064, + "grad_norm": 6.351955890655518, + "learning_rate": 7.799463242288356e-06, + "loss": 3.033, + "step": 25920 + }, + { + "epoch": 1.7614485663813018, + "grad_norm": 7.292060852050781, + "learning_rate": 7.799038592200027e-06, + "loss": 3.1378, + "step": 25925 + }, + { + "epoch": 1.7617882864519636, + "grad_norm": 6.140055179595947, + "learning_rate": 7.7986139421117e-06, + "loss": 3.1704, + "step": 25930 + }, + { + "epoch": 1.7621280065226252, + "grad_norm": 8.2022123336792, + "learning_rate": 7.798189292023373e-06, + "loss": 3.1789, + "step": 25935 + }, + { + "epoch": 1.762467726593287, + "grad_norm": 7.465223789215088, + "learning_rate": 7.797764641935046e-06, + "loss": 3.0863, + "step": 25940 + }, + { + "epoch": 1.762807446663949, + "grad_norm": 5.970048904418945, + "learning_rate": 7.797339991846718e-06, + "loss": 3.4661, + "step": 25945 + }, + { + "epoch": 1.7631471667346106, + "grad_norm": 8.31456470489502, + "learning_rate": 7.796915341758391e-06, + "loss": 3.1166, + "step": 25950 + }, + { + "epoch": 1.7634868868052724, + "grad_norm": 6.8895182609558105, + "learning_rate": 7.796490691670064e-06, + "loss": 3.251, + "step": 25955 + }, + { + "epoch": 1.7638266068759343, + "grad_norm": 6.618929386138916, + "learning_rate": 7.796066041581737e-06, + "loss": 3.3163, + "step": 25960 + }, + { + "epoch": 1.764166326946596, + "grad_norm": 6.371677398681641, + "learning_rate": 7.79564139149341e-06, + "loss": 3.1038, + "step": 25965 + }, + { + "epoch": 1.7645060470172578, + "grad_norm": 6.576242923736572, + "learning_rate": 7.795216741405082e-06, + "loss": 3.4385, + "step": 25970 + }, + { + "epoch": 1.7648457670879196, + "grad_norm": 5.080860137939453, + "learning_rate": 7.794792091316755e-06, + "loss": 3.1689, + "step": 25975 + }, + { + "epoch": 1.7651854871585813, + "grad_norm": 4.72226619720459, + "learning_rate": 7.794367441228428e-06, + "loss": 3.0826, + "step": 25980 + }, + { + "epoch": 1.765525207229243, + "grad_norm": 8.121103286743164, + "learning_rate": 7.793942791140101e-06, + "loss": 3.0441, + "step": 25985 + }, + { + "epoch": 1.765864927299905, + "grad_norm": 6.80539083480835, + "learning_rate": 7.793518141051774e-06, + "loss": 3.1267, + "step": 25990 + }, + { + "epoch": 1.7662046473705666, + "grad_norm": 7.357268333435059, + "learning_rate": 7.793093490963446e-06, + "loss": 3.4727, + "step": 25995 + }, + { + "epoch": 1.7665443674412284, + "grad_norm": 6.100580215454102, + "learning_rate": 7.79266884087512e-06, + "loss": 3.2463, + "step": 26000 + }, + { + "epoch": 1.7668840875118903, + "grad_norm": 6.166871547698975, + "learning_rate": 7.792244190786792e-06, + "loss": 3.0946, + "step": 26005 + }, + { + "epoch": 1.767223807582552, + "grad_norm": 7.257382392883301, + "learning_rate": 7.791819540698465e-06, + "loss": 3.28, + "step": 26010 + }, + { + "epoch": 1.7675635276532138, + "grad_norm": 6.248256206512451, + "learning_rate": 7.791394890610138e-06, + "loss": 3.1302, + "step": 26015 + }, + { + "epoch": 1.7679032477238756, + "grad_norm": 8.30368709564209, + "learning_rate": 7.79097024052181e-06, + "loss": 3.1082, + "step": 26020 + }, + { + "epoch": 1.7682429677945373, + "grad_norm": 6.907036304473877, + "learning_rate": 7.790545590433483e-06, + "loss": 2.7952, + "step": 26025 + }, + { + "epoch": 1.7685826878651991, + "grad_norm": 5.97445821762085, + "learning_rate": 7.790120940345156e-06, + "loss": 3.028, + "step": 26030 + }, + { + "epoch": 1.768922407935861, + "grad_norm": 6.2056708335876465, + "learning_rate": 7.789696290256829e-06, + "loss": 3.2853, + "step": 26035 + }, + { + "epoch": 1.7692621280065226, + "grad_norm": 6.867105484008789, + "learning_rate": 7.789271640168502e-06, + "loss": 3.2235, + "step": 26040 + }, + { + "epoch": 1.7696018480771842, + "grad_norm": 6.815287113189697, + "learning_rate": 7.788846990080174e-06, + "loss": 3.1426, + "step": 26045 + }, + { + "epoch": 1.7699415681478463, + "grad_norm": 8.382044792175293, + "learning_rate": 7.788422339991847e-06, + "loss": 3.2954, + "step": 26050 + }, + { + "epoch": 1.770281288218508, + "grad_norm": 7.172018527984619, + "learning_rate": 7.78799768990352e-06, + "loss": 3.0071, + "step": 26055 + }, + { + "epoch": 1.7706210082891696, + "grad_norm": 6.516674518585205, + "learning_rate": 7.787573039815193e-06, + "loss": 3.1322, + "step": 26060 + }, + { + "epoch": 1.7709607283598316, + "grad_norm": 7.1657819747924805, + "learning_rate": 7.787148389726866e-06, + "loss": 3.1883, + "step": 26065 + }, + { + "epoch": 1.7713004484304933, + "grad_norm": 6.655969619750977, + "learning_rate": 7.786723739638538e-06, + "loss": 2.9629, + "step": 26070 + }, + { + "epoch": 1.771640168501155, + "grad_norm": 7.2365899085998535, + "learning_rate": 7.786299089550211e-06, + "loss": 3.0235, + "step": 26075 + }, + { + "epoch": 1.771979888571817, + "grad_norm": 8.00097942352295, + "learning_rate": 7.785874439461884e-06, + "loss": 3.3341, + "step": 26080 + }, + { + "epoch": 1.7723196086424786, + "grad_norm": 6.450279712677002, + "learning_rate": 7.785449789373557e-06, + "loss": 3.0555, + "step": 26085 + }, + { + "epoch": 1.7726593287131402, + "grad_norm": 6.478649139404297, + "learning_rate": 7.78502513928523e-06, + "loss": 3.2498, + "step": 26090 + }, + { + "epoch": 1.772999048783802, + "grad_norm": 9.190281867980957, + "learning_rate": 7.784600489196902e-06, + "loss": 2.9147, + "step": 26095 + }, + { + "epoch": 1.773338768854464, + "grad_norm": 6.415149211883545, + "learning_rate": 7.784175839108575e-06, + "loss": 3.0081, + "step": 26100 + }, + { + "epoch": 1.7736784889251256, + "grad_norm": 4.747370719909668, + "learning_rate": 7.783751189020248e-06, + "loss": 3.0951, + "step": 26105 + }, + { + "epoch": 1.7740182089957874, + "grad_norm": 7.653624534606934, + "learning_rate": 7.783326538931921e-06, + "loss": 2.9006, + "step": 26110 + }, + { + "epoch": 1.7743579290664493, + "grad_norm": 7.209681034088135, + "learning_rate": 7.782901888843594e-06, + "loss": 3.1942, + "step": 26115 + }, + { + "epoch": 1.774697649137111, + "grad_norm": 7.194592475891113, + "learning_rate": 7.782477238755266e-06, + "loss": 3.2539, + "step": 26120 + }, + { + "epoch": 1.7750373692077728, + "grad_norm": 6.684964656829834, + "learning_rate": 7.78205258866694e-06, + "loss": 3.2132, + "step": 26125 + }, + { + "epoch": 1.7753770892784346, + "grad_norm": 6.673149585723877, + "learning_rate": 7.781627938578612e-06, + "loss": 3.082, + "step": 26130 + }, + { + "epoch": 1.7757168093490963, + "grad_norm": 7.380263805389404, + "learning_rate": 7.781203288490285e-06, + "loss": 3.13, + "step": 26135 + }, + { + "epoch": 1.7760565294197581, + "grad_norm": 6.149505138397217, + "learning_rate": 7.780778638401958e-06, + "loss": 3.093, + "step": 26140 + }, + { + "epoch": 1.77639624949042, + "grad_norm": 7.882704257965088, + "learning_rate": 7.78035398831363e-06, + "loss": 3.1321, + "step": 26145 + }, + { + "epoch": 1.7767359695610816, + "grad_norm": 7.341670036315918, + "learning_rate": 7.779929338225303e-06, + "loss": 3.0222, + "step": 26150 + }, + { + "epoch": 1.7770756896317434, + "grad_norm": 7.040404319763184, + "learning_rate": 7.779504688136976e-06, + "loss": 2.9568, + "step": 26155 + }, + { + "epoch": 1.7774154097024053, + "grad_norm": 6.406331539154053, + "learning_rate": 7.779080038048649e-06, + "loss": 3.1269, + "step": 26160 + }, + { + "epoch": 1.777755129773067, + "grad_norm": 7.756372451782227, + "learning_rate": 7.778655387960322e-06, + "loss": 3.2712, + "step": 26165 + }, + { + "epoch": 1.7780948498437288, + "grad_norm": 8.40280818939209, + "learning_rate": 7.778230737871995e-06, + "loss": 3.2525, + "step": 26170 + }, + { + "epoch": 1.7784345699143906, + "grad_norm": 7.155107498168945, + "learning_rate": 7.777806087783667e-06, + "loss": 2.9049, + "step": 26175 + }, + { + "epoch": 1.7787742899850523, + "grad_norm": 6.187009811401367, + "learning_rate": 7.77738143769534e-06, + "loss": 3.4328, + "step": 26180 + }, + { + "epoch": 1.7791140100557141, + "grad_norm": 5.575058460235596, + "learning_rate": 7.776956787607013e-06, + "loss": 3.1182, + "step": 26185 + }, + { + "epoch": 1.779453730126376, + "grad_norm": 8.188425064086914, + "learning_rate": 7.776532137518686e-06, + "loss": 3.0549, + "step": 26190 + }, + { + "epoch": 1.7797934501970376, + "grad_norm": 6.753876209259033, + "learning_rate": 7.776107487430359e-06, + "loss": 2.9331, + "step": 26195 + }, + { + "epoch": 1.7801331702676995, + "grad_norm": 8.71293830871582, + "learning_rate": 7.775682837342031e-06, + "loss": 2.9296, + "step": 26200 + }, + { + "epoch": 1.7804728903383613, + "grad_norm": 7.192230224609375, + "learning_rate": 7.775258187253704e-06, + "loss": 3.0833, + "step": 26205 + }, + { + "epoch": 1.780812610409023, + "grad_norm": 8.098237037658691, + "learning_rate": 7.774833537165375e-06, + "loss": 3.1223, + "step": 26210 + }, + { + "epoch": 1.7811523304796846, + "grad_norm": 7.457405090332031, + "learning_rate": 7.77440888707705e-06, + "loss": 3.2541, + "step": 26215 + }, + { + "epoch": 1.7814920505503467, + "grad_norm": 6.97428560256958, + "learning_rate": 7.773984236988723e-06, + "loss": 3.1885, + "step": 26220 + }, + { + "epoch": 1.7818317706210083, + "grad_norm": 5.253377914428711, + "learning_rate": 7.773559586900394e-06, + "loss": 3.2353, + "step": 26225 + }, + { + "epoch": 1.78217149069167, + "grad_norm": 6.618801116943359, + "learning_rate": 7.773134936812068e-06, + "loss": 3.3364, + "step": 26230 + }, + { + "epoch": 1.782511210762332, + "grad_norm": 5.489784240722656, + "learning_rate": 7.772710286723741e-06, + "loss": 3.2265, + "step": 26235 + }, + { + "epoch": 1.7828509308329936, + "grad_norm": 6.327445983886719, + "learning_rate": 7.772285636635412e-06, + "loss": 3.1618, + "step": 26240 + }, + { + "epoch": 1.7831906509036552, + "grad_norm": 6.747842311859131, + "learning_rate": 7.771860986547087e-06, + "loss": 3.2371, + "step": 26245 + }, + { + "epoch": 1.7835303709743173, + "grad_norm": 6.959957599639893, + "learning_rate": 7.77143633645876e-06, + "loss": 3.3259, + "step": 26250 + }, + { + "epoch": 1.783870091044979, + "grad_norm": 5.124095439910889, + "learning_rate": 7.77101168637043e-06, + "loss": 3.0167, + "step": 26255 + }, + { + "epoch": 1.7842098111156406, + "grad_norm": 8.522730827331543, + "learning_rate": 7.770587036282105e-06, + "loss": 3.161, + "step": 26260 + }, + { + "epoch": 1.7845495311863024, + "grad_norm": 6.670040607452393, + "learning_rate": 7.770162386193778e-06, + "loss": 3.1507, + "step": 26265 + }, + { + "epoch": 1.7848892512569643, + "grad_norm": 5.631170272827148, + "learning_rate": 7.769737736105449e-06, + "loss": 3.1779, + "step": 26270 + }, + { + "epoch": 1.785228971327626, + "grad_norm": 7.627096652984619, + "learning_rate": 7.769313086017123e-06, + "loss": 3.1973, + "step": 26275 + }, + { + "epoch": 1.7855686913982878, + "grad_norm": 5.614919662475586, + "learning_rate": 7.768888435928794e-06, + "loss": 3.264, + "step": 26280 + }, + { + "epoch": 1.7859084114689496, + "grad_norm": 6.700911045074463, + "learning_rate": 7.768463785840467e-06, + "loss": 3.2612, + "step": 26285 + }, + { + "epoch": 1.7862481315396113, + "grad_norm": 6.673486709594727, + "learning_rate": 7.768039135752142e-06, + "loss": 3.1401, + "step": 26290 + }, + { + "epoch": 1.7865878516102731, + "grad_norm": 6.8322367668151855, + "learning_rate": 7.767614485663813e-06, + "loss": 3.2066, + "step": 26295 + }, + { + "epoch": 1.786927571680935, + "grad_norm": 5.755945205688477, + "learning_rate": 7.767189835575486e-06, + "loss": 3.0404, + "step": 26300 + }, + { + "epoch": 1.7872672917515966, + "grad_norm": 6.112196445465088, + "learning_rate": 7.76676518548716e-06, + "loss": 3.0866, + "step": 26305 + }, + { + "epoch": 1.7876070118222585, + "grad_norm": 7.469074249267578, + "learning_rate": 7.766340535398831e-06, + "loss": 3.1832, + "step": 26310 + }, + { + "epoch": 1.7879467318929203, + "grad_norm": 6.858458518981934, + "learning_rate": 7.765915885310504e-06, + "loss": 3.2744, + "step": 26315 + }, + { + "epoch": 1.788286451963582, + "grad_norm": 5.62556266784668, + "learning_rate": 7.765491235222179e-06, + "loss": 2.9878, + "step": 26320 + }, + { + "epoch": 1.7886261720342438, + "grad_norm": 6.543940544128418, + "learning_rate": 7.76506658513385e-06, + "loss": 2.984, + "step": 26325 + }, + { + "epoch": 1.7889658921049056, + "grad_norm": 7.694422245025635, + "learning_rate": 7.764641935045522e-06, + "loss": 3.2413, + "step": 26330 + }, + { + "epoch": 1.7893056121755673, + "grad_norm": 5.978701114654541, + "learning_rate": 7.764217284957197e-06, + "loss": 3.0058, + "step": 26335 + }, + { + "epoch": 1.7896453322462291, + "grad_norm": 6.904677391052246, + "learning_rate": 7.763792634868868e-06, + "loss": 3.1627, + "step": 26340 + }, + { + "epoch": 1.789985052316891, + "grad_norm": 6.434545516967773, + "learning_rate": 7.76336798478054e-06, + "loss": 3.181, + "step": 26345 + }, + { + "epoch": 1.7903247723875526, + "grad_norm": 7.988498687744141, + "learning_rate": 7.762943334692215e-06, + "loss": 3.5432, + "step": 26350 + }, + { + "epoch": 1.7906644924582145, + "grad_norm": 6.37662935256958, + "learning_rate": 7.762518684603886e-06, + "loss": 3.3536, + "step": 26355 + }, + { + "epoch": 1.7910042125288763, + "grad_norm": 6.027717590332031, + "learning_rate": 7.76209403451556e-06, + "loss": 3.1499, + "step": 26360 + }, + { + "epoch": 1.791343932599538, + "grad_norm": 7.466001510620117, + "learning_rate": 7.761669384427232e-06, + "loss": 3.1813, + "step": 26365 + }, + { + "epoch": 1.7916836526701998, + "grad_norm": 6.52482271194458, + "learning_rate": 7.761244734338905e-06, + "loss": 3.2417, + "step": 26370 + }, + { + "epoch": 1.7920233727408617, + "grad_norm": 6.646611213684082, + "learning_rate": 7.760820084250578e-06, + "loss": 3.2332, + "step": 26375 + }, + { + "epoch": 1.7923630928115233, + "grad_norm": 6.334125995635986, + "learning_rate": 7.76039543416225e-06, + "loss": 3.3843, + "step": 26380 + }, + { + "epoch": 1.792702812882185, + "grad_norm": 6.264531135559082, + "learning_rate": 7.759970784073923e-06, + "loss": 3.2267, + "step": 26385 + }, + { + "epoch": 1.793042532952847, + "grad_norm": 6.775615215301514, + "learning_rate": 7.759546133985596e-06, + "loss": 3.1647, + "step": 26390 + }, + { + "epoch": 1.7933822530235086, + "grad_norm": 5.6644768714904785, + "learning_rate": 7.759121483897269e-06, + "loss": 3.3565, + "step": 26395 + }, + { + "epoch": 1.7937219730941703, + "grad_norm": 7.722905158996582, + "learning_rate": 7.758696833808942e-06, + "loss": 3.1429, + "step": 26400 + }, + { + "epoch": 1.7940616931648323, + "grad_norm": 6.135321140289307, + "learning_rate": 7.758272183720614e-06, + "loss": 3.1556, + "step": 26405 + }, + { + "epoch": 1.794401413235494, + "grad_norm": 6.391127109527588, + "learning_rate": 7.757847533632287e-06, + "loss": 3.1135, + "step": 26410 + }, + { + "epoch": 1.7947411333061556, + "grad_norm": 4.744157314300537, + "learning_rate": 7.75742288354396e-06, + "loss": 3.1482, + "step": 26415 + }, + { + "epoch": 1.7950808533768177, + "grad_norm": 6.225052833557129, + "learning_rate": 7.756998233455635e-06, + "loss": 3.0216, + "step": 26420 + }, + { + "epoch": 1.7954205734474793, + "grad_norm": 7.0685200691223145, + "learning_rate": 7.756573583367306e-06, + "loss": 3.0086, + "step": 26425 + }, + { + "epoch": 1.795760293518141, + "grad_norm": 6.089881896972656, + "learning_rate": 7.756148933278978e-06, + "loss": 3.1255, + "step": 26430 + }, + { + "epoch": 1.7961000135888028, + "grad_norm": 7.587910175323486, + "learning_rate": 7.755724283190651e-06, + "loss": 2.9294, + "step": 26435 + }, + { + "epoch": 1.7964397336594646, + "grad_norm": 8.025167465209961, + "learning_rate": 7.755299633102324e-06, + "loss": 3.3146, + "step": 26440 + }, + { + "epoch": 1.7967794537301263, + "grad_norm": 5.154735565185547, + "learning_rate": 7.754874983013997e-06, + "loss": 3.0538, + "step": 26445 + }, + { + "epoch": 1.7971191738007881, + "grad_norm": 7.465004920959473, + "learning_rate": 7.75445033292567e-06, + "loss": 2.8803, + "step": 26450 + }, + { + "epoch": 1.79745889387145, + "grad_norm": 8.441150665283203, + "learning_rate": 7.754025682837342e-06, + "loss": 3.2012, + "step": 26455 + }, + { + "epoch": 1.7977986139421116, + "grad_norm": 7.07659912109375, + "learning_rate": 7.753601032749015e-06, + "loss": 3.2774, + "step": 26460 + }, + { + "epoch": 1.7981383340127735, + "grad_norm": 5.45149564743042, + "learning_rate": 7.753176382660688e-06, + "loss": 3.1033, + "step": 26465 + }, + { + "epoch": 1.7984780540834353, + "grad_norm": 7.194268703460693, + "learning_rate": 7.752751732572361e-06, + "loss": 3.1843, + "step": 26470 + }, + { + "epoch": 1.798817774154097, + "grad_norm": 7.290097713470459, + "learning_rate": 7.752327082484034e-06, + "loss": 3.2661, + "step": 26475 + }, + { + "epoch": 1.7991574942247588, + "grad_norm": 6.540630340576172, + "learning_rate": 7.751902432395706e-06, + "loss": 3.2242, + "step": 26480 + }, + { + "epoch": 1.7994972142954206, + "grad_norm": 7.625791072845459, + "learning_rate": 7.75147778230738e-06, + "loss": 3.0735, + "step": 26485 + }, + { + "epoch": 1.7998369343660823, + "grad_norm": 7.645023345947266, + "learning_rate": 7.751053132219052e-06, + "loss": 3.1344, + "step": 26490 + }, + { + "epoch": 1.8001766544367441, + "grad_norm": 5.993186950683594, + "learning_rate": 7.750628482130725e-06, + "loss": 3.1993, + "step": 26495 + }, + { + "epoch": 1.800516374507406, + "grad_norm": 6.109645366668701, + "learning_rate": 7.750203832042398e-06, + "loss": 3.1269, + "step": 26500 + }, + { + "epoch": 1.8008560945780676, + "grad_norm": 7.836030960083008, + "learning_rate": 7.74977918195407e-06, + "loss": 2.9955, + "step": 26505 + }, + { + "epoch": 1.8011958146487295, + "grad_norm": 6.031117916107178, + "learning_rate": 7.749354531865743e-06, + "loss": 3.4081, + "step": 26510 + }, + { + "epoch": 1.8015355347193913, + "grad_norm": 6.95298433303833, + "learning_rate": 7.748929881777416e-06, + "loss": 3.2218, + "step": 26515 + }, + { + "epoch": 1.801875254790053, + "grad_norm": 5.7397589683532715, + "learning_rate": 7.748505231689089e-06, + "loss": 2.9034, + "step": 26520 + }, + { + "epoch": 1.8022149748607148, + "grad_norm": 6.341169834136963, + "learning_rate": 7.748080581600762e-06, + "loss": 3.3697, + "step": 26525 + }, + { + "epoch": 1.8025546949313767, + "grad_norm": 6.034757614135742, + "learning_rate": 7.747655931512434e-06, + "loss": 3.0639, + "step": 26530 + }, + { + "epoch": 1.8028944150020383, + "grad_norm": 8.120405197143555, + "learning_rate": 7.747231281424107e-06, + "loss": 3.4149, + "step": 26535 + }, + { + "epoch": 1.8032341350727001, + "grad_norm": 6.69488000869751, + "learning_rate": 7.74680663133578e-06, + "loss": 3.1574, + "step": 26540 + }, + { + "epoch": 1.803573855143362, + "grad_norm": 6.159152984619141, + "learning_rate": 7.746381981247453e-06, + "loss": 3.1955, + "step": 26545 + }, + { + "epoch": 1.8039135752140236, + "grad_norm": 8.025785446166992, + "learning_rate": 7.745957331159126e-06, + "loss": 3.1292, + "step": 26550 + }, + { + "epoch": 1.8042532952846853, + "grad_norm": 6.072959899902344, + "learning_rate": 7.745532681070798e-06, + "loss": 3.3544, + "step": 26555 + }, + { + "epoch": 1.8045930153553473, + "grad_norm": 7.694691181182861, + "learning_rate": 7.745108030982471e-06, + "loss": 3.4252, + "step": 26560 + }, + { + "epoch": 1.804932735426009, + "grad_norm": 5.897690296173096, + "learning_rate": 7.744683380894144e-06, + "loss": 3.1518, + "step": 26565 + }, + { + "epoch": 1.8052724554966706, + "grad_norm": 6.344274520874023, + "learning_rate": 7.744258730805817e-06, + "loss": 3.2184, + "step": 26570 + }, + { + "epoch": 1.8056121755673327, + "grad_norm": 8.084465980529785, + "learning_rate": 7.74383408071749e-06, + "loss": 3.0788, + "step": 26575 + }, + { + "epoch": 1.8059518956379943, + "grad_norm": 6.525135517120361, + "learning_rate": 7.743409430629162e-06, + "loss": 3.0151, + "step": 26580 + }, + { + "epoch": 1.806291615708656, + "grad_norm": 6.032605171203613, + "learning_rate": 7.742984780540835e-06, + "loss": 3.0845, + "step": 26585 + }, + { + "epoch": 1.806631335779318, + "grad_norm": 7.1209821701049805, + "learning_rate": 7.742560130452508e-06, + "loss": 3.1294, + "step": 26590 + }, + { + "epoch": 1.8069710558499796, + "grad_norm": 6.468820571899414, + "learning_rate": 7.742135480364181e-06, + "loss": 3.2321, + "step": 26595 + }, + { + "epoch": 1.8073107759206413, + "grad_norm": 7.403686046600342, + "learning_rate": 7.741710830275854e-06, + "loss": 3.2328, + "step": 26600 + }, + { + "epoch": 1.8076504959913031, + "grad_norm": 7.062324047088623, + "learning_rate": 7.741286180187526e-06, + "loss": 3.0911, + "step": 26605 + }, + { + "epoch": 1.807990216061965, + "grad_norm": 5.306662559509277, + "learning_rate": 7.7408615300992e-06, + "loss": 3.1763, + "step": 26610 + }, + { + "epoch": 1.8083299361326266, + "grad_norm": 5.578573703765869, + "learning_rate": 7.740436880010872e-06, + "loss": 3.0664, + "step": 26615 + }, + { + "epoch": 1.8086696562032885, + "grad_norm": 5.895156383514404, + "learning_rate": 7.740012229922545e-06, + "loss": 3.149, + "step": 26620 + }, + { + "epoch": 1.8090093762739503, + "grad_norm": 6.7981767654418945, + "learning_rate": 7.739587579834216e-06, + "loss": 3.1239, + "step": 26625 + }, + { + "epoch": 1.809349096344612, + "grad_norm": 5.919961452484131, + "learning_rate": 7.73916292974589e-06, + "loss": 3.2178, + "step": 26630 + }, + { + "epoch": 1.8096888164152738, + "grad_norm": 6.706294059753418, + "learning_rate": 7.738738279657563e-06, + "loss": 3.363, + "step": 26635 + }, + { + "epoch": 1.8100285364859356, + "grad_norm": 7.606604099273682, + "learning_rate": 7.738313629569234e-06, + "loss": 3.152, + "step": 26640 + }, + { + "epoch": 1.8103682565565973, + "grad_norm": 5.859683990478516, + "learning_rate": 7.737888979480909e-06, + "loss": 3.2026, + "step": 26645 + }, + { + "epoch": 1.8107079766272591, + "grad_norm": 5.849737644195557, + "learning_rate": 7.737464329392582e-06, + "loss": 3.2733, + "step": 26650 + }, + { + "epoch": 1.811047696697921, + "grad_norm": 6.891690254211426, + "learning_rate": 7.737039679304253e-06, + "loss": 3.2257, + "step": 26655 + }, + { + "epoch": 1.8113874167685826, + "grad_norm": 6.164402961730957, + "learning_rate": 7.736615029215927e-06, + "loss": 3.1464, + "step": 26660 + }, + { + "epoch": 1.8117271368392445, + "grad_norm": 5.3577880859375, + "learning_rate": 7.7361903791276e-06, + "loss": 3.162, + "step": 26665 + }, + { + "epoch": 1.8120668569099063, + "grad_norm": 6.628687381744385, + "learning_rate": 7.735765729039271e-06, + "loss": 3.077, + "step": 26670 + }, + { + "epoch": 1.812406576980568, + "grad_norm": 6.21530294418335, + "learning_rate": 7.735341078950946e-06, + "loss": 3.082, + "step": 26675 + }, + { + "epoch": 1.8127462970512298, + "grad_norm": 5.837088108062744, + "learning_rate": 7.734916428862618e-06, + "loss": 3.0231, + "step": 26680 + }, + { + "epoch": 1.8130860171218917, + "grad_norm": 6.5851922035217285, + "learning_rate": 7.73449177877429e-06, + "loss": 3.1925, + "step": 26685 + }, + { + "epoch": 1.8134257371925533, + "grad_norm": 5.724183082580566, + "learning_rate": 7.734067128685964e-06, + "loss": 3.0684, + "step": 26690 + }, + { + "epoch": 1.8137654572632151, + "grad_norm": 7.713676929473877, + "learning_rate": 7.733642478597637e-06, + "loss": 3.5227, + "step": 26695 + }, + { + "epoch": 1.814105177333877, + "grad_norm": 5.132675647735596, + "learning_rate": 7.733217828509308e-06, + "loss": 3.1555, + "step": 26700 + }, + { + "epoch": 1.8144448974045386, + "grad_norm": 6.880270481109619, + "learning_rate": 7.732793178420982e-06, + "loss": 2.9949, + "step": 26705 + }, + { + "epoch": 1.8147846174752005, + "grad_norm": 5.942502498626709, + "learning_rate": 7.732368528332654e-06, + "loss": 3.324, + "step": 26710 + }, + { + "epoch": 1.8151243375458623, + "grad_norm": 6.913224697113037, + "learning_rate": 7.731943878244326e-06, + "loss": 3.1107, + "step": 26715 + }, + { + "epoch": 1.815464057616524, + "grad_norm": 7.83759069442749, + "learning_rate": 7.731519228156001e-06, + "loss": 3.1921, + "step": 26720 + }, + { + "epoch": 1.8158037776871856, + "grad_norm": 5.758805274963379, + "learning_rate": 7.731094578067672e-06, + "loss": 3.0572, + "step": 26725 + }, + { + "epoch": 1.8161434977578477, + "grad_norm": 7.269608497619629, + "learning_rate": 7.730669927979345e-06, + "loss": 3.3242, + "step": 26730 + }, + { + "epoch": 1.8164832178285093, + "grad_norm": 7.189188480377197, + "learning_rate": 7.73024527789102e-06, + "loss": 3.3572, + "step": 26735 + }, + { + "epoch": 1.816822937899171, + "grad_norm": 6.551280975341797, + "learning_rate": 7.72982062780269e-06, + "loss": 2.8682, + "step": 26740 + }, + { + "epoch": 1.817162657969833, + "grad_norm": 6.053833961486816, + "learning_rate": 7.729395977714363e-06, + "loss": 3.2547, + "step": 26745 + }, + { + "epoch": 1.8175023780404946, + "grad_norm": 9.0945405960083, + "learning_rate": 7.728971327626038e-06, + "loss": 3.3392, + "step": 26750 + }, + { + "epoch": 1.8178420981111563, + "grad_norm": 4.885644912719727, + "learning_rate": 7.728546677537709e-06, + "loss": 3.1623, + "step": 26755 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 9.230530738830566, + "learning_rate": 7.728122027449383e-06, + "loss": 3.1827, + "step": 26760 + }, + { + "epoch": 1.81852153825248, + "grad_norm": 6.305636882781982, + "learning_rate": 7.727697377361056e-06, + "loss": 3.0133, + "step": 26765 + }, + { + "epoch": 1.8188612583231416, + "grad_norm": 6.678019046783447, + "learning_rate": 7.727272727272727e-06, + "loss": 3.236, + "step": 26770 + }, + { + "epoch": 1.8192009783938035, + "grad_norm": 7.4762654304504395, + "learning_rate": 7.726848077184402e-06, + "loss": 3.2612, + "step": 26775 + }, + { + "epoch": 1.8195406984644653, + "grad_norm": 8.872535705566406, + "learning_rate": 7.726423427096073e-06, + "loss": 3.0628, + "step": 26780 + }, + { + "epoch": 1.819880418535127, + "grad_norm": 6.155622959136963, + "learning_rate": 7.725998777007746e-06, + "loss": 3.0642, + "step": 26785 + }, + { + "epoch": 1.8202201386057888, + "grad_norm": 8.391833305358887, + "learning_rate": 7.72557412691942e-06, + "loss": 3.0618, + "step": 26790 + }, + { + "epoch": 1.8205598586764506, + "grad_norm": 7.20731782913208, + "learning_rate": 7.725149476831091e-06, + "loss": 3.1036, + "step": 26795 + }, + { + "epoch": 1.8208995787471123, + "grad_norm": 6.46295166015625, + "learning_rate": 7.724724826742764e-06, + "loss": 3.1737, + "step": 26800 + }, + { + "epoch": 1.8212392988177741, + "grad_norm": 7.544014930725098, + "learning_rate": 7.724300176654439e-06, + "loss": 3.3115, + "step": 26805 + }, + { + "epoch": 1.821579018888436, + "grad_norm": 10.000415802001953, + "learning_rate": 7.72387552656611e-06, + "loss": 3.2357, + "step": 26810 + }, + { + "epoch": 1.8219187389590976, + "grad_norm": 6.793159484863281, + "learning_rate": 7.723450876477782e-06, + "loss": 2.9941, + "step": 26815 + }, + { + "epoch": 1.8222584590297595, + "grad_norm": 6.328683376312256, + "learning_rate": 7.723026226389457e-06, + "loss": 2.9066, + "step": 26820 + }, + { + "epoch": 1.8225981791004213, + "grad_norm": 6.878556251525879, + "learning_rate": 7.722601576301128e-06, + "loss": 2.9995, + "step": 26825 + }, + { + "epoch": 1.822937899171083, + "grad_norm": 6.515439510345459, + "learning_rate": 7.7221769262128e-06, + "loss": 3.0427, + "step": 26830 + }, + { + "epoch": 1.8232776192417448, + "grad_norm": 7.003849506378174, + "learning_rate": 7.721752276124475e-06, + "loss": 3.1719, + "step": 26835 + }, + { + "epoch": 1.8236173393124067, + "grad_norm": 6.432281017303467, + "learning_rate": 7.721327626036146e-06, + "loss": 3.1585, + "step": 26840 + }, + { + "epoch": 1.8239570593830683, + "grad_norm": 7.151465892791748, + "learning_rate": 7.72090297594782e-06, + "loss": 3.2156, + "step": 26845 + }, + { + "epoch": 1.8242967794537301, + "grad_norm": 6.74555778503418, + "learning_rate": 7.720478325859492e-06, + "loss": 3.2761, + "step": 26850 + }, + { + "epoch": 1.824636499524392, + "grad_norm": 7.383942127227783, + "learning_rate": 7.720053675771165e-06, + "loss": 3.2675, + "step": 26855 + }, + { + "epoch": 1.8249762195950536, + "grad_norm": 8.441356658935547, + "learning_rate": 7.719629025682838e-06, + "loss": 2.9277, + "step": 26860 + }, + { + "epoch": 1.8253159396657155, + "grad_norm": 6.506313323974609, + "learning_rate": 7.71920437559451e-06, + "loss": 3.2025, + "step": 26865 + }, + { + "epoch": 1.8256556597363773, + "grad_norm": 7.992907524108887, + "learning_rate": 7.718779725506183e-06, + "loss": 3.2681, + "step": 26870 + }, + { + "epoch": 1.825995379807039, + "grad_norm": 7.836397647857666, + "learning_rate": 7.718355075417856e-06, + "loss": 3.5487, + "step": 26875 + }, + { + "epoch": 1.8263350998777008, + "grad_norm": 5.903964996337891, + "learning_rate": 7.717930425329529e-06, + "loss": 3.0788, + "step": 26880 + }, + { + "epoch": 1.8266748199483627, + "grad_norm": 6.512215614318848, + "learning_rate": 7.717505775241202e-06, + "loss": 3.0633, + "step": 26885 + }, + { + "epoch": 1.8270145400190243, + "grad_norm": 9.139721870422363, + "learning_rate": 7.717081125152874e-06, + "loss": 3.1203, + "step": 26890 + }, + { + "epoch": 1.827354260089686, + "grad_norm": 5.933333873748779, + "learning_rate": 7.716656475064547e-06, + "loss": 3.1857, + "step": 26895 + }, + { + "epoch": 1.827693980160348, + "grad_norm": 8.149227142333984, + "learning_rate": 7.71623182497622e-06, + "loss": 2.9905, + "step": 26900 + }, + { + "epoch": 1.8280337002310096, + "grad_norm": 8.108192443847656, + "learning_rate": 7.715807174887893e-06, + "loss": 3.1996, + "step": 26905 + }, + { + "epoch": 1.8283734203016713, + "grad_norm": 6.459251403808594, + "learning_rate": 7.715382524799566e-06, + "loss": 2.9586, + "step": 26910 + }, + { + "epoch": 1.8287131403723333, + "grad_norm": 7.225358009338379, + "learning_rate": 7.714957874711238e-06, + "loss": 3.2896, + "step": 26915 + }, + { + "epoch": 1.829052860442995, + "grad_norm": 7.2530517578125, + "learning_rate": 7.714533224622911e-06, + "loss": 3.0465, + "step": 26920 + }, + { + "epoch": 1.8293925805136566, + "grad_norm": 9.897294998168945, + "learning_rate": 7.714108574534584e-06, + "loss": 3.1789, + "step": 26925 + }, + { + "epoch": 1.8297323005843187, + "grad_norm": 6.327062606811523, + "learning_rate": 7.713683924446257e-06, + "loss": 3.2637, + "step": 26930 + }, + { + "epoch": 1.8300720206549803, + "grad_norm": 5.122750759124756, + "learning_rate": 7.71325927435793e-06, + "loss": 3.2311, + "step": 26935 + }, + { + "epoch": 1.830411740725642, + "grad_norm": 6.2975687980651855, + "learning_rate": 7.712834624269602e-06, + "loss": 3.1603, + "step": 26940 + }, + { + "epoch": 1.8307514607963038, + "grad_norm": 6.213536739349365, + "learning_rate": 7.712409974181275e-06, + "loss": 2.9678, + "step": 26945 + }, + { + "epoch": 1.8310911808669657, + "grad_norm": 6.181307792663574, + "learning_rate": 7.711985324092948e-06, + "loss": 2.8927, + "step": 26950 + }, + { + "epoch": 1.8314309009376273, + "grad_norm": 7.973388671875, + "learning_rate": 7.71156067400462e-06, + "loss": 3.184, + "step": 26955 + }, + { + "epoch": 1.8317706210082891, + "grad_norm": 5.504587650299072, + "learning_rate": 7.711136023916294e-06, + "loss": 3.1557, + "step": 26960 + }, + { + "epoch": 1.832110341078951, + "grad_norm": 6.222821235656738, + "learning_rate": 7.710711373827966e-06, + "loss": 3.0836, + "step": 26965 + }, + { + "epoch": 1.8324500611496126, + "grad_norm": 7.302416801452637, + "learning_rate": 7.71028672373964e-06, + "loss": 3.1671, + "step": 26970 + }, + { + "epoch": 1.8327897812202745, + "grad_norm": 6.91143798828125, + "learning_rate": 7.709862073651312e-06, + "loss": 3.3158, + "step": 26975 + }, + { + "epoch": 1.8331295012909363, + "grad_norm": 4.962676048278809, + "learning_rate": 7.709437423562985e-06, + "loss": 3.131, + "step": 26980 + }, + { + "epoch": 1.833469221361598, + "grad_norm": 6.974936485290527, + "learning_rate": 7.709012773474658e-06, + "loss": 3.1005, + "step": 26985 + }, + { + "epoch": 1.8338089414322598, + "grad_norm": 6.915289878845215, + "learning_rate": 7.70858812338633e-06, + "loss": 3.0476, + "step": 26990 + }, + { + "epoch": 1.8341486615029217, + "grad_norm": 6.136562824249268, + "learning_rate": 7.70824840331567e-06, + "loss": 2.8476, + "step": 26995 + }, + { + "epoch": 1.8344883815735833, + "grad_norm": 6.758520126342773, + "learning_rate": 7.707823753227342e-06, + "loss": 3.271, + "step": 27000 + }, + { + "epoch": 1.8348281016442451, + "grad_norm": 7.204280376434326, + "learning_rate": 7.707399103139013e-06, + "loss": 3.0175, + "step": 27005 + }, + { + "epoch": 1.835167821714907, + "grad_norm": 7.731189250946045, + "learning_rate": 7.706974453050688e-06, + "loss": 3.0055, + "step": 27010 + }, + { + "epoch": 1.8355075417855686, + "grad_norm": 6.492426872253418, + "learning_rate": 7.70654980296236e-06, + "loss": 3.207, + "step": 27015 + }, + { + "epoch": 1.8358472618562305, + "grad_norm": 6.538259506225586, + "learning_rate": 7.706125152874032e-06, + "loss": 3.2112, + "step": 27020 + }, + { + "epoch": 1.8361869819268923, + "grad_norm": 7.19352388381958, + "learning_rate": 7.705700502785706e-06, + "loss": 3.2672, + "step": 27025 + }, + { + "epoch": 1.836526701997554, + "grad_norm": 7.290232181549072, + "learning_rate": 7.705275852697377e-06, + "loss": 3.0103, + "step": 27030 + }, + { + "epoch": 1.8368664220682158, + "grad_norm": 5.056563377380371, + "learning_rate": 7.70485120260905e-06, + "loss": 3.1625, + "step": 27035 + }, + { + "epoch": 1.8372061421388777, + "grad_norm": 6.035808086395264, + "learning_rate": 7.704426552520725e-06, + "loss": 2.8796, + "step": 27040 + }, + { + "epoch": 1.8375458622095393, + "grad_norm": 7.619948387145996, + "learning_rate": 7.704001902432396e-06, + "loss": 3.0289, + "step": 27045 + }, + { + "epoch": 1.8378855822802012, + "grad_norm": 5.888753890991211, + "learning_rate": 7.703577252344068e-06, + "loss": 3.2769, + "step": 27050 + }, + { + "epoch": 1.838225302350863, + "grad_norm": 6.889127254486084, + "learning_rate": 7.703152602255743e-06, + "loss": 3.1901, + "step": 27055 + }, + { + "epoch": 1.8385650224215246, + "grad_norm": 7.018806457519531, + "learning_rate": 7.702727952167414e-06, + "loss": 3.1501, + "step": 27060 + }, + { + "epoch": 1.8389047424921863, + "grad_norm": 7.751131057739258, + "learning_rate": 7.702303302079087e-06, + "loss": 3.2012, + "step": 27065 + }, + { + "epoch": 1.8392444625628483, + "grad_norm": 5.974591255187988, + "learning_rate": 7.701878651990761e-06, + "loss": 3.0711, + "step": 27070 + }, + { + "epoch": 1.83958418263351, + "grad_norm": 6.6121506690979, + "learning_rate": 7.701454001902432e-06, + "loss": 3.2205, + "step": 27075 + }, + { + "epoch": 1.8399239027041716, + "grad_norm": 7.155187606811523, + "learning_rate": 7.701029351814105e-06, + "loss": 3.1255, + "step": 27080 + }, + { + "epoch": 1.8402636227748337, + "grad_norm": 9.828055381774902, + "learning_rate": 7.70060470172578e-06, + "loss": 3.1547, + "step": 27085 + }, + { + "epoch": 1.8406033428454953, + "grad_norm": 7.894463062286377, + "learning_rate": 7.700180051637451e-06, + "loss": 3.1706, + "step": 27090 + }, + { + "epoch": 1.840943062916157, + "grad_norm": 6.7881669998168945, + "learning_rate": 7.699755401549124e-06, + "loss": 2.9712, + "step": 27095 + }, + { + "epoch": 1.841282782986819, + "grad_norm": 7.992203235626221, + "learning_rate": 7.699330751460798e-06, + "loss": 3.1562, + "step": 27100 + }, + { + "epoch": 1.8416225030574807, + "grad_norm": 6.375755786895752, + "learning_rate": 7.69890610137247e-06, + "loss": 3.0425, + "step": 27105 + }, + { + "epoch": 1.8419622231281423, + "grad_norm": 5.594574928283691, + "learning_rate": 7.698481451284142e-06, + "loss": 3.2189, + "step": 27110 + }, + { + "epoch": 1.8423019431988041, + "grad_norm": 6.995436668395996, + "learning_rate": 7.698056801195815e-06, + "loss": 2.9601, + "step": 27115 + }, + { + "epoch": 1.842641663269466, + "grad_norm": 6.870937347412109, + "learning_rate": 7.697632151107488e-06, + "loss": 3.2336, + "step": 27120 + }, + { + "epoch": 1.8429813833401276, + "grad_norm": 7.762845993041992, + "learning_rate": 7.69720750101916e-06, + "loss": 3.3334, + "step": 27125 + }, + { + "epoch": 1.8433211034107895, + "grad_norm": 7.199080467224121, + "learning_rate": 7.696782850930833e-06, + "loss": 3.1654, + "step": 27130 + }, + { + "epoch": 1.8436608234814513, + "grad_norm": 5.343786239624023, + "learning_rate": 7.696358200842506e-06, + "loss": 3.1687, + "step": 27135 + }, + { + "epoch": 1.844000543552113, + "grad_norm": 5.665068626403809, + "learning_rate": 7.695933550754179e-06, + "loss": 3.1587, + "step": 27140 + }, + { + "epoch": 1.8443402636227748, + "grad_norm": 5.479167461395264, + "learning_rate": 7.695508900665852e-06, + "loss": 2.9778, + "step": 27145 + }, + { + "epoch": 1.8446799836934367, + "grad_norm": 8.066679000854492, + "learning_rate": 7.695084250577524e-06, + "loss": 3.1824, + "step": 27150 + }, + { + "epoch": 1.8450197037640983, + "grad_norm": 6.046168804168701, + "learning_rate": 7.694659600489197e-06, + "loss": 3.2212, + "step": 27155 + }, + { + "epoch": 1.8453594238347601, + "grad_norm": 6.865650653839111, + "learning_rate": 7.69423495040087e-06, + "loss": 3.3171, + "step": 27160 + }, + { + "epoch": 1.845699143905422, + "grad_norm": 9.235589027404785, + "learning_rate": 7.693810300312543e-06, + "loss": 2.9625, + "step": 27165 + }, + { + "epoch": 1.8460388639760836, + "grad_norm": 7.195647716522217, + "learning_rate": 7.693385650224216e-06, + "loss": 3.0681, + "step": 27170 + }, + { + "epoch": 1.8463785840467455, + "grad_norm": 6.774362087249756, + "learning_rate": 7.692961000135888e-06, + "loss": 3.2838, + "step": 27175 + }, + { + "epoch": 1.8467183041174073, + "grad_norm": 9.515990257263184, + "learning_rate": 7.692536350047561e-06, + "loss": 3.4953, + "step": 27180 + }, + { + "epoch": 1.847058024188069, + "grad_norm": 8.22684383392334, + "learning_rate": 7.692111699959234e-06, + "loss": 3.2498, + "step": 27185 + }, + { + "epoch": 1.8473977442587308, + "grad_norm": 7.211028575897217, + "learning_rate": 7.691687049870907e-06, + "loss": 3.4112, + "step": 27190 + }, + { + "epoch": 1.8477374643293927, + "grad_norm": 5.646414279937744, + "learning_rate": 7.69126239978258e-06, + "loss": 3.3665, + "step": 27195 + }, + { + "epoch": 1.8480771844000543, + "grad_norm": 6.322606563568115, + "learning_rate": 7.690837749694253e-06, + "loss": 3.2071, + "step": 27200 + }, + { + "epoch": 1.8484169044707162, + "grad_norm": 5.8209004402160645, + "learning_rate": 7.690413099605925e-06, + "loss": 3.5006, + "step": 27205 + }, + { + "epoch": 1.848756624541378, + "grad_norm": 5.442793846130371, + "learning_rate": 7.689988449517598e-06, + "loss": 3.1689, + "step": 27210 + }, + { + "epoch": 1.8490963446120396, + "grad_norm": 5.320268630981445, + "learning_rate": 7.689563799429271e-06, + "loss": 2.9617, + "step": 27215 + }, + { + "epoch": 1.8494360646827015, + "grad_norm": 5.877702713012695, + "learning_rate": 7.689139149340944e-06, + "loss": 3.2148, + "step": 27220 + }, + { + "epoch": 1.8497757847533634, + "grad_norm": 8.723907470703125, + "learning_rate": 7.688714499252617e-06, + "loss": 3.1379, + "step": 27225 + }, + { + "epoch": 1.850115504824025, + "grad_norm": 8.698330879211426, + "learning_rate": 7.68828984916429e-06, + "loss": 2.9994, + "step": 27230 + }, + { + "epoch": 1.8504552248946866, + "grad_norm": 6.248712062835693, + "learning_rate": 7.687865199075962e-06, + "loss": 3.2515, + "step": 27235 + }, + { + "epoch": 1.8507949449653487, + "grad_norm": 7.114750862121582, + "learning_rate": 7.687440548987635e-06, + "loss": 3.1492, + "step": 27240 + }, + { + "epoch": 1.8511346650360103, + "grad_norm": 7.182178020477295, + "learning_rate": 7.687015898899308e-06, + "loss": 3.208, + "step": 27245 + }, + { + "epoch": 1.851474385106672, + "grad_norm": 5.490970611572266, + "learning_rate": 7.68659124881098e-06, + "loss": 3.0126, + "step": 27250 + }, + { + "epoch": 1.851814105177334, + "grad_norm": 6.791418552398682, + "learning_rate": 7.686166598722653e-06, + "loss": 3.1679, + "step": 27255 + }, + { + "epoch": 1.8521538252479957, + "grad_norm": 5.47775411605835, + "learning_rate": 7.685741948634326e-06, + "loss": 3.2147, + "step": 27260 + }, + { + "epoch": 1.8524935453186573, + "grad_norm": 7.726205825805664, + "learning_rate": 7.685317298545999e-06, + "loss": 3.2868, + "step": 27265 + }, + { + "epoch": 1.8528332653893194, + "grad_norm": 8.352020263671875, + "learning_rate": 7.684892648457672e-06, + "loss": 3.1921, + "step": 27270 + }, + { + "epoch": 1.853172985459981, + "grad_norm": 7.172081470489502, + "learning_rate": 7.684467998369345e-06, + "loss": 3.5223, + "step": 27275 + }, + { + "epoch": 1.8535127055306426, + "grad_norm": 7.418200969696045, + "learning_rate": 7.684043348281017e-06, + "loss": 3.2606, + "step": 27280 + }, + { + "epoch": 1.8538524256013045, + "grad_norm": 7.5486226081848145, + "learning_rate": 7.68361869819269e-06, + "loss": 3.109, + "step": 27285 + }, + { + "epoch": 1.8541921456719663, + "grad_norm": 5.081632614135742, + "learning_rate": 7.683194048104363e-06, + "loss": 3.1167, + "step": 27290 + }, + { + "epoch": 1.854531865742628, + "grad_norm": 7.781050205230713, + "learning_rate": 7.682769398016036e-06, + "loss": 3.161, + "step": 27295 + }, + { + "epoch": 1.8548715858132898, + "grad_norm": 8.335197448730469, + "learning_rate": 7.682344747927709e-06, + "loss": 3.2273, + "step": 27300 + }, + { + "epoch": 1.8552113058839517, + "grad_norm": 6.354374408721924, + "learning_rate": 7.681920097839381e-06, + "loss": 3.1527, + "step": 27305 + }, + { + "epoch": 1.8555510259546133, + "grad_norm": 8.075221061706543, + "learning_rate": 7.681495447751054e-06, + "loss": 3.002, + "step": 27310 + }, + { + "epoch": 1.8558907460252752, + "grad_norm": 8.226201057434082, + "learning_rate": 7.681070797662727e-06, + "loss": 3.3414, + "step": 27315 + }, + { + "epoch": 1.856230466095937, + "grad_norm": 6.462031841278076, + "learning_rate": 7.6806461475744e-06, + "loss": 3.042, + "step": 27320 + }, + { + "epoch": 1.8565701861665986, + "grad_norm": 6.437406539916992, + "learning_rate": 7.680221497486073e-06, + "loss": 3.0079, + "step": 27325 + }, + { + "epoch": 1.8569099062372605, + "grad_norm": 7.854445457458496, + "learning_rate": 7.679796847397745e-06, + "loss": 3.1195, + "step": 27330 + }, + { + "epoch": 1.8572496263079223, + "grad_norm": 7.131911754608154, + "learning_rate": 7.679372197309418e-06, + "loss": 3.0265, + "step": 27335 + }, + { + "epoch": 1.857589346378584, + "grad_norm": 6.344163417816162, + "learning_rate": 7.678947547221091e-06, + "loss": 3.003, + "step": 27340 + }, + { + "epoch": 1.8579290664492458, + "grad_norm": 6.690452575683594, + "learning_rate": 7.678522897132764e-06, + "loss": 2.9856, + "step": 27345 + }, + { + "epoch": 1.8582687865199077, + "grad_norm": 7.833378791809082, + "learning_rate": 7.678098247044437e-06, + "loss": 3.1556, + "step": 27350 + }, + { + "epoch": 1.8586085065905693, + "grad_norm": 5.732474327087402, + "learning_rate": 7.67767359695611e-06, + "loss": 3.2311, + "step": 27355 + }, + { + "epoch": 1.8589482266612312, + "grad_norm": 6.355639934539795, + "learning_rate": 7.677248946867782e-06, + "loss": 3.1084, + "step": 27360 + }, + { + "epoch": 1.859287946731893, + "grad_norm": 6.782613277435303, + "learning_rate": 7.676824296779455e-06, + "loss": 3.0529, + "step": 27365 + }, + { + "epoch": 1.8596276668025546, + "grad_norm": 5.80108118057251, + "learning_rate": 7.676399646691128e-06, + "loss": 3.0519, + "step": 27370 + }, + { + "epoch": 1.8599673868732165, + "grad_norm": 9.268766403198242, + "learning_rate": 7.675974996602799e-06, + "loss": 3.195, + "step": 27375 + }, + { + "epoch": 1.8603071069438784, + "grad_norm": 6.975952625274658, + "learning_rate": 7.675550346514473e-06, + "loss": 3.0668, + "step": 27380 + }, + { + "epoch": 1.86064682701454, + "grad_norm": 7.354006767272949, + "learning_rate": 7.675125696426146e-06, + "loss": 3.1549, + "step": 27385 + }, + { + "epoch": 1.8609865470852018, + "grad_norm": 8.1115140914917, + "learning_rate": 7.674701046337817e-06, + "loss": 2.898, + "step": 27390 + }, + { + "epoch": 1.8613262671558637, + "grad_norm": 5.378767490386963, + "learning_rate": 7.674276396249492e-06, + "loss": 3.2681, + "step": 27395 + }, + { + "epoch": 1.8616659872265253, + "grad_norm": 7.971062660217285, + "learning_rate": 7.673851746161165e-06, + "loss": 3.3807, + "step": 27400 + }, + { + "epoch": 1.862005707297187, + "grad_norm": 8.601875305175781, + "learning_rate": 7.673427096072836e-06, + "loss": 3.0892, + "step": 27405 + }, + { + "epoch": 1.862345427367849, + "grad_norm": 6.918034553527832, + "learning_rate": 7.67300244598451e-06, + "loss": 3.3245, + "step": 27410 + }, + { + "epoch": 1.8626851474385107, + "grad_norm": 7.400420665740967, + "learning_rate": 7.672577795896183e-06, + "loss": 3.3614, + "step": 27415 + }, + { + "epoch": 1.8630248675091723, + "grad_norm": 6.310996055603027, + "learning_rate": 7.672153145807854e-06, + "loss": 3.2308, + "step": 27420 + }, + { + "epoch": 1.8633645875798344, + "grad_norm": 6.794577598571777, + "learning_rate": 7.671728495719529e-06, + "loss": 3.2229, + "step": 27425 + }, + { + "epoch": 1.863704307650496, + "grad_norm": 6.597177982330322, + "learning_rate": 7.671303845631201e-06, + "loss": 3.2282, + "step": 27430 + }, + { + "epoch": 1.8640440277211576, + "grad_norm": 5.115915775299072, + "learning_rate": 7.670879195542872e-06, + "loss": 3.1878, + "step": 27435 + }, + { + "epoch": 1.8643837477918197, + "grad_norm": 8.03510856628418, + "learning_rate": 7.670454545454547e-06, + "loss": 3.347, + "step": 27440 + }, + { + "epoch": 1.8647234678624813, + "grad_norm": 7.274251937866211, + "learning_rate": 7.670029895366218e-06, + "loss": 3.1527, + "step": 27445 + }, + { + "epoch": 1.865063187933143, + "grad_norm": 6.805904865264893, + "learning_rate": 7.66960524527789e-06, + "loss": 2.9795, + "step": 27450 + }, + { + "epoch": 1.8654029080038048, + "grad_norm": 6.576681613922119, + "learning_rate": 7.669180595189565e-06, + "loss": 2.9318, + "step": 27455 + }, + { + "epoch": 1.8657426280744667, + "grad_norm": 5.409641265869141, + "learning_rate": 7.668755945101236e-06, + "loss": 3.2788, + "step": 27460 + }, + { + "epoch": 1.8660823481451283, + "grad_norm": 7.783731937408447, + "learning_rate": 7.66833129501291e-06, + "loss": 3.0432, + "step": 27465 + }, + { + "epoch": 1.8664220682157902, + "grad_norm": 7.148990154266357, + "learning_rate": 7.667906644924584e-06, + "loss": 3.1848, + "step": 27470 + }, + { + "epoch": 1.866761788286452, + "grad_norm": 6.118326187133789, + "learning_rate": 7.667481994836255e-06, + "loss": 2.9151, + "step": 27475 + }, + { + "epoch": 1.8671015083571136, + "grad_norm": 6.884566783905029, + "learning_rate": 7.667057344747928e-06, + "loss": 3.2953, + "step": 27480 + }, + { + "epoch": 1.8674412284277755, + "grad_norm": 5.649154186248779, + "learning_rate": 7.666632694659602e-06, + "loss": 2.9669, + "step": 27485 + }, + { + "epoch": 1.8677809484984373, + "grad_norm": 7.1247968673706055, + "learning_rate": 7.666208044571273e-06, + "loss": 3.0287, + "step": 27490 + }, + { + "epoch": 1.868120668569099, + "grad_norm": 8.247347831726074, + "learning_rate": 7.665783394482946e-06, + "loss": 3.3912, + "step": 27495 + }, + { + "epoch": 1.8684603886397608, + "grad_norm": 7.812985897064209, + "learning_rate": 7.66535874439462e-06, + "loss": 3.1415, + "step": 27500 + }, + { + "epoch": 1.8688001087104227, + "grad_norm": 6.709445476531982, + "learning_rate": 7.664934094306292e-06, + "loss": 3.2131, + "step": 27505 + }, + { + "epoch": 1.8691398287810843, + "grad_norm": 7.906744480133057, + "learning_rate": 7.664509444217964e-06, + "loss": 2.8955, + "step": 27510 + }, + { + "epoch": 1.8694795488517462, + "grad_norm": 5.841775894165039, + "learning_rate": 7.664084794129639e-06, + "loss": 3.2302, + "step": 27515 + }, + { + "epoch": 1.869819268922408, + "grad_norm": 6.67901086807251, + "learning_rate": 7.66366014404131e-06, + "loss": 3.0071, + "step": 27520 + }, + { + "epoch": 1.8701589889930696, + "grad_norm": 6.264431953430176, + "learning_rate": 7.663235493952983e-06, + "loss": 3.1097, + "step": 27525 + }, + { + "epoch": 1.8704987090637315, + "grad_norm": 5.368793487548828, + "learning_rate": 7.662810843864656e-06, + "loss": 3.236, + "step": 27530 + }, + { + "epoch": 1.8708384291343934, + "grad_norm": 6.7178053855896, + "learning_rate": 7.662386193776328e-06, + "loss": 2.8115, + "step": 27535 + }, + { + "epoch": 1.871178149205055, + "grad_norm": 6.571673393249512, + "learning_rate": 7.661961543688001e-06, + "loss": 3.3798, + "step": 27540 + }, + { + "epoch": 1.8715178692757168, + "grad_norm": 6.575045108795166, + "learning_rate": 7.661536893599674e-06, + "loss": 3.1765, + "step": 27545 + }, + { + "epoch": 1.8718575893463787, + "grad_norm": 6.140381813049316, + "learning_rate": 7.661112243511347e-06, + "loss": 3.3464, + "step": 27550 + }, + { + "epoch": 1.8721973094170403, + "grad_norm": 7.00314474105835, + "learning_rate": 7.66068759342302e-06, + "loss": 3.1419, + "step": 27555 + }, + { + "epoch": 1.8725370294877022, + "grad_norm": 7.0595598220825195, + "learning_rate": 7.660262943334692e-06, + "loss": 2.8531, + "step": 27560 + }, + { + "epoch": 1.872876749558364, + "grad_norm": 7.322221755981445, + "learning_rate": 7.659838293246365e-06, + "loss": 3.1793, + "step": 27565 + }, + { + "epoch": 1.8732164696290257, + "grad_norm": 6.2719244956970215, + "learning_rate": 7.659413643158038e-06, + "loss": 3.0391, + "step": 27570 + }, + { + "epoch": 1.8735561896996873, + "grad_norm": 7.496593952178955, + "learning_rate": 7.658988993069711e-06, + "loss": 3.3573, + "step": 27575 + }, + { + "epoch": 1.8738959097703494, + "grad_norm": 8.718388557434082, + "learning_rate": 7.658564342981384e-06, + "loss": 3.4073, + "step": 27580 + }, + { + "epoch": 1.874235629841011, + "grad_norm": 6.470010280609131, + "learning_rate": 7.658139692893056e-06, + "loss": 3.1814, + "step": 27585 + }, + { + "epoch": 1.8745753499116726, + "grad_norm": 5.537491798400879, + "learning_rate": 7.65771504280473e-06, + "loss": 2.9701, + "step": 27590 + }, + { + "epoch": 1.8749150699823347, + "grad_norm": 7.285256862640381, + "learning_rate": 7.657290392716402e-06, + "loss": 3.0875, + "step": 27595 + }, + { + "epoch": 1.8752547900529963, + "grad_norm": 6.89643669128418, + "learning_rate": 7.656865742628075e-06, + "loss": 3.0955, + "step": 27600 + }, + { + "epoch": 1.875594510123658, + "grad_norm": 8.097179412841797, + "learning_rate": 7.656441092539748e-06, + "loss": 3.2642, + "step": 27605 + }, + { + "epoch": 1.87593423019432, + "grad_norm": 6.066300392150879, + "learning_rate": 7.65601644245142e-06, + "loss": 3.0787, + "step": 27610 + }, + { + "epoch": 1.8762739502649817, + "grad_norm": 6.601510047912598, + "learning_rate": 7.655591792363093e-06, + "loss": 3.2652, + "step": 27615 + }, + { + "epoch": 1.8766136703356433, + "grad_norm": 5.813055038452148, + "learning_rate": 7.655167142274766e-06, + "loss": 3.0631, + "step": 27620 + }, + { + "epoch": 1.8769533904063052, + "grad_norm": 7.672543525695801, + "learning_rate": 7.654742492186439e-06, + "loss": 3.3481, + "step": 27625 + }, + { + "epoch": 1.877293110476967, + "grad_norm": 5.437895774841309, + "learning_rate": 7.654317842098112e-06, + "loss": 3.1461, + "step": 27630 + }, + { + "epoch": 1.8776328305476286, + "grad_norm": 7.086071491241455, + "learning_rate": 7.653893192009784e-06, + "loss": 2.8495, + "step": 27635 + }, + { + "epoch": 1.8779725506182905, + "grad_norm": 5.012383460998535, + "learning_rate": 7.653468541921457e-06, + "loss": 3.1717, + "step": 27640 + }, + { + "epoch": 1.8783122706889523, + "grad_norm": 8.005416870117188, + "learning_rate": 7.65304389183313e-06, + "loss": 2.937, + "step": 27645 + }, + { + "epoch": 1.878651990759614, + "grad_norm": 9.101865768432617, + "learning_rate": 7.652619241744803e-06, + "loss": 2.9788, + "step": 27650 + }, + { + "epoch": 1.8789917108302758, + "grad_norm": 5.632835865020752, + "learning_rate": 7.652194591656476e-06, + "loss": 3.1244, + "step": 27655 + }, + { + "epoch": 1.8793314309009377, + "grad_norm": 5.919254779815674, + "learning_rate": 7.651769941568148e-06, + "loss": 3.0699, + "step": 27660 + }, + { + "epoch": 1.8796711509715993, + "grad_norm": 6.733277320861816, + "learning_rate": 7.651345291479821e-06, + "loss": 3.2546, + "step": 27665 + }, + { + "epoch": 1.8800108710422612, + "grad_norm": 6.5069684982299805, + "learning_rate": 7.650920641391494e-06, + "loss": 3.1375, + "step": 27670 + }, + { + "epoch": 1.880350591112923, + "grad_norm": 6.475186347961426, + "learning_rate": 7.650495991303167e-06, + "loss": 3.0632, + "step": 27675 + }, + { + "epoch": 1.8806903111835847, + "grad_norm": 6.543583869934082, + "learning_rate": 7.65007134121484e-06, + "loss": 3.2159, + "step": 27680 + }, + { + "epoch": 1.8810300312542465, + "grad_norm": 7.1212310791015625, + "learning_rate": 7.649646691126512e-06, + "loss": 3.1926, + "step": 27685 + }, + { + "epoch": 1.8813697513249084, + "grad_norm": 5.3468427658081055, + "learning_rate": 7.649222041038185e-06, + "loss": 3.1146, + "step": 27690 + }, + { + "epoch": 1.88170947139557, + "grad_norm": 7.5674357414245605, + "learning_rate": 7.648797390949858e-06, + "loss": 3.2451, + "step": 27695 + }, + { + "epoch": 1.8820491914662318, + "grad_norm": 7.558800220489502, + "learning_rate": 7.648372740861531e-06, + "loss": 3.066, + "step": 27700 + }, + { + "epoch": 1.8823889115368937, + "grad_norm": 6.959020614624023, + "learning_rate": 7.647948090773204e-06, + "loss": 3.0603, + "step": 27705 + }, + { + "epoch": 1.8827286316075553, + "grad_norm": 7.721263885498047, + "learning_rate": 7.647523440684876e-06, + "loss": 3.1962, + "step": 27710 + }, + { + "epoch": 1.8830683516782172, + "grad_norm": 5.725180149078369, + "learning_rate": 7.64709879059655e-06, + "loss": 3.2173, + "step": 27715 + }, + { + "epoch": 1.883408071748879, + "grad_norm": 6.241030693054199, + "learning_rate": 7.646674140508222e-06, + "loss": 3.0313, + "step": 27720 + }, + { + "epoch": 1.8837477918195407, + "grad_norm": 5.192254543304443, + "learning_rate": 7.646249490419895e-06, + "loss": 3.1494, + "step": 27725 + }, + { + "epoch": 1.8840875118902025, + "grad_norm": 7.890355587005615, + "learning_rate": 7.645824840331568e-06, + "loss": 3.1537, + "step": 27730 + }, + { + "epoch": 1.8844272319608644, + "grad_norm": 8.08179759979248, + "learning_rate": 7.64540019024324e-06, + "loss": 3.3203, + "step": 27735 + }, + { + "epoch": 1.884766952031526, + "grad_norm": 7.832530975341797, + "learning_rate": 7.644975540154913e-06, + "loss": 3.2528, + "step": 27740 + }, + { + "epoch": 1.8851066721021879, + "grad_norm": 5.716762542724609, + "learning_rate": 7.644550890066586e-06, + "loss": 3.0073, + "step": 27745 + }, + { + "epoch": 1.8854463921728497, + "grad_norm": 5.669264316558838, + "learning_rate": 7.644126239978259e-06, + "loss": 3.0253, + "step": 27750 + }, + { + "epoch": 1.8857861122435113, + "grad_norm": 6.44020414352417, + "learning_rate": 7.643701589889932e-06, + "loss": 2.9587, + "step": 27755 + }, + { + "epoch": 1.886125832314173, + "grad_norm": 7.300680637359619, + "learning_rate": 7.643276939801604e-06, + "loss": 3.2764, + "step": 27760 + }, + { + "epoch": 1.886465552384835, + "grad_norm": 7.436569690704346, + "learning_rate": 7.642852289713277e-06, + "loss": 3.2388, + "step": 27765 + }, + { + "epoch": 1.8868052724554967, + "grad_norm": 7.452202320098877, + "learning_rate": 7.64242763962495e-06, + "loss": 3.2525, + "step": 27770 + }, + { + "epoch": 1.8871449925261583, + "grad_norm": 7.428544044494629, + "learning_rate": 7.642002989536623e-06, + "loss": 2.9167, + "step": 27775 + }, + { + "epoch": 1.8874847125968204, + "grad_norm": 5.458137035369873, + "learning_rate": 7.641578339448296e-06, + "loss": 3.0172, + "step": 27780 + }, + { + "epoch": 1.887824432667482, + "grad_norm": 8.966728210449219, + "learning_rate": 7.641153689359968e-06, + "loss": 3.0033, + "step": 27785 + }, + { + "epoch": 1.8881641527381436, + "grad_norm": 6.760031223297119, + "learning_rate": 7.64072903927164e-06, + "loss": 3.3567, + "step": 27790 + }, + { + "epoch": 1.8885038728088055, + "grad_norm": 5.678564071655273, + "learning_rate": 7.640304389183314e-06, + "loss": 3.068, + "step": 27795 + }, + { + "epoch": 1.8888435928794673, + "grad_norm": 6.811620712280273, + "learning_rate": 7.639879739094987e-06, + "loss": 3.0312, + "step": 27800 + }, + { + "epoch": 1.889183312950129, + "grad_norm": 5.726400852203369, + "learning_rate": 7.639455089006658e-06, + "loss": 3.1116, + "step": 27805 + }, + { + "epoch": 1.8895230330207908, + "grad_norm": 6.006499767303467, + "learning_rate": 7.639030438918333e-06, + "loss": 3.3409, + "step": 27810 + }, + { + "epoch": 1.8898627530914527, + "grad_norm": 8.117121696472168, + "learning_rate": 7.638605788830005e-06, + "loss": 3.0556, + "step": 27815 + }, + { + "epoch": 1.8902024731621143, + "grad_norm": 6.1916704177856445, + "learning_rate": 7.638181138741676e-06, + "loss": 3.1199, + "step": 27820 + }, + { + "epoch": 1.8905421932327762, + "grad_norm": 5.31387996673584, + "learning_rate": 7.637756488653351e-06, + "loss": 3.1025, + "step": 27825 + }, + { + "epoch": 1.890881913303438, + "grad_norm": 8.824198722839355, + "learning_rate": 7.637331838565024e-06, + "loss": 3.5076, + "step": 27830 + }, + { + "epoch": 1.8912216333740997, + "grad_norm": 6.949975490570068, + "learning_rate": 7.636907188476695e-06, + "loss": 3.3185, + "step": 27835 + }, + { + "epoch": 1.8915613534447615, + "grad_norm": 5.725770950317383, + "learning_rate": 7.63648253838837e-06, + "loss": 3.0833, + "step": 27840 + }, + { + "epoch": 1.8919010735154234, + "grad_norm": 7.031792640686035, + "learning_rate": 7.636057888300042e-06, + "loss": 3.0491, + "step": 27845 + }, + { + "epoch": 1.892240793586085, + "grad_norm": 6.90504264831543, + "learning_rate": 7.635633238211713e-06, + "loss": 3.0461, + "step": 27850 + }, + { + "epoch": 1.8925805136567468, + "grad_norm": 6.472363471984863, + "learning_rate": 7.635208588123388e-06, + "loss": 3.4483, + "step": 27855 + }, + { + "epoch": 1.8929202337274087, + "grad_norm": 6.153521537780762, + "learning_rate": 7.63478393803506e-06, + "loss": 3.4666, + "step": 27860 + }, + { + "epoch": 1.8932599537980703, + "grad_norm": 8.732483863830566, + "learning_rate": 7.634359287946732e-06, + "loss": 2.9813, + "step": 27865 + }, + { + "epoch": 1.8935996738687322, + "grad_norm": 7.045727729797363, + "learning_rate": 7.633934637858406e-06, + "loss": 3.3749, + "step": 27870 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 7.041116714477539, + "learning_rate": 7.633509987770077e-06, + "loss": 3.3774, + "step": 27875 + }, + { + "epoch": 1.8942791140100557, + "grad_norm": 6.801476001739502, + "learning_rate": 7.63308533768175e-06, + "loss": 3.2796, + "step": 27880 + }, + { + "epoch": 1.8946188340807175, + "grad_norm": 7.7943925857543945, + "learning_rate": 7.632660687593425e-06, + "loss": 3.0305, + "step": 27885 + }, + { + "epoch": 1.8949585541513794, + "grad_norm": 5.477722644805908, + "learning_rate": 7.632236037505096e-06, + "loss": 3.0983, + "step": 27890 + }, + { + "epoch": 1.895298274222041, + "grad_norm": 7.383486270904541, + "learning_rate": 7.631811387416768e-06, + "loss": 3.1149, + "step": 27895 + }, + { + "epoch": 1.8956379942927029, + "grad_norm": 7.419162750244141, + "learning_rate": 7.631386737328443e-06, + "loss": 3.2563, + "step": 27900 + }, + { + "epoch": 1.8959777143633647, + "grad_norm": 8.45196533203125, + "learning_rate": 7.630962087240114e-06, + "loss": 3.0246, + "step": 27905 + }, + { + "epoch": 1.8963174344340263, + "grad_norm": 6.445399284362793, + "learning_rate": 7.630537437151787e-06, + "loss": 3.2365, + "step": 27910 + }, + { + "epoch": 1.8966571545046882, + "grad_norm": 6.480842113494873, + "learning_rate": 7.630112787063461e-06, + "loss": 3.1051, + "step": 27915 + }, + { + "epoch": 1.89699687457535, + "grad_norm": 6.668388843536377, + "learning_rate": 7.629688136975132e-06, + "loss": 3.2249, + "step": 27920 + }, + { + "epoch": 1.8973365946460117, + "grad_norm": 6.899972915649414, + "learning_rate": 7.629263486886805e-06, + "loss": 3.1672, + "step": 27925 + }, + { + "epoch": 1.8976763147166733, + "grad_norm": 6.104898929595947, + "learning_rate": 7.628838836798479e-06, + "loss": 3.2229, + "step": 27930 + }, + { + "epoch": 1.8980160347873354, + "grad_norm": 7.664186954498291, + "learning_rate": 7.628414186710152e-06, + "loss": 3.1823, + "step": 27935 + }, + { + "epoch": 1.898355754857997, + "grad_norm": 7.0849714279174805, + "learning_rate": 7.627989536621824e-06, + "loss": 3.1278, + "step": 27940 + }, + { + "epoch": 1.8986954749286586, + "grad_norm": 6.574739933013916, + "learning_rate": 7.627564886533497e-06, + "loss": 3.0192, + "step": 27945 + }, + { + "epoch": 1.8990351949993207, + "grad_norm": 6.423635482788086, + "learning_rate": 7.62714023644517e-06, + "loss": 3.1209, + "step": 27950 + }, + { + "epoch": 1.8993749150699824, + "grad_norm": 6.701910495758057, + "learning_rate": 7.626715586356842e-06, + "loss": 2.9644, + "step": 27955 + }, + { + "epoch": 1.899714635140644, + "grad_norm": 6.248241901397705, + "learning_rate": 7.626290936268516e-06, + "loss": 3.0705, + "step": 27960 + }, + { + "epoch": 1.9000543552113058, + "grad_norm": 7.102050304412842, + "learning_rate": 7.625866286180188e-06, + "loss": 3.2308, + "step": 27965 + }, + { + "epoch": 1.9003940752819677, + "grad_norm": 5.377800941467285, + "learning_rate": 7.6254416360918604e-06, + "loss": 3.3502, + "step": 27970 + }, + { + "epoch": 1.9007337953526293, + "grad_norm": 5.5517683029174805, + "learning_rate": 7.625016986003534e-06, + "loss": 3.0125, + "step": 27975 + }, + { + "epoch": 1.9010735154232912, + "grad_norm": 6.15374755859375, + "learning_rate": 7.624592335915206e-06, + "loss": 3.2829, + "step": 27980 + }, + { + "epoch": 1.901413235493953, + "grad_norm": 6.0381269454956055, + "learning_rate": 7.62416768582688e-06, + "loss": 3.5171, + "step": 27985 + }, + { + "epoch": 1.9017529555646147, + "grad_norm": 8.127166748046875, + "learning_rate": 7.6237430357385525e-06, + "loss": 3.1053, + "step": 27990 + }, + { + "epoch": 1.9020926756352765, + "grad_norm": 8.914494514465332, + "learning_rate": 7.6233183856502244e-06, + "loss": 3.148, + "step": 27995 + }, + { + "epoch": 1.9024323957059384, + "grad_norm": 5.809132099151611, + "learning_rate": 7.622893735561898e-06, + "loss": 3.0885, + "step": 28000 + }, + { + "epoch": 1.9027721157766, + "grad_norm": 6.121757984161377, + "learning_rate": 7.622469085473571e-06, + "loss": 3.1419, + "step": 28005 + }, + { + "epoch": 1.9031118358472618, + "grad_norm": 6.740731716156006, + "learning_rate": 7.622044435385243e-06, + "loss": 3.349, + "step": 28010 + }, + { + "epoch": 1.9034515559179237, + "grad_norm": 7.703741073608398, + "learning_rate": 7.6216197852969165e-06, + "loss": 2.9281, + "step": 28015 + }, + { + "epoch": 1.9037912759885853, + "grad_norm": 6.971902847290039, + "learning_rate": 7.621195135208589e-06, + "loss": 3.1757, + "step": 28020 + }, + { + "epoch": 1.9041309960592472, + "grad_norm": 6.3673858642578125, + "learning_rate": 7.620770485120261e-06, + "loss": 3.0982, + "step": 28025 + }, + { + "epoch": 1.904470716129909, + "grad_norm": 5.813795566558838, + "learning_rate": 7.620345835031935e-06, + "loss": 3.1491, + "step": 28030 + }, + { + "epoch": 1.9048104362005707, + "grad_norm": 7.482023239135742, + "learning_rate": 7.619921184943607e-06, + "loss": 3.1924, + "step": 28035 + }, + { + "epoch": 1.9051501562712325, + "grad_norm": 8.701274871826172, + "learning_rate": 7.61949653485528e-06, + "loss": 3.2084, + "step": 28040 + }, + { + "epoch": 1.9054898763418944, + "grad_norm": 6.442837238311768, + "learning_rate": 7.619071884766953e-06, + "loss": 3.312, + "step": 28045 + }, + { + "epoch": 1.905829596412556, + "grad_norm": 6.656210422515869, + "learning_rate": 7.618647234678625e-06, + "loss": 2.9849, + "step": 28050 + }, + { + "epoch": 1.9061693164832179, + "grad_norm": 7.515468120574951, + "learning_rate": 7.618222584590298e-06, + "loss": 3.1387, + "step": 28055 + }, + { + "epoch": 1.9065090365538797, + "grad_norm": 5.941793918609619, + "learning_rate": 7.617797934501972e-06, + "loss": 3.217, + "step": 28060 + }, + { + "epoch": 1.9068487566245413, + "grad_norm": 6.118073463439941, + "learning_rate": 7.617373284413644e-06, + "loss": 2.9195, + "step": 28065 + }, + { + "epoch": 1.9071884766952032, + "grad_norm": 5.818133354187012, + "learning_rate": 7.6169486343253164e-06, + "loss": 3.0739, + "step": 28070 + }, + { + "epoch": 1.907528196765865, + "grad_norm": 6.2356648445129395, + "learning_rate": 7.61652398423699e-06, + "loss": 3.3071, + "step": 28075 + }, + { + "epoch": 1.9078679168365267, + "grad_norm": 6.196620464324951, + "learning_rate": 7.616099334148662e-06, + "loss": 3.098, + "step": 28080 + }, + { + "epoch": 1.9082076369071885, + "grad_norm": 7.599545001983643, + "learning_rate": 7.615674684060335e-06, + "loss": 3.0905, + "step": 28085 + }, + { + "epoch": 1.9085473569778504, + "grad_norm": 5.609899520874023, + "learning_rate": 7.6152500339720085e-06, + "loss": 3.3157, + "step": 28090 + }, + { + "epoch": 1.908887077048512, + "grad_norm": 6.986759662628174, + "learning_rate": 7.6148253838836805e-06, + "loss": 3.0672, + "step": 28095 + }, + { + "epoch": 1.9092267971191736, + "grad_norm": 7.4459638595581055, + "learning_rate": 7.614400733795353e-06, + "loss": 3.1678, + "step": 28100 + }, + { + "epoch": 1.9095665171898357, + "grad_norm": 5.9386186599731445, + "learning_rate": 7.613976083707026e-06, + "loss": 3.0442, + "step": 28105 + }, + { + "epoch": 1.9099062372604974, + "grad_norm": 7.616998195648193, + "learning_rate": 7.613551433618699e-06, + "loss": 3.1302, + "step": 28110 + }, + { + "epoch": 1.910245957331159, + "grad_norm": 5.898701190948486, + "learning_rate": 7.613126783530372e-06, + "loss": 3.0655, + "step": 28115 + }, + { + "epoch": 1.910585677401821, + "grad_norm": 6.437546730041504, + "learning_rate": 7.6127021334420445e-06, + "loss": 3.4147, + "step": 28120 + }, + { + "epoch": 1.9109253974724827, + "grad_norm": 6.89699125289917, + "learning_rate": 7.612277483353717e-06, + "loss": 3.4042, + "step": 28125 + }, + { + "epoch": 1.9112651175431443, + "grad_norm": 6.883574962615967, + "learning_rate": 7.611852833265389e-06, + "loss": 3.1854, + "step": 28130 + }, + { + "epoch": 1.9116048376138062, + "grad_norm": 7.857903957366943, + "learning_rate": 7.611428183177063e-06, + "loss": 2.9277, + "step": 28135 + }, + { + "epoch": 1.911944557684468, + "grad_norm": 6.655631065368652, + "learning_rate": 7.611003533088736e-06, + "loss": 3.4677, + "step": 28140 + }, + { + "epoch": 1.9122842777551297, + "grad_norm": 6.31135368347168, + "learning_rate": 7.610578883000408e-06, + "loss": 3.2635, + "step": 28145 + }, + { + "epoch": 1.9126239978257915, + "grad_norm": 6.824995517730713, + "learning_rate": 7.610154232912081e-06, + "loss": 3.1878, + "step": 28150 + }, + { + "epoch": 1.9129637178964534, + "grad_norm": 6.537460803985596, + "learning_rate": 7.609729582823754e-06, + "loss": 3.2031, + "step": 28155 + }, + { + "epoch": 1.913303437967115, + "grad_norm": 7.934191703796387, + "learning_rate": 7.609304932735426e-06, + "loss": 3.2686, + "step": 28160 + }, + { + "epoch": 1.9136431580377768, + "grad_norm": 6.884794235229492, + "learning_rate": 7.6088802826471e-06, + "loss": 3.2975, + "step": 28165 + }, + { + "epoch": 1.9139828781084387, + "grad_norm": 7.819282054901123, + "learning_rate": 7.6084556325587725e-06, + "loss": 3.1399, + "step": 28170 + }, + { + "epoch": 1.9143225981791003, + "grad_norm": 7.765523433685303, + "learning_rate": 7.608030982470444e-06, + "loss": 2.9693, + "step": 28175 + }, + { + "epoch": 1.9146623182497622, + "grad_norm": 7.7043657302856445, + "learning_rate": 7.607606332382118e-06, + "loss": 3.2922, + "step": 28180 + }, + { + "epoch": 1.915002038320424, + "grad_norm": 7.690728664398193, + "learning_rate": 7.607181682293791e-06, + "loss": 3.2059, + "step": 28185 + }, + { + "epoch": 1.9153417583910857, + "grad_norm": 6.931865692138672, + "learning_rate": 7.606757032205463e-06, + "loss": 3.096, + "step": 28190 + }, + { + "epoch": 1.9156814784617475, + "grad_norm": 5.762805938720703, + "learning_rate": 7.6063323821171365e-06, + "loss": 2.9988, + "step": 28195 + }, + { + "epoch": 1.9160211985324094, + "grad_norm": 6.170013904571533, + "learning_rate": 7.605907732028808e-06, + "loss": 3.4667, + "step": 28200 + }, + { + "epoch": 1.916360918603071, + "grad_norm": 7.937504291534424, + "learning_rate": 7.605483081940481e-06, + "loss": 3.2695, + "step": 28205 + }, + { + "epoch": 1.9167006386737329, + "grad_norm": 5.353275775909424, + "learning_rate": 7.605058431852155e-06, + "loss": 3.0936, + "step": 28210 + }, + { + "epoch": 1.9170403587443947, + "grad_norm": 5.101224422454834, + "learning_rate": 7.604633781763827e-06, + "loss": 3.0384, + "step": 28215 + }, + { + "epoch": 1.9173800788150563, + "grad_norm": 6.781169414520264, + "learning_rate": 7.6042091316755e-06, + "loss": 3.0979, + "step": 28220 + }, + { + "epoch": 1.9177197988857182, + "grad_norm": 7.425670146942139, + "learning_rate": 7.603784481587173e-06, + "loss": 2.9259, + "step": 28225 + }, + { + "epoch": 1.91805951895638, + "grad_norm": 6.02872896194458, + "learning_rate": 7.603359831498845e-06, + "loss": 2.803, + "step": 28230 + }, + { + "epoch": 1.9183992390270417, + "grad_norm": 6.040752410888672, + "learning_rate": 7.602935181410518e-06, + "loss": 3.1692, + "step": 28235 + }, + { + "epoch": 1.9187389590977035, + "grad_norm": 5.721450328826904, + "learning_rate": 7.602510531322192e-06, + "loss": 3.1889, + "step": 28240 + }, + { + "epoch": 1.9190786791683654, + "grad_norm": 7.432201385498047, + "learning_rate": 7.602085881233864e-06, + "loss": 3.1229, + "step": 28245 + }, + { + "epoch": 1.919418399239027, + "grad_norm": 7.702428817749023, + "learning_rate": 7.6016612311455364e-06, + "loss": 3.1813, + "step": 28250 + }, + { + "epoch": 1.9197581193096889, + "grad_norm": 6.866563320159912, + "learning_rate": 7.60123658105721e-06, + "loss": 3.2798, + "step": 28255 + }, + { + "epoch": 1.9200978393803507, + "grad_norm": 6.873692512512207, + "learning_rate": 7.600811930968882e-06, + "loss": 3.2552, + "step": 28260 + }, + { + "epoch": 1.9204375594510124, + "grad_norm": 8.553792953491211, + "learning_rate": 7.600387280880555e-06, + "loss": 3.4148, + "step": 28265 + }, + { + "epoch": 1.920777279521674, + "grad_norm": 6.915821075439453, + "learning_rate": 7.5999626307922285e-06, + "loss": 3.3171, + "step": 28270 + }, + { + "epoch": 1.921116999592336, + "grad_norm": 6.443299293518066, + "learning_rate": 7.5995379807039004e-06, + "loss": 3.2529, + "step": 28275 + }, + { + "epoch": 1.9214567196629977, + "grad_norm": 6.293262958526611, + "learning_rate": 7.599113330615573e-06, + "loss": 3.0079, + "step": 28280 + }, + { + "epoch": 1.9217964397336593, + "grad_norm": 7.475148677825928, + "learning_rate": 7.598688680527246e-06, + "loss": 3.1956, + "step": 28285 + }, + { + "epoch": 1.9221361598043214, + "grad_norm": 5.163491249084473, + "learning_rate": 7.598264030438919e-06, + "loss": 3.2969, + "step": 28290 + }, + { + "epoch": 1.922475879874983, + "grad_norm": 6.866916179656982, + "learning_rate": 7.597839380350591e-06, + "loss": 3.1287, + "step": 28295 + }, + { + "epoch": 1.9228155999456447, + "grad_norm": 6.208005428314209, + "learning_rate": 7.5974147302622644e-06, + "loss": 3.3673, + "step": 28300 + }, + { + "epoch": 1.9231553200163065, + "grad_norm": 5.326771259307861, + "learning_rate": 7.596990080173937e-06, + "loss": 3.2458, + "step": 28305 + }, + { + "epoch": 1.9234950400869684, + "grad_norm": 7.054676055908203, + "learning_rate": 7.596565430085609e-06, + "loss": 3.182, + "step": 28310 + }, + { + "epoch": 1.92383476015763, + "grad_norm": 6.114138603210449, + "learning_rate": 7.596140779997283e-06, + "loss": 3.465, + "step": 28315 + }, + { + "epoch": 1.9241744802282919, + "grad_norm": 6.872567653656006, + "learning_rate": 7.595716129908956e-06, + "loss": 3.2901, + "step": 28320 + }, + { + "epoch": 1.9245142002989537, + "grad_norm": 7.095840930938721, + "learning_rate": 7.595291479820629e-06, + "loss": 3.1902, + "step": 28325 + }, + { + "epoch": 1.9248539203696153, + "grad_norm": 7.446535110473633, + "learning_rate": 7.594866829732301e-06, + "loss": 3.1536, + "step": 28330 + }, + { + "epoch": 1.9251936404402772, + "grad_norm": 6.817484378814697, + "learning_rate": 7.594442179643974e-06, + "loss": 3.1517, + "step": 28335 + }, + { + "epoch": 1.925533360510939, + "grad_norm": 6.763257026672363, + "learning_rate": 7.594017529555648e-06, + "loss": 3.2521, + "step": 28340 + }, + { + "epoch": 1.9258730805816007, + "grad_norm": 6.803769111633301, + "learning_rate": 7.59359287946732e-06, + "loss": 3.0908, + "step": 28345 + }, + { + "epoch": 1.9262128006522625, + "grad_norm": 7.362276554107666, + "learning_rate": 7.5931682293789924e-06, + "loss": 3.1559, + "step": 28350 + }, + { + "epoch": 1.9265525207229244, + "grad_norm": 6.774146556854248, + "learning_rate": 7.592743579290665e-06, + "loss": 3.0296, + "step": 28355 + }, + { + "epoch": 1.926892240793586, + "grad_norm": 7.386756896972656, + "learning_rate": 7.592318929202338e-06, + "loss": 3.1193, + "step": 28360 + }, + { + "epoch": 1.9272319608642479, + "grad_norm": 6.460944652557373, + "learning_rate": 7.591894279114011e-06, + "loss": 3.1742, + "step": 28365 + }, + { + "epoch": 1.9275716809349097, + "grad_norm": 5.7047648429870605, + "learning_rate": 7.591469629025684e-06, + "loss": 3.0726, + "step": 28370 + }, + { + "epoch": 1.9279114010055713, + "grad_norm": 6.396632671356201, + "learning_rate": 7.5910449789373564e-06, + "loss": 3.1061, + "step": 28375 + }, + { + "epoch": 1.9282511210762332, + "grad_norm": 7.462396144866943, + "learning_rate": 7.590620328849028e-06, + "loss": 3.1905, + "step": 28380 + }, + { + "epoch": 1.928590841146895, + "grad_norm": 5.813670635223389, + "learning_rate": 7.590195678760702e-06, + "loss": 2.9292, + "step": 28385 + }, + { + "epoch": 1.9289305612175567, + "grad_norm": 7.893398284912109, + "learning_rate": 7.589771028672375e-06, + "loss": 3.2172, + "step": 28390 + }, + { + "epoch": 1.9292702812882185, + "grad_norm": 6.256020545959473, + "learning_rate": 7.589346378584047e-06, + "loss": 3.2474, + "step": 28395 + }, + { + "epoch": 1.9296100013588804, + "grad_norm": 7.1317219734191895, + "learning_rate": 7.5889217284957205e-06, + "loss": 3.1685, + "step": 28400 + }, + { + "epoch": 1.929949721429542, + "grad_norm": 6.777476787567139, + "learning_rate": 7.588497078407393e-06, + "loss": 3.2264, + "step": 28405 + }, + { + "epoch": 1.9302894415002039, + "grad_norm": 5.873265266418457, + "learning_rate": 7.588072428319065e-06, + "loss": 3.0676, + "step": 28410 + }, + { + "epoch": 1.9306291615708657, + "grad_norm": 7.285456657409668, + "learning_rate": 7.587647778230739e-06, + "loss": 2.758, + "step": 28415 + }, + { + "epoch": 1.9309688816415274, + "grad_norm": 5.978732109069824, + "learning_rate": 7.587223128142412e-06, + "loss": 3.3014, + "step": 28420 + }, + { + "epoch": 1.9313086017121892, + "grad_norm": 6.571243762969971, + "learning_rate": 7.586798478054084e-06, + "loss": 3.1267, + "step": 28425 + }, + { + "epoch": 1.931648321782851, + "grad_norm": 7.713825702667236, + "learning_rate": 7.586373827965757e-06, + "loss": 3.1728, + "step": 28430 + }, + { + "epoch": 1.9319880418535127, + "grad_norm": 8.73408031463623, + "learning_rate": 7.58594917787743e-06, + "loss": 3.1614, + "step": 28435 + }, + { + "epoch": 1.9323277619241743, + "grad_norm": 7.706807613372803, + "learning_rate": 7.585524527789102e-06, + "loss": 3.1382, + "step": 28440 + }, + { + "epoch": 1.9326674819948364, + "grad_norm": 6.6343255043029785, + "learning_rate": 7.585099877700776e-06, + "loss": 3.1767, + "step": 28445 + }, + { + "epoch": 1.933007202065498, + "grad_norm": 9.32791805267334, + "learning_rate": 7.584675227612448e-06, + "loss": 3.1931, + "step": 28450 + }, + { + "epoch": 1.9333469221361597, + "grad_norm": 6.568286895751953, + "learning_rate": 7.58425057752412e-06, + "loss": 3.193, + "step": 28455 + }, + { + "epoch": 1.9336866422068217, + "grad_norm": 5.2037529945373535, + "learning_rate": 7.583825927435794e-06, + "loss": 3.2273, + "step": 28460 + }, + { + "epoch": 1.9340263622774834, + "grad_norm": 7.19879150390625, + "learning_rate": 7.583401277347466e-06, + "loss": 3.2203, + "step": 28465 + }, + { + "epoch": 1.934366082348145, + "grad_norm": 6.203047752380371, + "learning_rate": 7.582976627259139e-06, + "loss": 2.9705, + "step": 28470 + }, + { + "epoch": 1.9347058024188069, + "grad_norm": 8.135429382324219, + "learning_rate": 7.5825519771708125e-06, + "loss": 3.4235, + "step": 28475 + }, + { + "epoch": 1.9350455224894687, + "grad_norm": 5.467385292053223, + "learning_rate": 7.582127327082484e-06, + "loss": 3.245, + "step": 28480 + }, + { + "epoch": 1.9353852425601303, + "grad_norm": 7.326074123382568, + "learning_rate": 7.581702676994157e-06, + "loss": 3.1977, + "step": 28485 + }, + { + "epoch": 1.9357249626307922, + "grad_norm": 5.4896039962768555, + "learning_rate": 7.581278026905831e-06, + "loss": 2.8832, + "step": 28490 + }, + { + "epoch": 1.936064682701454, + "grad_norm": 7.934334754943848, + "learning_rate": 7.580853376817503e-06, + "loss": 3.2059, + "step": 28495 + }, + { + "epoch": 1.9364044027721157, + "grad_norm": 7.510618209838867, + "learning_rate": 7.580428726729176e-06, + "loss": 3.2182, + "step": 28500 + }, + { + "epoch": 1.9367441228427775, + "grad_norm": 5.488558292388916, + "learning_rate": 7.580004076640849e-06, + "loss": 3.3754, + "step": 28505 + }, + { + "epoch": 1.9370838429134394, + "grad_norm": 5.9312520027160645, + "learning_rate": 7.579579426552521e-06, + "loss": 3.1236, + "step": 28510 + }, + { + "epoch": 1.937423562984101, + "grad_norm": 6.8245463371276855, + "learning_rate": 7.579154776464194e-06, + "loss": 3.1155, + "step": 28515 + }, + { + "epoch": 1.9377632830547629, + "grad_norm": 7.024289608001709, + "learning_rate": 7.578730126375868e-06, + "loss": 3.1435, + "step": 28520 + }, + { + "epoch": 1.9381030031254247, + "grad_norm": 6.3831400871276855, + "learning_rate": 7.57830547628754e-06, + "loss": 3.1352, + "step": 28525 + }, + { + "epoch": 1.9384427231960863, + "grad_norm": 5.7817230224609375, + "learning_rate": 7.5778808261992124e-06, + "loss": 3.1642, + "step": 28530 + }, + { + "epoch": 1.9387824432667482, + "grad_norm": 5.707894802093506, + "learning_rate": 7.577456176110885e-06, + "loss": 3.1214, + "step": 28535 + }, + { + "epoch": 1.93912216333741, + "grad_norm": 6.983304977416992, + "learning_rate": 7.577031526022558e-06, + "loss": 3.1835, + "step": 28540 + }, + { + "epoch": 1.9394618834080717, + "grad_norm": 7.866306304931641, + "learning_rate": 7.57660687593423e-06, + "loss": 3.221, + "step": 28545 + }, + { + "epoch": 1.9398016034787335, + "grad_norm": 6.330482006072998, + "learning_rate": 7.576182225845904e-06, + "loss": 3.2072, + "step": 28550 + }, + { + "epoch": 1.9401413235493954, + "grad_norm": 6.932738780975342, + "learning_rate": 7.5757575757575764e-06, + "loss": 3.1332, + "step": 28555 + }, + { + "epoch": 1.940481043620057, + "grad_norm": 6.7887163162231445, + "learning_rate": 7.575332925669248e-06, + "loss": 3.3218, + "step": 28560 + }, + { + "epoch": 1.9408207636907189, + "grad_norm": 5.279545783996582, + "learning_rate": 7.574908275580922e-06, + "loss": 3.0678, + "step": 28565 + }, + { + "epoch": 1.9411604837613807, + "grad_norm": 7.8651909828186035, + "learning_rate": 7.574483625492595e-06, + "loss": 3.2273, + "step": 28570 + }, + { + "epoch": 1.9415002038320424, + "grad_norm": 7.205106258392334, + "learning_rate": 7.574058975404267e-06, + "loss": 2.8767, + "step": 28575 + }, + { + "epoch": 1.9418399239027042, + "grad_norm": 7.541749477386475, + "learning_rate": 7.5736343253159404e-06, + "loss": 3.2887, + "step": 28580 + }, + { + "epoch": 1.942179643973366, + "grad_norm": 5.9531660079956055, + "learning_rate": 7.573209675227613e-06, + "loss": 3.4302, + "step": 28585 + }, + { + "epoch": 1.9425193640440277, + "grad_norm": 6.837569236755371, + "learning_rate": 7.572785025139285e-06, + "loss": 3.2058, + "step": 28590 + }, + { + "epoch": 1.9428590841146895, + "grad_norm": 6.47291374206543, + "learning_rate": 7.572360375050959e-06, + "loss": 3.1348, + "step": 28595 + }, + { + "epoch": 1.9431988041853514, + "grad_norm": 7.987802982330322, + "learning_rate": 7.571935724962632e-06, + "loss": 2.8798, + "step": 28600 + }, + { + "epoch": 1.943538524256013, + "grad_norm": 7.428594589233398, + "learning_rate": 7.571511074874304e-06, + "loss": 3.0978, + "step": 28605 + }, + { + "epoch": 1.9438782443266747, + "grad_norm": 7.703134536743164, + "learning_rate": 7.571086424785977e-06, + "loss": 3.1135, + "step": 28610 + }, + { + "epoch": 1.9442179643973367, + "grad_norm": 6.536875247955322, + "learning_rate": 7.57066177469765e-06, + "loss": 3.1212, + "step": 28615 + }, + { + "epoch": 1.9445576844679984, + "grad_norm": 7.545880317687988, + "learning_rate": 7.570237124609322e-06, + "loss": 2.7628, + "step": 28620 + }, + { + "epoch": 1.94489740453866, + "grad_norm": 7.531265735626221, + "learning_rate": 7.569812474520996e-06, + "loss": 3.1634, + "step": 28625 + }, + { + "epoch": 1.945237124609322, + "grad_norm": 6.692963600158691, + "learning_rate": 7.569387824432668e-06, + "loss": 3.0932, + "step": 28630 + }, + { + "epoch": 1.9455768446799837, + "grad_norm": 6.733750820159912, + "learning_rate": 7.56896317434434e-06, + "loss": 3.0506, + "step": 28635 + }, + { + "epoch": 1.9459165647506453, + "grad_norm": 6.715493679046631, + "learning_rate": 7.568538524256014e-06, + "loss": 3.077, + "step": 28640 + }, + { + "epoch": 1.9462562848213072, + "grad_norm": 6.880084991455078, + "learning_rate": 7.568113874167686e-06, + "loss": 3.2405, + "step": 28645 + }, + { + "epoch": 1.946596004891969, + "grad_norm": 5.935616493225098, + "learning_rate": 7.567689224079359e-06, + "loss": 2.7933, + "step": 28650 + }, + { + "epoch": 1.9469357249626307, + "grad_norm": 7.557309150695801, + "learning_rate": 7.5672645739910324e-06, + "loss": 3.3637, + "step": 28655 + }, + { + "epoch": 1.9472754450332925, + "grad_norm": 7.8401198387146, + "learning_rate": 7.566839923902704e-06, + "loss": 3.4079, + "step": 28660 + }, + { + "epoch": 1.9476151651039544, + "grad_norm": 7.392384052276611, + "learning_rate": 7.566415273814378e-06, + "loss": 2.9695, + "step": 28665 + }, + { + "epoch": 1.947954885174616, + "grad_norm": 8.747076034545898, + "learning_rate": 7.565990623726051e-06, + "loss": 3.1245, + "step": 28670 + }, + { + "epoch": 1.9482946052452779, + "grad_norm": 7.472886562347412, + "learning_rate": 7.565565973637723e-06, + "loss": 2.8996, + "step": 28675 + }, + { + "epoch": 1.9486343253159397, + "grad_norm": 6.579679012298584, + "learning_rate": 7.5651413235493964e-06, + "loss": 3.3358, + "step": 28680 + }, + { + "epoch": 1.9489740453866014, + "grad_norm": 7.477827548980713, + "learning_rate": 7.564716673461069e-06, + "loss": 3.3322, + "step": 28685 + }, + { + "epoch": 1.9493137654572632, + "grad_norm": 6.577177047729492, + "learning_rate": 7.564292023372741e-06, + "loss": 3.2255, + "step": 28690 + }, + { + "epoch": 1.949653485527925, + "grad_norm": 6.338163375854492, + "learning_rate": 7.563867373284415e-06, + "loss": 3.1961, + "step": 28695 + }, + { + "epoch": 1.9499932055985867, + "grad_norm": 7.161726474761963, + "learning_rate": 7.563442723196087e-06, + "loss": 3.134, + "step": 28700 + }, + { + "epoch": 1.9503329256692485, + "grad_norm": 6.378720760345459, + "learning_rate": 7.56301807310776e-06, + "loss": 3.0723, + "step": 28705 + }, + { + "epoch": 1.9506726457399104, + "grad_norm": 8.69613265991211, + "learning_rate": 7.562593423019433e-06, + "loss": 3.3641, + "step": 28710 + }, + { + "epoch": 1.951012365810572, + "grad_norm": 6.836185932159424, + "learning_rate": 7.562168772931105e-06, + "loss": 3.258, + "step": 28715 + }, + { + "epoch": 1.9513520858812339, + "grad_norm": 6.122758865356445, + "learning_rate": 7.561744122842778e-06, + "loss": 3.2519, + "step": 28720 + }, + { + "epoch": 1.9516918059518957, + "grad_norm": 6.7040557861328125, + "learning_rate": 7.561319472754452e-06, + "loss": 3.2111, + "step": 28725 + }, + { + "epoch": 1.9520315260225574, + "grad_norm": 7.998995304107666, + "learning_rate": 7.560894822666124e-06, + "loss": 3.1874, + "step": 28730 + }, + { + "epoch": 1.9523712460932192, + "grad_norm": 6.831028461456299, + "learning_rate": 7.560470172577796e-06, + "loss": 3.3748, + "step": 28735 + }, + { + "epoch": 1.952710966163881, + "grad_norm": 6.644139289855957, + "learning_rate": 7.56004552248947e-06, + "loss": 3.2396, + "step": 28740 + }, + { + "epoch": 1.9530506862345427, + "grad_norm": 7.05450963973999, + "learning_rate": 7.559620872401142e-06, + "loss": 3.3664, + "step": 28745 + }, + { + "epoch": 1.9533904063052046, + "grad_norm": 5.96948766708374, + "learning_rate": 7.559196222312815e-06, + "loss": 3.0805, + "step": 28750 + }, + { + "epoch": 1.9537301263758664, + "grad_norm": 7.065918922424316, + "learning_rate": 7.5587715722244885e-06, + "loss": 3.2996, + "step": 28755 + }, + { + "epoch": 1.954069846446528, + "grad_norm": 6.459409236907959, + "learning_rate": 7.55834692213616e-06, + "loss": 3.2494, + "step": 28760 + }, + { + "epoch": 1.95440956651719, + "grad_norm": 9.157086372375488, + "learning_rate": 7.557922272047833e-06, + "loss": 3.2699, + "step": 28765 + }, + { + "epoch": 1.9547492865878517, + "grad_norm": 7.716764450073242, + "learning_rate": 7.557497621959506e-06, + "loss": 3.1836, + "step": 28770 + }, + { + "epoch": 1.9550890066585134, + "grad_norm": 7.251779079437256, + "learning_rate": 7.557072971871179e-06, + "loss": 3.0616, + "step": 28775 + }, + { + "epoch": 1.955428726729175, + "grad_norm": 6.105279445648193, + "learning_rate": 7.556648321782852e-06, + "loss": 3.3685, + "step": 28780 + }, + { + "epoch": 1.955768446799837, + "grad_norm": 8.191567420959473, + "learning_rate": 7.556223671694524e-06, + "loss": 3.197, + "step": 28785 + }, + { + "epoch": 1.9561081668704987, + "grad_norm": 6.237909317016602, + "learning_rate": 7.555799021606197e-06, + "loss": 3.3368, + "step": 28790 + }, + { + "epoch": 1.9564478869411603, + "grad_norm": 7.15216064453125, + "learning_rate": 7.555374371517869e-06, + "loss": 3.1085, + "step": 28795 + }, + { + "epoch": 1.9567876070118224, + "grad_norm": 5.357245922088623, + "learning_rate": 7.554949721429543e-06, + "loss": 3.0735, + "step": 28800 + }, + { + "epoch": 1.957127327082484, + "grad_norm": 6.154484272003174, + "learning_rate": 7.554525071341216e-06, + "loss": 2.8704, + "step": 28805 + }, + { + "epoch": 1.9574670471531457, + "grad_norm": 6.632323265075684, + "learning_rate": 7.5541004212528876e-06, + "loss": 3.1473, + "step": 28810 + }, + { + "epoch": 1.9578067672238075, + "grad_norm": 7.891892910003662, + "learning_rate": 7.553675771164561e-06, + "loss": 3.0847, + "step": 28815 + }, + { + "epoch": 1.9581464872944694, + "grad_norm": 6.067958831787109, + "learning_rate": 7.553251121076234e-06, + "loss": 3.1978, + "step": 28820 + }, + { + "epoch": 1.958486207365131, + "grad_norm": 6.146535396575928, + "learning_rate": 7.552826470987906e-06, + "loss": 3.1118, + "step": 28825 + }, + { + "epoch": 1.9588259274357929, + "grad_norm": 7.857239246368408, + "learning_rate": 7.55240182089958e-06, + "loss": 3.0694, + "step": 28830 + }, + { + "epoch": 1.9591656475064547, + "grad_norm": 8.16315746307373, + "learning_rate": 7.5519771708112524e-06, + "loss": 3.0289, + "step": 28835 + }, + { + "epoch": 1.9595053675771164, + "grad_norm": 6.1479997634887695, + "learning_rate": 7.551552520722924e-06, + "loss": 3.3008, + "step": 28840 + }, + { + "epoch": 1.9598450876477782, + "grad_norm": 7.021384239196777, + "learning_rate": 7.551127870634598e-06, + "loss": 3.2379, + "step": 28845 + }, + { + "epoch": 1.96018480771844, + "grad_norm": 9.598062515258789, + "learning_rate": 7.550703220546271e-06, + "loss": 3.2451, + "step": 28850 + }, + { + "epoch": 1.9605245277891017, + "grad_norm": 5.507959842681885, + "learning_rate": 7.550278570457943e-06, + "loss": 3.1485, + "step": 28855 + }, + { + "epoch": 1.9608642478597635, + "grad_norm": 7.554663181304932, + "learning_rate": 7.5498539203696164e-06, + "loss": 3.3122, + "step": 28860 + }, + { + "epoch": 1.9612039679304254, + "grad_norm": 5.585144519805908, + "learning_rate": 7.549429270281288e-06, + "loss": 3.188, + "step": 28865 + }, + { + "epoch": 1.961543688001087, + "grad_norm": 7.442901611328125, + "learning_rate": 7.549004620192961e-06, + "loss": 3.1924, + "step": 28870 + }, + { + "epoch": 1.9618834080717489, + "grad_norm": 6.4908881187438965, + "learning_rate": 7.548579970104635e-06, + "loss": 3.3312, + "step": 28875 + }, + { + "epoch": 1.9622231281424107, + "grad_norm": 5.485721111297607, + "learning_rate": 7.548155320016307e-06, + "loss": 3.1991, + "step": 28880 + }, + { + "epoch": 1.9625628482130724, + "grad_norm": 6.803783416748047, + "learning_rate": 7.54773066992798e-06, + "loss": 2.9537, + "step": 28885 + }, + { + "epoch": 1.9629025682837342, + "grad_norm": 5.9296064376831055, + "learning_rate": 7.547306019839653e-06, + "loss": 3.0825, + "step": 28890 + }, + { + "epoch": 1.963242288354396, + "grad_norm": 8.082944869995117, + "learning_rate": 7.546881369751325e-06, + "loss": 3.067, + "step": 28895 + }, + { + "epoch": 1.9635820084250577, + "grad_norm": 6.995436191558838, + "learning_rate": 7.546456719662998e-06, + "loss": 3.0768, + "step": 28900 + }, + { + "epoch": 1.9639217284957196, + "grad_norm": 9.388365745544434, + "learning_rate": 7.546032069574672e-06, + "loss": 3.4601, + "step": 28905 + }, + { + "epoch": 1.9642614485663814, + "grad_norm": 6.891729831695557, + "learning_rate": 7.545607419486344e-06, + "loss": 2.9963, + "step": 28910 + }, + { + "epoch": 1.964601168637043, + "grad_norm": 6.099961757659912, + "learning_rate": 7.545182769398016e-06, + "loss": 3.0368, + "step": 28915 + }, + { + "epoch": 1.964940888707705, + "grad_norm": 5.850136756896973, + "learning_rate": 7.54475811930969e-06, + "loss": 3.3909, + "step": 28920 + }, + { + "epoch": 1.9652806087783667, + "grad_norm": 6.41069221496582, + "learning_rate": 7.544333469221362e-06, + "loss": 3.0949, + "step": 28925 + }, + { + "epoch": 1.9656203288490284, + "grad_norm": 7.368398666381836, + "learning_rate": 7.543908819133035e-06, + "loss": 3.1346, + "step": 28930 + }, + { + "epoch": 1.9659600489196902, + "grad_norm": 6.532386302947998, + "learning_rate": 7.5434841690447084e-06, + "loss": 3.0794, + "step": 28935 + }, + { + "epoch": 1.966299768990352, + "grad_norm": 6.029325008392334, + "learning_rate": 7.54305951895638e-06, + "loss": 3.1249, + "step": 28940 + }, + { + "epoch": 1.9666394890610137, + "grad_norm": 6.201851844787598, + "learning_rate": 7.542634868868053e-06, + "loss": 3.3058, + "step": 28945 + }, + { + "epoch": 1.9669792091316753, + "grad_norm": 8.992504119873047, + "learning_rate": 7.542210218779726e-06, + "loss": 3.3926, + "step": 28950 + }, + { + "epoch": 1.9673189292023374, + "grad_norm": 8.742613792419434, + "learning_rate": 7.541785568691399e-06, + "loss": 3.1214, + "step": 28955 + }, + { + "epoch": 1.967658649272999, + "grad_norm": 7.474251747131348, + "learning_rate": 7.541360918603072e-06, + "loss": 2.9602, + "step": 28960 + }, + { + "epoch": 1.9679983693436607, + "grad_norm": 8.077353477478027, + "learning_rate": 7.540936268514744e-06, + "loss": 3.3553, + "step": 28965 + }, + { + "epoch": 1.9683380894143228, + "grad_norm": 8.730528831481934, + "learning_rate": 7.540511618426417e-06, + "loss": 3.2248, + "step": 28970 + }, + { + "epoch": 1.9686778094849844, + "grad_norm": 7.60882043838501, + "learning_rate": 7.540086968338089e-06, + "loss": 2.9573, + "step": 28975 + }, + { + "epoch": 1.969017529555646, + "grad_norm": 6.441590309143066, + "learning_rate": 7.539662318249763e-06, + "loss": 3.0018, + "step": 28980 + }, + { + "epoch": 1.9693572496263079, + "grad_norm": 9.22675609588623, + "learning_rate": 7.539237668161436e-06, + "loss": 2.9589, + "step": 28985 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 6.623274326324463, + "learning_rate": 7.5388130180731076e-06, + "loss": 3.1211, + "step": 28990 + }, + { + "epoch": 1.9700366897676314, + "grad_norm": 5.533134937286377, + "learning_rate": 7.538388367984781e-06, + "loss": 2.9067, + "step": 28995 + }, + { + "epoch": 1.9703764098382932, + "grad_norm": 7.00352668762207, + "learning_rate": 7.537963717896454e-06, + "loss": 3.1814, + "step": 29000 + }, + { + "epoch": 1.970716129908955, + "grad_norm": 6.146524429321289, + "learning_rate": 7.537539067808128e-06, + "loss": 3.1438, + "step": 29005 + }, + { + "epoch": 1.9710558499796167, + "grad_norm": 5.769242763519287, + "learning_rate": 7.5371144177198e-06, + "loss": 3.1119, + "step": 29010 + }, + { + "epoch": 1.9713955700502785, + "grad_norm": 7.663446426391602, + "learning_rate": 7.536689767631472e-06, + "loss": 3.186, + "step": 29015 + }, + { + "epoch": 1.9717352901209404, + "grad_norm": 7.77294921875, + "learning_rate": 7.536265117543145e-06, + "loss": 3.2786, + "step": 29020 + }, + { + "epoch": 1.972075010191602, + "grad_norm": 6.731208801269531, + "learning_rate": 7.535840467454818e-06, + "loss": 3.268, + "step": 29025 + }, + { + "epoch": 1.9724147302622639, + "grad_norm": 8.936989784240723, + "learning_rate": 7.535415817366491e-06, + "loss": 3.2996, + "step": 29030 + }, + { + "epoch": 1.9727544503329257, + "grad_norm": 7.8429436683654785, + "learning_rate": 7.534991167278164e-06, + "loss": 3.2549, + "step": 29035 + }, + { + "epoch": 1.9730941704035874, + "grad_norm": 6.460975170135498, + "learning_rate": 7.534566517189836e-06, + "loss": 3.1413, + "step": 29040 + }, + { + "epoch": 1.9734338904742492, + "grad_norm": 7.081985950469971, + "learning_rate": 7.534141867101508e-06, + "loss": 3.1428, + "step": 29045 + }, + { + "epoch": 1.973773610544911, + "grad_norm": 7.444870948791504, + "learning_rate": 7.533717217013182e-06, + "loss": 3.1809, + "step": 29050 + }, + { + "epoch": 1.9741133306155727, + "grad_norm": 8.650276184082031, + "learning_rate": 7.533292566924855e-06, + "loss": 3.3905, + "step": 29055 + }, + { + "epoch": 1.9744530506862346, + "grad_norm": 6.1067423820495605, + "learning_rate": 7.532867916836527e-06, + "loss": 3.1164, + "step": 29060 + }, + { + "epoch": 1.9747927707568964, + "grad_norm": 6.819446086883545, + "learning_rate": 7.5324432667482e-06, + "loss": 3.3838, + "step": 29065 + }, + { + "epoch": 1.975132490827558, + "grad_norm": 7.278988361358643, + "learning_rate": 7.532018616659873e-06, + "loss": 3.0767, + "step": 29070 + }, + { + "epoch": 1.97547221089822, + "grad_norm": 7.118364334106445, + "learning_rate": 7.531593966571545e-06, + "loss": 3.0843, + "step": 29075 + }, + { + "epoch": 1.9758119309688817, + "grad_norm": 7.655605792999268, + "learning_rate": 7.531169316483219e-06, + "loss": 3.0341, + "step": 29080 + }, + { + "epoch": 1.9761516510395434, + "grad_norm": 6.369137287139893, + "learning_rate": 7.530744666394892e-06, + "loss": 3.2164, + "step": 29085 + }, + { + "epoch": 1.9764913711102052, + "grad_norm": 8.03490924835205, + "learning_rate": 7.5303200163065636e-06, + "loss": 3.2007, + "step": 29090 + }, + { + "epoch": 1.976831091180867, + "grad_norm": 6.70344352722168, + "learning_rate": 7.5299802962359025e-06, + "loss": 3.2746, + "step": 29095 + }, + { + "epoch": 1.9771708112515287, + "grad_norm": 6.191385746002197, + "learning_rate": 7.529555646147575e-06, + "loss": 2.9928, + "step": 29100 + }, + { + "epoch": 1.9775105313221906, + "grad_norm": 5.593034744262695, + "learning_rate": 7.529130996059247e-06, + "loss": 3.3925, + "step": 29105 + }, + { + "epoch": 1.9778502513928524, + "grad_norm": 7.474363327026367, + "learning_rate": 7.528706345970921e-06, + "loss": 2.9312, + "step": 29110 + }, + { + "epoch": 1.978189971463514, + "grad_norm": 5.786718845367432, + "learning_rate": 7.528281695882594e-06, + "loss": 3.1969, + "step": 29115 + }, + { + "epoch": 1.9785296915341757, + "grad_norm": 7.245942115783691, + "learning_rate": 7.527857045794266e-06, + "loss": 3.258, + "step": 29120 + }, + { + "epoch": 1.9788694116048378, + "grad_norm": 6.351879596710205, + "learning_rate": 7.527432395705939e-06, + "loss": 3.353, + "step": 29125 + }, + { + "epoch": 1.9792091316754994, + "grad_norm": 7.404674530029297, + "learning_rate": 7.527007745617611e-06, + "loss": 3.2312, + "step": 29130 + }, + { + "epoch": 1.979548851746161, + "grad_norm": 7.05686616897583, + "learning_rate": 7.526583095529284e-06, + "loss": 3.3019, + "step": 29135 + }, + { + "epoch": 1.979888571816823, + "grad_norm": 8.196260452270508, + "learning_rate": 7.526158445440958e-06, + "loss": 2.9456, + "step": 29140 + }, + { + "epoch": 1.9802282918874847, + "grad_norm": 5.543732643127441, + "learning_rate": 7.52573379535263e-06, + "loss": 3.1686, + "step": 29145 + }, + { + "epoch": 1.9805680119581464, + "grad_norm": 6.871106147766113, + "learning_rate": 7.5253091452643024e-06, + "loss": 3.1904, + "step": 29150 + }, + { + "epoch": 1.9809077320288082, + "grad_norm": 6.415173530578613, + "learning_rate": 7.524884495175976e-06, + "loss": 3.0882, + "step": 29155 + }, + { + "epoch": 1.98124745209947, + "grad_norm": 9.269657135009766, + "learning_rate": 7.524459845087648e-06, + "loss": 3.0961, + "step": 29160 + }, + { + "epoch": 1.9815871721701317, + "grad_norm": 8.24505615234375, + "learning_rate": 7.524035194999321e-06, + "loss": 2.972, + "step": 29165 + }, + { + "epoch": 1.9819268922407935, + "grad_norm": 7.915306568145752, + "learning_rate": 7.5236105449109945e-06, + "loss": 3.2764, + "step": 29170 + }, + { + "epoch": 1.9822666123114554, + "grad_norm": 5.868544578552246, + "learning_rate": 7.5231858948226665e-06, + "loss": 3.4371, + "step": 29175 + }, + { + "epoch": 1.982606332382117, + "grad_norm": 6.031189441680908, + "learning_rate": 7.522761244734339e-06, + "loss": 3.0506, + "step": 29180 + }, + { + "epoch": 1.9829460524527789, + "grad_norm": 5.500970840454102, + "learning_rate": 7.522336594646013e-06, + "loss": 3.2083, + "step": 29185 + }, + { + "epoch": 1.9832857725234407, + "grad_norm": 7.493671417236328, + "learning_rate": 7.521911944557685e-06, + "loss": 3.3041, + "step": 29190 + }, + { + "epoch": 1.9836254925941024, + "grad_norm": 7.520320892333984, + "learning_rate": 7.521487294469358e-06, + "loss": 3.2934, + "step": 29195 + }, + { + "epoch": 1.9839652126647642, + "grad_norm": 7.576314449310303, + "learning_rate": 7.5210626443810305e-06, + "loss": 3.356, + "step": 29200 + }, + { + "epoch": 1.984304932735426, + "grad_norm": 6.989159107208252, + "learning_rate": 7.520637994292703e-06, + "loss": 3.1523, + "step": 29205 + }, + { + "epoch": 1.9846446528060877, + "grad_norm": 6.634716033935547, + "learning_rate": 7.520213344204377e-06, + "loss": 2.9625, + "step": 29210 + }, + { + "epoch": 1.9849843728767496, + "grad_norm": 7.047618865966797, + "learning_rate": 7.519788694116049e-06, + "loss": 3.2777, + "step": 29215 + }, + { + "epoch": 1.9853240929474114, + "grad_norm": 6.736203670501709, + "learning_rate": 7.519364044027722e-06, + "loss": 2.9847, + "step": 29220 + }, + { + "epoch": 1.985663813018073, + "grad_norm": 7.219259738922119, + "learning_rate": 7.518939393939395e-06, + "loss": 3.2242, + "step": 29225 + }, + { + "epoch": 1.986003533088735, + "grad_norm": 7.589790344238281, + "learning_rate": 7.518514743851067e-06, + "loss": 3.2533, + "step": 29230 + }, + { + "epoch": 1.9863432531593967, + "grad_norm": 6.738240718841553, + "learning_rate": 7.51809009376274e-06, + "loss": 3.0636, + "step": 29235 + }, + { + "epoch": 1.9866829732300584, + "grad_norm": 8.749276161193848, + "learning_rate": 7.517665443674414e-06, + "loss": 3.0299, + "step": 29240 + }, + { + "epoch": 1.9870226933007202, + "grad_norm": 8.122437477111816, + "learning_rate": 7.517240793586086e-06, + "loss": 3.2971, + "step": 29245 + }, + { + "epoch": 1.987362413371382, + "grad_norm": 7.035105228424072, + "learning_rate": 7.5168161434977585e-06, + "loss": 3.2154, + "step": 29250 + }, + { + "epoch": 1.9877021334420437, + "grad_norm": 8.144723892211914, + "learning_rate": 7.516391493409432e-06, + "loss": 3.0954, + "step": 29255 + }, + { + "epoch": 1.9880418535127056, + "grad_norm": 6.433614730834961, + "learning_rate": 7.515966843321104e-06, + "loss": 3.3303, + "step": 29260 + }, + { + "epoch": 1.9883815735833674, + "grad_norm": 6.3709397315979, + "learning_rate": 7.515542193232777e-06, + "loss": 3.0112, + "step": 29265 + }, + { + "epoch": 1.988721293654029, + "grad_norm": 6.4391279220581055, + "learning_rate": 7.51511754314445e-06, + "loss": 3.1507, + "step": 29270 + }, + { + "epoch": 1.989061013724691, + "grad_norm": 6.810340404510498, + "learning_rate": 7.5146928930561225e-06, + "loss": 2.9646, + "step": 29275 + }, + { + "epoch": 1.9894007337953528, + "grad_norm": 5.654391288757324, + "learning_rate": 7.514268242967795e-06, + "loss": 3.1398, + "step": 29280 + }, + { + "epoch": 1.9897404538660144, + "grad_norm": 8.340712547302246, + "learning_rate": 7.513843592879468e-06, + "loss": 3.2029, + "step": 29285 + }, + { + "epoch": 1.990080173936676, + "grad_norm": 5.70193338394165, + "learning_rate": 7.513418942791141e-06, + "loss": 3.0553, + "step": 29290 + }, + { + "epoch": 1.990419894007338, + "grad_norm": 5.046416759490967, + "learning_rate": 7.512994292702813e-06, + "loss": 3.0886, + "step": 29295 + }, + { + "epoch": 1.9907596140779997, + "grad_norm": 6.133103370666504, + "learning_rate": 7.5125696426144865e-06, + "loss": 3.3508, + "step": 29300 + }, + { + "epoch": 1.9910993341486614, + "grad_norm": 6.084384441375732, + "learning_rate": 7.512144992526159e-06, + "loss": 2.9696, + "step": 29305 + }, + { + "epoch": 1.9914390542193234, + "grad_norm": 5.29665994644165, + "learning_rate": 7.511720342437831e-06, + "loss": 3.1791, + "step": 29310 + }, + { + "epoch": 1.991778774289985, + "grad_norm": 6.1441144943237305, + "learning_rate": 7.511295692349505e-06, + "loss": 3.0627, + "step": 29315 + }, + { + "epoch": 1.9921184943606467, + "grad_norm": 5.469547748565674, + "learning_rate": 7.510871042261178e-06, + "loss": 3.1302, + "step": 29320 + }, + { + "epoch": 1.9924582144313085, + "grad_norm": 6.792356967926025, + "learning_rate": 7.51044639217285e-06, + "loss": 3.2222, + "step": 29325 + }, + { + "epoch": 1.9927979345019704, + "grad_norm": 6.180647373199463, + "learning_rate": 7.510021742084523e-06, + "loss": 2.9869, + "step": 29330 + }, + { + "epoch": 1.993137654572632, + "grad_norm": 6.823919773101807, + "learning_rate": 7.509597091996196e-06, + "loss": 3.0448, + "step": 29335 + }, + { + "epoch": 1.9934773746432939, + "grad_norm": 6.643738269805908, + "learning_rate": 7.509172441907868e-06, + "loss": 3.3761, + "step": 29340 + }, + { + "epoch": 1.9938170947139557, + "grad_norm": 7.915398120880127, + "learning_rate": 7.508747791819542e-06, + "loss": 2.9109, + "step": 29345 + }, + { + "epoch": 1.9941568147846174, + "grad_norm": 6.583926200866699, + "learning_rate": 7.5083231417312145e-06, + "loss": 3.395, + "step": 29350 + }, + { + "epoch": 1.9944965348552792, + "grad_norm": 6.226190567016602, + "learning_rate": 7.5078984916428864e-06, + "loss": 2.9339, + "step": 29355 + }, + { + "epoch": 1.994836254925941, + "grad_norm": 9.858806610107422, + "learning_rate": 7.50747384155456e-06, + "loss": 3.2882, + "step": 29360 + }, + { + "epoch": 1.9951759749966027, + "grad_norm": 6.744472503662109, + "learning_rate": 7.507049191466232e-06, + "loss": 3.1495, + "step": 29365 + }, + { + "epoch": 1.9955156950672646, + "grad_norm": 7.2967729568481445, + "learning_rate": 7.506624541377905e-06, + "loss": 3.2432, + "step": 29370 + }, + { + "epoch": 1.9958554151379264, + "grad_norm": 7.343134880065918, + "learning_rate": 7.5061998912895785e-06, + "loss": 3.2589, + "step": 29375 + }, + { + "epoch": 1.996195135208588, + "grad_norm": 5.19368314743042, + "learning_rate": 7.5057752412012504e-06, + "loss": 3.2389, + "step": 29380 + }, + { + "epoch": 1.99653485527925, + "grad_norm": 8.641587257385254, + "learning_rate": 7.505350591112923e-06, + "loss": 3.1131, + "step": 29385 + }, + { + "epoch": 1.9968745753499118, + "grad_norm": 7.40465784072876, + "learning_rate": 7.504925941024597e-06, + "loss": 3.0658, + "step": 29390 + }, + { + "epoch": 1.9972142954205734, + "grad_norm": 5.737337112426758, + "learning_rate": 7.504501290936269e-06, + "loss": 3.2485, + "step": 29395 + }, + { + "epoch": 1.9975540154912352, + "grad_norm": 6.8279852867126465, + "learning_rate": 7.504076640847942e-06, + "loss": 3.3336, + "step": 29400 + }, + { + "epoch": 1.997893735561897, + "grad_norm": 6.817395210266113, + "learning_rate": 7.503651990759615e-06, + "loss": 3.2607, + "step": 29405 + }, + { + "epoch": 1.9982334556325587, + "grad_norm": 6.294494152069092, + "learning_rate": 7.503227340671287e-06, + "loss": 3.1566, + "step": 29410 + }, + { + "epoch": 1.9985731757032206, + "grad_norm": 7.521232604980469, + "learning_rate": 7.50280269058296e-06, + "loss": 3.0515, + "step": 29415 + }, + { + "epoch": 1.9989128957738824, + "grad_norm": 7.675815582275391, + "learning_rate": 7.502378040494634e-06, + "loss": 3.3484, + "step": 29420 + }, + { + "epoch": 1.999252615844544, + "grad_norm": 5.781737804412842, + "learning_rate": 7.501953390406306e-06, + "loss": 3.1306, + "step": 29425 + }, + { + "epoch": 1.999592335915206, + "grad_norm": 6.677013397216797, + "learning_rate": 7.5015287403179784e-06, + "loss": 3.2209, + "step": 29430 + }, + { + "epoch": 1.9999320559858678, + "grad_norm": 6.5743560791015625, + "learning_rate": 7.501104090229652e-06, + "loss": 3.2013, + "step": 29435 + }, + { + "epoch": 2.0, + "eval_bertscore": { + "f1": 0.8382569242845338, + "precision": 0.8391379099712895, + "recall": 0.8381095085163451 + }, + "eval_bleu_4": 0.014631782735403826, + "eval_exact_match": 0.0, + "eval_loss": 3.292123556137085, + "eval_meteor": 0.10055500077168252, + "eval_rouge": { + "rouge1": 0.13165979426604135, + "rouge2": 0.016279484783694405, + "rougeL": 0.11230229145465914, + "rougeLsum": 0.11224224705228229 + }, + "eval_runtime": 1046.9921, + "eval_samples_per_second": 9.856, + "eval_steps_per_second": 1.232, + "step": 29436 + }, + { + "epoch": 2.0002717760565294, + "grad_norm": 8.450142860412598, + "learning_rate": 7.500679440141324e-06, + "loss": 3.0491, + "step": 29440 + }, + { + "epoch": 2.000611496127191, + "grad_norm": 5.219364166259766, + "learning_rate": 7.500254790052997e-06, + "loss": 3.0096, + "step": 29445 + }, + { + "epoch": 2.000951216197853, + "grad_norm": 5.632687091827393, + "learning_rate": 7.49983013996467e-06, + "loss": 3.0001, + "step": 29450 + }, + { + "epoch": 2.0012909362685147, + "grad_norm": 6.189481735229492, + "learning_rate": 7.4994054898763424e-06, + "loss": 2.9659, + "step": 29455 + }, + { + "epoch": 2.0016306563391764, + "grad_norm": 8.417391777038574, + "learning_rate": 7.498980839788015e-06, + "loss": 2.9963, + "step": 29460 + }, + { + "epoch": 2.0019703764098384, + "grad_norm": 7.604085445404053, + "learning_rate": 7.498556189699688e-06, + "loss": 2.8894, + "step": 29465 + }, + { + "epoch": 2.0023100964805, + "grad_norm": 8.528348922729492, + "learning_rate": 7.498131539611361e-06, + "loss": 2.886, + "step": 29470 + }, + { + "epoch": 2.0026498165511617, + "grad_norm": 8.181131362915039, + "learning_rate": 7.497706889523033e-06, + "loss": 3.0117, + "step": 29475 + }, + { + "epoch": 2.0029895366218238, + "grad_norm": 7.495603084564209, + "learning_rate": 7.4972822394347065e-06, + "loss": 2.9479, + "step": 29480 + }, + { + "epoch": 2.0033292566924854, + "grad_norm": 6.971821308135986, + "learning_rate": 7.496857589346379e-06, + "loss": 2.939, + "step": 29485 + }, + { + "epoch": 2.003668976763147, + "grad_norm": 6.600963115692139, + "learning_rate": 7.496432939258051e-06, + "loss": 3.1128, + "step": 29490 + }, + { + "epoch": 2.004008696833809, + "grad_norm": 5.8892822265625, + "learning_rate": 7.496008289169725e-06, + "loss": 3.253, + "step": 29495 + }, + { + "epoch": 2.0043484169044707, + "grad_norm": 6.467007637023926, + "learning_rate": 7.495583639081398e-06, + "loss": 2.795, + "step": 29500 + }, + { + "epoch": 2.0046881369751324, + "grad_norm": 8.215682983398438, + "learning_rate": 7.49515898899307e-06, + "loss": 3.1825, + "step": 29505 + }, + { + "epoch": 2.0050278570457944, + "grad_norm": 7.703218936920166, + "learning_rate": 7.494734338904743e-06, + "loss": 3.262, + "step": 29510 + }, + { + "epoch": 2.005367577116456, + "grad_norm": 6.652834892272949, + "learning_rate": 7.494309688816416e-06, + "loss": 3.1635, + "step": 29515 + }, + { + "epoch": 2.0057072971871177, + "grad_norm": 9.828217506408691, + "learning_rate": 7.493885038728088e-06, + "loss": 2.8767, + "step": 29520 + }, + { + "epoch": 2.00604701725778, + "grad_norm": 5.02217435836792, + "learning_rate": 7.493460388639762e-06, + "loss": 3.0991, + "step": 29525 + }, + { + "epoch": 2.0063867373284414, + "grad_norm": 5.264050483703613, + "learning_rate": 7.4930357385514345e-06, + "loss": 3.0821, + "step": 29530 + }, + { + "epoch": 2.006726457399103, + "grad_norm": 7.258960723876953, + "learning_rate": 7.492611088463106e-06, + "loss": 3.2203, + "step": 29535 + }, + { + "epoch": 2.007066177469765, + "grad_norm": 6.166618824005127, + "learning_rate": 7.49218643837478e-06, + "loss": 3.2132, + "step": 29540 + }, + { + "epoch": 2.0074058975404268, + "grad_norm": 7.027062892913818, + "learning_rate": 7.491761788286452e-06, + "loss": 3.3389, + "step": 29545 + }, + { + "epoch": 2.0077456176110884, + "grad_norm": 6.754455089569092, + "learning_rate": 7.491337138198125e-06, + "loss": 2.565, + "step": 29550 + }, + { + "epoch": 2.0080853376817505, + "grad_norm": 6.073787212371826, + "learning_rate": 7.4909124881097985e-06, + "loss": 3.0882, + "step": 29555 + }, + { + "epoch": 2.008425057752412, + "grad_norm": 7.213141441345215, + "learning_rate": 7.49048783802147e-06, + "loss": 2.8386, + "step": 29560 + }, + { + "epoch": 2.0087647778230737, + "grad_norm": 6.262905597686768, + "learning_rate": 7.490063187933144e-06, + "loss": 2.9894, + "step": 29565 + }, + { + "epoch": 2.0091044978937354, + "grad_norm": 7.287519931793213, + "learning_rate": 7.489638537844817e-06, + "loss": 3.1474, + "step": 29570 + }, + { + "epoch": 2.0094442179643974, + "grad_norm": 5.576166152954102, + "learning_rate": 7.489213887756489e-06, + "loss": 3.1351, + "step": 29575 + }, + { + "epoch": 2.009783938035059, + "grad_norm": 6.919936656951904, + "learning_rate": 7.4887892376681625e-06, + "loss": 3.2105, + "step": 29580 + }, + { + "epoch": 2.0101236581057207, + "grad_norm": 5.390073299407959, + "learning_rate": 7.488364587579835e-06, + "loss": 3.0288, + "step": 29585 + }, + { + "epoch": 2.0104633781763828, + "grad_norm": 7.610101699829102, + "learning_rate": 7.487939937491507e-06, + "loss": 3.0433, + "step": 29590 + }, + { + "epoch": 2.0108030982470444, + "grad_norm": 8.703185081481934, + "learning_rate": 7.487515287403181e-06, + "loss": 3.0783, + "step": 29595 + }, + { + "epoch": 2.011142818317706, + "grad_norm": 7.238278388977051, + "learning_rate": 7.487090637314854e-06, + "loss": 2.8742, + "step": 29600 + }, + { + "epoch": 2.011482538388368, + "grad_norm": 7.616098403930664, + "learning_rate": 7.486665987226526e-06, + "loss": 2.925, + "step": 29605 + }, + { + "epoch": 2.0118222584590297, + "grad_norm": 7.056384086608887, + "learning_rate": 7.486241337138199e-06, + "loss": 2.9774, + "step": 29610 + }, + { + "epoch": 2.0121619785296914, + "grad_norm": 7.8794684410095215, + "learning_rate": 7.485816687049871e-06, + "loss": 3.2004, + "step": 29615 + }, + { + "epoch": 2.0125016986003534, + "grad_norm": 7.680569171905518, + "learning_rate": 7.485392036961544e-06, + "loss": 3.0916, + "step": 29620 + }, + { + "epoch": 2.012841418671015, + "grad_norm": 7.410799503326416, + "learning_rate": 7.484967386873218e-06, + "loss": 2.9839, + "step": 29625 + }, + { + "epoch": 2.0131811387416767, + "grad_norm": 7.75437593460083, + "learning_rate": 7.48454273678489e-06, + "loss": 3.1924, + "step": 29630 + }, + { + "epoch": 2.0135208588123388, + "grad_norm": 5.416973114013672, + "learning_rate": 7.4841180866965624e-06, + "loss": 2.9184, + "step": 29635 + }, + { + "epoch": 2.0138605788830004, + "grad_norm": 7.347233295440674, + "learning_rate": 7.483693436608236e-06, + "loss": 2.8158, + "step": 29640 + }, + { + "epoch": 2.014200298953662, + "grad_norm": 5.795236110687256, + "learning_rate": 7.483268786519908e-06, + "loss": 2.9054, + "step": 29645 + }, + { + "epoch": 2.014540019024324, + "grad_norm": 6.09187650680542, + "learning_rate": 7.482844136431581e-06, + "loss": 3.001, + "step": 29650 + }, + { + "epoch": 2.0148797390949857, + "grad_norm": 8.976563453674316, + "learning_rate": 7.4824194863432545e-06, + "loss": 3.288, + "step": 29655 + }, + { + "epoch": 2.0152194591656474, + "grad_norm": 6.21115779876709, + "learning_rate": 7.4819948362549264e-06, + "loss": 3.1285, + "step": 29660 + }, + { + "epoch": 2.0155591792363095, + "grad_norm": 5.720863342285156, + "learning_rate": 7.481570186166599e-06, + "loss": 3.1162, + "step": 29665 + }, + { + "epoch": 2.015898899306971, + "grad_norm": 7.51417875289917, + "learning_rate": 7.481145536078273e-06, + "loss": 3.022, + "step": 29670 + }, + { + "epoch": 2.0162386193776327, + "grad_norm": 7.266471862792969, + "learning_rate": 7.480720885989945e-06, + "loss": 2.7362, + "step": 29675 + }, + { + "epoch": 2.016578339448295, + "grad_norm": 7.4644293785095215, + "learning_rate": 7.480296235901618e-06, + "loss": 3.2305, + "step": 29680 + }, + { + "epoch": 2.0169180595189564, + "grad_norm": 7.264395236968994, + "learning_rate": 7.479871585813291e-06, + "loss": 2.9641, + "step": 29685 + }, + { + "epoch": 2.017257779589618, + "grad_norm": 6.023128509521484, + "learning_rate": 7.479446935724963e-06, + "loss": 3.1835, + "step": 29690 + }, + { + "epoch": 2.01759749966028, + "grad_norm": 5.835675239562988, + "learning_rate": 7.479022285636636e-06, + "loss": 3.2332, + "step": 29695 + }, + { + "epoch": 2.0179372197309418, + "grad_norm": 7.7277140617370605, + "learning_rate": 7.478597635548309e-06, + "loss": 2.9873, + "step": 29700 + }, + { + "epoch": 2.0182769398016034, + "grad_norm": 6.846813678741455, + "learning_rate": 7.478172985459982e-06, + "loss": 2.9636, + "step": 29705 + }, + { + "epoch": 2.0186166598722655, + "grad_norm": 6.023989677429199, + "learning_rate": 7.477748335371654e-06, + "loss": 3.1022, + "step": 29710 + }, + { + "epoch": 2.018956379942927, + "grad_norm": 8.363265037536621, + "learning_rate": 7.477323685283327e-06, + "loss": 3.0341, + "step": 29715 + }, + { + "epoch": 2.0192961000135887, + "grad_norm": 7.067403316497803, + "learning_rate": 7.476899035195e-06, + "loss": 2.83, + "step": 29720 + }, + { + "epoch": 2.0196358200842504, + "grad_norm": 7.378002166748047, + "learning_rate": 7.476474385106672e-06, + "loss": 3.0638, + "step": 29725 + }, + { + "epoch": 2.0199755401549124, + "grad_norm": 7.479710102081299, + "learning_rate": 7.476049735018346e-06, + "loss": 2.7248, + "step": 29730 + }, + { + "epoch": 2.020315260225574, + "grad_norm": 7.488161563873291, + "learning_rate": 7.4756250849300184e-06, + "loss": 2.8964, + "step": 29735 + }, + { + "epoch": 2.0206549802962357, + "grad_norm": 6.493444442749023, + "learning_rate": 7.47520043484169e-06, + "loss": 2.9574, + "step": 29740 + }, + { + "epoch": 2.0209947003668978, + "grad_norm": 9.427895545959473, + "learning_rate": 7.474775784753364e-06, + "loss": 2.8892, + "step": 29745 + }, + { + "epoch": 2.0213344204375594, + "grad_norm": 6.237044334411621, + "learning_rate": 7.474351134665037e-06, + "loss": 3.0175, + "step": 29750 + }, + { + "epoch": 2.021674140508221, + "grad_norm": 5.877905368804932, + "learning_rate": 7.473926484576709e-06, + "loss": 3.0941, + "step": 29755 + }, + { + "epoch": 2.022013860578883, + "grad_norm": 5.924066066741943, + "learning_rate": 7.4735018344883824e-06, + "loss": 3.3298, + "step": 29760 + }, + { + "epoch": 2.0223535806495447, + "grad_norm": 6.650903701782227, + "learning_rate": 7.473077184400055e-06, + "loss": 3.0742, + "step": 29765 + }, + { + "epoch": 2.0226933007202064, + "grad_norm": 7.455063819885254, + "learning_rate": 7.472652534311727e-06, + "loss": 2.8909, + "step": 29770 + }, + { + "epoch": 2.0230330207908684, + "grad_norm": 6.122622489929199, + "learning_rate": 7.472227884223401e-06, + "loss": 3.0318, + "step": 29775 + }, + { + "epoch": 2.02337274086153, + "grad_norm": 7.694584369659424, + "learning_rate": 7.471803234135074e-06, + "loss": 3.1518, + "step": 29780 + }, + { + "epoch": 2.0237124609321917, + "grad_norm": 6.616119861602783, + "learning_rate": 7.471378584046746e-06, + "loss": 3.0072, + "step": 29785 + }, + { + "epoch": 2.024052181002854, + "grad_norm": 6.107890605926514, + "learning_rate": 7.470953933958419e-06, + "loss": 3.1399, + "step": 29790 + }, + { + "epoch": 2.0243919010735154, + "grad_norm": 8.066967010498047, + "learning_rate": 7.470529283870091e-06, + "loss": 2.9776, + "step": 29795 + }, + { + "epoch": 2.024731621144177, + "grad_norm": 6.012620449066162, + "learning_rate": 7.470104633781764e-06, + "loss": 3.3908, + "step": 29800 + }, + { + "epoch": 2.025071341214839, + "grad_norm": 7.402095317840576, + "learning_rate": 7.469679983693438e-06, + "loss": 3.1693, + "step": 29805 + }, + { + "epoch": 2.0254110612855007, + "grad_norm": 6.834306240081787, + "learning_rate": 7.46925533360511e-06, + "loss": 3.2106, + "step": 29810 + }, + { + "epoch": 2.0257507813561624, + "grad_norm": 7.2130513191223145, + "learning_rate": 7.468830683516782e-06, + "loss": 3.0587, + "step": 29815 + }, + { + "epoch": 2.0260905014268245, + "grad_norm": 4.991905689239502, + "learning_rate": 7.468406033428456e-06, + "loss": 2.9147, + "step": 29820 + }, + { + "epoch": 2.026430221497486, + "grad_norm": 7.001828193664551, + "learning_rate": 7.467981383340128e-06, + "loss": 3.0174, + "step": 29825 + }, + { + "epoch": 2.0267699415681477, + "grad_norm": 6.739830493927002, + "learning_rate": 7.467556733251801e-06, + "loss": 2.9046, + "step": 29830 + }, + { + "epoch": 2.02710966163881, + "grad_norm": 6.853739261627197, + "learning_rate": 7.4671320831634745e-06, + "loss": 2.8985, + "step": 29835 + }, + { + "epoch": 2.0274493817094714, + "grad_norm": 9.365882873535156, + "learning_rate": 7.466707433075146e-06, + "loss": 3.276, + "step": 29840 + }, + { + "epoch": 2.027789101780133, + "grad_norm": 6.7617621421813965, + "learning_rate": 7.466282782986819e-06, + "loss": 2.9002, + "step": 29845 + }, + { + "epoch": 2.028128821850795, + "grad_norm": 6.286224842071533, + "learning_rate": 7.465858132898493e-06, + "loss": 3.0431, + "step": 29850 + }, + { + "epoch": 2.0284685419214568, + "grad_norm": 7.337525367736816, + "learning_rate": 7.465433482810165e-06, + "loss": 3.203, + "step": 29855 + }, + { + "epoch": 2.0288082619921184, + "grad_norm": 5.40701150894165, + "learning_rate": 7.465008832721838e-06, + "loss": 3.0212, + "step": 29860 + }, + { + "epoch": 2.0291479820627805, + "grad_norm": 9.744670867919922, + "learning_rate": 7.46458418263351e-06, + "loss": 3.2006, + "step": 29865 + }, + { + "epoch": 2.029487702133442, + "grad_norm": 8.989707946777344, + "learning_rate": 7.464159532545183e-06, + "loss": 3.1349, + "step": 29870 + }, + { + "epoch": 2.0298274222041037, + "grad_norm": 6.188287258148193, + "learning_rate": 7.463734882456856e-06, + "loss": 3.2556, + "step": 29875 + }, + { + "epoch": 2.030167142274766, + "grad_norm": 6.897579669952393, + "learning_rate": 7.463310232368529e-06, + "loss": 3.0572, + "step": 29880 + }, + { + "epoch": 2.0305068623454274, + "grad_norm": 8.191865921020508, + "learning_rate": 7.462885582280202e-06, + "loss": 3.0952, + "step": 29885 + }, + { + "epoch": 2.030846582416089, + "grad_norm": 6.72647762298584, + "learning_rate": 7.4624609321918736e-06, + "loss": 3.0426, + "step": 29890 + }, + { + "epoch": 2.031186302486751, + "grad_norm": 9.360942840576172, + "learning_rate": 7.462036282103547e-06, + "loss": 2.7674, + "step": 29895 + }, + { + "epoch": 2.0315260225574128, + "grad_norm": 8.129820823669434, + "learning_rate": 7.46161163201522e-06, + "loss": 2.7752, + "step": 29900 + }, + { + "epoch": 2.0318657426280744, + "grad_norm": 6.541640281677246, + "learning_rate": 7.461186981926894e-06, + "loss": 3.2728, + "step": 29905 + }, + { + "epoch": 2.032205462698736, + "grad_norm": 5.560698986053467, + "learning_rate": 7.460762331838566e-06, + "loss": 3.168, + "step": 29910 + }, + { + "epoch": 2.032545182769398, + "grad_norm": 7.554078578948975, + "learning_rate": 7.4603376817502384e-06, + "loss": 3.0687, + "step": 29915 + }, + { + "epoch": 2.0328849028400597, + "grad_norm": 7.035473346710205, + "learning_rate": 7.459913031661912e-06, + "loss": 2.9659, + "step": 29920 + }, + { + "epoch": 2.0332246229107214, + "grad_norm": 7.056912422180176, + "learning_rate": 7.459488381573584e-06, + "loss": 3.0638, + "step": 29925 + }, + { + "epoch": 2.0335643429813834, + "grad_norm": 6.1208600997924805, + "learning_rate": 7.459063731485257e-06, + "loss": 3.0607, + "step": 29930 + }, + { + "epoch": 2.033904063052045, + "grad_norm": 8.168315887451172, + "learning_rate": 7.45863908139693e-06, + "loss": 2.8572, + "step": 29935 + }, + { + "epoch": 2.0342437831227067, + "grad_norm": 6.63469934463501, + "learning_rate": 7.4582144313086024e-06, + "loss": 3.1913, + "step": 29940 + }, + { + "epoch": 2.034583503193369, + "grad_norm": 7.6903157234191895, + "learning_rate": 7.457789781220275e-06, + "loss": 2.9648, + "step": 29945 + }, + { + "epoch": 2.0349232232640304, + "grad_norm": 5.461888313293457, + "learning_rate": 7.457365131131948e-06, + "loss": 2.9378, + "step": 29950 + }, + { + "epoch": 2.035262943334692, + "grad_norm": 8.166374206542969, + "learning_rate": 7.456940481043621e-06, + "loss": 3.1162, + "step": 29955 + }, + { + "epoch": 2.035602663405354, + "grad_norm": 6.038872241973877, + "learning_rate": 7.456515830955293e-06, + "loss": 2.895, + "step": 29960 + }, + { + "epoch": 2.0359423834760157, + "grad_norm": 6.7882771492004395, + "learning_rate": 7.4560911808669664e-06, + "loss": 2.9106, + "step": 29965 + }, + { + "epoch": 2.0362821035466774, + "grad_norm": 9.037242889404297, + "learning_rate": 7.455666530778639e-06, + "loss": 3.1427, + "step": 29970 + }, + { + "epoch": 2.0366218236173395, + "grad_norm": 6.5600972175598145, + "learning_rate": 7.455241880690311e-06, + "loss": 3.1065, + "step": 29975 + }, + { + "epoch": 2.036961543688001, + "grad_norm": 8.417868614196777, + "learning_rate": 7.454817230601985e-06, + "loss": 2.9598, + "step": 29980 + }, + { + "epoch": 2.0373012637586627, + "grad_norm": 7.092510223388672, + "learning_rate": 7.454392580513658e-06, + "loss": 2.9608, + "step": 29985 + }, + { + "epoch": 2.037640983829325, + "grad_norm": 7.125059604644775, + "learning_rate": 7.45396793042533e-06, + "loss": 3.2575, + "step": 29990 + }, + { + "epoch": 2.0379807038999864, + "grad_norm": 5.78247594833374, + "learning_rate": 7.453543280337003e-06, + "loss": 3.2491, + "step": 29995 + }, + { + "epoch": 2.038320423970648, + "grad_norm": 6.418194770812988, + "learning_rate": 7.453118630248676e-06, + "loss": 3.1447, + "step": 30000 + }, + { + "epoch": 2.03866014404131, + "grad_norm": 6.741938591003418, + "learning_rate": 7.452693980160348e-06, + "loss": 3.1398, + "step": 30005 + }, + { + "epoch": 2.0389998641119718, + "grad_norm": 6.955453872680664, + "learning_rate": 7.452269330072022e-06, + "loss": 3.2805, + "step": 30010 + }, + { + "epoch": 2.0393395841826334, + "grad_norm": 6.5794901847839355, + "learning_rate": 7.4518446799836944e-06, + "loss": 3.3061, + "step": 30015 + }, + { + "epoch": 2.0396793042532955, + "grad_norm": 6.016541481018066, + "learning_rate": 7.451420029895366e-06, + "loss": 3.0898, + "step": 30020 + }, + { + "epoch": 2.040019024323957, + "grad_norm": 7.134653091430664, + "learning_rate": 7.45099537980704e-06, + "loss": 3.1013, + "step": 30025 + }, + { + "epoch": 2.0403587443946187, + "grad_norm": 6.025000095367432, + "learning_rate": 7.450570729718713e-06, + "loss": 3.105, + "step": 30030 + }, + { + "epoch": 2.040698464465281, + "grad_norm": 7.459017276763916, + "learning_rate": 7.450146079630385e-06, + "loss": 3.1778, + "step": 30035 + }, + { + "epoch": 2.0410381845359424, + "grad_norm": 7.374375820159912, + "learning_rate": 7.4497214295420584e-06, + "loss": 2.9553, + "step": 30040 + }, + { + "epoch": 2.041377904606604, + "grad_norm": 7.713390827178955, + "learning_rate": 7.44929677945373e-06, + "loss": 3.0649, + "step": 30045 + }, + { + "epoch": 2.041717624677266, + "grad_norm": 7.3303985595703125, + "learning_rate": 7.448872129365403e-06, + "loss": 2.9931, + "step": 30050 + }, + { + "epoch": 2.0420573447479278, + "grad_norm": 6.847064018249512, + "learning_rate": 7.448447479277077e-06, + "loss": 2.8237, + "step": 30055 + }, + { + "epoch": 2.0423970648185894, + "grad_norm": 8.249898910522461, + "learning_rate": 7.448022829188749e-06, + "loss": 3.0723, + "step": 30060 + }, + { + "epoch": 2.042736784889251, + "grad_norm": 7.342833995819092, + "learning_rate": 7.447598179100422e-06, + "loss": 2.9677, + "step": 30065 + }, + { + "epoch": 2.043076504959913, + "grad_norm": 5.5368242263793945, + "learning_rate": 7.447173529012095e-06, + "loss": 3.287, + "step": 30070 + }, + { + "epoch": 2.0434162250305747, + "grad_norm": 6.821015357971191, + "learning_rate": 7.446748878923767e-06, + "loss": 2.8947, + "step": 30075 + }, + { + "epoch": 2.0437559451012364, + "grad_norm": 6.405946731567383, + "learning_rate": 7.44632422883544e-06, + "loss": 2.9436, + "step": 30080 + }, + { + "epoch": 2.0440956651718984, + "grad_norm": 7.15674352645874, + "learning_rate": 7.445899578747114e-06, + "loss": 3.1232, + "step": 30085 + }, + { + "epoch": 2.04443538524256, + "grad_norm": 5.087882995605469, + "learning_rate": 7.445474928658786e-06, + "loss": 3.011, + "step": 30090 + }, + { + "epoch": 2.0447751053132217, + "grad_norm": 6.390735626220703, + "learning_rate": 7.445050278570458e-06, + "loss": 2.8885, + "step": 30095 + }, + { + "epoch": 2.045114825383884, + "grad_norm": 7.8188300132751465, + "learning_rate": 7.444625628482132e-06, + "loss": 3.0064, + "step": 30100 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 6.768884181976318, + "learning_rate": 7.444200978393804e-06, + "loss": 3.035, + "step": 30105 + }, + { + "epoch": 2.045794265525207, + "grad_norm": 7.395243167877197, + "learning_rate": 7.443776328305477e-06, + "loss": 2.9977, + "step": 30110 + }, + { + "epoch": 2.046133985595869, + "grad_norm": 5.843698978424072, + "learning_rate": 7.44335167821715e-06, + "loss": 3.2184, + "step": 30115 + }, + { + "epoch": 2.0464737056665308, + "grad_norm": 6.617861747741699, + "learning_rate": 7.442927028128822e-06, + "loss": 3.0227, + "step": 30120 + }, + { + "epoch": 2.0468134257371924, + "grad_norm": 7.065215587615967, + "learning_rate": 7.442502378040495e-06, + "loss": 3.0618, + "step": 30125 + }, + { + "epoch": 2.0471531458078545, + "grad_norm": 6.8107123374938965, + "learning_rate": 7.442077727952168e-06, + "loss": 3.0129, + "step": 30130 + }, + { + "epoch": 2.047492865878516, + "grad_norm": 6.821507453918457, + "learning_rate": 7.441653077863841e-06, + "loss": 2.8459, + "step": 30135 + }, + { + "epoch": 2.0478325859491777, + "grad_norm": 6.291569709777832, + "learning_rate": 7.441228427775513e-06, + "loss": 3.1738, + "step": 30140 + }, + { + "epoch": 2.04817230601984, + "grad_norm": 6.794885635375977, + "learning_rate": 7.440803777687186e-06, + "loss": 3.1163, + "step": 30145 + }, + { + "epoch": 2.0485120260905014, + "grad_norm": 7.99683952331543, + "learning_rate": 7.440379127598859e-06, + "loss": 2.9297, + "step": 30150 + }, + { + "epoch": 2.048851746161163, + "grad_norm": 6.024868488311768, + "learning_rate": 7.439954477510531e-06, + "loss": 2.9329, + "step": 30155 + }, + { + "epoch": 2.049191466231825, + "grad_norm": 6.42983865737915, + "learning_rate": 7.439529827422205e-06, + "loss": 3.0763, + "step": 30160 + }, + { + "epoch": 2.0495311863024868, + "grad_norm": 5.921435832977295, + "learning_rate": 7.439105177333878e-06, + "loss": 2.9761, + "step": 30165 + }, + { + "epoch": 2.0498709063731484, + "grad_norm": 7.290289402008057, + "learning_rate": 7.4386805272455496e-06, + "loss": 2.9928, + "step": 30170 + }, + { + "epoch": 2.0502106264438105, + "grad_norm": 6.97319221496582, + "learning_rate": 7.438255877157223e-06, + "loss": 3.07, + "step": 30175 + }, + { + "epoch": 2.050550346514472, + "grad_norm": 5.623164176940918, + "learning_rate": 7.437831227068896e-06, + "loss": 3.16, + "step": 30180 + }, + { + "epoch": 2.0508900665851337, + "grad_norm": 9.032249450683594, + "learning_rate": 7.437406576980568e-06, + "loss": 2.8318, + "step": 30185 + }, + { + "epoch": 2.051229786655796, + "grad_norm": 7.072918891906738, + "learning_rate": 7.436981926892242e-06, + "loss": 2.9303, + "step": 30190 + }, + { + "epoch": 2.0515695067264574, + "grad_norm": 7.316866397857666, + "learning_rate": 7.436557276803914e-06, + "loss": 2.9227, + "step": 30195 + }, + { + "epoch": 2.051909226797119, + "grad_norm": 7.234928131103516, + "learning_rate": 7.436132626715586e-06, + "loss": 3.2818, + "step": 30200 + }, + { + "epoch": 2.052248946867781, + "grad_norm": 5.170249938964844, + "learning_rate": 7.43570797662726e-06, + "loss": 3.0296, + "step": 30205 + }, + { + "epoch": 2.0525886669384428, + "grad_norm": 11.29124927520752, + "learning_rate": 7.435283326538932e-06, + "loss": 3.101, + "step": 30210 + }, + { + "epoch": 2.0529283870091044, + "grad_norm": 7.538742542266846, + "learning_rate": 7.434858676450605e-06, + "loss": 3.0686, + "step": 30215 + }, + { + "epoch": 2.0532681070797665, + "grad_norm": 5.739257335662842, + "learning_rate": 7.4344340263622784e-06, + "loss": 3.3211, + "step": 30220 + }, + { + "epoch": 2.053607827150428, + "grad_norm": 5.6112518310546875, + "learning_rate": 7.43400937627395e-06, + "loss": 2.7453, + "step": 30225 + }, + { + "epoch": 2.0539475472210897, + "grad_norm": 8.130837440490723, + "learning_rate": 7.433584726185623e-06, + "loss": 2.8785, + "step": 30230 + }, + { + "epoch": 2.054287267291752, + "grad_norm": 6.1453471183776855, + "learning_rate": 7.433160076097297e-06, + "loss": 3.1476, + "step": 30235 + }, + { + "epoch": 2.0546269873624134, + "grad_norm": 6.931228160858154, + "learning_rate": 7.432735426008969e-06, + "loss": 3.0115, + "step": 30240 + }, + { + "epoch": 2.054966707433075, + "grad_norm": 6.852062702178955, + "learning_rate": 7.4323107759206424e-06, + "loss": 3.192, + "step": 30245 + }, + { + "epoch": 2.0553064275037367, + "grad_norm": 6.310399532318115, + "learning_rate": 7.431886125832315e-06, + "loss": 3.0337, + "step": 30250 + }, + { + "epoch": 2.055646147574399, + "grad_norm": 5.989257335662842, + "learning_rate": 7.431461475743987e-06, + "loss": 3.209, + "step": 30255 + }, + { + "epoch": 2.0559858676450604, + "grad_norm": 6.3037848472595215, + "learning_rate": 7.431036825655661e-06, + "loss": 3.3147, + "step": 30260 + }, + { + "epoch": 2.056325587715722, + "grad_norm": 7.290218353271484, + "learning_rate": 7.430612175567334e-06, + "loss": 3.1699, + "step": 30265 + }, + { + "epoch": 2.056665307786384, + "grad_norm": 6.6407599449157715, + "learning_rate": 7.430187525479006e-06, + "loss": 2.8782, + "step": 30270 + }, + { + "epoch": 2.0570050278570458, + "grad_norm": 6.893121719360352, + "learning_rate": 7.429762875390679e-06, + "loss": 3.0934, + "step": 30275 + }, + { + "epoch": 2.0573447479277074, + "grad_norm": 7.071237564086914, + "learning_rate": 7.429338225302351e-06, + "loss": 2.8248, + "step": 30280 + }, + { + "epoch": 2.0576844679983695, + "grad_norm": 7.076777935028076, + "learning_rate": 7.428913575214024e-06, + "loss": 3.1173, + "step": 30285 + }, + { + "epoch": 2.058024188069031, + "grad_norm": 6.613283634185791, + "learning_rate": 7.428488925125698e-06, + "loss": 3.0993, + "step": 30290 + }, + { + "epoch": 2.0583639081396927, + "grad_norm": 8.187283515930176, + "learning_rate": 7.42806427503737e-06, + "loss": 2.8957, + "step": 30295 + }, + { + "epoch": 2.058703628210355, + "grad_norm": 9.101707458496094, + "learning_rate": 7.427639624949042e-06, + "loss": 3.1303, + "step": 30300 + }, + { + "epoch": 2.0590433482810164, + "grad_norm": 7.341488361358643, + "learning_rate": 7.427214974860716e-06, + "loss": 3.0967, + "step": 30305 + }, + { + "epoch": 2.059383068351678, + "grad_norm": 9.172080993652344, + "learning_rate": 7.426790324772388e-06, + "loss": 2.985, + "step": 30310 + }, + { + "epoch": 2.05972278842234, + "grad_norm": 6.2468180656433105, + "learning_rate": 7.426365674684061e-06, + "loss": 3.0383, + "step": 30315 + }, + { + "epoch": 2.0600625084930018, + "grad_norm": 6.896068572998047, + "learning_rate": 7.4259410245957344e-06, + "loss": 3.0835, + "step": 30320 + }, + { + "epoch": 2.0604022285636634, + "grad_norm": 7.379892349243164, + "learning_rate": 7.425516374507406e-06, + "loss": 3.1961, + "step": 30325 + }, + { + "epoch": 2.0607419486343255, + "grad_norm": 6.945452690124512, + "learning_rate": 7.425091724419079e-06, + "loss": 2.9895, + "step": 30330 + }, + { + "epoch": 2.061081668704987, + "grad_norm": 6.753675937652588, + "learning_rate": 7.424667074330753e-06, + "loss": 3.0067, + "step": 30335 + }, + { + "epoch": 2.0614213887756487, + "grad_norm": 6.555142402648926, + "learning_rate": 7.424242424242425e-06, + "loss": 3.0543, + "step": 30340 + }, + { + "epoch": 2.061761108846311, + "grad_norm": 6.793056488037109, + "learning_rate": 7.423817774154098e-06, + "loss": 2.8671, + "step": 30345 + }, + { + "epoch": 2.0621008289169724, + "grad_norm": 6.887236595153809, + "learning_rate": 7.423393124065771e-06, + "loss": 3.118, + "step": 30350 + }, + { + "epoch": 2.062440548987634, + "grad_norm": 7.172531604766846, + "learning_rate": 7.422968473977443e-06, + "loss": 3.0934, + "step": 30355 + }, + { + "epoch": 2.062780269058296, + "grad_norm": 7.987286567687988, + "learning_rate": 7.422543823889116e-06, + "loss": 3.1139, + "step": 30360 + }, + { + "epoch": 2.0631199891289578, + "grad_norm": 6.451647758483887, + "learning_rate": 7.422119173800789e-06, + "loss": 2.8239, + "step": 30365 + }, + { + "epoch": 2.0634597091996194, + "grad_norm": 7.341365814208984, + "learning_rate": 7.421694523712462e-06, + "loss": 2.9155, + "step": 30370 + }, + { + "epoch": 2.0637994292702815, + "grad_norm": 7.285196304321289, + "learning_rate": 7.4212698736241336e-06, + "loss": 2.8902, + "step": 30375 + }, + { + "epoch": 2.064139149340943, + "grad_norm": 6.124340534210205, + "learning_rate": 7.420845223535807e-06, + "loss": 3.0155, + "step": 30380 + }, + { + "epoch": 2.0644788694116047, + "grad_norm": 6.281754493713379, + "learning_rate": 7.42042057344748e-06, + "loss": 2.9953, + "step": 30385 + }, + { + "epoch": 2.064818589482267, + "grad_norm": 8.114513397216797, + "learning_rate": 7.419995923359152e-06, + "loss": 3.2278, + "step": 30390 + }, + { + "epoch": 2.0651583095529285, + "grad_norm": 7.582934379577637, + "learning_rate": 7.419571273270826e-06, + "loss": 3.1821, + "step": 30395 + }, + { + "epoch": 2.06549802962359, + "grad_norm": 8.475608825683594, + "learning_rate": 7.419146623182498e-06, + "loss": 2.7941, + "step": 30400 + }, + { + "epoch": 2.0658377496942517, + "grad_norm": 7.410001277923584, + "learning_rate": 7.41872197309417e-06, + "loss": 3.0028, + "step": 30405 + }, + { + "epoch": 2.066177469764914, + "grad_norm": 5.827835559844971, + "learning_rate": 7.418297323005844e-06, + "loss": 3.0541, + "step": 30410 + }, + { + "epoch": 2.0665171898355754, + "grad_norm": 4.5743327140808105, + "learning_rate": 7.417872672917517e-06, + "loss": 2.9304, + "step": 30415 + }, + { + "epoch": 2.066856909906237, + "grad_norm": 6.978954315185547, + "learning_rate": 7.417448022829189e-06, + "loss": 2.7829, + "step": 30420 + }, + { + "epoch": 2.067196629976899, + "grad_norm": 6.144008636474609, + "learning_rate": 7.417023372740862e-06, + "loss": 2.9409, + "step": 30425 + }, + { + "epoch": 2.0675363500475608, + "grad_norm": 6.11353063583374, + "learning_rate": 7.416598722652535e-06, + "loss": 3.088, + "step": 30430 + }, + { + "epoch": 2.0678760701182224, + "grad_norm": 5.014461994171143, + "learning_rate": 7.416174072564207e-06, + "loss": 3.0599, + "step": 30435 + }, + { + "epoch": 2.0682157901888845, + "grad_norm": 5.133183002471924, + "learning_rate": 7.415749422475881e-06, + "loss": 3.0797, + "step": 30440 + }, + { + "epoch": 2.068555510259546, + "grad_norm": 8.063490867614746, + "learning_rate": 7.415324772387554e-06, + "loss": 3.1003, + "step": 30445 + }, + { + "epoch": 2.0688952303302077, + "grad_norm": 6.384683609008789, + "learning_rate": 7.4149001222992256e-06, + "loss": 3.2476, + "step": 30450 + }, + { + "epoch": 2.06923495040087, + "grad_norm": 6.547235488891602, + "learning_rate": 7.414475472210899e-06, + "loss": 2.8981, + "step": 30455 + }, + { + "epoch": 2.0695746704715314, + "grad_norm": 7.333865642547607, + "learning_rate": 7.414050822122571e-06, + "loss": 3.2193, + "step": 30460 + }, + { + "epoch": 2.069914390542193, + "grad_norm": 6.37252950668335, + "learning_rate": 7.413626172034244e-06, + "loss": 3.1135, + "step": 30465 + }, + { + "epoch": 2.070254110612855, + "grad_norm": 6.456254959106445, + "learning_rate": 7.413201521945918e-06, + "loss": 3.0188, + "step": 30470 + }, + { + "epoch": 2.0705938306835168, + "grad_norm": 6.158144474029541, + "learning_rate": 7.4127768718575896e-06, + "loss": 2.9135, + "step": 30475 + }, + { + "epoch": 2.0709335507541784, + "grad_norm": 9.851455688476562, + "learning_rate": 7.412352221769262e-06, + "loss": 3.1194, + "step": 30480 + }, + { + "epoch": 2.0712732708248405, + "grad_norm": 7.141355037689209, + "learning_rate": 7.411927571680936e-06, + "loss": 2.8919, + "step": 30485 + }, + { + "epoch": 2.071612990895502, + "grad_norm": 6.668102741241455, + "learning_rate": 7.411502921592608e-06, + "loss": 3.0881, + "step": 30490 + }, + { + "epoch": 2.0719527109661637, + "grad_norm": 6.4865217208862305, + "learning_rate": 7.411078271504281e-06, + "loss": 3.0479, + "step": 30495 + }, + { + "epoch": 2.072292431036826, + "grad_norm": 6.404610633850098, + "learning_rate": 7.410653621415954e-06, + "loss": 2.8927, + "step": 30500 + }, + { + "epoch": 2.0726321511074874, + "grad_norm": 7.109172344207764, + "learning_rate": 7.410228971327626e-06, + "loss": 3.0977, + "step": 30505 + }, + { + "epoch": 2.072971871178149, + "grad_norm": 6.076446056365967, + "learning_rate": 7.409804321239299e-06, + "loss": 2.8583, + "step": 30510 + }, + { + "epoch": 2.073311591248811, + "grad_norm": 8.868185043334961, + "learning_rate": 7.409379671150973e-06, + "loss": 2.8708, + "step": 30515 + }, + { + "epoch": 2.073651311319473, + "grad_norm": 6.686309814453125, + "learning_rate": 7.408955021062645e-06, + "loss": 3.124, + "step": 30520 + }, + { + "epoch": 2.0739910313901344, + "grad_norm": 6.291171550750732, + "learning_rate": 7.408530370974318e-06, + "loss": 3.3038, + "step": 30525 + }, + { + "epoch": 2.0743307514607965, + "grad_norm": 6.666740417480469, + "learning_rate": 7.40810572088599e-06, + "loss": 3.1001, + "step": 30530 + }, + { + "epoch": 2.074670471531458, + "grad_norm": 6.839756965637207, + "learning_rate": 7.407681070797663e-06, + "loss": 2.7963, + "step": 30535 + }, + { + "epoch": 2.0750101916021197, + "grad_norm": 7.202813625335693, + "learning_rate": 7.407256420709336e-06, + "loss": 2.8133, + "step": 30540 + }, + { + "epoch": 2.075349911672782, + "grad_norm": 6.243248462677002, + "learning_rate": 7.406831770621009e-06, + "loss": 2.9973, + "step": 30545 + }, + { + "epoch": 2.0756896317434435, + "grad_norm": 6.325594902038574, + "learning_rate": 7.406407120532682e-06, + "loss": 2.9996, + "step": 30550 + }, + { + "epoch": 2.076029351814105, + "grad_norm": 6.931618690490723, + "learning_rate": 7.4059824704443535e-06, + "loss": 3.1147, + "step": 30555 + }, + { + "epoch": 2.076369071884767, + "grad_norm": 6.34541654586792, + "learning_rate": 7.405557820356027e-06, + "loss": 3.0788, + "step": 30560 + }, + { + "epoch": 2.076708791955429, + "grad_norm": 7.97369909286499, + "learning_rate": 7.4051331702677e-06, + "loss": 3.0551, + "step": 30565 + }, + { + "epoch": 2.0770485120260904, + "grad_norm": 5.723299980163574, + "learning_rate": 7.404708520179372e-06, + "loss": 2.8384, + "step": 30570 + }, + { + "epoch": 2.0773882320967525, + "grad_norm": 7.085638999938965, + "learning_rate": 7.404283870091046e-06, + "loss": 3.0449, + "step": 30575 + }, + { + "epoch": 2.077727952167414, + "grad_norm": 6.749181270599365, + "learning_rate": 7.403859220002718e-06, + "loss": 3.0446, + "step": 30580 + }, + { + "epoch": 2.0780676722380758, + "grad_norm": 6.469086170196533, + "learning_rate": 7.403434569914392e-06, + "loss": 2.8329, + "step": 30585 + }, + { + "epoch": 2.0784073923087374, + "grad_norm": 6.5225043296813965, + "learning_rate": 7.403009919826064e-06, + "loss": 2.8959, + "step": 30590 + }, + { + "epoch": 2.0787471123793995, + "grad_norm": 6.610477924346924, + "learning_rate": 7.402585269737737e-06, + "loss": 2.9999, + "step": 30595 + }, + { + "epoch": 2.079086832450061, + "grad_norm": 6.626093864440918, + "learning_rate": 7.4021606196494104e-06, + "loss": 3.0946, + "step": 30600 + }, + { + "epoch": 2.0794265525207227, + "grad_norm": 6.362390518188477, + "learning_rate": 7.401735969561082e-06, + "loss": 3.0901, + "step": 30605 + }, + { + "epoch": 2.079766272591385, + "grad_norm": 6.01162052154541, + "learning_rate": 7.401311319472755e-06, + "loss": 3.1018, + "step": 30610 + }, + { + "epoch": 2.0801059926620464, + "grad_norm": 6.589305400848389, + "learning_rate": 7.400886669384428e-06, + "loss": 3.0902, + "step": 30615 + }, + { + "epoch": 2.080445712732708, + "grad_norm": 6.8286309242248535, + "learning_rate": 7.400462019296101e-06, + "loss": 2.8908, + "step": 30620 + }, + { + "epoch": 2.08078543280337, + "grad_norm": 6.181317329406738, + "learning_rate": 7.400037369207773e-06, + "loss": 3.193, + "step": 30625 + }, + { + "epoch": 2.0811251528740318, + "grad_norm": 5.736461639404297, + "learning_rate": 7.399612719119446e-06, + "loss": 3.0352, + "step": 30630 + }, + { + "epoch": 2.0814648729446934, + "grad_norm": 7.560342788696289, + "learning_rate": 7.399188069031119e-06, + "loss": 3.006, + "step": 30635 + }, + { + "epoch": 2.0818045930153555, + "grad_norm": 7.852213382720947, + "learning_rate": 7.398763418942791e-06, + "loss": 3.0336, + "step": 30640 + }, + { + "epoch": 2.082144313086017, + "grad_norm": 8.579500198364258, + "learning_rate": 7.398338768854465e-06, + "loss": 3.2013, + "step": 30645 + }, + { + "epoch": 2.0824840331566787, + "grad_norm": 6.180568218231201, + "learning_rate": 7.397914118766138e-06, + "loss": 3.1563, + "step": 30650 + }, + { + "epoch": 2.082823753227341, + "grad_norm": 5.73502779006958, + "learning_rate": 7.3974894686778096e-06, + "loss": 3.241, + "step": 30655 + }, + { + "epoch": 2.0831634732980024, + "grad_norm": 7.552704811096191, + "learning_rate": 7.397064818589483e-06, + "loss": 3.0792, + "step": 30660 + }, + { + "epoch": 2.083503193368664, + "grad_norm": 5.349788665771484, + "learning_rate": 7.396640168501156e-06, + "loss": 2.9095, + "step": 30665 + }, + { + "epoch": 2.083842913439326, + "grad_norm": 7.8943190574646, + "learning_rate": 7.396215518412828e-06, + "loss": 3.0596, + "step": 30670 + }, + { + "epoch": 2.084182633509988, + "grad_norm": 6.404941558837891, + "learning_rate": 7.395790868324502e-06, + "loss": 3.0326, + "step": 30675 + }, + { + "epoch": 2.0845223535806494, + "grad_norm": 6.811624050140381, + "learning_rate": 7.395366218236174e-06, + "loss": 3.0051, + "step": 30680 + }, + { + "epoch": 2.0848620736513115, + "grad_norm": 7.563056468963623, + "learning_rate": 7.394941568147846e-06, + "loss": 3.2102, + "step": 30685 + }, + { + "epoch": 2.085201793721973, + "grad_norm": 6.834903717041016, + "learning_rate": 7.39451691805952e-06, + "loss": 3.0102, + "step": 30690 + }, + { + "epoch": 2.0855415137926347, + "grad_norm": 6.096364974975586, + "learning_rate": 7.394092267971193e-06, + "loss": 3.0656, + "step": 30695 + }, + { + "epoch": 2.085881233863297, + "grad_norm": 7.382063865661621, + "learning_rate": 7.393667617882865e-06, + "loss": 2.8059, + "step": 30700 + }, + { + "epoch": 2.0862209539339585, + "grad_norm": 5.469675540924072, + "learning_rate": 7.393242967794538e-06, + "loss": 2.9826, + "step": 30705 + }, + { + "epoch": 2.08656067400462, + "grad_norm": 8.566733360290527, + "learning_rate": 7.39281831770621e-06, + "loss": 3.01, + "step": 30710 + }, + { + "epoch": 2.086900394075282, + "grad_norm": 6.995896816253662, + "learning_rate": 7.392393667617883e-06, + "loss": 2.9888, + "step": 30715 + }, + { + "epoch": 2.087240114145944, + "grad_norm": 5.995123863220215, + "learning_rate": 7.391969017529557e-06, + "loss": 3.0468, + "step": 30720 + }, + { + "epoch": 2.0875798342166054, + "grad_norm": 6.40384578704834, + "learning_rate": 7.391544367441229e-06, + "loss": 3.3366, + "step": 30725 + }, + { + "epoch": 2.0879195542872675, + "grad_norm": 7.092265605926514, + "learning_rate": 7.3911197173529016e-06, + "loss": 2.9941, + "step": 30730 + }, + { + "epoch": 2.088259274357929, + "grad_norm": 6.608972549438477, + "learning_rate": 7.390695067264575e-06, + "loss": 2.9262, + "step": 30735 + }, + { + "epoch": 2.0885989944285908, + "grad_norm": 5.352901935577393, + "learning_rate": 7.390270417176247e-06, + "loss": 2.951, + "step": 30740 + }, + { + "epoch": 2.0889387144992524, + "grad_norm": 5.592867374420166, + "learning_rate": 7.38984576708792e-06, + "loss": 3.0722, + "step": 30745 + }, + { + "epoch": 2.0892784345699145, + "grad_norm": 6.074195384979248, + "learning_rate": 7.389421116999594e-06, + "loss": 3.1353, + "step": 30750 + }, + { + "epoch": 2.089618154640576, + "grad_norm": 6.0903120040893555, + "learning_rate": 7.3889964669112656e-06, + "loss": 3.2604, + "step": 30755 + }, + { + "epoch": 2.0899578747112377, + "grad_norm": 7.318535327911377, + "learning_rate": 7.388571816822938e-06, + "loss": 2.9577, + "step": 30760 + }, + { + "epoch": 2.0902975947819, + "grad_norm": 5.021672248840332, + "learning_rate": 7.388147166734612e-06, + "loss": 3.4794, + "step": 30765 + }, + { + "epoch": 2.0906373148525614, + "grad_norm": 6.103806495666504, + "learning_rate": 7.387722516646284e-06, + "loss": 3.1224, + "step": 30770 + }, + { + "epoch": 2.090977034923223, + "grad_norm": 5.418480396270752, + "learning_rate": 7.387297866557957e-06, + "loss": 2.8391, + "step": 30775 + }, + { + "epoch": 2.091316754993885, + "grad_norm": 6.00214958190918, + "learning_rate": 7.3868732164696296e-06, + "loss": 3.0417, + "step": 30780 + }, + { + "epoch": 2.0916564750645468, + "grad_norm": 6.51531457901001, + "learning_rate": 7.386448566381302e-06, + "loss": 3.1094, + "step": 30785 + }, + { + "epoch": 2.0919961951352084, + "grad_norm": 5.929406642913818, + "learning_rate": 7.386023916292975e-06, + "loss": 3.151, + "step": 30790 + }, + { + "epoch": 2.0923359152058705, + "grad_norm": 9.431547164916992, + "learning_rate": 7.385599266204648e-06, + "loss": 3.2264, + "step": 30795 + }, + { + "epoch": 2.092675635276532, + "grad_norm": 6.056599140167236, + "learning_rate": 7.385174616116321e-06, + "loss": 3.0249, + "step": 30800 + }, + { + "epoch": 2.0930153553471937, + "grad_norm": 8.750822067260742, + "learning_rate": 7.384749966027993e-06, + "loss": 3.0154, + "step": 30805 + }, + { + "epoch": 2.093355075417856, + "grad_norm": 7.499706745147705, + "learning_rate": 7.384325315939666e-06, + "loss": 2.9948, + "step": 30810 + }, + { + "epoch": 2.0936947954885174, + "grad_norm": 5.553750514984131, + "learning_rate": 7.383900665851339e-06, + "loss": 3.0604, + "step": 30815 + }, + { + "epoch": 2.094034515559179, + "grad_norm": 6.795691967010498, + "learning_rate": 7.383476015763011e-06, + "loss": 2.978, + "step": 30820 + }, + { + "epoch": 2.094374235629841, + "grad_norm": 7.7020416259765625, + "learning_rate": 7.383051365674685e-06, + "loss": 3.0714, + "step": 30825 + }, + { + "epoch": 2.094713955700503, + "grad_norm": 7.792293548583984, + "learning_rate": 7.382626715586358e-06, + "loss": 3.0654, + "step": 30830 + }, + { + "epoch": 2.0950536757711644, + "grad_norm": 6.471405029296875, + "learning_rate": 7.3822020654980295e-06, + "loss": 2.9928, + "step": 30835 + }, + { + "epoch": 2.0953933958418265, + "grad_norm": 6.571180820465088, + "learning_rate": 7.381777415409703e-06, + "loss": 3.0282, + "step": 30840 + }, + { + "epoch": 2.095733115912488, + "grad_norm": 7.064785003662109, + "learning_rate": 7.381352765321376e-06, + "loss": 3.1436, + "step": 30845 + }, + { + "epoch": 2.0960728359831498, + "grad_norm": 5.598662376403809, + "learning_rate": 7.380928115233048e-06, + "loss": 3.3026, + "step": 30850 + }, + { + "epoch": 2.096412556053812, + "grad_norm": 6.728210926055908, + "learning_rate": 7.380503465144722e-06, + "loss": 3.0426, + "step": 30855 + }, + { + "epoch": 2.0967522761244735, + "grad_norm": 7.256170749664307, + "learning_rate": 7.380078815056394e-06, + "loss": 3.1617, + "step": 30860 + }, + { + "epoch": 2.097091996195135, + "grad_norm": 8.046212196350098, + "learning_rate": 7.379654164968066e-06, + "loss": 2.8027, + "step": 30865 + }, + { + "epoch": 2.097431716265797, + "grad_norm": 6.497415542602539, + "learning_rate": 7.37922951487974e-06, + "loss": 3.0924, + "step": 30870 + }, + { + "epoch": 2.097771436336459, + "grad_norm": 5.290037631988525, + "learning_rate": 7.378804864791412e-06, + "loss": 2.9797, + "step": 30875 + }, + { + "epoch": 2.0981111564071204, + "grad_norm": 7.765720367431641, + "learning_rate": 7.378380214703085e-06, + "loss": 2.9246, + "step": 30880 + }, + { + "epoch": 2.0984508764777825, + "grad_norm": 7.435765743255615, + "learning_rate": 7.377955564614758e-06, + "loss": 3.2395, + "step": 30885 + }, + { + "epoch": 2.098790596548444, + "grad_norm": 6.156705379486084, + "learning_rate": 7.37753091452643e-06, + "loss": 3.2515, + "step": 30890 + }, + { + "epoch": 2.0991303166191058, + "grad_norm": 7.460115909576416, + "learning_rate": 7.377106264438103e-06, + "loss": 2.9509, + "step": 30895 + }, + { + "epoch": 2.099470036689768, + "grad_norm": 7.5323405265808105, + "learning_rate": 7.376681614349777e-06, + "loss": 3.0732, + "step": 30900 + }, + { + "epoch": 2.0998097567604295, + "grad_norm": 6.7592010498046875, + "learning_rate": 7.376256964261449e-06, + "loss": 3.3193, + "step": 30905 + }, + { + "epoch": 2.100149476831091, + "grad_norm": 6.12090539932251, + "learning_rate": 7.3758323141731215e-06, + "loss": 3.0785, + "step": 30910 + }, + { + "epoch": 2.100489196901753, + "grad_norm": 7.293724060058594, + "learning_rate": 7.375407664084795e-06, + "loss": 3.1629, + "step": 30915 + }, + { + "epoch": 2.100828916972415, + "grad_norm": 7.286472320556641, + "learning_rate": 7.374983013996467e-06, + "loss": 2.8365, + "step": 30920 + }, + { + "epoch": 2.1011686370430764, + "grad_norm": 8.03487491607666, + "learning_rate": 7.374558363908141e-06, + "loss": 3.2149, + "step": 30925 + }, + { + "epoch": 2.101508357113738, + "grad_norm": 6.119637966156006, + "learning_rate": 7.374133713819814e-06, + "loss": 2.9189, + "step": 30930 + }, + { + "epoch": 2.1018480771844, + "grad_norm": 6.922786235809326, + "learning_rate": 7.3737090637314855e-06, + "loss": 3.0909, + "step": 30935 + }, + { + "epoch": 2.1021877972550618, + "grad_norm": 5.785052299499512, + "learning_rate": 7.373284413643159e-06, + "loss": 3.1342, + "step": 30940 + }, + { + "epoch": 2.1025275173257234, + "grad_norm": 6.9791259765625, + "learning_rate": 7.372859763554831e-06, + "loss": 2.7052, + "step": 30945 + }, + { + "epoch": 2.1028672373963855, + "grad_norm": 7.4300432205200195, + "learning_rate": 7.372435113466504e-06, + "loss": 2.9458, + "step": 30950 + }, + { + "epoch": 2.103206957467047, + "grad_norm": 8.93624210357666, + "learning_rate": 7.372010463378178e-06, + "loss": 3.0491, + "step": 30955 + }, + { + "epoch": 2.1035466775377087, + "grad_norm": 7.299061298370361, + "learning_rate": 7.3715858132898496e-06, + "loss": 3.2787, + "step": 30960 + }, + { + "epoch": 2.103886397608371, + "grad_norm": 7.201992511749268, + "learning_rate": 7.371161163201522e-06, + "loss": 3.0029, + "step": 30965 + }, + { + "epoch": 2.1042261176790324, + "grad_norm": 6.910516262054443, + "learning_rate": 7.370736513113196e-06, + "loss": 2.9811, + "step": 30970 + }, + { + "epoch": 2.104565837749694, + "grad_norm": 6.79630708694458, + "learning_rate": 7.370311863024868e-06, + "loss": 3.0258, + "step": 30975 + }, + { + "epoch": 2.104905557820356, + "grad_norm": 6.613862037658691, + "learning_rate": 7.369887212936541e-06, + "loss": 2.9381, + "step": 30980 + }, + { + "epoch": 2.105245277891018, + "grad_norm": 5.980459690093994, + "learning_rate": 7.369462562848214e-06, + "loss": 2.9495, + "step": 30985 + }, + { + "epoch": 2.1055849979616794, + "grad_norm": 6.9055070877075195, + "learning_rate": 7.369037912759886e-06, + "loss": 2.9615, + "step": 30990 + }, + { + "epoch": 2.1059247180323415, + "grad_norm": 7.224883556365967, + "learning_rate": 7.368613262671559e-06, + "loss": 3.1365, + "step": 30995 + }, + { + "epoch": 2.106264438103003, + "grad_norm": 7.591970443725586, + "learning_rate": 7.368188612583233e-06, + "loss": 3.218, + "step": 31000 + }, + { + "epoch": 2.1066041581736648, + "grad_norm": 6.06096076965332, + "learning_rate": 7.367763962494905e-06, + "loss": 3.0498, + "step": 31005 + }, + { + "epoch": 2.106943878244327, + "grad_norm": 6.232500076293945, + "learning_rate": 7.3673393124065776e-06, + "loss": 2.9266, + "step": 31010 + }, + { + "epoch": 2.1072835983149885, + "grad_norm": 6.278465270996094, + "learning_rate": 7.366914662318251e-06, + "loss": 3.2762, + "step": 31015 + }, + { + "epoch": 2.10762331838565, + "grad_norm": 5.762889385223389, + "learning_rate": 7.366490012229923e-06, + "loss": 2.918, + "step": 31020 + }, + { + "epoch": 2.107963038456312, + "grad_norm": 7.230963230133057, + "learning_rate": 7.366065362141596e-06, + "loss": 2.9886, + "step": 31025 + }, + { + "epoch": 2.108302758526974, + "grad_norm": 5.6988701820373535, + "learning_rate": 7.365640712053269e-06, + "loss": 3.0904, + "step": 31030 + }, + { + "epoch": 2.1086424785976354, + "grad_norm": 8.332229614257812, + "learning_rate": 7.3652160619649416e-06, + "loss": 2.8679, + "step": 31035 + }, + { + "epoch": 2.1089821986682975, + "grad_norm": 6.351261615753174, + "learning_rate": 7.364791411876614e-06, + "loss": 3.1064, + "step": 31040 + }, + { + "epoch": 2.109321918738959, + "grad_norm": 5.579447269439697, + "learning_rate": 7.364366761788287e-06, + "loss": 3.1865, + "step": 31045 + }, + { + "epoch": 2.1096616388096208, + "grad_norm": 9.787012100219727, + "learning_rate": 7.36394211169996e-06, + "loss": 3.0624, + "step": 31050 + }, + { + "epoch": 2.110001358880283, + "grad_norm": 6.5315680503845215, + "learning_rate": 7.363517461611632e-06, + "loss": 3.3208, + "step": 31055 + }, + { + "epoch": 2.1103410789509445, + "grad_norm": 6.235720157623291, + "learning_rate": 7.3630928115233056e-06, + "loss": 3.059, + "step": 31060 + }, + { + "epoch": 2.110680799021606, + "grad_norm": 7.109639644622803, + "learning_rate": 7.362668161434978e-06, + "loss": 3.1679, + "step": 31065 + }, + { + "epoch": 2.111020519092268, + "grad_norm": 7.796191215515137, + "learning_rate": 7.36224351134665e-06, + "loss": 3.1241, + "step": 31070 + }, + { + "epoch": 2.11136023916293, + "grad_norm": 5.626248359680176, + "learning_rate": 7.361818861258324e-06, + "loss": 3.3064, + "step": 31075 + }, + { + "epoch": 2.1116999592335914, + "grad_norm": 8.624835014343262, + "learning_rate": 7.361394211169997e-06, + "loss": 2.9932, + "step": 31080 + }, + { + "epoch": 2.112039679304253, + "grad_norm": 8.519912719726562, + "learning_rate": 7.360969561081669e-06, + "loss": 3.0861, + "step": 31085 + }, + { + "epoch": 2.112379399374915, + "grad_norm": 6.889503002166748, + "learning_rate": 7.360544910993342e-06, + "loss": 2.993, + "step": 31090 + }, + { + "epoch": 2.1127191194455768, + "grad_norm": 5.380033016204834, + "learning_rate": 7.360120260905015e-06, + "loss": 3.0393, + "step": 31095 + }, + { + "epoch": 2.1130588395162384, + "grad_norm": 4.941396713256836, + "learning_rate": 7.359695610816687e-06, + "loss": 2.7549, + "step": 31100 + }, + { + "epoch": 2.1133985595869005, + "grad_norm": 8.41861629486084, + "learning_rate": 7.359270960728361e-06, + "loss": 3.0687, + "step": 31105 + }, + { + "epoch": 2.113738279657562, + "grad_norm": 7.792470455169678, + "learning_rate": 7.358931240657699e-06, + "loss": 3.3175, + "step": 31110 + }, + { + "epoch": 2.1140779997282237, + "grad_norm": 5.49116849899292, + "learning_rate": 7.358506590569371e-06, + "loss": 2.8418, + "step": 31115 + }, + { + "epoch": 2.114417719798886, + "grad_norm": 5.910560131072998, + "learning_rate": 7.3580819404810444e-06, + "loss": 2.9448, + "step": 31120 + }, + { + "epoch": 2.1147574398695475, + "grad_norm": 6.964545249938965, + "learning_rate": 7.357657290392716e-06, + "loss": 3.1423, + "step": 31125 + }, + { + "epoch": 2.115097159940209, + "grad_norm": 6.682591915130615, + "learning_rate": 7.35723264030439e-06, + "loss": 2.9729, + "step": 31130 + }, + { + "epoch": 2.115436880010871, + "grad_norm": 6.541523456573486, + "learning_rate": 7.356807990216063e-06, + "loss": 2.6539, + "step": 31135 + }, + { + "epoch": 2.115776600081533, + "grad_norm": 7.3295578956604, + "learning_rate": 7.356383340127735e-06, + "loss": 2.8447, + "step": 31140 + }, + { + "epoch": 2.1161163201521944, + "grad_norm": 5.864851951599121, + "learning_rate": 7.3559586900394085e-06, + "loss": 3.2892, + "step": 31145 + }, + { + "epoch": 2.1164560402228565, + "grad_norm": 6.587907791137695, + "learning_rate": 7.355534039951081e-06, + "loss": 3.3915, + "step": 31150 + }, + { + "epoch": 2.116795760293518, + "grad_norm": 6.575422286987305, + "learning_rate": 7.355109389862753e-06, + "loss": 2.7614, + "step": 31155 + }, + { + "epoch": 2.1171354803641798, + "grad_norm": 7.2569193840026855, + "learning_rate": 7.354684739774427e-06, + "loss": 2.953, + "step": 31160 + }, + { + "epoch": 2.117475200434842, + "grad_norm": 6.14595365524292, + "learning_rate": 7.3542600896861e-06, + "loss": 3.1894, + "step": 31165 + }, + { + "epoch": 2.1178149205055035, + "grad_norm": 6.557041645050049, + "learning_rate": 7.353835439597772e-06, + "loss": 2.9275, + "step": 31170 + }, + { + "epoch": 2.118154640576165, + "grad_norm": 6.439289093017578, + "learning_rate": 7.353410789509445e-06, + "loss": 3.0794, + "step": 31175 + }, + { + "epoch": 2.118494360646827, + "grad_norm": 6.595274448394775, + "learning_rate": 7.352986139421118e-06, + "loss": 3.0602, + "step": 31180 + }, + { + "epoch": 2.118834080717489, + "grad_norm": 6.6159844398498535, + "learning_rate": 7.35256148933279e-06, + "loss": 2.9984, + "step": 31185 + }, + { + "epoch": 2.1191738007881504, + "grad_norm": 6.33873176574707, + "learning_rate": 7.352136839244464e-06, + "loss": 2.9046, + "step": 31190 + }, + { + "epoch": 2.1195135208588125, + "grad_norm": 6.354400157928467, + "learning_rate": 7.3517121891561365e-06, + "loss": 2.9724, + "step": 31195 + }, + { + "epoch": 2.119853240929474, + "grad_norm": 6.6232171058654785, + "learning_rate": 7.351287539067808e-06, + "loss": 3.1219, + "step": 31200 + }, + { + "epoch": 2.1201929610001358, + "grad_norm": 7.306310176849365, + "learning_rate": 7.350862888979482e-06, + "loss": 3.3011, + "step": 31205 + }, + { + "epoch": 2.120532681070798, + "grad_norm": 7.479798316955566, + "learning_rate": 7.350438238891154e-06, + "loss": 3.023, + "step": 31210 + }, + { + "epoch": 2.1208724011414595, + "grad_norm": 8.714900970458984, + "learning_rate": 7.350013588802827e-06, + "loss": 3.1412, + "step": 31215 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 8.44467830657959, + "learning_rate": 7.3495889387145005e-06, + "loss": 3.1218, + "step": 31220 + }, + { + "epoch": 2.121551841282783, + "grad_norm": 8.177520751953125, + "learning_rate": 7.349164288626172e-06, + "loss": 2.916, + "step": 31225 + }, + { + "epoch": 2.121891561353445, + "grad_norm": 6.056643009185791, + "learning_rate": 7.348739638537845e-06, + "loss": 3.0023, + "step": 31230 + }, + { + "epoch": 2.1222312814241064, + "grad_norm": 7.926848411560059, + "learning_rate": 7.348314988449519e-06, + "loss": 3.0706, + "step": 31235 + }, + { + "epoch": 2.1225710014947685, + "grad_norm": 6.727169513702393, + "learning_rate": 7.347890338361191e-06, + "loss": 3.0476, + "step": 31240 + }, + { + "epoch": 2.12291072156543, + "grad_norm": 6.6997389793396, + "learning_rate": 7.347465688272864e-06, + "loss": 3.1991, + "step": 31245 + }, + { + "epoch": 2.123250441636092, + "grad_norm": 7.485479831695557, + "learning_rate": 7.347041038184537e-06, + "loss": 3.2094, + "step": 31250 + }, + { + "epoch": 2.123590161706754, + "grad_norm": 9.760879516601562, + "learning_rate": 7.346616388096209e-06, + "loss": 3.2411, + "step": 31255 + }, + { + "epoch": 2.1239298817774155, + "grad_norm": 6.104340076446533, + "learning_rate": 7.346191738007882e-06, + "loss": 2.927, + "step": 31260 + }, + { + "epoch": 2.124269601848077, + "grad_norm": 8.262473106384277, + "learning_rate": 7.345767087919556e-06, + "loss": 2.7745, + "step": 31265 + }, + { + "epoch": 2.1246093219187387, + "grad_norm": 7.0062737464904785, + "learning_rate": 7.345342437831228e-06, + "loss": 2.9374, + "step": 31270 + }, + { + "epoch": 2.124949041989401, + "grad_norm": 6.718959331512451, + "learning_rate": 7.3449177877429004e-06, + "loss": 2.8873, + "step": 31275 + }, + { + "epoch": 2.1252887620600625, + "grad_norm": 6.251192092895508, + "learning_rate": 7.344493137654573e-06, + "loss": 3.0932, + "step": 31280 + }, + { + "epoch": 2.125628482130724, + "grad_norm": 6.390078544616699, + "learning_rate": 7.344068487566246e-06, + "loss": 3.1515, + "step": 31285 + }, + { + "epoch": 2.125968202201386, + "grad_norm": 7.489143371582031, + "learning_rate": 7.343643837477919e-06, + "loss": 3.0539, + "step": 31290 + }, + { + "epoch": 2.126307922272048, + "grad_norm": 5.849267482757568, + "learning_rate": 7.343219187389592e-06, + "loss": 2.6991, + "step": 31295 + }, + { + "epoch": 2.1266476423427094, + "grad_norm": 7.752559185028076, + "learning_rate": 7.3427945373012644e-06, + "loss": 2.9313, + "step": 31300 + }, + { + "epoch": 2.1269873624133715, + "grad_norm": 6.754436492919922, + "learning_rate": 7.342369887212936e-06, + "loss": 2.8826, + "step": 31305 + }, + { + "epoch": 2.127327082484033, + "grad_norm": 5.678539752960205, + "learning_rate": 7.34194523712461e-06, + "loss": 2.9584, + "step": 31310 + }, + { + "epoch": 2.1276668025546948, + "grad_norm": 6.839791774749756, + "learning_rate": 7.341520587036283e-06, + "loss": 3.0461, + "step": 31315 + }, + { + "epoch": 2.128006522625357, + "grad_norm": 7.407652854919434, + "learning_rate": 7.341095936947955e-06, + "loss": 2.9292, + "step": 31320 + }, + { + "epoch": 2.1283462426960185, + "grad_norm": 7.240954399108887, + "learning_rate": 7.3406712868596284e-06, + "loss": 2.9618, + "step": 31325 + }, + { + "epoch": 2.12868596276668, + "grad_norm": 8.824786186218262, + "learning_rate": 7.340246636771301e-06, + "loss": 2.9703, + "step": 31330 + }, + { + "epoch": 2.129025682837342, + "grad_norm": 6.423029899597168, + "learning_rate": 7.339821986682973e-06, + "loss": 3.2473, + "step": 31335 + }, + { + "epoch": 2.129365402908004, + "grad_norm": 6.845353603363037, + "learning_rate": 7.339397336594647e-06, + "loss": 3.052, + "step": 31340 + }, + { + "epoch": 2.1297051229786654, + "grad_norm": 7.932998180389404, + "learning_rate": 7.33897268650632e-06, + "loss": 3.0773, + "step": 31345 + }, + { + "epoch": 2.1300448430493275, + "grad_norm": 6.813721656799316, + "learning_rate": 7.338548036417992e-06, + "loss": 3.1742, + "step": 31350 + }, + { + "epoch": 2.130384563119989, + "grad_norm": 7.324182033538818, + "learning_rate": 7.338123386329665e-06, + "loss": 3.0105, + "step": 31355 + }, + { + "epoch": 2.1307242831906508, + "grad_norm": 6.641140460968018, + "learning_rate": 7.337698736241338e-06, + "loss": 2.8216, + "step": 31360 + }, + { + "epoch": 2.131064003261313, + "grad_norm": 5.991360664367676, + "learning_rate": 7.33727408615301e-06, + "loss": 2.9633, + "step": 31365 + }, + { + "epoch": 2.1314037233319745, + "grad_norm": 5.527778148651123, + "learning_rate": 7.336849436064684e-06, + "loss": 2.9642, + "step": 31370 + }, + { + "epoch": 2.131743443402636, + "grad_norm": 7.817241668701172, + "learning_rate": 7.336424785976356e-06, + "loss": 3.0665, + "step": 31375 + }, + { + "epoch": 2.132083163473298, + "grad_norm": 8.891615867614746, + "learning_rate": 7.336000135888028e-06, + "loss": 3.0248, + "step": 31380 + }, + { + "epoch": 2.13242288354396, + "grad_norm": 6.476778030395508, + "learning_rate": 7.335575485799702e-06, + "loss": 3.1637, + "step": 31385 + }, + { + "epoch": 2.1327626036146214, + "grad_norm": 8.826467514038086, + "learning_rate": 7.335150835711374e-06, + "loss": 3.1428, + "step": 31390 + }, + { + "epoch": 2.1331023236852835, + "grad_norm": 7.696263313293457, + "learning_rate": 7.334726185623047e-06, + "loss": 2.9113, + "step": 31395 + }, + { + "epoch": 2.133442043755945, + "grad_norm": 6.588804244995117, + "learning_rate": 7.3343015355347204e-06, + "loss": 3.1032, + "step": 31400 + }, + { + "epoch": 2.133781763826607, + "grad_norm": 6.481825828552246, + "learning_rate": 7.333876885446392e-06, + "loss": 2.9379, + "step": 31405 + }, + { + "epoch": 2.134121483897269, + "grad_norm": 6.354457378387451, + "learning_rate": 7.333452235358065e-06, + "loss": 3.1893, + "step": 31410 + }, + { + "epoch": 2.1344612039679305, + "grad_norm": 9.416118621826172, + "learning_rate": 7.333027585269739e-06, + "loss": 3.0251, + "step": 31415 + }, + { + "epoch": 2.134800924038592, + "grad_norm": 6.920758247375488, + "learning_rate": 7.332602935181411e-06, + "loss": 3.1118, + "step": 31420 + }, + { + "epoch": 2.1351406441092537, + "grad_norm": 8.168233871459961, + "learning_rate": 7.332178285093084e-06, + "loss": 3.0033, + "step": 31425 + }, + { + "epoch": 2.135480364179916, + "grad_norm": 8.529585838317871, + "learning_rate": 7.331753635004757e-06, + "loss": 3.1611, + "step": 31430 + }, + { + "epoch": 2.1358200842505775, + "grad_norm": 8.843220710754395, + "learning_rate": 7.331328984916429e-06, + "loss": 3.0123, + "step": 31435 + }, + { + "epoch": 2.136159804321239, + "grad_norm": 7.131879806518555, + "learning_rate": 7.330904334828102e-06, + "loss": 2.9991, + "step": 31440 + }, + { + "epoch": 2.136499524391901, + "grad_norm": 6.5155792236328125, + "learning_rate": 7.330479684739775e-06, + "loss": 2.8162, + "step": 31445 + }, + { + "epoch": 2.136839244462563, + "grad_norm": 6.646294116973877, + "learning_rate": 7.330055034651448e-06, + "loss": 2.9915, + "step": 31450 + }, + { + "epoch": 2.1371789645332244, + "grad_norm": 7.505641460418701, + "learning_rate": 7.32963038456312e-06, + "loss": 3.0111, + "step": 31455 + }, + { + "epoch": 2.1375186846038865, + "grad_norm": 6.197359561920166, + "learning_rate": 7.329205734474793e-06, + "loss": 3.0143, + "step": 31460 + }, + { + "epoch": 2.137858404674548, + "grad_norm": 5.754868984222412, + "learning_rate": 7.328781084386466e-06, + "loss": 2.8591, + "step": 31465 + }, + { + "epoch": 2.1381981247452098, + "grad_norm": 7.323569297790527, + "learning_rate": 7.32835643429814e-06, + "loss": 2.718, + "step": 31470 + }, + { + "epoch": 2.138537844815872, + "grad_norm": 6.119228363037109, + "learning_rate": 7.327931784209812e-06, + "loss": 2.8266, + "step": 31475 + }, + { + "epoch": 2.1388775648865335, + "grad_norm": 7.101261615753174, + "learning_rate": 7.327507134121484e-06, + "loss": 2.8279, + "step": 31480 + }, + { + "epoch": 2.139217284957195, + "grad_norm": 8.100776672363281, + "learning_rate": 7.327082484033158e-06, + "loss": 3.0884, + "step": 31485 + }, + { + "epoch": 2.139557005027857, + "grad_norm": 9.120532035827637, + "learning_rate": 7.32665783394483e-06, + "loss": 3.1964, + "step": 31490 + }, + { + "epoch": 2.139896725098519, + "grad_norm": 10.770353317260742, + "learning_rate": 7.326233183856503e-06, + "loss": 2.9828, + "step": 31495 + }, + { + "epoch": 2.1402364451691804, + "grad_norm": 6.438304901123047, + "learning_rate": 7.3258085337681765e-06, + "loss": 2.8653, + "step": 31500 + }, + { + "epoch": 2.1405761652398425, + "grad_norm": 6.68835973739624, + "learning_rate": 7.325383883679848e-06, + "loss": 2.9607, + "step": 31505 + }, + { + "epoch": 2.140915885310504, + "grad_norm": 8.175272941589355, + "learning_rate": 7.324959233591521e-06, + "loss": 3.0666, + "step": 31510 + }, + { + "epoch": 2.1412556053811658, + "grad_norm": 6.76072883605957, + "learning_rate": 7.324534583503195e-06, + "loss": 3.1347, + "step": 31515 + }, + { + "epoch": 2.141595325451828, + "grad_norm": 7.078790664672852, + "learning_rate": 7.324109933414867e-06, + "loss": 2.9874, + "step": 31520 + }, + { + "epoch": 2.1419350455224895, + "grad_norm": 6.3969855308532715, + "learning_rate": 7.32368528332654e-06, + "loss": 2.839, + "step": 31525 + }, + { + "epoch": 2.142274765593151, + "grad_norm": 7.893093585968018, + "learning_rate": 7.323260633238212e-06, + "loss": 3.1396, + "step": 31530 + }, + { + "epoch": 2.142614485663813, + "grad_norm": 6.397088050842285, + "learning_rate": 7.322835983149885e-06, + "loss": 3.0307, + "step": 31535 + }, + { + "epoch": 2.142954205734475, + "grad_norm": 6.266099452972412, + "learning_rate": 7.322411333061558e-06, + "loss": 3.0748, + "step": 31540 + }, + { + "epoch": 2.1432939258051364, + "grad_norm": 7.305940628051758, + "learning_rate": 7.321986682973231e-06, + "loss": 2.8873, + "step": 31545 + }, + { + "epoch": 2.1436336458757985, + "grad_norm": 4.783533096313477, + "learning_rate": 7.321562032884904e-06, + "loss": 3.2467, + "step": 31550 + }, + { + "epoch": 2.14397336594646, + "grad_norm": 9.011321067810059, + "learning_rate": 7.3211373827965756e-06, + "loss": 3.0568, + "step": 31555 + }, + { + "epoch": 2.144313086017122, + "grad_norm": 7.145503044128418, + "learning_rate": 7.320712732708249e-06, + "loss": 3.1992, + "step": 31560 + }, + { + "epoch": 2.144652806087784, + "grad_norm": 9.10889720916748, + "learning_rate": 7.320288082619922e-06, + "loss": 3.1218, + "step": 31565 + }, + { + "epoch": 2.1449925261584455, + "grad_norm": 6.321659564971924, + "learning_rate": 7.319863432531594e-06, + "loss": 2.8753, + "step": 31570 + }, + { + "epoch": 2.145332246229107, + "grad_norm": 7.991772651672363, + "learning_rate": 7.319438782443268e-06, + "loss": 3.0322, + "step": 31575 + }, + { + "epoch": 2.145671966299769, + "grad_norm": 8.014856338500977, + "learning_rate": 7.3190141323549404e-06, + "loss": 3.0152, + "step": 31580 + }, + { + "epoch": 2.146011686370431, + "grad_norm": 7.539979934692383, + "learning_rate": 7.318589482266612e-06, + "loss": 2.9989, + "step": 31585 + }, + { + "epoch": 2.1463514064410925, + "grad_norm": 6.731026649475098, + "learning_rate": 7.318164832178286e-06, + "loss": 3.1279, + "step": 31590 + }, + { + "epoch": 2.1466911265117545, + "grad_norm": 9.412519454956055, + "learning_rate": 7.317740182089959e-06, + "loss": 2.9434, + "step": 31595 + }, + { + "epoch": 2.147030846582416, + "grad_norm": 6.752955436706543, + "learning_rate": 7.317315532001631e-06, + "loss": 3.0189, + "step": 31600 + }, + { + "epoch": 2.147370566653078, + "grad_norm": 7.939055442810059, + "learning_rate": 7.3168908819133044e-06, + "loss": 2.823, + "step": 31605 + }, + { + "epoch": 2.14771028672374, + "grad_norm": 6.868344306945801, + "learning_rate": 7.316466231824977e-06, + "loss": 2.9257, + "step": 31610 + }, + { + "epoch": 2.1480500067944015, + "grad_norm": 7.3974528312683105, + "learning_rate": 7.316041581736649e-06, + "loss": 3.0813, + "step": 31615 + }, + { + "epoch": 2.148389726865063, + "grad_norm": 7.769813537597656, + "learning_rate": 7.315616931648323e-06, + "loss": 2.9701, + "step": 31620 + }, + { + "epoch": 2.1487294469357248, + "grad_norm": 9.037347793579102, + "learning_rate": 7.315192281559995e-06, + "loss": 3.1482, + "step": 31625 + }, + { + "epoch": 2.149069167006387, + "grad_norm": 9.091198921203613, + "learning_rate": 7.314767631471668e-06, + "loss": 3.0018, + "step": 31630 + }, + { + "epoch": 2.1494088870770485, + "grad_norm": 8.106643676757812, + "learning_rate": 7.314342981383341e-06, + "loss": 2.8719, + "step": 31635 + }, + { + "epoch": 2.14974860714771, + "grad_norm": 8.638731002807617, + "learning_rate": 7.313918331295013e-06, + "loss": 3.2197, + "step": 31640 + }, + { + "epoch": 2.150088327218372, + "grad_norm": 8.592079162597656, + "learning_rate": 7.313493681206686e-06, + "loss": 3.0378, + "step": 31645 + }, + { + "epoch": 2.150428047289034, + "grad_norm": 7.916415691375732, + "learning_rate": 7.31306903111836e-06, + "loss": 3.3185, + "step": 31650 + }, + { + "epoch": 2.1507677673596954, + "grad_norm": 6.317062854766846, + "learning_rate": 7.312644381030032e-06, + "loss": 3.1734, + "step": 31655 + }, + { + "epoch": 2.1511074874303575, + "grad_norm": 5.7183308601379395, + "learning_rate": 7.312219730941704e-06, + "loss": 3.1731, + "step": 31660 + }, + { + "epoch": 2.151447207501019, + "grad_norm": 6.22590446472168, + "learning_rate": 7.311795080853378e-06, + "loss": 3.2243, + "step": 31665 + }, + { + "epoch": 2.1517869275716808, + "grad_norm": 6.696282863616943, + "learning_rate": 7.31137043076505e-06, + "loss": 3.1006, + "step": 31670 + }, + { + "epoch": 2.152126647642343, + "grad_norm": 6.904116630554199, + "learning_rate": 7.310945780676723e-06, + "loss": 3.2083, + "step": 31675 + }, + { + "epoch": 2.1524663677130045, + "grad_norm": 9.556726455688477, + "learning_rate": 7.3105211305883964e-06, + "loss": 2.9842, + "step": 31680 + }, + { + "epoch": 2.152806087783666, + "grad_norm": 8.072011947631836, + "learning_rate": 7.310096480500068e-06, + "loss": 3.0129, + "step": 31685 + }, + { + "epoch": 2.153145807854328, + "grad_norm": 6.953336238861084, + "learning_rate": 7.309671830411741e-06, + "loss": 3.1364, + "step": 31690 + }, + { + "epoch": 2.15348552792499, + "grad_norm": 6.330174922943115, + "learning_rate": 7.309247180323414e-06, + "loss": 2.7445, + "step": 31695 + }, + { + "epoch": 2.1538252479956514, + "grad_norm": 7.489501953125, + "learning_rate": 7.308822530235087e-06, + "loss": 3.0618, + "step": 31700 + }, + { + "epoch": 2.1541649680663135, + "grad_norm": 8.0653657913208, + "learning_rate": 7.30839788014676e-06, + "loss": 3.0647, + "step": 31705 + }, + { + "epoch": 2.154504688136975, + "grad_norm": 7.3168253898620605, + "learning_rate": 7.307973230058432e-06, + "loss": 3.1068, + "step": 31710 + }, + { + "epoch": 2.154844408207637, + "grad_norm": 4.960559844970703, + "learning_rate": 7.307548579970105e-06, + "loss": 2.8723, + "step": 31715 + }, + { + "epoch": 2.155184128278299, + "grad_norm": 7.423941135406494, + "learning_rate": 7.307123929881777e-06, + "loss": 2.9291, + "step": 31720 + }, + { + "epoch": 2.1555238483489605, + "grad_norm": 6.168736934661865, + "learning_rate": 7.306699279793451e-06, + "loss": 2.9506, + "step": 31725 + }, + { + "epoch": 2.155863568419622, + "grad_norm": 5.641807556152344, + "learning_rate": 7.306274629705124e-06, + "loss": 3.1805, + "step": 31730 + }, + { + "epoch": 2.156203288490284, + "grad_norm": 7.993002414703369, + "learning_rate": 7.3058499796167956e-06, + "loss": 2.7598, + "step": 31735 + }, + { + "epoch": 2.156543008560946, + "grad_norm": 7.873050689697266, + "learning_rate": 7.305425329528469e-06, + "loss": 3.2751, + "step": 31740 + }, + { + "epoch": 2.1568827286316075, + "grad_norm": 6.014259338378906, + "learning_rate": 7.305000679440142e-06, + "loss": 3.0725, + "step": 31745 + }, + { + "epoch": 2.1572224487022695, + "grad_norm": 6.77291202545166, + "learning_rate": 7.304576029351814e-06, + "loss": 2.8229, + "step": 31750 + }, + { + "epoch": 2.157562168772931, + "grad_norm": 7.432730197906494, + "learning_rate": 7.304151379263488e-06, + "loss": 3.076, + "step": 31755 + }, + { + "epoch": 2.157901888843593, + "grad_norm": 6.843648910522461, + "learning_rate": 7.30372672917516e-06, + "loss": 3.0201, + "step": 31760 + }, + { + "epoch": 2.1582416089142544, + "grad_norm": 7.863668441772461, + "learning_rate": 7.303302079086832e-06, + "loss": 2.9621, + "step": 31765 + }, + { + "epoch": 2.1585813289849165, + "grad_norm": 6.383977890014648, + "learning_rate": 7.302877428998506e-06, + "loss": 3.1658, + "step": 31770 + }, + { + "epoch": 2.158921049055578, + "grad_norm": 7.451029300689697, + "learning_rate": 7.302452778910179e-06, + "loss": 2.7211, + "step": 31775 + }, + { + "epoch": 2.1592607691262398, + "grad_norm": 7.1132025718688965, + "learning_rate": 7.302028128821851e-06, + "loss": 2.8933, + "step": 31780 + }, + { + "epoch": 2.159600489196902, + "grad_norm": 5.947272777557373, + "learning_rate": 7.301603478733524e-06, + "loss": 3.0069, + "step": 31785 + }, + { + "epoch": 2.1599402092675635, + "grad_norm": 7.422350883483887, + "learning_rate": 7.301178828645196e-06, + "loss": 3.1971, + "step": 31790 + }, + { + "epoch": 2.160279929338225, + "grad_norm": 7.470895767211914, + "learning_rate": 7.300754178556869e-06, + "loss": 3.2077, + "step": 31795 + }, + { + "epoch": 2.160619649408887, + "grad_norm": 6.813565731048584, + "learning_rate": 7.300329528468543e-06, + "loss": 2.8967, + "step": 31800 + }, + { + "epoch": 2.160959369479549, + "grad_norm": 8.016066551208496, + "learning_rate": 7.299904878380215e-06, + "loss": 3.1567, + "step": 31805 + }, + { + "epoch": 2.1612990895502104, + "grad_norm": 6.891726493835449, + "learning_rate": 7.299480228291888e-06, + "loss": 3.3692, + "step": 31810 + }, + { + "epoch": 2.1616388096208725, + "grad_norm": 5.842895984649658, + "learning_rate": 7.299055578203561e-06, + "loss": 3.1049, + "step": 31815 + }, + { + "epoch": 2.161978529691534, + "grad_norm": 6.058854103088379, + "learning_rate": 7.298630928115233e-06, + "loss": 2.9813, + "step": 31820 + }, + { + "epoch": 2.1623182497621958, + "grad_norm": 6.374045372009277, + "learning_rate": 7.298206278026907e-06, + "loss": 3.2058, + "step": 31825 + }, + { + "epoch": 2.162657969832858, + "grad_norm": 6.926768779754639, + "learning_rate": 7.29778162793858e-06, + "loss": 2.9832, + "step": 31830 + }, + { + "epoch": 2.1629976899035195, + "grad_norm": 7.75740909576416, + "learning_rate": 7.2973569778502516e-06, + "loss": 2.9734, + "step": 31835 + }, + { + "epoch": 2.163337409974181, + "grad_norm": 7.210062503814697, + "learning_rate": 7.296932327761925e-06, + "loss": 3.1963, + "step": 31840 + }, + { + "epoch": 2.163677130044843, + "grad_norm": 5.975211143493652, + "learning_rate": 7.296507677673598e-06, + "loss": 3.2117, + "step": 31845 + }, + { + "epoch": 2.164016850115505, + "grad_norm": 7.044256687164307, + "learning_rate": 7.29608302758527e-06, + "loss": 2.9221, + "step": 31850 + }, + { + "epoch": 2.1643565701861665, + "grad_norm": 5.556101322174072, + "learning_rate": 7.295658377496944e-06, + "loss": 3.0994, + "step": 31855 + }, + { + "epoch": 2.1646962902568285, + "grad_norm": 7.364765167236328, + "learning_rate": 7.295233727408616e-06, + "loss": 3.1195, + "step": 31860 + }, + { + "epoch": 2.16503601032749, + "grad_norm": 6.095069885253906, + "learning_rate": 7.294809077320288e-06, + "loss": 3.1067, + "step": 31865 + }, + { + "epoch": 2.165375730398152, + "grad_norm": 6.924616813659668, + "learning_rate": 7.294384427231962e-06, + "loss": 3.0992, + "step": 31870 + }, + { + "epoch": 2.165715450468814, + "grad_norm": 6.848287582397461, + "learning_rate": 7.293959777143634e-06, + "loss": 2.9671, + "step": 31875 + }, + { + "epoch": 2.1660551705394755, + "grad_norm": 7.695724964141846, + "learning_rate": 7.293535127055307e-06, + "loss": 2.9031, + "step": 31880 + }, + { + "epoch": 2.166394890610137, + "grad_norm": 5.535592555999756, + "learning_rate": 7.2931104769669804e-06, + "loss": 3.0379, + "step": 31885 + }, + { + "epoch": 2.166734610680799, + "grad_norm": 5.274994373321533, + "learning_rate": 7.292685826878652e-06, + "loss": 3.0763, + "step": 31890 + }, + { + "epoch": 2.167074330751461, + "grad_norm": 6.715068340301514, + "learning_rate": 7.292261176790325e-06, + "loss": 3.0016, + "step": 31895 + }, + { + "epoch": 2.1674140508221225, + "grad_norm": 7.419303894042969, + "learning_rate": 7.291836526701999e-06, + "loss": 3.0234, + "step": 31900 + }, + { + "epoch": 2.1677537708927845, + "grad_norm": 6.540747165679932, + "learning_rate": 7.291411876613671e-06, + "loss": 3.1552, + "step": 31905 + }, + { + "epoch": 2.168093490963446, + "grad_norm": 7.553868293762207, + "learning_rate": 7.290987226525344e-06, + "loss": 3.1772, + "step": 31910 + }, + { + "epoch": 2.168433211034108, + "grad_norm": 9.391408920288086, + "learning_rate": 7.290562576437017e-06, + "loss": 3.1527, + "step": 31915 + }, + { + "epoch": 2.16877293110477, + "grad_norm": 9.022672653198242, + "learning_rate": 7.290137926348689e-06, + "loss": 2.971, + "step": 31920 + }, + { + "epoch": 2.1691126511754315, + "grad_norm": 6.523103713989258, + "learning_rate": 7.289713276260362e-06, + "loss": 3.0037, + "step": 31925 + }, + { + "epoch": 2.169452371246093, + "grad_norm": 6.090214252471924, + "learning_rate": 7.289288626172036e-06, + "loss": 2.8033, + "step": 31930 + }, + { + "epoch": 2.169792091316755, + "grad_norm": 6.77434778213501, + "learning_rate": 7.288863976083708e-06, + "loss": 3.164, + "step": 31935 + }, + { + "epoch": 2.170131811387417, + "grad_norm": 6.771028518676758, + "learning_rate": 7.28843932599538e-06, + "loss": 3.1494, + "step": 31940 + }, + { + "epoch": 2.1704715314580785, + "grad_norm": 6.630463123321533, + "learning_rate": 7.288014675907053e-06, + "loss": 3.1461, + "step": 31945 + }, + { + "epoch": 2.1708112515287405, + "grad_norm": 7.31418514251709, + "learning_rate": 7.287590025818726e-06, + "loss": 2.9388, + "step": 31950 + }, + { + "epoch": 2.171150971599402, + "grad_norm": 6.810643672943115, + "learning_rate": 7.287165375730399e-06, + "loss": 3.0924, + "step": 31955 + }, + { + "epoch": 2.171490691670064, + "grad_norm": 7.23947286605835, + "learning_rate": 7.286740725642072e-06, + "loss": 3.0163, + "step": 31960 + }, + { + "epoch": 2.1718304117407254, + "grad_norm": 7.414385795593262, + "learning_rate": 7.286316075553744e-06, + "loss": 3.2107, + "step": 31965 + }, + { + "epoch": 2.1721701318113875, + "grad_norm": 6.5535149574279785, + "learning_rate": 7.285891425465416e-06, + "loss": 3.1068, + "step": 31970 + }, + { + "epoch": 2.172509851882049, + "grad_norm": 8.955015182495117, + "learning_rate": 7.28546677537709e-06, + "loss": 3.0191, + "step": 31975 + }, + { + "epoch": 2.172849571952711, + "grad_norm": 7.646866798400879, + "learning_rate": 7.285042125288763e-06, + "loss": 2.9258, + "step": 31980 + }, + { + "epoch": 2.173189292023373, + "grad_norm": 8.075562477111816, + "learning_rate": 7.284617475200435e-06, + "loss": 3.0482, + "step": 31985 + }, + { + "epoch": 2.1735290120940345, + "grad_norm": 6.815784454345703, + "learning_rate": 7.284192825112108e-06, + "loss": 2.9088, + "step": 31990 + }, + { + "epoch": 2.173868732164696, + "grad_norm": 7.5793280601501465, + "learning_rate": 7.283768175023781e-06, + "loss": 2.9778, + "step": 31995 + }, + { + "epoch": 2.174208452235358, + "grad_norm": 6.591877460479736, + "learning_rate": 7.283343524935453e-06, + "loss": 2.8817, + "step": 32000 + }, + { + "epoch": 2.17454817230602, + "grad_norm": 8.782571792602539, + "learning_rate": 7.282918874847127e-06, + "loss": 3.3034, + "step": 32005 + }, + { + "epoch": 2.1748878923766815, + "grad_norm": 7.528925895690918, + "learning_rate": 7.2824942247588e-06, + "loss": 3.066, + "step": 32010 + }, + { + "epoch": 2.1752276124473435, + "grad_norm": 5.5050859451293945, + "learning_rate": 7.2820695746704716e-06, + "loss": 3.0847, + "step": 32015 + }, + { + "epoch": 2.175567332518005, + "grad_norm": 8.095808982849121, + "learning_rate": 7.281644924582145e-06, + "loss": 2.9342, + "step": 32020 + }, + { + "epoch": 2.175907052588667, + "grad_norm": 7.731491565704346, + "learning_rate": 7.281220274493818e-06, + "loss": 3.13, + "step": 32025 + }, + { + "epoch": 2.176246772659329, + "grad_norm": 7.5426201820373535, + "learning_rate": 7.28079562440549e-06, + "loss": 3.0317, + "step": 32030 + }, + { + "epoch": 2.1765864927299905, + "grad_norm": 8.718971252441406, + "learning_rate": 7.280370974317164e-06, + "loss": 3.1368, + "step": 32035 + }, + { + "epoch": 2.176926212800652, + "grad_norm": 5.529819488525391, + "learning_rate": 7.2799463242288356e-06, + "loss": 2.8721, + "step": 32040 + }, + { + "epoch": 2.177265932871314, + "grad_norm": 5.958070278167725, + "learning_rate": 7.279521674140508e-06, + "loss": 3.0521, + "step": 32045 + }, + { + "epoch": 2.177605652941976, + "grad_norm": 8.15489387512207, + "learning_rate": 7.279097024052182e-06, + "loss": 3.034, + "step": 32050 + }, + { + "epoch": 2.1779453730126375, + "grad_norm": 7.88503885269165, + "learning_rate": 7.278672373963854e-06, + "loss": 3.0457, + "step": 32055 + }, + { + "epoch": 2.1782850930832995, + "grad_norm": 6.4465179443359375, + "learning_rate": 7.278247723875527e-06, + "loss": 3.0604, + "step": 32060 + }, + { + "epoch": 2.178624813153961, + "grad_norm": 7.553908824920654, + "learning_rate": 7.2778230737872e-06, + "loss": 3.1511, + "step": 32065 + }, + { + "epoch": 2.178964533224623, + "grad_norm": 5.901176929473877, + "learning_rate": 7.277398423698872e-06, + "loss": 3.2283, + "step": 32070 + }, + { + "epoch": 2.179304253295285, + "grad_norm": 7.923645973205566, + "learning_rate": 7.276973773610545e-06, + "loss": 3.0471, + "step": 32075 + }, + { + "epoch": 2.1796439733659465, + "grad_norm": 6.868990898132324, + "learning_rate": 7.276549123522219e-06, + "loss": 3.1085, + "step": 32080 + }, + { + "epoch": 2.179983693436608, + "grad_norm": 6.6238694190979, + "learning_rate": 7.276124473433891e-06, + "loss": 3.2885, + "step": 32085 + }, + { + "epoch": 2.18032341350727, + "grad_norm": 6.55242919921875, + "learning_rate": 7.2756998233455636e-06, + "loss": 2.7764, + "step": 32090 + }, + { + "epoch": 2.180663133577932, + "grad_norm": 8.673505783081055, + "learning_rate": 7.275275173257237e-06, + "loss": 3.0228, + "step": 32095 + }, + { + "epoch": 2.1810028536485935, + "grad_norm": 5.701931476593018, + "learning_rate": 7.274850523168909e-06, + "loss": 3.1411, + "step": 32100 + }, + { + "epoch": 2.181342573719255, + "grad_norm": 7.635279655456543, + "learning_rate": 7.274425873080582e-06, + "loss": 2.9269, + "step": 32105 + }, + { + "epoch": 2.181682293789917, + "grad_norm": 7.055490493774414, + "learning_rate": 7.274001222992256e-06, + "loss": 3.0055, + "step": 32110 + }, + { + "epoch": 2.182022013860579, + "grad_norm": 7.903571128845215, + "learning_rate": 7.2735765729039276e-06, + "loss": 2.9134, + "step": 32115 + }, + { + "epoch": 2.1823617339312404, + "grad_norm": 8.793736457824707, + "learning_rate": 7.2731519228156e-06, + "loss": 3.1059, + "step": 32120 + }, + { + "epoch": 2.1827014540019025, + "grad_norm": 6.900751113891602, + "learning_rate": 7.272727272727273e-06, + "loss": 3.2055, + "step": 32125 + }, + { + "epoch": 2.183041174072564, + "grad_norm": 5.7176289558410645, + "learning_rate": 7.272302622638946e-06, + "loss": 2.9374, + "step": 32130 + }, + { + "epoch": 2.183380894143226, + "grad_norm": 8.283708572387695, + "learning_rate": 7.271877972550618e-06, + "loss": 3.066, + "step": 32135 + }, + { + "epoch": 2.183720614213888, + "grad_norm": 6.377049922943115, + "learning_rate": 7.2714533224622916e-06, + "loss": 3.0907, + "step": 32140 + }, + { + "epoch": 2.1840603342845495, + "grad_norm": 6.4865312576293945, + "learning_rate": 7.271028672373964e-06, + "loss": 2.8592, + "step": 32145 + }, + { + "epoch": 2.184400054355211, + "grad_norm": 6.871230125427246, + "learning_rate": 7.270604022285638e-06, + "loss": 3.0108, + "step": 32150 + }, + { + "epoch": 2.184739774425873, + "grad_norm": 6.931108474731445, + "learning_rate": 7.27017937219731e-06, + "loss": 2.7889, + "step": 32155 + }, + { + "epoch": 2.185079494496535, + "grad_norm": 7.108587741851807, + "learning_rate": 7.269754722108983e-06, + "loss": 2.8744, + "step": 32160 + }, + { + "epoch": 2.1854192145671965, + "grad_norm": 6.6096296310424805, + "learning_rate": 7.269330072020656e-06, + "loss": 3.0371, + "step": 32165 + }, + { + "epoch": 2.1857589346378585, + "grad_norm": 7.219512462615967, + "learning_rate": 7.268905421932328e-06, + "loss": 3.1057, + "step": 32170 + }, + { + "epoch": 2.18609865470852, + "grad_norm": 8.778680801391602, + "learning_rate": 7.268480771844001e-06, + "loss": 2.8453, + "step": 32175 + }, + { + "epoch": 2.186438374779182, + "grad_norm": 6.63734245300293, + "learning_rate": 7.268056121755675e-06, + "loss": 3.0235, + "step": 32180 + }, + { + "epoch": 2.186778094849844, + "grad_norm": 6.886673450469971, + "learning_rate": 7.267631471667347e-06, + "loss": 3.0575, + "step": 32185 + }, + { + "epoch": 2.1871178149205055, + "grad_norm": 6.931079387664795, + "learning_rate": 7.26720682157902e-06, + "loss": 3.2505, + "step": 32190 + }, + { + "epoch": 2.187457534991167, + "grad_norm": 6.025925159454346, + "learning_rate": 7.266782171490692e-06, + "loss": 3.1545, + "step": 32195 + }, + { + "epoch": 2.187797255061829, + "grad_norm": 7.2950439453125, + "learning_rate": 7.266357521402365e-06, + "loss": 2.8082, + "step": 32200 + }, + { + "epoch": 2.188136975132491, + "grad_norm": 6.985259532928467, + "learning_rate": 7.265932871314038e-06, + "loss": 3.1066, + "step": 32205 + }, + { + "epoch": 2.1884766952031525, + "grad_norm": 7.518407821655273, + "learning_rate": 7.265508221225711e-06, + "loss": 3.176, + "step": 32210 + }, + { + "epoch": 2.1888164152738145, + "grad_norm": 7.191827297210693, + "learning_rate": 7.265083571137384e-06, + "loss": 3.0454, + "step": 32215 + }, + { + "epoch": 2.189156135344476, + "grad_norm": 7.2074737548828125, + "learning_rate": 7.2646589210490555e-06, + "loss": 3.1243, + "step": 32220 + }, + { + "epoch": 2.189495855415138, + "grad_norm": 6.412374496459961, + "learning_rate": 7.264234270960729e-06, + "loss": 2.9278, + "step": 32225 + }, + { + "epoch": 2.1898355754858, + "grad_norm": 8.198405265808105, + "learning_rate": 7.263809620872402e-06, + "loss": 3.0531, + "step": 32230 + }, + { + "epoch": 2.1901752955564615, + "grad_norm": 7.943531036376953, + "learning_rate": 7.263384970784074e-06, + "loss": 3.1976, + "step": 32235 + }, + { + "epoch": 2.190515015627123, + "grad_norm": 6.706291198730469, + "learning_rate": 7.262960320695748e-06, + "loss": 3.0673, + "step": 32240 + }, + { + "epoch": 2.190854735697785, + "grad_norm": 6.955034255981445, + "learning_rate": 7.26253567060742e-06, + "loss": 2.7482, + "step": 32245 + }, + { + "epoch": 2.191194455768447, + "grad_norm": 6.1616034507751465, + "learning_rate": 7.262111020519092e-06, + "loss": 3.0754, + "step": 32250 + }, + { + "epoch": 2.1915341758391085, + "grad_norm": 6.875460147857666, + "learning_rate": 7.261686370430766e-06, + "loss": 3.0383, + "step": 32255 + }, + { + "epoch": 2.1918738959097706, + "grad_norm": 6.592569828033447, + "learning_rate": 7.261261720342439e-06, + "loss": 3.0157, + "step": 32260 + }, + { + "epoch": 2.192213615980432, + "grad_norm": 7.396389007568359, + "learning_rate": 7.260837070254111e-06, + "loss": 3.2471, + "step": 32265 + }, + { + "epoch": 2.192553336051094, + "grad_norm": 7.048007965087891, + "learning_rate": 7.260412420165784e-06, + "loss": 3.0933, + "step": 32270 + }, + { + "epoch": 2.192893056121756, + "grad_norm": 7.039594650268555, + "learning_rate": 7.259987770077457e-06, + "loss": 3.1183, + "step": 32275 + }, + { + "epoch": 2.1932327761924175, + "grad_norm": 8.784004211425781, + "learning_rate": 7.259563119989129e-06, + "loss": 3.0999, + "step": 32280 + }, + { + "epoch": 2.193572496263079, + "grad_norm": 6.168890476226807, + "learning_rate": 7.259138469900803e-06, + "loss": 2.6623, + "step": 32285 + }, + { + "epoch": 2.1939122163337412, + "grad_norm": 7.072152614593506, + "learning_rate": 7.258713819812475e-06, + "loss": 3.0601, + "step": 32290 + }, + { + "epoch": 2.194251936404403, + "grad_norm": 8.222448348999023, + "learning_rate": 7.2582891697241475e-06, + "loss": 3.211, + "step": 32295 + }, + { + "epoch": 2.1945916564750645, + "grad_norm": 6.935388088226318, + "learning_rate": 7.257864519635821e-06, + "loss": 2.9975, + "step": 32300 + }, + { + "epoch": 2.194931376545726, + "grad_norm": 7.200239181518555, + "learning_rate": 7.257439869547493e-06, + "loss": 3.0874, + "step": 32305 + }, + { + "epoch": 2.195271096616388, + "grad_norm": 7.512578964233398, + "learning_rate": 7.257015219459166e-06, + "loss": 2.8443, + "step": 32310 + }, + { + "epoch": 2.19561081668705, + "grad_norm": 6.147582054138184, + "learning_rate": 7.25659056937084e-06, + "loss": 2.7967, + "step": 32315 + }, + { + "epoch": 2.1959505367577115, + "grad_norm": 6.325517177581787, + "learning_rate": 7.2561659192825116e-06, + "loss": 2.7155, + "step": 32320 + }, + { + "epoch": 2.1962902568283735, + "grad_norm": 6.6977338790893555, + "learning_rate": 7.255741269194184e-06, + "loss": 3.1155, + "step": 32325 + }, + { + "epoch": 2.196629976899035, + "grad_norm": 6.56777811050415, + "learning_rate": 7.255316619105858e-06, + "loss": 2.9317, + "step": 32330 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 7.913999080657959, + "learning_rate": 7.25489196901753e-06, + "loss": 3.0393, + "step": 32335 + }, + { + "epoch": 2.197309417040359, + "grad_norm": 6.092250347137451, + "learning_rate": 7.254467318929203e-06, + "loss": 3.188, + "step": 32340 + }, + { + "epoch": 2.1976491371110205, + "grad_norm": 7.304535865783691, + "learning_rate": 7.254042668840876e-06, + "loss": 3.2619, + "step": 32345 + }, + { + "epoch": 2.197988857181682, + "grad_norm": 6.311341762542725, + "learning_rate": 7.253618018752548e-06, + "loss": 3.117, + "step": 32350 + }, + { + "epoch": 2.198328577252344, + "grad_norm": 8.000083923339844, + "learning_rate": 7.253193368664221e-06, + "loss": 3.1326, + "step": 32355 + }, + { + "epoch": 2.198668297323006, + "grad_norm": 6.417201042175293, + "learning_rate": 7.252768718575894e-06, + "loss": 3.0315, + "step": 32360 + }, + { + "epoch": 2.1990080173936675, + "grad_norm": 6.349673271179199, + "learning_rate": 7.252344068487567e-06, + "loss": 2.7776, + "step": 32365 + }, + { + "epoch": 2.1993477374643295, + "grad_norm": 6.043234348297119, + "learning_rate": 7.2519194183992396e-06, + "loss": 2.94, + "step": 32370 + }, + { + "epoch": 2.199687457534991, + "grad_norm": 6.921864986419678, + "learning_rate": 7.251494768310912e-06, + "loss": 2.8234, + "step": 32375 + }, + { + "epoch": 2.200027177605653, + "grad_norm": 6.728738307952881, + "learning_rate": 7.251070118222585e-06, + "loss": 3.0982, + "step": 32380 + }, + { + "epoch": 2.200366897676315, + "grad_norm": 7.883802890777588, + "learning_rate": 7.250645468134257e-06, + "loss": 3.0964, + "step": 32385 + }, + { + "epoch": 2.2007066177469765, + "grad_norm": 6.628837585449219, + "learning_rate": 7.250220818045931e-06, + "loss": 3.1897, + "step": 32390 + }, + { + "epoch": 2.201046337817638, + "grad_norm": 7.381194591522217, + "learning_rate": 7.2497961679576036e-06, + "loss": 2.9311, + "step": 32395 + }, + { + "epoch": 2.2013860578883, + "grad_norm": 6.989338397979736, + "learning_rate": 7.2493715178692755e-06, + "loss": 3.2386, + "step": 32400 + }, + { + "epoch": 2.201725777958962, + "grad_norm": 6.40382194519043, + "learning_rate": 7.248946867780949e-06, + "loss": 3.1474, + "step": 32405 + }, + { + "epoch": 2.2020654980296235, + "grad_norm": 7.1292572021484375, + "learning_rate": 7.248522217692622e-06, + "loss": 2.9949, + "step": 32410 + }, + { + "epoch": 2.2024052181002856, + "grad_norm": 6.2883477210998535, + "learning_rate": 7.248097567604294e-06, + "loss": 3.2303, + "step": 32415 + }, + { + "epoch": 2.202744938170947, + "grad_norm": 6.972686767578125, + "learning_rate": 7.2476729175159676e-06, + "loss": 2.9385, + "step": 32420 + }, + { + "epoch": 2.203084658241609, + "grad_norm": 7.661683559417725, + "learning_rate": 7.24724826742764e-06, + "loss": 3.2533, + "step": 32425 + }, + { + "epoch": 2.203424378312271, + "grad_norm": 8.157512664794922, + "learning_rate": 7.246823617339312e-06, + "loss": 2.7493, + "step": 32430 + }, + { + "epoch": 2.2037640983829325, + "grad_norm": 6.068077564239502, + "learning_rate": 7.246398967250986e-06, + "loss": 3.0041, + "step": 32435 + }, + { + "epoch": 2.204103818453594, + "grad_norm": 7.268886566162109, + "learning_rate": 7.245974317162659e-06, + "loss": 3.282, + "step": 32440 + }, + { + "epoch": 2.204443538524256, + "grad_norm": 5.877236843109131, + "learning_rate": 7.245549667074331e-06, + "loss": 3.2337, + "step": 32445 + }, + { + "epoch": 2.204783258594918, + "grad_norm": 7.446588516235352, + "learning_rate": 7.245125016986004e-06, + "loss": 2.9097, + "step": 32450 + }, + { + "epoch": 2.2051229786655795, + "grad_norm": 6.0167646408081055, + "learning_rate": 7.244700366897677e-06, + "loss": 2.9098, + "step": 32455 + }, + { + "epoch": 2.205462698736241, + "grad_norm": 7.172595024108887, + "learning_rate": 7.244275716809349e-06, + "loss": 2.9777, + "step": 32460 + }, + { + "epoch": 2.205802418806903, + "grad_norm": 6.89896821975708, + "learning_rate": 7.243851066721023e-06, + "loss": 3.2789, + "step": 32465 + }, + { + "epoch": 2.206142138877565, + "grad_norm": 6.7052412033081055, + "learning_rate": 7.243426416632695e-06, + "loss": 3.0563, + "step": 32470 + }, + { + "epoch": 2.2064818589482265, + "grad_norm": 6.427281379699707, + "learning_rate": 7.2430017665443675e-06, + "loss": 3.2187, + "step": 32475 + }, + { + "epoch": 2.2068215790188885, + "grad_norm": 7.226778984069824, + "learning_rate": 7.242577116456041e-06, + "loss": 2.74, + "step": 32480 + }, + { + "epoch": 2.20716129908955, + "grad_norm": 5.98664665222168, + "learning_rate": 7.242152466367713e-06, + "loss": 3.2462, + "step": 32485 + }, + { + "epoch": 2.207501019160212, + "grad_norm": 6.811773300170898, + "learning_rate": 7.241727816279387e-06, + "loss": 3.0398, + "step": 32490 + }, + { + "epoch": 2.207840739230874, + "grad_norm": 6.306571006774902, + "learning_rate": 7.24130316619106e-06, + "loss": 3.2054, + "step": 32495 + }, + { + "epoch": 2.2081804593015355, + "grad_norm": 6.754238128662109, + "learning_rate": 7.2408785161027315e-06, + "loss": 2.8688, + "step": 32500 + }, + { + "epoch": 2.208520179372197, + "grad_norm": 7.531455039978027, + "learning_rate": 7.240453866014405e-06, + "loss": 3.0169, + "step": 32505 + }, + { + "epoch": 2.208859899442859, + "grad_norm": 6.640850067138672, + "learning_rate": 7.240029215926078e-06, + "loss": 3.1507, + "step": 32510 + }, + { + "epoch": 2.209199619513521, + "grad_norm": 7.771536827087402, + "learning_rate": 7.23960456583775e-06, + "loss": 3.1307, + "step": 32515 + }, + { + "epoch": 2.2095393395841825, + "grad_norm": 7.608039855957031, + "learning_rate": 7.239179915749424e-06, + "loss": 3.0218, + "step": 32520 + }, + { + "epoch": 2.2098790596548445, + "grad_norm": 7.170582294464111, + "learning_rate": 7.238755265661096e-06, + "loss": 3.1722, + "step": 32525 + }, + { + "epoch": 2.210218779725506, + "grad_norm": 6.983574867248535, + "learning_rate": 7.238330615572768e-06, + "loss": 3.1658, + "step": 32530 + }, + { + "epoch": 2.210558499796168, + "grad_norm": 6.803972244262695, + "learning_rate": 7.237905965484442e-06, + "loss": 3.026, + "step": 32535 + }, + { + "epoch": 2.21089821986683, + "grad_norm": 5.561335563659668, + "learning_rate": 7.237481315396114e-06, + "loss": 3.2002, + "step": 32540 + }, + { + "epoch": 2.2112379399374915, + "grad_norm": 6.828242301940918, + "learning_rate": 7.237056665307787e-06, + "loss": 3.0581, + "step": 32545 + }, + { + "epoch": 2.211577660008153, + "grad_norm": 5.550840377807617, + "learning_rate": 7.23663201521946e-06, + "loss": 3.1334, + "step": 32550 + }, + { + "epoch": 2.211917380078815, + "grad_norm": 7.035225868225098, + "learning_rate": 7.236207365131132e-06, + "loss": 3.0772, + "step": 32555 + }, + { + "epoch": 2.212257100149477, + "grad_norm": 7.434988021850586, + "learning_rate": 7.235782715042805e-06, + "loss": 2.9944, + "step": 32560 + }, + { + "epoch": 2.2125968202201385, + "grad_norm": 6.022154808044434, + "learning_rate": 7.235358064954479e-06, + "loss": 3.0914, + "step": 32565 + }, + { + "epoch": 2.2129365402908006, + "grad_norm": 6.575758934020996, + "learning_rate": 7.234933414866151e-06, + "loss": 2.8502, + "step": 32570 + }, + { + "epoch": 2.213276260361462, + "grad_norm": 6.964272499084473, + "learning_rate": 7.2345087647778235e-06, + "loss": 2.8529, + "step": 32575 + }, + { + "epoch": 2.213615980432124, + "grad_norm": 7.376224994659424, + "learning_rate": 7.234084114689497e-06, + "loss": 3.0567, + "step": 32580 + }, + { + "epoch": 2.213955700502786, + "grad_norm": 7.052042484283447, + "learning_rate": 7.233659464601169e-06, + "loss": 3.3568, + "step": 32585 + }, + { + "epoch": 2.2142954205734475, + "grad_norm": 6.065715312957764, + "learning_rate": 7.233234814512842e-06, + "loss": 2.8706, + "step": 32590 + }, + { + "epoch": 2.214635140644109, + "grad_norm": 7.873793601989746, + "learning_rate": 7.232810164424516e-06, + "loss": 2.9307, + "step": 32595 + }, + { + "epoch": 2.2149748607147712, + "grad_norm": 7.286266803741455, + "learning_rate": 7.2323855143361875e-06, + "loss": 3.0634, + "step": 32600 + }, + { + "epoch": 2.215314580785433, + "grad_norm": 6.300685882568359, + "learning_rate": 7.23196086424786e-06, + "loss": 3.0091, + "step": 32605 + }, + { + "epoch": 2.2156543008560945, + "grad_norm": 6.558652400970459, + "learning_rate": 7.231536214159533e-06, + "loss": 2.889, + "step": 32610 + }, + { + "epoch": 2.2159940209267566, + "grad_norm": 8.324139595031738, + "learning_rate": 7.231111564071206e-06, + "loss": 3.2205, + "step": 32615 + }, + { + "epoch": 2.216333740997418, + "grad_norm": 8.256672859191895, + "learning_rate": 7.230686913982879e-06, + "loss": 3.0732, + "step": 32620 + }, + { + "epoch": 2.21667346106808, + "grad_norm": 8.058125495910645, + "learning_rate": 7.2302622638945516e-06, + "loss": 3.1199, + "step": 32625 + }, + { + "epoch": 2.217013181138742, + "grad_norm": 7.098164081573486, + "learning_rate": 7.229837613806224e-06, + "loss": 3.0523, + "step": 32630 + }, + { + "epoch": 2.2173529012094035, + "grad_norm": 7.796998977661133, + "learning_rate": 7.229412963717896e-06, + "loss": 2.8812, + "step": 32635 + }, + { + "epoch": 2.217692621280065, + "grad_norm": 6.922847747802734, + "learning_rate": 7.22898831362957e-06, + "loss": 3.0461, + "step": 32640 + }, + { + "epoch": 2.218032341350727, + "grad_norm": 7.053717136383057, + "learning_rate": 7.228563663541243e-06, + "loss": 3.0489, + "step": 32645 + }, + { + "epoch": 2.218372061421389, + "grad_norm": 6.300652980804443, + "learning_rate": 7.228139013452915e-06, + "loss": 2.9803, + "step": 32650 + }, + { + "epoch": 2.2187117814920505, + "grad_norm": 9.335392951965332, + "learning_rate": 7.227714363364588e-06, + "loss": 3.0839, + "step": 32655 + }, + { + "epoch": 2.219051501562712, + "grad_norm": 7.495307445526123, + "learning_rate": 7.227289713276261e-06, + "loss": 3.0064, + "step": 32660 + }, + { + "epoch": 2.219391221633374, + "grad_norm": 6.658489227294922, + "learning_rate": 7.226865063187933e-06, + "loss": 2.8806, + "step": 32665 + }, + { + "epoch": 2.219730941704036, + "grad_norm": 7.699064254760742, + "learning_rate": 7.226440413099607e-06, + "loss": 3.1835, + "step": 32670 + }, + { + "epoch": 2.2200706617746975, + "grad_norm": 7.211783409118652, + "learning_rate": 7.2260157630112796e-06, + "loss": 3.0771, + "step": 32675 + }, + { + "epoch": 2.2204103818453595, + "grad_norm": 6.684628963470459, + "learning_rate": 7.2255911129229515e-06, + "loss": 2.8341, + "step": 32680 + }, + { + "epoch": 2.220750101916021, + "grad_norm": 6.843960762023926, + "learning_rate": 7.225166462834625e-06, + "loss": 2.965, + "step": 32685 + }, + { + "epoch": 2.221089821986683, + "grad_norm": 9.624232292175293, + "learning_rate": 7.224741812746298e-06, + "loss": 2.8535, + "step": 32690 + }, + { + "epoch": 2.221429542057345, + "grad_norm": 8.90040111541748, + "learning_rate": 7.22431716265797e-06, + "loss": 2.8696, + "step": 32695 + }, + { + "epoch": 2.2217692621280065, + "grad_norm": 5.371764183044434, + "learning_rate": 7.2238925125696436e-06, + "loss": 3.1245, + "step": 32700 + }, + { + "epoch": 2.222108982198668, + "grad_norm": 6.11844539642334, + "learning_rate": 7.2234678624813155e-06, + "loss": 2.8548, + "step": 32705 + }, + { + "epoch": 2.2224487022693302, + "grad_norm": 5.681001663208008, + "learning_rate": 7.223043212392988e-06, + "loss": 3.1004, + "step": 32710 + }, + { + "epoch": 2.222788422339992, + "grad_norm": 7.462239742279053, + "learning_rate": 7.222618562304662e-06, + "loss": 3.0921, + "step": 32715 + }, + { + "epoch": 2.2231281424106535, + "grad_norm": 8.07728385925293, + "learning_rate": 7.222193912216334e-06, + "loss": 3.0514, + "step": 32720 + }, + { + "epoch": 2.2234678624813156, + "grad_norm": 6.4991326332092285, + "learning_rate": 7.221769262128007e-06, + "loss": 3.023, + "step": 32725 + }, + { + "epoch": 2.223807582551977, + "grad_norm": 7.603526592254639, + "learning_rate": 7.22134461203968e-06, + "loss": 3.1434, + "step": 32730 + }, + { + "epoch": 2.224147302622639, + "grad_norm": 6.870924472808838, + "learning_rate": 7.220919961951352e-06, + "loss": 3.0711, + "step": 32735 + }, + { + "epoch": 2.224487022693301, + "grad_norm": 7.274544715881348, + "learning_rate": 7.220495311863025e-06, + "loss": 3.089, + "step": 32740 + }, + { + "epoch": 2.2248267427639625, + "grad_norm": 7.218973636627197, + "learning_rate": 7.220070661774699e-06, + "loss": 3.1117, + "step": 32745 + }, + { + "epoch": 2.225166462834624, + "grad_norm": 8.188616752624512, + "learning_rate": 7.219646011686371e-06, + "loss": 2.7469, + "step": 32750 + }, + { + "epoch": 2.2255061829052862, + "grad_norm": 9.086454391479492, + "learning_rate": 7.2192213615980435e-06, + "loss": 3.0756, + "step": 32755 + }, + { + "epoch": 2.225845902975948, + "grad_norm": 6.168883800506592, + "learning_rate": 7.218796711509717e-06, + "loss": 3.1347, + "step": 32760 + }, + { + "epoch": 2.2261856230466095, + "grad_norm": 7.382747650146484, + "learning_rate": 7.218372061421389e-06, + "loss": 2.9843, + "step": 32765 + }, + { + "epoch": 2.2265253431172716, + "grad_norm": 7.34613561630249, + "learning_rate": 7.217947411333062e-06, + "loss": 3.1657, + "step": 32770 + }, + { + "epoch": 2.226865063187933, + "grad_norm": 6.820351600646973, + "learning_rate": 7.2175227612447356e-06, + "loss": 3.0194, + "step": 32775 + }, + { + "epoch": 2.227204783258595, + "grad_norm": 7.271883964538574, + "learning_rate": 7.2170981111564075e-06, + "loss": 3.0978, + "step": 32780 + }, + { + "epoch": 2.2275445033292565, + "grad_norm": 7.929479598999023, + "learning_rate": 7.21667346106808e-06, + "loss": 2.8551, + "step": 32785 + }, + { + "epoch": 2.2278842233999185, + "grad_norm": 6.099721908569336, + "learning_rate": 7.216248810979753e-06, + "loss": 2.9119, + "step": 32790 + }, + { + "epoch": 2.22822394347058, + "grad_norm": 7.027042388916016, + "learning_rate": 7.215824160891426e-06, + "loss": 3.0806, + "step": 32795 + }, + { + "epoch": 2.228563663541242, + "grad_norm": 6.3023858070373535, + "learning_rate": 7.215399510803098e-06, + "loss": 3.2155, + "step": 32800 + }, + { + "epoch": 2.228903383611904, + "grad_norm": 6.726261138916016, + "learning_rate": 7.2149748607147715e-06, + "loss": 2.9855, + "step": 32805 + }, + { + "epoch": 2.2292431036825655, + "grad_norm": 7.3802008628845215, + "learning_rate": 7.214550210626444e-06, + "loss": 3.1175, + "step": 32810 + }, + { + "epoch": 2.229582823753227, + "grad_norm": 6.344055652618408, + "learning_rate": 7.214125560538116e-06, + "loss": 3.2372, + "step": 32815 + }, + { + "epoch": 2.229922543823889, + "grad_norm": 9.80965805053711, + "learning_rate": 7.21370091044979e-06, + "loss": 3.1553, + "step": 32820 + }, + { + "epoch": 2.230262263894551, + "grad_norm": 6.207666873931885, + "learning_rate": 7.213276260361463e-06, + "loss": 3.1928, + "step": 32825 + }, + { + "epoch": 2.2306019839652125, + "grad_norm": 7.574747085571289, + "learning_rate": 7.212851610273136e-06, + "loss": 3.2151, + "step": 32830 + }, + { + "epoch": 2.2309417040358746, + "grad_norm": 7.701200485229492, + "learning_rate": 7.212426960184808e-06, + "loss": 3.0474, + "step": 32835 + }, + { + "epoch": 2.231281424106536, + "grad_norm": 8.095280647277832, + "learning_rate": 7.212002310096481e-06, + "loss": 2.9603, + "step": 32840 + }, + { + "epoch": 2.231621144177198, + "grad_norm": 7.287888526916504, + "learning_rate": 7.211577660008155e-06, + "loss": 2.9569, + "step": 32845 + }, + { + "epoch": 2.23196086424786, + "grad_norm": 6.669665336608887, + "learning_rate": 7.211153009919827e-06, + "loss": 3.1107, + "step": 32850 + }, + { + "epoch": 2.2323005843185215, + "grad_norm": 6.720019817352295, + "learning_rate": 7.2107283598314995e-06, + "loss": 3.0103, + "step": 32855 + }, + { + "epoch": 2.232640304389183, + "grad_norm": 7.036870956420898, + "learning_rate": 7.210303709743172e-06, + "loss": 2.8659, + "step": 32860 + }, + { + "epoch": 2.2329800244598452, + "grad_norm": 8.461530685424805, + "learning_rate": 7.209879059654845e-06, + "loss": 2.8363, + "step": 32865 + }, + { + "epoch": 2.233319744530507, + "grad_norm": 6.153038024902344, + "learning_rate": 7.209454409566518e-06, + "loss": 3.2134, + "step": 32870 + }, + { + "epoch": 2.2336594646011685, + "grad_norm": 8.310957908630371, + "learning_rate": 7.209029759478191e-06, + "loss": 3.2571, + "step": 32875 + }, + { + "epoch": 2.2339991846718306, + "grad_norm": 7.012611389160156, + "learning_rate": 7.2086051093898635e-06, + "loss": 3.0056, + "step": 32880 + }, + { + "epoch": 2.234338904742492, + "grad_norm": 7.530524253845215, + "learning_rate": 7.2081804593015355e-06, + "loss": 3.0156, + "step": 32885 + }, + { + "epoch": 2.234678624813154, + "grad_norm": 7.103142738342285, + "learning_rate": 7.207755809213209e-06, + "loss": 3.0857, + "step": 32890 + }, + { + "epoch": 2.235018344883816, + "grad_norm": 8.116386413574219, + "learning_rate": 7.207331159124882e-06, + "loss": 3.014, + "step": 32895 + }, + { + "epoch": 2.2353580649544775, + "grad_norm": 6.832813739776611, + "learning_rate": 7.206906509036554e-06, + "loss": 3.0131, + "step": 32900 + }, + { + "epoch": 2.235697785025139, + "grad_norm": 6.785576343536377, + "learning_rate": 7.2064818589482275e-06, + "loss": 3.0478, + "step": 32905 + }, + { + "epoch": 2.2360375050958012, + "grad_norm": 7.01193904876709, + "learning_rate": 7.2060572088599e-06, + "loss": 3.2507, + "step": 32910 + }, + { + "epoch": 2.236377225166463, + "grad_norm": 9.442078590393066, + "learning_rate": 7.205632558771572e-06, + "loss": 3.2713, + "step": 32915 + }, + { + "epoch": 2.2367169452371245, + "grad_norm": 8.040295600891113, + "learning_rate": 7.205207908683246e-06, + "loss": 2.9376, + "step": 32920 + }, + { + "epoch": 2.2370566653077866, + "grad_norm": 7.95536994934082, + "learning_rate": 7.204783258594919e-06, + "loss": 3.1481, + "step": 32925 + }, + { + "epoch": 2.237396385378448, + "grad_norm": 6.641626834869385, + "learning_rate": 7.204358608506591e-06, + "loss": 3.1575, + "step": 32930 + }, + { + "epoch": 2.23773610544911, + "grad_norm": 6.539656162261963, + "learning_rate": 7.203933958418264e-06, + "loss": 2.8434, + "step": 32935 + }, + { + "epoch": 2.238075825519772, + "grad_norm": 8.068069458007812, + "learning_rate": 7.203509308329937e-06, + "loss": 2.88, + "step": 32940 + }, + { + "epoch": 2.2384155455904335, + "grad_norm": 7.1976318359375, + "learning_rate": 7.203084658241609e-06, + "loss": 3.0068, + "step": 32945 + }, + { + "epoch": 2.238755265661095, + "grad_norm": 5.99709415435791, + "learning_rate": 7.202660008153283e-06, + "loss": 3.1995, + "step": 32950 + }, + { + "epoch": 2.2390949857317572, + "grad_norm": 6.829178810119629, + "learning_rate": 7.202235358064955e-06, + "loss": 3.0389, + "step": 32955 + }, + { + "epoch": 2.239434705802419, + "grad_norm": 5.679297924041748, + "learning_rate": 7.2018107079766275e-06, + "loss": 2.9697, + "step": 32960 + }, + { + "epoch": 2.2397744258730805, + "grad_norm": 7.228677272796631, + "learning_rate": 7.201386057888301e-06, + "loss": 2.947, + "step": 32965 + }, + { + "epoch": 2.2401141459437426, + "grad_norm": 7.884355545043945, + "learning_rate": 7.200961407799973e-06, + "loss": 3.2001, + "step": 32970 + }, + { + "epoch": 2.240453866014404, + "grad_norm": 7.2422871589660645, + "learning_rate": 7.200536757711646e-06, + "loss": 2.9361, + "step": 32975 + }, + { + "epoch": 2.240793586085066, + "grad_norm": 6.692373752593994, + "learning_rate": 7.2001121076233196e-06, + "loss": 3.0605, + "step": 32980 + }, + { + "epoch": 2.2411333061557275, + "grad_norm": 8.86691951751709, + "learning_rate": 7.1996874575349915e-06, + "loss": 3.0738, + "step": 32985 + }, + { + "epoch": 2.2414730262263896, + "grad_norm": 8.390023231506348, + "learning_rate": 7.199262807446664e-06, + "loss": 2.9148, + "step": 32990 + }, + { + "epoch": 2.241812746297051, + "grad_norm": 7.590596675872803, + "learning_rate": 7.198838157358338e-06, + "loss": 2.9641, + "step": 32995 + }, + { + "epoch": 2.242152466367713, + "grad_norm": 7.047868251800537, + "learning_rate": 7.19841350727001e-06, + "loss": 2.8642, + "step": 33000 + }, + { + "epoch": 2.242492186438375, + "grad_norm": 6.458000659942627, + "learning_rate": 7.197988857181683e-06, + "loss": 3.0231, + "step": 33005 + }, + { + "epoch": 2.2428319065090365, + "grad_norm": 7.494079113006592, + "learning_rate": 7.197564207093356e-06, + "loss": 2.875, + "step": 33010 + }, + { + "epoch": 2.243171626579698, + "grad_norm": 7.8221564292907715, + "learning_rate": 7.197139557005028e-06, + "loss": 3.1808, + "step": 33015 + }, + { + "epoch": 2.2435113466503602, + "grad_norm": 7.196558952331543, + "learning_rate": 7.196714906916701e-06, + "loss": 3.0477, + "step": 33020 + }, + { + "epoch": 2.243851066721022, + "grad_norm": 6.579256534576416, + "learning_rate": 7.196290256828375e-06, + "loss": 3.0182, + "step": 33025 + }, + { + "epoch": 2.2441907867916835, + "grad_norm": 7.9231672286987305, + "learning_rate": 7.195865606740047e-06, + "loss": 3.0492, + "step": 33030 + }, + { + "epoch": 2.2445305068623456, + "grad_norm": 6.644550800323486, + "learning_rate": 7.1954409566517195e-06, + "loss": 3.0334, + "step": 33035 + }, + { + "epoch": 2.244870226933007, + "grad_norm": 6.317070007324219, + "learning_rate": 7.195016306563392e-06, + "loss": 3.085, + "step": 33040 + }, + { + "epoch": 2.245209947003669, + "grad_norm": 6.962858200073242, + "learning_rate": 7.194591656475065e-06, + "loss": 3.0656, + "step": 33045 + }, + { + "epoch": 2.245549667074331, + "grad_norm": 7.032510280609131, + "learning_rate": 7.194167006386737e-06, + "loss": 3.2451, + "step": 33050 + }, + { + "epoch": 2.2458893871449925, + "grad_norm": 5.618537425994873, + "learning_rate": 7.193742356298411e-06, + "loss": 3.2256, + "step": 33055 + }, + { + "epoch": 2.246229107215654, + "grad_norm": 6.557762622833252, + "learning_rate": 7.1933177062100835e-06, + "loss": 2.9293, + "step": 33060 + }, + { + "epoch": 2.2465688272863162, + "grad_norm": 8.996702194213867, + "learning_rate": 7.1928930561217555e-06, + "loss": 3.0342, + "step": 33065 + }, + { + "epoch": 2.246908547356978, + "grad_norm": 8.757485389709473, + "learning_rate": 7.192468406033429e-06, + "loss": 3.0969, + "step": 33070 + }, + { + "epoch": 2.2472482674276395, + "grad_norm": 7.273478031158447, + "learning_rate": 7.192043755945102e-06, + "loss": 3.3344, + "step": 33075 + }, + { + "epoch": 2.2475879874983016, + "grad_norm": 5.966818809509277, + "learning_rate": 7.191619105856774e-06, + "loss": 3.0708, + "step": 33080 + }, + { + "epoch": 2.247927707568963, + "grad_norm": 10.115335464477539, + "learning_rate": 7.1911944557684475e-06, + "loss": 3.1049, + "step": 33085 + }, + { + "epoch": 2.248267427639625, + "grad_norm": 6.05573034286499, + "learning_rate": 7.19076980568012e-06, + "loss": 2.8459, + "step": 33090 + }, + { + "epoch": 2.248607147710287, + "grad_norm": 6.234659671783447, + "learning_rate": 7.190345155591792e-06, + "loss": 2.7436, + "step": 33095 + }, + { + "epoch": 2.2489468677809485, + "grad_norm": 7.163509845733643, + "learning_rate": 7.189920505503466e-06, + "loss": 2.9915, + "step": 33100 + }, + { + "epoch": 2.24928658785161, + "grad_norm": 6.17488431930542, + "learning_rate": 7.189495855415139e-06, + "loss": 2.8314, + "step": 33105 + }, + { + "epoch": 2.2496263079222723, + "grad_norm": 7.454296112060547, + "learning_rate": 7.189071205326811e-06, + "loss": 2.9656, + "step": 33110 + }, + { + "epoch": 2.249966027992934, + "grad_norm": 5.800012111663818, + "learning_rate": 7.188646555238484e-06, + "loss": 3.0992, + "step": 33115 + }, + { + "epoch": 2.2503057480635955, + "grad_norm": 7.520727157592773, + "learning_rate": 7.188221905150157e-06, + "loss": 3.0895, + "step": 33120 + }, + { + "epoch": 2.250645468134257, + "grad_norm": 8.078045845031738, + "learning_rate": 7.187797255061829e-06, + "loss": 3.2108, + "step": 33125 + }, + { + "epoch": 2.250985188204919, + "grad_norm": 6.754947185516357, + "learning_rate": 7.187372604973503e-06, + "loss": 3.2413, + "step": 33130 + }, + { + "epoch": 2.251324908275581, + "grad_norm": 7.255218029022217, + "learning_rate": 7.18703288490284e-06, + "loss": 3.1494, + "step": 33135 + }, + { + "epoch": 2.2516646283462425, + "grad_norm": 8.021346092224121, + "learning_rate": 7.186608234814514e-06, + "loss": 3.0735, + "step": 33140 + }, + { + "epoch": 2.2520043484169046, + "grad_norm": 7.590003490447998, + "learning_rate": 7.186183584726186e-06, + "loss": 3.0857, + "step": 33145 + }, + { + "epoch": 2.252344068487566, + "grad_norm": 6.863206386566162, + "learning_rate": 7.185758934637858e-06, + "loss": 2.8172, + "step": 33150 + }, + { + "epoch": 2.252683788558228, + "grad_norm": 7.684318542480469, + "learning_rate": 7.185334284549532e-06, + "loss": 3.3212, + "step": 33155 + }, + { + "epoch": 2.25302350862889, + "grad_norm": 6.628709316253662, + "learning_rate": 7.184909634461205e-06, + "loss": 3.0949, + "step": 33160 + }, + { + "epoch": 2.2533632286995515, + "grad_norm": 6.1991424560546875, + "learning_rate": 7.184484984372877e-06, + "loss": 3.2522, + "step": 33165 + }, + { + "epoch": 2.253702948770213, + "grad_norm": 8.33907413482666, + "learning_rate": 7.18406033428455e-06, + "loss": 2.8994, + "step": 33170 + }, + { + "epoch": 2.2540426688408752, + "grad_norm": 7.274807453155518, + "learning_rate": 7.183635684196223e-06, + "loss": 3.0343, + "step": 33175 + }, + { + "epoch": 2.254382388911537, + "grad_norm": 6.592090129852295, + "learning_rate": 7.183211034107895e-06, + "loss": 2.9382, + "step": 33180 + }, + { + "epoch": 2.2547221089821985, + "grad_norm": 6.851048469543457, + "learning_rate": 7.182786384019569e-06, + "loss": 3.163, + "step": 33185 + }, + { + "epoch": 2.2550618290528606, + "grad_norm": 6.848361015319824, + "learning_rate": 7.182361733931242e-06, + "loss": 3.0778, + "step": 33190 + }, + { + "epoch": 2.255401549123522, + "grad_norm": 8.510313034057617, + "learning_rate": 7.1819370838429136e-06, + "loss": 3.2285, + "step": 33195 + }, + { + "epoch": 2.255741269194184, + "grad_norm": 6.075769901275635, + "learning_rate": 7.181512433754587e-06, + "loss": 3.1023, + "step": 33200 + }, + { + "epoch": 2.256080989264846, + "grad_norm": 6.407498359680176, + "learning_rate": 7.181087783666259e-06, + "loss": 3.1502, + "step": 33205 + }, + { + "epoch": 2.2564207093355075, + "grad_norm": 6.04201078414917, + "learning_rate": 7.180663133577932e-06, + "loss": 3.095, + "step": 33210 + }, + { + "epoch": 2.256760429406169, + "grad_norm": 7.1197896003723145, + "learning_rate": 7.180238483489606e-06, + "loss": 2.8454, + "step": 33215 + }, + { + "epoch": 2.2571001494768312, + "grad_norm": 6.208929538726807, + "learning_rate": 7.1798138334012776e-06, + "loss": 3.1538, + "step": 33220 + }, + { + "epoch": 2.257439869547493, + "grad_norm": 9.144120216369629, + "learning_rate": 7.17938918331295e-06, + "loss": 2.9838, + "step": 33225 + }, + { + "epoch": 2.2577795896181545, + "grad_norm": 8.498346328735352, + "learning_rate": 7.178964533224624e-06, + "loss": 3.0923, + "step": 33230 + }, + { + "epoch": 2.2581193096888166, + "grad_norm": 5.853455543518066, + "learning_rate": 7.178539883136296e-06, + "loss": 3.0539, + "step": 33235 + }, + { + "epoch": 2.258459029759478, + "grad_norm": 8.231396675109863, + "learning_rate": 7.178115233047969e-06, + "loss": 3.0265, + "step": 33240 + }, + { + "epoch": 2.25879874983014, + "grad_norm": 7.912394046783447, + "learning_rate": 7.177690582959642e-06, + "loss": 2.8546, + "step": 33245 + }, + { + "epoch": 2.259138469900802, + "grad_norm": 8.039727210998535, + "learning_rate": 7.177265932871314e-06, + "loss": 3.0497, + "step": 33250 + }, + { + "epoch": 2.2594781899714635, + "grad_norm": 5.844260215759277, + "learning_rate": 7.176841282782987e-06, + "loss": 2.735, + "step": 33255 + }, + { + "epoch": 2.259817910042125, + "grad_norm": 6.011659622192383, + "learning_rate": 7.176416632694661e-06, + "loss": 3.1484, + "step": 33260 + }, + { + "epoch": 2.2601576301127873, + "grad_norm": 8.495080947875977, + "learning_rate": 7.175991982606333e-06, + "loss": 3.3523, + "step": 33265 + }, + { + "epoch": 2.260497350183449, + "grad_norm": 5.851802825927734, + "learning_rate": 7.175567332518006e-06, + "loss": 2.9382, + "step": 33270 + }, + { + "epoch": 2.2608370702541105, + "grad_norm": 7.197444438934326, + "learning_rate": 7.175142682429679e-06, + "loss": 3.1486, + "step": 33275 + }, + { + "epoch": 2.2611767903247726, + "grad_norm": 7.010100364685059, + "learning_rate": 7.174718032341351e-06, + "loss": 3.0087, + "step": 33280 + }, + { + "epoch": 2.261516510395434, + "grad_norm": 8.54199504852295, + "learning_rate": 7.174293382253024e-06, + "loss": 3.1418, + "step": 33285 + }, + { + "epoch": 2.261856230466096, + "grad_norm": 5.500904560089111, + "learning_rate": 7.173868732164697e-06, + "loss": 3.034, + "step": 33290 + }, + { + "epoch": 2.262195950536758, + "grad_norm": 7.017819881439209, + "learning_rate": 7.17344408207637e-06, + "loss": 3.0124, + "step": 33295 + }, + { + "epoch": 2.2625356706074196, + "grad_norm": 6.437252998352051, + "learning_rate": 7.1730194319880415e-06, + "loss": 3.0969, + "step": 33300 + }, + { + "epoch": 2.262875390678081, + "grad_norm": 7.049270153045654, + "learning_rate": 7.172594781899715e-06, + "loss": 3.0883, + "step": 33305 + }, + { + "epoch": 2.2632151107487433, + "grad_norm": 7.79445743560791, + "learning_rate": 7.172170131811388e-06, + "loss": 3.0913, + "step": 33310 + }, + { + "epoch": 2.263554830819405, + "grad_norm": 6.404678821563721, + "learning_rate": 7.17174548172306e-06, + "loss": 3.1138, + "step": 33315 + }, + { + "epoch": 2.2638945508900665, + "grad_norm": 5.269931316375732, + "learning_rate": 7.171320831634734e-06, + "loss": 3.1102, + "step": 33320 + }, + { + "epoch": 2.2642342709607286, + "grad_norm": 6.9634575843811035, + "learning_rate": 7.170896181546406e-06, + "loss": 2.8338, + "step": 33325 + }, + { + "epoch": 2.2645739910313902, + "grad_norm": 8.352190971374512, + "learning_rate": 7.170471531458078e-06, + "loss": 2.93, + "step": 33330 + }, + { + "epoch": 2.264913711102052, + "grad_norm": 6.074806213378906, + "learning_rate": 7.170046881369752e-06, + "loss": 3.0079, + "step": 33335 + }, + { + "epoch": 2.2652534311727135, + "grad_norm": 6.10242223739624, + "learning_rate": 7.169622231281425e-06, + "loss": 2.9537, + "step": 33340 + }, + { + "epoch": 2.2655931512433756, + "grad_norm": 6.4868855476379395, + "learning_rate": 7.169197581193097e-06, + "loss": 2.9158, + "step": 33345 + }, + { + "epoch": 2.265932871314037, + "grad_norm": 6.477223873138428, + "learning_rate": 7.16877293110477e-06, + "loss": 3.1333, + "step": 33350 + }, + { + "epoch": 2.266272591384699, + "grad_norm": 7.932903289794922, + "learning_rate": 7.168348281016443e-06, + "loss": 3.1732, + "step": 33355 + }, + { + "epoch": 2.266612311455361, + "grad_norm": 5.6436662673950195, + "learning_rate": 7.167923630928115e-06, + "loss": 3.2793, + "step": 33360 + }, + { + "epoch": 2.2669520315260225, + "grad_norm": 7.784404277801514, + "learning_rate": 7.167498980839789e-06, + "loss": 3.1564, + "step": 33365 + }, + { + "epoch": 2.267291751596684, + "grad_norm": 6.565340042114258, + "learning_rate": 7.167074330751462e-06, + "loss": 3.1558, + "step": 33370 + }, + { + "epoch": 2.2676314716673462, + "grad_norm": 6.597188472747803, + "learning_rate": 7.166649680663134e-06, + "loss": 2.8388, + "step": 33375 + }, + { + "epoch": 2.267971191738008, + "grad_norm": 5.959476470947266, + "learning_rate": 7.166225030574807e-06, + "loss": 3.1974, + "step": 33380 + }, + { + "epoch": 2.2683109118086695, + "grad_norm": 6.4588141441345215, + "learning_rate": 7.165800380486479e-06, + "loss": 2.77, + "step": 33385 + }, + { + "epoch": 2.2686506318793316, + "grad_norm": 6.543514251708984, + "learning_rate": 7.165375730398153e-06, + "loss": 3.0405, + "step": 33390 + }, + { + "epoch": 2.268990351949993, + "grad_norm": 6.874528408050537, + "learning_rate": 7.164951080309826e-06, + "loss": 3.0655, + "step": 33395 + }, + { + "epoch": 2.269330072020655, + "grad_norm": 6.0566511154174805, + "learning_rate": 7.1645264302214976e-06, + "loss": 2.8175, + "step": 33400 + }, + { + "epoch": 2.269669792091317, + "grad_norm": 7.5926079750061035, + "learning_rate": 7.164101780133171e-06, + "loss": 2.9582, + "step": 33405 + }, + { + "epoch": 2.2700095121619785, + "grad_norm": 7.086320400238037, + "learning_rate": 7.163677130044844e-06, + "loss": 2.9791, + "step": 33410 + }, + { + "epoch": 2.27034923223264, + "grad_norm": 6.334750652313232, + "learning_rate": 7.163252479956516e-06, + "loss": 3.1386, + "step": 33415 + }, + { + "epoch": 2.2706889523033023, + "grad_norm": 6.4294867515563965, + "learning_rate": 7.16282782986819e-06, + "loss": 3.119, + "step": 33420 + }, + { + "epoch": 2.271028672373964, + "grad_norm": 7.084782600402832, + "learning_rate": 7.162403179779862e-06, + "loss": 2.7591, + "step": 33425 + }, + { + "epoch": 2.2713683924446255, + "grad_norm": 6.595669269561768, + "learning_rate": 7.161978529691534e-06, + "loss": 2.9176, + "step": 33430 + }, + { + "epoch": 2.2717081125152876, + "grad_norm": 5.835068702697754, + "learning_rate": 7.161553879603208e-06, + "loss": 2.9221, + "step": 33435 + }, + { + "epoch": 2.2720478325859492, + "grad_norm": 6.5016045570373535, + "learning_rate": 7.161129229514881e-06, + "loss": 2.9447, + "step": 33440 + }, + { + "epoch": 2.272387552656611, + "grad_norm": 6.1125006675720215, + "learning_rate": 7.160704579426553e-06, + "loss": 3.0463, + "step": 33445 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 7.411643981933594, + "learning_rate": 7.160279929338226e-06, + "loss": 2.8839, + "step": 33450 + }, + { + "epoch": 2.2730669927979346, + "grad_norm": 7.665368556976318, + "learning_rate": 7.159855279249898e-06, + "loss": 3.1779, + "step": 33455 + }, + { + "epoch": 2.273406712868596, + "grad_norm": 9.566598892211914, + "learning_rate": 7.159430629161571e-06, + "loss": 2.9979, + "step": 33460 + }, + { + "epoch": 2.273746432939258, + "grad_norm": 6.921080112457275, + "learning_rate": 7.159005979073245e-06, + "loss": 2.9985, + "step": 33465 + }, + { + "epoch": 2.27408615300992, + "grad_norm": 9.300741195678711, + "learning_rate": 7.158581328984917e-06, + "loss": 2.9412, + "step": 33470 + }, + { + "epoch": 2.2744258730805815, + "grad_norm": 5.452068328857422, + "learning_rate": 7.1581566788965896e-06, + "loss": 3.1267, + "step": 33475 + }, + { + "epoch": 2.274765593151243, + "grad_norm": 8.341252326965332, + "learning_rate": 7.157732028808263e-06, + "loss": 2.9915, + "step": 33480 + }, + { + "epoch": 2.2751053132219052, + "grad_norm": 6.771302700042725, + "learning_rate": 7.157307378719935e-06, + "loss": 3.0223, + "step": 33485 + }, + { + "epoch": 2.275445033292567, + "grad_norm": 5.1469550132751465, + "learning_rate": 7.156882728631608e-06, + "loss": 3.0885, + "step": 33490 + }, + { + "epoch": 2.2757847533632285, + "grad_norm": 7.111029148101807, + "learning_rate": 7.156458078543282e-06, + "loss": 3.127, + "step": 33495 + }, + { + "epoch": 2.2761244734338906, + "grad_norm": 5.8707733154296875, + "learning_rate": 7.1560334284549536e-06, + "loss": 3.2352, + "step": 33500 + }, + { + "epoch": 2.276464193504552, + "grad_norm": 5.771938323974609, + "learning_rate": 7.155608778366626e-06, + "loss": 3.1745, + "step": 33505 + }, + { + "epoch": 2.276803913575214, + "grad_norm": 7.293656349182129, + "learning_rate": 7.1551841282783e-06, + "loss": 2.8788, + "step": 33510 + }, + { + "epoch": 2.277143633645876, + "grad_norm": 7.3661274909973145, + "learning_rate": 7.154759478189972e-06, + "loss": 3.0564, + "step": 33515 + }, + { + "epoch": 2.2774833537165375, + "grad_norm": 7.719625949859619, + "learning_rate": 7.154334828101645e-06, + "loss": 3.0551, + "step": 33520 + }, + { + "epoch": 2.277823073787199, + "grad_norm": 6.962398052215576, + "learning_rate": 7.153910178013318e-06, + "loss": 3.0181, + "step": 33525 + }, + { + "epoch": 2.2781627938578612, + "grad_norm": 6.95889139175415, + "learning_rate": 7.15348552792499e-06, + "loss": 2.976, + "step": 33530 + }, + { + "epoch": 2.278502513928523, + "grad_norm": 6.741274833679199, + "learning_rate": 7.153060877836663e-06, + "loss": 2.9311, + "step": 33535 + }, + { + "epoch": 2.2788422339991845, + "grad_norm": 9.088313102722168, + "learning_rate": 7.152636227748336e-06, + "loss": 3.3533, + "step": 33540 + }, + { + "epoch": 2.2791819540698466, + "grad_norm": 7.328268527984619, + "learning_rate": 7.152211577660009e-06, + "loss": 3.022, + "step": 33545 + }, + { + "epoch": 2.279521674140508, + "grad_norm": 6.625513553619385, + "learning_rate": 7.151786927571681e-06, + "loss": 2.9946, + "step": 33550 + }, + { + "epoch": 2.27986139421117, + "grad_norm": 7.58961820602417, + "learning_rate": 7.151362277483354e-06, + "loss": 3.0063, + "step": 33555 + }, + { + "epoch": 2.280201114281832, + "grad_norm": 8.573471069335938, + "learning_rate": 7.150937627395027e-06, + "loss": 3.021, + "step": 33560 + }, + { + "epoch": 2.2805408343524936, + "grad_norm": 6.04892635345459, + "learning_rate": 7.150512977306699e-06, + "loss": 2.9059, + "step": 33565 + }, + { + "epoch": 2.280880554423155, + "grad_norm": 5.452587127685547, + "learning_rate": 7.150088327218373e-06, + "loss": 3.1631, + "step": 33570 + }, + { + "epoch": 2.2812202744938173, + "grad_norm": 7.571829319000244, + "learning_rate": 7.149663677130046e-06, + "loss": 3.1017, + "step": 33575 + }, + { + "epoch": 2.281559994564479, + "grad_norm": 9.157288551330566, + "learning_rate": 7.1492390270417175e-06, + "loss": 3.0231, + "step": 33580 + }, + { + "epoch": 2.2818997146351405, + "grad_norm": 6.110222816467285, + "learning_rate": 7.148814376953391e-06, + "loss": 3.1377, + "step": 33585 + }, + { + "epoch": 2.2822394347058026, + "grad_norm": 6.527655124664307, + "learning_rate": 7.148389726865064e-06, + "loss": 2.9086, + "step": 33590 + }, + { + "epoch": 2.2825791547764642, + "grad_norm": 8.999923706054688, + "learning_rate": 7.147965076776736e-06, + "loss": 3.0121, + "step": 33595 + }, + { + "epoch": 2.282918874847126, + "grad_norm": 6.893283843994141, + "learning_rate": 7.14754042668841e-06, + "loss": 3.0608, + "step": 33600 + }, + { + "epoch": 2.283258594917788, + "grad_norm": 8.079146385192871, + "learning_rate": 7.147115776600082e-06, + "loss": 2.8937, + "step": 33605 + }, + { + "epoch": 2.2835983149884496, + "grad_norm": 8.07444953918457, + "learning_rate": 7.146691126511754e-06, + "loss": 3.0557, + "step": 33610 + }, + { + "epoch": 2.283938035059111, + "grad_norm": 6.891087055206299, + "learning_rate": 7.146266476423428e-06, + "loss": 2.9883, + "step": 33615 + }, + { + "epoch": 2.2842777551297733, + "grad_norm": 7.690329551696777, + "learning_rate": 7.145841826335101e-06, + "loss": 3.1211, + "step": 33620 + }, + { + "epoch": 2.284617475200435, + "grad_norm": 7.254636764526367, + "learning_rate": 7.145417176246773e-06, + "loss": 3.2821, + "step": 33625 + }, + { + "epoch": 2.2849571952710965, + "grad_norm": 8.120099067687988, + "learning_rate": 7.144992526158446e-06, + "loss": 2.9501, + "step": 33630 + }, + { + "epoch": 2.2852969153417586, + "grad_norm": 8.108870506286621, + "learning_rate": 7.144567876070118e-06, + "loss": 3.0471, + "step": 33635 + }, + { + "epoch": 2.2856366354124202, + "grad_norm": 7.257412433624268, + "learning_rate": 7.144143225981791e-06, + "loss": 3.3479, + "step": 33640 + }, + { + "epoch": 2.285976355483082, + "grad_norm": 7.485077857971191, + "learning_rate": 7.143718575893465e-06, + "loss": 3.2132, + "step": 33645 + }, + { + "epoch": 2.286316075553744, + "grad_norm": 6.768326282501221, + "learning_rate": 7.143293925805137e-06, + "loss": 3.0191, + "step": 33650 + }, + { + "epoch": 2.2866557956244056, + "grad_norm": 8.532272338867188, + "learning_rate": 7.1428692757168095e-06, + "loss": 2.9755, + "step": 33655 + }, + { + "epoch": 2.286995515695067, + "grad_norm": 5.807704925537109, + "learning_rate": 7.142444625628483e-06, + "loss": 3.0768, + "step": 33660 + }, + { + "epoch": 2.2873352357657293, + "grad_norm": 6.3006181716918945, + "learning_rate": 7.142019975540155e-06, + "loss": 3.0773, + "step": 33665 + }, + { + "epoch": 2.287674955836391, + "grad_norm": 6.67125940322876, + "learning_rate": 7.141595325451828e-06, + "loss": 3.0065, + "step": 33670 + }, + { + "epoch": 2.2880146759070525, + "grad_norm": 8.547911643981934, + "learning_rate": 7.141170675363502e-06, + "loss": 2.8523, + "step": 33675 + }, + { + "epoch": 2.288354395977714, + "grad_norm": 7.5062665939331055, + "learning_rate": 7.1407460252751735e-06, + "loss": 2.9671, + "step": 33680 + }, + { + "epoch": 2.2886941160483762, + "grad_norm": 7.268524646759033, + "learning_rate": 7.140321375186846e-06, + "loss": 3.1796, + "step": 33685 + }, + { + "epoch": 2.289033836119038, + "grad_norm": 7.81730842590332, + "learning_rate": 7.13989672509852e-06, + "loss": 3.1612, + "step": 33690 + }, + { + "epoch": 2.2893735561896995, + "grad_norm": 6.258007049560547, + "learning_rate": 7.139472075010192e-06, + "loss": 2.9722, + "step": 33695 + }, + { + "epoch": 2.2897132762603616, + "grad_norm": 6.949703693389893, + "learning_rate": 7.139047424921865e-06, + "loss": 3.1883, + "step": 33700 + }, + { + "epoch": 2.290052996331023, + "grad_norm": 6.219644069671631, + "learning_rate": 7.1386227748335376e-06, + "loss": 3.109, + "step": 33705 + }, + { + "epoch": 2.290392716401685, + "grad_norm": 9.921180725097656, + "learning_rate": 7.13819812474521e-06, + "loss": 2.9137, + "step": 33710 + }, + { + "epoch": 2.290732436472347, + "grad_norm": 7.776577472686768, + "learning_rate": 7.137773474656884e-06, + "loss": 2.9769, + "step": 33715 + }, + { + "epoch": 2.2910721565430086, + "grad_norm": 6.751384258270264, + "learning_rate": 7.137348824568556e-06, + "loss": 3.028, + "step": 33720 + }, + { + "epoch": 2.29141187661367, + "grad_norm": 5.281036853790283, + "learning_rate": 7.136924174480229e-06, + "loss": 3.0165, + "step": 33725 + }, + { + "epoch": 2.2917515966843323, + "grad_norm": 6.123898983001709, + "learning_rate": 7.136499524391902e-06, + "loss": 3.2879, + "step": 33730 + }, + { + "epoch": 2.292091316754994, + "grad_norm": 7.500540733337402, + "learning_rate": 7.136074874303574e-06, + "loss": 3.0194, + "step": 33735 + }, + { + "epoch": 2.2924310368256555, + "grad_norm": 7.388730525970459, + "learning_rate": 7.135650224215247e-06, + "loss": 2.9457, + "step": 33740 + }, + { + "epoch": 2.2927707568963176, + "grad_norm": 7.087454795837402, + "learning_rate": 7.135225574126921e-06, + "loss": 2.9945, + "step": 33745 + }, + { + "epoch": 2.2931104769669792, + "grad_norm": 6.753959655761719, + "learning_rate": 7.134800924038593e-06, + "loss": 3.1068, + "step": 33750 + }, + { + "epoch": 2.293450197037641, + "grad_norm": 6.8050312995910645, + "learning_rate": 7.1343762739502656e-06, + "loss": 3.2408, + "step": 33755 + }, + { + "epoch": 2.293789917108303, + "grad_norm": 6.088819980621338, + "learning_rate": 7.133951623861939e-06, + "loss": 2.8776, + "step": 33760 + }, + { + "epoch": 2.2941296371789646, + "grad_norm": 6.07413911819458, + "learning_rate": 7.133526973773611e-06, + "loss": 2.7843, + "step": 33765 + }, + { + "epoch": 2.294469357249626, + "grad_norm": 6.822026252746582, + "learning_rate": 7.133102323685284e-06, + "loss": 3.1248, + "step": 33770 + }, + { + "epoch": 2.2948090773202883, + "grad_norm": 7.561911106109619, + "learning_rate": 7.132677673596957e-06, + "loss": 2.8957, + "step": 33775 + }, + { + "epoch": 2.29514879739095, + "grad_norm": 7.348755359649658, + "learning_rate": 7.1322530235086296e-06, + "loss": 3.0931, + "step": 33780 + }, + { + "epoch": 2.2954885174616115, + "grad_norm": 8.469046592712402, + "learning_rate": 7.131828373420302e-06, + "loss": 3.0083, + "step": 33785 + }, + { + "epoch": 2.295828237532273, + "grad_norm": 8.99614143371582, + "learning_rate": 7.131403723331975e-06, + "loss": 3.0276, + "step": 33790 + }, + { + "epoch": 2.2961679576029352, + "grad_norm": 8.102863311767578, + "learning_rate": 7.130979073243648e-06, + "loss": 3.1122, + "step": 33795 + }, + { + "epoch": 2.296507677673597, + "grad_norm": 5.918293476104736, + "learning_rate": 7.13055442315532e-06, + "loss": 3.1048, + "step": 33800 + }, + { + "epoch": 2.2968473977442585, + "grad_norm": 9.848952293395996, + "learning_rate": 7.1301297730669936e-06, + "loss": 3.0687, + "step": 33805 + }, + { + "epoch": 2.2971871178149206, + "grad_norm": 6.303630828857422, + "learning_rate": 7.129705122978666e-06, + "loss": 2.8762, + "step": 33810 + }, + { + "epoch": 2.297526837885582, + "grad_norm": 6.324081897735596, + "learning_rate": 7.129280472890338e-06, + "loss": 3.0349, + "step": 33815 + }, + { + "epoch": 2.297866557956244, + "grad_norm": 5.901996612548828, + "learning_rate": 7.128855822802012e-06, + "loss": 3.0642, + "step": 33820 + }, + { + "epoch": 2.298206278026906, + "grad_norm": 6.8282856941223145, + "learning_rate": 7.128431172713685e-06, + "loss": 2.763, + "step": 33825 + }, + { + "epoch": 2.2985459980975675, + "grad_norm": 7.282529354095459, + "learning_rate": 7.128006522625357e-06, + "loss": 3.1075, + "step": 33830 + }, + { + "epoch": 2.298885718168229, + "grad_norm": 6.348235607147217, + "learning_rate": 7.12758187253703e-06, + "loss": 2.8788, + "step": 33835 + }, + { + "epoch": 2.2992254382388913, + "grad_norm": 7.901159763336182, + "learning_rate": 7.127157222448703e-06, + "loss": 2.9443, + "step": 33840 + }, + { + "epoch": 2.299565158309553, + "grad_norm": 7.167915344238281, + "learning_rate": 7.126732572360375e-06, + "loss": 3.2416, + "step": 33845 + }, + { + "epoch": 2.2999048783802145, + "grad_norm": 6.9019083976745605, + "learning_rate": 7.126307922272049e-06, + "loss": 3.1996, + "step": 33850 + }, + { + "epoch": 2.3002445984508766, + "grad_norm": 5.9247894287109375, + "learning_rate": 7.125883272183722e-06, + "loss": 3.051, + "step": 33855 + }, + { + "epoch": 2.300584318521538, + "grad_norm": 5.912909507751465, + "learning_rate": 7.1254586220953935e-06, + "loss": 2.6687, + "step": 33860 + }, + { + "epoch": 2.3009240385922, + "grad_norm": 6.9484171867370605, + "learning_rate": 7.125033972007067e-06, + "loss": 3.2213, + "step": 33865 + }, + { + "epoch": 2.301263758662862, + "grad_norm": 7.0674729347229, + "learning_rate": 7.124609321918739e-06, + "loss": 2.7413, + "step": 33870 + }, + { + "epoch": 2.3016034787335236, + "grad_norm": 5.887868404388428, + "learning_rate": 7.124184671830412e-06, + "loss": 2.9105, + "step": 33875 + }, + { + "epoch": 2.301943198804185, + "grad_norm": 6.9871320724487305, + "learning_rate": 7.123760021742086e-06, + "loss": 3.0608, + "step": 33880 + }, + { + "epoch": 2.3022829188748473, + "grad_norm": 6.44654655456543, + "learning_rate": 7.1233353716537575e-06, + "loss": 2.9787, + "step": 33885 + }, + { + "epoch": 2.302622638945509, + "grad_norm": 6.373457908630371, + "learning_rate": 7.12291072156543e-06, + "loss": 3.0498, + "step": 33890 + }, + { + "epoch": 2.3029623590161705, + "grad_norm": 6.552952766418457, + "learning_rate": 7.122486071477104e-06, + "loss": 3.0115, + "step": 33895 + }, + { + "epoch": 2.3033020790868326, + "grad_norm": 7.552300453186035, + "learning_rate": 7.122061421388776e-06, + "loss": 3.0059, + "step": 33900 + }, + { + "epoch": 2.3036417991574942, + "grad_norm": 7.250486373901367, + "learning_rate": 7.121636771300449e-06, + "loss": 3.2284, + "step": 33905 + }, + { + "epoch": 2.303981519228156, + "grad_norm": 7.051871299743652, + "learning_rate": 7.121212121212122e-06, + "loss": 3.0016, + "step": 33910 + }, + { + "epoch": 2.304321239298818, + "grad_norm": 6.569393157958984, + "learning_rate": 7.120787471123794e-06, + "loss": 3.2378, + "step": 33915 + }, + { + "epoch": 2.3046609593694796, + "grad_norm": 7.164257049560547, + "learning_rate": 7.120362821035467e-06, + "loss": 3.2285, + "step": 33920 + }, + { + "epoch": 2.305000679440141, + "grad_norm": 7.333830833435059, + "learning_rate": 7.119938170947141e-06, + "loss": 3.2342, + "step": 33925 + }, + { + "epoch": 2.3053403995108033, + "grad_norm": 6.816415309906006, + "learning_rate": 7.119513520858813e-06, + "loss": 2.9523, + "step": 33930 + }, + { + "epoch": 2.305680119581465, + "grad_norm": 7.1747541427612305, + "learning_rate": 7.1190888707704855e-06, + "loss": 3.1339, + "step": 33935 + }, + { + "epoch": 2.3060198396521265, + "grad_norm": 6.2683024406433105, + "learning_rate": 7.118664220682159e-06, + "loss": 3.0117, + "step": 33940 + }, + { + "epoch": 2.3063595597227886, + "grad_norm": 8.01198673248291, + "learning_rate": 7.118239570593831e-06, + "loss": 3.1609, + "step": 33945 + }, + { + "epoch": 2.3066992797934502, + "grad_norm": 7.70021390914917, + "learning_rate": 7.117814920505504e-06, + "loss": 3.0763, + "step": 33950 + }, + { + "epoch": 2.307038999864112, + "grad_norm": 6.068751811981201, + "learning_rate": 7.117390270417177e-06, + "loss": 2.9591, + "step": 33955 + }, + { + "epoch": 2.307378719934774, + "grad_norm": 7.033817768096924, + "learning_rate": 7.1169656203288495e-06, + "loss": 3.0746, + "step": 33960 + }, + { + "epoch": 2.3077184400054356, + "grad_norm": 7.154333591461182, + "learning_rate": 7.116540970240522e-06, + "loss": 3.027, + "step": 33965 + }, + { + "epoch": 2.308058160076097, + "grad_norm": 7.239738464355469, + "learning_rate": 7.116116320152195e-06, + "loss": 2.7536, + "step": 33970 + }, + { + "epoch": 2.3083978801467593, + "grad_norm": 5.735853672027588, + "learning_rate": 7.115691670063868e-06, + "loss": 3.0658, + "step": 33975 + }, + { + "epoch": 2.308737600217421, + "grad_norm": 7.915246963500977, + "learning_rate": 7.11526701997554e-06, + "loss": 2.8732, + "step": 33980 + }, + { + "epoch": 2.3090773202880825, + "grad_norm": 8.453693389892578, + "learning_rate": 7.1148423698872135e-06, + "loss": 3.1939, + "step": 33985 + }, + { + "epoch": 2.3094170403587446, + "grad_norm": 7.902822017669678, + "learning_rate": 7.114417719798886e-06, + "loss": 3.0142, + "step": 33990 + }, + { + "epoch": 2.3097567604294063, + "grad_norm": 8.089700698852539, + "learning_rate": 7.113993069710558e-06, + "loss": 3.1894, + "step": 33995 + }, + { + "epoch": 2.310096480500068, + "grad_norm": 9.555642127990723, + "learning_rate": 7.113568419622232e-06, + "loss": 3.0336, + "step": 34000 + }, + { + "epoch": 2.31043620057073, + "grad_norm": 8.483628273010254, + "learning_rate": 7.113143769533905e-06, + "loss": 3.1926, + "step": 34005 + }, + { + "epoch": 2.3107759206413916, + "grad_norm": 7.008466720581055, + "learning_rate": 7.112719119445577e-06, + "loss": 3.1579, + "step": 34010 + }, + { + "epoch": 2.311115640712053, + "grad_norm": 6.985295295715332, + "learning_rate": 7.11229446935725e-06, + "loss": 3.1198, + "step": 34015 + }, + { + "epoch": 2.311455360782715, + "grad_norm": 6.392380714416504, + "learning_rate": 7.111869819268923e-06, + "loss": 3.0454, + "step": 34020 + }, + { + "epoch": 2.311795080853377, + "grad_norm": 6.477524757385254, + "learning_rate": 7.111445169180595e-06, + "loss": 3.096, + "step": 34025 + }, + { + "epoch": 2.3121348009240386, + "grad_norm": 7.442898750305176, + "learning_rate": 7.111020519092269e-06, + "loss": 2.8524, + "step": 34030 + }, + { + "epoch": 2.3124745209947, + "grad_norm": 6.446352481842041, + "learning_rate": 7.1105958690039416e-06, + "loss": 3.1336, + "step": 34035 + }, + { + "epoch": 2.3128142410653623, + "grad_norm": 6.032713413238525, + "learning_rate": 7.1101712189156135e-06, + "loss": 3.22, + "step": 34040 + }, + { + "epoch": 2.313153961136024, + "grad_norm": 6.45413064956665, + "learning_rate": 7.109746568827287e-06, + "loss": 3.1629, + "step": 34045 + }, + { + "epoch": 2.3134936812066855, + "grad_norm": 7.397029399871826, + "learning_rate": 7.109321918738959e-06, + "loss": 2.8773, + "step": 34050 + }, + { + "epoch": 2.3138334012773476, + "grad_norm": 8.445341110229492, + "learning_rate": 7.108897268650633e-06, + "loss": 2.8311, + "step": 34055 + }, + { + "epoch": 2.3141731213480092, + "grad_norm": 6.707890033721924, + "learning_rate": 7.1084726185623056e-06, + "loss": 2.8853, + "step": 34060 + }, + { + "epoch": 2.314512841418671, + "grad_norm": 6.413041114807129, + "learning_rate": 7.1080479684739775e-06, + "loss": 3.1794, + "step": 34065 + }, + { + "epoch": 2.314852561489333, + "grad_norm": 8.071807861328125, + "learning_rate": 7.107623318385651e-06, + "loss": 2.92, + "step": 34070 + }, + { + "epoch": 2.3151922815599946, + "grad_norm": 6.79602575302124, + "learning_rate": 7.107198668297324e-06, + "loss": 2.8372, + "step": 34075 + }, + { + "epoch": 2.315532001630656, + "grad_norm": 7.757300853729248, + "learning_rate": 7.106774018208996e-06, + "loss": 3.1228, + "step": 34080 + }, + { + "epoch": 2.3158717217013183, + "grad_norm": 6.464444637298584, + "learning_rate": 7.1063493681206696e-06, + "loss": 3.3329, + "step": 34085 + }, + { + "epoch": 2.31621144177198, + "grad_norm": 6.631301403045654, + "learning_rate": 7.105924718032342e-06, + "loss": 3.1707, + "step": 34090 + }, + { + "epoch": 2.3165511618426415, + "grad_norm": 6.3493828773498535, + "learning_rate": 7.105500067944014e-06, + "loss": 3.2421, + "step": 34095 + }, + { + "epoch": 2.3168908819133036, + "grad_norm": 5.939933776855469, + "learning_rate": 7.105075417855688e-06, + "loss": 3.1529, + "step": 34100 + }, + { + "epoch": 2.3172306019839652, + "grad_norm": 7.065769672393799, + "learning_rate": 7.104650767767361e-06, + "loss": 2.8019, + "step": 34105 + }, + { + "epoch": 2.317570322054627, + "grad_norm": 5.890874862670898, + "learning_rate": 7.104226117679033e-06, + "loss": 3.1295, + "step": 34110 + }, + { + "epoch": 2.317910042125289, + "grad_norm": 7.372478008270264, + "learning_rate": 7.103801467590706e-06, + "loss": 3.2484, + "step": 34115 + }, + { + "epoch": 2.3182497621959506, + "grad_norm": 9.921036720275879, + "learning_rate": 7.103376817502378e-06, + "loss": 3.2363, + "step": 34120 + }, + { + "epoch": 2.318589482266612, + "grad_norm": 6.126318454742432, + "learning_rate": 7.102952167414051e-06, + "loss": 3.0039, + "step": 34125 + }, + { + "epoch": 2.318929202337274, + "grad_norm": 8.138043403625488, + "learning_rate": 7.102527517325725e-06, + "loss": 3.1009, + "step": 34130 + }, + { + "epoch": 2.319268922407936, + "grad_norm": 6.539886951446533, + "learning_rate": 7.102102867237397e-06, + "loss": 3.2382, + "step": 34135 + }, + { + "epoch": 2.3196086424785975, + "grad_norm": 6.100086212158203, + "learning_rate": 7.1016782171490695e-06, + "loss": 3.1028, + "step": 34140 + }, + { + "epoch": 2.319948362549259, + "grad_norm": 8.080214500427246, + "learning_rate": 7.101253567060743e-06, + "loss": 2.955, + "step": 34145 + }, + { + "epoch": 2.3202880826199213, + "grad_norm": 7.508269786834717, + "learning_rate": 7.100828916972415e-06, + "loss": 3.1022, + "step": 34150 + }, + { + "epoch": 2.320627802690583, + "grad_norm": 7.40081262588501, + "learning_rate": 7.100404266884088e-06, + "loss": 2.8944, + "step": 34155 + }, + { + "epoch": 2.3209675227612445, + "grad_norm": 9.826458930969238, + "learning_rate": 7.099979616795762e-06, + "loss": 2.9206, + "step": 34160 + }, + { + "epoch": 2.3213072428319066, + "grad_norm": 6.375816822052002, + "learning_rate": 7.0995549667074335e-06, + "loss": 2.9053, + "step": 34165 + }, + { + "epoch": 2.3216469629025682, + "grad_norm": 5.850046157836914, + "learning_rate": 7.099130316619106e-06, + "loss": 2.9228, + "step": 34170 + }, + { + "epoch": 2.32198668297323, + "grad_norm": 6.32615327835083, + "learning_rate": 7.09870566653078e-06, + "loss": 2.9082, + "step": 34175 + }, + { + "epoch": 2.322326403043892, + "grad_norm": 7.308278560638428, + "learning_rate": 7.098281016442452e-06, + "loss": 2.8287, + "step": 34180 + }, + { + "epoch": 2.3226661231145536, + "grad_norm": 8.566187858581543, + "learning_rate": 7.097856366354125e-06, + "loss": 3.0107, + "step": 34185 + }, + { + "epoch": 2.323005843185215, + "grad_norm": 6.843099117279053, + "learning_rate": 7.097431716265798e-06, + "loss": 2.9374, + "step": 34190 + }, + { + "epoch": 2.3233455632558773, + "grad_norm": 6.8234405517578125, + "learning_rate": 7.09700706617747e-06, + "loss": 3.1793, + "step": 34195 + }, + { + "epoch": 2.323685283326539, + "grad_norm": 6.4178147315979, + "learning_rate": 7.096582416089143e-06, + "loss": 2.9906, + "step": 34200 + }, + { + "epoch": 2.3240250033972005, + "grad_norm": 8.23788833618164, + "learning_rate": 7.096157766000816e-06, + "loss": 3.0218, + "step": 34205 + }, + { + "epoch": 2.3243647234678626, + "grad_norm": 5.892755031585693, + "learning_rate": 7.095733115912489e-06, + "loss": 3.407, + "step": 34210 + }, + { + "epoch": 2.3247044435385242, + "grad_norm": 6.080620288848877, + "learning_rate": 7.095308465824161e-06, + "loss": 2.9682, + "step": 34215 + }, + { + "epoch": 2.325044163609186, + "grad_norm": 7.708416938781738, + "learning_rate": 7.094883815735834e-06, + "loss": 3.299, + "step": 34220 + }, + { + "epoch": 2.325383883679848, + "grad_norm": 7.846455097198486, + "learning_rate": 7.094459165647507e-06, + "loss": 2.9849, + "step": 34225 + }, + { + "epoch": 2.3257236037505096, + "grad_norm": 6.729650974273682, + "learning_rate": 7.094034515559179e-06, + "loss": 3.1383, + "step": 34230 + }, + { + "epoch": 2.326063323821171, + "grad_norm": 6.480725288391113, + "learning_rate": 7.093609865470853e-06, + "loss": 2.9859, + "step": 34235 + }, + { + "epoch": 2.3264030438918333, + "grad_norm": 5.7536492347717285, + "learning_rate": 7.0931852153825255e-06, + "loss": 3.0658, + "step": 34240 + }, + { + "epoch": 2.326742763962495, + "grad_norm": 6.765872955322266, + "learning_rate": 7.0927605652941975e-06, + "loss": 3.0399, + "step": 34245 + }, + { + "epoch": 2.3270824840331565, + "grad_norm": 7.65557336807251, + "learning_rate": 7.092335915205871e-06, + "loss": 3.0811, + "step": 34250 + }, + { + "epoch": 2.3274222041038186, + "grad_norm": 7.061673641204834, + "learning_rate": 7.091911265117544e-06, + "loss": 3.0785, + "step": 34255 + }, + { + "epoch": 2.3277619241744802, + "grad_norm": 6.716119766235352, + "learning_rate": 7.091486615029216e-06, + "loss": 3.0313, + "step": 34260 + }, + { + "epoch": 2.328101644245142, + "grad_norm": 7.185941696166992, + "learning_rate": 7.0910619649408895e-06, + "loss": 2.9332, + "step": 34265 + }, + { + "epoch": 2.328441364315804, + "grad_norm": 7.559231281280518, + "learning_rate": 7.090637314852562e-06, + "loss": 3.1693, + "step": 34270 + }, + { + "epoch": 2.3287810843864656, + "grad_norm": 6.7212324142456055, + "learning_rate": 7.090212664764234e-06, + "loss": 2.82, + "step": 34275 + }, + { + "epoch": 2.329120804457127, + "grad_norm": 6.419245719909668, + "learning_rate": 7.089788014675908e-06, + "loss": 3.4175, + "step": 34280 + }, + { + "epoch": 2.3294605245277893, + "grad_norm": 6.273334980010986, + "learning_rate": 7.089363364587581e-06, + "loss": 2.9854, + "step": 34285 + }, + { + "epoch": 2.329800244598451, + "grad_norm": 6.622249603271484, + "learning_rate": 7.088938714499253e-06, + "loss": 2.8836, + "step": 34290 + }, + { + "epoch": 2.3301399646691126, + "grad_norm": 5.693356990814209, + "learning_rate": 7.088514064410926e-06, + "loss": 3.0848, + "step": 34295 + }, + { + "epoch": 2.3304796847397746, + "grad_norm": 6.571086883544922, + "learning_rate": 7.088089414322598e-06, + "loss": 2.9287, + "step": 34300 + }, + { + "epoch": 2.3308194048104363, + "grad_norm": 6.725530624389648, + "learning_rate": 7.087664764234271e-06, + "loss": 3.2608, + "step": 34305 + }, + { + "epoch": 2.331159124881098, + "grad_norm": 7.288125038146973, + "learning_rate": 7.087240114145945e-06, + "loss": 3.1232, + "step": 34310 + }, + { + "epoch": 2.33149884495176, + "grad_norm": 8.909781455993652, + "learning_rate": 7.086815464057617e-06, + "loss": 3.1271, + "step": 34315 + }, + { + "epoch": 2.3318385650224216, + "grad_norm": 7.587810516357422, + "learning_rate": 7.0863908139692895e-06, + "loss": 2.7982, + "step": 34320 + }, + { + "epoch": 2.3321782850930832, + "grad_norm": 6.910731792449951, + "learning_rate": 7.085966163880963e-06, + "loss": 2.8603, + "step": 34325 + }, + { + "epoch": 2.3325180051637453, + "grad_norm": 7.826142311096191, + "learning_rate": 7.085541513792635e-06, + "loss": 3.0795, + "step": 34330 + }, + { + "epoch": 2.332857725234407, + "grad_norm": 7.2628583908081055, + "learning_rate": 7.085116863704308e-06, + "loss": 2.8949, + "step": 34335 + }, + { + "epoch": 2.3331974453050686, + "grad_norm": 7.1360182762146, + "learning_rate": 7.0846922136159816e-06, + "loss": 3.0712, + "step": 34340 + }, + { + "epoch": 2.3335371653757306, + "grad_norm": 7.515873432159424, + "learning_rate": 7.0842675635276535e-06, + "loss": 2.9557, + "step": 34345 + }, + { + "epoch": 2.3338768854463923, + "grad_norm": 7.496188640594482, + "learning_rate": 7.083842913439326e-06, + "loss": 3.205, + "step": 34350 + }, + { + "epoch": 2.334216605517054, + "grad_norm": 7.312015533447266, + "learning_rate": 7.083418263351e-06, + "loss": 3.1067, + "step": 34355 + }, + { + "epoch": 2.3345563255877155, + "grad_norm": 6.533201694488525, + "learning_rate": 7.082993613262672e-06, + "loss": 3.2797, + "step": 34360 + }, + { + "epoch": 2.3348960456583776, + "grad_norm": 6.782447338104248, + "learning_rate": 7.082568963174345e-06, + "loss": 3.0867, + "step": 34365 + }, + { + "epoch": 2.3352357657290392, + "grad_norm": 7.695888042449951, + "learning_rate": 7.0821443130860175e-06, + "loss": 3.1609, + "step": 34370 + }, + { + "epoch": 2.335575485799701, + "grad_norm": 6.855913162231445, + "learning_rate": 7.08171966299769e-06, + "loss": 3.1932, + "step": 34375 + }, + { + "epoch": 2.335915205870363, + "grad_norm": 6.661323070526123, + "learning_rate": 7.081295012909363e-06, + "loss": 2.9482, + "step": 34380 + }, + { + "epoch": 2.3362549259410246, + "grad_norm": 7.312204837799072, + "learning_rate": 7.080870362821036e-06, + "loss": 3.0999, + "step": 34385 + }, + { + "epoch": 2.336594646011686, + "grad_norm": 9.701333999633789, + "learning_rate": 7.080445712732709e-06, + "loss": 3.0284, + "step": 34390 + }, + { + "epoch": 2.3369343660823483, + "grad_norm": 8.963165283203125, + "learning_rate": 7.080021062644382e-06, + "loss": 3.1619, + "step": 34395 + }, + { + "epoch": 2.33727408615301, + "grad_norm": 7.98538064956665, + "learning_rate": 7.079596412556054e-06, + "loss": 3.1525, + "step": 34400 + }, + { + "epoch": 2.3376138062236715, + "grad_norm": 6.929330825805664, + "learning_rate": 7.079171762467727e-06, + "loss": 3.0582, + "step": 34405 + }, + { + "epoch": 2.3379535262943336, + "grad_norm": 6.6328630447387695, + "learning_rate": 7.078747112379401e-06, + "loss": 3.1219, + "step": 34410 + }, + { + "epoch": 2.3382932463649952, + "grad_norm": 10.133343696594238, + "learning_rate": 7.078322462291073e-06, + "loss": 3.1447, + "step": 34415 + }, + { + "epoch": 2.338632966435657, + "grad_norm": 7.431289196014404, + "learning_rate": 7.0778978122027455e-06, + "loss": 2.8685, + "step": 34420 + }, + { + "epoch": 2.338972686506319, + "grad_norm": 6.954623699188232, + "learning_rate": 7.077473162114419e-06, + "loss": 2.7852, + "step": 34425 + }, + { + "epoch": 2.3393124065769806, + "grad_norm": 6.6405534744262695, + "learning_rate": 7.077048512026091e-06, + "loss": 3.0652, + "step": 34430 + }, + { + "epoch": 2.339652126647642, + "grad_norm": 5.98737907409668, + "learning_rate": 7.076623861937764e-06, + "loss": 3.2209, + "step": 34435 + }, + { + "epoch": 2.3399918467183043, + "grad_norm": 9.362318992614746, + "learning_rate": 7.076199211849437e-06, + "loss": 2.9577, + "step": 34440 + }, + { + "epoch": 2.340331566788966, + "grad_norm": 6.236973762512207, + "learning_rate": 7.0757745617611095e-06, + "loss": 3.012, + "step": 34445 + }, + { + "epoch": 2.3406712868596276, + "grad_norm": 7.005245208740234, + "learning_rate": 7.075349911672782e-06, + "loss": 3.0624, + "step": 34450 + }, + { + "epoch": 2.3410110069302896, + "grad_norm": 7.6978864669799805, + "learning_rate": 7.074925261584455e-06, + "loss": 3.1747, + "step": 34455 + }, + { + "epoch": 2.3413507270009513, + "grad_norm": 9.402749061584473, + "learning_rate": 7.074500611496128e-06, + "loss": 3.2389, + "step": 34460 + }, + { + "epoch": 2.341690447071613, + "grad_norm": 7.559848308563232, + "learning_rate": 7.0740759614078e-06, + "loss": 3.115, + "step": 34465 + }, + { + "epoch": 2.3420301671422745, + "grad_norm": 5.383645534515381, + "learning_rate": 7.0736513113194735e-06, + "loss": 2.9528, + "step": 34470 + }, + { + "epoch": 2.3423698872129366, + "grad_norm": 7.466920852661133, + "learning_rate": 7.073226661231146e-06, + "loss": 2.8207, + "step": 34475 + }, + { + "epoch": 2.3427096072835982, + "grad_norm": 6.216403484344482, + "learning_rate": 7.072802011142818e-06, + "loss": 3.0484, + "step": 34480 + }, + { + "epoch": 2.34304932735426, + "grad_norm": 7.666809558868408, + "learning_rate": 7.072377361054492e-06, + "loss": 3.0215, + "step": 34485 + }, + { + "epoch": 2.343389047424922, + "grad_norm": 7.35624361038208, + "learning_rate": 7.071952710966165e-06, + "loss": 3.2192, + "step": 34490 + }, + { + "epoch": 2.3437287674955836, + "grad_norm": 7.47675085067749, + "learning_rate": 7.071528060877837e-06, + "loss": 3.1068, + "step": 34495 + }, + { + "epoch": 2.344068487566245, + "grad_norm": 6.949357986450195, + "learning_rate": 7.07110341078951e-06, + "loss": 2.8502, + "step": 34500 + }, + { + "epoch": 2.3444082076369073, + "grad_norm": 7.9908671379089355, + "learning_rate": 7.070678760701183e-06, + "loss": 3.0654, + "step": 34505 + }, + { + "epoch": 2.344747927707569, + "grad_norm": 8.834517478942871, + "learning_rate": 7.070254110612855e-06, + "loss": 3.0602, + "step": 34510 + }, + { + "epoch": 2.3450876477782305, + "grad_norm": 8.857802391052246, + "learning_rate": 7.069829460524529e-06, + "loss": 3.0626, + "step": 34515 + }, + { + "epoch": 2.3454273678488926, + "grad_norm": 7.454411029815674, + "learning_rate": 7.0694048104362015e-06, + "loss": 2.7868, + "step": 34520 + }, + { + "epoch": 2.3457670879195542, + "grad_norm": 6.423980236053467, + "learning_rate": 7.0689801603478735e-06, + "loss": 3.1026, + "step": 34525 + }, + { + "epoch": 2.346106807990216, + "grad_norm": 6.529477596282959, + "learning_rate": 7.068555510259547e-06, + "loss": 3.1909, + "step": 34530 + }, + { + "epoch": 2.346446528060878, + "grad_norm": 7.769742965698242, + "learning_rate": 7.06813086017122e-06, + "loss": 3.1189, + "step": 34535 + }, + { + "epoch": 2.3467862481315396, + "grad_norm": 7.9413557052612305, + "learning_rate": 7.067706210082892e-06, + "loss": 2.9067, + "step": 34540 + }, + { + "epoch": 2.347125968202201, + "grad_norm": 7.5363569259643555, + "learning_rate": 7.0672815599945655e-06, + "loss": 3.1072, + "step": 34545 + }, + { + "epoch": 2.3474656882728633, + "grad_norm": 7.905725002288818, + "learning_rate": 7.0668569099062375e-06, + "loss": 3.1903, + "step": 34550 + }, + { + "epoch": 2.347805408343525, + "grad_norm": 6.179491996765137, + "learning_rate": 7.06643225981791e-06, + "loss": 3.0496, + "step": 34555 + }, + { + "epoch": 2.3481451284141865, + "grad_norm": 6.772598743438721, + "learning_rate": 7.066007609729584e-06, + "loss": 3.208, + "step": 34560 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 7.396958351135254, + "learning_rate": 7.065582959641256e-06, + "loss": 3.2914, + "step": 34565 + }, + { + "epoch": 2.3488245685555103, + "grad_norm": 5.8505024909973145, + "learning_rate": 7.065158309552929e-06, + "loss": 2.9541, + "step": 34570 + }, + { + "epoch": 2.349164288626172, + "grad_norm": 7.929022312164307, + "learning_rate": 7.064733659464602e-06, + "loss": 2.9858, + "step": 34575 + }, + { + "epoch": 2.349504008696834, + "grad_norm": 7.674720764160156, + "learning_rate": 7.064309009376274e-06, + "loss": 3.1905, + "step": 34580 + }, + { + "epoch": 2.3498437287674956, + "grad_norm": 8.69290828704834, + "learning_rate": 7.063884359287947e-06, + "loss": 2.9816, + "step": 34585 + }, + { + "epoch": 2.350183448838157, + "grad_norm": 5.897392272949219, + "learning_rate": 7.063459709199621e-06, + "loss": 3.2372, + "step": 34590 + }, + { + "epoch": 2.3505231689088193, + "grad_norm": 7.226382255554199, + "learning_rate": 7.063035059111293e-06, + "loss": 2.8957, + "step": 34595 + }, + { + "epoch": 2.350862888979481, + "grad_norm": 5.753334045410156, + "learning_rate": 7.0626104090229655e-06, + "loss": 3.1381, + "step": 34600 + }, + { + "epoch": 2.3512026090501426, + "grad_norm": 8.334921836853027, + "learning_rate": 7.062185758934639e-06, + "loss": 3.0883, + "step": 34605 + }, + { + "epoch": 2.3515423291208046, + "grad_norm": 5.625976085662842, + "learning_rate": 7.061761108846311e-06, + "loss": 3.1149, + "step": 34610 + }, + { + "epoch": 2.3518820491914663, + "grad_norm": 6.635734558105469, + "learning_rate": 7.061336458757984e-06, + "loss": 2.9852, + "step": 34615 + }, + { + "epoch": 2.352221769262128, + "grad_norm": 6.093541145324707, + "learning_rate": 7.060911808669657e-06, + "loss": 3.0183, + "step": 34620 + }, + { + "epoch": 2.35256148933279, + "grad_norm": 5.474567890167236, + "learning_rate": 7.0604871585813295e-06, + "loss": 3.0082, + "step": 34625 + }, + { + "epoch": 2.3529012094034516, + "grad_norm": 8.061391830444336, + "learning_rate": 7.060062508493002e-06, + "loss": 3.1637, + "step": 34630 + }, + { + "epoch": 2.3532409294741132, + "grad_norm": 8.823674201965332, + "learning_rate": 7.059637858404675e-06, + "loss": 3.31, + "step": 34635 + }, + { + "epoch": 2.3535806495447753, + "grad_norm": 6.454288005828857, + "learning_rate": 7.059213208316348e-06, + "loss": 3.0017, + "step": 34640 + }, + { + "epoch": 2.353920369615437, + "grad_norm": 5.655095100402832, + "learning_rate": 7.05878855822802e-06, + "loss": 3.0836, + "step": 34645 + }, + { + "epoch": 2.3542600896860986, + "grad_norm": 6.260900020599365, + "learning_rate": 7.0583639081396935e-06, + "loss": 3.0729, + "step": 34650 + }, + { + "epoch": 2.3545998097567606, + "grad_norm": 8.454659461975098, + "learning_rate": 7.057939258051366e-06, + "loss": 3.2846, + "step": 34655 + }, + { + "epoch": 2.3549395298274223, + "grad_norm": 5.9944329261779785, + "learning_rate": 7.057514607963038e-06, + "loss": 3.0454, + "step": 34660 + }, + { + "epoch": 2.355279249898084, + "grad_norm": 7.8738203048706055, + "learning_rate": 7.057089957874712e-06, + "loss": 3.1017, + "step": 34665 + }, + { + "epoch": 2.355618969968746, + "grad_norm": 6.4377593994140625, + "learning_rate": 7.056665307786385e-06, + "loss": 3.0967, + "step": 34670 + }, + { + "epoch": 2.3559586900394076, + "grad_norm": 5.2954511642456055, + "learning_rate": 7.056240657698057e-06, + "loss": 3.0687, + "step": 34675 + }, + { + "epoch": 2.3562984101100692, + "grad_norm": 8.291754722595215, + "learning_rate": 7.05581600760973e-06, + "loss": 3.1029, + "step": 34680 + }, + { + "epoch": 2.3566381301807313, + "grad_norm": 5.819234371185303, + "learning_rate": 7.055391357521403e-06, + "loss": 3.0739, + "step": 34685 + }, + { + "epoch": 2.356977850251393, + "grad_norm": 5.8823347091674805, + "learning_rate": 7.054966707433075e-06, + "loss": 3.0819, + "step": 34690 + }, + { + "epoch": 2.3573175703220546, + "grad_norm": 6.105862140655518, + "learning_rate": 7.054542057344749e-06, + "loss": 2.9349, + "step": 34695 + }, + { + "epoch": 2.357657290392716, + "grad_norm": 6.9983086585998535, + "learning_rate": 7.0541174072564215e-06, + "loss": 3.3236, + "step": 34700 + }, + { + "epoch": 2.3579970104633783, + "grad_norm": 7.2044501304626465, + "learning_rate": 7.0536927571680935e-06, + "loss": 2.9596, + "step": 34705 + }, + { + "epoch": 2.35833673053404, + "grad_norm": 6.742651462554932, + "learning_rate": 7.053268107079767e-06, + "loss": 2.8676, + "step": 34710 + }, + { + "epoch": 2.3586764506047015, + "grad_norm": 7.565903663635254, + "learning_rate": 7.052843456991439e-06, + "loss": 2.8086, + "step": 34715 + }, + { + "epoch": 2.3590161706753636, + "grad_norm": 6.604324817657471, + "learning_rate": 7.052418806903112e-06, + "loss": 3.21, + "step": 34720 + }, + { + "epoch": 2.3593558907460253, + "grad_norm": 7.062246322631836, + "learning_rate": 7.0519941568147855e-06, + "loss": 3.0374, + "step": 34725 + }, + { + "epoch": 2.359695610816687, + "grad_norm": 7.741151809692383, + "learning_rate": 7.0515695067264575e-06, + "loss": 2.9953, + "step": 34730 + }, + { + "epoch": 2.360035330887349, + "grad_norm": 6.569192886352539, + "learning_rate": 7.051144856638131e-06, + "loss": 3.0024, + "step": 34735 + }, + { + "epoch": 2.3603750509580106, + "grad_norm": 6.496488571166992, + "learning_rate": 7.050720206549804e-06, + "loss": 2.7036, + "step": 34740 + }, + { + "epoch": 2.360714771028672, + "grad_norm": 6.557485103607178, + "learning_rate": 7.050295556461476e-06, + "loss": 2.9666, + "step": 34745 + }, + { + "epoch": 2.3610544910993343, + "grad_norm": 6.705279350280762, + "learning_rate": 7.0498709063731495e-06, + "loss": 3.1008, + "step": 34750 + }, + { + "epoch": 2.361394211169996, + "grad_norm": 6.813723087310791, + "learning_rate": 7.049446256284822e-06, + "loss": 3.0999, + "step": 34755 + }, + { + "epoch": 2.3617339312406576, + "grad_norm": 6.200854778289795, + "learning_rate": 7.049021606196494e-06, + "loss": 3.2274, + "step": 34760 + }, + { + "epoch": 2.3620736513113196, + "grad_norm": 7.256714820861816, + "learning_rate": 7.048596956108168e-06, + "loss": 3.036, + "step": 34765 + }, + { + "epoch": 2.3624133713819813, + "grad_norm": 7.301797389984131, + "learning_rate": 7.048172306019841e-06, + "loss": 2.8136, + "step": 34770 + }, + { + "epoch": 2.362753091452643, + "grad_norm": 8.026815414428711, + "learning_rate": 7.047747655931513e-06, + "loss": 3.0972, + "step": 34775 + }, + { + "epoch": 2.363092811523305, + "grad_norm": 6.894765377044678, + "learning_rate": 7.047323005843186e-06, + "loss": 3.0269, + "step": 34780 + }, + { + "epoch": 2.3634325315939666, + "grad_norm": 6.745061874389648, + "learning_rate": 7.046898355754858e-06, + "loss": 3.0333, + "step": 34785 + }, + { + "epoch": 2.3637722516646282, + "grad_norm": 10.03209400177002, + "learning_rate": 7.046473705666531e-06, + "loss": 3.2444, + "step": 34790 + }, + { + "epoch": 2.3641119717352903, + "grad_norm": 5.555551052093506, + "learning_rate": 7.046049055578205e-06, + "loss": 2.9976, + "step": 34795 + }, + { + "epoch": 2.364451691805952, + "grad_norm": 7.598805904388428, + "learning_rate": 7.045624405489877e-06, + "loss": 3.1262, + "step": 34800 + }, + { + "epoch": 2.3647914118766136, + "grad_norm": 5.972809314727783, + "learning_rate": 7.0451997554015495e-06, + "loss": 3.0928, + "step": 34805 + }, + { + "epoch": 2.365131131947275, + "grad_norm": 7.9514079093933105, + "learning_rate": 7.044775105313223e-06, + "loss": 3.1129, + "step": 34810 + }, + { + "epoch": 2.3654708520179373, + "grad_norm": 6.7960052490234375, + "learning_rate": 7.044350455224895e-06, + "loss": 2.8155, + "step": 34815 + }, + { + "epoch": 2.365810572088599, + "grad_norm": 6.6849589347839355, + "learning_rate": 7.043925805136568e-06, + "loss": 2.7216, + "step": 34820 + }, + { + "epoch": 2.3661502921592605, + "grad_norm": 5.720019340515137, + "learning_rate": 7.0435011550482415e-06, + "loss": 2.6875, + "step": 34825 + }, + { + "epoch": 2.3664900122299226, + "grad_norm": 5.976961612701416, + "learning_rate": 7.0430765049599135e-06, + "loss": 2.8287, + "step": 34830 + }, + { + "epoch": 2.3668297323005842, + "grad_norm": 6.590436935424805, + "learning_rate": 7.042651854871586e-06, + "loss": 3.2086, + "step": 34835 + }, + { + "epoch": 2.367169452371246, + "grad_norm": 6.489885330200195, + "learning_rate": 7.04222720478326e-06, + "loss": 3.2451, + "step": 34840 + }, + { + "epoch": 2.367509172441908, + "grad_norm": 7.456089019775391, + "learning_rate": 7.041802554694932e-06, + "loss": 3.0409, + "step": 34845 + }, + { + "epoch": 2.3678488925125696, + "grad_norm": 7.42903470993042, + "learning_rate": 7.041377904606605e-06, + "loss": 3.1157, + "step": 34850 + }, + { + "epoch": 2.368188612583231, + "grad_norm": 6.459178447723389, + "learning_rate": 7.040953254518278e-06, + "loss": 2.9635, + "step": 34855 + }, + { + "epoch": 2.3685283326538933, + "grad_norm": 5.405936241149902, + "learning_rate": 7.04052860442995e-06, + "loss": 2.9606, + "step": 34860 + }, + { + "epoch": 2.368868052724555, + "grad_norm": 6.9020304679870605, + "learning_rate": 7.040103954341623e-06, + "loss": 2.9286, + "step": 34865 + }, + { + "epoch": 2.3692077727952165, + "grad_norm": 8.947012901306152, + "learning_rate": 7.039679304253296e-06, + "loss": 3.1749, + "step": 34870 + }, + { + "epoch": 2.3695474928658786, + "grad_norm": 6.854835510253906, + "learning_rate": 7.039254654164969e-06, + "loss": 3.1048, + "step": 34875 + }, + { + "epoch": 2.3698872129365403, + "grad_norm": 6.4877028465271, + "learning_rate": 7.0388300040766415e-06, + "loss": 3.0209, + "step": 34880 + }, + { + "epoch": 2.370226933007202, + "grad_norm": 7.87687873840332, + "learning_rate": 7.038405353988314e-06, + "loss": 3.1021, + "step": 34885 + }, + { + "epoch": 2.370566653077864, + "grad_norm": 5.703847885131836, + "learning_rate": 7.037980703899987e-06, + "loss": 3.0681, + "step": 34890 + }, + { + "epoch": 2.3709063731485256, + "grad_norm": 7.1646528244018555, + "learning_rate": 7.037556053811659e-06, + "loss": 3.1299, + "step": 34895 + }, + { + "epoch": 2.3712460932191872, + "grad_norm": 7.0988874435424805, + "learning_rate": 7.037131403723333e-06, + "loss": 3.2404, + "step": 34900 + }, + { + "epoch": 2.3715858132898493, + "grad_norm": 7.60968542098999, + "learning_rate": 7.0367067536350055e-06, + "loss": 3.0171, + "step": 34905 + }, + { + "epoch": 2.371925533360511, + "grad_norm": 8.041359901428223, + "learning_rate": 7.0362821035466775e-06, + "loss": 2.8801, + "step": 34910 + }, + { + "epoch": 2.3722652534311726, + "grad_norm": 5.560117244720459, + "learning_rate": 7.035857453458351e-06, + "loss": 3.2399, + "step": 34915 + }, + { + "epoch": 2.3726049735018346, + "grad_norm": 6.498453617095947, + "learning_rate": 7.035432803370024e-06, + "loss": 3.0852, + "step": 34920 + }, + { + "epoch": 2.3729446935724963, + "grad_norm": 7.76530647277832, + "learning_rate": 7.035008153281696e-06, + "loss": 3.0702, + "step": 34925 + }, + { + "epoch": 2.373284413643158, + "grad_norm": 6.105423927307129, + "learning_rate": 7.0345835031933695e-06, + "loss": 3.1252, + "step": 34930 + }, + { + "epoch": 2.37362413371382, + "grad_norm": 8.199880599975586, + "learning_rate": 7.034158853105042e-06, + "loss": 3.2438, + "step": 34935 + }, + { + "epoch": 2.3739638537844816, + "grad_norm": 7.388331890106201, + "learning_rate": 7.033734203016714e-06, + "loss": 3.1991, + "step": 34940 + }, + { + "epoch": 2.3743035738551432, + "grad_norm": 6.781189441680908, + "learning_rate": 7.033309552928388e-06, + "loss": 2.8525, + "step": 34945 + }, + { + "epoch": 2.3746432939258053, + "grad_norm": 8.139293670654297, + "learning_rate": 7.032884902840061e-06, + "loss": 3.1142, + "step": 34950 + }, + { + "epoch": 2.374983013996467, + "grad_norm": 8.786938667297363, + "learning_rate": 7.032460252751733e-06, + "loss": 3.0573, + "step": 34955 + }, + { + "epoch": 2.3753227340671286, + "grad_norm": 6.730849742889404, + "learning_rate": 7.032035602663406e-06, + "loss": 3.1169, + "step": 34960 + }, + { + "epoch": 2.3756624541377906, + "grad_norm": 7.230918884277344, + "learning_rate": 7.031610952575078e-06, + "loss": 3.1016, + "step": 34965 + }, + { + "epoch": 2.3760021742084523, + "grad_norm": 7.340399742126465, + "learning_rate": 7.031186302486751e-06, + "loss": 3.0831, + "step": 34970 + }, + { + "epoch": 2.376341894279114, + "grad_norm": 7.149040222167969, + "learning_rate": 7.030761652398425e-06, + "loss": 3.0964, + "step": 34975 + }, + { + "epoch": 2.376681614349776, + "grad_norm": 7.394927501678467, + "learning_rate": 7.030337002310097e-06, + "loss": 2.9287, + "step": 34980 + }, + { + "epoch": 2.3770213344204376, + "grad_norm": 7.475829601287842, + "learning_rate": 7.0299123522217695e-06, + "loss": 2.9736, + "step": 34985 + }, + { + "epoch": 2.3773610544910992, + "grad_norm": 6.690319061279297, + "learning_rate": 7.029487702133443e-06, + "loss": 3.2979, + "step": 34990 + }, + { + "epoch": 2.3777007745617613, + "grad_norm": 6.799644947052002, + "learning_rate": 7.029063052045115e-06, + "loss": 2.9537, + "step": 34995 + }, + { + "epoch": 2.378040494632423, + "grad_norm": 7.417754173278809, + "learning_rate": 7.028638401956788e-06, + "loss": 3.1889, + "step": 35000 + }, + { + "epoch": 2.3783802147030846, + "grad_norm": 7.847931385040283, + "learning_rate": 7.0282137518684615e-06, + "loss": 2.8724, + "step": 35005 + }, + { + "epoch": 2.3787199347737467, + "grad_norm": 6.920871734619141, + "learning_rate": 7.0277891017801335e-06, + "loss": 3.2002, + "step": 35010 + }, + { + "epoch": 2.3790596548444083, + "grad_norm": 7.358174800872803, + "learning_rate": 7.027364451691806e-06, + "loss": 2.9965, + "step": 35015 + }, + { + "epoch": 2.37939937491507, + "grad_norm": 6.514119625091553, + "learning_rate": 7.02693980160348e-06, + "loss": 3.1375, + "step": 35020 + }, + { + "epoch": 2.379739094985732, + "grad_norm": 7.671753883361816, + "learning_rate": 7.026515151515152e-06, + "loss": 3.1734, + "step": 35025 + }, + { + "epoch": 2.3800788150563936, + "grad_norm": 6.461233139038086, + "learning_rate": 7.026090501426825e-06, + "loss": 3.08, + "step": 35030 + }, + { + "epoch": 2.3804185351270553, + "grad_norm": 6.628191947937012, + "learning_rate": 7.0256658513384975e-06, + "loss": 3.0924, + "step": 35035 + }, + { + "epoch": 2.380758255197717, + "grad_norm": 10.060201644897461, + "learning_rate": 7.02524120125017e-06, + "loss": 3.2386, + "step": 35040 + }, + { + "epoch": 2.381097975268379, + "grad_norm": 8.884075164794922, + "learning_rate": 7.024816551161843e-06, + "loss": 2.8029, + "step": 35045 + }, + { + "epoch": 2.3814376953390406, + "grad_norm": 6.2296905517578125, + "learning_rate": 7.024391901073516e-06, + "loss": 2.7085, + "step": 35050 + }, + { + "epoch": 2.3817774154097022, + "grad_norm": 10.491393089294434, + "learning_rate": 7.023967250985189e-06, + "loss": 3.0135, + "step": 35055 + }, + { + "epoch": 2.3821171354803643, + "grad_norm": 6.783937454223633, + "learning_rate": 7.023542600896861e-06, + "loss": 3.1176, + "step": 35060 + }, + { + "epoch": 2.382456855551026, + "grad_norm": 6.1174492835998535, + "learning_rate": 7.023117950808534e-06, + "loss": 3.0132, + "step": 35065 + }, + { + "epoch": 2.3827965756216876, + "grad_norm": 8.974913597106934, + "learning_rate": 7.022693300720207e-06, + "loss": 3.0334, + "step": 35070 + }, + { + "epoch": 2.3831362956923496, + "grad_norm": 7.1983184814453125, + "learning_rate": 7.022268650631881e-06, + "loss": 3.013, + "step": 35075 + }, + { + "epoch": 2.3834760157630113, + "grad_norm": 6.762165069580078, + "learning_rate": 7.021844000543553e-06, + "loss": 3.0697, + "step": 35080 + }, + { + "epoch": 2.383815735833673, + "grad_norm": 8.905295372009277, + "learning_rate": 7.0214193504552255e-06, + "loss": 2.9615, + "step": 35085 + }, + { + "epoch": 2.384155455904335, + "grad_norm": 6.789909839630127, + "learning_rate": 7.020994700366899e-06, + "loss": 2.9679, + "step": 35090 + }, + { + "epoch": 2.3844951759749966, + "grad_norm": 7.629694938659668, + "learning_rate": 7.020570050278571e-06, + "loss": 3.2487, + "step": 35095 + }, + { + "epoch": 2.3848348960456582, + "grad_norm": 6.027575492858887, + "learning_rate": 7.020145400190244e-06, + "loss": 3.2235, + "step": 35100 + }, + { + "epoch": 2.3851746161163203, + "grad_norm": 5.934464931488037, + "learning_rate": 7.0197207501019175e-06, + "loss": 3.2471, + "step": 35105 + }, + { + "epoch": 2.385514336186982, + "grad_norm": 7.2538743019104, + "learning_rate": 7.0192961000135895e-06, + "loss": 3.104, + "step": 35110 + }, + { + "epoch": 2.3858540562576436, + "grad_norm": 7.059278964996338, + "learning_rate": 7.018871449925262e-06, + "loss": 3.0738, + "step": 35115 + }, + { + "epoch": 2.3861937763283056, + "grad_norm": 7.95205020904541, + "learning_rate": 7.018446799836935e-06, + "loss": 3.3237, + "step": 35120 + }, + { + "epoch": 2.3865334963989673, + "grad_norm": 6.852903842926025, + "learning_rate": 7.018022149748608e-06, + "loss": 3.1964, + "step": 35125 + }, + { + "epoch": 2.386873216469629, + "grad_norm": 7.098516464233398, + "learning_rate": 7.01759749966028e-06, + "loss": 3.2686, + "step": 35130 + }, + { + "epoch": 2.387212936540291, + "grad_norm": 7.044128894805908, + "learning_rate": 7.0171728495719535e-06, + "loss": 2.9901, + "step": 35135 + }, + { + "epoch": 2.3875526566109526, + "grad_norm": 5.975692272186279, + "learning_rate": 7.016748199483626e-06, + "loss": 2.9064, + "step": 35140 + }, + { + "epoch": 2.3878923766816142, + "grad_norm": 6.3419294357299805, + "learning_rate": 7.016323549395298e-06, + "loss": 3.056, + "step": 35145 + }, + { + "epoch": 2.388232096752276, + "grad_norm": 9.11259937286377, + "learning_rate": 7.015898899306972e-06, + "loss": 3.0509, + "step": 35150 + }, + { + "epoch": 2.388571816822938, + "grad_norm": 5.770564556121826, + "learning_rate": 7.015474249218645e-06, + "loss": 3.0472, + "step": 35155 + }, + { + "epoch": 2.3889115368935996, + "grad_norm": 6.701061248779297, + "learning_rate": 7.015049599130317e-06, + "loss": 3.2108, + "step": 35160 + }, + { + "epoch": 2.389251256964261, + "grad_norm": 8.104769706726074, + "learning_rate": 7.01462494904199e-06, + "loss": 3.1514, + "step": 35165 + }, + { + "epoch": 2.3895909770349233, + "grad_norm": 7.603956699371338, + "learning_rate": 7.014200298953663e-06, + "loss": 2.9043, + "step": 35170 + }, + { + "epoch": 2.389930697105585, + "grad_norm": 8.018089294433594, + "learning_rate": 7.013775648865335e-06, + "loss": 3.1729, + "step": 35175 + }, + { + "epoch": 2.3902704171762466, + "grad_norm": 7.573482513427734, + "learning_rate": 7.013350998777009e-06, + "loss": 2.9643, + "step": 35180 + }, + { + "epoch": 2.3906101372469086, + "grad_norm": 8.219799995422363, + "learning_rate": 7.0129263486886815e-06, + "loss": 3.1012, + "step": 35185 + }, + { + "epoch": 2.3909498573175703, + "grad_norm": 6.899787425994873, + "learning_rate": 7.0125016986003535e-06, + "loss": 2.9811, + "step": 35190 + }, + { + "epoch": 2.391289577388232, + "grad_norm": 7.262122631072998, + "learning_rate": 7.012077048512027e-06, + "loss": 3.2181, + "step": 35195 + }, + { + "epoch": 2.391629297458894, + "grad_norm": 6.875399589538574, + "learning_rate": 7.0116523984237e-06, + "loss": 3.2524, + "step": 35200 + }, + { + "epoch": 2.3919690175295556, + "grad_norm": 8.509119987487793, + "learning_rate": 7.011227748335372e-06, + "loss": 2.708, + "step": 35205 + }, + { + "epoch": 2.3923087376002172, + "grad_norm": 7.277152061462402, + "learning_rate": 7.0108030982470455e-06, + "loss": 3.566, + "step": 35210 + }, + { + "epoch": 2.3926484576708793, + "grad_norm": 7.2408833503723145, + "learning_rate": 7.010463378176383e-06, + "loss": 3.0921, + "step": 35215 + }, + { + "epoch": 2.392988177741541, + "grad_norm": 6.284710884094238, + "learning_rate": 7.0100387280880555e-06, + "loss": 3.1206, + "step": 35220 + }, + { + "epoch": 2.3933278978122026, + "grad_norm": 8.25841999053955, + "learning_rate": 7.009614077999729e-06, + "loss": 3.2963, + "step": 35225 + }, + { + "epoch": 2.3936676178828646, + "grad_norm": 8.16793441772461, + "learning_rate": 7.009189427911401e-06, + "loss": 3.1824, + "step": 35230 + }, + { + "epoch": 2.3940073379535263, + "grad_norm": 7.447442531585693, + "learning_rate": 7.008764777823074e-06, + "loss": 2.9635, + "step": 35235 + }, + { + "epoch": 2.394347058024188, + "grad_norm": 7.561320781707764, + "learning_rate": 7.008340127734748e-06, + "loss": 3.1118, + "step": 35240 + }, + { + "epoch": 2.39468677809485, + "grad_norm": 6.391260147094727, + "learning_rate": 7.0079154776464195e-06, + "loss": 2.8323, + "step": 35245 + }, + { + "epoch": 2.3950264981655116, + "grad_norm": 8.017400741577148, + "learning_rate": 7.007490827558092e-06, + "loss": 2.9814, + "step": 35250 + }, + { + "epoch": 2.3953662182361732, + "grad_norm": 6.064600944519043, + "learning_rate": 7.007066177469766e-06, + "loss": 2.8874, + "step": 35255 + }, + { + "epoch": 2.3957059383068353, + "grad_norm": 8.449888229370117, + "learning_rate": 7.006641527381438e-06, + "loss": 3.0252, + "step": 35260 + }, + { + "epoch": 2.396045658377497, + "grad_norm": 6.337996006011963, + "learning_rate": 7.006216877293111e-06, + "loss": 3.3276, + "step": 35265 + }, + { + "epoch": 2.3963853784481586, + "grad_norm": 6.975801944732666, + "learning_rate": 7.005792227204784e-06, + "loss": 3.1818, + "step": 35270 + }, + { + "epoch": 2.3967250985188207, + "grad_norm": 6.331030368804932, + "learning_rate": 7.005367577116456e-06, + "loss": 3.009, + "step": 35275 + }, + { + "epoch": 2.3970648185894823, + "grad_norm": 5.675800323486328, + "learning_rate": 7.00494292702813e-06, + "loss": 3.0944, + "step": 35280 + }, + { + "epoch": 2.397404538660144, + "grad_norm": 6.138897895812988, + "learning_rate": 7.004518276939802e-06, + "loss": 3.1097, + "step": 35285 + }, + { + "epoch": 2.397744258730806, + "grad_norm": 6.9565606117248535, + "learning_rate": 7.004093626851475e-06, + "loss": 3.1853, + "step": 35290 + }, + { + "epoch": 2.3980839788014676, + "grad_norm": 5.992327690124512, + "learning_rate": 7.003668976763148e-06, + "loss": 3.1384, + "step": 35295 + }, + { + "epoch": 2.3984236988721293, + "grad_norm": 7.870963096618652, + "learning_rate": 7.00324432667482e-06, + "loss": 3.0483, + "step": 35300 + }, + { + "epoch": 2.3987634189427913, + "grad_norm": 6.2442851066589355, + "learning_rate": 7.002819676586493e-06, + "loss": 3.2051, + "step": 35305 + }, + { + "epoch": 2.399103139013453, + "grad_norm": 7.0837273597717285, + "learning_rate": 7.002395026498167e-06, + "loss": 2.8037, + "step": 35310 + }, + { + "epoch": 2.3994428590841146, + "grad_norm": 8.114240646362305, + "learning_rate": 7.001970376409839e-06, + "loss": 2.882, + "step": 35315 + }, + { + "epoch": 2.3997825791547767, + "grad_norm": 6.946849822998047, + "learning_rate": 7.0015457263215115e-06, + "loss": 3.0795, + "step": 35320 + }, + { + "epoch": 2.4001222992254383, + "grad_norm": 7.209394931793213, + "learning_rate": 7.001121076233185e-06, + "loss": 3.1004, + "step": 35325 + }, + { + "epoch": 2.4004620192961, + "grad_norm": 6.519365310668945, + "learning_rate": 7.000696426144857e-06, + "loss": 3.0017, + "step": 35330 + }, + { + "epoch": 2.400801739366762, + "grad_norm": 5.9612250328063965, + "learning_rate": 7.00027177605653e-06, + "loss": 3.2308, + "step": 35335 + }, + { + "epoch": 2.4011414594374236, + "grad_norm": 6.373744010925293, + "learning_rate": 6.999847125968204e-06, + "loss": 3.0456, + "step": 35340 + }, + { + "epoch": 2.4014811795080853, + "grad_norm": 7.51680850982666, + "learning_rate": 6.9994224758798755e-06, + "loss": 3.0835, + "step": 35345 + }, + { + "epoch": 2.4018208995787473, + "grad_norm": 8.25805377960205, + "learning_rate": 6.998997825791548e-06, + "loss": 2.8368, + "step": 35350 + }, + { + "epoch": 2.402160619649409, + "grad_norm": 8.330673217773438, + "learning_rate": 6.998573175703222e-06, + "loss": 2.9586, + "step": 35355 + }, + { + "epoch": 2.4025003397200706, + "grad_norm": 9.206212043762207, + "learning_rate": 6.998148525614894e-06, + "loss": 2.9962, + "step": 35360 + }, + { + "epoch": 2.4028400597907327, + "grad_norm": 8.9185209274292, + "learning_rate": 6.997723875526567e-06, + "loss": 3.0199, + "step": 35365 + }, + { + "epoch": 2.4031797798613943, + "grad_norm": 6.59260368347168, + "learning_rate": 6.9972992254382396e-06, + "loss": 3.1094, + "step": 35370 + }, + { + "epoch": 2.403519499932056, + "grad_norm": 5.939663887023926, + "learning_rate": 6.996874575349912e-06, + "loss": 2.8983, + "step": 35375 + }, + { + "epoch": 2.4038592200027176, + "grad_norm": 7.438543796539307, + "learning_rate": 6.996449925261585e-06, + "loss": 2.9844, + "step": 35380 + }, + { + "epoch": 2.4041989400733796, + "grad_norm": 7.118528366088867, + "learning_rate": 6.996025275173258e-06, + "loss": 3.0971, + "step": 35385 + }, + { + "epoch": 2.4045386601440413, + "grad_norm": 6.258388519287109, + "learning_rate": 6.995600625084931e-06, + "loss": 3.1117, + "step": 35390 + }, + { + "epoch": 2.404878380214703, + "grad_norm": 7.618902206420898, + "learning_rate": 6.995175974996603e-06, + "loss": 3.0029, + "step": 35395 + }, + { + "epoch": 2.405218100285365, + "grad_norm": 6.526343822479248, + "learning_rate": 6.994751324908276e-06, + "loss": 3.1078, + "step": 35400 + }, + { + "epoch": 2.4055578203560266, + "grad_norm": 7.629055023193359, + "learning_rate": 6.994326674819949e-06, + "loss": 3.2671, + "step": 35405 + }, + { + "epoch": 2.4058975404266882, + "grad_norm": 7.71380090713501, + "learning_rate": 6.993902024731621e-06, + "loss": 2.8778, + "step": 35410 + }, + { + "epoch": 2.4062372604973503, + "grad_norm": 6.157193183898926, + "learning_rate": 6.993477374643295e-06, + "loss": 2.8441, + "step": 35415 + }, + { + "epoch": 2.406576980568012, + "grad_norm": 6.111415386199951, + "learning_rate": 6.9930527245549676e-06, + "loss": 3.0952, + "step": 35420 + }, + { + "epoch": 2.4069167006386736, + "grad_norm": 8.008645057678223, + "learning_rate": 6.9926280744666395e-06, + "loss": 3.2267, + "step": 35425 + }, + { + "epoch": 2.4072564207093357, + "grad_norm": 6.725258827209473, + "learning_rate": 6.992203424378313e-06, + "loss": 3.1837, + "step": 35430 + }, + { + "epoch": 2.4075961407799973, + "grad_norm": 7.8250627517700195, + "learning_rate": 6.991778774289986e-06, + "loss": 2.9933, + "step": 35435 + }, + { + "epoch": 2.407935860850659, + "grad_norm": 6.294147491455078, + "learning_rate": 6.991354124201658e-06, + "loss": 2.8709, + "step": 35440 + }, + { + "epoch": 2.408275580921321, + "grad_norm": 7.445245265960693, + "learning_rate": 6.9909294741133316e-06, + "loss": 3.2712, + "step": 35445 + }, + { + "epoch": 2.4086153009919826, + "grad_norm": 8.01440715789795, + "learning_rate": 6.990504824025004e-06, + "loss": 3.3073, + "step": 35450 + }, + { + "epoch": 2.4089550210626443, + "grad_norm": 6.860519886016846, + "learning_rate": 6.990080173936676e-06, + "loss": 3.0991, + "step": 35455 + }, + { + "epoch": 2.4092947411333063, + "grad_norm": 7.087556838989258, + "learning_rate": 6.98965552384835e-06, + "loss": 3.1976, + "step": 35460 + }, + { + "epoch": 2.409634461203968, + "grad_norm": 7.2742533683776855, + "learning_rate": 6.989230873760022e-06, + "loss": 2.8891, + "step": 35465 + }, + { + "epoch": 2.4099741812746296, + "grad_norm": 5.7927165031433105, + "learning_rate": 6.988806223671695e-06, + "loss": 3.1708, + "step": 35470 + }, + { + "epoch": 2.4103139013452917, + "grad_norm": 5.882875919342041, + "learning_rate": 6.988381573583368e-06, + "loss": 2.9884, + "step": 35475 + }, + { + "epoch": 2.4106536214159533, + "grad_norm": 5.4171037673950195, + "learning_rate": 6.98795692349504e-06, + "loss": 3.0113, + "step": 35480 + }, + { + "epoch": 2.410993341486615, + "grad_norm": 8.50019645690918, + "learning_rate": 6.987532273406713e-06, + "loss": 3.1332, + "step": 35485 + }, + { + "epoch": 2.4113330615572766, + "grad_norm": 6.975743770599365, + "learning_rate": 6.987107623318387e-06, + "loss": 3.2849, + "step": 35490 + }, + { + "epoch": 2.4116727816279386, + "grad_norm": 7.828217029571533, + "learning_rate": 6.986682973230059e-06, + "loss": 3.198, + "step": 35495 + }, + { + "epoch": 2.4120125016986003, + "grad_norm": 7.573753833770752, + "learning_rate": 6.9862583231417315e-06, + "loss": 3.2306, + "step": 35500 + }, + { + "epoch": 2.412352221769262, + "grad_norm": 6.596642017364502, + "learning_rate": 6.985833673053405e-06, + "loss": 2.8484, + "step": 35505 + }, + { + "epoch": 2.412691941839924, + "grad_norm": 7.000308513641357, + "learning_rate": 6.985409022965077e-06, + "loss": 3.0237, + "step": 35510 + }, + { + "epoch": 2.4130316619105856, + "grad_norm": 6.046454429626465, + "learning_rate": 6.98498437287675e-06, + "loss": 2.9573, + "step": 35515 + }, + { + "epoch": 2.4133713819812472, + "grad_norm": 8.990402221679688, + "learning_rate": 6.9845597227884236e-06, + "loss": 3.3632, + "step": 35520 + }, + { + "epoch": 2.4137111020519093, + "grad_norm": 7.437816143035889, + "learning_rate": 6.9841350727000955e-06, + "loss": 3.0401, + "step": 35525 + }, + { + "epoch": 2.414050822122571, + "grad_norm": 7.371769905090332, + "learning_rate": 6.983710422611768e-06, + "loss": 3.1884, + "step": 35530 + }, + { + "epoch": 2.4143905421932326, + "grad_norm": 5.882689952850342, + "learning_rate": 6.983285772523441e-06, + "loss": 3.1937, + "step": 35535 + }, + { + "epoch": 2.4147302622638946, + "grad_norm": 7.001346111297607, + "learning_rate": 6.982861122435114e-06, + "loss": 3.1362, + "step": 35540 + }, + { + "epoch": 2.4150699823345563, + "grad_norm": 6.556390285491943, + "learning_rate": 6.982436472346787e-06, + "loss": 2.9258, + "step": 35545 + }, + { + "epoch": 2.415409702405218, + "grad_norm": 7.070376873016357, + "learning_rate": 6.9820118222584595e-06, + "loss": 2.7434, + "step": 35550 + }, + { + "epoch": 2.41574942247588, + "grad_norm": 6.807934761047363, + "learning_rate": 6.981587172170132e-06, + "loss": 3.228, + "step": 35555 + }, + { + "epoch": 2.4160891425465416, + "grad_norm": 6.191982269287109, + "learning_rate": 6.981162522081804e-06, + "loss": 3.1264, + "step": 35560 + }, + { + "epoch": 2.4164288626172032, + "grad_norm": 6.146716594696045, + "learning_rate": 6.980737871993478e-06, + "loss": 2.8773, + "step": 35565 + }, + { + "epoch": 2.4167685826878653, + "grad_norm": 7.759289264678955, + "learning_rate": 6.980313221905151e-06, + "loss": 2.9768, + "step": 35570 + }, + { + "epoch": 2.417108302758527, + "grad_norm": 5.595877647399902, + "learning_rate": 6.979888571816823e-06, + "loss": 2.9815, + "step": 35575 + }, + { + "epoch": 2.4174480228291886, + "grad_norm": 7.08402681350708, + "learning_rate": 6.979463921728496e-06, + "loss": 3.1092, + "step": 35580 + }, + { + "epoch": 2.4177877428998507, + "grad_norm": 5.8265557289123535, + "learning_rate": 6.979039271640169e-06, + "loss": 3.2825, + "step": 35585 + }, + { + "epoch": 2.4181274629705123, + "grad_norm": 6.349917411804199, + "learning_rate": 6.978614621551841e-06, + "loss": 3.0846, + "step": 35590 + }, + { + "epoch": 2.418467183041174, + "grad_norm": 8.861905097961426, + "learning_rate": 6.978189971463515e-06, + "loss": 3.3493, + "step": 35595 + }, + { + "epoch": 2.418806903111836, + "grad_norm": 8.208427429199219, + "learning_rate": 6.9777653213751875e-06, + "loss": 3.1139, + "step": 35600 + }, + { + "epoch": 2.4191466231824976, + "grad_norm": 6.036651134490967, + "learning_rate": 6.9773406712868595e-06, + "loss": 3.0804, + "step": 35605 + }, + { + "epoch": 2.4194863432531593, + "grad_norm": 7.093289852142334, + "learning_rate": 6.976916021198533e-06, + "loss": 3.1255, + "step": 35610 + }, + { + "epoch": 2.4198260633238213, + "grad_norm": 7.914956569671631, + "learning_rate": 6.976491371110206e-06, + "loss": 3.1777, + "step": 35615 + }, + { + "epoch": 2.420165783394483, + "grad_norm": 5.996580123901367, + "learning_rate": 6.976066721021879e-06, + "loss": 3.193, + "step": 35620 + }, + { + "epoch": 2.4205055034651446, + "grad_norm": 7.023397922515869, + "learning_rate": 6.9756420709335515e-06, + "loss": 2.9493, + "step": 35625 + }, + { + "epoch": 2.4208452235358067, + "grad_norm": 8.356377601623535, + "learning_rate": 6.9752174208452235e-06, + "loss": 3.1002, + "step": 35630 + }, + { + "epoch": 2.4211849436064683, + "grad_norm": 6.306221961975098, + "learning_rate": 6.974792770756897e-06, + "loss": 3.2354, + "step": 35635 + }, + { + "epoch": 2.42152466367713, + "grad_norm": 7.045029163360596, + "learning_rate": 6.97436812066857e-06, + "loss": 2.8903, + "step": 35640 + }, + { + "epoch": 2.421864383747792, + "grad_norm": 6.211605548858643, + "learning_rate": 6.973943470580242e-06, + "loss": 2.982, + "step": 35645 + }, + { + "epoch": 2.4222041038184536, + "grad_norm": 7.123550891876221, + "learning_rate": 6.9735188204919155e-06, + "loss": 2.9282, + "step": 35650 + }, + { + "epoch": 2.4225438238891153, + "grad_norm": 6.087734699249268, + "learning_rate": 6.973094170403588e-06, + "loss": 2.8894, + "step": 35655 + }, + { + "epoch": 2.4228835439597773, + "grad_norm": 6.967028617858887, + "learning_rate": 6.97266952031526e-06, + "loss": 3.2249, + "step": 35660 + }, + { + "epoch": 2.423223264030439, + "grad_norm": 6.1157612800598145, + "learning_rate": 6.972244870226934e-06, + "loss": 3.0784, + "step": 35665 + }, + { + "epoch": 2.4235629841011006, + "grad_norm": 7.240536689758301, + "learning_rate": 6.971820220138607e-06, + "loss": 3.0651, + "step": 35670 + }, + { + "epoch": 2.4239027041717627, + "grad_norm": 9.556371688842773, + "learning_rate": 6.971395570050279e-06, + "loss": 3.0225, + "step": 35675 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 7.294412136077881, + "learning_rate": 6.970970919961952e-06, + "loss": 3.0288, + "step": 35680 + }, + { + "epoch": 2.424582144313086, + "grad_norm": 6.604001522064209, + "learning_rate": 6.970546269873625e-06, + "loss": 3.0595, + "step": 35685 + }, + { + "epoch": 2.424921864383748, + "grad_norm": 6.616589546203613, + "learning_rate": 6.970121619785297e-06, + "loss": 3.0272, + "step": 35690 + }, + { + "epoch": 2.4252615844544096, + "grad_norm": 7.17515754699707, + "learning_rate": 6.969696969696971e-06, + "loss": 2.8674, + "step": 35695 + }, + { + "epoch": 2.4256013045250713, + "grad_norm": 7.033890247344971, + "learning_rate": 6.9692723196086436e-06, + "loss": 3.0626, + "step": 35700 + }, + { + "epoch": 2.4259410245957334, + "grad_norm": 7.517335414886475, + "learning_rate": 6.9688476695203155e-06, + "loss": 2.7262, + "step": 35705 + }, + { + "epoch": 2.426280744666395, + "grad_norm": 5.902669906616211, + "learning_rate": 6.968423019431989e-06, + "loss": 3.1894, + "step": 35710 + }, + { + "epoch": 2.4266204647370566, + "grad_norm": 6.056654930114746, + "learning_rate": 6.967998369343661e-06, + "loss": 2.8793, + "step": 35715 + }, + { + "epoch": 2.4269601848077182, + "grad_norm": 7.224855899810791, + "learning_rate": 6.967573719255334e-06, + "loss": 3.0525, + "step": 35720 + }, + { + "epoch": 2.4272999048783803, + "grad_norm": 5.5433268547058105, + "learning_rate": 6.9671490691670076e-06, + "loss": 2.9025, + "step": 35725 + }, + { + "epoch": 2.427639624949042, + "grad_norm": 7.341770648956299, + "learning_rate": 6.9667244190786795e-06, + "loss": 3.2043, + "step": 35730 + }, + { + "epoch": 2.4279793450197036, + "grad_norm": 6.465906143188477, + "learning_rate": 6.966299768990352e-06, + "loss": 2.9752, + "step": 35735 + }, + { + "epoch": 2.4283190650903657, + "grad_norm": 6.346965312957764, + "learning_rate": 6.965875118902026e-06, + "loss": 2.9342, + "step": 35740 + }, + { + "epoch": 2.4286587851610273, + "grad_norm": 7.334663391113281, + "learning_rate": 6.965450468813698e-06, + "loss": 2.8069, + "step": 35745 + }, + { + "epoch": 2.428998505231689, + "grad_norm": 7.538919448852539, + "learning_rate": 6.965025818725371e-06, + "loss": 3.1307, + "step": 35750 + }, + { + "epoch": 2.429338225302351, + "grad_norm": 8.551025390625, + "learning_rate": 6.964601168637044e-06, + "loss": 2.9985, + "step": 35755 + }, + { + "epoch": 2.4296779453730126, + "grad_norm": 6.633123397827148, + "learning_rate": 6.964176518548716e-06, + "loss": 3.331, + "step": 35760 + }, + { + "epoch": 2.4300176654436743, + "grad_norm": 8.607592582702637, + "learning_rate": 6.963751868460389e-06, + "loss": 3.1882, + "step": 35765 + }, + { + "epoch": 2.4303573855143363, + "grad_norm": 6.925440311431885, + "learning_rate": 6.963327218372063e-06, + "loss": 3.1434, + "step": 35770 + }, + { + "epoch": 2.430697105584998, + "grad_norm": 7.657562255859375, + "learning_rate": 6.962902568283735e-06, + "loss": 3.0268, + "step": 35775 + }, + { + "epoch": 2.4310368256556596, + "grad_norm": 5.969607830047607, + "learning_rate": 6.9624779181954075e-06, + "loss": 2.679, + "step": 35780 + }, + { + "epoch": 2.4313765457263217, + "grad_norm": 7.906417369842529, + "learning_rate": 6.96205326810708e-06, + "loss": 3.1187, + "step": 35785 + }, + { + "epoch": 2.4317162657969833, + "grad_norm": 7.013284206390381, + "learning_rate": 6.961628618018753e-06, + "loss": 2.9656, + "step": 35790 + }, + { + "epoch": 2.432055985867645, + "grad_norm": 8.539687156677246, + "learning_rate": 6.961203967930426e-06, + "loss": 3.271, + "step": 35795 + }, + { + "epoch": 2.432395705938307, + "grad_norm": 6.185183525085449, + "learning_rate": 6.960779317842099e-06, + "loss": 3.0713, + "step": 35800 + }, + { + "epoch": 2.4327354260089686, + "grad_norm": 8.945923805236816, + "learning_rate": 6.9603546677537715e-06, + "loss": 2.9952, + "step": 35805 + }, + { + "epoch": 2.4330751460796303, + "grad_norm": 6.789841175079346, + "learning_rate": 6.9599300176654435e-06, + "loss": 3.1085, + "step": 35810 + }, + { + "epoch": 2.4334148661502923, + "grad_norm": 6.586999893188477, + "learning_rate": 6.959505367577117e-06, + "loss": 2.9942, + "step": 35815 + }, + { + "epoch": 2.433754586220954, + "grad_norm": 5.535648822784424, + "learning_rate": 6.95908071748879e-06, + "loss": 3.2082, + "step": 35820 + }, + { + "epoch": 2.4340943062916156, + "grad_norm": 7.344588756561279, + "learning_rate": 6.958656067400462e-06, + "loss": 3.1079, + "step": 35825 + }, + { + "epoch": 2.4344340263622772, + "grad_norm": 5.600092887878418, + "learning_rate": 6.9582314173121355e-06, + "loss": 2.9872, + "step": 35830 + }, + { + "epoch": 2.4347737464329393, + "grad_norm": 6.3509931564331055, + "learning_rate": 6.957806767223808e-06, + "loss": 2.8683, + "step": 35835 + }, + { + "epoch": 2.435113466503601, + "grad_norm": 6.587081432342529, + "learning_rate": 6.95738211713548e-06, + "loss": 3.256, + "step": 35840 + }, + { + "epoch": 2.4354531865742626, + "grad_norm": 8.162163734436035, + "learning_rate": 6.956957467047154e-06, + "loss": 2.9227, + "step": 35845 + }, + { + "epoch": 2.4357929066449246, + "grad_norm": 7.767141342163086, + "learning_rate": 6.956532816958827e-06, + "loss": 3.0404, + "step": 35850 + }, + { + "epoch": 2.4361326267155863, + "grad_norm": 6.8585286140441895, + "learning_rate": 6.956108166870499e-06, + "loss": 3.0751, + "step": 35855 + }, + { + "epoch": 2.436472346786248, + "grad_norm": 7.780017852783203, + "learning_rate": 6.955683516782172e-06, + "loss": 3.3455, + "step": 35860 + }, + { + "epoch": 2.43681206685691, + "grad_norm": 7.188108921051025, + "learning_rate": 6.955258866693845e-06, + "loss": 2.9937, + "step": 35865 + }, + { + "epoch": 2.4371517869275716, + "grad_norm": 6.9105224609375, + "learning_rate": 6.954834216605517e-06, + "loss": 3.1178, + "step": 35870 + }, + { + "epoch": 2.4374915069982332, + "grad_norm": 6.095139026641846, + "learning_rate": 6.954409566517191e-06, + "loss": 3.0364, + "step": 35875 + }, + { + "epoch": 2.4378312270688953, + "grad_norm": 7.198815822601318, + "learning_rate": 6.953984916428863e-06, + "loss": 3.0749, + "step": 35880 + }, + { + "epoch": 2.438170947139557, + "grad_norm": 8.350829124450684, + "learning_rate": 6.9535602663405355e-06, + "loss": 2.9382, + "step": 35885 + }, + { + "epoch": 2.4385106672102186, + "grad_norm": 8.039674758911133, + "learning_rate": 6.953135616252209e-06, + "loss": 3.1158, + "step": 35890 + }, + { + "epoch": 2.4388503872808807, + "grad_norm": 6.73134183883667, + "learning_rate": 6.952710966163881e-06, + "loss": 3.1545, + "step": 35895 + }, + { + "epoch": 2.4391901073515423, + "grad_norm": 6.489910125732422, + "learning_rate": 6.952286316075554e-06, + "loss": 2.9179, + "step": 35900 + }, + { + "epoch": 2.439529827422204, + "grad_norm": 7.798190593719482, + "learning_rate": 6.9518616659872275e-06, + "loss": 2.9643, + "step": 35905 + }, + { + "epoch": 2.439869547492866, + "grad_norm": 5.676476001739502, + "learning_rate": 6.9514370158988995e-06, + "loss": 3.296, + "step": 35910 + }, + { + "epoch": 2.4402092675635276, + "grad_norm": 7.205140113830566, + "learning_rate": 6.951012365810572e-06, + "loss": 3.0987, + "step": 35915 + }, + { + "epoch": 2.4405489876341893, + "grad_norm": 7.273864269256592, + "learning_rate": 6.950587715722246e-06, + "loss": 2.9435, + "step": 35920 + }, + { + "epoch": 2.4408887077048513, + "grad_norm": 5.757659912109375, + "learning_rate": 6.950163065633918e-06, + "loss": 2.8742, + "step": 35925 + }, + { + "epoch": 2.441228427775513, + "grad_norm": 7.247875690460205, + "learning_rate": 6.949738415545591e-06, + "loss": 3.1529, + "step": 35930 + }, + { + "epoch": 2.4415681478461746, + "grad_norm": 7.821478843688965, + "learning_rate": 6.949313765457264e-06, + "loss": 3.3445, + "step": 35935 + }, + { + "epoch": 2.4419078679168367, + "grad_norm": 6.776769161224365, + "learning_rate": 6.948889115368936e-06, + "loss": 3.0176, + "step": 35940 + }, + { + "epoch": 2.4422475879874983, + "grad_norm": 7.430598258972168, + "learning_rate": 6.948464465280609e-06, + "loss": 3.0551, + "step": 35945 + }, + { + "epoch": 2.44258730805816, + "grad_norm": 6.613341808319092, + "learning_rate": 6.948039815192282e-06, + "loss": 3.0374, + "step": 35950 + }, + { + "epoch": 2.442927028128822, + "grad_norm": 5.673040390014648, + "learning_rate": 6.947615165103955e-06, + "loss": 3.246, + "step": 35955 + }, + { + "epoch": 2.4432667481994836, + "grad_norm": 6.198999881744385, + "learning_rate": 6.947190515015628e-06, + "loss": 2.9085, + "step": 35960 + }, + { + "epoch": 2.4436064682701453, + "grad_norm": 6.088625907897949, + "learning_rate": 6.9467658649273e-06, + "loss": 3.0307, + "step": 35965 + }, + { + "epoch": 2.4439461883408073, + "grad_norm": 6.950636386871338, + "learning_rate": 6.946341214838973e-06, + "loss": 2.8972, + "step": 35970 + }, + { + "epoch": 2.444285908411469, + "grad_norm": 5.8473052978515625, + "learning_rate": 6.945916564750647e-06, + "loss": 3.015, + "step": 35975 + }, + { + "epoch": 2.4446256284821306, + "grad_norm": 5.950364589691162, + "learning_rate": 6.945491914662319e-06, + "loss": 2.7907, + "step": 35980 + }, + { + "epoch": 2.4449653485527927, + "grad_norm": 5.771753787994385, + "learning_rate": 6.9450672645739915e-06, + "loss": 2.6746, + "step": 35985 + }, + { + "epoch": 2.4453050686234543, + "grad_norm": 7.00787353515625, + "learning_rate": 6.944642614485665e-06, + "loss": 3.1732, + "step": 35990 + }, + { + "epoch": 2.445644788694116, + "grad_norm": 7.268126964569092, + "learning_rate": 6.944217964397337e-06, + "loss": 3.0366, + "step": 35995 + }, + { + "epoch": 2.445984508764778, + "grad_norm": 7.433817386627197, + "learning_rate": 6.94379331430901e-06, + "loss": 3.2255, + "step": 36000 + }, + { + "epoch": 2.4463242288354397, + "grad_norm": 7.558895587921143, + "learning_rate": 6.9433686642206836e-06, + "loss": 3.1682, + "step": 36005 + }, + { + "epoch": 2.4466639489061013, + "grad_norm": 6.908871650695801, + "learning_rate": 6.9429440141323555e-06, + "loss": 3.0957, + "step": 36010 + }, + { + "epoch": 2.4470036689767634, + "grad_norm": 7.133845806121826, + "learning_rate": 6.942519364044028e-06, + "loss": 3.1091, + "step": 36015 + }, + { + "epoch": 2.447343389047425, + "grad_norm": 6.118066310882568, + "learning_rate": 6.942094713955702e-06, + "loss": 2.8057, + "step": 36020 + }, + { + "epoch": 2.4476831091180866, + "grad_norm": 7.26671028137207, + "learning_rate": 6.941670063867374e-06, + "loss": 3.1009, + "step": 36025 + }, + { + "epoch": 2.4480228291887487, + "grad_norm": 6.292152404785156, + "learning_rate": 6.941245413779047e-06, + "loss": 2.9836, + "step": 36030 + }, + { + "epoch": 2.4483625492594103, + "grad_norm": 6.609243869781494, + "learning_rate": 6.9408207636907195e-06, + "loss": 3.1059, + "step": 36035 + }, + { + "epoch": 2.448702269330072, + "grad_norm": 6.994518756866455, + "learning_rate": 6.940396113602392e-06, + "loss": 2.9485, + "step": 36040 + }, + { + "epoch": 2.449041989400734, + "grad_norm": 5.791663646697998, + "learning_rate": 6.939971463514065e-06, + "loss": 3.2219, + "step": 36045 + }, + { + "epoch": 2.4493817094713957, + "grad_norm": 5.875488758087158, + "learning_rate": 6.939546813425738e-06, + "loss": 3.1335, + "step": 36050 + }, + { + "epoch": 2.4497214295420573, + "grad_norm": 8.291877746582031, + "learning_rate": 6.939122163337411e-06, + "loss": 3.0518, + "step": 36055 + }, + { + "epoch": 2.4500611496127194, + "grad_norm": 7.308042049407959, + "learning_rate": 6.938697513249083e-06, + "loss": 2.9324, + "step": 36060 + }, + { + "epoch": 2.450400869683381, + "grad_norm": 7.721179008483887, + "learning_rate": 6.938272863160756e-06, + "loss": 2.9099, + "step": 36065 + }, + { + "epoch": 2.4507405897540426, + "grad_norm": 6.134171962738037, + "learning_rate": 6.937848213072429e-06, + "loss": 3.0208, + "step": 36070 + }, + { + "epoch": 2.4510803098247043, + "grad_norm": 7.846149444580078, + "learning_rate": 6.937423562984101e-06, + "loss": 3.0923, + "step": 36075 + }, + { + "epoch": 2.4514200298953663, + "grad_norm": 5.604952335357666, + "learning_rate": 6.936998912895775e-06, + "loss": 3.0927, + "step": 36080 + }, + { + "epoch": 2.451759749966028, + "grad_norm": 6.6752119064331055, + "learning_rate": 6.9365742628074475e-06, + "loss": 3.2016, + "step": 36085 + }, + { + "epoch": 2.4520994700366896, + "grad_norm": 6.339922904968262, + "learning_rate": 6.9361496127191195e-06, + "loss": 2.8986, + "step": 36090 + }, + { + "epoch": 2.4524391901073517, + "grad_norm": 7.556845188140869, + "learning_rate": 6.935724962630793e-06, + "loss": 2.9557, + "step": 36095 + }, + { + "epoch": 2.4527789101780133, + "grad_norm": 8.561321258544922, + "learning_rate": 6.935300312542466e-06, + "loss": 2.8813, + "step": 36100 + }, + { + "epoch": 2.453118630248675, + "grad_norm": 6.18418550491333, + "learning_rate": 6.934875662454138e-06, + "loss": 2.7645, + "step": 36105 + }, + { + "epoch": 2.453458350319337, + "grad_norm": 6.630561828613281, + "learning_rate": 6.9344510123658115e-06, + "loss": 3.1694, + "step": 36110 + }, + { + "epoch": 2.4537980703899986, + "grad_norm": 6.445113658905029, + "learning_rate": 6.934026362277484e-06, + "loss": 3.1874, + "step": 36115 + }, + { + "epoch": 2.4541377904606603, + "grad_norm": 7.191171169281006, + "learning_rate": 6.933601712189156e-06, + "loss": 3.0463, + "step": 36120 + }, + { + "epoch": 2.4544775105313223, + "grad_norm": 7.964921474456787, + "learning_rate": 6.93317706210083e-06, + "loss": 3.0844, + "step": 36125 + }, + { + "epoch": 2.454817230601984, + "grad_norm": 6.7341694831848145, + "learning_rate": 6.932752412012502e-06, + "loss": 2.9422, + "step": 36130 + }, + { + "epoch": 2.4551569506726456, + "grad_norm": 6.957146167755127, + "learning_rate": 6.932327761924175e-06, + "loss": 3.225, + "step": 36135 + }, + { + "epoch": 2.4554966707433077, + "grad_norm": 6.857155799865723, + "learning_rate": 6.931903111835848e-06, + "loss": 2.8038, + "step": 36140 + }, + { + "epoch": 2.4558363908139693, + "grad_norm": 7.035041332244873, + "learning_rate": 6.93147846174752e-06, + "loss": 3.2649, + "step": 36145 + }, + { + "epoch": 2.456176110884631, + "grad_norm": 5.897815227508545, + "learning_rate": 6.931053811659193e-06, + "loss": 3.074, + "step": 36150 + }, + { + "epoch": 2.456515830955293, + "grad_norm": 7.000744819641113, + "learning_rate": 6.930629161570867e-06, + "loss": 3.1559, + "step": 36155 + }, + { + "epoch": 2.4568555510259547, + "grad_norm": 8.53759479522705, + "learning_rate": 6.930204511482539e-06, + "loss": 3.1351, + "step": 36160 + }, + { + "epoch": 2.4571952710966163, + "grad_norm": 7.13145112991333, + "learning_rate": 6.9297798613942115e-06, + "loss": 3.2779, + "step": 36165 + }, + { + "epoch": 2.457534991167278, + "grad_norm": 7.137942314147949, + "learning_rate": 6.929355211305885e-06, + "loss": 2.8318, + "step": 36170 + }, + { + "epoch": 2.45787471123794, + "grad_norm": 6.551816463470459, + "learning_rate": 6.928930561217557e-06, + "loss": 3.1401, + "step": 36175 + }, + { + "epoch": 2.4582144313086016, + "grad_norm": 6.797963619232178, + "learning_rate": 6.92850591112923e-06, + "loss": 3.0321, + "step": 36180 + }, + { + "epoch": 2.4585541513792633, + "grad_norm": 7.850872039794922, + "learning_rate": 6.9280812610409035e-06, + "loss": 3.1131, + "step": 36185 + }, + { + "epoch": 2.4588938714499253, + "grad_norm": 6.160820007324219, + "learning_rate": 6.9276566109525755e-06, + "loss": 3.2283, + "step": 36190 + }, + { + "epoch": 2.459233591520587, + "grad_norm": 6.916452884674072, + "learning_rate": 6.927231960864248e-06, + "loss": 3.1747, + "step": 36195 + }, + { + "epoch": 2.4595733115912486, + "grad_norm": 7.097950458526611, + "learning_rate": 6.926807310775921e-06, + "loss": 2.9577, + "step": 36200 + }, + { + "epoch": 2.4599130316619107, + "grad_norm": 6.343079090118408, + "learning_rate": 6.926382660687594e-06, + "loss": 3.3385, + "step": 36205 + }, + { + "epoch": 2.4602527517325723, + "grad_norm": 5.930951118469238, + "learning_rate": 6.925958010599267e-06, + "loss": 3.1351, + "step": 36210 + }, + { + "epoch": 2.460592471803234, + "grad_norm": 6.969964981079102, + "learning_rate": 6.9255333605109395e-06, + "loss": 2.8553, + "step": 36215 + }, + { + "epoch": 2.460932191873896, + "grad_norm": 7.879733562469482, + "learning_rate": 6.925108710422612e-06, + "loss": 3.2618, + "step": 36220 + }, + { + "epoch": 2.4612719119445576, + "grad_norm": 7.213466167449951, + "learning_rate": 6.924684060334284e-06, + "loss": 2.8324, + "step": 36225 + }, + { + "epoch": 2.4616116320152193, + "grad_norm": 7.066836833953857, + "learning_rate": 6.924259410245958e-06, + "loss": 3.1375, + "step": 36230 + }, + { + "epoch": 2.4619513520858813, + "grad_norm": 6.158652305603027, + "learning_rate": 6.923834760157631e-06, + "loss": 2.7421, + "step": 36235 + }, + { + "epoch": 2.462291072156543, + "grad_norm": 7.719857215881348, + "learning_rate": 6.923410110069303e-06, + "loss": 2.971, + "step": 36240 + }, + { + "epoch": 2.4626307922272046, + "grad_norm": 7.119471549987793, + "learning_rate": 6.922985459980976e-06, + "loss": 3.0048, + "step": 36245 + }, + { + "epoch": 2.4629705122978667, + "grad_norm": 7.628752708435059, + "learning_rate": 6.922560809892649e-06, + "loss": 2.9579, + "step": 36250 + }, + { + "epoch": 2.4633102323685283, + "grad_norm": 7.979672431945801, + "learning_rate": 6.922136159804321e-06, + "loss": 2.9135, + "step": 36255 + }, + { + "epoch": 2.46364995243919, + "grad_norm": 8.271079063415527, + "learning_rate": 6.921711509715995e-06, + "loss": 2.9267, + "step": 36260 + }, + { + "epoch": 2.463989672509852, + "grad_norm": 8.080140113830566, + "learning_rate": 6.9212868596276675e-06, + "loss": 2.9521, + "step": 36265 + }, + { + "epoch": 2.4643293925805136, + "grad_norm": 6.658079147338867, + "learning_rate": 6.9208622095393395e-06, + "loss": 3.2209, + "step": 36270 + }, + { + "epoch": 2.4646691126511753, + "grad_norm": 6.784581661224365, + "learning_rate": 6.920437559451013e-06, + "loss": 2.9997, + "step": 36275 + }, + { + "epoch": 2.4650088327218374, + "grad_norm": 7.760153770446777, + "learning_rate": 6.920012909362686e-06, + "loss": 3.1197, + "step": 36280 + }, + { + "epoch": 2.465348552792499, + "grad_norm": 5.245085716247559, + "learning_rate": 6.919588259274358e-06, + "loss": 2.9912, + "step": 36285 + }, + { + "epoch": 2.4656882728631606, + "grad_norm": 9.402006149291992, + "learning_rate": 6.9191636091860315e-06, + "loss": 3.1569, + "step": 36290 + }, + { + "epoch": 2.4660279929338227, + "grad_norm": 6.451511383056641, + "learning_rate": 6.9187389590977035e-06, + "loss": 2.8096, + "step": 36295 + }, + { + "epoch": 2.4663677130044843, + "grad_norm": 7.043482303619385, + "learning_rate": 6.918314309009377e-06, + "loss": 2.8964, + "step": 36300 + }, + { + "epoch": 2.466707433075146, + "grad_norm": 5.43980598449707, + "learning_rate": 6.91788965892105e-06, + "loss": 2.9052, + "step": 36305 + }, + { + "epoch": 2.467047153145808, + "grad_norm": 7.5790252685546875, + "learning_rate": 6.917465008832722e-06, + "loss": 3.172, + "step": 36310 + }, + { + "epoch": 2.4673868732164697, + "grad_norm": 6.146546840667725, + "learning_rate": 6.9170403587443955e-06, + "loss": 3.1606, + "step": 36315 + }, + { + "epoch": 2.4677265932871313, + "grad_norm": 5.104159832000732, + "learning_rate": 6.916615708656068e-06, + "loss": 3.1187, + "step": 36320 + }, + { + "epoch": 2.4680663133577934, + "grad_norm": 7.6454267501831055, + "learning_rate": 6.91619105856774e-06, + "loss": 2.9953, + "step": 36325 + }, + { + "epoch": 2.468406033428455, + "grad_norm": 6.33618688583374, + "learning_rate": 6.915766408479414e-06, + "loss": 2.8879, + "step": 36330 + }, + { + "epoch": 2.4687457534991166, + "grad_norm": 6.5458455085754395, + "learning_rate": 6.915341758391087e-06, + "loss": 3.3275, + "step": 36335 + }, + { + "epoch": 2.4690854735697787, + "grad_norm": 7.147051811218262, + "learning_rate": 6.914917108302759e-06, + "loss": 2.8176, + "step": 36340 + }, + { + "epoch": 2.4694251936404403, + "grad_norm": 6.860525131225586, + "learning_rate": 6.914492458214432e-06, + "loss": 3.0075, + "step": 36345 + }, + { + "epoch": 2.469764913711102, + "grad_norm": 7.7784247398376465, + "learning_rate": 6.914067808126105e-06, + "loss": 3.1754, + "step": 36350 + }, + { + "epoch": 2.470104633781764, + "grad_norm": 8.257159233093262, + "learning_rate": 6.913643158037777e-06, + "loss": 2.8063, + "step": 36355 + }, + { + "epoch": 2.4704443538524257, + "grad_norm": 6.9882307052612305, + "learning_rate": 6.913218507949451e-06, + "loss": 3.0329, + "step": 36360 + }, + { + "epoch": 2.4707840739230873, + "grad_norm": 7.2785844802856445, + "learning_rate": 6.9127938578611235e-06, + "loss": 3.2038, + "step": 36365 + }, + { + "epoch": 2.4711237939937494, + "grad_norm": 6.968507766723633, + "learning_rate": 6.9123692077727955e-06, + "loss": 2.9803, + "step": 36370 + }, + { + "epoch": 2.471463514064411, + "grad_norm": 7.540365219116211, + "learning_rate": 6.911944557684469e-06, + "loss": 2.9034, + "step": 36375 + }, + { + "epoch": 2.4718032341350726, + "grad_norm": 8.520315170288086, + "learning_rate": 6.911519907596141e-06, + "loss": 3.1589, + "step": 36380 + }, + { + "epoch": 2.4721429542057347, + "grad_norm": 7.0989298820495605, + "learning_rate": 6.911095257507814e-06, + "loss": 3.1717, + "step": 36385 + }, + { + "epoch": 2.4724826742763963, + "grad_norm": 6.9803571701049805, + "learning_rate": 6.9106706074194875e-06, + "loss": 2.9001, + "step": 36390 + }, + { + "epoch": 2.472822394347058, + "grad_norm": 7.067039489746094, + "learning_rate": 6.9102459573311595e-06, + "loss": 3.0464, + "step": 36395 + }, + { + "epoch": 2.47316211441772, + "grad_norm": 7.134427547454834, + "learning_rate": 6.909821307242832e-06, + "loss": 2.9939, + "step": 36400 + }, + { + "epoch": 2.4735018344883817, + "grad_norm": 6.748677730560303, + "learning_rate": 6.909396657154506e-06, + "loss": 3.0912, + "step": 36405 + }, + { + "epoch": 2.4738415545590433, + "grad_norm": 6.159321308135986, + "learning_rate": 6.908972007066178e-06, + "loss": 3.1383, + "step": 36410 + }, + { + "epoch": 2.474181274629705, + "grad_norm": 8.319291114807129, + "learning_rate": 6.908547356977851e-06, + "loss": 3.0151, + "step": 36415 + }, + { + "epoch": 2.474520994700367, + "grad_norm": 8.072233200073242, + "learning_rate": 6.908122706889524e-06, + "loss": 3.0367, + "step": 36420 + }, + { + "epoch": 2.4748607147710286, + "grad_norm": 7.330676555633545, + "learning_rate": 6.907698056801196e-06, + "loss": 3.0581, + "step": 36425 + }, + { + "epoch": 2.4752004348416903, + "grad_norm": 5.991333961486816, + "learning_rate": 6.907273406712869e-06, + "loss": 3.1006, + "step": 36430 + }, + { + "epoch": 2.4755401549123524, + "grad_norm": 7.133347988128662, + "learning_rate": 6.906848756624543e-06, + "loss": 3.0634, + "step": 36435 + }, + { + "epoch": 2.475879874983014, + "grad_norm": 8.148080825805664, + "learning_rate": 6.906424106536215e-06, + "loss": 3.0301, + "step": 36440 + }, + { + "epoch": 2.4762195950536756, + "grad_norm": 7.526590824127197, + "learning_rate": 6.9059994564478875e-06, + "loss": 3.168, + "step": 36445 + }, + { + "epoch": 2.4765593151243377, + "grad_norm": 5.212734222412109, + "learning_rate": 6.90557480635956e-06, + "loss": 3.1443, + "step": 36450 + }, + { + "epoch": 2.4768990351949993, + "grad_norm": 6.771424770355225, + "learning_rate": 6.905150156271233e-06, + "loss": 3.0853, + "step": 36455 + }, + { + "epoch": 2.477238755265661, + "grad_norm": 7.440215587615967, + "learning_rate": 6.904725506182906e-06, + "loss": 3.0253, + "step": 36460 + }, + { + "epoch": 2.477578475336323, + "grad_norm": 6.97763729095459, + "learning_rate": 6.904300856094579e-06, + "loss": 2.9643, + "step": 36465 + }, + { + "epoch": 2.4779181954069847, + "grad_norm": 7.335552215576172, + "learning_rate": 6.9038762060062515e-06, + "loss": 2.9656, + "step": 36470 + }, + { + "epoch": 2.4782579154776463, + "grad_norm": 7.254310131072998, + "learning_rate": 6.9034515559179234e-06, + "loss": 3.1632, + "step": 36475 + }, + { + "epoch": 2.4785976355483084, + "grad_norm": 8.140978813171387, + "learning_rate": 6.903026905829597e-06, + "loss": 2.9816, + "step": 36480 + }, + { + "epoch": 2.47893735561897, + "grad_norm": 8.00261116027832, + "learning_rate": 6.90260225574127e-06, + "loss": 2.9221, + "step": 36485 + }, + { + "epoch": 2.4792770756896316, + "grad_norm": 7.523237705230713, + "learning_rate": 6.902177605652942e-06, + "loss": 2.8618, + "step": 36490 + }, + { + "epoch": 2.4796167957602937, + "grad_norm": 6.162886142730713, + "learning_rate": 6.9017529555646155e-06, + "loss": 3.133, + "step": 36495 + }, + { + "epoch": 2.4799565158309553, + "grad_norm": 7.681686878204346, + "learning_rate": 6.901328305476288e-06, + "loss": 2.8266, + "step": 36500 + }, + { + "epoch": 2.480296235901617, + "grad_norm": 6.1798176765441895, + "learning_rate": 6.90090365538796e-06, + "loss": 3.0835, + "step": 36505 + }, + { + "epoch": 2.4806359559722786, + "grad_norm": 5.609086513519287, + "learning_rate": 6.900479005299634e-06, + "loss": 3.0842, + "step": 36510 + }, + { + "epoch": 2.4809756760429407, + "grad_norm": 6.783772945404053, + "learning_rate": 6.900054355211307e-06, + "loss": 2.919, + "step": 36515 + }, + { + "epoch": 2.4813153961136023, + "grad_norm": 6.958718299865723, + "learning_rate": 6.899629705122979e-06, + "loss": 2.9458, + "step": 36520 + }, + { + "epoch": 2.481655116184264, + "grad_norm": 9.036917686462402, + "learning_rate": 6.899205055034652e-06, + "loss": 3.1735, + "step": 36525 + }, + { + "epoch": 2.481994836254926, + "grad_norm": 7.468946933746338, + "learning_rate": 6.898780404946325e-06, + "loss": 3.0517, + "step": 36530 + }, + { + "epoch": 2.4823345563255876, + "grad_norm": 5.317216873168945, + "learning_rate": 6.898355754857997e-06, + "loss": 3.0911, + "step": 36535 + }, + { + "epoch": 2.4826742763962493, + "grad_norm": 5.668015003204346, + "learning_rate": 6.897931104769671e-06, + "loss": 2.8211, + "step": 36540 + }, + { + "epoch": 2.4830139964669113, + "grad_norm": 7.672703742980957, + "learning_rate": 6.897506454681343e-06, + "loss": 3.0566, + "step": 36545 + }, + { + "epoch": 2.483353716537573, + "grad_norm": 9.229156494140625, + "learning_rate": 6.8970818045930155e-06, + "loss": 3.0181, + "step": 36550 + }, + { + "epoch": 2.4836934366082346, + "grad_norm": 6.67619514465332, + "learning_rate": 6.896657154504689e-06, + "loss": 2.8971, + "step": 36555 + }, + { + "epoch": 2.4840331566788967, + "grad_norm": 8.050517082214355, + "learning_rate": 6.896232504416361e-06, + "loss": 2.9337, + "step": 36560 + }, + { + "epoch": 2.4843728767495583, + "grad_norm": 7.700088977813721, + "learning_rate": 6.895807854328034e-06, + "loss": 2.8806, + "step": 36565 + }, + { + "epoch": 2.48471259682022, + "grad_norm": 7.620433330535889, + "learning_rate": 6.8953832042397075e-06, + "loss": 2.8433, + "step": 36570 + }, + { + "epoch": 2.485052316890882, + "grad_norm": 7.736762046813965, + "learning_rate": 6.8949585541513795e-06, + "loss": 3.2049, + "step": 36575 + }, + { + "epoch": 2.4853920369615436, + "grad_norm": 8.339058876037598, + "learning_rate": 6.894533904063052e-06, + "loss": 3.3161, + "step": 36580 + }, + { + "epoch": 2.4857317570322053, + "grad_norm": 6.221527099609375, + "learning_rate": 6.894109253974726e-06, + "loss": 2.982, + "step": 36585 + }, + { + "epoch": 2.4860714771028674, + "grad_norm": 6.0521931648254395, + "learning_rate": 6.893684603886398e-06, + "loss": 3.1706, + "step": 36590 + }, + { + "epoch": 2.486411197173529, + "grad_norm": 7.6410603523254395, + "learning_rate": 6.893259953798071e-06, + "loss": 2.861, + "step": 36595 + }, + { + "epoch": 2.4867509172441906, + "grad_norm": 7.381184101104736, + "learning_rate": 6.892835303709744e-06, + "loss": 3.1726, + "step": 36600 + }, + { + "epoch": 2.4870906373148527, + "grad_norm": 6.538417339324951, + "learning_rate": 6.892410653621416e-06, + "loss": 2.8043, + "step": 36605 + }, + { + "epoch": 2.4874303573855143, + "grad_norm": 7.94490909576416, + "learning_rate": 6.891986003533089e-06, + "loss": 3.2015, + "step": 36610 + }, + { + "epoch": 2.487770077456176, + "grad_norm": 4.809153079986572, + "learning_rate": 6.891561353444763e-06, + "loss": 3.1303, + "step": 36615 + }, + { + "epoch": 2.488109797526838, + "grad_norm": 7.108616828918457, + "learning_rate": 6.891136703356435e-06, + "loss": 2.9648, + "step": 36620 + }, + { + "epoch": 2.4884495175974997, + "grad_norm": 6.305851459503174, + "learning_rate": 6.8907120532681075e-06, + "loss": 3.1897, + "step": 36625 + }, + { + "epoch": 2.4887892376681613, + "grad_norm": 7.306728363037109, + "learning_rate": 6.89028740317978e-06, + "loss": 2.9397, + "step": 36630 + }, + { + "epoch": 2.4891289577388234, + "grad_norm": 6.147366046905518, + "learning_rate": 6.889862753091453e-06, + "loss": 3.1071, + "step": 36635 + }, + { + "epoch": 2.489468677809485, + "grad_norm": 6.681424140930176, + "learning_rate": 6.889438103003127e-06, + "loss": 2.9695, + "step": 36640 + }, + { + "epoch": 2.4898083978801466, + "grad_norm": 6.215904235839844, + "learning_rate": 6.889013452914799e-06, + "loss": 3.0803, + "step": 36645 + }, + { + "epoch": 2.4901481179508087, + "grad_norm": 7.014934539794922, + "learning_rate": 6.8885888028264715e-06, + "loss": 3.0096, + "step": 36650 + }, + { + "epoch": 2.4904878380214703, + "grad_norm": 7.034728527069092, + "learning_rate": 6.888164152738145e-06, + "loss": 3.1253, + "step": 36655 + }, + { + "epoch": 2.490827558092132, + "grad_norm": 7.852980136871338, + "learning_rate": 6.887739502649817e-06, + "loss": 2.8212, + "step": 36660 + }, + { + "epoch": 2.491167278162794, + "grad_norm": 8.713321685791016, + "learning_rate": 6.88731485256149e-06, + "loss": 2.9326, + "step": 36665 + }, + { + "epoch": 2.4915069982334557, + "grad_norm": 6.549736976623535, + "learning_rate": 6.8868902024731635e-06, + "loss": 2.9431, + "step": 36670 + }, + { + "epoch": 2.4918467183041173, + "grad_norm": 6.658658981323242, + "learning_rate": 6.8864655523848355e-06, + "loss": 2.7058, + "step": 36675 + }, + { + "epoch": 2.4921864383747794, + "grad_norm": 7.581761837005615, + "learning_rate": 6.886040902296508e-06, + "loss": 2.9541, + "step": 36680 + }, + { + "epoch": 2.492526158445441, + "grad_norm": 6.993992328643799, + "learning_rate": 6.885616252208182e-06, + "loss": 2.9661, + "step": 36685 + }, + { + "epoch": 2.4928658785161026, + "grad_norm": 7.581914901733398, + "learning_rate": 6.885191602119854e-06, + "loss": 2.8653, + "step": 36690 + }, + { + "epoch": 2.4932055985867647, + "grad_norm": 6.034030437469482, + "learning_rate": 6.884766952031527e-06, + "loss": 3.0171, + "step": 36695 + }, + { + "epoch": 2.4935453186574263, + "grad_norm": 6.345771312713623, + "learning_rate": 6.8843423019431995e-06, + "loss": 2.9931, + "step": 36700 + }, + { + "epoch": 2.493885038728088, + "grad_norm": 9.103705406188965, + "learning_rate": 6.883917651854872e-06, + "loss": 2.8822, + "step": 36705 + }, + { + "epoch": 2.49422475879875, + "grad_norm": 6.598937511444092, + "learning_rate": 6.883493001766545e-06, + "loss": 3.1291, + "step": 36710 + }, + { + "epoch": 2.4945644788694117, + "grad_norm": 6.172515869140625, + "learning_rate": 6.883068351678218e-06, + "loss": 3.235, + "step": 36715 + }, + { + "epoch": 2.4949041989400733, + "grad_norm": 7.916900634765625, + "learning_rate": 6.882643701589891e-06, + "loss": 2.9626, + "step": 36720 + }, + { + "epoch": 2.4952439190107354, + "grad_norm": 7.414167881011963, + "learning_rate": 6.882219051501563e-06, + "loss": 2.8335, + "step": 36725 + }, + { + "epoch": 2.495583639081397, + "grad_norm": 5.44651460647583, + "learning_rate": 6.881794401413236e-06, + "loss": 3.1043, + "step": 36730 + }, + { + "epoch": 2.4959233591520587, + "grad_norm": 8.504047393798828, + "learning_rate": 6.881369751324909e-06, + "loss": 3.2051, + "step": 36735 + }, + { + "epoch": 2.4962630792227207, + "grad_norm": 8.506926536560059, + "learning_rate": 6.880945101236581e-06, + "loss": 2.8818, + "step": 36740 + }, + { + "epoch": 2.4966027992933824, + "grad_norm": 8.54286003112793, + "learning_rate": 6.880520451148255e-06, + "loss": 2.943, + "step": 36745 + }, + { + "epoch": 2.496942519364044, + "grad_norm": 6.223911762237549, + "learning_rate": 6.8800958010599275e-06, + "loss": 3.122, + "step": 36750 + }, + { + "epoch": 2.4972822394347056, + "grad_norm": 7.069011688232422, + "learning_rate": 6.8796711509715994e-06, + "loss": 2.8856, + "step": 36755 + }, + { + "epoch": 2.4976219595053677, + "grad_norm": 5.782228946685791, + "learning_rate": 6.879246500883273e-06, + "loss": 2.9247, + "step": 36760 + }, + { + "epoch": 2.4979616795760293, + "grad_norm": 5.986255645751953, + "learning_rate": 6.878821850794946e-06, + "loss": 2.8893, + "step": 36765 + }, + { + "epoch": 2.498301399646691, + "grad_norm": 5.801057815551758, + "learning_rate": 6.878397200706618e-06, + "loss": 3.046, + "step": 36770 + }, + { + "epoch": 2.498641119717353, + "grad_norm": 6.23342752456665, + "learning_rate": 6.8779725506182915e-06, + "loss": 3.0029, + "step": 36775 + }, + { + "epoch": 2.4989808397880147, + "grad_norm": 5.768257141113281, + "learning_rate": 6.877547900529964e-06, + "loss": 2.7762, + "step": 36780 + }, + { + "epoch": 2.4993205598586763, + "grad_norm": 6.886030197143555, + "learning_rate": 6.877123250441636e-06, + "loss": 3.0642, + "step": 36785 + }, + { + "epoch": 2.4996602799293384, + "grad_norm": 6.965820789337158, + "learning_rate": 6.87669860035331e-06, + "loss": 3.009, + "step": 36790 + }, + { + "epoch": 2.5, + "grad_norm": 7.0141777992248535, + "learning_rate": 6.876273950264982e-06, + "loss": 2.9966, + "step": 36795 + }, + { + "epoch": 2.5003397200706616, + "grad_norm": 8.038148880004883, + "learning_rate": 6.875849300176655e-06, + "loss": 3.2813, + "step": 36800 + }, + { + "epoch": 2.5006794401413237, + "grad_norm": 6.546644687652588, + "learning_rate": 6.875424650088328e-06, + "loss": 3.1339, + "step": 36805 + }, + { + "epoch": 2.5010191602119853, + "grad_norm": 6.906323432922363, + "learning_rate": 6.875e-06, + "loss": 3.1554, + "step": 36810 + }, + { + "epoch": 2.501358880282647, + "grad_norm": 7.164019584655762, + "learning_rate": 6.874575349911673e-06, + "loss": 2.9052, + "step": 36815 + }, + { + "epoch": 2.5016986003533086, + "grad_norm": 6.772796154022217, + "learning_rate": 6.874150699823347e-06, + "loss": 3.0849, + "step": 36820 + }, + { + "epoch": 2.5020383204239707, + "grad_norm": 6.036638259887695, + "learning_rate": 6.873726049735019e-06, + "loss": 3.115, + "step": 36825 + }, + { + "epoch": 2.5023780404946323, + "grad_norm": 9.061833381652832, + "learning_rate": 6.8733013996466914e-06, + "loss": 3.1048, + "step": 36830 + }, + { + "epoch": 2.502717760565294, + "grad_norm": 7.180422782897949, + "learning_rate": 6.872876749558365e-06, + "loss": 3.0372, + "step": 36835 + }, + { + "epoch": 2.503057480635956, + "grad_norm": 5.761464595794678, + "learning_rate": 6.872452099470037e-06, + "loss": 3.179, + "step": 36840 + }, + { + "epoch": 2.5033972007066176, + "grad_norm": 8.658852577209473, + "learning_rate": 6.87202744938171e-06, + "loss": 3.1379, + "step": 36845 + }, + { + "epoch": 2.5037369207772793, + "grad_norm": 6.1003828048706055, + "learning_rate": 6.8716027992933835e-06, + "loss": 2.9581, + "step": 36850 + }, + { + "epoch": 2.5040766408479413, + "grad_norm": 7.839721202850342, + "learning_rate": 6.8711781492050555e-06, + "loss": 2.9284, + "step": 36855 + }, + { + "epoch": 2.504416360918603, + "grad_norm": 6.730178356170654, + "learning_rate": 6.870753499116728e-06, + "loss": 3.1775, + "step": 36860 + }, + { + "epoch": 2.5047560809892646, + "grad_norm": 6.395962238311768, + "learning_rate": 6.870328849028401e-06, + "loss": 3.1505, + "step": 36865 + }, + { + "epoch": 2.5050958010599267, + "grad_norm": 5.961023807525635, + "learning_rate": 6.869904198940074e-06, + "loss": 2.8016, + "step": 36870 + }, + { + "epoch": 2.5054355211305883, + "grad_norm": 6.592942237854004, + "learning_rate": 6.869479548851747e-06, + "loss": 2.9493, + "step": 36875 + }, + { + "epoch": 2.50577524120125, + "grad_norm": 6.389243125915527, + "learning_rate": 6.8690548987634195e-06, + "loss": 3.015, + "step": 36880 + }, + { + "epoch": 2.506114961271912, + "grad_norm": 6.255850315093994, + "learning_rate": 6.868630248675092e-06, + "loss": 2.9917, + "step": 36885 + }, + { + "epoch": 2.5064546813425737, + "grad_norm": 6.802150249481201, + "learning_rate": 6.868205598586764e-06, + "loss": 2.9618, + "step": 36890 + }, + { + "epoch": 2.5067944014132353, + "grad_norm": 7.31250524520874, + "learning_rate": 6.867780948498438e-06, + "loss": 2.8394, + "step": 36895 + }, + { + "epoch": 2.5071341214838974, + "grad_norm": 8.087465286254883, + "learning_rate": 6.867356298410111e-06, + "loss": 2.9337, + "step": 36900 + }, + { + "epoch": 2.507473841554559, + "grad_norm": 6.007490634918213, + "learning_rate": 6.866931648321783e-06, + "loss": 3.0447, + "step": 36905 + }, + { + "epoch": 2.5078135616252206, + "grad_norm": 5.317610263824463, + "learning_rate": 6.866506998233456e-06, + "loss": 2.9055, + "step": 36910 + }, + { + "epoch": 2.5081532816958827, + "grad_norm": 9.555509567260742, + "learning_rate": 6.866082348145129e-06, + "loss": 3.023, + "step": 36915 + }, + { + "epoch": 2.5084930017665443, + "grad_norm": 6.868502140045166, + "learning_rate": 6.865657698056801e-06, + "loss": 2.9665, + "step": 36920 + }, + { + "epoch": 2.508832721837206, + "grad_norm": 7.686300277709961, + "learning_rate": 6.865233047968475e-06, + "loss": 3.1595, + "step": 36925 + }, + { + "epoch": 2.509172441907868, + "grad_norm": 6.485630989074707, + "learning_rate": 6.8648083978801475e-06, + "loss": 2.7806, + "step": 36930 + }, + { + "epoch": 2.5095121619785297, + "grad_norm": 9.613712310791016, + "learning_rate": 6.864383747791819e-06, + "loss": 3.225, + "step": 36935 + }, + { + "epoch": 2.5098518820491913, + "grad_norm": 5.435347080230713, + "learning_rate": 6.863959097703493e-06, + "loss": 3.1168, + "step": 36940 + }, + { + "epoch": 2.5101916021198534, + "grad_norm": 6.986595630645752, + "learning_rate": 6.863534447615166e-06, + "loss": 3.0092, + "step": 36945 + }, + { + "epoch": 2.510531322190515, + "grad_norm": 8.357592582702637, + "learning_rate": 6.863109797526838e-06, + "loss": 2.862, + "step": 36950 + }, + { + "epoch": 2.5108710422611766, + "grad_norm": 8.176613807678223, + "learning_rate": 6.8626851474385115e-06, + "loss": 3.0424, + "step": 36955 + }, + { + "epoch": 2.5112107623318387, + "grad_norm": 5.442854881286621, + "learning_rate": 6.862260497350184e-06, + "loss": 3.0073, + "step": 36960 + }, + { + "epoch": 2.5115504824025003, + "grad_norm": 7.119658470153809, + "learning_rate": 6.861835847261856e-06, + "loss": 3.0149, + "step": 36965 + }, + { + "epoch": 2.511890202473162, + "grad_norm": 6.4712958335876465, + "learning_rate": 6.86141119717353e-06, + "loss": 3.0035, + "step": 36970 + }, + { + "epoch": 2.512229922543824, + "grad_norm": 6.314143657684326, + "learning_rate": 6.860986547085202e-06, + "loss": 3.2603, + "step": 36975 + }, + { + "epoch": 2.5125696426144857, + "grad_norm": 5.682586669921875, + "learning_rate": 6.860561896996875e-06, + "loss": 2.8833, + "step": 36980 + }, + { + "epoch": 2.5129093626851473, + "grad_norm": 7.2556939125061035, + "learning_rate": 6.860137246908548e-06, + "loss": 3.0701, + "step": 36985 + }, + { + "epoch": 2.5132490827558094, + "grad_norm": 7.679253578186035, + "learning_rate": 6.85971259682022e-06, + "loss": 3.2959, + "step": 36990 + }, + { + "epoch": 2.513588802826471, + "grad_norm": 6.116089344024658, + "learning_rate": 6.859287946731894e-06, + "loss": 3.2871, + "step": 36995 + }, + { + "epoch": 2.5139285228971326, + "grad_norm": 7.347585201263428, + "learning_rate": 6.858863296643567e-06, + "loss": 3.1465, + "step": 37000 + }, + { + "epoch": 2.5142682429677947, + "grad_norm": 6.341131687164307, + "learning_rate": 6.858438646555239e-06, + "loss": 3.5394, + "step": 37005 + }, + { + "epoch": 2.5146079630384564, + "grad_norm": 5.327327251434326, + "learning_rate": 6.858013996466912e-06, + "loss": 2.8451, + "step": 37010 + }, + { + "epoch": 2.514947683109118, + "grad_norm": 6.528316497802734, + "learning_rate": 6.857589346378585e-06, + "loss": 3.0348, + "step": 37015 + }, + { + "epoch": 2.51528740317978, + "grad_norm": 6.264271259307861, + "learning_rate": 6.857164696290257e-06, + "loss": 2.8854, + "step": 37020 + }, + { + "epoch": 2.5156271232504417, + "grad_norm": 7.119262218475342, + "learning_rate": 6.856740046201931e-06, + "loss": 2.9392, + "step": 37025 + }, + { + "epoch": 2.5159668433211033, + "grad_norm": 5.259028434753418, + "learning_rate": 6.8563153961136035e-06, + "loss": 2.8902, + "step": 37030 + }, + { + "epoch": 2.5163065633917654, + "grad_norm": 4.7681427001953125, + "learning_rate": 6.8558907460252754e-06, + "loss": 2.8534, + "step": 37035 + }, + { + "epoch": 2.516646283462427, + "grad_norm": 5.554497241973877, + "learning_rate": 6.855466095936949e-06, + "loss": 3.0123, + "step": 37040 + }, + { + "epoch": 2.5169860035330887, + "grad_norm": 6.958225250244141, + "learning_rate": 6.855041445848621e-06, + "loss": 2.7857, + "step": 37045 + }, + { + "epoch": 2.5173257236037507, + "grad_norm": 8.052546501159668, + "learning_rate": 6.854616795760294e-06, + "loss": 3.1257, + "step": 37050 + }, + { + "epoch": 2.5176654436744124, + "grad_norm": 7.641473770141602, + "learning_rate": 6.8541921456719675e-06, + "loss": 3.1292, + "step": 37055 + }, + { + "epoch": 2.518005163745074, + "grad_norm": 6.816627502441406, + "learning_rate": 6.8537674955836394e-06, + "loss": 2.9416, + "step": 37060 + }, + { + "epoch": 2.518344883815736, + "grad_norm": 6.47691011428833, + "learning_rate": 6.853342845495312e-06, + "loss": 3.1087, + "step": 37065 + }, + { + "epoch": 2.5186846038863977, + "grad_norm": 5.660024642944336, + "learning_rate": 6.852918195406986e-06, + "loss": 3.0274, + "step": 37070 + }, + { + "epoch": 2.5190243239570593, + "grad_norm": 6.011835098266602, + "learning_rate": 6.852493545318658e-06, + "loss": 2.7067, + "step": 37075 + }, + { + "epoch": 2.5193640440277214, + "grad_norm": 8.379851341247559, + "learning_rate": 6.852068895230331e-06, + "loss": 3.1525, + "step": 37080 + }, + { + "epoch": 2.519703764098383, + "grad_norm": 5.672330379486084, + "learning_rate": 6.851644245142004e-06, + "loss": 2.8466, + "step": 37085 + }, + { + "epoch": 2.5200434841690447, + "grad_norm": 5.8295183181762695, + "learning_rate": 6.851219595053676e-06, + "loss": 3.0959, + "step": 37090 + }, + { + "epoch": 2.5203832042397067, + "grad_norm": 8.773138046264648, + "learning_rate": 6.850794944965349e-06, + "loss": 3.3406, + "step": 37095 + }, + { + "epoch": 2.5207229243103684, + "grad_norm": 6.422532081604004, + "learning_rate": 6.850370294877023e-06, + "loss": 2.8848, + "step": 37100 + }, + { + "epoch": 2.52106264438103, + "grad_norm": 6.648931980133057, + "learning_rate": 6.849945644788695e-06, + "loss": 2.8247, + "step": 37105 + }, + { + "epoch": 2.521402364451692, + "grad_norm": 7.834948539733887, + "learning_rate": 6.8495209947003674e-06, + "loss": 2.9431, + "step": 37110 + }, + { + "epoch": 2.5217420845223537, + "grad_norm": 8.52723503112793, + "learning_rate": 6.84909634461204e-06, + "loss": 3.0774, + "step": 37115 + }, + { + "epoch": 2.5220818045930153, + "grad_norm": 6.880016326904297, + "learning_rate": 6.848671694523713e-06, + "loss": 3.0425, + "step": 37120 + }, + { + "epoch": 2.522421524663677, + "grad_norm": 7.340639114379883, + "learning_rate": 6.848247044435386e-06, + "loss": 3.081, + "step": 37125 + }, + { + "epoch": 2.522761244734339, + "grad_norm": 7.274834156036377, + "learning_rate": 6.847822394347059e-06, + "loss": 3.0764, + "step": 37130 + }, + { + "epoch": 2.5231009648050007, + "grad_norm": 7.172582626342773, + "learning_rate": 6.8473977442587314e-06, + "loss": 3.187, + "step": 37135 + }, + { + "epoch": 2.5234406848756623, + "grad_norm": 4.769299507141113, + "learning_rate": 6.846973094170403e-06, + "loss": 3.1596, + "step": 37140 + }, + { + "epoch": 2.5237804049463244, + "grad_norm": 6.47199010848999, + "learning_rate": 6.846548444082077e-06, + "loss": 2.8897, + "step": 37145 + }, + { + "epoch": 2.524120125016986, + "grad_norm": 7.292878150939941, + "learning_rate": 6.84612379399375e-06, + "loss": 3.2029, + "step": 37150 + }, + { + "epoch": 2.5244598450876476, + "grad_norm": 7.307283878326416, + "learning_rate": 6.845699143905422e-06, + "loss": 3.1289, + "step": 37155 + }, + { + "epoch": 2.5247995651583093, + "grad_norm": 6.501472473144531, + "learning_rate": 6.8452744938170955e-06, + "loss": 2.8946, + "step": 37160 + }, + { + "epoch": 2.5251392852289714, + "grad_norm": 6.518960475921631, + "learning_rate": 6.844849843728768e-06, + "loss": 3.1871, + "step": 37165 + }, + { + "epoch": 2.525479005299633, + "grad_norm": 7.603945732116699, + "learning_rate": 6.84442519364044e-06, + "loss": 3.0614, + "step": 37170 + }, + { + "epoch": 2.5258187253702946, + "grad_norm": 8.77941608428955, + "learning_rate": 6.844000543552114e-06, + "loss": 2.9322, + "step": 37175 + }, + { + "epoch": 2.5261584454409567, + "grad_norm": 8.45514965057373, + "learning_rate": 6.843575893463787e-06, + "loss": 2.9161, + "step": 37180 + }, + { + "epoch": 2.5264981655116183, + "grad_norm": 5.628091812133789, + "learning_rate": 6.843151243375459e-06, + "loss": 3.0099, + "step": 37185 + }, + { + "epoch": 2.52683788558228, + "grad_norm": 5.576364040374756, + "learning_rate": 6.842726593287132e-06, + "loss": 2.9291, + "step": 37190 + }, + { + "epoch": 2.527177605652942, + "grad_norm": 7.25970983505249, + "learning_rate": 6.842301943198805e-06, + "loss": 3.2401, + "step": 37195 + }, + { + "epoch": 2.5275173257236037, + "grad_norm": 6.115822792053223, + "learning_rate": 6.841877293110477e-06, + "loss": 2.8019, + "step": 37200 + }, + { + "epoch": 2.5278570457942653, + "grad_norm": 5.6913676261901855, + "learning_rate": 6.841452643022151e-06, + "loss": 3.122, + "step": 37205 + }, + { + "epoch": 2.5281967658649274, + "grad_norm": 6.96402645111084, + "learning_rate": 6.841027992933823e-06, + "loss": 2.9722, + "step": 37210 + }, + { + "epoch": 2.528536485935589, + "grad_norm": 7.399899959564209, + "learning_rate": 6.840603342845495e-06, + "loss": 2.8447, + "step": 37215 + }, + { + "epoch": 2.5288762060062506, + "grad_norm": 6.258357524871826, + "learning_rate": 6.840178692757169e-06, + "loss": 3.2309, + "step": 37220 + }, + { + "epoch": 2.5292159260769127, + "grad_norm": 7.3561248779296875, + "learning_rate": 6.839754042668841e-06, + "loss": 2.9286, + "step": 37225 + }, + { + "epoch": 2.5295556461475743, + "grad_norm": 6.515239715576172, + "learning_rate": 6.839329392580514e-06, + "loss": 2.8932, + "step": 37230 + }, + { + "epoch": 2.529895366218236, + "grad_norm": 8.215228080749512, + "learning_rate": 6.8389047424921875e-06, + "loss": 2.921, + "step": 37235 + }, + { + "epoch": 2.530235086288898, + "grad_norm": 7.448976993560791, + "learning_rate": 6.838480092403859e-06, + "loss": 3.1243, + "step": 37240 + }, + { + "epoch": 2.5305748063595597, + "grad_norm": 9.889043807983398, + "learning_rate": 6.838055442315532e-06, + "loss": 2.9364, + "step": 37245 + }, + { + "epoch": 2.5309145264302213, + "grad_norm": 5.906825542449951, + "learning_rate": 6.837630792227206e-06, + "loss": 2.8066, + "step": 37250 + }, + { + "epoch": 2.5312542465008834, + "grad_norm": 6.992008686065674, + "learning_rate": 6.837206142138878e-06, + "loss": 3.1326, + "step": 37255 + }, + { + "epoch": 2.531593966571545, + "grad_norm": 7.060009479522705, + "learning_rate": 6.836781492050551e-06, + "loss": 3.0439, + "step": 37260 + }, + { + "epoch": 2.5319336866422066, + "grad_norm": 7.252647399902344, + "learning_rate": 6.836356841962224e-06, + "loss": 2.9649, + "step": 37265 + }, + { + "epoch": 2.5322734067128687, + "grad_norm": 7.114459037780762, + "learning_rate": 6.835932191873896e-06, + "loss": 3.029, + "step": 37270 + }, + { + "epoch": 2.5326131267835303, + "grad_norm": 6.03603458404541, + "learning_rate": 6.835507541785569e-06, + "loss": 3.0024, + "step": 37275 + }, + { + "epoch": 2.532952846854192, + "grad_norm": 8.08926773071289, + "learning_rate": 6.835082891697243e-06, + "loss": 2.9792, + "step": 37280 + }, + { + "epoch": 2.533292566924854, + "grad_norm": 8.28125286102295, + "learning_rate": 6.834658241608915e-06, + "loss": 2.98, + "step": 37285 + }, + { + "epoch": 2.5336322869955157, + "grad_norm": 6.21207857131958, + "learning_rate": 6.8342335915205874e-06, + "loss": 2.9675, + "step": 37290 + }, + { + "epoch": 2.5339720070661773, + "grad_norm": 6.7520880699157715, + "learning_rate": 6.83380894143226e-06, + "loss": 2.9465, + "step": 37295 + }, + { + "epoch": 2.5343117271368394, + "grad_norm": 8.740714073181152, + "learning_rate": 6.833384291343933e-06, + "loss": 2.9233, + "step": 37300 + }, + { + "epoch": 2.534651447207501, + "grad_norm": 7.349019527435303, + "learning_rate": 6.832959641255605e-06, + "loss": 3.1003, + "step": 37305 + }, + { + "epoch": 2.5349911672781626, + "grad_norm": 5.917484283447266, + "learning_rate": 6.832534991167279e-06, + "loss": 2.77, + "step": 37310 + }, + { + "epoch": 2.5353308873488247, + "grad_norm": 7.7353434562683105, + "learning_rate": 6.8321103410789514e-06, + "loss": 2.7906, + "step": 37315 + }, + { + "epoch": 2.5356706074194864, + "grad_norm": 6.279388904571533, + "learning_rate": 6.831685690990623e-06, + "loss": 3.1108, + "step": 37320 + }, + { + "epoch": 2.536010327490148, + "grad_norm": 7.478115558624268, + "learning_rate": 6.831261040902297e-06, + "loss": 2.9721, + "step": 37325 + }, + { + "epoch": 2.53635004756081, + "grad_norm": 6.829433441162109, + "learning_rate": 6.83083639081397e-06, + "loss": 3.1634, + "step": 37330 + }, + { + "epoch": 2.5366897676314717, + "grad_norm": 7.710355758666992, + "learning_rate": 6.8304117407256435e-06, + "loss": 3.0853, + "step": 37335 + }, + { + "epoch": 2.5370294877021333, + "grad_norm": 5.4613118171691895, + "learning_rate": 6.8299870906373154e-06, + "loss": 3.052, + "step": 37340 + }, + { + "epoch": 2.5373692077727954, + "grad_norm": 7.322218894958496, + "learning_rate": 6.829562440548988e-06, + "loss": 3.2829, + "step": 37345 + }, + { + "epoch": 2.537708927843457, + "grad_norm": 7.881622791290283, + "learning_rate": 6.829137790460662e-06, + "loss": 2.8329, + "step": 37350 + }, + { + "epoch": 2.5380486479141187, + "grad_norm": 5.486594200134277, + "learning_rate": 6.828713140372334e-06, + "loss": 3.1314, + "step": 37355 + }, + { + "epoch": 2.5383883679847807, + "grad_norm": 6.688736915588379, + "learning_rate": 6.828373420301672e-06, + "loss": 2.999, + "step": 37360 + }, + { + "epoch": 2.5387280880554424, + "grad_norm": 6.885712623596191, + "learning_rate": 6.827948770213345e-06, + "loss": 2.8812, + "step": 37365 + }, + { + "epoch": 2.539067808126104, + "grad_norm": 8.905272483825684, + "learning_rate": 6.8275241201250175e-06, + "loss": 3.0987, + "step": 37370 + }, + { + "epoch": 2.539407528196766, + "grad_norm": 6.533342361450195, + "learning_rate": 6.82709947003669e-06, + "loss": 3.1121, + "step": 37375 + }, + { + "epoch": 2.5397472482674277, + "grad_norm": 6.032941818237305, + "learning_rate": 6.826674819948363e-06, + "loss": 3.0779, + "step": 37380 + }, + { + "epoch": 2.5400869683380893, + "grad_norm": 6.535576343536377, + "learning_rate": 6.826250169860036e-06, + "loss": 3.0522, + "step": 37385 + }, + { + "epoch": 2.5404266884087514, + "grad_norm": 8.491734504699707, + "learning_rate": 6.825825519771708e-06, + "loss": 3.0392, + "step": 37390 + }, + { + "epoch": 2.540766408479413, + "grad_norm": 5.622157096862793, + "learning_rate": 6.8254008696833815e-06, + "loss": 3.1247, + "step": 37395 + }, + { + "epoch": 2.5411061285500747, + "grad_norm": 7.173590660095215, + "learning_rate": 6.824976219595054e-06, + "loss": 3.2654, + "step": 37400 + }, + { + "epoch": 2.5414458486207367, + "grad_norm": 6.874992847442627, + "learning_rate": 6.824551569506726e-06, + "loss": 2.9527, + "step": 37405 + }, + { + "epoch": 2.5417855686913984, + "grad_norm": 6.774543285369873, + "learning_rate": 6.8241269194184e-06, + "loss": 3.1198, + "step": 37410 + }, + { + "epoch": 2.54212528876206, + "grad_norm": 7.537154674530029, + "learning_rate": 6.823702269330073e-06, + "loss": 3.3341, + "step": 37415 + }, + { + "epoch": 2.542465008832722, + "grad_norm": 6.967547416687012, + "learning_rate": 6.823277619241745e-06, + "loss": 3.4212, + "step": 37420 + }, + { + "epoch": 2.5428047289033837, + "grad_norm": 7.05034875869751, + "learning_rate": 6.822852969153418e-06, + "loss": 3.1048, + "step": 37425 + }, + { + "epoch": 2.5431444489740453, + "grad_norm": 6.537496566772461, + "learning_rate": 6.822428319065091e-06, + "loss": 3.1616, + "step": 37430 + }, + { + "epoch": 2.5434841690447074, + "grad_norm": 8.044475555419922, + "learning_rate": 6.822003668976763e-06, + "loss": 3.028, + "step": 37435 + }, + { + "epoch": 2.543823889115369, + "grad_norm": 7.785207271575928, + "learning_rate": 6.821579018888437e-06, + "loss": 2.9894, + "step": 37440 + }, + { + "epoch": 2.5441636091860307, + "grad_norm": 8.124017715454102, + "learning_rate": 6.8211543688001095e-06, + "loss": 2.9428, + "step": 37445 + }, + { + "epoch": 2.5445033292566928, + "grad_norm": 6.803427219390869, + "learning_rate": 6.8207297187117815e-06, + "loss": 2.891, + "step": 37450 + }, + { + "epoch": 2.5448430493273544, + "grad_norm": 9.519227027893066, + "learning_rate": 6.820305068623455e-06, + "loss": 3.0146, + "step": 37455 + }, + { + "epoch": 2.545182769398016, + "grad_norm": 7.0505146980285645, + "learning_rate": 6.819880418535128e-06, + "loss": 3.0952, + "step": 37460 + }, + { + "epoch": 2.5455224894686777, + "grad_norm": 7.143350124359131, + "learning_rate": 6.8194557684468e-06, + "loss": 3.0989, + "step": 37465 + }, + { + "epoch": 2.5458622095393397, + "grad_norm": 6.296285152435303, + "learning_rate": 6.8190311183584735e-06, + "loss": 3.0092, + "step": 37470 + }, + { + "epoch": 2.5462019296100014, + "grad_norm": 6.899101257324219, + "learning_rate": 6.8186064682701455e-06, + "loss": 2.9199, + "step": 37475 + }, + { + "epoch": 2.546541649680663, + "grad_norm": 7.362902641296387, + "learning_rate": 6.818181818181818e-06, + "loss": 2.7802, + "step": 37480 + }, + { + "epoch": 2.546881369751325, + "grad_norm": 6.218628406524658, + "learning_rate": 6.817757168093492e-06, + "loss": 3.2192, + "step": 37485 + }, + { + "epoch": 2.5472210898219867, + "grad_norm": 8.133817672729492, + "learning_rate": 6.817332518005164e-06, + "loss": 3.334, + "step": 37490 + }, + { + "epoch": 2.5475608098926483, + "grad_norm": 7.531465530395508, + "learning_rate": 6.816907867916837e-06, + "loss": 2.9838, + "step": 37495 + }, + { + "epoch": 2.54790052996331, + "grad_norm": 7.651416778564453, + "learning_rate": 6.81648321782851e-06, + "loss": 3.0828, + "step": 37500 + }, + { + "epoch": 2.548240250033972, + "grad_norm": 6.579484939575195, + "learning_rate": 6.816058567740182e-06, + "loss": 3.1928, + "step": 37505 + }, + { + "epoch": 2.5485799701046337, + "grad_norm": 7.702869892120361, + "learning_rate": 6.815633917651855e-06, + "loss": 3.0789, + "step": 37510 + }, + { + "epoch": 2.5489196901752953, + "grad_norm": 6.729400634765625, + "learning_rate": 6.815209267563529e-06, + "loss": 3.1081, + "step": 37515 + }, + { + "epoch": 2.5492594102459574, + "grad_norm": 5.573088645935059, + "learning_rate": 6.814784617475201e-06, + "loss": 2.9462, + "step": 37520 + }, + { + "epoch": 2.549599130316619, + "grad_norm": 7.891323566436768, + "learning_rate": 6.8143599673868735e-06, + "loss": 3.168, + "step": 37525 + }, + { + "epoch": 2.5499388503872806, + "grad_norm": 6.130146503448486, + "learning_rate": 6.813935317298547e-06, + "loss": 2.9341, + "step": 37530 + }, + { + "epoch": 2.5502785704579427, + "grad_norm": 5.533452987670898, + "learning_rate": 6.813510667210219e-06, + "loss": 3.1457, + "step": 37535 + }, + { + "epoch": 2.5506182905286043, + "grad_norm": 6.116693496704102, + "learning_rate": 6.813086017121893e-06, + "loss": 3.0194, + "step": 37540 + }, + { + "epoch": 2.550958010599266, + "grad_norm": 7.494287967681885, + "learning_rate": 6.812661367033565e-06, + "loss": 3.0944, + "step": 37545 + }, + { + "epoch": 2.551297730669928, + "grad_norm": 8.631431579589844, + "learning_rate": 6.8122367169452375e-06, + "loss": 2.8921, + "step": 37550 + }, + { + "epoch": 2.5516374507405897, + "grad_norm": 6.666840553283691, + "learning_rate": 6.811812066856911e-06, + "loss": 2.8201, + "step": 37555 + }, + { + "epoch": 2.5519771708112513, + "grad_norm": 4.98326301574707, + "learning_rate": 6.811387416768583e-06, + "loss": 2.752, + "step": 37560 + }, + { + "epoch": 2.5523168908819134, + "grad_norm": 5.604018688201904, + "learning_rate": 6.810962766680256e-06, + "loss": 2.9951, + "step": 37565 + }, + { + "epoch": 2.552656610952575, + "grad_norm": 9.189343452453613, + "learning_rate": 6.8105381165919295e-06, + "loss": 2.8976, + "step": 37570 + }, + { + "epoch": 2.5529963310232366, + "grad_norm": 6.567091464996338, + "learning_rate": 6.8101134665036015e-06, + "loss": 2.9847, + "step": 37575 + }, + { + "epoch": 2.5533360510938987, + "grad_norm": 6.224113941192627, + "learning_rate": 6.809688816415274e-06, + "loss": 3.2048, + "step": 37580 + }, + { + "epoch": 2.5536757711645603, + "grad_norm": 5.992724895477295, + "learning_rate": 6.809264166326948e-06, + "loss": 2.686, + "step": 37585 + }, + { + "epoch": 2.554015491235222, + "grad_norm": 7.32023811340332, + "learning_rate": 6.80883951623862e-06, + "loss": 2.9842, + "step": 37590 + }, + { + "epoch": 2.554355211305884, + "grad_norm": 7.158278465270996, + "learning_rate": 6.808414866150293e-06, + "loss": 3.115, + "step": 37595 + }, + { + "epoch": 2.5546949313765457, + "grad_norm": 6.945290565490723, + "learning_rate": 6.807990216061966e-06, + "loss": 2.7912, + "step": 37600 + }, + { + "epoch": 2.5550346514472073, + "grad_norm": 5.633657932281494, + "learning_rate": 6.807565565973638e-06, + "loss": 3.0682, + "step": 37605 + }, + { + "epoch": 2.5553743715178694, + "grad_norm": 8.796341896057129, + "learning_rate": 6.807140915885311e-06, + "loss": 3.1144, + "step": 37610 + }, + { + "epoch": 2.555714091588531, + "grad_norm": 7.733123779296875, + "learning_rate": 6.806716265796984e-06, + "loss": 2.5603, + "step": 37615 + }, + { + "epoch": 2.5560538116591927, + "grad_norm": 5.973759174346924, + "learning_rate": 6.806291615708657e-06, + "loss": 3.1852, + "step": 37620 + }, + { + "epoch": 2.5563935317298547, + "grad_norm": 11.015374183654785, + "learning_rate": 6.8058669656203295e-06, + "loss": 3.1841, + "step": 37625 + }, + { + "epoch": 2.5567332518005164, + "grad_norm": 7.10573673248291, + "learning_rate": 6.805442315532002e-06, + "loss": 2.9913, + "step": 37630 + }, + { + "epoch": 2.557072971871178, + "grad_norm": 7.332078456878662, + "learning_rate": 6.805017665443675e-06, + "loss": 3.2113, + "step": 37635 + }, + { + "epoch": 2.55741269194184, + "grad_norm": 8.63245964050293, + "learning_rate": 6.804593015355347e-06, + "loss": 2.9167, + "step": 37640 + }, + { + "epoch": 2.5577524120125017, + "grad_norm": 7.040544033050537, + "learning_rate": 6.804168365267021e-06, + "loss": 3.1832, + "step": 37645 + }, + { + "epoch": 2.5580921320831633, + "grad_norm": 5.797129154205322, + "learning_rate": 6.8037437151786935e-06, + "loss": 2.9156, + "step": 37650 + }, + { + "epoch": 2.5584318521538254, + "grad_norm": 6.706235885620117, + "learning_rate": 6.8033190650903655e-06, + "loss": 3.0883, + "step": 37655 + }, + { + "epoch": 2.558771572224487, + "grad_norm": 7.037680625915527, + "learning_rate": 6.802894415002039e-06, + "loss": 3.0421, + "step": 37660 + }, + { + "epoch": 2.5591112922951487, + "grad_norm": 5.547999858856201, + "learning_rate": 6.802469764913712e-06, + "loss": 2.8093, + "step": 37665 + }, + { + "epoch": 2.5594510123658107, + "grad_norm": 5.836372375488281, + "learning_rate": 6.802045114825384e-06, + "loss": 3.0368, + "step": 37670 + }, + { + "epoch": 2.5597907324364724, + "grad_norm": 6.611190319061279, + "learning_rate": 6.8016204647370575e-06, + "loss": 3.2278, + "step": 37675 + }, + { + "epoch": 2.560130452507134, + "grad_norm": 8.029052734375, + "learning_rate": 6.80119581464873e-06, + "loss": 3.0932, + "step": 37680 + }, + { + "epoch": 2.560470172577796, + "grad_norm": 8.089186668395996, + "learning_rate": 6.800771164560402e-06, + "loss": 3.1134, + "step": 37685 + }, + { + "epoch": 2.5608098926484577, + "grad_norm": 8.776652336120605, + "learning_rate": 6.800346514472076e-06, + "loss": 3.2764, + "step": 37690 + }, + { + "epoch": 2.5611496127191193, + "grad_norm": 6.2923784255981445, + "learning_rate": 6.799921864383749e-06, + "loss": 3.0262, + "step": 37695 + }, + { + "epoch": 2.5614893327897814, + "grad_norm": 7.049410820007324, + "learning_rate": 6.799497214295421e-06, + "loss": 2.9947, + "step": 37700 + }, + { + "epoch": 2.561829052860443, + "grad_norm": 7.614809513092041, + "learning_rate": 6.799072564207094e-06, + "loss": 2.9905, + "step": 37705 + }, + { + "epoch": 2.5621687729311047, + "grad_norm": 7.254080295562744, + "learning_rate": 6.798647914118766e-06, + "loss": 2.8877, + "step": 37710 + }, + { + "epoch": 2.5625084930017668, + "grad_norm": 6.1983537673950195, + "learning_rate": 6.798223264030439e-06, + "loss": 3.0222, + "step": 37715 + }, + { + "epoch": 2.5628482130724284, + "grad_norm": 7.477678298950195, + "learning_rate": 6.797798613942113e-06, + "loss": 3.0936, + "step": 37720 + }, + { + "epoch": 2.56318793314309, + "grad_norm": 8.10572338104248, + "learning_rate": 6.797373963853785e-06, + "loss": 2.8648, + "step": 37725 + }, + { + "epoch": 2.563527653213752, + "grad_norm": 6.320599555969238, + "learning_rate": 6.7969493137654575e-06, + "loss": 3.0826, + "step": 37730 + }, + { + "epoch": 2.5638673732844137, + "grad_norm": 6.914572715759277, + "learning_rate": 6.796524663677131e-06, + "loss": 3.074, + "step": 37735 + }, + { + "epoch": 2.5642070933550754, + "grad_norm": 5.958341121673584, + "learning_rate": 6.796100013588803e-06, + "loss": 3.1049, + "step": 37740 + }, + { + "epoch": 2.5645468134257374, + "grad_norm": 5.906381607055664, + "learning_rate": 6.795675363500476e-06, + "loss": 3.1784, + "step": 37745 + }, + { + "epoch": 2.564886533496399, + "grad_norm": 6.563027858734131, + "learning_rate": 6.7952507134121495e-06, + "loss": 2.8635, + "step": 37750 + }, + { + "epoch": 2.5652262535670607, + "grad_norm": 6.492307186126709, + "learning_rate": 6.7948260633238215e-06, + "loss": 3.0875, + "step": 37755 + }, + { + "epoch": 2.5655659736377228, + "grad_norm": 7.288070201873779, + "learning_rate": 6.794401413235494e-06, + "loss": 3.1236, + "step": 37760 + }, + { + "epoch": 2.5659056937083844, + "grad_norm": 6.850299835205078, + "learning_rate": 6.793976763147168e-06, + "loss": 3.0428, + "step": 37765 + }, + { + "epoch": 2.566245413779046, + "grad_norm": 7.045976638793945, + "learning_rate": 6.79355211305884e-06, + "loss": 2.9053, + "step": 37770 + }, + { + "epoch": 2.566585133849708, + "grad_norm": 8.532981872558594, + "learning_rate": 6.793127462970513e-06, + "loss": 3.2979, + "step": 37775 + }, + { + "epoch": 2.5669248539203697, + "grad_norm": 6.067155838012695, + "learning_rate": 6.792702812882186e-06, + "loss": 3.1507, + "step": 37780 + }, + { + "epoch": 2.5672645739910314, + "grad_norm": 5.821338653564453, + "learning_rate": 6.792278162793858e-06, + "loss": 2.8684, + "step": 37785 + }, + { + "epoch": 2.5676042940616934, + "grad_norm": 6.4860711097717285, + "learning_rate": 6.791853512705531e-06, + "loss": 3.301, + "step": 37790 + }, + { + "epoch": 2.567944014132355, + "grad_norm": 6.734237194061279, + "learning_rate": 6.791428862617204e-06, + "loss": 3.0334, + "step": 37795 + }, + { + "epoch": 2.5682837342030167, + "grad_norm": 9.267126083374023, + "learning_rate": 6.791004212528877e-06, + "loss": 3.2475, + "step": 37800 + }, + { + "epoch": 2.5686234542736783, + "grad_norm": 7.119350433349609, + "learning_rate": 6.790579562440549e-06, + "loss": 2.9915, + "step": 37805 + }, + { + "epoch": 2.5689631743443404, + "grad_norm": 6.176825523376465, + "learning_rate": 6.790154912352222e-06, + "loss": 3.1094, + "step": 37810 + }, + { + "epoch": 2.569302894415002, + "grad_norm": 5.9671406745910645, + "learning_rate": 6.789730262263895e-06, + "loss": 3.0757, + "step": 37815 + }, + { + "epoch": 2.5696426144856637, + "grad_norm": 7.5184149742126465, + "learning_rate": 6.789305612175567e-06, + "loss": 3.0648, + "step": 37820 + }, + { + "epoch": 2.5699823345563257, + "grad_norm": 7.003256797790527, + "learning_rate": 6.788880962087241e-06, + "loss": 3.1256, + "step": 37825 + }, + { + "epoch": 2.5703220546269874, + "grad_norm": 6.38278865814209, + "learning_rate": 6.7884563119989135e-06, + "loss": 2.6526, + "step": 37830 + }, + { + "epoch": 2.570661774697649, + "grad_norm": 7.049222946166992, + "learning_rate": 6.7880316619105854e-06, + "loss": 2.8723, + "step": 37835 + }, + { + "epoch": 2.5710014947683106, + "grad_norm": 6.722485542297363, + "learning_rate": 6.787607011822259e-06, + "loss": 2.9336, + "step": 37840 + }, + { + "epoch": 2.5713412148389727, + "grad_norm": 7.743687629699707, + "learning_rate": 6.787182361733932e-06, + "loss": 3.0039, + "step": 37845 + }, + { + "epoch": 2.5716809349096343, + "grad_norm": 7.422345161437988, + "learning_rate": 6.786757711645604e-06, + "loss": 3.1868, + "step": 37850 + }, + { + "epoch": 2.572020654980296, + "grad_norm": 6.2938737869262695, + "learning_rate": 6.7863330615572775e-06, + "loss": 3.1679, + "step": 37855 + }, + { + "epoch": 2.572360375050958, + "grad_norm": 5.745462894439697, + "learning_rate": 6.78590841146895e-06, + "loss": 3.0157, + "step": 37860 + }, + { + "epoch": 2.5727000951216197, + "grad_norm": 7.623197555541992, + "learning_rate": 6.785483761380622e-06, + "loss": 3.2513, + "step": 37865 + }, + { + "epoch": 2.5730398151922813, + "grad_norm": 6.8555402755737305, + "learning_rate": 6.785059111292296e-06, + "loss": 3.098, + "step": 37870 + }, + { + "epoch": 2.5733795352629434, + "grad_norm": 6.776661396026611, + "learning_rate": 6.784634461203969e-06, + "loss": 3.1138, + "step": 37875 + }, + { + "epoch": 2.573719255333605, + "grad_norm": 8.18631362915039, + "learning_rate": 6.7842098111156415e-06, + "loss": 2.9493, + "step": 37880 + }, + { + "epoch": 2.5740589754042666, + "grad_norm": 8.15953254699707, + "learning_rate": 6.783785161027314e-06, + "loss": 2.9907, + "step": 37885 + }, + { + "epoch": 2.5743986954749287, + "grad_norm": 5.886674880981445, + "learning_rate": 6.783360510938986e-06, + "loss": 2.9628, + "step": 37890 + }, + { + "epoch": 2.5747384155455904, + "grad_norm": 8.826915740966797, + "learning_rate": 6.78293586085066e-06, + "loss": 3.1239, + "step": 37895 + }, + { + "epoch": 2.575078135616252, + "grad_norm": 8.431636810302734, + "learning_rate": 6.782511210762333e-06, + "loss": 3.25, + "step": 37900 + }, + { + "epoch": 2.575417855686914, + "grad_norm": 6.559323787689209, + "learning_rate": 6.782086560674005e-06, + "loss": 3.1864, + "step": 37905 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 7.900216579437256, + "learning_rate": 6.781661910585678e-06, + "loss": 3.0195, + "step": 37910 + }, + { + "epoch": 2.5760972958282373, + "grad_norm": 8.00910758972168, + "learning_rate": 6.781237260497351e-06, + "loss": 3.0831, + "step": 37915 + }, + { + "epoch": 2.5764370158988994, + "grad_norm": 5.82229471206665, + "learning_rate": 6.780812610409023e-06, + "loss": 3.0179, + "step": 37920 + }, + { + "epoch": 2.576776735969561, + "grad_norm": 7.388565540313721, + "learning_rate": 6.780387960320697e-06, + "loss": 3.059, + "step": 37925 + }, + { + "epoch": 2.5771164560402227, + "grad_norm": 7.200228214263916, + "learning_rate": 6.7799633102323695e-06, + "loss": 2.9124, + "step": 37930 + }, + { + "epoch": 2.5774561761108847, + "grad_norm": 7.556779384613037, + "learning_rate": 6.7795386601440415e-06, + "loss": 2.7867, + "step": 37935 + }, + { + "epoch": 2.5777958961815464, + "grad_norm": 6.3216633796691895, + "learning_rate": 6.779114010055715e-06, + "loss": 2.9089, + "step": 37940 + }, + { + "epoch": 2.578135616252208, + "grad_norm": 7.121318817138672, + "learning_rate": 6.778689359967388e-06, + "loss": 2.9614, + "step": 37945 + }, + { + "epoch": 2.57847533632287, + "grad_norm": 6.951565742492676, + "learning_rate": 6.77826470987906e-06, + "loss": 2.986, + "step": 37950 + }, + { + "epoch": 2.5788150563935317, + "grad_norm": 6.32493782043457, + "learning_rate": 6.7778400597907335e-06, + "loss": 2.953, + "step": 37955 + }, + { + "epoch": 2.5791547764641933, + "grad_norm": 7.284655570983887, + "learning_rate": 6.7774154097024055e-06, + "loss": 2.8328, + "step": 37960 + }, + { + "epoch": 2.5794944965348554, + "grad_norm": 7.145143032073975, + "learning_rate": 6.776990759614078e-06, + "loss": 3.2202, + "step": 37965 + }, + { + "epoch": 2.579834216605517, + "grad_norm": 7.24539852142334, + "learning_rate": 6.776566109525752e-06, + "loss": 2.8482, + "step": 37970 + }, + { + "epoch": 2.5801739366761787, + "grad_norm": 7.675868988037109, + "learning_rate": 6.776141459437424e-06, + "loss": 3.077, + "step": 37975 + }, + { + "epoch": 2.5805136567468407, + "grad_norm": 6.76090145111084, + "learning_rate": 6.775716809349097e-06, + "loss": 2.9548, + "step": 37980 + }, + { + "epoch": 2.5808533768175024, + "grad_norm": 7.951904296875, + "learning_rate": 6.77529215926077e-06, + "loss": 2.9454, + "step": 37985 + }, + { + "epoch": 2.581193096888164, + "grad_norm": 7.987598419189453, + "learning_rate": 6.774867509172442e-06, + "loss": 3.1596, + "step": 37990 + }, + { + "epoch": 2.581532816958826, + "grad_norm": 6.787177562713623, + "learning_rate": 6.774442859084115e-06, + "loss": 3.2117, + "step": 37995 + }, + { + "epoch": 2.5818725370294877, + "grad_norm": 7.65310525894165, + "learning_rate": 6.774018208995789e-06, + "loss": 2.7807, + "step": 38000 + }, + { + "epoch": 2.5822122571001493, + "grad_norm": 9.014565467834473, + "learning_rate": 6.773593558907461e-06, + "loss": 2.9563, + "step": 38005 + }, + { + "epoch": 2.5825519771708114, + "grad_norm": 5.599524021148682, + "learning_rate": 6.7731689088191335e-06, + "loss": 3.2462, + "step": 38010 + }, + { + "epoch": 2.582891697241473, + "grad_norm": 8.092597007751465, + "learning_rate": 6.772744258730807e-06, + "loss": 3.2466, + "step": 38015 + }, + { + "epoch": 2.5832314173121347, + "grad_norm": 6.782453536987305, + "learning_rate": 6.772319608642479e-06, + "loss": 3.2495, + "step": 38020 + }, + { + "epoch": 2.5835711373827968, + "grad_norm": 5.0927324295043945, + "learning_rate": 6.771894958554152e-06, + "loss": 3.0814, + "step": 38025 + }, + { + "epoch": 2.5839108574534584, + "grad_norm": 7.3053364753723145, + "learning_rate": 6.7714703084658255e-06, + "loss": 3.0004, + "step": 38030 + }, + { + "epoch": 2.58425057752412, + "grad_norm": 8.196005821228027, + "learning_rate": 6.7710456583774975e-06, + "loss": 2.9682, + "step": 38035 + }, + { + "epoch": 2.584590297594782, + "grad_norm": 7.961076736450195, + "learning_rate": 6.77062100828917e-06, + "loss": 3.3137, + "step": 38040 + }, + { + "epoch": 2.5849300176654437, + "grad_norm": 6.650418281555176, + "learning_rate": 6.770196358200843e-06, + "loss": 3.0885, + "step": 38045 + }, + { + "epoch": 2.5852697377361054, + "grad_norm": 6.723734378814697, + "learning_rate": 6.769771708112516e-06, + "loss": 2.9424, + "step": 38050 + }, + { + "epoch": 2.5856094578067674, + "grad_norm": 5.894201278686523, + "learning_rate": 6.769347058024188e-06, + "loss": 3.1819, + "step": 38055 + }, + { + "epoch": 2.585949177877429, + "grad_norm": 6.818343639373779, + "learning_rate": 6.7689224079358615e-06, + "loss": 3.1899, + "step": 38060 + }, + { + "epoch": 2.5862888979480907, + "grad_norm": 8.358367919921875, + "learning_rate": 6.768497757847534e-06, + "loss": 3.2147, + "step": 38065 + }, + { + "epoch": 2.5866286180187528, + "grad_norm": 5.965912818908691, + "learning_rate": 6.768073107759206e-06, + "loss": 3.206, + "step": 38070 + }, + { + "epoch": 2.5869683380894144, + "grad_norm": 6.723337650299072, + "learning_rate": 6.76764845767088e-06, + "loss": 3.2039, + "step": 38075 + }, + { + "epoch": 2.587308058160076, + "grad_norm": 6.787264823913574, + "learning_rate": 6.767223807582553e-06, + "loss": 2.9446, + "step": 38080 + }, + { + "epoch": 2.587647778230738, + "grad_norm": 4.838286876678467, + "learning_rate": 6.766799157494225e-06, + "loss": 3.0789, + "step": 38085 + }, + { + "epoch": 2.5879874983013997, + "grad_norm": 7.612879276275635, + "learning_rate": 6.766374507405898e-06, + "loss": 3.0345, + "step": 38090 + }, + { + "epoch": 2.5883272183720614, + "grad_norm": 7.651739597320557, + "learning_rate": 6.765949857317571e-06, + "loss": 3.0365, + "step": 38095 + }, + { + "epoch": 2.5886669384427234, + "grad_norm": 5.529569149017334, + "learning_rate": 6.765525207229243e-06, + "loss": 2.9773, + "step": 38100 + }, + { + "epoch": 2.589006658513385, + "grad_norm": 5.769416809082031, + "learning_rate": 6.765100557140917e-06, + "loss": 3.2883, + "step": 38105 + }, + { + "epoch": 2.5893463785840467, + "grad_norm": 9.121223449707031, + "learning_rate": 6.7646759070525895e-06, + "loss": 3.312, + "step": 38110 + }, + { + "epoch": 2.589686098654709, + "grad_norm": 7.677315711975098, + "learning_rate": 6.7642512569642614e-06, + "loss": 3.0622, + "step": 38115 + }, + { + "epoch": 2.5900258187253704, + "grad_norm": 9.157130241394043, + "learning_rate": 6.763826606875935e-06, + "loss": 2.977, + "step": 38120 + }, + { + "epoch": 2.590365538796032, + "grad_norm": 7.416688919067383, + "learning_rate": 6.763401956787608e-06, + "loss": 3.2122, + "step": 38125 + }, + { + "epoch": 2.590705258866694, + "grad_norm": 8.669011116027832, + "learning_rate": 6.76297730669928e-06, + "loss": 3.0058, + "step": 38130 + }, + { + "epoch": 2.5910449789373557, + "grad_norm": 7.60522985458374, + "learning_rate": 6.7625526566109535e-06, + "loss": 3.026, + "step": 38135 + }, + { + "epoch": 2.5913846990080174, + "grad_norm": 7.958376407623291, + "learning_rate": 6.7621280065226254e-06, + "loss": 2.9755, + "step": 38140 + }, + { + "epoch": 2.591724419078679, + "grad_norm": 6.946855545043945, + "learning_rate": 6.761703356434298e-06, + "loss": 3.2584, + "step": 38145 + }, + { + "epoch": 2.592064139149341, + "grad_norm": 9.145869255065918, + "learning_rate": 6.761278706345972e-06, + "loss": 3.0239, + "step": 38150 + }, + { + "epoch": 2.5924038592200027, + "grad_norm": 7.329145908355713, + "learning_rate": 6.760854056257644e-06, + "loss": 3.0678, + "step": 38155 + }, + { + "epoch": 2.5927435792906643, + "grad_norm": 6.365365982055664, + "learning_rate": 6.760429406169317e-06, + "loss": 3.0469, + "step": 38160 + }, + { + "epoch": 2.5930832993613264, + "grad_norm": 6.757895469665527, + "learning_rate": 6.76000475608099e-06, + "loss": 2.8889, + "step": 38165 + }, + { + "epoch": 2.593423019431988, + "grad_norm": 7.666064739227295, + "learning_rate": 6.759580105992662e-06, + "loss": 2.9106, + "step": 38170 + }, + { + "epoch": 2.5937627395026497, + "grad_norm": 7.155445098876953, + "learning_rate": 6.759155455904335e-06, + "loss": 2.8295, + "step": 38175 + }, + { + "epoch": 2.5941024595733113, + "grad_norm": 6.332232475280762, + "learning_rate": 6.758730805816009e-06, + "loss": 3.0616, + "step": 38180 + }, + { + "epoch": 2.5944421796439734, + "grad_norm": 6.554192543029785, + "learning_rate": 6.758306155727681e-06, + "loss": 2.8066, + "step": 38185 + }, + { + "epoch": 2.594781899714635, + "grad_norm": 7.056439399719238, + "learning_rate": 6.7578815056393534e-06, + "loss": 2.915, + "step": 38190 + }, + { + "epoch": 2.5951216197852967, + "grad_norm": 8.455979347229004, + "learning_rate": 6.757456855551027e-06, + "loss": 3.2596, + "step": 38195 + }, + { + "epoch": 2.5954613398559587, + "grad_norm": 6.420719623565674, + "learning_rate": 6.757032205462699e-06, + "loss": 2.9739, + "step": 38200 + }, + { + "epoch": 2.5958010599266204, + "grad_norm": 6.5813727378845215, + "learning_rate": 6.756607555374372e-06, + "loss": 3.0002, + "step": 38205 + }, + { + "epoch": 2.596140779997282, + "grad_norm": 5.536645412445068, + "learning_rate": 6.756182905286045e-06, + "loss": 3.1133, + "step": 38210 + }, + { + "epoch": 2.596480500067944, + "grad_norm": 7.2702789306640625, + "learning_rate": 6.7557582551977174e-06, + "loss": 2.9561, + "step": 38215 + }, + { + "epoch": 2.5968202201386057, + "grad_norm": 6.722850322723389, + "learning_rate": 6.755333605109391e-06, + "loss": 2.9917, + "step": 38220 + }, + { + "epoch": 2.5971599402092673, + "grad_norm": 6.505650997161865, + "learning_rate": 6.754908955021063e-06, + "loss": 3.0083, + "step": 38225 + }, + { + "epoch": 2.5974996602799294, + "grad_norm": 6.997833251953125, + "learning_rate": 6.754484304932736e-06, + "loss": 3.1292, + "step": 38230 + }, + { + "epoch": 2.597839380350591, + "grad_norm": 6.373660087585449, + "learning_rate": 6.7540596548444095e-06, + "loss": 3.1647, + "step": 38235 + }, + { + "epoch": 2.5981791004212527, + "grad_norm": 6.715140342712402, + "learning_rate": 6.7536350047560815e-06, + "loss": 3.0237, + "step": 38240 + }, + { + "epoch": 2.5985188204919147, + "grad_norm": 6.616750717163086, + "learning_rate": 6.753210354667754e-06, + "loss": 3.1715, + "step": 38245 + }, + { + "epoch": 2.5988585405625764, + "grad_norm": 7.073883533477783, + "learning_rate": 6.752785704579428e-06, + "loss": 3.3112, + "step": 38250 + }, + { + "epoch": 2.599198260633238, + "grad_norm": 7.520802974700928, + "learning_rate": 6.7523610544911e-06, + "loss": 3.1479, + "step": 38255 + }, + { + "epoch": 2.5995379807039, + "grad_norm": 6.678810119628906, + "learning_rate": 6.751936404402773e-06, + "loss": 3.0595, + "step": 38260 + }, + { + "epoch": 2.5998777007745617, + "grad_norm": 7.230568885803223, + "learning_rate": 6.751511754314446e-06, + "loss": 2.8678, + "step": 38265 + }, + { + "epoch": 2.6002174208452233, + "grad_norm": 8.24029541015625, + "learning_rate": 6.751087104226118e-06, + "loss": 3.2243, + "step": 38270 + }, + { + "epoch": 2.6005571409158854, + "grad_norm": 5.271633625030518, + "learning_rate": 6.750662454137791e-06, + "loss": 2.916, + "step": 38275 + }, + { + "epoch": 2.600896860986547, + "grad_norm": 7.164420127868652, + "learning_rate": 6.750237804049464e-06, + "loss": 2.9612, + "step": 38280 + }, + { + "epoch": 2.6012365810572087, + "grad_norm": 6.685145854949951, + "learning_rate": 6.749813153961137e-06, + "loss": 3.1318, + "step": 38285 + }, + { + "epoch": 2.6015763011278707, + "grad_norm": 9.213006973266602, + "learning_rate": 6.7493885038728095e-06, + "loss": 2.9523, + "step": 38290 + }, + { + "epoch": 2.6019160211985324, + "grad_norm": 7.212219715118408, + "learning_rate": 6.748963853784482e-06, + "loss": 3.0704, + "step": 38295 + }, + { + "epoch": 2.602255741269194, + "grad_norm": 9.471741676330566, + "learning_rate": 6.748539203696155e-06, + "loss": 3.1724, + "step": 38300 + }, + { + "epoch": 2.602595461339856, + "grad_norm": 8.253273010253906, + "learning_rate": 6.748114553607827e-06, + "loss": 2.9378, + "step": 38305 + }, + { + "epoch": 2.6029351814105177, + "grad_norm": 6.773858547210693, + "learning_rate": 6.747689903519501e-06, + "loss": 2.9642, + "step": 38310 + }, + { + "epoch": 2.6032749014811793, + "grad_norm": 9.097108840942383, + "learning_rate": 6.7472652534311735e-06, + "loss": 2.8927, + "step": 38315 + }, + { + "epoch": 2.6036146215518414, + "grad_norm": 6.868325233459473, + "learning_rate": 6.746840603342845e-06, + "loss": 3.1682, + "step": 38320 + }, + { + "epoch": 2.603954341622503, + "grad_norm": 8.027026176452637, + "learning_rate": 6.746415953254519e-06, + "loss": 3.3692, + "step": 38325 + }, + { + "epoch": 2.6042940616931647, + "grad_norm": 5.583869934082031, + "learning_rate": 6.745991303166192e-06, + "loss": 3.1744, + "step": 38330 + }, + { + "epoch": 2.6046337817638268, + "grad_norm": 6.896630764007568, + "learning_rate": 6.745566653077864e-06, + "loss": 3.0988, + "step": 38335 + }, + { + "epoch": 2.6049735018344884, + "grad_norm": 9.265917778015137, + "learning_rate": 6.7451420029895375e-06, + "loss": 3.0999, + "step": 38340 + }, + { + "epoch": 2.60531322190515, + "grad_norm": 6.720678806304932, + "learning_rate": 6.74471735290121e-06, + "loss": 3.0687, + "step": 38345 + }, + { + "epoch": 2.605652941975812, + "grad_norm": 7.203989028930664, + "learning_rate": 6.744292702812882e-06, + "loss": 2.8141, + "step": 38350 + }, + { + "epoch": 2.6059926620464737, + "grad_norm": 8.661796569824219, + "learning_rate": 6.743868052724556e-06, + "loss": 2.8887, + "step": 38355 + }, + { + "epoch": 2.6063323821171354, + "grad_norm": 8.45517349243164, + "learning_rate": 6.743443402636229e-06, + "loss": 3.0048, + "step": 38360 + }, + { + "epoch": 2.6066721021877974, + "grad_norm": 8.276408195495605, + "learning_rate": 6.743018752547901e-06, + "loss": 2.86, + "step": 38365 + }, + { + "epoch": 2.607011822258459, + "grad_norm": 5.785329818725586, + "learning_rate": 6.742594102459574e-06, + "loss": 3.1824, + "step": 38370 + }, + { + "epoch": 2.6073515423291207, + "grad_norm": 6.143020153045654, + "learning_rate": 6.742169452371246e-06, + "loss": 3.3879, + "step": 38375 + }, + { + "epoch": 2.6076912623997828, + "grad_norm": 6.628415584564209, + "learning_rate": 6.741744802282919e-06, + "loss": 3.1872, + "step": 38380 + }, + { + "epoch": 2.6080309824704444, + "grad_norm": 7.980793476104736, + "learning_rate": 6.741320152194593e-06, + "loss": 3.1279, + "step": 38385 + }, + { + "epoch": 2.608370702541106, + "grad_norm": 5.665501117706299, + "learning_rate": 6.740895502106265e-06, + "loss": 2.9555, + "step": 38390 + }, + { + "epoch": 2.608710422611768, + "grad_norm": 6.361156463623047, + "learning_rate": 6.7404708520179374e-06, + "loss": 3.2394, + "step": 38395 + }, + { + "epoch": 2.6090501426824297, + "grad_norm": 5.256187915802002, + "learning_rate": 6.740046201929611e-06, + "loss": 3.2428, + "step": 38400 + }, + { + "epoch": 2.6093898627530914, + "grad_norm": 7.055069446563721, + "learning_rate": 6.739621551841283e-06, + "loss": 3.0769, + "step": 38405 + }, + { + "epoch": 2.6097295828237534, + "grad_norm": 7.21749210357666, + "learning_rate": 6.739196901752956e-06, + "loss": 3.0888, + "step": 38410 + }, + { + "epoch": 2.610069302894415, + "grad_norm": 6.477624416351318, + "learning_rate": 6.7387722516646295e-06, + "loss": 3.0458, + "step": 38415 + }, + { + "epoch": 2.6104090229650767, + "grad_norm": 6.400271415710449, + "learning_rate": 6.7383476015763014e-06, + "loss": 3.0194, + "step": 38420 + }, + { + "epoch": 2.610748743035739, + "grad_norm": 6.613548278808594, + "learning_rate": 6.737922951487974e-06, + "loss": 3.067, + "step": 38425 + }, + { + "epoch": 2.6110884631064004, + "grad_norm": 6.370099067687988, + "learning_rate": 6.737498301399648e-06, + "loss": 2.755, + "step": 38430 + }, + { + "epoch": 2.611428183177062, + "grad_norm": 7.1621222496032715, + "learning_rate": 6.73707365131132e-06, + "loss": 2.7602, + "step": 38435 + }, + { + "epoch": 2.611767903247724, + "grad_norm": 8.532333374023438, + "learning_rate": 6.736649001222993e-06, + "loss": 2.9797, + "step": 38440 + }, + { + "epoch": 2.6121076233183858, + "grad_norm": 7.773689270019531, + "learning_rate": 6.736224351134666e-06, + "loss": 2.7742, + "step": 38445 + }, + { + "epoch": 2.6124473433890474, + "grad_norm": 5.774016380310059, + "learning_rate": 6.735799701046338e-06, + "loss": 2.9457, + "step": 38450 + }, + { + "epoch": 2.6127870634597095, + "grad_norm": 6.7160186767578125, + "learning_rate": 6.735375050958011e-06, + "loss": 3.1302, + "step": 38455 + }, + { + "epoch": 2.613126783530371, + "grad_norm": 5.943790912628174, + "learning_rate": 6.734950400869684e-06, + "loss": 3.1519, + "step": 38460 + }, + { + "epoch": 2.6134665036010327, + "grad_norm": 6.265141487121582, + "learning_rate": 6.734525750781357e-06, + "loss": 3.069, + "step": 38465 + }, + { + "epoch": 2.613806223671695, + "grad_norm": 5.981054782867432, + "learning_rate": 6.7341011006930294e-06, + "loss": 2.9329, + "step": 38470 + }, + { + "epoch": 2.6141459437423564, + "grad_norm": 6.716476917266846, + "learning_rate": 6.733676450604702e-06, + "loss": 2.9395, + "step": 38475 + }, + { + "epoch": 2.614485663813018, + "grad_norm": 7.683617115020752, + "learning_rate": 6.733251800516375e-06, + "loss": 3.2182, + "step": 38480 + }, + { + "epoch": 2.6148253838836797, + "grad_norm": 6.528001308441162, + "learning_rate": 6.732827150428047e-06, + "loss": 2.9932, + "step": 38485 + }, + { + "epoch": 2.6151651039543418, + "grad_norm": 8.939292907714844, + "learning_rate": 6.732402500339721e-06, + "loss": 2.837, + "step": 38490 + }, + { + "epoch": 2.6155048240250034, + "grad_norm": 6.598121166229248, + "learning_rate": 6.7319778502513934e-06, + "loss": 3.0954, + "step": 38495 + }, + { + "epoch": 2.615844544095665, + "grad_norm": 10.018739700317383, + "learning_rate": 6.731553200163065e-06, + "loss": 2.8178, + "step": 38500 + }, + { + "epoch": 2.616184264166327, + "grad_norm": 7.05116605758667, + "learning_rate": 6.731128550074739e-06, + "loss": 2.7049, + "step": 38505 + }, + { + "epoch": 2.6165239842369887, + "grad_norm": 8.308996200561523, + "learning_rate": 6.730703899986412e-06, + "loss": 2.9321, + "step": 38510 + }, + { + "epoch": 2.6168637043076504, + "grad_norm": 10.11319637298584, + "learning_rate": 6.730279249898084e-06, + "loss": 2.7973, + "step": 38515 + }, + { + "epoch": 2.617203424378312, + "grad_norm": 7.4615888595581055, + "learning_rate": 6.7298545998097574e-06, + "loss": 3.0783, + "step": 38520 + }, + { + "epoch": 2.617543144448974, + "grad_norm": 7.235738277435303, + "learning_rate": 6.72942994972143e-06, + "loss": 2.9581, + "step": 38525 + }, + { + "epoch": 2.6178828645196357, + "grad_norm": 9.192206382751465, + "learning_rate": 6.729005299633102e-06, + "loss": 3.029, + "step": 38530 + }, + { + "epoch": 2.6182225845902973, + "grad_norm": 4.357504844665527, + "learning_rate": 6.728580649544776e-06, + "loss": 2.73, + "step": 38535 + }, + { + "epoch": 2.6185623046609594, + "grad_norm": 5.39738655090332, + "learning_rate": 6.728155999456449e-06, + "loss": 3.0655, + "step": 38540 + }, + { + "epoch": 2.618902024731621, + "grad_norm": 6.525325298309326, + "learning_rate": 6.727731349368121e-06, + "loss": 2.9748, + "step": 38545 + }, + { + "epoch": 2.6192417448022827, + "grad_norm": 5.614326477050781, + "learning_rate": 6.727306699279794e-06, + "loss": 3.0086, + "step": 38550 + }, + { + "epoch": 2.6195814648729447, + "grad_norm": 7.497986793518066, + "learning_rate": 6.726882049191466e-06, + "loss": 3.1219, + "step": 38555 + }, + { + "epoch": 2.6199211849436064, + "grad_norm": 7.829811096191406, + "learning_rate": 6.72645739910314e-06, + "loss": 2.931, + "step": 38560 + }, + { + "epoch": 2.620260905014268, + "grad_norm": 7.632440090179443, + "learning_rate": 6.726032749014813e-06, + "loss": 2.9796, + "step": 38565 + }, + { + "epoch": 2.62060062508493, + "grad_norm": 6.92441987991333, + "learning_rate": 6.725608098926485e-06, + "loss": 3.1858, + "step": 38570 + }, + { + "epoch": 2.6209403451555917, + "grad_norm": 7.196139335632324, + "learning_rate": 6.725183448838158e-06, + "loss": 3.0087, + "step": 38575 + }, + { + "epoch": 2.6212800652262533, + "grad_norm": 6.116889476776123, + "learning_rate": 6.724758798749831e-06, + "loss": 3.0796, + "step": 38580 + }, + { + "epoch": 2.6216197852969154, + "grad_norm": 7.471711158752441, + "learning_rate": 6.724334148661503e-06, + "loss": 3.1338, + "step": 38585 + }, + { + "epoch": 2.621959505367577, + "grad_norm": 7.380681037902832, + "learning_rate": 6.723909498573177e-06, + "loss": 3.0269, + "step": 38590 + }, + { + "epoch": 2.6222992254382387, + "grad_norm": 9.608914375305176, + "learning_rate": 6.7234848484848495e-06, + "loss": 3.1232, + "step": 38595 + }, + { + "epoch": 2.6226389455089008, + "grad_norm": 6.968075275421143, + "learning_rate": 6.723060198396521e-06, + "loss": 3.0435, + "step": 38600 + }, + { + "epoch": 2.6229786655795624, + "grad_norm": 7.39564847946167, + "learning_rate": 6.722635548308195e-06, + "loss": 3.2526, + "step": 38605 + }, + { + "epoch": 2.623318385650224, + "grad_norm": 6.6999921798706055, + "learning_rate": 6.722210898219868e-06, + "loss": 3.1134, + "step": 38610 + }, + { + "epoch": 2.623658105720886, + "grad_norm": 6.842861652374268, + "learning_rate": 6.72178624813154e-06, + "loss": 2.9407, + "step": 38615 + }, + { + "epoch": 2.6239978257915477, + "grad_norm": 8.5570650100708, + "learning_rate": 6.7213615980432135e-06, + "loss": 3.0609, + "step": 38620 + }, + { + "epoch": 2.6243375458622094, + "grad_norm": 7.53179407119751, + "learning_rate": 6.720936947954885e-06, + "loss": 3.2797, + "step": 38625 + }, + { + "epoch": 2.6246772659328714, + "grad_norm": 6.40292501449585, + "learning_rate": 6.720512297866558e-06, + "loss": 3.2869, + "step": 38630 + }, + { + "epoch": 2.625016986003533, + "grad_norm": 7.9715070724487305, + "learning_rate": 6.720087647778232e-06, + "loss": 3.2946, + "step": 38635 + }, + { + "epoch": 2.6253567060741947, + "grad_norm": 7.857914924621582, + "learning_rate": 6.719662997689904e-06, + "loss": 3.1251, + "step": 38640 + }, + { + "epoch": 2.6256964261448568, + "grad_norm": 6.220840930938721, + "learning_rate": 6.719238347601577e-06, + "loss": 3.1904, + "step": 38645 + }, + { + "epoch": 2.6260361462155184, + "grad_norm": 7.171165466308594, + "learning_rate": 6.71881369751325e-06, + "loss": 3.1512, + "step": 38650 + }, + { + "epoch": 2.62637586628618, + "grad_norm": 7.737171649932861, + "learning_rate": 6.718389047424922e-06, + "loss": 2.7692, + "step": 38655 + }, + { + "epoch": 2.626715586356842, + "grad_norm": 6.739435195922852, + "learning_rate": 6.717964397336595e-06, + "loss": 2.8682, + "step": 38660 + }, + { + "epoch": 2.6270553064275037, + "grad_norm": 5.754110336303711, + "learning_rate": 6.717539747248269e-06, + "loss": 2.8743, + "step": 38665 + }, + { + "epoch": 2.6273950264981654, + "grad_norm": 8.08150577545166, + "learning_rate": 6.717115097159941e-06, + "loss": 3.1334, + "step": 38670 + }, + { + "epoch": 2.6277347465688274, + "grad_norm": 7.477474689483643, + "learning_rate": 6.7166904470716134e-06, + "loss": 2.8798, + "step": 38675 + }, + { + "epoch": 2.628074466639489, + "grad_norm": 8.071650505065918, + "learning_rate": 6.716265796983287e-06, + "loss": 3.1024, + "step": 38680 + }, + { + "epoch": 2.6284141867101507, + "grad_norm": 6.4072465896606445, + "learning_rate": 6.715841146894959e-06, + "loss": 3.2167, + "step": 38685 + }, + { + "epoch": 2.6287539067808128, + "grad_norm": 6.198503494262695, + "learning_rate": 6.715416496806632e-06, + "loss": 3.1408, + "step": 38690 + }, + { + "epoch": 2.6290936268514744, + "grad_norm": 8.358064651489258, + "learning_rate": 6.7149918467183055e-06, + "loss": 3.0601, + "step": 38695 + }, + { + "epoch": 2.629433346922136, + "grad_norm": 5.939841270446777, + "learning_rate": 6.7145671966299774e-06, + "loss": 2.9833, + "step": 38700 + }, + { + "epoch": 2.629773066992798, + "grad_norm": 6.87867546081543, + "learning_rate": 6.71414254654165e-06, + "loss": 3.0998, + "step": 38705 + }, + { + "epoch": 2.6301127870634597, + "grad_norm": 4.9479079246521, + "learning_rate": 6.713717896453323e-06, + "loss": 3.1721, + "step": 38710 + }, + { + "epoch": 2.6304525071341214, + "grad_norm": 8.58785629272461, + "learning_rate": 6.713293246364996e-06, + "loss": 2.9722, + "step": 38715 + }, + { + "epoch": 2.6307922272047835, + "grad_norm": 8.531195640563965, + "learning_rate": 6.712868596276668e-06, + "loss": 3.0982, + "step": 38720 + }, + { + "epoch": 2.631131947275445, + "grad_norm": 5.978361129760742, + "learning_rate": 6.7124439461883414e-06, + "loss": 3.0284, + "step": 38725 + }, + { + "epoch": 2.6314716673461067, + "grad_norm": 8.555500030517578, + "learning_rate": 6.712019296100014e-06, + "loss": 2.9817, + "step": 38730 + }, + { + "epoch": 2.631811387416769, + "grad_norm": 6.966934680938721, + "learning_rate": 6.711594646011686e-06, + "loss": 3.0898, + "step": 38735 + }, + { + "epoch": 2.6321511074874304, + "grad_norm": 7.678680419921875, + "learning_rate": 6.71116999592336e-06, + "loss": 3.0266, + "step": 38740 + }, + { + "epoch": 2.632490827558092, + "grad_norm": 7.16014289855957, + "learning_rate": 6.710745345835033e-06, + "loss": 3.072, + "step": 38745 + }, + { + "epoch": 2.632830547628754, + "grad_norm": 5.935529708862305, + "learning_rate": 6.710320695746705e-06, + "loss": 3.0973, + "step": 38750 + }, + { + "epoch": 2.6331702676994158, + "grad_norm": 8.362955093383789, + "learning_rate": 6.709896045658378e-06, + "loss": 2.8952, + "step": 38755 + }, + { + "epoch": 2.6335099877700774, + "grad_norm": 7.244884967803955, + "learning_rate": 6.709471395570051e-06, + "loss": 3.0824, + "step": 38760 + }, + { + "epoch": 2.6338497078407395, + "grad_norm": 6.3490424156188965, + "learning_rate": 6.709046745481723e-06, + "loss": 2.9473, + "step": 38765 + }, + { + "epoch": 2.634189427911401, + "grad_norm": 9.641876220703125, + "learning_rate": 6.708622095393397e-06, + "loss": 3.0917, + "step": 38770 + }, + { + "epoch": 2.6345291479820627, + "grad_norm": 6.3485846519470215, + "learning_rate": 6.7081974453050694e-06, + "loss": 3.0389, + "step": 38775 + }, + { + "epoch": 2.634868868052725, + "grad_norm": 5.909295082092285, + "learning_rate": 6.707772795216741e-06, + "loss": 3.1891, + "step": 38780 + }, + { + "epoch": 2.6352085881233864, + "grad_norm": 8.08013916015625, + "learning_rate": 6.707348145128415e-06, + "loss": 3.3239, + "step": 38785 + }, + { + "epoch": 2.635548308194048, + "grad_norm": 8.506105422973633, + "learning_rate": 6.706923495040088e-06, + "loss": 3.1322, + "step": 38790 + }, + { + "epoch": 2.63588802826471, + "grad_norm": 7.198599815368652, + "learning_rate": 6.70649884495176e-06, + "loss": 3.0078, + "step": 38795 + }, + { + "epoch": 2.6362277483353718, + "grad_norm": 7.162069797515869, + "learning_rate": 6.7060741948634334e-06, + "loss": 3.1524, + "step": 38800 + }, + { + "epoch": 2.6365674684060334, + "grad_norm": 7.6695122718811035, + "learning_rate": 6.705649544775105e-06, + "loss": 2.9417, + "step": 38805 + }, + { + "epoch": 2.6369071884766955, + "grad_norm": 10.015180587768555, + "learning_rate": 6.705224894686778e-06, + "loss": 2.9077, + "step": 38810 + }, + { + "epoch": 2.637246908547357, + "grad_norm": 7.383358478546143, + "learning_rate": 6.704800244598452e-06, + "loss": 2.9038, + "step": 38815 + }, + { + "epoch": 2.6375866286180187, + "grad_norm": 7.861931800842285, + "learning_rate": 6.704375594510124e-06, + "loss": 3.2788, + "step": 38820 + }, + { + "epoch": 2.6379263486886804, + "grad_norm": 8.095179557800293, + "learning_rate": 6.703950944421797e-06, + "loss": 3.1869, + "step": 38825 + }, + { + "epoch": 2.6382660687593424, + "grad_norm": 6.890710353851318, + "learning_rate": 6.70352629433347e-06, + "loss": 3.0458, + "step": 38830 + }, + { + "epoch": 2.638605788830004, + "grad_norm": 6.696605682373047, + "learning_rate": 6.703101644245142e-06, + "loss": 3.1397, + "step": 38835 + }, + { + "epoch": 2.6389455089006657, + "grad_norm": 6.222130298614502, + "learning_rate": 6.702676994156815e-06, + "loss": 2.919, + "step": 38840 + }, + { + "epoch": 2.639285228971328, + "grad_norm": 5.152824401855469, + "learning_rate": 6.702252344068489e-06, + "loss": 3.011, + "step": 38845 + }, + { + "epoch": 2.6396249490419894, + "grad_norm": 7.929075717926025, + "learning_rate": 6.701827693980161e-06, + "loss": 2.9713, + "step": 38850 + }, + { + "epoch": 2.639964669112651, + "grad_norm": 5.989529609680176, + "learning_rate": 6.701403043891833e-06, + "loss": 3.1431, + "step": 38855 + }, + { + "epoch": 2.640304389183313, + "grad_norm": 6.766332149505615, + "learning_rate": 6.700978393803507e-06, + "loss": 3.0277, + "step": 38860 + }, + { + "epoch": 2.6406441092539747, + "grad_norm": 7.153970241546631, + "learning_rate": 6.700553743715179e-06, + "loss": 3.0166, + "step": 38865 + }, + { + "epoch": 2.6409838293246364, + "grad_norm": 6.169755458831787, + "learning_rate": 6.700129093626852e-06, + "loss": 2.9382, + "step": 38870 + }, + { + "epoch": 2.641323549395298, + "grad_norm": 7.800542831420898, + "learning_rate": 6.699704443538525e-06, + "loss": 3.2443, + "step": 38875 + }, + { + "epoch": 2.64166326946596, + "grad_norm": 5.477301120758057, + "learning_rate": 6.699279793450197e-06, + "loss": 2.9629, + "step": 38880 + }, + { + "epoch": 2.6420029895366217, + "grad_norm": 7.422054290771484, + "learning_rate": 6.69885514336187e-06, + "loss": 3.2967, + "step": 38885 + }, + { + "epoch": 2.6423427096072833, + "grad_norm": 6.536081314086914, + "learning_rate": 6.698430493273543e-06, + "loss": 3.0806, + "step": 38890 + }, + { + "epoch": 2.6426824296779454, + "grad_norm": 7.4979567527771, + "learning_rate": 6.698005843185216e-06, + "loss": 3.061, + "step": 38895 + }, + { + "epoch": 2.643022149748607, + "grad_norm": 6.868211269378662, + "learning_rate": 6.6975811930968895e-06, + "loss": 2.7067, + "step": 38900 + }, + { + "epoch": 2.6433618698192687, + "grad_norm": 7.665322303771973, + "learning_rate": 6.697156543008561e-06, + "loss": 2.9113, + "step": 38905 + }, + { + "epoch": 2.6437015898899308, + "grad_norm": 5.235840320587158, + "learning_rate": 6.696731892920234e-06, + "loss": 2.9154, + "step": 38910 + }, + { + "epoch": 2.6440413099605924, + "grad_norm": 7.087958335876465, + "learning_rate": 6.696307242831908e-06, + "loss": 2.9937, + "step": 38915 + }, + { + "epoch": 2.644381030031254, + "grad_norm": 7.895360469818115, + "learning_rate": 6.69588259274358e-06, + "loss": 3.1683, + "step": 38920 + }, + { + "epoch": 2.644720750101916, + "grad_norm": 6.657004356384277, + "learning_rate": 6.695457942655253e-06, + "loss": 2.8136, + "step": 38925 + }, + { + "epoch": 2.6450604701725777, + "grad_norm": 7.033236026763916, + "learning_rate": 6.695033292566926e-06, + "loss": 2.7595, + "step": 38930 + }, + { + "epoch": 2.6454001902432394, + "grad_norm": 6.787055492401123, + "learning_rate": 6.694608642478598e-06, + "loss": 2.8981, + "step": 38935 + }, + { + "epoch": 2.6457399103139014, + "grad_norm": 5.688292026519775, + "learning_rate": 6.694183992390271e-06, + "loss": 2.9217, + "step": 38940 + }, + { + "epoch": 2.646079630384563, + "grad_norm": 6.6314239501953125, + "learning_rate": 6.693759342301944e-06, + "loss": 3.1625, + "step": 38945 + }, + { + "epoch": 2.6464193504552247, + "grad_norm": 8.713077545166016, + "learning_rate": 6.693334692213617e-06, + "loss": 2.9593, + "step": 38950 + }, + { + "epoch": 2.6467590705258868, + "grad_norm": 6.734726905822754, + "learning_rate": 6.692910042125289e-06, + "loss": 2.9451, + "step": 38955 + }, + { + "epoch": 2.6470987905965484, + "grad_norm": 7.501668453216553, + "learning_rate": 6.692485392036962e-06, + "loss": 2.8149, + "step": 38960 + }, + { + "epoch": 2.64743851066721, + "grad_norm": 7.730534553527832, + "learning_rate": 6.692060741948635e-06, + "loss": 3.0091, + "step": 38965 + }, + { + "epoch": 2.647778230737872, + "grad_norm": 6.66005277633667, + "learning_rate": 6.691636091860307e-06, + "loss": 3.0936, + "step": 38970 + }, + { + "epoch": 2.6481179508085337, + "grad_norm": 5.948374271392822, + "learning_rate": 6.691211441771981e-06, + "loss": 3.0766, + "step": 38975 + }, + { + "epoch": 2.6484576708791954, + "grad_norm": 7.55396842956543, + "learning_rate": 6.6907867916836534e-06, + "loss": 3.1772, + "step": 38980 + }, + { + "epoch": 2.6487973909498574, + "grad_norm": 6.667080879211426, + "learning_rate": 6.690362141595325e-06, + "loss": 2.9756, + "step": 38985 + }, + { + "epoch": 2.649137111020519, + "grad_norm": 8.477561950683594, + "learning_rate": 6.689937491506999e-06, + "loss": 2.9865, + "step": 38990 + }, + { + "epoch": 2.6494768310911807, + "grad_norm": 7.520643711090088, + "learning_rate": 6.689512841418672e-06, + "loss": 2.9446, + "step": 38995 + }, + { + "epoch": 2.649816551161843, + "grad_norm": 7.323678016662598, + "learning_rate": 6.689088191330344e-06, + "loss": 3.2461, + "step": 39000 + }, + { + "epoch": 2.6501562712325044, + "grad_norm": 7.559390068054199, + "learning_rate": 6.6886635412420174e-06, + "loss": 2.903, + "step": 39005 + }, + { + "epoch": 2.650495991303166, + "grad_norm": 7.569305419921875, + "learning_rate": 6.68823889115369e-06, + "loss": 3.2274, + "step": 39010 + }, + { + "epoch": 2.650835711373828, + "grad_norm": 7.84682559967041, + "learning_rate": 6.687814241065362e-06, + "loss": 3.0285, + "step": 39015 + }, + { + "epoch": 2.6511754314444897, + "grad_norm": 8.42270565032959, + "learning_rate": 6.687389590977036e-06, + "loss": 2.8484, + "step": 39020 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 6.6350789070129395, + "learning_rate": 6.686964940888709e-06, + "loss": 3.2288, + "step": 39025 + }, + { + "epoch": 2.6518548715858135, + "grad_norm": 7.495988845825195, + "learning_rate": 6.686540290800381e-06, + "loss": 3.1446, + "step": 39030 + }, + { + "epoch": 2.652194591656475, + "grad_norm": 7.352488994598389, + "learning_rate": 6.686115640712054e-06, + "loss": 3.0903, + "step": 39035 + }, + { + "epoch": 2.6525343117271367, + "grad_norm": 7.745459079742432, + "learning_rate": 6.685690990623727e-06, + "loss": 3.1308, + "step": 39040 + }, + { + "epoch": 2.652874031797799, + "grad_norm": 8.042099952697754, + "learning_rate": 6.685266340535399e-06, + "loss": 3.0707, + "step": 39045 + }, + { + "epoch": 2.6532137518684604, + "grad_norm": 8.69625186920166, + "learning_rate": 6.684841690447073e-06, + "loss": 2.9271, + "step": 39050 + }, + { + "epoch": 2.653553471939122, + "grad_norm": 9.018759727478027, + "learning_rate": 6.684417040358745e-06, + "loss": 2.7543, + "step": 39055 + }, + { + "epoch": 2.653893192009784, + "grad_norm": 5.875161647796631, + "learning_rate": 6.683992390270417e-06, + "loss": 3.126, + "step": 39060 + }, + { + "epoch": 2.6542329120804458, + "grad_norm": 6.168339252471924, + "learning_rate": 6.683567740182091e-06, + "loss": 2.7829, + "step": 39065 + }, + { + "epoch": 2.6545726321511074, + "grad_norm": 7.4145307540893555, + "learning_rate": 6.683143090093763e-06, + "loss": 3.3162, + "step": 39070 + }, + { + "epoch": 2.6549123522217695, + "grad_norm": 8.483137130737305, + "learning_rate": 6.682718440005436e-06, + "loss": 2.911, + "step": 39075 + }, + { + "epoch": 2.655252072292431, + "grad_norm": 6.657697677612305, + "learning_rate": 6.6822937899171094e-06, + "loss": 3.0931, + "step": 39080 + }, + { + "epoch": 2.6555917923630927, + "grad_norm": 6.792809963226318, + "learning_rate": 6.681869139828781e-06, + "loss": 3.0962, + "step": 39085 + }, + { + "epoch": 2.655931512433755, + "grad_norm": 6.3655686378479, + "learning_rate": 6.681444489740454e-06, + "loss": 3.0206, + "step": 39090 + }, + { + "epoch": 2.6562712325044164, + "grad_norm": 6.6296162605285645, + "learning_rate": 6.681019839652128e-06, + "loss": 3.0402, + "step": 39095 + }, + { + "epoch": 2.656610952575078, + "grad_norm": 9.393792152404785, + "learning_rate": 6.6805951895638e-06, + "loss": 3.0583, + "step": 39100 + }, + { + "epoch": 2.65695067264574, + "grad_norm": 5.946435451507568, + "learning_rate": 6.680170539475473e-06, + "loss": 2.9079, + "step": 39105 + }, + { + "epoch": 2.6572903927164018, + "grad_norm": 8.581501007080078, + "learning_rate": 6.679745889387146e-06, + "loss": 2.8085, + "step": 39110 + }, + { + "epoch": 2.6576301127870634, + "grad_norm": 8.337575912475586, + "learning_rate": 6.679321239298818e-06, + "loss": 2.9459, + "step": 39115 + }, + { + "epoch": 2.6579698328577255, + "grad_norm": 7.170663356781006, + "learning_rate": 6.678896589210491e-06, + "loss": 3.2621, + "step": 39120 + }, + { + "epoch": 2.658309552928387, + "grad_norm": 6.021507263183594, + "learning_rate": 6.678471939122164e-06, + "loss": 2.7515, + "step": 39125 + }, + { + "epoch": 2.6586492729990487, + "grad_norm": 6.7473063468933105, + "learning_rate": 6.678047289033837e-06, + "loss": 2.8617, + "step": 39130 + }, + { + "epoch": 2.658988993069711, + "grad_norm": 7.940512180328369, + "learning_rate": 6.677622638945509e-06, + "loss": 3.1199, + "step": 39135 + }, + { + "epoch": 2.6593287131403724, + "grad_norm": 6.639886379241943, + "learning_rate": 6.677197988857182e-06, + "loss": 3.0341, + "step": 39140 + }, + { + "epoch": 2.659668433211034, + "grad_norm": 7.02381706237793, + "learning_rate": 6.676773338768855e-06, + "loss": 2.9094, + "step": 39145 + }, + { + "epoch": 2.660008153281696, + "grad_norm": 7.0429558753967285, + "learning_rate": 6.676348688680527e-06, + "loss": 2.8387, + "step": 39150 + }, + { + "epoch": 2.660347873352358, + "grad_norm": 6.130962371826172, + "learning_rate": 6.675924038592201e-06, + "loss": 3.3708, + "step": 39155 + }, + { + "epoch": 2.6606875934230194, + "grad_norm": 5.920094966888428, + "learning_rate": 6.675499388503873e-06, + "loss": 2.9337, + "step": 39160 + }, + { + "epoch": 2.661027313493681, + "grad_norm": 8.882762908935547, + "learning_rate": 6.675074738415545e-06, + "loss": 3.1947, + "step": 39165 + }, + { + "epoch": 2.661367033564343, + "grad_norm": 6.834293365478516, + "learning_rate": 6.674650088327219e-06, + "loss": 3.0599, + "step": 39170 + }, + { + "epoch": 2.6617067536350048, + "grad_norm": 6.51509952545166, + "learning_rate": 6.674225438238892e-06, + "loss": 2.872, + "step": 39175 + }, + { + "epoch": 2.6620464737056664, + "grad_norm": 6.577803134918213, + "learning_rate": 6.673800788150564e-06, + "loss": 2.8776, + "step": 39180 + }, + { + "epoch": 2.6623861937763285, + "grad_norm": 6.714766502380371, + "learning_rate": 6.673376138062237e-06, + "loss": 2.7677, + "step": 39185 + }, + { + "epoch": 2.66272591384699, + "grad_norm": 7.718066692352295, + "learning_rate": 6.67295148797391e-06, + "loss": 2.9416, + "step": 39190 + }, + { + "epoch": 2.6630656339176517, + "grad_norm": 7.063122749328613, + "learning_rate": 6.672526837885582e-06, + "loss": 3.1106, + "step": 39195 + }, + { + "epoch": 2.663405353988314, + "grad_norm": 7.6621904373168945, + "learning_rate": 6.672102187797256e-06, + "loss": 3.1729, + "step": 39200 + }, + { + "epoch": 2.6637450740589754, + "grad_norm": 8.047582626342773, + "learning_rate": 6.671677537708929e-06, + "loss": 2.9506, + "step": 39205 + }, + { + "epoch": 2.664084794129637, + "grad_norm": 6.303478717803955, + "learning_rate": 6.6712528876206006e-06, + "loss": 3.0027, + "step": 39210 + }, + { + "epoch": 2.6644245142002987, + "grad_norm": 7.031788349151611, + "learning_rate": 6.670828237532274e-06, + "loss": 2.9818, + "step": 39215 + }, + { + "epoch": 2.6647642342709608, + "grad_norm": 8.52446174621582, + "learning_rate": 6.670403587443946e-06, + "loss": 2.8436, + "step": 39220 + }, + { + "epoch": 2.6651039543416224, + "grad_norm": 6.5555739402771, + "learning_rate": 6.669978937355619e-06, + "loss": 3.2337, + "step": 39225 + }, + { + "epoch": 2.665443674412284, + "grad_norm": 8.380693435668945, + "learning_rate": 6.669554287267293e-06, + "loss": 3.0478, + "step": 39230 + }, + { + "epoch": 2.665783394482946, + "grad_norm": 6.643281936645508, + "learning_rate": 6.6691296371789646e-06, + "loss": 2.9208, + "step": 39235 + }, + { + "epoch": 2.6661231145536077, + "grad_norm": 6.941432952880859, + "learning_rate": 6.668704987090638e-06, + "loss": 2.8927, + "step": 39240 + }, + { + "epoch": 2.6664628346242694, + "grad_norm": 5.98531436920166, + "learning_rate": 6.668280337002311e-06, + "loss": 2.9599, + "step": 39245 + }, + { + "epoch": 2.6668025546949314, + "grad_norm": 6.989819526672363, + "learning_rate": 6.667855686913983e-06, + "loss": 3.1901, + "step": 39250 + }, + { + "epoch": 2.667142274765593, + "grad_norm": 5.637101650238037, + "learning_rate": 6.667431036825657e-06, + "loss": 2.7931, + "step": 39255 + }, + { + "epoch": 2.6674819948362547, + "grad_norm": 10.015056610107422, + "learning_rate": 6.667006386737329e-06, + "loss": 3.3015, + "step": 39260 + }, + { + "epoch": 2.6678217149069168, + "grad_norm": 7.32835578918457, + "learning_rate": 6.666581736649001e-06, + "loss": 2.9242, + "step": 39265 + }, + { + "epoch": 2.6681614349775784, + "grad_norm": 6.381802558898926, + "learning_rate": 6.666157086560675e-06, + "loss": 2.8386, + "step": 39270 + }, + { + "epoch": 2.66850115504824, + "grad_norm": 6.868673801422119, + "learning_rate": 6.665732436472348e-06, + "loss": 3.1964, + "step": 39275 + }, + { + "epoch": 2.668840875118902, + "grad_norm": 7.284146785736084, + "learning_rate": 6.66530778638402e-06, + "loss": 3.1184, + "step": 39280 + }, + { + "epoch": 2.6691805951895637, + "grad_norm": 6.1695556640625, + "learning_rate": 6.6648831362956934e-06, + "loss": 3.2155, + "step": 39285 + }, + { + "epoch": 2.6695203152602254, + "grad_norm": 6.313165187835693, + "learning_rate": 6.664458486207365e-06, + "loss": 2.9825, + "step": 39290 + }, + { + "epoch": 2.6698600353308874, + "grad_norm": 7.004883766174316, + "learning_rate": 6.664033836119038e-06, + "loss": 3.19, + "step": 39295 + }, + { + "epoch": 2.670199755401549, + "grad_norm": 7.9691162109375, + "learning_rate": 6.663609186030712e-06, + "loss": 3.1039, + "step": 39300 + }, + { + "epoch": 2.6705394754722107, + "grad_norm": 7.569075107574463, + "learning_rate": 6.663184535942384e-06, + "loss": 2.9567, + "step": 39305 + }, + { + "epoch": 2.670879195542873, + "grad_norm": 6.4705305099487305, + "learning_rate": 6.662759885854057e-06, + "loss": 3.0128, + "step": 39310 + }, + { + "epoch": 2.6712189156135344, + "grad_norm": 8.040604591369629, + "learning_rate": 6.66233523576573e-06, + "loss": 3.1855, + "step": 39315 + }, + { + "epoch": 2.671558635684196, + "grad_norm": 8.47291088104248, + "learning_rate": 6.661910585677402e-06, + "loss": 3.5163, + "step": 39320 + }, + { + "epoch": 2.671898355754858, + "grad_norm": 5.854726791381836, + "learning_rate": 6.661485935589075e-06, + "loss": 2.9434, + "step": 39325 + }, + { + "epoch": 2.6722380758255198, + "grad_norm": 5.591141223907471, + "learning_rate": 6.661061285500749e-06, + "loss": 3.0737, + "step": 39330 + }, + { + "epoch": 2.6725777958961814, + "grad_norm": 6.514902114868164, + "learning_rate": 6.660636635412421e-06, + "loss": 2.9878, + "step": 39335 + }, + { + "epoch": 2.6729175159668435, + "grad_norm": 7.77490234375, + "learning_rate": 6.660211985324093e-06, + "loss": 3.1494, + "step": 39340 + }, + { + "epoch": 2.673257236037505, + "grad_norm": 8.809000968933105, + "learning_rate": 6.659787335235767e-06, + "loss": 2.949, + "step": 39345 + }, + { + "epoch": 2.6735969561081667, + "grad_norm": 8.005853652954102, + "learning_rate": 6.659362685147439e-06, + "loss": 3.0858, + "step": 39350 + }, + { + "epoch": 2.673936676178829, + "grad_norm": 6.575723648071289, + "learning_rate": 6.658938035059112e-06, + "loss": 3.276, + "step": 39355 + }, + { + "epoch": 2.6742763962494904, + "grad_norm": 6.224764823913574, + "learning_rate": 6.6585133849707854e-06, + "loss": 2.8012, + "step": 39360 + }, + { + "epoch": 2.674616116320152, + "grad_norm": 6.542947769165039, + "learning_rate": 6.658088734882457e-06, + "loss": 2.9623, + "step": 39365 + }, + { + "epoch": 2.674955836390814, + "grad_norm": 7.2126240730285645, + "learning_rate": 6.65766408479413e-06, + "loss": 3.048, + "step": 39370 + }, + { + "epoch": 2.6752955564614758, + "grad_norm": 6.3592848777771, + "learning_rate": 6.657239434705803e-06, + "loss": 3.1688, + "step": 39375 + }, + { + "epoch": 2.6756352765321374, + "grad_norm": 6.465016841888428, + "learning_rate": 6.656814784617476e-06, + "loss": 3.0636, + "step": 39380 + }, + { + "epoch": 2.6759749966027995, + "grad_norm": 6.25490140914917, + "learning_rate": 6.656390134529149e-06, + "loss": 2.9218, + "step": 39385 + }, + { + "epoch": 2.676314716673461, + "grad_norm": 6.310543537139893, + "learning_rate": 6.655965484440821e-06, + "loss": 3.1807, + "step": 39390 + }, + { + "epoch": 2.6766544367441227, + "grad_norm": 6.994909763336182, + "learning_rate": 6.655540834352494e-06, + "loss": 3.212, + "step": 39395 + }, + { + "epoch": 2.676994156814785, + "grad_norm": 6.280840873718262, + "learning_rate": 6.655116184264166e-06, + "loss": 3.1653, + "step": 39400 + }, + { + "epoch": 2.6773338768854464, + "grad_norm": 6.64470100402832, + "learning_rate": 6.65469153417584e-06, + "loss": 2.9108, + "step": 39405 + }, + { + "epoch": 2.677673596956108, + "grad_norm": 7.423292636871338, + "learning_rate": 6.654266884087513e-06, + "loss": 2.8579, + "step": 39410 + }, + { + "epoch": 2.67801331702677, + "grad_norm": 5.67707633972168, + "learning_rate": 6.6538422339991846e-06, + "loss": 3.2135, + "step": 39415 + }, + { + "epoch": 2.6783530370974318, + "grad_norm": 8.666667938232422, + "learning_rate": 6.653417583910858e-06, + "loss": 3.155, + "step": 39420 + }, + { + "epoch": 2.6786927571680934, + "grad_norm": 8.594717979431152, + "learning_rate": 6.652992933822531e-06, + "loss": 3.1964, + "step": 39425 + }, + { + "epoch": 2.6790324772387555, + "grad_norm": 7.351821422576904, + "learning_rate": 6.652568283734203e-06, + "loss": 3.1875, + "step": 39430 + }, + { + "epoch": 2.679372197309417, + "grad_norm": 7.245453834533691, + "learning_rate": 6.652143633645877e-06, + "loss": 2.9786, + "step": 39435 + }, + { + "epoch": 2.6797119173800787, + "grad_norm": 5.555490970611572, + "learning_rate": 6.651718983557549e-06, + "loss": 3.0195, + "step": 39440 + }, + { + "epoch": 2.680051637450741, + "grad_norm": 6.553640842437744, + "learning_rate": 6.651294333469221e-06, + "loss": 3.1391, + "step": 39445 + }, + { + "epoch": 2.6803913575214025, + "grad_norm": 6.58973503112793, + "learning_rate": 6.650869683380895e-06, + "loss": 3.1623, + "step": 39450 + }, + { + "epoch": 2.680731077592064, + "grad_norm": 7.5813822746276855, + "learning_rate": 6.650445033292568e-06, + "loss": 3.1326, + "step": 39455 + }, + { + "epoch": 2.681070797662726, + "grad_norm": 6.366547107696533, + "learning_rate": 6.65002038320424e-06, + "loss": 3.0522, + "step": 39460 + }, + { + "epoch": 2.681410517733388, + "grad_norm": 7.191102981567383, + "learning_rate": 6.649595733115913e-06, + "loss": 2.9815, + "step": 39465 + }, + { + "epoch": 2.6817502378040494, + "grad_norm": 5.451993942260742, + "learning_rate": 6.649171083027585e-06, + "loss": 3.0698, + "step": 39470 + }, + { + "epoch": 2.6820899578747115, + "grad_norm": 5.907939434051514, + "learning_rate": 6.648746432939258e-06, + "loss": 2.9088, + "step": 39475 + }, + { + "epoch": 2.682429677945373, + "grad_norm": 6.226558208465576, + "learning_rate": 6.648321782850932e-06, + "loss": 2.915, + "step": 39480 + }, + { + "epoch": 2.6827693980160348, + "grad_norm": 8.30263900756836, + "learning_rate": 6.647897132762604e-06, + "loss": 3.2454, + "step": 39485 + }, + { + "epoch": 2.683109118086697, + "grad_norm": 7.303353309631348, + "learning_rate": 6.6474724826742766e-06, + "loss": 3.1873, + "step": 39490 + }, + { + "epoch": 2.6834488381573585, + "grad_norm": 7.458218574523926, + "learning_rate": 6.64704783258595e-06, + "loss": 2.9608, + "step": 39495 + }, + { + "epoch": 2.68378855822802, + "grad_norm": 6.670719146728516, + "learning_rate": 6.646623182497622e-06, + "loss": 3.0163, + "step": 39500 + }, + { + "epoch": 2.6841282782986817, + "grad_norm": 7.363874912261963, + "learning_rate": 6.646198532409295e-06, + "loss": 2.8989, + "step": 39505 + }, + { + "epoch": 2.684467998369344, + "grad_norm": 6.140823841094971, + "learning_rate": 6.645773882320969e-06, + "loss": 3.0466, + "step": 39510 + }, + { + "epoch": 2.6848077184400054, + "grad_norm": 8.164806365966797, + "learning_rate": 6.6453492322326406e-06, + "loss": 2.9696, + "step": 39515 + }, + { + "epoch": 2.685147438510667, + "grad_norm": 7.331602573394775, + "learning_rate": 6.644924582144313e-06, + "loss": 2.7539, + "step": 39520 + }, + { + "epoch": 2.685487158581329, + "grad_norm": 7.91966438293457, + "learning_rate": 6.644499932055987e-06, + "loss": 2.8532, + "step": 39525 + }, + { + "epoch": 2.6858268786519908, + "grad_norm": 7.307243347167969, + "learning_rate": 6.644075281967659e-06, + "loss": 3.0568, + "step": 39530 + }, + { + "epoch": 2.6861665987226524, + "grad_norm": 7.661015510559082, + "learning_rate": 6.643650631879332e-06, + "loss": 3.3031, + "step": 39535 + }, + { + "epoch": 2.6865063187933145, + "grad_norm": 7.492199420928955, + "learning_rate": 6.6432259817910046e-06, + "loss": 3.084, + "step": 39540 + }, + { + "epoch": 2.686846038863976, + "grad_norm": 9.85456371307373, + "learning_rate": 6.642801331702677e-06, + "loss": 2.7424, + "step": 39545 + }, + { + "epoch": 2.6871857589346377, + "grad_norm": 8.667065620422363, + "learning_rate": 6.64237668161435e-06, + "loss": 3.0857, + "step": 39550 + }, + { + "epoch": 2.6875254790052994, + "grad_norm": 7.471447944641113, + "learning_rate": 6.641952031526023e-06, + "loss": 3.0506, + "step": 39555 + }, + { + "epoch": 2.6878651990759614, + "grad_norm": 7.1204118728637695, + "learning_rate": 6.641527381437696e-06, + "loss": 3.1481, + "step": 39560 + }, + { + "epoch": 2.688204919146623, + "grad_norm": 6.261053085327148, + "learning_rate": 6.641102731349368e-06, + "loss": 3.1598, + "step": 39565 + }, + { + "epoch": 2.6885446392172847, + "grad_norm": 9.850921630859375, + "learning_rate": 6.640678081261041e-06, + "loss": 3.0809, + "step": 39570 + }, + { + "epoch": 2.688884359287947, + "grad_norm": 6.652348518371582, + "learning_rate": 6.640253431172714e-06, + "loss": 2.9542, + "step": 39575 + }, + { + "epoch": 2.6892240793586084, + "grad_norm": 7.547204494476318, + "learning_rate": 6.639828781084388e-06, + "loss": 2.9856, + "step": 39580 + }, + { + "epoch": 2.68956379942927, + "grad_norm": 7.67018985748291, + "learning_rate": 6.63940413099606e-06, + "loss": 3.2982, + "step": 39585 + }, + { + "epoch": 2.689903519499932, + "grad_norm": 7.796199321746826, + "learning_rate": 6.638979480907733e-06, + "loss": 3.0241, + "step": 39590 + }, + { + "epoch": 2.6902432395705937, + "grad_norm": 6.608558654785156, + "learning_rate": 6.638554830819406e-06, + "loss": 3.1158, + "step": 39595 + }, + { + "epoch": 2.6905829596412554, + "grad_norm": 9.151734352111816, + "learning_rate": 6.638130180731078e-06, + "loss": 2.9361, + "step": 39600 + }, + { + "epoch": 2.6909226797119175, + "grad_norm": 6.479490756988525, + "learning_rate": 6.637705530642751e-06, + "loss": 2.9355, + "step": 39605 + }, + { + "epoch": 2.691262399782579, + "grad_norm": 7.8657732009887695, + "learning_rate": 6.637280880554425e-06, + "loss": 3.076, + "step": 39610 + }, + { + "epoch": 2.6916021198532407, + "grad_norm": 6.160977363586426, + "learning_rate": 6.636856230466097e-06, + "loss": 3.2375, + "step": 39615 + }, + { + "epoch": 2.691941839923903, + "grad_norm": 6.798373222351074, + "learning_rate": 6.636431580377769e-06, + "loss": 3.0641, + "step": 39620 + }, + { + "epoch": 2.6922815599945644, + "grad_norm": 8.289960861206055, + "learning_rate": 6.636006930289442e-06, + "loss": 3.1454, + "step": 39625 + }, + { + "epoch": 2.692621280065226, + "grad_norm": 6.720081806182861, + "learning_rate": 6.635582280201115e-06, + "loss": 2.9337, + "step": 39630 + }, + { + "epoch": 2.692961000135888, + "grad_norm": 7.143761157989502, + "learning_rate": 6.635242560130453e-06, + "loss": 2.9856, + "step": 39635 + }, + { + "epoch": 2.6933007202065498, + "grad_norm": 9.250435829162598, + "learning_rate": 6.634817910042126e-06, + "loss": 3.0317, + "step": 39640 + }, + { + "epoch": 2.6936404402772114, + "grad_norm": 6.826703071594238, + "learning_rate": 6.634393259953799e-06, + "loss": 2.7918, + "step": 39645 + }, + { + "epoch": 2.6939801603478735, + "grad_norm": 6.616824150085449, + "learning_rate": 6.633968609865471e-06, + "loss": 3.0875, + "step": 39650 + }, + { + "epoch": 2.694319880418535, + "grad_norm": 6.879612922668457, + "learning_rate": 6.633543959777144e-06, + "loss": 3.0186, + "step": 39655 + }, + { + "epoch": 2.6946596004891967, + "grad_norm": 9.165230751037598, + "learning_rate": 6.633119309688817e-06, + "loss": 2.9693, + "step": 39660 + }, + { + "epoch": 2.694999320559859, + "grad_norm": 5.901497840881348, + "learning_rate": 6.632694659600489e-06, + "loss": 3.1707, + "step": 39665 + }, + { + "epoch": 2.6953390406305204, + "grad_norm": 5.910821437835693, + "learning_rate": 6.632270009512163e-06, + "loss": 2.9103, + "step": 39670 + }, + { + "epoch": 2.695678760701182, + "grad_norm": 8.94427490234375, + "learning_rate": 6.6318453594238355e-06, + "loss": 2.9008, + "step": 39675 + }, + { + "epoch": 2.696018480771844, + "grad_norm": 6.359302520751953, + "learning_rate": 6.631420709335507e-06, + "loss": 2.9097, + "step": 39680 + }, + { + "epoch": 2.6963582008425058, + "grad_norm": 8.113799095153809, + "learning_rate": 6.630996059247181e-06, + "loss": 3.0397, + "step": 39685 + }, + { + "epoch": 2.6966979209131674, + "grad_norm": 6.6832194328308105, + "learning_rate": 6.630571409158854e-06, + "loss": 3.0551, + "step": 39690 + }, + { + "epoch": 2.6970376409838295, + "grad_norm": 5.786617279052734, + "learning_rate": 6.630146759070526e-06, + "loss": 3.0751, + "step": 39695 + }, + { + "epoch": 2.697377361054491, + "grad_norm": 5.3178253173828125, + "learning_rate": 6.6297221089821995e-06, + "loss": 3.086, + "step": 39700 + }, + { + "epoch": 2.6977170811251527, + "grad_norm": 6.399562835693359, + "learning_rate": 6.629297458893872e-06, + "loss": 2.9998, + "step": 39705 + }, + { + "epoch": 2.698056801195815, + "grad_norm": 6.719265937805176, + "learning_rate": 6.628872808805544e-06, + "loss": 3.1209, + "step": 39710 + }, + { + "epoch": 2.6983965212664764, + "grad_norm": 8.267183303833008, + "learning_rate": 6.628448158717218e-06, + "loss": 3.0587, + "step": 39715 + }, + { + "epoch": 2.698736241337138, + "grad_norm": 8.441838264465332, + "learning_rate": 6.62802350862889e-06, + "loss": 3.282, + "step": 39720 + }, + { + "epoch": 2.6990759614078, + "grad_norm": 6.121035099029541, + "learning_rate": 6.627598858540563e-06, + "loss": 2.9863, + "step": 39725 + }, + { + "epoch": 2.699415681478462, + "grad_norm": 5.48193883895874, + "learning_rate": 6.627174208452236e-06, + "loss": 2.8712, + "step": 39730 + }, + { + "epoch": 2.6997554015491234, + "grad_norm": 9.564308166503906, + "learning_rate": 6.626749558363908e-06, + "loss": 3.0993, + "step": 39735 + }, + { + "epoch": 2.7000951216197855, + "grad_norm": 6.308286190032959, + "learning_rate": 6.626324908275581e-06, + "loss": 2.8984, + "step": 39740 + }, + { + "epoch": 2.700434841690447, + "grad_norm": 6.6811676025390625, + "learning_rate": 6.625900258187255e-06, + "loss": 3.0977, + "step": 39745 + }, + { + "epoch": 2.7007745617611087, + "grad_norm": 7.938480854034424, + "learning_rate": 6.625475608098927e-06, + "loss": 3.0946, + "step": 39750 + }, + { + "epoch": 2.701114281831771, + "grad_norm": 5.886785984039307, + "learning_rate": 6.6250509580105994e-06, + "loss": 3.0278, + "step": 39755 + }, + { + "epoch": 2.7014540019024325, + "grad_norm": 7.591104984283447, + "learning_rate": 6.624626307922273e-06, + "loss": 3.136, + "step": 39760 + }, + { + "epoch": 2.701793721973094, + "grad_norm": 6.479985237121582, + "learning_rate": 6.624201657833945e-06, + "loss": 3.0597, + "step": 39765 + }, + { + "epoch": 2.702133442043756, + "grad_norm": 7.39333963394165, + "learning_rate": 6.623777007745618e-06, + "loss": 3.0497, + "step": 39770 + }, + { + "epoch": 2.702473162114418, + "grad_norm": 7.0546183586120605, + "learning_rate": 6.6233523576572915e-06, + "loss": 2.9291, + "step": 39775 + }, + { + "epoch": 2.7028128821850794, + "grad_norm": 7.796270370483398, + "learning_rate": 6.6229277075689634e-06, + "loss": 2.819, + "step": 39780 + }, + { + "epoch": 2.7031526022557415, + "grad_norm": 8.718722343444824, + "learning_rate": 6.622503057480637e-06, + "loss": 2.9878, + "step": 39785 + }, + { + "epoch": 2.703492322326403, + "grad_norm": 7.108526229858398, + "learning_rate": 6.622078407392309e-06, + "loss": 3.1771, + "step": 39790 + }, + { + "epoch": 2.7038320423970648, + "grad_norm": 8.559737205505371, + "learning_rate": 6.621653757303982e-06, + "loss": 3.0178, + "step": 39795 + }, + { + "epoch": 2.704171762467727, + "grad_norm": 9.122142791748047, + "learning_rate": 6.6212291072156555e-06, + "loss": 2.9645, + "step": 39800 + }, + { + "epoch": 2.7045114825383885, + "grad_norm": 7.530338287353516, + "learning_rate": 6.6208044571273274e-06, + "loss": 2.9434, + "step": 39805 + }, + { + "epoch": 2.70485120260905, + "grad_norm": 6.9948248863220215, + "learning_rate": 6.620379807039e-06, + "loss": 3.0733, + "step": 39810 + }, + { + "epoch": 2.705190922679712, + "grad_norm": 7.398950576782227, + "learning_rate": 6.619955156950674e-06, + "loss": 2.9027, + "step": 39815 + }, + { + "epoch": 2.705530642750374, + "grad_norm": 7.078474998474121, + "learning_rate": 6.619530506862346e-06, + "loss": 3.0518, + "step": 39820 + }, + { + "epoch": 2.7058703628210354, + "grad_norm": 6.927072048187256, + "learning_rate": 6.619105856774019e-06, + "loss": 2.6929, + "step": 39825 + }, + { + "epoch": 2.7062100828916975, + "grad_norm": 7.7168426513671875, + "learning_rate": 6.618681206685692e-06, + "loss": 2.9674, + "step": 39830 + }, + { + "epoch": 2.706549802962359, + "grad_norm": 7.26326847076416, + "learning_rate": 6.618256556597364e-06, + "loss": 3.1564, + "step": 39835 + }, + { + "epoch": 2.7068895230330208, + "grad_norm": 7.179008483886719, + "learning_rate": 6.617831906509037e-06, + "loss": 2.8761, + "step": 39840 + }, + { + "epoch": 2.7072292431036824, + "grad_norm": 5.354895114898682, + "learning_rate": 6.617407256420711e-06, + "loss": 2.8916, + "step": 39845 + }, + { + "epoch": 2.7075689631743445, + "grad_norm": 6.962270736694336, + "learning_rate": 6.616982606332383e-06, + "loss": 3.0922, + "step": 39850 + }, + { + "epoch": 2.707908683245006, + "grad_norm": 6.880329132080078, + "learning_rate": 6.6165579562440554e-06, + "loss": 3.0003, + "step": 39855 + }, + { + "epoch": 2.7082484033156677, + "grad_norm": 7.424592971801758, + "learning_rate": 6.616133306155729e-06, + "loss": 2.9614, + "step": 39860 + }, + { + "epoch": 2.70858812338633, + "grad_norm": 7.018537998199463, + "learning_rate": 6.615708656067401e-06, + "loss": 3.1372, + "step": 39865 + }, + { + "epoch": 2.7089278434569914, + "grad_norm": 9.115880966186523, + "learning_rate": 6.615284005979074e-06, + "loss": 3.147, + "step": 39870 + }, + { + "epoch": 2.709267563527653, + "grad_norm": 6.011544227600098, + "learning_rate": 6.614859355890747e-06, + "loss": 2.9395, + "step": 39875 + }, + { + "epoch": 2.709607283598315, + "grad_norm": 9.536578178405762, + "learning_rate": 6.6144347058024194e-06, + "loss": 3.0111, + "step": 39880 + }, + { + "epoch": 2.709947003668977, + "grad_norm": 5.994778156280518, + "learning_rate": 6.614010055714092e-06, + "loss": 3.1097, + "step": 39885 + }, + { + "epoch": 2.7102867237396384, + "grad_norm": 6.155023097991943, + "learning_rate": 6.613585405625765e-06, + "loss": 3.0965, + "step": 39890 + }, + { + "epoch": 2.7106264438103, + "grad_norm": 6.556095123291016, + "learning_rate": 6.613160755537438e-06, + "loss": 3.1583, + "step": 39895 + }, + { + "epoch": 2.710966163880962, + "grad_norm": 6.119725704193115, + "learning_rate": 6.61273610544911e-06, + "loss": 2.858, + "step": 39900 + }, + { + "epoch": 2.7113058839516238, + "grad_norm": 7.499425411224365, + "learning_rate": 6.6123114553607835e-06, + "loss": 3.0274, + "step": 39905 + }, + { + "epoch": 2.7116456040222854, + "grad_norm": 6.981029033660889, + "learning_rate": 6.611886805272456e-06, + "loss": 2.9988, + "step": 39910 + }, + { + "epoch": 2.7119853240929475, + "grad_norm": 7.263107776641846, + "learning_rate": 6.611462155184128e-06, + "loss": 3.0283, + "step": 39915 + }, + { + "epoch": 2.712325044163609, + "grad_norm": 5.899283409118652, + "learning_rate": 6.611037505095802e-06, + "loss": 3.1968, + "step": 39920 + }, + { + "epoch": 2.7126647642342707, + "grad_norm": 5.755337238311768, + "learning_rate": 6.610612855007475e-06, + "loss": 3.124, + "step": 39925 + }, + { + "epoch": 2.713004484304933, + "grad_norm": 8.286221504211426, + "learning_rate": 6.610188204919147e-06, + "loss": 3.1746, + "step": 39930 + }, + { + "epoch": 2.7133442043755944, + "grad_norm": 7.977997779846191, + "learning_rate": 6.60976355483082e-06, + "loss": 3.0967, + "step": 39935 + }, + { + "epoch": 2.713683924446256, + "grad_norm": 7.079314708709717, + "learning_rate": 6.609338904742493e-06, + "loss": 3.1035, + "step": 39940 + }, + { + "epoch": 2.714023644516918, + "grad_norm": 6.953109264373779, + "learning_rate": 6.608914254654165e-06, + "loss": 2.8216, + "step": 39945 + }, + { + "epoch": 2.7143633645875798, + "grad_norm": 8.076980590820312, + "learning_rate": 6.608489604565839e-06, + "loss": 2.8539, + "step": 39950 + }, + { + "epoch": 2.7147030846582414, + "grad_norm": 6.793123245239258, + "learning_rate": 6.6080649544775115e-06, + "loss": 3.0328, + "step": 39955 + }, + { + "epoch": 2.7150428047289035, + "grad_norm": 6.515094757080078, + "learning_rate": 6.607640304389183e-06, + "loss": 3.194, + "step": 39960 + }, + { + "epoch": 2.715382524799565, + "grad_norm": 6.22542667388916, + "learning_rate": 6.607215654300857e-06, + "loss": 3.0808, + "step": 39965 + }, + { + "epoch": 2.7157222448702267, + "grad_norm": 6.135310173034668, + "learning_rate": 6.606791004212529e-06, + "loss": 2.7834, + "step": 39970 + }, + { + "epoch": 2.716061964940889, + "grad_norm": 6.7592010498046875, + "learning_rate": 6.606366354124202e-06, + "loss": 3.1656, + "step": 39975 + }, + { + "epoch": 2.7164016850115504, + "grad_norm": 6.423046588897705, + "learning_rate": 6.6059417040358755e-06, + "loss": 2.946, + "step": 39980 + }, + { + "epoch": 2.716741405082212, + "grad_norm": 6.839411735534668, + "learning_rate": 6.605517053947547e-06, + "loss": 2.9143, + "step": 39985 + }, + { + "epoch": 2.717081125152874, + "grad_norm": 6.601999282836914, + "learning_rate": 6.60509240385922e-06, + "loss": 3.0693, + "step": 39990 + }, + { + "epoch": 2.7174208452235358, + "grad_norm": 6.65459680557251, + "learning_rate": 6.604667753770894e-06, + "loss": 3.0064, + "step": 39995 + }, + { + "epoch": 2.7177605652941974, + "grad_norm": 6.044437408447266, + "learning_rate": 6.604243103682566e-06, + "loss": 3.1443, + "step": 40000 + }, + { + "epoch": 2.7181002853648595, + "grad_norm": 8.371136665344238, + "learning_rate": 6.603818453594239e-06, + "loss": 3.0575, + "step": 40005 + }, + { + "epoch": 2.718440005435521, + "grad_norm": 7.029742240905762, + "learning_rate": 6.603393803505912e-06, + "loss": 2.8303, + "step": 40010 + }, + { + "epoch": 2.7187797255061827, + "grad_norm": 5.800806045532227, + "learning_rate": 6.602969153417584e-06, + "loss": 2.9463, + "step": 40015 + }, + { + "epoch": 2.719119445576845, + "grad_norm": 8.105385780334473, + "learning_rate": 6.602544503329257e-06, + "loss": 3.0922, + "step": 40020 + }, + { + "epoch": 2.7194591656475064, + "grad_norm": 5.748523235321045, + "learning_rate": 6.602119853240931e-06, + "loss": 3.2045, + "step": 40025 + }, + { + "epoch": 2.719798885718168, + "grad_norm": 8.574041366577148, + "learning_rate": 6.601695203152603e-06, + "loss": 3.0483, + "step": 40030 + }, + { + "epoch": 2.72013860578883, + "grad_norm": 6.614802360534668, + "learning_rate": 6.6012705530642754e-06, + "loss": 2.8388, + "step": 40035 + }, + { + "epoch": 2.720478325859492, + "grad_norm": 8.17268180847168, + "learning_rate": 6.600845902975948e-06, + "loss": 3.1207, + "step": 40040 + }, + { + "epoch": 2.7208180459301534, + "grad_norm": 6.8446269035339355, + "learning_rate": 6.600421252887621e-06, + "loss": 2.9812, + "step": 40045 + }, + { + "epoch": 2.7211577660008155, + "grad_norm": 6.3121867179870605, + "learning_rate": 6.599996602799294e-06, + "loss": 2.9747, + "step": 40050 + }, + { + "epoch": 2.721497486071477, + "grad_norm": 8.11983585357666, + "learning_rate": 6.599571952710967e-06, + "loss": 3.0725, + "step": 40055 + }, + { + "epoch": 2.7218372061421388, + "grad_norm": 9.084782600402832, + "learning_rate": 6.5991473026226394e-06, + "loss": 2.8773, + "step": 40060 + }, + { + "epoch": 2.722176926212801, + "grad_norm": 7.276980400085449, + "learning_rate": 6.598722652534311e-06, + "loss": 3.0125, + "step": 40065 + }, + { + "epoch": 2.7225166462834625, + "grad_norm": 6.959247589111328, + "learning_rate": 6.598298002445985e-06, + "loss": 3.1564, + "step": 40070 + }, + { + "epoch": 2.722856366354124, + "grad_norm": 8.097156524658203, + "learning_rate": 6.597873352357658e-06, + "loss": 3.0908, + "step": 40075 + }, + { + "epoch": 2.723196086424786, + "grad_norm": 9.81141471862793, + "learning_rate": 6.59744870226933e-06, + "loss": 2.9936, + "step": 40080 + }, + { + "epoch": 2.723535806495448, + "grad_norm": 7.004639625549316, + "learning_rate": 6.5970240521810034e-06, + "loss": 3.2549, + "step": 40085 + }, + { + "epoch": 2.7238755265661094, + "grad_norm": 9.871088027954102, + "learning_rate": 6.596599402092676e-06, + "loss": 3.1226, + "step": 40090 + }, + { + "epoch": 2.7242152466367715, + "grad_norm": 5.826299667358398, + "learning_rate": 6.596174752004348e-06, + "loss": 3.0702, + "step": 40095 + }, + { + "epoch": 2.724554966707433, + "grad_norm": 8.221305847167969, + "learning_rate": 6.595750101916022e-06, + "loss": 3.0682, + "step": 40100 + }, + { + "epoch": 2.7248946867780948, + "grad_norm": 7.048481464385986, + "learning_rate": 6.595325451827695e-06, + "loss": 3.0569, + "step": 40105 + }, + { + "epoch": 2.725234406848757, + "grad_norm": 9.40771198272705, + "learning_rate": 6.594900801739367e-06, + "loss": 3.1108, + "step": 40110 + }, + { + "epoch": 2.7255741269194185, + "grad_norm": 7.527024269104004, + "learning_rate": 6.59447615165104e-06, + "loss": 3.0853, + "step": 40115 + }, + { + "epoch": 2.72591384699008, + "grad_norm": 4.9330735206604, + "learning_rate": 6.594051501562713e-06, + "loss": 2.8915, + "step": 40120 + }, + { + "epoch": 2.726253567060742, + "grad_norm": 7.318823337554932, + "learning_rate": 6.593626851474386e-06, + "loss": 3.0086, + "step": 40125 + }, + { + "epoch": 2.726593287131404, + "grad_norm": 6.621556282043457, + "learning_rate": 6.593202201386059e-06, + "loss": 3.1679, + "step": 40130 + }, + { + "epoch": 2.7269330072020654, + "grad_norm": 7.617416858673096, + "learning_rate": 6.592777551297731e-06, + "loss": 2.8572, + "step": 40135 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 7.65066385269165, + "learning_rate": 6.592352901209404e-06, + "loss": 2.9216, + "step": 40140 + }, + { + "epoch": 2.727612447343389, + "grad_norm": 7.44815731048584, + "learning_rate": 6.591928251121077e-06, + "loss": 2.9783, + "step": 40145 + }, + { + "epoch": 2.7279521674140508, + "grad_norm": 7.396963596343994, + "learning_rate": 6.591503601032749e-06, + "loss": 3.0169, + "step": 40150 + }, + { + "epoch": 2.728291887484713, + "grad_norm": 5.419087886810303, + "learning_rate": 6.591078950944423e-06, + "loss": 3.0671, + "step": 40155 + }, + { + "epoch": 2.7286316075553745, + "grad_norm": 6.409541130065918, + "learning_rate": 6.5906543008560954e-06, + "loss": 2.9801, + "step": 40160 + }, + { + "epoch": 2.728971327626036, + "grad_norm": 8.161104202270508, + "learning_rate": 6.590229650767767e-06, + "loss": 3.069, + "step": 40165 + }, + { + "epoch": 2.729311047696698, + "grad_norm": 9.200109481811523, + "learning_rate": 6.589805000679441e-06, + "loss": 3.0667, + "step": 40170 + }, + { + "epoch": 2.72965076776736, + "grad_norm": 5.798320293426514, + "learning_rate": 6.589380350591114e-06, + "loss": 3.127, + "step": 40175 + }, + { + "epoch": 2.7299904878380215, + "grad_norm": 8.603782653808594, + "learning_rate": 6.588955700502786e-06, + "loss": 3.2565, + "step": 40180 + }, + { + "epoch": 2.730330207908683, + "grad_norm": 5.710972785949707, + "learning_rate": 6.5885310504144594e-06, + "loss": 3.0254, + "step": 40185 + }, + { + "epoch": 2.730669927979345, + "grad_norm": 6.126916885375977, + "learning_rate": 6.588106400326132e-06, + "loss": 3.1517, + "step": 40190 + }, + { + "epoch": 2.731009648050007, + "grad_norm": 6.286531925201416, + "learning_rate": 6.587681750237804e-06, + "loss": 3.1522, + "step": 40195 + }, + { + "epoch": 2.7313493681206684, + "grad_norm": 6.839592456817627, + "learning_rate": 6.587257100149478e-06, + "loss": 3.2917, + "step": 40200 + }, + { + "epoch": 2.7316890881913305, + "grad_norm": 7.648024559020996, + "learning_rate": 6.586832450061151e-06, + "loss": 2.9497, + "step": 40205 + }, + { + "epoch": 2.732028808261992, + "grad_norm": 6.670932292938232, + "learning_rate": 6.586407799972823e-06, + "loss": 3.1561, + "step": 40210 + }, + { + "epoch": 2.7323685283326538, + "grad_norm": 7.727689266204834, + "learning_rate": 6.585983149884496e-06, + "loss": 3.1653, + "step": 40215 + }, + { + "epoch": 2.732708248403316, + "grad_norm": 6.294128894805908, + "learning_rate": 6.585558499796168e-06, + "loss": 3.0107, + "step": 40220 + }, + { + "epoch": 2.7330479684739775, + "grad_norm": 7.4868316650390625, + "learning_rate": 6.585133849707841e-06, + "loss": 3.0193, + "step": 40225 + }, + { + "epoch": 2.733387688544639, + "grad_norm": 7.286006450653076, + "learning_rate": 6.584709199619515e-06, + "loss": 2.9415, + "step": 40230 + }, + { + "epoch": 2.7337274086153007, + "grad_norm": 8.16640853881836, + "learning_rate": 6.584284549531187e-06, + "loss": 3.0565, + "step": 40235 + }, + { + "epoch": 2.734067128685963, + "grad_norm": 6.189652442932129, + "learning_rate": 6.583859899442859e-06, + "loss": 3.1211, + "step": 40240 + }, + { + "epoch": 2.7344068487566244, + "grad_norm": 7.274412631988525, + "learning_rate": 6.583435249354533e-06, + "loss": 2.9807, + "step": 40245 + }, + { + "epoch": 2.734746568827286, + "grad_norm": 7.835208892822266, + "learning_rate": 6.583010599266205e-06, + "loss": 3.111, + "step": 40250 + }, + { + "epoch": 2.735086288897948, + "grad_norm": 7.650782108306885, + "learning_rate": 6.582585949177878e-06, + "loss": 3.1872, + "step": 40255 + }, + { + "epoch": 2.7354260089686098, + "grad_norm": 5.255536079406738, + "learning_rate": 6.5821612990895515e-06, + "loss": 2.9712, + "step": 40260 + }, + { + "epoch": 2.7357657290392714, + "grad_norm": 7.476844787597656, + "learning_rate": 6.581736649001223e-06, + "loss": 3.0265, + "step": 40265 + }, + { + "epoch": 2.7361054491099335, + "grad_norm": 6.113440990447998, + "learning_rate": 6.581311998912896e-06, + "loss": 3.0983, + "step": 40270 + }, + { + "epoch": 2.736445169180595, + "grad_norm": 7.270235061645508, + "learning_rate": 6.58088734882457e-06, + "loss": 2.9162, + "step": 40275 + }, + { + "epoch": 2.7367848892512567, + "grad_norm": 6.720767974853516, + "learning_rate": 6.580462698736242e-06, + "loss": 3.0911, + "step": 40280 + }, + { + "epoch": 2.737124609321919, + "grad_norm": 7.20589017868042, + "learning_rate": 6.580038048647915e-06, + "loss": 3.021, + "step": 40285 + }, + { + "epoch": 2.7374643293925804, + "grad_norm": 6.5863938331604, + "learning_rate": 6.579613398559587e-06, + "loss": 3.122, + "step": 40290 + }, + { + "epoch": 2.737804049463242, + "grad_norm": 6.222710609436035, + "learning_rate": 6.57918874847126e-06, + "loss": 2.9541, + "step": 40295 + }, + { + "epoch": 2.738143769533904, + "grad_norm": 7.4433817863464355, + "learning_rate": 6.578764098382933e-06, + "loss": 2.9502, + "step": 40300 + }, + { + "epoch": 2.738483489604566, + "grad_norm": 8.282551765441895, + "learning_rate": 6.578339448294606e-06, + "loss": 2.9965, + "step": 40305 + }, + { + "epoch": 2.7388232096752274, + "grad_norm": 6.510672569274902, + "learning_rate": 6.577914798206279e-06, + "loss": 3.2079, + "step": 40310 + }, + { + "epoch": 2.7391629297458895, + "grad_norm": 7.118696212768555, + "learning_rate": 6.5774901481179506e-06, + "loss": 3.1905, + "step": 40315 + }, + { + "epoch": 2.739502649816551, + "grad_norm": 6.611802577972412, + "learning_rate": 6.577065498029624e-06, + "loss": 2.735, + "step": 40320 + }, + { + "epoch": 2.7398423698872127, + "grad_norm": 6.3645477294921875, + "learning_rate": 6.576640847941297e-06, + "loss": 3.0704, + "step": 40325 + }, + { + "epoch": 2.740182089957875, + "grad_norm": 7.213042259216309, + "learning_rate": 6.576216197852969e-06, + "loss": 3.1583, + "step": 40330 + }, + { + "epoch": 2.7405218100285365, + "grad_norm": 5.928746223449707, + "learning_rate": 6.575791547764643e-06, + "loss": 3.0283, + "step": 40335 + }, + { + "epoch": 2.740861530099198, + "grad_norm": 7.254805088043213, + "learning_rate": 6.5753668976763154e-06, + "loss": 3.021, + "step": 40340 + }, + { + "epoch": 2.74120125016986, + "grad_norm": 8.633955955505371, + "learning_rate": 6.574942247587987e-06, + "loss": 3.1354, + "step": 40345 + }, + { + "epoch": 2.741540970240522, + "grad_norm": 8.22828483581543, + "learning_rate": 6.574517597499661e-06, + "loss": 2.6198, + "step": 40350 + }, + { + "epoch": 2.7418806903111834, + "grad_norm": 6.296673774719238, + "learning_rate": 6.574092947411334e-06, + "loss": 2.963, + "step": 40355 + }, + { + "epoch": 2.7422204103818455, + "grad_norm": 6.845297813415527, + "learning_rate": 6.573668297323006e-06, + "loss": 3.137, + "step": 40360 + }, + { + "epoch": 2.742560130452507, + "grad_norm": 8.011483192443848, + "learning_rate": 6.5732436472346794e-06, + "loss": 3.1803, + "step": 40365 + }, + { + "epoch": 2.7428998505231688, + "grad_norm": 5.956367015838623, + "learning_rate": 6.572818997146352e-06, + "loss": 3.2158, + "step": 40370 + }, + { + "epoch": 2.743239570593831, + "grad_norm": 8.283730506896973, + "learning_rate": 6.572394347058024e-06, + "loss": 3.0526, + "step": 40375 + }, + { + "epoch": 2.7435792906644925, + "grad_norm": 7.3685173988342285, + "learning_rate": 6.571969696969698e-06, + "loss": 3.1938, + "step": 40380 + }, + { + "epoch": 2.743919010735154, + "grad_norm": 7.775487899780273, + "learning_rate": 6.57154504688137e-06, + "loss": 3.0728, + "step": 40385 + }, + { + "epoch": 2.744258730805816, + "grad_norm": 6.821288108825684, + "learning_rate": 6.571120396793043e-06, + "loss": 3.2368, + "step": 40390 + }, + { + "epoch": 2.744598450876478, + "grad_norm": 7.458282947540283, + "learning_rate": 6.570695746704716e-06, + "loss": 2.9607, + "step": 40395 + }, + { + "epoch": 2.7449381709471394, + "grad_norm": 7.273994445800781, + "learning_rate": 6.570271096616388e-06, + "loss": 2.8128, + "step": 40400 + }, + { + "epoch": 2.7452778910178015, + "grad_norm": 6.564302444458008, + "learning_rate": 6.569846446528061e-06, + "loss": 3.0898, + "step": 40405 + }, + { + "epoch": 2.745617611088463, + "grad_norm": 6.05994987487793, + "learning_rate": 6.569421796439735e-06, + "loss": 2.993, + "step": 40410 + }, + { + "epoch": 2.7459573311591248, + "grad_norm": 6.952377796173096, + "learning_rate": 6.568997146351407e-06, + "loss": 3.0104, + "step": 40415 + }, + { + "epoch": 2.746297051229787, + "grad_norm": 6.1060895919799805, + "learning_rate": 6.568572496263079e-06, + "loss": 3.1329, + "step": 40420 + }, + { + "epoch": 2.7466367713004485, + "grad_norm": 5.7107930183410645, + "learning_rate": 6.568147846174753e-06, + "loss": 3.1562, + "step": 40425 + }, + { + "epoch": 2.74697649137111, + "grad_norm": 6.191544055938721, + "learning_rate": 6.567723196086425e-06, + "loss": 2.87, + "step": 40430 + }, + { + "epoch": 2.747316211441772, + "grad_norm": 5.84751558303833, + "learning_rate": 6.567298545998098e-06, + "loss": 2.9832, + "step": 40435 + }, + { + "epoch": 2.747655931512434, + "grad_norm": 6.998279094696045, + "learning_rate": 6.5668738959097714e-06, + "loss": 3.156, + "step": 40440 + }, + { + "epoch": 2.7479956515830954, + "grad_norm": 6.5921783447265625, + "learning_rate": 6.566449245821443e-06, + "loss": 3.1283, + "step": 40445 + }, + { + "epoch": 2.7483353716537575, + "grad_norm": 9.053677558898926, + "learning_rate": 6.566024595733116e-06, + "loss": 3.1379, + "step": 40450 + }, + { + "epoch": 2.748675091724419, + "grad_norm": 5.873165607452393, + "learning_rate": 6.565599945644789e-06, + "loss": 3.0213, + "step": 40455 + }, + { + "epoch": 2.749014811795081, + "grad_norm": 8.791287422180176, + "learning_rate": 6.565175295556462e-06, + "loss": 3.1836, + "step": 40460 + }, + { + "epoch": 2.749354531865743, + "grad_norm": 8.237722396850586, + "learning_rate": 6.5647506454681354e-06, + "loss": 2.9603, + "step": 40465 + }, + { + "epoch": 2.7496942519364045, + "grad_norm": 7.337713241577148, + "learning_rate": 6.564325995379807e-06, + "loss": 2.5526, + "step": 40470 + }, + { + "epoch": 2.750033972007066, + "grad_norm": 6.121457576751709, + "learning_rate": 6.56390134529148e-06, + "loss": 3.1384, + "step": 40475 + }, + { + "epoch": 2.750373692077728, + "grad_norm": 7.126333236694336, + "learning_rate": 6.563476695203154e-06, + "loss": 3.0235, + "step": 40480 + }, + { + "epoch": 2.75071341214839, + "grad_norm": 5.507492542266846, + "learning_rate": 6.563052045114826e-06, + "loss": 3.0913, + "step": 40485 + }, + { + "epoch": 2.7510531322190515, + "grad_norm": 6.1590447425842285, + "learning_rate": 6.562627395026499e-06, + "loss": 3.0424, + "step": 40490 + }, + { + "epoch": 2.7513928522897135, + "grad_norm": 6.030524253845215, + "learning_rate": 6.562202744938172e-06, + "loss": 2.9937, + "step": 40495 + }, + { + "epoch": 2.751732572360375, + "grad_norm": 7.121448516845703, + "learning_rate": 6.561778094849844e-06, + "loss": 3.092, + "step": 40500 + }, + { + "epoch": 2.752072292431037, + "grad_norm": 6.341362476348877, + "learning_rate": 6.561353444761517e-06, + "loss": 2.8091, + "step": 40505 + }, + { + "epoch": 2.752412012501699, + "grad_norm": 5.633964538574219, + "learning_rate": 6.560928794673191e-06, + "loss": 3.033, + "step": 40510 + }, + { + "epoch": 2.7527517325723605, + "grad_norm": 6.961881637573242, + "learning_rate": 6.560504144584863e-06, + "loss": 3.0635, + "step": 40515 + }, + { + "epoch": 2.753091452643022, + "grad_norm": 7.411567211151123, + "learning_rate": 6.560079494496535e-06, + "loss": 2.8382, + "step": 40520 + }, + { + "epoch": 2.7534311727136838, + "grad_norm": 8.309521675109863, + "learning_rate": 6.559654844408209e-06, + "loss": 3.0177, + "step": 40525 + }, + { + "epoch": 2.753770892784346, + "grad_norm": 6.007071495056152, + "learning_rate": 6.559230194319881e-06, + "loss": 2.9785, + "step": 40530 + }, + { + "epoch": 2.7541106128550075, + "grad_norm": 5.275259971618652, + "learning_rate": 6.558805544231554e-06, + "loss": 3.2038, + "step": 40535 + }, + { + "epoch": 2.754450332925669, + "grad_norm": 6.918881416320801, + "learning_rate": 6.558380894143227e-06, + "loss": 3.044, + "step": 40540 + }, + { + "epoch": 2.754790052996331, + "grad_norm": 7.253678321838379, + "learning_rate": 6.557956244054899e-06, + "loss": 3.0269, + "step": 40545 + }, + { + "epoch": 2.755129773066993, + "grad_norm": 7.955885410308838, + "learning_rate": 6.557531593966572e-06, + "loss": 3.2256, + "step": 40550 + }, + { + "epoch": 2.7554694931376544, + "grad_norm": 7.931502342224121, + "learning_rate": 6.557106943878245e-06, + "loss": 3.1989, + "step": 40555 + }, + { + "epoch": 2.7558092132083165, + "grad_norm": 5.875177383422852, + "learning_rate": 6.556682293789918e-06, + "loss": 2.9816, + "step": 40560 + }, + { + "epoch": 2.756148933278978, + "grad_norm": 6.036719799041748, + "learning_rate": 6.55625764370159e-06, + "loss": 2.9765, + "step": 40565 + }, + { + "epoch": 2.7564886533496398, + "grad_norm": 6.330686092376709, + "learning_rate": 6.555832993613263e-06, + "loss": 2.7938, + "step": 40570 + }, + { + "epoch": 2.7568283734203014, + "grad_norm": 7.527028560638428, + "learning_rate": 6.555408343524936e-06, + "loss": 2.9816, + "step": 40575 + }, + { + "epoch": 2.7571680934909635, + "grad_norm": 8.245838165283203, + "learning_rate": 6.554983693436608e-06, + "loss": 2.9884, + "step": 40580 + }, + { + "epoch": 2.757507813561625, + "grad_norm": 8.377872467041016, + "learning_rate": 6.554559043348282e-06, + "loss": 2.9587, + "step": 40585 + }, + { + "epoch": 2.7578475336322867, + "grad_norm": 7.148929119110107, + "learning_rate": 6.554134393259955e-06, + "loss": 2.9465, + "step": 40590 + }, + { + "epoch": 2.758187253702949, + "grad_norm": 8.42023754119873, + "learning_rate": 6.5537097431716266e-06, + "loss": 3.0889, + "step": 40595 + }, + { + "epoch": 2.7585269737736104, + "grad_norm": 6.7378010749816895, + "learning_rate": 6.5532850930833e-06, + "loss": 3.1561, + "step": 40600 + }, + { + "epoch": 2.758866693844272, + "grad_norm": 5.736878871917725, + "learning_rate": 6.552860442994973e-06, + "loss": 3.1674, + "step": 40605 + }, + { + "epoch": 2.759206413914934, + "grad_norm": 7.925562858581543, + "learning_rate": 6.552435792906645e-06, + "loss": 3.0809, + "step": 40610 + }, + { + "epoch": 2.759546133985596, + "grad_norm": 6.676865577697754, + "learning_rate": 6.552011142818319e-06, + "loss": 2.6902, + "step": 40615 + }, + { + "epoch": 2.7598858540562574, + "grad_norm": 6.390982627868652, + "learning_rate": 6.551586492729991e-06, + "loss": 3.0492, + "step": 40620 + }, + { + "epoch": 2.7602255741269195, + "grad_norm": 5.940365791320801, + "learning_rate": 6.551161842641663e-06, + "loss": 2.9846, + "step": 40625 + }, + { + "epoch": 2.760565294197581, + "grad_norm": 7.362405776977539, + "learning_rate": 6.550737192553337e-06, + "loss": 2.9814, + "step": 40630 + }, + { + "epoch": 2.7609050142682428, + "grad_norm": 6.774704456329346, + "learning_rate": 6.550312542465009e-06, + "loss": 2.9743, + "step": 40635 + }, + { + "epoch": 2.761244734338905, + "grad_norm": 5.246988773345947, + "learning_rate": 6.549887892376682e-06, + "loss": 2.7529, + "step": 40640 + }, + { + "epoch": 2.7615844544095665, + "grad_norm": 8.061402320861816, + "learning_rate": 6.5494632422883554e-06, + "loss": 3.1756, + "step": 40645 + }, + { + "epoch": 2.761924174480228, + "grad_norm": 7.695900917053223, + "learning_rate": 6.549038592200027e-06, + "loss": 3.1003, + "step": 40650 + }, + { + "epoch": 2.76226389455089, + "grad_norm": 5.91233491897583, + "learning_rate": 6.5486139421117e-06, + "loss": 3.0359, + "step": 40655 + }, + { + "epoch": 2.762603614621552, + "grad_norm": 7.740060806274414, + "learning_rate": 6.548189292023374e-06, + "loss": 3.0226, + "step": 40660 + }, + { + "epoch": 2.7629433346922134, + "grad_norm": 6.190329074859619, + "learning_rate": 6.547764641935046e-06, + "loss": 2.9572, + "step": 40665 + }, + { + "epoch": 2.7632830547628755, + "grad_norm": 6.305927276611328, + "learning_rate": 6.547339991846719e-06, + "loss": 3.0927, + "step": 40670 + }, + { + "epoch": 2.763622774833537, + "grad_norm": 7.603245735168457, + "learning_rate": 6.546915341758392e-06, + "loss": 2.9221, + "step": 40675 + }, + { + "epoch": 2.7639624949041988, + "grad_norm": 5.7694549560546875, + "learning_rate": 6.546490691670064e-06, + "loss": 2.8878, + "step": 40680 + }, + { + "epoch": 2.764302214974861, + "grad_norm": 8.194539070129395, + "learning_rate": 6.546066041581737e-06, + "loss": 3.2393, + "step": 40685 + }, + { + "epoch": 2.7646419350455225, + "grad_norm": 6.828918933868408, + "learning_rate": 6.545641391493411e-06, + "loss": 3.1997, + "step": 40690 + }, + { + "epoch": 2.764981655116184, + "grad_norm": 7.198002815246582, + "learning_rate": 6.545216741405083e-06, + "loss": 3.0083, + "step": 40695 + }, + { + "epoch": 2.765321375186846, + "grad_norm": 7.652865886688232, + "learning_rate": 6.544792091316755e-06, + "loss": 2.8946, + "step": 40700 + }, + { + "epoch": 2.765661095257508, + "grad_norm": 5.881415843963623, + "learning_rate": 6.544367441228428e-06, + "loss": 3.0564, + "step": 40705 + }, + { + "epoch": 2.7660008153281694, + "grad_norm": 7.70552396774292, + "learning_rate": 6.543942791140101e-06, + "loss": 2.9792, + "step": 40710 + }, + { + "epoch": 2.7663405353988315, + "grad_norm": 6.189456939697266, + "learning_rate": 6.543518141051774e-06, + "loss": 2.9772, + "step": 40715 + }, + { + "epoch": 2.766680255469493, + "grad_norm": 8.129439353942871, + "learning_rate": 6.543093490963447e-06, + "loss": 2.9814, + "step": 40720 + }, + { + "epoch": 2.7670199755401548, + "grad_norm": 7.199769496917725, + "learning_rate": 6.542668840875119e-06, + "loss": 2.827, + "step": 40725 + }, + { + "epoch": 2.767359695610817, + "grad_norm": 8.082324028015137, + "learning_rate": 6.542244190786791e-06, + "loss": 2.9773, + "step": 40730 + }, + { + "epoch": 2.7676994156814785, + "grad_norm": 5.783034801483154, + "learning_rate": 6.541819540698465e-06, + "loss": 3.0232, + "step": 40735 + }, + { + "epoch": 2.76803913575214, + "grad_norm": 7.087257385253906, + "learning_rate": 6.541394890610138e-06, + "loss": 3.0728, + "step": 40740 + }, + { + "epoch": 2.768378855822802, + "grad_norm": 6.565028190612793, + "learning_rate": 6.54097024052181e-06, + "loss": 3.0096, + "step": 40745 + }, + { + "epoch": 2.768718575893464, + "grad_norm": 6.9840898513793945, + "learning_rate": 6.540545590433483e-06, + "loss": 3.0322, + "step": 40750 + }, + { + "epoch": 2.7690582959641254, + "grad_norm": 8.098379135131836, + "learning_rate": 6.540120940345156e-06, + "loss": 2.9891, + "step": 40755 + }, + { + "epoch": 2.7693980160347875, + "grad_norm": 6.322446823120117, + "learning_rate": 6.539696290256828e-06, + "loss": 3.0147, + "step": 40760 + }, + { + "epoch": 2.769737736105449, + "grad_norm": 6.670670032501221, + "learning_rate": 6.539271640168502e-06, + "loss": 3.0967, + "step": 40765 + }, + { + "epoch": 2.770077456176111, + "grad_norm": 6.275531768798828, + "learning_rate": 6.538846990080175e-06, + "loss": 3.0252, + "step": 40770 + }, + { + "epoch": 2.770417176246773, + "grad_norm": 9.02888011932373, + "learning_rate": 6.5384223399918466e-06, + "loss": 3.2965, + "step": 40775 + }, + { + "epoch": 2.7707568963174345, + "grad_norm": 7.145671367645264, + "learning_rate": 6.53799768990352e-06, + "loss": 3.0227, + "step": 40780 + }, + { + "epoch": 2.771096616388096, + "grad_norm": 7.697038173675537, + "learning_rate": 6.537573039815193e-06, + "loss": 3.0509, + "step": 40785 + }, + { + "epoch": 2.771436336458758, + "grad_norm": 6.131148815155029, + "learning_rate": 6.537148389726865e-06, + "loss": 3.0971, + "step": 40790 + }, + { + "epoch": 2.77177605652942, + "grad_norm": 6.53752326965332, + "learning_rate": 6.536723739638539e-06, + "loss": 3.1161, + "step": 40795 + }, + { + "epoch": 2.7721157766000815, + "grad_norm": 6.317758083343506, + "learning_rate": 6.5362990895502106e-06, + "loss": 3.0401, + "step": 40800 + }, + { + "epoch": 2.7724554966707435, + "grad_norm": 7.611598968505859, + "learning_rate": 6.535874439461884e-06, + "loss": 2.912, + "step": 40805 + }, + { + "epoch": 2.772795216741405, + "grad_norm": 6.536986351013184, + "learning_rate": 6.535449789373557e-06, + "loss": 2.901, + "step": 40810 + }, + { + "epoch": 2.773134936812067, + "grad_norm": 7.720832347869873, + "learning_rate": 6.535025139285229e-06, + "loss": 3.1791, + "step": 40815 + }, + { + "epoch": 2.773474656882729, + "grad_norm": 6.059291839599609, + "learning_rate": 6.534600489196903e-06, + "loss": 3.2106, + "step": 40820 + }, + { + "epoch": 2.7738143769533905, + "grad_norm": 8.010421752929688, + "learning_rate": 6.534175839108575e-06, + "loss": 2.9882, + "step": 40825 + }, + { + "epoch": 2.774154097024052, + "grad_norm": 9.055464744567871, + "learning_rate": 6.533751189020247e-06, + "loss": 2.9482, + "step": 40830 + }, + { + "epoch": 2.774493817094714, + "grad_norm": 6.36646032333374, + "learning_rate": 6.533326538931921e-06, + "loss": 3.1187, + "step": 40835 + }, + { + "epoch": 2.774833537165376, + "grad_norm": 6.069004535675049, + "learning_rate": 6.532901888843594e-06, + "loss": 3.2267, + "step": 40840 + }, + { + "epoch": 2.7751732572360375, + "grad_norm": 6.8024139404296875, + "learning_rate": 6.532477238755266e-06, + "loss": 3.2438, + "step": 40845 + }, + { + "epoch": 2.7755129773066995, + "grad_norm": 7.009382247924805, + "learning_rate": 6.532052588666939e-06, + "loss": 2.9371, + "step": 40850 + }, + { + "epoch": 2.775852697377361, + "grad_norm": 6.4292449951171875, + "learning_rate": 6.531627938578612e-06, + "loss": 2.8439, + "step": 40855 + }, + { + "epoch": 2.776192417448023, + "grad_norm": 6.717789649963379, + "learning_rate": 6.531203288490284e-06, + "loss": 2.9298, + "step": 40860 + }, + { + "epoch": 2.7765321375186844, + "grad_norm": 7.773100852966309, + "learning_rate": 6.530778638401958e-06, + "loss": 3.2535, + "step": 40865 + }, + { + "epoch": 2.7768718575893465, + "grad_norm": 6.1114888191223145, + "learning_rate": 6.530353988313631e-06, + "loss": 2.9898, + "step": 40870 + }, + { + "epoch": 2.777211577660008, + "grad_norm": 8.768919944763184, + "learning_rate": 6.5299293382253026e-06, + "loss": 2.9636, + "step": 40875 + }, + { + "epoch": 2.7775512977306698, + "grad_norm": 7.749453544616699, + "learning_rate": 6.529504688136976e-06, + "loss": 3.0401, + "step": 40880 + }, + { + "epoch": 2.777891017801332, + "grad_norm": 9.042740821838379, + "learning_rate": 6.529080038048648e-06, + "loss": 2.8612, + "step": 40885 + }, + { + "epoch": 2.7782307378719935, + "grad_norm": 7.1327433586120605, + "learning_rate": 6.528655387960321e-06, + "loss": 3.0648, + "step": 40890 + }, + { + "epoch": 2.778570457942655, + "grad_norm": 6.573414325714111, + "learning_rate": 6.528230737871995e-06, + "loss": 3.2478, + "step": 40895 + }, + { + "epoch": 2.778910178013317, + "grad_norm": 7.802298069000244, + "learning_rate": 6.5278060877836666e-06, + "loss": 2.7165, + "step": 40900 + }, + { + "epoch": 2.779249898083979, + "grad_norm": 7.103037357330322, + "learning_rate": 6.527381437695339e-06, + "loss": 3.1418, + "step": 40905 + }, + { + "epoch": 2.7795896181546405, + "grad_norm": 6.549777030944824, + "learning_rate": 6.526956787607013e-06, + "loss": 3.0462, + "step": 40910 + }, + { + "epoch": 2.779929338225302, + "grad_norm": 7.816006183624268, + "learning_rate": 6.526532137518685e-06, + "loss": 2.972, + "step": 40915 + }, + { + "epoch": 2.780269058295964, + "grad_norm": 6.962384223937988, + "learning_rate": 6.526107487430358e-06, + "loss": 2.6698, + "step": 40920 + }, + { + "epoch": 2.780608778366626, + "grad_norm": 6.896947383880615, + "learning_rate": 6.525682837342031e-06, + "loss": 3.1637, + "step": 40925 + }, + { + "epoch": 2.7809484984372874, + "grad_norm": 6.803746223449707, + "learning_rate": 6.525258187253703e-06, + "loss": 3.0635, + "step": 40930 + }, + { + "epoch": 2.7812882185079495, + "grad_norm": 6.211753845214844, + "learning_rate": 6.524833537165376e-06, + "loss": 3.0864, + "step": 40935 + }, + { + "epoch": 2.781627938578611, + "grad_norm": 5.364887714385986, + "learning_rate": 6.52440888707705e-06, + "loss": 3.1639, + "step": 40940 + }, + { + "epoch": 2.7819676586492728, + "grad_norm": 6.621084690093994, + "learning_rate": 6.523984236988722e-06, + "loss": 2.8941, + "step": 40945 + }, + { + "epoch": 2.782307378719935, + "grad_norm": 10.960285186767578, + "learning_rate": 6.523559586900395e-06, + "loss": 3.1236, + "step": 40950 + }, + { + "epoch": 2.7826470987905965, + "grad_norm": 6.7998199462890625, + "learning_rate": 6.523134936812067e-06, + "loss": 3.0984, + "step": 40955 + }, + { + "epoch": 2.782986818861258, + "grad_norm": 7.286172866821289, + "learning_rate": 6.52271028672374e-06, + "loss": 3.032, + "step": 40960 + }, + { + "epoch": 2.78332653893192, + "grad_norm": 8.168346405029297, + "learning_rate": 6.522285636635413e-06, + "loss": 2.9703, + "step": 40965 + }, + { + "epoch": 2.783666259002582, + "grad_norm": 5.856871604919434, + "learning_rate": 6.521860986547086e-06, + "loss": 2.9278, + "step": 40970 + }, + { + "epoch": 2.7840059790732434, + "grad_norm": 6.721293926239014, + "learning_rate": 6.521436336458759e-06, + "loss": 3.1124, + "step": 40975 + }, + { + "epoch": 2.7843456991439055, + "grad_norm": 7.476454734802246, + "learning_rate": 6.5210116863704305e-06, + "loss": 2.7711, + "step": 40980 + }, + { + "epoch": 2.784685419214567, + "grad_norm": 6.675976753234863, + "learning_rate": 6.520587036282104e-06, + "loss": 3.0929, + "step": 40985 + }, + { + "epoch": 2.7850251392852288, + "grad_norm": 7.01841402053833, + "learning_rate": 6.520162386193777e-06, + "loss": 2.984, + "step": 40990 + }, + { + "epoch": 2.785364859355891, + "grad_norm": 7.393857479095459, + "learning_rate": 6.519737736105449e-06, + "loss": 3.1004, + "step": 40995 + }, + { + "epoch": 2.7857045794265525, + "grad_norm": 7.298580169677734, + "learning_rate": 6.519313086017123e-06, + "loss": 2.8136, + "step": 41000 + }, + { + "epoch": 2.786044299497214, + "grad_norm": 5.838169097900391, + "learning_rate": 6.518888435928795e-06, + "loss": 3.3166, + "step": 41005 + }, + { + "epoch": 2.786384019567876, + "grad_norm": 6.979430675506592, + "learning_rate": 6.518463785840467e-06, + "loss": 3.1092, + "step": 41010 + }, + { + "epoch": 2.786723739638538, + "grad_norm": 5.725465774536133, + "learning_rate": 6.518039135752141e-06, + "loss": 3.1852, + "step": 41015 + }, + { + "epoch": 2.7870634597091994, + "grad_norm": 7.325740337371826, + "learning_rate": 6.517614485663814e-06, + "loss": 2.9795, + "step": 41020 + }, + { + "epoch": 2.7874031797798615, + "grad_norm": 6.5783915519714355, + "learning_rate": 6.517189835575486e-06, + "loss": 3.1706, + "step": 41025 + }, + { + "epoch": 2.787742899850523, + "grad_norm": 9.221442222595215, + "learning_rate": 6.516765185487159e-06, + "loss": 2.895, + "step": 41030 + }, + { + "epoch": 2.788082619921185, + "grad_norm": 6.954926490783691, + "learning_rate": 6.516340535398832e-06, + "loss": 3.1352, + "step": 41035 + }, + { + "epoch": 2.788422339991847, + "grad_norm": 6.953190803527832, + "learning_rate": 6.515915885310504e-06, + "loss": 3.0651, + "step": 41040 + }, + { + "epoch": 2.7887620600625085, + "grad_norm": 5.98902702331543, + "learning_rate": 6.515491235222178e-06, + "loss": 2.679, + "step": 41045 + }, + { + "epoch": 2.78910178013317, + "grad_norm": 7.6798930168151855, + "learning_rate": 6.51506658513385e-06, + "loss": 2.8245, + "step": 41050 + }, + { + "epoch": 2.789441500203832, + "grad_norm": 4.649390697479248, + "learning_rate": 6.5146419350455225e-06, + "loss": 3.1696, + "step": 41055 + }, + { + "epoch": 2.789781220274494, + "grad_norm": 6.059734344482422, + "learning_rate": 6.514217284957196e-06, + "loss": 3.0299, + "step": 41060 + }, + { + "epoch": 2.7901209403451555, + "grad_norm": 6.223656177520752, + "learning_rate": 6.513792634868868e-06, + "loss": 2.9832, + "step": 41065 + }, + { + "epoch": 2.7904606604158175, + "grad_norm": 6.062582015991211, + "learning_rate": 6.513367984780541e-06, + "loss": 3.1476, + "step": 41070 + }, + { + "epoch": 2.790800380486479, + "grad_norm": 8.086355209350586, + "learning_rate": 6.512943334692215e-06, + "loss": 2.9132, + "step": 41075 + }, + { + "epoch": 2.791140100557141, + "grad_norm": 7.175983905792236, + "learning_rate": 6.5125186846038866e-06, + "loss": 2.8847, + "step": 41080 + }, + { + "epoch": 2.791479820627803, + "grad_norm": 8.347475051879883, + "learning_rate": 6.512094034515559e-06, + "loss": 3.0219, + "step": 41085 + }, + { + "epoch": 2.7918195406984645, + "grad_norm": 6.581754684448242, + "learning_rate": 6.511669384427233e-06, + "loss": 2.9635, + "step": 41090 + }, + { + "epoch": 2.792159260769126, + "grad_norm": 7.091050624847412, + "learning_rate": 6.511244734338905e-06, + "loss": 3.1006, + "step": 41095 + }, + { + "epoch": 2.792498980839788, + "grad_norm": 7.244357109069824, + "learning_rate": 6.510820084250578e-06, + "loss": 2.9613, + "step": 41100 + }, + { + "epoch": 2.79283870091045, + "grad_norm": 6.012045383453369, + "learning_rate": 6.510395434162251e-06, + "loss": 3.1189, + "step": 41105 + }, + { + "epoch": 2.7931784209811115, + "grad_norm": 6.223806381225586, + "learning_rate": 6.509970784073923e-06, + "loss": 3.0836, + "step": 41110 + }, + { + "epoch": 2.7935181410517735, + "grad_norm": 6.621018886566162, + "learning_rate": 6.509546133985596e-06, + "loss": 3.1806, + "step": 41115 + }, + { + "epoch": 2.793857861122435, + "grad_norm": 6.295006275177002, + "learning_rate": 6.50912148389727e-06, + "loss": 3.0093, + "step": 41120 + }, + { + "epoch": 2.794197581193097, + "grad_norm": 5.939113616943359, + "learning_rate": 6.508696833808942e-06, + "loss": 3.1672, + "step": 41125 + }, + { + "epoch": 2.794537301263759, + "grad_norm": 6.788038730621338, + "learning_rate": 6.5082721837206146e-06, + "loss": 2.8308, + "step": 41130 + }, + { + "epoch": 2.7948770213344205, + "grad_norm": 5.979137897491455, + "learning_rate": 6.507847533632287e-06, + "loss": 2.983, + "step": 41135 + }, + { + "epoch": 2.795216741405082, + "grad_norm": 7.092451572418213, + "learning_rate": 6.50742288354396e-06, + "loss": 2.8913, + "step": 41140 + }, + { + "epoch": 2.795556461475744, + "grad_norm": 10.187573432922363, + "learning_rate": 6.506998233455634e-06, + "loss": 2.7814, + "step": 41145 + }, + { + "epoch": 2.795896181546406, + "grad_norm": 7.095305919647217, + "learning_rate": 6.506573583367306e-06, + "loss": 3.0348, + "step": 41150 + }, + { + "epoch": 2.7962359016170675, + "grad_norm": 8.03724479675293, + "learning_rate": 6.5061489332789786e-06, + "loss": 2.8609, + "step": 41155 + }, + { + "epoch": 2.7965756216877296, + "grad_norm": 6.4180169105529785, + "learning_rate": 6.505724283190652e-06, + "loss": 2.9964, + "step": 41160 + }, + { + "epoch": 2.796915341758391, + "grad_norm": 8.190896987915039, + "learning_rate": 6.505299633102324e-06, + "loss": 2.9721, + "step": 41165 + }, + { + "epoch": 2.797255061829053, + "grad_norm": 5.879998207092285, + "learning_rate": 6.504874983013997e-06, + "loss": 3.0379, + "step": 41170 + }, + { + "epoch": 2.797594781899715, + "grad_norm": 6.811498165130615, + "learning_rate": 6.504450332925671e-06, + "loss": 3.0556, + "step": 41175 + }, + { + "epoch": 2.7979345019703765, + "grad_norm": 7.436062335968018, + "learning_rate": 6.5040256828373426e-06, + "loss": 3.0855, + "step": 41180 + }, + { + "epoch": 2.798274222041038, + "grad_norm": 7.144768238067627, + "learning_rate": 6.503601032749015e-06, + "loss": 2.9463, + "step": 41185 + }, + { + "epoch": 2.7986139421117002, + "grad_norm": 5.983580112457275, + "learning_rate": 6.503176382660689e-06, + "loss": 3.0364, + "step": 41190 + }, + { + "epoch": 2.798953662182362, + "grad_norm": 5.6702799797058105, + "learning_rate": 6.502751732572361e-06, + "loss": 3.3093, + "step": 41195 + }, + { + "epoch": 2.7992933822530235, + "grad_norm": 7.513028144836426, + "learning_rate": 6.502327082484034e-06, + "loss": 2.9859, + "step": 41200 + }, + { + "epoch": 2.799633102323685, + "grad_norm": 5.885880947113037, + "learning_rate": 6.5019024323957066e-06, + "loss": 3.0008, + "step": 41205 + }, + { + "epoch": 2.799972822394347, + "grad_norm": 6.470900058746338, + "learning_rate": 6.501477782307379e-06, + "loss": 3.2718, + "step": 41210 + }, + { + "epoch": 2.800312542465009, + "grad_norm": 6.491139888763428, + "learning_rate": 6.501053132219052e-06, + "loss": 2.8076, + "step": 41215 + }, + { + "epoch": 2.8006522625356705, + "grad_norm": 7.045638561248779, + "learning_rate": 6.500628482130725e-06, + "loss": 2.8214, + "step": 41220 + }, + { + "epoch": 2.8009919826063325, + "grad_norm": 6.389166831970215, + "learning_rate": 6.500203832042398e-06, + "loss": 3.0132, + "step": 41225 + }, + { + "epoch": 2.801331702676994, + "grad_norm": 7.525749206542969, + "learning_rate": 6.49977918195407e-06, + "loss": 2.9596, + "step": 41230 + }, + { + "epoch": 2.801671422747656, + "grad_norm": 6.903130054473877, + "learning_rate": 6.499354531865743e-06, + "loss": 2.9143, + "step": 41235 + }, + { + "epoch": 2.802011142818318, + "grad_norm": 7.4684624671936035, + "learning_rate": 6.498929881777416e-06, + "loss": 3.0509, + "step": 41240 + }, + { + "epoch": 2.8023508628889795, + "grad_norm": 7.786921977996826, + "learning_rate": 6.498505231689088e-06, + "loss": 2.9347, + "step": 41245 + }, + { + "epoch": 2.802690582959641, + "grad_norm": 6.207686424255371, + "learning_rate": 6.498080581600762e-06, + "loss": 2.9499, + "step": 41250 + }, + { + "epoch": 2.8030303030303028, + "grad_norm": 7.589352607727051, + "learning_rate": 6.497655931512435e-06, + "loss": 2.987, + "step": 41255 + }, + { + "epoch": 2.803370023100965, + "grad_norm": 7.800253868103027, + "learning_rate": 6.4972312814241065e-06, + "loss": 3.5234, + "step": 41260 + }, + { + "epoch": 2.8037097431716265, + "grad_norm": 6.498704433441162, + "learning_rate": 6.49680663133578e-06, + "loss": 3.0477, + "step": 41265 + }, + { + "epoch": 2.804049463242288, + "grad_norm": 6.004737377166748, + "learning_rate": 6.496381981247453e-06, + "loss": 2.87, + "step": 41270 + }, + { + "epoch": 2.80438918331295, + "grad_norm": 6.258490562438965, + "learning_rate": 6.495957331159125e-06, + "loss": 2.9291, + "step": 41275 + }, + { + "epoch": 2.804728903383612, + "grad_norm": 6.970682144165039, + "learning_rate": 6.495532681070799e-06, + "loss": 3.044, + "step": 41280 + }, + { + "epoch": 2.8050686234542734, + "grad_norm": 6.441134929656982, + "learning_rate": 6.495108030982471e-06, + "loss": 2.8494, + "step": 41285 + }, + { + "epoch": 2.8054083435249355, + "grad_norm": 9.064065933227539, + "learning_rate": 6.494683380894143e-06, + "loss": 3.1709, + "step": 41290 + }, + { + "epoch": 2.805748063595597, + "grad_norm": 6.62473201751709, + "learning_rate": 6.494258730805817e-06, + "loss": 2.6981, + "step": 41295 + }, + { + "epoch": 2.8060877836662588, + "grad_norm": 6.933293342590332, + "learning_rate": 6.493834080717489e-06, + "loss": 3.0477, + "step": 41300 + }, + { + "epoch": 2.806427503736921, + "grad_norm": 6.754517555236816, + "learning_rate": 6.493409430629162e-06, + "loss": 3.1381, + "step": 41305 + }, + { + "epoch": 2.8067672238075825, + "grad_norm": 6.117249011993408, + "learning_rate": 6.492984780540835e-06, + "loss": 2.8377, + "step": 41310 + }, + { + "epoch": 2.807106943878244, + "grad_norm": 7.300892353057861, + "learning_rate": 6.492560130452507e-06, + "loss": 2.9432, + "step": 41315 + }, + { + "epoch": 2.807446663948906, + "grad_norm": 7.978987216949463, + "learning_rate": 6.49213548036418e-06, + "loss": 2.9629, + "step": 41320 + }, + { + "epoch": 2.807786384019568, + "grad_norm": 6.3762407302856445, + "learning_rate": 6.491710830275854e-06, + "loss": 3.132, + "step": 41325 + }, + { + "epoch": 2.8081261040902294, + "grad_norm": 7.060822010040283, + "learning_rate": 6.491286180187526e-06, + "loss": 2.9976, + "step": 41330 + }, + { + "epoch": 2.8084658241608915, + "grad_norm": 8.044644355773926, + "learning_rate": 6.4908615300991985e-06, + "loss": 2.9967, + "step": 41335 + }, + { + "epoch": 2.808805544231553, + "grad_norm": 6.828532695770264, + "learning_rate": 6.490436880010872e-06, + "loss": 3.007, + "step": 41340 + }, + { + "epoch": 2.809145264302215, + "grad_norm": 6.192978858947754, + "learning_rate": 6.490012229922544e-06, + "loss": 3.0646, + "step": 41345 + }, + { + "epoch": 2.809484984372877, + "grad_norm": 7.11400032043457, + "learning_rate": 6.489587579834217e-06, + "loss": 2.8231, + "step": 41350 + }, + { + "epoch": 2.8098247044435385, + "grad_norm": 6.529383182525635, + "learning_rate": 6.489162929745891e-06, + "loss": 3.1291, + "step": 41355 + }, + { + "epoch": 2.8101644245142, + "grad_norm": 7.590965270996094, + "learning_rate": 6.4887382796575625e-06, + "loss": 2.9993, + "step": 41360 + }, + { + "epoch": 2.810504144584862, + "grad_norm": 9.051068305969238, + "learning_rate": 6.488313629569235e-06, + "loss": 3.1131, + "step": 41365 + }, + { + "epoch": 2.810843864655524, + "grad_norm": 8.238490104675293, + "learning_rate": 6.487888979480908e-06, + "loss": 3.034, + "step": 41370 + }, + { + "epoch": 2.8111835847261855, + "grad_norm": 7.926944255828857, + "learning_rate": 6.487464329392581e-06, + "loss": 3.0375, + "step": 41375 + }, + { + "epoch": 2.8115233047968475, + "grad_norm": 6.310218334197998, + "learning_rate": 6.487039679304254e-06, + "loss": 3.2174, + "step": 41380 + }, + { + "epoch": 2.811863024867509, + "grad_norm": 7.589714527130127, + "learning_rate": 6.4866150292159266e-06, + "loss": 3.2579, + "step": 41385 + }, + { + "epoch": 2.812202744938171, + "grad_norm": 6.523081302642822, + "learning_rate": 6.486190379127599e-06, + "loss": 3.1463, + "step": 41390 + }, + { + "epoch": 2.812542465008833, + "grad_norm": 6.749239444732666, + "learning_rate": 6.485765729039271e-06, + "loss": 3.1386, + "step": 41395 + }, + { + "epoch": 2.8128821850794945, + "grad_norm": 6.291728973388672, + "learning_rate": 6.485341078950945e-06, + "loss": 3.119, + "step": 41400 + }, + { + "epoch": 2.813221905150156, + "grad_norm": 6.818948745727539, + "learning_rate": 6.484916428862618e-06, + "loss": 3.0964, + "step": 41405 + }, + { + "epoch": 2.813561625220818, + "grad_norm": 5.7272796630859375, + "learning_rate": 6.48449177877429e-06, + "loss": 3.1356, + "step": 41410 + }, + { + "epoch": 2.81390134529148, + "grad_norm": 7.600958824157715, + "learning_rate": 6.484067128685963e-06, + "loss": 3.115, + "step": 41415 + }, + { + "epoch": 2.8142410653621415, + "grad_norm": 9.370437622070312, + "learning_rate": 6.483642478597636e-06, + "loss": 3.2483, + "step": 41420 + }, + { + "epoch": 2.8145807854328035, + "grad_norm": 6.765005111694336, + "learning_rate": 6.483217828509308e-06, + "loss": 3.1034, + "step": 41425 + }, + { + "epoch": 2.814920505503465, + "grad_norm": 9.176674842834473, + "learning_rate": 6.482793178420982e-06, + "loss": 2.9428, + "step": 41430 + }, + { + "epoch": 2.815260225574127, + "grad_norm": 6.232470512390137, + "learning_rate": 6.4823685283326546e-06, + "loss": 2.897, + "step": 41435 + }, + { + "epoch": 2.815599945644789, + "grad_norm": 8.639704704284668, + "learning_rate": 6.4819438782443265e-06, + "loss": 2.9881, + "step": 41440 + }, + { + "epoch": 2.8159396657154505, + "grad_norm": 7.615033149719238, + "learning_rate": 6.481519228156e-06, + "loss": 3.0826, + "step": 41445 + }, + { + "epoch": 2.816279385786112, + "grad_norm": 7.51003360748291, + "learning_rate": 6.481094578067673e-06, + "loss": 3.4247, + "step": 41450 + }, + { + "epoch": 2.816619105856774, + "grad_norm": 6.870616436004639, + "learning_rate": 6.480669927979345e-06, + "loss": 3.0788, + "step": 41455 + }, + { + "epoch": 2.816958825927436, + "grad_norm": 6.42258882522583, + "learning_rate": 6.4802452778910186e-06, + "loss": 3.0941, + "step": 41460 + }, + { + "epoch": 2.8172985459980975, + "grad_norm": 7.813767433166504, + "learning_rate": 6.479820627802691e-06, + "loss": 3.024, + "step": 41465 + }, + { + "epoch": 2.8176382660687596, + "grad_norm": 6.833556175231934, + "learning_rate": 6.479395977714363e-06, + "loss": 3.0124, + "step": 41470 + }, + { + "epoch": 2.817977986139421, + "grad_norm": 5.733304023742676, + "learning_rate": 6.478971327626037e-06, + "loss": 3.169, + "step": 41475 + }, + { + "epoch": 2.818317706210083, + "grad_norm": 5.682704448699951, + "learning_rate": 6.478546677537709e-06, + "loss": 3.0189, + "step": 41480 + }, + { + "epoch": 2.818657426280745, + "grad_norm": 7.43728494644165, + "learning_rate": 6.4781220274493826e-06, + "loss": 3.1968, + "step": 41485 + }, + { + "epoch": 2.8189971463514065, + "grad_norm": 7.358680725097656, + "learning_rate": 6.477697377361055e-06, + "loss": 3.032, + "step": 41490 + }, + { + "epoch": 2.819336866422068, + "grad_norm": 6.635673522949219, + "learning_rate": 6.477272727272727e-06, + "loss": 3.1269, + "step": 41495 + }, + { + "epoch": 2.8196765864927302, + "grad_norm": 7.587494850158691, + "learning_rate": 6.476848077184401e-06, + "loss": 2.867, + "step": 41500 + }, + { + "epoch": 2.820016306563392, + "grad_norm": 6.531081676483154, + "learning_rate": 6.476423427096074e-06, + "loss": 3.1205, + "step": 41505 + }, + { + "epoch": 2.8203560266340535, + "grad_norm": 6.592920780181885, + "learning_rate": 6.475998777007746e-06, + "loss": 3.0698, + "step": 41510 + }, + { + "epoch": 2.8206957467047156, + "grad_norm": 7.4168806076049805, + "learning_rate": 6.475574126919419e-06, + "loss": 3.0065, + "step": 41515 + }, + { + "epoch": 2.821035466775377, + "grad_norm": 7.945240497589111, + "learning_rate": 6.475149476831092e-06, + "loss": 3.1193, + "step": 41520 + }, + { + "epoch": 2.821375186846039, + "grad_norm": 5.521647930145264, + "learning_rate": 6.474724826742764e-06, + "loss": 2.952, + "step": 41525 + }, + { + "epoch": 2.821714906916701, + "grad_norm": 5.833735466003418, + "learning_rate": 6.474300176654438e-06, + "loss": 3.0883, + "step": 41530 + }, + { + "epoch": 2.8220546269873625, + "grad_norm": 5.013900279998779, + "learning_rate": 6.4738755265661106e-06, + "loss": 3.1102, + "step": 41535 + }, + { + "epoch": 2.822394347058024, + "grad_norm": 6.83367395401001, + "learning_rate": 6.4734508764777825e-06, + "loss": 2.981, + "step": 41540 + }, + { + "epoch": 2.822734067128686, + "grad_norm": 5.068186283111572, + "learning_rate": 6.473026226389456e-06, + "loss": 2.8901, + "step": 41545 + }, + { + "epoch": 2.823073787199348, + "grad_norm": 6.2950053215026855, + "learning_rate": 6.472601576301128e-06, + "loss": 2.9983, + "step": 41550 + }, + { + "epoch": 2.8234135072700095, + "grad_norm": 7.731604099273682, + "learning_rate": 6.472176926212801e-06, + "loss": 3.2742, + "step": 41555 + }, + { + "epoch": 2.823753227340671, + "grad_norm": 6.458128452301025, + "learning_rate": 6.471752276124475e-06, + "loss": 3.1596, + "step": 41560 + }, + { + "epoch": 2.824092947411333, + "grad_norm": 8.27320384979248, + "learning_rate": 6.4713276260361465e-06, + "loss": 3.3473, + "step": 41565 + }, + { + "epoch": 2.824432667481995, + "grad_norm": 6.7022833824157715, + "learning_rate": 6.470902975947819e-06, + "loss": 2.9185, + "step": 41570 + }, + { + "epoch": 2.8247723875526565, + "grad_norm": 8.802841186523438, + "learning_rate": 6.470478325859493e-06, + "loss": 3.1326, + "step": 41575 + }, + { + "epoch": 2.8251121076233185, + "grad_norm": 5.20651912689209, + "learning_rate": 6.470053675771165e-06, + "loss": 2.9771, + "step": 41580 + }, + { + "epoch": 2.82545182769398, + "grad_norm": 7.846694469451904, + "learning_rate": 6.469629025682838e-06, + "loss": 2.9975, + "step": 41585 + }, + { + "epoch": 2.825791547764642, + "grad_norm": 7.163923263549805, + "learning_rate": 6.469204375594511e-06, + "loss": 3.2677, + "step": 41590 + }, + { + "epoch": 2.8261312678353034, + "grad_norm": 6.796749591827393, + "learning_rate": 6.468779725506183e-06, + "loss": 2.9962, + "step": 41595 + }, + { + "epoch": 2.8264709879059655, + "grad_norm": 6.184351921081543, + "learning_rate": 6.468355075417856e-06, + "loss": 3.1648, + "step": 41600 + }, + { + "epoch": 2.826810707976627, + "grad_norm": 6.614372730255127, + "learning_rate": 6.46793042532953e-06, + "loss": 3.0597, + "step": 41605 + }, + { + "epoch": 2.8271504280472888, + "grad_norm": 6.179275035858154, + "learning_rate": 6.467505775241202e-06, + "loss": 3.0071, + "step": 41610 + }, + { + "epoch": 2.827490148117951, + "grad_norm": 6.236144542694092, + "learning_rate": 6.4670811251528745e-06, + "loss": 3.0387, + "step": 41615 + }, + { + "epoch": 2.8278298681886125, + "grad_norm": 6.715654373168945, + "learning_rate": 6.466656475064547e-06, + "loss": 3.0603, + "step": 41620 + }, + { + "epoch": 2.828169588259274, + "grad_norm": 7.945906162261963, + "learning_rate": 6.46623182497622e-06, + "loss": 2.8495, + "step": 41625 + }, + { + "epoch": 2.828509308329936, + "grad_norm": 5.889591217041016, + "learning_rate": 6.465807174887893e-06, + "loss": 3.1366, + "step": 41630 + }, + { + "epoch": 2.828849028400598, + "grad_norm": 5.640597820281982, + "learning_rate": 6.465382524799566e-06, + "loss": 3.0134, + "step": 41635 + }, + { + "epoch": 2.8291887484712595, + "grad_norm": 5.486609935760498, + "learning_rate": 6.4649578747112385e-06, + "loss": 3.0002, + "step": 41640 + }, + { + "epoch": 2.8295284685419215, + "grad_norm": 7.342526435852051, + "learning_rate": 6.4645332246229105e-06, + "loss": 3.0799, + "step": 41645 + }, + { + "epoch": 2.829868188612583, + "grad_norm": 7.298508167266846, + "learning_rate": 6.464108574534584e-06, + "loss": 3.181, + "step": 41650 + }, + { + "epoch": 2.830207908683245, + "grad_norm": 8.4816312789917, + "learning_rate": 6.463683924446257e-06, + "loss": 3.1481, + "step": 41655 + }, + { + "epoch": 2.830547628753907, + "grad_norm": 7.552684307098389, + "learning_rate": 6.463259274357929e-06, + "loss": 2.9561, + "step": 41660 + }, + { + "epoch": 2.8308873488245685, + "grad_norm": 7.519665241241455, + "learning_rate": 6.4628346242696025e-06, + "loss": 2.8744, + "step": 41665 + }, + { + "epoch": 2.83122706889523, + "grad_norm": 9.03995418548584, + "learning_rate": 6.462409974181275e-06, + "loss": 3.2069, + "step": 41670 + }, + { + "epoch": 2.831566788965892, + "grad_norm": 7.059176445007324, + "learning_rate": 6.461985324092947e-06, + "loss": 2.9703, + "step": 41675 + }, + { + "epoch": 2.831906509036554, + "grad_norm": 7.9171552658081055, + "learning_rate": 6.461560674004621e-06, + "loss": 3.1288, + "step": 41680 + }, + { + "epoch": 2.8322462291072155, + "grad_norm": 7.060021877288818, + "learning_rate": 6.461136023916294e-06, + "loss": 3.0933, + "step": 41685 + }, + { + "epoch": 2.8325859491778775, + "grad_norm": 6.320113182067871, + "learning_rate": 6.460711373827966e-06, + "loss": 3.1993, + "step": 41690 + }, + { + "epoch": 2.832925669248539, + "grad_norm": 7.157583236694336, + "learning_rate": 6.460286723739639e-06, + "loss": 3.096, + "step": 41695 + }, + { + "epoch": 2.833265389319201, + "grad_norm": 8.245757102966309, + "learning_rate": 6.459862073651312e-06, + "loss": 3.1067, + "step": 41700 + }, + { + "epoch": 2.833605109389863, + "grad_norm": 6.618064880371094, + "learning_rate": 6.459437423562984e-06, + "loss": 3.1596, + "step": 41705 + }, + { + "epoch": 2.8339448294605245, + "grad_norm": 6.54335880279541, + "learning_rate": 6.459012773474658e-06, + "loss": 3.2016, + "step": 41710 + }, + { + "epoch": 2.834284549531186, + "grad_norm": 7.337384223937988, + "learning_rate": 6.45858812338633e-06, + "loss": 2.9258, + "step": 41715 + }, + { + "epoch": 2.834624269601848, + "grad_norm": 6.067234992980957, + "learning_rate": 6.4581634732980025e-06, + "loss": 2.9859, + "step": 41720 + }, + { + "epoch": 2.83496398967251, + "grad_norm": 6.17203950881958, + "learning_rate": 6.457738823209676e-06, + "loss": 3.1026, + "step": 41725 + }, + { + "epoch": 2.8353037097431715, + "grad_norm": 6.843202590942383, + "learning_rate": 6.457314173121348e-06, + "loss": 3.2636, + "step": 41730 + }, + { + "epoch": 2.8356434298138335, + "grad_norm": 7.20074987411499, + "learning_rate": 6.456889523033021e-06, + "loss": 2.9505, + "step": 41735 + }, + { + "epoch": 2.835983149884495, + "grad_norm": 5.500288486480713, + "learning_rate": 6.4564648729446946e-06, + "loss": 3.0489, + "step": 41740 + }, + { + "epoch": 2.836322869955157, + "grad_norm": 7.25391149520874, + "learning_rate": 6.4560402228563665e-06, + "loss": 3.2124, + "step": 41745 + }, + { + "epoch": 2.836662590025819, + "grad_norm": 6.93170166015625, + "learning_rate": 6.455615572768039e-06, + "loss": 3.1289, + "step": 41750 + }, + { + "epoch": 2.8370023100964805, + "grad_norm": 6.061196804046631, + "learning_rate": 6.455190922679713e-06, + "loss": 2.9149, + "step": 41755 + }, + { + "epoch": 2.837342030167142, + "grad_norm": 8.412734031677246, + "learning_rate": 6.454766272591385e-06, + "loss": 3.3353, + "step": 41760 + }, + { + "epoch": 2.8376817502378042, + "grad_norm": 6.830774307250977, + "learning_rate": 6.454341622503058e-06, + "loss": 3.0556, + "step": 41765 + }, + { + "epoch": 2.838021470308466, + "grad_norm": Infinity, + "learning_rate": 6.454001902432397e-06, + "loss": 3.2232, + "step": 41770 + }, + { + "epoch": 2.8383611903791275, + "grad_norm": 6.754270076751709, + "learning_rate": 6.453577252344069e-06, + "loss": 3.0727, + "step": 41775 + }, + { + "epoch": 2.8387009104497896, + "grad_norm": 5.9432525634765625, + "learning_rate": 6.453152602255742e-06, + "loss": 3.0957, + "step": 41780 + }, + { + "epoch": 2.839040630520451, + "grad_norm": 6.107873439788818, + "learning_rate": 6.452727952167415e-06, + "loss": 3.1304, + "step": 41785 + }, + { + "epoch": 2.839380350591113, + "grad_norm": 6.298766136169434, + "learning_rate": 6.452303302079087e-06, + "loss": 3.0031, + "step": 41790 + }, + { + "epoch": 2.839720070661775, + "grad_norm": 7.436909198760986, + "learning_rate": 6.451878651990761e-06, + "loss": 3.113, + "step": 41795 + }, + { + "epoch": 2.8400597907324365, + "grad_norm": 7.078671455383301, + "learning_rate": 6.451454001902433e-06, + "loss": 2.8494, + "step": 41800 + }, + { + "epoch": 2.840399510803098, + "grad_norm": 7.375350475311279, + "learning_rate": 6.451029351814105e-06, + "loss": 3.2939, + "step": 41805 + }, + { + "epoch": 2.8407392308737602, + "grad_norm": 7.8096089363098145, + "learning_rate": 6.450604701725779e-06, + "loss": 3.3056, + "step": 41810 + }, + { + "epoch": 2.841078950944422, + "grad_norm": 6.209782123565674, + "learning_rate": 6.450180051637451e-06, + "loss": 3.2575, + "step": 41815 + }, + { + "epoch": 2.8414186710150835, + "grad_norm": 8.668060302734375, + "learning_rate": 6.449755401549124e-06, + "loss": 2.8363, + "step": 41820 + }, + { + "epoch": 2.8417583910857456, + "grad_norm": 7.088021755218506, + "learning_rate": 6.4493307514607974e-06, + "loss": 2.8863, + "step": 41825 + }, + { + "epoch": 2.842098111156407, + "grad_norm": 6.568081378936768, + "learning_rate": 6.448906101372469e-06, + "loss": 2.914, + "step": 41830 + }, + { + "epoch": 2.842437831227069, + "grad_norm": 6.339105129241943, + "learning_rate": 6.448481451284142e-06, + "loss": 2.9369, + "step": 41835 + }, + { + "epoch": 2.842777551297731, + "grad_norm": 6.479033946990967, + "learning_rate": 6.448056801195816e-06, + "loss": 3.1603, + "step": 41840 + }, + { + "epoch": 2.8431172713683925, + "grad_norm": 7.810810565948486, + "learning_rate": 6.447632151107488e-06, + "loss": 3.0824, + "step": 41845 + }, + { + "epoch": 2.843456991439054, + "grad_norm": 8.111696243286133, + "learning_rate": 6.447207501019161e-06, + "loss": 2.9912, + "step": 41850 + }, + { + "epoch": 2.8437967115097162, + "grad_norm": 7.055820941925049, + "learning_rate": 6.446782850930834e-06, + "loss": 3.0484, + "step": 41855 + }, + { + "epoch": 2.844136431580378, + "grad_norm": 6.47278356552124, + "learning_rate": 6.446358200842506e-06, + "loss": 2.6479, + "step": 41860 + }, + { + "epoch": 2.8444761516510395, + "grad_norm": 7.482726573944092, + "learning_rate": 6.445933550754179e-06, + "loss": 2.876, + "step": 41865 + }, + { + "epoch": 2.8448158717217016, + "grad_norm": 6.3558030128479, + "learning_rate": 6.445508900665852e-06, + "loss": 2.8703, + "step": 41870 + }, + { + "epoch": 2.845155591792363, + "grad_norm": 6.69830322265625, + "learning_rate": 6.445084250577525e-06, + "loss": 2.8639, + "step": 41875 + }, + { + "epoch": 2.845495311863025, + "grad_norm": 7.197018623352051, + "learning_rate": 6.444659600489197e-06, + "loss": 2.9603, + "step": 41880 + }, + { + "epoch": 2.8458350319336865, + "grad_norm": 7.038707256317139, + "learning_rate": 6.44423495040087e-06, + "loss": 3.0371, + "step": 41885 + }, + { + "epoch": 2.8461747520043486, + "grad_norm": 6.693580627441406, + "learning_rate": 6.443810300312543e-06, + "loss": 3.1846, + "step": 41890 + }, + { + "epoch": 2.84651447207501, + "grad_norm": 9.464871406555176, + "learning_rate": 6.443385650224215e-06, + "loss": 3.2987, + "step": 41895 + }, + { + "epoch": 2.846854192145672, + "grad_norm": 7.676341533660889, + "learning_rate": 6.442961000135889e-06, + "loss": 3.1015, + "step": 41900 + }, + { + "epoch": 2.847193912216334, + "grad_norm": 5.923218727111816, + "learning_rate": 6.442536350047561e-06, + "loss": 2.9684, + "step": 41905 + }, + { + "epoch": 2.8475336322869955, + "grad_norm": 6.421643257141113, + "learning_rate": 6.442111699959233e-06, + "loss": 2.9521, + "step": 41910 + }, + { + "epoch": 2.847873352357657, + "grad_norm": 6.478880405426025, + "learning_rate": 6.441687049870907e-06, + "loss": 3.1936, + "step": 41915 + }, + { + "epoch": 2.8482130724283192, + "grad_norm": 7.596074104309082, + "learning_rate": 6.44126239978258e-06, + "loss": 3.296, + "step": 41920 + }, + { + "epoch": 2.848552792498981, + "grad_norm": 7.335726261138916, + "learning_rate": 6.440837749694252e-06, + "loss": 3.1292, + "step": 41925 + }, + { + "epoch": 2.8488925125696425, + "grad_norm": 6.396792888641357, + "learning_rate": 6.440413099605925e-06, + "loss": 2.9702, + "step": 41930 + }, + { + "epoch": 2.849232232640304, + "grad_norm": 7.450913429260254, + "learning_rate": 6.439988449517598e-06, + "loss": 3.0096, + "step": 41935 + }, + { + "epoch": 2.849571952710966, + "grad_norm": 8.766474723815918, + "learning_rate": 6.43956379942927e-06, + "loss": 2.9696, + "step": 41940 + }, + { + "epoch": 2.849911672781628, + "grad_norm": 6.69104528427124, + "learning_rate": 6.439139149340944e-06, + "loss": 2.8951, + "step": 41945 + }, + { + "epoch": 2.8502513928522895, + "grad_norm": 6.649441242218018, + "learning_rate": 6.438714499252617e-06, + "loss": 2.8644, + "step": 41950 + }, + { + "epoch": 2.8505911129229515, + "grad_norm": 5.956593036651611, + "learning_rate": 6.4382898491642886e-06, + "loss": 3.0715, + "step": 41955 + }, + { + "epoch": 2.850930832993613, + "grad_norm": 6.676300525665283, + "learning_rate": 6.437865199075962e-06, + "loss": 3.1218, + "step": 41960 + }, + { + "epoch": 2.851270553064275, + "grad_norm": 7.842119216918945, + "learning_rate": 6.437440548987635e-06, + "loss": 2.9015, + "step": 41965 + }, + { + "epoch": 2.851610273134937, + "grad_norm": 4.851983547210693, + "learning_rate": 6.437015898899307e-06, + "loss": 3.0678, + "step": 41970 + }, + { + "epoch": 2.8519499932055985, + "grad_norm": 7.713118076324463, + "learning_rate": 6.436591248810981e-06, + "loss": 3.2428, + "step": 41975 + }, + { + "epoch": 2.85228971327626, + "grad_norm": 6.25697135925293, + "learning_rate": 6.4361665987226526e-06, + "loss": 2.8048, + "step": 41980 + }, + { + "epoch": 2.852629433346922, + "grad_norm": 8.517234802246094, + "learning_rate": 6.435741948634325e-06, + "loss": 3.1333, + "step": 41985 + }, + { + "epoch": 2.852969153417584, + "grad_norm": 11.707669258117676, + "learning_rate": 6.435317298545999e-06, + "loss": 3.0565, + "step": 41990 + }, + { + "epoch": 2.8533088734882455, + "grad_norm": 7.930828094482422, + "learning_rate": 6.434892648457671e-06, + "loss": 3.1178, + "step": 41995 + }, + { + "epoch": 2.8536485935589075, + "grad_norm": 5.936435222625732, + "learning_rate": 6.434467998369344e-06, + "loss": 3.2945, + "step": 42000 + }, + { + "epoch": 2.853988313629569, + "grad_norm": 5.988119602203369, + "learning_rate": 6.434043348281017e-06, + "loss": 3.1486, + "step": 42005 + }, + { + "epoch": 2.854328033700231, + "grad_norm": 7.902193069458008, + "learning_rate": 6.433618698192689e-06, + "loss": 2.8539, + "step": 42010 + }, + { + "epoch": 2.854667753770893, + "grad_norm": 7.669791221618652, + "learning_rate": 6.433194048104362e-06, + "loss": 3.1039, + "step": 42015 + }, + { + "epoch": 2.8550074738415545, + "grad_norm": 7.82558012008667, + "learning_rate": 6.432769398016036e-06, + "loss": 3.2186, + "step": 42020 + }, + { + "epoch": 2.855347193912216, + "grad_norm": 8.2056884765625, + "learning_rate": 6.432344747927708e-06, + "loss": 3.3025, + "step": 42025 + }, + { + "epoch": 2.855686913982878, + "grad_norm": 5.621081829071045, + "learning_rate": 6.4319200978393814e-06, + "loss": 2.9908, + "step": 42030 + }, + { + "epoch": 2.85602663405354, + "grad_norm": 6.720729351043701, + "learning_rate": 6.431495447751054e-06, + "loss": 2.6611, + "step": 42035 + }, + { + "epoch": 2.8563663541242015, + "grad_norm": 8.958150863647461, + "learning_rate": 6.431070797662726e-06, + "loss": 3.1369, + "step": 42040 + }, + { + "epoch": 2.8567060741948636, + "grad_norm": 7.100444316864014, + "learning_rate": 6.4306461475744e-06, + "loss": 3.2111, + "step": 42045 + }, + { + "epoch": 2.857045794265525, + "grad_norm": 6.64046573638916, + "learning_rate": 6.430221497486072e-06, + "loss": 2.9904, + "step": 42050 + }, + { + "epoch": 2.857385514336187, + "grad_norm": 8.436575889587402, + "learning_rate": 6.429796847397745e-06, + "loss": 3.2596, + "step": 42055 + }, + { + "epoch": 2.857725234406849, + "grad_norm": 6.527285099029541, + "learning_rate": 6.429372197309418e-06, + "loss": 2.9731, + "step": 42060 + }, + { + "epoch": 2.8580649544775105, + "grad_norm": 5.284181118011475, + "learning_rate": 6.42894754722109e-06, + "loss": 3.0357, + "step": 42065 + }, + { + "epoch": 2.858404674548172, + "grad_norm": 6.656822204589844, + "learning_rate": 6.428522897132763e-06, + "loss": 3.0414, + "step": 42070 + }, + { + "epoch": 2.8587443946188342, + "grad_norm": 7.077617645263672, + "learning_rate": 6.428098247044437e-06, + "loss": 2.9603, + "step": 42075 + }, + { + "epoch": 2.859084114689496, + "grad_norm": 6.874936103820801, + "learning_rate": 6.427673596956109e-06, + "loss": 2.926, + "step": 42080 + }, + { + "epoch": 2.8594238347601575, + "grad_norm": 5.839734077453613, + "learning_rate": 6.427248946867781e-06, + "loss": 2.91, + "step": 42085 + }, + { + "epoch": 2.8597635548308196, + "grad_norm": 6.037518501281738, + "learning_rate": 6.426824296779455e-06, + "loss": 3.1594, + "step": 42090 + }, + { + "epoch": 2.860103274901481, + "grad_norm": 6.804446220397949, + "learning_rate": 6.426399646691127e-06, + "loss": 2.9941, + "step": 42095 + }, + { + "epoch": 2.860442994972143, + "grad_norm": 7.9674973487854, + "learning_rate": 6.4259749966028e-06, + "loss": 3.0943, + "step": 42100 + }, + { + "epoch": 2.860782715042805, + "grad_norm": 5.338374614715576, + "learning_rate": 6.4255503465144734e-06, + "loss": 2.7909, + "step": 42105 + }, + { + "epoch": 2.8611224351134665, + "grad_norm": 6.1181111335754395, + "learning_rate": 6.425125696426145e-06, + "loss": 3.2389, + "step": 42110 + }, + { + "epoch": 2.861462155184128, + "grad_norm": 7.393165111541748, + "learning_rate": 6.424701046337818e-06, + "loss": 2.9336, + "step": 42115 + }, + { + "epoch": 2.8618018752547902, + "grad_norm": 6.989060878753662, + "learning_rate": 6.424276396249491e-06, + "loss": 3.0779, + "step": 42120 + }, + { + "epoch": 2.862141595325452, + "grad_norm": 8.046748161315918, + "learning_rate": 6.423851746161164e-06, + "loss": 3.1277, + "step": 42125 + }, + { + "epoch": 2.8624813153961135, + "grad_norm": 7.463590621948242, + "learning_rate": 6.423427096072837e-06, + "loss": 3.1433, + "step": 42130 + }, + { + "epoch": 2.8628210354667756, + "grad_norm": 8.253482818603516, + "learning_rate": 6.423002445984509e-06, + "loss": 3.0466, + "step": 42135 + }, + { + "epoch": 2.863160755537437, + "grad_norm": 5.908024787902832, + "learning_rate": 6.422577795896182e-06, + "loss": 2.6966, + "step": 42140 + }, + { + "epoch": 2.863500475608099, + "grad_norm": 7.449804782867432, + "learning_rate": 6.422153145807854e-06, + "loss": 2.9401, + "step": 42145 + }, + { + "epoch": 2.863840195678761, + "grad_norm": 6.999392509460449, + "learning_rate": 6.421728495719528e-06, + "loss": 2.9415, + "step": 42150 + }, + { + "epoch": 2.8641799157494225, + "grad_norm": 6.9557929039001465, + "learning_rate": 6.421303845631201e-06, + "loss": 3.149, + "step": 42155 + }, + { + "epoch": 2.864519635820084, + "grad_norm": 5.828924179077148, + "learning_rate": 6.4208791955428726e-06, + "loss": 2.8894, + "step": 42160 + }, + { + "epoch": 2.8648593558907463, + "grad_norm": 6.497562885284424, + "learning_rate": 6.420454545454546e-06, + "loss": 2.8084, + "step": 42165 + }, + { + "epoch": 2.865199075961408, + "grad_norm": 7.807274341583252, + "learning_rate": 6.420029895366219e-06, + "loss": 3.0637, + "step": 42170 + }, + { + "epoch": 2.8655387960320695, + "grad_norm": 5.3565192222595215, + "learning_rate": 6.419605245277891e-06, + "loss": 2.9677, + "step": 42175 + }, + { + "epoch": 2.8658785161027316, + "grad_norm": 6.827608585357666, + "learning_rate": 6.419180595189565e-06, + "loss": 3.2916, + "step": 42180 + }, + { + "epoch": 2.866218236173393, + "grad_norm": 7.5037031173706055, + "learning_rate": 6.418755945101237e-06, + "loss": 2.827, + "step": 42185 + }, + { + "epoch": 2.866557956244055, + "grad_norm": 5.379166603088379, + "learning_rate": 6.418331295012909e-06, + "loss": 3.0058, + "step": 42190 + }, + { + "epoch": 2.866897676314717, + "grad_norm": 7.203950881958008, + "learning_rate": 6.417906644924583e-06, + "loss": 3.0806, + "step": 42195 + }, + { + "epoch": 2.8672373963853786, + "grad_norm": 7.36407470703125, + "learning_rate": 6.417481994836256e-06, + "loss": 3.0834, + "step": 42200 + }, + { + "epoch": 2.86757711645604, + "grad_norm": 7.231139659881592, + "learning_rate": 6.417057344747928e-06, + "loss": 3.0839, + "step": 42205 + }, + { + "epoch": 2.8679168365267023, + "grad_norm": 6.171706676483154, + "learning_rate": 6.416632694659601e-06, + "loss": 3.0077, + "step": 42210 + }, + { + "epoch": 2.868256556597364, + "grad_norm": 5.216926097869873, + "learning_rate": 6.416208044571273e-06, + "loss": 3.0216, + "step": 42215 + }, + { + "epoch": 2.8685962766680255, + "grad_norm": 6.26591682434082, + "learning_rate": 6.415783394482946e-06, + "loss": 2.8259, + "step": 42220 + }, + { + "epoch": 2.868935996738687, + "grad_norm": 6.902961730957031, + "learning_rate": 6.41535874439462e-06, + "loss": 3.1709, + "step": 42225 + }, + { + "epoch": 2.8692757168093492, + "grad_norm": 8.669389724731445, + "learning_rate": 6.414934094306292e-06, + "loss": 3.058, + "step": 42230 + }, + { + "epoch": 2.869615436880011, + "grad_norm": 6.213407516479492, + "learning_rate": 6.4145094442179646e-06, + "loss": 3.0673, + "step": 42235 + }, + { + "epoch": 2.8699551569506725, + "grad_norm": 6.697137832641602, + "learning_rate": 6.414084794129638e-06, + "loss": 3.1623, + "step": 42240 + }, + { + "epoch": 2.8702948770213346, + "grad_norm": 8.221456527709961, + "learning_rate": 6.41366014404131e-06, + "loss": 3.1346, + "step": 42245 + }, + { + "epoch": 2.870634597091996, + "grad_norm": 6.4219865798950195, + "learning_rate": 6.413235493952983e-06, + "loss": 2.9279, + "step": 42250 + }, + { + "epoch": 2.870974317162658, + "grad_norm": 7.691821575164795, + "learning_rate": 6.412810843864657e-06, + "loss": 3.1193, + "step": 42255 + }, + { + "epoch": 2.87131403723332, + "grad_norm": 6.19867467880249, + "learning_rate": 6.4123861937763286e-06, + "loss": 2.9753, + "step": 42260 + }, + { + "epoch": 2.8716537573039815, + "grad_norm": 7.026841163635254, + "learning_rate": 6.411961543688001e-06, + "loss": 2.9679, + "step": 42265 + }, + { + "epoch": 2.871993477374643, + "grad_norm": 7.994307994842529, + "learning_rate": 6.411536893599675e-06, + "loss": 2.8998, + "step": 42270 + }, + { + "epoch": 2.872333197445305, + "grad_norm": 7.623109817504883, + "learning_rate": 6.411112243511347e-06, + "loss": 3.2083, + "step": 42275 + }, + { + "epoch": 2.872672917515967, + "grad_norm": 6.000772953033447, + "learning_rate": 6.41068759342302e-06, + "loss": 3.0722, + "step": 42280 + }, + { + "epoch": 2.8730126375866285, + "grad_norm": 9.100970268249512, + "learning_rate": 6.410262943334693e-06, + "loss": 2.9852, + "step": 42285 + }, + { + "epoch": 2.87335235765729, + "grad_norm": 6.766416072845459, + "learning_rate": 6.409838293246365e-06, + "loss": 2.9684, + "step": 42290 + }, + { + "epoch": 2.873692077727952, + "grad_norm": 7.017146587371826, + "learning_rate": 6.409413643158038e-06, + "loss": 3.0267, + "step": 42295 + }, + { + "epoch": 2.874031797798614, + "grad_norm": 6.189488410949707, + "learning_rate": 6.408988993069711e-06, + "loss": 3.2322, + "step": 42300 + }, + { + "epoch": 2.8743715178692755, + "grad_norm": 7.1434645652771, + "learning_rate": 6.408564342981384e-06, + "loss": 3.0694, + "step": 42305 + }, + { + "epoch": 2.8747112379399375, + "grad_norm": 7.881141662597656, + "learning_rate": 6.408139692893056e-06, + "loss": 2.9796, + "step": 42310 + }, + { + "epoch": 2.875050958010599, + "grad_norm": 7.050014019012451, + "learning_rate": 6.407715042804729e-06, + "loss": 3.1224, + "step": 42315 + }, + { + "epoch": 2.875390678081261, + "grad_norm": 8.49600887298584, + "learning_rate": 6.407290392716402e-06, + "loss": 3.1302, + "step": 42320 + }, + { + "epoch": 2.875730398151923, + "grad_norm": 6.264461517333984, + "learning_rate": 6.406865742628074e-06, + "loss": 3.0349, + "step": 42325 + }, + { + "epoch": 2.8760701182225845, + "grad_norm": 6.735751152038574, + "learning_rate": 6.406441092539748e-06, + "loss": 3.0883, + "step": 42330 + }, + { + "epoch": 2.876409838293246, + "grad_norm": 7.6813764572143555, + "learning_rate": 6.406016442451421e-06, + "loss": 2.9443, + "step": 42335 + }, + { + "epoch": 2.876749558363908, + "grad_norm": 6.077528476715088, + "learning_rate": 6.4055917923630925e-06, + "loss": 3.1066, + "step": 42340 + }, + { + "epoch": 2.87708927843457, + "grad_norm": 8.636297225952148, + "learning_rate": 6.405167142274766e-06, + "loss": 3.0012, + "step": 42345 + }, + { + "epoch": 2.8774289985052315, + "grad_norm": 6.4808149337768555, + "learning_rate": 6.404742492186439e-06, + "loss": 3.2376, + "step": 42350 + }, + { + "epoch": 2.8777687185758936, + "grad_norm": 6.527832508087158, + "learning_rate": 6.404317842098111e-06, + "loss": 3.0846, + "step": 42355 + }, + { + "epoch": 2.878108438646555, + "grad_norm": 5.871425628662109, + "learning_rate": 6.403893192009785e-06, + "loss": 3.0443, + "step": 42360 + }, + { + "epoch": 2.878448158717217, + "grad_norm": 6.077014923095703, + "learning_rate": 6.403468541921457e-06, + "loss": 2.9659, + "step": 42365 + }, + { + "epoch": 2.878787878787879, + "grad_norm": 8.348764419555664, + "learning_rate": 6.40304389183313e-06, + "loss": 3.2306, + "step": 42370 + }, + { + "epoch": 2.8791275988585405, + "grad_norm": 6.677791118621826, + "learning_rate": 6.402619241744803e-06, + "loss": 3.2571, + "step": 42375 + }, + { + "epoch": 2.879467318929202, + "grad_norm": 7.642316818237305, + "learning_rate": 6.402194591656476e-06, + "loss": 3.1119, + "step": 42380 + }, + { + "epoch": 2.8798070389998642, + "grad_norm": 5.9624528884887695, + "learning_rate": 6.401769941568149e-06, + "loss": 2.8937, + "step": 42385 + }, + { + "epoch": 2.880146759070526, + "grad_norm": 6.366430759429932, + "learning_rate": 6.401345291479821e-06, + "loss": 2.8966, + "step": 42390 + }, + { + "epoch": 2.8804864791411875, + "grad_norm": 7.068002700805664, + "learning_rate": 6.400920641391493e-06, + "loss": 3.0765, + "step": 42395 + }, + { + "epoch": 2.8808261992118496, + "grad_norm": 8.751716613769531, + "learning_rate": 6.400495991303167e-06, + "loss": 3.2723, + "step": 42400 + }, + { + "epoch": 2.881165919282511, + "grad_norm": 6.499773979187012, + "learning_rate": 6.40007134121484e-06, + "loss": 2.8298, + "step": 42405 + }, + { + "epoch": 2.881505639353173, + "grad_norm": 7.422586441040039, + "learning_rate": 6.399646691126512e-06, + "loss": 3.0814, + "step": 42410 + }, + { + "epoch": 2.881845359423835, + "grad_norm": 6.434075832366943, + "learning_rate": 6.399222041038185e-06, + "loss": 3.1724, + "step": 42415 + }, + { + "epoch": 2.8821850794944965, + "grad_norm": 9.3477783203125, + "learning_rate": 6.398797390949858e-06, + "loss": 3.3307, + "step": 42420 + }, + { + "epoch": 2.882524799565158, + "grad_norm": 7.639052867889404, + "learning_rate": 6.39837274086153e-06, + "loss": 2.8138, + "step": 42425 + }, + { + "epoch": 2.8828645196358202, + "grad_norm": 8.60008716583252, + "learning_rate": 6.397948090773204e-06, + "loss": 3.1319, + "step": 42430 + }, + { + "epoch": 2.883204239706482, + "grad_norm": 5.684903144836426, + "learning_rate": 6.397523440684877e-06, + "loss": 3.0687, + "step": 42435 + }, + { + "epoch": 2.8835439597771435, + "grad_norm": 6.148285388946533, + "learning_rate": 6.3970987905965485e-06, + "loss": 2.9586, + "step": 42440 + }, + { + "epoch": 2.8838836798478056, + "grad_norm": 6.828604698181152, + "learning_rate": 6.396674140508222e-06, + "loss": 3.3432, + "step": 42445 + }, + { + "epoch": 2.884223399918467, + "grad_norm": 8.354683876037598, + "learning_rate": 6.396249490419895e-06, + "loss": 2.9248, + "step": 42450 + }, + { + "epoch": 2.884563119989129, + "grad_norm": 7.447123050689697, + "learning_rate": 6.395824840331567e-06, + "loss": 3.1519, + "step": 42455 + }, + { + "epoch": 2.884902840059791, + "grad_norm": 6.0916242599487305, + "learning_rate": 6.395400190243241e-06, + "loss": 3.1156, + "step": 42460 + }, + { + "epoch": 2.8852425601304525, + "grad_norm": 5.2631096839904785, + "learning_rate": 6.3949755401549126e-06, + "loss": 3.0256, + "step": 42465 + }, + { + "epoch": 2.885582280201114, + "grad_norm": 8.124747276306152, + "learning_rate": 6.394550890066585e-06, + "loss": 2.9683, + "step": 42470 + }, + { + "epoch": 2.8859220002717763, + "grad_norm": 7.077648162841797, + "learning_rate": 6.394126239978259e-06, + "loss": 3.103, + "step": 42475 + }, + { + "epoch": 2.886261720342438, + "grad_norm": 6.426178455352783, + "learning_rate": 6.393701589889931e-06, + "loss": 3.14, + "step": 42480 + }, + { + "epoch": 2.8866014404130995, + "grad_norm": 6.729709148406982, + "learning_rate": 6.393276939801604e-06, + "loss": 3.0683, + "step": 42485 + }, + { + "epoch": 2.8869411604837616, + "grad_norm": 5.9411234855651855, + "learning_rate": 6.392852289713277e-06, + "loss": 3.0143, + "step": 42490 + }, + { + "epoch": 2.8872808805544232, + "grad_norm": 6.451745510101318, + "learning_rate": 6.392427639624949e-06, + "loss": 3.0046, + "step": 42495 + }, + { + "epoch": 2.887620600625085, + "grad_norm": 6.7880635261535645, + "learning_rate": 6.392002989536622e-06, + "loss": 2.9563, + "step": 42500 + }, + { + "epoch": 2.887960320695747, + "grad_norm": 6.371416091918945, + "learning_rate": 6.391578339448296e-06, + "loss": 3.0467, + "step": 42505 + }, + { + "epoch": 2.8883000407664086, + "grad_norm": 10.834404945373535, + "learning_rate": 6.391153689359968e-06, + "loss": 3.0905, + "step": 42510 + }, + { + "epoch": 2.88863976083707, + "grad_norm": 8.162240028381348, + "learning_rate": 6.3907290392716406e-06, + "loss": 2.841, + "step": 42515 + }, + { + "epoch": 2.8889794809077323, + "grad_norm": 8.21748161315918, + "learning_rate": 6.390304389183314e-06, + "loss": 3.2143, + "step": 42520 + }, + { + "epoch": 2.889319200978394, + "grad_norm": 7.327398777008057, + "learning_rate": 6.389879739094986e-06, + "loss": 2.9709, + "step": 42525 + }, + { + "epoch": 2.8896589210490555, + "grad_norm": 7.2356462478637695, + "learning_rate": 6.389455089006659e-06, + "loss": 3.152, + "step": 42530 + }, + { + "epoch": 2.8899986411197176, + "grad_norm": 6.977593898773193, + "learning_rate": 6.389030438918333e-06, + "loss": 2.8625, + "step": 42535 + }, + { + "epoch": 2.8903383611903792, + "grad_norm": 9.178464889526367, + "learning_rate": 6.3886057888300046e-06, + "loss": 3.2323, + "step": 42540 + }, + { + "epoch": 2.890678081261041, + "grad_norm": 6.86057186126709, + "learning_rate": 6.388181138741677e-06, + "loss": 2.9436, + "step": 42545 + }, + { + "epoch": 2.891017801331703, + "grad_norm": 9.078153610229492, + "learning_rate": 6.38775648865335e-06, + "loss": 3.0022, + "step": 42550 + }, + { + "epoch": 2.8913575214023646, + "grad_norm": 7.477174758911133, + "learning_rate": 6.387331838565023e-06, + "loss": 2.9491, + "step": 42555 + }, + { + "epoch": 2.891697241473026, + "grad_norm": 6.930423259735107, + "learning_rate": 6.386907188476695e-06, + "loss": 3.1163, + "step": 42560 + }, + { + "epoch": 2.8920369615436883, + "grad_norm": 8.487268447875977, + "learning_rate": 6.3864825383883686e-06, + "loss": 3.2427, + "step": 42565 + }, + { + "epoch": 2.89237668161435, + "grad_norm": 6.39663553237915, + "learning_rate": 6.386057888300041e-06, + "loss": 3.0728, + "step": 42570 + }, + { + "epoch": 2.8927164016850115, + "grad_norm": 5.828339576721191, + "learning_rate": 6.385633238211713e-06, + "loss": 2.9189, + "step": 42575 + }, + { + "epoch": 2.893056121755673, + "grad_norm": 6.529816627502441, + "learning_rate": 6.385208588123387e-06, + "loss": 2.8947, + "step": 42580 + }, + { + "epoch": 2.8933958418263352, + "grad_norm": 5.6855597496032715, + "learning_rate": 6.38478393803506e-06, + "loss": 3.0668, + "step": 42585 + }, + { + "epoch": 2.893735561896997, + "grad_norm": 8.850361824035645, + "learning_rate": 6.384359287946732e-06, + "loss": 2.9816, + "step": 42590 + }, + { + "epoch": 2.8940752819676585, + "grad_norm": 7.307889938354492, + "learning_rate": 6.383934637858405e-06, + "loss": 3.078, + "step": 42595 + }, + { + "epoch": 2.8944150020383206, + "grad_norm": 7.999233722686768, + "learning_rate": 6.383509987770078e-06, + "loss": 3.1037, + "step": 42600 + }, + { + "epoch": 2.894754722108982, + "grad_norm": 6.785325050354004, + "learning_rate": 6.38308533768175e-06, + "loss": 2.7835, + "step": 42605 + }, + { + "epoch": 2.895094442179644, + "grad_norm": 7.505251884460449, + "learning_rate": 6.382660687593424e-06, + "loss": 3.0675, + "step": 42610 + }, + { + "epoch": 2.8954341622503055, + "grad_norm": 6.840035438537598, + "learning_rate": 6.382236037505097e-06, + "loss": 2.9083, + "step": 42615 + }, + { + "epoch": 2.8957738823209676, + "grad_norm": 5.659488201141357, + "learning_rate": 6.3818113874167685e-06, + "loss": 3.0789, + "step": 42620 + }, + { + "epoch": 2.896113602391629, + "grad_norm": 7.673923015594482, + "learning_rate": 6.381386737328442e-06, + "loss": 2.9622, + "step": 42625 + }, + { + "epoch": 2.896453322462291, + "grad_norm": 8.00847339630127, + "learning_rate": 6.380962087240115e-06, + "loss": 2.9542, + "step": 42630 + }, + { + "epoch": 2.896793042532953, + "grad_norm": 7.1535491943359375, + "learning_rate": 6.380537437151787e-06, + "loss": 3.1111, + "step": 42635 + }, + { + "epoch": 2.8971327626036145, + "grad_norm": 8.712152481079102, + "learning_rate": 6.380112787063461e-06, + "loss": 3.027, + "step": 42640 + }, + { + "epoch": 2.897472482674276, + "grad_norm": 7.070759296417236, + "learning_rate": 6.3796881369751325e-06, + "loss": 3.3099, + "step": 42645 + }, + { + "epoch": 2.8978122027449382, + "grad_norm": 4.918685436248779, + "learning_rate": 6.379263486886805e-06, + "loss": 2.9776, + "step": 42650 + }, + { + "epoch": 2.8981519228156, + "grad_norm": 6.08206033706665, + "learning_rate": 6.378838836798479e-06, + "loss": 3.0153, + "step": 42655 + }, + { + "epoch": 2.8984916428862615, + "grad_norm": 8.427213668823242, + "learning_rate": 6.378414186710151e-06, + "loss": 3.0613, + "step": 42660 + }, + { + "epoch": 2.8988313629569236, + "grad_norm": 6.850191116333008, + "learning_rate": 6.377989536621824e-06, + "loss": 3.2302, + "step": 42665 + }, + { + "epoch": 2.899171083027585, + "grad_norm": 7.930974006652832, + "learning_rate": 6.377564886533497e-06, + "loss": 2.9355, + "step": 42670 + }, + { + "epoch": 2.899510803098247, + "grad_norm": 6.627294063568115, + "learning_rate": 6.377140236445169e-06, + "loss": 3.0616, + "step": 42675 + }, + { + "epoch": 2.899850523168909, + "grad_norm": 6.823770046234131, + "learning_rate": 6.376715586356842e-06, + "loss": 2.9696, + "step": 42680 + }, + { + "epoch": 2.9001902432395705, + "grad_norm": 7.993913173675537, + "learning_rate": 6.376290936268516e-06, + "loss": 3.0787, + "step": 42685 + }, + { + "epoch": 2.900529963310232, + "grad_norm": 8.157132148742676, + "learning_rate": 6.375866286180188e-06, + "loss": 3.0136, + "step": 42690 + }, + { + "epoch": 2.9008696833808942, + "grad_norm": 6.671135902404785, + "learning_rate": 6.3754416360918605e-06, + "loss": 3.1311, + "step": 42695 + }, + { + "epoch": 2.901209403451556, + "grad_norm": 6.139617443084717, + "learning_rate": 6.375016986003534e-06, + "loss": 2.9826, + "step": 42700 + }, + { + "epoch": 2.9015491235222175, + "grad_norm": 8.21922779083252, + "learning_rate": 6.374592335915206e-06, + "loss": 3.3261, + "step": 42705 + }, + { + "epoch": 2.9018888435928796, + "grad_norm": 7.638373851776123, + "learning_rate": 6.37416768582688e-06, + "loss": 3.0728, + "step": 42710 + }, + { + "epoch": 2.902228563663541, + "grad_norm": 7.2572221755981445, + "learning_rate": 6.373743035738552e-06, + "loss": 3.0215, + "step": 42715 + }, + { + "epoch": 2.902568283734203, + "grad_norm": 6.7276201248168945, + "learning_rate": 6.3733183856502245e-06, + "loss": 2.9609, + "step": 42720 + }, + { + "epoch": 2.902908003804865, + "grad_norm": 7.244295597076416, + "learning_rate": 6.372893735561898e-06, + "loss": 3.1333, + "step": 42725 + }, + { + "epoch": 2.9032477238755265, + "grad_norm": 7.324938774108887, + "learning_rate": 6.37246908547357e-06, + "loss": 3.0865, + "step": 42730 + }, + { + "epoch": 2.903587443946188, + "grad_norm": 5.899125099182129, + "learning_rate": 6.372044435385243e-06, + "loss": 2.9903, + "step": 42735 + }, + { + "epoch": 2.9039271640168502, + "grad_norm": 8.286092758178711, + "learning_rate": 6.371619785296917e-06, + "loss": 3.0926, + "step": 42740 + }, + { + "epoch": 2.904266884087512, + "grad_norm": 7.5006842613220215, + "learning_rate": 6.3711951352085885e-06, + "loss": 2.9523, + "step": 42745 + }, + { + "epoch": 2.9046066041581735, + "grad_norm": 6.167902946472168, + "learning_rate": 6.370770485120261e-06, + "loss": 2.9119, + "step": 42750 + }, + { + "epoch": 2.9049463242288356, + "grad_norm": 6.906810760498047, + "learning_rate": 6.370345835031935e-06, + "loss": 2.9558, + "step": 42755 + }, + { + "epoch": 2.905286044299497, + "grad_norm": 7.708724021911621, + "learning_rate": 6.369921184943607e-06, + "loss": 3.0889, + "step": 42760 + }, + { + "epoch": 2.905625764370159, + "grad_norm": 7.330000877380371, + "learning_rate": 6.36949653485528e-06, + "loss": 3.0387, + "step": 42765 + }, + { + "epoch": 2.905965484440821, + "grad_norm": 4.957417011260986, + "learning_rate": 6.369071884766953e-06, + "loss": 3.0332, + "step": 42770 + }, + { + "epoch": 2.9063052045114826, + "grad_norm": 6.153911113739014, + "learning_rate": 6.368647234678625e-06, + "loss": 3.2501, + "step": 42775 + }, + { + "epoch": 2.906644924582144, + "grad_norm": 8.11436653137207, + "learning_rate": 6.368222584590298e-06, + "loss": 3.082, + "step": 42780 + }, + { + "epoch": 2.9069846446528063, + "grad_norm": 8.076065063476562, + "learning_rate": 6.367797934501971e-06, + "loss": 2.803, + "step": 42785 + }, + { + "epoch": 2.907324364723468, + "grad_norm": 7.44881010055542, + "learning_rate": 6.367373284413644e-06, + "loss": 3.1116, + "step": 42790 + }, + { + "epoch": 2.9076640847941295, + "grad_norm": 6.549025535583496, + "learning_rate": 6.3669486343253166e-06, + "loss": 2.9038, + "step": 42795 + }, + { + "epoch": 2.9080038048647916, + "grad_norm": 6.385436058044434, + "learning_rate": 6.366523984236989e-06, + "loss": 3.1046, + "step": 42800 + }, + { + "epoch": 2.9083435249354532, + "grad_norm": 10.424908638000488, + "learning_rate": 6.366099334148662e-06, + "loss": 2.9668, + "step": 42805 + }, + { + "epoch": 2.908683245006115, + "grad_norm": 6.913161277770996, + "learning_rate": 6.365674684060334e-06, + "loss": 3.2631, + "step": 42810 + }, + { + "epoch": 2.909022965076777, + "grad_norm": 7.691544532775879, + "learning_rate": 6.365250033972008e-06, + "loss": 3.1667, + "step": 42815 + }, + { + "epoch": 2.9093626851474386, + "grad_norm": 6.139925003051758, + "learning_rate": 6.3648253838836806e-06, + "loss": 3.0387, + "step": 42820 + }, + { + "epoch": 2.9097024052181, + "grad_norm": 6.606639385223389, + "learning_rate": 6.3644007337953525e-06, + "loss": 3.0581, + "step": 42825 + }, + { + "epoch": 2.9100421252887623, + "grad_norm": 5.729974269866943, + "learning_rate": 6.363976083707026e-06, + "loss": 2.9386, + "step": 42830 + }, + { + "epoch": 2.910381845359424, + "grad_norm": 7.2373552322387695, + "learning_rate": 6.363551433618699e-06, + "loss": 3.2197, + "step": 42835 + }, + { + "epoch": 2.9107215654300855, + "grad_norm": 5.926656723022461, + "learning_rate": 6.363126783530371e-06, + "loss": 3.1408, + "step": 42840 + }, + { + "epoch": 2.9110612855007476, + "grad_norm": 5.413050174713135, + "learning_rate": 6.3627021334420446e-06, + "loss": 2.9661, + "step": 42845 + }, + { + "epoch": 2.9114010055714092, + "grad_norm": 6.490556240081787, + "learning_rate": 6.362277483353717e-06, + "loss": 3.0107, + "step": 42850 + }, + { + "epoch": 2.911740725642071, + "grad_norm": 9.00305461883545, + "learning_rate": 6.361852833265389e-06, + "loss": 2.9993, + "step": 42855 + }, + { + "epoch": 2.912080445712733, + "grad_norm": 5.811800479888916, + "learning_rate": 6.361428183177063e-06, + "loss": 3.0489, + "step": 42860 + }, + { + "epoch": 2.9124201657833946, + "grad_norm": 7.875122547149658, + "learning_rate": 6.361003533088736e-06, + "loss": 3.1959, + "step": 42865 + }, + { + "epoch": 2.912759885854056, + "grad_norm": 7.32477331161499, + "learning_rate": 6.360578883000408e-06, + "loss": 3.134, + "step": 42870 + }, + { + "epoch": 2.9130996059247183, + "grad_norm": 6.848030090332031, + "learning_rate": 6.360154232912081e-06, + "loss": 3.0332, + "step": 42875 + }, + { + "epoch": 2.91343932599538, + "grad_norm": 5.914986610412598, + "learning_rate": 6.359729582823753e-06, + "loss": 2.9513, + "step": 42880 + }, + { + "epoch": 2.9137790460660415, + "grad_norm": 6.513923168182373, + "learning_rate": 6.359304932735426e-06, + "loss": 3.3014, + "step": 42885 + }, + { + "epoch": 2.9141187661367036, + "grad_norm": 6.821894645690918, + "learning_rate": 6.3588802826471e-06, + "loss": 2.9627, + "step": 42890 + }, + { + "epoch": 2.9144584862073653, + "grad_norm": 6.610276699066162, + "learning_rate": 6.358455632558772e-06, + "loss": 3.1028, + "step": 42895 + }, + { + "epoch": 2.914798206278027, + "grad_norm": 7.041287422180176, + "learning_rate": 6.3580309824704445e-06, + "loss": 2.8893, + "step": 42900 + }, + { + "epoch": 2.915137926348689, + "grad_norm": 5.832179069519043, + "learning_rate": 6.357606332382118e-06, + "loss": 3.1408, + "step": 42905 + }, + { + "epoch": 2.9154776464193506, + "grad_norm": 7.748973846435547, + "learning_rate": 6.35718168229379e-06, + "loss": 3.0067, + "step": 42910 + }, + { + "epoch": 2.915817366490012, + "grad_norm": 6.186592102050781, + "learning_rate": 6.356757032205463e-06, + "loss": 3.1028, + "step": 42915 + }, + { + "epoch": 2.916157086560674, + "grad_norm": 9.367181777954102, + "learning_rate": 6.356332382117137e-06, + "loss": 3.0538, + "step": 42920 + }, + { + "epoch": 2.916496806631336, + "grad_norm": 7.071614742279053, + "learning_rate": 6.3559077320288085e-06, + "loss": 3.015, + "step": 42925 + }, + { + "epoch": 2.9168365267019976, + "grad_norm": 6.64204740524292, + "learning_rate": 6.355483081940481e-06, + "loss": 2.9595, + "step": 42930 + }, + { + "epoch": 2.917176246772659, + "grad_norm": 7.9431633949279785, + "learning_rate": 6.355058431852155e-06, + "loss": 3.1897, + "step": 42935 + }, + { + "epoch": 2.9175159668433213, + "grad_norm": 5.51362943649292, + "learning_rate": 6.354633781763827e-06, + "loss": 3.2729, + "step": 42940 + }, + { + "epoch": 2.917855686913983, + "grad_norm": 5.8196821212768555, + "learning_rate": 6.3542091316755e-06, + "loss": 3.1786, + "step": 42945 + }, + { + "epoch": 2.9181954069846445, + "grad_norm": 6.499339580535889, + "learning_rate": 6.353784481587173e-06, + "loss": 2.9555, + "step": 42950 + }, + { + "epoch": 2.918535127055306, + "grad_norm": 8.120046615600586, + "learning_rate": 6.353359831498845e-06, + "loss": 2.9314, + "step": 42955 + }, + { + "epoch": 2.9188748471259682, + "grad_norm": 6.749604225158691, + "learning_rate": 6.352935181410518e-06, + "loss": 3.0242, + "step": 42960 + }, + { + "epoch": 2.91921456719663, + "grad_norm": 5.779324531555176, + "learning_rate": 6.352510531322191e-06, + "loss": 3.1381, + "step": 42965 + }, + { + "epoch": 2.9195542872672915, + "grad_norm": 8.458962440490723, + "learning_rate": 6.352085881233864e-06, + "loss": 3.2038, + "step": 42970 + }, + { + "epoch": 2.9198940073379536, + "grad_norm": 5.830483436584473, + "learning_rate": 6.3516612311455365e-06, + "loss": 2.9725, + "step": 42975 + }, + { + "epoch": 2.920233727408615, + "grad_norm": 6.903151035308838, + "learning_rate": 6.351236581057209e-06, + "loss": 3.1658, + "step": 42980 + }, + { + "epoch": 2.920573447479277, + "grad_norm": 8.211484909057617, + "learning_rate": 6.350811930968882e-06, + "loss": 2.8486, + "step": 42985 + }, + { + "epoch": 2.920913167549939, + "grad_norm": 8.29982852935791, + "learning_rate": 6.350387280880554e-06, + "loss": 3.0697, + "step": 42990 + }, + { + "epoch": 2.9212528876206005, + "grad_norm": 6.027213096618652, + "learning_rate": 6.349962630792228e-06, + "loss": 3.0765, + "step": 42995 + }, + { + "epoch": 2.921592607691262, + "grad_norm": 6.688024997711182, + "learning_rate": 6.3495379807039005e-06, + "loss": 3.0303, + "step": 43000 + }, + { + "epoch": 2.9219323277619242, + "grad_norm": 6.972597599029541, + "learning_rate": 6.3491133306155725e-06, + "loss": 3.3019, + "step": 43005 + }, + { + "epoch": 2.922272047832586, + "grad_norm": 5.224844932556152, + "learning_rate": 6.348688680527246e-06, + "loss": 3.0635, + "step": 43010 + }, + { + "epoch": 2.9226117679032475, + "grad_norm": 8.141965866088867, + "learning_rate": 6.348264030438919e-06, + "loss": 2.7317, + "step": 43015 + }, + { + "epoch": 2.9229514879739096, + "grad_norm": 7.392412185668945, + "learning_rate": 6.347839380350591e-06, + "loss": 3.1273, + "step": 43020 + }, + { + "epoch": 2.923291208044571, + "grad_norm": 6.320067882537842, + "learning_rate": 6.3474147302622645e-06, + "loss": 3.2454, + "step": 43025 + }, + { + "epoch": 2.923630928115233, + "grad_norm": 8.011208534240723, + "learning_rate": 6.346990080173937e-06, + "loss": 3.0577, + "step": 43030 + }, + { + "epoch": 2.923970648185895, + "grad_norm": 5.865612030029297, + "learning_rate": 6.346565430085609e-06, + "loss": 2.9807, + "step": 43035 + }, + { + "epoch": 2.9243103682565565, + "grad_norm": 5.8740153312683105, + "learning_rate": 6.346140779997283e-06, + "loss": 3.1564, + "step": 43040 + }, + { + "epoch": 2.924650088327218, + "grad_norm": 6.4077067375183105, + "learning_rate": 6.345716129908956e-06, + "loss": 3.1546, + "step": 43045 + }, + { + "epoch": 2.9249898083978803, + "grad_norm": 6.285772800445557, + "learning_rate": 6.3452914798206285e-06, + "loss": 3.1582, + "step": 43050 + }, + { + "epoch": 2.925329528468542, + "grad_norm": 6.634266376495361, + "learning_rate": 6.344866829732301e-06, + "loss": 2.8998, + "step": 43055 + }, + { + "epoch": 2.9256692485392035, + "grad_norm": 6.96546745300293, + "learning_rate": 6.344442179643973e-06, + "loss": 3.0154, + "step": 43060 + }, + { + "epoch": 2.9260089686098656, + "grad_norm": 6.921733379364014, + "learning_rate": 6.344017529555647e-06, + "loss": 3.1458, + "step": 43065 + }, + { + "epoch": 2.926348688680527, + "grad_norm": 7.003030776977539, + "learning_rate": 6.34359287946732e-06, + "loss": 2.9599, + "step": 43070 + }, + { + "epoch": 2.926688408751189, + "grad_norm": 7.4792985916137695, + "learning_rate": 6.343168229378992e-06, + "loss": 3.0821, + "step": 43075 + }, + { + "epoch": 2.927028128821851, + "grad_norm": 9.739068984985352, + "learning_rate": 6.342743579290665e-06, + "loss": 3.1539, + "step": 43080 + }, + { + "epoch": 2.9273678488925126, + "grad_norm": 7.351678848266602, + "learning_rate": 6.342318929202338e-06, + "loss": 3.1178, + "step": 43085 + }, + { + "epoch": 2.927707568963174, + "grad_norm": 6.233542442321777, + "learning_rate": 6.34189427911401e-06, + "loss": 2.8189, + "step": 43090 + }, + { + "epoch": 2.9280472890338363, + "grad_norm": 5.475973129272461, + "learning_rate": 6.341469629025684e-06, + "loss": 2.786, + "step": 43095 + }, + { + "epoch": 2.928387009104498, + "grad_norm": 6.697453498840332, + "learning_rate": 6.3410449789373566e-06, + "loss": 2.9359, + "step": 43100 + }, + { + "epoch": 2.9287267291751595, + "grad_norm": 8.141510009765625, + "learning_rate": 6.3406203288490285e-06, + "loss": 3.0591, + "step": 43105 + }, + { + "epoch": 2.9290664492458216, + "grad_norm": 6.212671279907227, + "learning_rate": 6.340195678760702e-06, + "loss": 2.991, + "step": 43110 + }, + { + "epoch": 2.9294061693164832, + "grad_norm": 5.482203960418701, + "learning_rate": 6.339771028672375e-06, + "loss": 3.1694, + "step": 43115 + }, + { + "epoch": 2.929745889387145, + "grad_norm": 7.83444356918335, + "learning_rate": 6.339346378584047e-06, + "loss": 2.7601, + "step": 43120 + }, + { + "epoch": 2.930085609457807, + "grad_norm": 5.4034318923950195, + "learning_rate": 6.3389217284957206e-06, + "loss": 3.1793, + "step": 43125 + }, + { + "epoch": 2.9304253295284686, + "grad_norm": 7.270864486694336, + "learning_rate": 6.3384970784073925e-06, + "loss": 3.0705, + "step": 43130 + }, + { + "epoch": 2.93076504959913, + "grad_norm": 7.318307876586914, + "learning_rate": 6.338072428319065e-06, + "loss": 2.724, + "step": 43135 + }, + { + "epoch": 2.9311047696697923, + "grad_norm": 7.444481372833252, + "learning_rate": 6.337647778230739e-06, + "loss": 3.0005, + "step": 43140 + }, + { + "epoch": 2.931444489740454, + "grad_norm": 5.028359413146973, + "learning_rate": 6.337223128142411e-06, + "loss": 3.2771, + "step": 43145 + }, + { + "epoch": 2.9317842098111155, + "grad_norm": 6.774798393249512, + "learning_rate": 6.336798478054084e-06, + "loss": 3.4058, + "step": 43150 + }, + { + "epoch": 2.9321239298817776, + "grad_norm": 8.244882583618164, + "learning_rate": 6.336373827965757e-06, + "loss": 3.1255, + "step": 43155 + }, + { + "epoch": 2.9324636499524392, + "grad_norm": 6.791582107543945, + "learning_rate": 6.335949177877429e-06, + "loss": 3.1363, + "step": 43160 + }, + { + "epoch": 2.932803370023101, + "grad_norm": 5.744012832641602, + "learning_rate": 6.335524527789102e-06, + "loss": 3.151, + "step": 43165 + }, + { + "epoch": 2.933143090093763, + "grad_norm": 7.691285610198975, + "learning_rate": 6.335099877700776e-06, + "loss": 2.7709, + "step": 43170 + }, + { + "epoch": 2.9334828101644246, + "grad_norm": 8.459431648254395, + "learning_rate": 6.334675227612448e-06, + "loss": 3.0458, + "step": 43175 + }, + { + "epoch": 2.933822530235086, + "grad_norm": 7.801453113555908, + "learning_rate": 6.3342505775241205e-06, + "loss": 3.2138, + "step": 43180 + }, + { + "epoch": 2.9341622503057483, + "grad_norm": 5.741397380828857, + "learning_rate": 6.333825927435794e-06, + "loss": 2.9519, + "step": 43185 + }, + { + "epoch": 2.93450197037641, + "grad_norm": 7.801802158355713, + "learning_rate": 6.333401277347466e-06, + "loss": 3.0539, + "step": 43190 + }, + { + "epoch": 2.9348416904470715, + "grad_norm": 8.22550106048584, + "learning_rate": 6.332976627259139e-06, + "loss": 3.0706, + "step": 43195 + }, + { + "epoch": 2.9351814105177336, + "grad_norm": 8.414432525634766, + "learning_rate": 6.3325519771708126e-06, + "loss": 3.0654, + "step": 43200 + }, + { + "epoch": 2.9355211305883953, + "grad_norm": 8.06180477142334, + "learning_rate": 6.3321273270824845e-06, + "loss": 3.2374, + "step": 43205 + }, + { + "epoch": 2.935860850659057, + "grad_norm": 6.232586860656738, + "learning_rate": 6.331702676994157e-06, + "loss": 3.0828, + "step": 43210 + }, + { + "epoch": 2.936200570729719, + "grad_norm": 6.958437442779541, + "learning_rate": 6.33127802690583e-06, + "loss": 3.0772, + "step": 43215 + }, + { + "epoch": 2.9365402908003806, + "grad_norm": 6.31768274307251, + "learning_rate": 6.330853376817503e-06, + "loss": 3.0809, + "step": 43220 + }, + { + "epoch": 2.9368800108710422, + "grad_norm": 7.216230392456055, + "learning_rate": 6.330428726729175e-06, + "loss": 3.0161, + "step": 43225 + }, + { + "epoch": 2.9372197309417043, + "grad_norm": 7.270317077636719, + "learning_rate": 6.3300040766408485e-06, + "loss": 3.0446, + "step": 43230 + }, + { + "epoch": 2.937559451012366, + "grad_norm": 6.410009860992432, + "learning_rate": 6.329579426552521e-06, + "loss": 3.014, + "step": 43235 + }, + { + "epoch": 2.9378991710830276, + "grad_norm": 7.332113265991211, + "learning_rate": 6.329154776464193e-06, + "loss": 2.9622, + "step": 43240 + }, + { + "epoch": 2.9382388911536896, + "grad_norm": 5.608218669891357, + "learning_rate": 6.328730126375867e-06, + "loss": 2.9307, + "step": 43245 + }, + { + "epoch": 2.9385786112243513, + "grad_norm": 6.240694522857666, + "learning_rate": 6.32830547628754e-06, + "loss": 2.9885, + "step": 43250 + }, + { + "epoch": 2.938918331295013, + "grad_norm": 6.575701713562012, + "learning_rate": 6.327880826199212e-06, + "loss": 3.201, + "step": 43255 + }, + { + "epoch": 2.9392580513656745, + "grad_norm": 7.374064922332764, + "learning_rate": 6.327456176110885e-06, + "loss": 2.9997, + "step": 43260 + }, + { + "epoch": 2.9395977714363366, + "grad_norm": 8.125832557678223, + "learning_rate": 6.327031526022558e-06, + "loss": 2.9967, + "step": 43265 + }, + { + "epoch": 2.9399374915069982, + "grad_norm": 5.887503623962402, + "learning_rate": 6.32660687593423e-06, + "loss": 2.9094, + "step": 43270 + }, + { + "epoch": 2.94027721157766, + "grad_norm": 5.215559959411621, + "learning_rate": 6.326182225845904e-06, + "loss": 2.9974, + "step": 43275 + }, + { + "epoch": 2.940616931648322, + "grad_norm": 8.44703197479248, + "learning_rate": 6.3257575757575765e-06, + "loss": 3.0203, + "step": 43280 + }, + { + "epoch": 2.9409566517189836, + "grad_norm": 7.750678062438965, + "learning_rate": 6.3253329256692485e-06, + "loss": 3.0745, + "step": 43285 + }, + { + "epoch": 2.941296371789645, + "grad_norm": 5.4886064529418945, + "learning_rate": 6.324908275580922e-06, + "loss": 3.0892, + "step": 43290 + }, + { + "epoch": 2.941636091860307, + "grad_norm": 5.624373435974121, + "learning_rate": 6.324483625492595e-06, + "loss": 3.1634, + "step": 43295 + }, + { + "epoch": 2.941975811930969, + "grad_norm": 7.301599979400635, + "learning_rate": 6.324058975404267e-06, + "loss": 3.1847, + "step": 43300 + }, + { + "epoch": 2.9423155320016305, + "grad_norm": 5.996294021606445, + "learning_rate": 6.3236343253159405e-06, + "loss": 2.7966, + "step": 43305 + }, + { + "epoch": 2.942655252072292, + "grad_norm": 5.810632705688477, + "learning_rate": 6.3232096752276125e-06, + "loss": 3.0464, + "step": 43310 + }, + { + "epoch": 2.9429949721429542, + "grad_norm": 7.23922872543335, + "learning_rate": 6.322785025139285e-06, + "loss": 2.9657, + "step": 43315 + }, + { + "epoch": 2.943334692213616, + "grad_norm": 8.199243545532227, + "learning_rate": 6.322360375050959e-06, + "loss": 3.2182, + "step": 43320 + }, + { + "epoch": 2.9436744122842775, + "grad_norm": 8.970941543579102, + "learning_rate": 6.321935724962631e-06, + "loss": 2.9237, + "step": 43325 + }, + { + "epoch": 2.9440141323549396, + "grad_norm": 6.87772274017334, + "learning_rate": 6.321511074874304e-06, + "loss": 2.88, + "step": 43330 + }, + { + "epoch": 2.944353852425601, + "grad_norm": 6.428309917449951, + "learning_rate": 6.321086424785977e-06, + "loss": 2.9389, + "step": 43335 + }, + { + "epoch": 2.944693572496263, + "grad_norm": 5.177769184112549, + "learning_rate": 6.320661774697649e-06, + "loss": 3.2518, + "step": 43340 + }, + { + "epoch": 2.945033292566925, + "grad_norm": 8.492241859436035, + "learning_rate": 6.320237124609322e-06, + "loss": 3.1052, + "step": 43345 + }, + { + "epoch": 2.9453730126375866, + "grad_norm": 6.34234094619751, + "learning_rate": 6.319812474520996e-06, + "loss": 3.1209, + "step": 43350 + }, + { + "epoch": 2.945712732708248, + "grad_norm": 7.646693706512451, + "learning_rate": 6.319387824432668e-06, + "loss": 3.1962, + "step": 43355 + }, + { + "epoch": 2.9460524527789103, + "grad_norm": 6.218652248382568, + "learning_rate": 6.3189631743443405e-06, + "loss": 3.0112, + "step": 43360 + }, + { + "epoch": 2.946392172849572, + "grad_norm": 7.03550910949707, + "learning_rate": 6.318538524256014e-06, + "loss": 2.9801, + "step": 43365 + }, + { + "epoch": 2.9467318929202335, + "grad_norm": 9.638158798217773, + "learning_rate": 6.318113874167686e-06, + "loss": 3.2468, + "step": 43370 + }, + { + "epoch": 2.9470716129908956, + "grad_norm": 6.664770126342773, + "learning_rate": 6.317689224079359e-06, + "loss": 3.2219, + "step": 43375 + }, + { + "epoch": 2.9474113330615572, + "grad_norm": 7.537553787231445, + "learning_rate": 6.317264573991032e-06, + "loss": 2.9076, + "step": 43380 + }, + { + "epoch": 2.947751053132219, + "grad_norm": 6.732965469360352, + "learning_rate": 6.3168399239027045e-06, + "loss": 2.9848, + "step": 43385 + }, + { + "epoch": 2.948090773202881, + "grad_norm": 6.3978962898254395, + "learning_rate": 6.316415273814378e-06, + "loss": 3.0044, + "step": 43390 + }, + { + "epoch": 2.9484304932735426, + "grad_norm": 6.884183883666992, + "learning_rate": 6.31599062372605e-06, + "loss": 2.9961, + "step": 43395 + }, + { + "epoch": 2.948770213344204, + "grad_norm": 7.2982563972473145, + "learning_rate": 6.315565973637723e-06, + "loss": 3.125, + "step": 43400 + }, + { + "epoch": 2.9491099334148663, + "grad_norm": 6.109523773193359, + "learning_rate": 6.3151413235493966e-06, + "loss": 2.8896, + "step": 43405 + }, + { + "epoch": 2.949449653485528, + "grad_norm": 8.40254020690918, + "learning_rate": 6.3147166734610685e-06, + "loss": 2.8954, + "step": 43410 + }, + { + "epoch": 2.9497893735561895, + "grad_norm": 6.394122123718262, + "learning_rate": 6.314292023372741e-06, + "loss": 2.8906, + "step": 43415 + }, + { + "epoch": 2.9501290936268516, + "grad_norm": 7.486762523651123, + "learning_rate": 6.313867373284415e-06, + "loss": 3.0005, + "step": 43420 + }, + { + "epoch": 2.9504688136975132, + "grad_norm": 6.260144233703613, + "learning_rate": 6.313442723196087e-06, + "loss": 3.1008, + "step": 43425 + }, + { + "epoch": 2.950808533768175, + "grad_norm": 9.506696701049805, + "learning_rate": 6.31301807310776e-06, + "loss": 3.2071, + "step": 43430 + }, + { + "epoch": 2.951148253838837, + "grad_norm": 8.245879173278809, + "learning_rate": 6.312593423019433e-06, + "loss": 3.2637, + "step": 43435 + }, + { + "epoch": 2.9514879739094986, + "grad_norm": 6.740200042724609, + "learning_rate": 6.312168772931105e-06, + "loss": 3.1265, + "step": 43440 + }, + { + "epoch": 2.95182769398016, + "grad_norm": 5.932375431060791, + "learning_rate": 6.311744122842778e-06, + "loss": 3.1652, + "step": 43445 + }, + { + "epoch": 2.9521674140508223, + "grad_norm": 9.185877799987793, + "learning_rate": 6.311319472754451e-06, + "loss": 2.8356, + "step": 43450 + }, + { + "epoch": 2.952507134121484, + "grad_norm": 8.326531410217285, + "learning_rate": 6.310894822666124e-06, + "loss": 2.9455, + "step": 43455 + }, + { + "epoch": 2.9528468541921455, + "grad_norm": 7.107790470123291, + "learning_rate": 6.3104701725777965e-06, + "loss": 3.0932, + "step": 43460 + }, + { + "epoch": 2.9531865742628076, + "grad_norm": 6.445524215698242, + "learning_rate": 6.310045522489469e-06, + "loss": 2.9539, + "step": 43465 + }, + { + "epoch": 2.9535262943334692, + "grad_norm": 8.06733226776123, + "learning_rate": 6.309620872401142e-06, + "loss": 3.2151, + "step": 43470 + }, + { + "epoch": 2.953866014404131, + "grad_norm": 8.514510154724121, + "learning_rate": 6.309196222312814e-06, + "loss": 3.0076, + "step": 43475 + }, + { + "epoch": 2.954205734474793, + "grad_norm": 8.274514198303223, + "learning_rate": 6.308771572224488e-06, + "loss": 3.1087, + "step": 43480 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 5.226507186889648, + "learning_rate": 6.3083469221361605e-06, + "loss": 3.1512, + "step": 43485 + }, + { + "epoch": 2.954885174616116, + "grad_norm": 6.577873229980469, + "learning_rate": 6.3079222720478325e-06, + "loss": 3.2621, + "step": 43490 + }, + { + "epoch": 2.9552248946867783, + "grad_norm": 5.335796356201172, + "learning_rate": 6.307497621959506e-06, + "loss": 2.8727, + "step": 43495 + }, + { + "epoch": 2.95556461475744, + "grad_norm": 7.314086437225342, + "learning_rate": 6.307072971871179e-06, + "loss": 2.9246, + "step": 43500 + }, + { + "epoch": 2.9559043348281016, + "grad_norm": 6.284130573272705, + "learning_rate": 6.306648321782851e-06, + "loss": 2.8542, + "step": 43505 + }, + { + "epoch": 2.9562440548987636, + "grad_norm": 6.9295244216918945, + "learning_rate": 6.3062236716945245e-06, + "loss": 3.1724, + "step": 43510 + }, + { + "epoch": 2.9565837749694253, + "grad_norm": 7.468255043029785, + "learning_rate": 6.305799021606197e-06, + "loss": 2.531, + "step": 43515 + }, + { + "epoch": 2.956923495040087, + "grad_norm": 5.5520453453063965, + "learning_rate": 6.305374371517869e-06, + "loss": 2.852, + "step": 43520 + }, + { + "epoch": 2.957263215110749, + "grad_norm": 7.294271469116211, + "learning_rate": 6.304949721429543e-06, + "loss": 3.0166, + "step": 43525 + }, + { + "epoch": 2.9576029351814106, + "grad_norm": 7.2598114013671875, + "learning_rate": 6.304525071341216e-06, + "loss": 3.1735, + "step": 43530 + }, + { + "epoch": 2.9579426552520722, + "grad_norm": 7.0802001953125, + "learning_rate": 6.304100421252888e-06, + "loss": 2.8043, + "step": 43535 + }, + { + "epoch": 2.9582823753227343, + "grad_norm": 7.656821250915527, + "learning_rate": 6.303675771164561e-06, + "loss": 2.9383, + "step": 43540 + }, + { + "epoch": 2.958622095393396, + "grad_norm": 6.61857271194458, + "learning_rate": 6.303251121076234e-06, + "loss": 3.0633, + "step": 43545 + }, + { + "epoch": 2.9589618154640576, + "grad_norm": 7.178877353668213, + "learning_rate": 6.302826470987906e-06, + "loss": 3.2489, + "step": 43550 + }, + { + "epoch": 2.9593015355347196, + "grad_norm": 7.028130054473877, + "learning_rate": 6.30240182089958e-06, + "loss": 2.7616, + "step": 43555 + }, + { + "epoch": 2.9596412556053813, + "grad_norm": 6.933868408203125, + "learning_rate": 6.301977170811252e-06, + "loss": 3.0813, + "step": 43560 + }, + { + "epoch": 2.959980975676043, + "grad_norm": 6.089624404907227, + "learning_rate": 6.3015525207229245e-06, + "loss": 2.6443, + "step": 43565 + }, + { + "epoch": 2.960320695746705, + "grad_norm": 8.35413932800293, + "learning_rate": 6.301127870634598e-06, + "loss": 2.8936, + "step": 43570 + }, + { + "epoch": 2.9606604158173666, + "grad_norm": 6.833504676818848, + "learning_rate": 6.30070322054627e-06, + "loss": 3.1145, + "step": 43575 + }, + { + "epoch": 2.9610001358880282, + "grad_norm": 7.282142162322998, + "learning_rate": 6.300278570457943e-06, + "loss": 3.1675, + "step": 43580 + }, + { + "epoch": 2.9613398559586903, + "grad_norm": 8.218879699707031, + "learning_rate": 6.2998539203696165e-06, + "loss": 3.0079, + "step": 43585 + }, + { + "epoch": 2.961679576029352, + "grad_norm": 5.438037872314453, + "learning_rate": 6.2994292702812885e-06, + "loss": 3.2622, + "step": 43590 + }, + { + "epoch": 2.9620192961000136, + "grad_norm": 7.584253311157227, + "learning_rate": 6.299004620192961e-06, + "loss": 3.0553, + "step": 43595 + }, + { + "epoch": 2.962359016170675, + "grad_norm": 6.595642566680908, + "learning_rate": 6.298579970104635e-06, + "loss": 3.1154, + "step": 43600 + }, + { + "epoch": 2.9626987362413373, + "grad_norm": 5.31526517868042, + "learning_rate": 6.298155320016307e-06, + "loss": 3.1235, + "step": 43605 + }, + { + "epoch": 2.963038456311999, + "grad_norm": 7.475553512573242, + "learning_rate": 6.29773066992798e-06, + "loss": 2.993, + "step": 43610 + }, + { + "epoch": 2.9633781763826605, + "grad_norm": 6.433483600616455, + "learning_rate": 6.297306019839653e-06, + "loss": 2.9736, + "step": 43615 + }, + { + "epoch": 2.9637178964533226, + "grad_norm": 6.724891185760498, + "learning_rate": 6.296881369751325e-06, + "loss": 3.1088, + "step": 43620 + }, + { + "epoch": 2.9640576165239843, + "grad_norm": 6.711582183837891, + "learning_rate": 6.296456719662998e-06, + "loss": 3.0819, + "step": 43625 + }, + { + "epoch": 2.964397336594646, + "grad_norm": 8.349358558654785, + "learning_rate": 6.296032069574671e-06, + "loss": 2.802, + "step": 43630 + }, + { + "epoch": 2.9647370566653075, + "grad_norm": 7.738336563110352, + "learning_rate": 6.295607419486344e-06, + "loss": 3.0895, + "step": 43635 + }, + { + "epoch": 2.9650767767359696, + "grad_norm": 8.942286491394043, + "learning_rate": 6.2951827693980165e-06, + "loss": 3.1467, + "step": 43640 + }, + { + "epoch": 2.965416496806631, + "grad_norm": 6.065532207489014, + "learning_rate": 6.294758119309689e-06, + "loss": 3.0429, + "step": 43645 + }, + { + "epoch": 2.965756216877293, + "grad_norm": 8.242013931274414, + "learning_rate": 6.294333469221362e-06, + "loss": 3.0724, + "step": 43650 + }, + { + "epoch": 2.966095936947955, + "grad_norm": 6.161288738250732, + "learning_rate": 6.293908819133034e-06, + "loss": 3.138, + "step": 43655 + }, + { + "epoch": 2.9664356570186166, + "grad_norm": 7.346237659454346, + "learning_rate": 6.293484169044708e-06, + "loss": 3.2509, + "step": 43660 + }, + { + "epoch": 2.966775377089278, + "grad_norm": 6.787334442138672, + "learning_rate": 6.2930595189563805e-06, + "loss": 3.1086, + "step": 43665 + }, + { + "epoch": 2.9671150971599403, + "grad_norm": 6.910240173339844, + "learning_rate": 6.2926348688680525e-06, + "loss": 3.1963, + "step": 43670 + }, + { + "epoch": 2.967454817230602, + "grad_norm": 6.34417724609375, + "learning_rate": 6.292210218779726e-06, + "loss": 2.9823, + "step": 43675 + }, + { + "epoch": 2.9677945373012635, + "grad_norm": 7.642199516296387, + "learning_rate": 6.291785568691399e-06, + "loss": 3.0262, + "step": 43680 + }, + { + "epoch": 2.9681342573719256, + "grad_norm": 5.998164653778076, + "learning_rate": 6.291360918603071e-06, + "loss": 3.0995, + "step": 43685 + }, + { + "epoch": 2.9684739774425872, + "grad_norm": 6.1264448165893555, + "learning_rate": 6.2909362685147445e-06, + "loss": 2.9624, + "step": 43690 + }, + { + "epoch": 2.968813697513249, + "grad_norm": 7.444458961486816, + "learning_rate": 6.290511618426417e-06, + "loss": 3.1466, + "step": 43695 + }, + { + "epoch": 2.969153417583911, + "grad_norm": 8.688739776611328, + "learning_rate": 6.290086968338089e-06, + "loss": 3.135, + "step": 43700 + }, + { + "epoch": 2.9694931376545726, + "grad_norm": 7.0472283363342285, + "learning_rate": 6.289662318249763e-06, + "loss": 2.9848, + "step": 43705 + }, + { + "epoch": 2.969832857725234, + "grad_norm": 6.188602447509766, + "learning_rate": 6.289237668161436e-06, + "loss": 2.9879, + "step": 43710 + }, + { + "epoch": 2.9701725777958963, + "grad_norm": 7.81558895111084, + "learning_rate": 6.288813018073108e-06, + "loss": 2.9499, + "step": 43715 + }, + { + "epoch": 2.970512297866558, + "grad_norm": 6.248012542724609, + "learning_rate": 6.288388367984781e-06, + "loss": 2.9016, + "step": 43720 + }, + { + "epoch": 2.9708520179372195, + "grad_norm": 7.18861722946167, + "learning_rate": 6.287963717896453e-06, + "loss": 2.9851, + "step": 43725 + }, + { + "epoch": 2.9711917380078816, + "grad_norm": 8.962878227233887, + "learning_rate": 6.287539067808127e-06, + "loss": 3.1058, + "step": 43730 + }, + { + "epoch": 2.9715314580785432, + "grad_norm": 6.592052459716797, + "learning_rate": 6.2871144177198e-06, + "loss": 3.043, + "step": 43735 + }, + { + "epoch": 2.971871178149205, + "grad_norm": 7.289508819580078, + "learning_rate": 6.286689767631472e-06, + "loss": 3.0423, + "step": 43740 + }, + { + "epoch": 2.972210898219867, + "grad_norm": 6.726831436157227, + "learning_rate": 6.286265117543145e-06, + "loss": 3.1549, + "step": 43745 + }, + { + "epoch": 2.9725506182905286, + "grad_norm": 5.7494330406188965, + "learning_rate": 6.285840467454818e-06, + "loss": 3.0644, + "step": 43750 + }, + { + "epoch": 2.97289033836119, + "grad_norm": 8.369254112243652, + "learning_rate": 6.28541581736649e-06, + "loss": 3.0833, + "step": 43755 + }, + { + "epoch": 2.9732300584318523, + "grad_norm": 7.361592769622803, + "learning_rate": 6.284991167278164e-06, + "loss": 3.2079, + "step": 43760 + }, + { + "epoch": 2.973569778502514, + "grad_norm": 8.51099967956543, + "learning_rate": 6.2845665171898365e-06, + "loss": 2.86, + "step": 43765 + }, + { + "epoch": 2.9739094985731755, + "grad_norm": 5.069108486175537, + "learning_rate": 6.2841418671015085e-06, + "loss": 3.1154, + "step": 43770 + }, + { + "epoch": 2.9742492186438376, + "grad_norm": 8.811100959777832, + "learning_rate": 6.283717217013182e-06, + "loss": 2.8766, + "step": 43775 + }, + { + "epoch": 2.9745889387144993, + "grad_norm": 6.340822696685791, + "learning_rate": 6.283292566924855e-06, + "loss": 3.135, + "step": 43780 + }, + { + "epoch": 2.974928658785161, + "grad_norm": 5.820384502410889, + "learning_rate": 6.282867916836527e-06, + "loss": 3.0325, + "step": 43785 + }, + { + "epoch": 2.975268378855823, + "grad_norm": 6.1708550453186035, + "learning_rate": 6.2824432667482005e-06, + "loss": 3.0848, + "step": 43790 + }, + { + "epoch": 2.9756080989264846, + "grad_norm": 8.448491096496582, + "learning_rate": 6.2820186166598725e-06, + "loss": 3.0351, + "step": 43795 + }, + { + "epoch": 2.975947818997146, + "grad_norm": 5.723543643951416, + "learning_rate": 6.281593966571545e-06, + "loss": 2.9745, + "step": 43800 + }, + { + "epoch": 2.9762875390678083, + "grad_norm": 6.077064037322998, + "learning_rate": 6.281169316483219e-06, + "loss": 3.0605, + "step": 43805 + }, + { + "epoch": 2.97662725913847, + "grad_norm": 8.41396427154541, + "learning_rate": 6.280744666394891e-06, + "loss": 2.9617, + "step": 43810 + }, + { + "epoch": 2.9769669792091316, + "grad_norm": 7.023823261260986, + "learning_rate": 6.280320016306564e-06, + "loss": 3.1091, + "step": 43815 + }, + { + "epoch": 2.9773066992797936, + "grad_norm": 7.094802379608154, + "learning_rate": 6.279895366218237e-06, + "loss": 3.1946, + "step": 43820 + }, + { + "epoch": 2.9776464193504553, + "grad_norm": 5.582535743713379, + "learning_rate": 6.279470716129909e-06, + "loss": 2.9969, + "step": 43825 + }, + { + "epoch": 2.977986139421117, + "grad_norm": 6.343632698059082, + "learning_rate": 6.279046066041582e-06, + "loss": 3.0109, + "step": 43830 + }, + { + "epoch": 2.978325859491779, + "grad_norm": 7.572453498840332, + "learning_rate": 6.278621415953256e-06, + "loss": 2.6043, + "step": 43835 + }, + { + "epoch": 2.9786655795624406, + "grad_norm": 6.390636920928955, + "learning_rate": 6.278196765864928e-06, + "loss": 3.0803, + "step": 43840 + }, + { + "epoch": 2.9790052996331022, + "grad_norm": 5.896251201629639, + "learning_rate": 6.2777721157766005e-06, + "loss": 3.0943, + "step": 43845 + }, + { + "epoch": 2.9793450197037643, + "grad_norm": 7.2865309715271, + "learning_rate": 6.277347465688274e-06, + "loss": 3.1383, + "step": 43850 + }, + { + "epoch": 2.979684739774426, + "grad_norm": 5.623177528381348, + "learning_rate": 6.276922815599946e-06, + "loss": 2.8668, + "step": 43855 + }, + { + "epoch": 2.9800244598450876, + "grad_norm": 7.48580265045166, + "learning_rate": 6.276498165511619e-06, + "loss": 3.0764, + "step": 43860 + }, + { + "epoch": 2.9803641799157496, + "grad_norm": 6.392224311828613, + "learning_rate": 6.2760735154232925e-06, + "loss": 2.9933, + "step": 43865 + }, + { + "epoch": 2.9807038999864113, + "grad_norm": 6.772653102874756, + "learning_rate": 6.2756488653349645e-06, + "loss": 3.0004, + "step": 43870 + }, + { + "epoch": 2.981043620057073, + "grad_norm": 7.357587814331055, + "learning_rate": 6.275224215246637e-06, + "loss": 2.9621, + "step": 43875 + }, + { + "epoch": 2.981383340127735, + "grad_norm": 7.315915584564209, + "learning_rate": 6.27479956515831e-06, + "loss": 3.3004, + "step": 43880 + }, + { + "epoch": 2.9817230601983966, + "grad_norm": 7.470418453216553, + "learning_rate": 6.274374915069983e-06, + "loss": 2.9025, + "step": 43885 + }, + { + "epoch": 2.9820627802690582, + "grad_norm": 8.049561500549316, + "learning_rate": 6.273950264981656e-06, + "loss": 2.7316, + "step": 43890 + }, + { + "epoch": 2.9824025003397203, + "grad_norm": 7.716163635253906, + "learning_rate": 6.2735256148933285e-06, + "loss": 3.3877, + "step": 43895 + }, + { + "epoch": 2.982742220410382, + "grad_norm": 8.480571746826172, + "learning_rate": 6.273100964805001e-06, + "loss": 3.0454, + "step": 43900 + }, + { + "epoch": 2.9830819404810436, + "grad_norm": 8.185712814331055, + "learning_rate": 6.272676314716673e-06, + "loss": 3.1616, + "step": 43905 + }, + { + "epoch": 2.9834216605517057, + "grad_norm": 7.498427867889404, + "learning_rate": 6.272251664628347e-06, + "loss": 3.0246, + "step": 43910 + }, + { + "epoch": 2.9837613806223673, + "grad_norm": 6.214836597442627, + "learning_rate": 6.27182701454002e-06, + "loss": 3.1501, + "step": 43915 + }, + { + "epoch": 2.984101100693029, + "grad_norm": 6.242298126220703, + "learning_rate": 6.271402364451692e-06, + "loss": 2.9378, + "step": 43920 + }, + { + "epoch": 2.984440820763691, + "grad_norm": 6.7226643562316895, + "learning_rate": 6.270977714363365e-06, + "loss": 3.0585, + "step": 43925 + }, + { + "epoch": 2.9847805408343526, + "grad_norm": 7.236179351806641, + "learning_rate": 6.270553064275038e-06, + "loss": 3.009, + "step": 43930 + }, + { + "epoch": 2.9851202609050143, + "grad_norm": 8.250428199768066, + "learning_rate": 6.27012841418671e-06, + "loss": 2.7473, + "step": 43935 + }, + { + "epoch": 2.985459980975676, + "grad_norm": 8.004809379577637, + "learning_rate": 6.269703764098384e-06, + "loss": 2.9739, + "step": 43940 + }, + { + "epoch": 2.985799701046338, + "grad_norm": 6.659898281097412, + "learning_rate": 6.2692791140100565e-06, + "loss": 2.9169, + "step": 43945 + }, + { + "epoch": 2.9861394211169996, + "grad_norm": 7.006575584411621, + "learning_rate": 6.2688544639217285e-06, + "loss": 3.119, + "step": 43950 + }, + { + "epoch": 2.9864791411876612, + "grad_norm": 7.870160102844238, + "learning_rate": 6.268429813833402e-06, + "loss": 3.0845, + "step": 43955 + }, + { + "epoch": 2.9868188612583233, + "grad_norm": 8.381513595581055, + "learning_rate": 6.268005163745075e-06, + "loss": 3.2647, + "step": 43960 + }, + { + "epoch": 2.987158581328985, + "grad_norm": 7.483004093170166, + "learning_rate": 6.267580513656747e-06, + "loss": 3.1373, + "step": 43965 + }, + { + "epoch": 2.9874983013996466, + "grad_norm": 6.982004642486572, + "learning_rate": 6.2671558635684205e-06, + "loss": 3.2392, + "step": 43970 + }, + { + "epoch": 2.987838021470308, + "grad_norm": 5.340638637542725, + "learning_rate": 6.2667312134800925e-06, + "loss": 3.0593, + "step": 43975 + }, + { + "epoch": 2.9881777415409703, + "grad_norm": 6.730386257171631, + "learning_rate": 6.266306563391765e-06, + "loss": 3.0666, + "step": 43980 + }, + { + "epoch": 2.988517461611632, + "grad_norm": 6.145496368408203, + "learning_rate": 6.265881913303439e-06, + "loss": 2.9007, + "step": 43985 + }, + { + "epoch": 2.9888571816822935, + "grad_norm": 9.446761131286621, + "learning_rate": 6.265457263215111e-06, + "loss": 3.2194, + "step": 43990 + }, + { + "epoch": 2.9891969017529556, + "grad_norm": 7.562762260437012, + "learning_rate": 6.265032613126784e-06, + "loss": 3.1679, + "step": 43995 + }, + { + "epoch": 2.9895366218236172, + "grad_norm": 7.107407569885254, + "learning_rate": 6.264607963038457e-06, + "loss": 2.8829, + "step": 44000 + }, + { + "epoch": 2.989876341894279, + "grad_norm": 8.335253715515137, + "learning_rate": 6.264183312950129e-06, + "loss": 3.065, + "step": 44005 + }, + { + "epoch": 2.990216061964941, + "grad_norm": 8.069765090942383, + "learning_rate": 6.263758662861802e-06, + "loss": 2.8398, + "step": 44010 + }, + { + "epoch": 2.9905557820356026, + "grad_norm": 7.135418891906738, + "learning_rate": 6.263334012773476e-06, + "loss": 3.1877, + "step": 44015 + }, + { + "epoch": 2.990895502106264, + "grad_norm": 6.614368915557861, + "learning_rate": 6.262909362685148e-06, + "loss": 3.0145, + "step": 44020 + }, + { + "epoch": 2.9912352221769263, + "grad_norm": 6.5324859619140625, + "learning_rate": 6.2624847125968205e-06, + "loss": 2.9263, + "step": 44025 + }, + { + "epoch": 2.991574942247588, + "grad_norm": 8.040119171142578, + "learning_rate": 6.262144992526159e-06, + "loss": 3.5563, + "step": 44030 + }, + { + "epoch": 2.9919146623182495, + "grad_norm": 7.286890983581543, + "learning_rate": 6.261720342437831e-06, + "loss": 3.0401, + "step": 44035 + }, + { + "epoch": 2.9922543823889116, + "grad_norm": 5.88771390914917, + "learning_rate": 6.261295692349505e-06, + "loss": 3.019, + "step": 44040 + }, + { + "epoch": 2.9925941024595732, + "grad_norm": 5.724491596221924, + "learning_rate": 6.260871042261178e-06, + "loss": 3.13, + "step": 44045 + }, + { + "epoch": 2.992933822530235, + "grad_norm": 7.191132545471191, + "learning_rate": 6.26044639217285e-06, + "loss": 3.2162, + "step": 44050 + }, + { + "epoch": 2.993273542600897, + "grad_norm": 7.7789835929870605, + "learning_rate": 6.260021742084523e-06, + "loss": 3.2405, + "step": 44055 + }, + { + "epoch": 2.9936132626715586, + "grad_norm": 6.867500305175781, + "learning_rate": 6.259597091996195e-06, + "loss": 3.1909, + "step": 44060 + }, + { + "epoch": 2.99395298274222, + "grad_norm": 8.461623191833496, + "learning_rate": 6.259172441907868e-06, + "loss": 2.9031, + "step": 44065 + }, + { + "epoch": 2.9942927028128823, + "grad_norm": 7.319505214691162, + "learning_rate": 6.258747791819542e-06, + "loss": 3.0674, + "step": 44070 + }, + { + "epoch": 2.994632422883544, + "grad_norm": 7.720163345336914, + "learning_rate": 6.258323141731214e-06, + "loss": 3.0233, + "step": 44075 + }, + { + "epoch": 2.9949721429542056, + "grad_norm": 9.845767974853516, + "learning_rate": 6.2578984916428865e-06, + "loss": 2.9881, + "step": 44080 + }, + { + "epoch": 2.9953118630248676, + "grad_norm": 8.225934028625488, + "learning_rate": 6.25747384155456e-06, + "loss": 3.1214, + "step": 44085 + }, + { + "epoch": 2.9956515830955293, + "grad_norm": 7.223713397979736, + "learning_rate": 6.257049191466232e-06, + "loss": 3.0091, + "step": 44090 + }, + { + "epoch": 2.995991303166191, + "grad_norm": 6.3637166023254395, + "learning_rate": 6.256624541377905e-06, + "loss": 3.207, + "step": 44095 + }, + { + "epoch": 2.996331023236853, + "grad_norm": 7.710927963256836, + "learning_rate": 6.256199891289579e-06, + "loss": 3.2492, + "step": 44100 + }, + { + "epoch": 2.9966707433075146, + "grad_norm": 7.237220287322998, + "learning_rate": 6.2557752412012505e-06, + "loss": 3.2199, + "step": 44105 + }, + { + "epoch": 2.9970104633781762, + "grad_norm": 7.134881019592285, + "learning_rate": 6.255350591112923e-06, + "loss": 3.0713, + "step": 44110 + }, + { + "epoch": 2.9973501834488383, + "grad_norm": 8.646482467651367, + "learning_rate": 6.254925941024597e-06, + "loss": 2.9339, + "step": 44115 + }, + { + "epoch": 2.9976899035195, + "grad_norm": 6.548867225646973, + "learning_rate": 6.254501290936269e-06, + "loss": 3.1244, + "step": 44120 + }, + { + "epoch": 2.9980296235901616, + "grad_norm": 6.343789577484131, + "learning_rate": 6.254076640847942e-06, + "loss": 3.1366, + "step": 44125 + }, + { + "epoch": 2.9983693436608236, + "grad_norm": 7.6330695152282715, + "learning_rate": 6.2536519907596146e-06, + "loss": 3.1701, + "step": 44130 + }, + { + "epoch": 2.9987090637314853, + "grad_norm": 7.153676509857178, + "learning_rate": 6.253227340671287e-06, + "loss": 3.1917, + "step": 44135 + }, + { + "epoch": 2.999048783802147, + "grad_norm": 7.07666015625, + "learning_rate": 6.25280269058296e-06, + "loss": 2.7708, + "step": 44140 + }, + { + "epoch": 2.999388503872809, + "grad_norm": 6.853504180908203, + "learning_rate": 6.252378040494633e-06, + "loss": 3.0509, + "step": 44145 + }, + { + "epoch": 2.9997282239434706, + "grad_norm": 6.363455772399902, + "learning_rate": 6.251953390406306e-06, + "loss": 3.0116, + "step": 44150 + }, + { + "epoch": 3.0, + "eval_bertscore": { + "f1": 0.8383315820681955, + "precision": 0.8403238740952997, + "recall": 0.8370211590848605 + }, + "eval_bleu_4": 0.011162069419169364, + "eval_exact_match": 0.0, + "eval_loss": 3.292250394821167, + "eval_meteor": 0.10351546585377323, + "eval_rouge": { + "rouge1": 0.13557953212979273, + "rouge2": 0.013648806850386627, + "rougeL": 0.11335314021149467, + "rougeLsum": 0.11332594555346201 + }, + "eval_runtime": 1076.9831, + "eval_samples_per_second": 9.581, + "eval_steps_per_second": 1.198, + "step": 44154 + }, + { + "epoch": 3.0000679440141322, + "grad_norm": 6.4181952476501465, + "learning_rate": 6.251528740317978e-06, + "loss": 2.9421, + "step": 44155 + }, + { + "epoch": 3.0004076640847943, + "grad_norm": 7.15275764465332, + "learning_rate": 6.251104090229651e-06, + "loss": 3.0368, + "step": 44160 + }, + { + "epoch": 3.000747384155456, + "grad_norm": 7.590848922729492, + "learning_rate": 6.250679440141324e-06, + "loss": 3.0138, + "step": 44165 + }, + { + "epoch": 3.0010871042261176, + "grad_norm": 7.485401153564453, + "learning_rate": 6.250254790052996e-06, + "loss": 2.822, + "step": 44170 + }, + { + "epoch": 3.0014268242967796, + "grad_norm": 8.02118968963623, + "learning_rate": 6.24983013996467e-06, + "loss": 2.8745, + "step": 44175 + }, + { + "epoch": 3.0017665443674413, + "grad_norm": 7.1381683349609375, + "learning_rate": 6.2494054898763426e-06, + "loss": 2.9077, + "step": 44180 + }, + { + "epoch": 3.002106264438103, + "grad_norm": 6.92413854598999, + "learning_rate": 6.2489808397880145e-06, + "loss": 2.8081, + "step": 44185 + }, + { + "epoch": 3.002445984508765, + "grad_norm": 5.566002368927002, + "learning_rate": 6.248556189699688e-06, + "loss": 3.0403, + "step": 44190 + }, + { + "epoch": 3.0027857045794266, + "grad_norm": 7.949538707733154, + "learning_rate": 6.248131539611361e-06, + "loss": 2.8444, + "step": 44195 + }, + { + "epoch": 3.0031254246500882, + "grad_norm": 6.72312593460083, + "learning_rate": 6.247706889523033e-06, + "loss": 2.8107, + "step": 44200 + }, + { + "epoch": 3.0034651447207503, + "grad_norm": 8.63421630859375, + "learning_rate": 6.2472822394347066e-06, + "loss": 2.9521, + "step": 44205 + }, + { + "epoch": 3.003804864791412, + "grad_norm": 7.122517108917236, + "learning_rate": 6.246857589346379e-06, + "loss": 2.9928, + "step": 44210 + }, + { + "epoch": 3.0041445848620736, + "grad_norm": 6.670889854431152, + "learning_rate": 6.246432939258051e-06, + "loss": 2.7884, + "step": 44215 + }, + { + "epoch": 3.004484304932735, + "grad_norm": 7.4225239753723145, + "learning_rate": 6.246008289169725e-06, + "loss": 2.983, + "step": 44220 + }, + { + "epoch": 3.0048240250033973, + "grad_norm": 7.3408894538879395, + "learning_rate": 6.245583639081397e-06, + "loss": 3.0458, + "step": 44225 + }, + { + "epoch": 3.005163745074059, + "grad_norm": 6.8870320320129395, + "learning_rate": 6.24515898899307e-06, + "loss": 2.8125, + "step": 44230 + }, + { + "epoch": 3.0055034651447206, + "grad_norm": 8.487162590026855, + "learning_rate": 6.244734338904743e-06, + "loss": 2.7388, + "step": 44235 + }, + { + "epoch": 3.0058431852153826, + "grad_norm": 6.1962361335754395, + "learning_rate": 6.244309688816415e-06, + "loss": 2.8289, + "step": 44240 + }, + { + "epoch": 3.0061829052860443, + "grad_norm": 8.466833114624023, + "learning_rate": 6.243885038728088e-06, + "loss": 3.0887, + "step": 44245 + }, + { + "epoch": 3.006522625356706, + "grad_norm": 8.493128776550293, + "learning_rate": 6.243460388639762e-06, + "loss": 3.0083, + "step": 44250 + }, + { + "epoch": 3.006862345427368, + "grad_norm": 7.661616325378418, + "learning_rate": 6.243035738551434e-06, + "loss": 2.8452, + "step": 44255 + }, + { + "epoch": 3.0072020654980296, + "grad_norm": 8.082876205444336, + "learning_rate": 6.2426110884631065e-06, + "loss": 3.2396, + "step": 44260 + }, + { + "epoch": 3.0075417855686912, + "grad_norm": 6.882887840270996, + "learning_rate": 6.24218643837478e-06, + "loss": 2.8941, + "step": 44265 + }, + { + "epoch": 3.0078815056393533, + "grad_norm": 7.011538505554199, + "learning_rate": 6.241761788286452e-06, + "loss": 2.6039, + "step": 44270 + }, + { + "epoch": 3.008221225710015, + "grad_norm": 7.679276466369629, + "learning_rate": 6.241337138198125e-06, + "loss": 2.8553, + "step": 44275 + }, + { + "epoch": 3.0085609457806766, + "grad_norm": 6.490180492401123, + "learning_rate": 6.2409124881097986e-06, + "loss": 2.9771, + "step": 44280 + }, + { + "epoch": 3.0089006658513386, + "grad_norm": 8.17187786102295, + "learning_rate": 6.2404878380214705e-06, + "loss": 2.8325, + "step": 44285 + }, + { + "epoch": 3.0092403859220003, + "grad_norm": 8.122148513793945, + "learning_rate": 6.240063187933144e-06, + "loss": 2.8922, + "step": 44290 + }, + { + "epoch": 3.009580105992662, + "grad_norm": 6.014626502990723, + "learning_rate": 6.239638537844816e-06, + "loss": 3.0021, + "step": 44295 + }, + { + "epoch": 3.009919826063324, + "grad_norm": 6.258330821990967, + "learning_rate": 6.239213887756489e-06, + "loss": 2.9594, + "step": 44300 + }, + { + "epoch": 3.0102595461339856, + "grad_norm": 7.245804786682129, + "learning_rate": 6.238789237668163e-06, + "loss": 2.8244, + "step": 44305 + }, + { + "epoch": 3.0105992662046472, + "grad_norm": 6.863999843597412, + "learning_rate": 6.2383645875798345e-06, + "loss": 3.1184, + "step": 44310 + }, + { + "epoch": 3.0109389862753093, + "grad_norm": 7.066293239593506, + "learning_rate": 6.237939937491507e-06, + "loss": 3.0112, + "step": 44315 + }, + { + "epoch": 3.011278706345971, + "grad_norm": 8.229748725891113, + "learning_rate": 6.237515287403181e-06, + "loss": 2.8975, + "step": 44320 + }, + { + "epoch": 3.0116184264166326, + "grad_norm": 6.490983009338379, + "learning_rate": 6.237090637314853e-06, + "loss": 2.9148, + "step": 44325 + }, + { + "epoch": 3.0119581464872947, + "grad_norm": 7.0738654136657715, + "learning_rate": 6.236665987226526e-06, + "loss": 3.2241, + "step": 44330 + }, + { + "epoch": 3.0122978665579563, + "grad_norm": 6.6326584815979, + "learning_rate": 6.236241337138199e-06, + "loss": 2.9524, + "step": 44335 + }, + { + "epoch": 3.012637586628618, + "grad_norm": 6.711297512054443, + "learning_rate": 6.235816687049871e-06, + "loss": 2.7937, + "step": 44340 + }, + { + "epoch": 3.01297730669928, + "grad_norm": 8.603460311889648, + "learning_rate": 6.235392036961544e-06, + "loss": 3.0538, + "step": 44345 + }, + { + "epoch": 3.0133170267699416, + "grad_norm": 6.36869478225708, + "learning_rate": 6.234967386873218e-06, + "loss": 2.9461, + "step": 44350 + }, + { + "epoch": 3.0136567468406033, + "grad_norm": 7.114576816558838, + "learning_rate": 6.23454273678489e-06, + "loss": 3.0181, + "step": 44355 + }, + { + "epoch": 3.0139964669112653, + "grad_norm": 6.132726669311523, + "learning_rate": 6.2341180866965625e-06, + "loss": 2.9371, + "step": 44360 + }, + { + "epoch": 3.014336186981927, + "grad_norm": 6.251181125640869, + "learning_rate": 6.233693436608236e-06, + "loss": 2.8683, + "step": 44365 + }, + { + "epoch": 3.0146759070525886, + "grad_norm": 6.115169048309326, + "learning_rate": 6.233268786519908e-06, + "loss": 2.9199, + "step": 44370 + }, + { + "epoch": 3.01501562712325, + "grad_norm": 6.454366683959961, + "learning_rate": 6.232844136431581e-06, + "loss": 3.0738, + "step": 44375 + }, + { + "epoch": 3.0153553471939123, + "grad_norm": 7.041844367980957, + "learning_rate": 6.232419486343254e-06, + "loss": 2.7373, + "step": 44380 + }, + { + "epoch": 3.015695067264574, + "grad_norm": 7.732128143310547, + "learning_rate": 6.2319948362549265e-06, + "loss": 2.9889, + "step": 44385 + }, + { + "epoch": 3.0160347873352356, + "grad_norm": 7.884781360626221, + "learning_rate": 6.231570186166599e-06, + "loss": 2.9537, + "step": 44390 + }, + { + "epoch": 3.0163745074058976, + "grad_norm": 6.6351728439331055, + "learning_rate": 6.231145536078272e-06, + "loss": 2.838, + "step": 44395 + }, + { + "epoch": 3.0167142274765593, + "grad_norm": 7.502725601196289, + "learning_rate": 6.230720885989945e-06, + "loss": 3.1162, + "step": 44400 + }, + { + "epoch": 3.017053947547221, + "grad_norm": 6.197939872741699, + "learning_rate": 6.230296235901617e-06, + "loss": 2.8371, + "step": 44405 + }, + { + "epoch": 3.017393667617883, + "grad_norm": 7.545255184173584, + "learning_rate": 6.2298715858132905e-06, + "loss": 2.7857, + "step": 44410 + }, + { + "epoch": 3.0177333876885446, + "grad_norm": 7.48677396774292, + "learning_rate": 6.229446935724963e-06, + "loss": 3.1908, + "step": 44415 + }, + { + "epoch": 3.0180731077592062, + "grad_norm": 8.072816848754883, + "learning_rate": 6.229022285636635e-06, + "loss": 3.0222, + "step": 44420 + }, + { + "epoch": 3.0184128278298683, + "grad_norm": 6.956480503082275, + "learning_rate": 6.228597635548309e-06, + "loss": 3.164, + "step": 44425 + }, + { + "epoch": 3.01875254790053, + "grad_norm": 5.455509662628174, + "learning_rate": 6.228172985459982e-06, + "loss": 3.0407, + "step": 44430 + }, + { + "epoch": 3.0190922679711916, + "grad_norm": 6.862191677093506, + "learning_rate": 6.227748335371654e-06, + "loss": 2.7905, + "step": 44435 + }, + { + "epoch": 3.0194319880418536, + "grad_norm": 7.2150068283081055, + "learning_rate": 6.227323685283327e-06, + "loss": 3.2586, + "step": 44440 + }, + { + "epoch": 3.0197717081125153, + "grad_norm": 8.170867919921875, + "learning_rate": 6.226899035195e-06, + "loss": 3.0444, + "step": 44445 + }, + { + "epoch": 3.020111428183177, + "grad_norm": 7.917517185211182, + "learning_rate": 6.226474385106672e-06, + "loss": 2.8817, + "step": 44450 + }, + { + "epoch": 3.020451148253839, + "grad_norm": 8.785748481750488, + "learning_rate": 6.226049735018346e-06, + "loss": 2.8379, + "step": 44455 + }, + { + "epoch": 3.0207908683245006, + "grad_norm": 6.562155723571777, + "learning_rate": 6.2256250849300186e-06, + "loss": 2.7654, + "step": 44460 + }, + { + "epoch": 3.0211305883951622, + "grad_norm": 8.161965370178223, + "learning_rate": 6.2252004348416905e-06, + "loss": 3.0755, + "step": 44465 + }, + { + "epoch": 3.0214703084658243, + "grad_norm": 7.9483489990234375, + "learning_rate": 6.224775784753364e-06, + "loss": 2.912, + "step": 44470 + }, + { + "epoch": 3.021810028536486, + "grad_norm": 7.411147594451904, + "learning_rate": 6.224351134665036e-06, + "loss": 2.9052, + "step": 44475 + }, + { + "epoch": 3.0221497486071476, + "grad_norm": 8.831048965454102, + "learning_rate": 6.223926484576709e-06, + "loss": 3.0957, + "step": 44480 + }, + { + "epoch": 3.0224894686778097, + "grad_norm": 8.682311058044434, + "learning_rate": 6.2235018344883826e-06, + "loss": 3.1833, + "step": 44485 + }, + { + "epoch": 3.0228291887484713, + "grad_norm": 5.782588005065918, + "learning_rate": 6.2230771844000545e-06, + "loss": 2.5823, + "step": 44490 + }, + { + "epoch": 3.023168908819133, + "grad_norm": 7.403542518615723, + "learning_rate": 6.222652534311727e-06, + "loss": 2.9554, + "step": 44495 + }, + { + "epoch": 3.023508628889795, + "grad_norm": 6.451030254364014, + "learning_rate": 6.222227884223401e-06, + "loss": 2.935, + "step": 44500 + }, + { + "epoch": 3.0238483489604566, + "grad_norm": 8.272784233093262, + "learning_rate": 6.221803234135073e-06, + "loss": 2.7492, + "step": 44505 + }, + { + "epoch": 3.0241880690311183, + "grad_norm": 6.908932685852051, + "learning_rate": 6.221378584046746e-06, + "loss": 2.9807, + "step": 44510 + }, + { + "epoch": 3.0245277891017803, + "grad_norm": 7.670294284820557, + "learning_rate": 6.220953933958419e-06, + "loss": 3.0647, + "step": 44515 + }, + { + "epoch": 3.024867509172442, + "grad_norm": 7.483391761779785, + "learning_rate": 6.220529283870091e-06, + "loss": 2.8631, + "step": 44520 + }, + { + "epoch": 3.0252072292431036, + "grad_norm": 8.354586601257324, + "learning_rate": 6.220104633781764e-06, + "loss": 2.908, + "step": 44525 + }, + { + "epoch": 3.0255469493137657, + "grad_norm": 6.900322437286377, + "learning_rate": 6.219679983693438e-06, + "loss": 2.9657, + "step": 44530 + }, + { + "epoch": 3.0258866693844273, + "grad_norm": 7.4017720222473145, + "learning_rate": 6.21925533360511e-06, + "loss": 2.9724, + "step": 44535 + }, + { + "epoch": 3.026226389455089, + "grad_norm": 6.141792297363281, + "learning_rate": 6.2188306835167825e-06, + "loss": 3.3687, + "step": 44540 + }, + { + "epoch": 3.026566109525751, + "grad_norm": 8.287593841552734, + "learning_rate": 6.218406033428455e-06, + "loss": 2.6894, + "step": 44545 + }, + { + "epoch": 3.0269058295964126, + "grad_norm": 9.375313758850098, + "learning_rate": 6.217981383340128e-06, + "loss": 3.0258, + "step": 44550 + }, + { + "epoch": 3.0272455496670743, + "grad_norm": 5.948861122131348, + "learning_rate": 6.217556733251801e-06, + "loss": 2.929, + "step": 44555 + }, + { + "epoch": 3.027585269737736, + "grad_norm": 8.359333992004395, + "learning_rate": 6.217132083163474e-06, + "loss": 2.8892, + "step": 44560 + }, + { + "epoch": 3.027924989808398, + "grad_norm": 7.018699645996094, + "learning_rate": 6.2167074330751465e-06, + "loss": 2.9917, + "step": 44565 + }, + { + "epoch": 3.0282647098790596, + "grad_norm": 5.701771259307861, + "learning_rate": 6.2162827829868185e-06, + "loss": 2.8757, + "step": 44570 + }, + { + "epoch": 3.0286044299497212, + "grad_norm": 6.2869486808776855, + "learning_rate": 6.215858132898492e-06, + "loss": 2.9999, + "step": 44575 + }, + { + "epoch": 3.0289441500203833, + "grad_norm": 8.533926963806152, + "learning_rate": 6.215433482810165e-06, + "loss": 2.7619, + "step": 44580 + }, + { + "epoch": 3.029283870091045, + "grad_norm": 8.09538459777832, + "learning_rate": 6.215008832721837e-06, + "loss": 2.9893, + "step": 44585 + }, + { + "epoch": 3.0296235901617066, + "grad_norm": 7.3746018409729, + "learning_rate": 6.2145841826335105e-06, + "loss": 2.9164, + "step": 44590 + }, + { + "epoch": 3.0299633102323686, + "grad_norm": 8.370617866516113, + "learning_rate": 6.214159532545183e-06, + "loss": 2.8816, + "step": 44595 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 7.446950912475586, + "learning_rate": 6.213734882456855e-06, + "loss": 2.9614, + "step": 44600 + }, + { + "epoch": 3.030642750373692, + "grad_norm": 7.997874736785889, + "learning_rate": 6.213310232368529e-06, + "loss": 2.8871, + "step": 44605 + }, + { + "epoch": 3.030982470444354, + "grad_norm": 6.531839847564697, + "learning_rate": 6.212885582280202e-06, + "loss": 2.8197, + "step": 44610 + }, + { + "epoch": 3.0313221905150156, + "grad_norm": 6.89546537399292, + "learning_rate": 6.212460932191874e-06, + "loss": 2.7677, + "step": 44615 + }, + { + "epoch": 3.0316619105856772, + "grad_norm": 6.830991744995117, + "learning_rate": 6.212036282103547e-06, + "loss": 2.8815, + "step": 44620 + }, + { + "epoch": 3.0320016306563393, + "grad_norm": 8.704238891601562, + "learning_rate": 6.21161163201522e-06, + "loss": 3.056, + "step": 44625 + }, + { + "epoch": 3.032341350727001, + "grad_norm": 7.547338008880615, + "learning_rate": 6.211186981926893e-06, + "loss": 3.1802, + "step": 44630 + }, + { + "epoch": 3.0326810707976626, + "grad_norm": 7.551136016845703, + "learning_rate": 6.210762331838566e-06, + "loss": 2.8988, + "step": 44635 + }, + { + "epoch": 3.0330207908683247, + "grad_norm": 6.052545547485352, + "learning_rate": 6.210337681750238e-06, + "loss": 2.657, + "step": 44640 + }, + { + "epoch": 3.0333605109389863, + "grad_norm": 7.605722427368164, + "learning_rate": 6.209913031661911e-06, + "loss": 3.1703, + "step": 44645 + }, + { + "epoch": 3.033700231009648, + "grad_norm": 6.723886966705322, + "learning_rate": 6.209488381573584e-06, + "loss": 2.7126, + "step": 44650 + }, + { + "epoch": 3.03403995108031, + "grad_norm": 7.271027088165283, + "learning_rate": 6.209063731485256e-06, + "loss": 2.9692, + "step": 44655 + }, + { + "epoch": 3.0343796711509716, + "grad_norm": 6.980769634246826, + "learning_rate": 6.20863908139693e-06, + "loss": 2.9058, + "step": 44660 + }, + { + "epoch": 3.0347193912216333, + "grad_norm": 6.350400447845459, + "learning_rate": 6.2082144313086025e-06, + "loss": 3.0898, + "step": 44665 + }, + { + "epoch": 3.0350591112922953, + "grad_norm": 8.486936569213867, + "learning_rate": 6.2077897812202745e-06, + "loss": 2.7197, + "step": 44670 + }, + { + "epoch": 3.035398831362957, + "grad_norm": 6.438683032989502, + "learning_rate": 6.207365131131948e-06, + "loss": 2.8255, + "step": 44675 + }, + { + "epoch": 3.0357385514336186, + "grad_norm": 8.456071853637695, + "learning_rate": 6.206940481043621e-06, + "loss": 2.9375, + "step": 44680 + }, + { + "epoch": 3.0360782715042807, + "grad_norm": 7.310471057891846, + "learning_rate": 6.206515830955293e-06, + "loss": 3.0627, + "step": 44685 + }, + { + "epoch": 3.0364179915749423, + "grad_norm": 6.555561065673828, + "learning_rate": 6.2060911808669665e-06, + "loss": 2.8413, + "step": 44690 + }, + { + "epoch": 3.036757711645604, + "grad_norm": 9.080948829650879, + "learning_rate": 6.205666530778639e-06, + "loss": 2.8471, + "step": 44695 + }, + { + "epoch": 3.037097431716266, + "grad_norm": 6.908421039581299, + "learning_rate": 6.205241880690311e-06, + "loss": 2.9571, + "step": 44700 + }, + { + "epoch": 3.0374371517869276, + "grad_norm": 9.093962669372559, + "learning_rate": 6.204817230601985e-06, + "loss": 2.8286, + "step": 44705 + }, + { + "epoch": 3.0377768718575893, + "grad_norm": 6.3190388679504395, + "learning_rate": 6.204392580513658e-06, + "loss": 3.0264, + "step": 44710 + }, + { + "epoch": 3.038116591928251, + "grad_norm": 8.717555046081543, + "learning_rate": 6.20396793042533e-06, + "loss": 2.751, + "step": 44715 + }, + { + "epoch": 3.038456311998913, + "grad_norm": 6.465308666229248, + "learning_rate": 6.203543280337003e-06, + "loss": 2.8652, + "step": 44720 + }, + { + "epoch": 3.0387960320695746, + "grad_norm": 5.852976322174072, + "learning_rate": 6.203118630248675e-06, + "loss": 2.7784, + "step": 44725 + }, + { + "epoch": 3.0391357521402362, + "grad_norm": 8.51550579071045, + "learning_rate": 6.202693980160348e-06, + "loss": 3.0306, + "step": 44730 + }, + { + "epoch": 3.0394754722108983, + "grad_norm": 6.65303897857666, + "learning_rate": 6.202269330072022e-06, + "loss": 3.0651, + "step": 44735 + }, + { + "epoch": 3.03981519228156, + "grad_norm": 6.606013774871826, + "learning_rate": 6.201844679983694e-06, + "loss": 2.6334, + "step": 44740 + }, + { + "epoch": 3.0401549123522216, + "grad_norm": 7.308811664581299, + "learning_rate": 6.2014200298953665e-06, + "loss": 3.029, + "step": 44745 + }, + { + "epoch": 3.0404946324228836, + "grad_norm": 5.642664909362793, + "learning_rate": 6.20099537980704e-06, + "loss": 2.8593, + "step": 44750 + }, + { + "epoch": 3.0408343524935453, + "grad_norm": 6.552827835083008, + "learning_rate": 6.200570729718712e-06, + "loss": 2.7194, + "step": 44755 + }, + { + "epoch": 3.041174072564207, + "grad_norm": 7.785162448883057, + "learning_rate": 6.200146079630385e-06, + "loss": 2.8542, + "step": 44760 + }, + { + "epoch": 3.041513792634869, + "grad_norm": 6.606443405151367, + "learning_rate": 6.1997214295420586e-06, + "loss": 2.7839, + "step": 44765 + }, + { + "epoch": 3.0418535127055306, + "grad_norm": 7.60473108291626, + "learning_rate": 6.1992967794537305e-06, + "loss": 2.3885, + "step": 44770 + }, + { + "epoch": 3.0421932327761922, + "grad_norm": 9.475836753845215, + "learning_rate": 6.198872129365403e-06, + "loss": 3.0779, + "step": 44775 + }, + { + "epoch": 3.0425329528468543, + "grad_norm": 7.350878715515137, + "learning_rate": 6.198447479277077e-06, + "loss": 2.8415, + "step": 44780 + }, + { + "epoch": 3.042872672917516, + "grad_norm": 7.269857406616211, + "learning_rate": 6.198022829188749e-06, + "loss": 2.8253, + "step": 44785 + }, + { + "epoch": 3.0432123929881776, + "grad_norm": 6.920557975769043, + "learning_rate": 6.197598179100422e-06, + "loss": 2.6644, + "step": 44790 + }, + { + "epoch": 3.0435521130588397, + "grad_norm": 6.773350715637207, + "learning_rate": 6.1971735290120945e-06, + "loss": 2.9826, + "step": 44795 + }, + { + "epoch": 3.0438918331295013, + "grad_norm": 6.004032135009766, + "learning_rate": 6.196748878923767e-06, + "loss": 3.0801, + "step": 44800 + }, + { + "epoch": 3.044231553200163, + "grad_norm": 9.516948699951172, + "learning_rate": 6.19632422883544e-06, + "loss": 2.9393, + "step": 44805 + }, + { + "epoch": 3.044571273270825, + "grad_norm": 6.0540032386779785, + "learning_rate": 6.195899578747113e-06, + "loss": 2.7454, + "step": 44810 + }, + { + "epoch": 3.0449109933414866, + "grad_norm": 6.396414279937744, + "learning_rate": 6.195474928658786e-06, + "loss": 3.1623, + "step": 44815 + }, + { + "epoch": 3.0452507134121483, + "grad_norm": 8.565287590026855, + "learning_rate": 6.195050278570458e-06, + "loss": 3.008, + "step": 44820 + }, + { + "epoch": 3.0455904334828103, + "grad_norm": 6.004703998565674, + "learning_rate": 6.194625628482131e-06, + "loss": 3.0156, + "step": 44825 + }, + { + "epoch": 3.045930153553472, + "grad_norm": 5.9184722900390625, + "learning_rate": 6.194200978393804e-06, + "loss": 2.9932, + "step": 44830 + }, + { + "epoch": 3.0462698736241336, + "grad_norm": 6.4662604331970215, + "learning_rate": 6.193776328305476e-06, + "loss": 2.6498, + "step": 44835 + }, + { + "epoch": 3.0466095936947957, + "grad_norm": 6.993135929107666, + "learning_rate": 6.19335167821715e-06, + "loss": 2.8039, + "step": 44840 + }, + { + "epoch": 3.0469493137654573, + "grad_norm": 6.81443977355957, + "learning_rate": 6.1929270281288225e-06, + "loss": 2.8114, + "step": 44845 + }, + { + "epoch": 3.047289033836119, + "grad_norm": 5.752827167510986, + "learning_rate": 6.1925023780404945e-06, + "loss": 3.0573, + "step": 44850 + }, + { + "epoch": 3.047628753906781, + "grad_norm": 5.636299133300781, + "learning_rate": 6.192077727952168e-06, + "loss": 3.002, + "step": 44855 + }, + { + "epoch": 3.0479684739774426, + "grad_norm": 6.314489841461182, + "learning_rate": 6.191653077863841e-06, + "loss": 2.8539, + "step": 44860 + }, + { + "epoch": 3.0483081940481043, + "grad_norm": 9.14030933380127, + "learning_rate": 6.191228427775513e-06, + "loss": 2.9109, + "step": 44865 + }, + { + "epoch": 3.0486479141187663, + "grad_norm": 6.601457118988037, + "learning_rate": 6.1908037776871865e-06, + "loss": 2.8041, + "step": 44870 + }, + { + "epoch": 3.048987634189428, + "grad_norm": 8.144753456115723, + "learning_rate": 6.190379127598859e-06, + "loss": 2.8957, + "step": 44875 + }, + { + "epoch": 3.0493273542600896, + "grad_norm": 7.485243797302246, + "learning_rate": 6.189954477510531e-06, + "loss": 2.7984, + "step": 44880 + }, + { + "epoch": 3.0496670743307517, + "grad_norm": 8.851460456848145, + "learning_rate": 6.189529827422205e-06, + "loss": 3.0058, + "step": 44885 + }, + { + "epoch": 3.0500067944014133, + "grad_norm": 6.766012191772461, + "learning_rate": 6.189105177333877e-06, + "loss": 3.059, + "step": 44890 + }, + { + "epoch": 3.050346514472075, + "grad_norm": 7.424582481384277, + "learning_rate": 6.18868052724555e-06, + "loss": 2.9088, + "step": 44895 + }, + { + "epoch": 3.0506862345427366, + "grad_norm": 8.151839256286621, + "learning_rate": 6.188255877157223e-06, + "loss": 3.0159, + "step": 44900 + }, + { + "epoch": 3.0510259546133986, + "grad_norm": 6.310959339141846, + "learning_rate": 6.187831227068895e-06, + "loss": 2.7406, + "step": 44905 + }, + { + "epoch": 3.0513656746840603, + "grad_norm": 8.362350463867188, + "learning_rate": 6.187406576980568e-06, + "loss": 2.8899, + "step": 44910 + }, + { + "epoch": 3.051705394754722, + "grad_norm": 6.075854301452637, + "learning_rate": 6.186981926892242e-06, + "loss": 3.0235, + "step": 44915 + }, + { + "epoch": 3.052045114825384, + "grad_norm": 6.344240188598633, + "learning_rate": 6.186557276803914e-06, + "loss": 3.0864, + "step": 44920 + }, + { + "epoch": 3.0523848348960456, + "grad_norm": 7.622957706451416, + "learning_rate": 6.1861326267155865e-06, + "loss": 2.8762, + "step": 44925 + }, + { + "epoch": 3.0527245549667072, + "grad_norm": 7.845760345458984, + "learning_rate": 6.18570797662726e-06, + "loss": 3.0402, + "step": 44930 + }, + { + "epoch": 3.0530642750373693, + "grad_norm": 8.435930252075195, + "learning_rate": 6.185283326538932e-06, + "loss": 2.7121, + "step": 44935 + }, + { + "epoch": 3.053403995108031, + "grad_norm": 7.724775314331055, + "learning_rate": 6.184858676450605e-06, + "loss": 2.9018, + "step": 44940 + }, + { + "epoch": 3.0537437151786926, + "grad_norm": 6.1344194412231445, + "learning_rate": 6.1844340263622785e-06, + "loss": 2.9639, + "step": 44945 + }, + { + "epoch": 3.0540834352493547, + "grad_norm": 6.737536907196045, + "learning_rate": 6.1840093762739505e-06, + "loss": 3.029, + "step": 44950 + }, + { + "epoch": 3.0544231553200163, + "grad_norm": 6.619947910308838, + "learning_rate": 6.183584726185623e-06, + "loss": 2.8464, + "step": 44955 + }, + { + "epoch": 3.054762875390678, + "grad_norm": 5.070106506347656, + "learning_rate": 6.183160076097297e-06, + "loss": 2.9872, + "step": 44960 + }, + { + "epoch": 3.05510259546134, + "grad_norm": 7.3166680335998535, + "learning_rate": 6.182735426008969e-06, + "loss": 2.8966, + "step": 44965 + }, + { + "epoch": 3.0554423155320016, + "grad_norm": 5.9819464683532715, + "learning_rate": 6.1823107759206425e-06, + "loss": 2.7741, + "step": 44970 + }, + { + "epoch": 3.0557820356026633, + "grad_norm": 7.729825019836426, + "learning_rate": 6.1818861258323145e-06, + "loss": 3.0297, + "step": 44975 + }, + { + "epoch": 3.0561217556733253, + "grad_norm": 6.483199119567871, + "learning_rate": 6.181461475743987e-06, + "loss": 3.0192, + "step": 44980 + }, + { + "epoch": 3.056461475743987, + "grad_norm": 7.279105186462402, + "learning_rate": 6.181036825655661e-06, + "loss": 3.0705, + "step": 44985 + }, + { + "epoch": 3.0568011958146486, + "grad_norm": 6.235270977020264, + "learning_rate": 6.180612175567333e-06, + "loss": 3.0109, + "step": 44990 + }, + { + "epoch": 3.0571409158853107, + "grad_norm": 6.86525821685791, + "learning_rate": 6.180187525479006e-06, + "loss": 2.69, + "step": 44995 + }, + { + "epoch": 3.0574806359559723, + "grad_norm": 6.326185703277588, + "learning_rate": 6.179762875390679e-06, + "loss": 2.8415, + "step": 45000 + }, + { + "epoch": 3.057820356026634, + "grad_norm": 7.145307540893555, + "learning_rate": 6.179338225302351e-06, + "loss": 2.8531, + "step": 45005 + }, + { + "epoch": 3.058160076097296, + "grad_norm": 6.910794258117676, + "learning_rate": 6.178913575214024e-06, + "loss": 2.7222, + "step": 45010 + }, + { + "epoch": 3.0584997961679576, + "grad_norm": 7.320703983306885, + "learning_rate": 6.178488925125698e-06, + "loss": 3.0541, + "step": 45015 + }, + { + "epoch": 3.0588395162386193, + "grad_norm": 6.646900653839111, + "learning_rate": 6.17806427503737e-06, + "loss": 2.6496, + "step": 45020 + }, + { + "epoch": 3.0591792363092813, + "grad_norm": 6.495427131652832, + "learning_rate": 6.1776396249490425e-06, + "loss": 2.9191, + "step": 45025 + }, + { + "epoch": 3.059518956379943, + "grad_norm": 7.282018184661865, + "learning_rate": 6.177214974860716e-06, + "loss": 3.0773, + "step": 45030 + }, + { + "epoch": 3.0598586764506046, + "grad_norm": 8.270220756530762, + "learning_rate": 6.176790324772388e-06, + "loss": 2.924, + "step": 45035 + }, + { + "epoch": 3.0601983965212667, + "grad_norm": 8.157793045043945, + "learning_rate": 6.176365674684061e-06, + "loss": 3.041, + "step": 45040 + }, + { + "epoch": 3.0605381165919283, + "grad_norm": 6.355042457580566, + "learning_rate": 6.175941024595734e-06, + "loss": 2.7182, + "step": 45045 + }, + { + "epoch": 3.06087783666259, + "grad_norm": 8.306310653686523, + "learning_rate": 6.1755163745074065e-06, + "loss": 2.9089, + "step": 45050 + }, + { + "epoch": 3.0612175567332516, + "grad_norm": 7.248730182647705, + "learning_rate": 6.175091724419079e-06, + "loss": 2.8314, + "step": 45055 + }, + { + "epoch": 3.0615572768039137, + "grad_norm": 6.740631580352783, + "learning_rate": 6.174667074330752e-06, + "loss": 2.9983, + "step": 45060 + }, + { + "epoch": 3.0618969968745753, + "grad_norm": 7.510885238647461, + "learning_rate": 6.174242424242425e-06, + "loss": 2.9516, + "step": 45065 + }, + { + "epoch": 3.062236716945237, + "grad_norm": 8.858470916748047, + "learning_rate": 6.173817774154097e-06, + "loss": 2.7938, + "step": 45070 + }, + { + "epoch": 3.062576437015899, + "grad_norm": 6.886726379394531, + "learning_rate": 6.1733931240657705e-06, + "loss": 3.1955, + "step": 45075 + }, + { + "epoch": 3.0629161570865606, + "grad_norm": 5.856719017028809, + "learning_rate": 6.172968473977443e-06, + "loss": 2.9521, + "step": 45080 + }, + { + "epoch": 3.0632558771572223, + "grad_norm": 6.587240219116211, + "learning_rate": 6.172543823889115e-06, + "loss": 3.0577, + "step": 45085 + }, + { + "epoch": 3.0635955972278843, + "grad_norm": 7.126266002655029, + "learning_rate": 6.172119173800789e-06, + "loss": 2.8995, + "step": 45090 + }, + { + "epoch": 3.063935317298546, + "grad_norm": 6.349134922027588, + "learning_rate": 6.171694523712462e-06, + "loss": 2.9195, + "step": 45095 + }, + { + "epoch": 3.0642750373692076, + "grad_norm": 7.052435398101807, + "learning_rate": 6.171269873624134e-06, + "loss": 2.9202, + "step": 45100 + }, + { + "epoch": 3.0646147574398697, + "grad_norm": 8.107014656066895, + "learning_rate": 6.170845223535807e-06, + "loss": 3.0172, + "step": 45105 + }, + { + "epoch": 3.0649544775105313, + "grad_norm": 6.08603048324585, + "learning_rate": 6.17042057344748e-06, + "loss": 3.094, + "step": 45110 + }, + { + "epoch": 3.065294197581193, + "grad_norm": 7.693296432495117, + "learning_rate": 6.169995923359152e-06, + "loss": 2.8831, + "step": 45115 + }, + { + "epoch": 3.065633917651855, + "grad_norm": 6.28033971786499, + "learning_rate": 6.169571273270826e-06, + "loss": 2.9816, + "step": 45120 + }, + { + "epoch": 3.0659736377225166, + "grad_norm": 7.5353803634643555, + "learning_rate": 6.1691466231824985e-06, + "loss": 2.9004, + "step": 45125 + }, + { + "epoch": 3.0663133577931783, + "grad_norm": 8.020724296569824, + "learning_rate": 6.1687219730941705e-06, + "loss": 3.0117, + "step": 45130 + }, + { + "epoch": 3.0666530778638403, + "grad_norm": 6.553598403930664, + "learning_rate": 6.168297323005844e-06, + "loss": 2.8578, + "step": 45135 + }, + { + "epoch": 3.066992797934502, + "grad_norm": 8.493875503540039, + "learning_rate": 6.167872672917516e-06, + "loss": 2.7874, + "step": 45140 + }, + { + "epoch": 3.0673325180051636, + "grad_norm": 8.081475257873535, + "learning_rate": 6.167448022829189e-06, + "loss": 2.7259, + "step": 45145 + }, + { + "epoch": 3.0676722380758257, + "grad_norm": 6.4393391609191895, + "learning_rate": 6.1670233727408625e-06, + "loss": 2.9316, + "step": 45150 + }, + { + "epoch": 3.0680119581464873, + "grad_norm": 8.57060432434082, + "learning_rate": 6.1665987226525345e-06, + "loss": 2.946, + "step": 45155 + }, + { + "epoch": 3.068351678217149, + "grad_norm": 6.249833583831787, + "learning_rate": 6.166174072564207e-06, + "loss": 2.7837, + "step": 45160 + }, + { + "epoch": 3.068691398287811, + "grad_norm": 6.370911598205566, + "learning_rate": 6.165749422475881e-06, + "loss": 2.9786, + "step": 45165 + }, + { + "epoch": 3.0690311183584726, + "grad_norm": 6.837189674377441, + "learning_rate": 6.165324772387553e-06, + "loss": 2.9779, + "step": 45170 + }, + { + "epoch": 3.0693708384291343, + "grad_norm": 8.784998893737793, + "learning_rate": 6.164900122299226e-06, + "loss": 2.8797, + "step": 45175 + }, + { + "epoch": 3.0697105584997963, + "grad_norm": 7.96160888671875, + "learning_rate": 6.164475472210899e-06, + "loss": 2.9427, + "step": 45180 + }, + { + "epoch": 3.070050278570458, + "grad_norm": 7.9350714683532715, + "learning_rate": 6.164050822122571e-06, + "loss": 2.9687, + "step": 45185 + }, + { + "epoch": 3.0703899986411196, + "grad_norm": 7.346402168273926, + "learning_rate": 6.163626172034244e-06, + "loss": 3.0764, + "step": 45190 + }, + { + "epoch": 3.0707297187117817, + "grad_norm": 7.014779090881348, + "learning_rate": 6.163201521945918e-06, + "loss": 2.7648, + "step": 45195 + }, + { + "epoch": 3.0710694387824433, + "grad_norm": 7.346489906311035, + "learning_rate": 6.16277687185759e-06, + "loss": 2.9585, + "step": 45200 + }, + { + "epoch": 3.071409158853105, + "grad_norm": 7.352536201477051, + "learning_rate": 6.1623522217692625e-06, + "loss": 2.6363, + "step": 45205 + }, + { + "epoch": 3.071748878923767, + "grad_norm": 7.878098487854004, + "learning_rate": 6.161927571680935e-06, + "loss": 3.0543, + "step": 45210 + }, + { + "epoch": 3.0720885989944287, + "grad_norm": 8.413265228271484, + "learning_rate": 6.161502921592608e-06, + "loss": 3.0254, + "step": 45215 + }, + { + "epoch": 3.0724283190650903, + "grad_norm": 6.682627201080322, + "learning_rate": 6.161078271504281e-06, + "loss": 2.8397, + "step": 45220 + }, + { + "epoch": 3.0727680391357524, + "grad_norm": 6.474935054779053, + "learning_rate": 6.160653621415954e-06, + "loss": 2.944, + "step": 45225 + }, + { + "epoch": 3.073107759206414, + "grad_norm": 6.309755325317383, + "learning_rate": 6.1602289713276265e-06, + "loss": 2.75, + "step": 45230 + }, + { + "epoch": 3.0734474792770756, + "grad_norm": 8.221648216247559, + "learning_rate": 6.1598043212392984e-06, + "loss": 2.664, + "step": 45235 + }, + { + "epoch": 3.0737871993477373, + "grad_norm": 6.253941059112549, + "learning_rate": 6.159379671150972e-06, + "loss": 2.9327, + "step": 45240 + }, + { + "epoch": 3.0741269194183993, + "grad_norm": 8.229692459106445, + "learning_rate": 6.158955021062645e-06, + "loss": 3.1723, + "step": 45245 + }, + { + "epoch": 3.074466639489061, + "grad_norm": 9.848112106323242, + "learning_rate": 6.158530370974317e-06, + "loss": 3.2477, + "step": 45250 + }, + { + "epoch": 3.0748063595597226, + "grad_norm": 6.006877899169922, + "learning_rate": 6.1581057208859905e-06, + "loss": 2.915, + "step": 45255 + }, + { + "epoch": 3.0751460796303847, + "grad_norm": 8.659896850585938, + "learning_rate": 6.157681070797663e-06, + "loss": 2.7323, + "step": 45260 + }, + { + "epoch": 3.0754857997010463, + "grad_norm": 7.581682205200195, + "learning_rate": 6.157256420709335e-06, + "loss": 2.905, + "step": 45265 + }, + { + "epoch": 3.075825519771708, + "grad_norm": 8.390357971191406, + "learning_rate": 6.156831770621009e-06, + "loss": 2.6914, + "step": 45270 + }, + { + "epoch": 3.07616523984237, + "grad_norm": 6.9657111167907715, + "learning_rate": 6.156407120532682e-06, + "loss": 2.7457, + "step": 45275 + }, + { + "epoch": 3.0765049599130316, + "grad_norm": 6.663273811340332, + "learning_rate": 6.155982470444354e-06, + "loss": 2.744, + "step": 45280 + }, + { + "epoch": 3.0768446799836933, + "grad_norm": 6.296318054199219, + "learning_rate": 6.155557820356027e-06, + "loss": 3.1082, + "step": 45285 + }, + { + "epoch": 3.0771844000543553, + "grad_norm": 5.185708045959473, + "learning_rate": 6.1551331702677e-06, + "loss": 2.9728, + "step": 45290 + }, + { + "epoch": 3.077524120125017, + "grad_norm": 8.593910217285156, + "learning_rate": 6.154708520179372e-06, + "loss": 2.9363, + "step": 45295 + }, + { + "epoch": 3.0778638401956786, + "grad_norm": 7.698365688323975, + "learning_rate": 6.154283870091046e-06, + "loss": 3.0238, + "step": 45300 + }, + { + "epoch": 3.0782035602663407, + "grad_norm": 5.74570369720459, + "learning_rate": 6.153859220002718e-06, + "loss": 2.9493, + "step": 45305 + }, + { + "epoch": 3.0785432803370023, + "grad_norm": 9.165974617004395, + "learning_rate": 6.153434569914391e-06, + "loss": 2.9888, + "step": 45310 + }, + { + "epoch": 3.078883000407664, + "grad_norm": 5.030172348022461, + "learning_rate": 6.153009919826064e-06, + "loss": 3.0416, + "step": 45315 + }, + { + "epoch": 3.079222720478326, + "grad_norm": 7.461723327636719, + "learning_rate": 6.152585269737736e-06, + "loss": 2.8679, + "step": 45320 + }, + { + "epoch": 3.0795624405489876, + "grad_norm": 7.154533386230469, + "learning_rate": 6.15216061964941e-06, + "loss": 2.9321, + "step": 45325 + }, + { + "epoch": 3.0799021606196493, + "grad_norm": 7.449382781982422, + "learning_rate": 6.1517359695610825e-06, + "loss": 3.0296, + "step": 45330 + }, + { + "epoch": 3.0802418806903114, + "grad_norm": 8.149882316589355, + "learning_rate": 6.1513113194727545e-06, + "loss": 2.9525, + "step": 45335 + }, + { + "epoch": 3.080581600760973, + "grad_norm": 6.727640628814697, + "learning_rate": 6.150886669384428e-06, + "loss": 2.7794, + "step": 45340 + }, + { + "epoch": 3.0809213208316346, + "grad_norm": 7.094851493835449, + "learning_rate": 6.150462019296101e-06, + "loss": 3.0795, + "step": 45345 + }, + { + "epoch": 3.0812610409022967, + "grad_norm": 8.652596473693848, + "learning_rate": 6.150037369207773e-06, + "loss": 2.9816, + "step": 45350 + }, + { + "epoch": 3.0816007609729583, + "grad_norm": 9.714263916015625, + "learning_rate": 6.1496127191194465e-06, + "loss": 2.8613, + "step": 45355 + }, + { + "epoch": 3.08194048104362, + "grad_norm": 8.254345893859863, + "learning_rate": 6.149188069031119e-06, + "loss": 3.0562, + "step": 45360 + }, + { + "epoch": 3.082280201114282, + "grad_norm": 7.943113327026367, + "learning_rate": 6.148763418942791e-06, + "loss": 3.006, + "step": 45365 + }, + { + "epoch": 3.0826199211849437, + "grad_norm": 7.7732648849487305, + "learning_rate": 6.148338768854465e-06, + "loss": 2.8973, + "step": 45370 + }, + { + "epoch": 3.0829596412556053, + "grad_norm": 6.861783027648926, + "learning_rate": 6.147914118766138e-06, + "loss": 2.5818, + "step": 45375 + }, + { + "epoch": 3.0832993613262674, + "grad_norm": 6.5963311195373535, + "learning_rate": 6.14748946867781e-06, + "loss": 2.9224, + "step": 45380 + }, + { + "epoch": 3.083639081396929, + "grad_norm": 6.221450328826904, + "learning_rate": 6.147064818589483e-06, + "loss": 2.9324, + "step": 45385 + }, + { + "epoch": 3.0839788014675906, + "grad_norm": 9.555167198181152, + "learning_rate": 6.146640168501155e-06, + "loss": 3.0225, + "step": 45390 + }, + { + "epoch": 3.0843185215382523, + "grad_norm": 6.37032413482666, + "learning_rate": 6.146215518412828e-06, + "loss": 2.9795, + "step": 45395 + }, + { + "epoch": 3.0846582416089143, + "grad_norm": 7.9775309562683105, + "learning_rate": 6.145790868324502e-06, + "loss": 2.9342, + "step": 45400 + }, + { + "epoch": 3.084997961679576, + "grad_norm": 9.238286972045898, + "learning_rate": 6.145366218236174e-06, + "loss": 2.8128, + "step": 45405 + }, + { + "epoch": 3.0853376817502376, + "grad_norm": 7.214141368865967, + "learning_rate": 6.1449415681478465e-06, + "loss": 2.662, + "step": 45410 + }, + { + "epoch": 3.0856774018208997, + "grad_norm": 8.553296089172363, + "learning_rate": 6.14451691805952e-06, + "loss": 2.8422, + "step": 45415 + }, + { + "epoch": 3.0860171218915613, + "grad_norm": 8.02751636505127, + "learning_rate": 6.144092267971192e-06, + "loss": 2.9127, + "step": 45420 + }, + { + "epoch": 3.086356841962223, + "grad_norm": 7.1640191078186035, + "learning_rate": 6.143667617882865e-06, + "loss": 2.9486, + "step": 45425 + }, + { + "epoch": 3.086696562032885, + "grad_norm": 6.657286643981934, + "learning_rate": 6.1432429677945385e-06, + "loss": 2.9334, + "step": 45430 + }, + { + "epoch": 3.0870362821035466, + "grad_norm": 6.225935935974121, + "learning_rate": 6.1428183177062105e-06, + "loss": 3.0264, + "step": 45435 + }, + { + "epoch": 3.0873760021742083, + "grad_norm": 5.52301025390625, + "learning_rate": 6.142393667617883e-06, + "loss": 2.8539, + "step": 45440 + }, + { + "epoch": 3.0877157222448703, + "grad_norm": 8.490525245666504, + "learning_rate": 6.141969017529557e-06, + "loss": 2.9261, + "step": 45445 + }, + { + "epoch": 3.088055442315532, + "grad_norm": 8.536730766296387, + "learning_rate": 6.141544367441229e-06, + "loss": 3.2509, + "step": 45450 + }, + { + "epoch": 3.0883951623861936, + "grad_norm": 7.234903335571289, + "learning_rate": 6.141119717352902e-06, + "loss": 2.9029, + "step": 45455 + }, + { + "epoch": 3.0887348824568557, + "grad_norm": 6.867849349975586, + "learning_rate": 6.1406950672645745e-06, + "loss": 2.8732, + "step": 45460 + }, + { + "epoch": 3.0890746025275173, + "grad_norm": 7.170031547546387, + "learning_rate": 6.140270417176247e-06, + "loss": 2.7546, + "step": 45465 + }, + { + "epoch": 3.089414322598179, + "grad_norm": 7.188812255859375, + "learning_rate": 6.13984576708792e-06, + "loss": 2.8877, + "step": 45470 + }, + { + "epoch": 3.089754042668841, + "grad_norm": 7.3824920654296875, + "learning_rate": 6.139421116999593e-06, + "loss": 2.8207, + "step": 45475 + }, + { + "epoch": 3.0900937627395026, + "grad_norm": 7.23252534866333, + "learning_rate": 6.138996466911266e-06, + "loss": 2.77, + "step": 45480 + }, + { + "epoch": 3.0904334828101643, + "grad_norm": 6.116435527801514, + "learning_rate": 6.138571816822938e-06, + "loss": 2.7544, + "step": 45485 + }, + { + "epoch": 3.0907732028808264, + "grad_norm": 6.338181495666504, + "learning_rate": 6.138147166734611e-06, + "loss": 3.0894, + "step": 45490 + }, + { + "epoch": 3.091112922951488, + "grad_norm": 7.7892279624938965, + "learning_rate": 6.137722516646284e-06, + "loss": 2.867, + "step": 45495 + }, + { + "epoch": 3.0914526430221496, + "grad_norm": 8.597735404968262, + "learning_rate": 6.137297866557956e-06, + "loss": 2.8895, + "step": 45500 + }, + { + "epoch": 3.0917923630928117, + "grad_norm": 9.117444038391113, + "learning_rate": 6.13687321646963e-06, + "loss": 2.805, + "step": 45505 + }, + { + "epoch": 3.0921320831634733, + "grad_norm": 8.26286792755127, + "learning_rate": 6.1364485663813025e-06, + "loss": 2.9894, + "step": 45510 + }, + { + "epoch": 3.092471803234135, + "grad_norm": 7.030079364776611, + "learning_rate": 6.1360239162929744e-06, + "loss": 3.0921, + "step": 45515 + }, + { + "epoch": 3.092811523304797, + "grad_norm": 5.4009599685668945, + "learning_rate": 6.135599266204648e-06, + "loss": 3.0707, + "step": 45520 + }, + { + "epoch": 3.0931512433754587, + "grad_norm": 7.856867790222168, + "learning_rate": 6.135174616116321e-06, + "loss": 2.9224, + "step": 45525 + }, + { + "epoch": 3.0934909634461203, + "grad_norm": 7.77103328704834, + "learning_rate": 6.134749966027993e-06, + "loss": 3.0396, + "step": 45530 + }, + { + "epoch": 3.0938306835167824, + "grad_norm": 9.005882263183594, + "learning_rate": 6.1343253159396665e-06, + "loss": 2.9805, + "step": 45535 + }, + { + "epoch": 3.094170403587444, + "grad_norm": 5.3445305824279785, + "learning_rate": 6.133900665851339e-06, + "loss": 3.0079, + "step": 45540 + }, + { + "epoch": 3.0945101236581056, + "grad_norm": 6.188485622406006, + "learning_rate": 6.133476015763011e-06, + "loss": 3.141, + "step": 45545 + }, + { + "epoch": 3.0948498437287677, + "grad_norm": 7.488727569580078, + "learning_rate": 6.133051365674685e-06, + "loss": 2.8693, + "step": 45550 + }, + { + "epoch": 3.0951895637994293, + "grad_norm": 6.661380290985107, + "learning_rate": 6.132626715586357e-06, + "loss": 3.0231, + "step": 45555 + }, + { + "epoch": 3.095529283870091, + "grad_norm": 6.724999904632568, + "learning_rate": 6.13220206549803e-06, + "loss": 2.9816, + "step": 45560 + }, + { + "epoch": 3.095869003940753, + "grad_norm": 7.61167049407959, + "learning_rate": 6.131777415409703e-06, + "loss": 2.9321, + "step": 45565 + }, + { + "epoch": 3.0962087240114147, + "grad_norm": 7.2588396072387695, + "learning_rate": 6.131352765321375e-06, + "loss": 2.9098, + "step": 45570 + }, + { + "epoch": 3.0965484440820763, + "grad_norm": 6.055788516998291, + "learning_rate": 6.130928115233048e-06, + "loss": 2.9005, + "step": 45575 + }, + { + "epoch": 3.096888164152738, + "grad_norm": 7.273479461669922, + "learning_rate": 6.130503465144722e-06, + "loss": 2.652, + "step": 45580 + }, + { + "epoch": 3.0972278842234, + "grad_norm": 6.977941036224365, + "learning_rate": 6.130078815056394e-06, + "loss": 3.1678, + "step": 45585 + }, + { + "epoch": 3.0975676042940616, + "grad_norm": 7.744830131530762, + "learning_rate": 6.1296541649680664e-06, + "loss": 2.7567, + "step": 45590 + }, + { + "epoch": 3.0979073243647233, + "grad_norm": 5.826841831207275, + "learning_rate": 6.12922951487974e-06, + "loss": 2.9208, + "step": 45595 + }, + { + "epoch": 3.0982470444353853, + "grad_norm": 7.546230792999268, + "learning_rate": 6.128804864791412e-06, + "loss": 2.859, + "step": 45600 + }, + { + "epoch": 3.098586764506047, + "grad_norm": 4.6899566650390625, + "learning_rate": 6.128380214703085e-06, + "loss": 2.8925, + "step": 45605 + }, + { + "epoch": 3.0989264845767086, + "grad_norm": 6.344299793243408, + "learning_rate": 6.1279555646147585e-06, + "loss": 2.8244, + "step": 45610 + }, + { + "epoch": 3.0992662046473707, + "grad_norm": 8.025097846984863, + "learning_rate": 6.1275309145264305e-06, + "loss": 2.7271, + "step": 45615 + }, + { + "epoch": 3.0996059247180323, + "grad_norm": 5.414759159088135, + "learning_rate": 6.127106264438103e-06, + "loss": 3.0166, + "step": 45620 + }, + { + "epoch": 3.099945644788694, + "grad_norm": 8.202191352844238, + "learning_rate": 6.126681614349777e-06, + "loss": 2.6707, + "step": 45625 + }, + { + "epoch": 3.100285364859356, + "grad_norm": 7.551990509033203, + "learning_rate": 6.126256964261449e-06, + "loss": 2.8456, + "step": 45630 + }, + { + "epoch": 3.1006250849300176, + "grad_norm": 7.830854415893555, + "learning_rate": 6.125832314173122e-06, + "loss": 2.9907, + "step": 45635 + }, + { + "epoch": 3.1009648050006793, + "grad_norm": 6.4956536293029785, + "learning_rate": 6.1254076640847945e-06, + "loss": 2.6335, + "step": 45640 + }, + { + "epoch": 3.1013045250713414, + "grad_norm": 9.136770248413086, + "learning_rate": 6.124983013996467e-06, + "loss": 2.9346, + "step": 45645 + }, + { + "epoch": 3.101644245142003, + "grad_norm": 8.087834358215332, + "learning_rate": 6.124558363908141e-06, + "loss": 2.8476, + "step": 45650 + }, + { + "epoch": 3.1019839652126646, + "grad_norm": 7.399925231933594, + "learning_rate": 6.124133713819813e-06, + "loss": 2.9823, + "step": 45655 + }, + { + "epoch": 3.1023236852833267, + "grad_norm": 8.260469436645508, + "learning_rate": 6.123709063731486e-06, + "loss": 3.0833, + "step": 45660 + }, + { + "epoch": 3.1026634053539883, + "grad_norm": 8.53100299835205, + "learning_rate": 6.123284413643159e-06, + "loss": 2.8522, + "step": 45665 + }, + { + "epoch": 3.10300312542465, + "grad_norm": 7.429061412811279, + "learning_rate": 6.122859763554831e-06, + "loss": 2.8664, + "step": 45670 + }, + { + "epoch": 3.103342845495312, + "grad_norm": 6.599551677703857, + "learning_rate": 6.122435113466504e-06, + "loss": 2.5489, + "step": 45675 + }, + { + "epoch": 3.1036825655659737, + "grad_norm": 6.6460089683532715, + "learning_rate": 6.122010463378178e-06, + "loss": 2.9627, + "step": 45680 + }, + { + "epoch": 3.1040222856366353, + "grad_norm": 6.402641296386719, + "learning_rate": 6.12158581328985e-06, + "loss": 3.0318, + "step": 45685 + }, + { + "epoch": 3.1043620057072974, + "grad_norm": 8.188644409179688, + "learning_rate": 6.1211611632015225e-06, + "loss": 2.8247, + "step": 45690 + }, + { + "epoch": 3.104701725777959, + "grad_norm": 9.607285499572754, + "learning_rate": 6.120736513113196e-06, + "loss": 2.9794, + "step": 45695 + }, + { + "epoch": 3.1050414458486206, + "grad_norm": 9.504545211791992, + "learning_rate": 6.120311863024868e-06, + "loss": 3.239, + "step": 45700 + }, + { + "epoch": 3.1053811659192827, + "grad_norm": 9.308987617492676, + "learning_rate": 6.119887212936541e-06, + "loss": 2.9145, + "step": 45705 + }, + { + "epoch": 3.1057208859899443, + "grad_norm": 8.081727027893066, + "learning_rate": 6.119462562848214e-06, + "loss": 3.1109, + "step": 45710 + }, + { + "epoch": 3.106060606060606, + "grad_norm": 8.477089881896973, + "learning_rate": 6.1190379127598865e-06, + "loss": 2.8329, + "step": 45715 + }, + { + "epoch": 3.106400326131268, + "grad_norm": 5.726563930511475, + "learning_rate": 6.118613262671559e-06, + "loss": 2.823, + "step": 45720 + }, + { + "epoch": 3.1067400462019297, + "grad_norm": 5.750587463378906, + "learning_rate": 6.118188612583232e-06, + "loss": 2.9199, + "step": 45725 + }, + { + "epoch": 3.1070797662725913, + "grad_norm": 6.374975681304932, + "learning_rate": 6.117763962494905e-06, + "loss": 2.7807, + "step": 45730 + }, + { + "epoch": 3.107419486343253, + "grad_norm": 7.086883544921875, + "learning_rate": 6.117339312406577e-06, + "loss": 2.869, + "step": 45735 + }, + { + "epoch": 3.107759206413915, + "grad_norm": 9.711797714233398, + "learning_rate": 6.1169146623182505e-06, + "loss": 2.9729, + "step": 45740 + }, + { + "epoch": 3.1080989264845766, + "grad_norm": 8.490896224975586, + "learning_rate": 6.116490012229923e-06, + "loss": 3.0735, + "step": 45745 + }, + { + "epoch": 3.1084386465552383, + "grad_norm": 8.234946250915527, + "learning_rate": 6.116065362141595e-06, + "loss": 3.0233, + "step": 45750 + }, + { + "epoch": 3.1087783666259003, + "grad_norm": 7.55048942565918, + "learning_rate": 6.115640712053269e-06, + "loss": 2.8278, + "step": 45755 + }, + { + "epoch": 3.109118086696562, + "grad_norm": 7.840838432312012, + "learning_rate": 6.115216061964942e-06, + "loss": 2.9855, + "step": 45760 + }, + { + "epoch": 3.1094578067672236, + "grad_norm": 7.800296306610107, + "learning_rate": 6.114791411876614e-06, + "loss": 2.7792, + "step": 45765 + }, + { + "epoch": 3.1097975268378857, + "grad_norm": 8.003653526306152, + "learning_rate": 6.114366761788287e-06, + "loss": 2.8781, + "step": 45770 + }, + { + "epoch": 3.1101372469085473, + "grad_norm": 6.72524356842041, + "learning_rate": 6.11394211169996e-06, + "loss": 2.9339, + "step": 45775 + }, + { + "epoch": 3.110476966979209, + "grad_norm": 5.4584479331970215, + "learning_rate": 6.113517461611632e-06, + "loss": 3.0119, + "step": 45780 + }, + { + "epoch": 3.110816687049871, + "grad_norm": 7.36461877822876, + "learning_rate": 6.113092811523306e-06, + "loss": 2.9725, + "step": 45785 + }, + { + "epoch": 3.1111564071205327, + "grad_norm": 6.6337151527404785, + "learning_rate": 6.1126681614349785e-06, + "loss": 3.1607, + "step": 45790 + }, + { + "epoch": 3.1114961271911943, + "grad_norm": 10.301446914672852, + "learning_rate": 6.1122435113466504e-06, + "loss": 3.1274, + "step": 45795 + }, + { + "epoch": 3.1118358472618564, + "grad_norm": 7.802721977233887, + "learning_rate": 6.111818861258324e-06, + "loss": 3.136, + "step": 45800 + }, + { + "epoch": 3.112175567332518, + "grad_norm": 6.703995227813721, + "learning_rate": 6.111394211169996e-06, + "loss": 2.8083, + "step": 45805 + }, + { + "epoch": 3.1125152874031796, + "grad_norm": 5.298614501953125, + "learning_rate": 6.110969561081669e-06, + "loss": 2.6689, + "step": 45810 + }, + { + "epoch": 3.1128550074738417, + "grad_norm": 6.143704414367676, + "learning_rate": 6.1105449109933425e-06, + "loss": 2.8734, + "step": 45815 + }, + { + "epoch": 3.1131947275445033, + "grad_norm": 7.218565464019775, + "learning_rate": 6.1101202609050144e-06, + "loss": 3.1212, + "step": 45820 + }, + { + "epoch": 3.113534447615165, + "grad_norm": 7.604104995727539, + "learning_rate": 6.109695610816687e-06, + "loss": 2.8357, + "step": 45825 + }, + { + "epoch": 3.113874167685827, + "grad_norm": 4.997576713562012, + "learning_rate": 6.109270960728361e-06, + "loss": 3.0194, + "step": 45830 + }, + { + "epoch": 3.1142138877564887, + "grad_norm": 7.182968616485596, + "learning_rate": 6.108846310640033e-06, + "loss": 2.9664, + "step": 45835 + }, + { + "epoch": 3.1145536078271503, + "grad_norm": 6.903810977935791, + "learning_rate": 6.108421660551706e-06, + "loss": 2.9472, + "step": 45840 + }, + { + "epoch": 3.1148933278978124, + "grad_norm": 6.277137279510498, + "learning_rate": 6.107997010463379e-06, + "loss": 2.9124, + "step": 45845 + }, + { + "epoch": 3.115233047968474, + "grad_norm": 7.761193752288818, + "learning_rate": 6.107572360375051e-06, + "loss": 2.9932, + "step": 45850 + }, + { + "epoch": 3.1155727680391356, + "grad_norm": 7.607492446899414, + "learning_rate": 6.107147710286724e-06, + "loss": 2.8943, + "step": 45855 + }, + { + "epoch": 3.1159124881097977, + "grad_norm": 7.947126865386963, + "learning_rate": 6.106723060198398e-06, + "loss": 2.84, + "step": 45860 + }, + { + "epoch": 3.1162522081804593, + "grad_norm": 5.831336498260498, + "learning_rate": 6.10629841011007e-06, + "loss": 2.9391, + "step": 45865 + }, + { + "epoch": 3.116591928251121, + "grad_norm": 7.8866987228393555, + "learning_rate": 6.1058737600217424e-06, + "loss": 2.7943, + "step": 45870 + }, + { + "epoch": 3.116931648321783, + "grad_norm": 6.705236911773682, + "learning_rate": 6.105449109933415e-06, + "loss": 2.9243, + "step": 45875 + }, + { + "epoch": 3.1172713683924447, + "grad_norm": 6.258044719696045, + "learning_rate": 6.105024459845088e-06, + "loss": 3.1432, + "step": 45880 + }, + { + "epoch": 3.1176110884631063, + "grad_norm": 8.233860969543457, + "learning_rate": 6.104599809756761e-06, + "loss": 2.9754, + "step": 45885 + }, + { + "epoch": 3.1179508085337684, + "grad_norm": 8.333966255187988, + "learning_rate": 6.104175159668434e-06, + "loss": 3.0603, + "step": 45890 + }, + { + "epoch": 3.11829052860443, + "grad_norm": 8.659801483154297, + "learning_rate": 6.1037505095801064e-06, + "loss": 2.9145, + "step": 45895 + }, + { + "epoch": 3.1186302486750916, + "grad_norm": 8.690898895263672, + "learning_rate": 6.103325859491778e-06, + "loss": 3.0527, + "step": 45900 + }, + { + "epoch": 3.1189699687457537, + "grad_norm": 6.601908206939697, + "learning_rate": 6.102901209403452e-06, + "loss": 3.0506, + "step": 45905 + }, + { + "epoch": 3.1193096888164153, + "grad_norm": 7.444464683532715, + "learning_rate": 6.102476559315125e-06, + "loss": 2.9306, + "step": 45910 + }, + { + "epoch": 3.119649408887077, + "grad_norm": 7.572744846343994, + "learning_rate": 6.102051909226797e-06, + "loss": 3.0025, + "step": 45915 + }, + { + "epoch": 3.1199891289577386, + "grad_norm": 6.719772815704346, + "learning_rate": 6.1016272591384705e-06, + "loss": 2.9068, + "step": 45920 + }, + { + "epoch": 3.1203288490284007, + "grad_norm": 6.703258514404297, + "learning_rate": 6.101202609050143e-06, + "loss": 2.7465, + "step": 45925 + }, + { + "epoch": 3.1206685690990623, + "grad_norm": 7.361517906188965, + "learning_rate": 6.100777958961815e-06, + "loss": 2.8369, + "step": 45930 + }, + { + "epoch": 3.121008289169724, + "grad_norm": 8.592582702636719, + "learning_rate": 6.100353308873489e-06, + "loss": 2.9398, + "step": 45935 + }, + { + "epoch": 3.121348009240386, + "grad_norm": 7.84604024887085, + "learning_rate": 6.099928658785162e-06, + "loss": 2.841, + "step": 45940 + }, + { + "epoch": 3.1216877293110477, + "grad_norm": 8.091031074523926, + "learning_rate": 6.099504008696834e-06, + "loss": 3.0373, + "step": 45945 + }, + { + "epoch": 3.1220274493817093, + "grad_norm": 6.941708564758301, + "learning_rate": 6.099079358608507e-06, + "loss": 3.0153, + "step": 45950 + }, + { + "epoch": 3.1223671694523714, + "grad_norm": 6.82130241394043, + "learning_rate": 6.09865470852018e-06, + "loss": 2.6833, + "step": 45955 + }, + { + "epoch": 3.122706889523033, + "grad_norm": 8.067419052124023, + "learning_rate": 6.098230058431852e-06, + "loss": 2.9347, + "step": 45960 + }, + { + "epoch": 3.1230466095936946, + "grad_norm": 6.172304630279541, + "learning_rate": 6.097805408343526e-06, + "loss": 2.8322, + "step": 45965 + }, + { + "epoch": 3.1233863296643567, + "grad_norm": 9.406464576721191, + "learning_rate": 6.0973807582551985e-06, + "loss": 2.9118, + "step": 45970 + }, + { + "epoch": 3.1237260497350183, + "grad_norm": 8.121591567993164, + "learning_rate": 6.09695610816687e-06, + "loss": 2.9733, + "step": 45975 + }, + { + "epoch": 3.12406576980568, + "grad_norm": 6.351742267608643, + "learning_rate": 6.096531458078544e-06, + "loss": 2.8405, + "step": 45980 + }, + { + "epoch": 3.124405489876342, + "grad_norm": 6.486812591552734, + "learning_rate": 6.096106807990216e-06, + "loss": 3.1186, + "step": 45985 + }, + { + "epoch": 3.1247452099470037, + "grad_norm": 6.162101745605469, + "learning_rate": 6.09568215790189e-06, + "loss": 2.6952, + "step": 45990 + }, + { + "epoch": 3.1250849300176653, + "grad_norm": 6.1673173904418945, + "learning_rate": 6.0952575078135625e-06, + "loss": 3.0019, + "step": 45995 + }, + { + "epoch": 3.1254246500883274, + "grad_norm": 6.721914768218994, + "learning_rate": 6.094832857725234e-06, + "loss": 3.1041, + "step": 46000 + }, + { + "epoch": 3.125764370158989, + "grad_norm": 6.431205749511719, + "learning_rate": 6.094408207636908e-06, + "loss": 2.7607, + "step": 46005 + }, + { + "epoch": 3.1261040902296506, + "grad_norm": 5.984856128692627, + "learning_rate": 6.093983557548581e-06, + "loss": 3.0252, + "step": 46010 + }, + { + "epoch": 3.1264438103003127, + "grad_norm": 8.57251262664795, + "learning_rate": 6.093558907460253e-06, + "loss": 2.8829, + "step": 46015 + }, + { + "epoch": 3.1267835303709743, + "grad_norm": 7.785546779632568, + "learning_rate": 6.0931342573719265e-06, + "loss": 2.9301, + "step": 46020 + }, + { + "epoch": 3.127123250441636, + "grad_norm": 5.780330181121826, + "learning_rate": 6.092709607283599e-06, + "loss": 2.7858, + "step": 46025 + }, + { + "epoch": 3.127462970512298, + "grad_norm": 5.835597038269043, + "learning_rate": 6.092284957195271e-06, + "loss": 2.9425, + "step": 46030 + }, + { + "epoch": 3.1278026905829597, + "grad_norm": 7.498185634613037, + "learning_rate": 6.091860307106945e-06, + "loss": 2.7312, + "step": 46035 + }, + { + "epoch": 3.1281424106536213, + "grad_norm": 10.434504508972168, + "learning_rate": 6.091435657018618e-06, + "loss": 2.9905, + "step": 46040 + }, + { + "epoch": 3.1284821307242834, + "grad_norm": 7.817806720733643, + "learning_rate": 6.09101100693029e-06, + "loss": 2.7869, + "step": 46045 + }, + { + "epoch": 3.128821850794945, + "grad_norm": 8.968701362609863, + "learning_rate": 6.090586356841963e-06, + "loss": 3.0713, + "step": 46050 + }, + { + "epoch": 3.1291615708656066, + "grad_norm": 7.19913911819458, + "learning_rate": 6.090161706753635e-06, + "loss": 2.7688, + "step": 46055 + }, + { + "epoch": 3.1295012909362687, + "grad_norm": 6.6378326416015625, + "learning_rate": 6.089737056665308e-06, + "loss": 2.9419, + "step": 46060 + }, + { + "epoch": 3.1298410110069304, + "grad_norm": 8.511122703552246, + "learning_rate": 6.089312406576982e-06, + "loss": 2.9931, + "step": 46065 + }, + { + "epoch": 3.130180731077592, + "grad_norm": 7.496527194976807, + "learning_rate": 6.088887756488654e-06, + "loss": 3.1751, + "step": 46070 + }, + { + "epoch": 3.1305204511482536, + "grad_norm": 10.217255592346191, + "learning_rate": 6.0884631064003264e-06, + "loss": 2.8896, + "step": 46075 + }, + { + "epoch": 3.1308601712189157, + "grad_norm": 8.920502662658691, + "learning_rate": 6.088038456312e-06, + "loss": 2.8762, + "step": 46080 + }, + { + "epoch": 3.1311998912895773, + "grad_norm": 6.1636176109313965, + "learning_rate": 6.087613806223672e-06, + "loss": 2.7777, + "step": 46085 + }, + { + "epoch": 3.131539611360239, + "grad_norm": 6.945397853851318, + "learning_rate": 6.087189156135345e-06, + "loss": 3.2133, + "step": 46090 + }, + { + "epoch": 3.131879331430901, + "grad_norm": 6.619563102722168, + "learning_rate": 6.0867645060470185e-06, + "loss": 3.1176, + "step": 46095 + }, + { + "epoch": 3.1322190515015627, + "grad_norm": 6.3478264808654785, + "learning_rate": 6.0863398559586904e-06, + "loss": 3.0695, + "step": 46100 + }, + { + "epoch": 3.1325587715722243, + "grad_norm": 8.454303741455078, + "learning_rate": 6.085915205870363e-06, + "loss": 3.2626, + "step": 46105 + }, + { + "epoch": 3.1328984916428864, + "grad_norm": 7.567410469055176, + "learning_rate": 6.085490555782037e-06, + "loss": 2.9825, + "step": 46110 + }, + { + "epoch": 3.133238211713548, + "grad_norm": 7.625874996185303, + "learning_rate": 6.085065905693709e-06, + "loss": 2.822, + "step": 46115 + }, + { + "epoch": 3.1335779317842096, + "grad_norm": 7.507160663604736, + "learning_rate": 6.084641255605382e-06, + "loss": 2.9456, + "step": 46120 + }, + { + "epoch": 3.1339176518548717, + "grad_norm": 7.090866565704346, + "learning_rate": 6.0842166055170544e-06, + "loss": 3.2051, + "step": 46125 + }, + { + "epoch": 3.1342573719255333, + "grad_norm": 6.445340156555176, + "learning_rate": 6.083791955428727e-06, + "loss": 2.6217, + "step": 46130 + }, + { + "epoch": 3.134597091996195, + "grad_norm": 7.831929683685303, + "learning_rate": 6.0833673053404e-06, + "loss": 2.9017, + "step": 46135 + }, + { + "epoch": 3.134936812066857, + "grad_norm": 5.97227144241333, + "learning_rate": 6.082942655252073e-06, + "loss": 2.7417, + "step": 46140 + }, + { + "epoch": 3.1352765321375187, + "grad_norm": 8.182737350463867, + "learning_rate": 6.082518005163746e-06, + "loss": 3.0323, + "step": 46145 + }, + { + "epoch": 3.1356162522081803, + "grad_norm": 7.100011348724365, + "learning_rate": 6.082093355075418e-06, + "loss": 2.9066, + "step": 46150 + }, + { + "epoch": 3.1359559722788424, + "grad_norm": 6.325173377990723, + "learning_rate": 6.081668704987091e-06, + "loss": 2.9808, + "step": 46155 + }, + { + "epoch": 3.136295692349504, + "grad_norm": 7.9259748458862305, + "learning_rate": 6.081244054898764e-06, + "loss": 2.8627, + "step": 46160 + }, + { + "epoch": 3.1366354124201656, + "grad_norm": 8.854554176330566, + "learning_rate": 6.080819404810436e-06, + "loss": 2.985, + "step": 46165 + }, + { + "epoch": 3.1369751324908277, + "grad_norm": 5.797286510467529, + "learning_rate": 6.08039475472211e-06, + "loss": 2.98, + "step": 46170 + }, + { + "epoch": 3.1373148525614893, + "grad_norm": 7.8991217613220215, + "learning_rate": 6.0799701046337824e-06, + "loss": 3.0463, + "step": 46175 + }, + { + "epoch": 3.137654572632151, + "grad_norm": 6.977099895477295, + "learning_rate": 6.079545454545454e-06, + "loss": 2.9476, + "step": 46180 + }, + { + "epoch": 3.137994292702813, + "grad_norm": 8.153653144836426, + "learning_rate": 6.079120804457128e-06, + "loss": 2.9977, + "step": 46185 + }, + { + "epoch": 3.1383340127734747, + "grad_norm": 8.242837905883789, + "learning_rate": 6.078696154368801e-06, + "loss": 3.1441, + "step": 46190 + }, + { + "epoch": 3.1386737328441363, + "grad_norm": 6.409984111785889, + "learning_rate": 6.078271504280473e-06, + "loss": 3.1873, + "step": 46195 + }, + { + "epoch": 3.1390134529147984, + "grad_norm": 7.713500499725342, + "learning_rate": 6.0778468541921464e-06, + "loss": 2.9108, + "step": 46200 + }, + { + "epoch": 3.13935317298546, + "grad_norm": 7.3683552742004395, + "learning_rate": 6.077422204103819e-06, + "loss": 2.8563, + "step": 46205 + }, + { + "epoch": 3.1396928930561216, + "grad_norm": 8.535143852233887, + "learning_rate": 6.076997554015491e-06, + "loss": 2.823, + "step": 46210 + }, + { + "epoch": 3.1400326131267837, + "grad_norm": 6.500173568725586, + "learning_rate": 6.076572903927165e-06, + "loss": 3.2562, + "step": 46215 + }, + { + "epoch": 3.1403723331974454, + "grad_norm": 6.461174488067627, + "learning_rate": 6.076148253838837e-06, + "loss": 2.9012, + "step": 46220 + }, + { + "epoch": 3.140712053268107, + "grad_norm": 7.160241603851318, + "learning_rate": 6.07572360375051e-06, + "loss": 2.9344, + "step": 46225 + }, + { + "epoch": 3.141051773338769, + "grad_norm": 7.997501373291016, + "learning_rate": 6.075298953662183e-06, + "loss": 2.762, + "step": 46230 + }, + { + "epoch": 3.1413914934094307, + "grad_norm": 8.151817321777344, + "learning_rate": 6.074874303573855e-06, + "loss": 2.8318, + "step": 46235 + }, + { + "epoch": 3.1417312134800923, + "grad_norm": 6.282421588897705, + "learning_rate": 6.074449653485528e-06, + "loss": 2.8771, + "step": 46240 + }, + { + "epoch": 3.1420709335507544, + "grad_norm": 8.11064338684082, + "learning_rate": 6.074109933414867e-06, + "loss": 2.7964, + "step": 46245 + }, + { + "epoch": 3.142410653621416, + "grad_norm": 7.3299241065979, + "learning_rate": 6.073685283326539e-06, + "loss": 2.9185, + "step": 46250 + }, + { + "epoch": 3.1427503736920777, + "grad_norm": 8.990062713623047, + "learning_rate": 6.0732606332382125e-06, + "loss": 2.9612, + "step": 46255 + }, + { + "epoch": 3.1430900937627397, + "grad_norm": 8.324023246765137, + "learning_rate": 6.072835983149885e-06, + "loss": 2.9681, + "step": 46260 + }, + { + "epoch": 3.1434298138334014, + "grad_norm": 8.506744384765625, + "learning_rate": 6.072411333061557e-06, + "loss": 2.9149, + "step": 46265 + }, + { + "epoch": 3.143769533904063, + "grad_norm": 10.331487655639648, + "learning_rate": 6.071986682973231e-06, + "loss": 3.2131, + "step": 46270 + }, + { + "epoch": 3.1441092539747246, + "grad_norm": 8.48234748840332, + "learning_rate": 6.071562032884904e-06, + "loss": 3.1489, + "step": 46275 + }, + { + "epoch": 3.1444489740453867, + "grad_norm": 7.622289180755615, + "learning_rate": 6.071137382796576e-06, + "loss": 2.9669, + "step": 46280 + }, + { + "epoch": 3.1447886941160483, + "grad_norm": 7.448095321655273, + "learning_rate": 6.070712732708249e-06, + "loss": 2.9042, + "step": 46285 + }, + { + "epoch": 3.14512841418671, + "grad_norm": 5.4604597091674805, + "learning_rate": 6.070288082619922e-06, + "loss": 2.9979, + "step": 46290 + }, + { + "epoch": 3.145468134257372, + "grad_norm": 8.215002059936523, + "learning_rate": 6.069863432531594e-06, + "loss": 2.8582, + "step": 46295 + }, + { + "epoch": 3.1458078543280337, + "grad_norm": 7.865070343017578, + "learning_rate": 6.069438782443268e-06, + "loss": 3.1172, + "step": 46300 + }, + { + "epoch": 3.1461475743986953, + "grad_norm": 8.307150840759277, + "learning_rate": 6.06901413235494e-06, + "loss": 3.0926, + "step": 46305 + }, + { + "epoch": 3.1464872944693574, + "grad_norm": 7.508153915405273, + "learning_rate": 6.0685894822666125e-06, + "loss": 2.8094, + "step": 46310 + }, + { + "epoch": 3.146827014540019, + "grad_norm": 6.303588390350342, + "learning_rate": 6.068164832178286e-06, + "loss": 2.867, + "step": 46315 + }, + { + "epoch": 3.1471667346106806, + "grad_norm": 7.395394325256348, + "learning_rate": 6.067740182089958e-06, + "loss": 2.751, + "step": 46320 + }, + { + "epoch": 3.1475064546813427, + "grad_norm": 10.712591171264648, + "learning_rate": 6.067315532001631e-06, + "loss": 3.1696, + "step": 46325 + }, + { + "epoch": 3.1478461747520043, + "grad_norm": 7.232101917266846, + "learning_rate": 6.0668908819133045e-06, + "loss": 3.0694, + "step": 46330 + }, + { + "epoch": 3.148185894822666, + "grad_norm": 8.208572387695312, + "learning_rate": 6.0664662318249765e-06, + "loss": 2.9744, + "step": 46335 + }, + { + "epoch": 3.148525614893328, + "grad_norm": 8.259156227111816, + "learning_rate": 6.066041581736649e-06, + "loss": 2.8963, + "step": 46340 + }, + { + "epoch": 3.1488653349639897, + "grad_norm": 7.703058242797852, + "learning_rate": 6.065616931648323e-06, + "loss": 2.9701, + "step": 46345 + }, + { + "epoch": 3.1492050550346513, + "grad_norm": 6.666650295257568, + "learning_rate": 6.065192281559995e-06, + "loss": 2.9282, + "step": 46350 + }, + { + "epoch": 3.1495447751053134, + "grad_norm": 8.870184898376465, + "learning_rate": 6.064767631471668e-06, + "loss": 2.917, + "step": 46355 + }, + { + "epoch": 3.149884495175975, + "grad_norm": 8.4242582321167, + "learning_rate": 6.064342981383341e-06, + "loss": 3.0087, + "step": 46360 + }, + { + "epoch": 3.1502242152466366, + "grad_norm": 7.235738277435303, + "learning_rate": 6.063918331295013e-06, + "loss": 3.1365, + "step": 46365 + }, + { + "epoch": 3.1505639353172987, + "grad_norm": 9.651091575622559, + "learning_rate": 6.063493681206686e-06, + "loss": 2.8003, + "step": 46370 + }, + { + "epoch": 3.1509036553879604, + "grad_norm": 6.301943778991699, + "learning_rate": 6.063069031118359e-06, + "loss": 2.8494, + "step": 46375 + }, + { + "epoch": 3.151243375458622, + "grad_norm": 6.776623725891113, + "learning_rate": 6.062644381030032e-06, + "loss": 2.8343, + "step": 46380 + }, + { + "epoch": 3.151583095529284, + "grad_norm": 7.1866135597229, + "learning_rate": 6.0622197309417045e-06, + "loss": 3.0665, + "step": 46385 + }, + { + "epoch": 3.1519228155999457, + "grad_norm": 6.222047328948975, + "learning_rate": 6.061795080853377e-06, + "loss": 2.8907, + "step": 46390 + }, + { + "epoch": 3.1522625356706073, + "grad_norm": 6.645224094390869, + "learning_rate": 6.06137043076505e-06, + "loss": 2.9927, + "step": 46395 + }, + { + "epoch": 3.1526022557412694, + "grad_norm": 8.306278228759766, + "learning_rate": 6.060945780676722e-06, + "loss": 3.2216, + "step": 46400 + }, + { + "epoch": 3.152941975811931, + "grad_norm": 9.877921104431152, + "learning_rate": 6.060521130588396e-06, + "loss": 2.8224, + "step": 46405 + }, + { + "epoch": 3.1532816958825927, + "grad_norm": 5.17737340927124, + "learning_rate": 6.0600964805000685e-06, + "loss": 2.9332, + "step": 46410 + }, + { + "epoch": 3.1536214159532543, + "grad_norm": 7.022219657897949, + "learning_rate": 6.0596718304117405e-06, + "loss": 2.8417, + "step": 46415 + }, + { + "epoch": 3.1539611360239164, + "grad_norm": 9.049006462097168, + "learning_rate": 6.059247180323414e-06, + "loss": 3.008, + "step": 46420 + }, + { + "epoch": 3.154300856094578, + "grad_norm": 7.108543872833252, + "learning_rate": 6.058822530235087e-06, + "loss": 3.0781, + "step": 46425 + }, + { + "epoch": 3.1546405761652396, + "grad_norm": 6.736482620239258, + "learning_rate": 6.058397880146759e-06, + "loss": 2.9991, + "step": 46430 + }, + { + "epoch": 3.1549802962359017, + "grad_norm": 7.801517009735107, + "learning_rate": 6.0579732300584325e-06, + "loss": 2.7942, + "step": 46435 + }, + { + "epoch": 3.1553200163065633, + "grad_norm": 7.78257417678833, + "learning_rate": 6.057548579970105e-06, + "loss": 2.9777, + "step": 46440 + }, + { + "epoch": 3.155659736377225, + "grad_norm": 5.0668110847473145, + "learning_rate": 6.057123929881777e-06, + "loss": 3.0354, + "step": 46445 + }, + { + "epoch": 3.155999456447887, + "grad_norm": 6.0385050773620605, + "learning_rate": 6.056699279793451e-06, + "loss": 3.0746, + "step": 46450 + }, + { + "epoch": 3.1563391765185487, + "grad_norm": 7.752113342285156, + "learning_rate": 6.056274629705124e-06, + "loss": 3.1248, + "step": 46455 + }, + { + "epoch": 3.1566788965892103, + "grad_norm": 9.578398704528809, + "learning_rate": 6.055849979616796e-06, + "loss": 2.8635, + "step": 46460 + }, + { + "epoch": 3.1570186166598724, + "grad_norm": 4.671297550201416, + "learning_rate": 6.055425329528469e-06, + "loss": 2.8455, + "step": 46465 + }, + { + "epoch": 3.157358336730534, + "grad_norm": 7.780137538909912, + "learning_rate": 6.055000679440142e-06, + "loss": 3.1153, + "step": 46470 + }, + { + "epoch": 3.1576980568011956, + "grad_norm": 6.680250644683838, + "learning_rate": 6.054576029351814e-06, + "loss": 2.7582, + "step": 46475 + }, + { + "epoch": 3.1580377768718577, + "grad_norm": 6.7838640213012695, + "learning_rate": 6.054151379263488e-06, + "loss": 2.8488, + "step": 46480 + }, + { + "epoch": 3.1583774969425193, + "grad_norm": 5.987166404724121, + "learning_rate": 6.05372672917516e-06, + "loss": 2.9057, + "step": 46485 + }, + { + "epoch": 3.158717217013181, + "grad_norm": 6.925121784210205, + "learning_rate": 6.0533020790868325e-06, + "loss": 2.8928, + "step": 46490 + }, + { + "epoch": 3.159056937083843, + "grad_norm": 8.050283432006836, + "learning_rate": 6.052877428998506e-06, + "loss": 3.02, + "step": 46495 + }, + { + "epoch": 3.1593966571545047, + "grad_norm": 5.982475280761719, + "learning_rate": 6.052452778910178e-06, + "loss": 2.5674, + "step": 46500 + }, + { + "epoch": 3.1597363772251663, + "grad_norm": 8.216506004333496, + "learning_rate": 6.052028128821851e-06, + "loss": 2.9786, + "step": 46505 + }, + { + "epoch": 3.1600760972958284, + "grad_norm": 9.807909965515137, + "learning_rate": 6.0516034787335245e-06, + "loss": 3.0655, + "step": 46510 + }, + { + "epoch": 3.16041581736649, + "grad_norm": 8.484810829162598, + "learning_rate": 6.0511788286451965e-06, + "loss": 3.0583, + "step": 46515 + }, + { + "epoch": 3.1607555374371517, + "grad_norm": 7.016671180725098, + "learning_rate": 6.050754178556869e-06, + "loss": 3.1702, + "step": 46520 + }, + { + "epoch": 3.1610952575078137, + "grad_norm": 8.543699264526367, + "learning_rate": 6.050329528468543e-06, + "loss": 2.8732, + "step": 46525 + }, + { + "epoch": 3.1614349775784754, + "grad_norm": 6.640881061553955, + "learning_rate": 6.049904878380215e-06, + "loss": 2.9425, + "step": 46530 + }, + { + "epoch": 3.161774697649137, + "grad_norm": 5.4645538330078125, + "learning_rate": 6.0494802282918885e-06, + "loss": 3.0397, + "step": 46535 + }, + { + "epoch": 3.162114417719799, + "grad_norm": 8.326565742492676, + "learning_rate": 6.049055578203561e-06, + "loss": 2.7732, + "step": 46540 + }, + { + "epoch": 3.1624541377904607, + "grad_norm": 9.37065601348877, + "learning_rate": 6.048630928115233e-06, + "loss": 2.895, + "step": 46545 + }, + { + "epoch": 3.1627938578611223, + "grad_norm": 5.220396995544434, + "learning_rate": 6.048206278026907e-06, + "loss": 3.0291, + "step": 46550 + }, + { + "epoch": 3.1631335779317844, + "grad_norm": 9.318581581115723, + "learning_rate": 6.047781627938579e-06, + "loss": 2.9781, + "step": 46555 + }, + { + "epoch": 3.163473298002446, + "grad_norm": 8.529766082763672, + "learning_rate": 6.047356977850252e-06, + "loss": 2.9596, + "step": 46560 + }, + { + "epoch": 3.1638130180731077, + "grad_norm": 5.29815149307251, + "learning_rate": 6.046932327761925e-06, + "loss": 3.1014, + "step": 46565 + }, + { + "epoch": 3.1641527381437697, + "grad_norm": 7.807796955108643, + "learning_rate": 6.046507677673597e-06, + "loss": 2.8662, + "step": 46570 + }, + { + "epoch": 3.1644924582144314, + "grad_norm": 7.4746294021606445, + "learning_rate": 6.04608302758527e-06, + "loss": 3.1194, + "step": 46575 + }, + { + "epoch": 3.164832178285093, + "grad_norm": 6.2591729164123535, + "learning_rate": 6.045658377496944e-06, + "loss": 3.0567, + "step": 46580 + }, + { + "epoch": 3.165171898355755, + "grad_norm": 9.228411674499512, + "learning_rate": 6.045233727408616e-06, + "loss": 2.9994, + "step": 46585 + }, + { + "epoch": 3.1655116184264167, + "grad_norm": 8.24625301361084, + "learning_rate": 6.0448090773202885e-06, + "loss": 2.9206, + "step": 46590 + }, + { + "epoch": 3.1658513384970783, + "grad_norm": 7.560414791107178, + "learning_rate": 6.044384427231962e-06, + "loss": 3.1334, + "step": 46595 + }, + { + "epoch": 3.1661910585677404, + "grad_norm": 6.604122161865234, + "learning_rate": 6.043959777143634e-06, + "loss": 2.9577, + "step": 46600 + }, + { + "epoch": 3.166530778638402, + "grad_norm": 6.766440391540527, + "learning_rate": 6.043535127055307e-06, + "loss": 2.9993, + "step": 46605 + }, + { + "epoch": 3.1668704987090637, + "grad_norm": 8.622365951538086, + "learning_rate": 6.0431104769669805e-06, + "loss": 2.938, + "step": 46610 + }, + { + "epoch": 3.1672102187797253, + "grad_norm": 8.16295337677002, + "learning_rate": 6.0426858268786525e-06, + "loss": 2.9656, + "step": 46615 + }, + { + "epoch": 3.1675499388503874, + "grad_norm": 7.986705303192139, + "learning_rate": 6.042261176790325e-06, + "loss": 3.0539, + "step": 46620 + }, + { + "epoch": 3.167889658921049, + "grad_norm": 7.65809440612793, + "learning_rate": 6.041836526701998e-06, + "loss": 3.1985, + "step": 46625 + }, + { + "epoch": 3.1682293789917106, + "grad_norm": 6.807671070098877, + "learning_rate": 6.041411876613671e-06, + "loss": 2.8026, + "step": 46630 + }, + { + "epoch": 3.1685690990623727, + "grad_norm": 7.498118877410889, + "learning_rate": 6.040987226525344e-06, + "loss": 3.0469, + "step": 46635 + }, + { + "epoch": 3.1689088191330343, + "grad_norm": 6.427182674407959, + "learning_rate": 6.0405625764370165e-06, + "loss": 3.0633, + "step": 46640 + }, + { + "epoch": 3.169248539203696, + "grad_norm": 6.951352119445801, + "learning_rate": 6.040137926348689e-06, + "loss": 2.9754, + "step": 46645 + }, + { + "epoch": 3.169588259274358, + "grad_norm": 9.225484848022461, + "learning_rate": 6.039713276260361e-06, + "loss": 3.025, + "step": 46650 + }, + { + "epoch": 3.1699279793450197, + "grad_norm": 7.723385810852051, + "learning_rate": 6.039288626172035e-06, + "loss": 2.9698, + "step": 46655 + }, + { + "epoch": 3.1702676994156813, + "grad_norm": 6.969664096832275, + "learning_rate": 6.038863976083708e-06, + "loss": 3.1086, + "step": 46660 + }, + { + "epoch": 3.1706074194863434, + "grad_norm": 5.748724937438965, + "learning_rate": 6.03843932599538e-06, + "loss": 2.8908, + "step": 46665 + }, + { + "epoch": 3.170947139557005, + "grad_norm": 8.440788269042969, + "learning_rate": 6.038014675907053e-06, + "loss": 2.9595, + "step": 46670 + }, + { + "epoch": 3.1712868596276667, + "grad_norm": 8.300621032714844, + "learning_rate": 6.037590025818726e-06, + "loss": 2.9834, + "step": 46675 + }, + { + "epoch": 3.1716265796983287, + "grad_norm": 7.498384952545166, + "learning_rate": 6.037165375730398e-06, + "loss": 3.1174, + "step": 46680 + }, + { + "epoch": 3.1719662997689904, + "grad_norm": 7.434438705444336, + "learning_rate": 6.036740725642072e-06, + "loss": 2.8712, + "step": 46685 + }, + { + "epoch": 3.172306019839652, + "grad_norm": 10.847270011901855, + "learning_rate": 6.0363160755537445e-06, + "loss": 2.9058, + "step": 46690 + }, + { + "epoch": 3.172645739910314, + "grad_norm": 7.343774795532227, + "learning_rate": 6.0358914254654165e-06, + "loss": 3.1837, + "step": 46695 + }, + { + "epoch": 3.1729854599809757, + "grad_norm": 7.658304214477539, + "learning_rate": 6.03546677537709e-06, + "loss": 2.8629, + "step": 46700 + }, + { + "epoch": 3.1733251800516373, + "grad_norm": 7.742221832275391, + "learning_rate": 6.035042125288763e-06, + "loss": 2.8944, + "step": 46705 + }, + { + "epoch": 3.1736649001222994, + "grad_norm": 7.551424980163574, + "learning_rate": 6.034617475200435e-06, + "loss": 2.9359, + "step": 46710 + }, + { + "epoch": 3.174004620192961, + "grad_norm": 10.966901779174805, + "learning_rate": 6.0341928251121085e-06, + "loss": 3.1904, + "step": 46715 + }, + { + "epoch": 3.1743443402636227, + "grad_norm": 6.511794090270996, + "learning_rate": 6.0337681750237805e-06, + "loss": 3.0112, + "step": 46720 + }, + { + "epoch": 3.1746840603342847, + "grad_norm": 6.971020221710205, + "learning_rate": 6.033343524935453e-06, + "loss": 2.8682, + "step": 46725 + }, + { + "epoch": 3.1750237804049464, + "grad_norm": 7.216233730316162, + "learning_rate": 6.032918874847127e-06, + "loss": 2.9735, + "step": 46730 + }, + { + "epoch": 3.175363500475608, + "grad_norm": 8.157943725585938, + "learning_rate": 6.032494224758799e-06, + "loss": 2.9192, + "step": 46735 + }, + { + "epoch": 3.17570322054627, + "grad_norm": 8.5066556930542, + "learning_rate": 6.032069574670472e-06, + "loss": 3.0343, + "step": 46740 + }, + { + "epoch": 3.1760429406169317, + "grad_norm": 8.245861053466797, + "learning_rate": 6.031644924582145e-06, + "loss": 3.2352, + "step": 46745 + }, + { + "epoch": 3.1763826606875933, + "grad_norm": 6.17750358581543, + "learning_rate": 6.031220274493817e-06, + "loss": 3.0137, + "step": 46750 + }, + { + "epoch": 3.176722380758255, + "grad_norm": 5.487748622894287, + "learning_rate": 6.03079562440549e-06, + "loss": 2.6985, + "step": 46755 + }, + { + "epoch": 3.177062100828917, + "grad_norm": 7.255011081695557, + "learning_rate": 6.030370974317164e-06, + "loss": 3.0163, + "step": 46760 + }, + { + "epoch": 3.1774018208995787, + "grad_norm": 6.487035274505615, + "learning_rate": 6.029946324228836e-06, + "loss": 2.7967, + "step": 46765 + }, + { + "epoch": 3.1777415409702403, + "grad_norm": 7.309234619140625, + "learning_rate": 6.0295216741405085e-06, + "loss": 2.9337, + "step": 46770 + }, + { + "epoch": 3.1780812610409024, + "grad_norm": 6.774007797241211, + "learning_rate": 6.029097024052182e-06, + "loss": 2.7114, + "step": 46775 + }, + { + "epoch": 3.178420981111564, + "grad_norm": 6.329509735107422, + "learning_rate": 6.028672373963854e-06, + "loss": 2.9865, + "step": 46780 + }, + { + "epoch": 3.1787607011822256, + "grad_norm": 8.36528491973877, + "learning_rate": 6.028247723875527e-06, + "loss": 2.9186, + "step": 46785 + }, + { + "epoch": 3.1791004212528877, + "grad_norm": 7.530045509338379, + "learning_rate": 6.0278230737872005e-06, + "loss": 3.2279, + "step": 46790 + }, + { + "epoch": 3.1794401413235494, + "grad_norm": 6.452785015106201, + "learning_rate": 6.0273984236988725e-06, + "loss": 2.9181, + "step": 46795 + }, + { + "epoch": 3.179779861394211, + "grad_norm": 9.58105182647705, + "learning_rate": 6.026973773610545e-06, + "loss": 2.7851, + "step": 46800 + }, + { + "epoch": 3.180119581464873, + "grad_norm": 5.896331787109375, + "learning_rate": 6.026549123522218e-06, + "loss": 2.7377, + "step": 46805 + }, + { + "epoch": 3.1804593015355347, + "grad_norm": 9.709874153137207, + "learning_rate": 6.026124473433891e-06, + "loss": 2.8542, + "step": 46810 + }, + { + "epoch": 3.1807990216061963, + "grad_norm": 7.224583625793457, + "learning_rate": 6.025699823345563e-06, + "loss": 2.7508, + "step": 46815 + }, + { + "epoch": 3.1811387416768584, + "grad_norm": 7.640529632568359, + "learning_rate": 6.0252751732572365e-06, + "loss": 2.8207, + "step": 46820 + }, + { + "epoch": 3.18147846174752, + "grad_norm": 6.208957195281982, + "learning_rate": 6.024850523168909e-06, + "loss": 3.0291, + "step": 46825 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 6.566804885864258, + "learning_rate": 6.024425873080581e-06, + "loss": 2.9275, + "step": 46830 + }, + { + "epoch": 3.1821579018888437, + "grad_norm": 5.610049247741699, + "learning_rate": 6.024001222992255e-06, + "loss": 2.9572, + "step": 46835 + }, + { + "epoch": 3.1824976219595054, + "grad_norm": 8.722081184387207, + "learning_rate": 6.023576572903928e-06, + "loss": 2.8495, + "step": 46840 + }, + { + "epoch": 3.182837342030167, + "grad_norm": 6.741726398468018, + "learning_rate": 6.0231519228156e-06, + "loss": 2.8588, + "step": 46845 + }, + { + "epoch": 3.183177062100829, + "grad_norm": 6.546483039855957, + "learning_rate": 6.022727272727273e-06, + "loss": 2.9679, + "step": 46850 + }, + { + "epoch": 3.1835167821714907, + "grad_norm": 6.665781497955322, + "learning_rate": 6.022302622638946e-06, + "loss": 2.9978, + "step": 46855 + }, + { + "epoch": 3.1838565022421523, + "grad_norm": 8.887256622314453, + "learning_rate": 6.021877972550618e-06, + "loss": 3.0994, + "step": 46860 + }, + { + "epoch": 3.1841962223128144, + "grad_norm": 7.851647853851318, + "learning_rate": 6.021453322462292e-06, + "loss": 2.922, + "step": 46865 + }, + { + "epoch": 3.184535942383476, + "grad_norm": 6.8227620124816895, + "learning_rate": 6.0210286723739645e-06, + "loss": 3.0651, + "step": 46870 + }, + { + "epoch": 3.1848756624541377, + "grad_norm": 7.389965534210205, + "learning_rate": 6.020604022285637e-06, + "loss": 3.008, + "step": 46875 + }, + { + "epoch": 3.1852153825247997, + "grad_norm": 5.912023544311523, + "learning_rate": 6.02017937219731e-06, + "loss": 2.9901, + "step": 46880 + }, + { + "epoch": 3.1855551025954614, + "grad_norm": 7.059504508972168, + "learning_rate": 6.019754722108983e-06, + "loss": 3.2504, + "step": 46885 + }, + { + "epoch": 3.185894822666123, + "grad_norm": 8.392033576965332, + "learning_rate": 6.019330072020656e-06, + "loss": 2.9478, + "step": 46890 + }, + { + "epoch": 3.186234542736785, + "grad_norm": 7.753514766693115, + "learning_rate": 6.0189054219323285e-06, + "loss": 2.8155, + "step": 46895 + }, + { + "epoch": 3.1865742628074467, + "grad_norm": 7.746694564819336, + "learning_rate": 6.0184807718440004e-06, + "loss": 3.0803, + "step": 46900 + }, + { + "epoch": 3.1869139828781083, + "grad_norm": 7.380273818969727, + "learning_rate": 6.018056121755674e-06, + "loss": 2.8925, + "step": 46905 + }, + { + "epoch": 3.1872537029487704, + "grad_norm": 6.902713298797607, + "learning_rate": 6.017631471667347e-06, + "loss": 3.0935, + "step": 46910 + }, + { + "epoch": 3.187593423019432, + "grad_norm": 7.1658244132995605, + "learning_rate": 6.017206821579019e-06, + "loss": 2.8979, + "step": 46915 + }, + { + "epoch": 3.1879331430900937, + "grad_norm": 7.391925811767578, + "learning_rate": 6.0167821714906925e-06, + "loss": 3.1454, + "step": 46920 + }, + { + "epoch": 3.1882728631607558, + "grad_norm": 6.804670333862305, + "learning_rate": 6.016357521402365e-06, + "loss": 3.1014, + "step": 46925 + }, + { + "epoch": 3.1886125832314174, + "grad_norm": 6.942734718322754, + "learning_rate": 6.015932871314037e-06, + "loss": 2.9724, + "step": 46930 + }, + { + "epoch": 3.188952303302079, + "grad_norm": 8.59958553314209, + "learning_rate": 6.015508221225711e-06, + "loss": 2.8741, + "step": 46935 + }, + { + "epoch": 3.189292023372741, + "grad_norm": 8.809890747070312, + "learning_rate": 6.015083571137384e-06, + "loss": 3.0347, + "step": 46940 + }, + { + "epoch": 3.1896317434434027, + "grad_norm": 5.819733142852783, + "learning_rate": 6.014658921049056e-06, + "loss": 2.7988, + "step": 46945 + }, + { + "epoch": 3.1899714635140644, + "grad_norm": 6.010608196258545, + "learning_rate": 6.014234270960729e-06, + "loss": 2.8805, + "step": 46950 + }, + { + "epoch": 3.190311183584726, + "grad_norm": 9.10186767578125, + "learning_rate": 6.013809620872402e-06, + "loss": 2.8522, + "step": 46955 + }, + { + "epoch": 3.190650903655388, + "grad_norm": 6.482830047607422, + "learning_rate": 6.013384970784074e-06, + "loss": 2.6648, + "step": 46960 + }, + { + "epoch": 3.1909906237260497, + "grad_norm": 8.884810447692871, + "learning_rate": 6.012960320695748e-06, + "loss": 2.922, + "step": 46965 + }, + { + "epoch": 3.1913303437967113, + "grad_norm": 9.06906509399414, + "learning_rate": 6.01253567060742e-06, + "loss": 2.782, + "step": 46970 + }, + { + "epoch": 3.1916700638673734, + "grad_norm": 7.3541436195373535, + "learning_rate": 6.0121110205190924e-06, + "loss": 3.0048, + "step": 46975 + }, + { + "epoch": 3.192009783938035, + "grad_norm": 7.4913201332092285, + "learning_rate": 6.011686370430766e-06, + "loss": 2.8138, + "step": 46980 + }, + { + "epoch": 3.1923495040086967, + "grad_norm": 7.52366304397583, + "learning_rate": 6.011261720342438e-06, + "loss": 2.9292, + "step": 46985 + }, + { + "epoch": 3.1926892240793587, + "grad_norm": 7.978975772857666, + "learning_rate": 6.010837070254111e-06, + "loss": 2.9278, + "step": 46990 + }, + { + "epoch": 3.1930289441500204, + "grad_norm": 7.111487865447998, + "learning_rate": 6.0104124201657845e-06, + "loss": 2.9641, + "step": 46995 + }, + { + "epoch": 3.193368664220682, + "grad_norm": 5.550787448883057, + "learning_rate": 6.0099877700774565e-06, + "loss": 2.8709, + "step": 47000 + }, + { + "epoch": 3.193708384291344, + "grad_norm": 6.293915748596191, + "learning_rate": 6.009563119989129e-06, + "loss": 3.1256, + "step": 47005 + }, + { + "epoch": 3.1940481043620057, + "grad_norm": 6.453405380249023, + "learning_rate": 6.009138469900803e-06, + "loss": 2.8193, + "step": 47010 + }, + { + "epoch": 3.1943878244326673, + "grad_norm": 7.527571201324463, + "learning_rate": 6.008713819812475e-06, + "loss": 2.8725, + "step": 47015 + }, + { + "epoch": 3.1947275445033294, + "grad_norm": 7.012632369995117, + "learning_rate": 6.008289169724148e-06, + "loss": 2.8592, + "step": 47020 + }, + { + "epoch": 3.195067264573991, + "grad_norm": 6.937170505523682, + "learning_rate": 6.007864519635821e-06, + "loss": 2.9534, + "step": 47025 + }, + { + "epoch": 3.1954069846446527, + "grad_norm": 7.549351215362549, + "learning_rate": 6.007439869547493e-06, + "loss": 2.9538, + "step": 47030 + }, + { + "epoch": 3.1957467047153147, + "grad_norm": 6.528261184692383, + "learning_rate": 6.007015219459166e-06, + "loss": 2.8765, + "step": 47035 + }, + { + "epoch": 3.1960864247859764, + "grad_norm": 8.43447494506836, + "learning_rate": 6.00659056937084e-06, + "loss": 2.9965, + "step": 47040 + }, + { + "epoch": 3.196426144856638, + "grad_norm": 6.561512470245361, + "learning_rate": 6.006165919282512e-06, + "loss": 2.7398, + "step": 47045 + }, + { + "epoch": 3.1967658649273, + "grad_norm": 7.859545707702637, + "learning_rate": 6.0057412691941845e-06, + "loss": 2.8072, + "step": 47050 + }, + { + "epoch": 3.1971055849979617, + "grad_norm": 6.299407958984375, + "learning_rate": 6.005316619105857e-06, + "loss": 2.9145, + "step": 47055 + }, + { + "epoch": 3.1974453050686233, + "grad_norm": 9.610539436340332, + "learning_rate": 6.00489196901753e-06, + "loss": 2.8651, + "step": 47060 + }, + { + "epoch": 3.1977850251392854, + "grad_norm": 7.0506792068481445, + "learning_rate": 6.004467318929202e-06, + "loss": 3.1029, + "step": 47065 + }, + { + "epoch": 3.198124745209947, + "grad_norm": 7.789498805999756, + "learning_rate": 6.004042668840876e-06, + "loss": 2.8878, + "step": 47070 + }, + { + "epoch": 3.1984644652806087, + "grad_norm": 6.01115608215332, + "learning_rate": 6.0036180187525485e-06, + "loss": 3.1562, + "step": 47075 + }, + { + "epoch": 3.1988041853512708, + "grad_norm": 7.7505621910095215, + "learning_rate": 6.00319336866422e-06, + "loss": 2.8194, + "step": 47080 + }, + { + "epoch": 3.1991439054219324, + "grad_norm": 7.711774826049805, + "learning_rate": 6.002768718575894e-06, + "loss": 3.0428, + "step": 47085 + }, + { + "epoch": 3.199483625492594, + "grad_norm": 7.70568323135376, + "learning_rate": 6.002344068487567e-06, + "loss": 3.0012, + "step": 47090 + }, + { + "epoch": 3.1998233455632556, + "grad_norm": 6.616424083709717, + "learning_rate": 6.001919418399239e-06, + "loss": 2.8096, + "step": 47095 + }, + { + "epoch": 3.2001630656339177, + "grad_norm": 5.572930812835693, + "learning_rate": 6.0014947683109125e-06, + "loss": 2.8521, + "step": 47100 + }, + { + "epoch": 3.2005027857045794, + "grad_norm": 7.1588544845581055, + "learning_rate": 6.001070118222585e-06, + "loss": 2.8493, + "step": 47105 + }, + { + "epoch": 3.200842505775241, + "grad_norm": 6.278040885925293, + "learning_rate": 6.000645468134257e-06, + "loss": 3.2577, + "step": 47110 + }, + { + "epoch": 3.201182225845903, + "grad_norm": 6.699854850769043, + "learning_rate": 6.000220818045931e-06, + "loss": 2.9886, + "step": 47115 + }, + { + "epoch": 3.2015219459165647, + "grad_norm": 7.058969020843506, + "learning_rate": 5.999796167957604e-06, + "loss": 2.9706, + "step": 47120 + }, + { + "epoch": 3.2018616659872263, + "grad_norm": 6.157156944274902, + "learning_rate": 5.999371517869276e-06, + "loss": 2.6966, + "step": 47125 + }, + { + "epoch": 3.2022013860578884, + "grad_norm": 5.915049076080322, + "learning_rate": 5.998946867780949e-06, + "loss": 3.0858, + "step": 47130 + }, + { + "epoch": 3.20254110612855, + "grad_norm": 8.633157730102539, + "learning_rate": 5.998522217692622e-06, + "loss": 3.1437, + "step": 47135 + }, + { + "epoch": 3.2028808261992117, + "grad_norm": 7.573117256164551, + "learning_rate": 5.998097567604294e-06, + "loss": 2.9934, + "step": 47140 + }, + { + "epoch": 3.2032205462698737, + "grad_norm": 5.516845226287842, + "learning_rate": 5.997672917515968e-06, + "loss": 2.9424, + "step": 47145 + }, + { + "epoch": 3.2035602663405354, + "grad_norm": 7.671825408935547, + "learning_rate": 5.99724826742764e-06, + "loss": 2.9941, + "step": 47150 + }, + { + "epoch": 3.203899986411197, + "grad_norm": 6.002398490905762, + "learning_rate": 5.9968236173393124e-06, + "loss": 3.1478, + "step": 47155 + }, + { + "epoch": 3.204239706481859, + "grad_norm": 7.101724624633789, + "learning_rate": 5.996398967250986e-06, + "loss": 3.0632, + "step": 47160 + }, + { + "epoch": 3.2045794265525207, + "grad_norm": 7.151246070861816, + "learning_rate": 5.995974317162658e-06, + "loss": 2.9432, + "step": 47165 + }, + { + "epoch": 3.2049191466231823, + "grad_norm": 6.931392669677734, + "learning_rate": 5.995549667074331e-06, + "loss": 3.0172, + "step": 47170 + }, + { + "epoch": 3.2052588666938444, + "grad_norm": 6.290801525115967, + "learning_rate": 5.9951250169860045e-06, + "loss": 2.8925, + "step": 47175 + }, + { + "epoch": 3.205598586764506, + "grad_norm": 6.825459957122803, + "learning_rate": 5.9947003668976764e-06, + "loss": 3.1296, + "step": 47180 + }, + { + "epoch": 3.2059383068351677, + "grad_norm": 8.12112045288086, + "learning_rate": 5.994275716809349e-06, + "loss": 2.8235, + "step": 47185 + }, + { + "epoch": 3.2062780269058297, + "grad_norm": 8.371085166931152, + "learning_rate": 5.993851066721023e-06, + "loss": 2.8562, + "step": 47190 + }, + { + "epoch": 3.2066177469764914, + "grad_norm": 7.169764518737793, + "learning_rate": 5.993426416632695e-06, + "loss": 2.889, + "step": 47195 + }, + { + "epoch": 3.206957467047153, + "grad_norm": 6.383420944213867, + "learning_rate": 5.993001766544368e-06, + "loss": 2.7284, + "step": 47200 + }, + { + "epoch": 3.207297187117815, + "grad_norm": 8.167241096496582, + "learning_rate": 5.992577116456041e-06, + "loss": 2.8642, + "step": 47205 + }, + { + "epoch": 3.2076369071884767, + "grad_norm": 5.603298664093018, + "learning_rate": 5.992152466367713e-06, + "loss": 3.0173, + "step": 47210 + }, + { + "epoch": 3.2079766272591383, + "grad_norm": 6.115334987640381, + "learning_rate": 5.991727816279387e-06, + "loss": 2.6431, + "step": 47215 + }, + { + "epoch": 3.2083163473298004, + "grad_norm": 7.670407295227051, + "learning_rate": 5.991303166191059e-06, + "loss": 2.7904, + "step": 47220 + }, + { + "epoch": 3.208656067400462, + "grad_norm": 6.404486656188965, + "learning_rate": 5.990878516102732e-06, + "loss": 2.9053, + "step": 47225 + }, + { + "epoch": 3.2089957874711237, + "grad_norm": 6.641207218170166, + "learning_rate": 5.990453866014405e-06, + "loss": 2.5865, + "step": 47230 + }, + { + "epoch": 3.2093355075417858, + "grad_norm": 9.485172271728516, + "learning_rate": 5.990029215926077e-06, + "loss": 3.0682, + "step": 47235 + }, + { + "epoch": 3.2096752276124474, + "grad_norm": 7.549869060516357, + "learning_rate": 5.98960456583775e-06, + "loss": 2.9655, + "step": 47240 + }, + { + "epoch": 3.210014947683109, + "grad_norm": 6.803981304168701, + "learning_rate": 5.989179915749424e-06, + "loss": 3.0714, + "step": 47245 + }, + { + "epoch": 3.210354667753771, + "grad_norm": 8.023756980895996, + "learning_rate": 5.988755265661096e-06, + "loss": 3.12, + "step": 47250 + }, + { + "epoch": 3.2106943878244327, + "grad_norm": 6.362231254577637, + "learning_rate": 5.9883306155727684e-06, + "loss": 2.8958, + "step": 47255 + }, + { + "epoch": 3.2110341078950944, + "grad_norm": 7.307496547698975, + "learning_rate": 5.987905965484442e-06, + "loss": 3.0627, + "step": 47260 + }, + { + "epoch": 3.2113738279657564, + "grad_norm": 7.491209030151367, + "learning_rate": 5.987481315396114e-06, + "loss": 2.7096, + "step": 47265 + }, + { + "epoch": 3.211713548036418, + "grad_norm": 5.643599033355713, + "learning_rate": 5.987056665307787e-06, + "loss": 2.9208, + "step": 47270 + }, + { + "epoch": 3.2120532681070797, + "grad_norm": 10.285829544067383, + "learning_rate": 5.9866320152194605e-06, + "loss": 2.8466, + "step": 47275 + }, + { + "epoch": 3.2123929881777418, + "grad_norm": 7.947960376739502, + "learning_rate": 5.9862073651311325e-06, + "loss": 2.9172, + "step": 47280 + }, + { + "epoch": 3.2127327082484034, + "grad_norm": 6.14763069152832, + "learning_rate": 5.985782715042805e-06, + "loss": 2.8678, + "step": 47285 + }, + { + "epoch": 3.213072428319065, + "grad_norm": 9.537172317504883, + "learning_rate": 5.985358064954478e-06, + "loss": 2.943, + "step": 47290 + }, + { + "epoch": 3.2134121483897267, + "grad_norm": 6.200684070587158, + "learning_rate": 5.984933414866151e-06, + "loss": 2.818, + "step": 47295 + }, + { + "epoch": 3.2137518684603887, + "grad_norm": 7.48037576675415, + "learning_rate": 5.984508764777824e-06, + "loss": 3.1071, + "step": 47300 + }, + { + "epoch": 3.2140915885310504, + "grad_norm": 5.7591328620910645, + "learning_rate": 5.9840841146894965e-06, + "loss": 2.7258, + "step": 47305 + }, + { + "epoch": 3.214431308601712, + "grad_norm": 5.824651718139648, + "learning_rate": 5.983659464601169e-06, + "loss": 2.6873, + "step": 47310 + }, + { + "epoch": 3.214771028672374, + "grad_norm": 6.510841369628906, + "learning_rate": 5.983234814512841e-06, + "loss": 2.8706, + "step": 47315 + }, + { + "epoch": 3.2151107487430357, + "grad_norm": 6.7928924560546875, + "learning_rate": 5.982810164424515e-06, + "loss": 3.0634, + "step": 47320 + }, + { + "epoch": 3.2154504688136973, + "grad_norm": 6.325881004333496, + "learning_rate": 5.982385514336188e-06, + "loss": 3.0306, + "step": 47325 + }, + { + "epoch": 3.2157901888843594, + "grad_norm": 9.975218772888184, + "learning_rate": 5.98196086424786e-06, + "loss": 2.8109, + "step": 47330 + }, + { + "epoch": 3.216129908955021, + "grad_norm": 6.867137432098389, + "learning_rate": 5.981536214159533e-06, + "loss": 2.8738, + "step": 47335 + }, + { + "epoch": 3.2164696290256827, + "grad_norm": 6.329349040985107, + "learning_rate": 5.981111564071206e-06, + "loss": 2.8792, + "step": 47340 + }, + { + "epoch": 3.2168093490963448, + "grad_norm": 7.637319564819336, + "learning_rate": 5.980686913982878e-06, + "loss": 2.6568, + "step": 47345 + }, + { + "epoch": 3.2171490691670064, + "grad_norm": 6.28972864151001, + "learning_rate": 5.980262263894552e-06, + "loss": 2.914, + "step": 47350 + }, + { + "epoch": 3.217488789237668, + "grad_norm": 6.168192386627197, + "learning_rate": 5.9798376138062245e-06, + "loss": 2.7391, + "step": 47355 + }, + { + "epoch": 3.21782850930833, + "grad_norm": 6.6686482429504395, + "learning_rate": 5.979412963717896e-06, + "loss": 2.8885, + "step": 47360 + }, + { + "epoch": 3.2181682293789917, + "grad_norm": 7.695548057556152, + "learning_rate": 5.97898831362957e-06, + "loss": 2.7332, + "step": 47365 + }, + { + "epoch": 3.2185079494496533, + "grad_norm": 6.9059929847717285, + "learning_rate": 5.978563663541243e-06, + "loss": 3.0163, + "step": 47370 + }, + { + "epoch": 3.2188476695203154, + "grad_norm": 6.990141868591309, + "learning_rate": 5.978139013452915e-06, + "loss": 2.9028, + "step": 47375 + }, + { + "epoch": 3.219187389590977, + "grad_norm": 5.93409538269043, + "learning_rate": 5.9777143633645885e-06, + "loss": 3.1109, + "step": 47380 + }, + { + "epoch": 3.2195271096616387, + "grad_norm": 6.7555437088012695, + "learning_rate": 5.97728971327626e-06, + "loss": 2.5534, + "step": 47385 + }, + { + "epoch": 3.2198668297323008, + "grad_norm": 5.448732852935791, + "learning_rate": 5.976865063187933e-06, + "loss": 3.0377, + "step": 47390 + }, + { + "epoch": 3.2202065498029624, + "grad_norm": 8.48669719696045, + "learning_rate": 5.976440413099607e-06, + "loss": 3.0967, + "step": 47395 + }, + { + "epoch": 3.220546269873624, + "grad_norm": 6.367254734039307, + "learning_rate": 5.976015763011279e-06, + "loss": 2.8925, + "step": 47400 + }, + { + "epoch": 3.220885989944286, + "grad_norm": 9.079215049743652, + "learning_rate": 5.975591112922952e-06, + "loss": 2.9359, + "step": 47405 + }, + { + "epoch": 3.2212257100149477, + "grad_norm": 6.668749809265137, + "learning_rate": 5.975166462834625e-06, + "loss": 2.9868, + "step": 47410 + }, + { + "epoch": 3.2215654300856094, + "grad_norm": 7.170860290527344, + "learning_rate": 5.974741812746297e-06, + "loss": 3.0777, + "step": 47415 + }, + { + "epoch": 3.2219051501562714, + "grad_norm": 7.87964391708374, + "learning_rate": 5.97431716265797e-06, + "loss": 2.7883, + "step": 47420 + }, + { + "epoch": 3.222244870226933, + "grad_norm": 8.23336410522461, + "learning_rate": 5.973892512569644e-06, + "loss": 2.805, + "step": 47425 + }, + { + "epoch": 3.2225845902975947, + "grad_norm": 10.705235481262207, + "learning_rate": 5.973467862481316e-06, + "loss": 2.6394, + "step": 47430 + }, + { + "epoch": 3.2229243103682563, + "grad_norm": 9.240042686462402, + "learning_rate": 5.9730432123929884e-06, + "loss": 2.9313, + "step": 47435 + }, + { + "epoch": 3.2232640304389184, + "grad_norm": 7.473839282989502, + "learning_rate": 5.972618562304662e-06, + "loss": 2.9231, + "step": 47440 + }, + { + "epoch": 3.22360375050958, + "grad_norm": 6.643666744232178, + "learning_rate": 5.972193912216334e-06, + "loss": 2.9981, + "step": 47445 + }, + { + "epoch": 3.2239434705802417, + "grad_norm": 7.202516555786133, + "learning_rate": 5.971769262128007e-06, + "loss": 2.8824, + "step": 47450 + }, + { + "epoch": 3.2242831906509037, + "grad_norm": 6.232826232910156, + "learning_rate": 5.9713446120396805e-06, + "loss": 3.0651, + "step": 47455 + }, + { + "epoch": 3.2246229107215654, + "grad_norm": 7.9958014488220215, + "learning_rate": 5.9709199619513524e-06, + "loss": 2.8826, + "step": 47460 + }, + { + "epoch": 3.224962630792227, + "grad_norm": 6.636288642883301, + "learning_rate": 5.970495311863025e-06, + "loss": 2.9197, + "step": 47465 + }, + { + "epoch": 3.225302350862889, + "grad_norm": 7.049856662750244, + "learning_rate": 5.970070661774698e-06, + "loss": 3.014, + "step": 47470 + }, + { + "epoch": 3.2256420709335507, + "grad_norm": 5.551400184631348, + "learning_rate": 5.969646011686371e-06, + "loss": 2.8937, + "step": 47475 + }, + { + "epoch": 3.2259817910042123, + "grad_norm": 6.114572048187256, + "learning_rate": 5.969221361598044e-06, + "loss": 2.9657, + "step": 47480 + }, + { + "epoch": 3.2263215110748744, + "grad_norm": 5.713433265686035, + "learning_rate": 5.9687967115097164e-06, + "loss": 3.0358, + "step": 47485 + }, + { + "epoch": 3.226661231145536, + "grad_norm": 7.071638584136963, + "learning_rate": 5.968372061421389e-06, + "loss": 2.9659, + "step": 47490 + }, + { + "epoch": 3.2270009512161977, + "grad_norm": 8.403432846069336, + "learning_rate": 5.967947411333061e-06, + "loss": 2.7078, + "step": 47495 + }, + { + "epoch": 3.2273406712868598, + "grad_norm": 8.782109260559082, + "learning_rate": 5.967522761244735e-06, + "loss": 2.7545, + "step": 47500 + }, + { + "epoch": 3.2276803913575214, + "grad_norm": 6.97522497177124, + "learning_rate": 5.967098111156408e-06, + "loss": 3.0946, + "step": 47505 + }, + { + "epoch": 3.228020111428183, + "grad_norm": 6.6284589767456055, + "learning_rate": 5.96667346106808e-06, + "loss": 2.9799, + "step": 47510 + }, + { + "epoch": 3.228359831498845, + "grad_norm": 6.503807067871094, + "learning_rate": 5.966248810979753e-06, + "loss": 3.1487, + "step": 47515 + }, + { + "epoch": 3.2286995515695067, + "grad_norm": 5.458562850952148, + "learning_rate": 5.965824160891426e-06, + "loss": 2.7975, + "step": 47520 + }, + { + "epoch": 3.2290392716401684, + "grad_norm": 5.670429706573486, + "learning_rate": 5.965399510803098e-06, + "loss": 2.8841, + "step": 47525 + }, + { + "epoch": 3.2293789917108304, + "grad_norm": 6.445399284362793, + "learning_rate": 5.964974860714772e-06, + "loss": 3.0364, + "step": 47530 + }, + { + "epoch": 3.229718711781492, + "grad_norm": 8.37863826751709, + "learning_rate": 5.9645502106264444e-06, + "loss": 2.6872, + "step": 47535 + }, + { + "epoch": 3.2300584318521537, + "grad_norm": 8.512499809265137, + "learning_rate": 5.964125560538116e-06, + "loss": 2.8493, + "step": 47540 + }, + { + "epoch": 3.2303981519228158, + "grad_norm": 6.1106767654418945, + "learning_rate": 5.96370091044979e-06, + "loss": 2.9132, + "step": 47545 + }, + { + "epoch": 3.2307378719934774, + "grad_norm": 6.790396690368652, + "learning_rate": 5.963276260361463e-06, + "loss": 2.5946, + "step": 47550 + }, + { + "epoch": 3.231077592064139, + "grad_norm": 8.002788543701172, + "learning_rate": 5.962851610273136e-06, + "loss": 2.8765, + "step": 47555 + }, + { + "epoch": 3.231417312134801, + "grad_norm": 7.1084747314453125, + "learning_rate": 5.9624269601848084e-06, + "loss": 2.8277, + "step": 47560 + }, + { + "epoch": 3.2317570322054627, + "grad_norm": 5.710435390472412, + "learning_rate": 5.96200231009648e-06, + "loss": 2.9329, + "step": 47565 + }, + { + "epoch": 3.2320967522761244, + "grad_norm": 6.241621494293213, + "learning_rate": 5.961577660008154e-06, + "loss": 2.8934, + "step": 47570 + }, + { + "epoch": 3.2324364723467864, + "grad_norm": 6.746476173400879, + "learning_rate": 5.961153009919827e-06, + "loss": 3.0102, + "step": 47575 + }, + { + "epoch": 3.232776192417448, + "grad_norm": 7.604429244995117, + "learning_rate": 5.960728359831499e-06, + "loss": 2.8286, + "step": 47580 + }, + { + "epoch": 3.2331159124881097, + "grad_norm": 8.862712860107422, + "learning_rate": 5.9603037097431725e-06, + "loss": 3.1652, + "step": 47585 + }, + { + "epoch": 3.2334556325587718, + "grad_norm": 6.6295270919799805, + "learning_rate": 5.959879059654845e-06, + "loss": 3.1222, + "step": 47590 + }, + { + "epoch": 3.2337953526294334, + "grad_norm": 6.957479953765869, + "learning_rate": 5.959454409566517e-06, + "loss": 2.6962, + "step": 47595 + }, + { + "epoch": 3.234135072700095, + "grad_norm": 7.213087558746338, + "learning_rate": 5.959029759478191e-06, + "loss": 3.1065, + "step": 47600 + }, + { + "epoch": 3.234474792770757, + "grad_norm": 8.49256706237793, + "learning_rate": 5.958605109389864e-06, + "loss": 3.19, + "step": 47605 + }, + { + "epoch": 3.2348145128414187, + "grad_norm": 6.79567813873291, + "learning_rate": 5.958180459301536e-06, + "loss": 2.7147, + "step": 47610 + }, + { + "epoch": 3.2351542329120804, + "grad_norm": 7.79114294052124, + "learning_rate": 5.957755809213209e-06, + "loss": 2.9802, + "step": 47615 + }, + { + "epoch": 3.2354939529827424, + "grad_norm": 5.179914951324463, + "learning_rate": 5.957331159124882e-06, + "loss": 3.1276, + "step": 47620 + }, + { + "epoch": 3.235833673053404, + "grad_norm": 9.6448335647583, + "learning_rate": 5.956906509036554e-06, + "loss": 2.8251, + "step": 47625 + }, + { + "epoch": 3.2361733931240657, + "grad_norm": 6.84242057800293, + "learning_rate": 5.956481858948228e-06, + "loss": 3.1004, + "step": 47630 + }, + { + "epoch": 3.2365131131947273, + "grad_norm": 6.884141445159912, + "learning_rate": 5.9560572088599e-06, + "loss": 2.957, + "step": 47635 + }, + { + "epoch": 3.2368528332653894, + "grad_norm": 5.701085090637207, + "learning_rate": 5.955632558771572e-06, + "loss": 3.0423, + "step": 47640 + }, + { + "epoch": 3.237192553336051, + "grad_norm": 7.1036152839660645, + "learning_rate": 5.955207908683246e-06, + "loss": 3.0365, + "step": 47645 + }, + { + "epoch": 3.2375322734067127, + "grad_norm": 6.859034061431885, + "learning_rate": 5.954783258594918e-06, + "loss": 2.7174, + "step": 47650 + }, + { + "epoch": 3.2378719934773748, + "grad_norm": 5.895500183105469, + "learning_rate": 5.954358608506591e-06, + "loss": 2.9147, + "step": 47655 + }, + { + "epoch": 3.2382117135480364, + "grad_norm": 6.591684818267822, + "learning_rate": 5.95401888843593e-06, + "loss": 3.1321, + "step": 47660 + }, + { + "epoch": 3.238551433618698, + "grad_norm": 6.73738431930542, + "learning_rate": 5.953594238347602e-06, + "loss": 2.9976, + "step": 47665 + }, + { + "epoch": 3.23889115368936, + "grad_norm": 5.778050422668457, + "learning_rate": 5.9531695882592745e-06, + "loss": 2.9904, + "step": 47670 + }, + { + "epoch": 3.2392308737600217, + "grad_norm": 7.7845563888549805, + "learning_rate": 5.952744938170948e-06, + "loss": 2.8153, + "step": 47675 + }, + { + "epoch": 3.2395705938306834, + "grad_norm": 6.438722133636475, + "learning_rate": 5.95232028808262e-06, + "loss": 2.953, + "step": 47680 + }, + { + "epoch": 3.2399103139013454, + "grad_norm": 8.03254222869873, + "learning_rate": 5.951895637994293e-06, + "loss": 3.2902, + "step": 47685 + }, + { + "epoch": 3.240250033972007, + "grad_norm": 7.062995433807373, + "learning_rate": 5.9514709879059665e-06, + "loss": 2.9663, + "step": 47690 + }, + { + "epoch": 3.2405897540426687, + "grad_norm": 6.0261335372924805, + "learning_rate": 5.9510463378176385e-06, + "loss": 3.188, + "step": 47695 + }, + { + "epoch": 3.2409294741133308, + "grad_norm": 7.212213516235352, + "learning_rate": 5.950621687729311e-06, + "loss": 2.9653, + "step": 47700 + }, + { + "epoch": 3.2412691941839924, + "grad_norm": 6.0346784591674805, + "learning_rate": 5.950197037640985e-06, + "loss": 3.0719, + "step": 47705 + }, + { + "epoch": 3.241608914254654, + "grad_norm": 7.0820112228393555, + "learning_rate": 5.949772387552657e-06, + "loss": 3.02, + "step": 47710 + }, + { + "epoch": 3.241948634325316, + "grad_norm": 8.027338027954102, + "learning_rate": 5.94934773746433e-06, + "loss": 2.8526, + "step": 47715 + }, + { + "epoch": 3.2422883543959777, + "grad_norm": 9.699687957763672, + "learning_rate": 5.9489230873760025e-06, + "loss": 2.9522, + "step": 47720 + }, + { + "epoch": 3.2426280744666394, + "grad_norm": 7.019383907318115, + "learning_rate": 5.948498437287675e-06, + "loss": 2.8788, + "step": 47725 + }, + { + "epoch": 3.2429677945373014, + "grad_norm": 7.79295015335083, + "learning_rate": 5.948073787199348e-06, + "loss": 2.8438, + "step": 47730 + }, + { + "epoch": 3.243307514607963, + "grad_norm": 6.90206241607666, + "learning_rate": 5.947649137111021e-06, + "loss": 3.0953, + "step": 47735 + }, + { + "epoch": 3.2436472346786247, + "grad_norm": 7.434995174407959, + "learning_rate": 5.947224487022694e-06, + "loss": 2.8964, + "step": 47740 + }, + { + "epoch": 3.2439869547492868, + "grad_norm": 7.46890926361084, + "learning_rate": 5.946799836934366e-06, + "loss": 2.8879, + "step": 47745 + }, + { + "epoch": 3.2443266748199484, + "grad_norm": 7.569260120391846, + "learning_rate": 5.946375186846039e-06, + "loss": 2.9167, + "step": 47750 + }, + { + "epoch": 3.24466639489061, + "grad_norm": 6.526523590087891, + "learning_rate": 5.945950536757712e-06, + "loss": 3.1128, + "step": 47755 + }, + { + "epoch": 3.245006114961272, + "grad_norm": 7.725632667541504, + "learning_rate": 5.945525886669386e-06, + "loss": 2.9862, + "step": 47760 + }, + { + "epoch": 3.2453458350319337, + "grad_norm": 6.77612829208374, + "learning_rate": 5.945101236581058e-06, + "loss": 3.0419, + "step": 47765 + }, + { + "epoch": 3.2456855551025954, + "grad_norm": 6.302372455596924, + "learning_rate": 5.9446765864927305e-06, + "loss": 3.0022, + "step": 47770 + }, + { + "epoch": 3.246025275173257, + "grad_norm": 7.010482311248779, + "learning_rate": 5.944251936404404e-06, + "loss": 2.8728, + "step": 47775 + }, + { + "epoch": 3.246364995243919, + "grad_norm": 7.053501129150391, + "learning_rate": 5.943827286316076e-06, + "loss": 2.888, + "step": 47780 + }, + { + "epoch": 3.2467047153145807, + "grad_norm": 6.970826148986816, + "learning_rate": 5.943402636227749e-06, + "loss": 2.6665, + "step": 47785 + }, + { + "epoch": 3.2470444353852423, + "grad_norm": 7.140949249267578, + "learning_rate": 5.942977986139422e-06, + "loss": 2.8704, + "step": 47790 + }, + { + "epoch": 3.2473841554559044, + "grad_norm": 6.967106342315674, + "learning_rate": 5.9425533360510945e-06, + "loss": 2.8392, + "step": 47795 + }, + { + "epoch": 3.247723875526566, + "grad_norm": 7.132386207580566, + "learning_rate": 5.942128685962767e-06, + "loss": 2.9629, + "step": 47800 + }, + { + "epoch": 3.2480635955972277, + "grad_norm": 8.03559398651123, + "learning_rate": 5.94170403587444e-06, + "loss": 2.7001, + "step": 47805 + }, + { + "epoch": 3.2484033156678898, + "grad_norm": 7.2428741455078125, + "learning_rate": 5.941279385786113e-06, + "loss": 3.0512, + "step": 47810 + }, + { + "epoch": 3.2487430357385514, + "grad_norm": 7.6815385818481445, + "learning_rate": 5.940854735697785e-06, + "loss": 2.7924, + "step": 47815 + }, + { + "epoch": 3.249082755809213, + "grad_norm": 5.969428539276123, + "learning_rate": 5.9404300856094585e-06, + "loss": 3.1991, + "step": 47820 + }, + { + "epoch": 3.249422475879875, + "grad_norm": 8.747967720031738, + "learning_rate": 5.940005435521131e-06, + "loss": 2.8649, + "step": 47825 + }, + { + "epoch": 3.2497621959505367, + "grad_norm": 6.669416904449463, + "learning_rate": 5.939580785432803e-06, + "loss": 2.8787, + "step": 47830 + }, + { + "epoch": 3.2501019160211984, + "grad_norm": 7.497708320617676, + "learning_rate": 5.939156135344477e-06, + "loss": 3.0374, + "step": 47835 + }, + { + "epoch": 3.2504416360918604, + "grad_norm": 7.630190372467041, + "learning_rate": 5.93873148525615e-06, + "loss": 3.0789, + "step": 47840 + }, + { + "epoch": 3.250781356162522, + "grad_norm": 6.458012580871582, + "learning_rate": 5.938306835167822e-06, + "loss": 2.94, + "step": 47845 + }, + { + "epoch": 3.2511210762331837, + "grad_norm": 6.9539594650268555, + "learning_rate": 5.937882185079495e-06, + "loss": 3.0899, + "step": 47850 + }, + { + "epoch": 3.2514607963038458, + "grad_norm": 8.161558151245117, + "learning_rate": 5.937457534991168e-06, + "loss": 2.9462, + "step": 47855 + }, + { + "epoch": 3.2518005163745074, + "grad_norm": 5.480520725250244, + "learning_rate": 5.93703288490284e-06, + "loss": 2.9373, + "step": 47860 + }, + { + "epoch": 3.252140236445169, + "grad_norm": 7.733339309692383, + "learning_rate": 5.936608234814514e-06, + "loss": 2.688, + "step": 47865 + }, + { + "epoch": 3.252479956515831, + "grad_norm": 7.491146087646484, + "learning_rate": 5.9361835847261865e-06, + "loss": 2.9015, + "step": 47870 + }, + { + "epoch": 3.2528196765864927, + "grad_norm": 7.814619541168213, + "learning_rate": 5.9357589346378585e-06, + "loss": 2.6864, + "step": 47875 + }, + { + "epoch": 3.2531593966571544, + "grad_norm": 7.164092540740967, + "learning_rate": 5.935334284549532e-06, + "loss": 3.1111, + "step": 47880 + }, + { + "epoch": 3.2534991167278164, + "grad_norm": 6.595895767211914, + "learning_rate": 5.934909634461204e-06, + "loss": 3.0088, + "step": 47885 + }, + { + "epoch": 3.253838836798478, + "grad_norm": 7.354781627655029, + "learning_rate": 5.934484984372877e-06, + "loss": 3.0827, + "step": 47890 + }, + { + "epoch": 3.2541785568691397, + "grad_norm": 8.348043441772461, + "learning_rate": 5.9340603342845505e-06, + "loss": 2.9334, + "step": 47895 + }, + { + "epoch": 3.254518276939802, + "grad_norm": 7.9718241691589355, + "learning_rate": 5.9336356841962225e-06, + "loss": 2.8489, + "step": 47900 + }, + { + "epoch": 3.2548579970104634, + "grad_norm": 8.375986099243164, + "learning_rate": 5.933211034107895e-06, + "loss": 2.9404, + "step": 47905 + }, + { + "epoch": 3.255197717081125, + "grad_norm": 8.031068801879883, + "learning_rate": 5.932786384019569e-06, + "loss": 2.9374, + "step": 47910 + }, + { + "epoch": 3.255537437151787, + "grad_norm": 6.242839813232422, + "learning_rate": 5.932361733931241e-06, + "loss": 2.8312, + "step": 47915 + }, + { + "epoch": 3.2558771572224487, + "grad_norm": 6.667185306549072, + "learning_rate": 5.931937083842914e-06, + "loss": 3.2744, + "step": 47920 + }, + { + "epoch": 3.2562168772931104, + "grad_norm": 5.641888618469238, + "learning_rate": 5.931512433754587e-06, + "loss": 2.9855, + "step": 47925 + }, + { + "epoch": 3.2565565973637725, + "grad_norm": 7.30580472946167, + "learning_rate": 5.931087783666259e-06, + "loss": 3.0284, + "step": 47930 + }, + { + "epoch": 3.256896317434434, + "grad_norm": 6.423435688018799, + "learning_rate": 5.930663133577932e-06, + "loss": 3.2576, + "step": 47935 + }, + { + "epoch": 3.2572360375050957, + "grad_norm": 6.946545124053955, + "learning_rate": 5.930238483489606e-06, + "loss": 2.7996, + "step": 47940 + }, + { + "epoch": 3.257575757575758, + "grad_norm": 6.746503829956055, + "learning_rate": 5.929813833401278e-06, + "loss": 2.8992, + "step": 47945 + }, + { + "epoch": 3.2579154776464194, + "grad_norm": 6.138664722442627, + "learning_rate": 5.9293891833129505e-06, + "loss": 3.0197, + "step": 47950 + }, + { + "epoch": 3.258255197717081, + "grad_norm": 7.335579872131348, + "learning_rate": 5.928964533224624e-06, + "loss": 3.0506, + "step": 47955 + }, + { + "epoch": 3.258594917787743, + "grad_norm": 6.346207618713379, + "learning_rate": 5.928539883136296e-06, + "loss": 3.1397, + "step": 47960 + }, + { + "epoch": 3.2589346378584048, + "grad_norm": 7.313090801239014, + "learning_rate": 5.928115233047969e-06, + "loss": 2.8914, + "step": 47965 + }, + { + "epoch": 3.2592743579290664, + "grad_norm": 6.791153430938721, + "learning_rate": 5.927690582959642e-06, + "loss": 3.0864, + "step": 47970 + }, + { + "epoch": 3.2596140779997285, + "grad_norm": 6.389687538146973, + "learning_rate": 5.9272659328713145e-06, + "loss": 2.7528, + "step": 47975 + }, + { + "epoch": 3.25995379807039, + "grad_norm": 6.637500762939453, + "learning_rate": 5.926841282782987e-06, + "loss": 2.7642, + "step": 47980 + }, + { + "epoch": 3.2602935181410517, + "grad_norm": 7.212366580963135, + "learning_rate": 5.92641663269466e-06, + "loss": 3.0816, + "step": 47985 + }, + { + "epoch": 3.2606332382117134, + "grad_norm": 7.321902275085449, + "learning_rate": 5.925991982606333e-06, + "loss": 2.9654, + "step": 47990 + }, + { + "epoch": 3.2609729582823754, + "grad_norm": 6.831309795379639, + "learning_rate": 5.925567332518005e-06, + "loss": 3.0582, + "step": 47995 + }, + { + "epoch": 3.261312678353037, + "grad_norm": 5.920591354370117, + "learning_rate": 5.9251426824296785e-06, + "loss": 3.0291, + "step": 48000 + }, + { + "epoch": 3.2616523984236987, + "grad_norm": 7.628720760345459, + "learning_rate": 5.924718032341351e-06, + "loss": 2.8649, + "step": 48005 + }, + { + "epoch": 3.2619921184943608, + "grad_norm": 6.6827287673950195, + "learning_rate": 5.924293382253023e-06, + "loss": 3.145, + "step": 48010 + }, + { + "epoch": 3.2623318385650224, + "grad_norm": 8.289463996887207, + "learning_rate": 5.923868732164697e-06, + "loss": 3.0547, + "step": 48015 + }, + { + "epoch": 3.262671558635684, + "grad_norm": 8.119093894958496, + "learning_rate": 5.92344408207637e-06, + "loss": 2.891, + "step": 48020 + }, + { + "epoch": 3.263011278706346, + "grad_norm": 6.951908588409424, + "learning_rate": 5.923019431988042e-06, + "loss": 3.2831, + "step": 48025 + }, + { + "epoch": 3.2633509987770077, + "grad_norm": 5.482414722442627, + "learning_rate": 5.922594781899715e-06, + "loss": 3.0124, + "step": 48030 + }, + { + "epoch": 3.2636907188476694, + "grad_norm": 6.745979309082031, + "learning_rate": 5.922170131811388e-06, + "loss": 2.9546, + "step": 48035 + }, + { + "epoch": 3.2640304389183314, + "grad_norm": 5.844730854034424, + "learning_rate": 5.92174548172306e-06, + "loss": 3.0114, + "step": 48040 + }, + { + "epoch": 3.264370158988993, + "grad_norm": 7.547298431396484, + "learning_rate": 5.921320831634734e-06, + "loss": 2.8778, + "step": 48045 + }, + { + "epoch": 3.2647098790596547, + "grad_norm": 6.658215045928955, + "learning_rate": 5.9208961815464065e-06, + "loss": 3.0785, + "step": 48050 + }, + { + "epoch": 3.265049599130317, + "grad_norm": 9.058669090270996, + "learning_rate": 5.9204715314580785e-06, + "loss": 2.7056, + "step": 48055 + }, + { + "epoch": 3.2653893192009784, + "grad_norm": 9.082686424255371, + "learning_rate": 5.920046881369752e-06, + "loss": 2.6871, + "step": 48060 + }, + { + "epoch": 3.26572903927164, + "grad_norm": 6.784292221069336, + "learning_rate": 5.919622231281424e-06, + "loss": 2.9928, + "step": 48065 + }, + { + "epoch": 3.266068759342302, + "grad_norm": 6.897621154785156, + "learning_rate": 5.919197581193097e-06, + "loss": 2.8726, + "step": 48070 + }, + { + "epoch": 3.2664084794129638, + "grad_norm": 7.027532577514648, + "learning_rate": 5.9187729311047705e-06, + "loss": 3.084, + "step": 48075 + }, + { + "epoch": 3.2667481994836254, + "grad_norm": 7.634126663208008, + "learning_rate": 5.9183482810164425e-06, + "loss": 2.8909, + "step": 48080 + }, + { + "epoch": 3.2670879195542875, + "grad_norm": 9.64392375946045, + "learning_rate": 5.917923630928115e-06, + "loss": 2.7492, + "step": 48085 + }, + { + "epoch": 3.267427639624949, + "grad_norm": 9.712850570678711, + "learning_rate": 5.917498980839789e-06, + "loss": 2.9065, + "step": 48090 + }, + { + "epoch": 3.2677673596956107, + "grad_norm": 7.460468292236328, + "learning_rate": 5.917074330751461e-06, + "loss": 2.9928, + "step": 48095 + }, + { + "epoch": 3.2681070797662723, + "grad_norm": 6.786312103271484, + "learning_rate": 5.9166496806631345e-06, + "loss": 3.0777, + "step": 48100 + }, + { + "epoch": 3.2684467998369344, + "grad_norm": 6.368765830993652, + "learning_rate": 5.916225030574807e-06, + "loss": 3.0104, + "step": 48105 + }, + { + "epoch": 3.268786519907596, + "grad_norm": 10.579416275024414, + "learning_rate": 5.915800380486479e-06, + "loss": 3.0596, + "step": 48110 + }, + { + "epoch": 3.2691262399782577, + "grad_norm": 6.877032279968262, + "learning_rate": 5.915375730398153e-06, + "loss": 2.9592, + "step": 48115 + }, + { + "epoch": 3.2694659600489198, + "grad_norm": 5.411431789398193, + "learning_rate": 5.914951080309826e-06, + "loss": 2.971, + "step": 48120 + }, + { + "epoch": 3.2698056801195814, + "grad_norm": 8.643455505371094, + "learning_rate": 5.914526430221498e-06, + "loss": 3.1308, + "step": 48125 + }, + { + "epoch": 3.270145400190243, + "grad_norm": 7.932943820953369, + "learning_rate": 5.914101780133171e-06, + "loss": 2.7505, + "step": 48130 + }, + { + "epoch": 3.270485120260905, + "grad_norm": 7.733397483825684, + "learning_rate": 5.913677130044843e-06, + "loss": 2.7271, + "step": 48135 + }, + { + "epoch": 3.2708248403315667, + "grad_norm": 7.763428688049316, + "learning_rate": 5.913252479956516e-06, + "loss": 2.9228, + "step": 48140 + }, + { + "epoch": 3.2711645604022284, + "grad_norm": 7.7408294677734375, + "learning_rate": 5.91282782986819e-06, + "loss": 3.055, + "step": 48145 + }, + { + "epoch": 3.2715042804728904, + "grad_norm": 6.262774467468262, + "learning_rate": 5.912403179779862e-06, + "loss": 3.1189, + "step": 48150 + }, + { + "epoch": 3.271844000543552, + "grad_norm": 8.756786346435547, + "learning_rate": 5.9119785296915345e-06, + "loss": 3.1603, + "step": 48155 + }, + { + "epoch": 3.2721837206142137, + "grad_norm": 6.830657958984375, + "learning_rate": 5.911553879603208e-06, + "loss": 2.5153, + "step": 48160 + }, + { + "epoch": 3.2725234406848758, + "grad_norm": 8.203941345214844, + "learning_rate": 5.91112922951488e-06, + "loss": 3.0821, + "step": 48165 + }, + { + "epoch": 3.2728631607555374, + "grad_norm": 7.510458946228027, + "learning_rate": 5.910704579426553e-06, + "loss": 2.7469, + "step": 48170 + }, + { + "epoch": 3.273202880826199, + "grad_norm": 9.884343147277832, + "learning_rate": 5.9102799293382265e-06, + "loss": 2.957, + "step": 48175 + }, + { + "epoch": 3.273542600896861, + "grad_norm": 6.953112602233887, + "learning_rate": 5.9098552792498985e-06, + "loss": 2.8767, + "step": 48180 + }, + { + "epoch": 3.2738823209675227, + "grad_norm": 8.582254409790039, + "learning_rate": 5.909430629161571e-06, + "loss": 3.0476, + "step": 48185 + }, + { + "epoch": 3.2742220410381844, + "grad_norm": 8.379496574401855, + "learning_rate": 5.909005979073245e-06, + "loss": 3.021, + "step": 48190 + }, + { + "epoch": 3.2745617611088464, + "grad_norm": 7.461149215698242, + "learning_rate": 5.908581328984917e-06, + "loss": 3.0468, + "step": 48195 + }, + { + "epoch": 3.274901481179508, + "grad_norm": 8.715845108032227, + "learning_rate": 5.90815667889659e-06, + "loss": 2.9073, + "step": 48200 + }, + { + "epoch": 3.2752412012501697, + "grad_norm": 10.032157897949219, + "learning_rate": 5.907732028808263e-06, + "loss": 2.9782, + "step": 48205 + }, + { + "epoch": 3.275580921320832, + "grad_norm": 9.437108993530273, + "learning_rate": 5.907307378719935e-06, + "loss": 3.1397, + "step": 48210 + }, + { + "epoch": 3.2759206413914934, + "grad_norm": 7.420114517211914, + "learning_rate": 5.906882728631608e-06, + "loss": 2.9294, + "step": 48215 + }, + { + "epoch": 3.276260361462155, + "grad_norm": 6.228782653808594, + "learning_rate": 5.906458078543281e-06, + "loss": 2.9802, + "step": 48220 + }, + { + "epoch": 3.276600081532817, + "grad_norm": 6.926295757293701, + "learning_rate": 5.906033428454954e-06, + "loss": 2.9261, + "step": 48225 + }, + { + "epoch": 3.2769398016034788, + "grad_norm": 8.061842918395996, + "learning_rate": 5.905608778366626e-06, + "loss": 2.9825, + "step": 48230 + }, + { + "epoch": 3.2772795216741404, + "grad_norm": 8.997513771057129, + "learning_rate": 5.905184128278299e-06, + "loss": 2.9764, + "step": 48235 + }, + { + "epoch": 3.2776192417448025, + "grad_norm": 7.399506092071533, + "learning_rate": 5.904759478189972e-06, + "loss": 3.0644, + "step": 48240 + }, + { + "epoch": 3.277958961815464, + "grad_norm": 6.748323440551758, + "learning_rate": 5.904334828101644e-06, + "loss": 3.1249, + "step": 48245 + }, + { + "epoch": 3.2782986818861257, + "grad_norm": 7.113633632659912, + "learning_rate": 5.903910178013318e-06, + "loss": 3.0898, + "step": 48250 + }, + { + "epoch": 3.278638401956788, + "grad_norm": 6.36654806137085, + "learning_rate": 5.9034855279249905e-06, + "loss": 2.8733, + "step": 48255 + }, + { + "epoch": 3.2789781220274494, + "grad_norm": 8.2117338180542, + "learning_rate": 5.9030608778366624e-06, + "loss": 2.9248, + "step": 48260 + }, + { + "epoch": 3.279317842098111, + "grad_norm": 7.9229302406311035, + "learning_rate": 5.902636227748336e-06, + "loss": 2.8997, + "step": 48265 + }, + { + "epoch": 3.279657562168773, + "grad_norm": 6.56724214553833, + "learning_rate": 5.902211577660009e-06, + "loss": 2.862, + "step": 48270 + }, + { + "epoch": 3.2799972822394348, + "grad_norm": 8.66203498840332, + "learning_rate": 5.901786927571681e-06, + "loss": 2.8127, + "step": 48275 + }, + { + "epoch": 3.2803370023100964, + "grad_norm": 6.774995803833008, + "learning_rate": 5.9013622774833545e-06, + "loss": 2.8672, + "step": 48280 + }, + { + "epoch": 3.2806767223807585, + "grad_norm": 7.348494529724121, + "learning_rate": 5.900937627395027e-06, + "loss": 2.8868, + "step": 48285 + }, + { + "epoch": 3.28101644245142, + "grad_norm": 7.30278205871582, + "learning_rate": 5.900512977306699e-06, + "loss": 2.8898, + "step": 48290 + }, + { + "epoch": 3.2813561625220817, + "grad_norm": 6.369009017944336, + "learning_rate": 5.900088327218373e-06, + "loss": 3.0013, + "step": 48295 + }, + { + "epoch": 3.281695882592744, + "grad_norm": 8.877767562866211, + "learning_rate": 5.899663677130046e-06, + "loss": 2.9135, + "step": 48300 + }, + { + "epoch": 3.2820356026634054, + "grad_norm": 8.384411811828613, + "learning_rate": 5.899239027041718e-06, + "loss": 2.9461, + "step": 48305 + }, + { + "epoch": 3.282375322734067, + "grad_norm": 7.322615146636963, + "learning_rate": 5.898814376953391e-06, + "loss": 3.0462, + "step": 48310 + }, + { + "epoch": 3.282715042804729, + "grad_norm": 7.945516109466553, + "learning_rate": 5.898389726865063e-06, + "loss": 2.9956, + "step": 48315 + }, + { + "epoch": 3.2830547628753908, + "grad_norm": 9.153901100158691, + "learning_rate": 5.897965076776736e-06, + "loss": 3.0312, + "step": 48320 + }, + { + "epoch": 3.2833944829460524, + "grad_norm": 9.80693244934082, + "learning_rate": 5.89754042668841e-06, + "loss": 3.0512, + "step": 48325 + }, + { + "epoch": 3.283734203016714, + "grad_norm": 7.390742301940918, + "learning_rate": 5.897115776600082e-06, + "loss": 2.9745, + "step": 48330 + }, + { + "epoch": 3.284073923087376, + "grad_norm": 6.6601057052612305, + "learning_rate": 5.8966911265117544e-06, + "loss": 2.8233, + "step": 48335 + }, + { + "epoch": 3.2844136431580377, + "grad_norm": 6.46761417388916, + "learning_rate": 5.896266476423428e-06, + "loss": 2.9398, + "step": 48340 + }, + { + "epoch": 3.2847533632286994, + "grad_norm": 6.738687992095947, + "learning_rate": 5.8958418263351e-06, + "loss": 2.6404, + "step": 48345 + }, + { + "epoch": 3.2850930832993614, + "grad_norm": 7.24674654006958, + "learning_rate": 5.895417176246773e-06, + "loss": 3.0506, + "step": 48350 + }, + { + "epoch": 3.285432803370023, + "grad_norm": 6.742031574249268, + "learning_rate": 5.8949925261584465e-06, + "loss": 2.9295, + "step": 48355 + }, + { + "epoch": 3.2857725234406847, + "grad_norm": 7.198927879333496, + "learning_rate": 5.8945678760701185e-06, + "loss": 2.8534, + "step": 48360 + }, + { + "epoch": 3.286112243511347, + "grad_norm": 6.7230424880981445, + "learning_rate": 5.894143225981791e-06, + "loss": 2.9135, + "step": 48365 + }, + { + "epoch": 3.2864519635820084, + "grad_norm": 7.369361400604248, + "learning_rate": 5.893718575893465e-06, + "loss": 3.0423, + "step": 48370 + }, + { + "epoch": 3.28679168365267, + "grad_norm": 8.248465538024902, + "learning_rate": 5.893293925805137e-06, + "loss": 3.1066, + "step": 48375 + }, + { + "epoch": 3.287131403723332, + "grad_norm": 6.308130741119385, + "learning_rate": 5.89286927571681e-06, + "loss": 3.1313, + "step": 48380 + }, + { + "epoch": 3.2874711237939938, + "grad_norm": 6.429049015045166, + "learning_rate": 5.8924446256284825e-06, + "loss": 2.8367, + "step": 48385 + }, + { + "epoch": 3.2878108438646554, + "grad_norm": 7.571176052093506, + "learning_rate": 5.892019975540155e-06, + "loss": 2.9855, + "step": 48390 + }, + { + "epoch": 3.2881505639353175, + "grad_norm": 6.001975059509277, + "learning_rate": 5.891595325451828e-06, + "loss": 2.892, + "step": 48395 + }, + { + "epoch": 3.288490284005979, + "grad_norm": 5.802894115447998, + "learning_rate": 5.891170675363501e-06, + "loss": 3.0379, + "step": 48400 + }, + { + "epoch": 3.2888300040766407, + "grad_norm": 7.20633602142334, + "learning_rate": 5.890746025275174e-06, + "loss": 3.2222, + "step": 48405 + }, + { + "epoch": 3.289169724147303, + "grad_norm": 6.799243450164795, + "learning_rate": 5.890321375186846e-06, + "loss": 3.0373, + "step": 48410 + }, + { + "epoch": 3.2895094442179644, + "grad_norm": 7.747155666351318, + "learning_rate": 5.889896725098519e-06, + "loss": 2.8405, + "step": 48415 + }, + { + "epoch": 3.289849164288626, + "grad_norm": 9.313284873962402, + "learning_rate": 5.889472075010192e-06, + "loss": 2.7989, + "step": 48420 + }, + { + "epoch": 3.290188884359288, + "grad_norm": 7.218620300292969, + "learning_rate": 5.889047424921864e-06, + "loss": 2.9195, + "step": 48425 + }, + { + "epoch": 3.2905286044299498, + "grad_norm": 8.348706245422363, + "learning_rate": 5.888622774833538e-06, + "loss": 2.9662, + "step": 48430 + }, + { + "epoch": 3.2908683245006114, + "grad_norm": 6.658208847045898, + "learning_rate": 5.8881981247452105e-06, + "loss": 2.8314, + "step": 48435 + }, + { + "epoch": 3.291208044571273, + "grad_norm": 9.776019096374512, + "learning_rate": 5.887773474656884e-06, + "loss": 3.2315, + "step": 48440 + }, + { + "epoch": 3.291547764641935, + "grad_norm": 7.132726192474365, + "learning_rate": 5.887348824568556e-06, + "loss": 3.1903, + "step": 48445 + }, + { + "epoch": 3.2918874847125967, + "grad_norm": 5.543691635131836, + "learning_rate": 5.886924174480229e-06, + "loss": 3.1714, + "step": 48450 + }, + { + "epoch": 3.2922272047832584, + "grad_norm": 7.123250961303711, + "learning_rate": 5.886499524391902e-06, + "loss": 2.6213, + "step": 48455 + }, + { + "epoch": 3.2925669248539204, + "grad_norm": 7.216650485992432, + "learning_rate": 5.8860748743035745e-06, + "loss": 2.7157, + "step": 48460 + }, + { + "epoch": 3.292906644924582, + "grad_norm": 7.228537559509277, + "learning_rate": 5.885650224215247e-06, + "loss": 2.901, + "step": 48465 + }, + { + "epoch": 3.2932463649952437, + "grad_norm": 7.4668192863464355, + "learning_rate": 5.88522557412692e-06, + "loss": 2.8248, + "step": 48470 + }, + { + "epoch": 3.2935860850659058, + "grad_norm": 7.1168437004089355, + "learning_rate": 5.884800924038593e-06, + "loss": 2.8682, + "step": 48475 + }, + { + "epoch": 3.2939258051365674, + "grad_norm": 5.566479206085205, + "learning_rate": 5.884376273950265e-06, + "loss": 3.0944, + "step": 48480 + }, + { + "epoch": 3.294265525207229, + "grad_norm": 6.315682888031006, + "learning_rate": 5.8839516238619385e-06, + "loss": 2.752, + "step": 48485 + }, + { + "epoch": 3.294605245277891, + "grad_norm": 7.326840877532959, + "learning_rate": 5.883526973773611e-06, + "loss": 2.9976, + "step": 48490 + }, + { + "epoch": 3.2949449653485527, + "grad_norm": 8.524767875671387, + "learning_rate": 5.883102323685283e-06, + "loss": 2.9037, + "step": 48495 + }, + { + "epoch": 3.2952846854192144, + "grad_norm": 7.573965072631836, + "learning_rate": 5.882677673596957e-06, + "loss": 2.9005, + "step": 48500 + }, + { + "epoch": 3.2956244054898765, + "grad_norm": 6.6148600578308105, + "learning_rate": 5.88225302350863e-06, + "loss": 3.1084, + "step": 48505 + }, + { + "epoch": 3.295964125560538, + "grad_norm": 6.7934346199035645, + "learning_rate": 5.881828373420302e-06, + "loss": 2.9118, + "step": 48510 + }, + { + "epoch": 3.2963038456311997, + "grad_norm": 8.02425765991211, + "learning_rate": 5.881403723331975e-06, + "loss": 3.0163, + "step": 48515 + }, + { + "epoch": 3.296643565701862, + "grad_norm": 6.77824592590332, + "learning_rate": 5.880979073243648e-06, + "loss": 2.8845, + "step": 48520 + }, + { + "epoch": 3.2969832857725234, + "grad_norm": 8.127382278442383, + "learning_rate": 5.88055442315532e-06, + "loss": 2.8308, + "step": 48525 + }, + { + "epoch": 3.297323005843185, + "grad_norm": 7.2036309242248535, + "learning_rate": 5.880129773066994e-06, + "loss": 2.8174, + "step": 48530 + }, + { + "epoch": 3.297662725913847, + "grad_norm": 6.2864298820495605, + "learning_rate": 5.8797051229786665e-06, + "loss": 2.7953, + "step": 48535 + }, + { + "epoch": 3.2980024459845088, + "grad_norm": 7.348821640014648, + "learning_rate": 5.8792804728903384e-06, + "loss": 2.7181, + "step": 48540 + }, + { + "epoch": 3.2983421660551704, + "grad_norm": 7.442555904388428, + "learning_rate": 5.878855822802012e-06, + "loss": 2.9833, + "step": 48545 + }, + { + "epoch": 3.2986818861258325, + "grad_norm": 8.6884183883667, + "learning_rate": 5.878431172713685e-06, + "loss": 2.9231, + "step": 48550 + }, + { + "epoch": 3.299021606196494, + "grad_norm": 7.7246928215026855, + "learning_rate": 5.878006522625357e-06, + "loss": 2.9914, + "step": 48555 + }, + { + "epoch": 3.2993613262671557, + "grad_norm": 7.366265296936035, + "learning_rate": 5.8775818725370305e-06, + "loss": 3.1382, + "step": 48560 + }, + { + "epoch": 3.299701046337818, + "grad_norm": 7.505753517150879, + "learning_rate": 5.8771572224487024e-06, + "loss": 2.7347, + "step": 48565 + }, + { + "epoch": 3.3000407664084794, + "grad_norm": 6.571694374084473, + "learning_rate": 5.876732572360375e-06, + "loss": 3.0766, + "step": 48570 + }, + { + "epoch": 3.300380486479141, + "grad_norm": 8.474743843078613, + "learning_rate": 5.876307922272049e-06, + "loss": 3.0414, + "step": 48575 + }, + { + "epoch": 3.300720206549803, + "grad_norm": 6.0102739334106445, + "learning_rate": 5.875883272183721e-06, + "loss": 3.0349, + "step": 48580 + }, + { + "epoch": 3.3010599266204648, + "grad_norm": 6.059275150299072, + "learning_rate": 5.875458622095394e-06, + "loss": 3.0828, + "step": 48585 + }, + { + "epoch": 3.3013996466911264, + "grad_norm": 9.254331588745117, + "learning_rate": 5.875033972007067e-06, + "loss": 2.8342, + "step": 48590 + }, + { + "epoch": 3.3017393667617885, + "grad_norm": 6.827725410461426, + "learning_rate": 5.874609321918739e-06, + "loss": 3.083, + "step": 48595 + }, + { + "epoch": 3.30207908683245, + "grad_norm": 6.106866359710693, + "learning_rate": 5.874184671830412e-06, + "loss": 2.8916, + "step": 48600 + }, + { + "epoch": 3.3024188069031117, + "grad_norm": 8.445870399475098, + "learning_rate": 5.873760021742086e-06, + "loss": 2.6828, + "step": 48605 + }, + { + "epoch": 3.302758526973774, + "grad_norm": 7.236316680908203, + "learning_rate": 5.873335371653758e-06, + "loss": 3.0345, + "step": 48610 + }, + { + "epoch": 3.3030982470444354, + "grad_norm": 7.549234867095947, + "learning_rate": 5.8729107215654304e-06, + "loss": 3.1967, + "step": 48615 + }, + { + "epoch": 3.303437967115097, + "grad_norm": 7.245845317840576, + "learning_rate": 5.872486071477104e-06, + "loss": 2.8699, + "step": 48620 + }, + { + "epoch": 3.303777687185759, + "grad_norm": 6.638112545013428, + "learning_rate": 5.872061421388776e-06, + "loss": 2.7901, + "step": 48625 + }, + { + "epoch": 3.304117407256421, + "grad_norm": 7.9260149002075195, + "learning_rate": 5.871636771300449e-06, + "loss": 2.8307, + "step": 48630 + }, + { + "epoch": 3.3044571273270824, + "grad_norm": 7.267489910125732, + "learning_rate": 5.871212121212122e-06, + "loss": 3.2148, + "step": 48635 + }, + { + "epoch": 3.3047968473977445, + "grad_norm": 7.570919990539551, + "learning_rate": 5.8707874711237944e-06, + "loss": 3.0978, + "step": 48640 + }, + { + "epoch": 3.305136567468406, + "grad_norm": 7.112813949584961, + "learning_rate": 5.870362821035467e-06, + "loss": 3.0053, + "step": 48645 + }, + { + "epoch": 3.3054762875390677, + "grad_norm": 6.6903581619262695, + "learning_rate": 5.86993817094714e-06, + "loss": 3.1501, + "step": 48650 + }, + { + "epoch": 3.30581600760973, + "grad_norm": 6.009005546569824, + "learning_rate": 5.869513520858813e-06, + "loss": 2.9252, + "step": 48655 + }, + { + "epoch": 3.3061557276803915, + "grad_norm": 6.498281002044678, + "learning_rate": 5.869088870770485e-06, + "loss": 3.1554, + "step": 48660 + }, + { + "epoch": 3.306495447751053, + "grad_norm": 6.372365474700928, + "learning_rate": 5.8686642206821585e-06, + "loss": 2.7987, + "step": 48665 + }, + { + "epoch": 3.3068351678217147, + "grad_norm": 7.378864288330078, + "learning_rate": 5.868239570593831e-06, + "loss": 2.9764, + "step": 48670 + }, + { + "epoch": 3.307174887892377, + "grad_norm": 6.919984817504883, + "learning_rate": 5.867814920505503e-06, + "loss": 2.9838, + "step": 48675 + }, + { + "epoch": 3.3075146079630384, + "grad_norm": 6.703470230102539, + "learning_rate": 5.867390270417177e-06, + "loss": 2.752, + "step": 48680 + }, + { + "epoch": 3.3078543280337, + "grad_norm": 8.200871467590332, + "learning_rate": 5.86696562032885e-06, + "loss": 2.6433, + "step": 48685 + }, + { + "epoch": 3.308194048104362, + "grad_norm": 7.278584957122803, + "learning_rate": 5.866540970240522e-06, + "loss": 2.7004, + "step": 48690 + }, + { + "epoch": 3.3085337681750238, + "grad_norm": 6.539923667907715, + "learning_rate": 5.866116320152195e-06, + "loss": 2.9148, + "step": 48695 + }, + { + "epoch": 3.3088734882456854, + "grad_norm": 7.520110130310059, + "learning_rate": 5.865691670063868e-06, + "loss": 3.1068, + "step": 48700 + }, + { + "epoch": 3.3092132083163475, + "grad_norm": 6.30714225769043, + "learning_rate": 5.86526701997554e-06, + "loss": 2.7832, + "step": 48705 + }, + { + "epoch": 3.309552928387009, + "grad_norm": 7.2681779861450195, + "learning_rate": 5.864842369887214e-06, + "loss": 2.7833, + "step": 48710 + }, + { + "epoch": 3.3098926484576707, + "grad_norm": 6.966093063354492, + "learning_rate": 5.8644177197988865e-06, + "loss": 2.8648, + "step": 48715 + }, + { + "epoch": 3.310232368528333, + "grad_norm": 6.330059051513672, + "learning_rate": 5.863993069710558e-06, + "loss": 2.7869, + "step": 48720 + }, + { + "epoch": 3.3105720885989944, + "grad_norm": 9.386425971984863, + "learning_rate": 5.863568419622232e-06, + "loss": 2.9489, + "step": 48725 + }, + { + "epoch": 3.310911808669656, + "grad_norm": 7.231137752532959, + "learning_rate": 5.863143769533904e-06, + "loss": 2.8598, + "step": 48730 + }, + { + "epoch": 3.311251528740318, + "grad_norm": 8.038545608520508, + "learning_rate": 5.862719119445577e-06, + "loss": 2.8461, + "step": 48735 + }, + { + "epoch": 3.3115912488109798, + "grad_norm": 7.302592754364014, + "learning_rate": 5.8622944693572505e-06, + "loss": 2.9801, + "step": 48740 + }, + { + "epoch": 3.3119309688816414, + "grad_norm": 7.680179119110107, + "learning_rate": 5.861869819268922e-06, + "loss": 2.7568, + "step": 48745 + }, + { + "epoch": 3.3122706889523035, + "grad_norm": 6.836512088775635, + "learning_rate": 5.861445169180595e-06, + "loss": 3.1813, + "step": 48750 + }, + { + "epoch": 3.312610409022965, + "grad_norm": 6.057115077972412, + "learning_rate": 5.861020519092269e-06, + "loss": 2.7243, + "step": 48755 + }, + { + "epoch": 3.3129501290936267, + "grad_norm": 5.63767671585083, + "learning_rate": 5.860595869003941e-06, + "loss": 2.7521, + "step": 48760 + }, + { + "epoch": 3.313289849164289, + "grad_norm": 8.287874221801758, + "learning_rate": 5.860171218915614e-06, + "loss": 2.936, + "step": 48765 + }, + { + "epoch": 3.3136295692349504, + "grad_norm": 7.468160152435303, + "learning_rate": 5.859746568827287e-06, + "loss": 2.7535, + "step": 48770 + }, + { + "epoch": 3.313969289305612, + "grad_norm": 5.852672100067139, + "learning_rate": 5.859321918738959e-06, + "loss": 2.895, + "step": 48775 + }, + { + "epoch": 3.3143090093762737, + "grad_norm": 7.2568678855896, + "learning_rate": 5.858897268650633e-06, + "loss": 2.9248, + "step": 48780 + }, + { + "epoch": 3.314648729446936, + "grad_norm": 6.518198013305664, + "learning_rate": 5.858472618562306e-06, + "loss": 2.8863, + "step": 48785 + }, + { + "epoch": 3.3149884495175974, + "grad_norm": 7.219676971435547, + "learning_rate": 5.858047968473978e-06, + "loss": 3.1699, + "step": 48790 + }, + { + "epoch": 3.315328169588259, + "grad_norm": 6.730762004852295, + "learning_rate": 5.857623318385651e-06, + "loss": 3.012, + "step": 48795 + }, + { + "epoch": 3.315667889658921, + "grad_norm": 7.373460292816162, + "learning_rate": 5.857198668297323e-06, + "loss": 2.9596, + "step": 48800 + }, + { + "epoch": 3.3160076097295828, + "grad_norm": 8.097969055175781, + "learning_rate": 5.856774018208996e-06, + "loss": 2.9792, + "step": 48805 + }, + { + "epoch": 3.3163473298002444, + "grad_norm": 6.450416564941406, + "learning_rate": 5.85634936812067e-06, + "loss": 2.8576, + "step": 48810 + }, + { + "epoch": 3.3166870498709065, + "grad_norm": 8.459507942199707, + "learning_rate": 5.855924718032342e-06, + "loss": 2.9417, + "step": 48815 + }, + { + "epoch": 3.317026769941568, + "grad_norm": 7.92201042175293, + "learning_rate": 5.8555000679440144e-06, + "loss": 2.8166, + "step": 48820 + }, + { + "epoch": 3.3173664900122297, + "grad_norm": 7.994729042053223, + "learning_rate": 5.855075417855688e-06, + "loss": 3.0158, + "step": 48825 + }, + { + "epoch": 3.317706210082892, + "grad_norm": 7.314764022827148, + "learning_rate": 5.85465076776736e-06, + "loss": 2.8801, + "step": 48830 + }, + { + "epoch": 3.3180459301535534, + "grad_norm": 5.789459705352783, + "learning_rate": 5.854226117679033e-06, + "loss": 3.2387, + "step": 48835 + }, + { + "epoch": 3.318385650224215, + "grad_norm": 6.239288806915283, + "learning_rate": 5.8538014675907065e-06, + "loss": 3.1188, + "step": 48840 + }, + { + "epoch": 3.318725370294877, + "grad_norm": 6.271949291229248, + "learning_rate": 5.8533768175023784e-06, + "loss": 3.1282, + "step": 48845 + }, + { + "epoch": 3.3190650903655388, + "grad_norm": 6.7441816329956055, + "learning_rate": 5.852952167414051e-06, + "loss": 2.7076, + "step": 48850 + }, + { + "epoch": 3.3194048104362004, + "grad_norm": 7.27828311920166, + "learning_rate": 5.852527517325725e-06, + "loss": 3.0195, + "step": 48855 + }, + { + "epoch": 3.3197445305068625, + "grad_norm": 7.4012908935546875, + "learning_rate": 5.852102867237397e-06, + "loss": 2.6675, + "step": 48860 + }, + { + "epoch": 3.320084250577524, + "grad_norm": 8.967294692993164, + "learning_rate": 5.85167821714907e-06, + "loss": 2.9522, + "step": 48865 + }, + { + "epoch": 3.3204239706481857, + "grad_norm": 7.093212127685547, + "learning_rate": 5.851253567060743e-06, + "loss": 3.0395, + "step": 48870 + }, + { + "epoch": 3.320763690718848, + "grad_norm": 6.647829532623291, + "learning_rate": 5.850828916972415e-06, + "loss": 2.833, + "step": 48875 + }, + { + "epoch": 3.3211034107895094, + "grad_norm": 8.236040115356445, + "learning_rate": 5.850404266884088e-06, + "loss": 2.987, + "step": 48880 + }, + { + "epoch": 3.321443130860171, + "grad_norm": 7.04250431060791, + "learning_rate": 5.849979616795761e-06, + "loss": 2.8118, + "step": 48885 + }, + { + "epoch": 3.321782850930833, + "grad_norm": 8.29987907409668, + "learning_rate": 5.849554966707434e-06, + "loss": 3.0015, + "step": 48890 + }, + { + "epoch": 3.3221225710014948, + "grad_norm": 8.092233657836914, + "learning_rate": 5.8491303166191064e-06, + "loss": 3.1108, + "step": 48895 + }, + { + "epoch": 3.3224622910721564, + "grad_norm": 5.8748321533203125, + "learning_rate": 5.848705666530779e-06, + "loss": 3.0848, + "step": 48900 + }, + { + "epoch": 3.3228020111428185, + "grad_norm": 8.992798805236816, + "learning_rate": 5.848281016442452e-06, + "loss": 2.9481, + "step": 48905 + }, + { + "epoch": 3.32314173121348, + "grad_norm": 5.920229434967041, + "learning_rate": 5.847856366354124e-06, + "loss": 3.1893, + "step": 48910 + }, + { + "epoch": 3.3234814512841417, + "grad_norm": 8.286075592041016, + "learning_rate": 5.847431716265798e-06, + "loss": 2.9297, + "step": 48915 + }, + { + "epoch": 3.323821171354804, + "grad_norm": 8.709609985351562, + "learning_rate": 5.8470070661774704e-06, + "loss": 2.8943, + "step": 48920 + }, + { + "epoch": 3.3241608914254654, + "grad_norm": 7.418821334838867, + "learning_rate": 5.846582416089142e-06, + "loss": 3.106, + "step": 48925 + }, + { + "epoch": 3.324500611496127, + "grad_norm": 8.148286819458008, + "learning_rate": 5.846157766000816e-06, + "loss": 3.014, + "step": 48930 + }, + { + "epoch": 3.324840331566789, + "grad_norm": 7.139161586761475, + "learning_rate": 5.845733115912489e-06, + "loss": 2.9576, + "step": 48935 + }, + { + "epoch": 3.325180051637451, + "grad_norm": 7.169981956481934, + "learning_rate": 5.845308465824161e-06, + "loss": 2.961, + "step": 48940 + }, + { + "epoch": 3.3255197717081124, + "grad_norm": 6.342175483703613, + "learning_rate": 5.8448838157358344e-06, + "loss": 2.9206, + "step": 48945 + }, + { + "epoch": 3.3258594917787745, + "grad_norm": 9.612479209899902, + "learning_rate": 5.844459165647507e-06, + "loss": 3.1806, + "step": 48950 + }, + { + "epoch": 3.326199211849436, + "grad_norm": 6.8565802574157715, + "learning_rate": 5.844034515559179e-06, + "loss": 2.9121, + "step": 48955 + }, + { + "epoch": 3.3265389319200978, + "grad_norm": 7.137147903442383, + "learning_rate": 5.843609865470853e-06, + "loss": 2.9457, + "step": 48960 + }, + { + "epoch": 3.32687865199076, + "grad_norm": 7.697794437408447, + "learning_rate": 5.843185215382526e-06, + "loss": 3.2342, + "step": 48965 + }, + { + "epoch": 3.3272183720614215, + "grad_norm": 7.714986801147461, + "learning_rate": 5.842760565294198e-06, + "loss": 2.934, + "step": 48970 + }, + { + "epoch": 3.327558092132083, + "grad_norm": 7.858079433441162, + "learning_rate": 5.842335915205871e-06, + "loss": 2.7777, + "step": 48975 + }, + { + "epoch": 3.327897812202745, + "grad_norm": 6.208376884460449, + "learning_rate": 5.841911265117543e-06, + "loss": 2.7831, + "step": 48980 + }, + { + "epoch": 3.328237532273407, + "grad_norm": 6.860001564025879, + "learning_rate": 5.841486615029216e-06, + "loss": 2.8338, + "step": 48985 + }, + { + "epoch": 3.3285772523440684, + "grad_norm": 8.598397254943848, + "learning_rate": 5.84106196494089e-06, + "loss": 2.7634, + "step": 48990 + }, + { + "epoch": 3.3289169724147305, + "grad_norm": 9.697380065917969, + "learning_rate": 5.840637314852562e-06, + "loss": 2.9063, + "step": 48995 + }, + { + "epoch": 3.329256692485392, + "grad_norm": 7.104887962341309, + "learning_rate": 5.840212664764234e-06, + "loss": 3.047, + "step": 49000 + }, + { + "epoch": 3.3295964125560538, + "grad_norm": 7.531630992889404, + "learning_rate": 5.839788014675908e-06, + "loss": 2.8307, + "step": 49005 + }, + { + "epoch": 3.3299361326267154, + "grad_norm": 8.676010131835938, + "learning_rate": 5.83936336458758e-06, + "loss": 2.994, + "step": 49010 + }, + { + "epoch": 3.3302758526973775, + "grad_norm": 7.760703086853027, + "learning_rate": 5.838938714499253e-06, + "loss": 2.8651, + "step": 49015 + }, + { + "epoch": 3.330615572768039, + "grad_norm": 9.521717071533203, + "learning_rate": 5.8385140644109265e-06, + "loss": 2.84, + "step": 49020 + }, + { + "epoch": 3.3309552928387007, + "grad_norm": 8.08009147644043, + "learning_rate": 5.838089414322598e-06, + "loss": 3.1614, + "step": 49025 + }, + { + "epoch": 3.331295012909363, + "grad_norm": 7.84117317199707, + "learning_rate": 5.837664764234271e-06, + "loss": 3.0983, + "step": 49030 + }, + { + "epoch": 3.3316347329800244, + "grad_norm": 6.518743515014648, + "learning_rate": 5.837240114145945e-06, + "loss": 3.0781, + "step": 49035 + }, + { + "epoch": 3.331974453050686, + "grad_norm": 7.084331512451172, + "learning_rate": 5.836815464057617e-06, + "loss": 3.132, + "step": 49040 + }, + { + "epoch": 3.332314173121348, + "grad_norm": 5.773519992828369, + "learning_rate": 5.83639081396929e-06, + "loss": 2.8199, + "step": 49045 + }, + { + "epoch": 3.3326538931920098, + "grad_norm": 7.266257286071777, + "learning_rate": 5.835966163880962e-06, + "loss": 3.0747, + "step": 49050 + }, + { + "epoch": 3.3329936132626714, + "grad_norm": 5.625278472900391, + "learning_rate": 5.835541513792635e-06, + "loss": 3.0204, + "step": 49055 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 7.200693130493164, + "learning_rate": 5.835116863704308e-06, + "loss": 3.0524, + "step": 49060 + }, + { + "epoch": 3.333673053403995, + "grad_norm": 7.527703285217285, + "learning_rate": 5.834692213615981e-06, + "loss": 2.6034, + "step": 49065 + }, + { + "epoch": 3.3340127734746567, + "grad_norm": 8.748478889465332, + "learning_rate": 5.834267563527654e-06, + "loss": 2.7631, + "step": 49070 + }, + { + "epoch": 3.334352493545319, + "grad_norm": 6.399389743804932, + "learning_rate": 5.8338429134393256e-06, + "loss": 2.9384, + "step": 49075 + }, + { + "epoch": 3.3346922136159804, + "grad_norm": 7.383506774902344, + "learning_rate": 5.833418263350999e-06, + "loss": 2.866, + "step": 49080 + }, + { + "epoch": 3.335031933686642, + "grad_norm": 6.255630970001221, + "learning_rate": 5.832993613262672e-06, + "loss": 2.9347, + "step": 49085 + }, + { + "epoch": 3.335371653757304, + "grad_norm": 7.492527961730957, + "learning_rate": 5.832568963174344e-06, + "loss": 2.8297, + "step": 49090 + }, + { + "epoch": 3.335711373827966, + "grad_norm": 6.450595378875732, + "learning_rate": 5.832144313086018e-06, + "loss": 3.1468, + "step": 49095 + }, + { + "epoch": 3.3360510938986274, + "grad_norm": 7.046986103057861, + "learning_rate": 5.8317196629976904e-06, + "loss": 2.9084, + "step": 49100 + }, + { + "epoch": 3.3363908139692895, + "grad_norm": 7.661008834838867, + "learning_rate": 5.831295012909362e-06, + "loss": 2.8658, + "step": 49105 + }, + { + "epoch": 3.336730534039951, + "grad_norm": 6.649855613708496, + "learning_rate": 5.830870362821036e-06, + "loss": 3.2392, + "step": 49110 + }, + { + "epoch": 3.3370702541106128, + "grad_norm": 7.6015753746032715, + "learning_rate": 5.830445712732709e-06, + "loss": 2.7458, + "step": 49115 + }, + { + "epoch": 3.3374099741812744, + "grad_norm": 6.842534065246582, + "learning_rate": 5.8300210626443825e-06, + "loss": 2.874, + "step": 49120 + }, + { + "epoch": 3.3377496942519365, + "grad_norm": 8.206812858581543, + "learning_rate": 5.8295964125560544e-06, + "loss": 2.8845, + "step": 49125 + }, + { + "epoch": 3.338089414322598, + "grad_norm": 6.737429618835449, + "learning_rate": 5.829171762467727e-06, + "loss": 2.9797, + "step": 49130 + }, + { + "epoch": 3.3384291343932597, + "grad_norm": 5.922501564025879, + "learning_rate": 5.8287471123794e-06, + "loss": 2.9, + "step": 49135 + }, + { + "epoch": 3.338768854463922, + "grad_norm": 9.025370597839355, + "learning_rate": 5.828322462291073e-06, + "loss": 3.2547, + "step": 49140 + }, + { + "epoch": 3.3391085745345834, + "grad_norm": 8.858993530273438, + "learning_rate": 5.827897812202745e-06, + "loss": 3.1593, + "step": 49145 + }, + { + "epoch": 3.339448294605245, + "grad_norm": 5.750442981719971, + "learning_rate": 5.8274731621144184e-06, + "loss": 2.9093, + "step": 49150 + }, + { + "epoch": 3.339788014675907, + "grad_norm": 8.80821704864502, + "learning_rate": 5.827048512026091e-06, + "loss": 2.8447, + "step": 49155 + }, + { + "epoch": 3.3401277347465688, + "grad_norm": 8.040141105651855, + "learning_rate": 5.826623861937763e-06, + "loss": 2.9216, + "step": 49160 + }, + { + "epoch": 3.3404674548172304, + "grad_norm": 9.35992431640625, + "learning_rate": 5.826199211849437e-06, + "loss": 2.9158, + "step": 49165 + }, + { + "epoch": 3.3408071748878925, + "grad_norm": 6.941888332366943, + "learning_rate": 5.82577456176111e-06, + "loss": 2.9304, + "step": 49170 + }, + { + "epoch": 3.341146894958554, + "grad_norm": 7.779732704162598, + "learning_rate": 5.825349911672782e-06, + "loss": 2.9005, + "step": 49175 + }, + { + "epoch": 3.3414866150292157, + "grad_norm": 7.7862348556518555, + "learning_rate": 5.824925261584455e-06, + "loss": 2.8051, + "step": 49180 + }, + { + "epoch": 3.341826335099878, + "grad_norm": 6.758364200592041, + "learning_rate": 5.824500611496128e-06, + "loss": 2.6139, + "step": 49185 + }, + { + "epoch": 3.3421660551705394, + "grad_norm": 6.339778423309326, + "learning_rate": 5.8240759614078e-06, + "loss": 2.7827, + "step": 49190 + }, + { + "epoch": 3.342505775241201, + "grad_norm": 7.276236534118652, + "learning_rate": 5.823651311319474e-06, + "loss": 2.7947, + "step": 49195 + }, + { + "epoch": 3.342845495311863, + "grad_norm": 8.65062427520752, + "learning_rate": 5.8232266612311464e-06, + "loss": 2.7973, + "step": 49200 + }, + { + "epoch": 3.3431852153825248, + "grad_norm": 7.745363235473633, + "learning_rate": 5.822802011142818e-06, + "loss": 3.2534, + "step": 49205 + }, + { + "epoch": 3.3435249354531864, + "grad_norm": 9.38711166381836, + "learning_rate": 5.822377361054492e-06, + "loss": 3.0903, + "step": 49210 + }, + { + "epoch": 3.3438646555238485, + "grad_norm": 8.685643196105957, + "learning_rate": 5.821952710966165e-06, + "loss": 2.9477, + "step": 49215 + }, + { + "epoch": 3.34420437559451, + "grad_norm": 7.057826995849609, + "learning_rate": 5.821528060877837e-06, + "loss": 2.8429, + "step": 49220 + }, + { + "epoch": 3.3445440956651717, + "grad_norm": 9.174065589904785, + "learning_rate": 5.8211034107895104e-06, + "loss": 2.921, + "step": 49225 + }, + { + "epoch": 3.344883815735834, + "grad_norm": 7.16883659362793, + "learning_rate": 5.820678760701182e-06, + "loss": 3.0919, + "step": 49230 + }, + { + "epoch": 3.3452235358064955, + "grad_norm": 5.5807647705078125, + "learning_rate": 5.820254110612855e-06, + "loss": 2.5548, + "step": 49235 + }, + { + "epoch": 3.345563255877157, + "grad_norm": 7.436548233032227, + "learning_rate": 5.819829460524529e-06, + "loss": 2.9068, + "step": 49240 + }, + { + "epoch": 3.345902975947819, + "grad_norm": 6.116715431213379, + "learning_rate": 5.819404810436201e-06, + "loss": 2.8574, + "step": 49245 + }, + { + "epoch": 3.346242696018481, + "grad_norm": 10.212565422058105, + "learning_rate": 5.818980160347874e-06, + "loss": 3.0165, + "step": 49250 + }, + { + "epoch": 3.3465824160891424, + "grad_norm": 8.579707145690918, + "learning_rate": 5.818555510259547e-06, + "loss": 2.9694, + "step": 49255 + }, + { + "epoch": 3.3469221361598045, + "grad_norm": 6.5209641456604, + "learning_rate": 5.818130860171219e-06, + "loss": 3.0496, + "step": 49260 + }, + { + "epoch": 3.347261856230466, + "grad_norm": 10.129051208496094, + "learning_rate": 5.817706210082892e-06, + "loss": 3.057, + "step": 49265 + }, + { + "epoch": 3.3476015763011278, + "grad_norm": 7.952390670776367, + "learning_rate": 5.817281559994566e-06, + "loss": 3.0898, + "step": 49270 + }, + { + "epoch": 3.34794129637179, + "grad_norm": 6.708808422088623, + "learning_rate": 5.816856909906238e-06, + "loss": 3.0628, + "step": 49275 + }, + { + "epoch": 3.3482810164424515, + "grad_norm": 7.676601409912109, + "learning_rate": 5.81643225981791e-06, + "loss": 3.0764, + "step": 49280 + }, + { + "epoch": 3.348620736513113, + "grad_norm": 6.3575568199157715, + "learning_rate": 5.816007609729584e-06, + "loss": 2.9662, + "step": 49285 + }, + { + "epoch": 3.348960456583775, + "grad_norm": 6.963106632232666, + "learning_rate": 5.815582959641256e-06, + "loss": 3.0534, + "step": 49290 + }, + { + "epoch": 3.349300176654437, + "grad_norm": 6.712420463562012, + "learning_rate": 5.815158309552929e-06, + "loss": 2.9784, + "step": 49295 + }, + { + "epoch": 3.3496398967250984, + "grad_norm": 7.203134059906006, + "learning_rate": 5.814733659464602e-06, + "loss": 2.869, + "step": 49300 + }, + { + "epoch": 3.3499796167957605, + "grad_norm": 8.918145179748535, + "learning_rate": 5.814309009376274e-06, + "loss": 2.8461, + "step": 49305 + }, + { + "epoch": 3.350319336866422, + "grad_norm": 7.113206386566162, + "learning_rate": 5.813884359287947e-06, + "loss": 2.8712, + "step": 49310 + }, + { + "epoch": 3.3506590569370838, + "grad_norm": 6.016955375671387, + "learning_rate": 5.81345970919962e-06, + "loss": 3.0375, + "step": 49315 + }, + { + "epoch": 3.350998777007746, + "grad_norm": 7.074008941650391, + "learning_rate": 5.813035059111293e-06, + "loss": 3.083, + "step": 49320 + }, + { + "epoch": 3.3513384970784075, + "grad_norm": 7.040643215179443, + "learning_rate": 5.812610409022965e-06, + "loss": 2.8175, + "step": 49325 + }, + { + "epoch": 3.351678217149069, + "grad_norm": 7.557529926300049, + "learning_rate": 5.812185758934638e-06, + "loss": 2.885, + "step": 49330 + }, + { + "epoch": 3.352017937219731, + "grad_norm": 9.093733787536621, + "learning_rate": 5.811761108846311e-06, + "loss": 2.9171, + "step": 49335 + }, + { + "epoch": 3.352357657290393, + "grad_norm": 6.581753730773926, + "learning_rate": 5.811336458757983e-06, + "loss": 2.9534, + "step": 49340 + }, + { + "epoch": 3.3526973773610544, + "grad_norm": 8.278156280517578, + "learning_rate": 5.810911808669657e-06, + "loss": 2.7262, + "step": 49345 + }, + { + "epoch": 3.353037097431716, + "grad_norm": 7.234246253967285, + "learning_rate": 5.81048715858133e-06, + "loss": 2.9079, + "step": 49350 + }, + { + "epoch": 3.353376817502378, + "grad_norm": 7.356922149658203, + "learning_rate": 5.8100625084930016e-06, + "loss": 2.7056, + "step": 49355 + }, + { + "epoch": 3.35371653757304, + "grad_norm": 7.783629417419434, + "learning_rate": 5.809637858404675e-06, + "loss": 2.8258, + "step": 49360 + }, + { + "epoch": 3.3540562576437014, + "grad_norm": 7.3046064376831055, + "learning_rate": 5.809213208316348e-06, + "loss": 2.9965, + "step": 49365 + }, + { + "epoch": 3.3543959777143635, + "grad_norm": 6.597474575042725, + "learning_rate": 5.80878855822802e-06, + "loss": 2.8915, + "step": 49370 + }, + { + "epoch": 3.354735697785025, + "grad_norm": 6.636688232421875, + "learning_rate": 5.808363908139694e-06, + "loss": 2.8052, + "step": 49375 + }, + { + "epoch": 3.3550754178556867, + "grad_norm": 7.320723056793213, + "learning_rate": 5.807939258051366e-06, + "loss": 2.8366, + "step": 49380 + }, + { + "epoch": 3.355415137926349, + "grad_norm": 9.56503963470459, + "learning_rate": 5.807514607963038e-06, + "loss": 2.9453, + "step": 49385 + }, + { + "epoch": 3.3557548579970105, + "grad_norm": 5.695808410644531, + "learning_rate": 5.807089957874712e-06, + "loss": 2.701, + "step": 49390 + }, + { + "epoch": 3.356094578067672, + "grad_norm": 7.103822708129883, + "learning_rate": 5.806665307786384e-06, + "loss": 3.0435, + "step": 49395 + }, + { + "epoch": 3.356434298138334, + "grad_norm": 8.275370597839355, + "learning_rate": 5.806240657698057e-06, + "loss": 2.9882, + "step": 49400 + }, + { + "epoch": 3.356774018208996, + "grad_norm": 6.906413555145264, + "learning_rate": 5.8058160076097304e-06, + "loss": 2.8555, + "step": 49405 + }, + { + "epoch": 3.3571137382796574, + "grad_norm": 7.021193981170654, + "learning_rate": 5.805391357521402e-06, + "loss": 3.2965, + "step": 49410 + }, + { + "epoch": 3.3574534583503195, + "grad_norm": 5.929733753204346, + "learning_rate": 5.804966707433075e-06, + "loss": 2.743, + "step": 49415 + }, + { + "epoch": 3.357793178420981, + "grad_norm": 5.9011712074279785, + "learning_rate": 5.804542057344749e-06, + "loss": 2.9061, + "step": 49420 + }, + { + "epoch": 3.3581328984916428, + "grad_norm": 6.9698967933654785, + "learning_rate": 5.804117407256421e-06, + "loss": 3.0329, + "step": 49425 + }, + { + "epoch": 3.358472618562305, + "grad_norm": 6.06493616104126, + "learning_rate": 5.803692757168094e-06, + "loss": 2.8765, + "step": 49430 + }, + { + "epoch": 3.3588123386329665, + "grad_norm": 6.195511817932129, + "learning_rate": 5.803268107079767e-06, + "loss": 2.6135, + "step": 49435 + }, + { + "epoch": 3.359152058703628, + "grad_norm": 7.95913028717041, + "learning_rate": 5.802843456991439e-06, + "loss": 2.6226, + "step": 49440 + }, + { + "epoch": 3.35949177877429, + "grad_norm": 7.6958184242248535, + "learning_rate": 5.802418806903112e-06, + "loss": 2.765, + "step": 49445 + }, + { + "epoch": 3.359831498844952, + "grad_norm": 5.60326623916626, + "learning_rate": 5.801994156814786e-06, + "loss": 2.6497, + "step": 49450 + }, + { + "epoch": 3.3601712189156134, + "grad_norm": 7.509664535522461, + "learning_rate": 5.801569506726458e-06, + "loss": 3.2804, + "step": 49455 + }, + { + "epoch": 3.360510938986275, + "grad_norm": 7.076169490814209, + "learning_rate": 5.801144856638131e-06, + "loss": 3.0012, + "step": 49460 + }, + { + "epoch": 3.360850659056937, + "grad_norm": 8.820225715637207, + "learning_rate": 5.800720206549804e-06, + "loss": 2.6916, + "step": 49465 + }, + { + "epoch": 3.3611903791275988, + "grad_norm": 7.198815822601318, + "learning_rate": 5.800295556461476e-06, + "loss": 3.0214, + "step": 49470 + }, + { + "epoch": 3.3615300991982604, + "grad_norm": 7.383418083190918, + "learning_rate": 5.79987090637315e-06, + "loss": 2.9954, + "step": 49475 + }, + { + "epoch": 3.3618698192689225, + "grad_norm": 7.25649881362915, + "learning_rate": 5.799446256284822e-06, + "loss": 2.8215, + "step": 49480 + }, + { + "epoch": 3.362209539339584, + "grad_norm": 6.9499382972717285, + "learning_rate": 5.799021606196494e-06, + "loss": 2.8165, + "step": 49485 + }, + { + "epoch": 3.3625492594102457, + "grad_norm": 7.625580310821533, + "learning_rate": 5.798596956108168e-06, + "loss": 3.1011, + "step": 49490 + }, + { + "epoch": 3.362888979480908, + "grad_norm": 8.810893058776855, + "learning_rate": 5.79817230601984e-06, + "loss": 3.0169, + "step": 49495 + }, + { + "epoch": 3.3632286995515694, + "grad_norm": 7.40373420715332, + "learning_rate": 5.797747655931513e-06, + "loss": 2.8417, + "step": 49500 + }, + { + "epoch": 3.363568419622231, + "grad_norm": 6.679191589355469, + "learning_rate": 5.7973230058431864e-06, + "loss": 2.7624, + "step": 49505 + }, + { + "epoch": 3.363908139692893, + "grad_norm": 8.189706802368164, + "learning_rate": 5.796898355754858e-06, + "loss": 2.9116, + "step": 49510 + }, + { + "epoch": 3.364247859763555, + "grad_norm": 8.939448356628418, + "learning_rate": 5.796473705666531e-06, + "loss": 3.2699, + "step": 49515 + }, + { + "epoch": 3.3645875798342164, + "grad_norm": 8.108807563781738, + "learning_rate": 5.796049055578205e-06, + "loss": 2.9094, + "step": 49520 + }, + { + "epoch": 3.3649272999048785, + "grad_norm": 7.451289653778076, + "learning_rate": 5.795624405489877e-06, + "loss": 2.9969, + "step": 49525 + }, + { + "epoch": 3.36526701997554, + "grad_norm": 7.337569236755371, + "learning_rate": 5.79519975540155e-06, + "loss": 3.0091, + "step": 49530 + }, + { + "epoch": 3.3656067400462018, + "grad_norm": 7.343556880950928, + "learning_rate": 5.794775105313223e-06, + "loss": 3.1722, + "step": 49535 + }, + { + "epoch": 3.365946460116864, + "grad_norm": 7.302038192749023, + "learning_rate": 5.794350455224895e-06, + "loss": 3.0565, + "step": 49540 + }, + { + "epoch": 3.3662861801875255, + "grad_norm": 8.165271759033203, + "learning_rate": 5.793925805136568e-06, + "loss": 3.034, + "step": 49545 + }, + { + "epoch": 3.366625900258187, + "grad_norm": 5.493114471435547, + "learning_rate": 5.793501155048241e-06, + "loss": 3.0868, + "step": 49550 + }, + { + "epoch": 3.366965620328849, + "grad_norm": 7.177222728729248, + "learning_rate": 5.793076504959914e-06, + "loss": 3.0537, + "step": 49555 + }, + { + "epoch": 3.367305340399511, + "grad_norm": 7.2261247634887695, + "learning_rate": 5.792651854871586e-06, + "loss": 2.648, + "step": 49560 + }, + { + "epoch": 3.3676450604701724, + "grad_norm": 9.274568557739258, + "learning_rate": 5.792227204783259e-06, + "loss": 2.8978, + "step": 49565 + }, + { + "epoch": 3.3679847805408345, + "grad_norm": 8.269844055175781, + "learning_rate": 5.791802554694932e-06, + "loss": 2.969, + "step": 49570 + }, + { + "epoch": 3.368324500611496, + "grad_norm": 6.745450019836426, + "learning_rate": 5.791377904606604e-06, + "loss": 2.8931, + "step": 49575 + }, + { + "epoch": 3.3686642206821578, + "grad_norm": 9.09343433380127, + "learning_rate": 5.790953254518278e-06, + "loss": 3.0665, + "step": 49580 + }, + { + "epoch": 3.36900394075282, + "grad_norm": 8.417157173156738, + "learning_rate": 5.79052860442995e-06, + "loss": 2.7735, + "step": 49585 + }, + { + "epoch": 3.3693436608234815, + "grad_norm": 5.777881622314453, + "learning_rate": 5.790103954341622e-06, + "loss": 2.8935, + "step": 49590 + }, + { + "epoch": 3.369683380894143, + "grad_norm": 8.132521629333496, + "learning_rate": 5.789679304253296e-06, + "loss": 3.0097, + "step": 49595 + }, + { + "epoch": 3.370023100964805, + "grad_norm": 7.980018138885498, + "learning_rate": 5.789254654164969e-06, + "loss": 2.9398, + "step": 49600 + }, + { + "epoch": 3.370362821035467, + "grad_norm": 6.804202556610107, + "learning_rate": 5.788830004076641e-06, + "loss": 3.0368, + "step": 49605 + }, + { + "epoch": 3.3707025411061284, + "grad_norm": 6.986982345581055, + "learning_rate": 5.788405353988314e-06, + "loss": 3.0011, + "step": 49610 + }, + { + "epoch": 3.3710422611767905, + "grad_norm": 5.712082386016846, + "learning_rate": 5.787980703899987e-06, + "loss": 2.8252, + "step": 49615 + }, + { + "epoch": 3.371381981247452, + "grad_norm": 7.094027519226074, + "learning_rate": 5.787556053811659e-06, + "loss": 2.8735, + "step": 49620 + }, + { + "epoch": 3.3717217013181138, + "grad_norm": 9.584994316101074, + "learning_rate": 5.787131403723333e-06, + "loss": 2.938, + "step": 49625 + }, + { + "epoch": 3.372061421388776, + "grad_norm": 7.517410755157471, + "learning_rate": 5.786706753635006e-06, + "loss": 3.0073, + "step": 49630 + }, + { + "epoch": 3.3724011414594375, + "grad_norm": 8.322463035583496, + "learning_rate": 5.7862821035466776e-06, + "loss": 2.7638, + "step": 49635 + }, + { + "epoch": 3.372740861530099, + "grad_norm": 6.7069478034973145, + "learning_rate": 5.785857453458351e-06, + "loss": 2.9035, + "step": 49640 + }, + { + "epoch": 3.373080581600761, + "grad_norm": 7.597870826721191, + "learning_rate": 5.785432803370023e-06, + "loss": 2.9372, + "step": 49645 + }, + { + "epoch": 3.373420301671423, + "grad_norm": 6.080428123474121, + "learning_rate": 5.785008153281696e-06, + "loss": 3.0108, + "step": 49650 + }, + { + "epoch": 3.3737600217420844, + "grad_norm": 8.148886680603027, + "learning_rate": 5.78458350319337e-06, + "loss": 3.0989, + "step": 49655 + }, + { + "epoch": 3.3740997418127465, + "grad_norm": 8.06602954864502, + "learning_rate": 5.7841588531050416e-06, + "loss": 2.7464, + "step": 49660 + }, + { + "epoch": 3.374439461883408, + "grad_norm": 6.694769859313965, + "learning_rate": 5.783734203016714e-06, + "loss": 2.9481, + "step": 49665 + }, + { + "epoch": 3.37477918195407, + "grad_norm": 6.00588846206665, + "learning_rate": 5.783309552928388e-06, + "loss": 3.0271, + "step": 49670 + }, + { + "epoch": 3.375118902024732, + "grad_norm": 6.46501350402832, + "learning_rate": 5.78288490284006e-06, + "loss": 2.7905, + "step": 49675 + }, + { + "epoch": 3.3754586220953935, + "grad_norm": 7.880086421966553, + "learning_rate": 5.782460252751733e-06, + "loss": 2.8249, + "step": 49680 + }, + { + "epoch": 3.375798342166055, + "grad_norm": 6.049464225769043, + "learning_rate": 5.782035602663406e-06, + "loss": 2.8963, + "step": 49685 + }, + { + "epoch": 3.3761380622367168, + "grad_norm": 8.096434593200684, + "learning_rate": 5.781610952575078e-06, + "loss": 2.8371, + "step": 49690 + }, + { + "epoch": 3.376477782307379, + "grad_norm": 7.345740795135498, + "learning_rate": 5.781186302486751e-06, + "loss": 3.1749, + "step": 49695 + }, + { + "epoch": 3.3768175023780405, + "grad_norm": 7.617060661315918, + "learning_rate": 5.780761652398425e-06, + "loss": 2.8932, + "step": 49700 + }, + { + "epoch": 3.377157222448702, + "grad_norm": 7.379764080047607, + "learning_rate": 5.780337002310097e-06, + "loss": 2.9171, + "step": 49705 + }, + { + "epoch": 3.377496942519364, + "grad_norm": 7.914454936981201, + "learning_rate": 5.77991235222177e-06, + "loss": 2.9742, + "step": 49710 + }, + { + "epoch": 3.377836662590026, + "grad_norm": 5.86680793762207, + "learning_rate": 5.779487702133442e-06, + "loss": 2.7868, + "step": 49715 + }, + { + "epoch": 3.3781763826606874, + "grad_norm": 6.683329105377197, + "learning_rate": 5.779063052045115e-06, + "loss": 3.0669, + "step": 49720 + }, + { + "epoch": 3.3785161027313495, + "grad_norm": 7.226804733276367, + "learning_rate": 5.778638401956788e-06, + "loss": 2.8126, + "step": 49725 + }, + { + "epoch": 3.378855822802011, + "grad_norm": 7.012740135192871, + "learning_rate": 5.778213751868461e-06, + "loss": 2.952, + "step": 49730 + }, + { + "epoch": 3.3791955428726728, + "grad_norm": 7.556081771850586, + "learning_rate": 5.777789101780134e-06, + "loss": 2.9019, + "step": 49735 + }, + { + "epoch": 3.379535262943335, + "grad_norm": 8.50284481048584, + "learning_rate": 5.7773644516918055e-06, + "loss": 2.9244, + "step": 49740 + }, + { + "epoch": 3.3798749830139965, + "grad_norm": 9.123859405517578, + "learning_rate": 5.776939801603479e-06, + "loss": 2.9611, + "step": 49745 + }, + { + "epoch": 3.380214703084658, + "grad_norm": 7.309154510498047, + "learning_rate": 5.776515151515152e-06, + "loss": 2.8729, + "step": 49750 + }, + { + "epoch": 3.38055442315532, + "grad_norm": 6.7473602294921875, + "learning_rate": 5.776090501426824e-06, + "loss": 2.9159, + "step": 49755 + }, + { + "epoch": 3.380894143225982, + "grad_norm": 7.969610691070557, + "learning_rate": 5.775665851338498e-06, + "loss": 2.6483, + "step": 49760 + }, + { + "epoch": 3.3812338632966434, + "grad_norm": 7.069694519042969, + "learning_rate": 5.77524120125017e-06, + "loss": 3.0043, + "step": 49765 + }, + { + "epoch": 3.3815735833673055, + "grad_norm": 6.050764560699463, + "learning_rate": 5.774816551161842e-06, + "loss": 2.7303, + "step": 49770 + }, + { + "epoch": 3.381913303437967, + "grad_norm": 7.0655317306518555, + "learning_rate": 5.774391901073516e-06, + "loss": 3.0067, + "step": 49775 + }, + { + "epoch": 3.3822530235086288, + "grad_norm": 6.831150531768799, + "learning_rate": 5.773967250985189e-06, + "loss": 3.074, + "step": 49780 + }, + { + "epoch": 3.382592743579291, + "grad_norm": 6.12942361831665, + "learning_rate": 5.773542600896861e-06, + "loss": 3.014, + "step": 49785 + }, + { + "epoch": 3.3829324636499525, + "grad_norm": 5.920955181121826, + "learning_rate": 5.773117950808534e-06, + "loss": 2.9931, + "step": 49790 + }, + { + "epoch": 3.383272183720614, + "grad_norm": 7.139315128326416, + "learning_rate": 5.772693300720207e-06, + "loss": 3.0078, + "step": 49795 + }, + { + "epoch": 3.3836119037912757, + "grad_norm": 6.403778553009033, + "learning_rate": 5.77226865063188e-06, + "loss": 2.9741, + "step": 49800 + }, + { + "epoch": 3.383951623861938, + "grad_norm": 6.241901397705078, + "learning_rate": 5.771844000543553e-06, + "loss": 2.6378, + "step": 49805 + }, + { + "epoch": 3.3842913439325994, + "grad_norm": 7.010397911071777, + "learning_rate": 5.771419350455225e-06, + "loss": 3.1245, + "step": 49810 + }, + { + "epoch": 3.384631064003261, + "grad_norm": 7.489863395690918, + "learning_rate": 5.770994700366898e-06, + "loss": 3.0283, + "step": 49815 + }, + { + "epoch": 3.384970784073923, + "grad_norm": 6.915450096130371, + "learning_rate": 5.770570050278571e-06, + "loss": 2.8911, + "step": 49820 + }, + { + "epoch": 3.385310504144585, + "grad_norm": 6.6423749923706055, + "learning_rate": 5.770145400190243e-06, + "loss": 3.0364, + "step": 49825 + }, + { + "epoch": 3.3856502242152464, + "grad_norm": 7.393054008483887, + "learning_rate": 5.769720750101917e-06, + "loss": 3.002, + "step": 49830 + }, + { + "epoch": 3.3859899442859085, + "grad_norm": 9.585013389587402, + "learning_rate": 5.76929610001359e-06, + "loss": 3.2358, + "step": 49835 + }, + { + "epoch": 3.38632966435657, + "grad_norm": 6.376208782196045, + "learning_rate": 5.7688714499252616e-06, + "loss": 2.7972, + "step": 49840 + }, + { + "epoch": 3.3866693844272318, + "grad_norm": 7.622546195983887, + "learning_rate": 5.768446799836935e-06, + "loss": 2.8078, + "step": 49845 + }, + { + "epoch": 3.387009104497894, + "grad_norm": 7.222412586212158, + "learning_rate": 5.768022149748608e-06, + "loss": 3.1123, + "step": 49850 + }, + { + "epoch": 3.3873488245685555, + "grad_norm": 7.783916473388672, + "learning_rate": 5.76759749966028e-06, + "loss": 3.031, + "step": 49855 + }, + { + "epoch": 3.387688544639217, + "grad_norm": 8.574929237365723, + "learning_rate": 5.767172849571954e-06, + "loss": 3.0127, + "step": 49860 + }, + { + "epoch": 3.388028264709879, + "grad_norm": 6.777508735656738, + "learning_rate": 5.766748199483626e-06, + "loss": 2.9195, + "step": 49865 + }, + { + "epoch": 3.388367984780541, + "grad_norm": 7.668612480163574, + "learning_rate": 5.766323549395298e-06, + "loss": 2.9983, + "step": 49870 + }, + { + "epoch": 3.3887077048512024, + "grad_norm": 7.625641345977783, + "learning_rate": 5.765898899306972e-06, + "loss": 3.1721, + "step": 49875 + }, + { + "epoch": 3.3890474249218645, + "grad_norm": 7.931344032287598, + "learning_rate": 5.765474249218645e-06, + "loss": 2.9231, + "step": 49880 + }, + { + "epoch": 3.389387144992526, + "grad_norm": 8.354511260986328, + "learning_rate": 5.765049599130317e-06, + "loss": 2.8683, + "step": 49885 + }, + { + "epoch": 3.3897268650631878, + "grad_norm": 8.846640586853027, + "learning_rate": 5.76462494904199e-06, + "loss": 2.8448, + "step": 49890 + }, + { + "epoch": 3.39006658513385, + "grad_norm": 6.5666985511779785, + "learning_rate": 5.764200298953662e-06, + "loss": 3.072, + "step": 49895 + }, + { + "epoch": 3.3904063052045115, + "grad_norm": 8.423257827758789, + "learning_rate": 5.763775648865335e-06, + "loss": 3.0845, + "step": 49900 + }, + { + "epoch": 3.390746025275173, + "grad_norm": 8.49833869934082, + "learning_rate": 5.763350998777009e-06, + "loss": 2.9168, + "step": 49905 + }, + { + "epoch": 3.391085745345835, + "grad_norm": 7.812121868133545, + "learning_rate": 5.762926348688681e-06, + "loss": 2.8724, + "step": 49910 + }, + { + "epoch": 3.391425465416497, + "grad_norm": 8.685689926147461, + "learning_rate": 5.7625016986003536e-06, + "loss": 2.8544, + "step": 49915 + }, + { + "epoch": 3.3917651854871584, + "grad_norm": 9.400155067443848, + "learning_rate": 5.762077048512027e-06, + "loss": 2.8378, + "step": 49920 + }, + { + "epoch": 3.3921049055578205, + "grad_norm": 7.775266170501709, + "learning_rate": 5.761652398423699e-06, + "loss": 3.0001, + "step": 49925 + }, + { + "epoch": 3.392444625628482, + "grad_norm": 6.472149848937988, + "learning_rate": 5.761227748335372e-06, + "loss": 2.9033, + "step": 49930 + }, + { + "epoch": 3.3927843456991438, + "grad_norm": 8.165515899658203, + "learning_rate": 5.760803098247046e-06, + "loss": 3.1311, + "step": 49935 + }, + { + "epoch": 3.393124065769806, + "grad_norm": 7.42587947845459, + "learning_rate": 5.7603784481587176e-06, + "loss": 2.8789, + "step": 49940 + }, + { + "epoch": 3.3934637858404675, + "grad_norm": 7.410848140716553, + "learning_rate": 5.75995379807039e-06, + "loss": 3.0345, + "step": 49945 + }, + { + "epoch": 3.393803505911129, + "grad_norm": 6.92191219329834, + "learning_rate": 5.759529147982064e-06, + "loss": 2.9445, + "step": 49950 + }, + { + "epoch": 3.394143225981791, + "grad_norm": 5.938358783721924, + "learning_rate": 5.759104497893736e-06, + "loss": 3.0449, + "step": 49955 + }, + { + "epoch": 3.394482946052453, + "grad_norm": 6.231206893920898, + "learning_rate": 5.758679847805409e-06, + "loss": 3.0262, + "step": 49960 + }, + { + "epoch": 3.3948226661231145, + "grad_norm": 6.628662586212158, + "learning_rate": 5.7582551977170816e-06, + "loss": 3.0139, + "step": 49965 + }, + { + "epoch": 3.3951623861937765, + "grad_norm": 6.958836555480957, + "learning_rate": 5.757830547628754e-06, + "loss": 2.7269, + "step": 49970 + }, + { + "epoch": 3.395502106264438, + "grad_norm": 5.51931095123291, + "learning_rate": 5.757405897540427e-06, + "loss": 2.9377, + "step": 49975 + }, + { + "epoch": 3.3958418263351, + "grad_norm": 7.661561012268066, + "learning_rate": 5.7569812474521e-06, + "loss": 2.962, + "step": 49980 + }, + { + "epoch": 3.396181546405762, + "grad_norm": 5.9661078453063965, + "learning_rate": 5.756556597363773e-06, + "loss": 2.9301, + "step": 49985 + }, + { + "epoch": 3.3965212664764235, + "grad_norm": 5.8165106773376465, + "learning_rate": 5.756131947275445e-06, + "loss": 3.041, + "step": 49990 + }, + { + "epoch": 3.396860986547085, + "grad_norm": 6.103263854980469, + "learning_rate": 5.755707297187118e-06, + "loss": 2.8197, + "step": 49995 + }, + { + "epoch": 3.397200706617747, + "grad_norm": 6.418395042419434, + "learning_rate": 5.755282647098791e-06, + "loss": 3.0502, + "step": 50000 + }, + { + "epoch": 3.397540426688409, + "grad_norm": 7.789127826690674, + "learning_rate": 5.754857997010463e-06, + "loss": 3.0904, + "step": 50005 + }, + { + "epoch": 3.3978801467590705, + "grad_norm": 8.330771446228027, + "learning_rate": 5.754433346922137e-06, + "loss": 3.1231, + "step": 50010 + }, + { + "epoch": 3.3982198668297325, + "grad_norm": 7.410358905792236, + "learning_rate": 5.75400869683381e-06, + "loss": 3.0971, + "step": 50015 + }, + { + "epoch": 3.398559586900394, + "grad_norm": 7.3294830322265625, + "learning_rate": 5.7535840467454815e-06, + "loss": 2.883, + "step": 50020 + }, + { + "epoch": 3.398899306971056, + "grad_norm": 7.661747455596924, + "learning_rate": 5.753159396657155e-06, + "loss": 2.931, + "step": 50025 + }, + { + "epoch": 3.3992390270417174, + "grad_norm": 5.694294452667236, + "learning_rate": 5.752734746568828e-06, + "loss": 2.8878, + "step": 50030 + }, + { + "epoch": 3.3995787471123795, + "grad_norm": 8.048738479614258, + "learning_rate": 5.7523100964805e-06, + "loss": 2.9751, + "step": 50035 + }, + { + "epoch": 3.399918467183041, + "grad_norm": 7.072715759277344, + "learning_rate": 5.751885446392174e-06, + "loss": 3.2107, + "step": 50040 + }, + { + "epoch": 3.4002581872537028, + "grad_norm": 8.718249320983887, + "learning_rate": 5.751460796303846e-06, + "loss": 2.9012, + "step": 50045 + }, + { + "epoch": 3.400597907324365, + "grad_norm": 6.999054431915283, + "learning_rate": 5.751036146215518e-06, + "loss": 3.0009, + "step": 50050 + }, + { + "epoch": 3.4009376273950265, + "grad_norm": 7.619232177734375, + "learning_rate": 5.750611496127192e-06, + "loss": 2.9577, + "step": 50055 + }, + { + "epoch": 3.401277347465688, + "grad_norm": 5.815208435058594, + "learning_rate": 5.750186846038864e-06, + "loss": 2.8013, + "step": 50060 + }, + { + "epoch": 3.40161706753635, + "grad_norm": 7.0376057624816895, + "learning_rate": 5.749762195950537e-06, + "loss": 3.048, + "step": 50065 + }, + { + "epoch": 3.401956787607012, + "grad_norm": 7.740194797515869, + "learning_rate": 5.74933754586221e-06, + "loss": 2.9888, + "step": 50070 + }, + { + "epoch": 3.4022965076776734, + "grad_norm": 7.690343379974365, + "learning_rate": 5.748912895773882e-06, + "loss": 3.0131, + "step": 50075 + }, + { + "epoch": 3.4026362277483355, + "grad_norm": 6.177016258239746, + "learning_rate": 5.748488245685555e-06, + "loss": 2.6503, + "step": 50080 + }, + { + "epoch": 3.402975947818997, + "grad_norm": 7.011213302612305, + "learning_rate": 5.748063595597229e-06, + "loss": 3.1386, + "step": 50085 + }, + { + "epoch": 3.403315667889659, + "grad_norm": 7.239718914031982, + "learning_rate": 5.747638945508901e-06, + "loss": 2.9836, + "step": 50090 + }, + { + "epoch": 3.403655387960321, + "grad_norm": 9.602765083312988, + "learning_rate": 5.7472142954205735e-06, + "loss": 3.1178, + "step": 50095 + }, + { + "epoch": 3.4039951080309825, + "grad_norm": 6.653228282928467, + "learning_rate": 5.746789645332247e-06, + "loss": 2.8982, + "step": 50100 + }, + { + "epoch": 3.404334828101644, + "grad_norm": 8.397440910339355, + "learning_rate": 5.746364995243919e-06, + "loss": 2.9416, + "step": 50105 + }, + { + "epoch": 3.404674548172306, + "grad_norm": 7.412312984466553, + "learning_rate": 5.745940345155592e-06, + "loss": 3.0251, + "step": 50110 + }, + { + "epoch": 3.405014268242968, + "grad_norm": 7.571224212646484, + "learning_rate": 5.745515695067266e-06, + "loss": 3.0413, + "step": 50115 + }, + { + "epoch": 3.4053539883136295, + "grad_norm": 7.246413707733154, + "learning_rate": 5.7450910449789375e-06, + "loss": 3.1631, + "step": 50120 + }, + { + "epoch": 3.4056937083842915, + "grad_norm": 8.581528663635254, + "learning_rate": 5.74466639489061e-06, + "loss": 3.0559, + "step": 50125 + }, + { + "epoch": 3.406033428454953, + "grad_norm": 6.899304389953613, + "learning_rate": 5.744241744802284e-06, + "loss": 3.0661, + "step": 50130 + }, + { + "epoch": 3.406373148525615, + "grad_norm": 6.916772365570068, + "learning_rate": 5.743817094713956e-06, + "loss": 2.8386, + "step": 50135 + }, + { + "epoch": 3.4067128685962764, + "grad_norm": 6.910708904266357, + "learning_rate": 5.74339244462563e-06, + "loss": 3.074, + "step": 50140 + }, + { + "epoch": 3.4070525886669385, + "grad_norm": 7.099345684051514, + "learning_rate": 5.7429677945373016e-06, + "loss": 2.9142, + "step": 50145 + }, + { + "epoch": 3.4073923087376, + "grad_norm": 7.505921840667725, + "learning_rate": 5.742543144448974e-06, + "loss": 2.9279, + "step": 50150 + }, + { + "epoch": 3.4077320288082618, + "grad_norm": 7.357627868652344, + "learning_rate": 5.742118494360648e-06, + "loss": 3.0288, + "step": 50155 + }, + { + "epoch": 3.408071748878924, + "grad_norm": 7.24204683303833, + "learning_rate": 5.74169384427232e-06, + "loss": 3.0373, + "step": 50160 + }, + { + "epoch": 3.4084114689495855, + "grad_norm": 8.126716613769531, + "learning_rate": 5.741269194183993e-06, + "loss": 3.1925, + "step": 50165 + }, + { + "epoch": 3.408751189020247, + "grad_norm": 6.620179176330566, + "learning_rate": 5.740844544095666e-06, + "loss": 2.9778, + "step": 50170 + }, + { + "epoch": 3.409090909090909, + "grad_norm": 6.090761184692383, + "learning_rate": 5.740419894007338e-06, + "loss": 2.8169, + "step": 50175 + }, + { + "epoch": 3.409430629161571, + "grad_norm": 7.685548305511475, + "learning_rate": 5.739995243919011e-06, + "loss": 2.939, + "step": 50180 + }, + { + "epoch": 3.4097703492322324, + "grad_norm": 7.572020053863525, + "learning_rate": 5.739570593830685e-06, + "loss": 2.8833, + "step": 50185 + }, + { + "epoch": 3.4101100693028945, + "grad_norm": 8.521007537841797, + "learning_rate": 5.739145943742357e-06, + "loss": 2.9313, + "step": 50190 + }, + { + "epoch": 3.410449789373556, + "grad_norm": 7.25687313079834, + "learning_rate": 5.7387212936540296e-06, + "loss": 2.7617, + "step": 50195 + }, + { + "epoch": 3.4107895094442178, + "grad_norm": 8.415238380432129, + "learning_rate": 5.738296643565703e-06, + "loss": 3.0847, + "step": 50200 + }, + { + "epoch": 3.41112922951488, + "grad_norm": 7.552134990692139, + "learning_rate": 5.737871993477375e-06, + "loss": 3.0345, + "step": 50205 + }, + { + "epoch": 3.4114689495855415, + "grad_norm": 5.893405914306641, + "learning_rate": 5.737447343389048e-06, + "loss": 2.7732, + "step": 50210 + }, + { + "epoch": 3.411808669656203, + "grad_norm": 6.419479846954346, + "learning_rate": 5.737022693300721e-06, + "loss": 3.0523, + "step": 50215 + }, + { + "epoch": 3.412148389726865, + "grad_norm": 6.06226110458374, + "learning_rate": 5.7365980432123936e-06, + "loss": 2.8956, + "step": 50220 + }, + { + "epoch": 3.412488109797527, + "grad_norm": 7.981592655181885, + "learning_rate": 5.736173393124066e-06, + "loss": 2.7372, + "step": 50225 + }, + { + "epoch": 3.4128278298681884, + "grad_norm": 8.318147659301758, + "learning_rate": 5.735748743035739e-06, + "loss": 2.8611, + "step": 50230 + }, + { + "epoch": 3.4131675499388505, + "grad_norm": 6.634959697723389, + "learning_rate": 5.735324092947412e-06, + "loss": 2.6665, + "step": 50235 + }, + { + "epoch": 3.413507270009512, + "grad_norm": 6.173363208770752, + "learning_rate": 5.734899442859084e-06, + "loss": 2.8387, + "step": 50240 + }, + { + "epoch": 3.413846990080174, + "grad_norm": 7.442813396453857, + "learning_rate": 5.7344747927707576e-06, + "loss": 2.7045, + "step": 50245 + }, + { + "epoch": 3.414186710150836, + "grad_norm": 7.090803146362305, + "learning_rate": 5.73405014268243e-06, + "loss": 3.0753, + "step": 50250 + }, + { + "epoch": 3.4145264302214975, + "grad_norm": 7.1183180809021, + "learning_rate": 5.733625492594102e-06, + "loss": 2.864, + "step": 50255 + }, + { + "epoch": 3.414866150292159, + "grad_norm": 7.4899492263793945, + "learning_rate": 5.733200842505776e-06, + "loss": 3.1095, + "step": 50260 + }, + { + "epoch": 3.415205870362821, + "grad_norm": 7.2304863929748535, + "learning_rate": 5.732776192417449e-06, + "loss": 3.1801, + "step": 50265 + }, + { + "epoch": 3.415545590433483, + "grad_norm": 9.09505844116211, + "learning_rate": 5.732351542329121e-06, + "loss": 3.1304, + "step": 50270 + }, + { + "epoch": 3.4158853105041445, + "grad_norm": 8.376733779907227, + "learning_rate": 5.731926892240794e-06, + "loss": 2.9248, + "step": 50275 + }, + { + "epoch": 3.4162250305748065, + "grad_norm": 6.752462387084961, + "learning_rate": 5.731502242152467e-06, + "loss": 3.2089, + "step": 50280 + }, + { + "epoch": 3.416564750645468, + "grad_norm": 7.814446926116943, + "learning_rate": 5.731077592064139e-06, + "loss": 2.9094, + "step": 50285 + }, + { + "epoch": 3.41690447071613, + "grad_norm": 6.335479736328125, + "learning_rate": 5.730652941975813e-06, + "loss": 3.0019, + "step": 50290 + }, + { + "epoch": 3.417244190786792, + "grad_norm": 7.117396831512451, + "learning_rate": 5.7302282918874856e-06, + "loss": 2.9681, + "step": 50295 + }, + { + "epoch": 3.4175839108574535, + "grad_norm": 6.884106159210205, + "learning_rate": 5.7298036417991575e-06, + "loss": 2.854, + "step": 50300 + }, + { + "epoch": 3.417923630928115, + "grad_norm": 6.003307819366455, + "learning_rate": 5.729378991710831e-06, + "loss": 2.8137, + "step": 50305 + }, + { + "epoch": 3.418263350998777, + "grad_norm": 6.948925495147705, + "learning_rate": 5.728954341622503e-06, + "loss": 2.9572, + "step": 50310 + }, + { + "epoch": 3.418603071069439, + "grad_norm": 8.046704292297363, + "learning_rate": 5.728529691534176e-06, + "loss": 2.7571, + "step": 50315 + }, + { + "epoch": 3.4189427911401005, + "grad_norm": 6.763134479522705, + "learning_rate": 5.72810504144585e-06, + "loss": 2.9373, + "step": 50320 + }, + { + "epoch": 3.4192825112107625, + "grad_norm": 6.9168901443481445, + "learning_rate": 5.7276803913575215e-06, + "loss": 2.9402, + "step": 50325 + }, + { + "epoch": 3.419622231281424, + "grad_norm": 7.98268461227417, + "learning_rate": 5.727255741269194e-06, + "loss": 3.1787, + "step": 50330 + }, + { + "epoch": 3.419961951352086, + "grad_norm": 6.773372173309326, + "learning_rate": 5.726831091180868e-06, + "loss": 2.8347, + "step": 50335 + }, + { + "epoch": 3.420301671422748, + "grad_norm": 9.37230396270752, + "learning_rate": 5.72640644109254e-06, + "loss": 3.2838, + "step": 50340 + }, + { + "epoch": 3.4206413914934095, + "grad_norm": 7.546730995178223, + "learning_rate": 5.725981791004213e-06, + "loss": 3.0449, + "step": 50345 + }, + { + "epoch": 3.420981111564071, + "grad_norm": 7.527979850769043, + "learning_rate": 5.725557140915886e-06, + "loss": 3.0251, + "step": 50350 + }, + { + "epoch": 3.421320831634733, + "grad_norm": 7.371557235717773, + "learning_rate": 5.725132490827558e-06, + "loss": 3.0109, + "step": 50355 + }, + { + "epoch": 3.421660551705395, + "grad_norm": 8.747239112854004, + "learning_rate": 5.724707840739231e-06, + "loss": 2.8343, + "step": 50360 + }, + { + "epoch": 3.4220002717760565, + "grad_norm": 5.981770992279053, + "learning_rate": 5.724283190650905e-06, + "loss": 2.8978, + "step": 50365 + }, + { + "epoch": 3.422339991846718, + "grad_norm": 7.708380222320557, + "learning_rate": 5.723858540562577e-06, + "loss": 2.9198, + "step": 50370 + }, + { + "epoch": 3.42267971191738, + "grad_norm": 7.277462959289551, + "learning_rate": 5.7234338904742495e-06, + "loss": 3.0087, + "step": 50375 + }, + { + "epoch": 3.423019431988042, + "grad_norm": 8.627724647521973, + "learning_rate": 5.723009240385922e-06, + "loss": 2.6682, + "step": 50380 + }, + { + "epoch": 3.4233591520587034, + "grad_norm": 8.1124849319458, + "learning_rate": 5.722584590297595e-06, + "loss": 3.1727, + "step": 50385 + }, + { + "epoch": 3.4236988721293655, + "grad_norm": 7.539731979370117, + "learning_rate": 5.722159940209268e-06, + "loss": 2.8377, + "step": 50390 + }, + { + "epoch": 3.424038592200027, + "grad_norm": 6.450497627258301, + "learning_rate": 5.721735290120941e-06, + "loss": 2.9584, + "step": 50395 + }, + { + "epoch": 3.424378312270689, + "grad_norm": 6.659336090087891, + "learning_rate": 5.7213106400326135e-06, + "loss": 2.9439, + "step": 50400 + }, + { + "epoch": 3.424718032341351, + "grad_norm": 7.678518295288086, + "learning_rate": 5.7208859899442855e-06, + "loss": 2.843, + "step": 50405 + }, + { + "epoch": 3.4250577524120125, + "grad_norm": 7.074731349945068, + "learning_rate": 5.720461339855959e-06, + "loss": 2.9645, + "step": 50410 + }, + { + "epoch": 3.425397472482674, + "grad_norm": 8.167655944824219, + "learning_rate": 5.720036689767632e-06, + "loss": 2.7497, + "step": 50415 + }, + { + "epoch": 3.425737192553336, + "grad_norm": 7.110902309417725, + "learning_rate": 5.719612039679304e-06, + "loss": 2.9219, + "step": 50420 + }, + { + "epoch": 3.426076912623998, + "grad_norm": 7.584236145019531, + "learning_rate": 5.7191873895909775e-06, + "loss": 3.043, + "step": 50425 + }, + { + "epoch": 3.4264166326946595, + "grad_norm": 6.992465496063232, + "learning_rate": 5.71876273950265e-06, + "loss": 3.0298, + "step": 50430 + }, + { + "epoch": 3.4267563527653215, + "grad_norm": 6.503422737121582, + "learning_rate": 5.718338089414322e-06, + "loss": 2.7532, + "step": 50435 + }, + { + "epoch": 3.427096072835983, + "grad_norm": 8.26086711883545, + "learning_rate": 5.717913439325996e-06, + "loss": 2.7832, + "step": 50440 + }, + { + "epoch": 3.427435792906645, + "grad_norm": 5.688017845153809, + "learning_rate": 5.717488789237669e-06, + "loss": 2.8674, + "step": 50445 + }, + { + "epoch": 3.427775512977307, + "grad_norm": 7.1450958251953125, + "learning_rate": 5.717064139149341e-06, + "loss": 2.8169, + "step": 50450 + }, + { + "epoch": 3.4281152330479685, + "grad_norm": 5.645364761352539, + "learning_rate": 5.716639489061014e-06, + "loss": 2.9776, + "step": 50455 + }, + { + "epoch": 3.42845495311863, + "grad_norm": 7.23663854598999, + "learning_rate": 5.716214838972687e-06, + "loss": 3.0839, + "step": 50460 + }, + { + "epoch": 3.428794673189292, + "grad_norm": 6.674689769744873, + "learning_rate": 5.715790188884359e-06, + "loss": 2.7881, + "step": 50465 + }, + { + "epoch": 3.429134393259954, + "grad_norm": 6.90579891204834, + "learning_rate": 5.715365538796033e-06, + "loss": 2.9421, + "step": 50470 + }, + { + "epoch": 3.4294741133306155, + "grad_norm": 7.104522705078125, + "learning_rate": 5.7149408887077056e-06, + "loss": 2.793, + "step": 50475 + }, + { + "epoch": 3.429813833401277, + "grad_norm": 7.285946369171143, + "learning_rate": 5.714516238619378e-06, + "loss": 2.9845, + "step": 50480 + }, + { + "epoch": 3.430153553471939, + "grad_norm": 7.5980072021484375, + "learning_rate": 5.714091588531051e-06, + "loss": 3.1098, + "step": 50485 + }, + { + "epoch": 3.430493273542601, + "grad_norm": 5.409635543823242, + "learning_rate": 5.713666938442723e-06, + "loss": 2.991, + "step": 50490 + }, + { + "epoch": 3.4308329936132624, + "grad_norm": 9.065324783325195, + "learning_rate": 5.713242288354397e-06, + "loss": 2.7559, + "step": 50495 + }, + { + "epoch": 3.4311727136839245, + "grad_norm": 7.129666805267334, + "learning_rate": 5.7128176382660696e-06, + "loss": 3.0743, + "step": 50500 + }, + { + "epoch": 3.431512433754586, + "grad_norm": 6.214807510375977, + "learning_rate": 5.7123929881777415e-06, + "loss": 2.8977, + "step": 50505 + }, + { + "epoch": 3.4318521538252478, + "grad_norm": 6.9669108390808105, + "learning_rate": 5.711968338089415e-06, + "loss": 2.7036, + "step": 50510 + }, + { + "epoch": 3.43219187389591, + "grad_norm": 9.869786262512207, + "learning_rate": 5.711543688001088e-06, + "loss": 3.0626, + "step": 50515 + }, + { + "epoch": 3.4325315939665715, + "grad_norm": 8.224231719970703, + "learning_rate": 5.71111903791276e-06, + "loss": 2.8718, + "step": 50520 + }, + { + "epoch": 3.432871314037233, + "grad_norm": 6.915376663208008, + "learning_rate": 5.7106943878244336e-06, + "loss": 2.7165, + "step": 50525 + }, + { + "epoch": 3.433211034107895, + "grad_norm": 6.743305206298828, + "learning_rate": 5.710269737736106e-06, + "loss": 2.9095, + "step": 50530 + }, + { + "epoch": 3.433550754178557, + "grad_norm": 7.851807117462158, + "learning_rate": 5.709845087647778e-06, + "loss": 2.7773, + "step": 50535 + }, + { + "epoch": 3.4338904742492184, + "grad_norm": 8.79519271850586, + "learning_rate": 5.709420437559452e-06, + "loss": 2.8864, + "step": 50540 + }, + { + "epoch": 3.4342301943198805, + "grad_norm": 7.6261701583862305, + "learning_rate": 5.708995787471125e-06, + "loss": 3.0191, + "step": 50545 + }, + { + "epoch": 3.434569914390542, + "grad_norm": 6.986472129821777, + "learning_rate": 5.708571137382797e-06, + "loss": 3.031, + "step": 50550 + }, + { + "epoch": 3.434909634461204, + "grad_norm": 6.93757438659668, + "learning_rate": 5.70814648729447e-06, + "loss": 2.8767, + "step": 50555 + }, + { + "epoch": 3.435249354531866, + "grad_norm": 8.430168151855469, + "learning_rate": 5.707721837206142e-06, + "loss": 3.1941, + "step": 50560 + }, + { + "epoch": 3.4355890746025275, + "grad_norm": 7.477123260498047, + "learning_rate": 5.707297187117815e-06, + "loss": 2.9485, + "step": 50565 + }, + { + "epoch": 3.435928794673189, + "grad_norm": 6.189366817474365, + "learning_rate": 5.706872537029489e-06, + "loss": 2.8599, + "step": 50570 + }, + { + "epoch": 3.436268514743851, + "grad_norm": 6.171515941619873, + "learning_rate": 5.706447886941161e-06, + "loss": 2.7637, + "step": 50575 + }, + { + "epoch": 3.436608234814513, + "grad_norm": 9.011609077453613, + "learning_rate": 5.7060232368528335e-06, + "loss": 3.0039, + "step": 50580 + }, + { + "epoch": 3.4369479548851745, + "grad_norm": 9.868864059448242, + "learning_rate": 5.705598586764507e-06, + "loss": 3.1823, + "step": 50585 + }, + { + "epoch": 3.4372876749558365, + "grad_norm": 8.65869140625, + "learning_rate": 5.705173936676179e-06, + "loss": 2.9748, + "step": 50590 + }, + { + "epoch": 3.437627395026498, + "grad_norm": 8.399337768554688, + "learning_rate": 5.704749286587852e-06, + "loss": 3.1942, + "step": 50595 + }, + { + "epoch": 3.43796711509716, + "grad_norm": 6.043955326080322, + "learning_rate": 5.7043246364995256e-06, + "loss": 3.0912, + "step": 50600 + }, + { + "epoch": 3.438306835167822, + "grad_norm": 6.486403465270996, + "learning_rate": 5.7038999864111975e-06, + "loss": 3.0399, + "step": 50605 + }, + { + "epoch": 3.4386465552384835, + "grad_norm": 8.088369369506836, + "learning_rate": 5.70347533632287e-06, + "loss": 2.9069, + "step": 50610 + }, + { + "epoch": 3.438986275309145, + "grad_norm": 6.553292274475098, + "learning_rate": 5.703050686234544e-06, + "loss": 2.6689, + "step": 50615 + }, + { + "epoch": 3.439325995379807, + "grad_norm": 6.516694068908691, + "learning_rate": 5.702626036146216e-06, + "loss": 3.1006, + "step": 50620 + }, + { + "epoch": 3.439665715450469, + "grad_norm": 7.147058486938477, + "learning_rate": 5.702201386057889e-06, + "loss": 3.0311, + "step": 50625 + }, + { + "epoch": 3.4400054355211305, + "grad_norm": 7.540927410125732, + "learning_rate": 5.7017767359695615e-06, + "loss": 3.1804, + "step": 50630 + }, + { + "epoch": 3.4403451555917925, + "grad_norm": 7.480328559875488, + "learning_rate": 5.701352085881234e-06, + "loss": 2.9864, + "step": 50635 + }, + { + "epoch": 3.440684875662454, + "grad_norm": 6.567859649658203, + "learning_rate": 5.700927435792907e-06, + "loss": 3.0419, + "step": 50640 + }, + { + "epoch": 3.441024595733116, + "grad_norm": 8.476354598999023, + "learning_rate": 5.70050278570458e-06, + "loss": 3.0902, + "step": 50645 + }, + { + "epoch": 3.441364315803778, + "grad_norm": 6.569108009338379, + "learning_rate": 5.700078135616253e-06, + "loss": 2.9919, + "step": 50650 + }, + { + "epoch": 3.4417040358744395, + "grad_norm": 9.926798820495605, + "learning_rate": 5.699653485527925e-06, + "loss": 3.1416, + "step": 50655 + }, + { + "epoch": 3.442043755945101, + "grad_norm": 6.245280742645264, + "learning_rate": 5.699228835439598e-06, + "loss": 3.0166, + "step": 50660 + }, + { + "epoch": 3.442383476015763, + "grad_norm": 5.121922969818115, + "learning_rate": 5.698804185351271e-06, + "loss": 3.134, + "step": 50665 + }, + { + "epoch": 3.442723196086425, + "grad_norm": 7.788108825683594, + "learning_rate": 5.698379535262943e-06, + "loss": 3.0198, + "step": 50670 + }, + { + "epoch": 3.4430629161570865, + "grad_norm": 7.4013261795043945, + "learning_rate": 5.697954885174617e-06, + "loss": 2.9223, + "step": 50675 + }, + { + "epoch": 3.4434026362277486, + "grad_norm": 7.1011762619018555, + "learning_rate": 5.6975302350862895e-06, + "loss": 3.1319, + "step": 50680 + }, + { + "epoch": 3.44374235629841, + "grad_norm": 7.338512420654297, + "learning_rate": 5.6971055849979615e-06, + "loss": 3.0799, + "step": 50685 + }, + { + "epoch": 3.444082076369072, + "grad_norm": 9.205201148986816, + "learning_rate": 5.696680934909635e-06, + "loss": 3.196, + "step": 50690 + }, + { + "epoch": 3.444421796439734, + "grad_norm": 7.3050312995910645, + "learning_rate": 5.696256284821308e-06, + "loss": 3.0308, + "step": 50695 + }, + { + "epoch": 3.4447615165103955, + "grad_norm": 5.821048736572266, + "learning_rate": 5.69583163473298e-06, + "loss": 2.8788, + "step": 50700 + }, + { + "epoch": 3.445101236581057, + "grad_norm": 7.808674335479736, + "learning_rate": 5.6954069846446535e-06, + "loss": 2.9321, + "step": 50705 + }, + { + "epoch": 3.4454409566517192, + "grad_norm": 7.21701717376709, + "learning_rate": 5.694982334556326e-06, + "loss": 3.0156, + "step": 50710 + }, + { + "epoch": 3.445780676722381, + "grad_norm": 8.644315719604492, + "learning_rate": 5.694557684467998e-06, + "loss": 3.064, + "step": 50715 + }, + { + "epoch": 3.4461203967930425, + "grad_norm": 7.535207748413086, + "learning_rate": 5.694133034379672e-06, + "loss": 3.0861, + "step": 50720 + }, + { + "epoch": 3.446460116863704, + "grad_norm": 6.707708835601807, + "learning_rate": 5.693708384291344e-06, + "loss": 3.158, + "step": 50725 + }, + { + "epoch": 3.446799836934366, + "grad_norm": 7.826462745666504, + "learning_rate": 5.693283734203017e-06, + "loss": 3.2754, + "step": 50730 + }, + { + "epoch": 3.447139557005028, + "grad_norm": 6.748630046844482, + "learning_rate": 5.69285908411469e-06, + "loss": 2.6724, + "step": 50735 + }, + { + "epoch": 3.4474792770756895, + "grad_norm": 5.933713436126709, + "learning_rate": 5.692434434026362e-06, + "loss": 2.9855, + "step": 50740 + }, + { + "epoch": 3.4478189971463515, + "grad_norm": 6.401937484741211, + "learning_rate": 5.692009783938035e-06, + "loss": 2.6749, + "step": 50745 + }, + { + "epoch": 3.448158717217013, + "grad_norm": 7.754793167114258, + "learning_rate": 5.691585133849709e-06, + "loss": 2.9312, + "step": 50750 + }, + { + "epoch": 3.448498437287675, + "grad_norm": 6.022877216339111, + "learning_rate": 5.691160483761381e-06, + "loss": 2.9334, + "step": 50755 + }, + { + "epoch": 3.448838157358337, + "grad_norm": 7.499212265014648, + "learning_rate": 5.6907358336730535e-06, + "loss": 2.7362, + "step": 50760 + }, + { + "epoch": 3.4491778774289985, + "grad_norm": 6.709115028381348, + "learning_rate": 5.690311183584727e-06, + "loss": 2.619, + "step": 50765 + }, + { + "epoch": 3.44951759749966, + "grad_norm": 8.903124809265137, + "learning_rate": 5.689886533496399e-06, + "loss": 2.9349, + "step": 50770 + }, + { + "epoch": 3.449857317570322, + "grad_norm": 7.441792011260986, + "learning_rate": 5.689461883408072e-06, + "loss": 2.7978, + "step": 50775 + }, + { + "epoch": 3.450197037640984, + "grad_norm": 5.987461566925049, + "learning_rate": 5.6890372333197456e-06, + "loss": 2.8545, + "step": 50780 + }, + { + "epoch": 3.4505367577116455, + "grad_norm": 6.973785877227783, + "learning_rate": 5.6886125832314175e-06, + "loss": 2.8931, + "step": 50785 + }, + { + "epoch": 3.4508764777823075, + "grad_norm": 8.022148132324219, + "learning_rate": 5.68818793314309e-06, + "loss": 3.2578, + "step": 50790 + }, + { + "epoch": 3.451216197852969, + "grad_norm": 12.681746482849121, + "learning_rate": 5.687763283054764e-06, + "loss": 2.9127, + "step": 50795 + }, + { + "epoch": 3.451555917923631, + "grad_norm": 6.370044231414795, + "learning_rate": 5.687338632966436e-06, + "loss": 2.9561, + "step": 50800 + }, + { + "epoch": 3.451895637994293, + "grad_norm": 6.909450054168701, + "learning_rate": 5.686913982878109e-06, + "loss": 2.984, + "step": 50805 + }, + { + "epoch": 3.4522353580649545, + "grad_norm": 8.233694076538086, + "learning_rate": 5.6864893327897815e-06, + "loss": 2.9573, + "step": 50810 + }, + { + "epoch": 3.452575078135616, + "grad_norm": 5.267715930938721, + "learning_rate": 5.686064682701454e-06, + "loss": 3.0642, + "step": 50815 + }, + { + "epoch": 3.452914798206278, + "grad_norm": 6.616067409515381, + "learning_rate": 5.685640032613128e-06, + "loss": 2.8643, + "step": 50820 + }, + { + "epoch": 3.45325451827694, + "grad_norm": 6.170154094696045, + "learning_rate": 5.6852153825248e-06, + "loss": 3.0018, + "step": 50825 + }, + { + "epoch": 3.4535942383476015, + "grad_norm": 7.193914413452148, + "learning_rate": 5.684790732436473e-06, + "loss": 2.9619, + "step": 50830 + }, + { + "epoch": 3.453933958418263, + "grad_norm": 7.872987270355225, + "learning_rate": 5.684366082348146e-06, + "loss": 2.8024, + "step": 50835 + }, + { + "epoch": 3.454273678488925, + "grad_norm": 6.650320053100586, + "learning_rate": 5.683941432259818e-06, + "loss": 2.9014, + "step": 50840 + }, + { + "epoch": 3.454613398559587, + "grad_norm": 6.881227016448975, + "learning_rate": 5.683516782171491e-06, + "loss": 2.971, + "step": 50845 + }, + { + "epoch": 3.4549531186302485, + "grad_norm": 6.803928375244141, + "learning_rate": 5.683092132083165e-06, + "loss": 2.9964, + "step": 50850 + }, + { + "epoch": 3.4552928387009105, + "grad_norm": 8.6207275390625, + "learning_rate": 5.682667481994837e-06, + "loss": 3.1, + "step": 50855 + }, + { + "epoch": 3.455632558771572, + "grad_norm": 6.788658618927002, + "learning_rate": 5.6822428319065095e-06, + "loss": 2.8144, + "step": 50860 + }, + { + "epoch": 3.455972278842234, + "grad_norm": 8.793525695800781, + "learning_rate": 5.681818181818183e-06, + "loss": 2.9457, + "step": 50865 + }, + { + "epoch": 3.456311998912896, + "grad_norm": 7.596872329711914, + "learning_rate": 5.681393531729855e-06, + "loss": 2.9379, + "step": 50870 + }, + { + "epoch": 3.4566517189835575, + "grad_norm": 9.277633666992188, + "learning_rate": 5.680968881641528e-06, + "loss": 3.0037, + "step": 50875 + }, + { + "epoch": 3.456991439054219, + "grad_norm": 6.957672119140625, + "learning_rate": 5.680544231553201e-06, + "loss": 2.947, + "step": 50880 + }, + { + "epoch": 3.457331159124881, + "grad_norm": 8.895540237426758, + "learning_rate": 5.6801195814648735e-06, + "loss": 2.8997, + "step": 50885 + }, + { + "epoch": 3.457670879195543, + "grad_norm": 4.947597980499268, + "learning_rate": 5.679694931376546e-06, + "loss": 2.8794, + "step": 50890 + }, + { + "epoch": 3.4580105992662045, + "grad_norm": 8.724106788635254, + "learning_rate": 5.679270281288219e-06, + "loss": 2.711, + "step": 50895 + }, + { + "epoch": 3.4583503193368665, + "grad_norm": 7.277353763580322, + "learning_rate": 5.678845631199892e-06, + "loss": 3.0533, + "step": 50900 + }, + { + "epoch": 3.458690039407528, + "grad_norm": 6.847234725952148, + "learning_rate": 5.678420981111564e-06, + "loss": 2.971, + "step": 50905 + }, + { + "epoch": 3.45902975947819, + "grad_norm": 7.924529075622559, + "learning_rate": 5.6779963310232375e-06, + "loss": 3.0188, + "step": 50910 + }, + { + "epoch": 3.459369479548852, + "grad_norm": 8.033426284790039, + "learning_rate": 5.67757168093491e-06, + "loss": 3.0119, + "step": 50915 + }, + { + "epoch": 3.4597091996195135, + "grad_norm": 9.586308479309082, + "learning_rate": 5.677147030846582e-06, + "loss": 2.8512, + "step": 50920 + }, + { + "epoch": 3.460048919690175, + "grad_norm": 7.645775318145752, + "learning_rate": 5.676722380758256e-06, + "loss": 2.7717, + "step": 50925 + }, + { + "epoch": 3.460388639760837, + "grad_norm": 6.694353103637695, + "learning_rate": 5.676297730669929e-06, + "loss": 2.6608, + "step": 50930 + }, + { + "epoch": 3.460728359831499, + "grad_norm": 8.551461219787598, + "learning_rate": 5.675873080581601e-06, + "loss": 3.0254, + "step": 50935 + }, + { + "epoch": 3.4610680799021605, + "grad_norm": 7.506023406982422, + "learning_rate": 5.675448430493274e-06, + "loss": 2.9355, + "step": 50940 + }, + { + "epoch": 3.4614077999728226, + "grad_norm": 7.4383649826049805, + "learning_rate": 5.675023780404947e-06, + "loss": 3.1286, + "step": 50945 + }, + { + "epoch": 3.461747520043484, + "grad_norm": 7.405762195587158, + "learning_rate": 5.674599130316619e-06, + "loss": 3.0444, + "step": 50950 + }, + { + "epoch": 3.462087240114146, + "grad_norm": 6.860501766204834, + "learning_rate": 5.674174480228293e-06, + "loss": 2.9313, + "step": 50955 + }, + { + "epoch": 3.462426960184808, + "grad_norm": 6.74829626083374, + "learning_rate": 5.6737498301399655e-06, + "loss": 3.1052, + "step": 50960 + }, + { + "epoch": 3.4627666802554695, + "grad_norm": 8.448113441467285, + "learning_rate": 5.6733251800516375e-06, + "loss": 3.0063, + "step": 50965 + }, + { + "epoch": 3.463106400326131, + "grad_norm": 7.636466026306152, + "learning_rate": 5.672900529963311e-06, + "loss": 2.9095, + "step": 50970 + }, + { + "epoch": 3.4634461203967932, + "grad_norm": 9.201987266540527, + "learning_rate": 5.672475879874983e-06, + "loss": 3.0271, + "step": 50975 + }, + { + "epoch": 3.463785840467455, + "grad_norm": 7.80034065246582, + "learning_rate": 5.672051229786656e-06, + "loss": 2.9888, + "step": 50980 + }, + { + "epoch": 3.4641255605381165, + "grad_norm": 7.272043228149414, + "learning_rate": 5.6716265796983295e-06, + "loss": 2.894, + "step": 50985 + }, + { + "epoch": 3.4644652806087786, + "grad_norm": 7.409377574920654, + "learning_rate": 5.6712019296100015e-06, + "loss": 2.6875, + "step": 50990 + }, + { + "epoch": 3.46480500067944, + "grad_norm": 7.757415294647217, + "learning_rate": 5.670777279521674e-06, + "loss": 3.0537, + "step": 50995 + }, + { + "epoch": 3.465144720750102, + "grad_norm": 5.287522315979004, + "learning_rate": 5.670352629433348e-06, + "loss": 3.1207, + "step": 51000 + }, + { + "epoch": 3.465484440820764, + "grad_norm": 6.17368221282959, + "learning_rate": 5.66992797934502e-06, + "loss": 3.2848, + "step": 51005 + }, + { + "epoch": 3.4658241608914255, + "grad_norm": 7.43454647064209, + "learning_rate": 5.669503329256693e-06, + "loss": 2.8492, + "step": 51010 + }, + { + "epoch": 3.466163880962087, + "grad_norm": 7.342229843139648, + "learning_rate": 5.669078679168366e-06, + "loss": 2.9362, + "step": 51015 + }, + { + "epoch": 3.4665036010327492, + "grad_norm": 7.828834533691406, + "learning_rate": 5.668654029080038e-06, + "loss": 2.8685, + "step": 51020 + }, + { + "epoch": 3.466843321103411, + "grad_norm": 8.812917709350586, + "learning_rate": 5.668229378991711e-06, + "loss": 3.0223, + "step": 51025 + }, + { + "epoch": 3.4671830411740725, + "grad_norm": 6.562093734741211, + "learning_rate": 5.667804728903385e-06, + "loss": 2.9933, + "step": 51030 + }, + { + "epoch": 3.4675227612447346, + "grad_norm": 7.4861249923706055, + "learning_rate": 5.667380078815057e-06, + "loss": 3.073, + "step": 51035 + }, + { + "epoch": 3.467862481315396, + "grad_norm": 6.807706832885742, + "learning_rate": 5.6669554287267295e-06, + "loss": 3.0782, + "step": 51040 + }, + { + "epoch": 3.468202201386058, + "grad_norm": 7.0091753005981445, + "learning_rate": 5.666530778638403e-06, + "loss": 2.87, + "step": 51045 + }, + { + "epoch": 3.46854192145672, + "grad_norm": 7.278872013092041, + "learning_rate": 5.666106128550075e-06, + "loss": 3.2081, + "step": 51050 + }, + { + "epoch": 3.4688816415273815, + "grad_norm": 7.347292423248291, + "learning_rate": 5.665681478461748e-06, + "loss": 3.0557, + "step": 51055 + }, + { + "epoch": 3.469221361598043, + "grad_norm": 9.182536125183105, + "learning_rate": 5.665256828373421e-06, + "loss": 2.8951, + "step": 51060 + }, + { + "epoch": 3.469561081668705, + "grad_norm": 7.507583141326904, + "learning_rate": 5.6648321782850935e-06, + "loss": 2.9397, + "step": 51065 + }, + { + "epoch": 3.469900801739367, + "grad_norm": 7.328458309173584, + "learning_rate": 5.6644075281967655e-06, + "loss": 2.6342, + "step": 51070 + }, + { + "epoch": 3.4702405218100285, + "grad_norm": 6.914056777954102, + "learning_rate": 5.663982878108439e-06, + "loss": 2.7267, + "step": 51075 + }, + { + "epoch": 3.47058024188069, + "grad_norm": 8.482115745544434, + "learning_rate": 5.663558228020112e-06, + "loss": 2.9392, + "step": 51080 + }, + { + "epoch": 3.470919961951352, + "grad_norm": 6.560996055603027, + "learning_rate": 5.663133577931784e-06, + "loss": 2.9224, + "step": 51085 + }, + { + "epoch": 3.471259682022014, + "grad_norm": 7.956362724304199, + "learning_rate": 5.6627089278434575e-06, + "loss": 3.2074, + "step": 51090 + }, + { + "epoch": 3.4715994020926755, + "grad_norm": 5.98801851272583, + "learning_rate": 5.66228427775513e-06, + "loss": 2.8869, + "step": 51095 + }, + { + "epoch": 3.4719391221633376, + "grad_norm": 8.445929527282715, + "learning_rate": 5.661859627666802e-06, + "loss": 3.0741, + "step": 51100 + }, + { + "epoch": 3.472278842233999, + "grad_norm": 6.846676826477051, + "learning_rate": 5.661434977578476e-06, + "loss": 2.7913, + "step": 51105 + }, + { + "epoch": 3.472618562304661, + "grad_norm": 6.322475910186768, + "learning_rate": 5.661010327490149e-06, + "loss": 2.8719, + "step": 51110 + }, + { + "epoch": 3.472958282375323, + "grad_norm": 6.11098051071167, + "learning_rate": 5.660585677401821e-06, + "loss": 3.0345, + "step": 51115 + }, + { + "epoch": 3.4732980024459845, + "grad_norm": 7.158013820648193, + "learning_rate": 5.660161027313494e-06, + "loss": 3.0467, + "step": 51120 + }, + { + "epoch": 3.473637722516646, + "grad_norm": 8.295100212097168, + "learning_rate": 5.659736377225167e-06, + "loss": 3.0571, + "step": 51125 + }, + { + "epoch": 3.4739774425873082, + "grad_norm": 9.010640144348145, + "learning_rate": 5.659311727136839e-06, + "loss": 2.9927, + "step": 51130 + }, + { + "epoch": 3.47431716265797, + "grad_norm": 6.4185471534729, + "learning_rate": 5.658887077048513e-06, + "loss": 2.8654, + "step": 51135 + }, + { + "epoch": 3.4746568827286315, + "grad_norm": 8.066850662231445, + "learning_rate": 5.6584624269601855e-06, + "loss": 2.8315, + "step": 51140 + }, + { + "epoch": 3.4749966027992936, + "grad_norm": 6.578293323516846, + "learning_rate": 5.6580377768718575e-06, + "loss": 2.9894, + "step": 51145 + }, + { + "epoch": 3.475336322869955, + "grad_norm": 5.622398853302002, + "learning_rate": 5.657613126783531e-06, + "loss": 3.1024, + "step": 51150 + }, + { + "epoch": 3.475676042940617, + "grad_norm": 9.909356117248535, + "learning_rate": 5.657188476695203e-06, + "loss": 2.8775, + "step": 51155 + }, + { + "epoch": 3.4760157630112785, + "grad_norm": 6.475826263427734, + "learning_rate": 5.656763826606877e-06, + "loss": 2.9449, + "step": 51160 + }, + { + "epoch": 3.4763554830819405, + "grad_norm": 7.18269157409668, + "learning_rate": 5.6563391765185495e-06, + "loss": 2.8917, + "step": 51165 + }, + { + "epoch": 3.476695203152602, + "grad_norm": 6.841857433319092, + "learning_rate": 5.6559145264302215e-06, + "loss": 2.781, + "step": 51170 + }, + { + "epoch": 3.477034923223264, + "grad_norm": 7.225924491882324, + "learning_rate": 5.655489876341895e-06, + "loss": 2.8331, + "step": 51175 + }, + { + "epoch": 3.477374643293926, + "grad_norm": 8.844172477722168, + "learning_rate": 5.655065226253568e-06, + "loss": 3.0919, + "step": 51180 + }, + { + "epoch": 3.4777143633645875, + "grad_norm": 7.902665615081787, + "learning_rate": 5.65464057616524e-06, + "loss": 3.2161, + "step": 51185 + }, + { + "epoch": 3.478054083435249, + "grad_norm": 5.195366859436035, + "learning_rate": 5.6542159260769135e-06, + "loss": 2.9685, + "step": 51190 + }, + { + "epoch": 3.478393803505911, + "grad_norm": 6.7704081535339355, + "learning_rate": 5.653791275988586e-06, + "loss": 3.0352, + "step": 51195 + }, + { + "epoch": 3.478733523576573, + "grad_norm": 5.645393371582031, + "learning_rate": 5.653366625900258e-06, + "loss": 2.7514, + "step": 51200 + }, + { + "epoch": 3.4790732436472345, + "grad_norm": 6.402341842651367, + "learning_rate": 5.652941975811932e-06, + "loss": 2.8402, + "step": 51205 + }, + { + "epoch": 3.4794129637178965, + "grad_norm": 8.287803649902344, + "learning_rate": 5.652517325723605e-06, + "loss": 2.6951, + "step": 51210 + }, + { + "epoch": 3.479752683788558, + "grad_norm": 7.855546951293945, + "learning_rate": 5.652092675635277e-06, + "loss": 3.0752, + "step": 51215 + }, + { + "epoch": 3.48009240385922, + "grad_norm": 8.795680046081543, + "learning_rate": 5.65166802554695e-06, + "loss": 2.8132, + "step": 51220 + }, + { + "epoch": 3.480432123929882, + "grad_norm": 7.707067489624023, + "learning_rate": 5.651243375458622e-06, + "loss": 2.9629, + "step": 51225 + }, + { + "epoch": 3.4807718440005435, + "grad_norm": 10.000994682312012, + "learning_rate": 5.650818725370295e-06, + "loss": 3.061, + "step": 51230 + }, + { + "epoch": 3.481111564071205, + "grad_norm": 7.984265327453613, + "learning_rate": 5.650394075281969e-06, + "loss": 3.0825, + "step": 51235 + }, + { + "epoch": 3.481451284141867, + "grad_norm": 7.0830793380737305, + "learning_rate": 5.649969425193641e-06, + "loss": 2.8589, + "step": 51240 + }, + { + "epoch": 3.481791004212529, + "grad_norm": 7.395538330078125, + "learning_rate": 5.6495447751053135e-06, + "loss": 2.6863, + "step": 51245 + }, + { + "epoch": 3.4821307242831905, + "grad_norm": 7.697061538696289, + "learning_rate": 5.649120125016987e-06, + "loss": 2.8173, + "step": 51250 + }, + { + "epoch": 3.4824704443538526, + "grad_norm": 6.994256019592285, + "learning_rate": 5.648695474928659e-06, + "loss": 3.161, + "step": 51255 + }, + { + "epoch": 3.482810164424514, + "grad_norm": 8.656325340270996, + "learning_rate": 5.648270824840332e-06, + "loss": 2.9052, + "step": 51260 + }, + { + "epoch": 3.483149884495176, + "grad_norm": 6.443594932556152, + "learning_rate": 5.6478461747520055e-06, + "loss": 2.976, + "step": 51265 + }, + { + "epoch": 3.483489604565838, + "grad_norm": 5.630237579345703, + "learning_rate": 5.6474215246636775e-06, + "loss": 3.0316, + "step": 51270 + }, + { + "epoch": 3.4838293246364995, + "grad_norm": 7.499780178070068, + "learning_rate": 5.64699687457535e-06, + "loss": 2.7613, + "step": 51275 + }, + { + "epoch": 3.484169044707161, + "grad_norm": 6.211853981018066, + "learning_rate": 5.646572224487024e-06, + "loss": 3.0744, + "step": 51280 + }, + { + "epoch": 3.4845087647778232, + "grad_norm": 6.327541828155518, + "learning_rate": 5.646147574398696e-06, + "loss": 2.8815, + "step": 51285 + }, + { + "epoch": 3.484848484848485, + "grad_norm": 8.625079154968262, + "learning_rate": 5.645722924310369e-06, + "loss": 2.892, + "step": 51290 + }, + { + "epoch": 3.4851882049191465, + "grad_norm": 7.318621635437012, + "learning_rate": 5.6452982742220415e-06, + "loss": 2.9903, + "step": 51295 + }, + { + "epoch": 3.4855279249898086, + "grad_norm": 6.715267181396484, + "learning_rate": 5.644873624133714e-06, + "loss": 2.8482, + "step": 51300 + }, + { + "epoch": 3.48586764506047, + "grad_norm": 6.386991024017334, + "learning_rate": 5.644448974045387e-06, + "loss": 3.2011, + "step": 51305 + }, + { + "epoch": 3.486207365131132, + "grad_norm": 7.240581512451172, + "learning_rate": 5.64402432395706e-06, + "loss": 2.8347, + "step": 51310 + }, + { + "epoch": 3.486547085201794, + "grad_norm": 7.406033515930176, + "learning_rate": 5.643599673868733e-06, + "loss": 3.0597, + "step": 51315 + }, + { + "epoch": 3.4868868052724555, + "grad_norm": 6.836404323577881, + "learning_rate": 5.643175023780405e-06, + "loss": 2.7128, + "step": 51320 + }, + { + "epoch": 3.487226525343117, + "grad_norm": 8.202628135681152, + "learning_rate": 5.642750373692078e-06, + "loss": 3.0167, + "step": 51325 + }, + { + "epoch": 3.4875662454137792, + "grad_norm": 8.051453590393066, + "learning_rate": 5.642325723603751e-06, + "loss": 3.1981, + "step": 51330 + }, + { + "epoch": 3.487905965484441, + "grad_norm": 7.096549034118652, + "learning_rate": 5.641901073515423e-06, + "loss": 3.1242, + "step": 51335 + }, + { + "epoch": 3.4882456855551025, + "grad_norm": 6.653530120849609, + "learning_rate": 5.641476423427097e-06, + "loss": 2.9658, + "step": 51340 + }, + { + "epoch": 3.4885854056257646, + "grad_norm": 6.401042938232422, + "learning_rate": 5.6410517733387695e-06, + "loss": 3.1358, + "step": 51345 + }, + { + "epoch": 3.488925125696426, + "grad_norm": 6.830475807189941, + "learning_rate": 5.6406271232504415e-06, + "loss": 3.0001, + "step": 51350 + }, + { + "epoch": 3.489264845767088, + "grad_norm": 5.718338966369629, + "learning_rate": 5.640202473162115e-06, + "loss": 2.8563, + "step": 51355 + }, + { + "epoch": 3.48960456583775, + "grad_norm": 4.961582183837891, + "learning_rate": 5.639777823073788e-06, + "loss": 2.6833, + "step": 51360 + }, + { + "epoch": 3.4899442859084115, + "grad_norm": 7.622646808624268, + "learning_rate": 5.63935317298546e-06, + "loss": 2.9523, + "step": 51365 + }, + { + "epoch": 3.490284005979073, + "grad_norm": 7.75595760345459, + "learning_rate": 5.6389285228971335e-06, + "loss": 3.0227, + "step": 51370 + }, + { + "epoch": 3.4906237260497353, + "grad_norm": 6.98682975769043, + "learning_rate": 5.638503872808806e-06, + "loss": 3.1196, + "step": 51375 + }, + { + "epoch": 3.490963446120397, + "grad_norm": 6.382034778594971, + "learning_rate": 5.638079222720478e-06, + "loss": 2.864, + "step": 51380 + }, + { + "epoch": 3.4913031661910585, + "grad_norm": 6.8025102615356445, + "learning_rate": 5.637654572632152e-06, + "loss": 2.927, + "step": 51385 + }, + { + "epoch": 3.4916428862617206, + "grad_norm": 7.881445407867432, + "learning_rate": 5.637229922543824e-06, + "loss": 2.7626, + "step": 51390 + }, + { + "epoch": 3.491982606332382, + "grad_norm": 6.181014060974121, + "learning_rate": 5.636805272455497e-06, + "loss": 3.0035, + "step": 51395 + }, + { + "epoch": 3.492322326403044, + "grad_norm": 7.871237754821777, + "learning_rate": 5.63638062236717e-06, + "loss": 3.0009, + "step": 51400 + }, + { + "epoch": 3.4926620464737055, + "grad_norm": 7.158176422119141, + "learning_rate": 5.635955972278842e-06, + "loss": 2.8637, + "step": 51405 + }, + { + "epoch": 3.4930017665443676, + "grad_norm": 9.148155212402344, + "learning_rate": 5.635531322190515e-06, + "loss": 3.0778, + "step": 51410 + }, + { + "epoch": 3.493341486615029, + "grad_norm": 7.409608840942383, + "learning_rate": 5.635106672102189e-06, + "loss": 2.8518, + "step": 51415 + }, + { + "epoch": 3.493681206685691, + "grad_norm": 7.58951473236084, + "learning_rate": 5.634682022013861e-06, + "loss": 3.4421, + "step": 51420 + }, + { + "epoch": 3.494020926756353, + "grad_norm": 6.1600117683410645, + "learning_rate": 5.6342573719255335e-06, + "loss": 2.9178, + "step": 51425 + }, + { + "epoch": 3.4943606468270145, + "grad_norm": 8.152284622192383, + "learning_rate": 5.633832721837207e-06, + "loss": 3.0043, + "step": 51430 + }, + { + "epoch": 3.494700366897676, + "grad_norm": 5.594325065612793, + "learning_rate": 5.633408071748879e-06, + "loss": 3.1356, + "step": 51435 + }, + { + "epoch": 3.4950400869683382, + "grad_norm": 6.380620002746582, + "learning_rate": 5.632983421660552e-06, + "loss": 2.7621, + "step": 51440 + }, + { + "epoch": 3.495379807039, + "grad_norm": 6.594417572021484, + "learning_rate": 5.6325587715722255e-06, + "loss": 2.8556, + "step": 51445 + }, + { + "epoch": 3.4957195271096615, + "grad_norm": 6.704120635986328, + "learning_rate": 5.6321341214838975e-06, + "loss": 2.8403, + "step": 51450 + }, + { + "epoch": 3.4960592471803236, + "grad_norm": 9.88803768157959, + "learning_rate": 5.63170947139557e-06, + "loss": 2.9382, + "step": 51455 + }, + { + "epoch": 3.496398967250985, + "grad_norm": 6.64229679107666, + "learning_rate": 5.631284821307244e-06, + "loss": 2.992, + "step": 51460 + }, + { + "epoch": 3.496738687321647, + "grad_norm": 6.773732662200928, + "learning_rate": 5.630860171218916e-06, + "loss": 2.859, + "step": 51465 + }, + { + "epoch": 3.497078407392309, + "grad_norm": 7.812883377075195, + "learning_rate": 5.630435521130589e-06, + "loss": 2.9438, + "step": 51470 + }, + { + "epoch": 3.4974181274629705, + "grad_norm": 7.420592784881592, + "learning_rate": 5.6300108710422615e-06, + "loss": 2.7497, + "step": 51475 + }, + { + "epoch": 3.497757847533632, + "grad_norm": 6.7519941329956055, + "learning_rate": 5.629586220953934e-06, + "loss": 3.076, + "step": 51480 + }, + { + "epoch": 3.4980975676042942, + "grad_norm": 6.474454879760742, + "learning_rate": 5.629161570865607e-06, + "loss": 2.7619, + "step": 51485 + }, + { + "epoch": 3.498437287674956, + "grad_norm": 6.524909019470215, + "learning_rate": 5.62873692077728e-06, + "loss": 2.9551, + "step": 51490 + }, + { + "epoch": 3.4987770077456175, + "grad_norm": 7.018847942352295, + "learning_rate": 5.628312270688953e-06, + "loss": 2.9859, + "step": 51495 + }, + { + "epoch": 3.499116727816279, + "grad_norm": 6.288427829742432, + "learning_rate": 5.627887620600626e-06, + "loss": 2.8825, + "step": 51500 + }, + { + "epoch": 3.499456447886941, + "grad_norm": 9.86685848236084, + "learning_rate": 5.627462970512298e-06, + "loss": 3.0282, + "step": 51505 + }, + { + "epoch": 3.499796167957603, + "grad_norm": 6.295832633972168, + "learning_rate": 5.627038320423971e-06, + "loss": 3.0407, + "step": 51510 + }, + { + "epoch": 3.5001358880282645, + "grad_norm": 8.043557167053223, + "learning_rate": 5.626613670335645e-06, + "loss": 2.8201, + "step": 51515 + }, + { + "epoch": 3.5004756080989265, + "grad_norm": 7.676563739776611, + "learning_rate": 5.626189020247317e-06, + "loss": 2.763, + "step": 51520 + }, + { + "epoch": 3.500815328169588, + "grad_norm": 7.098781108856201, + "learning_rate": 5.6257643701589895e-06, + "loss": 2.8922, + "step": 51525 + }, + { + "epoch": 3.50115504824025, + "grad_norm": 7.011512756347656, + "learning_rate": 5.625339720070663e-06, + "loss": 2.9527, + "step": 51530 + }, + { + "epoch": 3.501494768310912, + "grad_norm": 6.281407356262207, + "learning_rate": 5.624915069982335e-06, + "loss": 2.8112, + "step": 51535 + }, + { + "epoch": 3.5018344883815735, + "grad_norm": 6.686036109924316, + "learning_rate": 5.624490419894008e-06, + "loss": 2.921, + "step": 51540 + }, + { + "epoch": 3.502174208452235, + "grad_norm": 7.397223472595215, + "learning_rate": 5.624065769805681e-06, + "loss": 2.7768, + "step": 51545 + }, + { + "epoch": 3.5025139285228972, + "grad_norm": 7.637008190155029, + "learning_rate": 5.6236411197173535e-06, + "loss": 3.0574, + "step": 51550 + }, + { + "epoch": 3.502853648593559, + "grad_norm": 6.992290496826172, + "learning_rate": 5.623216469629026e-06, + "loss": 2.8314, + "step": 51555 + }, + { + "epoch": 3.5031933686642205, + "grad_norm": 6.350825309753418, + "learning_rate": 5.622791819540699e-06, + "loss": 2.7911, + "step": 51560 + }, + { + "epoch": 3.5035330887348826, + "grad_norm": 7.226481914520264, + "learning_rate": 5.622367169452372e-06, + "loss": 3.0872, + "step": 51565 + }, + { + "epoch": 3.503872808805544, + "grad_norm": 6.5205078125, + "learning_rate": 5.621942519364044e-06, + "loss": 2.722, + "step": 51570 + }, + { + "epoch": 3.504212528876206, + "grad_norm": 8.556228637695312, + "learning_rate": 5.6215178692757175e-06, + "loss": 3.1409, + "step": 51575 + }, + { + "epoch": 3.504552248946868, + "grad_norm": 7.851258754730225, + "learning_rate": 5.62109321918739e-06, + "loss": 2.9379, + "step": 51580 + }, + { + "epoch": 3.5048919690175295, + "grad_norm": 6.232591152191162, + "learning_rate": 5.620668569099062e-06, + "loss": 2.9277, + "step": 51585 + }, + { + "epoch": 3.505231689088191, + "grad_norm": 6.366501331329346, + "learning_rate": 5.620243919010736e-06, + "loss": 2.7796, + "step": 51590 + }, + { + "epoch": 3.5055714091588532, + "grad_norm": 8.888642311096191, + "learning_rate": 5.619819268922409e-06, + "loss": 3.1851, + "step": 51595 + }, + { + "epoch": 3.505911129229515, + "grad_norm": 7.30031681060791, + "learning_rate": 5.619394618834081e-06, + "loss": 2.6099, + "step": 51600 + }, + { + "epoch": 3.5062508493001765, + "grad_norm": 5.514179706573486, + "learning_rate": 5.618969968745754e-06, + "loss": 2.6263, + "step": 51605 + }, + { + "epoch": 3.5065905693708386, + "grad_norm": 6.956149578094482, + "learning_rate": 5.618545318657427e-06, + "loss": 2.719, + "step": 51610 + }, + { + "epoch": 3.5069302894415, + "grad_norm": 5.472037315368652, + "learning_rate": 5.618120668569099e-06, + "loss": 2.5811, + "step": 51615 + }, + { + "epoch": 3.507270009512162, + "grad_norm": 6.323130130767822, + "learning_rate": 5.617696018480773e-06, + "loss": 3.0905, + "step": 51620 + }, + { + "epoch": 3.507609729582824, + "grad_norm": 6.960794448852539, + "learning_rate": 5.6172713683924455e-06, + "loss": 3.1909, + "step": 51625 + }, + { + "epoch": 3.5079494496534855, + "grad_norm": 8.498973846435547, + "learning_rate": 5.6168467183041175e-06, + "loss": 3.0618, + "step": 51630 + }, + { + "epoch": 3.508289169724147, + "grad_norm": 6.812423229217529, + "learning_rate": 5.616422068215791e-06, + "loss": 2.8403, + "step": 51635 + }, + { + "epoch": 3.5086288897948092, + "grad_norm": 9.303580284118652, + "learning_rate": 5.615997418127463e-06, + "loss": 3.0884, + "step": 51640 + }, + { + "epoch": 3.508968609865471, + "grad_norm": 8.055952072143555, + "learning_rate": 5.615572768039136e-06, + "loss": 2.8963, + "step": 51645 + }, + { + "epoch": 3.5093083299361325, + "grad_norm": 7.894192218780518, + "learning_rate": 5.6151481179508095e-06, + "loss": 3.1269, + "step": 51650 + }, + { + "epoch": 3.5096480500067946, + "grad_norm": 6.910122394561768, + "learning_rate": 5.6147234678624815e-06, + "loss": 2.8746, + "step": 51655 + }, + { + "epoch": 3.509987770077456, + "grad_norm": 6.993963241577148, + "learning_rate": 5.614298817774154e-06, + "loss": 2.8583, + "step": 51660 + }, + { + "epoch": 3.510327490148118, + "grad_norm": 5.539280414581299, + "learning_rate": 5.613874167685828e-06, + "loss": 2.7674, + "step": 51665 + }, + { + "epoch": 3.51066721021878, + "grad_norm": 6.409377098083496, + "learning_rate": 5.6134495175975e-06, + "loss": 3.0491, + "step": 51670 + }, + { + "epoch": 3.5110069302894416, + "grad_norm": 6.90503454208374, + "learning_rate": 5.613024867509173e-06, + "loss": 3.0357, + "step": 51675 + }, + { + "epoch": 3.511346650360103, + "grad_norm": 6.685184955596924, + "learning_rate": 5.612600217420846e-06, + "loss": 2.894, + "step": 51680 + }, + { + "epoch": 3.5116863704307653, + "grad_norm": 5.901299476623535, + "learning_rate": 5.612175567332518e-06, + "loss": 2.9399, + "step": 51685 + }, + { + "epoch": 3.512026090501427, + "grad_norm": 6.937253475189209, + "learning_rate": 5.611750917244191e-06, + "loss": 2.7583, + "step": 51690 + }, + { + "epoch": 3.5123658105720885, + "grad_norm": 6.054793357849121, + "learning_rate": 5.611326267155865e-06, + "loss": 3.0441, + "step": 51695 + }, + { + "epoch": 3.5127055306427506, + "grad_norm": 8.653225898742676, + "learning_rate": 5.610901617067537e-06, + "loss": 3.0986, + "step": 51700 + }, + { + "epoch": 3.5130452507134122, + "grad_norm": 7.850712299346924, + "learning_rate": 5.6104769669792095e-06, + "loss": 2.7461, + "step": 51705 + }, + { + "epoch": 3.513384970784074, + "grad_norm": 6.166309356689453, + "learning_rate": 5.610052316890883e-06, + "loss": 3.119, + "step": 51710 + }, + { + "epoch": 3.513724690854736, + "grad_norm": 6.895243167877197, + "learning_rate": 5.609627666802555e-06, + "loss": 2.6667, + "step": 51715 + }, + { + "epoch": 3.5140644109253976, + "grad_norm": 8.172143936157227, + "learning_rate": 5.609203016714228e-06, + "loss": 2.955, + "step": 51720 + }, + { + "epoch": 3.514404130996059, + "grad_norm": 6.3892083168029785, + "learning_rate": 5.608778366625901e-06, + "loss": 2.8027, + "step": 51725 + }, + { + "epoch": 3.5147438510667213, + "grad_norm": 8.644740104675293, + "learning_rate": 5.6083537165375735e-06, + "loss": 2.7063, + "step": 51730 + }, + { + "epoch": 3.515083571137383, + "grad_norm": 6.253964424133301, + "learning_rate": 5.607929066449245e-06, + "loss": 2.895, + "step": 51735 + }, + { + "epoch": 3.5154232912080445, + "grad_norm": 8.161255836486816, + "learning_rate": 5.607504416360919e-06, + "loss": 2.9965, + "step": 51740 + }, + { + "epoch": 3.5157630112787066, + "grad_norm": 9.95002555847168, + "learning_rate": 5.607079766272592e-06, + "loss": 3.0686, + "step": 51745 + }, + { + "epoch": 3.5161027313493682, + "grad_norm": 7.66132116317749, + "learning_rate": 5.606655116184264e-06, + "loss": 3.2069, + "step": 51750 + }, + { + "epoch": 3.51644245142003, + "grad_norm": 6.591899394989014, + "learning_rate": 5.6062304660959375e-06, + "loss": 2.9427, + "step": 51755 + }, + { + "epoch": 3.516782171490692, + "grad_norm": 7.424413204193115, + "learning_rate": 5.60580581600761e-06, + "loss": 2.9275, + "step": 51760 + }, + { + "epoch": 3.5171218915613536, + "grad_norm": 6.71150541305542, + "learning_rate": 5.605381165919282e-06, + "loss": 2.8212, + "step": 51765 + }, + { + "epoch": 3.517461611632015, + "grad_norm": 8.584774017333984, + "learning_rate": 5.604956515830956e-06, + "loss": 2.9459, + "step": 51770 + }, + { + "epoch": 3.517801331702677, + "grad_norm": 8.27781867980957, + "learning_rate": 5.604531865742629e-06, + "loss": 2.9413, + "step": 51775 + }, + { + "epoch": 3.518141051773339, + "grad_norm": 7.803736209869385, + "learning_rate": 5.604107215654301e-06, + "loss": 2.9399, + "step": 51780 + }, + { + "epoch": 3.5184807718440005, + "grad_norm": 8.571208000183105, + "learning_rate": 5.603682565565974e-06, + "loss": 2.8685, + "step": 51785 + }, + { + "epoch": 3.518820491914662, + "grad_norm": 7.1866960525512695, + "learning_rate": 5.603257915477647e-06, + "loss": 3.1066, + "step": 51790 + }, + { + "epoch": 3.5191602119853242, + "grad_norm": 8.908540725708008, + "learning_rate": 5.602833265389319e-06, + "loss": 2.8797, + "step": 51795 + }, + { + "epoch": 3.519499932055986, + "grad_norm": 7.17578125, + "learning_rate": 5.602408615300993e-06, + "loss": 2.7825, + "step": 51800 + }, + { + "epoch": 3.5198396521266475, + "grad_norm": 7.099145889282227, + "learning_rate": 5.6019839652126655e-06, + "loss": 3.0907, + "step": 51805 + }, + { + "epoch": 3.520179372197309, + "grad_norm": 5.910643100738525, + "learning_rate": 5.6015593151243374e-06, + "loss": 3.1362, + "step": 51810 + }, + { + "epoch": 3.520519092267971, + "grad_norm": 7.949717998504639, + "learning_rate": 5.601134665036011e-06, + "loss": 2.9813, + "step": 51815 + }, + { + "epoch": 3.520858812338633, + "grad_norm": 7.135042190551758, + "learning_rate": 5.600710014947683e-06, + "loss": 2.8004, + "step": 51820 + }, + { + "epoch": 3.5211985324092945, + "grad_norm": 6.711698055267334, + "learning_rate": 5.600285364859356e-06, + "loss": 2.8356, + "step": 51825 + }, + { + "epoch": 3.5215382524799566, + "grad_norm": 7.6796979904174805, + "learning_rate": 5.5998607147710295e-06, + "loss": 2.9569, + "step": 51830 + }, + { + "epoch": 3.521877972550618, + "grad_norm": 6.28026008605957, + "learning_rate": 5.5994360646827014e-06, + "loss": 3.1509, + "step": 51835 + }, + { + "epoch": 3.52221769262128, + "grad_norm": Infinity, + "learning_rate": 5.59909634461204e-06, + "loss": 2.8315, + "step": 51840 + }, + { + "epoch": 3.522557412691942, + "grad_norm": 7.909867763519287, + "learning_rate": 5.598671694523713e-06, + "loss": 2.9081, + "step": 51845 + }, + { + "epoch": 3.5228971327626035, + "grad_norm": 8.709237098693848, + "learning_rate": 5.598247044435385e-06, + "loss": 3.0204, + "step": 51850 + }, + { + "epoch": 3.523236852833265, + "grad_norm": 7.249949932098389, + "learning_rate": 5.597822394347059e-06, + "loss": 3.0023, + "step": 51855 + }, + { + "epoch": 3.5235765729039272, + "grad_norm": 7.707732200622559, + "learning_rate": 5.5973977442587316e-06, + "loss": 3.0712, + "step": 51860 + }, + { + "epoch": 3.523916292974589, + "grad_norm": 6.1641011238098145, + "learning_rate": 5.5969730941704035e-06, + "loss": 2.8826, + "step": 51865 + }, + { + "epoch": 3.5242560130452505, + "grad_norm": 7.103036403656006, + "learning_rate": 5.596548444082077e-06, + "loss": 2.8572, + "step": 51870 + }, + { + "epoch": 3.5245957331159126, + "grad_norm": 6.433956146240234, + "learning_rate": 5.59612379399375e-06, + "loss": 3.0773, + "step": 51875 + }, + { + "epoch": 3.524935453186574, + "grad_norm": 7.399594783782959, + "learning_rate": 5.595699143905422e-06, + "loss": 2.7686, + "step": 51880 + }, + { + "epoch": 3.525275173257236, + "grad_norm": 7.043419361114502, + "learning_rate": 5.5952744938170956e-06, + "loss": 2.6426, + "step": 51885 + }, + { + "epoch": 3.525614893327898, + "grad_norm": 5.548206329345703, + "learning_rate": 5.5948498437287675e-06, + "loss": 3.0295, + "step": 51890 + }, + { + "epoch": 3.5259546133985595, + "grad_norm": 6.894670009613037, + "learning_rate": 5.59442519364044e-06, + "loss": 2.9623, + "step": 51895 + }, + { + "epoch": 3.526294333469221, + "grad_norm": 6.6057538986206055, + "learning_rate": 5.594000543552114e-06, + "loss": 2.9634, + "step": 51900 + }, + { + "epoch": 3.5266340535398832, + "grad_norm": 7.36745548248291, + "learning_rate": 5.593575893463786e-06, + "loss": 2.9877, + "step": 51905 + }, + { + "epoch": 3.526973773610545, + "grad_norm": 8.188756942749023, + "learning_rate": 5.593151243375459e-06, + "loss": 3.0488, + "step": 51910 + }, + { + "epoch": 3.5273134936812065, + "grad_norm": 8.986541748046875, + "learning_rate": 5.592726593287132e-06, + "loss": 3.0939, + "step": 51915 + }, + { + "epoch": 3.5276532137518686, + "grad_norm": 6.519941806793213, + "learning_rate": 5.592301943198804e-06, + "loss": 2.6764, + "step": 51920 + }, + { + "epoch": 3.52799293382253, + "grad_norm": 8.25126838684082, + "learning_rate": 5.591877293110477e-06, + "loss": 3.0724, + "step": 51925 + }, + { + "epoch": 3.528332653893192, + "grad_norm": 5.7604899406433105, + "learning_rate": 5.591452643022151e-06, + "loss": 3.0494, + "step": 51930 + }, + { + "epoch": 3.528672373963854, + "grad_norm": 6.174117088317871, + "learning_rate": 5.591027992933823e-06, + "loss": 2.766, + "step": 51935 + }, + { + "epoch": 3.5290120940345155, + "grad_norm": 9.547270774841309, + "learning_rate": 5.5906033428454955e-06, + "loss": 2.9943, + "step": 51940 + }, + { + "epoch": 3.529351814105177, + "grad_norm": 6.614893436431885, + "learning_rate": 5.590178692757169e-06, + "loss": 2.8297, + "step": 51945 + }, + { + "epoch": 3.5296915341758393, + "grad_norm": 7.839678764343262, + "learning_rate": 5.589754042668841e-06, + "loss": 2.7721, + "step": 51950 + }, + { + "epoch": 3.530031254246501, + "grad_norm": 7.096536636352539, + "learning_rate": 5.589329392580514e-06, + "loss": 2.866, + "step": 51955 + }, + { + "epoch": 3.5303709743171625, + "grad_norm": 7.9682936668396, + "learning_rate": 5.5889047424921876e-06, + "loss": 2.9847, + "step": 51960 + }, + { + "epoch": 3.5307106943878246, + "grad_norm": 7.5204243659973145, + "learning_rate": 5.5884800924038595e-06, + "loss": 2.8576, + "step": 51965 + }, + { + "epoch": 3.531050414458486, + "grad_norm": 6.254872798919678, + "learning_rate": 5.588055442315532e-06, + "loss": 3.1399, + "step": 51970 + }, + { + "epoch": 3.531390134529148, + "grad_norm": 7.331809043884277, + "learning_rate": 5.587630792227205e-06, + "loss": 2.5667, + "step": 51975 + }, + { + "epoch": 3.53172985459981, + "grad_norm": 6.9817094802856445, + "learning_rate": 5.587206142138878e-06, + "loss": 2.7103, + "step": 51980 + }, + { + "epoch": 3.5320695746704716, + "grad_norm": 7.712571620941162, + "learning_rate": 5.586781492050551e-06, + "loss": 3.0629, + "step": 51985 + }, + { + "epoch": 3.532409294741133, + "grad_norm": 7.1175947189331055, + "learning_rate": 5.5863568419622235e-06, + "loss": 2.9539, + "step": 51990 + }, + { + "epoch": 3.5327490148117953, + "grad_norm": 9.202729225158691, + "learning_rate": 5.585932191873896e-06, + "loss": 2.9841, + "step": 51995 + }, + { + "epoch": 3.533088734882457, + "grad_norm": 8.572283744812012, + "learning_rate": 5.585507541785568e-06, + "loss": 3.0013, + "step": 52000 + }, + { + "epoch": 3.5334284549531185, + "grad_norm": 6.911673545837402, + "learning_rate": 5.585082891697242e-06, + "loss": 3.1355, + "step": 52005 + }, + { + "epoch": 3.5337681750237806, + "grad_norm": 6.359920978546143, + "learning_rate": 5.584658241608915e-06, + "loss": 2.7382, + "step": 52010 + }, + { + "epoch": 3.5341078950944422, + "grad_norm": 9.156593322753906, + "learning_rate": 5.584233591520587e-06, + "loss": 3.0165, + "step": 52015 + }, + { + "epoch": 3.534447615165104, + "grad_norm": 7.453369617462158, + "learning_rate": 5.58380894143226e-06, + "loss": 2.8194, + "step": 52020 + }, + { + "epoch": 3.534787335235766, + "grad_norm": 5.554835319519043, + "learning_rate": 5.583384291343933e-06, + "loss": 3.0195, + "step": 52025 + }, + { + "epoch": 3.5351270553064276, + "grad_norm": 6.87845516204834, + "learning_rate": 5.582959641255605e-06, + "loss": 3.1395, + "step": 52030 + }, + { + "epoch": 3.535466775377089, + "grad_norm": 8.703362464904785, + "learning_rate": 5.582534991167279e-06, + "loss": 3.0976, + "step": 52035 + }, + { + "epoch": 3.5358064954477513, + "grad_norm": 9.419074058532715, + "learning_rate": 5.5821103410789515e-06, + "loss": 2.8506, + "step": 52040 + }, + { + "epoch": 3.536146215518413, + "grad_norm": 7.226834774017334, + "learning_rate": 5.5816856909906235e-06, + "loss": 2.9052, + "step": 52045 + }, + { + "epoch": 3.5364859355890745, + "grad_norm": 7.486923694610596, + "learning_rate": 5.581261040902297e-06, + "loss": 3.0384, + "step": 52050 + }, + { + "epoch": 3.5368256556597366, + "grad_norm": 7.769999027252197, + "learning_rate": 5.58083639081397e-06, + "loss": 3.1251, + "step": 52055 + }, + { + "epoch": 3.5371653757303982, + "grad_norm": 7.940886974334717, + "learning_rate": 5.580411740725643e-06, + "loss": 2.9237, + "step": 52060 + }, + { + "epoch": 3.53750509580106, + "grad_norm": 7.309014320373535, + "learning_rate": 5.5799870906373155e-06, + "loss": 2.8056, + "step": 52065 + }, + { + "epoch": 3.537844815871722, + "grad_norm": 8.47529411315918, + "learning_rate": 5.5795624405489875e-06, + "loss": 2.862, + "step": 52070 + }, + { + "epoch": 3.5381845359423836, + "grad_norm": 7.111324310302734, + "learning_rate": 5.579137790460661e-06, + "loss": 2.9767, + "step": 52075 + }, + { + "epoch": 3.538524256013045, + "grad_norm": 7.4279465675354, + "learning_rate": 5.578713140372334e-06, + "loss": 3.0632, + "step": 52080 + }, + { + "epoch": 3.5388639760837073, + "grad_norm": 8.62515640258789, + "learning_rate": 5.578288490284006e-06, + "loss": 2.7791, + "step": 52085 + }, + { + "epoch": 3.539203696154369, + "grad_norm": 8.160459518432617, + "learning_rate": 5.5778638401956795e-06, + "loss": 2.8868, + "step": 52090 + }, + { + "epoch": 3.5395434162250305, + "grad_norm": 7.4001922607421875, + "learning_rate": 5.577439190107352e-06, + "loss": 3.1798, + "step": 52095 + }, + { + "epoch": 3.5398831362956926, + "grad_norm": 6.2638397216796875, + "learning_rate": 5.577014540019024e-06, + "loss": 2.9021, + "step": 52100 + }, + { + "epoch": 3.5402228563663543, + "grad_norm": 5.113304615020752, + "learning_rate": 5.576589889930698e-06, + "loss": 3.0259, + "step": 52105 + }, + { + "epoch": 3.540562576437016, + "grad_norm": 5.590500354766846, + "learning_rate": 5.576165239842371e-06, + "loss": 2.731, + "step": 52110 + }, + { + "epoch": 3.5409022965076775, + "grad_norm": 7.100639820098877, + "learning_rate": 5.575740589754043e-06, + "loss": 3.097, + "step": 52115 + }, + { + "epoch": 3.5412420165783396, + "grad_norm": 7.800697326660156, + "learning_rate": 5.575315939665716e-06, + "loss": 2.8734, + "step": 52120 + }, + { + "epoch": 3.541581736649001, + "grad_norm": 6.549933910369873, + "learning_rate": 5.574891289577389e-06, + "loss": 2.8906, + "step": 52125 + }, + { + "epoch": 3.541921456719663, + "grad_norm": 7.916582107543945, + "learning_rate": 5.574466639489061e-06, + "loss": 3.1121, + "step": 52130 + }, + { + "epoch": 3.542261176790325, + "grad_norm": 8.66512680053711, + "learning_rate": 5.574041989400735e-06, + "loss": 3.0584, + "step": 52135 + }, + { + "epoch": 3.5426008968609866, + "grad_norm": 6.45006799697876, + "learning_rate": 5.573617339312407e-06, + "loss": 2.7885, + "step": 52140 + }, + { + "epoch": 3.542940616931648, + "grad_norm": 6.453755855560303, + "learning_rate": 5.5731926892240795e-06, + "loss": 2.9637, + "step": 52145 + }, + { + "epoch": 3.54328033700231, + "grad_norm": 5.790700912475586, + "learning_rate": 5.572768039135753e-06, + "loss": 2.8719, + "step": 52150 + }, + { + "epoch": 3.543620057072972, + "grad_norm": 6.697534561157227, + "learning_rate": 5.572343389047425e-06, + "loss": 3.0453, + "step": 52155 + }, + { + "epoch": 3.5439597771436335, + "grad_norm": 8.804265975952148, + "learning_rate": 5.571918738959098e-06, + "loss": 3.2443, + "step": 52160 + }, + { + "epoch": 3.544299497214295, + "grad_norm": 7.674715042114258, + "learning_rate": 5.5714940888707716e-06, + "loss": 2.904, + "step": 52165 + }, + { + "epoch": 3.5446392172849572, + "grad_norm": 8.87926197052002, + "learning_rate": 5.5710694387824435e-06, + "loss": 2.8203, + "step": 52170 + }, + { + "epoch": 3.544978937355619, + "grad_norm": 7.559710502624512, + "learning_rate": 5.570644788694116e-06, + "loss": 2.7955, + "step": 52175 + }, + { + "epoch": 3.5453186574262805, + "grad_norm": 7.815056800842285, + "learning_rate": 5.57022013860579e-06, + "loss": 2.7602, + "step": 52180 + }, + { + "epoch": 3.5456583774969426, + "grad_norm": 6.411179065704346, + "learning_rate": 5.569795488517462e-06, + "loss": 2.6734, + "step": 52185 + }, + { + "epoch": 3.545998097567604, + "grad_norm": 6.160617828369141, + "learning_rate": 5.569370838429135e-06, + "loss": 2.9926, + "step": 52190 + }, + { + "epoch": 3.546337817638266, + "grad_norm": 6.423404216766357, + "learning_rate": 5.568946188340808e-06, + "loss": 2.7592, + "step": 52195 + }, + { + "epoch": 3.546677537708928, + "grad_norm": 9.958795547485352, + "learning_rate": 5.56852153825248e-06, + "loss": 2.8391, + "step": 52200 + }, + { + "epoch": 3.5470172577795895, + "grad_norm": 8.692988395690918, + "learning_rate": 5.568096888164153e-06, + "loss": 2.8616, + "step": 52205 + }, + { + "epoch": 3.547356977850251, + "grad_norm": 7.097723007202148, + "learning_rate": 5.567672238075827e-06, + "loss": 2.9081, + "step": 52210 + }, + { + "epoch": 3.5476966979209132, + "grad_norm": 7.598605632781982, + "learning_rate": 5.567247587987499e-06, + "loss": 3.0097, + "step": 52215 + }, + { + "epoch": 3.548036417991575, + "grad_norm": 7.018558979034424, + "learning_rate": 5.5668229378991715e-06, + "loss": 2.9532, + "step": 52220 + }, + { + "epoch": 3.5483761380622365, + "grad_norm": 7.466568470001221, + "learning_rate": 5.566398287810844e-06, + "loss": 2.9477, + "step": 52225 + }, + { + "epoch": 3.5487158581328986, + "grad_norm": 7.996242046356201, + "learning_rate": 5.565973637722517e-06, + "loss": 3.0547, + "step": 52230 + }, + { + "epoch": 3.54905557820356, + "grad_norm": 6.023513317108154, + "learning_rate": 5.565548987634189e-06, + "loss": 2.9156, + "step": 52235 + }, + { + "epoch": 3.549395298274222, + "grad_norm": 7.530548572540283, + "learning_rate": 5.565124337545863e-06, + "loss": 2.9831, + "step": 52240 + }, + { + "epoch": 3.549735018344884, + "grad_norm": 8.38786792755127, + "learning_rate": 5.5646996874575355e-06, + "loss": 3.0942, + "step": 52245 + }, + { + "epoch": 3.5500747384155455, + "grad_norm": 6.465775012969971, + "learning_rate": 5.5642750373692075e-06, + "loss": 2.8432, + "step": 52250 + }, + { + "epoch": 3.550414458486207, + "grad_norm": 6.46498441696167, + "learning_rate": 5.563850387280881e-06, + "loss": 2.9138, + "step": 52255 + }, + { + "epoch": 3.5507541785568693, + "grad_norm": 8.102930068969727, + "learning_rate": 5.563425737192554e-06, + "loss": 2.904, + "step": 52260 + }, + { + "epoch": 3.551093898627531, + "grad_norm": 6.051650047302246, + "learning_rate": 5.563001087104226e-06, + "loss": 2.858, + "step": 52265 + }, + { + "epoch": 3.5514336186981925, + "grad_norm": 6.425142288208008, + "learning_rate": 5.5625764370158995e-06, + "loss": 2.8621, + "step": 52270 + }, + { + "epoch": 3.5517733387688546, + "grad_norm": 5.861327648162842, + "learning_rate": 5.562151786927572e-06, + "loss": 3.1031, + "step": 52275 + }, + { + "epoch": 3.5521130588395162, + "grad_norm": 8.05202579498291, + "learning_rate": 5.561727136839244e-06, + "loss": 3.0887, + "step": 52280 + }, + { + "epoch": 3.552452778910178, + "grad_norm": 7.711359024047852, + "learning_rate": 5.561302486750918e-06, + "loss": 2.9987, + "step": 52285 + }, + { + "epoch": 3.55279249898084, + "grad_norm": 6.706821441650391, + "learning_rate": 5.560877836662591e-06, + "loss": 3.0116, + "step": 52290 + }, + { + "epoch": 3.5531322190515016, + "grad_norm": 8.654193878173828, + "learning_rate": 5.560453186574263e-06, + "loss": 3.3063, + "step": 52295 + }, + { + "epoch": 3.553471939122163, + "grad_norm": 6.606875896453857, + "learning_rate": 5.560028536485936e-06, + "loss": 2.8284, + "step": 52300 + }, + { + "epoch": 3.5538116591928253, + "grad_norm": 7.179717540740967, + "learning_rate": 5.559603886397609e-06, + "loss": 3.1384, + "step": 52305 + }, + { + "epoch": 3.554151379263487, + "grad_norm": 7.033690929412842, + "learning_rate": 5.559179236309281e-06, + "loss": 2.8463, + "step": 52310 + }, + { + "epoch": 3.5544910993341485, + "grad_norm": 6.768926620483398, + "learning_rate": 5.558754586220955e-06, + "loss": 3.0277, + "step": 52315 + }, + { + "epoch": 3.5548308194048106, + "grad_norm": 7.1485185623168945, + "learning_rate": 5.558329936132627e-06, + "loss": 2.9854, + "step": 52320 + }, + { + "epoch": 3.5551705394754722, + "grad_norm": 5.719754219055176, + "learning_rate": 5.5579052860442995e-06, + "loss": 3.0572, + "step": 52325 + }, + { + "epoch": 3.555510259546134, + "grad_norm": 6.339126110076904, + "learning_rate": 5.557480635955973e-06, + "loss": 2.9501, + "step": 52330 + }, + { + "epoch": 3.555849979616796, + "grad_norm": 7.949528217315674, + "learning_rate": 5.557055985867645e-06, + "loss": 3.2173, + "step": 52335 + }, + { + "epoch": 3.5561896996874576, + "grad_norm": 6.578742504119873, + "learning_rate": 5.556631335779318e-06, + "loss": 2.7806, + "step": 52340 + }, + { + "epoch": 3.556529419758119, + "grad_norm": 8.316962242126465, + "learning_rate": 5.5562066856909915e-06, + "loss": 2.7816, + "step": 52345 + }, + { + "epoch": 3.5568691398287813, + "grad_norm": 6.460604667663574, + "learning_rate": 5.5557820356026635e-06, + "loss": 2.8328, + "step": 52350 + }, + { + "epoch": 3.557208859899443, + "grad_norm": 8.599050521850586, + "learning_rate": 5.555357385514336e-06, + "loss": 2.8873, + "step": 52355 + }, + { + "epoch": 3.5575485799701045, + "grad_norm": 6.53253173828125, + "learning_rate": 5.55493273542601e-06, + "loss": 3.0825, + "step": 52360 + }, + { + "epoch": 3.5578883000407666, + "grad_norm": 6.522430896759033, + "learning_rate": 5.554508085337682e-06, + "loss": 2.8148, + "step": 52365 + }, + { + "epoch": 3.5582280201114282, + "grad_norm": 6.520254611968994, + "learning_rate": 5.554083435249355e-06, + "loss": 2.9519, + "step": 52370 + }, + { + "epoch": 3.55856774018209, + "grad_norm": 6.947271347045898, + "learning_rate": 5.553658785161028e-06, + "loss": 2.7552, + "step": 52375 + }, + { + "epoch": 3.558907460252752, + "grad_norm": 6.4096245765686035, + "learning_rate": 5.5532341350727e-06, + "loss": 2.9375, + "step": 52380 + }, + { + "epoch": 3.5592471803234136, + "grad_norm": 6.629870891571045, + "learning_rate": 5.552809484984373e-06, + "loss": 2.8047, + "step": 52385 + }, + { + "epoch": 3.559586900394075, + "grad_norm": 9.010618209838867, + "learning_rate": 5.552384834896046e-06, + "loss": 2.8688, + "step": 52390 + }, + { + "epoch": 3.5599266204647373, + "grad_norm": 10.025575637817383, + "learning_rate": 5.551960184807719e-06, + "loss": 2.819, + "step": 52395 + }, + { + "epoch": 3.560266340535399, + "grad_norm": 8.08598804473877, + "learning_rate": 5.551535534719392e-06, + "loss": 2.8732, + "step": 52400 + }, + { + "epoch": 3.5606060606060606, + "grad_norm": 6.876646995544434, + "learning_rate": 5.551110884631064e-06, + "loss": 2.8237, + "step": 52405 + }, + { + "epoch": 3.5609457806767226, + "grad_norm": 6.464658260345459, + "learning_rate": 5.550686234542737e-06, + "loss": 2.9876, + "step": 52410 + }, + { + "epoch": 3.5612855007473843, + "grad_norm": 7.109898090362549, + "learning_rate": 5.550261584454411e-06, + "loss": 2.9955, + "step": 52415 + }, + { + "epoch": 3.561625220818046, + "grad_norm": 7.367915630340576, + "learning_rate": 5.549836934366083e-06, + "loss": 2.9372, + "step": 52420 + }, + { + "epoch": 3.561964940888708, + "grad_norm": 6.773404598236084, + "learning_rate": 5.5494122842777555e-06, + "loss": 2.9982, + "step": 52425 + }, + { + "epoch": 3.5623046609593696, + "grad_norm": 6.881749629974365, + "learning_rate": 5.548987634189429e-06, + "loss": 2.6172, + "step": 52430 + }, + { + "epoch": 3.5626443810300312, + "grad_norm": 6.717489242553711, + "learning_rate": 5.548562984101101e-06, + "loss": 3.1088, + "step": 52435 + }, + { + "epoch": 3.5629841011006933, + "grad_norm": 7.9974470138549805, + "learning_rate": 5.548138334012774e-06, + "loss": 2.7834, + "step": 52440 + }, + { + "epoch": 3.563323821171355, + "grad_norm": 9.077794075012207, + "learning_rate": 5.5477136839244476e-06, + "loss": 2.9289, + "step": 52445 + }, + { + "epoch": 3.5636635412420166, + "grad_norm": 9.944906234741211, + "learning_rate": 5.5472890338361195e-06, + "loss": 2.8884, + "step": 52450 + }, + { + "epoch": 3.564003261312678, + "grad_norm": 11.226998329162598, + "learning_rate": 5.546864383747792e-06, + "loss": 2.8866, + "step": 52455 + }, + { + "epoch": 3.5643429813833403, + "grad_norm": 7.430442810058594, + "learning_rate": 5.546439733659465e-06, + "loss": 3.038, + "step": 52460 + }, + { + "epoch": 3.564682701454002, + "grad_norm": 6.9544878005981445, + "learning_rate": 5.546015083571138e-06, + "loss": 2.8188, + "step": 52465 + }, + { + "epoch": 3.5650224215246635, + "grad_norm": 7.307485580444336, + "learning_rate": 5.545590433482811e-06, + "loss": 2.839, + "step": 52470 + }, + { + "epoch": 3.5653621415953256, + "grad_norm": 9.01939582824707, + "learning_rate": 5.5451657833944835e-06, + "loss": 2.9909, + "step": 52475 + }, + { + "epoch": 3.5657018616659872, + "grad_norm": 8.716110229492188, + "learning_rate": 5.544741133306156e-06, + "loss": 3.1489, + "step": 52480 + }, + { + "epoch": 3.566041581736649, + "grad_norm": 6.971141815185547, + "learning_rate": 5.544316483217828e-06, + "loss": 3.0425, + "step": 52485 + }, + { + "epoch": 3.5663813018073105, + "grad_norm": 7.686551570892334, + "learning_rate": 5.543891833129502e-06, + "loss": 3.0183, + "step": 52490 + }, + { + "epoch": 3.5667210218779726, + "grad_norm": 7.57843542098999, + "learning_rate": 5.543467183041175e-06, + "loss": 3.2333, + "step": 52495 + }, + { + "epoch": 3.567060741948634, + "grad_norm": 5.801814079284668, + "learning_rate": 5.543042532952847e-06, + "loss": 2.7835, + "step": 52500 + }, + { + "epoch": 3.567400462019296, + "grad_norm": 6.19645357131958, + "learning_rate": 5.54261788286452e-06, + "loss": 2.8549, + "step": 52505 + }, + { + "epoch": 3.567740182089958, + "grad_norm": 5.459426403045654, + "learning_rate": 5.542193232776193e-06, + "loss": 2.819, + "step": 52510 + }, + { + "epoch": 3.5680799021606195, + "grad_norm": 6.619383335113525, + "learning_rate": 5.541768582687865e-06, + "loss": 2.6636, + "step": 52515 + }, + { + "epoch": 3.568419622231281, + "grad_norm": 6.762641906738281, + "learning_rate": 5.541343932599539e-06, + "loss": 2.9615, + "step": 52520 + }, + { + "epoch": 3.5687593423019432, + "grad_norm": 8.825703620910645, + "learning_rate": 5.5409192825112115e-06, + "loss": 3.2701, + "step": 52525 + }, + { + "epoch": 3.569099062372605, + "grad_norm": 7.430239200592041, + "learning_rate": 5.5404946324228835e-06, + "loss": 2.9183, + "step": 52530 + }, + { + "epoch": 3.5694387824432665, + "grad_norm": 7.226684093475342, + "learning_rate": 5.540069982334557e-06, + "loss": 2.6879, + "step": 52535 + }, + { + "epoch": 3.5697785025139286, + "grad_norm": 7.246286869049072, + "learning_rate": 5.53964533224623e-06, + "loss": 3.1894, + "step": 52540 + }, + { + "epoch": 3.57011822258459, + "grad_norm": 8.6387300491333, + "learning_rate": 5.539220682157902e-06, + "loss": 3.0067, + "step": 52545 + }, + { + "epoch": 3.570457942655252, + "grad_norm": 8.473434448242188, + "learning_rate": 5.5387960320695755e-06, + "loss": 3.0581, + "step": 52550 + }, + { + "epoch": 3.570797662725914, + "grad_norm": 8.81411361694336, + "learning_rate": 5.538371381981248e-06, + "loss": 3.0947, + "step": 52555 + }, + { + "epoch": 3.5711373827965756, + "grad_norm": 7.535830974578857, + "learning_rate": 5.53794673189292e-06, + "loss": 2.9544, + "step": 52560 + }, + { + "epoch": 3.571477102867237, + "grad_norm": 6.414172649383545, + "learning_rate": 5.537522081804594e-06, + "loss": 2.82, + "step": 52565 + }, + { + "epoch": 3.5718168229378993, + "grad_norm": 6.617055416107178, + "learning_rate": 5.537097431716266e-06, + "loss": 2.9967, + "step": 52570 + }, + { + "epoch": 3.572156543008561, + "grad_norm": 7.9949116706848145, + "learning_rate": 5.536672781627939e-06, + "loss": 3.1379, + "step": 52575 + }, + { + "epoch": 3.5724962630792225, + "grad_norm": 7.980973243713379, + "learning_rate": 5.536248131539612e-06, + "loss": 2.9622, + "step": 52580 + }, + { + "epoch": 3.5728359831498846, + "grad_norm": 8.054998397827148, + "learning_rate": 5.535823481451284e-06, + "loss": 3.2268, + "step": 52585 + }, + { + "epoch": 3.5731757032205462, + "grad_norm": 7.948501110076904, + "learning_rate": 5.535398831362957e-06, + "loss": 3.1282, + "step": 52590 + }, + { + "epoch": 3.573515423291208, + "grad_norm": 9.459346771240234, + "learning_rate": 5.534974181274631e-06, + "loss": 2.8997, + "step": 52595 + }, + { + "epoch": 3.57385514336187, + "grad_norm": 6.739127159118652, + "learning_rate": 5.534549531186303e-06, + "loss": 2.9055, + "step": 52600 + }, + { + "epoch": 3.5741948634325316, + "grad_norm": 5.403268814086914, + "learning_rate": 5.5341248810979755e-06, + "loss": 2.8701, + "step": 52605 + }, + { + "epoch": 3.574534583503193, + "grad_norm": 9.846470832824707, + "learning_rate": 5.533700231009649e-06, + "loss": 3.217, + "step": 52610 + }, + { + "epoch": 3.5748743035738553, + "grad_norm": 9.437577247619629, + "learning_rate": 5.533275580921321e-06, + "loss": 2.9219, + "step": 52615 + }, + { + "epoch": 3.575214023644517, + "grad_norm": 6.8811726570129395, + "learning_rate": 5.532850930832994e-06, + "loss": 2.9508, + "step": 52620 + }, + { + "epoch": 3.5755537437151785, + "grad_norm": 7.176451206207275, + "learning_rate": 5.5324262807446675e-06, + "loss": 3.017, + "step": 52625 + }, + { + "epoch": 3.5758934637858406, + "grad_norm": 7.876410484313965, + "learning_rate": 5.5320016306563395e-06, + "loss": 3.0571, + "step": 52630 + }, + { + "epoch": 3.5762331838565022, + "grad_norm": 6.523097038269043, + "learning_rate": 5.531576980568012e-06, + "loss": 2.7501, + "step": 52635 + }, + { + "epoch": 3.576572903927164, + "grad_norm": 8.533604621887207, + "learning_rate": 5.531152330479685e-06, + "loss": 2.8458, + "step": 52640 + }, + { + "epoch": 3.576912623997826, + "grad_norm": 6.546022891998291, + "learning_rate": 5.530727680391358e-06, + "loss": 2.9089, + "step": 52645 + }, + { + "epoch": 3.5772523440684876, + "grad_norm": 7.333369731903076, + "learning_rate": 5.530303030303031e-06, + "loss": 3.1625, + "step": 52650 + }, + { + "epoch": 3.577592064139149, + "grad_norm": 7.21026086807251, + "learning_rate": 5.5298783802147035e-06, + "loss": 2.9185, + "step": 52655 + }, + { + "epoch": 3.5779317842098113, + "grad_norm": 7.252209663391113, + "learning_rate": 5.529453730126376e-06, + "loss": 2.8327, + "step": 52660 + }, + { + "epoch": 3.578271504280473, + "grad_norm": 6.562163352966309, + "learning_rate": 5.529029080038048e-06, + "loss": 2.8396, + "step": 52665 + }, + { + "epoch": 3.5786112243511345, + "grad_norm": 7.276566982269287, + "learning_rate": 5.528604429949722e-06, + "loss": 2.953, + "step": 52670 + }, + { + "epoch": 3.5789509444217966, + "grad_norm": 8.068192481994629, + "learning_rate": 5.528179779861395e-06, + "loss": 2.9518, + "step": 52675 + }, + { + "epoch": 3.5792906644924583, + "grad_norm": 5.374444961547852, + "learning_rate": 5.527755129773067e-06, + "loss": 3.0108, + "step": 52680 + }, + { + "epoch": 3.57963038456312, + "grad_norm": 7.186967372894287, + "learning_rate": 5.52733047968474e-06, + "loss": 3.0086, + "step": 52685 + }, + { + "epoch": 3.579970104633782, + "grad_norm": 7.214108467102051, + "learning_rate": 5.526905829596413e-06, + "loss": 3.0166, + "step": 52690 + }, + { + "epoch": 3.5803098247044436, + "grad_norm": 7.8889994621276855, + "learning_rate": 5.526481179508085e-06, + "loss": 2.8255, + "step": 52695 + }, + { + "epoch": 3.580649544775105, + "grad_norm": 7.535250186920166, + "learning_rate": 5.526056529419759e-06, + "loss": 3.2009, + "step": 52700 + }, + { + "epoch": 3.5809892648457673, + "grad_norm": 7.334280490875244, + "learning_rate": 5.5256318793314315e-06, + "loss": 2.9528, + "step": 52705 + }, + { + "epoch": 3.581328984916429, + "grad_norm": 6.546240329742432, + "learning_rate": 5.5252072292431035e-06, + "loss": 2.9406, + "step": 52710 + }, + { + "epoch": 3.5816687049870906, + "grad_norm": 6.18552827835083, + "learning_rate": 5.524782579154777e-06, + "loss": 2.9494, + "step": 52715 + }, + { + "epoch": 3.5820084250577526, + "grad_norm": 9.449615478515625, + "learning_rate": 5.52435792906645e-06, + "loss": 3.0453, + "step": 52720 + }, + { + "epoch": 3.5823481451284143, + "grad_norm": 7.454505920410156, + "learning_rate": 5.523933278978122e-06, + "loss": 3.0349, + "step": 52725 + }, + { + "epoch": 3.582687865199076, + "grad_norm": 7.903056621551514, + "learning_rate": 5.5235086288897955e-06, + "loss": 2.8705, + "step": 52730 + }, + { + "epoch": 3.583027585269738, + "grad_norm": 6.802554130554199, + "learning_rate": 5.5230839788014675e-06, + "loss": 2.9642, + "step": 52735 + }, + { + "epoch": 3.5833673053403996, + "grad_norm": 9.677363395690918, + "learning_rate": 5.522659328713141e-06, + "loss": 2.9337, + "step": 52740 + }, + { + "epoch": 3.5837070254110612, + "grad_norm": 6.265154838562012, + "learning_rate": 5.522234678624814e-06, + "loss": 2.604, + "step": 52745 + }, + { + "epoch": 3.5840467454817233, + "grad_norm": 6.810751438140869, + "learning_rate": 5.521810028536486e-06, + "loss": 2.9765, + "step": 52750 + }, + { + "epoch": 3.584386465552385, + "grad_norm": 7.079813480377197, + "learning_rate": 5.5213853784481595e-06, + "loss": 2.8129, + "step": 52755 + }, + { + "epoch": 3.5847261856230466, + "grad_norm": 8.654583930969238, + "learning_rate": 5.520960728359832e-06, + "loss": 3.1298, + "step": 52760 + }, + { + "epoch": 3.5850659056937086, + "grad_norm": 8.643328666687012, + "learning_rate": 5.520536078271504e-06, + "loss": 2.9431, + "step": 52765 + }, + { + "epoch": 3.5854056257643703, + "grad_norm": 5.914066791534424, + "learning_rate": 5.520111428183178e-06, + "loss": 3.0117, + "step": 52770 + }, + { + "epoch": 3.585745345835032, + "grad_norm": 6.597156524658203, + "learning_rate": 5.519686778094851e-06, + "loss": 2.9992, + "step": 52775 + }, + { + "epoch": 3.586085065905694, + "grad_norm": 5.107043266296387, + "learning_rate": 5.519262128006523e-06, + "loss": 2.9551, + "step": 52780 + }, + { + "epoch": 3.5864247859763556, + "grad_norm": 8.282841682434082, + "learning_rate": 5.518837477918196e-06, + "loss": 2.7502, + "step": 52785 + }, + { + "epoch": 3.5867645060470172, + "grad_norm": 6.027997970581055, + "learning_rate": 5.518412827829869e-06, + "loss": 3.153, + "step": 52790 + }, + { + "epoch": 3.587104226117679, + "grad_norm": 7.221599102020264, + "learning_rate": 5.517988177741541e-06, + "loss": 3.1561, + "step": 52795 + }, + { + "epoch": 3.587443946188341, + "grad_norm": 5.925917148590088, + "learning_rate": 5.517563527653215e-06, + "loss": 3.2027, + "step": 52800 + }, + { + "epoch": 3.5877836662590026, + "grad_norm": 7.961273193359375, + "learning_rate": 5.517138877564887e-06, + "loss": 3.0042, + "step": 52805 + }, + { + "epoch": 3.588123386329664, + "grad_norm": 7.943288326263428, + "learning_rate": 5.5167142274765595e-06, + "loss": 3.3033, + "step": 52810 + }, + { + "epoch": 3.5884631064003263, + "grad_norm": 6.7348480224609375, + "learning_rate": 5.516289577388233e-06, + "loss": 3.0572, + "step": 52815 + }, + { + "epoch": 3.588802826470988, + "grad_norm": 8.597132682800293, + "learning_rate": 5.515864927299905e-06, + "loss": 2.9067, + "step": 52820 + }, + { + "epoch": 3.5891425465416495, + "grad_norm": 9.095280647277832, + "learning_rate": 5.515440277211578e-06, + "loss": 2.9861, + "step": 52825 + }, + { + "epoch": 3.589482266612311, + "grad_norm": 7.00110387802124, + "learning_rate": 5.5150156271232515e-06, + "loss": 3.1474, + "step": 52830 + }, + { + "epoch": 3.5898219866829733, + "grad_norm": 7.338333606719971, + "learning_rate": 5.5145909770349235e-06, + "loss": 2.9764, + "step": 52835 + }, + { + "epoch": 3.590161706753635, + "grad_norm": 6.980720520019531, + "learning_rate": 5.514166326946596e-06, + "loss": 3.1535, + "step": 52840 + }, + { + "epoch": 3.5905014268242965, + "grad_norm": 7.594194412231445, + "learning_rate": 5.51374167685827e-06, + "loss": 2.9897, + "step": 52845 + }, + { + "epoch": 3.5908411468949586, + "grad_norm": 7.716477394104004, + "learning_rate": 5.513317026769942e-06, + "loss": 3.0492, + "step": 52850 + }, + { + "epoch": 3.59118086696562, + "grad_norm": 9.582322120666504, + "learning_rate": 5.512892376681615e-06, + "loss": 3.0179, + "step": 52855 + }, + { + "epoch": 3.591520587036282, + "grad_norm": 6.047802925109863, + "learning_rate": 5.512467726593288e-06, + "loss": 2.8384, + "step": 52860 + }, + { + "epoch": 3.591860307106944, + "grad_norm": 7.038320541381836, + "learning_rate": 5.51204307650496e-06, + "loss": 2.8991, + "step": 52865 + }, + { + "epoch": 3.5922000271776056, + "grad_norm": 8.862276077270508, + "learning_rate": 5.511618426416633e-06, + "loss": 2.989, + "step": 52870 + }, + { + "epoch": 3.592539747248267, + "grad_norm": 6.99591064453125, + "learning_rate": 5.511193776328307e-06, + "loss": 2.7205, + "step": 52875 + }, + { + "epoch": 3.5928794673189293, + "grad_norm": 7.723250389099121, + "learning_rate": 5.510769126239979e-06, + "loss": 2.9671, + "step": 52880 + }, + { + "epoch": 3.593219187389591, + "grad_norm": 7.148946762084961, + "learning_rate": 5.5103444761516515e-06, + "loss": 3.0102, + "step": 52885 + }, + { + "epoch": 3.5935589074602525, + "grad_norm": 5.608345985412598, + "learning_rate": 5.509919826063324e-06, + "loss": 2.8399, + "step": 52890 + }, + { + "epoch": 3.5938986275309146, + "grad_norm": 6.006377220153809, + "learning_rate": 5.509495175974997e-06, + "loss": 2.9289, + "step": 52895 + }, + { + "epoch": 3.5942383476015762, + "grad_norm": 6.623997688293457, + "learning_rate": 5.50907052588667e-06, + "loss": 2.7674, + "step": 52900 + }, + { + "epoch": 3.594578067672238, + "grad_norm": 7.24717903137207, + "learning_rate": 5.508645875798343e-06, + "loss": 2.9181, + "step": 52905 + }, + { + "epoch": 3.5949177877429, + "grad_norm": 7.462850093841553, + "learning_rate": 5.5082212257100155e-06, + "loss": 2.9811, + "step": 52910 + }, + { + "epoch": 3.5952575078135616, + "grad_norm": 8.0250883102417, + "learning_rate": 5.5077965756216874e-06, + "loss": 3.2212, + "step": 52915 + }, + { + "epoch": 3.595597227884223, + "grad_norm": 6.001498222351074, + "learning_rate": 5.507371925533361e-06, + "loss": 3.0069, + "step": 52920 + }, + { + "epoch": 3.5959369479548853, + "grad_norm": 6.510980606079102, + "learning_rate": 5.506947275445034e-06, + "loss": 2.5418, + "step": 52925 + }, + { + "epoch": 3.596276668025547, + "grad_norm": 8.708207130432129, + "learning_rate": 5.506522625356706e-06, + "loss": 2.9883, + "step": 52930 + }, + { + "epoch": 3.5966163880962085, + "grad_norm": 5.968963623046875, + "learning_rate": 5.5060979752683795e-06, + "loss": 2.8018, + "step": 52935 + }, + { + "epoch": 3.5969561081668706, + "grad_norm": 7.762907028198242, + "learning_rate": 5.505673325180052e-06, + "loss": 2.8155, + "step": 52940 + }, + { + "epoch": 3.5972958282375322, + "grad_norm": 7.236458778381348, + "learning_rate": 5.505248675091724e-06, + "loss": 2.7714, + "step": 52945 + }, + { + "epoch": 3.597635548308194, + "grad_norm": 7.2899250984191895, + "learning_rate": 5.504824025003398e-06, + "loss": 3.042, + "step": 52950 + }, + { + "epoch": 3.597975268378856, + "grad_norm": 7.892643928527832, + "learning_rate": 5.504399374915071e-06, + "loss": 2.9591, + "step": 52955 + }, + { + "epoch": 3.5983149884495176, + "grad_norm": 6.645802974700928, + "learning_rate": 5.503974724826743e-06, + "loss": 2.8316, + "step": 52960 + }, + { + "epoch": 3.598654708520179, + "grad_norm": 6.2049431800842285, + "learning_rate": 5.503550074738416e-06, + "loss": 2.9337, + "step": 52965 + }, + { + "epoch": 3.5989944285908413, + "grad_norm": 6.812562942504883, + "learning_rate": 5.503125424650089e-06, + "loss": 2.9851, + "step": 52970 + }, + { + "epoch": 3.599334148661503, + "grad_norm": 5.6216301918029785, + "learning_rate": 5.502700774561761e-06, + "loss": 3.1024, + "step": 52975 + }, + { + "epoch": 3.5996738687321646, + "grad_norm": 7.111921310424805, + "learning_rate": 5.502276124473435e-06, + "loss": 3.0172, + "step": 52980 + }, + { + "epoch": 3.6000135888028266, + "grad_norm": 8.445518493652344, + "learning_rate": 5.501851474385107e-06, + "loss": 2.6617, + "step": 52985 + }, + { + "epoch": 3.6003533088734883, + "grad_norm": 7.634243011474609, + "learning_rate": 5.5014268242967794e-06, + "loss": 2.8763, + "step": 52990 + }, + { + "epoch": 3.60069302894415, + "grad_norm": 7.550417423248291, + "learning_rate": 5.501002174208453e-06, + "loss": 3.1618, + "step": 52995 + }, + { + "epoch": 3.601032749014812, + "grad_norm": 6.2486419677734375, + "learning_rate": 5.500577524120125e-06, + "loss": 2.7898, + "step": 53000 + }, + { + "epoch": 3.6013724690854736, + "grad_norm": 5.982616901397705, + "learning_rate": 5.500152874031798e-06, + "loss": 2.9319, + "step": 53005 + }, + { + "epoch": 3.6017121891561352, + "grad_norm": 7.295872211456299, + "learning_rate": 5.4997282239434715e-06, + "loss": 2.8107, + "step": 53010 + }, + { + "epoch": 3.6020519092267973, + "grad_norm": 6.700310707092285, + "learning_rate": 5.4993035738551435e-06, + "loss": 2.903, + "step": 53015 + }, + { + "epoch": 3.602391629297459, + "grad_norm": 7.002999782562256, + "learning_rate": 5.498878923766816e-06, + "loss": 2.8597, + "step": 53020 + }, + { + "epoch": 3.6027313493681206, + "grad_norm": 9.186859130859375, + "learning_rate": 5.49845427367849e-06, + "loss": 2.853, + "step": 53025 + }, + { + "epoch": 3.6030710694387826, + "grad_norm": 7.926418304443359, + "learning_rate": 5.498029623590162e-06, + "loss": 2.8851, + "step": 53030 + }, + { + "epoch": 3.6034107895094443, + "grad_norm": 6.302804946899414, + "learning_rate": 5.497604973501835e-06, + "loss": 2.9011, + "step": 53035 + }, + { + "epoch": 3.603750509580106, + "grad_norm": 8.313060760498047, + "learning_rate": 5.497180323413508e-06, + "loss": 2.8834, + "step": 53040 + }, + { + "epoch": 3.604090229650768, + "grad_norm": 7.944309234619141, + "learning_rate": 5.49675567332518e-06, + "loss": 2.8704, + "step": 53045 + }, + { + "epoch": 3.6044299497214296, + "grad_norm": 7.830504894256592, + "learning_rate": 5.496331023236853e-06, + "loss": 2.8507, + "step": 53050 + }, + { + "epoch": 3.6047696697920912, + "grad_norm": 6.0174407958984375, + "learning_rate": 5.495906373148526e-06, + "loss": 3.0053, + "step": 53055 + }, + { + "epoch": 3.6051093898627533, + "grad_norm": 7.274085998535156, + "learning_rate": 5.495481723060199e-06, + "loss": 2.9998, + "step": 53060 + }, + { + "epoch": 3.605449109933415, + "grad_norm": 8.906363487243652, + "learning_rate": 5.4950570729718715e-06, + "loss": 2.9494, + "step": 53065 + }, + { + "epoch": 3.6057888300040766, + "grad_norm": 5.3590569496154785, + "learning_rate": 5.494632422883544e-06, + "loss": 2.9817, + "step": 53070 + }, + { + "epoch": 3.6061285500747386, + "grad_norm": 9.319375991821289, + "learning_rate": 5.494207772795217e-06, + "loss": 2.8931, + "step": 53075 + }, + { + "epoch": 3.6064682701454003, + "grad_norm": 7.555563926696777, + "learning_rate": 5.493783122706891e-06, + "loss": 2.9036, + "step": 53080 + }, + { + "epoch": 3.606807990216062, + "grad_norm": 6.842304706573486, + "learning_rate": 5.493358472618563e-06, + "loss": 2.7542, + "step": 53085 + }, + { + "epoch": 3.607147710286724, + "grad_norm": 7.0595245361328125, + "learning_rate": 5.4929338225302355e-06, + "loss": 2.8045, + "step": 53090 + }, + { + "epoch": 3.6074874303573856, + "grad_norm": 5.4353203773498535, + "learning_rate": 5.492509172441909e-06, + "loss": 2.8267, + "step": 53095 + }, + { + "epoch": 3.6078271504280472, + "grad_norm": 6.456258296966553, + "learning_rate": 5.492084522353581e-06, + "loss": 3.077, + "step": 53100 + }, + { + "epoch": 3.6081668704987093, + "grad_norm": 6.339841365814209, + "learning_rate": 5.491659872265254e-06, + "loss": 3.0006, + "step": 53105 + }, + { + "epoch": 3.608506590569371, + "grad_norm": 7.644526958465576, + "learning_rate": 5.4912352221769275e-06, + "loss": 2.7503, + "step": 53110 + }, + { + "epoch": 3.6088463106400326, + "grad_norm": 6.836472988128662, + "learning_rate": 5.4908105720885995e-06, + "loss": 2.9788, + "step": 53115 + }, + { + "epoch": 3.6091860307106947, + "grad_norm": 8.504411697387695, + "learning_rate": 5.490385922000272e-06, + "loss": 3.0188, + "step": 53120 + }, + { + "epoch": 3.6095257507813563, + "grad_norm": 8.12525463104248, + "learning_rate": 5.489961271911946e-06, + "loss": 3.1119, + "step": 53125 + }, + { + "epoch": 3.609865470852018, + "grad_norm": 8.125636100769043, + "learning_rate": 5.489536621823618e-06, + "loss": 3.0323, + "step": 53130 + }, + { + "epoch": 3.6102051909226796, + "grad_norm": 7.008706569671631, + "learning_rate": 5.489111971735291e-06, + "loss": 3.1832, + "step": 53135 + }, + { + "epoch": 3.6105449109933416, + "grad_norm": 7.18609094619751, + "learning_rate": 5.4886873216469635e-06, + "loss": 2.8509, + "step": 53140 + }, + { + "epoch": 3.6108846310640033, + "grad_norm": 7.040839672088623, + "learning_rate": 5.488262671558636e-06, + "loss": 2.7952, + "step": 53145 + }, + { + "epoch": 3.611224351134665, + "grad_norm": 8.063737869262695, + "learning_rate": 5.487838021470308e-06, + "loss": 3.15, + "step": 53150 + }, + { + "epoch": 3.611564071205327, + "grad_norm": 6.665809154510498, + "learning_rate": 5.487413371381982e-06, + "loss": 2.9131, + "step": 53155 + }, + { + "epoch": 3.6119037912759886, + "grad_norm": 7.853329658508301, + "learning_rate": 5.486988721293655e-06, + "loss": 2.7929, + "step": 53160 + }, + { + "epoch": 3.6122435113466502, + "grad_norm": 8.030280113220215, + "learning_rate": 5.486564071205327e-06, + "loss": 3.0823, + "step": 53165 + }, + { + "epoch": 3.612583231417312, + "grad_norm": 5.8054890632629395, + "learning_rate": 5.486139421117e-06, + "loss": 2.84, + "step": 53170 + }, + { + "epoch": 3.612922951487974, + "grad_norm": 8.019311904907227, + "learning_rate": 5.485714771028673e-06, + "loss": 2.6829, + "step": 53175 + }, + { + "epoch": 3.6132626715586356, + "grad_norm": 7.582683086395264, + "learning_rate": 5.485290120940345e-06, + "loss": 2.9781, + "step": 53180 + }, + { + "epoch": 3.613602391629297, + "grad_norm": 9.083072662353516, + "learning_rate": 5.484865470852019e-06, + "loss": 2.8517, + "step": 53185 + }, + { + "epoch": 3.6139421116999593, + "grad_norm": 10.195756912231445, + "learning_rate": 5.4844408207636915e-06, + "loss": 3.0426, + "step": 53190 + }, + { + "epoch": 3.614281831770621, + "grad_norm": 7.326128959655762, + "learning_rate": 5.4840161706753634e-06, + "loss": 2.9514, + "step": 53195 + }, + { + "epoch": 3.6146215518412825, + "grad_norm": 10.276004791259766, + "learning_rate": 5.483591520587037e-06, + "loss": 2.973, + "step": 53200 + }, + { + "epoch": 3.6149612719119446, + "grad_norm": 8.0318021774292, + "learning_rate": 5.48316687049871e-06, + "loss": 2.8044, + "step": 53205 + }, + { + "epoch": 3.6153009919826062, + "grad_norm": 7.4757256507873535, + "learning_rate": 5.482742220410382e-06, + "loss": 3.1664, + "step": 53210 + }, + { + "epoch": 3.615640712053268, + "grad_norm": 9.599532127380371, + "learning_rate": 5.4823175703220555e-06, + "loss": 3.198, + "step": 53215 + }, + { + "epoch": 3.61598043212393, + "grad_norm": 6.813093662261963, + "learning_rate": 5.481892920233728e-06, + "loss": 2.9212, + "step": 53220 + }, + { + "epoch": 3.6163201521945916, + "grad_norm": 6.623867511749268, + "learning_rate": 5.4814682701454e-06, + "loss": 2.7647, + "step": 53225 + }, + { + "epoch": 3.616659872265253, + "grad_norm": 6.831400394439697, + "learning_rate": 5.481043620057074e-06, + "loss": 2.9525, + "step": 53230 + }, + { + "epoch": 3.6169995923359153, + "grad_norm": 6.765492916107178, + "learning_rate": 5.480618969968746e-06, + "loss": 2.972, + "step": 53235 + }, + { + "epoch": 3.617339312406577, + "grad_norm": 7.822057247161865, + "learning_rate": 5.480194319880419e-06, + "loss": 3.0746, + "step": 53240 + }, + { + "epoch": 3.6176790324772385, + "grad_norm": 9.243106842041016, + "learning_rate": 5.479769669792092e-06, + "loss": 2.9281, + "step": 53245 + }, + { + "epoch": 3.6180187525479006, + "grad_norm": 8.537182807922363, + "learning_rate": 5.479345019703764e-06, + "loss": 3.0022, + "step": 53250 + }, + { + "epoch": 3.6183584726185622, + "grad_norm": 9.53793716430664, + "learning_rate": 5.478920369615437e-06, + "loss": 3.0091, + "step": 53255 + }, + { + "epoch": 3.618698192689224, + "grad_norm": 6.856293678283691, + "learning_rate": 5.478495719527111e-06, + "loss": 2.9261, + "step": 53260 + }, + { + "epoch": 3.619037912759886, + "grad_norm": 7.987049102783203, + "learning_rate": 5.478071069438783e-06, + "loss": 2.9379, + "step": 53265 + }, + { + "epoch": 3.6193776328305476, + "grad_norm": 8.556400299072266, + "learning_rate": 5.4776464193504554e-06, + "loss": 2.7685, + "step": 53270 + }, + { + "epoch": 3.619717352901209, + "grad_norm": 7.0311408042907715, + "learning_rate": 5.477221769262129e-06, + "loss": 2.9083, + "step": 53275 + }, + { + "epoch": 3.6200570729718713, + "grad_norm": 7.813589572906494, + "learning_rate": 5.476797119173801e-06, + "loss": 3.0605, + "step": 53280 + }, + { + "epoch": 3.620396793042533, + "grad_norm": 7.335193634033203, + "learning_rate": 5.476372469085474e-06, + "loss": 2.8637, + "step": 53285 + }, + { + "epoch": 3.6207365131131946, + "grad_norm": 6.774892807006836, + "learning_rate": 5.4759478189971475e-06, + "loss": 2.9454, + "step": 53290 + }, + { + "epoch": 3.6210762331838566, + "grad_norm": 7.6714253425598145, + "learning_rate": 5.4755231689088194e-06, + "loss": 2.8472, + "step": 53295 + }, + { + "epoch": 3.6214159532545183, + "grad_norm": 7.898352146148682, + "learning_rate": 5.475098518820492e-06, + "loss": 3.0233, + "step": 53300 + }, + { + "epoch": 3.62175567332518, + "grad_norm": 7.912813663482666, + "learning_rate": 5.474673868732165e-06, + "loss": 3.0201, + "step": 53305 + }, + { + "epoch": 3.622095393395842, + "grad_norm": 7.596132278442383, + "learning_rate": 5.474249218643838e-06, + "loss": 3.0454, + "step": 53310 + }, + { + "epoch": 3.6224351134665036, + "grad_norm": 6.411233425140381, + "learning_rate": 5.473824568555511e-06, + "loss": 2.9226, + "step": 53315 + }, + { + "epoch": 3.6227748335371652, + "grad_norm": 10.270657539367676, + "learning_rate": 5.4733999184671835e-06, + "loss": 2.7478, + "step": 53320 + }, + { + "epoch": 3.6231145536078273, + "grad_norm": 5.871866703033447, + "learning_rate": 5.472975268378856e-06, + "loss": 2.9087, + "step": 53325 + }, + { + "epoch": 3.623454273678489, + "grad_norm": 7.958948612213135, + "learning_rate": 5.472550618290528e-06, + "loss": 3.0654, + "step": 53330 + }, + { + "epoch": 3.6237939937491506, + "grad_norm": 6.130060195922852, + "learning_rate": 5.472125968202202e-06, + "loss": 2.8889, + "step": 53335 + }, + { + "epoch": 3.6241337138198126, + "grad_norm": 7.1668877601623535, + "learning_rate": 5.471701318113875e-06, + "loss": 3.071, + "step": 53340 + }, + { + "epoch": 3.6244734338904743, + "grad_norm": 8.150585174560547, + "learning_rate": 5.471276668025547e-06, + "loss": 2.777, + "step": 53345 + }, + { + "epoch": 3.624813153961136, + "grad_norm": 7.013278007507324, + "learning_rate": 5.47085201793722e-06, + "loss": 2.7266, + "step": 53350 + }, + { + "epoch": 3.625152874031798, + "grad_norm": 6.464544773101807, + "learning_rate": 5.470427367848893e-06, + "loss": 2.8591, + "step": 53355 + }, + { + "epoch": 3.6254925941024596, + "grad_norm": 7.43189001083374, + "learning_rate": 5.470002717760565e-06, + "loss": 3.1234, + "step": 53360 + }, + { + "epoch": 3.6258323141731212, + "grad_norm": 5.69916296005249, + "learning_rate": 5.469578067672239e-06, + "loss": 2.7999, + "step": 53365 + }, + { + "epoch": 3.6261720342437833, + "grad_norm": 6.031794548034668, + "learning_rate": 5.4691534175839115e-06, + "loss": 3.0428, + "step": 53370 + }, + { + "epoch": 3.626511754314445, + "grad_norm": 6.464380741119385, + "learning_rate": 5.468728767495583e-06, + "loss": 2.7848, + "step": 53375 + }, + { + "epoch": 3.6268514743851066, + "grad_norm": 7.829343795776367, + "learning_rate": 5.468304117407257e-06, + "loss": 3.23, + "step": 53380 + }, + { + "epoch": 3.6271911944557687, + "grad_norm": 6.122811794281006, + "learning_rate": 5.46787946731893e-06, + "loss": 2.9951, + "step": 53385 + }, + { + "epoch": 3.6275309145264303, + "grad_norm": 7.598443031311035, + "learning_rate": 5.467454817230602e-06, + "loss": 2.9307, + "step": 53390 + }, + { + "epoch": 3.627870634597092, + "grad_norm": 8.06122875213623, + "learning_rate": 5.4670301671422755e-06, + "loss": 2.8376, + "step": 53395 + }, + { + "epoch": 3.628210354667754, + "grad_norm": 6.784999370574951, + "learning_rate": 5.466605517053947e-06, + "loss": 2.8048, + "step": 53400 + }, + { + "epoch": 3.6285500747384156, + "grad_norm": 8.329526901245117, + "learning_rate": 5.46618086696562e-06, + "loss": 3.0077, + "step": 53405 + }, + { + "epoch": 3.6288897948090773, + "grad_norm": 7.456105709075928, + "learning_rate": 5.465756216877294e-06, + "loss": 3.1034, + "step": 53410 + }, + { + "epoch": 3.6292295148797393, + "grad_norm": 6.415255069732666, + "learning_rate": 5.465331566788966e-06, + "loss": 2.7958, + "step": 53415 + }, + { + "epoch": 3.629569234950401, + "grad_norm": 7.021518707275391, + "learning_rate": 5.4649069167006395e-06, + "loss": 2.7423, + "step": 53420 + }, + { + "epoch": 3.6299089550210626, + "grad_norm": 6.444123268127441, + "learning_rate": 5.464482266612312e-06, + "loss": 3.034, + "step": 53425 + }, + { + "epoch": 3.6302486750917247, + "grad_norm": 8.576526641845703, + "learning_rate": 5.464057616523984e-06, + "loss": 3.074, + "step": 53430 + }, + { + "epoch": 3.6305883951623863, + "grad_norm": 7.669839382171631, + "learning_rate": 5.463632966435658e-06, + "loss": 3.011, + "step": 53435 + }, + { + "epoch": 3.630928115233048, + "grad_norm": 8.393391609191895, + "learning_rate": 5.463208316347331e-06, + "loss": 3.0507, + "step": 53440 + }, + { + "epoch": 3.63126783530371, + "grad_norm": 7.263246536254883, + "learning_rate": 5.462783666259003e-06, + "loss": 2.8456, + "step": 53445 + }, + { + "epoch": 3.6316075553743716, + "grad_norm": 5.361393928527832, + "learning_rate": 5.462359016170676e-06, + "loss": 2.9264, + "step": 53450 + }, + { + "epoch": 3.6319472754450333, + "grad_norm": 6.59988260269165, + "learning_rate": 5.461934366082349e-06, + "loss": 3.0928, + "step": 53455 + }, + { + "epoch": 3.6322869955156953, + "grad_norm": 9.552803993225098, + "learning_rate": 5.461509715994021e-06, + "loss": 2.7095, + "step": 53460 + }, + { + "epoch": 3.632626715586357, + "grad_norm": 6.79280424118042, + "learning_rate": 5.461085065905695e-06, + "loss": 2.958, + "step": 53465 + }, + { + "epoch": 3.6329664356570186, + "grad_norm": 5.887414455413818, + "learning_rate": 5.4606604158173675e-06, + "loss": 2.971, + "step": 53470 + }, + { + "epoch": 3.6333061557276802, + "grad_norm": 8.110596656799316, + "learning_rate": 5.4602357657290394e-06, + "loss": 3.0089, + "step": 53475 + }, + { + "epoch": 3.6336458757983423, + "grad_norm": 7.529082775115967, + "learning_rate": 5.459811115640713e-06, + "loss": 2.8927, + "step": 53480 + }, + { + "epoch": 3.633985595869004, + "grad_norm": 7.425765514373779, + "learning_rate": 5.459386465552385e-06, + "loss": 2.9089, + "step": 53485 + }, + { + "epoch": 3.6343253159396656, + "grad_norm": 9.295875549316406, + "learning_rate": 5.458961815464058e-06, + "loss": 2.9575, + "step": 53490 + }, + { + "epoch": 3.6346650360103276, + "grad_norm": 7.779655933380127, + "learning_rate": 5.4585371653757315e-06, + "loss": 2.9744, + "step": 53495 + }, + { + "epoch": 3.6350047560809893, + "grad_norm": 5.73015832901001, + "learning_rate": 5.4581125152874034e-06, + "loss": 3.0163, + "step": 53500 + }, + { + "epoch": 3.635344476151651, + "grad_norm": 7.307088851928711, + "learning_rate": 5.457687865199076e-06, + "loss": 2.8692, + "step": 53505 + }, + { + "epoch": 3.635684196222313, + "grad_norm": 6.461928844451904, + "learning_rate": 5.45726321511075e-06, + "loss": 2.9915, + "step": 53510 + }, + { + "epoch": 3.6360239162929746, + "grad_norm": 5.989895820617676, + "learning_rate": 5.456838565022422e-06, + "loss": 2.8925, + "step": 53515 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 7.104710578918457, + "learning_rate": 5.456413914934095e-06, + "loss": 3.1696, + "step": 53520 + }, + { + "epoch": 3.636703356434298, + "grad_norm": 9.078314781188965, + "learning_rate": 5.455989264845768e-06, + "loss": 2.7271, + "step": 53525 + }, + { + "epoch": 3.63704307650496, + "grad_norm": 7.329996585845947, + "learning_rate": 5.45556461475744e-06, + "loss": 3.2757, + "step": 53530 + }, + { + "epoch": 3.6373827965756216, + "grad_norm": 6.026089191436768, + "learning_rate": 5.455139964669113e-06, + "loss": 3.1081, + "step": 53535 + }, + { + "epoch": 3.637722516646283, + "grad_norm": 9.404988288879395, + "learning_rate": 5.454715314580787e-06, + "loss": 3.0798, + "step": 53540 + }, + { + "epoch": 3.6380622367169453, + "grad_norm": 6.192330837249756, + "learning_rate": 5.454290664492459e-06, + "loss": 2.9958, + "step": 53545 + }, + { + "epoch": 3.638401956787607, + "grad_norm": 7.736417293548584, + "learning_rate": 5.4538660144041314e-06, + "loss": 3.0744, + "step": 53550 + }, + { + "epoch": 3.6387416768582685, + "grad_norm": 7.069762229919434, + "learning_rate": 5.453441364315804e-06, + "loss": 3.2309, + "step": 53555 + }, + { + "epoch": 3.6390813969289306, + "grad_norm": 8.27016830444336, + "learning_rate": 5.453016714227477e-06, + "loss": 2.8294, + "step": 53560 + }, + { + "epoch": 3.6394211169995923, + "grad_norm": 8.161609649658203, + "learning_rate": 5.45259206413915e-06, + "loss": 2.9687, + "step": 53565 + }, + { + "epoch": 3.639760837070254, + "grad_norm": 7.540998458862305, + "learning_rate": 5.452167414050823e-06, + "loss": 2.9193, + "step": 53570 + }, + { + "epoch": 3.640100557140916, + "grad_norm": 8.332438468933105, + "learning_rate": 5.4517427639624954e-06, + "loss": 3.1394, + "step": 53575 + }, + { + "epoch": 3.6404402772115776, + "grad_norm": 6.41091251373291, + "learning_rate": 5.451318113874167e-06, + "loss": 2.9066, + "step": 53580 + }, + { + "epoch": 3.6407799972822392, + "grad_norm": 6.511992454528809, + "learning_rate": 5.450893463785841e-06, + "loss": 2.7469, + "step": 53585 + }, + { + "epoch": 3.6411197173529013, + "grad_norm": 7.135504245758057, + "learning_rate": 5.450468813697514e-06, + "loss": 3.1733, + "step": 53590 + }, + { + "epoch": 3.641459437423563, + "grad_norm": 8.785316467285156, + "learning_rate": 5.450044163609186e-06, + "loss": 2.9028, + "step": 53595 + }, + { + "epoch": 3.6417991574942246, + "grad_norm": 8.75658893585205, + "learning_rate": 5.4496195135208594e-06, + "loss": 3.039, + "step": 53600 + }, + { + "epoch": 3.6421388775648866, + "grad_norm": 6.686179161071777, + "learning_rate": 5.449194863432532e-06, + "loss": 3.022, + "step": 53605 + }, + { + "epoch": 3.6424785976355483, + "grad_norm": 7.00740385055542, + "learning_rate": 5.448770213344204e-06, + "loss": 2.95, + "step": 53610 + }, + { + "epoch": 3.64281831770621, + "grad_norm": 8.24589729309082, + "learning_rate": 5.448345563255878e-06, + "loss": 3.1174, + "step": 53615 + }, + { + "epoch": 3.643158037776872, + "grad_norm": 7.383034706115723, + "learning_rate": 5.447920913167551e-06, + "loss": 2.8863, + "step": 53620 + }, + { + "epoch": 3.6434977578475336, + "grad_norm": 6.48681116104126, + "learning_rate": 5.447496263079223e-06, + "loss": 3.0843, + "step": 53625 + }, + { + "epoch": 3.6438374779181952, + "grad_norm": 7.700616836547852, + "learning_rate": 5.447071612990896e-06, + "loss": 3.2335, + "step": 53630 + }, + { + "epoch": 3.6441771979888573, + "grad_norm": 7.721787452697754, + "learning_rate": 5.446646962902569e-06, + "loss": 2.8383, + "step": 53635 + }, + { + "epoch": 3.644516918059519, + "grad_norm": 7.960993766784668, + "learning_rate": 5.446222312814241e-06, + "loss": 3.1435, + "step": 53640 + }, + { + "epoch": 3.6448566381301806, + "grad_norm": 6.975861549377441, + "learning_rate": 5.445797662725915e-06, + "loss": 2.7262, + "step": 53645 + }, + { + "epoch": 3.6451963582008426, + "grad_norm": 6.646708011627197, + "learning_rate": 5.445373012637587e-06, + "loss": 2.968, + "step": 53650 + }, + { + "epoch": 3.6455360782715043, + "grad_norm": 8.125754356384277, + "learning_rate": 5.444948362549259e-06, + "loss": 2.9594, + "step": 53655 + }, + { + "epoch": 3.645875798342166, + "grad_norm": 8.279936790466309, + "learning_rate": 5.444523712460933e-06, + "loss": 3.0216, + "step": 53660 + }, + { + "epoch": 3.646215518412828, + "grad_norm": 5.787612438201904, + "learning_rate": 5.444099062372605e-06, + "loss": 3.0144, + "step": 53665 + }, + { + "epoch": 3.6465552384834896, + "grad_norm": 7.528153896331787, + "learning_rate": 5.443674412284278e-06, + "loss": 3.0089, + "step": 53670 + }, + { + "epoch": 3.6468949585541512, + "grad_norm": 8.593923568725586, + "learning_rate": 5.4432497621959515e-06, + "loss": 2.9654, + "step": 53675 + }, + { + "epoch": 3.6472346786248133, + "grad_norm": 6.90634822845459, + "learning_rate": 5.442825112107623e-06, + "loss": 2.819, + "step": 53680 + }, + { + "epoch": 3.647574398695475, + "grad_norm": 6.353360176086426, + "learning_rate": 5.442400462019296e-06, + "loss": 3.0483, + "step": 53685 + }, + { + "epoch": 3.6479141187661366, + "grad_norm": 6.6861772537231445, + "learning_rate": 5.44197581193097e-06, + "loss": 3.0063, + "step": 53690 + }, + { + "epoch": 3.6482538388367987, + "grad_norm": 7.079182147979736, + "learning_rate": 5.441551161842642e-06, + "loss": 2.9857, + "step": 53695 + }, + { + "epoch": 3.6485935589074603, + "grad_norm": 6.680636405944824, + "learning_rate": 5.441126511754315e-06, + "loss": 2.9173, + "step": 53700 + }, + { + "epoch": 3.648933278978122, + "grad_norm": 10.486820220947266, + "learning_rate": 5.440701861665988e-06, + "loss": 3.0225, + "step": 53705 + }, + { + "epoch": 3.649272999048784, + "grad_norm": 7.515833377838135, + "learning_rate": 5.44027721157766e-06, + "loss": 3.0761, + "step": 53710 + }, + { + "epoch": 3.6496127191194456, + "grad_norm": 6.336942672729492, + "learning_rate": 5.439852561489333e-06, + "loss": 3.1617, + "step": 53715 + }, + { + "epoch": 3.6499524391901073, + "grad_norm": 7.51331090927124, + "learning_rate": 5.439427911401006e-06, + "loss": 3.0444, + "step": 53720 + }, + { + "epoch": 3.6502921592607693, + "grad_norm": 7.284501075744629, + "learning_rate": 5.439003261312679e-06, + "loss": 2.858, + "step": 53725 + }, + { + "epoch": 3.650631879331431, + "grad_norm": 9.394169807434082, + "learning_rate": 5.438578611224351e-06, + "loss": 3.0614, + "step": 53730 + }, + { + "epoch": 3.6509715994020926, + "grad_norm": 7.557094097137451, + "learning_rate": 5.438153961136024e-06, + "loss": 2.887, + "step": 53735 + }, + { + "epoch": 3.6513113194727547, + "grad_norm": 7.346457004547119, + "learning_rate": 5.437729311047697e-06, + "loss": 2.7682, + "step": 53740 + }, + { + "epoch": 3.6516510395434163, + "grad_norm": 8.1331787109375, + "learning_rate": 5.437304660959369e-06, + "loss": 2.6974, + "step": 53745 + }, + { + "epoch": 3.651990759614078, + "grad_norm": 5.890145778656006, + "learning_rate": 5.436880010871043e-06, + "loss": 3.0101, + "step": 53750 + }, + { + "epoch": 3.65233047968474, + "grad_norm": 6.1101298332214355, + "learning_rate": 5.4364553607827154e-06, + "loss": 3.0818, + "step": 53755 + }, + { + "epoch": 3.6526701997554016, + "grad_norm": 6.901745319366455, + "learning_rate": 5.436030710694389e-06, + "loss": 2.7909, + "step": 53760 + }, + { + "epoch": 3.6530099198260633, + "grad_norm": 7.3889570236206055, + "learning_rate": 5.435606060606061e-06, + "loss": 2.9159, + "step": 53765 + }, + { + "epoch": 3.6533496398967253, + "grad_norm": 7.340576171875, + "learning_rate": 5.435181410517734e-06, + "loss": 2.7372, + "step": 53770 + }, + { + "epoch": 3.653689359967387, + "grad_norm": 6.372344493865967, + "learning_rate": 5.4347567604294075e-06, + "loss": 3.0175, + "step": 53775 + }, + { + "epoch": 3.6540290800380486, + "grad_norm": 7.4589643478393555, + "learning_rate": 5.4343321103410794e-06, + "loss": 2.9589, + "step": 53780 + }, + { + "epoch": 3.6543688001087107, + "grad_norm": 7.191547870635986, + "learning_rate": 5.433907460252752e-06, + "loss": 2.9725, + "step": 53785 + }, + { + "epoch": 3.6547085201793723, + "grad_norm": 8.370956420898438, + "learning_rate": 5.433482810164426e-06, + "loss": 2.9896, + "step": 53790 + }, + { + "epoch": 3.655048240250034, + "grad_norm": 6.924493789672852, + "learning_rate": 5.433058160076098e-06, + "loss": 2.9416, + "step": 53795 + }, + { + "epoch": 3.655387960320696, + "grad_norm": 6.370575428009033, + "learning_rate": 5.432633509987771e-06, + "loss": 2.8713, + "step": 53800 + }, + { + "epoch": 3.6557276803913576, + "grad_norm": 8.761923789978027, + "learning_rate": 5.4322088598994434e-06, + "loss": 3.1187, + "step": 53805 + }, + { + "epoch": 3.6560674004620193, + "grad_norm": 7.300205230712891, + "learning_rate": 5.431784209811116e-06, + "loss": 2.8822, + "step": 53810 + }, + { + "epoch": 3.656407120532681, + "grad_norm": 6.264296531677246, + "learning_rate": 5.431359559722788e-06, + "loss": 3.1141, + "step": 53815 + }, + { + "epoch": 3.656746840603343, + "grad_norm": 7.109840393066406, + "learning_rate": 5.430934909634462e-06, + "loss": 3.0851, + "step": 53820 + }, + { + "epoch": 3.6570865606740046, + "grad_norm": 8.088141441345215, + "learning_rate": 5.430510259546135e-06, + "loss": 2.9816, + "step": 53825 + }, + { + "epoch": 3.6574262807446662, + "grad_norm": 6.945579528808594, + "learning_rate": 5.430085609457807e-06, + "loss": 2.7698, + "step": 53830 + }, + { + "epoch": 3.6577660008153283, + "grad_norm": 7.620445728302002, + "learning_rate": 5.42966095936948e-06, + "loss": 2.5067, + "step": 53835 + }, + { + "epoch": 3.65810572088599, + "grad_norm": 9.769613265991211, + "learning_rate": 5.429236309281153e-06, + "loss": 2.723, + "step": 53840 + }, + { + "epoch": 3.6584454409566516, + "grad_norm": 10.559006690979004, + "learning_rate": 5.428811659192825e-06, + "loss": 3.0006, + "step": 53845 + }, + { + "epoch": 3.6587851610273137, + "grad_norm": 6.526966094970703, + "learning_rate": 5.428387009104499e-06, + "loss": 2.9669, + "step": 53850 + }, + { + "epoch": 3.6591248810979753, + "grad_norm": 8.35496711730957, + "learning_rate": 5.4279623590161714e-06, + "loss": 2.9594, + "step": 53855 + }, + { + "epoch": 3.659464601168637, + "grad_norm": 7.427022457122803, + "learning_rate": 5.427537708927843e-06, + "loss": 2.9622, + "step": 53860 + }, + { + "epoch": 3.6598043212392986, + "grad_norm": 6.479942798614502, + "learning_rate": 5.427113058839517e-06, + "loss": 2.6871, + "step": 53865 + }, + { + "epoch": 3.6601440413099606, + "grad_norm": 6.408888339996338, + "learning_rate": 5.42668840875119e-06, + "loss": 2.888, + "step": 53870 + }, + { + "epoch": 3.6604837613806223, + "grad_norm": 6.295713424682617, + "learning_rate": 5.426263758662862e-06, + "loss": 3.0039, + "step": 53875 + }, + { + "epoch": 3.660823481451284, + "grad_norm": 5.531013488769531, + "learning_rate": 5.4258391085745354e-06, + "loss": 2.8393, + "step": 53880 + }, + { + "epoch": 3.661163201521946, + "grad_norm": 6.174610137939453, + "learning_rate": 5.4254993885038735e-06, + "loss": 2.6667, + "step": 53885 + }, + { + "epoch": 3.6615029215926076, + "grad_norm": 6.120554447174072, + "learning_rate": 5.4250747384155455e-06, + "loss": 2.885, + "step": 53890 + }, + { + "epoch": 3.6618426416632692, + "grad_norm": 7.1718268394470215, + "learning_rate": 5.424650088327219e-06, + "loss": 3.0898, + "step": 53895 + }, + { + "epoch": 3.6621823617339313, + "grad_norm": 7.294985294342041, + "learning_rate": 5.424225438238891e-06, + "loss": 2.9095, + "step": 53900 + }, + { + "epoch": 3.662522081804593, + "grad_norm": 5.908066749572754, + "learning_rate": 5.423800788150564e-06, + "loss": 2.9886, + "step": 53905 + }, + { + "epoch": 3.6628618018752546, + "grad_norm": 6.800163745880127, + "learning_rate": 5.4233761380622375e-06, + "loss": 3.0549, + "step": 53910 + }, + { + "epoch": 3.6632015219459166, + "grad_norm": 8.594569206237793, + "learning_rate": 5.4229514879739095e-06, + "loss": 2.7965, + "step": 53915 + }, + { + "epoch": 3.6635412420165783, + "grad_norm": 7.107891082763672, + "learning_rate": 5.422526837885582e-06, + "loss": 3.0032, + "step": 53920 + }, + { + "epoch": 3.66388096208724, + "grad_norm": 7.380228519439697, + "learning_rate": 5.422102187797256e-06, + "loss": 3.0757, + "step": 53925 + }, + { + "epoch": 3.664220682157902, + "grad_norm": 6.666867256164551, + "learning_rate": 5.421677537708928e-06, + "loss": 3.1283, + "step": 53930 + }, + { + "epoch": 3.6645604022285636, + "grad_norm": 6.951168060302734, + "learning_rate": 5.421252887620601e-06, + "loss": 2.9796, + "step": 53935 + }, + { + "epoch": 3.6649001222992252, + "grad_norm": 6.088851451873779, + "learning_rate": 5.420828237532274e-06, + "loss": 2.9217, + "step": 53940 + }, + { + "epoch": 3.6652398423698873, + "grad_norm": 7.333390712738037, + "learning_rate": 5.420403587443946e-06, + "loss": 2.7765, + "step": 53945 + }, + { + "epoch": 3.665579562440549, + "grad_norm": 8.034505844116211, + "learning_rate": 5.419978937355619e-06, + "loss": 2.8745, + "step": 53950 + }, + { + "epoch": 3.6659192825112106, + "grad_norm": 9.142935752868652, + "learning_rate": 5.419554287267293e-06, + "loss": 3.2099, + "step": 53955 + }, + { + "epoch": 3.6662590025818727, + "grad_norm": 7.1833720207214355, + "learning_rate": 5.419129637178965e-06, + "loss": 2.9704, + "step": 53960 + }, + { + "epoch": 3.6665987226525343, + "grad_norm": 7.252254486083984, + "learning_rate": 5.418704987090638e-06, + "loss": 3.032, + "step": 53965 + }, + { + "epoch": 3.666938442723196, + "grad_norm": 9.300618171691895, + "learning_rate": 5.418280337002311e-06, + "loss": 3.0251, + "step": 53970 + }, + { + "epoch": 3.667278162793858, + "grad_norm": 7.5354485511779785, + "learning_rate": 5.417855686913983e-06, + "loss": 3.3317, + "step": 53975 + }, + { + "epoch": 3.6676178828645196, + "grad_norm": 5.464232444763184, + "learning_rate": 5.417431036825657e-06, + "loss": 2.9966, + "step": 53980 + }, + { + "epoch": 3.6679576029351812, + "grad_norm": 7.299261569976807, + "learning_rate": 5.417006386737329e-06, + "loss": 2.966, + "step": 53985 + }, + { + "epoch": 3.6682973230058433, + "grad_norm": 7.446243762969971, + "learning_rate": 5.4165817366490015e-06, + "loss": 2.6572, + "step": 53990 + }, + { + "epoch": 3.668637043076505, + "grad_norm": 8.562141418457031, + "learning_rate": 5.416157086560675e-06, + "loss": 3.008, + "step": 53995 + }, + { + "epoch": 3.6689767631471666, + "grad_norm": 8.521077156066895, + "learning_rate": 5.415732436472347e-06, + "loss": 3.0466, + "step": 54000 + }, + { + "epoch": 3.6693164832178287, + "grad_norm": 7.084395885467529, + "learning_rate": 5.41530778638402e-06, + "loss": 3.0503, + "step": 54005 + }, + { + "epoch": 3.6696562032884903, + "grad_norm": 5.433046340942383, + "learning_rate": 5.4148831362956935e-06, + "loss": 2.8627, + "step": 54010 + }, + { + "epoch": 3.669995923359152, + "grad_norm": 6.126475811004639, + "learning_rate": 5.4144584862073655e-06, + "loss": 2.9282, + "step": 54015 + }, + { + "epoch": 3.670335643429814, + "grad_norm": 6.2231221199035645, + "learning_rate": 5.414033836119038e-06, + "loss": 2.9446, + "step": 54020 + }, + { + "epoch": 3.6706753635004756, + "grad_norm": 7.659963607788086, + "learning_rate": 5.413609186030712e-06, + "loss": 3.0301, + "step": 54025 + }, + { + "epoch": 3.6710150835711373, + "grad_norm": 7.183284759521484, + "learning_rate": 5.413184535942384e-06, + "loss": 3.0042, + "step": 54030 + }, + { + "epoch": 3.6713548036417993, + "grad_norm": 6.523367881774902, + "learning_rate": 5.412759885854057e-06, + "loss": 2.9572, + "step": 54035 + }, + { + "epoch": 3.671694523712461, + "grad_norm": 6.790130615234375, + "learning_rate": 5.41233523576573e-06, + "loss": 3.2329, + "step": 54040 + }, + { + "epoch": 3.6720342437831226, + "grad_norm": 7.4340362548828125, + "learning_rate": 5.411910585677402e-06, + "loss": 3.0246, + "step": 54045 + }, + { + "epoch": 3.6723739638537847, + "grad_norm": 6.371457099914551, + "learning_rate": 5.411485935589075e-06, + "loss": 3.0315, + "step": 54050 + }, + { + "epoch": 3.6727136839244463, + "grad_norm": 8.011350631713867, + "learning_rate": 5.411061285500748e-06, + "loss": 3.0209, + "step": 54055 + }, + { + "epoch": 3.673053403995108, + "grad_norm": 7.2852253913879395, + "learning_rate": 5.410636635412421e-06, + "loss": 3.1195, + "step": 54060 + }, + { + "epoch": 3.67339312406577, + "grad_norm": 6.60341215133667, + "learning_rate": 5.4102119853240935e-06, + "loss": 2.8683, + "step": 54065 + }, + { + "epoch": 3.6737328441364316, + "grad_norm": 8.086774826049805, + "learning_rate": 5.409787335235766e-06, + "loss": 3.0154, + "step": 54070 + }, + { + "epoch": 3.6740725642070933, + "grad_norm": 6.603668212890625, + "learning_rate": 5.409362685147439e-06, + "loss": 3.2545, + "step": 54075 + }, + { + "epoch": 3.6744122842777553, + "grad_norm": 6.727849006652832, + "learning_rate": 5.408938035059111e-06, + "loss": 3.1774, + "step": 54080 + }, + { + "epoch": 3.674752004348417, + "grad_norm": 8.026338577270508, + "learning_rate": 5.408513384970785e-06, + "loss": 3.1446, + "step": 54085 + }, + { + "epoch": 3.6750917244190786, + "grad_norm": 5.8958306312561035, + "learning_rate": 5.4080887348824575e-06, + "loss": 2.9387, + "step": 54090 + }, + { + "epoch": 3.6754314444897407, + "grad_norm": 6.4256744384765625, + "learning_rate": 5.4076640847941295e-06, + "loss": 2.5709, + "step": 54095 + }, + { + "epoch": 3.6757711645604023, + "grad_norm": 6.808287143707275, + "learning_rate": 5.407239434705803e-06, + "loss": 3.0523, + "step": 54100 + }, + { + "epoch": 3.676110884631064, + "grad_norm": 7.450641632080078, + "learning_rate": 5.406814784617476e-06, + "loss": 2.8829, + "step": 54105 + }, + { + "epoch": 3.676450604701726, + "grad_norm": 8.175773620605469, + "learning_rate": 5.406390134529148e-06, + "loss": 2.9386, + "step": 54110 + }, + { + "epoch": 3.6767903247723877, + "grad_norm": 7.52345609664917, + "learning_rate": 5.4059654844408215e-06, + "loss": 2.9786, + "step": 54115 + }, + { + "epoch": 3.6771300448430493, + "grad_norm": 7.729432106018066, + "learning_rate": 5.405540834352494e-06, + "loss": 2.9658, + "step": 54120 + }, + { + "epoch": 3.6774697649137114, + "grad_norm": 6.560648441314697, + "learning_rate": 5.405116184264166e-06, + "loss": 2.8754, + "step": 54125 + }, + { + "epoch": 3.677809484984373, + "grad_norm": 5.908719539642334, + "learning_rate": 5.40469153417584e-06, + "loss": 2.7926, + "step": 54130 + }, + { + "epoch": 3.6781492050550346, + "grad_norm": 6.963536739349365, + "learning_rate": 5.404266884087513e-06, + "loss": 3.1119, + "step": 54135 + }, + { + "epoch": 3.6784889251256967, + "grad_norm": 8.322005271911621, + "learning_rate": 5.403842233999185e-06, + "loss": 3.1153, + "step": 54140 + }, + { + "epoch": 3.6788286451963583, + "grad_norm": 7.327701568603516, + "learning_rate": 5.403417583910858e-06, + "loss": 3.0191, + "step": 54145 + }, + { + "epoch": 3.67916836526702, + "grad_norm": 6.255403518676758, + "learning_rate": 5.40299293382253e-06, + "loss": 3.01, + "step": 54150 + }, + { + "epoch": 3.6795080853376816, + "grad_norm": 8.081936836242676, + "learning_rate": 5.402568283734203e-06, + "loss": 2.9506, + "step": 54155 + }, + { + "epoch": 3.6798478054083437, + "grad_norm": 6.992847919464111, + "learning_rate": 5.402143633645877e-06, + "loss": 3.0335, + "step": 54160 + }, + { + "epoch": 3.6801875254790053, + "grad_norm": 8.707926750183105, + "learning_rate": 5.401718983557549e-06, + "loss": 3.0639, + "step": 54165 + }, + { + "epoch": 3.680527245549667, + "grad_norm": 8.47619915008545, + "learning_rate": 5.4012943334692215e-06, + "loss": 2.6827, + "step": 54170 + }, + { + "epoch": 3.680866965620329, + "grad_norm": 7.472257137298584, + "learning_rate": 5.400869683380895e-06, + "loss": 3.2264, + "step": 54175 + }, + { + "epoch": 3.6812066856909906, + "grad_norm": 6.196707725524902, + "learning_rate": 5.400445033292567e-06, + "loss": 2.8228, + "step": 54180 + }, + { + "epoch": 3.6815464057616523, + "grad_norm": 7.46325159072876, + "learning_rate": 5.40002038320424e-06, + "loss": 2.9686, + "step": 54185 + }, + { + "epoch": 3.6818861258323143, + "grad_norm": 6.493176460266113, + "learning_rate": 5.3995957331159135e-06, + "loss": 2.872, + "step": 54190 + }, + { + "epoch": 3.682225845902976, + "grad_norm": 5.894530296325684, + "learning_rate": 5.3991710830275855e-06, + "loss": 2.7855, + "step": 54195 + }, + { + "epoch": 3.6825655659736376, + "grad_norm": 7.309478759765625, + "learning_rate": 5.398746432939258e-06, + "loss": 3.0337, + "step": 54200 + }, + { + "epoch": 3.6829052860442992, + "grad_norm": 7.534619331359863, + "learning_rate": 5.398321782850932e-06, + "loss": 2.8936, + "step": 54205 + }, + { + "epoch": 3.6832450061149613, + "grad_norm": 8.54553508758545, + "learning_rate": 5.397897132762604e-06, + "loss": 3.0471, + "step": 54210 + }, + { + "epoch": 3.683584726185623, + "grad_norm": 6.507936954498291, + "learning_rate": 5.397472482674277e-06, + "loss": 2.9732, + "step": 54215 + }, + { + "epoch": 3.6839244462562846, + "grad_norm": 6.3906426429748535, + "learning_rate": 5.3970478325859495e-06, + "loss": 2.8985, + "step": 54220 + }, + { + "epoch": 3.6842641663269466, + "grad_norm": 8.392302513122559, + "learning_rate": 5.396623182497622e-06, + "loss": 2.8432, + "step": 54225 + }, + { + "epoch": 3.6846038863976083, + "grad_norm": 7.175116539001465, + "learning_rate": 5.396198532409295e-06, + "loss": 3.0514, + "step": 54230 + }, + { + "epoch": 3.68494360646827, + "grad_norm": 6.041513442993164, + "learning_rate": 5.395773882320968e-06, + "loss": 2.9902, + "step": 54235 + }, + { + "epoch": 3.685283326538932, + "grad_norm": 6.876642227172852, + "learning_rate": 5.395349232232641e-06, + "loss": 2.5941, + "step": 54240 + }, + { + "epoch": 3.6856230466095936, + "grad_norm": 8.246308326721191, + "learning_rate": 5.394924582144313e-06, + "loss": 3.0698, + "step": 54245 + }, + { + "epoch": 3.6859627666802552, + "grad_norm": 7.565339088439941, + "learning_rate": 5.394499932055986e-06, + "loss": 2.9917, + "step": 54250 + }, + { + "epoch": 3.6863024867509173, + "grad_norm": 8.709266662597656, + "learning_rate": 5.394075281967659e-06, + "loss": 3.1776, + "step": 54255 + }, + { + "epoch": 3.686642206821579, + "grad_norm": 5.722690105438232, + "learning_rate": 5.393650631879331e-06, + "loss": 2.7183, + "step": 54260 + }, + { + "epoch": 3.6869819268922406, + "grad_norm": 5.183568954467773, + "learning_rate": 5.393225981791005e-06, + "loss": 3.1613, + "step": 54265 + }, + { + "epoch": 3.6873216469629027, + "grad_norm": 6.497524261474609, + "learning_rate": 5.3928013317026775e-06, + "loss": 3.026, + "step": 54270 + }, + { + "epoch": 3.6876613670335643, + "grad_norm": 6.626051425933838, + "learning_rate": 5.3923766816143494e-06, + "loss": 2.9005, + "step": 54275 + }, + { + "epoch": 3.688001087104226, + "grad_norm": 6.223028182983398, + "learning_rate": 5.391952031526023e-06, + "loss": 3.1281, + "step": 54280 + }, + { + "epoch": 3.688340807174888, + "grad_norm": 6.7321953773498535, + "learning_rate": 5.391527381437696e-06, + "loss": 2.9383, + "step": 54285 + }, + { + "epoch": 3.6886805272455496, + "grad_norm": 5.717567443847656, + "learning_rate": 5.391102731349368e-06, + "loss": 2.9574, + "step": 54290 + }, + { + "epoch": 3.6890202473162113, + "grad_norm": 5.942893981933594, + "learning_rate": 5.3906780812610415e-06, + "loss": 2.9791, + "step": 54295 + }, + { + "epoch": 3.6893599673868733, + "grad_norm": 8.157812118530273, + "learning_rate": 5.390253431172714e-06, + "loss": 2.997, + "step": 54300 + }, + { + "epoch": 3.689699687457535, + "grad_norm": 8.913674354553223, + "learning_rate": 5.389828781084387e-06, + "loss": 3.0292, + "step": 54305 + }, + { + "epoch": 3.6900394075281966, + "grad_norm": 6.876455783843994, + "learning_rate": 5.38940413099606e-06, + "loss": 2.672, + "step": 54310 + }, + { + "epoch": 3.6903791275988587, + "grad_norm": 6.6851372718811035, + "learning_rate": 5.388979480907732e-06, + "loss": 3.0909, + "step": 54315 + }, + { + "epoch": 3.6907188476695203, + "grad_norm": 6.115947723388672, + "learning_rate": 5.3885548308194055e-06, + "loss": 3.2132, + "step": 54320 + }, + { + "epoch": 3.691058567740182, + "grad_norm": 6.6177287101745605, + "learning_rate": 5.388130180731078e-06, + "loss": 2.8913, + "step": 54325 + }, + { + "epoch": 3.691398287810844, + "grad_norm": 5.713276386260986, + "learning_rate": 5.38770553064275e-06, + "loss": 2.7685, + "step": 54330 + }, + { + "epoch": 3.6917380078815056, + "grad_norm": 7.793349742889404, + "learning_rate": 5.387280880554424e-06, + "loss": 3.0663, + "step": 54335 + }, + { + "epoch": 3.6920777279521673, + "grad_norm": 6.863578796386719, + "learning_rate": 5.386856230466097e-06, + "loss": 2.7655, + "step": 54340 + }, + { + "epoch": 3.6924174480228293, + "grad_norm": 6.828675270080566, + "learning_rate": 5.386431580377769e-06, + "loss": 2.9037, + "step": 54345 + }, + { + "epoch": 3.692757168093491, + "grad_norm": 6.009841442108154, + "learning_rate": 5.386006930289442e-06, + "loss": 3.1192, + "step": 54350 + }, + { + "epoch": 3.6930968881641526, + "grad_norm": 6.929678916931152, + "learning_rate": 5.385582280201115e-06, + "loss": 3.1006, + "step": 54355 + }, + { + "epoch": 3.6934366082348147, + "grad_norm": 6.179379940032959, + "learning_rate": 5.385157630112787e-06, + "loss": 2.9776, + "step": 54360 + }, + { + "epoch": 3.6937763283054763, + "grad_norm": 5.643165111541748, + "learning_rate": 5.384732980024461e-06, + "loss": 2.8316, + "step": 54365 + }, + { + "epoch": 3.694116048376138, + "grad_norm": 6.499878406524658, + "learning_rate": 5.3843083299361335e-06, + "loss": 2.9234, + "step": 54370 + }, + { + "epoch": 3.6944557684468, + "grad_norm": 8.73599624633789, + "learning_rate": 5.3838836798478055e-06, + "loss": 3.02, + "step": 54375 + }, + { + "epoch": 3.6947954885174616, + "grad_norm": 7.026976585388184, + "learning_rate": 5.383459029759479e-06, + "loss": 2.7243, + "step": 54380 + }, + { + "epoch": 3.6951352085881233, + "grad_norm": 8.567180633544922, + "learning_rate": 5.383034379671152e-06, + "loss": 2.9123, + "step": 54385 + }, + { + "epoch": 3.6954749286587854, + "grad_norm": 8.506650924682617, + "learning_rate": 5.382609729582824e-06, + "loss": 2.935, + "step": 54390 + }, + { + "epoch": 3.695814648729447, + "grad_norm": 6.934621334075928, + "learning_rate": 5.3821850794944975e-06, + "loss": 2.7983, + "step": 54395 + }, + { + "epoch": 3.6961543688001086, + "grad_norm": 9.148905754089355, + "learning_rate": 5.3817604294061695e-06, + "loss": 2.7809, + "step": 54400 + }, + { + "epoch": 3.6964940888707707, + "grad_norm": 7.204852104187012, + "learning_rate": 5.381335779317842e-06, + "loss": 3.1011, + "step": 54405 + }, + { + "epoch": 3.6968338089414323, + "grad_norm": 6.396143913269043, + "learning_rate": 5.380911129229516e-06, + "loss": 2.9245, + "step": 54410 + }, + { + "epoch": 3.697173529012094, + "grad_norm": 7.336819171905518, + "learning_rate": 5.380486479141188e-06, + "loss": 2.8947, + "step": 54415 + }, + { + "epoch": 3.697513249082756, + "grad_norm": 8.278249740600586, + "learning_rate": 5.380061829052861e-06, + "loss": 3.0183, + "step": 54420 + }, + { + "epoch": 3.6978529691534177, + "grad_norm": 5.681805610656738, + "learning_rate": 5.379637178964534e-06, + "loss": 2.8042, + "step": 54425 + }, + { + "epoch": 3.6981926892240793, + "grad_norm": 7.015410900115967, + "learning_rate": 5.379212528876206e-06, + "loss": 2.8941, + "step": 54430 + }, + { + "epoch": 3.6985324092947414, + "grad_norm": 7.254770755767822, + "learning_rate": 5.378787878787879e-06, + "loss": 2.8575, + "step": 54435 + }, + { + "epoch": 3.698872129365403, + "grad_norm": 7.958259105682373, + "learning_rate": 5.378363228699553e-06, + "loss": 2.936, + "step": 54440 + }, + { + "epoch": 3.6992118494360646, + "grad_norm": 7.428812503814697, + "learning_rate": 5.377938578611225e-06, + "loss": 3.0878, + "step": 54445 + }, + { + "epoch": 3.6995515695067267, + "grad_norm": 7.159594535827637, + "learning_rate": 5.3775139285228975e-06, + "loss": 2.8043, + "step": 54450 + }, + { + "epoch": 3.6998912895773883, + "grad_norm": 9.020659446716309, + "learning_rate": 5.377089278434571e-06, + "loss": 3.1467, + "step": 54455 + }, + { + "epoch": 3.70023100964805, + "grad_norm": 7.101630210876465, + "learning_rate": 5.376664628346243e-06, + "loss": 2.8891, + "step": 54460 + }, + { + "epoch": 3.700570729718712, + "grad_norm": 7.834315299987793, + "learning_rate": 5.376239978257916e-06, + "loss": 3.0194, + "step": 54465 + }, + { + "epoch": 3.7009104497893737, + "grad_norm": 5.920685291290283, + "learning_rate": 5.375815328169589e-06, + "loss": 3.017, + "step": 54470 + }, + { + "epoch": 3.7012501698600353, + "grad_norm": 5.6169633865356445, + "learning_rate": 5.3753906780812615e-06, + "loss": 2.9387, + "step": 54475 + }, + { + "epoch": 3.7015898899306974, + "grad_norm": 8.094407081604004, + "learning_rate": 5.374966027992934e-06, + "loss": 2.9913, + "step": 54480 + }, + { + "epoch": 3.701929610001359, + "grad_norm": 5.862379550933838, + "learning_rate": 5.374541377904607e-06, + "loss": 3.0368, + "step": 54485 + }, + { + "epoch": 3.7022693300720206, + "grad_norm": 7.130770206451416, + "learning_rate": 5.37411672781628e-06, + "loss": 3.008, + "step": 54490 + }, + { + "epoch": 3.7026090501426823, + "grad_norm": 6.767646789550781, + "learning_rate": 5.373692077727952e-06, + "loss": 2.9178, + "step": 54495 + }, + { + "epoch": 3.7029487702133443, + "grad_norm": 5.908061504364014, + "learning_rate": 5.3732674276396255e-06, + "loss": 2.9273, + "step": 54500 + }, + { + "epoch": 3.703288490284006, + "grad_norm": 7.105865001678467, + "learning_rate": 5.372842777551298e-06, + "loss": 2.8448, + "step": 54505 + }, + { + "epoch": 3.7036282103546676, + "grad_norm": 8.558389663696289, + "learning_rate": 5.37241812746297e-06, + "loss": 3.0729, + "step": 54510 + }, + { + "epoch": 3.7039679304253297, + "grad_norm": 6.442228317260742, + "learning_rate": 5.371993477374644e-06, + "loss": 2.6607, + "step": 54515 + }, + { + "epoch": 3.7043076504959913, + "grad_norm": 8.569395065307617, + "learning_rate": 5.371568827286317e-06, + "loss": 2.8675, + "step": 54520 + }, + { + "epoch": 3.704647370566653, + "grad_norm": 6.433107376098633, + "learning_rate": 5.371144177197989e-06, + "loss": 3.0882, + "step": 54525 + }, + { + "epoch": 3.704987090637315, + "grad_norm": 6.305704116821289, + "learning_rate": 5.370719527109662e-06, + "loss": 2.9629, + "step": 54530 + }, + { + "epoch": 3.7053268107079766, + "grad_norm": 6.9746198654174805, + "learning_rate": 5.370294877021335e-06, + "loss": 2.9637, + "step": 54535 + }, + { + "epoch": 3.7056665307786383, + "grad_norm": 6.574917316436768, + "learning_rate": 5.369870226933007e-06, + "loss": 2.9243, + "step": 54540 + }, + { + "epoch": 3.7060062508493, + "grad_norm": 6.303855895996094, + "learning_rate": 5.369445576844681e-06, + "loss": 3.0041, + "step": 54545 + }, + { + "epoch": 3.706345970919962, + "grad_norm": 8.545530319213867, + "learning_rate": 5.3690209267563535e-06, + "loss": 3.0304, + "step": 54550 + }, + { + "epoch": 3.7066856909906236, + "grad_norm": 8.09876823425293, + "learning_rate": 5.3685962766680254e-06, + "loss": 2.819, + "step": 54555 + }, + { + "epoch": 3.7070254110612852, + "grad_norm": 6.338536262512207, + "learning_rate": 5.368171626579699e-06, + "loss": 2.8166, + "step": 54560 + }, + { + "epoch": 3.7073651311319473, + "grad_norm": 7.025755405426025, + "learning_rate": 5.367746976491371e-06, + "loss": 3.1027, + "step": 54565 + }, + { + "epoch": 3.707704851202609, + "grad_norm": 6.247654438018799, + "learning_rate": 5.367322326403044e-06, + "loss": 2.7253, + "step": 54570 + }, + { + "epoch": 3.7080445712732706, + "grad_norm": 7.117190361022949, + "learning_rate": 5.3668976763147175e-06, + "loss": 2.9195, + "step": 54575 + }, + { + "epoch": 3.7083842913439327, + "grad_norm": 6.819138526916504, + "learning_rate": 5.3664730262263894e-06, + "loss": 3.0198, + "step": 54580 + }, + { + "epoch": 3.7087240114145943, + "grad_norm": 7.67862606048584, + "learning_rate": 5.366048376138062e-06, + "loss": 2.8614, + "step": 54585 + }, + { + "epoch": 3.709063731485256, + "grad_norm": 9.663198471069336, + "learning_rate": 5.365623726049736e-06, + "loss": 3.2875, + "step": 54590 + }, + { + "epoch": 3.709403451555918, + "grad_norm": 7.217931270599365, + "learning_rate": 5.365199075961408e-06, + "loss": 2.9802, + "step": 54595 + }, + { + "epoch": 3.7097431716265796, + "grad_norm": 7.0568013191223145, + "learning_rate": 5.364774425873081e-06, + "loss": 2.902, + "step": 54600 + }, + { + "epoch": 3.7100828916972413, + "grad_norm": 7.611706733703613, + "learning_rate": 5.364349775784754e-06, + "loss": 2.9793, + "step": 54605 + }, + { + "epoch": 3.7104226117679033, + "grad_norm": 9.218213081359863, + "learning_rate": 5.363925125696426e-06, + "loss": 2.8215, + "step": 54610 + }, + { + "epoch": 3.710762331838565, + "grad_norm": 6.965456485748291, + "learning_rate": 5.363500475608099e-06, + "loss": 2.6927, + "step": 54615 + }, + { + "epoch": 3.7111020519092266, + "grad_norm": 6.516326427459717, + "learning_rate": 5.363075825519773e-06, + "loss": 2.81, + "step": 54620 + }, + { + "epoch": 3.7114417719798887, + "grad_norm": 6.772133827209473, + "learning_rate": 5.362651175431445e-06, + "loss": 2.8391, + "step": 54625 + }, + { + "epoch": 3.7117814920505503, + "grad_norm": 9.265544891357422, + "learning_rate": 5.3622265253431174e-06, + "loss": 3.2378, + "step": 54630 + }, + { + "epoch": 3.712121212121212, + "grad_norm": 6.081288814544678, + "learning_rate": 5.361801875254791e-06, + "loss": 3.0779, + "step": 54635 + }, + { + "epoch": 3.712460932191874, + "grad_norm": 8.41604995727539, + "learning_rate": 5.361377225166463e-06, + "loss": 2.9347, + "step": 54640 + }, + { + "epoch": 3.7128006522625356, + "grad_norm": 7.926767349243164, + "learning_rate": 5.360952575078137e-06, + "loss": 2.9912, + "step": 54645 + }, + { + "epoch": 3.7131403723331973, + "grad_norm": 6.841075420379639, + "learning_rate": 5.360527924989809e-06, + "loss": 2.8689, + "step": 54650 + }, + { + "epoch": 3.7134800924038593, + "grad_norm": 9.858086585998535, + "learning_rate": 5.3601032749014814e-06, + "loss": 2.964, + "step": 54655 + }, + { + "epoch": 3.713819812474521, + "grad_norm": 6.914586544036865, + "learning_rate": 5.359678624813155e-06, + "loss": 2.7928, + "step": 54660 + }, + { + "epoch": 3.7141595325451826, + "grad_norm": 7.46320104598999, + "learning_rate": 5.359253974724827e-06, + "loss": 2.7756, + "step": 54665 + }, + { + "epoch": 3.7144992526158447, + "grad_norm": 6.9777021408081055, + "learning_rate": 5.3588293246365e-06, + "loss": 3.0373, + "step": 54670 + }, + { + "epoch": 3.7148389726865063, + "grad_norm": 6.550053596496582, + "learning_rate": 5.3584046745481735e-06, + "loss": 2.9683, + "step": 54675 + }, + { + "epoch": 3.715178692757168, + "grad_norm": 9.574063301086426, + "learning_rate": 5.3579800244598455e-06, + "loss": 2.8023, + "step": 54680 + }, + { + "epoch": 3.71551841282783, + "grad_norm": 8.09786605834961, + "learning_rate": 5.357555374371518e-06, + "loss": 2.9871, + "step": 54685 + }, + { + "epoch": 3.7158581328984917, + "grad_norm": 8.380114555358887, + "learning_rate": 5.357130724283192e-06, + "loss": 3.1964, + "step": 54690 + }, + { + "epoch": 3.7161978529691533, + "grad_norm": 7.0071282386779785, + "learning_rate": 5.356706074194864e-06, + "loss": 3.034, + "step": 54695 + }, + { + "epoch": 3.7165375730398154, + "grad_norm": 7.275604724884033, + "learning_rate": 5.356281424106537e-06, + "loss": 2.9626, + "step": 54700 + }, + { + "epoch": 3.716877293110477, + "grad_norm": 9.191485404968262, + "learning_rate": 5.35585677401821e-06, + "loss": 2.9904, + "step": 54705 + }, + { + "epoch": 3.7172170131811386, + "grad_norm": 6.743439674377441, + "learning_rate": 5.355432123929882e-06, + "loss": 3.111, + "step": 54710 + }, + { + "epoch": 3.7175567332518007, + "grad_norm": 5.881748199462891, + "learning_rate": 5.355007473841555e-06, + "loss": 2.8413, + "step": 54715 + }, + { + "epoch": 3.7178964533224623, + "grad_norm": 6.959352970123291, + "learning_rate": 5.354582823753228e-06, + "loss": 2.9966, + "step": 54720 + }, + { + "epoch": 3.718236173393124, + "grad_norm": 7.119819641113281, + "learning_rate": 5.354158173664901e-06, + "loss": 2.8505, + "step": 54725 + }, + { + "epoch": 3.718575893463786, + "grad_norm": 7.1460347175598145, + "learning_rate": 5.3537335235765735e-06, + "loss": 3.047, + "step": 54730 + }, + { + "epoch": 3.7189156135344477, + "grad_norm": 6.295806407928467, + "learning_rate": 5.353308873488246e-06, + "loss": 2.8055, + "step": 54735 + }, + { + "epoch": 3.7192553336051093, + "grad_norm": 7.234686851501465, + "learning_rate": 5.352884223399919e-06, + "loss": 3.0182, + "step": 54740 + }, + { + "epoch": 3.7195950536757714, + "grad_norm": 7.380784511566162, + "learning_rate": 5.352459573311591e-06, + "loss": 2.9275, + "step": 54745 + }, + { + "epoch": 3.719934773746433, + "grad_norm": 6.888154029846191, + "learning_rate": 5.352034923223265e-06, + "loss": 2.965, + "step": 54750 + }, + { + "epoch": 3.7202744938170946, + "grad_norm": 5.175154685974121, + "learning_rate": 5.3516102731349375e-06, + "loss": 2.999, + "step": 54755 + }, + { + "epoch": 3.7206142138877567, + "grad_norm": 5.618346691131592, + "learning_rate": 5.351185623046609e-06, + "loss": 2.9848, + "step": 54760 + }, + { + "epoch": 3.7209539339584183, + "grad_norm": 5.565560340881348, + "learning_rate": 5.350760972958283e-06, + "loss": 2.8461, + "step": 54765 + }, + { + "epoch": 3.72129365402908, + "grad_norm": 7.2226481437683105, + "learning_rate": 5.350336322869956e-06, + "loss": 2.8616, + "step": 54770 + }, + { + "epoch": 3.721633374099742, + "grad_norm": 7.397861003875732, + "learning_rate": 5.349911672781628e-06, + "loss": 2.8561, + "step": 54775 + }, + { + "epoch": 3.7219730941704037, + "grad_norm": 8.345297813415527, + "learning_rate": 5.3494870226933015e-06, + "loss": 3.0377, + "step": 54780 + }, + { + "epoch": 3.7223128142410653, + "grad_norm": 6.535958766937256, + "learning_rate": 5.349062372604974e-06, + "loss": 2.7616, + "step": 54785 + }, + { + "epoch": 3.7226525343117274, + "grad_norm": 7.840590953826904, + "learning_rate": 5.348637722516646e-06, + "loss": 3.0841, + "step": 54790 + }, + { + "epoch": 3.722992254382389, + "grad_norm": 8.321555137634277, + "learning_rate": 5.34821307242832e-06, + "loss": 2.9183, + "step": 54795 + }, + { + "epoch": 3.7233319744530506, + "grad_norm": 6.849716663360596, + "learning_rate": 5.347788422339993e-06, + "loss": 2.6575, + "step": 54800 + }, + { + "epoch": 3.7236716945237127, + "grad_norm": 6.466902732849121, + "learning_rate": 5.347363772251665e-06, + "loss": 2.8583, + "step": 54805 + }, + { + "epoch": 3.7240114145943743, + "grad_norm": 7.476350784301758, + "learning_rate": 5.346939122163338e-06, + "loss": 2.8286, + "step": 54810 + }, + { + "epoch": 3.724351134665036, + "grad_norm": 7.231649398803711, + "learning_rate": 5.34651447207501e-06, + "loss": 2.7902, + "step": 54815 + }, + { + "epoch": 3.724690854735698, + "grad_norm": 7.222131729125977, + "learning_rate": 5.346089821986683e-06, + "loss": 3.2946, + "step": 54820 + }, + { + "epoch": 3.7250305748063597, + "grad_norm": 8.188896179199219, + "learning_rate": 5.345665171898357e-06, + "loss": 2.7433, + "step": 54825 + }, + { + "epoch": 3.7253702948770213, + "grad_norm": 8.283720016479492, + "learning_rate": 5.345240521810029e-06, + "loss": 2.7903, + "step": 54830 + }, + { + "epoch": 3.725710014947683, + "grad_norm": 7.550257682800293, + "learning_rate": 5.3448158717217014e-06, + "loss": 2.7967, + "step": 54835 + }, + { + "epoch": 3.726049735018345, + "grad_norm": 7.993086814880371, + "learning_rate": 5.344391221633375e-06, + "loss": 2.8835, + "step": 54840 + }, + { + "epoch": 3.7263894550890067, + "grad_norm": 6.899912357330322, + "learning_rate": 5.343966571545047e-06, + "loss": 2.9582, + "step": 54845 + }, + { + "epoch": 3.7267291751596683, + "grad_norm": 6.390876770019531, + "learning_rate": 5.34354192145672e-06, + "loss": 2.6795, + "step": 54850 + }, + { + "epoch": 3.7270688952303304, + "grad_norm": 7.548876762390137, + "learning_rate": 5.3431172713683935e-06, + "loss": 2.9096, + "step": 54855 + }, + { + "epoch": 3.727408615300992, + "grad_norm": 6.900452613830566, + "learning_rate": 5.3426926212800654e-06, + "loss": 2.9313, + "step": 54860 + }, + { + "epoch": 3.7277483353716536, + "grad_norm": 6.890153408050537, + "learning_rate": 5.342267971191738e-06, + "loss": 3.1344, + "step": 54865 + }, + { + "epoch": 3.7280880554423157, + "grad_norm": 8.147134780883789, + "learning_rate": 5.341843321103412e-06, + "loss": 2.8597, + "step": 54870 + }, + { + "epoch": 3.7284277755129773, + "grad_norm": 7.477948188781738, + "learning_rate": 5.341418671015084e-06, + "loss": 2.8872, + "step": 54875 + }, + { + "epoch": 3.728767495583639, + "grad_norm": 5.702703952789307, + "learning_rate": 5.340994020926757e-06, + "loss": 2.8577, + "step": 54880 + }, + { + "epoch": 3.7291072156543006, + "grad_norm": 6.8930487632751465, + "learning_rate": 5.3405693708384294e-06, + "loss": 2.9577, + "step": 54885 + }, + { + "epoch": 3.7294469357249627, + "grad_norm": 8.211759567260742, + "learning_rate": 5.340144720750102e-06, + "loss": 3.0418, + "step": 54890 + }, + { + "epoch": 3.7297866557956243, + "grad_norm": 9.322861671447754, + "learning_rate": 5.339720070661775e-06, + "loss": 2.9723, + "step": 54895 + }, + { + "epoch": 3.730126375866286, + "grad_norm": 8.659802436828613, + "learning_rate": 5.339295420573448e-06, + "loss": 2.986, + "step": 54900 + }, + { + "epoch": 3.730466095936948, + "grad_norm": 7.080680847167969, + "learning_rate": 5.338870770485121e-06, + "loss": 2.7055, + "step": 54905 + }, + { + "epoch": 3.7308058160076096, + "grad_norm": 6.8076887130737305, + "learning_rate": 5.338446120396793e-06, + "loss": 2.9494, + "step": 54910 + }, + { + "epoch": 3.7311455360782713, + "grad_norm": 7.838016033172607, + "learning_rate": 5.338021470308466e-06, + "loss": 2.7358, + "step": 54915 + }, + { + "epoch": 3.7314852561489333, + "grad_norm": 6.585851669311523, + "learning_rate": 5.337596820220139e-06, + "loss": 2.6567, + "step": 54920 + }, + { + "epoch": 3.731824976219595, + "grad_norm": 7.1747894287109375, + "learning_rate": 5.337172170131811e-06, + "loss": 3.1061, + "step": 54925 + }, + { + "epoch": 3.7321646962902566, + "grad_norm": 7.812771797180176, + "learning_rate": 5.336747520043485e-06, + "loss": 2.9396, + "step": 54930 + }, + { + "epoch": 3.7325044163609187, + "grad_norm": 6.425092697143555, + "learning_rate": 5.3363228699551574e-06, + "loss": 3.0052, + "step": 54935 + }, + { + "epoch": 3.7328441364315803, + "grad_norm": 7.381518840789795, + "learning_rate": 5.335898219866829e-06, + "loss": 2.9836, + "step": 54940 + }, + { + "epoch": 3.733183856502242, + "grad_norm": 6.95283317565918, + "learning_rate": 5.335473569778503e-06, + "loss": 2.8037, + "step": 54945 + }, + { + "epoch": 3.733523576572904, + "grad_norm": 6.477662563323975, + "learning_rate": 5.335048919690176e-06, + "loss": 2.6741, + "step": 54950 + }, + { + "epoch": 3.7338632966435656, + "grad_norm": 8.3198881149292, + "learning_rate": 5.334624269601848e-06, + "loss": 3.0984, + "step": 54955 + }, + { + "epoch": 3.7342030167142273, + "grad_norm": 6.233654022216797, + "learning_rate": 5.3341996195135214e-06, + "loss": 2.9789, + "step": 54960 + }, + { + "epoch": 3.7345427367848893, + "grad_norm": 6.881605625152588, + "learning_rate": 5.333774969425194e-06, + "loss": 2.8526, + "step": 54965 + }, + { + "epoch": 3.734882456855551, + "grad_norm": 6.755268573760986, + "learning_rate": 5.333350319336866e-06, + "loss": 2.9042, + "step": 54970 + }, + { + "epoch": 3.7352221769262126, + "grad_norm": 7.3397955894470215, + "learning_rate": 5.33292566924854e-06, + "loss": 3.0765, + "step": 54975 + }, + { + "epoch": 3.7355618969968747, + "grad_norm": 7.33524227142334, + "learning_rate": 5.332501019160213e-06, + "loss": 2.9672, + "step": 54980 + }, + { + "epoch": 3.7359016170675363, + "grad_norm": 6.352851867675781, + "learning_rate": 5.3320763690718855e-06, + "loss": 3.0398, + "step": 54985 + }, + { + "epoch": 3.736241337138198, + "grad_norm": 6.065139293670654, + "learning_rate": 5.331651718983558e-06, + "loss": 2.8308, + "step": 54990 + }, + { + "epoch": 3.73658105720886, + "grad_norm": 7.346911907196045, + "learning_rate": 5.33122706889523e-06, + "loss": 3.0522, + "step": 54995 + }, + { + "epoch": 3.7369207772795217, + "grad_norm": 7.053059101104736, + "learning_rate": 5.330802418806904e-06, + "loss": 2.8575, + "step": 55000 + }, + { + "epoch": 3.7372604973501833, + "grad_norm": 6.256278038024902, + "learning_rate": 5.330377768718577e-06, + "loss": 2.9003, + "step": 55005 + }, + { + "epoch": 3.7376002174208454, + "grad_norm": 8.014909744262695, + "learning_rate": 5.329953118630249e-06, + "loss": 2.9375, + "step": 55010 + }, + { + "epoch": 3.737939937491507, + "grad_norm": 7.553582191467285, + "learning_rate": 5.329528468541922e-06, + "loss": 3.101, + "step": 55015 + }, + { + "epoch": 3.7382796575621686, + "grad_norm": 5.646291732788086, + "learning_rate": 5.329103818453595e-06, + "loss": 3.0321, + "step": 55020 + }, + { + "epoch": 3.7386193776328307, + "grad_norm": 9.360108375549316, + "learning_rate": 5.328679168365267e-06, + "loss": 2.9393, + "step": 55025 + }, + { + "epoch": 3.7389590977034923, + "grad_norm": 9.58621883392334, + "learning_rate": 5.328254518276941e-06, + "loss": 3.052, + "step": 55030 + }, + { + "epoch": 3.739298817774154, + "grad_norm": 8.731414794921875, + "learning_rate": 5.3278298681886135e-06, + "loss": 2.7674, + "step": 55035 + }, + { + "epoch": 3.739638537844816, + "grad_norm": 10.059528350830078, + "learning_rate": 5.327405218100285e-06, + "loss": 3.0088, + "step": 55040 + }, + { + "epoch": 3.7399782579154777, + "grad_norm": 6.915221214294434, + "learning_rate": 5.326980568011959e-06, + "loss": 2.657, + "step": 55045 + }, + { + "epoch": 3.7403179779861393, + "grad_norm": 8.015008926391602, + "learning_rate": 5.326555917923632e-06, + "loss": 2.8847, + "step": 55050 + }, + { + "epoch": 3.7406576980568014, + "grad_norm": 6.691417217254639, + "learning_rate": 5.326131267835304e-06, + "loss": 2.9882, + "step": 55055 + }, + { + "epoch": 3.740997418127463, + "grad_norm": 8.625088691711426, + "learning_rate": 5.3257066177469775e-06, + "loss": 3.076, + "step": 55060 + }, + { + "epoch": 3.7413371381981246, + "grad_norm": 6.325211524963379, + "learning_rate": 5.325281967658649e-06, + "loss": 2.9942, + "step": 55065 + }, + { + "epoch": 3.7416768582687867, + "grad_norm": 6.096395969390869, + "learning_rate": 5.324857317570322e-06, + "loss": 3.1281, + "step": 55070 + }, + { + "epoch": 3.7420165783394483, + "grad_norm": 6.7192912101745605, + "learning_rate": 5.324432667481996e-06, + "loss": 3.3465, + "step": 55075 + }, + { + "epoch": 3.74235629841011, + "grad_norm": 6.214964389801025, + "learning_rate": 5.324008017393668e-06, + "loss": 2.7887, + "step": 55080 + }, + { + "epoch": 3.742696018480772, + "grad_norm": 6.462056636810303, + "learning_rate": 5.323583367305341e-06, + "loss": 2.7572, + "step": 55085 + }, + { + "epoch": 3.7430357385514337, + "grad_norm": 12.297266006469727, + "learning_rate": 5.323158717217014e-06, + "loss": 3.0188, + "step": 55090 + }, + { + "epoch": 3.7433754586220953, + "grad_norm": 7.171382904052734, + "learning_rate": 5.322734067128686e-06, + "loss": 2.9381, + "step": 55095 + }, + { + "epoch": 3.7437151786927574, + "grad_norm": 9.003594398498535, + "learning_rate": 5.322309417040359e-06, + "loss": 2.8591, + "step": 55100 + }, + { + "epoch": 3.744054898763419, + "grad_norm": 8.94033432006836, + "learning_rate": 5.321884766952033e-06, + "loss": 2.8181, + "step": 55105 + }, + { + "epoch": 3.7443946188340806, + "grad_norm": 7.413585186004639, + "learning_rate": 5.321460116863705e-06, + "loss": 2.7665, + "step": 55110 + }, + { + "epoch": 3.7447343389047427, + "grad_norm": 8.420931816101074, + "learning_rate": 5.3210354667753774e-06, + "loss": 3.1084, + "step": 55115 + }, + { + "epoch": 3.7450740589754044, + "grad_norm": 6.9676032066345215, + "learning_rate": 5.320610816687051e-06, + "loss": 2.6725, + "step": 55120 + }, + { + "epoch": 3.745413779046066, + "grad_norm": 8.585698127746582, + "learning_rate": 5.320186166598723e-06, + "loss": 2.6558, + "step": 55125 + }, + { + "epoch": 3.745753499116728, + "grad_norm": 7.837374210357666, + "learning_rate": 5.319761516510396e-06, + "loss": 3.0423, + "step": 55130 + }, + { + "epoch": 3.7460932191873897, + "grad_norm": 7.309175491333008, + "learning_rate": 5.319336866422069e-06, + "loss": 2.9617, + "step": 55135 + }, + { + "epoch": 3.7464329392580513, + "grad_norm": 5.206963062286377, + "learning_rate": 5.3189122163337414e-06, + "loss": 2.9286, + "step": 55140 + }, + { + "epoch": 3.7467726593287134, + "grad_norm": 6.46407413482666, + "learning_rate": 5.318487566245414e-06, + "loss": 2.738, + "step": 55145 + }, + { + "epoch": 3.747112379399375, + "grad_norm": 6.695964813232422, + "learning_rate": 5.318062916157087e-06, + "loss": 2.8628, + "step": 55150 + }, + { + "epoch": 3.7474520994700367, + "grad_norm": 8.091985702514648, + "learning_rate": 5.31763826606876e-06, + "loss": 2.9384, + "step": 55155 + }, + { + "epoch": 3.7477918195406987, + "grad_norm": 6.849616527557373, + "learning_rate": 5.317213615980432e-06, + "loss": 2.8488, + "step": 55160 + }, + { + "epoch": 3.7481315396113604, + "grad_norm": 7.404327392578125, + "learning_rate": 5.3167889658921054e-06, + "loss": 2.9871, + "step": 55165 + }, + { + "epoch": 3.748471259682022, + "grad_norm": 6.867428779602051, + "learning_rate": 5.316364315803778e-06, + "loss": 2.8212, + "step": 55170 + }, + { + "epoch": 3.7488109797526836, + "grad_norm": 8.70411491394043, + "learning_rate": 5.31593966571545e-06, + "loss": 2.9303, + "step": 55175 + }, + { + "epoch": 3.7491506998233457, + "grad_norm": 7.9814982414245605, + "learning_rate": 5.315515015627124e-06, + "loss": 2.9624, + "step": 55180 + }, + { + "epoch": 3.7494904198940073, + "grad_norm": 7.212625503540039, + "learning_rate": 5.315090365538797e-06, + "loss": 2.7716, + "step": 55185 + }, + { + "epoch": 3.749830139964669, + "grad_norm": 7.222711086273193, + "learning_rate": 5.314665715450469e-06, + "loss": 3.1039, + "step": 55190 + }, + { + "epoch": 3.750169860035331, + "grad_norm": 8.644010543823242, + "learning_rate": 5.314241065362142e-06, + "loss": 2.6582, + "step": 55195 + }, + { + "epoch": 3.7505095801059927, + "grad_norm": 8.357366561889648, + "learning_rate": 5.313816415273815e-06, + "loss": 2.6588, + "step": 55200 + }, + { + "epoch": 3.7508493001766543, + "grad_norm": 6.813983917236328, + "learning_rate": 5.313391765185487e-06, + "loss": 3.1525, + "step": 55205 + }, + { + "epoch": 3.7511890202473164, + "grad_norm": 9.507562637329102, + "learning_rate": 5.312967115097161e-06, + "loss": 2.9843, + "step": 55210 + }, + { + "epoch": 3.751528740317978, + "grad_norm": 5.2126665115356445, + "learning_rate": 5.3125424650088334e-06, + "loss": 2.9195, + "step": 55215 + }, + { + "epoch": 3.7518684603886396, + "grad_norm": 6.195507049560547, + "learning_rate": 5.312117814920505e-06, + "loss": 2.7455, + "step": 55220 + }, + { + "epoch": 3.7522081804593013, + "grad_norm": 8.107015609741211, + "learning_rate": 5.311693164832179e-06, + "loss": 2.9918, + "step": 55225 + }, + { + "epoch": 3.7525479005299633, + "grad_norm": 6.88969087600708, + "learning_rate": 5.311268514743851e-06, + "loss": 3.2188, + "step": 55230 + }, + { + "epoch": 3.752887620600625, + "grad_norm": 7.687696933746338, + "learning_rate": 5.310843864655524e-06, + "loss": 2.9978, + "step": 55235 + }, + { + "epoch": 3.7532273406712866, + "grad_norm": 6.576469421386719, + "learning_rate": 5.3104192145671974e-06, + "loss": 2.6911, + "step": 55240 + }, + { + "epoch": 3.7535670607419487, + "grad_norm": 8.350380897521973, + "learning_rate": 5.309994564478869e-06, + "loss": 3.2633, + "step": 55245 + }, + { + "epoch": 3.7539067808126103, + "grad_norm": 7.777317047119141, + "learning_rate": 5.309569914390542e-06, + "loss": 3.111, + "step": 55250 + }, + { + "epoch": 3.754246500883272, + "grad_norm": 6.096813678741455, + "learning_rate": 5.309145264302216e-06, + "loss": 2.8495, + "step": 55255 + }, + { + "epoch": 3.754586220953934, + "grad_norm": 6.754490852355957, + "learning_rate": 5.308720614213888e-06, + "loss": 3.0773, + "step": 55260 + }, + { + "epoch": 3.7549259410245956, + "grad_norm": 8.03977108001709, + "learning_rate": 5.308295964125561e-06, + "loss": 3.0437, + "step": 55265 + }, + { + "epoch": 3.7552656610952573, + "grad_norm": 6.613144397735596, + "learning_rate": 5.307871314037234e-06, + "loss": 2.9615, + "step": 55270 + }, + { + "epoch": 3.7556053811659194, + "grad_norm": 6.7615790367126465, + "learning_rate": 5.307446663948906e-06, + "loss": 3.0166, + "step": 55275 + }, + { + "epoch": 3.755945101236581, + "grad_norm": 7.928764820098877, + "learning_rate": 5.307022013860579e-06, + "loss": 2.8697, + "step": 55280 + }, + { + "epoch": 3.7562848213072426, + "grad_norm": 7.449274063110352, + "learning_rate": 5.306597363772253e-06, + "loss": 2.9415, + "step": 55285 + }, + { + "epoch": 3.7566245413779047, + "grad_norm": 7.605307579040527, + "learning_rate": 5.306172713683925e-06, + "loss": 3.0034, + "step": 55290 + }, + { + "epoch": 3.7569642614485663, + "grad_norm": 7.9120354652404785, + "learning_rate": 5.305748063595597e-06, + "loss": 2.9534, + "step": 55295 + }, + { + "epoch": 3.757303981519228, + "grad_norm": 8.670450210571289, + "learning_rate": 5.305323413507271e-06, + "loss": 3.0527, + "step": 55300 + }, + { + "epoch": 3.75764370158989, + "grad_norm": 6.986420154571533, + "learning_rate": 5.304898763418943e-06, + "loss": 2.8586, + "step": 55305 + }, + { + "epoch": 3.7579834216605517, + "grad_norm": 10.473928451538086, + "learning_rate": 5.304474113330616e-06, + "loss": 3.1379, + "step": 55310 + }, + { + "epoch": 3.7583231417312133, + "grad_norm": 8.624397277832031, + "learning_rate": 5.304049463242289e-06, + "loss": 3.1652, + "step": 55315 + }, + { + "epoch": 3.7586628618018754, + "grad_norm": 7.736239433288574, + "learning_rate": 5.303624813153961e-06, + "loss": 2.7734, + "step": 55320 + }, + { + "epoch": 3.759002581872537, + "grad_norm": 5.3211493492126465, + "learning_rate": 5.303200163065635e-06, + "loss": 2.978, + "step": 55325 + }, + { + "epoch": 3.7593423019431986, + "grad_norm": 7.78225040435791, + "learning_rate": 5.302775512977307e-06, + "loss": 2.7654, + "step": 55330 + }, + { + "epoch": 3.7596820220138607, + "grad_norm": 6.08078670501709, + "learning_rate": 5.30235086288898e-06, + "loss": 2.8372, + "step": 55335 + }, + { + "epoch": 3.7600217420845223, + "grad_norm": 5.96317720413208, + "learning_rate": 5.3019262128006535e-06, + "loss": 3.0229, + "step": 55340 + }, + { + "epoch": 3.760361462155184, + "grad_norm": 7.522914409637451, + "learning_rate": 5.301501562712325e-06, + "loss": 3.063, + "step": 55345 + }, + { + "epoch": 3.760701182225846, + "grad_norm": 7.925598621368408, + "learning_rate": 5.301076912623998e-06, + "loss": 2.7965, + "step": 55350 + }, + { + "epoch": 3.7610409022965077, + "grad_norm": 7.832554817199707, + "learning_rate": 5.300652262535672e-06, + "loss": 3.2894, + "step": 55355 + }, + { + "epoch": 3.7613806223671693, + "grad_norm": 6.746572017669678, + "learning_rate": 5.300227612447344e-06, + "loss": 3.0768, + "step": 55360 + }, + { + "epoch": 3.7617203424378314, + "grad_norm": 7.916768550872803, + "learning_rate": 5.299802962359017e-06, + "loss": 2.9641, + "step": 55365 + }, + { + "epoch": 3.762060062508493, + "grad_norm": 9.072644233703613, + "learning_rate": 5.29937831227069e-06, + "loss": 2.6019, + "step": 55370 + }, + { + "epoch": 3.7623997825791546, + "grad_norm": 7.654318332672119, + "learning_rate": 5.298953662182362e-06, + "loss": 3.0191, + "step": 55375 + }, + { + "epoch": 3.7627395026498167, + "grad_norm": 7.460832595825195, + "learning_rate": 5.298529012094035e-06, + "loss": 3.0488, + "step": 55380 + }, + { + "epoch": 3.7630792227204783, + "grad_norm": 6.539683818817139, + "learning_rate": 5.298104362005708e-06, + "loss": 2.8563, + "step": 55385 + }, + { + "epoch": 3.76341894279114, + "grad_norm": 6.923435688018799, + "learning_rate": 5.297679711917381e-06, + "loss": 2.9286, + "step": 55390 + }, + { + "epoch": 3.763758662861802, + "grad_norm": 7.461727619171143, + "learning_rate": 5.297255061829053e-06, + "loss": 3.5224, + "step": 55395 + }, + { + "epoch": 3.7640983829324637, + "grad_norm": 7.35437536239624, + "learning_rate": 5.296830411740726e-06, + "loss": 2.9723, + "step": 55400 + }, + { + "epoch": 3.7644381030031253, + "grad_norm": 7.2476301193237305, + "learning_rate": 5.296405761652399e-06, + "loss": 2.8413, + "step": 55405 + }, + { + "epoch": 3.7647778230737874, + "grad_norm": 6.166809558868408, + "learning_rate": 5.295981111564071e-06, + "loss": 2.9401, + "step": 55410 + }, + { + "epoch": 3.765117543144449, + "grad_norm": 6.293031692504883, + "learning_rate": 5.295556461475745e-06, + "loss": 2.9433, + "step": 55415 + }, + { + "epoch": 3.7654572632151107, + "grad_norm": 10.790740013122559, + "learning_rate": 5.2951318113874174e-06, + "loss": 3.1301, + "step": 55420 + }, + { + "epoch": 3.7657969832857727, + "grad_norm": 5.9592366218566895, + "learning_rate": 5.294707161299089e-06, + "loss": 3.0774, + "step": 55425 + }, + { + "epoch": 3.7661367033564344, + "grad_norm": 7.699052333831787, + "learning_rate": 5.294282511210763e-06, + "loss": 2.9111, + "step": 55430 + }, + { + "epoch": 3.766476423427096, + "grad_norm": 9.397971153259277, + "learning_rate": 5.293857861122436e-06, + "loss": 3.1803, + "step": 55435 + }, + { + "epoch": 3.766816143497758, + "grad_norm": 7.060092926025391, + "learning_rate": 5.293433211034108e-06, + "loss": 2.9327, + "step": 55440 + }, + { + "epoch": 3.7671558635684197, + "grad_norm": 7.918240070343018, + "learning_rate": 5.2930085609457814e-06, + "loss": 2.9198, + "step": 55445 + }, + { + "epoch": 3.7674955836390813, + "grad_norm": 8.017526626586914, + "learning_rate": 5.292583910857454e-06, + "loss": 3.1637, + "step": 55450 + }, + { + "epoch": 3.7678353037097434, + "grad_norm": 6.461684226989746, + "learning_rate": 5.292159260769126e-06, + "loss": 3.0187, + "step": 55455 + }, + { + "epoch": 3.768175023780405, + "grad_norm": 9.281364440917969, + "learning_rate": 5.2917346106808e-06, + "loss": 2.7729, + "step": 55460 + }, + { + "epoch": 3.7685147438510667, + "grad_norm": 7.234760284423828, + "learning_rate": 5.291309960592473e-06, + "loss": 2.7125, + "step": 55465 + }, + { + "epoch": 3.7688544639217287, + "grad_norm": 8.240352630615234, + "learning_rate": 5.290885310504145e-06, + "loss": 2.708, + "step": 55470 + }, + { + "epoch": 3.7691941839923904, + "grad_norm": 9.278200149536133, + "learning_rate": 5.290460660415818e-06, + "loss": 2.7924, + "step": 55475 + }, + { + "epoch": 3.769533904063052, + "grad_norm": 6.863879203796387, + "learning_rate": 5.29003601032749e-06, + "loss": 2.9013, + "step": 55480 + }, + { + "epoch": 3.769873624133714, + "grad_norm": 6.564492225646973, + "learning_rate": 5.289611360239163e-06, + "loss": 2.9275, + "step": 55485 + }, + { + "epoch": 3.7702133442043757, + "grad_norm": 6.73570442199707, + "learning_rate": 5.289186710150837e-06, + "loss": 3.0596, + "step": 55490 + }, + { + "epoch": 3.7705530642750373, + "grad_norm": 6.220968723297119, + "learning_rate": 5.288762060062509e-06, + "loss": 2.9907, + "step": 55495 + }, + { + "epoch": 3.7708927843456994, + "grad_norm": 7.89124059677124, + "learning_rate": 5.288337409974181e-06, + "loss": 3.002, + "step": 55500 + }, + { + "epoch": 3.771232504416361, + "grad_norm": 9.80626106262207, + "learning_rate": 5.287912759885855e-06, + "loss": 2.9022, + "step": 55505 + }, + { + "epoch": 3.7715722244870227, + "grad_norm": 6.2559614181518555, + "learning_rate": 5.287488109797527e-06, + "loss": 2.716, + "step": 55510 + }, + { + "epoch": 3.7719119445576843, + "grad_norm": 7.7524733543396, + "learning_rate": 5.2870634597092e-06, + "loss": 3.0793, + "step": 55515 + }, + { + "epoch": 3.7722516646283464, + "grad_norm": 5.955062389373779, + "learning_rate": 5.2866388096208734e-06, + "loss": 3.214, + "step": 55520 + }, + { + "epoch": 3.772591384699008, + "grad_norm": 8.963662147521973, + "learning_rate": 5.286214159532545e-06, + "loss": 3.0887, + "step": 55525 + }, + { + "epoch": 3.7729311047696696, + "grad_norm": 6.862767696380615, + "learning_rate": 5.285789509444218e-06, + "loss": 2.7184, + "step": 55530 + }, + { + "epoch": 3.7732708248403317, + "grad_norm": 6.2118306159973145, + "learning_rate": 5.285364859355892e-06, + "loss": 2.8476, + "step": 55535 + }, + { + "epoch": 3.7736105449109933, + "grad_norm": 6.058140277862549, + "learning_rate": 5.284940209267564e-06, + "loss": 2.9217, + "step": 55540 + }, + { + "epoch": 3.773950264981655, + "grad_norm": 7.489439964294434, + "learning_rate": 5.284515559179237e-06, + "loss": 3.0128, + "step": 55545 + }, + { + "epoch": 3.774289985052317, + "grad_norm": 6.649240970611572, + "learning_rate": 5.28409090909091e-06, + "loss": 3.0299, + "step": 55550 + }, + { + "epoch": 3.7746297051229787, + "grad_norm": 7.624663352966309, + "learning_rate": 5.283666259002582e-06, + "loss": 2.9809, + "step": 55555 + }, + { + "epoch": 3.7749694251936403, + "grad_norm": 7.787308216094971, + "learning_rate": 5.283241608914255e-06, + "loss": 2.8269, + "step": 55560 + }, + { + "epoch": 3.775309145264302, + "grad_norm": 7.353287696838379, + "learning_rate": 5.282816958825928e-06, + "loss": 2.6182, + "step": 55565 + }, + { + "epoch": 3.775648865334964, + "grad_norm": 7.793950080871582, + "learning_rate": 5.282392308737601e-06, + "loss": 2.86, + "step": 55570 + }, + { + "epoch": 3.7759885854056257, + "grad_norm": 7.38749361038208, + "learning_rate": 5.2819676586492726e-06, + "loss": 3.2145, + "step": 55575 + }, + { + "epoch": 3.7763283054762873, + "grad_norm": 6.820108413696289, + "learning_rate": 5.281543008560946e-06, + "loss": 2.8769, + "step": 55580 + }, + { + "epoch": 3.7766680255469494, + "grad_norm": 9.605792045593262, + "learning_rate": 5.281118358472619e-06, + "loss": 3.072, + "step": 55585 + }, + { + "epoch": 3.777007745617611, + "grad_norm": 9.146442413330078, + "learning_rate": 5.280693708384291e-06, + "loss": 3.0074, + "step": 55590 + }, + { + "epoch": 3.7773474656882726, + "grad_norm": 9.859718322753906, + "learning_rate": 5.280269058295965e-06, + "loss": 3.071, + "step": 55595 + }, + { + "epoch": 3.7776871857589347, + "grad_norm": 8.321723937988281, + "learning_rate": 5.279844408207637e-06, + "loss": 3.0272, + "step": 55600 + }, + { + "epoch": 3.7780269058295963, + "grad_norm": 8.308369636535645, + "learning_rate": 5.279419758119309e-06, + "loss": 2.8959, + "step": 55605 + }, + { + "epoch": 3.778366625900258, + "grad_norm": 6.8179755210876465, + "learning_rate": 5.278995108030983e-06, + "loss": 3.005, + "step": 55610 + }, + { + "epoch": 3.77870634597092, + "grad_norm": 5.522640705108643, + "learning_rate": 5.278570457942656e-06, + "loss": 3.0318, + "step": 55615 + }, + { + "epoch": 3.7790460660415817, + "grad_norm": 7.397274494171143, + "learning_rate": 5.278145807854328e-06, + "loss": 2.8656, + "step": 55620 + }, + { + "epoch": 3.7793857861122433, + "grad_norm": 6.065529823303223, + "learning_rate": 5.277721157766001e-06, + "loss": 3.1114, + "step": 55625 + }, + { + "epoch": 3.7797255061829054, + "grad_norm": 7.520146369934082, + "learning_rate": 5.277296507677674e-06, + "loss": 3.2535, + "step": 55630 + }, + { + "epoch": 3.780065226253567, + "grad_norm": 6.621714115142822, + "learning_rate": 5.276871857589346e-06, + "loss": 3.0781, + "step": 55635 + }, + { + "epoch": 3.7804049463242286, + "grad_norm": 7.0756072998046875, + "learning_rate": 5.27644720750102e-06, + "loss": 3.2059, + "step": 55640 + }, + { + "epoch": 3.7807446663948907, + "grad_norm": 6.145853042602539, + "learning_rate": 5.276022557412693e-06, + "loss": 2.9979, + "step": 55645 + }, + { + "epoch": 3.7810843864655523, + "grad_norm": 6.934426307678223, + "learning_rate": 5.2755979073243646e-06, + "loss": 2.8437, + "step": 55650 + }, + { + "epoch": 3.781424106536214, + "grad_norm": 6.34348201751709, + "learning_rate": 5.275173257236038e-06, + "loss": 2.9573, + "step": 55655 + }, + { + "epoch": 3.781763826606876, + "grad_norm": 7.0239176750183105, + "learning_rate": 5.27474860714771e-06, + "loss": 2.9534, + "step": 55660 + }, + { + "epoch": 3.7821035466775377, + "grad_norm": 9.734477996826172, + "learning_rate": 5.274323957059384e-06, + "loss": 2.7846, + "step": 55665 + }, + { + "epoch": 3.7824432667481993, + "grad_norm": 6.498329162597656, + "learning_rate": 5.273899306971057e-06, + "loss": 3.1382, + "step": 55670 + }, + { + "epoch": 3.7827829868188614, + "grad_norm": 8.839012145996094, + "learning_rate": 5.2734746568827286e-06, + "loss": 2.9243, + "step": 55675 + }, + { + "epoch": 3.783122706889523, + "grad_norm": 6.1862382888793945, + "learning_rate": 5.273050006794402e-06, + "loss": 2.8235, + "step": 55680 + }, + { + "epoch": 3.7834624269601846, + "grad_norm": 7.422094821929932, + "learning_rate": 5.272625356706075e-06, + "loss": 3.0277, + "step": 55685 + }, + { + "epoch": 3.7838021470308467, + "grad_norm": 8.338091850280762, + "learning_rate": 5.272200706617747e-06, + "loss": 2.8026, + "step": 55690 + }, + { + "epoch": 3.7841418671015083, + "grad_norm": 8.491352081298828, + "learning_rate": 5.271776056529421e-06, + "loss": 3.27, + "step": 55695 + }, + { + "epoch": 3.78448158717217, + "grad_norm": 6.810319423675537, + "learning_rate": 5.271351406441093e-06, + "loss": 2.8723, + "step": 55700 + }, + { + "epoch": 3.784821307242832, + "grad_norm": 9.08966064453125, + "learning_rate": 5.270926756352765e-06, + "loss": 2.8891, + "step": 55705 + }, + { + "epoch": 3.7851610273134937, + "grad_norm": 6.251689434051514, + "learning_rate": 5.270502106264439e-06, + "loss": 2.831, + "step": 55710 + }, + { + "epoch": 3.7855007473841553, + "grad_norm": 7.422691822052002, + "learning_rate": 5.270077456176112e-06, + "loss": 2.8813, + "step": 55715 + }, + { + "epoch": 3.7858404674548174, + "grad_norm": 7.630753517150879, + "learning_rate": 5.269652806087784e-06, + "loss": 2.8395, + "step": 55720 + }, + { + "epoch": 3.786180187525479, + "grad_norm": 7.055378437042236, + "learning_rate": 5.2692281559994574e-06, + "loss": 2.7978, + "step": 55725 + }, + { + "epoch": 3.7865199075961407, + "grad_norm": 7.696295261383057, + "learning_rate": 5.268803505911129e-06, + "loss": 3.1326, + "step": 55730 + }, + { + "epoch": 3.7868596276668027, + "grad_norm": 5.433672904968262, + "learning_rate": 5.268378855822802e-06, + "loss": 2.6816, + "step": 55735 + }, + { + "epoch": 3.7871993477374644, + "grad_norm": 6.897697448730469, + "learning_rate": 5.267954205734476e-06, + "loss": 3.0476, + "step": 55740 + }, + { + "epoch": 3.787539067808126, + "grad_norm": 5.880942344665527, + "learning_rate": 5.267529555646148e-06, + "loss": 2.9669, + "step": 55745 + }, + { + "epoch": 3.787878787878788, + "grad_norm": 6.747469425201416, + "learning_rate": 5.267104905557821e-06, + "loss": 2.9129, + "step": 55750 + }, + { + "epoch": 3.7882185079494497, + "grad_norm": 7.0134782791137695, + "learning_rate": 5.266680255469494e-06, + "loss": 3.0782, + "step": 55755 + }, + { + "epoch": 3.7885582280201113, + "grad_norm": 6.613712787628174, + "learning_rate": 5.266255605381166e-06, + "loss": 2.9643, + "step": 55760 + }, + { + "epoch": 3.7888979480907734, + "grad_norm": 5.495772838592529, + "learning_rate": 5.265830955292839e-06, + "loss": 2.8778, + "step": 55765 + }, + { + "epoch": 3.789237668161435, + "grad_norm": 6.595362663269043, + "learning_rate": 5.265406305204513e-06, + "loss": 3.1073, + "step": 55770 + }, + { + "epoch": 3.7895773882320967, + "grad_norm": 7.0982794761657715, + "learning_rate": 5.264981655116185e-06, + "loss": 3.0152, + "step": 55775 + }, + { + "epoch": 3.7899171083027587, + "grad_norm": 7.648982524871826, + "learning_rate": 5.264557005027857e-06, + "loss": 2.866, + "step": 55780 + }, + { + "epoch": 3.7902568283734204, + "grad_norm": 6.875426292419434, + "learning_rate": 5.264132354939531e-06, + "loss": 3.1855, + "step": 55785 + }, + { + "epoch": 3.790596548444082, + "grad_norm": 7.215467929840088, + "learning_rate": 5.263707704851203e-06, + "loss": 2.8577, + "step": 55790 + }, + { + "epoch": 3.790936268514744, + "grad_norm": 6.943484783172607, + "learning_rate": 5.263283054762876e-06, + "loss": 2.9882, + "step": 55795 + }, + { + "epoch": 3.7912759885854057, + "grad_norm": 7.407690048217773, + "learning_rate": 5.262858404674549e-06, + "loss": 2.7194, + "step": 55800 + }, + { + "epoch": 3.7916157086560673, + "grad_norm": 8.261438369750977, + "learning_rate": 5.262433754586221e-06, + "loss": 2.912, + "step": 55805 + }, + { + "epoch": 3.7919554287267294, + "grad_norm": 5.991395950317383, + "learning_rate": 5.262009104497894e-06, + "loss": 2.928, + "step": 55810 + }, + { + "epoch": 3.792295148797391, + "grad_norm": 6.046579360961914, + "learning_rate": 5.261584454409567e-06, + "loss": 2.8239, + "step": 55815 + }, + { + "epoch": 3.7926348688680527, + "grad_norm": 6.635809898376465, + "learning_rate": 5.26115980432124e-06, + "loss": 3.1413, + "step": 55820 + }, + { + "epoch": 3.7929745889387148, + "grad_norm": 8.224469184875488, + "learning_rate": 5.260735154232912e-06, + "loss": 2.8895, + "step": 55825 + }, + { + "epoch": 3.7933143090093764, + "grad_norm": 6.252205848693848, + "learning_rate": 5.260310504144585e-06, + "loss": 3.1641, + "step": 55830 + }, + { + "epoch": 3.793654029080038, + "grad_norm": 8.87745475769043, + "learning_rate": 5.259885854056258e-06, + "loss": 2.891, + "step": 55835 + }, + { + "epoch": 3.7939937491507, + "grad_norm": 7.352450847625732, + "learning_rate": 5.25946120396793e-06, + "loss": 2.9484, + "step": 55840 + }, + { + "epoch": 3.7943334692213617, + "grad_norm": 6.894290447235107, + "learning_rate": 5.259036553879604e-06, + "loss": 2.943, + "step": 55845 + }, + { + "epoch": 3.7946731892920234, + "grad_norm": 7.7457661628723145, + "learning_rate": 5.258611903791277e-06, + "loss": 2.8167, + "step": 55850 + }, + { + "epoch": 3.795012909362685, + "grad_norm": 7.503870010375977, + "learning_rate": 5.2581872537029486e-06, + "loss": 2.8605, + "step": 55855 + }, + { + "epoch": 3.795352629433347, + "grad_norm": 6.5679216384887695, + "learning_rate": 5.257762603614622e-06, + "loss": 3.0123, + "step": 55860 + }, + { + "epoch": 3.7956923495040087, + "grad_norm": 6.627756595611572, + "learning_rate": 5.257337953526295e-06, + "loss": 2.938, + "step": 55865 + }, + { + "epoch": 3.7960320695746703, + "grad_norm": 7.337286472320557, + "learning_rate": 5.256913303437967e-06, + "loss": 2.9408, + "step": 55870 + }, + { + "epoch": 3.7963717896453324, + "grad_norm": 7.454246520996094, + "learning_rate": 5.256488653349641e-06, + "loss": 2.8318, + "step": 55875 + }, + { + "epoch": 3.796711509715994, + "grad_norm": 6.332583904266357, + "learning_rate": 5.256064003261313e-06, + "loss": 3.0652, + "step": 55880 + }, + { + "epoch": 3.7970512297866557, + "grad_norm": 6.967348098754883, + "learning_rate": 5.255639353172985e-06, + "loss": 3.0452, + "step": 55885 + }, + { + "epoch": 3.7973909498573177, + "grad_norm": 7.560019016265869, + "learning_rate": 5.255214703084659e-06, + "loss": 3.0653, + "step": 55890 + }, + { + "epoch": 3.7977306699279794, + "grad_norm": 7.978262901306152, + "learning_rate": 5.254790052996332e-06, + "loss": 2.8789, + "step": 55895 + }, + { + "epoch": 3.798070389998641, + "grad_norm": 6.655038356781006, + "learning_rate": 5.254365402908004e-06, + "loss": 3.0011, + "step": 55900 + }, + { + "epoch": 3.7984101100693026, + "grad_norm": Infinity, + "learning_rate": 5.254025682837343e-06, + "loss": 3.0284, + "step": 55905 + }, + { + "epoch": 3.7987498301399647, + "grad_norm": 7.050739765167236, + "learning_rate": 5.253601032749015e-06, + "loss": 2.5467, + "step": 55910 + }, + { + "epoch": 3.7990895502106263, + "grad_norm": 5.8792266845703125, + "learning_rate": 5.253176382660688e-06, + "loss": 2.8566, + "step": 55915 + }, + { + "epoch": 3.799429270281288, + "grad_norm": 8.908794403076172, + "learning_rate": 5.252751732572361e-06, + "loss": 3.0266, + "step": 55920 + }, + { + "epoch": 3.79976899035195, + "grad_norm": 6.592099666595459, + "learning_rate": 5.252327082484033e-06, + "loss": 3.0554, + "step": 55925 + }, + { + "epoch": 3.8001087104226117, + "grad_norm": 7.61477518081665, + "learning_rate": 5.251902432395707e-06, + "loss": 3.0285, + "step": 55930 + }, + { + "epoch": 3.8004484304932733, + "grad_norm": 6.509330749511719, + "learning_rate": 5.2514777823073795e-06, + "loss": 3.0514, + "step": 55935 + }, + { + "epoch": 3.8007881505639354, + "grad_norm": 5.938945293426514, + "learning_rate": 5.2510531322190514e-06, + "loss": 2.8769, + "step": 55940 + }, + { + "epoch": 3.801127870634597, + "grad_norm": 7.998153209686279, + "learning_rate": 5.250628482130725e-06, + "loss": 3.0844, + "step": 55945 + }, + { + "epoch": 3.8014675907052586, + "grad_norm": 9.109268188476562, + "learning_rate": 5.250203832042398e-06, + "loss": 2.973, + "step": 55950 + }, + { + "epoch": 3.8018073107759207, + "grad_norm": 7.097419738769531, + "learning_rate": 5.24977918195407e-06, + "loss": 2.717, + "step": 55955 + }, + { + "epoch": 3.8021470308465823, + "grad_norm": 5.6822052001953125, + "learning_rate": 5.2493545318657435e-06, + "loss": 2.9155, + "step": 55960 + }, + { + "epoch": 3.802486750917244, + "grad_norm": 6.7763237953186035, + "learning_rate": 5.248929881777416e-06, + "loss": 2.847, + "step": 55965 + }, + { + "epoch": 3.802826470987906, + "grad_norm": 7.041713237762451, + "learning_rate": 5.248505231689088e-06, + "loss": 3.1272, + "step": 55970 + }, + { + "epoch": 3.8031661910585677, + "grad_norm": 9.350690841674805, + "learning_rate": 5.248080581600762e-06, + "loss": 2.9953, + "step": 55975 + }, + { + "epoch": 3.8035059111292293, + "grad_norm": 6.623997688293457, + "learning_rate": 5.247655931512434e-06, + "loss": 2.8563, + "step": 55980 + }, + { + "epoch": 3.8038456311998914, + "grad_norm": 7.109527111053467, + "learning_rate": 5.247231281424107e-06, + "loss": 3.055, + "step": 55985 + }, + { + "epoch": 3.804185351270553, + "grad_norm": 9.01604175567627, + "learning_rate": 5.24680663133578e-06, + "loss": 2.8605, + "step": 55990 + }, + { + "epoch": 3.8045250713412146, + "grad_norm": 7.116501331329346, + "learning_rate": 5.246381981247452e-06, + "loss": 2.992, + "step": 55995 + }, + { + "epoch": 3.8048647914118767, + "grad_norm": 7.6634626388549805, + "learning_rate": 5.245957331159125e-06, + "loss": 2.8893, + "step": 56000 + }, + { + "epoch": 3.8052045114825384, + "grad_norm": 8.505950927734375, + "learning_rate": 5.245532681070799e-06, + "loss": 2.8523, + "step": 56005 + }, + { + "epoch": 3.8055442315532, + "grad_norm": 9.094532012939453, + "learning_rate": 5.245108030982471e-06, + "loss": 2.8215, + "step": 56010 + }, + { + "epoch": 3.805883951623862, + "grad_norm": 6.157588481903076, + "learning_rate": 5.2446833808941434e-06, + "loss": 2.8979, + "step": 56015 + }, + { + "epoch": 3.8062236716945237, + "grad_norm": 6.0371785163879395, + "learning_rate": 5.244258730805817e-06, + "loss": 2.938, + "step": 56020 + }, + { + "epoch": 3.8065633917651853, + "grad_norm": 8.411288261413574, + "learning_rate": 5.243834080717489e-06, + "loss": 2.9913, + "step": 56025 + }, + { + "epoch": 3.8069031118358474, + "grad_norm": 6.347329139709473, + "learning_rate": 5.243409430629162e-06, + "loss": 3.0086, + "step": 56030 + }, + { + "epoch": 3.807242831906509, + "grad_norm": 7.153141975402832, + "learning_rate": 5.2429847805408355e-06, + "loss": 2.7585, + "step": 56035 + }, + { + "epoch": 3.8075825519771707, + "grad_norm": 5.333366394042969, + "learning_rate": 5.2425601304525075e-06, + "loss": 2.9938, + "step": 56040 + }, + { + "epoch": 3.8079222720478327, + "grad_norm": 6.428933143615723, + "learning_rate": 5.24213548036418e-06, + "loss": 2.7946, + "step": 56045 + }, + { + "epoch": 3.8082619921184944, + "grad_norm": 10.633663177490234, + "learning_rate": 5.241710830275854e-06, + "loss": 3.1389, + "step": 56050 + }, + { + "epoch": 3.808601712189156, + "grad_norm": 8.432319641113281, + "learning_rate": 5.241286180187526e-06, + "loss": 3.0699, + "step": 56055 + }, + { + "epoch": 3.808941432259818, + "grad_norm": 6.372212886810303, + "learning_rate": 5.240861530099199e-06, + "loss": 2.9362, + "step": 56060 + }, + { + "epoch": 3.8092811523304797, + "grad_norm": 6.698460578918457, + "learning_rate": 5.2404368800108715e-06, + "loss": 3.0729, + "step": 56065 + }, + { + "epoch": 3.8096208724011413, + "grad_norm": 9.202035903930664, + "learning_rate": 5.240012229922544e-06, + "loss": 2.9654, + "step": 56070 + }, + { + "epoch": 3.8099605924718034, + "grad_norm": 8.046438217163086, + "learning_rate": 5.239587579834216e-06, + "loss": 3.085, + "step": 56075 + }, + { + "epoch": 3.810300312542465, + "grad_norm": 7.534450054168701, + "learning_rate": 5.23916292974589e-06, + "loss": 3.0587, + "step": 56080 + }, + { + "epoch": 3.8106400326131267, + "grad_norm": 6.602912425994873, + "learning_rate": 5.238738279657563e-06, + "loss": 2.8718, + "step": 56085 + }, + { + "epoch": 3.8109797526837887, + "grad_norm": 8.827362060546875, + "learning_rate": 5.238313629569235e-06, + "loss": 2.8, + "step": 56090 + }, + { + "epoch": 3.8113194727544504, + "grad_norm": 9.669992446899414, + "learning_rate": 5.237888979480908e-06, + "loss": 3.0606, + "step": 56095 + }, + { + "epoch": 3.811659192825112, + "grad_norm": 6.719114303588867, + "learning_rate": 5.237464329392581e-06, + "loss": 3.0147, + "step": 56100 + }, + { + "epoch": 3.811998912895774, + "grad_norm": 7.523292064666748, + "learning_rate": 5.237039679304253e-06, + "loss": 3.0027, + "step": 56105 + }, + { + "epoch": 3.8123386329664357, + "grad_norm": 6.661201000213623, + "learning_rate": 5.236615029215927e-06, + "loss": 3.0966, + "step": 56110 + }, + { + "epoch": 3.8126783530370973, + "grad_norm": 7.994574069976807, + "learning_rate": 5.2361903791275995e-06, + "loss": 3.1742, + "step": 56115 + }, + { + "epoch": 3.8130180731077594, + "grad_norm": 7.386528015136719, + "learning_rate": 5.235765729039271e-06, + "loss": 2.9561, + "step": 56120 + }, + { + "epoch": 3.813357793178421, + "grad_norm": 7.071661949157715, + "learning_rate": 5.235341078950945e-06, + "loss": 3.0204, + "step": 56125 + }, + { + "epoch": 3.8136975132490827, + "grad_norm": 7.4668402671813965, + "learning_rate": 5.234916428862618e-06, + "loss": 2.9084, + "step": 56130 + }, + { + "epoch": 3.8140372333197448, + "grad_norm": 8.11345386505127, + "learning_rate": 5.23449177877429e-06, + "loss": 2.8238, + "step": 56135 + }, + { + "epoch": 3.8143769533904064, + "grad_norm": 7.901330947875977, + "learning_rate": 5.2340671286859635e-06, + "loss": 2.8319, + "step": 56140 + }, + { + "epoch": 3.814716673461068, + "grad_norm": 9.573214530944824, + "learning_rate": 5.233642478597636e-06, + "loss": 2.9204, + "step": 56145 + }, + { + "epoch": 3.81505639353173, + "grad_norm": 7.3035359382629395, + "learning_rate": 5.233217828509308e-06, + "loss": 3.2106, + "step": 56150 + }, + { + "epoch": 3.8153961136023917, + "grad_norm": 7.948726177215576, + "learning_rate": 5.232793178420982e-06, + "loss": 2.856, + "step": 56155 + }, + { + "epoch": 3.8157358336730534, + "grad_norm": 9.839200973510742, + "learning_rate": 5.232368528332654e-06, + "loss": 2.6912, + "step": 56160 + }, + { + "epoch": 3.8160755537437154, + "grad_norm": 9.047606468200684, + "learning_rate": 5.231943878244327e-06, + "loss": 3.1197, + "step": 56165 + }, + { + "epoch": 3.816415273814377, + "grad_norm": 8.013545989990234, + "learning_rate": 5.231519228156e-06, + "loss": 2.862, + "step": 56170 + }, + { + "epoch": 3.8167549938850387, + "grad_norm": 6.692659854888916, + "learning_rate": 5.231094578067672e-06, + "loss": 2.8729, + "step": 56175 + }, + { + "epoch": 3.8170947139557008, + "grad_norm": 7.2746429443359375, + "learning_rate": 5.230669927979345e-06, + "loss": 2.9873, + "step": 56180 + }, + { + "epoch": 3.8174344340263624, + "grad_norm": 6.730053424835205, + "learning_rate": 5.230245277891019e-06, + "loss": 2.9565, + "step": 56185 + }, + { + "epoch": 3.817774154097024, + "grad_norm": 7.9081950187683105, + "learning_rate": 5.229820627802691e-06, + "loss": 3.0434, + "step": 56190 + }, + { + "epoch": 3.8181138741676857, + "grad_norm": 7.867705821990967, + "learning_rate": 5.2293959777143634e-06, + "loss": 3.0775, + "step": 56195 + }, + { + "epoch": 3.8184535942383477, + "grad_norm": 7.76271390914917, + "learning_rate": 5.228971327626037e-06, + "loss": 3.0789, + "step": 56200 + }, + { + "epoch": 3.8187933143090094, + "grad_norm": 8.029452323913574, + "learning_rate": 5.228546677537709e-06, + "loss": 2.7722, + "step": 56205 + }, + { + "epoch": 3.819133034379671, + "grad_norm": 7.164859771728516, + "learning_rate": 5.228122027449383e-06, + "loss": 2.9333, + "step": 56210 + }, + { + "epoch": 3.819472754450333, + "grad_norm": 7.989279270172119, + "learning_rate": 5.2276973773610555e-06, + "loss": 2.9518, + "step": 56215 + }, + { + "epoch": 3.8198124745209947, + "grad_norm": 8.178238868713379, + "learning_rate": 5.2272727272727274e-06, + "loss": 3.1685, + "step": 56220 + }, + { + "epoch": 3.8201521945916563, + "grad_norm": 8.32581615447998, + "learning_rate": 5.226848077184401e-06, + "loss": 2.8633, + "step": 56225 + }, + { + "epoch": 3.8204919146623184, + "grad_norm": 7.067039966583252, + "learning_rate": 5.226423427096073e-06, + "loss": 2.7793, + "step": 56230 + }, + { + "epoch": 3.82083163473298, + "grad_norm": 6.751238822937012, + "learning_rate": 5.225998777007746e-06, + "loss": 3.0877, + "step": 56235 + }, + { + "epoch": 3.8211713548036417, + "grad_norm": 6.050195693969727, + "learning_rate": 5.2255741269194195e-06, + "loss": 2.9896, + "step": 56240 + }, + { + "epoch": 3.8215110748743033, + "grad_norm": 7.555814266204834, + "learning_rate": 5.2251494768310914e-06, + "loss": 2.9163, + "step": 56245 + }, + { + "epoch": 3.8218507949449654, + "grad_norm": 8.396634101867676, + "learning_rate": 5.224724826742764e-06, + "loss": 2.8424, + "step": 56250 + }, + { + "epoch": 3.822190515015627, + "grad_norm": 7.870179176330566, + "learning_rate": 5.224300176654438e-06, + "loss": 3.0697, + "step": 56255 + }, + { + "epoch": 3.8225302350862886, + "grad_norm": 6.698899745941162, + "learning_rate": 5.22387552656611e-06, + "loss": 2.8742, + "step": 56260 + }, + { + "epoch": 3.8228699551569507, + "grad_norm": 5.639778137207031, + "learning_rate": 5.223450876477783e-06, + "loss": 2.8219, + "step": 56265 + }, + { + "epoch": 3.8232096752276123, + "grad_norm": 5.457712173461914, + "learning_rate": 5.223026226389456e-06, + "loss": 2.7059, + "step": 56270 + }, + { + "epoch": 3.823549395298274, + "grad_norm": 6.4339070320129395, + "learning_rate": 5.222601576301128e-06, + "loss": 2.997, + "step": 56275 + }, + { + "epoch": 3.823889115368936, + "grad_norm": 9.268007278442383, + "learning_rate": 5.222176926212801e-06, + "loss": 3.0491, + "step": 56280 + }, + { + "epoch": 3.8242288354395977, + "grad_norm": 8.744354248046875, + "learning_rate": 5.221752276124475e-06, + "loss": 3.088, + "step": 56285 + }, + { + "epoch": 3.8245685555102593, + "grad_norm": 6.9078474044799805, + "learning_rate": 5.221327626036147e-06, + "loss": 2.9801, + "step": 56290 + }, + { + "epoch": 3.8249082755809214, + "grad_norm": 6.913817405700684, + "learning_rate": 5.2209029759478194e-06, + "loss": 2.8883, + "step": 56295 + }, + { + "epoch": 3.825247995651583, + "grad_norm": 6.403287410736084, + "learning_rate": 5.220478325859492e-06, + "loss": 3.0555, + "step": 56300 + }, + { + "epoch": 3.8255877157222447, + "grad_norm": 6.91747522354126, + "learning_rate": 5.220053675771165e-06, + "loss": 2.9859, + "step": 56305 + }, + { + "epoch": 3.8259274357929067, + "grad_norm": 9.089579582214355, + "learning_rate": 5.219629025682838e-06, + "loss": 3.1364, + "step": 56310 + }, + { + "epoch": 3.8262671558635684, + "grad_norm": 5.852970123291016, + "learning_rate": 5.219204375594511e-06, + "loss": 3.0132, + "step": 56315 + }, + { + "epoch": 3.82660687593423, + "grad_norm": 7.079545974731445, + "learning_rate": 5.2187797255061834e-06, + "loss": 2.6663, + "step": 56320 + }, + { + "epoch": 3.826946596004892, + "grad_norm": 7.376994609832764, + "learning_rate": 5.218355075417855e-06, + "loss": 2.9419, + "step": 56325 + }, + { + "epoch": 3.8272863160755537, + "grad_norm": 5.783348560333252, + "learning_rate": 5.217930425329529e-06, + "loss": 2.8313, + "step": 56330 + }, + { + "epoch": 3.8276260361462153, + "grad_norm": 8.562227249145508, + "learning_rate": 5.217505775241202e-06, + "loss": 3.015, + "step": 56335 + }, + { + "epoch": 3.8279657562168774, + "grad_norm": 5.850096702575684, + "learning_rate": 5.217081125152874e-06, + "loss": 3.0597, + "step": 56340 + }, + { + "epoch": 3.828305476287539, + "grad_norm": 6.793254375457764, + "learning_rate": 5.2166564750645475e-06, + "loss": 3.0003, + "step": 56345 + }, + { + "epoch": 3.8286451963582007, + "grad_norm": 7.1840009689331055, + "learning_rate": 5.21623182497622e-06, + "loss": 3.0925, + "step": 56350 + }, + { + "epoch": 3.8289849164288627, + "grad_norm": 9.634355545043945, + "learning_rate": 5.215807174887892e-06, + "loss": 2.8231, + "step": 56355 + }, + { + "epoch": 3.8293246364995244, + "grad_norm": 7.494197845458984, + "learning_rate": 5.215382524799566e-06, + "loss": 3.0008, + "step": 56360 + }, + { + "epoch": 3.829664356570186, + "grad_norm": 7.913677215576172, + "learning_rate": 5.214957874711239e-06, + "loss": 2.9155, + "step": 56365 + }, + { + "epoch": 3.830004076640848, + "grad_norm": 6.626039981842041, + "learning_rate": 5.214533224622911e-06, + "loss": 3.0258, + "step": 56370 + }, + { + "epoch": 3.8303437967115097, + "grad_norm": 9.759196281433105, + "learning_rate": 5.214108574534584e-06, + "loss": 2.9593, + "step": 56375 + }, + { + "epoch": 3.8306835167821713, + "grad_norm": 8.313185691833496, + "learning_rate": 5.213683924446257e-06, + "loss": 2.9039, + "step": 56380 + }, + { + "epoch": 3.8310232368528334, + "grad_norm": 6.41386604309082, + "learning_rate": 5.213259274357929e-06, + "loss": 2.8677, + "step": 56385 + }, + { + "epoch": 3.831362956923495, + "grad_norm": 8.042376518249512, + "learning_rate": 5.212834624269603e-06, + "loss": 2.8956, + "step": 56390 + }, + { + "epoch": 3.8317026769941567, + "grad_norm": 10.195067405700684, + "learning_rate": 5.212409974181275e-06, + "loss": 3.0552, + "step": 56395 + }, + { + "epoch": 3.8320423970648188, + "grad_norm": 6.23694372177124, + "learning_rate": 5.211985324092947e-06, + "loss": 2.8809, + "step": 56400 + }, + { + "epoch": 3.8323821171354804, + "grad_norm": 7.942667007446289, + "learning_rate": 5.211560674004621e-06, + "loss": 3.0043, + "step": 56405 + }, + { + "epoch": 3.832721837206142, + "grad_norm": 6.55766487121582, + "learning_rate": 5.211136023916293e-06, + "loss": 3.0232, + "step": 56410 + }, + { + "epoch": 3.833061557276804, + "grad_norm": 7.621715545654297, + "learning_rate": 5.210711373827966e-06, + "loss": 2.8139, + "step": 56415 + }, + { + "epoch": 3.8334012773474657, + "grad_norm": 7.486972808837891, + "learning_rate": 5.2102867237396395e-06, + "loss": 2.7173, + "step": 56420 + }, + { + "epoch": 3.8337409974181273, + "grad_norm": 7.842951774597168, + "learning_rate": 5.209862073651311e-06, + "loss": 2.8237, + "step": 56425 + }, + { + "epoch": 3.8340807174887894, + "grad_norm": 8.196451187133789, + "learning_rate": 5.209437423562984e-06, + "loss": 2.8567, + "step": 56430 + }, + { + "epoch": 3.834420437559451, + "grad_norm": 7.435050010681152, + "learning_rate": 5.209012773474658e-06, + "loss": 2.961, + "step": 56435 + }, + { + "epoch": 3.8347601576301127, + "grad_norm": 7.082326889038086, + "learning_rate": 5.20858812338633e-06, + "loss": 2.8702, + "step": 56440 + }, + { + "epoch": 3.8350998777007748, + "grad_norm": 7.31846809387207, + "learning_rate": 5.208163473298003e-06, + "loss": 2.8186, + "step": 56445 + }, + { + "epoch": 3.8354395977714364, + "grad_norm": 7.971628665924072, + "learning_rate": 5.207738823209676e-06, + "loss": 2.9634, + "step": 56450 + }, + { + "epoch": 3.835779317842098, + "grad_norm": 6.483433723449707, + "learning_rate": 5.207314173121348e-06, + "loss": 3.1127, + "step": 56455 + }, + { + "epoch": 3.83611903791276, + "grad_norm": 6.899120330810547, + "learning_rate": 5.206889523033021e-06, + "loss": 3.0665, + "step": 56460 + }, + { + "epoch": 3.8364587579834217, + "grad_norm": 7.324709892272949, + "learning_rate": 5.206464872944695e-06, + "loss": 2.9057, + "step": 56465 + }, + { + "epoch": 3.8367984780540834, + "grad_norm": 7.0837812423706055, + "learning_rate": 5.206040222856367e-06, + "loss": 2.892, + "step": 56470 + }, + { + "epoch": 3.8371381981247454, + "grad_norm": 8.229080200195312, + "learning_rate": 5.205615572768039e-06, + "loss": 3.0944, + "step": 56475 + }, + { + "epoch": 3.837477918195407, + "grad_norm": 8.137633323669434, + "learning_rate": 5.205190922679712e-06, + "loss": 2.9758, + "step": 56480 + }, + { + "epoch": 3.8378176382660687, + "grad_norm": 7.603518962860107, + "learning_rate": 5.204766272591385e-06, + "loss": 3.1862, + "step": 56485 + }, + { + "epoch": 3.8381573583367308, + "grad_norm": 10.09211540222168, + "learning_rate": 5.204341622503058e-06, + "loss": 2.9237, + "step": 56490 + }, + { + "epoch": 3.8384970784073924, + "grad_norm": 5.755003929138184, + "learning_rate": 5.203916972414731e-06, + "loss": 3.183, + "step": 56495 + }, + { + "epoch": 3.838836798478054, + "grad_norm": 8.914250373840332, + "learning_rate": 5.2034923223264034e-06, + "loss": 2.9686, + "step": 56500 + }, + { + "epoch": 3.839176518548716, + "grad_norm": 6.0821533203125, + "learning_rate": 5.203067672238075e-06, + "loss": 2.8551, + "step": 56505 + }, + { + "epoch": 3.8395162386193777, + "grad_norm": 7.043781757354736, + "learning_rate": 5.202643022149749e-06, + "loss": 3.0783, + "step": 56510 + }, + { + "epoch": 3.8398559586900394, + "grad_norm": 7.75283670425415, + "learning_rate": 5.202218372061422e-06, + "loss": 2.9373, + "step": 56515 + }, + { + "epoch": 3.8401956787607014, + "grad_norm": 8.605851173400879, + "learning_rate": 5.201793721973094e-06, + "loss": 3.038, + "step": 56520 + }, + { + "epoch": 3.840535398831363, + "grad_norm": 6.412093162536621, + "learning_rate": 5.2013690718847674e-06, + "loss": 2.7069, + "step": 56525 + }, + { + "epoch": 3.8408751189020247, + "grad_norm": 7.32870626449585, + "learning_rate": 5.20094442179644e-06, + "loss": 2.995, + "step": 56530 + }, + { + "epoch": 3.8412148389726863, + "grad_norm": 6.361015319824219, + "learning_rate": 5.200519771708112e-06, + "loss": 2.7567, + "step": 56535 + }, + { + "epoch": 3.8415545590433484, + "grad_norm": 7.982752799987793, + "learning_rate": 5.200095121619786e-06, + "loss": 3.2236, + "step": 56540 + }, + { + "epoch": 3.84189427911401, + "grad_norm": 9.537859916687012, + "learning_rate": 5.199670471531459e-06, + "loss": 3.134, + "step": 56545 + }, + { + "epoch": 3.8422339991846717, + "grad_norm": 6.409560680389404, + "learning_rate": 5.1992458214431314e-06, + "loss": 2.9779, + "step": 56550 + }, + { + "epoch": 3.8425737192553338, + "grad_norm": 9.815372467041016, + "learning_rate": 5.198821171354804e-06, + "loss": 3.1541, + "step": 56555 + }, + { + "epoch": 3.8429134393259954, + "grad_norm": 7.218655586242676, + "learning_rate": 5.198396521266477e-06, + "loss": 2.9961, + "step": 56560 + }, + { + "epoch": 3.843253159396657, + "grad_norm": 7.04759407043457, + "learning_rate": 5.19797187117815e-06, + "loss": 2.8091, + "step": 56565 + }, + { + "epoch": 3.843592879467319, + "grad_norm": 7.343015670776367, + "learning_rate": 5.197547221089823e-06, + "loss": 2.9826, + "step": 56570 + }, + { + "epoch": 3.8439325995379807, + "grad_norm": 7.497339725494385, + "learning_rate": 5.197122571001495e-06, + "loss": 2.7354, + "step": 56575 + }, + { + "epoch": 3.8442723196086424, + "grad_norm": 5.802175045013428, + "learning_rate": 5.196697920913168e-06, + "loss": 3.0614, + "step": 56580 + }, + { + "epoch": 3.844612039679304, + "grad_norm": 8.512466430664062, + "learning_rate": 5.196273270824841e-06, + "loss": 3.2179, + "step": 56585 + }, + { + "epoch": 3.844951759749966, + "grad_norm": 6.932217597961426, + "learning_rate": 5.195848620736513e-06, + "loss": 2.8574, + "step": 56590 + }, + { + "epoch": 3.8452914798206277, + "grad_norm": 7.017580032348633, + "learning_rate": 5.195423970648187e-06, + "loss": 2.83, + "step": 56595 + }, + { + "epoch": 3.8456311998912893, + "grad_norm": 8.069262504577637, + "learning_rate": 5.1949993205598594e-06, + "loss": 3.1041, + "step": 56600 + }, + { + "epoch": 3.8459709199619514, + "grad_norm": 7.006806373596191, + "learning_rate": 5.194574670471531e-06, + "loss": 2.8314, + "step": 56605 + }, + { + "epoch": 3.846310640032613, + "grad_norm": 5.430966854095459, + "learning_rate": 5.194150020383205e-06, + "loss": 2.8363, + "step": 56610 + }, + { + "epoch": 3.8466503601032747, + "grad_norm": 6.105525493621826, + "learning_rate": 5.193725370294878e-06, + "loss": 2.8948, + "step": 56615 + }, + { + "epoch": 3.8469900801739367, + "grad_norm": 7.1504387855529785, + "learning_rate": 5.19330072020655e-06, + "loss": 2.9041, + "step": 56620 + }, + { + "epoch": 3.8473298002445984, + "grad_norm": 6.3962788581848145, + "learning_rate": 5.1928760701182234e-06, + "loss": 2.7839, + "step": 56625 + }, + { + "epoch": 3.84766952031526, + "grad_norm": 7.790157794952393, + "learning_rate": 5.192451420029896e-06, + "loss": 3.02, + "step": 56630 + }, + { + "epoch": 3.848009240385922, + "grad_norm": 7.621463298797607, + "learning_rate": 5.192026769941568e-06, + "loss": 3.1422, + "step": 56635 + }, + { + "epoch": 3.8483489604565837, + "grad_norm": 6.417639255523682, + "learning_rate": 5.191602119853242e-06, + "loss": 3.0158, + "step": 56640 + }, + { + "epoch": 3.8486886805272453, + "grad_norm": 6.005905628204346, + "learning_rate": 5.191177469764914e-06, + "loss": 3.125, + "step": 56645 + }, + { + "epoch": 3.8490284005979074, + "grad_norm": 5.954068183898926, + "learning_rate": 5.190752819676587e-06, + "loss": 2.7919, + "step": 56650 + }, + { + "epoch": 3.849368120668569, + "grad_norm": 7.800382137298584, + "learning_rate": 5.19032816958826e-06, + "loss": 3.0017, + "step": 56655 + }, + { + "epoch": 3.8497078407392307, + "grad_norm": 7.7108473777771, + "learning_rate": 5.189903519499932e-06, + "loss": 3.129, + "step": 56660 + }, + { + "epoch": 3.8500475608098927, + "grad_norm": 8.246349334716797, + "learning_rate": 5.189478869411605e-06, + "loss": 2.9518, + "step": 56665 + }, + { + "epoch": 3.8503872808805544, + "grad_norm": 6.561103820800781, + "learning_rate": 5.189054219323279e-06, + "loss": 2.9045, + "step": 56670 + }, + { + "epoch": 3.850727000951216, + "grad_norm": 6.615615367889404, + "learning_rate": 5.188629569234951e-06, + "loss": 2.9909, + "step": 56675 + }, + { + "epoch": 3.851066721021878, + "grad_norm": 9.26103687286377, + "learning_rate": 5.188204919146623e-06, + "loss": 2.9351, + "step": 56680 + }, + { + "epoch": 3.8514064410925397, + "grad_norm": 7.308867454528809, + "learning_rate": 5.187780269058297e-06, + "loss": 2.7021, + "step": 56685 + }, + { + "epoch": 3.8517461611632013, + "grad_norm": 6.714588165283203, + "learning_rate": 5.187355618969969e-06, + "loss": 2.7622, + "step": 56690 + }, + { + "epoch": 3.8520858812338634, + "grad_norm": 8.570169448852539, + "learning_rate": 5.186930968881642e-06, + "loss": 3.1488, + "step": 56695 + }, + { + "epoch": 3.852425601304525, + "grad_norm": 7.433956146240234, + "learning_rate": 5.1865063187933155e-06, + "loss": 2.7862, + "step": 56700 + }, + { + "epoch": 3.8527653213751867, + "grad_norm": 6.894741535186768, + "learning_rate": 5.186081668704987e-06, + "loss": 2.723, + "step": 56705 + }, + { + "epoch": 3.8531050414458488, + "grad_norm": 6.293022632598877, + "learning_rate": 5.18565701861666e-06, + "loss": 2.967, + "step": 56710 + }, + { + "epoch": 3.8534447615165104, + "grad_norm": 6.918468475341797, + "learning_rate": 5.185232368528334e-06, + "loss": 2.8983, + "step": 56715 + }, + { + "epoch": 3.853784481587172, + "grad_norm": 5.738967418670654, + "learning_rate": 5.184807718440006e-06, + "loss": 2.9955, + "step": 56720 + }, + { + "epoch": 3.854124201657834, + "grad_norm": 7.4023823738098145, + "learning_rate": 5.184383068351679e-06, + "loss": 2.9487, + "step": 56725 + }, + { + "epoch": 3.8544639217284957, + "grad_norm": 7.42182731628418, + "learning_rate": 5.183958418263351e-06, + "loss": 2.9009, + "step": 56730 + }, + { + "epoch": 3.8548036417991574, + "grad_norm": 7.092685222625732, + "learning_rate": 5.183533768175024e-06, + "loss": 2.9067, + "step": 56735 + }, + { + "epoch": 3.8551433618698194, + "grad_norm": 7.9974164962768555, + "learning_rate": 5.183109118086696e-06, + "loss": 3.0705, + "step": 56740 + }, + { + "epoch": 3.855483081940481, + "grad_norm": 7.739042282104492, + "learning_rate": 5.18268446799837e-06, + "loss": 2.7641, + "step": 56745 + }, + { + "epoch": 3.8558228020111427, + "grad_norm": 5.993439674377441, + "learning_rate": 5.182259817910043e-06, + "loss": 3.0365, + "step": 56750 + }, + { + "epoch": 3.8561625220818048, + "grad_norm": 6.60123348236084, + "learning_rate": 5.1818351678217146e-06, + "loss": 2.9436, + "step": 56755 + }, + { + "epoch": 3.8565022421524664, + "grad_norm": 6.110010147094727, + "learning_rate": 5.181410517733388e-06, + "loss": 2.9847, + "step": 56760 + }, + { + "epoch": 3.856841962223128, + "grad_norm": 6.0788116455078125, + "learning_rate": 5.180985867645061e-06, + "loss": 2.8623, + "step": 56765 + }, + { + "epoch": 3.85718168229379, + "grad_norm": 9.027710914611816, + "learning_rate": 5.180561217556733e-06, + "loss": 3.0966, + "step": 56770 + }, + { + "epoch": 3.8575214023644517, + "grad_norm": 5.672186851501465, + "learning_rate": 5.180136567468407e-06, + "loss": 2.9587, + "step": 56775 + }, + { + "epoch": 3.8578611224351134, + "grad_norm": 6.504351615905762, + "learning_rate": 5.179711917380079e-06, + "loss": 2.8263, + "step": 56780 + }, + { + "epoch": 3.8582008425057754, + "grad_norm": 6.372799396514893, + "learning_rate": 5.179287267291751e-06, + "loss": 2.8201, + "step": 56785 + }, + { + "epoch": 3.858540562576437, + "grad_norm": 8.225695610046387, + "learning_rate": 5.178862617203425e-06, + "loss": 2.8528, + "step": 56790 + }, + { + "epoch": 3.8588802826470987, + "grad_norm": 6.083288192749023, + "learning_rate": 5.178437967115098e-06, + "loss": 3.0837, + "step": 56795 + }, + { + "epoch": 3.8592200027177608, + "grad_norm": 8.36689281463623, + "learning_rate": 5.17801331702677e-06, + "loss": 2.7783, + "step": 56800 + }, + { + "epoch": 3.8595597227884224, + "grad_norm": 8.346795082092285, + "learning_rate": 5.1775886669384434e-06, + "loss": 2.8615, + "step": 56805 + }, + { + "epoch": 3.859899442859084, + "grad_norm": 6.832315921783447, + "learning_rate": 5.177164016850116e-06, + "loss": 3.0106, + "step": 56810 + }, + { + "epoch": 3.860239162929746, + "grad_norm": 8.162426948547363, + "learning_rate": 5.176739366761788e-06, + "loss": 2.9263, + "step": 56815 + }, + { + "epoch": 3.8605788830004077, + "grad_norm": 7.498331069946289, + "learning_rate": 5.176314716673462e-06, + "loss": 3.112, + "step": 56820 + }, + { + "epoch": 3.8609186030710694, + "grad_norm": 7.317134857177734, + "learning_rate": 5.175890066585134e-06, + "loss": 3.0396, + "step": 56825 + }, + { + "epoch": 3.8612583231417315, + "grad_norm": 6.724002838134766, + "learning_rate": 5.175465416496807e-06, + "loss": 2.8174, + "step": 56830 + }, + { + "epoch": 3.861598043212393, + "grad_norm": 7.09665584564209, + "learning_rate": 5.17504076640848e-06, + "loss": 2.7672, + "step": 56835 + }, + { + "epoch": 3.8619377632830547, + "grad_norm": 7.831989765167236, + "learning_rate": 5.174616116320152e-06, + "loss": 2.9046, + "step": 56840 + }, + { + "epoch": 3.862277483353717, + "grad_norm": 7.4202728271484375, + "learning_rate": 5.174191466231825e-06, + "loss": 2.8068, + "step": 56845 + }, + { + "epoch": 3.8626172034243784, + "grad_norm": 8.125340461730957, + "learning_rate": 5.173766816143499e-06, + "loss": 2.7818, + "step": 56850 + }, + { + "epoch": 3.86295692349504, + "grad_norm": 7.884785175323486, + "learning_rate": 5.173342166055171e-06, + "loss": 3.0073, + "step": 56855 + }, + { + "epoch": 3.863296643565702, + "grad_norm": 6.088525772094727, + "learning_rate": 5.172917515966843e-06, + "loss": 2.721, + "step": 56860 + }, + { + "epoch": 3.8636363636363638, + "grad_norm": 8.491461753845215, + "learning_rate": 5.172492865878517e-06, + "loss": 2.6884, + "step": 56865 + }, + { + "epoch": 3.8639760837070254, + "grad_norm": 6.470210075378418, + "learning_rate": 5.172068215790189e-06, + "loss": 2.951, + "step": 56870 + }, + { + "epoch": 3.864315803777687, + "grad_norm": 7.754590034484863, + "learning_rate": 5.171643565701862e-06, + "loss": 3.226, + "step": 56875 + }, + { + "epoch": 3.864655523848349, + "grad_norm": 6.824779987335205, + "learning_rate": 5.1712189156135354e-06, + "loss": 2.9884, + "step": 56880 + }, + { + "epoch": 3.8649952439190107, + "grad_norm": 8.613801002502441, + "learning_rate": 5.170794265525207e-06, + "loss": 2.7972, + "step": 56885 + }, + { + "epoch": 3.8653349639896724, + "grad_norm": 7.5393805503845215, + "learning_rate": 5.170369615436881e-06, + "loss": 2.9458, + "step": 56890 + }, + { + "epoch": 3.8656746840603344, + "grad_norm": 6.440260887145996, + "learning_rate": 5.169944965348553e-06, + "loss": 2.8062, + "step": 56895 + }, + { + "epoch": 3.866014404130996, + "grad_norm": 8.169600486755371, + "learning_rate": 5.169520315260226e-06, + "loss": 3.0227, + "step": 56900 + }, + { + "epoch": 3.8663541242016577, + "grad_norm": 5.718965530395508, + "learning_rate": 5.1690956651718994e-06, + "loss": 2.8962, + "step": 56905 + }, + { + "epoch": 3.8666938442723198, + "grad_norm": 8.720466613769531, + "learning_rate": 5.168671015083571e-06, + "loss": 3.1889, + "step": 56910 + }, + { + "epoch": 3.8670335643429814, + "grad_norm": 7.681772708892822, + "learning_rate": 5.168246364995244e-06, + "loss": 2.9245, + "step": 56915 + }, + { + "epoch": 3.867373284413643, + "grad_norm": 6.948947429656982, + "learning_rate": 5.167821714906918e-06, + "loss": 2.9647, + "step": 56920 + }, + { + "epoch": 3.8677130044843047, + "grad_norm": 6.453523635864258, + "learning_rate": 5.16739706481859e-06, + "loss": 2.7864, + "step": 56925 + }, + { + "epoch": 3.8680527245549667, + "grad_norm": 7.975355625152588, + "learning_rate": 5.166972414730263e-06, + "loss": 2.9605, + "step": 56930 + }, + { + "epoch": 3.8683924446256284, + "grad_norm": 7.731218338012695, + "learning_rate": 5.166547764641936e-06, + "loss": 2.7863, + "step": 56935 + }, + { + "epoch": 3.86873216469629, + "grad_norm": 9.277607917785645, + "learning_rate": 5.166123114553608e-06, + "loss": 2.9486, + "step": 56940 + }, + { + "epoch": 3.869071884766952, + "grad_norm": 8.573084831237793, + "learning_rate": 5.165698464465281e-06, + "loss": 3.2097, + "step": 56945 + }, + { + "epoch": 3.8694116048376137, + "grad_norm": 7.037232875823975, + "learning_rate": 5.165273814376955e-06, + "loss": 2.8891, + "step": 56950 + }, + { + "epoch": 3.8697513249082753, + "grad_norm": 6.74941873550415, + "learning_rate": 5.164849164288627e-06, + "loss": 2.9854, + "step": 56955 + }, + { + "epoch": 3.8700910449789374, + "grad_norm": 8.394638061523438, + "learning_rate": 5.164424514200299e-06, + "loss": 3.0444, + "step": 56960 + }, + { + "epoch": 3.870430765049599, + "grad_norm": 7.41395902633667, + "learning_rate": 5.163999864111972e-06, + "loss": 3.0748, + "step": 56965 + }, + { + "epoch": 3.8707704851202607, + "grad_norm": 6.948450565338135, + "learning_rate": 5.163575214023645e-06, + "loss": 2.9784, + "step": 56970 + }, + { + "epoch": 3.8711102051909227, + "grad_norm": 7.794321060180664, + "learning_rate": 5.163150563935318e-06, + "loss": 2.8694, + "step": 56975 + }, + { + "epoch": 3.8714499252615844, + "grad_norm": 6.2092790603637695, + "learning_rate": 5.162725913846991e-06, + "loss": 2.979, + "step": 56980 + }, + { + "epoch": 3.871789645332246, + "grad_norm": 6.364084720611572, + "learning_rate": 5.162301263758663e-06, + "loss": 2.723, + "step": 56985 + }, + { + "epoch": 3.872129365402908, + "grad_norm": 9.039603233337402, + "learning_rate": 5.161876613670335e-06, + "loss": 2.9702, + "step": 56990 + }, + { + "epoch": 3.8724690854735697, + "grad_norm": 7.018442153930664, + "learning_rate": 5.161451963582009e-06, + "loss": 2.9543, + "step": 56995 + }, + { + "epoch": 3.8728088055442313, + "grad_norm": 5.690568447113037, + "learning_rate": 5.161027313493682e-06, + "loss": 2.9734, + "step": 57000 + }, + { + "epoch": 3.8731485256148934, + "grad_norm": 6.236125469207764, + "learning_rate": 5.160602663405354e-06, + "loss": 2.9466, + "step": 57005 + }, + { + "epoch": 3.873488245685555, + "grad_norm": 7.659172058105469, + "learning_rate": 5.160178013317027e-06, + "loss": 2.9438, + "step": 57010 + }, + { + "epoch": 3.8738279657562167, + "grad_norm": 6.58408260345459, + "learning_rate": 5.1597533632287e-06, + "loss": 2.8832, + "step": 57015 + }, + { + "epoch": 3.8741676858268788, + "grad_norm": 7.6371684074401855, + "learning_rate": 5.159328713140372e-06, + "loss": 3.0362, + "step": 57020 + }, + { + "epoch": 3.8745074058975404, + "grad_norm": 7.978360652923584, + "learning_rate": 5.158904063052046e-06, + "loss": 2.9069, + "step": 57025 + }, + { + "epoch": 3.874847125968202, + "grad_norm": 6.785677909851074, + "learning_rate": 5.158479412963719e-06, + "loss": 2.7973, + "step": 57030 + }, + { + "epoch": 3.875186846038864, + "grad_norm": 7.223132610321045, + "learning_rate": 5.1580547628753906e-06, + "loss": 2.9992, + "step": 57035 + }, + { + "epoch": 3.8755265661095257, + "grad_norm": 6.544042587280273, + "learning_rate": 5.157630112787064e-06, + "loss": 2.8235, + "step": 57040 + }, + { + "epoch": 3.8758662861801874, + "grad_norm": 7.469699859619141, + "learning_rate": 5.157205462698737e-06, + "loss": 2.8274, + "step": 57045 + }, + { + "epoch": 3.8762060062508494, + "grad_norm": 8.193137168884277, + "learning_rate": 5.156780812610409e-06, + "loss": 2.8553, + "step": 57050 + }, + { + "epoch": 3.876545726321511, + "grad_norm": 8.336374282836914, + "learning_rate": 5.156356162522083e-06, + "loss": 2.833, + "step": 57055 + }, + { + "epoch": 3.8768854463921727, + "grad_norm": 6.450822353363037, + "learning_rate": 5.155931512433755e-06, + "loss": 2.9793, + "step": 57060 + }, + { + "epoch": 3.8772251664628348, + "grad_norm": 8.376712799072266, + "learning_rate": 5.155506862345427e-06, + "loss": 2.7702, + "step": 57065 + }, + { + "epoch": 3.8775648865334964, + "grad_norm": 7.8536834716796875, + "learning_rate": 5.155082212257101e-06, + "loss": 2.8131, + "step": 57070 + }, + { + "epoch": 3.877904606604158, + "grad_norm": 6.61588716506958, + "learning_rate": 5.154657562168773e-06, + "loss": 3.0722, + "step": 57075 + }, + { + "epoch": 3.87824432667482, + "grad_norm": 7.149332523345947, + "learning_rate": 5.154232912080446e-06, + "loss": 2.7457, + "step": 57080 + }, + { + "epoch": 3.8785840467454817, + "grad_norm": 7.2796149253845215, + "learning_rate": 5.153808261992119e-06, + "loss": 3.3032, + "step": 57085 + }, + { + "epoch": 3.8789237668161434, + "grad_norm": 9.94266414642334, + "learning_rate": 5.153383611903791e-06, + "loss": 3.1321, + "step": 57090 + }, + { + "epoch": 3.8792634868868054, + "grad_norm": 8.43735122680664, + "learning_rate": 5.152958961815464e-06, + "loss": 2.9257, + "step": 57095 + }, + { + "epoch": 3.879603206957467, + "grad_norm": 6.759091377258301, + "learning_rate": 5.152534311727138e-06, + "loss": 2.808, + "step": 57100 + }, + { + "epoch": 3.8799429270281287, + "grad_norm": 8.34271240234375, + "learning_rate": 5.15210966163881e-06, + "loss": 2.9732, + "step": 57105 + }, + { + "epoch": 3.880282647098791, + "grad_norm": 6.685417652130127, + "learning_rate": 5.151685011550483e-06, + "loss": 2.9675, + "step": 57110 + }, + { + "epoch": 3.8806223671694524, + "grad_norm": 5.936111927032471, + "learning_rate": 5.151260361462156e-06, + "loss": 3.0104, + "step": 57115 + }, + { + "epoch": 3.880962087240114, + "grad_norm": 8.880756378173828, + "learning_rate": 5.150835711373828e-06, + "loss": 2.8473, + "step": 57120 + }, + { + "epoch": 3.881301807310776, + "grad_norm": 6.6762285232543945, + "learning_rate": 5.150411061285501e-06, + "loss": 2.9705, + "step": 57125 + }, + { + "epoch": 3.8816415273814378, + "grad_norm": 6.389550685882568, + "learning_rate": 5.149986411197175e-06, + "loss": 3.1259, + "step": 57130 + }, + { + "epoch": 3.8819812474520994, + "grad_norm": 7.699306011199951, + "learning_rate": 5.149561761108847e-06, + "loss": 2.9123, + "step": 57135 + }, + { + "epoch": 3.8823209675227615, + "grad_norm": 7.09928560256958, + "learning_rate": 5.149137111020519e-06, + "loss": 2.9099, + "step": 57140 + }, + { + "epoch": 3.882660687593423, + "grad_norm": 6.4385223388671875, + "learning_rate": 5.148712460932192e-06, + "loss": 2.9073, + "step": 57145 + }, + { + "epoch": 3.8830004076640847, + "grad_norm": 7.4853668212890625, + "learning_rate": 5.148287810843865e-06, + "loss": 2.8927, + "step": 57150 + }, + { + "epoch": 3.883340127734747, + "grad_norm": 7.117622375488281, + "learning_rate": 5.147863160755538e-06, + "loss": 2.7346, + "step": 57155 + }, + { + "epoch": 3.8836798478054084, + "grad_norm": 7.040567874908447, + "learning_rate": 5.147438510667211e-06, + "loss": 3.3643, + "step": 57160 + }, + { + "epoch": 3.88401956787607, + "grad_norm": 7.411393642425537, + "learning_rate": 5.147013860578883e-06, + "loss": 3.0366, + "step": 57165 + }, + { + "epoch": 3.884359287946732, + "grad_norm": 6.478836536407471, + "learning_rate": 5.146589210490555e-06, + "loss": 3.0759, + "step": 57170 + }, + { + "epoch": 3.8846990080173938, + "grad_norm": 7.403102874755859, + "learning_rate": 5.146164560402229e-06, + "loss": 2.7962, + "step": 57175 + }, + { + "epoch": 3.8850387280880554, + "grad_norm": 6.485502243041992, + "learning_rate": 5.145739910313902e-06, + "loss": 2.7026, + "step": 57180 + }, + { + "epoch": 3.8853784481587175, + "grad_norm": 6.606417655944824, + "learning_rate": 5.145315260225574e-06, + "loss": 2.9195, + "step": 57185 + }, + { + "epoch": 3.885718168229379, + "grad_norm": 6.489627361297607, + "learning_rate": 5.144890610137247e-06, + "loss": 2.9795, + "step": 57190 + }, + { + "epoch": 3.8860578883000407, + "grad_norm": 5.377984046936035, + "learning_rate": 5.14446596004892e-06, + "loss": 2.9539, + "step": 57195 + }, + { + "epoch": 3.886397608370703, + "grad_norm": 6.432602882385254, + "learning_rate": 5.144041309960592e-06, + "loss": 2.6721, + "step": 57200 + }, + { + "epoch": 3.8867373284413644, + "grad_norm": 6.771035671234131, + "learning_rate": 5.143616659872266e-06, + "loss": 3.0195, + "step": 57205 + }, + { + "epoch": 3.887077048512026, + "grad_norm": 6.969998359680176, + "learning_rate": 5.143192009783939e-06, + "loss": 3.0523, + "step": 57210 + }, + { + "epoch": 3.887416768582688, + "grad_norm": 12.66846752166748, + "learning_rate": 5.1427673596956105e-06, + "loss": 3.1721, + "step": 57215 + }, + { + "epoch": 3.8877564886533498, + "grad_norm": 7.321181774139404, + "learning_rate": 5.142342709607284e-06, + "loss": 2.9876, + "step": 57220 + }, + { + "epoch": 3.8880962087240114, + "grad_norm": 7.062364101409912, + "learning_rate": 5.141918059518957e-06, + "loss": 2.8468, + "step": 57225 + }, + { + "epoch": 3.888435928794673, + "grad_norm": 10.302241325378418, + "learning_rate": 5.14149340943063e-06, + "loss": 2.8228, + "step": 57230 + }, + { + "epoch": 3.888775648865335, + "grad_norm": 7.3097405433654785, + "learning_rate": 5.141068759342303e-06, + "loss": 2.9862, + "step": 57235 + }, + { + "epoch": 3.8891153689359967, + "grad_norm": 5.645130634307861, + "learning_rate": 5.1406441092539746e-06, + "loss": 2.8191, + "step": 57240 + }, + { + "epoch": 3.8894550890066584, + "grad_norm": 6.449465751647949, + "learning_rate": 5.140219459165648e-06, + "loss": 2.8645, + "step": 57245 + }, + { + "epoch": 3.8897948090773204, + "grad_norm": 6.02706241607666, + "learning_rate": 5.139794809077321e-06, + "loss": 2.825, + "step": 57250 + }, + { + "epoch": 3.890134529147982, + "grad_norm": 6.547154903411865, + "learning_rate": 5.139370158988993e-06, + "loss": 3.0254, + "step": 57255 + }, + { + "epoch": 3.8904742492186437, + "grad_norm": 8.092560768127441, + "learning_rate": 5.138945508900667e-06, + "loss": 2.9684, + "step": 57260 + }, + { + "epoch": 3.8908139692893053, + "grad_norm": 8.006394386291504, + "learning_rate": 5.138520858812339e-06, + "loss": 3.1696, + "step": 57265 + }, + { + "epoch": 3.8911536893599674, + "grad_norm": 8.1122465133667, + "learning_rate": 5.138096208724011e-06, + "loss": 2.7219, + "step": 57270 + }, + { + "epoch": 3.891493409430629, + "grad_norm": 8.006141662597656, + "learning_rate": 5.137671558635685e-06, + "loss": 2.922, + "step": 57275 + }, + { + "epoch": 3.8918331295012907, + "grad_norm": 7.2896857261657715, + "learning_rate": 5.137246908547358e-06, + "loss": 2.8312, + "step": 57280 + }, + { + "epoch": 3.8921728495719528, + "grad_norm": 6.688697338104248, + "learning_rate": 5.13682225845903e-06, + "loss": 2.9267, + "step": 57285 + }, + { + "epoch": 3.8925125696426144, + "grad_norm": 7.407423973083496, + "learning_rate": 5.136397608370703e-06, + "loss": 2.9626, + "step": 57290 + }, + { + "epoch": 3.892852289713276, + "grad_norm": 7.132134437561035, + "learning_rate": 5.135972958282376e-06, + "loss": 2.9681, + "step": 57295 + }, + { + "epoch": 3.893192009783938, + "grad_norm": 8.637615203857422, + "learning_rate": 5.135548308194048e-06, + "loss": 2.8501, + "step": 57300 + }, + { + "epoch": 3.8935317298545997, + "grad_norm": 7.420948028564453, + "learning_rate": 5.135123658105722e-06, + "loss": 3.1495, + "step": 57305 + }, + { + "epoch": 3.8938714499252614, + "grad_norm": 6.821170330047607, + "learning_rate": 5.134699008017394e-06, + "loss": 2.848, + "step": 57310 + }, + { + "epoch": 3.8942111699959234, + "grad_norm": 8.068052291870117, + "learning_rate": 5.1342743579290666e-06, + "loss": 2.9834, + "step": 57315 + }, + { + "epoch": 3.894550890066585, + "grad_norm": 9.510186195373535, + "learning_rate": 5.13384970784074e-06, + "loss": 2.9296, + "step": 57320 + }, + { + "epoch": 3.8948906101372467, + "grad_norm": 7.260696887969971, + "learning_rate": 5.133425057752412e-06, + "loss": 3.0729, + "step": 57325 + }, + { + "epoch": 3.8952303302079088, + "grad_norm": 7.302424907684326, + "learning_rate": 5.133000407664085e-06, + "loss": 2.9739, + "step": 57330 + }, + { + "epoch": 3.8955700502785704, + "grad_norm": 7.591085910797119, + "learning_rate": 5.132575757575759e-06, + "loss": 2.9119, + "step": 57335 + }, + { + "epoch": 3.895909770349232, + "grad_norm": 6.179679870605469, + "learning_rate": 5.1321511074874306e-06, + "loss": 2.7612, + "step": 57340 + }, + { + "epoch": 3.896249490419894, + "grad_norm": 7.785654067993164, + "learning_rate": 5.131726457399103e-06, + "loss": 3.081, + "step": 57345 + }, + { + "epoch": 3.8965892104905557, + "grad_norm": 4.9033379554748535, + "learning_rate": 5.131301807310777e-06, + "loss": 3.1754, + "step": 57350 + }, + { + "epoch": 3.8969289305612174, + "grad_norm": 6.5306782722473145, + "learning_rate": 5.130877157222449e-06, + "loss": 3.0786, + "step": 57355 + }, + { + "epoch": 3.8972686506318794, + "grad_norm": 7.842265605926514, + "learning_rate": 5.130452507134122e-06, + "loss": 3.0022, + "step": 57360 + }, + { + "epoch": 3.897608370702541, + "grad_norm": 6.299127101898193, + "learning_rate": 5.130027857045795e-06, + "loss": 3.1538, + "step": 57365 + }, + { + "epoch": 3.8979480907732027, + "grad_norm": 6.357698917388916, + "learning_rate": 5.129603206957467e-06, + "loss": 3.1096, + "step": 57370 + }, + { + "epoch": 3.8982878108438648, + "grad_norm": 7.493488788604736, + "learning_rate": 5.12917855686914e-06, + "loss": 3.1418, + "step": 57375 + }, + { + "epoch": 3.8986275309145264, + "grad_norm": 9.394401550292969, + "learning_rate": 5.128753906780814e-06, + "loss": 3.2065, + "step": 57380 + }, + { + "epoch": 3.898967250985188, + "grad_norm": 6.785834312438965, + "learning_rate": 5.128329256692486e-06, + "loss": 2.8666, + "step": 57385 + }, + { + "epoch": 3.89930697105585, + "grad_norm": 7.862931251525879, + "learning_rate": 5.1279046066041586e-06, + "loss": 3.0182, + "step": 57390 + }, + { + "epoch": 3.8996466911265117, + "grad_norm": 8.334562301635742, + "learning_rate": 5.127479956515831e-06, + "loss": 2.8828, + "step": 57395 + }, + { + "epoch": 3.8999864111971734, + "grad_norm": 8.903299331665039, + "learning_rate": 5.127055306427504e-06, + "loss": 2.8524, + "step": 57400 + }, + { + "epoch": 3.9003261312678354, + "grad_norm": 5.968112468719482, + "learning_rate": 5.126630656339177e-06, + "loss": 2.7156, + "step": 57405 + }, + { + "epoch": 3.900665851338497, + "grad_norm": 6.145309925079346, + "learning_rate": 5.12620600625085e-06, + "loss": 2.9107, + "step": 57410 + }, + { + "epoch": 3.9010055714091587, + "grad_norm": 7.682365417480469, + "learning_rate": 5.125781356162523e-06, + "loss": 2.663, + "step": 57415 + }, + { + "epoch": 3.901345291479821, + "grad_norm": 7.521278381347656, + "learning_rate": 5.1253567060741945e-06, + "loss": 3.0133, + "step": 57420 + }, + { + "epoch": 3.9016850115504824, + "grad_norm": 7.847676753997803, + "learning_rate": 5.124932055985868e-06, + "loss": 3.0165, + "step": 57425 + }, + { + "epoch": 3.902024731621144, + "grad_norm": 6.335671901702881, + "learning_rate": 5.124507405897541e-06, + "loss": 2.8479, + "step": 57430 + }, + { + "epoch": 3.902364451691806, + "grad_norm": 7.242545127868652, + "learning_rate": 5.124082755809213e-06, + "loss": 2.838, + "step": 57435 + }, + { + "epoch": 3.9027041717624678, + "grad_norm": 6.127883434295654, + "learning_rate": 5.123658105720887e-06, + "loss": 2.9221, + "step": 57440 + }, + { + "epoch": 3.9030438918331294, + "grad_norm": 8.0955810546875, + "learning_rate": 5.123233455632559e-06, + "loss": 2.6744, + "step": 57445 + }, + { + "epoch": 3.9033836119037915, + "grad_norm": 6.142028331756592, + "learning_rate": 5.122808805544231e-06, + "loss": 2.8334, + "step": 57450 + }, + { + "epoch": 3.903723331974453, + "grad_norm": 6.475046157836914, + "learning_rate": 5.122384155455905e-06, + "loss": 3.2192, + "step": 57455 + }, + { + "epoch": 3.9040630520451147, + "grad_norm": 6.010650634765625, + "learning_rate": 5.121959505367578e-06, + "loss": 2.9742, + "step": 57460 + }, + { + "epoch": 3.904402772115777, + "grad_norm": 5.918391704559326, + "learning_rate": 5.12153485527925e-06, + "loss": 2.9236, + "step": 57465 + }, + { + "epoch": 3.9047424921864384, + "grad_norm": 8.67648696899414, + "learning_rate": 5.121110205190923e-06, + "loss": 3.0071, + "step": 57470 + }, + { + "epoch": 3.9050822122571, + "grad_norm": 7.019846439361572, + "learning_rate": 5.120685555102596e-06, + "loss": 2.9054, + "step": 57475 + }, + { + "epoch": 3.905421932327762, + "grad_norm": 7.633588790893555, + "learning_rate": 5.120260905014268e-06, + "loss": 2.9033, + "step": 57480 + }, + { + "epoch": 3.9057616523984238, + "grad_norm": 8.901670455932617, + "learning_rate": 5.119836254925942e-06, + "loss": 2.8379, + "step": 57485 + }, + { + "epoch": 3.9061013724690854, + "grad_norm": 7.1467814445495605, + "learning_rate": 5.119411604837614e-06, + "loss": 2.5441, + "step": 57490 + }, + { + "epoch": 3.9064410925397475, + "grad_norm": 8.399045944213867, + "learning_rate": 5.1189869547492865e-06, + "loss": 2.9651, + "step": 57495 + }, + { + "epoch": 3.906780812610409, + "grad_norm": 6.324592590332031, + "learning_rate": 5.11856230466096e-06, + "loss": 2.943, + "step": 57500 + }, + { + "epoch": 3.9071205326810707, + "grad_norm": 7.049717426300049, + "learning_rate": 5.118137654572632e-06, + "loss": 3.0958, + "step": 57505 + }, + { + "epoch": 3.907460252751733, + "grad_norm": 6.089790344238281, + "learning_rate": 5.117713004484305e-06, + "loss": 2.7994, + "step": 57510 + }, + { + "epoch": 3.9077999728223944, + "grad_norm": 7.713000774383545, + "learning_rate": 5.117288354395979e-06, + "loss": 2.9964, + "step": 57515 + }, + { + "epoch": 3.908139692893056, + "grad_norm": 7.0582122802734375, + "learning_rate": 5.1168637043076505e-06, + "loss": 3.0671, + "step": 57520 + }, + { + "epoch": 3.908479412963718, + "grad_norm": 9.59473991394043, + "learning_rate": 5.116439054219323e-06, + "loss": 2.5989, + "step": 57525 + }, + { + "epoch": 3.90881913303438, + "grad_norm": 7.333600044250488, + "learning_rate": 5.116014404130997e-06, + "loss": 2.7706, + "step": 57530 + }, + { + "epoch": 3.9091588531050414, + "grad_norm": 8.706433296203613, + "learning_rate": 5.115589754042669e-06, + "loss": 3.0101, + "step": 57535 + }, + { + "epoch": 3.9094985731757035, + "grad_norm": 6.834606170654297, + "learning_rate": 5.115165103954342e-06, + "loss": 2.903, + "step": 57540 + }, + { + "epoch": 3.909838293246365, + "grad_norm": 6.817859649658203, + "learning_rate": 5.114740453866015e-06, + "loss": 3.0037, + "step": 57545 + }, + { + "epoch": 3.9101780133170267, + "grad_norm": 8.76455307006836, + "learning_rate": 5.114315803777687e-06, + "loss": 2.8996, + "step": 57550 + }, + { + "epoch": 3.910517733387689, + "grad_norm": 6.578968524932861, + "learning_rate": 5.11389115368936e-06, + "loss": 2.8788, + "step": 57555 + }, + { + "epoch": 3.9108574534583505, + "grad_norm": 6.686611175537109, + "learning_rate": 5.113466503601033e-06, + "loss": 3.0449, + "step": 57560 + }, + { + "epoch": 3.911197173529012, + "grad_norm": 6.263457775115967, + "learning_rate": 5.113041853512706e-06, + "loss": 2.9053, + "step": 57565 + }, + { + "epoch": 3.9115368935996737, + "grad_norm": 5.827911376953125, + "learning_rate": 5.112617203424379e-06, + "loss": 2.9611, + "step": 57570 + }, + { + "epoch": 3.911876613670336, + "grad_norm": 7.451427459716797, + "learning_rate": 5.112192553336051e-06, + "loss": 2.8557, + "step": 57575 + }, + { + "epoch": 3.9122163337409974, + "grad_norm": 6.524883270263672, + "learning_rate": 5.111767903247724e-06, + "loss": 2.9761, + "step": 57580 + }, + { + "epoch": 3.912556053811659, + "grad_norm": 7.520023822784424, + "learning_rate": 5.111343253159398e-06, + "loss": 2.9625, + "step": 57585 + }, + { + "epoch": 3.912895773882321, + "grad_norm": 7.811160564422607, + "learning_rate": 5.11091860307107e-06, + "loss": 3.1549, + "step": 57590 + }, + { + "epoch": 3.9132354939529828, + "grad_norm": 8.649754524230957, + "learning_rate": 5.1104939529827426e-06, + "loss": 2.8587, + "step": 57595 + }, + { + "epoch": 3.9135752140236444, + "grad_norm": 7.1846795082092285, + "learning_rate": 5.110069302894416e-06, + "loss": 2.951, + "step": 57600 + }, + { + "epoch": 3.913914934094306, + "grad_norm": 7.885265350341797, + "learning_rate": 5.109644652806088e-06, + "loss": 3.0421, + "step": 57605 + }, + { + "epoch": 3.914254654164968, + "grad_norm": 7.368917465209961, + "learning_rate": 5.109220002717761e-06, + "loss": 2.8879, + "step": 57610 + }, + { + "epoch": 3.9145943742356297, + "grad_norm": 9.128931045532227, + "learning_rate": 5.108795352629435e-06, + "loss": 2.9792, + "step": 57615 + }, + { + "epoch": 3.9149340943062914, + "grad_norm": 7.569687843322754, + "learning_rate": 5.1083707025411066e-06, + "loss": 2.9677, + "step": 57620 + }, + { + "epoch": 3.9152738143769534, + "grad_norm": 5.933368682861328, + "learning_rate": 5.107946052452779e-06, + "loss": 3.0628, + "step": 57625 + }, + { + "epoch": 3.915613534447615, + "grad_norm": 10.302277565002441, + "learning_rate": 5.107521402364453e-06, + "loss": 2.9128, + "step": 57630 + }, + { + "epoch": 3.9159532545182767, + "grad_norm": 7.422207832336426, + "learning_rate": 5.107096752276125e-06, + "loss": 3.0149, + "step": 57635 + }, + { + "epoch": 3.9162929745889388, + "grad_norm": 7.497369766235352, + "learning_rate": 5.106672102187798e-06, + "loss": 3.1223, + "step": 57640 + }, + { + "epoch": 3.9166326946596004, + "grad_norm": 6.972593784332275, + "learning_rate": 5.1062474520994706e-06, + "loss": 3.1593, + "step": 57645 + }, + { + "epoch": 3.916972414730262, + "grad_norm": 6.336212635040283, + "learning_rate": 5.105822802011143e-06, + "loss": 2.8377, + "step": 57650 + }, + { + "epoch": 3.917312134800924, + "grad_norm": 7.799880504608154, + "learning_rate": 5.105398151922815e-06, + "loss": 2.7613, + "step": 57655 + }, + { + "epoch": 3.9176518548715857, + "grad_norm": 8.46728515625, + "learning_rate": 5.104973501834489e-06, + "loss": 2.8064, + "step": 57660 + }, + { + "epoch": 3.9179915749422474, + "grad_norm": 4.952028274536133, + "learning_rate": 5.104548851746162e-06, + "loss": 3.0741, + "step": 57665 + }, + { + "epoch": 3.9183312950129094, + "grad_norm": 9.322397232055664, + "learning_rate": 5.104124201657834e-06, + "loss": 3.0108, + "step": 57670 + }, + { + "epoch": 3.918671015083571, + "grad_norm": 6.041268348693848, + "learning_rate": 5.103699551569507e-06, + "loss": 3.057, + "step": 57675 + }, + { + "epoch": 3.9190107351542327, + "grad_norm": 5.56353235244751, + "learning_rate": 5.10327490148118e-06, + "loss": 2.9407, + "step": 57680 + }, + { + "epoch": 3.919350455224895, + "grad_norm": 6.654147624969482, + "learning_rate": 5.102850251392852e-06, + "loss": 2.9461, + "step": 57685 + }, + { + "epoch": 3.9196901752955564, + "grad_norm": 6.024432182312012, + "learning_rate": 5.102425601304526e-06, + "loss": 2.982, + "step": 57690 + }, + { + "epoch": 3.920029895366218, + "grad_norm": 5.650152206420898, + "learning_rate": 5.1020009512161986e-06, + "loss": 3.1614, + "step": 57695 + }, + { + "epoch": 3.92036961543688, + "grad_norm": 6.732383728027344, + "learning_rate": 5.1015763011278705e-06, + "loss": 3.1066, + "step": 57700 + }, + { + "epoch": 3.9207093355075417, + "grad_norm": 6.153006076812744, + "learning_rate": 5.101151651039544e-06, + "loss": 3.2022, + "step": 57705 + }, + { + "epoch": 3.9210490555782034, + "grad_norm": 8.225197792053223, + "learning_rate": 5.100727000951217e-06, + "loss": 2.886, + "step": 57710 + }, + { + "epoch": 3.9213887756488655, + "grad_norm": 7.062103748321533, + "learning_rate": 5.100302350862889e-06, + "loss": 2.8272, + "step": 57715 + }, + { + "epoch": 3.921728495719527, + "grad_norm": 8.953025817871094, + "learning_rate": 5.099877700774563e-06, + "loss": 2.6962, + "step": 57720 + }, + { + "epoch": 3.9220682157901887, + "grad_norm": 7.368322372436523, + "learning_rate": 5.099453050686235e-06, + "loss": 2.5972, + "step": 57725 + }, + { + "epoch": 3.922407935860851, + "grad_norm": 6.251682281494141, + "learning_rate": 5.099028400597907e-06, + "loss": 2.9465, + "step": 57730 + }, + { + "epoch": 3.9227476559315124, + "grad_norm": 6.614869117736816, + "learning_rate": 5.098603750509581e-06, + "loss": 3.0363, + "step": 57735 + }, + { + "epoch": 3.923087376002174, + "grad_norm": 9.047126770019531, + "learning_rate": 5.098179100421253e-06, + "loss": 2.8777, + "step": 57740 + }, + { + "epoch": 3.923427096072836, + "grad_norm": 6.789674282073975, + "learning_rate": 5.097754450332926e-06, + "loss": 2.8881, + "step": 57745 + }, + { + "epoch": 3.9237668161434978, + "grad_norm": 7.720230579376221, + "learning_rate": 5.097329800244599e-06, + "loss": 3.03, + "step": 57750 + }, + { + "epoch": 3.9241065362141594, + "grad_norm": 8.997504234313965, + "learning_rate": 5.096905150156271e-06, + "loss": 2.8534, + "step": 57755 + }, + { + "epoch": 3.9244462562848215, + "grad_norm": 5.654687881469727, + "learning_rate": 5.096480500067944e-06, + "loss": 2.9444, + "step": 57760 + }, + { + "epoch": 3.924785976355483, + "grad_norm": 5.557434558868408, + "learning_rate": 5.096055849979618e-06, + "loss": 2.9309, + "step": 57765 + }, + { + "epoch": 3.9251256964261447, + "grad_norm": 7.168519973754883, + "learning_rate": 5.09563119989129e-06, + "loss": 2.7916, + "step": 57770 + }, + { + "epoch": 3.925465416496807, + "grad_norm": 6.3820905685424805, + "learning_rate": 5.0952065498029625e-06, + "loss": 2.8476, + "step": 57775 + }, + { + "epoch": 3.9258051365674684, + "grad_norm": 6.458563327789307, + "learning_rate": 5.094781899714636e-06, + "loss": 2.8412, + "step": 57780 + }, + { + "epoch": 3.92614485663813, + "grad_norm": 7.637524127960205, + "learning_rate": 5.094357249626308e-06, + "loss": 2.9458, + "step": 57785 + }, + { + "epoch": 3.926484576708792, + "grad_norm": 6.736976146697998, + "learning_rate": 5.093932599537981e-06, + "loss": 3.0887, + "step": 57790 + }, + { + "epoch": 3.9268242967794538, + "grad_norm": 6.50447940826416, + "learning_rate": 5.093507949449655e-06, + "loss": 2.6942, + "step": 57795 + }, + { + "epoch": 3.9271640168501154, + "grad_norm": 6.979786396026611, + "learning_rate": 5.0930832993613265e-06, + "loss": 3.0058, + "step": 57800 + }, + { + "epoch": 3.9275037369207775, + "grad_norm": 6.671960353851318, + "learning_rate": 5.092658649272999e-06, + "loss": 3.1107, + "step": 57805 + }, + { + "epoch": 3.927843456991439, + "grad_norm": 9.187422752380371, + "learning_rate": 5.092233999184672e-06, + "loss": 2.945, + "step": 57810 + }, + { + "epoch": 3.9281831770621007, + "grad_norm": 8.44941234588623, + "learning_rate": 5.091809349096345e-06, + "loss": 2.9604, + "step": 57815 + }, + { + "epoch": 3.928522897132763, + "grad_norm": 7.3184123039245605, + "learning_rate": 5.091384699008018e-06, + "loss": 2.7012, + "step": 57820 + }, + { + "epoch": 3.9288626172034244, + "grad_norm": 9.453534126281738, + "learning_rate": 5.0909600489196905e-06, + "loss": 3.0867, + "step": 57825 + }, + { + "epoch": 3.929202337274086, + "grad_norm": 6.848318576812744, + "learning_rate": 5.090535398831363e-06, + "loss": 3.0164, + "step": 57830 + }, + { + "epoch": 3.929542057344748, + "grad_norm": 8.719680786132812, + "learning_rate": 5.090110748743035e-06, + "loss": 2.7592, + "step": 57835 + }, + { + "epoch": 3.92988177741541, + "grad_norm": 7.36031436920166, + "learning_rate": 5.089686098654709e-06, + "loss": 2.7879, + "step": 57840 + }, + { + "epoch": 3.9302214974860714, + "grad_norm": 7.6869072914123535, + "learning_rate": 5.089261448566382e-06, + "loss": 3.0349, + "step": 57845 + }, + { + "epoch": 3.9305612175567335, + "grad_norm": 6.789783954620361, + "learning_rate": 5.088836798478054e-06, + "loss": 2.9095, + "step": 57850 + }, + { + "epoch": 3.930900937627395, + "grad_norm": 5.646289348602295, + "learning_rate": 5.088412148389727e-06, + "loss": 3.0431, + "step": 57855 + }, + { + "epoch": 3.9312406576980568, + "grad_norm": 8.134613990783691, + "learning_rate": 5.0879874983014e-06, + "loss": 3.0554, + "step": 57860 + }, + { + "epoch": 3.931580377768719, + "grad_norm": 7.4117326736450195, + "learning_rate": 5.087562848213072e-06, + "loss": 2.8397, + "step": 57865 + }, + { + "epoch": 3.9319200978393805, + "grad_norm": 7.017190933227539, + "learning_rate": 5.087138198124746e-06, + "loss": 2.8234, + "step": 57870 + }, + { + "epoch": 3.932259817910042, + "grad_norm": 7.679982662200928, + "learning_rate": 5.0867135480364186e-06, + "loss": 3.1074, + "step": 57875 + }, + { + "epoch": 3.932599537980704, + "grad_norm": 6.2795891761779785, + "learning_rate": 5.0862888979480905e-06, + "loss": 2.9471, + "step": 57880 + }, + { + "epoch": 3.932939258051366, + "grad_norm": 7.388028621673584, + "learning_rate": 5.085864247859764e-06, + "loss": 3.0467, + "step": 57885 + }, + { + "epoch": 3.9332789781220274, + "grad_norm": 5.425405979156494, + "learning_rate": 5.085439597771437e-06, + "loss": 2.8371, + "step": 57890 + }, + { + "epoch": 3.9336186981926895, + "grad_norm": 6.3778767585754395, + "learning_rate": 5.085014947683109e-06, + "loss": 2.8901, + "step": 57895 + }, + { + "epoch": 3.933958418263351, + "grad_norm": 5.996710777282715, + "learning_rate": 5.0845902975947826e-06, + "loss": 2.8352, + "step": 57900 + }, + { + "epoch": 3.9342981383340128, + "grad_norm": 8.635138511657715, + "learning_rate": 5.0841656475064545e-06, + "loss": 2.9696, + "step": 57905 + }, + { + "epoch": 3.9346378584046744, + "grad_norm": 6.819986343383789, + "learning_rate": 5.083740997418128e-06, + "loss": 3.1139, + "step": 57910 + }, + { + "epoch": 3.9349775784753365, + "grad_norm": 6.194701194763184, + "learning_rate": 5.083316347329801e-06, + "loss": 2.9627, + "step": 57915 + }, + { + "epoch": 3.935317298545998, + "grad_norm": 9.10603141784668, + "learning_rate": 5.082891697241473e-06, + "loss": 2.6158, + "step": 57920 + }, + { + "epoch": 3.9356570186166597, + "grad_norm": 9.451972961425781, + "learning_rate": 5.0824670471531466e-06, + "loss": 3.0286, + "step": 57925 + }, + { + "epoch": 3.935996738687322, + "grad_norm": 8.722200393676758, + "learning_rate": 5.082042397064819e-06, + "loss": 2.7813, + "step": 57930 + }, + { + "epoch": 3.9363364587579834, + "grad_norm": 8.037236213684082, + "learning_rate": 5.081617746976491e-06, + "loss": 2.9826, + "step": 57935 + }, + { + "epoch": 3.936676178828645, + "grad_norm": 8.727582931518555, + "learning_rate": 5.081193096888165e-06, + "loss": 2.9507, + "step": 57940 + }, + { + "epoch": 3.9370158988993067, + "grad_norm": 5.839478492736816, + "learning_rate": 5.080768446799838e-06, + "loss": 2.6389, + "step": 57945 + }, + { + "epoch": 3.9373556189699688, + "grad_norm": 8.902623176574707, + "learning_rate": 5.08034379671151e-06, + "loss": 2.9769, + "step": 57950 + }, + { + "epoch": 3.9376953390406304, + "grad_norm": 6.420717239379883, + "learning_rate": 5.079919146623183e-06, + "loss": 2.9929, + "step": 57955 + }, + { + "epoch": 3.938035059111292, + "grad_norm": 6.327124118804932, + "learning_rate": 5.079494496534856e-06, + "loss": 2.671, + "step": 57960 + }, + { + "epoch": 3.938374779181954, + "grad_norm": 7.085660934448242, + "learning_rate": 5.079069846446528e-06, + "loss": 2.941, + "step": 57965 + }, + { + "epoch": 3.9387144992526157, + "grad_norm": 9.01092529296875, + "learning_rate": 5.078645196358202e-06, + "loss": 3.2195, + "step": 57970 + }, + { + "epoch": 3.9390542193232774, + "grad_norm": 7.341434955596924, + "learning_rate": 5.0782205462698746e-06, + "loss": 2.9822, + "step": 57975 + }, + { + "epoch": 3.9393939393939394, + "grad_norm": 7.601425647735596, + "learning_rate": 5.0777958961815465e-06, + "loss": 2.7215, + "step": 57980 + }, + { + "epoch": 3.939733659464601, + "grad_norm": 7.095455169677734, + "learning_rate": 5.07737124609322e-06, + "loss": 3.0152, + "step": 57985 + }, + { + "epoch": 3.9400733795352627, + "grad_norm": 7.17445707321167, + "learning_rate": 5.076946596004892e-06, + "loss": 2.8905, + "step": 57990 + }, + { + "epoch": 3.940413099605925, + "grad_norm": 7.0949625968933105, + "learning_rate": 5.076521945916565e-06, + "loss": 2.848, + "step": 57995 + }, + { + "epoch": 3.9407528196765864, + "grad_norm": 7.959496021270752, + "learning_rate": 5.0760972958282386e-06, + "loss": 2.8996, + "step": 58000 + }, + { + "epoch": 3.941092539747248, + "grad_norm": 8.41901969909668, + "learning_rate": 5.0756726457399105e-06, + "loss": 2.7357, + "step": 58005 + }, + { + "epoch": 3.94143225981791, + "grad_norm": 7.423103332519531, + "learning_rate": 5.075247995651583e-06, + "loss": 3.0305, + "step": 58010 + }, + { + "epoch": 3.9417719798885718, + "grad_norm": 7.9328107833862305, + "learning_rate": 5.074823345563257e-06, + "loss": 2.9568, + "step": 58015 + }, + { + "epoch": 3.9421116999592334, + "grad_norm": 7.044035911560059, + "learning_rate": 5.074398695474929e-06, + "loss": 2.9919, + "step": 58020 + }, + { + "epoch": 3.9424514200298955, + "grad_norm": 7.484975814819336, + "learning_rate": 5.073974045386602e-06, + "loss": 2.8441, + "step": 58025 + }, + { + "epoch": 3.942791140100557, + "grad_norm": 7.436596870422363, + "learning_rate": 5.073549395298275e-06, + "loss": 2.8482, + "step": 58030 + }, + { + "epoch": 3.9431308601712187, + "grad_norm": 5.960792541503906, + "learning_rate": 5.073124745209947e-06, + "loss": 2.8102, + "step": 58035 + }, + { + "epoch": 3.943470580241881, + "grad_norm": 6.990728378295898, + "learning_rate": 5.07270009512162e-06, + "loss": 2.9416, + "step": 58040 + }, + { + "epoch": 3.9438103003125424, + "grad_norm": 5.735625267028809, + "learning_rate": 5.072275445033294e-06, + "loss": 2.6897, + "step": 58045 + }, + { + "epoch": 3.944150020383204, + "grad_norm": 7.844133377075195, + "learning_rate": 5.071850794944966e-06, + "loss": 2.8368, + "step": 58050 + }, + { + "epoch": 3.944489740453866, + "grad_norm": 6.609884262084961, + "learning_rate": 5.0714261448566385e-06, + "loss": 3.065, + "step": 58055 + }, + { + "epoch": 3.9448294605245278, + "grad_norm": 7.071068286895752, + "learning_rate": 5.071001494768311e-06, + "loss": 3.0019, + "step": 58060 + }, + { + "epoch": 3.9451691805951894, + "grad_norm": 8.035484313964844, + "learning_rate": 5.070576844679984e-06, + "loss": 2.8549, + "step": 58065 + }, + { + "epoch": 3.9455089006658515, + "grad_norm": 6.7581095695495605, + "learning_rate": 5.070152194591657e-06, + "loss": 3.0005, + "step": 58070 + }, + { + "epoch": 3.945848620736513, + "grad_norm": 6.580272674560547, + "learning_rate": 5.06972754450333e-06, + "loss": 3.2028, + "step": 58075 + }, + { + "epoch": 3.9461883408071747, + "grad_norm": 5.8768205642700195, + "learning_rate": 5.0693028944150025e-06, + "loss": 2.9394, + "step": 58080 + }, + { + "epoch": 3.946528060877837, + "grad_norm": 6.247860431671143, + "learning_rate": 5.0688782443266745e-06, + "loss": 2.9317, + "step": 58085 + }, + { + "epoch": 3.9468677809484984, + "grad_norm": 8.219464302062988, + "learning_rate": 5.068453594238348e-06, + "loss": 2.854, + "step": 58090 + }, + { + "epoch": 3.94720750101916, + "grad_norm": 6.441203594207764, + "learning_rate": 5.068028944150021e-06, + "loss": 2.9338, + "step": 58095 + }, + { + "epoch": 3.947547221089822, + "grad_norm": 7.662567615509033, + "learning_rate": 5.067604294061693e-06, + "loss": 3.1679, + "step": 58100 + }, + { + "epoch": 3.9478869411604838, + "grad_norm": 10.817723274230957, + "learning_rate": 5.0671796439733665e-06, + "loss": 3.1495, + "step": 58105 + }, + { + "epoch": 3.9482266612311454, + "grad_norm": 8.744471549987793, + "learning_rate": 5.066754993885039e-06, + "loss": 2.7279, + "step": 58110 + }, + { + "epoch": 3.9485663813018075, + "grad_norm": 8.434906959533691, + "learning_rate": 5.066330343796711e-06, + "loss": 3.0902, + "step": 58115 + }, + { + "epoch": 3.948906101372469, + "grad_norm": 7.619828701019287, + "learning_rate": 5.065905693708385e-06, + "loss": 2.7346, + "step": 58120 + }, + { + "epoch": 3.9492458214431307, + "grad_norm": 7.860090255737305, + "learning_rate": 5.065481043620058e-06, + "loss": 2.8613, + "step": 58125 + }, + { + "epoch": 3.949585541513793, + "grad_norm": 8.039851188659668, + "learning_rate": 5.06505639353173e-06, + "loss": 3.2999, + "step": 58130 + }, + { + "epoch": 3.9499252615844545, + "grad_norm": 7.561966419219971, + "learning_rate": 5.064631743443403e-06, + "loss": 3.0089, + "step": 58135 + }, + { + "epoch": 3.950264981655116, + "grad_norm": 6.855386734008789, + "learning_rate": 5.064207093355076e-06, + "loss": 2.8715, + "step": 58140 + }, + { + "epoch": 3.950604701725778, + "grad_norm": 6.854715347290039, + "learning_rate": 5.063782443266748e-06, + "loss": 3.0395, + "step": 58145 + }, + { + "epoch": 3.95094442179644, + "grad_norm": 7.940497875213623, + "learning_rate": 5.063357793178422e-06, + "loss": 3.0651, + "step": 58150 + }, + { + "epoch": 3.9512841418671014, + "grad_norm": 6.709354877471924, + "learning_rate": 5.063018073107759e-06, + "loss": 2.9823, + "step": 58155 + }, + { + "epoch": 3.9516238619377635, + "grad_norm": 7.00873327255249, + "learning_rate": 5.062593423019433e-06, + "loss": 3.119, + "step": 58160 + }, + { + "epoch": 3.951963582008425, + "grad_norm": 7.161801338195801, + "learning_rate": 5.0621687729311054e-06, + "loss": 3.2221, + "step": 58165 + }, + { + "epoch": 3.9523033020790868, + "grad_norm": 7.915674209594727, + "learning_rate": 5.061744122842777e-06, + "loss": 2.8808, + "step": 58170 + }, + { + "epoch": 3.952643022149749, + "grad_norm": 6.853318214416504, + "learning_rate": 5.061319472754451e-06, + "loss": 2.7963, + "step": 58175 + }, + { + "epoch": 3.9529827422204105, + "grad_norm": 6.845624923706055, + "learning_rate": 5.060894822666124e-06, + "loss": 2.88, + "step": 58180 + }, + { + "epoch": 3.953322462291072, + "grad_norm": 6.784619331359863, + "learning_rate": 5.060470172577796e-06, + "loss": 2.8819, + "step": 58185 + }, + { + "epoch": 3.953662182361734, + "grad_norm": 6.339039325714111, + "learning_rate": 5.0600455224894694e-06, + "loss": 2.8546, + "step": 58190 + }, + { + "epoch": 3.954001902432396, + "grad_norm": 7.783962249755859, + "learning_rate": 5.059620872401142e-06, + "loss": 2.8728, + "step": 58195 + }, + { + "epoch": 3.9543416225030574, + "grad_norm": 6.507606029510498, + "learning_rate": 5.059196222312814e-06, + "loss": 2.7085, + "step": 58200 + }, + { + "epoch": 3.9546813425737195, + "grad_norm": 7.492311954498291, + "learning_rate": 5.058771572224488e-06, + "loss": 3.0065, + "step": 58205 + }, + { + "epoch": 3.955021062644381, + "grad_norm": 8.06869125366211, + "learning_rate": 5.058346922136161e-06, + "loss": 2.8352, + "step": 58210 + }, + { + "epoch": 3.9553607827150428, + "grad_norm": 7.333759784698486, + "learning_rate": 5.057922272047833e-06, + "loss": 2.8258, + "step": 58215 + }, + { + "epoch": 3.955700502785705, + "grad_norm": 8.336865425109863, + "learning_rate": 5.057497621959506e-06, + "loss": 3.2118, + "step": 58220 + }, + { + "epoch": 3.9560402228563665, + "grad_norm": 7.122461318969727, + "learning_rate": 5.057072971871179e-06, + "loss": 2.7676, + "step": 58225 + }, + { + "epoch": 3.956379942927028, + "grad_norm": 6.090131759643555, + "learning_rate": 5.056648321782851e-06, + "loss": 2.8633, + "step": 58230 + }, + { + "epoch": 3.95671966299769, + "grad_norm": 7.050220012664795, + "learning_rate": 5.056223671694525e-06, + "loss": 3.1243, + "step": 58235 + }, + { + "epoch": 3.957059383068352, + "grad_norm": 7.027996063232422, + "learning_rate": 5.055799021606197e-06, + "loss": 3.024, + "step": 58240 + }, + { + "epoch": 3.9573991031390134, + "grad_norm": 7.28241491317749, + "learning_rate": 5.055374371517869e-06, + "loss": 3.0614, + "step": 58245 + }, + { + "epoch": 3.957738823209675, + "grad_norm": 6.186925888061523, + "learning_rate": 5.054949721429543e-06, + "loss": 2.9162, + "step": 58250 + }, + { + "epoch": 3.958078543280337, + "grad_norm": 7.421182155609131, + "learning_rate": 5.054525071341215e-06, + "loss": 2.7696, + "step": 58255 + }, + { + "epoch": 3.958418263350999, + "grad_norm": 7.717873573303223, + "learning_rate": 5.054100421252888e-06, + "loss": 2.9112, + "step": 58260 + }, + { + "epoch": 3.9587579834216604, + "grad_norm": 7.1602702140808105, + "learning_rate": 5.0536757711645614e-06, + "loss": 3.0888, + "step": 58265 + }, + { + "epoch": 3.9590977034923225, + "grad_norm": 7.440293312072754, + "learning_rate": 5.053251121076233e-06, + "loss": 2.9422, + "step": 58270 + }, + { + "epoch": 3.959437423562984, + "grad_norm": 6.644742488861084, + "learning_rate": 5.052826470987906e-06, + "loss": 2.6811, + "step": 58275 + }, + { + "epoch": 3.9597771436336457, + "grad_norm": 6.267693996429443, + "learning_rate": 5.05240182089958e-06, + "loss": 2.9853, + "step": 58280 + }, + { + "epoch": 3.9601168637043074, + "grad_norm": 7.887292385101318, + "learning_rate": 5.051977170811252e-06, + "loss": 3.0232, + "step": 58285 + }, + { + "epoch": 3.9604565837749695, + "grad_norm": 6.553692817687988, + "learning_rate": 5.051552520722925e-06, + "loss": 3.1734, + "step": 58290 + }, + { + "epoch": 3.960796303845631, + "grad_norm": 6.160555839538574, + "learning_rate": 5.051127870634598e-06, + "loss": 2.9008, + "step": 58295 + }, + { + "epoch": 3.9611360239162927, + "grad_norm": 10.122099876403809, + "learning_rate": 5.05070322054627e-06, + "loss": 2.9659, + "step": 58300 + }, + { + "epoch": 3.961475743986955, + "grad_norm": 7.376701831817627, + "learning_rate": 5.050278570457943e-06, + "loss": 2.9598, + "step": 58305 + }, + { + "epoch": 3.9618154640576164, + "grad_norm": 6.400658130645752, + "learning_rate": 5.049853920369616e-06, + "loss": 2.8033, + "step": 58310 + }, + { + "epoch": 3.962155184128278, + "grad_norm": 7.7574543952941895, + "learning_rate": 5.049429270281289e-06, + "loss": 3.2387, + "step": 58315 + }, + { + "epoch": 3.96249490419894, + "grad_norm": 8.091256141662598, + "learning_rate": 5.049004620192961e-06, + "loss": 2.9355, + "step": 58320 + }, + { + "epoch": 3.9628346242696018, + "grad_norm": 6.209370136260986, + "learning_rate": 5.048579970104634e-06, + "loss": 2.9589, + "step": 58325 + }, + { + "epoch": 3.9631743443402634, + "grad_norm": 9.374650955200195, + "learning_rate": 5.048155320016307e-06, + "loss": 2.759, + "step": 58330 + }, + { + "epoch": 3.9635140644109255, + "grad_norm": 6.599238872528076, + "learning_rate": 5.047730669927979e-06, + "loss": 2.9465, + "step": 58335 + }, + { + "epoch": 3.963853784481587, + "grad_norm": 5.735815525054932, + "learning_rate": 5.047306019839653e-06, + "loss": 3.0138, + "step": 58340 + }, + { + "epoch": 3.9641935045522487, + "grad_norm": 7.881016731262207, + "learning_rate": 5.046881369751325e-06, + "loss": 2.7039, + "step": 58345 + }, + { + "epoch": 3.964533224622911, + "grad_norm": 6.9006571769714355, + "learning_rate": 5.046456719662997e-06, + "loss": 3.193, + "step": 58350 + }, + { + "epoch": 3.9648729446935724, + "grad_norm": 8.942822456359863, + "learning_rate": 5.046032069574671e-06, + "loss": 2.7407, + "step": 58355 + }, + { + "epoch": 3.965212664764234, + "grad_norm": 7.648166656494141, + "learning_rate": 5.045607419486344e-06, + "loss": 2.9138, + "step": 58360 + }, + { + "epoch": 3.965552384834896, + "grad_norm": 6.861351013183594, + "learning_rate": 5.045182769398016e-06, + "loss": 3.079, + "step": 58365 + }, + { + "epoch": 3.9658921049055578, + "grad_norm": 8.766233444213867, + "learning_rate": 5.044758119309689e-06, + "loss": 2.9863, + "step": 58370 + }, + { + "epoch": 3.9662318249762194, + "grad_norm": 6.876299858093262, + "learning_rate": 5.044333469221362e-06, + "loss": 2.9293, + "step": 58375 + }, + { + "epoch": 3.9665715450468815, + "grad_norm": 7.566990852355957, + "learning_rate": 5.043908819133034e-06, + "loss": 2.9643, + "step": 58380 + }, + { + "epoch": 3.966911265117543, + "grad_norm": 7.900280952453613, + "learning_rate": 5.043484169044708e-06, + "loss": 3.1451, + "step": 58385 + }, + { + "epoch": 3.9672509851882047, + "grad_norm": 7.947548866271973, + "learning_rate": 5.043059518956381e-06, + "loss": 3.0652, + "step": 58390 + }, + { + "epoch": 3.967590705258867, + "grad_norm": 6.664670944213867, + "learning_rate": 5.0426348688680526e-06, + "loss": 2.9891, + "step": 58395 + }, + { + "epoch": 3.9679304253295284, + "grad_norm": 6.571717262268066, + "learning_rate": 5.042210218779726e-06, + "loss": 2.9147, + "step": 58400 + }, + { + "epoch": 3.96827014540019, + "grad_norm": 6.0201096534729, + "learning_rate": 5.041785568691398e-06, + "loss": 2.843, + "step": 58405 + }, + { + "epoch": 3.968609865470852, + "grad_norm": 6.479857921600342, + "learning_rate": 5.041360918603071e-06, + "loss": 2.8866, + "step": 58410 + }, + { + "epoch": 3.968949585541514, + "grad_norm": 7.696969985961914, + "learning_rate": 5.040936268514745e-06, + "loss": 2.808, + "step": 58415 + }, + { + "epoch": 3.9692893056121754, + "grad_norm": 7.850301265716553, + "learning_rate": 5.0405116184264166e-06, + "loss": 2.827, + "step": 58420 + }, + { + "epoch": 3.9696290256828375, + "grad_norm": 7.668791770935059, + "learning_rate": 5.040086968338089e-06, + "loss": 3.1671, + "step": 58425 + }, + { + "epoch": 3.969968745753499, + "grad_norm": 9.7078218460083, + "learning_rate": 5.039662318249763e-06, + "loss": 2.8568, + "step": 58430 + }, + { + "epoch": 3.9703084658241607, + "grad_norm": 7.029344081878662, + "learning_rate": 5.039237668161435e-06, + "loss": 2.8094, + "step": 58435 + }, + { + "epoch": 3.970648185894823, + "grad_norm": 7.438438892364502, + "learning_rate": 5.038813018073108e-06, + "loss": 2.8272, + "step": 58440 + }, + { + "epoch": 3.9709879059654845, + "grad_norm": 8.092881202697754, + "learning_rate": 5.038388367984781e-06, + "loss": 2.9508, + "step": 58445 + }, + { + "epoch": 3.971327626036146, + "grad_norm": 7.248647689819336, + "learning_rate": 5.037963717896453e-06, + "loss": 2.604, + "step": 58450 + }, + { + "epoch": 3.971667346106808, + "grad_norm": 7.734695911407471, + "learning_rate": 5.037539067808127e-06, + "loss": 3.018, + "step": 58455 + }, + { + "epoch": 3.97200706617747, + "grad_norm": 7.3985114097595215, + "learning_rate": 5.0371144177198e-06, + "loss": 2.9586, + "step": 58460 + }, + { + "epoch": 3.9723467862481314, + "grad_norm": 8.53721809387207, + "learning_rate": 5.036689767631472e-06, + "loss": 2.8532, + "step": 58465 + }, + { + "epoch": 3.9726865063187935, + "grad_norm": 9.839543342590332, + "learning_rate": 5.0362651175431454e-06, + "loss": 2.7372, + "step": 58470 + }, + { + "epoch": 3.973026226389455, + "grad_norm": 6.619738578796387, + "learning_rate": 5.035840467454818e-06, + "loss": 2.7586, + "step": 58475 + }, + { + "epoch": 3.9733659464601168, + "grad_norm": 7.860377311706543, + "learning_rate": 5.03541581736649e-06, + "loss": 2.9929, + "step": 58480 + }, + { + "epoch": 3.973705666530779, + "grad_norm": 7.409476280212402, + "learning_rate": 5.034991167278164e-06, + "loss": 2.7341, + "step": 58485 + }, + { + "epoch": 3.9740453866014405, + "grad_norm": 5.975533962249756, + "learning_rate": 5.034566517189836e-06, + "loss": 3.0568, + "step": 58490 + }, + { + "epoch": 3.974385106672102, + "grad_norm": 7.071813106536865, + "learning_rate": 5.034141867101509e-06, + "loss": 2.7856, + "step": 58495 + }, + { + "epoch": 3.974724826742764, + "grad_norm": 8.576473236083984, + "learning_rate": 5.033717217013182e-06, + "loss": 2.9701, + "step": 58500 + }, + { + "epoch": 3.975064546813426, + "grad_norm": 7.78136682510376, + "learning_rate": 5.033292566924854e-06, + "loss": 2.707, + "step": 58505 + }, + { + "epoch": 3.9754042668840874, + "grad_norm": 6.584414482116699, + "learning_rate": 5.032867916836527e-06, + "loss": 3.098, + "step": 58510 + }, + { + "epoch": 3.9757439869547495, + "grad_norm": 6.865097999572754, + "learning_rate": 5.032443266748201e-06, + "loss": 3.0359, + "step": 58515 + }, + { + "epoch": 3.976083707025411, + "grad_norm": 8.598742485046387, + "learning_rate": 5.032018616659873e-06, + "loss": 2.9552, + "step": 58520 + }, + { + "epoch": 3.9764234270960728, + "grad_norm": 6.780503273010254, + "learning_rate": 5.031593966571545e-06, + "loss": 3.0481, + "step": 58525 + }, + { + "epoch": 3.976763147166735, + "grad_norm": 7.177070617675781, + "learning_rate": 5.031169316483219e-06, + "loss": 2.8115, + "step": 58530 + }, + { + "epoch": 3.9771028672373965, + "grad_norm": 9.824962615966797, + "learning_rate": 5.030744666394891e-06, + "loss": 2.8309, + "step": 58535 + }, + { + "epoch": 3.977442587308058, + "grad_norm": 7.886376857757568, + "learning_rate": 5.030320016306564e-06, + "loss": 2.9894, + "step": 58540 + }, + { + "epoch": 3.97778230737872, + "grad_norm": 9.192893981933594, + "learning_rate": 5.0298953662182374e-06, + "loss": 2.8209, + "step": 58545 + }, + { + "epoch": 3.978122027449382, + "grad_norm": 7.465247631072998, + "learning_rate": 5.029470716129909e-06, + "loss": 3.023, + "step": 58550 + }, + { + "epoch": 3.9784617475200434, + "grad_norm": 9.98451042175293, + "learning_rate": 5.029046066041582e-06, + "loss": 3.207, + "step": 58555 + }, + { + "epoch": 3.9788014675907055, + "grad_norm": 7.46638298034668, + "learning_rate": 5.028621415953255e-06, + "loss": 3.029, + "step": 58560 + }, + { + "epoch": 3.979141187661367, + "grad_norm": 5.678178310394287, + "learning_rate": 5.028196765864928e-06, + "loss": 3.0213, + "step": 58565 + }, + { + "epoch": 3.979480907732029, + "grad_norm": 7.756289958953857, + "learning_rate": 5.027772115776601e-06, + "loss": 2.9722, + "step": 58570 + }, + { + "epoch": 3.979820627802691, + "grad_norm": 8.561086654663086, + "learning_rate": 5.027347465688273e-06, + "loss": 2.8264, + "step": 58575 + }, + { + "epoch": 3.9801603478733525, + "grad_norm": 5.579083442687988, + "learning_rate": 5.026922815599946e-06, + "loss": 2.9373, + "step": 58580 + }, + { + "epoch": 3.980500067944014, + "grad_norm": 6.48182487487793, + "learning_rate": 5.026498165511618e-06, + "loss": 2.9528, + "step": 58585 + }, + { + "epoch": 3.9808397880146758, + "grad_norm": 7.13715934753418, + "learning_rate": 5.026073515423292e-06, + "loss": 2.8547, + "step": 58590 + }, + { + "epoch": 3.981179508085338, + "grad_norm": 8.826693534851074, + "learning_rate": 5.025648865334965e-06, + "loss": 2.9498, + "step": 58595 + }, + { + "epoch": 3.9815192281559995, + "grad_norm": 6.3760294914245605, + "learning_rate": 5.0252242152466366e-06, + "loss": 3.0187, + "step": 58600 + }, + { + "epoch": 3.981858948226661, + "grad_norm": 5.588311672210693, + "learning_rate": 5.02479956515831e-06, + "loss": 2.9093, + "step": 58605 + }, + { + "epoch": 3.982198668297323, + "grad_norm": 7.603944301605225, + "learning_rate": 5.024374915069983e-06, + "loss": 2.816, + "step": 58610 + }, + { + "epoch": 3.982538388367985, + "grad_norm": 5.966248512268066, + "learning_rate": 5.023950264981655e-06, + "loss": 2.7958, + "step": 58615 + }, + { + "epoch": 3.9828781084386464, + "grad_norm": 6.905531883239746, + "learning_rate": 5.023525614893329e-06, + "loss": 2.9541, + "step": 58620 + }, + { + "epoch": 3.983217828509308, + "grad_norm": 6.530621528625488, + "learning_rate": 5.023100964805001e-06, + "loss": 2.8088, + "step": 58625 + }, + { + "epoch": 3.98355754857997, + "grad_norm": 5.861178874969482, + "learning_rate": 5.022676314716673e-06, + "loss": 2.9421, + "step": 58630 + }, + { + "epoch": 3.9838972686506318, + "grad_norm": 6.536908149719238, + "learning_rate": 5.022251664628347e-06, + "loss": 2.8324, + "step": 58635 + }, + { + "epoch": 3.9842369887212934, + "grad_norm": 6.884718894958496, + "learning_rate": 5.02182701454002e-06, + "loss": 3.1132, + "step": 58640 + }, + { + "epoch": 3.9845767087919555, + "grad_norm": 6.879098892211914, + "learning_rate": 5.021402364451692e-06, + "loss": 2.8789, + "step": 58645 + }, + { + "epoch": 3.984916428862617, + "grad_norm": 6.575525760650635, + "learning_rate": 5.020977714363365e-06, + "loss": 3.0794, + "step": 58650 + }, + { + "epoch": 3.9852561489332787, + "grad_norm": 6.0738525390625, + "learning_rate": 5.020553064275037e-06, + "loss": 2.7958, + "step": 58655 + }, + { + "epoch": 3.985595869003941, + "grad_norm": 6.822373390197754, + "learning_rate": 5.02012841418671e-06, + "loss": 2.8713, + "step": 58660 + }, + { + "epoch": 3.9859355890746024, + "grad_norm": 7.513241767883301, + "learning_rate": 5.019703764098384e-06, + "loss": 2.9691, + "step": 58665 + }, + { + "epoch": 3.986275309145264, + "grad_norm": 8.497602462768555, + "learning_rate": 5.019279114010056e-06, + "loss": 2.813, + "step": 58670 + }, + { + "epoch": 3.986615029215926, + "grad_norm": 6.45693302154541, + "learning_rate": 5.0188544639217286e-06, + "loss": 3.0506, + "step": 58675 + }, + { + "epoch": 3.9869547492865878, + "grad_norm": 9.266493797302246, + "learning_rate": 5.018429813833402e-06, + "loss": 2.9228, + "step": 58680 + }, + { + "epoch": 3.9872944693572494, + "grad_norm": 9.320100784301758, + "learning_rate": 5.018005163745074e-06, + "loss": 2.6934, + "step": 58685 + }, + { + "epoch": 3.9876341894279115, + "grad_norm": 7.572221755981445, + "learning_rate": 5.017580513656747e-06, + "loss": 3.0661, + "step": 58690 + }, + { + "epoch": 3.987973909498573, + "grad_norm": 5.075204849243164, + "learning_rate": 5.017155863568421e-06, + "loss": 2.9962, + "step": 58695 + }, + { + "epoch": 3.9883136295692347, + "grad_norm": 6.693419933319092, + "learning_rate": 5.0167312134800926e-06, + "loss": 2.8266, + "step": 58700 + }, + { + "epoch": 3.988653349639897, + "grad_norm": 6.484538555145264, + "learning_rate": 5.016306563391765e-06, + "loss": 3.0323, + "step": 58705 + }, + { + "epoch": 3.9889930697105584, + "grad_norm": 6.609862327575684, + "learning_rate": 5.015881913303439e-06, + "loss": 2.7958, + "step": 58710 + }, + { + "epoch": 3.98933278978122, + "grad_norm": 6.995121955871582, + "learning_rate": 5.015457263215111e-06, + "loss": 3.0405, + "step": 58715 + }, + { + "epoch": 3.989672509851882, + "grad_norm": 6.676509380340576, + "learning_rate": 5.015032613126784e-06, + "loss": 3.1954, + "step": 58720 + }, + { + "epoch": 3.990012229922544, + "grad_norm": 8.160017967224121, + "learning_rate": 5.0146079630384566e-06, + "loss": 2.5975, + "step": 58725 + }, + { + "epoch": 3.9903519499932054, + "grad_norm": 6.997619152069092, + "learning_rate": 5.014183312950129e-06, + "loss": 2.7775, + "step": 58730 + }, + { + "epoch": 3.9906916700638675, + "grad_norm": 7.929022789001465, + "learning_rate": 5.013758662861802e-06, + "loss": 2.6775, + "step": 58735 + }, + { + "epoch": 3.991031390134529, + "grad_norm": 5.860179424285889, + "learning_rate": 5.013334012773475e-06, + "loss": 2.9365, + "step": 58740 + }, + { + "epoch": 3.9913711102051908, + "grad_norm": 6.880831718444824, + "learning_rate": 5.012909362685148e-06, + "loss": 2.8009, + "step": 58745 + }, + { + "epoch": 3.991710830275853, + "grad_norm": 7.148499488830566, + "learning_rate": 5.01248471259682e-06, + "loss": 2.8899, + "step": 58750 + }, + { + "epoch": 3.9920505503465145, + "grad_norm": 8.335850715637207, + "learning_rate": 5.012060062508493e-06, + "loss": 2.9743, + "step": 58755 + }, + { + "epoch": 3.992390270417176, + "grad_norm": 7.933506011962891, + "learning_rate": 5.011635412420166e-06, + "loss": 2.864, + "step": 58760 + }, + { + "epoch": 3.992729990487838, + "grad_norm": 9.29239559173584, + "learning_rate": 5.011210762331838e-06, + "loss": 3.2248, + "step": 58765 + }, + { + "epoch": 3.9930697105585, + "grad_norm": 7.802409648895264, + "learning_rate": 5.010786112243512e-06, + "loss": 2.7975, + "step": 58770 + }, + { + "epoch": 3.9934094306291614, + "grad_norm": 8.318764686584473, + "learning_rate": 5.010361462155185e-06, + "loss": 2.6397, + "step": 58775 + }, + { + "epoch": 3.9937491506998235, + "grad_norm": 6.199640274047852, + "learning_rate": 5.0099368120668565e-06, + "loss": 2.7511, + "step": 58780 + }, + { + "epoch": 3.994088870770485, + "grad_norm": 7.0163493156433105, + "learning_rate": 5.00951216197853e-06, + "loss": 2.9608, + "step": 58785 + }, + { + "epoch": 3.9944285908411468, + "grad_norm": 7.36647891998291, + "learning_rate": 5.009087511890203e-06, + "loss": 2.7465, + "step": 58790 + }, + { + "epoch": 3.994768310911809, + "grad_norm": 7.692253112792969, + "learning_rate": 5.008662861801877e-06, + "loss": 2.985, + "step": 58795 + }, + { + "epoch": 3.9951080309824705, + "grad_norm": 8.820894241333008, + "learning_rate": 5.008238211713549e-06, + "loss": 3.0246, + "step": 58800 + }, + { + "epoch": 3.995447751053132, + "grad_norm": 8.691399574279785, + "learning_rate": 5.007813561625221e-06, + "loss": 3.1689, + "step": 58805 + }, + { + "epoch": 3.995787471123794, + "grad_norm": 5.8145670890808105, + "learning_rate": 5.007388911536894e-06, + "loss": 2.8916, + "step": 58810 + }, + { + "epoch": 3.996127191194456, + "grad_norm": 7.302025318145752, + "learning_rate": 5.006964261448567e-06, + "loss": 2.7469, + "step": 58815 + }, + { + "epoch": 3.9964669112651174, + "grad_norm": 9.838007926940918, + "learning_rate": 5.006539611360239e-06, + "loss": 3.1684, + "step": 58820 + }, + { + "epoch": 3.9968066313357795, + "grad_norm": 7.488550662994385, + "learning_rate": 5.006114961271913e-06, + "loss": 2.9234, + "step": 58825 + }, + { + "epoch": 3.997146351406441, + "grad_norm": 6.877864360809326, + "learning_rate": 5.005690311183585e-06, + "loss": 3.0405, + "step": 58830 + }, + { + "epoch": 3.9974860714771028, + "grad_norm": 7.133301734924316, + "learning_rate": 5.005265661095257e-06, + "loss": 2.8161, + "step": 58835 + }, + { + "epoch": 3.997825791547765, + "grad_norm": 8.246053695678711, + "learning_rate": 5.004841011006931e-06, + "loss": 2.9692, + "step": 58840 + }, + { + "epoch": 3.9981655116184265, + "grad_norm": 5.309601306915283, + "learning_rate": 5.004416360918604e-06, + "loss": 2.9437, + "step": 58845 + }, + { + "epoch": 3.998505231689088, + "grad_norm": 8.249139785766602, + "learning_rate": 5.003991710830276e-06, + "loss": 2.8931, + "step": 58850 + }, + { + "epoch": 3.99884495175975, + "grad_norm": 5.898532390594482, + "learning_rate": 5.003567060741949e-06, + "loss": 2.9286, + "step": 58855 + }, + { + "epoch": 3.999184671830412, + "grad_norm": 7.975898265838623, + "learning_rate": 5.003142410653622e-06, + "loss": 3.1629, + "step": 58860 + }, + { + "epoch": 3.9995243919010735, + "grad_norm": 8.67392635345459, + "learning_rate": 5.002717760565294e-06, + "loss": 3.1415, + "step": 58865 + }, + { + "epoch": 3.9998641119717355, + "grad_norm": 11.282739639282227, + "learning_rate": 5.002293110476968e-06, + "loss": 2.8109, + "step": 58870 + }, + { + "epoch": 4.0, + "eval_bertscore": { + "f1": 0.8391637762672552, + "precision": 0.8396891777061863, + "recall": 0.8393752173763316 + }, + "eval_bleu_4": 0.0149128661194431, + "eval_exact_match": 0.00019381723035177828, + "eval_loss": 3.303499460220337, + "eval_meteor": 0.10575527268653148, + "eval_rouge": { + "rouge1": 0.13441547469317994, + "rouge2": 0.016611867515205075, + "rougeL": 0.1130208890991124, + "rougeLsum": 0.11301101082116002 + }, + "eval_runtime": 1105.4189, + "eval_samples_per_second": 9.335, + "eval_steps_per_second": 1.167, + "step": 58872 + }, + { + "epoch": 4.000203832042397, + "grad_norm": 7.249695301055908, + "learning_rate": 5.001868460388641e-06, + "loss": 3.1079, + "step": 58875 + }, + { + "epoch": 4.000543552113059, + "grad_norm": 8.911230087280273, + "learning_rate": 5.0014438103003125e-06, + "loss": 2.9346, + "step": 58880 + }, + { + "epoch": 4.000883272183721, + "grad_norm": 7.598415851593018, + "learning_rate": 5.001019160211986e-06, + "loss": 2.7918, + "step": 58885 + }, + { + "epoch": 4.001222992254382, + "grad_norm": 6.835188388824463, + "learning_rate": 5.000594510123659e-06, + "loss": 2.8319, + "step": 58890 + }, + { + "epoch": 4.001562712325044, + "grad_norm": 6.3920369148254395, + "learning_rate": 5.000169860035331e-06, + "loss": 2.8357, + "step": 58895 + }, + { + "epoch": 4.001902432395706, + "grad_norm": 8.010555267333984, + "learning_rate": 4.999745209947004e-06, + "loss": 2.9277, + "step": 58900 + }, + { + "epoch": 4.002242152466367, + "grad_norm": 7.3897576332092285, + "learning_rate": 4.9993205598586766e-06, + "loss": 2.7326, + "step": 58905 + }, + { + "epoch": 4.0025818725370295, + "grad_norm": 5.456431865692139, + "learning_rate": 4.99889590977035e-06, + "loss": 2.7919, + "step": 58910 + }, + { + "epoch": 4.0029215926076915, + "grad_norm": 8.591818809509277, + "learning_rate": 4.998471259682022e-06, + "loss": 2.6852, + "step": 58915 + }, + { + "epoch": 4.003261312678353, + "grad_norm": 8.169405937194824, + "learning_rate": 4.998046609593695e-06, + "loss": 3.0452, + "step": 58920 + }, + { + "epoch": 4.003601032749015, + "grad_norm": 6.836751461029053, + "learning_rate": 4.997621959505369e-06, + "loss": 2.8522, + "step": 58925 + }, + { + "epoch": 4.003940752819677, + "grad_norm": 9.11959457397461, + "learning_rate": 4.9971973094170406e-06, + "loss": 3.0809, + "step": 58930 + }, + { + "epoch": 4.004280472890338, + "grad_norm": 9.250102043151855, + "learning_rate": 4.996772659328713e-06, + "loss": 2.8383, + "step": 58935 + }, + { + "epoch": 4.004620192961, + "grad_norm": 6.314119338989258, + "learning_rate": 4.996348009240386e-06, + "loss": 2.9264, + "step": 58940 + }, + { + "epoch": 4.004959913031662, + "grad_norm": 6.347469806671143, + "learning_rate": 4.995923359152059e-06, + "loss": 2.8023, + "step": 58945 + }, + { + "epoch": 4.005299633102323, + "grad_norm": 5.63197135925293, + "learning_rate": 4.995498709063732e-06, + "loss": 2.8103, + "step": 58950 + }, + { + "epoch": 4.0056393531729855, + "grad_norm": 7.998116493225098, + "learning_rate": 4.9950740589754046e-06, + "loss": 2.8013, + "step": 58955 + }, + { + "epoch": 4.0059790732436475, + "grad_norm": 8.013602256774902, + "learning_rate": 4.994649408887077e-06, + "loss": 2.9149, + "step": 58960 + }, + { + "epoch": 4.006318793314309, + "grad_norm": 6.828099250793457, + "learning_rate": 4.99422475879875e-06, + "loss": 2.7023, + "step": 58965 + }, + { + "epoch": 4.006658513384971, + "grad_norm": 5.992077827453613, + "learning_rate": 4.993800108710423e-06, + "loss": 2.7166, + "step": 58970 + }, + { + "epoch": 4.006998233455633, + "grad_norm": 7.678843021392822, + "learning_rate": 4.993375458622096e-06, + "loss": 2.9596, + "step": 58975 + }, + { + "epoch": 4.007337953526294, + "grad_norm": 7.004114151000977, + "learning_rate": 4.9929508085337686e-06, + "loss": 2.9315, + "step": 58980 + }, + { + "epoch": 4.007677673596956, + "grad_norm": 7.3110880851745605, + "learning_rate": 4.992526158445441e-06, + "loss": 2.8777, + "step": 58985 + }, + { + "epoch": 4.008017393667618, + "grad_norm": 6.496171474456787, + "learning_rate": 4.992101508357114e-06, + "loss": 2.8307, + "step": 58990 + }, + { + "epoch": 4.008357113738279, + "grad_norm": 8.180511474609375, + "learning_rate": 4.991676858268787e-06, + "loss": 2.8743, + "step": 58995 + }, + { + "epoch": 4.0086968338089415, + "grad_norm": 7.103981018066406, + "learning_rate": 4.99125220818046e-06, + "loss": 3.0473, + "step": 59000 + }, + { + "epoch": 4.009036553879604, + "grad_norm": 6.538247585296631, + "learning_rate": 4.9908275580921326e-06, + "loss": 2.8089, + "step": 59005 + }, + { + "epoch": 4.009376273950265, + "grad_norm": 6.461260795593262, + "learning_rate": 4.990402908003805e-06, + "loss": 2.801, + "step": 59010 + }, + { + "epoch": 4.009715994020927, + "grad_norm": 9.570207595825195, + "learning_rate": 4.989978257915478e-06, + "loss": 2.6142, + "step": 59015 + }, + { + "epoch": 4.010055714091589, + "grad_norm": 5.3772664070129395, + "learning_rate": 4.989553607827151e-06, + "loss": 3.0955, + "step": 59020 + }, + { + "epoch": 4.01039543416225, + "grad_norm": 7.2932024002075195, + "learning_rate": 4.989128957738824e-06, + "loss": 2.8621, + "step": 59025 + }, + { + "epoch": 4.010735154232912, + "grad_norm": 6.085014820098877, + "learning_rate": 4.9887043076504966e-06, + "loss": 2.7578, + "step": 59030 + }, + { + "epoch": 4.011074874303574, + "grad_norm": 6.798295497894287, + "learning_rate": 4.9882796575621685e-06, + "loss": 2.762, + "step": 59035 + }, + { + "epoch": 4.011414594374235, + "grad_norm": 7.562862873077393, + "learning_rate": 4.987855007473842e-06, + "loss": 2.7561, + "step": 59040 + }, + { + "epoch": 4.0117543144448975, + "grad_norm": 5.792308807373047, + "learning_rate": 4.987430357385515e-06, + "loss": 2.9786, + "step": 59045 + }, + { + "epoch": 4.01209403451556, + "grad_norm": 8.059475898742676, + "learning_rate": 4.987005707297187e-06, + "loss": 2.857, + "step": 59050 + }, + { + "epoch": 4.012433754586221, + "grad_norm": 8.404601097106934, + "learning_rate": 4.9865810572088606e-06, + "loss": 2.8337, + "step": 59055 + }, + { + "epoch": 4.012773474656883, + "grad_norm": 6.433998107910156, + "learning_rate": 4.986156407120533e-06, + "loss": 2.9767, + "step": 59060 + }, + { + "epoch": 4.013113194727545, + "grad_norm": 7.329721927642822, + "learning_rate": 4.985731757032206e-06, + "loss": 3.001, + "step": 59065 + }, + { + "epoch": 4.013452914798206, + "grad_norm": 6.969733238220215, + "learning_rate": 4.985307106943878e-06, + "loss": 2.7786, + "step": 59070 + }, + { + "epoch": 4.013792634868868, + "grad_norm": 6.052149772644043, + "learning_rate": 4.984882456855552e-06, + "loss": 2.5909, + "step": 59075 + }, + { + "epoch": 4.01413235493953, + "grad_norm": 7.745763778686523, + "learning_rate": 4.984457806767225e-06, + "loss": 2.8787, + "step": 59080 + }, + { + "epoch": 4.014472075010191, + "grad_norm": 7.5934224128723145, + "learning_rate": 4.9840331566788965e-06, + "loss": 2.854, + "step": 59085 + }, + { + "epoch": 4.0148117950808535, + "grad_norm": 5.316802501678467, + "learning_rate": 4.98360850659057e-06, + "loss": 2.8926, + "step": 59090 + }, + { + "epoch": 4.015151515151516, + "grad_norm": 5.702142238616943, + "learning_rate": 4.983183856502243e-06, + "loss": 2.8971, + "step": 59095 + }, + { + "epoch": 4.015491235222177, + "grad_norm": 6.822263717651367, + "learning_rate": 4.982759206413915e-06, + "loss": 2.9671, + "step": 59100 + }, + { + "epoch": 4.015830955292839, + "grad_norm": 7.1588850021362305, + "learning_rate": 4.982334556325588e-06, + "loss": 2.8137, + "step": 59105 + }, + { + "epoch": 4.016170675363501, + "grad_norm": 7.50213098526001, + "learning_rate": 4.981909906237261e-06, + "loss": 2.8261, + "step": 59110 + }, + { + "epoch": 4.016510395434162, + "grad_norm": 8.7726469039917, + "learning_rate": 4.981485256148933e-06, + "loss": 2.6942, + "step": 59115 + }, + { + "epoch": 4.016850115504824, + "grad_norm": 7.791707515716553, + "learning_rate": 4.981060606060606e-06, + "loss": 2.7891, + "step": 59120 + }, + { + "epoch": 4.017189835575485, + "grad_norm": 7.893668174743652, + "learning_rate": 4.98063595597228e-06, + "loss": 2.6393, + "step": 59125 + }, + { + "epoch": 4.017529555646147, + "grad_norm": 5.740914344787598, + "learning_rate": 4.980211305883952e-06, + "loss": 3.0175, + "step": 59130 + }, + { + "epoch": 4.0178692757168095, + "grad_norm": 7.3009490966796875, + "learning_rate": 4.9797866557956245e-06, + "loss": 2.9449, + "step": 59135 + }, + { + "epoch": 4.018208995787471, + "grad_norm": 9.230728149414062, + "learning_rate": 4.979362005707298e-06, + "loss": 2.9184, + "step": 59140 + }, + { + "epoch": 4.018548715858133, + "grad_norm": 6.8174238204956055, + "learning_rate": 4.97893735561897e-06, + "loss": 2.7766, + "step": 59145 + }, + { + "epoch": 4.018888435928795, + "grad_norm": 6.781542778015137, + "learning_rate": 4.978512705530643e-06, + "loss": 2.8464, + "step": 59150 + }, + { + "epoch": 4.019228155999456, + "grad_norm": 4.914586067199707, + "learning_rate": 4.978088055442316e-06, + "loss": 3.0432, + "step": 59155 + }, + { + "epoch": 4.019567876070118, + "grad_norm": 6.774506568908691, + "learning_rate": 4.9776634053539885e-06, + "loss": 2.657, + "step": 59160 + }, + { + "epoch": 4.01990759614078, + "grad_norm": 6.729639530181885, + "learning_rate": 4.977238755265661e-06, + "loss": 2.8899, + "step": 59165 + }, + { + "epoch": 4.020247316211441, + "grad_norm": 5.739269733428955, + "learning_rate": 4.976814105177334e-06, + "loss": 2.8132, + "step": 59170 + }, + { + "epoch": 4.0205870362821035, + "grad_norm": 7.171593189239502, + "learning_rate": 4.976389455089007e-06, + "loss": 3.04, + "step": 59175 + }, + { + "epoch": 4.0209267563527655, + "grad_norm": 7.340668678283691, + "learning_rate": 4.97596480500068e-06, + "loss": 2.7057, + "step": 59180 + }, + { + "epoch": 4.021266476423427, + "grad_norm": 8.866446495056152, + "learning_rate": 4.9755401549123525e-06, + "loss": 3.0345, + "step": 59185 + }, + { + "epoch": 4.021606196494089, + "grad_norm": 5.809514999389648, + "learning_rate": 4.975115504824025e-06, + "loss": 3.0591, + "step": 59190 + }, + { + "epoch": 4.021945916564751, + "grad_norm": 6.278859615325928, + "learning_rate": 4.974690854735698e-06, + "loss": 2.9609, + "step": 59195 + }, + { + "epoch": 4.022285636635412, + "grad_norm": 7.850459575653076, + "learning_rate": 4.974266204647371e-06, + "loss": 2.8903, + "step": 59200 + }, + { + "epoch": 4.022625356706074, + "grad_norm": 7.039351463317871, + "learning_rate": 4.973841554559044e-06, + "loss": 2.9959, + "step": 59205 + }, + { + "epoch": 4.022965076776736, + "grad_norm": 6.980344772338867, + "learning_rate": 4.9734169044707166e-06, + "loss": 2.9346, + "step": 59210 + }, + { + "epoch": 4.023304796847397, + "grad_norm": 7.451782703399658, + "learning_rate": 4.972992254382389e-06, + "loss": 2.9035, + "step": 59215 + }, + { + "epoch": 4.0236445169180595, + "grad_norm": 7.387642860412598, + "learning_rate": 4.972567604294062e-06, + "loss": 2.9352, + "step": 59220 + }, + { + "epoch": 4.0239842369887215, + "grad_norm": 8.710426330566406, + "learning_rate": 4.972142954205735e-06, + "loss": 2.835, + "step": 59225 + }, + { + "epoch": 4.024323957059383, + "grad_norm": 5.357048988342285, + "learning_rate": 4.971718304117408e-06, + "loss": 3.0002, + "step": 59230 + }, + { + "epoch": 4.024663677130045, + "grad_norm": 6.640896320343018, + "learning_rate": 4.9712936540290806e-06, + "loss": 2.9612, + "step": 59235 + }, + { + "epoch": 4.025003397200707, + "grad_norm": 5.859526634216309, + "learning_rate": 4.970869003940753e-06, + "loss": 2.8418, + "step": 59240 + }, + { + "epoch": 4.025343117271368, + "grad_norm": 7.76796817779541, + "learning_rate": 4.970444353852426e-06, + "loss": 2.9838, + "step": 59245 + }, + { + "epoch": 4.02568283734203, + "grad_norm": 7.038700103759766, + "learning_rate": 4.970019703764099e-06, + "loss": 2.9462, + "step": 59250 + }, + { + "epoch": 4.026022557412692, + "grad_norm": 6.888749599456787, + "learning_rate": 4.969595053675772e-06, + "loss": 2.8835, + "step": 59255 + }, + { + "epoch": 4.026362277483353, + "grad_norm": 6.731431007385254, + "learning_rate": 4.9691704035874446e-06, + "loss": 2.839, + "step": 59260 + }, + { + "epoch": 4.0267019975540155, + "grad_norm": 7.056030750274658, + "learning_rate": 4.968745753499117e-06, + "loss": 3.0349, + "step": 59265 + }, + { + "epoch": 4.0270417176246776, + "grad_norm": 8.16157054901123, + "learning_rate": 4.96832110341079e-06, + "loss": 2.9436, + "step": 59270 + }, + { + "epoch": 4.027381437695339, + "grad_norm": 8.190460205078125, + "learning_rate": 4.967896453322463e-06, + "loss": 2.9614, + "step": 59275 + }, + { + "epoch": 4.027721157766001, + "grad_norm": 7.224695205688477, + "learning_rate": 4.967471803234136e-06, + "loss": 2.7711, + "step": 59280 + }, + { + "epoch": 4.028060877836663, + "grad_norm": 7.9551472663879395, + "learning_rate": 4.967047153145808e-06, + "loss": 2.9583, + "step": 59285 + }, + { + "epoch": 4.028400597907324, + "grad_norm": 6.1070170402526855, + "learning_rate": 4.966622503057481e-06, + "loss": 2.7469, + "step": 59290 + }, + { + "epoch": 4.028740317977986, + "grad_norm": 8.64848518371582, + "learning_rate": 4.966197852969154e-06, + "loss": 2.7932, + "step": 59295 + }, + { + "epoch": 4.029080038048648, + "grad_norm": 5.684412002563477, + "learning_rate": 4.965773202880826e-06, + "loss": 2.8573, + "step": 59300 + }, + { + "epoch": 4.029419758119309, + "grad_norm": 6.682589530944824, + "learning_rate": 4.9653485527925e-06, + "loss": 2.8843, + "step": 59305 + }, + { + "epoch": 4.0297594781899715, + "grad_norm": 7.906208038330078, + "learning_rate": 4.9649239027041726e-06, + "loss": 2.7313, + "step": 59310 + }, + { + "epoch": 4.030099198260634, + "grad_norm": 7.362555027008057, + "learning_rate": 4.9644992526158445e-06, + "loss": 2.7682, + "step": 59315 + }, + { + "epoch": 4.030438918331295, + "grad_norm": 6.802735328674316, + "learning_rate": 4.964074602527517e-06, + "loss": 3.0141, + "step": 59320 + }, + { + "epoch": 4.030778638401957, + "grad_norm": 7.57252311706543, + "learning_rate": 4.963649952439191e-06, + "loss": 2.8891, + "step": 59325 + }, + { + "epoch": 4.031118358472619, + "grad_norm": 6.4134979248046875, + "learning_rate": 4.963225302350863e-06, + "loss": 2.9586, + "step": 59330 + }, + { + "epoch": 4.03145807854328, + "grad_norm": 7.9903059005737305, + "learning_rate": 4.962800652262536e-06, + "loss": 2.9673, + "step": 59335 + }, + { + "epoch": 4.031797798613942, + "grad_norm": 7.3027448654174805, + "learning_rate": 4.962376002174209e-06, + "loss": 2.8401, + "step": 59340 + }, + { + "epoch": 4.032137518684604, + "grad_norm": 8.264392852783203, + "learning_rate": 4.961951352085881e-06, + "loss": 2.988, + "step": 59345 + }, + { + "epoch": 4.032477238755265, + "grad_norm": 6.853834629058838, + "learning_rate": 4.961526701997554e-06, + "loss": 2.8465, + "step": 59350 + }, + { + "epoch": 4.0328169588259275, + "grad_norm": 8.260125160217285, + "learning_rate": 4.961102051909227e-06, + "loss": 2.9157, + "step": 59355 + }, + { + "epoch": 4.03315667889659, + "grad_norm": 8.618648529052734, + "learning_rate": 4.9606774018209e-06, + "loss": 2.7373, + "step": 59360 + }, + { + "epoch": 4.033496398967251, + "grad_norm": 7.2522807121276855, + "learning_rate": 4.9602527517325725e-06, + "loss": 2.8515, + "step": 59365 + }, + { + "epoch": 4.033836119037913, + "grad_norm": 6.9599785804748535, + "learning_rate": 4.959828101644245e-06, + "loss": 2.8602, + "step": 59370 + }, + { + "epoch": 4.034175839108575, + "grad_norm": 6.045756816864014, + "learning_rate": 4.959403451555918e-06, + "loss": 2.7926, + "step": 59375 + }, + { + "epoch": 4.034515559179236, + "grad_norm": 7.275684833526611, + "learning_rate": 4.958978801467591e-06, + "loss": 3.1031, + "step": 59380 + }, + { + "epoch": 4.034855279249898, + "grad_norm": 6.48802375793457, + "learning_rate": 4.958554151379264e-06, + "loss": 2.8981, + "step": 59385 + }, + { + "epoch": 4.03519499932056, + "grad_norm": 6.841697692871094, + "learning_rate": 4.9581295012909365e-06, + "loss": 2.7598, + "step": 59390 + }, + { + "epoch": 4.035534719391221, + "grad_norm": 8.807452201843262, + "learning_rate": 4.957704851202609e-06, + "loss": 2.711, + "step": 59395 + }, + { + "epoch": 4.0358744394618835, + "grad_norm": 9.530858993530273, + "learning_rate": 4.957280201114282e-06, + "loss": 2.6792, + "step": 59400 + }, + { + "epoch": 4.036214159532546, + "grad_norm": 6.977512359619141, + "learning_rate": 4.956855551025955e-06, + "loss": 2.7717, + "step": 59405 + }, + { + "epoch": 4.036553879603207, + "grad_norm": 7.079077243804932, + "learning_rate": 4.956430900937628e-06, + "loss": 2.6178, + "step": 59410 + }, + { + "epoch": 4.036893599673869, + "grad_norm": 7.3676958084106445, + "learning_rate": 4.9560062508493005e-06, + "loss": 2.7745, + "step": 59415 + }, + { + "epoch": 4.037233319744531, + "grad_norm": 7.2478837966918945, + "learning_rate": 4.955581600760973e-06, + "loss": 2.8103, + "step": 59420 + }, + { + "epoch": 4.037573039815192, + "grad_norm": 7.340716361999512, + "learning_rate": 4.955156950672646e-06, + "loss": 2.798, + "step": 59425 + }, + { + "epoch": 4.037912759885854, + "grad_norm": 9.83406925201416, + "learning_rate": 4.954732300584319e-06, + "loss": 2.8063, + "step": 59430 + }, + { + "epoch": 4.038252479956516, + "grad_norm": 6.498510837554932, + "learning_rate": 4.954307650495992e-06, + "loss": 2.79, + "step": 59435 + }, + { + "epoch": 4.0385922000271774, + "grad_norm": 7.505764961242676, + "learning_rate": 4.9538830004076645e-06, + "loss": 2.6404, + "step": 59440 + }, + { + "epoch": 4.0389319200978395, + "grad_norm": 10.307002067565918, + "learning_rate": 4.953458350319337e-06, + "loss": 2.8814, + "step": 59445 + }, + { + "epoch": 4.039271640168501, + "grad_norm": 8.262414932250977, + "learning_rate": 4.95303370023101e-06, + "loss": 2.9113, + "step": 59450 + }, + { + "epoch": 4.039611360239163, + "grad_norm": 8.035114288330078, + "learning_rate": 4.952609050142683e-06, + "loss": 3.0676, + "step": 59455 + }, + { + "epoch": 4.039951080309825, + "grad_norm": 6.8241753578186035, + "learning_rate": 4.952184400054356e-06, + "loss": 2.9373, + "step": 59460 + }, + { + "epoch": 4.040290800380486, + "grad_norm": 8.98779296875, + "learning_rate": 4.9517597499660285e-06, + "loss": 2.7407, + "step": 59465 + }, + { + "epoch": 4.040630520451148, + "grad_norm": 5.956225395202637, + "learning_rate": 4.951335099877701e-06, + "loss": 2.949, + "step": 59470 + }, + { + "epoch": 4.04097024052181, + "grad_norm": 5.989510536193848, + "learning_rate": 4.950910449789374e-06, + "loss": 2.8304, + "step": 59475 + }, + { + "epoch": 4.041309960592471, + "grad_norm": 7.366296768188477, + "learning_rate": 4.950485799701047e-06, + "loss": 2.8463, + "step": 59480 + }, + { + "epoch": 4.0416496806631335, + "grad_norm": 6.940897464752197, + "learning_rate": 4.95006114961272e-06, + "loss": 2.8334, + "step": 59485 + }, + { + "epoch": 4.0419894007337955, + "grad_norm": 9.027238845825195, + "learning_rate": 4.9496364995243925e-06, + "loss": 2.9524, + "step": 59490 + }, + { + "epoch": 4.042329120804457, + "grad_norm": 8.535918235778809, + "learning_rate": 4.949211849436065e-06, + "loss": 3.0472, + "step": 59495 + }, + { + "epoch": 4.042668840875119, + "grad_norm": 8.418119430541992, + "learning_rate": 4.948787199347737e-06, + "loss": 3.0463, + "step": 59500 + }, + { + "epoch": 4.043008560945781, + "grad_norm": 5.954195499420166, + "learning_rate": 4.948362549259411e-06, + "loss": 2.9983, + "step": 59505 + }, + { + "epoch": 4.043348281016442, + "grad_norm": 8.137399673461914, + "learning_rate": 4.947937899171084e-06, + "loss": 2.8174, + "step": 59510 + }, + { + "epoch": 4.043688001087104, + "grad_norm": 7.775012969970703, + "learning_rate": 4.947513249082756e-06, + "loss": 2.863, + "step": 59515 + }, + { + "epoch": 4.044027721157766, + "grad_norm": 7.8245038986206055, + "learning_rate": 4.947088598994429e-06, + "loss": 2.8467, + "step": 59520 + }, + { + "epoch": 4.044367441228427, + "grad_norm": 6.69458532333374, + "learning_rate": 4.946663948906102e-06, + "loss": 2.7042, + "step": 59525 + }, + { + "epoch": 4.0447071612990895, + "grad_norm": 5.443755626678467, + "learning_rate": 4.946239298817774e-06, + "loss": 2.8286, + "step": 59530 + }, + { + "epoch": 4.0450468813697515, + "grad_norm": 7.918039321899414, + "learning_rate": 4.945814648729447e-06, + "loss": 3.0866, + "step": 59535 + }, + { + "epoch": 4.045386601440413, + "grad_norm": 6.147830486297607, + "learning_rate": 4.9453899986411206e-06, + "loss": 3.0261, + "step": 59540 + }, + { + "epoch": 4.045726321511075, + "grad_norm": 7.256722927093506, + "learning_rate": 4.9449653485527925e-06, + "loss": 2.7816, + "step": 59545 + }, + { + "epoch": 4.046066041581737, + "grad_norm": 7.228250026702881, + "learning_rate": 4.944540698464465e-06, + "loss": 2.7547, + "step": 59550 + }, + { + "epoch": 4.046405761652398, + "grad_norm": 6.495712757110596, + "learning_rate": 4.944116048376139e-06, + "loss": 2.6573, + "step": 59555 + }, + { + "epoch": 4.04674548172306, + "grad_norm": 7.8823418617248535, + "learning_rate": 4.943691398287811e-06, + "loss": 3.0837, + "step": 59560 + }, + { + "epoch": 4.047085201793722, + "grad_norm": 8.80430793762207, + "learning_rate": 4.943266748199484e-06, + "loss": 2.9571, + "step": 59565 + }, + { + "epoch": 4.047424921864383, + "grad_norm": 8.450982093811035, + "learning_rate": 4.9428420981111565e-06, + "loss": 2.7674, + "step": 59570 + }, + { + "epoch": 4.0477646419350455, + "grad_norm": 6.204038143157959, + "learning_rate": 4.94241744802283e-06, + "loss": 2.5887, + "step": 59575 + }, + { + "epoch": 4.048104362005708, + "grad_norm": 7.101531505584717, + "learning_rate": 4.941992797934502e-06, + "loss": 2.9609, + "step": 59580 + }, + { + "epoch": 4.048444082076369, + "grad_norm": 7.6059794425964355, + "learning_rate": 4.941568147846175e-06, + "loss": 2.8888, + "step": 59585 + }, + { + "epoch": 4.048783802147031, + "grad_norm": 7.465005874633789, + "learning_rate": 4.9411434977578486e-06, + "loss": 2.7711, + "step": 59590 + }, + { + "epoch": 4.049123522217693, + "grad_norm": 8.158862113952637, + "learning_rate": 4.9407188476695205e-06, + "loss": 2.8634, + "step": 59595 + }, + { + "epoch": 4.049463242288354, + "grad_norm": 6.639705181121826, + "learning_rate": 4.940294197581193e-06, + "loss": 2.9294, + "step": 59600 + }, + { + "epoch": 4.049802962359016, + "grad_norm": 8.188910484313965, + "learning_rate": 4.939869547492866e-06, + "loss": 2.9081, + "step": 59605 + }, + { + "epoch": 4.050142682429678, + "grad_norm": 7.953195571899414, + "learning_rate": 4.939444897404539e-06, + "loss": 2.9954, + "step": 59610 + }, + { + "epoch": 4.050482402500339, + "grad_norm": 7.759307861328125, + "learning_rate": 4.939020247316212e-06, + "loss": 2.8411, + "step": 59615 + }, + { + "epoch": 4.0508221225710015, + "grad_norm": 6.9509477615356445, + "learning_rate": 4.9385955972278845e-06, + "loss": 2.8818, + "step": 59620 + }, + { + "epoch": 4.051161842641664, + "grad_norm": 8.5126371383667, + "learning_rate": 4.938170947139557e-06, + "loss": 2.8063, + "step": 59625 + }, + { + "epoch": 4.051501562712325, + "grad_norm": 6.802994728088379, + "learning_rate": 4.93774629705123e-06, + "loss": 2.7807, + "step": 59630 + }, + { + "epoch": 4.051841282782987, + "grad_norm": 8.74599838256836, + "learning_rate": 4.937321646962903e-06, + "loss": 3.1667, + "step": 59635 + }, + { + "epoch": 4.052181002853649, + "grad_norm": 5.486954689025879, + "learning_rate": 4.936896996874576e-06, + "loss": 2.6246, + "step": 59640 + }, + { + "epoch": 4.05252072292431, + "grad_norm": 6.432558059692383, + "learning_rate": 4.9364723467862485e-06, + "loss": 2.8392, + "step": 59645 + }, + { + "epoch": 4.052860442994972, + "grad_norm": 9.461219787597656, + "learning_rate": 4.936047696697921e-06, + "loss": 2.9807, + "step": 59650 + }, + { + "epoch": 4.053200163065634, + "grad_norm": 8.016858100891113, + "learning_rate": 4.935623046609594e-06, + "loss": 2.891, + "step": 59655 + }, + { + "epoch": 4.053539883136295, + "grad_norm": 7.49808931350708, + "learning_rate": 4.935198396521267e-06, + "loss": 2.772, + "step": 59660 + }, + { + "epoch": 4.0538796032069575, + "grad_norm": 8.617615699768066, + "learning_rate": 4.93477374643294e-06, + "loss": 2.9292, + "step": 59665 + }, + { + "epoch": 4.05421932327762, + "grad_norm": 7.4893903732299805, + "learning_rate": 4.9343490963446125e-06, + "loss": 3.1096, + "step": 59670 + }, + { + "epoch": 4.054559043348281, + "grad_norm": 7.58221960067749, + "learning_rate": 4.933924446256285e-06, + "loss": 2.8837, + "step": 59675 + }, + { + "epoch": 4.054898763418943, + "grad_norm": 7.156195640563965, + "learning_rate": 4.933499796167958e-06, + "loss": 2.9599, + "step": 59680 + }, + { + "epoch": 4.055238483489605, + "grad_norm": 6.081698894500732, + "learning_rate": 4.933075146079631e-06, + "loss": 2.8035, + "step": 59685 + }, + { + "epoch": 4.055578203560266, + "grad_norm": 7.770419120788574, + "learning_rate": 4.932650495991304e-06, + "loss": 2.8413, + "step": 59690 + }, + { + "epoch": 4.055917923630928, + "grad_norm": 7.098233222961426, + "learning_rate": 4.9322258459029765e-06, + "loss": 2.8233, + "step": 59695 + }, + { + "epoch": 4.05625764370159, + "grad_norm": 7.058158874511719, + "learning_rate": 4.9318011958146485e-06, + "loss": 2.6679, + "step": 59700 + }, + { + "epoch": 4.056597363772251, + "grad_norm": 6.043190956115723, + "learning_rate": 4.931376545726322e-06, + "loss": 2.7561, + "step": 59705 + }, + { + "epoch": 4.0569370838429135, + "grad_norm": 8.004807472229004, + "learning_rate": 4.930951895637995e-06, + "loss": 2.9247, + "step": 59710 + }, + { + "epoch": 4.057276803913576, + "grad_norm": 6.383126735687256, + "learning_rate": 4.930527245549667e-06, + "loss": 2.9429, + "step": 59715 + }, + { + "epoch": 4.057616523984237, + "grad_norm": 5.821954250335693, + "learning_rate": 4.9301025954613405e-06, + "loss": 2.9049, + "step": 59720 + }, + { + "epoch": 4.057956244054899, + "grad_norm": 6.906919479370117, + "learning_rate": 4.929677945373013e-06, + "loss": 2.9338, + "step": 59725 + }, + { + "epoch": 4.058295964125561, + "grad_norm": 7.7338433265686035, + "learning_rate": 4.929253295284685e-06, + "loss": 2.892, + "step": 59730 + }, + { + "epoch": 4.058635684196222, + "grad_norm": 7.084253787994385, + "learning_rate": 4.928828645196358e-06, + "loss": 2.9475, + "step": 59735 + }, + { + "epoch": 4.058975404266884, + "grad_norm": 8.580099105834961, + "learning_rate": 4.928403995108032e-06, + "loss": 2.9738, + "step": 59740 + }, + { + "epoch": 4.059315124337546, + "grad_norm": 6.659943580627441, + "learning_rate": 4.9279793450197045e-06, + "loss": 3.0268, + "step": 59745 + }, + { + "epoch": 4.0596548444082075, + "grad_norm": 7.455867290496826, + "learning_rate": 4.9275546949313765e-06, + "loss": 2.9179, + "step": 59750 + }, + { + "epoch": 4.0599945644788695, + "grad_norm": 6.163914203643799, + "learning_rate": 4.92713004484305e-06, + "loss": 2.8501, + "step": 59755 + }, + { + "epoch": 4.060334284549532, + "grad_norm": 5.857725143432617, + "learning_rate": 4.926705394754723e-06, + "loss": 2.8009, + "step": 59760 + }, + { + "epoch": 4.060674004620193, + "grad_norm": 9.416610717773438, + "learning_rate": 4.926280744666395e-06, + "loss": 2.8638, + "step": 59765 + }, + { + "epoch": 4.061013724690855, + "grad_norm": 5.9221882820129395, + "learning_rate": 4.9258560945780685e-06, + "loss": 2.7816, + "step": 59770 + }, + { + "epoch": 4.061353444761517, + "grad_norm": 6.855648040771484, + "learning_rate": 4.925431444489741e-06, + "loss": 2.8224, + "step": 59775 + }, + { + "epoch": 4.061693164832178, + "grad_norm": 7.206960678100586, + "learning_rate": 4.925006794401413e-06, + "loss": 2.7955, + "step": 59780 + }, + { + "epoch": 4.06203288490284, + "grad_norm": 8.631123542785645, + "learning_rate": 4.924582144313086e-06, + "loss": 2.8774, + "step": 59785 + }, + { + "epoch": 4.062372604973502, + "grad_norm": 6.659198760986328, + "learning_rate": 4.92415749422476e-06, + "loss": 2.968, + "step": 59790 + }, + { + "epoch": 4.0627123250441635, + "grad_norm": 9.152936935424805, + "learning_rate": 4.923732844136432e-06, + "loss": 2.9838, + "step": 59795 + }, + { + "epoch": 4.0630520451148255, + "grad_norm": 7.296744346618652, + "learning_rate": 4.9233081940481045e-06, + "loss": 2.8666, + "step": 59800 + }, + { + "epoch": 4.063391765185487, + "grad_norm": 5.697526931762695, + "learning_rate": 4.922883543959778e-06, + "loss": 2.9855, + "step": 59805 + }, + { + "epoch": 4.063731485256149, + "grad_norm": 6.665969371795654, + "learning_rate": 4.92245889387145e-06, + "loss": 2.8962, + "step": 59810 + }, + { + "epoch": 4.064071205326811, + "grad_norm": 7.705109119415283, + "learning_rate": 4.922034243783123e-06, + "loss": 2.8216, + "step": 59815 + }, + { + "epoch": 4.064410925397472, + "grad_norm": 5.445897579193115, + "learning_rate": 4.921609593694796e-06, + "loss": 2.7901, + "step": 59820 + }, + { + "epoch": 4.064750645468134, + "grad_norm": 8.644323348999023, + "learning_rate": 4.9211849436064685e-06, + "loss": 2.8981, + "step": 59825 + }, + { + "epoch": 4.065090365538796, + "grad_norm": 7.4123311042785645, + "learning_rate": 4.920760293518141e-06, + "loss": 2.8144, + "step": 59830 + }, + { + "epoch": 4.065430085609457, + "grad_norm": 8.134114265441895, + "learning_rate": 4.920335643429814e-06, + "loss": 2.6612, + "step": 59835 + }, + { + "epoch": 4.0657698056801195, + "grad_norm": 7.472886562347412, + "learning_rate": 4.919910993341487e-06, + "loss": 3.089, + "step": 59840 + }, + { + "epoch": 4.0661095257507816, + "grad_norm": 6.493059158325195, + "learning_rate": 4.91948634325316e-06, + "loss": 3.0241, + "step": 59845 + }, + { + "epoch": 4.066449245821443, + "grad_norm": 7.344827175140381, + "learning_rate": 4.9190616931648325e-06, + "loss": 2.844, + "step": 59850 + }, + { + "epoch": 4.066788965892105, + "grad_norm": 10.259474754333496, + "learning_rate": 4.918637043076505e-06, + "loss": 2.5692, + "step": 59855 + }, + { + "epoch": 4.067128685962767, + "grad_norm": 6.543625831604004, + "learning_rate": 4.918212392988178e-06, + "loss": 2.8065, + "step": 59860 + }, + { + "epoch": 4.067468406033428, + "grad_norm": 6.909773826599121, + "learning_rate": 4.917787742899851e-06, + "loss": 2.8251, + "step": 59865 + }, + { + "epoch": 4.06780812610409, + "grad_norm": 7.720160484313965, + "learning_rate": 4.917363092811524e-06, + "loss": 2.987, + "step": 59870 + }, + { + "epoch": 4.068147846174752, + "grad_norm": 7.224229335784912, + "learning_rate": 4.9169384427231965e-06, + "loss": 2.9841, + "step": 59875 + }, + { + "epoch": 4.068487566245413, + "grad_norm": 6.767229080200195, + "learning_rate": 4.916513792634869e-06, + "loss": 3.0011, + "step": 59880 + }, + { + "epoch": 4.0688272863160755, + "grad_norm": 7.454165458679199, + "learning_rate": 4.916089142546542e-06, + "loss": 2.931, + "step": 59885 + }, + { + "epoch": 4.069167006386738, + "grad_norm": 6.681205749511719, + "learning_rate": 4.915664492458215e-06, + "loss": 2.841, + "step": 59890 + }, + { + "epoch": 4.069506726457399, + "grad_norm": 8.80533218383789, + "learning_rate": 4.915239842369888e-06, + "loss": 2.956, + "step": 59895 + }, + { + "epoch": 4.069846446528061, + "grad_norm": 6.716211795806885, + "learning_rate": 4.9148151922815605e-06, + "loss": 3.0744, + "step": 59900 + }, + { + "epoch": 4.070186166598723, + "grad_norm": 5.793525695800781, + "learning_rate": 4.914390542193233e-06, + "loss": 3.0263, + "step": 59905 + }, + { + "epoch": 4.070525886669384, + "grad_norm": 6.998322486877441, + "learning_rate": 4.913965892104906e-06, + "loss": 2.7536, + "step": 59910 + }, + { + "epoch": 4.070865606740046, + "grad_norm": 8.109857559204102, + "learning_rate": 4.913541242016579e-06, + "loss": 3.0148, + "step": 59915 + }, + { + "epoch": 4.071205326810708, + "grad_norm": 9.054686546325684, + "learning_rate": 4.913116591928252e-06, + "loss": 2.9588, + "step": 59920 + }, + { + "epoch": 4.071545046881369, + "grad_norm": 6.3182220458984375, + "learning_rate": 4.9126919418399245e-06, + "loss": 2.9299, + "step": 59925 + }, + { + "epoch": 4.0718847669520315, + "grad_norm": 8.1839017868042, + "learning_rate": 4.912267291751597e-06, + "loss": 2.9953, + "step": 59930 + }, + { + "epoch": 4.072224487022694, + "grad_norm": 6.311458110809326, + "learning_rate": 4.91184264166327e-06, + "loss": 2.8248, + "step": 59935 + }, + { + "epoch": 4.072564207093355, + "grad_norm": 6.754423141479492, + "learning_rate": 4.911417991574943e-06, + "loss": 3.021, + "step": 59940 + }, + { + "epoch": 4.072903927164017, + "grad_norm": 7.182726860046387, + "learning_rate": 4.910993341486616e-06, + "loss": 3.1406, + "step": 59945 + }, + { + "epoch": 4.073243647234679, + "grad_norm": 7.278266906738281, + "learning_rate": 4.910568691398288e-06, + "loss": 2.8782, + "step": 59950 + }, + { + "epoch": 4.07358336730534, + "grad_norm": 8.050676345825195, + "learning_rate": 4.910144041309961e-06, + "loss": 2.8141, + "step": 59955 + }, + { + "epoch": 4.073923087376002, + "grad_norm": 7.676116466522217, + "learning_rate": 4.909719391221634e-06, + "loss": 2.769, + "step": 59960 + }, + { + "epoch": 4.074262807446664, + "grad_norm": 7.32595157623291, + "learning_rate": 4.909294741133306e-06, + "loss": 2.6041, + "step": 59965 + }, + { + "epoch": 4.074602527517325, + "grad_norm": 7.551571369171143, + "learning_rate": 4.90887009104498e-06, + "loss": 2.8724, + "step": 59970 + }, + { + "epoch": 4.0749422475879875, + "grad_norm": 6.751758575439453, + "learning_rate": 4.9084454409566525e-06, + "loss": 2.8551, + "step": 59975 + }, + { + "epoch": 4.07528196765865, + "grad_norm": 6.346200466156006, + "learning_rate": 4.9080207908683245e-06, + "loss": 2.8601, + "step": 59980 + }, + { + "epoch": 4.075621687729311, + "grad_norm": 6.473633766174316, + "learning_rate": 4.907596140779997e-06, + "loss": 2.7698, + "step": 59985 + }, + { + "epoch": 4.075961407799973, + "grad_norm": 7.032909393310547, + "learning_rate": 4.907171490691671e-06, + "loss": 2.8124, + "step": 59990 + }, + { + "epoch": 4.076301127870635, + "grad_norm": 8.469171524047852, + "learning_rate": 4.906746840603343e-06, + "loss": 2.6904, + "step": 59995 + }, + { + "epoch": 4.076640847941296, + "grad_norm": 6.490625858306885, + "learning_rate": 4.906322190515016e-06, + "loss": 2.9389, + "step": 60000 + }, + { + "epoch": 4.076980568011958, + "grad_norm": 6.548933029174805, + "learning_rate": 4.905897540426689e-06, + "loss": 2.8664, + "step": 60005 + }, + { + "epoch": 4.07732028808262, + "grad_norm": 6.73244047164917, + "learning_rate": 4.905472890338361e-06, + "loss": 2.7448, + "step": 60010 + }, + { + "epoch": 4.0776600081532814, + "grad_norm": 8.978996276855469, + "learning_rate": 4.905048240250034e-06, + "loss": 2.6408, + "step": 60015 + }, + { + "epoch": 4.0779997282239435, + "grad_norm": 10.65623950958252, + "learning_rate": 4.904623590161707e-06, + "loss": 2.7899, + "step": 60020 + }, + { + "epoch": 4.078339448294606, + "grad_norm": 7.174496173858643, + "learning_rate": 4.90419894007338e-06, + "loss": 2.7348, + "step": 60025 + }, + { + "epoch": 4.078679168365267, + "grad_norm": 6.416026592254639, + "learning_rate": 4.9037742899850525e-06, + "loss": 2.8637, + "step": 60030 + }, + { + "epoch": 4.079018888435929, + "grad_norm": 7.303685665130615, + "learning_rate": 4.903349639896725e-06, + "loss": 2.9168, + "step": 60035 + }, + { + "epoch": 4.079358608506591, + "grad_norm": 7.860996246337891, + "learning_rate": 4.902924989808398e-06, + "loss": 2.9317, + "step": 60040 + }, + { + "epoch": 4.079698328577252, + "grad_norm": 7.9909491539001465, + "learning_rate": 4.902500339720071e-06, + "loss": 2.8984, + "step": 60045 + }, + { + "epoch": 4.080038048647914, + "grad_norm": 6.180850982666016, + "learning_rate": 4.902075689631744e-06, + "loss": 2.8865, + "step": 60050 + }, + { + "epoch": 4.080377768718576, + "grad_norm": 6.001675128936768, + "learning_rate": 4.9016510395434165e-06, + "loss": 2.88, + "step": 60055 + }, + { + "epoch": 4.0807174887892375, + "grad_norm": 5.689236164093018, + "learning_rate": 4.901226389455089e-06, + "loss": 2.7977, + "step": 60060 + }, + { + "epoch": 4.0810572088598995, + "grad_norm": 7.743323802947998, + "learning_rate": 4.900801739366762e-06, + "loss": 2.6882, + "step": 60065 + }, + { + "epoch": 4.081396928930562, + "grad_norm": 9.68921184539795, + "learning_rate": 4.900377089278435e-06, + "loss": 2.8529, + "step": 60070 + }, + { + "epoch": 4.081736649001223, + "grad_norm": 8.792872428894043, + "learning_rate": 4.899952439190108e-06, + "loss": 2.8582, + "step": 60075 + }, + { + "epoch": 4.082076369071885, + "grad_norm": 8.75819206237793, + "learning_rate": 4.8995277891017805e-06, + "loss": 2.8806, + "step": 60080 + }, + { + "epoch": 4.082416089142547, + "grad_norm": 7.245890140533447, + "learning_rate": 4.899103139013453e-06, + "loss": 3.022, + "step": 60085 + }, + { + "epoch": 4.082755809213208, + "grad_norm": 6.964841842651367, + "learning_rate": 4.898678488925126e-06, + "loss": 2.929, + "step": 60090 + }, + { + "epoch": 4.08309552928387, + "grad_norm": 8.595419883728027, + "learning_rate": 4.898253838836799e-06, + "loss": 3.2315, + "step": 60095 + }, + { + "epoch": 4.083435249354532, + "grad_norm": 7.635554313659668, + "learning_rate": 4.897829188748472e-06, + "loss": 2.7182, + "step": 60100 + }, + { + "epoch": 4.0837749694251935, + "grad_norm": 7.293752670288086, + "learning_rate": 4.8974045386601445e-06, + "loss": 2.6655, + "step": 60105 + }, + { + "epoch": 4.0841146894958555, + "grad_norm": 6.844944477081299, + "learning_rate": 4.896979888571817e-06, + "loss": 2.9131, + "step": 60110 + }, + { + "epoch": 4.084454409566518, + "grad_norm": 7.528110980987549, + "learning_rate": 4.89655523848349e-06, + "loss": 3.0458, + "step": 60115 + }, + { + "epoch": 4.084794129637179, + "grad_norm": 5.958136558532715, + "learning_rate": 4.896130588395163e-06, + "loss": 2.8965, + "step": 60120 + }, + { + "epoch": 4.085133849707841, + "grad_norm": 8.827089309692383, + "learning_rate": 4.895705938306836e-06, + "loss": 2.9697, + "step": 60125 + }, + { + "epoch": 4.085473569778502, + "grad_norm": 8.594598770141602, + "learning_rate": 4.8952812882185085e-06, + "loss": 2.9555, + "step": 60130 + }, + { + "epoch": 4.085813289849164, + "grad_norm": 8.150786399841309, + "learning_rate": 4.894856638130181e-06, + "loss": 3.1977, + "step": 60135 + }, + { + "epoch": 4.086153009919826, + "grad_norm": 7.584416389465332, + "learning_rate": 4.894431988041854e-06, + "loss": 2.697, + "step": 60140 + }, + { + "epoch": 4.086492729990487, + "grad_norm": 7.04803991317749, + "learning_rate": 4.894007337953527e-06, + "loss": 2.8878, + "step": 60145 + }, + { + "epoch": 4.0868324500611495, + "grad_norm": 7.027462959289551, + "learning_rate": 4.8935826878652e-06, + "loss": 2.9087, + "step": 60150 + }, + { + "epoch": 4.0871721701318116, + "grad_norm": 5.623779773712158, + "learning_rate": 4.8931580377768725e-06, + "loss": 3.0912, + "step": 60155 + }, + { + "epoch": 4.087511890202473, + "grad_norm": 7.029716491699219, + "learning_rate": 4.892733387688545e-06, + "loss": 2.8273, + "step": 60160 + }, + { + "epoch": 4.087851610273135, + "grad_norm": 6.449114799499512, + "learning_rate": 4.892308737600217e-06, + "loss": 3.0113, + "step": 60165 + }, + { + "epoch": 4.088191330343797, + "grad_norm": 7.584178924560547, + "learning_rate": 4.891884087511891e-06, + "loss": 2.8197, + "step": 60170 + }, + { + "epoch": 4.088531050414458, + "grad_norm": 7.6096367835998535, + "learning_rate": 4.891459437423564e-06, + "loss": 2.8627, + "step": 60175 + }, + { + "epoch": 4.08887077048512, + "grad_norm": 8.167500495910645, + "learning_rate": 4.891034787335236e-06, + "loss": 3.0716, + "step": 60180 + }, + { + "epoch": 4.089210490555782, + "grad_norm": 8.059659957885742, + "learning_rate": 4.890610137246909e-06, + "loss": 2.7457, + "step": 60185 + }, + { + "epoch": 4.089550210626443, + "grad_norm": 7.875836372375488, + "learning_rate": 4.890185487158582e-06, + "loss": 2.8117, + "step": 60190 + }, + { + "epoch": 4.0898899306971055, + "grad_norm": 7.76408576965332, + "learning_rate": 4.889760837070254e-06, + "loss": 3.0062, + "step": 60195 + }, + { + "epoch": 4.090229650767768, + "grad_norm": 6.812608242034912, + "learning_rate": 4.889336186981927e-06, + "loss": 2.8738, + "step": 60200 + }, + { + "epoch": 4.090569370838429, + "grad_norm": 11.354918479919434, + "learning_rate": 4.8889115368936005e-06, + "loss": 3.0275, + "step": 60205 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 6.915278911590576, + "learning_rate": 4.8884868868052725e-06, + "loss": 2.7048, + "step": 60210 + }, + { + "epoch": 4.091248810979753, + "grad_norm": 7.3370561599731445, + "learning_rate": 4.888062236716945e-06, + "loss": 2.8196, + "step": 60215 + }, + { + "epoch": 4.091588531050414, + "grad_norm": 7.960884094238281, + "learning_rate": 4.887637586628619e-06, + "loss": 2.8456, + "step": 60220 + }, + { + "epoch": 4.091928251121076, + "grad_norm": 6.463621139526367, + "learning_rate": 4.887212936540291e-06, + "loss": 2.9822, + "step": 60225 + }, + { + "epoch": 4.092267971191738, + "grad_norm": 6.209446907043457, + "learning_rate": 4.886788286451964e-06, + "loss": 3.0226, + "step": 60230 + }, + { + "epoch": 4.092607691262399, + "grad_norm": 6.795416355133057, + "learning_rate": 4.8863636363636365e-06, + "loss": 2.8245, + "step": 60235 + }, + { + "epoch": 4.0929474113330615, + "grad_norm": 8.77635383605957, + "learning_rate": 4.885938986275309e-06, + "loss": 2.732, + "step": 60240 + }, + { + "epoch": 4.093287131403724, + "grad_norm": 7.907521724700928, + "learning_rate": 4.885514336186982e-06, + "loss": 2.7754, + "step": 60245 + }, + { + "epoch": 4.093626851474385, + "grad_norm": 7.98337459564209, + "learning_rate": 4.885089686098655e-06, + "loss": 2.6568, + "step": 60250 + }, + { + "epoch": 4.093966571545047, + "grad_norm": 7.516916751861572, + "learning_rate": 4.8846650360103285e-06, + "loss": 2.9766, + "step": 60255 + }, + { + "epoch": 4.094306291615709, + "grad_norm": 8.25122356414795, + "learning_rate": 4.8842403859220005e-06, + "loss": 2.9227, + "step": 60260 + }, + { + "epoch": 4.09464601168637, + "grad_norm": 6.739058971405029, + "learning_rate": 4.883815735833673e-06, + "loss": 2.9616, + "step": 60265 + }, + { + "epoch": 4.094985731757032, + "grad_norm": 8.993966102600098, + "learning_rate": 4.883391085745346e-06, + "loss": 2.8591, + "step": 60270 + }, + { + "epoch": 4.095325451827694, + "grad_norm": 7.701512336730957, + "learning_rate": 4.882966435657019e-06, + "loss": 2.6007, + "step": 60275 + }, + { + "epoch": 4.095665171898355, + "grad_norm": 7.533486366271973, + "learning_rate": 4.882541785568692e-06, + "loss": 2.9179, + "step": 60280 + }, + { + "epoch": 4.0960048919690175, + "grad_norm": 5.67487907409668, + "learning_rate": 4.8821171354803645e-06, + "loss": 2.7249, + "step": 60285 + }, + { + "epoch": 4.09634461203968, + "grad_norm": 7.379180908203125, + "learning_rate": 4.881692485392037e-06, + "loss": 2.8106, + "step": 60290 + }, + { + "epoch": 4.096684332110341, + "grad_norm": 7.997586727142334, + "learning_rate": 4.88126783530371e-06, + "loss": 2.8934, + "step": 60295 + }, + { + "epoch": 4.097024052181003, + "grad_norm": 8.552322387695312, + "learning_rate": 4.880843185215383e-06, + "loss": 3.0384, + "step": 60300 + }, + { + "epoch": 4.097363772251665, + "grad_norm": 6.976053237915039, + "learning_rate": 4.880418535127056e-06, + "loss": 2.9201, + "step": 60305 + }, + { + "epoch": 4.097703492322326, + "grad_norm": 5.704463481903076, + "learning_rate": 4.8799938850387285e-06, + "loss": 2.6515, + "step": 60310 + }, + { + "epoch": 4.098043212392988, + "grad_norm": 7.163688659667969, + "learning_rate": 4.879569234950401e-06, + "loss": 2.9523, + "step": 60315 + }, + { + "epoch": 4.09838293246365, + "grad_norm": 6.402314186096191, + "learning_rate": 4.879144584862074e-06, + "loss": 2.7384, + "step": 60320 + }, + { + "epoch": 4.0987226525343115, + "grad_norm": 8.616961479187012, + "learning_rate": 4.878719934773747e-06, + "loss": 2.8861, + "step": 60325 + }, + { + "epoch": 4.0990623726049735, + "grad_norm": 6.710136890411377, + "learning_rate": 4.87829528468542e-06, + "loss": 2.8071, + "step": 60330 + }, + { + "epoch": 4.099402092675636, + "grad_norm": 8.03081226348877, + "learning_rate": 4.8778706345970925e-06, + "loss": 2.933, + "step": 60335 + }, + { + "epoch": 4.099741812746297, + "grad_norm": 7.988720893859863, + "learning_rate": 4.877445984508765e-06, + "loss": 2.948, + "step": 60340 + }, + { + "epoch": 4.100081532816959, + "grad_norm": 6.713408470153809, + "learning_rate": 4.877021334420438e-06, + "loss": 2.8514, + "step": 60345 + }, + { + "epoch": 4.100421252887621, + "grad_norm": 6.50644063949585, + "learning_rate": 4.876596684332111e-06, + "loss": 2.9336, + "step": 60350 + }, + { + "epoch": 4.100760972958282, + "grad_norm": 6.472694396972656, + "learning_rate": 4.876172034243784e-06, + "loss": 2.9725, + "step": 60355 + }, + { + "epoch": 4.101100693028944, + "grad_norm": 7.1277666091918945, + "learning_rate": 4.8757473841554565e-06, + "loss": 2.7761, + "step": 60360 + }, + { + "epoch": 4.101440413099606, + "grad_norm": 6.4939284324646, + "learning_rate": 4.8753227340671284e-06, + "loss": 2.8631, + "step": 60365 + }, + { + "epoch": 4.1017801331702675, + "grad_norm": 8.254693984985352, + "learning_rate": 4.874898083978802e-06, + "loss": 2.922, + "step": 60370 + }, + { + "epoch": 4.1021198532409295, + "grad_norm": 7.970438003540039, + "learning_rate": 4.874473433890475e-06, + "loss": 2.8809, + "step": 60375 + }, + { + "epoch": 4.102459573311592, + "grad_norm": 6.756032943725586, + "learning_rate": 4.874048783802147e-06, + "loss": 2.9294, + "step": 60380 + }, + { + "epoch": 4.102799293382253, + "grad_norm": 7.0850067138671875, + "learning_rate": 4.8736241337138205e-06, + "loss": 2.8192, + "step": 60385 + }, + { + "epoch": 4.103139013452915, + "grad_norm": 6.5904741287231445, + "learning_rate": 4.873199483625493e-06, + "loss": 2.7515, + "step": 60390 + }, + { + "epoch": 4.103478733523577, + "grad_norm": 6.583408832550049, + "learning_rate": 4.872774833537165e-06, + "loss": 2.6374, + "step": 60395 + }, + { + "epoch": 4.103818453594238, + "grad_norm": 6.596832275390625, + "learning_rate": 4.872350183448839e-06, + "loss": 2.7918, + "step": 60400 + }, + { + "epoch": 4.1041581736649, + "grad_norm": 9.504509925842285, + "learning_rate": 4.871925533360512e-06, + "loss": 2.9293, + "step": 60405 + }, + { + "epoch": 4.104497893735562, + "grad_norm": 6.364888668060303, + "learning_rate": 4.871500883272184e-06, + "loss": 2.6353, + "step": 60410 + }, + { + "epoch": 4.1048376138062235, + "grad_norm": 8.036624908447266, + "learning_rate": 4.8710762331838565e-06, + "loss": 2.8895, + "step": 60415 + }, + { + "epoch": 4.1051773338768855, + "grad_norm": 8.445289611816406, + "learning_rate": 4.87065158309553e-06, + "loss": 2.9168, + "step": 60420 + }, + { + "epoch": 4.105517053947548, + "grad_norm": 8.411364555358887, + "learning_rate": 4.870226933007203e-06, + "loss": 2.891, + "step": 60425 + }, + { + "epoch": 4.105856774018209, + "grad_norm": 7.117765426635742, + "learning_rate": 4.869802282918875e-06, + "loss": 2.8031, + "step": 60430 + }, + { + "epoch": 4.106196494088871, + "grad_norm": 7.190127849578857, + "learning_rate": 4.8693776328305485e-06, + "loss": 2.5925, + "step": 60435 + }, + { + "epoch": 4.106536214159533, + "grad_norm": 8.386926651000977, + "learning_rate": 4.868952982742221e-06, + "loss": 2.871, + "step": 60440 + }, + { + "epoch": 4.106875934230194, + "grad_norm": 7.038069725036621, + "learning_rate": 4.868528332653893e-06, + "loss": 2.839, + "step": 60445 + }, + { + "epoch": 4.107215654300856, + "grad_norm": 7.520260334014893, + "learning_rate": 4.868103682565566e-06, + "loss": 3.019, + "step": 60450 + }, + { + "epoch": 4.107555374371518, + "grad_norm": 6.9234299659729, + "learning_rate": 4.86767903247724e-06, + "loss": 2.8661, + "step": 60455 + }, + { + "epoch": 4.1078950944421795, + "grad_norm": 6.382512092590332, + "learning_rate": 4.867254382388912e-06, + "loss": 2.7652, + "step": 60460 + }, + { + "epoch": 4.108234814512842, + "grad_norm": 7.818853855133057, + "learning_rate": 4.8668297323005845e-06, + "loss": 2.8175, + "step": 60465 + }, + { + "epoch": 4.108574534583504, + "grad_norm": 7.285856246948242, + "learning_rate": 4.866405082212258e-06, + "loss": 2.7348, + "step": 60470 + }, + { + "epoch": 4.108914254654165, + "grad_norm": 8.573527336120605, + "learning_rate": 4.86598043212393e-06, + "loss": 3.0092, + "step": 60475 + }, + { + "epoch": 4.109253974724827, + "grad_norm": 6.4177680015563965, + "learning_rate": 4.865555782035603e-06, + "loss": 2.9122, + "step": 60480 + }, + { + "epoch": 4.109593694795488, + "grad_norm": 7.208138465881348, + "learning_rate": 4.865131131947276e-06, + "loss": 2.8699, + "step": 60485 + }, + { + "epoch": 4.10993341486615, + "grad_norm": 6.475455284118652, + "learning_rate": 4.8647064818589485e-06, + "loss": 3.1034, + "step": 60490 + }, + { + "epoch": 4.110273134936812, + "grad_norm": 8.766071319580078, + "learning_rate": 4.864281831770621e-06, + "loss": 2.9332, + "step": 60495 + }, + { + "epoch": 4.110612855007473, + "grad_norm": 8.44765567779541, + "learning_rate": 4.863857181682294e-06, + "loss": 2.5697, + "step": 60500 + }, + { + "epoch": 4.1109525750781355, + "grad_norm": 7.138019561767578, + "learning_rate": 4.863432531593967e-06, + "loss": 2.8151, + "step": 60505 + }, + { + "epoch": 4.111292295148798, + "grad_norm": 8.146322250366211, + "learning_rate": 4.86300788150564e-06, + "loss": 2.9738, + "step": 60510 + }, + { + "epoch": 4.111632015219459, + "grad_norm": 5.329372406005859, + "learning_rate": 4.8625832314173125e-06, + "loss": 2.9446, + "step": 60515 + }, + { + "epoch": 4.111971735290121, + "grad_norm": 9.028100967407227, + "learning_rate": 4.862158581328985e-06, + "loss": 2.8173, + "step": 60520 + }, + { + "epoch": 4.112311455360783, + "grad_norm": 8.725778579711914, + "learning_rate": 4.861733931240658e-06, + "loss": 2.8242, + "step": 60525 + }, + { + "epoch": 4.112651175431444, + "grad_norm": 9.148268699645996, + "learning_rate": 4.861309281152331e-06, + "loss": 2.7572, + "step": 60530 + }, + { + "epoch": 4.112990895502106, + "grad_norm": 7.916311264038086, + "learning_rate": 4.860884631064004e-06, + "loss": 2.5742, + "step": 60535 + }, + { + "epoch": 4.113330615572768, + "grad_norm": 7.30198860168457, + "learning_rate": 4.8604599809756765e-06, + "loss": 2.6721, + "step": 60540 + }, + { + "epoch": 4.113670335643429, + "grad_norm": 6.135364532470703, + "learning_rate": 4.860035330887349e-06, + "loss": 2.8378, + "step": 60545 + }, + { + "epoch": 4.1140100557140915, + "grad_norm": 7.210912704467773, + "learning_rate": 4.859610680799022e-06, + "loss": 2.6193, + "step": 60550 + }, + { + "epoch": 4.114349775784754, + "grad_norm": 7.053772449493408, + "learning_rate": 4.859186030710695e-06, + "loss": 2.7264, + "step": 60555 + }, + { + "epoch": 4.114689495855415, + "grad_norm": 9.05108642578125, + "learning_rate": 4.858761380622368e-06, + "loss": 2.7156, + "step": 60560 + }, + { + "epoch": 4.115029215926077, + "grad_norm": 7.565269470214844, + "learning_rate": 4.8583367305340405e-06, + "loss": 2.9262, + "step": 60565 + }, + { + "epoch": 4.115368935996739, + "grad_norm": 8.710519790649414, + "learning_rate": 4.857912080445713e-06, + "loss": 2.879, + "step": 60570 + }, + { + "epoch": 4.1157086560674, + "grad_norm": 6.4681854248046875, + "learning_rate": 4.857487430357386e-06, + "loss": 2.5602, + "step": 60575 + }, + { + "epoch": 4.116048376138062, + "grad_norm": 10.326025009155273, + "learning_rate": 4.857062780269058e-06, + "loss": 3.0043, + "step": 60580 + }, + { + "epoch": 4.116388096208724, + "grad_norm": 6.918997287750244, + "learning_rate": 4.856638130180732e-06, + "loss": 2.8097, + "step": 60585 + }, + { + "epoch": 4.116727816279385, + "grad_norm": 8.072678565979004, + "learning_rate": 4.8562134800924045e-06, + "loss": 2.8631, + "step": 60590 + }, + { + "epoch": 4.1170675363500475, + "grad_norm": 7.0070481300354, + "learning_rate": 4.855788830004077e-06, + "loss": 2.9734, + "step": 60595 + }, + { + "epoch": 4.11740725642071, + "grad_norm": 7.813503265380859, + "learning_rate": 4.85536417991575e-06, + "loss": 3.053, + "step": 60600 + }, + { + "epoch": 4.117746976491371, + "grad_norm": 7.99049711227417, + "learning_rate": 4.854939529827423e-06, + "loss": 2.8532, + "step": 60605 + }, + { + "epoch": 4.118086696562033, + "grad_norm": 6.782552242279053, + "learning_rate": 4.854514879739096e-06, + "loss": 2.7547, + "step": 60610 + }, + { + "epoch": 4.118426416632695, + "grad_norm": 7.177046775817871, + "learning_rate": 4.854090229650768e-06, + "loss": 2.8049, + "step": 60615 + }, + { + "epoch": 4.118766136703356, + "grad_norm": 6.373247146606445, + "learning_rate": 4.853665579562441e-06, + "loss": 2.8488, + "step": 60620 + }, + { + "epoch": 4.119105856774018, + "grad_norm": 7.1774115562438965, + "learning_rate": 4.853240929474114e-06, + "loss": 2.9471, + "step": 60625 + }, + { + "epoch": 4.11944557684468, + "grad_norm": 6.972933292388916, + "learning_rate": 4.852816279385786e-06, + "loss": 2.6078, + "step": 60630 + }, + { + "epoch": 4.1197852969153415, + "grad_norm": 6.426468372344971, + "learning_rate": 4.85239162929746e-06, + "loss": 3.0997, + "step": 60635 + }, + { + "epoch": 4.1201250169860035, + "grad_norm": 6.707523345947266, + "learning_rate": 4.8519669792091325e-06, + "loss": 2.8862, + "step": 60640 + }, + { + "epoch": 4.120464737056666, + "grad_norm": 7.625383377075195, + "learning_rate": 4.8515423291208044e-06, + "loss": 2.896, + "step": 60645 + }, + { + "epoch": 4.120804457127327, + "grad_norm": 7.640692234039307, + "learning_rate": 4.851117679032477e-06, + "loss": 3.0507, + "step": 60650 + }, + { + "epoch": 4.121144177197989, + "grad_norm": 6.460451602935791, + "learning_rate": 4.850693028944151e-06, + "loss": 2.9211, + "step": 60655 + }, + { + "epoch": 4.121483897268651, + "grad_norm": 6.014593124389648, + "learning_rate": 4.850268378855823e-06, + "loss": 2.7483, + "step": 60660 + }, + { + "epoch": 4.121823617339312, + "grad_norm": 7.36786413192749, + "learning_rate": 4.849843728767496e-06, + "loss": 2.6859, + "step": 60665 + }, + { + "epoch": 4.122163337409974, + "grad_norm": 8.421765327453613, + "learning_rate": 4.849419078679169e-06, + "loss": 2.7988, + "step": 60670 + }, + { + "epoch": 4.122503057480636, + "grad_norm": 7.833392143249512, + "learning_rate": 4.848994428590841e-06, + "loss": 2.6633, + "step": 60675 + }, + { + "epoch": 4.1228427775512975, + "grad_norm": 8.938583374023438, + "learning_rate": 4.848569778502514e-06, + "loss": 2.6799, + "step": 60680 + }, + { + "epoch": 4.1231824976219595, + "grad_norm": 11.181468963623047, + "learning_rate": 4.848145128414188e-06, + "loss": 2.7764, + "step": 60685 + }, + { + "epoch": 4.123522217692622, + "grad_norm": 6.239317417144775, + "learning_rate": 4.84772047832586e-06, + "loss": 3.2198, + "step": 60690 + }, + { + "epoch": 4.123861937763283, + "grad_norm": 7.043363094329834, + "learning_rate": 4.8472958282375325e-06, + "loss": 2.7106, + "step": 60695 + }, + { + "epoch": 4.124201657833945, + "grad_norm": 5.742588520050049, + "learning_rate": 4.846871178149205e-06, + "loss": 2.6533, + "step": 60700 + }, + { + "epoch": 4.124541377904607, + "grad_norm": 7.993927478790283, + "learning_rate": 4.846446528060878e-06, + "loss": 2.8904, + "step": 60705 + }, + { + "epoch": 4.124881097975268, + "grad_norm": 7.262521266937256, + "learning_rate": 4.846021877972551e-06, + "loss": 2.763, + "step": 60710 + }, + { + "epoch": 4.12522081804593, + "grad_norm": 6.607539653778076, + "learning_rate": 4.845597227884224e-06, + "loss": 2.8722, + "step": 60715 + }, + { + "epoch": 4.125560538116592, + "grad_norm": 6.735789775848389, + "learning_rate": 4.8451725777958965e-06, + "loss": 2.7692, + "step": 60720 + }, + { + "epoch": 4.1259002581872535, + "grad_norm": 6.077165126800537, + "learning_rate": 4.844747927707569e-06, + "loss": 2.5329, + "step": 60725 + }, + { + "epoch": 4.1262399782579156, + "grad_norm": 9.86886215209961, + "learning_rate": 4.844323277619242e-06, + "loss": 2.6961, + "step": 60730 + }, + { + "epoch": 4.126579698328578, + "grad_norm": 8.68247127532959, + "learning_rate": 4.843898627530915e-06, + "loss": 2.793, + "step": 60735 + }, + { + "epoch": 4.126919418399239, + "grad_norm": 6.688948631286621, + "learning_rate": 4.843473977442588e-06, + "loss": 2.7068, + "step": 60740 + }, + { + "epoch": 4.127259138469901, + "grad_norm": 8.27657413482666, + "learning_rate": 4.8430493273542605e-06, + "loss": 2.4756, + "step": 60745 + }, + { + "epoch": 4.127598858540563, + "grad_norm": 7.898627758026123, + "learning_rate": 4.842624677265933e-06, + "loss": 3.0595, + "step": 60750 + }, + { + "epoch": 4.127938578611224, + "grad_norm": 7.151381015777588, + "learning_rate": 4.842200027177606e-06, + "loss": 2.8471, + "step": 60755 + }, + { + "epoch": 4.128278298681886, + "grad_norm": 7.718079566955566, + "learning_rate": 4.841775377089279e-06, + "loss": 2.9055, + "step": 60760 + }, + { + "epoch": 4.128618018752548, + "grad_norm": 7.254812717437744, + "learning_rate": 4.841350727000952e-06, + "loss": 2.8032, + "step": 60765 + }, + { + "epoch": 4.1289577388232095, + "grad_norm": 8.633294105529785, + "learning_rate": 4.8409260769126245e-06, + "loss": 2.8896, + "step": 60770 + }, + { + "epoch": 4.129297458893872, + "grad_norm": 6.333627700805664, + "learning_rate": 4.840501426824297e-06, + "loss": 2.6813, + "step": 60775 + }, + { + "epoch": 4.129637178964534, + "grad_norm": 8.05040168762207, + "learning_rate": 4.84007677673597e-06, + "loss": 2.9379, + "step": 60780 + }, + { + "epoch": 4.129976899035195, + "grad_norm": 5.395543098449707, + "learning_rate": 4.839652126647643e-06, + "loss": 2.9495, + "step": 60785 + }, + { + "epoch": 4.130316619105857, + "grad_norm": 9.369839668273926, + "learning_rate": 4.839227476559316e-06, + "loss": 2.9098, + "step": 60790 + }, + { + "epoch": 4.130656339176519, + "grad_norm": 7.793503761291504, + "learning_rate": 4.8388028264709885e-06, + "loss": 2.8838, + "step": 60795 + }, + { + "epoch": 4.13099605924718, + "grad_norm": 6.376972675323486, + "learning_rate": 4.838378176382661e-06, + "loss": 2.7803, + "step": 60800 + }, + { + "epoch": 4.131335779317842, + "grad_norm": 5.746786594390869, + "learning_rate": 4.837953526294334e-06, + "loss": 2.6842, + "step": 60805 + }, + { + "epoch": 4.131675499388503, + "grad_norm": 8.092576026916504, + "learning_rate": 4.837528876206007e-06, + "loss": 2.827, + "step": 60810 + }, + { + "epoch": 4.1320152194591655, + "grad_norm": 7.270989894866943, + "learning_rate": 4.83710422611768e-06, + "loss": 2.6118, + "step": 60815 + }, + { + "epoch": 4.132354939529828, + "grad_norm": 7.160315990447998, + "learning_rate": 4.8366795760293525e-06, + "loss": 2.9196, + "step": 60820 + }, + { + "epoch": 4.132694659600489, + "grad_norm": 6.8476128578186035, + "learning_rate": 4.836254925941025e-06, + "loss": 2.8007, + "step": 60825 + }, + { + "epoch": 4.133034379671151, + "grad_norm": 8.16492748260498, + "learning_rate": 4.835830275852697e-06, + "loss": 2.8479, + "step": 60830 + }, + { + "epoch": 4.133374099741813, + "grad_norm": 7.445682048797607, + "learning_rate": 4.835405625764371e-06, + "loss": 2.9218, + "step": 60835 + }, + { + "epoch": 4.133713819812474, + "grad_norm": 7.265353202819824, + "learning_rate": 4.834980975676044e-06, + "loss": 2.7287, + "step": 60840 + }, + { + "epoch": 4.134053539883136, + "grad_norm": 9.888020515441895, + "learning_rate": 4.834556325587716e-06, + "loss": 2.6681, + "step": 60845 + }, + { + "epoch": 4.134393259953798, + "grad_norm": 7.2970290184021, + "learning_rate": 4.834131675499389e-06, + "loss": 2.8813, + "step": 60850 + }, + { + "epoch": 4.134732980024459, + "grad_norm": 7.19477653503418, + "learning_rate": 4.833707025411062e-06, + "loss": 2.8272, + "step": 60855 + }, + { + "epoch": 4.1350727000951215, + "grad_norm": 7.6617279052734375, + "learning_rate": 4.833282375322734e-06, + "loss": 2.8509, + "step": 60860 + }, + { + "epoch": 4.135412420165784, + "grad_norm": 7.731016635894775, + "learning_rate": 4.832857725234407e-06, + "loss": 2.8958, + "step": 60865 + }, + { + "epoch": 4.135752140236445, + "grad_norm": 7.294753551483154, + "learning_rate": 4.8324330751460805e-06, + "loss": 2.8317, + "step": 60870 + }, + { + "epoch": 4.136091860307107, + "grad_norm": 7.317753314971924, + "learning_rate": 4.8320084250577524e-06, + "loss": 2.8855, + "step": 60875 + }, + { + "epoch": 4.136431580377769, + "grad_norm": 8.220069885253906, + "learning_rate": 4.831583774969425e-06, + "loss": 2.9703, + "step": 60880 + }, + { + "epoch": 4.13677130044843, + "grad_norm": 8.315837860107422, + "learning_rate": 4.831159124881099e-06, + "loss": 2.8493, + "step": 60885 + }, + { + "epoch": 4.137111020519092, + "grad_norm": 7.931826591491699, + "learning_rate": 4.830734474792771e-06, + "loss": 2.9223, + "step": 60890 + }, + { + "epoch": 4.137450740589754, + "grad_norm": 7.174849987030029, + "learning_rate": 4.830309824704444e-06, + "loss": 2.785, + "step": 60895 + }, + { + "epoch": 4.1377904606604154, + "grad_norm": 7.70205020904541, + "learning_rate": 4.8298851746161164e-06, + "loss": 2.8292, + "step": 60900 + }, + { + "epoch": 4.1381301807310775, + "grad_norm": 7.767065525054932, + "learning_rate": 4.829460524527789e-06, + "loss": 2.746, + "step": 60905 + }, + { + "epoch": 4.13846990080174, + "grad_norm": 8.235774993896484, + "learning_rate": 4.829035874439462e-06, + "loss": 2.753, + "step": 60910 + }, + { + "epoch": 4.138809620872401, + "grad_norm": 8.560287475585938, + "learning_rate": 4.828611224351135e-06, + "loss": 3.0622, + "step": 60915 + }, + { + "epoch": 4.139149340943063, + "grad_norm": 8.135002136230469, + "learning_rate": 4.828186574262808e-06, + "loss": 2.822, + "step": 60920 + }, + { + "epoch": 4.139489061013725, + "grad_norm": 6.766875267028809, + "learning_rate": 4.8277619241744804e-06, + "loss": 2.7354, + "step": 60925 + }, + { + "epoch": 4.139828781084386, + "grad_norm": 8.90800952911377, + "learning_rate": 4.827337274086153e-06, + "loss": 3.2292, + "step": 60930 + }, + { + "epoch": 4.140168501155048, + "grad_norm": 7.934556484222412, + "learning_rate": 4.826912623997826e-06, + "loss": 2.8077, + "step": 60935 + }, + { + "epoch": 4.14050822122571, + "grad_norm": 6.041985988616943, + "learning_rate": 4.826487973909499e-06, + "loss": 2.8239, + "step": 60940 + }, + { + "epoch": 4.1408479412963715, + "grad_norm": 7.493823528289795, + "learning_rate": 4.826063323821172e-06, + "loss": 3.0399, + "step": 60945 + }, + { + "epoch": 4.1411876613670335, + "grad_norm": 6.824673175811768, + "learning_rate": 4.8256386737328444e-06, + "loss": 2.9834, + "step": 60950 + }, + { + "epoch": 4.141527381437696, + "grad_norm": 6.165374755859375, + "learning_rate": 4.825214023644517e-06, + "loss": 2.5168, + "step": 60955 + }, + { + "epoch": 4.141867101508357, + "grad_norm": 6.923586368560791, + "learning_rate": 4.82478937355619e-06, + "loss": 2.6667, + "step": 60960 + }, + { + "epoch": 4.142206821579019, + "grad_norm": 7.876861095428467, + "learning_rate": 4.824364723467863e-06, + "loss": 2.7962, + "step": 60965 + }, + { + "epoch": 4.142546541649681, + "grad_norm": Infinity, + "learning_rate": 4.824025003397202e-06, + "loss": 2.9582, + "step": 60970 + }, + { + "epoch": 4.142886261720342, + "grad_norm": 8.061529159545898, + "learning_rate": 4.823600353308874e-06, + "loss": 2.7452, + "step": 60975 + }, + { + "epoch": 4.143225981791004, + "grad_norm": 7.011836051940918, + "learning_rate": 4.8231757032205465e-06, + "loss": 2.7089, + "step": 60980 + }, + { + "epoch": 4.143565701861666, + "grad_norm": 7.71331262588501, + "learning_rate": 4.822751053132219e-06, + "loss": 2.9619, + "step": 60985 + }, + { + "epoch": 4.1439054219323275, + "grad_norm": 9.263612747192383, + "learning_rate": 4.822326403043892e-06, + "loss": 2.8503, + "step": 60990 + }, + { + "epoch": 4.1442451420029895, + "grad_norm": 8.028031349182129, + "learning_rate": 4.821901752955565e-06, + "loss": 2.7659, + "step": 60995 + }, + { + "epoch": 4.144584862073652, + "grad_norm": 6.23300838470459, + "learning_rate": 4.821477102867238e-06, + "loss": 2.5361, + "step": 61000 + }, + { + "epoch": 4.144924582144313, + "grad_norm": 7.060107231140137, + "learning_rate": 4.8210524527789105e-06, + "loss": 2.7641, + "step": 61005 + }, + { + "epoch": 4.145264302214975, + "grad_norm": 6.108644485473633, + "learning_rate": 4.820627802690583e-06, + "loss": 2.8741, + "step": 61010 + }, + { + "epoch": 4.145604022285637, + "grad_norm": 7.293610095977783, + "learning_rate": 4.820203152602256e-06, + "loss": 2.9295, + "step": 61015 + }, + { + "epoch": 4.145943742356298, + "grad_norm": 7.63712215423584, + "learning_rate": 4.819778502513929e-06, + "loss": 2.7881, + "step": 61020 + }, + { + "epoch": 4.14628346242696, + "grad_norm": 9.39401626586914, + "learning_rate": 4.819353852425602e-06, + "loss": 2.9718, + "step": 61025 + }, + { + "epoch": 4.146623182497622, + "grad_norm": 7.7603678703308105, + "learning_rate": 4.8189292023372745e-06, + "loss": 2.964, + "step": 61030 + }, + { + "epoch": 4.1469629025682835, + "grad_norm": 7.2331342697143555, + "learning_rate": 4.818504552248947e-06, + "loss": 3.0075, + "step": 61035 + }, + { + "epoch": 4.147302622638946, + "grad_norm": 6.537491321563721, + "learning_rate": 4.81807990216062e-06, + "loss": 2.919, + "step": 61040 + }, + { + "epoch": 4.147642342709608, + "grad_norm": 7.933871269226074, + "learning_rate": 4.817655252072293e-06, + "loss": 3.0953, + "step": 61045 + }, + { + "epoch": 4.147982062780269, + "grad_norm": 7.679904937744141, + "learning_rate": 4.817230601983966e-06, + "loss": 2.7434, + "step": 61050 + }, + { + "epoch": 4.148321782850931, + "grad_norm": 6.97415828704834, + "learning_rate": 4.8168059518956385e-06, + "loss": 2.9785, + "step": 61055 + }, + { + "epoch": 4.148661502921593, + "grad_norm": 5.58017635345459, + "learning_rate": 4.816381301807311e-06, + "loss": 2.9067, + "step": 61060 + }, + { + "epoch": 4.149001222992254, + "grad_norm": 7.124690532684326, + "learning_rate": 4.815956651718984e-06, + "loss": 2.7697, + "step": 61065 + }, + { + "epoch": 4.149340943062916, + "grad_norm": 6.807746887207031, + "learning_rate": 4.815532001630657e-06, + "loss": 2.9302, + "step": 61070 + }, + { + "epoch": 4.149680663133578, + "grad_norm": 7.3177924156188965, + "learning_rate": 4.81510735154233e-06, + "loss": 2.8884, + "step": 61075 + }, + { + "epoch": 4.1500203832042395, + "grad_norm": 6.84965181350708, + "learning_rate": 4.814682701454002e-06, + "loss": 2.8698, + "step": 61080 + }, + { + "epoch": 4.150360103274902, + "grad_norm": 7.834253787994385, + "learning_rate": 4.814258051365675e-06, + "loss": 2.735, + "step": 61085 + }, + { + "epoch": 4.150699823345564, + "grad_norm": 7.812054634094238, + "learning_rate": 4.813833401277348e-06, + "loss": 2.7964, + "step": 61090 + }, + { + "epoch": 4.151039543416225, + "grad_norm": 7.894629955291748, + "learning_rate": 4.81340875118902e-06, + "loss": 3.0305, + "step": 61095 + }, + { + "epoch": 4.151379263486887, + "grad_norm": 6.583600044250488, + "learning_rate": 4.812984101100694e-06, + "loss": 2.8608, + "step": 61100 + }, + { + "epoch": 4.151718983557549, + "grad_norm": 6.286562442779541, + "learning_rate": 4.8125594510123665e-06, + "loss": 2.8364, + "step": 61105 + }, + { + "epoch": 4.15205870362821, + "grad_norm": 8.420775413513184, + "learning_rate": 4.8121348009240385e-06, + "loss": 3.061, + "step": 61110 + }, + { + "epoch": 4.152398423698872, + "grad_norm": 8.73855972290039, + "learning_rate": 4.811710150835711e-06, + "loss": 2.9351, + "step": 61115 + }, + { + "epoch": 4.152738143769534, + "grad_norm": 6.530299663543701, + "learning_rate": 4.811285500747385e-06, + "loss": 2.9014, + "step": 61120 + }, + { + "epoch": 4.1530778638401955, + "grad_norm": 6.4884934425354, + "learning_rate": 4.810860850659057e-06, + "loss": 2.876, + "step": 61125 + }, + { + "epoch": 4.153417583910858, + "grad_norm": 7.123256206512451, + "learning_rate": 4.81043620057073e-06, + "loss": 2.6933, + "step": 61130 + }, + { + "epoch": 4.15375730398152, + "grad_norm": 6.606703281402588, + "learning_rate": 4.810011550482403e-06, + "loss": 2.7994, + "step": 61135 + }, + { + "epoch": 4.154097024052181, + "grad_norm": 8.577404022216797, + "learning_rate": 4.809586900394076e-06, + "loss": 2.8957, + "step": 61140 + }, + { + "epoch": 4.154436744122843, + "grad_norm": 7.180393218994141, + "learning_rate": 4.809162250305748e-06, + "loss": 2.8327, + "step": 61145 + }, + { + "epoch": 4.154776464193505, + "grad_norm": 7.774493217468262, + "learning_rate": 4.808737600217421e-06, + "loss": 2.8963, + "step": 61150 + }, + { + "epoch": 4.155116184264166, + "grad_norm": 6.470309734344482, + "learning_rate": 4.8083129501290945e-06, + "loss": 2.9821, + "step": 61155 + }, + { + "epoch": 4.155455904334828, + "grad_norm": 8.946200370788574, + "learning_rate": 4.8078883000407665e-06, + "loss": 2.8068, + "step": 61160 + }, + { + "epoch": 4.15579562440549, + "grad_norm": 8.593653678894043, + "learning_rate": 4.807463649952439e-06, + "loss": 3.0269, + "step": 61165 + }, + { + "epoch": 4.1561353444761515, + "grad_norm": 6.62117338180542, + "learning_rate": 4.807038999864113e-06, + "loss": 3.0373, + "step": 61170 + }, + { + "epoch": 4.156475064546814, + "grad_norm": 7.05885648727417, + "learning_rate": 4.806614349775785e-06, + "loss": 2.7713, + "step": 61175 + }, + { + "epoch": 4.156814784617475, + "grad_norm": 8.559711456298828, + "learning_rate": 4.806189699687458e-06, + "loss": 2.9, + "step": 61180 + }, + { + "epoch": 4.157154504688137, + "grad_norm": 8.88837718963623, + "learning_rate": 4.8057650495991305e-06, + "loss": 2.8812, + "step": 61185 + }, + { + "epoch": 4.157494224758799, + "grad_norm": 7.4066243171691895, + "learning_rate": 4.805340399510803e-06, + "loss": 2.8811, + "step": 61190 + }, + { + "epoch": 4.15783394482946, + "grad_norm": 6.483935356140137, + "learning_rate": 4.804915749422476e-06, + "loss": 2.9129, + "step": 61195 + }, + { + "epoch": 4.158173664900122, + "grad_norm": 7.2546491622924805, + "learning_rate": 4.804491099334149e-06, + "loss": 2.8211, + "step": 61200 + }, + { + "epoch": 4.158513384970784, + "grad_norm": 7.939694881439209, + "learning_rate": 4.804066449245822e-06, + "loss": 2.8369, + "step": 61205 + }, + { + "epoch": 4.1588531050414455, + "grad_norm": 7.643743991851807, + "learning_rate": 4.8036417991574945e-06, + "loss": 3.0542, + "step": 61210 + }, + { + "epoch": 4.1591928251121075, + "grad_norm": 6.686635971069336, + "learning_rate": 4.803217149069167e-06, + "loss": 2.8343, + "step": 61215 + }, + { + "epoch": 4.15953254518277, + "grad_norm": 7.735744476318359, + "learning_rate": 4.80279249898084e-06, + "loss": 2.9839, + "step": 61220 + }, + { + "epoch": 4.159872265253431, + "grad_norm": 7.980849742889404, + "learning_rate": 4.802367848892513e-06, + "loss": 2.8726, + "step": 61225 + }, + { + "epoch": 4.160211985324093, + "grad_norm": 6.32576322555542, + "learning_rate": 4.801943198804186e-06, + "loss": 2.7738, + "step": 61230 + }, + { + "epoch": 4.160551705394755, + "grad_norm": 7.4367265701293945, + "learning_rate": 4.8015185487158585e-06, + "loss": 2.8697, + "step": 61235 + }, + { + "epoch": 4.160891425465416, + "grad_norm": 7.750222206115723, + "learning_rate": 4.801093898627531e-06, + "loss": 2.7351, + "step": 61240 + }, + { + "epoch": 4.161231145536078, + "grad_norm": 6.599553108215332, + "learning_rate": 4.800669248539204e-06, + "loss": 2.9273, + "step": 61245 + }, + { + "epoch": 4.16157086560674, + "grad_norm": 9.013086318969727, + "learning_rate": 4.800244598450877e-06, + "loss": 2.6997, + "step": 61250 + }, + { + "epoch": 4.1619105856774015, + "grad_norm": 7.610951900482178, + "learning_rate": 4.79981994836255e-06, + "loss": 2.884, + "step": 61255 + }, + { + "epoch": 4.1622503057480635, + "grad_norm": 6.6097636222839355, + "learning_rate": 4.7993952982742225e-06, + "loss": 3.0068, + "step": 61260 + }, + { + "epoch": 4.162590025818726, + "grad_norm": 7.611195087432861, + "learning_rate": 4.798970648185895e-06, + "loss": 2.9698, + "step": 61265 + }, + { + "epoch": 4.162929745889387, + "grad_norm": 7.677777290344238, + "learning_rate": 4.798545998097568e-06, + "loss": 2.6378, + "step": 61270 + }, + { + "epoch": 4.163269465960049, + "grad_norm": 7.250895977020264, + "learning_rate": 4.798121348009241e-06, + "loss": 3.0853, + "step": 61275 + }, + { + "epoch": 4.163609186030711, + "grad_norm": 7.91898250579834, + "learning_rate": 4.797696697920914e-06, + "loss": 2.8953, + "step": 61280 + }, + { + "epoch": 4.163948906101372, + "grad_norm": 6.503249168395996, + "learning_rate": 4.7972720478325865e-06, + "loss": 2.8731, + "step": 61285 + }, + { + "epoch": 4.164288626172034, + "grad_norm": 7.045506477355957, + "learning_rate": 4.796847397744259e-06, + "loss": 2.8165, + "step": 61290 + }, + { + "epoch": 4.164628346242696, + "grad_norm": 7.726833343505859, + "learning_rate": 4.796422747655931e-06, + "loss": 2.8686, + "step": 61295 + }, + { + "epoch": 4.1649680663133575, + "grad_norm": 7.957799434661865, + "learning_rate": 4.795998097567605e-06, + "loss": 2.8934, + "step": 61300 + }, + { + "epoch": 4.1653077863840196, + "grad_norm": 6.637351036071777, + "learning_rate": 4.795573447479278e-06, + "loss": 2.8946, + "step": 61305 + }, + { + "epoch": 4.165647506454682, + "grad_norm": 6.610342025756836, + "learning_rate": 4.7951487973909505e-06, + "loss": 2.708, + "step": 61310 + }, + { + "epoch": 4.165987226525343, + "grad_norm": 8.258647918701172, + "learning_rate": 4.794724147302623e-06, + "loss": 2.9619, + "step": 61315 + }, + { + "epoch": 4.166326946596005, + "grad_norm": 7.101433753967285, + "learning_rate": 4.794299497214296e-06, + "loss": 2.7162, + "step": 61320 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 7.0926432609558105, + "learning_rate": 4.793874847125969e-06, + "loss": 2.7154, + "step": 61325 + }, + { + "epoch": 4.167006386737328, + "grad_norm": 7.878020763397217, + "learning_rate": 4.793450197037641e-06, + "loss": 2.9778, + "step": 61330 + }, + { + "epoch": 4.16734610680799, + "grad_norm": 7.024019241333008, + "learning_rate": 4.7930255469493145e-06, + "loss": 2.8706, + "step": 61335 + }, + { + "epoch": 4.167685826878652, + "grad_norm": 7.600521564483643, + "learning_rate": 4.792600896860987e-06, + "loss": 3.0537, + "step": 61340 + }, + { + "epoch": 4.1680255469493135, + "grad_norm": 6.907558441162109, + "learning_rate": 4.792176246772659e-06, + "loss": 2.6159, + "step": 61345 + }, + { + "epoch": 4.168365267019976, + "grad_norm": 9.39015007019043, + "learning_rate": 4.791751596684333e-06, + "loss": 2.7228, + "step": 61350 + }, + { + "epoch": 4.168704987090638, + "grad_norm": 7.916399002075195, + "learning_rate": 4.791326946596006e-06, + "loss": 3.0665, + "step": 61355 + }, + { + "epoch": 4.169044707161299, + "grad_norm": 6.960449695587158, + "learning_rate": 4.790902296507678e-06, + "loss": 2.958, + "step": 61360 + }, + { + "epoch": 4.169384427231961, + "grad_norm": 8.035140037536621, + "learning_rate": 4.7904776464193505e-06, + "loss": 2.6495, + "step": 61365 + }, + { + "epoch": 4.169724147302623, + "grad_norm": 8.113743782043457, + "learning_rate": 4.790052996331024e-06, + "loss": 2.7426, + "step": 61370 + }, + { + "epoch": 4.170063867373284, + "grad_norm": 7.6103901863098145, + "learning_rate": 4.789628346242696e-06, + "loss": 2.7582, + "step": 61375 + }, + { + "epoch": 4.170403587443946, + "grad_norm": 8.021345138549805, + "learning_rate": 4.789203696154369e-06, + "loss": 2.9347, + "step": 61380 + }, + { + "epoch": 4.170743307514608, + "grad_norm": 8.052239418029785, + "learning_rate": 4.7887790460660425e-06, + "loss": 3.0095, + "step": 61385 + }, + { + "epoch": 4.1710830275852695, + "grad_norm": 8.48503303527832, + "learning_rate": 4.7883543959777145e-06, + "loss": 2.5052, + "step": 61390 + }, + { + "epoch": 4.171422747655932, + "grad_norm": 8.34555721282959, + "learning_rate": 4.787929745889387e-06, + "loss": 3.093, + "step": 61395 + }, + { + "epoch": 4.171762467726594, + "grad_norm": 6.805370330810547, + "learning_rate": 4.78750509580106e-06, + "loss": 2.8827, + "step": 61400 + }, + { + "epoch": 4.172102187797255, + "grad_norm": 6.235246658325195, + "learning_rate": 4.787080445712733e-06, + "loss": 2.7472, + "step": 61405 + }, + { + "epoch": 4.172441907867917, + "grad_norm": 7.352156639099121, + "learning_rate": 4.786655795624406e-06, + "loss": 2.9938, + "step": 61410 + }, + { + "epoch": 4.172781627938579, + "grad_norm": 7.9455037117004395, + "learning_rate": 4.7862311455360785e-06, + "loss": 2.8391, + "step": 61415 + }, + { + "epoch": 4.17312134800924, + "grad_norm": 7.3499755859375, + "learning_rate": 4.785806495447751e-06, + "loss": 2.8562, + "step": 61420 + }, + { + "epoch": 4.173461068079902, + "grad_norm": 7.9338483810424805, + "learning_rate": 4.785381845359424e-06, + "loss": 2.7327, + "step": 61425 + }, + { + "epoch": 4.173800788150564, + "grad_norm": 6.2962822914123535, + "learning_rate": 4.784957195271097e-06, + "loss": 2.9176, + "step": 61430 + }, + { + "epoch": 4.1741405082212255, + "grad_norm": 8.166555404663086, + "learning_rate": 4.78453254518277e-06, + "loss": 2.9165, + "step": 61435 + }, + { + "epoch": 4.174480228291888, + "grad_norm": 7.439605236053467, + "learning_rate": 4.7841078950944425e-06, + "loss": 2.9662, + "step": 61440 + }, + { + "epoch": 4.17481994836255, + "grad_norm": 6.978837490081787, + "learning_rate": 4.783683245006115e-06, + "loss": 2.7859, + "step": 61445 + }, + { + "epoch": 4.175159668433211, + "grad_norm": 5.205122470855713, + "learning_rate": 4.783258594917788e-06, + "loss": 2.9271, + "step": 61450 + }, + { + "epoch": 4.175499388503873, + "grad_norm": 7.078430652618408, + "learning_rate": 4.782833944829461e-06, + "loss": 2.9205, + "step": 61455 + }, + { + "epoch": 4.175839108574535, + "grad_norm": 7.082332134246826, + "learning_rate": 4.782409294741134e-06, + "loss": 2.7644, + "step": 61460 + }, + { + "epoch": 4.176178828645196, + "grad_norm": 7.1800947189331055, + "learning_rate": 4.7819846446528065e-06, + "loss": 2.9465, + "step": 61465 + }, + { + "epoch": 4.176518548715858, + "grad_norm": 9.830550193786621, + "learning_rate": 4.781559994564479e-06, + "loss": 2.7649, + "step": 61470 + }, + { + "epoch": 4.17685826878652, + "grad_norm": 5.897298336029053, + "learning_rate": 4.781135344476152e-06, + "loss": 2.7924, + "step": 61475 + }, + { + "epoch": 4.1771979888571815, + "grad_norm": 6.251313209533691, + "learning_rate": 4.780710694387825e-06, + "loss": 2.8987, + "step": 61480 + }, + { + "epoch": 4.177537708927844, + "grad_norm": 8.955798149108887, + "learning_rate": 4.780286044299498e-06, + "loss": 2.596, + "step": 61485 + }, + { + "epoch": 4.177877428998505, + "grad_norm": 6.813113689422607, + "learning_rate": 4.7798613942111705e-06, + "loss": 2.7825, + "step": 61490 + }, + { + "epoch": 4.178217149069167, + "grad_norm": 8.352544784545898, + "learning_rate": 4.779436744122843e-06, + "loss": 2.6616, + "step": 61495 + }, + { + "epoch": 4.178556869139829, + "grad_norm": 7.736942291259766, + "learning_rate": 4.779012094034516e-06, + "loss": 2.787, + "step": 61500 + }, + { + "epoch": 4.17889658921049, + "grad_norm": 6.387747287750244, + "learning_rate": 4.778587443946189e-06, + "loss": 2.6249, + "step": 61505 + }, + { + "epoch": 4.179236309281152, + "grad_norm": 7.534182071685791, + "learning_rate": 4.778162793857862e-06, + "loss": 2.9539, + "step": 61510 + }, + { + "epoch": 4.179576029351814, + "grad_norm": 7.98275089263916, + "learning_rate": 4.7777381437695345e-06, + "loss": 2.6298, + "step": 61515 + }, + { + "epoch": 4.1799157494224755, + "grad_norm": 6.42755126953125, + "learning_rate": 4.777313493681207e-06, + "loss": 2.9568, + "step": 61520 + }, + { + "epoch": 4.1802554694931375, + "grad_norm": 8.457306861877441, + "learning_rate": 4.77688884359288e-06, + "loss": 2.6872, + "step": 61525 + }, + { + "epoch": 4.1805951895638, + "grad_norm": 5.375438213348389, + "learning_rate": 4.776464193504552e-06, + "loss": 3.0387, + "step": 61530 + }, + { + "epoch": 4.180934909634461, + "grad_norm": 8.883524894714355, + "learning_rate": 4.776039543416226e-06, + "loss": 3.093, + "step": 61535 + }, + { + "epoch": 4.181274629705123, + "grad_norm": 7.8871235847473145, + "learning_rate": 4.7756148933278985e-06, + "loss": 2.8637, + "step": 61540 + }, + { + "epoch": 4.181614349775785, + "grad_norm": 8.176709175109863, + "learning_rate": 4.7751902432395705e-06, + "loss": 2.9246, + "step": 61545 + }, + { + "epoch": 4.181954069846446, + "grad_norm": 7.620335102081299, + "learning_rate": 4.774765593151244e-06, + "loss": 2.8721, + "step": 61550 + }, + { + "epoch": 4.182293789917108, + "grad_norm": 7.370183944702148, + "learning_rate": 4.774340943062917e-06, + "loss": 2.8952, + "step": 61555 + }, + { + "epoch": 4.18263350998777, + "grad_norm": 6.739614963531494, + "learning_rate": 4.773916292974589e-06, + "loss": 2.6673, + "step": 61560 + }, + { + "epoch": 4.1829732300584315, + "grad_norm": 6.48914098739624, + "learning_rate": 4.7734916428862625e-06, + "loss": 3.0098, + "step": 61565 + }, + { + "epoch": 4.1833129501290935, + "grad_norm": 8.244256019592285, + "learning_rate": 4.773066992797935e-06, + "loss": 2.8398, + "step": 61570 + }, + { + "epoch": 4.183652670199756, + "grad_norm": 7.453587055206299, + "learning_rate": 4.772642342709607e-06, + "loss": 2.9823, + "step": 61575 + }, + { + "epoch": 4.183992390270417, + "grad_norm": 6.339913845062256, + "learning_rate": 4.77221769262128e-06, + "loss": 2.8997, + "step": 61580 + }, + { + "epoch": 4.184332110341079, + "grad_norm": 5.330577850341797, + "learning_rate": 4.771793042532954e-06, + "loss": 2.9422, + "step": 61585 + }, + { + "epoch": 4.184671830411741, + "grad_norm": 6.452070713043213, + "learning_rate": 4.771368392444626e-06, + "loss": 2.8446, + "step": 61590 + }, + { + "epoch": 4.185011550482402, + "grad_norm": 7.517843723297119, + "learning_rate": 4.7709437423562985e-06, + "loss": 2.9859, + "step": 61595 + }, + { + "epoch": 4.185351270553064, + "grad_norm": 6.116885662078857, + "learning_rate": 4.770519092267972e-06, + "loss": 3.0548, + "step": 61600 + }, + { + "epoch": 4.185690990623726, + "grad_norm": 7.246168613433838, + "learning_rate": 4.770094442179644e-06, + "loss": 2.9404, + "step": 61605 + }, + { + "epoch": 4.1860307106943875, + "grad_norm": 8.487010955810547, + "learning_rate": 4.769669792091317e-06, + "loss": 2.8864, + "step": 61610 + }, + { + "epoch": 4.1863704307650496, + "grad_norm": 7.569413185119629, + "learning_rate": 4.76924514200299e-06, + "loss": 2.8867, + "step": 61615 + }, + { + "epoch": 4.186710150835712, + "grad_norm": 10.08929443359375, + "learning_rate": 4.7688204919146625e-06, + "loss": 2.9342, + "step": 61620 + }, + { + "epoch": 4.187049870906373, + "grad_norm": 7.243996620178223, + "learning_rate": 4.768395841826335e-06, + "loss": 2.8029, + "step": 61625 + }, + { + "epoch": 4.187389590977035, + "grad_norm": 8.22843074798584, + "learning_rate": 4.767971191738008e-06, + "loss": 2.8883, + "step": 61630 + }, + { + "epoch": 4.187729311047697, + "grad_norm": 8.462362289428711, + "learning_rate": 4.767546541649681e-06, + "loss": 2.7866, + "step": 61635 + }, + { + "epoch": 4.188069031118358, + "grad_norm": 6.618731498718262, + "learning_rate": 4.767121891561354e-06, + "loss": 3.0471, + "step": 61640 + }, + { + "epoch": 4.18840875118902, + "grad_norm": 6.540345191955566, + "learning_rate": 4.7666972414730265e-06, + "loss": 2.8037, + "step": 61645 + }, + { + "epoch": 4.188748471259682, + "grad_norm": 9.459342002868652, + "learning_rate": 4.766272591384699e-06, + "loss": 2.8051, + "step": 61650 + }, + { + "epoch": 4.1890881913303435, + "grad_norm": 7.358839511871338, + "learning_rate": 4.765847941296372e-06, + "loss": 2.9345, + "step": 61655 + }, + { + "epoch": 4.189427911401006, + "grad_norm": 8.796646118164062, + "learning_rate": 4.765423291208045e-06, + "loss": 2.6872, + "step": 61660 + }, + { + "epoch": 4.189767631471668, + "grad_norm": 6.408356189727783, + "learning_rate": 4.764998641119718e-06, + "loss": 2.9016, + "step": 61665 + }, + { + "epoch": 4.190107351542329, + "grad_norm": 6.786556243896484, + "learning_rate": 4.7645739910313905e-06, + "loss": 2.8145, + "step": 61670 + }, + { + "epoch": 4.190447071612991, + "grad_norm": 7.780019760131836, + "learning_rate": 4.764149340943063e-06, + "loss": 2.9521, + "step": 61675 + }, + { + "epoch": 4.190786791683653, + "grad_norm": 7.18480920791626, + "learning_rate": 4.763724690854736e-06, + "loss": 2.7723, + "step": 61680 + }, + { + "epoch": 4.191126511754314, + "grad_norm": 7.21019172668457, + "learning_rate": 4.763300040766409e-06, + "loss": 2.6041, + "step": 61685 + }, + { + "epoch": 4.191466231824976, + "grad_norm": 6.8266825675964355, + "learning_rate": 4.762875390678082e-06, + "loss": 3.0383, + "step": 61690 + }, + { + "epoch": 4.191805951895638, + "grad_norm": 6.986363887786865, + "learning_rate": 4.7624507405897545e-06, + "loss": 3.1732, + "step": 61695 + }, + { + "epoch": 4.1921456719662995, + "grad_norm": 7.875244140625, + "learning_rate": 4.762026090501427e-06, + "loss": 2.9813, + "step": 61700 + }, + { + "epoch": 4.192485392036962, + "grad_norm": 6.910520076751709, + "learning_rate": 4.7616014404131e-06, + "loss": 2.6396, + "step": 61705 + }, + { + "epoch": 4.192825112107624, + "grad_norm": 7.668290615081787, + "learning_rate": 4.761176790324773e-06, + "loss": 2.7142, + "step": 61710 + }, + { + "epoch": 4.193164832178285, + "grad_norm": 7.134779453277588, + "learning_rate": 4.760752140236446e-06, + "loss": 2.5998, + "step": 61715 + }, + { + "epoch": 4.193504552248947, + "grad_norm": 6.760236740112305, + "learning_rate": 4.7603274901481185e-06, + "loss": 2.8697, + "step": 61720 + }, + { + "epoch": 4.193844272319609, + "grad_norm": 6.759984970092773, + "learning_rate": 4.759902840059791e-06, + "loss": 2.8052, + "step": 61725 + }, + { + "epoch": 4.19418399239027, + "grad_norm": 6.33078670501709, + "learning_rate": 4.759478189971464e-06, + "loss": 2.8924, + "step": 61730 + }, + { + "epoch": 4.194523712460932, + "grad_norm": 7.343992233276367, + "learning_rate": 4.759053539883137e-06, + "loss": 2.9134, + "step": 61735 + }, + { + "epoch": 4.194863432531594, + "grad_norm": 7.549562454223633, + "learning_rate": 4.75862888979481e-06, + "loss": 2.8841, + "step": 61740 + }, + { + "epoch": 4.1952031526022555, + "grad_norm": 6.737825870513916, + "learning_rate": 4.758204239706482e-06, + "loss": 2.8887, + "step": 61745 + }, + { + "epoch": 4.195542872672918, + "grad_norm": 5.603460311889648, + "learning_rate": 4.757779589618155e-06, + "loss": 2.8895, + "step": 61750 + }, + { + "epoch": 4.19588259274358, + "grad_norm": 7.240264892578125, + "learning_rate": 4.757354939529828e-06, + "loss": 2.8774, + "step": 61755 + }, + { + "epoch": 4.196222312814241, + "grad_norm": 8.664094924926758, + "learning_rate": 4.7569302894415e-06, + "loss": 2.8746, + "step": 61760 + }, + { + "epoch": 4.196562032884903, + "grad_norm": 6.1621246337890625, + "learning_rate": 4.756505639353174e-06, + "loss": 2.6406, + "step": 61765 + }, + { + "epoch": 4.196901752955565, + "grad_norm": 8.58886432647705, + "learning_rate": 4.7560809892648465e-06, + "loss": 3.0706, + "step": 61770 + }, + { + "epoch": 4.197241473026226, + "grad_norm": 8.18408203125, + "learning_rate": 4.7556563391765185e-06, + "loss": 2.9704, + "step": 61775 + }, + { + "epoch": 4.197581193096888, + "grad_norm": 7.070132732391357, + "learning_rate": 4.755231689088191e-06, + "loss": 2.8566, + "step": 61780 + }, + { + "epoch": 4.19792091316755, + "grad_norm": 8.783843994140625, + "learning_rate": 4.754807038999865e-06, + "loss": 2.8296, + "step": 61785 + }, + { + "epoch": 4.1982606332382115, + "grad_norm": 6.403838634490967, + "learning_rate": 4.754382388911537e-06, + "loss": 2.8163, + "step": 61790 + }, + { + "epoch": 4.198600353308874, + "grad_norm": 7.423403263092041, + "learning_rate": 4.75395773882321e-06, + "loss": 2.7718, + "step": 61795 + }, + { + "epoch": 4.198940073379536, + "grad_norm": 6.458978652954102, + "learning_rate": 4.753533088734883e-06, + "loss": 3.0642, + "step": 61800 + }, + { + "epoch": 4.199279793450197, + "grad_norm": 8.311686515808105, + "learning_rate": 4.753108438646555e-06, + "loss": 2.8479, + "step": 61805 + }, + { + "epoch": 4.199619513520859, + "grad_norm": 6.37904691696167, + "learning_rate": 4.752683788558228e-06, + "loss": 2.976, + "step": 61810 + }, + { + "epoch": 4.199959233591521, + "grad_norm": 6.657650947570801, + "learning_rate": 4.752259138469901e-06, + "loss": 2.9599, + "step": 61815 + }, + { + "epoch": 4.200298953662182, + "grad_norm": 9.193421363830566, + "learning_rate": 4.7518344883815745e-06, + "loss": 2.8674, + "step": 61820 + }, + { + "epoch": 4.200638673732844, + "grad_norm": 8.843236923217773, + "learning_rate": 4.7514098382932465e-06, + "loss": 2.9087, + "step": 61825 + }, + { + "epoch": 4.200978393803506, + "grad_norm": 6.674525737762451, + "learning_rate": 4.750985188204919e-06, + "loss": 2.9887, + "step": 61830 + }, + { + "epoch": 4.2013181138741675, + "grad_norm": 11.407764434814453, + "learning_rate": 4.750560538116593e-06, + "loss": 2.9068, + "step": 61835 + }, + { + "epoch": 4.20165783394483, + "grad_norm": 6.478918552398682, + "learning_rate": 4.750135888028265e-06, + "loss": 2.8052, + "step": 61840 + }, + { + "epoch": 4.201997554015492, + "grad_norm": 9.634099960327148, + "learning_rate": 4.749711237939938e-06, + "loss": 2.7976, + "step": 61845 + }, + { + "epoch": 4.202337274086153, + "grad_norm": 6.914095401763916, + "learning_rate": 4.749286587851611e-06, + "loss": 2.8953, + "step": 61850 + }, + { + "epoch": 4.202676994156815, + "grad_norm": 9.60102367401123, + "learning_rate": 4.748861937763283e-06, + "loss": 2.7344, + "step": 61855 + }, + { + "epoch": 4.203016714227476, + "grad_norm": 7.737424373626709, + "learning_rate": 4.748437287674956e-06, + "loss": 2.7343, + "step": 61860 + }, + { + "epoch": 4.203356434298138, + "grad_norm": 7.552150726318359, + "learning_rate": 4.748012637586629e-06, + "loss": 2.7274, + "step": 61865 + }, + { + "epoch": 4.2036961543688, + "grad_norm": 8.125782012939453, + "learning_rate": 4.747587987498302e-06, + "loss": 2.8226, + "step": 61870 + }, + { + "epoch": 4.2040358744394615, + "grad_norm": 9.35307788848877, + "learning_rate": 4.7471633374099745e-06, + "loss": 2.4827, + "step": 61875 + }, + { + "epoch": 4.2043755945101235, + "grad_norm": 6.366557598114014, + "learning_rate": 4.746738687321647e-06, + "loss": 2.9325, + "step": 61880 + }, + { + "epoch": 4.204715314580786, + "grad_norm": 6.780035018920898, + "learning_rate": 4.74631403723332e-06, + "loss": 2.9708, + "step": 61885 + }, + { + "epoch": 4.205055034651447, + "grad_norm": 8.514254570007324, + "learning_rate": 4.745889387144993e-06, + "loss": 2.8956, + "step": 61890 + }, + { + "epoch": 4.205394754722109, + "grad_norm": 8.699222564697266, + "learning_rate": 4.745464737056666e-06, + "loss": 2.9635, + "step": 61895 + }, + { + "epoch": 4.205734474792771, + "grad_norm": 9.523750305175781, + "learning_rate": 4.7450400869683385e-06, + "loss": 2.8269, + "step": 61900 + }, + { + "epoch": 4.206074194863432, + "grad_norm": 8.532763481140137, + "learning_rate": 4.744615436880011e-06, + "loss": 2.7628, + "step": 61905 + }, + { + "epoch": 4.206413914934094, + "grad_norm": 7.1497063636779785, + "learning_rate": 4.744190786791684e-06, + "loss": 2.8621, + "step": 61910 + }, + { + "epoch": 4.206753635004756, + "grad_norm": 5.971903324127197, + "learning_rate": 4.743766136703357e-06, + "loss": 3.0086, + "step": 61915 + }, + { + "epoch": 4.2070933550754175, + "grad_norm": 9.850119590759277, + "learning_rate": 4.74334148661503e-06, + "loss": 2.9945, + "step": 61920 + }, + { + "epoch": 4.20743307514608, + "grad_norm": 6.318268775939941, + "learning_rate": 4.7429168365267025e-06, + "loss": 2.7213, + "step": 61925 + }, + { + "epoch": 4.207772795216742, + "grad_norm": 6.783935070037842, + "learning_rate": 4.742492186438375e-06, + "loss": 2.7982, + "step": 61930 + }, + { + "epoch": 4.208112515287403, + "grad_norm": 6.20629358291626, + "learning_rate": 4.742067536350048e-06, + "loss": 2.8267, + "step": 61935 + }, + { + "epoch": 4.208452235358065, + "grad_norm": 9.373065948486328, + "learning_rate": 4.741642886261721e-06, + "loss": 2.9809, + "step": 61940 + }, + { + "epoch": 4.208791955428727, + "grad_norm": 6.202873706817627, + "learning_rate": 4.741218236173394e-06, + "loss": 2.7279, + "step": 61945 + }, + { + "epoch": 4.209131675499388, + "grad_norm": 6.837894439697266, + "learning_rate": 4.7407935860850665e-06, + "loss": 2.9133, + "step": 61950 + }, + { + "epoch": 4.20947139557005, + "grad_norm": 7.400472164154053, + "learning_rate": 4.740368935996739e-06, + "loss": 2.9255, + "step": 61955 + }, + { + "epoch": 4.209811115640712, + "grad_norm": 8.377433776855469, + "learning_rate": 4.739944285908411e-06, + "loss": 2.7915, + "step": 61960 + }, + { + "epoch": 4.2101508357113735, + "grad_norm": 7.991532325744629, + "learning_rate": 4.739519635820085e-06, + "loss": 2.8383, + "step": 61965 + }, + { + "epoch": 4.210490555782036, + "grad_norm": 6.769793510437012, + "learning_rate": 4.739094985731758e-06, + "loss": 2.7746, + "step": 61970 + }, + { + "epoch": 4.210830275852698, + "grad_norm": 5.927778244018555, + "learning_rate": 4.73867033564343e-06, + "loss": 2.8894, + "step": 61975 + }, + { + "epoch": 4.211169995923359, + "grad_norm": 7.246372699737549, + "learning_rate": 4.738245685555103e-06, + "loss": 2.8402, + "step": 61980 + }, + { + "epoch": 4.211509715994021, + "grad_norm": 7.405990123748779, + "learning_rate": 4.737821035466776e-06, + "loss": 2.9164, + "step": 61985 + }, + { + "epoch": 4.211849436064683, + "grad_norm": 7.969038009643555, + "learning_rate": 4.737396385378449e-06, + "loss": 2.8025, + "step": 61990 + }, + { + "epoch": 4.212189156135344, + "grad_norm": 9.964340209960938, + "learning_rate": 4.736971735290121e-06, + "loss": 2.9519, + "step": 61995 + }, + { + "epoch": 4.212528876206006, + "grad_norm": 6.406582832336426, + "learning_rate": 4.7365470852017945e-06, + "loss": 3.0145, + "step": 62000 + }, + { + "epoch": 4.212868596276668, + "grad_norm": 8.413992881774902, + "learning_rate": 4.736122435113467e-06, + "loss": 2.7884, + "step": 62005 + }, + { + "epoch": 4.2132083163473295, + "grad_norm": 9.051087379455566, + "learning_rate": 4.735697785025139e-06, + "loss": 2.7247, + "step": 62010 + }, + { + "epoch": 4.213548036417992, + "grad_norm": 6.06341028213501, + "learning_rate": 4.735273134936813e-06, + "loss": 2.9513, + "step": 62015 + }, + { + "epoch": 4.213887756488654, + "grad_norm": 8.0044584274292, + "learning_rate": 4.734848484848486e-06, + "loss": 2.9748, + "step": 62020 + }, + { + "epoch": 4.214227476559315, + "grad_norm": 5.947976589202881, + "learning_rate": 4.734423834760158e-06, + "loss": 2.8826, + "step": 62025 + }, + { + "epoch": 4.214567196629977, + "grad_norm": 6.280881881713867, + "learning_rate": 4.7339991846718304e-06, + "loss": 2.828, + "step": 62030 + }, + { + "epoch": 4.214906916700639, + "grad_norm": 10.551678657531738, + "learning_rate": 4.733574534583504e-06, + "loss": 3.0376, + "step": 62035 + }, + { + "epoch": 4.2152466367713, + "grad_norm": 8.086363792419434, + "learning_rate": 4.733149884495176e-06, + "loss": 2.5714, + "step": 62040 + }, + { + "epoch": 4.215586356841962, + "grad_norm": 7.371453762054443, + "learning_rate": 4.732725234406849e-06, + "loss": 3.0221, + "step": 62045 + }, + { + "epoch": 4.215926076912624, + "grad_norm": 7.42244815826416, + "learning_rate": 4.7323005843185225e-06, + "loss": 2.747, + "step": 62050 + }, + { + "epoch": 4.2162657969832855, + "grad_norm": 8.528596878051758, + "learning_rate": 4.7318759342301944e-06, + "loss": 2.7509, + "step": 62055 + }, + { + "epoch": 4.216605517053948, + "grad_norm": 6.828658580780029, + "learning_rate": 4.731451284141867e-06, + "loss": 2.8524, + "step": 62060 + }, + { + "epoch": 4.21694523712461, + "grad_norm": 6.336338520050049, + "learning_rate": 4.73102663405354e-06, + "loss": 2.8831, + "step": 62065 + }, + { + "epoch": 4.217284957195271, + "grad_norm": 7.5315351486206055, + "learning_rate": 4.730601983965213e-06, + "loss": 3.0259, + "step": 62070 + }, + { + "epoch": 4.217624677265933, + "grad_norm": 6.725474834442139, + "learning_rate": 4.730177333876886e-06, + "loss": 2.7105, + "step": 62075 + }, + { + "epoch": 4.217964397336595, + "grad_norm": 7.581910610198975, + "learning_rate": 4.7297526837885585e-06, + "loss": 2.7248, + "step": 62080 + }, + { + "epoch": 4.218304117407256, + "grad_norm": 8.24612045288086, + "learning_rate": 4.729328033700231e-06, + "loss": 2.8604, + "step": 62085 + }, + { + "epoch": 4.218643837477918, + "grad_norm": 9.942718505859375, + "learning_rate": 4.728903383611904e-06, + "loss": 2.9257, + "step": 62090 + }, + { + "epoch": 4.21898355754858, + "grad_norm": 7.448544502258301, + "learning_rate": 4.728478733523577e-06, + "loss": 2.843, + "step": 62095 + }, + { + "epoch": 4.2193232776192415, + "grad_norm": 7.8035664558410645, + "learning_rate": 4.72805408343525e-06, + "loss": 2.9516, + "step": 62100 + }, + { + "epoch": 4.219662997689904, + "grad_norm": 6.2360711097717285, + "learning_rate": 4.7276294333469225e-06, + "loss": 3.1129, + "step": 62105 + }, + { + "epoch": 4.220002717760566, + "grad_norm": 9.537057876586914, + "learning_rate": 4.727204783258595e-06, + "loss": 2.5343, + "step": 62110 + }, + { + "epoch": 4.220342437831227, + "grad_norm": 7.268779754638672, + "learning_rate": 4.726780133170268e-06, + "loss": 2.8493, + "step": 62115 + }, + { + "epoch": 4.220682157901889, + "grad_norm": 6.934360027313232, + "learning_rate": 4.726355483081941e-06, + "loss": 2.9465, + "step": 62120 + }, + { + "epoch": 4.221021877972551, + "grad_norm": 7.728514671325684, + "learning_rate": 4.725930832993614e-06, + "loss": 2.9239, + "step": 62125 + }, + { + "epoch": 4.221361598043212, + "grad_norm": 7.197935581207275, + "learning_rate": 4.7255061829052865e-06, + "loss": 2.8511, + "step": 62130 + }, + { + "epoch": 4.221701318113874, + "grad_norm": 6.956178188323975, + "learning_rate": 4.725081532816959e-06, + "loss": 2.6951, + "step": 62135 + }, + { + "epoch": 4.222041038184536, + "grad_norm": 6.632051944732666, + "learning_rate": 4.724656882728632e-06, + "loss": 2.8077, + "step": 62140 + }, + { + "epoch": 4.2223807582551975, + "grad_norm": 6.33482027053833, + "learning_rate": 4.724232232640305e-06, + "loss": 2.9306, + "step": 62145 + }, + { + "epoch": 4.22272047832586, + "grad_norm": 7.4976959228515625, + "learning_rate": 4.723807582551978e-06, + "loss": 2.9924, + "step": 62150 + }, + { + "epoch": 4.223060198396522, + "grad_norm": 8.881071090698242, + "learning_rate": 4.7233829324636505e-06, + "loss": 3.1222, + "step": 62155 + }, + { + "epoch": 4.223399918467183, + "grad_norm": 6.137530326843262, + "learning_rate": 4.722958282375323e-06, + "loss": 2.7763, + "step": 62160 + }, + { + "epoch": 4.223739638537845, + "grad_norm": 7.778079032897949, + "learning_rate": 4.722533632286996e-06, + "loss": 2.6774, + "step": 62165 + }, + { + "epoch": 4.224079358608506, + "grad_norm": 7.6610636711120605, + "learning_rate": 4.722108982198669e-06, + "loss": 2.6587, + "step": 62170 + }, + { + "epoch": 4.224419078679168, + "grad_norm": 6.103399753570557, + "learning_rate": 4.721684332110342e-06, + "loss": 2.7076, + "step": 62175 + }, + { + "epoch": 4.22475879874983, + "grad_norm": 8.99585247039795, + "learning_rate": 4.7212596820220145e-06, + "loss": 3.004, + "step": 62180 + }, + { + "epoch": 4.2250985188204915, + "grad_norm": 8.688633918762207, + "learning_rate": 4.720835031933687e-06, + "loss": 2.8265, + "step": 62185 + }, + { + "epoch": 4.2254382388911536, + "grad_norm": 8.796523094177246, + "learning_rate": 4.72041038184536e-06, + "loss": 2.8465, + "step": 62190 + }, + { + "epoch": 4.225777958961816, + "grad_norm": 7.306980609893799, + "learning_rate": 4.719985731757033e-06, + "loss": 2.7098, + "step": 62195 + }, + { + "epoch": 4.226117679032477, + "grad_norm": 7.8331122398376465, + "learning_rate": 4.719561081668706e-06, + "loss": 2.9998, + "step": 62200 + }, + { + "epoch": 4.226457399103139, + "grad_norm": 8.43602180480957, + "learning_rate": 4.7191364315803785e-06, + "loss": 2.8653, + "step": 62205 + }, + { + "epoch": 4.226797119173801, + "grad_norm": 6.696905136108398, + "learning_rate": 4.7187117814920504e-06, + "loss": 2.6766, + "step": 62210 + }, + { + "epoch": 4.227136839244462, + "grad_norm": 7.214219093322754, + "learning_rate": 4.718287131403724e-06, + "loss": 2.6858, + "step": 62215 + }, + { + "epoch": 4.227476559315124, + "grad_norm": 7.471963882446289, + "learning_rate": 4.717862481315397e-06, + "loss": 2.9685, + "step": 62220 + }, + { + "epoch": 4.227816279385786, + "grad_norm": 7.2317376136779785, + "learning_rate": 4.717437831227069e-06, + "loss": 2.7445, + "step": 62225 + }, + { + "epoch": 4.2281559994564475, + "grad_norm": 9.124311447143555, + "learning_rate": 4.7170131811387425e-06, + "loss": 2.8498, + "step": 62230 + }, + { + "epoch": 4.22849571952711, + "grad_norm": 8.320853233337402, + "learning_rate": 4.716588531050415e-06, + "loss": 2.7615, + "step": 62235 + }, + { + "epoch": 4.228835439597772, + "grad_norm": 6.561459541320801, + "learning_rate": 4.716163880962087e-06, + "loss": 2.8221, + "step": 62240 + }, + { + "epoch": 4.229175159668433, + "grad_norm": 8.615215301513672, + "learning_rate": 4.71573923087376e-06, + "loss": 2.8541, + "step": 62245 + }, + { + "epoch": 4.229514879739095, + "grad_norm": 7.012057781219482, + "learning_rate": 4.715314580785434e-06, + "loss": 2.7447, + "step": 62250 + }, + { + "epoch": 4.229854599809757, + "grad_norm": 7.83788537979126, + "learning_rate": 4.714889930697106e-06, + "loss": 2.7986, + "step": 62255 + }, + { + "epoch": 4.230194319880418, + "grad_norm": 5.457570552825928, + "learning_rate": 4.7144652806087784e-06, + "loss": 3.0214, + "step": 62260 + }, + { + "epoch": 4.23053403995108, + "grad_norm": 7.0680365562438965, + "learning_rate": 4.714040630520452e-06, + "loss": 2.9208, + "step": 62265 + }, + { + "epoch": 4.230873760021742, + "grad_norm": 8.777610778808594, + "learning_rate": 4.713615980432124e-06, + "loss": 2.6575, + "step": 62270 + }, + { + "epoch": 4.2312134800924035, + "grad_norm": 7.159873008728027, + "learning_rate": 4.713191330343797e-06, + "loss": 2.9809, + "step": 62275 + }, + { + "epoch": 4.231553200163066, + "grad_norm": 7.364511013031006, + "learning_rate": 4.71276668025547e-06, + "loss": 2.9358, + "step": 62280 + }, + { + "epoch": 4.231892920233728, + "grad_norm": 6.262247085571289, + "learning_rate": 4.7123420301671424e-06, + "loss": 2.8291, + "step": 62285 + }, + { + "epoch": 4.232232640304389, + "grad_norm": 7.607542037963867, + "learning_rate": 4.711917380078815e-06, + "loss": 2.8615, + "step": 62290 + }, + { + "epoch": 4.232572360375051, + "grad_norm": 7.695446968078613, + "learning_rate": 4.711492729990488e-06, + "loss": 2.8858, + "step": 62295 + }, + { + "epoch": 4.232912080445713, + "grad_norm": 7.613288879394531, + "learning_rate": 4.711068079902161e-06, + "loss": 2.8466, + "step": 62300 + }, + { + "epoch": 4.233251800516374, + "grad_norm": 9.180991172790527, + "learning_rate": 4.710643429813834e-06, + "loss": 2.6825, + "step": 62305 + }, + { + "epoch": 4.233591520587036, + "grad_norm": 6.691295146942139, + "learning_rate": 4.7102187797255064e-06, + "loss": 2.5863, + "step": 62310 + }, + { + "epoch": 4.233931240657698, + "grad_norm": 6.787306308746338, + "learning_rate": 4.709794129637179e-06, + "loss": 2.9128, + "step": 62315 + }, + { + "epoch": 4.2342709607283595, + "grad_norm": 7.681412220001221, + "learning_rate": 4.709369479548852e-06, + "loss": 2.646, + "step": 62320 + }, + { + "epoch": 4.234610680799022, + "grad_norm": 9.175857543945312, + "learning_rate": 4.708944829460525e-06, + "loss": 2.5844, + "step": 62325 + }, + { + "epoch": 4.234950400869684, + "grad_norm": 7.957472324371338, + "learning_rate": 4.708520179372198e-06, + "loss": 2.6373, + "step": 62330 + }, + { + "epoch": 4.235290120940345, + "grad_norm": 5.711918830871582, + "learning_rate": 4.7080955292838704e-06, + "loss": 3.0302, + "step": 62335 + }, + { + "epoch": 4.235629841011007, + "grad_norm": 7.466590404510498, + "learning_rate": 4.707670879195543e-06, + "loss": 2.9352, + "step": 62340 + }, + { + "epoch": 4.235969561081669, + "grad_norm": 8.497478485107422, + "learning_rate": 4.707246229107216e-06, + "loss": 2.6402, + "step": 62345 + }, + { + "epoch": 4.23630928115233, + "grad_norm": 8.67585277557373, + "learning_rate": 4.706821579018889e-06, + "loss": 2.8691, + "step": 62350 + }, + { + "epoch": 4.236649001222992, + "grad_norm": 6.301081657409668, + "learning_rate": 4.706396928930562e-06, + "loss": 2.7386, + "step": 62355 + }, + { + "epoch": 4.236988721293654, + "grad_norm": 8.614048957824707, + "learning_rate": 4.7059722788422344e-06, + "loss": 2.8728, + "step": 62360 + }, + { + "epoch": 4.2373284413643155, + "grad_norm": 5.847254276275635, + "learning_rate": 4.705547628753907e-06, + "loss": 2.8419, + "step": 62365 + }, + { + "epoch": 4.237668161434978, + "grad_norm": 6.521881103515625, + "learning_rate": 4.70512297866558e-06, + "loss": 2.8527, + "step": 62370 + }, + { + "epoch": 4.23800788150564, + "grad_norm": 7.6454339027404785, + "learning_rate": 4.704698328577253e-06, + "loss": 2.7384, + "step": 62375 + }, + { + "epoch": 4.238347601576301, + "grad_norm": 7.229321002960205, + "learning_rate": 4.704273678488926e-06, + "loss": 2.8871, + "step": 62380 + }, + { + "epoch": 4.238687321646963, + "grad_norm": 8.094993591308594, + "learning_rate": 4.7038490284005985e-06, + "loss": 2.8194, + "step": 62385 + }, + { + "epoch": 4.239027041717625, + "grad_norm": 7.140035152435303, + "learning_rate": 4.703424378312271e-06, + "loss": 2.8916, + "step": 62390 + }, + { + "epoch": 4.239366761788286, + "grad_norm": 8.623907089233398, + "learning_rate": 4.702999728223944e-06, + "loss": 2.936, + "step": 62395 + }, + { + "epoch": 4.239706481858948, + "grad_norm": 8.177948951721191, + "learning_rate": 4.702575078135617e-06, + "loss": 2.8677, + "step": 62400 + }, + { + "epoch": 4.24004620192961, + "grad_norm": 7.752731800079346, + "learning_rate": 4.70215042804729e-06, + "loss": 2.8887, + "step": 62405 + }, + { + "epoch": 4.2403859220002715, + "grad_norm": 7.5042009353637695, + "learning_rate": 4.701725777958962e-06, + "loss": 2.9872, + "step": 62410 + }, + { + "epoch": 4.240725642070934, + "grad_norm": 8.275252342224121, + "learning_rate": 4.701301127870635e-06, + "loss": 2.9967, + "step": 62415 + }, + { + "epoch": 4.241065362141596, + "grad_norm": 5.953550338745117, + "learning_rate": 4.700876477782308e-06, + "loss": 3.0742, + "step": 62420 + }, + { + "epoch": 4.241405082212257, + "grad_norm": 8.815391540527344, + "learning_rate": 4.70045182769398e-06, + "loss": 2.8984, + "step": 62425 + }, + { + "epoch": 4.241744802282919, + "grad_norm": 8.002898216247559, + "learning_rate": 4.700027177605654e-06, + "loss": 2.7193, + "step": 62430 + }, + { + "epoch": 4.242084522353581, + "grad_norm": 7.3552937507629395, + "learning_rate": 4.6996025275173265e-06, + "loss": 2.7278, + "step": 62435 + }, + { + "epoch": 4.242424242424242, + "grad_norm": 6.696755409240723, + "learning_rate": 4.699177877428998e-06, + "loss": 2.7947, + "step": 62440 + }, + { + "epoch": 4.242763962494904, + "grad_norm": 7.872463226318359, + "learning_rate": 4.698753227340671e-06, + "loss": 3.0435, + "step": 62445 + }, + { + "epoch": 4.243103682565566, + "grad_norm": 7.413952350616455, + "learning_rate": 4.698328577252345e-06, + "loss": 2.8733, + "step": 62450 + }, + { + "epoch": 4.2434434026362275, + "grad_norm": 7.069926738739014, + "learning_rate": 4.697903927164017e-06, + "loss": 2.9283, + "step": 62455 + }, + { + "epoch": 4.24378312270689, + "grad_norm": 4.821772575378418, + "learning_rate": 4.69747927707569e-06, + "loss": 3.0048, + "step": 62460 + }, + { + "epoch": 4.244122842777552, + "grad_norm": 6.766275405883789, + "learning_rate": 4.697054626987363e-06, + "loss": 2.8229, + "step": 62465 + }, + { + "epoch": 4.244462562848213, + "grad_norm": 7.0280561447143555, + "learning_rate": 4.696629976899035e-06, + "loss": 2.6129, + "step": 62470 + }, + { + "epoch": 4.244802282918875, + "grad_norm": 6.390316963195801, + "learning_rate": 4.696205326810708e-06, + "loss": 2.792, + "step": 62475 + }, + { + "epoch": 4.245142002989537, + "grad_norm": 6.555776119232178, + "learning_rate": 4.695780676722382e-06, + "loss": 2.6211, + "step": 62480 + }, + { + "epoch": 4.245481723060198, + "grad_norm": 8.190047264099121, + "learning_rate": 4.695356026634054e-06, + "loss": 2.7035, + "step": 62485 + }, + { + "epoch": 4.24582144313086, + "grad_norm": 9.08929443359375, + "learning_rate": 4.694931376545726e-06, + "loss": 2.7521, + "step": 62490 + }, + { + "epoch": 4.246161163201522, + "grad_norm": 8.275747299194336, + "learning_rate": 4.694506726457399e-06, + "loss": 2.7819, + "step": 62495 + }, + { + "epoch": 4.246500883272184, + "grad_norm": 8.778658866882324, + "learning_rate": 4.694082076369073e-06, + "loss": 2.9189, + "step": 62500 + }, + { + "epoch": 4.246840603342846, + "grad_norm": 6.772474765777588, + "learning_rate": 4.693657426280745e-06, + "loss": 2.853, + "step": 62505 + }, + { + "epoch": 4.247180323413508, + "grad_norm": 7.997346878051758, + "learning_rate": 4.693232776192418e-06, + "loss": 2.8629, + "step": 62510 + }, + { + "epoch": 4.247520043484169, + "grad_norm": 9.787732124328613, + "learning_rate": 4.692808126104091e-06, + "loss": 2.9878, + "step": 62515 + }, + { + "epoch": 4.247859763554831, + "grad_norm": 8.248177528381348, + "learning_rate": 4.692383476015763e-06, + "loss": 2.6141, + "step": 62520 + }, + { + "epoch": 4.248199483625493, + "grad_norm": 6.57719612121582, + "learning_rate": 4.691958825927436e-06, + "loss": 2.6966, + "step": 62525 + }, + { + "epoch": 4.248539203696154, + "grad_norm": 6.63153600692749, + "learning_rate": 4.691534175839109e-06, + "loss": 2.8727, + "step": 62530 + }, + { + "epoch": 4.248878923766816, + "grad_norm": 7.217060565948486, + "learning_rate": 4.691109525750782e-06, + "loss": 2.7592, + "step": 62535 + }, + { + "epoch": 4.2492186438374775, + "grad_norm": 8.605978965759277, + "learning_rate": 4.6906848756624544e-06, + "loss": 2.8781, + "step": 62540 + }, + { + "epoch": 4.24955836390814, + "grad_norm": 6.962170600891113, + "learning_rate": 4.690260225574127e-06, + "loss": 2.6965, + "step": 62545 + }, + { + "epoch": 4.249898083978802, + "grad_norm": 6.718651294708252, + "learning_rate": 4.6898355754858e-06, + "loss": 2.7943, + "step": 62550 + }, + { + "epoch": 4.250237804049463, + "grad_norm": 6.022754192352295, + "learning_rate": 4.689410925397473e-06, + "loss": 2.7174, + "step": 62555 + }, + { + "epoch": 4.250577524120125, + "grad_norm": 6.380655288696289, + "learning_rate": 4.688986275309146e-06, + "loss": 2.5386, + "step": 62560 + }, + { + "epoch": 4.250917244190787, + "grad_norm": 10.1522216796875, + "learning_rate": 4.6885616252208184e-06, + "loss": 2.7932, + "step": 62565 + }, + { + "epoch": 4.251256964261448, + "grad_norm": 7.257017612457275, + "learning_rate": 4.688136975132491e-06, + "loss": 2.8789, + "step": 62570 + }, + { + "epoch": 4.25159668433211, + "grad_norm": 7.844435691833496, + "learning_rate": 4.687712325044164e-06, + "loss": 2.8005, + "step": 62575 + }, + { + "epoch": 4.251936404402772, + "grad_norm": 7.887001991271973, + "learning_rate": 4.687287674955837e-06, + "loss": 2.7121, + "step": 62580 + }, + { + "epoch": 4.2522761244734335, + "grad_norm": 6.866995334625244, + "learning_rate": 4.68686302486751e-06, + "loss": 2.8194, + "step": 62585 + }, + { + "epoch": 4.252615844544096, + "grad_norm": 8.54642391204834, + "learning_rate": 4.6864383747791824e-06, + "loss": 2.9178, + "step": 62590 + }, + { + "epoch": 4.252955564614758, + "grad_norm": 5.792194843292236, + "learning_rate": 4.686013724690855e-06, + "loss": 2.8519, + "step": 62595 + }, + { + "epoch": 4.253295284685419, + "grad_norm": 7.717559337615967, + "learning_rate": 4.685589074602528e-06, + "loss": 2.8186, + "step": 62600 + }, + { + "epoch": 4.253635004756081, + "grad_norm": 7.752984523773193, + "learning_rate": 4.685164424514201e-06, + "loss": 2.5483, + "step": 62605 + }, + { + "epoch": 4.253974724826743, + "grad_norm": 7.2851409912109375, + "learning_rate": 4.684739774425874e-06, + "loss": 2.8009, + "step": 62610 + }, + { + "epoch": 4.254314444897404, + "grad_norm": 6.543951511383057, + "learning_rate": 4.6843151243375464e-06, + "loss": 2.7887, + "step": 62615 + }, + { + "epoch": 4.254654164968066, + "grad_norm": 8.494802474975586, + "learning_rate": 4.683890474249219e-06, + "loss": 2.8903, + "step": 62620 + }, + { + "epoch": 4.254993885038728, + "grad_norm": 7.815267086029053, + "learning_rate": 4.683465824160891e-06, + "loss": 2.9674, + "step": 62625 + }, + { + "epoch": 4.2553336051093895, + "grad_norm": 6.809991359710693, + "learning_rate": 4.683041174072565e-06, + "loss": 2.947, + "step": 62630 + }, + { + "epoch": 4.255673325180052, + "grad_norm": 9.014466285705566, + "learning_rate": 4.682616523984238e-06, + "loss": 2.9124, + "step": 62635 + }, + { + "epoch": 4.256013045250714, + "grad_norm": 9.21878433227539, + "learning_rate": 4.68219187389591e-06, + "loss": 2.6519, + "step": 62640 + }, + { + "epoch": 4.256352765321375, + "grad_norm": 7.4542155265808105, + "learning_rate": 4.681767223807583e-06, + "loss": 2.9054, + "step": 62645 + }, + { + "epoch": 4.256692485392037, + "grad_norm": 6.9362592697143555, + "learning_rate": 4.681342573719256e-06, + "loss": 2.8438, + "step": 62650 + }, + { + "epoch": 4.257032205462699, + "grad_norm": 6.855062484741211, + "learning_rate": 4.680917923630928e-06, + "loss": 2.9639, + "step": 62655 + }, + { + "epoch": 4.25737192553336, + "grad_norm": 6.761603832244873, + "learning_rate": 4.680493273542601e-06, + "loss": 2.7238, + "step": 62660 + }, + { + "epoch": 4.257711645604022, + "grad_norm": 6.928186893463135, + "learning_rate": 4.6800686234542744e-06, + "loss": 2.8707, + "step": 62665 + }, + { + "epoch": 4.258051365674684, + "grad_norm": 6.7148518562316895, + "learning_rate": 4.679643973365947e-06, + "loss": 3.0547, + "step": 62670 + }, + { + "epoch": 4.2583910857453455, + "grad_norm": 6.337646961212158, + "learning_rate": 4.679219323277619e-06, + "loss": 2.7907, + "step": 62675 + }, + { + "epoch": 4.258730805816008, + "grad_norm": 11.82925796508789, + "learning_rate": 4.678794673189293e-06, + "loss": 2.7218, + "step": 62680 + }, + { + "epoch": 4.25907052588667, + "grad_norm": 8.201228141784668, + "learning_rate": 4.678370023100966e-06, + "loss": 2.9647, + "step": 62685 + }, + { + "epoch": 4.259410245957331, + "grad_norm": 9.52551555633545, + "learning_rate": 4.677945373012638e-06, + "loss": 2.7636, + "step": 62690 + }, + { + "epoch": 4.259749966027993, + "grad_norm": 6.850560665130615, + "learning_rate": 4.67752072292431e-06, + "loss": 2.7215, + "step": 62695 + }, + { + "epoch": 4.260089686098655, + "grad_norm": 6.583461284637451, + "learning_rate": 4.677096072835984e-06, + "loss": 2.9289, + "step": 62700 + }, + { + "epoch": 4.260429406169316, + "grad_norm": 10.982481956481934, + "learning_rate": 4.676671422747656e-06, + "loss": 2.9537, + "step": 62705 + }, + { + "epoch": 4.260769126239978, + "grad_norm": 6.744027137756348, + "learning_rate": 4.676246772659329e-06, + "loss": 3.0047, + "step": 62710 + }, + { + "epoch": 4.26110884631064, + "grad_norm": 6.489953517913818, + "learning_rate": 4.6758221225710025e-06, + "loss": 2.9533, + "step": 62715 + }, + { + "epoch": 4.2614485663813015, + "grad_norm": 7.871822357177734, + "learning_rate": 4.675397472482674e-06, + "loss": 2.7115, + "step": 62720 + }, + { + "epoch": 4.261788286451964, + "grad_norm": 9.357126235961914, + "learning_rate": 4.674972822394347e-06, + "loss": 2.8258, + "step": 62725 + }, + { + "epoch": 4.262128006522626, + "grad_norm": 8.661425590515137, + "learning_rate": 4.67454817230602e-06, + "loss": 3.0668, + "step": 62730 + }, + { + "epoch": 4.262467726593287, + "grad_norm": 7.778095722198486, + "learning_rate": 4.674123522217693e-06, + "loss": 2.9597, + "step": 62735 + }, + { + "epoch": 4.262807446663949, + "grad_norm": 7.361079216003418, + "learning_rate": 4.673698872129366e-06, + "loss": 2.7534, + "step": 62740 + }, + { + "epoch": 4.263147166734611, + "grad_norm": 7.493388652801514, + "learning_rate": 4.673274222041038e-06, + "loss": 3.0496, + "step": 62745 + }, + { + "epoch": 4.263486886805272, + "grad_norm": 10.321579933166504, + "learning_rate": 4.672849571952711e-06, + "loss": 2.8365, + "step": 62750 + }, + { + "epoch": 4.263826606875934, + "grad_norm": 7.263087749481201, + "learning_rate": 4.672424921864384e-06, + "loss": 2.7751, + "step": 62755 + }, + { + "epoch": 4.264166326946596, + "grad_norm": 7.727146148681641, + "learning_rate": 4.672000271776057e-06, + "loss": 2.9258, + "step": 62760 + }, + { + "epoch": 4.2645060470172576, + "grad_norm": 8.182364463806152, + "learning_rate": 4.67157562168773e-06, + "loss": 2.8926, + "step": 62765 + }, + { + "epoch": 4.26484576708792, + "grad_norm": 6.441690921783447, + "learning_rate": 4.671150971599402e-06, + "loss": 2.7937, + "step": 62770 + }, + { + "epoch": 4.265185487158582, + "grad_norm": 7.44880485534668, + "learning_rate": 4.670726321511075e-06, + "loss": 2.8908, + "step": 62775 + }, + { + "epoch": 4.265525207229243, + "grad_norm": 9.591525077819824, + "learning_rate": 4.670301671422748e-06, + "loss": 2.596, + "step": 62780 + }, + { + "epoch": 4.265864927299905, + "grad_norm": 8.396940231323242, + "learning_rate": 4.669877021334421e-06, + "loss": 2.7999, + "step": 62785 + }, + { + "epoch": 4.266204647370567, + "grad_norm": 8.987008094787598, + "learning_rate": 4.669452371246094e-06, + "loss": 2.9131, + "step": 62790 + }, + { + "epoch": 4.266544367441228, + "grad_norm": 6.707448482513428, + "learning_rate": 4.669027721157766e-06, + "loss": 2.8028, + "step": 62795 + }, + { + "epoch": 4.26688408751189, + "grad_norm": 6.5318145751953125, + "learning_rate": 4.668603071069439e-06, + "loss": 2.8579, + "step": 62800 + }, + { + "epoch": 4.267223807582552, + "grad_norm": 7.507208347320557, + "learning_rate": 4.668178420981112e-06, + "loss": 2.8679, + "step": 62805 + }, + { + "epoch": 4.267563527653214, + "grad_norm": 8.81515884399414, + "learning_rate": 4.667753770892785e-06, + "loss": 2.7258, + "step": 62810 + }, + { + "epoch": 4.267903247723876, + "grad_norm": 8.789772987365723, + "learning_rate": 4.667329120804458e-06, + "loss": 3.1006, + "step": 62815 + }, + { + "epoch": 4.268242967794538, + "grad_norm": 8.396578788757324, + "learning_rate": 4.6669044707161304e-06, + "loss": 2.5682, + "step": 62820 + }, + { + "epoch": 4.268582687865199, + "grad_norm": 7.150136470794678, + "learning_rate": 4.666479820627802e-06, + "loss": 2.8945, + "step": 62825 + }, + { + "epoch": 4.268922407935861, + "grad_norm": 7.6158061027526855, + "learning_rate": 4.666055170539476e-06, + "loss": 2.9747, + "step": 62830 + }, + { + "epoch": 4.269262128006522, + "grad_norm": 7.126455307006836, + "learning_rate": 4.665630520451149e-06, + "loss": 2.9845, + "step": 62835 + }, + { + "epoch": 4.269601848077184, + "grad_norm": 6.869636535644531, + "learning_rate": 4.665205870362822e-06, + "loss": 2.87, + "step": 62840 + }, + { + "epoch": 4.269941568147846, + "grad_norm": 8.944473266601562, + "learning_rate": 4.6647812202744944e-06, + "loss": 3.1405, + "step": 62845 + }, + { + "epoch": 4.2702812882185075, + "grad_norm": 6.463403701782227, + "learning_rate": 4.664356570186167e-06, + "loss": 2.9126, + "step": 62850 + }, + { + "epoch": 4.27062100828917, + "grad_norm": 7.070896148681641, + "learning_rate": 4.66393192009784e-06, + "loss": 2.923, + "step": 62855 + }, + { + "epoch": 4.270960728359832, + "grad_norm": 8.116348266601562, + "learning_rate": 4.663507270009513e-06, + "loss": 2.8101, + "step": 62860 + }, + { + "epoch": 4.271300448430493, + "grad_norm": 6.7842559814453125, + "learning_rate": 4.663082619921186e-06, + "loss": 3.0045, + "step": 62865 + }, + { + "epoch": 4.271640168501155, + "grad_norm": 5.969136714935303, + "learning_rate": 4.6626579698328584e-06, + "loss": 2.6433, + "step": 62870 + }, + { + "epoch": 4.271979888571817, + "grad_norm": 5.577493190765381, + "learning_rate": 4.66223331974453e-06, + "loss": 2.9726, + "step": 62875 + }, + { + "epoch": 4.272319608642478, + "grad_norm": 8.086090087890625, + "learning_rate": 4.661808669656204e-06, + "loss": 2.7011, + "step": 62880 + }, + { + "epoch": 4.27265932871314, + "grad_norm": 10.072179794311523, + "learning_rate": 4.661384019567877e-06, + "loss": 2.8333, + "step": 62885 + }, + { + "epoch": 4.272999048783802, + "grad_norm": 7.508802890777588, + "learning_rate": 4.660959369479549e-06, + "loss": 2.9946, + "step": 62890 + }, + { + "epoch": 4.2733387688544635, + "grad_norm": 9.960588455200195, + "learning_rate": 4.6605347193912224e-06, + "loss": 2.874, + "step": 62895 + }, + { + "epoch": 4.273678488925126, + "grad_norm": 7.232886791229248, + "learning_rate": 4.660110069302895e-06, + "loss": 2.7589, + "step": 62900 + }, + { + "epoch": 4.274018208995788, + "grad_norm": 7.425696849822998, + "learning_rate": 4.659685419214567e-06, + "loss": 2.8471, + "step": 62905 + }, + { + "epoch": 4.274357929066449, + "grad_norm": 5.8708977699279785, + "learning_rate": 4.65926076912624e-06, + "loss": 2.7795, + "step": 62910 + }, + { + "epoch": 4.274697649137111, + "grad_norm": 7.493600845336914, + "learning_rate": 4.658836119037914e-06, + "loss": 2.8384, + "step": 62915 + }, + { + "epoch": 4.275037369207773, + "grad_norm": 6.276280879974365, + "learning_rate": 4.658411468949586e-06, + "loss": 2.9473, + "step": 62920 + }, + { + "epoch": 4.275377089278434, + "grad_norm": 7.088441371917725, + "learning_rate": 4.657986818861258e-06, + "loss": 2.7737, + "step": 62925 + }, + { + "epoch": 4.275716809349096, + "grad_norm": 7.5101213455200195, + "learning_rate": 4.657562168772932e-06, + "loss": 2.9546, + "step": 62930 + }, + { + "epoch": 4.276056529419758, + "grad_norm": 6.400911808013916, + "learning_rate": 4.657137518684604e-06, + "loss": 2.6195, + "step": 62935 + }, + { + "epoch": 4.2763962494904195, + "grad_norm": 5.978126049041748, + "learning_rate": 4.656712868596277e-06, + "loss": 2.785, + "step": 62940 + }, + { + "epoch": 4.276735969561082, + "grad_norm": 9.651934623718262, + "learning_rate": 4.65628821850795e-06, + "loss": 2.82, + "step": 62945 + }, + { + "epoch": 4.277075689631744, + "grad_norm": 7.442403316497803, + "learning_rate": 4.655863568419622e-06, + "loss": 2.9135, + "step": 62950 + }, + { + "epoch": 4.277415409702405, + "grad_norm": 7.546807765960693, + "learning_rate": 4.655438918331295e-06, + "loss": 2.8638, + "step": 62955 + }, + { + "epoch": 4.277755129773067, + "grad_norm": 7.862437725067139, + "learning_rate": 4.655014268242968e-06, + "loss": 2.8076, + "step": 62960 + }, + { + "epoch": 4.278094849843729, + "grad_norm": 7.000691890716553, + "learning_rate": 4.654589618154641e-06, + "loss": 2.9217, + "step": 62965 + }, + { + "epoch": 4.27843456991439, + "grad_norm": 8.285785675048828, + "learning_rate": 4.654164968066314e-06, + "loss": 2.7404, + "step": 62970 + }, + { + "epoch": 4.278774289985052, + "grad_norm": 6.776730537414551, + "learning_rate": 4.653740317977986e-06, + "loss": 2.8351, + "step": 62975 + }, + { + "epoch": 4.279114010055714, + "grad_norm": 7.466279029846191, + "learning_rate": 4.653315667889659e-06, + "loss": 3.0256, + "step": 62980 + }, + { + "epoch": 4.2794537301263755, + "grad_norm": 5.566476345062256, + "learning_rate": 4.652891017801332e-06, + "loss": 2.7693, + "step": 62985 + }, + { + "epoch": 4.279793450197038, + "grad_norm": 6.419910430908203, + "learning_rate": 4.652466367713005e-06, + "loss": 2.7879, + "step": 62990 + }, + { + "epoch": 4.2801331702677, + "grad_norm": 6.384851455688477, + "learning_rate": 4.652041717624678e-06, + "loss": 2.5521, + "step": 62995 + }, + { + "epoch": 4.280472890338361, + "grad_norm": 8.434751510620117, + "learning_rate": 4.65161706753635e-06, + "loss": 2.9416, + "step": 63000 + }, + { + "epoch": 4.280812610409023, + "grad_norm": 5.970948219299316, + "learning_rate": 4.651192417448023e-06, + "loss": 3.0263, + "step": 63005 + }, + { + "epoch": 4.281152330479685, + "grad_norm": 7.395815372467041, + "learning_rate": 4.650767767359696e-06, + "loss": 2.9872, + "step": 63010 + }, + { + "epoch": 4.281492050550346, + "grad_norm": 7.358449459075928, + "learning_rate": 4.650343117271369e-06, + "loss": 2.7646, + "step": 63015 + }, + { + "epoch": 4.281831770621008, + "grad_norm": 8.62920093536377, + "learning_rate": 4.649918467183042e-06, + "loss": 3.1451, + "step": 63020 + }, + { + "epoch": 4.28217149069167, + "grad_norm": 6.587813377380371, + "learning_rate": 4.649493817094714e-06, + "loss": 2.9341, + "step": 63025 + }, + { + "epoch": 4.2825112107623315, + "grad_norm": 8.196343421936035, + "learning_rate": 4.649069167006387e-06, + "loss": 2.8646, + "step": 63030 + }, + { + "epoch": 4.282850930832994, + "grad_norm": 6.8110737800598145, + "learning_rate": 4.64864451691806e-06, + "loss": 3.0372, + "step": 63035 + }, + { + "epoch": 4.283190650903656, + "grad_norm": 8.147140502929688, + "learning_rate": 4.648219866829733e-06, + "loss": 2.9024, + "step": 63040 + }, + { + "epoch": 4.283530370974317, + "grad_norm": 8.444870948791504, + "learning_rate": 4.647795216741406e-06, + "loss": 3.0148, + "step": 63045 + }, + { + "epoch": 4.283870091044979, + "grad_norm": 7.221855163574219, + "learning_rate": 4.647370566653078e-06, + "loss": 2.9829, + "step": 63050 + }, + { + "epoch": 4.284209811115641, + "grad_norm": 6.699174880981445, + "learning_rate": 4.646945916564751e-06, + "loss": 3.0419, + "step": 63055 + }, + { + "epoch": 4.284549531186302, + "grad_norm": 7.73271369934082, + "learning_rate": 4.646521266476424e-06, + "loss": 2.9043, + "step": 63060 + }, + { + "epoch": 4.284889251256964, + "grad_norm": 8.700098991394043, + "learning_rate": 4.646096616388097e-06, + "loss": 3.0107, + "step": 63065 + }, + { + "epoch": 4.285228971327626, + "grad_norm": 7.394592761993408, + "learning_rate": 4.64567196629977e-06, + "loss": 2.8854, + "step": 63070 + }, + { + "epoch": 4.2855686913982876, + "grad_norm": 9.11603832244873, + "learning_rate": 4.6452473162114416e-06, + "loss": 3.1386, + "step": 63075 + }, + { + "epoch": 4.28590841146895, + "grad_norm": 8.086433410644531, + "learning_rate": 4.644822666123115e-06, + "loss": 3.0644, + "step": 63080 + }, + { + "epoch": 4.286248131539612, + "grad_norm": 7.103802680969238, + "learning_rate": 4.644398016034788e-06, + "loss": 2.83, + "step": 63085 + }, + { + "epoch": 4.286587851610273, + "grad_norm": 5.793385028839111, + "learning_rate": 4.64397336594646e-06, + "loss": 2.7545, + "step": 63090 + }, + { + "epoch": 4.286927571680935, + "grad_norm": 7.737665176391602, + "learning_rate": 4.643548715858134e-06, + "loss": 2.398, + "step": 63095 + }, + { + "epoch": 4.287267291751597, + "grad_norm": 6.4552764892578125, + "learning_rate": 4.643124065769806e-06, + "loss": 2.9126, + "step": 63100 + }, + { + "epoch": 4.287607011822258, + "grad_norm": 6.998864650726318, + "learning_rate": 4.642699415681478e-06, + "loss": 2.7359, + "step": 63105 + }, + { + "epoch": 4.28794673189292, + "grad_norm": 8.734489440917969, + "learning_rate": 4.642274765593151e-06, + "loss": 2.8436, + "step": 63110 + }, + { + "epoch": 4.288286451963582, + "grad_norm": 7.316539764404297, + "learning_rate": 4.641850115504825e-06, + "loss": 2.9236, + "step": 63115 + }, + { + "epoch": 4.288626172034244, + "grad_norm": 9.834442138671875, + "learning_rate": 4.641425465416497e-06, + "loss": 2.8982, + "step": 63120 + }, + { + "epoch": 4.288965892104906, + "grad_norm": 7.73295259475708, + "learning_rate": 4.64100081532817e-06, + "loss": 3.1772, + "step": 63125 + }, + { + "epoch": 4.289305612175568, + "grad_norm": 9.796952247619629, + "learning_rate": 4.640576165239843e-06, + "loss": 2.8194, + "step": 63130 + }, + { + "epoch": 4.289645332246229, + "grad_norm": 6.936681270599365, + "learning_rate": 4.640151515151515e-06, + "loss": 2.9147, + "step": 63135 + }, + { + "epoch": 4.289985052316891, + "grad_norm": 9.032187461853027, + "learning_rate": 4.639726865063188e-06, + "loss": 2.8969, + "step": 63140 + }, + { + "epoch": 4.290324772387553, + "grad_norm": 6.558069229125977, + "learning_rate": 4.639302214974862e-06, + "loss": 2.8998, + "step": 63145 + }, + { + "epoch": 4.290664492458214, + "grad_norm": 7.402829170227051, + "learning_rate": 4.638877564886534e-06, + "loss": 3.0547, + "step": 63150 + }, + { + "epoch": 4.291004212528876, + "grad_norm": 7.349817752838135, + "learning_rate": 4.638452914798206e-06, + "loss": 2.9676, + "step": 63155 + }, + { + "epoch": 4.291343932599538, + "grad_norm": 7.577779769897461, + "learning_rate": 4.638028264709879e-06, + "loss": 2.9784, + "step": 63160 + }, + { + "epoch": 4.2916836526702, + "grad_norm": 7.977151870727539, + "learning_rate": 4.637603614621552e-06, + "loss": 2.5783, + "step": 63165 + }, + { + "epoch": 4.292023372740862, + "grad_norm": 7.34816312789917, + "learning_rate": 4.637178964533225e-06, + "loss": 2.7715, + "step": 63170 + }, + { + "epoch": 4.292363092811524, + "grad_norm": 7.598769664764404, + "learning_rate": 4.636754314444898e-06, + "loss": 2.8236, + "step": 63175 + }, + { + "epoch": 4.292702812882185, + "grad_norm": 7.6872477531433105, + "learning_rate": 4.636329664356571e-06, + "loss": 2.8159, + "step": 63180 + }, + { + "epoch": 4.293042532952847, + "grad_norm": 7.255385875701904, + "learning_rate": 4.635905014268243e-06, + "loss": 2.7841, + "step": 63185 + }, + { + "epoch": 4.293382253023509, + "grad_norm": 7.662545680999756, + "learning_rate": 4.635480364179916e-06, + "loss": 2.8001, + "step": 63190 + }, + { + "epoch": 4.29372197309417, + "grad_norm": 5.482833385467529, + "learning_rate": 4.635055714091589e-06, + "loss": 3.0292, + "step": 63195 + }, + { + "epoch": 4.294061693164832, + "grad_norm": 8.109508514404297, + "learning_rate": 4.634631064003262e-06, + "loss": 2.5677, + "step": 63200 + }, + { + "epoch": 4.294401413235494, + "grad_norm": 9.104565620422363, + "learning_rate": 4.634206413914934e-06, + "loss": 2.9573, + "step": 63205 + }, + { + "epoch": 4.294741133306156, + "grad_norm": 6.335338592529297, + "learning_rate": 4.633781763826607e-06, + "loss": 2.8567, + "step": 63210 + }, + { + "epoch": 4.295080853376818, + "grad_norm": 8.287076950073242, + "learning_rate": 4.63335711373828e-06, + "loss": 2.797, + "step": 63215 + }, + { + "epoch": 4.29542057344748, + "grad_norm": 8.264930725097656, + "learning_rate": 4.632932463649953e-06, + "loss": 2.9961, + "step": 63220 + }, + { + "epoch": 4.295760293518141, + "grad_norm": 7.736843585968018, + "learning_rate": 4.632507813561626e-06, + "loss": 2.7267, + "step": 63225 + }, + { + "epoch": 4.296100013588803, + "grad_norm": 7.515257358551025, + "learning_rate": 4.632083163473298e-06, + "loss": 2.9086, + "step": 63230 + }, + { + "epoch": 4.296439733659464, + "grad_norm": 9.170676231384277, + "learning_rate": 4.631658513384971e-06, + "loss": 2.8303, + "step": 63235 + }, + { + "epoch": 4.296779453730126, + "grad_norm": 9.372577667236328, + "learning_rate": 4.631233863296644e-06, + "loss": 2.7212, + "step": 63240 + }, + { + "epoch": 4.297119173800788, + "grad_norm": 6.140687942504883, + "learning_rate": 4.630809213208317e-06, + "loss": 2.8846, + "step": 63245 + }, + { + "epoch": 4.2974588938714495, + "grad_norm": 6.009908199310303, + "learning_rate": 4.63038456311999e-06, + "loss": 3.0196, + "step": 63250 + }, + { + "epoch": 4.297798613942112, + "grad_norm": 7.662829399108887, + "learning_rate": 4.629959913031662e-06, + "loss": 3.0077, + "step": 63255 + }, + { + "epoch": 4.298138334012774, + "grad_norm": 8.234529495239258, + "learning_rate": 4.629535262943335e-06, + "loss": 2.905, + "step": 63260 + }, + { + "epoch": 4.298478054083435, + "grad_norm": 7.629650115966797, + "learning_rate": 4.629110612855008e-06, + "loss": 3.0244, + "step": 63265 + }, + { + "epoch": 4.298817774154097, + "grad_norm": 8.005126953125, + "learning_rate": 4.628685962766681e-06, + "loss": 2.6596, + "step": 63270 + }, + { + "epoch": 4.299157494224759, + "grad_norm": 7.726984977722168, + "learning_rate": 4.628261312678354e-06, + "loss": 3.0134, + "step": 63275 + }, + { + "epoch": 4.29949721429542, + "grad_norm": NaN, + "learning_rate": 4.627921592607692e-06, + "loss": 2.9033, + "step": 63280 + }, + { + "epoch": 4.299836934366082, + "grad_norm": 7.34937858581543, + "learning_rate": 4.6274969425193645e-06, + "loss": 2.87, + "step": 63285 + }, + { + "epoch": 4.300176654436744, + "grad_norm": 6.63901424407959, + "learning_rate": 4.627072292431037e-06, + "loss": 3.0065, + "step": 63290 + }, + { + "epoch": 4.3005163745074055, + "grad_norm": 8.272637367248535, + "learning_rate": 4.62664764234271e-06, + "loss": 2.9913, + "step": 63295 + }, + { + "epoch": 4.300856094578068, + "grad_norm": 8.184621810913086, + "learning_rate": 4.626222992254383e-06, + "loss": 2.7935, + "step": 63300 + }, + { + "epoch": 4.30119581464873, + "grad_norm": 7.939073085784912, + "learning_rate": 4.625798342166056e-06, + "loss": 2.9444, + "step": 63305 + }, + { + "epoch": 4.301535534719391, + "grad_norm": 7.441401481628418, + "learning_rate": 4.6253736920777285e-06, + "loss": 2.625, + "step": 63310 + }, + { + "epoch": 4.301875254790053, + "grad_norm": 9.219676971435547, + "learning_rate": 4.624949041989401e-06, + "loss": 2.8517, + "step": 63315 + }, + { + "epoch": 4.302214974860715, + "grad_norm": 7.541032314300537, + "learning_rate": 4.624524391901074e-06, + "loss": 3.1443, + "step": 63320 + }, + { + "epoch": 4.302554694931376, + "grad_norm": 6.77532434463501, + "learning_rate": 4.624099741812746e-06, + "loss": 2.8912, + "step": 63325 + }, + { + "epoch": 4.302894415002038, + "grad_norm": 6.922149181365967, + "learning_rate": 4.62367509172442e-06, + "loss": 2.8076, + "step": 63330 + }, + { + "epoch": 4.3032341350727, + "grad_norm": 8.63870620727539, + "learning_rate": 4.6232504416360925e-06, + "loss": 2.7526, + "step": 63335 + }, + { + "epoch": 4.3035738551433615, + "grad_norm": 8.432437896728516, + "learning_rate": 4.6228257915477644e-06, + "loss": 2.7012, + "step": 63340 + }, + { + "epoch": 4.303913575214024, + "grad_norm": 6.846409320831299, + "learning_rate": 4.622401141459438e-06, + "loss": 2.6912, + "step": 63345 + }, + { + "epoch": 4.304253295284686, + "grad_norm": 6.701126575469971, + "learning_rate": 4.621976491371111e-06, + "loss": 2.8838, + "step": 63350 + }, + { + "epoch": 4.304593015355347, + "grad_norm": 6.050832748413086, + "learning_rate": 4.621551841282783e-06, + "loss": 2.737, + "step": 63355 + }, + { + "epoch": 4.304932735426009, + "grad_norm": 5.95407247543335, + "learning_rate": 4.6211271911944565e-06, + "loss": 3.2471, + "step": 63360 + }, + { + "epoch": 4.305272455496671, + "grad_norm": 8.84823989868164, + "learning_rate": 4.620702541106129e-06, + "loss": 2.9901, + "step": 63365 + }, + { + "epoch": 4.305612175567332, + "grad_norm": 8.383228302001953, + "learning_rate": 4.620277891017801e-06, + "loss": 2.8942, + "step": 63370 + }, + { + "epoch": 4.305951895637994, + "grad_norm": 7.411113739013672, + "learning_rate": 4.619853240929474e-06, + "loss": 2.9007, + "step": 63375 + }, + { + "epoch": 4.306291615708656, + "grad_norm": 7.517906665802002, + "learning_rate": 4.619428590841148e-06, + "loss": 2.6028, + "step": 63380 + }, + { + "epoch": 4.306631335779318, + "grad_norm": 7.505152225494385, + "learning_rate": 4.6190039407528205e-06, + "loss": 2.6856, + "step": 63385 + }, + { + "epoch": 4.30697105584998, + "grad_norm": 7.513760089874268, + "learning_rate": 4.6185792906644924e-06, + "loss": 2.8188, + "step": 63390 + }, + { + "epoch": 4.307310775920642, + "grad_norm": 7.253246307373047, + "learning_rate": 4.618154640576166e-06, + "loss": 2.8093, + "step": 63395 + }, + { + "epoch": 4.307650495991303, + "grad_norm": 6.824127197265625, + "learning_rate": 4.617729990487839e-06, + "loss": 3.0017, + "step": 63400 + }, + { + "epoch": 4.307990216061965, + "grad_norm": 6.225640773773193, + "learning_rate": 4.617305340399511e-06, + "loss": 2.7345, + "step": 63405 + }, + { + "epoch": 4.308329936132627, + "grad_norm": 6.760220050811768, + "learning_rate": 4.616880690311184e-06, + "loss": 2.7929, + "step": 63410 + }, + { + "epoch": 4.308669656203288, + "grad_norm": 6.002750873565674, + "learning_rate": 4.616456040222857e-06, + "loss": 2.983, + "step": 63415 + }, + { + "epoch": 4.30900937627395, + "grad_norm": 7.54432487487793, + "learning_rate": 4.616031390134529e-06, + "loss": 2.8871, + "step": 63420 + }, + { + "epoch": 4.309349096344612, + "grad_norm": 5.879334926605225, + "learning_rate": 4.615606740046202e-06, + "loss": 2.9258, + "step": 63425 + }, + { + "epoch": 4.309688816415274, + "grad_norm": 6.922394752502441, + "learning_rate": 4.615182089957876e-06, + "loss": 2.9566, + "step": 63430 + }, + { + "epoch": 4.310028536485936, + "grad_norm": 6.182366371154785, + "learning_rate": 4.614757439869548e-06, + "loss": 2.9812, + "step": 63435 + }, + { + "epoch": 4.310368256556598, + "grad_norm": 8.23377513885498, + "learning_rate": 4.6143327897812205e-06, + "loss": 2.8566, + "step": 63440 + }, + { + "epoch": 4.310707976627259, + "grad_norm": 7.773716926574707, + "learning_rate": 4.613908139692893e-06, + "loss": 2.7444, + "step": 63445 + }, + { + "epoch": 4.311047696697921, + "grad_norm": 6.709845542907715, + "learning_rate": 4.613483489604566e-06, + "loss": 2.8686, + "step": 63450 + }, + { + "epoch": 4.311387416768583, + "grad_norm": 7.406970500946045, + "learning_rate": 4.613058839516239e-06, + "loss": 3.0178, + "step": 63455 + }, + { + "epoch": 4.311727136839244, + "grad_norm": 9.483231544494629, + "learning_rate": 4.612634189427912e-06, + "loss": 2.9715, + "step": 63460 + }, + { + "epoch": 4.312066856909906, + "grad_norm": 7.051815986633301, + "learning_rate": 4.6122095393395845e-06, + "loss": 2.7603, + "step": 63465 + }, + { + "epoch": 4.312406576980568, + "grad_norm": 7.622135639190674, + "learning_rate": 4.611784889251257e-06, + "loss": 2.9295, + "step": 63470 + }, + { + "epoch": 4.31274629705123, + "grad_norm": 9.939727783203125, + "learning_rate": 4.61136023916293e-06, + "loss": 2.9946, + "step": 63475 + }, + { + "epoch": 4.313086017121892, + "grad_norm": 8.02879810333252, + "learning_rate": 4.610935589074603e-06, + "loss": 2.9665, + "step": 63480 + }, + { + "epoch": 4.313425737192554, + "grad_norm": 7.5535430908203125, + "learning_rate": 4.610510938986276e-06, + "loss": 2.7862, + "step": 63485 + }, + { + "epoch": 4.313765457263215, + "grad_norm": 8.273752212524414, + "learning_rate": 4.6100862888979485e-06, + "loss": 2.9065, + "step": 63490 + }, + { + "epoch": 4.314105177333877, + "grad_norm": 7.668753147125244, + "learning_rate": 4.609661638809621e-06, + "loss": 2.7574, + "step": 63495 + }, + { + "epoch": 4.314444897404539, + "grad_norm": 6.605005741119385, + "learning_rate": 4.609236988721294e-06, + "loss": 3.0507, + "step": 63500 + }, + { + "epoch": 4.3147846174752, + "grad_norm": 7.944074630737305, + "learning_rate": 4.608812338632967e-06, + "loss": 2.9736, + "step": 63505 + }, + { + "epoch": 4.315124337545862, + "grad_norm": 9.947481155395508, + "learning_rate": 4.60838768854464e-06, + "loss": 2.9941, + "step": 63510 + }, + { + "epoch": 4.3154640576165235, + "grad_norm": 8.076628684997559, + "learning_rate": 4.6079630384563125e-06, + "loss": 3.0167, + "step": 63515 + }, + { + "epoch": 4.315803777687186, + "grad_norm": 8.468987464904785, + "learning_rate": 4.607538388367985e-06, + "loss": 2.7761, + "step": 63520 + }, + { + "epoch": 4.316143497757848, + "grad_norm": 6.217399597167969, + "learning_rate": 4.607113738279658e-06, + "loss": 2.9268, + "step": 63525 + }, + { + "epoch": 4.316483217828509, + "grad_norm": 8.039557456970215, + "learning_rate": 4.606689088191331e-06, + "loss": 2.7288, + "step": 63530 + }, + { + "epoch": 4.316822937899171, + "grad_norm": 8.784933090209961, + "learning_rate": 4.606264438103004e-06, + "loss": 3.0414, + "step": 63535 + }, + { + "epoch": 4.317162657969833, + "grad_norm": 6.6808905601501465, + "learning_rate": 4.605839788014676e-06, + "loss": 2.8637, + "step": 63540 + }, + { + "epoch": 4.317502378040494, + "grad_norm": 7.076836585998535, + "learning_rate": 4.605415137926349e-06, + "loss": 2.9569, + "step": 63545 + }, + { + "epoch": 4.317842098111156, + "grad_norm": 6.991944789886475, + "learning_rate": 4.604990487838022e-06, + "loss": 2.8226, + "step": 63550 + }, + { + "epoch": 4.318181818181818, + "grad_norm": 7.725498199462891, + "learning_rate": 4.604565837749695e-06, + "loss": 3.0054, + "step": 63555 + }, + { + "epoch": 4.3185215382524795, + "grad_norm": 8.160186767578125, + "learning_rate": 4.604141187661368e-06, + "loss": 3.0362, + "step": 63560 + }, + { + "epoch": 4.318861258323142, + "grad_norm": 7.39808464050293, + "learning_rate": 4.6037165375730405e-06, + "loss": 2.8698, + "step": 63565 + }, + { + "epoch": 4.319200978393804, + "grad_norm": 6.94004487991333, + "learning_rate": 4.603291887484713e-06, + "loss": 2.8547, + "step": 63570 + }, + { + "epoch": 4.319540698464465, + "grad_norm": 7.305065155029297, + "learning_rate": 4.602867237396385e-06, + "loss": 2.7918, + "step": 63575 + }, + { + "epoch": 4.319880418535127, + "grad_norm": 6.493807315826416, + "learning_rate": 4.602442587308059e-06, + "loss": 3.032, + "step": 63580 + }, + { + "epoch": 4.320220138605789, + "grad_norm": 6.117049694061279, + "learning_rate": 4.602017937219732e-06, + "loss": 2.9936, + "step": 63585 + }, + { + "epoch": 4.32055985867645, + "grad_norm": 8.177962303161621, + "learning_rate": 4.601593287131404e-06, + "loss": 2.7758, + "step": 63590 + }, + { + "epoch": 4.320899578747112, + "grad_norm": 6.687529563903809, + "learning_rate": 4.601168637043077e-06, + "loss": 2.6668, + "step": 63595 + }, + { + "epoch": 4.321239298817774, + "grad_norm": 7.205031871795654, + "learning_rate": 4.60074398695475e-06, + "loss": 2.9521, + "step": 63600 + }, + { + "epoch": 4.3215790188884355, + "grad_norm": 6.980223178863525, + "learning_rate": 4.600319336866422e-06, + "loss": 2.8827, + "step": 63605 + }, + { + "epoch": 4.321918738959098, + "grad_norm": 8.809675216674805, + "learning_rate": 4.599894686778095e-06, + "loss": 2.8169, + "step": 63610 + }, + { + "epoch": 4.32225845902976, + "grad_norm": 7.441550254821777, + "learning_rate": 4.5994700366897685e-06, + "loss": 2.8617, + "step": 63615 + }, + { + "epoch": 4.322598179100421, + "grad_norm": 6.63363790512085, + "learning_rate": 4.5990453866014404e-06, + "loss": 2.8037, + "step": 63620 + }, + { + "epoch": 4.322937899171083, + "grad_norm": 6.91447639465332, + "learning_rate": 4.598620736513113e-06, + "loss": 2.8084, + "step": 63625 + }, + { + "epoch": 4.323277619241745, + "grad_norm": 6.586512088775635, + "learning_rate": 4.598196086424787e-06, + "loss": 2.9666, + "step": 63630 + }, + { + "epoch": 4.323617339312406, + "grad_norm": 7.262307167053223, + "learning_rate": 4.597771436336459e-06, + "loss": 2.5934, + "step": 63635 + }, + { + "epoch": 4.323957059383068, + "grad_norm": 5.694019794464111, + "learning_rate": 4.597346786248132e-06, + "loss": 2.8305, + "step": 63640 + }, + { + "epoch": 4.32429677945373, + "grad_norm": 6.131400108337402, + "learning_rate": 4.596922136159805e-06, + "loss": 2.8697, + "step": 63645 + }, + { + "epoch": 4.3246364995243916, + "grad_norm": 6.37682580947876, + "learning_rate": 4.596497486071477e-06, + "loss": 2.8445, + "step": 63650 + }, + { + "epoch": 4.324976219595054, + "grad_norm": 7.195163726806641, + "learning_rate": 4.59607283598315e-06, + "loss": 2.8097, + "step": 63655 + }, + { + "epoch": 4.325315939665716, + "grad_norm": 7.714854717254639, + "learning_rate": 4.595648185894823e-06, + "loss": 2.6973, + "step": 63660 + }, + { + "epoch": 4.325655659736377, + "grad_norm": 9.552149772644043, + "learning_rate": 4.595223535806496e-06, + "loss": 3.1741, + "step": 63665 + }, + { + "epoch": 4.325995379807039, + "grad_norm": 6.235179424285889, + "learning_rate": 4.5947988857181684e-06, + "loss": 2.9927, + "step": 63670 + }, + { + "epoch": 4.326335099877701, + "grad_norm": 6.2537031173706055, + "learning_rate": 4.594374235629841e-06, + "loss": 3.0045, + "step": 63675 + }, + { + "epoch": 4.326674819948362, + "grad_norm": 7.8473286628723145, + "learning_rate": 4.593949585541514e-06, + "loss": 3.1279, + "step": 63680 + }, + { + "epoch": 4.327014540019024, + "grad_norm": 6.153866767883301, + "learning_rate": 4.593524935453187e-06, + "loss": 2.8747, + "step": 63685 + }, + { + "epoch": 4.327354260089686, + "grad_norm": 6.464003562927246, + "learning_rate": 4.59310028536486e-06, + "loss": 2.9684, + "step": 63690 + }, + { + "epoch": 4.327693980160348, + "grad_norm": 7.28519344329834, + "learning_rate": 4.5926756352765324e-06, + "loss": 2.9102, + "step": 63695 + }, + { + "epoch": 4.32803370023101, + "grad_norm": 7.118729591369629, + "learning_rate": 4.592250985188205e-06, + "loss": 2.7573, + "step": 63700 + }, + { + "epoch": 4.328373420301672, + "grad_norm": 6.534059047698975, + "learning_rate": 4.591826335099878e-06, + "loss": 2.7252, + "step": 63705 + }, + { + "epoch": 4.328713140372333, + "grad_norm": 6.264959335327148, + "learning_rate": 4.591401685011551e-06, + "loss": 2.929, + "step": 63710 + }, + { + "epoch": 4.329052860442995, + "grad_norm": 6.4210286140441895, + "learning_rate": 4.590977034923224e-06, + "loss": 2.5903, + "step": 63715 + }, + { + "epoch": 4.329392580513657, + "grad_norm": 6.4047746658325195, + "learning_rate": 4.5905523848348964e-06, + "loss": 2.8283, + "step": 63720 + }, + { + "epoch": 4.329732300584318, + "grad_norm": 6.732541561126709, + "learning_rate": 4.590127734746569e-06, + "loss": 2.6792, + "step": 63725 + }, + { + "epoch": 4.33007202065498, + "grad_norm": 6.498129367828369, + "learning_rate": 4.589703084658242e-06, + "loss": 2.6721, + "step": 63730 + }, + { + "epoch": 4.330411740725642, + "grad_norm": 8.958803176879883, + "learning_rate": 4.589278434569915e-06, + "loss": 2.5283, + "step": 63735 + }, + { + "epoch": 4.330751460796304, + "grad_norm": 7.111974716186523, + "learning_rate": 4.588853784481588e-06, + "loss": 2.8504, + "step": 63740 + }, + { + "epoch": 4.331091180866966, + "grad_norm": 8.18364429473877, + "learning_rate": 4.5884291343932605e-06, + "loss": 2.6222, + "step": 63745 + }, + { + "epoch": 4.331430900937628, + "grad_norm": 6.727498531341553, + "learning_rate": 4.588004484304933e-06, + "loss": 3.0041, + "step": 63750 + }, + { + "epoch": 4.331770621008289, + "grad_norm": 7.430977821350098, + "learning_rate": 4.587579834216606e-06, + "loss": 2.7103, + "step": 63755 + }, + { + "epoch": 4.332110341078951, + "grad_norm": 7.036398410797119, + "learning_rate": 4.587155184128279e-06, + "loss": 2.78, + "step": 63760 + }, + { + "epoch": 4.332450061149613, + "grad_norm": 8.159478187561035, + "learning_rate": 4.586730534039952e-06, + "loss": 2.6021, + "step": 63765 + }, + { + "epoch": 4.332789781220274, + "grad_norm": 5.967456340789795, + "learning_rate": 4.5863058839516245e-06, + "loss": 2.8034, + "step": 63770 + }, + { + "epoch": 4.333129501290936, + "grad_norm": 9.174521446228027, + "learning_rate": 4.585881233863297e-06, + "loss": 2.7969, + "step": 63775 + }, + { + "epoch": 4.333469221361598, + "grad_norm": 6.445182800292969, + "learning_rate": 4.58545658377497e-06, + "loss": 2.5941, + "step": 63780 + }, + { + "epoch": 4.33380894143226, + "grad_norm": 7.262064456939697, + "learning_rate": 4.585031933686643e-06, + "loss": 3.2635, + "step": 63785 + }, + { + "epoch": 4.334148661502922, + "grad_norm": 5.803561687469482, + "learning_rate": 4.584607283598315e-06, + "loss": 2.7504, + "step": 63790 + }, + { + "epoch": 4.334488381573584, + "grad_norm": 8.282407760620117, + "learning_rate": 4.5841826335099885e-06, + "loss": 2.7741, + "step": 63795 + }, + { + "epoch": 4.334828101644245, + "grad_norm": 7.443223476409912, + "learning_rate": 4.583757983421661e-06, + "loss": 2.6463, + "step": 63800 + }, + { + "epoch": 4.335167821714907, + "grad_norm": 5.213032245635986, + "learning_rate": 4.583333333333333e-06, + "loss": 2.9571, + "step": 63805 + }, + { + "epoch": 4.335507541785569, + "grad_norm": 8.624944686889648, + "learning_rate": 4.582908683245007e-06, + "loss": 2.6322, + "step": 63810 + }, + { + "epoch": 4.33584726185623, + "grad_norm": 7.647025108337402, + "learning_rate": 4.58248403315668e-06, + "loss": 3.1141, + "step": 63815 + }, + { + "epoch": 4.336186981926892, + "grad_norm": 8.143477439880371, + "learning_rate": 4.582059383068352e-06, + "loss": 2.9138, + "step": 63820 + }, + { + "epoch": 4.336526701997554, + "grad_norm": 7.005324363708496, + "learning_rate": 4.581634732980024e-06, + "loss": 3.079, + "step": 63825 + }, + { + "epoch": 4.336866422068216, + "grad_norm": 7.726224422454834, + "learning_rate": 4.581210082891698e-06, + "loss": 2.9441, + "step": 63830 + }, + { + "epoch": 4.337206142138878, + "grad_norm": 8.842230796813965, + "learning_rate": 4.58078543280337e-06, + "loss": 2.836, + "step": 63835 + }, + { + "epoch": 4.33754586220954, + "grad_norm": 6.851377487182617, + "learning_rate": 4.580360782715043e-06, + "loss": 2.8637, + "step": 63840 + }, + { + "epoch": 4.337885582280201, + "grad_norm": 7.672262191772461, + "learning_rate": 4.5799361326267165e-06, + "loss": 2.8816, + "step": 63845 + }, + { + "epoch": 4.338225302350863, + "grad_norm": 8.116138458251953, + "learning_rate": 4.579511482538388e-06, + "loss": 2.5017, + "step": 63850 + }, + { + "epoch": 4.338565022421525, + "grad_norm": 5.788112163543701, + "learning_rate": 4.579086832450061e-06, + "loss": 2.8143, + "step": 63855 + }, + { + "epoch": 4.338904742492186, + "grad_norm": 7.767604351043701, + "learning_rate": 4.578662182361734e-06, + "loss": 2.8593, + "step": 63860 + }, + { + "epoch": 4.339244462562848, + "grad_norm": 9.587363243103027, + "learning_rate": 4.578237532273407e-06, + "loss": 3.0903, + "step": 63865 + }, + { + "epoch": 4.33958418263351, + "grad_norm": 6.913327217102051, + "learning_rate": 4.57781288218508e-06, + "loss": 2.9196, + "step": 63870 + }, + { + "epoch": 4.339923902704172, + "grad_norm": 6.605997562408447, + "learning_rate": 4.5773882320967524e-06, + "loss": 2.8878, + "step": 63875 + }, + { + "epoch": 4.340263622774834, + "grad_norm": 7.005033016204834, + "learning_rate": 4.576963582008425e-06, + "loss": 3.2009, + "step": 63880 + }, + { + "epoch": 4.340603342845496, + "grad_norm": 8.990693092346191, + "learning_rate": 4.576538931920098e-06, + "loss": 3.142, + "step": 63885 + }, + { + "epoch": 4.340943062916157, + "grad_norm": 6.949220180511475, + "learning_rate": 4.576114281831771e-06, + "loss": 2.9466, + "step": 63890 + }, + { + "epoch": 4.341282782986819, + "grad_norm": 5.646663665771484, + "learning_rate": 4.575689631743444e-06, + "loss": 2.8159, + "step": 63895 + }, + { + "epoch": 4.341622503057481, + "grad_norm": 8.303557395935059, + "learning_rate": 4.5752649816551164e-06, + "loss": 2.7406, + "step": 63900 + }, + { + "epoch": 4.341962223128142, + "grad_norm": 7.506004333496094, + "learning_rate": 4.574840331566789e-06, + "loss": 2.7885, + "step": 63905 + }, + { + "epoch": 4.342301943198804, + "grad_norm": 7.728633880615234, + "learning_rate": 4.574415681478462e-06, + "loss": 2.7033, + "step": 63910 + }, + { + "epoch": 4.3426416632694655, + "grad_norm": 6.481416702270508, + "learning_rate": 4.573991031390135e-06, + "loss": 2.9565, + "step": 63915 + }, + { + "epoch": 4.342981383340128, + "grad_norm": 8.156644821166992, + "learning_rate": 4.573566381301808e-06, + "loss": 3.0375, + "step": 63920 + }, + { + "epoch": 4.34332110341079, + "grad_norm": 6.55365514755249, + "learning_rate": 4.5731417312134804e-06, + "loss": 3.0585, + "step": 63925 + }, + { + "epoch": 4.343660823481451, + "grad_norm": 6.885136127471924, + "learning_rate": 4.572717081125153e-06, + "loss": 2.9849, + "step": 63930 + }, + { + "epoch": 4.344000543552113, + "grad_norm": 6.735259532928467, + "learning_rate": 4.572292431036826e-06, + "loss": 2.9405, + "step": 63935 + }, + { + "epoch": 4.344340263622775, + "grad_norm": 7.096541881561279, + "learning_rate": 4.571867780948499e-06, + "loss": 2.7999, + "step": 63940 + }, + { + "epoch": 4.344679983693436, + "grad_norm": 8.944013595581055, + "learning_rate": 4.571443130860172e-06, + "loss": 2.7504, + "step": 63945 + }, + { + "epoch": 4.345019703764098, + "grad_norm": 6.5519890785217285, + "learning_rate": 4.5710184807718444e-06, + "loss": 2.8213, + "step": 63950 + }, + { + "epoch": 4.34535942383476, + "grad_norm": 7.392837047576904, + "learning_rate": 4.570593830683517e-06, + "loss": 2.7, + "step": 63955 + }, + { + "epoch": 4.345699143905422, + "grad_norm": 7.092373847961426, + "learning_rate": 4.57016918059519e-06, + "loss": 2.8299, + "step": 63960 + }, + { + "epoch": 4.346038863976084, + "grad_norm": 7.77861213684082, + "learning_rate": 4.569744530506863e-06, + "loss": 2.6503, + "step": 63965 + }, + { + "epoch": 4.346378584046746, + "grad_norm": 7.936300754547119, + "learning_rate": 4.569319880418536e-06, + "loss": 2.4607, + "step": 63970 + }, + { + "epoch": 4.346718304117407, + "grad_norm": 7.248700141906738, + "learning_rate": 4.5688952303302084e-06, + "loss": 2.9822, + "step": 63975 + }, + { + "epoch": 4.347058024188069, + "grad_norm": 7.905431747436523, + "learning_rate": 4.568470580241881e-06, + "loss": 2.6937, + "step": 63980 + }, + { + "epoch": 4.347397744258731, + "grad_norm": 6.545019626617432, + "learning_rate": 4.568045930153554e-06, + "loss": 3.0458, + "step": 63985 + }, + { + "epoch": 4.347737464329392, + "grad_norm": 8.293449401855469, + "learning_rate": 4.567621280065227e-06, + "loss": 2.8103, + "step": 63990 + }, + { + "epoch": 4.348077184400054, + "grad_norm": 7.428039073944092, + "learning_rate": 4.5671966299769e-06, + "loss": 2.953, + "step": 63995 + }, + { + "epoch": 4.348416904470716, + "grad_norm": 6.458433151245117, + "learning_rate": 4.5667719798885724e-06, + "loss": 2.6642, + "step": 64000 + }, + { + "epoch": 4.348756624541378, + "grad_norm": 7.604665756225586, + "learning_rate": 4.566347329800244e-06, + "loss": 2.7443, + "step": 64005 + }, + { + "epoch": 4.34909634461204, + "grad_norm": 8.639554023742676, + "learning_rate": 4.565922679711918e-06, + "loss": 2.9335, + "step": 64010 + }, + { + "epoch": 4.349436064682702, + "grad_norm": 8.347829818725586, + "learning_rate": 4.565498029623591e-06, + "loss": 2.5826, + "step": 64015 + }, + { + "epoch": 4.349775784753363, + "grad_norm": 7.69260311126709, + "learning_rate": 4.565073379535263e-06, + "loss": 2.9871, + "step": 64020 + }, + { + "epoch": 4.350115504824025, + "grad_norm": 7.99383020401001, + "learning_rate": 4.5646487294469364e-06, + "loss": 2.6206, + "step": 64025 + }, + { + "epoch": 4.350455224894687, + "grad_norm": 7.614063739776611, + "learning_rate": 4.564224079358609e-06, + "loss": 3.0419, + "step": 64030 + }, + { + "epoch": 4.350794944965348, + "grad_norm": 7.082902908325195, + "learning_rate": 4.563799429270281e-06, + "loss": 2.7262, + "step": 64035 + }, + { + "epoch": 4.35113466503601, + "grad_norm": 8.092765808105469, + "learning_rate": 4.563374779181954e-06, + "loss": 3.0864, + "step": 64040 + }, + { + "epoch": 4.351474385106672, + "grad_norm": 5.938857555389404, + "learning_rate": 4.562950129093628e-06, + "loss": 2.84, + "step": 64045 + }, + { + "epoch": 4.351814105177334, + "grad_norm": 7.236920356750488, + "learning_rate": 4.5625254790053e-06, + "loss": 2.3872, + "step": 64050 + }, + { + "epoch": 4.352153825247996, + "grad_norm": 6.3980183601379395, + "learning_rate": 4.562100828916972e-06, + "loss": 2.7515, + "step": 64055 + }, + { + "epoch": 4.352493545318658, + "grad_norm": 8.68111801147461, + "learning_rate": 4.561676178828646e-06, + "loss": 2.8652, + "step": 64060 + }, + { + "epoch": 4.352833265389319, + "grad_norm": 7.904919624328613, + "learning_rate": 4.561251528740319e-06, + "loss": 3.1047, + "step": 64065 + }, + { + "epoch": 4.353172985459981, + "grad_norm": 6.882758140563965, + "learning_rate": 4.560826878651991e-06, + "loss": 2.9059, + "step": 64070 + }, + { + "epoch": 4.353512705530643, + "grad_norm": 7.55946159362793, + "learning_rate": 4.560402228563664e-06, + "loss": 2.9695, + "step": 64075 + }, + { + "epoch": 4.353852425601304, + "grad_norm": 6.780253887176514, + "learning_rate": 4.559977578475337e-06, + "loss": 2.8726, + "step": 64080 + }, + { + "epoch": 4.354192145671966, + "grad_norm": 8.742928504943848, + "learning_rate": 4.559552928387009e-06, + "loss": 2.6627, + "step": 64085 + }, + { + "epoch": 4.354531865742628, + "grad_norm": 6.450955867767334, + "learning_rate": 4.559128278298682e-06, + "loss": 2.7738, + "step": 64090 + }, + { + "epoch": 4.35487158581329, + "grad_norm": 6.342488765716553, + "learning_rate": 4.558703628210356e-06, + "loss": 3.0317, + "step": 64095 + }, + { + "epoch": 4.355211305883952, + "grad_norm": 6.867430686950684, + "learning_rate": 4.558278978122028e-06, + "loss": 2.7527, + "step": 64100 + }, + { + "epoch": 4.355551025954614, + "grad_norm": 7.036966323852539, + "learning_rate": 4.5578543280337e-06, + "loss": 2.9074, + "step": 64105 + }, + { + "epoch": 4.355890746025275, + "grad_norm": 6.594929218292236, + "learning_rate": 4.557429677945373e-06, + "loss": 2.8961, + "step": 64110 + }, + { + "epoch": 4.356230466095937, + "grad_norm": 6.917245388031006, + "learning_rate": 4.557005027857046e-06, + "loss": 2.6753, + "step": 64115 + }, + { + "epoch": 4.356570186166599, + "grad_norm": 6.406293869018555, + "learning_rate": 4.556580377768719e-06, + "loss": 3.087, + "step": 64120 + }, + { + "epoch": 4.35690990623726, + "grad_norm": 7.906967639923096, + "learning_rate": 4.556155727680392e-06, + "loss": 2.9205, + "step": 64125 + }, + { + "epoch": 4.357249626307922, + "grad_norm": 6.671586513519287, + "learning_rate": 4.555731077592064e-06, + "loss": 2.7473, + "step": 64130 + }, + { + "epoch": 4.357589346378584, + "grad_norm": 5.688414573669434, + "learning_rate": 4.555306427503737e-06, + "loss": 2.869, + "step": 64135 + }, + { + "epoch": 4.357929066449246, + "grad_norm": 6.091311931610107, + "learning_rate": 4.55488177741541e-06, + "loss": 2.6913, + "step": 64140 + }, + { + "epoch": 4.358268786519908, + "grad_norm": 8.07968807220459, + "learning_rate": 4.554457127327083e-06, + "loss": 3.0338, + "step": 64145 + }, + { + "epoch": 4.35860850659057, + "grad_norm": 5.822421073913574, + "learning_rate": 4.554032477238756e-06, + "loss": 2.6979, + "step": 64150 + }, + { + "epoch": 4.358948226661231, + "grad_norm": 6.801706314086914, + "learning_rate": 4.553607827150428e-06, + "loss": 3.0306, + "step": 64155 + }, + { + "epoch": 4.359287946731893, + "grad_norm": 10.338948249816895, + "learning_rate": 4.553183177062101e-06, + "loss": 2.9344, + "step": 64160 + }, + { + "epoch": 4.359627666802555, + "grad_norm": 7.97553825378418, + "learning_rate": 4.552758526973774e-06, + "loss": 2.7325, + "step": 64165 + }, + { + "epoch": 4.359967386873216, + "grad_norm": 10.023709297180176, + "learning_rate": 4.552333876885447e-06, + "loss": 2.8368, + "step": 64170 + }, + { + "epoch": 4.360307106943878, + "grad_norm": 7.171899318695068, + "learning_rate": 4.55190922679712e-06, + "loss": 2.9087, + "step": 64175 + }, + { + "epoch": 4.36064682701454, + "grad_norm": 7.178852081298828, + "learning_rate": 4.5514845767087924e-06, + "loss": 2.8841, + "step": 64180 + }, + { + "epoch": 4.360986547085202, + "grad_norm": 6.853766441345215, + "learning_rate": 4.551059926620465e-06, + "loss": 3.0646, + "step": 64185 + }, + { + "epoch": 4.361326267155864, + "grad_norm": 6.863982677459717, + "learning_rate": 4.550635276532138e-06, + "loss": 2.8731, + "step": 64190 + }, + { + "epoch": 4.361665987226525, + "grad_norm": 7.687100887298584, + "learning_rate": 4.550210626443811e-06, + "loss": 2.9957, + "step": 64195 + }, + { + "epoch": 4.362005707297187, + "grad_norm": 8.07049560546875, + "learning_rate": 4.549785976355484e-06, + "loss": 2.692, + "step": 64200 + }, + { + "epoch": 4.362345427367849, + "grad_norm": 9.351568222045898, + "learning_rate": 4.549361326267156e-06, + "loss": 2.8748, + "step": 64205 + }, + { + "epoch": 4.36268514743851, + "grad_norm": 8.287212371826172, + "learning_rate": 4.548936676178829e-06, + "loss": 2.5999, + "step": 64210 + }, + { + "epoch": 4.363024867509172, + "grad_norm": 7.7772650718688965, + "learning_rate": 4.548512026090502e-06, + "loss": 2.8987, + "step": 64215 + }, + { + "epoch": 4.363364587579834, + "grad_norm": 6.118226528167725, + "learning_rate": 4.548087376002174e-06, + "loss": 2.9131, + "step": 64220 + }, + { + "epoch": 4.3637043076504956, + "grad_norm": 6.815870761871338, + "learning_rate": 4.547662725913848e-06, + "loss": 2.7503, + "step": 64225 + }, + { + "epoch": 4.364044027721158, + "grad_norm": 6.830853462219238, + "learning_rate": 4.5472380758255204e-06, + "loss": 2.9042, + "step": 64230 + }, + { + "epoch": 4.36438374779182, + "grad_norm": 9.570159912109375, + "learning_rate": 4.546813425737193e-06, + "loss": 2.836, + "step": 64235 + }, + { + "epoch": 4.364723467862481, + "grad_norm": 6.962830066680908, + "learning_rate": 4.546388775648865e-06, + "loss": 2.8645, + "step": 64240 + }, + { + "epoch": 4.365063187933143, + "grad_norm": 7.295672416687012, + "learning_rate": 4.545964125560539e-06, + "loss": 2.9908, + "step": 64245 + }, + { + "epoch": 4.365402908003805, + "grad_norm": 6.533268451690674, + "learning_rate": 4.545539475472212e-06, + "loss": 2.7141, + "step": 64250 + }, + { + "epoch": 4.365742628074466, + "grad_norm": 7.299379348754883, + "learning_rate": 4.545114825383884e-06, + "loss": 2.6904, + "step": 64255 + }, + { + "epoch": 4.366082348145128, + "grad_norm": 7.954117774963379, + "learning_rate": 4.544690175295557e-06, + "loss": 2.7079, + "step": 64260 + }, + { + "epoch": 4.36642206821579, + "grad_norm": 7.232874870300293, + "learning_rate": 4.54426552520723e-06, + "loss": 2.5347, + "step": 64265 + }, + { + "epoch": 4.366761788286452, + "grad_norm": 6.403970241546631, + "learning_rate": 4.543840875118902e-06, + "loss": 2.6239, + "step": 64270 + }, + { + "epoch": 4.367101508357114, + "grad_norm": 7.4125518798828125, + "learning_rate": 4.543416225030576e-06, + "loss": 2.9812, + "step": 64275 + }, + { + "epoch": 4.367441228427776, + "grad_norm": 8.37592887878418, + "learning_rate": 4.5429915749422484e-06, + "loss": 2.7989, + "step": 64280 + }, + { + "epoch": 4.367780948498437, + "grad_norm": 6.573644161224365, + "learning_rate": 4.54256692485392e-06, + "loss": 2.8228, + "step": 64285 + }, + { + "epoch": 4.368120668569099, + "grad_norm": 10.071675300598145, + "learning_rate": 4.542142274765593e-06, + "loss": 2.8601, + "step": 64290 + }, + { + "epoch": 4.368460388639761, + "grad_norm": 7.134279251098633, + "learning_rate": 4.541717624677267e-06, + "loss": 2.8966, + "step": 64295 + }, + { + "epoch": 4.368800108710422, + "grad_norm": 8.545855522155762, + "learning_rate": 4.541292974588939e-06, + "loss": 2.942, + "step": 64300 + }, + { + "epoch": 4.369139828781084, + "grad_norm": 7.841909885406494, + "learning_rate": 4.540868324500612e-06, + "loss": 3.1774, + "step": 64305 + }, + { + "epoch": 4.369479548851746, + "grad_norm": 7.293898582458496, + "learning_rate": 4.540443674412285e-06, + "loss": 2.7408, + "step": 64310 + }, + { + "epoch": 4.369819268922408, + "grad_norm": 6.264365196228027, + "learning_rate": 4.540019024323957e-06, + "loss": 2.6786, + "step": 64315 + }, + { + "epoch": 4.37015898899307, + "grad_norm": 6.671264171600342, + "learning_rate": 4.53959437423563e-06, + "loss": 3.046, + "step": 64320 + }, + { + "epoch": 4.370498709063732, + "grad_norm": 6.563968181610107, + "learning_rate": 4.539169724147303e-06, + "loss": 2.6914, + "step": 64325 + }, + { + "epoch": 4.370838429134393, + "grad_norm": 5.219542503356934, + "learning_rate": 4.538745074058976e-06, + "loss": 3.0912, + "step": 64330 + }, + { + "epoch": 4.371178149205055, + "grad_norm": 9.305569648742676, + "learning_rate": 4.538320423970648e-06, + "loss": 2.9733, + "step": 64335 + }, + { + "epoch": 4.371517869275717, + "grad_norm": 8.039407730102539, + "learning_rate": 4.537895773882321e-06, + "loss": 2.6171, + "step": 64340 + }, + { + "epoch": 4.371857589346378, + "grad_norm": 6.191408634185791, + "learning_rate": 4.537471123793994e-06, + "loss": 3.2766, + "step": 64345 + }, + { + "epoch": 4.37219730941704, + "grad_norm": 6.975499629974365, + "learning_rate": 4.537046473705667e-06, + "loss": 2.756, + "step": 64350 + }, + { + "epoch": 4.372537029487702, + "grad_norm": 7.476132869720459, + "learning_rate": 4.53662182361734e-06, + "loss": 2.9944, + "step": 64355 + }, + { + "epoch": 4.372876749558364, + "grad_norm": 7.717142105102539, + "learning_rate": 4.536197173529012e-06, + "loss": 2.8629, + "step": 64360 + }, + { + "epoch": 4.373216469629026, + "grad_norm": 7.935731887817383, + "learning_rate": 4.535772523440685e-06, + "loss": 3.0027, + "step": 64365 + }, + { + "epoch": 4.373556189699688, + "grad_norm": 7.391154766082764, + "learning_rate": 4.535347873352358e-06, + "loss": 2.5734, + "step": 64370 + }, + { + "epoch": 4.373895909770349, + "grad_norm": 7.118316173553467, + "learning_rate": 4.534923223264031e-06, + "loss": 3.0078, + "step": 64375 + }, + { + "epoch": 4.374235629841011, + "grad_norm": 6.0696635246276855, + "learning_rate": 4.534498573175704e-06, + "loss": 2.8934, + "step": 64380 + }, + { + "epoch": 4.374575349911673, + "grad_norm": 6.451412677764893, + "learning_rate": 4.534073923087376e-06, + "loss": 2.6218, + "step": 64385 + }, + { + "epoch": 4.374915069982334, + "grad_norm": 7.959374904632568, + "learning_rate": 4.533649272999049e-06, + "loss": 2.7278, + "step": 64390 + }, + { + "epoch": 4.375254790052996, + "grad_norm": 8.715719223022461, + "learning_rate": 4.533224622910722e-06, + "loss": 2.9985, + "step": 64395 + }, + { + "epoch": 4.375594510123658, + "grad_norm": 9.958508491516113, + "learning_rate": 4.532799972822395e-06, + "loss": 2.7441, + "step": 64400 + }, + { + "epoch": 4.37593423019432, + "grad_norm": 7.795088291168213, + "learning_rate": 4.532375322734068e-06, + "loss": 2.8728, + "step": 64405 + }, + { + "epoch": 4.376273950264982, + "grad_norm": 7.274155616760254, + "learning_rate": 4.53195067264574e-06, + "loss": 2.7673, + "step": 64410 + }, + { + "epoch": 4.376613670335644, + "grad_norm": 6.4864325523376465, + "learning_rate": 4.531526022557413e-06, + "loss": 2.9036, + "step": 64415 + }, + { + "epoch": 4.376953390406305, + "grad_norm": 5.69954252243042, + "learning_rate": 4.531101372469086e-06, + "loss": 2.9821, + "step": 64420 + }, + { + "epoch": 4.377293110476967, + "grad_norm": 9.206598281860352, + "learning_rate": 4.530676722380759e-06, + "loss": 2.8024, + "step": 64425 + }, + { + "epoch": 4.377632830547629, + "grad_norm": 8.467958450317383, + "learning_rate": 4.530252072292432e-06, + "loss": 3.1561, + "step": 64430 + }, + { + "epoch": 4.37797255061829, + "grad_norm": 6.293515682220459, + "learning_rate": 4.529827422204104e-06, + "loss": 3.1542, + "step": 64435 + }, + { + "epoch": 4.378312270688952, + "grad_norm": 7.05228853225708, + "learning_rate": 4.529402772115777e-06, + "loss": 2.6748, + "step": 64440 + }, + { + "epoch": 4.378651990759614, + "grad_norm": 6.942229270935059, + "learning_rate": 4.52897812202745e-06, + "loss": 2.7118, + "step": 64445 + }, + { + "epoch": 4.378991710830276, + "grad_norm": 9.094605445861816, + "learning_rate": 4.528553471939123e-06, + "loss": 3.2128, + "step": 64450 + }, + { + "epoch": 4.379331430900938, + "grad_norm": 6.607741832733154, + "learning_rate": 4.528128821850795e-06, + "loss": 2.7543, + "step": 64455 + }, + { + "epoch": 4.3796711509716, + "grad_norm": 6.150272369384766, + "learning_rate": 4.527704171762468e-06, + "loss": 2.783, + "step": 64460 + }, + { + "epoch": 4.380010871042261, + "grad_norm": 7.424169063568115, + "learning_rate": 4.527279521674141e-06, + "loss": 3.0003, + "step": 64465 + }, + { + "epoch": 4.380350591112923, + "grad_norm": 7.477290630340576, + "learning_rate": 4.526854871585813e-06, + "loss": 2.6776, + "step": 64470 + }, + { + "epoch": 4.380690311183585, + "grad_norm": 6.871988773345947, + "learning_rate": 4.526430221497487e-06, + "loss": 2.78, + "step": 64475 + }, + { + "epoch": 4.381030031254246, + "grad_norm": 7.595373630523682, + "learning_rate": 4.52600557140916e-06, + "loss": 2.8765, + "step": 64480 + }, + { + "epoch": 4.381369751324908, + "grad_norm": 7.318154811859131, + "learning_rate": 4.525580921320832e-06, + "loss": 2.9368, + "step": 64485 + }, + { + "epoch": 4.38170947139557, + "grad_norm": 8.118996620178223, + "learning_rate": 4.525156271232504e-06, + "loss": 3.0091, + "step": 64490 + }, + { + "epoch": 4.382049191466232, + "grad_norm": 9.05588150024414, + "learning_rate": 4.524731621144178e-06, + "loss": 2.9591, + "step": 64495 + }, + { + "epoch": 4.382388911536894, + "grad_norm": 8.283642768859863, + "learning_rate": 4.52430697105585e-06, + "loss": 2.6094, + "step": 64500 + }, + { + "epoch": 4.382728631607556, + "grad_norm": 8.409831047058105, + "learning_rate": 4.523882320967523e-06, + "loss": 3.1205, + "step": 64505 + }, + { + "epoch": 4.383068351678217, + "grad_norm": 7.347724437713623, + "learning_rate": 4.5234576708791964e-06, + "loss": 2.7894, + "step": 64510 + }, + { + "epoch": 4.383408071748879, + "grad_norm": 6.668837547302246, + "learning_rate": 4.523033020790868e-06, + "loss": 2.8851, + "step": 64515 + }, + { + "epoch": 4.383747791819541, + "grad_norm": 9.647384643554688, + "learning_rate": 4.522608370702541e-06, + "loss": 3.0241, + "step": 64520 + }, + { + "epoch": 4.384087511890202, + "grad_norm": 8.740947723388672, + "learning_rate": 4.522183720614214e-06, + "loss": 3.2081, + "step": 64525 + }, + { + "epoch": 4.384427231960864, + "grad_norm": 6.612226486206055, + "learning_rate": 4.521759070525887e-06, + "loss": 3.1135, + "step": 64530 + }, + { + "epoch": 4.384766952031526, + "grad_norm": 8.167317390441895, + "learning_rate": 4.52133442043756e-06, + "loss": 2.9401, + "step": 64535 + }, + { + "epoch": 4.385106672102188, + "grad_norm": 8.037854194641113, + "learning_rate": 4.520909770349232e-06, + "loss": 2.9908, + "step": 64540 + }, + { + "epoch": 4.38544639217285, + "grad_norm": 8.01064395904541, + "learning_rate": 4.520485120260905e-06, + "loss": 2.6795, + "step": 64545 + }, + { + "epoch": 4.385786112243512, + "grad_norm": 8.542391777038574, + "learning_rate": 4.520060470172578e-06, + "loss": 3.1017, + "step": 64550 + }, + { + "epoch": 4.386125832314173, + "grad_norm": 6.748038291931152, + "learning_rate": 4.519635820084251e-06, + "loss": 2.8434, + "step": 64555 + }, + { + "epoch": 4.386465552384835, + "grad_norm": 6.1513848304748535, + "learning_rate": 4.519211169995924e-06, + "loss": 2.8076, + "step": 64560 + }, + { + "epoch": 4.386805272455497, + "grad_norm": 7.907018661499023, + "learning_rate": 4.518786519907596e-06, + "loss": 3.0199, + "step": 64565 + }, + { + "epoch": 4.387144992526158, + "grad_norm": 7.604128360748291, + "learning_rate": 4.518361869819269e-06, + "loss": 2.8172, + "step": 64570 + }, + { + "epoch": 4.38748471259682, + "grad_norm": 9.417689323425293, + "learning_rate": 4.517937219730942e-06, + "loss": 2.8519, + "step": 64575 + }, + { + "epoch": 4.3878244326674825, + "grad_norm": 7.904128074645996, + "learning_rate": 4.517512569642615e-06, + "loss": 3.075, + "step": 64580 + }, + { + "epoch": 4.388164152738144, + "grad_norm": 5.426961421966553, + "learning_rate": 4.517087919554288e-06, + "loss": 3.0078, + "step": 64585 + }, + { + "epoch": 4.388503872808806, + "grad_norm": 7.628528118133545, + "learning_rate": 4.51666326946596e-06, + "loss": 2.9777, + "step": 64590 + }, + { + "epoch": 4.388843592879467, + "grad_norm": 6.823154449462891, + "learning_rate": 4.516238619377633e-06, + "loss": 2.996, + "step": 64595 + }, + { + "epoch": 4.389183312950129, + "grad_norm": 7.6384148597717285, + "learning_rate": 4.515813969289306e-06, + "loss": 2.8444, + "step": 64600 + }, + { + "epoch": 4.389523033020791, + "grad_norm": 8.401437759399414, + "learning_rate": 4.515389319200979e-06, + "loss": 3.1632, + "step": 64605 + }, + { + "epoch": 4.389862753091452, + "grad_norm": 8.593976974487305, + "learning_rate": 4.514964669112652e-06, + "loss": 3.0277, + "step": 64610 + }, + { + "epoch": 4.390202473162114, + "grad_norm": 7.58574914932251, + "learning_rate": 4.514540019024324e-06, + "loss": 3.1583, + "step": 64615 + }, + { + "epoch": 4.390542193232776, + "grad_norm": 6.928238868713379, + "learning_rate": 4.514115368935997e-06, + "loss": 2.8429, + "step": 64620 + }, + { + "epoch": 4.390881913303438, + "grad_norm": 8.27660846710205, + "learning_rate": 4.51369071884767e-06, + "loss": 2.8736, + "step": 64625 + }, + { + "epoch": 4.3912216333741, + "grad_norm": 7.439405918121338, + "learning_rate": 4.513266068759343e-06, + "loss": 2.6817, + "step": 64630 + }, + { + "epoch": 4.391561353444762, + "grad_norm": 7.372568607330322, + "learning_rate": 4.512841418671016e-06, + "loss": 2.751, + "step": 64635 + }, + { + "epoch": 4.391901073515423, + "grad_norm": 7.009113788604736, + "learning_rate": 4.512416768582688e-06, + "loss": 3.0553, + "step": 64640 + }, + { + "epoch": 4.392240793586085, + "grad_norm": 7.011868000030518, + "learning_rate": 4.511992118494361e-06, + "loss": 2.7486, + "step": 64645 + }, + { + "epoch": 4.392580513656747, + "grad_norm": 7.460968971252441, + "learning_rate": 4.511567468406034e-06, + "loss": 2.8248, + "step": 64650 + }, + { + "epoch": 4.392920233727408, + "grad_norm": 7.160153388977051, + "learning_rate": 4.511142818317707e-06, + "loss": 2.7889, + "step": 64655 + }, + { + "epoch": 4.39325995379807, + "grad_norm": 6.553597927093506, + "learning_rate": 4.51071816822938e-06, + "loss": 3.1119, + "step": 64660 + }, + { + "epoch": 4.393599673868732, + "grad_norm": 9.353087425231934, + "learning_rate": 4.510293518141052e-06, + "loss": 2.7716, + "step": 64665 + }, + { + "epoch": 4.393939393939394, + "grad_norm": 6.88578462600708, + "learning_rate": 4.509868868052724e-06, + "loss": 2.8024, + "step": 64670 + }, + { + "epoch": 4.394279114010056, + "grad_norm": 7.340888500213623, + "learning_rate": 4.509444217964398e-06, + "loss": 2.7928, + "step": 64675 + }, + { + "epoch": 4.394618834080718, + "grad_norm": 5.966917514801025, + "learning_rate": 4.509019567876071e-06, + "loss": 3.0172, + "step": 64680 + }, + { + "epoch": 4.394958554151379, + "grad_norm": 7.2274956703186035, + "learning_rate": 4.508594917787743e-06, + "loss": 3.001, + "step": 64685 + }, + { + "epoch": 4.395298274222041, + "grad_norm": 8.062185287475586, + "learning_rate": 4.508170267699416e-06, + "loss": 2.9136, + "step": 64690 + }, + { + "epoch": 4.395637994292703, + "grad_norm": 8.999799728393555, + "learning_rate": 4.507745617611089e-06, + "loss": 2.8582, + "step": 64695 + }, + { + "epoch": 4.395977714363364, + "grad_norm": 7.6598663330078125, + "learning_rate": 4.507320967522761e-06, + "loss": 2.6834, + "step": 64700 + }, + { + "epoch": 4.396317434434026, + "grad_norm": 6.967264175415039, + "learning_rate": 4.506896317434434e-06, + "loss": 2.6127, + "step": 64705 + }, + { + "epoch": 4.396657154504688, + "grad_norm": 7.245832443237305, + "learning_rate": 4.506471667346108e-06, + "loss": 2.6611, + "step": 64710 + }, + { + "epoch": 4.39699687457535, + "grad_norm": 6.085833549499512, + "learning_rate": 4.5060470172577796e-06, + "loss": 2.737, + "step": 64715 + }, + { + "epoch": 4.397336594646012, + "grad_norm": 9.28798770904541, + "learning_rate": 4.505622367169452e-06, + "loss": 2.7649, + "step": 64720 + }, + { + "epoch": 4.397676314716674, + "grad_norm": 8.417078971862793, + "learning_rate": 4.505197717081126e-06, + "loss": 2.7999, + "step": 64725 + }, + { + "epoch": 4.398016034787335, + "grad_norm": 6.508436679840088, + "learning_rate": 4.504773066992798e-06, + "loss": 2.6717, + "step": 64730 + }, + { + "epoch": 4.398355754857997, + "grad_norm": 5.164678573608398, + "learning_rate": 4.504348416904471e-06, + "loss": 3.0284, + "step": 64735 + }, + { + "epoch": 4.398695474928659, + "grad_norm": 8.082717895507812, + "learning_rate": 4.5039237668161436e-06, + "loss": 2.6552, + "step": 64740 + }, + { + "epoch": 4.39903519499932, + "grad_norm": 8.06281566619873, + "learning_rate": 4.503499116727817e-06, + "loss": 2.9639, + "step": 64745 + }, + { + "epoch": 4.399374915069982, + "grad_norm": 6.386168003082275, + "learning_rate": 4.503074466639489e-06, + "loss": 2.671, + "step": 64750 + }, + { + "epoch": 4.399714635140644, + "grad_norm": 6.8398895263671875, + "learning_rate": 4.502649816551162e-06, + "loss": 2.9946, + "step": 64755 + }, + { + "epoch": 4.400054355211306, + "grad_norm": 7.878732204437256, + "learning_rate": 4.502225166462836e-06, + "loss": 2.8716, + "step": 64760 + }, + { + "epoch": 4.400394075281968, + "grad_norm": 6.712821960449219, + "learning_rate": 4.5018005163745076e-06, + "loss": 2.728, + "step": 64765 + }, + { + "epoch": 4.40073379535263, + "grad_norm": 6.568000793457031, + "learning_rate": 4.50137586628618e-06, + "loss": 2.6917, + "step": 64770 + }, + { + "epoch": 4.401073515423291, + "grad_norm": 6.468076705932617, + "learning_rate": 4.500951216197853e-06, + "loss": 2.8338, + "step": 64775 + }, + { + "epoch": 4.401413235493953, + "grad_norm": 6.677975654602051, + "learning_rate": 4.500526566109526e-06, + "loss": 2.8378, + "step": 64780 + }, + { + "epoch": 4.401752955564615, + "grad_norm": 7.318912029266357, + "learning_rate": 4.500101916021199e-06, + "loss": 2.633, + "step": 64785 + }, + { + "epoch": 4.402092675635276, + "grad_norm": 7.498873233795166, + "learning_rate": 4.499677265932872e-06, + "loss": 2.8958, + "step": 64790 + }, + { + "epoch": 4.402432395705938, + "grad_norm": 8.09167766571045, + "learning_rate": 4.499252615844544e-06, + "loss": 2.9348, + "step": 64795 + }, + { + "epoch": 4.4027721157766, + "grad_norm": 7.804293632507324, + "learning_rate": 4.498827965756217e-06, + "loss": 2.5685, + "step": 64800 + }, + { + "epoch": 4.403111835847262, + "grad_norm": 9.417351722717285, + "learning_rate": 4.49840331566789e-06, + "loss": 2.8396, + "step": 64805 + }, + { + "epoch": 4.403451555917924, + "grad_norm": 8.429304122924805, + "learning_rate": 4.497978665579563e-06, + "loss": 2.859, + "step": 64810 + }, + { + "epoch": 4.403791275988586, + "grad_norm": 8.10791301727295, + "learning_rate": 4.497554015491236e-06, + "loss": 2.8697, + "step": 64815 + }, + { + "epoch": 4.404130996059247, + "grad_norm": 6.8878092765808105, + "learning_rate": 4.497129365402908e-06, + "loss": 2.7347, + "step": 64820 + }, + { + "epoch": 4.404470716129909, + "grad_norm": 5.707459926605225, + "learning_rate": 4.496704715314581e-06, + "loss": 2.6988, + "step": 64825 + }, + { + "epoch": 4.404810436200571, + "grad_norm": 6.071962356567383, + "learning_rate": 4.496280065226254e-06, + "loss": 3.1178, + "step": 64830 + }, + { + "epoch": 4.405150156271232, + "grad_norm": 7.398830413818359, + "learning_rate": 4.495855415137927e-06, + "loss": 3.0018, + "step": 64835 + }, + { + "epoch": 4.405489876341894, + "grad_norm": 7.252024173736572, + "learning_rate": 4.4954307650496e-06, + "loss": 2.8439, + "step": 64840 + }, + { + "epoch": 4.4058295964125564, + "grad_norm": 7.373745918273926, + "learning_rate": 4.495006114961272e-06, + "loss": 2.5356, + "step": 64845 + }, + { + "epoch": 4.406169316483218, + "grad_norm": 6.426562309265137, + "learning_rate": 4.494581464872945e-06, + "loss": 2.8124, + "step": 64850 + }, + { + "epoch": 4.40650903655388, + "grad_norm": 6.726249694824219, + "learning_rate": 4.494156814784618e-06, + "loss": 2.8544, + "step": 64855 + }, + { + "epoch": 4.406848756624542, + "grad_norm": 8.278674125671387, + "learning_rate": 4.493732164696291e-06, + "loss": 2.6608, + "step": 64860 + }, + { + "epoch": 4.407188476695203, + "grad_norm": 7.061807155609131, + "learning_rate": 4.493307514607964e-06, + "loss": 2.88, + "step": 64865 + }, + { + "epoch": 4.407528196765865, + "grad_norm": 8.016139030456543, + "learning_rate": 4.4928828645196355e-06, + "loss": 2.7493, + "step": 64870 + }, + { + "epoch": 4.407867916836526, + "grad_norm": 9.356366157531738, + "learning_rate": 4.492458214431309e-06, + "loss": 2.9564, + "step": 64875 + }, + { + "epoch": 4.408207636907188, + "grad_norm": 8.099394798278809, + "learning_rate": 4.492033564342982e-06, + "loss": 3.0823, + "step": 64880 + }, + { + "epoch": 4.40854735697785, + "grad_norm": 5.310863494873047, + "learning_rate": 4.491608914254654e-06, + "loss": 2.6583, + "step": 64885 + }, + { + "epoch": 4.408887077048512, + "grad_norm": 7.280906677246094, + "learning_rate": 4.491184264166328e-06, + "loss": 2.899, + "step": 64890 + }, + { + "epoch": 4.409226797119174, + "grad_norm": 6.56165885925293, + "learning_rate": 4.490759614078e-06, + "loss": 2.7778, + "step": 64895 + }, + { + "epoch": 4.409566517189836, + "grad_norm": 6.617203712463379, + "learning_rate": 4.490334963989672e-06, + "loss": 2.8183, + "step": 64900 + }, + { + "epoch": 4.409906237260497, + "grad_norm": 6.676570415496826, + "learning_rate": 4.489910313901346e-06, + "loss": 2.6926, + "step": 64905 + }, + { + "epoch": 4.410245957331159, + "grad_norm": 8.122987747192383, + "learning_rate": 4.489485663813019e-06, + "loss": 2.9916, + "step": 64910 + }, + { + "epoch": 4.410585677401821, + "grad_norm": 8.827649116516113, + "learning_rate": 4.489061013724692e-06, + "loss": 2.9636, + "step": 64915 + }, + { + "epoch": 4.410925397472482, + "grad_norm": 8.736635208129883, + "learning_rate": 4.4886363636363636e-06, + "loss": 2.5758, + "step": 64920 + }, + { + "epoch": 4.411265117543144, + "grad_norm": 7.627286434173584, + "learning_rate": 4.488211713548037e-06, + "loss": 2.7685, + "step": 64925 + }, + { + "epoch": 4.411604837613806, + "grad_norm": 5.601761817932129, + "learning_rate": 4.48778706345971e-06, + "loss": 2.6972, + "step": 64930 + }, + { + "epoch": 4.411944557684468, + "grad_norm": 7.976710319519043, + "learning_rate": 4.487362413371382e-06, + "loss": 3.0347, + "step": 64935 + }, + { + "epoch": 4.41228427775513, + "grad_norm": 6.707119464874268, + "learning_rate": 4.486937763283056e-06, + "loss": 2.9181, + "step": 64940 + }, + { + "epoch": 4.412623997825792, + "grad_norm": 6.271629810333252, + "learning_rate": 4.486513113194728e-06, + "loss": 2.8594, + "step": 64945 + }, + { + "epoch": 4.412963717896453, + "grad_norm": 8.358573913574219, + "learning_rate": 4.4860884631064e-06, + "loss": 2.808, + "step": 64950 + }, + { + "epoch": 4.413303437967115, + "grad_norm": 8.109557151794434, + "learning_rate": 4.485663813018073e-06, + "loss": 2.8931, + "step": 64955 + }, + { + "epoch": 4.413643158037777, + "grad_norm": 7.770748138427734, + "learning_rate": 4.485239162929747e-06, + "loss": 3.1024, + "step": 64960 + }, + { + "epoch": 4.413982878108438, + "grad_norm": 5.1503005027771, + "learning_rate": 4.484814512841419e-06, + "loss": 2.7308, + "step": 64965 + }, + { + "epoch": 4.4143225981791, + "grad_norm": 8.991857528686523, + "learning_rate": 4.4843898627530916e-06, + "loss": 2.978, + "step": 64970 + }, + { + "epoch": 4.414662318249762, + "grad_norm": 6.40927791595459, + "learning_rate": 4.483965212664765e-06, + "loss": 2.8035, + "step": 64975 + }, + { + "epoch": 4.415002038320424, + "grad_norm": 7.200418472290039, + "learning_rate": 4.483540562576437e-06, + "loss": 3.0138, + "step": 64980 + }, + { + "epoch": 4.415341758391086, + "grad_norm": 8.89847469329834, + "learning_rate": 4.48311591248811e-06, + "loss": 3.0604, + "step": 64985 + }, + { + "epoch": 4.415681478461748, + "grad_norm": 7.135839462280273, + "learning_rate": 4.482691262399783e-06, + "loss": 3.0965, + "step": 64990 + }, + { + "epoch": 4.416021198532409, + "grad_norm": 6.787608623504639, + "learning_rate": 4.4822666123114556e-06, + "loss": 2.8976, + "step": 64995 + }, + { + "epoch": 4.416360918603071, + "grad_norm": 7.779758930206299, + "learning_rate": 4.481841962223128e-06, + "loss": 2.6288, + "step": 65000 + }, + { + "epoch": 4.416700638673733, + "grad_norm": 5.549878120422363, + "learning_rate": 4.481417312134801e-06, + "loss": 2.3902, + "step": 65005 + }, + { + "epoch": 4.417040358744394, + "grad_norm": 6.500032901763916, + "learning_rate": 4.480992662046474e-06, + "loss": 2.7234, + "step": 65010 + }, + { + "epoch": 4.417380078815056, + "grad_norm": 7.659340858459473, + "learning_rate": 4.480568011958147e-06, + "loss": 2.8453, + "step": 65015 + }, + { + "epoch": 4.417719798885718, + "grad_norm": 8.115215301513672, + "learning_rate": 4.4801433618698196e-06, + "loss": 2.9095, + "step": 65020 + }, + { + "epoch": 4.41805951895638, + "grad_norm": 7.002143383026123, + "learning_rate": 4.479718711781492e-06, + "loss": 2.704, + "step": 65025 + }, + { + "epoch": 4.418399239027042, + "grad_norm": 8.473146438598633, + "learning_rate": 4.479294061693165e-06, + "loss": 2.97, + "step": 65030 + }, + { + "epoch": 4.418738959097704, + "grad_norm": 6.6659369468688965, + "learning_rate": 4.478869411604838e-06, + "loss": 2.8916, + "step": 65035 + }, + { + "epoch": 4.419078679168365, + "grad_norm": 6.002534866333008, + "learning_rate": 4.478444761516511e-06, + "loss": 2.771, + "step": 65040 + }, + { + "epoch": 4.419418399239027, + "grad_norm": 8.397666931152344, + "learning_rate": 4.4780201114281836e-06, + "loss": 2.6676, + "step": 65045 + }, + { + "epoch": 4.419758119309689, + "grad_norm": 9.085214614868164, + "learning_rate": 4.477595461339856e-06, + "loss": 2.869, + "step": 65050 + }, + { + "epoch": 4.42009783938035, + "grad_norm": 6.455034255981445, + "learning_rate": 4.477170811251529e-06, + "loss": 3.0995, + "step": 65055 + }, + { + "epoch": 4.420437559451012, + "grad_norm": 7.207921504974365, + "learning_rate": 4.476746161163202e-06, + "loss": 3.0507, + "step": 65060 + }, + { + "epoch": 4.420777279521674, + "grad_norm": 6.515471458435059, + "learning_rate": 4.476321511074875e-06, + "loss": 2.868, + "step": 65065 + }, + { + "epoch": 4.421116999592336, + "grad_norm": 7.407264232635498, + "learning_rate": 4.4758968609865476e-06, + "loss": 2.6834, + "step": 65070 + }, + { + "epoch": 4.421456719662998, + "grad_norm": 9.26873779296875, + "learning_rate": 4.47547221089822e-06, + "loss": 2.738, + "step": 65075 + }, + { + "epoch": 4.42179643973366, + "grad_norm": 7.293568134307861, + "learning_rate": 4.475047560809893e-06, + "loss": 2.7482, + "step": 65080 + }, + { + "epoch": 4.422136159804321, + "grad_norm": 7.016181945800781, + "learning_rate": 4.474622910721566e-06, + "loss": 3.0904, + "step": 65085 + }, + { + "epoch": 4.422475879874983, + "grad_norm": 7.154242992401123, + "learning_rate": 4.474198260633239e-06, + "loss": 2.6018, + "step": 65090 + }, + { + "epoch": 4.422815599945645, + "grad_norm": 7.539826393127441, + "learning_rate": 4.473773610544912e-06, + "loss": 2.8973, + "step": 65095 + }, + { + "epoch": 4.423155320016306, + "grad_norm": 7.718934535980225, + "learning_rate": 4.473348960456584e-06, + "loss": 2.9724, + "step": 65100 + }, + { + "epoch": 4.423495040086968, + "grad_norm": 7.520257472991943, + "learning_rate": 4.472924310368257e-06, + "loss": 2.9218, + "step": 65105 + }, + { + "epoch": 4.42383476015763, + "grad_norm": 6.292236804962158, + "learning_rate": 4.47249966027993e-06, + "loss": 2.8747, + "step": 65110 + }, + { + "epoch": 4.424174480228292, + "grad_norm": 8.735925674438477, + "learning_rate": 4.472075010191603e-06, + "loss": 2.9588, + "step": 65115 + }, + { + "epoch": 4.424514200298954, + "grad_norm": 8.635765075683594, + "learning_rate": 4.471650360103275e-06, + "loss": 2.8407, + "step": 65120 + }, + { + "epoch": 4.424853920369616, + "grad_norm": 8.99227237701416, + "learning_rate": 4.471225710014948e-06, + "loss": 2.8285, + "step": 65125 + }, + { + "epoch": 4.425193640440277, + "grad_norm": 7.689553260803223, + "learning_rate": 4.470801059926621e-06, + "loss": 2.6704, + "step": 65130 + }, + { + "epoch": 4.425533360510939, + "grad_norm": 6.3766374588012695, + "learning_rate": 4.470376409838293e-06, + "loss": 2.7502, + "step": 65135 + }, + { + "epoch": 4.425873080581601, + "grad_norm": 7.543941974639893, + "learning_rate": 4.469951759749967e-06, + "loss": 2.7987, + "step": 65140 + }, + { + "epoch": 4.426212800652262, + "grad_norm": 9.734946250915527, + "learning_rate": 4.46952710966164e-06, + "loss": 3.0347, + "step": 65145 + }, + { + "epoch": 4.426552520722924, + "grad_norm": 8.342293739318848, + "learning_rate": 4.4691024595733115e-06, + "loss": 2.9649, + "step": 65150 + }, + { + "epoch": 4.4268922407935865, + "grad_norm": 7.043417930603027, + "learning_rate": 4.468677809484984e-06, + "loss": 2.8215, + "step": 65155 + }, + { + "epoch": 4.427231960864248, + "grad_norm": 7.492164611816406, + "learning_rate": 4.468253159396658e-06, + "loss": 3.2615, + "step": 65160 + }, + { + "epoch": 4.42757168093491, + "grad_norm": 8.01657485961914, + "learning_rate": 4.46782850930833e-06, + "loss": 3.07, + "step": 65165 + }, + { + "epoch": 4.427911401005572, + "grad_norm": 6.414392471313477, + "learning_rate": 4.467403859220003e-06, + "loss": 3.0317, + "step": 65170 + }, + { + "epoch": 4.428251121076233, + "grad_norm": 6.462695598602295, + "learning_rate": 4.466979209131676e-06, + "loss": 2.7309, + "step": 65175 + }, + { + "epoch": 4.428590841146895, + "grad_norm": 6.751462936401367, + "learning_rate": 4.466554559043348e-06, + "loss": 2.9832, + "step": 65180 + }, + { + "epoch": 4.428930561217557, + "grad_norm": 6.525119304656982, + "learning_rate": 4.466129908955021e-06, + "loss": 2.9642, + "step": 65185 + }, + { + "epoch": 4.429270281288218, + "grad_norm": 6.691567420959473, + "learning_rate": 4.465705258866695e-06, + "loss": 2.9838, + "step": 65190 + }, + { + "epoch": 4.42961000135888, + "grad_norm": 7.388646125793457, + "learning_rate": 4.465280608778367e-06, + "loss": 2.8018, + "step": 65195 + }, + { + "epoch": 4.4299497214295425, + "grad_norm": 6.662818431854248, + "learning_rate": 4.4648559586900395e-06, + "loss": 3.0719, + "step": 65200 + }, + { + "epoch": 4.430289441500204, + "grad_norm": 7.332695960998535, + "learning_rate": 4.464431308601712e-06, + "loss": 2.9282, + "step": 65205 + }, + { + "epoch": 4.430629161570866, + "grad_norm": 5.665883541107178, + "learning_rate": 4.464006658513385e-06, + "loss": 2.6178, + "step": 65210 + }, + { + "epoch": 4.430968881641528, + "grad_norm": 8.740418434143066, + "learning_rate": 4.463582008425058e-06, + "loss": 2.9609, + "step": 65215 + }, + { + "epoch": 4.431308601712189, + "grad_norm": 7.181007385253906, + "learning_rate": 4.463157358336731e-06, + "loss": 3.0314, + "step": 65220 + }, + { + "epoch": 4.431648321782851, + "grad_norm": 7.223173141479492, + "learning_rate": 4.4627327082484036e-06, + "loss": 2.8777, + "step": 65225 + }, + { + "epoch": 4.431988041853513, + "grad_norm": 8.360816955566406, + "learning_rate": 4.462308058160076e-06, + "loss": 2.9106, + "step": 65230 + }, + { + "epoch": 4.432327761924174, + "grad_norm": 9.134892463684082, + "learning_rate": 4.461883408071749e-06, + "loss": 3.0239, + "step": 65235 + }, + { + "epoch": 4.432667481994836, + "grad_norm": 8.175055503845215, + "learning_rate": 4.461458757983422e-06, + "loss": 2.9709, + "step": 65240 + }, + { + "epoch": 4.4330072020654985, + "grad_norm": 7.631781578063965, + "learning_rate": 4.461034107895095e-06, + "loss": 2.8579, + "step": 65245 + }, + { + "epoch": 4.43334692213616, + "grad_norm": 6.099722385406494, + "learning_rate": 4.4606094578067676e-06, + "loss": 3.2442, + "step": 65250 + }, + { + "epoch": 4.433686642206822, + "grad_norm": 8.734107971191406, + "learning_rate": 4.46018480771844e-06, + "loss": 2.8642, + "step": 65255 + }, + { + "epoch": 4.434026362277484, + "grad_norm": 9.15400218963623, + "learning_rate": 4.459760157630113e-06, + "loss": 2.8216, + "step": 65260 + }, + { + "epoch": 4.434366082348145, + "grad_norm": 11.141730308532715, + "learning_rate": 4.459335507541786e-06, + "loss": 3.1397, + "step": 65265 + }, + { + "epoch": 4.434705802418807, + "grad_norm": 7.580162525177002, + "learning_rate": 4.458910857453459e-06, + "loss": 3.0589, + "step": 65270 + }, + { + "epoch": 4.435045522489468, + "grad_norm": 7.502528190612793, + "learning_rate": 4.4584862073651316e-06, + "loss": 2.8372, + "step": 65275 + }, + { + "epoch": 4.43538524256013, + "grad_norm": 7.534671306610107, + "learning_rate": 4.458061557276804e-06, + "loss": 2.8288, + "step": 65280 + }, + { + "epoch": 4.435724962630792, + "grad_norm": 8.058333396911621, + "learning_rate": 4.457636907188477e-06, + "loss": 3.1966, + "step": 65285 + }, + { + "epoch": 4.436064682701454, + "grad_norm": 8.150233268737793, + "learning_rate": 4.45721225710015e-06, + "loss": 2.5997, + "step": 65290 + }, + { + "epoch": 4.436404402772116, + "grad_norm": 11.058812141418457, + "learning_rate": 4.456787607011823e-06, + "loss": 2.8085, + "step": 65295 + }, + { + "epoch": 4.436744122842778, + "grad_norm": 7.810964584350586, + "learning_rate": 4.4563629569234956e-06, + "loss": 2.9217, + "step": 65300 + }, + { + "epoch": 4.437083842913439, + "grad_norm": 7.676878929138184, + "learning_rate": 4.455938306835168e-06, + "loss": 2.9835, + "step": 65305 + }, + { + "epoch": 4.437423562984101, + "grad_norm": 7.918145656585693, + "learning_rate": 4.455513656746841e-06, + "loss": 3.0522, + "step": 65310 + }, + { + "epoch": 4.437763283054763, + "grad_norm": 8.05861759185791, + "learning_rate": 4.455089006658514e-06, + "loss": 2.7165, + "step": 65315 + }, + { + "epoch": 4.438103003125424, + "grad_norm": 6.884952545166016, + "learning_rate": 4.454664356570187e-06, + "loss": 3.0673, + "step": 65320 + }, + { + "epoch": 4.438442723196086, + "grad_norm": 6.654789924621582, + "learning_rate": 4.4542397064818596e-06, + "loss": 2.7063, + "step": 65325 + }, + { + "epoch": 4.438782443266748, + "grad_norm": 5.775970458984375, + "learning_rate": 4.453815056393532e-06, + "loss": 2.7208, + "step": 65330 + }, + { + "epoch": 4.43912216333741, + "grad_norm": 7.969020366668701, + "learning_rate": 4.453390406305204e-06, + "loss": 3.0629, + "step": 65335 + }, + { + "epoch": 4.439461883408072, + "grad_norm": 7.482897758483887, + "learning_rate": 4.452965756216878e-06, + "loss": 2.7009, + "step": 65340 + }, + { + "epoch": 4.439801603478734, + "grad_norm": 6.641744613647461, + "learning_rate": 4.452541106128551e-06, + "loss": 2.825, + "step": 65345 + }, + { + "epoch": 4.440141323549395, + "grad_norm": 6.493098735809326, + "learning_rate": 4.452116456040223e-06, + "loss": 2.972, + "step": 65350 + }, + { + "epoch": 4.440481043620057, + "grad_norm": 6.651182651519775, + "learning_rate": 4.451691805951896e-06, + "loss": 2.7417, + "step": 65355 + }, + { + "epoch": 4.440820763690719, + "grad_norm": 7.5370635986328125, + "learning_rate": 4.451267155863569e-06, + "loss": 2.6403, + "step": 65360 + }, + { + "epoch": 4.44116048376138, + "grad_norm": 7.589852333068848, + "learning_rate": 4.450842505775241e-06, + "loss": 3.0777, + "step": 65365 + }, + { + "epoch": 4.441500203832042, + "grad_norm": 6.953929424285889, + "learning_rate": 4.450502785704579e-06, + "loss": 2.7931, + "step": 65370 + }, + { + "epoch": 4.441839923902704, + "grad_norm": 8.090773582458496, + "learning_rate": 4.450078135616253e-06, + "loss": 2.6209, + "step": 65375 + }, + { + "epoch": 4.442179643973366, + "grad_norm": 7.724896430969238, + "learning_rate": 4.449653485527926e-06, + "loss": 2.9311, + "step": 65380 + }, + { + "epoch": 4.442519364044028, + "grad_norm": 7.01217794418335, + "learning_rate": 4.449228835439598e-06, + "loss": 2.6434, + "step": 65385 + }, + { + "epoch": 4.44285908411469, + "grad_norm": 8.204504013061523, + "learning_rate": 4.448804185351271e-06, + "loss": 2.9479, + "step": 65390 + }, + { + "epoch": 4.443198804185351, + "grad_norm": 7.580327033996582, + "learning_rate": 4.448379535262944e-06, + "loss": 2.8667, + "step": 65395 + }, + { + "epoch": 4.443538524256013, + "grad_norm": 7.8426995277404785, + "learning_rate": 4.447954885174616e-06, + "loss": 2.9084, + "step": 65400 + }, + { + "epoch": 4.443878244326675, + "grad_norm": 7.259670734405518, + "learning_rate": 4.44753023508629e-06, + "loss": 2.7772, + "step": 65405 + }, + { + "epoch": 4.444217964397336, + "grad_norm": 6.657705307006836, + "learning_rate": 4.4471055849979625e-06, + "loss": 2.7687, + "step": 65410 + }, + { + "epoch": 4.444557684467998, + "grad_norm": 7.520407199859619, + "learning_rate": 4.446680934909634e-06, + "loss": 2.8878, + "step": 65415 + }, + { + "epoch": 4.4448974045386604, + "grad_norm": 7.474799156188965, + "learning_rate": 4.446256284821307e-06, + "loss": 2.9367, + "step": 65420 + }, + { + "epoch": 4.445237124609322, + "grad_norm": 7.186287879943848, + "learning_rate": 4.445831634732981e-06, + "loss": 2.9823, + "step": 65425 + }, + { + "epoch": 4.445576844679984, + "grad_norm": 8.19920539855957, + "learning_rate": 4.445406984644653e-06, + "loss": 3.0206, + "step": 65430 + }, + { + "epoch": 4.445916564750646, + "grad_norm": 8.817578315734863, + "learning_rate": 4.444982334556326e-06, + "loss": 2.8569, + "step": 65435 + }, + { + "epoch": 4.446256284821307, + "grad_norm": 5.841573238372803, + "learning_rate": 4.444557684467999e-06, + "loss": 2.9408, + "step": 65440 + }, + { + "epoch": 4.446596004891969, + "grad_norm": 7.521056175231934, + "learning_rate": 4.444133034379671e-06, + "loss": 2.8716, + "step": 65445 + }, + { + "epoch": 4.446935724962631, + "grad_norm": 7.027820110321045, + "learning_rate": 4.443708384291344e-06, + "loss": 2.9154, + "step": 65450 + }, + { + "epoch": 4.447275445033292, + "grad_norm": 6.962704181671143, + "learning_rate": 4.443283734203017e-06, + "loss": 2.9533, + "step": 65455 + }, + { + "epoch": 4.447615165103954, + "grad_norm": 6.687907695770264, + "learning_rate": 4.4428590841146905e-06, + "loss": 2.7999, + "step": 65460 + }, + { + "epoch": 4.4479548851746165, + "grad_norm": 9.109600067138672, + "learning_rate": 4.442434434026362e-06, + "loss": 3.0561, + "step": 65465 + }, + { + "epoch": 4.448294605245278, + "grad_norm": 7.776969909667969, + "learning_rate": 4.442009783938035e-06, + "loss": 2.804, + "step": 65470 + }, + { + "epoch": 4.44863432531594, + "grad_norm": 7.303133010864258, + "learning_rate": 4.441585133849709e-06, + "loss": 2.8039, + "step": 65475 + }, + { + "epoch": 4.448974045386602, + "grad_norm": 9.585874557495117, + "learning_rate": 4.441160483761381e-06, + "loss": 2.8126, + "step": 65480 + }, + { + "epoch": 4.449313765457263, + "grad_norm": 7.466758728027344, + "learning_rate": 4.440735833673054e-06, + "loss": 2.7944, + "step": 65485 + }, + { + "epoch": 4.449653485527925, + "grad_norm": 7.23909854888916, + "learning_rate": 4.440311183584726e-06, + "loss": 2.885, + "step": 65490 + }, + { + "epoch": 4.449993205598587, + "grad_norm": 7.426211357116699, + "learning_rate": 4.439886533496399e-06, + "loss": 3.0059, + "step": 65495 + }, + { + "epoch": 4.450332925669248, + "grad_norm": 5.923503398895264, + "learning_rate": 4.439461883408072e-06, + "loss": 2.8629, + "step": 65500 + }, + { + "epoch": 4.45067264573991, + "grad_norm": 6.499667167663574, + "learning_rate": 4.439037233319745e-06, + "loss": 2.8452, + "step": 65505 + }, + { + "epoch": 4.4510123658105725, + "grad_norm": 7.217161178588867, + "learning_rate": 4.438612583231418e-06, + "loss": 2.8148, + "step": 65510 + }, + { + "epoch": 4.451352085881234, + "grad_norm": 7.033680438995361, + "learning_rate": 4.43818793314309e-06, + "loss": 3.1521, + "step": 65515 + }, + { + "epoch": 4.451691805951896, + "grad_norm": 7.3908467292785645, + "learning_rate": 4.437763283054763e-06, + "loss": 2.9514, + "step": 65520 + }, + { + "epoch": 4.452031526022558, + "grad_norm": 7.741453647613525, + "learning_rate": 4.437338632966436e-06, + "loss": 2.9914, + "step": 65525 + }, + { + "epoch": 4.452371246093219, + "grad_norm": 6.454855442047119, + "learning_rate": 4.436913982878109e-06, + "loss": 2.472, + "step": 65530 + }, + { + "epoch": 4.452710966163881, + "grad_norm": 7.142010688781738, + "learning_rate": 4.436489332789782e-06, + "loss": 2.6196, + "step": 65535 + }, + { + "epoch": 4.453050686234543, + "grad_norm": 6.520177364349365, + "learning_rate": 4.436064682701454e-06, + "loss": 2.7856, + "step": 65540 + }, + { + "epoch": 4.453390406305204, + "grad_norm": 7.057395935058594, + "learning_rate": 4.435640032613127e-06, + "loss": 2.9651, + "step": 65545 + }, + { + "epoch": 4.453730126375866, + "grad_norm": 6.199829578399658, + "learning_rate": 4.4352153825248e-06, + "loss": 2.7526, + "step": 65550 + }, + { + "epoch": 4.454069846446528, + "grad_norm": 6.121747970581055, + "learning_rate": 4.434790732436473e-06, + "loss": 2.8301, + "step": 65555 + }, + { + "epoch": 4.45440956651719, + "grad_norm": 6.059357643127441, + "learning_rate": 4.434366082348146e-06, + "loss": 2.8615, + "step": 65560 + }, + { + "epoch": 4.454749286587852, + "grad_norm": 7.634134292602539, + "learning_rate": 4.4339414322598184e-06, + "loss": 2.7747, + "step": 65565 + }, + { + "epoch": 4.455089006658513, + "grad_norm": 9.36118221282959, + "learning_rate": 4.433516782171491e-06, + "loss": 3.1175, + "step": 65570 + }, + { + "epoch": 4.455428726729175, + "grad_norm": 5.753622531890869, + "learning_rate": 4.433092132083164e-06, + "loss": 3.0217, + "step": 65575 + }, + { + "epoch": 4.455768446799837, + "grad_norm": 6.74358606338501, + "learning_rate": 4.432667481994837e-06, + "loss": 2.6856, + "step": 65580 + }, + { + "epoch": 4.456108166870498, + "grad_norm": 6.951475143432617, + "learning_rate": 4.432242831906509e-06, + "loss": 2.6191, + "step": 65585 + }, + { + "epoch": 4.45644788694116, + "grad_norm": 8.1795654296875, + "learning_rate": 4.4318181818181824e-06, + "loss": 2.7241, + "step": 65590 + }, + { + "epoch": 4.456787607011822, + "grad_norm": 7.20034646987915, + "learning_rate": 4.431393531729855e-06, + "loss": 2.9292, + "step": 65595 + }, + { + "epoch": 4.457127327082484, + "grad_norm": 7.956523418426514, + "learning_rate": 4.430968881641527e-06, + "loss": 2.843, + "step": 65600 + }, + { + "epoch": 4.457467047153146, + "grad_norm": 8.459524154663086, + "learning_rate": 4.430544231553201e-06, + "loss": 2.9532, + "step": 65605 + }, + { + "epoch": 4.457806767223808, + "grad_norm": 9.030925750732422, + "learning_rate": 4.430119581464874e-06, + "loss": 2.9644, + "step": 65610 + }, + { + "epoch": 4.458146487294469, + "grad_norm": 7.8725152015686035, + "learning_rate": 4.429694931376546e-06, + "loss": 2.8309, + "step": 65615 + }, + { + "epoch": 4.458486207365131, + "grad_norm": 6.205360412597656, + "learning_rate": 4.429270281288218e-06, + "loss": 2.9856, + "step": 65620 + }, + { + "epoch": 4.458825927435793, + "grad_norm": 7.217780590057373, + "learning_rate": 4.428845631199892e-06, + "loss": 2.527, + "step": 65625 + }, + { + "epoch": 4.459165647506454, + "grad_norm": 8.506919860839844, + "learning_rate": 4.428420981111565e-06, + "loss": 2.8409, + "step": 65630 + }, + { + "epoch": 4.459505367577116, + "grad_norm": 8.758769035339355, + "learning_rate": 4.427996331023237e-06, + "loss": 2.8928, + "step": 65635 + }, + { + "epoch": 4.459845087647778, + "grad_norm": 7.131397724151611, + "learning_rate": 4.4275716809349104e-06, + "loss": 3.0251, + "step": 65640 + }, + { + "epoch": 4.46018480771844, + "grad_norm": 9.464037895202637, + "learning_rate": 4.427147030846583e-06, + "loss": 2.8515, + "step": 65645 + }, + { + "epoch": 4.460524527789102, + "grad_norm": 6.83289098739624, + "learning_rate": 4.426722380758255e-06, + "loss": 2.7573, + "step": 65650 + }, + { + "epoch": 4.460864247859764, + "grad_norm": 8.148774147033691, + "learning_rate": 4.426297730669928e-06, + "loss": 2.7739, + "step": 65655 + }, + { + "epoch": 4.461203967930425, + "grad_norm": 8.974903106689453, + "learning_rate": 4.425873080581602e-06, + "loss": 2.7292, + "step": 65660 + }, + { + "epoch": 4.461543688001087, + "grad_norm": 5.958298206329346, + "learning_rate": 4.425448430493274e-06, + "loss": 2.8394, + "step": 65665 + }, + { + "epoch": 4.461883408071749, + "grad_norm": 7.340010166168213, + "learning_rate": 4.425023780404946e-06, + "loss": 2.903, + "step": 65670 + }, + { + "epoch": 4.46222312814241, + "grad_norm": 7.069703102111816, + "learning_rate": 4.42459913031662e-06, + "loss": 3.0788, + "step": 65675 + }, + { + "epoch": 4.462562848213072, + "grad_norm": 5.08614444732666, + "learning_rate": 4.424174480228292e-06, + "loss": 2.8296, + "step": 65680 + }, + { + "epoch": 4.462902568283734, + "grad_norm": 8.71953010559082, + "learning_rate": 4.423749830139965e-06, + "loss": 2.9096, + "step": 65685 + }, + { + "epoch": 4.463242288354396, + "grad_norm": 7.853511333465576, + "learning_rate": 4.4233251800516384e-06, + "loss": 2.9959, + "step": 65690 + }, + { + "epoch": 4.463582008425058, + "grad_norm": 9.428333282470703, + "learning_rate": 4.42290052996331e-06, + "loss": 2.6824, + "step": 65695 + }, + { + "epoch": 4.46392172849572, + "grad_norm": 8.179023742675781, + "learning_rate": 4.422475879874983e-06, + "loss": 2.8076, + "step": 65700 + }, + { + "epoch": 4.464261448566381, + "grad_norm": 6.72432279586792, + "learning_rate": 4.422051229786656e-06, + "loss": 2.8928, + "step": 65705 + }, + { + "epoch": 4.464601168637043, + "grad_norm": 7.09868860244751, + "learning_rate": 4.421626579698329e-06, + "loss": 2.6999, + "step": 65710 + }, + { + "epoch": 4.464940888707705, + "grad_norm": 7.580143928527832, + "learning_rate": 4.421201929610002e-06, + "loss": 2.9192, + "step": 65715 + }, + { + "epoch": 4.465280608778366, + "grad_norm": 8.503209114074707, + "learning_rate": 4.420777279521674e-06, + "loss": 2.9198, + "step": 65720 + }, + { + "epoch": 4.465620328849028, + "grad_norm": 7.397788047790527, + "learning_rate": 4.420352629433347e-06, + "loss": 2.7681, + "step": 65725 + }, + { + "epoch": 4.4659600489196905, + "grad_norm": 6.550795078277588, + "learning_rate": 4.41992797934502e-06, + "loss": 2.8353, + "step": 65730 + }, + { + "epoch": 4.466299768990352, + "grad_norm": 7.075780391693115, + "learning_rate": 4.419503329256693e-06, + "loss": 2.8318, + "step": 65735 + }, + { + "epoch": 4.466639489061014, + "grad_norm": 10.334001541137695, + "learning_rate": 4.419078679168366e-06, + "loss": 2.5952, + "step": 65740 + }, + { + "epoch": 4.466979209131676, + "grad_norm": 7.540238380432129, + "learning_rate": 4.418654029080038e-06, + "loss": 2.8752, + "step": 65745 + }, + { + "epoch": 4.467318929202337, + "grad_norm": 6.312294960021973, + "learning_rate": 4.418229378991711e-06, + "loss": 2.7867, + "step": 65750 + }, + { + "epoch": 4.467658649272999, + "grad_norm": 7.19986629486084, + "learning_rate": 4.417804728903384e-06, + "loss": 2.8516, + "step": 65755 + }, + { + "epoch": 4.467998369343661, + "grad_norm": 6.630057334899902, + "learning_rate": 4.417380078815057e-06, + "loss": 3.0343, + "step": 65760 + }, + { + "epoch": 4.468338089414322, + "grad_norm": 7.0921173095703125, + "learning_rate": 4.41695542872673e-06, + "loss": 2.966, + "step": 65765 + }, + { + "epoch": 4.468677809484984, + "grad_norm": 8.038835525512695, + "learning_rate": 4.416530778638402e-06, + "loss": 2.9334, + "step": 65770 + }, + { + "epoch": 4.4690175295556465, + "grad_norm": 7.239415645599365, + "learning_rate": 4.416106128550075e-06, + "loss": 2.9395, + "step": 65775 + }, + { + "epoch": 4.469357249626308, + "grad_norm": 6.264778137207031, + "learning_rate": 4.415681478461748e-06, + "loss": 2.7619, + "step": 65780 + }, + { + "epoch": 4.46969696969697, + "grad_norm": 6.580264568328857, + "learning_rate": 4.415256828373421e-06, + "loss": 2.6789, + "step": 65785 + }, + { + "epoch": 4.470036689767632, + "grad_norm": 6.4041428565979, + "learning_rate": 4.414832178285094e-06, + "loss": 2.8699, + "step": 65790 + }, + { + "epoch": 4.470376409838293, + "grad_norm": 8.04963493347168, + "learning_rate": 4.414407528196766e-06, + "loss": 3.0088, + "step": 65795 + }, + { + "epoch": 4.470716129908955, + "grad_norm": 7.107929229736328, + "learning_rate": 4.413982878108439e-06, + "loss": 2.7724, + "step": 65800 + }, + { + "epoch": 4.471055849979617, + "grad_norm": 9.62900161743164, + "learning_rate": 4.413558228020112e-06, + "loss": 2.9099, + "step": 65805 + }, + { + "epoch": 4.471395570050278, + "grad_norm": 6.8763957023620605, + "learning_rate": 4.413133577931785e-06, + "loss": 2.9366, + "step": 65810 + }, + { + "epoch": 4.47173529012094, + "grad_norm": 7.071463108062744, + "learning_rate": 4.412708927843458e-06, + "loss": 2.7513, + "step": 65815 + }, + { + "epoch": 4.4720750101916025, + "grad_norm": 8.110194206237793, + "learning_rate": 4.41228427775513e-06, + "loss": 2.8386, + "step": 65820 + }, + { + "epoch": 4.472414730262264, + "grad_norm": 8.239524841308594, + "learning_rate": 4.411859627666803e-06, + "loss": 2.7451, + "step": 65825 + }, + { + "epoch": 4.472754450332926, + "grad_norm": 7.515798568725586, + "learning_rate": 4.411434977578476e-06, + "loss": 2.8435, + "step": 65830 + }, + { + "epoch": 4.473094170403588, + "grad_norm": 7.949866771697998, + "learning_rate": 4.411010327490148e-06, + "loss": 2.7503, + "step": 65835 + }, + { + "epoch": 4.473433890474249, + "grad_norm": 6.717155933380127, + "learning_rate": 4.410585677401822e-06, + "loss": 2.8215, + "step": 65840 + }, + { + "epoch": 4.473773610544911, + "grad_norm": 7.428483009338379, + "learning_rate": 4.410161027313494e-06, + "loss": 2.7018, + "step": 65845 + }, + { + "epoch": 4.474113330615573, + "grad_norm": 7.45149040222168, + "learning_rate": 4.409736377225166e-06, + "loss": 2.6691, + "step": 65850 + }, + { + "epoch": 4.474453050686234, + "grad_norm": 7.300680637359619, + "learning_rate": 4.40931172713684e-06, + "loss": 2.5009, + "step": 65855 + }, + { + "epoch": 4.474792770756896, + "grad_norm": 6.1848039627075195, + "learning_rate": 4.408887077048513e-06, + "loss": 2.6827, + "step": 65860 + }, + { + "epoch": 4.4751324908275585, + "grad_norm": 7.5022077560424805, + "learning_rate": 4.408462426960185e-06, + "loss": 2.8774, + "step": 65865 + }, + { + "epoch": 4.47547221089822, + "grad_norm": 6.202575206756592, + "learning_rate": 4.408037776871858e-06, + "loss": 2.783, + "step": 65870 + }, + { + "epoch": 4.475811930968882, + "grad_norm": 9.751943588256836, + "learning_rate": 4.407613126783531e-06, + "loss": 2.8808, + "step": 65875 + }, + { + "epoch": 4.476151651039544, + "grad_norm": 6.360543727874756, + "learning_rate": 4.407188476695203e-06, + "loss": 2.8855, + "step": 65880 + }, + { + "epoch": 4.476491371110205, + "grad_norm": 8.650413513183594, + "learning_rate": 4.406763826606876e-06, + "loss": 2.7339, + "step": 65885 + }, + { + "epoch": 4.476831091180867, + "grad_norm": 7.673550605773926, + "learning_rate": 4.40633917651855e-06, + "loss": 2.9387, + "step": 65890 + }, + { + "epoch": 4.477170811251529, + "grad_norm": 7.338352680206299, + "learning_rate": 4.405914526430222e-06, + "loss": 2.8653, + "step": 65895 + }, + { + "epoch": 4.47751053132219, + "grad_norm": 9.161606788635254, + "learning_rate": 4.405489876341894e-06, + "loss": 2.7241, + "step": 65900 + }, + { + "epoch": 4.477850251392852, + "grad_norm": 7.828989505767822, + "learning_rate": 4.405065226253567e-06, + "loss": 2.792, + "step": 65905 + }, + { + "epoch": 4.4781899714635145, + "grad_norm": 6.645326614379883, + "learning_rate": 4.40464057616524e-06, + "loss": 2.7625, + "step": 65910 + }, + { + "epoch": 4.478529691534176, + "grad_norm": 6.154331207275391, + "learning_rate": 4.404215926076913e-06, + "loss": 3.0511, + "step": 65915 + }, + { + "epoch": 4.478869411604838, + "grad_norm": 7.5787200927734375, + "learning_rate": 4.403791275988586e-06, + "loss": 2.7552, + "step": 65920 + }, + { + "epoch": 4.4792091316755, + "grad_norm": 8.904237747192383, + "learning_rate": 4.403366625900258e-06, + "loss": 2.9926, + "step": 65925 + }, + { + "epoch": 4.479548851746161, + "grad_norm": 7.119237422943115, + "learning_rate": 4.402941975811931e-06, + "loss": 2.7805, + "step": 65930 + }, + { + "epoch": 4.479888571816823, + "grad_norm": 10.334485054016113, + "learning_rate": 4.402517325723604e-06, + "loss": 2.7638, + "step": 65935 + }, + { + "epoch": 4.480228291887485, + "grad_norm": 6.405920028686523, + "learning_rate": 4.402092675635277e-06, + "loss": 2.827, + "step": 65940 + }, + { + "epoch": 4.480568011958146, + "grad_norm": 8.865471839904785, + "learning_rate": 4.40166802554695e-06, + "loss": 2.7814, + "step": 65945 + }, + { + "epoch": 4.480907732028808, + "grad_norm": 6.973134994506836, + "learning_rate": 4.401243375458622e-06, + "loss": 2.852, + "step": 65950 + }, + { + "epoch": 4.4812474520994705, + "grad_norm": 8.495271682739258, + "learning_rate": 4.400818725370295e-06, + "loss": 2.983, + "step": 65955 + }, + { + "epoch": 4.481587172170132, + "grad_norm": 9.115106582641602, + "learning_rate": 4.400394075281968e-06, + "loss": 2.8969, + "step": 65960 + }, + { + "epoch": 4.481926892240794, + "grad_norm": 6.641221523284912, + "learning_rate": 4.399969425193641e-06, + "loss": 2.7807, + "step": 65965 + }, + { + "epoch": 4.482266612311455, + "grad_norm": 7.271432399749756, + "learning_rate": 4.399544775105314e-06, + "loss": 2.638, + "step": 65970 + }, + { + "epoch": 4.482606332382117, + "grad_norm": 7.522682189941406, + "learning_rate": 4.399120125016986e-06, + "loss": 2.7945, + "step": 65975 + }, + { + "epoch": 4.482946052452779, + "grad_norm": 7.219949245452881, + "learning_rate": 4.398695474928659e-06, + "loss": 2.7651, + "step": 65980 + }, + { + "epoch": 4.48328577252344, + "grad_norm": 9.882403373718262, + "learning_rate": 4.398270824840332e-06, + "loss": 2.9755, + "step": 65985 + }, + { + "epoch": 4.483625492594102, + "grad_norm": 5.9390869140625, + "learning_rate": 4.397846174752005e-06, + "loss": 2.7331, + "step": 65990 + }, + { + "epoch": 4.483965212664764, + "grad_norm": 7.0584540367126465, + "learning_rate": 4.397421524663678e-06, + "loss": 2.7547, + "step": 65995 + }, + { + "epoch": 4.484304932735426, + "grad_norm": 6.9977569580078125, + "learning_rate": 4.39699687457535e-06, + "loss": 2.7466, + "step": 66000 + }, + { + "epoch": 4.484644652806088, + "grad_norm": 9.716686248779297, + "learning_rate": 4.396572224487023e-06, + "loss": 3.0095, + "step": 66005 + }, + { + "epoch": 4.48498437287675, + "grad_norm": 8.798345565795898, + "learning_rate": 4.396147574398696e-06, + "loss": 2.765, + "step": 66010 + }, + { + "epoch": 4.485324092947411, + "grad_norm": 8.36082649230957, + "learning_rate": 4.395722924310369e-06, + "loss": 3.0035, + "step": 66015 + }, + { + "epoch": 4.485663813018073, + "grad_norm": 6.781090259552002, + "learning_rate": 4.395298274222042e-06, + "loss": 2.8896, + "step": 66020 + }, + { + "epoch": 4.486003533088735, + "grad_norm": 7.443031311035156, + "learning_rate": 4.394873624133714e-06, + "loss": 2.8285, + "step": 66025 + }, + { + "epoch": 4.486343253159396, + "grad_norm": 8.109349250793457, + "learning_rate": 4.394448974045387e-06, + "loss": 2.765, + "step": 66030 + }, + { + "epoch": 4.486682973230058, + "grad_norm": 7.662062168121338, + "learning_rate": 4.394024323957059e-06, + "loss": 2.6789, + "step": 66035 + }, + { + "epoch": 4.4870226933007205, + "grad_norm": 7.671258926391602, + "learning_rate": 4.393599673868733e-06, + "loss": 2.9267, + "step": 66040 + }, + { + "epoch": 4.487362413371382, + "grad_norm": 8.82510757446289, + "learning_rate": 4.393175023780406e-06, + "loss": 3.0425, + "step": 66045 + }, + { + "epoch": 4.487702133442044, + "grad_norm": 7.187978267669678, + "learning_rate": 4.3927503736920776e-06, + "loss": 2.9446, + "step": 66050 + }, + { + "epoch": 4.488041853512706, + "grad_norm": 6.119616508483887, + "learning_rate": 4.392325723603751e-06, + "loss": 2.6769, + "step": 66055 + }, + { + "epoch": 4.488381573583367, + "grad_norm": 7.009775638580322, + "learning_rate": 4.391901073515424e-06, + "loss": 2.8405, + "step": 66060 + }, + { + "epoch": 4.488721293654029, + "grad_norm": 6.421275615692139, + "learning_rate": 4.391476423427096e-06, + "loss": 2.8764, + "step": 66065 + }, + { + "epoch": 4.489061013724691, + "grad_norm": 8.977246284484863, + "learning_rate": 4.39105177333877e-06, + "loss": 3.0631, + "step": 66070 + }, + { + "epoch": 4.489400733795352, + "grad_norm": 8.625687599182129, + "learning_rate": 4.390627123250442e-06, + "loss": 2.9949, + "step": 66075 + }, + { + "epoch": 4.489740453866014, + "grad_norm": 7.4992804527282715, + "learning_rate": 4.390202473162114e-06, + "loss": 2.6735, + "step": 66080 + }, + { + "epoch": 4.4900801739366765, + "grad_norm": 8.985154151916504, + "learning_rate": 4.389777823073787e-06, + "loss": 2.8899, + "step": 66085 + }, + { + "epoch": 4.490419894007338, + "grad_norm": 8.679689407348633, + "learning_rate": 4.389353172985461e-06, + "loss": 2.8708, + "step": 66090 + }, + { + "epoch": 4.490759614078, + "grad_norm": 8.297513008117676, + "learning_rate": 4.388928522897133e-06, + "loss": 2.7195, + "step": 66095 + }, + { + "epoch": 4.491099334148662, + "grad_norm": 7.887598037719727, + "learning_rate": 4.3885038728088056e-06, + "loss": 2.8108, + "step": 66100 + }, + { + "epoch": 4.491439054219323, + "grad_norm": 5.800116539001465, + "learning_rate": 4.388079222720479e-06, + "loss": 2.8777, + "step": 66105 + }, + { + "epoch": 4.491778774289985, + "grad_norm": 8.636337280273438, + "learning_rate": 4.387654572632151e-06, + "loss": 2.8314, + "step": 66110 + }, + { + "epoch": 4.492118494360647, + "grad_norm": 7.9027509689331055, + "learning_rate": 4.387229922543824e-06, + "loss": 2.8883, + "step": 66115 + }, + { + "epoch": 4.492458214431308, + "grad_norm": 6.485196590423584, + "learning_rate": 4.386805272455497e-06, + "loss": 3.0938, + "step": 66120 + }, + { + "epoch": 4.49279793450197, + "grad_norm": 5.175412178039551, + "learning_rate": 4.3863806223671696e-06, + "loss": 2.7025, + "step": 66125 + }, + { + "epoch": 4.4931376545726325, + "grad_norm": 8.575743675231934, + "learning_rate": 4.385955972278842e-06, + "loss": 2.8389, + "step": 66130 + }, + { + "epoch": 4.493477374643294, + "grad_norm": 8.168375015258789, + "learning_rate": 4.385531322190515e-06, + "loss": 2.6263, + "step": 66135 + }, + { + "epoch": 4.493817094713956, + "grad_norm": 6.849006175994873, + "learning_rate": 4.385106672102189e-06, + "loss": 2.9667, + "step": 66140 + }, + { + "epoch": 4.494156814784618, + "grad_norm": 6.975369930267334, + "learning_rate": 4.384682022013861e-06, + "loss": 2.7819, + "step": 66145 + }, + { + "epoch": 4.494496534855279, + "grad_norm": 7.455338954925537, + "learning_rate": 4.3842573719255336e-06, + "loss": 2.9317, + "step": 66150 + }, + { + "epoch": 4.494836254925941, + "grad_norm": 6.752047061920166, + "learning_rate": 4.383832721837206e-06, + "loss": 2.9006, + "step": 66155 + }, + { + "epoch": 4.495175974996603, + "grad_norm": 8.114937782287598, + "learning_rate": 4.383408071748879e-06, + "loss": 3.174, + "step": 66160 + }, + { + "epoch": 4.495515695067264, + "grad_norm": 8.981732368469238, + "learning_rate": 4.382983421660552e-06, + "loss": 2.8424, + "step": 66165 + }, + { + "epoch": 4.495855415137926, + "grad_norm": 8.916362762451172, + "learning_rate": 4.382558771572225e-06, + "loss": 2.8817, + "step": 66170 + }, + { + "epoch": 4.4961951352085885, + "grad_norm": 9.106762886047363, + "learning_rate": 4.382134121483898e-06, + "loss": 2.8756, + "step": 66175 + }, + { + "epoch": 4.49653485527925, + "grad_norm": 7.411455154418945, + "learning_rate": 4.38170947139557e-06, + "loss": 2.7765, + "step": 66180 + }, + { + "epoch": 4.496874575349912, + "grad_norm": 8.921788215637207, + "learning_rate": 4.381284821307243e-06, + "loss": 2.835, + "step": 66185 + }, + { + "epoch": 4.497214295420574, + "grad_norm": 6.778903484344482, + "learning_rate": 4.380860171218916e-06, + "loss": 3.0894, + "step": 66190 + }, + { + "epoch": 4.497554015491235, + "grad_norm": 6.620034694671631, + "learning_rate": 4.380435521130589e-06, + "loss": 3.0019, + "step": 66195 + }, + { + "epoch": 4.497893735561897, + "grad_norm": 8.411985397338867, + "learning_rate": 4.380010871042262e-06, + "loss": 2.868, + "step": 66200 + }, + { + "epoch": 4.498233455632559, + "grad_norm": 9.238056182861328, + "learning_rate": 4.379586220953934e-06, + "loss": 2.6673, + "step": 66205 + }, + { + "epoch": 4.49857317570322, + "grad_norm": 6.175892353057861, + "learning_rate": 4.379161570865607e-06, + "loss": 2.9789, + "step": 66210 + }, + { + "epoch": 4.498912895773882, + "grad_norm": 6.450384616851807, + "learning_rate": 4.37873692077728e-06, + "loss": 2.7412, + "step": 66215 + }, + { + "epoch": 4.4992526158445445, + "grad_norm": 6.612368583679199, + "learning_rate": 4.378312270688953e-06, + "loss": 2.7809, + "step": 66220 + }, + { + "epoch": 4.499592335915206, + "grad_norm": 9.244763374328613, + "learning_rate": 4.377887620600626e-06, + "loss": 2.9489, + "step": 66225 + }, + { + "epoch": 4.499932055985868, + "grad_norm": 7.378873825073242, + "learning_rate": 4.377462970512298e-06, + "loss": 2.9445, + "step": 66230 + }, + { + "epoch": 4.500271776056529, + "grad_norm": 6.949915409088135, + "learning_rate": 4.377038320423971e-06, + "loss": 2.7735, + "step": 66235 + }, + { + "epoch": 4.500611496127191, + "grad_norm": 5.903344631195068, + "learning_rate": 4.376613670335644e-06, + "loss": 2.592, + "step": 66240 + }, + { + "epoch": 4.500951216197853, + "grad_norm": 9.8287353515625, + "learning_rate": 4.376189020247317e-06, + "loss": 2.8071, + "step": 66245 + }, + { + "epoch": 4.501290936268514, + "grad_norm": 7.7247395515441895, + "learning_rate": 4.375764370158989e-06, + "loss": 2.9036, + "step": 66250 + }, + { + "epoch": 4.501630656339176, + "grad_norm": 7.477797031402588, + "learning_rate": 4.375339720070662e-06, + "loss": 2.6988, + "step": 66255 + }, + { + "epoch": 4.501970376409838, + "grad_norm": 5.979978084564209, + "learning_rate": 4.374915069982335e-06, + "loss": 2.8101, + "step": 66260 + }, + { + "epoch": 4.5023100964805, + "grad_norm": 6.48785924911499, + "learning_rate": 4.374490419894007e-06, + "loss": 2.7762, + "step": 66265 + }, + { + "epoch": 4.502649816551162, + "grad_norm": 8.180209159851074, + "learning_rate": 4.374065769805681e-06, + "loss": 2.7756, + "step": 66270 + }, + { + "epoch": 4.502989536621824, + "grad_norm": 8.006416320800781, + "learning_rate": 4.373641119717354e-06, + "loss": 2.9214, + "step": 66275 + }, + { + "epoch": 4.503329256692485, + "grad_norm": 7.855606555938721, + "learning_rate": 4.3732164696290255e-06, + "loss": 2.8796, + "step": 66280 + }, + { + "epoch": 4.503668976763147, + "grad_norm": 7.732750415802002, + "learning_rate": 4.372791819540698e-06, + "loss": 3.11, + "step": 66285 + }, + { + "epoch": 4.504008696833809, + "grad_norm": 9.736087799072266, + "learning_rate": 4.372367169452372e-06, + "loss": 2.9774, + "step": 66290 + }, + { + "epoch": 4.50434841690447, + "grad_norm": 7.790249347686768, + "learning_rate": 4.371942519364044e-06, + "loss": 2.8864, + "step": 66295 + }, + { + "epoch": 4.504688136975132, + "grad_norm": 6.889333724975586, + "learning_rate": 4.371517869275717e-06, + "loss": 2.9021, + "step": 66300 + }, + { + "epoch": 4.5050278570457944, + "grad_norm": 5.84000301361084, + "learning_rate": 4.37109321918739e-06, + "loss": 2.967, + "step": 66305 + }, + { + "epoch": 4.505367577116456, + "grad_norm": 6.547068119049072, + "learning_rate": 4.370668569099062e-06, + "loss": 2.6137, + "step": 66310 + }, + { + "epoch": 4.505707297187118, + "grad_norm": 8.67695140838623, + "learning_rate": 4.370243919010735e-06, + "loss": 2.7989, + "step": 66315 + }, + { + "epoch": 4.50604701725778, + "grad_norm": 8.65807819366455, + "learning_rate": 4.369819268922408e-06, + "loss": 2.703, + "step": 66320 + }, + { + "epoch": 4.506386737328441, + "grad_norm": 7.423616886138916, + "learning_rate": 4.369394618834082e-06, + "loss": 3.0809, + "step": 66325 + }, + { + "epoch": 4.506726457399103, + "grad_norm": 6.873010158538818, + "learning_rate": 4.3689699687457536e-06, + "loss": 2.8479, + "step": 66330 + }, + { + "epoch": 4.507066177469765, + "grad_norm": 6.615419387817383, + "learning_rate": 4.368545318657426e-06, + "loss": 2.8733, + "step": 66335 + }, + { + "epoch": 4.507405897540426, + "grad_norm": 8.866081237792969, + "learning_rate": 4.3681206685691e-06, + "loss": 2.3866, + "step": 66340 + }, + { + "epoch": 4.507745617611088, + "grad_norm": 7.524534702301025, + "learning_rate": 4.367696018480772e-06, + "loss": 3.1425, + "step": 66345 + }, + { + "epoch": 4.5080853376817505, + "grad_norm": 7.727817535400391, + "learning_rate": 4.367271368392445e-06, + "loss": 2.7676, + "step": 66350 + }, + { + "epoch": 4.508425057752412, + "grad_norm": 7.905357837677002, + "learning_rate": 4.366846718304118e-06, + "loss": 2.8491, + "step": 66355 + }, + { + "epoch": 4.508764777823074, + "grad_norm": 6.466491222381592, + "learning_rate": 4.36642206821579e-06, + "loss": 2.9938, + "step": 66360 + }, + { + "epoch": 4.509104497893736, + "grad_norm": 7.571550369262695, + "learning_rate": 4.365997418127463e-06, + "loss": 3.1203, + "step": 66365 + }, + { + "epoch": 4.509444217964397, + "grad_norm": 9.472257614135742, + "learning_rate": 4.365572768039136e-06, + "loss": 2.8704, + "step": 66370 + }, + { + "epoch": 4.509783938035059, + "grad_norm": 7.270031452178955, + "learning_rate": 4.365148117950809e-06, + "loss": 2.6954, + "step": 66375 + }, + { + "epoch": 4.510123658105721, + "grad_norm": 7.140004634857178, + "learning_rate": 4.3647234678624816e-06, + "loss": 2.8671, + "step": 66380 + }, + { + "epoch": 4.510463378176382, + "grad_norm": 7.715550422668457, + "learning_rate": 4.364298817774154e-06, + "loss": 3.001, + "step": 66385 + }, + { + "epoch": 4.510803098247044, + "grad_norm": 7.848750114440918, + "learning_rate": 4.363874167685827e-06, + "loss": 2.8192, + "step": 66390 + }, + { + "epoch": 4.5111428183177065, + "grad_norm": 6.569530010223389, + "learning_rate": 4.3634495175975e-06, + "loss": 2.7796, + "step": 66395 + }, + { + "epoch": 4.511482538388368, + "grad_norm": 9.06702709197998, + "learning_rate": 4.363024867509173e-06, + "loss": 2.7094, + "step": 66400 + }, + { + "epoch": 4.51182225845903, + "grad_norm": 9.075337409973145, + "learning_rate": 4.3626002174208456e-06, + "loss": 2.8726, + "step": 66405 + }, + { + "epoch": 4.512161978529692, + "grad_norm": 10.627570152282715, + "learning_rate": 4.362175567332518e-06, + "loss": 2.7932, + "step": 66410 + }, + { + "epoch": 4.512501698600353, + "grad_norm": 6.916647911071777, + "learning_rate": 4.361750917244191e-06, + "loss": 2.8143, + "step": 66415 + }, + { + "epoch": 4.512841418671015, + "grad_norm": 8.333046913146973, + "learning_rate": 4.361326267155864e-06, + "loss": 2.8806, + "step": 66420 + }, + { + "epoch": 4.513181138741677, + "grad_norm": 6.880171775817871, + "learning_rate": 4.360901617067537e-06, + "loss": 2.6934, + "step": 66425 + }, + { + "epoch": 4.513520858812338, + "grad_norm": 8.181893348693848, + "learning_rate": 4.3604769669792096e-06, + "loss": 2.7737, + "step": 66430 + }, + { + "epoch": 4.513860578883, + "grad_norm": 7.010909557342529, + "learning_rate": 4.360052316890882e-06, + "loss": 2.9482, + "step": 66435 + }, + { + "epoch": 4.5142002989536625, + "grad_norm": 8.506612777709961, + "learning_rate": 4.359627666802555e-06, + "loss": 2.9224, + "step": 66440 + }, + { + "epoch": 4.514540019024324, + "grad_norm": 6.956234455108643, + "learning_rate": 4.359203016714228e-06, + "loss": 2.6888, + "step": 66445 + }, + { + "epoch": 4.514879739094986, + "grad_norm": 7.648332118988037, + "learning_rate": 4.358778366625901e-06, + "loss": 2.9385, + "step": 66450 + }, + { + "epoch": 4.515219459165648, + "grad_norm": 7.003561973571777, + "learning_rate": 4.3583537165375736e-06, + "loss": 2.7256, + "step": 66455 + }, + { + "epoch": 4.515559179236309, + "grad_norm": 5.8136491775512695, + "learning_rate": 4.357929066449246e-06, + "loss": 2.8416, + "step": 66460 + }, + { + "epoch": 4.515898899306971, + "grad_norm": 7.206639766693115, + "learning_rate": 4.357504416360918e-06, + "loss": 2.9148, + "step": 66465 + }, + { + "epoch": 4.516238619377633, + "grad_norm": 7.696659088134766, + "learning_rate": 4.357079766272592e-06, + "loss": 2.9094, + "step": 66470 + }, + { + "epoch": 4.516578339448294, + "grad_norm": 8.582890510559082, + "learning_rate": 4.356655116184265e-06, + "loss": 2.5705, + "step": 66475 + }, + { + "epoch": 4.516918059518956, + "grad_norm": 8.010285377502441, + "learning_rate": 4.356230466095937e-06, + "loss": 2.6153, + "step": 66480 + }, + { + "epoch": 4.5172577795896185, + "grad_norm": 7.135915279388428, + "learning_rate": 4.35580581600761e-06, + "loss": 2.8208, + "step": 66485 + }, + { + "epoch": 4.51759749966028, + "grad_norm": 6.638965606689453, + "learning_rate": 4.355381165919283e-06, + "loss": 2.8557, + "step": 66490 + }, + { + "epoch": 4.517937219730942, + "grad_norm": 7.9567179679870605, + "learning_rate": 4.354956515830956e-06, + "loss": 2.8159, + "step": 66495 + }, + { + "epoch": 4.518276939801604, + "grad_norm": 7.561642169952393, + "learning_rate": 4.354531865742628e-06, + "loss": 2.924, + "step": 66500 + }, + { + "epoch": 4.518616659872265, + "grad_norm": 6.3117289543151855, + "learning_rate": 4.354107215654302e-06, + "loss": 2.6731, + "step": 66505 + }, + { + "epoch": 4.518956379942927, + "grad_norm": 9.572174072265625, + "learning_rate": 4.353682565565974e-06, + "loss": 2.8833, + "step": 66510 + }, + { + "epoch": 4.519296100013589, + "grad_norm": 7.976221084594727, + "learning_rate": 4.353257915477646e-06, + "loss": 2.9197, + "step": 66515 + }, + { + "epoch": 4.51963582008425, + "grad_norm": 6.77128267288208, + "learning_rate": 4.35283326538932e-06, + "loss": 2.8073, + "step": 66520 + }, + { + "epoch": 4.519975540154912, + "grad_norm": 6.692219257354736, + "learning_rate": 4.352408615300993e-06, + "loss": 3.0104, + "step": 66525 + }, + { + "epoch": 4.5203152602255745, + "grad_norm": 6.140235424041748, + "learning_rate": 4.351983965212665e-06, + "loss": 2.883, + "step": 66530 + }, + { + "epoch": 4.520654980296236, + "grad_norm": 6.911149501800537, + "learning_rate": 4.3515593151243375e-06, + "loss": 2.6222, + "step": 66535 + }, + { + "epoch": 4.520994700366898, + "grad_norm": 7.332305908203125, + "learning_rate": 4.351134665036011e-06, + "loss": 2.973, + "step": 66540 + }, + { + "epoch": 4.52133442043756, + "grad_norm": 6.792359352111816, + "learning_rate": 4.350710014947683e-06, + "loss": 2.7557, + "step": 66545 + }, + { + "epoch": 4.521674140508221, + "grad_norm": 8.075122833251953, + "learning_rate": 4.350285364859356e-06, + "loss": 2.7905, + "step": 66550 + }, + { + "epoch": 4.522013860578883, + "grad_norm": 7.2956156730651855, + "learning_rate": 4.34986071477103e-06, + "loss": 2.8501, + "step": 66555 + }, + { + "epoch": 4.522353580649545, + "grad_norm": 7.332286834716797, + "learning_rate": 4.3494360646827015e-06, + "loss": 2.7576, + "step": 66560 + }, + { + "epoch": 4.522693300720206, + "grad_norm": 7.147439956665039, + "learning_rate": 4.349011414594374e-06, + "loss": 2.8136, + "step": 66565 + }, + { + "epoch": 4.523033020790868, + "grad_norm": 6.2614545822143555, + "learning_rate": 4.348586764506047e-06, + "loss": 3.0736, + "step": 66570 + }, + { + "epoch": 4.5233727408615305, + "grad_norm": 9.622418403625488, + "learning_rate": 4.34816211441772e-06, + "loss": 2.6937, + "step": 66575 + }, + { + "epoch": 4.523712460932192, + "grad_norm": 6.398077487945557, + "learning_rate": 4.347737464329393e-06, + "loss": 2.9528, + "step": 66580 + }, + { + "epoch": 4.524052181002854, + "grad_norm": 7.643658638000488, + "learning_rate": 4.3473128142410655e-06, + "loss": 2.8217, + "step": 66585 + }, + { + "epoch": 4.524391901073516, + "grad_norm": 6.387275218963623, + "learning_rate": 4.346888164152738e-06, + "loss": 3.0947, + "step": 66590 + }, + { + "epoch": 4.524731621144177, + "grad_norm": 5.826456546783447, + "learning_rate": 4.346463514064411e-06, + "loss": 2.7676, + "step": 66595 + }, + { + "epoch": 4.525071341214839, + "grad_norm": 7.66792106628418, + "learning_rate": 4.346038863976084e-06, + "loss": 3.0718, + "step": 66600 + }, + { + "epoch": 4.525411061285501, + "grad_norm": 9.773279190063477, + "learning_rate": 4.345614213887757e-06, + "loss": 3.0632, + "step": 66605 + }, + { + "epoch": 4.525750781356162, + "grad_norm": 7.595631122589111, + "learning_rate": 4.3451895637994296e-06, + "loss": 2.8093, + "step": 66610 + }, + { + "epoch": 4.5260905014268245, + "grad_norm": 6.934203147888184, + "learning_rate": 4.344764913711102e-06, + "loss": 2.8103, + "step": 66615 + }, + { + "epoch": 4.5264302214974865, + "grad_norm": 8.46451187133789, + "learning_rate": 4.344340263622775e-06, + "loss": 2.7474, + "step": 66620 + }, + { + "epoch": 4.526769941568148, + "grad_norm": 5.372869968414307, + "learning_rate": 4.343915613534448e-06, + "loss": 2.6423, + "step": 66625 + }, + { + "epoch": 4.52710966163881, + "grad_norm": 8.673969268798828, + "learning_rate": 4.343490963446121e-06, + "loss": 2.6535, + "step": 66630 + }, + { + "epoch": 4.527449381709472, + "grad_norm": 7.726800441741943, + "learning_rate": 4.3430663133577936e-06, + "loss": 2.8084, + "step": 66635 + }, + { + "epoch": 4.527789101780133, + "grad_norm": 6.2927961349487305, + "learning_rate": 4.342641663269466e-06, + "loss": 2.6474, + "step": 66640 + }, + { + "epoch": 4.528128821850795, + "grad_norm": 7.295479774475098, + "learning_rate": 4.342217013181139e-06, + "loss": 2.9207, + "step": 66645 + }, + { + "epoch": 4.528468541921457, + "grad_norm": 6.683809280395508, + "learning_rate": 4.341792363092812e-06, + "loss": 2.9567, + "step": 66650 + }, + { + "epoch": 4.528808261992118, + "grad_norm": 5.943655490875244, + "learning_rate": 4.341367713004485e-06, + "loss": 3.0375, + "step": 66655 + }, + { + "epoch": 4.5291479820627805, + "grad_norm": 6.55837345123291, + "learning_rate": 4.3409430629161576e-06, + "loss": 2.7744, + "step": 66660 + }, + { + "epoch": 4.5294877021334425, + "grad_norm": 7.236589431762695, + "learning_rate": 4.34051841282783e-06, + "loss": 2.837, + "step": 66665 + }, + { + "epoch": 4.529827422204104, + "grad_norm": 7.202128887176514, + "learning_rate": 4.340093762739503e-06, + "loss": 2.9762, + "step": 66670 + }, + { + "epoch": 4.530167142274766, + "grad_norm": 7.8666839599609375, + "learning_rate": 4.339669112651176e-06, + "loss": 3.0708, + "step": 66675 + }, + { + "epoch": 4.530506862345427, + "grad_norm": 7.377781867980957, + "learning_rate": 4.339244462562849e-06, + "loss": 3.0787, + "step": 66680 + }, + { + "epoch": 4.530846582416089, + "grad_norm": 8.276941299438477, + "learning_rate": 4.3388198124745216e-06, + "loss": 3.0183, + "step": 66685 + }, + { + "epoch": 4.531186302486751, + "grad_norm": 6.703425407409668, + "learning_rate": 4.338395162386194e-06, + "loss": 2.7959, + "step": 66690 + }, + { + "epoch": 4.531526022557412, + "grad_norm": 7.2073469161987305, + "learning_rate": 4.337970512297867e-06, + "loss": 2.9696, + "step": 66695 + }, + { + "epoch": 4.531865742628074, + "grad_norm": 5.894723415374756, + "learning_rate": 4.33754586220954e-06, + "loss": 2.8766, + "step": 66700 + }, + { + "epoch": 4.5322054626987365, + "grad_norm": 7.940489768981934, + "learning_rate": 4.337121212121213e-06, + "loss": 2.9235, + "step": 66705 + }, + { + "epoch": 4.532545182769398, + "grad_norm": 7.840071678161621, + "learning_rate": 4.3366965620328856e-06, + "loss": 2.9372, + "step": 66710 + }, + { + "epoch": 4.53288490284006, + "grad_norm": 7.978917598724365, + "learning_rate": 4.3362719119445575e-06, + "loss": 2.7231, + "step": 66715 + }, + { + "epoch": 4.533224622910722, + "grad_norm": 9.120746612548828, + "learning_rate": 4.335847261856231e-06, + "loss": 3.0033, + "step": 66720 + }, + { + "epoch": 4.533564342981383, + "grad_norm": 7.3236589431762695, + "learning_rate": 4.335422611767904e-06, + "loss": 2.7692, + "step": 66725 + }, + { + "epoch": 4.533904063052045, + "grad_norm": 6.458324909210205, + "learning_rate": 4.334997961679576e-06, + "loss": 2.7789, + "step": 66730 + }, + { + "epoch": 4.534243783122707, + "grad_norm": 6.722433090209961, + "learning_rate": 4.3345733115912496e-06, + "loss": 2.8884, + "step": 66735 + }, + { + "epoch": 4.534583503193368, + "grad_norm": 6.5938720703125, + "learning_rate": 4.334148661502922e-06, + "loss": 2.8266, + "step": 66740 + }, + { + "epoch": 4.53492322326403, + "grad_norm": 7.262376308441162, + "learning_rate": 4.333724011414594e-06, + "loss": 3.0211, + "step": 66745 + }, + { + "epoch": 4.5352629433346925, + "grad_norm": 9.144984245300293, + "learning_rate": 4.333299361326267e-06, + "loss": 2.8841, + "step": 66750 + }, + { + "epoch": 4.535602663405354, + "grad_norm": 7.452955722808838, + "learning_rate": 4.332874711237941e-06, + "loss": 2.7459, + "step": 66755 + }, + { + "epoch": 4.535942383476016, + "grad_norm": 6.276570796966553, + "learning_rate": 4.332450061149613e-06, + "loss": 2.9424, + "step": 66760 + }, + { + "epoch": 4.536282103546678, + "grad_norm": 6.407381057739258, + "learning_rate": 4.3320254110612855e-06, + "loss": 2.7374, + "step": 66765 + }, + { + "epoch": 4.536621823617339, + "grad_norm": 7.166661262512207, + "learning_rate": 4.331600760972959e-06, + "loss": 2.7258, + "step": 66770 + }, + { + "epoch": 4.536961543688001, + "grad_norm": 7.910689353942871, + "learning_rate": 4.331176110884631e-06, + "loss": 3.0414, + "step": 66775 + }, + { + "epoch": 4.537301263758663, + "grad_norm": 7.8528008460998535, + "learning_rate": 4.330751460796304e-06, + "loss": 2.6494, + "step": 66780 + }, + { + "epoch": 4.537640983829324, + "grad_norm": 9.624772071838379, + "learning_rate": 4.330326810707977e-06, + "loss": 2.9632, + "step": 66785 + }, + { + "epoch": 4.537980703899986, + "grad_norm": 8.517428398132324, + "learning_rate": 4.3299021606196495e-06, + "loss": 2.9198, + "step": 66790 + }, + { + "epoch": 4.5383204239706485, + "grad_norm": 7.674215793609619, + "learning_rate": 4.329477510531322e-06, + "loss": 2.9011, + "step": 66795 + }, + { + "epoch": 4.53866014404131, + "grad_norm": 7.4446258544921875, + "learning_rate": 4.329052860442995e-06, + "loss": 2.9249, + "step": 66800 + }, + { + "epoch": 4.538999864111972, + "grad_norm": 8.199191093444824, + "learning_rate": 4.328628210354668e-06, + "loss": 2.9496, + "step": 66805 + }, + { + "epoch": 4.539339584182634, + "grad_norm": 6.092288017272949, + "learning_rate": 4.328203560266341e-06, + "loss": 3.1615, + "step": 66810 + }, + { + "epoch": 4.539679304253295, + "grad_norm": 6.9803667068481445, + "learning_rate": 4.3277789101780135e-06, + "loss": 3.0589, + "step": 66815 + }, + { + "epoch": 4.540019024323957, + "grad_norm": 7.22813081741333, + "learning_rate": 4.327354260089686e-06, + "loss": 2.8867, + "step": 66820 + }, + { + "epoch": 4.540358744394619, + "grad_norm": 7.2115678787231445, + "learning_rate": 4.326929610001359e-06, + "loss": 3.0665, + "step": 66825 + }, + { + "epoch": 4.54069846446528, + "grad_norm": 5.946534633636475, + "learning_rate": 4.326504959913032e-06, + "loss": 2.8127, + "step": 66830 + }, + { + "epoch": 4.541038184535942, + "grad_norm": 8.963308334350586, + "learning_rate": 4.326080309824705e-06, + "loss": 2.9368, + "step": 66835 + }, + { + "epoch": 4.5413779046066045, + "grad_norm": 7.827805995941162, + "learning_rate": 4.3256556597363775e-06, + "loss": 2.8885, + "step": 66840 + }, + { + "epoch": 4.541717624677266, + "grad_norm": 9.045577049255371, + "learning_rate": 4.32523100964805e-06, + "loss": 2.799, + "step": 66845 + }, + { + "epoch": 4.542057344747928, + "grad_norm": 7.409165382385254, + "learning_rate": 4.324806359559723e-06, + "loss": 2.9654, + "step": 66850 + }, + { + "epoch": 4.54239706481859, + "grad_norm": 7.675050258636475, + "learning_rate": 4.324381709471396e-06, + "loss": 2.886, + "step": 66855 + }, + { + "epoch": 4.542736784889251, + "grad_norm": 5.928195953369141, + "learning_rate": 4.323957059383069e-06, + "loss": 2.8339, + "step": 66860 + }, + { + "epoch": 4.543076504959913, + "grad_norm": 8.786993026733398, + "learning_rate": 4.3235324092947415e-06, + "loss": 2.9489, + "step": 66865 + }, + { + "epoch": 4.543416225030575, + "grad_norm": 9.765472412109375, + "learning_rate": 4.323107759206414e-06, + "loss": 2.9628, + "step": 66870 + }, + { + "epoch": 4.543755945101236, + "grad_norm": 6.3664093017578125, + "learning_rate": 4.322683109118087e-06, + "loss": 2.8168, + "step": 66875 + }, + { + "epoch": 4.5440956651718984, + "grad_norm": 7.121088027954102, + "learning_rate": 4.32225845902976e-06, + "loss": 3.045, + "step": 66880 + }, + { + "epoch": 4.54443538524256, + "grad_norm": 8.117875099182129, + "learning_rate": 4.321833808941433e-06, + "loss": 2.8034, + "step": 66885 + }, + { + "epoch": 4.544775105313222, + "grad_norm": 6.662786960601807, + "learning_rate": 4.3214091588531055e-06, + "loss": 2.6849, + "step": 66890 + }, + { + "epoch": 4.545114825383884, + "grad_norm": 7.1194586753845215, + "learning_rate": 4.320984508764778e-06, + "loss": 2.7954, + "step": 66895 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 8.51156234741211, + "learning_rate": 4.320559858676451e-06, + "loss": 2.7101, + "step": 66900 + }, + { + "epoch": 4.545794265525207, + "grad_norm": 8.6022367477417, + "learning_rate": 4.320135208588124e-06, + "loss": 2.8306, + "step": 66905 + }, + { + "epoch": 4.546133985595869, + "grad_norm": 8.844788551330566, + "learning_rate": 4.319710558499797e-06, + "loss": 2.9973, + "step": 66910 + }, + { + "epoch": 4.54647370566653, + "grad_norm": 5.785736560821533, + "learning_rate": 4.319285908411469e-06, + "loss": 2.8776, + "step": 66915 + }, + { + "epoch": 4.546813425737192, + "grad_norm": 6.322824001312256, + "learning_rate": 4.318861258323142e-06, + "loss": 2.7784, + "step": 66920 + }, + { + "epoch": 4.5471531458078545, + "grad_norm": 6.53204345703125, + "learning_rate": 4.318436608234815e-06, + "loss": 3.0685, + "step": 66925 + }, + { + "epoch": 4.547492865878516, + "grad_norm": 7.292391300201416, + "learning_rate": 4.318011958146487e-06, + "loss": 2.8786, + "step": 66930 + }, + { + "epoch": 4.547832585949178, + "grad_norm": 6.612290859222412, + "learning_rate": 4.317587308058161e-06, + "loss": 2.8529, + "step": 66935 + }, + { + "epoch": 4.54817230601984, + "grad_norm": 9.537367820739746, + "learning_rate": 4.3171626579698336e-06, + "loss": 2.7904, + "step": 66940 + }, + { + "epoch": 4.548512026090501, + "grad_norm": 8.446849822998047, + "learning_rate": 4.3167380078815055e-06, + "loss": 2.8334, + "step": 66945 + }, + { + "epoch": 4.548851746161163, + "grad_norm": 7.60261344909668, + "learning_rate": 4.316313357793178e-06, + "loss": 3.0495, + "step": 66950 + }, + { + "epoch": 4.549191466231825, + "grad_norm": 8.53396224975586, + "learning_rate": 4.315888707704852e-06, + "loss": 2.8495, + "step": 66955 + }, + { + "epoch": 4.549531186302486, + "grad_norm": 7.48768424987793, + "learning_rate": 4.315464057616524e-06, + "loss": 2.9965, + "step": 66960 + }, + { + "epoch": 4.549870906373148, + "grad_norm": 7.759291648864746, + "learning_rate": 4.315039407528197e-06, + "loss": 2.6475, + "step": 66965 + }, + { + "epoch": 4.5502106264438105, + "grad_norm": 6.077394008636475, + "learning_rate": 4.31461475743987e-06, + "loss": 2.7623, + "step": 66970 + }, + { + "epoch": 4.550550346514472, + "grad_norm": 7.011690616607666, + "learning_rate": 4.314190107351542e-06, + "loss": 2.7858, + "step": 66975 + }, + { + "epoch": 4.550890066585134, + "grad_norm": 7.0845842361450195, + "learning_rate": 4.313765457263215e-06, + "loss": 2.7513, + "step": 66980 + }, + { + "epoch": 4.551229786655796, + "grad_norm": 6.522925853729248, + "learning_rate": 4.313340807174889e-06, + "loss": 2.9704, + "step": 66985 + }, + { + "epoch": 4.551569506726457, + "grad_norm": 8.593342781066895, + "learning_rate": 4.312916157086561e-06, + "loss": 2.7439, + "step": 66990 + }, + { + "epoch": 4.551909226797119, + "grad_norm": 7.4219069480896, + "learning_rate": 4.3124915069982335e-06, + "loss": 2.9315, + "step": 66995 + }, + { + "epoch": 4.552248946867781, + "grad_norm": 8.362862586975098, + "learning_rate": 4.312066856909906e-06, + "loss": 2.7022, + "step": 67000 + }, + { + "epoch": 4.552588666938442, + "grad_norm": 6.421005725860596, + "learning_rate": 4.31164220682158e-06, + "loss": 2.7272, + "step": 67005 + }, + { + "epoch": 4.552928387009104, + "grad_norm": 5.728044033050537, + "learning_rate": 4.311217556733252e-06, + "loss": 2.8746, + "step": 67010 + }, + { + "epoch": 4.5532681070797665, + "grad_norm": 6.8594841957092285, + "learning_rate": 4.310792906644925e-06, + "loss": 2.7497, + "step": 67015 + }, + { + "epoch": 4.553607827150428, + "grad_norm": 8.008940696716309, + "learning_rate": 4.310368256556598e-06, + "loss": 2.7591, + "step": 67020 + }, + { + "epoch": 4.55394754722109, + "grad_norm": 8.004775047302246, + "learning_rate": 4.30994360646827e-06, + "loss": 2.9989, + "step": 67025 + }, + { + "epoch": 4.554287267291752, + "grad_norm": 8.372661590576172, + "learning_rate": 4.309518956379943e-06, + "loss": 2.96, + "step": 67030 + }, + { + "epoch": 4.554626987362413, + "grad_norm": 7.721646785736084, + "learning_rate": 4.309094306291616e-06, + "loss": 2.739, + "step": 67035 + }, + { + "epoch": 4.554966707433075, + "grad_norm": 6.5720534324646, + "learning_rate": 4.308669656203289e-06, + "loss": 2.9451, + "step": 67040 + }, + { + "epoch": 4.555306427503737, + "grad_norm": 7.710890293121338, + "learning_rate": 4.3082450061149615e-06, + "loss": 2.8857, + "step": 67045 + }, + { + "epoch": 4.555646147574398, + "grad_norm": 7.731987476348877, + "learning_rate": 4.307820356026634e-06, + "loss": 2.8041, + "step": 67050 + }, + { + "epoch": 4.55598586764506, + "grad_norm": 6.628121376037598, + "learning_rate": 4.307395705938307e-06, + "loss": 2.8882, + "step": 67055 + }, + { + "epoch": 4.5563255877157225, + "grad_norm": 7.793959617614746, + "learning_rate": 4.30697105584998e-06, + "loss": 3.0242, + "step": 67060 + }, + { + "epoch": 4.556665307786384, + "grad_norm": 8.395159721374512, + "learning_rate": 4.306546405761653e-06, + "loss": 2.9491, + "step": 67065 + }, + { + "epoch": 4.557005027857046, + "grad_norm": 6.804808616638184, + "learning_rate": 4.3061217556733255e-06, + "loss": 2.8791, + "step": 67070 + }, + { + "epoch": 4.557344747927708, + "grad_norm": 8.091233253479004, + "learning_rate": 4.305697105584998e-06, + "loss": 3.1856, + "step": 67075 + }, + { + "epoch": 4.557684467998369, + "grad_norm": 7.192871570587158, + "learning_rate": 4.305272455496671e-06, + "loss": 2.8414, + "step": 67080 + }, + { + "epoch": 4.558024188069031, + "grad_norm": 7.131143093109131, + "learning_rate": 4.304847805408344e-06, + "loss": 2.9898, + "step": 67085 + }, + { + "epoch": 4.558363908139693, + "grad_norm": 5.956217288970947, + "learning_rate": 4.304423155320017e-06, + "loss": 2.9024, + "step": 67090 + }, + { + "epoch": 4.558703628210354, + "grad_norm": 6.305017471313477, + "learning_rate": 4.3039985052316895e-06, + "loss": 2.9038, + "step": 67095 + }, + { + "epoch": 4.559043348281016, + "grad_norm": 8.837326049804688, + "learning_rate": 4.303573855143362e-06, + "loss": 2.7218, + "step": 67100 + }, + { + "epoch": 4.5593830683516785, + "grad_norm": 8.406978607177734, + "learning_rate": 4.303149205055035e-06, + "loss": 2.9541, + "step": 67105 + }, + { + "epoch": 4.55972278842234, + "grad_norm": 7.635625839233398, + "learning_rate": 4.302724554966708e-06, + "loss": 2.6749, + "step": 67110 + }, + { + "epoch": 4.560062508493002, + "grad_norm": 6.850518226623535, + "learning_rate": 4.302299904878381e-06, + "loss": 2.7829, + "step": 67115 + }, + { + "epoch": 4.560402228563664, + "grad_norm": 8.509450912475586, + "learning_rate": 4.3018752547900535e-06, + "loss": 2.9134, + "step": 67120 + }, + { + "epoch": 4.560741948634325, + "grad_norm": 8.680867195129395, + "learning_rate": 4.301450604701726e-06, + "loss": 2.6644, + "step": 67125 + }, + { + "epoch": 4.561081668704987, + "grad_norm": 6.1437506675720215, + "learning_rate": 4.301025954613398e-06, + "loss": 2.8715, + "step": 67130 + }, + { + "epoch": 4.561421388775649, + "grad_norm": 8.082651138305664, + "learning_rate": 4.300601304525072e-06, + "loss": 2.6706, + "step": 67135 + }, + { + "epoch": 4.56176110884631, + "grad_norm": 6.6856913566589355, + "learning_rate": 4.300176654436745e-06, + "loss": 2.9372, + "step": 67140 + }, + { + "epoch": 4.562100828916972, + "grad_norm": 7.154584884643555, + "learning_rate": 4.299752004348417e-06, + "loss": 2.8512, + "step": 67145 + }, + { + "epoch": 4.5624405489876345, + "grad_norm": 8.755339622497559, + "learning_rate": 4.29932735426009e-06, + "loss": 2.9649, + "step": 67150 + }, + { + "epoch": 4.562780269058296, + "grad_norm": 6.905618190765381, + "learning_rate": 4.298902704171763e-06, + "loss": 2.905, + "step": 67155 + }, + { + "epoch": 4.563119989128958, + "grad_norm": 8.281270027160645, + "learning_rate": 4.298478054083435e-06, + "loss": 2.8856, + "step": 67160 + }, + { + "epoch": 4.56345970919962, + "grad_norm": 7.777323246002197, + "learning_rate": 4.298053403995108e-06, + "loss": 3.0017, + "step": 67165 + }, + { + "epoch": 4.563799429270281, + "grad_norm": 7.042450904846191, + "learning_rate": 4.2976287539067815e-06, + "loss": 3.0496, + "step": 67170 + }, + { + "epoch": 4.564139149340943, + "grad_norm": 7.234004974365234, + "learning_rate": 4.297204103818454e-06, + "loss": 2.8929, + "step": 67175 + }, + { + "epoch": 4.564478869411605, + "grad_norm": 8.733428955078125, + "learning_rate": 4.296779453730126e-06, + "loss": 2.8027, + "step": 67180 + }, + { + "epoch": 4.564818589482266, + "grad_norm": 12.094447135925293, + "learning_rate": 4.2963548036418e-06, + "loss": 2.7154, + "step": 67185 + }, + { + "epoch": 4.5651583095529285, + "grad_norm": 7.904615879058838, + "learning_rate": 4.295930153553473e-06, + "loss": 2.852, + "step": 67190 + }, + { + "epoch": 4.5654980296235905, + "grad_norm": 7.450514793395996, + "learning_rate": 4.295505503465145e-06, + "loss": 2.7004, + "step": 67195 + }, + { + "epoch": 4.565837749694252, + "grad_norm": 8.902787208557129, + "learning_rate": 4.2950808533768175e-06, + "loss": 2.8424, + "step": 67200 + }, + { + "epoch": 4.566177469764914, + "grad_norm": 8.353681564331055, + "learning_rate": 4.294656203288491e-06, + "loss": 3.0168, + "step": 67205 + }, + { + "epoch": 4.566517189835576, + "grad_norm": 6.548771858215332, + "learning_rate": 4.294231553200163e-06, + "loss": 2.934, + "step": 67210 + }, + { + "epoch": 4.566856909906237, + "grad_norm": 7.162700176239014, + "learning_rate": 4.293806903111836e-06, + "loss": 2.5552, + "step": 67215 + }, + { + "epoch": 4.567196629976899, + "grad_norm": 6.705132007598877, + "learning_rate": 4.2933822530235096e-06, + "loss": 2.7089, + "step": 67220 + }, + { + "epoch": 4.567536350047561, + "grad_norm": 8.172039031982422, + "learning_rate": 4.2929576029351815e-06, + "loss": 2.7653, + "step": 67225 + }, + { + "epoch": 4.567876070118222, + "grad_norm": 7.25412654876709, + "learning_rate": 4.292532952846854e-06, + "loss": 2.675, + "step": 67230 + }, + { + "epoch": 4.5682157901888845, + "grad_norm": 6.079987525939941, + "learning_rate": 4.292108302758527e-06, + "loss": 2.9446, + "step": 67235 + }, + { + "epoch": 4.5685555102595465, + "grad_norm": 7.566319465637207, + "learning_rate": 4.2916836526702e-06, + "loss": 2.888, + "step": 67240 + }, + { + "epoch": 4.568895230330208, + "grad_norm": 5.775628566741943, + "learning_rate": 4.291259002581873e-06, + "loss": 2.8716, + "step": 67245 + }, + { + "epoch": 4.56923495040087, + "grad_norm": 7.79258394241333, + "learning_rate": 4.2908343524935455e-06, + "loss": 3.0528, + "step": 67250 + }, + { + "epoch": 4.569574670471532, + "grad_norm": 7.623246669769287, + "learning_rate": 4.290409702405218e-06, + "loss": 2.7665, + "step": 67255 + }, + { + "epoch": 4.569914390542193, + "grad_norm": 8.222856521606445, + "learning_rate": 4.289985052316891e-06, + "loss": 2.8013, + "step": 67260 + }, + { + "epoch": 4.570254110612855, + "grad_norm": 8.783113479614258, + "learning_rate": 4.289560402228564e-06, + "loss": 2.8344, + "step": 67265 + }, + { + "epoch": 4.570593830683517, + "grad_norm": 6.646522521972656, + "learning_rate": 4.289135752140237e-06, + "loss": 2.8292, + "step": 67270 + }, + { + "epoch": 4.570933550754178, + "grad_norm": 7.558707237243652, + "learning_rate": 4.2887111020519095e-06, + "loss": 2.886, + "step": 67275 + }, + { + "epoch": 4.5712732708248405, + "grad_norm": 6.233703136444092, + "learning_rate": 4.288286451963582e-06, + "loss": 2.6085, + "step": 67280 + }, + { + "epoch": 4.5716129908955025, + "grad_norm": 7.480814456939697, + "learning_rate": 4.287861801875255e-06, + "loss": 2.8322, + "step": 67285 + }, + { + "epoch": 4.571952710966164, + "grad_norm": 7.822928428649902, + "learning_rate": 4.287437151786928e-06, + "loss": 3.0544, + "step": 67290 + }, + { + "epoch": 4.572292431036826, + "grad_norm": 7.925179481506348, + "learning_rate": 4.287012501698601e-06, + "loss": 2.6569, + "step": 67295 + }, + { + "epoch": 4.572632151107488, + "grad_norm": 9.332741737365723, + "learning_rate": 4.2865878516102735e-06, + "loss": 2.8769, + "step": 67300 + }, + { + "epoch": 4.572971871178149, + "grad_norm": 8.80361270904541, + "learning_rate": 4.286163201521946e-06, + "loss": 2.8386, + "step": 67305 + }, + { + "epoch": 4.573311591248811, + "grad_norm": 5.983442306518555, + "learning_rate": 4.285738551433619e-06, + "loss": 3.0304, + "step": 67310 + }, + { + "epoch": 4.573651311319473, + "grad_norm": 7.0455803871154785, + "learning_rate": 4.285313901345292e-06, + "loss": 2.9281, + "step": 67315 + }, + { + "epoch": 4.573991031390134, + "grad_norm": 7.751582145690918, + "learning_rate": 4.284889251256965e-06, + "loss": 2.8238, + "step": 67320 + }, + { + "epoch": 4.5743307514607965, + "grad_norm": 8.959065437316895, + "learning_rate": 4.2844646011686375e-06, + "loss": 2.8251, + "step": 67325 + }, + { + "epoch": 4.574670471531459, + "grad_norm": 6.022031307220459, + "learning_rate": 4.28403995108031e-06, + "loss": 2.8042, + "step": 67330 + }, + { + "epoch": 4.57501019160212, + "grad_norm": 7.9526591300964355, + "learning_rate": 4.283615300991983e-06, + "loss": 3.014, + "step": 67335 + }, + { + "epoch": 4.575349911672782, + "grad_norm": 5.256960868835449, + "learning_rate": 4.283190650903656e-06, + "loss": 2.4548, + "step": 67340 + }, + { + "epoch": 4.575689631743444, + "grad_norm": 6.892160415649414, + "learning_rate": 4.282766000815329e-06, + "loss": 2.9028, + "step": 67345 + }, + { + "epoch": 4.576029351814105, + "grad_norm": 6.773529529571533, + "learning_rate": 4.2823413507270015e-06, + "loss": 2.8521, + "step": 67350 + }, + { + "epoch": 4.576369071884767, + "grad_norm": 7.705590724945068, + "learning_rate": 4.281916700638674e-06, + "loss": 2.906, + "step": 67355 + }, + { + "epoch": 4.576708791955428, + "grad_norm": 7.57264518737793, + "learning_rate": 4.281492050550347e-06, + "loss": 2.8764, + "step": 67360 + }, + { + "epoch": 4.57704851202609, + "grad_norm": 7.008147239685059, + "learning_rate": 4.28106740046202e-06, + "loss": 2.808, + "step": 67365 + }, + { + "epoch": 4.5773882320967525, + "grad_norm": 9.685969352722168, + "learning_rate": 4.280642750373693e-06, + "loss": 2.7818, + "step": 67370 + }, + { + "epoch": 4.577727952167414, + "grad_norm": 7.413999557495117, + "learning_rate": 4.2802181002853655e-06, + "loss": 2.783, + "step": 67375 + }, + { + "epoch": 4.578067672238076, + "grad_norm": 7.884137153625488, + "learning_rate": 4.2797934501970375e-06, + "loss": 2.9925, + "step": 67380 + }, + { + "epoch": 4.578407392308738, + "grad_norm": 8.116898536682129, + "learning_rate": 4.279368800108711e-06, + "loss": 3.101, + "step": 67385 + }, + { + "epoch": 4.578747112379399, + "grad_norm": 8.217714309692383, + "learning_rate": 4.278944150020384e-06, + "loss": 2.8354, + "step": 67390 + }, + { + "epoch": 4.579086832450061, + "grad_norm": 6.050133228302002, + "learning_rate": 4.278519499932056e-06, + "loss": 2.813, + "step": 67395 + }, + { + "epoch": 4.579426552520723, + "grad_norm": Infinity, + "learning_rate": 4.278179779861395e-06, + "loss": 2.9375, + "step": 67400 + }, + { + "epoch": 4.579766272591384, + "grad_norm": 7.186899185180664, + "learning_rate": 4.277755129773068e-06, + "loss": 3.0133, + "step": 67405 + }, + { + "epoch": 4.580105992662046, + "grad_norm": 6.70400857925415, + "learning_rate": 4.27733047968474e-06, + "loss": 2.9584, + "step": 67410 + }, + { + "epoch": 4.5804457127327085, + "grad_norm": 6.990941524505615, + "learning_rate": 4.276905829596412e-06, + "loss": 2.8517, + "step": 67415 + }, + { + "epoch": 4.58078543280337, + "grad_norm": 8.889575004577637, + "learning_rate": 4.276481179508086e-06, + "loss": 2.6877, + "step": 67420 + }, + { + "epoch": 4.581125152874032, + "grad_norm": 6.379281044006348, + "learning_rate": 4.276056529419759e-06, + "loss": 2.8327, + "step": 67425 + }, + { + "epoch": 4.581464872944694, + "grad_norm": 8.01566219329834, + "learning_rate": 4.275631879331431e-06, + "loss": 2.6821, + "step": 67430 + }, + { + "epoch": 4.581804593015355, + "grad_norm": 7.724262237548828, + "learning_rate": 4.275207229243104e-06, + "loss": 2.9734, + "step": 67435 + }, + { + "epoch": 4.582144313086017, + "grad_norm": 9.317051887512207, + "learning_rate": 4.274782579154777e-06, + "loss": 2.593, + "step": 67440 + }, + { + "epoch": 4.582484033156679, + "grad_norm": 6.5644145011901855, + "learning_rate": 4.274357929066449e-06, + "loss": 2.8239, + "step": 67445 + }, + { + "epoch": 4.58282375322734, + "grad_norm": 8.6926851272583, + "learning_rate": 4.273933278978122e-06, + "loss": 2.79, + "step": 67450 + }, + { + "epoch": 4.583163473298002, + "grad_norm": 6.816800117492676, + "learning_rate": 4.273508628889796e-06, + "loss": 2.6299, + "step": 67455 + }, + { + "epoch": 4.5835031933686645, + "grad_norm": 8.060696601867676, + "learning_rate": 4.2730839788014676e-06, + "loss": 3.0832, + "step": 67460 + }, + { + "epoch": 4.583842913439326, + "grad_norm": 7.082326889038086, + "learning_rate": 4.27265932871314e-06, + "loss": 2.734, + "step": 67465 + }, + { + "epoch": 4.584182633509988, + "grad_norm": 6.510263919830322, + "learning_rate": 4.272234678624814e-06, + "loss": 2.795, + "step": 67470 + }, + { + "epoch": 4.58452235358065, + "grad_norm": 8.850203514099121, + "learning_rate": 4.271810028536486e-06, + "loss": 2.9527, + "step": 67475 + }, + { + "epoch": 4.584862073651311, + "grad_norm": 8.719550132751465, + "learning_rate": 4.271385378448159e-06, + "loss": 2.6812, + "step": 67480 + }, + { + "epoch": 4.585201793721973, + "grad_norm": 5.996084213256836, + "learning_rate": 4.270960728359832e-06, + "loss": 2.7052, + "step": 67485 + }, + { + "epoch": 4.585541513792635, + "grad_norm": 9.725092887878418, + "learning_rate": 4.270536078271504e-06, + "loss": 2.9952, + "step": 67490 + }, + { + "epoch": 4.585881233863296, + "grad_norm": 7.235688209533691, + "learning_rate": 4.270111428183177e-06, + "loss": 2.7914, + "step": 67495 + }, + { + "epoch": 4.5862209539339585, + "grad_norm": 8.462879180908203, + "learning_rate": 4.26968677809485e-06, + "loss": 2.9644, + "step": 67500 + }, + { + "epoch": 4.5865606740046205, + "grad_norm": 5.437614917755127, + "learning_rate": 4.269262128006523e-06, + "loss": 2.8994, + "step": 67505 + }, + { + "epoch": 4.586900394075282, + "grad_norm": 8.833001136779785, + "learning_rate": 4.2688374779181956e-06, + "loss": 2.9181, + "step": 67510 + }, + { + "epoch": 4.587240114145944, + "grad_norm": 5.752147674560547, + "learning_rate": 4.268412827829868e-06, + "loss": 2.9118, + "step": 67515 + }, + { + "epoch": 4.587579834216606, + "grad_norm": 7.541007041931152, + "learning_rate": 4.267988177741541e-06, + "loss": 2.9504, + "step": 67520 + }, + { + "epoch": 4.587919554287267, + "grad_norm": 7.0710320472717285, + "learning_rate": 4.267563527653214e-06, + "loss": 2.9545, + "step": 67525 + }, + { + "epoch": 4.588259274357929, + "grad_norm": 7.437605381011963, + "learning_rate": 4.267138877564887e-06, + "loss": 2.9311, + "step": 67530 + }, + { + "epoch": 4.588598994428591, + "grad_norm": 6.800104141235352, + "learning_rate": 4.26671422747656e-06, + "loss": 2.8414, + "step": 67535 + }, + { + "epoch": 4.588938714499252, + "grad_norm": 8.412789344787598, + "learning_rate": 4.266289577388232e-06, + "loss": 2.8015, + "step": 67540 + }, + { + "epoch": 4.5892784345699145, + "grad_norm": 8.206246376037598, + "learning_rate": 4.265864927299905e-06, + "loss": 3.0242, + "step": 67545 + }, + { + "epoch": 4.5896181546405765, + "grad_norm": 10.527657508850098, + "learning_rate": 4.265440277211578e-06, + "loss": 2.812, + "step": 67550 + }, + { + "epoch": 4.589957874711238, + "grad_norm": 8.269431114196777, + "learning_rate": 4.265015627123251e-06, + "loss": 2.8005, + "step": 67555 + }, + { + "epoch": 4.5902975947819, + "grad_norm": 6.697743892669678, + "learning_rate": 4.264590977034924e-06, + "loss": 2.9898, + "step": 67560 + }, + { + "epoch": 4.590637314852561, + "grad_norm": 8.81983757019043, + "learning_rate": 4.264166326946596e-06, + "loss": 2.6762, + "step": 67565 + }, + { + "epoch": 4.590977034923223, + "grad_norm": 7.514915943145752, + "learning_rate": 4.263741676858269e-06, + "loss": 2.6281, + "step": 67570 + }, + { + "epoch": 4.591316754993885, + "grad_norm": 6.325368404388428, + "learning_rate": 4.263317026769942e-06, + "loss": 2.8503, + "step": 67575 + }, + { + "epoch": 4.591656475064546, + "grad_norm": 8.042694091796875, + "learning_rate": 4.262892376681615e-06, + "loss": 2.8096, + "step": 67580 + }, + { + "epoch": 4.591996195135208, + "grad_norm": 6.623729228973389, + "learning_rate": 4.262467726593288e-06, + "loss": 2.8967, + "step": 67585 + }, + { + "epoch": 4.5923359152058705, + "grad_norm": 7.563882350921631, + "learning_rate": 4.26204307650496e-06, + "loss": 2.8625, + "step": 67590 + }, + { + "epoch": 4.592675635276532, + "grad_norm": 7.915948390960693, + "learning_rate": 4.261618426416633e-06, + "loss": 2.5857, + "step": 67595 + }, + { + "epoch": 4.593015355347194, + "grad_norm": 6.242954254150391, + "learning_rate": 4.261193776328306e-06, + "loss": 2.7272, + "step": 67600 + }, + { + "epoch": 4.593355075417856, + "grad_norm": 8.515583992004395, + "learning_rate": 4.260769126239979e-06, + "loss": 2.7727, + "step": 67605 + }, + { + "epoch": 4.593694795488517, + "grad_norm": 7.331887245178223, + "learning_rate": 4.260344476151652e-06, + "loss": 2.8528, + "step": 67610 + }, + { + "epoch": 4.594034515559179, + "grad_norm": 7.327744007110596, + "learning_rate": 4.259919826063324e-06, + "loss": 2.8137, + "step": 67615 + }, + { + "epoch": 4.594374235629841, + "grad_norm": 8.510196685791016, + "learning_rate": 4.259495175974997e-06, + "loss": 2.6319, + "step": 67620 + }, + { + "epoch": 4.594713955700502, + "grad_norm": 10.373910903930664, + "learning_rate": 4.25907052588667e-06, + "loss": 2.9095, + "step": 67625 + }, + { + "epoch": 4.595053675771164, + "grad_norm": 7.26392126083374, + "learning_rate": 4.258645875798342e-06, + "loss": 3.0319, + "step": 67630 + }, + { + "epoch": 4.5953933958418265, + "grad_norm": 6.526801586151123, + "learning_rate": 4.258221225710016e-06, + "loss": 2.7518, + "step": 67635 + }, + { + "epoch": 4.595733115912488, + "grad_norm": 8.389659881591797, + "learning_rate": 4.257796575621688e-06, + "loss": 2.7089, + "step": 67640 + }, + { + "epoch": 4.59607283598315, + "grad_norm": 8.490142822265625, + "learning_rate": 4.25737192553336e-06, + "loss": 3.1258, + "step": 67645 + }, + { + "epoch": 4.596412556053812, + "grad_norm": 8.607649803161621, + "learning_rate": 4.256947275445034e-06, + "loss": 2.9676, + "step": 67650 + }, + { + "epoch": 4.596752276124473, + "grad_norm": 7.966787338256836, + "learning_rate": 4.256522625356707e-06, + "loss": 3.0741, + "step": 67655 + }, + { + "epoch": 4.597091996195135, + "grad_norm": 8.605193138122559, + "learning_rate": 4.256097975268379e-06, + "loss": 2.7102, + "step": 67660 + }, + { + "epoch": 4.597431716265797, + "grad_norm": 7.681575775146484, + "learning_rate": 4.2556733251800516e-06, + "loss": 2.7923, + "step": 67665 + }, + { + "epoch": 4.597771436336458, + "grad_norm": 5.5282182693481445, + "learning_rate": 4.255248675091725e-06, + "loss": 2.9474, + "step": 67670 + }, + { + "epoch": 4.59811115640712, + "grad_norm": 7.179149627685547, + "learning_rate": 4.254824025003397e-06, + "loss": 2.9066, + "step": 67675 + }, + { + "epoch": 4.5984508764777825, + "grad_norm": 6.689708232879639, + "learning_rate": 4.25439937491507e-06, + "loss": 2.7647, + "step": 67680 + }, + { + "epoch": 4.598790596548444, + "grad_norm": 8.102468490600586, + "learning_rate": 4.253974724826744e-06, + "loss": 2.6651, + "step": 67685 + }, + { + "epoch": 4.599130316619106, + "grad_norm": 7.756166458129883, + "learning_rate": 4.2535500747384156e-06, + "loss": 2.9654, + "step": 67690 + }, + { + "epoch": 4.599470036689768, + "grad_norm": 7.039397716522217, + "learning_rate": 4.253125424650088e-06, + "loss": 2.6894, + "step": 67695 + }, + { + "epoch": 4.599809756760429, + "grad_norm": 7.119719505310059, + "learning_rate": 4.252700774561761e-06, + "loss": 2.8589, + "step": 67700 + }, + { + "epoch": 4.600149476831091, + "grad_norm": 8.28580093383789, + "learning_rate": 4.252276124473434e-06, + "loss": 3.0985, + "step": 67705 + }, + { + "epoch": 4.600489196901753, + "grad_norm": 7.185210704803467, + "learning_rate": 4.251851474385107e-06, + "loss": 2.6123, + "step": 67710 + }, + { + "epoch": 4.600828916972414, + "grad_norm": 7.971365451812744, + "learning_rate": 4.2514268242967796e-06, + "loss": 2.729, + "step": 67715 + }, + { + "epoch": 4.601168637043076, + "grad_norm": 9.607169151306152, + "learning_rate": 4.251002174208453e-06, + "loss": 2.8865, + "step": 67720 + }, + { + "epoch": 4.6015083571137385, + "grad_norm": 6.545289993286133, + "learning_rate": 4.250577524120125e-06, + "loss": 2.6468, + "step": 67725 + }, + { + "epoch": 4.6018480771844, + "grad_norm": 8.063446044921875, + "learning_rate": 4.250152874031798e-06, + "loss": 2.7758, + "step": 67730 + }, + { + "epoch": 4.602187797255062, + "grad_norm": 7.862409591674805, + "learning_rate": 4.249728223943471e-06, + "loss": 2.791, + "step": 67735 + }, + { + "epoch": 4.602527517325724, + "grad_norm": 8.004803657531738, + "learning_rate": 4.2493035738551436e-06, + "loss": 3.0343, + "step": 67740 + }, + { + "epoch": 4.602867237396385, + "grad_norm": 7.706376552581787, + "learning_rate": 4.248878923766816e-06, + "loss": 2.8339, + "step": 67745 + }, + { + "epoch": 4.603206957467047, + "grad_norm": 8.715018272399902, + "learning_rate": 4.248454273678489e-06, + "loss": 2.9256, + "step": 67750 + }, + { + "epoch": 4.603546677537709, + "grad_norm": 8.828454971313477, + "learning_rate": 4.248029623590162e-06, + "loss": 2.7446, + "step": 67755 + }, + { + "epoch": 4.60388639760837, + "grad_norm": 8.069912910461426, + "learning_rate": 4.247604973501835e-06, + "loss": 2.8431, + "step": 67760 + }, + { + "epoch": 4.6042261176790324, + "grad_norm": 9.552574157714844, + "learning_rate": 4.2471803234135076e-06, + "loss": 3.0197, + "step": 67765 + }, + { + "epoch": 4.6045658377496945, + "grad_norm": 8.311102867126465, + "learning_rate": 4.24675567332518e-06, + "loss": 3.0607, + "step": 67770 + }, + { + "epoch": 4.604905557820356, + "grad_norm": 8.032041549682617, + "learning_rate": 4.246331023236853e-06, + "loss": 2.902, + "step": 67775 + }, + { + "epoch": 4.605245277891018, + "grad_norm": 7.030670642852783, + "learning_rate": 4.245906373148526e-06, + "loss": 2.9222, + "step": 67780 + }, + { + "epoch": 4.60558499796168, + "grad_norm": 8.9550199508667, + "learning_rate": 4.245481723060199e-06, + "loss": 2.9676, + "step": 67785 + }, + { + "epoch": 4.605924718032341, + "grad_norm": 6.984374046325684, + "learning_rate": 4.2450570729718716e-06, + "loss": 2.7785, + "step": 67790 + }, + { + "epoch": 4.606264438103003, + "grad_norm": 4.977385997772217, + "learning_rate": 4.244632422883544e-06, + "loss": 2.8613, + "step": 67795 + }, + { + "epoch": 4.606604158173665, + "grad_norm": 7.675411701202393, + "learning_rate": 4.244207772795217e-06, + "loss": 2.7824, + "step": 67800 + }, + { + "epoch": 4.606943878244326, + "grad_norm": 6.388731956481934, + "learning_rate": 4.24378312270689e-06, + "loss": 2.9536, + "step": 67805 + }, + { + "epoch": 4.6072835983149885, + "grad_norm": 7.262224197387695, + "learning_rate": 4.243358472618563e-06, + "loss": 2.7183, + "step": 67810 + }, + { + "epoch": 4.6076233183856505, + "grad_norm": 5.892361640930176, + "learning_rate": 4.2429338225302356e-06, + "loss": 2.9309, + "step": 67815 + }, + { + "epoch": 4.607963038456312, + "grad_norm": 6.370724201202393, + "learning_rate": 4.242509172441908e-06, + "loss": 2.5897, + "step": 67820 + }, + { + "epoch": 4.608302758526974, + "grad_norm": 7.548400402069092, + "learning_rate": 4.242084522353581e-06, + "loss": 3.0501, + "step": 67825 + }, + { + "epoch": 4.608642478597636, + "grad_norm": 7.017244815826416, + "learning_rate": 4.241659872265253e-06, + "loss": 3.1304, + "step": 67830 + }, + { + "epoch": 4.608982198668297, + "grad_norm": 7.780664920806885, + "learning_rate": 4.241235222176927e-06, + "loss": 2.8218, + "step": 67835 + }, + { + "epoch": 4.609321918738959, + "grad_norm": 6.396341800689697, + "learning_rate": 4.2408105720886e-06, + "loss": 2.6254, + "step": 67840 + }, + { + "epoch": 4.609661638809621, + "grad_norm": 8.732373237609863, + "learning_rate": 4.2403859220002715e-06, + "loss": 2.8339, + "step": 67845 + }, + { + "epoch": 4.610001358880282, + "grad_norm": 8.077101707458496, + "learning_rate": 4.239961271911945e-06, + "loss": 2.9079, + "step": 67850 + }, + { + "epoch": 4.6103410789509445, + "grad_norm": 7.887930870056152, + "learning_rate": 4.239536621823618e-06, + "loss": 2.865, + "step": 67855 + }, + { + "epoch": 4.6106807990216065, + "grad_norm": 8.195606231689453, + "learning_rate": 4.23911197173529e-06, + "loss": 2.8799, + "step": 67860 + }, + { + "epoch": 4.611020519092268, + "grad_norm": 7.157407760620117, + "learning_rate": 4.238687321646964e-06, + "loss": 2.7665, + "step": 67865 + }, + { + "epoch": 4.61136023916293, + "grad_norm": 6.948520183563232, + "learning_rate": 4.238262671558636e-06, + "loss": 2.5714, + "step": 67870 + }, + { + "epoch": 4.611699959233592, + "grad_norm": 8.499303817749023, + "learning_rate": 4.237838021470308e-06, + "loss": 2.7567, + "step": 67875 + }, + { + "epoch": 4.612039679304253, + "grad_norm": 5.9341511726379395, + "learning_rate": 4.237413371381981e-06, + "loss": 2.872, + "step": 67880 + }, + { + "epoch": 4.612379399374915, + "grad_norm": 5.395257949829102, + "learning_rate": 4.236988721293655e-06, + "loss": 2.8199, + "step": 67885 + }, + { + "epoch": 4.612719119445577, + "grad_norm": 7.507593154907227, + "learning_rate": 4.236564071205328e-06, + "loss": 2.967, + "step": 67890 + }, + { + "epoch": 4.613058839516238, + "grad_norm": 9.687628746032715, + "learning_rate": 4.2361394211169995e-06, + "loss": 2.7379, + "step": 67895 + }, + { + "epoch": 4.6133985595869005, + "grad_norm": 6.482899188995361, + "learning_rate": 4.235714771028673e-06, + "loss": 2.9898, + "step": 67900 + }, + { + "epoch": 4.613738279657563, + "grad_norm": 5.6973876953125, + "learning_rate": 4.235290120940346e-06, + "loss": 2.7909, + "step": 67905 + }, + { + "epoch": 4.614077999728224, + "grad_norm": 8.176898002624512, + "learning_rate": 4.234865470852018e-06, + "loss": 2.7032, + "step": 67910 + }, + { + "epoch": 4.614417719798886, + "grad_norm": 9.019661903381348, + "learning_rate": 4.234440820763691e-06, + "loss": 2.9248, + "step": 67915 + }, + { + "epoch": 4.614757439869548, + "grad_norm": 7.719452857971191, + "learning_rate": 4.234016170675364e-06, + "loss": 2.8426, + "step": 67920 + }, + { + "epoch": 4.615097159940209, + "grad_norm": 7.837551116943359, + "learning_rate": 4.233591520587036e-06, + "loss": 3.0284, + "step": 67925 + }, + { + "epoch": 4.615436880010871, + "grad_norm": 6.126484394073486, + "learning_rate": 4.233166870498709e-06, + "loss": 2.7144, + "step": 67930 + }, + { + "epoch": 4.615776600081533, + "grad_norm": 6.660331726074219, + "learning_rate": 4.232742220410383e-06, + "loss": 2.7788, + "step": 67935 + }, + { + "epoch": 4.616116320152194, + "grad_norm": 6.917897701263428, + "learning_rate": 4.232317570322055e-06, + "loss": 2.8278, + "step": 67940 + }, + { + "epoch": 4.6164560402228565, + "grad_norm": 6.678741455078125, + "learning_rate": 4.2318929202337275e-06, + "loss": 2.971, + "step": 67945 + }, + { + "epoch": 4.616795760293519, + "grad_norm": 6.651390075683594, + "learning_rate": 4.2314682701454e-06, + "loss": 2.914, + "step": 67950 + }, + { + "epoch": 4.61713548036418, + "grad_norm": 7.961653232574463, + "learning_rate": 4.231043620057073e-06, + "loss": 3.0844, + "step": 67955 + }, + { + "epoch": 4.617475200434842, + "grad_norm": 8.538301467895508, + "learning_rate": 4.230618969968746e-06, + "loss": 2.8079, + "step": 67960 + }, + { + "epoch": 4.617814920505504, + "grad_norm": 6.435659885406494, + "learning_rate": 4.230194319880419e-06, + "loss": 2.7117, + "step": 67965 + }, + { + "epoch": 4.618154640576165, + "grad_norm": 10.137450218200684, + "learning_rate": 4.2297696697920916e-06, + "loss": 2.813, + "step": 67970 + }, + { + "epoch": 4.618494360646827, + "grad_norm": 9.105772972106934, + "learning_rate": 4.229345019703764e-06, + "loss": 2.9804, + "step": 67975 + }, + { + "epoch": 4.618834080717489, + "grad_norm": 5.845820426940918, + "learning_rate": 4.228920369615437e-06, + "loss": 2.6615, + "step": 67980 + }, + { + "epoch": 4.61917380078815, + "grad_norm": 7.093904972076416, + "learning_rate": 4.22849571952711e-06, + "loss": 2.8692, + "step": 67985 + }, + { + "epoch": 4.6195135208588125, + "grad_norm": 8.048479080200195, + "learning_rate": 4.228071069438783e-06, + "loss": 2.92, + "step": 67990 + }, + { + "epoch": 4.619853240929475, + "grad_norm": 8.713844299316406, + "learning_rate": 4.2276464193504556e-06, + "loss": 2.9852, + "step": 67995 + }, + { + "epoch": 4.620192961000136, + "grad_norm": 7.748255729675293, + "learning_rate": 4.227221769262128e-06, + "loss": 2.6074, + "step": 68000 + }, + { + "epoch": 4.620532681070798, + "grad_norm": 7.2786993980407715, + "learning_rate": 4.226797119173801e-06, + "loss": 2.9365, + "step": 68005 + }, + { + "epoch": 4.62087240114146, + "grad_norm": 7.622429370880127, + "learning_rate": 4.226372469085474e-06, + "loss": 2.916, + "step": 68010 + }, + { + "epoch": 4.621212121212121, + "grad_norm": 6.354182243347168, + "learning_rate": 4.225947818997147e-06, + "loss": 2.6416, + "step": 68015 + }, + { + "epoch": 4.621551841282783, + "grad_norm": 7.342607498168945, + "learning_rate": 4.2255231689088196e-06, + "loss": 2.7275, + "step": 68020 + }, + { + "epoch": 4.621891561353445, + "grad_norm": 6.950204372406006, + "learning_rate": 4.225098518820492e-06, + "loss": 2.5875, + "step": 68025 + }, + { + "epoch": 4.622231281424106, + "grad_norm": 7.762843132019043, + "learning_rate": 4.224673868732165e-06, + "loss": 2.7168, + "step": 68030 + }, + { + "epoch": 4.6225710014947685, + "grad_norm": 10.278541564941406, + "learning_rate": 4.224249218643838e-06, + "loss": 2.8341, + "step": 68035 + }, + { + "epoch": 4.62291072156543, + "grad_norm": 7.616506099700928, + "learning_rate": 4.223824568555511e-06, + "loss": 2.7724, + "step": 68040 + }, + { + "epoch": 4.623250441636092, + "grad_norm": 7.338340759277344, + "learning_rate": 4.223399918467183e-06, + "loss": 2.5751, + "step": 68045 + }, + { + "epoch": 4.623590161706754, + "grad_norm": 7.496903896331787, + "learning_rate": 4.222975268378856e-06, + "loss": 2.9332, + "step": 68050 + }, + { + "epoch": 4.623929881777415, + "grad_norm": 6.657071113586426, + "learning_rate": 4.222550618290529e-06, + "loss": 2.7188, + "step": 68055 + }, + { + "epoch": 4.624269601848077, + "grad_norm": 7.603232383728027, + "learning_rate": 4.222125968202202e-06, + "loss": 2.9745, + "step": 68060 + }, + { + "epoch": 4.624609321918739, + "grad_norm": 6.987937927246094, + "learning_rate": 4.221701318113875e-06, + "loss": 3.0581, + "step": 68065 + }, + { + "epoch": 4.6249490419894, + "grad_norm": 8.313549995422363, + "learning_rate": 4.2212766680255476e-06, + "loss": 2.7214, + "step": 68070 + }, + { + "epoch": 4.6252887620600625, + "grad_norm": 5.894706726074219, + "learning_rate": 4.22085201793722e-06, + "loss": 2.8691, + "step": 68075 + }, + { + "epoch": 4.6256284821307245, + "grad_norm": 8.92448902130127, + "learning_rate": 4.220427367848892e-06, + "loss": 2.906, + "step": 68080 + }, + { + "epoch": 4.625968202201386, + "grad_norm": 6.334659576416016, + "learning_rate": 4.220002717760566e-06, + "loss": 2.8347, + "step": 68085 + }, + { + "epoch": 4.626307922272048, + "grad_norm": 8.27712345123291, + "learning_rate": 4.219578067672239e-06, + "loss": 2.8368, + "step": 68090 + }, + { + "epoch": 4.62664764234271, + "grad_norm": 6.38062047958374, + "learning_rate": 4.219153417583911e-06, + "loss": 2.9757, + "step": 68095 + }, + { + "epoch": 4.626987362413371, + "grad_norm": 8.84516429901123, + "learning_rate": 4.218728767495584e-06, + "loss": 3.0363, + "step": 68100 + }, + { + "epoch": 4.627327082484033, + "grad_norm": 7.4734578132629395, + "learning_rate": 4.218304117407257e-06, + "loss": 2.8941, + "step": 68105 + }, + { + "epoch": 4.627666802554695, + "grad_norm": 7.03882360458374, + "learning_rate": 4.217879467318929e-06, + "loss": 2.6483, + "step": 68110 + }, + { + "epoch": 4.628006522625356, + "grad_norm": 7.4109110832214355, + "learning_rate": 4.217454817230602e-06, + "loss": 3.0148, + "step": 68115 + }, + { + "epoch": 4.6283462426960185, + "grad_norm": 7.340741157531738, + "learning_rate": 4.2170301671422756e-06, + "loss": 2.8384, + "step": 68120 + }, + { + "epoch": 4.6286859627666805, + "grad_norm": 9.399697303771973, + "learning_rate": 4.2166055170539475e-06, + "loss": 2.9831, + "step": 68125 + }, + { + "epoch": 4.629025682837342, + "grad_norm": 6.146501064300537, + "learning_rate": 4.21618086696562e-06, + "loss": 2.9367, + "step": 68130 + }, + { + "epoch": 4.629365402908004, + "grad_norm": 5.935805320739746, + "learning_rate": 4.215756216877294e-06, + "loss": 2.8038, + "step": 68135 + }, + { + "epoch": 4.629705122978666, + "grad_norm": 6.917463302612305, + "learning_rate": 4.215331566788966e-06, + "loss": 2.8025, + "step": 68140 + }, + { + "epoch": 4.630044843049327, + "grad_norm": 6.226720333099365, + "learning_rate": 4.214906916700639e-06, + "loss": 2.8282, + "step": 68145 + }, + { + "epoch": 4.630384563119989, + "grad_norm": 9.176196098327637, + "learning_rate": 4.214482266612312e-06, + "loss": 2.8311, + "step": 68150 + }, + { + "epoch": 4.630724283190651, + "grad_norm": 7.16357421875, + "learning_rate": 4.214057616523984e-06, + "loss": 2.7537, + "step": 68155 + }, + { + "epoch": 4.631064003261312, + "grad_norm": 7.659153938293457, + "learning_rate": 4.213632966435657e-06, + "loss": 2.9532, + "step": 68160 + }, + { + "epoch": 4.6314037233319745, + "grad_norm": 7.6256632804870605, + "learning_rate": 4.21320831634733e-06, + "loss": 2.9152, + "step": 68165 + }, + { + "epoch": 4.6317434434026366, + "grad_norm": 6.811580181121826, + "learning_rate": 4.212783666259003e-06, + "loss": 2.8312, + "step": 68170 + }, + { + "epoch": 4.632083163473298, + "grad_norm": 8.237366676330566, + "learning_rate": 4.2123590161706755e-06, + "loss": 2.9127, + "step": 68175 + }, + { + "epoch": 4.63242288354396, + "grad_norm": 6.504398345947266, + "learning_rate": 4.211934366082348e-06, + "loss": 2.8885, + "step": 68180 + }, + { + "epoch": 4.632762603614622, + "grad_norm": 9.161149024963379, + "learning_rate": 4.211509715994021e-06, + "loss": 2.9201, + "step": 68185 + }, + { + "epoch": 4.633102323685283, + "grad_norm": 6.793470859527588, + "learning_rate": 4.211085065905694e-06, + "loss": 2.891, + "step": 68190 + }, + { + "epoch": 4.633442043755945, + "grad_norm": 7.168518543243408, + "learning_rate": 4.210660415817367e-06, + "loss": 3.1518, + "step": 68195 + }, + { + "epoch": 4.633781763826607, + "grad_norm": 6.863064289093018, + "learning_rate": 4.2102357657290395e-06, + "loss": 2.7797, + "step": 68200 + }, + { + "epoch": 4.634121483897268, + "grad_norm": 8.165279388427734, + "learning_rate": 4.209811115640712e-06, + "loss": 3.0228, + "step": 68205 + }, + { + "epoch": 4.6344612039679305, + "grad_norm": 7.705974578857422, + "learning_rate": 4.209386465552385e-06, + "loss": 2.6243, + "step": 68210 + }, + { + "epoch": 4.634800924038593, + "grad_norm": 7.299661636352539, + "learning_rate": 4.208961815464058e-06, + "loss": 3.2473, + "step": 68215 + }, + { + "epoch": 4.635140644109254, + "grad_norm": 7.8854289054870605, + "learning_rate": 4.208537165375731e-06, + "loss": 2.7986, + "step": 68220 + }, + { + "epoch": 4.635480364179916, + "grad_norm": 6.67617654800415, + "learning_rate": 4.2081125152874035e-06, + "loss": 2.9816, + "step": 68225 + }, + { + "epoch": 4.635820084250578, + "grad_norm": 8.188300132751465, + "learning_rate": 4.207687865199076e-06, + "loss": 2.6998, + "step": 68230 + }, + { + "epoch": 4.636159804321239, + "grad_norm": 9.05941104888916, + "learning_rate": 4.207263215110749e-06, + "loss": 2.8113, + "step": 68235 + }, + { + "epoch": 4.636499524391901, + "grad_norm": 8.210592269897461, + "learning_rate": 4.206838565022422e-06, + "loss": 2.9681, + "step": 68240 + }, + { + "epoch": 4.636839244462563, + "grad_norm": 6.413578987121582, + "learning_rate": 4.206413914934095e-06, + "loss": 2.9237, + "step": 68245 + }, + { + "epoch": 4.637178964533224, + "grad_norm": 9.47437858581543, + "learning_rate": 4.2059892648457675e-06, + "loss": 2.8578, + "step": 68250 + }, + { + "epoch": 4.6375186846038865, + "grad_norm": 6.612038612365723, + "learning_rate": 4.20556461475744e-06, + "loss": 2.7857, + "step": 68255 + }, + { + "epoch": 4.637858404674548, + "grad_norm": 6.331183433532715, + "learning_rate": 4.205139964669113e-06, + "loss": 2.9555, + "step": 68260 + }, + { + "epoch": 4.63819812474521, + "grad_norm": 7.365171909332275, + "learning_rate": 4.204715314580786e-06, + "loss": 2.8401, + "step": 68265 + }, + { + "epoch": 4.638537844815872, + "grad_norm": 7.076393127441406, + "learning_rate": 4.204290664492459e-06, + "loss": 2.6845, + "step": 68270 + }, + { + "epoch": 4.638877564886533, + "grad_norm": 6.31442403793335, + "learning_rate": 4.2038660144041316e-06, + "loss": 2.9052, + "step": 68275 + }, + { + "epoch": 4.639217284957195, + "grad_norm": 8.607978820800781, + "learning_rate": 4.203441364315804e-06, + "loss": 3.0172, + "step": 68280 + }, + { + "epoch": 4.639557005027857, + "grad_norm": 7.825046539306641, + "learning_rate": 4.203016714227477e-06, + "loss": 2.6114, + "step": 68285 + }, + { + "epoch": 4.639896725098518, + "grad_norm": 6.77092981338501, + "learning_rate": 4.20259206413915e-06, + "loss": 2.8108, + "step": 68290 + }, + { + "epoch": 4.64023644516918, + "grad_norm": 8.181745529174805, + "learning_rate": 4.202167414050822e-06, + "loss": 2.981, + "step": 68295 + }, + { + "epoch": 4.6405761652398425, + "grad_norm": 7.692387104034424, + "learning_rate": 4.2017427639624956e-06, + "loss": 2.6309, + "step": 68300 + }, + { + "epoch": 4.640915885310504, + "grad_norm": 6.597299098968506, + "learning_rate": 4.201318113874168e-06, + "loss": 2.5912, + "step": 68305 + }, + { + "epoch": 4.641255605381166, + "grad_norm": 7.305192947387695, + "learning_rate": 4.20089346378584e-06, + "loss": 2.7599, + "step": 68310 + }, + { + "epoch": 4.641595325451828, + "grad_norm": 9.030657768249512, + "learning_rate": 4.200468813697514e-06, + "loss": 2.9537, + "step": 68315 + }, + { + "epoch": 4.641935045522489, + "grad_norm": 6.887563228607178, + "learning_rate": 4.200044163609187e-06, + "loss": 2.8618, + "step": 68320 + }, + { + "epoch": 4.642274765593151, + "grad_norm": 7.362038612365723, + "learning_rate": 4.199619513520859e-06, + "loss": 2.6971, + "step": 68325 + }, + { + "epoch": 4.642614485663813, + "grad_norm": 9.242076873779297, + "learning_rate": 4.1991948634325315e-06, + "loss": 2.9369, + "step": 68330 + }, + { + "epoch": 4.642954205734474, + "grad_norm": 5.916932106018066, + "learning_rate": 4.198770213344205e-06, + "loss": 2.8351, + "step": 68335 + }, + { + "epoch": 4.6432939258051364, + "grad_norm": 7.2249860763549805, + "learning_rate": 4.198345563255877e-06, + "loss": 2.9465, + "step": 68340 + }, + { + "epoch": 4.6436336458757985, + "grad_norm": 7.386667728424072, + "learning_rate": 4.19792091316755e-06, + "loss": 2.5406, + "step": 68345 + }, + { + "epoch": 4.64397336594646, + "grad_norm": 9.074287414550781, + "learning_rate": 4.1974962630792236e-06, + "loss": 2.8385, + "step": 68350 + }, + { + "epoch": 4.644313086017122, + "grad_norm": 6.830821990966797, + "learning_rate": 4.1970716129908955e-06, + "loss": 2.8897, + "step": 68355 + }, + { + "epoch": 4.644652806087784, + "grad_norm": 7.130026817321777, + "learning_rate": 4.196646962902568e-06, + "loss": 2.799, + "step": 68360 + }, + { + "epoch": 4.644992526158445, + "grad_norm": 8.42487907409668, + "learning_rate": 4.196222312814241e-06, + "loss": 2.6636, + "step": 68365 + }, + { + "epoch": 4.645332246229107, + "grad_norm": 9.607242584228516, + "learning_rate": 4.195797662725914e-06, + "loss": 3.0514, + "step": 68370 + }, + { + "epoch": 4.645671966299769, + "grad_norm": 8.529074668884277, + "learning_rate": 4.195373012637587e-06, + "loss": 3.0926, + "step": 68375 + }, + { + "epoch": 4.64601168637043, + "grad_norm": 7.812297821044922, + "learning_rate": 4.1949483625492595e-06, + "loss": 2.7231, + "step": 68380 + }, + { + "epoch": 4.6463514064410925, + "grad_norm": 5.826145172119141, + "learning_rate": 4.194523712460932e-06, + "loss": 2.8524, + "step": 68385 + }, + { + "epoch": 4.6466911265117545, + "grad_norm": 7.79865837097168, + "learning_rate": 4.194099062372605e-06, + "loss": 2.793, + "step": 68390 + }, + { + "epoch": 4.647030846582416, + "grad_norm": 8.69351577758789, + "learning_rate": 4.193674412284278e-06, + "loss": 2.9662, + "step": 68395 + }, + { + "epoch": 4.647370566653078, + "grad_norm": 8.286032676696777, + "learning_rate": 4.193249762195951e-06, + "loss": 3.0769, + "step": 68400 + }, + { + "epoch": 4.64771028672374, + "grad_norm": 8.469779014587402, + "learning_rate": 4.1928251121076235e-06, + "loss": 2.895, + "step": 68405 + }, + { + "epoch": 4.648050006794401, + "grad_norm": 7.480401515960693, + "learning_rate": 4.192400462019296e-06, + "loss": 2.6986, + "step": 68410 + }, + { + "epoch": 4.648389726865063, + "grad_norm": 10.19239330291748, + "learning_rate": 4.191975811930969e-06, + "loss": 2.9284, + "step": 68415 + }, + { + "epoch": 4.648729446935725, + "grad_norm": 6.452463626861572, + "learning_rate": 4.191551161842642e-06, + "loss": 2.8038, + "step": 68420 + }, + { + "epoch": 4.649069167006386, + "grad_norm": 6.597055435180664, + "learning_rate": 4.191126511754315e-06, + "loss": 2.6977, + "step": 68425 + }, + { + "epoch": 4.6494088870770485, + "grad_norm": 7.405770301818848, + "learning_rate": 4.1907018616659875e-06, + "loss": 2.8904, + "step": 68430 + }, + { + "epoch": 4.6497486071477105, + "grad_norm": 7.336877822875977, + "learning_rate": 4.19027721157766e-06, + "loss": 2.8124, + "step": 68435 + }, + { + "epoch": 4.650088327218372, + "grad_norm": 6.360779285430908, + "learning_rate": 4.189852561489333e-06, + "loss": 3.0927, + "step": 68440 + }, + { + "epoch": 4.650428047289034, + "grad_norm": 7.014015197753906, + "learning_rate": 4.189427911401006e-06, + "loss": 2.9216, + "step": 68445 + }, + { + "epoch": 4.650767767359696, + "grad_norm": 9.56564712524414, + "learning_rate": 4.189003261312679e-06, + "loss": 2.9409, + "step": 68450 + }, + { + "epoch": 4.651107487430357, + "grad_norm": 7.201160907745361, + "learning_rate": 4.1885786112243515e-06, + "loss": 2.8142, + "step": 68455 + }, + { + "epoch": 4.651447207501019, + "grad_norm": 6.55320930480957, + "learning_rate": 4.188153961136024e-06, + "loss": 3.0821, + "step": 68460 + }, + { + "epoch": 4.651786927571681, + "grad_norm": 6.30709171295166, + "learning_rate": 4.187729311047697e-06, + "loss": 2.6887, + "step": 68465 + }, + { + "epoch": 4.652126647642342, + "grad_norm": 7.431978225708008, + "learning_rate": 4.18730466095937e-06, + "loss": 2.8464, + "step": 68470 + }, + { + "epoch": 4.6524663677130045, + "grad_norm": 8.836938858032227, + "learning_rate": 4.186880010871043e-06, + "loss": 2.6911, + "step": 68475 + }, + { + "epoch": 4.6528060877836666, + "grad_norm": 6.319708824157715, + "learning_rate": 4.1864553607827155e-06, + "loss": 2.588, + "step": 68480 + }, + { + "epoch": 4.653145807854328, + "grad_norm": 8.86195182800293, + "learning_rate": 4.186030710694388e-06, + "loss": 3.0087, + "step": 68485 + }, + { + "epoch": 4.65348552792499, + "grad_norm": 7.714229583740234, + "learning_rate": 4.185606060606061e-06, + "loss": 3.027, + "step": 68490 + }, + { + "epoch": 4.653825247995652, + "grad_norm": 6.598628044128418, + "learning_rate": 4.185181410517734e-06, + "loss": 2.6607, + "step": 68495 + }, + { + "epoch": 4.654164968066313, + "grad_norm": 7.224616050720215, + "learning_rate": 4.184756760429407e-06, + "loss": 2.8686, + "step": 68500 + }, + { + "epoch": 4.654504688136975, + "grad_norm": 7.31086540222168, + "learning_rate": 4.1843321103410795e-06, + "loss": 2.8063, + "step": 68505 + }, + { + "epoch": 4.654844408207637, + "grad_norm": 7.603959560394287, + "learning_rate": 4.1839074602527515e-06, + "loss": 2.8976, + "step": 68510 + }, + { + "epoch": 4.655184128278298, + "grad_norm": 7.366803169250488, + "learning_rate": 4.183482810164425e-06, + "loss": 2.8461, + "step": 68515 + }, + { + "epoch": 4.6555238483489605, + "grad_norm": 8.080523490905762, + "learning_rate": 4.183058160076098e-06, + "loss": 2.9542, + "step": 68520 + }, + { + "epoch": 4.655863568419623, + "grad_norm": 6.612279891967773, + "learning_rate": 4.18263350998777e-06, + "loss": 2.7524, + "step": 68525 + }, + { + "epoch": 4.656203288490284, + "grad_norm": 8.593560218811035, + "learning_rate": 4.1822088598994435e-06, + "loss": 2.9292, + "step": 68530 + }, + { + "epoch": 4.656543008560946, + "grad_norm": 7.642291069030762, + "learning_rate": 4.181784209811116e-06, + "loss": 3.034, + "step": 68535 + }, + { + "epoch": 4.656882728631608, + "grad_norm": 9.023877143859863, + "learning_rate": 4.181359559722788e-06, + "loss": 2.9706, + "step": 68540 + }, + { + "epoch": 4.657222448702269, + "grad_norm": 7.454680442810059, + "learning_rate": 4.180934909634461e-06, + "loss": 2.9024, + "step": 68545 + }, + { + "epoch": 4.657562168772931, + "grad_norm": 7.53230619430542, + "learning_rate": 4.180510259546135e-06, + "loss": 3.0486, + "step": 68550 + }, + { + "epoch": 4.657901888843593, + "grad_norm": 6.8429341316223145, + "learning_rate": 4.180085609457807e-06, + "loss": 2.6771, + "step": 68555 + }, + { + "epoch": 4.658241608914254, + "grad_norm": 7.240707874298096, + "learning_rate": 4.1796609593694795e-06, + "loss": 2.8041, + "step": 68560 + }, + { + "epoch": 4.6585813289849165, + "grad_norm": 7.025838375091553, + "learning_rate": 4.179236309281153e-06, + "loss": 3.049, + "step": 68565 + }, + { + "epoch": 4.658921049055579, + "grad_norm": 10.016670227050781, + "learning_rate": 4.178811659192826e-06, + "loss": 3.0304, + "step": 68570 + }, + { + "epoch": 4.65926076912624, + "grad_norm": 6.952427864074707, + "learning_rate": 4.178387009104498e-06, + "loss": 2.778, + "step": 68575 + }, + { + "epoch": 4.659600489196902, + "grad_norm": 6.591986179351807, + "learning_rate": 4.177962359016171e-06, + "loss": 3.0011, + "step": 68580 + }, + { + "epoch": 4.659940209267564, + "grad_norm": 7.7397332191467285, + "learning_rate": 4.177537708927844e-06, + "loss": 2.9992, + "step": 68585 + }, + { + "epoch": 4.660279929338225, + "grad_norm": 7.716129779815674, + "learning_rate": 4.177113058839516e-06, + "loss": 2.8556, + "step": 68590 + }, + { + "epoch": 4.660619649408887, + "grad_norm": 7.55331563949585, + "learning_rate": 4.176688408751189e-06, + "loss": 2.8448, + "step": 68595 + }, + { + "epoch": 4.660959369479549, + "grad_norm": 6.354416370391846, + "learning_rate": 4.176263758662863e-06, + "loss": 2.481, + "step": 68600 + }, + { + "epoch": 4.66129908955021, + "grad_norm": 6.90347957611084, + "learning_rate": 4.175839108574535e-06, + "loss": 2.8427, + "step": 68605 + }, + { + "epoch": 4.6616388096208725, + "grad_norm": 7.405783653259277, + "learning_rate": 4.1754144584862075e-06, + "loss": 2.8498, + "step": 68610 + }, + { + "epoch": 4.661978529691535, + "grad_norm": 7.318639755249023, + "learning_rate": 4.17498980839788e-06, + "loss": 2.8533, + "step": 68615 + }, + { + "epoch": 4.662318249762196, + "grad_norm": 9.53602409362793, + "learning_rate": 4.174565158309553e-06, + "loss": 2.828, + "step": 68620 + }, + { + "epoch": 4.662657969832858, + "grad_norm": 6.667930603027344, + "learning_rate": 4.174140508221226e-06, + "loss": 2.8434, + "step": 68625 + }, + { + "epoch": 4.66299768990352, + "grad_norm": 7.554039478302002, + "learning_rate": 4.173715858132899e-06, + "loss": 2.9452, + "step": 68630 + }, + { + "epoch": 4.663337409974181, + "grad_norm": 9.65698528289795, + "learning_rate": 4.1732912080445715e-06, + "loss": 2.9187, + "step": 68635 + }, + { + "epoch": 4.663677130044843, + "grad_norm": 8.376275062561035, + "learning_rate": 4.172866557956244e-06, + "loss": 2.7399, + "step": 68640 + }, + { + "epoch": 4.664016850115505, + "grad_norm": 5.6332783699035645, + "learning_rate": 4.172441907867917e-06, + "loss": 2.8333, + "step": 68645 + }, + { + "epoch": 4.6643565701861665, + "grad_norm": 8.892239570617676, + "learning_rate": 4.17201725777959e-06, + "loss": 2.7427, + "step": 68650 + }, + { + "epoch": 4.6646962902568285, + "grad_norm": 6.660672187805176, + "learning_rate": 4.171592607691263e-06, + "loss": 2.8605, + "step": 68655 + }, + { + "epoch": 4.665036010327491, + "grad_norm": 6.36131477355957, + "learning_rate": 4.1711679576029355e-06, + "loss": 2.9286, + "step": 68660 + }, + { + "epoch": 4.665375730398152, + "grad_norm": 7.476963996887207, + "learning_rate": 4.170743307514608e-06, + "loss": 2.8047, + "step": 68665 + }, + { + "epoch": 4.665715450468814, + "grad_norm": 7.770412921905518, + "learning_rate": 4.170318657426281e-06, + "loss": 2.8742, + "step": 68670 + }, + { + "epoch": 4.666055170539476, + "grad_norm": 6.95335054397583, + "learning_rate": 4.169894007337954e-06, + "loss": 3.0219, + "step": 68675 + }, + { + "epoch": 4.666394890610137, + "grad_norm": 8.489352226257324, + "learning_rate": 4.169469357249627e-06, + "loss": 2.6868, + "step": 68680 + }, + { + "epoch": 4.666734610680799, + "grad_norm": 8.709622383117676, + "learning_rate": 4.1690447071612995e-06, + "loss": 2.9872, + "step": 68685 + }, + { + "epoch": 4.667074330751461, + "grad_norm": 9.080438613891602, + "learning_rate": 4.168620057072972e-06, + "loss": 3.0082, + "step": 68690 + }, + { + "epoch": 4.6674140508221225, + "grad_norm": 6.257317066192627, + "learning_rate": 4.168195406984645e-06, + "loss": 2.7187, + "step": 68695 + }, + { + "epoch": 4.6677537708927845, + "grad_norm": 7.559149265289307, + "learning_rate": 4.167770756896318e-06, + "loss": 3.0213, + "step": 68700 + }, + { + "epoch": 4.668093490963447, + "grad_norm": 7.353180408477783, + "learning_rate": 4.167346106807991e-06, + "loss": 2.8727, + "step": 68705 + }, + { + "epoch": 4.668433211034108, + "grad_norm": 6.566103458404541, + "learning_rate": 4.166921456719663e-06, + "loss": 2.5609, + "step": 68710 + }, + { + "epoch": 4.66877293110477, + "grad_norm": 7.043023109436035, + "learning_rate": 4.166496806631336e-06, + "loss": 2.8028, + "step": 68715 + }, + { + "epoch": 4.669112651175431, + "grad_norm": 8.937663078308105, + "learning_rate": 4.166072156543009e-06, + "loss": 2.9842, + "step": 68720 + }, + { + "epoch": 4.669452371246093, + "grad_norm": 8.855393409729004, + "learning_rate": 4.165647506454681e-06, + "loss": 2.8858, + "step": 68725 + }, + { + "epoch": 4.669792091316755, + "grad_norm": 8.828709602355957, + "learning_rate": 4.165222856366355e-06, + "loss": 2.8261, + "step": 68730 + }, + { + "epoch": 4.670131811387416, + "grad_norm": 8.725804328918457, + "learning_rate": 4.1647982062780275e-06, + "loss": 2.8452, + "step": 68735 + }, + { + "epoch": 4.6704715314580785, + "grad_norm": 6.412111759185791, + "learning_rate": 4.1643735561897e-06, + "loss": 2.8884, + "step": 68740 + }, + { + "epoch": 4.6708112515287405, + "grad_norm": 8.394564628601074, + "learning_rate": 4.163948906101372e-06, + "loss": 2.7722, + "step": 68745 + }, + { + "epoch": 4.671150971599402, + "grad_norm": 9.068254470825195, + "learning_rate": 4.163524256013046e-06, + "loss": 2.9526, + "step": 68750 + }, + { + "epoch": 4.671490691670064, + "grad_norm": 8.748550415039062, + "learning_rate": 4.163099605924719e-06, + "loss": 2.8913, + "step": 68755 + }, + { + "epoch": 4.671830411740726, + "grad_norm": 8.1381254196167, + "learning_rate": 4.162674955836391e-06, + "loss": 2.9443, + "step": 68760 + }, + { + "epoch": 4.672170131811387, + "grad_norm": 6.9438934326171875, + "learning_rate": 4.162250305748064e-06, + "loss": 3.051, + "step": 68765 + }, + { + "epoch": 4.672509851882049, + "grad_norm": 8.979554176330566, + "learning_rate": 4.161825655659737e-06, + "loss": 2.7858, + "step": 68770 + }, + { + "epoch": 4.672849571952711, + "grad_norm": 5.954489707946777, + "learning_rate": 4.161401005571409e-06, + "loss": 2.9548, + "step": 68775 + }, + { + "epoch": 4.673189292023372, + "grad_norm": 7.198116302490234, + "learning_rate": 4.160976355483083e-06, + "loss": 3.1034, + "step": 68780 + }, + { + "epoch": 4.6735290120940345, + "grad_norm": 7.076038837432861, + "learning_rate": 4.1605517053947555e-06, + "loss": 2.8763, + "step": 68785 + }, + { + "epoch": 4.673868732164697, + "grad_norm": 6.565167427062988, + "learning_rate": 4.1601270553064275e-06, + "loss": 2.752, + "step": 68790 + }, + { + "epoch": 4.674208452235358, + "grad_norm": 8.153335571289062, + "learning_rate": 4.1597024052181e-06, + "loss": 2.8723, + "step": 68795 + }, + { + "epoch": 4.67454817230602, + "grad_norm": 7.413612365722656, + "learning_rate": 4.159277755129774e-06, + "loss": 2.7866, + "step": 68800 + }, + { + "epoch": 4.674887892376682, + "grad_norm": 7.772429943084717, + "learning_rate": 4.158853105041446e-06, + "loss": 2.693, + "step": 68805 + }, + { + "epoch": 4.675227612447343, + "grad_norm": 6.980223178863525, + "learning_rate": 4.158428454953119e-06, + "loss": 3.086, + "step": 68810 + }, + { + "epoch": 4.675567332518005, + "grad_norm": 6.520440578460693, + "learning_rate": 4.158003804864792e-06, + "loss": 2.958, + "step": 68815 + }, + { + "epoch": 4.675907052588667, + "grad_norm": 7.192272663116455, + "learning_rate": 4.157579154776464e-06, + "loss": 2.9154, + "step": 68820 + }, + { + "epoch": 4.676246772659328, + "grad_norm": 7.174256324768066, + "learning_rate": 4.157154504688137e-06, + "loss": 2.7789, + "step": 68825 + }, + { + "epoch": 4.6765864927299905, + "grad_norm": 7.223195552825928, + "learning_rate": 4.15672985459981e-06, + "loss": 2.805, + "step": 68830 + }, + { + "epoch": 4.676926212800653, + "grad_norm": 7.6829094886779785, + "learning_rate": 4.156305204511483e-06, + "loss": 2.8468, + "step": 68835 + }, + { + "epoch": 4.677265932871314, + "grad_norm": 8.945550918579102, + "learning_rate": 4.1558805544231555e-06, + "loss": 2.6096, + "step": 68840 + }, + { + "epoch": 4.677605652941976, + "grad_norm": 8.559803009033203, + "learning_rate": 4.155455904334828e-06, + "loss": 2.9233, + "step": 68845 + }, + { + "epoch": 4.677945373012638, + "grad_norm": 6.759169578552246, + "learning_rate": 4.155031254246501e-06, + "loss": 2.7649, + "step": 68850 + }, + { + "epoch": 4.678285093083299, + "grad_norm": 6.345160484313965, + "learning_rate": 4.154606604158174e-06, + "loss": 2.8555, + "step": 68855 + }, + { + "epoch": 4.678624813153961, + "grad_norm": 9.240788459777832, + "learning_rate": 4.154181954069847e-06, + "loss": 2.7869, + "step": 68860 + }, + { + "epoch": 4.678964533224623, + "grad_norm": 8.534950256347656, + "learning_rate": 4.1537573039815195e-06, + "loss": 2.6377, + "step": 68865 + }, + { + "epoch": 4.679304253295284, + "grad_norm": 7.479507923126221, + "learning_rate": 4.153332653893192e-06, + "loss": 2.7385, + "step": 68870 + }, + { + "epoch": 4.6796439733659465, + "grad_norm": 8.407483100891113, + "learning_rate": 4.152908003804865e-06, + "loss": 2.9062, + "step": 68875 + }, + { + "epoch": 4.679983693436609, + "grad_norm": 6.690313339233398, + "learning_rate": 4.152483353716538e-06, + "loss": 2.8088, + "step": 68880 + }, + { + "epoch": 4.68032341350727, + "grad_norm": 6.927433490753174, + "learning_rate": 4.152058703628211e-06, + "loss": 2.7698, + "step": 68885 + }, + { + "epoch": 4.680663133577932, + "grad_norm": 9.298418998718262, + "learning_rate": 4.1516340535398835e-06, + "loss": 3.0304, + "step": 68890 + }, + { + "epoch": 4.681002853648594, + "grad_norm": 8.363165855407715, + "learning_rate": 4.151209403451556e-06, + "loss": 2.828, + "step": 68895 + }, + { + "epoch": 4.681342573719255, + "grad_norm": 9.000171661376953, + "learning_rate": 4.150784753363229e-06, + "loss": 2.7902, + "step": 68900 + }, + { + "epoch": 4.681682293789917, + "grad_norm": 8.445024490356445, + "learning_rate": 4.150360103274902e-06, + "loss": 2.9554, + "step": 68905 + }, + { + "epoch": 4.682022013860579, + "grad_norm": 7.137688636779785, + "learning_rate": 4.149935453186575e-06, + "loss": 2.8271, + "step": 68910 + }, + { + "epoch": 4.68236173393124, + "grad_norm": 8.946784973144531, + "learning_rate": 4.1495108030982475e-06, + "loss": 2.7701, + "step": 68915 + }, + { + "epoch": 4.6827014540019025, + "grad_norm": 7.93794059753418, + "learning_rate": 4.14908615300992e-06, + "loss": 3.0983, + "step": 68920 + }, + { + "epoch": 4.683041174072565, + "grad_norm": 8.204059600830078, + "learning_rate": 4.148661502921593e-06, + "loss": 2.9692, + "step": 68925 + }, + { + "epoch": 4.683380894143226, + "grad_norm": 7.357179641723633, + "learning_rate": 4.148236852833266e-06, + "loss": 2.7441, + "step": 68930 + }, + { + "epoch": 4.683720614213888, + "grad_norm": 6.653669357299805, + "learning_rate": 4.147812202744939e-06, + "loss": 2.622, + "step": 68935 + }, + { + "epoch": 4.684060334284549, + "grad_norm": 6.8183088302612305, + "learning_rate": 4.1473875526566115e-06, + "loss": 3.0331, + "step": 68940 + }, + { + "epoch": 4.684400054355211, + "grad_norm": 7.103939056396484, + "learning_rate": 4.146962902568284e-06, + "loss": 2.6453, + "step": 68945 + }, + { + "epoch": 4.684739774425873, + "grad_norm": 7.540617942810059, + "learning_rate": 4.146538252479957e-06, + "loss": 2.8467, + "step": 68950 + }, + { + "epoch": 4.685079494496534, + "grad_norm": 8.261682510375977, + "learning_rate": 4.14611360239163e-06, + "loss": 2.899, + "step": 68955 + }, + { + "epoch": 4.6854192145671965, + "grad_norm": 7.195068359375, + "learning_rate": 4.145688952303302e-06, + "loss": 2.7692, + "step": 68960 + }, + { + "epoch": 4.6857589346378585, + "grad_norm": 8.410150527954102, + "learning_rate": 4.1452643022149755e-06, + "loss": 2.8835, + "step": 68965 + }, + { + "epoch": 4.68609865470852, + "grad_norm": 7.557873725891113, + "learning_rate": 4.144839652126648e-06, + "loss": 2.7458, + "step": 68970 + }, + { + "epoch": 4.686438374779182, + "grad_norm": 7.845638751983643, + "learning_rate": 4.14441500203832e-06, + "loss": 2.9474, + "step": 68975 + }, + { + "epoch": 4.686778094849844, + "grad_norm": 6.034903526306152, + "learning_rate": 4.143990351949994e-06, + "loss": 2.9942, + "step": 68980 + }, + { + "epoch": 4.687117814920505, + "grad_norm": 6.9527692794799805, + "learning_rate": 4.143565701861667e-06, + "loss": 2.5802, + "step": 68985 + }, + { + "epoch": 4.687457534991167, + "grad_norm": 7.209727764129639, + "learning_rate": 4.143141051773339e-06, + "loss": 2.8568, + "step": 68990 + }, + { + "epoch": 4.687797255061829, + "grad_norm": 8.294698715209961, + "learning_rate": 4.1427164016850115e-06, + "loss": 3.1078, + "step": 68995 + }, + { + "epoch": 4.68813697513249, + "grad_norm": 8.445097923278809, + "learning_rate": 4.142291751596685e-06, + "loss": 2.8135, + "step": 69000 + }, + { + "epoch": 4.6884766952031525, + "grad_norm": 7.823604583740234, + "learning_rate": 4.141867101508357e-06, + "loss": 3.0951, + "step": 69005 + }, + { + "epoch": 4.6888164152738145, + "grad_norm": 7.831103801727295, + "learning_rate": 4.14144245142003e-06, + "loss": 2.938, + "step": 69010 + }, + { + "epoch": 4.689156135344476, + "grad_norm": 6.790821075439453, + "learning_rate": 4.1410178013317035e-06, + "loss": 3.046, + "step": 69015 + }, + { + "epoch": 4.689495855415138, + "grad_norm": 9.98753547668457, + "learning_rate": 4.1405931512433755e-06, + "loss": 2.9461, + "step": 69020 + }, + { + "epoch": 4.6898355754858, + "grad_norm": 9.535670280456543, + "learning_rate": 4.140168501155048e-06, + "loss": 3.0101, + "step": 69025 + }, + { + "epoch": 4.690175295556461, + "grad_norm": 6.314366817474365, + "learning_rate": 4.139743851066721e-06, + "loss": 3.1325, + "step": 69030 + }, + { + "epoch": 4.690515015627123, + "grad_norm": 6.421880722045898, + "learning_rate": 4.139319200978394e-06, + "loss": 2.7973, + "step": 69035 + }, + { + "epoch": 4.690854735697785, + "grad_norm": 6.361968994140625, + "learning_rate": 4.138894550890067e-06, + "loss": 2.6849, + "step": 69040 + }, + { + "epoch": 4.691194455768446, + "grad_norm": 8.045999526977539, + "learning_rate": 4.1384699008017395e-06, + "loss": 2.8309, + "step": 69045 + }, + { + "epoch": 4.6915341758391085, + "grad_norm": 11.704133033752441, + "learning_rate": 4.138045250713412e-06, + "loss": 2.8908, + "step": 69050 + }, + { + "epoch": 4.6918738959097706, + "grad_norm": 7.4056243896484375, + "learning_rate": 4.137620600625085e-06, + "loss": 3.0178, + "step": 69055 + }, + { + "epoch": 4.692213615980432, + "grad_norm": 9.85625171661377, + "learning_rate": 4.137195950536758e-06, + "loss": 2.9186, + "step": 69060 + }, + { + "epoch": 4.692553336051094, + "grad_norm": 6.1944122314453125, + "learning_rate": 4.136771300448431e-06, + "loss": 2.8954, + "step": 69065 + }, + { + "epoch": 4.692893056121756, + "grad_norm": 7.51109504699707, + "learning_rate": 4.1363466503601035e-06, + "loss": 2.9981, + "step": 69070 + }, + { + "epoch": 4.693232776192417, + "grad_norm": 8.61050796508789, + "learning_rate": 4.135922000271776e-06, + "loss": 3.1796, + "step": 69075 + }, + { + "epoch": 4.693572496263079, + "grad_norm": 7.129899024963379, + "learning_rate": 4.135497350183449e-06, + "loss": 2.9602, + "step": 69080 + }, + { + "epoch": 4.693912216333741, + "grad_norm": 6.883184909820557, + "learning_rate": 4.135072700095122e-06, + "loss": 3.0781, + "step": 69085 + }, + { + "epoch": 4.694251936404402, + "grad_norm": 7.791264057159424, + "learning_rate": 4.134648050006795e-06, + "loss": 2.603, + "step": 69090 + }, + { + "epoch": 4.6945916564750645, + "grad_norm": 6.402120113372803, + "learning_rate": 4.1342233999184675e-06, + "loss": 2.9655, + "step": 69095 + }, + { + "epoch": 4.694931376545727, + "grad_norm": 8.238972663879395, + "learning_rate": 4.13379874983014e-06, + "loss": 2.9768, + "step": 69100 + }, + { + "epoch": 4.695271096616388, + "grad_norm": 6.939578056335449, + "learning_rate": 4.133374099741813e-06, + "loss": 3.0036, + "step": 69105 + }, + { + "epoch": 4.69561081668705, + "grad_norm": 8.114757537841797, + "learning_rate": 4.132949449653486e-06, + "loss": 2.9338, + "step": 69110 + }, + { + "epoch": 4.695950536757712, + "grad_norm": 9.072858810424805, + "learning_rate": 4.132524799565159e-06, + "loss": 3.0248, + "step": 69115 + }, + { + "epoch": 4.696290256828373, + "grad_norm": 6.696927070617676, + "learning_rate": 4.1321001494768315e-06, + "loss": 3.0714, + "step": 69120 + }, + { + "epoch": 4.696629976899035, + "grad_norm": 8.064798355102539, + "learning_rate": 4.131675499388504e-06, + "loss": 2.8476, + "step": 69125 + }, + { + "epoch": 4.696969696969697, + "grad_norm": 9.72150707244873, + "learning_rate": 4.131250849300177e-06, + "loss": 2.6206, + "step": 69130 + }, + { + "epoch": 4.697309417040358, + "grad_norm": 6.217075347900391, + "learning_rate": 4.13082619921185e-06, + "loss": 2.9915, + "step": 69135 + }, + { + "epoch": 4.6976491371110205, + "grad_norm": 7.314822673797607, + "learning_rate": 4.130401549123523e-06, + "loss": 2.8238, + "step": 69140 + }, + { + "epoch": 4.697988857181683, + "grad_norm": 5.775234222412109, + "learning_rate": 4.1299768990351955e-06, + "loss": 2.7959, + "step": 69145 + }, + { + "epoch": 4.698328577252344, + "grad_norm": 7.134083271026611, + "learning_rate": 4.129552248946868e-06, + "loss": 3.1196, + "step": 69150 + }, + { + "epoch": 4.698668297323006, + "grad_norm": 9.086981773376465, + "learning_rate": 4.129127598858541e-06, + "loss": 2.9604, + "step": 69155 + }, + { + "epoch": 4.699008017393668, + "grad_norm": 6.228525638580322, + "learning_rate": 4.128702948770214e-06, + "loss": 2.9665, + "step": 69160 + }, + { + "epoch": 4.699347737464329, + "grad_norm": 8.229211807250977, + "learning_rate": 4.128278298681887e-06, + "loss": 2.7066, + "step": 69165 + }, + { + "epoch": 4.699687457534991, + "grad_norm": 7.187283039093018, + "learning_rate": 4.1278536485935595e-06, + "loss": 2.8586, + "step": 69170 + }, + { + "epoch": 4.700027177605653, + "grad_norm": 7.082828521728516, + "learning_rate": 4.1274289985052315e-06, + "loss": 2.7729, + "step": 69175 + }, + { + "epoch": 4.700366897676314, + "grad_norm": 6.4638519287109375, + "learning_rate": 4.127004348416905e-06, + "loss": 2.8423, + "step": 69180 + }, + { + "epoch": 4.7007066177469765, + "grad_norm": 7.1092095375061035, + "learning_rate": 4.126579698328578e-06, + "loss": 2.785, + "step": 69185 + }, + { + "epoch": 4.701046337817639, + "grad_norm": 10.200101852416992, + "learning_rate": 4.12615504824025e-06, + "loss": 2.4368, + "step": 69190 + }, + { + "epoch": 4.7013860578883, + "grad_norm": 6.356154918670654, + "learning_rate": 4.1257303981519235e-06, + "loss": 3.0793, + "step": 69195 + }, + { + "epoch": 4.701725777958962, + "grad_norm": 7.63789701461792, + "learning_rate": 4.125305748063596e-06, + "loss": 3.0071, + "step": 69200 + }, + { + "epoch": 4.702065498029624, + "grad_norm": 7.940438270568848, + "learning_rate": 4.124881097975268e-06, + "loss": 2.9383, + "step": 69205 + }, + { + "epoch": 4.702405218100285, + "grad_norm": 7.048295497894287, + "learning_rate": 4.124456447886941e-06, + "loss": 2.8313, + "step": 69210 + }, + { + "epoch": 4.702744938170947, + "grad_norm": 9.748927116394043, + "learning_rate": 4.124031797798615e-06, + "loss": 2.7895, + "step": 69215 + }, + { + "epoch": 4.703084658241609, + "grad_norm": 10.307607650756836, + "learning_rate": 4.123607147710287e-06, + "loss": 3.022, + "step": 69220 + }, + { + "epoch": 4.7034243783122704, + "grad_norm": 8.433063507080078, + "learning_rate": 4.1231824976219595e-06, + "loss": 3.0519, + "step": 69225 + }, + { + "epoch": 4.7037640983829325, + "grad_norm": 6.320310115814209, + "learning_rate": 4.122757847533633e-06, + "loss": 2.9358, + "step": 69230 + }, + { + "epoch": 4.704103818453595, + "grad_norm": 8.284287452697754, + "learning_rate": 4.122333197445305e-06, + "loss": 2.941, + "step": 69235 + }, + { + "epoch": 4.704443538524256, + "grad_norm": 6.908013820648193, + "learning_rate": 4.121908547356978e-06, + "loss": 2.9478, + "step": 69240 + }, + { + "epoch": 4.704783258594918, + "grad_norm": 8.02249813079834, + "learning_rate": 4.121483897268651e-06, + "loss": 3.0043, + "step": 69245 + }, + { + "epoch": 4.70512297866558, + "grad_norm": 7.862093925476074, + "learning_rate": 4.121059247180324e-06, + "loss": 2.9736, + "step": 69250 + }, + { + "epoch": 4.705462698736241, + "grad_norm": 7.574509143829346, + "learning_rate": 4.120634597091996e-06, + "loss": 2.9176, + "step": 69255 + }, + { + "epoch": 4.705802418806903, + "grad_norm": 8.931574821472168, + "learning_rate": 4.120209947003669e-06, + "loss": 2.9596, + "step": 69260 + }, + { + "epoch": 4.706142138877565, + "grad_norm": 8.167730331420898, + "learning_rate": 4.119785296915343e-06, + "loss": 2.7283, + "step": 69265 + }, + { + "epoch": 4.7064818589482265, + "grad_norm": 6.668659210205078, + "learning_rate": 4.119360646827015e-06, + "loss": 2.9416, + "step": 69270 + }, + { + "epoch": 4.7068215790188885, + "grad_norm": 7.961626052856445, + "learning_rate": 4.1189359967386875e-06, + "loss": 2.7699, + "step": 69275 + }, + { + "epoch": 4.707161299089551, + "grad_norm": 6.763678073883057, + "learning_rate": 4.11851134665036e-06, + "loss": 2.6838, + "step": 69280 + }, + { + "epoch": 4.707501019160212, + "grad_norm": 8.485099792480469, + "learning_rate": 4.118086696562033e-06, + "loss": 2.8112, + "step": 69285 + }, + { + "epoch": 4.707840739230874, + "grad_norm": 8.457523345947266, + "learning_rate": 4.117662046473706e-06, + "loss": 2.6958, + "step": 69290 + }, + { + "epoch": 4.708180459301536, + "grad_norm": 7.879430294036865, + "learning_rate": 4.117237396385379e-06, + "loss": 2.7708, + "step": 69295 + }, + { + "epoch": 4.708520179372197, + "grad_norm": 8.048828125, + "learning_rate": 4.1168127462970515e-06, + "loss": 3.1024, + "step": 69300 + }, + { + "epoch": 4.708859899442859, + "grad_norm": 6.861772537231445, + "learning_rate": 4.116388096208724e-06, + "loss": 2.9977, + "step": 69305 + }, + { + "epoch": 4.709199619513521, + "grad_norm": 7.902546405792236, + "learning_rate": 4.115963446120397e-06, + "loss": 2.7982, + "step": 69310 + }, + { + "epoch": 4.7095393395841825, + "grad_norm": 5.6740946769714355, + "learning_rate": 4.11553879603207e-06, + "loss": 2.8771, + "step": 69315 + }, + { + "epoch": 4.7098790596548445, + "grad_norm": 8.545731544494629, + "learning_rate": 4.115114145943743e-06, + "loss": 2.8823, + "step": 69320 + }, + { + "epoch": 4.710218779725507, + "grad_norm": 6.544488906860352, + "learning_rate": 4.1146894958554155e-06, + "loss": 2.874, + "step": 69325 + }, + { + "epoch": 4.710558499796168, + "grad_norm": 9.20140266418457, + "learning_rate": 4.114264845767088e-06, + "loss": 2.7662, + "step": 69330 + }, + { + "epoch": 4.71089821986683, + "grad_norm": 7.211358547210693, + "learning_rate": 4.113840195678761e-06, + "loss": 2.776, + "step": 69335 + }, + { + "epoch": 4.711237939937492, + "grad_norm": 6.864688873291016, + "learning_rate": 4.113415545590434e-06, + "loss": 3.2623, + "step": 69340 + }, + { + "epoch": 4.711577660008153, + "grad_norm": 9.044844627380371, + "learning_rate": 4.112990895502107e-06, + "loss": 3.0031, + "step": 69345 + }, + { + "epoch": 4.711917380078815, + "grad_norm": 8.096562385559082, + "learning_rate": 4.1125662454137795e-06, + "loss": 2.6841, + "step": 69350 + }, + { + "epoch": 4.712257100149477, + "grad_norm": 8.041571617126465, + "learning_rate": 4.112141595325452e-06, + "loss": 2.6521, + "step": 69355 + }, + { + "epoch": 4.7125968202201385, + "grad_norm": 6.263869285583496, + "learning_rate": 4.111716945237125e-06, + "loss": 2.763, + "step": 69360 + }, + { + "epoch": 4.712936540290801, + "grad_norm": 5.672672748565674, + "learning_rate": 4.111292295148798e-06, + "loss": 2.8415, + "step": 69365 + }, + { + "epoch": 4.713276260361463, + "grad_norm": 8.258085250854492, + "learning_rate": 4.110867645060471e-06, + "loss": 2.8827, + "step": 69370 + }, + { + "epoch": 4.713615980432124, + "grad_norm": 9.196026802062988, + "learning_rate": 4.110442994972143e-06, + "loss": 2.8002, + "step": 69375 + }, + { + "epoch": 4.713955700502786, + "grad_norm": 6.504744529724121, + "learning_rate": 4.110018344883816e-06, + "loss": 2.9035, + "step": 69380 + }, + { + "epoch": 4.714295420573448, + "grad_norm": 6.564459323883057, + "learning_rate": 4.109593694795489e-06, + "loss": 2.7275, + "step": 69385 + }, + { + "epoch": 4.714635140644109, + "grad_norm": 6.757410049438477, + "learning_rate": 4.109169044707161e-06, + "loss": 2.6658, + "step": 69390 + }, + { + "epoch": 4.714974860714771, + "grad_norm": 8.022624015808105, + "learning_rate": 4.108744394618835e-06, + "loss": 3.0055, + "step": 69395 + }, + { + "epoch": 4.715314580785432, + "grad_norm": 7.093825340270996, + "learning_rate": 4.1083197445305075e-06, + "loss": 2.8036, + "step": 69400 + }, + { + "epoch": 4.7156543008560945, + "grad_norm": 9.209218978881836, + "learning_rate": 4.1078950944421794e-06, + "loss": 2.9349, + "step": 69405 + }, + { + "epoch": 4.715994020926757, + "grad_norm": 8.72637939453125, + "learning_rate": 4.107470444353853e-06, + "loss": 2.8533, + "step": 69410 + }, + { + "epoch": 4.716333740997418, + "grad_norm": 7.087860584259033, + "learning_rate": 4.107045794265526e-06, + "loss": 2.6765, + "step": 69415 + }, + { + "epoch": 4.71667346106808, + "grad_norm": 11.315792083740234, + "learning_rate": 4.106621144177199e-06, + "loss": 2.8194, + "step": 69420 + }, + { + "epoch": 4.717013181138742, + "grad_norm": 6.622773170471191, + "learning_rate": 4.106196494088871e-06, + "loss": 2.9761, + "step": 69425 + }, + { + "epoch": 4.717352901209403, + "grad_norm": 6.418554306030273, + "learning_rate": 4.105771844000544e-06, + "loss": 2.8863, + "step": 69430 + }, + { + "epoch": 4.717692621280065, + "grad_norm": 6.644332408905029, + "learning_rate": 4.105347193912217e-06, + "loss": 2.8771, + "step": 69435 + }, + { + "epoch": 4.718032341350727, + "grad_norm": 6.414377689361572, + "learning_rate": 4.104922543823889e-06, + "loss": 2.6023, + "step": 69440 + }, + { + "epoch": 4.718372061421388, + "grad_norm": 8.61982250213623, + "learning_rate": 4.104497893735563e-06, + "loss": 2.9416, + "step": 69445 + }, + { + "epoch": 4.7187117814920505, + "grad_norm": 6.584926128387451, + "learning_rate": 4.1040732436472355e-06, + "loss": 2.9877, + "step": 69450 + }, + { + "epoch": 4.719051501562713, + "grad_norm": 7.923765659332275, + "learning_rate": 4.1036485935589075e-06, + "loss": 2.4744, + "step": 69455 + }, + { + "epoch": 4.719391221633374, + "grad_norm": 9.364460945129395, + "learning_rate": 4.10322394347058e-06, + "loss": 2.8803, + "step": 69460 + }, + { + "epoch": 4.719730941704036, + "grad_norm": 7.847431659698486, + "learning_rate": 4.102799293382254e-06, + "loss": 2.8469, + "step": 69465 + }, + { + "epoch": 4.720070661774698, + "grad_norm": 7.957949638366699, + "learning_rate": 4.102374643293926e-06, + "loss": 2.9201, + "step": 69470 + }, + { + "epoch": 4.720410381845359, + "grad_norm": 7.87952184677124, + "learning_rate": 4.101949993205599e-06, + "loss": 2.8247, + "step": 69475 + }, + { + "epoch": 4.720750101916021, + "grad_norm": 6.470667839050293, + "learning_rate": 4.101525343117272e-06, + "loss": 2.8289, + "step": 69480 + }, + { + "epoch": 4.721089821986683, + "grad_norm": 8.137008666992188, + "learning_rate": 4.101100693028944e-06, + "loss": 2.5324, + "step": 69485 + }, + { + "epoch": 4.721429542057344, + "grad_norm": 6.941066741943359, + "learning_rate": 4.100676042940617e-06, + "loss": 2.7202, + "step": 69490 + }, + { + "epoch": 4.7217692621280065, + "grad_norm": 7.293180465698242, + "learning_rate": 4.10025139285229e-06, + "loss": 2.9628, + "step": 69495 + }, + { + "epoch": 4.722108982198669, + "grad_norm": 7.3054656982421875, + "learning_rate": 4.099826742763963e-06, + "loss": 2.9477, + "step": 69500 + }, + { + "epoch": 4.72244870226933, + "grad_norm": 6.6042890548706055, + "learning_rate": 4.0994020926756355e-06, + "loss": 3.0067, + "step": 69505 + }, + { + "epoch": 4.722788422339992, + "grad_norm": 8.353394508361816, + "learning_rate": 4.098977442587308e-06, + "loss": 3.008, + "step": 69510 + }, + { + "epoch": 4.723128142410654, + "grad_norm": 9.25339412689209, + "learning_rate": 4.098552792498981e-06, + "loss": 2.9253, + "step": 69515 + }, + { + "epoch": 4.723467862481315, + "grad_norm": 7.93428897857666, + "learning_rate": 4.098128142410654e-06, + "loss": 2.8032, + "step": 69520 + }, + { + "epoch": 4.723807582551977, + "grad_norm": 6.69842529296875, + "learning_rate": 4.097703492322327e-06, + "loss": 2.7741, + "step": 69525 + }, + { + "epoch": 4.724147302622639, + "grad_norm": 8.327607154846191, + "learning_rate": 4.0972788422339995e-06, + "loss": 2.8201, + "step": 69530 + }, + { + "epoch": 4.7244870226933005, + "grad_norm": 8.199775695800781, + "learning_rate": 4.096854192145672e-06, + "loss": 2.6741, + "step": 69535 + }, + { + "epoch": 4.7248267427639625, + "grad_norm": 4.995678901672363, + "learning_rate": 4.096429542057345e-06, + "loss": 2.641, + "step": 69540 + }, + { + "epoch": 4.725166462834625, + "grad_norm": 7.8643598556518555, + "learning_rate": 4.096004891969018e-06, + "loss": 2.9235, + "step": 69545 + }, + { + "epoch": 4.725506182905286, + "grad_norm": 6.374395847320557, + "learning_rate": 4.095580241880691e-06, + "loss": 3.02, + "step": 69550 + }, + { + "epoch": 4.725845902975948, + "grad_norm": 6.227447032928467, + "learning_rate": 4.0951555917923635e-06, + "loss": 2.7512, + "step": 69555 + }, + { + "epoch": 4.72618562304661, + "grad_norm": 7.457821369171143, + "learning_rate": 4.094730941704036e-06, + "loss": 2.6749, + "step": 69560 + }, + { + "epoch": 4.726525343117271, + "grad_norm": 9.3117036819458, + "learning_rate": 4.094306291615709e-06, + "loss": 2.9798, + "step": 69565 + }, + { + "epoch": 4.726865063187933, + "grad_norm": 7.91553258895874, + "learning_rate": 4.093881641527382e-06, + "loss": 2.8449, + "step": 69570 + }, + { + "epoch": 4.727204783258595, + "grad_norm": 6.625877857208252, + "learning_rate": 4.093456991439055e-06, + "loss": 2.6642, + "step": 69575 + }, + { + "epoch": 4.7275445033292565, + "grad_norm": 8.505919456481934, + "learning_rate": 4.0930323413507275e-06, + "loss": 2.8994, + "step": 69580 + }, + { + "epoch": 4.7278842233999185, + "grad_norm": 8.877716064453125, + "learning_rate": 4.0926076912624e-06, + "loss": 2.8643, + "step": 69585 + }, + { + "epoch": 4.728223943470581, + "grad_norm": 8.223478317260742, + "learning_rate": 4.092183041174073e-06, + "loss": 2.8738, + "step": 69590 + }, + { + "epoch": 4.728563663541242, + "grad_norm": 9.88388729095459, + "learning_rate": 4.091758391085746e-06, + "loss": 2.8849, + "step": 69595 + }, + { + "epoch": 4.728903383611904, + "grad_norm": 7.614864349365234, + "learning_rate": 4.091333740997419e-06, + "loss": 2.8358, + "step": 69600 + }, + { + "epoch": 4.729243103682566, + "grad_norm": 7.466608047485352, + "learning_rate": 4.0909090909090915e-06, + "loss": 2.7004, + "step": 69605 + }, + { + "epoch": 4.729582823753227, + "grad_norm": 8.286622047424316, + "learning_rate": 4.090484440820764e-06, + "loss": 3.0304, + "step": 69610 + }, + { + "epoch": 4.729922543823889, + "grad_norm": 7.9156036376953125, + "learning_rate": 4.090059790732437e-06, + "loss": 2.7551, + "step": 69615 + }, + { + "epoch": 4.73026226389455, + "grad_norm": 7.179176330566406, + "learning_rate": 4.08963514064411e-06, + "loss": 2.9688, + "step": 69620 + }, + { + "epoch": 4.7306019839652125, + "grad_norm": 6.59487247467041, + "learning_rate": 4.089210490555782e-06, + "loss": 2.8396, + "step": 69625 + }, + { + "epoch": 4.7309417040358746, + "grad_norm": 7.438930988311768, + "learning_rate": 4.0887858404674555e-06, + "loss": 2.9363, + "step": 69630 + }, + { + "epoch": 4.731281424106536, + "grad_norm": 7.676087856292725, + "learning_rate": 4.088361190379128e-06, + "loss": 2.7037, + "step": 69635 + }, + { + "epoch": 4.731621144177198, + "grad_norm": 8.678693771362305, + "learning_rate": 4.0879365402908e-06, + "loss": 2.9252, + "step": 69640 + }, + { + "epoch": 4.73196086424786, + "grad_norm": 7.121419429779053, + "learning_rate": 4.087511890202474e-06, + "loss": 2.7433, + "step": 69645 + }, + { + "epoch": 4.732300584318521, + "grad_norm": 6.677030086517334, + "learning_rate": 4.087087240114147e-06, + "loss": 2.7756, + "step": 69650 + }, + { + "epoch": 4.732640304389183, + "grad_norm": 7.771850109100342, + "learning_rate": 4.086662590025819e-06, + "loss": 2.8299, + "step": 69655 + }, + { + "epoch": 4.732980024459845, + "grad_norm": 7.048447132110596, + "learning_rate": 4.0862379399374914e-06, + "loss": 2.9047, + "step": 69660 + }, + { + "epoch": 4.733319744530506, + "grad_norm": 7.462263107299805, + "learning_rate": 4.085813289849165e-06, + "loss": 2.4944, + "step": 69665 + }, + { + "epoch": 4.7336594646011685, + "grad_norm": 6.392679214477539, + "learning_rate": 4.085388639760837e-06, + "loss": 2.9027, + "step": 69670 + }, + { + "epoch": 4.733999184671831, + "grad_norm": 7.718158721923828, + "learning_rate": 4.08496398967251e-06, + "loss": 2.9907, + "step": 69675 + }, + { + "epoch": 4.734338904742492, + "grad_norm": 6.464780330657959, + "learning_rate": 4.0845393395841835e-06, + "loss": 2.82, + "step": 69680 + }, + { + "epoch": 4.734678624813154, + "grad_norm": 6.077358722686768, + "learning_rate": 4.0841146894958554e-06, + "loss": 2.7961, + "step": 69685 + }, + { + "epoch": 4.735018344883816, + "grad_norm": 7.8396172523498535, + "learning_rate": 4.083690039407528e-06, + "loss": 3.1239, + "step": 69690 + }, + { + "epoch": 4.735358064954477, + "grad_norm": 8.165177345275879, + "learning_rate": 4.083265389319202e-06, + "loss": 2.8515, + "step": 69695 + }, + { + "epoch": 4.735697785025139, + "grad_norm": 7.017200469970703, + "learning_rate": 4.082840739230874e-06, + "loss": 2.6392, + "step": 69700 + }, + { + "epoch": 4.736037505095801, + "grad_norm": 7.403879642486572, + "learning_rate": 4.082416089142547e-06, + "loss": 2.8503, + "step": 69705 + }, + { + "epoch": 4.736377225166462, + "grad_norm": 9.207771301269531, + "learning_rate": 4.0819914390542194e-06, + "loss": 2.9247, + "step": 69710 + }, + { + "epoch": 4.7367169452371245, + "grad_norm": 6.254444122314453, + "learning_rate": 4.081566788965892e-06, + "loss": 2.9108, + "step": 69715 + }, + { + "epoch": 4.737056665307787, + "grad_norm": 9.25250244140625, + "learning_rate": 4.081142138877565e-06, + "loss": 2.4429, + "step": 69720 + }, + { + "epoch": 4.737396385378448, + "grad_norm": 6.814023494720459, + "learning_rate": 4.080717488789238e-06, + "loss": 3.2194, + "step": 69725 + }, + { + "epoch": 4.73773610544911, + "grad_norm": 7.512392520904541, + "learning_rate": 4.080292838700911e-06, + "loss": 2.8552, + "step": 69730 + }, + { + "epoch": 4.738075825519772, + "grad_norm": 7.54428243637085, + "learning_rate": 4.0798681886125834e-06, + "loss": 2.771, + "step": 69735 + }, + { + "epoch": 4.738415545590433, + "grad_norm": 6.85346794128418, + "learning_rate": 4.079443538524256e-06, + "loss": 2.9791, + "step": 69740 + }, + { + "epoch": 4.738755265661095, + "grad_norm": 9.528291702270508, + "learning_rate": 4.079018888435929e-06, + "loss": 3.0054, + "step": 69745 + }, + { + "epoch": 4.739094985731757, + "grad_norm": 6.96536111831665, + "learning_rate": 4.078594238347602e-06, + "loss": 2.8754, + "step": 69750 + }, + { + "epoch": 4.739434705802418, + "grad_norm": 7.434269428253174, + "learning_rate": 4.078169588259275e-06, + "loss": 3.0467, + "step": 69755 + }, + { + "epoch": 4.7397744258730805, + "grad_norm": 10.061299324035645, + "learning_rate": 4.0777449381709475e-06, + "loss": 2.6747, + "step": 69760 + }, + { + "epoch": 4.740114145943743, + "grad_norm": 6.6959733963012695, + "learning_rate": 4.07732028808262e-06, + "loss": 2.873, + "step": 69765 + }, + { + "epoch": 4.740453866014404, + "grad_norm": 7.245044231414795, + "learning_rate": 4.076895637994293e-06, + "loss": 2.8015, + "step": 69770 + }, + { + "epoch": 4.740793586085066, + "grad_norm": 7.9449896812438965, + "learning_rate": 4.076470987905966e-06, + "loss": 2.7966, + "step": 69775 + }, + { + "epoch": 4.741133306155728, + "grad_norm": 10.314709663391113, + "learning_rate": 4.076046337817639e-06, + "loss": 2.796, + "step": 69780 + }, + { + "epoch": 4.741473026226389, + "grad_norm": 8.07064437866211, + "learning_rate": 4.0756216877293115e-06, + "loss": 2.8197, + "step": 69785 + }, + { + "epoch": 4.741812746297051, + "grad_norm": 9.242300033569336, + "learning_rate": 4.075197037640984e-06, + "loss": 2.8282, + "step": 69790 + }, + { + "epoch": 4.742152466367713, + "grad_norm": 7.187984466552734, + "learning_rate": 4.074772387552657e-06, + "loss": 3.1579, + "step": 69795 + }, + { + "epoch": 4.7424921864383744, + "grad_norm": 9.143167495727539, + "learning_rate": 4.07434773746433e-06, + "loss": 2.742, + "step": 69800 + }, + { + "epoch": 4.7428319065090365, + "grad_norm": 7.113763332366943, + "learning_rate": 4.073923087376003e-06, + "loss": 2.7637, + "step": 69805 + }, + { + "epoch": 4.743171626579699, + "grad_norm": 7.930026054382324, + "learning_rate": 4.0734984372876755e-06, + "loss": 2.9223, + "step": 69810 + }, + { + "epoch": 4.74351134665036, + "grad_norm": 6.0419697761535645, + "learning_rate": 4.073073787199348e-06, + "loss": 2.3905, + "step": 69815 + }, + { + "epoch": 4.743851066721022, + "grad_norm": 7.068907737731934, + "learning_rate": 4.072649137111021e-06, + "loss": 2.8213, + "step": 69820 + }, + { + "epoch": 4.744190786791684, + "grad_norm": 6.630192279815674, + "learning_rate": 4.072224487022694e-06, + "loss": 2.7909, + "step": 69825 + }, + { + "epoch": 4.744530506862345, + "grad_norm": 7.221631050109863, + "learning_rate": 4.071799836934367e-06, + "loss": 2.7875, + "step": 69830 + }, + { + "epoch": 4.744870226933007, + "grad_norm": 7.821907997131348, + "learning_rate": 4.0713751868460395e-06, + "loss": 2.9667, + "step": 69835 + }, + { + "epoch": 4.745209947003669, + "grad_norm": 8.18173599243164, + "learning_rate": 4.070950536757711e-06, + "loss": 2.7004, + "step": 69840 + }, + { + "epoch": 4.7455496670743305, + "grad_norm": 8.702386856079102, + "learning_rate": 4.070525886669385e-06, + "loss": 2.9274, + "step": 69845 + }, + { + "epoch": 4.7458893871449925, + "grad_norm": 7.550404071807861, + "learning_rate": 4.070101236581058e-06, + "loss": 2.9146, + "step": 69850 + }, + { + "epoch": 4.746229107215655, + "grad_norm": 7.678280353546143, + "learning_rate": 4.06967658649273e-06, + "loss": 3.0465, + "step": 69855 + }, + { + "epoch": 4.746568827286316, + "grad_norm": 8.740262985229492, + "learning_rate": 4.0692519364044035e-06, + "loss": 2.6608, + "step": 69860 + }, + { + "epoch": 4.746908547356978, + "grad_norm": 6.722103118896484, + "learning_rate": 4.068827286316076e-06, + "loss": 3.048, + "step": 69865 + }, + { + "epoch": 4.74724826742764, + "grad_norm": 8.816770553588867, + "learning_rate": 4.068402636227748e-06, + "loss": 2.7349, + "step": 69870 + }, + { + "epoch": 4.747587987498301, + "grad_norm": 7.442765235900879, + "learning_rate": 4.067977986139421e-06, + "loss": 2.7544, + "step": 69875 + }, + { + "epoch": 4.747927707568963, + "grad_norm": 7.762429714202881, + "learning_rate": 4.067553336051095e-06, + "loss": 2.925, + "step": 69880 + }, + { + "epoch": 4.748267427639625, + "grad_norm": 7.201550483703613, + "learning_rate": 4.067128685962767e-06, + "loss": 3.0152, + "step": 69885 + }, + { + "epoch": 4.7486071477102865, + "grad_norm": 7.241006851196289, + "learning_rate": 4.0667040358744394e-06, + "loss": 3.0628, + "step": 69890 + }, + { + "epoch": 4.7489468677809485, + "grad_norm": 9.165730476379395, + "learning_rate": 4.066279385786113e-06, + "loss": 2.7394, + "step": 69895 + }, + { + "epoch": 4.749286587851611, + "grad_norm": 7.265216827392578, + "learning_rate": 4.065854735697785e-06, + "loss": 2.7776, + "step": 69900 + }, + { + "epoch": 4.749626307922272, + "grad_norm": 7.490589618682861, + "learning_rate": 4.065430085609458e-06, + "loss": 2.7439, + "step": 69905 + }, + { + "epoch": 4.749966027992934, + "grad_norm": 6.240069389343262, + "learning_rate": 4.065005435521131e-06, + "loss": 2.9749, + "step": 69910 + }, + { + "epoch": 4.750305748063596, + "grad_norm": 6.532631874084473, + "learning_rate": 4.0645807854328034e-06, + "loss": 3.047, + "step": 69915 + }, + { + "epoch": 4.750645468134257, + "grad_norm": 8.04442024230957, + "learning_rate": 4.064156135344476e-06, + "loss": 2.8984, + "step": 69920 + }, + { + "epoch": 4.750985188204919, + "grad_norm": 8.47575569152832, + "learning_rate": 4.063731485256149e-06, + "loss": 2.7996, + "step": 69925 + }, + { + "epoch": 4.751324908275581, + "grad_norm": 6.509352207183838, + "learning_rate": 4.063306835167823e-06, + "loss": 2.6316, + "step": 69930 + }, + { + "epoch": 4.7516646283462425, + "grad_norm": 10.699859619140625, + "learning_rate": 4.062882185079495e-06, + "loss": 2.6398, + "step": 69935 + }, + { + "epoch": 4.7520043484169046, + "grad_norm": 6.863437652587891, + "learning_rate": 4.0624575349911674e-06, + "loss": 2.917, + "step": 69940 + }, + { + "epoch": 4.752344068487567, + "grad_norm": 8.510180473327637, + "learning_rate": 4.06203288490284e-06, + "loss": 3.2339, + "step": 69945 + }, + { + "epoch": 4.752683788558228, + "grad_norm": 7.542960166931152, + "learning_rate": 4.061608234814513e-06, + "loss": 2.8253, + "step": 69950 + }, + { + "epoch": 4.75302350862889, + "grad_norm": 6.880081653594971, + "learning_rate": 4.061183584726186e-06, + "loss": 2.783, + "step": 69955 + }, + { + "epoch": 4.753363228699552, + "grad_norm": 9.071404457092285, + "learning_rate": 4.060758934637859e-06, + "loss": 3.0098, + "step": 69960 + }, + { + "epoch": 4.753702948770213, + "grad_norm": 7.203479766845703, + "learning_rate": 4.0603342845495314e-06, + "loss": 3.0389, + "step": 69965 + }, + { + "epoch": 4.754042668840875, + "grad_norm": 8.138117790222168, + "learning_rate": 4.059909634461204e-06, + "loss": 2.8869, + "step": 69970 + }, + { + "epoch": 4.754382388911537, + "grad_norm": 8.796070098876953, + "learning_rate": 4.059484984372877e-06, + "loss": 2.5879, + "step": 69975 + }, + { + "epoch": 4.7547221089821985, + "grad_norm": 8.770644187927246, + "learning_rate": 4.05906033428455e-06, + "loss": 2.8919, + "step": 69980 + }, + { + "epoch": 4.755061829052861, + "grad_norm": 8.912216186523438, + "learning_rate": 4.058720614213888e-06, + "loss": 2.8224, + "step": 69985 + }, + { + "epoch": 4.755401549123523, + "grad_norm": 6.737054824829102, + "learning_rate": 4.058295964125561e-06, + "loss": 2.7865, + "step": 69990 + }, + { + "epoch": 4.755741269194184, + "grad_norm": 10.318191528320312, + "learning_rate": 4.0578713140372335e-06, + "loss": 2.7786, + "step": 69995 + }, + { + "epoch": 4.756080989264846, + "grad_norm": 8.726061820983887, + "learning_rate": 4.057446663948906e-06, + "loss": 2.8273, + "step": 70000 + }, + { + "epoch": 4.756420709335508, + "grad_norm": 9.120784759521484, + "learning_rate": 4.057022013860579e-06, + "loss": 3.2172, + "step": 70005 + }, + { + "epoch": 4.756760429406169, + "grad_norm": 6.591955184936523, + "learning_rate": 4.056597363772252e-06, + "loss": 2.9007, + "step": 70010 + }, + { + "epoch": 4.757100149476831, + "grad_norm": 6.879819869995117, + "learning_rate": 4.056172713683925e-06, + "loss": 2.8938, + "step": 70015 + }, + { + "epoch": 4.757439869547493, + "grad_norm": 7.0894246101379395, + "learning_rate": 4.0557480635955975e-06, + "loss": 2.905, + "step": 70020 + }, + { + "epoch": 4.7577795896181545, + "grad_norm": 9.03333854675293, + "learning_rate": 4.05532341350727e-06, + "loss": 2.7893, + "step": 70025 + }, + { + "epoch": 4.758119309688817, + "grad_norm": 8.050281524658203, + "learning_rate": 4.054898763418943e-06, + "loss": 2.8942, + "step": 70030 + }, + { + "epoch": 4.758459029759479, + "grad_norm": 7.44018030166626, + "learning_rate": 4.054474113330616e-06, + "loss": 3.0789, + "step": 70035 + }, + { + "epoch": 4.75879874983014, + "grad_norm": 7.351480484008789, + "learning_rate": 4.054049463242289e-06, + "loss": 2.9442, + "step": 70040 + }, + { + "epoch": 4.759138469900802, + "grad_norm": 6.618431568145752, + "learning_rate": 4.0536248131539615e-06, + "loss": 3.0234, + "step": 70045 + }, + { + "epoch": 4.759478189971464, + "grad_norm": 5.820522785186768, + "learning_rate": 4.053200163065634e-06, + "loss": 2.4221, + "step": 70050 + }, + { + "epoch": 4.759817910042125, + "grad_norm": 6.474215030670166, + "learning_rate": 4.052775512977307e-06, + "loss": 3.0073, + "step": 70055 + }, + { + "epoch": 4.760157630112787, + "grad_norm": 5.941565990447998, + "learning_rate": 4.05235086288898e-06, + "loss": 2.6681, + "step": 70060 + }, + { + "epoch": 4.760497350183449, + "grad_norm": 8.265279769897461, + "learning_rate": 4.051926212800653e-06, + "loss": 2.7751, + "step": 70065 + }, + { + "epoch": 4.7608370702541105, + "grad_norm": 8.225489616394043, + "learning_rate": 4.0515015627123255e-06, + "loss": 3.0454, + "step": 70070 + }, + { + "epoch": 4.761176790324773, + "grad_norm": 9.539755821228027, + "learning_rate": 4.051076912623998e-06, + "loss": 2.8705, + "step": 70075 + }, + { + "epoch": 4.761516510395434, + "grad_norm": 6.96189546585083, + "learning_rate": 4.050652262535671e-06, + "loss": 3.0032, + "step": 70080 + }, + { + "epoch": 4.761856230466096, + "grad_norm": 7.710817813873291, + "learning_rate": 4.050227612447344e-06, + "loss": 2.7667, + "step": 70085 + }, + { + "epoch": 4.762195950536758, + "grad_norm": 6.076213359832764, + "learning_rate": 4.049802962359016e-06, + "loss": 2.5495, + "step": 70090 + }, + { + "epoch": 4.762535670607419, + "grad_norm": 9.112956047058105, + "learning_rate": 4.0493783122706895e-06, + "loss": 2.7629, + "step": 70095 + }, + { + "epoch": 4.762875390678081, + "grad_norm": 6.930112838745117, + "learning_rate": 4.048953662182362e-06, + "loss": 3.0626, + "step": 70100 + }, + { + "epoch": 4.763215110748743, + "grad_norm": 6.3004302978515625, + "learning_rate": 4.048529012094034e-06, + "loss": 2.8473, + "step": 70105 + }, + { + "epoch": 4.7635548308194045, + "grad_norm": 7.02985143661499, + "learning_rate": 4.048104362005708e-06, + "loss": 2.5125, + "step": 70110 + }, + { + "epoch": 4.7638945508900665, + "grad_norm": 8.138883590698242, + "learning_rate": 4.047679711917381e-06, + "loss": 2.7414, + "step": 70115 + }, + { + "epoch": 4.764234270960729, + "grad_norm": 7.9835734367370605, + "learning_rate": 4.047255061829053e-06, + "loss": 2.8544, + "step": 70120 + }, + { + "epoch": 4.76457399103139, + "grad_norm": 6.044841289520264, + "learning_rate": 4.0468304117407255e-06, + "loss": 2.8269, + "step": 70125 + }, + { + "epoch": 4.764913711102052, + "grad_norm": 7.841904163360596, + "learning_rate": 4.046405761652399e-06, + "loss": 2.7087, + "step": 70130 + }, + { + "epoch": 4.765253431172714, + "grad_norm": 6.95511531829834, + "learning_rate": 4.045981111564072e-06, + "loss": 3.019, + "step": 70135 + }, + { + "epoch": 4.765593151243375, + "grad_norm": 8.295077323913574, + "learning_rate": 4.045556461475744e-06, + "loss": 2.9238, + "step": 70140 + }, + { + "epoch": 4.765932871314037, + "grad_norm": 6.951941967010498, + "learning_rate": 4.0451318113874175e-06, + "loss": 2.8271, + "step": 70145 + }, + { + "epoch": 4.766272591384699, + "grad_norm": 7.476322650909424, + "learning_rate": 4.04470716129909e-06, + "loss": 2.6726, + "step": 70150 + }, + { + "epoch": 4.7666123114553605, + "grad_norm": 6.605938911437988, + "learning_rate": 4.044282511210762e-06, + "loss": 2.6679, + "step": 70155 + }, + { + "epoch": 4.7669520315260225, + "grad_norm": 8.600687026977539, + "learning_rate": 4.043857861122435e-06, + "loss": 2.9547, + "step": 70160 + }, + { + "epoch": 4.767291751596685, + "grad_norm": 6.77747106552124, + "learning_rate": 4.043433211034109e-06, + "loss": 2.959, + "step": 70165 + }, + { + "epoch": 4.767631471667346, + "grad_norm": 8.753137588500977, + "learning_rate": 4.043008560945781e-06, + "loss": 2.9759, + "step": 70170 + }, + { + "epoch": 4.767971191738008, + "grad_norm": 6.490703105926514, + "learning_rate": 4.0425839108574535e-06, + "loss": 2.8022, + "step": 70175 + }, + { + "epoch": 4.76831091180867, + "grad_norm": 8.460712432861328, + "learning_rate": 4.042159260769127e-06, + "loss": 3.1289, + "step": 70180 + }, + { + "epoch": 4.768650631879331, + "grad_norm": 6.8829827308654785, + "learning_rate": 4.041734610680799e-06, + "loss": 2.9031, + "step": 70185 + }, + { + "epoch": 4.768990351949993, + "grad_norm": 7.937158107757568, + "learning_rate": 4.041309960592472e-06, + "loss": 3.0208, + "step": 70190 + }, + { + "epoch": 4.769330072020655, + "grad_norm": 7.117422103881836, + "learning_rate": 4.0408853105041455e-06, + "loss": 2.7727, + "step": 70195 + }, + { + "epoch": 4.7696697920913165, + "grad_norm": 6.955021858215332, + "learning_rate": 4.0404606604158175e-06, + "loss": 2.7652, + "step": 70200 + }, + { + "epoch": 4.7700095121619785, + "grad_norm": 6.273568153381348, + "learning_rate": 4.04003601032749e-06, + "loss": 2.8883, + "step": 70205 + }, + { + "epoch": 4.770349232232641, + "grad_norm": 5.514206886291504, + "learning_rate": 4.039611360239163e-06, + "loss": 2.8141, + "step": 70210 + }, + { + "epoch": 4.770688952303302, + "grad_norm": 7.302504062652588, + "learning_rate": 4.039186710150836e-06, + "loss": 2.8878, + "step": 70215 + }, + { + "epoch": 4.771028672373964, + "grad_norm": 7.25615930557251, + "learning_rate": 4.038762060062509e-06, + "loss": 2.8699, + "step": 70220 + }, + { + "epoch": 4.771368392444626, + "grad_norm": 7.763983726501465, + "learning_rate": 4.0383374099741815e-06, + "loss": 2.8425, + "step": 70225 + }, + { + "epoch": 4.771708112515287, + "grad_norm": 6.477502346038818, + "learning_rate": 4.037912759885854e-06, + "loss": 2.9054, + "step": 70230 + }, + { + "epoch": 4.772047832585949, + "grad_norm": 7.501580238342285, + "learning_rate": 4.037488109797527e-06, + "loss": 2.7554, + "step": 70235 + }, + { + "epoch": 4.772387552656611, + "grad_norm": 7.421238422393799, + "learning_rate": 4.0370634597092e-06, + "loss": 2.687, + "step": 70240 + }, + { + "epoch": 4.7727272727272725, + "grad_norm": 6.311860084533691, + "learning_rate": 4.036638809620873e-06, + "loss": 2.8582, + "step": 70245 + }, + { + "epoch": 4.773066992797935, + "grad_norm": 5.924078464508057, + "learning_rate": 4.0362141595325455e-06, + "loss": 2.8708, + "step": 70250 + }, + { + "epoch": 4.773406712868597, + "grad_norm": 5.993656635284424, + "learning_rate": 4.035789509444218e-06, + "loss": 2.84, + "step": 70255 + }, + { + "epoch": 4.773746432939258, + "grad_norm": 7.946517467498779, + "learning_rate": 4.035364859355891e-06, + "loss": 2.9264, + "step": 70260 + }, + { + "epoch": 4.77408615300992, + "grad_norm": 9.478656768798828, + "learning_rate": 4.034940209267564e-06, + "loss": 2.829, + "step": 70265 + }, + { + "epoch": 4.774425873080582, + "grad_norm": 8.156126976013184, + "learning_rate": 4.034515559179237e-06, + "loss": 2.8486, + "step": 70270 + }, + { + "epoch": 4.774765593151243, + "grad_norm": 6.726442337036133, + "learning_rate": 4.0340909090909095e-06, + "loss": 3.0209, + "step": 70275 + }, + { + "epoch": 4.775105313221905, + "grad_norm": 6.774346351623535, + "learning_rate": 4.033666259002582e-06, + "loss": 2.6491, + "step": 70280 + }, + { + "epoch": 4.775445033292567, + "grad_norm": 6.804166316986084, + "learning_rate": 4.033241608914255e-06, + "loss": 3.0998, + "step": 70285 + }, + { + "epoch": 4.7757847533632285, + "grad_norm": 6.960669994354248, + "learning_rate": 4.032816958825928e-06, + "loss": 2.9092, + "step": 70290 + }, + { + "epoch": 4.776124473433891, + "grad_norm": 8.355180740356445, + "learning_rate": 4.032392308737601e-06, + "loss": 2.996, + "step": 70295 + }, + { + "epoch": 4.776464193504552, + "grad_norm": 8.3262939453125, + "learning_rate": 4.0319676586492735e-06, + "loss": 3.0184, + "step": 70300 + }, + { + "epoch": 4.776803913575214, + "grad_norm": 9.31286907196045, + "learning_rate": 4.031543008560946e-06, + "loss": 3.1821, + "step": 70305 + }, + { + "epoch": 4.777143633645876, + "grad_norm": 8.21440315246582, + "learning_rate": 4.031118358472619e-06, + "loss": 2.8362, + "step": 70310 + }, + { + "epoch": 4.777483353716537, + "grad_norm": 7.779482841491699, + "learning_rate": 4.030693708384292e-06, + "loss": 2.883, + "step": 70315 + }, + { + "epoch": 4.777823073787199, + "grad_norm": 7.8373894691467285, + "learning_rate": 4.030269058295965e-06, + "loss": 2.9533, + "step": 70320 + }, + { + "epoch": 4.778162793857861, + "grad_norm": 6.798405170440674, + "learning_rate": 4.0298444082076375e-06, + "loss": 2.8477, + "step": 70325 + }, + { + "epoch": 4.778502513928522, + "grad_norm": 8.006448745727539, + "learning_rate": 4.02941975811931e-06, + "loss": 2.8168, + "step": 70330 + }, + { + "epoch": 4.7788422339991845, + "grad_norm": 7.869976997375488, + "learning_rate": 4.028995108030983e-06, + "loss": 2.9824, + "step": 70335 + }, + { + "epoch": 4.779181954069847, + "grad_norm": 9.337663650512695, + "learning_rate": 4.028570457942655e-06, + "loss": 2.8111, + "step": 70340 + }, + { + "epoch": 4.779521674140508, + "grad_norm": 8.038771629333496, + "learning_rate": 4.028145807854329e-06, + "loss": 2.8932, + "step": 70345 + }, + { + "epoch": 4.77986139421117, + "grad_norm": 6.904372692108154, + "learning_rate": 4.0277211577660015e-06, + "loss": 2.6683, + "step": 70350 + }, + { + "epoch": 4.780201114281832, + "grad_norm": 6.9807915687561035, + "learning_rate": 4.0272965076776735e-06, + "loss": 3.0756, + "step": 70355 + }, + { + "epoch": 4.780540834352493, + "grad_norm": 5.821282863616943, + "learning_rate": 4.026871857589347e-06, + "loss": 2.9331, + "step": 70360 + }, + { + "epoch": 4.780880554423155, + "grad_norm": 8.47693157196045, + "learning_rate": 4.02644720750102e-06, + "loss": 2.6626, + "step": 70365 + }, + { + "epoch": 4.781220274493817, + "grad_norm": 6.244318962097168, + "learning_rate": 4.026022557412692e-06, + "loss": 3.0357, + "step": 70370 + }, + { + "epoch": 4.7815599945644784, + "grad_norm": 7.607468605041504, + "learning_rate": 4.025597907324365e-06, + "loss": 2.9676, + "step": 70375 + }, + { + "epoch": 4.7818997146351405, + "grad_norm": 7.979380130767822, + "learning_rate": 4.025173257236038e-06, + "loss": 2.7257, + "step": 70380 + }, + { + "epoch": 4.782239434705803, + "grad_norm": 6.920733451843262, + "learning_rate": 4.02474860714771e-06, + "loss": 2.9753, + "step": 70385 + }, + { + "epoch": 4.782579154776464, + "grad_norm": 7.661959648132324, + "learning_rate": 4.024323957059383e-06, + "loss": 2.8776, + "step": 70390 + }, + { + "epoch": 4.782918874847126, + "grad_norm": 7.511734485626221, + "learning_rate": 4.023899306971057e-06, + "loss": 2.7665, + "step": 70395 + }, + { + "epoch": 4.783258594917788, + "grad_norm": 7.868188858032227, + "learning_rate": 4.023474656882729e-06, + "loss": 2.6695, + "step": 70400 + }, + { + "epoch": 4.783598314988449, + "grad_norm": 8.355965614318848, + "learning_rate": 4.0230500067944015e-06, + "loss": 3.1006, + "step": 70405 + }, + { + "epoch": 4.783938035059111, + "grad_norm": 5.882506847381592, + "learning_rate": 4.022625356706074e-06, + "loss": 3.0015, + "step": 70410 + }, + { + "epoch": 4.784277755129773, + "grad_norm": 7.293107032775879, + "learning_rate": 4.022200706617747e-06, + "loss": 2.9866, + "step": 70415 + }, + { + "epoch": 4.7846174752004345, + "grad_norm": 7.858738422393799, + "learning_rate": 4.02177605652942e-06, + "loss": 2.7823, + "step": 70420 + }, + { + "epoch": 4.7849571952710965, + "grad_norm": 8.050634384155273, + "learning_rate": 4.021351406441093e-06, + "loss": 2.6829, + "step": 70425 + }, + { + "epoch": 4.785296915341759, + "grad_norm": 7.404603958129883, + "learning_rate": 4.0209267563527655e-06, + "loss": 2.9911, + "step": 70430 + }, + { + "epoch": 4.78563663541242, + "grad_norm": 6.675609588623047, + "learning_rate": 4.020502106264438e-06, + "loss": 2.8562, + "step": 70435 + }, + { + "epoch": 4.785976355483082, + "grad_norm": 7.711898326873779, + "learning_rate": 4.020077456176111e-06, + "loss": 2.7008, + "step": 70440 + }, + { + "epoch": 4.786316075553744, + "grad_norm": 8.981241226196289, + "learning_rate": 4.019652806087784e-06, + "loss": 2.8733, + "step": 70445 + }, + { + "epoch": 4.786655795624405, + "grad_norm": 7.729081630706787, + "learning_rate": 4.019228155999457e-06, + "loss": 2.8734, + "step": 70450 + }, + { + "epoch": 4.786995515695067, + "grad_norm": 6.918320655822754, + "learning_rate": 4.0188035059111295e-06, + "loss": 2.5746, + "step": 70455 + }, + { + "epoch": 4.787335235765729, + "grad_norm": 7.26939058303833, + "learning_rate": 4.018378855822802e-06, + "loss": 2.9289, + "step": 70460 + }, + { + "epoch": 4.7876749558363905, + "grad_norm": 8.30086612701416, + "learning_rate": 4.017954205734475e-06, + "loss": 2.8198, + "step": 70465 + }, + { + "epoch": 4.7880146759070525, + "grad_norm": 7.7238945960998535, + "learning_rate": 4.017529555646148e-06, + "loss": 2.8681, + "step": 70470 + }, + { + "epoch": 4.788354395977715, + "grad_norm": 8.733859062194824, + "learning_rate": 4.017104905557821e-06, + "loss": 2.9183, + "step": 70475 + }, + { + "epoch": 4.788694116048376, + "grad_norm": 6.8572187423706055, + "learning_rate": 4.0166802554694935e-06, + "loss": 2.7014, + "step": 70480 + }, + { + "epoch": 4.789033836119038, + "grad_norm": 7.580574035644531, + "learning_rate": 4.016255605381166e-06, + "loss": 2.8454, + "step": 70485 + }, + { + "epoch": 4.7893735561897, + "grad_norm": 6.340493679046631, + "learning_rate": 4.015830955292839e-06, + "loss": 2.6688, + "step": 70490 + }, + { + "epoch": 4.789713276260361, + "grad_norm": 7.554346084594727, + "learning_rate": 4.015406305204512e-06, + "loss": 2.7098, + "step": 70495 + }, + { + "epoch": 4.790052996331023, + "grad_norm": 8.961535453796387, + "learning_rate": 4.014981655116185e-06, + "loss": 3.0953, + "step": 70500 + }, + { + "epoch": 4.790392716401685, + "grad_norm": 6.702139377593994, + "learning_rate": 4.0145570050278575e-06, + "loss": 2.6081, + "step": 70505 + }, + { + "epoch": 4.7907324364723465, + "grad_norm": 6.212055683135986, + "learning_rate": 4.01413235493953e-06, + "loss": 2.9023, + "step": 70510 + }, + { + "epoch": 4.7910721565430086, + "grad_norm": 7.61061954498291, + "learning_rate": 4.013707704851203e-06, + "loss": 2.793, + "step": 70515 + }, + { + "epoch": 4.791411876613671, + "grad_norm": 6.5078816413879395, + "learning_rate": 4.013283054762876e-06, + "loss": 3.1908, + "step": 70520 + }, + { + "epoch": 4.791751596684332, + "grad_norm": 7.458581924438477, + "learning_rate": 4.012858404674549e-06, + "loss": 2.8608, + "step": 70525 + }, + { + "epoch": 4.792091316754994, + "grad_norm": 9.35076904296875, + "learning_rate": 4.0124337545862215e-06, + "loss": 2.6693, + "step": 70530 + }, + { + "epoch": 4.792431036825656, + "grad_norm": 8.27328109741211, + "learning_rate": 4.012009104497894e-06, + "loss": 3.0163, + "step": 70535 + }, + { + "epoch": 4.792770756896317, + "grad_norm": 6.411585330963135, + "learning_rate": 4.011584454409566e-06, + "loss": 2.677, + "step": 70540 + }, + { + "epoch": 4.793110476966979, + "grad_norm": 8.414441108703613, + "learning_rate": 4.01115980432124e-06, + "loss": 2.7444, + "step": 70545 + }, + { + "epoch": 4.793450197037641, + "grad_norm": 9.84262466430664, + "learning_rate": 4.010735154232913e-06, + "loss": 2.9849, + "step": 70550 + }, + { + "epoch": 4.7937899171083025, + "grad_norm": 7.6511030197143555, + "learning_rate": 4.010310504144585e-06, + "loss": 3.1175, + "step": 70555 + }, + { + "epoch": 4.794129637178965, + "grad_norm": 7.529627799987793, + "learning_rate": 4.009885854056258e-06, + "loss": 3.085, + "step": 70560 + }, + { + "epoch": 4.794469357249627, + "grad_norm": 8.839617729187012, + "learning_rate": 4.009461203967931e-06, + "loss": 2.9359, + "step": 70565 + }, + { + "epoch": 4.794809077320288, + "grad_norm": 9.153458595275879, + "learning_rate": 4.009036553879603e-06, + "loss": 3.0388, + "step": 70570 + }, + { + "epoch": 4.79514879739095, + "grad_norm": 7.762622356414795, + "learning_rate": 4.008611903791277e-06, + "loss": 2.6988, + "step": 70575 + }, + { + "epoch": 4.795488517461612, + "grad_norm": 8.526358604431152, + "learning_rate": 4.0081872537029495e-06, + "loss": 2.8506, + "step": 70580 + }, + { + "epoch": 4.795828237532273, + "grad_norm": 6.1265668869018555, + "learning_rate": 4.0077626036146215e-06, + "loss": 2.791, + "step": 70585 + }, + { + "epoch": 4.796167957602935, + "grad_norm": 7.590435028076172, + "learning_rate": 4.007337953526294e-06, + "loss": 2.9273, + "step": 70590 + }, + { + "epoch": 4.796507677673597, + "grad_norm": 7.642634868621826, + "learning_rate": 4.006913303437968e-06, + "loss": 2.8705, + "step": 70595 + }, + { + "epoch": 4.7968473977442585, + "grad_norm": 7.605443954467773, + "learning_rate": 4.00648865334964e-06, + "loss": 2.9219, + "step": 70600 + }, + { + "epoch": 4.797187117814921, + "grad_norm": 6.0552873611450195, + "learning_rate": 4.006064003261313e-06, + "loss": 2.8026, + "step": 70605 + }, + { + "epoch": 4.797526837885583, + "grad_norm": 7.123010635375977, + "learning_rate": 4.005639353172986e-06, + "loss": 2.9834, + "step": 70610 + }, + { + "epoch": 4.797866557956244, + "grad_norm": 7.436769008636475, + "learning_rate": 4.005214703084658e-06, + "loss": 2.8144, + "step": 70615 + }, + { + "epoch": 4.798206278026906, + "grad_norm": 6.9662957191467285, + "learning_rate": 4.004790052996331e-06, + "loss": 3.0268, + "step": 70620 + }, + { + "epoch": 4.798545998097568, + "grad_norm": 8.720091819763184, + "learning_rate": 4.004365402908004e-06, + "loss": 2.9988, + "step": 70625 + }, + { + "epoch": 4.798885718168229, + "grad_norm": 7.5440874099731445, + "learning_rate": 4.003940752819677e-06, + "loss": 2.7255, + "step": 70630 + }, + { + "epoch": 4.799225438238891, + "grad_norm": 7.040589332580566, + "learning_rate": 4.0035161027313495e-06, + "loss": 2.7635, + "step": 70635 + }, + { + "epoch": 4.799565158309553, + "grad_norm": 8.954111099243164, + "learning_rate": 4.003091452643022e-06, + "loss": 2.7853, + "step": 70640 + }, + { + "epoch": 4.7999048783802145, + "grad_norm": 7.848041534423828, + "learning_rate": 4.002666802554696e-06, + "loss": 2.7527, + "step": 70645 + }, + { + "epoch": 4.800244598450877, + "grad_norm": 7.734989166259766, + "learning_rate": 4.002242152466368e-06, + "loss": 2.8247, + "step": 70650 + }, + { + "epoch": 4.800584318521539, + "grad_norm": 7.9117536544799805, + "learning_rate": 4.001817502378041e-06, + "loss": 2.8821, + "step": 70655 + }, + { + "epoch": 4.8009240385922, + "grad_norm": 6.852624893188477, + "learning_rate": 4.0013928522897135e-06, + "loss": 2.9118, + "step": 70660 + }, + { + "epoch": 4.801263758662862, + "grad_norm": 6.107615947723389, + "learning_rate": 4.000968202201386e-06, + "loss": 3.0165, + "step": 70665 + }, + { + "epoch": 4.801603478733524, + "grad_norm": 7.873947620391846, + "learning_rate": 4.000543552113059e-06, + "loss": 2.6844, + "step": 70670 + }, + { + "epoch": 4.801943198804185, + "grad_norm": 7.710322856903076, + "learning_rate": 4.000118902024732e-06, + "loss": 3.217, + "step": 70675 + }, + { + "epoch": 4.802282918874847, + "grad_norm": 6.559734344482422, + "learning_rate": 3.999694251936405e-06, + "loss": 2.9943, + "step": 70680 + }, + { + "epoch": 4.802622638945509, + "grad_norm": 6.043304920196533, + "learning_rate": 3.9992696018480775e-06, + "loss": 2.8296, + "step": 70685 + }, + { + "epoch": 4.8029623590161705, + "grad_norm": 8.23183822631836, + "learning_rate": 3.99884495175975e-06, + "loss": 2.8586, + "step": 70690 + }, + { + "epoch": 4.803302079086833, + "grad_norm": 7.506150245666504, + "learning_rate": 3.998420301671423e-06, + "loss": 3.0533, + "step": 70695 + }, + { + "epoch": 4.803641799157495, + "grad_norm": 8.641501426696777, + "learning_rate": 3.997995651583096e-06, + "loss": 2.803, + "step": 70700 + }, + { + "epoch": 4.803981519228156, + "grad_norm": 8.30065631866455, + "learning_rate": 3.997571001494769e-06, + "loss": 2.7692, + "step": 70705 + }, + { + "epoch": 4.804321239298818, + "grad_norm": 7.522398948669434, + "learning_rate": 3.9971463514064415e-06, + "loss": 2.934, + "step": 70710 + }, + { + "epoch": 4.80466095936948, + "grad_norm": 7.71860933303833, + "learning_rate": 3.996721701318114e-06, + "loss": 2.8003, + "step": 70715 + }, + { + "epoch": 4.805000679440141, + "grad_norm": 5.382832050323486, + "learning_rate": 3.996297051229787e-06, + "loss": 2.846, + "step": 70720 + }, + { + "epoch": 4.805340399510803, + "grad_norm": 5.754716873168945, + "learning_rate": 3.99587240114146e-06, + "loss": 2.7054, + "step": 70725 + }, + { + "epoch": 4.805680119581465, + "grad_norm": 7.374516487121582, + "learning_rate": 3.995447751053133e-06, + "loss": 2.8912, + "step": 70730 + }, + { + "epoch": 4.8060198396521265, + "grad_norm": 7.1272430419921875, + "learning_rate": 3.9950231009648055e-06, + "loss": 2.8916, + "step": 70735 + }, + { + "epoch": 4.806359559722789, + "grad_norm": 7.255885601043701, + "learning_rate": 3.994598450876478e-06, + "loss": 2.8244, + "step": 70740 + }, + { + "epoch": 4.806699279793451, + "grad_norm": 7.2174859046936035, + "learning_rate": 3.994173800788151e-06, + "loss": 2.8731, + "step": 70745 + }, + { + "epoch": 4.807038999864112, + "grad_norm": 7.982731342315674, + "learning_rate": 3.993749150699824e-06, + "loss": 3.0362, + "step": 70750 + }, + { + "epoch": 4.807378719934774, + "grad_norm": 5.980330467224121, + "learning_rate": 3.993324500611496e-06, + "loss": 2.8493, + "step": 70755 + }, + { + "epoch": 4.807718440005435, + "grad_norm": 8.34940242767334, + "learning_rate": 3.9928998505231695e-06, + "loss": 2.8145, + "step": 70760 + }, + { + "epoch": 4.808058160076097, + "grad_norm": 6.878525733947754, + "learning_rate": 3.992475200434842e-06, + "loss": 3.0624, + "step": 70765 + }, + { + "epoch": 4.808397880146759, + "grad_norm": 8.91287899017334, + "learning_rate": 3.992050550346514e-06, + "loss": 2.8805, + "step": 70770 + }, + { + "epoch": 4.8087376002174205, + "grad_norm": 6.774704456329346, + "learning_rate": 3.991625900258188e-06, + "loss": 2.8866, + "step": 70775 + }, + { + "epoch": 4.8090773202880825, + "grad_norm": 6.325443744659424, + "learning_rate": 3.991201250169861e-06, + "loss": 2.6274, + "step": 70780 + }, + { + "epoch": 4.809417040358745, + "grad_norm": 5.750970363616943, + "learning_rate": 3.990776600081533e-06, + "loss": 2.9398, + "step": 70785 + }, + { + "epoch": 4.809756760429406, + "grad_norm": 7.881587028503418, + "learning_rate": 3.9903519499932054e-06, + "loss": 2.7057, + "step": 70790 + }, + { + "epoch": 4.810096480500068, + "grad_norm": 7.530877590179443, + "learning_rate": 3.989927299904879e-06, + "loss": 2.7956, + "step": 70795 + }, + { + "epoch": 4.81043620057073, + "grad_norm": 6.845832347869873, + "learning_rate": 3.989502649816551e-06, + "loss": 2.7604, + "step": 70800 + }, + { + "epoch": 4.810775920641391, + "grad_norm": 8.008890151977539, + "learning_rate": 3.989077999728224e-06, + "loss": 2.8822, + "step": 70805 + }, + { + "epoch": 4.811115640712053, + "grad_norm": 6.887450218200684, + "learning_rate": 3.9886533496398975e-06, + "loss": 2.8298, + "step": 70810 + }, + { + "epoch": 4.811455360782715, + "grad_norm": 7.993348121643066, + "learning_rate": 3.98822869955157e-06, + "loss": 2.9871, + "step": 70815 + }, + { + "epoch": 4.8117950808533765, + "grad_norm": 7.351254940032959, + "learning_rate": 3.987804049463242e-06, + "loss": 2.9008, + "step": 70820 + }, + { + "epoch": 4.812134800924039, + "grad_norm": 7.205068111419678, + "learning_rate": 3.987379399374915e-06, + "loss": 2.8663, + "step": 70825 + }, + { + "epoch": 4.812474520994701, + "grad_norm": 6.936645030975342, + "learning_rate": 3.986954749286589e-06, + "loss": 3.0292, + "step": 70830 + }, + { + "epoch": 4.812814241065362, + "grad_norm": 8.151089668273926, + "learning_rate": 3.986530099198261e-06, + "loss": 2.9088, + "step": 70835 + }, + { + "epoch": 4.813153961136024, + "grad_norm": 9.829867362976074, + "learning_rate": 3.9861054491099335e-06, + "loss": 2.9346, + "step": 70840 + }, + { + "epoch": 4.813493681206686, + "grad_norm": 8.701244354248047, + "learning_rate": 3.985680799021607e-06, + "loss": 3.212, + "step": 70845 + }, + { + "epoch": 4.813833401277347, + "grad_norm": 5.920718669891357, + "learning_rate": 3.985256148933279e-06, + "loss": 2.8429, + "step": 70850 + }, + { + "epoch": 4.814173121348009, + "grad_norm": 6.84199333190918, + "learning_rate": 3.984831498844952e-06, + "loss": 2.9709, + "step": 70855 + }, + { + "epoch": 4.814512841418671, + "grad_norm": 5.8079352378845215, + "learning_rate": 3.9844068487566255e-06, + "loss": 2.7628, + "step": 70860 + }, + { + "epoch": 4.8148525614893325, + "grad_norm": 7.687019348144531, + "learning_rate": 3.9839821986682975e-06, + "loss": 2.8865, + "step": 70865 + }, + { + "epoch": 4.815192281559995, + "grad_norm": 6.463468551635742, + "learning_rate": 3.98355754857997e-06, + "loss": 2.729, + "step": 70870 + }, + { + "epoch": 4.815532001630657, + "grad_norm": 5.964129447937012, + "learning_rate": 3.983132898491643e-06, + "loss": 2.6573, + "step": 70875 + }, + { + "epoch": 4.815871721701318, + "grad_norm": 7.931541442871094, + "learning_rate": 3.982708248403316e-06, + "loss": 3.021, + "step": 70880 + }, + { + "epoch": 4.81621144177198, + "grad_norm": 6.581204414367676, + "learning_rate": 3.982283598314989e-06, + "loss": 2.8567, + "step": 70885 + }, + { + "epoch": 4.816551161842642, + "grad_norm": 10.281426429748535, + "learning_rate": 3.9818589482266615e-06, + "loss": 3.0927, + "step": 70890 + }, + { + "epoch": 4.816890881913303, + "grad_norm": 6.641591548919678, + "learning_rate": 3.981434298138334e-06, + "loss": 3.0573, + "step": 70895 + }, + { + "epoch": 4.817230601983965, + "grad_norm": 8.458725929260254, + "learning_rate": 3.981009648050007e-06, + "loss": 2.9516, + "step": 70900 + }, + { + "epoch": 4.817570322054627, + "grad_norm": 8.700994491577148, + "learning_rate": 3.98058499796168e-06, + "loss": 3.0523, + "step": 70905 + }, + { + "epoch": 4.8179100421252885, + "grad_norm": 7.932255744934082, + "learning_rate": 3.980160347873353e-06, + "loss": 3.1792, + "step": 70910 + }, + { + "epoch": 4.818249762195951, + "grad_norm": 8.290681838989258, + "learning_rate": 3.9797356977850255e-06, + "loss": 2.8541, + "step": 70915 + }, + { + "epoch": 4.818589482266613, + "grad_norm": 6.494248867034912, + "learning_rate": 3.979311047696698e-06, + "loss": 2.6546, + "step": 70920 + }, + { + "epoch": 4.818929202337274, + "grad_norm": 9.04956340789795, + "learning_rate": 3.978886397608371e-06, + "loss": 2.8364, + "step": 70925 + }, + { + "epoch": 4.819268922407936, + "grad_norm": 7.7919416427612305, + "learning_rate": 3.978461747520044e-06, + "loss": 2.8192, + "step": 70930 + }, + { + "epoch": 4.819608642478598, + "grad_norm": 7.669704437255859, + "learning_rate": 3.978037097431717e-06, + "loss": 2.8145, + "step": 70935 + }, + { + "epoch": 4.819948362549259, + "grad_norm": 7.103949069976807, + "learning_rate": 3.9776124473433895e-06, + "loss": 2.9326, + "step": 70940 + }, + { + "epoch": 4.820288082619921, + "grad_norm": 7.474774360656738, + "learning_rate": 3.977187797255062e-06, + "loss": 2.9813, + "step": 70945 + }, + { + "epoch": 4.820627802690583, + "grad_norm": 6.88027286529541, + "learning_rate": 3.976763147166735e-06, + "loss": 2.8615, + "step": 70950 + }, + { + "epoch": 4.8209675227612445, + "grad_norm": 6.884002685546875, + "learning_rate": 3.976338497078408e-06, + "loss": 2.8064, + "step": 70955 + }, + { + "epoch": 4.821307242831907, + "grad_norm": 6.728096961975098, + "learning_rate": 3.975913846990081e-06, + "loss": 2.9109, + "step": 70960 + }, + { + "epoch": 4.821646962902569, + "grad_norm": 7.3729329109191895, + "learning_rate": 3.9754891969017535e-06, + "loss": 3.0022, + "step": 70965 + }, + { + "epoch": 4.82198668297323, + "grad_norm": 7.24360990524292, + "learning_rate": 3.9750645468134254e-06, + "loss": 2.9791, + "step": 70970 + }, + { + "epoch": 4.822326403043892, + "grad_norm": 6.673053741455078, + "learning_rate": 3.974639896725099e-06, + "loss": 2.8723, + "step": 70975 + }, + { + "epoch": 4.822666123114553, + "grad_norm": 6.952994346618652, + "learning_rate": 3.974215246636772e-06, + "loss": 2.5675, + "step": 70980 + }, + { + "epoch": 4.823005843185215, + "grad_norm": 7.703061103820801, + "learning_rate": 3.973790596548445e-06, + "loss": 2.9087, + "step": 70985 + }, + { + "epoch": 4.823345563255877, + "grad_norm": 7.246501445770264, + "learning_rate": 3.9733659464601175e-06, + "loss": 2.7697, + "step": 70990 + }, + { + "epoch": 4.8236852833265385, + "grad_norm": 7.7329816818237305, + "learning_rate": 3.97294129637179e-06, + "loss": 2.822, + "step": 70995 + }, + { + "epoch": 4.8240250033972005, + "grad_norm": 8.425265312194824, + "learning_rate": 3.972516646283463e-06, + "loss": 2.9196, + "step": 71000 + }, + { + "epoch": 4.824364723467863, + "grad_norm": 6.967437267303467, + "learning_rate": 3.972091996195135e-06, + "loss": 2.8097, + "step": 71005 + }, + { + "epoch": 4.824704443538524, + "grad_norm": 9.550254821777344, + "learning_rate": 3.971667346106809e-06, + "loss": 2.9339, + "step": 71010 + }, + { + "epoch": 4.825044163609186, + "grad_norm": 8.954854011535645, + "learning_rate": 3.9712426960184815e-06, + "loss": 2.4436, + "step": 71015 + }, + { + "epoch": 4.825383883679848, + "grad_norm": 6.5403618812561035, + "learning_rate": 3.9708180459301534e-06, + "loss": 2.7198, + "step": 71020 + }, + { + "epoch": 4.825723603750509, + "grad_norm": 7.820460319519043, + "learning_rate": 3.970393395841827e-06, + "loss": 2.9966, + "step": 71025 + }, + { + "epoch": 4.826063323821171, + "grad_norm": 6.611223220825195, + "learning_rate": 3.9699687457535e-06, + "loss": 3.0487, + "step": 71030 + }, + { + "epoch": 4.826403043891833, + "grad_norm": 8.073612213134766, + "learning_rate": 3.969544095665172e-06, + "loss": 3.0444, + "step": 71035 + }, + { + "epoch": 4.8267427639624945, + "grad_norm": 6.9860405921936035, + "learning_rate": 3.969119445576845e-06, + "loss": 2.8399, + "step": 71040 + }, + { + "epoch": 4.8270824840331565, + "grad_norm": 7.96758508682251, + "learning_rate": 3.968694795488518e-06, + "loss": 2.6525, + "step": 71045 + }, + { + "epoch": 4.827422204103819, + "grad_norm": 6.515589714050293, + "learning_rate": 3.96827014540019e-06, + "loss": 2.8771, + "step": 71050 + }, + { + "epoch": 4.82776192417448, + "grad_norm": 6.967366695404053, + "learning_rate": 3.967845495311863e-06, + "loss": 2.7575, + "step": 71055 + }, + { + "epoch": 4.828101644245142, + "grad_norm": 6.728609085083008, + "learning_rate": 3.967420845223537e-06, + "loss": 2.8874, + "step": 71060 + }, + { + "epoch": 4.828441364315804, + "grad_norm": 7.909602642059326, + "learning_rate": 3.966996195135209e-06, + "loss": 2.973, + "step": 71065 + }, + { + "epoch": 4.828781084386465, + "grad_norm": 8.201729774475098, + "learning_rate": 3.9665715450468814e-06, + "loss": 2.8052, + "step": 71070 + }, + { + "epoch": 4.829120804457127, + "grad_norm": 8.380651473999023, + "learning_rate": 3.966146894958554e-06, + "loss": 2.8453, + "step": 71075 + }, + { + "epoch": 4.829460524527789, + "grad_norm": 7.739345073699951, + "learning_rate": 3.965722244870227e-06, + "loss": 2.7725, + "step": 71080 + }, + { + "epoch": 4.8298002445984505, + "grad_norm": 7.10455846786499, + "learning_rate": 3.9652975947819e-06, + "loss": 2.9934, + "step": 71085 + }, + { + "epoch": 4.8301399646691126, + "grad_norm": 7.013828277587891, + "learning_rate": 3.964872944693573e-06, + "loss": 2.9676, + "step": 71090 + }, + { + "epoch": 4.830479684739775, + "grad_norm": 7.894550800323486, + "learning_rate": 3.9644482946052454e-06, + "loss": 2.8815, + "step": 71095 + }, + { + "epoch": 4.830819404810436, + "grad_norm": 7.095306396484375, + "learning_rate": 3.964023644516918e-06, + "loss": 2.8537, + "step": 71100 + }, + { + "epoch": 4.831159124881098, + "grad_norm": 8.60036849975586, + "learning_rate": 3.963598994428591e-06, + "loss": 2.7689, + "step": 71105 + }, + { + "epoch": 4.83149884495176, + "grad_norm": 7.724127769470215, + "learning_rate": 3.963174344340264e-06, + "loss": 3.0716, + "step": 71110 + }, + { + "epoch": 4.831838565022421, + "grad_norm": 7.930990695953369, + "learning_rate": 3.962749694251937e-06, + "loss": 2.994, + "step": 71115 + }, + { + "epoch": 4.832178285093083, + "grad_norm": 6.652949333190918, + "learning_rate": 3.9623250441636094e-06, + "loss": 2.7256, + "step": 71120 + }, + { + "epoch": 4.832518005163745, + "grad_norm": 6.967902183532715, + "learning_rate": 3.961900394075282e-06, + "loss": 3.0693, + "step": 71125 + }, + { + "epoch": 4.8328577252344065, + "grad_norm": 7.832798480987549, + "learning_rate": 3.961475743986955e-06, + "loss": 2.7406, + "step": 71130 + }, + { + "epoch": 4.833197445305069, + "grad_norm": 6.18649435043335, + "learning_rate": 3.961051093898628e-06, + "loss": 2.9778, + "step": 71135 + }, + { + "epoch": 4.833537165375731, + "grad_norm": 8.337956428527832, + "learning_rate": 3.960626443810301e-06, + "loss": 2.5854, + "step": 71140 + }, + { + "epoch": 4.833876885446392, + "grad_norm": 7.576107501983643, + "learning_rate": 3.9602017937219735e-06, + "loss": 2.8782, + "step": 71145 + }, + { + "epoch": 4.834216605517054, + "grad_norm": 6.163726329803467, + "learning_rate": 3.959777143633646e-06, + "loss": 2.9833, + "step": 71150 + }, + { + "epoch": 4.834556325587716, + "grad_norm": 9.326109886169434, + "learning_rate": 3.959352493545319e-06, + "loss": 2.776, + "step": 71155 + }, + { + "epoch": 4.834896045658377, + "grad_norm": 7.345394134521484, + "learning_rate": 3.958927843456992e-06, + "loss": 2.7665, + "step": 71160 + }, + { + "epoch": 4.835235765729039, + "grad_norm": 8.600248336791992, + "learning_rate": 3.958503193368665e-06, + "loss": 3.0529, + "step": 71165 + }, + { + "epoch": 4.835575485799701, + "grad_norm": 7.0162672996521, + "learning_rate": 3.9580785432803375e-06, + "loss": 2.9516, + "step": 71170 + }, + { + "epoch": 4.8359152058703625, + "grad_norm": 11.089599609375, + "learning_rate": 3.95765389319201e-06, + "loss": 2.9495, + "step": 71175 + }, + { + "epoch": 4.836254925941025, + "grad_norm": 5.057751178741455, + "learning_rate": 3.957229243103683e-06, + "loss": 2.8189, + "step": 71180 + }, + { + "epoch": 4.836594646011687, + "grad_norm": 8.775968551635742, + "learning_rate": 3.956804593015356e-06, + "loss": 2.9479, + "step": 71185 + }, + { + "epoch": 4.836934366082348, + "grad_norm": 7.1127448081970215, + "learning_rate": 3.956379942927029e-06, + "loss": 2.7525, + "step": 71190 + }, + { + "epoch": 4.83727408615301, + "grad_norm": 8.055057525634766, + "learning_rate": 3.9559552928387015e-06, + "loss": 2.7797, + "step": 71195 + }, + { + "epoch": 4.837613806223672, + "grad_norm": 7.295104026794434, + "learning_rate": 3.955530642750374e-06, + "loss": 2.8085, + "step": 71200 + }, + { + "epoch": 4.837953526294333, + "grad_norm": 8.534187316894531, + "learning_rate": 3.955105992662047e-06, + "loss": 2.9258, + "step": 71205 + }, + { + "epoch": 4.838293246364995, + "grad_norm": 10.30492115020752, + "learning_rate": 3.95468134257372e-06, + "loss": 2.9191, + "step": 71210 + }, + { + "epoch": 4.838632966435657, + "grad_norm": 7.911304950714111, + "learning_rate": 3.954256692485393e-06, + "loss": 2.9628, + "step": 71215 + }, + { + "epoch": 4.8389726865063185, + "grad_norm": 6.5441107749938965, + "learning_rate": 3.953832042397065e-06, + "loss": 2.7586, + "step": 71220 + }, + { + "epoch": 4.839312406576981, + "grad_norm": 6.5068135261535645, + "learning_rate": 3.953407392308738e-06, + "loss": 2.7951, + "step": 71225 + }, + { + "epoch": 4.839652126647643, + "grad_norm": 5.506045341491699, + "learning_rate": 3.952982742220411e-06, + "loss": 2.6122, + "step": 71230 + }, + { + "epoch": 4.839991846718304, + "grad_norm": 7.448179721832275, + "learning_rate": 3.952558092132083e-06, + "loss": 2.719, + "step": 71235 + }, + { + "epoch": 4.840331566788966, + "grad_norm": 7.2891526222229, + "learning_rate": 3.952133442043757e-06, + "loss": 3.0269, + "step": 71240 + }, + { + "epoch": 4.840671286859628, + "grad_norm": 8.91067123413086, + "learning_rate": 3.9517087919554295e-06, + "loss": 2.739, + "step": 71245 + }, + { + "epoch": 4.841011006930289, + "grad_norm": 6.972611904144287, + "learning_rate": 3.951284141867101e-06, + "loss": 2.9797, + "step": 71250 + }, + { + "epoch": 4.841350727000951, + "grad_norm": 7.874289035797119, + "learning_rate": 3.950859491778774e-06, + "loss": 2.975, + "step": 71255 + }, + { + "epoch": 4.841690447071613, + "grad_norm": 8.0787353515625, + "learning_rate": 3.950434841690448e-06, + "loss": 2.8988, + "step": 71260 + }, + { + "epoch": 4.8420301671422745, + "grad_norm": 8.061734199523926, + "learning_rate": 3.95001019160212e-06, + "loss": 2.9374, + "step": 71265 + }, + { + "epoch": 4.842369887212937, + "grad_norm": 6.549830913543701, + "learning_rate": 3.949585541513793e-06, + "loss": 2.7421, + "step": 71270 + }, + { + "epoch": 4.842709607283599, + "grad_norm": 7.168211460113525, + "learning_rate": 3.949160891425466e-06, + "loss": 2.9163, + "step": 71275 + }, + { + "epoch": 4.84304932735426, + "grad_norm": 7.0362091064453125, + "learning_rate": 3.948736241337138e-06, + "loss": 2.8325, + "step": 71280 + }, + { + "epoch": 4.843389047424922, + "grad_norm": 7.935643672943115, + "learning_rate": 3.948311591248811e-06, + "loss": 2.8125, + "step": 71285 + }, + { + "epoch": 4.843728767495584, + "grad_norm": 8.246258735656738, + "learning_rate": 3.947886941160484e-06, + "loss": 2.756, + "step": 71290 + }, + { + "epoch": 4.844068487566245, + "grad_norm": 6.215963363647461, + "learning_rate": 3.947462291072157e-06, + "loss": 2.7566, + "step": 71295 + }, + { + "epoch": 4.844408207636907, + "grad_norm": 5.7078633308410645, + "learning_rate": 3.9470376409838294e-06, + "loss": 2.6208, + "step": 71300 + }, + { + "epoch": 4.844747927707569, + "grad_norm": 5.879544734954834, + "learning_rate": 3.946612990895502e-06, + "loss": 3.0521, + "step": 71305 + }, + { + "epoch": 4.8450876477782305, + "grad_norm": 7.612137317657471, + "learning_rate": 3.946188340807175e-06, + "loss": 2.6797, + "step": 71310 + }, + { + "epoch": 4.845427367848893, + "grad_norm": 7.830124378204346, + "learning_rate": 3.945763690718848e-06, + "loss": 2.8981, + "step": 71315 + }, + { + "epoch": 4.845767087919555, + "grad_norm": 7.332218647003174, + "learning_rate": 3.945339040630521e-06, + "loss": 3.0287, + "step": 71320 + }, + { + "epoch": 4.846106807990216, + "grad_norm": 7.18240213394165, + "learning_rate": 3.9449143905421934e-06, + "loss": 3.0285, + "step": 71325 + }, + { + "epoch": 4.846446528060878, + "grad_norm": 11.027581214904785, + "learning_rate": 3.944489740453866e-06, + "loss": 3.0086, + "step": 71330 + }, + { + "epoch": 4.84678624813154, + "grad_norm": 8.859554290771484, + "learning_rate": 3.944065090365539e-06, + "loss": 3.023, + "step": 71335 + }, + { + "epoch": 4.847125968202201, + "grad_norm": 7.4293951988220215, + "learning_rate": 3.943640440277212e-06, + "loss": 2.6729, + "step": 71340 + }, + { + "epoch": 4.847465688272863, + "grad_norm": 6.760748863220215, + "learning_rate": 3.943215790188885e-06, + "loss": 2.9509, + "step": 71345 + }, + { + "epoch": 4.847805408343525, + "grad_norm": 7.186984062194824, + "learning_rate": 3.9427911401005574e-06, + "loss": 2.9591, + "step": 71350 + }, + { + "epoch": 4.8481451284141865, + "grad_norm": 8.79909610748291, + "learning_rate": 3.94236649001223e-06, + "loss": 3.1578, + "step": 71355 + }, + { + "epoch": 4.848484848484849, + "grad_norm": 6.327155113220215, + "learning_rate": 3.941941839923903e-06, + "loss": 2.7883, + "step": 71360 + }, + { + "epoch": 4.848824568555511, + "grad_norm": 6.789480686187744, + "learning_rate": 3.941517189835576e-06, + "loss": 2.9026, + "step": 71365 + }, + { + "epoch": 4.849164288626172, + "grad_norm": 6.611035346984863, + "learning_rate": 3.941092539747249e-06, + "loss": 2.8339, + "step": 71370 + }, + { + "epoch": 4.849504008696834, + "grad_norm": 7.178590774536133, + "learning_rate": 3.9406678896589214e-06, + "loss": 2.8979, + "step": 71375 + }, + { + "epoch": 4.849843728767496, + "grad_norm": 6.587455749511719, + "learning_rate": 3.940243239570594e-06, + "loss": 2.8546, + "step": 71380 + }, + { + "epoch": 4.850183448838157, + "grad_norm": 8.606696128845215, + "learning_rate": 3.939818589482267e-06, + "loss": 2.7151, + "step": 71385 + }, + { + "epoch": 4.850523168908819, + "grad_norm": 7.126378059387207, + "learning_rate": 3.93939393939394e-06, + "loss": 2.9165, + "step": 71390 + }, + { + "epoch": 4.850862888979481, + "grad_norm": 6.996674537658691, + "learning_rate": 3.938969289305613e-06, + "loss": 2.6378, + "step": 71395 + }, + { + "epoch": 4.8512026090501426, + "grad_norm": 6.200626373291016, + "learning_rate": 3.9385446392172854e-06, + "loss": 2.7413, + "step": 71400 + }, + { + "epoch": 4.851542329120805, + "grad_norm": 9.475825309753418, + "learning_rate": 3.938119989128958e-06, + "loss": 2.8365, + "step": 71405 + }, + { + "epoch": 4.851882049191467, + "grad_norm": 8.563474655151367, + "learning_rate": 3.937695339040631e-06, + "loss": 2.7416, + "step": 71410 + }, + { + "epoch": 4.852221769262128, + "grad_norm": 7.860194683074951, + "learning_rate": 3.937270688952304e-06, + "loss": 2.9107, + "step": 71415 + }, + { + "epoch": 4.85256148933279, + "grad_norm": 7.0129852294921875, + "learning_rate": 3.936846038863976e-06, + "loss": 2.7524, + "step": 71420 + }, + { + "epoch": 4.852901209403452, + "grad_norm": 6.445945739746094, + "learning_rate": 3.9364213887756494e-06, + "loss": 2.7732, + "step": 71425 + }, + { + "epoch": 4.853240929474113, + "grad_norm": 7.08650016784668, + "learning_rate": 3.935996738687322e-06, + "loss": 3.0674, + "step": 71430 + }, + { + "epoch": 4.853580649544775, + "grad_norm": 8.017946243286133, + "learning_rate": 3.935572088598994e-06, + "loss": 3.0433, + "step": 71435 + }, + { + "epoch": 4.8539203696154365, + "grad_norm": 6.477481842041016, + "learning_rate": 3.935147438510668e-06, + "loss": 2.9672, + "step": 71440 + }, + { + "epoch": 4.854260089686099, + "grad_norm": 9.009156227111816, + "learning_rate": 3.934722788422341e-06, + "loss": 2.7803, + "step": 71445 + }, + { + "epoch": 4.854599809756761, + "grad_norm": 6.9333367347717285, + "learning_rate": 3.934298138334013e-06, + "loss": 2.9331, + "step": 71450 + }, + { + "epoch": 4.854939529827422, + "grad_norm": 7.305639743804932, + "learning_rate": 3.933873488245685e-06, + "loss": 2.6987, + "step": 71455 + }, + { + "epoch": 4.855279249898084, + "grad_norm": 8.540698051452637, + "learning_rate": 3.933448838157359e-06, + "loss": 2.7636, + "step": 71460 + }, + { + "epoch": 4.855618969968746, + "grad_norm": 8.443689346313477, + "learning_rate": 3.933024188069031e-06, + "loss": 2.8571, + "step": 71465 + }, + { + "epoch": 4.855958690039407, + "grad_norm": 8.707584381103516, + "learning_rate": 3.932599537980704e-06, + "loss": 3.0042, + "step": 71470 + }, + { + "epoch": 4.856298410110069, + "grad_norm": 9.33671760559082, + "learning_rate": 3.9321748878923775e-06, + "loss": 2.9083, + "step": 71475 + }, + { + "epoch": 4.856638130180731, + "grad_norm": 6.342630386352539, + "learning_rate": 3.931750237804049e-06, + "loss": 2.642, + "step": 71480 + }, + { + "epoch": 4.8569778502513925, + "grad_norm": 7.756043434143066, + "learning_rate": 3.931325587715722e-06, + "loss": 3.202, + "step": 71485 + }, + { + "epoch": 4.857317570322055, + "grad_norm": 8.708495140075684, + "learning_rate": 3.930900937627396e-06, + "loss": 2.997, + "step": 71490 + }, + { + "epoch": 4.857657290392717, + "grad_norm": 7.298834323883057, + "learning_rate": 3.930476287539069e-06, + "loss": 2.7162, + "step": 71495 + }, + { + "epoch": 4.857997010463378, + "grad_norm": 7.434762001037598, + "learning_rate": 3.930051637450741e-06, + "loss": 2.9908, + "step": 71500 + }, + { + "epoch": 4.85833673053404, + "grad_norm": 9.152462005615234, + "learning_rate": 3.929626987362413e-06, + "loss": 2.7062, + "step": 71505 + }, + { + "epoch": 4.858676450604702, + "grad_norm": 7.262764930725098, + "learning_rate": 3.929202337274087e-06, + "loss": 2.6146, + "step": 71510 + }, + { + "epoch": 4.859016170675363, + "grad_norm": 8.948013305664062, + "learning_rate": 3.928777687185759e-06, + "loss": 2.8188, + "step": 71515 + }, + { + "epoch": 4.859355890746025, + "grad_norm": 5.986660480499268, + "learning_rate": 3.928353037097432e-06, + "loss": 2.7643, + "step": 71520 + }, + { + "epoch": 4.859695610816687, + "grad_norm": 8.177078247070312, + "learning_rate": 3.9279283870091055e-06, + "loss": 2.9382, + "step": 71525 + }, + { + "epoch": 4.8600353308873485, + "grad_norm": 8.23403263092041, + "learning_rate": 3.927503736920777e-06, + "loss": 2.758, + "step": 71530 + }, + { + "epoch": 4.860375050958011, + "grad_norm": 9.149523735046387, + "learning_rate": 3.92707908683245e-06, + "loss": 2.8103, + "step": 71535 + }, + { + "epoch": 4.860714771028673, + "grad_norm": 8.504227638244629, + "learning_rate": 3.926654436744123e-06, + "loss": 2.8346, + "step": 71540 + }, + { + "epoch": 4.861054491099334, + "grad_norm": 7.48217248916626, + "learning_rate": 3.926229786655796e-06, + "loss": 3.17, + "step": 71545 + }, + { + "epoch": 4.861394211169996, + "grad_norm": 7.12393856048584, + "learning_rate": 3.925805136567469e-06, + "loss": 2.9789, + "step": 71550 + }, + { + "epoch": 4.861733931240658, + "grad_norm": 7.115837574005127, + "learning_rate": 3.925380486479141e-06, + "loss": 2.9511, + "step": 71555 + }, + { + "epoch": 4.862073651311319, + "grad_norm": 7.697124004364014, + "learning_rate": 3.924955836390814e-06, + "loss": 2.7115, + "step": 71560 + }, + { + "epoch": 4.862413371381981, + "grad_norm": 10.42916202545166, + "learning_rate": 3.924531186302487e-06, + "loss": 2.8385, + "step": 71565 + }, + { + "epoch": 4.862753091452643, + "grad_norm": 8.543478965759277, + "learning_rate": 3.92410653621416e-06, + "loss": 2.7705, + "step": 71570 + }, + { + "epoch": 4.8630928115233045, + "grad_norm": 6.447871685028076, + "learning_rate": 3.923681886125833e-06, + "loss": 2.988, + "step": 71575 + }, + { + "epoch": 4.863432531593967, + "grad_norm": 8.009345054626465, + "learning_rate": 3.9232572360375054e-06, + "loss": 2.8629, + "step": 71580 + }, + { + "epoch": 4.863772251664629, + "grad_norm": 6.951723575592041, + "learning_rate": 3.922832585949178e-06, + "loss": 3.0991, + "step": 71585 + }, + { + "epoch": 4.86411197173529, + "grad_norm": 6.266445159912109, + "learning_rate": 3.922407935860851e-06, + "loss": 2.5685, + "step": 71590 + }, + { + "epoch": 4.864451691805952, + "grad_norm": 6.850884437561035, + "learning_rate": 3.921983285772524e-06, + "loss": 3.0183, + "step": 71595 + }, + { + "epoch": 4.864791411876614, + "grad_norm": 6.168191909790039, + "learning_rate": 3.921558635684197e-06, + "loss": 2.8011, + "step": 71600 + }, + { + "epoch": 4.865131131947275, + "grad_norm": 6.637813091278076, + "learning_rate": 3.9211339855958694e-06, + "loss": 2.9467, + "step": 71605 + }, + { + "epoch": 4.865470852017937, + "grad_norm": 8.401575088500977, + "learning_rate": 3.920709335507542e-06, + "loss": 2.6833, + "step": 71610 + }, + { + "epoch": 4.865810572088599, + "grad_norm": 7.205918788909912, + "learning_rate": 3.920284685419215e-06, + "loss": 2.8016, + "step": 71615 + }, + { + "epoch": 4.8661502921592605, + "grad_norm": 8.928770065307617, + "learning_rate": 3.919860035330888e-06, + "loss": 2.8494, + "step": 71620 + }, + { + "epoch": 4.866490012229923, + "grad_norm": 7.323704719543457, + "learning_rate": 3.919435385242561e-06, + "loss": 3.0535, + "step": 71625 + }, + { + "epoch": 4.866829732300585, + "grad_norm": 7.472269058227539, + "learning_rate": 3.9190107351542334e-06, + "loss": 2.7971, + "step": 71630 + }, + { + "epoch": 4.867169452371246, + "grad_norm": 6.964853286743164, + "learning_rate": 3.918586085065905e-06, + "loss": 2.9128, + "step": 71635 + }, + { + "epoch": 4.867509172441908, + "grad_norm": 6.858909606933594, + "learning_rate": 3.918161434977579e-06, + "loss": 2.7783, + "step": 71640 + }, + { + "epoch": 4.86784889251257, + "grad_norm": 9.247932434082031, + "learning_rate": 3.917736784889252e-06, + "loss": 3.0674, + "step": 71645 + }, + { + "epoch": 4.868188612583231, + "grad_norm": 5.708276271820068, + "learning_rate": 3.917312134800924e-06, + "loss": 2.8962, + "step": 71650 + }, + { + "epoch": 4.868528332653893, + "grad_norm": 8.036192893981934, + "learning_rate": 3.9168874847125974e-06, + "loss": 2.9734, + "step": 71655 + }, + { + "epoch": 4.8688680527245545, + "grad_norm": 7.698583126068115, + "learning_rate": 3.91646283462427e-06, + "loss": 2.6509, + "step": 71660 + }, + { + "epoch": 4.8692077727952165, + "grad_norm": 7.146472454071045, + "learning_rate": 3.916038184535943e-06, + "loss": 3.0183, + "step": 71665 + }, + { + "epoch": 4.869547492865879, + "grad_norm": 6.084766387939453, + "learning_rate": 3.915613534447615e-06, + "loss": 3.2494, + "step": 71670 + }, + { + "epoch": 4.86988721293654, + "grad_norm": 6.4263434410095215, + "learning_rate": 3.915188884359289e-06, + "loss": 3.0148, + "step": 71675 + }, + { + "epoch": 4.870226933007202, + "grad_norm": 6.9338274002075195, + "learning_rate": 3.9147642342709614e-06, + "loss": 2.9359, + "step": 71680 + }, + { + "epoch": 4.870566653077864, + "grad_norm": 8.313895225524902, + "learning_rate": 3.914339584182633e-06, + "loss": 2.5764, + "step": 71685 + }, + { + "epoch": 4.870906373148525, + "grad_norm": 7.650493144989014, + "learning_rate": 3.913914934094307e-06, + "loss": 2.739, + "step": 71690 + }, + { + "epoch": 4.871246093219187, + "grad_norm": 7.091349124908447, + "learning_rate": 3.91349028400598e-06, + "loss": 2.8372, + "step": 71695 + }, + { + "epoch": 4.871585813289849, + "grad_norm": 7.220814228057861, + "learning_rate": 3.913065633917652e-06, + "loss": 2.8766, + "step": 71700 + }, + { + "epoch": 4.8719255333605105, + "grad_norm": 6.117645263671875, + "learning_rate": 3.912640983829325e-06, + "loss": 2.738, + "step": 71705 + }, + { + "epoch": 4.872265253431173, + "grad_norm": 7.856549263000488, + "learning_rate": 3.912216333740998e-06, + "loss": 2.5843, + "step": 71710 + }, + { + "epoch": 4.872604973501835, + "grad_norm": 7.66455078125, + "learning_rate": 3.91179168365267e-06, + "loss": 2.6544, + "step": 71715 + }, + { + "epoch": 4.872944693572496, + "grad_norm": 6.997447967529297, + "learning_rate": 3.911367033564343e-06, + "loss": 3.0282, + "step": 71720 + }, + { + "epoch": 4.873284413643158, + "grad_norm": 8.083439826965332, + "learning_rate": 3.910942383476017e-06, + "loss": 2.9158, + "step": 71725 + }, + { + "epoch": 4.87362413371382, + "grad_norm": 7.048851490020752, + "learning_rate": 3.910517733387689e-06, + "loss": 2.9915, + "step": 71730 + }, + { + "epoch": 4.873963853784481, + "grad_norm": 7.475986480712891, + "learning_rate": 3.910093083299361e-06, + "loss": 3.0593, + "step": 71735 + }, + { + "epoch": 4.874303573855143, + "grad_norm": 6.4395551681518555, + "learning_rate": 3.909668433211034e-06, + "loss": 2.8697, + "step": 71740 + }, + { + "epoch": 4.874643293925805, + "grad_norm": 7.697344779968262, + "learning_rate": 3.909243783122707e-06, + "loss": 3.0254, + "step": 71745 + }, + { + "epoch": 4.8749830139964665, + "grad_norm": 6.8789777755737305, + "learning_rate": 3.90881913303438e-06, + "loss": 2.8555, + "step": 71750 + }, + { + "epoch": 4.875322734067129, + "grad_norm": 7.02832555770874, + "learning_rate": 3.908394482946053e-06, + "loss": 2.7156, + "step": 71755 + }, + { + "epoch": 4.875662454137791, + "grad_norm": 6.5248637199401855, + "learning_rate": 3.907969832857725e-06, + "loss": 2.8269, + "step": 71760 + }, + { + "epoch": 4.876002174208452, + "grad_norm": 5.904773235321045, + "learning_rate": 3.907545182769398e-06, + "loss": 3.003, + "step": 71765 + }, + { + "epoch": 4.876341894279114, + "grad_norm": 8.86627197265625, + "learning_rate": 3.907120532681071e-06, + "loss": 2.8039, + "step": 71770 + }, + { + "epoch": 4.876681614349776, + "grad_norm": 8.024470329284668, + "learning_rate": 3.906695882592744e-06, + "loss": 2.9649, + "step": 71775 + }, + { + "epoch": 4.877021334420437, + "grad_norm": 6.036489963531494, + "learning_rate": 3.906271232504417e-06, + "loss": 2.9914, + "step": 71780 + }, + { + "epoch": 4.877361054491099, + "grad_norm": 6.82889986038208, + "learning_rate": 3.905846582416089e-06, + "loss": 2.906, + "step": 71785 + }, + { + "epoch": 4.877700774561761, + "grad_norm": 8.444062232971191, + "learning_rate": 3.905421932327762e-06, + "loss": 2.8694, + "step": 71790 + }, + { + "epoch": 4.8780404946324225, + "grad_norm": 6.686005115509033, + "learning_rate": 3.904997282239435e-06, + "loss": 2.5953, + "step": 71795 + }, + { + "epoch": 4.878380214703085, + "grad_norm": 6.982565402984619, + "learning_rate": 3.904572632151108e-06, + "loss": 2.9825, + "step": 71800 + }, + { + "epoch": 4.878719934773747, + "grad_norm": 8.501317024230957, + "learning_rate": 3.904147982062781e-06, + "loss": 2.7979, + "step": 71805 + }, + { + "epoch": 4.879059654844408, + "grad_norm": 8.359970092773438, + "learning_rate": 3.903723331974453e-06, + "loss": 2.8823, + "step": 71810 + }, + { + "epoch": 4.87939937491507, + "grad_norm": 8.059720039367676, + "learning_rate": 3.903298681886126e-06, + "loss": 2.8812, + "step": 71815 + }, + { + "epoch": 4.879739094985732, + "grad_norm": 9.037762641906738, + "learning_rate": 3.902874031797799e-06, + "loss": 2.9604, + "step": 71820 + }, + { + "epoch": 4.880078815056393, + "grad_norm": 5.757523059844971, + "learning_rate": 3.902449381709472e-06, + "loss": 2.895, + "step": 71825 + }, + { + "epoch": 4.880418535127055, + "grad_norm": 8.134393692016602, + "learning_rate": 3.902024731621145e-06, + "loss": 2.9368, + "step": 71830 + }, + { + "epoch": 4.880758255197717, + "grad_norm": 7.592741966247559, + "learning_rate": 3.901600081532817e-06, + "loss": 3.0983, + "step": 71835 + }, + { + "epoch": 4.8810979752683785, + "grad_norm": 7.35185432434082, + "learning_rate": 3.90117543144449e-06, + "loss": 2.9351, + "step": 71840 + }, + { + "epoch": 4.881437695339041, + "grad_norm": 8.78035831451416, + "learning_rate": 3.900750781356163e-06, + "loss": 2.9774, + "step": 71845 + }, + { + "epoch": 4.881777415409703, + "grad_norm": 9.083820343017578, + "learning_rate": 3.900326131267836e-06, + "loss": 2.8557, + "step": 71850 + }, + { + "epoch": 4.882117135480364, + "grad_norm": 7.048730850219727, + "learning_rate": 3.899901481179509e-06, + "loss": 2.7816, + "step": 71855 + }, + { + "epoch": 4.882456855551026, + "grad_norm": 7.018658638000488, + "learning_rate": 3.899476831091181e-06, + "loss": 3.0188, + "step": 71860 + }, + { + "epoch": 4.882796575621688, + "grad_norm": 7.018612861633301, + "learning_rate": 3.899052181002854e-06, + "loss": 3.2176, + "step": 71865 + }, + { + "epoch": 4.883136295692349, + "grad_norm": 7.797927379608154, + "learning_rate": 3.898627530914527e-06, + "loss": 2.9957, + "step": 71870 + }, + { + "epoch": 4.883476015763011, + "grad_norm": 9.119231224060059, + "learning_rate": 3.8982028808262e-06, + "loss": 2.8111, + "step": 71875 + }, + { + "epoch": 4.883815735833673, + "grad_norm": 6.754885673522949, + "learning_rate": 3.897778230737873e-06, + "loss": 3.0365, + "step": 71880 + }, + { + "epoch": 4.8841554559043345, + "grad_norm": 9.084126472473145, + "learning_rate": 3.897353580649545e-06, + "loss": 2.8411, + "step": 71885 + }, + { + "epoch": 4.884495175974997, + "grad_norm": 7.381624698638916, + "learning_rate": 3.896928930561218e-06, + "loss": 2.8467, + "step": 71890 + }, + { + "epoch": 4.884834896045659, + "grad_norm": 7.499363899230957, + "learning_rate": 3.896504280472891e-06, + "loss": 2.8616, + "step": 71895 + }, + { + "epoch": 4.88517461611632, + "grad_norm": 8.488059997558594, + "learning_rate": 3.896079630384563e-06, + "loss": 2.783, + "step": 71900 + }, + { + "epoch": 4.885514336186982, + "grad_norm": 6.215630531311035, + "learning_rate": 3.895654980296237e-06, + "loss": 2.8922, + "step": 71905 + }, + { + "epoch": 4.885854056257644, + "grad_norm": 8.366189002990723, + "learning_rate": 3.8952303302079094e-06, + "loss": 2.7745, + "step": 71910 + }, + { + "epoch": 4.886193776328305, + "grad_norm": 6.6882405281066895, + "learning_rate": 3.894805680119581e-06, + "loss": 2.8308, + "step": 71915 + }, + { + "epoch": 4.886533496398967, + "grad_norm": 6.7161054611206055, + "learning_rate": 3.894381030031254e-06, + "loss": 2.9548, + "step": 71920 + }, + { + "epoch": 4.886873216469629, + "grad_norm": 7.727422714233398, + "learning_rate": 3.893956379942928e-06, + "loss": 2.8306, + "step": 71925 + }, + { + "epoch": 4.8872129365402905, + "grad_norm": 7.787153720855713, + "learning_rate": 3.8935317298546e-06, + "loss": 2.84, + "step": 71930 + }, + { + "epoch": 4.887552656610953, + "grad_norm": 8.407255172729492, + "learning_rate": 3.893107079766273e-06, + "loss": 2.8511, + "step": 71935 + }, + { + "epoch": 4.887892376681615, + "grad_norm": 8.550366401672363, + "learning_rate": 3.892682429677946e-06, + "loss": 3.0996, + "step": 71940 + }, + { + "epoch": 4.888232096752276, + "grad_norm": 6.878753662109375, + "learning_rate": 3.892257779589618e-06, + "loss": 2.8362, + "step": 71945 + }, + { + "epoch": 4.888571816822938, + "grad_norm": 6.041813850402832, + "learning_rate": 3.891833129501291e-06, + "loss": 2.8516, + "step": 71950 + }, + { + "epoch": 4.8889115368936, + "grad_norm": 7.50216817855835, + "learning_rate": 3.891408479412964e-06, + "loss": 3.0047, + "step": 71955 + }, + { + "epoch": 4.889251256964261, + "grad_norm": 7.1754961013793945, + "learning_rate": 3.890983829324637e-06, + "loss": 2.677, + "step": 71960 + }, + { + "epoch": 4.889590977034923, + "grad_norm": 7.114614009857178, + "learning_rate": 3.890559179236309e-06, + "loss": 2.805, + "step": 71965 + }, + { + "epoch": 4.889930697105585, + "grad_norm": 7.0314555168151855, + "learning_rate": 3.890134529147982e-06, + "loss": 2.6802, + "step": 71970 + }, + { + "epoch": 4.8902704171762466, + "grad_norm": 6.982093334197998, + "learning_rate": 3.889709879059655e-06, + "loss": 2.9905, + "step": 71975 + }, + { + "epoch": 4.890610137246909, + "grad_norm": 8.961502075195312, + "learning_rate": 3.889285228971328e-06, + "loss": 2.884, + "step": 71980 + }, + { + "epoch": 4.890949857317571, + "grad_norm": 7.133643627166748, + "learning_rate": 3.888860578883001e-06, + "loss": 3.0416, + "step": 71985 + }, + { + "epoch": 4.891289577388232, + "grad_norm": 7.405947685241699, + "learning_rate": 3.888435928794673e-06, + "loss": 2.9027, + "step": 71990 + }, + { + "epoch": 4.891629297458894, + "grad_norm": 8.501916885375977, + "learning_rate": 3.888011278706346e-06, + "loss": 3.0744, + "step": 71995 + }, + { + "epoch": 4.891969017529556, + "grad_norm": 5.7019124031066895, + "learning_rate": 3.887586628618019e-06, + "loss": 2.9089, + "step": 72000 + }, + { + "epoch": 4.892308737600217, + "grad_norm": 7.364502429962158, + "learning_rate": 3.887161978529692e-06, + "loss": 2.8922, + "step": 72005 + }, + { + "epoch": 4.892648457670879, + "grad_norm": 9.72369384765625, + "learning_rate": 3.886737328441365e-06, + "loss": 2.6428, + "step": 72010 + }, + { + "epoch": 4.892988177741541, + "grad_norm": 7.174439430236816, + "learning_rate": 3.886312678353037e-06, + "loss": 2.9528, + "step": 72015 + }, + { + "epoch": 4.893327897812203, + "grad_norm": 9.189477920532227, + "learning_rate": 3.88588802826471e-06, + "loss": 3.002, + "step": 72020 + }, + { + "epoch": 4.893667617882865, + "grad_norm": 6.968160629272461, + "learning_rate": 3.885463378176383e-06, + "loss": 2.7735, + "step": 72025 + }, + { + "epoch": 4.894007337953527, + "grad_norm": 7.922974586486816, + "learning_rate": 3.885038728088056e-06, + "loss": 2.9621, + "step": 72030 + }, + { + "epoch": 4.894347058024188, + "grad_norm": 5.827046871185303, + "learning_rate": 3.884614077999729e-06, + "loss": 2.5204, + "step": 72035 + }, + { + "epoch": 4.89468677809485, + "grad_norm": 9.0361328125, + "learning_rate": 3.884189427911401e-06, + "loss": 3.1762, + "step": 72040 + }, + { + "epoch": 4.895026498165512, + "grad_norm": 9.899840354919434, + "learning_rate": 3.883764777823074e-06, + "loss": 2.9493, + "step": 72045 + }, + { + "epoch": 4.895366218236173, + "grad_norm": Infinity, + "learning_rate": 3.883425057752412e-06, + "loss": 2.7815, + "step": 72050 + }, + { + "epoch": 4.895705938306835, + "grad_norm": 8.365386009216309, + "learning_rate": 3.883000407664085e-06, + "loss": 2.9103, + "step": 72055 + }, + { + "epoch": 4.896045658377497, + "grad_norm": 8.346719741821289, + "learning_rate": 3.882575757575758e-06, + "loss": 2.8109, + "step": 72060 + }, + { + "epoch": 4.896385378448159, + "grad_norm": 8.167499542236328, + "learning_rate": 3.882151107487431e-06, + "loss": 3.0305, + "step": 72065 + }, + { + "epoch": 4.896725098518821, + "grad_norm": 8.995285034179688, + "learning_rate": 3.8817264573991035e-06, + "loss": 2.9606, + "step": 72070 + }, + { + "epoch": 4.897064818589483, + "grad_norm": 6.650740146636963, + "learning_rate": 3.881301807310776e-06, + "loss": 2.9026, + "step": 72075 + }, + { + "epoch": 4.897404538660144, + "grad_norm": 8.813192367553711, + "learning_rate": 3.880877157222449e-06, + "loss": 2.9864, + "step": 72080 + }, + { + "epoch": 4.897744258730806, + "grad_norm": 6.5872883796691895, + "learning_rate": 3.880452507134122e-06, + "loss": 3.0286, + "step": 72085 + }, + { + "epoch": 4.898083978801468, + "grad_norm": 8.211437225341797, + "learning_rate": 3.880027857045795e-06, + "loss": 2.9608, + "step": 72090 + }, + { + "epoch": 4.898423698872129, + "grad_norm": 10.168561935424805, + "learning_rate": 3.8796032069574675e-06, + "loss": 2.7352, + "step": 72095 + }, + { + "epoch": 4.898763418942791, + "grad_norm": 6.490999221801758, + "learning_rate": 3.87917855686914e-06, + "loss": 2.8399, + "step": 72100 + }, + { + "epoch": 4.899103139013453, + "grad_norm": 8.003836631774902, + "learning_rate": 3.878753906780813e-06, + "loss": 2.9968, + "step": 72105 + }, + { + "epoch": 4.899442859084115, + "grad_norm": 6.494423866271973, + "learning_rate": 3.878329256692486e-06, + "loss": 2.8701, + "step": 72110 + }, + { + "epoch": 4.899782579154777, + "grad_norm": 7.934749603271484, + "learning_rate": 3.877904606604159e-06, + "loss": 2.641, + "step": 72115 + }, + { + "epoch": 4.900122299225439, + "grad_norm": 7.311397552490234, + "learning_rate": 3.8774799565158315e-06, + "loss": 2.9034, + "step": 72120 + }, + { + "epoch": 4.9004620192961, + "grad_norm": 6.015068054199219, + "learning_rate": 3.877055306427504e-06, + "loss": 2.7569, + "step": 72125 + }, + { + "epoch": 4.900801739366762, + "grad_norm": 6.98524808883667, + "learning_rate": 3.876630656339177e-06, + "loss": 3.0389, + "step": 72130 + }, + { + "epoch": 4.901141459437423, + "grad_norm": 6.167906284332275, + "learning_rate": 3.876206006250849e-06, + "loss": 2.9132, + "step": 72135 + }, + { + "epoch": 4.901481179508085, + "grad_norm": 9.802852630615234, + "learning_rate": 3.875781356162523e-06, + "loss": 2.8678, + "step": 72140 + }, + { + "epoch": 4.901820899578747, + "grad_norm": 8.044967651367188, + "learning_rate": 3.8753567060741955e-06, + "loss": 2.8151, + "step": 72145 + }, + { + "epoch": 4.9021606196494085, + "grad_norm": 7.301809310913086, + "learning_rate": 3.8749320559858674e-06, + "loss": 2.9173, + "step": 72150 + }, + { + "epoch": 4.902500339720071, + "grad_norm": 7.337078094482422, + "learning_rate": 3.874507405897541e-06, + "loss": 2.7754, + "step": 72155 + }, + { + "epoch": 4.902840059790733, + "grad_norm": 6.8307414054870605, + "learning_rate": 3.874082755809214e-06, + "loss": 2.8735, + "step": 72160 + }, + { + "epoch": 4.903179779861394, + "grad_norm": 6.487274646759033, + "learning_rate": 3.873658105720886e-06, + "loss": 2.8673, + "step": 72165 + }, + { + "epoch": 4.903519499932056, + "grad_norm": 7.656099796295166, + "learning_rate": 3.873233455632559e-06, + "loss": 2.7381, + "step": 72170 + }, + { + "epoch": 4.903859220002718, + "grad_norm": 6.448001861572266, + "learning_rate": 3.872808805544232e-06, + "loss": 2.9687, + "step": 72175 + }, + { + "epoch": 4.904198940073379, + "grad_norm": 6.305579662322998, + "learning_rate": 3.872384155455904e-06, + "loss": 2.8704, + "step": 72180 + }, + { + "epoch": 4.904538660144041, + "grad_norm": 8.46509838104248, + "learning_rate": 3.871959505367577e-06, + "loss": 2.9457, + "step": 72185 + }, + { + "epoch": 4.904878380214703, + "grad_norm": 6.201821327209473, + "learning_rate": 3.871534855279251e-06, + "loss": 3.1462, + "step": 72190 + }, + { + "epoch": 4.9052181002853645, + "grad_norm": 9.072657585144043, + "learning_rate": 3.871110205190923e-06, + "loss": 2.8949, + "step": 72195 + }, + { + "epoch": 4.905557820356027, + "grad_norm": 8.32385540008545, + "learning_rate": 3.8706855551025955e-06, + "loss": 2.8106, + "step": 72200 + }, + { + "epoch": 4.905897540426689, + "grad_norm": 7.543735980987549, + "learning_rate": 3.870260905014268e-06, + "loss": 2.8439, + "step": 72205 + }, + { + "epoch": 4.90623726049735, + "grad_norm": 8.635437965393066, + "learning_rate": 3.869836254925942e-06, + "loss": 3.0877, + "step": 72210 + }, + { + "epoch": 4.906576980568012, + "grad_norm": 5.8149237632751465, + "learning_rate": 3.869411604837614e-06, + "loss": 2.844, + "step": 72215 + }, + { + "epoch": 4.906916700638674, + "grad_norm": 8.12366008758545, + "learning_rate": 3.868986954749287e-06, + "loss": 2.7198, + "step": 72220 + }, + { + "epoch": 4.907256420709335, + "grad_norm": 7.295947551727295, + "learning_rate": 3.86856230466096e-06, + "loss": 2.9086, + "step": 72225 + }, + { + "epoch": 4.907596140779997, + "grad_norm": 8.412210464477539, + "learning_rate": 3.868137654572632e-06, + "loss": 2.6142, + "step": 72230 + }, + { + "epoch": 4.907935860850659, + "grad_norm": 7.342120170593262, + "learning_rate": 3.867713004484305e-06, + "loss": 2.6694, + "step": 72235 + }, + { + "epoch": 4.9082755809213205, + "grad_norm": 7.1603875160217285, + "learning_rate": 3.867288354395978e-06, + "loss": 2.8985, + "step": 72240 + }, + { + "epoch": 4.908615300991983, + "grad_norm": 8.097023963928223, + "learning_rate": 3.866863704307651e-06, + "loss": 2.9192, + "step": 72245 + }, + { + "epoch": 4.908955021062645, + "grad_norm": 8.081822395324707, + "learning_rate": 3.8664390542193235e-06, + "loss": 2.7387, + "step": 72250 + }, + { + "epoch": 4.909294741133306, + "grad_norm": 9.540225982666016, + "learning_rate": 3.866014404130996e-06, + "loss": 2.7883, + "step": 72255 + }, + { + "epoch": 4.909634461203968, + "grad_norm": 9.247708320617676, + "learning_rate": 3.865589754042669e-06, + "loss": 3.1187, + "step": 72260 + }, + { + "epoch": 4.90997418127463, + "grad_norm": 7.213711738586426, + "learning_rate": 3.865165103954342e-06, + "loss": 2.99, + "step": 72265 + }, + { + "epoch": 4.910313901345291, + "grad_norm": 8.035661697387695, + "learning_rate": 3.864740453866015e-06, + "loss": 2.743, + "step": 72270 + }, + { + "epoch": 4.910653621415953, + "grad_norm": 11.150137901306152, + "learning_rate": 3.8643158037776875e-06, + "loss": 2.957, + "step": 72275 + }, + { + "epoch": 4.910993341486615, + "grad_norm": 6.7172532081604, + "learning_rate": 3.86389115368936e-06, + "loss": 2.7359, + "step": 72280 + }, + { + "epoch": 4.911333061557277, + "grad_norm": 8.4299898147583, + "learning_rate": 3.863466503601033e-06, + "loss": 2.8135, + "step": 72285 + }, + { + "epoch": 4.911672781627939, + "grad_norm": 6.159140586853027, + "learning_rate": 3.863041853512706e-06, + "loss": 2.5647, + "step": 72290 + }, + { + "epoch": 4.912012501698601, + "grad_norm": 6.609157085418701, + "learning_rate": 3.862617203424379e-06, + "loss": 2.7233, + "step": 72295 + }, + { + "epoch": 4.912352221769262, + "grad_norm": 5.321810245513916, + "learning_rate": 3.8621925533360515e-06, + "loss": 2.6797, + "step": 72300 + }, + { + "epoch": 4.912691941839924, + "grad_norm": 6.6352972984313965, + "learning_rate": 3.861767903247724e-06, + "loss": 2.9111, + "step": 72305 + }, + { + "epoch": 4.913031661910586, + "grad_norm": 8.11176872253418, + "learning_rate": 3.861343253159397e-06, + "loss": 2.7572, + "step": 72310 + }, + { + "epoch": 4.913371381981247, + "grad_norm": 8.975784301757812, + "learning_rate": 3.86091860307107e-06, + "loss": 2.7235, + "step": 72315 + }, + { + "epoch": 4.913711102051909, + "grad_norm": 6.45372200012207, + "learning_rate": 3.860493952982743e-06, + "loss": 2.8377, + "step": 72320 + }, + { + "epoch": 4.914050822122571, + "grad_norm": 7.390737533569336, + "learning_rate": 3.8600693028944155e-06, + "loss": 2.7904, + "step": 72325 + }, + { + "epoch": 4.914390542193233, + "grad_norm": 7.7222185134887695, + "learning_rate": 3.859644652806088e-06, + "loss": 2.7826, + "step": 72330 + }, + { + "epoch": 4.914730262263895, + "grad_norm": 7.8337178230285645, + "learning_rate": 3.859220002717761e-06, + "loss": 2.8368, + "step": 72335 + }, + { + "epoch": 4.915069982334556, + "grad_norm": 6.9013872146606445, + "learning_rate": 3.858795352629434e-06, + "loss": 3.0074, + "step": 72340 + }, + { + "epoch": 4.915409702405218, + "grad_norm": 8.337400436401367, + "learning_rate": 3.858370702541107e-06, + "loss": 2.8597, + "step": 72345 + }, + { + "epoch": 4.91574942247588, + "grad_norm": 10.164567947387695, + "learning_rate": 3.857946052452779e-06, + "loss": 2.7252, + "step": 72350 + }, + { + "epoch": 4.916089142546541, + "grad_norm": 6.337740421295166, + "learning_rate": 3.857521402364452e-06, + "loss": 2.8244, + "step": 72355 + }, + { + "epoch": 4.916428862617203, + "grad_norm": 6.9610395431518555, + "learning_rate": 3.857096752276125e-06, + "loss": 2.7862, + "step": 72360 + }, + { + "epoch": 4.916768582687865, + "grad_norm": 8.637347221374512, + "learning_rate": 3.856672102187797e-06, + "loss": 2.8328, + "step": 72365 + }, + { + "epoch": 4.9171083027585265, + "grad_norm": 7.5064802169799805, + "learning_rate": 3.856247452099471e-06, + "loss": 2.7576, + "step": 72370 + }, + { + "epoch": 4.917448022829189, + "grad_norm": 7.290255069732666, + "learning_rate": 3.8558228020111435e-06, + "loss": 3.0145, + "step": 72375 + }, + { + "epoch": 4.917787742899851, + "grad_norm": 8.437417030334473, + "learning_rate": 3.855398151922816e-06, + "loss": 2.7463, + "step": 72380 + }, + { + "epoch": 4.918127462970512, + "grad_norm": 6.368012428283691, + "learning_rate": 3.854973501834488e-06, + "loss": 2.6926, + "step": 72385 + }, + { + "epoch": 4.918467183041174, + "grad_norm": 8.031230926513672, + "learning_rate": 3.854548851746162e-06, + "loss": 2.8667, + "step": 72390 + }, + { + "epoch": 4.918806903111836, + "grad_norm": 9.581392288208008, + "learning_rate": 3.854124201657835e-06, + "loss": 2.9073, + "step": 72395 + }, + { + "epoch": 4.919146623182497, + "grad_norm": 6.064151763916016, + "learning_rate": 3.853699551569507e-06, + "loss": 3.0109, + "step": 72400 + }, + { + "epoch": 4.919486343253159, + "grad_norm": 7.781054496765137, + "learning_rate": 3.85327490148118e-06, + "loss": 2.7381, + "step": 72405 + }, + { + "epoch": 4.919826063323821, + "grad_norm": 10.428775787353516, + "learning_rate": 3.852850251392853e-06, + "loss": 2.9811, + "step": 72410 + }, + { + "epoch": 4.9201657833944825, + "grad_norm": 10.440874099731445, + "learning_rate": 3.852425601304525e-06, + "loss": 2.8366, + "step": 72415 + }, + { + "epoch": 4.920505503465145, + "grad_norm": 6.054600715637207, + "learning_rate": 3.852000951216198e-06, + "loss": 2.752, + "step": 72420 + }, + { + "epoch": 4.920845223535807, + "grad_norm": 8.644317626953125, + "learning_rate": 3.8515763011278715e-06, + "loss": 2.7447, + "step": 72425 + }, + { + "epoch": 4.921184943606468, + "grad_norm": 8.232606887817383, + "learning_rate": 3.8511516510395434e-06, + "loss": 2.7115, + "step": 72430 + }, + { + "epoch": 4.92152466367713, + "grad_norm": 6.161240577697754, + "learning_rate": 3.850727000951216e-06, + "loss": 2.664, + "step": 72435 + }, + { + "epoch": 4.921864383747792, + "grad_norm": 6.619409084320068, + "learning_rate": 3.85030235086289e-06, + "loss": 2.8572, + "step": 72440 + }, + { + "epoch": 4.922204103818453, + "grad_norm": 8.618521690368652, + "learning_rate": 3.849877700774562e-06, + "loss": 2.9451, + "step": 72445 + }, + { + "epoch": 4.922543823889115, + "grad_norm": 10.149602890014648, + "learning_rate": 3.849453050686235e-06, + "loss": 2.7543, + "step": 72450 + }, + { + "epoch": 4.922883543959777, + "grad_norm": 7.765580177307129, + "learning_rate": 3.8490284005979074e-06, + "loss": 2.8672, + "step": 72455 + }, + { + "epoch": 4.9232232640304385, + "grad_norm": 5.556722164154053, + "learning_rate": 3.84860375050958e-06, + "loss": 2.792, + "step": 72460 + }, + { + "epoch": 4.923562984101101, + "grad_norm": 7.588559627532959, + "learning_rate": 3.848179100421253e-06, + "loss": 2.8786, + "step": 72465 + }, + { + "epoch": 4.923902704171763, + "grad_norm": 6.208338737487793, + "learning_rate": 3.847754450332926e-06, + "loss": 3.0033, + "step": 72470 + }, + { + "epoch": 4.924242424242424, + "grad_norm": 7.376086235046387, + "learning_rate": 3.847329800244599e-06, + "loss": 2.8837, + "step": 72475 + }, + { + "epoch": 4.924582144313086, + "grad_norm": 6.1284918785095215, + "learning_rate": 3.8469051501562714e-06, + "loss": 2.749, + "step": 72480 + }, + { + "epoch": 4.924921864383748, + "grad_norm": 8.611567497253418, + "learning_rate": 3.846480500067944e-06, + "loss": 2.7322, + "step": 72485 + }, + { + "epoch": 4.925261584454409, + "grad_norm": 7.5867018699646, + "learning_rate": 3.846055849979617e-06, + "loss": 2.812, + "step": 72490 + }, + { + "epoch": 4.925601304525071, + "grad_norm": 10.11610221862793, + "learning_rate": 3.84563119989129e-06, + "loss": 2.6816, + "step": 72495 + }, + { + "epoch": 4.925941024595733, + "grad_norm": 7.466106414794922, + "learning_rate": 3.845206549802963e-06, + "loss": 2.8274, + "step": 72500 + }, + { + "epoch": 4.9262807446663945, + "grad_norm": 10.348697662353516, + "learning_rate": 3.8447818997146355e-06, + "loss": 2.7905, + "step": 72505 + }, + { + "epoch": 4.926620464737057, + "grad_norm": 9.485030174255371, + "learning_rate": 3.844357249626308e-06, + "loss": 3.1289, + "step": 72510 + }, + { + "epoch": 4.926960184807719, + "grad_norm": 8.052311897277832, + "learning_rate": 3.843932599537981e-06, + "loss": 3.2283, + "step": 72515 + }, + { + "epoch": 4.92729990487838, + "grad_norm": 7.367252349853516, + "learning_rate": 3.843507949449654e-06, + "loss": 3.0218, + "step": 72520 + }, + { + "epoch": 4.927639624949042, + "grad_norm": 7.692474365234375, + "learning_rate": 3.843083299361327e-06, + "loss": 2.86, + "step": 72525 + }, + { + "epoch": 4.927979345019704, + "grad_norm": 7.538905143737793, + "learning_rate": 3.8426586492729995e-06, + "loss": 2.7297, + "step": 72530 + }, + { + "epoch": 4.928319065090365, + "grad_norm": 5.610098361968994, + "learning_rate": 3.842233999184672e-06, + "loss": 2.9229, + "step": 72535 + }, + { + "epoch": 4.928658785161027, + "grad_norm": 6.724775791168213, + "learning_rate": 3.841809349096345e-06, + "loss": 3.0437, + "step": 72540 + }, + { + "epoch": 4.928998505231689, + "grad_norm": 7.350745677947998, + "learning_rate": 3.841384699008018e-06, + "loss": 2.7598, + "step": 72545 + }, + { + "epoch": 4.9293382253023506, + "grad_norm": 8.105917930603027, + "learning_rate": 3.840960048919691e-06, + "loss": 2.8635, + "step": 72550 + }, + { + "epoch": 4.929677945373013, + "grad_norm": 5.672839164733887, + "learning_rate": 3.8405353988313635e-06, + "loss": 2.8854, + "step": 72555 + }, + { + "epoch": 4.930017665443675, + "grad_norm": 7.445183753967285, + "learning_rate": 3.840110748743036e-06, + "loss": 2.8827, + "step": 72560 + }, + { + "epoch": 4.930357385514336, + "grad_norm": 7.702518463134766, + "learning_rate": 3.839686098654709e-06, + "loss": 2.891, + "step": 72565 + }, + { + "epoch": 4.930697105584998, + "grad_norm": 7.696429252624512, + "learning_rate": 3.839261448566382e-06, + "loss": 2.9133, + "step": 72570 + }, + { + "epoch": 4.93103682565566, + "grad_norm": 9.695487976074219, + "learning_rate": 3.838836798478055e-06, + "loss": 3.0282, + "step": 72575 + }, + { + "epoch": 4.931376545726321, + "grad_norm": 7.984599590301514, + "learning_rate": 3.8384121483897275e-06, + "loss": 2.9119, + "step": 72580 + }, + { + "epoch": 4.931716265796983, + "grad_norm": 7.580045700073242, + "learning_rate": 3.837987498301399e-06, + "loss": 2.8616, + "step": 72585 + }, + { + "epoch": 4.932055985867645, + "grad_norm": 8.326951026916504, + "learning_rate": 3.837562848213073e-06, + "loss": 2.8652, + "step": 72590 + }, + { + "epoch": 4.932395705938307, + "grad_norm": 6.466984272003174, + "learning_rate": 3.837138198124746e-06, + "loss": 2.8545, + "step": 72595 + }, + { + "epoch": 4.932735426008969, + "grad_norm": 8.383600234985352, + "learning_rate": 3.836713548036418e-06, + "loss": 2.8215, + "step": 72600 + }, + { + "epoch": 4.933075146079631, + "grad_norm": 7.362869739532471, + "learning_rate": 3.8362888979480915e-06, + "loss": 3.2241, + "step": 72605 + }, + { + "epoch": 4.933414866150292, + "grad_norm": 7.456689357757568, + "learning_rate": 3.835864247859764e-06, + "loss": 3.118, + "step": 72610 + }, + { + "epoch": 4.933754586220954, + "grad_norm": 7.946510314941406, + "learning_rate": 3.835439597771436e-06, + "loss": 2.8678, + "step": 72615 + }, + { + "epoch": 4.934094306291616, + "grad_norm": 6.511791229248047, + "learning_rate": 3.835014947683109e-06, + "loss": 3.12, + "step": 72620 + }, + { + "epoch": 4.934434026362277, + "grad_norm": 6.21824836730957, + "learning_rate": 3.834590297594783e-06, + "loss": 2.551, + "step": 72625 + }, + { + "epoch": 4.934773746432939, + "grad_norm": 6.620795249938965, + "learning_rate": 3.834165647506455e-06, + "loss": 2.9751, + "step": 72630 + }, + { + "epoch": 4.935113466503601, + "grad_norm": 5.657891750335693, + "learning_rate": 3.8337409974181274e-06, + "loss": 2.4756, + "step": 72635 + }, + { + "epoch": 4.935453186574263, + "grad_norm": 7.06327486038208, + "learning_rate": 3.833316347329801e-06, + "loss": 2.8768, + "step": 72640 + }, + { + "epoch": 4.935792906644925, + "grad_norm": 7.716787338256836, + "learning_rate": 3.832891697241473e-06, + "loss": 2.939, + "step": 72645 + }, + { + "epoch": 4.936132626715587, + "grad_norm": 8.094112396240234, + "learning_rate": 3.832467047153146e-06, + "loss": 2.9238, + "step": 72650 + }, + { + "epoch": 4.936472346786248, + "grad_norm": 7.049835681915283, + "learning_rate": 3.8320423970648195e-06, + "loss": 2.6931, + "step": 72655 + }, + { + "epoch": 4.93681206685691, + "grad_norm": 7.50380802154541, + "learning_rate": 3.8316177469764914e-06, + "loss": 3.125, + "step": 72660 + }, + { + "epoch": 4.937151786927572, + "grad_norm": 7.781037330627441, + "learning_rate": 3.831193096888164e-06, + "loss": 2.7953, + "step": 72665 + }, + { + "epoch": 4.937491506998233, + "grad_norm": 5.77232551574707, + "learning_rate": 3.830768446799837e-06, + "loss": 2.8077, + "step": 72670 + }, + { + "epoch": 4.937831227068895, + "grad_norm": 6.812666416168213, + "learning_rate": 3.83034379671151e-06, + "loss": 2.751, + "step": 72675 + }, + { + "epoch": 4.938170947139557, + "grad_norm": 6.065067768096924, + "learning_rate": 3.829919146623183e-06, + "loss": 2.802, + "step": 72680 + }, + { + "epoch": 4.938510667210219, + "grad_norm": 6.299059867858887, + "learning_rate": 3.8294944965348554e-06, + "loss": 2.6622, + "step": 72685 + }, + { + "epoch": 4.938850387280881, + "grad_norm": 6.713107585906982, + "learning_rate": 3.829069846446528e-06, + "loss": 2.9571, + "step": 72690 + }, + { + "epoch": 4.939190107351543, + "grad_norm": 9.00195026397705, + "learning_rate": 3.828645196358201e-06, + "loss": 2.6707, + "step": 72695 + }, + { + "epoch": 4.939529827422204, + "grad_norm": 8.082316398620605, + "learning_rate": 3.828220546269874e-06, + "loss": 2.9687, + "step": 72700 + }, + { + "epoch": 4.939869547492866, + "grad_norm": 7.445838451385498, + "learning_rate": 3.827795896181547e-06, + "loss": 2.8579, + "step": 72705 + }, + { + "epoch": 4.940209267563528, + "grad_norm": 6.97344970703125, + "learning_rate": 3.8273712460932194e-06, + "loss": 2.6887, + "step": 72710 + }, + { + "epoch": 4.940548987634189, + "grad_norm": 6.022574424743652, + "learning_rate": 3.826946596004892e-06, + "loss": 2.8551, + "step": 72715 + }, + { + "epoch": 4.940888707704851, + "grad_norm": 8.291943550109863, + "learning_rate": 3.826521945916565e-06, + "loss": 2.9851, + "step": 72720 + }, + { + "epoch": 4.941228427775513, + "grad_norm": 6.390888690948486, + "learning_rate": 3.826097295828238e-06, + "loss": 2.8483, + "step": 72725 + }, + { + "epoch": 4.941568147846175, + "grad_norm": 8.284476280212402, + "learning_rate": 3.825672645739911e-06, + "loss": 2.6991, + "step": 72730 + }, + { + "epoch": 4.941907867916837, + "grad_norm": 8.087716102600098, + "learning_rate": 3.8252479956515834e-06, + "loss": 2.8278, + "step": 72735 + }, + { + "epoch": 4.942247587987499, + "grad_norm": 8.240164756774902, + "learning_rate": 3.824823345563256e-06, + "loss": 2.9631, + "step": 72740 + }, + { + "epoch": 4.94258730805816, + "grad_norm": 6.439369201660156, + "learning_rate": 3.824398695474929e-06, + "loss": 2.7841, + "step": 72745 + }, + { + "epoch": 4.942927028128822, + "grad_norm": 6.248239994049072, + "learning_rate": 3.823974045386602e-06, + "loss": 2.846, + "step": 72750 + }, + { + "epoch": 4.943266748199484, + "grad_norm": 8.815537452697754, + "learning_rate": 3.823549395298275e-06, + "loss": 2.9574, + "step": 72755 + }, + { + "epoch": 4.943606468270145, + "grad_norm": 8.279749870300293, + "learning_rate": 3.8231247452099474e-06, + "loss": 2.9372, + "step": 72760 + }, + { + "epoch": 4.943946188340807, + "grad_norm": 7.275330543518066, + "learning_rate": 3.82270009512162e-06, + "loss": 3.0741, + "step": 72765 + }, + { + "epoch": 4.944285908411469, + "grad_norm": 7.526088714599609, + "learning_rate": 3.822275445033293e-06, + "loss": 2.8071, + "step": 72770 + }, + { + "epoch": 4.944625628482131, + "grad_norm": 8.016992568969727, + "learning_rate": 3.821850794944966e-06, + "loss": 2.9319, + "step": 72775 + }, + { + "epoch": 4.944965348552793, + "grad_norm": 7.993246078491211, + "learning_rate": 3.821426144856639e-06, + "loss": 3.0927, + "step": 72780 + }, + { + "epoch": 4.945305068623455, + "grad_norm": 8.096721649169922, + "learning_rate": 3.8210014947683114e-06, + "loss": 2.791, + "step": 72785 + }, + { + "epoch": 4.945644788694116, + "grad_norm": 9.697352409362793, + "learning_rate": 3.820576844679984e-06, + "loss": 3.0418, + "step": 72790 + }, + { + "epoch": 4.945984508764778, + "grad_norm": 6.260574817657471, + "learning_rate": 3.820152194591657e-06, + "loss": 2.8484, + "step": 72795 + }, + { + "epoch": 4.94632422883544, + "grad_norm": 7.721189975738525, + "learning_rate": 3.819727544503329e-06, + "loss": 3.13, + "step": 72800 + }, + { + "epoch": 4.946663948906101, + "grad_norm": 6.248744487762451, + "learning_rate": 3.819302894415003e-06, + "loss": 2.6684, + "step": 72805 + }, + { + "epoch": 4.947003668976763, + "grad_norm": 8.275923728942871, + "learning_rate": 3.8188782443266755e-06, + "loss": 2.7396, + "step": 72810 + }, + { + "epoch": 4.9473433890474245, + "grad_norm": 6.812037944793701, + "learning_rate": 3.818453594238347e-06, + "loss": 3.0322, + "step": 72815 + }, + { + "epoch": 4.947683109118087, + "grad_norm": 6.818813323974609, + "learning_rate": 3.818028944150021e-06, + "loss": 2.629, + "step": 72820 + }, + { + "epoch": 4.948022829188749, + "grad_norm": 7.854610443115234, + "learning_rate": 3.817604294061694e-06, + "loss": 2.7654, + "step": 72825 + }, + { + "epoch": 4.94836254925941, + "grad_norm": 8.52818489074707, + "learning_rate": 3.817179643973366e-06, + "loss": 2.8146, + "step": 72830 + }, + { + "epoch": 4.948702269330072, + "grad_norm": 7.583927631378174, + "learning_rate": 3.816754993885039e-06, + "loss": 2.9742, + "step": 72835 + }, + { + "epoch": 4.949041989400734, + "grad_norm": 6.482582092285156, + "learning_rate": 3.816330343796712e-06, + "loss": 2.9226, + "step": 72840 + }, + { + "epoch": 4.949381709471395, + "grad_norm": 8.101174354553223, + "learning_rate": 3.815905693708384e-06, + "loss": 2.8718, + "step": 72845 + }, + { + "epoch": 4.949721429542057, + "grad_norm": 7.498144149780273, + "learning_rate": 3.815481043620057e-06, + "loss": 2.9943, + "step": 72850 + }, + { + "epoch": 4.950061149612719, + "grad_norm": 7.146426677703857, + "learning_rate": 3.815056393531731e-06, + "loss": 2.5397, + "step": 72855 + }, + { + "epoch": 4.9504008696833806, + "grad_norm": 7.778851509094238, + "learning_rate": 3.8146317434434026e-06, + "loss": 3.0172, + "step": 72860 + }, + { + "epoch": 4.950740589754043, + "grad_norm": 7.900962829589844, + "learning_rate": 3.814207093355076e-06, + "loss": 3.0039, + "step": 72865 + }, + { + "epoch": 4.951080309824705, + "grad_norm": 6.72878885269165, + "learning_rate": 3.8137824432667486e-06, + "loss": 2.9815, + "step": 72870 + }, + { + "epoch": 4.951420029895366, + "grad_norm": 8.30671501159668, + "learning_rate": 3.813357793178421e-06, + "loss": 2.9637, + "step": 72875 + }, + { + "epoch": 4.951759749966028, + "grad_norm": 6.482297897338867, + "learning_rate": 3.812933143090094e-06, + "loss": 2.8263, + "step": 72880 + }, + { + "epoch": 4.95209947003669, + "grad_norm": 7.178943157196045, + "learning_rate": 3.812508493001767e-06, + "loss": 2.8284, + "step": 72885 + }, + { + "epoch": 4.952439190107351, + "grad_norm": 6.723806858062744, + "learning_rate": 3.81208384291344e-06, + "loss": 2.7191, + "step": 72890 + }, + { + "epoch": 4.952778910178013, + "grad_norm": 7.023056983947754, + "learning_rate": 3.8116591928251122e-06, + "loss": 2.9851, + "step": 72895 + }, + { + "epoch": 4.953118630248675, + "grad_norm": 8.016886711120605, + "learning_rate": 3.8112345427367854e-06, + "loss": 2.5418, + "step": 72900 + }, + { + "epoch": 4.953458350319337, + "grad_norm": 7.607239246368408, + "learning_rate": 3.8108098926484582e-06, + "loss": 2.5693, + "step": 72905 + }, + { + "epoch": 4.953798070389999, + "grad_norm": 5.990352153778076, + "learning_rate": 3.8103852425601306e-06, + "loss": 2.6679, + "step": 72910 + }, + { + "epoch": 4.954137790460661, + "grad_norm": 6.209375858306885, + "learning_rate": 3.8099605924718034e-06, + "loss": 3.0701, + "step": 72915 + }, + { + "epoch": 4.954477510531322, + "grad_norm": 9.282676696777344, + "learning_rate": 3.8095359423834766e-06, + "loss": 2.9194, + "step": 72920 + }, + { + "epoch": 4.954817230601984, + "grad_norm": 8.741230010986328, + "learning_rate": 3.809111292295149e-06, + "loss": 3.0822, + "step": 72925 + }, + { + "epoch": 4.955156950672646, + "grad_norm": 6.306914329528809, + "learning_rate": 3.808686642206822e-06, + "loss": 2.9172, + "step": 72930 + }, + { + "epoch": 4.955496670743307, + "grad_norm": 8.779870986938477, + "learning_rate": 3.808261992118495e-06, + "loss": 2.8296, + "step": 72935 + }, + { + "epoch": 4.955836390813969, + "grad_norm": 7.496679782867432, + "learning_rate": 3.8078373420301674e-06, + "loss": 3.0498, + "step": 72940 + }, + { + "epoch": 4.956176110884631, + "grad_norm": 8.552268028259277, + "learning_rate": 3.8074126919418402e-06, + "loss": 2.7294, + "step": 72945 + }, + { + "epoch": 4.956515830955293, + "grad_norm": 8.628500938415527, + "learning_rate": 3.806988041853513e-06, + "loss": 2.92, + "step": 72950 + }, + { + "epoch": 4.956855551025955, + "grad_norm": 7.280274868011475, + "learning_rate": 3.806563391765186e-06, + "loss": 2.6493, + "step": 72955 + }, + { + "epoch": 4.957195271096617, + "grad_norm": 6.423334121704102, + "learning_rate": 3.8061387416768586e-06, + "loss": 2.9397, + "step": 72960 + }, + { + "epoch": 4.957534991167278, + "grad_norm": 9.71061897277832, + "learning_rate": 3.8057140915885314e-06, + "loss": 2.9949, + "step": 72965 + }, + { + "epoch": 4.95787471123794, + "grad_norm": 7.988460063934326, + "learning_rate": 3.805289441500204e-06, + "loss": 3.0509, + "step": 72970 + }, + { + "epoch": 4.958214431308602, + "grad_norm": 6.403669357299805, + "learning_rate": 3.804864791411877e-06, + "loss": 3.0341, + "step": 72975 + }, + { + "epoch": 4.958554151379263, + "grad_norm": 7.2818474769592285, + "learning_rate": 3.80444014132355e-06, + "loss": 2.8921, + "step": 72980 + }, + { + "epoch": 4.958893871449925, + "grad_norm": 8.727547645568848, + "learning_rate": 3.804015491235222e-06, + "loss": 2.9452, + "step": 72985 + }, + { + "epoch": 4.959233591520587, + "grad_norm": 8.540467262268066, + "learning_rate": 3.8035908411468954e-06, + "loss": 2.7321, + "step": 72990 + }, + { + "epoch": 4.959573311591249, + "grad_norm": 7.487075328826904, + "learning_rate": 3.8031661910585682e-06, + "loss": 2.7803, + "step": 72995 + }, + { + "epoch": 4.959913031661911, + "grad_norm": 9.03288459777832, + "learning_rate": 3.8027415409702406e-06, + "loss": 2.9393, + "step": 73000 + }, + { + "epoch": 4.960252751732573, + "grad_norm": 6.295324325561523, + "learning_rate": 3.8023168908819134e-06, + "loss": 2.8584, + "step": 73005 + }, + { + "epoch": 4.960592471803234, + "grad_norm": 6.594711780548096, + "learning_rate": 3.8018922407935866e-06, + "loss": 3.0074, + "step": 73010 + }, + { + "epoch": 4.960932191873896, + "grad_norm": 6.5551676750183105, + "learning_rate": 3.801467590705259e-06, + "loss": 2.7895, + "step": 73015 + }, + { + "epoch": 4.961271911944557, + "grad_norm": 7.390100479125977, + "learning_rate": 3.801042940616932e-06, + "loss": 3.0948, + "step": 73020 + }, + { + "epoch": 4.961611632015219, + "grad_norm": 6.934728622436523, + "learning_rate": 3.800618290528605e-06, + "loss": 2.7116, + "step": 73025 + }, + { + "epoch": 4.961951352085881, + "grad_norm": 7.145624160766602, + "learning_rate": 3.8001936404402774e-06, + "loss": 2.8567, + "step": 73030 + }, + { + "epoch": 4.9622910721565425, + "grad_norm": 9.033565521240234, + "learning_rate": 3.7997689903519502e-06, + "loss": 3.1549, + "step": 73035 + }, + { + "epoch": 4.962630792227205, + "grad_norm": 6.501895427703857, + "learning_rate": 3.799344340263623e-06, + "loss": 2.8986, + "step": 73040 + }, + { + "epoch": 4.962970512297867, + "grad_norm": 7.292293548583984, + "learning_rate": 3.7989196901752954e-06, + "loss": 2.9405, + "step": 73045 + }, + { + "epoch": 4.963310232368528, + "grad_norm": 8.522661209106445, + "learning_rate": 3.7984950400869686e-06, + "loss": 2.9119, + "step": 73050 + }, + { + "epoch": 4.96364995243919, + "grad_norm": 6.552318572998047, + "learning_rate": 3.7980703899986414e-06, + "loss": 2.6919, + "step": 73055 + }, + { + "epoch": 4.963989672509852, + "grad_norm": 8.568987846374512, + "learning_rate": 3.7976457399103146e-06, + "loss": 2.8991, + "step": 73060 + }, + { + "epoch": 4.964329392580513, + "grad_norm": 7.228154182434082, + "learning_rate": 3.797221089821987e-06, + "loss": 2.6613, + "step": 73065 + }, + { + "epoch": 4.964669112651175, + "grad_norm": 8.334687232971191, + "learning_rate": 3.79679643973366e-06, + "loss": 3.0318, + "step": 73070 + }, + { + "epoch": 4.965008832721837, + "grad_norm": 7.289567947387695, + "learning_rate": 3.7963717896453326e-06, + "loss": 2.8013, + "step": 73075 + }, + { + "epoch": 4.9653485527924985, + "grad_norm": 6.315866947174072, + "learning_rate": 3.7959471395570054e-06, + "loss": 2.8338, + "step": 73080 + }, + { + "epoch": 4.965688272863161, + "grad_norm": 6.13030481338501, + "learning_rate": 3.7955224894686782e-06, + "loss": 2.9631, + "step": 73085 + }, + { + "epoch": 4.966027992933823, + "grad_norm": 6.166565895080566, + "learning_rate": 3.795097839380351e-06, + "loss": 2.8044, + "step": 73090 + }, + { + "epoch": 4.966367713004484, + "grad_norm": 8.100127220153809, + "learning_rate": 3.7946731892920234e-06, + "loss": 2.8072, + "step": 73095 + }, + { + "epoch": 4.966707433075146, + "grad_norm": 8.074478149414062, + "learning_rate": 3.7942485392036966e-06, + "loss": 2.6826, + "step": 73100 + }, + { + "epoch": 4.967047153145808, + "grad_norm": 5.342319965362549, + "learning_rate": 3.7938238891153694e-06, + "loss": 3.0172, + "step": 73105 + }, + { + "epoch": 4.967386873216469, + "grad_norm": 7.718866348266602, + "learning_rate": 3.793399239027042e-06, + "loss": 2.8252, + "step": 73110 + }, + { + "epoch": 4.967726593287131, + "grad_norm": 6.630699634552002, + "learning_rate": 3.792974588938715e-06, + "loss": 2.7396, + "step": 73115 + }, + { + "epoch": 4.968066313357793, + "grad_norm": 6.718502044677734, + "learning_rate": 3.792549938850388e-06, + "loss": 3.0142, + "step": 73120 + }, + { + "epoch": 4.9684060334284545, + "grad_norm": 8.470141410827637, + "learning_rate": 3.79212528876206e-06, + "loss": 3.052, + "step": 73125 + }, + { + "epoch": 4.968745753499117, + "grad_norm": 6.383848667144775, + "learning_rate": 3.791700638673733e-06, + "loss": 2.7462, + "step": 73130 + }, + { + "epoch": 4.969085473569779, + "grad_norm": 6.5624470710754395, + "learning_rate": 3.7912759885854062e-06, + "loss": 2.7251, + "step": 73135 + }, + { + "epoch": 4.96942519364044, + "grad_norm": 7.072451591491699, + "learning_rate": 3.7908513384970786e-06, + "loss": 2.9809, + "step": 73140 + }, + { + "epoch": 4.969764913711102, + "grad_norm": 6.8649163246154785, + "learning_rate": 3.7904266884087514e-06, + "loss": 2.6138, + "step": 73145 + }, + { + "epoch": 4.970104633781764, + "grad_norm": 7.439380645751953, + "learning_rate": 3.7900020383204246e-06, + "loss": 3.2174, + "step": 73150 + }, + { + "epoch": 4.970444353852425, + "grad_norm": 6.764210224151611, + "learning_rate": 3.789577388232097e-06, + "loss": 2.9338, + "step": 73155 + }, + { + "epoch": 4.970784073923087, + "grad_norm": 5.629883766174316, + "learning_rate": 3.78915273814377e-06, + "loss": 2.9794, + "step": 73160 + }, + { + "epoch": 4.971123793993749, + "grad_norm": 7.991227626800537, + "learning_rate": 3.7887280880554426e-06, + "loss": 3.0571, + "step": 73165 + }, + { + "epoch": 4.971463514064411, + "grad_norm": 6.5648393630981445, + "learning_rate": 3.788303437967115e-06, + "loss": 2.9327, + "step": 73170 + }, + { + "epoch": 4.971803234135073, + "grad_norm": 7.670884609222412, + "learning_rate": 3.7878787878787882e-06, + "loss": 2.994, + "step": 73175 + }, + { + "epoch": 4.972142954205735, + "grad_norm": 7.020423412322998, + "learning_rate": 3.787454137790461e-06, + "loss": 2.8419, + "step": 73180 + }, + { + "epoch": 4.972482674276396, + "grad_norm": 6.6049370765686035, + "learning_rate": 3.7870294877021334e-06, + "loss": 2.7842, + "step": 73185 + }, + { + "epoch": 4.972822394347058, + "grad_norm": 6.997190952301025, + "learning_rate": 3.7866048376138066e-06, + "loss": 2.8944, + "step": 73190 + }, + { + "epoch": 4.97316211441772, + "grad_norm": 7.304419994354248, + "learning_rate": 3.7861801875254794e-06, + "loss": 2.9533, + "step": 73195 + }, + { + "epoch": 4.973501834488381, + "grad_norm": 7.750965595245361, + "learning_rate": 3.785755537437152e-06, + "loss": 2.8506, + "step": 73200 + }, + { + "epoch": 4.973841554559043, + "grad_norm": 7.604078769683838, + "learning_rate": 3.785330887348825e-06, + "loss": 3.0391, + "step": 73205 + }, + { + "epoch": 4.974181274629705, + "grad_norm": 7.262415885925293, + "learning_rate": 3.784906237260498e-06, + "loss": 2.944, + "step": 73210 + }, + { + "epoch": 4.974520994700367, + "grad_norm": 6.462901592254639, + "learning_rate": 3.78448158717217e-06, + "loss": 2.7569, + "step": 73215 + }, + { + "epoch": 4.974860714771029, + "grad_norm": 7.022021770477295, + "learning_rate": 3.784056937083843e-06, + "loss": 2.8869, + "step": 73220 + }, + { + "epoch": 4.975200434841691, + "grad_norm": 7.652563571929932, + "learning_rate": 3.7836322869955162e-06, + "loss": 2.7935, + "step": 73225 + }, + { + "epoch": 4.975540154912352, + "grad_norm": 7.524420261383057, + "learning_rate": 3.783207636907189e-06, + "loss": 2.5908, + "step": 73230 + }, + { + "epoch": 4.975879874983014, + "grad_norm": 7.413530349731445, + "learning_rate": 3.7827829868188614e-06, + "loss": 2.7572, + "step": 73235 + }, + { + "epoch": 4.976219595053676, + "grad_norm": 7.39080810546875, + "learning_rate": 3.7823583367305346e-06, + "loss": 2.6358, + "step": 73240 + }, + { + "epoch": 4.976559315124337, + "grad_norm": 8.700078964233398, + "learning_rate": 3.7819336866422074e-06, + "loss": 2.8232, + "step": 73245 + }, + { + "epoch": 4.976899035194999, + "grad_norm": 6.5293426513671875, + "learning_rate": 3.78150903655388e-06, + "loss": 3.1066, + "step": 73250 + }, + { + "epoch": 4.977238755265661, + "grad_norm": 5.293467044830322, + "learning_rate": 3.7810843864655526e-06, + "loss": 2.9137, + "step": 73255 + }, + { + "epoch": 4.977578475336323, + "grad_norm": 6.179623603820801, + "learning_rate": 3.780659736377226e-06, + "loss": 2.8719, + "step": 73260 + }, + { + "epoch": 4.977918195406985, + "grad_norm": 6.080752849578857, + "learning_rate": 3.780235086288898e-06, + "loss": 3.0348, + "step": 73265 + }, + { + "epoch": 4.978257915477647, + "grad_norm": 8.455582618713379, + "learning_rate": 3.779810436200571e-06, + "loss": 2.7462, + "step": 73270 + }, + { + "epoch": 4.978597635548308, + "grad_norm": 8.23652458190918, + "learning_rate": 3.7793857861122442e-06, + "loss": 2.9448, + "step": 73275 + }, + { + "epoch": 4.97893735561897, + "grad_norm": 7.052402019500732, + "learning_rate": 3.7789611360239166e-06, + "loss": 2.7652, + "step": 73280 + }, + { + "epoch": 4.979277075689632, + "grad_norm": 7.387182235717773, + "learning_rate": 3.7785364859355894e-06, + "loss": 2.7698, + "step": 73285 + }, + { + "epoch": 4.979616795760293, + "grad_norm": 7.822403430938721, + "learning_rate": 3.778111835847262e-06, + "loss": 2.612, + "step": 73290 + }, + { + "epoch": 4.979956515830955, + "grad_norm": 10.587508201599121, + "learning_rate": 3.7776871857589346e-06, + "loss": 2.7379, + "step": 73295 + }, + { + "epoch": 4.980296235901617, + "grad_norm": 6.58331298828125, + "learning_rate": 3.777262535670608e-06, + "loss": 2.6006, + "step": 73300 + }, + { + "epoch": 4.980635955972279, + "grad_norm": 6.893303871154785, + "learning_rate": 3.7768378855822806e-06, + "loss": 2.9351, + "step": 73305 + }, + { + "epoch": 4.980975676042941, + "grad_norm": 6.722116947174072, + "learning_rate": 3.776413235493953e-06, + "loss": 2.8959, + "step": 73310 + }, + { + "epoch": 4.981315396113603, + "grad_norm": 8.587299346923828, + "learning_rate": 3.7759885854056262e-06, + "loss": 2.8256, + "step": 73315 + }, + { + "epoch": 4.981655116184264, + "grad_norm": 8.714028358459473, + "learning_rate": 3.775563935317299e-06, + "loss": 2.9888, + "step": 73320 + }, + { + "epoch": 4.981994836254926, + "grad_norm": 6.894909858703613, + "learning_rate": 3.7751392852289714e-06, + "loss": 2.7683, + "step": 73325 + }, + { + "epoch": 4.982334556325588, + "grad_norm": 7.46129846572876, + "learning_rate": 3.774714635140644e-06, + "loss": 2.7112, + "step": 73330 + }, + { + "epoch": 4.982674276396249, + "grad_norm": 7.112610340118408, + "learning_rate": 3.7742899850523174e-06, + "loss": 2.9673, + "step": 73335 + }, + { + "epoch": 4.983013996466911, + "grad_norm": 7.479787826538086, + "learning_rate": 3.77386533496399e-06, + "loss": 2.9696, + "step": 73340 + }, + { + "epoch": 4.983353716537573, + "grad_norm": 7.738795280456543, + "learning_rate": 3.7734406848756626e-06, + "loss": 2.6516, + "step": 73345 + }, + { + "epoch": 4.983693436608235, + "grad_norm": 7.513720989227295, + "learning_rate": 3.773016034787336e-06, + "loss": 2.856, + "step": 73350 + }, + { + "epoch": 4.984033156678897, + "grad_norm": 6.449756622314453, + "learning_rate": 3.772591384699008e-06, + "loss": 2.7372, + "step": 73355 + }, + { + "epoch": 4.984372876749559, + "grad_norm": 8.748842239379883, + "learning_rate": 3.772166734610681e-06, + "loss": 2.7412, + "step": 73360 + }, + { + "epoch": 4.98471259682022, + "grad_norm": 6.69316291809082, + "learning_rate": 3.7717420845223542e-06, + "loss": 3.0911, + "step": 73365 + }, + { + "epoch": 4.985052316890882, + "grad_norm": 6.583887100219727, + "learning_rate": 3.7713174344340266e-06, + "loss": 2.9694, + "step": 73370 + }, + { + "epoch": 4.985392036961544, + "grad_norm": 6.943853855133057, + "learning_rate": 3.7708927843456994e-06, + "loss": 2.943, + "step": 73375 + }, + { + "epoch": 4.985731757032205, + "grad_norm": 6.817622184753418, + "learning_rate": 3.770468134257372e-06, + "loss": 2.8133, + "step": 73380 + }, + { + "epoch": 4.986071477102867, + "grad_norm": 7.8053131103515625, + "learning_rate": 3.7700434841690446e-06, + "loss": 2.6538, + "step": 73385 + }, + { + "epoch": 4.986411197173529, + "grad_norm": 7.214580535888672, + "learning_rate": 3.769618834080718e-06, + "loss": 2.7963, + "step": 73390 + }, + { + "epoch": 4.986750917244191, + "grad_norm": 6.865508079528809, + "learning_rate": 3.7691941839923906e-06, + "loss": 2.7815, + "step": 73395 + }, + { + "epoch": 4.987090637314853, + "grad_norm": 7.365975856781006, + "learning_rate": 3.768769533904064e-06, + "loss": 2.7697, + "step": 73400 + }, + { + "epoch": 4.987430357385515, + "grad_norm": 6.303061008453369, + "learning_rate": 3.768344883815736e-06, + "loss": 2.7926, + "step": 73405 + }, + { + "epoch": 4.987770077456176, + "grad_norm": 7.636626720428467, + "learning_rate": 3.767920233727409e-06, + "loss": 2.857, + "step": 73410 + }, + { + "epoch": 4.988109797526838, + "grad_norm": 7.747525215148926, + "learning_rate": 3.767495583639082e-06, + "loss": 2.6523, + "step": 73415 + }, + { + "epoch": 4.9884495175975, + "grad_norm": 7.471425533294678, + "learning_rate": 3.767070933550754e-06, + "loss": 2.9362, + "step": 73420 + }, + { + "epoch": 4.988789237668161, + "grad_norm": 8.32661247253418, + "learning_rate": 3.7666462834624274e-06, + "loss": 3.0427, + "step": 73425 + }, + { + "epoch": 4.989128957738823, + "grad_norm": 8.797091484069824, + "learning_rate": 3.7662216333741e-06, + "loss": 2.8608, + "step": 73430 + }, + { + "epoch": 4.989468677809485, + "grad_norm": 8.310559272766113, + "learning_rate": 3.7657969832857726e-06, + "loss": 2.9548, + "step": 73435 + }, + { + "epoch": 4.989808397880147, + "grad_norm": 6.6858134269714355, + "learning_rate": 3.765372333197446e-06, + "loss": 3.0114, + "step": 73440 + }, + { + "epoch": 4.990148117950809, + "grad_norm": 8.504158973693848, + "learning_rate": 3.7649476831091186e-06, + "loss": 2.9338, + "step": 73445 + }, + { + "epoch": 4.990487838021471, + "grad_norm": 5.287508010864258, + "learning_rate": 3.764523033020791e-06, + "loss": 2.6309, + "step": 73450 + }, + { + "epoch": 4.990827558092132, + "grad_norm": 7.365618705749512, + "learning_rate": 3.7640983829324638e-06, + "loss": 2.8099, + "step": 73455 + }, + { + "epoch": 4.991167278162794, + "grad_norm": 6.556382179260254, + "learning_rate": 3.763673732844137e-06, + "loss": 2.9352, + "step": 73460 + }, + { + "epoch": 4.991506998233456, + "grad_norm": 7.907986164093018, + "learning_rate": 3.7632490827558094e-06, + "loss": 2.6572, + "step": 73465 + }, + { + "epoch": 4.991846718304117, + "grad_norm": 8.188528060913086, + "learning_rate": 3.762824432667482e-06, + "loss": 2.8697, + "step": 73470 + }, + { + "epoch": 4.992186438374779, + "grad_norm": 7.369777202606201, + "learning_rate": 3.7623997825791554e-06, + "loss": 2.9227, + "step": 73475 + }, + { + "epoch": 4.9925261584454415, + "grad_norm": 6.170557022094727, + "learning_rate": 3.761975132490828e-06, + "loss": 3.002, + "step": 73480 + }, + { + "epoch": 4.992865878516103, + "grad_norm": 6.544437885284424, + "learning_rate": 3.7615504824025006e-06, + "loss": 2.8874, + "step": 73485 + }, + { + "epoch": 4.993205598586765, + "grad_norm": 8.743667602539062, + "learning_rate": 3.761125832314174e-06, + "loss": 2.9931, + "step": 73490 + }, + { + "epoch": 4.993545318657426, + "grad_norm": 5.868346691131592, + "learning_rate": 3.760701182225846e-06, + "loss": 2.8555, + "step": 73495 + }, + { + "epoch": 4.993885038728088, + "grad_norm": 7.73525857925415, + "learning_rate": 3.760276532137519e-06, + "loss": 2.7625, + "step": 73500 + }, + { + "epoch": 4.99422475879875, + "grad_norm": 7.664160251617432, + "learning_rate": 3.759851882049192e-06, + "loss": 2.8021, + "step": 73505 + }, + { + "epoch": 4.994564478869411, + "grad_norm": 7.781805038452148, + "learning_rate": 3.759427231960864e-06, + "loss": 2.9134, + "step": 73510 + }, + { + "epoch": 4.994904198940073, + "grad_norm": 9.725112915039062, + "learning_rate": 3.7590025818725374e-06, + "loss": 2.9858, + "step": 73515 + }, + { + "epoch": 4.995243919010735, + "grad_norm": 8.369850158691406, + "learning_rate": 3.75857793178421e-06, + "loss": 2.8098, + "step": 73520 + }, + { + "epoch": 4.995583639081397, + "grad_norm": 7.813419342041016, + "learning_rate": 3.7581532816958826e-06, + "loss": 2.727, + "step": 73525 + }, + { + "epoch": 4.995923359152059, + "grad_norm": 8.002814292907715, + "learning_rate": 3.757728631607556e-06, + "loss": 2.9198, + "step": 73530 + }, + { + "epoch": 4.996263079222721, + "grad_norm": 7.7104644775390625, + "learning_rate": 3.7573039815192286e-06, + "loss": 2.9282, + "step": 73535 + }, + { + "epoch": 4.996602799293382, + "grad_norm": 7.180261611938477, + "learning_rate": 3.756879331430901e-06, + "loss": 2.8472, + "step": 73540 + }, + { + "epoch": 4.996942519364044, + "grad_norm": 7.33673620223999, + "learning_rate": 3.7564546813425738e-06, + "loss": 2.9297, + "step": 73545 + }, + { + "epoch": 4.997282239434706, + "grad_norm": 7.448824882507324, + "learning_rate": 3.756030031254247e-06, + "loss": 2.9047, + "step": 73550 + }, + { + "epoch": 4.997621959505367, + "grad_norm": 7.326620578765869, + "learning_rate": 3.7556053811659194e-06, + "loss": 2.9036, + "step": 73555 + }, + { + "epoch": 4.997961679576029, + "grad_norm": 6.866332054138184, + "learning_rate": 3.755180731077592e-06, + "loss": 3.0261, + "step": 73560 + }, + { + "epoch": 4.998301399646691, + "grad_norm": 8.895586013793945, + "learning_rate": 3.7547560809892654e-06, + "loss": 2.719, + "step": 73565 + }, + { + "epoch": 4.998641119717353, + "grad_norm": 7.052312850952148, + "learning_rate": 3.754331430900938e-06, + "loss": 2.9116, + "step": 73570 + }, + { + "epoch": 4.998980839788015, + "grad_norm": 6.947500705718994, + "learning_rate": 3.7539067808126106e-06, + "loss": 2.808, + "step": 73575 + }, + { + "epoch": 4.999320559858677, + "grad_norm": 7.026329040527344, + "learning_rate": 3.7534821307242834e-06, + "loss": 2.8511, + "step": 73580 + }, + { + "epoch": 4.999660279929338, + "grad_norm": 8.7249174118042, + "learning_rate": 3.7530574806359566e-06, + "loss": 3.0056, + "step": 73585 + }, + { + "epoch": 5.0, + "grad_norm": 19.06011390686035, + "learning_rate": 3.752632830547629e-06, + "loss": 2.8514, + "step": 73590 + }, + { + "epoch": 5.0, + "eval_bertscore": { + "f1": 0.8409231472805565, + "precision": 0.8407067746856853, + "recall": 0.8419542997853101 + }, + "eval_bleu_4": 0.01728640629133737, + "eval_exact_match": 9.690861517588914e-05, + "eval_loss": 3.315546751022339, + "eval_meteor": 0.10523973710215814, + "eval_rouge": { + "rouge1": 0.13641138113691834, + "rouge2": 0.01765766020879509, + "rougeL": 0.11308985192250398, + "rougeLsum": 0.11309312592980891 + }, + "eval_runtime": 1083.0356, + "eval_samples_per_second": 9.528, + "eval_steps_per_second": 1.191, + "step": 73590 + }, + { + "epoch": 5.000339720070662, + "grad_norm": 8.256293296813965, + "learning_rate": 3.7522081804593018e-06, + "loss": 2.897, + "step": 73595 + }, + { + "epoch": 5.000679440141323, + "grad_norm": 7.179742336273193, + "learning_rate": 3.751783530370975e-06, + "loss": 2.665, + "step": 73600 + }, + { + "epoch": 5.001019160211985, + "grad_norm": 6.916292667388916, + "learning_rate": 3.7513588802826474e-06, + "loss": 2.7886, + "step": 73605 + }, + { + "epoch": 5.001358880282647, + "grad_norm": 8.898502349853516, + "learning_rate": 3.75093423019432e-06, + "loss": 2.8323, + "step": 73610 + }, + { + "epoch": 5.001698600353309, + "grad_norm": 7.928882122039795, + "learning_rate": 3.750509580105993e-06, + "loss": 2.7492, + "step": 73615 + }, + { + "epoch": 5.002038320423971, + "grad_norm": 8.565394401550293, + "learning_rate": 3.750084930017666e-06, + "loss": 2.743, + "step": 73620 + }, + { + "epoch": 5.002378040494633, + "grad_norm": 6.666855812072754, + "learning_rate": 3.7496602799293386e-06, + "loss": 2.9027, + "step": 73625 + }, + { + "epoch": 5.002717760565294, + "grad_norm": 6.875427722930908, + "learning_rate": 3.7492356298410114e-06, + "loss": 2.6862, + "step": 73630 + }, + { + "epoch": 5.003057480635956, + "grad_norm": 6.543339729309082, + "learning_rate": 3.7488109797526838e-06, + "loss": 2.9791, + "step": 73635 + }, + { + "epoch": 5.003397200706618, + "grad_norm": 7.847986698150635, + "learning_rate": 3.748386329664357e-06, + "loss": 2.6892, + "step": 73640 + }, + { + "epoch": 5.003736920777279, + "grad_norm": 7.657154083251953, + "learning_rate": 3.74796167957603e-06, + "loss": 2.6807, + "step": 73645 + }, + { + "epoch": 5.004076640847941, + "grad_norm": 8.636756896972656, + "learning_rate": 3.747537029487702e-06, + "loss": 2.8342, + "step": 73650 + }, + { + "epoch": 5.004416360918603, + "grad_norm": 6.712889671325684, + "learning_rate": 3.7471123793993754e-06, + "loss": 2.8177, + "step": 73655 + }, + { + "epoch": 5.004756080989265, + "grad_norm": 7.954500675201416, + "learning_rate": 3.746687729311048e-06, + "loss": 2.8225, + "step": 73660 + }, + { + "epoch": 5.005095801059927, + "grad_norm": 8.084739685058594, + "learning_rate": 3.7462630792227206e-06, + "loss": 2.7637, + "step": 73665 + }, + { + "epoch": 5.005435521130589, + "grad_norm": 8.035821914672852, + "learning_rate": 3.7458384291343934e-06, + "loss": 2.6973, + "step": 73670 + }, + { + "epoch": 5.00577524120125, + "grad_norm": 6.149903774261475, + "learning_rate": 3.7454137790460666e-06, + "loss": 2.8444, + "step": 73675 + }, + { + "epoch": 5.006114961271912, + "grad_norm": 6.505760192871094, + "learning_rate": 3.744989128957739e-06, + "loss": 2.8615, + "step": 73680 + }, + { + "epoch": 5.006454681342574, + "grad_norm": 7.032881736755371, + "learning_rate": 3.7445644788694118e-06, + "loss": 2.7749, + "step": 73685 + }, + { + "epoch": 5.006794401413235, + "grad_norm": 7.105157852172852, + "learning_rate": 3.744139828781085e-06, + "loss": 2.6569, + "step": 73690 + }, + { + "epoch": 5.007134121483897, + "grad_norm": 8.68187141418457, + "learning_rate": 3.7437151786927574e-06, + "loss": 2.8663, + "step": 73695 + }, + { + "epoch": 5.007473841554559, + "grad_norm": 6.3594183921813965, + "learning_rate": 3.74329052860443e-06, + "loss": 2.7394, + "step": 73700 + }, + { + "epoch": 5.007813561625221, + "grad_norm": 7.12614107131958, + "learning_rate": 3.742865878516103e-06, + "loss": 2.748, + "step": 73705 + }, + { + "epoch": 5.008153281695883, + "grad_norm": 7.882846355438232, + "learning_rate": 3.7424412284277758e-06, + "loss": 2.756, + "step": 73710 + }, + { + "epoch": 5.008493001766545, + "grad_norm": 8.116713523864746, + "learning_rate": 3.7420165783394486e-06, + "loss": 2.9123, + "step": 73715 + }, + { + "epoch": 5.008832721837206, + "grad_norm": 5.9536967277526855, + "learning_rate": 3.7415919282511214e-06, + "loss": 2.7853, + "step": 73720 + }, + { + "epoch": 5.009172441907868, + "grad_norm": 6.75257682800293, + "learning_rate": 3.7411672781627938e-06, + "loss": 2.638, + "step": 73725 + }, + { + "epoch": 5.00951216197853, + "grad_norm": 7.760793685913086, + "learning_rate": 3.740742628074467e-06, + "loss": 2.6986, + "step": 73730 + }, + { + "epoch": 5.009851882049191, + "grad_norm": 7.044058322906494, + "learning_rate": 3.7403179779861398e-06, + "loss": 2.838, + "step": 73735 + }, + { + "epoch": 5.010191602119853, + "grad_norm": 6.62687873840332, + "learning_rate": 3.739893327897812e-06, + "loss": 2.8365, + "step": 73740 + }, + { + "epoch": 5.0105313221905154, + "grad_norm": 6.544068336486816, + "learning_rate": 3.7394686778094854e-06, + "loss": 2.6671, + "step": 73745 + }, + { + "epoch": 5.010871042261177, + "grad_norm": 6.386043548583984, + "learning_rate": 3.739044027721158e-06, + "loss": 2.556, + "step": 73750 + }, + { + "epoch": 5.011210762331839, + "grad_norm": 8.076324462890625, + "learning_rate": 3.738619377632831e-06, + "loss": 2.7234, + "step": 73755 + }, + { + "epoch": 5.0115504824025, + "grad_norm": 7.968382835388184, + "learning_rate": 3.7381947275445034e-06, + "loss": 2.8352, + "step": 73760 + }, + { + "epoch": 5.011890202473162, + "grad_norm": 10.508129119873047, + "learning_rate": 3.7377700774561766e-06, + "loss": 2.8623, + "step": 73765 + }, + { + "epoch": 5.012229922543824, + "grad_norm": 7.405519962310791, + "learning_rate": 3.7373454273678494e-06, + "loss": 2.4944, + "step": 73770 + }, + { + "epoch": 5.012569642614485, + "grad_norm": 7.133551120758057, + "learning_rate": 3.7369207772795218e-06, + "loss": 2.7477, + "step": 73775 + }, + { + "epoch": 5.012909362685147, + "grad_norm": 7.559813022613525, + "learning_rate": 3.736496127191195e-06, + "loss": 2.9449, + "step": 73780 + }, + { + "epoch": 5.013249082755809, + "grad_norm": 6.413923263549805, + "learning_rate": 3.736071477102868e-06, + "loss": 2.7459, + "step": 73785 + }, + { + "epoch": 5.013588802826471, + "grad_norm": 7.982732772827148, + "learning_rate": 3.73564682701454e-06, + "loss": 2.7718, + "step": 73790 + }, + { + "epoch": 5.013928522897133, + "grad_norm": 9.250762939453125, + "learning_rate": 3.735222176926213e-06, + "loss": 2.9138, + "step": 73795 + }, + { + "epoch": 5.014268242967795, + "grad_norm": 7.7262420654296875, + "learning_rate": 3.734797526837886e-06, + "loss": 2.7622, + "step": 73800 + }, + { + "epoch": 5.014607963038456, + "grad_norm": 6.666176795959473, + "learning_rate": 3.7343728767495586e-06, + "loss": 2.5557, + "step": 73805 + }, + { + "epoch": 5.014947683109118, + "grad_norm": 6.337995529174805, + "learning_rate": 3.7339482266612314e-06, + "loss": 2.8779, + "step": 73810 + }, + { + "epoch": 5.01528740317978, + "grad_norm": 7.89417028427124, + "learning_rate": 3.7335235765729046e-06, + "loss": 2.7006, + "step": 73815 + }, + { + "epoch": 5.015627123250441, + "grad_norm": 9.81161880493164, + "learning_rate": 3.733098926484577e-06, + "loss": 2.7911, + "step": 73820 + }, + { + "epoch": 5.015966843321103, + "grad_norm": 7.166833877563477, + "learning_rate": 3.7326742763962498e-06, + "loss": 2.7226, + "step": 73825 + }, + { + "epoch": 5.016306563391765, + "grad_norm": 6.917985916137695, + "learning_rate": 3.7322496263079226e-06, + "loss": 2.844, + "step": 73830 + }, + { + "epoch": 5.016646283462427, + "grad_norm": 7.836352348327637, + "learning_rate": 3.7318249762195954e-06, + "loss": 2.8882, + "step": 73835 + }, + { + "epoch": 5.016986003533089, + "grad_norm": 6.728044033050537, + "learning_rate": 3.731400326131268e-06, + "loss": 2.5193, + "step": 73840 + }, + { + "epoch": 5.017325723603751, + "grad_norm": 7.108718395233154, + "learning_rate": 3.730975676042941e-06, + "loss": 2.5996, + "step": 73845 + }, + { + "epoch": 5.017665443674412, + "grad_norm": 6.352814197540283, + "learning_rate": 3.7305510259546134e-06, + "loss": 2.7296, + "step": 73850 + }, + { + "epoch": 5.018005163745074, + "grad_norm": 8.150849342346191, + "learning_rate": 3.7301263758662866e-06, + "loss": 2.8465, + "step": 73855 + }, + { + "epoch": 5.018344883815736, + "grad_norm": 6.496094703674316, + "learning_rate": 3.7297017257779594e-06, + "loss": 2.6759, + "step": 73860 + }, + { + "epoch": 5.018684603886397, + "grad_norm": 7.927548408508301, + "learning_rate": 3.7292770756896318e-06, + "loss": 2.7136, + "step": 73865 + }, + { + "epoch": 5.019024323957059, + "grad_norm": 8.449132919311523, + "learning_rate": 3.728852425601305e-06, + "loss": 2.8634, + "step": 73870 + }, + { + "epoch": 5.019364044027721, + "grad_norm": 6.912847995758057, + "learning_rate": 3.7284277755129778e-06, + "loss": 2.8893, + "step": 73875 + }, + { + "epoch": 5.019703764098383, + "grad_norm": 8.737635612487793, + "learning_rate": 3.72800312542465e-06, + "loss": 2.7322, + "step": 73880 + }, + { + "epoch": 5.020043484169045, + "grad_norm": 8.196133613586426, + "learning_rate": 3.727578475336323e-06, + "loss": 2.6825, + "step": 73885 + }, + { + "epoch": 5.020383204239707, + "grad_norm": 9.346474647521973, + "learning_rate": 3.727153825247996e-06, + "loss": 2.9448, + "step": 73890 + }, + { + "epoch": 5.020722924310368, + "grad_norm": 7.180441379547119, + "learning_rate": 3.7267291751596686e-06, + "loss": 2.7058, + "step": 73895 + }, + { + "epoch": 5.02106264438103, + "grad_norm": 10.39455795288086, + "learning_rate": 3.7263045250713414e-06, + "loss": 2.8435, + "step": 73900 + }, + { + "epoch": 5.021402364451692, + "grad_norm": 6.715665817260742, + "learning_rate": 3.7258798749830146e-06, + "loss": 3.0016, + "step": 73905 + }, + { + "epoch": 5.021742084522353, + "grad_norm": 8.52259635925293, + "learning_rate": 3.725455224894687e-06, + "loss": 2.6401, + "step": 73910 + }, + { + "epoch": 5.022081804593015, + "grad_norm": 9.143594741821289, + "learning_rate": 3.7250305748063598e-06, + "loss": 2.8586, + "step": 73915 + }, + { + "epoch": 5.022421524663677, + "grad_norm": 6.812370300292969, + "learning_rate": 3.7246059247180326e-06, + "loss": 2.7471, + "step": 73920 + }, + { + "epoch": 5.022761244734339, + "grad_norm": 9.226218223571777, + "learning_rate": 3.724181274629706e-06, + "loss": 2.696, + "step": 73925 + }, + { + "epoch": 5.023100964805001, + "grad_norm": 6.292170524597168, + "learning_rate": 3.723756624541378e-06, + "loss": 2.7059, + "step": 73930 + }, + { + "epoch": 5.023440684875663, + "grad_norm": 8.64374828338623, + "learning_rate": 3.723331974453051e-06, + "loss": 3.0238, + "step": 73935 + }, + { + "epoch": 5.023780404946324, + "grad_norm": 7.251244068145752, + "learning_rate": 3.722907324364724e-06, + "loss": 2.5359, + "step": 73940 + }, + { + "epoch": 5.024120125016986, + "grad_norm": 5.451649188995361, + "learning_rate": 3.7224826742763966e-06, + "loss": 2.6007, + "step": 73945 + }, + { + "epoch": 5.024459845087648, + "grad_norm": 6.825295925140381, + "learning_rate": 3.7220580241880694e-06, + "loss": 2.5956, + "step": 73950 + }, + { + "epoch": 5.024799565158309, + "grad_norm": 7.269388198852539, + "learning_rate": 3.721633374099742e-06, + "loss": 2.5944, + "step": 73955 + }, + { + "epoch": 5.025139285228971, + "grad_norm": 6.278788089752197, + "learning_rate": 3.7212087240114145e-06, + "loss": 2.9163, + "step": 73960 + }, + { + "epoch": 5.025479005299633, + "grad_norm": 7.600319862365723, + "learning_rate": 3.7207840739230878e-06, + "loss": 2.9279, + "step": 73965 + }, + { + "epoch": 5.025818725370295, + "grad_norm": 7.158032417297363, + "learning_rate": 3.7203594238347606e-06, + "loss": 2.962, + "step": 73970 + }, + { + "epoch": 5.026158445440957, + "grad_norm": 5.8560075759887695, + "learning_rate": 3.719934773746433e-06, + "loss": 2.8024, + "step": 73975 + }, + { + "epoch": 5.026498165511619, + "grad_norm": 7.1994171142578125, + "learning_rate": 3.719510123658106e-06, + "loss": 2.7605, + "step": 73980 + }, + { + "epoch": 5.02683788558228, + "grad_norm": 7.535231113433838, + "learning_rate": 3.719085473569779e-06, + "loss": 2.705, + "step": 73985 + }, + { + "epoch": 5.027177605652942, + "grad_norm": 7.714955806732178, + "learning_rate": 3.7186608234814514e-06, + "loss": 2.7597, + "step": 73990 + }, + { + "epoch": 5.027517325723604, + "grad_norm": 6.006187915802002, + "learning_rate": 3.7182361733931246e-06, + "loss": 2.7338, + "step": 73995 + }, + { + "epoch": 5.027857045794265, + "grad_norm": 6.7583746910095215, + "learning_rate": 3.7178115233047974e-06, + "loss": 2.6369, + "step": 74000 + }, + { + "epoch": 5.028196765864927, + "grad_norm": 7.951467990875244, + "learning_rate": 3.7173868732164698e-06, + "loss": 2.9945, + "step": 74005 + }, + { + "epoch": 5.028536485935589, + "grad_norm": 8.450323104858398, + "learning_rate": 3.7169622231281426e-06, + "loss": 2.8169, + "step": 74010 + }, + { + "epoch": 5.028876206006251, + "grad_norm": 9.91334056854248, + "learning_rate": 3.7165375730398158e-06, + "loss": 2.7324, + "step": 74015 + }, + { + "epoch": 5.029215926076913, + "grad_norm": 8.352290153503418, + "learning_rate": 3.716112922951488e-06, + "loss": 2.6652, + "step": 74020 + }, + { + "epoch": 5.029555646147575, + "grad_norm": 8.424809455871582, + "learning_rate": 3.715688272863161e-06, + "loss": 2.9891, + "step": 74025 + }, + { + "epoch": 5.029895366218236, + "grad_norm": 7.790605068206787, + "learning_rate": 3.715263622774834e-06, + "loss": 2.8981, + "step": 74030 + }, + { + "epoch": 5.030235086288898, + "grad_norm": 7.10322904586792, + "learning_rate": 3.7148389726865066e-06, + "loss": 2.8292, + "step": 74035 + }, + { + "epoch": 5.03057480635956, + "grad_norm": 8.040102005004883, + "learning_rate": 3.7144143225981794e-06, + "loss": 2.6787, + "step": 74040 + }, + { + "epoch": 5.030914526430221, + "grad_norm": 8.05504322052002, + "learning_rate": 3.713989672509852e-06, + "loss": 2.5594, + "step": 74045 + }, + { + "epoch": 5.031254246500883, + "grad_norm": 9.911808013916016, + "learning_rate": 3.7135650224215245e-06, + "loss": 3.076, + "step": 74050 + }, + { + "epoch": 5.0315939665715455, + "grad_norm": 7.104210376739502, + "learning_rate": 3.7131403723331978e-06, + "loss": 2.4936, + "step": 74055 + }, + { + "epoch": 5.031933686642207, + "grad_norm": 10.779367446899414, + "learning_rate": 3.7127157222448706e-06, + "loss": 2.5383, + "step": 74060 + }, + { + "epoch": 5.032273406712869, + "grad_norm": 6.844131946563721, + "learning_rate": 3.712291072156543e-06, + "loss": 2.8578, + "step": 74065 + }, + { + "epoch": 5.032613126783531, + "grad_norm": 8.810541152954102, + "learning_rate": 3.711866422068216e-06, + "loss": 2.7045, + "step": 74070 + }, + { + "epoch": 5.032952846854192, + "grad_norm": 10.3795747756958, + "learning_rate": 3.711441771979889e-06, + "loss": 2.663, + "step": 74075 + }, + { + "epoch": 5.033292566924854, + "grad_norm": 9.38516616821289, + "learning_rate": 3.7110171218915613e-06, + "loss": 2.5944, + "step": 74080 + }, + { + "epoch": 5.033632286995516, + "grad_norm": 9.389474868774414, + "learning_rate": 3.710592471803234e-06, + "loss": 2.9258, + "step": 74085 + }, + { + "epoch": 5.033972007066177, + "grad_norm": 10.058926582336426, + "learning_rate": 3.7101678217149074e-06, + "loss": 2.7416, + "step": 74090 + }, + { + "epoch": 5.034311727136839, + "grad_norm": 6.726471424102783, + "learning_rate": 3.70974317162658e-06, + "loss": 3.107, + "step": 74095 + }, + { + "epoch": 5.0346514472075015, + "grad_norm": 7.062777519226074, + "learning_rate": 3.7093185215382525e-06, + "loss": 2.9058, + "step": 74100 + }, + { + "epoch": 5.034991167278163, + "grad_norm": 7.6221699714660645, + "learning_rate": 3.7088938714499258e-06, + "loss": 2.9014, + "step": 74105 + }, + { + "epoch": 5.035330887348825, + "grad_norm": 7.917490005493164, + "learning_rate": 3.7084692213615986e-06, + "loss": 2.8075, + "step": 74110 + }, + { + "epoch": 5.035670607419486, + "grad_norm": 7.542130470275879, + "learning_rate": 3.708044571273271e-06, + "loss": 3.0679, + "step": 74115 + }, + { + "epoch": 5.036010327490148, + "grad_norm": 6.321074485778809, + "learning_rate": 3.7076199211849437e-06, + "loss": 2.4697, + "step": 74120 + }, + { + "epoch": 5.03635004756081, + "grad_norm": 6.8685736656188965, + "learning_rate": 3.707195271096617e-06, + "loss": 2.8236, + "step": 74125 + }, + { + "epoch": 5.036689767631471, + "grad_norm": 7.6730780601501465, + "learning_rate": 3.7067706210082894e-06, + "loss": 2.9591, + "step": 74130 + }, + { + "epoch": 5.037029487702133, + "grad_norm": 9.098894119262695, + "learning_rate": 3.706345970919962e-06, + "loss": 2.5871, + "step": 74135 + }, + { + "epoch": 5.037369207772795, + "grad_norm": 8.030238151550293, + "learning_rate": 3.7059213208316354e-06, + "loss": 2.9089, + "step": 74140 + }, + { + "epoch": 5.037708927843457, + "grad_norm": 7.528008460998535, + "learning_rate": 3.7054966707433078e-06, + "loss": 2.8988, + "step": 74145 + }, + { + "epoch": 5.038048647914119, + "grad_norm": 6.059206962585449, + "learning_rate": 3.7050720206549806e-06, + "loss": 2.7262, + "step": 74150 + }, + { + "epoch": 5.038388367984781, + "grad_norm": 6.18208646774292, + "learning_rate": 3.7046473705666538e-06, + "loss": 2.794, + "step": 74155 + }, + { + "epoch": 5.038728088055442, + "grad_norm": 8.356470108032227, + "learning_rate": 3.704222720478326e-06, + "loss": 2.6054, + "step": 74160 + }, + { + "epoch": 5.039067808126104, + "grad_norm": 7.278294086456299, + "learning_rate": 3.703798070389999e-06, + "loss": 3.102, + "step": 74165 + }, + { + "epoch": 5.039407528196766, + "grad_norm": 8.202195167541504, + "learning_rate": 3.7033734203016718e-06, + "loss": 2.7981, + "step": 74170 + }, + { + "epoch": 5.039747248267427, + "grad_norm": 8.675713539123535, + "learning_rate": 3.702948770213344e-06, + "loss": 2.986, + "step": 74175 + }, + { + "epoch": 5.040086968338089, + "grad_norm": 7.553218364715576, + "learning_rate": 3.7025241201250174e-06, + "loss": 2.7613, + "step": 74180 + }, + { + "epoch": 5.040426688408751, + "grad_norm": 6.132992267608643, + "learning_rate": 3.70209947003669e-06, + "loss": 2.7966, + "step": 74185 + }, + { + "epoch": 5.040766408479413, + "grad_norm": 6.974486351013184, + "learning_rate": 3.7016748199483625e-06, + "loss": 2.7948, + "step": 74190 + }, + { + "epoch": 5.041106128550075, + "grad_norm": 6.212691307067871, + "learning_rate": 3.7012501698600358e-06, + "loss": 2.7442, + "step": 74195 + }, + { + "epoch": 5.041445848620737, + "grad_norm": 8.448264122009277, + "learning_rate": 3.7008255197717086e-06, + "loss": 3.0185, + "step": 74200 + }, + { + "epoch": 5.041785568691398, + "grad_norm": 6.540849208831787, + "learning_rate": 3.700400869683381e-06, + "loss": 2.7564, + "step": 74205 + }, + { + "epoch": 5.04212528876206, + "grad_norm": 8.733291625976562, + "learning_rate": 3.6999762195950537e-06, + "loss": 3.1213, + "step": 74210 + }, + { + "epoch": 5.042465008832722, + "grad_norm": 6.79180908203125, + "learning_rate": 3.699551569506727e-06, + "loss": 2.7371, + "step": 74215 + }, + { + "epoch": 5.042804728903383, + "grad_norm": 7.21260929107666, + "learning_rate": 3.6991269194183993e-06, + "loss": 2.8712, + "step": 74220 + }, + { + "epoch": 5.043144448974045, + "grad_norm": 8.168962478637695, + "learning_rate": 3.698702269330072e-06, + "loss": 2.782, + "step": 74225 + }, + { + "epoch": 5.043484169044707, + "grad_norm": 8.374811172485352, + "learning_rate": 3.6982776192417454e-06, + "loss": 2.9406, + "step": 74230 + }, + { + "epoch": 5.043823889115369, + "grad_norm": 6.126360893249512, + "learning_rate": 3.6978529691534177e-06, + "loss": 2.6527, + "step": 74235 + }, + { + "epoch": 5.044163609186031, + "grad_norm": 7.808990478515625, + "learning_rate": 3.6974283190650905e-06, + "loss": 2.3993, + "step": 74240 + }, + { + "epoch": 5.044503329256693, + "grad_norm": 9.79412841796875, + "learning_rate": 3.6970036689767633e-06, + "loss": 2.8535, + "step": 74245 + }, + { + "epoch": 5.044843049327354, + "grad_norm": 7.777141571044922, + "learning_rate": 3.696579018888436e-06, + "loss": 2.6621, + "step": 74250 + }, + { + "epoch": 5.045182769398016, + "grad_norm": 7.389467716217041, + "learning_rate": 3.6962392988177742e-06, + "loss": 2.9448, + "step": 74255 + }, + { + "epoch": 5.045522489468678, + "grad_norm": 7.570728302001953, + "learning_rate": 3.695814648729447e-06, + "loss": 2.7289, + "step": 74260 + }, + { + "epoch": 5.045862209539339, + "grad_norm": 7.838103294372559, + "learning_rate": 3.6953899986411202e-06, + "loss": 2.9822, + "step": 74265 + }, + { + "epoch": 5.046201929610001, + "grad_norm": 6.856711387634277, + "learning_rate": 3.6949653485527926e-06, + "loss": 2.8574, + "step": 74270 + }, + { + "epoch": 5.046541649680663, + "grad_norm": 7.297884464263916, + "learning_rate": 3.6945406984644654e-06, + "loss": 2.7529, + "step": 74275 + }, + { + "epoch": 5.046881369751325, + "grad_norm": 8.058323860168457, + "learning_rate": 3.6941160483761386e-06, + "loss": 2.471, + "step": 74280 + }, + { + "epoch": 5.047221089821987, + "grad_norm": 7.718306541442871, + "learning_rate": 3.693691398287811e-06, + "loss": 2.7139, + "step": 74285 + }, + { + "epoch": 5.047560809892649, + "grad_norm": 6.430118083953857, + "learning_rate": 3.693266748199484e-06, + "loss": 2.7896, + "step": 74290 + }, + { + "epoch": 5.04790052996331, + "grad_norm": 7.352906703948975, + "learning_rate": 3.6928420981111566e-06, + "loss": 2.5971, + "step": 74295 + }, + { + "epoch": 5.048240250033972, + "grad_norm": 7.050657749176025, + "learning_rate": 3.69241744802283e-06, + "loss": 2.9643, + "step": 74300 + }, + { + "epoch": 5.048579970104634, + "grad_norm": 12.688314437866211, + "learning_rate": 3.6919927979345022e-06, + "loss": 2.9695, + "step": 74305 + }, + { + "epoch": 5.048919690175295, + "grad_norm": 7.476974964141846, + "learning_rate": 3.691568147846175e-06, + "loss": 3.021, + "step": 74310 + }, + { + "epoch": 5.049259410245957, + "grad_norm": 6.821500301361084, + "learning_rate": 3.6911434977578482e-06, + "loss": 2.9059, + "step": 74315 + }, + { + "epoch": 5.0495991303166194, + "grad_norm": 6.399652481079102, + "learning_rate": 3.6907188476695206e-06, + "loss": 2.9691, + "step": 74320 + }, + { + "epoch": 5.049938850387281, + "grad_norm": 5.360423564910889, + "learning_rate": 3.6902941975811934e-06, + "loss": 2.7575, + "step": 74325 + }, + { + "epoch": 5.050278570457943, + "grad_norm": 7.693698406219482, + "learning_rate": 3.6898695474928662e-06, + "loss": 2.6044, + "step": 74330 + }, + { + "epoch": 5.050618290528605, + "grad_norm": 8.043656349182129, + "learning_rate": 3.6894448974045386e-06, + "loss": 2.9802, + "step": 74335 + }, + { + "epoch": 5.050958010599266, + "grad_norm": 8.285843849182129, + "learning_rate": 3.689020247316212e-06, + "loss": 2.6575, + "step": 74340 + }, + { + "epoch": 5.051297730669928, + "grad_norm": 7.972245693206787, + "learning_rate": 3.6885955972278846e-06, + "loss": 2.6359, + "step": 74345 + }, + { + "epoch": 5.05163745074059, + "grad_norm": 6.580002307891846, + "learning_rate": 3.688170947139557e-06, + "loss": 2.7636, + "step": 74350 + }, + { + "epoch": 5.051977170811251, + "grad_norm": 6.72556734085083, + "learning_rate": 3.6877462970512302e-06, + "loss": 2.822, + "step": 74355 + }, + { + "epoch": 5.052316890881913, + "grad_norm": 6.6624932289123535, + "learning_rate": 3.687321646962903e-06, + "loss": 2.7088, + "step": 74360 + }, + { + "epoch": 5.0526566109525755, + "grad_norm": 7.2804646492004395, + "learning_rate": 3.6868969968745754e-06, + "loss": 2.802, + "step": 74365 + }, + { + "epoch": 5.052996331023237, + "grad_norm": 8.092520713806152, + "learning_rate": 3.6864723467862486e-06, + "loss": 2.8029, + "step": 74370 + }, + { + "epoch": 5.053336051093899, + "grad_norm": 7.288541316986084, + "learning_rate": 3.6860476966979214e-06, + "loss": 2.7243, + "step": 74375 + }, + { + "epoch": 5.053675771164561, + "grad_norm": 7.1720871925354, + "learning_rate": 3.685623046609594e-06, + "loss": 2.4366, + "step": 74380 + }, + { + "epoch": 5.054015491235222, + "grad_norm": 10.648429870605469, + "learning_rate": 3.6851983965212666e-06, + "loss": 2.6922, + "step": 74385 + }, + { + "epoch": 5.054355211305884, + "grad_norm": 9.32312297821045, + "learning_rate": 3.68477374643294e-06, + "loss": 2.9664, + "step": 74390 + }, + { + "epoch": 5.054694931376546, + "grad_norm": 7.095841407775879, + "learning_rate": 3.6843490963446122e-06, + "loss": 2.7614, + "step": 74395 + }, + { + "epoch": 5.055034651447207, + "grad_norm": 8.073848724365234, + "learning_rate": 3.683924446256285e-06, + "loss": 2.479, + "step": 74400 + }, + { + "epoch": 5.055374371517869, + "grad_norm": 7.035075664520264, + "learning_rate": 3.6834997961679582e-06, + "loss": 2.7656, + "step": 74405 + }, + { + "epoch": 5.0557140915885315, + "grad_norm": 6.431181907653809, + "learning_rate": 3.6830751460796306e-06, + "loss": 2.7747, + "step": 74410 + }, + { + "epoch": 5.056053811659193, + "grad_norm": 6.575903415679932, + "learning_rate": 3.6826504959913034e-06, + "loss": 2.7572, + "step": 74415 + }, + { + "epoch": 5.056393531729855, + "grad_norm": 7.58433198928833, + "learning_rate": 3.6822258459029762e-06, + "loss": 2.8346, + "step": 74420 + }, + { + "epoch": 5.056733251800517, + "grad_norm": 7.4280104637146, + "learning_rate": 3.6818011958146486e-06, + "loss": 3.2605, + "step": 74425 + }, + { + "epoch": 5.057072971871178, + "grad_norm": 7.114099502563477, + "learning_rate": 3.681376545726322e-06, + "loss": 2.8377, + "step": 74430 + }, + { + "epoch": 5.05741269194184, + "grad_norm": 8.110834121704102, + "learning_rate": 3.6809518956379946e-06, + "loss": 2.7433, + "step": 74435 + }, + { + "epoch": 5.057752412012501, + "grad_norm": 6.79685640335083, + "learning_rate": 3.680527245549667e-06, + "loss": 2.6864, + "step": 74440 + }, + { + "epoch": 5.058092132083163, + "grad_norm": 6.962424278259277, + "learning_rate": 3.6801025954613402e-06, + "loss": 2.6941, + "step": 74445 + }, + { + "epoch": 5.058431852153825, + "grad_norm": 7.430924892425537, + "learning_rate": 3.679677945373013e-06, + "loss": 2.4882, + "step": 74450 + }, + { + "epoch": 5.058771572224487, + "grad_norm": 6.304492473602295, + "learning_rate": 3.6792532952846854e-06, + "loss": 2.697, + "step": 74455 + }, + { + "epoch": 5.059111292295149, + "grad_norm": 9.474282264709473, + "learning_rate": 3.678828645196358e-06, + "loss": 2.858, + "step": 74460 + }, + { + "epoch": 5.059451012365811, + "grad_norm": 9.183499336242676, + "learning_rate": 3.6784039951080314e-06, + "loss": 2.8622, + "step": 74465 + }, + { + "epoch": 5.059790732436472, + "grad_norm": 7.142753601074219, + "learning_rate": 3.6779793450197042e-06, + "loss": 2.6583, + "step": 74470 + }, + { + "epoch": 5.060130452507134, + "grad_norm": 7.49550724029541, + "learning_rate": 3.6775546949313766e-06, + "loss": 2.8817, + "step": 74475 + }, + { + "epoch": 5.060470172577796, + "grad_norm": 7.562838077545166, + "learning_rate": 3.67713004484305e-06, + "loss": 2.8407, + "step": 74480 + }, + { + "epoch": 5.060809892648457, + "grad_norm": 7.198897361755371, + "learning_rate": 3.6767053947547226e-06, + "loss": 2.8702, + "step": 74485 + }, + { + "epoch": 5.061149612719119, + "grad_norm": 8.943120002746582, + "learning_rate": 3.676280744666395e-06, + "loss": 2.666, + "step": 74490 + }, + { + "epoch": 5.061489332789781, + "grad_norm": 9.170707702636719, + "learning_rate": 3.6758560945780682e-06, + "loss": 2.6715, + "step": 74495 + }, + { + "epoch": 5.061829052860443, + "grad_norm": 9.463250160217285, + "learning_rate": 3.675431444489741e-06, + "loss": 2.9906, + "step": 74500 + }, + { + "epoch": 5.062168772931105, + "grad_norm": 6.7712907791137695, + "learning_rate": 3.6750067944014134e-06, + "loss": 2.4953, + "step": 74505 + }, + { + "epoch": 5.062508493001767, + "grad_norm": 8.227871894836426, + "learning_rate": 3.674582144313086e-06, + "loss": 2.9813, + "step": 74510 + }, + { + "epoch": 5.062848213072428, + "grad_norm": 7.235427379608154, + "learning_rate": 3.6741574942247594e-06, + "loss": 2.7992, + "step": 74515 + }, + { + "epoch": 5.06318793314309, + "grad_norm": 6.8528218269348145, + "learning_rate": 3.673732844136432e-06, + "loss": 2.759, + "step": 74520 + }, + { + "epoch": 5.063527653213752, + "grad_norm": 7.278111457824707, + "learning_rate": 3.6733081940481046e-06, + "loss": 2.4112, + "step": 74525 + }, + { + "epoch": 5.063867373284413, + "grad_norm": 9.310016632080078, + "learning_rate": 3.672883543959778e-06, + "loss": 2.9945, + "step": 74530 + }, + { + "epoch": 5.064207093355075, + "grad_norm": 7.184089660644531, + "learning_rate": 3.6724588938714502e-06, + "loss": 2.8856, + "step": 74535 + }, + { + "epoch": 5.064546813425737, + "grad_norm": 9.895227432250977, + "learning_rate": 3.672034243783123e-06, + "loss": 2.7514, + "step": 74540 + }, + { + "epoch": 5.064886533496399, + "grad_norm": 6.851855278015137, + "learning_rate": 3.671609593694796e-06, + "loss": 2.711, + "step": 74545 + }, + { + "epoch": 5.065226253567061, + "grad_norm": 7.1998820304870605, + "learning_rate": 3.671184943606468e-06, + "loss": 2.6959, + "step": 74550 + }, + { + "epoch": 5.065565973637723, + "grad_norm": 7.862881183624268, + "learning_rate": 3.6707602935181414e-06, + "loss": 2.8255, + "step": 74555 + }, + { + "epoch": 5.065905693708384, + "grad_norm": 7.054964542388916, + "learning_rate": 3.6703356434298142e-06, + "loss": 2.6958, + "step": 74560 + }, + { + "epoch": 5.066245413779046, + "grad_norm": 8.351388931274414, + "learning_rate": 3.6699109933414866e-06, + "loss": 2.9421, + "step": 74565 + }, + { + "epoch": 5.066585133849708, + "grad_norm": 7.750319957733154, + "learning_rate": 3.66948634325316e-06, + "loss": 2.8585, + "step": 74570 + }, + { + "epoch": 5.066924853920369, + "grad_norm": 10.289549827575684, + "learning_rate": 3.6690616931648326e-06, + "loss": 2.9555, + "step": 74575 + }, + { + "epoch": 5.067264573991031, + "grad_norm": 8.531167984008789, + "learning_rate": 3.668637043076505e-06, + "loss": 2.4987, + "step": 74580 + }, + { + "epoch": 5.067604294061693, + "grad_norm": 8.91225528717041, + "learning_rate": 3.668212392988178e-06, + "loss": 3.0218, + "step": 74585 + }, + { + "epoch": 5.067944014132355, + "grad_norm": 8.983231544494629, + "learning_rate": 3.667787742899851e-06, + "loss": 2.7504, + "step": 74590 + }, + { + "epoch": 5.068283734203017, + "grad_norm": 9.614591598510742, + "learning_rate": 3.6673630928115234e-06, + "loss": 2.8503, + "step": 74595 + }, + { + "epoch": 5.068623454273679, + "grad_norm": 9.298423767089844, + "learning_rate": 3.666938442723196e-06, + "loss": 2.9251, + "step": 74600 + }, + { + "epoch": 5.06896317434434, + "grad_norm": 6.451038360595703, + "learning_rate": 3.6665137926348694e-06, + "loss": 2.7655, + "step": 74605 + }, + { + "epoch": 5.069302894415002, + "grad_norm": 7.804687023162842, + "learning_rate": 3.666089142546542e-06, + "loss": 3.025, + "step": 74610 + }, + { + "epoch": 5.069642614485664, + "grad_norm": 6.164207458496094, + "learning_rate": 3.6656644924582146e-06, + "loss": 2.8389, + "step": 74615 + }, + { + "epoch": 5.069982334556325, + "grad_norm": 6.666715621948242, + "learning_rate": 3.6652398423698874e-06, + "loss": 2.8704, + "step": 74620 + }, + { + "epoch": 5.070322054626987, + "grad_norm": 7.868244647979736, + "learning_rate": 3.66481519228156e-06, + "loss": 2.5526, + "step": 74625 + }, + { + "epoch": 5.0706617746976494, + "grad_norm": 8.509115219116211, + "learning_rate": 3.664390542193233e-06, + "loss": 2.6918, + "step": 74630 + }, + { + "epoch": 5.071001494768311, + "grad_norm": 8.839675903320312, + "learning_rate": 3.663965892104906e-06, + "loss": 2.7859, + "step": 74635 + }, + { + "epoch": 5.071341214838973, + "grad_norm": 7.346179485321045, + "learning_rate": 3.663541242016579e-06, + "loss": 2.852, + "step": 74640 + }, + { + "epoch": 5.071680934909635, + "grad_norm": 8.750874519348145, + "learning_rate": 3.6631165919282514e-06, + "loss": 2.6451, + "step": 74645 + }, + { + "epoch": 5.072020654980296, + "grad_norm": 7.151185035705566, + "learning_rate": 3.662691941839924e-06, + "loss": 2.7705, + "step": 74650 + }, + { + "epoch": 5.072360375050958, + "grad_norm": 6.643683910369873, + "learning_rate": 3.6622672917515974e-06, + "loss": 2.759, + "step": 74655 + }, + { + "epoch": 5.07270009512162, + "grad_norm": 9.681907653808594, + "learning_rate": 3.66184264166327e-06, + "loss": 2.7286, + "step": 74660 + }, + { + "epoch": 5.073039815192281, + "grad_norm": 6.556301116943359, + "learning_rate": 3.6614179915749426e-06, + "loss": 2.9365, + "step": 74665 + }, + { + "epoch": 5.073379535262943, + "grad_norm": 7.123031139373779, + "learning_rate": 3.6609933414866154e-06, + "loss": 3.1663, + "step": 74670 + }, + { + "epoch": 5.0737192553336055, + "grad_norm": 8.602259635925293, + "learning_rate": 3.6605686913982878e-06, + "loss": 2.6722, + "step": 74675 + }, + { + "epoch": 5.074058975404267, + "grad_norm": 6.9023332595825195, + "learning_rate": 3.660144041309961e-06, + "loss": 2.7677, + "step": 74680 + }, + { + "epoch": 5.074398695474929, + "grad_norm": 7.646767616271973, + "learning_rate": 3.659719391221634e-06, + "loss": 2.936, + "step": 74685 + }, + { + "epoch": 5.074738415545591, + "grad_norm": 7.251730442047119, + "learning_rate": 3.659294741133306e-06, + "loss": 2.5323, + "step": 74690 + }, + { + "epoch": 5.075078135616252, + "grad_norm": 8.497015953063965, + "learning_rate": 3.6588700910449794e-06, + "loss": 2.6333, + "step": 74695 + }, + { + "epoch": 5.075417855686914, + "grad_norm": 5.800334930419922, + "learning_rate": 3.6584454409566522e-06, + "loss": 2.7771, + "step": 74700 + }, + { + "epoch": 5.075757575757576, + "grad_norm": 7.991626262664795, + "learning_rate": 3.6580207908683246e-06, + "loss": 2.5482, + "step": 74705 + }, + { + "epoch": 5.076097295828237, + "grad_norm": 7.619595527648926, + "learning_rate": 3.6575961407799974e-06, + "loss": 2.708, + "step": 74710 + }, + { + "epoch": 5.076437015898899, + "grad_norm": 8.660513877868652, + "learning_rate": 3.6571714906916706e-06, + "loss": 2.8792, + "step": 74715 + }, + { + "epoch": 5.0767767359695615, + "grad_norm": 7.692246913909912, + "learning_rate": 3.656746840603343e-06, + "loss": 2.885, + "step": 74720 + }, + { + "epoch": 5.077116456040223, + "grad_norm": 7.7670440673828125, + "learning_rate": 3.656322190515016e-06, + "loss": 2.7658, + "step": 74725 + }, + { + "epoch": 5.077456176110885, + "grad_norm": 7.507140636444092, + "learning_rate": 3.655897540426689e-06, + "loss": 2.9044, + "step": 74730 + }, + { + "epoch": 5.077795896181547, + "grad_norm": 7.2814717292785645, + "learning_rate": 3.6554728903383614e-06, + "loss": 2.6556, + "step": 74735 + }, + { + "epoch": 5.078135616252208, + "grad_norm": 10.38766098022461, + "learning_rate": 3.655048240250034e-06, + "loss": 2.7803, + "step": 74740 + }, + { + "epoch": 5.07847533632287, + "grad_norm": 8.64825439453125, + "learning_rate": 3.654623590161707e-06, + "loss": 2.6378, + "step": 74745 + }, + { + "epoch": 5.078815056393532, + "grad_norm": 7.332557201385498, + "learning_rate": 3.65419894007338e-06, + "loss": 2.6841, + "step": 74750 + }, + { + "epoch": 5.079154776464193, + "grad_norm": 7.285817623138428, + "learning_rate": 3.6537742899850526e-06, + "loss": 2.9401, + "step": 74755 + }, + { + "epoch": 5.079494496534855, + "grad_norm": 7.920439720153809, + "learning_rate": 3.6533496398967254e-06, + "loss": 2.8112, + "step": 74760 + }, + { + "epoch": 5.0798342166055175, + "grad_norm": 8.627187728881836, + "learning_rate": 3.6529249898083978e-06, + "loss": 2.823, + "step": 74765 + }, + { + "epoch": 5.080173936676179, + "grad_norm": 6.632467269897461, + "learning_rate": 3.652500339720071e-06, + "loss": 2.8837, + "step": 74770 + }, + { + "epoch": 5.080513656746841, + "grad_norm": 7.329732894897461, + "learning_rate": 3.652075689631744e-06, + "loss": 2.9227, + "step": 74775 + }, + { + "epoch": 5.080853376817503, + "grad_norm": 7.347278594970703, + "learning_rate": 3.651651039543416e-06, + "loss": 2.6448, + "step": 74780 + }, + { + "epoch": 5.081193096888164, + "grad_norm": 6.9138054847717285, + "learning_rate": 3.6512263894550894e-06, + "loss": 2.5462, + "step": 74785 + }, + { + "epoch": 5.081532816958826, + "grad_norm": 7.797292232513428, + "learning_rate": 3.650801739366762e-06, + "loss": 2.7602, + "step": 74790 + }, + { + "epoch": 5.081872537029487, + "grad_norm": 7.722536087036133, + "learning_rate": 3.6503770892784346e-06, + "loss": 2.82, + "step": 74795 + }, + { + "epoch": 5.082212257100149, + "grad_norm": 6.658084392547607, + "learning_rate": 3.6499524391901074e-06, + "loss": 2.7104, + "step": 74800 + }, + { + "epoch": 5.082551977170811, + "grad_norm": 6.265371322631836, + "learning_rate": 3.6495277891017806e-06, + "loss": 2.6091, + "step": 74805 + }, + { + "epoch": 5.082891697241473, + "grad_norm": 8.16205883026123, + "learning_rate": 3.6491031390134534e-06, + "loss": 2.9013, + "step": 74810 + }, + { + "epoch": 5.083231417312135, + "grad_norm": 8.075675010681152, + "learning_rate": 3.6486784889251258e-06, + "loss": 2.8008, + "step": 74815 + }, + { + "epoch": 5.083571137382797, + "grad_norm": 7.3262810707092285, + "learning_rate": 3.648253838836799e-06, + "loss": 2.8056, + "step": 74820 + }, + { + "epoch": 5.083910857453458, + "grad_norm": 8.018720626831055, + "learning_rate": 3.647829188748472e-06, + "loss": 2.9068, + "step": 74825 + }, + { + "epoch": 5.08425057752412, + "grad_norm": 7.52132511138916, + "learning_rate": 3.647404538660144e-06, + "loss": 2.8982, + "step": 74830 + }, + { + "epoch": 5.084590297594782, + "grad_norm": 6.248065948486328, + "learning_rate": 3.646979888571817e-06, + "loss": 2.7972, + "step": 74835 + }, + { + "epoch": 5.084930017665443, + "grad_norm": 8.414071083068848, + "learning_rate": 3.6465552384834902e-06, + "loss": 2.7447, + "step": 74840 + }, + { + "epoch": 5.085269737736105, + "grad_norm": 7.554330825805664, + "learning_rate": 3.6461305883951626e-06, + "loss": 3.0614, + "step": 74845 + }, + { + "epoch": 5.085609457806767, + "grad_norm": 6.504989147186279, + "learning_rate": 3.6457059383068354e-06, + "loss": 2.8425, + "step": 74850 + }, + { + "epoch": 5.085949177877429, + "grad_norm": 7.363525867462158, + "learning_rate": 3.6452812882185086e-06, + "loss": 2.6288, + "step": 74855 + }, + { + "epoch": 5.086288897948091, + "grad_norm": 6.241554260253906, + "learning_rate": 3.644856638130181e-06, + "loss": 2.7982, + "step": 74860 + }, + { + "epoch": 5.086628618018753, + "grad_norm": 7.273104190826416, + "learning_rate": 3.644431988041854e-06, + "loss": 2.7684, + "step": 74865 + }, + { + "epoch": 5.086968338089414, + "grad_norm": 8.131708145141602, + "learning_rate": 3.6440073379535266e-06, + "loss": 2.5852, + "step": 74870 + }, + { + "epoch": 5.087308058160076, + "grad_norm": 9.666478157043457, + "learning_rate": 3.6435826878651994e-06, + "loss": 2.7458, + "step": 74875 + }, + { + "epoch": 5.087647778230738, + "grad_norm": 6.561551570892334, + "learning_rate": 3.643158037776872e-06, + "loss": 2.9189, + "step": 74880 + }, + { + "epoch": 5.087987498301399, + "grad_norm": 8.230778694152832, + "learning_rate": 3.642733387688545e-06, + "loss": 2.6448, + "step": 74885 + }, + { + "epoch": 5.088327218372061, + "grad_norm": 7.7142534255981445, + "learning_rate": 3.6423087376002174e-06, + "loss": 2.6124, + "step": 74890 + }, + { + "epoch": 5.088666938442723, + "grad_norm": 6.256820201873779, + "learning_rate": 3.6418840875118906e-06, + "loss": 2.5132, + "step": 74895 + }, + { + "epoch": 5.089006658513385, + "grad_norm": 9.249774932861328, + "learning_rate": 3.6414594374235634e-06, + "loss": 2.919, + "step": 74900 + }, + { + "epoch": 5.089346378584047, + "grad_norm": 6.974494934082031, + "learning_rate": 3.6410347873352358e-06, + "loss": 2.7376, + "step": 74905 + }, + { + "epoch": 5.089686098654709, + "grad_norm": 8.942278861999512, + "learning_rate": 3.640610137246909e-06, + "loss": 2.5947, + "step": 74910 + }, + { + "epoch": 5.09002581872537, + "grad_norm": 6.850768089294434, + "learning_rate": 3.640185487158582e-06, + "loss": 2.7673, + "step": 74915 + }, + { + "epoch": 5.090365538796032, + "grad_norm": 6.134953498840332, + "learning_rate": 3.639760837070254e-06, + "loss": 2.7498, + "step": 74920 + }, + { + "epoch": 5.090705258866694, + "grad_norm": 7.740884304046631, + "learning_rate": 3.639336186981927e-06, + "loss": 2.6341, + "step": 74925 + }, + { + "epoch": 5.091044978937355, + "grad_norm": 6.919931888580322, + "learning_rate": 3.6389115368936e-06, + "loss": 2.7618, + "step": 74930 + }, + { + "epoch": 5.091384699008017, + "grad_norm": 6.461601257324219, + "learning_rate": 3.6384868868052726e-06, + "loss": 2.7431, + "step": 74935 + }, + { + "epoch": 5.0917244190786795, + "grad_norm": 7.5809550285339355, + "learning_rate": 3.6380622367169454e-06, + "loss": 3.0026, + "step": 74940 + }, + { + "epoch": 5.092064139149341, + "grad_norm": 7.758588790893555, + "learning_rate": 3.6376375866286186e-06, + "loss": 2.6357, + "step": 74945 + }, + { + "epoch": 5.092403859220003, + "grad_norm": 7.699160099029541, + "learning_rate": 3.637212936540291e-06, + "loss": 2.6267, + "step": 74950 + }, + { + "epoch": 5.092743579290665, + "grad_norm": 6.656247138977051, + "learning_rate": 3.6367882864519638e-06, + "loss": 2.9517, + "step": 74955 + }, + { + "epoch": 5.093083299361326, + "grad_norm": 7.348042964935303, + "learning_rate": 3.6363636363636366e-06, + "loss": 2.734, + "step": 74960 + }, + { + "epoch": 5.093423019431988, + "grad_norm": 5.972877025604248, + "learning_rate": 3.635938986275309e-06, + "loss": 2.8483, + "step": 74965 + }, + { + "epoch": 5.09376273950265, + "grad_norm": 6.967957496643066, + "learning_rate": 3.635514336186982e-06, + "loss": 2.9478, + "step": 74970 + }, + { + "epoch": 5.094102459573311, + "grad_norm": 8.729045867919922, + "learning_rate": 3.635089686098655e-06, + "loss": 2.8754, + "step": 74975 + }, + { + "epoch": 5.094442179643973, + "grad_norm": 8.882508277893066, + "learning_rate": 3.634665036010328e-06, + "loss": 2.6919, + "step": 74980 + }, + { + "epoch": 5.0947818997146355, + "grad_norm": 8.351875305175781, + "learning_rate": 3.6342403859220006e-06, + "loss": 2.908, + "step": 74985 + }, + { + "epoch": 5.095121619785297, + "grad_norm": 9.774331092834473, + "learning_rate": 3.6338157358336734e-06, + "loss": 2.9639, + "step": 74990 + }, + { + "epoch": 5.095461339855959, + "grad_norm": 6.0857930183410645, + "learning_rate": 3.633391085745346e-06, + "loss": 3.0762, + "step": 74995 + }, + { + "epoch": 5.095801059926621, + "grad_norm": 7.714611530303955, + "learning_rate": 3.632966435657019e-06, + "loss": 3.0, + "step": 75000 + }, + { + "epoch": 5.096140779997282, + "grad_norm": 6.804567337036133, + "learning_rate": 3.632541785568692e-06, + "loss": 2.6421, + "step": 75005 + }, + { + "epoch": 5.096480500067944, + "grad_norm": 7.606232643127441, + "learning_rate": 3.6321171354803646e-06, + "loss": 3.0061, + "step": 75010 + }, + { + "epoch": 5.096820220138606, + "grad_norm": 6.7516069412231445, + "learning_rate": 3.631692485392037e-06, + "loss": 2.9334, + "step": 75015 + }, + { + "epoch": 5.097159940209267, + "grad_norm": 10.325407028198242, + "learning_rate": 3.63126783530371e-06, + "loss": 2.7736, + "step": 75020 + }, + { + "epoch": 5.097499660279929, + "grad_norm": 6.40892219543457, + "learning_rate": 3.630843185215383e-06, + "loss": 2.9299, + "step": 75025 + }, + { + "epoch": 5.0978393803505915, + "grad_norm": 7.697464466094971, + "learning_rate": 3.6304185351270554e-06, + "loss": 2.7825, + "step": 75030 + }, + { + "epoch": 5.098179100421253, + "grad_norm": 8.272195816040039, + "learning_rate": 3.6299938850387286e-06, + "loss": 2.7284, + "step": 75035 + }, + { + "epoch": 5.098518820491915, + "grad_norm": 6.183879375457764, + "learning_rate": 3.6295692349504014e-06, + "loss": 2.7918, + "step": 75040 + }, + { + "epoch": 5.098858540562577, + "grad_norm": 6.044052600860596, + "learning_rate": 3.6291445848620738e-06, + "loss": 2.9038, + "step": 75045 + }, + { + "epoch": 5.099198260633238, + "grad_norm": 8.363934516906738, + "learning_rate": 3.6287199347737466e-06, + "loss": 2.5248, + "step": 75050 + }, + { + "epoch": 5.0995379807039, + "grad_norm": 9.52977180480957, + "learning_rate": 3.62829528468542e-06, + "loss": 2.5913, + "step": 75055 + }, + { + "epoch": 5.099877700774562, + "grad_norm": 6.589165687561035, + "learning_rate": 3.627870634597092e-06, + "loss": 2.8702, + "step": 75060 + }, + { + "epoch": 5.100217420845223, + "grad_norm": 6.2389326095581055, + "learning_rate": 3.627445984508765e-06, + "loss": 2.6875, + "step": 75065 + }, + { + "epoch": 5.100557140915885, + "grad_norm": 8.23326301574707, + "learning_rate": 3.627021334420438e-06, + "loss": 2.9022, + "step": 75070 + }, + { + "epoch": 5.1008968609865475, + "grad_norm": 8.056761741638184, + "learning_rate": 3.6265966843321106e-06, + "loss": 2.7855, + "step": 75075 + }, + { + "epoch": 5.101236581057209, + "grad_norm": 7.230251789093018, + "learning_rate": 3.6261720342437834e-06, + "loss": 2.9526, + "step": 75080 + }, + { + "epoch": 5.101576301127871, + "grad_norm": 9.109784126281738, + "learning_rate": 3.625747384155456e-06, + "loss": 2.6709, + "step": 75085 + }, + { + "epoch": 5.101916021198533, + "grad_norm": 7.962153911590576, + "learning_rate": 3.6253227340671286e-06, + "loss": 3.0049, + "step": 75090 + }, + { + "epoch": 5.102255741269194, + "grad_norm": 8.491347312927246, + "learning_rate": 3.6248980839788018e-06, + "loss": 2.7801, + "step": 75095 + }, + { + "epoch": 5.102595461339856, + "grad_norm": 6.591828346252441, + "learning_rate": 3.6244734338904746e-06, + "loss": 2.7529, + "step": 75100 + }, + { + "epoch": 5.102935181410518, + "grad_norm": 8.543539047241211, + "learning_rate": 3.624048783802147e-06, + "loss": 2.8203, + "step": 75105 + }, + { + "epoch": 5.103274901481179, + "grad_norm": 8.616044998168945, + "learning_rate": 3.62362413371382e-06, + "loss": 2.7498, + "step": 75110 + }, + { + "epoch": 5.103614621551841, + "grad_norm": 6.348686695098877, + "learning_rate": 3.623199483625493e-06, + "loss": 2.7593, + "step": 75115 + }, + { + "epoch": 5.103954341622503, + "grad_norm": 7.395886421203613, + "learning_rate": 3.6227748335371654e-06, + "loss": 2.7914, + "step": 75120 + }, + { + "epoch": 5.104294061693165, + "grad_norm": 7.538263320922852, + "learning_rate": 3.6223501834488386e-06, + "loss": 2.6748, + "step": 75125 + }, + { + "epoch": 5.104633781763827, + "grad_norm": 7.367254734039307, + "learning_rate": 3.6219255333605114e-06, + "loss": 2.7273, + "step": 75130 + }, + { + "epoch": 5.104973501834488, + "grad_norm": 10.123950004577637, + "learning_rate": 3.6215008832721838e-06, + "loss": 2.7436, + "step": 75135 + }, + { + "epoch": 5.10531322190515, + "grad_norm": 9.118555068969727, + "learning_rate": 3.6210762331838566e-06, + "loss": 2.5575, + "step": 75140 + }, + { + "epoch": 5.105652941975812, + "grad_norm": 7.320741176605225, + "learning_rate": 3.62065158309553e-06, + "loss": 2.5256, + "step": 75145 + }, + { + "epoch": 5.105992662046473, + "grad_norm": 8.299802780151367, + "learning_rate": 3.6202269330072026e-06, + "loss": 2.9216, + "step": 75150 + }, + { + "epoch": 5.106332382117135, + "grad_norm": 7.7880048751831055, + "learning_rate": 3.619802282918875e-06, + "loss": 2.8563, + "step": 75155 + }, + { + "epoch": 5.106672102187797, + "grad_norm": 11.203071594238281, + "learning_rate": 3.619377632830548e-06, + "loss": 2.6559, + "step": 75160 + }, + { + "epoch": 5.107011822258459, + "grad_norm": 8.375679016113281, + "learning_rate": 3.618952982742221e-06, + "loss": 3.0878, + "step": 75165 + }, + { + "epoch": 5.107351542329121, + "grad_norm": 5.548704147338867, + "learning_rate": 3.6185283326538934e-06, + "loss": 2.5517, + "step": 75170 + }, + { + "epoch": 5.107691262399783, + "grad_norm": 6.492560386657715, + "learning_rate": 3.618103682565566e-06, + "loss": 2.7677, + "step": 75175 + }, + { + "epoch": 5.108030982470444, + "grad_norm": 7.165373802185059, + "learning_rate": 3.6176790324772394e-06, + "loss": 2.6338, + "step": 75180 + }, + { + "epoch": 5.108370702541106, + "grad_norm": 8.4487886428833, + "learning_rate": 3.6172543823889118e-06, + "loss": 2.7898, + "step": 75185 + }, + { + "epoch": 5.108710422611768, + "grad_norm": 6.230859756469727, + "learning_rate": 3.6168297323005846e-06, + "loss": 2.799, + "step": 75190 + }, + { + "epoch": 5.109050142682429, + "grad_norm": 8.77225112915039, + "learning_rate": 3.616405082212258e-06, + "loss": 2.7943, + "step": 75195 + }, + { + "epoch": 5.109389862753091, + "grad_norm": 8.831985473632812, + "learning_rate": 3.61598043212393e-06, + "loss": 2.8184, + "step": 75200 + }, + { + "epoch": 5.1097295828237534, + "grad_norm": 6.4608378410339355, + "learning_rate": 3.615555782035603e-06, + "loss": 2.8442, + "step": 75205 + }, + { + "epoch": 5.110069302894415, + "grad_norm": 6.742042541503906, + "learning_rate": 3.6151311319472758e-06, + "loss": 2.9034, + "step": 75210 + }, + { + "epoch": 5.110409022965077, + "grad_norm": 8.31653118133545, + "learning_rate": 3.614706481858948e-06, + "loss": 2.7022, + "step": 75215 + }, + { + "epoch": 5.110748743035739, + "grad_norm": 7.77963924407959, + "learning_rate": 3.6142818317706214e-06, + "loss": 2.7673, + "step": 75220 + }, + { + "epoch": 5.1110884631064, + "grad_norm": 6.946410179138184, + "learning_rate": 3.613857181682294e-06, + "loss": 2.9448, + "step": 75225 + }, + { + "epoch": 5.111428183177062, + "grad_norm": 8.045173645019531, + "learning_rate": 3.6134325315939666e-06, + "loss": 2.9545, + "step": 75230 + }, + { + "epoch": 5.111767903247724, + "grad_norm": 6.8428053855896, + "learning_rate": 3.6130078815056398e-06, + "loss": 2.7834, + "step": 75235 + }, + { + "epoch": 5.112107623318385, + "grad_norm": 8.065154075622559, + "learning_rate": 3.6125832314173126e-06, + "loss": 2.7604, + "step": 75240 + }, + { + "epoch": 5.112447343389047, + "grad_norm": 8.685958862304688, + "learning_rate": 3.612158581328985e-06, + "loss": 2.6714, + "step": 75245 + }, + { + "epoch": 5.1127870634597095, + "grad_norm": 9.301458358764648, + "learning_rate": 3.6117339312406578e-06, + "loss": 2.9559, + "step": 75250 + }, + { + "epoch": 5.113126783530371, + "grad_norm": 3.949446439743042, + "learning_rate": 3.611309281152331e-06, + "loss": 2.4136, + "step": 75255 + }, + { + "epoch": 5.113466503601033, + "grad_norm": 7.040449142456055, + "learning_rate": 3.6108846310640034e-06, + "loss": 2.6723, + "step": 75260 + }, + { + "epoch": 5.113806223671695, + "grad_norm": 7.303028583526611, + "learning_rate": 3.610459980975676e-06, + "loss": 2.8746, + "step": 75265 + }, + { + "epoch": 5.114145943742356, + "grad_norm": 6.678990364074707, + "learning_rate": 3.6100353308873494e-06, + "loss": 2.7655, + "step": 75270 + }, + { + "epoch": 5.114485663813018, + "grad_norm": 7.723365783691406, + "learning_rate": 3.6096106807990218e-06, + "loss": 2.7284, + "step": 75275 + }, + { + "epoch": 5.11482538388368, + "grad_norm": 7.660174369812012, + "learning_rate": 3.6091860307106946e-06, + "loss": 2.841, + "step": 75280 + }, + { + "epoch": 5.115165103954341, + "grad_norm": 9.774785041809082, + "learning_rate": 3.6087613806223678e-06, + "loss": 2.8511, + "step": 75285 + }, + { + "epoch": 5.115504824025003, + "grad_norm": 8.470609664916992, + "learning_rate": 3.60833673053404e-06, + "loss": 2.6763, + "step": 75290 + }, + { + "epoch": 5.1158445440956655, + "grad_norm": 9.374637603759766, + "learning_rate": 3.607912080445713e-06, + "loss": 2.8918, + "step": 75295 + }, + { + "epoch": 5.116184264166327, + "grad_norm": 8.465057373046875, + "learning_rate": 3.6074874303573858e-06, + "loss": 2.8079, + "step": 75300 + }, + { + "epoch": 5.116523984236989, + "grad_norm": 9.010817527770996, + "learning_rate": 3.607062780269058e-06, + "loss": 2.4792, + "step": 75305 + }, + { + "epoch": 5.116863704307651, + "grad_norm": 6.96035623550415, + "learning_rate": 3.6066381301807314e-06, + "loss": 2.9324, + "step": 75310 + }, + { + "epoch": 5.117203424378312, + "grad_norm": 8.450922966003418, + "learning_rate": 3.606213480092404e-06, + "loss": 2.6849, + "step": 75315 + }, + { + "epoch": 5.117543144448974, + "grad_norm": 6.832557678222656, + "learning_rate": 3.6057888300040774e-06, + "loss": 2.857, + "step": 75320 + }, + { + "epoch": 5.117882864519636, + "grad_norm": 7.0380425453186035, + "learning_rate": 3.6053641799157498e-06, + "loss": 2.879, + "step": 75325 + }, + { + "epoch": 5.118222584590297, + "grad_norm": 8.228437423706055, + "learning_rate": 3.6049395298274226e-06, + "loss": 2.8186, + "step": 75330 + }, + { + "epoch": 5.118562304660959, + "grad_norm": 8.966174125671387, + "learning_rate": 3.6045148797390954e-06, + "loss": 2.737, + "step": 75335 + }, + { + "epoch": 5.1189020247316215, + "grad_norm": 8.120325088500977, + "learning_rate": 3.6040902296507677e-06, + "loss": 2.735, + "step": 75340 + }, + { + "epoch": 5.119241744802283, + "grad_norm": 10.019984245300293, + "learning_rate": 3.603665579562441e-06, + "loss": 2.6742, + "step": 75345 + }, + { + "epoch": 5.119581464872945, + "grad_norm": 6.022406578063965, + "learning_rate": 3.6032409294741138e-06, + "loss": 3.0456, + "step": 75350 + }, + { + "epoch": 5.119921184943607, + "grad_norm": 7.642506122589111, + "learning_rate": 3.602816279385786e-06, + "loss": 2.6952, + "step": 75355 + }, + { + "epoch": 5.120260905014268, + "grad_norm": 7.429193019866943, + "learning_rate": 3.6023916292974594e-06, + "loss": 2.6418, + "step": 75360 + }, + { + "epoch": 5.12060062508493, + "grad_norm": 7.546069622039795, + "learning_rate": 3.601966979209132e-06, + "loss": 2.8467, + "step": 75365 + }, + { + "epoch": 5.120940345155592, + "grad_norm": 10.214181900024414, + "learning_rate": 3.6015423291208046e-06, + "loss": 2.8735, + "step": 75370 + }, + { + "epoch": 5.121280065226253, + "grad_norm": 8.704825401306152, + "learning_rate": 3.6011176790324774e-06, + "loss": 2.761, + "step": 75375 + }, + { + "epoch": 5.121619785296915, + "grad_norm": 8.389822006225586, + "learning_rate": 3.6006930289441506e-06, + "loss": 2.7894, + "step": 75380 + }, + { + "epoch": 5.1219595053675775, + "grad_norm": 8.063505172729492, + "learning_rate": 3.600268378855823e-06, + "loss": 2.5357, + "step": 75385 + }, + { + "epoch": 5.122299225438239, + "grad_norm": 6.631052017211914, + "learning_rate": 3.5998437287674958e-06, + "loss": 2.5716, + "step": 75390 + }, + { + "epoch": 5.122638945508901, + "grad_norm": 8.542157173156738, + "learning_rate": 3.599419078679169e-06, + "loss": 2.6402, + "step": 75395 + }, + { + "epoch": 5.122978665579563, + "grad_norm": 7.277540683746338, + "learning_rate": 3.5989944285908414e-06, + "loss": 2.7707, + "step": 75400 + }, + { + "epoch": 5.123318385650224, + "grad_norm": 7.780378818511963, + "learning_rate": 3.598569778502514e-06, + "loss": 2.7802, + "step": 75405 + }, + { + "epoch": 5.123658105720886, + "grad_norm": 8.356101036071777, + "learning_rate": 3.5981451284141874e-06, + "loss": 2.8064, + "step": 75410 + }, + { + "epoch": 5.123997825791548, + "grad_norm": 8.57731819152832, + "learning_rate": 3.5977204783258598e-06, + "loss": 2.7318, + "step": 75415 + }, + { + "epoch": 5.124337545862209, + "grad_norm": 6.399848461151123, + "learning_rate": 3.5972958282375326e-06, + "loss": 2.7372, + "step": 75420 + }, + { + "epoch": 5.124677265932871, + "grad_norm": 6.923789978027344, + "learning_rate": 3.5968711781492054e-06, + "loss": 2.6124, + "step": 75425 + }, + { + "epoch": 5.1250169860035335, + "grad_norm": 6.8565802574157715, + "learning_rate": 3.5964465280608777e-06, + "loss": 2.9148, + "step": 75430 + }, + { + "epoch": 5.125356706074195, + "grad_norm": 8.857308387756348, + "learning_rate": 3.596021877972551e-06, + "loss": 2.8113, + "step": 75435 + }, + { + "epoch": 5.125696426144857, + "grad_norm": 8.375158309936523, + "learning_rate": 3.5955972278842238e-06, + "loss": 2.7532, + "step": 75440 + }, + { + "epoch": 5.126036146215519, + "grad_norm": 7.835436820983887, + "learning_rate": 3.595172577795896e-06, + "loss": 3.0594, + "step": 75445 + }, + { + "epoch": 5.12637586628618, + "grad_norm": 7.414833068847656, + "learning_rate": 3.5947479277075694e-06, + "loss": 2.813, + "step": 75450 + }, + { + "epoch": 5.126715586356842, + "grad_norm": 7.084911823272705, + "learning_rate": 3.594323277619242e-06, + "loss": 3.0263, + "step": 75455 + }, + { + "epoch": 5.127055306427504, + "grad_norm": 9.17391300201416, + "learning_rate": 3.5938986275309145e-06, + "loss": 2.6716, + "step": 75460 + }, + { + "epoch": 5.127395026498165, + "grad_norm": 8.844358444213867, + "learning_rate": 3.5934739774425873e-06, + "loss": 2.7649, + "step": 75465 + }, + { + "epoch": 5.127734746568827, + "grad_norm": 8.339046478271484, + "learning_rate": 3.5930493273542606e-06, + "loss": 2.7239, + "step": 75470 + }, + { + "epoch": 5.1280744666394895, + "grad_norm": 6.348814010620117, + "learning_rate": 3.592624677265933e-06, + "loss": 2.3671, + "step": 75475 + }, + { + "epoch": 5.128414186710151, + "grad_norm": 9.979347229003906, + "learning_rate": 3.5922000271776057e-06, + "loss": 2.8228, + "step": 75480 + }, + { + "epoch": 5.128753906780813, + "grad_norm": 10.145583152770996, + "learning_rate": 3.591775377089279e-06, + "loss": 2.7497, + "step": 75485 + }, + { + "epoch": 5.129093626851474, + "grad_norm": 6.1513495445251465, + "learning_rate": 3.5913507270009518e-06, + "loss": 2.8359, + "step": 75490 + }, + { + "epoch": 5.129433346922136, + "grad_norm": 7.407793998718262, + "learning_rate": 3.590926076912624e-06, + "loss": 2.8209, + "step": 75495 + }, + { + "epoch": 5.129773066992798, + "grad_norm": 6.088146686553955, + "learning_rate": 3.590501426824297e-06, + "loss": 2.8532, + "step": 75500 + }, + { + "epoch": 5.130112787063459, + "grad_norm": 7.074132442474365, + "learning_rate": 3.59007677673597e-06, + "loss": 2.7206, + "step": 75505 + }, + { + "epoch": 5.130452507134121, + "grad_norm": 8.196124076843262, + "learning_rate": 3.5896521266476426e-06, + "loss": 2.7266, + "step": 75510 + }, + { + "epoch": 5.1307922272047835, + "grad_norm": 8.640053749084473, + "learning_rate": 3.5892274765593154e-06, + "loss": 3.0468, + "step": 75515 + }, + { + "epoch": 5.131131947275445, + "grad_norm": 8.792655944824219, + "learning_rate": 3.5888028264709886e-06, + "loss": 2.7232, + "step": 75520 + }, + { + "epoch": 5.131471667346107, + "grad_norm": 7.632265090942383, + "learning_rate": 3.588378176382661e-06, + "loss": 2.7259, + "step": 75525 + }, + { + "epoch": 5.131811387416769, + "grad_norm": 6.742375373840332, + "learning_rate": 3.5879535262943338e-06, + "loss": 2.5442, + "step": 75530 + }, + { + "epoch": 5.13215110748743, + "grad_norm": 6.976895332336426, + "learning_rate": 3.5875288762060066e-06, + "loss": 2.6772, + "step": 75535 + }, + { + "epoch": 5.132490827558092, + "grad_norm": 8.58842945098877, + "learning_rate": 3.5871042261176794e-06, + "loss": 2.8587, + "step": 75540 + }, + { + "epoch": 5.132830547628754, + "grad_norm": 5.8106608390808105, + "learning_rate": 3.586679576029352e-06, + "loss": 3.1186, + "step": 75545 + }, + { + "epoch": 5.133170267699415, + "grad_norm": 8.392135620117188, + "learning_rate": 3.586254925941025e-06, + "loss": 2.6294, + "step": 75550 + }, + { + "epoch": 5.133509987770077, + "grad_norm": 6.737679481506348, + "learning_rate": 3.5858302758526973e-06, + "loss": 2.7678, + "step": 75555 + }, + { + "epoch": 5.1338497078407395, + "grad_norm": 8.648784637451172, + "learning_rate": 3.5854056257643706e-06, + "loss": 2.8612, + "step": 75560 + }, + { + "epoch": 5.134189427911401, + "grad_norm": 7.197586536407471, + "learning_rate": 3.5849809756760434e-06, + "loss": 2.8912, + "step": 75565 + }, + { + "epoch": 5.134529147982063, + "grad_norm": 7.972540855407715, + "learning_rate": 3.5845563255877157e-06, + "loss": 2.7371, + "step": 75570 + }, + { + "epoch": 5.134868868052725, + "grad_norm": 7.660970211029053, + "learning_rate": 3.584131675499389e-06, + "loss": 2.6768, + "step": 75575 + }, + { + "epoch": 5.135208588123386, + "grad_norm": 7.928661823272705, + "learning_rate": 3.5837070254110618e-06, + "loss": 2.8234, + "step": 75580 + }, + { + "epoch": 5.135548308194048, + "grad_norm": 5.937446117401123, + "learning_rate": 3.583282375322734e-06, + "loss": 2.9479, + "step": 75585 + }, + { + "epoch": 5.13588802826471, + "grad_norm": 9.942023277282715, + "learning_rate": 3.582857725234407e-06, + "loss": 2.6128, + "step": 75590 + }, + { + "epoch": 5.136227748335371, + "grad_norm": 6.89783239364624, + "learning_rate": 3.58243307514608e-06, + "loss": 2.7582, + "step": 75595 + }, + { + "epoch": 5.136567468406033, + "grad_norm": 8.00713062286377, + "learning_rate": 3.5820084250577525e-06, + "loss": 2.7452, + "step": 75600 + }, + { + "epoch": 5.1369071884766955, + "grad_norm": 8.843347549438477, + "learning_rate": 3.5815837749694253e-06, + "loss": 2.9478, + "step": 75605 + }, + { + "epoch": 5.137246908547357, + "grad_norm": 6.875868320465088, + "learning_rate": 3.5811591248810986e-06, + "loss": 3.0306, + "step": 75610 + }, + { + "epoch": 5.137586628618019, + "grad_norm": 7.1522064208984375, + "learning_rate": 3.580734474792771e-06, + "loss": 2.9408, + "step": 75615 + }, + { + "epoch": 5.137926348688681, + "grad_norm": 8.16013240814209, + "learning_rate": 3.5803098247044437e-06, + "loss": 3.1095, + "step": 75620 + }, + { + "epoch": 5.138266068759342, + "grad_norm": 8.271356582641602, + "learning_rate": 3.5798851746161165e-06, + "loss": 2.9139, + "step": 75625 + }, + { + "epoch": 5.138605788830004, + "grad_norm": 9.05181884765625, + "learning_rate": 3.5794605245277893e-06, + "loss": 2.8703, + "step": 75630 + }, + { + "epoch": 5.138945508900666, + "grad_norm": 7.049115180969238, + "learning_rate": 3.579035874439462e-06, + "loss": 2.8791, + "step": 75635 + }, + { + "epoch": 5.139285228971327, + "grad_norm": 7.0922346115112305, + "learning_rate": 3.578611224351135e-06, + "loss": 2.4912, + "step": 75640 + }, + { + "epoch": 5.139624949041989, + "grad_norm": 6.799310207366943, + "learning_rate": 3.5781865742628073e-06, + "loss": 2.6989, + "step": 75645 + }, + { + "epoch": 5.1399646691126515, + "grad_norm": 7.474533557891846, + "learning_rate": 3.5777619241744805e-06, + "loss": 2.8795, + "step": 75650 + }, + { + "epoch": 5.140304389183313, + "grad_norm": 7.690563201904297, + "learning_rate": 3.5773372740861534e-06, + "loss": 2.9, + "step": 75655 + }, + { + "epoch": 5.140644109253975, + "grad_norm": 6.871948719024658, + "learning_rate": 3.576912623997826e-06, + "loss": 2.6085, + "step": 75660 + }, + { + "epoch": 5.140983829324637, + "grad_norm": 6.848483085632324, + "learning_rate": 3.576487973909499e-06, + "loss": 2.7681, + "step": 75665 + }, + { + "epoch": 5.141323549395298, + "grad_norm": 7.564730167388916, + "learning_rate": 3.5760633238211718e-06, + "loss": 2.7985, + "step": 75670 + }, + { + "epoch": 5.14166326946596, + "grad_norm": 7.281168460845947, + "learning_rate": 3.5756386737328446e-06, + "loss": 2.9858, + "step": 75675 + }, + { + "epoch": 5.142002989536622, + "grad_norm": 6.064858913421631, + "learning_rate": 3.575214023644517e-06, + "loss": 2.7039, + "step": 75680 + }, + { + "epoch": 5.142342709607283, + "grad_norm": 6.9628143310546875, + "learning_rate": 3.57478937355619e-06, + "loss": 2.8148, + "step": 75685 + }, + { + "epoch": 5.142682429677945, + "grad_norm": 7.664475440979004, + "learning_rate": 3.574364723467863e-06, + "loss": 2.719, + "step": 75690 + }, + { + "epoch": 5.1430221497486075, + "grad_norm": 8.475595474243164, + "learning_rate": 3.5739400733795353e-06, + "loss": 2.8042, + "step": 75695 + }, + { + "epoch": 5.143361869819269, + "grad_norm": 7.754055023193359, + "learning_rate": 3.5735154232912086e-06, + "loss": 2.9488, + "step": 75700 + }, + { + "epoch": 5.143701589889931, + "grad_norm": 7.364457607269287, + "learning_rate": 3.5730907732028814e-06, + "loss": 3.0641, + "step": 75705 + }, + { + "epoch": 5.144041309960593, + "grad_norm": 8.954045295715332, + "learning_rate": 3.5726661231145537e-06, + "loss": 2.8483, + "step": 75710 + }, + { + "epoch": 5.144381030031254, + "grad_norm": 7.8172688484191895, + "learning_rate": 3.5722414730262265e-06, + "loss": 2.785, + "step": 75715 + }, + { + "epoch": 5.144720750101916, + "grad_norm": 7.874024391174316, + "learning_rate": 3.5718168229378998e-06, + "loss": 2.9525, + "step": 75720 + }, + { + "epoch": 5.145060470172578, + "grad_norm": 6.9391889572143555, + "learning_rate": 3.571392172849572e-06, + "loss": 2.9009, + "step": 75725 + }, + { + "epoch": 5.145400190243239, + "grad_norm": 10.710660934448242, + "learning_rate": 3.570967522761245e-06, + "loss": 2.901, + "step": 75730 + }, + { + "epoch": 5.145739910313901, + "grad_norm": 7.466298580169678, + "learning_rate": 3.570542872672918e-06, + "loss": 2.645, + "step": 75735 + }, + { + "epoch": 5.1460796303845635, + "grad_norm": 6.398344039916992, + "learning_rate": 3.5701182225845905e-06, + "loss": 2.6187, + "step": 75740 + }, + { + "epoch": 5.146419350455225, + "grad_norm": 7.358253002166748, + "learning_rate": 3.5696935724962633e-06, + "loss": 2.6295, + "step": 75745 + }, + { + "epoch": 5.146759070525887, + "grad_norm": 7.819764614105225, + "learning_rate": 3.569268922407936e-06, + "loss": 2.908, + "step": 75750 + }, + { + "epoch": 5.147098790596549, + "grad_norm": 7.926208972930908, + "learning_rate": 3.5688442723196085e-06, + "loss": 2.7155, + "step": 75755 + }, + { + "epoch": 5.14743851066721, + "grad_norm": 9.024320602416992, + "learning_rate": 3.5684196222312817e-06, + "loss": 2.6503, + "step": 75760 + }, + { + "epoch": 5.147778230737872, + "grad_norm": 8.071686744689941, + "learning_rate": 3.5679949721429545e-06, + "loss": 3.0835, + "step": 75765 + }, + { + "epoch": 5.148117950808534, + "grad_norm": 8.617642402648926, + "learning_rate": 3.567570322054627e-06, + "loss": 2.8623, + "step": 75770 + }, + { + "epoch": 5.148457670879195, + "grad_norm": 8.782686233520508, + "learning_rate": 3.5671456719663e-06, + "loss": 2.8571, + "step": 75775 + }, + { + "epoch": 5.1487973909498574, + "grad_norm": 8.21315860748291, + "learning_rate": 3.566721021877973e-06, + "loss": 2.7176, + "step": 75780 + }, + { + "epoch": 5.1491371110205195, + "grad_norm": 10.776037216186523, + "learning_rate": 3.5662963717896453e-06, + "loss": 2.7297, + "step": 75785 + }, + { + "epoch": 5.149476831091181, + "grad_norm": 6.821892738342285, + "learning_rate": 3.5658717217013185e-06, + "loss": 2.8474, + "step": 75790 + }, + { + "epoch": 5.149816551161843, + "grad_norm": 7.196925640106201, + "learning_rate": 3.5654470716129913e-06, + "loss": 2.6064, + "step": 75795 + }, + { + "epoch": 5.150156271232504, + "grad_norm": 6.536879062652588, + "learning_rate": 3.5650224215246637e-06, + "loss": 2.8358, + "step": 75800 + }, + { + "epoch": 5.150495991303166, + "grad_norm": 7.621335506439209, + "learning_rate": 3.5645977714363365e-06, + "loss": 2.5281, + "step": 75805 + }, + { + "epoch": 5.150835711373828, + "grad_norm": 6.703457355499268, + "learning_rate": 3.5641731213480098e-06, + "loss": 2.9174, + "step": 75810 + }, + { + "epoch": 5.151175431444489, + "grad_norm": 8.674171447753906, + "learning_rate": 3.563748471259682e-06, + "loss": 2.6534, + "step": 75815 + }, + { + "epoch": 5.151515151515151, + "grad_norm": 8.37836742401123, + "learning_rate": 3.563323821171355e-06, + "loss": 2.7993, + "step": 75820 + }, + { + "epoch": 5.1518548715858135, + "grad_norm": 9.371787071228027, + "learning_rate": 3.562899171083028e-06, + "loss": 2.7908, + "step": 75825 + }, + { + "epoch": 5.152194591656475, + "grad_norm": 6.5950493812561035, + "learning_rate": 3.562474520994701e-06, + "loss": 3.0664, + "step": 75830 + }, + { + "epoch": 5.152534311727137, + "grad_norm": 7.799555778503418, + "learning_rate": 3.5620498709063733e-06, + "loss": 2.9239, + "step": 75835 + }, + { + "epoch": 5.152874031797799, + "grad_norm": 9.163161277770996, + "learning_rate": 3.561625220818046e-06, + "loss": 2.8305, + "step": 75840 + }, + { + "epoch": 5.15321375186846, + "grad_norm": 6.408659934997559, + "learning_rate": 3.5612005707297194e-06, + "loss": 2.6639, + "step": 75845 + }, + { + "epoch": 5.153553471939122, + "grad_norm": 7.730906963348389, + "learning_rate": 3.5607759206413917e-06, + "loss": 2.7378, + "step": 75850 + }, + { + "epoch": 5.153893192009784, + "grad_norm": 8.558679580688477, + "learning_rate": 3.5603512705530645e-06, + "loss": 2.8135, + "step": 75855 + }, + { + "epoch": 5.154232912080445, + "grad_norm": 7.047049522399902, + "learning_rate": 3.5599266204647378e-06, + "loss": 2.7769, + "step": 75860 + }, + { + "epoch": 5.154572632151107, + "grad_norm": 7.8057732582092285, + "learning_rate": 3.55950197037641e-06, + "loss": 2.8454, + "step": 75865 + }, + { + "epoch": 5.1549123522217695, + "grad_norm": 7.971770763397217, + "learning_rate": 3.559077320288083e-06, + "loss": 2.6953, + "step": 75870 + }, + { + "epoch": 5.155252072292431, + "grad_norm": 7.912935733795166, + "learning_rate": 3.5586526701997557e-06, + "loss": 2.8307, + "step": 75875 + }, + { + "epoch": 5.155591792363093, + "grad_norm": 7.210872650146484, + "learning_rate": 3.558228020111428e-06, + "loss": 2.7041, + "step": 75880 + }, + { + "epoch": 5.155931512433755, + "grad_norm": 9.01548957824707, + "learning_rate": 3.5578033700231013e-06, + "loss": 2.9065, + "step": 75885 + }, + { + "epoch": 5.156271232504416, + "grad_norm": 8.034222602844238, + "learning_rate": 3.557378719934774e-06, + "loss": 2.8368, + "step": 75890 + }, + { + "epoch": 5.156610952575078, + "grad_norm": 8.39077377319336, + "learning_rate": 3.5569540698464465e-06, + "loss": 2.6284, + "step": 75895 + }, + { + "epoch": 5.15695067264574, + "grad_norm": 6.9449076652526855, + "learning_rate": 3.5565294197581197e-06, + "loss": 2.6658, + "step": 75900 + }, + { + "epoch": 5.157290392716401, + "grad_norm": 6.992207050323486, + "learning_rate": 3.5561047696697925e-06, + "loss": 2.7212, + "step": 75905 + }, + { + "epoch": 5.157630112787063, + "grad_norm": 7.798896312713623, + "learning_rate": 3.555680119581465e-06, + "loss": 2.7054, + "step": 75910 + }, + { + "epoch": 5.1579698328577255, + "grad_norm": 6.900976657867432, + "learning_rate": 3.555255469493138e-06, + "loss": 2.3242, + "step": 75915 + }, + { + "epoch": 5.158309552928387, + "grad_norm": 8.334446907043457, + "learning_rate": 3.554830819404811e-06, + "loss": 2.9149, + "step": 75920 + }, + { + "epoch": 5.158649272999049, + "grad_norm": 6.534523963928223, + "learning_rate": 3.5544061693164833e-06, + "loss": 2.8044, + "step": 75925 + }, + { + "epoch": 5.158988993069711, + "grad_norm": 7.130104064941406, + "learning_rate": 3.553981519228156e-06, + "loss": 2.8253, + "step": 75930 + }, + { + "epoch": 5.159328713140372, + "grad_norm": 8.655830383300781, + "learning_rate": 3.5535568691398293e-06, + "loss": 2.7472, + "step": 75935 + }, + { + "epoch": 5.159668433211034, + "grad_norm": 7.090734958648682, + "learning_rate": 3.5531322190515017e-06, + "loss": 2.8072, + "step": 75940 + }, + { + "epoch": 5.160008153281696, + "grad_norm": 8.130126953125, + "learning_rate": 3.5527075689631745e-06, + "loss": 2.8283, + "step": 75945 + }, + { + "epoch": 5.160347873352357, + "grad_norm": 5.7900567054748535, + "learning_rate": 3.5522829188748477e-06, + "loss": 2.6774, + "step": 75950 + }, + { + "epoch": 5.160687593423019, + "grad_norm": 7.559119701385498, + "learning_rate": 3.55185826878652e-06, + "loss": 2.6424, + "step": 75955 + }, + { + "epoch": 5.1610273134936815, + "grad_norm": 7.778460502624512, + "learning_rate": 3.551433618698193e-06, + "loss": 2.7776, + "step": 75960 + }, + { + "epoch": 5.161367033564343, + "grad_norm": 11.288789749145508, + "learning_rate": 3.5510089686098657e-06, + "loss": 2.9368, + "step": 75965 + }, + { + "epoch": 5.161706753635005, + "grad_norm": 7.583946704864502, + "learning_rate": 3.550584318521538e-06, + "loss": 2.7374, + "step": 75970 + }, + { + "epoch": 5.162046473705667, + "grad_norm": 6.829509258270264, + "learning_rate": 3.5501596684332113e-06, + "loss": 2.6177, + "step": 75975 + }, + { + "epoch": 5.162386193776328, + "grad_norm": 6.729467868804932, + "learning_rate": 3.549735018344884e-06, + "loss": 3.0322, + "step": 75980 + }, + { + "epoch": 5.16272591384699, + "grad_norm": 6.382696151733398, + "learning_rate": 3.5493103682565565e-06, + "loss": 2.6737, + "step": 75985 + }, + { + "epoch": 5.163065633917652, + "grad_norm": 8.03680419921875, + "learning_rate": 3.5488857181682297e-06, + "loss": 2.8496, + "step": 75990 + }, + { + "epoch": 5.163405353988313, + "grad_norm": 7.712112903594971, + "learning_rate": 3.5484610680799025e-06, + "loss": 2.8565, + "step": 75995 + }, + { + "epoch": 5.163745074058975, + "grad_norm": 5.454498767852783, + "learning_rate": 3.5480364179915753e-06, + "loss": 3.0107, + "step": 76000 + }, + { + "epoch": 5.1640847941296375, + "grad_norm": 5.788515090942383, + "learning_rate": 3.5476117679032477e-06, + "loss": 2.8805, + "step": 76005 + }, + { + "epoch": 5.164424514200299, + "grad_norm": 10.670456886291504, + "learning_rate": 3.547187117814921e-06, + "loss": 2.8644, + "step": 76010 + }, + { + "epoch": 5.164764234270961, + "grad_norm": 7.037939548492432, + "learning_rate": 3.5467624677265937e-06, + "loss": 2.8389, + "step": 76015 + }, + { + "epoch": 5.165103954341623, + "grad_norm": 6.865109920501709, + "learning_rate": 3.546337817638266e-06, + "loss": 2.9146, + "step": 76020 + }, + { + "epoch": 5.165443674412284, + "grad_norm": 8.365262985229492, + "learning_rate": 3.5459131675499393e-06, + "loss": 2.7013, + "step": 76025 + }, + { + "epoch": 5.165783394482946, + "grad_norm": 9.82561206817627, + "learning_rate": 3.545488517461612e-06, + "loss": 2.7301, + "step": 76030 + }, + { + "epoch": 5.166123114553608, + "grad_norm": 9.006709098815918, + "learning_rate": 3.5450638673732845e-06, + "loss": 2.8917, + "step": 76035 + }, + { + "epoch": 5.166462834624269, + "grad_norm": 9.645294189453125, + "learning_rate": 3.5446392172849573e-06, + "loss": 2.7913, + "step": 76040 + }, + { + "epoch": 5.166802554694931, + "grad_norm": 6.768420696258545, + "learning_rate": 3.5442145671966305e-06, + "loss": 2.5939, + "step": 76045 + }, + { + "epoch": 5.1671422747655935, + "grad_norm": 7.87033748626709, + "learning_rate": 3.543789917108303e-06, + "loss": 2.9711, + "step": 76050 + }, + { + "epoch": 5.167481994836255, + "grad_norm": 8.551143646240234, + "learning_rate": 3.5433652670199757e-06, + "loss": 2.6792, + "step": 76055 + }, + { + "epoch": 5.167821714906917, + "grad_norm": 7.798831939697266, + "learning_rate": 3.542940616931649e-06, + "loss": 2.6247, + "step": 76060 + }, + { + "epoch": 5.168161434977579, + "grad_norm": 6.217085361480713, + "learning_rate": 3.5425159668433213e-06, + "loss": 2.8414, + "step": 76065 + }, + { + "epoch": 5.16850115504824, + "grad_norm": 6.553350448608398, + "learning_rate": 3.542091316754994e-06, + "loss": 3.0552, + "step": 76070 + }, + { + "epoch": 5.168840875118902, + "grad_norm": 10.980384826660156, + "learning_rate": 3.5416666666666673e-06, + "loss": 2.7386, + "step": 76075 + }, + { + "epoch": 5.169180595189564, + "grad_norm": 8.558438301086426, + "learning_rate": 3.5412420165783397e-06, + "loss": 2.8607, + "step": 76080 + }, + { + "epoch": 5.169520315260225, + "grad_norm": 7.457603931427002, + "learning_rate": 3.5408173664900125e-06, + "loss": 2.8683, + "step": 76085 + }, + { + "epoch": 5.1698600353308874, + "grad_norm": 7.701426029205322, + "learning_rate": 3.5403927164016853e-06, + "loss": 2.8071, + "step": 76090 + }, + { + "epoch": 5.1701997554015495, + "grad_norm": 7.712591648101807, + "learning_rate": 3.5399680663133577e-06, + "loss": 2.7806, + "step": 76095 + }, + { + "epoch": 5.170539475472211, + "grad_norm": 8.75273323059082, + "learning_rate": 3.539543416225031e-06, + "loss": 2.8667, + "step": 76100 + }, + { + "epoch": 5.170879195542873, + "grad_norm": 7.168203830718994, + "learning_rate": 3.5391187661367037e-06, + "loss": 2.7014, + "step": 76105 + }, + { + "epoch": 5.171218915613535, + "grad_norm": 7.088225841522217, + "learning_rate": 3.538694116048376e-06, + "loss": 2.5917, + "step": 76110 + }, + { + "epoch": 5.171558635684196, + "grad_norm": 7.900550842285156, + "learning_rate": 3.5382694659600493e-06, + "loss": 2.9729, + "step": 76115 + }, + { + "epoch": 5.171898355754858, + "grad_norm": 7.518162727355957, + "learning_rate": 3.537844815871722e-06, + "loss": 2.9009, + "step": 76120 + }, + { + "epoch": 5.17223807582552, + "grad_norm": 7.404953956604004, + "learning_rate": 3.5374201657833945e-06, + "loss": 2.545, + "step": 76125 + }, + { + "epoch": 5.172577795896181, + "grad_norm": 7.20460844039917, + "learning_rate": 3.5369955156950673e-06, + "loss": 2.6801, + "step": 76130 + }, + { + "epoch": 5.1729175159668435, + "grad_norm": 10.163227081298828, + "learning_rate": 3.5365708656067405e-06, + "loss": 2.9928, + "step": 76135 + }, + { + "epoch": 5.1732572360375055, + "grad_norm": 7.224212646484375, + "learning_rate": 3.536146215518413e-06, + "loss": 2.6938, + "step": 76140 + }, + { + "epoch": 5.173596956108167, + "grad_norm": 8.167670249938965, + "learning_rate": 3.5357215654300857e-06, + "loss": 2.6985, + "step": 76145 + }, + { + "epoch": 5.173936676178829, + "grad_norm": 8.062154769897461, + "learning_rate": 3.535296915341759e-06, + "loss": 2.7722, + "step": 76150 + }, + { + "epoch": 5.174276396249491, + "grad_norm": 8.5385103225708, + "learning_rate": 3.5348722652534313e-06, + "loss": 2.7339, + "step": 76155 + }, + { + "epoch": 5.174616116320152, + "grad_norm": 6.466253280639648, + "learning_rate": 3.534447615165104e-06, + "loss": 2.8372, + "step": 76160 + }, + { + "epoch": 5.174955836390814, + "grad_norm": 6.947239398956299, + "learning_rate": 3.534022965076777e-06, + "loss": 2.8612, + "step": 76165 + }, + { + "epoch": 5.175295556461475, + "grad_norm": 7.781897068023682, + "learning_rate": 3.53359831498845e-06, + "loss": 2.6251, + "step": 76170 + }, + { + "epoch": 5.175635276532137, + "grad_norm": 8.438575744628906, + "learning_rate": 3.5331736649001225e-06, + "loss": 2.9002, + "step": 76175 + }, + { + "epoch": 5.1759749966027995, + "grad_norm": 6.9654059410095215, + "learning_rate": 3.5327490148117953e-06, + "loss": 2.7444, + "step": 76180 + }, + { + "epoch": 5.176314716673461, + "grad_norm": 6.114065647125244, + "learning_rate": 3.5323243647234685e-06, + "loss": 2.9462, + "step": 76185 + }, + { + "epoch": 5.176654436744123, + "grad_norm": 8.643150329589844, + "learning_rate": 3.531899714635141e-06, + "loss": 2.6806, + "step": 76190 + }, + { + "epoch": 5.176994156814785, + "grad_norm": 7.584604263305664, + "learning_rate": 3.5314750645468137e-06, + "loss": 2.7965, + "step": 76195 + }, + { + "epoch": 5.177333876885446, + "grad_norm": 8.828566551208496, + "learning_rate": 3.531050414458487e-06, + "loss": 3.0767, + "step": 76200 + }, + { + "epoch": 5.177673596956108, + "grad_norm": 9.962576866149902, + "learning_rate": 3.530710694387825e-06, + "loss": 2.6955, + "step": 76205 + }, + { + "epoch": 5.17801331702677, + "grad_norm": 7.69232177734375, + "learning_rate": 3.5302860442994974e-06, + "loss": 2.8319, + "step": 76210 + }, + { + "epoch": 5.178353037097431, + "grad_norm": 8.309650421142578, + "learning_rate": 3.52986139421117e-06, + "loss": 2.7882, + "step": 76215 + }, + { + "epoch": 5.178692757168093, + "grad_norm": 7.178797721862793, + "learning_rate": 3.5294367441228434e-06, + "loss": 3.1643, + "step": 76220 + }, + { + "epoch": 5.1790324772387555, + "grad_norm": 6.242847919464111, + "learning_rate": 3.529012094034516e-06, + "loss": 2.9929, + "step": 76225 + }, + { + "epoch": 5.179372197309417, + "grad_norm": 7.823974132537842, + "learning_rate": 3.5285874439461886e-06, + "loss": 2.6538, + "step": 76230 + }, + { + "epoch": 5.179711917380079, + "grad_norm": 5.672163009643555, + "learning_rate": 3.528162793857862e-06, + "loss": 2.5289, + "step": 76235 + }, + { + "epoch": 5.180051637450741, + "grad_norm": 6.931310653686523, + "learning_rate": 3.527738143769534e-06, + "loss": 2.939, + "step": 76240 + }, + { + "epoch": 5.180391357521402, + "grad_norm": 6.669547080993652, + "learning_rate": 3.527313493681207e-06, + "loss": 2.8208, + "step": 76245 + }, + { + "epoch": 5.180731077592064, + "grad_norm": 7.189908981323242, + "learning_rate": 3.52688884359288e-06, + "loss": 2.8987, + "step": 76250 + }, + { + "epoch": 5.181070797662726, + "grad_norm": 7.51224946975708, + "learning_rate": 3.526464193504552e-06, + "loss": 2.6902, + "step": 76255 + }, + { + "epoch": 5.181410517733387, + "grad_norm": 8.069366455078125, + "learning_rate": 3.5260395434162254e-06, + "loss": 2.7878, + "step": 76260 + }, + { + "epoch": 5.181750237804049, + "grad_norm": 8.034979820251465, + "learning_rate": 3.525614893327898e-06, + "loss": 2.7382, + "step": 76265 + }, + { + "epoch": 5.1820899578747115, + "grad_norm": 8.279938697814941, + "learning_rate": 3.5251902432395706e-06, + "loss": 2.7298, + "step": 76270 + }, + { + "epoch": 5.182429677945373, + "grad_norm": 6.625631332397461, + "learning_rate": 3.524765593151244e-06, + "loss": 2.9076, + "step": 76275 + }, + { + "epoch": 5.182769398016035, + "grad_norm": 8.955010414123535, + "learning_rate": 3.5243409430629166e-06, + "loss": 2.812, + "step": 76280 + }, + { + "epoch": 5.183109118086697, + "grad_norm": 9.707942962646484, + "learning_rate": 3.523916292974589e-06, + "loss": 2.6973, + "step": 76285 + }, + { + "epoch": 5.183448838157358, + "grad_norm": 6.478180408477783, + "learning_rate": 3.523491642886262e-06, + "loss": 2.9188, + "step": 76290 + }, + { + "epoch": 5.18378855822802, + "grad_norm": 8.884936332702637, + "learning_rate": 3.523066992797935e-06, + "loss": 2.539, + "step": 76295 + }, + { + "epoch": 5.184128278298682, + "grad_norm": 9.757353782653809, + "learning_rate": 3.5226423427096074e-06, + "loss": 2.7853, + "step": 76300 + }, + { + "epoch": 5.184467998369343, + "grad_norm": 6.98405122756958, + "learning_rate": 3.52221769262128e-06, + "loss": 3.0066, + "step": 76305 + }, + { + "epoch": 5.184807718440005, + "grad_norm": 8.798186302185059, + "learning_rate": 3.5217930425329534e-06, + "loss": 2.7833, + "step": 76310 + }, + { + "epoch": 5.1851474385106675, + "grad_norm": 8.581128120422363, + "learning_rate": 3.5213683924446258e-06, + "loss": 2.6655, + "step": 76315 + }, + { + "epoch": 5.185487158581329, + "grad_norm": 7.324142932891846, + "learning_rate": 3.5209437423562986e-06, + "loss": 2.7876, + "step": 76320 + }, + { + "epoch": 5.185826878651991, + "grad_norm": 8.351958274841309, + "learning_rate": 3.520519092267972e-06, + "loss": 2.7927, + "step": 76325 + }, + { + "epoch": 5.186166598722653, + "grad_norm": 8.393477439880371, + "learning_rate": 3.520094442179644e-06, + "loss": 2.7353, + "step": 76330 + }, + { + "epoch": 5.186506318793314, + "grad_norm": 7.281877517700195, + "learning_rate": 3.519669792091317e-06, + "loss": 2.8216, + "step": 76335 + }, + { + "epoch": 5.186846038863976, + "grad_norm": 7.513680458068848, + "learning_rate": 3.5192451420029898e-06, + "loss": 2.7136, + "step": 76340 + }, + { + "epoch": 5.187185758934638, + "grad_norm": 7.511502265930176, + "learning_rate": 3.518820491914662e-06, + "loss": 2.831, + "step": 76345 + }, + { + "epoch": 5.187525479005299, + "grad_norm": 7.720317363739014, + "learning_rate": 3.5183958418263354e-06, + "loss": 2.7393, + "step": 76350 + }, + { + "epoch": 5.187865199075961, + "grad_norm": 7.760242938995361, + "learning_rate": 3.517971191738008e-06, + "loss": 2.9099, + "step": 76355 + }, + { + "epoch": 5.1882049191466235, + "grad_norm": 6.891537666320801, + "learning_rate": 3.5175465416496806e-06, + "loss": 2.6705, + "step": 76360 + }, + { + "epoch": 5.188544639217285, + "grad_norm": 6.433002471923828, + "learning_rate": 3.517121891561354e-06, + "loss": 2.6894, + "step": 76365 + }, + { + "epoch": 5.188884359287947, + "grad_norm": 6.414089202880859, + "learning_rate": 3.5166972414730266e-06, + "loss": 2.9445, + "step": 76370 + }, + { + "epoch": 5.189224079358609, + "grad_norm": 7.912922382354736, + "learning_rate": 3.5162725913846994e-06, + "loss": 2.7023, + "step": 76375 + }, + { + "epoch": 5.18956379942927, + "grad_norm": 6.101161956787109, + "learning_rate": 3.5158479412963718e-06, + "loss": 2.643, + "step": 76380 + }, + { + "epoch": 5.189903519499932, + "grad_norm": 7.246249675750732, + "learning_rate": 3.515423291208045e-06, + "loss": 2.8911, + "step": 76385 + }, + { + "epoch": 5.190243239570594, + "grad_norm": 7.79417610168457, + "learning_rate": 3.514998641119718e-06, + "loss": 2.8887, + "step": 76390 + }, + { + "epoch": 5.190582959641255, + "grad_norm": 8.738237380981445, + "learning_rate": 3.51457399103139e-06, + "loss": 2.9457, + "step": 76395 + }, + { + "epoch": 5.1909226797119175, + "grad_norm": 6.983440399169922, + "learning_rate": 3.5141493409430634e-06, + "loss": 2.715, + "step": 76400 + }, + { + "epoch": 5.1912623997825795, + "grad_norm": 6.438875198364258, + "learning_rate": 3.513724690854736e-06, + "loss": 2.7104, + "step": 76405 + }, + { + "epoch": 5.191602119853241, + "grad_norm": 8.281827926635742, + "learning_rate": 3.5133000407664086e-06, + "loss": 2.8869, + "step": 76410 + }, + { + "epoch": 5.191941839923903, + "grad_norm": 5.376118183135986, + "learning_rate": 3.512875390678082e-06, + "loss": 2.5898, + "step": 76415 + }, + { + "epoch": 5.192281559994565, + "grad_norm": 6.544867992401123, + "learning_rate": 3.5124507405897546e-06, + "loss": 2.4109, + "step": 76420 + }, + { + "epoch": 5.192621280065226, + "grad_norm": 6.312778949737549, + "learning_rate": 3.512026090501427e-06, + "loss": 2.9681, + "step": 76425 + }, + { + "epoch": 5.192961000135888, + "grad_norm": 7.007158279418945, + "learning_rate": 3.5116014404130998e-06, + "loss": 2.6924, + "step": 76430 + }, + { + "epoch": 5.19330072020655, + "grad_norm": 6.24033260345459, + "learning_rate": 3.511176790324773e-06, + "loss": 2.6694, + "step": 76435 + }, + { + "epoch": 5.193640440277211, + "grad_norm": 7.7807416915893555, + "learning_rate": 3.5107521402364454e-06, + "loss": 2.7231, + "step": 76440 + }, + { + "epoch": 5.1939801603478735, + "grad_norm": 8.096150398254395, + "learning_rate": 3.510327490148118e-06, + "loss": 2.943, + "step": 76445 + }, + { + "epoch": 5.1943198804185355, + "grad_norm": 6.165858268737793, + "learning_rate": 3.5099028400597914e-06, + "loss": 2.9602, + "step": 76450 + }, + { + "epoch": 5.194659600489197, + "grad_norm": 8.149053573608398, + "learning_rate": 3.5094781899714638e-06, + "loss": 2.8282, + "step": 76455 + }, + { + "epoch": 5.194999320559859, + "grad_norm": 6.505476951599121, + "learning_rate": 3.5090535398831366e-06, + "loss": 2.9473, + "step": 76460 + }, + { + "epoch": 5.195339040630521, + "grad_norm": 7.528768539428711, + "learning_rate": 3.5086288897948094e-06, + "loss": 2.916, + "step": 76465 + }, + { + "epoch": 5.195678760701182, + "grad_norm": 7.860887050628662, + "learning_rate": 3.5082042397064818e-06, + "loss": 2.7715, + "step": 76470 + }, + { + "epoch": 5.196018480771844, + "grad_norm": 8.736234664916992, + "learning_rate": 3.507779589618155e-06, + "loss": 2.9211, + "step": 76475 + }, + { + "epoch": 5.196358200842505, + "grad_norm": 5.6503520011901855, + "learning_rate": 3.5073549395298278e-06, + "loss": 2.9226, + "step": 76480 + }, + { + "epoch": 5.196697920913167, + "grad_norm": 8.318073272705078, + "learning_rate": 3.5069302894415e-06, + "loss": 2.7636, + "step": 76485 + }, + { + "epoch": 5.1970376409838295, + "grad_norm": 6.868382930755615, + "learning_rate": 3.5065056393531734e-06, + "loss": 2.8605, + "step": 76490 + }, + { + "epoch": 5.197377361054491, + "grad_norm": 6.404179573059082, + "learning_rate": 3.506080989264846e-06, + "loss": 2.7255, + "step": 76495 + }, + { + "epoch": 5.197717081125153, + "grad_norm": 9.383706092834473, + "learning_rate": 3.5056563391765186e-06, + "loss": 2.8258, + "step": 76500 + }, + { + "epoch": 5.198056801195815, + "grad_norm": 8.067282676696777, + "learning_rate": 3.5052316890881914e-06, + "loss": 2.7386, + "step": 76505 + }, + { + "epoch": 5.198396521266476, + "grad_norm": 8.92784309387207, + "learning_rate": 3.5048070389998646e-06, + "loss": 2.826, + "step": 76510 + }, + { + "epoch": 5.198736241337138, + "grad_norm": 6.680359840393066, + "learning_rate": 3.504382388911537e-06, + "loss": 2.8716, + "step": 76515 + }, + { + "epoch": 5.1990759614078, + "grad_norm": 5.8241353034973145, + "learning_rate": 3.5039577388232098e-06, + "loss": 2.5865, + "step": 76520 + }, + { + "epoch": 5.199415681478461, + "grad_norm": 9.76077938079834, + "learning_rate": 3.503533088734883e-06, + "loss": 2.818, + "step": 76525 + }, + { + "epoch": 5.199755401549123, + "grad_norm": 5.787497520446777, + "learning_rate": 3.5031084386465554e-06, + "loss": 2.8635, + "step": 76530 + }, + { + "epoch": 5.2000951216197855, + "grad_norm": 7.697829246520996, + "learning_rate": 3.502683788558228e-06, + "loss": 2.9805, + "step": 76535 + }, + { + "epoch": 5.200434841690447, + "grad_norm": 8.380167007446289, + "learning_rate": 3.502259138469901e-06, + "loss": 2.8731, + "step": 76540 + }, + { + "epoch": 5.200774561761109, + "grad_norm": 7.572904109954834, + "learning_rate": 3.501834488381574e-06, + "loss": 2.805, + "step": 76545 + }, + { + "epoch": 5.201114281831771, + "grad_norm": 9.53442668914795, + "learning_rate": 3.5014098382932466e-06, + "loss": 2.9796, + "step": 76550 + }, + { + "epoch": 5.201454001902432, + "grad_norm": 6.790987968444824, + "learning_rate": 3.5009851882049194e-06, + "loss": 2.6541, + "step": 76555 + }, + { + "epoch": 5.201793721973094, + "grad_norm": 8.141711235046387, + "learning_rate": 3.5005605381165926e-06, + "loss": 2.7948, + "step": 76560 + }, + { + "epoch": 5.202133442043756, + "grad_norm": 8.544922828674316, + "learning_rate": 3.500135888028265e-06, + "loss": 2.7861, + "step": 76565 + }, + { + "epoch": 5.202473162114417, + "grad_norm": 9.023662567138672, + "learning_rate": 3.4997112379399378e-06, + "loss": 2.7378, + "step": 76570 + }, + { + "epoch": 5.202812882185079, + "grad_norm": 6.448622703552246, + "learning_rate": 3.499286587851611e-06, + "loss": 2.8147, + "step": 76575 + }, + { + "epoch": 5.2031526022557415, + "grad_norm": 6.803184986114502, + "learning_rate": 3.4988619377632834e-06, + "loss": 2.9304, + "step": 76580 + }, + { + "epoch": 5.203492322326403, + "grad_norm": 6.188681125640869, + "learning_rate": 3.498437287674956e-06, + "loss": 2.6744, + "step": 76585 + }, + { + "epoch": 5.203832042397065, + "grad_norm": 7.834408760070801, + "learning_rate": 3.498012637586629e-06, + "loss": 2.9255, + "step": 76590 + }, + { + "epoch": 5.204171762467727, + "grad_norm": 7.079641342163086, + "learning_rate": 3.4975879874983014e-06, + "loss": 2.8606, + "step": 76595 + }, + { + "epoch": 5.204511482538388, + "grad_norm": 7.716006755828857, + "learning_rate": 3.4971633374099746e-06, + "loss": 2.924, + "step": 76600 + }, + { + "epoch": 5.20485120260905, + "grad_norm": 7.56413459777832, + "learning_rate": 3.4967386873216474e-06, + "loss": 2.7424, + "step": 76605 + }, + { + "epoch": 5.205190922679712, + "grad_norm": 8.02752685546875, + "learning_rate": 3.4963140372333198e-06, + "loss": 2.6925, + "step": 76610 + }, + { + "epoch": 5.205530642750373, + "grad_norm": 7.5811920166015625, + "learning_rate": 3.495889387144993e-06, + "loss": 2.8044, + "step": 76615 + }, + { + "epoch": 5.205870362821035, + "grad_norm": 6.900209903717041, + "learning_rate": 3.4954647370566658e-06, + "loss": 3.0029, + "step": 76620 + }, + { + "epoch": 5.2062100828916975, + "grad_norm": 9.704415321350098, + "learning_rate": 3.495040086968338e-06, + "loss": 2.9609, + "step": 76625 + }, + { + "epoch": 5.206549802962359, + "grad_norm": 7.069758415222168, + "learning_rate": 3.494615436880011e-06, + "loss": 2.7838, + "step": 76630 + }, + { + "epoch": 5.206889523033021, + "grad_norm": 8.588719367980957, + "learning_rate": 3.494190786791684e-06, + "loss": 2.9504, + "step": 76635 + }, + { + "epoch": 5.207229243103683, + "grad_norm": 6.713133335113525, + "learning_rate": 3.4937661367033566e-06, + "loss": 2.8266, + "step": 76640 + }, + { + "epoch": 5.207568963174344, + "grad_norm": 6.49757194519043, + "learning_rate": 3.4933414866150294e-06, + "loss": 2.6193, + "step": 76645 + }, + { + "epoch": 5.207908683245006, + "grad_norm": 7.6848297119140625, + "learning_rate": 3.4929168365267026e-06, + "loss": 2.6484, + "step": 76650 + }, + { + "epoch": 5.208248403315668, + "grad_norm": 6.571275234222412, + "learning_rate": 3.492492186438375e-06, + "loss": 2.9311, + "step": 76655 + }, + { + "epoch": 5.208588123386329, + "grad_norm": 6.089924335479736, + "learning_rate": 3.4920675363500478e-06, + "loss": 2.6109, + "step": 76660 + }, + { + "epoch": 5.2089278434569914, + "grad_norm": 8.726767539978027, + "learning_rate": 3.4916428862617206e-06, + "loss": 2.8072, + "step": 76665 + }, + { + "epoch": 5.2092675635276535, + "grad_norm": 7.76417875289917, + "learning_rate": 3.4912182361733934e-06, + "loss": 2.5946, + "step": 76670 + }, + { + "epoch": 5.209607283598315, + "grad_norm": 8.24079704284668, + "learning_rate": 3.490793586085066e-06, + "loss": 2.8369, + "step": 76675 + }, + { + "epoch": 5.209947003668977, + "grad_norm": 6.962181568145752, + "learning_rate": 3.490368935996739e-06, + "loss": 2.8169, + "step": 76680 + }, + { + "epoch": 5.210286723739639, + "grad_norm": 7.9557366371154785, + "learning_rate": 3.4899442859084113e-06, + "loss": 2.7496, + "step": 76685 + }, + { + "epoch": 5.2106264438103, + "grad_norm": 8.577557563781738, + "learning_rate": 3.4895196358200846e-06, + "loss": 2.601, + "step": 76690 + }, + { + "epoch": 5.210966163880962, + "grad_norm": 6.817206859588623, + "learning_rate": 3.4890949857317574e-06, + "loss": 2.839, + "step": 76695 + }, + { + "epoch": 5.211305883951624, + "grad_norm": 6.61426305770874, + "learning_rate": 3.4886703356434297e-06, + "loss": 2.9043, + "step": 76700 + }, + { + "epoch": 5.211645604022285, + "grad_norm": 7.304494857788086, + "learning_rate": 3.488245685555103e-06, + "loss": 2.7403, + "step": 76705 + }, + { + "epoch": 5.2119853240929475, + "grad_norm": 8.385682106018066, + "learning_rate": 3.4878210354667758e-06, + "loss": 2.9823, + "step": 76710 + }, + { + "epoch": 5.2123250441636095, + "grad_norm": 10.03336238861084, + "learning_rate": 3.4873963853784486e-06, + "loss": 2.8368, + "step": 76715 + }, + { + "epoch": 5.212664764234271, + "grad_norm": 7.7654500007629395, + "learning_rate": 3.486971735290121e-06, + "loss": 3.0685, + "step": 76720 + }, + { + "epoch": 5.213004484304933, + "grad_norm": 7.182599067687988, + "learning_rate": 3.486547085201794e-06, + "loss": 2.879, + "step": 76725 + }, + { + "epoch": 5.213344204375595, + "grad_norm": 8.324812889099121, + "learning_rate": 3.486122435113467e-06, + "loss": 2.7419, + "step": 76730 + }, + { + "epoch": 5.213683924446256, + "grad_norm": 7.407346725463867, + "learning_rate": 3.4856977850251394e-06, + "loss": 3.0206, + "step": 76735 + }, + { + "epoch": 5.214023644516918, + "grad_norm": 8.856599807739258, + "learning_rate": 3.4852731349368126e-06, + "loss": 2.6566, + "step": 76740 + }, + { + "epoch": 5.21436336458758, + "grad_norm": 7.627528667449951, + "learning_rate": 3.4848484848484854e-06, + "loss": 2.7782, + "step": 76745 + }, + { + "epoch": 5.214703084658241, + "grad_norm": 8.861865997314453, + "learning_rate": 3.4844238347601578e-06, + "loss": 2.6642, + "step": 76750 + }, + { + "epoch": 5.2150428047289035, + "grad_norm": 9.867135047912598, + "learning_rate": 3.4839991846718306e-06, + "loss": 2.6511, + "step": 76755 + }, + { + "epoch": 5.2153825247995655, + "grad_norm": 7.3676042556762695, + "learning_rate": 3.4835745345835038e-06, + "loss": 2.928, + "step": 76760 + }, + { + "epoch": 5.215722244870227, + "grad_norm": 10.621467590332031, + "learning_rate": 3.483149884495176e-06, + "loss": 2.8734, + "step": 76765 + }, + { + "epoch": 5.216061964940889, + "grad_norm": 9.034059524536133, + "learning_rate": 3.482725234406849e-06, + "loss": 2.6768, + "step": 76770 + }, + { + "epoch": 5.216401685011551, + "grad_norm": 6.872338771820068, + "learning_rate": 3.482300584318522e-06, + "loss": 2.6968, + "step": 76775 + }, + { + "epoch": 5.216741405082212, + "grad_norm": 8.519576072692871, + "learning_rate": 3.4818759342301946e-06, + "loss": 2.6906, + "step": 76780 + }, + { + "epoch": 5.217081125152874, + "grad_norm": 8.689241409301758, + "learning_rate": 3.4814512841418674e-06, + "loss": 2.7887, + "step": 76785 + }, + { + "epoch": 5.217420845223536, + "grad_norm": 9.309816360473633, + "learning_rate": 3.48102663405354e-06, + "loss": 3.1593, + "step": 76790 + }, + { + "epoch": 5.217760565294197, + "grad_norm": 6.909776210784912, + "learning_rate": 3.480601983965213e-06, + "loss": 2.732, + "step": 76795 + }, + { + "epoch": 5.2181002853648595, + "grad_norm": 8.030982971191406, + "learning_rate": 3.4801773338768858e-06, + "loss": 2.6661, + "step": 76800 + }, + { + "epoch": 5.2184400054355216, + "grad_norm": 6.278336048126221, + "learning_rate": 3.4797526837885586e-06, + "loss": 2.9514, + "step": 76805 + }, + { + "epoch": 5.218779725506183, + "grad_norm": 7.071508407592773, + "learning_rate": 3.479328033700231e-06, + "loss": 2.7865, + "step": 76810 + }, + { + "epoch": 5.219119445576845, + "grad_norm": 5.993310451507568, + "learning_rate": 3.478903383611904e-06, + "loss": 2.8153, + "step": 76815 + }, + { + "epoch": 5.219459165647507, + "grad_norm": 7.351986885070801, + "learning_rate": 3.478478733523577e-06, + "loss": 2.7822, + "step": 76820 + }, + { + "epoch": 5.219798885718168, + "grad_norm": 6.926560401916504, + "learning_rate": 3.4780540834352493e-06, + "loss": 2.8346, + "step": 76825 + }, + { + "epoch": 5.22013860578883, + "grad_norm": 6.802603244781494, + "learning_rate": 3.4776294333469226e-06, + "loss": 2.8161, + "step": 76830 + }, + { + "epoch": 5.220478325859492, + "grad_norm": 7.297199726104736, + "learning_rate": 3.4772047832585954e-06, + "loss": 2.6193, + "step": 76835 + }, + { + "epoch": 5.220818045930153, + "grad_norm": 9.1303071975708, + "learning_rate": 3.4767801331702677e-06, + "loss": 2.698, + "step": 76840 + }, + { + "epoch": 5.2211577660008155, + "grad_norm": 6.402771472930908, + "learning_rate": 3.4763554830819405e-06, + "loss": 2.7297, + "step": 76845 + }, + { + "epoch": 5.221497486071477, + "grad_norm": 8.227482795715332, + "learning_rate": 3.4759308329936138e-06, + "loss": 2.8289, + "step": 76850 + }, + { + "epoch": 5.221837206142139, + "grad_norm": 7.096495151519775, + "learning_rate": 3.475506182905286e-06, + "loss": 2.7239, + "step": 76855 + }, + { + "epoch": 5.222176926212801, + "grad_norm": 8.283794403076172, + "learning_rate": 3.475081532816959e-06, + "loss": 2.8955, + "step": 76860 + }, + { + "epoch": 5.222516646283462, + "grad_norm": 7.289886951446533, + "learning_rate": 3.474656882728632e-06, + "loss": 2.9485, + "step": 76865 + }, + { + "epoch": 5.222856366354124, + "grad_norm": 7.381964206695557, + "learning_rate": 3.4742322326403045e-06, + "loss": 2.9497, + "step": 76870 + }, + { + "epoch": 5.223196086424786, + "grad_norm": 7.895534515380859, + "learning_rate": 3.4738075825519774e-06, + "loss": 2.8639, + "step": 76875 + }, + { + "epoch": 5.223535806495447, + "grad_norm": 6.096207141876221, + "learning_rate": 3.47338293246365e-06, + "loss": 2.6213, + "step": 76880 + }, + { + "epoch": 5.223875526566109, + "grad_norm": 7.821425437927246, + "learning_rate": 3.4729582823753234e-06, + "loss": 2.629, + "step": 76885 + }, + { + "epoch": 5.2242152466367715, + "grad_norm": 6.5538740158081055, + "learning_rate": 3.4725336322869958e-06, + "loss": 3.0117, + "step": 76890 + }, + { + "epoch": 5.224554966707433, + "grad_norm": 7.543023586273193, + "learning_rate": 3.4721089821986686e-06, + "loss": 2.5967, + "step": 76895 + }, + { + "epoch": 5.224894686778095, + "grad_norm": 8.752140998840332, + "learning_rate": 3.4716843321103418e-06, + "loss": 2.775, + "step": 76900 + }, + { + "epoch": 5.225234406848757, + "grad_norm": 7.575557708740234, + "learning_rate": 3.471259682022014e-06, + "loss": 2.9518, + "step": 76905 + }, + { + "epoch": 5.225574126919418, + "grad_norm": 7.556154251098633, + "learning_rate": 3.470835031933687e-06, + "loss": 2.874, + "step": 76910 + }, + { + "epoch": 5.22591384699008, + "grad_norm": 7.763406753540039, + "learning_rate": 3.4704103818453598e-06, + "loss": 2.8016, + "step": 76915 + }, + { + "epoch": 5.226253567060742, + "grad_norm": 8.651609420776367, + "learning_rate": 3.4699857317570326e-06, + "loss": 2.8499, + "step": 76920 + }, + { + "epoch": 5.226593287131403, + "grad_norm": 7.648528099060059, + "learning_rate": 3.4695610816687054e-06, + "loss": 2.9832, + "step": 76925 + }, + { + "epoch": 5.226933007202065, + "grad_norm": 6.143137454986572, + "learning_rate": 3.469136431580378e-06, + "loss": 3.0739, + "step": 76930 + }, + { + "epoch": 5.2272727272727275, + "grad_norm": 8.343608856201172, + "learning_rate": 3.4687117814920505e-06, + "loss": 2.6927, + "step": 76935 + }, + { + "epoch": 5.227612447343389, + "grad_norm": 7.017902851104736, + "learning_rate": 3.4682871314037238e-06, + "loss": 2.6305, + "step": 76940 + }, + { + "epoch": 5.227952167414051, + "grad_norm": 6.66448450088501, + "learning_rate": 3.4678624813153966e-06, + "loss": 2.8004, + "step": 76945 + }, + { + "epoch": 5.228291887484713, + "grad_norm": 7.500174522399902, + "learning_rate": 3.467437831227069e-06, + "loss": 2.7029, + "step": 76950 + }, + { + "epoch": 5.228631607555374, + "grad_norm": 6.266868591308594, + "learning_rate": 3.467013181138742e-06, + "loss": 2.8962, + "step": 76955 + }, + { + "epoch": 5.228971327626036, + "grad_norm": 7.309754848480225, + "learning_rate": 3.466588531050415e-06, + "loss": 2.7827, + "step": 76960 + }, + { + "epoch": 5.229311047696698, + "grad_norm": 7.662424087524414, + "learning_rate": 3.4661638809620873e-06, + "loss": 2.8312, + "step": 76965 + }, + { + "epoch": 5.229650767767359, + "grad_norm": 7.675118923187256, + "learning_rate": 3.46573923087376e-06, + "loss": 2.802, + "step": 76970 + }, + { + "epoch": 5.2299904878380215, + "grad_norm": 7.722368240356445, + "learning_rate": 3.4653145807854334e-06, + "loss": 2.9151, + "step": 76975 + }, + { + "epoch": 5.2303302079086835, + "grad_norm": 7.323424816131592, + "learning_rate": 3.4648899306971057e-06, + "loss": 2.8728, + "step": 76980 + }, + { + "epoch": 5.230669927979345, + "grad_norm": 8.326774597167969, + "learning_rate": 3.4644652806087785e-06, + "loss": 2.763, + "step": 76985 + }, + { + "epoch": 5.231009648050007, + "grad_norm": 7.992800235748291, + "learning_rate": 3.4640406305204518e-06, + "loss": 2.6746, + "step": 76990 + }, + { + "epoch": 5.231349368120669, + "grad_norm": 7.318576335906982, + "learning_rate": 3.463615980432124e-06, + "loss": 2.8376, + "step": 76995 + }, + { + "epoch": 5.23168908819133, + "grad_norm": 7.667619228363037, + "learning_rate": 3.463191330343797e-06, + "loss": 2.8353, + "step": 77000 + }, + { + "epoch": 5.232028808261992, + "grad_norm": 8.103976249694824, + "learning_rate": 3.4627666802554697e-06, + "loss": 2.7237, + "step": 77005 + }, + { + "epoch": 5.232368528332654, + "grad_norm": 8.016229629516602, + "learning_rate": 3.462342030167142e-06, + "loss": 2.6731, + "step": 77010 + }, + { + "epoch": 5.232708248403315, + "grad_norm": 6.487110614776611, + "learning_rate": 3.4619173800788153e-06, + "loss": 2.715, + "step": 77015 + }, + { + "epoch": 5.2330479684739775, + "grad_norm": 7.825606822967529, + "learning_rate": 3.461492729990488e-06, + "loss": 2.762, + "step": 77020 + }, + { + "epoch": 5.2333876885446395, + "grad_norm": 7.8768510818481445, + "learning_rate": 3.4610680799021605e-06, + "loss": 2.9049, + "step": 77025 + }, + { + "epoch": 5.233727408615301, + "grad_norm": 6.580146312713623, + "learning_rate": 3.4606434298138338e-06, + "loss": 2.8238, + "step": 77030 + }, + { + "epoch": 5.234067128685963, + "grad_norm": 9.809882164001465, + "learning_rate": 3.4602187797255066e-06, + "loss": 2.821, + "step": 77035 + }, + { + "epoch": 5.234406848756625, + "grad_norm": 6.153665065765381, + "learning_rate": 3.459794129637179e-06, + "loss": 2.8626, + "step": 77040 + }, + { + "epoch": 5.234746568827286, + "grad_norm": 7.6021904945373535, + "learning_rate": 3.4593694795488517e-06, + "loss": 2.6956, + "step": 77045 + }, + { + "epoch": 5.235086288897948, + "grad_norm": 6.959191799163818, + "learning_rate": 3.458944829460525e-06, + "loss": 2.8766, + "step": 77050 + }, + { + "epoch": 5.23542600896861, + "grad_norm": 7.275630950927734, + "learning_rate": 3.4585201793721978e-06, + "loss": 2.7619, + "step": 77055 + }, + { + "epoch": 5.235765729039271, + "grad_norm": 7.000885486602783, + "learning_rate": 3.45809552928387e-06, + "loss": 2.5811, + "step": 77060 + }, + { + "epoch": 5.2361054491099335, + "grad_norm": 8.874237060546875, + "learning_rate": 3.4576708791955434e-06, + "loss": 2.837, + "step": 77065 + }, + { + "epoch": 5.2364451691805955, + "grad_norm": 8.540796279907227, + "learning_rate": 3.457246229107216e-06, + "loss": 2.9828, + "step": 77070 + }, + { + "epoch": 5.236784889251257, + "grad_norm": 6.641334533691406, + "learning_rate": 3.4568215790188885e-06, + "loss": 2.7549, + "step": 77075 + }, + { + "epoch": 5.237124609321919, + "grad_norm": 7.73248291015625, + "learning_rate": 3.4563969289305618e-06, + "loss": 2.8665, + "step": 77080 + }, + { + "epoch": 5.237464329392581, + "grad_norm": 8.341876983642578, + "learning_rate": 3.4559722788422346e-06, + "loss": 2.9659, + "step": 77085 + }, + { + "epoch": 5.237804049463242, + "grad_norm": 7.86350154876709, + "learning_rate": 3.455547628753907e-06, + "loss": 2.9063, + "step": 77090 + }, + { + "epoch": 5.238143769533904, + "grad_norm": 8.361855506896973, + "learning_rate": 3.4551229786655797e-06, + "loss": 2.9071, + "step": 77095 + }, + { + "epoch": 5.238483489604566, + "grad_norm": 6.932414531707764, + "learning_rate": 3.454698328577253e-06, + "loss": 2.8526, + "step": 77100 + }, + { + "epoch": 5.238823209675227, + "grad_norm": 7.343037128448486, + "learning_rate": 3.4542736784889253e-06, + "loss": 2.6488, + "step": 77105 + }, + { + "epoch": 5.2391629297458895, + "grad_norm": 9.269522666931152, + "learning_rate": 3.453849028400598e-06, + "loss": 2.7862, + "step": 77110 + }, + { + "epoch": 5.239502649816552, + "grad_norm": 5.42886209487915, + "learning_rate": 3.4534243783122714e-06, + "loss": 2.6224, + "step": 77115 + }, + { + "epoch": 5.239842369887213, + "grad_norm": 6.801142692565918, + "learning_rate": 3.4529997282239437e-06, + "loss": 2.8567, + "step": 77120 + }, + { + "epoch": 5.240182089957875, + "grad_norm": 7.761082172393799, + "learning_rate": 3.4525750781356165e-06, + "loss": 2.6402, + "step": 77125 + }, + { + "epoch": 5.240521810028537, + "grad_norm": 7.779077529907227, + "learning_rate": 3.4521504280472893e-06, + "loss": 2.7957, + "step": 77130 + }, + { + "epoch": 5.240861530099198, + "grad_norm": 8.658432006835938, + "learning_rate": 3.4517257779589617e-06, + "loss": 2.884, + "step": 77135 + }, + { + "epoch": 5.24120125016986, + "grad_norm": 8.147982597351074, + "learning_rate": 3.451301127870635e-06, + "loss": 2.9975, + "step": 77140 + }, + { + "epoch": 5.241540970240522, + "grad_norm": 6.807070255279541, + "learning_rate": 3.4508764777823077e-06, + "loss": 2.8304, + "step": 77145 + }, + { + "epoch": 5.241880690311183, + "grad_norm": 6.808895111083984, + "learning_rate": 3.45045182769398e-06, + "loss": 2.9075, + "step": 77150 + }, + { + "epoch": 5.2422204103818455, + "grad_norm": 5.885965824127197, + "learning_rate": 3.4500271776056533e-06, + "loss": 2.7235, + "step": 77155 + }, + { + "epoch": 5.242560130452507, + "grad_norm": 7.580207824707031, + "learning_rate": 3.449602527517326e-06, + "loss": 2.7843, + "step": 77160 + }, + { + "epoch": 5.242899850523169, + "grad_norm": 6.4630022048950195, + "learning_rate": 3.4491778774289985e-06, + "loss": 2.8584, + "step": 77165 + }, + { + "epoch": 5.243239570593831, + "grad_norm": 8.398513793945312, + "learning_rate": 3.4487532273406713e-06, + "loss": 2.9689, + "step": 77170 + }, + { + "epoch": 5.243579290664492, + "grad_norm": 9.906005859375, + "learning_rate": 3.4483285772523445e-06, + "loss": 2.63, + "step": 77175 + }, + { + "epoch": 5.243919010735154, + "grad_norm": 9.927313804626465, + "learning_rate": 3.447903927164017e-06, + "loss": 2.8792, + "step": 77180 + }, + { + "epoch": 5.244258730805816, + "grad_norm": 8.944010734558105, + "learning_rate": 3.4474792770756897e-06, + "loss": 2.6906, + "step": 77185 + }, + { + "epoch": 5.244598450876477, + "grad_norm": 6.983775615692139, + "learning_rate": 3.447054626987363e-06, + "loss": 2.526, + "step": 77190 + }, + { + "epoch": 5.244938170947139, + "grad_norm": 7.929068565368652, + "learning_rate": 3.4466299768990353e-06, + "loss": 3.0251, + "step": 77195 + }, + { + "epoch": 5.2452778910178015, + "grad_norm": 7.002962112426758, + "learning_rate": 3.446205326810708e-06, + "loss": 2.8586, + "step": 77200 + }, + { + "epoch": 5.245617611088463, + "grad_norm": 6.901952266693115, + "learning_rate": 3.4457806767223814e-06, + "loss": 2.8933, + "step": 77205 + }, + { + "epoch": 5.245957331159125, + "grad_norm": 7.730902194976807, + "learning_rate": 3.4453560266340537e-06, + "loss": 2.6168, + "step": 77210 + }, + { + "epoch": 5.246297051229787, + "grad_norm": 7.559895038604736, + "learning_rate": 3.4449313765457265e-06, + "loss": 2.7697, + "step": 77215 + }, + { + "epoch": 5.246636771300448, + "grad_norm": 8.313554763793945, + "learning_rate": 3.4445067264573993e-06, + "loss": 2.8664, + "step": 77220 + }, + { + "epoch": 5.24697649137111, + "grad_norm": 7.487074851989746, + "learning_rate": 3.4440820763690726e-06, + "loss": 2.6306, + "step": 77225 + }, + { + "epoch": 5.247316211441772, + "grad_norm": 8.903019905090332, + "learning_rate": 3.443657426280745e-06, + "loss": 2.855, + "step": 77230 + }, + { + "epoch": 5.247655931512433, + "grad_norm": 6.866024971008301, + "learning_rate": 3.4432327761924177e-06, + "loss": 2.6109, + "step": 77235 + }, + { + "epoch": 5.2479956515830954, + "grad_norm": 6.917783260345459, + "learning_rate": 3.442808126104091e-06, + "loss": 2.7117, + "step": 77240 + }, + { + "epoch": 5.2483353716537575, + "grad_norm": 7.553918838500977, + "learning_rate": 3.4423834760157633e-06, + "loss": 2.8467, + "step": 77245 + }, + { + "epoch": 5.248675091724419, + "grad_norm": 6.705289363861084, + "learning_rate": 3.441958825927436e-06, + "loss": 2.8617, + "step": 77250 + }, + { + "epoch": 5.249014811795081, + "grad_norm": 6.505599021911621, + "learning_rate": 3.441534175839109e-06, + "loss": 2.6075, + "step": 77255 + }, + { + "epoch": 5.249354531865743, + "grad_norm": 5.789057731628418, + "learning_rate": 3.4411095257507813e-06, + "loss": 2.794, + "step": 77260 + }, + { + "epoch": 5.249694251936404, + "grad_norm": 8.30710220336914, + "learning_rate": 3.4406848756624545e-06, + "loss": 2.8145, + "step": 77265 + }, + { + "epoch": 5.250033972007066, + "grad_norm": 7.907102108001709, + "learning_rate": 3.4402602255741273e-06, + "loss": 2.8438, + "step": 77270 + }, + { + "epoch": 5.250373692077728, + "grad_norm": 8.446694374084473, + "learning_rate": 3.4398355754857997e-06, + "loss": 2.8435, + "step": 77275 + }, + { + "epoch": 5.250713412148389, + "grad_norm": 6.897339820861816, + "learning_rate": 3.439410925397473e-06, + "loss": 2.9603, + "step": 77280 + }, + { + "epoch": 5.2510531322190515, + "grad_norm": 8.28187084197998, + "learning_rate": 3.4389862753091457e-06, + "loss": 2.8732, + "step": 77285 + }, + { + "epoch": 5.2513928522897135, + "grad_norm": 6.105609893798828, + "learning_rate": 3.438561625220818e-06, + "loss": 2.8003, + "step": 77290 + }, + { + "epoch": 5.251732572360375, + "grad_norm": 7.184239387512207, + "learning_rate": 3.438136975132491e-06, + "loss": 2.7738, + "step": 77295 + }, + { + "epoch": 5.252072292431037, + "grad_norm": 8.083962440490723, + "learning_rate": 3.437712325044164e-06, + "loss": 2.4608, + "step": 77300 + }, + { + "epoch": 5.252412012501699, + "grad_norm": 6.620611667633057, + "learning_rate": 3.4372876749558365e-06, + "loss": 2.904, + "step": 77305 + }, + { + "epoch": 5.25275173257236, + "grad_norm": 7.671315670013428, + "learning_rate": 3.4368630248675093e-06, + "loss": 2.7891, + "step": 77310 + }, + { + "epoch": 5.253091452643022, + "grad_norm": 6.453346252441406, + "learning_rate": 3.4364383747791825e-06, + "loss": 2.8392, + "step": 77315 + }, + { + "epoch": 5.253431172713684, + "grad_norm": 6.102938175201416, + "learning_rate": 3.436013724690855e-06, + "loss": 2.734, + "step": 77320 + }, + { + "epoch": 5.253770892784345, + "grad_norm": 6.920714378356934, + "learning_rate": 3.4355890746025277e-06, + "loss": 3.0689, + "step": 77325 + }, + { + "epoch": 5.2541106128550075, + "grad_norm": 7.888373851776123, + "learning_rate": 3.4351644245142005e-06, + "loss": 2.7896, + "step": 77330 + }, + { + "epoch": 5.2544503329256695, + "grad_norm": 7.798133850097656, + "learning_rate": 3.4347397744258733e-06, + "loss": 2.889, + "step": 77335 + }, + { + "epoch": 5.254790052996331, + "grad_norm": 5.335867404937744, + "learning_rate": 3.434315124337546e-06, + "loss": 2.9692, + "step": 77340 + }, + { + "epoch": 5.255129773066993, + "grad_norm": 7.671693325042725, + "learning_rate": 3.433890474249219e-06, + "loss": 2.8975, + "step": 77345 + }, + { + "epoch": 5.255469493137655, + "grad_norm": 7.3644185066223145, + "learning_rate": 3.4334658241608913e-06, + "loss": 2.9234, + "step": 77350 + }, + { + "epoch": 5.255809213208316, + "grad_norm": 6.816401958465576, + "learning_rate": 3.4330411740725645e-06, + "loss": 2.738, + "step": 77355 + }, + { + "epoch": 5.256148933278978, + "grad_norm": 7.17115592956543, + "learning_rate": 3.4326165239842373e-06, + "loss": 2.8007, + "step": 77360 + }, + { + "epoch": 5.25648865334964, + "grad_norm": 6.851750373840332, + "learning_rate": 3.4321918738959097e-06, + "loss": 2.9274, + "step": 77365 + }, + { + "epoch": 5.256828373420301, + "grad_norm": 6.130197048187256, + "learning_rate": 3.431767223807583e-06, + "loss": 3.0286, + "step": 77370 + }, + { + "epoch": 5.2571680934909635, + "grad_norm": 7.105106830596924, + "learning_rate": 3.4313425737192557e-06, + "loss": 2.7838, + "step": 77375 + }, + { + "epoch": 5.2575078135616256, + "grad_norm": 9.906761169433594, + "learning_rate": 3.430917923630928e-06, + "loss": 2.8919, + "step": 77380 + }, + { + "epoch": 5.257847533632287, + "grad_norm": 8.293594360351562, + "learning_rate": 3.430493273542601e-06, + "loss": 2.7605, + "step": 77385 + }, + { + "epoch": 5.258187253702949, + "grad_norm": 7.853929042816162, + "learning_rate": 3.430068623454274e-06, + "loss": 2.7478, + "step": 77390 + }, + { + "epoch": 5.258526973773611, + "grad_norm": 8.817373275756836, + "learning_rate": 3.429643973365947e-06, + "loss": 2.8748, + "step": 77395 + }, + { + "epoch": 5.258866693844272, + "grad_norm": 6.5900187492370605, + "learning_rate": 3.4292193232776193e-06, + "loss": 2.7166, + "step": 77400 + }, + { + "epoch": 5.259206413914934, + "grad_norm": 8.172823905944824, + "learning_rate": 3.4287946731892925e-06, + "loss": 2.8278, + "step": 77405 + }, + { + "epoch": 5.259546133985596, + "grad_norm": 6.3473405838012695, + "learning_rate": 3.4283700231009653e-06, + "loss": 2.812, + "step": 77410 + }, + { + "epoch": 5.259885854056257, + "grad_norm": 8.090889930725098, + "learning_rate": 3.4279453730126377e-06, + "loss": 2.7323, + "step": 77415 + }, + { + "epoch": 5.2602255741269195, + "grad_norm": 8.077652931213379, + "learning_rate": 3.4275207229243105e-06, + "loss": 3.0785, + "step": 77420 + }, + { + "epoch": 5.260565294197582, + "grad_norm": 9.152837753295898, + "learning_rate": 3.4270960728359837e-06, + "loss": 2.616, + "step": 77425 + }, + { + "epoch": 5.260905014268243, + "grad_norm": 9.406628608703613, + "learning_rate": 3.426671422747656e-06, + "loss": 2.9606, + "step": 77430 + }, + { + "epoch": 5.261244734338905, + "grad_norm": 8.271830558776855, + "learning_rate": 3.426246772659329e-06, + "loss": 2.8212, + "step": 77435 + }, + { + "epoch": 5.261584454409567, + "grad_norm": 5.811341762542725, + "learning_rate": 3.425822122571002e-06, + "loss": 2.8599, + "step": 77440 + }, + { + "epoch": 5.261924174480228, + "grad_norm": 8.125734329223633, + "learning_rate": 3.4253974724826745e-06, + "loss": 2.8872, + "step": 77445 + }, + { + "epoch": 5.26226389455089, + "grad_norm": 7.977941989898682, + "learning_rate": 3.4249728223943473e-06, + "loss": 2.7538, + "step": 77450 + }, + { + "epoch": 5.262603614621552, + "grad_norm": 8.817922592163086, + "learning_rate": 3.42454817230602e-06, + "loss": 2.901, + "step": 77455 + }, + { + "epoch": 5.262943334692213, + "grad_norm": 7.779094219207764, + "learning_rate": 3.424123522217693e-06, + "loss": 2.8218, + "step": 77460 + }, + { + "epoch": 5.2632830547628755, + "grad_norm": 6.0766801834106445, + "learning_rate": 3.4236988721293657e-06, + "loss": 2.9526, + "step": 77465 + }, + { + "epoch": 5.263622774833538, + "grad_norm": 8.33569049835205, + "learning_rate": 3.4232742220410385e-06, + "loss": 2.7963, + "step": 77470 + }, + { + "epoch": 5.263962494904199, + "grad_norm": 8.34500503540039, + "learning_rate": 3.422849571952711e-06, + "loss": 2.9702, + "step": 77475 + }, + { + "epoch": 5.264302214974861, + "grad_norm": 10.140252113342285, + "learning_rate": 3.422424921864384e-06, + "loss": 2.742, + "step": 77480 + }, + { + "epoch": 5.264641935045523, + "grad_norm": 6.1950507164001465, + "learning_rate": 3.422000271776057e-06, + "loss": 2.6977, + "step": 77485 + }, + { + "epoch": 5.264981655116184, + "grad_norm": 7.082727909088135, + "learning_rate": 3.4215756216877293e-06, + "loss": 2.6584, + "step": 77490 + }, + { + "epoch": 5.265321375186846, + "grad_norm": 8.523783683776855, + "learning_rate": 3.4211509715994025e-06, + "loss": 2.7047, + "step": 77495 + }, + { + "epoch": 5.265661095257508, + "grad_norm": 7.615654945373535, + "learning_rate": 3.4207263215110753e-06, + "loss": 2.9846, + "step": 77500 + }, + { + "epoch": 5.266000815328169, + "grad_norm": 7.224148750305176, + "learning_rate": 3.4203016714227477e-06, + "loss": 2.729, + "step": 77505 + }, + { + "epoch": 5.2663405353988315, + "grad_norm": 7.79769229888916, + "learning_rate": 3.4198770213344205e-06, + "loss": 2.7262, + "step": 77510 + }, + { + "epoch": 5.266680255469494, + "grad_norm": 5.738525390625, + "learning_rate": 3.4194523712460937e-06, + "loss": 2.9318, + "step": 77515 + }, + { + "epoch": 5.267019975540155, + "grad_norm": 10.436843872070312, + "learning_rate": 3.419027721157766e-06, + "loss": 2.9103, + "step": 77520 + }, + { + "epoch": 5.267359695610817, + "grad_norm": 6.175951957702637, + "learning_rate": 3.418603071069439e-06, + "loss": 2.8035, + "step": 77525 + }, + { + "epoch": 5.267699415681479, + "grad_norm": 7.581534385681152, + "learning_rate": 3.418178420981112e-06, + "loss": 2.7585, + "step": 77530 + }, + { + "epoch": 5.26803913575214, + "grad_norm": 9.37668228149414, + "learning_rate": 3.4177537708927845e-06, + "loss": 2.8204, + "step": 77535 + }, + { + "epoch": 5.268378855822802, + "grad_norm": 7.687971591949463, + "learning_rate": 3.4173291208044573e-06, + "loss": 2.8353, + "step": 77540 + }, + { + "epoch": 5.268718575893463, + "grad_norm": 6.987886905670166, + "learning_rate": 3.41690447071613e-06, + "loss": 2.8706, + "step": 77545 + }, + { + "epoch": 5.2690582959641254, + "grad_norm": 7.324626922607422, + "learning_rate": 3.4164798206278025e-06, + "loss": 2.8214, + "step": 77550 + }, + { + "epoch": 5.2693980160347875, + "grad_norm": 7.636943340301514, + "learning_rate": 3.4160551705394757e-06, + "loss": 2.69, + "step": 77555 + }, + { + "epoch": 5.269737736105449, + "grad_norm": 7.294870376586914, + "learning_rate": 3.4156305204511485e-06, + "loss": 2.6972, + "step": 77560 + }, + { + "epoch": 5.270077456176111, + "grad_norm": 8.184235572814941, + "learning_rate": 3.4152058703628217e-06, + "loss": 2.9588, + "step": 77565 + }, + { + "epoch": 5.270417176246773, + "grad_norm": 7.821312427520752, + "learning_rate": 3.414781220274494e-06, + "loss": 3.0468, + "step": 77570 + }, + { + "epoch": 5.270756896317434, + "grad_norm": 7.166367530822754, + "learning_rate": 3.414356570186167e-06, + "loss": 3.0695, + "step": 77575 + }, + { + "epoch": 5.271096616388096, + "grad_norm": 9.630763053894043, + "learning_rate": 3.4139319200978397e-06, + "loss": 2.7668, + "step": 77580 + }, + { + "epoch": 5.271436336458758, + "grad_norm": 8.40422248840332, + "learning_rate": 3.4135072700095125e-06, + "loss": 2.7456, + "step": 77585 + }, + { + "epoch": 5.271776056529419, + "grad_norm": 6.82971715927124, + "learning_rate": 3.4130826199211853e-06, + "loss": 2.6424, + "step": 77590 + }, + { + "epoch": 5.2721157766000815, + "grad_norm": 7.172990798950195, + "learning_rate": 3.412657969832858e-06, + "loss": 2.8719, + "step": 77595 + }, + { + "epoch": 5.2724554966707435, + "grad_norm": 8.323954582214355, + "learning_rate": 3.4122333197445305e-06, + "loss": 2.8698, + "step": 77600 + }, + { + "epoch": 5.272795216741405, + "grad_norm": 7.592198848724365, + "learning_rate": 3.4118086696562037e-06, + "loss": 2.648, + "step": 77605 + }, + { + "epoch": 5.273134936812067, + "grad_norm": 8.174111366271973, + "learning_rate": 3.4113840195678765e-06, + "loss": 2.7246, + "step": 77610 + }, + { + "epoch": 5.273474656882729, + "grad_norm": 7.222848892211914, + "learning_rate": 3.410959369479549e-06, + "loss": 2.7163, + "step": 77615 + }, + { + "epoch": 5.27381437695339, + "grad_norm": 7.598320960998535, + "learning_rate": 3.410534719391222e-06, + "loss": 2.9835, + "step": 77620 + }, + { + "epoch": 5.274154097024052, + "grad_norm": 9.622547149658203, + "learning_rate": 3.410110069302895e-06, + "loss": 2.7022, + "step": 77625 + }, + { + "epoch": 5.274493817094714, + "grad_norm": 7.53154182434082, + "learning_rate": 3.4096854192145673e-06, + "loss": 2.9191, + "step": 77630 + }, + { + "epoch": 5.274833537165375, + "grad_norm": 7.888096332550049, + "learning_rate": 3.40926076912624e-06, + "loss": 2.9842, + "step": 77635 + }, + { + "epoch": 5.2751732572360375, + "grad_norm": 7.704676151275635, + "learning_rate": 3.4088361190379133e-06, + "loss": 2.775, + "step": 77640 + }, + { + "epoch": 5.2755129773066995, + "grad_norm": 6.6442179679870605, + "learning_rate": 3.4084114689495857e-06, + "loss": 2.7712, + "step": 77645 + }, + { + "epoch": 5.275852697377361, + "grad_norm": 8.074427604675293, + "learning_rate": 3.4079868188612585e-06, + "loss": 2.6857, + "step": 77650 + }, + { + "epoch": 5.276192417448023, + "grad_norm": 7.102512836456299, + "learning_rate": 3.4075621687729317e-06, + "loss": 2.8524, + "step": 77655 + }, + { + "epoch": 5.276532137518685, + "grad_norm": 6.644049644470215, + "learning_rate": 3.407137518684604e-06, + "loss": 2.8562, + "step": 77660 + }, + { + "epoch": 5.276871857589346, + "grad_norm": 8.377463340759277, + "learning_rate": 3.406712868596277e-06, + "loss": 2.6305, + "step": 77665 + }, + { + "epoch": 5.277211577660008, + "grad_norm": 6.762970447540283, + "learning_rate": 3.4062882185079497e-06, + "loss": 2.9044, + "step": 77670 + }, + { + "epoch": 5.27755129773067, + "grad_norm": 7.313464164733887, + "learning_rate": 3.405863568419622e-06, + "loss": 2.8132, + "step": 77675 + }, + { + "epoch": 5.277891017801331, + "grad_norm": 8.655638694763184, + "learning_rate": 3.4054389183312953e-06, + "loss": 2.5283, + "step": 77680 + }, + { + "epoch": 5.2782307378719935, + "grad_norm": 6.775223731994629, + "learning_rate": 3.405014268242968e-06, + "loss": 2.8627, + "step": 77685 + }, + { + "epoch": 5.278570457942656, + "grad_norm": 9.770163536071777, + "learning_rate": 3.4045896181546405e-06, + "loss": 2.7045, + "step": 77690 + }, + { + "epoch": 5.278910178013317, + "grad_norm": 7.40760612487793, + "learning_rate": 3.4041649680663137e-06, + "loss": 2.8968, + "step": 77695 + }, + { + "epoch": 5.279249898083979, + "grad_norm": 5.859793186187744, + "learning_rate": 3.4037403179779865e-06, + "loss": 2.6237, + "step": 77700 + }, + { + "epoch": 5.279589618154641, + "grad_norm": 7.113816738128662, + "learning_rate": 3.403315667889659e-06, + "loss": 3.0969, + "step": 77705 + }, + { + "epoch": 5.279929338225302, + "grad_norm": 7.149500846862793, + "learning_rate": 3.402891017801332e-06, + "loss": 2.5229, + "step": 77710 + }, + { + "epoch": 5.280269058295964, + "grad_norm": 7.779094696044922, + "learning_rate": 3.402466367713005e-06, + "loss": 2.7349, + "step": 77715 + }, + { + "epoch": 5.280608778366626, + "grad_norm": 7.285008907318115, + "learning_rate": 3.4020417176246773e-06, + "loss": 2.8607, + "step": 77720 + }, + { + "epoch": 5.280948498437287, + "grad_norm": 10.27237606048584, + "learning_rate": 3.40161706753635e-06, + "loss": 2.9234, + "step": 77725 + }, + { + "epoch": 5.2812882185079495, + "grad_norm": 9.722543716430664, + "learning_rate": 3.4011924174480233e-06, + "loss": 2.706, + "step": 77730 + }, + { + "epoch": 5.281627938578612, + "grad_norm": 7.4465460777282715, + "learning_rate": 3.400767767359696e-06, + "loss": 2.8974, + "step": 77735 + }, + { + "epoch": 5.281967658649273, + "grad_norm": 7.340736389160156, + "learning_rate": 3.4003431172713685e-06, + "loss": 2.6761, + "step": 77740 + }, + { + "epoch": 5.282307378719935, + "grad_norm": 7.840407371520996, + "learning_rate": 3.3999184671830417e-06, + "loss": 2.7326, + "step": 77745 + }, + { + "epoch": 5.282647098790597, + "grad_norm": 6.346174240112305, + "learning_rate": 3.3994938170947145e-06, + "loss": 2.6192, + "step": 77750 + }, + { + "epoch": 5.282986818861258, + "grad_norm": 7.116240501403809, + "learning_rate": 3.399069167006387e-06, + "loss": 2.8331, + "step": 77755 + }, + { + "epoch": 5.28332653893192, + "grad_norm": 9.806361198425293, + "learning_rate": 3.3986445169180597e-06, + "loss": 2.8832, + "step": 77760 + }, + { + "epoch": 5.283666259002582, + "grad_norm": 7.483185768127441, + "learning_rate": 3.398219866829733e-06, + "loss": 2.3711, + "step": 77765 + }, + { + "epoch": 5.284005979073243, + "grad_norm": 6.887763023376465, + "learning_rate": 3.3977952167414053e-06, + "loss": 2.7686, + "step": 77770 + }, + { + "epoch": 5.2843456991439055, + "grad_norm": 7.30866813659668, + "learning_rate": 3.397370566653078e-06, + "loss": 2.7655, + "step": 77775 + }, + { + "epoch": 5.284685419214568, + "grad_norm": 8.583504676818848, + "learning_rate": 3.3969459165647513e-06, + "loss": 2.7292, + "step": 77780 + }, + { + "epoch": 5.285025139285229, + "grad_norm": 7.9636969566345215, + "learning_rate": 3.3965212664764237e-06, + "loss": 2.7497, + "step": 77785 + }, + { + "epoch": 5.285364859355891, + "grad_norm": 7.937514781951904, + "learning_rate": 3.3960966163880965e-06, + "loss": 2.9016, + "step": 77790 + }, + { + "epoch": 5.285704579426553, + "grad_norm": 7.831546306610107, + "learning_rate": 3.3956719662997693e-06, + "loss": 2.7332, + "step": 77795 + }, + { + "epoch": 5.286044299497214, + "grad_norm": 8.366074562072754, + "learning_rate": 3.3952473162114417e-06, + "loss": 2.7637, + "step": 77800 + }, + { + "epoch": 5.286384019567876, + "grad_norm": 6.254373550415039, + "learning_rate": 3.394822666123115e-06, + "loss": 2.8357, + "step": 77805 + }, + { + "epoch": 5.286723739638538, + "grad_norm": 8.068448066711426, + "learning_rate": 3.3943980160347877e-06, + "loss": 2.8848, + "step": 77810 + }, + { + "epoch": 5.287063459709199, + "grad_norm": 9.041193962097168, + "learning_rate": 3.39397336594646e-06, + "loss": 2.7865, + "step": 77815 + }, + { + "epoch": 5.2874031797798615, + "grad_norm": 7.638749122619629, + "learning_rate": 3.3935487158581333e-06, + "loss": 2.8195, + "step": 77820 + }, + { + "epoch": 5.287742899850523, + "grad_norm": 5.726134300231934, + "learning_rate": 3.393124065769806e-06, + "loss": 2.9086, + "step": 77825 + }, + { + "epoch": 5.288082619921185, + "grad_norm": 6.790616035461426, + "learning_rate": 3.3926994156814785e-06, + "loss": 2.641, + "step": 77830 + }, + { + "epoch": 5.288422339991847, + "grad_norm": 7.65903377532959, + "learning_rate": 3.3922747655931513e-06, + "loss": 2.9387, + "step": 77835 + }, + { + "epoch": 5.288762060062508, + "grad_norm": 7.268938064575195, + "learning_rate": 3.3918501155048245e-06, + "loss": 3.0135, + "step": 77840 + }, + { + "epoch": 5.28910178013317, + "grad_norm": 9.182209014892578, + "learning_rate": 3.391425465416497e-06, + "loss": 2.9647, + "step": 77845 + }, + { + "epoch": 5.289441500203832, + "grad_norm": 7.285344123840332, + "learning_rate": 3.3910008153281697e-06, + "loss": 2.7566, + "step": 77850 + }, + { + "epoch": 5.289781220274493, + "grad_norm": 8.429323196411133, + "learning_rate": 3.390576165239843e-06, + "loss": 2.8376, + "step": 77855 + }, + { + "epoch": 5.2901209403451555, + "grad_norm": 8.23072338104248, + "learning_rate": 3.3901515151515153e-06, + "loss": 2.8973, + "step": 77860 + }, + { + "epoch": 5.2904606604158175, + "grad_norm": 7.51400089263916, + "learning_rate": 3.389726865063188e-06, + "loss": 2.6281, + "step": 77865 + }, + { + "epoch": 5.290800380486479, + "grad_norm": 5.522827625274658, + "learning_rate": 3.3893022149748613e-06, + "loss": 2.6468, + "step": 77870 + }, + { + "epoch": 5.291140100557141, + "grad_norm": 7.368304252624512, + "learning_rate": 3.3888775648865337e-06, + "loss": 3.0012, + "step": 77875 + }, + { + "epoch": 5.291479820627803, + "grad_norm": 9.761494636535645, + "learning_rate": 3.3884529147982065e-06, + "loss": 2.7338, + "step": 77880 + }, + { + "epoch": 5.291819540698464, + "grad_norm": 6.946755409240723, + "learning_rate": 3.3880282647098793e-06, + "loss": 2.6549, + "step": 77885 + }, + { + "epoch": 5.292159260769126, + "grad_norm": 7.262035369873047, + "learning_rate": 3.3876036146215517e-06, + "loss": 2.9683, + "step": 77890 + }, + { + "epoch": 5.292498980839788, + "grad_norm": 8.677509307861328, + "learning_rate": 3.387178964533225e-06, + "loss": 2.7595, + "step": 77895 + }, + { + "epoch": 5.292838700910449, + "grad_norm": 8.654637336730957, + "learning_rate": 3.3867543144448977e-06, + "loss": 2.8637, + "step": 77900 + }, + { + "epoch": 5.2931784209811115, + "grad_norm": 8.248823165893555, + "learning_rate": 3.386329664356571e-06, + "loss": 2.8154, + "step": 77905 + }, + { + "epoch": 5.2935181410517735, + "grad_norm": 9.103212356567383, + "learning_rate": 3.3859050142682433e-06, + "loss": 2.732, + "step": 77910 + }, + { + "epoch": 5.293857861122435, + "grad_norm": 8.021491050720215, + "learning_rate": 3.385480364179916e-06, + "loss": 2.8273, + "step": 77915 + }, + { + "epoch": 5.294197581193097, + "grad_norm": 9.134908676147461, + "learning_rate": 3.385055714091589e-06, + "loss": 2.7867, + "step": 77920 + }, + { + "epoch": 5.294537301263759, + "grad_norm": 8.123705863952637, + "learning_rate": 3.3846310640032613e-06, + "loss": 2.6834, + "step": 77925 + }, + { + "epoch": 5.29487702133442, + "grad_norm": 8.142678260803223, + "learning_rate": 3.3842064139149345e-06, + "loss": 2.7622, + "step": 77930 + }, + { + "epoch": 5.295216741405082, + "grad_norm": 6.167224884033203, + "learning_rate": 3.3837817638266073e-06, + "loss": 2.8919, + "step": 77935 + }, + { + "epoch": 5.295556461475744, + "grad_norm": 9.539207458496094, + "learning_rate": 3.3833571137382797e-06, + "loss": 2.8241, + "step": 77940 + }, + { + "epoch": 5.295896181546405, + "grad_norm": 8.77602481842041, + "learning_rate": 3.382932463649953e-06, + "loss": 2.9274, + "step": 77945 + }, + { + "epoch": 5.2962359016170675, + "grad_norm": 6.84126615524292, + "learning_rate": 3.3825078135616257e-06, + "loss": 2.9285, + "step": 77950 + }, + { + "epoch": 5.2965756216877296, + "grad_norm": 7.3427042961120605, + "learning_rate": 3.382083163473298e-06, + "loss": 2.877, + "step": 77955 + }, + { + "epoch": 5.296915341758391, + "grad_norm": 7.478747844696045, + "learning_rate": 3.381658513384971e-06, + "loss": 2.6372, + "step": 77960 + }, + { + "epoch": 5.297255061829053, + "grad_norm": 8.605347633361816, + "learning_rate": 3.381233863296644e-06, + "loss": 2.7426, + "step": 77965 + }, + { + "epoch": 5.297594781899715, + "grad_norm": 6.739909648895264, + "learning_rate": 3.3808092132083165e-06, + "loss": 2.7697, + "step": 77970 + }, + { + "epoch": 5.297934501970376, + "grad_norm": 8.663623809814453, + "learning_rate": 3.3803845631199893e-06, + "loss": 2.7237, + "step": 77975 + }, + { + "epoch": 5.298274222041038, + "grad_norm": 7.917196273803711, + "learning_rate": 3.3799599130316625e-06, + "loss": 3.0531, + "step": 77980 + }, + { + "epoch": 5.2986139421117, + "grad_norm": 7.827566146850586, + "learning_rate": 3.379535262943335e-06, + "loss": 2.7775, + "step": 77985 + }, + { + "epoch": 5.298953662182361, + "grad_norm": 7.880524635314941, + "learning_rate": 3.3791106128550077e-06, + "loss": 2.9111, + "step": 77990 + }, + { + "epoch": 5.2992933822530235, + "grad_norm": 10.317913055419922, + "learning_rate": 3.378685962766681e-06, + "loss": 2.6962, + "step": 77995 + }, + { + "epoch": 5.299633102323686, + "grad_norm": 7.422260761260986, + "learning_rate": 3.3782613126783533e-06, + "loss": 2.7019, + "step": 78000 + }, + { + "epoch": 5.299972822394347, + "grad_norm": 9.81033706665039, + "learning_rate": 3.377836662590026e-06, + "loss": 2.8011, + "step": 78005 + }, + { + "epoch": 5.300312542465009, + "grad_norm": 7.729961395263672, + "learning_rate": 3.377412012501699e-06, + "loss": 2.5262, + "step": 78010 + }, + { + "epoch": 5.300652262535671, + "grad_norm": 6.874310493469238, + "learning_rate": 3.3769873624133713e-06, + "loss": 2.8512, + "step": 78015 + }, + { + "epoch": 5.300991982606332, + "grad_norm": 7.864467144012451, + "learning_rate": 3.3765627123250445e-06, + "loss": 2.7503, + "step": 78020 + }, + { + "epoch": 5.301331702676994, + "grad_norm": 9.164294242858887, + "learning_rate": 3.3761380622367173e-06, + "loss": 2.8638, + "step": 78025 + }, + { + "epoch": 5.301671422747656, + "grad_norm": 8.02586555480957, + "learning_rate": 3.3757134121483897e-06, + "loss": 2.7759, + "step": 78030 + }, + { + "epoch": 5.302011142818317, + "grad_norm": 9.573680877685547, + "learning_rate": 3.375288762060063e-06, + "loss": 2.9937, + "step": 78035 + }, + { + "epoch": 5.3023508628889795, + "grad_norm": 7.6447672843933105, + "learning_rate": 3.3748641119717357e-06, + "loss": 2.9193, + "step": 78040 + }, + { + "epoch": 5.302690582959642, + "grad_norm": 5.965930938720703, + "learning_rate": 3.374439461883408e-06, + "loss": 2.858, + "step": 78045 + }, + { + "epoch": 5.303030303030303, + "grad_norm": 9.461540222167969, + "learning_rate": 3.374014811795081e-06, + "loss": 2.7653, + "step": 78050 + }, + { + "epoch": 5.303370023100965, + "grad_norm": 9.560654640197754, + "learning_rate": 3.373590161706754e-06, + "loss": 2.9408, + "step": 78055 + }, + { + "epoch": 5.303709743171627, + "grad_norm": 6.83142614364624, + "learning_rate": 3.3731655116184265e-06, + "loss": 2.7909, + "step": 78060 + }, + { + "epoch": 5.304049463242288, + "grad_norm": 8.24374008178711, + "learning_rate": 3.3727408615300993e-06, + "loss": 2.9818, + "step": 78065 + }, + { + "epoch": 5.30438918331295, + "grad_norm": 7.610008716583252, + "learning_rate": 3.3723162114417725e-06, + "loss": 2.7132, + "step": 78070 + }, + { + "epoch": 5.304728903383612, + "grad_norm": 9.820059776306152, + "learning_rate": 3.3718915613534453e-06, + "loss": 2.7019, + "step": 78075 + }, + { + "epoch": 5.305068623454273, + "grad_norm": 8.635795593261719, + "learning_rate": 3.3714669112651177e-06, + "loss": 2.977, + "step": 78080 + }, + { + "epoch": 5.3054083435249355, + "grad_norm": 7.124879360198975, + "learning_rate": 3.3710422611767905e-06, + "loss": 2.746, + "step": 78085 + }, + { + "epoch": 5.305748063595598, + "grad_norm": 9.017210960388184, + "learning_rate": 3.3706176110884637e-06, + "loss": 2.7794, + "step": 78090 + }, + { + "epoch": 5.306087783666259, + "grad_norm": 8.402993202209473, + "learning_rate": 3.370192961000136e-06, + "loss": 2.8439, + "step": 78095 + }, + { + "epoch": 5.306427503736921, + "grad_norm": 7.037230014801025, + "learning_rate": 3.369768310911809e-06, + "loss": 2.5919, + "step": 78100 + }, + { + "epoch": 5.306767223807583, + "grad_norm": 6.902787208557129, + "learning_rate": 3.369343660823482e-06, + "loss": 2.9609, + "step": 78105 + }, + { + "epoch": 5.307106943878244, + "grad_norm": 11.015796661376953, + "learning_rate": 3.3689190107351545e-06, + "loss": 3.2319, + "step": 78110 + }, + { + "epoch": 5.307446663948906, + "grad_norm": 7.004506587982178, + "learning_rate": 3.3684943606468273e-06, + "loss": 2.9227, + "step": 78115 + }, + { + "epoch": 5.307786384019568, + "grad_norm": 6.590823173522949, + "learning_rate": 3.3680697105585e-06, + "loss": 2.6772, + "step": 78120 + }, + { + "epoch": 5.3081261040902294, + "grad_norm": 6.509113311767578, + "learning_rate": 3.367645060470173e-06, + "loss": 2.6494, + "step": 78125 + }, + { + "epoch": 5.3084658241608915, + "grad_norm": 9.197436332702637, + "learning_rate": 3.3672204103818457e-06, + "loss": 2.9067, + "step": 78130 + }, + { + "epoch": 5.308805544231554, + "grad_norm": 6.094157695770264, + "learning_rate": 3.3667957602935185e-06, + "loss": 2.5969, + "step": 78135 + }, + { + "epoch": 5.309145264302215, + "grad_norm": 7.031855583190918, + "learning_rate": 3.366371110205191e-06, + "loss": 2.746, + "step": 78140 + }, + { + "epoch": 5.309484984372877, + "grad_norm": 7.8515777587890625, + "learning_rate": 3.365946460116864e-06, + "loss": 2.5364, + "step": 78145 + }, + { + "epoch": 5.309824704443539, + "grad_norm": 7.788740634918213, + "learning_rate": 3.365521810028537e-06, + "loss": 2.8625, + "step": 78150 + }, + { + "epoch": 5.3101644245142, + "grad_norm": 6.970776081085205, + "learning_rate": 3.3650971599402093e-06, + "loss": 2.8773, + "step": 78155 + }, + { + "epoch": 5.310504144584862, + "grad_norm": 6.784427642822266, + "learning_rate": 3.3646725098518825e-06, + "loss": 2.7409, + "step": 78160 + }, + { + "epoch": 5.310843864655524, + "grad_norm": 7.330292224884033, + "learning_rate": 3.3642478597635553e-06, + "loss": 3.2041, + "step": 78165 + }, + { + "epoch": 5.3111835847261855, + "grad_norm": 7.03996467590332, + "learning_rate": 3.3638232096752277e-06, + "loss": 2.7476, + "step": 78170 + }, + { + "epoch": 5.3115233047968475, + "grad_norm": 8.719185829162598, + "learning_rate": 3.3633985595869005e-06, + "loss": 3.0105, + "step": 78175 + }, + { + "epoch": 5.31186302486751, + "grad_norm": 10.945630073547363, + "learning_rate": 3.3629739094985737e-06, + "loss": 3.0939, + "step": 78180 + }, + { + "epoch": 5.312202744938171, + "grad_norm": 6.317886829376221, + "learning_rate": 3.362549259410246e-06, + "loss": 2.6279, + "step": 78185 + }, + { + "epoch": 5.312542465008833, + "grad_norm": 8.235958099365234, + "learning_rate": 3.362124609321919e-06, + "loss": 2.5786, + "step": 78190 + }, + { + "epoch": 5.312882185079495, + "grad_norm": 7.049444675445557, + "learning_rate": 3.361699959233592e-06, + "loss": 2.683, + "step": 78195 + }, + { + "epoch": 5.313221905150156, + "grad_norm": 7.013874530792236, + "learning_rate": 3.3612753091452645e-06, + "loss": 2.8585, + "step": 78200 + }, + { + "epoch": 5.313561625220818, + "grad_norm": 8.28852653503418, + "learning_rate": 3.3608506590569373e-06, + "loss": 2.9655, + "step": 78205 + }, + { + "epoch": 5.31390134529148, + "grad_norm": 8.207657814025879, + "learning_rate": 3.36042600896861e-06, + "loss": 2.9764, + "step": 78210 + }, + { + "epoch": 5.3142410653621415, + "grad_norm": 9.358247756958008, + "learning_rate": 3.360001358880283e-06, + "loss": 2.6705, + "step": 78215 + }, + { + "epoch": 5.3145807854328035, + "grad_norm": 6.622672080993652, + "learning_rate": 3.3595767087919557e-06, + "loss": 2.8471, + "step": 78220 + }, + { + "epoch": 5.314920505503465, + "grad_norm": 7.844764232635498, + "learning_rate": 3.3591520587036285e-06, + "loss": 2.9307, + "step": 78225 + }, + { + "epoch": 5.315260225574127, + "grad_norm": 9.67589282989502, + "learning_rate": 3.358727408615301e-06, + "loss": 2.8525, + "step": 78230 + }, + { + "epoch": 5.315599945644789, + "grad_norm": 7.290565490722656, + "learning_rate": 3.358302758526974e-06, + "loss": 2.6144, + "step": 78235 + }, + { + "epoch": 5.31593966571545, + "grad_norm": 7.058045864105225, + "learning_rate": 3.357878108438647e-06, + "loss": 2.8426, + "step": 78240 + }, + { + "epoch": 5.316279385786112, + "grad_norm": 7.855140686035156, + "learning_rate": 3.3574534583503197e-06, + "loss": 2.7581, + "step": 78245 + }, + { + "epoch": 5.316619105856774, + "grad_norm": 8.040959358215332, + "learning_rate": 3.3570288082619925e-06, + "loss": 2.5866, + "step": 78250 + }, + { + "epoch": 5.316958825927435, + "grad_norm": 7.991828918457031, + "learning_rate": 3.3566041581736653e-06, + "loss": 2.8315, + "step": 78255 + }, + { + "epoch": 5.3172985459980975, + "grad_norm": 7.147004127502441, + "learning_rate": 3.356179508085338e-06, + "loss": 2.8716, + "step": 78260 + }, + { + "epoch": 5.31763826606876, + "grad_norm": 6.84009313583374, + "learning_rate": 3.3557548579970105e-06, + "loss": 2.6923, + "step": 78265 + }, + { + "epoch": 5.317977986139421, + "grad_norm": 7.047084331512451, + "learning_rate": 3.3553302079086837e-06, + "loss": 2.7583, + "step": 78270 + }, + { + "epoch": 5.318317706210083, + "grad_norm": 7.895512104034424, + "learning_rate": 3.3549055578203565e-06, + "loss": 2.7837, + "step": 78275 + }, + { + "epoch": 5.318657426280745, + "grad_norm": 6.917085647583008, + "learning_rate": 3.354480907732029e-06, + "loss": 2.5113, + "step": 78280 + }, + { + "epoch": 5.318997146351406, + "grad_norm": 9.650428771972656, + "learning_rate": 3.354056257643702e-06, + "loss": 2.79, + "step": 78285 + }, + { + "epoch": 5.319336866422068, + "grad_norm": 5.811108589172363, + "learning_rate": 3.353631607555375e-06, + "loss": 2.8265, + "step": 78290 + }, + { + "epoch": 5.31967658649273, + "grad_norm": 8.670526504516602, + "learning_rate": 3.3532069574670473e-06, + "loss": 2.9769, + "step": 78295 + }, + { + "epoch": 5.320016306563391, + "grad_norm": 6.560847282409668, + "learning_rate": 3.35278230737872e-06, + "loss": 2.6839, + "step": 78300 + }, + { + "epoch": 5.3203560266340535, + "grad_norm": 6.915842533111572, + "learning_rate": 3.3523576572903933e-06, + "loss": 2.7777, + "step": 78305 + }, + { + "epoch": 5.320695746704716, + "grad_norm": 9.479021072387695, + "learning_rate": 3.3519330072020657e-06, + "loss": 2.7391, + "step": 78310 + }, + { + "epoch": 5.321035466775377, + "grad_norm": 7.773929595947266, + "learning_rate": 3.3515083571137385e-06, + "loss": 2.8765, + "step": 78315 + }, + { + "epoch": 5.321375186846039, + "grad_norm": 7.357637882232666, + "learning_rate": 3.3510837070254117e-06, + "loss": 2.6843, + "step": 78320 + }, + { + "epoch": 5.321714906916701, + "grad_norm": 7.940112113952637, + "learning_rate": 3.350659056937084e-06, + "loss": 2.6562, + "step": 78325 + }, + { + "epoch": 5.322054626987362, + "grad_norm": 6.766191005706787, + "learning_rate": 3.350234406848757e-06, + "loss": 2.5953, + "step": 78330 + }, + { + "epoch": 5.322394347058024, + "grad_norm": 7.043492317199707, + "learning_rate": 3.3498097567604297e-06, + "loss": 2.7525, + "step": 78335 + }, + { + "epoch": 5.322734067128686, + "grad_norm": 6.9589385986328125, + "learning_rate": 3.3493851066721025e-06, + "loss": 2.9194, + "step": 78340 + }, + { + "epoch": 5.323073787199347, + "grad_norm": 8.520585060119629, + "learning_rate": 3.3489604565837753e-06, + "loss": 2.8927, + "step": 78345 + }, + { + "epoch": 5.3234135072700095, + "grad_norm": 7.296145915985107, + "learning_rate": 3.348535806495448e-06, + "loss": 2.6938, + "step": 78350 + }, + { + "epoch": 5.323753227340672, + "grad_norm": 7.748090744018555, + "learning_rate": 3.3481111564071204e-06, + "loss": 2.9135, + "step": 78355 + }, + { + "epoch": 5.324092947411333, + "grad_norm": 7.381665229797363, + "learning_rate": 3.3476865063187937e-06, + "loss": 2.821, + "step": 78360 + }, + { + "epoch": 5.324432667481995, + "grad_norm": 8.626529693603516, + "learning_rate": 3.3472618562304665e-06, + "loss": 2.8612, + "step": 78365 + }, + { + "epoch": 5.324772387552657, + "grad_norm": 7.695214748382568, + "learning_rate": 3.346837206142139e-06, + "loss": 2.8634, + "step": 78370 + }, + { + "epoch": 5.325112107623318, + "grad_norm": 7.610021114349365, + "learning_rate": 3.346412556053812e-06, + "loss": 2.7586, + "step": 78375 + }, + { + "epoch": 5.32545182769398, + "grad_norm": 7.396597385406494, + "learning_rate": 3.345987905965485e-06, + "loss": 2.8184, + "step": 78380 + }, + { + "epoch": 5.325791547764642, + "grad_norm": 7.767471790313721, + "learning_rate": 3.3455632558771573e-06, + "loss": 2.9177, + "step": 78385 + }, + { + "epoch": 5.326131267835303, + "grad_norm": 7.3708038330078125, + "learning_rate": 3.34513860578883e-06, + "loss": 2.7304, + "step": 78390 + }, + { + "epoch": 5.3264709879059655, + "grad_norm": 9.197552680969238, + "learning_rate": 3.3447139557005033e-06, + "loss": 2.8848, + "step": 78395 + }, + { + "epoch": 5.326810707976628, + "grad_norm": 9.706000328063965, + "learning_rate": 3.3442893056121757e-06, + "loss": 2.7362, + "step": 78400 + }, + { + "epoch": 5.327150428047289, + "grad_norm": 7.873934268951416, + "learning_rate": 3.3438646555238485e-06, + "loss": 2.7065, + "step": 78405 + }, + { + "epoch": 5.327490148117951, + "grad_norm": 6.760535717010498, + "learning_rate": 3.3434400054355217e-06, + "loss": 2.7198, + "step": 78410 + }, + { + "epoch": 5.327829868188613, + "grad_norm": 7.952264785766602, + "learning_rate": 3.3430153553471945e-06, + "loss": 2.743, + "step": 78415 + }, + { + "epoch": 5.328169588259274, + "grad_norm": 9.40622615814209, + "learning_rate": 3.342590705258867e-06, + "loss": 2.5119, + "step": 78420 + }, + { + "epoch": 5.328509308329936, + "grad_norm": 8.165218353271484, + "learning_rate": 3.3421660551705397e-06, + "loss": 2.7012, + "step": 78425 + }, + { + "epoch": 5.328849028400598, + "grad_norm": 6.444234371185303, + "learning_rate": 3.341741405082213e-06, + "loss": 2.8106, + "step": 78430 + }, + { + "epoch": 5.3291887484712595, + "grad_norm": 7.2981276512146, + "learning_rate": 3.3413167549938853e-06, + "loss": 2.5925, + "step": 78435 + }, + { + "epoch": 5.3295284685419215, + "grad_norm": 10.006826400756836, + "learning_rate": 3.340892104905558e-06, + "loss": 2.9974, + "step": 78440 + }, + { + "epoch": 5.329868188612584, + "grad_norm": 7.088541030883789, + "learning_rate": 3.3404674548172313e-06, + "loss": 2.6646, + "step": 78445 + }, + { + "epoch": 5.330207908683245, + "grad_norm": 9.20001220703125, + "learning_rate": 3.3400428047289037e-06, + "loss": 2.8731, + "step": 78450 + }, + { + "epoch": 5.330547628753907, + "grad_norm": 7.475586891174316, + "learning_rate": 3.3396181546405765e-06, + "loss": 3.0263, + "step": 78455 + }, + { + "epoch": 5.330887348824569, + "grad_norm": 7.402812480926514, + "learning_rate": 3.3391935045522493e-06, + "loss": 2.9477, + "step": 78460 + }, + { + "epoch": 5.33122706889523, + "grad_norm": 6.819693088531494, + "learning_rate": 3.3387688544639216e-06, + "loss": 3.0105, + "step": 78465 + }, + { + "epoch": 5.331566788965892, + "grad_norm": 7.969503402709961, + "learning_rate": 3.338344204375595e-06, + "loss": 2.7007, + "step": 78470 + }, + { + "epoch": 5.331906509036554, + "grad_norm": 7.65482234954834, + "learning_rate": 3.3379195542872677e-06, + "loss": 2.7544, + "step": 78475 + }, + { + "epoch": 5.3322462291072155, + "grad_norm": 6.807391166687012, + "learning_rate": 3.33749490419894e-06, + "loss": 2.8818, + "step": 78480 + }, + { + "epoch": 5.3325859491778775, + "grad_norm": 10.260564804077148, + "learning_rate": 3.3370702541106133e-06, + "loss": 2.7664, + "step": 78485 + }, + { + "epoch": 5.33292566924854, + "grad_norm": 9.916248321533203, + "learning_rate": 3.336645604022286e-06, + "loss": 2.8061, + "step": 78490 + }, + { + "epoch": 5.333265389319201, + "grad_norm": 7.337722301483154, + "learning_rate": 3.3362209539339584e-06, + "loss": 2.6417, + "step": 78495 + }, + { + "epoch": 5.333605109389863, + "grad_norm": 6.6566314697265625, + "learning_rate": 3.3357963038456317e-06, + "loss": 2.7209, + "step": 78500 + }, + { + "epoch": 5.333944829460524, + "grad_norm": 7.094947814941406, + "learning_rate": 3.3353716537573045e-06, + "loss": 2.9481, + "step": 78505 + }, + { + "epoch": 5.334284549531186, + "grad_norm": 8.248584747314453, + "learning_rate": 3.334947003668977e-06, + "loss": 2.7059, + "step": 78510 + }, + { + "epoch": 5.334624269601848, + "grad_norm": 8.57978343963623, + "learning_rate": 3.3345223535806497e-06, + "loss": 2.6672, + "step": 78515 + }, + { + "epoch": 5.334963989672509, + "grad_norm": 7.759997367858887, + "learning_rate": 3.334097703492323e-06, + "loss": 2.9503, + "step": 78520 + }, + { + "epoch": 5.3353037097431715, + "grad_norm": 6.588073253631592, + "learning_rate": 3.3336730534039953e-06, + "loss": 3.2816, + "step": 78525 + }, + { + "epoch": 5.3356434298138335, + "grad_norm": 6.744055271148682, + "learning_rate": 3.333248403315668e-06, + "loss": 2.8842, + "step": 78530 + }, + { + "epoch": 5.335983149884495, + "grad_norm": 6.757699966430664, + "learning_rate": 3.3328237532273413e-06, + "loss": 2.938, + "step": 78535 + }, + { + "epoch": 5.336322869955157, + "grad_norm": 7.363801956176758, + "learning_rate": 3.3323991031390137e-06, + "loss": 2.4826, + "step": 78540 + }, + { + "epoch": 5.336662590025819, + "grad_norm": 7.332493305206299, + "learning_rate": 3.3319744530506865e-06, + "loss": 2.7087, + "step": 78545 + }, + { + "epoch": 5.33700231009648, + "grad_norm": 7.559603691101074, + "learning_rate": 3.3315498029623593e-06, + "loss": 2.5086, + "step": 78550 + }, + { + "epoch": 5.337342030167142, + "grad_norm": 6.733367919921875, + "learning_rate": 3.3311251528740316e-06, + "loss": 2.6756, + "step": 78555 + }, + { + "epoch": 5.337681750237804, + "grad_norm": 7.779409408569336, + "learning_rate": 3.330700502785705e-06, + "loss": 2.9866, + "step": 78560 + }, + { + "epoch": 5.338021470308465, + "grad_norm": 7.712082862854004, + "learning_rate": 3.3302758526973777e-06, + "loss": 2.6601, + "step": 78565 + }, + { + "epoch": 5.3383611903791275, + "grad_norm": 7.915218353271484, + "learning_rate": 3.32985120260905e-06, + "loss": 2.5868, + "step": 78570 + }, + { + "epoch": 5.33870091044979, + "grad_norm": 8.80859375, + "learning_rate": 3.3294265525207233e-06, + "loss": 2.4213, + "step": 78575 + }, + { + "epoch": 5.339040630520451, + "grad_norm": 7.895727634429932, + "learning_rate": 3.329001902432396e-06, + "loss": 2.8416, + "step": 78580 + }, + { + "epoch": 5.339380350591113, + "grad_norm": 7.619846820831299, + "learning_rate": 3.328577252344069e-06, + "loss": 2.5987, + "step": 78585 + }, + { + "epoch": 5.339720070661775, + "grad_norm": 8.747994422912598, + "learning_rate": 3.3281526022557412e-06, + "loss": 2.6214, + "step": 78590 + }, + { + "epoch": 5.340059790732436, + "grad_norm": 8.388882637023926, + "learning_rate": 3.3277279521674145e-06, + "loss": 2.8194, + "step": 78595 + }, + { + "epoch": 5.340399510803098, + "grad_norm": 9.018698692321777, + "learning_rate": 3.3273033020790873e-06, + "loss": 2.8745, + "step": 78600 + }, + { + "epoch": 5.34073923087376, + "grad_norm": 7.905946254730225, + "learning_rate": 3.3268786519907596e-06, + "loss": 2.6631, + "step": 78605 + }, + { + "epoch": 5.341078950944421, + "grad_norm": 7.923675060272217, + "learning_rate": 3.326454001902433e-06, + "loss": 2.7385, + "step": 78610 + }, + { + "epoch": 5.3414186710150835, + "grad_norm": 7.334740161895752, + "learning_rate": 3.3260293518141057e-06, + "loss": 2.7416, + "step": 78615 + }, + { + "epoch": 5.341758391085746, + "grad_norm": 8.812074661254883, + "learning_rate": 3.325604701725778e-06, + "loss": 2.9207, + "step": 78620 + }, + { + "epoch": 5.342098111156407, + "grad_norm": 9.139256477355957, + "learning_rate": 3.3251800516374513e-06, + "loss": 2.6239, + "step": 78625 + }, + { + "epoch": 5.342437831227069, + "grad_norm": 8.883186340332031, + "learning_rate": 3.324755401549124e-06, + "loss": 3.0988, + "step": 78630 + }, + { + "epoch": 5.342777551297731, + "grad_norm": 9.267800331115723, + "learning_rate": 3.3243307514607964e-06, + "loss": 2.8702, + "step": 78635 + }, + { + "epoch": 5.343117271368392, + "grad_norm": 6.357990264892578, + "learning_rate": 3.3239061013724692e-06, + "loss": 2.6133, + "step": 78640 + }, + { + "epoch": 5.343456991439054, + "grad_norm": 9.901374816894531, + "learning_rate": 3.3234814512841425e-06, + "loss": 2.9498, + "step": 78645 + }, + { + "epoch": 5.343796711509716, + "grad_norm": 7.96356201171875, + "learning_rate": 3.323056801195815e-06, + "loss": 2.6159, + "step": 78650 + }, + { + "epoch": 5.344136431580377, + "grad_norm": 6.174136161804199, + "learning_rate": 3.3226321511074876e-06, + "loss": 2.644, + "step": 78655 + }, + { + "epoch": 5.3444761516510395, + "grad_norm": 8.359615325927734, + "learning_rate": 3.322207501019161e-06, + "loss": 3.0263, + "step": 78660 + }, + { + "epoch": 5.344815871721702, + "grad_norm": 9.942817687988281, + "learning_rate": 3.3217828509308333e-06, + "loss": 2.8787, + "step": 78665 + }, + { + "epoch": 5.345155591792363, + "grad_norm": 8.637685775756836, + "learning_rate": 3.321358200842506e-06, + "loss": 2.8188, + "step": 78670 + }, + { + "epoch": 5.345495311863025, + "grad_norm": 8.301544189453125, + "learning_rate": 3.320933550754179e-06, + "loss": 2.7669, + "step": 78675 + }, + { + "epoch": 5.345835031933687, + "grad_norm": 9.196176528930664, + "learning_rate": 3.3205089006658512e-06, + "loss": 2.7738, + "step": 78680 + }, + { + "epoch": 5.346174752004348, + "grad_norm": 7.993314743041992, + "learning_rate": 3.3200842505775245e-06, + "loss": 2.7949, + "step": 78685 + }, + { + "epoch": 5.34651447207501, + "grad_norm": 6.186060428619385, + "learning_rate": 3.3196596004891973e-06, + "loss": 2.9324, + "step": 78690 + }, + { + "epoch": 5.346854192145672, + "grad_norm": 6.234829902648926, + "learning_rate": 3.3192349504008696e-06, + "loss": 3.0038, + "step": 78695 + }, + { + "epoch": 5.3471939122163334, + "grad_norm": 8.31732177734375, + "learning_rate": 3.318810300312543e-06, + "loss": 2.9987, + "step": 78700 + }, + { + "epoch": 5.3475336322869955, + "grad_norm": 7.077832221984863, + "learning_rate": 3.3183856502242157e-06, + "loss": 2.8413, + "step": 78705 + }, + { + "epoch": 5.347873352357658, + "grad_norm": 7.000889301300049, + "learning_rate": 3.317961000135888e-06, + "loss": 2.4454, + "step": 78710 + }, + { + "epoch": 5.348213072428319, + "grad_norm": 5.614900588989258, + "learning_rate": 3.317536350047561e-06, + "loss": 2.6115, + "step": 78715 + }, + { + "epoch": 5.348552792498981, + "grad_norm": 8.575909614562988, + "learning_rate": 3.317111699959234e-06, + "loss": 2.653, + "step": 78720 + }, + { + "epoch": 5.348892512569643, + "grad_norm": 6.947973251342773, + "learning_rate": 3.3166870498709064e-06, + "loss": 2.9403, + "step": 78725 + }, + { + "epoch": 5.349232232640304, + "grad_norm": 8.229299545288086, + "learning_rate": 3.3162623997825792e-06, + "loss": 2.8235, + "step": 78730 + }, + { + "epoch": 5.349571952710966, + "grad_norm": 6.879662990570068, + "learning_rate": 3.3158377496942525e-06, + "loss": 2.7785, + "step": 78735 + }, + { + "epoch": 5.349911672781628, + "grad_norm": 7.359688758850098, + "learning_rate": 3.315413099605925e-06, + "loss": 2.8487, + "step": 78740 + }, + { + "epoch": 5.3502513928522895, + "grad_norm": 10.548379898071289, + "learning_rate": 3.3149884495175976e-06, + "loss": 2.7099, + "step": 78745 + }, + { + "epoch": 5.3505911129229515, + "grad_norm": 8.922178268432617, + "learning_rate": 3.3145637994292704e-06, + "loss": 2.9641, + "step": 78750 + }, + { + "epoch": 5.350930832993614, + "grad_norm": 7.24674654006958, + "learning_rate": 3.3141391493409437e-06, + "loss": 2.66, + "step": 78755 + }, + { + "epoch": 5.351270553064275, + "grad_norm": 6.613490104675293, + "learning_rate": 3.313714499252616e-06, + "loss": 2.7942, + "step": 78760 + }, + { + "epoch": 5.351610273134937, + "grad_norm": 7.460920810699463, + "learning_rate": 3.313289849164289e-06, + "loss": 3.127, + "step": 78765 + }, + { + "epoch": 5.351949993205599, + "grad_norm": 7.144969463348389, + "learning_rate": 3.312865199075962e-06, + "loss": 2.6253, + "step": 78770 + }, + { + "epoch": 5.35228971327626, + "grad_norm": 7.901735782623291, + "learning_rate": 3.3124405489876344e-06, + "loss": 2.7627, + "step": 78775 + }, + { + "epoch": 5.352629433346922, + "grad_norm": 7.336155414581299, + "learning_rate": 3.3120158988993072e-06, + "loss": 2.8427, + "step": 78780 + }, + { + "epoch": 5.352969153417584, + "grad_norm": 9.466272354125977, + "learning_rate": 3.3115912488109805e-06, + "loss": 2.7144, + "step": 78785 + }, + { + "epoch": 5.3533088734882455, + "grad_norm": 7.474786281585693, + "learning_rate": 3.311166598722653e-06, + "loss": 2.8097, + "step": 78790 + }, + { + "epoch": 5.3536485935589075, + "grad_norm": 7.3783345222473145, + "learning_rate": 3.3107419486343256e-06, + "loss": 2.7264, + "step": 78795 + }, + { + "epoch": 5.35398831362957, + "grad_norm": 7.474273204803467, + "learning_rate": 3.3103172985459984e-06, + "loss": 2.5168, + "step": 78800 + }, + { + "epoch": 5.354328033700231, + "grad_norm": 8.383116722106934, + "learning_rate": 3.309892648457671e-06, + "loss": 2.8234, + "step": 78805 + }, + { + "epoch": 5.354667753770893, + "grad_norm": 10.172518730163574, + "learning_rate": 3.309467998369344e-06, + "loss": 2.7717, + "step": 78810 + }, + { + "epoch": 5.355007473841555, + "grad_norm": 6.163010120391846, + "learning_rate": 3.309043348281017e-06, + "loss": 2.7325, + "step": 78815 + }, + { + "epoch": 5.355347193912216, + "grad_norm": 8.221217155456543, + "learning_rate": 3.3086186981926892e-06, + "loss": 2.8366, + "step": 78820 + }, + { + "epoch": 5.355686913982878, + "grad_norm": 5.6243109703063965, + "learning_rate": 3.3081940481043625e-06, + "loss": 2.8963, + "step": 78825 + }, + { + "epoch": 5.35602663405354, + "grad_norm": 7.08461856842041, + "learning_rate": 3.3077693980160353e-06, + "loss": 2.683, + "step": 78830 + }, + { + "epoch": 5.3563663541242015, + "grad_norm": 8.739259719848633, + "learning_rate": 3.3073447479277076e-06, + "loss": 2.7487, + "step": 78835 + }, + { + "epoch": 5.3567060741948636, + "grad_norm": 7.845824241638184, + "learning_rate": 3.3069200978393804e-06, + "loss": 2.7691, + "step": 78840 + }, + { + "epoch": 5.357045794265526, + "grad_norm": 8.248862266540527, + "learning_rate": 3.3064954477510537e-06, + "loss": 2.8319, + "step": 78845 + }, + { + "epoch": 5.357385514336187, + "grad_norm": 6.727006912231445, + "learning_rate": 3.306070797662726e-06, + "loss": 2.6797, + "step": 78850 + }, + { + "epoch": 5.357725234406849, + "grad_norm": 8.161182403564453, + "learning_rate": 3.305646147574399e-06, + "loss": 2.9239, + "step": 78855 + }, + { + "epoch": 5.358064954477511, + "grad_norm": 9.098353385925293, + "learning_rate": 3.305221497486072e-06, + "loss": 2.5867, + "step": 78860 + }, + { + "epoch": 5.358404674548172, + "grad_norm": 6.9882049560546875, + "learning_rate": 3.3047968473977444e-06, + "loss": 2.8052, + "step": 78865 + }, + { + "epoch": 5.358744394618834, + "grad_norm": 6.50730562210083, + "learning_rate": 3.3043721973094172e-06, + "loss": 2.6807, + "step": 78870 + }, + { + "epoch": 5.359084114689496, + "grad_norm": 6.415805339813232, + "learning_rate": 3.30394754722109e-06, + "loss": 2.8535, + "step": 78875 + }, + { + "epoch": 5.3594238347601575, + "grad_norm": 7.0247039794921875, + "learning_rate": 3.303522897132763e-06, + "loss": 2.9362, + "step": 78880 + }, + { + "epoch": 5.35976355483082, + "grad_norm": 7.226624011993408, + "learning_rate": 3.3030982470444356e-06, + "loss": 2.9448, + "step": 78885 + }, + { + "epoch": 5.360103274901482, + "grad_norm": 7.256021022796631, + "learning_rate": 3.3026735969561084e-06, + "loss": 3.0177, + "step": 78890 + }, + { + "epoch": 5.360442994972143, + "grad_norm": 6.776871681213379, + "learning_rate": 3.302248946867781e-06, + "loss": 2.7545, + "step": 78895 + }, + { + "epoch": 5.360782715042805, + "grad_norm": 7.454595565795898, + "learning_rate": 3.301824296779454e-06, + "loss": 2.6457, + "step": 78900 + }, + { + "epoch": 5.361122435113466, + "grad_norm": 7.915805339813232, + "learning_rate": 3.301399646691127e-06, + "loss": 2.8685, + "step": 78905 + }, + { + "epoch": 5.361462155184128, + "grad_norm": 6.364496231079102, + "learning_rate": 3.3009749966027992e-06, + "loss": 2.6101, + "step": 78910 + }, + { + "epoch": 5.36180187525479, + "grad_norm": 9.185137748718262, + "learning_rate": 3.3005503465144724e-06, + "loss": 2.8554, + "step": 78915 + }, + { + "epoch": 5.362141595325451, + "grad_norm": 9.012959480285645, + "learning_rate": 3.3001256964261452e-06, + "loss": 2.9366, + "step": 78920 + }, + { + "epoch": 5.3624813153961135, + "grad_norm": 8.027328491210938, + "learning_rate": 3.299701046337818e-06, + "loss": 2.7434, + "step": 78925 + }, + { + "epoch": 5.362821035466776, + "grad_norm": 6.996366024017334, + "learning_rate": 3.2992763962494904e-06, + "loss": 2.7315, + "step": 78930 + }, + { + "epoch": 5.363160755537437, + "grad_norm": 7.370737075805664, + "learning_rate": 3.2988517461611636e-06, + "loss": 2.9973, + "step": 78935 + }, + { + "epoch": 5.363500475608099, + "grad_norm": 7.172896385192871, + "learning_rate": 3.2984270960728364e-06, + "loss": 2.6704, + "step": 78940 + }, + { + "epoch": 5.363840195678761, + "grad_norm": 9.397255897521973, + "learning_rate": 3.298002445984509e-06, + "loss": 2.7701, + "step": 78945 + }, + { + "epoch": 5.364179915749422, + "grad_norm": 7.5928826332092285, + "learning_rate": 3.297577795896182e-06, + "loss": 2.8599, + "step": 78950 + }, + { + "epoch": 5.364519635820084, + "grad_norm": 9.89490032196045, + "learning_rate": 3.297153145807855e-06, + "loss": 2.9235, + "step": 78955 + }, + { + "epoch": 5.364859355890746, + "grad_norm": 8.30036449432373, + "learning_rate": 3.2967284957195272e-06, + "loss": 2.6531, + "step": 78960 + }, + { + "epoch": 5.365199075961407, + "grad_norm": 8.931896209716797, + "learning_rate": 3.2963038456312e-06, + "loss": 2.7558, + "step": 78965 + }, + { + "epoch": 5.3655387960320695, + "grad_norm": 7.691019058227539, + "learning_rate": 3.2958791955428733e-06, + "loss": 2.8544, + "step": 78970 + }, + { + "epoch": 5.365878516102732, + "grad_norm": 6.963018417358398, + "learning_rate": 3.2954545454545456e-06, + "loss": 3.0707, + "step": 78975 + }, + { + "epoch": 5.366218236173393, + "grad_norm": 7.31321382522583, + "learning_rate": 3.2950298953662184e-06, + "loss": 2.7687, + "step": 78980 + }, + { + "epoch": 5.366557956244055, + "grad_norm": 7.705272674560547, + "learning_rate": 3.2946052452778917e-06, + "loss": 2.6029, + "step": 78985 + }, + { + "epoch": 5.366897676314717, + "grad_norm": 6.718627452850342, + "learning_rate": 3.294180595189564e-06, + "loss": 2.569, + "step": 78990 + }, + { + "epoch": 5.367237396385378, + "grad_norm": 7.937402248382568, + "learning_rate": 3.293755945101237e-06, + "loss": 2.6679, + "step": 78995 + }, + { + "epoch": 5.36757711645604, + "grad_norm": 8.36039924621582, + "learning_rate": 3.2933312950129096e-06, + "loss": 2.7438, + "step": 79000 + }, + { + "epoch": 5.367916836526702, + "grad_norm": 10.627918243408203, + "learning_rate": 3.2929066449245824e-06, + "loss": 2.8189, + "step": 79005 + }, + { + "epoch": 5.3682565565973634, + "grad_norm": 7.808963298797607, + "learning_rate": 3.2924819948362552e-06, + "loss": 3.0231, + "step": 79010 + }, + { + "epoch": 5.3685962766680255, + "grad_norm": 9.583592414855957, + "learning_rate": 3.292057344747928e-06, + "loss": 2.7567, + "step": 79015 + }, + { + "epoch": 5.368935996738688, + "grad_norm": 9.874672889709473, + "learning_rate": 3.2916326946596004e-06, + "loss": 2.8476, + "step": 79020 + }, + { + "epoch": 5.369275716809349, + "grad_norm": 7.693580627441406, + "learning_rate": 3.2912080445712736e-06, + "loss": 2.814, + "step": 79025 + }, + { + "epoch": 5.369615436880011, + "grad_norm": 8.66244125366211, + "learning_rate": 3.2907833944829464e-06, + "loss": 2.9518, + "step": 79030 + }, + { + "epoch": 5.369955156950673, + "grad_norm": 6.52756929397583, + "learning_rate": 3.290358744394619e-06, + "loss": 2.9191, + "step": 79035 + }, + { + "epoch": 5.370294877021334, + "grad_norm": 7.850391864776611, + "learning_rate": 3.289934094306292e-06, + "loss": 2.9627, + "step": 79040 + }, + { + "epoch": 5.370634597091996, + "grad_norm": 8.556844711303711, + "learning_rate": 3.289509444217965e-06, + "loss": 2.8082, + "step": 79045 + }, + { + "epoch": 5.370974317162658, + "grad_norm": 9.992022514343262, + "learning_rate": 3.2890847941296372e-06, + "loss": 2.6444, + "step": 79050 + }, + { + "epoch": 5.3713140372333195, + "grad_norm": 8.175214767456055, + "learning_rate": 3.28866014404131e-06, + "loss": 2.7135, + "step": 79055 + }, + { + "epoch": 5.3716537573039815, + "grad_norm": 7.229085922241211, + "learning_rate": 3.2882354939529832e-06, + "loss": 2.743, + "step": 79060 + }, + { + "epoch": 5.371993477374644, + "grad_norm": 7.921959400177002, + "learning_rate": 3.2878108438646556e-06, + "loss": 2.9393, + "step": 79065 + }, + { + "epoch": 5.372333197445305, + "grad_norm": 8.324519157409668, + "learning_rate": 3.2873861937763284e-06, + "loss": 2.6253, + "step": 79070 + }, + { + "epoch": 5.372672917515967, + "grad_norm": 5.7139387130737305, + "learning_rate": 3.2869615436880016e-06, + "loss": 2.788, + "step": 79075 + }, + { + "epoch": 5.373012637586629, + "grad_norm": 6.207279682159424, + "learning_rate": 3.286536893599674e-06, + "loss": 2.6865, + "step": 79080 + }, + { + "epoch": 5.37335235765729, + "grad_norm": 7.331900596618652, + "learning_rate": 3.286112243511347e-06, + "loss": 2.7505, + "step": 79085 + }, + { + "epoch": 5.373692077727952, + "grad_norm": 8.891644477844238, + "learning_rate": 3.2856875934230196e-06, + "loss": 2.6433, + "step": 79090 + }, + { + "epoch": 5.374031797798614, + "grad_norm": 6.819655418395996, + "learning_rate": 3.285262943334693e-06, + "loss": 2.6434, + "step": 79095 + }, + { + "epoch": 5.3743715178692755, + "grad_norm": 6.643011569976807, + "learning_rate": 3.2848382932463652e-06, + "loss": 2.6231, + "step": 79100 + }, + { + "epoch": 5.3747112379399375, + "grad_norm": 7.598659515380859, + "learning_rate": 3.284413643158038e-06, + "loss": 2.7446, + "step": 79105 + }, + { + "epoch": 5.3750509580106, + "grad_norm": 7.656314373016357, + "learning_rate": 3.2839889930697112e-06, + "loss": 2.7639, + "step": 79110 + }, + { + "epoch": 5.375390678081261, + "grad_norm": 8.361180305480957, + "learning_rate": 3.2835643429813836e-06, + "loss": 2.7723, + "step": 79115 + }, + { + "epoch": 5.375730398151923, + "grad_norm": 10.374156951904297, + "learning_rate": 3.2831396928930564e-06, + "loss": 2.9112, + "step": 79120 + }, + { + "epoch": 5.376070118222585, + "grad_norm": 7.946458339691162, + "learning_rate": 3.2827150428047292e-06, + "loss": 2.8215, + "step": 79125 + }, + { + "epoch": 5.376409838293246, + "grad_norm": 7.748198986053467, + "learning_rate": 3.282290392716402e-06, + "loss": 2.8497, + "step": 79130 + }, + { + "epoch": 5.376749558363908, + "grad_norm": 8.990706443786621, + "learning_rate": 3.281865742628075e-06, + "loss": 2.7904, + "step": 79135 + }, + { + "epoch": 5.37708927843457, + "grad_norm": 8.099181175231934, + "learning_rate": 3.2814410925397476e-06, + "loss": 2.8048, + "step": 79140 + }, + { + "epoch": 5.3774289985052315, + "grad_norm": 8.868005752563477, + "learning_rate": 3.28101644245142e-06, + "loss": 3.0005, + "step": 79145 + }, + { + "epoch": 5.377768718575894, + "grad_norm": 8.179549217224121, + "learning_rate": 3.2805917923630932e-06, + "loss": 2.8052, + "step": 79150 + }, + { + "epoch": 5.378108438646556, + "grad_norm": 7.829744338989258, + "learning_rate": 3.280167142274766e-06, + "loss": 2.7616, + "step": 79155 + }, + { + "epoch": 5.378448158717217, + "grad_norm": 6.706933498382568, + "learning_rate": 3.2797424921864384e-06, + "loss": 2.7296, + "step": 79160 + }, + { + "epoch": 5.378787878787879, + "grad_norm": 7.412611484527588, + "learning_rate": 3.2793178420981116e-06, + "loss": 2.6449, + "step": 79165 + }, + { + "epoch": 5.379127598858541, + "grad_norm": 6.774895191192627, + "learning_rate": 3.2788931920097844e-06, + "loss": 3.0315, + "step": 79170 + }, + { + "epoch": 5.379467318929202, + "grad_norm": 7.555469989776611, + "learning_rate": 3.278468541921457e-06, + "loss": 2.6118, + "step": 79175 + }, + { + "epoch": 5.379807038999864, + "grad_norm": 6.934598445892334, + "learning_rate": 3.2780438918331296e-06, + "loss": 2.9332, + "step": 79180 + }, + { + "epoch": 5.380146759070525, + "grad_norm": 7.744095325469971, + "learning_rate": 3.277619241744803e-06, + "loss": 2.7731, + "step": 79185 + }, + { + "epoch": 5.3804864791411875, + "grad_norm": 6.603458881378174, + "learning_rate": 3.2771945916564752e-06, + "loss": 2.7922, + "step": 79190 + }, + { + "epoch": 5.38082619921185, + "grad_norm": 6.889558792114258, + "learning_rate": 3.276769941568148e-06, + "loss": 2.8167, + "step": 79195 + }, + { + "epoch": 5.381165919282511, + "grad_norm": 10.138224601745605, + "learning_rate": 3.2763452914798212e-06, + "loss": 2.9432, + "step": 79200 + }, + { + "epoch": 5.381505639353173, + "grad_norm": 5.995547771453857, + "learning_rate": 3.2759206413914936e-06, + "loss": 2.7618, + "step": 79205 + }, + { + "epoch": 5.381845359423835, + "grad_norm": 6.571298122406006, + "learning_rate": 3.2754959913031664e-06, + "loss": 2.5638, + "step": 79210 + }, + { + "epoch": 5.382185079494496, + "grad_norm": 8.128743171691895, + "learning_rate": 3.2750713412148392e-06, + "loss": 2.7535, + "step": 79215 + }, + { + "epoch": 5.382524799565158, + "grad_norm": 6.681683540344238, + "learning_rate": 3.2746466911265116e-06, + "loss": 2.7932, + "step": 79220 + }, + { + "epoch": 5.38286451963582, + "grad_norm": 7.374581336975098, + "learning_rate": 3.274222041038185e-06, + "loss": 2.7886, + "step": 79225 + }, + { + "epoch": 5.383204239706481, + "grad_norm": 6.319744110107422, + "learning_rate": 3.2737973909498576e-06, + "loss": 3.0201, + "step": 79230 + }, + { + "epoch": 5.3835439597771435, + "grad_norm": 9.592141151428223, + "learning_rate": 3.27337274086153e-06, + "loss": 2.9521, + "step": 79235 + }, + { + "epoch": 5.383883679847806, + "grad_norm": 7.15531063079834, + "learning_rate": 3.2729480907732032e-06, + "loss": 2.8518, + "step": 79240 + }, + { + "epoch": 5.384223399918467, + "grad_norm": 8.118084907531738, + "learning_rate": 3.272523440684876e-06, + "loss": 2.6356, + "step": 79245 + }, + { + "epoch": 5.384563119989129, + "grad_norm": 7.877566814422607, + "learning_rate": 3.2720987905965484e-06, + "loss": 2.6956, + "step": 79250 + }, + { + "epoch": 5.384902840059791, + "grad_norm": 9.42712116241455, + "learning_rate": 3.271674140508221e-06, + "loss": 2.6421, + "step": 79255 + }, + { + "epoch": 5.385242560130452, + "grad_norm": 6.379480361938477, + "learning_rate": 3.2712494904198944e-06, + "loss": 2.7413, + "step": 79260 + }, + { + "epoch": 5.385582280201114, + "grad_norm": 8.764702796936035, + "learning_rate": 3.2708248403315672e-06, + "loss": 2.7789, + "step": 79265 + }, + { + "epoch": 5.385922000271776, + "grad_norm": 7.2636189460754395, + "learning_rate": 3.2704001902432396e-06, + "loss": 2.7393, + "step": 79270 + }, + { + "epoch": 5.386261720342437, + "grad_norm": 7.586832523345947, + "learning_rate": 3.269975540154913e-06, + "loss": 2.9524, + "step": 79275 + }, + { + "epoch": 5.3866014404130995, + "grad_norm": 6.525365829467773, + "learning_rate": 3.2695508900665856e-06, + "loss": 2.8235, + "step": 79280 + }, + { + "epoch": 5.386941160483762, + "grad_norm": 7.620903491973877, + "learning_rate": 3.269126239978258e-06, + "loss": 2.658, + "step": 79285 + }, + { + "epoch": 5.387280880554423, + "grad_norm": 6.682773590087891, + "learning_rate": 3.2687015898899312e-06, + "loss": 2.8266, + "step": 79290 + }, + { + "epoch": 5.387620600625085, + "grad_norm": 6.090714931488037, + "learning_rate": 3.268276939801604e-06, + "loss": 2.696, + "step": 79295 + }, + { + "epoch": 5.387960320695747, + "grad_norm": 6.503645420074463, + "learning_rate": 3.2678522897132764e-06, + "loss": 2.5147, + "step": 79300 + }, + { + "epoch": 5.388300040766408, + "grad_norm": 9.861836433410645, + "learning_rate": 3.267427639624949e-06, + "loss": 2.7374, + "step": 79305 + }, + { + "epoch": 5.38863976083707, + "grad_norm": 5.641104698181152, + "learning_rate": 3.2670029895366224e-06, + "loss": 2.835, + "step": 79310 + }, + { + "epoch": 5.388979480907732, + "grad_norm": 7.184336185455322, + "learning_rate": 3.266578339448295e-06, + "loss": 2.7312, + "step": 79315 + }, + { + "epoch": 5.3893192009783935, + "grad_norm": 7.908451080322266, + "learning_rate": 3.2661536893599676e-06, + "loss": 2.8029, + "step": 79320 + }, + { + "epoch": 5.3896589210490555, + "grad_norm": 7.687658309936523, + "learning_rate": 3.265729039271641e-06, + "loss": 2.6839, + "step": 79325 + }, + { + "epoch": 5.389998641119718, + "grad_norm": 6.9040093421936035, + "learning_rate": 3.2653043891833132e-06, + "loss": 2.9548, + "step": 79330 + }, + { + "epoch": 5.390338361190379, + "grad_norm": 6.908881187438965, + "learning_rate": 3.264879739094986e-06, + "loss": 2.7502, + "step": 79335 + }, + { + "epoch": 5.390678081261041, + "grad_norm": 6.018588066101074, + "learning_rate": 3.264455089006659e-06, + "loss": 2.8419, + "step": 79340 + }, + { + "epoch": 5.391017801331703, + "grad_norm": 8.703825950622559, + "learning_rate": 3.264030438918331e-06, + "loss": 2.7083, + "step": 79345 + }, + { + "epoch": 5.391357521402364, + "grad_norm": 9.67609977722168, + "learning_rate": 3.2636057888300044e-06, + "loss": 2.9077, + "step": 79350 + }, + { + "epoch": 5.391697241473026, + "grad_norm": 7.537957191467285, + "learning_rate": 3.2631811387416772e-06, + "loss": 2.6857, + "step": 79355 + }, + { + "epoch": 5.392036961543688, + "grad_norm": 7.646080017089844, + "learning_rate": 3.2627564886533496e-06, + "loss": 2.7979, + "step": 79360 + }, + { + "epoch": 5.3923766816143495, + "grad_norm": 8.599396705627441, + "learning_rate": 3.262331838565023e-06, + "loss": 2.8414, + "step": 79365 + }, + { + "epoch": 5.3927164016850115, + "grad_norm": 7.819849014282227, + "learning_rate": 3.2619071884766956e-06, + "loss": 2.5768, + "step": 79370 + }, + { + "epoch": 5.393056121755674, + "grad_norm": 8.355876922607422, + "learning_rate": 3.261482538388368e-06, + "loss": 2.8747, + "step": 79375 + }, + { + "epoch": 5.393395841826335, + "grad_norm": 7.167396545410156, + "learning_rate": 3.261057888300041e-06, + "loss": 2.5825, + "step": 79380 + }, + { + "epoch": 5.393735561896997, + "grad_norm": 8.080140113830566, + "learning_rate": 3.260633238211714e-06, + "loss": 2.797, + "step": 79385 + }, + { + "epoch": 5.394075281967659, + "grad_norm": 7.593163967132568, + "learning_rate": 3.2602085881233864e-06, + "loss": 2.9504, + "step": 79390 + }, + { + "epoch": 5.39441500203832, + "grad_norm": 7.495482921600342, + "learning_rate": 3.259783938035059e-06, + "loss": 2.7341, + "step": 79395 + }, + { + "epoch": 5.394754722108982, + "grad_norm": 7.741452693939209, + "learning_rate": 3.2593592879467324e-06, + "loss": 2.8318, + "step": 79400 + }, + { + "epoch": 5.395094442179644, + "grad_norm": 8.330422401428223, + "learning_rate": 3.258934637858405e-06, + "loss": 2.6883, + "step": 79405 + }, + { + "epoch": 5.3954341622503055, + "grad_norm": 7.53606653213501, + "learning_rate": 3.2585099877700776e-06, + "loss": 2.9389, + "step": 79410 + }, + { + "epoch": 5.3957738823209676, + "grad_norm": 9.216047286987305, + "learning_rate": 3.258085337681751e-06, + "loss": 2.7584, + "step": 79415 + }, + { + "epoch": 5.39611360239163, + "grad_norm": 5.38931131362915, + "learning_rate": 3.257660687593423e-06, + "loss": 2.8458, + "step": 79420 + }, + { + "epoch": 5.396453322462291, + "grad_norm": 6.747099876403809, + "learning_rate": 3.257236037505096e-06, + "loss": 2.7712, + "step": 79425 + }, + { + "epoch": 5.396793042532953, + "grad_norm": 6.9178853034973145, + "learning_rate": 3.256811387416769e-06, + "loss": 2.8077, + "step": 79430 + }, + { + "epoch": 5.397132762603615, + "grad_norm": 8.105717658996582, + "learning_rate": 3.256386737328442e-06, + "loss": 2.7517, + "step": 79435 + }, + { + "epoch": 5.397472482674276, + "grad_norm": 6.044765472412109, + "learning_rate": 3.2559620872401144e-06, + "loss": 2.9866, + "step": 79440 + }, + { + "epoch": 5.397812202744938, + "grad_norm": 6.937339782714844, + "learning_rate": 3.255537437151787e-06, + "loss": 2.7629, + "step": 79445 + }, + { + "epoch": 5.3981519228156, + "grad_norm": 7.739088535308838, + "learning_rate": 3.2551127870634604e-06, + "loss": 2.6459, + "step": 79450 + }, + { + "epoch": 5.3984916428862615, + "grad_norm": 9.369369506835938, + "learning_rate": 3.254688136975133e-06, + "loss": 2.6274, + "step": 79455 + }, + { + "epoch": 5.398831362956924, + "grad_norm": 6.7217841148376465, + "learning_rate": 3.2542634868868056e-06, + "loss": 2.9081, + "step": 79460 + }, + { + "epoch": 5.399171083027586, + "grad_norm": 9.404096603393555, + "learning_rate": 3.2538388367984784e-06, + "loss": 2.4054, + "step": 79465 + }, + { + "epoch": 5.399510803098247, + "grad_norm": 7.24492073059082, + "learning_rate": 3.2534141867101508e-06, + "loss": 2.7765, + "step": 79470 + }, + { + "epoch": 5.399850523168909, + "grad_norm": 7.942500114440918, + "learning_rate": 3.252989536621824e-06, + "loss": 2.7988, + "step": 79475 + }, + { + "epoch": 5.400190243239571, + "grad_norm": 6.652422904968262, + "learning_rate": 3.252564886533497e-06, + "loss": 2.8666, + "step": 79480 + }, + { + "epoch": 5.400529963310232, + "grad_norm": 6.762847423553467, + "learning_rate": 3.252140236445169e-06, + "loss": 2.6241, + "step": 79485 + }, + { + "epoch": 5.400869683380894, + "grad_norm": 6.769217014312744, + "learning_rate": 3.2517155863568424e-06, + "loss": 2.8717, + "step": 79490 + }, + { + "epoch": 5.401209403451556, + "grad_norm": 7.587493896484375, + "learning_rate": 3.2512909362685152e-06, + "loss": 2.9109, + "step": 79495 + }, + { + "epoch": 5.4015491235222175, + "grad_norm": 7.984808921813965, + "learning_rate": 3.2508662861801876e-06, + "loss": 2.8606, + "step": 79500 + }, + { + "epoch": 5.40188884359288, + "grad_norm": 9.522405624389648, + "learning_rate": 3.2504416360918604e-06, + "loss": 2.7007, + "step": 79505 + }, + { + "epoch": 5.402228563663542, + "grad_norm": 7.174073696136475, + "learning_rate": 3.2500169860035336e-06, + "loss": 2.7228, + "step": 79510 + }, + { + "epoch": 5.402568283734203, + "grad_norm": 8.562308311462402, + "learning_rate": 3.249592335915206e-06, + "loss": 2.7525, + "step": 79515 + }, + { + "epoch": 5.402908003804865, + "grad_norm": 9.124753952026367, + "learning_rate": 3.249167685826879e-06, + "loss": 2.8196, + "step": 79520 + }, + { + "epoch": 5.403247723875527, + "grad_norm": 7.281068801879883, + "learning_rate": 3.248743035738552e-06, + "loss": 2.8455, + "step": 79525 + }, + { + "epoch": 5.403587443946188, + "grad_norm": 8.441579818725586, + "learning_rate": 3.2483183856502244e-06, + "loss": 2.897, + "step": 79530 + }, + { + "epoch": 5.40392716401685, + "grad_norm": 8.180581092834473, + "learning_rate": 3.247893735561897e-06, + "loss": 2.6401, + "step": 79535 + }, + { + "epoch": 5.404266884087512, + "grad_norm": 9.174281120300293, + "learning_rate": 3.24746908547357e-06, + "loss": 2.748, + "step": 79540 + }, + { + "epoch": 5.4046066041581735, + "grad_norm": 6.864157199859619, + "learning_rate": 3.247044435385243e-06, + "loss": 2.8472, + "step": 79545 + }, + { + "epoch": 5.404946324228836, + "grad_norm": 7.38355827331543, + "learning_rate": 3.2466197852969156e-06, + "loss": 2.858, + "step": 79550 + }, + { + "epoch": 5.405286044299498, + "grad_norm": 8.316614151000977, + "learning_rate": 3.2461951352085884e-06, + "loss": 2.9747, + "step": 79555 + }, + { + "epoch": 5.405625764370159, + "grad_norm": 7.529183864593506, + "learning_rate": 3.2457704851202608e-06, + "loss": 2.9396, + "step": 79560 + }, + { + "epoch": 5.405965484440821, + "grad_norm": 7.433546543121338, + "learning_rate": 3.245345835031934e-06, + "loss": 2.648, + "step": 79565 + }, + { + "epoch": 5.406305204511483, + "grad_norm": 6.8754143714904785, + "learning_rate": 3.244921184943607e-06, + "loss": 2.785, + "step": 79570 + }, + { + "epoch": 5.406644924582144, + "grad_norm": 9.090396881103516, + "learning_rate": 3.244496534855279e-06, + "loss": 2.7427, + "step": 79575 + }, + { + "epoch": 5.406984644652806, + "grad_norm": 9.729501724243164, + "learning_rate": 3.2440718847669524e-06, + "loss": 2.8866, + "step": 79580 + }, + { + "epoch": 5.4073243647234674, + "grad_norm": 7.47044563293457, + "learning_rate": 3.243647234678625e-06, + "loss": 2.6109, + "step": 79585 + }, + { + "epoch": 5.4076640847941295, + "grad_norm": 7.376293182373047, + "learning_rate": 3.2432225845902976e-06, + "loss": 2.6998, + "step": 79590 + }, + { + "epoch": 5.408003804864792, + "grad_norm": 7.865494728088379, + "learning_rate": 3.2427979345019704e-06, + "loss": 2.5753, + "step": 79595 + }, + { + "epoch": 5.408343524935453, + "grad_norm": 6.291088581085205, + "learning_rate": 3.2423732844136436e-06, + "loss": 3.0255, + "step": 79600 + }, + { + "epoch": 5.408683245006115, + "grad_norm": 6.766725063323975, + "learning_rate": 3.2419486343253164e-06, + "loss": 2.7076, + "step": 79605 + }, + { + "epoch": 5.409022965076777, + "grad_norm": 7.5765862464904785, + "learning_rate": 3.2415239842369888e-06, + "loss": 2.7695, + "step": 79610 + }, + { + "epoch": 5.409362685147438, + "grad_norm": 6.940617084503174, + "learning_rate": 3.241099334148662e-06, + "loss": 2.529, + "step": 79615 + }, + { + "epoch": 5.4097024052181, + "grad_norm": 7.161371231079102, + "learning_rate": 3.240674684060335e-06, + "loss": 2.6492, + "step": 79620 + }, + { + "epoch": 5.410042125288762, + "grad_norm": 7.884843826293945, + "learning_rate": 3.240250033972007e-06, + "loss": 3.0568, + "step": 79625 + }, + { + "epoch": 5.4103818453594235, + "grad_norm": 8.069146156311035, + "learning_rate": 3.23982538388368e-06, + "loss": 2.7686, + "step": 79630 + }, + { + "epoch": 5.4107215654300855, + "grad_norm": 6.867645263671875, + "learning_rate": 3.2394007337953532e-06, + "loss": 2.8803, + "step": 79635 + }, + { + "epoch": 5.411061285500748, + "grad_norm": 7.586307525634766, + "learning_rate": 3.2389760837070256e-06, + "loss": 2.7238, + "step": 79640 + }, + { + "epoch": 5.411401005571409, + "grad_norm": 7.339497089385986, + "learning_rate": 3.2385514336186984e-06, + "loss": 2.8066, + "step": 79645 + }, + { + "epoch": 5.411740725642071, + "grad_norm": 6.973203182220459, + "learning_rate": 3.2381267835303716e-06, + "loss": 2.6622, + "step": 79650 + }, + { + "epoch": 5.412080445712733, + "grad_norm": 6.178313255310059, + "learning_rate": 3.237702133442044e-06, + "loss": 2.9652, + "step": 79655 + }, + { + "epoch": 5.412420165783394, + "grad_norm": 7.235699653625488, + "learning_rate": 3.237277483353717e-06, + "loss": 2.7098, + "step": 79660 + }, + { + "epoch": 5.412759885854056, + "grad_norm": 7.499035358428955, + "learning_rate": 3.2368528332653896e-06, + "loss": 2.9186, + "step": 79665 + }, + { + "epoch": 5.413099605924718, + "grad_norm": 8.073675155639648, + "learning_rate": 3.2364281831770624e-06, + "loss": 2.8179, + "step": 79670 + }, + { + "epoch": 5.4134393259953795, + "grad_norm": 6.900339603424072, + "learning_rate": 3.236003533088735e-06, + "loss": 2.8303, + "step": 79675 + }, + { + "epoch": 5.4137790460660415, + "grad_norm": 6.381783485412598, + "learning_rate": 3.235578883000408e-06, + "loss": 2.7183, + "step": 79680 + }, + { + "epoch": 5.414118766136704, + "grad_norm": 6.828946113586426, + "learning_rate": 3.2351542329120804e-06, + "loss": 2.6916, + "step": 79685 + }, + { + "epoch": 5.414458486207365, + "grad_norm": 7.856477737426758, + "learning_rate": 3.2347295828237536e-06, + "loss": 2.8759, + "step": 79690 + }, + { + "epoch": 5.414798206278027, + "grad_norm": 6.919300079345703, + "learning_rate": 3.2343049327354264e-06, + "loss": 2.8594, + "step": 79695 + }, + { + "epoch": 5.415137926348689, + "grad_norm": 9.663956642150879, + "learning_rate": 3.2338802826470988e-06, + "loss": 2.8916, + "step": 79700 + }, + { + "epoch": 5.41547764641935, + "grad_norm": 5.807082176208496, + "learning_rate": 3.233455632558772e-06, + "loss": 2.7936, + "step": 79705 + }, + { + "epoch": 5.415817366490012, + "grad_norm": 8.47457218170166, + "learning_rate": 3.233030982470445e-06, + "loss": 2.7371, + "step": 79710 + }, + { + "epoch": 5.416157086560674, + "grad_norm": 6.8300251960754395, + "learning_rate": 3.232606332382117e-06, + "loss": 2.6414, + "step": 79715 + }, + { + "epoch": 5.4164968066313355, + "grad_norm": 7.211875915527344, + "learning_rate": 3.23218168229379e-06, + "loss": 2.6404, + "step": 79720 + }, + { + "epoch": 5.416836526701998, + "grad_norm": 7.425293922424316, + "learning_rate": 3.231757032205463e-06, + "loss": 2.7687, + "step": 79725 + }, + { + "epoch": 5.41717624677266, + "grad_norm": 7.140376091003418, + "learning_rate": 3.2313323821171356e-06, + "loss": 2.8353, + "step": 79730 + }, + { + "epoch": 5.417515966843321, + "grad_norm": 5.944222450256348, + "learning_rate": 3.2309077320288084e-06, + "loss": 2.7807, + "step": 79735 + }, + { + "epoch": 5.417855686913983, + "grad_norm": 8.178101539611816, + "learning_rate": 3.2304830819404816e-06, + "loss": 2.7437, + "step": 79740 + }, + { + "epoch": 5.418195406984645, + "grad_norm": 9.602950096130371, + "learning_rate": 3.230058431852154e-06, + "loss": 2.7944, + "step": 79745 + }, + { + "epoch": 5.418535127055306, + "grad_norm": 9.117044448852539, + "learning_rate": 3.2296337817638268e-06, + "loss": 2.868, + "step": 79750 + }, + { + "epoch": 5.418874847125968, + "grad_norm": 9.761408805847168, + "learning_rate": 3.2292091316754996e-06, + "loss": 3.0438, + "step": 79755 + }, + { + "epoch": 5.41921456719663, + "grad_norm": 9.057791709899902, + "learning_rate": 3.228784481587172e-06, + "loss": 2.544, + "step": 79760 + }, + { + "epoch": 5.4195542872672915, + "grad_norm": 7.9863176345825195, + "learning_rate": 3.228359831498845e-06, + "loss": 2.7358, + "step": 79765 + }, + { + "epoch": 5.419894007337954, + "grad_norm": 7.886415481567383, + "learning_rate": 3.227935181410518e-06, + "loss": 2.7053, + "step": 79770 + }, + { + "epoch": 5.420233727408616, + "grad_norm": 6.881526470184326, + "learning_rate": 3.227510531322191e-06, + "loss": 2.8469, + "step": 79775 + }, + { + "epoch": 5.420573447479277, + "grad_norm": 8.06374454498291, + "learning_rate": 3.2270858812338636e-06, + "loss": 2.9915, + "step": 79780 + }, + { + "epoch": 5.420913167549939, + "grad_norm": 7.093713760375977, + "learning_rate": 3.2266612311455364e-06, + "loss": 2.9137, + "step": 79785 + }, + { + "epoch": 5.421252887620601, + "grad_norm": 9.9684419631958, + "learning_rate": 3.226236581057209e-06, + "loss": 2.7243, + "step": 79790 + }, + { + "epoch": 5.421592607691262, + "grad_norm": 7.979467391967773, + "learning_rate": 3.225811930968882e-06, + "loss": 2.6139, + "step": 79795 + }, + { + "epoch": 5.421932327761924, + "grad_norm": 7.7276129722595215, + "learning_rate": 3.225387280880555e-06, + "loss": 2.7316, + "step": 79800 + }, + { + "epoch": 5.422272047832586, + "grad_norm": 6.540769577026367, + "learning_rate": 3.2249626307922276e-06, + "loss": 2.5599, + "step": 79805 + }, + { + "epoch": 5.4226117679032475, + "grad_norm": 10.066524505615234, + "learning_rate": 3.2245379807039e-06, + "loss": 2.7194, + "step": 79810 + }, + { + "epoch": 5.42295148797391, + "grad_norm": 13.425704956054688, + "learning_rate": 3.224113330615573e-06, + "loss": 2.9823, + "step": 79815 + }, + { + "epoch": 5.423291208044572, + "grad_norm": 7.279655933380127, + "learning_rate": 3.223688680527246e-06, + "loss": 2.7713, + "step": 79820 + }, + { + "epoch": 5.423630928115233, + "grad_norm": 7.841846466064453, + "learning_rate": 3.2232640304389184e-06, + "loss": 2.9453, + "step": 79825 + }, + { + "epoch": 5.423970648185895, + "grad_norm": 8.185593605041504, + "learning_rate": 3.2228393803505916e-06, + "loss": 2.8931, + "step": 79830 + }, + { + "epoch": 5.424310368256557, + "grad_norm": 6.954844951629639, + "learning_rate": 3.2224147302622644e-06, + "loss": 2.9643, + "step": 79835 + }, + { + "epoch": 5.424650088327218, + "grad_norm": 7.965742111206055, + "learning_rate": 3.2219900801739368e-06, + "loss": 2.8696, + "step": 79840 + }, + { + "epoch": 5.42498980839788, + "grad_norm": 6.475313663482666, + "learning_rate": 3.2215654300856096e-06, + "loss": 2.8359, + "step": 79845 + }, + { + "epoch": 5.425329528468542, + "grad_norm": 6.070295333862305, + "learning_rate": 3.221140779997283e-06, + "loss": 2.8226, + "step": 79850 + }, + { + "epoch": 5.4256692485392035, + "grad_norm": 7.217876434326172, + "learning_rate": 3.220716129908955e-06, + "loss": 2.8705, + "step": 79855 + }, + { + "epoch": 5.426008968609866, + "grad_norm": 7.6996049880981445, + "learning_rate": 3.220291479820628e-06, + "loss": 2.5943, + "step": 79860 + }, + { + "epoch": 5.426348688680527, + "grad_norm": 8.492180824279785, + "learning_rate": 3.219866829732301e-06, + "loss": 2.9627, + "step": 79865 + }, + { + "epoch": 5.426688408751189, + "grad_norm": 6.991530895233154, + "learning_rate": 3.2194421796439736e-06, + "loss": 2.5233, + "step": 79870 + }, + { + "epoch": 5.427028128821851, + "grad_norm": 13.11917781829834, + "learning_rate": 3.2190175295556464e-06, + "loss": 2.8285, + "step": 79875 + }, + { + "epoch": 5.427367848892512, + "grad_norm": 6.180764675140381, + "learning_rate": 3.218592879467319e-06, + "loss": 2.6874, + "step": 79880 + }, + { + "epoch": 5.427707568963174, + "grad_norm": 6.825751781463623, + "learning_rate": 3.2181682293789916e-06, + "loss": 2.574, + "step": 79885 + }, + { + "epoch": 5.428047289033836, + "grad_norm": 8.040400505065918, + "learning_rate": 3.2177435792906648e-06, + "loss": 2.8914, + "step": 79890 + }, + { + "epoch": 5.4283870091044975, + "grad_norm": 6.606110572814941, + "learning_rate": 3.2173189292023376e-06, + "loss": 2.9111, + "step": 79895 + }, + { + "epoch": 5.4287267291751595, + "grad_norm": 9.055234909057617, + "learning_rate": 3.21689427911401e-06, + "loss": 2.7929, + "step": 79900 + }, + { + "epoch": 5.429066449245822, + "grad_norm": 7.030545234680176, + "learning_rate": 3.216469629025683e-06, + "loss": 2.7715, + "step": 79905 + }, + { + "epoch": 5.429406169316483, + "grad_norm": 8.850118637084961, + "learning_rate": 3.216044978937356e-06, + "loss": 2.9833, + "step": 79910 + }, + { + "epoch": 5.429745889387145, + "grad_norm": 9.238393783569336, + "learning_rate": 3.2156203288490284e-06, + "loss": 2.8026, + "step": 79915 + }, + { + "epoch": 5.430085609457807, + "grad_norm": 9.018233299255371, + "learning_rate": 3.2151956787607016e-06, + "loss": 2.4245, + "step": 79920 + }, + { + "epoch": 5.430425329528468, + "grad_norm": 5.8928070068359375, + "learning_rate": 3.2147710286723744e-06, + "loss": 2.8045, + "step": 79925 + }, + { + "epoch": 5.43076504959913, + "grad_norm": 7.709640026092529, + "learning_rate": 3.2143463785840468e-06, + "loss": 2.5474, + "step": 79930 + }, + { + "epoch": 5.431104769669792, + "grad_norm": 7.81953763961792, + "learning_rate": 3.2139217284957196e-06, + "loss": 2.4526, + "step": 79935 + }, + { + "epoch": 5.4314444897404535, + "grad_norm": 7.45698356628418, + "learning_rate": 3.213497078407393e-06, + "loss": 2.8146, + "step": 79940 + }, + { + "epoch": 5.4317842098111155, + "grad_norm": 7.49837589263916, + "learning_rate": 3.2130724283190656e-06, + "loss": 2.7935, + "step": 79945 + }, + { + "epoch": 5.432123929881778, + "grad_norm": 6.280383586883545, + "learning_rate": 3.212647778230738e-06, + "loss": 2.6482, + "step": 79950 + }, + { + "epoch": 5.432463649952439, + "grad_norm": 8.515695571899414, + "learning_rate": 3.212223128142411e-06, + "loss": 2.7768, + "step": 79955 + }, + { + "epoch": 5.432803370023101, + "grad_norm": 9.924869537353516, + "learning_rate": 3.211798478054084e-06, + "loss": 2.8942, + "step": 79960 + }, + { + "epoch": 5.433143090093763, + "grad_norm": 10.516337394714355, + "learning_rate": 3.2113738279657564e-06, + "loss": 2.8766, + "step": 79965 + }, + { + "epoch": 5.433482810164424, + "grad_norm": 6.858526229858398, + "learning_rate": 3.210949177877429e-06, + "loss": 2.7903, + "step": 79970 + }, + { + "epoch": 5.433822530235086, + "grad_norm": 7.2702484130859375, + "learning_rate": 3.2105245277891024e-06, + "loss": 2.7525, + "step": 79975 + }, + { + "epoch": 5.434162250305748, + "grad_norm": 8.255374908447266, + "learning_rate": 3.2100998777007748e-06, + "loss": 2.6515, + "step": 79980 + }, + { + "epoch": 5.4345019703764095, + "grad_norm": 7.982422351837158, + "learning_rate": 3.2096752276124476e-06, + "loss": 2.6775, + "step": 79985 + }, + { + "epoch": 5.4348416904470715, + "grad_norm": 6.5863471031188965, + "learning_rate": 3.209250577524121e-06, + "loss": 3.02, + "step": 79990 + }, + { + "epoch": 5.435181410517734, + "grad_norm": 6.462390422821045, + "learning_rate": 3.208825927435793e-06, + "loss": 2.6805, + "step": 79995 + }, + { + "epoch": 5.435521130588395, + "grad_norm": 8.996553421020508, + "learning_rate": 3.208401277347466e-06, + "loss": 2.832, + "step": 80000 + }, + { + "epoch": 5.435860850659057, + "grad_norm": 7.107350826263428, + "learning_rate": 3.2079766272591388e-06, + "loss": 2.9494, + "step": 80005 + }, + { + "epoch": 5.436200570729719, + "grad_norm": 7.466838359832764, + "learning_rate": 3.207551977170811e-06, + "loss": 2.8377, + "step": 80010 + }, + { + "epoch": 5.43654029080038, + "grad_norm": 6.915792465209961, + "learning_rate": 3.2071273270824844e-06, + "loss": 2.8399, + "step": 80015 + }, + { + "epoch": 5.436880010871042, + "grad_norm": 8.833065032958984, + "learning_rate": 3.206702676994157e-06, + "loss": 2.8623, + "step": 80020 + }, + { + "epoch": 5.437219730941704, + "grad_norm": 8.012791633605957, + "learning_rate": 3.2062780269058296e-06, + "loss": 2.6137, + "step": 80025 + }, + { + "epoch": 5.4375594510123655, + "grad_norm": 9.896724700927734, + "learning_rate": 3.2058533768175028e-06, + "loss": 2.9294, + "step": 80030 + }, + { + "epoch": 5.437899171083028, + "grad_norm": 6.567287921905518, + "learning_rate": 3.2054287267291756e-06, + "loss": 2.8601, + "step": 80035 + }, + { + "epoch": 5.43823889115369, + "grad_norm": 5.72925329208374, + "learning_rate": 3.205004076640848e-06, + "loss": 2.7122, + "step": 80040 + }, + { + "epoch": 5.438578611224351, + "grad_norm": 6.243253231048584, + "learning_rate": 3.2045794265525208e-06, + "loss": 2.6367, + "step": 80045 + }, + { + "epoch": 5.438918331295013, + "grad_norm": 8.197590827941895, + "learning_rate": 3.204154776464194e-06, + "loss": 2.842, + "step": 80050 + }, + { + "epoch": 5.439258051365675, + "grad_norm": 8.348443984985352, + "learning_rate": 3.2037301263758664e-06, + "loss": 2.8977, + "step": 80055 + }, + { + "epoch": 5.439597771436336, + "grad_norm": 6.244843482971191, + "learning_rate": 3.203305476287539e-06, + "loss": 2.6699, + "step": 80060 + }, + { + "epoch": 5.439937491506998, + "grad_norm": 8.154508590698242, + "learning_rate": 3.2028808261992124e-06, + "loss": 2.5989, + "step": 80065 + }, + { + "epoch": 5.44027721157766, + "grad_norm": 6.805279731750488, + "learning_rate": 3.2024561761108848e-06, + "loss": 2.7026, + "step": 80070 + }, + { + "epoch": 5.4406169316483215, + "grad_norm": 7.046724319458008, + "learning_rate": 3.2020315260225576e-06, + "loss": 2.744, + "step": 80075 + }, + { + "epoch": 5.440956651718984, + "grad_norm": 8.505802154541016, + "learning_rate": 3.2016068759342308e-06, + "loss": 2.8675, + "step": 80080 + }, + { + "epoch": 5.441296371789646, + "grad_norm": 6.908926486968994, + "learning_rate": 3.201182225845903e-06, + "loss": 2.9729, + "step": 80085 + }, + { + "epoch": 5.441636091860307, + "grad_norm": 7.175556182861328, + "learning_rate": 3.200757575757576e-06, + "loss": 2.6116, + "step": 80090 + }, + { + "epoch": 5.441975811930969, + "grad_norm": 8.231294631958008, + "learning_rate": 3.2003329256692488e-06, + "loss": 2.6561, + "step": 80095 + }, + { + "epoch": 5.442315532001631, + "grad_norm": 8.838484764099121, + "learning_rate": 3.199908275580921e-06, + "loss": 2.6823, + "step": 80100 + }, + { + "epoch": 5.442655252072292, + "grad_norm": 5.907911777496338, + "learning_rate": 3.1994836254925944e-06, + "loss": 2.7936, + "step": 80105 + }, + { + "epoch": 5.442994972142954, + "grad_norm": 8.697554588317871, + "learning_rate": 3.199058975404267e-06, + "loss": 2.8179, + "step": 80110 + }, + { + "epoch": 5.443334692213616, + "grad_norm": 7.209812164306641, + "learning_rate": 3.1986343253159404e-06, + "loss": 3.0014, + "step": 80115 + }, + { + "epoch": 5.4436744122842775, + "grad_norm": 6.579317569732666, + "learning_rate": 3.1982096752276128e-06, + "loss": 2.8139, + "step": 80120 + }, + { + "epoch": 5.44401413235494, + "grad_norm": 7.674442291259766, + "learning_rate": 3.1977850251392856e-06, + "loss": 2.6953, + "step": 80125 + }, + { + "epoch": 5.444353852425602, + "grad_norm": 6.980925559997559, + "learning_rate": 3.1973603750509584e-06, + "loss": 2.9501, + "step": 80130 + }, + { + "epoch": 5.444693572496263, + "grad_norm": 8.09326457977295, + "learning_rate": 3.1969357249626307e-06, + "loss": 2.9066, + "step": 80135 + }, + { + "epoch": 5.445033292566925, + "grad_norm": 9.98365592956543, + "learning_rate": 3.196511074874304e-06, + "loss": 2.9729, + "step": 80140 + }, + { + "epoch": 5.445373012637587, + "grad_norm": 12.942919731140137, + "learning_rate": 3.1960864247859768e-06, + "loss": 2.7987, + "step": 80145 + }, + { + "epoch": 5.445712732708248, + "grad_norm": 7.267859935760498, + "learning_rate": 3.195661774697649e-06, + "loss": 2.8041, + "step": 80150 + }, + { + "epoch": 5.44605245277891, + "grad_norm": 7.27834415435791, + "learning_rate": 3.1952371246093224e-06, + "loss": 2.6472, + "step": 80155 + }, + { + "epoch": 5.446392172849572, + "grad_norm": 7.078075885772705, + "learning_rate": 3.194812474520995e-06, + "loss": 2.8647, + "step": 80160 + }, + { + "epoch": 5.4467318929202335, + "grad_norm": 5.38944149017334, + "learning_rate": 3.1943878244326676e-06, + "loss": 2.8922, + "step": 80165 + }, + { + "epoch": 5.447071612990896, + "grad_norm": 8.441191673278809, + "learning_rate": 3.1939631743443404e-06, + "loss": 2.9333, + "step": 80170 + }, + { + "epoch": 5.447411333061558, + "grad_norm": 9.169171333312988, + "learning_rate": 3.1935385242560136e-06, + "loss": 2.8453, + "step": 80175 + }, + { + "epoch": 5.447751053132219, + "grad_norm": 7.434726238250732, + "learning_rate": 3.193113874167686e-06, + "loss": 2.7251, + "step": 80180 + }, + { + "epoch": 5.448090773202881, + "grad_norm": 8.92915153503418, + "learning_rate": 3.1926892240793588e-06, + "loss": 2.9376, + "step": 80185 + }, + { + "epoch": 5.448430493273543, + "grad_norm": 6.821780681610107, + "learning_rate": 3.192264573991032e-06, + "loss": 2.922, + "step": 80190 + }, + { + "epoch": 5.448770213344204, + "grad_norm": 8.448460578918457, + "learning_rate": 3.1918399239027044e-06, + "loss": 2.873, + "step": 80195 + }, + { + "epoch": 5.449109933414866, + "grad_norm": 8.345943450927734, + "learning_rate": 3.191415273814377e-06, + "loss": 2.9401, + "step": 80200 + }, + { + "epoch": 5.449449653485528, + "grad_norm": 6.220082759857178, + "learning_rate": 3.1909906237260504e-06, + "loss": 2.7717, + "step": 80205 + }, + { + "epoch": 5.4497893735561895, + "grad_norm": 8.74923324584961, + "learning_rate": 3.1905659736377228e-06, + "loss": 2.7886, + "step": 80210 + }, + { + "epoch": 5.450129093626852, + "grad_norm": 7.773735046386719, + "learning_rate": 3.1901413235493956e-06, + "loss": 2.9954, + "step": 80215 + }, + { + "epoch": 5.450468813697514, + "grad_norm": 7.14982271194458, + "learning_rate": 3.1897166734610684e-06, + "loss": 2.7173, + "step": 80220 + }, + { + "epoch": 5.450808533768175, + "grad_norm": 8.8048677444458, + "learning_rate": 3.1892920233727407e-06, + "loss": 2.6808, + "step": 80225 + }, + { + "epoch": 5.451148253838837, + "grad_norm": 9.125326156616211, + "learning_rate": 3.188867373284414e-06, + "loss": 2.8731, + "step": 80230 + }, + { + "epoch": 5.451487973909499, + "grad_norm": 6.945939540863037, + "learning_rate": 3.1884427231960868e-06, + "loss": 2.8737, + "step": 80235 + }, + { + "epoch": 5.45182769398016, + "grad_norm": 6.846155643463135, + "learning_rate": 3.188018073107759e-06, + "loss": 2.6579, + "step": 80240 + }, + { + "epoch": 5.452167414050822, + "grad_norm": 7.895087718963623, + "learning_rate": 3.1875934230194324e-06, + "loss": 2.7088, + "step": 80245 + }, + { + "epoch": 5.452507134121484, + "grad_norm": 7.3231987953186035, + "learning_rate": 3.187168772931105e-06, + "loss": 2.5687, + "step": 80250 + }, + { + "epoch": 5.4528468541921455, + "grad_norm": 6.814955234527588, + "learning_rate": 3.1867441228427775e-06, + "loss": 3.0303, + "step": 80255 + }, + { + "epoch": 5.453186574262808, + "grad_norm": 8.605203628540039, + "learning_rate": 3.1863194727544503e-06, + "loss": 2.6019, + "step": 80260 + }, + { + "epoch": 5.45352629433347, + "grad_norm": 8.459290504455566, + "learning_rate": 3.1858948226661236e-06, + "loss": 2.8385, + "step": 80265 + }, + { + "epoch": 5.453866014404131, + "grad_norm": 7.546747207641602, + "learning_rate": 3.185470172577796e-06, + "loss": 2.8516, + "step": 80270 + }, + { + "epoch": 5.454205734474793, + "grad_norm": 8.95142936706543, + "learning_rate": 3.1850455224894687e-06, + "loss": 2.74, + "step": 80275 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 7.176027297973633, + "learning_rate": 3.184620872401142e-06, + "loss": 2.8215, + "step": 80280 + }, + { + "epoch": 5.454885174616116, + "grad_norm": 6.676496982574463, + "learning_rate": 3.1841962223128148e-06, + "loss": 2.7814, + "step": 80285 + }, + { + "epoch": 5.455224894686778, + "grad_norm": 10.144692420959473, + "learning_rate": 3.183771572224487e-06, + "loss": 2.6288, + "step": 80290 + }, + { + "epoch": 5.4555646147574395, + "grad_norm": 7.137001991271973, + "learning_rate": 3.18334692213616e-06, + "loss": 3.1709, + "step": 80295 + }, + { + "epoch": 5.4559043348281016, + "grad_norm": 8.288511276245117, + "learning_rate": 3.182922272047833e-06, + "loss": 3.0606, + "step": 80300 + }, + { + "epoch": 5.456244054898764, + "grad_norm": 8.137746810913086, + "learning_rate": 3.1824976219595056e-06, + "loss": 2.922, + "step": 80305 + }, + { + "epoch": 5.456583774969425, + "grad_norm": 7.5274457931518555, + "learning_rate": 3.1820729718711784e-06, + "loss": 2.7872, + "step": 80310 + }, + { + "epoch": 5.456923495040087, + "grad_norm": 8.860949516296387, + "learning_rate": 3.1816483217828516e-06, + "loss": 2.7348, + "step": 80315 + }, + { + "epoch": 5.457263215110749, + "grad_norm": 7.783829689025879, + "learning_rate": 3.181223671694524e-06, + "loss": 2.6098, + "step": 80320 + }, + { + "epoch": 5.45760293518141, + "grad_norm": 8.868389129638672, + "learning_rate": 3.1807990216061968e-06, + "loss": 2.8664, + "step": 80325 + }, + { + "epoch": 5.457942655252072, + "grad_norm": 7.156944751739502, + "learning_rate": 3.1803743715178696e-06, + "loss": 3.0354, + "step": 80330 + }, + { + "epoch": 5.458282375322734, + "grad_norm": 6.27373743057251, + "learning_rate": 3.1799497214295424e-06, + "loss": 2.9134, + "step": 80335 + }, + { + "epoch": 5.4586220953933955, + "grad_norm": 8.012731552124023, + "learning_rate": 3.179525071341215e-06, + "loss": 2.7924, + "step": 80340 + }, + { + "epoch": 5.458961815464058, + "grad_norm": 7.724426746368408, + "learning_rate": 3.179100421252888e-06, + "loss": 2.8532, + "step": 80345 + }, + { + "epoch": 5.45930153553472, + "grad_norm": 7.712535858154297, + "learning_rate": 3.1786757711645603e-06, + "loss": 2.7221, + "step": 80350 + }, + { + "epoch": 5.459641255605381, + "grad_norm": 8.040057182312012, + "learning_rate": 3.1782511210762336e-06, + "loss": 2.9167, + "step": 80355 + }, + { + "epoch": 5.459980975676043, + "grad_norm": 6.885867118835449, + "learning_rate": 3.1778264709879064e-06, + "loss": 2.6316, + "step": 80360 + }, + { + "epoch": 5.460320695746705, + "grad_norm": 7.6399946212768555, + "learning_rate": 3.1774018208995787e-06, + "loss": 3.017, + "step": 80365 + }, + { + "epoch": 5.460660415817366, + "grad_norm": 9.270859718322754, + "learning_rate": 3.176977170811252e-06, + "loss": 2.7247, + "step": 80370 + }, + { + "epoch": 5.461000135888028, + "grad_norm": 7.236166477203369, + "learning_rate": 3.1765525207229248e-06, + "loss": 2.8691, + "step": 80375 + }, + { + "epoch": 5.46133985595869, + "grad_norm": 9.910210609436035, + "learning_rate": 3.176127870634597e-06, + "loss": 2.9598, + "step": 80380 + }, + { + "epoch": 5.4616795760293515, + "grad_norm": 7.084965705871582, + "learning_rate": 3.17570322054627e-06, + "loss": 2.8682, + "step": 80385 + }, + { + "epoch": 5.462019296100014, + "grad_norm": 8.445430755615234, + "learning_rate": 3.175278570457943e-06, + "loss": 2.9571, + "step": 80390 + }, + { + "epoch": 5.462359016170676, + "grad_norm": 7.853988170623779, + "learning_rate": 3.1748539203696155e-06, + "loss": 2.8294, + "step": 80395 + }, + { + "epoch": 5.462698736241337, + "grad_norm": 6.834273338317871, + "learning_rate": 3.1744292702812883e-06, + "loss": 2.8203, + "step": 80400 + }, + { + "epoch": 5.463038456311999, + "grad_norm": 7.4142069816589355, + "learning_rate": 3.1740046201929616e-06, + "loss": 2.8011, + "step": 80405 + }, + { + "epoch": 5.463378176382661, + "grad_norm": 6.9620490074157715, + "learning_rate": 3.173579970104634e-06, + "loss": 2.8751, + "step": 80410 + }, + { + "epoch": 5.463717896453322, + "grad_norm": 7.291962146759033, + "learning_rate": 3.1731553200163067e-06, + "loss": 2.9312, + "step": 80415 + }, + { + "epoch": 5.464057616523984, + "grad_norm": 7.387164115905762, + "learning_rate": 3.1727306699279795e-06, + "loss": 2.6643, + "step": 80420 + }, + { + "epoch": 5.464397336594646, + "grad_norm": 6.405792236328125, + "learning_rate": 3.1723060198396523e-06, + "loss": 2.3985, + "step": 80425 + }, + { + "epoch": 5.4647370566653075, + "grad_norm": 8.585647583007812, + "learning_rate": 3.171881369751325e-06, + "loss": 2.8324, + "step": 80430 + }, + { + "epoch": 5.46507677673597, + "grad_norm": 6.1233296394348145, + "learning_rate": 3.171456719662998e-06, + "loss": 2.9382, + "step": 80435 + }, + { + "epoch": 5.465416496806632, + "grad_norm": 7.665796756744385, + "learning_rate": 3.1710320695746703e-06, + "loss": 2.8795, + "step": 80440 + }, + { + "epoch": 5.465756216877293, + "grad_norm": 9.217597961425781, + "learning_rate": 3.1706074194863435e-06, + "loss": 3.2493, + "step": 80445 + }, + { + "epoch": 5.466095936947955, + "grad_norm": 8.023078918457031, + "learning_rate": 3.1701827693980163e-06, + "loss": 3.1747, + "step": 80450 + }, + { + "epoch": 5.466435657018617, + "grad_norm": 6.559951305389404, + "learning_rate": 3.169758119309689e-06, + "loss": 2.6819, + "step": 80455 + }, + { + "epoch": 5.466775377089278, + "grad_norm": 7.898010730743408, + "learning_rate": 3.169333469221362e-06, + "loss": 2.8968, + "step": 80460 + }, + { + "epoch": 5.46711509715994, + "grad_norm": 10.37618350982666, + "learning_rate": 3.1689088191330348e-06, + "loss": 3.0968, + "step": 80465 + }, + { + "epoch": 5.467454817230602, + "grad_norm": 8.565732955932617, + "learning_rate": 3.1684841690447076e-06, + "loss": 2.685, + "step": 80470 + }, + { + "epoch": 5.4677945373012635, + "grad_norm": 7.589852809906006, + "learning_rate": 3.16805951895638e-06, + "loss": 2.6777, + "step": 80475 + }, + { + "epoch": 5.468134257371926, + "grad_norm": 7.724740028381348, + "learning_rate": 3.167634868868053e-06, + "loss": 2.8995, + "step": 80480 + }, + { + "epoch": 5.468473977442588, + "grad_norm": 6.921764373779297, + "learning_rate": 3.167210218779726e-06, + "loss": 2.7371, + "step": 80485 + }, + { + "epoch": 5.468813697513249, + "grad_norm": 9.111316680908203, + "learning_rate": 3.1667855686913983e-06, + "loss": 2.8809, + "step": 80490 + }, + { + "epoch": 5.469153417583911, + "grad_norm": 8.191106796264648, + "learning_rate": 3.1663609186030716e-06, + "loss": 2.7583, + "step": 80495 + }, + { + "epoch": 5.469493137654573, + "grad_norm": 7.865516662597656, + "learning_rate": 3.1659362685147444e-06, + "loss": 2.5894, + "step": 80500 + }, + { + "epoch": 5.469832857725234, + "grad_norm": 6.803445339202881, + "learning_rate": 3.1655116184264167e-06, + "loss": 2.9034, + "step": 80505 + }, + { + "epoch": 5.470172577795896, + "grad_norm": 7.229240894317627, + "learning_rate": 3.1650869683380895e-06, + "loss": 2.8702, + "step": 80510 + }, + { + "epoch": 5.470512297866558, + "grad_norm": Infinity, + "learning_rate": 3.164747248267428e-06, + "loss": 2.8603, + "step": 80515 + }, + { + "epoch": 5.4708520179372195, + "grad_norm": 10.874167442321777, + "learning_rate": 3.164322598179101e-06, + "loss": 2.6792, + "step": 80520 + }, + { + "epoch": 5.471191738007882, + "grad_norm": 8.346098899841309, + "learning_rate": 3.163897948090773e-06, + "loss": 2.8745, + "step": 80525 + }, + { + "epoch": 5.471531458078544, + "grad_norm": 5.938292026519775, + "learning_rate": 3.1634732980024464e-06, + "loss": 2.8039, + "step": 80530 + }, + { + "epoch": 5.471871178149205, + "grad_norm": 5.91140079498291, + "learning_rate": 3.1630486479141192e-06, + "loss": 2.6288, + "step": 80535 + }, + { + "epoch": 5.472210898219867, + "grad_norm": 9.854947090148926, + "learning_rate": 3.1626239978257916e-06, + "loss": 2.8352, + "step": 80540 + }, + { + "epoch": 5.472550618290528, + "grad_norm": 9.183088302612305, + "learning_rate": 3.1621993477374644e-06, + "loss": 2.9093, + "step": 80545 + }, + { + "epoch": 5.47289033836119, + "grad_norm": 7.364377021789551, + "learning_rate": 3.1617746976491376e-06, + "loss": 2.7614, + "step": 80550 + }, + { + "epoch": 5.473230058431852, + "grad_norm": 6.253659248352051, + "learning_rate": 3.16135004756081e-06, + "loss": 2.7177, + "step": 80555 + }, + { + "epoch": 5.4735697785025135, + "grad_norm": 7.679121017456055, + "learning_rate": 3.160925397472483e-06, + "loss": 2.7018, + "step": 80560 + }, + { + "epoch": 5.4739094985731755, + "grad_norm": 9.799028396606445, + "learning_rate": 3.160500747384156e-06, + "loss": 2.7024, + "step": 80565 + }, + { + "epoch": 5.474249218643838, + "grad_norm": 6.276325225830078, + "learning_rate": 3.1600760972958284e-06, + "loss": 2.8835, + "step": 80570 + }, + { + "epoch": 5.474588938714499, + "grad_norm": 7.946207046508789, + "learning_rate": 3.1596514472075012e-06, + "loss": 2.9105, + "step": 80575 + }, + { + "epoch": 5.474928658785161, + "grad_norm": 6.8621368408203125, + "learning_rate": 3.1592267971191744e-06, + "loss": 2.9698, + "step": 80580 + }, + { + "epoch": 5.475268378855823, + "grad_norm": 8.437963485717773, + "learning_rate": 3.158802147030847e-06, + "loss": 2.9349, + "step": 80585 + }, + { + "epoch": 5.475608098926484, + "grad_norm": 7.074895858764648, + "learning_rate": 3.1583774969425196e-06, + "loss": 2.8287, + "step": 80590 + }, + { + "epoch": 5.475947818997146, + "grad_norm": 11.298371315002441, + "learning_rate": 3.1579528468541924e-06, + "loss": 3.013, + "step": 80595 + }, + { + "epoch": 5.476287539067808, + "grad_norm": 6.717634201049805, + "learning_rate": 3.157528196765865e-06, + "loss": 2.8666, + "step": 80600 + }, + { + "epoch": 5.4766272591384695, + "grad_norm": 7.813323974609375, + "learning_rate": 3.157103546677538e-06, + "loss": 2.6955, + "step": 80605 + }, + { + "epoch": 5.476966979209132, + "grad_norm": 6.266973495483398, + "learning_rate": 3.156678896589211e-06, + "loss": 2.729, + "step": 80610 + }, + { + "epoch": 5.477306699279794, + "grad_norm": 8.496439933776855, + "learning_rate": 3.156254246500883e-06, + "loss": 2.7848, + "step": 80615 + }, + { + "epoch": 5.477646419350455, + "grad_norm": 6.745867729187012, + "learning_rate": 3.1558295964125564e-06, + "loss": 3.1645, + "step": 80620 + }, + { + "epoch": 5.477986139421117, + "grad_norm": 7.764371871948242, + "learning_rate": 3.1554049463242292e-06, + "loss": 3.0058, + "step": 80625 + }, + { + "epoch": 5.478325859491779, + "grad_norm": 7.825116157531738, + "learning_rate": 3.1549802962359016e-06, + "loss": 2.731, + "step": 80630 + }, + { + "epoch": 5.47866557956244, + "grad_norm": 7.24560022354126, + "learning_rate": 3.1545556461475744e-06, + "loss": 2.7472, + "step": 80635 + }, + { + "epoch": 5.479005299633102, + "grad_norm": 7.099850654602051, + "learning_rate": 3.1541309960592476e-06, + "loss": 2.9561, + "step": 80640 + }, + { + "epoch": 5.479345019703764, + "grad_norm": 8.087679862976074, + "learning_rate": 3.15370634597092e-06, + "loss": 2.6974, + "step": 80645 + }, + { + "epoch": 5.4796847397744255, + "grad_norm": 6.088825702667236, + "learning_rate": 3.153281695882593e-06, + "loss": 2.8557, + "step": 80650 + }, + { + "epoch": 5.480024459845088, + "grad_norm": 8.876537322998047, + "learning_rate": 3.152857045794266e-06, + "loss": 2.8425, + "step": 80655 + }, + { + "epoch": 5.48036417991575, + "grad_norm": 7.540768623352051, + "learning_rate": 3.152432395705939e-06, + "loss": 2.6809, + "step": 80660 + }, + { + "epoch": 5.480703899986411, + "grad_norm": 9.672914505004883, + "learning_rate": 3.152007745617611e-06, + "loss": 2.7619, + "step": 80665 + }, + { + "epoch": 5.481043620057073, + "grad_norm": 5.7933149337768555, + "learning_rate": 3.151583095529284e-06, + "loss": 2.8382, + "step": 80670 + }, + { + "epoch": 5.481383340127735, + "grad_norm": 7.5170488357543945, + "learning_rate": 3.1511584454409572e-06, + "loss": 2.9073, + "step": 80675 + }, + { + "epoch": 5.481723060198396, + "grad_norm": 7.5509114265441895, + "learning_rate": 3.1507337953526296e-06, + "loss": 2.8331, + "step": 80680 + }, + { + "epoch": 5.482062780269058, + "grad_norm": 8.76768970489502, + "learning_rate": 3.1503091452643024e-06, + "loss": 2.9038, + "step": 80685 + }, + { + "epoch": 5.48240250033972, + "grad_norm": 6.903867721557617, + "learning_rate": 3.1498844951759756e-06, + "loss": 2.6535, + "step": 80690 + }, + { + "epoch": 5.4827422204103815, + "grad_norm": 8.894484519958496, + "learning_rate": 3.149459845087648e-06, + "loss": 3.0, + "step": 80695 + }, + { + "epoch": 5.483081940481044, + "grad_norm": 5.466567516326904, + "learning_rate": 3.149035194999321e-06, + "loss": 2.7863, + "step": 80700 + }, + { + "epoch": 5.483421660551706, + "grad_norm": 5.976149082183838, + "learning_rate": 3.148610544910994e-06, + "loss": 2.6198, + "step": 80705 + }, + { + "epoch": 5.483761380622367, + "grad_norm": 5.7627692222595215, + "learning_rate": 3.1481858948226664e-06, + "loss": 2.9483, + "step": 80710 + }, + { + "epoch": 5.484101100693029, + "grad_norm": 7.051069736480713, + "learning_rate": 3.1477612447343392e-06, + "loss": 2.7409, + "step": 80715 + }, + { + "epoch": 5.484440820763691, + "grad_norm": 7.628771781921387, + "learning_rate": 3.147336594646012e-06, + "loss": 2.873, + "step": 80720 + }, + { + "epoch": 5.484780540834352, + "grad_norm": 6.961503982543945, + "learning_rate": 3.1469119445576844e-06, + "loss": 2.667, + "step": 80725 + }, + { + "epoch": 5.485120260905014, + "grad_norm": 6.172229766845703, + "learning_rate": 3.1464872944693576e-06, + "loss": 3.1168, + "step": 80730 + }, + { + "epoch": 5.485459980975676, + "grad_norm": 8.137547492980957, + "learning_rate": 3.1460626443810304e-06, + "loss": 2.9361, + "step": 80735 + }, + { + "epoch": 5.4857997010463375, + "grad_norm": 6.9813761711120605, + "learning_rate": 3.145637994292703e-06, + "loss": 2.691, + "step": 80740 + }, + { + "epoch": 5.486139421117, + "grad_norm": 7.079623699188232, + "learning_rate": 3.145213344204376e-06, + "loss": 2.5571, + "step": 80745 + }, + { + "epoch": 5.486479141187662, + "grad_norm": 7.503795623779297, + "learning_rate": 3.144788694116049e-06, + "loss": 2.8302, + "step": 80750 + }, + { + "epoch": 5.486818861258323, + "grad_norm": 7.245449542999268, + "learning_rate": 3.144364044027721e-06, + "loss": 2.7886, + "step": 80755 + }, + { + "epoch": 5.487158581328985, + "grad_norm": 5.970884323120117, + "learning_rate": 3.143939393939394e-06, + "loss": 2.6651, + "step": 80760 + }, + { + "epoch": 5.487498301399647, + "grad_norm": 9.047517776489258, + "learning_rate": 3.1435147438510672e-06, + "loss": 2.5871, + "step": 80765 + }, + { + "epoch": 5.487838021470308, + "grad_norm": 7.197771072387695, + "learning_rate": 3.1430900937627396e-06, + "loss": 3.0489, + "step": 80770 + }, + { + "epoch": 5.48817774154097, + "grad_norm": 6.674461841583252, + "learning_rate": 3.1426654436744124e-06, + "loss": 2.5179, + "step": 80775 + }, + { + "epoch": 5.488517461611632, + "grad_norm": 9.060393333435059, + "learning_rate": 3.1422407935860856e-06, + "loss": 2.7305, + "step": 80780 + }, + { + "epoch": 5.4888571816822935, + "grad_norm": 8.077242851257324, + "learning_rate": 3.141816143497758e-06, + "loss": 2.7184, + "step": 80785 + }, + { + "epoch": 5.489196901752956, + "grad_norm": 6.671316146850586, + "learning_rate": 3.141391493409431e-06, + "loss": 2.7484, + "step": 80790 + }, + { + "epoch": 5.489536621823618, + "grad_norm": 5.630682945251465, + "learning_rate": 3.1409668433211036e-06, + "loss": 2.8592, + "step": 80795 + }, + { + "epoch": 5.489876341894279, + "grad_norm": 6.447530269622803, + "learning_rate": 3.1405421932327764e-06, + "loss": 2.6463, + "step": 80800 + }, + { + "epoch": 5.490216061964941, + "grad_norm": 7.1260480880737305, + "learning_rate": 3.140117543144449e-06, + "loss": 2.8888, + "step": 80805 + }, + { + "epoch": 5.490555782035603, + "grad_norm": 10.496623039245605, + "learning_rate": 3.139692893056122e-06, + "loss": 2.98, + "step": 80810 + }, + { + "epoch": 5.490895502106264, + "grad_norm": 7.969837188720703, + "learning_rate": 3.1392682429677944e-06, + "loss": 2.8502, + "step": 80815 + }, + { + "epoch": 5.491235222176926, + "grad_norm": 8.841824531555176, + "learning_rate": 3.1388435928794676e-06, + "loss": 2.6543, + "step": 80820 + }, + { + "epoch": 5.491574942247588, + "grad_norm": 8.311161041259766, + "learning_rate": 3.1384189427911404e-06, + "loss": 2.6616, + "step": 80825 + }, + { + "epoch": 5.4919146623182495, + "grad_norm": 9.3076753616333, + "learning_rate": 3.137994292702813e-06, + "loss": 2.7867, + "step": 80830 + }, + { + "epoch": 5.492254382388912, + "grad_norm": 6.6128153800964355, + "learning_rate": 3.137569642614486e-06, + "loss": 2.724, + "step": 80835 + }, + { + "epoch": 5.492594102459574, + "grad_norm": 7.503707408905029, + "learning_rate": 3.137144992526159e-06, + "loss": 2.9559, + "step": 80840 + }, + { + "epoch": 5.492933822530235, + "grad_norm": 9.085430145263672, + "learning_rate": 3.1367203424378316e-06, + "loss": 2.6469, + "step": 80845 + }, + { + "epoch": 5.493273542600897, + "grad_norm": 6.580571174621582, + "learning_rate": 3.136295692349504e-06, + "loss": 2.6222, + "step": 80850 + }, + { + "epoch": 5.493613262671559, + "grad_norm": 7.05752420425415, + "learning_rate": 3.1358710422611772e-06, + "loss": 2.6377, + "step": 80855 + }, + { + "epoch": 5.49395298274222, + "grad_norm": 6.193661689758301, + "learning_rate": 3.13544639217285e-06, + "loss": 3.0028, + "step": 80860 + }, + { + "epoch": 5.494292702812882, + "grad_norm": 7.2927751541137695, + "learning_rate": 3.1350217420845224e-06, + "loss": 2.7459, + "step": 80865 + }, + { + "epoch": 5.494632422883544, + "grad_norm": 9.170475959777832, + "learning_rate": 3.1345970919961956e-06, + "loss": 3.0014, + "step": 80870 + }, + { + "epoch": 5.4949721429542056, + "grad_norm": 8.36008358001709, + "learning_rate": 3.1341724419078684e-06, + "loss": 2.9933, + "step": 80875 + }, + { + "epoch": 5.495311863024868, + "grad_norm": 9.301528930664062, + "learning_rate": 3.133747791819541e-06, + "loss": 2.8581, + "step": 80880 + }, + { + "epoch": 5.49565158309553, + "grad_norm": 7.708411693572998, + "learning_rate": 3.1333231417312136e-06, + "loss": 2.6297, + "step": 80885 + }, + { + "epoch": 5.495991303166191, + "grad_norm": 7.4404120445251465, + "learning_rate": 3.132898491642887e-06, + "loss": 2.8286, + "step": 80890 + }, + { + "epoch": 5.496331023236853, + "grad_norm": 7.49722146987915, + "learning_rate": 3.132473841554559e-06, + "loss": 2.6847, + "step": 80895 + }, + { + "epoch": 5.496670743307515, + "grad_norm": 6.910409450531006, + "learning_rate": 3.132049191466232e-06, + "loss": 2.5571, + "step": 80900 + }, + { + "epoch": 5.497010463378176, + "grad_norm": 8.851664543151855, + "learning_rate": 3.1316245413779052e-06, + "loss": 3.0209, + "step": 80905 + }, + { + "epoch": 5.497350183448838, + "grad_norm": 8.895845413208008, + "learning_rate": 3.1311998912895776e-06, + "loss": 2.6187, + "step": 80910 + }, + { + "epoch": 5.4976899035195, + "grad_norm": 8.318026542663574, + "learning_rate": 3.1307752412012504e-06, + "loss": 3.0792, + "step": 80915 + }, + { + "epoch": 5.498029623590162, + "grad_norm": 6.67105770111084, + "learning_rate": 3.130350591112923e-06, + "loss": 2.8301, + "step": 80920 + }, + { + "epoch": 5.498369343660824, + "grad_norm": 9.016663551330566, + "learning_rate": 3.129925941024596e-06, + "loss": 2.7769, + "step": 80925 + }, + { + "epoch": 5.498709063731486, + "grad_norm": 6.89541482925415, + "learning_rate": 3.129501290936269e-06, + "loss": 2.8497, + "step": 80930 + }, + { + "epoch": 5.499048783802147, + "grad_norm": 8.08893871307373, + "learning_rate": 3.1290766408479416e-06, + "loss": 2.829, + "step": 80935 + }, + { + "epoch": 5.499388503872809, + "grad_norm": 6.246453762054443, + "learning_rate": 3.128651990759614e-06, + "loss": 2.8068, + "step": 80940 + }, + { + "epoch": 5.499728223943471, + "grad_norm": 6.791709899902344, + "learning_rate": 3.128227340671287e-06, + "loss": 2.8312, + "step": 80945 + }, + { + "epoch": 5.500067944014132, + "grad_norm": 7.407154083251953, + "learning_rate": 3.12780269058296e-06, + "loss": 2.7314, + "step": 80950 + }, + { + "epoch": 5.500407664084794, + "grad_norm": 8.592952728271484, + "learning_rate": 3.1273780404946324e-06, + "loss": 2.8702, + "step": 80955 + }, + { + "epoch": 5.500747384155456, + "grad_norm": 8.678059577941895, + "learning_rate": 3.1269533904063056e-06, + "loss": 2.8044, + "step": 80960 + }, + { + "epoch": 5.501087104226118, + "grad_norm": 6.529604434967041, + "learning_rate": 3.1265287403179784e-06, + "loss": 2.525, + "step": 80965 + }, + { + "epoch": 5.50142682429678, + "grad_norm": 7.0043768882751465, + "learning_rate": 3.1261040902296508e-06, + "loss": 2.9164, + "step": 80970 + }, + { + "epoch": 5.501766544367442, + "grad_norm": 7.0364460945129395, + "learning_rate": 3.1256794401413236e-06, + "loss": 2.6328, + "step": 80975 + }, + { + "epoch": 5.502106264438103, + "grad_norm": 7.24789571762085, + "learning_rate": 3.125254790052997e-06, + "loss": 2.9619, + "step": 80980 + }, + { + "epoch": 5.502445984508765, + "grad_norm": 7.671616077423096, + "learning_rate": 3.124830139964669e-06, + "loss": 2.7952, + "step": 80985 + }, + { + "epoch": 5.502785704579426, + "grad_norm": 8.913887023925781, + "learning_rate": 3.124405489876342e-06, + "loss": 2.6071, + "step": 80990 + }, + { + "epoch": 5.503125424650088, + "grad_norm": 7.861739158630371, + "learning_rate": 3.123980839788015e-06, + "loss": 2.5688, + "step": 80995 + }, + { + "epoch": 5.50346514472075, + "grad_norm": 6.844958782196045, + "learning_rate": 3.1235561896996876e-06, + "loss": 2.8773, + "step": 81000 + }, + { + "epoch": 5.5038048647914115, + "grad_norm": 6.892159938812256, + "learning_rate": 3.1231315396113604e-06, + "loss": 2.8919, + "step": 81005 + }, + { + "epoch": 5.504144584862074, + "grad_norm": 6.959752082824707, + "learning_rate": 3.122706889523033e-06, + "loss": 2.6828, + "step": 81010 + }, + { + "epoch": 5.504484304932736, + "grad_norm": 9.502127647399902, + "learning_rate": 3.1222822394347064e-06, + "loss": 2.766, + "step": 81015 + }, + { + "epoch": 5.504824025003397, + "grad_norm": 7.07962703704834, + "learning_rate": 3.121857589346379e-06, + "loss": 2.808, + "step": 81020 + }, + { + "epoch": 5.505163745074059, + "grad_norm": 6.722910404205322, + "learning_rate": 3.1214329392580516e-06, + "loss": 2.7076, + "step": 81025 + }, + { + "epoch": 5.505503465144721, + "grad_norm": 6.03963565826416, + "learning_rate": 3.121008289169725e-06, + "loss": 2.7077, + "step": 81030 + }, + { + "epoch": 5.505843185215382, + "grad_norm": 6.949385643005371, + "learning_rate": 3.120583639081397e-06, + "loss": 2.8416, + "step": 81035 + }, + { + "epoch": 5.506182905286044, + "grad_norm": 7.236776351928711, + "learning_rate": 3.12015898899307e-06, + "loss": 2.94, + "step": 81040 + }, + { + "epoch": 5.506522625356706, + "grad_norm": 8.731423377990723, + "learning_rate": 3.119734338904743e-06, + "loss": 2.8885, + "step": 81045 + }, + { + "epoch": 5.5068623454273675, + "grad_norm": 6.74139404296875, + "learning_rate": 3.119309688816415e-06, + "loss": 2.8621, + "step": 81050 + }, + { + "epoch": 5.50720206549803, + "grad_norm": 8.492399215698242, + "learning_rate": 3.1188850387280884e-06, + "loss": 2.8133, + "step": 81055 + }, + { + "epoch": 5.507541785568692, + "grad_norm": 8.633499145507812, + "learning_rate": 3.118460388639761e-06, + "loss": 2.6886, + "step": 81060 + }, + { + "epoch": 5.507881505639353, + "grad_norm": 6.403256893157959, + "learning_rate": 3.1180357385514336e-06, + "loss": 2.8705, + "step": 81065 + }, + { + "epoch": 5.508221225710015, + "grad_norm": 6.564901351928711, + "learning_rate": 3.117611088463107e-06, + "loss": 2.8032, + "step": 81070 + }, + { + "epoch": 5.508560945780677, + "grad_norm": 7.712397575378418, + "learning_rate": 3.1171864383747796e-06, + "loss": 2.854, + "step": 81075 + }, + { + "epoch": 5.508900665851338, + "grad_norm": 7.966338634490967, + "learning_rate": 3.116761788286452e-06, + "loss": 2.704, + "step": 81080 + }, + { + "epoch": 5.509240385922, + "grad_norm": 8.63495922088623, + "learning_rate": 3.116337138198125e-06, + "loss": 2.8394, + "step": 81085 + }, + { + "epoch": 5.509580105992662, + "grad_norm": 7.423491477966309, + "learning_rate": 3.115912488109798e-06, + "loss": 2.8168, + "step": 81090 + }, + { + "epoch": 5.5099198260633235, + "grad_norm": 8.806135177612305, + "learning_rate": 3.1154878380214704e-06, + "loss": 2.8582, + "step": 81095 + }, + { + "epoch": 5.510259546133986, + "grad_norm": 8.275737762451172, + "learning_rate": 3.115063187933143e-06, + "loss": 2.89, + "step": 81100 + }, + { + "epoch": 5.510599266204648, + "grad_norm": 9.002470970153809, + "learning_rate": 3.1146385378448164e-06, + "loss": 2.9497, + "step": 81105 + }, + { + "epoch": 5.510938986275309, + "grad_norm": 8.217832565307617, + "learning_rate": 3.1142138877564888e-06, + "loss": 2.9706, + "step": 81110 + }, + { + "epoch": 5.511278706345971, + "grad_norm": 6.583602428436279, + "learning_rate": 3.1137892376681616e-06, + "loss": 2.6904, + "step": 81115 + }, + { + "epoch": 5.511618426416633, + "grad_norm": 6.141956329345703, + "learning_rate": 3.113364587579835e-06, + "loss": 2.6445, + "step": 81120 + }, + { + "epoch": 5.511958146487294, + "grad_norm": 7.754620552062988, + "learning_rate": 3.112939937491507e-06, + "loss": 2.6662, + "step": 81125 + }, + { + "epoch": 5.512297866557956, + "grad_norm": 9.424832344055176, + "learning_rate": 3.11251528740318e-06, + "loss": 2.9412, + "step": 81130 + }, + { + "epoch": 5.512637586628618, + "grad_norm": 6.334692001342773, + "learning_rate": 3.1120906373148528e-06, + "loss": 2.7883, + "step": 81135 + }, + { + "epoch": 5.5129773066992795, + "grad_norm": 7.597446441650391, + "learning_rate": 3.111665987226525e-06, + "loss": 2.6829, + "step": 81140 + }, + { + "epoch": 5.513317026769942, + "grad_norm": 10.38585090637207, + "learning_rate": 3.1112413371381984e-06, + "loss": 2.9882, + "step": 81145 + }, + { + "epoch": 5.513656746840604, + "grad_norm": 7.774537086486816, + "learning_rate": 3.110816687049871e-06, + "loss": 2.8266, + "step": 81150 + }, + { + "epoch": 5.513996466911265, + "grad_norm": 6.277904510498047, + "learning_rate": 3.1103920369615436e-06, + "loss": 2.7771, + "step": 81155 + }, + { + "epoch": 5.514336186981927, + "grad_norm": 7.659296989440918, + "learning_rate": 3.109967386873217e-06, + "loss": 3.1763, + "step": 81160 + }, + { + "epoch": 5.514675907052589, + "grad_norm": 8.352374076843262, + "learning_rate": 3.1095427367848896e-06, + "loss": 2.9595, + "step": 81165 + }, + { + "epoch": 5.51501562712325, + "grad_norm": 9.00911808013916, + "learning_rate": 3.109118086696562e-06, + "loss": 2.9437, + "step": 81170 + }, + { + "epoch": 5.515355347193912, + "grad_norm": 7.49501895904541, + "learning_rate": 3.1086934366082348e-06, + "loss": 2.7044, + "step": 81175 + }, + { + "epoch": 5.515695067264574, + "grad_norm": 7.813156604766846, + "learning_rate": 3.108268786519908e-06, + "loss": 2.8859, + "step": 81180 + }, + { + "epoch": 5.516034787335236, + "grad_norm": 8.781254768371582, + "learning_rate": 3.107844136431581e-06, + "loss": 2.8909, + "step": 81185 + }, + { + "epoch": 5.516374507405898, + "grad_norm": 6.821071624755859, + "learning_rate": 3.107419486343253e-06, + "loss": 2.9073, + "step": 81190 + }, + { + "epoch": 5.516714227476559, + "grad_norm": 7.59909725189209, + "learning_rate": 3.1069948362549264e-06, + "loss": 2.676, + "step": 81195 + }, + { + "epoch": 5.517053947547221, + "grad_norm": 6.9547505378723145, + "learning_rate": 3.106570186166599e-06, + "loss": 2.8994, + "step": 81200 + }, + { + "epoch": 5.517393667617883, + "grad_norm": 7.889379024505615, + "learning_rate": 3.1061455360782716e-06, + "loss": 2.7248, + "step": 81205 + }, + { + "epoch": 5.517733387688544, + "grad_norm": 8.379843711853027, + "learning_rate": 3.105720885989945e-06, + "loss": 2.9402, + "step": 81210 + }, + { + "epoch": 5.518073107759206, + "grad_norm": 6.778860092163086, + "learning_rate": 3.1052962359016176e-06, + "loss": 2.8155, + "step": 81215 + }, + { + "epoch": 5.518412827829868, + "grad_norm": 7.203083515167236, + "learning_rate": 3.10487158581329e-06, + "loss": 2.9148, + "step": 81220 + }, + { + "epoch": 5.5187525479005295, + "grad_norm": 6.526036262512207, + "learning_rate": 3.1044469357249628e-06, + "loss": 2.9402, + "step": 81225 + }, + { + "epoch": 5.519092267971192, + "grad_norm": 8.827467918395996, + "learning_rate": 3.104022285636636e-06, + "loss": 2.8184, + "step": 81230 + }, + { + "epoch": 5.519431988041854, + "grad_norm": 6.952475070953369, + "learning_rate": 3.1035976355483084e-06, + "loss": 3.0267, + "step": 81235 + }, + { + "epoch": 5.519771708112515, + "grad_norm": 6.811104774475098, + "learning_rate": 3.103172985459981e-06, + "loss": 3.0425, + "step": 81240 + }, + { + "epoch": 5.520111428183177, + "grad_norm": 6.57170295715332, + "learning_rate": 3.1027483353716544e-06, + "loss": 3.0141, + "step": 81245 + }, + { + "epoch": 5.520451148253839, + "grad_norm": 8.115510940551758, + "learning_rate": 3.1023236852833268e-06, + "loss": 3.0669, + "step": 81250 + }, + { + "epoch": 5.5207908683245, + "grad_norm": 8.728240013122559, + "learning_rate": 3.1018990351949996e-06, + "loss": 2.769, + "step": 81255 + }, + { + "epoch": 5.521130588395162, + "grad_norm": 8.202220916748047, + "learning_rate": 3.1014743851066724e-06, + "loss": 3.0558, + "step": 81260 + }, + { + "epoch": 5.521470308465824, + "grad_norm": 8.591655731201172, + "learning_rate": 3.1010497350183448e-06, + "loss": 2.772, + "step": 81265 + }, + { + "epoch": 5.5218100285364855, + "grad_norm": 8.904900550842285, + "learning_rate": 3.100625084930018e-06, + "loss": 2.7596, + "step": 81270 + }, + { + "epoch": 5.522149748607148, + "grad_norm": 9.60312557220459, + "learning_rate": 3.1002004348416908e-06, + "loss": 2.7406, + "step": 81275 + }, + { + "epoch": 5.52248946867781, + "grad_norm": 7.784924507141113, + "learning_rate": 3.099775784753363e-06, + "loss": 2.9313, + "step": 81280 + }, + { + "epoch": 5.522829188748471, + "grad_norm": 7.196827411651611, + "learning_rate": 3.0993511346650364e-06, + "loss": 2.9272, + "step": 81285 + }, + { + "epoch": 5.523168908819133, + "grad_norm": 8.726551055908203, + "learning_rate": 3.098926484576709e-06, + "loss": 2.9492, + "step": 81290 + }, + { + "epoch": 5.523508628889795, + "grad_norm": 8.599771499633789, + "learning_rate": 3.0985018344883816e-06, + "loss": 2.6452, + "step": 81295 + }, + { + "epoch": 5.523848348960456, + "grad_norm": 6.741677284240723, + "learning_rate": 3.0980771844000544e-06, + "loss": 2.7191, + "step": 81300 + }, + { + "epoch": 5.524188069031118, + "grad_norm": 6.266202926635742, + "learning_rate": 3.0976525343117276e-06, + "loss": 2.8331, + "step": 81305 + }, + { + "epoch": 5.52452778910178, + "grad_norm": 9.710051536560059, + "learning_rate": 3.0972278842234e-06, + "loss": 2.947, + "step": 81310 + }, + { + "epoch": 5.5248675091724415, + "grad_norm": 7.683992385864258, + "learning_rate": 3.0968032341350728e-06, + "loss": 2.7852, + "step": 81315 + }, + { + "epoch": 5.525207229243104, + "grad_norm": 8.929862976074219, + "learning_rate": 3.096378584046746e-06, + "loss": 2.5835, + "step": 81320 + }, + { + "epoch": 5.525546949313766, + "grad_norm": 7.813653945922852, + "learning_rate": 3.0959539339584184e-06, + "loss": 2.8192, + "step": 81325 + }, + { + "epoch": 5.525886669384427, + "grad_norm": 6.503312110900879, + "learning_rate": 3.095529283870091e-06, + "loss": 2.7502, + "step": 81330 + }, + { + "epoch": 5.526226389455089, + "grad_norm": 7.289678573608398, + "learning_rate": 3.095104633781764e-06, + "loss": 3.0555, + "step": 81335 + }, + { + "epoch": 5.526566109525751, + "grad_norm": 6.180988788604736, + "learning_rate": 3.0946799836934368e-06, + "loss": 2.8772, + "step": 81340 + }, + { + "epoch": 5.526905829596412, + "grad_norm": 9.453923225402832, + "learning_rate": 3.0942553336051096e-06, + "loss": 2.7968, + "step": 81345 + }, + { + "epoch": 5.527245549667074, + "grad_norm": 6.893520832061768, + "learning_rate": 3.0938306835167824e-06, + "loss": 2.8818, + "step": 81350 + }, + { + "epoch": 5.527585269737736, + "grad_norm": 6.920835494995117, + "learning_rate": 3.0934060334284556e-06, + "loss": 2.9005, + "step": 81355 + }, + { + "epoch": 5.5279249898083975, + "grad_norm": 7.007225513458252, + "learning_rate": 3.092981383340128e-06, + "loss": 2.5263, + "step": 81360 + }, + { + "epoch": 5.52826470987906, + "grad_norm": 7.906284809112549, + "learning_rate": 3.0925567332518008e-06, + "loss": 2.9071, + "step": 81365 + }, + { + "epoch": 5.528604429949722, + "grad_norm": 8.482473373413086, + "learning_rate": 3.092132083163474e-06, + "loss": 2.6306, + "step": 81370 + }, + { + "epoch": 5.528944150020383, + "grad_norm": 7.199779510498047, + "learning_rate": 3.0917074330751464e-06, + "loss": 2.7464, + "step": 81375 + }, + { + "epoch": 5.529283870091045, + "grad_norm": 7.874231338500977, + "learning_rate": 3.091282782986819e-06, + "loss": 2.8168, + "step": 81380 + }, + { + "epoch": 5.529623590161707, + "grad_norm": 7.8435750007629395, + "learning_rate": 3.090858132898492e-06, + "loss": 2.9154, + "step": 81385 + }, + { + "epoch": 5.529963310232368, + "grad_norm": 6.9464335441589355, + "learning_rate": 3.0904334828101644e-06, + "loss": 2.8813, + "step": 81390 + }, + { + "epoch": 5.53030303030303, + "grad_norm": 7.680911540985107, + "learning_rate": 3.0900088327218376e-06, + "loss": 2.6258, + "step": 81395 + }, + { + "epoch": 5.530642750373692, + "grad_norm": 8.038789749145508, + "learning_rate": 3.0895841826335104e-06, + "loss": 2.7559, + "step": 81400 + }, + { + "epoch": 5.5309824704443535, + "grad_norm": 6.234437465667725, + "learning_rate": 3.0891595325451828e-06, + "loss": 2.8636, + "step": 81405 + }, + { + "epoch": 5.531322190515016, + "grad_norm": 9.55551528930664, + "learning_rate": 3.088734882456856e-06, + "loss": 2.9386, + "step": 81410 + }, + { + "epoch": 5.531661910585678, + "grad_norm": 7.605619430541992, + "learning_rate": 3.0883102323685288e-06, + "loss": 2.9553, + "step": 81415 + }, + { + "epoch": 5.532001630656339, + "grad_norm": 6.458227634429932, + "learning_rate": 3.087885582280201e-06, + "loss": 2.8518, + "step": 81420 + }, + { + "epoch": 5.532341350727001, + "grad_norm": 7.56896448135376, + "learning_rate": 3.087460932191874e-06, + "loss": 2.7166, + "step": 81425 + }, + { + "epoch": 5.532681070797663, + "grad_norm": 6.358946800231934, + "learning_rate": 3.087036282103547e-06, + "loss": 2.7404, + "step": 81430 + }, + { + "epoch": 5.533020790868324, + "grad_norm": 6.71909761428833, + "learning_rate": 3.0866116320152196e-06, + "loss": 2.889, + "step": 81435 + }, + { + "epoch": 5.533360510938986, + "grad_norm": 7.143061637878418, + "learning_rate": 3.0861869819268924e-06, + "loss": 2.8429, + "step": 81440 + }, + { + "epoch": 5.533700231009648, + "grad_norm": 8.905333518981934, + "learning_rate": 3.0857623318385656e-06, + "loss": 2.9654, + "step": 81445 + }, + { + "epoch": 5.5340399510803095, + "grad_norm": 7.8138508796691895, + "learning_rate": 3.085337681750238e-06, + "loss": 2.5234, + "step": 81450 + }, + { + "epoch": 5.534379671150972, + "grad_norm": 7.533329010009766, + "learning_rate": 3.0849130316619108e-06, + "loss": 2.856, + "step": 81455 + }, + { + "epoch": 5.534719391221634, + "grad_norm": 7.406436920166016, + "learning_rate": 3.0844883815735836e-06, + "loss": 2.7516, + "step": 81460 + }, + { + "epoch": 5.535059111292295, + "grad_norm": 9.512773513793945, + "learning_rate": 3.0840637314852564e-06, + "loss": 2.8846, + "step": 81465 + }, + { + "epoch": 5.535398831362957, + "grad_norm": 7.997474670410156, + "learning_rate": 3.083639081396929e-06, + "loss": 2.7411, + "step": 81470 + }, + { + "epoch": 5.535738551433619, + "grad_norm": 6.967758655548096, + "learning_rate": 3.083214431308602e-06, + "loss": 2.9519, + "step": 81475 + }, + { + "epoch": 5.53607827150428, + "grad_norm": 7.913606643676758, + "learning_rate": 3.0827897812202743e-06, + "loss": 2.7655, + "step": 81480 + }, + { + "epoch": 5.536417991574942, + "grad_norm": 8.525800704956055, + "learning_rate": 3.0823651311319476e-06, + "loss": 2.8132, + "step": 81485 + }, + { + "epoch": 5.536757711645604, + "grad_norm": 6.370110511779785, + "learning_rate": 3.0819404810436204e-06, + "loss": 2.68, + "step": 81490 + }, + { + "epoch": 5.537097431716266, + "grad_norm": 7.958478927612305, + "learning_rate": 3.0815158309552927e-06, + "loss": 2.9691, + "step": 81495 + }, + { + "epoch": 5.537437151786928, + "grad_norm": 7.2692742347717285, + "learning_rate": 3.081091180866966e-06, + "loss": 2.7701, + "step": 81500 + }, + { + "epoch": 5.53777687185759, + "grad_norm": 7.418701648712158, + "learning_rate": 3.0806665307786388e-06, + "loss": 3.1991, + "step": 81505 + }, + { + "epoch": 5.538116591928251, + "grad_norm": 8.164874076843262, + "learning_rate": 3.080241880690311e-06, + "loss": 2.6018, + "step": 81510 + }, + { + "epoch": 5.538456311998913, + "grad_norm": 8.623762130737305, + "learning_rate": 3.079817230601984e-06, + "loss": 2.7667, + "step": 81515 + }, + { + "epoch": 5.538796032069575, + "grad_norm": 6.11676549911499, + "learning_rate": 3.079392580513657e-06, + "loss": 2.6295, + "step": 81520 + }, + { + "epoch": 5.539135752140236, + "grad_norm": 6.2475266456604, + "learning_rate": 3.07896793042533e-06, + "loss": 2.9024, + "step": 81525 + }, + { + "epoch": 5.539475472210898, + "grad_norm": 8.173833847045898, + "learning_rate": 3.0785432803370024e-06, + "loss": 2.9776, + "step": 81530 + }, + { + "epoch": 5.53981519228156, + "grad_norm": 7.950927734375, + "learning_rate": 3.0781186302486756e-06, + "loss": 2.9392, + "step": 81535 + }, + { + "epoch": 5.540154912352222, + "grad_norm": 8.728041648864746, + "learning_rate": 3.0776939801603484e-06, + "loss": 2.8789, + "step": 81540 + }, + { + "epoch": 5.540494632422884, + "grad_norm": 5.546003818511963, + "learning_rate": 3.0772693300720208e-06, + "loss": 2.8979, + "step": 81545 + }, + { + "epoch": 5.540834352493546, + "grad_norm": 5.68574333190918, + "learning_rate": 3.0768446799836936e-06, + "loss": 2.8831, + "step": 81550 + }, + { + "epoch": 5.541174072564207, + "grad_norm": 7.666851997375488, + "learning_rate": 3.0764200298953668e-06, + "loss": 2.8675, + "step": 81555 + }, + { + "epoch": 5.541513792634869, + "grad_norm": 6.466148376464844, + "learning_rate": 3.075995379807039e-06, + "loss": 2.9198, + "step": 81560 + }, + { + "epoch": 5.541853512705531, + "grad_norm": 6.977210521697998, + "learning_rate": 3.075570729718712e-06, + "loss": 2.5317, + "step": 81565 + }, + { + "epoch": 5.542193232776192, + "grad_norm": 9.834880828857422, + "learning_rate": 3.075146079630385e-06, + "loss": 2.8776, + "step": 81570 + }, + { + "epoch": 5.542532952846854, + "grad_norm": 8.45362663269043, + "learning_rate": 3.0747214295420576e-06, + "loss": 2.5668, + "step": 81575 + }, + { + "epoch": 5.542872672917516, + "grad_norm": 7.128251552581787, + "learning_rate": 3.0742967794537304e-06, + "loss": 2.9101, + "step": 81580 + }, + { + "epoch": 5.543212392988178, + "grad_norm": 9.15047550201416, + "learning_rate": 3.073872129365403e-06, + "loss": 2.8863, + "step": 81585 + }, + { + "epoch": 5.54355211305884, + "grad_norm": 9.440424919128418, + "learning_rate": 3.073447479277076e-06, + "loss": 2.8256, + "step": 81590 + }, + { + "epoch": 5.543891833129502, + "grad_norm": 7.490690231323242, + "learning_rate": 3.0730228291887488e-06, + "loss": 2.7096, + "step": 81595 + }, + { + "epoch": 5.544231553200163, + "grad_norm": 6.45566987991333, + "learning_rate": 3.0725981791004216e-06, + "loss": 2.7106, + "step": 81600 + }, + { + "epoch": 5.544571273270825, + "grad_norm": 7.26777982711792, + "learning_rate": 3.072173529012094e-06, + "loss": 2.8551, + "step": 81605 + }, + { + "epoch": 5.544910993341487, + "grad_norm": 6.7652177810668945, + "learning_rate": 3.071748878923767e-06, + "loss": 2.8596, + "step": 81610 + }, + { + "epoch": 5.545250713412148, + "grad_norm": 8.043323516845703, + "learning_rate": 3.07132422883544e-06, + "loss": 2.6579, + "step": 81615 + }, + { + "epoch": 5.54559043348281, + "grad_norm": 9.340909004211426, + "learning_rate": 3.0708995787471123e-06, + "loss": 2.6466, + "step": 81620 + }, + { + "epoch": 5.545930153553472, + "grad_norm": 7.249515533447266, + "learning_rate": 3.0704749286587856e-06, + "loss": 2.8051, + "step": 81625 + }, + { + "epoch": 5.546269873624134, + "grad_norm": 7.834413051605225, + "learning_rate": 3.0700502785704584e-06, + "loss": 2.7085, + "step": 81630 + }, + { + "epoch": 5.546609593694796, + "grad_norm": 6.794702053070068, + "learning_rate": 3.0696256284821307e-06, + "loss": 2.7372, + "step": 81635 + }, + { + "epoch": 5.546949313765458, + "grad_norm": 8.565410614013672, + "learning_rate": 3.0692009783938035e-06, + "loss": 2.434, + "step": 81640 + }, + { + "epoch": 5.547289033836119, + "grad_norm": 6.703794002532959, + "learning_rate": 3.0687763283054768e-06, + "loss": 2.6711, + "step": 81645 + }, + { + "epoch": 5.547628753906781, + "grad_norm": 6.125923156738281, + "learning_rate": 3.068351678217149e-06, + "loss": 3.0316, + "step": 81650 + }, + { + "epoch": 5.547968473977443, + "grad_norm": 8.399734497070312, + "learning_rate": 3.067927028128822e-06, + "loss": 2.5068, + "step": 81655 + }, + { + "epoch": 5.548308194048104, + "grad_norm": 6.070581912994385, + "learning_rate": 3.067502378040495e-06, + "loss": 2.8756, + "step": 81660 + }, + { + "epoch": 5.548647914118766, + "grad_norm": 7.866259574890137, + "learning_rate": 3.0670777279521675e-06, + "loss": 2.7254, + "step": 81665 + }, + { + "epoch": 5.5489876341894275, + "grad_norm": 9.73484992980957, + "learning_rate": 3.0666530778638404e-06, + "loss": 2.9589, + "step": 81670 + }, + { + "epoch": 5.54932735426009, + "grad_norm": 6.238121509552002, + "learning_rate": 3.066228427775513e-06, + "loss": 2.5957, + "step": 81675 + }, + { + "epoch": 5.549667074330752, + "grad_norm": 9.924651145935059, + "learning_rate": 3.0658037776871855e-06, + "loss": 2.8597, + "step": 81680 + }, + { + "epoch": 5.550006794401413, + "grad_norm": 6.458099365234375, + "learning_rate": 3.0653791275988588e-06, + "loss": 2.7469, + "step": 81685 + }, + { + "epoch": 5.550346514472075, + "grad_norm": 6.56572961807251, + "learning_rate": 3.0649544775105316e-06, + "loss": 2.8063, + "step": 81690 + }, + { + "epoch": 5.550686234542737, + "grad_norm": 8.860949516296387, + "learning_rate": 3.0645298274222048e-06, + "loss": 2.7392, + "step": 81695 + }, + { + "epoch": 5.551025954613398, + "grad_norm": 8.03487777709961, + "learning_rate": 3.064105177333877e-06, + "loss": 2.8849, + "step": 81700 + }, + { + "epoch": 5.55136567468406, + "grad_norm": 7.695075988769531, + "learning_rate": 3.06368052724555e-06, + "loss": 2.8636, + "step": 81705 + }, + { + "epoch": 5.551705394754722, + "grad_norm": 8.288150787353516, + "learning_rate": 3.0632558771572228e-06, + "loss": 2.5806, + "step": 81710 + }, + { + "epoch": 5.5520451148253835, + "grad_norm": 7.510121822357178, + "learning_rate": 3.0628312270688956e-06, + "loss": 2.9016, + "step": 81715 + }, + { + "epoch": 5.552384834896046, + "grad_norm": 8.68862533569336, + "learning_rate": 3.0624065769805684e-06, + "loss": 2.8747, + "step": 81720 + }, + { + "epoch": 5.552724554966708, + "grad_norm": 7.546193599700928, + "learning_rate": 3.061981926892241e-06, + "loss": 3.0137, + "step": 81725 + }, + { + "epoch": 5.553064275037369, + "grad_norm": 8.830429077148438, + "learning_rate": 3.0615572768039135e-06, + "loss": 2.7439, + "step": 81730 + }, + { + "epoch": 5.553403995108031, + "grad_norm": 9.07779312133789, + "learning_rate": 3.0611326267155868e-06, + "loss": 2.8339, + "step": 81735 + }, + { + "epoch": 5.553743715178693, + "grad_norm": 7.560153961181641, + "learning_rate": 3.0607079766272596e-06, + "loss": 2.981, + "step": 81740 + }, + { + "epoch": 5.554083435249354, + "grad_norm": 7.138617515563965, + "learning_rate": 3.060283326538932e-06, + "loss": 2.9052, + "step": 81745 + }, + { + "epoch": 5.554423155320016, + "grad_norm": 9.923242568969727, + "learning_rate": 3.059858676450605e-06, + "loss": 2.6049, + "step": 81750 + }, + { + "epoch": 5.554762875390678, + "grad_norm": 6.220303058624268, + "learning_rate": 3.059434026362278e-06, + "loss": 2.7413, + "step": 81755 + }, + { + "epoch": 5.5551025954613396, + "grad_norm": 7.112962245941162, + "learning_rate": 3.0590093762739503e-06, + "loss": 2.8894, + "step": 81760 + }, + { + "epoch": 5.555442315532002, + "grad_norm": 8.945708274841309, + "learning_rate": 3.058584726185623e-06, + "loss": 2.6511, + "step": 81765 + }, + { + "epoch": 5.555782035602664, + "grad_norm": 7.553746223449707, + "learning_rate": 3.0581600760972964e-06, + "loss": 2.7703, + "step": 81770 + }, + { + "epoch": 5.556121755673325, + "grad_norm": 9.466100692749023, + "learning_rate": 3.0577354260089687e-06, + "loss": 2.7958, + "step": 81775 + }, + { + "epoch": 5.556461475743987, + "grad_norm": 8.814338684082031, + "learning_rate": 3.0573107759206415e-06, + "loss": 2.7955, + "step": 81780 + }, + { + "epoch": 5.556801195814649, + "grad_norm": 6.828869342803955, + "learning_rate": 3.0568861258323148e-06, + "loss": 2.9376, + "step": 81785 + }, + { + "epoch": 5.55714091588531, + "grad_norm": 8.598109245300293, + "learning_rate": 3.056461475743987e-06, + "loss": 2.8268, + "step": 81790 + }, + { + "epoch": 5.557480635955972, + "grad_norm": 6.383266925811768, + "learning_rate": 3.05603682565566e-06, + "loss": 2.7199, + "step": 81795 + }, + { + "epoch": 5.557820356026634, + "grad_norm": 8.882954597473145, + "learning_rate": 3.0556121755673327e-06, + "loss": 2.8548, + "step": 81800 + }, + { + "epoch": 5.558160076097296, + "grad_norm": 7.1775312423706055, + "learning_rate": 3.055187525479005e-06, + "loss": 2.9765, + "step": 81805 + }, + { + "epoch": 5.558499796167958, + "grad_norm": 6.98095178604126, + "learning_rate": 3.0547628753906783e-06, + "loss": 2.7177, + "step": 81810 + }, + { + "epoch": 5.55883951623862, + "grad_norm": 6.987677097320557, + "learning_rate": 3.054338225302351e-06, + "loss": 2.777, + "step": 81815 + }, + { + "epoch": 5.559179236309281, + "grad_norm": 8.017196655273438, + "learning_rate": 3.0539135752140235e-06, + "loss": 2.8705, + "step": 81820 + }, + { + "epoch": 5.559518956379943, + "grad_norm": 7.874476432800293, + "learning_rate": 3.0534889251256968e-06, + "loss": 2.8508, + "step": 81825 + }, + { + "epoch": 5.559858676450605, + "grad_norm": 7.303093910217285, + "learning_rate": 3.0530642750373696e-06, + "loss": 2.904, + "step": 81830 + }, + { + "epoch": 5.560198396521266, + "grad_norm": 7.8887128829956055, + "learning_rate": 3.052639624949042e-06, + "loss": 2.7942, + "step": 81835 + }, + { + "epoch": 5.560538116591928, + "grad_norm": 7.744365692138672, + "learning_rate": 3.0522149748607147e-06, + "loss": 2.9653, + "step": 81840 + }, + { + "epoch": 5.56087783666259, + "grad_norm": 8.577476501464844, + "learning_rate": 3.051790324772388e-06, + "loss": 2.864, + "step": 81845 + }, + { + "epoch": 5.561217556733252, + "grad_norm": 7.455338001251221, + "learning_rate": 3.0513656746840603e-06, + "loss": 2.6549, + "step": 81850 + }, + { + "epoch": 5.561557276803914, + "grad_norm": 8.094817161560059, + "learning_rate": 3.050941024595733e-06, + "loss": 2.9647, + "step": 81855 + }, + { + "epoch": 5.561896996874576, + "grad_norm": 7.69933557510376, + "learning_rate": 3.0505163745074064e-06, + "loss": 2.7492, + "step": 81860 + }, + { + "epoch": 5.562236716945237, + "grad_norm": 7.621384143829346, + "learning_rate": 3.050091724419079e-06, + "loss": 2.4876, + "step": 81865 + }, + { + "epoch": 5.562576437015899, + "grad_norm": 7.578593730926514, + "learning_rate": 3.0496670743307515e-06, + "loss": 2.9965, + "step": 81870 + }, + { + "epoch": 5.56291615708656, + "grad_norm": 6.508697986602783, + "learning_rate": 3.0492424242424248e-06, + "loss": 2.6464, + "step": 81875 + }, + { + "epoch": 5.563255877157222, + "grad_norm": 8.004517555236816, + "learning_rate": 3.0488177741540976e-06, + "loss": 2.5774, + "step": 81880 + }, + { + "epoch": 5.563595597227884, + "grad_norm": 7.906736850738525, + "learning_rate": 3.04839312406577e-06, + "loss": 2.6317, + "step": 81885 + }, + { + "epoch": 5.5639353172985455, + "grad_norm": 7.631043434143066, + "learning_rate": 3.0479684739774427e-06, + "loss": 2.736, + "step": 81890 + }, + { + "epoch": 5.564275037369208, + "grad_norm": 6.880875110626221, + "learning_rate": 3.047543823889116e-06, + "loss": 2.5532, + "step": 81895 + }, + { + "epoch": 5.56461475743987, + "grad_norm": 7.795739650726318, + "learning_rate": 3.0471191738007883e-06, + "loss": 2.6296, + "step": 81900 + }, + { + "epoch": 5.564954477510531, + "grad_norm": 8.427262306213379, + "learning_rate": 3.046694523712461e-06, + "loss": 2.723, + "step": 81905 + }, + { + "epoch": 5.565294197581193, + "grad_norm": 7.29574728012085, + "learning_rate": 3.0462698736241344e-06, + "loss": 2.8579, + "step": 81910 + }, + { + "epoch": 5.565633917651855, + "grad_norm": 7.4493408203125, + "learning_rate": 3.0458452235358067e-06, + "loss": 2.79, + "step": 81915 + }, + { + "epoch": 5.565973637722516, + "grad_norm": 7.943283557891846, + "learning_rate": 3.0454205734474795e-06, + "loss": 2.4443, + "step": 81920 + }, + { + "epoch": 5.566313357793178, + "grad_norm": 8.433207511901855, + "learning_rate": 3.0449959233591523e-06, + "loss": 2.8109, + "step": 81925 + }, + { + "epoch": 5.56665307786384, + "grad_norm": 7.093053340911865, + "learning_rate": 3.0445712732708247e-06, + "loss": 2.6011, + "step": 81930 + }, + { + "epoch": 5.5669927979345015, + "grad_norm": 8.899467468261719, + "learning_rate": 3.044146623182498e-06, + "loss": 2.8676, + "step": 81935 + }, + { + "epoch": 5.567332518005164, + "grad_norm": 5.815932273864746, + "learning_rate": 3.0437219730941707e-06, + "loss": 3.0436, + "step": 81940 + }, + { + "epoch": 5.567672238075826, + "grad_norm": 8.207385063171387, + "learning_rate": 3.043297323005843e-06, + "loss": 2.7943, + "step": 81945 + }, + { + "epoch": 5.568011958146487, + "grad_norm": 7.650859832763672, + "learning_rate": 3.0428726729175163e-06, + "loss": 2.929, + "step": 81950 + }, + { + "epoch": 5.568351678217149, + "grad_norm": 7.840413570404053, + "learning_rate": 3.042448022829189e-06, + "loss": 2.6723, + "step": 81955 + }, + { + "epoch": 5.568691398287811, + "grad_norm": 7.783895015716553, + "learning_rate": 3.0420233727408615e-06, + "loss": 2.8873, + "step": 81960 + }, + { + "epoch": 5.569031118358472, + "grad_norm": 7.333739757537842, + "learning_rate": 3.0415987226525343e-06, + "loss": 2.7556, + "step": 81965 + }, + { + "epoch": 5.569370838429134, + "grad_norm": 6.96833610534668, + "learning_rate": 3.0411740725642075e-06, + "loss": 2.9523, + "step": 81970 + }, + { + "epoch": 5.569710558499796, + "grad_norm": 9.308839797973633, + "learning_rate": 3.04074942247588e-06, + "loss": 2.8, + "step": 81975 + }, + { + "epoch": 5.5700502785704575, + "grad_norm": 7.059290409088135, + "learning_rate": 3.0403247723875527e-06, + "loss": 2.815, + "step": 81980 + }, + { + "epoch": 5.57038999864112, + "grad_norm": 7.616621971130371, + "learning_rate": 3.039900122299226e-06, + "loss": 2.8588, + "step": 81985 + }, + { + "epoch": 5.570729718711782, + "grad_norm": 7.884449481964111, + "learning_rate": 3.0394754722108983e-06, + "loss": 2.6721, + "step": 81990 + }, + { + "epoch": 5.571069438782443, + "grad_norm": 8.326251029968262, + "learning_rate": 3.039050822122571e-06, + "loss": 2.8449, + "step": 81995 + }, + { + "epoch": 5.571409158853105, + "grad_norm": 8.053083419799805, + "learning_rate": 3.0386261720342444e-06, + "loss": 2.7647, + "step": 82000 + }, + { + "epoch": 5.571748878923767, + "grad_norm": 7.514009952545166, + "learning_rate": 3.0382015219459167e-06, + "loss": 2.7233, + "step": 82005 + }, + { + "epoch": 5.572088598994428, + "grad_norm": 7.748331069946289, + "learning_rate": 3.0377768718575895e-06, + "loss": 2.8644, + "step": 82010 + }, + { + "epoch": 5.57242831906509, + "grad_norm": 10.601570129394531, + "learning_rate": 3.0373522217692623e-06, + "loss": 2.8954, + "step": 82015 + }, + { + "epoch": 5.572768039135752, + "grad_norm": 7.628369331359863, + "learning_rate": 3.0369275716809347e-06, + "loss": 2.6795, + "step": 82020 + }, + { + "epoch": 5.5731077592064135, + "grad_norm": 6.487970352172852, + "learning_rate": 3.036502921592608e-06, + "loss": 2.9186, + "step": 82025 + }, + { + "epoch": 5.573447479277076, + "grad_norm": 7.643880367279053, + "learning_rate": 3.0360782715042807e-06, + "loss": 3.0727, + "step": 82030 + }, + { + "epoch": 5.573787199347738, + "grad_norm": 7.653085708618164, + "learning_rate": 3.035653621415954e-06, + "loss": 3.0733, + "step": 82035 + }, + { + "epoch": 5.574126919418399, + "grad_norm": 7.974109649658203, + "learning_rate": 3.0352289713276263e-06, + "loss": 2.9032, + "step": 82040 + }, + { + "epoch": 5.574466639489061, + "grad_norm": 6.843406677246094, + "learning_rate": 3.034804321239299e-06, + "loss": 2.7262, + "step": 82045 + }, + { + "epoch": 5.574806359559723, + "grad_norm": 7.929856777191162, + "learning_rate": 3.034379671150972e-06, + "loss": 2.8313, + "step": 82050 + }, + { + "epoch": 5.575146079630384, + "grad_norm": 6.736059188842773, + "learning_rate": 3.0339550210626443e-06, + "loss": 2.675, + "step": 82055 + }, + { + "epoch": 5.575485799701046, + "grad_norm": 9.131354331970215, + "learning_rate": 3.0335303709743175e-06, + "loss": 2.73, + "step": 82060 + }, + { + "epoch": 5.575825519771708, + "grad_norm": 6.393605709075928, + "learning_rate": 3.0331057208859903e-06, + "loss": 2.6521, + "step": 82065 + }, + { + "epoch": 5.57616523984237, + "grad_norm": 6.124885559082031, + "learning_rate": 3.0326810707976627e-06, + "loss": 2.7985, + "step": 82070 + }, + { + "epoch": 5.576504959913032, + "grad_norm": 7.095017910003662, + "learning_rate": 3.032256420709336e-06, + "loss": 2.8183, + "step": 82075 + }, + { + "epoch": 5.576844679983694, + "grad_norm": 6.416907787322998, + "learning_rate": 3.0318317706210087e-06, + "loss": 2.9264, + "step": 82080 + }, + { + "epoch": 5.577184400054355, + "grad_norm": 9.0038480758667, + "learning_rate": 3.031407120532681e-06, + "loss": 2.981, + "step": 82085 + }, + { + "epoch": 5.577524120125017, + "grad_norm": 6.735428333282471, + "learning_rate": 3.030982470444354e-06, + "loss": 2.9473, + "step": 82090 + }, + { + "epoch": 5.577863840195679, + "grad_norm": 8.288901329040527, + "learning_rate": 3.030557820356027e-06, + "loss": 2.6832, + "step": 82095 + }, + { + "epoch": 5.57820356026634, + "grad_norm": 9.08900260925293, + "learning_rate": 3.0301331702676995e-06, + "loss": 3.0169, + "step": 82100 + }, + { + "epoch": 5.578543280337002, + "grad_norm": 6.844126224517822, + "learning_rate": 3.0297085201793723e-06, + "loss": 2.8708, + "step": 82105 + }, + { + "epoch": 5.578883000407664, + "grad_norm": 7.166069984436035, + "learning_rate": 3.0292838700910455e-06, + "loss": 2.789, + "step": 82110 + }, + { + "epoch": 5.579222720478326, + "grad_norm": 11.285016059875488, + "learning_rate": 3.028859220002718e-06, + "loss": 2.7089, + "step": 82115 + }, + { + "epoch": 5.579562440548988, + "grad_norm": 9.21208667755127, + "learning_rate": 3.0284345699143907e-06, + "loss": 2.854, + "step": 82120 + }, + { + "epoch": 5.57990216061965, + "grad_norm": 7.057098865509033, + "learning_rate": 3.0280099198260635e-06, + "loss": 2.7404, + "step": 82125 + }, + { + "epoch": 5.580241880690311, + "grad_norm": 9.206889152526855, + "learning_rate": 3.0275852697377363e-06, + "loss": 2.7832, + "step": 82130 + }, + { + "epoch": 5.580581600760973, + "grad_norm": 10.78685188293457, + "learning_rate": 3.027160619649409e-06, + "loss": 2.8699, + "step": 82135 + }, + { + "epoch": 5.580921320831635, + "grad_norm": 7.6874213218688965, + "learning_rate": 3.026735969561082e-06, + "loss": 2.8363, + "step": 82140 + }, + { + "epoch": 5.581261040902296, + "grad_norm": 7.179947376251221, + "learning_rate": 3.0263113194727543e-06, + "loss": 3.0488, + "step": 82145 + }, + { + "epoch": 5.581600760972958, + "grad_norm": 8.194497108459473, + "learning_rate": 3.0258866693844275e-06, + "loss": 2.9953, + "step": 82150 + }, + { + "epoch": 5.58194048104362, + "grad_norm": 7.248968601226807, + "learning_rate": 3.0254620192961003e-06, + "loss": 2.8923, + "step": 82155 + }, + { + "epoch": 5.582280201114282, + "grad_norm": 7.752989768981934, + "learning_rate": 3.0250373692077727e-06, + "loss": 3.0433, + "step": 82160 + }, + { + "epoch": 5.582619921184944, + "grad_norm": 6.62257194519043, + "learning_rate": 3.024612719119446e-06, + "loss": 2.7225, + "step": 82165 + }, + { + "epoch": 5.582959641255606, + "grad_norm": 8.243804931640625, + "learning_rate": 3.0241880690311187e-06, + "loss": 2.9019, + "step": 82170 + }, + { + "epoch": 5.583299361326267, + "grad_norm": 7.778248310089111, + "learning_rate": 3.023763418942791e-06, + "loss": 2.9739, + "step": 82175 + }, + { + "epoch": 5.583639081396929, + "grad_norm": 7.2763590812683105, + "learning_rate": 3.023338768854464e-06, + "loss": 2.5436, + "step": 82180 + }, + { + "epoch": 5.583978801467591, + "grad_norm": 7.866823673248291, + "learning_rate": 3.022914118766137e-06, + "loss": 2.635, + "step": 82185 + }, + { + "epoch": 5.584318521538252, + "grad_norm": 9.232349395751953, + "learning_rate": 3.0224894686778095e-06, + "loss": 2.8037, + "step": 82190 + }, + { + "epoch": 5.584658241608914, + "grad_norm": 8.532126426696777, + "learning_rate": 3.0220648185894823e-06, + "loss": 2.6606, + "step": 82195 + }, + { + "epoch": 5.584997961679576, + "grad_norm": 6.026040554046631, + "learning_rate": 3.0216401685011555e-06, + "loss": 2.9997, + "step": 82200 + }, + { + "epoch": 5.585337681750238, + "grad_norm": 6.245650291442871, + "learning_rate": 3.0212155184128283e-06, + "loss": 2.7191, + "step": 82205 + }, + { + "epoch": 5.5856774018209, + "grad_norm": 8.496849060058594, + "learning_rate": 3.0207908683245007e-06, + "loss": 2.5706, + "step": 82210 + }, + { + "epoch": 5.586017121891562, + "grad_norm": 8.193642616271973, + "learning_rate": 3.0203662182361735e-06, + "loss": 2.8865, + "step": 82215 + }, + { + "epoch": 5.586356841962223, + "grad_norm": 7.574296951293945, + "learning_rate": 3.0199415681478467e-06, + "loss": 2.7082, + "step": 82220 + }, + { + "epoch": 5.586696562032885, + "grad_norm": 7.57627010345459, + "learning_rate": 3.019516918059519e-06, + "loss": 2.8025, + "step": 82225 + }, + { + "epoch": 5.587036282103547, + "grad_norm": 8.272833824157715, + "learning_rate": 3.019092267971192e-06, + "loss": 2.7899, + "step": 82230 + }, + { + "epoch": 5.587376002174208, + "grad_norm": 8.764439582824707, + "learning_rate": 3.018667617882865e-06, + "loss": 2.995, + "step": 82235 + }, + { + "epoch": 5.58771572224487, + "grad_norm": 10.459131240844727, + "learning_rate": 3.0182429677945375e-06, + "loss": 2.6232, + "step": 82240 + }, + { + "epoch": 5.588055442315532, + "grad_norm": 6.3893818855285645, + "learning_rate": 3.0178183177062103e-06, + "loss": 2.9772, + "step": 82245 + }, + { + "epoch": 5.588395162386194, + "grad_norm": 7.2002272605896, + "learning_rate": 3.017393667617883e-06, + "loss": 2.764, + "step": 82250 + }, + { + "epoch": 5.588734882456856, + "grad_norm": 6.782023906707764, + "learning_rate": 3.016969017529556e-06, + "loss": 2.8921, + "step": 82255 + }, + { + "epoch": 5.589074602527518, + "grad_norm": 6.547126293182373, + "learning_rate": 3.0165443674412287e-06, + "loss": 2.9829, + "step": 82260 + }, + { + "epoch": 5.589414322598179, + "grad_norm": 6.787909030914307, + "learning_rate": 3.0161197173529015e-06, + "loss": 2.7496, + "step": 82265 + }, + { + "epoch": 5.589754042668841, + "grad_norm": 9.548754692077637, + "learning_rate": 3.015695067264574e-06, + "loss": 2.9808, + "step": 82270 + }, + { + "epoch": 5.590093762739503, + "grad_norm": 8.857441902160645, + "learning_rate": 3.015270417176247e-06, + "loss": 2.7286, + "step": 82275 + }, + { + "epoch": 5.590433482810164, + "grad_norm": 8.512176513671875, + "learning_rate": 3.01484576708792e-06, + "loss": 2.8408, + "step": 82280 + }, + { + "epoch": 5.590773202880826, + "grad_norm": 7.189035892486572, + "learning_rate": 3.0144211169995923e-06, + "loss": 2.5527, + "step": 82285 + }, + { + "epoch": 5.591112922951488, + "grad_norm": 8.72624397277832, + "learning_rate": 3.0139964669112655e-06, + "loss": 2.7807, + "step": 82290 + }, + { + "epoch": 5.59145264302215, + "grad_norm": 9.256014823913574, + "learning_rate": 3.0135718168229383e-06, + "loss": 2.6881, + "step": 82295 + }, + { + "epoch": 5.591792363092812, + "grad_norm": 7.175114631652832, + "learning_rate": 3.0131471667346107e-06, + "loss": 2.8104, + "step": 82300 + }, + { + "epoch": 5.592132083163474, + "grad_norm": 6.512487411499023, + "learning_rate": 3.0127225166462835e-06, + "loss": 2.781, + "step": 82305 + }, + { + "epoch": 5.592471803234135, + "grad_norm": 7.666197776794434, + "learning_rate": 3.0122978665579567e-06, + "loss": 2.8102, + "step": 82310 + }, + { + "epoch": 5.592811523304797, + "grad_norm": 9.939597129821777, + "learning_rate": 3.011873216469629e-06, + "loss": 2.8784, + "step": 82315 + }, + { + "epoch": 5.593151243375459, + "grad_norm": 7.11344575881958, + "learning_rate": 3.011448566381302e-06, + "loss": 2.6818, + "step": 82320 + }, + { + "epoch": 5.59349096344612, + "grad_norm": 8.97618293762207, + "learning_rate": 3.011023916292975e-06, + "loss": 2.7147, + "step": 82325 + }, + { + "epoch": 5.593830683516782, + "grad_norm": 8.308394432067871, + "learning_rate": 3.0105992662046475e-06, + "loss": 2.9725, + "step": 82330 + }, + { + "epoch": 5.594170403587444, + "grad_norm": 8.042670249938965, + "learning_rate": 3.0101746161163203e-06, + "loss": 2.838, + "step": 82335 + }, + { + "epoch": 5.594510123658106, + "grad_norm": 10.472792625427246, + "learning_rate": 3.009749966027993e-06, + "loss": 2.9914, + "step": 82340 + }, + { + "epoch": 5.594849843728768, + "grad_norm": 9.042076110839844, + "learning_rate": 3.009325315939666e-06, + "loss": 2.8681, + "step": 82345 + }, + { + "epoch": 5.595189563799429, + "grad_norm": 7.030008792877197, + "learning_rate": 3.0089006658513387e-06, + "loss": 2.4738, + "step": 82350 + }, + { + "epoch": 5.595529283870091, + "grad_norm": 8.564061164855957, + "learning_rate": 3.0084760157630115e-06, + "loss": 3.2132, + "step": 82355 + }, + { + "epoch": 5.595869003940753, + "grad_norm": 5.555597305297852, + "learning_rate": 3.008051365674684e-06, + "loss": 2.9067, + "step": 82360 + }, + { + "epoch": 5.596208724011414, + "grad_norm": 7.661035537719727, + "learning_rate": 3.007626715586357e-06, + "loss": 2.975, + "step": 82365 + }, + { + "epoch": 5.596548444082076, + "grad_norm": 9.033773422241211, + "learning_rate": 3.00720206549803e-06, + "loss": 2.9698, + "step": 82370 + }, + { + "epoch": 5.596888164152738, + "grad_norm": 7.136502742767334, + "learning_rate": 3.0067774154097027e-06, + "loss": 2.7218, + "step": 82375 + }, + { + "epoch": 5.5972278842234, + "grad_norm": 9.884513854980469, + "learning_rate": 3.0063527653213755e-06, + "loss": 2.9497, + "step": 82380 + }, + { + "epoch": 5.597567604294062, + "grad_norm": 8.598033905029297, + "learning_rate": 3.0059281152330483e-06, + "loss": 2.7242, + "step": 82385 + }, + { + "epoch": 5.597907324364724, + "grad_norm": 7.327530860900879, + "learning_rate": 3.005503465144721e-06, + "loss": 2.6881, + "step": 82390 + }, + { + "epoch": 5.598247044435385, + "grad_norm": 6.581859111785889, + "learning_rate": 3.0050788150563935e-06, + "loss": 2.8405, + "step": 82395 + }, + { + "epoch": 5.598586764506047, + "grad_norm": 7.849301338195801, + "learning_rate": 3.0046541649680667e-06, + "loss": 2.8821, + "step": 82400 + }, + { + "epoch": 5.598926484576709, + "grad_norm": 8.07975959777832, + "learning_rate": 3.0042295148797395e-06, + "loss": 2.6348, + "step": 82405 + }, + { + "epoch": 5.59926620464737, + "grad_norm": 6.7727885246276855, + "learning_rate": 3.003804864791412e-06, + "loss": 2.7555, + "step": 82410 + }, + { + "epoch": 5.599605924718032, + "grad_norm": 10.159627914428711, + "learning_rate": 3.003380214703085e-06, + "loss": 2.995, + "step": 82415 + }, + { + "epoch": 5.599945644788694, + "grad_norm": 7.7905497550964355, + "learning_rate": 3.002955564614758e-06, + "loss": 2.7808, + "step": 82420 + }, + { + "epoch": 5.600285364859356, + "grad_norm": 7.093993663787842, + "learning_rate": 3.0025309145264303e-06, + "loss": 2.8117, + "step": 82425 + }, + { + "epoch": 5.600625084930018, + "grad_norm": 7.587623596191406, + "learning_rate": 3.002106264438103e-06, + "loss": 2.881, + "step": 82430 + }, + { + "epoch": 5.60096480500068, + "grad_norm": 7.363799095153809, + "learning_rate": 3.0016816143497763e-06, + "loss": 2.7135, + "step": 82435 + }, + { + "epoch": 5.601304525071341, + "grad_norm": 6.029301643371582, + "learning_rate": 3.0012569642614487e-06, + "loss": 2.6411, + "step": 82440 + }, + { + "epoch": 5.601644245142003, + "grad_norm": 6.165777683258057, + "learning_rate": 3.0008323141731215e-06, + "loss": 2.8448, + "step": 82445 + }, + { + "epoch": 5.601983965212665, + "grad_norm": 9.687088012695312, + "learning_rate": 3.0004076640847947e-06, + "loss": 2.7915, + "step": 82450 + }, + { + "epoch": 5.602323685283326, + "grad_norm": 8.180281639099121, + "learning_rate": 2.999983013996467e-06, + "loss": 3.005, + "step": 82455 + }, + { + "epoch": 5.602663405353988, + "grad_norm": 6.670405864715576, + "learning_rate": 2.99955836390814e-06, + "loss": 2.6101, + "step": 82460 + }, + { + "epoch": 5.60300312542465, + "grad_norm": 9.379386901855469, + "learning_rate": 2.9991337138198127e-06, + "loss": 2.8993, + "step": 82465 + }, + { + "epoch": 5.603342845495312, + "grad_norm": 8.35258674621582, + "learning_rate": 2.998709063731485e-06, + "loss": 2.9491, + "step": 82470 + }, + { + "epoch": 5.603682565565974, + "grad_norm": 7.588983058929443, + "learning_rate": 2.9982844136431583e-06, + "loss": 2.7685, + "step": 82475 + }, + { + "epoch": 5.604022285636636, + "grad_norm": 7.545551300048828, + "learning_rate": 2.997859763554831e-06, + "loss": 2.7768, + "step": 82480 + }, + { + "epoch": 5.604362005707297, + "grad_norm": 5.401533126831055, + "learning_rate": 2.9974351134665035e-06, + "loss": 2.6809, + "step": 82485 + }, + { + "epoch": 5.604701725777959, + "grad_norm": 5.9890875816345215, + "learning_rate": 2.9970104633781767e-06, + "loss": 2.6168, + "step": 82490 + }, + { + "epoch": 5.605041445848621, + "grad_norm": 8.529633522033691, + "learning_rate": 2.9965858132898495e-06, + "loss": 2.7073, + "step": 82495 + }, + { + "epoch": 5.605381165919282, + "grad_norm": 7.45816707611084, + "learning_rate": 2.996161163201522e-06, + "loss": 2.7572, + "step": 82500 + }, + { + "epoch": 5.605720885989944, + "grad_norm": 9.269601821899414, + "learning_rate": 2.995736513113195e-06, + "loss": 2.877, + "step": 82505 + }, + { + "epoch": 5.606060606060606, + "grad_norm": 6.326603412628174, + "learning_rate": 2.995311863024868e-06, + "loss": 2.8828, + "step": 82510 + }, + { + "epoch": 5.606400326131268, + "grad_norm": 6.494403839111328, + "learning_rate": 2.9948872129365403e-06, + "loss": 2.8639, + "step": 82515 + }, + { + "epoch": 5.60674004620193, + "grad_norm": 6.849823951721191, + "learning_rate": 2.994462562848213e-06, + "loss": 2.9504, + "step": 82520 + }, + { + "epoch": 5.607079766272592, + "grad_norm": 6.165917873382568, + "learning_rate": 2.9940379127598863e-06, + "loss": 2.7768, + "step": 82525 + }, + { + "epoch": 5.607419486343253, + "grad_norm": 9.067278861999512, + "learning_rate": 2.9936132626715587e-06, + "loss": 2.9632, + "step": 82530 + }, + { + "epoch": 5.607759206413915, + "grad_norm": 5.334126949310303, + "learning_rate": 2.9931886125832315e-06, + "loss": 2.814, + "step": 82535 + }, + { + "epoch": 5.608098926484577, + "grad_norm": 10.441044807434082, + "learning_rate": 2.9927639624949047e-06, + "loss": 2.8134, + "step": 82540 + }, + { + "epoch": 5.608438646555238, + "grad_norm": 6.82082462310791, + "learning_rate": 2.9923393124065775e-06, + "loss": 2.8462, + "step": 82545 + }, + { + "epoch": 5.6087783666259, + "grad_norm": 8.145807266235352, + "learning_rate": 2.99191466231825e-06, + "loss": 3.0064, + "step": 82550 + }, + { + "epoch": 5.6091180866965615, + "grad_norm": 10.830296516418457, + "learning_rate": 2.9914900122299227e-06, + "loss": 2.6363, + "step": 82555 + }, + { + "epoch": 5.609457806767224, + "grad_norm": 6.033693313598633, + "learning_rate": 2.991065362141596e-06, + "loss": 2.8392, + "step": 82560 + }, + { + "epoch": 5.609797526837886, + "grad_norm": 6.74465799331665, + "learning_rate": 2.9906407120532683e-06, + "loss": 2.8175, + "step": 82565 + }, + { + "epoch": 5.610137246908547, + "grad_norm": 7.279460430145264, + "learning_rate": 2.990216061964941e-06, + "loss": 2.8985, + "step": 82570 + }, + { + "epoch": 5.610476966979209, + "grad_norm": 9.296154022216797, + "learning_rate": 2.9897914118766143e-06, + "loss": 2.5499, + "step": 82575 + }, + { + "epoch": 5.610816687049871, + "grad_norm": 7.965391635894775, + "learning_rate": 2.9893667617882867e-06, + "loss": 2.8533, + "step": 82580 + }, + { + "epoch": 5.611156407120532, + "grad_norm": 8.010807037353516, + "learning_rate": 2.9889421116999595e-06, + "loss": 2.8838, + "step": 82585 + }, + { + "epoch": 5.611496127191194, + "grad_norm": 8.374863624572754, + "learning_rate": 2.9885174616116323e-06, + "loss": 2.8006, + "step": 82590 + }, + { + "epoch": 5.611835847261856, + "grad_norm": 6.288270950317383, + "learning_rate": 2.9880928115233047e-06, + "loss": 2.7383, + "step": 82595 + }, + { + "epoch": 5.6121755673325175, + "grad_norm": 6.78412389755249, + "learning_rate": 2.987668161434978e-06, + "loss": 2.5264, + "step": 82600 + }, + { + "epoch": 5.61251528740318, + "grad_norm": 7.1097187995910645, + "learning_rate": 2.9872435113466507e-06, + "loss": 2.8129, + "step": 82605 + }, + { + "epoch": 5.612855007473842, + "grad_norm": 7.594406604766846, + "learning_rate": 2.986818861258323e-06, + "loss": 2.7283, + "step": 82610 + }, + { + "epoch": 5.613194727544503, + "grad_norm": 9.88791561126709, + "learning_rate": 2.9863942111699963e-06, + "loss": 2.8203, + "step": 82615 + }, + { + "epoch": 5.613534447615165, + "grad_norm": 9.377320289611816, + "learning_rate": 2.985969561081669e-06, + "loss": 2.8567, + "step": 82620 + }, + { + "epoch": 5.613874167685827, + "grad_norm": 8.487141609191895, + "learning_rate": 2.9855449109933415e-06, + "loss": 3.0075, + "step": 82625 + }, + { + "epoch": 5.614213887756488, + "grad_norm": 7.955927848815918, + "learning_rate": 2.9851202609050147e-06, + "loss": 2.9928, + "step": 82630 + }, + { + "epoch": 5.61455360782715, + "grad_norm": 6.751275539398193, + "learning_rate": 2.9846956108166875e-06, + "loss": 2.7133, + "step": 82635 + }, + { + "epoch": 5.614893327897812, + "grad_norm": 7.281413555145264, + "learning_rate": 2.98427096072836e-06, + "loss": 2.6002, + "step": 82640 + }, + { + "epoch": 5.615233047968474, + "grad_norm": 7.709632396697998, + "learning_rate": 2.9838463106400327e-06, + "loss": 2.8162, + "step": 82645 + }, + { + "epoch": 5.615572768039136, + "grad_norm": 6.407207012176514, + "learning_rate": 2.983421660551706e-06, + "loss": 2.861, + "step": 82650 + }, + { + "epoch": 5.615912488109798, + "grad_norm": 8.582771301269531, + "learning_rate": 2.9829970104633783e-06, + "loss": 2.8312, + "step": 82655 + }, + { + "epoch": 5.616252208180459, + "grad_norm": 6.315241813659668, + "learning_rate": 2.982572360375051e-06, + "loss": 2.7779, + "step": 82660 + }, + { + "epoch": 5.616591928251121, + "grad_norm": 7.004912376403809, + "learning_rate": 2.9821477102867243e-06, + "loss": 2.7828, + "step": 82665 + }, + { + "epoch": 5.616931648321783, + "grad_norm": 7.0550079345703125, + "learning_rate": 2.9817230601983967e-06, + "loss": 2.5578, + "step": 82670 + }, + { + "epoch": 5.617271368392444, + "grad_norm": 7.843329429626465, + "learning_rate": 2.9812984101100695e-06, + "loss": 2.9168, + "step": 82675 + }, + { + "epoch": 5.617611088463106, + "grad_norm": 8.307127952575684, + "learning_rate": 2.9808737600217423e-06, + "loss": 2.5761, + "step": 82680 + }, + { + "epoch": 5.617950808533768, + "grad_norm": 8.363107681274414, + "learning_rate": 2.9804491099334147e-06, + "loss": 2.805, + "step": 82685 + }, + { + "epoch": 5.61829052860443, + "grad_norm": 7.712331771850586, + "learning_rate": 2.980024459845088e-06, + "loss": 2.5815, + "step": 82690 + }, + { + "epoch": 5.618630248675092, + "grad_norm": 6.250504970550537, + "learning_rate": 2.9795998097567607e-06, + "loss": 2.923, + "step": 82695 + }, + { + "epoch": 5.618969968745754, + "grad_norm": 8.325677871704102, + "learning_rate": 2.979175159668433e-06, + "loss": 2.9766, + "step": 82700 + }, + { + "epoch": 5.619309688816415, + "grad_norm": 9.75019359588623, + "learning_rate": 2.9787505095801063e-06, + "loss": 2.9511, + "step": 82705 + }, + { + "epoch": 5.619649408887077, + "grad_norm": 7.136398792266846, + "learning_rate": 2.978325859491779e-06, + "loss": 2.7661, + "step": 82710 + }, + { + "epoch": 5.619989128957739, + "grad_norm": 7.3908586502075195, + "learning_rate": 2.977901209403452e-06, + "loss": 2.6799, + "step": 82715 + }, + { + "epoch": 5.6203288490284, + "grad_norm": 7.246316909790039, + "learning_rate": 2.9774765593151243e-06, + "loss": 2.6284, + "step": 82720 + }, + { + "epoch": 5.620668569099062, + "grad_norm": 5.676892280578613, + "learning_rate": 2.9770519092267975e-06, + "loss": 2.9721, + "step": 82725 + }, + { + "epoch": 5.621008289169724, + "grad_norm": 10.1298189163208, + "learning_rate": 2.9766272591384703e-06, + "loss": 2.7418, + "step": 82730 + }, + { + "epoch": 5.621348009240386, + "grad_norm": 6.186156272888184, + "learning_rate": 2.9762026090501427e-06, + "loss": 2.9166, + "step": 82735 + }, + { + "epoch": 5.621687729311048, + "grad_norm": 6.752101421356201, + "learning_rate": 2.975777958961816e-06, + "loss": 2.8793, + "step": 82740 + }, + { + "epoch": 5.62202744938171, + "grad_norm": 7.895613193511963, + "learning_rate": 2.9753533088734887e-06, + "loss": 2.8058, + "step": 82745 + }, + { + "epoch": 5.622367169452371, + "grad_norm": 6.306188106536865, + "learning_rate": 2.974928658785161e-06, + "loss": 2.8344, + "step": 82750 + }, + { + "epoch": 5.622706889523033, + "grad_norm": 7.346429824829102, + "learning_rate": 2.974504008696834e-06, + "loss": 2.8167, + "step": 82755 + }, + { + "epoch": 5.623046609593695, + "grad_norm": 8.288858413696289, + "learning_rate": 2.974079358608507e-06, + "loss": 2.9032, + "step": 82760 + }, + { + "epoch": 5.623386329664356, + "grad_norm": 7.4174723625183105, + "learning_rate": 2.9736547085201795e-06, + "loss": 2.6532, + "step": 82765 + }, + { + "epoch": 5.623726049735018, + "grad_norm": 8.52344799041748, + "learning_rate": 2.9732300584318523e-06, + "loss": 2.8577, + "step": 82770 + }, + { + "epoch": 5.62406576980568, + "grad_norm": 7.602110385894775, + "learning_rate": 2.9728054083435255e-06, + "loss": 2.6885, + "step": 82775 + }, + { + "epoch": 5.624405489876342, + "grad_norm": 9.082027435302734, + "learning_rate": 2.972380758255198e-06, + "loss": 2.8077, + "step": 82780 + }, + { + "epoch": 5.624745209947004, + "grad_norm": 7.666388034820557, + "learning_rate": 2.9719561081668707e-06, + "loss": 3.04, + "step": 82785 + }, + { + "epoch": 5.625084930017666, + "grad_norm": 7.674550533294678, + "learning_rate": 2.971531458078544e-06, + "loss": 2.8118, + "step": 82790 + }, + { + "epoch": 5.625424650088327, + "grad_norm": 8.534518241882324, + "learning_rate": 2.9711068079902163e-06, + "loss": 2.8263, + "step": 82795 + }, + { + "epoch": 5.625764370158989, + "grad_norm": 8.297013282775879, + "learning_rate": 2.970682157901889e-06, + "loss": 2.959, + "step": 82800 + }, + { + "epoch": 5.626104090229651, + "grad_norm": 8.30518913269043, + "learning_rate": 2.970257507813562e-06, + "loss": 2.9046, + "step": 82805 + }, + { + "epoch": 5.626443810300312, + "grad_norm": 8.253268241882324, + "learning_rate": 2.9698328577252343e-06, + "loss": 2.6543, + "step": 82810 + }, + { + "epoch": 5.626783530370974, + "grad_norm": 10.36507797241211, + "learning_rate": 2.9694082076369075e-06, + "loss": 2.9057, + "step": 82815 + }, + { + "epoch": 5.627123250441636, + "grad_norm": 7.96386194229126, + "learning_rate": 2.9689835575485803e-06, + "loss": 2.7633, + "step": 82820 + }, + { + "epoch": 5.627462970512298, + "grad_norm": 8.103285789489746, + "learning_rate": 2.9685589074602527e-06, + "loss": 2.681, + "step": 82825 + }, + { + "epoch": 5.62780269058296, + "grad_norm": 8.401517868041992, + "learning_rate": 2.968134257371926e-06, + "loss": 2.8462, + "step": 82830 + }, + { + "epoch": 5.628142410653622, + "grad_norm": 8.223236083984375, + "learning_rate": 2.9677096072835987e-06, + "loss": 2.5278, + "step": 82835 + }, + { + "epoch": 5.628482130724283, + "grad_norm": 7.063620567321777, + "learning_rate": 2.967284957195271e-06, + "loss": 2.8976, + "step": 82840 + }, + { + "epoch": 5.628821850794945, + "grad_norm": 7.175790309906006, + "learning_rate": 2.966860307106944e-06, + "loss": 2.8299, + "step": 82845 + }, + { + "epoch": 5.629161570865607, + "grad_norm": 6.801551342010498, + "learning_rate": 2.966435657018617e-06, + "loss": 2.6575, + "step": 82850 + }, + { + "epoch": 5.629501290936268, + "grad_norm": 8.168939590454102, + "learning_rate": 2.9660110069302895e-06, + "loss": 2.9488, + "step": 82855 + }, + { + "epoch": 5.62984101100693, + "grad_norm": 8.722701072692871, + "learning_rate": 2.9655863568419623e-06, + "loss": 2.9279, + "step": 82860 + }, + { + "epoch": 5.630180731077592, + "grad_norm": 5.991828441619873, + "learning_rate": 2.9651617067536355e-06, + "loss": 2.6823, + "step": 82865 + }, + { + "epoch": 5.630520451148254, + "grad_norm": 6.8183112144470215, + "learning_rate": 2.964737056665308e-06, + "loss": 2.7602, + "step": 82870 + }, + { + "epoch": 5.630860171218916, + "grad_norm": 8.04472541809082, + "learning_rate": 2.9643124065769807e-06, + "loss": 2.649, + "step": 82875 + }, + { + "epoch": 5.631199891289578, + "grad_norm": 10.138544082641602, + "learning_rate": 2.9638877564886535e-06, + "loss": 2.8571, + "step": 82880 + }, + { + "epoch": 5.631539611360239, + "grad_norm": 7.43946647644043, + "learning_rate": 2.9634631064003267e-06, + "loss": 2.6853, + "step": 82885 + }, + { + "epoch": 5.631879331430901, + "grad_norm": 10.824695587158203, + "learning_rate": 2.963038456311999e-06, + "loss": 2.8679, + "step": 82890 + }, + { + "epoch": 5.632219051501563, + "grad_norm": 10.19198226928711, + "learning_rate": 2.962613806223672e-06, + "loss": 2.6837, + "step": 82895 + }, + { + "epoch": 5.632558771572224, + "grad_norm": 8.523277282714844, + "learning_rate": 2.962189156135345e-06, + "loss": 2.8173, + "step": 82900 + }, + { + "epoch": 5.632898491642886, + "grad_norm": 5.517080783843994, + "learning_rate": 2.9617645060470175e-06, + "loss": 3.0809, + "step": 82905 + }, + { + "epoch": 5.633238211713548, + "grad_norm": 7.9603424072265625, + "learning_rate": 2.9613398559586903e-06, + "loss": 2.8095, + "step": 82910 + }, + { + "epoch": 5.63357793178421, + "grad_norm": 6.635314464569092, + "learning_rate": 2.9609152058703635e-06, + "loss": 2.9466, + "step": 82915 + }, + { + "epoch": 5.633917651854872, + "grad_norm": 8.127737045288086, + "learning_rate": 2.960490555782036e-06, + "loss": 2.8824, + "step": 82920 + }, + { + "epoch": 5.634257371925534, + "grad_norm": 9.062355995178223, + "learning_rate": 2.9600659056937087e-06, + "loss": 2.8056, + "step": 82925 + }, + { + "epoch": 5.634597091996195, + "grad_norm": 7.209922790527344, + "learning_rate": 2.9596412556053815e-06, + "loss": 2.8426, + "step": 82930 + }, + { + "epoch": 5.634936812066857, + "grad_norm": 8.369945526123047, + "learning_rate": 2.959216605517054e-06, + "loss": 2.7887, + "step": 82935 + }, + { + "epoch": 5.635276532137519, + "grad_norm": 6.60723876953125, + "learning_rate": 2.958791955428727e-06, + "loss": 2.3439, + "step": 82940 + }, + { + "epoch": 5.63561625220818, + "grad_norm": 9.807393074035645, + "learning_rate": 2.9583673053404e-06, + "loss": 2.7045, + "step": 82945 + }, + { + "epoch": 5.635955972278842, + "grad_norm": 8.355433464050293, + "learning_rate": 2.9579426552520723e-06, + "loss": 2.8016, + "step": 82950 + }, + { + "epoch": 5.6362956923495044, + "grad_norm": 6.684100151062012, + "learning_rate": 2.9575180051637455e-06, + "loss": 2.8266, + "step": 82955 + }, + { + "epoch": 5.636635412420166, + "grad_norm": 9.754965782165527, + "learning_rate": 2.9570933550754183e-06, + "loss": 3.0327, + "step": 82960 + }, + { + "epoch": 5.636975132490828, + "grad_norm": 6.559696197509766, + "learning_rate": 2.9566687049870907e-06, + "loss": 2.6062, + "step": 82965 + }, + { + "epoch": 5.63731485256149, + "grad_norm": 6.669576644897461, + "learning_rate": 2.9562440548987635e-06, + "loss": 2.7431, + "step": 82970 + }, + { + "epoch": 5.637654572632151, + "grad_norm": 6.875999927520752, + "learning_rate": 2.9558194048104367e-06, + "loss": 3.137, + "step": 82975 + }, + { + "epoch": 5.637994292702813, + "grad_norm": 8.51895809173584, + "learning_rate": 2.955394754722109e-06, + "loss": 2.7015, + "step": 82980 + }, + { + "epoch": 5.638334012773475, + "grad_norm": Infinity, + "learning_rate": 2.955055034651447e-06, + "loss": 2.8496, + "step": 82985 + }, + { + "epoch": 5.638673732844136, + "grad_norm": 6.610581874847412, + "learning_rate": 2.9546303845631204e-06, + "loss": 2.7166, + "step": 82990 + }, + { + "epoch": 5.639013452914798, + "grad_norm": 6.08845853805542, + "learning_rate": 2.954205734474793e-06, + "loss": 2.7207, + "step": 82995 + }, + { + "epoch": 5.6393531729854605, + "grad_norm": 6.159934043884277, + "learning_rate": 2.9537810843864655e-06, + "loss": 2.8755, + "step": 83000 + }, + { + "epoch": 5.639692893056122, + "grad_norm": 8.253933906555176, + "learning_rate": 2.9533564342981388e-06, + "loss": 2.726, + "step": 83005 + }, + { + "epoch": 5.640032613126784, + "grad_norm": 8.22368335723877, + "learning_rate": 2.9529317842098116e-06, + "loss": 2.9568, + "step": 83010 + }, + { + "epoch": 5.640372333197446, + "grad_norm": 9.389457702636719, + "learning_rate": 2.952507134121484e-06, + "loss": 2.8764, + "step": 83015 + }, + { + "epoch": 5.640712053268107, + "grad_norm": 10.377302169799805, + "learning_rate": 2.9520824840331567e-06, + "loss": 2.6457, + "step": 83020 + }, + { + "epoch": 5.641051773338769, + "grad_norm": 8.818493843078613, + "learning_rate": 2.95165783394483e-06, + "loss": 2.9028, + "step": 83025 + }, + { + "epoch": 5.64139149340943, + "grad_norm": 6.753974914550781, + "learning_rate": 2.9512331838565023e-06, + "loss": 2.7767, + "step": 83030 + }, + { + "epoch": 5.641731213480092, + "grad_norm": 8.481945037841797, + "learning_rate": 2.950808533768175e-06, + "loss": 2.8166, + "step": 83035 + }, + { + "epoch": 5.642070933550754, + "grad_norm": 8.340285301208496, + "learning_rate": 2.9503838836798484e-06, + "loss": 2.9236, + "step": 83040 + }, + { + "epoch": 5.642410653621416, + "grad_norm": 9.43564224243164, + "learning_rate": 2.9499592335915208e-06, + "loss": 2.7422, + "step": 83045 + }, + { + "epoch": 5.642750373692078, + "grad_norm": 7.139571666717529, + "learning_rate": 2.9495345835031936e-06, + "loss": 2.9575, + "step": 83050 + }, + { + "epoch": 5.64309009376274, + "grad_norm": 9.180729866027832, + "learning_rate": 2.9491099334148664e-06, + "loss": 2.8553, + "step": 83055 + }, + { + "epoch": 5.643429813833401, + "grad_norm": 7.5587158203125, + "learning_rate": 2.9486852833265387e-06, + "loss": 2.7906, + "step": 83060 + }, + { + "epoch": 5.643769533904063, + "grad_norm": 6.576598644256592, + "learning_rate": 2.948260633238212e-06, + "loss": 2.841, + "step": 83065 + }, + { + "epoch": 5.644109253974725, + "grad_norm": 6.800677299499512, + "learning_rate": 2.9478359831498848e-06, + "loss": 2.8097, + "step": 83070 + }, + { + "epoch": 5.644448974045386, + "grad_norm": 7.0964179039001465, + "learning_rate": 2.947411333061557e-06, + "loss": 2.6061, + "step": 83075 + }, + { + "epoch": 5.644788694116048, + "grad_norm": 7.280599117279053, + "learning_rate": 2.9469866829732304e-06, + "loss": 2.6097, + "step": 83080 + }, + { + "epoch": 5.64512841418671, + "grad_norm": 8.965255737304688, + "learning_rate": 2.946562032884903e-06, + "loss": 2.7376, + "step": 83085 + }, + { + "epoch": 5.645468134257372, + "grad_norm": 7.177356243133545, + "learning_rate": 2.946137382796576e-06, + "loss": 2.8335, + "step": 83090 + }, + { + "epoch": 5.645807854328034, + "grad_norm": 7.427753925323486, + "learning_rate": 2.9457127327082483e-06, + "loss": 2.6713, + "step": 83095 + }, + { + "epoch": 5.646147574398696, + "grad_norm": 8.662294387817383, + "learning_rate": 2.9452880826199216e-06, + "loss": 2.8704, + "step": 83100 + }, + { + "epoch": 5.646487294469357, + "grad_norm": 9.343313217163086, + "learning_rate": 2.9448634325315944e-06, + "loss": 2.7365, + "step": 83105 + }, + { + "epoch": 5.646827014540019, + "grad_norm": 6.762842655181885, + "learning_rate": 2.9444387824432667e-06, + "loss": 2.6602, + "step": 83110 + }, + { + "epoch": 5.647166734610681, + "grad_norm": 9.156468391418457, + "learning_rate": 2.94401413235494e-06, + "loss": 3.0082, + "step": 83115 + }, + { + "epoch": 5.647506454681342, + "grad_norm": 6.891422271728516, + "learning_rate": 2.9435894822666128e-06, + "loss": 2.8042, + "step": 83120 + }, + { + "epoch": 5.647846174752004, + "grad_norm": 8.419234275817871, + "learning_rate": 2.943164832178285e-06, + "loss": 2.6775, + "step": 83125 + }, + { + "epoch": 5.648185894822666, + "grad_norm": 8.563151359558105, + "learning_rate": 2.9427401820899584e-06, + "loss": 2.8863, + "step": 83130 + }, + { + "epoch": 5.648525614893328, + "grad_norm": 6.354923248291016, + "learning_rate": 2.942315532001631e-06, + "loss": 2.682, + "step": 83135 + }, + { + "epoch": 5.64886533496399, + "grad_norm": 6.061637878417969, + "learning_rate": 2.9418908819133035e-06, + "loss": 2.7655, + "step": 83140 + }, + { + "epoch": 5.649205055034652, + "grad_norm": 7.870954990386963, + "learning_rate": 2.9414662318249763e-06, + "loss": 2.4694, + "step": 83145 + }, + { + "epoch": 5.649544775105313, + "grad_norm": 6.949525833129883, + "learning_rate": 2.9410415817366496e-06, + "loss": 2.8522, + "step": 83150 + }, + { + "epoch": 5.649884495175975, + "grad_norm": 8.280388832092285, + "learning_rate": 2.940616931648322e-06, + "loss": 2.7897, + "step": 83155 + }, + { + "epoch": 5.650224215246637, + "grad_norm": 10.979107856750488, + "learning_rate": 2.9401922815599947e-06, + "loss": 2.9733, + "step": 83160 + }, + { + "epoch": 5.650563935317298, + "grad_norm": 7.480356693267822, + "learning_rate": 2.939767631471668e-06, + "loss": 2.8207, + "step": 83165 + }, + { + "epoch": 5.65090365538796, + "grad_norm": 7.185631275177002, + "learning_rate": 2.9393429813833403e-06, + "loss": 2.781, + "step": 83170 + }, + { + "epoch": 5.651243375458622, + "grad_norm": 9.413375854492188, + "learning_rate": 2.938918331295013e-06, + "loss": 2.8636, + "step": 83175 + }, + { + "epoch": 5.651583095529284, + "grad_norm": 7.805652618408203, + "learning_rate": 2.938493681206686e-06, + "loss": 2.6119, + "step": 83180 + }, + { + "epoch": 5.651922815599946, + "grad_norm": 9.069661140441895, + "learning_rate": 2.9380690311183583e-06, + "loss": 2.785, + "step": 83185 + }, + { + "epoch": 5.652262535670608, + "grad_norm": 11.639243125915527, + "learning_rate": 2.9376443810300315e-06, + "loss": 2.5035, + "step": 83190 + }, + { + "epoch": 5.652602255741269, + "grad_norm": 6.125056266784668, + "learning_rate": 2.9372197309417044e-06, + "loss": 2.685, + "step": 83195 + }, + { + "epoch": 5.652941975811931, + "grad_norm": 5.612496852874756, + "learning_rate": 2.9367950808533767e-06, + "loss": 2.8153, + "step": 83200 + }, + { + "epoch": 5.653281695882593, + "grad_norm": 7.079156398773193, + "learning_rate": 2.93637043076505e-06, + "loss": 2.709, + "step": 83205 + }, + { + "epoch": 5.653621415953254, + "grad_norm": 9.685203552246094, + "learning_rate": 2.9359457806767228e-06, + "loss": 3.0548, + "step": 83210 + }, + { + "epoch": 5.653961136023916, + "grad_norm": 7.658054351806641, + "learning_rate": 2.935521130588395e-06, + "loss": 2.8419, + "step": 83215 + }, + { + "epoch": 5.654300856094578, + "grad_norm": 6.13841438293457, + "learning_rate": 2.935096480500068e-06, + "loss": 2.9788, + "step": 83220 + }, + { + "epoch": 5.65464057616524, + "grad_norm": 6.948779582977295, + "learning_rate": 2.934671830411741e-06, + "loss": 2.7916, + "step": 83225 + }, + { + "epoch": 5.654980296235902, + "grad_norm": 6.423551082611084, + "learning_rate": 2.9342471803234135e-06, + "loss": 2.6166, + "step": 83230 + }, + { + "epoch": 5.655320016306564, + "grad_norm": 8.315068244934082, + "learning_rate": 2.9338225302350863e-06, + "loss": 2.8048, + "step": 83235 + }, + { + "epoch": 5.655659736377225, + "grad_norm": 8.5771484375, + "learning_rate": 2.9333978801467596e-06, + "loss": 2.8451, + "step": 83240 + }, + { + "epoch": 5.655999456447887, + "grad_norm": 7.390467643737793, + "learning_rate": 2.932973230058432e-06, + "loss": 2.8087, + "step": 83245 + }, + { + "epoch": 5.656339176518548, + "grad_norm": 5.725922107696533, + "learning_rate": 2.9325485799701047e-06, + "loss": 2.5926, + "step": 83250 + }, + { + "epoch": 5.65667889658921, + "grad_norm": 8.142882347106934, + "learning_rate": 2.9321239298817775e-06, + "loss": 2.9139, + "step": 83255 + }, + { + "epoch": 5.657018616659872, + "grad_norm": 8.693532943725586, + "learning_rate": 2.9316992797934508e-06, + "loss": 2.6775, + "step": 83260 + }, + { + "epoch": 5.657358336730534, + "grad_norm": 8.23947811126709, + "learning_rate": 2.931274629705123e-06, + "loss": 2.9796, + "step": 83265 + }, + { + "epoch": 5.657698056801196, + "grad_norm": 5.877624034881592, + "learning_rate": 2.930849979616796e-06, + "loss": 2.9206, + "step": 83270 + }, + { + "epoch": 5.658037776871858, + "grad_norm": 4.899290561676025, + "learning_rate": 2.930425329528469e-06, + "loss": 2.4894, + "step": 83275 + }, + { + "epoch": 5.658377496942519, + "grad_norm": 6.981583118438721, + "learning_rate": 2.9300006794401415e-06, + "loss": 2.7967, + "step": 83280 + }, + { + "epoch": 5.658717217013181, + "grad_norm": 8.684202194213867, + "learning_rate": 2.9295760293518143e-06, + "loss": 2.5911, + "step": 83285 + }, + { + "epoch": 5.659056937083843, + "grad_norm": 7.373007774353027, + "learning_rate": 2.9291513792634876e-06, + "loss": 2.9164, + "step": 83290 + }, + { + "epoch": 5.659396657154504, + "grad_norm": 7.090517520904541, + "learning_rate": 2.92872672917516e-06, + "loss": 2.4157, + "step": 83295 + }, + { + "epoch": 5.659736377225166, + "grad_norm": 7.492156505584717, + "learning_rate": 2.9283020790868327e-06, + "loss": 2.7442, + "step": 83300 + }, + { + "epoch": 5.660076097295828, + "grad_norm": 7.099887847900391, + "learning_rate": 2.9278774289985055e-06, + "loss": 2.6272, + "step": 83305 + }, + { + "epoch": 5.66041581736649, + "grad_norm": 10.260855674743652, + "learning_rate": 2.927452778910178e-06, + "loss": 2.6161, + "step": 83310 + }, + { + "epoch": 5.660755537437152, + "grad_norm": 6.869961261749268, + "learning_rate": 2.927028128821851e-06, + "loss": 2.8191, + "step": 83315 + }, + { + "epoch": 5.661095257507814, + "grad_norm": 9.303549766540527, + "learning_rate": 2.926603478733524e-06, + "loss": 2.8891, + "step": 83320 + }, + { + "epoch": 5.661434977578475, + "grad_norm": 8.776312828063965, + "learning_rate": 2.9261788286451963e-06, + "loss": 2.8276, + "step": 83325 + }, + { + "epoch": 5.661774697649137, + "grad_norm": 8.207859992980957, + "learning_rate": 2.9257541785568695e-06, + "loss": 2.9441, + "step": 83330 + }, + { + "epoch": 5.662114417719799, + "grad_norm": 6.128815650939941, + "learning_rate": 2.9253295284685423e-06, + "loss": 2.7001, + "step": 83335 + }, + { + "epoch": 5.66245413779046, + "grad_norm": 7.0096235275268555, + "learning_rate": 2.9249048783802147e-06, + "loss": 2.923, + "step": 83340 + }, + { + "epoch": 5.662793857861122, + "grad_norm": 7.820930480957031, + "learning_rate": 2.9244802282918875e-06, + "loss": 2.8299, + "step": 83345 + }, + { + "epoch": 5.663133577931784, + "grad_norm": 8.791473388671875, + "learning_rate": 2.9240555782035608e-06, + "loss": 2.7874, + "step": 83350 + }, + { + "epoch": 5.663473298002446, + "grad_norm": 8.253132820129395, + "learning_rate": 2.923630928115233e-06, + "loss": 2.9873, + "step": 83355 + }, + { + "epoch": 5.663813018073108, + "grad_norm": 8.27933406829834, + "learning_rate": 2.923206278026906e-06, + "loss": 2.9658, + "step": 83360 + }, + { + "epoch": 5.66415273814377, + "grad_norm": 9.646707534790039, + "learning_rate": 2.922781627938579e-06, + "loss": 2.789, + "step": 83365 + }, + { + "epoch": 5.664492458214431, + "grad_norm": 7.61890983581543, + "learning_rate": 2.9223569778502515e-06, + "loss": 2.5855, + "step": 83370 + }, + { + "epoch": 5.664832178285093, + "grad_norm": 6.529424667358398, + "learning_rate": 2.9219323277619243e-06, + "loss": 2.7045, + "step": 83375 + }, + { + "epoch": 5.665171898355755, + "grad_norm": 7.263291835784912, + "learning_rate": 2.921507677673597e-06, + "loss": 2.7057, + "step": 83380 + }, + { + "epoch": 5.665511618426416, + "grad_norm": 8.367122650146484, + "learning_rate": 2.92108302758527e-06, + "loss": 2.5998, + "step": 83385 + }, + { + "epoch": 5.665851338497078, + "grad_norm": 6.773377895355225, + "learning_rate": 2.9206583774969427e-06, + "loss": 2.9015, + "step": 83390 + }, + { + "epoch": 5.66619105856774, + "grad_norm": 7.502056121826172, + "learning_rate": 2.9202337274086155e-06, + "loss": 2.8866, + "step": 83395 + }, + { + "epoch": 5.666530778638402, + "grad_norm": 9.985097885131836, + "learning_rate": 2.919809077320288e-06, + "loss": 2.5245, + "step": 83400 + }, + { + "epoch": 5.666870498709064, + "grad_norm": 7.976466178894043, + "learning_rate": 2.919384427231961e-06, + "loss": 2.6012, + "step": 83405 + }, + { + "epoch": 5.667210218779726, + "grad_norm": 5.779356479644775, + "learning_rate": 2.918959777143634e-06, + "loss": 2.6254, + "step": 83410 + }, + { + "epoch": 5.667549938850387, + "grad_norm": 6.488324165344238, + "learning_rate": 2.9185351270553063e-06, + "loss": 2.9108, + "step": 83415 + }, + { + "epoch": 5.667889658921049, + "grad_norm": 9.018143653869629, + "learning_rate": 2.9181104769669795e-06, + "loss": 2.8811, + "step": 83420 + }, + { + "epoch": 5.668229378991711, + "grad_norm": 7.407769680023193, + "learning_rate": 2.9176858268786523e-06, + "loss": 2.6363, + "step": 83425 + }, + { + "epoch": 5.668569099062372, + "grad_norm": 8.673829078674316, + "learning_rate": 2.917261176790325e-06, + "loss": 2.7605, + "step": 83430 + }, + { + "epoch": 5.668908819133034, + "grad_norm": 6.952643871307373, + "learning_rate": 2.9168365267019975e-06, + "loss": 3.0416, + "step": 83435 + }, + { + "epoch": 5.669248539203696, + "grad_norm": 10.578001022338867, + "learning_rate": 2.9164118766136707e-06, + "loss": 2.7744, + "step": 83440 + }, + { + "epoch": 5.669588259274358, + "grad_norm": 7.450813293457031, + "learning_rate": 2.9159872265253435e-06, + "loss": 2.8615, + "step": 83445 + }, + { + "epoch": 5.66992797934502, + "grad_norm": 8.099013328552246, + "learning_rate": 2.915562576437016e-06, + "loss": 2.5555, + "step": 83450 + }, + { + "epoch": 5.670267699415682, + "grad_norm": 6.962283134460449, + "learning_rate": 2.915137926348689e-06, + "loss": 2.8496, + "step": 83455 + }, + { + "epoch": 5.670607419486343, + "grad_norm": 10.660201072692871, + "learning_rate": 2.914713276260362e-06, + "loss": 2.846, + "step": 83460 + }, + { + "epoch": 5.670947139557005, + "grad_norm": 6.810696601867676, + "learning_rate": 2.9142886261720343e-06, + "loss": 2.9614, + "step": 83465 + }, + { + "epoch": 5.671286859627667, + "grad_norm": 6.267060279846191, + "learning_rate": 2.913863976083707e-06, + "loss": 2.6588, + "step": 83470 + }, + { + "epoch": 5.671626579698328, + "grad_norm": 8.266528129577637, + "learning_rate": 2.9134393259953803e-06, + "loss": 2.5899, + "step": 83475 + }, + { + "epoch": 5.67196629976899, + "grad_norm": 9.226873397827148, + "learning_rate": 2.9130146759070527e-06, + "loss": 2.6974, + "step": 83480 + }, + { + "epoch": 5.672306019839652, + "grad_norm": 6.606198787689209, + "learning_rate": 2.9125900258187255e-06, + "loss": 2.8093, + "step": 83485 + }, + { + "epoch": 5.672645739910314, + "grad_norm": 6.017663955688477, + "learning_rate": 2.9121653757303987e-06, + "loss": 2.9773, + "step": 83490 + }, + { + "epoch": 5.672985459980976, + "grad_norm": 8.944218635559082, + "learning_rate": 2.911740725642071e-06, + "loss": 2.8505, + "step": 83495 + }, + { + "epoch": 5.673325180051638, + "grad_norm": 6.2489914894104, + "learning_rate": 2.911316075553744e-06, + "loss": 2.833, + "step": 83500 + }, + { + "epoch": 5.673664900122299, + "grad_norm": 7.9791131019592285, + "learning_rate": 2.9108914254654167e-06, + "loss": 2.7384, + "step": 83505 + }, + { + "epoch": 5.674004620192961, + "grad_norm": 8.058942794799805, + "learning_rate": 2.9104667753770895e-06, + "loss": 2.8442, + "step": 83510 + }, + { + "epoch": 5.674344340263623, + "grad_norm": 9.676512718200684, + "learning_rate": 2.9100421252887623e-06, + "loss": 2.8734, + "step": 83515 + }, + { + "epoch": 5.674684060334284, + "grad_norm": 7.3159027099609375, + "learning_rate": 2.909617475200435e-06, + "loss": 2.8165, + "step": 83520 + }, + { + "epoch": 5.675023780404946, + "grad_norm": 6.309149265289307, + "learning_rate": 2.9091928251121075e-06, + "loss": 2.5469, + "step": 83525 + }, + { + "epoch": 5.6753635004756084, + "grad_norm": 7.423947334289551, + "learning_rate": 2.9087681750237807e-06, + "loss": 2.9144, + "step": 83530 + }, + { + "epoch": 5.67570322054627, + "grad_norm": 9.353094100952148, + "learning_rate": 2.9083435249354535e-06, + "loss": 2.7934, + "step": 83535 + }, + { + "epoch": 5.676042940616932, + "grad_norm": 8.168624877929688, + "learning_rate": 2.907918874847126e-06, + "loss": 2.9473, + "step": 83540 + }, + { + "epoch": 5.676382660687594, + "grad_norm": 8.43233585357666, + "learning_rate": 2.907494224758799e-06, + "loss": 2.6304, + "step": 83545 + }, + { + "epoch": 5.676722380758255, + "grad_norm": 7.690804958343506, + "learning_rate": 2.907069574670472e-06, + "loss": 2.5689, + "step": 83550 + }, + { + "epoch": 5.677062100828917, + "grad_norm": 7.256691932678223, + "learning_rate": 2.9066449245821443e-06, + "loss": 2.7035, + "step": 83555 + }, + { + "epoch": 5.677401820899579, + "grad_norm": 6.909422397613525, + "learning_rate": 2.906220274493817e-06, + "loss": 2.7278, + "step": 83560 + }, + { + "epoch": 5.67774154097024, + "grad_norm": 8.234074592590332, + "learning_rate": 2.9057956244054903e-06, + "loss": 2.8608, + "step": 83565 + }, + { + "epoch": 5.678081261040902, + "grad_norm": 9.644822120666504, + "learning_rate": 2.9053709743171627e-06, + "loss": 2.8535, + "step": 83570 + }, + { + "epoch": 5.6784209811115645, + "grad_norm": 8.925819396972656, + "learning_rate": 2.9049463242288355e-06, + "loss": 2.7894, + "step": 83575 + }, + { + "epoch": 5.678760701182226, + "grad_norm": 6.847811698913574, + "learning_rate": 2.9045216741405087e-06, + "loss": 2.7751, + "step": 83580 + }, + { + "epoch": 5.679100421252888, + "grad_norm": 8.92032241821289, + "learning_rate": 2.904097024052181e-06, + "loss": 2.7483, + "step": 83585 + }, + { + "epoch": 5.67944014132355, + "grad_norm": 7.009796142578125, + "learning_rate": 2.903672373963854e-06, + "loss": 2.8694, + "step": 83590 + }, + { + "epoch": 5.679779861394211, + "grad_norm": 6.457345962524414, + "learning_rate": 2.9032477238755267e-06, + "loss": 2.7533, + "step": 83595 + }, + { + "epoch": 5.680119581464873, + "grad_norm": 7.992620944976807, + "learning_rate": 2.9028230737872e-06, + "loss": 2.8046, + "step": 83600 + }, + { + "epoch": 5.680459301535535, + "grad_norm": 8.869555473327637, + "learning_rate": 2.9023984236988723e-06, + "loss": 2.8233, + "step": 83605 + }, + { + "epoch": 5.680799021606196, + "grad_norm": 7.445135116577148, + "learning_rate": 2.901973773610545e-06, + "loss": 2.9508, + "step": 83610 + }, + { + "epoch": 5.681138741676858, + "grad_norm": 7.325652122497559, + "learning_rate": 2.9015491235222183e-06, + "loss": 2.9017, + "step": 83615 + }, + { + "epoch": 5.6814784617475205, + "grad_norm": 6.702394008636475, + "learning_rate": 2.9011244734338907e-06, + "loss": 2.8776, + "step": 83620 + }, + { + "epoch": 5.681818181818182, + "grad_norm": 6.886290550231934, + "learning_rate": 2.9006998233455635e-06, + "loss": 2.9497, + "step": 83625 + }, + { + "epoch": 5.682157901888844, + "grad_norm": 8.645089149475098, + "learning_rate": 2.9002751732572363e-06, + "loss": 2.8393, + "step": 83630 + }, + { + "epoch": 5.682497621959506, + "grad_norm": 6.73293399810791, + "learning_rate": 2.899850523168909e-06, + "loss": 2.7984, + "step": 83635 + }, + { + "epoch": 5.682837342030167, + "grad_norm": 9.782265663146973, + "learning_rate": 2.899425873080582e-06, + "loss": 2.971, + "step": 83640 + }, + { + "epoch": 5.683177062100829, + "grad_norm": 6.189968585968018, + "learning_rate": 2.8990012229922547e-06, + "loss": 2.9291, + "step": 83645 + }, + { + "epoch": 5.683516782171491, + "grad_norm": 7.6053571701049805, + "learning_rate": 2.898576572903927e-06, + "loss": 2.436, + "step": 83650 + }, + { + "epoch": 5.683856502242152, + "grad_norm": 10.372597694396973, + "learning_rate": 2.8981519228156003e-06, + "loss": 2.6521, + "step": 83655 + }, + { + "epoch": 5.684196222312814, + "grad_norm": 4.927159309387207, + "learning_rate": 2.897727272727273e-06, + "loss": 2.941, + "step": 83660 + }, + { + "epoch": 5.6845359423834765, + "grad_norm": 8.569095611572266, + "learning_rate": 2.8973026226389455e-06, + "loss": 2.6688, + "step": 83665 + }, + { + "epoch": 5.684875662454138, + "grad_norm": 6.26657247543335, + "learning_rate": 2.8968779725506187e-06, + "loss": 2.7329, + "step": 83670 + }, + { + "epoch": 5.6852153825248, + "grad_norm": 7.229158401489258, + "learning_rate": 2.8964533224622915e-06, + "loss": 2.7604, + "step": 83675 + }, + { + "epoch": 5.685555102595462, + "grad_norm": 9.866373062133789, + "learning_rate": 2.896028672373964e-06, + "loss": 2.943, + "step": 83680 + }, + { + "epoch": 5.685894822666123, + "grad_norm": 6.349828720092773, + "learning_rate": 2.8956040222856367e-06, + "loss": 2.8403, + "step": 83685 + }, + { + "epoch": 5.686234542736785, + "grad_norm": 8.646428108215332, + "learning_rate": 2.89517937219731e-06, + "loss": 2.8455, + "step": 83690 + }, + { + "epoch": 5.686574262807447, + "grad_norm": 7.7154693603515625, + "learning_rate": 2.8947547221089823e-06, + "loss": 2.7872, + "step": 83695 + }, + { + "epoch": 5.686913982878108, + "grad_norm": 6.402797222137451, + "learning_rate": 2.894330072020655e-06, + "loss": 2.9067, + "step": 83700 + }, + { + "epoch": 5.68725370294877, + "grad_norm": 7.271215915679932, + "learning_rate": 2.8939054219323283e-06, + "loss": 2.6826, + "step": 83705 + }, + { + "epoch": 5.687593423019432, + "grad_norm": 6.969164848327637, + "learning_rate": 2.8934807718440007e-06, + "loss": 2.5909, + "step": 83710 + }, + { + "epoch": 5.687933143090094, + "grad_norm": 8.178953170776367, + "learning_rate": 2.8930561217556735e-06, + "loss": 2.7333, + "step": 83715 + }, + { + "epoch": 5.688272863160756, + "grad_norm": 6.705800533294678, + "learning_rate": 2.8926314716673463e-06, + "loss": 2.7846, + "step": 83720 + }, + { + "epoch": 5.688612583231417, + "grad_norm": 8.794839859008789, + "learning_rate": 2.8922068215790187e-06, + "loss": 2.8853, + "step": 83725 + }, + { + "epoch": 5.688952303302079, + "grad_norm": 7.165651798248291, + "learning_rate": 2.891782171490692e-06, + "loss": 2.9161, + "step": 83730 + }, + { + "epoch": 5.689292023372741, + "grad_norm": 7.545275688171387, + "learning_rate": 2.8913575214023647e-06, + "loss": 2.5988, + "step": 83735 + }, + { + "epoch": 5.689631743443402, + "grad_norm": 7.144908428192139, + "learning_rate": 2.890932871314037e-06, + "loss": 2.7945, + "step": 83740 + }, + { + "epoch": 5.689971463514064, + "grad_norm": 7.2816643714904785, + "learning_rate": 2.8905082212257103e-06, + "loss": 2.8423, + "step": 83745 + }, + { + "epoch": 5.690311183584726, + "grad_norm": 7.354428291320801, + "learning_rate": 2.890083571137383e-06, + "loss": 2.8401, + "step": 83750 + }, + { + "epoch": 5.690650903655388, + "grad_norm": 9.337530136108398, + "learning_rate": 2.8896589210490555e-06, + "loss": 2.9175, + "step": 83755 + }, + { + "epoch": 5.69099062372605, + "grad_norm": 6.023916244506836, + "learning_rate": 2.8892342709607283e-06, + "loss": 2.8181, + "step": 83760 + }, + { + "epoch": 5.691330343796712, + "grad_norm": 8.264372825622559, + "learning_rate": 2.8888096208724015e-06, + "loss": 2.8907, + "step": 83765 + }, + { + "epoch": 5.691670063867373, + "grad_norm": 7.255343914031982, + "learning_rate": 2.8883849707840743e-06, + "loss": 2.7126, + "step": 83770 + }, + { + "epoch": 5.692009783938035, + "grad_norm": 5.55580997467041, + "learning_rate": 2.8879603206957467e-06, + "loss": 2.7428, + "step": 83775 + }, + { + "epoch": 5.692349504008697, + "grad_norm": 6.404973030090332, + "learning_rate": 2.88753567060742e-06, + "loss": 2.6528, + "step": 83780 + }, + { + "epoch": 5.692689224079358, + "grad_norm": 8.783622741699219, + "learning_rate": 2.8871110205190927e-06, + "loss": 2.8917, + "step": 83785 + }, + { + "epoch": 5.69302894415002, + "grad_norm": 9.141359329223633, + "learning_rate": 2.886686370430765e-06, + "loss": 2.578, + "step": 83790 + }, + { + "epoch": 5.693368664220682, + "grad_norm": 7.506980895996094, + "learning_rate": 2.8862617203424383e-06, + "loss": 2.9938, + "step": 83795 + }, + { + "epoch": 5.693708384291344, + "grad_norm": 8.511075019836426, + "learning_rate": 2.885837070254111e-06, + "loss": 2.8153, + "step": 83800 + }, + { + "epoch": 5.694048104362006, + "grad_norm": 6.826348304748535, + "learning_rate": 2.8854124201657835e-06, + "loss": 2.6025, + "step": 83805 + }, + { + "epoch": 5.694387824432668, + "grad_norm": 9.203706741333008, + "learning_rate": 2.8849877700774563e-06, + "loss": 2.8968, + "step": 83810 + }, + { + "epoch": 5.694727544503329, + "grad_norm": 8.853022575378418, + "learning_rate": 2.8845631199891295e-06, + "loss": 3.027, + "step": 83815 + }, + { + "epoch": 5.695067264573991, + "grad_norm": 8.961712837219238, + "learning_rate": 2.884138469900802e-06, + "loss": 2.9015, + "step": 83820 + }, + { + "epoch": 5.695406984644653, + "grad_norm": 8.19416618347168, + "learning_rate": 2.8837138198124747e-06, + "loss": 2.6522, + "step": 83825 + }, + { + "epoch": 5.695746704715314, + "grad_norm": 8.506999969482422, + "learning_rate": 2.883289169724148e-06, + "loss": 2.8427, + "step": 83830 + }, + { + "epoch": 5.696086424785976, + "grad_norm": 7.55390739440918, + "learning_rate": 2.8828645196358203e-06, + "loss": 3.0549, + "step": 83835 + }, + { + "epoch": 5.6964261448566385, + "grad_norm": 7.760462760925293, + "learning_rate": 2.882439869547493e-06, + "loss": 2.7573, + "step": 83840 + }, + { + "epoch": 5.6967658649273, + "grad_norm": 8.278762817382812, + "learning_rate": 2.882015219459166e-06, + "loss": 2.7508, + "step": 83845 + }, + { + "epoch": 5.697105584997962, + "grad_norm": 6.9666643142700195, + "learning_rate": 2.8815905693708383e-06, + "loss": 2.768, + "step": 83850 + }, + { + "epoch": 5.697445305068624, + "grad_norm": 9.060338020324707, + "learning_rate": 2.8811659192825115e-06, + "loss": 2.7773, + "step": 83855 + }, + { + "epoch": 5.697785025139285, + "grad_norm": 8.668307304382324, + "learning_rate": 2.8807412691941843e-06, + "loss": 2.7918, + "step": 83860 + }, + { + "epoch": 5.698124745209947, + "grad_norm": 9.486851692199707, + "learning_rate": 2.8803166191058567e-06, + "loss": 2.801, + "step": 83865 + }, + { + "epoch": 5.698464465280609, + "grad_norm": 6.889429569244385, + "learning_rate": 2.87989196901753e-06, + "loss": 2.6525, + "step": 83870 + }, + { + "epoch": 5.69880418535127, + "grad_norm": 7.450894355773926, + "learning_rate": 2.8794673189292027e-06, + "loss": 2.8458, + "step": 83875 + }, + { + "epoch": 5.699143905421932, + "grad_norm": 8.439443588256836, + "learning_rate": 2.879042668840875e-06, + "loss": 2.8505, + "step": 83880 + }, + { + "epoch": 5.6994836254925945, + "grad_norm": 8.23160171508789, + "learning_rate": 2.878618018752548e-06, + "loss": 2.6795, + "step": 83885 + }, + { + "epoch": 5.699823345563256, + "grad_norm": 6.69447135925293, + "learning_rate": 2.878193368664221e-06, + "loss": 2.7362, + "step": 83890 + }, + { + "epoch": 5.700163065633918, + "grad_norm": 5.592097759246826, + "learning_rate": 2.8777687185758935e-06, + "loss": 2.7764, + "step": 83895 + }, + { + "epoch": 5.70050278570458, + "grad_norm": 7.384605884552002, + "learning_rate": 2.8773440684875663e-06, + "loss": 2.7155, + "step": 83900 + }, + { + "epoch": 5.700842505775241, + "grad_norm": 7.818335056304932, + "learning_rate": 2.8769194183992395e-06, + "loss": 2.7463, + "step": 83905 + }, + { + "epoch": 5.701182225845903, + "grad_norm": 8.863048553466797, + "learning_rate": 2.876494768310912e-06, + "loss": 2.9436, + "step": 83910 + }, + { + "epoch": 5.701521945916565, + "grad_norm": 6.445924758911133, + "learning_rate": 2.8760701182225847e-06, + "loss": 2.9601, + "step": 83915 + }, + { + "epoch": 5.701861665987226, + "grad_norm": 9.383340835571289, + "learning_rate": 2.875645468134258e-06, + "loss": 2.7695, + "step": 83920 + }, + { + "epoch": 5.702201386057888, + "grad_norm": 7.3359456062316895, + "learning_rate": 2.8752208180459303e-06, + "loss": 2.6807, + "step": 83925 + }, + { + "epoch": 5.70254110612855, + "grad_norm": 7.869028091430664, + "learning_rate": 2.874796167957603e-06, + "loss": 2.7227, + "step": 83930 + }, + { + "epoch": 5.702880826199212, + "grad_norm": 6.687619686126709, + "learning_rate": 2.874371517869276e-06, + "loss": 2.757, + "step": 83935 + }, + { + "epoch": 5.703220546269874, + "grad_norm": 7.497808933258057, + "learning_rate": 2.873946867780949e-06, + "loss": 2.4189, + "step": 83940 + }, + { + "epoch": 5.703560266340535, + "grad_norm": 7.365642070770264, + "learning_rate": 2.8735222176926215e-06, + "loss": 2.9294, + "step": 83945 + }, + { + "epoch": 5.703899986411197, + "grad_norm": 9.057573318481445, + "learning_rate": 2.8730975676042943e-06, + "loss": 2.5738, + "step": 83950 + }, + { + "epoch": 5.704239706481859, + "grad_norm": 8.666080474853516, + "learning_rate": 2.8726729175159675e-06, + "loss": 2.7679, + "step": 83955 + }, + { + "epoch": 5.70457942655252, + "grad_norm": 7.884298801422119, + "learning_rate": 2.87224826742764e-06, + "loss": 2.6151, + "step": 83960 + }, + { + "epoch": 5.704919146623182, + "grad_norm": 8.566354751586914, + "learning_rate": 2.8718236173393127e-06, + "loss": 2.764, + "step": 83965 + }, + { + "epoch": 5.705258866693844, + "grad_norm": 10.087664604187012, + "learning_rate": 2.8713989672509855e-06, + "loss": 2.6273, + "step": 83970 + }, + { + "epoch": 5.705598586764506, + "grad_norm": 7.426565170288086, + "learning_rate": 2.870974317162658e-06, + "loss": 2.8199, + "step": 83975 + }, + { + "epoch": 5.705938306835168, + "grad_norm": 7.0957512855529785, + "learning_rate": 2.870549667074331e-06, + "loss": 2.8333, + "step": 83980 + }, + { + "epoch": 5.70627802690583, + "grad_norm": 6.548980712890625, + "learning_rate": 2.870125016986004e-06, + "loss": 2.7523, + "step": 83985 + }, + { + "epoch": 5.706617746976491, + "grad_norm": 7.435582160949707, + "learning_rate": 2.8697003668976763e-06, + "loss": 2.8588, + "step": 83990 + }, + { + "epoch": 5.706957467047153, + "grad_norm": 8.352128982543945, + "learning_rate": 2.8692757168093495e-06, + "loss": 2.8022, + "step": 83995 + }, + { + "epoch": 5.707297187117815, + "grad_norm": 7.406789779663086, + "learning_rate": 2.8688510667210223e-06, + "loss": 2.6987, + "step": 84000 + }, + { + "epoch": 5.707636907188476, + "grad_norm": 7.733839511871338, + "learning_rate": 2.8684264166326947e-06, + "loss": 2.6735, + "step": 84005 + }, + { + "epoch": 5.707976627259138, + "grad_norm": 7.005251407623291, + "learning_rate": 2.8680017665443675e-06, + "loss": 2.879, + "step": 84010 + }, + { + "epoch": 5.7083163473298, + "grad_norm": 7.279009819030762, + "learning_rate": 2.8675771164560407e-06, + "loss": 2.6372, + "step": 84015 + }, + { + "epoch": 5.708656067400462, + "grad_norm": 7.565399646759033, + "learning_rate": 2.867152466367713e-06, + "loss": 2.814, + "step": 84020 + }, + { + "epoch": 5.708995787471124, + "grad_norm": 7.021165370941162, + "learning_rate": 2.866727816279386e-06, + "loss": 2.7511, + "step": 84025 + }, + { + "epoch": 5.709335507541786, + "grad_norm": 7.857454299926758, + "learning_rate": 2.866303166191059e-06, + "loss": 2.6675, + "step": 84030 + }, + { + "epoch": 5.709675227612447, + "grad_norm": 7.6611127853393555, + "learning_rate": 2.8658785161027315e-06, + "loss": 2.8257, + "step": 84035 + }, + { + "epoch": 5.710014947683109, + "grad_norm": 8.18106746673584, + "learning_rate": 2.8654538660144043e-06, + "loss": 2.9256, + "step": 84040 + }, + { + "epoch": 5.710354667753771, + "grad_norm": 7.827388763427734, + "learning_rate": 2.865029215926077e-06, + "loss": 2.8619, + "step": 84045 + }, + { + "epoch": 5.710694387824432, + "grad_norm": 6.3607964515686035, + "learning_rate": 2.86460456583775e-06, + "loss": 3.1179, + "step": 84050 + }, + { + "epoch": 5.711034107895094, + "grad_norm": 7.989642143249512, + "learning_rate": 2.8641799157494227e-06, + "loss": 2.8779, + "step": 84055 + }, + { + "epoch": 5.711373827965756, + "grad_norm": 8.033563613891602, + "learning_rate": 2.8637552656610955e-06, + "loss": 2.7491, + "step": 84060 + }, + { + "epoch": 5.711713548036418, + "grad_norm": 9.975299835205078, + "learning_rate": 2.863330615572768e-06, + "loss": 2.7546, + "step": 84065 + }, + { + "epoch": 5.71205326810708, + "grad_norm": 8.111858367919922, + "learning_rate": 2.862905965484441e-06, + "loss": 2.7826, + "step": 84070 + }, + { + "epoch": 5.712392988177742, + "grad_norm": 7.832136154174805, + "learning_rate": 2.862481315396114e-06, + "loss": 2.76, + "step": 84075 + }, + { + "epoch": 5.712732708248403, + "grad_norm": 6.548087120056152, + "learning_rate": 2.8620566653077863e-06, + "loss": 3.0675, + "step": 84080 + }, + { + "epoch": 5.713072428319065, + "grad_norm": 6.632147789001465, + "learning_rate": 2.8616320152194595e-06, + "loss": 2.7208, + "step": 84085 + }, + { + "epoch": 5.713412148389727, + "grad_norm": 8.9343843460083, + "learning_rate": 2.8612073651311323e-06, + "loss": 2.7162, + "step": 84090 + }, + { + "epoch": 5.713751868460388, + "grad_norm": 6.607219696044922, + "learning_rate": 2.8607827150428047e-06, + "loss": 2.8795, + "step": 84095 + }, + { + "epoch": 5.71409158853105, + "grad_norm": 5.3700480461120605, + "learning_rate": 2.8603580649544775e-06, + "loss": 2.5892, + "step": 84100 + }, + { + "epoch": 5.7144313086017124, + "grad_norm": 8.111413955688477, + "learning_rate": 2.8599334148661507e-06, + "loss": 2.8262, + "step": 84105 + }, + { + "epoch": 5.714771028672374, + "grad_norm": 7.391913890838623, + "learning_rate": 2.8595087647778235e-06, + "loss": 2.7463, + "step": 84110 + }, + { + "epoch": 5.715110748743036, + "grad_norm": 7.418228626251221, + "learning_rate": 2.859084114689496e-06, + "loss": 2.8592, + "step": 84115 + }, + { + "epoch": 5.715450468813698, + "grad_norm": 7.330922603607178, + "learning_rate": 2.858659464601169e-06, + "loss": 2.9299, + "step": 84120 + }, + { + "epoch": 5.715790188884359, + "grad_norm": 9.183985710144043, + "learning_rate": 2.858234814512842e-06, + "loss": 2.689, + "step": 84125 + }, + { + "epoch": 5.716129908955021, + "grad_norm": 8.87065601348877, + "learning_rate": 2.8578101644245143e-06, + "loss": 2.7121, + "step": 84130 + }, + { + "epoch": 5.716469629025683, + "grad_norm": 7.432700157165527, + "learning_rate": 2.857385514336187e-06, + "loss": 2.8441, + "step": 84135 + }, + { + "epoch": 5.716809349096344, + "grad_norm": 6.082976818084717, + "learning_rate": 2.8569608642478603e-06, + "loss": 2.9575, + "step": 84140 + }, + { + "epoch": 5.717149069167006, + "grad_norm": 9.315159797668457, + "learning_rate": 2.8565362141595327e-06, + "loss": 2.8819, + "step": 84145 + }, + { + "epoch": 5.7174887892376685, + "grad_norm": 6.2086405754089355, + "learning_rate": 2.8561115640712055e-06, + "loss": 2.8717, + "step": 84150 + }, + { + "epoch": 5.71782850930833, + "grad_norm": 6.504182815551758, + "learning_rate": 2.8556869139828787e-06, + "loss": 3.0237, + "step": 84155 + }, + { + "epoch": 5.718168229378992, + "grad_norm": 8.447598457336426, + "learning_rate": 2.855262263894551e-06, + "loss": 2.7157, + "step": 84160 + }, + { + "epoch": 5.718507949449654, + "grad_norm": 6.708861827850342, + "learning_rate": 2.854837613806224e-06, + "loss": 2.9319, + "step": 84165 + }, + { + "epoch": 5.718847669520315, + "grad_norm": 7.545810222625732, + "learning_rate": 2.8544129637178967e-06, + "loss": 2.9187, + "step": 84170 + }, + { + "epoch": 5.719187389590977, + "grad_norm": 11.261640548706055, + "learning_rate": 2.8539883136295695e-06, + "loss": 2.5641, + "step": 84175 + }, + { + "epoch": 5.719527109661639, + "grad_norm": 8.422371864318848, + "learning_rate": 2.8535636635412423e-06, + "loss": 2.6492, + "step": 84180 + }, + { + "epoch": 5.7198668297323, + "grad_norm": 6.7983317375183105, + "learning_rate": 2.853139013452915e-06, + "loss": 2.9488, + "step": 84185 + }, + { + "epoch": 5.720206549802962, + "grad_norm": 6.591261863708496, + "learning_rate": 2.8527143633645875e-06, + "loss": 2.711, + "step": 84190 + }, + { + "epoch": 5.7205462698736245, + "grad_norm": 8.014564514160156, + "learning_rate": 2.8522897132762607e-06, + "loss": 2.7667, + "step": 84195 + }, + { + "epoch": 5.720885989944286, + "grad_norm": 9.056631088256836, + "learning_rate": 2.8518650631879335e-06, + "loss": 2.6922, + "step": 84200 + }, + { + "epoch": 5.721225710014948, + "grad_norm": 6.316252708435059, + "learning_rate": 2.851440413099606e-06, + "loss": 2.6858, + "step": 84205 + }, + { + "epoch": 5.72156543008561, + "grad_norm": 7.724079132080078, + "learning_rate": 2.851015763011279e-06, + "loss": 2.9041, + "step": 84210 + }, + { + "epoch": 5.721905150156271, + "grad_norm": 6.606870174407959, + "learning_rate": 2.850591112922952e-06, + "loss": 2.9042, + "step": 84215 + }, + { + "epoch": 5.722244870226933, + "grad_norm": 8.429875373840332, + "learning_rate": 2.8501664628346243e-06, + "loss": 2.735, + "step": 84220 + }, + { + "epoch": 5.722584590297595, + "grad_norm": 7.0506591796875, + "learning_rate": 2.849741812746297e-06, + "loss": 2.7702, + "step": 84225 + }, + { + "epoch": 5.722924310368256, + "grad_norm": 6.138162136077881, + "learning_rate": 2.8493171626579703e-06, + "loss": 2.8326, + "step": 84230 + }, + { + "epoch": 5.723264030438918, + "grad_norm": 6.664769649505615, + "learning_rate": 2.8488925125696427e-06, + "loss": 2.664, + "step": 84235 + }, + { + "epoch": 5.7236037505095805, + "grad_norm": 6.998845100402832, + "learning_rate": 2.8484678624813155e-06, + "loss": 2.9919, + "step": 84240 + }, + { + "epoch": 5.723943470580242, + "grad_norm": 8.369108200073242, + "learning_rate": 2.8480432123929887e-06, + "loss": 2.833, + "step": 84245 + }, + { + "epoch": 5.724283190650904, + "grad_norm": 7.4409499168396, + "learning_rate": 2.847618562304661e-06, + "loss": 2.755, + "step": 84250 + }, + { + "epoch": 5.724622910721566, + "grad_norm": 7.833133220672607, + "learning_rate": 2.847193912216334e-06, + "loss": 2.6338, + "step": 84255 + }, + { + "epoch": 5.724962630792227, + "grad_norm": 6.471041679382324, + "learning_rate": 2.8467692621280067e-06, + "loss": 2.3958, + "step": 84260 + }, + { + "epoch": 5.725302350862889, + "grad_norm": 6.9669623374938965, + "learning_rate": 2.846344612039679e-06, + "loss": 2.8591, + "step": 84265 + }, + { + "epoch": 5.725642070933551, + "grad_norm": 7.3854475021362305, + "learning_rate": 2.8459199619513523e-06, + "loss": 2.776, + "step": 84270 + }, + { + "epoch": 5.725981791004212, + "grad_norm": 6.827661991119385, + "learning_rate": 2.845495311863025e-06, + "loss": 2.8615, + "step": 84275 + }, + { + "epoch": 5.726321511074874, + "grad_norm": 7.7611894607543945, + "learning_rate": 2.8450706617746983e-06, + "loss": 2.8611, + "step": 84280 + }, + { + "epoch": 5.7266612311455365, + "grad_norm": 8.127013206481934, + "learning_rate": 2.8446460116863707e-06, + "loss": 2.567, + "step": 84285 + }, + { + "epoch": 5.727000951216198, + "grad_norm": 6.270554542541504, + "learning_rate": 2.8442213615980435e-06, + "loss": 2.8016, + "step": 84290 + }, + { + "epoch": 5.72734067128686, + "grad_norm": 6.935439586639404, + "learning_rate": 2.8437967115097163e-06, + "loss": 2.521, + "step": 84295 + }, + { + "epoch": 5.727680391357522, + "grad_norm": 5.742412090301514, + "learning_rate": 2.843372061421389e-06, + "loss": 2.7413, + "step": 84300 + }, + { + "epoch": 5.728020111428183, + "grad_norm": 6.473744869232178, + "learning_rate": 2.842947411333062e-06, + "loss": 2.7921, + "step": 84305 + }, + { + "epoch": 5.728359831498845, + "grad_norm": 9.074492454528809, + "learning_rate": 2.8425227612447347e-06, + "loss": 2.7618, + "step": 84310 + }, + { + "epoch": 5.728699551569507, + "grad_norm": 8.87924861907959, + "learning_rate": 2.842098111156407e-06, + "loss": 2.8072, + "step": 84315 + }, + { + "epoch": 5.729039271640168, + "grad_norm": 9.309487342834473, + "learning_rate": 2.8416734610680803e-06, + "loss": 2.6787, + "step": 84320 + }, + { + "epoch": 5.72937899171083, + "grad_norm": 7.261570930480957, + "learning_rate": 2.841248810979753e-06, + "loss": 2.5532, + "step": 84325 + }, + { + "epoch": 5.7297187117814925, + "grad_norm": 7.4514923095703125, + "learning_rate": 2.8408241608914255e-06, + "loss": 2.6826, + "step": 84330 + }, + { + "epoch": 5.730058431852154, + "grad_norm": 8.856866836547852, + "learning_rate": 2.8403995108030987e-06, + "loss": 2.5853, + "step": 84335 + }, + { + "epoch": 5.730398151922816, + "grad_norm": 5.999508857727051, + "learning_rate": 2.8399748607147715e-06, + "loss": 2.6707, + "step": 84340 + }, + { + "epoch": 5.730737871993478, + "grad_norm": 9.405400276184082, + "learning_rate": 2.839550210626444e-06, + "loss": 2.9615, + "step": 84345 + }, + { + "epoch": 5.731077592064139, + "grad_norm": 7.820446491241455, + "learning_rate": 2.8391255605381167e-06, + "loss": 2.6336, + "step": 84350 + }, + { + "epoch": 5.731417312134801, + "grad_norm": 7.317455291748047, + "learning_rate": 2.83870091044979e-06, + "loss": 2.8695, + "step": 84355 + }, + { + "epoch": 5.731757032205463, + "grad_norm": 7.788919448852539, + "learning_rate": 2.8382762603614623e-06, + "loss": 2.5576, + "step": 84360 + }, + { + "epoch": 5.732096752276124, + "grad_norm": 9.427186012268066, + "learning_rate": 2.837851610273135e-06, + "loss": 2.8092, + "step": 84365 + }, + { + "epoch": 5.732436472346786, + "grad_norm": 8.543972969055176, + "learning_rate": 2.8374269601848083e-06, + "loss": 2.6381, + "step": 84370 + }, + { + "epoch": 5.7327761924174485, + "grad_norm": 7.918887615203857, + "learning_rate": 2.8370023100964807e-06, + "loss": 2.7541, + "step": 84375 + }, + { + "epoch": 5.73311591248811, + "grad_norm": 8.326447486877441, + "learning_rate": 2.8365776600081535e-06, + "loss": 2.7253, + "step": 84380 + }, + { + "epoch": 5.733455632558772, + "grad_norm": 6.758184432983398, + "learning_rate": 2.8361530099198263e-06, + "loss": 2.7475, + "step": 84385 + }, + { + "epoch": 5.733795352629433, + "grad_norm": 8.907609939575195, + "learning_rate": 2.8357283598314987e-06, + "loss": 3.0033, + "step": 84390 + }, + { + "epoch": 5.734135072700095, + "grad_norm": 10.222064018249512, + "learning_rate": 2.835303709743172e-06, + "loss": 3.0337, + "step": 84395 + }, + { + "epoch": 5.734474792770757, + "grad_norm": 7.18264102935791, + "learning_rate": 2.8348790596548447e-06, + "loss": 2.8164, + "step": 84400 + }, + { + "epoch": 5.734814512841418, + "grad_norm": 8.416074752807617, + "learning_rate": 2.834454409566517e-06, + "loss": 2.4531, + "step": 84405 + }, + { + "epoch": 5.73515423291208, + "grad_norm": 6.217318534851074, + "learning_rate": 2.8340297594781903e-06, + "loss": 3.0852, + "step": 84410 + }, + { + "epoch": 5.7354939529827424, + "grad_norm": 7.892551422119141, + "learning_rate": 2.833605109389863e-06, + "loss": 2.8123, + "step": 84415 + }, + { + "epoch": 5.735833673053404, + "grad_norm": 7.401577949523926, + "learning_rate": 2.8331804593015355e-06, + "loss": 2.906, + "step": 84420 + }, + { + "epoch": 5.736173393124066, + "grad_norm": 7.917328357696533, + "learning_rate": 2.8327558092132087e-06, + "loss": 2.9843, + "step": 84425 + }, + { + "epoch": 5.736513113194728, + "grad_norm": 7.14724588394165, + "learning_rate": 2.8323311591248815e-06, + "loss": 2.9168, + "step": 84430 + }, + { + "epoch": 5.736852833265389, + "grad_norm": 8.461077690124512, + "learning_rate": 2.831906509036554e-06, + "loss": 2.76, + "step": 84435 + }, + { + "epoch": 5.737192553336051, + "grad_norm": 6.670105457305908, + "learning_rate": 2.8314818589482267e-06, + "loss": 2.8568, + "step": 84440 + }, + { + "epoch": 5.737532273406713, + "grad_norm": 9.304792404174805, + "learning_rate": 2.8310572088599e-06, + "loss": 2.6031, + "step": 84445 + }, + { + "epoch": 5.737871993477374, + "grad_norm": 8.608020782470703, + "learning_rate": 2.8306325587715727e-06, + "loss": 2.9193, + "step": 84450 + }, + { + "epoch": 5.738211713548036, + "grad_norm": 7.01513147354126, + "learning_rate": 2.830207908683245e-06, + "loss": 2.7287, + "step": 84455 + }, + { + "epoch": 5.7385514336186985, + "grad_norm": 7.134644985198975, + "learning_rate": 2.8297832585949183e-06, + "loss": 2.7837, + "step": 84460 + }, + { + "epoch": 5.73889115368936, + "grad_norm": 9.79334545135498, + "learning_rate": 2.829358608506591e-06, + "loss": 2.8234, + "step": 84465 + }, + { + "epoch": 5.739230873760022, + "grad_norm": 8.882400512695312, + "learning_rate": 2.8289339584182635e-06, + "loss": 2.9055, + "step": 84470 + }, + { + "epoch": 5.739570593830684, + "grad_norm": 7.718000888824463, + "learning_rate": 2.8285093083299363e-06, + "loss": 2.9819, + "step": 84475 + }, + { + "epoch": 5.739910313901345, + "grad_norm": 11.618367195129395, + "learning_rate": 2.8280846582416095e-06, + "loss": 2.7218, + "step": 84480 + }, + { + "epoch": 5.740250033972007, + "grad_norm": 7.012813091278076, + "learning_rate": 2.827660008153282e-06, + "loss": 2.727, + "step": 84485 + }, + { + "epoch": 5.740589754042669, + "grad_norm": 7.786489963531494, + "learning_rate": 2.8272353580649547e-06, + "loss": 2.9225, + "step": 84490 + }, + { + "epoch": 5.74092947411333, + "grad_norm": 5.858476161956787, + "learning_rate": 2.826810707976628e-06, + "loss": 2.8041, + "step": 84495 + }, + { + "epoch": 5.741269194183992, + "grad_norm": 6.987631797790527, + "learning_rate": 2.8263860578883003e-06, + "loss": 2.804, + "step": 84500 + }, + { + "epoch": 5.7416089142546545, + "grad_norm": 7.355350017547607, + "learning_rate": 2.825961407799973e-06, + "loss": 2.529, + "step": 84505 + }, + { + "epoch": 5.741948634325316, + "grad_norm": 8.354860305786133, + "learning_rate": 2.825536757711646e-06, + "loss": 2.5203, + "step": 84510 + }, + { + "epoch": 5.742288354395978, + "grad_norm": 6.569573879241943, + "learning_rate": 2.8251121076233182e-06, + "loss": 2.8616, + "step": 84515 + }, + { + "epoch": 5.74262807446664, + "grad_norm": 7.704877853393555, + "learning_rate": 2.8246874575349915e-06, + "loss": 2.7533, + "step": 84520 + }, + { + "epoch": 5.742967794537301, + "grad_norm": 6.734518051147461, + "learning_rate": 2.8242628074466643e-06, + "loss": 2.8212, + "step": 84525 + }, + { + "epoch": 5.743307514607963, + "grad_norm": 5.810152530670166, + "learning_rate": 2.8238381573583367e-06, + "loss": 2.8804, + "step": 84530 + }, + { + "epoch": 5.743647234678625, + "grad_norm": 8.734076499938965, + "learning_rate": 2.82341350727001e-06, + "loss": 2.7748, + "step": 84535 + }, + { + "epoch": 5.743986954749286, + "grad_norm": 6.845181941986084, + "learning_rate": 2.8229888571816827e-06, + "loss": 2.9205, + "step": 84540 + }, + { + "epoch": 5.744326674819948, + "grad_norm": 8.717354774475098, + "learning_rate": 2.822564207093355e-06, + "loss": 2.8679, + "step": 84545 + }, + { + "epoch": 5.7446663948906105, + "grad_norm": 8.337745666503906, + "learning_rate": 2.822139557005028e-06, + "loss": 2.7947, + "step": 84550 + }, + { + "epoch": 5.745006114961272, + "grad_norm": 8.533824920654297, + "learning_rate": 2.821714906916701e-06, + "loss": 2.7341, + "step": 84555 + }, + { + "epoch": 5.745345835031934, + "grad_norm": 7.056649684906006, + "learning_rate": 2.8212902568283735e-06, + "loss": 2.8545, + "step": 84560 + }, + { + "epoch": 5.745685555102596, + "grad_norm": 9.61431884765625, + "learning_rate": 2.8208656067400463e-06, + "loss": 2.7622, + "step": 84565 + }, + { + "epoch": 5.746025275173257, + "grad_norm": 6.9288153648376465, + "learning_rate": 2.8204409566517195e-06, + "loss": 2.9645, + "step": 84570 + }, + { + "epoch": 5.746364995243919, + "grad_norm": 8.629104614257812, + "learning_rate": 2.820016306563392e-06, + "loss": 2.8688, + "step": 84575 + }, + { + "epoch": 5.746704715314581, + "grad_norm": 7.375525951385498, + "learning_rate": 2.8195916564750647e-06, + "loss": 2.6742, + "step": 84580 + }, + { + "epoch": 5.747044435385242, + "grad_norm": 9.019692420959473, + "learning_rate": 2.819167006386738e-06, + "loss": 2.8595, + "step": 84585 + }, + { + "epoch": 5.747384155455904, + "grad_norm": 8.31320571899414, + "learning_rate": 2.8187423562984103e-06, + "loss": 3.1157, + "step": 84590 + }, + { + "epoch": 5.7477238755265665, + "grad_norm": 7.770040988922119, + "learning_rate": 2.818317706210083e-06, + "loss": 2.757, + "step": 84595 + }, + { + "epoch": 5.748063595597228, + "grad_norm": 7.071457862854004, + "learning_rate": 2.817893056121756e-06, + "loss": 2.5238, + "step": 84600 + }, + { + "epoch": 5.74840331566789, + "grad_norm": 6.415531158447266, + "learning_rate": 2.8174684060334282e-06, + "loss": 2.8689, + "step": 84605 + }, + { + "epoch": 5.748743035738551, + "grad_norm": 6.796010971069336, + "learning_rate": 2.8170437559451015e-06, + "loss": 2.6151, + "step": 84610 + }, + { + "epoch": 5.749082755809213, + "grad_norm": 7.555593967437744, + "learning_rate": 2.8166191058567743e-06, + "loss": 2.7758, + "step": 84615 + }, + { + "epoch": 5.749422475879875, + "grad_norm": 6.829349994659424, + "learning_rate": 2.8161944557684475e-06, + "loss": 2.729, + "step": 84620 + }, + { + "epoch": 5.749762195950536, + "grad_norm": 6.897397994995117, + "learning_rate": 2.81576980568012e-06, + "loss": 2.8059, + "step": 84625 + }, + { + "epoch": 5.750101916021198, + "grad_norm": 9.732126235961914, + "learning_rate": 2.8153451555917927e-06, + "loss": 2.7873, + "step": 84630 + }, + { + "epoch": 5.75044163609186, + "grad_norm": 7.291754245758057, + "learning_rate": 2.8149205055034655e-06, + "loss": 2.5636, + "step": 84635 + }, + { + "epoch": 5.750781356162522, + "grad_norm": 7.011735439300537, + "learning_rate": 2.814495855415138e-06, + "loss": 2.6734, + "step": 84640 + }, + { + "epoch": 5.751121076233184, + "grad_norm": 6.008299350738525, + "learning_rate": 2.814071205326811e-06, + "loss": 2.7395, + "step": 84645 + }, + { + "epoch": 5.751460796303846, + "grad_norm": 6.199178695678711, + "learning_rate": 2.813646555238484e-06, + "loss": 2.5911, + "step": 84650 + }, + { + "epoch": 5.751800516374507, + "grad_norm": 7.674642562866211, + "learning_rate": 2.8132219051501562e-06, + "loss": 2.8145, + "step": 84655 + }, + { + "epoch": 5.752140236445169, + "grad_norm": 7.042503833770752, + "learning_rate": 2.8127972550618295e-06, + "loss": 3.0034, + "step": 84660 + }, + { + "epoch": 5.752479956515831, + "grad_norm": 7.341274261474609, + "learning_rate": 2.8123726049735023e-06, + "loss": 2.6875, + "step": 84665 + }, + { + "epoch": 5.752819676586492, + "grad_norm": 8.10067081451416, + "learning_rate": 2.8119479548851746e-06, + "loss": 2.8427, + "step": 84670 + }, + { + "epoch": 5.753159396657154, + "grad_norm": 7.308704853057861, + "learning_rate": 2.8115233047968474e-06, + "loss": 2.7068, + "step": 84675 + }, + { + "epoch": 5.753499116727816, + "grad_norm": 9.821982383728027, + "learning_rate": 2.8110986547085207e-06, + "loss": 2.8214, + "step": 84680 + }, + { + "epoch": 5.753838836798478, + "grad_norm": 7.210488796234131, + "learning_rate": 2.810674004620193e-06, + "loss": 2.6139, + "step": 84685 + }, + { + "epoch": 5.75417855686914, + "grad_norm": 8.084369659423828, + "learning_rate": 2.810249354531866e-06, + "loss": 2.8181, + "step": 84690 + }, + { + "epoch": 5.754518276939802, + "grad_norm": 8.417155265808105, + "learning_rate": 2.809824704443539e-06, + "loss": 2.8257, + "step": 84695 + }, + { + "epoch": 5.754857997010463, + "grad_norm": 10.576476097106934, + "learning_rate": 2.8094000543552115e-06, + "loss": 2.5623, + "step": 84700 + }, + { + "epoch": 5.755197717081125, + "grad_norm": 8.660144805908203, + "learning_rate": 2.8089754042668843e-06, + "loss": 2.7067, + "step": 84705 + }, + { + "epoch": 5.755537437151787, + "grad_norm": 8.477875709533691, + "learning_rate": 2.8085507541785575e-06, + "loss": 2.7524, + "step": 84710 + }, + { + "epoch": 5.755877157222448, + "grad_norm": 8.679234504699707, + "learning_rate": 2.80812610409023e-06, + "loss": 2.376, + "step": 84715 + }, + { + "epoch": 5.75621687729311, + "grad_norm": 8.776702880859375, + "learning_rate": 2.8077014540019027e-06, + "loss": 2.7684, + "step": 84720 + }, + { + "epoch": 5.7565565973637725, + "grad_norm": 8.412358283996582, + "learning_rate": 2.8072768039135755e-06, + "loss": 2.6379, + "step": 84725 + }, + { + "epoch": 5.756896317434434, + "grad_norm": 7.33870792388916, + "learning_rate": 2.806852153825248e-06, + "loss": 2.582, + "step": 84730 + }, + { + "epoch": 5.757236037505096, + "grad_norm": 6.756582736968994, + "learning_rate": 2.806427503736921e-06, + "loss": 2.6783, + "step": 84735 + }, + { + "epoch": 5.757575757575758, + "grad_norm": 7.9137444496154785, + "learning_rate": 2.806002853648594e-06, + "loss": 2.772, + "step": 84740 + }, + { + "epoch": 5.757915477646419, + "grad_norm": 9.586688041687012, + "learning_rate": 2.8055782035602662e-06, + "loss": 2.6622, + "step": 84745 + }, + { + "epoch": 5.758255197717081, + "grad_norm": 6.017298698425293, + "learning_rate": 2.8051535534719395e-06, + "loss": 2.8222, + "step": 84750 + }, + { + "epoch": 5.758594917787743, + "grad_norm": 6.88961935043335, + "learning_rate": 2.8047289033836123e-06, + "loss": 3.1606, + "step": 84755 + }, + { + "epoch": 5.758934637858404, + "grad_norm": 8.294904708862305, + "learning_rate": 2.8043042532952846e-06, + "loss": 2.6521, + "step": 84760 + }, + { + "epoch": 5.759274357929066, + "grad_norm": 8.282238006591797, + "learning_rate": 2.8038796032069574e-06, + "loss": 2.9166, + "step": 84765 + }, + { + "epoch": 5.7596140779997285, + "grad_norm": 7.014315605163574, + "learning_rate": 2.8034549531186307e-06, + "loss": 2.6437, + "step": 84770 + }, + { + "epoch": 5.75995379807039, + "grad_norm": 7.715274810791016, + "learning_rate": 2.803030303030303e-06, + "loss": 2.7047, + "step": 84775 + }, + { + "epoch": 5.760293518141052, + "grad_norm": 8.4244966506958, + "learning_rate": 2.802605652941976e-06, + "loss": 2.9988, + "step": 84780 + }, + { + "epoch": 5.760633238211714, + "grad_norm": 7.514344215393066, + "learning_rate": 2.802181002853649e-06, + "loss": 2.8157, + "step": 84785 + }, + { + "epoch": 5.760972958282375, + "grad_norm": 7.145947456359863, + "learning_rate": 2.801756352765322e-06, + "loss": 2.6252, + "step": 84790 + }, + { + "epoch": 5.761312678353037, + "grad_norm": 8.041818618774414, + "learning_rate": 2.8013317026769942e-06, + "loss": 2.7256, + "step": 84795 + }, + { + "epoch": 5.761652398423699, + "grad_norm": 7.584071159362793, + "learning_rate": 2.800907052588667e-06, + "loss": 2.6478, + "step": 84800 + }, + { + "epoch": 5.76199211849436, + "grad_norm": 8.966475486755371, + "learning_rate": 2.8004824025003403e-06, + "loss": 2.4629, + "step": 84805 + }, + { + "epoch": 5.762331838565022, + "grad_norm": 6.979758262634277, + "learning_rate": 2.8000577524120126e-06, + "loss": 2.6305, + "step": 84810 + }, + { + "epoch": 5.7626715586356845, + "grad_norm": 7.955801010131836, + "learning_rate": 2.7996331023236854e-06, + "loss": 2.9586, + "step": 84815 + }, + { + "epoch": 5.763011278706346, + "grad_norm": 7.563728332519531, + "learning_rate": 2.7992084522353587e-06, + "loss": 2.9845, + "step": 84820 + }, + { + "epoch": 5.763350998777008, + "grad_norm": 6.332408905029297, + "learning_rate": 2.798783802147031e-06, + "loss": 2.6482, + "step": 84825 + }, + { + "epoch": 5.76369071884767, + "grad_norm": 8.76204776763916, + "learning_rate": 2.798359152058704e-06, + "loss": 2.9301, + "step": 84830 + }, + { + "epoch": 5.764030438918331, + "grad_norm": 6.91094970703125, + "learning_rate": 2.7979345019703767e-06, + "loss": 2.7457, + "step": 84835 + }, + { + "epoch": 5.764370158988993, + "grad_norm": 7.343262672424316, + "learning_rate": 2.7975098518820495e-06, + "loss": 2.7577, + "step": 84840 + }, + { + "epoch": 5.764709879059655, + "grad_norm": 7.255947589874268, + "learning_rate": 2.7970852017937223e-06, + "loss": 2.7611, + "step": 84845 + }, + { + "epoch": 5.765049599130316, + "grad_norm": 7.214290142059326, + "learning_rate": 2.796660551705395e-06, + "loss": 2.7295, + "step": 84850 + }, + { + "epoch": 5.765389319200978, + "grad_norm": 7.698740482330322, + "learning_rate": 2.7962359016170674e-06, + "loss": 2.8893, + "step": 84855 + }, + { + "epoch": 5.7657290392716405, + "grad_norm": 6.703338623046875, + "learning_rate": 2.7958112515287407e-06, + "loss": 2.8343, + "step": 84860 + }, + { + "epoch": 5.766068759342302, + "grad_norm": 6.36066198348999, + "learning_rate": 2.7953866014404135e-06, + "loss": 2.8507, + "step": 84865 + }, + { + "epoch": 5.766408479412964, + "grad_norm": 6.419217109680176, + "learning_rate": 2.794961951352086e-06, + "loss": 2.8612, + "step": 84870 + }, + { + "epoch": 5.766748199483626, + "grad_norm": 9.339496612548828, + "learning_rate": 2.794537301263759e-06, + "loss": 2.8677, + "step": 84875 + }, + { + "epoch": 5.767087919554287, + "grad_norm": 5.368003845214844, + "learning_rate": 2.794112651175432e-06, + "loss": 2.7382, + "step": 84880 + }, + { + "epoch": 5.767427639624949, + "grad_norm": 7.22109842300415, + "learning_rate": 2.7936880010871042e-06, + "loss": 2.6457, + "step": 84885 + }, + { + "epoch": 5.767767359695611, + "grad_norm": 8.075758934020996, + "learning_rate": 2.793263350998777e-06, + "loss": 2.8082, + "step": 84890 + }, + { + "epoch": 5.768107079766272, + "grad_norm": 7.266218185424805, + "learning_rate": 2.7928387009104503e-06, + "loss": 2.6927, + "step": 84895 + }, + { + "epoch": 5.768446799836934, + "grad_norm": 6.997739315032959, + "learning_rate": 2.7924140508221226e-06, + "loss": 2.8216, + "step": 84900 + }, + { + "epoch": 5.7687865199075965, + "grad_norm": 6.6721296310424805, + "learning_rate": 2.7919894007337954e-06, + "loss": 2.9024, + "step": 84905 + }, + { + "epoch": 5.769126239978258, + "grad_norm": 6.181339740753174, + "learning_rate": 2.7915647506454687e-06, + "loss": 2.8039, + "step": 84910 + }, + { + "epoch": 5.76946596004892, + "grad_norm": 9.358875274658203, + "learning_rate": 2.791140100557141e-06, + "loss": 2.7431, + "step": 84915 + }, + { + "epoch": 5.769805680119582, + "grad_norm": 7.764708995819092, + "learning_rate": 2.790715450468814e-06, + "loss": 2.7759, + "step": 84920 + }, + { + "epoch": 5.770145400190243, + "grad_norm": 8.57807731628418, + "learning_rate": 2.7902908003804866e-06, + "loss": 2.9879, + "step": 84925 + }, + { + "epoch": 5.770485120260905, + "grad_norm": 6.936430931091309, + "learning_rate": 2.7898661502921594e-06, + "loss": 2.8794, + "step": 84930 + }, + { + "epoch": 5.770824840331567, + "grad_norm": 7.841735363006592, + "learning_rate": 2.7894415002038322e-06, + "loss": 2.9798, + "step": 84935 + }, + { + "epoch": 5.771164560402228, + "grad_norm": 7.267552852630615, + "learning_rate": 2.789016850115505e-06, + "loss": 2.9087, + "step": 84940 + }, + { + "epoch": 5.77150428047289, + "grad_norm": 6.9451212882995605, + "learning_rate": 2.7885922000271774e-06, + "loss": 2.8198, + "step": 84945 + }, + { + "epoch": 5.7718440005435525, + "grad_norm": 8.016003608703613, + "learning_rate": 2.7881675499388506e-06, + "loss": 2.9141, + "step": 84950 + }, + { + "epoch": 5.772183720614214, + "grad_norm": 5.446606636047363, + "learning_rate": 2.7877428998505234e-06, + "loss": 2.8009, + "step": 84955 + }, + { + "epoch": 5.772523440684876, + "grad_norm": 7.377190589904785, + "learning_rate": 2.7873182497621962e-06, + "loss": 2.8466, + "step": 84960 + }, + { + "epoch": 5.772863160755538, + "grad_norm": 6.50379753112793, + "learning_rate": 2.786893599673869e-06, + "loss": 2.6528, + "step": 84965 + }, + { + "epoch": 5.773202880826199, + "grad_norm": 6.676731586456299, + "learning_rate": 2.786468949585542e-06, + "loss": 2.7483, + "step": 84970 + }, + { + "epoch": 5.773542600896861, + "grad_norm": 7.7723002433776855, + "learning_rate": 2.7860442994972146e-06, + "loss": 2.784, + "step": 84975 + }, + { + "epoch": 5.773882320967523, + "grad_norm": 7.919195652008057, + "learning_rate": 2.785619649408887e-06, + "loss": 2.7628, + "step": 84980 + }, + { + "epoch": 5.774222041038184, + "grad_norm": 7.771161079406738, + "learning_rate": 2.7851949993205603e-06, + "loss": 2.6317, + "step": 84985 + }, + { + "epoch": 5.7745617611088464, + "grad_norm": 7.9606194496154785, + "learning_rate": 2.784770349232233e-06, + "loss": 2.8994, + "step": 84990 + }, + { + "epoch": 5.7749014811795085, + "grad_norm": 7.994619369506836, + "learning_rate": 2.7843456991439054e-06, + "loss": 2.7397, + "step": 84995 + }, + { + "epoch": 5.77524120125017, + "grad_norm": 6.148029804229736, + "learning_rate": 2.7839210490555787e-06, + "loss": 2.8242, + "step": 85000 + }, + { + "epoch": 5.775580921320832, + "grad_norm": 6.597979545593262, + "learning_rate": 2.7834963989672515e-06, + "loss": 2.6168, + "step": 85005 + }, + { + "epoch": 5.775920641391494, + "grad_norm": 7.667588233947754, + "learning_rate": 2.783071748878924e-06, + "loss": 2.8171, + "step": 85010 + }, + { + "epoch": 5.776260361462155, + "grad_norm": 6.917665004730225, + "learning_rate": 2.7826470987905966e-06, + "loss": 2.9291, + "step": 85015 + }, + { + "epoch": 5.776600081532817, + "grad_norm": 7.404199600219727, + "learning_rate": 2.78222244870227e-06, + "loss": 2.8043, + "step": 85020 + }, + { + "epoch": 5.776939801603479, + "grad_norm": 7.339073657989502, + "learning_rate": 2.7817977986139422e-06, + "loss": 2.7122, + "step": 85025 + }, + { + "epoch": 5.77727952167414, + "grad_norm": 6.695223808288574, + "learning_rate": 2.781373148525615e-06, + "loss": 2.889, + "step": 85030 + }, + { + "epoch": 5.7776192417448025, + "grad_norm": 7.255655765533447, + "learning_rate": 2.7809484984372883e-06, + "loss": 2.8825, + "step": 85035 + }, + { + "epoch": 5.7779589618154645, + "grad_norm": 9.343147277832031, + "learning_rate": 2.7805238483489606e-06, + "loss": 2.6436, + "step": 85040 + }, + { + "epoch": 5.778298681886126, + "grad_norm": 6.885312080383301, + "learning_rate": 2.7800991982606334e-06, + "loss": 3.0175, + "step": 85045 + }, + { + "epoch": 5.778638401956788, + "grad_norm": 9.337586402893066, + "learning_rate": 2.7796745481723062e-06, + "loss": 2.5659, + "step": 85050 + }, + { + "epoch": 5.77897812202745, + "grad_norm": 6.947477340698242, + "learning_rate": 2.7792498980839786e-06, + "loss": 2.8406, + "step": 85055 + }, + { + "epoch": 5.779317842098111, + "grad_norm": 8.611851692199707, + "learning_rate": 2.778825247995652e-06, + "loss": 2.6915, + "step": 85060 + }, + { + "epoch": 5.779657562168773, + "grad_norm": 7.61469841003418, + "learning_rate": 2.7784005979073246e-06, + "loss": 2.9202, + "step": 85065 + }, + { + "epoch": 5.779997282239434, + "grad_norm": 6.629349708557129, + "learning_rate": 2.777975947818997e-06, + "loss": 2.5739, + "step": 85070 + }, + { + "epoch": 5.780337002310096, + "grad_norm": 6.849355220794678, + "learning_rate": 2.7775512977306702e-06, + "loss": 2.6268, + "step": 85075 + }, + { + "epoch": 5.7806767223807585, + "grad_norm": 7.140292167663574, + "learning_rate": 2.777126647642343e-06, + "loss": 2.8175, + "step": 85080 + }, + { + "epoch": 5.78101644245142, + "grad_norm": 8.322524070739746, + "learning_rate": 2.7767019975540154e-06, + "loss": 2.6769, + "step": 85085 + }, + { + "epoch": 5.781356162522082, + "grad_norm": 7.778590202331543, + "learning_rate": 2.7762773474656886e-06, + "loss": 2.8541, + "step": 85090 + }, + { + "epoch": 5.781695882592744, + "grad_norm": 7.834430694580078, + "learning_rate": 2.7758526973773614e-06, + "loss": 2.9301, + "step": 85095 + }, + { + "epoch": 5.782035602663405, + "grad_norm": 7.224252223968506, + "learning_rate": 2.775428047289034e-06, + "loss": 2.6512, + "step": 85100 + }, + { + "epoch": 5.782375322734067, + "grad_norm": 9.052635192871094, + "learning_rate": 2.7750033972007066e-06, + "loss": 2.6369, + "step": 85105 + }, + { + "epoch": 5.782715042804729, + "grad_norm": 6.193243980407715, + "learning_rate": 2.77457874711238e-06, + "loss": 2.886, + "step": 85110 + }, + { + "epoch": 5.78305476287539, + "grad_norm": 7.197983264923096, + "learning_rate": 2.7741540970240522e-06, + "loss": 2.8135, + "step": 85115 + }, + { + "epoch": 5.783394482946052, + "grad_norm": 9.664298057556152, + "learning_rate": 2.773729446935725e-06, + "loss": 2.6827, + "step": 85120 + }, + { + "epoch": 5.7837342030167145, + "grad_norm": 6.876417636871338, + "learning_rate": 2.7733047968473982e-06, + "loss": 2.8272, + "step": 85125 + }, + { + "epoch": 5.784073923087376, + "grad_norm": 8.095561027526855, + "learning_rate": 2.772880146759071e-06, + "loss": 2.6999, + "step": 85130 + }, + { + "epoch": 5.784413643158038, + "grad_norm": 7.325069904327393, + "learning_rate": 2.7724554966707434e-06, + "loss": 3.1114, + "step": 85135 + }, + { + "epoch": 5.7847533632287, + "grad_norm": 6.26387357711792, + "learning_rate": 2.7720308465824162e-06, + "loss": 2.7037, + "step": 85140 + }, + { + "epoch": 5.785093083299361, + "grad_norm": 7.3378520011901855, + "learning_rate": 2.7716061964940895e-06, + "loss": 2.7752, + "step": 85145 + }, + { + "epoch": 5.785432803370023, + "grad_norm": 7.165867328643799, + "learning_rate": 2.771181546405762e-06, + "loss": 2.6853, + "step": 85150 + }, + { + "epoch": 5.785772523440685, + "grad_norm": 8.451613426208496, + "learning_rate": 2.7707568963174346e-06, + "loss": 2.7306, + "step": 85155 + }, + { + "epoch": 5.786112243511346, + "grad_norm": 7.666119575500488, + "learning_rate": 2.770332246229108e-06, + "loss": 2.8281, + "step": 85160 + }, + { + "epoch": 5.786451963582008, + "grad_norm": 7.321788311004639, + "learning_rate": 2.7699075961407802e-06, + "loss": 2.7309, + "step": 85165 + }, + { + "epoch": 5.7867916836526705, + "grad_norm": 6.863550186157227, + "learning_rate": 2.769482946052453e-06, + "loss": 2.8431, + "step": 85170 + }, + { + "epoch": 5.787131403723332, + "grad_norm": 7.204655647277832, + "learning_rate": 2.769058295964126e-06, + "loss": 2.756, + "step": 85175 + }, + { + "epoch": 5.787471123793994, + "grad_norm": 7.069775581359863, + "learning_rate": 2.768633645875798e-06, + "loss": 3.0407, + "step": 85180 + }, + { + "epoch": 5.787810843864656, + "grad_norm": 7.036509990692139, + "learning_rate": 2.7682089957874714e-06, + "loss": 2.7575, + "step": 85185 + }, + { + "epoch": 5.788150563935317, + "grad_norm": 7.116365909576416, + "learning_rate": 2.7677843456991442e-06, + "loss": 3.0447, + "step": 85190 + }, + { + "epoch": 5.788490284005979, + "grad_norm": 7.5289812088012695, + "learning_rate": 2.7673596956108166e-06, + "loss": 2.8606, + "step": 85195 + }, + { + "epoch": 5.788830004076641, + "grad_norm": 9.049714088439941, + "learning_rate": 2.76693504552249e-06, + "loss": 2.7181, + "step": 85200 + }, + { + "epoch": 5.789169724147302, + "grad_norm": 8.263801574707031, + "learning_rate": 2.7665103954341626e-06, + "loss": 2.907, + "step": 85205 + }, + { + "epoch": 5.789509444217964, + "grad_norm": 9.158549308776855, + "learning_rate": 2.766085745345835e-06, + "loss": 2.9816, + "step": 85210 + }, + { + "epoch": 5.7898491642886265, + "grad_norm": 8.062713623046875, + "learning_rate": 2.7656610952575082e-06, + "loss": 2.6523, + "step": 85215 + }, + { + "epoch": 5.790188884359288, + "grad_norm": 7.8338165283203125, + "learning_rate": 2.765236445169181e-06, + "loss": 2.8996, + "step": 85220 + }, + { + "epoch": 5.79052860442995, + "grad_norm": 8.223831176757812, + "learning_rate": 2.7648117950808534e-06, + "loss": 2.6504, + "step": 85225 + }, + { + "epoch": 5.790868324500612, + "grad_norm": Infinity, + "learning_rate": 2.7644720750101915e-06, + "loss": 2.9174, + "step": 85230 + }, + { + "epoch": 5.791208044571273, + "grad_norm": 7.684545516967773, + "learning_rate": 2.7640474249218647e-06, + "loss": 2.6727, + "step": 85235 + }, + { + "epoch": 5.791547764641935, + "grad_norm": 8.192809104919434, + "learning_rate": 2.7636227748335375e-06, + "loss": 2.7912, + "step": 85240 + }, + { + "epoch": 5.791887484712597, + "grad_norm": 8.371162414550781, + "learning_rate": 2.76319812474521e-06, + "loss": 2.9794, + "step": 85245 + }, + { + "epoch": 5.792227204783258, + "grad_norm": 9.18525218963623, + "learning_rate": 2.762773474656883e-06, + "loss": 2.7714, + "step": 85250 + }, + { + "epoch": 5.79256692485392, + "grad_norm": 7.8003668785095215, + "learning_rate": 2.762348824568556e-06, + "loss": 2.7322, + "step": 85255 + }, + { + "epoch": 5.7929066449245825, + "grad_norm": 9.416179656982422, + "learning_rate": 2.7619241744802283e-06, + "loss": 3.0453, + "step": 85260 + }, + { + "epoch": 5.793246364995244, + "grad_norm": 7.30811071395874, + "learning_rate": 2.761499524391901e-06, + "loss": 2.6706, + "step": 85265 + }, + { + "epoch": 5.793586085065906, + "grad_norm": 10.870594024658203, + "learning_rate": 2.7610748743035743e-06, + "loss": 2.6085, + "step": 85270 + }, + { + "epoch": 5.793925805136568, + "grad_norm": 5.971461772918701, + "learning_rate": 2.7606502242152467e-06, + "loss": 2.7634, + "step": 85275 + }, + { + "epoch": 5.794265525207229, + "grad_norm": 8.036794662475586, + "learning_rate": 2.7602255741269195e-06, + "loss": 2.9577, + "step": 85280 + }, + { + "epoch": 5.794605245277891, + "grad_norm": 6.983469486236572, + "learning_rate": 2.7598009240385927e-06, + "loss": 2.9708, + "step": 85285 + }, + { + "epoch": 5.794944965348552, + "grad_norm": 6.733338356018066, + "learning_rate": 2.759376273950265e-06, + "loss": 2.8255, + "step": 85290 + }, + { + "epoch": 5.795284685419214, + "grad_norm": 8.562172889709473, + "learning_rate": 2.758951623861938e-06, + "loss": 2.9211, + "step": 85295 + }, + { + "epoch": 5.7956244054898765, + "grad_norm": 7.859045028686523, + "learning_rate": 2.7585269737736107e-06, + "loss": 2.8566, + "step": 85300 + }, + { + "epoch": 5.795964125560538, + "grad_norm": 9.150776863098145, + "learning_rate": 2.7581023236852835e-06, + "loss": 2.907, + "step": 85305 + }, + { + "epoch": 5.7963038456312, + "grad_norm": 8.238774299621582, + "learning_rate": 2.7576776735969563e-06, + "loss": 2.8946, + "step": 85310 + }, + { + "epoch": 5.796643565701862, + "grad_norm": 6.92051362991333, + "learning_rate": 2.757253023508629e-06, + "loss": 2.6602, + "step": 85315 + }, + { + "epoch": 5.796983285772523, + "grad_norm": 7.7333807945251465, + "learning_rate": 2.7568283734203015e-06, + "loss": 2.5933, + "step": 85320 + }, + { + "epoch": 5.797323005843185, + "grad_norm": 5.688127040863037, + "learning_rate": 2.7564037233319747e-06, + "loss": 2.5917, + "step": 85325 + }, + { + "epoch": 5.797662725913847, + "grad_norm": 7.162230491638184, + "learning_rate": 2.7559790732436475e-06, + "loss": 2.8094, + "step": 85330 + }, + { + "epoch": 5.798002445984508, + "grad_norm": 6.0815653800964355, + "learning_rate": 2.7555544231553203e-06, + "loss": 2.8562, + "step": 85335 + }, + { + "epoch": 5.79834216605517, + "grad_norm": 8.30396842956543, + "learning_rate": 2.755129773066993e-06, + "loss": 2.6115, + "step": 85340 + }, + { + "epoch": 5.7986818861258325, + "grad_norm": 9.431504249572754, + "learning_rate": 2.754705122978666e-06, + "loss": 2.6971, + "step": 85345 + }, + { + "epoch": 5.799021606196494, + "grad_norm": 10.239717483520508, + "learning_rate": 2.7542804728903387e-06, + "loss": 2.7615, + "step": 85350 + }, + { + "epoch": 5.799361326267156, + "grad_norm": 7.172730445861816, + "learning_rate": 2.753855822802011e-06, + "loss": 2.8732, + "step": 85355 + }, + { + "epoch": 5.799701046337818, + "grad_norm": 8.168627738952637, + "learning_rate": 2.7534311727136843e-06, + "loss": 2.7661, + "step": 85360 + }, + { + "epoch": 5.800040766408479, + "grad_norm": 8.183281898498535, + "learning_rate": 2.753006522625357e-06, + "loss": 2.6492, + "step": 85365 + }, + { + "epoch": 5.800380486479141, + "grad_norm": 9.967691421508789, + "learning_rate": 2.7525818725370295e-06, + "loss": 2.4099, + "step": 85370 + }, + { + "epoch": 5.800720206549803, + "grad_norm": 8.85660457611084, + "learning_rate": 2.7521572224487027e-06, + "loss": 2.7954, + "step": 85375 + }, + { + "epoch": 5.801059926620464, + "grad_norm": 7.019636154174805, + "learning_rate": 2.7517325723603755e-06, + "loss": 2.6784, + "step": 85380 + }, + { + "epoch": 5.801399646691126, + "grad_norm": 7.181417942047119, + "learning_rate": 2.751307922272048e-06, + "loss": 2.8407, + "step": 85385 + }, + { + "epoch": 5.8017393667617885, + "grad_norm": 7.087803363800049, + "learning_rate": 2.7508832721837207e-06, + "loss": 2.7687, + "step": 85390 + }, + { + "epoch": 5.80207908683245, + "grad_norm": 8.187827110290527, + "learning_rate": 2.750458622095394e-06, + "loss": 3.0071, + "step": 85395 + }, + { + "epoch": 5.802418806903112, + "grad_norm": 7.706393241882324, + "learning_rate": 2.7500339720070663e-06, + "loss": 2.7419, + "step": 85400 + }, + { + "epoch": 5.802758526973774, + "grad_norm": 7.895090103149414, + "learning_rate": 2.749609321918739e-06, + "loss": 2.9004, + "step": 85405 + }, + { + "epoch": 5.803098247044435, + "grad_norm": 8.733431816101074, + "learning_rate": 2.7491846718304123e-06, + "loss": 2.8008, + "step": 85410 + }, + { + "epoch": 5.803437967115097, + "grad_norm": 10.301295280456543, + "learning_rate": 2.7487600217420847e-06, + "loss": 2.6334, + "step": 85415 + }, + { + "epoch": 5.803777687185759, + "grad_norm": 6.06310510635376, + "learning_rate": 2.7483353716537575e-06, + "loss": 2.7202, + "step": 85420 + }, + { + "epoch": 5.80411740725642, + "grad_norm": 8.51274299621582, + "learning_rate": 2.7479107215654303e-06, + "loss": 2.5423, + "step": 85425 + }, + { + "epoch": 5.804457127327082, + "grad_norm": 7.689414978027344, + "learning_rate": 2.747486071477103e-06, + "loss": 2.5693, + "step": 85430 + }, + { + "epoch": 5.8047968473977445, + "grad_norm": 10.237645149230957, + "learning_rate": 2.747061421388776e-06, + "loss": 3.1087, + "step": 85435 + }, + { + "epoch": 5.805136567468406, + "grad_norm": 7.429971218109131, + "learning_rate": 2.7466367713004487e-06, + "loss": 2.6984, + "step": 85440 + }, + { + "epoch": 5.805476287539068, + "grad_norm": 7.597606658935547, + "learning_rate": 2.746212121212121e-06, + "loss": 2.7715, + "step": 85445 + }, + { + "epoch": 5.80581600760973, + "grad_norm": 12.182013511657715, + "learning_rate": 2.7457874711237943e-06, + "loss": 2.8187, + "step": 85450 + }, + { + "epoch": 5.806155727680391, + "grad_norm": 8.154048919677734, + "learning_rate": 2.745362821035467e-06, + "loss": 2.8969, + "step": 85455 + }, + { + "epoch": 5.806495447751053, + "grad_norm": 7.256857395172119, + "learning_rate": 2.7449381709471395e-06, + "loss": 2.7423, + "step": 85460 + }, + { + "epoch": 5.806835167821715, + "grad_norm": 6.989985942840576, + "learning_rate": 2.7445135208588127e-06, + "loss": 2.8111, + "step": 85465 + }, + { + "epoch": 5.807174887892376, + "grad_norm": 9.090336799621582, + "learning_rate": 2.7440888707704855e-06, + "loss": 2.8683, + "step": 85470 + }, + { + "epoch": 5.807514607963038, + "grad_norm": 7.142548084259033, + "learning_rate": 2.743664220682158e-06, + "loss": 2.8497, + "step": 85475 + }, + { + "epoch": 5.8078543280337005, + "grad_norm": 6.512356758117676, + "learning_rate": 2.7432395705938307e-06, + "loss": 2.8717, + "step": 85480 + }, + { + "epoch": 5.808194048104362, + "grad_norm": 7.503692626953125, + "learning_rate": 2.742814920505504e-06, + "loss": 2.9588, + "step": 85485 + }, + { + "epoch": 5.808533768175024, + "grad_norm": 6.192331790924072, + "learning_rate": 2.7423902704171763e-06, + "loss": 2.6566, + "step": 85490 + }, + { + "epoch": 5.808873488245686, + "grad_norm": 6.47369384765625, + "learning_rate": 2.741965620328849e-06, + "loss": 2.674, + "step": 85495 + }, + { + "epoch": 5.809213208316347, + "grad_norm": 8.146064758300781, + "learning_rate": 2.7415409702405223e-06, + "loss": 2.8459, + "step": 85500 + }, + { + "epoch": 5.809552928387009, + "grad_norm": 7.976600646972656, + "learning_rate": 2.741116320152195e-06, + "loss": 2.7994, + "step": 85505 + }, + { + "epoch": 5.809892648457671, + "grad_norm": 6.40078067779541, + "learning_rate": 2.7406916700638675e-06, + "loss": 2.6293, + "step": 85510 + }, + { + "epoch": 5.810232368528332, + "grad_norm": 9.131986618041992, + "learning_rate": 2.7402670199755403e-06, + "loss": 2.8597, + "step": 85515 + }, + { + "epoch": 5.810572088598994, + "grad_norm": 6.535749435424805, + "learning_rate": 2.7398423698872135e-06, + "loss": 2.7796, + "step": 85520 + }, + { + "epoch": 5.8109118086696565, + "grad_norm": 8.072465896606445, + "learning_rate": 2.739417719798886e-06, + "loss": 2.9142, + "step": 85525 + }, + { + "epoch": 5.811251528740318, + "grad_norm": 8.016342163085938, + "learning_rate": 2.7389930697105587e-06, + "loss": 2.904, + "step": 85530 + }, + { + "epoch": 5.81159124881098, + "grad_norm": 7.509693622589111, + "learning_rate": 2.738568419622232e-06, + "loss": 2.9654, + "step": 85535 + }, + { + "epoch": 5.811930968881642, + "grad_norm": 10.367588996887207, + "learning_rate": 2.7381437695339043e-06, + "loss": 2.5972, + "step": 85540 + }, + { + "epoch": 5.812270688952303, + "grad_norm": 6.417271614074707, + "learning_rate": 2.737719119445577e-06, + "loss": 2.8301, + "step": 85545 + }, + { + "epoch": 5.812610409022965, + "grad_norm": 6.66392183303833, + "learning_rate": 2.73729446935725e-06, + "loss": 2.6254, + "step": 85550 + }, + { + "epoch": 5.812950129093627, + "grad_norm": 7.688570976257324, + "learning_rate": 2.7368698192689223e-06, + "loss": 2.8307, + "step": 85555 + }, + { + "epoch": 5.813289849164288, + "grad_norm": 5.943771839141846, + "learning_rate": 2.7364451691805955e-06, + "loss": 2.935, + "step": 85560 + }, + { + "epoch": 5.8136295692349504, + "grad_norm": 5.501032829284668, + "learning_rate": 2.7360205190922683e-06, + "loss": 2.8365, + "step": 85565 + }, + { + "epoch": 5.8139692893056125, + "grad_norm": 8.343705177307129, + "learning_rate": 2.7355958690039407e-06, + "loss": 2.6687, + "step": 85570 + }, + { + "epoch": 5.814309009376274, + "grad_norm": 7.890737056732178, + "learning_rate": 2.735171218915614e-06, + "loss": 2.8213, + "step": 85575 + }, + { + "epoch": 5.814648729446936, + "grad_norm": 8.702289581298828, + "learning_rate": 2.7347465688272867e-06, + "loss": 2.8987, + "step": 85580 + }, + { + "epoch": 5.814988449517598, + "grad_norm": 6.220884799957275, + "learning_rate": 2.734321918738959e-06, + "loss": 2.893, + "step": 85585 + }, + { + "epoch": 5.815328169588259, + "grad_norm": 7.726226806640625, + "learning_rate": 2.7338972686506323e-06, + "loss": 2.5707, + "step": 85590 + }, + { + "epoch": 5.815667889658921, + "grad_norm": 9.010030746459961, + "learning_rate": 2.733472618562305e-06, + "loss": 2.7587, + "step": 85595 + }, + { + "epoch": 5.816007609729583, + "grad_norm": 8.051262855529785, + "learning_rate": 2.7330479684739775e-06, + "loss": 2.8244, + "step": 85600 + }, + { + "epoch": 5.816347329800244, + "grad_norm": 7.782646656036377, + "learning_rate": 2.7326233183856503e-06, + "loss": 2.7961, + "step": 85605 + }, + { + "epoch": 5.8166870498709065, + "grad_norm": 10.134042739868164, + "learning_rate": 2.7321986682973235e-06, + "loss": 2.6714, + "step": 85610 + }, + { + "epoch": 5.8170267699415685, + "grad_norm": 9.265110969543457, + "learning_rate": 2.731774018208996e-06, + "loss": 2.683, + "step": 85615 + }, + { + "epoch": 5.81736649001223, + "grad_norm": 7.114266395568848, + "learning_rate": 2.7313493681206687e-06, + "loss": 2.7094, + "step": 85620 + }, + { + "epoch": 5.817706210082892, + "grad_norm": 8.141345977783203, + "learning_rate": 2.730924718032342e-06, + "loss": 2.6994, + "step": 85625 + }, + { + "epoch": 5.818045930153554, + "grad_norm": 6.6364970207214355, + "learning_rate": 2.7305000679440143e-06, + "loss": 2.842, + "step": 85630 + }, + { + "epoch": 5.818385650224215, + "grad_norm": 9.000679969787598, + "learning_rate": 2.730075417855687e-06, + "loss": 2.7797, + "step": 85635 + }, + { + "epoch": 5.818725370294877, + "grad_norm": 7.1201934814453125, + "learning_rate": 2.72965076776736e-06, + "loss": 2.8792, + "step": 85640 + }, + { + "epoch": 5.819065090365539, + "grad_norm": 8.871028900146484, + "learning_rate": 2.7292261176790323e-06, + "loss": 2.8239, + "step": 85645 + }, + { + "epoch": 5.8194048104362, + "grad_norm": 9.104815483093262, + "learning_rate": 2.7288014675907055e-06, + "loss": 2.9281, + "step": 85650 + }, + { + "epoch": 5.8197445305068625, + "grad_norm": 5.492991924285889, + "learning_rate": 2.7283768175023783e-06, + "loss": 2.836, + "step": 85655 + }, + { + "epoch": 5.8200842505775245, + "grad_norm": 8.838160514831543, + "learning_rate": 2.7279521674140507e-06, + "loss": 2.9077, + "step": 85660 + }, + { + "epoch": 5.820423970648186, + "grad_norm": 7.475596904754639, + "learning_rate": 2.727527517325724e-06, + "loss": 2.8544, + "step": 85665 + }, + { + "epoch": 5.820763690718848, + "grad_norm": 7.364386558532715, + "learning_rate": 2.7271028672373967e-06, + "loss": 2.8903, + "step": 85670 + }, + { + "epoch": 5.82110341078951, + "grad_norm": 7.2030558586120605, + "learning_rate": 2.7266782171490695e-06, + "loss": 2.7749, + "step": 85675 + }, + { + "epoch": 5.821443130860171, + "grad_norm": 7.748060703277588, + "learning_rate": 2.726253567060742e-06, + "loss": 2.8405, + "step": 85680 + }, + { + "epoch": 5.821782850930833, + "grad_norm": 6.824673175811768, + "learning_rate": 2.725828916972415e-06, + "loss": 2.6863, + "step": 85685 + }, + { + "epoch": 5.822122571001495, + "grad_norm": 7.610093593597412, + "learning_rate": 2.725404266884088e-06, + "loss": 2.6162, + "step": 85690 + }, + { + "epoch": 5.822462291072156, + "grad_norm": 8.85302734375, + "learning_rate": 2.7249796167957603e-06, + "loss": 2.7605, + "step": 85695 + }, + { + "epoch": 5.8228020111428185, + "grad_norm": 8.644574165344238, + "learning_rate": 2.7245549667074335e-06, + "loss": 2.767, + "step": 85700 + }, + { + "epoch": 5.8231417312134806, + "grad_norm": 8.307133674621582, + "learning_rate": 2.7241303166191063e-06, + "loss": 2.8769, + "step": 85705 + }, + { + "epoch": 5.823481451284142, + "grad_norm": 6.561727523803711, + "learning_rate": 2.7237056665307787e-06, + "loss": 2.6742, + "step": 85710 + }, + { + "epoch": 5.823821171354804, + "grad_norm": 7.414653778076172, + "learning_rate": 2.723281016442452e-06, + "loss": 2.5165, + "step": 85715 + }, + { + "epoch": 5.824160891425466, + "grad_norm": 9.99599838256836, + "learning_rate": 2.7228563663541247e-06, + "loss": 2.8605, + "step": 85720 + }, + { + "epoch": 5.824500611496127, + "grad_norm": 9.447954177856445, + "learning_rate": 2.722431716265797e-06, + "loss": 2.8368, + "step": 85725 + }, + { + "epoch": 5.824840331566789, + "grad_norm": 8.345987319946289, + "learning_rate": 2.72200706617747e-06, + "loss": 2.8964, + "step": 85730 + }, + { + "epoch": 5.825180051637451, + "grad_norm": 9.173872947692871, + "learning_rate": 2.721582416089143e-06, + "loss": 2.6546, + "step": 85735 + }, + { + "epoch": 5.825519771708112, + "grad_norm": 8.43875789642334, + "learning_rate": 2.7211577660008155e-06, + "loss": 2.7644, + "step": 85740 + }, + { + "epoch": 5.8258594917787745, + "grad_norm": 7.605291843414307, + "learning_rate": 2.7207331159124883e-06, + "loss": 2.768, + "step": 85745 + }, + { + "epoch": 5.826199211849436, + "grad_norm": 7.155747890472412, + "learning_rate": 2.7203084658241615e-06, + "loss": 2.7893, + "step": 85750 + }, + { + "epoch": 5.826538931920098, + "grad_norm": 9.773563385009766, + "learning_rate": 2.719883815735834e-06, + "loss": 2.6001, + "step": 85755 + }, + { + "epoch": 5.82687865199076, + "grad_norm": 7.1058125495910645, + "learning_rate": 2.7194591656475067e-06, + "loss": 2.5138, + "step": 85760 + }, + { + "epoch": 5.827218372061421, + "grad_norm": 7.792947292327881, + "learning_rate": 2.7190345155591795e-06, + "loss": 3.2107, + "step": 85765 + }, + { + "epoch": 5.827558092132083, + "grad_norm": 9.878795623779297, + "learning_rate": 2.718609865470852e-06, + "loss": 2.7684, + "step": 85770 + }, + { + "epoch": 5.827897812202745, + "grad_norm": 8.587726593017578, + "learning_rate": 2.718185215382525e-06, + "loss": 2.5873, + "step": 85775 + }, + { + "epoch": 5.828237532273406, + "grad_norm": 6.781473159790039, + "learning_rate": 2.717760565294198e-06, + "loss": 2.8596, + "step": 85780 + }, + { + "epoch": 5.828577252344068, + "grad_norm": 7.550368309020996, + "learning_rate": 2.7173359152058703e-06, + "loss": 2.7964, + "step": 85785 + }, + { + "epoch": 5.8289169724147305, + "grad_norm": 6.6625261306762695, + "learning_rate": 2.7169112651175435e-06, + "loss": 2.884, + "step": 85790 + }, + { + "epoch": 5.829256692485392, + "grad_norm": 5.935856342315674, + "learning_rate": 2.7164866150292163e-06, + "loss": 2.933, + "step": 85795 + }, + { + "epoch": 5.829596412556054, + "grad_norm": 8.08574104309082, + "learning_rate": 2.7160619649408887e-06, + "loss": 2.6845, + "step": 85800 + }, + { + "epoch": 5.829936132626716, + "grad_norm": 6.654534816741943, + "learning_rate": 2.7156373148525615e-06, + "loss": 2.6156, + "step": 85805 + }, + { + "epoch": 5.830275852697377, + "grad_norm": 6.698038578033447, + "learning_rate": 2.7152126647642347e-06, + "loss": 2.8488, + "step": 85810 + }, + { + "epoch": 5.830615572768039, + "grad_norm": 8.807133674621582, + "learning_rate": 2.714788014675907e-06, + "loss": 2.6754, + "step": 85815 + }, + { + "epoch": 5.830955292838701, + "grad_norm": 5.661410331726074, + "learning_rate": 2.71436336458758e-06, + "loss": 2.6716, + "step": 85820 + }, + { + "epoch": 5.831295012909362, + "grad_norm": 7.644466876983643, + "learning_rate": 2.713938714499253e-06, + "loss": 2.9475, + "step": 85825 + }, + { + "epoch": 5.831634732980024, + "grad_norm": 8.625405311584473, + "learning_rate": 2.7135140644109255e-06, + "loss": 2.6674, + "step": 85830 + }, + { + "epoch": 5.8319744530506865, + "grad_norm": 5.645451068878174, + "learning_rate": 2.7130894143225983e-06, + "loss": 3.0627, + "step": 85835 + }, + { + "epoch": 5.832314173121348, + "grad_norm": 6.267141819000244, + "learning_rate": 2.712664764234271e-06, + "loss": 2.9785, + "step": 85840 + }, + { + "epoch": 5.83265389319201, + "grad_norm": 8.629406929016113, + "learning_rate": 2.7122401141459443e-06, + "loss": 2.6801, + "step": 85845 + }, + { + "epoch": 5.832993613262672, + "grad_norm": 6.288719654083252, + "learning_rate": 2.7118154640576167e-06, + "loss": 2.8124, + "step": 85850 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 7.033648490905762, + "learning_rate": 2.7113908139692895e-06, + "loss": 3.0433, + "step": 85855 + }, + { + "epoch": 5.833673053403995, + "grad_norm": 7.567659378051758, + "learning_rate": 2.7109661638809627e-06, + "loss": 2.8409, + "step": 85860 + }, + { + "epoch": 5.834012773474657, + "grad_norm": 8.230682373046875, + "learning_rate": 2.710541513792635e-06, + "loss": 2.7785, + "step": 85865 + }, + { + "epoch": 5.834352493545318, + "grad_norm": 7.587870121002197, + "learning_rate": 2.710116863704308e-06, + "loss": 2.9907, + "step": 85870 + }, + { + "epoch": 5.8346922136159804, + "grad_norm": 7.345120906829834, + "learning_rate": 2.709692213615981e-06, + "loss": 2.643, + "step": 85875 + }, + { + "epoch": 5.8350319336866425, + "grad_norm": 6.343017101287842, + "learning_rate": 2.7092675635276535e-06, + "loss": 2.8325, + "step": 85880 + }, + { + "epoch": 5.835371653757304, + "grad_norm": 6.459888458251953, + "learning_rate": 2.7088429134393263e-06, + "loss": 2.6442, + "step": 85885 + }, + { + "epoch": 5.835711373827966, + "grad_norm": 9.604764938354492, + "learning_rate": 2.708418263350999e-06, + "loss": 2.905, + "step": 85890 + }, + { + "epoch": 5.836051093898628, + "grad_norm": 7.555423736572266, + "learning_rate": 2.7079936132626714e-06, + "loss": 2.7507, + "step": 85895 + }, + { + "epoch": 5.836390813969289, + "grad_norm": 7.849231243133545, + "learning_rate": 2.7075689631743447e-06, + "loss": 2.7615, + "step": 85900 + }, + { + "epoch": 5.836730534039951, + "grad_norm": 6.8641204833984375, + "learning_rate": 2.7071443130860175e-06, + "loss": 2.7308, + "step": 85905 + }, + { + "epoch": 5.837070254110613, + "grad_norm": 7.855870246887207, + "learning_rate": 2.70671966299769e-06, + "loss": 2.895, + "step": 85910 + }, + { + "epoch": 5.837409974181274, + "grad_norm": 6.63414192199707, + "learning_rate": 2.706295012909363e-06, + "loss": 2.6502, + "step": 85915 + }, + { + "epoch": 5.8377496942519365, + "grad_norm": 8.287221908569336, + "learning_rate": 2.705870362821036e-06, + "loss": 2.8535, + "step": 85920 + }, + { + "epoch": 5.8380894143225985, + "grad_norm": 6.394478797912598, + "learning_rate": 2.7054457127327083e-06, + "loss": 2.7602, + "step": 85925 + }, + { + "epoch": 5.83842913439326, + "grad_norm": 7.149446964263916, + "learning_rate": 2.705021062644381e-06, + "loss": 2.962, + "step": 85930 + }, + { + "epoch": 5.838768854463922, + "grad_norm": 8.815812110900879, + "learning_rate": 2.7045964125560543e-06, + "loss": 2.6402, + "step": 85935 + }, + { + "epoch": 5.839108574534584, + "grad_norm": 6.189547538757324, + "learning_rate": 2.7041717624677267e-06, + "loss": 2.5188, + "step": 85940 + }, + { + "epoch": 5.839448294605245, + "grad_norm": 7.160663604736328, + "learning_rate": 2.7037471123793995e-06, + "loss": 2.84, + "step": 85945 + }, + { + "epoch": 5.839788014675907, + "grad_norm": 6.091845512390137, + "learning_rate": 2.7033224622910727e-06, + "loss": 2.9685, + "step": 85950 + }, + { + "epoch": 5.840127734746569, + "grad_norm": 7.164645671844482, + "learning_rate": 2.702897812202745e-06, + "loss": 2.7776, + "step": 85955 + }, + { + "epoch": 5.84046745481723, + "grad_norm": 7.041911602020264, + "learning_rate": 2.702473162114418e-06, + "loss": 2.7383, + "step": 85960 + }, + { + "epoch": 5.8408071748878925, + "grad_norm": 7.4904398918151855, + "learning_rate": 2.7020485120260907e-06, + "loss": 2.6621, + "step": 85965 + }, + { + "epoch": 5.841146894958554, + "grad_norm": 9.634166717529297, + "learning_rate": 2.7016238619377635e-06, + "loss": 2.4913, + "step": 85970 + }, + { + "epoch": 5.841486615029216, + "grad_norm": 7.0300211906433105, + "learning_rate": 2.7011992118494363e-06, + "loss": 2.8963, + "step": 85975 + }, + { + "epoch": 5.841826335099878, + "grad_norm": 7.3883771896362305, + "learning_rate": 2.700774561761109e-06, + "loss": 2.7442, + "step": 85980 + }, + { + "epoch": 5.842166055170539, + "grad_norm": 8.66206169128418, + "learning_rate": 2.7003499116727814e-06, + "loss": 3.0863, + "step": 85985 + }, + { + "epoch": 5.842505775241201, + "grad_norm": 9.580866813659668, + "learning_rate": 2.6999252615844547e-06, + "loss": 2.7516, + "step": 85990 + }, + { + "epoch": 5.842845495311863, + "grad_norm": 6.246075630187988, + "learning_rate": 2.6995006114961275e-06, + "loss": 2.7545, + "step": 85995 + }, + { + "epoch": 5.843185215382524, + "grad_norm": 7.68729829788208, + "learning_rate": 2.6990759614078e-06, + "loss": 2.6685, + "step": 86000 + }, + { + "epoch": 5.843524935453186, + "grad_norm": 9.008600234985352, + "learning_rate": 2.698651311319473e-06, + "loss": 2.8577, + "step": 86005 + }, + { + "epoch": 5.8438646555238485, + "grad_norm": 9.196906089782715, + "learning_rate": 2.698226661231146e-06, + "loss": 2.9471, + "step": 86010 + }, + { + "epoch": 5.84420437559451, + "grad_norm": 8.24289608001709, + "learning_rate": 2.6978020111428187e-06, + "loss": 2.7752, + "step": 86015 + }, + { + "epoch": 5.844544095665172, + "grad_norm": 7.350587368011475, + "learning_rate": 2.697377361054491e-06, + "loss": 2.7245, + "step": 86020 + }, + { + "epoch": 5.844883815735834, + "grad_norm": 9.0267915725708, + "learning_rate": 2.6969527109661643e-06, + "loss": 2.8391, + "step": 86025 + }, + { + "epoch": 5.845223535806495, + "grad_norm": 8.330617904663086, + "learning_rate": 2.696528060877837e-06, + "loss": 2.9263, + "step": 86030 + }, + { + "epoch": 5.845563255877157, + "grad_norm": 9.088397026062012, + "learning_rate": 2.6961034107895094e-06, + "loss": 2.7476, + "step": 86035 + }, + { + "epoch": 5.845902975947819, + "grad_norm": 8.308521270751953, + "learning_rate": 2.6956787607011827e-06, + "loss": 2.7873, + "step": 86040 + }, + { + "epoch": 5.84624269601848, + "grad_norm": 8.289993286132812, + "learning_rate": 2.6952541106128555e-06, + "loss": 2.8569, + "step": 86045 + }, + { + "epoch": 5.846582416089142, + "grad_norm": 8.060312271118164, + "learning_rate": 2.694829460524528e-06, + "loss": 2.6169, + "step": 86050 + }, + { + "epoch": 5.8469221361598045, + "grad_norm": 8.236058235168457, + "learning_rate": 2.6944048104362007e-06, + "loss": 2.5481, + "step": 86055 + }, + { + "epoch": 5.847261856230466, + "grad_norm": 6.399294376373291, + "learning_rate": 2.693980160347874e-06, + "loss": 2.7038, + "step": 86060 + }, + { + "epoch": 5.847601576301128, + "grad_norm": 6.917961597442627, + "learning_rate": 2.6935555102595463e-06, + "loss": 2.5994, + "step": 86065 + }, + { + "epoch": 5.84794129637179, + "grad_norm": 8.194610595703125, + "learning_rate": 2.693130860171219e-06, + "loss": 2.616, + "step": 86070 + }, + { + "epoch": 5.848281016442451, + "grad_norm": 7.280604362487793, + "learning_rate": 2.6927062100828923e-06, + "loss": 2.9479, + "step": 86075 + }, + { + "epoch": 5.848620736513113, + "grad_norm": 8.804479598999023, + "learning_rate": 2.6922815599945647e-06, + "loss": 2.6514, + "step": 86080 + }, + { + "epoch": 5.848960456583775, + "grad_norm": 7.5747294425964355, + "learning_rate": 2.6918569099062375e-06, + "loss": 2.7853, + "step": 86085 + }, + { + "epoch": 5.849300176654436, + "grad_norm": 7.302542209625244, + "learning_rate": 2.6914322598179103e-06, + "loss": 2.9479, + "step": 86090 + }, + { + "epoch": 5.849639896725098, + "grad_norm": 8.688085556030273, + "learning_rate": 2.691007609729583e-06, + "loss": 2.8, + "step": 86095 + }, + { + "epoch": 5.8499796167957605, + "grad_norm": 7.064518928527832, + "learning_rate": 2.690582959641256e-06, + "loss": 2.6275, + "step": 86100 + }, + { + "epoch": 5.850319336866422, + "grad_norm": 6.070912837982178, + "learning_rate": 2.6901583095529287e-06, + "loss": 2.7376, + "step": 86105 + }, + { + "epoch": 5.850659056937084, + "grad_norm": 6.219048976898193, + "learning_rate": 2.689733659464601e-06, + "loss": 2.7754, + "step": 86110 + }, + { + "epoch": 5.850998777007746, + "grad_norm": 6.946257591247559, + "learning_rate": 2.6893090093762743e-06, + "loss": 2.5981, + "step": 86115 + }, + { + "epoch": 5.851338497078407, + "grad_norm": 7.584947109222412, + "learning_rate": 2.688884359287947e-06, + "loss": 2.7314, + "step": 86120 + }, + { + "epoch": 5.851678217149069, + "grad_norm": 6.97735071182251, + "learning_rate": 2.6884597091996194e-06, + "loss": 2.6128, + "step": 86125 + }, + { + "epoch": 5.852017937219731, + "grad_norm": 6.408961772918701, + "learning_rate": 2.6880350591112927e-06, + "loss": 2.7845, + "step": 86130 + }, + { + "epoch": 5.852357657290392, + "grad_norm": 9.092462539672852, + "learning_rate": 2.6876104090229655e-06, + "loss": 2.787, + "step": 86135 + }, + { + "epoch": 5.852697377361054, + "grad_norm": 7.176620006561279, + "learning_rate": 2.687185758934638e-06, + "loss": 2.8506, + "step": 86140 + }, + { + "epoch": 5.8530370974317165, + "grad_norm": 6.38130521774292, + "learning_rate": 2.6867611088463106e-06, + "loss": 2.7333, + "step": 86145 + }, + { + "epoch": 5.853376817502378, + "grad_norm": 6.71672248840332, + "learning_rate": 2.686336458757984e-06, + "loss": 2.8547, + "step": 86150 + }, + { + "epoch": 5.85371653757304, + "grad_norm": 8.342729568481445, + "learning_rate": 2.6859118086696562e-06, + "loss": 2.7425, + "step": 86155 + }, + { + "epoch": 5.854056257643702, + "grad_norm": 5.737105369567871, + "learning_rate": 2.685487158581329e-06, + "loss": 2.4636, + "step": 86160 + }, + { + "epoch": 5.854395977714363, + "grad_norm": 7.6229753494262695, + "learning_rate": 2.6850625084930023e-06, + "loss": 2.9321, + "step": 86165 + }, + { + "epoch": 5.854735697785025, + "grad_norm": 11.649222373962402, + "learning_rate": 2.6846378584046746e-06, + "loss": 2.918, + "step": 86170 + }, + { + "epoch": 5.855075417855687, + "grad_norm": 7.12325382232666, + "learning_rate": 2.6842132083163474e-06, + "loss": 2.8027, + "step": 86175 + }, + { + "epoch": 5.855415137926348, + "grad_norm": 8.014245986938477, + "learning_rate": 2.6837885582280202e-06, + "loss": 2.8555, + "step": 86180 + }, + { + "epoch": 5.8557548579970105, + "grad_norm": 7.279939651489258, + "learning_rate": 2.6833639081396935e-06, + "loss": 2.8953, + "step": 86185 + }, + { + "epoch": 5.8560945780676725, + "grad_norm": 7.720238208770752, + "learning_rate": 2.682939258051366e-06, + "loss": 2.7759, + "step": 86190 + }, + { + "epoch": 5.856434298138334, + "grad_norm": 5.780701160430908, + "learning_rate": 2.6825146079630386e-06, + "loss": 2.8209, + "step": 86195 + }, + { + "epoch": 5.856774018208996, + "grad_norm": 6.729400157928467, + "learning_rate": 2.682089957874712e-06, + "loss": 2.6724, + "step": 86200 + }, + { + "epoch": 5.857113738279658, + "grad_norm": 9.146760940551758, + "learning_rate": 2.6816653077863843e-06, + "loss": 2.8471, + "step": 86205 + }, + { + "epoch": 5.857453458350319, + "grad_norm": 8.935423851013184, + "learning_rate": 2.681240657698057e-06, + "loss": 2.7234, + "step": 86210 + }, + { + "epoch": 5.857793178420981, + "grad_norm": 8.422840118408203, + "learning_rate": 2.68081600760973e-06, + "loss": 2.8884, + "step": 86215 + }, + { + "epoch": 5.858132898491643, + "grad_norm": 6.78443717956543, + "learning_rate": 2.6803913575214027e-06, + "loss": 2.8213, + "step": 86220 + }, + { + "epoch": 5.858472618562304, + "grad_norm": 6.886849403381348, + "learning_rate": 2.6799667074330755e-06, + "loss": 2.6511, + "step": 86225 + }, + { + "epoch": 5.8588123386329665, + "grad_norm": 8.128793716430664, + "learning_rate": 2.6795420573447483e-06, + "loss": 3.0289, + "step": 86230 + }, + { + "epoch": 5.8591520587036285, + "grad_norm": 7.6426496505737305, + "learning_rate": 2.6791174072564206e-06, + "loss": 2.5996, + "step": 86235 + }, + { + "epoch": 5.85949177877429, + "grad_norm": 6.81716251373291, + "learning_rate": 2.678692757168094e-06, + "loss": 2.6699, + "step": 86240 + }, + { + "epoch": 5.859831498844952, + "grad_norm": 6.982584476470947, + "learning_rate": 2.6782681070797667e-06, + "loss": 2.8704, + "step": 86245 + }, + { + "epoch": 5.860171218915614, + "grad_norm": 10.248334884643555, + "learning_rate": 2.677843456991439e-06, + "loss": 2.7242, + "step": 86250 + }, + { + "epoch": 5.860510938986275, + "grad_norm": 7.174362659454346, + "learning_rate": 2.6774188069031123e-06, + "loss": 2.609, + "step": 86255 + }, + { + "epoch": 5.860850659056937, + "grad_norm": 8.446528434753418, + "learning_rate": 2.676994156814785e-06, + "loss": 2.8171, + "step": 86260 + }, + { + "epoch": 5.861190379127599, + "grad_norm": 7.270869255065918, + "learning_rate": 2.6765695067264574e-06, + "loss": 2.8022, + "step": 86265 + }, + { + "epoch": 5.86153009919826, + "grad_norm": 7.519709587097168, + "learning_rate": 2.6761448566381302e-06, + "loss": 2.9094, + "step": 86270 + }, + { + "epoch": 5.8618698192689225, + "grad_norm": 8.86086654663086, + "learning_rate": 2.6757202065498035e-06, + "loss": 2.8629, + "step": 86275 + }, + { + "epoch": 5.8622095393395846, + "grad_norm": 6.375826835632324, + "learning_rate": 2.675295556461476e-06, + "loss": 2.7622, + "step": 86280 + }, + { + "epoch": 5.862549259410246, + "grad_norm": 6.884980201721191, + "learning_rate": 2.6748709063731486e-06, + "loss": 2.4799, + "step": 86285 + }, + { + "epoch": 5.862888979480908, + "grad_norm": 6.891262054443359, + "learning_rate": 2.674446256284822e-06, + "loss": 2.6554, + "step": 86290 + }, + { + "epoch": 5.86322869955157, + "grad_norm": 6.7444634437561035, + "learning_rate": 2.6740216061964942e-06, + "loss": 2.9062, + "step": 86295 + }, + { + "epoch": 5.863568419622231, + "grad_norm": 6.152189254760742, + "learning_rate": 2.673596956108167e-06, + "loss": 2.5692, + "step": 86300 + }, + { + "epoch": 5.863908139692893, + "grad_norm": 6.10999059677124, + "learning_rate": 2.67317230601984e-06, + "loss": 3.0179, + "step": 86305 + }, + { + "epoch": 5.864247859763555, + "grad_norm": 8.124295234680176, + "learning_rate": 2.6727476559315122e-06, + "loss": 2.9515, + "step": 86310 + }, + { + "epoch": 5.864587579834216, + "grad_norm": 7.156731128692627, + "learning_rate": 2.6723230058431854e-06, + "loss": 3.124, + "step": 86315 + }, + { + "epoch": 5.8649272999048785, + "grad_norm": 6.871606349945068, + "learning_rate": 2.6718983557548582e-06, + "loss": 3.0498, + "step": 86320 + }, + { + "epoch": 5.865267019975541, + "grad_norm": 7.425524711608887, + "learning_rate": 2.6714737056665306e-06, + "loss": 2.7648, + "step": 86325 + }, + { + "epoch": 5.865606740046202, + "grad_norm": 7.183222770690918, + "learning_rate": 2.671049055578204e-06, + "loss": 2.9136, + "step": 86330 + }, + { + "epoch": 5.865946460116864, + "grad_norm": 7.528032302856445, + "learning_rate": 2.6706244054898766e-06, + "loss": 2.7641, + "step": 86335 + }, + { + "epoch": 5.866286180187526, + "grad_norm": 6.830860614776611, + "learning_rate": 2.670199755401549e-06, + "loss": 2.959, + "step": 86340 + }, + { + "epoch": 5.866625900258187, + "grad_norm": 8.480504035949707, + "learning_rate": 2.6697751053132222e-06, + "loss": 2.5563, + "step": 86345 + }, + { + "epoch": 5.866965620328849, + "grad_norm": 6.031699180603027, + "learning_rate": 2.669350455224895e-06, + "loss": 2.6909, + "step": 86350 + }, + { + "epoch": 5.867305340399511, + "grad_norm": 7.267444133758545, + "learning_rate": 2.668925805136568e-06, + "loss": 2.6235, + "step": 86355 + }, + { + "epoch": 5.867645060470172, + "grad_norm": 8.583807945251465, + "learning_rate": 2.6685011550482402e-06, + "loss": 2.7014, + "step": 86360 + }, + { + "epoch": 5.8679847805408345, + "grad_norm": 7.582790851593018, + "learning_rate": 2.6680765049599135e-06, + "loss": 2.8039, + "step": 86365 + }, + { + "epoch": 5.868324500611497, + "grad_norm": 5.705752372741699, + "learning_rate": 2.6676518548715863e-06, + "loss": 2.7789, + "step": 86370 + }, + { + "epoch": 5.868664220682158, + "grad_norm": 7.847194194793701, + "learning_rate": 2.6672272047832586e-06, + "loss": 3.154, + "step": 86375 + }, + { + "epoch": 5.86900394075282, + "grad_norm": 11.056459426879883, + "learning_rate": 2.666802554694932e-06, + "loss": 2.7845, + "step": 86380 + }, + { + "epoch": 5.869343660823482, + "grad_norm": 8.793299674987793, + "learning_rate": 2.6663779046066047e-06, + "loss": 2.7764, + "step": 86385 + }, + { + "epoch": 5.869683380894143, + "grad_norm": 8.280779838562012, + "learning_rate": 2.665953254518277e-06, + "loss": 2.9808, + "step": 86390 + }, + { + "epoch": 5.870023100964805, + "grad_norm": 8.453447341918945, + "learning_rate": 2.66552860442995e-06, + "loss": 2.7802, + "step": 86395 + }, + { + "epoch": 5.870362821035467, + "grad_norm": 7.826081275939941, + "learning_rate": 2.665103954341623e-06, + "loss": 3.0526, + "step": 86400 + }, + { + "epoch": 5.870702541106128, + "grad_norm": 7.678689956665039, + "learning_rate": 2.6646793042532954e-06, + "loss": 2.754, + "step": 86405 + }, + { + "epoch": 5.8710422611767905, + "grad_norm": 6.948781967163086, + "learning_rate": 2.6642546541649682e-06, + "loss": 2.6924, + "step": 86410 + }, + { + "epoch": 5.871381981247453, + "grad_norm": 6.1026716232299805, + "learning_rate": 2.6638300040766415e-06, + "loss": 2.9538, + "step": 86415 + }, + { + "epoch": 5.871721701318114, + "grad_norm": 7.0508713722229, + "learning_rate": 2.663405353988314e-06, + "loss": 2.7379, + "step": 86420 + }, + { + "epoch": 5.872061421388776, + "grad_norm": 8.847135543823242, + "learning_rate": 2.6629807038999866e-06, + "loss": 2.571, + "step": 86425 + }, + { + "epoch": 5.872401141459437, + "grad_norm": 7.524612903594971, + "learning_rate": 2.6625560538116594e-06, + "loss": 2.7612, + "step": 86430 + }, + { + "epoch": 5.872740861530099, + "grad_norm": 7.932565689086914, + "learning_rate": 2.662131403723332e-06, + "loss": 2.9851, + "step": 86435 + }, + { + "epoch": 5.873080581600761, + "grad_norm": 8.058122634887695, + "learning_rate": 2.661706753635005e-06, + "loss": 2.603, + "step": 86440 + }, + { + "epoch": 5.873420301671422, + "grad_norm": 7.885810852050781, + "learning_rate": 2.661282103546678e-06, + "loss": 2.7884, + "step": 86445 + }, + { + "epoch": 5.8737600217420844, + "grad_norm": 7.974452972412109, + "learning_rate": 2.6608574534583502e-06, + "loss": 3.0119, + "step": 86450 + }, + { + "epoch": 5.8740997418127465, + "grad_norm": 9.238103866577148, + "learning_rate": 2.6604328033700234e-06, + "loss": 2.8676, + "step": 86455 + }, + { + "epoch": 5.874439461883408, + "grad_norm": 7.67671537399292, + "learning_rate": 2.6600081532816962e-06, + "loss": 2.8266, + "step": 86460 + }, + { + "epoch": 5.87477918195407, + "grad_norm": 7.586812973022461, + "learning_rate": 2.6595835031933686e-06, + "loss": 2.6699, + "step": 86465 + }, + { + "epoch": 5.875118902024732, + "grad_norm": 7.043745040893555, + "learning_rate": 2.6591588531050414e-06, + "loss": 2.8368, + "step": 86470 + }, + { + "epoch": 5.875458622095393, + "grad_norm": 6.236489772796631, + "learning_rate": 2.6587342030167146e-06, + "loss": 3.05, + "step": 86475 + }, + { + "epoch": 5.875798342166055, + "grad_norm": 6.332433223724365, + "learning_rate": 2.658309552928387e-06, + "loss": 2.9862, + "step": 86480 + }, + { + "epoch": 5.876138062236717, + "grad_norm": 6.73360538482666, + "learning_rate": 2.65788490284006e-06, + "loss": 2.6936, + "step": 86485 + }, + { + "epoch": 5.876477782307378, + "grad_norm": 7.217305660247803, + "learning_rate": 2.657460252751733e-06, + "loss": 2.7578, + "step": 86490 + }, + { + "epoch": 5.8768175023780405, + "grad_norm": 7.646780014038086, + "learning_rate": 2.6570356026634054e-06, + "loss": 2.6901, + "step": 86495 + }, + { + "epoch": 5.8771572224487025, + "grad_norm": 11.079753875732422, + "learning_rate": 2.6566109525750782e-06, + "loss": 2.8026, + "step": 86500 + }, + { + "epoch": 5.877496942519364, + "grad_norm": 8.23289680480957, + "learning_rate": 2.6561863024867514e-06, + "loss": 2.5907, + "step": 86505 + }, + { + "epoch": 5.877836662590026, + "grad_norm": 7.80136775970459, + "learning_rate": 2.655761652398424e-06, + "loss": 2.6878, + "step": 86510 + }, + { + "epoch": 5.878176382660688, + "grad_norm": 6.3637590408325195, + "learning_rate": 2.6553370023100966e-06, + "loss": 2.8287, + "step": 86515 + }, + { + "epoch": 5.878516102731349, + "grad_norm": 9.077383041381836, + "learning_rate": 2.6549123522217694e-06, + "loss": 2.9232, + "step": 86520 + }, + { + "epoch": 5.878855822802011, + "grad_norm": 7.755465984344482, + "learning_rate": 2.6544877021334427e-06, + "loss": 2.8671, + "step": 86525 + }, + { + "epoch": 5.879195542872673, + "grad_norm": 6.966489315032959, + "learning_rate": 2.654063052045115e-06, + "loss": 2.3294, + "step": 86530 + }, + { + "epoch": 5.879535262943334, + "grad_norm": 7.5453691482543945, + "learning_rate": 2.653638401956788e-06, + "loss": 2.7722, + "step": 86535 + }, + { + "epoch": 5.8798749830139965, + "grad_norm": 8.593985557556152, + "learning_rate": 2.653213751868461e-06, + "loss": 2.8675, + "step": 86540 + }, + { + "epoch": 5.8802147030846585, + "grad_norm": 6.217946529388428, + "learning_rate": 2.6527891017801334e-06, + "loss": 2.6262, + "step": 86545 + }, + { + "epoch": 5.88055442315532, + "grad_norm": 7.182479381561279, + "learning_rate": 2.6523644516918062e-06, + "loss": 2.8787, + "step": 86550 + }, + { + "epoch": 5.880894143225982, + "grad_norm": 9.609943389892578, + "learning_rate": 2.651939801603479e-06, + "loss": 2.8489, + "step": 86555 + }, + { + "epoch": 5.881233863296644, + "grad_norm": 9.173105239868164, + "learning_rate": 2.6515151515151514e-06, + "loss": 2.815, + "step": 86560 + }, + { + "epoch": 5.881573583367305, + "grad_norm": 9.72048282623291, + "learning_rate": 2.6510905014268246e-06, + "loss": 2.8659, + "step": 86565 + }, + { + "epoch": 5.881913303437967, + "grad_norm": 6.689980506896973, + "learning_rate": 2.6506658513384974e-06, + "loss": 2.9081, + "step": 86570 + }, + { + "epoch": 5.882253023508629, + "grad_norm": 7.876737117767334, + "learning_rate": 2.65024120125017e-06, + "loss": 2.6617, + "step": 86575 + }, + { + "epoch": 5.88259274357929, + "grad_norm": 7.071070194244385, + "learning_rate": 2.649816551161843e-06, + "loss": 2.7581, + "step": 86580 + }, + { + "epoch": 5.8829324636499525, + "grad_norm": 7.4354248046875, + "learning_rate": 2.649391901073516e-06, + "loss": 2.9564, + "step": 86585 + }, + { + "epoch": 5.883272183720615, + "grad_norm": 9.605548858642578, + "learning_rate": 2.6489672509851882e-06, + "loss": 3.0086, + "step": 86590 + }, + { + "epoch": 5.883611903791276, + "grad_norm": 8.561723709106445, + "learning_rate": 2.648542600896861e-06, + "loss": 2.493, + "step": 86595 + }, + { + "epoch": 5.883951623861938, + "grad_norm": 8.260880470275879, + "learning_rate": 2.6481179508085342e-06, + "loss": 2.7793, + "step": 86600 + }, + { + "epoch": 5.8842913439326, + "grad_norm": 8.912945747375488, + "learning_rate": 2.6476933007202066e-06, + "loss": 2.8143, + "step": 86605 + }, + { + "epoch": 5.884631064003261, + "grad_norm": 10.494484901428223, + "learning_rate": 2.6472686506318794e-06, + "loss": 2.6327, + "step": 86610 + }, + { + "epoch": 5.884970784073923, + "grad_norm": 7.229320526123047, + "learning_rate": 2.6468440005435526e-06, + "loss": 2.7517, + "step": 86615 + }, + { + "epoch": 5.885310504144585, + "grad_norm": 6.572964191436768, + "learning_rate": 2.646419350455225e-06, + "loss": 2.9034, + "step": 86620 + }, + { + "epoch": 5.885650224215246, + "grad_norm": 7.172508239746094, + "learning_rate": 2.645994700366898e-06, + "loss": 2.6382, + "step": 86625 + }, + { + "epoch": 5.8859899442859085, + "grad_norm": 8.179291725158691, + "learning_rate": 2.645570050278571e-06, + "loss": 2.802, + "step": 86630 + }, + { + "epoch": 5.886329664356571, + "grad_norm": 6.8986310958862305, + "learning_rate": 2.6451454001902434e-06, + "loss": 2.7246, + "step": 86635 + }, + { + "epoch": 5.886669384427232, + "grad_norm": 8.588726043701172, + "learning_rate": 2.6447207501019162e-06, + "loss": 2.7971, + "step": 86640 + }, + { + "epoch": 5.887009104497894, + "grad_norm": 8.683350563049316, + "learning_rate": 2.644296100013589e-06, + "loss": 2.6386, + "step": 86645 + }, + { + "epoch": 5.887348824568555, + "grad_norm": 7.355286121368408, + "learning_rate": 2.6438714499252614e-06, + "loss": 2.8175, + "step": 86650 + }, + { + "epoch": 5.887688544639217, + "grad_norm": 7.227291107177734, + "learning_rate": 2.6434467998369346e-06, + "loss": 3.0047, + "step": 86655 + }, + { + "epoch": 5.888028264709879, + "grad_norm": 7.886414527893066, + "learning_rate": 2.6430221497486074e-06, + "loss": 2.7952, + "step": 86660 + }, + { + "epoch": 5.88836798478054, + "grad_norm": 6.737736225128174, + "learning_rate": 2.64259749966028e-06, + "loss": 2.9926, + "step": 86665 + }, + { + "epoch": 5.888707704851202, + "grad_norm": 9.328856468200684, + "learning_rate": 2.642172849571953e-06, + "loss": 2.7391, + "step": 86670 + }, + { + "epoch": 5.8890474249218645, + "grad_norm": 7.985702991485596, + "learning_rate": 2.641748199483626e-06, + "loss": 2.627, + "step": 86675 + }, + { + "epoch": 5.889387144992526, + "grad_norm": 8.144874572753906, + "learning_rate": 2.641323549395298e-06, + "loss": 2.6372, + "step": 86680 + }, + { + "epoch": 5.889726865063188, + "grad_norm": 6.702776908874512, + "learning_rate": 2.640898899306971e-06, + "loss": 2.9242, + "step": 86685 + }, + { + "epoch": 5.89006658513385, + "grad_norm": 6.0880446434021, + "learning_rate": 2.6404742492186442e-06, + "loss": 2.6126, + "step": 86690 + }, + { + "epoch": 5.890406305204511, + "grad_norm": 7.369516372680664, + "learning_rate": 2.640049599130317e-06, + "loss": 2.7124, + "step": 86695 + }, + { + "epoch": 5.890746025275173, + "grad_norm": 6.752168655395508, + "learning_rate": 2.6396249490419894e-06, + "loss": 2.6086, + "step": 86700 + }, + { + "epoch": 5.891085745345835, + "grad_norm": 9.210824012756348, + "learning_rate": 2.6392002989536626e-06, + "loss": 2.6924, + "step": 86705 + }, + { + "epoch": 5.891425465416496, + "grad_norm": 9.690969467163086, + "learning_rate": 2.6387756488653354e-06, + "loss": 2.9624, + "step": 86710 + }, + { + "epoch": 5.891765185487158, + "grad_norm": 7.902092456817627, + "learning_rate": 2.638350998777008e-06, + "loss": 2.7284, + "step": 86715 + }, + { + "epoch": 5.8921049055578205, + "grad_norm": 6.490725040435791, + "learning_rate": 2.6379263486886806e-06, + "loss": 2.5904, + "step": 86720 + }, + { + "epoch": 5.892444625628482, + "grad_norm": 7.66533899307251, + "learning_rate": 2.637501698600354e-06, + "loss": 3.1567, + "step": 86725 + }, + { + "epoch": 5.892784345699144, + "grad_norm": 7.002712726593018, + "learning_rate": 2.6370770485120262e-06, + "loss": 2.8391, + "step": 86730 + }, + { + "epoch": 5.893124065769806, + "grad_norm": 6.199477672576904, + "learning_rate": 2.636652398423699e-06, + "loss": 2.757, + "step": 86735 + }, + { + "epoch": 5.893463785840467, + "grad_norm": 7.833486557006836, + "learning_rate": 2.6362277483353722e-06, + "loss": 2.9118, + "step": 86740 + }, + { + "epoch": 5.893803505911129, + "grad_norm": 11.072782516479492, + "learning_rate": 2.6358030982470446e-06, + "loss": 2.6304, + "step": 86745 + }, + { + "epoch": 5.894143225981791, + "grad_norm": 8.47913646697998, + "learning_rate": 2.6353784481587174e-06, + "loss": 2.7925, + "step": 86750 + }, + { + "epoch": 5.894482946052452, + "grad_norm": 7.605189800262451, + "learning_rate": 2.6349537980703902e-06, + "loss": 2.8367, + "step": 86755 + }, + { + "epoch": 5.8948226661231145, + "grad_norm": 7.276067733764648, + "learning_rate": 2.634529147982063e-06, + "loss": 2.7243, + "step": 86760 + }, + { + "epoch": 5.8951623861937765, + "grad_norm": 7.651033401489258, + "learning_rate": 2.634104497893736e-06, + "loss": 2.605, + "step": 86765 + }, + { + "epoch": 5.895502106264438, + "grad_norm": 6.765523433685303, + "learning_rate": 2.6336798478054086e-06, + "loss": 2.8242, + "step": 86770 + }, + { + "epoch": 5.8958418263351, + "grad_norm": 7.206450462341309, + "learning_rate": 2.633255197717081e-06, + "loss": 2.8643, + "step": 86775 + }, + { + "epoch": 5.896181546405762, + "grad_norm": 8.94601821899414, + "learning_rate": 2.6328305476287542e-06, + "loss": 2.7521, + "step": 86780 + }, + { + "epoch": 5.896521266476423, + "grad_norm": 9.0768404006958, + "learning_rate": 2.632405897540427e-06, + "loss": 3.0582, + "step": 86785 + }, + { + "epoch": 5.896860986547085, + "grad_norm": 10.185194969177246, + "learning_rate": 2.6319812474520994e-06, + "loss": 2.7458, + "step": 86790 + }, + { + "epoch": 5.897200706617747, + "grad_norm": 9.12020492553711, + "learning_rate": 2.6315565973637726e-06, + "loss": 2.6979, + "step": 86795 + }, + { + "epoch": 5.897540426688408, + "grad_norm": 7.278135299682617, + "learning_rate": 2.6311319472754454e-06, + "loss": 2.732, + "step": 86800 + }, + { + "epoch": 5.8978801467590705, + "grad_norm": 11.322600364685059, + "learning_rate": 2.630707297187118e-06, + "loss": 2.9333, + "step": 86805 + }, + { + "epoch": 5.8982198668297325, + "grad_norm": 7.463925361633301, + "learning_rate": 2.6302826470987906e-06, + "loss": 2.7714, + "step": 86810 + }, + { + "epoch": 5.898559586900394, + "grad_norm": 7.333821773529053, + "learning_rate": 2.629857997010464e-06, + "loss": 2.6312, + "step": 86815 + }, + { + "epoch": 5.898899306971056, + "grad_norm": 6.709471225738525, + "learning_rate": 2.629433346922136e-06, + "loss": 2.8146, + "step": 86820 + }, + { + "epoch": 5.899239027041718, + "grad_norm": 5.531641006469727, + "learning_rate": 2.629008696833809e-06, + "loss": 2.9279, + "step": 86825 + }, + { + "epoch": 5.899578747112379, + "grad_norm": 7.703588962554932, + "learning_rate": 2.6285840467454822e-06, + "loss": 2.6575, + "step": 86830 + }, + { + "epoch": 5.899918467183041, + "grad_norm": 7.398118019104004, + "learning_rate": 2.6281593966571546e-06, + "loss": 2.7922, + "step": 86835 + }, + { + "epoch": 5.900258187253703, + "grad_norm": 8.114093780517578, + "learning_rate": 2.6277347465688274e-06, + "loss": 2.8497, + "step": 86840 + }, + { + "epoch": 5.900597907324364, + "grad_norm": 8.531373023986816, + "learning_rate": 2.6273100964805e-06, + "loss": 2.9579, + "step": 86845 + }, + { + "epoch": 5.9009376273950265, + "grad_norm": 6.9586663246154785, + "learning_rate": 2.626885446392173e-06, + "loss": 2.8314, + "step": 86850 + }, + { + "epoch": 5.9012773474656885, + "grad_norm": 7.415487766265869, + "learning_rate": 2.626460796303846e-06, + "loss": 2.5354, + "step": 86855 + }, + { + "epoch": 5.90161706753635, + "grad_norm": 6.634643077850342, + "learning_rate": 2.6260361462155186e-06, + "loss": 2.8889, + "step": 86860 + }, + { + "epoch": 5.901956787607012, + "grad_norm": 7.669686794281006, + "learning_rate": 2.625611496127192e-06, + "loss": 2.701, + "step": 86865 + }, + { + "epoch": 5.902296507677674, + "grad_norm": 9.784700393676758, + "learning_rate": 2.6251868460388642e-06, + "loss": 3.0087, + "step": 86870 + }, + { + "epoch": 5.902636227748335, + "grad_norm": 7.498017311096191, + "learning_rate": 2.624762195950537e-06, + "loss": 2.8342, + "step": 86875 + }, + { + "epoch": 5.902975947818997, + "grad_norm": 7.093341827392578, + "learning_rate": 2.62433754586221e-06, + "loss": 2.8644, + "step": 86880 + }, + { + "epoch": 5.903315667889659, + "grad_norm": 9.057205200195312, + "learning_rate": 2.6239128957738826e-06, + "loss": 2.6267, + "step": 86885 + }, + { + "epoch": 5.90365538796032, + "grad_norm": 7.535729885101318, + "learning_rate": 2.6234882456855554e-06, + "loss": 2.8285, + "step": 86890 + }, + { + "epoch": 5.9039951080309825, + "grad_norm": 7.175337314605713, + "learning_rate": 2.6230635955972282e-06, + "loss": 2.8267, + "step": 86895 + }, + { + "epoch": 5.904334828101645, + "grad_norm": 7.58060884475708, + "learning_rate": 2.6226389455089006e-06, + "loss": 2.8218, + "step": 86900 + }, + { + "epoch": 5.904674548172306, + "grad_norm": 7.915521621704102, + "learning_rate": 2.622214295420574e-06, + "loss": 2.8064, + "step": 86905 + }, + { + "epoch": 5.905014268242968, + "grad_norm": 6.881134510040283, + "learning_rate": 2.6217896453322466e-06, + "loss": 2.6139, + "step": 86910 + }, + { + "epoch": 5.90535398831363, + "grad_norm": 8.572714805603027, + "learning_rate": 2.621364995243919e-06, + "loss": 2.8321, + "step": 86915 + }, + { + "epoch": 5.905693708384291, + "grad_norm": 7.41854190826416, + "learning_rate": 2.6209403451555922e-06, + "loss": 2.6587, + "step": 86920 + }, + { + "epoch": 5.906033428454953, + "grad_norm": 8.439016342163086, + "learning_rate": 2.620515695067265e-06, + "loss": 2.8309, + "step": 86925 + }, + { + "epoch": 5.906373148525615, + "grad_norm": 7.6479034423828125, + "learning_rate": 2.6200910449789374e-06, + "loss": 2.6569, + "step": 86930 + }, + { + "epoch": 5.906712868596276, + "grad_norm": 5.904439926147461, + "learning_rate": 2.61966639489061e-06, + "loss": 2.5168, + "step": 86935 + }, + { + "epoch": 5.9070525886669385, + "grad_norm": 8.727859497070312, + "learning_rate": 2.6192417448022834e-06, + "loss": 2.5893, + "step": 86940 + }, + { + "epoch": 5.907392308737601, + "grad_norm": 7.185622215270996, + "learning_rate": 2.618817094713956e-06, + "loss": 2.7214, + "step": 86945 + }, + { + "epoch": 5.907732028808262, + "grad_norm": 8.37906265258789, + "learning_rate": 2.6183924446256286e-06, + "loss": 3.1063, + "step": 86950 + }, + { + "epoch": 5.908071748878924, + "grad_norm": 6.496012210845947, + "learning_rate": 2.617967794537302e-06, + "loss": 2.7669, + "step": 86955 + }, + { + "epoch": 5.908411468949586, + "grad_norm": 6.546732425689697, + "learning_rate": 2.617543144448974e-06, + "loss": 2.6885, + "step": 86960 + }, + { + "epoch": 5.908751189020247, + "grad_norm": 8.936962127685547, + "learning_rate": 2.617118494360647e-06, + "loss": 2.8318, + "step": 86965 + }, + { + "epoch": 5.909090909090909, + "grad_norm": 7.822211742401123, + "learning_rate": 2.61669384427232e-06, + "loss": 2.7513, + "step": 86970 + }, + { + "epoch": 5.909430629161571, + "grad_norm": 7.820229530334473, + "learning_rate": 2.616269194183992e-06, + "loss": 2.9279, + "step": 86975 + }, + { + "epoch": 5.909770349232232, + "grad_norm": 7.70895528793335, + "learning_rate": 2.6158445440956654e-06, + "loss": 2.7144, + "step": 86980 + }, + { + "epoch": 5.9101100693028945, + "grad_norm": 7.027346611022949, + "learning_rate": 2.615419894007338e-06, + "loss": 2.8887, + "step": 86985 + }, + { + "epoch": 5.910449789373557, + "grad_norm": 6.636425495147705, + "learning_rate": 2.6149952439190106e-06, + "loss": 2.7331, + "step": 86990 + }, + { + "epoch": 5.910789509444218, + "grad_norm": 6.578648090362549, + "learning_rate": 2.614570593830684e-06, + "loss": 2.8363, + "step": 86995 + }, + { + "epoch": 5.91112922951488, + "grad_norm": 8.40433120727539, + "learning_rate": 2.6141459437423566e-06, + "loss": 2.8823, + "step": 87000 + }, + { + "epoch": 5.911468949585542, + "grad_norm": 7.301177501678467, + "learning_rate": 2.613721293654029e-06, + "loss": 2.9321, + "step": 87005 + }, + { + "epoch": 5.911808669656203, + "grad_norm": 7.956798553466797, + "learning_rate": 2.613296643565702e-06, + "loss": 2.5079, + "step": 87010 + }, + { + "epoch": 5.912148389726865, + "grad_norm": 7.506089687347412, + "learning_rate": 2.612871993477375e-06, + "loss": 2.6925, + "step": 87015 + }, + { + "epoch": 5.912488109797527, + "grad_norm": 7.943443775177002, + "learning_rate": 2.6124473433890474e-06, + "loss": 2.9011, + "step": 87020 + }, + { + "epoch": 5.9128278298681884, + "grad_norm": 7.8331427574157715, + "learning_rate": 2.61202269330072e-06, + "loss": 2.8855, + "step": 87025 + }, + { + "epoch": 5.9131675499388505, + "grad_norm": 8.1744966506958, + "learning_rate": 2.6115980432123934e-06, + "loss": 3.0783, + "step": 87030 + }, + { + "epoch": 5.913507270009513, + "grad_norm": 7.485353946685791, + "learning_rate": 2.6111733931240662e-06, + "loss": 3.0248, + "step": 87035 + }, + { + "epoch": 5.913846990080174, + "grad_norm": 7.944075107574463, + "learning_rate": 2.6107487430357386e-06, + "loss": 2.6707, + "step": 87040 + }, + { + "epoch": 5.914186710150836, + "grad_norm": 6.455550670623779, + "learning_rate": 2.610324092947412e-06, + "loss": 2.4481, + "step": 87045 + }, + { + "epoch": 5.914526430221498, + "grad_norm": 6.60777473449707, + "learning_rate": 2.6098994428590846e-06, + "loss": 2.8353, + "step": 87050 + }, + { + "epoch": 5.914866150292159, + "grad_norm": 12.174728393554688, + "learning_rate": 2.609474792770757e-06, + "loss": 2.6977, + "step": 87055 + }, + { + "epoch": 5.915205870362821, + "grad_norm": 8.048422813415527, + "learning_rate": 2.60905014268243e-06, + "loss": 2.7411, + "step": 87060 + }, + { + "epoch": 5.915545590433483, + "grad_norm": 7.791759967803955, + "learning_rate": 2.608625492594103e-06, + "loss": 2.8383, + "step": 87065 + }, + { + "epoch": 5.9158853105041445, + "grad_norm": 8.673075675964355, + "learning_rate": 2.6082008425057754e-06, + "loss": 2.7446, + "step": 87070 + }, + { + "epoch": 5.9162250305748065, + "grad_norm": 6.193070888519287, + "learning_rate": 2.607776192417448e-06, + "loss": 2.8049, + "step": 87075 + }, + { + "epoch": 5.916564750645469, + "grad_norm": 8.606045722961426, + "learning_rate": 2.6073515423291214e-06, + "loss": 2.857, + "step": 87080 + }, + { + "epoch": 5.91690447071613, + "grad_norm": 7.829275131225586, + "learning_rate": 2.606926892240794e-06, + "loss": 2.6424, + "step": 87085 + }, + { + "epoch": 5.917244190786792, + "grad_norm": 6.223496437072754, + "learning_rate": 2.6065022421524666e-06, + "loss": 2.9998, + "step": 87090 + }, + { + "epoch": 5.917583910857454, + "grad_norm": 8.63735580444336, + "learning_rate": 2.6060775920641394e-06, + "loss": 2.6298, + "step": 87095 + }, + { + "epoch": 5.917923630928115, + "grad_norm": 6.703771591186523, + "learning_rate": 2.6056529419758118e-06, + "loss": 2.779, + "step": 87100 + }, + { + "epoch": 5.918263350998777, + "grad_norm": 7.50837516784668, + "learning_rate": 2.605228291887485e-06, + "loss": 2.8072, + "step": 87105 + }, + { + "epoch": 5.918603071069439, + "grad_norm": 7.860311985015869, + "learning_rate": 2.604803641799158e-06, + "loss": 2.7218, + "step": 87110 + }, + { + "epoch": 5.9189427911401005, + "grad_norm": 6.975951671600342, + "learning_rate": 2.60437899171083e-06, + "loss": 2.8912, + "step": 87115 + }, + { + "epoch": 5.9192825112107625, + "grad_norm": 8.311738014221191, + "learning_rate": 2.6039543416225034e-06, + "loss": 2.7095, + "step": 87120 + }, + { + "epoch": 5.919622231281424, + "grad_norm": 7.5709004402160645, + "learning_rate": 2.603529691534176e-06, + "loss": 2.9469, + "step": 87125 + }, + { + "epoch": 5.919961951352086, + "grad_norm": 7.364569187164307, + "learning_rate": 2.6031050414458486e-06, + "loss": 2.505, + "step": 87130 + }, + { + "epoch": 5.920301671422748, + "grad_norm": 7.969705104827881, + "learning_rate": 2.602680391357522e-06, + "loss": 2.9647, + "step": 87135 + }, + { + "epoch": 5.920641391493409, + "grad_norm": 6.327001094818115, + "learning_rate": 2.6022557412691946e-06, + "loss": 2.7584, + "step": 87140 + }, + { + "epoch": 5.920981111564071, + "grad_norm": 8.178354263305664, + "learning_rate": 2.601831091180867e-06, + "loss": 2.8447, + "step": 87145 + }, + { + "epoch": 5.921320831634733, + "grad_norm": 7.754770755767822, + "learning_rate": 2.6014064410925398e-06, + "loss": 2.8649, + "step": 87150 + }, + { + "epoch": 5.921660551705394, + "grad_norm": 7.345972061157227, + "learning_rate": 2.600981791004213e-06, + "loss": 2.632, + "step": 87155 + }, + { + "epoch": 5.9220002717760565, + "grad_norm": 7.5997724533081055, + "learning_rate": 2.6005571409158854e-06, + "loss": 2.6924, + "step": 87160 + }, + { + "epoch": 5.9223399918467186, + "grad_norm": 7.190567493438721, + "learning_rate": 2.600132490827558e-06, + "loss": 3.1378, + "step": 87165 + }, + { + "epoch": 5.92267971191738, + "grad_norm": 9.048321723937988, + "learning_rate": 2.5997078407392314e-06, + "loss": 2.9407, + "step": 87170 + }, + { + "epoch": 5.923019431988042, + "grad_norm": 7.334078311920166, + "learning_rate": 2.599283190650904e-06, + "loss": 2.7126, + "step": 87175 + }, + { + "epoch": 5.923359152058704, + "grad_norm": 7.688469409942627, + "learning_rate": 2.5988585405625766e-06, + "loss": 2.911, + "step": 87180 + }, + { + "epoch": 5.923698872129365, + "grad_norm": 6.758007526397705, + "learning_rate": 2.5984338904742494e-06, + "loss": 2.8505, + "step": 87185 + }, + { + "epoch": 5.924038592200027, + "grad_norm": 7.145202159881592, + "learning_rate": 2.5980092403859218e-06, + "loss": 2.9073, + "step": 87190 + }, + { + "epoch": 5.924378312270689, + "grad_norm": 7.196474075317383, + "learning_rate": 2.597584590297595e-06, + "loss": 3.0173, + "step": 87195 + }, + { + "epoch": 5.92471803234135, + "grad_norm": 6.197676658630371, + "learning_rate": 2.597159940209268e-06, + "loss": 2.9557, + "step": 87200 + }, + { + "epoch": 5.9250577524120125, + "grad_norm": 7.000977993011475, + "learning_rate": 2.596735290120941e-06, + "loss": 2.6547, + "step": 87205 + }, + { + "epoch": 5.925397472482675, + "grad_norm": 9.743568420410156, + "learning_rate": 2.5963106400326134e-06, + "loss": 2.7597, + "step": 87210 + }, + { + "epoch": 5.925737192553336, + "grad_norm": 8.560778617858887, + "learning_rate": 2.595885989944286e-06, + "loss": 2.8278, + "step": 87215 + }, + { + "epoch": 5.926076912623998, + "grad_norm": 6.977264881134033, + "learning_rate": 2.595461339855959e-06, + "loss": 2.7834, + "step": 87220 + }, + { + "epoch": 5.92641663269466, + "grad_norm": 7.390051364898682, + "learning_rate": 2.5950366897676314e-06, + "loss": 2.6697, + "step": 87225 + }, + { + "epoch": 5.926756352765321, + "grad_norm": 8.301827430725098, + "learning_rate": 2.5946120396793046e-06, + "loss": 2.8697, + "step": 87230 + }, + { + "epoch": 5.927096072835983, + "grad_norm": 6.37503719329834, + "learning_rate": 2.5941873895909774e-06, + "loss": 2.902, + "step": 87235 + }, + { + "epoch": 5.927435792906645, + "grad_norm": 6.341259479522705, + "learning_rate": 2.5937627395026498e-06, + "loss": 2.7157, + "step": 87240 + }, + { + "epoch": 5.927775512977306, + "grad_norm": 6.904914379119873, + "learning_rate": 2.593338089414323e-06, + "loss": 2.4217, + "step": 87245 + }, + { + "epoch": 5.9281152330479685, + "grad_norm": 7.78179407119751, + "learning_rate": 2.592913439325996e-06, + "loss": 2.891, + "step": 87250 + }, + { + "epoch": 5.928454953118631, + "grad_norm": 8.23275375366211, + "learning_rate": 2.592488789237668e-06, + "loss": 2.6682, + "step": 87255 + }, + { + "epoch": 5.928794673189292, + "grad_norm": 10.001775741577148, + "learning_rate": 2.592064139149341e-06, + "loss": 2.7636, + "step": 87260 + }, + { + "epoch": 5.929134393259954, + "grad_norm": 11.473904609680176, + "learning_rate": 2.591639489061014e-06, + "loss": 2.5679, + "step": 87265 + }, + { + "epoch": 5.929474113330616, + "grad_norm": 7.874361991882324, + "learning_rate": 2.5912148389726866e-06, + "loss": 2.9713, + "step": 87270 + }, + { + "epoch": 5.929813833401277, + "grad_norm": 8.055327415466309, + "learning_rate": 2.5907901888843594e-06, + "loss": 2.6709, + "step": 87275 + }, + { + "epoch": 5.930153553471939, + "grad_norm": 6.959315776824951, + "learning_rate": 2.5903655387960326e-06, + "loss": 2.7014, + "step": 87280 + }, + { + "epoch": 5.930493273542601, + "grad_norm": 6.778260231018066, + "learning_rate": 2.589940888707705e-06, + "loss": 2.8365, + "step": 87285 + }, + { + "epoch": 5.930832993613262, + "grad_norm": 6.931142330169678, + "learning_rate": 2.5895162386193778e-06, + "loss": 2.9256, + "step": 87290 + }, + { + "epoch": 5.9311727136839245, + "grad_norm": 6.66252326965332, + "learning_rate": 2.589091588531051e-06, + "loss": 2.7863, + "step": 87295 + }, + { + "epoch": 5.931512433754587, + "grad_norm": 8.444536209106445, + "learning_rate": 2.5886669384427234e-06, + "loss": 2.7135, + "step": 87300 + }, + { + "epoch": 5.931852153825248, + "grad_norm": 7.832890033721924, + "learning_rate": 2.588242288354396e-06, + "loss": 2.8785, + "step": 87305 + }, + { + "epoch": 5.93219187389591, + "grad_norm": 8.008363723754883, + "learning_rate": 2.587817638266069e-06, + "loss": 2.7654, + "step": 87310 + }, + { + "epoch": 5.932531593966572, + "grad_norm": 7.371170997619629, + "learning_rate": 2.5873929881777414e-06, + "loss": 2.8307, + "step": 87315 + }, + { + "epoch": 5.932871314037233, + "grad_norm": 5.6425275802612305, + "learning_rate": 2.5869683380894146e-06, + "loss": 2.8108, + "step": 87320 + }, + { + "epoch": 5.933211034107895, + "grad_norm": 7.004494667053223, + "learning_rate": 2.5866286180187527e-06, + "loss": 2.8948, + "step": 87325 + }, + { + "epoch": 5.933550754178556, + "grad_norm": 8.692717552185059, + "learning_rate": 2.586203967930426e-06, + "loss": 2.8235, + "step": 87330 + }, + { + "epoch": 5.9338904742492184, + "grad_norm": 8.719755172729492, + "learning_rate": 2.5857793178420983e-06, + "loss": 3.0009, + "step": 87335 + }, + { + "epoch": 5.9342301943198805, + "grad_norm": 6.7700419425964355, + "learning_rate": 2.585354667753771e-06, + "loss": 3.0455, + "step": 87340 + }, + { + "epoch": 5.934569914390542, + "grad_norm": 6.63473653793335, + "learning_rate": 2.584930017665444e-06, + "loss": 2.7674, + "step": 87345 + }, + { + "epoch": 5.934909634461204, + "grad_norm": 8.376060485839844, + "learning_rate": 2.5845053675771167e-06, + "loss": 2.8484, + "step": 87350 + }, + { + "epoch": 5.935249354531866, + "grad_norm": 7.398067951202393, + "learning_rate": 2.5840807174887895e-06, + "loss": 2.8028, + "step": 87355 + }, + { + "epoch": 5.935589074602527, + "grad_norm": 8.589723587036133, + "learning_rate": 2.5836560674004623e-06, + "loss": 2.8655, + "step": 87360 + }, + { + "epoch": 5.935928794673189, + "grad_norm": 6.194755554199219, + "learning_rate": 2.5832314173121346e-06, + "loss": 2.6127, + "step": 87365 + }, + { + "epoch": 5.936268514743851, + "grad_norm": 6.738603115081787, + "learning_rate": 2.582806767223808e-06, + "loss": 2.7605, + "step": 87370 + }, + { + "epoch": 5.936608234814512, + "grad_norm": 7.25214147567749, + "learning_rate": 2.5823821171354807e-06, + "loss": 2.4964, + "step": 87375 + }, + { + "epoch": 5.9369479548851745, + "grad_norm": 8.031086921691895, + "learning_rate": 2.581957467047153e-06, + "loss": 2.7122, + "step": 87380 + }, + { + "epoch": 5.9372876749558365, + "grad_norm": 9.319350242614746, + "learning_rate": 2.5815328169588263e-06, + "loss": 2.8554, + "step": 87385 + }, + { + "epoch": 5.937627395026498, + "grad_norm": 8.931585311889648, + "learning_rate": 2.581108166870499e-06, + "loss": 2.651, + "step": 87390 + }, + { + "epoch": 5.93796711509716, + "grad_norm": 6.794182300567627, + "learning_rate": 2.5806835167821714e-06, + "loss": 2.4834, + "step": 87395 + }, + { + "epoch": 5.938306835167822, + "grad_norm": 6.575504302978516, + "learning_rate": 2.5802588666938442e-06, + "loss": 2.4456, + "step": 87400 + }, + { + "epoch": 5.938646555238483, + "grad_norm": 9.44399642944336, + "learning_rate": 2.5798342166055175e-06, + "loss": 2.7816, + "step": 87405 + }, + { + "epoch": 5.938986275309145, + "grad_norm": 9.0089750289917, + "learning_rate": 2.5794095665171903e-06, + "loss": 2.8746, + "step": 87410 + }, + { + "epoch": 5.939325995379807, + "grad_norm": 7.996983528137207, + "learning_rate": 2.5789849164288626e-06, + "loss": 2.5803, + "step": 87415 + }, + { + "epoch": 5.939665715450468, + "grad_norm": 6.7056403160095215, + "learning_rate": 2.578560266340536e-06, + "loss": 2.5692, + "step": 87420 + }, + { + "epoch": 5.9400054355211305, + "grad_norm": 6.759990692138672, + "learning_rate": 2.5781356162522087e-06, + "loss": 2.6338, + "step": 87425 + }, + { + "epoch": 5.9403451555917925, + "grad_norm": 5.302646160125732, + "learning_rate": 2.577710966163881e-06, + "loss": 2.5927, + "step": 87430 + }, + { + "epoch": 5.940684875662454, + "grad_norm": 7.331327438354492, + "learning_rate": 2.577286316075554e-06, + "loss": 2.8628, + "step": 87435 + }, + { + "epoch": 5.941024595733116, + "grad_norm": 7.6486897468566895, + "learning_rate": 2.576861665987227e-06, + "loss": 2.7249, + "step": 87440 + }, + { + "epoch": 5.941364315803778, + "grad_norm": 7.730722904205322, + "learning_rate": 2.5764370158988995e-06, + "loss": 2.7815, + "step": 87445 + }, + { + "epoch": 5.941704035874439, + "grad_norm": 6.82385778427124, + "learning_rate": 2.5760123658105723e-06, + "loss": 2.8635, + "step": 87450 + }, + { + "epoch": 5.942043755945101, + "grad_norm": 7.324700355529785, + "learning_rate": 2.5755877157222455e-06, + "loss": 2.7957, + "step": 87455 + }, + { + "epoch": 5.942383476015763, + "grad_norm": 7.454965114593506, + "learning_rate": 2.575163065633918e-06, + "loss": 2.8139, + "step": 87460 + }, + { + "epoch": 5.942723196086424, + "grad_norm": 8.160391807556152, + "learning_rate": 2.5747384155455907e-06, + "loss": 2.6477, + "step": 87465 + }, + { + "epoch": 5.9430629161570865, + "grad_norm": 6.859951496124268, + "learning_rate": 2.5743137654572635e-06, + "loss": 2.6704, + "step": 87470 + }, + { + "epoch": 5.943402636227749, + "grad_norm": 7.9384589195251465, + "learning_rate": 2.573889115368936e-06, + "loss": 2.8857, + "step": 87475 + }, + { + "epoch": 5.94374235629841, + "grad_norm": 7.350348472595215, + "learning_rate": 2.573464465280609e-06, + "loss": 2.9511, + "step": 87480 + }, + { + "epoch": 5.944082076369072, + "grad_norm": 6.684255599975586, + "learning_rate": 2.573039815192282e-06, + "loss": 2.8638, + "step": 87485 + }, + { + "epoch": 5.944421796439734, + "grad_norm": 6.967676639556885, + "learning_rate": 2.5726151651039542e-06, + "loss": 2.699, + "step": 87490 + }, + { + "epoch": 5.944761516510395, + "grad_norm": 8.187925338745117, + "learning_rate": 2.5721905150156275e-06, + "loss": 2.6256, + "step": 87495 + }, + { + "epoch": 5.945101236581057, + "grad_norm": 9.48575496673584, + "learning_rate": 2.5717658649273003e-06, + "loss": 2.8172, + "step": 87500 + }, + { + "epoch": 5.945440956651719, + "grad_norm": 6.793229103088379, + "learning_rate": 2.5713412148389726e-06, + "loss": 2.8333, + "step": 87505 + }, + { + "epoch": 5.94578067672238, + "grad_norm": 6.615182399749756, + "learning_rate": 2.570916564750646e-06, + "loss": 2.6904, + "step": 87510 + }, + { + "epoch": 5.9461203967930425, + "grad_norm": 7.287671089172363, + "learning_rate": 2.5704919146623187e-06, + "loss": 2.975, + "step": 87515 + }, + { + "epoch": 5.946460116863705, + "grad_norm": 8.84032154083252, + "learning_rate": 2.570067264573991e-06, + "loss": 2.9164, + "step": 87520 + }, + { + "epoch": 5.946799836934366, + "grad_norm": 6.69686222076416, + "learning_rate": 2.569642614485664e-06, + "loss": 2.9655, + "step": 87525 + }, + { + "epoch": 5.947139557005028, + "grad_norm": 7.760894298553467, + "learning_rate": 2.569217964397337e-06, + "loss": 2.5365, + "step": 87530 + }, + { + "epoch": 5.94747927707569, + "grad_norm": 7.784102439880371, + "learning_rate": 2.5687933143090094e-06, + "loss": 3.0066, + "step": 87535 + }, + { + "epoch": 5.947818997146351, + "grad_norm": 6.942320823669434, + "learning_rate": 2.5683686642206822e-06, + "loss": 2.7188, + "step": 87540 + }, + { + "epoch": 5.948158717217013, + "grad_norm": 8.587801933288574, + "learning_rate": 2.5679440141323555e-06, + "loss": 2.6204, + "step": 87545 + }, + { + "epoch": 5.948498437287675, + "grad_norm": 6.938898086547852, + "learning_rate": 2.567519364044028e-06, + "loss": 2.659, + "step": 87550 + }, + { + "epoch": 5.948838157358336, + "grad_norm": 9.237525939941406, + "learning_rate": 2.5670947139557006e-06, + "loss": 3.0965, + "step": 87555 + }, + { + "epoch": 5.9491778774289985, + "grad_norm": 5.631956577301025, + "learning_rate": 2.5666700638673734e-06, + "loss": 2.6336, + "step": 87560 + }, + { + "epoch": 5.949517597499661, + "grad_norm": 7.46579122543335, + "learning_rate": 2.566245413779046e-06, + "loss": 2.8275, + "step": 87565 + }, + { + "epoch": 5.949857317570322, + "grad_norm": 7.914516925811768, + "learning_rate": 2.565820763690719e-06, + "loss": 2.72, + "step": 87570 + }, + { + "epoch": 5.950197037640984, + "grad_norm": 7.449821949005127, + "learning_rate": 2.565396113602392e-06, + "loss": 2.6403, + "step": 87575 + }, + { + "epoch": 5.950536757711646, + "grad_norm": 9.171615600585938, + "learning_rate": 2.564971463514065e-06, + "loss": 2.8429, + "step": 87580 + }, + { + "epoch": 5.950876477782307, + "grad_norm": 7.250139236450195, + "learning_rate": 2.5645468134257375e-06, + "loss": 2.8796, + "step": 87585 + }, + { + "epoch": 5.951216197852969, + "grad_norm": 9.044633865356445, + "learning_rate": 2.5641221633374103e-06, + "loss": 2.637, + "step": 87590 + }, + { + "epoch": 5.951555917923631, + "grad_norm": 8.750799179077148, + "learning_rate": 2.563697513249083e-06, + "loss": 2.8302, + "step": 87595 + }, + { + "epoch": 5.951895637994292, + "grad_norm": 6.573577404022217, + "learning_rate": 2.5632728631607554e-06, + "loss": 2.7494, + "step": 87600 + }, + { + "epoch": 5.9522353580649545, + "grad_norm": 10.24917221069336, + "learning_rate": 2.5628482130724287e-06, + "loss": 2.9448, + "step": 87605 + }, + { + "epoch": 5.952575078135617, + "grad_norm": 8.144367218017578, + "learning_rate": 2.5624235629841015e-06, + "loss": 2.9239, + "step": 87610 + }, + { + "epoch": 5.952914798206278, + "grad_norm": 7.345460414886475, + "learning_rate": 2.561998912895774e-06, + "loss": 2.979, + "step": 87615 + }, + { + "epoch": 5.95325451827694, + "grad_norm": 7.822276592254639, + "learning_rate": 2.561574262807447e-06, + "loss": 2.7171, + "step": 87620 + }, + { + "epoch": 5.953594238347602, + "grad_norm": 7.207851409912109, + "learning_rate": 2.56114961271912e-06, + "loss": 2.6299, + "step": 87625 + }, + { + "epoch": 5.953933958418263, + "grad_norm": 7.206345081329346, + "learning_rate": 2.5607249626307922e-06, + "loss": 2.9607, + "step": 87630 + }, + { + "epoch": 5.954273678488925, + "grad_norm": 7.902287483215332, + "learning_rate": 2.5603003125424655e-06, + "loss": 2.719, + "step": 87635 + }, + { + "epoch": 5.954613398559587, + "grad_norm": 6.366586208343506, + "learning_rate": 2.5598756624541383e-06, + "loss": 2.6672, + "step": 87640 + }, + { + "epoch": 5.9549531186302485, + "grad_norm": 6.708835124969482, + "learning_rate": 2.5594510123658106e-06, + "loss": 2.777, + "step": 87645 + }, + { + "epoch": 5.9552928387009105, + "grad_norm": 7.222185134887695, + "learning_rate": 2.5590263622774834e-06, + "loss": 2.8758, + "step": 87650 + }, + { + "epoch": 5.955632558771573, + "grad_norm": 8.132079124450684, + "learning_rate": 2.5586017121891567e-06, + "loss": 2.7883, + "step": 87655 + }, + { + "epoch": 5.955972278842234, + "grad_norm": 6.692577838897705, + "learning_rate": 2.558177062100829e-06, + "loss": 2.8428, + "step": 87660 + }, + { + "epoch": 5.956311998912896, + "grad_norm": 10.285198211669922, + "learning_rate": 2.557752412012502e-06, + "loss": 2.8612, + "step": 87665 + }, + { + "epoch": 5.956651718983558, + "grad_norm": 8.497172355651855, + "learning_rate": 2.557327761924175e-06, + "loss": 2.7785, + "step": 87670 + }, + { + "epoch": 5.956991439054219, + "grad_norm": 7.369865894317627, + "learning_rate": 2.5569031118358474e-06, + "loss": 2.8214, + "step": 87675 + }, + { + "epoch": 5.957331159124881, + "grad_norm": 7.321639537811279, + "learning_rate": 2.5564784617475202e-06, + "loss": 2.9614, + "step": 87680 + }, + { + "epoch": 5.957670879195543, + "grad_norm": 6.630091190338135, + "learning_rate": 2.556053811659193e-06, + "loss": 2.7143, + "step": 87685 + }, + { + "epoch": 5.9580105992662045, + "grad_norm": 6.038715839385986, + "learning_rate": 2.5556291615708654e-06, + "loss": 2.7227, + "step": 87690 + }, + { + "epoch": 5.9583503193368665, + "grad_norm": 7.33322811126709, + "learning_rate": 2.5552045114825386e-06, + "loss": 2.7026, + "step": 87695 + }, + { + "epoch": 5.958690039407529, + "grad_norm": 7.859006881713867, + "learning_rate": 2.5547798613942114e-06, + "loss": 2.9935, + "step": 87700 + }, + { + "epoch": 5.95902975947819, + "grad_norm": 6.769540309906006, + "learning_rate": 2.554355211305884e-06, + "loss": 3.0241, + "step": 87705 + }, + { + "epoch": 5.959369479548852, + "grad_norm": 7.831155776977539, + "learning_rate": 2.553930561217557e-06, + "loss": 2.7366, + "step": 87710 + }, + { + "epoch": 5.959709199619514, + "grad_norm": 6.481639385223389, + "learning_rate": 2.55350591112923e-06, + "loss": 2.733, + "step": 87715 + }, + { + "epoch": 5.960048919690175, + "grad_norm": 7.195981502532959, + "learning_rate": 2.5530812610409022e-06, + "loss": 2.8388, + "step": 87720 + }, + { + "epoch": 5.960388639760837, + "grad_norm": 7.158052444458008, + "learning_rate": 2.552656610952575e-06, + "loss": 2.8656, + "step": 87725 + }, + { + "epoch": 5.960728359831499, + "grad_norm": 7.813359260559082, + "learning_rate": 2.5522319608642483e-06, + "loss": 3.0622, + "step": 87730 + }, + { + "epoch": 5.9610680799021605, + "grad_norm": 7.425779819488525, + "learning_rate": 2.5518073107759206e-06, + "loss": 2.9899, + "step": 87735 + }, + { + "epoch": 5.9614077999728226, + "grad_norm": 7.206847190856934, + "learning_rate": 2.5513826606875934e-06, + "loss": 2.7859, + "step": 87740 + }, + { + "epoch": 5.961747520043485, + "grad_norm": 7.736489295959473, + "learning_rate": 2.5509580105992667e-06, + "loss": 2.5219, + "step": 87745 + }, + { + "epoch": 5.962087240114146, + "grad_norm": 7.974602699279785, + "learning_rate": 2.5505333605109395e-06, + "loss": 3.0562, + "step": 87750 + }, + { + "epoch": 5.962426960184808, + "grad_norm": 6.746820449829102, + "learning_rate": 2.550108710422612e-06, + "loss": 2.8138, + "step": 87755 + }, + { + "epoch": 5.96276668025547, + "grad_norm": 7.83573055267334, + "learning_rate": 2.5496840603342846e-06, + "loss": 2.7908, + "step": 87760 + }, + { + "epoch": 5.963106400326131, + "grad_norm": 6.961240768432617, + "learning_rate": 2.549259410245958e-06, + "loss": 3.0685, + "step": 87765 + }, + { + "epoch": 5.963446120396793, + "grad_norm": 9.053071022033691, + "learning_rate": 2.5488347601576302e-06, + "loss": 3.0782, + "step": 87770 + }, + { + "epoch": 5.963785840467455, + "grad_norm": 8.86628532409668, + "learning_rate": 2.548410110069303e-06, + "loss": 2.745, + "step": 87775 + }, + { + "epoch": 5.9641255605381165, + "grad_norm": 6.302230358123779, + "learning_rate": 2.5479854599809763e-06, + "loss": 2.8079, + "step": 87780 + }, + { + "epoch": 5.964465280608779, + "grad_norm": 7.877472877502441, + "learning_rate": 2.5475608098926486e-06, + "loss": 2.9011, + "step": 87785 + }, + { + "epoch": 5.964805000679441, + "grad_norm": 8.355131149291992, + "learning_rate": 2.5471361598043214e-06, + "loss": 2.9078, + "step": 87790 + }, + { + "epoch": 5.965144720750102, + "grad_norm": 6.432174205780029, + "learning_rate": 2.5467115097159947e-06, + "loss": 2.4597, + "step": 87795 + }, + { + "epoch": 5.965484440820764, + "grad_norm": 8.145868301391602, + "learning_rate": 2.546286859627667e-06, + "loss": 2.9514, + "step": 87800 + }, + { + "epoch": 5.965824160891425, + "grad_norm": 7.939833164215088, + "learning_rate": 2.54586220953934e-06, + "loss": 2.6239, + "step": 87805 + }, + { + "epoch": 5.966163880962087, + "grad_norm": 7.455596446990967, + "learning_rate": 2.5454375594510126e-06, + "loss": 2.8787, + "step": 87810 + }, + { + "epoch": 5.966503601032749, + "grad_norm": 7.419914722442627, + "learning_rate": 2.545012909362685e-06, + "loss": 3.0946, + "step": 87815 + }, + { + "epoch": 5.96684332110341, + "grad_norm": 7.0004963874816895, + "learning_rate": 2.5445882592743582e-06, + "loss": 2.776, + "step": 87820 + }, + { + "epoch": 5.9671830411740725, + "grad_norm": 6.654152870178223, + "learning_rate": 2.544163609186031e-06, + "loss": 2.8831, + "step": 87825 + }, + { + "epoch": 5.967522761244735, + "grad_norm": 8.909355163574219, + "learning_rate": 2.5437389590977034e-06, + "loss": 3.0105, + "step": 87830 + }, + { + "epoch": 5.967862481315396, + "grad_norm": 8.754937171936035, + "learning_rate": 2.5433143090093766e-06, + "loss": 2.9159, + "step": 87835 + }, + { + "epoch": 5.968202201386058, + "grad_norm": 8.015571594238281, + "learning_rate": 2.5428896589210494e-06, + "loss": 2.7204, + "step": 87840 + }, + { + "epoch": 5.96854192145672, + "grad_norm": 6.428354263305664, + "learning_rate": 2.542465008832722e-06, + "loss": 2.8429, + "step": 87845 + }, + { + "epoch": 5.968881641527381, + "grad_norm": 7.41081428527832, + "learning_rate": 2.5420403587443946e-06, + "loss": 2.7528, + "step": 87850 + }, + { + "epoch": 5.969221361598043, + "grad_norm": 7.685995578765869, + "learning_rate": 2.541615708656068e-06, + "loss": 2.9531, + "step": 87855 + }, + { + "epoch": 5.969561081668705, + "grad_norm": 8.1915283203125, + "learning_rate": 2.5411910585677402e-06, + "loss": 2.7461, + "step": 87860 + }, + { + "epoch": 5.969900801739366, + "grad_norm": 6.366104602813721, + "learning_rate": 2.540766408479413e-06, + "loss": 2.8789, + "step": 87865 + }, + { + "epoch": 5.9702405218100285, + "grad_norm": 7.776057243347168, + "learning_rate": 2.5403417583910862e-06, + "loss": 2.8229, + "step": 87870 + }, + { + "epoch": 5.970580241880691, + "grad_norm": 8.54426097869873, + "learning_rate": 2.5399171083027586e-06, + "loss": 2.6494, + "step": 87875 + }, + { + "epoch": 5.970919961951352, + "grad_norm": 6.512936592102051, + "learning_rate": 2.5394924582144314e-06, + "loss": 2.7106, + "step": 87880 + }, + { + "epoch": 5.971259682022014, + "grad_norm": 6.416604995727539, + "learning_rate": 2.5390678081261042e-06, + "loss": 2.9385, + "step": 87885 + }, + { + "epoch": 5.971599402092676, + "grad_norm": 8.445387840270996, + "learning_rate": 2.538643158037777e-06, + "loss": 2.5895, + "step": 87890 + }, + { + "epoch": 5.971939122163337, + "grad_norm": 7.308071613311768, + "learning_rate": 2.53821850794945e-06, + "loss": 2.9305, + "step": 87895 + }, + { + "epoch": 5.972278842233999, + "grad_norm": 6.959143161773682, + "learning_rate": 2.5377938578611226e-06, + "loss": 2.5981, + "step": 87900 + }, + { + "epoch": 5.972618562304661, + "grad_norm": 6.180930137634277, + "learning_rate": 2.537369207772795e-06, + "loss": 2.5755, + "step": 87905 + }, + { + "epoch": 5.9729582823753224, + "grad_norm": 8.040555953979492, + "learning_rate": 2.5369445576844682e-06, + "loss": 2.997, + "step": 87910 + }, + { + "epoch": 5.9732980024459845, + "grad_norm": 6.468486309051514, + "learning_rate": 2.536519907596141e-06, + "loss": 2.8909, + "step": 87915 + }, + { + "epoch": 5.973637722516647, + "grad_norm": 8.34212589263916, + "learning_rate": 2.5360952575078143e-06, + "loss": 2.7875, + "step": 87920 + }, + { + "epoch": 5.973977442587308, + "grad_norm": 7.936434745788574, + "learning_rate": 2.5356706074194866e-06, + "loss": 2.711, + "step": 87925 + }, + { + "epoch": 5.97431716265797, + "grad_norm": 5.898910999298096, + "learning_rate": 2.5352459573311594e-06, + "loss": 2.7545, + "step": 87930 + }, + { + "epoch": 5.974656882728632, + "grad_norm": 8.132671356201172, + "learning_rate": 2.5348213072428322e-06, + "loss": 2.834, + "step": 87935 + }, + { + "epoch": 5.974996602799293, + "grad_norm": 6.713802814483643, + "learning_rate": 2.5343966571545046e-06, + "loss": 2.5835, + "step": 87940 + }, + { + "epoch": 5.975336322869955, + "grad_norm": 6.584664344787598, + "learning_rate": 2.533972007066178e-06, + "loss": 2.9257, + "step": 87945 + }, + { + "epoch": 5.975676042940617, + "grad_norm": 7.171813488006592, + "learning_rate": 2.5335473569778506e-06, + "loss": 2.6118, + "step": 87950 + }, + { + "epoch": 5.9760157630112785, + "grad_norm": 6.959924697875977, + "learning_rate": 2.533122706889523e-06, + "loss": 2.6742, + "step": 87955 + }, + { + "epoch": 5.9763554830819405, + "grad_norm": 8.078673362731934, + "learning_rate": 2.5326980568011962e-06, + "loss": 2.7985, + "step": 87960 + }, + { + "epoch": 5.976695203152603, + "grad_norm": 6.255548000335693, + "learning_rate": 2.532273406712869e-06, + "loss": 2.621, + "step": 87965 + }, + { + "epoch": 5.977034923223264, + "grad_norm": 6.7446608543396, + "learning_rate": 2.5318487566245414e-06, + "loss": 2.9068, + "step": 87970 + }, + { + "epoch": 5.977374643293926, + "grad_norm": 6.458696365356445, + "learning_rate": 2.5314241065362142e-06, + "loss": 3.0085, + "step": 87975 + }, + { + "epoch": 5.977714363364588, + "grad_norm": 7.896036148071289, + "learning_rate": 2.5309994564478874e-06, + "loss": 2.8773, + "step": 87980 + }, + { + "epoch": 5.978054083435249, + "grad_norm": 6.186614513397217, + "learning_rate": 2.53057480635956e-06, + "loss": 2.9645, + "step": 87985 + }, + { + "epoch": 5.978393803505911, + "grad_norm": 8.726489067077637, + "learning_rate": 2.5301501562712326e-06, + "loss": 2.6153, + "step": 87990 + }, + { + "epoch": 5.978733523576573, + "grad_norm": 7.552318096160889, + "learning_rate": 2.529725506182906e-06, + "loss": 2.9442, + "step": 87995 + }, + { + "epoch": 5.9790732436472345, + "grad_norm": 8.259123802185059, + "learning_rate": 2.5293008560945782e-06, + "loss": 2.7137, + "step": 88000 + }, + { + "epoch": 5.9794129637178965, + "grad_norm": 8.633614540100098, + "learning_rate": 2.528876206006251e-06, + "loss": 2.7722, + "step": 88005 + }, + { + "epoch": 5.979752683788558, + "grad_norm": 7.027446746826172, + "learning_rate": 2.528451555917924e-06, + "loss": 2.5789, + "step": 88010 + }, + { + "epoch": 5.98009240385922, + "grad_norm": 9.685215950012207, + "learning_rate": 2.5280269058295966e-06, + "loss": 2.6588, + "step": 88015 + }, + { + "epoch": 5.980432123929882, + "grad_norm": 6.890354156494141, + "learning_rate": 2.5276022557412694e-06, + "loss": 2.8275, + "step": 88020 + }, + { + "epoch": 5.980771844000543, + "grad_norm": 7.941444396972656, + "learning_rate": 2.5271776056529422e-06, + "loss": 2.7347, + "step": 88025 + }, + { + "epoch": 5.981111564071205, + "grad_norm": 7.868041515350342, + "learning_rate": 2.5267529555646146e-06, + "loss": 2.8885, + "step": 88030 + }, + { + "epoch": 5.981451284141867, + "grad_norm": 8.17994213104248, + "learning_rate": 2.526328305476288e-06, + "loss": 2.9477, + "step": 88035 + }, + { + "epoch": 5.981791004212528, + "grad_norm": 9.940924644470215, + "learning_rate": 2.5259036553879606e-06, + "loss": 2.9923, + "step": 88040 + }, + { + "epoch": 5.9821307242831905, + "grad_norm": 8.215929985046387, + "learning_rate": 2.525479005299633e-06, + "loss": 2.7111, + "step": 88045 + }, + { + "epoch": 5.982470444353853, + "grad_norm": 8.110418319702148, + "learning_rate": 2.5250543552113062e-06, + "loss": 2.8657, + "step": 88050 + }, + { + "epoch": 5.982810164424514, + "grad_norm": 7.976748943328857, + "learning_rate": 2.524629705122979e-06, + "loss": 2.7051, + "step": 88055 + }, + { + "epoch": 5.983149884495176, + "grad_norm": 6.302985191345215, + "learning_rate": 2.5242050550346514e-06, + "loss": 2.8173, + "step": 88060 + }, + { + "epoch": 5.983489604565838, + "grad_norm": 7.383005619049072, + "learning_rate": 2.523780404946324e-06, + "loss": 2.8587, + "step": 88065 + }, + { + "epoch": 5.983829324636499, + "grad_norm": 7.235894203186035, + "learning_rate": 2.5233557548579974e-06, + "loss": 2.9334, + "step": 88070 + }, + { + "epoch": 5.984169044707161, + "grad_norm": 7.7359490394592285, + "learning_rate": 2.52293110476967e-06, + "loss": 2.6117, + "step": 88075 + }, + { + "epoch": 5.984508764777823, + "grad_norm": 12.349215507507324, + "learning_rate": 2.5225064546813426e-06, + "loss": 3.0366, + "step": 88080 + }, + { + "epoch": 5.984848484848484, + "grad_norm": 9.135725975036621, + "learning_rate": 2.522081804593016e-06, + "loss": 2.5496, + "step": 88085 + }, + { + "epoch": 5.9851882049191465, + "grad_norm": 6.930065155029297, + "learning_rate": 2.5216571545046886e-06, + "loss": 2.8808, + "step": 88090 + }, + { + "epoch": 5.985527924989809, + "grad_norm": 7.574583530426025, + "learning_rate": 2.521232504416361e-06, + "loss": 2.6169, + "step": 88095 + }, + { + "epoch": 5.98586764506047, + "grad_norm": 7.713226795196533, + "learning_rate": 2.520807854328034e-06, + "loss": 2.8985, + "step": 88100 + }, + { + "epoch": 5.986207365131132, + "grad_norm": 5.635679721832275, + "learning_rate": 2.520383204239707e-06, + "loss": 2.7727, + "step": 88105 + }, + { + "epoch": 5.986547085201794, + "grad_norm": 8.168068885803223, + "learning_rate": 2.5199585541513794e-06, + "loss": 3.0387, + "step": 88110 + }, + { + "epoch": 5.986886805272455, + "grad_norm": 7.054158687591553, + "learning_rate": 2.5195339040630522e-06, + "loss": 2.9792, + "step": 88115 + }, + { + "epoch": 5.987226525343117, + "grad_norm": 6.879838466644287, + "learning_rate": 2.5191092539747254e-06, + "loss": 2.7276, + "step": 88120 + }, + { + "epoch": 5.987566245413779, + "grad_norm": 7.806295394897461, + "learning_rate": 2.518684603886398e-06, + "loss": 2.709, + "step": 88125 + }, + { + "epoch": 5.98790596548444, + "grad_norm": 8.975173950195312, + "learning_rate": 2.5182599537980706e-06, + "loss": 2.5003, + "step": 88130 + }, + { + "epoch": 5.9882456855551025, + "grad_norm": 8.031257629394531, + "learning_rate": 2.5178353037097434e-06, + "loss": 2.6817, + "step": 88135 + }, + { + "epoch": 5.988585405625765, + "grad_norm": 8.561695098876953, + "learning_rate": 2.5174106536214162e-06, + "loss": 2.7598, + "step": 88140 + }, + { + "epoch": 5.988925125696426, + "grad_norm": 7.231501579284668, + "learning_rate": 2.516986003533089e-06, + "loss": 2.7731, + "step": 88145 + }, + { + "epoch": 5.989264845767088, + "grad_norm": 7.666912078857422, + "learning_rate": 2.516561353444762e-06, + "loss": 2.7055, + "step": 88150 + }, + { + "epoch": 5.98960456583775, + "grad_norm": 7.889750003814697, + "learning_rate": 2.516136703356434e-06, + "loss": 2.8318, + "step": 88155 + }, + { + "epoch": 5.989944285908411, + "grad_norm": 7.961043357849121, + "learning_rate": 2.5157120532681074e-06, + "loss": 2.7591, + "step": 88160 + }, + { + "epoch": 5.990284005979073, + "grad_norm": 7.954765319824219, + "learning_rate": 2.5152874031797802e-06, + "loss": 2.7826, + "step": 88165 + }, + { + "epoch": 5.990623726049735, + "grad_norm": 10.193178176879883, + "learning_rate": 2.5148627530914526e-06, + "loss": 3.1592, + "step": 88170 + }, + { + "epoch": 5.990963446120396, + "grad_norm": 9.651201248168945, + "learning_rate": 2.514438103003126e-06, + "loss": 2.864, + "step": 88175 + }, + { + "epoch": 5.9913031661910585, + "grad_norm": 9.646783828735352, + "learning_rate": 2.5140134529147986e-06, + "loss": 3.0041, + "step": 88180 + }, + { + "epoch": 5.991642886261721, + "grad_norm": 7.4356513023376465, + "learning_rate": 2.513588802826471e-06, + "loss": 2.7498, + "step": 88185 + }, + { + "epoch": 5.991982606332382, + "grad_norm": 7.490082740783691, + "learning_rate": 2.513164152738144e-06, + "loss": 2.6949, + "step": 88190 + }, + { + "epoch": 5.992322326403044, + "grad_norm": 9.386964797973633, + "learning_rate": 2.512739502649817e-06, + "loss": 2.6588, + "step": 88195 + }, + { + "epoch": 5.992662046473706, + "grad_norm": 6.6324286460876465, + "learning_rate": 2.5123148525614894e-06, + "loss": 2.9532, + "step": 88200 + }, + { + "epoch": 5.993001766544367, + "grad_norm": 7.030721664428711, + "learning_rate": 2.511890202473162e-06, + "loss": 2.5954, + "step": 88205 + }, + { + "epoch": 5.993341486615029, + "grad_norm": 7.188068866729736, + "learning_rate": 2.5114655523848354e-06, + "loss": 2.9519, + "step": 88210 + }, + { + "epoch": 5.993681206685691, + "grad_norm": 7.0051679611206055, + "learning_rate": 2.511040902296508e-06, + "loss": 2.6946, + "step": 88215 + }, + { + "epoch": 5.9940209267563525, + "grad_norm": 6.942164421081543, + "learning_rate": 2.5106162522081806e-06, + "loss": 2.631, + "step": 88220 + }, + { + "epoch": 5.9943606468270145, + "grad_norm": 7.508433818817139, + "learning_rate": 2.5101916021198534e-06, + "loss": 2.9192, + "step": 88225 + }, + { + "epoch": 5.994700366897677, + "grad_norm": 10.251540184020996, + "learning_rate": 2.5097669520315258e-06, + "loss": 3.0173, + "step": 88230 + }, + { + "epoch": 5.995040086968338, + "grad_norm": 9.357657432556152, + "learning_rate": 2.509342301943199e-06, + "loss": 2.9795, + "step": 88235 + }, + { + "epoch": 5.995379807039, + "grad_norm": 7.8011627197265625, + "learning_rate": 2.508917651854872e-06, + "loss": 2.8993, + "step": 88240 + }, + { + "epoch": 5.995719527109662, + "grad_norm": 6.400810718536377, + "learning_rate": 2.508493001766544e-06, + "loss": 2.7499, + "step": 88245 + }, + { + "epoch": 5.996059247180323, + "grad_norm": 8.720054626464844, + "learning_rate": 2.5080683516782174e-06, + "loss": 2.8699, + "step": 88250 + }, + { + "epoch": 5.996398967250985, + "grad_norm": 6.850367546081543, + "learning_rate": 2.5076437015898902e-06, + "loss": 2.5662, + "step": 88255 + }, + { + "epoch": 5.996738687321647, + "grad_norm": 7.405206203460693, + "learning_rate": 2.507219051501563e-06, + "loss": 2.6674, + "step": 88260 + }, + { + "epoch": 5.9970784073923085, + "grad_norm": 9.42208194732666, + "learning_rate": 2.5067944014132354e-06, + "loss": 2.8811, + "step": 88265 + }, + { + "epoch": 5.9974181274629705, + "grad_norm": 8.750737190246582, + "learning_rate": 2.5063697513249086e-06, + "loss": 2.6028, + "step": 88270 + }, + { + "epoch": 5.997757847533633, + "grad_norm": 7.042572498321533, + "learning_rate": 2.5059451012365814e-06, + "loss": 2.6581, + "step": 88275 + }, + { + "epoch": 5.998097567604294, + "grad_norm": 7.584527969360352, + "learning_rate": 2.505520451148254e-06, + "loss": 2.8786, + "step": 88280 + }, + { + "epoch": 5.998437287674956, + "grad_norm": 7.068945407867432, + "learning_rate": 2.505095801059927e-06, + "loss": 2.8973, + "step": 88285 + }, + { + "epoch": 5.998777007745618, + "grad_norm": 7.569953918457031, + "learning_rate": 2.5046711509716e-06, + "loss": 2.7973, + "step": 88290 + }, + { + "epoch": 5.999116727816279, + "grad_norm": 6.7882513999938965, + "learning_rate": 2.504246500883272e-06, + "loss": 2.7075, + "step": 88295 + }, + { + "epoch": 5.999456447886941, + "grad_norm": 11.497030258178711, + "learning_rate": 2.5038218507949454e-06, + "loss": 2.9308, + "step": 88300 + }, + { + "epoch": 5.999796167957603, + "grad_norm": 10.550098419189453, + "learning_rate": 2.5033972007066182e-06, + "loss": 2.8748, + "step": 88305 + }, + { + "epoch": 6.0, + "eval_bertscore": { + "f1": 0.8402707553184798, + "precision": 0.8376392230286087, + "recall": 0.8437456374124959 + }, + "eval_bleu_4": 0.017613291685675762, + "eval_exact_match": 9.690861517588914e-05, + "eval_loss": 3.3355672359466553, + "eval_meteor": 0.10705072336109801, + "eval_rouge": { + "rouge1": 0.13454544114498396, + "rouge2": 0.01922666124280154, + "rougeL": 0.11203277503361059, + "rougeLsum": 0.11200767837018327 + }, + "eval_runtime": 1170.2671, + "eval_samples_per_second": 8.818, + "eval_steps_per_second": 1.102, + "step": 88308 + }, + { + "epoch": 6.0001358880282645, + "grad_norm": 6.801965713500977, + "learning_rate": 2.5029725506182906e-06, + "loss": 2.6469, + "step": 88310 + }, + { + "epoch": 6.0004756080989265, + "grad_norm": 7.295616149902344, + "learning_rate": 2.5025479005299634e-06, + "loss": 2.6086, + "step": 88315 + }, + { + "epoch": 6.000815328169589, + "grad_norm": 6.84285306930542, + "learning_rate": 2.5021232504416366e-06, + "loss": 2.7658, + "step": 88320 + }, + { + "epoch": 6.00115504824025, + "grad_norm": 7.472037315368652, + "learning_rate": 2.501698600353309e-06, + "loss": 2.5974, + "step": 88325 + }, + { + "epoch": 6.001494768310912, + "grad_norm": 6.188960552215576, + "learning_rate": 2.501273950264982e-06, + "loss": 2.5222, + "step": 88330 + }, + { + "epoch": 6.001834488381574, + "grad_norm": 6.628161907196045, + "learning_rate": 2.500849300176655e-06, + "loss": 2.6489, + "step": 88335 + }, + { + "epoch": 6.002174208452235, + "grad_norm": 8.119482040405273, + "learning_rate": 2.5004246500883274e-06, + "loss": 2.6995, + "step": 88340 + }, + { + "epoch": 6.002513928522897, + "grad_norm": 7.091310501098633, + "learning_rate": 2.5e-06, + "loss": 2.8933, + "step": 88345 + }, + { + "epoch": 6.002853648593559, + "grad_norm": 7.407850742340088, + "learning_rate": 2.499575349911673e-06, + "loss": 2.7819, + "step": 88350 + }, + { + "epoch": 6.0031933686642205, + "grad_norm": 8.815082550048828, + "learning_rate": 2.499150699823346e-06, + "loss": 2.7525, + "step": 88355 + }, + { + "epoch": 6.003533088734883, + "grad_norm": 5.823575019836426, + "learning_rate": 2.4987260497350186e-06, + "loss": 2.6311, + "step": 88360 + }, + { + "epoch": 6.003872808805545, + "grad_norm": 8.283552169799805, + "learning_rate": 2.4983013996466914e-06, + "loss": 2.484, + "step": 88365 + }, + { + "epoch": 6.004212528876206, + "grad_norm": 8.078027725219727, + "learning_rate": 2.497876749558364e-06, + "loss": 2.4745, + "step": 88370 + }, + { + "epoch": 6.004552248946868, + "grad_norm": 6.720069885253906, + "learning_rate": 2.497452099470037e-06, + "loss": 2.7375, + "step": 88375 + }, + { + "epoch": 6.00489196901753, + "grad_norm": 9.064099311828613, + "learning_rate": 2.4970274493817094e-06, + "loss": 2.6625, + "step": 88380 + }, + { + "epoch": 6.005231689088191, + "grad_norm": 8.503819465637207, + "learning_rate": 2.4966027992933826e-06, + "loss": 2.9277, + "step": 88385 + }, + { + "epoch": 6.005571409158853, + "grad_norm": 7.418428421020508, + "learning_rate": 2.496178149205055e-06, + "loss": 2.8008, + "step": 88390 + }, + { + "epoch": 6.005911129229515, + "grad_norm": 9.273445129394531, + "learning_rate": 2.4957534991167282e-06, + "loss": 2.7365, + "step": 88395 + }, + { + "epoch": 6.0062508493001765, + "grad_norm": 8.055441856384277, + "learning_rate": 2.495328849028401e-06, + "loss": 2.7823, + "step": 88400 + }, + { + "epoch": 6.006590569370839, + "grad_norm": 9.746804237365723, + "learning_rate": 2.4949041989400734e-06, + "loss": 2.8001, + "step": 88405 + }, + { + "epoch": 6.006930289441501, + "grad_norm": 7.091623783111572, + "learning_rate": 2.4944795488517466e-06, + "loss": 2.706, + "step": 88410 + }, + { + "epoch": 6.007270009512162, + "grad_norm": 7.974311828613281, + "learning_rate": 2.494054898763419e-06, + "loss": 2.8531, + "step": 88415 + }, + { + "epoch": 6.007609729582824, + "grad_norm": 7.292480945587158, + "learning_rate": 2.493630248675092e-06, + "loss": 2.7094, + "step": 88420 + }, + { + "epoch": 6.007949449653485, + "grad_norm": 11.942742347717285, + "learning_rate": 2.493205598586765e-06, + "loss": 2.6921, + "step": 88425 + }, + { + "epoch": 6.008289169724147, + "grad_norm": 9.577291488647461, + "learning_rate": 2.4927809484984374e-06, + "loss": 2.9362, + "step": 88430 + }, + { + "epoch": 6.008628889794809, + "grad_norm": 7.97474479675293, + "learning_rate": 2.49235629841011e-06, + "loss": 2.7119, + "step": 88435 + }, + { + "epoch": 6.00896860986547, + "grad_norm": 8.510894775390625, + "learning_rate": 2.491931648321783e-06, + "loss": 2.5512, + "step": 88440 + }, + { + "epoch": 6.0093083299361325, + "grad_norm": 9.992838859558105, + "learning_rate": 2.491506998233456e-06, + "loss": 2.9579, + "step": 88445 + }, + { + "epoch": 6.009648050006795, + "grad_norm": 7.557194232940674, + "learning_rate": 2.4910823481451286e-06, + "loss": 2.5883, + "step": 88450 + }, + { + "epoch": 6.009987770077456, + "grad_norm": 9.100587844848633, + "learning_rate": 2.4906576980568014e-06, + "loss": 2.6821, + "step": 88455 + }, + { + "epoch": 6.010327490148118, + "grad_norm": 8.177042961120605, + "learning_rate": 2.490233047968474e-06, + "loss": 2.553, + "step": 88460 + }, + { + "epoch": 6.01066721021878, + "grad_norm": 7.815145492553711, + "learning_rate": 2.489808397880147e-06, + "loss": 2.7677, + "step": 88465 + }, + { + "epoch": 6.011006930289441, + "grad_norm": 8.765454292297363, + "learning_rate": 2.48938374779182e-06, + "loss": 2.8414, + "step": 88470 + }, + { + "epoch": 6.011346650360103, + "grad_norm": 5.801558017730713, + "learning_rate": 2.4889590977034926e-06, + "loss": 2.9523, + "step": 88475 + }, + { + "epoch": 6.011686370430765, + "grad_norm": 8.85046100616455, + "learning_rate": 2.4885344476151654e-06, + "loss": 2.6302, + "step": 88480 + }, + { + "epoch": 6.0120260905014264, + "grad_norm": 8.764089584350586, + "learning_rate": 2.488109797526838e-06, + "loss": 2.6923, + "step": 88485 + }, + { + "epoch": 6.0123658105720885, + "grad_norm": 8.879746437072754, + "learning_rate": 2.487685147438511e-06, + "loss": 2.6871, + "step": 88490 + }, + { + "epoch": 6.012705530642751, + "grad_norm": 7.743130683898926, + "learning_rate": 2.487260497350184e-06, + "loss": 2.8405, + "step": 88495 + }, + { + "epoch": 6.013045250713412, + "grad_norm": 7.290804862976074, + "learning_rate": 2.4868358472618566e-06, + "loss": 2.6307, + "step": 88500 + }, + { + "epoch": 6.013384970784074, + "grad_norm": 8.808784484863281, + "learning_rate": 2.486411197173529e-06, + "loss": 2.8438, + "step": 88505 + }, + { + "epoch": 6.013724690854736, + "grad_norm": 8.174595832824707, + "learning_rate": 2.485986547085202e-06, + "loss": 2.7145, + "step": 88510 + }, + { + "epoch": 6.014064410925397, + "grad_norm": 9.235601425170898, + "learning_rate": 2.4855618969968746e-06, + "loss": 2.6796, + "step": 88515 + }, + { + "epoch": 6.014404130996059, + "grad_norm": 7.479829788208008, + "learning_rate": 2.4851372469085474e-06, + "loss": 2.7004, + "step": 88520 + }, + { + "epoch": 6.014743851066721, + "grad_norm": 8.587425231933594, + "learning_rate": 2.4847125968202206e-06, + "loss": 2.8015, + "step": 88525 + }, + { + "epoch": 6.0150835711373825, + "grad_norm": 9.448309898376465, + "learning_rate": 2.484287946731893e-06, + "loss": 2.8238, + "step": 88530 + }, + { + "epoch": 6.0154232912080445, + "grad_norm": 8.32510757446289, + "learning_rate": 2.4838632966435658e-06, + "loss": 2.824, + "step": 88535 + }, + { + "epoch": 6.015763011278707, + "grad_norm": 7.987407684326172, + "learning_rate": 2.4834386465552386e-06, + "loss": 2.5136, + "step": 88540 + }, + { + "epoch": 6.016102731349368, + "grad_norm": 6.943992614746094, + "learning_rate": 2.4830139964669114e-06, + "loss": 2.801, + "step": 88545 + }, + { + "epoch": 6.01644245142003, + "grad_norm": 8.190129280090332, + "learning_rate": 2.482589346378584e-06, + "loss": 2.7284, + "step": 88550 + }, + { + "epoch": 6.016782171490692, + "grad_norm": 9.11534309387207, + "learning_rate": 2.482164696290257e-06, + "loss": 2.5918, + "step": 88555 + }, + { + "epoch": 6.017121891561353, + "grad_norm": 8.41000747680664, + "learning_rate": 2.48174004620193e-06, + "loss": 2.5042, + "step": 88560 + }, + { + "epoch": 6.017461611632015, + "grad_norm": 7.861779689788818, + "learning_rate": 2.4813153961136026e-06, + "loss": 2.6607, + "step": 88565 + }, + { + "epoch": 6.017801331702677, + "grad_norm": 9.12961196899414, + "learning_rate": 2.4808907460252754e-06, + "loss": 2.7247, + "step": 88570 + }, + { + "epoch": 6.0181410517733385, + "grad_norm": 6.392255783081055, + "learning_rate": 2.480466095936948e-06, + "loss": 2.6552, + "step": 88575 + }, + { + "epoch": 6.0184807718440005, + "grad_norm": 7.191421031951904, + "learning_rate": 2.480041445848621e-06, + "loss": 2.8188, + "step": 88580 + }, + { + "epoch": 6.018820491914663, + "grad_norm": 6.187231063842773, + "learning_rate": 2.479616795760294e-06, + "loss": 2.6962, + "step": 88585 + }, + { + "epoch": 6.019160211985324, + "grad_norm": 8.287899017333984, + "learning_rate": 2.4791921456719666e-06, + "loss": 2.7564, + "step": 88590 + }, + { + "epoch": 6.019499932055986, + "grad_norm": 8.017489433288574, + "learning_rate": 2.4787674955836394e-06, + "loss": 2.4253, + "step": 88595 + }, + { + "epoch": 6.019839652126648, + "grad_norm": 7.593644618988037, + "learning_rate": 2.478342845495312e-06, + "loss": 2.8038, + "step": 88600 + }, + { + "epoch": 6.020179372197309, + "grad_norm": 7.962754726409912, + "learning_rate": 2.4779181954069846e-06, + "loss": 2.7532, + "step": 88605 + }, + { + "epoch": 6.020519092267971, + "grad_norm": 7.9266438484191895, + "learning_rate": 2.477493545318658e-06, + "loss": 2.7003, + "step": 88610 + }, + { + "epoch": 6.020858812338633, + "grad_norm": 7.408517360687256, + "learning_rate": 2.4770688952303306e-06, + "loss": 2.7275, + "step": 88615 + }, + { + "epoch": 6.0211985324092945, + "grad_norm": 7.609011173248291, + "learning_rate": 2.476644245142003e-06, + "loss": 2.6811, + "step": 88620 + }, + { + "epoch": 6.0215382524799566, + "grad_norm": 9.019559860229492, + "learning_rate": 2.476219595053676e-06, + "loss": 2.5684, + "step": 88625 + }, + { + "epoch": 6.021877972550619, + "grad_norm": 8.917839050292969, + "learning_rate": 2.4757949449653486e-06, + "loss": 2.7683, + "step": 88630 + }, + { + "epoch": 6.02221769262128, + "grad_norm": 6.61858606338501, + "learning_rate": 2.4753702948770214e-06, + "loss": 2.7059, + "step": 88635 + }, + { + "epoch": 6.022557412691942, + "grad_norm": 7.020137786865234, + "learning_rate": 2.474945644788694e-06, + "loss": 2.6639, + "step": 88640 + }, + { + "epoch": 6.022897132762604, + "grad_norm": 8.063943862915039, + "learning_rate": 2.474520994700367e-06, + "loss": 2.7701, + "step": 88645 + }, + { + "epoch": 6.023236852833265, + "grad_norm": 7.837402820587158, + "learning_rate": 2.47409634461204e-06, + "loss": 2.6072, + "step": 88650 + }, + { + "epoch": 6.023576572903927, + "grad_norm": 7.737195014953613, + "learning_rate": 2.4736716945237126e-06, + "loss": 2.6351, + "step": 88655 + }, + { + "epoch": 6.023916292974589, + "grad_norm": 8.653424263000488, + "learning_rate": 2.4732470444353854e-06, + "loss": 2.7918, + "step": 88660 + }, + { + "epoch": 6.0242560130452505, + "grad_norm": 6.787927627563477, + "learning_rate": 2.472822394347058e-06, + "loss": 2.5873, + "step": 88665 + }, + { + "epoch": 6.024595733115913, + "grad_norm": 7.169686794281006, + "learning_rate": 2.472397744258731e-06, + "loss": 2.9262, + "step": 88670 + }, + { + "epoch": 6.024935453186575, + "grad_norm": 6.940724849700928, + "learning_rate": 2.4719730941704038e-06, + "loss": 2.7804, + "step": 88675 + }, + { + "epoch": 6.025275173257236, + "grad_norm": 8.353888511657715, + "learning_rate": 2.4715484440820766e-06, + "loss": 2.62, + "step": 88680 + }, + { + "epoch": 6.025614893327898, + "grad_norm": 8.20220947265625, + "learning_rate": 2.4711237939937494e-06, + "loss": 2.6803, + "step": 88685 + }, + { + "epoch": 6.02595461339856, + "grad_norm": 9.588525772094727, + "learning_rate": 2.470699143905422e-06, + "loss": 2.8645, + "step": 88690 + }, + { + "epoch": 6.026294333469221, + "grad_norm": 6.487490653991699, + "learning_rate": 2.470274493817095e-06, + "loss": 2.473, + "step": 88695 + }, + { + "epoch": 6.026634053539883, + "grad_norm": 6.221311569213867, + "learning_rate": 2.469849843728768e-06, + "loss": 3.0593, + "step": 88700 + }, + { + "epoch": 6.026973773610545, + "grad_norm": 8.756589889526367, + "learning_rate": 2.46942519364044e-06, + "loss": 2.6184, + "step": 88705 + }, + { + "epoch": 6.0273134936812065, + "grad_norm": 8.738482475280762, + "learning_rate": 2.4690005435521134e-06, + "loss": 2.5866, + "step": 88710 + }, + { + "epoch": 6.027653213751869, + "grad_norm": 8.349427223205566, + "learning_rate": 2.468575893463786e-06, + "loss": 2.7995, + "step": 88715 + }, + { + "epoch": 6.027992933822531, + "grad_norm": 8.132102966308594, + "learning_rate": 2.4681512433754586e-06, + "loss": 2.7705, + "step": 88720 + }, + { + "epoch": 6.028332653893192, + "grad_norm": 7.703674793243408, + "learning_rate": 2.467726593287132e-06, + "loss": 2.8136, + "step": 88725 + }, + { + "epoch": 6.028672373963854, + "grad_norm": 7.070004463195801, + "learning_rate": 2.467301943198804e-06, + "loss": 2.9364, + "step": 88730 + }, + { + "epoch": 6.029012094034516, + "grad_norm": 6.820708751678467, + "learning_rate": 2.4668772931104774e-06, + "loss": 2.7186, + "step": 88735 + }, + { + "epoch": 6.029351814105177, + "grad_norm": 7.767928123474121, + "learning_rate": 2.46645264302215e-06, + "loss": 2.8404, + "step": 88740 + }, + { + "epoch": 6.029691534175839, + "grad_norm": 6.220415115356445, + "learning_rate": 2.4660279929338226e-06, + "loss": 2.7902, + "step": 88745 + }, + { + "epoch": 6.0300312542465, + "grad_norm": 8.68720817565918, + "learning_rate": 2.465603342845496e-06, + "loss": 2.6764, + "step": 88750 + }, + { + "epoch": 6.0303709743171625, + "grad_norm": 8.035892486572266, + "learning_rate": 2.465178692757168e-06, + "loss": 2.8637, + "step": 88755 + }, + { + "epoch": 6.030710694387825, + "grad_norm": 7.991364479064941, + "learning_rate": 2.464754042668841e-06, + "loss": 2.5051, + "step": 88760 + }, + { + "epoch": 6.031050414458486, + "grad_norm": 9.295421600341797, + "learning_rate": 2.4643293925805138e-06, + "loss": 2.739, + "step": 88765 + }, + { + "epoch": 6.031390134529148, + "grad_norm": 7.494598865509033, + "learning_rate": 2.4639047424921866e-06, + "loss": 2.4783, + "step": 88770 + }, + { + "epoch": 6.03172985459981, + "grad_norm": 6.181230068206787, + "learning_rate": 2.4634800924038594e-06, + "loss": 2.8339, + "step": 88775 + }, + { + "epoch": 6.032069574670471, + "grad_norm": 8.556110382080078, + "learning_rate": 2.463055442315532e-06, + "loss": 2.6692, + "step": 88780 + }, + { + "epoch": 6.032409294741133, + "grad_norm": 8.795641899108887, + "learning_rate": 2.462630792227205e-06, + "loss": 2.6327, + "step": 88785 + }, + { + "epoch": 6.032749014811795, + "grad_norm": 8.45124340057373, + "learning_rate": 2.4622061421388778e-06, + "loss": 2.7864, + "step": 88790 + }, + { + "epoch": 6.0330887348824564, + "grad_norm": 7.383288383483887, + "learning_rate": 2.4617814920505506e-06, + "loss": 2.8279, + "step": 88795 + }, + { + "epoch": 6.0334284549531185, + "grad_norm": 9.275744438171387, + "learning_rate": 2.4613568419622234e-06, + "loss": 2.7646, + "step": 88800 + }, + { + "epoch": 6.033768175023781, + "grad_norm": 9.223021507263184, + "learning_rate": 2.460932191873896e-06, + "loss": 2.7796, + "step": 88805 + }, + { + "epoch": 6.034107895094442, + "grad_norm": 7.979017734527588, + "learning_rate": 2.460507541785569e-06, + "loss": 2.832, + "step": 88810 + }, + { + "epoch": 6.034447615165104, + "grad_norm": 9.748626708984375, + "learning_rate": 2.4600828916972418e-06, + "loss": 2.4504, + "step": 88815 + }, + { + "epoch": 6.034787335235766, + "grad_norm": 8.739514350891113, + "learning_rate": 2.4596582416089146e-06, + "loss": 2.8515, + "step": 88820 + }, + { + "epoch": 6.035127055306427, + "grad_norm": 7.325338363647461, + "learning_rate": 2.4592335915205874e-06, + "loss": 2.7562, + "step": 88825 + }, + { + "epoch": 6.035466775377089, + "grad_norm": 7.325120449066162, + "learning_rate": 2.4588089414322598e-06, + "loss": 2.6454, + "step": 88830 + }, + { + "epoch": 6.035806495447751, + "grad_norm": 7.748314380645752, + "learning_rate": 2.458384291343933e-06, + "loss": 2.7347, + "step": 88835 + }, + { + "epoch": 6.0361462155184125, + "grad_norm": 6.964789390563965, + "learning_rate": 2.4579596412556058e-06, + "loss": 2.606, + "step": 88840 + }, + { + "epoch": 6.0364859355890745, + "grad_norm": 10.228455543518066, + "learning_rate": 2.457534991167278e-06, + "loss": 2.5013, + "step": 88845 + }, + { + "epoch": 6.036825655659737, + "grad_norm": 6.01151704788208, + "learning_rate": 2.4571103410789514e-06, + "loss": 2.6699, + "step": 88850 + }, + { + "epoch": 6.037165375730398, + "grad_norm": 6.6345648765563965, + "learning_rate": 2.4566856909906238e-06, + "loss": 2.7425, + "step": 88855 + }, + { + "epoch": 6.03750509580106, + "grad_norm": 7.2025861740112305, + "learning_rate": 2.4562610409022966e-06, + "loss": 3.1111, + "step": 88860 + }, + { + "epoch": 6.037844815871722, + "grad_norm": 8.44596004486084, + "learning_rate": 2.4558363908139694e-06, + "loss": 2.4214, + "step": 88865 + }, + { + "epoch": 6.038184535942383, + "grad_norm": 7.97368049621582, + "learning_rate": 2.455411740725642e-06, + "loss": 2.8931, + "step": 88870 + }, + { + "epoch": 6.038524256013045, + "grad_norm": 8.000532150268555, + "learning_rate": 2.454987090637315e-06, + "loss": 2.5985, + "step": 88875 + }, + { + "epoch": 6.038863976083707, + "grad_norm": 7.8562541007995605, + "learning_rate": 2.4545624405489878e-06, + "loss": 2.9907, + "step": 88880 + }, + { + "epoch": 6.0392036961543685, + "grad_norm": 8.453653335571289, + "learning_rate": 2.4541377904606606e-06, + "loss": 2.8632, + "step": 88885 + }, + { + "epoch": 6.0395434162250305, + "grad_norm": 6.97987174987793, + "learning_rate": 2.4537131403723334e-06, + "loss": 2.6999, + "step": 88890 + }, + { + "epoch": 6.039883136295693, + "grad_norm": 6.756584167480469, + "learning_rate": 2.453288490284006e-06, + "loss": 2.6023, + "step": 88895 + }, + { + "epoch": 6.040222856366354, + "grad_norm": 8.453326225280762, + "learning_rate": 2.452863840195679e-06, + "loss": 2.7867, + "step": 88900 + }, + { + "epoch": 6.040562576437016, + "grad_norm": 8.354803085327148, + "learning_rate": 2.4524391901073518e-06, + "loss": 2.7527, + "step": 88905 + }, + { + "epoch": 6.040902296507678, + "grad_norm": 10.012934684753418, + "learning_rate": 2.4520145400190246e-06, + "loss": 2.7435, + "step": 88910 + }, + { + "epoch": 6.041242016578339, + "grad_norm": 8.346677780151367, + "learning_rate": 2.4515898899306974e-06, + "loss": 2.6712, + "step": 88915 + }, + { + "epoch": 6.041581736649001, + "grad_norm": 10.986197471618652, + "learning_rate": 2.45116523984237e-06, + "loss": 2.6934, + "step": 88920 + }, + { + "epoch": 6.041921456719663, + "grad_norm": 7.47542142868042, + "learning_rate": 2.450740589754043e-06, + "loss": 2.5658, + "step": 88925 + }, + { + "epoch": 6.0422611767903245, + "grad_norm": 7.277143955230713, + "learning_rate": 2.4503159396657158e-06, + "loss": 2.818, + "step": 88930 + }, + { + "epoch": 6.042600896860987, + "grad_norm": 6.709305763244629, + "learning_rate": 2.4498912895773886e-06, + "loss": 3.0373, + "step": 88935 + }, + { + "epoch": 6.042940616931649, + "grad_norm": 7.647279262542725, + "learning_rate": 2.4494666394890614e-06, + "loss": 2.5446, + "step": 88940 + }, + { + "epoch": 6.04328033700231, + "grad_norm": 7.820634841918945, + "learning_rate": 2.4490419894007338e-06, + "loss": 2.6813, + "step": 88945 + }, + { + "epoch": 6.043620057072972, + "grad_norm": 6.919781684875488, + "learning_rate": 2.448617339312407e-06, + "loss": 2.8044, + "step": 88950 + }, + { + "epoch": 6.043959777143634, + "grad_norm": 8.629165649414062, + "learning_rate": 2.4481926892240794e-06, + "loss": 2.8953, + "step": 88955 + }, + { + "epoch": 6.044299497214295, + "grad_norm": 8.017291069030762, + "learning_rate": 2.447768039135752e-06, + "loss": 2.9597, + "step": 88960 + }, + { + "epoch": 6.044639217284957, + "grad_norm": 8.662763595581055, + "learning_rate": 2.4473433890474254e-06, + "loss": 2.7255, + "step": 88965 + }, + { + "epoch": 6.044978937355619, + "grad_norm": 8.512313842773438, + "learning_rate": 2.4469187389590978e-06, + "loss": 2.9157, + "step": 88970 + }, + { + "epoch": 6.0453186574262805, + "grad_norm": 7.047857761383057, + "learning_rate": 2.4464940888707706e-06, + "loss": 2.5928, + "step": 88975 + }, + { + "epoch": 6.045658377496943, + "grad_norm": 7.562466144561768, + "learning_rate": 2.4460694387824434e-06, + "loss": 2.7272, + "step": 88980 + }, + { + "epoch": 6.045998097567605, + "grad_norm": 7.386942386627197, + "learning_rate": 2.445644788694116e-06, + "loss": 2.7391, + "step": 88985 + }, + { + "epoch": 6.046337817638266, + "grad_norm": 7.803728103637695, + "learning_rate": 2.445220138605789e-06, + "loss": 2.5556, + "step": 88990 + }, + { + "epoch": 6.046677537708928, + "grad_norm": 9.107205390930176, + "learning_rate": 2.4447954885174618e-06, + "loss": 2.7831, + "step": 88995 + }, + { + "epoch": 6.04701725777959, + "grad_norm": 7.450016021728516, + "learning_rate": 2.4443708384291346e-06, + "loss": 2.4637, + "step": 89000 + }, + { + "epoch": 6.047356977850251, + "grad_norm": 8.460637092590332, + "learning_rate": 2.4439461883408074e-06, + "loss": 2.7047, + "step": 89005 + }, + { + "epoch": 6.047696697920913, + "grad_norm": 9.924635887145996, + "learning_rate": 2.44352153825248e-06, + "loss": 2.7473, + "step": 89010 + }, + { + "epoch": 6.048036417991575, + "grad_norm": 8.062000274658203, + "learning_rate": 2.443096888164153e-06, + "loss": 2.7139, + "step": 89015 + }, + { + "epoch": 6.0483761380622365, + "grad_norm": 6.120619297027588, + "learning_rate": 2.4426722380758258e-06, + "loss": 2.5647, + "step": 89020 + }, + { + "epoch": 6.048715858132899, + "grad_norm": 8.86256217956543, + "learning_rate": 2.4422475879874986e-06, + "loss": 2.6429, + "step": 89025 + }, + { + "epoch": 6.049055578203561, + "grad_norm": 6.887220859527588, + "learning_rate": 2.4418229378991714e-06, + "loss": 2.6221, + "step": 89030 + }, + { + "epoch": 6.049395298274222, + "grad_norm": 7.5417799949646, + "learning_rate": 2.441398287810844e-06, + "loss": 2.8758, + "step": 89035 + }, + { + "epoch": 6.049735018344884, + "grad_norm": 8.71093463897705, + "learning_rate": 2.440973637722517e-06, + "loss": 2.9322, + "step": 89040 + }, + { + "epoch": 6.050074738415546, + "grad_norm": 9.109854698181152, + "learning_rate": 2.4405489876341893e-06, + "loss": 2.789, + "step": 89045 + }, + { + "epoch": 6.050414458486207, + "grad_norm": 7.246347427368164, + "learning_rate": 2.4401243375458626e-06, + "loss": 2.7132, + "step": 89050 + }, + { + "epoch": 6.050754178556869, + "grad_norm": 8.14885425567627, + "learning_rate": 2.439699687457535e-06, + "loss": 2.8109, + "step": 89055 + }, + { + "epoch": 6.051093898627531, + "grad_norm": 7.744828701019287, + "learning_rate": 2.4392750373692078e-06, + "loss": 2.6924, + "step": 89060 + }, + { + "epoch": 6.0514336186981925, + "grad_norm": 8.569154739379883, + "learning_rate": 2.438850387280881e-06, + "loss": 2.795, + "step": 89065 + }, + { + "epoch": 6.051773338768855, + "grad_norm": 8.0256986618042, + "learning_rate": 2.4384257371925534e-06, + "loss": 2.8277, + "step": 89070 + }, + { + "epoch": 6.052113058839517, + "grad_norm": 8.176799774169922, + "learning_rate": 2.4380010871042266e-06, + "loss": 2.7064, + "step": 89075 + }, + { + "epoch": 6.052452778910178, + "grad_norm": 7.558905124664307, + "learning_rate": 2.437576437015899e-06, + "loss": 2.7787, + "step": 89080 + }, + { + "epoch": 6.05279249898084, + "grad_norm": 8.62056827545166, + "learning_rate": 2.4371517869275718e-06, + "loss": 2.5696, + "step": 89085 + }, + { + "epoch": 6.053132219051502, + "grad_norm": 7.028520107269287, + "learning_rate": 2.436727136839245e-06, + "loss": 2.7767, + "step": 89090 + }, + { + "epoch": 6.053471939122163, + "grad_norm": 7.326026439666748, + "learning_rate": 2.4363024867509174e-06, + "loss": 2.5975, + "step": 89095 + }, + { + "epoch": 6.053811659192825, + "grad_norm": 6.068175792694092, + "learning_rate": 2.43587783666259e-06, + "loss": 2.5585, + "step": 89100 + }, + { + "epoch": 6.0541513792634865, + "grad_norm": 7.040455341339111, + "learning_rate": 2.435453186574263e-06, + "loss": 2.755, + "step": 89105 + }, + { + "epoch": 6.0544910993341485, + "grad_norm": 6.530883312225342, + "learning_rate": 2.4350285364859358e-06, + "loss": 2.7441, + "step": 89110 + }, + { + "epoch": 6.054830819404811, + "grad_norm": 6.853539943695068, + "learning_rate": 2.4346038863976086e-06, + "loss": 2.3342, + "step": 89115 + }, + { + "epoch": 6.055170539475472, + "grad_norm": 6.725762367248535, + "learning_rate": 2.4341792363092814e-06, + "loss": 2.9688, + "step": 89120 + }, + { + "epoch": 6.055510259546134, + "grad_norm": 8.424919128417969, + "learning_rate": 2.433754586220954e-06, + "loss": 2.7672, + "step": 89125 + }, + { + "epoch": 6.055849979616796, + "grad_norm": 7.718776702880859, + "learning_rate": 2.433329936132627e-06, + "loss": 2.5614, + "step": 89130 + }, + { + "epoch": 6.056189699687457, + "grad_norm": 6.451811790466309, + "learning_rate": 2.4329052860442998e-06, + "loss": 2.8859, + "step": 89135 + }, + { + "epoch": 6.056529419758119, + "grad_norm": 5.709166049957275, + "learning_rate": 2.4324806359559726e-06, + "loss": 2.6737, + "step": 89140 + }, + { + "epoch": 6.056869139828781, + "grad_norm": 7.823782444000244, + "learning_rate": 2.432055985867645e-06, + "loss": 2.6373, + "step": 89145 + }, + { + "epoch": 6.0572088598994425, + "grad_norm": 7.2377190589904785, + "learning_rate": 2.431631335779318e-06, + "loss": 2.6531, + "step": 89150 + }, + { + "epoch": 6.0575485799701045, + "grad_norm": 7.23751974105835, + "learning_rate": 2.431206685690991e-06, + "loss": 2.7841, + "step": 89155 + }, + { + "epoch": 6.057888300040767, + "grad_norm": 7.403600215911865, + "learning_rate": 2.4307820356026638e-06, + "loss": 2.6616, + "step": 89160 + }, + { + "epoch": 6.058228020111428, + "grad_norm": 10.439715385437012, + "learning_rate": 2.4303573855143366e-06, + "loss": 2.8513, + "step": 89165 + }, + { + "epoch": 6.05856774018209, + "grad_norm": 8.707594871520996, + "learning_rate": 2.429932735426009e-06, + "loss": 2.7235, + "step": 89170 + }, + { + "epoch": 6.058907460252752, + "grad_norm": 8.435046195983887, + "learning_rate": 2.429508085337682e-06, + "loss": 3.1506, + "step": 89175 + }, + { + "epoch": 6.059247180323413, + "grad_norm": 8.226500511169434, + "learning_rate": 2.4290834352493545e-06, + "loss": 2.7624, + "step": 89180 + }, + { + "epoch": 6.059586900394075, + "grad_norm": 8.090685844421387, + "learning_rate": 2.4286587851610273e-06, + "loss": 2.8257, + "step": 89185 + }, + { + "epoch": 6.059926620464737, + "grad_norm": 6.609647274017334, + "learning_rate": 2.4282341350727006e-06, + "loss": 2.8036, + "step": 89190 + }, + { + "epoch": 6.0602663405353985, + "grad_norm": 6.541097640991211, + "learning_rate": 2.427809484984373e-06, + "loss": 2.6918, + "step": 89195 + }, + { + "epoch": 6.0606060606060606, + "grad_norm": 8.611214637756348, + "learning_rate": 2.4273848348960457e-06, + "loss": 2.7865, + "step": 89200 + }, + { + "epoch": 6.060945780676723, + "grad_norm": 6.438347816467285, + "learning_rate": 2.4269601848077185e-06, + "loss": 2.5596, + "step": 89205 + }, + { + "epoch": 6.061285500747384, + "grad_norm": 9.660845756530762, + "learning_rate": 2.4265355347193913e-06, + "loss": 2.7763, + "step": 89210 + }, + { + "epoch": 6.061625220818046, + "grad_norm": 6.590084075927734, + "learning_rate": 2.426110884631064e-06, + "loss": 2.521, + "step": 89215 + }, + { + "epoch": 6.061964940888708, + "grad_norm": 8.463235855102539, + "learning_rate": 2.425686234542737e-06, + "loss": 2.8166, + "step": 89220 + }, + { + "epoch": 6.062304660959369, + "grad_norm": 9.071331977844238, + "learning_rate": 2.4252615844544098e-06, + "loss": 2.6513, + "step": 89225 + }, + { + "epoch": 6.062644381030031, + "grad_norm": 7.546687126159668, + "learning_rate": 2.4248369343660826e-06, + "loss": 2.897, + "step": 89230 + }, + { + "epoch": 6.062984101100693, + "grad_norm": 6.41969633102417, + "learning_rate": 2.4244122842777554e-06, + "loss": 2.9353, + "step": 89235 + }, + { + "epoch": 6.0633238211713545, + "grad_norm": 8.935516357421875, + "learning_rate": 2.423987634189428e-06, + "loss": 2.6882, + "step": 89240 + }, + { + "epoch": 6.063663541242017, + "grad_norm": 7.10534143447876, + "learning_rate": 2.423562984101101e-06, + "loss": 2.747, + "step": 89245 + }, + { + "epoch": 6.064003261312679, + "grad_norm": 7.411280632019043, + "learning_rate": 2.4231383340127738e-06, + "loss": 2.8223, + "step": 89250 + }, + { + "epoch": 6.06434298138334, + "grad_norm": 6.450438976287842, + "learning_rate": 2.4227136839244466e-06, + "loss": 2.7852, + "step": 89255 + }, + { + "epoch": 6.064682701454002, + "grad_norm": 7.260171413421631, + "learning_rate": 2.4222890338361194e-06, + "loss": 2.433, + "step": 89260 + }, + { + "epoch": 6.065022421524664, + "grad_norm": 6.16189432144165, + "learning_rate": 2.421864383747792e-06, + "loss": 2.7963, + "step": 89265 + }, + { + "epoch": 6.065362141595325, + "grad_norm": 6.467992782592773, + "learning_rate": 2.4214397336594645e-06, + "loss": 2.8988, + "step": 89270 + }, + { + "epoch": 6.065701861665987, + "grad_norm": 9.13394546508789, + "learning_rate": 2.4210150835711378e-06, + "loss": 2.6512, + "step": 89275 + }, + { + "epoch": 6.066041581736649, + "grad_norm": 8.854164123535156, + "learning_rate": 2.4205904334828106e-06, + "loss": 2.716, + "step": 89280 + }, + { + "epoch": 6.0663813018073105, + "grad_norm": 8.070068359375, + "learning_rate": 2.420165783394483e-06, + "loss": 2.8193, + "step": 89285 + }, + { + "epoch": 6.066721021877973, + "grad_norm": 11.565975189208984, + "learning_rate": 2.419741133306156e-06, + "loss": 3.0589, + "step": 89290 + }, + { + "epoch": 6.067060741948635, + "grad_norm": 8.039432525634766, + "learning_rate": 2.4193164832178285e-06, + "loss": 2.9979, + "step": 89295 + }, + { + "epoch": 6.067400462019296, + "grad_norm": 5.944626331329346, + "learning_rate": 2.4188918331295013e-06, + "loss": 2.7331, + "step": 89300 + }, + { + "epoch": 6.067740182089958, + "grad_norm": 8.752118110656738, + "learning_rate": 2.418467183041174e-06, + "loss": 2.5722, + "step": 89305 + }, + { + "epoch": 6.06807990216062, + "grad_norm": 8.14942741394043, + "learning_rate": 2.418042532952847e-06, + "loss": 2.7403, + "step": 89310 + }, + { + "epoch": 6.068419622231281, + "grad_norm": 9.785299301147461, + "learning_rate": 2.4176178828645197e-06, + "loss": 2.7092, + "step": 89315 + }, + { + "epoch": 6.068759342301943, + "grad_norm": 9.4253568649292, + "learning_rate": 2.4171932327761925e-06, + "loss": 2.5146, + "step": 89320 + }, + { + "epoch": 6.069099062372605, + "grad_norm": 6.748258113861084, + "learning_rate": 2.4167685826878653e-06, + "loss": 3.0528, + "step": 89325 + }, + { + "epoch": 6.0694387824432665, + "grad_norm": 7.912113189697266, + "learning_rate": 2.416343932599538e-06, + "loss": 2.7878, + "step": 89330 + }, + { + "epoch": 6.069778502513929, + "grad_norm": 7.916996955871582, + "learning_rate": 2.415919282511211e-06, + "loss": 2.9232, + "step": 89335 + }, + { + "epoch": 6.070118222584591, + "grad_norm": 8.291378021240234, + "learning_rate": 2.4154946324228837e-06, + "loss": 3.099, + "step": 89340 + }, + { + "epoch": 6.070457942655252, + "grad_norm": 6.051464080810547, + "learning_rate": 2.4150699823345565e-06, + "loss": 2.5177, + "step": 89345 + }, + { + "epoch": 6.070797662725914, + "grad_norm": 7.1791863441467285, + "learning_rate": 2.4146453322462293e-06, + "loss": 2.5997, + "step": 89350 + }, + { + "epoch": 6.071137382796576, + "grad_norm": 8.453042984008789, + "learning_rate": 2.414220682157902e-06, + "loss": 2.8123, + "step": 89355 + }, + { + "epoch": 6.071477102867237, + "grad_norm": 8.178339958190918, + "learning_rate": 2.413796032069575e-06, + "loss": 2.6537, + "step": 89360 + }, + { + "epoch": 6.071816822937899, + "grad_norm": 7.472082138061523, + "learning_rate": 2.4133713819812478e-06, + "loss": 2.9637, + "step": 89365 + }, + { + "epoch": 6.072156543008561, + "grad_norm": 7.4861555099487305, + "learning_rate": 2.41294673189292e-06, + "loss": 2.8874, + "step": 89370 + }, + { + "epoch": 6.0724962630792225, + "grad_norm": 8.514103889465332, + "learning_rate": 2.4125220818045934e-06, + "loss": 2.8235, + "step": 89375 + }, + { + "epoch": 6.072835983149885, + "grad_norm": 6.65118408203125, + "learning_rate": 2.412097431716266e-06, + "loss": 2.8181, + "step": 89380 + }, + { + "epoch": 6.073175703220547, + "grad_norm": 7.051214218139648, + "learning_rate": 2.4116727816279385e-06, + "loss": 2.8836, + "step": 89385 + }, + { + "epoch": 6.073515423291208, + "grad_norm": 6.122878551483154, + "learning_rate": 2.4112481315396118e-06, + "loss": 2.8608, + "step": 89390 + }, + { + "epoch": 6.07385514336187, + "grad_norm": 9.61026668548584, + "learning_rate": 2.410823481451284e-06, + "loss": 2.764, + "step": 89395 + }, + { + "epoch": 6.074194863432532, + "grad_norm": 7.246504306793213, + "learning_rate": 2.410398831362957e-06, + "loss": 2.5067, + "step": 89400 + }, + { + "epoch": 6.074534583503193, + "grad_norm": 7.183257102966309, + "learning_rate": 2.40997418127463e-06, + "loss": 2.8403, + "step": 89405 + }, + { + "epoch": 6.074874303573855, + "grad_norm": 8.395000457763672, + "learning_rate": 2.4095495311863025e-06, + "loss": 2.871, + "step": 89410 + }, + { + "epoch": 6.075214023644517, + "grad_norm": 8.71192455291748, + "learning_rate": 2.4091248810979758e-06, + "loss": 2.7557, + "step": 89415 + }, + { + "epoch": 6.0755537437151785, + "grad_norm": 5.830206394195557, + "learning_rate": 2.408700231009648e-06, + "loss": 2.6705, + "step": 89420 + }, + { + "epoch": 6.075893463785841, + "grad_norm": 7.845045566558838, + "learning_rate": 2.408275580921321e-06, + "loss": 2.8283, + "step": 89425 + }, + { + "epoch": 6.076233183856502, + "grad_norm": 6.869813919067383, + "learning_rate": 2.4078509308329937e-06, + "loss": 2.877, + "step": 89430 + }, + { + "epoch": 6.076572903927164, + "grad_norm": 8.6096773147583, + "learning_rate": 2.4074262807446665e-06, + "loss": 2.8316, + "step": 89435 + }, + { + "epoch": 6.076912623997826, + "grad_norm": 8.507885932922363, + "learning_rate": 2.4070016306563393e-06, + "loss": 2.8934, + "step": 89440 + }, + { + "epoch": 6.077252344068487, + "grad_norm": 7.979517936706543, + "learning_rate": 2.406576980568012e-06, + "loss": 2.9135, + "step": 89445 + }, + { + "epoch": 6.077592064139149, + "grad_norm": 7.05569314956665, + "learning_rate": 2.406152330479685e-06, + "loss": 2.6072, + "step": 89450 + }, + { + "epoch": 6.077931784209811, + "grad_norm": 8.285672187805176, + "learning_rate": 2.4057276803913577e-06, + "loss": 2.8797, + "step": 89455 + }, + { + "epoch": 6.0782715042804725, + "grad_norm": 6.632143974304199, + "learning_rate": 2.4053030303030305e-06, + "loss": 2.8343, + "step": 89460 + }, + { + "epoch": 6.0786112243511345, + "grad_norm": 7.167137145996094, + "learning_rate": 2.4048783802147033e-06, + "loss": 2.7253, + "step": 89465 + }, + { + "epoch": 6.078950944421797, + "grad_norm": 9.93211555480957, + "learning_rate": 2.404453730126376e-06, + "loss": 2.7932, + "step": 89470 + }, + { + "epoch": 6.079290664492458, + "grad_norm": 7.060735702514648, + "learning_rate": 2.404029080038049e-06, + "loss": 2.5105, + "step": 89475 + }, + { + "epoch": 6.07963038456312, + "grad_norm": 7.0085649490356445, + "learning_rate": 2.4036044299497217e-06, + "loss": 2.6846, + "step": 89480 + }, + { + "epoch": 6.079970104633782, + "grad_norm": 8.989215850830078, + "learning_rate": 2.403179779861394e-06, + "loss": 2.6354, + "step": 89485 + }, + { + "epoch": 6.080309824704443, + "grad_norm": 7.279220104217529, + "learning_rate": 2.4027551297730673e-06, + "loss": 2.7923, + "step": 89490 + }, + { + "epoch": 6.080649544775105, + "grad_norm": 8.467425346374512, + "learning_rate": 2.4023304796847397e-06, + "loss": 2.7204, + "step": 89495 + }, + { + "epoch": 6.080989264845767, + "grad_norm": 7.750746250152588, + "learning_rate": 2.401905829596413e-06, + "loss": 2.75, + "step": 89500 + }, + { + "epoch": 6.0813289849164285, + "grad_norm": 8.108251571655273, + "learning_rate": 2.4014811795080857e-06, + "loss": 2.7762, + "step": 89505 + }, + { + "epoch": 6.081668704987091, + "grad_norm": 9.65465259552002, + "learning_rate": 2.401056529419758e-06, + "loss": 3.0475, + "step": 89510 + }, + { + "epoch": 6.082008425057753, + "grad_norm": 8.210965156555176, + "learning_rate": 2.4006318793314313e-06, + "loss": 2.81, + "step": 89515 + }, + { + "epoch": 6.082348145128414, + "grad_norm": 5.257288455963135, + "learning_rate": 2.4002072292431037e-06, + "loss": 2.7623, + "step": 89520 + }, + { + "epoch": 6.082687865199076, + "grad_norm": 7.495991230010986, + "learning_rate": 2.3997825791547765e-06, + "loss": 2.7752, + "step": 89525 + }, + { + "epoch": 6.083027585269738, + "grad_norm": 8.28526782989502, + "learning_rate": 2.3993579290664498e-06, + "loss": 2.7096, + "step": 89530 + }, + { + "epoch": 6.083367305340399, + "grad_norm": 8.018433570861816, + "learning_rate": 2.398933278978122e-06, + "loss": 2.7943, + "step": 89535 + }, + { + "epoch": 6.083707025411061, + "grad_norm": 8.921503067016602, + "learning_rate": 2.398508628889795e-06, + "loss": 2.4905, + "step": 89540 + }, + { + "epoch": 6.084046745481723, + "grad_norm": 13.152642250061035, + "learning_rate": 2.3980839788014677e-06, + "loss": 2.7016, + "step": 89545 + }, + { + "epoch": 6.0843864655523845, + "grad_norm": 7.568053722381592, + "learning_rate": 2.3976593287131405e-06, + "loss": 2.7083, + "step": 89550 + }, + { + "epoch": 6.084726185623047, + "grad_norm": 7.158093452453613, + "learning_rate": 2.3972346786248133e-06, + "loss": 2.716, + "step": 89555 + }, + { + "epoch": 6.085065905693709, + "grad_norm": 7.807582855224609, + "learning_rate": 2.396810028536486e-06, + "loss": 2.4526, + "step": 89560 + }, + { + "epoch": 6.08540562576437, + "grad_norm": 10.2885103225708, + "learning_rate": 2.396385378448159e-06, + "loss": 2.5496, + "step": 89565 + }, + { + "epoch": 6.085745345835032, + "grad_norm": 6.777692794799805, + "learning_rate": 2.3959607283598317e-06, + "loss": 2.7577, + "step": 89570 + }, + { + "epoch": 6.086085065905694, + "grad_norm": 7.835500240325928, + "learning_rate": 2.3955360782715045e-06, + "loss": 2.6679, + "step": 89575 + }, + { + "epoch": 6.086424785976355, + "grad_norm": 7.205965995788574, + "learning_rate": 2.3951114281831773e-06, + "loss": 2.8362, + "step": 89580 + }, + { + "epoch": 6.086764506047017, + "grad_norm": 7.302637100219727, + "learning_rate": 2.39468677809485e-06, + "loss": 2.4962, + "step": 89585 + }, + { + "epoch": 6.087104226117679, + "grad_norm": 7.0085368156433105, + "learning_rate": 2.394262128006523e-06, + "loss": 2.6291, + "step": 89590 + }, + { + "epoch": 6.0874439461883405, + "grad_norm": 6.805295944213867, + "learning_rate": 2.3938374779181957e-06, + "loss": 2.8388, + "step": 89595 + }, + { + "epoch": 6.087783666259003, + "grad_norm": 7.080421447753906, + "learning_rate": 2.3934128278298685e-06, + "loss": 2.9717, + "step": 89600 + }, + { + "epoch": 6.088123386329665, + "grad_norm": 8.240720748901367, + "learning_rate": 2.3929881777415413e-06, + "loss": 2.5854, + "step": 89605 + }, + { + "epoch": 6.088463106400326, + "grad_norm": 7.981748104095459, + "learning_rate": 2.3925635276532137e-06, + "loss": 2.8447, + "step": 89610 + }, + { + "epoch": 6.088802826470988, + "grad_norm": 8.500998497009277, + "learning_rate": 2.392138877564887e-06, + "loss": 2.758, + "step": 89615 + }, + { + "epoch": 6.08914254654165, + "grad_norm": 7.7749714851379395, + "learning_rate": 2.3917142274765593e-06, + "loss": 2.8526, + "step": 89620 + }, + { + "epoch": 6.089482266612311, + "grad_norm": 8.257038116455078, + "learning_rate": 2.391289577388232e-06, + "loss": 2.9369, + "step": 89625 + }, + { + "epoch": 6.089821986682973, + "grad_norm": 9.387752532958984, + "learning_rate": 2.3908649272999053e-06, + "loss": 2.7725, + "step": 89630 + }, + { + "epoch": 6.090161706753635, + "grad_norm": 6.376345157623291, + "learning_rate": 2.3904402772115777e-06, + "loss": 2.4311, + "step": 89635 + }, + { + "epoch": 6.0905014268242965, + "grad_norm": 6.7104363441467285, + "learning_rate": 2.3900156271232505e-06, + "loss": 2.7205, + "step": 89640 + }, + { + "epoch": 6.090841146894959, + "grad_norm": 7.307795524597168, + "learning_rate": 2.3895909770349233e-06, + "loss": 2.6624, + "step": 89645 + }, + { + "epoch": 6.091180866965621, + "grad_norm": 8.336435317993164, + "learning_rate": 2.389166326946596e-06, + "loss": 2.5169, + "step": 89650 + }, + { + "epoch": 6.091520587036282, + "grad_norm": 8.297274589538574, + "learning_rate": 2.388741676858269e-06, + "loss": 2.915, + "step": 89655 + }, + { + "epoch": 6.091860307106944, + "grad_norm": 10.298018455505371, + "learning_rate": 2.3883170267699417e-06, + "loss": 2.7693, + "step": 89660 + }, + { + "epoch": 6.092200027177606, + "grad_norm": 7.905885219573975, + "learning_rate": 2.3878923766816145e-06, + "loss": 2.547, + "step": 89665 + }, + { + "epoch": 6.092539747248267, + "grad_norm": 6.656125545501709, + "learning_rate": 2.3874677265932873e-06, + "loss": 2.8236, + "step": 89670 + }, + { + "epoch": 6.092879467318929, + "grad_norm": 8.844425201416016, + "learning_rate": 2.38704307650496e-06, + "loss": 2.6928, + "step": 89675 + }, + { + "epoch": 6.093219187389591, + "grad_norm": 7.55325984954834, + "learning_rate": 2.386618426416633e-06, + "loss": 2.6235, + "step": 89680 + }, + { + "epoch": 6.0935589074602525, + "grad_norm": 8.688292503356934, + "learning_rate": 2.3861937763283057e-06, + "loss": 2.6571, + "step": 89685 + }, + { + "epoch": 6.093898627530915, + "grad_norm": 6.611508369445801, + "learning_rate": 2.3857691262399785e-06, + "loss": 2.6457, + "step": 89690 + }, + { + "epoch": 6.094238347601577, + "grad_norm": 8.940531730651855, + "learning_rate": 2.3853444761516513e-06, + "loss": 2.9826, + "step": 89695 + }, + { + "epoch": 6.094578067672238, + "grad_norm": 7.153899669647217, + "learning_rate": 2.384919826063324e-06, + "loss": 2.7377, + "step": 89700 + }, + { + "epoch": 6.0949177877429, + "grad_norm": 9.02791976928711, + "learning_rate": 2.384495175974997e-06, + "loss": 2.8283, + "step": 89705 + }, + { + "epoch": 6.095257507813562, + "grad_norm": 8.007794380187988, + "learning_rate": 2.3840705258866693e-06, + "loss": 2.8459, + "step": 89710 + }, + { + "epoch": 6.095597227884223, + "grad_norm": 7.558252811431885, + "learning_rate": 2.3836458757983425e-06, + "loss": 2.7008, + "step": 89715 + }, + { + "epoch": 6.095936947954885, + "grad_norm": 8.776949882507324, + "learning_rate": 2.3832212257100153e-06, + "loss": 3.0474, + "step": 89720 + }, + { + "epoch": 6.096276668025547, + "grad_norm": 10.450349807739258, + "learning_rate": 2.3827965756216877e-06, + "loss": 2.8751, + "step": 89725 + }, + { + "epoch": 6.0966163880962085, + "grad_norm": 6.48425817489624, + "learning_rate": 2.382371925533361e-06, + "loss": 2.4771, + "step": 89730 + }, + { + "epoch": 6.096956108166871, + "grad_norm": 5.846245765686035, + "learning_rate": 2.3819472754450333e-06, + "loss": 2.5762, + "step": 89735 + }, + { + "epoch": 6.097295828237533, + "grad_norm": 9.126445770263672, + "learning_rate": 2.381522625356706e-06, + "loss": 2.6869, + "step": 89740 + }, + { + "epoch": 6.097635548308194, + "grad_norm": 9.26333236694336, + "learning_rate": 2.381097975268379e-06, + "loss": 2.8788, + "step": 89745 + }, + { + "epoch": 6.097975268378856, + "grad_norm": 7.952342987060547, + "learning_rate": 2.3806733251800517e-06, + "loss": 2.6735, + "step": 89750 + }, + { + "epoch": 6.098314988449518, + "grad_norm": 9.894837379455566, + "learning_rate": 2.380248675091725e-06, + "loss": 2.5607, + "step": 89755 + }, + { + "epoch": 6.098654708520179, + "grad_norm": 6.496739864349365, + "learning_rate": 2.3798240250033973e-06, + "loss": 2.8304, + "step": 89760 + }, + { + "epoch": 6.098994428590841, + "grad_norm": 7.265023231506348, + "learning_rate": 2.37939937491507e-06, + "loss": 2.7488, + "step": 89765 + }, + { + "epoch": 6.099334148661503, + "grad_norm": 7.51047945022583, + "learning_rate": 2.378974724826743e-06, + "loss": 2.6927, + "step": 89770 + }, + { + "epoch": 6.0996738687321646, + "grad_norm": 9.322392463684082, + "learning_rate": 2.3785500747384157e-06, + "loss": 2.8405, + "step": 89775 + }, + { + "epoch": 6.100013588802827, + "grad_norm": 8.283933639526367, + "learning_rate": 2.3781254246500885e-06, + "loss": 2.8031, + "step": 89780 + }, + { + "epoch": 6.100353308873488, + "grad_norm": 8.348801612854004, + "learning_rate": 2.3777007745617613e-06, + "loss": 2.7051, + "step": 89785 + }, + { + "epoch": 6.10069302894415, + "grad_norm": 6.340610027313232, + "learning_rate": 2.377276124473434e-06, + "loss": 2.6606, + "step": 89790 + }, + { + "epoch": 6.101032749014812, + "grad_norm": 5.503652095794678, + "learning_rate": 2.376851474385107e-06, + "loss": 2.6913, + "step": 89795 + }, + { + "epoch": 6.101372469085473, + "grad_norm": 9.813587188720703, + "learning_rate": 2.3764268242967797e-06, + "loss": 2.6812, + "step": 89800 + }, + { + "epoch": 6.101712189156135, + "grad_norm": 8.191551208496094, + "learning_rate": 2.3760021742084525e-06, + "loss": 2.817, + "step": 89805 + }, + { + "epoch": 6.102051909226797, + "grad_norm": 7.72884464263916, + "learning_rate": 2.375577524120125e-06, + "loss": 2.4893, + "step": 89810 + }, + { + "epoch": 6.1023916292974585, + "grad_norm": 7.568680763244629, + "learning_rate": 2.375152874031798e-06, + "loss": 2.7208, + "step": 89815 + }, + { + "epoch": 6.102731349368121, + "grad_norm": 6.394395351409912, + "learning_rate": 2.374728223943471e-06, + "loss": 2.7163, + "step": 89820 + }, + { + "epoch": 6.103071069438783, + "grad_norm": 8.385030746459961, + "learning_rate": 2.3743035738551433e-06, + "loss": 2.9722, + "step": 89825 + }, + { + "epoch": 6.103410789509444, + "grad_norm": 6.831686019897461, + "learning_rate": 2.3738789237668165e-06, + "loss": 2.6888, + "step": 89830 + }, + { + "epoch": 6.103750509580106, + "grad_norm": 7.662550926208496, + "learning_rate": 2.373454273678489e-06, + "loss": 2.8802, + "step": 89835 + }, + { + "epoch": 6.104090229650768, + "grad_norm": 7.535350322723389, + "learning_rate": 2.373029623590162e-06, + "loss": 2.6162, + "step": 89840 + }, + { + "epoch": 6.104429949721429, + "grad_norm": 5.809226989746094, + "learning_rate": 2.3726049735018345e-06, + "loss": 2.6776, + "step": 89845 + }, + { + "epoch": 6.104769669792091, + "grad_norm": 6.852434158325195, + "learning_rate": 2.3721803234135073e-06, + "loss": 2.6388, + "step": 89850 + }, + { + "epoch": 6.105109389862753, + "grad_norm": 6.1476850509643555, + "learning_rate": 2.3717556733251805e-06, + "loss": 2.6268, + "step": 89855 + }, + { + "epoch": 6.1054491099334145, + "grad_norm": 9.757298469543457, + "learning_rate": 2.371331023236853e-06, + "loss": 2.7718, + "step": 89860 + }, + { + "epoch": 6.105788830004077, + "grad_norm": 9.944807052612305, + "learning_rate": 2.3709063731485257e-06, + "loss": 2.8509, + "step": 89865 + }, + { + "epoch": 6.106128550074739, + "grad_norm": 10.224894523620605, + "learning_rate": 2.3704817230601985e-06, + "loss": 2.8835, + "step": 89870 + }, + { + "epoch": 6.1064682701454, + "grad_norm": 7.456078052520752, + "learning_rate": 2.3700570729718713e-06, + "loss": 2.8796, + "step": 89875 + }, + { + "epoch": 6.106807990216062, + "grad_norm": 8.600767135620117, + "learning_rate": 2.369632422883544e-06, + "loss": 2.7952, + "step": 89880 + }, + { + "epoch": 6.107147710286724, + "grad_norm": 8.641974449157715, + "learning_rate": 2.369207772795217e-06, + "loss": 2.9261, + "step": 89885 + }, + { + "epoch": 6.107487430357385, + "grad_norm": 5.791991710662842, + "learning_rate": 2.3687831227068897e-06, + "loss": 2.6849, + "step": 89890 + }, + { + "epoch": 6.107827150428047, + "grad_norm": 7.857969760894775, + "learning_rate": 2.3683584726185625e-06, + "loss": 2.7418, + "step": 89895 + }, + { + "epoch": 6.108166870498709, + "grad_norm": 7.727721691131592, + "learning_rate": 2.3679338225302353e-06, + "loss": 2.5984, + "step": 89900 + }, + { + "epoch": 6.1085065905693705, + "grad_norm": 9.073456764221191, + "learning_rate": 2.367509172441908e-06, + "loss": 2.7647, + "step": 89905 + }, + { + "epoch": 6.108846310640033, + "grad_norm": 8.293290138244629, + "learning_rate": 2.367084522353581e-06, + "loss": 2.5369, + "step": 89910 + }, + { + "epoch": 6.109186030710695, + "grad_norm": 8.93151569366455, + "learning_rate": 2.3666598722652537e-06, + "loss": 2.8721, + "step": 89915 + }, + { + "epoch": 6.109525750781356, + "grad_norm": 10.501977920532227, + "learning_rate": 2.3662352221769265e-06, + "loss": 2.7931, + "step": 89920 + }, + { + "epoch": 6.109865470852018, + "grad_norm": 7.9431281089782715, + "learning_rate": 2.3658105720885993e-06, + "loss": 2.5823, + "step": 89925 + }, + { + "epoch": 6.11020519092268, + "grad_norm": 6.267332077026367, + "learning_rate": 2.365385922000272e-06, + "loss": 2.7176, + "step": 89930 + }, + { + "epoch": 6.110544910993341, + "grad_norm": 6.931185245513916, + "learning_rate": 2.3649612719119445e-06, + "loss": 2.6399, + "step": 89935 + }, + { + "epoch": 6.110884631064003, + "grad_norm": 8.681975364685059, + "learning_rate": 2.3645366218236177e-06, + "loss": 3.0494, + "step": 89940 + }, + { + "epoch": 6.111224351134665, + "grad_norm": 7.77213191986084, + "learning_rate": 2.3641119717352905e-06, + "loss": 2.8041, + "step": 89945 + }, + { + "epoch": 6.1115640712053265, + "grad_norm": 7.61405611038208, + "learning_rate": 2.363687321646963e-06, + "loss": 2.8254, + "step": 89950 + }, + { + "epoch": 6.111903791275989, + "grad_norm": 7.6518096923828125, + "learning_rate": 2.363262671558636e-06, + "loss": 2.6653, + "step": 89955 + }, + { + "epoch": 6.112243511346651, + "grad_norm": 9.686224937438965, + "learning_rate": 2.362922951487974e-06, + "loss": 2.695, + "step": 89960 + }, + { + "epoch": 6.112583231417312, + "grad_norm": 9.44251823425293, + "learning_rate": 2.362498301399647e-06, + "loss": 2.8309, + "step": 89965 + }, + { + "epoch": 6.112922951487974, + "grad_norm": 7.108981609344482, + "learning_rate": 2.36207365131132e-06, + "loss": 2.976, + "step": 89970 + }, + { + "epoch": 6.113262671558636, + "grad_norm": 8.652037620544434, + "learning_rate": 2.3616490012229926e-06, + "loss": 2.9696, + "step": 89975 + }, + { + "epoch": 6.113602391629297, + "grad_norm": 6.751550674438477, + "learning_rate": 2.3612243511346654e-06, + "loss": 2.7657, + "step": 89980 + }, + { + "epoch": 6.113942111699959, + "grad_norm": 9.131843566894531, + "learning_rate": 2.3607997010463378e-06, + "loss": 2.7023, + "step": 89985 + }, + { + "epoch": 6.114281831770621, + "grad_norm": 7.39678430557251, + "learning_rate": 2.360375050958011e-06, + "loss": 2.5533, + "step": 89990 + }, + { + "epoch": 6.1146215518412825, + "grad_norm": 9.572065353393555, + "learning_rate": 2.3599504008696834e-06, + "loss": 2.7986, + "step": 89995 + }, + { + "epoch": 6.114961271911945, + "grad_norm": 9.019510269165039, + "learning_rate": 2.359525750781356e-06, + "loss": 2.9067, + "step": 90000 + }, + { + "epoch": 6.115300991982607, + "grad_norm": 5.701376438140869, + "learning_rate": 2.3591011006930294e-06, + "loss": 2.8376, + "step": 90005 + }, + { + "epoch": 6.115640712053268, + "grad_norm": 7.148617267608643, + "learning_rate": 2.3586764506047018e-06, + "loss": 2.6455, + "step": 90010 + }, + { + "epoch": 6.11598043212393, + "grad_norm": 7.678857326507568, + "learning_rate": 2.3582518005163746e-06, + "loss": 3.0016, + "step": 90015 + }, + { + "epoch": 6.116320152194592, + "grad_norm": 8.594390869140625, + "learning_rate": 2.3578271504280474e-06, + "loss": 2.6939, + "step": 90020 + }, + { + "epoch": 6.116659872265253, + "grad_norm": 5.627438068389893, + "learning_rate": 2.35740250033972e-06, + "loss": 2.767, + "step": 90025 + }, + { + "epoch": 6.116999592335915, + "grad_norm": 7.388391494750977, + "learning_rate": 2.356977850251393e-06, + "loss": 2.6397, + "step": 90030 + }, + { + "epoch": 6.117339312406577, + "grad_norm": 9.260028839111328, + "learning_rate": 2.3565532001630658e-06, + "loss": 2.8019, + "step": 90035 + }, + { + "epoch": 6.1176790324772385, + "grad_norm": 7.057618618011475, + "learning_rate": 2.3561285500747386e-06, + "loss": 2.8647, + "step": 90040 + }, + { + "epoch": 6.118018752547901, + "grad_norm": 6.881780624389648, + "learning_rate": 2.3557038999864114e-06, + "loss": 2.8092, + "step": 90045 + }, + { + "epoch": 6.118358472618563, + "grad_norm": 7.391209602355957, + "learning_rate": 2.355279249898084e-06, + "loss": 3.0776, + "step": 90050 + }, + { + "epoch": 6.118698192689224, + "grad_norm": 7.437679290771484, + "learning_rate": 2.354854599809757e-06, + "loss": 2.8581, + "step": 90055 + }, + { + "epoch": 6.119037912759886, + "grad_norm": 8.787217140197754, + "learning_rate": 2.3544299497214298e-06, + "loss": 2.8863, + "step": 90060 + }, + { + "epoch": 6.119377632830548, + "grad_norm": 7.971737384796143, + "learning_rate": 2.3540052996331026e-06, + "loss": 2.8389, + "step": 90065 + }, + { + "epoch": 6.119717352901209, + "grad_norm": 7.1593194007873535, + "learning_rate": 2.3535806495447754e-06, + "loss": 2.5843, + "step": 90070 + }, + { + "epoch": 6.120057072971871, + "grad_norm": 7.689611911773682, + "learning_rate": 2.353155999456448e-06, + "loss": 2.7709, + "step": 90075 + }, + { + "epoch": 6.120396793042533, + "grad_norm": 8.061816215515137, + "learning_rate": 2.352731349368121e-06, + "loss": 2.8638, + "step": 90080 + }, + { + "epoch": 6.1207365131131946, + "grad_norm": 7.034337520599365, + "learning_rate": 2.3523066992797934e-06, + "loss": 2.4408, + "step": 90085 + }, + { + "epoch": 6.121076233183857, + "grad_norm": 6.426536560058594, + "learning_rate": 2.3518820491914666e-06, + "loss": 2.5907, + "step": 90090 + }, + { + "epoch": 6.121415953254519, + "grad_norm": 7.307139873504639, + "learning_rate": 2.3514573991031394e-06, + "loss": 2.876, + "step": 90095 + }, + { + "epoch": 6.12175567332518, + "grad_norm": 7.18395471572876, + "learning_rate": 2.3510327490148118e-06, + "loss": 2.7788, + "step": 90100 + }, + { + "epoch": 6.122095393395842, + "grad_norm": 8.486047744750977, + "learning_rate": 2.350608098926485e-06, + "loss": 2.7273, + "step": 90105 + }, + { + "epoch": 6.122435113466503, + "grad_norm": 7.485057353973389, + "learning_rate": 2.3501834488381574e-06, + "loss": 2.5566, + "step": 90110 + }, + { + "epoch": 6.122774833537165, + "grad_norm": 9.423746109008789, + "learning_rate": 2.34975879874983e-06, + "loss": 2.7264, + "step": 90115 + }, + { + "epoch": 6.123114553607827, + "grad_norm": 9.68844223022461, + "learning_rate": 2.349334148661503e-06, + "loss": 2.8135, + "step": 90120 + }, + { + "epoch": 6.1234542736784885, + "grad_norm": 7.182912826538086, + "learning_rate": 2.3489094985731758e-06, + "loss": 2.5138, + "step": 90125 + }, + { + "epoch": 6.123793993749151, + "grad_norm": 7.608016014099121, + "learning_rate": 2.348484848484849e-06, + "loss": 2.6912, + "step": 90130 + }, + { + "epoch": 6.124133713819813, + "grad_norm": 6.973527908325195, + "learning_rate": 2.3480601983965214e-06, + "loss": 2.5454, + "step": 90135 + }, + { + "epoch": 6.124473433890474, + "grad_norm": 8.593442916870117, + "learning_rate": 2.347635548308194e-06, + "loss": 2.9097, + "step": 90140 + }, + { + "epoch": 6.124813153961136, + "grad_norm": 7.104532718658447, + "learning_rate": 2.347210898219867e-06, + "loss": 2.7295, + "step": 90145 + }, + { + "epoch": 6.125152874031798, + "grad_norm": 8.39213752746582, + "learning_rate": 2.3467862481315398e-06, + "loss": 2.7152, + "step": 90150 + }, + { + "epoch": 6.125492594102459, + "grad_norm": 9.421333312988281, + "learning_rate": 2.3463615980432126e-06, + "loss": 2.9107, + "step": 90155 + }, + { + "epoch": 6.125832314173121, + "grad_norm": 8.370349884033203, + "learning_rate": 2.3459369479548854e-06, + "loss": 2.6878, + "step": 90160 + }, + { + "epoch": 6.126172034243783, + "grad_norm": 8.698807716369629, + "learning_rate": 2.345512297866558e-06, + "loss": 2.8702, + "step": 90165 + }, + { + "epoch": 6.1265117543144445, + "grad_norm": 9.419167518615723, + "learning_rate": 2.345087647778231e-06, + "loss": 2.6992, + "step": 90170 + }, + { + "epoch": 6.126851474385107, + "grad_norm": 7.732470989227295, + "learning_rate": 2.3446629976899038e-06, + "loss": 2.8602, + "step": 90175 + }, + { + "epoch": 6.127191194455769, + "grad_norm": 7.5719990730285645, + "learning_rate": 2.3442383476015766e-06, + "loss": 2.7409, + "step": 90180 + }, + { + "epoch": 6.12753091452643, + "grad_norm": 7.518253803253174, + "learning_rate": 2.343813697513249e-06, + "loss": 2.7792, + "step": 90185 + }, + { + "epoch": 6.127870634597092, + "grad_norm": 6.692862033843994, + "learning_rate": 2.343389047424922e-06, + "loss": 2.9851, + "step": 90190 + }, + { + "epoch": 6.128210354667754, + "grad_norm": 8.412348747253418, + "learning_rate": 2.342964397336595e-06, + "loss": 2.9628, + "step": 90195 + }, + { + "epoch": 6.128550074738415, + "grad_norm": 6.62495231628418, + "learning_rate": 2.3425397472482674e-06, + "loss": 3.0195, + "step": 90200 + }, + { + "epoch": 6.128889794809077, + "grad_norm": 7.502620697021484, + "learning_rate": 2.3421150971599406e-06, + "loss": 2.9885, + "step": 90205 + }, + { + "epoch": 6.129229514879739, + "grad_norm": 8.51587200164795, + "learning_rate": 2.341690447071613e-06, + "loss": 2.7386, + "step": 90210 + }, + { + "epoch": 6.1295692349504005, + "grad_norm": 7.075504779815674, + "learning_rate": 2.341265796983286e-06, + "loss": 2.7344, + "step": 90215 + }, + { + "epoch": 6.129908955021063, + "grad_norm": 5.846652030944824, + "learning_rate": 2.340841146894959e-06, + "loss": 2.4736, + "step": 90220 + }, + { + "epoch": 6.130248675091725, + "grad_norm": 6.067281723022461, + "learning_rate": 2.3404164968066314e-06, + "loss": 2.596, + "step": 90225 + }, + { + "epoch": 6.130588395162386, + "grad_norm": 7.327848434448242, + "learning_rate": 2.3399918467183046e-06, + "loss": 2.426, + "step": 90230 + }, + { + "epoch": 6.130928115233048, + "grad_norm": 7.434764385223389, + "learning_rate": 2.339567196629977e-06, + "loss": 2.7933, + "step": 90235 + }, + { + "epoch": 6.13126783530371, + "grad_norm": 7.926757335662842, + "learning_rate": 2.3391425465416498e-06, + "loss": 2.9899, + "step": 90240 + }, + { + "epoch": 6.131607555374371, + "grad_norm": 7.583887100219727, + "learning_rate": 2.3387178964533226e-06, + "loss": 2.8315, + "step": 90245 + }, + { + "epoch": 6.131947275445033, + "grad_norm": 7.3122735023498535, + "learning_rate": 2.3382932463649954e-06, + "loss": 2.5083, + "step": 90250 + }, + { + "epoch": 6.132286995515695, + "grad_norm": 8.592399597167969, + "learning_rate": 2.337868596276668e-06, + "loss": 2.8506, + "step": 90255 + }, + { + "epoch": 6.1326267155863565, + "grad_norm": 8.265646934509277, + "learning_rate": 2.337443946188341e-06, + "loss": 2.889, + "step": 90260 + }, + { + "epoch": 6.132966435657019, + "grad_norm": 7.599621295928955, + "learning_rate": 2.3370192961000138e-06, + "loss": 2.7044, + "step": 90265 + }, + { + "epoch": 6.133306155727681, + "grad_norm": 7.077208042144775, + "learning_rate": 2.3365946460116866e-06, + "loss": 2.4705, + "step": 90270 + }, + { + "epoch": 6.133645875798342, + "grad_norm": 9.596799850463867, + "learning_rate": 2.3361699959233594e-06, + "loss": 2.9191, + "step": 90275 + }, + { + "epoch": 6.133985595869004, + "grad_norm": 7.3589348793029785, + "learning_rate": 2.335745345835032e-06, + "loss": 2.8371, + "step": 90280 + }, + { + "epoch": 6.134325315939666, + "grad_norm": 10.34317684173584, + "learning_rate": 2.335320695746705e-06, + "loss": 2.9109, + "step": 90285 + }, + { + "epoch": 6.134665036010327, + "grad_norm": 8.257349967956543, + "learning_rate": 2.3348960456583778e-06, + "loss": 2.6992, + "step": 90290 + }, + { + "epoch": 6.135004756080989, + "grad_norm": 7.751276016235352, + "learning_rate": 2.3344713955700506e-06, + "loss": 2.8854, + "step": 90295 + }, + { + "epoch": 6.135344476151651, + "grad_norm": 7.093174457550049, + "learning_rate": 2.3340467454817234e-06, + "loss": 2.7545, + "step": 90300 + }, + { + "epoch": 6.1356841962223125, + "grad_norm": 6.204989910125732, + "learning_rate": 2.333622095393396e-06, + "loss": 2.7263, + "step": 90305 + }, + { + "epoch": 6.136023916292975, + "grad_norm": 10.288751602172852, + "learning_rate": 2.3331974453050686e-06, + "loss": 2.8073, + "step": 90310 + }, + { + "epoch": 6.136363636363637, + "grad_norm": 9.44891357421875, + "learning_rate": 2.3327727952167418e-06, + "loss": 2.6766, + "step": 90315 + }, + { + "epoch": 6.136703356434298, + "grad_norm": 7.3029022216796875, + "learning_rate": 2.3323481451284146e-06, + "loss": 2.7446, + "step": 90320 + }, + { + "epoch": 6.13704307650496, + "grad_norm": 8.237735748291016, + "learning_rate": 2.331923495040087e-06, + "loss": 2.5727, + "step": 90325 + }, + { + "epoch": 6.137382796575622, + "grad_norm": 7.61347770690918, + "learning_rate": 2.33149884495176e-06, + "loss": 2.7553, + "step": 90330 + }, + { + "epoch": 6.137722516646283, + "grad_norm": 6.094682693481445, + "learning_rate": 2.3310741948634326e-06, + "loss": 2.9676, + "step": 90335 + }, + { + "epoch": 6.138062236716945, + "grad_norm": 6.495875835418701, + "learning_rate": 2.330734474792771e-06, + "loss": 2.7689, + "step": 90340 + }, + { + "epoch": 6.138401956787607, + "grad_norm": 8.717094421386719, + "learning_rate": 2.330309824704444e-06, + "loss": 2.992, + "step": 90345 + }, + { + "epoch": 6.1387416768582685, + "grad_norm": 6.0008225440979, + "learning_rate": 2.3298851746161167e-06, + "loss": 2.7915, + "step": 90350 + }, + { + "epoch": 6.139081396928931, + "grad_norm": 8.62615966796875, + "learning_rate": 2.3294605245277895e-06, + "loss": 2.7499, + "step": 90355 + }, + { + "epoch": 6.139421116999593, + "grad_norm": 8.737244606018066, + "learning_rate": 2.329035874439462e-06, + "loss": 2.8334, + "step": 90360 + }, + { + "epoch": 6.139760837070254, + "grad_norm": 10.615655899047852, + "learning_rate": 2.328611224351135e-06, + "loss": 2.7711, + "step": 90365 + }, + { + "epoch": 6.140100557140916, + "grad_norm": 7.669405460357666, + "learning_rate": 2.3281865742628074e-06, + "loss": 2.6983, + "step": 90370 + }, + { + "epoch": 6.140440277211578, + "grad_norm": 9.361895561218262, + "learning_rate": 2.3277619241744802e-06, + "loss": 2.6856, + "step": 90375 + }, + { + "epoch": 6.140779997282239, + "grad_norm": 8.194318771362305, + "learning_rate": 2.3273372740861535e-06, + "loss": 2.9019, + "step": 90380 + }, + { + "epoch": 6.141119717352901, + "grad_norm": 8.782922744750977, + "learning_rate": 2.326912623997826e-06, + "loss": 2.8474, + "step": 90385 + }, + { + "epoch": 6.141459437423563, + "grad_norm": 8.850916862487793, + "learning_rate": 2.3264879739094986e-06, + "loss": 2.7347, + "step": 90390 + }, + { + "epoch": 6.141799157494225, + "grad_norm": 9.9853515625, + "learning_rate": 2.3260633238211714e-06, + "loss": 2.6259, + "step": 90395 + }, + { + "epoch": 6.142138877564887, + "grad_norm": 9.130897521972656, + "learning_rate": 2.3256386737328442e-06, + "loss": 2.7702, + "step": 90400 + }, + { + "epoch": 6.142478597635549, + "grad_norm": 7.1968302726745605, + "learning_rate": 2.325214023644517e-06, + "loss": 2.863, + "step": 90405 + }, + { + "epoch": 6.14281831770621, + "grad_norm": 7.354928493499756, + "learning_rate": 2.32478937355619e-06, + "loss": 2.6293, + "step": 90410 + }, + { + "epoch": 6.143158037776872, + "grad_norm": 10.252912521362305, + "learning_rate": 2.3243647234678626e-06, + "loss": 2.9611, + "step": 90415 + }, + { + "epoch": 6.143497757847534, + "grad_norm": 6.691549777984619, + "learning_rate": 2.3239400733795354e-06, + "loss": 2.791, + "step": 90420 + }, + { + "epoch": 6.143837477918195, + "grad_norm": 7.723906517028809, + "learning_rate": 2.3235154232912082e-06, + "loss": 2.7089, + "step": 90425 + }, + { + "epoch": 6.144177197988857, + "grad_norm": 6.966592788696289, + "learning_rate": 2.323090773202881e-06, + "loss": 2.7608, + "step": 90430 + }, + { + "epoch": 6.144516918059519, + "grad_norm": 6.894263744354248, + "learning_rate": 2.322666123114554e-06, + "loss": 2.7402, + "step": 90435 + }, + { + "epoch": 6.144856638130181, + "grad_norm": 7.764959335327148, + "learning_rate": 2.3222414730262266e-06, + "loss": 2.926, + "step": 90440 + }, + { + "epoch": 6.145196358200843, + "grad_norm": 7.309459686279297, + "learning_rate": 2.3218168229378994e-06, + "loss": 2.7683, + "step": 90445 + }, + { + "epoch": 6.145536078271505, + "grad_norm": 9.07062816619873, + "learning_rate": 2.3213921728495722e-06, + "loss": 2.7533, + "step": 90450 + }, + { + "epoch": 6.145875798342166, + "grad_norm": 7.214293479919434, + "learning_rate": 2.320967522761245e-06, + "loss": 2.9015, + "step": 90455 + }, + { + "epoch": 6.146215518412828, + "grad_norm": 8.67996883392334, + "learning_rate": 2.3205428726729174e-06, + "loss": 2.8818, + "step": 90460 + }, + { + "epoch": 6.14655523848349, + "grad_norm": 5.116530418395996, + "learning_rate": 2.3201182225845906e-06, + "loss": 2.8549, + "step": 90465 + }, + { + "epoch": 6.146894958554151, + "grad_norm": 6.681309223175049, + "learning_rate": 2.3196935724962635e-06, + "loss": 2.7817, + "step": 90470 + }, + { + "epoch": 6.147234678624813, + "grad_norm": 7.611561298370361, + "learning_rate": 2.319268922407936e-06, + "loss": 2.6237, + "step": 90475 + }, + { + "epoch": 6.1475743986954745, + "grad_norm": 8.143086433410645, + "learning_rate": 2.318844272319609e-06, + "loss": 2.6343, + "step": 90480 + }, + { + "epoch": 6.147914118766137, + "grad_norm": 9.609049797058105, + "learning_rate": 2.3184196222312814e-06, + "loss": 2.9608, + "step": 90485 + }, + { + "epoch": 6.148253838836799, + "grad_norm": 10.806676864624023, + "learning_rate": 2.3179949721429542e-06, + "loss": 2.7049, + "step": 90490 + }, + { + "epoch": 6.14859355890746, + "grad_norm": 7.096436023712158, + "learning_rate": 2.317570322054627e-06, + "loss": 2.9129, + "step": 90495 + }, + { + "epoch": 6.148933278978122, + "grad_norm": 7.610986709594727, + "learning_rate": 2.3171456719663e-06, + "loss": 2.7021, + "step": 90500 + }, + { + "epoch": 6.149272999048784, + "grad_norm": 9.065730094909668, + "learning_rate": 2.316721021877973e-06, + "loss": 2.7283, + "step": 90505 + }, + { + "epoch": 6.149612719119445, + "grad_norm": 7.498267650604248, + "learning_rate": 2.3162963717896454e-06, + "loss": 2.6954, + "step": 90510 + }, + { + "epoch": 6.149952439190107, + "grad_norm": 9.42788028717041, + "learning_rate": 2.3158717217013182e-06, + "loss": 2.8449, + "step": 90515 + }, + { + "epoch": 6.150292159260769, + "grad_norm": 8.868522644042969, + "learning_rate": 2.315447071612991e-06, + "loss": 2.6548, + "step": 90520 + }, + { + "epoch": 6.1506318793314305, + "grad_norm": 7.727507591247559, + "learning_rate": 2.315022421524664e-06, + "loss": 2.7944, + "step": 90525 + }, + { + "epoch": 6.150971599402093, + "grad_norm": 8.653313636779785, + "learning_rate": 2.3145977714363366e-06, + "loss": 2.8367, + "step": 90530 + }, + { + "epoch": 6.151311319472755, + "grad_norm": 8.322360038757324, + "learning_rate": 2.3141731213480094e-06, + "loss": 2.8847, + "step": 90535 + }, + { + "epoch": 6.151651039543416, + "grad_norm": 9.085646629333496, + "learning_rate": 2.3137484712596822e-06, + "loss": 2.8622, + "step": 90540 + }, + { + "epoch": 6.151990759614078, + "grad_norm": 8.797323226928711, + "learning_rate": 2.313323821171355e-06, + "loss": 2.8161, + "step": 90545 + }, + { + "epoch": 6.15233047968474, + "grad_norm": 8.887505531311035, + "learning_rate": 2.312899171083028e-06, + "loss": 2.8355, + "step": 90550 + }, + { + "epoch": 6.152670199755401, + "grad_norm": 8.276894569396973, + "learning_rate": 2.3124745209947006e-06, + "loss": 2.9523, + "step": 90555 + }, + { + "epoch": 6.153009919826063, + "grad_norm": 7.083070755004883, + "learning_rate": 2.312049870906373e-06, + "loss": 2.7509, + "step": 90560 + }, + { + "epoch": 6.153349639896725, + "grad_norm": 7.049843788146973, + "learning_rate": 2.3116252208180462e-06, + "loss": 2.8636, + "step": 90565 + }, + { + "epoch": 6.1536893599673865, + "grad_norm": 9.86999797821045, + "learning_rate": 2.311200570729719e-06, + "loss": 2.794, + "step": 90570 + }, + { + "epoch": 6.154029080038049, + "grad_norm": 8.120774269104004, + "learning_rate": 2.3107759206413914e-06, + "loss": 2.6537, + "step": 90575 + }, + { + "epoch": 6.154368800108711, + "grad_norm": 7.815553665161133, + "learning_rate": 2.3103512705530646e-06, + "loss": 2.7567, + "step": 90580 + }, + { + "epoch": 6.154708520179372, + "grad_norm": 8.112126350402832, + "learning_rate": 2.309926620464737e-06, + "loss": 2.5489, + "step": 90585 + }, + { + "epoch": 6.155048240250034, + "grad_norm": 8.796355247497559, + "learning_rate": 2.3095019703764102e-06, + "loss": 2.8203, + "step": 90590 + }, + { + "epoch": 6.155387960320696, + "grad_norm": 6.798379898071289, + "learning_rate": 2.309077320288083e-06, + "loss": 2.6215, + "step": 90595 + }, + { + "epoch": 6.155727680391357, + "grad_norm": 7.171195030212402, + "learning_rate": 2.3086526701997554e-06, + "loss": 2.855, + "step": 90600 + }, + { + "epoch": 6.156067400462019, + "grad_norm": 7.599555969238281, + "learning_rate": 2.3082280201114286e-06, + "loss": 2.8308, + "step": 90605 + }, + { + "epoch": 6.156407120532681, + "grad_norm": 8.797735214233398, + "learning_rate": 2.307803370023101e-06, + "loss": 2.9595, + "step": 90610 + }, + { + "epoch": 6.1567468406033425, + "grad_norm": 6.0858540534973145, + "learning_rate": 2.307378719934774e-06, + "loss": 2.8759, + "step": 90615 + }, + { + "epoch": 6.157086560674005, + "grad_norm": 7.232677936553955, + "learning_rate": 2.3069540698464466e-06, + "loss": 2.8998, + "step": 90620 + }, + { + "epoch": 6.157426280744667, + "grad_norm": 7.399884223937988, + "learning_rate": 2.3065294197581194e-06, + "loss": 2.7742, + "step": 90625 + }, + { + "epoch": 6.157766000815328, + "grad_norm": 7.922135829925537, + "learning_rate": 2.3061047696697922e-06, + "loss": 2.718, + "step": 90630 + }, + { + "epoch": 6.15810572088599, + "grad_norm": 8.708133697509766, + "learning_rate": 2.305680119581465e-06, + "loss": 2.6871, + "step": 90635 + }, + { + "epoch": 6.158445440956652, + "grad_norm": 7.1297688484191895, + "learning_rate": 2.305255469493138e-06, + "loss": 2.755, + "step": 90640 + }, + { + "epoch": 6.158785161027313, + "grad_norm": 7.149209499359131, + "learning_rate": 2.3048308194048106e-06, + "loss": 2.6136, + "step": 90645 + }, + { + "epoch": 6.159124881097975, + "grad_norm": 7.945423126220703, + "learning_rate": 2.3044061693164834e-06, + "loss": 2.8282, + "step": 90650 + }, + { + "epoch": 6.159464601168637, + "grad_norm": 6.588756561279297, + "learning_rate": 2.3039815192281562e-06, + "loss": 2.6144, + "step": 90655 + }, + { + "epoch": 6.1598043212392986, + "grad_norm": 7.154215335845947, + "learning_rate": 2.303556869139829e-06, + "loss": 2.931, + "step": 90660 + }, + { + "epoch": 6.160144041309961, + "grad_norm": 8.726757049560547, + "learning_rate": 2.303132219051502e-06, + "loss": 2.4851, + "step": 90665 + }, + { + "epoch": 6.160483761380623, + "grad_norm": 7.558352470397949, + "learning_rate": 2.3027075689631746e-06, + "loss": 3.0258, + "step": 90670 + }, + { + "epoch": 6.160823481451284, + "grad_norm": 6.772473335266113, + "learning_rate": 2.3022829188748474e-06, + "loss": 2.7227, + "step": 90675 + }, + { + "epoch": 6.161163201521946, + "grad_norm": 8.396879196166992, + "learning_rate": 2.3018582687865202e-06, + "loss": 3.0609, + "step": 90680 + }, + { + "epoch": 6.161502921592608, + "grad_norm": 8.920998573303223, + "learning_rate": 2.3014336186981926e-06, + "loss": 2.7749, + "step": 90685 + }, + { + "epoch": 6.161842641663269, + "grad_norm": 7.6037726402282715, + "learning_rate": 2.301008968609866e-06, + "loss": 2.7359, + "step": 90690 + }, + { + "epoch": 6.162182361733931, + "grad_norm": 8.071002006530762, + "learning_rate": 2.3005843185215386e-06, + "loss": 2.9889, + "step": 90695 + }, + { + "epoch": 6.162522081804593, + "grad_norm": 8.134525299072266, + "learning_rate": 2.300159668433211e-06, + "loss": 2.7071, + "step": 90700 + }, + { + "epoch": 6.162861801875255, + "grad_norm": 7.215526103973389, + "learning_rate": 2.2997350183448842e-06, + "loss": 2.9432, + "step": 90705 + }, + { + "epoch": 6.163201521945917, + "grad_norm": 8.460434913635254, + "learning_rate": 2.2993103682565566e-06, + "loss": 2.7582, + "step": 90710 + }, + { + "epoch": 6.163541242016579, + "grad_norm": 7.010861396789551, + "learning_rate": 2.2988857181682294e-06, + "loss": 2.6883, + "step": 90715 + }, + { + "epoch": 6.16388096208724, + "grad_norm": 8.479626655578613, + "learning_rate": 2.2984610680799026e-06, + "loss": 2.7614, + "step": 90720 + }, + { + "epoch": 6.164220682157902, + "grad_norm": 10.462898254394531, + "learning_rate": 2.298036417991575e-06, + "loss": 2.8634, + "step": 90725 + }, + { + "epoch": 6.164560402228564, + "grad_norm": 10.031872749328613, + "learning_rate": 2.297611767903248e-06, + "loss": 2.8352, + "step": 90730 + }, + { + "epoch": 6.164900122299225, + "grad_norm": 7.250736236572266, + "learning_rate": 2.2971871178149206e-06, + "loss": 2.7618, + "step": 90735 + }, + { + "epoch": 6.165239842369887, + "grad_norm": 7.887839317321777, + "learning_rate": 2.2967624677265934e-06, + "loss": 2.7982, + "step": 90740 + }, + { + "epoch": 6.165579562440549, + "grad_norm": 9.887123107910156, + "learning_rate": 2.2963378176382662e-06, + "loss": 2.7069, + "step": 90745 + }, + { + "epoch": 6.165919282511211, + "grad_norm": 9.315994262695312, + "learning_rate": 2.295913167549939e-06, + "loss": 2.6482, + "step": 90750 + }, + { + "epoch": 6.166259002581873, + "grad_norm": 6.636514663696289, + "learning_rate": 2.295488517461612e-06, + "loss": 2.7259, + "step": 90755 + }, + { + "epoch": 6.166598722652535, + "grad_norm": 7.285003662109375, + "learning_rate": 2.2950638673732846e-06, + "loss": 2.6265, + "step": 90760 + }, + { + "epoch": 6.166938442723196, + "grad_norm": 7.711078643798828, + "learning_rate": 2.2946392172849574e-06, + "loss": 2.6781, + "step": 90765 + }, + { + "epoch": 6.167278162793858, + "grad_norm": 9.480205535888672, + "learning_rate": 2.2942145671966302e-06, + "loss": 2.6336, + "step": 90770 + }, + { + "epoch": 6.16761788286452, + "grad_norm": 7.520438194274902, + "learning_rate": 2.293789917108303e-06, + "loss": 2.6567, + "step": 90775 + }, + { + "epoch": 6.167957602935181, + "grad_norm": 8.099278450012207, + "learning_rate": 2.293365267019976e-06, + "loss": 2.8565, + "step": 90780 + }, + { + "epoch": 6.168297323005843, + "grad_norm": 5.673004627227783, + "learning_rate": 2.2929406169316486e-06, + "loss": 2.6443, + "step": 90785 + }, + { + "epoch": 6.1686370430765045, + "grad_norm": 7.8897705078125, + "learning_rate": 2.2925159668433214e-06, + "loss": 3.0019, + "step": 90790 + }, + { + "epoch": 6.168976763147167, + "grad_norm": 15.593742370605469, + "learning_rate": 2.2920913167549942e-06, + "loss": 2.7831, + "step": 90795 + }, + { + "epoch": 6.169316483217829, + "grad_norm": 7.752662658691406, + "learning_rate": 2.2916666666666666e-06, + "loss": 2.6542, + "step": 90800 + }, + { + "epoch": 6.16965620328849, + "grad_norm": 6.021692276000977, + "learning_rate": 2.29124201657834e-06, + "loss": 2.6655, + "step": 90805 + }, + { + "epoch": 6.169995923359152, + "grad_norm": 8.763997077941895, + "learning_rate": 2.290817366490012e-06, + "loss": 2.9084, + "step": 90810 + }, + { + "epoch": 6.170335643429814, + "grad_norm": 8.110492706298828, + "learning_rate": 2.290392716401685e-06, + "loss": 2.8464, + "step": 90815 + }, + { + "epoch": 6.170675363500475, + "grad_norm": 8.347991943359375, + "learning_rate": 2.2899680663133582e-06, + "loss": 2.7748, + "step": 90820 + }, + { + "epoch": 6.171015083571137, + "grad_norm": 7.027602672576904, + "learning_rate": 2.2895434162250306e-06, + "loss": 2.7113, + "step": 90825 + }, + { + "epoch": 6.171354803641799, + "grad_norm": 7.173009872436523, + "learning_rate": 2.2891187661367034e-06, + "loss": 2.5621, + "step": 90830 + }, + { + "epoch": 6.1716945237124605, + "grad_norm": 9.394646644592285, + "learning_rate": 2.2886941160483762e-06, + "loss": 2.593, + "step": 90835 + }, + { + "epoch": 6.172034243783123, + "grad_norm": 8.267658233642578, + "learning_rate": 2.288269465960049e-06, + "loss": 2.7223, + "step": 90840 + }, + { + "epoch": 6.172373963853785, + "grad_norm": 8.152230262756348, + "learning_rate": 2.287844815871722e-06, + "loss": 2.5692, + "step": 90845 + }, + { + "epoch": 6.172713683924446, + "grad_norm": 6.937267780303955, + "learning_rate": 2.2874201657833946e-06, + "loss": 2.7874, + "step": 90850 + }, + { + "epoch": 6.173053403995108, + "grad_norm": 7.9643120765686035, + "learning_rate": 2.2869955156950674e-06, + "loss": 2.6902, + "step": 90855 + }, + { + "epoch": 6.17339312406577, + "grad_norm": 7.781460285186768, + "learning_rate": 2.2865708656067402e-06, + "loss": 2.5831, + "step": 90860 + }, + { + "epoch": 6.173732844136431, + "grad_norm": 6.645206451416016, + "learning_rate": 2.286146215518413e-06, + "loss": 2.7741, + "step": 90865 + }, + { + "epoch": 6.174072564207093, + "grad_norm": 6.8588738441467285, + "learning_rate": 2.285721565430086e-06, + "loss": 2.7039, + "step": 90870 + }, + { + "epoch": 6.174412284277755, + "grad_norm": 6.80266809463501, + "learning_rate": 2.2852969153417586e-06, + "loss": 2.3592, + "step": 90875 + }, + { + "epoch": 6.1747520043484165, + "grad_norm": 7.937875270843506, + "learning_rate": 2.2848722652534314e-06, + "loss": 2.7913, + "step": 90880 + }, + { + "epoch": 6.175091724419079, + "grad_norm": 10.113842964172363, + "learning_rate": 2.2844476151651042e-06, + "loss": 2.8043, + "step": 90885 + }, + { + "epoch": 6.175431444489741, + "grad_norm": 6.971272945404053, + "learning_rate": 2.284022965076777e-06, + "loss": 2.6735, + "step": 90890 + }, + { + "epoch": 6.175771164560402, + "grad_norm": 7.738661766052246, + "learning_rate": 2.28359831498845e-06, + "loss": 2.7335, + "step": 90895 + }, + { + "epoch": 6.176110884631064, + "grad_norm": 8.748778343200684, + "learning_rate": 2.283173664900122e-06, + "loss": 2.6415, + "step": 90900 + }, + { + "epoch": 6.176450604701726, + "grad_norm": 7.555642604827881, + "learning_rate": 2.2827490148117954e-06, + "loss": 2.5434, + "step": 90905 + }, + { + "epoch": 6.176790324772387, + "grad_norm": 7.366908550262451, + "learning_rate": 2.2823243647234682e-06, + "loss": 2.6521, + "step": 90910 + }, + { + "epoch": 6.177130044843049, + "grad_norm": 7.325109958648682, + "learning_rate": 2.2818997146351406e-06, + "loss": 2.6615, + "step": 90915 + }, + { + "epoch": 6.177469764913711, + "grad_norm": 8.103775024414062, + "learning_rate": 2.281475064546814e-06, + "loss": 2.7799, + "step": 90920 + }, + { + "epoch": 6.1778094849843725, + "grad_norm": 8.010918617248535, + "learning_rate": 2.281050414458486e-06, + "loss": 2.591, + "step": 90925 + }, + { + "epoch": 6.178149205055035, + "grad_norm": 7.065635681152344, + "learning_rate": 2.2806257643701594e-06, + "loss": 2.7267, + "step": 90930 + }, + { + "epoch": 6.178488925125697, + "grad_norm": 8.292085647583008, + "learning_rate": 2.280201114281832e-06, + "loss": 2.8196, + "step": 90935 + }, + { + "epoch": 6.178828645196358, + "grad_norm": 6.873868465423584, + "learning_rate": 2.2797764641935046e-06, + "loss": 2.8636, + "step": 90940 + }, + { + "epoch": 6.17916836526702, + "grad_norm": 7.609066963195801, + "learning_rate": 2.279351814105178e-06, + "loss": 2.6243, + "step": 90945 + }, + { + "epoch": 6.179508085337682, + "grad_norm": 8.235844612121582, + "learning_rate": 2.27892716401685e-06, + "loss": 2.6133, + "step": 90950 + }, + { + "epoch": 6.179847805408343, + "grad_norm": 8.9719877243042, + "learning_rate": 2.278502513928523e-06, + "loss": 2.7564, + "step": 90955 + }, + { + "epoch": 6.180187525479005, + "grad_norm": 7.921579360961914, + "learning_rate": 2.278077863840196e-06, + "loss": 2.7556, + "step": 90960 + }, + { + "epoch": 6.180527245549667, + "grad_norm": 9.832858085632324, + "learning_rate": 2.2776532137518686e-06, + "loss": 2.6022, + "step": 90965 + }, + { + "epoch": 6.180866965620329, + "grad_norm": 8.810938835144043, + "learning_rate": 2.2772285636635414e-06, + "loss": 2.9457, + "step": 90970 + }, + { + "epoch": 6.181206685690991, + "grad_norm": 7.1766204833984375, + "learning_rate": 2.276803913575214e-06, + "loss": 2.8711, + "step": 90975 + }, + { + "epoch": 6.181546405761653, + "grad_norm": 8.900644302368164, + "learning_rate": 2.276379263486887e-06, + "loss": 3.025, + "step": 90980 + }, + { + "epoch": 6.181886125832314, + "grad_norm": 8.166348457336426, + "learning_rate": 2.27595461339856e-06, + "loss": 2.5041, + "step": 90985 + }, + { + "epoch": 6.182225845902976, + "grad_norm": 7.337247371673584, + "learning_rate": 2.2755299633102326e-06, + "loss": 2.6975, + "step": 90990 + }, + { + "epoch": 6.182565565973638, + "grad_norm": 7.757140636444092, + "learning_rate": 2.2751053132219054e-06, + "loss": 2.8194, + "step": 90995 + }, + { + "epoch": 6.182905286044299, + "grad_norm": 8.839167594909668, + "learning_rate": 2.274680663133578e-06, + "loss": 2.8789, + "step": 91000 + }, + { + "epoch": 6.183245006114961, + "grad_norm": 6.726457595825195, + "learning_rate": 2.274256013045251e-06, + "loss": 2.7116, + "step": 91005 + }, + { + "epoch": 6.183584726185623, + "grad_norm": 7.359989166259766, + "learning_rate": 2.273831362956924e-06, + "loss": 2.7057, + "step": 91010 + }, + { + "epoch": 6.183924446256285, + "grad_norm": 6.958961486816406, + "learning_rate": 2.2734067128685966e-06, + "loss": 2.9457, + "step": 91015 + }, + { + "epoch": 6.184264166326947, + "grad_norm": 6.383208751678467, + "learning_rate": 2.2729820627802694e-06, + "loss": 2.6077, + "step": 91020 + }, + { + "epoch": 6.184603886397609, + "grad_norm": 7.961606502532959, + "learning_rate": 2.272557412691942e-06, + "loss": 2.8484, + "step": 91025 + }, + { + "epoch": 6.18494360646827, + "grad_norm": 8.590713500976562, + "learning_rate": 2.272132762603615e-06, + "loss": 2.7438, + "step": 91030 + }, + { + "epoch": 6.185283326538932, + "grad_norm": 7.3758978843688965, + "learning_rate": 2.271708112515288e-06, + "loss": 2.6728, + "step": 91035 + }, + { + "epoch": 6.185623046609594, + "grad_norm": 8.267060279846191, + "learning_rate": 2.27128346242696e-06, + "loss": 2.6556, + "step": 91040 + }, + { + "epoch": 6.185962766680255, + "grad_norm": 8.038496971130371, + "learning_rate": 2.2708588123386334e-06, + "loss": 2.8184, + "step": 91045 + }, + { + "epoch": 6.186302486750917, + "grad_norm": 5.75028657913208, + "learning_rate": 2.270434162250306e-06, + "loss": 2.9262, + "step": 91050 + }, + { + "epoch": 6.186642206821579, + "grad_norm": 7.898735523223877, + "learning_rate": 2.2700095121619786e-06, + "loss": 3.0683, + "step": 91055 + }, + { + "epoch": 6.186981926892241, + "grad_norm": 6.865084648132324, + "learning_rate": 2.2695848620736514e-06, + "loss": 2.5552, + "step": 91060 + }, + { + "epoch": 6.187321646962903, + "grad_norm": 9.469365119934082, + "learning_rate": 2.269160211985324e-06, + "loss": 2.8681, + "step": 91065 + }, + { + "epoch": 6.187661367033565, + "grad_norm": 6.653181552886963, + "learning_rate": 2.268735561896997e-06, + "loss": 2.5514, + "step": 91070 + }, + { + "epoch": 6.188001087104226, + "grad_norm": 7.643032073974609, + "learning_rate": 2.26831091180867e-06, + "loss": 2.6646, + "step": 91075 + }, + { + "epoch": 6.188340807174888, + "grad_norm": 6.851491451263428, + "learning_rate": 2.2678862617203426e-06, + "loss": 2.7679, + "step": 91080 + }, + { + "epoch": 6.18868052724555, + "grad_norm": 10.934592247009277, + "learning_rate": 2.2674616116320154e-06, + "loss": 2.7466, + "step": 91085 + }, + { + "epoch": 6.189020247316211, + "grad_norm": 8.111181259155273, + "learning_rate": 2.267036961543688e-06, + "loss": 2.783, + "step": 91090 + }, + { + "epoch": 6.189359967386873, + "grad_norm": 7.039917945861816, + "learning_rate": 2.266612311455361e-06, + "loss": 2.9302, + "step": 91095 + }, + { + "epoch": 6.189699687457535, + "grad_norm": 9.406437873840332, + "learning_rate": 2.266187661367034e-06, + "loss": 2.7011, + "step": 91100 + }, + { + "epoch": 6.190039407528197, + "grad_norm": 6.842634677886963, + "learning_rate": 2.2657630112787066e-06, + "loss": 2.5634, + "step": 91105 + }, + { + "epoch": 6.190379127598859, + "grad_norm": 6.060662269592285, + "learning_rate": 2.2653383611903794e-06, + "loss": 2.988, + "step": 91110 + }, + { + "epoch": 6.190718847669521, + "grad_norm": 7.697386741638184, + "learning_rate": 2.264913711102052e-06, + "loss": 2.9255, + "step": 91115 + }, + { + "epoch": 6.191058567740182, + "grad_norm": 9.98515510559082, + "learning_rate": 2.264489061013725e-06, + "loss": 2.8176, + "step": 91120 + }, + { + "epoch": 6.191398287810844, + "grad_norm": 8.601571083068848, + "learning_rate": 2.2640644109253974e-06, + "loss": 2.7652, + "step": 91125 + }, + { + "epoch": 6.191738007881506, + "grad_norm": 9.154590606689453, + "learning_rate": 2.2636397608370706e-06, + "loss": 2.6443, + "step": 91130 + }, + { + "epoch": 6.192077727952167, + "grad_norm": 7.077844142913818, + "learning_rate": 2.2632151107487434e-06, + "loss": 2.9659, + "step": 91135 + }, + { + "epoch": 6.192417448022829, + "grad_norm": 6.825596809387207, + "learning_rate": 2.262790460660416e-06, + "loss": 2.955, + "step": 91140 + }, + { + "epoch": 6.192757168093491, + "grad_norm": 8.297039985656738, + "learning_rate": 2.262365810572089e-06, + "loss": 2.4657, + "step": 91145 + }, + { + "epoch": 6.193096888164153, + "grad_norm": 6.537344455718994, + "learning_rate": 2.2619411604837614e-06, + "loss": 2.7764, + "step": 91150 + }, + { + "epoch": 6.193436608234815, + "grad_norm": 8.496272087097168, + "learning_rate": 2.261516510395434e-06, + "loss": 2.8953, + "step": 91155 + }, + { + "epoch": 6.193776328305476, + "grad_norm": 7.102780818939209, + "learning_rate": 2.261091860307107e-06, + "loss": 2.7475, + "step": 91160 + }, + { + "epoch": 6.194116048376138, + "grad_norm": 6.19757080078125, + "learning_rate": 2.26066721021878e-06, + "loss": 2.7253, + "step": 91165 + }, + { + "epoch": 6.1944557684468, + "grad_norm": 7.109752655029297, + "learning_rate": 2.2602425601304526e-06, + "loss": 2.7074, + "step": 91170 + }, + { + "epoch": 6.194795488517461, + "grad_norm": 7.897248268127441, + "learning_rate": 2.2598179100421254e-06, + "loss": 2.5482, + "step": 91175 + }, + { + "epoch": 6.195135208588123, + "grad_norm": 8.494067192077637, + "learning_rate": 2.259393259953798e-06, + "loss": 2.8171, + "step": 91180 + }, + { + "epoch": 6.195474928658785, + "grad_norm": 10.214097023010254, + "learning_rate": 2.258968609865471e-06, + "loss": 2.8502, + "step": 91185 + }, + { + "epoch": 6.1958146487294465, + "grad_norm": 7.734096050262451, + "learning_rate": 2.258543959777144e-06, + "loss": 2.7306, + "step": 91190 + }, + { + "epoch": 6.196154368800109, + "grad_norm": 5.597986221313477, + "learning_rate": 2.2581193096888166e-06, + "loss": 2.6231, + "step": 91195 + }, + { + "epoch": 6.196494088870771, + "grad_norm": 7.675076007843018, + "learning_rate": 2.2576946596004894e-06, + "loss": 2.842, + "step": 91200 + }, + { + "epoch": 6.196833808941432, + "grad_norm": 8.02856731414795, + "learning_rate": 2.257270009512162e-06, + "loss": 2.6767, + "step": 91205 + }, + { + "epoch": 6.197173529012094, + "grad_norm": 10.611495971679688, + "learning_rate": 2.256845359423835e-06, + "loss": 2.6553, + "step": 91210 + }, + { + "epoch": 6.197513249082756, + "grad_norm": 7.260876655578613, + "learning_rate": 2.256420709335508e-06, + "loss": 2.7607, + "step": 91215 + }, + { + "epoch": 6.197852969153417, + "grad_norm": 7.3160505294799805, + "learning_rate": 2.2559960592471806e-06, + "loss": 2.62, + "step": 91220 + }, + { + "epoch": 6.198192689224079, + "grad_norm": 7.390191555023193, + "learning_rate": 2.2555714091588534e-06, + "loss": 2.8986, + "step": 91225 + }, + { + "epoch": 6.198532409294741, + "grad_norm": 8.011627197265625, + "learning_rate": 2.255146759070526e-06, + "loss": 2.7925, + "step": 91230 + }, + { + "epoch": 6.1988721293654026, + "grad_norm": 7.442531108856201, + "learning_rate": 2.254722108982199e-06, + "loss": 2.6999, + "step": 91235 + }, + { + "epoch": 6.199211849436065, + "grad_norm": 7.767531871795654, + "learning_rate": 2.2542974588938714e-06, + "loss": 2.6753, + "step": 91240 + }, + { + "epoch": 6.199551569506727, + "grad_norm": 6.326804161071777, + "learning_rate": 2.2538728088055446e-06, + "loss": 2.7794, + "step": 91245 + }, + { + "epoch": 6.199891289577388, + "grad_norm": 8.259685516357422, + "learning_rate": 2.253448158717217e-06, + "loss": 2.7451, + "step": 91250 + }, + { + "epoch": 6.20023100964805, + "grad_norm": 6.911654472351074, + "learning_rate": 2.2530235086288898e-06, + "loss": 2.3605, + "step": 91255 + }, + { + "epoch": 6.200570729718712, + "grad_norm": 8.500389099121094, + "learning_rate": 2.252598858540563e-06, + "loss": 2.6024, + "step": 91260 + }, + { + "epoch": 6.200910449789373, + "grad_norm": 7.711841106414795, + "learning_rate": 2.2521742084522354e-06, + "loss": 2.7978, + "step": 91265 + }, + { + "epoch": 6.201250169860035, + "grad_norm": 8.232673645019531, + "learning_rate": 2.2517495583639086e-06, + "loss": 2.7425, + "step": 91270 + }, + { + "epoch": 6.201589889930697, + "grad_norm": 7.266749382019043, + "learning_rate": 2.251324908275581e-06, + "loss": 2.7411, + "step": 91275 + }, + { + "epoch": 6.201929610001359, + "grad_norm": 5.964229583740234, + "learning_rate": 2.2509002581872538e-06, + "loss": 2.8134, + "step": 91280 + }, + { + "epoch": 6.202269330072021, + "grad_norm": 7.785319805145264, + "learning_rate": 2.2504756080989266e-06, + "loss": 2.5255, + "step": 91285 + }, + { + "epoch": 6.202609050142683, + "grad_norm": 6.855353832244873, + "learning_rate": 2.2500509580105994e-06, + "loss": 2.7882, + "step": 91290 + }, + { + "epoch": 6.202948770213344, + "grad_norm": 8.593867301940918, + "learning_rate": 2.249626307922272e-06, + "loss": 2.9214, + "step": 91295 + }, + { + "epoch": 6.203288490284006, + "grad_norm": 9.243343353271484, + "learning_rate": 2.249201657833945e-06, + "loss": 2.4268, + "step": 91300 + }, + { + "epoch": 6.203628210354668, + "grad_norm": 6.938484191894531, + "learning_rate": 2.248777007745618e-06, + "loss": 2.8322, + "step": 91305 + }, + { + "epoch": 6.203967930425329, + "grad_norm": 7.403357028961182, + "learning_rate": 2.2483523576572906e-06, + "loss": 2.6254, + "step": 91310 + }, + { + "epoch": 6.204307650495991, + "grad_norm": 6.658154487609863, + "learning_rate": 2.2479277075689634e-06, + "loss": 2.6723, + "step": 91315 + }, + { + "epoch": 6.204647370566653, + "grad_norm": 9.13429069519043, + "learning_rate": 2.247503057480636e-06, + "loss": 2.8509, + "step": 91320 + }, + { + "epoch": 6.204987090637315, + "grad_norm": 9.656012535095215, + "learning_rate": 2.247078407392309e-06, + "loss": 2.6128, + "step": 91325 + }, + { + "epoch": 6.205326810707977, + "grad_norm": 6.452877521514893, + "learning_rate": 2.246653757303982e-06, + "loss": 2.7305, + "step": 91330 + }, + { + "epoch": 6.205666530778639, + "grad_norm": 6.635073184967041, + "learning_rate": 2.2462291072156546e-06, + "loss": 2.7429, + "step": 91335 + }, + { + "epoch": 6.2060062508493, + "grad_norm": 6.477306365966797, + "learning_rate": 2.245804457127327e-06, + "loss": 2.6584, + "step": 91340 + }, + { + "epoch": 6.206345970919962, + "grad_norm": 9.543712615966797, + "learning_rate": 2.245379807039e-06, + "loss": 2.727, + "step": 91345 + }, + { + "epoch": 6.206685690990624, + "grad_norm": 8.454511642456055, + "learning_rate": 2.244955156950673e-06, + "loss": 2.5731, + "step": 91350 + }, + { + "epoch": 6.207025411061285, + "grad_norm": 8.601164817810059, + "learning_rate": 2.244530506862346e-06, + "loss": 2.703, + "step": 91355 + }, + { + "epoch": 6.207365131131947, + "grad_norm": 6.875840187072754, + "learning_rate": 2.2441058567740186e-06, + "loss": 2.6745, + "step": 91360 + }, + { + "epoch": 6.207704851202609, + "grad_norm": 10.543889045715332, + "learning_rate": 2.243681206685691e-06, + "loss": 2.6792, + "step": 91365 + }, + { + "epoch": 6.208044571273271, + "grad_norm": 8.863862991333008, + "learning_rate": 2.243256556597364e-06, + "loss": 2.9176, + "step": 91370 + }, + { + "epoch": 6.208384291343933, + "grad_norm": 7.603339672088623, + "learning_rate": 2.2428319065090366e-06, + "loss": 2.9719, + "step": 91375 + }, + { + "epoch": 6.208724011414595, + "grad_norm": 8.984429359436035, + "learning_rate": 2.2424072564207094e-06, + "loss": 2.8049, + "step": 91380 + }, + { + "epoch": 6.209063731485256, + "grad_norm": 7.706449031829834, + "learning_rate": 2.2419826063323826e-06, + "loss": 2.7879, + "step": 91385 + }, + { + "epoch": 6.209403451555918, + "grad_norm": 8.477530479431152, + "learning_rate": 2.241557956244055e-06, + "loss": 2.7341, + "step": 91390 + }, + { + "epoch": 6.20974317162658, + "grad_norm": 8.229475975036621, + "learning_rate": 2.2411333061557278e-06, + "loss": 2.6928, + "step": 91395 + }, + { + "epoch": 6.210082891697241, + "grad_norm": 6.3940019607543945, + "learning_rate": 2.2407086560674006e-06, + "loss": 2.6648, + "step": 91400 + }, + { + "epoch": 6.210422611767903, + "grad_norm": 6.009758472442627, + "learning_rate": 2.2402840059790734e-06, + "loss": 2.7648, + "step": 91405 + }, + { + "epoch": 6.210762331838565, + "grad_norm": 6.346810817718506, + "learning_rate": 2.239859355890746e-06, + "loss": 2.4998, + "step": 91410 + }, + { + "epoch": 6.211102051909227, + "grad_norm": 8.798271179199219, + "learning_rate": 2.239434705802419e-06, + "loss": 2.8402, + "step": 91415 + }, + { + "epoch": 6.211441771979889, + "grad_norm": 7.688589572906494, + "learning_rate": 2.2390100557140918e-06, + "loss": 2.9435, + "step": 91420 + }, + { + "epoch": 6.211781492050551, + "grad_norm": 6.825893402099609, + "learning_rate": 2.2385854056257646e-06, + "loss": 2.6028, + "step": 91425 + }, + { + "epoch": 6.212121212121212, + "grad_norm": 9.637923240661621, + "learning_rate": 2.2381607555374374e-06, + "loss": 2.6215, + "step": 91430 + }, + { + "epoch": 6.212460932191874, + "grad_norm": 7.345296382904053, + "learning_rate": 2.23773610544911e-06, + "loss": 2.9608, + "step": 91435 + }, + { + "epoch": 6.212800652262536, + "grad_norm": 8.10209846496582, + "learning_rate": 2.237311455360783e-06, + "loss": 2.9921, + "step": 91440 + }, + { + "epoch": 6.213140372333197, + "grad_norm": 7.198303699493408, + "learning_rate": 2.236886805272456e-06, + "loss": 2.739, + "step": 91445 + }, + { + "epoch": 6.213480092403859, + "grad_norm": 7.242414474487305, + "learning_rate": 2.2364621551841286e-06, + "loss": 2.7953, + "step": 91450 + }, + { + "epoch": 6.213819812474521, + "grad_norm": 8.173453330993652, + "learning_rate": 2.2360375050958014e-06, + "loss": 2.8092, + "step": 91455 + }, + { + "epoch": 6.214159532545183, + "grad_norm": 6.507915019989014, + "learning_rate": 2.235612855007474e-06, + "loss": 2.9379, + "step": 91460 + }, + { + "epoch": 6.214499252615845, + "grad_norm": 8.185636520385742, + "learning_rate": 2.2351882049191466e-06, + "loss": 2.6891, + "step": 91465 + }, + { + "epoch": 6.214838972686506, + "grad_norm": 10.042739868164062, + "learning_rate": 2.23476355483082e-06, + "loss": 2.6504, + "step": 91470 + }, + { + "epoch": 6.215178692757168, + "grad_norm": 7.046680450439453, + "learning_rate": 2.234338904742492e-06, + "loss": 2.6819, + "step": 91475 + }, + { + "epoch": 6.21551841282783, + "grad_norm": 6.846033096313477, + "learning_rate": 2.233914254654165e-06, + "loss": 2.5888, + "step": 91480 + }, + { + "epoch": 6.215858132898491, + "grad_norm": 9.417280197143555, + "learning_rate": 2.233489604565838e-06, + "loss": 2.7618, + "step": 91485 + }, + { + "epoch": 6.216197852969153, + "grad_norm": 9.880456924438477, + "learning_rate": 2.2330649544775106e-06, + "loss": 3.0344, + "step": 91490 + }, + { + "epoch": 6.216537573039815, + "grad_norm": 7.2952704429626465, + "learning_rate": 2.2326403043891834e-06, + "loss": 2.847, + "step": 91495 + }, + { + "epoch": 6.2168772931104765, + "grad_norm": 8.797821044921875, + "learning_rate": 2.232215654300856e-06, + "loss": 2.7027, + "step": 91500 + }, + { + "epoch": 6.217217013181139, + "grad_norm": 6.581066608428955, + "learning_rate": 2.231791004212529e-06, + "loss": 2.7657, + "step": 91505 + }, + { + "epoch": 6.217556733251801, + "grad_norm": 9.609334945678711, + "learning_rate": 2.2313663541242018e-06, + "loss": 2.4192, + "step": 91510 + }, + { + "epoch": 6.217896453322462, + "grad_norm": 8.690607070922852, + "learning_rate": 2.2309417040358746e-06, + "loss": 2.4755, + "step": 91515 + }, + { + "epoch": 6.218236173393124, + "grad_norm": 7.972048759460449, + "learning_rate": 2.2305170539475474e-06, + "loss": 2.5442, + "step": 91520 + }, + { + "epoch": 6.218575893463786, + "grad_norm": 8.528945922851562, + "learning_rate": 2.23009240385922e-06, + "loss": 2.8377, + "step": 91525 + }, + { + "epoch": 6.218915613534447, + "grad_norm": 8.59028148651123, + "learning_rate": 2.229667753770893e-06, + "loss": 2.7264, + "step": 91530 + }, + { + "epoch": 6.219255333605109, + "grad_norm": 8.754313468933105, + "learning_rate": 2.2292431036825658e-06, + "loss": 2.6387, + "step": 91535 + }, + { + "epoch": 6.219595053675771, + "grad_norm": 7.843552589416504, + "learning_rate": 2.2288184535942386e-06, + "loss": 2.8266, + "step": 91540 + }, + { + "epoch": 6.2199347737464326, + "grad_norm": 8.692566871643066, + "learning_rate": 2.2283938035059114e-06, + "loss": 2.6621, + "step": 91545 + }, + { + "epoch": 6.220274493817095, + "grad_norm": 6.832518577575684, + "learning_rate": 2.227969153417584e-06, + "loss": 2.6285, + "step": 91550 + }, + { + "epoch": 6.220614213887757, + "grad_norm": 7.243305683135986, + "learning_rate": 2.227544503329257e-06, + "loss": 2.8125, + "step": 91555 + }, + { + "epoch": 6.220953933958418, + "grad_norm": 7.588293075561523, + "learning_rate": 2.2271198532409298e-06, + "loss": 2.8549, + "step": 91560 + }, + { + "epoch": 6.22129365402908, + "grad_norm": 7.160055637359619, + "learning_rate": 2.226695203152602e-06, + "loss": 2.7601, + "step": 91565 + }, + { + "epoch": 6.221633374099742, + "grad_norm": 8.313560485839844, + "learning_rate": 2.2262705530642754e-06, + "loss": 2.6895, + "step": 91570 + }, + { + "epoch": 6.221973094170403, + "grad_norm": 7.181273460388184, + "learning_rate": 2.225845902975948e-06, + "loss": 2.6816, + "step": 91575 + }, + { + "epoch": 6.222312814241065, + "grad_norm": 9.249593734741211, + "learning_rate": 2.2254212528876206e-06, + "loss": 2.9198, + "step": 91580 + }, + { + "epoch": 6.222652534311727, + "grad_norm": 7.262036323547363, + "learning_rate": 2.2249966027992938e-06, + "loss": 2.638, + "step": 91585 + }, + { + "epoch": 6.222992254382389, + "grad_norm": 6.645593643188477, + "learning_rate": 2.224571952710966e-06, + "loss": 2.5735, + "step": 91590 + }, + { + "epoch": 6.223331974453051, + "grad_norm": 6.607216835021973, + "learning_rate": 2.224147302622639e-06, + "loss": 2.6867, + "step": 91595 + }, + { + "epoch": 6.223671694523713, + "grad_norm": 7.906120777130127, + "learning_rate": 2.2237226525343118e-06, + "loss": 2.791, + "step": 91600 + }, + { + "epoch": 6.224011414594374, + "grad_norm": 7.898187637329102, + "learning_rate": 2.2232980024459846e-06, + "loss": 2.7895, + "step": 91605 + }, + { + "epoch": 6.224351134665036, + "grad_norm": 7.783328533172607, + "learning_rate": 2.222873352357658e-06, + "loss": 2.6088, + "step": 91610 + }, + { + "epoch": 6.224690854735698, + "grad_norm": 8.499261856079102, + "learning_rate": 2.22244870226933e-06, + "loss": 3.0352, + "step": 91615 + }, + { + "epoch": 6.225030574806359, + "grad_norm": 9.393688201904297, + "learning_rate": 2.222024052181003e-06, + "loss": 2.7294, + "step": 91620 + }, + { + "epoch": 6.225370294877021, + "grad_norm": 8.028640747070312, + "learning_rate": 2.2215994020926758e-06, + "loss": 2.9003, + "step": 91625 + }, + { + "epoch": 6.225710014947683, + "grad_norm": 6.964957237243652, + "learning_rate": 2.2211747520043486e-06, + "loss": 2.6494, + "step": 91630 + }, + { + "epoch": 6.226049735018345, + "grad_norm": 6.7380290031433105, + "learning_rate": 2.2207501019160214e-06, + "loss": 2.716, + "step": 91635 + }, + { + "epoch": 6.226389455089007, + "grad_norm": 8.474470138549805, + "learning_rate": 2.220325451827694e-06, + "loss": 2.548, + "step": 91640 + }, + { + "epoch": 6.226729175159669, + "grad_norm": 8.066264152526855, + "learning_rate": 2.219900801739367e-06, + "loss": 2.9844, + "step": 91645 + }, + { + "epoch": 6.22706889523033, + "grad_norm": 7.431432247161865, + "learning_rate": 2.2194761516510398e-06, + "loss": 2.5441, + "step": 91650 + }, + { + "epoch": 6.227408615300992, + "grad_norm": 8.039901733398438, + "learning_rate": 2.2190515015627126e-06, + "loss": 2.5363, + "step": 91655 + }, + { + "epoch": 6.227748335371654, + "grad_norm": 8.428902626037598, + "learning_rate": 2.2186268514743854e-06, + "loss": 2.715, + "step": 91660 + }, + { + "epoch": 6.228088055442315, + "grad_norm": 6.298513412475586, + "learning_rate": 2.2182022013860578e-06, + "loss": 2.7088, + "step": 91665 + }, + { + "epoch": 6.228427775512977, + "grad_norm": 7.574665546417236, + "learning_rate": 2.217777551297731e-06, + "loss": 2.5981, + "step": 91670 + }, + { + "epoch": 6.228767495583639, + "grad_norm": 8.688020706176758, + "learning_rate": 2.2173529012094038e-06, + "loss": 2.6503, + "step": 91675 + }, + { + "epoch": 6.229107215654301, + "grad_norm": 7.014461994171143, + "learning_rate": 2.216928251121076e-06, + "loss": 2.6922, + "step": 91680 + }, + { + "epoch": 6.229446935724963, + "grad_norm": 7.100880146026611, + "learning_rate": 2.2165036010327494e-06, + "loss": 2.8486, + "step": 91685 + }, + { + "epoch": 6.229786655795625, + "grad_norm": 6.74859094619751, + "learning_rate": 2.2160789509444218e-06, + "loss": 2.5709, + "step": 91690 + }, + { + "epoch": 6.230126375866286, + "grad_norm": 8.926258087158203, + "learning_rate": 2.215654300856095e-06, + "loss": 2.6149, + "step": 91695 + }, + { + "epoch": 6.230466095936948, + "grad_norm": 6.944140434265137, + "learning_rate": 2.2152296507677678e-06, + "loss": 2.6192, + "step": 91700 + }, + { + "epoch": 6.23080581600761, + "grad_norm": 6.565008163452148, + "learning_rate": 2.21480500067944e-06, + "loss": 2.6841, + "step": 91705 + }, + { + "epoch": 6.231145536078271, + "grad_norm": 7.284104824066162, + "learning_rate": 2.2143803505911134e-06, + "loss": 2.5927, + "step": 91710 + }, + { + "epoch": 6.231485256148933, + "grad_norm": 7.976815700531006, + "learning_rate": 2.2139557005027858e-06, + "loss": 3.0538, + "step": 91715 + }, + { + "epoch": 6.231824976219595, + "grad_norm": 10.566468238830566, + "learning_rate": 2.2135310504144586e-06, + "loss": 2.756, + "step": 91720 + }, + { + "epoch": 6.232164696290257, + "grad_norm": 7.172761917114258, + "learning_rate": 2.2131064003261314e-06, + "loss": 2.6889, + "step": 91725 + }, + { + "epoch": 6.232504416360919, + "grad_norm": 6.194476127624512, + "learning_rate": 2.212681750237804e-06, + "loss": 2.4979, + "step": 91730 + }, + { + "epoch": 6.232844136431581, + "grad_norm": 6.708912372589111, + "learning_rate": 2.212257100149477e-06, + "loss": 2.8803, + "step": 91735 + }, + { + "epoch": 6.233183856502242, + "grad_norm": 8.903338432312012, + "learning_rate": 2.2118324500611498e-06, + "loss": 2.8087, + "step": 91740 + }, + { + "epoch": 6.233523576572904, + "grad_norm": 9.286985397338867, + "learning_rate": 2.2114077999728226e-06, + "loss": 2.6866, + "step": 91745 + }, + { + "epoch": 6.233863296643566, + "grad_norm": 9.27858829498291, + "learning_rate": 2.2109831498844954e-06, + "loss": 2.772, + "step": 91750 + }, + { + "epoch": 6.234203016714227, + "grad_norm": 8.34775447845459, + "learning_rate": 2.210558499796168e-06, + "loss": 2.8218, + "step": 91755 + }, + { + "epoch": 6.234542736784889, + "grad_norm": 7.6397905349731445, + "learning_rate": 2.210133849707841e-06, + "loss": 2.8991, + "step": 91760 + }, + { + "epoch": 6.234882456855551, + "grad_norm": 7.791481971740723, + "learning_rate": 2.2097091996195138e-06, + "loss": 2.6117, + "step": 91765 + }, + { + "epoch": 6.235222176926213, + "grad_norm": 6.88769006729126, + "learning_rate": 2.2092845495311866e-06, + "loss": 2.8274, + "step": 91770 + }, + { + "epoch": 6.235561896996875, + "grad_norm": 8.905467987060547, + "learning_rate": 2.2088598994428594e-06, + "loss": 2.7394, + "step": 91775 + }, + { + "epoch": 6.235901617067537, + "grad_norm": 8.182682037353516, + "learning_rate": 2.208435249354532e-06, + "loss": 2.9582, + "step": 91780 + }, + { + "epoch": 6.236241337138198, + "grad_norm": 7.9934258460998535, + "learning_rate": 2.208010599266205e-06, + "loss": 2.6371, + "step": 91785 + }, + { + "epoch": 6.23658105720886, + "grad_norm": 7.365067958831787, + "learning_rate": 2.2075859491778773e-06, + "loss": 2.7254, + "step": 91790 + }, + { + "epoch": 6.236920777279522, + "grad_norm": 7.593716144561768, + "learning_rate": 2.2071612990895506e-06, + "loss": 2.7605, + "step": 91795 + }, + { + "epoch": 6.237260497350183, + "grad_norm": 7.08482551574707, + "learning_rate": 2.2068215790188886e-06, + "loss": 3.0551, + "step": 91800 + }, + { + "epoch": 6.237600217420845, + "grad_norm": 8.275591850280762, + "learning_rate": 2.2063969289305614e-06, + "loss": 2.846, + "step": 91805 + }, + { + "epoch": 6.237939937491507, + "grad_norm": 9.736361503601074, + "learning_rate": 2.2059722788422342e-06, + "loss": 2.7731, + "step": 91810 + }, + { + "epoch": 6.238279657562169, + "grad_norm": 8.992243766784668, + "learning_rate": 2.205547628753907e-06, + "loss": 2.824, + "step": 91815 + }, + { + "epoch": 6.238619377632831, + "grad_norm": 8.907621383666992, + "learning_rate": 2.20512297866558e-06, + "loss": 2.6415, + "step": 91820 + }, + { + "epoch": 6.238959097703493, + "grad_norm": 6.5603790283203125, + "learning_rate": 2.2046983285772526e-06, + "loss": 2.5841, + "step": 91825 + }, + { + "epoch": 6.239298817774154, + "grad_norm": 6.4248151779174805, + "learning_rate": 2.2042736784889254e-06, + "loss": 2.409, + "step": 91830 + }, + { + "epoch": 6.239638537844816, + "grad_norm": 8.319783210754395, + "learning_rate": 2.2038490284005982e-06, + "loss": 2.6468, + "step": 91835 + }, + { + "epoch": 6.239978257915477, + "grad_norm": 8.668034553527832, + "learning_rate": 2.2034243783122706e-06, + "loss": 2.6063, + "step": 91840 + }, + { + "epoch": 6.240317977986139, + "grad_norm": 8.550347328186035, + "learning_rate": 2.202999728223944e-06, + "loss": 2.8041, + "step": 91845 + }, + { + "epoch": 6.240657698056801, + "grad_norm": 7.135409832000732, + "learning_rate": 2.2025750781356167e-06, + "loss": 2.7708, + "step": 91850 + }, + { + "epoch": 6.240997418127463, + "grad_norm": 9.319823265075684, + "learning_rate": 2.202150428047289e-06, + "loss": 2.7365, + "step": 91855 + }, + { + "epoch": 6.241337138198125, + "grad_norm": 8.256237030029297, + "learning_rate": 2.2017257779589623e-06, + "loss": 2.9381, + "step": 91860 + }, + { + "epoch": 6.241676858268787, + "grad_norm": 8.57169246673584, + "learning_rate": 2.2013011278706346e-06, + "loss": 2.7583, + "step": 91865 + }, + { + "epoch": 6.242016578339448, + "grad_norm": 6.759589672088623, + "learning_rate": 2.2008764777823074e-06, + "loss": 2.5113, + "step": 91870 + }, + { + "epoch": 6.24235629841011, + "grad_norm": 11.145397186279297, + "learning_rate": 2.2004518276939802e-06, + "loss": 2.6968, + "step": 91875 + }, + { + "epoch": 6.242696018480772, + "grad_norm": 6.277942180633545, + "learning_rate": 2.200027177605653e-06, + "loss": 2.713, + "step": 91880 + }, + { + "epoch": 6.243035738551433, + "grad_norm": 8.226974487304688, + "learning_rate": 2.199602527517326e-06, + "loss": 2.7094, + "step": 91885 + }, + { + "epoch": 6.243375458622095, + "grad_norm": 7.280538082122803, + "learning_rate": 2.1991778774289986e-06, + "loss": 2.6001, + "step": 91890 + }, + { + "epoch": 6.243715178692757, + "grad_norm": 6.987292289733887, + "learning_rate": 2.1987532273406714e-06, + "loss": 2.7546, + "step": 91895 + }, + { + "epoch": 6.244054898763419, + "grad_norm": 8.832015991210938, + "learning_rate": 2.1983285772523442e-06, + "loss": 2.8107, + "step": 91900 + }, + { + "epoch": 6.244394618834081, + "grad_norm": 8.631720542907715, + "learning_rate": 2.197903927164017e-06, + "loss": 2.8116, + "step": 91905 + }, + { + "epoch": 6.244734338904743, + "grad_norm": 9.379955291748047, + "learning_rate": 2.19747927707569e-06, + "loss": 2.9679, + "step": 91910 + }, + { + "epoch": 6.245074058975404, + "grad_norm": 7.588642597198486, + "learning_rate": 2.1970546269873626e-06, + "loss": 2.6415, + "step": 91915 + }, + { + "epoch": 6.245413779046066, + "grad_norm": 8.149881362915039, + "learning_rate": 2.1966299768990354e-06, + "loss": 2.8081, + "step": 91920 + }, + { + "epoch": 6.245753499116728, + "grad_norm": 8.141511917114258, + "learning_rate": 2.1962053268107082e-06, + "loss": 2.8202, + "step": 91925 + }, + { + "epoch": 6.246093219187389, + "grad_norm": 7.976735591888428, + "learning_rate": 2.195780676722381e-06, + "loss": 2.7532, + "step": 91930 + }, + { + "epoch": 6.246432939258051, + "grad_norm": 7.80977201461792, + "learning_rate": 2.195356026634054e-06, + "loss": 2.5571, + "step": 91935 + }, + { + "epoch": 6.246772659328713, + "grad_norm": 7.517392635345459, + "learning_rate": 2.1949313765457262e-06, + "loss": 2.7824, + "step": 91940 + }, + { + "epoch": 6.247112379399375, + "grad_norm": 8.99686050415039, + "learning_rate": 2.1945067264573994e-06, + "loss": 2.7827, + "step": 91945 + }, + { + "epoch": 6.247452099470037, + "grad_norm": 8.00601577758789, + "learning_rate": 2.1940820763690722e-06, + "loss": 2.6044, + "step": 91950 + }, + { + "epoch": 6.247791819540699, + "grad_norm": 7.159762382507324, + "learning_rate": 2.1936574262807446e-06, + "loss": 2.8062, + "step": 91955 + }, + { + "epoch": 6.24813153961136, + "grad_norm": 6.407093048095703, + "learning_rate": 2.193232776192418e-06, + "loss": 2.7576, + "step": 91960 + }, + { + "epoch": 6.248471259682022, + "grad_norm": 8.107746124267578, + "learning_rate": 2.1928081261040902e-06, + "loss": 2.8017, + "step": 91965 + }, + { + "epoch": 6.248810979752684, + "grad_norm": 7.7524800300598145, + "learning_rate": 2.192383476015763e-06, + "loss": 2.7954, + "step": 91970 + }, + { + "epoch": 6.249150699823345, + "grad_norm": 6.49964714050293, + "learning_rate": 2.191958825927436e-06, + "loss": 2.7433, + "step": 91975 + }, + { + "epoch": 6.249490419894007, + "grad_norm": 7.825250625610352, + "learning_rate": 2.1915341758391086e-06, + "loss": 2.6523, + "step": 91980 + }, + { + "epoch": 6.249830139964669, + "grad_norm": 7.442342758178711, + "learning_rate": 2.191109525750782e-06, + "loss": 2.5895, + "step": 91985 + }, + { + "epoch": 6.250169860035331, + "grad_norm": 7.5263495445251465, + "learning_rate": 2.1906848756624542e-06, + "loss": 2.7907, + "step": 91990 + }, + { + "epoch": 6.250509580105993, + "grad_norm": 6.938475131988525, + "learning_rate": 2.190260225574127e-06, + "loss": 2.6872, + "step": 91995 + }, + { + "epoch": 6.250849300176655, + "grad_norm": 9.201751708984375, + "learning_rate": 2.1898355754858e-06, + "loss": 2.7411, + "step": 92000 + }, + { + "epoch": 6.251189020247316, + "grad_norm": 5.888461112976074, + "learning_rate": 2.1894109253974726e-06, + "loss": 2.9592, + "step": 92005 + }, + { + "epoch": 6.251528740317978, + "grad_norm": 6.564146041870117, + "learning_rate": 2.1889862753091454e-06, + "loss": 2.7598, + "step": 92010 + }, + { + "epoch": 6.25186846038864, + "grad_norm": 6.976151943206787, + "learning_rate": 2.1885616252208182e-06, + "loss": 2.4937, + "step": 92015 + }, + { + "epoch": 6.252208180459301, + "grad_norm": 9.387384414672852, + "learning_rate": 2.188136975132491e-06, + "loss": 2.7288, + "step": 92020 + }, + { + "epoch": 6.252547900529963, + "grad_norm": 7.782924175262451, + "learning_rate": 2.187712325044164e-06, + "loss": 2.6963, + "step": 92025 + }, + { + "epoch": 6.252887620600625, + "grad_norm": 7.838528156280518, + "learning_rate": 2.1872876749558366e-06, + "loss": 2.595, + "step": 92030 + }, + { + "epoch": 6.253227340671287, + "grad_norm": 6.269098281860352, + "learning_rate": 2.1868630248675094e-06, + "loss": 2.8033, + "step": 92035 + }, + { + "epoch": 6.253567060741949, + "grad_norm": 6.351477146148682, + "learning_rate": 2.1864383747791822e-06, + "loss": 2.5751, + "step": 92040 + }, + { + "epoch": 6.253906780812611, + "grad_norm": 6.642183780670166, + "learning_rate": 2.186013724690855e-06, + "loss": 2.8492, + "step": 92045 + }, + { + "epoch": 6.254246500883272, + "grad_norm": 6.126511573791504, + "learning_rate": 2.185589074602528e-06, + "loss": 2.8555, + "step": 92050 + }, + { + "epoch": 6.254586220953934, + "grad_norm": 8.110295295715332, + "learning_rate": 2.1851644245142002e-06, + "loss": 2.7766, + "step": 92055 + }, + { + "epoch": 6.254925941024596, + "grad_norm": 7.44191312789917, + "learning_rate": 2.1847397744258734e-06, + "loss": 2.7978, + "step": 92060 + }, + { + "epoch": 6.255265661095257, + "grad_norm": 8.318718910217285, + "learning_rate": 2.184315124337546e-06, + "loss": 2.6374, + "step": 92065 + }, + { + "epoch": 6.255605381165919, + "grad_norm": 8.420241355895996, + "learning_rate": 2.1838904742492186e-06, + "loss": 2.4967, + "step": 92070 + }, + { + "epoch": 6.255945101236581, + "grad_norm": 7.993727684020996, + "learning_rate": 2.183465824160892e-06, + "loss": 2.9301, + "step": 92075 + }, + { + "epoch": 6.256284821307243, + "grad_norm": 7.393075942993164, + "learning_rate": 2.1830411740725642e-06, + "loss": 2.6731, + "step": 92080 + }, + { + "epoch": 6.256624541377905, + "grad_norm": 7.170165061950684, + "learning_rate": 2.1826165239842374e-06, + "loss": 2.6785, + "step": 92085 + }, + { + "epoch": 6.256964261448567, + "grad_norm": 5.699951171875, + "learning_rate": 2.18219187389591e-06, + "loss": 2.6243, + "step": 92090 + }, + { + "epoch": 6.257303981519228, + "grad_norm": 9.84184741973877, + "learning_rate": 2.1817672238075826e-06, + "loss": 2.4784, + "step": 92095 + }, + { + "epoch": 6.25764370158989, + "grad_norm": 7.13460636138916, + "learning_rate": 2.1813425737192554e-06, + "loss": 2.7642, + "step": 92100 + }, + { + "epoch": 6.257983421660552, + "grad_norm": 6.941720485687256, + "learning_rate": 2.1809179236309282e-06, + "loss": 2.762, + "step": 92105 + }, + { + "epoch": 6.258323141731213, + "grad_norm": 7.9134979248046875, + "learning_rate": 2.180493273542601e-06, + "loss": 2.7849, + "step": 92110 + }, + { + "epoch": 6.258662861801875, + "grad_norm": 8.24018669128418, + "learning_rate": 2.180068623454274e-06, + "loss": 2.9076, + "step": 92115 + }, + { + "epoch": 6.259002581872537, + "grad_norm": 6.5673136711120605, + "learning_rate": 2.1796439733659466e-06, + "loss": 2.9122, + "step": 92120 + }, + { + "epoch": 6.259342301943199, + "grad_norm": 8.433721542358398, + "learning_rate": 2.1792193232776194e-06, + "loss": 2.451, + "step": 92125 + }, + { + "epoch": 6.259682022013861, + "grad_norm": 8.353034973144531, + "learning_rate": 2.1787946731892922e-06, + "loss": 2.8352, + "step": 92130 + }, + { + "epoch": 6.260021742084522, + "grad_norm": 6.196984767913818, + "learning_rate": 2.178370023100965e-06, + "loss": 2.6332, + "step": 92135 + }, + { + "epoch": 6.260361462155184, + "grad_norm": 6.682956695556641, + "learning_rate": 2.177945373012638e-06, + "loss": 2.7533, + "step": 92140 + }, + { + "epoch": 6.260701182225846, + "grad_norm": 7.541657447814941, + "learning_rate": 2.1775207229243106e-06, + "loss": 2.6033, + "step": 92145 + }, + { + "epoch": 6.261040902296507, + "grad_norm": 6.8570556640625, + "learning_rate": 2.1770960728359834e-06, + "loss": 2.6416, + "step": 92150 + }, + { + "epoch": 6.261380622367169, + "grad_norm": 9.89833927154541, + "learning_rate": 2.176671422747656e-06, + "loss": 2.7355, + "step": 92155 + }, + { + "epoch": 6.261720342437831, + "grad_norm": 7.852871894836426, + "learning_rate": 2.176246772659329e-06, + "loss": 2.8985, + "step": 92160 + }, + { + "epoch": 6.262060062508493, + "grad_norm": 8.901154518127441, + "learning_rate": 2.1758221225710014e-06, + "loss": 2.7693, + "step": 92165 + }, + { + "epoch": 6.262399782579155, + "grad_norm": 7.147118091583252, + "learning_rate": 2.1753974724826746e-06, + "loss": 2.8087, + "step": 92170 + }, + { + "epoch": 6.262739502649817, + "grad_norm": 6.643764495849609, + "learning_rate": 2.1749728223943474e-06, + "loss": 2.8854, + "step": 92175 + }, + { + "epoch": 6.263079222720478, + "grad_norm": 9.429443359375, + "learning_rate": 2.17454817230602e-06, + "loss": 2.7397, + "step": 92180 + }, + { + "epoch": 6.26341894279114, + "grad_norm": 7.809380531311035, + "learning_rate": 2.174123522217693e-06, + "loss": 2.7568, + "step": 92185 + }, + { + "epoch": 6.263758662861802, + "grad_norm": 7.6042799949646, + "learning_rate": 2.1736988721293654e-06, + "loss": 2.7929, + "step": 92190 + }, + { + "epoch": 6.264098382932463, + "grad_norm": 7.876460552215576, + "learning_rate": 2.173274222041038e-06, + "loss": 2.8405, + "step": 92195 + }, + { + "epoch": 6.264438103003125, + "grad_norm": 9.271334648132324, + "learning_rate": 2.1728495719527114e-06, + "loss": 2.8549, + "step": 92200 + }, + { + "epoch": 6.264777823073787, + "grad_norm": 8.196702003479004, + "learning_rate": 2.172424921864384e-06, + "loss": 2.5992, + "step": 92205 + }, + { + "epoch": 6.265117543144449, + "grad_norm": 7.948054790496826, + "learning_rate": 2.1720002717760566e-06, + "loss": 2.7756, + "step": 92210 + }, + { + "epoch": 6.265457263215111, + "grad_norm": 8.373573303222656, + "learning_rate": 2.1715756216877294e-06, + "loss": 2.8555, + "step": 92215 + }, + { + "epoch": 6.265796983285773, + "grad_norm": 9.166963577270508, + "learning_rate": 2.1711509715994022e-06, + "loss": 2.6838, + "step": 92220 + }, + { + "epoch": 6.266136703356434, + "grad_norm": 7.380805015563965, + "learning_rate": 2.170726321511075e-06, + "loss": 2.7776, + "step": 92225 + }, + { + "epoch": 6.266476423427096, + "grad_norm": 7.546975612640381, + "learning_rate": 2.170301671422748e-06, + "loss": 2.4557, + "step": 92230 + }, + { + "epoch": 6.266816143497758, + "grad_norm": 12.8489408493042, + "learning_rate": 2.1698770213344206e-06, + "loss": 2.7563, + "step": 92235 + }, + { + "epoch": 6.267155863568419, + "grad_norm": 7.948424339294434, + "learning_rate": 2.1694523712460934e-06, + "loss": 2.6706, + "step": 92240 + }, + { + "epoch": 6.267495583639081, + "grad_norm": 7.301477909088135, + "learning_rate": 2.1690277211577662e-06, + "loss": 2.9204, + "step": 92245 + }, + { + "epoch": 6.267835303709743, + "grad_norm": 9.406197547912598, + "learning_rate": 2.168603071069439e-06, + "loss": 2.739, + "step": 92250 + }, + { + "epoch": 6.268175023780405, + "grad_norm": 9.190116882324219, + "learning_rate": 2.168178420981112e-06, + "loss": 2.9896, + "step": 92255 + }, + { + "epoch": 6.268514743851067, + "grad_norm": 6.596593379974365, + "learning_rate": 2.1677537708927846e-06, + "loss": 2.9285, + "step": 92260 + }, + { + "epoch": 6.268854463921729, + "grad_norm": 7.619528770446777, + "learning_rate": 2.1673291208044574e-06, + "loss": 2.8288, + "step": 92265 + }, + { + "epoch": 6.26919418399239, + "grad_norm": 8.533955574035645, + "learning_rate": 2.1669044707161302e-06, + "loss": 2.6065, + "step": 92270 + }, + { + "epoch": 6.269533904063052, + "grad_norm": 9.358833312988281, + "learning_rate": 2.166479820627803e-06, + "loss": 2.7344, + "step": 92275 + }, + { + "epoch": 6.269873624133714, + "grad_norm": 7.433910846710205, + "learning_rate": 2.1660551705394754e-06, + "loss": 2.5676, + "step": 92280 + }, + { + "epoch": 6.270213344204375, + "grad_norm": 7.890280723571777, + "learning_rate": 2.1656305204511486e-06, + "loss": 2.8896, + "step": 92285 + }, + { + "epoch": 6.270553064275037, + "grad_norm": 9.6113920211792, + "learning_rate": 2.165205870362821e-06, + "loss": 2.9256, + "step": 92290 + }, + { + "epoch": 6.270892784345699, + "grad_norm": 9.192876815795898, + "learning_rate": 2.164781220274494e-06, + "loss": 2.7148, + "step": 92295 + }, + { + "epoch": 6.271232504416361, + "grad_norm": 6.581601142883301, + "learning_rate": 2.164356570186167e-06, + "loss": 2.5547, + "step": 92300 + }, + { + "epoch": 6.271572224487023, + "grad_norm": 6.639545440673828, + "learning_rate": 2.1639319200978394e-06, + "loss": 2.6533, + "step": 92305 + }, + { + "epoch": 6.271911944557685, + "grad_norm": 6.760821342468262, + "learning_rate": 2.163507270009512e-06, + "loss": 2.9997, + "step": 92310 + }, + { + "epoch": 6.272251664628346, + "grad_norm": 8.375635147094727, + "learning_rate": 2.163082619921185e-06, + "loss": 2.8879, + "step": 92315 + }, + { + "epoch": 6.272591384699008, + "grad_norm": 9.260544776916504, + "learning_rate": 2.162657969832858e-06, + "loss": 2.8591, + "step": 92320 + }, + { + "epoch": 6.27293110476967, + "grad_norm": 7.314982891082764, + "learning_rate": 2.1622333197445306e-06, + "loss": 2.8571, + "step": 92325 + }, + { + "epoch": 6.273270824840331, + "grad_norm": 7.82557487487793, + "learning_rate": 2.1618086696562034e-06, + "loss": 2.7544, + "step": 92330 + }, + { + "epoch": 6.273610544910993, + "grad_norm": 7.240082740783691, + "learning_rate": 2.161384019567876e-06, + "loss": 2.7028, + "step": 92335 + }, + { + "epoch": 6.273950264981655, + "grad_norm": 8.711541175842285, + "learning_rate": 2.160959369479549e-06, + "loss": 2.8769, + "step": 92340 + }, + { + "epoch": 6.274289985052317, + "grad_norm": 6.935323238372803, + "learning_rate": 2.160534719391222e-06, + "loss": 2.5481, + "step": 92345 + }, + { + "epoch": 6.274629705122979, + "grad_norm": 7.970047950744629, + "learning_rate": 2.1601100693028946e-06, + "loss": 2.7709, + "step": 92350 + }, + { + "epoch": 6.274969425193641, + "grad_norm": 9.08497142791748, + "learning_rate": 2.1596854192145674e-06, + "loss": 2.5435, + "step": 92355 + }, + { + "epoch": 6.275309145264302, + "grad_norm": 7.166296482086182, + "learning_rate": 2.1592607691262402e-06, + "loss": 2.6756, + "step": 92360 + }, + { + "epoch": 6.275648865334964, + "grad_norm": 7.213673114776611, + "learning_rate": 2.158836119037913e-06, + "loss": 2.6266, + "step": 92365 + }, + { + "epoch": 6.275988585405626, + "grad_norm": 7.79071044921875, + "learning_rate": 2.158411468949586e-06, + "loss": 2.7113, + "step": 92370 + }, + { + "epoch": 6.276328305476287, + "grad_norm": 7.774782180786133, + "learning_rate": 2.1579868188612586e-06, + "loss": 2.7709, + "step": 92375 + }, + { + "epoch": 6.276668025546949, + "grad_norm": 7.6533308029174805, + "learning_rate": 2.157562168772931e-06, + "loss": 2.666, + "step": 92380 + }, + { + "epoch": 6.277007745617611, + "grad_norm": 5.964529514312744, + "learning_rate": 2.1571375186846042e-06, + "loss": 2.8364, + "step": 92385 + }, + { + "epoch": 6.277347465688273, + "grad_norm": 9.668278694152832, + "learning_rate": 2.156712868596277e-06, + "loss": 2.6884, + "step": 92390 + }, + { + "epoch": 6.277687185758935, + "grad_norm": 4.99120569229126, + "learning_rate": 2.1562882185079494e-06, + "loss": 2.6532, + "step": 92395 + }, + { + "epoch": 6.278026905829597, + "grad_norm": 7.261707782745361, + "learning_rate": 2.1558635684196226e-06, + "loss": 2.5229, + "step": 92400 + }, + { + "epoch": 6.278366625900258, + "grad_norm": 7.376436710357666, + "learning_rate": 2.155438918331295e-06, + "loss": 2.8878, + "step": 92405 + }, + { + "epoch": 6.27870634597092, + "grad_norm": 7.212932586669922, + "learning_rate": 2.155014268242968e-06, + "loss": 2.6423, + "step": 92410 + }, + { + "epoch": 6.279046066041582, + "grad_norm": 10.38585376739502, + "learning_rate": 2.1545896181546406e-06, + "loss": 2.725, + "step": 92415 + }, + { + "epoch": 6.279385786112243, + "grad_norm": 7.707838535308838, + "learning_rate": 2.1541649680663134e-06, + "loss": 2.6368, + "step": 92420 + }, + { + "epoch": 6.279725506182905, + "grad_norm": 6.6255879402160645, + "learning_rate": 2.1537403179779866e-06, + "loss": 2.8865, + "step": 92425 + }, + { + "epoch": 6.2800652262535674, + "grad_norm": 8.548131942749023, + "learning_rate": 2.153315667889659e-06, + "loss": 2.7088, + "step": 92430 + }, + { + "epoch": 6.280404946324229, + "grad_norm": 8.847705841064453, + "learning_rate": 2.152891017801332e-06, + "loss": 2.8662, + "step": 92435 + }, + { + "epoch": 6.280744666394891, + "grad_norm": 8.795374870300293, + "learning_rate": 2.1524663677130046e-06, + "loss": 2.7353, + "step": 92440 + }, + { + "epoch": 6.281084386465553, + "grad_norm": 9.902814865112305, + "learning_rate": 2.1520417176246774e-06, + "loss": 2.8542, + "step": 92445 + }, + { + "epoch": 6.281424106536214, + "grad_norm": 8.389163970947266, + "learning_rate": 2.15161706753635e-06, + "loss": 2.7758, + "step": 92450 + }, + { + "epoch": 6.281763826606876, + "grad_norm": 8.284526824951172, + "learning_rate": 2.151192417448023e-06, + "loss": 2.8446, + "step": 92455 + }, + { + "epoch": 6.282103546677538, + "grad_norm": 8.026206016540527, + "learning_rate": 2.150767767359696e-06, + "loss": 2.9121, + "step": 92460 + }, + { + "epoch": 6.282443266748199, + "grad_norm": 8.33360481262207, + "learning_rate": 2.1503431172713686e-06, + "loss": 2.54, + "step": 92465 + }, + { + "epoch": 6.282782986818861, + "grad_norm": 8.40531063079834, + "learning_rate": 2.1499184671830414e-06, + "loss": 2.8518, + "step": 92470 + }, + { + "epoch": 6.2831227068895235, + "grad_norm": 10.081503868103027, + "learning_rate": 2.149493817094714e-06, + "loss": 2.6999, + "step": 92475 + }, + { + "epoch": 6.283462426960185, + "grad_norm": 8.751154899597168, + "learning_rate": 2.1490691670063866e-06, + "loss": 2.9172, + "step": 92480 + }, + { + "epoch": 6.283802147030847, + "grad_norm": 6.800371170043945, + "learning_rate": 2.14864451691806e-06, + "loss": 2.7627, + "step": 92485 + }, + { + "epoch": 6.284141867101509, + "grad_norm": 8.007058143615723, + "learning_rate": 2.1482198668297326e-06, + "loss": 2.4385, + "step": 92490 + }, + { + "epoch": 6.28448158717217, + "grad_norm": 7.584883689880371, + "learning_rate": 2.147795216741405e-06, + "loss": 2.7222, + "step": 92495 + }, + { + "epoch": 6.284821307242832, + "grad_norm": 11.917820930480957, + "learning_rate": 2.147370566653078e-06, + "loss": 2.6831, + "step": 92500 + }, + { + "epoch": 6.285161027313494, + "grad_norm": 8.249909400939941, + "learning_rate": 2.1469459165647506e-06, + "loss": 2.9522, + "step": 92505 + }, + { + "epoch": 6.285500747384155, + "grad_norm": 6.911370754241943, + "learning_rate": 2.146521266476424e-06, + "loss": 2.6159, + "step": 92510 + }, + { + "epoch": 6.285840467454817, + "grad_norm": 6.520767688751221, + "learning_rate": 2.1460966163880966e-06, + "loss": 2.685, + "step": 92515 + }, + { + "epoch": 6.2861801875254795, + "grad_norm": 6.541023254394531, + "learning_rate": 2.145671966299769e-06, + "loss": 2.6177, + "step": 92520 + }, + { + "epoch": 6.286519907596141, + "grad_norm": 8.28116512298584, + "learning_rate": 2.1452473162114422e-06, + "loss": 2.6724, + "step": 92525 + }, + { + "epoch": 6.286859627666803, + "grad_norm": 8.366643905639648, + "learning_rate": 2.1448226661231146e-06, + "loss": 2.9777, + "step": 92530 + }, + { + "epoch": 6.287199347737464, + "grad_norm": 6.3336615562438965, + "learning_rate": 2.1443980160347874e-06, + "loss": 2.64, + "step": 92535 + }, + { + "epoch": 6.287539067808126, + "grad_norm": 10.415332794189453, + "learning_rate": 2.14397336594646e-06, + "loss": 3.0862, + "step": 92540 + }, + { + "epoch": 6.287878787878788, + "grad_norm": 7.320164203643799, + "learning_rate": 2.143548715858133e-06, + "loss": 2.6447, + "step": 92545 + }, + { + "epoch": 6.288218507949449, + "grad_norm": 7.441736698150635, + "learning_rate": 2.143124065769806e-06, + "loss": 2.6105, + "step": 92550 + }, + { + "epoch": 6.288558228020111, + "grad_norm": 5.851437568664551, + "learning_rate": 2.1426994156814786e-06, + "loss": 2.6038, + "step": 92555 + }, + { + "epoch": 6.288897948090773, + "grad_norm": 8.100356101989746, + "learning_rate": 2.1422747655931514e-06, + "loss": 2.9374, + "step": 92560 + }, + { + "epoch": 6.289237668161435, + "grad_norm": 7.8196611404418945, + "learning_rate": 2.141850115504824e-06, + "loss": 2.5992, + "step": 92565 + }, + { + "epoch": 6.289577388232097, + "grad_norm": 10.767210006713867, + "learning_rate": 2.141425465416497e-06, + "loss": 2.8777, + "step": 92570 + }, + { + "epoch": 6.289917108302759, + "grad_norm": 7.868034362792969, + "learning_rate": 2.14100081532817e-06, + "loss": 2.8718, + "step": 92575 + }, + { + "epoch": 6.29025682837342, + "grad_norm": 9.452044486999512, + "learning_rate": 2.1405761652398426e-06, + "loss": 2.59, + "step": 92580 + }, + { + "epoch": 6.290596548444082, + "grad_norm": 8.424427032470703, + "learning_rate": 2.1401515151515154e-06, + "loss": 2.9026, + "step": 92585 + }, + { + "epoch": 6.290936268514744, + "grad_norm": 7.185664653778076, + "learning_rate": 2.139726865063188e-06, + "loss": 2.8041, + "step": 92590 + }, + { + "epoch": 6.291275988585405, + "grad_norm": 8.506217002868652, + "learning_rate": 2.139302214974861e-06, + "loss": 2.6049, + "step": 92595 + }, + { + "epoch": 6.291615708656067, + "grad_norm": 9.801877975463867, + "learning_rate": 2.138877564886534e-06, + "loss": 2.8312, + "step": 92600 + }, + { + "epoch": 6.291955428726729, + "grad_norm": 8.676046371459961, + "learning_rate": 2.138452914798206e-06, + "loss": 2.8322, + "step": 92605 + }, + { + "epoch": 6.292295148797391, + "grad_norm": 7.6605682373046875, + "learning_rate": 2.1380282647098794e-06, + "loss": 2.684, + "step": 92610 + }, + { + "epoch": 6.292634868868053, + "grad_norm": 7.9124016761779785, + "learning_rate": 2.137603614621552e-06, + "loss": 2.9063, + "step": 92615 + }, + { + "epoch": 6.292974588938715, + "grad_norm": 6.384786128997803, + "learning_rate": 2.1371789645332246e-06, + "loss": 2.6767, + "step": 92620 + }, + { + "epoch": 6.293314309009376, + "grad_norm": 7.3471479415893555, + "learning_rate": 2.136754314444898e-06, + "loss": 2.6931, + "step": 92625 + }, + { + "epoch": 6.293654029080038, + "grad_norm": 9.116742134094238, + "learning_rate": 2.13632966435657e-06, + "loss": 2.7594, + "step": 92630 + }, + { + "epoch": 6.2939937491507, + "grad_norm": 7.418399333953857, + "learning_rate": 2.135905014268243e-06, + "loss": 2.8202, + "step": 92635 + }, + { + "epoch": 6.294333469221361, + "grad_norm": 6.40993070602417, + "learning_rate": 2.135480364179916e-06, + "loss": 2.6955, + "step": 92640 + }, + { + "epoch": 6.294673189292023, + "grad_norm": 6.506413459777832, + "learning_rate": 2.1350557140915886e-06, + "loss": 2.6006, + "step": 92645 + }, + { + "epoch": 6.295012909362685, + "grad_norm": 6.931040287017822, + "learning_rate": 2.1346310640032614e-06, + "loss": 2.7762, + "step": 92650 + }, + { + "epoch": 6.295352629433347, + "grad_norm": 6.663118839263916, + "learning_rate": 2.134206413914934e-06, + "loss": 2.87, + "step": 92655 + }, + { + "epoch": 6.295692349504009, + "grad_norm": 6.814593315124512, + "learning_rate": 2.133781763826607e-06, + "loss": 2.755, + "step": 92660 + }, + { + "epoch": 6.296032069574671, + "grad_norm": 6.890774250030518, + "learning_rate": 2.13335711373828e-06, + "loss": 2.7513, + "step": 92665 + }, + { + "epoch": 6.296371789645332, + "grad_norm": 7.186982154846191, + "learning_rate": 2.1329324636499526e-06, + "loss": 2.4993, + "step": 92670 + }, + { + "epoch": 6.296711509715994, + "grad_norm": 6.513902187347412, + "learning_rate": 2.1325078135616254e-06, + "loss": 2.7345, + "step": 92675 + }, + { + "epoch": 6.297051229786656, + "grad_norm": 7.889866828918457, + "learning_rate": 2.132083163473298e-06, + "loss": 2.7074, + "step": 92680 + }, + { + "epoch": 6.297390949857317, + "grad_norm": 8.564316749572754, + "learning_rate": 2.131658513384971e-06, + "loss": 2.7423, + "step": 92685 + }, + { + "epoch": 6.297730669927979, + "grad_norm": 7.462630748748779, + "learning_rate": 2.131233863296644e-06, + "loss": 2.7177, + "step": 92690 + }, + { + "epoch": 6.298070389998641, + "grad_norm": 9.255680084228516, + "learning_rate": 2.1308092132083166e-06, + "loss": 2.7191, + "step": 92695 + }, + { + "epoch": 6.298410110069303, + "grad_norm": 7.103756904602051, + "learning_rate": 2.1303845631199894e-06, + "loss": 2.6498, + "step": 92700 + }, + { + "epoch": 6.298749830139965, + "grad_norm": 9.647695541381836, + "learning_rate": 2.129959913031662e-06, + "loss": 2.6178, + "step": 92705 + }, + { + "epoch": 6.299089550210627, + "grad_norm": 9.182822227478027, + "learning_rate": 2.129535262943335e-06, + "loss": 2.757, + "step": 92710 + }, + { + "epoch": 6.299429270281288, + "grad_norm": 7.8549957275390625, + "learning_rate": 2.129110612855008e-06, + "loss": 3.1899, + "step": 92715 + }, + { + "epoch": 6.29976899035195, + "grad_norm": 7.793550968170166, + "learning_rate": 2.12868596276668e-06, + "loss": 2.9452, + "step": 92720 + }, + { + "epoch": 6.300108710422612, + "grad_norm": 7.791130065917969, + "learning_rate": 2.1282613126783534e-06, + "loss": 2.8866, + "step": 92725 + }, + { + "epoch": 6.300448430493273, + "grad_norm": 7.038779258728027, + "learning_rate": 2.1278366625900258e-06, + "loss": 2.8328, + "step": 92730 + }, + { + "epoch": 6.300788150563935, + "grad_norm": 7.5099334716796875, + "learning_rate": 2.1274120125016986e-06, + "loss": 2.7943, + "step": 92735 + }, + { + "epoch": 6.3011278706345974, + "grad_norm": 9.103437423706055, + "learning_rate": 2.126987362413372e-06, + "loss": 2.7812, + "step": 92740 + }, + { + "epoch": 6.301467590705259, + "grad_norm": 6.685379505157471, + "learning_rate": 2.126562712325044e-06, + "loss": 2.6487, + "step": 92745 + }, + { + "epoch": 6.301807310775921, + "grad_norm": 6.868640422821045, + "learning_rate": 2.126138062236717e-06, + "loss": 2.5041, + "step": 92750 + }, + { + "epoch": 6.302147030846583, + "grad_norm": 6.6789231300354, + "learning_rate": 2.1257134121483898e-06, + "loss": 2.5341, + "step": 92755 + }, + { + "epoch": 6.302486750917244, + "grad_norm": 7.456124305725098, + "learning_rate": 2.1252887620600626e-06, + "loss": 2.7874, + "step": 92760 + }, + { + "epoch": 6.302826470987906, + "grad_norm": 7.084011554718018, + "learning_rate": 2.1248641119717354e-06, + "loss": 2.7202, + "step": 92765 + }, + { + "epoch": 6.303166191058568, + "grad_norm": 5.923976898193359, + "learning_rate": 2.124439461883408e-06, + "loss": 2.7275, + "step": 92770 + }, + { + "epoch": 6.303505911129229, + "grad_norm": 8.860143661499023, + "learning_rate": 2.124014811795081e-06, + "loss": 2.8678, + "step": 92775 + }, + { + "epoch": 6.303845631199891, + "grad_norm": 8.159180641174316, + "learning_rate": 2.1235901617067538e-06, + "loss": 2.8738, + "step": 92780 + }, + { + "epoch": 6.3041853512705535, + "grad_norm": 8.36866569519043, + "learning_rate": 2.1231655116184266e-06, + "loss": 2.7211, + "step": 92785 + }, + { + "epoch": 6.304525071341215, + "grad_norm": 7.652031421661377, + "learning_rate": 2.1227408615300994e-06, + "loss": 2.7055, + "step": 92790 + }, + { + "epoch": 6.304864791411877, + "grad_norm": 6.705999851226807, + "learning_rate": 2.122316211441772e-06, + "loss": 2.7617, + "step": 92795 + }, + { + "epoch": 6.305204511482539, + "grad_norm": 9.540984153747559, + "learning_rate": 2.121891561353445e-06, + "loss": 2.8905, + "step": 92800 + }, + { + "epoch": 6.3055442315532, + "grad_norm": 6.731831073760986, + "learning_rate": 2.1214669112651178e-06, + "loss": 2.715, + "step": 92805 + }, + { + "epoch": 6.305883951623862, + "grad_norm": 8.784442901611328, + "learning_rate": 2.1210422611767906e-06, + "loss": 2.7094, + "step": 92810 + }, + { + "epoch": 6.306223671694523, + "grad_norm": 11.283059120178223, + "learning_rate": 2.1206176110884634e-06, + "loss": 2.6716, + "step": 92815 + }, + { + "epoch": 6.306563391765185, + "grad_norm": 6.773078441619873, + "learning_rate": 2.1201929610001358e-06, + "loss": 2.6149, + "step": 92820 + }, + { + "epoch": 6.306903111835847, + "grad_norm": 6.316473960876465, + "learning_rate": 2.119768310911809e-06, + "loss": 2.8324, + "step": 92825 + }, + { + "epoch": 6.307242831906509, + "grad_norm": 8.345779418945312, + "learning_rate": 2.119343660823482e-06, + "loss": 2.9469, + "step": 92830 + }, + { + "epoch": 6.307582551977171, + "grad_norm": 9.863905906677246, + "learning_rate": 2.118919010735154e-06, + "loss": 2.9747, + "step": 92835 + }, + { + "epoch": 6.307922272047833, + "grad_norm": 7.2724289894104, + "learning_rate": 2.1184943606468274e-06, + "loss": 2.7967, + "step": 92840 + }, + { + "epoch": 6.308261992118494, + "grad_norm": 10.595268249511719, + "learning_rate": 2.1180697105584998e-06, + "loss": 2.6836, + "step": 92845 + }, + { + "epoch": 6.308601712189156, + "grad_norm": 7.3148932456970215, + "learning_rate": 2.117645060470173e-06, + "loss": 2.7486, + "step": 92850 + }, + { + "epoch": 6.308941432259818, + "grad_norm": 8.244969367980957, + "learning_rate": 2.1172204103818454e-06, + "loss": 2.7888, + "step": 92855 + }, + { + "epoch": 6.309281152330479, + "grad_norm": 7.637800693511963, + "learning_rate": 2.116795760293518e-06, + "loss": 2.7524, + "step": 92860 + }, + { + "epoch": 6.309620872401141, + "grad_norm": 10.29572582244873, + "learning_rate": 2.1163711102051914e-06, + "loss": 3.0156, + "step": 92865 + }, + { + "epoch": 6.309960592471803, + "grad_norm": 9.415451049804688, + "learning_rate": 2.1159464601168638e-06, + "loss": 2.7537, + "step": 92870 + }, + { + "epoch": 6.310300312542465, + "grad_norm": 7.309174060821533, + "learning_rate": 2.1155218100285366e-06, + "loss": 2.81, + "step": 92875 + }, + { + "epoch": 6.310640032613127, + "grad_norm": 9.003006935119629, + "learning_rate": 2.1150971599402094e-06, + "loss": 2.5888, + "step": 92880 + }, + { + "epoch": 6.310979752683789, + "grad_norm": 8.2605562210083, + "learning_rate": 2.114672509851882e-06, + "loss": 2.8401, + "step": 92885 + }, + { + "epoch": 6.31131947275445, + "grad_norm": 7.121325492858887, + "learning_rate": 2.114247859763555e-06, + "loss": 2.6505, + "step": 92890 + }, + { + "epoch": 6.311659192825112, + "grad_norm": 7.7497711181640625, + "learning_rate": 2.1138232096752278e-06, + "loss": 2.7483, + "step": 92895 + }, + { + "epoch": 6.311998912895774, + "grad_norm": 7.101687431335449, + "learning_rate": 2.1133985595869006e-06, + "loss": 2.7955, + "step": 92900 + }, + { + "epoch": 6.312338632966435, + "grad_norm": 6.076467990875244, + "learning_rate": 2.1129739094985734e-06, + "loss": 2.5566, + "step": 92905 + }, + { + "epoch": 6.312678353037097, + "grad_norm": 7.775421619415283, + "learning_rate": 2.112549259410246e-06, + "loss": 2.6957, + "step": 92910 + }, + { + "epoch": 6.313018073107759, + "grad_norm": 8.8804931640625, + "learning_rate": 2.112124609321919e-06, + "loss": 2.5397, + "step": 92915 + }, + { + "epoch": 6.313357793178421, + "grad_norm": 6.898365497589111, + "learning_rate": 2.1116999592335914e-06, + "loss": 2.569, + "step": 92920 + }, + { + "epoch": 6.313697513249083, + "grad_norm": 8.13007926940918, + "learning_rate": 2.1112753091452646e-06, + "loss": 2.6985, + "step": 92925 + }, + { + "epoch": 6.314037233319745, + "grad_norm": 6.425570487976074, + "learning_rate": 2.1108506590569374e-06, + "loss": 2.7953, + "step": 92930 + }, + { + "epoch": 6.314376953390406, + "grad_norm": 8.704297065734863, + "learning_rate": 2.11042600896861e-06, + "loss": 2.6921, + "step": 92935 + }, + { + "epoch": 6.314716673461068, + "grad_norm": 6.703993797302246, + "learning_rate": 2.110001358880283e-06, + "loss": 2.5955, + "step": 92940 + }, + { + "epoch": 6.31505639353173, + "grad_norm": 6.0937724113464355, + "learning_rate": 2.1095767087919554e-06, + "loss": 2.8083, + "step": 92945 + }, + { + "epoch": 6.315396113602391, + "grad_norm": 7.980727672576904, + "learning_rate": 2.1091520587036286e-06, + "loss": 2.5541, + "step": 92950 + }, + { + "epoch": 6.315735833673053, + "grad_norm": 7.689615249633789, + "learning_rate": 2.108727408615301e-06, + "loss": 2.6477, + "step": 92955 + }, + { + "epoch": 6.316075553743715, + "grad_norm": 6.778273105621338, + "learning_rate": 2.1083027585269738e-06, + "loss": 2.8188, + "step": 92960 + }, + { + "epoch": 6.316415273814377, + "grad_norm": 6.947371006011963, + "learning_rate": 2.107878108438647e-06, + "loss": 2.7481, + "step": 92965 + }, + { + "epoch": 6.316754993885039, + "grad_norm": 7.409592151641846, + "learning_rate": 2.1074534583503194e-06, + "loss": 2.637, + "step": 92970 + }, + { + "epoch": 6.317094713955701, + "grad_norm": 6.629470348358154, + "learning_rate": 2.107028808261992e-06, + "loss": 2.8044, + "step": 92975 + }, + { + "epoch": 6.317434434026362, + "grad_norm": 10.232121467590332, + "learning_rate": 2.106604158173665e-06, + "loss": 2.7989, + "step": 92980 + }, + { + "epoch": 6.317774154097024, + "grad_norm": 8.751911163330078, + "learning_rate": 2.1061795080853378e-06, + "loss": 2.6766, + "step": 92985 + }, + { + "epoch": 6.318113874167686, + "grad_norm": 6.0341410636901855, + "learning_rate": 2.1057548579970106e-06, + "loss": 2.9149, + "step": 92990 + }, + { + "epoch": 6.318453594238347, + "grad_norm": 9.4694185256958, + "learning_rate": 2.1053302079086834e-06, + "loss": 2.7978, + "step": 92995 + }, + { + "epoch": 6.318793314309009, + "grad_norm": 9.016297340393066, + "learning_rate": 2.104905557820356e-06, + "loss": 2.5724, + "step": 93000 + }, + { + "epoch": 6.319133034379671, + "grad_norm": 8.362164497375488, + "learning_rate": 2.104480907732029e-06, + "loss": 2.8327, + "step": 93005 + }, + { + "epoch": 6.319472754450333, + "grad_norm": 7.242002010345459, + "learning_rate": 2.1040562576437018e-06, + "loss": 2.7961, + "step": 93010 + }, + { + "epoch": 6.319812474520995, + "grad_norm": 9.120631217956543, + "learning_rate": 2.1036316075553746e-06, + "loss": 2.3167, + "step": 93015 + }, + { + "epoch": 6.320152194591657, + "grad_norm": 8.83115291595459, + "learning_rate": 2.1032069574670474e-06, + "loss": 2.6198, + "step": 93020 + }, + { + "epoch": 6.320491914662318, + "grad_norm": 7.9696173667907715, + "learning_rate": 2.10278230737872e-06, + "loss": 2.6478, + "step": 93025 + }, + { + "epoch": 6.32083163473298, + "grad_norm": 8.118918418884277, + "learning_rate": 2.102357657290393e-06, + "loss": 2.6665, + "step": 93030 + }, + { + "epoch": 6.321171354803642, + "grad_norm": 7.641896724700928, + "learning_rate": 2.1019330072020658e-06, + "loss": 2.7176, + "step": 93035 + }, + { + "epoch": 6.321511074874303, + "grad_norm": 7.003383159637451, + "learning_rate": 2.1015083571137386e-06, + "loss": 2.6612, + "step": 93040 + }, + { + "epoch": 6.321850794944965, + "grad_norm": 8.207450866699219, + "learning_rate": 2.101083707025411e-06, + "loss": 2.6469, + "step": 93045 + }, + { + "epoch": 6.3221905150156275, + "grad_norm": 11.982419967651367, + "learning_rate": 2.100659056937084e-06, + "loss": 2.5424, + "step": 93050 + }, + { + "epoch": 6.322530235086289, + "grad_norm": 9.446880340576172, + "learning_rate": 2.100234406848757e-06, + "loss": 2.5222, + "step": 93055 + }, + { + "epoch": 6.322869955156951, + "grad_norm": 8.927103042602539, + "learning_rate": 2.0998097567604294e-06, + "loss": 2.5583, + "step": 93060 + }, + { + "epoch": 6.323209675227613, + "grad_norm": 8.387188911437988, + "learning_rate": 2.0993851066721026e-06, + "loss": 2.6069, + "step": 93065 + }, + { + "epoch": 6.323549395298274, + "grad_norm": 8.863692283630371, + "learning_rate": 2.098960456583775e-06, + "loss": 2.6953, + "step": 93070 + }, + { + "epoch": 6.323889115368936, + "grad_norm": 9.086649894714355, + "learning_rate": 2.0985358064954478e-06, + "loss": 2.855, + "step": 93075 + }, + { + "epoch": 6.324228835439598, + "grad_norm": 8.6886568069458, + "learning_rate": 2.0981111564071206e-06, + "loss": 2.8356, + "step": 93080 + }, + { + "epoch": 6.324568555510259, + "grad_norm": 6.604861736297607, + "learning_rate": 2.0976865063187934e-06, + "loss": 2.719, + "step": 93085 + }, + { + "epoch": 6.324908275580921, + "grad_norm": 8.577193260192871, + "learning_rate": 2.097261856230466e-06, + "loss": 2.9556, + "step": 93090 + }, + { + "epoch": 6.3252479956515835, + "grad_norm": 8.094219207763672, + "learning_rate": 2.096837206142139e-06, + "loss": 2.647, + "step": 93095 + }, + { + "epoch": 6.325587715722245, + "grad_norm": 8.83037281036377, + "learning_rate": 2.0964125560538118e-06, + "loss": 2.5511, + "step": 93100 + }, + { + "epoch": 6.325927435792907, + "grad_norm": 6.689243793487549, + "learning_rate": 2.0959879059654846e-06, + "loss": 2.8713, + "step": 93105 + }, + { + "epoch": 6.326267155863569, + "grad_norm": 7.965803623199463, + "learning_rate": 2.0955632558771574e-06, + "loss": 2.7232, + "step": 93110 + }, + { + "epoch": 6.32660687593423, + "grad_norm": 7.48342227935791, + "learning_rate": 2.09513860578883e-06, + "loss": 2.8024, + "step": 93115 + }, + { + "epoch": 6.326946596004892, + "grad_norm": 7.055203914642334, + "learning_rate": 2.094713955700503e-06, + "loss": 2.8923, + "step": 93120 + }, + { + "epoch": 6.327286316075554, + "grad_norm": 6.398489475250244, + "learning_rate": 2.0942893056121758e-06, + "loss": 2.7425, + "step": 93125 + }, + { + "epoch": 6.327626036146215, + "grad_norm": 7.819915771484375, + "learning_rate": 2.0938646555238486e-06, + "loss": 2.8307, + "step": 93130 + }, + { + "epoch": 6.327965756216877, + "grad_norm": 10.277963638305664, + "learning_rate": 2.0934400054355214e-06, + "loss": 2.734, + "step": 93135 + }, + { + "epoch": 6.3283054762875395, + "grad_norm": 9.933089256286621, + "learning_rate": 2.093015355347194e-06, + "loss": 2.811, + "step": 93140 + }, + { + "epoch": 6.328645196358201, + "grad_norm": 8.81994915008545, + "learning_rate": 2.092590705258867e-06, + "loss": 2.4543, + "step": 93145 + }, + { + "epoch": 6.328984916428863, + "grad_norm": 6.850278854370117, + "learning_rate": 2.0921660551705398e-06, + "loss": 2.7459, + "step": 93150 + }, + { + "epoch": 6.329324636499525, + "grad_norm": 6.4277424812316895, + "learning_rate": 2.0917414050822126e-06, + "loss": 2.7682, + "step": 93155 + }, + { + "epoch": 6.329664356570186, + "grad_norm": 8.589630126953125, + "learning_rate": 2.091316754993885e-06, + "loss": 2.9363, + "step": 93160 + }, + { + "epoch": 6.330004076640848, + "grad_norm": 8.236515998840332, + "learning_rate": 2.090892104905558e-06, + "loss": 2.6889, + "step": 93165 + }, + { + "epoch": 6.33034379671151, + "grad_norm": 7.459204196929932, + "learning_rate": 2.0904674548172305e-06, + "loss": 2.6295, + "step": 93170 + }, + { + "epoch": 6.330683516782171, + "grad_norm": 8.488280296325684, + "learning_rate": 2.0900428047289034e-06, + "loss": 2.571, + "step": 93175 + }, + { + "epoch": 6.331023236852833, + "grad_norm": 8.01620864868164, + "learning_rate": 2.0896181546405766e-06, + "loss": 2.7126, + "step": 93180 + }, + { + "epoch": 6.3313629569234955, + "grad_norm": 7.63667631149292, + "learning_rate": 2.089193504552249e-06, + "loss": 2.7214, + "step": 93185 + }, + { + "epoch": 6.331702676994157, + "grad_norm": 7.3027024269104, + "learning_rate": 2.088768854463922e-06, + "loss": 2.8064, + "step": 93190 + }, + { + "epoch": 6.332042397064819, + "grad_norm": 6.524458885192871, + "learning_rate": 2.0883442043755946e-06, + "loss": 2.8848, + "step": 93195 + }, + { + "epoch": 6.332382117135481, + "grad_norm": 8.750920295715332, + "learning_rate": 2.0879195542872674e-06, + "loss": 2.857, + "step": 93200 + }, + { + "epoch": 6.332721837206142, + "grad_norm": 6.2856316566467285, + "learning_rate": 2.08749490419894e-06, + "loss": 2.4191, + "step": 93205 + }, + { + "epoch": 6.333061557276804, + "grad_norm": 7.114850044250488, + "learning_rate": 2.087070254110613e-06, + "loss": 2.7139, + "step": 93210 + }, + { + "epoch": 6.333401277347465, + "grad_norm": 7.314382076263428, + "learning_rate": 2.0866456040222858e-06, + "loss": 2.8424, + "step": 93215 + }, + { + "epoch": 6.333740997418127, + "grad_norm": 8.505025863647461, + "learning_rate": 2.0862209539339586e-06, + "loss": 2.8398, + "step": 93220 + }, + { + "epoch": 6.334080717488789, + "grad_norm": 8.29892635345459, + "learning_rate": 2.0857963038456314e-06, + "loss": 2.6172, + "step": 93225 + }, + { + "epoch": 6.334420437559451, + "grad_norm": 7.107554912567139, + "learning_rate": 2.085371653757304e-06, + "loss": 2.7369, + "step": 93230 + }, + { + "epoch": 6.334760157630113, + "grad_norm": 8.336970329284668, + "learning_rate": 2.084947003668977e-06, + "loss": 3.1198, + "step": 93235 + }, + { + "epoch": 6.335099877700775, + "grad_norm": 8.95376968383789, + "learning_rate": 2.0845223535806498e-06, + "loss": 2.7833, + "step": 93240 + }, + { + "epoch": 6.335439597771436, + "grad_norm": 7.153709411621094, + "learning_rate": 2.0840977034923226e-06, + "loss": 2.7042, + "step": 93245 + }, + { + "epoch": 6.335779317842098, + "grad_norm": 6.633956432342529, + "learning_rate": 2.0836730534039954e-06, + "loss": 2.7302, + "step": 93250 + }, + { + "epoch": 6.33611903791276, + "grad_norm": 10.28852653503418, + "learning_rate": 2.083248403315668e-06, + "loss": 2.7504, + "step": 93255 + }, + { + "epoch": 6.336458757983421, + "grad_norm": 5.632843017578125, + "learning_rate": 2.0828237532273405e-06, + "loss": 2.7574, + "step": 93260 + }, + { + "epoch": 6.336798478054083, + "grad_norm": 8.058631896972656, + "learning_rate": 2.0823991031390138e-06, + "loss": 2.7758, + "step": 93265 + }, + { + "epoch": 6.337138198124745, + "grad_norm": 8.848660469055176, + "learning_rate": 2.081974453050686e-06, + "loss": 2.6124, + "step": 93270 + }, + { + "epoch": 6.337477918195407, + "grad_norm": 8.233636856079102, + "learning_rate": 2.0815498029623594e-06, + "loss": 2.9774, + "step": 93275 + }, + { + "epoch": 6.337817638266069, + "grad_norm": 10.753950119018555, + "learning_rate": 2.081125152874032e-06, + "loss": 2.8049, + "step": 93280 + }, + { + "epoch": 6.338157358336731, + "grad_norm": 8.013737678527832, + "learning_rate": 2.0807005027857045e-06, + "loss": 2.5698, + "step": 93285 + }, + { + "epoch": 6.338497078407392, + "grad_norm": 7.674637317657471, + "learning_rate": 2.0802758526973778e-06, + "loss": 2.7933, + "step": 93290 + }, + { + "epoch": 6.338836798478054, + "grad_norm": 6.952195644378662, + "learning_rate": 2.07985120260905e-06, + "loss": 2.8516, + "step": 93295 + }, + { + "epoch": 6.339176518548716, + "grad_norm": 6.988265514373779, + "learning_rate": 2.079426552520723e-06, + "loss": 2.5442, + "step": 93300 + }, + { + "epoch": 6.339516238619377, + "grad_norm": 8.627424240112305, + "learning_rate": 2.079001902432396e-06, + "loss": 2.8613, + "step": 93305 + }, + { + "epoch": 6.339855958690039, + "grad_norm": 7.821730136871338, + "learning_rate": 2.0785772523440685e-06, + "loss": 2.8664, + "step": 93310 + }, + { + "epoch": 6.3401956787607014, + "grad_norm": 7.917416095733643, + "learning_rate": 2.0781526022557413e-06, + "loss": 2.5102, + "step": 93315 + }, + { + "epoch": 6.340535398831363, + "grad_norm": 7.690560340881348, + "learning_rate": 2.077727952167414e-06, + "loss": 2.8254, + "step": 93320 + }, + { + "epoch": 6.340875118902025, + "grad_norm": 6.934567928314209, + "learning_rate": 2.077303302079087e-06, + "loss": 2.7, + "step": 93325 + }, + { + "epoch": 6.341214838972687, + "grad_norm": 8.085579872131348, + "learning_rate": 2.0768786519907598e-06, + "loss": 2.7542, + "step": 93330 + }, + { + "epoch": 6.341554559043348, + "grad_norm": 6.110536098480225, + "learning_rate": 2.0764540019024326e-06, + "loss": 2.5825, + "step": 93335 + }, + { + "epoch": 6.34189427911401, + "grad_norm": 9.169790267944336, + "learning_rate": 2.0760293518141054e-06, + "loss": 2.628, + "step": 93340 + }, + { + "epoch": 6.342233999184672, + "grad_norm": 6.2502899169921875, + "learning_rate": 2.075604701725778e-06, + "loss": 2.8656, + "step": 93345 + }, + { + "epoch": 6.342573719255333, + "grad_norm": 7.499764442443848, + "learning_rate": 2.075180051637451e-06, + "loss": 2.646, + "step": 93350 + }, + { + "epoch": 6.342913439325995, + "grad_norm": 7.864729404449463, + "learning_rate": 2.0747554015491238e-06, + "loss": 2.727, + "step": 93355 + }, + { + "epoch": 6.3432531593966575, + "grad_norm": 6.81976318359375, + "learning_rate": 2.0743307514607966e-06, + "loss": 2.6351, + "step": 93360 + }, + { + "epoch": 6.343592879467319, + "grad_norm": 9.032159805297852, + "learning_rate": 2.0739061013724694e-06, + "loss": 2.7787, + "step": 93365 + }, + { + "epoch": 6.343932599537981, + "grad_norm": 8.865262985229492, + "learning_rate": 2.073481451284142e-06, + "loss": 2.8452, + "step": 93370 + }, + { + "epoch": 6.344272319608643, + "grad_norm": 8.185694694519043, + "learning_rate": 2.073056801195815e-06, + "loss": 2.8582, + "step": 93375 + }, + { + "epoch": 6.344612039679304, + "grad_norm": 6.8963212966918945, + "learning_rate": 2.0726321511074878e-06, + "loss": 2.8307, + "step": 93380 + }, + { + "epoch": 6.344951759749966, + "grad_norm": 9.806923866271973, + "learning_rate": 2.07220750101916e-06, + "loss": 2.937, + "step": 93385 + }, + { + "epoch": 6.345291479820628, + "grad_norm": 10.0729398727417, + "learning_rate": 2.0717828509308334e-06, + "loss": 2.7069, + "step": 93390 + }, + { + "epoch": 6.345631199891289, + "grad_norm": 6.703135967254639, + "learning_rate": 2.0713582008425057e-06, + "loss": 2.8301, + "step": 93395 + }, + { + "epoch": 6.345970919961951, + "grad_norm": 8.2046480178833, + "learning_rate": 2.0709335507541785e-06, + "loss": 2.5844, + "step": 93400 + }, + { + "epoch": 6.3463106400326135, + "grad_norm": 8.99268627166748, + "learning_rate": 2.0705089006658518e-06, + "loss": 2.7115, + "step": 93405 + }, + { + "epoch": 6.346650360103275, + "grad_norm": 8.240884780883789, + "learning_rate": 2.070084250577524e-06, + "loss": 2.7213, + "step": 93410 + }, + { + "epoch": 6.346990080173937, + "grad_norm": 6.401706695556641, + "learning_rate": 2.069659600489197e-06, + "loss": 2.604, + "step": 93415 + }, + { + "epoch": 6.347329800244599, + "grad_norm": 10.089455604553223, + "learning_rate": 2.0692349504008697e-06, + "loss": 2.787, + "step": 93420 + }, + { + "epoch": 6.34766952031526, + "grad_norm": 7.787832260131836, + "learning_rate": 2.0688103003125425e-06, + "loss": 2.8394, + "step": 93425 + }, + { + "epoch": 6.348009240385922, + "grad_norm": 9.62082576751709, + "learning_rate": 2.0683856502242153e-06, + "loss": 2.7819, + "step": 93430 + }, + { + "epoch": 6.348348960456584, + "grad_norm": 7.3198347091674805, + "learning_rate": 2.067961000135888e-06, + "loss": 2.6113, + "step": 93435 + }, + { + "epoch": 6.348688680527245, + "grad_norm": 7.851779460906982, + "learning_rate": 2.067536350047561e-06, + "loss": 2.7428, + "step": 93440 + }, + { + "epoch": 6.349028400597907, + "grad_norm": 8.06151294708252, + "learning_rate": 2.0671116999592337e-06, + "loss": 2.7686, + "step": 93445 + }, + { + "epoch": 6.3493681206685695, + "grad_norm": 7.539677143096924, + "learning_rate": 2.0666870498709065e-06, + "loss": 2.7291, + "step": 93450 + }, + { + "epoch": 6.349707840739231, + "grad_norm": 6.7629594802856445, + "learning_rate": 2.0662623997825793e-06, + "loss": 2.9135, + "step": 93455 + }, + { + "epoch": 6.350047560809893, + "grad_norm": 8.865966796875, + "learning_rate": 2.065837749694252e-06, + "loss": 2.6201, + "step": 93460 + }, + { + "epoch": 6.350387280880555, + "grad_norm": 8.891963958740234, + "learning_rate": 2.065413099605925e-06, + "loss": 2.6516, + "step": 93465 + }, + { + "epoch": 6.350727000951216, + "grad_norm": 8.302760124206543, + "learning_rate": 2.0649884495175977e-06, + "loss": 2.7992, + "step": 93470 + }, + { + "epoch": 6.351066721021878, + "grad_norm": 7.989439487457275, + "learning_rate": 2.0645637994292705e-06, + "loss": 3.0379, + "step": 93475 + }, + { + "epoch": 6.35140644109254, + "grad_norm": 9.659808158874512, + "learning_rate": 2.0641391493409434e-06, + "loss": 2.7567, + "step": 93480 + }, + { + "epoch": 6.351746161163201, + "grad_norm": 7.327106952667236, + "learning_rate": 2.0637144992526157e-06, + "loss": 2.6701, + "step": 93485 + }, + { + "epoch": 6.352085881233863, + "grad_norm": 7.347385883331299, + "learning_rate": 2.063289849164289e-06, + "loss": 2.5189, + "step": 93490 + }, + { + "epoch": 6.352425601304525, + "grad_norm": 7.185581207275391, + "learning_rate": 2.0628651990759618e-06, + "loss": 2.6483, + "step": 93495 + }, + { + "epoch": 6.352765321375187, + "grad_norm": 6.982369899749756, + "learning_rate": 2.062440548987634e-06, + "loss": 2.7715, + "step": 93500 + }, + { + "epoch": 6.353105041445849, + "grad_norm": 7.173745632171631, + "learning_rate": 2.0620158988993074e-06, + "loss": 2.6537, + "step": 93505 + }, + { + "epoch": 6.35344476151651, + "grad_norm": 8.329512596130371, + "learning_rate": 2.0615912488109797e-06, + "loss": 2.759, + "step": 93510 + }, + { + "epoch": 6.353784481587172, + "grad_norm": 7.097439289093018, + "learning_rate": 2.0611665987226525e-06, + "loss": 2.6207, + "step": 93515 + }, + { + "epoch": 6.354124201657834, + "grad_norm": 8.552267074584961, + "learning_rate": 2.0607419486343253e-06, + "loss": 2.5162, + "step": 93520 + }, + { + "epoch": 6.354463921728495, + "grad_norm": 6.077950954437256, + "learning_rate": 2.060317298545998e-06, + "loss": 2.7441, + "step": 93525 + }, + { + "epoch": 6.354803641799157, + "grad_norm": 7.455424785614014, + "learning_rate": 2.0598926484576714e-06, + "loss": 2.7829, + "step": 93530 + }, + { + "epoch": 6.355143361869819, + "grad_norm": 8.30145263671875, + "learning_rate": 2.0594679983693437e-06, + "loss": 2.84, + "step": 93535 + }, + { + "epoch": 6.355483081940481, + "grad_norm": 8.194536209106445, + "learning_rate": 2.0590433482810165e-06, + "loss": 2.7364, + "step": 93540 + }, + { + "epoch": 6.355822802011143, + "grad_norm": 13.212019920349121, + "learning_rate": 2.0586186981926893e-06, + "loss": 2.9991, + "step": 93545 + }, + { + "epoch": 6.356162522081805, + "grad_norm": 8.283411026000977, + "learning_rate": 2.058194048104362e-06, + "loss": 2.714, + "step": 93550 + }, + { + "epoch": 6.356502242152466, + "grad_norm": 7.789950847625732, + "learning_rate": 2.057769398016035e-06, + "loss": 2.8782, + "step": 93555 + }, + { + "epoch": 6.356841962223128, + "grad_norm": 9.095558166503906, + "learning_rate": 2.0573447479277077e-06, + "loss": 2.4164, + "step": 93560 + }, + { + "epoch": 6.35718168229379, + "grad_norm": 8.752788543701172, + "learning_rate": 2.0569200978393805e-06, + "loss": 2.5996, + "step": 93565 + }, + { + "epoch": 6.357521402364451, + "grad_norm": 6.466411590576172, + "learning_rate": 2.0564954477510533e-06, + "loss": 2.8397, + "step": 93570 + }, + { + "epoch": 6.357861122435113, + "grad_norm": 8.577057838439941, + "learning_rate": 2.056070797662726e-06, + "loss": 2.8492, + "step": 93575 + }, + { + "epoch": 6.358200842505775, + "grad_norm": 8.104619979858398, + "learning_rate": 2.055646147574399e-06, + "loss": 2.7325, + "step": 93580 + }, + { + "epoch": 6.358540562576437, + "grad_norm": 6.954634189605713, + "learning_rate": 2.0552214974860713e-06, + "loss": 2.6644, + "step": 93585 + }, + { + "epoch": 6.358880282647099, + "grad_norm": 10.210637092590332, + "learning_rate": 2.0547968473977445e-06, + "loss": 2.8469, + "step": 93590 + }, + { + "epoch": 6.359220002717761, + "grad_norm": 7.965843677520752, + "learning_rate": 2.0543721973094173e-06, + "loss": 2.8218, + "step": 93595 + }, + { + "epoch": 6.359559722788422, + "grad_norm": 6.863248348236084, + "learning_rate": 2.0539475472210897e-06, + "loss": 3.0446, + "step": 93600 + }, + { + "epoch": 6.359899442859084, + "grad_norm": 8.635540008544922, + "learning_rate": 2.053522897132763e-06, + "loss": 2.5709, + "step": 93605 + }, + { + "epoch": 6.360239162929746, + "grad_norm": 6.07316780090332, + "learning_rate": 2.0530982470444353e-06, + "loss": 2.8191, + "step": 93610 + }, + { + "epoch": 6.360578883000407, + "grad_norm": 6.926756858825684, + "learning_rate": 2.0526735969561085e-06, + "loss": 2.7255, + "step": 93615 + }, + { + "epoch": 6.360918603071069, + "grad_norm": 7.046008110046387, + "learning_rate": 2.0522489468677813e-06, + "loss": 2.7198, + "step": 93620 + }, + { + "epoch": 6.3612583231417315, + "grad_norm": 7.020352840423584, + "learning_rate": 2.0518242967794537e-06, + "loss": 2.7706, + "step": 93625 + }, + { + "epoch": 6.361598043212393, + "grad_norm": 6.425993919372559, + "learning_rate": 2.051399646691127e-06, + "loss": 2.547, + "step": 93630 + }, + { + "epoch": 6.361937763283055, + "grad_norm": 6.15590238571167, + "learning_rate": 2.0509749966027993e-06, + "loss": 2.9212, + "step": 93635 + }, + { + "epoch": 6.362277483353717, + "grad_norm": 7.935145378112793, + "learning_rate": 2.050550346514472e-06, + "loss": 2.7221, + "step": 93640 + }, + { + "epoch": 6.362617203424378, + "grad_norm": 8.948138236999512, + "learning_rate": 2.050125696426145e-06, + "loss": 2.8638, + "step": 93645 + }, + { + "epoch": 6.36295692349504, + "grad_norm": 7.123894214630127, + "learning_rate": 2.0497010463378177e-06, + "loss": 2.6324, + "step": 93650 + }, + { + "epoch": 6.363296643565702, + "grad_norm": 7.4090895652771, + "learning_rate": 2.0492763962494905e-06, + "loss": 2.6903, + "step": 93655 + }, + { + "epoch": 6.363636363636363, + "grad_norm": 6.865089416503906, + "learning_rate": 2.0488517461611633e-06, + "loss": 2.5041, + "step": 93660 + }, + { + "epoch": 6.363976083707025, + "grad_norm": 7.722663402557373, + "learning_rate": 2.048427096072836e-06, + "loss": 2.6951, + "step": 93665 + }, + { + "epoch": 6.3643158037776875, + "grad_norm": 9.497812271118164, + "learning_rate": 2.048002445984509e-06, + "loss": 2.8238, + "step": 93670 + }, + { + "epoch": 6.364655523848349, + "grad_norm": 6.102041244506836, + "learning_rate": 2.0475777958961817e-06, + "loss": 2.7022, + "step": 93675 + }, + { + "epoch": 6.364995243919011, + "grad_norm": 8.95920181274414, + "learning_rate": 2.0471531458078545e-06, + "loss": 2.5956, + "step": 93680 + }, + { + "epoch": 6.365334963989673, + "grad_norm": 9.127541542053223, + "learning_rate": 2.0467284957195273e-06, + "loss": 2.9626, + "step": 93685 + }, + { + "epoch": 6.365674684060334, + "grad_norm": 7.768154144287109, + "learning_rate": 2.0463038456312e-06, + "loss": 3.0216, + "step": 93690 + }, + { + "epoch": 6.366014404130996, + "grad_norm": 7.090205669403076, + "learning_rate": 2.045879195542873e-06, + "loss": 2.6172, + "step": 93695 + }, + { + "epoch": 6.366354124201658, + "grad_norm": 5.838825225830078, + "learning_rate": 2.0454545454545457e-06, + "loss": 2.6421, + "step": 93700 + }, + { + "epoch": 6.366693844272319, + "grad_norm": 6.740925312042236, + "learning_rate": 2.0450298953662185e-06, + "loss": 2.3782, + "step": 93705 + }, + { + "epoch": 6.367033564342981, + "grad_norm": 8.955354690551758, + "learning_rate": 2.044605245277891e-06, + "loss": 2.7805, + "step": 93710 + }, + { + "epoch": 6.3673732844136435, + "grad_norm": 7.278988361358643, + "learning_rate": 2.044180595189564e-06, + "loss": 2.7063, + "step": 93715 + }, + { + "epoch": 6.367713004484305, + "grad_norm": 7.610101699829102, + "learning_rate": 2.043755945101237e-06, + "loss": 2.585, + "step": 93720 + }, + { + "epoch": 6.368052724554967, + "grad_norm": 6.945420742034912, + "learning_rate": 2.0433312950129093e-06, + "loss": 2.7919, + "step": 93725 + }, + { + "epoch": 6.368392444625629, + "grad_norm": 7.6648383140563965, + "learning_rate": 2.0429066449245825e-06, + "loss": 2.793, + "step": 93730 + }, + { + "epoch": 6.36873216469629, + "grad_norm": 7.167535305023193, + "learning_rate": 2.042481994836255e-06, + "loss": 2.6556, + "step": 93735 + }, + { + "epoch": 6.369071884766952, + "grad_norm": 7.082310676574707, + "learning_rate": 2.0420573447479277e-06, + "loss": 2.6077, + "step": 93740 + }, + { + "epoch": 6.369411604837614, + "grad_norm": 6.717898368835449, + "learning_rate": 2.041632694659601e-06, + "loss": 2.971, + "step": 93745 + }, + { + "epoch": 6.369751324908275, + "grad_norm": 8.348587989807129, + "learning_rate": 2.0412080445712733e-06, + "loss": 2.7059, + "step": 93750 + }, + { + "epoch": 6.370091044978937, + "grad_norm": 8.215481758117676, + "learning_rate": 2.040783394482946e-06, + "loss": 2.7423, + "step": 93755 + }, + { + "epoch": 6.3704307650495995, + "grad_norm": 6.590269088745117, + "learning_rate": 2.040358744394619e-06, + "loss": 2.8721, + "step": 93760 + }, + { + "epoch": 6.370770485120261, + "grad_norm": 7.984429359436035, + "learning_rate": 2.0399340943062917e-06, + "loss": 2.7975, + "step": 93765 + }, + { + "epoch": 6.371110205190923, + "grad_norm": 6.722492694854736, + "learning_rate": 2.0395094442179645e-06, + "loss": 2.7806, + "step": 93770 + }, + { + "epoch": 6.371449925261585, + "grad_norm": 6.885574817657471, + "learning_rate": 2.0390847941296373e-06, + "loss": 2.7983, + "step": 93775 + }, + { + "epoch": 6.371789645332246, + "grad_norm": 10.751540184020996, + "learning_rate": 2.03866014404131e-06, + "loss": 2.8691, + "step": 93780 + }, + { + "epoch": 6.372129365402908, + "grad_norm": 7.927139759063721, + "learning_rate": 2.038235493952983e-06, + "loss": 2.6755, + "step": 93785 + }, + { + "epoch": 6.37246908547357, + "grad_norm": 6.4198994636535645, + "learning_rate": 2.0378108438646557e-06, + "loss": 2.594, + "step": 93790 + }, + { + "epoch": 6.372808805544231, + "grad_norm": 6.388673305511475, + "learning_rate": 2.0373861937763285e-06, + "loss": 2.7806, + "step": 93795 + }, + { + "epoch": 6.373148525614893, + "grad_norm": 7.350955486297607, + "learning_rate": 2.0369615436880013e-06, + "loss": 2.6356, + "step": 93800 + }, + { + "epoch": 6.3734882456855555, + "grad_norm": 8.738067626953125, + "learning_rate": 2.036536893599674e-06, + "loss": 2.9021, + "step": 93805 + }, + { + "epoch": 6.373827965756217, + "grad_norm": 8.305689811706543, + "learning_rate": 2.036112243511347e-06, + "loss": 2.7822, + "step": 93810 + }, + { + "epoch": 6.374167685826879, + "grad_norm": 8.067432403564453, + "learning_rate": 2.0356875934230197e-06, + "loss": 2.7256, + "step": 93815 + }, + { + "epoch": 6.374507405897541, + "grad_norm": 7.092358112335205, + "learning_rate": 2.0352629433346925e-06, + "loss": 2.7839, + "step": 93820 + }, + { + "epoch": 6.374847125968202, + "grad_norm": 8.171871185302734, + "learning_rate": 2.034838293246365e-06, + "loss": 2.6676, + "step": 93825 + }, + { + "epoch": 6.375186846038864, + "grad_norm": 9.377861022949219, + "learning_rate": 2.034413643158038e-06, + "loss": 2.8835, + "step": 93830 + }, + { + "epoch": 6.375526566109526, + "grad_norm": 8.93750286102295, + "learning_rate": 2.0339889930697105e-06, + "loss": 2.5967, + "step": 93835 + }, + { + "epoch": 6.375866286180187, + "grad_norm": 7.2486443519592285, + "learning_rate": 2.0335643429813833e-06, + "loss": 2.8079, + "step": 93840 + }, + { + "epoch": 6.376206006250849, + "grad_norm": 9.433049201965332, + "learning_rate": 2.0331396928930565e-06, + "loss": 2.7549, + "step": 93845 + }, + { + "epoch": 6.3765457263215115, + "grad_norm": 7.032293796539307, + "learning_rate": 2.032715042804729e-06, + "loss": 2.7048, + "step": 93850 + }, + { + "epoch": 6.376885446392173, + "grad_norm": 7.983603477478027, + "learning_rate": 2.0322903927164017e-06, + "loss": 2.7627, + "step": 93855 + }, + { + "epoch": 6.377225166462835, + "grad_norm": 5.8654985427856445, + "learning_rate": 2.0318657426280745e-06, + "loss": 2.9093, + "step": 93860 + }, + { + "epoch": 6.377564886533497, + "grad_norm": 7.964938640594482, + "learning_rate": 2.0314410925397473e-06, + "loss": 2.6793, + "step": 93865 + }, + { + "epoch": 6.377904606604158, + "grad_norm": 7.074906349182129, + "learning_rate": 2.03101644245142e-06, + "loss": 2.706, + "step": 93870 + }, + { + "epoch": 6.37824432667482, + "grad_norm": 7.361352920532227, + "learning_rate": 2.030591792363093e-06, + "loss": 2.6056, + "step": 93875 + }, + { + "epoch": 6.378584046745482, + "grad_norm": 9.290477752685547, + "learning_rate": 2.0301671422747657e-06, + "loss": 2.6958, + "step": 93880 + }, + { + "epoch": 6.378923766816143, + "grad_norm": 9.020936012268066, + "learning_rate": 2.0297424921864385e-06, + "loss": 2.5338, + "step": 93885 + }, + { + "epoch": 6.3792634868868054, + "grad_norm": 8.573186874389648, + "learning_rate": 2.0293178420981113e-06, + "loss": 2.6359, + "step": 93890 + }, + { + "epoch": 6.379603206957467, + "grad_norm": 5.897372722625732, + "learning_rate": 2.028893192009784e-06, + "loss": 2.3985, + "step": 93895 + }, + { + "epoch": 6.379942927028129, + "grad_norm": 8.812445640563965, + "learning_rate": 2.028468541921457e-06, + "loss": 2.6582, + "step": 93900 + }, + { + "epoch": 6.380282647098791, + "grad_norm": 7.483961582183838, + "learning_rate": 2.0280438918331297e-06, + "loss": 2.6088, + "step": 93905 + }, + { + "epoch": 6.380622367169452, + "grad_norm": 7.659496784210205, + "learning_rate": 2.0276192417448025e-06, + "loss": 2.5492, + "step": 93910 + }, + { + "epoch": 6.380962087240114, + "grad_norm": 7.781505584716797, + "learning_rate": 2.0271945916564753e-06, + "loss": 2.9758, + "step": 93915 + }, + { + "epoch": 6.381301807310776, + "grad_norm": 7.217710971832275, + "learning_rate": 2.026769941568148e-06, + "loss": 2.2122, + "step": 93920 + }, + { + "epoch": 6.381641527381437, + "grad_norm": 7.136734962463379, + "learning_rate": 2.0263452914798205e-06, + "loss": 2.7455, + "step": 93925 + }, + { + "epoch": 6.381981247452099, + "grad_norm": 6.620400905609131, + "learning_rate": 2.0259206413914937e-06, + "loss": 2.8909, + "step": 93930 + }, + { + "epoch": 6.3823209675227615, + "grad_norm": 7.49239444732666, + "learning_rate": 2.0254959913031665e-06, + "loss": 2.4902, + "step": 93935 + }, + { + "epoch": 6.382660687593423, + "grad_norm": 7.054052352905273, + "learning_rate": 2.025071341214839e-06, + "loss": 2.8861, + "step": 93940 + }, + { + "epoch": 6.383000407664085, + "grad_norm": 7.642750263214111, + "learning_rate": 2.024646691126512e-06, + "loss": 2.7575, + "step": 93945 + }, + { + "epoch": 6.383340127734747, + "grad_norm": 7.431938648223877, + "learning_rate": 2.0242220410381845e-06, + "loss": 2.5848, + "step": 93950 + }, + { + "epoch": 6.383679847805408, + "grad_norm": 7.9500226974487305, + "learning_rate": 2.0237973909498577e-06, + "loss": 2.7374, + "step": 93955 + }, + { + "epoch": 6.38401956787607, + "grad_norm": 7.950977802276611, + "learning_rate": 2.02337274086153e-06, + "loss": 2.7241, + "step": 93960 + }, + { + "epoch": 6.384359287946732, + "grad_norm": 7.910435676574707, + "learning_rate": 2.022948090773203e-06, + "loss": 2.8775, + "step": 93965 + }, + { + "epoch": 6.384699008017393, + "grad_norm": 8.850142478942871, + "learning_rate": 2.022523440684876e-06, + "loss": 2.8934, + "step": 93970 + }, + { + "epoch": 6.385038728088055, + "grad_norm": 6.563311576843262, + "learning_rate": 2.0220987905965485e-06, + "loss": 2.6426, + "step": 93975 + }, + { + "epoch": 6.3853784481587175, + "grad_norm": 6.958871364593506, + "learning_rate": 2.0216741405082213e-06, + "loss": 2.7677, + "step": 93980 + }, + { + "epoch": 6.385718168229379, + "grad_norm": 8.063705444335938, + "learning_rate": 2.021249490419894e-06, + "loss": 2.7261, + "step": 93985 + }, + { + "epoch": 6.386057888300041, + "grad_norm": 5.907277584075928, + "learning_rate": 2.020824840331567e-06, + "loss": 2.6288, + "step": 93990 + }, + { + "epoch": 6.386397608370703, + "grad_norm": 8.750221252441406, + "learning_rate": 2.0204001902432397e-06, + "loss": 2.6424, + "step": 93995 + }, + { + "epoch": 6.386737328441364, + "grad_norm": 7.439680576324463, + "learning_rate": 2.0199755401549125e-06, + "loss": 2.6048, + "step": 94000 + }, + { + "epoch": 6.387077048512026, + "grad_norm": 9.974352836608887, + "learning_rate": 2.0195508900665853e-06, + "loss": 2.7126, + "step": 94005 + }, + { + "epoch": 6.387416768582688, + "grad_norm": 9.594038963317871, + "learning_rate": 2.019126239978258e-06, + "loss": 2.9704, + "step": 94010 + }, + { + "epoch": 6.387756488653349, + "grad_norm": 7.328137397766113, + "learning_rate": 2.018701589889931e-06, + "loss": 2.807, + "step": 94015 + }, + { + "epoch": 6.388096208724011, + "grad_norm": 6.952571868896484, + "learning_rate": 2.0182769398016037e-06, + "loss": 2.8068, + "step": 94020 + }, + { + "epoch": 6.3884359287946735, + "grad_norm": 6.794363498687744, + "learning_rate": 2.017852289713276e-06, + "loss": 2.8488, + "step": 94025 + }, + { + "epoch": 6.388775648865335, + "grad_norm": 8.366904258728027, + "learning_rate": 2.0174276396249493e-06, + "loss": 2.8067, + "step": 94030 + }, + { + "epoch": 6.389115368935997, + "grad_norm": 7.73095703125, + "learning_rate": 2.017002989536622e-06, + "loss": 2.7774, + "step": 94035 + }, + { + "epoch": 6.389455089006659, + "grad_norm": 8.791485786437988, + "learning_rate": 2.016578339448295e-06, + "loss": 2.9108, + "step": 94040 + }, + { + "epoch": 6.38979480907732, + "grad_norm": 7.2598371505737305, + "learning_rate": 2.0161536893599677e-06, + "loss": 2.8228, + "step": 94045 + }, + { + "epoch": 6.390134529147982, + "grad_norm": 7.102850914001465, + "learning_rate": 2.01572903927164e-06, + "loss": 2.9087, + "step": 94050 + }, + { + "epoch": 6.390474249218644, + "grad_norm": 6.521540641784668, + "learning_rate": 2.0153043891833133e-06, + "loss": 2.3542, + "step": 94055 + }, + { + "epoch": 6.390813969289305, + "grad_norm": 8.286924362182617, + "learning_rate": 2.0148797390949857e-06, + "loss": 2.6962, + "step": 94060 + }, + { + "epoch": 6.391153689359967, + "grad_norm": 6.639287948608398, + "learning_rate": 2.0144550890066585e-06, + "loss": 2.6309, + "step": 94065 + }, + { + "epoch": 6.3914934094306295, + "grad_norm": 8.53637981414795, + "learning_rate": 2.0140304389183317e-06, + "loss": 2.9415, + "step": 94070 + }, + { + "epoch": 6.391833129501291, + "grad_norm": 8.098712921142578, + "learning_rate": 2.013605788830004e-06, + "loss": 2.9156, + "step": 94075 + }, + { + "epoch": 6.392172849571953, + "grad_norm": 6.54344367980957, + "learning_rate": 2.013181138741677e-06, + "loss": 2.7891, + "step": 94080 + }, + { + "epoch": 6.392512569642615, + "grad_norm": 7.755815505981445, + "learning_rate": 2.0127564886533497e-06, + "loss": 2.7506, + "step": 94085 + }, + { + "epoch": 6.392852289713276, + "grad_norm": 7.271775722503662, + "learning_rate": 2.0123318385650225e-06, + "loss": 2.8893, + "step": 94090 + }, + { + "epoch": 6.393192009783938, + "grad_norm": 6.810474395751953, + "learning_rate": 2.0119071884766953e-06, + "loss": 2.7063, + "step": 94095 + }, + { + "epoch": 6.3935317298546, + "grad_norm": 6.103761672973633, + "learning_rate": 2.011482538388368e-06, + "loss": 2.765, + "step": 94100 + }, + { + "epoch": 6.393871449925261, + "grad_norm": 7.7727885246276855, + "learning_rate": 2.011057888300041e-06, + "loss": 2.6971, + "step": 94105 + }, + { + "epoch": 6.394211169995923, + "grad_norm": 6.467475414276123, + "learning_rate": 2.0106332382117137e-06, + "loss": 2.7374, + "step": 94110 + }, + { + "epoch": 6.3945508900665855, + "grad_norm": 10.275988578796387, + "learning_rate": 2.0102085881233865e-06, + "loss": 2.5382, + "step": 94115 + }, + { + "epoch": 6.394890610137247, + "grad_norm": 8.019536018371582, + "learning_rate": 2.0097839380350593e-06, + "loss": 2.5701, + "step": 94120 + }, + { + "epoch": 6.395230330207909, + "grad_norm": 7.5733137130737305, + "learning_rate": 2.009359287946732e-06, + "loss": 2.6933, + "step": 94125 + }, + { + "epoch": 6.395570050278571, + "grad_norm": 7.935953140258789, + "learning_rate": 2.008934637858405e-06, + "loss": 2.9429, + "step": 94130 + }, + { + "epoch": 6.395909770349232, + "grad_norm": 7.510812282562256, + "learning_rate": 2.0085099877700777e-06, + "loss": 2.5946, + "step": 94135 + }, + { + "epoch": 6.396249490419894, + "grad_norm": 6.641021728515625, + "learning_rate": 2.0080853376817505e-06, + "loss": 2.738, + "step": 94140 + }, + { + "epoch": 6.396589210490556, + "grad_norm": 9.418412208557129, + "learning_rate": 2.0076606875934233e-06, + "loss": 2.4913, + "step": 94145 + }, + { + "epoch": 6.396928930561217, + "grad_norm": 8.534475326538086, + "learning_rate": 2.0072360375050957e-06, + "loss": 2.8314, + "step": 94150 + }, + { + "epoch": 6.397268650631879, + "grad_norm": 7.307567119598389, + "learning_rate": 2.006811387416769e-06, + "loss": 2.7411, + "step": 94155 + }, + { + "epoch": 6.3976083707025415, + "grad_norm": 8.592875480651855, + "learning_rate": 2.0063867373284417e-06, + "loss": 2.8396, + "step": 94160 + }, + { + "epoch": 6.397948090773203, + "grad_norm": 7.170207977294922, + "learning_rate": 2.005962087240114e-06, + "loss": 2.7087, + "step": 94165 + }, + { + "epoch": 6.398287810843865, + "grad_norm": 7.452258586883545, + "learning_rate": 2.0055374371517873e-06, + "loss": 2.7928, + "step": 94170 + }, + { + "epoch": 6.398627530914526, + "grad_norm": 7.382579326629639, + "learning_rate": 2.0051127870634597e-06, + "loss": 2.6236, + "step": 94175 + }, + { + "epoch": 6.398967250985188, + "grad_norm": 7.796016693115234, + "learning_rate": 2.0046881369751325e-06, + "loss": 2.6758, + "step": 94180 + }, + { + "epoch": 6.39930697105585, + "grad_norm": 6.544139385223389, + "learning_rate": 2.0042634868868053e-06, + "loss": 2.6683, + "step": 94185 + }, + { + "epoch": 6.399646691126511, + "grad_norm": 6.591373920440674, + "learning_rate": 2.003838836798478e-06, + "loss": 2.7895, + "step": 94190 + }, + { + "epoch": 6.399986411197173, + "grad_norm": 8.048644065856934, + "learning_rate": 2.003414186710151e-06, + "loss": 2.7828, + "step": 94195 + }, + { + "epoch": 6.4003261312678354, + "grad_norm": 7.0386576652526855, + "learning_rate": 2.0029895366218237e-06, + "loss": 2.6832, + "step": 94200 + }, + { + "epoch": 6.400665851338497, + "grad_norm": 8.835149765014648, + "learning_rate": 2.0025648865334965e-06, + "loss": 2.7306, + "step": 94205 + }, + { + "epoch": 6.401005571409159, + "grad_norm": 7.929872035980225, + "learning_rate": 2.0021402364451693e-06, + "loss": 2.6781, + "step": 94210 + }, + { + "epoch": 6.401345291479821, + "grad_norm": 8.35898208618164, + "learning_rate": 2.001715586356842e-06, + "loss": 2.8188, + "step": 94215 + }, + { + "epoch": 6.401685011550482, + "grad_norm": 5.7935686111450195, + "learning_rate": 2.001290936268515e-06, + "loss": 2.8458, + "step": 94220 + }, + { + "epoch": 6.402024731621144, + "grad_norm": 7.553280830383301, + "learning_rate": 2.0008662861801877e-06, + "loss": 2.5829, + "step": 94225 + }, + { + "epoch": 6.402364451691806, + "grad_norm": 7.0966291427612305, + "learning_rate": 2.0004416360918605e-06, + "loss": 2.5588, + "step": 94230 + }, + { + "epoch": 6.402704171762467, + "grad_norm": 7.791469097137451, + "learning_rate": 2.0000169860035333e-06, + "loss": 2.8121, + "step": 94235 + }, + { + "epoch": 6.403043891833129, + "grad_norm": 7.429393768310547, + "learning_rate": 1.999592335915206e-06, + "loss": 2.6357, + "step": 94240 + }, + { + "epoch": 6.4033836119037915, + "grad_norm": 7.403961181640625, + "learning_rate": 1.999167685826879e-06, + "loss": 2.6228, + "step": 94245 + }, + { + "epoch": 6.403723331974453, + "grad_norm": 9.255453109741211, + "learning_rate": 1.9987430357385517e-06, + "loss": 2.9459, + "step": 94250 + }, + { + "epoch": 6.404063052045115, + "grad_norm": 9.260871887207031, + "learning_rate": 1.9983183856502245e-06, + "loss": 2.9086, + "step": 94255 + }, + { + "epoch": 6.404402772115777, + "grad_norm": 8.311056137084961, + "learning_rate": 1.9978937355618973e-06, + "loss": 2.6765, + "step": 94260 + }, + { + "epoch": 6.404742492186438, + "grad_norm": 9.606380462646484, + "learning_rate": 1.9974690854735697e-06, + "loss": 2.8989, + "step": 94265 + }, + { + "epoch": 6.4050822122571, + "grad_norm": 9.775857925415039, + "learning_rate": 1.997044435385243e-06, + "loss": 2.8353, + "step": 94270 + }, + { + "epoch": 6.405421932327762, + "grad_norm": 7.908637046813965, + "learning_rate": 1.9966197852969153e-06, + "loss": 2.6455, + "step": 94275 + }, + { + "epoch": 6.405761652398423, + "grad_norm": 6.453804969787598, + "learning_rate": 1.996195135208588e-06, + "loss": 2.7007, + "step": 94280 + }, + { + "epoch": 6.406101372469085, + "grad_norm": 6.961145401000977, + "learning_rate": 1.9957704851202613e-06, + "loss": 2.8889, + "step": 94285 + }, + { + "epoch": 6.4064410925397475, + "grad_norm": 7.5739030838012695, + "learning_rate": 1.9953458350319337e-06, + "loss": 2.7566, + "step": 94290 + }, + { + "epoch": 6.406780812610409, + "grad_norm": 7.7972092628479, + "learning_rate": 1.994921184943607e-06, + "loss": 2.4963, + "step": 94295 + }, + { + "epoch": 6.407120532681071, + "grad_norm": 8.625242233276367, + "learning_rate": 1.9944965348552793e-06, + "loss": 2.928, + "step": 94300 + }, + { + "epoch": 6.407460252751733, + "grad_norm": 7.316011428833008, + "learning_rate": 1.994071884766952e-06, + "loss": 2.8441, + "step": 94305 + }, + { + "epoch": 6.407799972822394, + "grad_norm": 8.337227821350098, + "learning_rate": 1.993647234678625e-06, + "loss": 2.441, + "step": 94310 + }, + { + "epoch": 6.408139692893056, + "grad_norm": 7.086065769195557, + "learning_rate": 1.9932225845902977e-06, + "loss": 2.6563, + "step": 94315 + }, + { + "epoch": 6.408479412963718, + "grad_norm": 8.553881645202637, + "learning_rate": 1.9927979345019705e-06, + "loss": 2.8266, + "step": 94320 + }, + { + "epoch": 6.408819133034379, + "grad_norm": 7.448922634124756, + "learning_rate": 1.9923732844136433e-06, + "loss": 2.6182, + "step": 94325 + }, + { + "epoch": 6.409158853105041, + "grad_norm": 8.419415473937988, + "learning_rate": 1.991948634325316e-06, + "loss": 2.919, + "step": 94330 + }, + { + "epoch": 6.4094985731757035, + "grad_norm": 6.71493673324585, + "learning_rate": 1.991523984236989e-06, + "loss": 2.6633, + "step": 94335 + }, + { + "epoch": 6.409838293246365, + "grad_norm": 8.772445678710938, + "learning_rate": 1.9910993341486617e-06, + "loss": 2.7061, + "step": 94340 + }, + { + "epoch": 6.410178013317027, + "grad_norm": 7.3390116691589355, + "learning_rate": 1.9906746840603345e-06, + "loss": 2.8996, + "step": 94345 + }, + { + "epoch": 6.410517733387689, + "grad_norm": 7.837122917175293, + "learning_rate": 1.9902500339720073e-06, + "loss": 2.8556, + "step": 94350 + }, + { + "epoch": 6.41085745345835, + "grad_norm": 7.675112247467041, + "learning_rate": 1.98982538388368e-06, + "loss": 2.6733, + "step": 94355 + }, + { + "epoch": 6.411197173529012, + "grad_norm": 7.92939567565918, + "learning_rate": 1.989400733795353e-06, + "loss": 2.9096, + "step": 94360 + }, + { + "epoch": 6.411536893599674, + "grad_norm": 8.691758155822754, + "learning_rate": 1.9889760837070253e-06, + "loss": 2.737, + "step": 94365 + }, + { + "epoch": 6.411876613670335, + "grad_norm": 7.247778415679932, + "learning_rate": 1.9885514336186985e-06, + "loss": 2.6703, + "step": 94370 + }, + { + "epoch": 6.412216333740997, + "grad_norm": 8.524548530578613, + "learning_rate": 1.988126783530371e-06, + "loss": 2.7589, + "step": 94375 + }, + { + "epoch": 6.4125560538116595, + "grad_norm": 6.454031944274902, + "learning_rate": 1.987702133442044e-06, + "loss": 2.7958, + "step": 94380 + }, + { + "epoch": 6.412895773882321, + "grad_norm": 6.366765022277832, + "learning_rate": 1.987277483353717e-06, + "loss": 2.666, + "step": 94385 + }, + { + "epoch": 6.413235493952983, + "grad_norm": 6.039974689483643, + "learning_rate": 1.9868528332653893e-06, + "loss": 2.5439, + "step": 94390 + }, + { + "epoch": 6.413575214023645, + "grad_norm": 9.548377990722656, + "learning_rate": 1.9864281831770625e-06, + "loss": 2.6391, + "step": 94395 + }, + { + "epoch": 6.413914934094306, + "grad_norm": 6.162178993225098, + "learning_rate": 1.986003533088735e-06, + "loss": 2.7239, + "step": 94400 + }, + { + "epoch": 6.414254654164968, + "grad_norm": 8.078471183776855, + "learning_rate": 1.9855788830004077e-06, + "loss": 2.6789, + "step": 94405 + }, + { + "epoch": 6.41459437423563, + "grad_norm": 5.936951637268066, + "learning_rate": 1.985154232912081e-06, + "loss": 2.8277, + "step": 94410 + }, + { + "epoch": 6.414934094306291, + "grad_norm": 9.247952461242676, + "learning_rate": 1.9847295828237533e-06, + "loss": 2.9038, + "step": 94415 + }, + { + "epoch": 6.415273814376953, + "grad_norm": 9.761014938354492, + "learning_rate": 1.984304932735426e-06, + "loss": 2.6874, + "step": 94420 + }, + { + "epoch": 6.4156135344476155, + "grad_norm": 7.297794818878174, + "learning_rate": 1.983880282647099e-06, + "loss": 2.753, + "step": 94425 + }, + { + "epoch": 6.415953254518277, + "grad_norm": 7.092164039611816, + "learning_rate": 1.9834556325587717e-06, + "loss": 2.9301, + "step": 94430 + }, + { + "epoch": 6.416292974588939, + "grad_norm": 7.508615016937256, + "learning_rate": 1.9830309824704445e-06, + "loss": 2.5785, + "step": 94435 + }, + { + "epoch": 6.416632694659601, + "grad_norm": 7.633037090301514, + "learning_rate": 1.9826063323821173e-06, + "loss": 2.6345, + "step": 94440 + }, + { + "epoch": 6.416972414730262, + "grad_norm": 7.742671966552734, + "learning_rate": 1.98218168229379e-06, + "loss": 2.8545, + "step": 94445 + }, + { + "epoch": 6.417312134800924, + "grad_norm": 7.654028415679932, + "learning_rate": 1.981757032205463e-06, + "loss": 2.6723, + "step": 94450 + }, + { + "epoch": 6.417651854871586, + "grad_norm": 8.957837104797363, + "learning_rate": 1.9813323821171357e-06, + "loss": 2.955, + "step": 94455 + }, + { + "epoch": 6.417991574942247, + "grad_norm": 7.67033052444458, + "learning_rate": 1.9809077320288085e-06, + "loss": 2.9022, + "step": 94460 + }, + { + "epoch": 6.418331295012909, + "grad_norm": 9.710723876953125, + "learning_rate": 1.9804830819404813e-06, + "loss": 2.7512, + "step": 94465 + }, + { + "epoch": 6.4186710150835715, + "grad_norm": 7.086363792419434, + "learning_rate": 1.980058431852154e-06, + "loss": 2.6656, + "step": 94470 + }, + { + "epoch": 6.419010735154233, + "grad_norm": 7.285918712615967, + "learning_rate": 1.979633781763827e-06, + "loss": 2.6882, + "step": 94475 + }, + { + "epoch": 6.419350455224895, + "grad_norm": 7.068158149719238, + "learning_rate": 1.9792091316754997e-06, + "loss": 2.9606, + "step": 94480 + }, + { + "epoch": 6.419690175295557, + "grad_norm": 7.17516565322876, + "learning_rate": 1.9787844815871725e-06, + "loss": 2.739, + "step": 94485 + }, + { + "epoch": 6.420029895366218, + "grad_norm": 8.93331527709961, + "learning_rate": 1.978359831498845e-06, + "loss": 2.741, + "step": 94490 + }, + { + "epoch": 6.42036961543688, + "grad_norm": 8.681904792785645, + "learning_rate": 1.977935181410518e-06, + "loss": 2.5721, + "step": 94495 + }, + { + "epoch": 6.420709335507542, + "grad_norm": 7.582896709442139, + "learning_rate": 1.9775105313221905e-06, + "loss": 2.9072, + "step": 94500 + }, + { + "epoch": 6.421049055578203, + "grad_norm": 7.57451868057251, + "learning_rate": 1.9770858812338633e-06, + "loss": 2.628, + "step": 94505 + }, + { + "epoch": 6.4213887756488655, + "grad_norm": 8.616607666015625, + "learning_rate": 1.9766612311455365e-06, + "loss": 2.5403, + "step": 94510 + }, + { + "epoch": 6.4217284957195275, + "grad_norm": 7.635075092315674, + "learning_rate": 1.976236581057209e-06, + "loss": 2.9214, + "step": 94515 + }, + { + "epoch": 6.422068215790189, + "grad_norm": 6.604520320892334, + "learning_rate": 1.9758119309688817e-06, + "loss": 2.6497, + "step": 94520 + }, + { + "epoch": 6.422407935860851, + "grad_norm": 7.5130085945129395, + "learning_rate": 1.9753872808805545e-06, + "loss": 2.5845, + "step": 94525 + }, + { + "epoch": 6.422747655931513, + "grad_norm": 7.165524005889893, + "learning_rate": 1.9749626307922273e-06, + "loss": 2.7319, + "step": 94530 + }, + { + "epoch": 6.423087376002174, + "grad_norm": 7.69969367980957, + "learning_rate": 1.9745379807039e-06, + "loss": 2.8114, + "step": 94535 + }, + { + "epoch": 6.423427096072836, + "grad_norm": 8.282722473144531, + "learning_rate": 1.974113330615573e-06, + "loss": 2.921, + "step": 94540 + }, + { + "epoch": 6.423766816143498, + "grad_norm": 6.616476058959961, + "learning_rate": 1.9736886805272457e-06, + "loss": 2.7139, + "step": 94545 + }, + { + "epoch": 6.424106536214159, + "grad_norm": 7.783656120300293, + "learning_rate": 1.9732640304389185e-06, + "loss": 2.518, + "step": 94550 + }, + { + "epoch": 6.4244462562848215, + "grad_norm": 8.423067092895508, + "learning_rate": 1.9728393803505913e-06, + "loss": 2.7988, + "step": 94555 + }, + { + "epoch": 6.4247859763554835, + "grad_norm": 7.260446548461914, + "learning_rate": 1.972414730262264e-06, + "loss": 3.0187, + "step": 94560 + }, + { + "epoch": 6.425125696426145, + "grad_norm": 9.914694786071777, + "learning_rate": 1.971990080173937e-06, + "loss": 2.7041, + "step": 94565 + }, + { + "epoch": 6.425465416496807, + "grad_norm": 8.756084442138672, + "learning_rate": 1.9715654300856097e-06, + "loss": 2.6715, + "step": 94570 + }, + { + "epoch": 6.425805136567468, + "grad_norm": 7.532454490661621, + "learning_rate": 1.9711407799972825e-06, + "loss": 2.3653, + "step": 94575 + }, + { + "epoch": 6.42614485663813, + "grad_norm": 7.598728656768799, + "learning_rate": 1.9707161299089553e-06, + "loss": 2.7028, + "step": 94580 + }, + { + "epoch": 6.426484576708792, + "grad_norm": 6.783681869506836, + "learning_rate": 1.970291479820628e-06, + "loss": 2.8738, + "step": 94585 + }, + { + "epoch": 6.426824296779453, + "grad_norm": 9.143403053283691, + "learning_rate": 1.9698668297323005e-06, + "loss": 2.8143, + "step": 94590 + }, + { + "epoch": 6.427164016850115, + "grad_norm": 8.436873435974121, + "learning_rate": 1.9694421796439737e-06, + "loss": 2.9556, + "step": 94595 + }, + { + "epoch": 6.4275037369207775, + "grad_norm": 8.36999225616455, + "learning_rate": 1.9690175295556465e-06, + "loss": 2.8287, + "step": 94600 + }, + { + "epoch": 6.427843456991439, + "grad_norm": 7.21806526184082, + "learning_rate": 1.968592879467319e-06, + "loss": 2.6687, + "step": 94605 + }, + { + "epoch": 6.428183177062101, + "grad_norm": 6.402049541473389, + "learning_rate": 1.968168229378992e-06, + "loss": 2.779, + "step": 94610 + }, + { + "epoch": 6.428522897132763, + "grad_norm": 8.28503131866455, + "learning_rate": 1.9677435792906645e-06, + "loss": 2.6687, + "step": 94615 + }, + { + "epoch": 6.428862617203424, + "grad_norm": 7.843505382537842, + "learning_rate": 1.9673189292023373e-06, + "loss": 2.8014, + "step": 94620 + }, + { + "epoch": 6.429202337274086, + "grad_norm": 9.02017879486084, + "learning_rate": 1.96689427911401e-06, + "loss": 2.7547, + "step": 94625 + }, + { + "epoch": 6.429542057344748, + "grad_norm": 6.856951713562012, + "learning_rate": 1.966469629025683e-06, + "loss": 2.936, + "step": 94630 + }, + { + "epoch": 6.429881777415409, + "grad_norm": 9.932976722717285, + "learning_rate": 1.966044978937356e-06, + "loss": 2.8154, + "step": 94635 + }, + { + "epoch": 6.430221497486071, + "grad_norm": 7.4051971435546875, + "learning_rate": 1.9656203288490285e-06, + "loss": 2.9617, + "step": 94640 + }, + { + "epoch": 6.4305612175567335, + "grad_norm": 9.922369003295898, + "learning_rate": 1.9651956787607013e-06, + "loss": 2.7269, + "step": 94645 + }, + { + "epoch": 6.430900937627395, + "grad_norm": 7.514723300933838, + "learning_rate": 1.964771028672374e-06, + "loss": 2.9645, + "step": 94650 + }, + { + "epoch": 6.431240657698057, + "grad_norm": 9.06718635559082, + "learning_rate": 1.964346378584047e-06, + "loss": 2.8389, + "step": 94655 + }, + { + "epoch": 6.431580377768719, + "grad_norm": 7.1572675704956055, + "learning_rate": 1.9639217284957197e-06, + "loss": 2.707, + "step": 94660 + }, + { + "epoch": 6.43192009783938, + "grad_norm": 6.639120578765869, + "learning_rate": 1.9634970784073925e-06, + "loss": 2.8871, + "step": 94665 + }, + { + "epoch": 6.432259817910042, + "grad_norm": 6.899158000946045, + "learning_rate": 1.9630724283190653e-06, + "loss": 2.671, + "step": 94670 + }, + { + "epoch": 6.432599537980704, + "grad_norm": 6.864559173583984, + "learning_rate": 1.962647778230738e-06, + "loss": 3.0137, + "step": 94675 + }, + { + "epoch": 6.432939258051365, + "grad_norm": 7.399965286254883, + "learning_rate": 1.962223128142411e-06, + "loss": 2.6174, + "step": 94680 + }, + { + "epoch": 6.433278978122027, + "grad_norm": 9.846447944641113, + "learning_rate": 1.9617984780540837e-06, + "loss": 2.8048, + "step": 94685 + }, + { + "epoch": 6.4336186981926895, + "grad_norm": 7.254260063171387, + "learning_rate": 1.961373827965756e-06, + "loss": 2.7154, + "step": 94690 + }, + { + "epoch": 6.433958418263351, + "grad_norm": 7.598560810089111, + "learning_rate": 1.9609491778774293e-06, + "loss": 2.7, + "step": 94695 + }, + { + "epoch": 6.434298138334013, + "grad_norm": 7.484780311584473, + "learning_rate": 1.960524527789102e-06, + "loss": 2.6112, + "step": 94700 + }, + { + "epoch": 6.434637858404675, + "grad_norm": 8.41141414642334, + "learning_rate": 1.9600998777007745e-06, + "loss": 2.8659, + "step": 94705 + }, + { + "epoch": 6.434977578475336, + "grad_norm": 8.39917278289795, + "learning_rate": 1.9596752276124477e-06, + "loss": 2.9087, + "step": 94710 + }, + { + "epoch": 6.435317298545998, + "grad_norm": 8.263262748718262, + "learning_rate": 1.95925057752412e-06, + "loss": 2.6814, + "step": 94715 + }, + { + "epoch": 6.43565701861666, + "grad_norm": 8.188558578491211, + "learning_rate": 1.9588259274357933e-06, + "loss": 2.7253, + "step": 94720 + }, + { + "epoch": 6.435996738687321, + "grad_norm": 6.882499694824219, + "learning_rate": 1.958401277347466e-06, + "loss": 2.7032, + "step": 94725 + }, + { + "epoch": 6.436336458757983, + "grad_norm": 9.744742393493652, + "learning_rate": 1.9579766272591385e-06, + "loss": 2.7018, + "step": 94730 + }, + { + "epoch": 6.4366761788286455, + "grad_norm": 7.501221656799316, + "learning_rate": 1.9575519771708117e-06, + "loss": 2.7478, + "step": 94735 + }, + { + "epoch": 6.437015898899307, + "grad_norm": 7.915822505950928, + "learning_rate": 1.957127327082484e-06, + "loss": 2.6432, + "step": 94740 + }, + { + "epoch": 6.437355618969969, + "grad_norm": 8.124241828918457, + "learning_rate": 1.956702676994157e-06, + "loss": 2.647, + "step": 94745 + }, + { + "epoch": 6.437695339040631, + "grad_norm": 7.483804225921631, + "learning_rate": 1.9562780269058297e-06, + "loss": 2.8741, + "step": 94750 + }, + { + "epoch": 6.438035059111292, + "grad_norm": 9.72989559173584, + "learning_rate": 1.9558533768175025e-06, + "loss": 2.7883, + "step": 94755 + }, + { + "epoch": 6.438374779181954, + "grad_norm": 7.10172700881958, + "learning_rate": 1.9554287267291753e-06, + "loss": 2.8489, + "step": 94760 + }, + { + "epoch": 6.438714499252616, + "grad_norm": 9.381755828857422, + "learning_rate": 1.955004076640848e-06, + "loss": 2.8965, + "step": 94765 + }, + { + "epoch": 6.439054219323277, + "grad_norm": 9.408645629882812, + "learning_rate": 1.954579426552521e-06, + "loss": 2.8185, + "step": 94770 + }, + { + "epoch": 6.4393939393939394, + "grad_norm": 6.889217376708984, + "learning_rate": 1.9541547764641937e-06, + "loss": 2.8843, + "step": 94775 + }, + { + "epoch": 6.4397336594646015, + "grad_norm": 7.583342552185059, + "learning_rate": 1.9537301263758665e-06, + "loss": 2.8564, + "step": 94780 + }, + { + "epoch": 6.440073379535263, + "grad_norm": 8.07756519317627, + "learning_rate": 1.9533054762875393e-06, + "loss": 2.831, + "step": 94785 + }, + { + "epoch": 6.440413099605925, + "grad_norm": 7.550759315490723, + "learning_rate": 1.952880826199212e-06, + "loss": 2.5258, + "step": 94790 + }, + { + "epoch": 6.440752819676587, + "grad_norm": 9.59679889678955, + "learning_rate": 1.952456176110885e-06, + "loss": 2.5629, + "step": 94795 + }, + { + "epoch": 6.441092539747248, + "grad_norm": 8.532920837402344, + "learning_rate": 1.9520315260225577e-06, + "loss": 2.8337, + "step": 94800 + }, + { + "epoch": 6.44143225981791, + "grad_norm": 8.47569751739502, + "learning_rate": 1.9516068759342305e-06, + "loss": 2.6927, + "step": 94805 + }, + { + "epoch": 6.441771979888572, + "grad_norm": 5.489899635314941, + "learning_rate": 1.9511822258459033e-06, + "loss": 2.6523, + "step": 94810 + }, + { + "epoch": 6.442111699959233, + "grad_norm": 8.330167770385742, + "learning_rate": 1.9507575757575757e-06, + "loss": 2.9177, + "step": 94815 + }, + { + "epoch": 6.4424514200298955, + "grad_norm": 7.738033294677734, + "learning_rate": 1.950332925669249e-06, + "loss": 2.6903, + "step": 94820 + }, + { + "epoch": 6.4427911401005575, + "grad_norm": 7.551983833312988, + "learning_rate": 1.9499082755809217e-06, + "loss": 2.722, + "step": 94825 + }, + { + "epoch": 6.443130860171219, + "grad_norm": 6.980626106262207, + "learning_rate": 1.949483625492594e-06, + "loss": 2.8737, + "step": 94830 + }, + { + "epoch": 6.443470580241881, + "grad_norm": 9.185314178466797, + "learning_rate": 1.9490589754042673e-06, + "loss": 2.7987, + "step": 94835 + }, + { + "epoch": 6.443810300312543, + "grad_norm": 9.552301406860352, + "learning_rate": 1.9486343253159397e-06, + "loss": 2.8296, + "step": 94840 + }, + { + "epoch": 6.444150020383204, + "grad_norm": 7.9713029861450195, + "learning_rate": 1.9482096752276125e-06, + "loss": 2.6941, + "step": 94845 + }, + { + "epoch": 6.444489740453866, + "grad_norm": 7.4247918128967285, + "learning_rate": 1.9477850251392857e-06, + "loss": 2.7965, + "step": 94850 + }, + { + "epoch": 6.444829460524527, + "grad_norm": 8.922025680541992, + "learning_rate": 1.947360375050958e-06, + "loss": 2.5377, + "step": 94855 + }, + { + "epoch": 6.445169180595189, + "grad_norm": 9.20680046081543, + "learning_rate": 1.946935724962631e-06, + "loss": 2.829, + "step": 94860 + }, + { + "epoch": 6.4455089006658515, + "grad_norm": 7.7654194831848145, + "learning_rate": 1.9465110748743037e-06, + "loss": 2.7325, + "step": 94865 + }, + { + "epoch": 6.445848620736513, + "grad_norm": 6.185161113739014, + "learning_rate": 1.9460864247859765e-06, + "loss": 2.8039, + "step": 94870 + }, + { + "epoch": 6.446188340807175, + "grad_norm": 6.816103935241699, + "learning_rate": 1.9456617746976493e-06, + "loss": 2.7242, + "step": 94875 + }, + { + "epoch": 6.446528060877837, + "grad_norm": 7.291685581207275, + "learning_rate": 1.945237124609322e-06, + "loss": 2.8544, + "step": 94880 + }, + { + "epoch": 6.446867780948498, + "grad_norm": 8.556329727172852, + "learning_rate": 1.944812474520995e-06, + "loss": 2.8325, + "step": 94885 + }, + { + "epoch": 6.44720750101916, + "grad_norm": 9.219834327697754, + "learning_rate": 1.9443878244326677e-06, + "loss": 2.6102, + "step": 94890 + }, + { + "epoch": 6.447547221089822, + "grad_norm": 7.384257793426514, + "learning_rate": 1.9439631743443405e-06, + "loss": 2.684, + "step": 94895 + }, + { + "epoch": 6.447886941160483, + "grad_norm": 6.74599552154541, + "learning_rate": 1.9435385242560133e-06, + "loss": 2.7258, + "step": 94900 + }, + { + "epoch": 6.448226661231145, + "grad_norm": 7.714547634124756, + "learning_rate": 1.943113874167686e-06, + "loss": 2.5013, + "step": 94905 + }, + { + "epoch": 6.4485663813018075, + "grad_norm": 8.716645240783691, + "learning_rate": 1.942689224079359e-06, + "loss": 2.8569, + "step": 94910 + }, + { + "epoch": 6.448906101372469, + "grad_norm": 9.32925033569336, + "learning_rate": 1.9422645739910317e-06, + "loss": 2.7667, + "step": 94915 + }, + { + "epoch": 6.449245821443131, + "grad_norm": 7.22106409072876, + "learning_rate": 1.9418399239027045e-06, + "loss": 2.8848, + "step": 94920 + }, + { + "epoch": 6.449585541513793, + "grad_norm": 7.17078971862793, + "learning_rate": 1.9414152738143773e-06, + "loss": 2.6292, + "step": 94925 + }, + { + "epoch": 6.449925261584454, + "grad_norm": 7.3990936279296875, + "learning_rate": 1.9409906237260496e-06, + "loss": 2.6592, + "step": 94930 + }, + { + "epoch": 6.450264981655116, + "grad_norm": 8.67204475402832, + "learning_rate": 1.940565973637723e-06, + "loss": 2.8391, + "step": 94935 + }, + { + "epoch": 6.450604701725778, + "grad_norm": 8.207303047180176, + "learning_rate": 1.9401413235493952e-06, + "loss": 2.8198, + "step": 94940 + }, + { + "epoch": 6.450944421796439, + "grad_norm": 7.131435871124268, + "learning_rate": 1.939716673461068e-06, + "loss": 2.7651, + "step": 94945 + }, + { + "epoch": 6.451284141867101, + "grad_norm": 6.5886125564575195, + "learning_rate": 1.9392920233727413e-06, + "loss": 2.7068, + "step": 94950 + }, + { + "epoch": 6.4516238619377635, + "grad_norm": 8.585858345031738, + "learning_rate": 1.9388673732844136e-06, + "loss": 2.7412, + "step": 94955 + }, + { + "epoch": 6.451963582008425, + "grad_norm": 7.031380653381348, + "learning_rate": 1.9384427231960864e-06, + "loss": 2.8473, + "step": 94960 + }, + { + "epoch": 6.452303302079087, + "grad_norm": 8.655980110168457, + "learning_rate": 1.9380180731077593e-06, + "loss": 2.9503, + "step": 94965 + }, + { + "epoch": 6.452643022149749, + "grad_norm": 8.857473373413086, + "learning_rate": 1.937593423019432e-06, + "loss": 3.0285, + "step": 94970 + }, + { + "epoch": 6.45298274222041, + "grad_norm": 7.716906547546387, + "learning_rate": 1.937168772931105e-06, + "loss": 2.752, + "step": 94975 + }, + { + "epoch": 6.453322462291072, + "grad_norm": 7.129394054412842, + "learning_rate": 1.9367441228427777e-06, + "loss": 2.507, + "step": 94980 + }, + { + "epoch": 6.453662182361734, + "grad_norm": 8.979938507080078, + "learning_rate": 1.9363194727544505e-06, + "loss": 2.7944, + "step": 94985 + }, + { + "epoch": 6.454001902432395, + "grad_norm": 7.347237586975098, + "learning_rate": 1.9358948226661233e-06, + "loss": 2.785, + "step": 94990 + }, + { + "epoch": 6.454341622503057, + "grad_norm": 7.730876445770264, + "learning_rate": 1.935470172577796e-06, + "loss": 2.7018, + "step": 94995 + }, + { + "epoch": 6.4546813425737195, + "grad_norm": 4.990516662597656, + "learning_rate": 1.935045522489469e-06, + "loss": 2.4657, + "step": 95000 + }, + { + "epoch": 6.455021062644381, + "grad_norm": 7.252951622009277, + "learning_rate": 1.9346208724011417e-06, + "loss": 2.8904, + "step": 95005 + }, + { + "epoch": 6.455360782715043, + "grad_norm": 8.152275085449219, + "learning_rate": 1.9341962223128145e-06, + "loss": 2.7974, + "step": 95010 + }, + { + "epoch": 6.455700502785705, + "grad_norm": 6.558776378631592, + "learning_rate": 1.9337715722244873e-06, + "loss": 2.6638, + "step": 95015 + }, + { + "epoch": 6.456040222856366, + "grad_norm": 7.280182361602783, + "learning_rate": 1.93334692213616e-06, + "loss": 2.893, + "step": 95020 + }, + { + "epoch": 6.456379942927028, + "grad_norm": 6.462860107421875, + "learning_rate": 1.932922272047833e-06, + "loss": 2.9792, + "step": 95025 + }, + { + "epoch": 6.45671966299769, + "grad_norm": 9.255096435546875, + "learning_rate": 1.9324976219595052e-06, + "loss": 2.7416, + "step": 95030 + }, + { + "epoch": 6.457059383068351, + "grad_norm": 6.91048002243042, + "learning_rate": 1.9320729718711785e-06, + "loss": 2.7178, + "step": 95035 + }, + { + "epoch": 6.457399103139013, + "grad_norm": 8.698607444763184, + "learning_rate": 1.9316483217828513e-06, + "loss": 2.6775, + "step": 95040 + }, + { + "epoch": 6.4577388232096755, + "grad_norm": 6.269466876983643, + "learning_rate": 1.9312236716945236e-06, + "loss": 2.7406, + "step": 95045 + }, + { + "epoch": 6.458078543280337, + "grad_norm": 6.689545154571533, + "learning_rate": 1.930799021606197e-06, + "loss": 2.7951, + "step": 95050 + }, + { + "epoch": 6.458418263350999, + "grad_norm": 8.014735221862793, + "learning_rate": 1.9303743715178692e-06, + "loss": 2.5234, + "step": 95055 + }, + { + "epoch": 6.458757983421661, + "grad_norm": 10.494274139404297, + "learning_rate": 1.9299497214295425e-06, + "loss": 2.9899, + "step": 95060 + }, + { + "epoch": 6.459097703492322, + "grad_norm": 7.276008605957031, + "learning_rate": 1.929525071341215e-06, + "loss": 2.8158, + "step": 95065 + }, + { + "epoch": 6.459437423562984, + "grad_norm": 7.740854263305664, + "learning_rate": 1.9291004212528876e-06, + "loss": 2.6971, + "step": 95070 + }, + { + "epoch": 6.459777143633646, + "grad_norm": 8.514102935791016, + "learning_rate": 1.928675771164561e-06, + "loss": 2.9221, + "step": 95075 + }, + { + "epoch": 6.460116863704307, + "grad_norm": 8.050697326660156, + "learning_rate": 1.9282511210762332e-06, + "loss": 2.884, + "step": 95080 + }, + { + "epoch": 6.4604565837749695, + "grad_norm": 9.20468521118164, + "learning_rate": 1.927826470987906e-06, + "loss": 2.7025, + "step": 95085 + }, + { + "epoch": 6.4607963038456315, + "grad_norm": 7.498117923736572, + "learning_rate": 1.927401820899579e-06, + "loss": 2.8655, + "step": 95090 + }, + { + "epoch": 6.461136023916293, + "grad_norm": 7.555875778198242, + "learning_rate": 1.9269771708112516e-06, + "loss": 2.7064, + "step": 95095 + }, + { + "epoch": 6.461475743986955, + "grad_norm": 7.159021854400635, + "learning_rate": 1.9265525207229244e-06, + "loss": 2.5554, + "step": 95100 + }, + { + "epoch": 6.461815464057617, + "grad_norm": 8.931445121765137, + "learning_rate": 1.9261278706345972e-06, + "loss": 2.6865, + "step": 95105 + }, + { + "epoch": 6.462155184128278, + "grad_norm": 7.016768455505371, + "learning_rate": 1.92570322054627e-06, + "loss": 2.5104, + "step": 95110 + }, + { + "epoch": 6.46249490419894, + "grad_norm": 5.714457035064697, + "learning_rate": 1.925278570457943e-06, + "loss": 2.8421, + "step": 95115 + }, + { + "epoch": 6.462834624269602, + "grad_norm": 6.744965076446533, + "learning_rate": 1.9248539203696157e-06, + "loss": 2.6953, + "step": 95120 + }, + { + "epoch": 6.463174344340263, + "grad_norm": 6.085676670074463, + "learning_rate": 1.9244292702812885e-06, + "loss": 2.9401, + "step": 95125 + }, + { + "epoch": 6.4635140644109255, + "grad_norm": 8.718742370605469, + "learning_rate": 1.924004620192961e-06, + "loss": 2.7065, + "step": 95130 + }, + { + "epoch": 6.4638537844815875, + "grad_norm": 6.868541240692139, + "learning_rate": 1.923579970104634e-06, + "loss": 2.6522, + "step": 95135 + }, + { + "epoch": 6.464193504552249, + "grad_norm": 7.883282661437988, + "learning_rate": 1.923155320016307e-06, + "loss": 2.8024, + "step": 95140 + }, + { + "epoch": 6.464533224622911, + "grad_norm": 6.584187984466553, + "learning_rate": 1.9227306699279797e-06, + "loss": 2.8372, + "step": 95145 + }, + { + "epoch": 6.464872944693573, + "grad_norm": 5.748874187469482, + "learning_rate": 1.9223060198396525e-06, + "loss": 2.8318, + "step": 95150 + }, + { + "epoch": 6.465212664764234, + "grad_norm": 6.600970268249512, + "learning_rate": 1.921881369751325e-06, + "loss": 2.7801, + "step": 95155 + }, + { + "epoch": 6.465552384834896, + "grad_norm": 8.706438064575195, + "learning_rate": 1.921456719662998e-06, + "loss": 2.899, + "step": 95160 + }, + { + "epoch": 6.465892104905558, + "grad_norm": 8.98550796508789, + "learning_rate": 1.9210320695746704e-06, + "loss": 2.7012, + "step": 95165 + }, + { + "epoch": 6.466231824976219, + "grad_norm": 8.101622581481934, + "learning_rate": 1.9206074194863432e-06, + "loss": 2.6927, + "step": 95170 + }, + { + "epoch": 6.4665715450468815, + "grad_norm": 7.399065971374512, + "learning_rate": 1.9201827693980165e-06, + "loss": 2.8903, + "step": 95175 + }, + { + "epoch": 6.4669112651175436, + "grad_norm": 11.178589820861816, + "learning_rate": 1.919758119309689e-06, + "loss": 2.6189, + "step": 95180 + }, + { + "epoch": 6.467250985188205, + "grad_norm": 7.195501804351807, + "learning_rate": 1.9193334692213616e-06, + "loss": 2.7232, + "step": 95185 + }, + { + "epoch": 6.467590705258867, + "grad_norm": 9.642745018005371, + "learning_rate": 1.9189088191330344e-06, + "loss": 2.7155, + "step": 95190 + }, + { + "epoch": 6.467930425329529, + "grad_norm": 7.946195602416992, + "learning_rate": 1.9184841690447072e-06, + "loss": 2.7238, + "step": 95195 + }, + { + "epoch": 6.46827014540019, + "grad_norm": 9.490893363952637, + "learning_rate": 1.91805951895638e-06, + "loss": 2.9235, + "step": 95200 + }, + { + "epoch": 6.468609865470852, + "grad_norm": 8.185136795043945, + "learning_rate": 1.917634868868053e-06, + "loss": 2.9107, + "step": 95205 + }, + { + "epoch": 6.468949585541514, + "grad_norm": 10.080998420715332, + "learning_rate": 1.9172102187797256e-06, + "loss": 2.6425, + "step": 95210 + }, + { + "epoch": 6.469289305612175, + "grad_norm": 7.544717311859131, + "learning_rate": 1.9167855686913984e-06, + "loss": 2.7648, + "step": 95215 + }, + { + "epoch": 6.4696290256828375, + "grad_norm": 9.17216968536377, + "learning_rate": 1.9163609186030712e-06, + "loss": 2.6418, + "step": 95220 + }, + { + "epoch": 6.4699687457535, + "grad_norm": 8.765765190124512, + "learning_rate": 1.915936268514744e-06, + "loss": 2.6586, + "step": 95225 + }, + { + "epoch": 6.470308465824161, + "grad_norm": 8.336320877075195, + "learning_rate": 1.915511618426417e-06, + "loss": 2.9463, + "step": 95230 + }, + { + "epoch": 6.470648185894823, + "grad_norm": 8.18278980255127, + "learning_rate": 1.9150869683380896e-06, + "loss": 3.0494, + "step": 95235 + }, + { + "epoch": 6.470987905965485, + "grad_norm": 7.624467372894287, + "learning_rate": 1.9146623182497624e-06, + "loss": 2.9651, + "step": 95240 + }, + { + "epoch": 6.471327626036146, + "grad_norm": 9.3745698928833, + "learning_rate": 1.9142376681614352e-06, + "loss": 2.9144, + "step": 95245 + }, + { + "epoch": 6.471667346106808, + "grad_norm": 6.917361736297607, + "learning_rate": 1.913813018073108e-06, + "loss": 2.7616, + "step": 95250 + }, + { + "epoch": 6.47200706617747, + "grad_norm": 9.888331413269043, + "learning_rate": 1.9133883679847804e-06, + "loss": 2.7779, + "step": 95255 + }, + { + "epoch": 6.472346786248131, + "grad_norm": 7.320250511169434, + "learning_rate": 1.9129637178964536e-06, + "loss": 2.8493, + "step": 95260 + }, + { + "epoch": 6.4726865063187935, + "grad_norm": 6.612654209136963, + "learning_rate": 1.9125390678081264e-06, + "loss": 2.7226, + "step": 95265 + }, + { + "epoch": 6.473026226389455, + "grad_norm": 6.186715126037598, + "learning_rate": 1.912114417719799e-06, + "loss": 2.7097, + "step": 95270 + }, + { + "epoch": 6.473365946460117, + "grad_norm": 8.422805786132812, + "learning_rate": 1.911689767631472e-06, + "loss": 2.5823, + "step": 95275 + }, + { + "epoch": 6.473705666530779, + "grad_norm": 7.004366874694824, + "learning_rate": 1.9112651175431444e-06, + "loss": 2.9835, + "step": 95280 + }, + { + "epoch": 6.47404538660144, + "grad_norm": 6.525287628173828, + "learning_rate": 1.9108404674548172e-06, + "loss": 2.7806, + "step": 95285 + }, + { + "epoch": 6.474385106672102, + "grad_norm": 7.235603332519531, + "learning_rate": 1.91041581736649e-06, + "loss": 2.6964, + "step": 95290 + }, + { + "epoch": 6.474724826742764, + "grad_norm": 8.71073055267334, + "learning_rate": 1.909991167278163e-06, + "loss": 2.5186, + "step": 95295 + }, + { + "epoch": 6.475064546813425, + "grad_norm": 7.820006847381592, + "learning_rate": 1.9095665171898356e-06, + "loss": 2.8537, + "step": 95300 + }, + { + "epoch": 6.475404266884087, + "grad_norm": 6.097821235656738, + "learning_rate": 1.9091418671015084e-06, + "loss": 2.7594, + "step": 95305 + }, + { + "epoch": 6.4757439869547495, + "grad_norm": 7.4156599044799805, + "learning_rate": 1.9087172170131812e-06, + "loss": 2.9027, + "step": 95310 + }, + { + "epoch": 6.476083707025411, + "grad_norm": 6.850721836090088, + "learning_rate": 1.908292566924854e-06, + "loss": 2.6051, + "step": 95315 + }, + { + "epoch": 6.476423427096073, + "grad_norm": 7.544737815856934, + "learning_rate": 1.907867916836527e-06, + "loss": 2.6003, + "step": 95320 + }, + { + "epoch": 6.476763147166735, + "grad_norm": 8.693119049072266, + "learning_rate": 1.9074432667481996e-06, + "loss": 2.649, + "step": 95325 + }, + { + "epoch": 6.477102867237396, + "grad_norm": 7.47951078414917, + "learning_rate": 1.9070186166598724e-06, + "loss": 2.7924, + "step": 95330 + }, + { + "epoch": 6.477442587308058, + "grad_norm": 6.200802803039551, + "learning_rate": 1.9065939665715452e-06, + "loss": 2.8539, + "step": 95335 + }, + { + "epoch": 6.47778230737872, + "grad_norm": 9.876529693603516, + "learning_rate": 1.9061693164832178e-06, + "loss": 2.6909, + "step": 95340 + }, + { + "epoch": 6.478122027449381, + "grad_norm": 8.007118225097656, + "learning_rate": 1.9057446663948908e-06, + "loss": 2.6684, + "step": 95345 + }, + { + "epoch": 6.4784617475200434, + "grad_norm": 6.092844009399414, + "learning_rate": 1.9053200163065636e-06, + "loss": 2.7119, + "step": 95350 + }, + { + "epoch": 6.4788014675907055, + "grad_norm": 9.612844467163086, + "learning_rate": 1.9048953662182362e-06, + "loss": 2.7219, + "step": 95355 + }, + { + "epoch": 6.479141187661367, + "grad_norm": 9.706061363220215, + "learning_rate": 1.9044707161299092e-06, + "loss": 2.5214, + "step": 95360 + }, + { + "epoch": 6.479480907732029, + "grad_norm": 7.184065341949463, + "learning_rate": 1.9040460660415818e-06, + "loss": 2.7797, + "step": 95365 + }, + { + "epoch": 6.479820627802691, + "grad_norm": 9.582939147949219, + "learning_rate": 1.9036214159532546e-06, + "loss": 2.7877, + "step": 95370 + }, + { + "epoch": 6.480160347873352, + "grad_norm": 9.490038871765137, + "learning_rate": 1.9031967658649274e-06, + "loss": 2.3953, + "step": 95375 + }, + { + "epoch": 6.480500067944014, + "grad_norm": 6.924794673919678, + "learning_rate": 1.9027721157766002e-06, + "loss": 2.9048, + "step": 95380 + }, + { + "epoch": 6.480839788014676, + "grad_norm": 6.320626258850098, + "learning_rate": 1.9023474656882728e-06, + "loss": 2.7073, + "step": 95385 + }, + { + "epoch": 6.481179508085337, + "grad_norm": 7.811081886291504, + "learning_rate": 1.9019228155999458e-06, + "loss": 2.7272, + "step": 95390 + }, + { + "epoch": 6.4815192281559995, + "grad_norm": 6.826091766357422, + "learning_rate": 1.9014981655116186e-06, + "loss": 2.7876, + "step": 95395 + }, + { + "epoch": 6.4818589482266615, + "grad_norm": 9.612600326538086, + "learning_rate": 1.9010735154232914e-06, + "loss": 2.9001, + "step": 95400 + }, + { + "epoch": 6.482198668297323, + "grad_norm": 7.943897724151611, + "learning_rate": 1.9006488653349642e-06, + "loss": 2.7778, + "step": 95405 + }, + { + "epoch": 6.482538388367985, + "grad_norm": 8.17135238647461, + "learning_rate": 1.9002242152466368e-06, + "loss": 2.7591, + "step": 95410 + }, + { + "epoch": 6.482878108438647, + "grad_norm": 7.6953864097595215, + "learning_rate": 1.8997995651583098e-06, + "loss": 2.7672, + "step": 95415 + }, + { + "epoch": 6.483217828509308, + "grad_norm": 7.640289306640625, + "learning_rate": 1.8993749150699824e-06, + "loss": 2.8059, + "step": 95420 + }, + { + "epoch": 6.48355754857997, + "grad_norm": 5.996829032897949, + "learning_rate": 1.8989502649816552e-06, + "loss": 2.7581, + "step": 95425 + }, + { + "epoch": 6.483897268650632, + "grad_norm": 7.711213111877441, + "learning_rate": 1.8985256148933282e-06, + "loss": 2.7146, + "step": 95430 + }, + { + "epoch": 6.484236988721293, + "grad_norm": 7.853606700897217, + "learning_rate": 1.8981009648050008e-06, + "loss": 2.76, + "step": 95435 + }, + { + "epoch": 6.4845767087919555, + "grad_norm": 5.932071685791016, + "learning_rate": 1.8976763147166734e-06, + "loss": 2.7781, + "step": 95440 + }, + { + "epoch": 6.4849164288626175, + "grad_norm": 7.178204536437988, + "learning_rate": 1.8972516646283464e-06, + "loss": 2.7226, + "step": 95445 + }, + { + "epoch": 6.485256148933279, + "grad_norm": 9.431085586547852, + "learning_rate": 1.8968270145400192e-06, + "loss": 2.755, + "step": 95450 + }, + { + "epoch": 6.485595869003941, + "grad_norm": 7.380239963531494, + "learning_rate": 1.8964023644516918e-06, + "loss": 2.7217, + "step": 95455 + }, + { + "epoch": 6.485935589074603, + "grad_norm": 6.977940082550049, + "learning_rate": 1.8959777143633648e-06, + "loss": 2.7645, + "step": 95460 + }, + { + "epoch": 6.486275309145264, + "grad_norm": 8.324806213378906, + "learning_rate": 1.8955530642750374e-06, + "loss": 3.0319, + "step": 95465 + }, + { + "epoch": 6.486615029215926, + "grad_norm": 7.41068172454834, + "learning_rate": 1.8951284141867102e-06, + "loss": 2.7354, + "step": 95470 + }, + { + "epoch": 6.486954749286588, + "grad_norm": 9.415226936340332, + "learning_rate": 1.8947037640983832e-06, + "loss": 2.9362, + "step": 95475 + }, + { + "epoch": 6.487294469357249, + "grad_norm": 7.0434064865112305, + "learning_rate": 1.8942791140100558e-06, + "loss": 2.6079, + "step": 95480 + }, + { + "epoch": 6.4876341894279115, + "grad_norm": 7.742704391479492, + "learning_rate": 1.8938544639217288e-06, + "loss": 2.8811, + "step": 95485 + }, + { + "epoch": 6.4879739094985736, + "grad_norm": 8.469873428344727, + "learning_rate": 1.8934298138334014e-06, + "loss": 2.6909, + "step": 95490 + }, + { + "epoch": 6.488313629569235, + "grad_norm": 5.9360833168029785, + "learning_rate": 1.8930051637450742e-06, + "loss": 2.6934, + "step": 95495 + }, + { + "epoch": 6.488653349639897, + "grad_norm": 7.85503625869751, + "learning_rate": 1.892580513656747e-06, + "loss": 2.7395, + "step": 95500 + }, + { + "epoch": 6.488993069710559, + "grad_norm": 7.929828643798828, + "learning_rate": 1.8921558635684198e-06, + "loss": 2.5598, + "step": 95505 + }, + { + "epoch": 6.48933278978122, + "grad_norm": 6.239131927490234, + "learning_rate": 1.8917312134800924e-06, + "loss": 2.7315, + "step": 95510 + }, + { + "epoch": 6.489672509851882, + "grad_norm": 8.4684476852417, + "learning_rate": 1.8913065633917654e-06, + "loss": 2.7973, + "step": 95515 + }, + { + "epoch": 6.490012229922544, + "grad_norm": 7.523194313049316, + "learning_rate": 1.890881913303438e-06, + "loss": 2.9863, + "step": 95520 + }, + { + "epoch": 6.490351949993205, + "grad_norm": 9.100685119628906, + "learning_rate": 1.8904572632151108e-06, + "loss": 2.5508, + "step": 95525 + }, + { + "epoch": 6.4906916700638675, + "grad_norm": 6.2138671875, + "learning_rate": 1.8900326131267838e-06, + "loss": 2.5613, + "step": 95530 + }, + { + "epoch": 6.491031390134529, + "grad_norm": 8.212746620178223, + "learning_rate": 1.8896079630384564e-06, + "loss": 2.6389, + "step": 95535 + }, + { + "epoch": 6.491371110205191, + "grad_norm": 8.768271446228027, + "learning_rate": 1.8891833129501292e-06, + "loss": 2.9379, + "step": 95540 + }, + { + "epoch": 6.491710830275853, + "grad_norm": 7.242812633514404, + "learning_rate": 1.888758662861802e-06, + "loss": 2.6527, + "step": 95545 + }, + { + "epoch": 6.492050550346514, + "grad_norm": 7.706879138946533, + "learning_rate": 1.8883340127734748e-06, + "loss": 2.5563, + "step": 95550 + }, + { + "epoch": 6.492390270417176, + "grad_norm": 7.468216419219971, + "learning_rate": 1.8879093626851474e-06, + "loss": 2.7516, + "step": 95555 + }, + { + "epoch": 6.492729990487838, + "grad_norm": 10.143671035766602, + "learning_rate": 1.8874847125968204e-06, + "loss": 2.885, + "step": 95560 + }, + { + "epoch": 6.493069710558499, + "grad_norm": 7.652007102966309, + "learning_rate": 1.887060062508493e-06, + "loss": 2.928, + "step": 95565 + }, + { + "epoch": 6.493409430629161, + "grad_norm": 8.723074913024902, + "learning_rate": 1.886635412420166e-06, + "loss": 2.7097, + "step": 95570 + }, + { + "epoch": 6.4937491506998235, + "grad_norm": 7.378088474273682, + "learning_rate": 1.8862107623318388e-06, + "loss": 2.9486, + "step": 95575 + }, + { + "epoch": 6.494088870770485, + "grad_norm": 7.465143203735352, + "learning_rate": 1.8857861122435114e-06, + "loss": 2.6756, + "step": 95580 + }, + { + "epoch": 6.494428590841147, + "grad_norm": 7.59383487701416, + "learning_rate": 1.8853614621551844e-06, + "loss": 2.8449, + "step": 95585 + }, + { + "epoch": 6.494768310911809, + "grad_norm": 8.575701713562012, + "learning_rate": 1.884936812066857e-06, + "loss": 2.688, + "step": 95590 + }, + { + "epoch": 6.49510803098247, + "grad_norm": 7.36960506439209, + "learning_rate": 1.8845121619785298e-06, + "loss": 2.6224, + "step": 95595 + }, + { + "epoch": 6.495447751053132, + "grad_norm": 6.447954177856445, + "learning_rate": 1.8840875118902026e-06, + "loss": 2.6696, + "step": 95600 + }, + { + "epoch": 6.495787471123794, + "grad_norm": 7.169445991516113, + "learning_rate": 1.8836628618018754e-06, + "loss": 2.6982, + "step": 95605 + }, + { + "epoch": 6.496127191194455, + "grad_norm": 9.709681510925293, + "learning_rate": 1.883238211713548e-06, + "loss": 2.6923, + "step": 95610 + }, + { + "epoch": 6.496466911265117, + "grad_norm": 5.98372745513916, + "learning_rate": 1.882813561625221e-06, + "loss": 2.8848, + "step": 95615 + }, + { + "epoch": 6.4968066313357795, + "grad_norm": 8.620735168457031, + "learning_rate": 1.8823889115368938e-06, + "loss": 2.858, + "step": 95620 + }, + { + "epoch": 6.497146351406441, + "grad_norm": 7.6042561531066895, + "learning_rate": 1.8819642614485664e-06, + "loss": 2.6365, + "step": 95625 + }, + { + "epoch": 6.497486071477103, + "grad_norm": 7.633787631988525, + "learning_rate": 1.8815396113602394e-06, + "loss": 2.7595, + "step": 95630 + }, + { + "epoch": 6.497825791547765, + "grad_norm": 7.236606597900391, + "learning_rate": 1.881114961271912e-06, + "loss": 2.9644, + "step": 95635 + }, + { + "epoch": 6.498165511618426, + "grad_norm": 8.031432151794434, + "learning_rate": 1.8806903111835848e-06, + "loss": 2.6755, + "step": 95640 + }, + { + "epoch": 6.498505231689088, + "grad_norm": 6.809305191040039, + "learning_rate": 1.8802656610952576e-06, + "loss": 2.7786, + "step": 95645 + }, + { + "epoch": 6.49884495175975, + "grad_norm": 8.609596252441406, + "learning_rate": 1.8798410110069304e-06, + "loss": 2.889, + "step": 95650 + }, + { + "epoch": 6.499184671830411, + "grad_norm": 8.373762130737305, + "learning_rate": 1.8794163609186034e-06, + "loss": 2.6024, + "step": 95655 + }, + { + "epoch": 6.4995243919010735, + "grad_norm": 8.130200386047363, + "learning_rate": 1.878991710830276e-06, + "loss": 2.6068, + "step": 95660 + }, + { + "epoch": 6.4998641119717355, + "grad_norm": 7.997067928314209, + "learning_rate": 1.8785670607419488e-06, + "loss": 2.8251, + "step": 95665 + }, + { + "epoch": 6.500203832042397, + "grad_norm": 10.98542308807373, + "learning_rate": 1.8781424106536216e-06, + "loss": 2.7699, + "step": 95670 + }, + { + "epoch": 6.500543552113059, + "grad_norm": 11.562114715576172, + "learning_rate": 1.8777177605652944e-06, + "loss": 2.811, + "step": 95675 + }, + { + "epoch": 6.500883272183721, + "grad_norm": 7.517239093780518, + "learning_rate": 1.877293110476967e-06, + "loss": 2.4047, + "step": 95680 + }, + { + "epoch": 6.501222992254382, + "grad_norm": 6.484041213989258, + "learning_rate": 1.87686846038864e-06, + "loss": 2.6974, + "step": 95685 + }, + { + "epoch": 6.501562712325044, + "grad_norm": 7.891783714294434, + "learning_rate": 1.8764438103003126e-06, + "loss": 2.8269, + "step": 95690 + }, + { + "epoch": 6.501902432395706, + "grad_norm": 6.838071823120117, + "learning_rate": 1.8760191602119854e-06, + "loss": 2.7441, + "step": 95695 + }, + { + "epoch": 6.502242152466367, + "grad_norm": 7.852952003479004, + "learning_rate": 1.8755945101236584e-06, + "loss": 2.8429, + "step": 95700 + }, + { + "epoch": 6.5025818725370295, + "grad_norm": 9.654129028320312, + "learning_rate": 1.875169860035331e-06, + "loss": 2.8361, + "step": 95705 + }, + { + "epoch": 6.5029215926076915, + "grad_norm": 8.448972702026367, + "learning_rate": 1.8747452099470038e-06, + "loss": 2.8549, + "step": 95710 + }, + { + "epoch": 6.503261312678353, + "grad_norm": 9.240806579589844, + "learning_rate": 1.8743205598586766e-06, + "loss": 2.633, + "step": 95715 + }, + { + "epoch": 6.503601032749015, + "grad_norm": 8.32994270324707, + "learning_rate": 1.8738959097703494e-06, + "loss": 2.7705, + "step": 95720 + }, + { + "epoch": 6.503940752819677, + "grad_norm": 7.175226211547852, + "learning_rate": 1.873471259682022e-06, + "loss": 2.9125, + "step": 95725 + }, + { + "epoch": 6.504280472890338, + "grad_norm": 9.759556770324707, + "learning_rate": 1.873046609593695e-06, + "loss": 2.8313, + "step": 95730 + }, + { + "epoch": 6.504620192961, + "grad_norm": 8.811339378356934, + "learning_rate": 1.8726219595053676e-06, + "loss": 2.6457, + "step": 95735 + }, + { + "epoch": 6.504959913031662, + "grad_norm": 6.9998345375061035, + "learning_rate": 1.8721973094170406e-06, + "loss": 2.7183, + "step": 95740 + }, + { + "epoch": 6.505299633102323, + "grad_norm": 6.663975715637207, + "learning_rate": 1.8717726593287134e-06, + "loss": 2.8631, + "step": 95745 + }, + { + "epoch": 6.5056393531729855, + "grad_norm": 7.405427932739258, + "learning_rate": 1.871348009240386e-06, + "loss": 2.578, + "step": 95750 + }, + { + "epoch": 6.5059790732436475, + "grad_norm": 7.988168239593506, + "learning_rate": 1.870923359152059e-06, + "loss": 2.6758, + "step": 95755 + }, + { + "epoch": 6.506318793314309, + "grad_norm": 7.128800868988037, + "learning_rate": 1.8704987090637316e-06, + "loss": 2.8447, + "step": 95760 + }, + { + "epoch": 6.506658513384971, + "grad_norm": 6.4223175048828125, + "learning_rate": 1.8700740589754044e-06, + "loss": 2.7587, + "step": 95765 + }, + { + "epoch": 6.506998233455633, + "grad_norm": 7.2729811668396, + "learning_rate": 1.8696494088870772e-06, + "loss": 2.8579, + "step": 95770 + }, + { + "epoch": 6.507337953526294, + "grad_norm": 7.387099742889404, + "learning_rate": 1.86922475879875e-06, + "loss": 2.7034, + "step": 95775 + }, + { + "epoch": 6.507677673596956, + "grad_norm": 7.153995990753174, + "learning_rate": 1.8688001087104226e-06, + "loss": 2.9237, + "step": 95780 + }, + { + "epoch": 6.508017393667618, + "grad_norm": 5.88897705078125, + "learning_rate": 1.8683754586220956e-06, + "loss": 3.0223, + "step": 95785 + }, + { + "epoch": 6.508357113738279, + "grad_norm": 7.6762375831604, + "learning_rate": 1.8679508085337684e-06, + "loss": 2.7705, + "step": 95790 + }, + { + "epoch": 6.5086968338089415, + "grad_norm": 7.903982639312744, + "learning_rate": 1.867526158445441e-06, + "loss": 2.7753, + "step": 95795 + }, + { + "epoch": 6.509036553879604, + "grad_norm": 7.071325302124023, + "learning_rate": 1.867101508357114e-06, + "loss": 2.567, + "step": 95800 + }, + { + "epoch": 6.509376273950265, + "grad_norm": 9.345107078552246, + "learning_rate": 1.8666768582687866e-06, + "loss": 2.7912, + "step": 95805 + }, + { + "epoch": 6.509715994020927, + "grad_norm": 7.745225429534912, + "learning_rate": 1.8662522081804594e-06, + "loss": 2.6552, + "step": 95810 + }, + { + "epoch": 6.510055714091589, + "grad_norm": 6.982185363769531, + "learning_rate": 1.8658275580921322e-06, + "loss": 2.7682, + "step": 95815 + }, + { + "epoch": 6.51039543416225, + "grad_norm": 8.880058288574219, + "learning_rate": 1.865402908003805e-06, + "loss": 2.9553, + "step": 95820 + }, + { + "epoch": 6.510735154232912, + "grad_norm": 7.664175510406494, + "learning_rate": 1.864978257915478e-06, + "loss": 2.5558, + "step": 95825 + }, + { + "epoch": 6.511074874303574, + "grad_norm": 8.38342571258545, + "learning_rate": 1.8645536078271506e-06, + "loss": 2.7391, + "step": 95830 + }, + { + "epoch": 6.511414594374235, + "grad_norm": 6.609884262084961, + "learning_rate": 1.8641289577388232e-06, + "loss": 2.6186, + "step": 95835 + }, + { + "epoch": 6.5117543144448975, + "grad_norm": 9.187339782714844, + "learning_rate": 1.8637043076504962e-06, + "loss": 2.5644, + "step": 95840 + }, + { + "epoch": 6.51209403451556, + "grad_norm": 6.1106438636779785, + "learning_rate": 1.863279657562169e-06, + "loss": 2.8382, + "step": 95845 + }, + { + "epoch": 6.512433754586221, + "grad_norm": 6.247530460357666, + "learning_rate": 1.8628550074738416e-06, + "loss": 2.9863, + "step": 95850 + }, + { + "epoch": 6.512773474656883, + "grad_norm": 8.098893165588379, + "learning_rate": 1.8624303573855146e-06, + "loss": 2.8514, + "step": 95855 + }, + { + "epoch": 6.513113194727545, + "grad_norm": 8.830493927001953, + "learning_rate": 1.8620057072971872e-06, + "loss": 2.7428, + "step": 95860 + }, + { + "epoch": 6.513452914798206, + "grad_norm": 8.013407707214355, + "learning_rate": 1.86158105720886e-06, + "loss": 2.6774, + "step": 95865 + }, + { + "epoch": 6.513792634868868, + "grad_norm": 7.368642330169678, + "learning_rate": 1.861156407120533e-06, + "loss": 2.6404, + "step": 95870 + }, + { + "epoch": 6.51413235493953, + "grad_norm": 7.830506324768066, + "learning_rate": 1.8607317570322056e-06, + "loss": 2.707, + "step": 95875 + }, + { + "epoch": 6.514472075010191, + "grad_norm": 8.94349479675293, + "learning_rate": 1.8603071069438782e-06, + "loss": 2.8484, + "step": 95880 + }, + { + "epoch": 6.5148117950808535, + "grad_norm": 7.893578052520752, + "learning_rate": 1.8598824568555512e-06, + "loss": 2.6296, + "step": 95885 + }, + { + "epoch": 6.515151515151516, + "grad_norm": 10.519274711608887, + "learning_rate": 1.859457806767224e-06, + "loss": 2.8956, + "step": 95890 + }, + { + "epoch": 6.515491235222177, + "grad_norm": 9.150325775146484, + "learning_rate": 1.8590331566788966e-06, + "loss": 2.8695, + "step": 95895 + }, + { + "epoch": 6.515830955292839, + "grad_norm": 7.023410320281982, + "learning_rate": 1.8586085065905696e-06, + "loss": 2.85, + "step": 95900 + }, + { + "epoch": 6.516170675363501, + "grad_norm": 8.506609916687012, + "learning_rate": 1.8581838565022422e-06, + "loss": 2.8047, + "step": 95905 + }, + { + "epoch": 6.516510395434162, + "grad_norm": 9.636197090148926, + "learning_rate": 1.8577592064139152e-06, + "loss": 2.933, + "step": 95910 + }, + { + "epoch": 6.516850115504824, + "grad_norm": 7.598984241485596, + "learning_rate": 1.8573345563255878e-06, + "loss": 3.0454, + "step": 95915 + }, + { + "epoch": 6.517189835575486, + "grad_norm": 7.2191548347473145, + "learning_rate": 1.8569099062372606e-06, + "loss": 2.7745, + "step": 95920 + }, + { + "epoch": 6.517529555646147, + "grad_norm": 8.285683631896973, + "learning_rate": 1.8564852561489336e-06, + "loss": 2.597, + "step": 95925 + }, + { + "epoch": 6.5178692757168095, + "grad_norm": 7.815497398376465, + "learning_rate": 1.8560606060606062e-06, + "loss": 2.5588, + "step": 95930 + }, + { + "epoch": 6.518208995787472, + "grad_norm": 6.302978992462158, + "learning_rate": 1.855635955972279e-06, + "loss": 2.5461, + "step": 95935 + }, + { + "epoch": 6.518548715858133, + "grad_norm": 10.22835636138916, + "learning_rate": 1.8552113058839518e-06, + "loss": 2.8481, + "step": 95940 + }, + { + "epoch": 6.518888435928795, + "grad_norm": 7.127660274505615, + "learning_rate": 1.8547866557956246e-06, + "loss": 3.0333, + "step": 95945 + }, + { + "epoch": 6.519228155999457, + "grad_norm": 7.971288681030273, + "learning_rate": 1.8543620057072972e-06, + "loss": 2.8351, + "step": 95950 + }, + { + "epoch": 6.519567876070118, + "grad_norm": 8.46610164642334, + "learning_rate": 1.8539373556189702e-06, + "loss": 2.764, + "step": 95955 + }, + { + "epoch": 6.51990759614078, + "grad_norm": 7.8346638679504395, + "learning_rate": 1.8535127055306428e-06, + "loss": 2.7914, + "step": 95960 + }, + { + "epoch": 6.520247316211442, + "grad_norm": 8.648024559020996, + "learning_rate": 1.8530880554423156e-06, + "loss": 2.6043, + "step": 95965 + }, + { + "epoch": 6.5205870362821035, + "grad_norm": 7.402613639831543, + "learning_rate": 1.8526634053539886e-06, + "loss": 2.8006, + "step": 95970 + }, + { + "epoch": 6.5209267563527655, + "grad_norm": 7.879322528839111, + "learning_rate": 1.8522387552656612e-06, + "loss": 2.6056, + "step": 95975 + }, + { + "epoch": 6.521266476423427, + "grad_norm": 9.291487693786621, + "learning_rate": 1.851814105177334e-06, + "loss": 2.5243, + "step": 95980 + }, + { + "epoch": 6.521606196494089, + "grad_norm": 6.019532203674316, + "learning_rate": 1.8513894550890068e-06, + "loss": 2.6947, + "step": 95985 + }, + { + "epoch": 6.521945916564751, + "grad_norm": 7.1749773025512695, + "learning_rate": 1.8509648050006796e-06, + "loss": 2.5282, + "step": 95990 + }, + { + "epoch": 6.522285636635412, + "grad_norm": 8.495203018188477, + "learning_rate": 1.8505401549123526e-06, + "loss": 2.8663, + "step": 95995 + }, + { + "epoch": 6.522625356706074, + "grad_norm": 6.85206413269043, + "learning_rate": 1.8501155048240252e-06, + "loss": 2.6364, + "step": 96000 + }, + { + "epoch": 6.522965076776736, + "grad_norm": 6.584136962890625, + "learning_rate": 1.8496908547356978e-06, + "loss": 2.5553, + "step": 96005 + }, + { + "epoch": 6.523304796847397, + "grad_norm": 8.077330589294434, + "learning_rate": 1.8492662046473708e-06, + "loss": 2.7263, + "step": 96010 + }, + { + "epoch": 6.5236445169180595, + "grad_norm": 8.977319717407227, + "learning_rate": 1.8488415545590436e-06, + "loss": 2.8126, + "step": 96015 + }, + { + "epoch": 6.5239842369887215, + "grad_norm": 8.024284362792969, + "learning_rate": 1.8484169044707162e-06, + "loss": 2.7938, + "step": 96020 + }, + { + "epoch": 6.524323957059383, + "grad_norm": 7.324007511138916, + "learning_rate": 1.8479922543823892e-06, + "loss": 2.8526, + "step": 96025 + }, + { + "epoch": 6.524663677130045, + "grad_norm": 6.738648414611816, + "learning_rate": 1.8475676042940618e-06, + "loss": 2.6874, + "step": 96030 + }, + { + "epoch": 6.525003397200707, + "grad_norm": 8.147062301635742, + "learning_rate": 1.8471429542057346e-06, + "loss": 2.7398, + "step": 96035 + }, + { + "epoch": 6.525343117271368, + "grad_norm": 8.98824405670166, + "learning_rate": 1.8467183041174074e-06, + "loss": 2.6405, + "step": 96040 + }, + { + "epoch": 6.52568283734203, + "grad_norm": 6.915759563446045, + "learning_rate": 1.8462936540290802e-06, + "loss": 2.8959, + "step": 96045 + }, + { + "epoch": 6.526022557412692, + "grad_norm": 8.608049392700195, + "learning_rate": 1.8458690039407528e-06, + "loss": 2.7192, + "step": 96050 + }, + { + "epoch": 6.526362277483353, + "grad_norm": 9.495339393615723, + "learning_rate": 1.8454443538524258e-06, + "loss": 3.0043, + "step": 96055 + }, + { + "epoch": 6.5267019975540155, + "grad_norm": 7.996838092803955, + "learning_rate": 1.8450197037640986e-06, + "loss": 2.9015, + "step": 96060 + }, + { + "epoch": 6.5270417176246776, + "grad_norm": 7.423856258392334, + "learning_rate": 1.8445950536757712e-06, + "loss": 2.6077, + "step": 96065 + }, + { + "epoch": 6.527381437695339, + "grad_norm": 7.486032009124756, + "learning_rate": 1.8441704035874442e-06, + "loss": 2.7788, + "step": 96070 + }, + { + "epoch": 6.527721157766001, + "grad_norm": 6.962563991546631, + "learning_rate": 1.8437457534991168e-06, + "loss": 2.741, + "step": 96075 + }, + { + "epoch": 6.528060877836663, + "grad_norm": 11.956891059875488, + "learning_rate": 1.8433211034107898e-06, + "loss": 2.9524, + "step": 96080 + }, + { + "epoch": 6.528400597907324, + "grad_norm": 7.500802993774414, + "learning_rate": 1.8428964533224624e-06, + "loss": 2.9742, + "step": 96085 + }, + { + "epoch": 6.528740317977986, + "grad_norm": 8.119436264038086, + "learning_rate": 1.8424718032341352e-06, + "loss": 2.4514, + "step": 96090 + }, + { + "epoch": 6.529080038048648, + "grad_norm": 6.834690570831299, + "learning_rate": 1.8420471531458082e-06, + "loss": 2.6645, + "step": 96095 + }, + { + "epoch": 6.529419758119309, + "grad_norm": 8.290087699890137, + "learning_rate": 1.8416225030574808e-06, + "loss": 2.8075, + "step": 96100 + }, + { + "epoch": 6.5297594781899715, + "grad_norm": 6.630476474761963, + "learning_rate": 1.8411978529691536e-06, + "loss": 2.6902, + "step": 96105 + }, + { + "epoch": 6.530099198260634, + "grad_norm": 9.82726764678955, + "learning_rate": 1.8407732028808264e-06, + "loss": 2.9306, + "step": 96110 + }, + { + "epoch": 6.530438918331295, + "grad_norm": 8.41440200805664, + "learning_rate": 1.8403485527924992e-06, + "loss": 2.9679, + "step": 96115 + }, + { + "epoch": 6.530778638401957, + "grad_norm": 6.473034381866455, + "learning_rate": 1.8399239027041718e-06, + "loss": 2.8106, + "step": 96120 + }, + { + "epoch": 6.531118358472619, + "grad_norm": 7.808239936828613, + "learning_rate": 1.8394992526158448e-06, + "loss": 2.7481, + "step": 96125 + }, + { + "epoch": 6.53145807854328, + "grad_norm": 8.957684516906738, + "learning_rate": 1.8390746025275174e-06, + "loss": 2.9212, + "step": 96130 + }, + { + "epoch": 6.531797798613942, + "grad_norm": 7.15973424911499, + "learning_rate": 1.8386499524391902e-06, + "loss": 2.7213, + "step": 96135 + }, + { + "epoch": 6.532137518684604, + "grad_norm": 6.111083984375, + "learning_rate": 1.8382253023508632e-06, + "loss": 2.3208, + "step": 96140 + }, + { + "epoch": 6.532477238755265, + "grad_norm": 6.586544513702393, + "learning_rate": 1.8378006522625358e-06, + "loss": 2.6347, + "step": 96145 + }, + { + "epoch": 6.5328169588259275, + "grad_norm": 7.024056434631348, + "learning_rate": 1.8373760021742084e-06, + "loss": 2.7957, + "step": 96150 + }, + { + "epoch": 6.53315667889659, + "grad_norm": 7.006433486938477, + "learning_rate": 1.8369513520858814e-06, + "loss": 2.6326, + "step": 96155 + }, + { + "epoch": 6.533496398967251, + "grad_norm": 7.125040054321289, + "learning_rate": 1.8365267019975542e-06, + "loss": 2.6996, + "step": 96160 + }, + { + "epoch": 6.533836119037913, + "grad_norm": 7.072227478027344, + "learning_rate": 1.836102051909227e-06, + "loss": 2.8198, + "step": 96165 + }, + { + "epoch": 6.534175839108575, + "grad_norm": 8.086432456970215, + "learning_rate": 1.8356774018208998e-06, + "loss": 2.6832, + "step": 96170 + }, + { + "epoch": 6.534515559179236, + "grad_norm": 7.263732433319092, + "learning_rate": 1.8352527517325724e-06, + "loss": 2.9415, + "step": 96175 + }, + { + "epoch": 6.534855279249898, + "grad_norm": 8.457895278930664, + "learning_rate": 1.8348281016442454e-06, + "loss": 2.5806, + "step": 96180 + }, + { + "epoch": 6.535194999320559, + "grad_norm": 6.567202091217041, + "learning_rate": 1.8344034515559182e-06, + "loss": 2.9073, + "step": 96185 + }, + { + "epoch": 6.535534719391221, + "grad_norm": 7.669363975524902, + "learning_rate": 1.8339788014675908e-06, + "loss": 2.8493, + "step": 96190 + }, + { + "epoch": 6.5358744394618835, + "grad_norm": 9.85775089263916, + "learning_rate": 1.8335541513792638e-06, + "loss": 2.6728, + "step": 96195 + }, + { + "epoch": 6.536214159532545, + "grad_norm": 8.605064392089844, + "learning_rate": 1.8331295012909364e-06, + "loss": 2.4717, + "step": 96200 + }, + { + "epoch": 6.536553879603207, + "grad_norm": 6.948742866516113, + "learning_rate": 1.8327048512026092e-06, + "loss": 2.9335, + "step": 96205 + }, + { + "epoch": 6.536893599673869, + "grad_norm": 8.248838424682617, + "learning_rate": 1.832280201114282e-06, + "loss": 2.8482, + "step": 96210 + }, + { + "epoch": 6.53723331974453, + "grad_norm": 8.619105339050293, + "learning_rate": 1.8318555510259548e-06, + "loss": 2.8155, + "step": 96215 + }, + { + "epoch": 6.537573039815192, + "grad_norm": 7.811519145965576, + "learning_rate": 1.8314309009376274e-06, + "loss": 2.7102, + "step": 96220 + }, + { + "epoch": 6.537912759885854, + "grad_norm": 9.368132591247559, + "learning_rate": 1.8310062508493004e-06, + "loss": 2.5655, + "step": 96225 + }, + { + "epoch": 6.538252479956515, + "grad_norm": 8.828171730041504, + "learning_rate": 1.830581600760973e-06, + "loss": 2.5788, + "step": 96230 + }, + { + "epoch": 6.5385922000271774, + "grad_norm": 7.387315273284912, + "learning_rate": 1.8301569506726458e-06, + "loss": 2.6872, + "step": 96235 + }, + { + "epoch": 6.5389319200978395, + "grad_norm": 7.763647079467773, + "learning_rate": 1.8297323005843188e-06, + "loss": 2.7234, + "step": 96240 + }, + { + "epoch": 6.539271640168501, + "grad_norm": 8.068455696105957, + "learning_rate": 1.8293076504959914e-06, + "loss": 2.9015, + "step": 96245 + }, + { + "epoch": 6.539611360239163, + "grad_norm": 6.928471088409424, + "learning_rate": 1.8288830004076644e-06, + "loss": 2.5166, + "step": 96250 + }, + { + "epoch": 6.539951080309825, + "grad_norm": 6.698517322540283, + "learning_rate": 1.828458350319337e-06, + "loss": 2.6848, + "step": 96255 + }, + { + "epoch": 6.540290800380486, + "grad_norm": 8.223236083984375, + "learning_rate": 1.8280337002310098e-06, + "loss": 2.7546, + "step": 96260 + }, + { + "epoch": 6.540630520451148, + "grad_norm": 5.493038654327393, + "learning_rate": 1.8276090501426828e-06, + "loss": 2.9511, + "step": 96265 + }, + { + "epoch": 6.54097024052181, + "grad_norm": 6.954012393951416, + "learning_rate": 1.8271844000543554e-06, + "loss": 2.5491, + "step": 96270 + }, + { + "epoch": 6.541309960592471, + "grad_norm": 7.947055816650391, + "learning_rate": 1.826759749966028e-06, + "loss": 2.7924, + "step": 96275 + }, + { + "epoch": 6.5416496806631335, + "grad_norm": 8.644485473632812, + "learning_rate": 1.826335099877701e-06, + "loss": 2.4126, + "step": 96280 + }, + { + "epoch": 6.5419894007337955, + "grad_norm": 7.492074489593506, + "learning_rate": 1.8259104497893738e-06, + "loss": 2.619, + "step": 96285 + }, + { + "epoch": 6.542329120804457, + "grad_norm": 6.727151870727539, + "learning_rate": 1.8254857997010464e-06, + "loss": 2.985, + "step": 96290 + }, + { + "epoch": 6.542668840875119, + "grad_norm": 7.9575910568237305, + "learning_rate": 1.8250611496127194e-06, + "loss": 2.9589, + "step": 96295 + }, + { + "epoch": 6.543008560945781, + "grad_norm": 6.680161952972412, + "learning_rate": 1.824636499524392e-06, + "loss": 2.7508, + "step": 96300 + }, + { + "epoch": 6.543348281016442, + "grad_norm": 8.087322235107422, + "learning_rate": 1.8242118494360648e-06, + "loss": 2.7344, + "step": 96305 + }, + { + "epoch": 6.543688001087104, + "grad_norm": 6.784502983093262, + "learning_rate": 1.8237871993477376e-06, + "loss": 2.7573, + "step": 96310 + }, + { + "epoch": 6.544027721157766, + "grad_norm": 7.829084396362305, + "learning_rate": 1.8233625492594104e-06, + "loss": 3.0554, + "step": 96315 + }, + { + "epoch": 6.544367441228427, + "grad_norm": 7.525301456451416, + "learning_rate": 1.822937899171083e-06, + "loss": 2.6719, + "step": 96320 + }, + { + "epoch": 6.5447071612990895, + "grad_norm": 8.897469520568848, + "learning_rate": 1.822513249082756e-06, + "loss": 2.7655, + "step": 96325 + }, + { + "epoch": 6.5450468813697515, + "grad_norm": 7.364392280578613, + "learning_rate": 1.8220885989944288e-06, + "loss": 2.4416, + "step": 96330 + }, + { + "epoch": 6.545386601440413, + "grad_norm": 8.227490425109863, + "learning_rate": 1.8216639489061016e-06, + "loss": 2.8557, + "step": 96335 + }, + { + "epoch": 6.545726321511075, + "grad_norm": 6.1433281898498535, + "learning_rate": 1.8212392988177744e-06, + "loss": 3.0259, + "step": 96340 + }, + { + "epoch": 6.546066041581737, + "grad_norm": 8.150893211364746, + "learning_rate": 1.820814648729447e-06, + "loss": 2.6885, + "step": 96345 + }, + { + "epoch": 6.546405761652398, + "grad_norm": 7.117456912994385, + "learning_rate": 1.82038999864112e-06, + "loss": 2.6154, + "step": 96350 + }, + { + "epoch": 6.54674548172306, + "grad_norm": 8.116875648498535, + "learning_rate": 1.8199653485527926e-06, + "loss": 2.7262, + "step": 96355 + }, + { + "epoch": 6.547085201793722, + "grad_norm": 8.51578140258789, + "learning_rate": 1.8195406984644654e-06, + "loss": 2.7045, + "step": 96360 + }, + { + "epoch": 6.547424921864383, + "grad_norm": 7.821852207183838, + "learning_rate": 1.8191160483761384e-06, + "loss": 2.9017, + "step": 96365 + }, + { + "epoch": 6.5477646419350455, + "grad_norm": 6.499422073364258, + "learning_rate": 1.818691398287811e-06, + "loss": 2.8326, + "step": 96370 + }, + { + "epoch": 6.548104362005708, + "grad_norm": 7.6697235107421875, + "learning_rate": 1.8182667481994838e-06, + "loss": 2.5563, + "step": 96375 + }, + { + "epoch": 6.548444082076369, + "grad_norm": 7.252240180969238, + "learning_rate": 1.8178420981111566e-06, + "loss": 2.7525, + "step": 96380 + }, + { + "epoch": 6.548783802147031, + "grad_norm": 7.621118545532227, + "learning_rate": 1.8174174480228294e-06, + "loss": 2.6981, + "step": 96385 + }, + { + "epoch": 6.549123522217693, + "grad_norm": 7.648522853851318, + "learning_rate": 1.816992797934502e-06, + "loss": 2.8153, + "step": 96390 + }, + { + "epoch": 6.549463242288354, + "grad_norm": 7.629059791564941, + "learning_rate": 1.816568147846175e-06, + "loss": 2.8196, + "step": 96395 + }, + { + "epoch": 6.549802962359016, + "grad_norm": 7.327462196350098, + "learning_rate": 1.8161434977578476e-06, + "loss": 2.6745, + "step": 96400 + }, + { + "epoch": 6.550142682429678, + "grad_norm": 7.47981071472168, + "learning_rate": 1.8157188476695204e-06, + "loss": 2.7585, + "step": 96405 + }, + { + "epoch": 6.550482402500339, + "grad_norm": 8.357397079467773, + "learning_rate": 1.8152941975811934e-06, + "loss": 2.8033, + "step": 96410 + }, + { + "epoch": 6.5508221225710015, + "grad_norm": 8.093677520751953, + "learning_rate": 1.814869547492866e-06, + "loss": 2.8051, + "step": 96415 + }, + { + "epoch": 6.551161842641664, + "grad_norm": 7.591118812561035, + "learning_rate": 1.814444897404539e-06, + "loss": 2.742, + "step": 96420 + }, + { + "epoch": 6.551501562712325, + "grad_norm": 9.890850067138672, + "learning_rate": 1.8140202473162116e-06, + "loss": 2.9, + "step": 96425 + }, + { + "epoch": 6.551841282782987, + "grad_norm": 6.790143966674805, + "learning_rate": 1.8135955972278844e-06, + "loss": 2.846, + "step": 96430 + }, + { + "epoch": 6.552181002853649, + "grad_norm": 8.592902183532715, + "learning_rate": 1.8131709471395572e-06, + "loss": 2.943, + "step": 96435 + }, + { + "epoch": 6.55252072292431, + "grad_norm": 6.082153797149658, + "learning_rate": 1.81274629705123e-06, + "loss": 2.5666, + "step": 96440 + }, + { + "epoch": 6.552860442994972, + "grad_norm": 8.419458389282227, + "learning_rate": 1.8123216469629026e-06, + "loss": 2.4714, + "step": 96445 + }, + { + "epoch": 6.553200163065634, + "grad_norm": 8.528735160827637, + "learning_rate": 1.8118969968745756e-06, + "loss": 2.7379, + "step": 96450 + }, + { + "epoch": 6.553539883136295, + "grad_norm": 8.492977142333984, + "learning_rate": 1.8114723467862484e-06, + "loss": 2.7078, + "step": 96455 + }, + { + "epoch": 6.5538796032069575, + "grad_norm": 6.566711902618408, + "learning_rate": 1.811047696697921e-06, + "loss": 2.614, + "step": 96460 + }, + { + "epoch": 6.55421932327762, + "grad_norm": 7.558453559875488, + "learning_rate": 1.810623046609594e-06, + "loss": 2.6896, + "step": 96465 + }, + { + "epoch": 6.554559043348281, + "grad_norm": 6.637338161468506, + "learning_rate": 1.8101983965212666e-06, + "loss": 3.03, + "step": 96470 + }, + { + "epoch": 6.554898763418943, + "grad_norm": 10.179574966430664, + "learning_rate": 1.8097737464329394e-06, + "loss": 2.8197, + "step": 96475 + }, + { + "epoch": 6.555238483489605, + "grad_norm": 7.638820648193359, + "learning_rate": 1.8093490963446122e-06, + "loss": 2.6792, + "step": 96480 + }, + { + "epoch": 6.555578203560266, + "grad_norm": 6.898288726806641, + "learning_rate": 1.808924446256285e-06, + "loss": 2.7227, + "step": 96485 + }, + { + "epoch": 6.555917923630928, + "grad_norm": 8.681228637695312, + "learning_rate": 1.8084997961679576e-06, + "loss": 2.7607, + "step": 96490 + }, + { + "epoch": 6.55625764370159, + "grad_norm": 9.465142250061035, + "learning_rate": 1.8080751460796306e-06, + "loss": 2.4617, + "step": 96495 + }, + { + "epoch": 6.556597363772251, + "grad_norm": 9.102312088012695, + "learning_rate": 1.8076504959913034e-06, + "loss": 2.8289, + "step": 96500 + }, + { + "epoch": 6.5569370838429135, + "grad_norm": 7.386853218078613, + "learning_rate": 1.8072258459029762e-06, + "loss": 2.6914, + "step": 96505 + }, + { + "epoch": 6.557276803913576, + "grad_norm": 7.584442615509033, + "learning_rate": 1.806801195814649e-06, + "loss": 2.8088, + "step": 96510 + }, + { + "epoch": 6.557616523984237, + "grad_norm": 7.837776184082031, + "learning_rate": 1.8063765457263216e-06, + "loss": 2.6847, + "step": 96515 + }, + { + "epoch": 6.557956244054899, + "grad_norm": 6.929447650909424, + "learning_rate": 1.8059518956379946e-06, + "loss": 2.8738, + "step": 96520 + }, + { + "epoch": 6.558295964125561, + "grad_norm": 9.865377426147461, + "learning_rate": 1.8055272455496672e-06, + "loss": 2.9292, + "step": 96525 + }, + { + "epoch": 6.558635684196222, + "grad_norm": 7.268836498260498, + "learning_rate": 1.80510259546134e-06, + "loss": 2.8585, + "step": 96530 + }, + { + "epoch": 6.558975404266884, + "grad_norm": 9.77476978302002, + "learning_rate": 1.804677945373013e-06, + "loss": 3.0176, + "step": 96535 + }, + { + "epoch": 6.559315124337546, + "grad_norm": 6.570459842681885, + "learning_rate": 1.8042532952846856e-06, + "loss": 2.7692, + "step": 96540 + }, + { + "epoch": 6.5596548444082075, + "grad_norm": 6.392986297607422, + "learning_rate": 1.8038286451963582e-06, + "loss": 2.7662, + "step": 96545 + }, + { + "epoch": 6.5599945644788695, + "grad_norm": 8.06689453125, + "learning_rate": 1.8034039951080312e-06, + "loss": 2.9137, + "step": 96550 + }, + { + "epoch": 6.560334284549532, + "grad_norm": 8.151798248291016, + "learning_rate": 1.802979345019704e-06, + "loss": 2.673, + "step": 96555 + }, + { + "epoch": 6.560674004620193, + "grad_norm": 7.317183494567871, + "learning_rate": 1.8025546949313766e-06, + "loss": 2.8069, + "step": 96560 + }, + { + "epoch": 6.561013724690855, + "grad_norm": 8.890661239624023, + "learning_rate": 1.8021300448430496e-06, + "loss": 2.66, + "step": 96565 + }, + { + "epoch": 6.561353444761517, + "grad_norm": 7.117705345153809, + "learning_rate": 1.8017053947547222e-06, + "loss": 2.5389, + "step": 96570 + }, + { + "epoch": 6.561693164832178, + "grad_norm": 7.591373920440674, + "learning_rate": 1.801280744666395e-06, + "loss": 2.5918, + "step": 96575 + }, + { + "epoch": 6.56203288490284, + "grad_norm": 5.9741740226745605, + "learning_rate": 1.800856094578068e-06, + "loss": 2.7993, + "step": 96580 + }, + { + "epoch": 6.562372604973502, + "grad_norm": 8.471784591674805, + "learning_rate": 1.8004314444897406e-06, + "loss": 2.5325, + "step": 96585 + }, + { + "epoch": 6.5627123250441635, + "grad_norm": 7.904341697692871, + "learning_rate": 1.8000067944014136e-06, + "loss": 2.6792, + "step": 96590 + }, + { + "epoch": 6.5630520451148255, + "grad_norm": 7.819769859313965, + "learning_rate": 1.7995821443130862e-06, + "loss": 2.6315, + "step": 96595 + }, + { + "epoch": 6.563391765185488, + "grad_norm": 8.322093963623047, + "learning_rate": 1.799157494224759e-06, + "loss": 2.7082, + "step": 96600 + }, + { + "epoch": 6.563731485256149, + "grad_norm": 8.234905242919922, + "learning_rate": 1.7987328441364318e-06, + "loss": 2.6214, + "step": 96605 + }, + { + "epoch": 6.564071205326811, + "grad_norm": 6.531400680541992, + "learning_rate": 1.7983081940481046e-06, + "loss": 2.7783, + "step": 96610 + }, + { + "epoch": 6.564410925397473, + "grad_norm": 6.501478672027588, + "learning_rate": 1.7978835439597772e-06, + "loss": 2.9002, + "step": 96615 + }, + { + "epoch": 6.564750645468134, + "grad_norm": 7.099910259246826, + "learning_rate": 1.7974588938714502e-06, + "loss": 2.6264, + "step": 96620 + }, + { + "epoch": 6.565090365538796, + "grad_norm": 9.067872047424316, + "learning_rate": 1.7970342437831228e-06, + "loss": 2.6902, + "step": 96625 + }, + { + "epoch": 6.565430085609458, + "grad_norm": 6.377791404724121, + "learning_rate": 1.7966095936947956e-06, + "loss": 2.6642, + "step": 96630 + }, + { + "epoch": 6.5657698056801195, + "grad_norm": 7.73051643371582, + "learning_rate": 1.7961849436064686e-06, + "loss": 2.8065, + "step": 96635 + }, + { + "epoch": 6.5661095257507816, + "grad_norm": 8.359081268310547, + "learning_rate": 1.7957602935181412e-06, + "loss": 2.5631, + "step": 96640 + }, + { + "epoch": 6.566449245821444, + "grad_norm": 8.443819046020508, + "learning_rate": 1.795335643429814e-06, + "loss": 2.7486, + "step": 96645 + }, + { + "epoch": 6.566788965892105, + "grad_norm": 8.621193885803223, + "learning_rate": 1.7949109933414868e-06, + "loss": 3.0059, + "step": 96650 + }, + { + "epoch": 6.567128685962767, + "grad_norm": 7.738986968994141, + "learning_rate": 1.7944863432531596e-06, + "loss": 2.875, + "step": 96655 + }, + { + "epoch": 6.567468406033428, + "grad_norm": 7.677167892456055, + "learning_rate": 1.7940616931648321e-06, + "loss": 2.8175, + "step": 96660 + }, + { + "epoch": 6.56780812610409, + "grad_norm": 8.702792167663574, + "learning_rate": 1.7936370430765052e-06, + "loss": 2.9264, + "step": 96665 + }, + { + "epoch": 6.568147846174752, + "grad_norm": 7.541313648223877, + "learning_rate": 1.7932123929881777e-06, + "loss": 2.7673, + "step": 96670 + }, + { + "epoch": 6.568487566245413, + "grad_norm": 7.289279460906982, + "learning_rate": 1.7927877428998508e-06, + "loss": 2.9362, + "step": 96675 + }, + { + "epoch": 6.5688272863160755, + "grad_norm": 6.886176586151123, + "learning_rate": 1.7923630928115236e-06, + "loss": 2.8027, + "step": 96680 + }, + { + "epoch": 6.569167006386738, + "grad_norm": 8.355775833129883, + "learning_rate": 1.7919384427231962e-06, + "loss": 2.8303, + "step": 96685 + }, + { + "epoch": 6.569506726457399, + "grad_norm": 9.03452205657959, + "learning_rate": 1.7915137926348692e-06, + "loss": 2.713, + "step": 96690 + }, + { + "epoch": 6.569846446528061, + "grad_norm": 9.537426948547363, + "learning_rate": 1.7910891425465418e-06, + "loss": 2.5283, + "step": 96695 + }, + { + "epoch": 6.570186166598723, + "grad_norm": 7.80419921875, + "learning_rate": 1.7906644924582146e-06, + "loss": 2.6912, + "step": 96700 + }, + { + "epoch": 6.570525886669384, + "grad_norm": 7.757489204406738, + "learning_rate": 1.7902398423698874e-06, + "loss": 2.5727, + "step": 96705 + }, + { + "epoch": 6.570865606740046, + "grad_norm": 7.7425312995910645, + "learning_rate": 1.7898151922815602e-06, + "loss": 2.5422, + "step": 96710 + }, + { + "epoch": 6.571205326810708, + "grad_norm": 8.383624076843262, + "learning_rate": 1.7893905421932327e-06, + "loss": 2.6892, + "step": 96715 + }, + { + "epoch": 6.571545046881369, + "grad_norm": 8.389318466186523, + "learning_rate": 1.7889658921049058e-06, + "loss": 2.7965, + "step": 96720 + }, + { + "epoch": 6.5718847669520315, + "grad_norm": 6.451775074005127, + "learning_rate": 1.7885412420165786e-06, + "loss": 2.4862, + "step": 96725 + }, + { + "epoch": 6.572224487022694, + "grad_norm": 7.3302459716796875, + "learning_rate": 1.7881165919282511e-06, + "loss": 2.5567, + "step": 96730 + }, + { + "epoch": 6.572564207093355, + "grad_norm": 8.621257781982422, + "learning_rate": 1.7876919418399242e-06, + "loss": 2.6125, + "step": 96735 + }, + { + "epoch": 6.572903927164017, + "grad_norm": 7.855593681335449, + "learning_rate": 1.7872672917515967e-06, + "loss": 2.8143, + "step": 96740 + }, + { + "epoch": 6.573243647234679, + "grad_norm": 8.244879722595215, + "learning_rate": 1.7868426416632695e-06, + "loss": 2.4791, + "step": 96745 + }, + { + "epoch": 6.57358336730534, + "grad_norm": 6.803488254547119, + "learning_rate": 1.7864179915749423e-06, + "loss": 2.9385, + "step": 96750 + }, + { + "epoch": 6.573923087376002, + "grad_norm": 7.082513809204102, + "learning_rate": 1.7859933414866152e-06, + "loss": 2.7146, + "step": 96755 + }, + { + "epoch": 6.574262807446664, + "grad_norm": 10.9163818359375, + "learning_rate": 1.7855686913982882e-06, + "loss": 2.7694, + "step": 96760 + }, + { + "epoch": 6.574602527517325, + "grad_norm": 7.686572074890137, + "learning_rate": 1.7851440413099608e-06, + "loss": 2.8021, + "step": 96765 + }, + { + "epoch": 6.5749422475879875, + "grad_norm": 9.687129020690918, + "learning_rate": 1.7847193912216336e-06, + "loss": 2.5819, + "step": 96770 + }, + { + "epoch": 6.57528196765865, + "grad_norm": 7.322307109832764, + "learning_rate": 1.7842947411333064e-06, + "loss": 2.7897, + "step": 96775 + }, + { + "epoch": 6.575621687729311, + "grad_norm": 6.932802677154541, + "learning_rate": 1.7838700910449792e-06, + "loss": 2.6405, + "step": 96780 + }, + { + "epoch": 6.575961407799973, + "grad_norm": 7.131968975067139, + "learning_rate": 1.7834454409566517e-06, + "loss": 2.7735, + "step": 96785 + }, + { + "epoch": 6.576301127870635, + "grad_norm": 8.894665718078613, + "learning_rate": 1.7830207908683248e-06, + "loss": 2.6922, + "step": 96790 + }, + { + "epoch": 6.576640847941296, + "grad_norm": 7.486577987670898, + "learning_rate": 1.7825961407799973e-06, + "loss": 2.8258, + "step": 96795 + }, + { + "epoch": 6.576980568011958, + "grad_norm": 8.818733215332031, + "learning_rate": 1.7821714906916701e-06, + "loss": 2.9407, + "step": 96800 + }, + { + "epoch": 6.57732028808262, + "grad_norm": 9.111137390136719, + "learning_rate": 1.7817468406033432e-06, + "loss": 2.7786, + "step": 96805 + }, + { + "epoch": 6.5776600081532814, + "grad_norm": 6.552361965179443, + "learning_rate": 1.7813221905150157e-06, + "loss": 3.0175, + "step": 96810 + }, + { + "epoch": 6.5779997282239435, + "grad_norm": 8.256339073181152, + "learning_rate": 1.7808975404266885e-06, + "loss": 2.7146, + "step": 96815 + }, + { + "epoch": 6.578339448294606, + "grad_norm": 7.414646148681641, + "learning_rate": 1.7804728903383613e-06, + "loss": 2.694, + "step": 96820 + }, + { + "epoch": 6.578679168365267, + "grad_norm": 8.14529037475586, + "learning_rate": 1.7800482402500341e-06, + "loss": 2.9942, + "step": 96825 + }, + { + "epoch": 6.579018888435929, + "grad_norm": 8.416414260864258, + "learning_rate": 1.7796235901617067e-06, + "loss": 2.4503, + "step": 96830 + }, + { + "epoch": 6.579358608506591, + "grad_norm": 7.2204461097717285, + "learning_rate": 1.7791989400733798e-06, + "loss": 2.8074, + "step": 96835 + }, + { + "epoch": 6.579698328577252, + "grad_norm": 7.7424397468566895, + "learning_rate": 1.7787742899850523e-06, + "loss": 2.7473, + "step": 96840 + }, + { + "epoch": 6.580038048647914, + "grad_norm": 7.860209941864014, + "learning_rate": 1.7783496398967254e-06, + "loss": 2.6519, + "step": 96845 + }, + { + "epoch": 6.580377768718576, + "grad_norm": 6.8412957191467285, + "learning_rate": 1.7779249898083982e-06, + "loss": 2.6553, + "step": 96850 + }, + { + "epoch": 6.5807174887892375, + "grad_norm": 9.14222526550293, + "learning_rate": 1.7775003397200707e-06, + "loss": 2.7969, + "step": 96855 + }, + { + "epoch": 6.5810572088598995, + "grad_norm": 6.8151140213012695, + "learning_rate": 1.7770756896317438e-06, + "loss": 2.9816, + "step": 96860 + }, + { + "epoch": 6.581396928930561, + "grad_norm": 7.307686805725098, + "learning_rate": 1.7766510395434163e-06, + "loss": 2.8894, + "step": 96865 + }, + { + "epoch": 6.581736649001223, + "grad_norm": 9.4226713180542, + "learning_rate": 1.7762263894550891e-06, + "loss": 2.6563, + "step": 96870 + }, + { + "epoch": 6.582076369071885, + "grad_norm": 6.233492374420166, + "learning_rate": 1.775801739366762e-06, + "loss": 2.8712, + "step": 96875 + }, + { + "epoch": 6.582416089142546, + "grad_norm": 6.573737621307373, + "learning_rate": 1.7753770892784347e-06, + "loss": 2.7577, + "step": 96880 + }, + { + "epoch": 6.582755809213208, + "grad_norm": 8.53531265258789, + "learning_rate": 1.7749524391901073e-06, + "loss": 3.0274, + "step": 96885 + }, + { + "epoch": 6.58309552928387, + "grad_norm": 9.831085205078125, + "learning_rate": 1.7745277891017803e-06, + "loss": 2.5499, + "step": 96890 + }, + { + "epoch": 6.583435249354531, + "grad_norm": 7.282685279846191, + "learning_rate": 1.7741031390134531e-06, + "loss": 2.7554, + "step": 96895 + }, + { + "epoch": 6.5837749694251935, + "grad_norm": 9.110808372497559, + "learning_rate": 1.7736784889251257e-06, + "loss": 2.8877, + "step": 96900 + }, + { + "epoch": 6.5841146894958555, + "grad_norm": 6.904494285583496, + "learning_rate": 1.7732538388367988e-06, + "loss": 2.8538, + "step": 96905 + }, + { + "epoch": 6.584454409566517, + "grad_norm": 8.160578727722168, + "learning_rate": 1.7728291887484713e-06, + "loss": 2.7359, + "step": 96910 + }, + { + "epoch": 6.584794129637179, + "grad_norm": 7.6598663330078125, + "learning_rate": 1.7724045386601441e-06, + "loss": 2.7253, + "step": 96915 + }, + { + "epoch": 6.585133849707841, + "grad_norm": 10.139781951904297, + "learning_rate": 1.771979888571817e-06, + "loss": 2.8186, + "step": 96920 + }, + { + "epoch": 6.585473569778502, + "grad_norm": 7.655200004577637, + "learning_rate": 1.7715552384834897e-06, + "loss": 2.5207, + "step": 96925 + }, + { + "epoch": 6.585813289849164, + "grad_norm": 6.614182949066162, + "learning_rate": 1.7711305883951628e-06, + "loss": 2.882, + "step": 96930 + }, + { + "epoch": 6.586153009919826, + "grad_norm": 8.416629791259766, + "learning_rate": 1.7707059383068353e-06, + "loss": 2.612, + "step": 96935 + }, + { + "epoch": 6.586492729990487, + "grad_norm": 8.174846649169922, + "learning_rate": 1.770281288218508e-06, + "loss": 2.7476, + "step": 96940 + }, + { + "epoch": 6.5868324500611495, + "grad_norm": 8.174213409423828, + "learning_rate": 1.769856638130181e-06, + "loss": 2.6886, + "step": 96945 + }, + { + "epoch": 6.5871721701318116, + "grad_norm": 6.774690628051758, + "learning_rate": 1.7694319880418537e-06, + "loss": 2.5498, + "step": 96950 + }, + { + "epoch": 6.587511890202473, + "grad_norm": 8.780969619750977, + "learning_rate": 1.7690073379535263e-06, + "loss": 2.626, + "step": 96955 + }, + { + "epoch": 6.587851610273135, + "grad_norm": 7.787018299102783, + "learning_rate": 1.7685826878651993e-06, + "loss": 2.9805, + "step": 96960 + }, + { + "epoch": 6.588191330343797, + "grad_norm": 7.1682915687561035, + "learning_rate": 1.768158037776872e-06, + "loss": 2.6204, + "step": 96965 + }, + { + "epoch": 6.588531050414458, + "grad_norm": 7.343495845794678, + "learning_rate": 1.7677333876885447e-06, + "loss": 3.0168, + "step": 96970 + }, + { + "epoch": 6.58887077048512, + "grad_norm": 9.055154800415039, + "learning_rate": 1.7673087376002177e-06, + "loss": 2.3319, + "step": 96975 + }, + { + "epoch": 6.589210490555782, + "grad_norm": 9.412845611572266, + "learning_rate": 1.7668840875118903e-06, + "loss": 2.6182, + "step": 96980 + }, + { + "epoch": 6.589550210626443, + "grad_norm": 10.196908950805664, + "learning_rate": 1.766459437423563e-06, + "loss": 2.9171, + "step": 96985 + }, + { + "epoch": 6.5898899306971055, + "grad_norm": 7.920354843139648, + "learning_rate": 1.766034787335236e-06, + "loss": 2.8784, + "step": 96990 + }, + { + "epoch": 6.590229650767768, + "grad_norm": 7.8043951988220215, + "learning_rate": 1.7656101372469087e-06, + "loss": 2.8937, + "step": 96995 + }, + { + "epoch": 6.590569370838429, + "grad_norm": 10.462239265441895, + "learning_rate": 1.7651854871585813e-06, + "loss": 2.8509, + "step": 97000 + }, + { + "epoch": 6.590909090909091, + "grad_norm": 7.254518508911133, + "learning_rate": 1.7647608370702543e-06, + "loss": 2.7661, + "step": 97005 + }, + { + "epoch": 6.591248810979753, + "grad_norm": 6.691717624664307, + "learning_rate": 1.764336186981927e-06, + "loss": 2.422, + "step": 97010 + }, + { + "epoch": 6.591588531050414, + "grad_norm": 8.058101654052734, + "learning_rate": 1.7639115368936e-06, + "loss": 2.7147, + "step": 97015 + }, + { + "epoch": 6.591928251121076, + "grad_norm": 8.780071258544922, + "learning_rate": 1.7634868868052725e-06, + "loss": 2.811, + "step": 97020 + }, + { + "epoch": 6.592267971191738, + "grad_norm": 8.440184593200684, + "learning_rate": 1.7630622367169453e-06, + "loss": 2.7351, + "step": 97025 + }, + { + "epoch": 6.592607691262399, + "grad_norm": 7.80251407623291, + "learning_rate": 1.7626375866286183e-06, + "loss": 2.85, + "step": 97030 + }, + { + "epoch": 6.5929474113330615, + "grad_norm": 9.352581024169922, + "learning_rate": 1.762212936540291e-06, + "loss": 2.4049, + "step": 97035 + }, + { + "epoch": 6.593287131403724, + "grad_norm": 8.816365242004395, + "learning_rate": 1.7617882864519637e-06, + "loss": 2.6072, + "step": 97040 + }, + { + "epoch": 6.593626851474385, + "grad_norm": 8.732912063598633, + "learning_rate": 1.7613636363636365e-06, + "loss": 2.7068, + "step": 97045 + }, + { + "epoch": 6.593966571545047, + "grad_norm": 9.875299453735352, + "learning_rate": 1.7609389862753093e-06, + "loss": 2.6873, + "step": 97050 + }, + { + "epoch": 6.594306291615709, + "grad_norm": 8.90860366821289, + "learning_rate": 1.760514336186982e-06, + "loss": 2.7377, + "step": 97055 + }, + { + "epoch": 6.59464601168637, + "grad_norm": 10.594971656799316, + "learning_rate": 1.760089686098655e-06, + "loss": 2.7261, + "step": 97060 + }, + { + "epoch": 6.594985731757032, + "grad_norm": 7.297945976257324, + "learning_rate": 1.7596650360103275e-06, + "loss": 2.5591, + "step": 97065 + }, + { + "epoch": 6.595325451827694, + "grad_norm": 8.5922212600708, + "learning_rate": 1.7592403859220003e-06, + "loss": 2.7957, + "step": 97070 + }, + { + "epoch": 6.595665171898355, + "grad_norm": 8.2332763671875, + "learning_rate": 1.7588157358336733e-06, + "loss": 2.6752, + "step": 97075 + }, + { + "epoch": 6.5960048919690175, + "grad_norm": 9.325264930725098, + "learning_rate": 1.758391085745346e-06, + "loss": 2.6305, + "step": 97080 + }, + { + "epoch": 6.59634461203968, + "grad_norm": 6.529994487762451, + "learning_rate": 1.7579664356570187e-06, + "loss": 2.8011, + "step": 97085 + }, + { + "epoch": 6.596684332110341, + "grad_norm": 6.390645980834961, + "learning_rate": 1.7575417855686915e-06, + "loss": 2.773, + "step": 97090 + }, + { + "epoch": 6.597024052181003, + "grad_norm": 8.54409408569336, + "learning_rate": 1.7571171354803643e-06, + "loss": 2.8266, + "step": 97095 + }, + { + "epoch": 6.597363772251665, + "grad_norm": 8.752368927001953, + "learning_rate": 1.7566924853920371e-06, + "loss": 2.7802, + "step": 97100 + }, + { + "epoch": 6.597703492322326, + "grad_norm": 9.901514053344727, + "learning_rate": 1.75626783530371e-06, + "loss": 2.7653, + "step": 97105 + }, + { + "epoch": 6.598043212392988, + "grad_norm": 8.544876098632812, + "learning_rate": 1.7558431852153825e-06, + "loss": 2.9283, + "step": 97110 + }, + { + "epoch": 6.59838293246365, + "grad_norm": 9.377903938293457, + "learning_rate": 1.7554185351270555e-06, + "loss": 2.6994, + "step": 97115 + }, + { + "epoch": 6.5987226525343115, + "grad_norm": 8.18397045135498, + "learning_rate": 1.7549938850387283e-06, + "loss": 2.679, + "step": 97120 + }, + { + "epoch": 6.5990623726049735, + "grad_norm": 7.907973289489746, + "learning_rate": 1.754569234950401e-06, + "loss": 2.5863, + "step": 97125 + }, + { + "epoch": 6.599402092675636, + "grad_norm": 7.2686944007873535, + "learning_rate": 1.754144584862074e-06, + "loss": 2.6334, + "step": 97130 + }, + { + "epoch": 6.599741812746297, + "grad_norm": 6.585489749908447, + "learning_rate": 1.7537199347737465e-06, + "loss": 2.656, + "step": 97135 + }, + { + "epoch": 6.600081532816959, + "grad_norm": 7.1425347328186035, + "learning_rate": 1.7532952846854193e-06, + "loss": 2.5872, + "step": 97140 + }, + { + "epoch": 6.600421252887621, + "grad_norm": 8.716243743896484, + "learning_rate": 1.7528706345970921e-06, + "loss": 2.7227, + "step": 97145 + }, + { + "epoch": 6.600760972958282, + "grad_norm": 7.964604377746582, + "learning_rate": 1.752445984508765e-06, + "loss": 2.661, + "step": 97150 + }, + { + "epoch": 6.601100693028944, + "grad_norm": 6.020787239074707, + "learning_rate": 1.7520213344204375e-06, + "loss": 2.6923, + "step": 97155 + }, + { + "epoch": 6.601440413099606, + "grad_norm": 8.357338905334473, + "learning_rate": 1.7515966843321105e-06, + "loss": 2.961, + "step": 97160 + }, + { + "epoch": 6.6017801331702675, + "grad_norm": 8.150574684143066, + "learning_rate": 1.7511720342437833e-06, + "loss": 3.0016, + "step": 97165 + }, + { + "epoch": 6.6021198532409295, + "grad_norm": 8.32544231414795, + "learning_rate": 1.750747384155456e-06, + "loss": 2.8025, + "step": 97170 + }, + { + "epoch": 6.602459573311592, + "grad_norm": 7.105157852172852, + "learning_rate": 1.750322734067129e-06, + "loss": 2.5891, + "step": 97175 + }, + { + "epoch": 6.602799293382253, + "grad_norm": 8.908803939819336, + "learning_rate": 1.7498980839788015e-06, + "loss": 2.8417, + "step": 97180 + }, + { + "epoch": 6.603139013452915, + "grad_norm": 6.832803249359131, + "learning_rate": 1.7494734338904745e-06, + "loss": 2.707, + "step": 97185 + }, + { + "epoch": 6.603478733523577, + "grad_norm": 6.664856910705566, + "learning_rate": 1.7490487838021471e-06, + "loss": 2.8274, + "step": 97190 + }, + { + "epoch": 6.603818453594238, + "grad_norm": 8.474688529968262, + "learning_rate": 1.74862413371382e-06, + "loss": 2.7495, + "step": 97195 + }, + { + "epoch": 6.6041581736649, + "grad_norm": 8.100284576416016, + "learning_rate": 1.748199483625493e-06, + "loss": 2.9934, + "step": 97200 + }, + { + "epoch": 6.604497893735562, + "grad_norm": 8.082798957824707, + "learning_rate": 1.7477748335371655e-06, + "loss": 2.65, + "step": 97205 + }, + { + "epoch": 6.6048376138062235, + "grad_norm": 8.338310241699219, + "learning_rate": 1.7473501834488383e-06, + "loss": 2.6116, + "step": 97210 + }, + { + "epoch": 6.6051773338768855, + "grad_norm": 7.566677570343018, + "learning_rate": 1.7469255333605111e-06, + "loss": 2.7665, + "step": 97215 + }, + { + "epoch": 6.605517053947548, + "grad_norm": 8.0798921585083, + "learning_rate": 1.746500883272184e-06, + "loss": 2.7085, + "step": 97220 + }, + { + "epoch": 6.605856774018209, + "grad_norm": 8.40735912322998, + "learning_rate": 1.7460762331838565e-06, + "loss": 2.7731, + "step": 97225 + }, + { + "epoch": 6.606196494088871, + "grad_norm": 7.255143165588379, + "learning_rate": 1.7456515830955295e-06, + "loss": 2.7625, + "step": 97230 + }, + { + "epoch": 6.606536214159533, + "grad_norm": 7.131769180297852, + "learning_rate": 1.7452269330072021e-06, + "loss": 2.8262, + "step": 97235 + }, + { + "epoch": 6.606875934230194, + "grad_norm": 8.601962089538574, + "learning_rate": 1.744802282918875e-06, + "loss": 2.652, + "step": 97240 + }, + { + "epoch": 6.607215654300856, + "grad_norm": 7.642778396606445, + "learning_rate": 1.744377632830548e-06, + "loss": 2.6444, + "step": 97245 + }, + { + "epoch": 6.607555374371518, + "grad_norm": 7.26497745513916, + "learning_rate": 1.7439529827422205e-06, + "loss": 2.5065, + "step": 97250 + }, + { + "epoch": 6.6078950944421795, + "grad_norm": 7.569540500640869, + "learning_rate": 1.7435283326538931e-06, + "loss": 2.5653, + "step": 97255 + }, + { + "epoch": 6.608234814512842, + "grad_norm": 6.6329545974731445, + "learning_rate": 1.7431036825655661e-06, + "loss": 2.8845, + "step": 97260 + }, + { + "epoch": 6.608574534583504, + "grad_norm": 6.159069061279297, + "learning_rate": 1.742679032477239e-06, + "loss": 2.7985, + "step": 97265 + }, + { + "epoch": 6.608914254654165, + "grad_norm": 7.852718830108643, + "learning_rate": 1.7422543823889117e-06, + "loss": 2.524, + "step": 97270 + }, + { + "epoch": 6.609253974724827, + "grad_norm": 8.945971488952637, + "learning_rate": 1.7418297323005845e-06, + "loss": 2.8996, + "step": 97275 + }, + { + "epoch": 6.609593694795489, + "grad_norm": 6.839635848999023, + "learning_rate": 1.7414050822122571e-06, + "loss": 2.8999, + "step": 97280 + }, + { + "epoch": 6.60993341486615, + "grad_norm": 9.690317153930664, + "learning_rate": 1.7409804321239301e-06, + "loss": 2.9394, + "step": 97285 + }, + { + "epoch": 6.610273134936812, + "grad_norm": 7.487035751342773, + "learning_rate": 1.740555782035603e-06, + "loss": 2.6672, + "step": 97290 + }, + { + "epoch": 6.610612855007474, + "grad_norm": 6.463808059692383, + "learning_rate": 1.7401311319472755e-06, + "loss": 2.6493, + "step": 97295 + }, + { + "epoch": 6.6109525750781355, + "grad_norm": 6.309762954711914, + "learning_rate": 1.7397064818589485e-06, + "loss": 3.0096, + "step": 97300 + }, + { + "epoch": 6.611292295148798, + "grad_norm": 7.082768440246582, + "learning_rate": 1.7392818317706211e-06, + "loss": 2.5881, + "step": 97305 + }, + { + "epoch": 6.61163201521946, + "grad_norm": 9.286774635314941, + "learning_rate": 1.738857181682294e-06, + "loss": 2.6492, + "step": 97310 + }, + { + "epoch": 6.611971735290121, + "grad_norm": 5.567418575286865, + "learning_rate": 1.7384325315939667e-06, + "loss": 2.7508, + "step": 97315 + }, + { + "epoch": 6.612311455360783, + "grad_norm": 9.125064849853516, + "learning_rate": 1.7380078815056395e-06, + "loss": 2.6786, + "step": 97320 + }, + { + "epoch": 6.612651175431445, + "grad_norm": 8.442872047424316, + "learning_rate": 1.7375832314173121e-06, + "loss": 2.6463, + "step": 97325 + }, + { + "epoch": 6.612990895502106, + "grad_norm": 8.769536018371582, + "learning_rate": 1.7371585813289851e-06, + "loss": 2.6718, + "step": 97330 + }, + { + "epoch": 6.613330615572768, + "grad_norm": 7.3382039070129395, + "learning_rate": 1.7367339312406577e-06, + "loss": 2.6572, + "step": 97335 + }, + { + "epoch": 6.613670335643429, + "grad_norm": 7.505669116973877, + "learning_rate": 1.7363092811523305e-06, + "loss": 2.7186, + "step": 97340 + }, + { + "epoch": 6.6140100557140915, + "grad_norm": 9.65966796875, + "learning_rate": 1.7358846310640035e-06, + "loss": 2.8728, + "step": 97345 + }, + { + "epoch": 6.614349775784754, + "grad_norm": 11.698508262634277, + "learning_rate": 1.7354599809756761e-06, + "loss": 2.8171, + "step": 97350 + }, + { + "epoch": 6.614689495855415, + "grad_norm": 7.703452110290527, + "learning_rate": 1.7350353308873491e-06, + "loss": 2.5199, + "step": 97355 + }, + { + "epoch": 6.615029215926077, + "grad_norm": 8.163111686706543, + "learning_rate": 1.7346106807990217e-06, + "loss": 2.5077, + "step": 97360 + }, + { + "epoch": 6.615368935996739, + "grad_norm": 8.862068176269531, + "learning_rate": 1.7341860307106945e-06, + "loss": 2.7512, + "step": 97365 + }, + { + "epoch": 6.6157086560674, + "grad_norm": 8.581684112548828, + "learning_rate": 1.7337613806223675e-06, + "loss": 2.7414, + "step": 97370 + }, + { + "epoch": 6.616048376138062, + "grad_norm": 7.253393173217773, + "learning_rate": 1.7333367305340401e-06, + "loss": 2.4049, + "step": 97375 + }, + { + "epoch": 6.616388096208724, + "grad_norm": 7.578714847564697, + "learning_rate": 1.7329120804457127e-06, + "loss": 2.5814, + "step": 97380 + }, + { + "epoch": 6.616727816279385, + "grad_norm": 7.482307434082031, + "learning_rate": 1.7324874303573857e-06, + "loss": 2.8702, + "step": 97385 + }, + { + "epoch": 6.6170675363500475, + "grad_norm": 8.033817291259766, + "learning_rate": 1.7320627802690585e-06, + "loss": 2.4756, + "step": 97390 + }, + { + "epoch": 6.61740725642071, + "grad_norm": 6.774059772491455, + "learning_rate": 1.7316381301807311e-06, + "loss": 2.7535, + "step": 97395 + }, + { + "epoch": 6.617746976491371, + "grad_norm": 8.694161415100098, + "learning_rate": 1.7312134800924041e-06, + "loss": 2.696, + "step": 97400 + }, + { + "epoch": 6.618086696562033, + "grad_norm": 7.557983875274658, + "learning_rate": 1.7307888300040767e-06, + "loss": 2.7265, + "step": 97405 + }, + { + "epoch": 6.618426416632695, + "grad_norm": 7.45591926574707, + "learning_rate": 1.7303641799157495e-06, + "loss": 2.7815, + "step": 97410 + }, + { + "epoch": 6.618766136703356, + "grad_norm": 8.347150802612305, + "learning_rate": 1.7299395298274223e-06, + "loss": 2.6911, + "step": 97415 + }, + { + "epoch": 6.619105856774018, + "grad_norm": 8.93746566772461, + "learning_rate": 1.7295148797390951e-06, + "loss": 2.7233, + "step": 97420 + }, + { + "epoch": 6.61944557684468, + "grad_norm": 7.042125225067139, + "learning_rate": 1.7290902296507677e-06, + "loss": 2.8632, + "step": 97425 + }, + { + "epoch": 6.6197852969153415, + "grad_norm": 6.518222332000732, + "learning_rate": 1.7286655795624407e-06, + "loss": 2.8116, + "step": 97430 + }, + { + "epoch": 6.6201250169860035, + "grad_norm": 7.907305717468262, + "learning_rate": 1.7282409294741135e-06, + "loss": 2.6474, + "step": 97435 + }, + { + "epoch": 6.620464737056666, + "grad_norm": 8.01518440246582, + "learning_rate": 1.7278162793857863e-06, + "loss": 2.7271, + "step": 97440 + }, + { + "epoch": 6.620804457127327, + "grad_norm": 7.313780784606934, + "learning_rate": 1.7273916292974591e-06, + "loss": 2.4692, + "step": 97445 + }, + { + "epoch": 6.621144177197989, + "grad_norm": 7.365372180938721, + "learning_rate": 1.7269669792091317e-06, + "loss": 2.9698, + "step": 97450 + }, + { + "epoch": 6.621483897268651, + "grad_norm": 7.924027919769287, + "learning_rate": 1.7265423291208047e-06, + "loss": 2.6047, + "step": 97455 + }, + { + "epoch": 6.621823617339312, + "grad_norm": 8.086487770080566, + "learning_rate": 1.7261176790324773e-06, + "loss": 2.8873, + "step": 97460 + }, + { + "epoch": 6.622163337409974, + "grad_norm": 8.547785758972168, + "learning_rate": 1.7256930289441501e-06, + "loss": 2.7344, + "step": 97465 + }, + { + "epoch": 6.622503057480636, + "grad_norm": 8.809765815734863, + "learning_rate": 1.7252683788558231e-06, + "loss": 2.7595, + "step": 97470 + }, + { + "epoch": 6.6228427775512975, + "grad_norm": 9.038677215576172, + "learning_rate": 1.7248437287674957e-06, + "loss": 2.7699, + "step": 97475 + }, + { + "epoch": 6.6231824976219595, + "grad_norm": 5.8524298667907715, + "learning_rate": 1.7244190786791685e-06, + "loss": 2.5758, + "step": 97480 + }, + { + "epoch": 6.623522217692622, + "grad_norm": 6.137109756469727, + "learning_rate": 1.7239944285908413e-06, + "loss": 2.9695, + "step": 97485 + }, + { + "epoch": 6.623861937763283, + "grad_norm": 9.871548652648926, + "learning_rate": 1.7235697785025141e-06, + "loss": 2.71, + "step": 97490 + }, + { + "epoch": 6.624201657833945, + "grad_norm": 9.469706535339355, + "learning_rate": 1.7231451284141867e-06, + "loss": 2.8458, + "step": 97495 + }, + { + "epoch": 6.624541377904607, + "grad_norm": 9.859594345092773, + "learning_rate": 1.7227204783258597e-06, + "loss": 2.6822, + "step": 97500 + }, + { + "epoch": 6.624881097975268, + "grad_norm": 9.345145225524902, + "learning_rate": 1.7222958282375323e-06, + "loss": 2.7873, + "step": 97505 + }, + { + "epoch": 6.62522081804593, + "grad_norm": 9.504279136657715, + "learning_rate": 1.721871178149205e-06, + "loss": 2.7935, + "step": 97510 + }, + { + "epoch": 6.625560538116592, + "grad_norm": 8.684274673461914, + "learning_rate": 1.7214465280608781e-06, + "loss": 2.8343, + "step": 97515 + }, + { + "epoch": 6.6259002581872535, + "grad_norm": 7.017950057983398, + "learning_rate": 1.7210218779725507e-06, + "loss": 3.0122, + "step": 97520 + }, + { + "epoch": 6.6262399782579156, + "grad_norm": 8.355237007141113, + "learning_rate": 1.7205972278842237e-06, + "loss": 2.5545, + "step": 97525 + }, + { + "epoch": 6.626579698328578, + "grad_norm": 9.450165748596191, + "learning_rate": 1.7201725777958963e-06, + "loss": 2.9084, + "step": 97530 + }, + { + "epoch": 6.626919418399239, + "grad_norm": 8.12367057800293, + "learning_rate": 1.7197479277075691e-06, + "loss": 2.7645, + "step": 97535 + }, + { + "epoch": 6.627259138469901, + "grad_norm": 8.910567283630371, + "learning_rate": 1.719323277619242e-06, + "loss": 2.8486, + "step": 97540 + }, + { + "epoch": 6.627598858540563, + "grad_norm": 7.091715335845947, + "learning_rate": 1.7188986275309147e-06, + "loss": 2.6128, + "step": 97545 + }, + { + "epoch": 6.627938578611224, + "grad_norm": 6.933107852935791, + "learning_rate": 1.7184739774425873e-06, + "loss": 2.6984, + "step": 97550 + }, + { + "epoch": 6.628278298681886, + "grad_norm": 7.624637126922607, + "learning_rate": 1.7180493273542603e-06, + "loss": 2.9122, + "step": 97555 + }, + { + "epoch": 6.628618018752547, + "grad_norm": 6.984764575958252, + "learning_rate": 1.7176246772659331e-06, + "loss": 2.6159, + "step": 97560 + }, + { + "epoch": 6.6289577388232095, + "grad_norm": 6.487199783325195, + "learning_rate": 1.7172000271776057e-06, + "loss": 2.5556, + "step": 97565 + }, + { + "epoch": 6.629297458893872, + "grad_norm": 7.362191200256348, + "learning_rate": 1.7167753770892787e-06, + "loss": 2.646, + "step": 97570 + }, + { + "epoch": 6.629637178964533, + "grad_norm": 8.129165649414062, + "learning_rate": 1.7163507270009513e-06, + "loss": 2.7248, + "step": 97575 + }, + { + "epoch": 6.629976899035195, + "grad_norm": 6.738131999969482, + "learning_rate": 1.715926076912624e-06, + "loss": 2.873, + "step": 97580 + }, + { + "epoch": 6.630316619105857, + "grad_norm": 10.499324798583984, + "learning_rate": 1.715501426824297e-06, + "loss": 2.6361, + "step": 97585 + }, + { + "epoch": 6.630656339176518, + "grad_norm": 7.546633243560791, + "learning_rate": 1.7150767767359697e-06, + "loss": 2.5297, + "step": 97590 + }, + { + "epoch": 6.63099605924718, + "grad_norm": 6.946383953094482, + "learning_rate": 1.7146521266476423e-06, + "loss": 2.7948, + "step": 97595 + }, + { + "epoch": 6.631335779317842, + "grad_norm": 7.410245418548584, + "learning_rate": 1.7142274765593153e-06, + "loss": 2.764, + "step": 97600 + }, + { + "epoch": 6.631675499388503, + "grad_norm": 9.667917251586914, + "learning_rate": 1.713802826470988e-06, + "loss": 3.1586, + "step": 97605 + }, + { + "epoch": 6.6320152194591655, + "grad_norm": 9.805974006652832, + "learning_rate": 1.713378176382661e-06, + "loss": 2.9384, + "step": 97610 + }, + { + "epoch": 6.632354939529828, + "grad_norm": 8.735411643981934, + "learning_rate": 1.7129535262943337e-06, + "loss": 2.739, + "step": 97615 + }, + { + "epoch": 6.632694659600489, + "grad_norm": 6.721978187561035, + "learning_rate": 1.7125288762060063e-06, + "loss": 2.4852, + "step": 97620 + }, + { + "epoch": 6.633034379671151, + "grad_norm": 8.505195617675781, + "learning_rate": 1.7121042261176793e-06, + "loss": 2.4793, + "step": 97625 + }, + { + "epoch": 6.633374099741813, + "grad_norm": 7.583646774291992, + "learning_rate": 1.711679576029352e-06, + "loss": 2.7189, + "step": 97630 + }, + { + "epoch": 6.633713819812474, + "grad_norm": 7.899536609649658, + "learning_rate": 1.7112549259410247e-06, + "loss": 2.6722, + "step": 97635 + }, + { + "epoch": 6.634053539883136, + "grad_norm": 7.089374542236328, + "learning_rate": 1.7108302758526977e-06, + "loss": 2.6487, + "step": 97640 + }, + { + "epoch": 6.634393259953798, + "grad_norm": 5.849532604217529, + "learning_rate": 1.7104056257643703e-06, + "loss": 2.8017, + "step": 97645 + }, + { + "epoch": 6.634732980024459, + "grad_norm": 8.357816696166992, + "learning_rate": 1.7099809756760429e-06, + "loss": 2.8103, + "step": 97650 + }, + { + "epoch": 6.6350727000951215, + "grad_norm": 6.724771976470947, + "learning_rate": 1.709556325587716e-06, + "loss": 2.7588, + "step": 97655 + }, + { + "epoch": 6.635412420165784, + "grad_norm": 6.8261518478393555, + "learning_rate": 1.7091316754993887e-06, + "loss": 2.9267, + "step": 97660 + }, + { + "epoch": 6.635752140236445, + "grad_norm": 6.951377868652344, + "learning_rate": 1.7087070254110613e-06, + "loss": 2.603, + "step": 97665 + }, + { + "epoch": 6.636091860307107, + "grad_norm": 8.873030662536621, + "learning_rate": 1.7082823753227343e-06, + "loss": 2.8401, + "step": 97670 + }, + { + "epoch": 6.636431580377769, + "grad_norm": 7.1774163246154785, + "learning_rate": 1.7078577252344069e-06, + "loss": 2.9989, + "step": 97675 + }, + { + "epoch": 6.63677130044843, + "grad_norm": 8.096982955932617, + "learning_rate": 1.7074330751460797e-06, + "loss": 2.7744, + "step": 97680 + }, + { + "epoch": 6.637111020519092, + "grad_norm": 7.611165523529053, + "learning_rate": 1.7070084250577527e-06, + "loss": 2.4445, + "step": 97685 + }, + { + "epoch": 6.637450740589754, + "grad_norm": 7.1281514167785645, + "learning_rate": 1.7065837749694253e-06, + "loss": 3.0649, + "step": 97690 + }, + { + "epoch": 6.6377904606604154, + "grad_norm": 8.102239608764648, + "learning_rate": 1.7061591248810983e-06, + "loss": 2.9126, + "step": 97695 + }, + { + "epoch": 6.6381301807310775, + "grad_norm": 7.994332790374756, + "learning_rate": 1.705734474792771e-06, + "loss": 2.7536, + "step": 97700 + }, + { + "epoch": 6.63846990080174, + "grad_norm": 8.868598937988281, + "learning_rate": 1.7053098247044437e-06, + "loss": 2.5609, + "step": 97705 + }, + { + "epoch": 6.638809620872401, + "grad_norm": 7.108345985412598, + "learning_rate": 1.7048851746161165e-06, + "loss": 2.7348, + "step": 97710 + }, + { + "epoch": 6.639149340943063, + "grad_norm": 6.56611442565918, + "learning_rate": 1.7044605245277893e-06, + "loss": 2.8303, + "step": 97715 + }, + { + "epoch": 6.639489061013725, + "grad_norm": 6.949734687805176, + "learning_rate": 1.7040358744394619e-06, + "loss": 2.7064, + "step": 97720 + }, + { + "epoch": 6.639828781084386, + "grad_norm": 7.429576873779297, + "learning_rate": 1.703611224351135e-06, + "loss": 2.6036, + "step": 97725 + }, + { + "epoch": 6.640168501155048, + "grad_norm": 7.764285564422607, + "learning_rate": 1.7031865742628075e-06, + "loss": 2.7826, + "step": 97730 + }, + { + "epoch": 6.64050822122571, + "grad_norm": 10.548026084899902, + "learning_rate": 1.7027619241744803e-06, + "loss": 2.7493, + "step": 97735 + }, + { + "epoch": 6.6408479412963715, + "grad_norm": 8.678383827209473, + "learning_rate": 1.7023372740861533e-06, + "loss": 2.7737, + "step": 97740 + }, + { + "epoch": 6.6411876613670335, + "grad_norm": 7.54449987411499, + "learning_rate": 1.7019126239978259e-06, + "loss": 2.9299, + "step": 97745 + }, + { + "epoch": 6.641527381437696, + "grad_norm": 7.743546485900879, + "learning_rate": 1.7014879739094987e-06, + "loss": 2.7438, + "step": 97750 + }, + { + "epoch": 6.641867101508357, + "grad_norm": 8.283300399780273, + "learning_rate": 1.7010633238211715e-06, + "loss": 2.6463, + "step": 97755 + }, + { + "epoch": 6.642206821579019, + "grad_norm": 8.527435302734375, + "learning_rate": 1.7006386737328443e-06, + "loss": 2.7938, + "step": 97760 + }, + { + "epoch": 6.642546541649681, + "grad_norm": 7.1628098487854, + "learning_rate": 1.7002140236445169e-06, + "loss": 2.8509, + "step": 97765 + }, + { + "epoch": 6.642886261720342, + "grad_norm": 7.37278413772583, + "learning_rate": 1.69978937355619e-06, + "loss": 2.6414, + "step": 97770 + }, + { + "epoch": 6.643225981791004, + "grad_norm": 7.604936599731445, + "learning_rate": 1.6993647234678625e-06, + "loss": 2.7174, + "step": 97775 + }, + { + "epoch": 6.643565701861666, + "grad_norm": 7.071926593780518, + "learning_rate": 1.6989400733795355e-06, + "loss": 2.5665, + "step": 97780 + }, + { + "epoch": 6.6439054219323275, + "grad_norm": 8.281545639038086, + "learning_rate": 1.6985154232912083e-06, + "loss": 2.788, + "step": 97785 + }, + { + "epoch": 6.6442451420029895, + "grad_norm": 8.242487907409668, + "learning_rate": 1.6980907732028809e-06, + "loss": 2.8783, + "step": 97790 + }, + { + "epoch": 6.644584862073652, + "grad_norm": 7.050469398498535, + "learning_rate": 1.697666123114554e-06, + "loss": 2.631, + "step": 97795 + }, + { + "epoch": 6.644924582144313, + "grad_norm": 7.456001281738281, + "learning_rate": 1.6972414730262265e-06, + "loss": 2.692, + "step": 97800 + }, + { + "epoch": 6.645264302214975, + "grad_norm": 8.33384895324707, + "learning_rate": 1.6968168229378993e-06, + "loss": 2.7344, + "step": 97805 + }, + { + "epoch": 6.645604022285637, + "grad_norm": 9.019536972045898, + "learning_rate": 1.696392172849572e-06, + "loss": 2.6534, + "step": 97810 + }, + { + "epoch": 6.645943742356298, + "grad_norm": 7.374364376068115, + "learning_rate": 1.6959675227612449e-06, + "loss": 2.5103, + "step": 97815 + }, + { + "epoch": 6.64628346242696, + "grad_norm": 7.414464950561523, + "learning_rate": 1.6955428726729175e-06, + "loss": 2.5883, + "step": 97820 + }, + { + "epoch": 6.646623182497622, + "grad_norm": 7.658450126647949, + "learning_rate": 1.6951182225845905e-06, + "loss": 2.8162, + "step": 97825 + }, + { + "epoch": 6.6469629025682835, + "grad_norm": 8.942645072937012, + "learning_rate": 1.6946935724962633e-06, + "loss": 2.5091, + "step": 97830 + }, + { + "epoch": 6.647302622638946, + "grad_norm": 9.62048053741455, + "learning_rate": 1.6942689224079359e-06, + "loss": 2.8917, + "step": 97835 + }, + { + "epoch": 6.647642342709608, + "grad_norm": 7.696637153625488, + "learning_rate": 1.693844272319609e-06, + "loss": 2.8953, + "step": 97840 + }, + { + "epoch": 6.647982062780269, + "grad_norm": 6.464919090270996, + "learning_rate": 1.6934196222312815e-06, + "loss": 2.6531, + "step": 97845 + }, + { + "epoch": 6.648321782850931, + "grad_norm": 7.865664958953857, + "learning_rate": 1.6929949721429543e-06, + "loss": 2.6951, + "step": 97850 + }, + { + "epoch": 6.648661502921593, + "grad_norm": 8.912714958190918, + "learning_rate": 1.692570322054627e-06, + "loss": 2.522, + "step": 97855 + }, + { + "epoch": 6.649001222992254, + "grad_norm": 8.2407865524292, + "learning_rate": 1.6921456719662999e-06, + "loss": 2.797, + "step": 97860 + }, + { + "epoch": 6.649340943062916, + "grad_norm": 8.40130615234375, + "learning_rate": 1.691721021877973e-06, + "loss": 2.9387, + "step": 97865 + }, + { + "epoch": 6.649680663133578, + "grad_norm": 7.310177326202393, + "learning_rate": 1.6912963717896455e-06, + "loss": 2.6173, + "step": 97870 + }, + { + "epoch": 6.6500203832042395, + "grad_norm": 7.105623245239258, + "learning_rate": 1.6908717217013183e-06, + "loss": 2.4933, + "step": 97875 + }, + { + "epoch": 6.650360103274902, + "grad_norm": 9.53848648071289, + "learning_rate": 1.690447071612991e-06, + "loss": 2.6098, + "step": 97880 + }, + { + "epoch": 6.650699823345564, + "grad_norm": 8.012760162353516, + "learning_rate": 1.6900224215246639e-06, + "loss": 2.84, + "step": 97885 + }, + { + "epoch": 6.651039543416225, + "grad_norm": 7.414684295654297, + "learning_rate": 1.6895977714363365e-06, + "loss": 2.8275, + "step": 97890 + }, + { + "epoch": 6.651379263486887, + "grad_norm": 9.098775863647461, + "learning_rate": 1.6891731213480095e-06, + "loss": 2.8544, + "step": 97895 + }, + { + "epoch": 6.651718983557549, + "grad_norm": 7.744487285614014, + "learning_rate": 1.688748471259682e-06, + "loss": 2.619, + "step": 97900 + }, + { + "epoch": 6.65205870362821, + "grad_norm": 8.590907096862793, + "learning_rate": 1.6883238211713549e-06, + "loss": 2.6772, + "step": 97905 + }, + { + "epoch": 6.652398423698872, + "grad_norm": 7.017999649047852, + "learning_rate": 1.6878991710830279e-06, + "loss": 2.8511, + "step": 97910 + }, + { + "epoch": 6.652738143769534, + "grad_norm": 7.7711286544799805, + "learning_rate": 1.6874745209947005e-06, + "loss": 2.6667, + "step": 97915 + }, + { + "epoch": 6.6530778638401955, + "grad_norm": 10.376761436462402, + "learning_rate": 1.687049870906373e-06, + "loss": 2.6546, + "step": 97920 + }, + { + "epoch": 6.653417583910858, + "grad_norm": 9.184099197387695, + "learning_rate": 1.686625220818046e-06, + "loss": 2.7633, + "step": 97925 + }, + { + "epoch": 6.65375730398152, + "grad_norm": 5.869951248168945, + "learning_rate": 1.6862005707297189e-06, + "loss": 2.594, + "step": 97930 + }, + { + "epoch": 6.654097024052181, + "grad_norm": 8.650247573852539, + "learning_rate": 1.6857759206413915e-06, + "loss": 2.8585, + "step": 97935 + }, + { + "epoch": 6.654436744122843, + "grad_norm": 7.369294166564941, + "learning_rate": 1.6853512705530645e-06, + "loss": 2.6264, + "step": 97940 + }, + { + "epoch": 6.654776464193505, + "grad_norm": 10.749123573303223, + "learning_rate": 1.684926620464737e-06, + "loss": 2.8611, + "step": 97945 + }, + { + "epoch": 6.655116184264166, + "grad_norm": 8.902116775512695, + "learning_rate": 1.68450197037641e-06, + "loss": 2.7452, + "step": 97950 + }, + { + "epoch": 6.655455904334828, + "grad_norm": 7.5208563804626465, + "learning_rate": 1.6840773202880829e-06, + "loss": 2.8246, + "step": 97955 + }, + { + "epoch": 6.65579562440549, + "grad_norm": 6.749599933624268, + "learning_rate": 1.6836526701997555e-06, + "loss": 2.4627, + "step": 97960 + }, + { + "epoch": 6.6561353444761515, + "grad_norm": 9.131784439086914, + "learning_rate": 1.6832280201114285e-06, + "loss": 2.8932, + "step": 97965 + }, + { + "epoch": 6.656475064546814, + "grad_norm": 8.050311088562012, + "learning_rate": 1.682803370023101e-06, + "loss": 2.9199, + "step": 97970 + }, + { + "epoch": 6.656814784617476, + "grad_norm": 7.912906169891357, + "learning_rate": 1.6823787199347739e-06, + "loss": 3.0413, + "step": 97975 + }, + { + "epoch": 6.657154504688137, + "grad_norm": 7.208940029144287, + "learning_rate": 1.6819540698464467e-06, + "loss": 2.6044, + "step": 97980 + }, + { + "epoch": 6.657494224758799, + "grad_norm": 6.566174030303955, + "learning_rate": 1.6815294197581195e-06, + "loss": 2.7537, + "step": 97985 + }, + { + "epoch": 6.657833944829461, + "grad_norm": 6.064883708953857, + "learning_rate": 1.681104769669792e-06, + "loss": 2.8382, + "step": 97990 + }, + { + "epoch": 6.658173664900122, + "grad_norm": 9.413931846618652, + "learning_rate": 1.680680119581465e-06, + "loss": 2.911, + "step": 97995 + }, + { + "epoch": 6.658513384970784, + "grad_norm": 8.852323532104492, + "learning_rate": 1.6802554694931379e-06, + "loss": 2.6712, + "step": 98000 + }, + { + "epoch": 6.658853105041446, + "grad_norm": 7.358608245849609, + "learning_rate": 1.6798308194048105e-06, + "loss": 2.7913, + "step": 98005 + }, + { + "epoch": 6.6591928251121075, + "grad_norm": 8.46707820892334, + "learning_rate": 1.6794061693164835e-06, + "loss": 2.6248, + "step": 98010 + }, + { + "epoch": 6.65953254518277, + "grad_norm": 6.502264976501465, + "learning_rate": 1.678981519228156e-06, + "loss": 2.7572, + "step": 98015 + }, + { + "epoch": 6.659872265253431, + "grad_norm": 10.002777099609375, + "learning_rate": 1.6785568691398289e-06, + "loss": 2.6966, + "step": 98020 + }, + { + "epoch": 6.660211985324093, + "grad_norm": 7.289783477783203, + "learning_rate": 1.6781322190515017e-06, + "loss": 2.7514, + "step": 98025 + }, + { + "epoch": 6.660551705394755, + "grad_norm": 9.375835418701172, + "learning_rate": 1.6777075689631745e-06, + "loss": 2.8883, + "step": 98030 + }, + { + "epoch": 6.660891425465416, + "grad_norm": 7.443408012390137, + "learning_rate": 1.6772829188748475e-06, + "loss": 2.9817, + "step": 98035 + }, + { + "epoch": 6.661231145536078, + "grad_norm": 9.885089874267578, + "learning_rate": 1.67685826878652e-06, + "loss": 2.8409, + "step": 98040 + }, + { + "epoch": 6.66157086560674, + "grad_norm": 9.72065544128418, + "learning_rate": 1.6764336186981927e-06, + "loss": 2.8066, + "step": 98045 + }, + { + "epoch": 6.6619105856774015, + "grad_norm": 7.084667205810547, + "learning_rate": 1.6760089686098657e-06, + "loss": 2.7151, + "step": 98050 + }, + { + "epoch": 6.6622503057480635, + "grad_norm": 7.741603851318359, + "learning_rate": 1.6755843185215385e-06, + "loss": 2.8896, + "step": 98055 + }, + { + "epoch": 6.662590025818726, + "grad_norm": 8.406639099121094, + "learning_rate": 1.675159668433211e-06, + "loss": 2.7786, + "step": 98060 + }, + { + "epoch": 6.662929745889387, + "grad_norm": 9.754859924316406, + "learning_rate": 1.674735018344884e-06, + "loss": 2.6449, + "step": 98065 + }, + { + "epoch": 6.663269465960049, + "grad_norm": 8.535348892211914, + "learning_rate": 1.6743103682565567e-06, + "loss": 2.7535, + "step": 98070 + }, + { + "epoch": 6.663609186030711, + "grad_norm": 8.415833473205566, + "learning_rate": 1.6738857181682295e-06, + "loss": 2.8222, + "step": 98075 + }, + { + "epoch": 6.663948906101372, + "grad_norm": 9.427228927612305, + "learning_rate": 1.6734610680799025e-06, + "loss": 2.696, + "step": 98080 + }, + { + "epoch": 6.664288626172034, + "grad_norm": 5.074481010437012, + "learning_rate": 1.673036417991575e-06, + "loss": 2.6377, + "step": 98085 + }, + { + "epoch": 6.664628346242696, + "grad_norm": 6.913283348083496, + "learning_rate": 1.6726117679032477e-06, + "loss": 2.6789, + "step": 98090 + }, + { + "epoch": 6.6649680663133575, + "grad_norm": 7.542990207672119, + "learning_rate": 1.6721871178149207e-06, + "loss": 2.7747, + "step": 98095 + }, + { + "epoch": 6.6653077863840196, + "grad_norm": 6.994697570800781, + "learning_rate": 1.6717624677265935e-06, + "loss": 2.6161, + "step": 98100 + }, + { + "epoch": 6.665647506454682, + "grad_norm": 7.468025207519531, + "learning_rate": 1.671337817638266e-06, + "loss": 2.8202, + "step": 98105 + }, + { + "epoch": 6.665987226525343, + "grad_norm": 7.740980625152588, + "learning_rate": 1.670913167549939e-06, + "loss": 2.5134, + "step": 98110 + }, + { + "epoch": 6.666326946596005, + "grad_norm": 6.400062084197998, + "learning_rate": 1.6704885174616117e-06, + "loss": 2.6365, + "step": 98115 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 9.381692886352539, + "learning_rate": 1.6700638673732847e-06, + "loss": 2.9534, + "step": 98120 + }, + { + "epoch": 6.667006386737328, + "grad_norm": 5.935305118560791, + "learning_rate": 1.6696392172849573e-06, + "loss": 2.7075, + "step": 98125 + }, + { + "epoch": 6.66734610680799, + "grad_norm": 6.5468854904174805, + "learning_rate": 1.66921456719663e-06, + "loss": 2.8458, + "step": 98130 + }, + { + "epoch": 6.667685826878652, + "grad_norm": 6.867237091064453, + "learning_rate": 1.668789917108303e-06, + "loss": 2.6849, + "step": 98135 + }, + { + "epoch": 6.6680255469493135, + "grad_norm": 7.5514936447143555, + "learning_rate": 1.6683652670199757e-06, + "loss": 2.6937, + "step": 98140 + }, + { + "epoch": 6.668365267019976, + "grad_norm": 8.019577980041504, + "learning_rate": 1.6679406169316485e-06, + "loss": 2.7746, + "step": 98145 + }, + { + "epoch": 6.668704987090638, + "grad_norm": 8.871225357055664, + "learning_rate": 1.6675159668433213e-06, + "loss": 2.8182, + "step": 98150 + }, + { + "epoch": 6.669044707161299, + "grad_norm": 8.663877487182617, + "learning_rate": 1.667091316754994e-06, + "loss": 2.7155, + "step": 98155 + }, + { + "epoch": 6.669384427231961, + "grad_norm": 10.30851936340332, + "learning_rate": 1.6666666666666667e-06, + "loss": 2.9248, + "step": 98160 + }, + { + "epoch": 6.669724147302623, + "grad_norm": 8.103200912475586, + "learning_rate": 1.6662420165783397e-06, + "loss": 2.6136, + "step": 98165 + }, + { + "epoch": 6.670063867373284, + "grad_norm": 7.030391693115234, + "learning_rate": 1.6658173664900123e-06, + "loss": 2.8891, + "step": 98170 + }, + { + "epoch": 6.670403587443946, + "grad_norm": 6.842001914978027, + "learning_rate": 1.665392716401685e-06, + "loss": 2.6403, + "step": 98175 + }, + { + "epoch": 6.670743307514608, + "grad_norm": 8.561697006225586, + "learning_rate": 1.664968066313358e-06, + "loss": 2.8642, + "step": 98180 + }, + { + "epoch": 6.6710830275852695, + "grad_norm": 9.535197257995605, + "learning_rate": 1.6645434162250307e-06, + "loss": 2.6954, + "step": 98185 + }, + { + "epoch": 6.671422747655932, + "grad_norm": 7.958057880401611, + "learning_rate": 1.6641187661367035e-06, + "loss": 2.8712, + "step": 98190 + }, + { + "epoch": 6.671762467726594, + "grad_norm": 7.53833532333374, + "learning_rate": 1.6636941160483763e-06, + "loss": 2.7353, + "step": 98195 + }, + { + "epoch": 6.672102187797255, + "grad_norm": 8.742924690246582, + "learning_rate": 1.663269465960049e-06, + "loss": 2.7336, + "step": 98200 + }, + { + "epoch": 6.672441907867917, + "grad_norm": 7.295168876647949, + "learning_rate": 1.6628448158717219e-06, + "loss": 2.7082, + "step": 98205 + }, + { + "epoch": 6.672781627938579, + "grad_norm": 9.315132141113281, + "learning_rate": 1.6624201657833947e-06, + "loss": 3.0057, + "step": 98210 + }, + { + "epoch": 6.67312134800924, + "grad_norm": 7.377832412719727, + "learning_rate": 1.662080445712733e-06, + "loss": 2.6194, + "step": 98215 + }, + { + "epoch": 6.673461068079902, + "grad_norm": 6.744179725646973, + "learning_rate": 1.6616557956244055e-06, + "loss": 2.7372, + "step": 98220 + }, + { + "epoch": 6.673800788150564, + "grad_norm": 7.466214179992676, + "learning_rate": 1.6612311455360783e-06, + "loss": 2.5421, + "step": 98225 + }, + { + "epoch": 6.6741405082212255, + "grad_norm": 7.868276119232178, + "learning_rate": 1.6608064954477511e-06, + "loss": 2.7314, + "step": 98230 + }, + { + "epoch": 6.674480228291888, + "grad_norm": 6.759302139282227, + "learning_rate": 1.660381845359424e-06, + "loss": 2.6905, + "step": 98235 + }, + { + "epoch": 6.674819948362549, + "grad_norm": 8.765759468078613, + "learning_rate": 1.659957195271097e-06, + "loss": 2.515, + "step": 98240 + }, + { + "epoch": 6.675159668433211, + "grad_norm": 6.363535404205322, + "learning_rate": 1.6595325451827695e-06, + "loss": 2.8169, + "step": 98245 + }, + { + "epoch": 6.675499388503873, + "grad_norm": 7.945319652557373, + "learning_rate": 1.6591078950944423e-06, + "loss": 2.7139, + "step": 98250 + }, + { + "epoch": 6.675839108574534, + "grad_norm": 8.775999069213867, + "learning_rate": 1.6586832450061151e-06, + "loss": 2.3958, + "step": 98255 + }, + { + "epoch": 6.676178828645196, + "grad_norm": 7.77465295791626, + "learning_rate": 1.658258594917788e-06, + "loss": 2.7865, + "step": 98260 + }, + { + "epoch": 6.676518548715858, + "grad_norm": 7.524080276489258, + "learning_rate": 1.6578339448294605e-06, + "loss": 2.7174, + "step": 98265 + }, + { + "epoch": 6.6768582687865194, + "grad_norm": 6.572109699249268, + "learning_rate": 1.6574092947411335e-06, + "loss": 2.6877, + "step": 98270 + }, + { + "epoch": 6.6771979888571815, + "grad_norm": 8.556947708129883, + "learning_rate": 1.6569846446528061e-06, + "loss": 2.5073, + "step": 98275 + }, + { + "epoch": 6.677537708927844, + "grad_norm": 5.79365348815918, + "learning_rate": 1.656559994564479e-06, + "loss": 2.811, + "step": 98280 + }, + { + "epoch": 6.677877428998505, + "grad_norm": 6.868859767913818, + "learning_rate": 1.656135344476152e-06, + "loss": 2.8928, + "step": 98285 + }, + { + "epoch": 6.678217149069167, + "grad_norm": 7.558251857757568, + "learning_rate": 1.6557106943878245e-06, + "loss": 2.8673, + "step": 98290 + }, + { + "epoch": 6.678556869139829, + "grad_norm": 8.154888153076172, + "learning_rate": 1.6552860442994973e-06, + "loss": 2.698, + "step": 98295 + }, + { + "epoch": 6.67889658921049, + "grad_norm": 6.8999409675598145, + "learning_rate": 1.6548613942111701e-06, + "loss": 2.9134, + "step": 98300 + }, + { + "epoch": 6.679236309281152, + "grad_norm": 6.751811981201172, + "learning_rate": 1.654436744122843e-06, + "loss": 2.8065, + "step": 98305 + }, + { + "epoch": 6.679576029351814, + "grad_norm": 6.331544876098633, + "learning_rate": 1.6540120940345155e-06, + "loss": 2.6625, + "step": 98310 + }, + { + "epoch": 6.6799157494224755, + "grad_norm": 6.1293416023254395, + "learning_rate": 1.6535874439461885e-06, + "loss": 2.9723, + "step": 98315 + }, + { + "epoch": 6.6802554694931375, + "grad_norm": 8.341939926147461, + "learning_rate": 1.6531627938578611e-06, + "loss": 3.0303, + "step": 98320 + }, + { + "epoch": 6.6805951895638, + "grad_norm": 8.03868579864502, + "learning_rate": 1.6527381437695341e-06, + "loss": 2.7081, + "step": 98325 + }, + { + "epoch": 6.680934909634461, + "grad_norm": 8.24315357208252, + "learning_rate": 1.652313493681207e-06, + "loss": 2.7448, + "step": 98330 + }, + { + "epoch": 6.681274629705123, + "grad_norm": 10.292070388793945, + "learning_rate": 1.6518888435928795e-06, + "loss": 2.6878, + "step": 98335 + }, + { + "epoch": 6.681614349775785, + "grad_norm": 6.065502643585205, + "learning_rate": 1.6514641935045525e-06, + "loss": 2.8325, + "step": 98340 + }, + { + "epoch": 6.681954069846446, + "grad_norm": 8.023184776306152, + "learning_rate": 1.6510395434162251e-06, + "loss": 2.8536, + "step": 98345 + }, + { + "epoch": 6.682293789917108, + "grad_norm": 9.186172485351562, + "learning_rate": 1.650614893327898e-06, + "loss": 2.7278, + "step": 98350 + }, + { + "epoch": 6.68263350998777, + "grad_norm": 7.3696208000183105, + "learning_rate": 1.6501902432395707e-06, + "loss": 2.8549, + "step": 98355 + }, + { + "epoch": 6.6829732300584315, + "grad_norm": 10.119688987731934, + "learning_rate": 1.6497655931512435e-06, + "loss": 2.8514, + "step": 98360 + }, + { + "epoch": 6.6833129501290935, + "grad_norm": 10.345453262329102, + "learning_rate": 1.6493409430629161e-06, + "loss": 2.7811, + "step": 98365 + }, + { + "epoch": 6.683652670199756, + "grad_norm": 7.431002616882324, + "learning_rate": 1.6489162929745891e-06, + "loss": 2.5434, + "step": 98370 + }, + { + "epoch": 6.683992390270417, + "grad_norm": 8.705595016479492, + "learning_rate": 1.648491642886262e-06, + "loss": 2.6657, + "step": 98375 + }, + { + "epoch": 6.684332110341079, + "grad_norm": 8.054548263549805, + "learning_rate": 1.6480669927979345e-06, + "loss": 2.7181, + "step": 98380 + }, + { + "epoch": 6.684671830411741, + "grad_norm": 9.403029441833496, + "learning_rate": 1.6476423427096075e-06, + "loss": 2.9193, + "step": 98385 + }, + { + "epoch": 6.685011550482402, + "grad_norm": 8.12195110321045, + "learning_rate": 1.6472176926212801e-06, + "loss": 2.5008, + "step": 98390 + }, + { + "epoch": 6.685351270553064, + "grad_norm": 7.737227916717529, + "learning_rate": 1.646793042532953e-06, + "loss": 3.0062, + "step": 98395 + }, + { + "epoch": 6.685690990623726, + "grad_norm": 8.77937126159668, + "learning_rate": 1.6463683924446257e-06, + "loss": 2.545, + "step": 98400 + }, + { + "epoch": 6.6860307106943875, + "grad_norm": 9.348872184753418, + "learning_rate": 1.6459437423562985e-06, + "loss": 2.8805, + "step": 98405 + }, + { + "epoch": 6.6863704307650496, + "grad_norm": 6.319632053375244, + "learning_rate": 1.6455190922679715e-06, + "loss": 2.5671, + "step": 98410 + }, + { + "epoch": 6.686710150835712, + "grad_norm": 10.115038871765137, + "learning_rate": 1.6450944421796441e-06, + "loss": 2.7681, + "step": 98415 + }, + { + "epoch": 6.687049870906373, + "grad_norm": 6.721112251281738, + "learning_rate": 1.6446697920913167e-06, + "loss": 2.6345, + "step": 98420 + }, + { + "epoch": 6.687389590977035, + "grad_norm": 7.359717845916748, + "learning_rate": 1.6442451420029897e-06, + "loss": 2.6961, + "step": 98425 + }, + { + "epoch": 6.687729311047697, + "grad_norm": 8.263457298278809, + "learning_rate": 1.6438204919146625e-06, + "loss": 2.6168, + "step": 98430 + }, + { + "epoch": 6.688069031118358, + "grad_norm": 8.272380828857422, + "learning_rate": 1.6433958418263351e-06, + "loss": 2.673, + "step": 98435 + }, + { + "epoch": 6.68840875118902, + "grad_norm": 6.891575813293457, + "learning_rate": 1.6429711917380081e-06, + "loss": 2.7073, + "step": 98440 + }, + { + "epoch": 6.688748471259682, + "grad_norm": 7.297968864440918, + "learning_rate": 1.6425465416496807e-06, + "loss": 2.7875, + "step": 98445 + }, + { + "epoch": 6.6890881913303435, + "grad_norm": 6.590471267700195, + "learning_rate": 1.6421218915613535e-06, + "loss": 3.1419, + "step": 98450 + }, + { + "epoch": 6.689427911401006, + "grad_norm": 7.144557476043701, + "learning_rate": 1.6416972414730265e-06, + "loss": 2.7253, + "step": 98455 + }, + { + "epoch": 6.689767631471668, + "grad_norm": 7.036118507385254, + "learning_rate": 1.6412725913846991e-06, + "loss": 2.9773, + "step": 98460 + }, + { + "epoch": 6.690107351542329, + "grad_norm": 8.403972625732422, + "learning_rate": 1.6408479412963717e-06, + "loss": 2.7304, + "step": 98465 + }, + { + "epoch": 6.690447071612991, + "grad_norm": 6.546739101409912, + "learning_rate": 1.6404232912080447e-06, + "loss": 2.6282, + "step": 98470 + }, + { + "epoch": 6.690786791683653, + "grad_norm": 5.889793872833252, + "learning_rate": 1.6399986411197175e-06, + "loss": 2.8848, + "step": 98475 + }, + { + "epoch": 6.691126511754314, + "grad_norm": 7.13950252532959, + "learning_rate": 1.6395739910313901e-06, + "loss": 2.7063, + "step": 98480 + }, + { + "epoch": 6.691466231824976, + "grad_norm": 6.530831813812256, + "learning_rate": 1.6391493409430631e-06, + "loss": 2.9284, + "step": 98485 + }, + { + "epoch": 6.691805951895638, + "grad_norm": 9.333250999450684, + "learning_rate": 1.6387246908547357e-06, + "loss": 2.6405, + "step": 98490 + }, + { + "epoch": 6.6921456719662995, + "grad_norm": 6.438342571258545, + "learning_rate": 1.6383000407664087e-06, + "loss": 2.8112, + "step": 98495 + }, + { + "epoch": 6.692485392036962, + "grad_norm": 9.774618148803711, + "learning_rate": 1.6378753906780815e-06, + "loss": 2.9898, + "step": 98500 + }, + { + "epoch": 6.692825112107624, + "grad_norm": 6.895060062408447, + "learning_rate": 1.6374507405897541e-06, + "loss": 2.5639, + "step": 98505 + }, + { + "epoch": 6.693164832178285, + "grad_norm": 8.987464904785156, + "learning_rate": 1.6370260905014271e-06, + "loss": 2.6611, + "step": 98510 + }, + { + "epoch": 6.693504552248947, + "grad_norm": 9.225298881530762, + "learning_rate": 1.6366014404130997e-06, + "loss": 2.9216, + "step": 98515 + }, + { + "epoch": 6.693844272319609, + "grad_norm": 8.303857803344727, + "learning_rate": 1.6361767903247725e-06, + "loss": 2.4018, + "step": 98520 + }, + { + "epoch": 6.69418399239027, + "grad_norm": 6.147050380706787, + "learning_rate": 1.6357521402364453e-06, + "loss": 2.4271, + "step": 98525 + }, + { + "epoch": 6.694523712460932, + "grad_norm": 8.11688232421875, + "learning_rate": 1.6353274901481181e-06, + "loss": 2.7971, + "step": 98530 + }, + { + "epoch": 6.694863432531594, + "grad_norm": 8.57666301727295, + "learning_rate": 1.6349028400597907e-06, + "loss": 2.7167, + "step": 98535 + }, + { + "epoch": 6.6952031526022555, + "grad_norm": 8.571475982666016, + "learning_rate": 1.6344781899714637e-06, + "loss": 2.7745, + "step": 98540 + }, + { + "epoch": 6.695542872672918, + "grad_norm": 6.22192907333374, + "learning_rate": 1.6340535398831363e-06, + "loss": 2.9023, + "step": 98545 + }, + { + "epoch": 6.69588259274358, + "grad_norm": 8.528888702392578, + "learning_rate": 1.6336288897948091e-06, + "loss": 2.7487, + "step": 98550 + }, + { + "epoch": 6.696222312814241, + "grad_norm": 11.663591384887695, + "learning_rate": 1.6332042397064821e-06, + "loss": 2.7434, + "step": 98555 + }, + { + "epoch": 6.696562032884903, + "grad_norm": 7.896675109863281, + "learning_rate": 1.6327795896181547e-06, + "loss": 2.8912, + "step": 98560 + }, + { + "epoch": 6.696901752955565, + "grad_norm": 8.973904609680176, + "learning_rate": 1.6323549395298275e-06, + "loss": 2.6697, + "step": 98565 + }, + { + "epoch": 6.697241473026226, + "grad_norm": 9.391470909118652, + "learning_rate": 1.6319302894415003e-06, + "loss": 2.6021, + "step": 98570 + }, + { + "epoch": 6.697581193096888, + "grad_norm": 6.628322601318359, + "learning_rate": 1.6315056393531731e-06, + "loss": 2.7062, + "step": 98575 + }, + { + "epoch": 6.69792091316755, + "grad_norm": 10.258395195007324, + "learning_rate": 1.6310809892648461e-06, + "loss": 2.8536, + "step": 98580 + }, + { + "epoch": 6.6982606332382115, + "grad_norm": 7.226230144500732, + "learning_rate": 1.6306563391765187e-06, + "loss": 2.5888, + "step": 98585 + }, + { + "epoch": 6.698600353308874, + "grad_norm": 7.895419120788574, + "learning_rate": 1.6302316890881913e-06, + "loss": 2.6757, + "step": 98590 + }, + { + "epoch": 6.698940073379536, + "grad_norm": 7.161984443664551, + "learning_rate": 1.6298070389998643e-06, + "loss": 2.5839, + "step": 98595 + }, + { + "epoch": 6.699279793450197, + "grad_norm": 8.653459548950195, + "learning_rate": 1.6293823889115371e-06, + "loss": 2.7214, + "step": 98600 + }, + { + "epoch": 6.699619513520859, + "grad_norm": 8.029451370239258, + "learning_rate": 1.6289577388232097e-06, + "loss": 2.8889, + "step": 98605 + }, + { + "epoch": 6.699959233591521, + "grad_norm": 9.364514350891113, + "learning_rate": 1.6285330887348827e-06, + "loss": 2.7551, + "step": 98610 + }, + { + "epoch": 6.700298953662182, + "grad_norm": 7.373682498931885, + "learning_rate": 1.6281084386465553e-06, + "loss": 2.5797, + "step": 98615 + }, + { + "epoch": 6.700638673732844, + "grad_norm": 8.724410057067871, + "learning_rate": 1.6276837885582281e-06, + "loss": 2.8307, + "step": 98620 + }, + { + "epoch": 6.700978393803506, + "grad_norm": 7.807668209075928, + "learning_rate": 1.627259138469901e-06, + "loss": 2.6758, + "step": 98625 + }, + { + "epoch": 6.7013181138741675, + "grad_norm": 6.866488456726074, + "learning_rate": 1.6268344883815737e-06, + "loss": 2.7646, + "step": 98630 + }, + { + "epoch": 6.70165783394483, + "grad_norm": 7.5399956703186035, + "learning_rate": 1.6264098382932463e-06, + "loss": 2.7311, + "step": 98635 + }, + { + "epoch": 6.701997554015492, + "grad_norm": 10.202910423278809, + "learning_rate": 1.6259851882049193e-06, + "loss": 2.9423, + "step": 98640 + }, + { + "epoch": 6.702337274086153, + "grad_norm": 8.186772346496582, + "learning_rate": 1.6255605381165921e-06, + "loss": 2.6938, + "step": 98645 + }, + { + "epoch": 6.702676994156815, + "grad_norm": 7.37224006652832, + "learning_rate": 1.6251358880282647e-06, + "loss": 2.6237, + "step": 98650 + }, + { + "epoch": 6.703016714227477, + "grad_norm": 6.924142360687256, + "learning_rate": 1.6247112379399377e-06, + "loss": 2.6434, + "step": 98655 + }, + { + "epoch": 6.703356434298138, + "grad_norm": 6.930785655975342, + "learning_rate": 1.6242865878516103e-06, + "loss": 2.7815, + "step": 98660 + }, + { + "epoch": 6.7036961543688, + "grad_norm": 7.1640625, + "learning_rate": 1.6238619377632833e-06, + "loss": 2.7446, + "step": 98665 + }, + { + "epoch": 6.704035874439462, + "grad_norm": 7.526279926300049, + "learning_rate": 1.623437287674956e-06, + "loss": 2.7348, + "step": 98670 + }, + { + "epoch": 6.7043755945101235, + "grad_norm": 6.913562297821045, + "learning_rate": 1.6230126375866287e-06, + "loss": 2.6048, + "step": 98675 + }, + { + "epoch": 6.704715314580786, + "grad_norm": 7.269034385681152, + "learning_rate": 1.6225879874983017e-06, + "loss": 2.7667, + "step": 98680 + }, + { + "epoch": 6.705055034651448, + "grad_norm": 5.659295082092285, + "learning_rate": 1.6221633374099743e-06, + "loss": 2.724, + "step": 98685 + }, + { + "epoch": 6.705394754722109, + "grad_norm": 8.260478973388672, + "learning_rate": 1.6217386873216471e-06, + "loss": 2.8266, + "step": 98690 + }, + { + "epoch": 6.705734474792771, + "grad_norm": 8.226606369018555, + "learning_rate": 1.62131403723332e-06, + "loss": 2.7946, + "step": 98695 + }, + { + "epoch": 6.706074194863432, + "grad_norm": 9.450334548950195, + "learning_rate": 1.6208893871449927e-06, + "loss": 2.7155, + "step": 98700 + }, + { + "epoch": 6.706413914934094, + "grad_norm": 10.24116325378418, + "learning_rate": 1.6204647370566653e-06, + "loss": 2.8044, + "step": 98705 + }, + { + "epoch": 6.706753635004756, + "grad_norm": 6.435482978820801, + "learning_rate": 1.6200400869683383e-06, + "loss": 2.8762, + "step": 98710 + }, + { + "epoch": 6.7070933550754175, + "grad_norm": 8.952729225158691, + "learning_rate": 1.619615436880011e-06, + "loss": 2.7804, + "step": 98715 + }, + { + "epoch": 6.70743307514608, + "grad_norm": 7.242732524871826, + "learning_rate": 1.6191907867916837e-06, + "loss": 2.786, + "step": 98720 + }, + { + "epoch": 6.707772795216742, + "grad_norm": 7.91616153717041, + "learning_rate": 1.6187661367033567e-06, + "loss": 2.6023, + "step": 98725 + }, + { + "epoch": 6.708112515287403, + "grad_norm": 9.195215225219727, + "learning_rate": 1.6183414866150293e-06, + "loss": 2.8593, + "step": 98730 + }, + { + "epoch": 6.708452235358065, + "grad_norm": 7.903242588043213, + "learning_rate": 1.617916836526702e-06, + "loss": 2.673, + "step": 98735 + }, + { + "epoch": 6.708791955428727, + "grad_norm": 9.328652381896973, + "learning_rate": 1.617492186438375e-06, + "loss": 2.6045, + "step": 98740 + }, + { + "epoch": 6.709131675499388, + "grad_norm": 6.635259628295898, + "learning_rate": 1.6170675363500477e-06, + "loss": 2.7521, + "step": 98745 + }, + { + "epoch": 6.70947139557005, + "grad_norm": 8.041481018066406, + "learning_rate": 1.6166428862617205e-06, + "loss": 2.8056, + "step": 98750 + }, + { + "epoch": 6.709811115640712, + "grad_norm": 6.861136436462402, + "learning_rate": 1.6162182361733933e-06, + "loss": 3.027, + "step": 98755 + }, + { + "epoch": 6.7101508357113735, + "grad_norm": 8.174510955810547, + "learning_rate": 1.615793586085066e-06, + "loss": 2.7149, + "step": 98760 + }, + { + "epoch": 6.710490555782036, + "grad_norm": 7.960415363311768, + "learning_rate": 1.615368935996739e-06, + "loss": 2.9116, + "step": 98765 + }, + { + "epoch": 6.710830275852698, + "grad_norm": 6.400790691375732, + "learning_rate": 1.6149442859084117e-06, + "loss": 2.9217, + "step": 98770 + }, + { + "epoch": 6.711169995923359, + "grad_norm": 8.266475677490234, + "learning_rate": 1.6145196358200843e-06, + "loss": 2.6952, + "step": 98775 + }, + { + "epoch": 6.711509715994021, + "grad_norm": 7.182483673095703, + "learning_rate": 1.6140949857317573e-06, + "loss": 2.8719, + "step": 98780 + }, + { + "epoch": 6.711849436064683, + "grad_norm": 7.5218586921691895, + "learning_rate": 1.61367033564343e-06, + "loss": 2.748, + "step": 98785 + }, + { + "epoch": 6.712189156135344, + "grad_norm": 8.3903169631958, + "learning_rate": 1.6132456855551027e-06, + "loss": 2.7922, + "step": 98790 + }, + { + "epoch": 6.712528876206006, + "grad_norm": 8.004837036132812, + "learning_rate": 1.6128210354667755e-06, + "loss": 2.8655, + "step": 98795 + }, + { + "epoch": 6.712868596276668, + "grad_norm": 7.73512601852417, + "learning_rate": 1.6123963853784483e-06, + "loss": 2.7308, + "step": 98800 + }, + { + "epoch": 6.7132083163473295, + "grad_norm": 7.807742595672607, + "learning_rate": 1.611971735290121e-06, + "loss": 2.6492, + "step": 98805 + }, + { + "epoch": 6.713548036417992, + "grad_norm": 10.066100120544434, + "learning_rate": 1.611547085201794e-06, + "loss": 2.8012, + "step": 98810 + }, + { + "epoch": 6.713887756488654, + "grad_norm": 8.702032089233398, + "learning_rate": 1.6111224351134665e-06, + "loss": 2.8585, + "step": 98815 + }, + { + "epoch": 6.714227476559315, + "grad_norm": 7.538363456726074, + "learning_rate": 1.6106977850251393e-06, + "loss": 2.8114, + "step": 98820 + }, + { + "epoch": 6.714567196629977, + "grad_norm": 8.075674057006836, + "learning_rate": 1.6102731349368123e-06, + "loss": 2.8781, + "step": 98825 + }, + { + "epoch": 6.714906916700639, + "grad_norm": 6.662252902984619, + "learning_rate": 1.609848484848485e-06, + "loss": 2.7127, + "step": 98830 + }, + { + "epoch": 6.7152466367713, + "grad_norm": 7.347725868225098, + "learning_rate": 1.609423834760158e-06, + "loss": 2.8202, + "step": 98835 + }, + { + "epoch": 6.715586356841962, + "grad_norm": 7.840866565704346, + "learning_rate": 1.6089991846718305e-06, + "loss": 2.7394, + "step": 98840 + }, + { + "epoch": 6.715926076912624, + "grad_norm": 6.76512336730957, + "learning_rate": 1.6085745345835033e-06, + "loss": 2.527, + "step": 98845 + }, + { + "epoch": 6.7162657969832855, + "grad_norm": 8.254493713378906, + "learning_rate": 1.6081498844951763e-06, + "loss": 2.6231, + "step": 98850 + }, + { + "epoch": 6.716605517053948, + "grad_norm": 11.236031532287598, + "learning_rate": 1.607725234406849e-06, + "loss": 2.7913, + "step": 98855 + }, + { + "epoch": 6.71694523712461, + "grad_norm": 7.136048793792725, + "learning_rate": 1.6073005843185215e-06, + "loss": 2.7259, + "step": 98860 + }, + { + "epoch": 6.717284957195271, + "grad_norm": 6.64562463760376, + "learning_rate": 1.6068759342301945e-06, + "loss": 2.9251, + "step": 98865 + }, + { + "epoch": 6.717624677265933, + "grad_norm": 7.667712211608887, + "learning_rate": 1.6064512841418673e-06, + "loss": 2.8765, + "step": 98870 + }, + { + "epoch": 6.717964397336595, + "grad_norm": 6.314569473266602, + "learning_rate": 1.60602663405354e-06, + "loss": 2.5382, + "step": 98875 + }, + { + "epoch": 6.718304117407256, + "grad_norm": 9.386256217956543, + "learning_rate": 1.605601983965213e-06, + "loss": 2.8703, + "step": 98880 + }, + { + "epoch": 6.718643837477918, + "grad_norm": 7.97166633605957, + "learning_rate": 1.6051773338768855e-06, + "loss": 2.5702, + "step": 98885 + }, + { + "epoch": 6.71898355754858, + "grad_norm": 8.138944625854492, + "learning_rate": 1.6047526837885583e-06, + "loss": 2.6868, + "step": 98890 + }, + { + "epoch": 6.7193232776192415, + "grad_norm": 7.231527328491211, + "learning_rate": 1.6043280337002313e-06, + "loss": 2.6016, + "step": 98895 + }, + { + "epoch": 6.719662997689904, + "grad_norm": 8.140182495117188, + "learning_rate": 1.603903383611904e-06, + "loss": 2.7674, + "step": 98900 + }, + { + "epoch": 6.720002717760566, + "grad_norm": 6.603130340576172, + "learning_rate": 1.6034787335235765e-06, + "loss": 2.7601, + "step": 98905 + }, + { + "epoch": 6.720342437831227, + "grad_norm": 7.525303840637207, + "learning_rate": 1.6030540834352495e-06, + "loss": 2.8803, + "step": 98910 + }, + { + "epoch": 6.720682157901889, + "grad_norm": 9.812705039978027, + "learning_rate": 1.6026294333469223e-06, + "loss": 2.6788, + "step": 98915 + }, + { + "epoch": 6.72102187797255, + "grad_norm": 5.977384090423584, + "learning_rate": 1.6022047832585951e-06, + "loss": 2.7336, + "step": 98920 + }, + { + "epoch": 6.721361598043212, + "grad_norm": 7.273551940917969, + "learning_rate": 1.601780133170268e-06, + "loss": 2.8408, + "step": 98925 + }, + { + "epoch": 6.721701318113874, + "grad_norm": 6.298039436340332, + "learning_rate": 1.6013554830819405e-06, + "loss": 2.7475, + "step": 98930 + }, + { + "epoch": 6.7220410381845355, + "grad_norm": 9.092859268188477, + "learning_rate": 1.6009308329936135e-06, + "loss": 3.0027, + "step": 98935 + }, + { + "epoch": 6.7223807582551975, + "grad_norm": 7.658259868621826, + "learning_rate": 1.600506182905286e-06, + "loss": 2.8082, + "step": 98940 + }, + { + "epoch": 6.72272047832586, + "grad_norm": 7.665197849273682, + "learning_rate": 1.600081532816959e-06, + "loss": 2.8035, + "step": 98945 + }, + { + "epoch": 6.723060198396521, + "grad_norm": 7.79249382019043, + "learning_rate": 1.599656882728632e-06, + "loss": 2.7499, + "step": 98950 + }, + { + "epoch": 6.723399918467183, + "grad_norm": 6.642478942871094, + "learning_rate": 1.5992322326403045e-06, + "loss": 2.7604, + "step": 98955 + }, + { + "epoch": 6.723739638537845, + "grad_norm": 7.638674736022949, + "learning_rate": 1.5988075825519773e-06, + "loss": 2.7353, + "step": 98960 + }, + { + "epoch": 6.724079358608506, + "grad_norm": 6.591018199920654, + "learning_rate": 1.59838293246365e-06, + "loss": 2.9334, + "step": 98965 + }, + { + "epoch": 6.724419078679168, + "grad_norm": 8.941815376281738, + "learning_rate": 1.597958282375323e-06, + "loss": 2.8414, + "step": 98970 + }, + { + "epoch": 6.72475879874983, + "grad_norm": 8.23969841003418, + "learning_rate": 1.5975336322869955e-06, + "loss": 2.5823, + "step": 98975 + }, + { + "epoch": 6.7250985188204915, + "grad_norm": 8.239937782287598, + "learning_rate": 1.5971089821986685e-06, + "loss": 2.8321, + "step": 98980 + }, + { + "epoch": 6.7254382388911536, + "grad_norm": 7.011073589324951, + "learning_rate": 1.596684332110341e-06, + "loss": 2.6699, + "step": 98985 + }, + { + "epoch": 6.725777958961816, + "grad_norm": 9.26196002960205, + "learning_rate": 1.596259682022014e-06, + "loss": 2.8651, + "step": 98990 + }, + { + "epoch": 6.726117679032477, + "grad_norm": 7.350743770599365, + "learning_rate": 1.595835031933687e-06, + "loss": 2.9786, + "step": 98995 + }, + { + "epoch": 6.726457399103139, + "grad_norm": 7.2783989906311035, + "learning_rate": 1.5954103818453595e-06, + "loss": 2.78, + "step": 99000 + }, + { + "epoch": 6.726797119173801, + "grad_norm": 6.239356517791748, + "learning_rate": 1.5949857317570325e-06, + "loss": 2.9054, + "step": 99005 + }, + { + "epoch": 6.727136839244462, + "grad_norm": 7.910572052001953, + "learning_rate": 1.594561081668705e-06, + "loss": 2.584, + "step": 99010 + }, + { + "epoch": 6.727476559315124, + "grad_norm": 7.289283752441406, + "learning_rate": 1.594136431580378e-06, + "loss": 2.802, + "step": 99015 + }, + { + "epoch": 6.727816279385786, + "grad_norm": 8.10448932647705, + "learning_rate": 1.5937117814920507e-06, + "loss": 2.8207, + "step": 99020 + }, + { + "epoch": 6.7281559994564475, + "grad_norm": 8.492897033691406, + "learning_rate": 1.5932871314037235e-06, + "loss": 2.9142, + "step": 99025 + }, + { + "epoch": 6.72849571952711, + "grad_norm": 9.312482833862305, + "learning_rate": 1.592862481315396e-06, + "loss": 2.7799, + "step": 99030 + }, + { + "epoch": 6.728835439597772, + "grad_norm": 8.177632331848145, + "learning_rate": 1.592437831227069e-06, + "loss": 2.7382, + "step": 99035 + }, + { + "epoch": 6.729175159668433, + "grad_norm": 8.00464153289795, + "learning_rate": 1.592013181138742e-06, + "loss": 2.7631, + "step": 99040 + }, + { + "epoch": 6.729514879739095, + "grad_norm": 6.560206890106201, + "learning_rate": 1.5915885310504145e-06, + "loss": 2.729, + "step": 99045 + }, + { + "epoch": 6.729854599809757, + "grad_norm": 8.438061714172363, + "learning_rate": 1.5911638809620875e-06, + "loss": 2.7036, + "step": 99050 + }, + { + "epoch": 6.730194319880418, + "grad_norm": 7.222878932952881, + "learning_rate": 1.59073923087376e-06, + "loss": 2.7763, + "step": 99055 + }, + { + "epoch": 6.73053403995108, + "grad_norm": 9.574946403503418, + "learning_rate": 1.590314580785433e-06, + "loss": 2.6585, + "step": 99060 + }, + { + "epoch": 6.730873760021742, + "grad_norm": 8.864625930786133, + "learning_rate": 1.5898899306971057e-06, + "loss": 2.5483, + "step": 99065 + }, + { + "epoch": 6.7312134800924035, + "grad_norm": 5.96672248840332, + "learning_rate": 1.5894652806087785e-06, + "loss": 2.956, + "step": 99070 + }, + { + "epoch": 6.731553200163066, + "grad_norm": 8.27556324005127, + "learning_rate": 1.589040630520451e-06, + "loss": 2.6868, + "step": 99075 + }, + { + "epoch": 6.731892920233728, + "grad_norm": 6.9059648513793945, + "learning_rate": 1.588615980432124e-06, + "loss": 2.7419, + "step": 99080 + }, + { + "epoch": 6.732232640304389, + "grad_norm": 7.208372592926025, + "learning_rate": 1.588191330343797e-06, + "loss": 2.839, + "step": 99085 + }, + { + "epoch": 6.732572360375051, + "grad_norm": 8.733908653259277, + "learning_rate": 1.5877666802554697e-06, + "loss": 2.7779, + "step": 99090 + }, + { + "epoch": 6.732912080445713, + "grad_norm": 8.830954551696777, + "learning_rate": 1.5873420301671425e-06, + "loss": 2.7422, + "step": 99095 + }, + { + "epoch": 6.733251800516374, + "grad_norm": 6.889632701873779, + "learning_rate": 1.586917380078815e-06, + "loss": 2.7349, + "step": 99100 + }, + { + "epoch": 6.733591520587036, + "grad_norm": 7.507079124450684, + "learning_rate": 1.586492729990488e-06, + "loss": 2.4735, + "step": 99105 + }, + { + "epoch": 6.733931240657698, + "grad_norm": 6.236019611358643, + "learning_rate": 1.5860680799021607e-06, + "loss": 2.6532, + "step": 99110 + }, + { + "epoch": 6.7342709607283595, + "grad_norm": 8.561013221740723, + "learning_rate": 1.5856434298138335e-06, + "loss": 2.8049, + "step": 99115 + }, + { + "epoch": 6.734610680799022, + "grad_norm": 8.003301620483398, + "learning_rate": 1.5852187797255065e-06, + "loss": 2.7215, + "step": 99120 + }, + { + "epoch": 6.734950400869684, + "grad_norm": 6.126279830932617, + "learning_rate": 1.584794129637179e-06, + "loss": 2.5731, + "step": 99125 + }, + { + "epoch": 6.735290120940345, + "grad_norm": 6.993357181549072, + "learning_rate": 1.5843694795488517e-06, + "loss": 2.9053, + "step": 99130 + }, + { + "epoch": 6.735629841011007, + "grad_norm": 7.217382907867432, + "learning_rate": 1.5839448294605247e-06, + "loss": 2.553, + "step": 99135 + }, + { + "epoch": 6.735969561081669, + "grad_norm": 6.149299621582031, + "learning_rate": 1.5835201793721975e-06, + "loss": 2.7186, + "step": 99140 + }, + { + "epoch": 6.73630928115233, + "grad_norm": 7.789433002471924, + "learning_rate": 1.58309552928387e-06, + "loss": 2.7461, + "step": 99145 + }, + { + "epoch": 6.736649001222992, + "grad_norm": 6.920262336730957, + "learning_rate": 1.582670879195543e-06, + "loss": 2.7242, + "step": 99150 + }, + { + "epoch": 6.736988721293654, + "grad_norm": 7.933272361755371, + "learning_rate": 1.5822462291072157e-06, + "loss": 2.6799, + "step": 99155 + }, + { + "epoch": 6.7373284413643155, + "grad_norm": 7.214669227600098, + "learning_rate": 1.5818215790188885e-06, + "loss": 2.6033, + "step": 99160 + }, + { + "epoch": 6.737668161434978, + "grad_norm": 6.6606340408325195, + "learning_rate": 1.5813969289305615e-06, + "loss": 2.8187, + "step": 99165 + }, + { + "epoch": 6.73800788150564, + "grad_norm": 7.57672119140625, + "learning_rate": 1.580972278842234e-06, + "loss": 2.6103, + "step": 99170 + }, + { + "epoch": 6.738347601576301, + "grad_norm": 7.058208465576172, + "learning_rate": 1.580547628753907e-06, + "loss": 2.7519, + "step": 99175 + }, + { + "epoch": 6.738687321646963, + "grad_norm": 7.951140880584717, + "learning_rate": 1.5801229786655797e-06, + "loss": 2.778, + "step": 99180 + }, + { + "epoch": 6.739027041717625, + "grad_norm": 7.9763922691345215, + "learning_rate": 1.5796983285772525e-06, + "loss": 2.8301, + "step": 99185 + }, + { + "epoch": 6.739366761788286, + "grad_norm": 8.315508842468262, + "learning_rate": 1.5792736784889253e-06, + "loss": 2.826, + "step": 99190 + }, + { + "epoch": 6.739706481858948, + "grad_norm": 7.433767318725586, + "learning_rate": 1.578849028400598e-06, + "loss": 2.8029, + "step": 99195 + }, + { + "epoch": 6.74004620192961, + "grad_norm": 7.232564926147461, + "learning_rate": 1.5784243783122707e-06, + "loss": 2.8292, + "step": 99200 + }, + { + "epoch": 6.7403859220002715, + "grad_norm": 7.195744514465332, + "learning_rate": 1.5779997282239437e-06, + "loss": 2.9844, + "step": 99205 + }, + { + "epoch": 6.740725642070934, + "grad_norm": 8.510432243347168, + "learning_rate": 1.5775750781356165e-06, + "loss": 2.866, + "step": 99210 + }, + { + "epoch": 6.741065362141596, + "grad_norm": 7.920547008514404, + "learning_rate": 1.577150428047289e-06, + "loss": 2.6987, + "step": 99215 + }, + { + "epoch": 6.741405082212257, + "grad_norm": 7.447322368621826, + "learning_rate": 1.576725777958962e-06, + "loss": 2.8055, + "step": 99220 + }, + { + "epoch": 6.741744802282919, + "grad_norm": 6.967648506164551, + "learning_rate": 1.5763011278706347e-06, + "loss": 2.6914, + "step": 99225 + }, + { + "epoch": 6.742084522353581, + "grad_norm": 8.603375434875488, + "learning_rate": 1.5758764777823075e-06, + "loss": 2.8345, + "step": 99230 + }, + { + "epoch": 6.742424242424242, + "grad_norm": 7.612309455871582, + "learning_rate": 1.5754518276939803e-06, + "loss": 2.7757, + "step": 99235 + }, + { + "epoch": 6.742763962494904, + "grad_norm": 8.610147476196289, + "learning_rate": 1.575027177605653e-06, + "loss": 2.8635, + "step": 99240 + }, + { + "epoch": 6.743103682565566, + "grad_norm": 8.05917739868164, + "learning_rate": 1.5746025275173257e-06, + "loss": 2.5814, + "step": 99245 + }, + { + "epoch": 6.7434434026362275, + "grad_norm": 7.0839715003967285, + "learning_rate": 1.5741778774289987e-06, + "loss": 2.6212, + "step": 99250 + }, + { + "epoch": 6.74378312270689, + "grad_norm": 7.256511211395264, + "learning_rate": 1.5737532273406713e-06, + "loss": 2.7573, + "step": 99255 + }, + { + "epoch": 6.744122842777552, + "grad_norm": 6.978280067443848, + "learning_rate": 1.5733285772523443e-06, + "loss": 2.9171, + "step": 99260 + }, + { + "epoch": 6.744462562848213, + "grad_norm": 8.114102363586426, + "learning_rate": 1.572903927164017e-06, + "loss": 2.5555, + "step": 99265 + }, + { + "epoch": 6.744802282918875, + "grad_norm": 7.435775279998779, + "learning_rate": 1.5724792770756897e-06, + "loss": 2.6592, + "step": 99270 + }, + { + "epoch": 6.745142002989537, + "grad_norm": 8.272127151489258, + "learning_rate": 1.5720546269873627e-06, + "loss": 2.6892, + "step": 99275 + }, + { + "epoch": 6.745481723060198, + "grad_norm": 6.507496356964111, + "learning_rate": 1.5716299768990353e-06, + "loss": 2.8263, + "step": 99280 + }, + { + "epoch": 6.74582144313086, + "grad_norm": 6.720442295074463, + "learning_rate": 1.571205326810708e-06, + "loss": 2.6499, + "step": 99285 + }, + { + "epoch": 6.746161163201522, + "grad_norm": 5.616119384765625, + "learning_rate": 1.570780676722381e-06, + "loss": 2.4983, + "step": 99290 + }, + { + "epoch": 6.746500883272184, + "grad_norm": 8.548194885253906, + "learning_rate": 1.5703560266340537e-06, + "loss": 2.6235, + "step": 99295 + }, + { + "epoch": 6.746840603342846, + "grad_norm": 6.1615777015686035, + "learning_rate": 1.5699313765457263e-06, + "loss": 2.8386, + "step": 99300 + }, + { + "epoch": 6.747180323413508, + "grad_norm": 6.180100917816162, + "learning_rate": 1.5695067264573993e-06, + "loss": 2.836, + "step": 99305 + }, + { + "epoch": 6.747520043484169, + "grad_norm": 8.853775978088379, + "learning_rate": 1.569082076369072e-06, + "loss": 2.5756, + "step": 99310 + }, + { + "epoch": 6.747859763554831, + "grad_norm": 8.433314323425293, + "learning_rate": 1.5686574262807447e-06, + "loss": 2.7056, + "step": 99315 + }, + { + "epoch": 6.748199483625493, + "grad_norm": 6.9847540855407715, + "learning_rate": 1.5682327761924177e-06, + "loss": 2.8058, + "step": 99320 + }, + { + "epoch": 6.748539203696154, + "grad_norm": 7.3052897453308105, + "learning_rate": 1.5678081261040903e-06, + "loss": 2.9106, + "step": 99325 + }, + { + "epoch": 6.748878923766816, + "grad_norm": 8.768317222595215, + "learning_rate": 1.567383476015763e-06, + "loss": 2.8668, + "step": 99330 + }, + { + "epoch": 6.749218643837478, + "grad_norm": 8.346076965332031, + "learning_rate": 1.5669588259274359e-06, + "loss": 2.6314, + "step": 99335 + }, + { + "epoch": 6.74955836390814, + "grad_norm": 8.225126266479492, + "learning_rate": 1.5665341758391087e-06, + "loss": 2.5478, + "step": 99340 + }, + { + "epoch": 6.749898083978802, + "grad_norm": 8.90688419342041, + "learning_rate": 1.5661095257507817e-06, + "loss": 3.0472, + "step": 99345 + }, + { + "epoch": 6.750237804049464, + "grad_norm": 9.093600273132324, + "learning_rate": 1.5656848756624543e-06, + "loss": 2.9087, + "step": 99350 + }, + { + "epoch": 6.750577524120125, + "grad_norm": 7.194469928741455, + "learning_rate": 1.565260225574127e-06, + "loss": 2.6603, + "step": 99355 + }, + { + "epoch": 6.750917244190787, + "grad_norm": 7.139776706695557, + "learning_rate": 1.5648355754857999e-06, + "loss": 2.6222, + "step": 99360 + }, + { + "epoch": 6.751256964261449, + "grad_norm": 6.940005779266357, + "learning_rate": 1.5644109253974727e-06, + "loss": 2.7792, + "step": 99365 + }, + { + "epoch": 6.75159668433211, + "grad_norm": 6.562094211578369, + "learning_rate": 1.5639862753091453e-06, + "loss": 2.8189, + "step": 99370 + }, + { + "epoch": 6.751936404402772, + "grad_norm": 7.886548042297363, + "learning_rate": 1.5635616252208183e-06, + "loss": 2.738, + "step": 99375 + }, + { + "epoch": 6.7522761244734335, + "grad_norm": 8.940587043762207, + "learning_rate": 1.5631369751324909e-06, + "loss": 2.8098, + "step": 99380 + }, + { + "epoch": 6.752615844544096, + "grad_norm": 7.034802436828613, + "learning_rate": 1.5627123250441637e-06, + "loss": 2.8114, + "step": 99385 + }, + { + "epoch": 6.752955564614758, + "grad_norm": 8.880162239074707, + "learning_rate": 1.5622876749558367e-06, + "loss": 2.9237, + "step": 99390 + }, + { + "epoch": 6.753295284685419, + "grad_norm": 7.737538814544678, + "learning_rate": 1.5618630248675093e-06, + "loss": 2.7551, + "step": 99395 + }, + { + "epoch": 6.753635004756081, + "grad_norm": 8.14730453491211, + "learning_rate": 1.561438374779182e-06, + "loss": 2.9248, + "step": 99400 + }, + { + "epoch": 6.753974724826743, + "grad_norm": 8.055213928222656, + "learning_rate": 1.5610137246908549e-06, + "loss": 2.7327, + "step": 99405 + }, + { + "epoch": 6.754314444897404, + "grad_norm": 8.211530685424805, + "learning_rate": 1.5605890746025277e-06, + "loss": 2.7383, + "step": 99410 + }, + { + "epoch": 6.754654164968066, + "grad_norm": 7.183360576629639, + "learning_rate": 1.5601644245142003e-06, + "loss": 2.7484, + "step": 99415 + }, + { + "epoch": 6.754993885038728, + "grad_norm": 11.53103256225586, + "learning_rate": 1.5597397744258733e-06, + "loss": 2.742, + "step": 99420 + }, + { + "epoch": 6.7553336051093895, + "grad_norm": 8.345850944519043, + "learning_rate": 1.5593151243375459e-06, + "loss": 2.8745, + "step": 99425 + }, + { + "epoch": 6.755673325180052, + "grad_norm": 8.04703426361084, + "learning_rate": 1.5588904742492187e-06, + "loss": 2.8255, + "step": 99430 + }, + { + "epoch": 6.756013045250714, + "grad_norm": 7.029872417449951, + "learning_rate": 1.5584658241608917e-06, + "loss": 2.7184, + "step": 99435 + }, + { + "epoch": 6.756352765321375, + "grad_norm": 7.3233771324157715, + "learning_rate": 1.5580411740725643e-06, + "loss": 2.7973, + "step": 99440 + }, + { + "epoch": 6.756692485392037, + "grad_norm": 8.123271942138672, + "learning_rate": 1.5576165239842373e-06, + "loss": 2.9251, + "step": 99445 + }, + { + "epoch": 6.757032205462699, + "grad_norm": 6.323784351348877, + "learning_rate": 1.5571918738959099e-06, + "loss": 2.8766, + "step": 99450 + }, + { + "epoch": 6.75737192553336, + "grad_norm": 5.8793182373046875, + "learning_rate": 1.5567672238075827e-06, + "loss": 2.7336, + "step": 99455 + }, + { + "epoch": 6.757711645604022, + "grad_norm": 7.031970500946045, + "learning_rate": 1.5563425737192555e-06, + "loss": 2.8492, + "step": 99460 + }, + { + "epoch": 6.758051365674684, + "grad_norm": 10.222493171691895, + "learning_rate": 1.5559179236309283e-06, + "loss": 2.687, + "step": 99465 + }, + { + "epoch": 6.7583910857453455, + "grad_norm": 8.540298461914062, + "learning_rate": 1.5554932735426009e-06, + "loss": 2.7417, + "step": 99470 + }, + { + "epoch": 6.758730805816008, + "grad_norm": 7.998124599456787, + "learning_rate": 1.5550686234542739e-06, + "loss": 2.4655, + "step": 99475 + }, + { + "epoch": 6.75907052588667, + "grad_norm": 6.8766655921936035, + "learning_rate": 1.5546439733659467e-06, + "loss": 2.6498, + "step": 99480 + }, + { + "epoch": 6.759410245957331, + "grad_norm": 6.962401866912842, + "learning_rate": 1.5542193232776193e-06, + "loss": 2.9197, + "step": 99485 + }, + { + "epoch": 6.759749966027993, + "grad_norm": 11.530644416809082, + "learning_rate": 1.5537946731892923e-06, + "loss": 2.9851, + "step": 99490 + }, + { + "epoch": 6.760089686098655, + "grad_norm": 6.794754505157471, + "learning_rate": 1.5533700231009649e-06, + "loss": 2.6668, + "step": 99495 + }, + { + "epoch": 6.760429406169316, + "grad_norm": 8.060430526733398, + "learning_rate": 1.5529453730126377e-06, + "loss": 2.7487, + "step": 99500 + }, + { + "epoch": 6.760769126239978, + "grad_norm": 6.26507568359375, + "learning_rate": 1.5525207229243105e-06, + "loss": 2.9067, + "step": 99505 + }, + { + "epoch": 6.76110884631064, + "grad_norm": 7.292962551116943, + "learning_rate": 1.5520960728359833e-06, + "loss": 2.7305, + "step": 99510 + }, + { + "epoch": 6.7614485663813015, + "grad_norm": 7.524231910705566, + "learning_rate": 1.5516714227476559e-06, + "loss": 2.7721, + "step": 99515 + }, + { + "epoch": 6.761788286451964, + "grad_norm": 9.549725532531738, + "learning_rate": 1.5512467726593289e-06, + "loss": 2.6445, + "step": 99520 + }, + { + "epoch": 6.762128006522626, + "grad_norm": 8.174299240112305, + "learning_rate": 1.5508221225710015e-06, + "loss": 2.8567, + "step": 99525 + }, + { + "epoch": 6.762467726593287, + "grad_norm": 8.524654388427734, + "learning_rate": 1.5503974724826745e-06, + "loss": 2.8045, + "step": 99530 + }, + { + "epoch": 6.762807446663949, + "grad_norm": 8.696467399597168, + "learning_rate": 1.5499728223943473e-06, + "loss": 2.9832, + "step": 99535 + }, + { + "epoch": 6.763147166734611, + "grad_norm": 7.7822489738464355, + "learning_rate": 1.5495481723060199e-06, + "loss": 2.9353, + "step": 99540 + }, + { + "epoch": 6.763486886805272, + "grad_norm": 7.600208759307861, + "learning_rate": 1.5491235222176929e-06, + "loss": 2.6368, + "step": 99545 + }, + { + "epoch": 6.763826606875934, + "grad_norm": 8.553905487060547, + "learning_rate": 1.5486988721293655e-06, + "loss": 2.5983, + "step": 99550 + }, + { + "epoch": 6.764166326946596, + "grad_norm": 6.818842887878418, + "learning_rate": 1.5482742220410383e-06, + "loss": 2.7238, + "step": 99555 + }, + { + "epoch": 6.7645060470172576, + "grad_norm": 8.031900405883789, + "learning_rate": 1.5478495719527113e-06, + "loss": 2.4293, + "step": 99560 + }, + { + "epoch": 6.76484576708792, + "grad_norm": 6.735208034515381, + "learning_rate": 1.5474249218643839e-06, + "loss": 2.9314, + "step": 99565 + }, + { + "epoch": 6.765185487158582, + "grad_norm": 8.014544486999512, + "learning_rate": 1.5470002717760565e-06, + "loss": 2.6681, + "step": 99570 + }, + { + "epoch": 6.765525207229243, + "grad_norm": 6.734002590179443, + "learning_rate": 1.5465756216877295e-06, + "loss": 2.8892, + "step": 99575 + }, + { + "epoch": 6.765864927299905, + "grad_norm": 8.42762565612793, + "learning_rate": 1.5461509715994023e-06, + "loss": 2.7338, + "step": 99580 + }, + { + "epoch": 6.766204647370567, + "grad_norm": 8.658708572387695, + "learning_rate": 1.5457263215110749e-06, + "loss": 2.5571, + "step": 99585 + }, + { + "epoch": 6.766544367441228, + "grad_norm": 8.00947380065918, + "learning_rate": 1.5453016714227479e-06, + "loss": 2.6066, + "step": 99590 + }, + { + "epoch": 6.76688408751189, + "grad_norm": 8.884808540344238, + "learning_rate": 1.5448770213344205e-06, + "loss": 2.9158, + "step": 99595 + }, + { + "epoch": 6.7672238075825515, + "grad_norm": 7.935183525085449, + "learning_rate": 1.5444523712460933e-06, + "loss": 2.8744, + "step": 99600 + }, + { + "epoch": 6.767563527653214, + "grad_norm": 6.784822940826416, + "learning_rate": 1.5440277211577663e-06, + "loss": 2.6436, + "step": 99605 + }, + { + "epoch": 6.767903247723876, + "grad_norm": 7.486087799072266, + "learning_rate": 1.5436030710694389e-06, + "loss": 2.6597, + "step": 99610 + }, + { + "epoch": 6.768242967794537, + "grad_norm": 11.173824310302734, + "learning_rate": 1.5431784209811119e-06, + "loss": 2.8772, + "step": 99615 + }, + { + "epoch": 6.768582687865199, + "grad_norm": 9.385327339172363, + "learning_rate": 1.5427537708927845e-06, + "loss": 2.6679, + "step": 99620 + }, + { + "epoch": 6.768922407935861, + "grad_norm": 6.336228370666504, + "learning_rate": 1.5423291208044573e-06, + "loss": 2.5564, + "step": 99625 + }, + { + "epoch": 6.769262128006522, + "grad_norm": 9.306472778320312, + "learning_rate": 1.54190447071613e-06, + "loss": 2.8041, + "step": 99630 + }, + { + "epoch": 6.769601848077184, + "grad_norm": 6.663817882537842, + "learning_rate": 1.5414798206278029e-06, + "loss": 2.6631, + "step": 99635 + }, + { + "epoch": 6.769941568147846, + "grad_norm": 6.788413047790527, + "learning_rate": 1.5410551705394755e-06, + "loss": 2.646, + "step": 99640 + }, + { + "epoch": 6.7702812882185075, + "grad_norm": 8.38353157043457, + "learning_rate": 1.5406305204511485e-06, + "loss": 2.6768, + "step": 99645 + }, + { + "epoch": 6.77062100828917, + "grad_norm": 6.688623428344727, + "learning_rate": 1.540205870362821e-06, + "loss": 2.9089, + "step": 99650 + }, + { + "epoch": 6.770960728359832, + "grad_norm": 7.8634796142578125, + "learning_rate": 1.5397812202744939e-06, + "loss": 2.7544, + "step": 99655 + }, + { + "epoch": 6.771300448430493, + "grad_norm": 8.187458992004395, + "learning_rate": 1.5393565701861669e-06, + "loss": 2.8676, + "step": 99660 + }, + { + "epoch": 6.771640168501155, + "grad_norm": 6.034658908843994, + "learning_rate": 1.5389319200978395e-06, + "loss": 2.4869, + "step": 99665 + }, + { + "epoch": 6.771979888571817, + "grad_norm": 6.8296709060668945, + "learning_rate": 1.5385072700095123e-06, + "loss": 2.7242, + "step": 99670 + }, + { + "epoch": 6.772319608642478, + "grad_norm": 10.111144065856934, + "learning_rate": 1.538082619921185e-06, + "loss": 2.857, + "step": 99675 + }, + { + "epoch": 6.77265932871314, + "grad_norm": 7.022618293762207, + "learning_rate": 1.5376579698328579e-06, + "loss": 2.759, + "step": 99680 + }, + { + "epoch": 6.772999048783802, + "grad_norm": 6.911999225616455, + "learning_rate": 1.5372333197445305e-06, + "loss": 2.7638, + "step": 99685 + }, + { + "epoch": 6.7733387688544635, + "grad_norm": 8.206948280334473, + "learning_rate": 1.5368086696562035e-06, + "loss": 2.7945, + "step": 99690 + }, + { + "epoch": 6.773678488925126, + "grad_norm": 8.522308349609375, + "learning_rate": 1.536384019567876e-06, + "loss": 2.7831, + "step": 99695 + }, + { + "epoch": 6.774018208995788, + "grad_norm": 6.176642417907715, + "learning_rate": 1.535959369479549e-06, + "loss": 2.7803, + "step": 99700 + }, + { + "epoch": 6.774357929066449, + "grad_norm": 6.929075717926025, + "learning_rate": 1.5355347193912219e-06, + "loss": 2.9104, + "step": 99705 + }, + { + "epoch": 6.774697649137111, + "grad_norm": 9.428079605102539, + "learning_rate": 1.5351100693028945e-06, + "loss": 2.6107, + "step": 99710 + }, + { + "epoch": 6.775037369207773, + "grad_norm": 7.833496570587158, + "learning_rate": 1.5346854192145675e-06, + "loss": 2.6022, + "step": 99715 + }, + { + "epoch": 6.775377089278434, + "grad_norm": 5.9922308921813965, + "learning_rate": 1.53426076912624e-06, + "loss": 2.5876, + "step": 99720 + }, + { + "epoch": 6.775716809349096, + "grad_norm": 7.628636837005615, + "learning_rate": 1.5338361190379129e-06, + "loss": 2.7206, + "step": 99725 + }, + { + "epoch": 6.776056529419758, + "grad_norm": 8.645155906677246, + "learning_rate": 1.5334114689495857e-06, + "loss": 2.9107, + "step": 99730 + }, + { + "epoch": 6.7763962494904195, + "grad_norm": 8.761368751525879, + "learning_rate": 1.5329868188612585e-06, + "loss": 2.8228, + "step": 99735 + }, + { + "epoch": 6.776735969561082, + "grad_norm": 7.331550121307373, + "learning_rate": 1.532562168772931e-06, + "loss": 2.4924, + "step": 99740 + }, + { + "epoch": 6.777075689631744, + "grad_norm": 6.510809898376465, + "learning_rate": 1.532137518684604e-06, + "loss": 2.3941, + "step": 99745 + }, + { + "epoch": 6.777415409702405, + "grad_norm": 8.322365760803223, + "learning_rate": 1.5317128685962769e-06, + "loss": 2.4786, + "step": 99750 + }, + { + "epoch": 6.777755129773067, + "grad_norm": 5.874312877655029, + "learning_rate": 1.5312882185079494e-06, + "loss": 2.7682, + "step": 99755 + }, + { + "epoch": 6.778094849843729, + "grad_norm": 8.275398254394531, + "learning_rate": 1.5308635684196225e-06, + "loss": 2.6684, + "step": 99760 + }, + { + "epoch": 6.77843456991439, + "grad_norm": 6.813601493835449, + "learning_rate": 1.530438918331295e-06, + "loss": 2.7604, + "step": 99765 + }, + { + "epoch": 6.778774289985052, + "grad_norm": 9.754210472106934, + "learning_rate": 1.5300142682429679e-06, + "loss": 2.6125, + "step": 99770 + }, + { + "epoch": 6.779114010055714, + "grad_norm": 8.814371109008789, + "learning_rate": 1.5295896181546407e-06, + "loss": 2.6202, + "step": 99775 + }, + { + "epoch": 6.7794537301263755, + "grad_norm": 7.278542518615723, + "learning_rate": 1.5291649680663135e-06, + "loss": 2.7091, + "step": 99780 + }, + { + "epoch": 6.779793450197038, + "grad_norm": 8.501009941101074, + "learning_rate": 1.5287403179779865e-06, + "loss": 2.6681, + "step": 99785 + }, + { + "epoch": 6.7801331702677, + "grad_norm": 8.167764663696289, + "learning_rate": 1.528315667889659e-06, + "loss": 2.8062, + "step": 99790 + }, + { + "epoch": 6.780472890338361, + "grad_norm": 8.306682586669922, + "learning_rate": 1.5278910178013319e-06, + "loss": 2.9438, + "step": 99795 + }, + { + "epoch": 6.780812610409023, + "grad_norm": 7.18358850479126, + "learning_rate": 1.5274663677130047e-06, + "loss": 2.7476, + "step": 99800 + }, + { + "epoch": 6.781152330479685, + "grad_norm": 9.109997749328613, + "learning_rate": 1.5270417176246775e-06, + "loss": 2.7614, + "step": 99805 + }, + { + "epoch": 6.781492050550346, + "grad_norm": 7.067588806152344, + "learning_rate": 1.52661706753635e-06, + "loss": 2.598, + "step": 99810 + }, + { + "epoch": 6.781831770621008, + "grad_norm": 8.333213806152344, + "learning_rate": 1.526192417448023e-06, + "loss": 2.6166, + "step": 99815 + }, + { + "epoch": 6.78217149069167, + "grad_norm": 8.345401763916016, + "learning_rate": 1.5257677673596956e-06, + "loss": 2.9238, + "step": 99820 + }, + { + "epoch": 6.7825112107623315, + "grad_norm": 8.133475303649902, + "learning_rate": 1.5253431172713684e-06, + "loss": 2.7263, + "step": 99825 + }, + { + "epoch": 6.782850930832994, + "grad_norm": 7.769392013549805, + "learning_rate": 1.5249184671830415e-06, + "loss": 2.6864, + "step": 99830 + }, + { + "epoch": 6.783190650903656, + "grad_norm": 7.034060001373291, + "learning_rate": 1.524493817094714e-06, + "loss": 2.7437, + "step": 99835 + }, + { + "epoch": 6.783530370974317, + "grad_norm": 7.203169822692871, + "learning_rate": 1.5240691670063866e-06, + "loss": 2.8117, + "step": 99840 + }, + { + "epoch": 6.783870091044979, + "grad_norm": 6.344126224517822, + "learning_rate": 1.5236445169180597e-06, + "loss": 2.7013, + "step": 99845 + }, + { + "epoch": 6.784209811115641, + "grad_norm": 7.815637111663818, + "learning_rate": 1.5232198668297325e-06, + "loss": 2.8141, + "step": 99850 + }, + { + "epoch": 6.784549531186302, + "grad_norm": 6.435030460357666, + "learning_rate": 1.522795216741405e-06, + "loss": 2.7652, + "step": 99855 + }, + { + "epoch": 6.784889251256964, + "grad_norm": 7.777980327606201, + "learning_rate": 1.522370566653078e-06, + "loss": 2.7028, + "step": 99860 + }, + { + "epoch": 6.785228971327626, + "grad_norm": 7.545733451843262, + "learning_rate": 1.5219459165647506e-06, + "loss": 2.8567, + "step": 99865 + }, + { + "epoch": 6.7855686913982876, + "grad_norm": 12.059737205505371, + "learning_rate": 1.5215212664764237e-06, + "loss": 2.6952, + "step": 99870 + }, + { + "epoch": 6.78590841146895, + "grad_norm": 8.721807479858398, + "learning_rate": 1.5210966163880965e-06, + "loss": 2.8588, + "step": 99875 + }, + { + "epoch": 6.786248131539612, + "grad_norm": 8.016009330749512, + "learning_rate": 1.520671966299769e-06, + "loss": 2.8284, + "step": 99880 + }, + { + "epoch": 6.786587851610273, + "grad_norm": 8.005661010742188, + "learning_rate": 1.520247316211442e-06, + "loss": 2.689, + "step": 99885 + }, + { + "epoch": 6.786927571680935, + "grad_norm": 7.5203447341918945, + "learning_rate": 1.5198226661231146e-06, + "loss": 2.798, + "step": 99890 + }, + { + "epoch": 6.787267291751597, + "grad_norm": 6.98354434967041, + "learning_rate": 1.5193980160347874e-06, + "loss": 3.0534, + "step": 99895 + }, + { + "epoch": 6.787607011822258, + "grad_norm": 8.352424621582031, + "learning_rate": 1.5189733659464602e-06, + "loss": 2.7876, + "step": 99900 + }, + { + "epoch": 6.78794673189292, + "grad_norm": 6.503790855407715, + "learning_rate": 1.518548715858133e-06, + "loss": 2.7715, + "step": 99905 + }, + { + "epoch": 6.788286451963582, + "grad_norm": 7.362179279327393, + "learning_rate": 1.5181240657698056e-06, + "loss": 2.9269, + "step": 99910 + }, + { + "epoch": 6.788626172034244, + "grad_norm": 8.73366641998291, + "learning_rate": 1.5176994156814787e-06, + "loss": 2.5713, + "step": 99915 + }, + { + "epoch": 6.788965892104906, + "grad_norm": 7.423530578613281, + "learning_rate": 1.5172747655931512e-06, + "loss": 2.921, + "step": 99920 + }, + { + "epoch": 6.789305612175568, + "grad_norm": 9.344599723815918, + "learning_rate": 1.516850115504824e-06, + "loss": 2.5455, + "step": 99925 + }, + { + "epoch": 6.789645332246229, + "grad_norm": 5.8861589431762695, + "learning_rate": 1.516425465416497e-06, + "loss": 2.7408, + "step": 99930 + }, + { + "epoch": 6.789985052316891, + "grad_norm": 7.138103485107422, + "learning_rate": 1.5160008153281696e-06, + "loss": 2.7124, + "step": 99935 + }, + { + "epoch": 6.790324772387553, + "grad_norm": 8.62389087677002, + "learning_rate": 1.5155761652398424e-06, + "loss": 2.7784, + "step": 99940 + }, + { + "epoch": 6.790664492458214, + "grad_norm": 8.092501640319824, + "learning_rate": 1.5151515151515152e-06, + "loss": 2.1819, + "step": 99945 + }, + { + "epoch": 6.791004212528876, + "grad_norm": 8.89443302154541, + "learning_rate": 1.514726865063188e-06, + "loss": 2.6836, + "step": 99950 + }, + { + "epoch": 6.791343932599538, + "grad_norm": 8.065990447998047, + "learning_rate": 1.514302214974861e-06, + "loss": 2.9248, + "step": 99955 + }, + { + "epoch": 6.7916836526702, + "grad_norm": 8.497373580932617, + "learning_rate": 1.5138775648865336e-06, + "loss": 2.901, + "step": 99960 + }, + { + "epoch": 6.792023372740862, + "grad_norm": 8.095375061035156, + "learning_rate": 1.5134529147982062e-06, + "loss": 2.606, + "step": 99965 + }, + { + "epoch": 6.792363092811524, + "grad_norm": 8.80080795288086, + "learning_rate": 1.5130282647098792e-06, + "loss": 2.738, + "step": 99970 + }, + { + "epoch": 6.792702812882185, + "grad_norm": 7.756475448608398, + "learning_rate": 1.512603614621552e-06, + "loss": 2.7252, + "step": 99975 + }, + { + "epoch": 6.793042532952847, + "grad_norm": 8.868678092956543, + "learning_rate": 1.5121789645332246e-06, + "loss": 2.6755, + "step": 99980 + }, + { + "epoch": 6.793382253023509, + "grad_norm": 6.356509685516357, + "learning_rate": 1.5117543144448976e-06, + "loss": 2.7521, + "step": 99985 + }, + { + "epoch": 6.79372197309417, + "grad_norm": 9.514041900634766, + "learning_rate": 1.5113296643565702e-06, + "loss": 2.6722, + "step": 99990 + }, + { + "epoch": 6.794061693164832, + "grad_norm": 9.39490795135498, + "learning_rate": 1.510905014268243e-06, + "loss": 2.5263, + "step": 99995 + }, + { + "epoch": 6.794401413235494, + "grad_norm": 8.280467987060547, + "learning_rate": 1.510480364179916e-06, + "loss": 2.9416, + "step": 100000 + }, + { + "epoch": 6.794741133306156, + "grad_norm": 8.200018882751465, + "learning_rate": 1.5100557140915886e-06, + "loss": 2.8221, + "step": 100005 + }, + { + "epoch": 6.795080853376818, + "grad_norm": 8.763875961303711, + "learning_rate": 1.5096310640032612e-06, + "loss": 2.5985, + "step": 100010 + }, + { + "epoch": 6.79542057344748, + "grad_norm": 8.720273971557617, + "learning_rate": 1.5092064139149342e-06, + "loss": 2.9544, + "step": 100015 + }, + { + "epoch": 6.795760293518141, + "grad_norm": 8.567846298217773, + "learning_rate": 1.508781763826607e-06, + "loss": 2.7097, + "step": 100020 + }, + { + "epoch": 6.796100013588803, + "grad_norm": 6.280730247497559, + "learning_rate": 1.5083571137382796e-06, + "loss": 2.8315, + "step": 100025 + }, + { + "epoch": 6.796439733659465, + "grad_norm": 7.213073253631592, + "learning_rate": 1.5079324636499526e-06, + "loss": 2.7234, + "step": 100030 + }, + { + "epoch": 6.796779453730126, + "grad_norm": 8.145968437194824, + "learning_rate": 1.5075078135616252e-06, + "loss": 2.8034, + "step": 100035 + }, + { + "epoch": 6.797119173800788, + "grad_norm": 8.258977890014648, + "learning_rate": 1.5070831634732982e-06, + "loss": 2.4844, + "step": 100040 + }, + { + "epoch": 6.79745889387145, + "grad_norm": 6.4941301345825195, + "learning_rate": 1.5066585133849708e-06, + "loss": 2.7034, + "step": 100045 + }, + { + "epoch": 6.797798613942112, + "grad_norm": 7.50999641418457, + "learning_rate": 1.5062338632966436e-06, + "loss": 2.7974, + "step": 100050 + }, + { + "epoch": 6.798138334012774, + "grad_norm": 7.3644585609436035, + "learning_rate": 1.5058092132083166e-06, + "loss": 2.769, + "step": 100055 + }, + { + "epoch": 6.798478054083435, + "grad_norm": 9.53225326538086, + "learning_rate": 1.5053845631199892e-06, + "loss": 2.7955, + "step": 100060 + }, + { + "epoch": 6.798817774154097, + "grad_norm": 6.831931114196777, + "learning_rate": 1.504959913031662e-06, + "loss": 2.9426, + "step": 100065 + }, + { + "epoch": 6.799157494224759, + "grad_norm": 6.9576287269592285, + "learning_rate": 1.5045352629433348e-06, + "loss": 2.8552, + "step": 100070 + }, + { + "epoch": 6.79949721429542, + "grad_norm": 9.022992134094238, + "learning_rate": 1.5041106128550076e-06, + "loss": 2.5046, + "step": 100075 + }, + { + "epoch": 6.799836934366082, + "grad_norm": 9.299979209899902, + "learning_rate": 1.5036859627666802e-06, + "loss": 2.6872, + "step": 100080 + }, + { + "epoch": 6.800176654436744, + "grad_norm": 7.797939300537109, + "learning_rate": 1.5032613126783532e-06, + "loss": 2.7142, + "step": 100085 + }, + { + "epoch": 6.8005163745074055, + "grad_norm": 8.120092391967773, + "learning_rate": 1.5028366625900258e-06, + "loss": 2.6837, + "step": 100090 + }, + { + "epoch": 6.800856094578068, + "grad_norm": 5.9335126876831055, + "learning_rate": 1.5024120125016986e-06, + "loss": 2.8092, + "step": 100095 + }, + { + "epoch": 6.80119581464873, + "grad_norm": 8.371817588806152, + "learning_rate": 1.5019873624133716e-06, + "loss": 2.7135, + "step": 100100 + }, + { + "epoch": 6.801535534719391, + "grad_norm": 7.153759956359863, + "learning_rate": 1.5015627123250442e-06, + "loss": 2.87, + "step": 100105 + }, + { + "epoch": 6.801875254790053, + "grad_norm": 9.678121566772461, + "learning_rate": 1.501138062236717e-06, + "loss": 2.5163, + "step": 100110 + }, + { + "epoch": 6.802214974860715, + "grad_norm": 8.035100936889648, + "learning_rate": 1.5007134121483898e-06, + "loss": 2.9723, + "step": 100115 + }, + { + "epoch": 6.802554694931376, + "grad_norm": 8.471980094909668, + "learning_rate": 1.5002887620600626e-06, + "loss": 2.5407, + "step": 100120 + }, + { + "epoch": 6.802894415002038, + "grad_norm": 8.405136108398438, + "learning_rate": 1.4998641119717354e-06, + "loss": 2.9334, + "step": 100125 + }, + { + "epoch": 6.8032341350727, + "grad_norm": 7.450535297393799, + "learning_rate": 1.4994394618834082e-06, + "loss": 2.7516, + "step": 100130 + }, + { + "epoch": 6.8035738551433615, + "grad_norm": 7.887058258056641, + "learning_rate": 1.4990148117950808e-06, + "loss": 2.7571, + "step": 100135 + }, + { + "epoch": 6.803913575214024, + "grad_norm": 7.191063404083252, + "learning_rate": 1.4985901617067538e-06, + "loss": 2.9342, + "step": 100140 + }, + { + "epoch": 6.804253295284686, + "grad_norm": 6.817574501037598, + "learning_rate": 1.4981655116184266e-06, + "loss": 2.6897, + "step": 100145 + }, + { + "epoch": 6.804593015355347, + "grad_norm": 9.34835433959961, + "learning_rate": 1.4977408615300992e-06, + "loss": 2.8761, + "step": 100150 + }, + { + "epoch": 6.804932735426009, + "grad_norm": 7.998344898223877, + "learning_rate": 1.4973162114417722e-06, + "loss": 2.9146, + "step": 100155 + }, + { + "epoch": 6.805272455496671, + "grad_norm": 7.1688103675842285, + "learning_rate": 1.4968915613534448e-06, + "loss": 2.556, + "step": 100160 + }, + { + "epoch": 6.805612175567332, + "grad_norm": 8.531310081481934, + "learning_rate": 1.4964669112651176e-06, + "loss": 2.8862, + "step": 100165 + }, + { + "epoch": 6.805951895637994, + "grad_norm": 8.115538597106934, + "learning_rate": 1.4960422611767904e-06, + "loss": 2.5606, + "step": 100170 + }, + { + "epoch": 6.806291615708656, + "grad_norm": 7.717208385467529, + "learning_rate": 1.4956176110884632e-06, + "loss": 2.751, + "step": 100175 + }, + { + "epoch": 6.806631335779318, + "grad_norm": 7.585662841796875, + "learning_rate": 1.4951929610001358e-06, + "loss": 2.7414, + "step": 100180 + }, + { + "epoch": 6.80697105584998, + "grad_norm": 8.388456344604492, + "learning_rate": 1.4947683109118088e-06, + "loss": 3.0191, + "step": 100185 + }, + { + "epoch": 6.807310775920642, + "grad_norm": 7.342506408691406, + "learning_rate": 1.4943436608234816e-06, + "loss": 2.7146, + "step": 100190 + }, + { + "epoch": 6.807650495991303, + "grad_norm": 10.739118576049805, + "learning_rate": 1.4939190107351542e-06, + "loss": 2.9712, + "step": 100195 + }, + { + "epoch": 6.807990216061965, + "grad_norm": 9.889180183410645, + "learning_rate": 1.4934943606468272e-06, + "loss": 2.707, + "step": 100200 + }, + { + "epoch": 6.808329936132627, + "grad_norm": 7.833818435668945, + "learning_rate": 1.4930697105584998e-06, + "loss": 2.5269, + "step": 100205 + }, + { + "epoch": 6.808669656203288, + "grad_norm": 7.987654685974121, + "learning_rate": 1.4926450604701728e-06, + "loss": 2.7211, + "step": 100210 + }, + { + "epoch": 6.80900937627395, + "grad_norm": 8.350715637207031, + "learning_rate": 1.4922204103818454e-06, + "loss": 2.7492, + "step": 100215 + }, + { + "epoch": 6.809349096344612, + "grad_norm": 7.755645751953125, + "learning_rate": 1.4917957602935182e-06, + "loss": 2.7768, + "step": 100220 + }, + { + "epoch": 6.809688816415274, + "grad_norm": 6.586424827575684, + "learning_rate": 1.4913711102051912e-06, + "loss": 2.6697, + "step": 100225 + }, + { + "epoch": 6.810028536485936, + "grad_norm": 9.618992805480957, + "learning_rate": 1.4909464601168638e-06, + "loss": 2.6816, + "step": 100230 + }, + { + "epoch": 6.810368256556598, + "grad_norm": 10.84060287475586, + "learning_rate": 1.4905218100285364e-06, + "loss": 2.869, + "step": 100235 + }, + { + "epoch": 6.810707976627259, + "grad_norm": 7.603799343109131, + "learning_rate": 1.4900971599402094e-06, + "loss": 2.7017, + "step": 100240 + }, + { + "epoch": 6.811047696697921, + "grad_norm": 6.953880786895752, + "learning_rate": 1.4896725098518822e-06, + "loss": 2.9406, + "step": 100245 + }, + { + "epoch": 6.811387416768583, + "grad_norm": 11.16911506652832, + "learning_rate": 1.4892478597635548e-06, + "loss": 2.526, + "step": 100250 + }, + { + "epoch": 6.811727136839244, + "grad_norm": 7.87492561340332, + "learning_rate": 1.4888232096752278e-06, + "loss": 2.6193, + "step": 100255 + }, + { + "epoch": 6.812066856909906, + "grad_norm": 9.743950843811035, + "learning_rate": 1.4883985595869004e-06, + "loss": 2.5943, + "step": 100260 + }, + { + "epoch": 6.812406576980568, + "grad_norm": 8.269951820373535, + "learning_rate": 1.4879739094985732e-06, + "loss": 2.9045, + "step": 100265 + }, + { + "epoch": 6.81274629705123, + "grad_norm": 7.8816914558410645, + "learning_rate": 1.4875492594102462e-06, + "loss": 2.6909, + "step": 100270 + }, + { + "epoch": 6.813086017121892, + "grad_norm": 9.724018096923828, + "learning_rate": 1.4871246093219188e-06, + "loss": 2.8949, + "step": 100275 + }, + { + "epoch": 6.813425737192553, + "grad_norm": 10.174015045166016, + "learning_rate": 1.4866999592335914e-06, + "loss": 2.5888, + "step": 100280 + }, + { + "epoch": 6.813765457263215, + "grad_norm": 8.763854026794434, + "learning_rate": 1.4862753091452644e-06, + "loss": 2.5631, + "step": 100285 + }, + { + "epoch": 6.814105177333877, + "grad_norm": 7.036227226257324, + "learning_rate": 1.4858506590569372e-06, + "loss": 2.6044, + "step": 100290 + }, + { + "epoch": 6.814444897404538, + "grad_norm": 7.21997594833374, + "learning_rate": 1.48542600896861e-06, + "loss": 2.6615, + "step": 100295 + }, + { + "epoch": 6.8147846174752, + "grad_norm": 6.817104816436768, + "learning_rate": 1.4850013588802828e-06, + "loss": 2.9242, + "step": 100300 + }, + { + "epoch": 6.815124337545862, + "grad_norm": 10.522254943847656, + "learning_rate": 1.4845767087919554e-06, + "loss": 2.2784, + "step": 100305 + }, + { + "epoch": 6.8154640576165235, + "grad_norm": 6.807160377502441, + "learning_rate": 1.4841520587036284e-06, + "loss": 2.7421, + "step": 100310 + }, + { + "epoch": 6.815803777687186, + "grad_norm": 6.508237838745117, + "learning_rate": 1.483727408615301e-06, + "loss": 2.7434, + "step": 100315 + }, + { + "epoch": 6.816143497757848, + "grad_norm": 7.257586479187012, + "learning_rate": 1.4833027585269738e-06, + "loss": 2.5471, + "step": 100320 + }, + { + "epoch": 6.816483217828509, + "grad_norm": 8.79420280456543, + "learning_rate": 1.4828781084386468e-06, + "loss": 2.7936, + "step": 100325 + }, + { + "epoch": 6.816822937899171, + "grad_norm": 7.644285202026367, + "learning_rate": 1.4824534583503194e-06, + "loss": 2.6382, + "step": 100330 + }, + { + "epoch": 6.817162657969833, + "grad_norm": 9.244316101074219, + "learning_rate": 1.4820288082619922e-06, + "loss": 2.7881, + "step": 100335 + }, + { + "epoch": 6.817502378040494, + "grad_norm": 9.905659675598145, + "learning_rate": 1.481604158173665e-06, + "loss": 2.8156, + "step": 100340 + }, + { + "epoch": 6.817842098111156, + "grad_norm": 9.385675430297852, + "learning_rate": 1.4811795080853378e-06, + "loss": 2.5848, + "step": 100345 + }, + { + "epoch": 6.818181818181818, + "grad_norm": 7.25966739654541, + "learning_rate": 1.4807548579970104e-06, + "loss": 2.8121, + "step": 100350 + }, + { + "epoch": 6.8185215382524795, + "grad_norm": 7.544384002685547, + "learning_rate": 1.4803302079086834e-06, + "loss": 2.7068, + "step": 100355 + }, + { + "epoch": 6.818861258323142, + "grad_norm": 6.808858394622803, + "learning_rate": 1.479905557820356e-06, + "loss": 2.9021, + "step": 100360 + }, + { + "epoch": 6.819200978393804, + "grad_norm": 6.377962112426758, + "learning_rate": 1.4794809077320288e-06, + "loss": 2.9207, + "step": 100365 + }, + { + "epoch": 6.819540698464465, + "grad_norm": 9.198530197143555, + "learning_rate": 1.4790562576437018e-06, + "loss": 2.7798, + "step": 100370 + }, + { + "epoch": 6.819880418535127, + "grad_norm": 10.35055160522461, + "learning_rate": 1.4786316075553744e-06, + "loss": 2.5423, + "step": 100375 + }, + { + "epoch": 6.820220138605789, + "grad_norm": 9.6407470703125, + "learning_rate": 1.4782069574670474e-06, + "loss": 2.909, + "step": 100380 + }, + { + "epoch": 6.82055985867645, + "grad_norm": 7.753232479095459, + "learning_rate": 1.47778230737872e-06, + "loss": 2.7227, + "step": 100385 + }, + { + "epoch": 6.820899578747112, + "grad_norm": 7.646531105041504, + "learning_rate": 1.4773576572903928e-06, + "loss": 2.7511, + "step": 100390 + }, + { + "epoch": 6.821239298817774, + "grad_norm": 7.014517784118652, + "learning_rate": 1.4769330072020658e-06, + "loss": 2.8933, + "step": 100395 + }, + { + "epoch": 6.8215790188884355, + "grad_norm": 7.096541404724121, + "learning_rate": 1.4765083571137384e-06, + "loss": 2.761, + "step": 100400 + }, + { + "epoch": 6.821918738959098, + "grad_norm": 8.210748672485352, + "learning_rate": 1.476083707025411e-06, + "loss": 2.7615, + "step": 100405 + }, + { + "epoch": 6.82225845902976, + "grad_norm": 8.706738471984863, + "learning_rate": 1.475659056937084e-06, + "loss": 2.782, + "step": 100410 + }, + { + "epoch": 6.822598179100421, + "grad_norm": 6.300721645355225, + "learning_rate": 1.4752344068487568e-06, + "loss": 2.8686, + "step": 100415 + }, + { + "epoch": 6.822937899171083, + "grad_norm": 7.418874263763428, + "learning_rate": 1.4748097567604294e-06, + "loss": 2.796, + "step": 100420 + }, + { + "epoch": 6.823277619241745, + "grad_norm": 6.71877908706665, + "learning_rate": 1.4743851066721024e-06, + "loss": 2.671, + "step": 100425 + }, + { + "epoch": 6.823617339312406, + "grad_norm": 8.343009948730469, + "learning_rate": 1.473960456583775e-06, + "loss": 2.7638, + "step": 100430 + }, + { + "epoch": 6.823957059383068, + "grad_norm": 6.573097229003906, + "learning_rate": 1.4735358064954478e-06, + "loss": 2.7235, + "step": 100435 + }, + { + "epoch": 6.82429677945373, + "grad_norm": 7.370694637298584, + "learning_rate": 1.4731111564071206e-06, + "loss": 2.5664, + "step": 100440 + }, + { + "epoch": 6.8246364995243916, + "grad_norm": 7.714045524597168, + "learning_rate": 1.4726865063187934e-06, + "loss": 2.8374, + "step": 100445 + }, + { + "epoch": 6.824976219595054, + "grad_norm": 7.099782466888428, + "learning_rate": 1.472261856230466e-06, + "loss": 2.7679, + "step": 100450 + }, + { + "epoch": 6.825315939665716, + "grad_norm": 8.794264793395996, + "learning_rate": 1.471837206142139e-06, + "loss": 2.9035, + "step": 100455 + }, + { + "epoch": 6.825655659736377, + "grad_norm": 6.6029438972473145, + "learning_rate": 1.4714125560538118e-06, + "loss": 2.8071, + "step": 100460 + }, + { + "epoch": 6.825995379807039, + "grad_norm": 6.636258125305176, + "learning_rate": 1.4709879059654846e-06, + "loss": 2.9576, + "step": 100465 + }, + { + "epoch": 6.826335099877701, + "grad_norm": 9.969595909118652, + "learning_rate": 1.4705632558771574e-06, + "loss": 2.7642, + "step": 100470 + }, + { + "epoch": 6.826674819948362, + "grad_norm": 7.987395763397217, + "learning_rate": 1.47013860578883e-06, + "loss": 2.9479, + "step": 100475 + }, + { + "epoch": 6.827014540019024, + "grad_norm": 7.7036452293396, + "learning_rate": 1.469713955700503e-06, + "loss": 2.8526, + "step": 100480 + }, + { + "epoch": 6.827354260089686, + "grad_norm": 7.77497673034668, + "learning_rate": 1.4692893056121756e-06, + "loss": 2.6053, + "step": 100485 + }, + { + "epoch": 6.827693980160348, + "grad_norm": 8.435556411743164, + "learning_rate": 1.4688646555238484e-06, + "loss": 2.3884, + "step": 100490 + }, + { + "epoch": 6.82803370023101, + "grad_norm": 6.389490127563477, + "learning_rate": 1.4684400054355214e-06, + "loss": 2.7082, + "step": 100495 + }, + { + "epoch": 6.828373420301672, + "grad_norm": 9.85214614868164, + "learning_rate": 1.468015355347194e-06, + "loss": 2.8412, + "step": 100500 + }, + { + "epoch": 6.828713140372333, + "grad_norm": 8.565444946289062, + "learning_rate": 1.4675907052588668e-06, + "loss": 3.0166, + "step": 100505 + }, + { + "epoch": 6.829052860442995, + "grad_norm": 10.912675857543945, + "learning_rate": 1.4671660551705396e-06, + "loss": 2.7973, + "step": 100510 + }, + { + "epoch": 6.829392580513657, + "grad_norm": 7.621373653411865, + "learning_rate": 1.4667414050822124e-06, + "loss": 2.9678, + "step": 100515 + }, + { + "epoch": 6.829732300584318, + "grad_norm": 7.486422061920166, + "learning_rate": 1.466316754993885e-06, + "loss": 2.6784, + "step": 100520 + }, + { + "epoch": 6.83007202065498, + "grad_norm": 7.2893853187561035, + "learning_rate": 1.465892104905558e-06, + "loss": 2.54, + "step": 100525 + }, + { + "epoch": 6.830411740725642, + "grad_norm": 7.929211139678955, + "learning_rate": 1.4654674548172306e-06, + "loss": 2.8798, + "step": 100530 + }, + { + "epoch": 6.830751460796304, + "grad_norm": 7.233201026916504, + "learning_rate": 1.4650428047289034e-06, + "loss": 2.8372, + "step": 100535 + }, + { + "epoch": 6.831091180866966, + "grad_norm": 8.469122886657715, + "learning_rate": 1.4646181546405764e-06, + "loss": 2.8206, + "step": 100540 + }, + { + "epoch": 6.831430900937628, + "grad_norm": 10.635320663452148, + "learning_rate": 1.464193504552249e-06, + "loss": 2.7236, + "step": 100545 + }, + { + "epoch": 6.831770621008289, + "grad_norm": 6.333259582519531, + "learning_rate": 1.463768854463922e-06, + "loss": 2.8743, + "step": 100550 + }, + { + "epoch": 6.832110341078951, + "grad_norm": 8.712508201599121, + "learning_rate": 1.4633442043755946e-06, + "loss": 2.75, + "step": 100555 + }, + { + "epoch": 6.832450061149613, + "grad_norm": 7.150306224822998, + "learning_rate": 1.4629195542872674e-06, + "loss": 2.6798, + "step": 100560 + }, + { + "epoch": 6.832789781220274, + "grad_norm": 7.983236312866211, + "learning_rate": 1.4624949041989402e-06, + "loss": 2.7316, + "step": 100565 + }, + { + "epoch": 6.833129501290936, + "grad_norm": 7.805286407470703, + "learning_rate": 1.462070254110613e-06, + "loss": 2.6925, + "step": 100570 + }, + { + "epoch": 6.833469221361598, + "grad_norm": 7.225557327270508, + "learning_rate": 1.4616456040222856e-06, + "loss": 2.8446, + "step": 100575 + }, + { + "epoch": 6.83380894143226, + "grad_norm": 7.931825637817383, + "learning_rate": 1.4612209539339586e-06, + "loss": 2.6042, + "step": 100580 + }, + { + "epoch": 6.834148661502922, + "grad_norm": 8.722328186035156, + "learning_rate": 1.4607963038456314e-06, + "loss": 2.8104, + "step": 100585 + }, + { + "epoch": 6.834488381573584, + "grad_norm": 6.5232648849487305, + "learning_rate": 1.460371653757304e-06, + "loss": 3.0192, + "step": 100590 + }, + { + "epoch": 6.834828101644245, + "grad_norm": 7.535799503326416, + "learning_rate": 1.459947003668977e-06, + "loss": 2.9622, + "step": 100595 + }, + { + "epoch": 6.835167821714907, + "grad_norm": 7.463340759277344, + "learning_rate": 1.4595223535806496e-06, + "loss": 2.7377, + "step": 100600 + }, + { + "epoch": 6.835507541785569, + "grad_norm": 7.756800651550293, + "learning_rate": 1.4590977034923224e-06, + "loss": 2.7829, + "step": 100605 + }, + { + "epoch": 6.83584726185623, + "grad_norm": 6.455596446990967, + "learning_rate": 1.4586730534039952e-06, + "loss": 2.65, + "step": 100610 + }, + { + "epoch": 6.836186981926892, + "grad_norm": 6.629147052764893, + "learning_rate": 1.458248403315668e-06, + "loss": 2.5153, + "step": 100615 + }, + { + "epoch": 6.836526701997554, + "grad_norm": 7.049134254455566, + "learning_rate": 1.4578237532273406e-06, + "loss": 2.6242, + "step": 100620 + }, + { + "epoch": 6.836866422068216, + "grad_norm": 7.4517822265625, + "learning_rate": 1.4573991031390136e-06, + "loss": 2.5057, + "step": 100625 + }, + { + "epoch": 6.837206142138878, + "grad_norm": 7.836109638214111, + "learning_rate": 1.4569744530506862e-06, + "loss": 2.6677, + "step": 100630 + }, + { + "epoch": 6.83754586220954, + "grad_norm": 6.828293800354004, + "learning_rate": 1.4565498029623592e-06, + "loss": 2.6667, + "step": 100635 + }, + { + "epoch": 6.837885582280201, + "grad_norm": 7.935734748840332, + "learning_rate": 1.456125152874032e-06, + "loss": 2.8191, + "step": 100640 + }, + { + "epoch": 6.838225302350863, + "grad_norm": 9.593779563903809, + "learning_rate": 1.4557005027857046e-06, + "loss": 2.4884, + "step": 100645 + }, + { + "epoch": 6.838565022421525, + "grad_norm": 7.687258720397949, + "learning_rate": 1.4552758526973776e-06, + "loss": 2.687, + "step": 100650 + }, + { + "epoch": 6.838904742492186, + "grad_norm": 8.664887428283691, + "learning_rate": 1.4548512026090502e-06, + "loss": 2.6363, + "step": 100655 + }, + { + "epoch": 6.839244462562848, + "grad_norm": 6.8089423179626465, + "learning_rate": 1.454426552520723e-06, + "loss": 2.8744, + "step": 100660 + }, + { + "epoch": 6.83958418263351, + "grad_norm": 5.992090225219727, + "learning_rate": 1.454001902432396e-06, + "loss": 2.8612, + "step": 100665 + }, + { + "epoch": 6.839923902704172, + "grad_norm": 8.59688949584961, + "learning_rate": 1.4535772523440686e-06, + "loss": 2.8175, + "step": 100670 + }, + { + "epoch": 6.840263622774834, + "grad_norm": 8.233197212219238, + "learning_rate": 1.4531526022557412e-06, + "loss": 2.7222, + "step": 100675 + }, + { + "epoch": 6.840603342845496, + "grad_norm": 6.508237838745117, + "learning_rate": 1.4527279521674142e-06, + "loss": 2.6358, + "step": 100680 + }, + { + "epoch": 6.840943062916157, + "grad_norm": 9.822943687438965, + "learning_rate": 1.452303302079087e-06, + "loss": 2.8798, + "step": 100685 + }, + { + "epoch": 6.841282782986819, + "grad_norm": 7.85537576675415, + "learning_rate": 1.4518786519907596e-06, + "loss": 2.9138, + "step": 100690 + }, + { + "epoch": 6.841622503057481, + "grad_norm": 9.790574073791504, + "learning_rate": 1.4514540019024326e-06, + "loss": 2.8006, + "step": 100695 + }, + { + "epoch": 6.841962223128142, + "grad_norm": 8.158690452575684, + "learning_rate": 1.4510293518141052e-06, + "loss": 2.8394, + "step": 100700 + }, + { + "epoch": 6.842301943198804, + "grad_norm": 8.239510536193848, + "learning_rate": 1.450604701725778e-06, + "loss": 2.8624, + "step": 100705 + }, + { + "epoch": 6.842641663269466, + "grad_norm": 7.877737998962402, + "learning_rate": 1.450180051637451e-06, + "loss": 2.5361, + "step": 100710 + }, + { + "epoch": 6.842981383340128, + "grad_norm": 7.163262367248535, + "learning_rate": 1.4497554015491236e-06, + "loss": 2.6386, + "step": 100715 + }, + { + "epoch": 6.84332110341079, + "grad_norm": 7.923988342285156, + "learning_rate": 1.4493307514607966e-06, + "loss": 2.7651, + "step": 100720 + }, + { + "epoch": 6.843660823481452, + "grad_norm": 8.521552085876465, + "learning_rate": 1.4489061013724692e-06, + "loss": 2.622, + "step": 100725 + }, + { + "epoch": 6.844000543552113, + "grad_norm": 8.60042953491211, + "learning_rate": 1.448481451284142e-06, + "loss": 2.8331, + "step": 100730 + }, + { + "epoch": 6.844340263622775, + "grad_norm": 9.40158462524414, + "learning_rate": 1.4480568011958148e-06, + "loss": 2.4963, + "step": 100735 + }, + { + "epoch": 6.844679983693436, + "grad_norm": 7.572789192199707, + "learning_rate": 1.4476321511074876e-06, + "loss": 2.6314, + "step": 100740 + }, + { + "epoch": 6.845019703764098, + "grad_norm": 9.091361045837402, + "learning_rate": 1.4472075010191602e-06, + "loss": 2.6141, + "step": 100745 + }, + { + "epoch": 6.84535942383476, + "grad_norm": 8.49807357788086, + "learning_rate": 1.4467828509308332e-06, + "loss": 2.8975, + "step": 100750 + }, + { + "epoch": 6.845699143905422, + "grad_norm": 8.746768951416016, + "learning_rate": 1.4463582008425058e-06, + "loss": 2.8385, + "step": 100755 + }, + { + "epoch": 6.846038863976084, + "grad_norm": 6.725901126861572, + "learning_rate": 1.4459335507541786e-06, + "loss": 2.8481, + "step": 100760 + }, + { + "epoch": 6.846378584046746, + "grad_norm": 7.5192036628723145, + "learning_rate": 1.4455089006658516e-06, + "loss": 2.8608, + "step": 100765 + }, + { + "epoch": 6.846718304117407, + "grad_norm": 6.716104030609131, + "learning_rate": 1.4450842505775242e-06, + "loss": 2.8142, + "step": 100770 + }, + { + "epoch": 6.847058024188069, + "grad_norm": 9.335583686828613, + "learning_rate": 1.444659600489197e-06, + "loss": 2.4974, + "step": 100775 + }, + { + "epoch": 6.847397744258731, + "grad_norm": 8.399110794067383, + "learning_rate": 1.4442349504008698e-06, + "loss": 2.6976, + "step": 100780 + }, + { + "epoch": 6.847737464329392, + "grad_norm": 7.498351573944092, + "learning_rate": 1.4438103003125426e-06, + "loss": 2.8604, + "step": 100785 + }, + { + "epoch": 6.848077184400054, + "grad_norm": 7.86061429977417, + "learning_rate": 1.4433856502242152e-06, + "loss": 2.7059, + "step": 100790 + }, + { + "epoch": 6.848416904470716, + "grad_norm": 7.240978240966797, + "learning_rate": 1.4429610001358882e-06, + "loss": 2.6547, + "step": 100795 + }, + { + "epoch": 6.848756624541378, + "grad_norm": 7.986537456512451, + "learning_rate": 1.4425363500475608e-06, + "loss": 2.5432, + "step": 100800 + }, + { + "epoch": 6.84909634461204, + "grad_norm": 9.246909141540527, + "learning_rate": 1.4421116999592338e-06, + "loss": 2.7289, + "step": 100805 + }, + { + "epoch": 6.849436064682702, + "grad_norm": 8.934577941894531, + "learning_rate": 1.4416870498709066e-06, + "loss": 2.8301, + "step": 100810 + }, + { + "epoch": 6.849775784753363, + "grad_norm": 7.462509632110596, + "learning_rate": 1.4412623997825792e-06, + "loss": 3.071, + "step": 100815 + }, + { + "epoch": 6.850115504824025, + "grad_norm": 9.811108589172363, + "learning_rate": 1.4408377496942522e-06, + "loss": 2.7267, + "step": 100820 + }, + { + "epoch": 6.850455224894687, + "grad_norm": 9.051263809204102, + "learning_rate": 1.4404130996059248e-06, + "loss": 2.6836, + "step": 100825 + }, + { + "epoch": 6.850794944965348, + "grad_norm": 7.297354221343994, + "learning_rate": 1.4399884495175976e-06, + "loss": 2.5534, + "step": 100830 + }, + { + "epoch": 6.85113466503601, + "grad_norm": 7.618339538574219, + "learning_rate": 1.4395637994292704e-06, + "loss": 2.733, + "step": 100835 + }, + { + "epoch": 6.851474385106672, + "grad_norm": 7.691980361938477, + "learning_rate": 1.4391391493409432e-06, + "loss": 2.8272, + "step": 100840 + }, + { + "epoch": 6.851814105177334, + "grad_norm": 7.427978515625, + "learning_rate": 1.4387144992526158e-06, + "loss": 2.7823, + "step": 100845 + }, + { + "epoch": 6.852153825247996, + "grad_norm": 6.531676292419434, + "learning_rate": 1.4382898491642888e-06, + "loss": 2.6516, + "step": 100850 + }, + { + "epoch": 6.852493545318658, + "grad_norm": 7.754213809967041, + "learning_rate": 1.4378651990759616e-06, + "loss": 2.8744, + "step": 100855 + }, + { + "epoch": 6.852833265389319, + "grad_norm": 7.31394100189209, + "learning_rate": 1.4374405489876342e-06, + "loss": 2.9092, + "step": 100860 + }, + { + "epoch": 6.853172985459981, + "grad_norm": 7.108164310455322, + "learning_rate": 1.4370158988993072e-06, + "loss": 2.6541, + "step": 100865 + }, + { + "epoch": 6.853512705530643, + "grad_norm": 7.5579705238342285, + "learning_rate": 1.4365912488109798e-06, + "loss": 2.7378, + "step": 100870 + }, + { + "epoch": 6.853852425601304, + "grad_norm": 9.337305068969727, + "learning_rate": 1.4361665987226526e-06, + "loss": 3.0839, + "step": 100875 + }, + { + "epoch": 6.854192145671966, + "grad_norm": 7.348062515258789, + "learning_rate": 1.4357419486343254e-06, + "loss": 2.7502, + "step": 100880 + }, + { + "epoch": 6.854531865742628, + "grad_norm": 6.8273234367370605, + "learning_rate": 1.4353172985459982e-06, + "loss": 2.8115, + "step": 100885 + }, + { + "epoch": 6.85487158581329, + "grad_norm": 6.890200614929199, + "learning_rate": 1.4348926484576712e-06, + "loss": 2.6306, + "step": 100890 + }, + { + "epoch": 6.855211305883952, + "grad_norm": 9.122087478637695, + "learning_rate": 1.4344679983693438e-06, + "loss": 2.7062, + "step": 100895 + }, + { + "epoch": 6.855551025954614, + "grad_norm": 8.018495559692383, + "learning_rate": 1.4340433482810166e-06, + "loss": 2.9572, + "step": 100900 + }, + { + "epoch": 6.855890746025275, + "grad_norm": 8.497836112976074, + "learning_rate": 1.4336186981926894e-06, + "loss": 2.5415, + "step": 100905 + }, + { + "epoch": 6.856230466095937, + "grad_norm": 9.827608108520508, + "learning_rate": 1.4331940481043622e-06, + "loss": 2.7104, + "step": 100910 + }, + { + "epoch": 6.856570186166599, + "grad_norm": 6.6163153648376465, + "learning_rate": 1.4327693980160348e-06, + "loss": 2.6324, + "step": 100915 + }, + { + "epoch": 6.85690990623726, + "grad_norm": 10.087737083435059, + "learning_rate": 1.4323447479277078e-06, + "loss": 2.8238, + "step": 100920 + }, + { + "epoch": 6.857249626307922, + "grad_norm": 6.6088995933532715, + "learning_rate": 1.4319200978393804e-06, + "loss": 2.9008, + "step": 100925 + }, + { + "epoch": 6.857589346378584, + "grad_norm": 6.402979373931885, + "learning_rate": 1.4314954477510532e-06, + "loss": 2.6674, + "step": 100930 + }, + { + "epoch": 6.857929066449246, + "grad_norm": 9.076142311096191, + "learning_rate": 1.4310707976627262e-06, + "loss": 2.8059, + "step": 100935 + }, + { + "epoch": 6.858268786519908, + "grad_norm": 7.793289661407471, + "learning_rate": 1.4306461475743988e-06, + "loss": 2.5937, + "step": 100940 + }, + { + "epoch": 6.85860850659057, + "grad_norm": 7.23136043548584, + "learning_rate": 1.4302214974860714e-06, + "loss": 2.7176, + "step": 100945 + }, + { + "epoch": 6.858948226661231, + "grad_norm": 7.058582305908203, + "learning_rate": 1.4297968473977444e-06, + "loss": 2.6623, + "step": 100950 + }, + { + "epoch": 6.859287946731893, + "grad_norm": 6.773387432098389, + "learning_rate": 1.4293721973094172e-06, + "loss": 2.5706, + "step": 100955 + }, + { + "epoch": 6.859627666802554, + "grad_norm": 6.532224178314209, + "learning_rate": 1.4289475472210898e-06, + "loss": 2.524, + "step": 100960 + }, + { + "epoch": 6.859967386873216, + "grad_norm": 8.341713905334473, + "learning_rate": 1.4285228971327628e-06, + "loss": 2.8506, + "step": 100965 + }, + { + "epoch": 6.860307106943878, + "grad_norm": 8.667464256286621, + "learning_rate": 1.4280982470444354e-06, + "loss": 2.6318, + "step": 100970 + }, + { + "epoch": 6.8606468270145395, + "grad_norm": 6.794430255889893, + "learning_rate": 1.4276735969561084e-06, + "loss": 2.7969, + "step": 100975 + }, + { + "epoch": 6.860986547085202, + "grad_norm": 7.4729084968566895, + "learning_rate": 1.4272489468677812e-06, + "loss": 2.8281, + "step": 100980 + }, + { + "epoch": 6.861326267155864, + "grad_norm": 10.54859733581543, + "learning_rate": 1.4268242967794538e-06, + "loss": 2.6325, + "step": 100985 + }, + { + "epoch": 6.861665987226525, + "grad_norm": 11.146018028259277, + "learning_rate": 1.4263996466911268e-06, + "loss": 2.7656, + "step": 100990 + }, + { + "epoch": 6.862005707297187, + "grad_norm": 7.873415470123291, + "learning_rate": 1.4259749966027994e-06, + "loss": 3.0787, + "step": 100995 + }, + { + "epoch": 6.862345427367849, + "grad_norm": 8.427788734436035, + "learning_rate": 1.4255503465144722e-06, + "loss": 2.8403, + "step": 101000 + }, + { + "epoch": 6.86268514743851, + "grad_norm": 7.694795608520508, + "learning_rate": 1.425125696426145e-06, + "loss": 2.7459, + "step": 101005 + }, + { + "epoch": 6.863024867509172, + "grad_norm": 6.190284252166748, + "learning_rate": 1.4247010463378178e-06, + "loss": 2.6697, + "step": 101010 + }, + { + "epoch": 6.863364587579834, + "grad_norm": 7.311808109283447, + "learning_rate": 1.4242763962494904e-06, + "loss": 2.6265, + "step": 101015 + }, + { + "epoch": 6.8637043076504956, + "grad_norm": 6.693887710571289, + "learning_rate": 1.4238517461611634e-06, + "loss": 3.0346, + "step": 101020 + }, + { + "epoch": 6.864044027721158, + "grad_norm": 12.247655868530273, + "learning_rate": 1.423427096072836e-06, + "loss": 2.8098, + "step": 101025 + }, + { + "epoch": 6.86438374779182, + "grad_norm": 6.48545503616333, + "learning_rate": 1.4230024459845088e-06, + "loss": 2.7521, + "step": 101030 + }, + { + "epoch": 6.864723467862481, + "grad_norm": 7.169023036956787, + "learning_rate": 1.4225777958961818e-06, + "loss": 2.969, + "step": 101035 + }, + { + "epoch": 6.865063187933143, + "grad_norm": 11.145105361938477, + "learning_rate": 1.4221531458078544e-06, + "loss": 2.719, + "step": 101040 + }, + { + "epoch": 6.865402908003805, + "grad_norm": 8.063270568847656, + "learning_rate": 1.4218134257371927e-06, + "loss": 2.8036, + "step": 101045 + }, + { + "epoch": 6.865742628074466, + "grad_norm": 9.316757202148438, + "learning_rate": 1.4213887756488652e-06, + "loss": 2.9297, + "step": 101050 + }, + { + "epoch": 6.866082348145128, + "grad_norm": 10.737598419189453, + "learning_rate": 1.4209641255605383e-06, + "loss": 2.6778, + "step": 101055 + }, + { + "epoch": 6.86642206821579, + "grad_norm": 6.9581193923950195, + "learning_rate": 1.420539475472211e-06, + "loss": 2.7382, + "step": 101060 + }, + { + "epoch": 6.866761788286452, + "grad_norm": 8.11921501159668, + "learning_rate": 1.4201148253838837e-06, + "loss": 2.6512, + "step": 101065 + }, + { + "epoch": 6.867101508357114, + "grad_norm": 7.1777262687683105, + "learning_rate": 1.4196901752955567e-06, + "loss": 2.7042, + "step": 101070 + }, + { + "epoch": 6.867441228427776, + "grad_norm": 9.137845993041992, + "learning_rate": 1.4192655252072293e-06, + "loss": 2.7225, + "step": 101075 + }, + { + "epoch": 6.867780948498437, + "grad_norm": 7.319669246673584, + "learning_rate": 1.418840875118902e-06, + "loss": 2.7949, + "step": 101080 + }, + { + "epoch": 6.868120668569099, + "grad_norm": 8.599930763244629, + "learning_rate": 1.418416225030575e-06, + "loss": 2.5341, + "step": 101085 + }, + { + "epoch": 6.868460388639761, + "grad_norm": 8.087148666381836, + "learning_rate": 1.4179915749422477e-06, + "loss": 2.7462, + "step": 101090 + }, + { + "epoch": 6.868800108710422, + "grad_norm": 10.194280624389648, + "learning_rate": 1.4175669248539207e-06, + "loss": 2.609, + "step": 101095 + }, + { + "epoch": 6.869139828781084, + "grad_norm": 6.621752738952637, + "learning_rate": 1.4171422747655933e-06, + "loss": 2.7638, + "step": 101100 + }, + { + "epoch": 6.869479548851746, + "grad_norm": 8.55059814453125, + "learning_rate": 1.416717624677266e-06, + "loss": 2.5861, + "step": 101105 + }, + { + "epoch": 6.869819268922408, + "grad_norm": 7.846027851104736, + "learning_rate": 1.4162929745889389e-06, + "loss": 3.0656, + "step": 101110 + }, + { + "epoch": 6.87015898899307, + "grad_norm": 6.73380708694458, + "learning_rate": 1.4158683245006117e-06, + "loss": 2.5856, + "step": 101115 + }, + { + "epoch": 6.870498709063732, + "grad_norm": 8.865402221679688, + "learning_rate": 1.4154436744122842e-06, + "loss": 2.7526, + "step": 101120 + }, + { + "epoch": 6.870838429134393, + "grad_norm": 8.542128562927246, + "learning_rate": 1.4150190243239573e-06, + "loss": 2.7948, + "step": 101125 + }, + { + "epoch": 6.871178149205055, + "grad_norm": 9.097875595092773, + "learning_rate": 1.4145943742356298e-06, + "loss": 2.6978, + "step": 101130 + }, + { + "epoch": 6.871517869275717, + "grad_norm": 7.355920314788818, + "learning_rate": 1.4141697241473027e-06, + "loss": 2.7209, + "step": 101135 + }, + { + "epoch": 6.871857589346378, + "grad_norm": 10.210661888122559, + "learning_rate": 1.4137450740589757e-06, + "loss": 2.8472, + "step": 101140 + }, + { + "epoch": 6.87219730941704, + "grad_norm": 6.903034687042236, + "learning_rate": 1.4133204239706483e-06, + "loss": 2.8144, + "step": 101145 + }, + { + "epoch": 6.872537029487702, + "grad_norm": 8.866291999816895, + "learning_rate": 1.412895773882321e-06, + "loss": 3.0813, + "step": 101150 + }, + { + "epoch": 6.872876749558364, + "grad_norm": 6.2979960441589355, + "learning_rate": 1.4124711237939939e-06, + "loss": 2.8055, + "step": 101155 + }, + { + "epoch": 6.873216469629026, + "grad_norm": 7.107325553894043, + "learning_rate": 1.4120464737056667e-06, + "loss": 2.6085, + "step": 101160 + }, + { + "epoch": 6.873556189699688, + "grad_norm": 8.954635620117188, + "learning_rate": 1.4116218236173392e-06, + "loss": 2.8201, + "step": 101165 + }, + { + "epoch": 6.873895909770349, + "grad_norm": 6.723842144012451, + "learning_rate": 1.4111971735290123e-06, + "loss": 2.8649, + "step": 101170 + }, + { + "epoch": 6.874235629841011, + "grad_norm": 8.152511596679688, + "learning_rate": 1.4107725234406848e-06, + "loss": 2.8619, + "step": 101175 + }, + { + "epoch": 6.874575349911673, + "grad_norm": 8.080720901489258, + "learning_rate": 1.4103478733523579e-06, + "loss": 2.9056, + "step": 101180 + }, + { + "epoch": 6.874915069982334, + "grad_norm": 8.628165245056152, + "learning_rate": 1.4099232232640307e-06, + "loss": 2.8489, + "step": 101185 + }, + { + "epoch": 6.875254790052996, + "grad_norm": 8.438560485839844, + "learning_rate": 1.4094985731757032e-06, + "loss": 2.4291, + "step": 101190 + }, + { + "epoch": 6.875594510123658, + "grad_norm": 7.9356513023376465, + "learning_rate": 1.4090739230873763e-06, + "loss": 2.7248, + "step": 101195 + }, + { + "epoch": 6.87593423019432, + "grad_norm": 6.858735084533691, + "learning_rate": 1.4086492729990488e-06, + "loss": 2.6012, + "step": 101200 + }, + { + "epoch": 6.876273950264982, + "grad_norm": 8.429696083068848, + "learning_rate": 1.4082246229107216e-06, + "loss": 2.7089, + "step": 101205 + }, + { + "epoch": 6.876613670335644, + "grad_norm": 7.090850353240967, + "learning_rate": 1.4077999728223945e-06, + "loss": 2.6198, + "step": 101210 + }, + { + "epoch": 6.876953390406305, + "grad_norm": 11.069016456604004, + "learning_rate": 1.4073753227340673e-06, + "loss": 3.0874, + "step": 101215 + }, + { + "epoch": 6.877293110476967, + "grad_norm": 8.06354808807373, + "learning_rate": 1.4069506726457398e-06, + "loss": 2.6422, + "step": 101220 + }, + { + "epoch": 6.877632830547629, + "grad_norm": 8.906600952148438, + "learning_rate": 1.4065260225574129e-06, + "loss": 2.9523, + "step": 101225 + }, + { + "epoch": 6.87797255061829, + "grad_norm": 7.842840194702148, + "learning_rate": 1.4061013724690857e-06, + "loss": 2.8943, + "step": 101230 + }, + { + "epoch": 6.878312270688952, + "grad_norm": 7.380180835723877, + "learning_rate": 1.4056767223807582e-06, + "loss": 2.8223, + "step": 101235 + }, + { + "epoch": 6.878651990759614, + "grad_norm": 8.705032348632812, + "learning_rate": 1.4052520722924313e-06, + "loss": 2.6893, + "step": 101240 + }, + { + "epoch": 6.878991710830276, + "grad_norm": 9.135735511779785, + "learning_rate": 1.4048274222041038e-06, + "loss": 2.7422, + "step": 101245 + }, + { + "epoch": 6.879331430900938, + "grad_norm": 8.926889419555664, + "learning_rate": 1.4044027721157766e-06, + "loss": 2.6963, + "step": 101250 + }, + { + "epoch": 6.8796711509716, + "grad_norm": 6.452734470367432, + "learning_rate": 1.4039781220274494e-06, + "loss": 2.8039, + "step": 101255 + }, + { + "epoch": 6.880010871042261, + "grad_norm": 5.9156718254089355, + "learning_rate": 1.4035534719391222e-06, + "loss": 2.7278, + "step": 101260 + }, + { + "epoch": 6.880350591112923, + "grad_norm": 10.512868881225586, + "learning_rate": 1.4031288218507953e-06, + "loss": 2.7707, + "step": 101265 + }, + { + "epoch": 6.880690311183585, + "grad_norm": 6.286500453948975, + "learning_rate": 1.4027041717624678e-06, + "loss": 2.5487, + "step": 101270 + }, + { + "epoch": 6.881030031254246, + "grad_norm": 7.917832374572754, + "learning_rate": 1.4022795216741406e-06, + "loss": 3.0507, + "step": 101275 + }, + { + "epoch": 6.881369751324908, + "grad_norm": 9.784248352050781, + "learning_rate": 1.4018548715858134e-06, + "loss": 2.7476, + "step": 101280 + }, + { + "epoch": 6.88170947139557, + "grad_norm": 8.156915664672852, + "learning_rate": 1.4014302214974863e-06, + "loss": 2.6441, + "step": 101285 + }, + { + "epoch": 6.882049191466232, + "grad_norm": 6.481977462768555, + "learning_rate": 1.4010055714091588e-06, + "loss": 2.8529, + "step": 101290 + }, + { + "epoch": 6.882388911536894, + "grad_norm": 8.258233070373535, + "learning_rate": 1.4005809213208319e-06, + "loss": 2.6663, + "step": 101295 + }, + { + "epoch": 6.882728631607556, + "grad_norm": 7.749050140380859, + "learning_rate": 1.4001562712325044e-06, + "loss": 2.8409, + "step": 101300 + }, + { + "epoch": 6.883068351678217, + "grad_norm": 7.975905418395996, + "learning_rate": 1.3997316211441772e-06, + "loss": 2.6082, + "step": 101305 + }, + { + "epoch": 6.883408071748879, + "grad_norm": 6.792169094085693, + "learning_rate": 1.3993069710558503e-06, + "loss": 2.6561, + "step": 101310 + }, + { + "epoch": 6.883747791819541, + "grad_norm": 9.820690155029297, + "learning_rate": 1.3988823209675228e-06, + "loss": 2.8094, + "step": 101315 + }, + { + "epoch": 6.884087511890202, + "grad_norm": 8.485185623168945, + "learning_rate": 1.3984576708791956e-06, + "loss": 2.6727, + "step": 101320 + }, + { + "epoch": 6.884427231960864, + "grad_norm": 8.921945571899414, + "learning_rate": 1.3980330207908684e-06, + "loss": 2.8901, + "step": 101325 + }, + { + "epoch": 6.884766952031526, + "grad_norm": 6.680518627166748, + "learning_rate": 1.3976083707025412e-06, + "loss": 2.5783, + "step": 101330 + }, + { + "epoch": 6.885106672102188, + "grad_norm": 6.989187717437744, + "learning_rate": 1.3971837206142138e-06, + "loss": 2.8753, + "step": 101335 + }, + { + "epoch": 6.88544639217285, + "grad_norm": 6.425272464752197, + "learning_rate": 1.3967590705258868e-06, + "loss": 2.6067, + "step": 101340 + }, + { + "epoch": 6.885786112243512, + "grad_norm": 10.069926261901855, + "learning_rate": 1.3963344204375594e-06, + "loss": 2.9774, + "step": 101345 + }, + { + "epoch": 6.886125832314173, + "grad_norm": 8.484918594360352, + "learning_rate": 1.3959097703492324e-06, + "loss": 2.925, + "step": 101350 + }, + { + "epoch": 6.886465552384835, + "grad_norm": 7.224843978881836, + "learning_rate": 1.3954851202609052e-06, + "loss": 2.7026, + "step": 101355 + }, + { + "epoch": 6.886805272455497, + "grad_norm": 7.734981060028076, + "learning_rate": 1.3950604701725778e-06, + "loss": 2.6588, + "step": 101360 + }, + { + "epoch": 6.887144992526158, + "grad_norm": 6.602078914642334, + "learning_rate": 1.3946358200842509e-06, + "loss": 2.8144, + "step": 101365 + }, + { + "epoch": 6.88748471259682, + "grad_norm": 8.57118034362793, + "learning_rate": 1.3942111699959234e-06, + "loss": 2.6626, + "step": 101370 + }, + { + "epoch": 6.8878244326674825, + "grad_norm": 6.855382919311523, + "learning_rate": 1.3937865199075962e-06, + "loss": 2.5705, + "step": 101375 + }, + { + "epoch": 6.888164152738144, + "grad_norm": 6.593111038208008, + "learning_rate": 1.393361869819269e-06, + "loss": 2.8234, + "step": 101380 + }, + { + "epoch": 6.888503872808806, + "grad_norm": 8.074965476989746, + "learning_rate": 1.3929372197309418e-06, + "loss": 2.7373, + "step": 101385 + }, + { + "epoch": 6.888843592879468, + "grad_norm": 8.521238327026367, + "learning_rate": 1.3925125696426144e-06, + "loss": 2.8088, + "step": 101390 + }, + { + "epoch": 6.889183312950129, + "grad_norm": 9.915562629699707, + "learning_rate": 1.3920879195542874e-06, + "loss": 2.9612, + "step": 101395 + }, + { + "epoch": 6.889523033020791, + "grad_norm": 6.37860107421875, + "learning_rate": 1.3916632694659602e-06, + "loss": 2.4807, + "step": 101400 + }, + { + "epoch": 6.889862753091453, + "grad_norm": 8.134831428527832, + "learning_rate": 1.3912386193776328e-06, + "loss": 2.7463, + "step": 101405 + }, + { + "epoch": 6.890202473162114, + "grad_norm": 9.524706840515137, + "learning_rate": 1.3908139692893058e-06, + "loss": 2.7645, + "step": 101410 + }, + { + "epoch": 6.890542193232776, + "grad_norm": 7.787766456604004, + "learning_rate": 1.3903893192009784e-06, + "loss": 2.7017, + "step": 101415 + }, + { + "epoch": 6.8908819133034385, + "grad_norm": 9.712987899780273, + "learning_rate": 1.3899646691126512e-06, + "loss": 2.5689, + "step": 101420 + }, + { + "epoch": 6.8912216333741, + "grad_norm": 9.700383186340332, + "learning_rate": 1.389540019024324e-06, + "loss": 2.6722, + "step": 101425 + }, + { + "epoch": 6.891561353444762, + "grad_norm": 8.034222602844238, + "learning_rate": 1.3891153689359968e-06, + "loss": 2.849, + "step": 101430 + }, + { + "epoch": 6.891901073515423, + "grad_norm": 7.440584659576416, + "learning_rate": 1.3886907188476698e-06, + "loss": 2.6191, + "step": 101435 + }, + { + "epoch": 6.892240793586085, + "grad_norm": 9.797490119934082, + "learning_rate": 1.3882660687593424e-06, + "loss": 3.0938, + "step": 101440 + }, + { + "epoch": 6.892580513656747, + "grad_norm": 5.843235492706299, + "learning_rate": 1.387841418671015e-06, + "loss": 2.7889, + "step": 101445 + }, + { + "epoch": 6.892920233727408, + "grad_norm": 8.254926681518555, + "learning_rate": 1.387416768582688e-06, + "loss": 2.6992, + "step": 101450 + }, + { + "epoch": 6.89325995379807, + "grad_norm": 7.25180196762085, + "learning_rate": 1.3869921184943608e-06, + "loss": 2.6684, + "step": 101455 + }, + { + "epoch": 6.893599673868732, + "grad_norm": 8.90988540649414, + "learning_rate": 1.3865674684060334e-06, + "loss": 2.7725, + "step": 101460 + }, + { + "epoch": 6.893939393939394, + "grad_norm": 9.042435646057129, + "learning_rate": 1.3861428183177064e-06, + "loss": 2.5832, + "step": 101465 + }, + { + "epoch": 6.894279114010056, + "grad_norm": 7.529412269592285, + "learning_rate": 1.385718168229379e-06, + "loss": 2.8734, + "step": 101470 + }, + { + "epoch": 6.894618834080718, + "grad_norm": 9.757654190063477, + "learning_rate": 1.3852935181410518e-06, + "loss": 2.845, + "step": 101475 + }, + { + "epoch": 6.894958554151379, + "grad_norm": 6.211622714996338, + "learning_rate": 1.3848688680527248e-06, + "loss": 2.4383, + "step": 101480 + }, + { + "epoch": 6.895298274222041, + "grad_norm": 8.215272903442383, + "learning_rate": 1.3844442179643974e-06, + "loss": 2.6264, + "step": 101485 + }, + { + "epoch": 6.895637994292703, + "grad_norm": 7.033082008361816, + "learning_rate": 1.38401956787607e-06, + "loss": 2.4929, + "step": 101490 + }, + { + "epoch": 6.895977714363364, + "grad_norm": 10.994002342224121, + "learning_rate": 1.383594917787743e-06, + "loss": 2.7407, + "step": 101495 + }, + { + "epoch": 6.896317434434026, + "grad_norm": 7.025272369384766, + "learning_rate": 1.3831702676994158e-06, + "loss": 2.6755, + "step": 101500 + }, + { + "epoch": 6.896657154504688, + "grad_norm": 8.35177230834961, + "learning_rate": 1.3827456176110884e-06, + "loss": 2.7917, + "step": 101505 + }, + { + "epoch": 6.89699687457535, + "grad_norm": 8.99940299987793, + "learning_rate": 1.3823209675227614e-06, + "loss": 2.7073, + "step": 101510 + }, + { + "epoch": 6.897336594646012, + "grad_norm": 8.596282005310059, + "learning_rate": 1.381896317434434e-06, + "loss": 2.8062, + "step": 101515 + }, + { + "epoch": 6.897676314716674, + "grad_norm": 8.092142105102539, + "learning_rate": 1.381471667346107e-06, + "loss": 2.399, + "step": 101520 + }, + { + "epoch": 6.898016034787335, + "grad_norm": 8.258686065673828, + "learning_rate": 1.3810470172577796e-06, + "loss": 2.8811, + "step": 101525 + }, + { + "epoch": 6.898355754857997, + "grad_norm": 6.246628761291504, + "learning_rate": 1.3806223671694524e-06, + "loss": 2.6317, + "step": 101530 + }, + { + "epoch": 6.898695474928659, + "grad_norm": 7.31314754486084, + "learning_rate": 1.3801977170811254e-06, + "loss": 2.6394, + "step": 101535 + }, + { + "epoch": 6.89903519499932, + "grad_norm": 5.9486870765686035, + "learning_rate": 1.379773066992798e-06, + "loss": 2.6904, + "step": 101540 + }, + { + "epoch": 6.899374915069982, + "grad_norm": 6.614745140075684, + "learning_rate": 1.3793484169044708e-06, + "loss": 2.584, + "step": 101545 + }, + { + "epoch": 6.899714635140644, + "grad_norm": 7.526974678039551, + "learning_rate": 1.3789237668161436e-06, + "loss": 3.0061, + "step": 101550 + }, + { + "epoch": 6.900054355211306, + "grad_norm": 7.986260414123535, + "learning_rate": 1.3784991167278164e-06, + "loss": 2.707, + "step": 101555 + }, + { + "epoch": 6.900394075281968, + "grad_norm": 7.285943508148193, + "learning_rate": 1.378074466639489e-06, + "loss": 2.8401, + "step": 101560 + }, + { + "epoch": 6.90073379535263, + "grad_norm": 6.105693817138672, + "learning_rate": 1.377649816551162e-06, + "loss": 2.7043, + "step": 101565 + }, + { + "epoch": 6.901073515423291, + "grad_norm": 9.298837661743164, + "learning_rate": 1.3772251664628346e-06, + "loss": 2.7158, + "step": 101570 + }, + { + "epoch": 6.901413235493953, + "grad_norm": 6.890979766845703, + "learning_rate": 1.3768005163745074e-06, + "loss": 2.7313, + "step": 101575 + }, + { + "epoch": 6.901752955564615, + "grad_norm": 9.09143352508545, + "learning_rate": 1.3763758662861804e-06, + "loss": 2.543, + "step": 101580 + }, + { + "epoch": 6.902092675635276, + "grad_norm": 8.32126522064209, + "learning_rate": 1.375951216197853e-06, + "loss": 2.7744, + "step": 101585 + }, + { + "epoch": 6.902432395705938, + "grad_norm": 7.769838333129883, + "learning_rate": 1.3755265661095258e-06, + "loss": 2.5866, + "step": 101590 + }, + { + "epoch": 6.9027721157766, + "grad_norm": 7.111766815185547, + "learning_rate": 1.3751019160211986e-06, + "loss": 2.6117, + "step": 101595 + }, + { + "epoch": 6.903111835847262, + "grad_norm": 6.734354019165039, + "learning_rate": 1.3746772659328714e-06, + "loss": 2.4739, + "step": 101600 + }, + { + "epoch": 6.903451555917924, + "grad_norm": 6.024893283843994, + "learning_rate": 1.3742526158445444e-06, + "loss": 2.9131, + "step": 101605 + }, + { + "epoch": 6.903791275988586, + "grad_norm": 8.612615585327148, + "learning_rate": 1.373827965756217e-06, + "loss": 2.5354, + "step": 101610 + }, + { + "epoch": 6.904130996059247, + "grad_norm": 7.275318145751953, + "learning_rate": 1.3734033156678896e-06, + "loss": 2.7774, + "step": 101615 + }, + { + "epoch": 6.904470716129909, + "grad_norm": 7.786962985992432, + "learning_rate": 1.3729786655795626e-06, + "loss": 3.0172, + "step": 101620 + }, + { + "epoch": 6.904810436200571, + "grad_norm": 7.493892669677734, + "learning_rate": 1.3725540154912354e-06, + "loss": 2.8904, + "step": 101625 + }, + { + "epoch": 6.905150156271232, + "grad_norm": 9.4501314163208, + "learning_rate": 1.372129365402908e-06, + "loss": 2.79, + "step": 101630 + }, + { + "epoch": 6.905489876341894, + "grad_norm": 8.215094566345215, + "learning_rate": 1.371704715314581e-06, + "loss": 2.8408, + "step": 101635 + }, + { + "epoch": 6.905829596412556, + "grad_norm": 8.274662017822266, + "learning_rate": 1.3712800652262536e-06, + "loss": 2.8518, + "step": 101640 + }, + { + "epoch": 6.906169316483218, + "grad_norm": 8.88097858428955, + "learning_rate": 1.3708554151379264e-06, + "loss": 2.5746, + "step": 101645 + }, + { + "epoch": 6.90650903655388, + "grad_norm": 6.762631893157959, + "learning_rate": 1.3704307650495992e-06, + "loss": 2.9836, + "step": 101650 + }, + { + "epoch": 6.906848756624541, + "grad_norm": 8.029423713684082, + "learning_rate": 1.370006114961272e-06, + "loss": 2.9257, + "step": 101655 + }, + { + "epoch": 6.907188476695203, + "grad_norm": 5.974935531616211, + "learning_rate": 1.3695814648729446e-06, + "loss": 2.7945, + "step": 101660 + }, + { + "epoch": 6.907528196765865, + "grad_norm": 9.440386772155762, + "learning_rate": 1.3691568147846176e-06, + "loss": 2.8472, + "step": 101665 + }, + { + "epoch": 6.907867916836526, + "grad_norm": 6.9730610847473145, + "learning_rate": 1.3687321646962904e-06, + "loss": 2.8027, + "step": 101670 + }, + { + "epoch": 6.908207636907188, + "grad_norm": 7.988716125488281, + "learning_rate": 1.368307514607963e-06, + "loss": 2.7204, + "step": 101675 + }, + { + "epoch": 6.90854735697785, + "grad_norm": 8.566075325012207, + "learning_rate": 1.367882864519636e-06, + "loss": 2.7869, + "step": 101680 + }, + { + "epoch": 6.908887077048512, + "grad_norm": 8.734868049621582, + "learning_rate": 1.3674582144313086e-06, + "loss": 2.7918, + "step": 101685 + }, + { + "epoch": 6.909226797119174, + "grad_norm": 7.702843189239502, + "learning_rate": 1.3670335643429816e-06, + "loss": 2.6184, + "step": 101690 + }, + { + "epoch": 6.909566517189836, + "grad_norm": 6.890658855438232, + "learning_rate": 1.3666089142546542e-06, + "loss": 2.4938, + "step": 101695 + }, + { + "epoch": 6.909906237260497, + "grad_norm": 8.461602210998535, + "learning_rate": 1.366184264166327e-06, + "loss": 2.6612, + "step": 101700 + }, + { + "epoch": 6.910245957331159, + "grad_norm": 8.154468536376953, + "learning_rate": 1.365759614078e-06, + "loss": 2.6329, + "step": 101705 + }, + { + "epoch": 6.910585677401821, + "grad_norm": 8.552310943603516, + "learning_rate": 1.3653349639896726e-06, + "loss": 2.7366, + "step": 101710 + }, + { + "epoch": 6.910925397472482, + "grad_norm": 6.237288475036621, + "learning_rate": 1.3649103139013454e-06, + "loss": 2.7612, + "step": 101715 + }, + { + "epoch": 6.911265117543144, + "grad_norm": 6.202717304229736, + "learning_rate": 1.3644856638130182e-06, + "loss": 2.731, + "step": 101720 + }, + { + "epoch": 6.911604837613806, + "grad_norm": 7.525658130645752, + "learning_rate": 1.364061013724691e-06, + "loss": 2.9428, + "step": 101725 + }, + { + "epoch": 6.911944557684468, + "grad_norm": 7.909127235412598, + "learning_rate": 1.3636363636363636e-06, + "loss": 2.8108, + "step": 101730 + }, + { + "epoch": 6.91228427775513, + "grad_norm": 7.239237308502197, + "learning_rate": 1.3632117135480366e-06, + "loss": 2.7893, + "step": 101735 + }, + { + "epoch": 6.912623997825792, + "grad_norm": 6.081937313079834, + "learning_rate": 1.3627870634597092e-06, + "loss": 2.5797, + "step": 101740 + }, + { + "epoch": 6.912963717896453, + "grad_norm": 7.168456554412842, + "learning_rate": 1.362362413371382e-06, + "loss": 2.7795, + "step": 101745 + }, + { + "epoch": 6.913303437967115, + "grad_norm": 9.848494529724121, + "learning_rate": 1.361937763283055e-06, + "loss": 2.5912, + "step": 101750 + }, + { + "epoch": 6.913643158037777, + "grad_norm": 9.299542427062988, + "learning_rate": 1.3615131131947276e-06, + "loss": 2.8821, + "step": 101755 + }, + { + "epoch": 6.913982878108438, + "grad_norm": 8.816255569458008, + "learning_rate": 1.3610884631064002e-06, + "loss": 2.6695, + "step": 101760 + }, + { + "epoch": 6.9143225981791, + "grad_norm": 7.504025936126709, + "learning_rate": 1.3606638130180732e-06, + "loss": 2.7947, + "step": 101765 + }, + { + "epoch": 6.914662318249762, + "grad_norm": 6.70170783996582, + "learning_rate": 1.360239162929746e-06, + "loss": 2.7272, + "step": 101770 + }, + { + "epoch": 6.915002038320424, + "grad_norm": 8.373004913330078, + "learning_rate": 1.3598145128414188e-06, + "loss": 2.5395, + "step": 101775 + }, + { + "epoch": 6.915341758391086, + "grad_norm": 8.801859855651855, + "learning_rate": 1.3593898627530916e-06, + "loss": 3.0226, + "step": 101780 + }, + { + "epoch": 6.915681478461748, + "grad_norm": 8.187067985534668, + "learning_rate": 1.3589652126647642e-06, + "loss": 2.564, + "step": 101785 + }, + { + "epoch": 6.916021198532409, + "grad_norm": 7.649663925170898, + "learning_rate": 1.3585405625764372e-06, + "loss": 2.7618, + "step": 101790 + }, + { + "epoch": 6.916360918603071, + "grad_norm": 7.772900104522705, + "learning_rate": 1.35811591248811e-06, + "loss": 2.6692, + "step": 101795 + }, + { + "epoch": 6.916700638673733, + "grad_norm": 9.1927490234375, + "learning_rate": 1.3576912623997826e-06, + "loss": 2.7435, + "step": 101800 + }, + { + "epoch": 6.917040358744394, + "grad_norm": 11.156736373901367, + "learning_rate": 1.3572666123114556e-06, + "loss": 2.6503, + "step": 101805 + }, + { + "epoch": 6.917380078815056, + "grad_norm": 7.432740211486816, + "learning_rate": 1.3568419622231282e-06, + "loss": 2.9408, + "step": 101810 + }, + { + "epoch": 6.917719798885718, + "grad_norm": 7.244011878967285, + "learning_rate": 1.356417312134801e-06, + "loss": 2.7369, + "step": 101815 + }, + { + "epoch": 6.91805951895638, + "grad_norm": 8.049358367919922, + "learning_rate": 1.3559926620464738e-06, + "loss": 2.8029, + "step": 101820 + }, + { + "epoch": 6.918399239027042, + "grad_norm": 8.087469100952148, + "learning_rate": 1.3555680119581466e-06, + "loss": 2.8272, + "step": 101825 + }, + { + "epoch": 6.918738959097704, + "grad_norm": 5.843347072601318, + "learning_rate": 1.3551433618698192e-06, + "loss": 2.8721, + "step": 101830 + }, + { + "epoch": 6.919078679168365, + "grad_norm": 10.846035957336426, + "learning_rate": 1.3547187117814922e-06, + "loss": 2.8162, + "step": 101835 + }, + { + "epoch": 6.919418399239027, + "grad_norm": 6.666922092437744, + "learning_rate": 1.3542940616931648e-06, + "loss": 2.7012, + "step": 101840 + }, + { + "epoch": 6.919758119309689, + "grad_norm": 9.058433532714844, + "learning_rate": 1.3538694116048376e-06, + "loss": 2.897, + "step": 101845 + }, + { + "epoch": 6.92009783938035, + "grad_norm": 6.984557628631592, + "learning_rate": 1.3534447615165106e-06, + "loss": 2.5574, + "step": 101850 + }, + { + "epoch": 6.920437559451012, + "grad_norm": 7.271780967712402, + "learning_rate": 1.3530201114281832e-06, + "loss": 2.9512, + "step": 101855 + }, + { + "epoch": 6.920777279521674, + "grad_norm": 6.330089569091797, + "learning_rate": 1.3525954613398562e-06, + "loss": 2.6941, + "step": 101860 + }, + { + "epoch": 6.921116999592336, + "grad_norm": 9.261301040649414, + "learning_rate": 1.3521708112515288e-06, + "loss": 2.6764, + "step": 101865 + }, + { + "epoch": 6.921456719662998, + "grad_norm": 6.875956058502197, + "learning_rate": 1.3517461611632016e-06, + "loss": 2.5928, + "step": 101870 + }, + { + "epoch": 6.92179643973366, + "grad_norm": 8.577924728393555, + "learning_rate": 1.3513215110748746e-06, + "loss": 2.5271, + "step": 101875 + }, + { + "epoch": 6.922136159804321, + "grad_norm": 7.7378249168396, + "learning_rate": 1.3508968609865472e-06, + "loss": 2.7824, + "step": 101880 + }, + { + "epoch": 6.922475879874983, + "grad_norm": 7.298318386077881, + "learning_rate": 1.3504722108982198e-06, + "loss": 2.6287, + "step": 101885 + }, + { + "epoch": 6.922815599945645, + "grad_norm": 9.020804405212402, + "learning_rate": 1.3500475608098928e-06, + "loss": 2.6236, + "step": 101890 + }, + { + "epoch": 6.923155320016306, + "grad_norm": 9.240277290344238, + "learning_rate": 1.3496229107215656e-06, + "loss": 2.9368, + "step": 101895 + }, + { + "epoch": 6.923495040086968, + "grad_norm": 6.9629950523376465, + "learning_rate": 1.3491982606332382e-06, + "loss": 2.5825, + "step": 101900 + }, + { + "epoch": 6.92383476015763, + "grad_norm": 10.161518096923828, + "learning_rate": 1.3487736105449112e-06, + "loss": 2.7111, + "step": 101905 + }, + { + "epoch": 6.924174480228292, + "grad_norm": 6.288658142089844, + "learning_rate": 1.3483489604565838e-06, + "loss": 2.6674, + "step": 101910 + }, + { + "epoch": 6.924514200298954, + "grad_norm": 9.565102577209473, + "learning_rate": 1.3479243103682566e-06, + "loss": 2.5491, + "step": 101915 + }, + { + "epoch": 6.924853920369616, + "grad_norm": 9.228358268737793, + "learning_rate": 1.3474996602799294e-06, + "loss": 2.6774, + "step": 101920 + }, + { + "epoch": 6.925193640440277, + "grad_norm": 7.70155668258667, + "learning_rate": 1.3470750101916022e-06, + "loss": 2.8351, + "step": 101925 + }, + { + "epoch": 6.925533360510939, + "grad_norm": 7.673140525817871, + "learning_rate": 1.3466503601032748e-06, + "loss": 2.6005, + "step": 101930 + }, + { + "epoch": 6.925873080581601, + "grad_norm": 7.592631816864014, + "learning_rate": 1.3462257100149478e-06, + "loss": 2.5972, + "step": 101935 + }, + { + "epoch": 6.926212800652262, + "grad_norm": 9.005341529846191, + "learning_rate": 1.3458010599266206e-06, + "loss": 2.685, + "step": 101940 + }, + { + "epoch": 6.926552520722924, + "grad_norm": 8.62851619720459, + "learning_rate": 1.3453764098382934e-06, + "loss": 2.7631, + "step": 101945 + }, + { + "epoch": 6.9268922407935865, + "grad_norm": 8.945269584655762, + "learning_rate": 1.3449517597499662e-06, + "loss": 2.7064, + "step": 101950 + }, + { + "epoch": 6.927231960864248, + "grad_norm": 8.084900856018066, + "learning_rate": 1.3445271096616388e-06, + "loss": 2.6248, + "step": 101955 + }, + { + "epoch": 6.92757168093491, + "grad_norm": 6.081833839416504, + "learning_rate": 1.3441024595733118e-06, + "loss": 2.9201, + "step": 101960 + }, + { + "epoch": 6.927911401005572, + "grad_norm": 6.870244026184082, + "learning_rate": 1.3436778094849844e-06, + "loss": 2.5811, + "step": 101965 + }, + { + "epoch": 6.928251121076233, + "grad_norm": 7.306449890136719, + "learning_rate": 1.3432531593966572e-06, + "loss": 2.972, + "step": 101970 + }, + { + "epoch": 6.928590841146895, + "grad_norm": 7.135505676269531, + "learning_rate": 1.3428285093083302e-06, + "loss": 2.8443, + "step": 101975 + }, + { + "epoch": 6.928930561217557, + "grad_norm": 7.519437789916992, + "learning_rate": 1.3424038592200028e-06, + "loss": 2.6031, + "step": 101980 + }, + { + "epoch": 6.929270281288218, + "grad_norm": 8.627616882324219, + "learning_rate": 1.3419792091316756e-06, + "loss": 2.5573, + "step": 101985 + }, + { + "epoch": 6.92961000135888, + "grad_norm": 9.332833290100098, + "learning_rate": 1.3415545590433484e-06, + "loss": 2.6952, + "step": 101990 + }, + { + "epoch": 6.9299497214295425, + "grad_norm": 8.376314163208008, + "learning_rate": 1.3411299089550212e-06, + "loss": 2.6977, + "step": 101995 + }, + { + "epoch": 6.930289441500204, + "grad_norm": 7.811742305755615, + "learning_rate": 1.3407052588666938e-06, + "loss": 3.0026, + "step": 102000 + }, + { + "epoch": 6.930629161570866, + "grad_norm": 8.374086380004883, + "learning_rate": 1.3402806087783668e-06, + "loss": 2.7063, + "step": 102005 + }, + { + "epoch": 6.930968881641528, + "grad_norm": 8.572470664978027, + "learning_rate": 1.3398559586900394e-06, + "loss": 2.746, + "step": 102010 + }, + { + "epoch": 6.931308601712189, + "grad_norm": 7.805063724517822, + "learning_rate": 1.3394313086017122e-06, + "loss": 2.6671, + "step": 102015 + }, + { + "epoch": 6.931648321782851, + "grad_norm": 5.784371376037598, + "learning_rate": 1.3390066585133852e-06, + "loss": 2.8235, + "step": 102020 + }, + { + "epoch": 6.931988041853513, + "grad_norm": 7.760021686553955, + "learning_rate": 1.3385820084250578e-06, + "loss": 2.7379, + "step": 102025 + }, + { + "epoch": 6.932327761924174, + "grad_norm": 8.5758638381958, + "learning_rate": 1.3381573583367308e-06, + "loss": 2.392, + "step": 102030 + }, + { + "epoch": 6.932667481994836, + "grad_norm": 9.945347785949707, + "learning_rate": 1.3377327082484034e-06, + "loss": 2.8072, + "step": 102035 + }, + { + "epoch": 6.9330072020654985, + "grad_norm": 6.357569694519043, + "learning_rate": 1.3373080581600762e-06, + "loss": 2.8419, + "step": 102040 + }, + { + "epoch": 6.93334692213616, + "grad_norm": 8.677470207214355, + "learning_rate": 1.336883408071749e-06, + "loss": 2.699, + "step": 102045 + }, + { + "epoch": 6.933686642206822, + "grad_norm": 8.195550918579102, + "learning_rate": 1.3364587579834218e-06, + "loss": 2.7097, + "step": 102050 + }, + { + "epoch": 6.934026362277484, + "grad_norm": 9.235910415649414, + "learning_rate": 1.3360341078950944e-06, + "loss": 2.7143, + "step": 102055 + }, + { + "epoch": 6.934366082348145, + "grad_norm": 4.805835247039795, + "learning_rate": 1.3356094578067674e-06, + "loss": 2.6935, + "step": 102060 + }, + { + "epoch": 6.934705802418807, + "grad_norm": 10.167900085449219, + "learning_rate": 1.3351848077184402e-06, + "loss": 2.7168, + "step": 102065 + }, + { + "epoch": 6.935045522489469, + "grad_norm": 8.407793998718262, + "learning_rate": 1.3347601576301128e-06, + "loss": 2.6591, + "step": 102070 + }, + { + "epoch": 6.93538524256013, + "grad_norm": 8.845714569091797, + "learning_rate": 1.3343355075417858e-06, + "loss": 2.7445, + "step": 102075 + }, + { + "epoch": 6.935724962630792, + "grad_norm": 9.254257202148438, + "learning_rate": 1.3339108574534584e-06, + "loss": 2.8046, + "step": 102080 + }, + { + "epoch": 6.9360646827014545, + "grad_norm": 9.071935653686523, + "learning_rate": 1.3334862073651312e-06, + "loss": 2.8637, + "step": 102085 + }, + { + "epoch": 6.936404402772116, + "grad_norm": 9.454960823059082, + "learning_rate": 1.333061557276804e-06, + "loss": 2.8434, + "step": 102090 + }, + { + "epoch": 6.936744122842778, + "grad_norm": 9.040096282958984, + "learning_rate": 1.3326369071884768e-06, + "loss": 2.7244, + "step": 102095 + }, + { + "epoch": 6.93708384291344, + "grad_norm": 8.55260181427002, + "learning_rate": 1.3322122571001494e-06, + "loss": 2.4061, + "step": 102100 + }, + { + "epoch": 6.937423562984101, + "grad_norm": 7.708678245544434, + "learning_rate": 1.3317876070118224e-06, + "loss": 2.6585, + "step": 102105 + }, + { + "epoch": 6.937763283054763, + "grad_norm": 7.6055803298950195, + "learning_rate": 1.3313629569234952e-06, + "loss": 2.5197, + "step": 102110 + }, + { + "epoch": 6.938103003125424, + "grad_norm": 8.016973495483398, + "learning_rate": 1.330938306835168e-06, + "loss": 2.5086, + "step": 102115 + }, + { + "epoch": 6.938442723196086, + "grad_norm": 6.504159450531006, + "learning_rate": 1.3305136567468408e-06, + "loss": 2.7804, + "step": 102120 + }, + { + "epoch": 6.938782443266748, + "grad_norm": 7.786032199859619, + "learning_rate": 1.3300890066585134e-06, + "loss": 2.8607, + "step": 102125 + }, + { + "epoch": 6.93912216333741, + "grad_norm": 6.10360050201416, + "learning_rate": 1.3296643565701864e-06, + "loss": 2.8967, + "step": 102130 + }, + { + "epoch": 6.939461883408072, + "grad_norm": 7.674428462982178, + "learning_rate": 1.329239706481859e-06, + "loss": 2.6284, + "step": 102135 + }, + { + "epoch": 6.939801603478734, + "grad_norm": 7.734315395355225, + "learning_rate": 1.3288150563935318e-06, + "loss": 2.8127, + "step": 102140 + }, + { + "epoch": 6.940141323549395, + "grad_norm": 8.916954040527344, + "learning_rate": 1.3283904063052048e-06, + "loss": 2.6502, + "step": 102145 + }, + { + "epoch": 6.940481043620057, + "grad_norm": 6.640257358551025, + "learning_rate": 1.3279657562168774e-06, + "loss": 2.8004, + "step": 102150 + }, + { + "epoch": 6.940820763690719, + "grad_norm": 7.994046688079834, + "learning_rate": 1.32754110612855e-06, + "loss": 2.7736, + "step": 102155 + }, + { + "epoch": 6.94116048376138, + "grad_norm": 8.058365821838379, + "learning_rate": 1.327116456040223e-06, + "loss": 2.561, + "step": 102160 + }, + { + "epoch": 6.941500203832042, + "grad_norm": 8.711188316345215, + "learning_rate": 1.3266918059518958e-06, + "loss": 2.8108, + "step": 102165 + }, + { + "epoch": 6.941839923902704, + "grad_norm": 6.8840155601501465, + "learning_rate": 1.3262671558635684e-06, + "loss": 2.5942, + "step": 102170 + }, + { + "epoch": 6.942179643973366, + "grad_norm": 8.989970207214355, + "learning_rate": 1.3258425057752414e-06, + "loss": 2.919, + "step": 102175 + }, + { + "epoch": 6.942519364044028, + "grad_norm": 6.341495037078857, + "learning_rate": 1.325417855686914e-06, + "loss": 2.6512, + "step": 102180 + }, + { + "epoch": 6.94285908411469, + "grad_norm": 9.034095764160156, + "learning_rate": 1.3249932055985868e-06, + "loss": 2.9255, + "step": 102185 + }, + { + "epoch": 6.943198804185351, + "grad_norm": 7.335689544677734, + "learning_rate": 1.3245685555102598e-06, + "loss": 2.6438, + "step": 102190 + }, + { + "epoch": 6.943538524256013, + "grad_norm": 6.900457859039307, + "learning_rate": 1.3241439054219324e-06, + "loss": 2.7814, + "step": 102195 + }, + { + "epoch": 6.943878244326675, + "grad_norm": 9.308131217956543, + "learning_rate": 1.3237192553336054e-06, + "loss": 2.5767, + "step": 102200 + }, + { + "epoch": 6.944217964397336, + "grad_norm": 6.892062664031982, + "learning_rate": 1.323294605245278e-06, + "loss": 2.894, + "step": 102205 + }, + { + "epoch": 6.944557684467998, + "grad_norm": 8.316444396972656, + "learning_rate": 1.3228699551569508e-06, + "loss": 2.4464, + "step": 102210 + }, + { + "epoch": 6.9448974045386604, + "grad_norm": 9.422224044799805, + "learning_rate": 1.3224453050686236e-06, + "loss": 2.7585, + "step": 102215 + }, + { + "epoch": 6.945237124609322, + "grad_norm": 7.075852870941162, + "learning_rate": 1.3220206549802964e-06, + "loss": 2.8367, + "step": 102220 + }, + { + "epoch": 6.945576844679984, + "grad_norm": 7.222334861755371, + "learning_rate": 1.321596004891969e-06, + "loss": 2.8154, + "step": 102225 + }, + { + "epoch": 6.945916564750646, + "grad_norm": 7.260406494140625, + "learning_rate": 1.321171354803642e-06, + "loss": 2.9441, + "step": 102230 + }, + { + "epoch": 6.946256284821307, + "grad_norm": 6.8639302253723145, + "learning_rate": 1.3207467047153146e-06, + "loss": 2.8548, + "step": 102235 + }, + { + "epoch": 6.946596004891969, + "grad_norm": 9.279582977294922, + "learning_rate": 1.3203220546269874e-06, + "loss": 2.986, + "step": 102240 + }, + { + "epoch": 6.946935724962631, + "grad_norm": 6.456822872161865, + "learning_rate": 1.3198974045386604e-06, + "loss": 2.6751, + "step": 102245 + }, + { + "epoch": 6.947275445033292, + "grad_norm": 6.796950340270996, + "learning_rate": 1.319472754450333e-06, + "loss": 2.9705, + "step": 102250 + }, + { + "epoch": 6.947615165103954, + "grad_norm": 7.050084114074707, + "learning_rate": 1.3190481043620058e-06, + "loss": 2.8035, + "step": 102255 + }, + { + "epoch": 6.9479548851746165, + "grad_norm": 7.185593128204346, + "learning_rate": 1.3186234542736786e-06, + "loss": 2.6312, + "step": 102260 + }, + { + "epoch": 6.948294605245278, + "grad_norm": 7.867452621459961, + "learning_rate": 1.3181988041853514e-06, + "loss": 2.8391, + "step": 102265 + }, + { + "epoch": 6.94863432531594, + "grad_norm": 8.62576675415039, + "learning_rate": 1.317774154097024e-06, + "loss": 2.647, + "step": 102270 + }, + { + "epoch": 6.948974045386602, + "grad_norm": 9.772052764892578, + "learning_rate": 1.317349504008697e-06, + "loss": 2.7372, + "step": 102275 + }, + { + "epoch": 6.949313765457263, + "grad_norm": 6.181421756744385, + "learning_rate": 1.3169248539203696e-06, + "loss": 2.808, + "step": 102280 + }, + { + "epoch": 6.949653485527925, + "grad_norm": 8.618608474731445, + "learning_rate": 1.3165002038320426e-06, + "loss": 2.708, + "step": 102285 + }, + { + "epoch": 6.949993205598587, + "grad_norm": 8.123919486999512, + "learning_rate": 1.3160755537437154e-06, + "loss": 2.6529, + "step": 102290 + }, + { + "epoch": 6.950332925669248, + "grad_norm": 7.89006233215332, + "learning_rate": 1.315650903655388e-06, + "loss": 2.713, + "step": 102295 + }, + { + "epoch": 6.95067264573991, + "grad_norm": 8.377711296081543, + "learning_rate": 1.315226253567061e-06, + "loss": 2.6751, + "step": 102300 + }, + { + "epoch": 6.9510123658105725, + "grad_norm": 7.69728422164917, + "learning_rate": 1.3148016034787336e-06, + "loss": 2.8585, + "step": 102305 + }, + { + "epoch": 6.951352085881234, + "grad_norm": 9.008362770080566, + "learning_rate": 1.3143769533904064e-06, + "loss": 2.8151, + "step": 102310 + }, + { + "epoch": 6.951691805951896, + "grad_norm": 9.154732704162598, + "learning_rate": 1.3139523033020792e-06, + "loss": 2.6875, + "step": 102315 + }, + { + "epoch": 6.952031526022557, + "grad_norm": 6.82924747467041, + "learning_rate": 1.313527653213752e-06, + "loss": 2.7668, + "step": 102320 + }, + { + "epoch": 6.952371246093219, + "grad_norm": 7.144805908203125, + "learning_rate": 1.3131030031254246e-06, + "loss": 2.6612, + "step": 102325 + }, + { + "epoch": 6.952710966163881, + "grad_norm": 7.936432361602783, + "learning_rate": 1.3126783530370976e-06, + "loss": 2.9178, + "step": 102330 + }, + { + "epoch": 6.953050686234542, + "grad_norm": 8.144574165344238, + "learning_rate": 1.3122537029487704e-06, + "loss": 2.7847, + "step": 102335 + }, + { + "epoch": 6.953390406305204, + "grad_norm": 9.548989295959473, + "learning_rate": 1.311829052860443e-06, + "loss": 2.7073, + "step": 102340 + }, + { + "epoch": 6.953730126375866, + "grad_norm": 5.747408390045166, + "learning_rate": 1.311404402772116e-06, + "loss": 3.0039, + "step": 102345 + }, + { + "epoch": 6.954069846446528, + "grad_norm": 7.648177146911621, + "learning_rate": 1.3109797526837886e-06, + "loss": 3.033, + "step": 102350 + }, + { + "epoch": 6.95440956651719, + "grad_norm": 7.112540245056152, + "learning_rate": 1.3105551025954614e-06, + "loss": 2.6311, + "step": 102355 + }, + { + "epoch": 6.954749286587852, + "grad_norm": 7.206064224243164, + "learning_rate": 1.3101304525071342e-06, + "loss": 2.5762, + "step": 102360 + }, + { + "epoch": 6.955089006658513, + "grad_norm": 10.400552749633789, + "learning_rate": 1.309705802418807e-06, + "loss": 2.8402, + "step": 102365 + }, + { + "epoch": 6.955428726729175, + "grad_norm": 6.538234233856201, + "learning_rate": 1.30928115233048e-06, + "loss": 2.841, + "step": 102370 + }, + { + "epoch": 6.955768446799837, + "grad_norm": 9.18172836303711, + "learning_rate": 1.3088565022421526e-06, + "loss": 2.8488, + "step": 102375 + }, + { + "epoch": 6.956108166870498, + "grad_norm": 8.743943214416504, + "learning_rate": 1.3084318521538254e-06, + "loss": 2.8195, + "step": 102380 + }, + { + "epoch": 6.95644788694116, + "grad_norm": 6.551158428192139, + "learning_rate": 1.3080072020654982e-06, + "loss": 2.7575, + "step": 102385 + }, + { + "epoch": 6.956787607011822, + "grad_norm": 7.34097146987915, + "learning_rate": 1.307582551977171e-06, + "loss": 2.8566, + "step": 102390 + }, + { + "epoch": 6.957127327082484, + "grad_norm": 6.33268404006958, + "learning_rate": 1.3071579018888436e-06, + "loss": 2.4056, + "step": 102395 + }, + { + "epoch": 6.957467047153146, + "grad_norm": 9.489953994750977, + "learning_rate": 1.3067332518005166e-06, + "loss": 2.8249, + "step": 102400 + }, + { + "epoch": 6.957806767223808, + "grad_norm": 8.186125755310059, + "learning_rate": 1.3063086017121892e-06, + "loss": 2.9497, + "step": 102405 + }, + { + "epoch": 6.958146487294469, + "grad_norm": 6.9292216300964355, + "learning_rate": 1.305883951623862e-06, + "loss": 2.809, + "step": 102410 + }, + { + "epoch": 6.958486207365131, + "grad_norm": 7.415529251098633, + "learning_rate": 1.305459301535535e-06, + "loss": 2.4925, + "step": 102415 + }, + { + "epoch": 6.958825927435793, + "grad_norm": 6.641077995300293, + "learning_rate": 1.3050346514472076e-06, + "loss": 2.76, + "step": 102420 + }, + { + "epoch": 6.959165647506454, + "grad_norm": 7.405792713165283, + "learning_rate": 1.3046100013588804e-06, + "loss": 2.8449, + "step": 102425 + }, + { + "epoch": 6.959505367577116, + "grad_norm": 8.160822868347168, + "learning_rate": 1.3041853512705532e-06, + "loss": 2.8113, + "step": 102430 + }, + { + "epoch": 6.959845087647778, + "grad_norm": 8.806172370910645, + "learning_rate": 1.303760701182226e-06, + "loss": 2.8536, + "step": 102435 + }, + { + "epoch": 6.96018480771844, + "grad_norm": 7.388904094696045, + "learning_rate": 1.3033360510938986e-06, + "loss": 2.8171, + "step": 102440 + }, + { + "epoch": 6.960524527789102, + "grad_norm": 7.169656753540039, + "learning_rate": 1.3029114010055716e-06, + "loss": 2.8298, + "step": 102445 + }, + { + "epoch": 6.960864247859764, + "grad_norm": 7.506324768066406, + "learning_rate": 1.3024867509172442e-06, + "loss": 2.6323, + "step": 102450 + }, + { + "epoch": 6.961203967930425, + "grad_norm": 7.1527419090271, + "learning_rate": 1.3020621008289172e-06, + "loss": 2.5192, + "step": 102455 + }, + { + "epoch": 6.961543688001087, + "grad_norm": 7.723557472229004, + "learning_rate": 1.30163745074059e-06, + "loss": 2.9425, + "step": 102460 + }, + { + "epoch": 6.961883408071749, + "grad_norm": 7.1255903244018555, + "learning_rate": 1.3012128006522626e-06, + "loss": 2.7211, + "step": 102465 + }, + { + "epoch": 6.96222312814241, + "grad_norm": 6.486015319824219, + "learning_rate": 1.3007881505639356e-06, + "loss": 2.4932, + "step": 102470 + }, + { + "epoch": 6.962562848213072, + "grad_norm": 6.830507278442383, + "learning_rate": 1.3003635004756082e-06, + "loss": 2.4869, + "step": 102475 + }, + { + "epoch": 6.962902568283734, + "grad_norm": 8.91594409942627, + "learning_rate": 1.299938850387281e-06, + "loss": 2.6114, + "step": 102480 + }, + { + "epoch": 6.963242288354396, + "grad_norm": 6.512566566467285, + "learning_rate": 1.2995142002989538e-06, + "loss": 3.0018, + "step": 102485 + }, + { + "epoch": 6.963582008425058, + "grad_norm": 10.01087474822998, + "learning_rate": 1.2990895502106266e-06, + "loss": 2.7876, + "step": 102490 + }, + { + "epoch": 6.96392172849572, + "grad_norm": 7.4916815757751465, + "learning_rate": 1.2986649001222992e-06, + "loss": 2.9008, + "step": 102495 + }, + { + "epoch": 6.964261448566381, + "grad_norm": 8.545708656311035, + "learning_rate": 1.2982402500339722e-06, + "loss": 2.7303, + "step": 102500 + }, + { + "epoch": 6.964601168637043, + "grad_norm": 8.52513313293457, + "learning_rate": 1.297815599945645e-06, + "loss": 2.8511, + "step": 102505 + }, + { + "epoch": 6.964940888707705, + "grad_norm": 5.634533882141113, + "learning_rate": 1.2973909498573176e-06, + "loss": 2.5042, + "step": 102510 + }, + { + "epoch": 6.965280608778366, + "grad_norm": 8.34119987487793, + "learning_rate": 1.2969662997689906e-06, + "loss": 2.8519, + "step": 102515 + }, + { + "epoch": 6.965620328849028, + "grad_norm": 6.968369483947754, + "learning_rate": 1.2965416496806632e-06, + "loss": 2.7651, + "step": 102520 + }, + { + "epoch": 6.9659600489196905, + "grad_norm": 8.898454666137695, + "learning_rate": 1.296116999592336e-06, + "loss": 2.9366, + "step": 102525 + }, + { + "epoch": 6.966299768990352, + "grad_norm": 8.630002975463867, + "learning_rate": 1.2956923495040088e-06, + "loss": 2.7767, + "step": 102530 + }, + { + "epoch": 6.966639489061014, + "grad_norm": 7.990040302276611, + "learning_rate": 1.2952676994156816e-06, + "loss": 2.8589, + "step": 102535 + }, + { + "epoch": 6.966979209131676, + "grad_norm": 8.1199951171875, + "learning_rate": 1.2948430493273546e-06, + "loss": 3.0441, + "step": 102540 + }, + { + "epoch": 6.967318929202337, + "grad_norm": 7.689202308654785, + "learning_rate": 1.2944183992390272e-06, + "loss": 2.7485, + "step": 102545 + }, + { + "epoch": 6.967658649272999, + "grad_norm": 9.134881019592285, + "learning_rate": 1.2939937491506998e-06, + "loss": 2.7222, + "step": 102550 + }, + { + "epoch": 6.967998369343661, + "grad_norm": 7.870962619781494, + "learning_rate": 1.2935690990623728e-06, + "loss": 2.8768, + "step": 102555 + }, + { + "epoch": 6.968338089414322, + "grad_norm": 7.528188705444336, + "learning_rate": 1.2931444489740456e-06, + "loss": 2.8023, + "step": 102560 + }, + { + "epoch": 6.968677809484984, + "grad_norm": 8.571666717529297, + "learning_rate": 1.2927197988857182e-06, + "loss": 2.841, + "step": 102565 + }, + { + "epoch": 6.9690175295556465, + "grad_norm": 10.391617774963379, + "learning_rate": 1.2922951487973912e-06, + "loss": 2.6779, + "step": 102570 + }, + { + "epoch": 6.969357249626308, + "grad_norm": 8.1799898147583, + "learning_rate": 1.2918704987090638e-06, + "loss": 2.7666, + "step": 102575 + }, + { + "epoch": 6.96969696969697, + "grad_norm": 9.093749046325684, + "learning_rate": 1.2914458486207366e-06, + "loss": 2.938, + "step": 102580 + }, + { + "epoch": 6.970036689767632, + "grad_norm": 8.961901664733887, + "learning_rate": 1.2910211985324096e-06, + "loss": 2.7114, + "step": 102585 + }, + { + "epoch": 6.970376409838293, + "grad_norm": 8.702773094177246, + "learning_rate": 1.2905965484440822e-06, + "loss": 2.5246, + "step": 102590 + }, + { + "epoch": 6.970716129908955, + "grad_norm": 7.744819641113281, + "learning_rate": 1.2901718983557548e-06, + "loss": 2.6931, + "step": 102595 + }, + { + "epoch": 6.971055849979617, + "grad_norm": 9.470341682434082, + "learning_rate": 1.2897472482674278e-06, + "loss": 2.8058, + "step": 102600 + }, + { + "epoch": 6.971395570050278, + "grad_norm": 9.751203536987305, + "learning_rate": 1.2893225981791006e-06, + "loss": 2.812, + "step": 102605 + }, + { + "epoch": 6.97173529012094, + "grad_norm": 8.991437911987305, + "learning_rate": 1.2888979480907732e-06, + "loss": 2.5921, + "step": 102610 + }, + { + "epoch": 6.9720750101916025, + "grad_norm": 7.688000202178955, + "learning_rate": 1.2884732980024462e-06, + "loss": 2.6905, + "step": 102615 + }, + { + "epoch": 6.972414730262264, + "grad_norm": 8.011898040771484, + "learning_rate": 1.2880486479141188e-06, + "loss": 2.8396, + "step": 102620 + }, + { + "epoch": 6.972754450332926, + "grad_norm": 6.895003795623779, + "learning_rate": 1.2876239978257918e-06, + "loss": 2.7825, + "step": 102625 + }, + { + "epoch": 6.973094170403588, + "grad_norm": 7.224112033843994, + "learning_rate": 1.2871993477374644e-06, + "loss": 2.7356, + "step": 102630 + }, + { + "epoch": 6.973433890474249, + "grad_norm": 9.647807121276855, + "learning_rate": 1.2867746976491372e-06, + "loss": 2.6236, + "step": 102635 + }, + { + "epoch": 6.973773610544911, + "grad_norm": 7.575270652770996, + "learning_rate": 1.2863500475608102e-06, + "loss": 2.7185, + "step": 102640 + }, + { + "epoch": 6.974113330615573, + "grad_norm": 8.812464714050293, + "learning_rate": 1.2859253974724828e-06, + "loss": 2.6619, + "step": 102645 + }, + { + "epoch": 6.974453050686234, + "grad_norm": 7.993076324462891, + "learning_rate": 1.2855007473841556e-06, + "loss": 2.6795, + "step": 102650 + }, + { + "epoch": 6.974792770756896, + "grad_norm": 9.216817855834961, + "learning_rate": 1.2850760972958284e-06, + "loss": 2.7742, + "step": 102655 + }, + { + "epoch": 6.9751324908275585, + "grad_norm": 8.744462966918945, + "learning_rate": 1.2846514472075012e-06, + "loss": 2.4832, + "step": 102660 + }, + { + "epoch": 6.97547221089822, + "grad_norm": 6.756660461425781, + "learning_rate": 1.2842267971191738e-06, + "loss": 2.6176, + "step": 102665 + }, + { + "epoch": 6.975811930968882, + "grad_norm": 6.309206485748291, + "learning_rate": 1.2838021470308468e-06, + "loss": 2.6522, + "step": 102670 + }, + { + "epoch": 6.976151651039544, + "grad_norm": 7.212698459625244, + "learning_rate": 1.2833774969425194e-06, + "loss": 2.736, + "step": 102675 + }, + { + "epoch": 6.976491371110205, + "grad_norm": 8.61663818359375, + "learning_rate": 1.2829528468541922e-06, + "loss": 2.6242, + "step": 102680 + }, + { + "epoch": 6.976831091180867, + "grad_norm": 7.276638984680176, + "learning_rate": 1.2825281967658652e-06, + "loss": 2.714, + "step": 102685 + }, + { + "epoch": 6.977170811251529, + "grad_norm": 8.20138168334961, + "learning_rate": 1.2821035466775378e-06, + "loss": 2.5727, + "step": 102690 + }, + { + "epoch": 6.97751053132219, + "grad_norm": 7.199765682220459, + "learning_rate": 1.2816788965892106e-06, + "loss": 2.6713, + "step": 102695 + }, + { + "epoch": 6.977850251392852, + "grad_norm": 8.121194839477539, + "learning_rate": 1.2812542465008834e-06, + "loss": 2.8048, + "step": 102700 + }, + { + "epoch": 6.9781899714635145, + "grad_norm": 7.1491804122924805, + "learning_rate": 1.2808295964125562e-06, + "loss": 2.8725, + "step": 102705 + }, + { + "epoch": 6.978529691534176, + "grad_norm": 7.411249160766602, + "learning_rate": 1.280404946324229e-06, + "loss": 2.7102, + "step": 102710 + }, + { + "epoch": 6.978869411604838, + "grad_norm": 8.221524238586426, + "learning_rate": 1.2799802962359018e-06, + "loss": 2.6598, + "step": 102715 + }, + { + "epoch": 6.9792091316755, + "grad_norm": 8.932463645935059, + "learning_rate": 1.2795556461475744e-06, + "loss": 2.6533, + "step": 102720 + }, + { + "epoch": 6.979548851746161, + "grad_norm": 6.8556294441223145, + "learning_rate": 1.2791309960592474e-06, + "loss": 2.8005, + "step": 102725 + }, + { + "epoch": 6.979888571816823, + "grad_norm": 6.145258903503418, + "learning_rate": 1.2787063459709202e-06, + "loss": 2.8208, + "step": 102730 + }, + { + "epoch": 6.980228291887485, + "grad_norm": 8.33495044708252, + "learning_rate": 1.2782816958825928e-06, + "loss": 2.7568, + "step": 102735 + }, + { + "epoch": 6.980568011958146, + "grad_norm": 9.255611419677734, + "learning_rate": 1.2778570457942658e-06, + "loss": 2.6403, + "step": 102740 + }, + { + "epoch": 6.980907732028808, + "grad_norm": 8.063619613647461, + "learning_rate": 1.2774323957059384e-06, + "loss": 2.9664, + "step": 102745 + }, + { + "epoch": 6.9812474520994705, + "grad_norm": 7.975089073181152, + "learning_rate": 1.2770077456176112e-06, + "loss": 2.7893, + "step": 102750 + }, + { + "epoch": 6.981587172170132, + "grad_norm": 8.904863357543945, + "learning_rate": 1.276583095529284e-06, + "loss": 2.901, + "step": 102755 + }, + { + "epoch": 6.981926892240794, + "grad_norm": 8.5765380859375, + "learning_rate": 1.2761584454409568e-06, + "loss": 2.5553, + "step": 102760 + }, + { + "epoch": 6.982266612311456, + "grad_norm": 8.853114128112793, + "learning_rate": 1.2757337953526293e-06, + "loss": 2.781, + "step": 102765 + }, + { + "epoch": 6.982606332382117, + "grad_norm": 7.197207450866699, + "learning_rate": 1.2753091452643024e-06, + "loss": 2.845, + "step": 102770 + }, + { + "epoch": 6.982946052452779, + "grad_norm": 8.500534057617188, + "learning_rate": 1.2748844951759752e-06, + "loss": 2.8258, + "step": 102775 + }, + { + "epoch": 6.983285772523441, + "grad_norm": 7.526294231414795, + "learning_rate": 1.2744598450876478e-06, + "loss": 2.7669, + "step": 102780 + }, + { + "epoch": 6.983625492594102, + "grad_norm": 5.404770851135254, + "learning_rate": 1.2740351949993208e-06, + "loss": 2.7458, + "step": 102785 + }, + { + "epoch": 6.983965212664764, + "grad_norm": 8.30547046661377, + "learning_rate": 1.2736105449109934e-06, + "loss": 2.629, + "step": 102790 + }, + { + "epoch": 6.984304932735426, + "grad_norm": 7.984428882598877, + "learning_rate": 1.2731858948226664e-06, + "loss": 2.8798, + "step": 102795 + }, + { + "epoch": 6.984644652806088, + "grad_norm": 6.69755744934082, + "learning_rate": 1.272761244734339e-06, + "loss": 2.767, + "step": 102800 + }, + { + "epoch": 6.98498437287675, + "grad_norm": 7.962437152862549, + "learning_rate": 1.2723365946460118e-06, + "loss": 2.7171, + "step": 102805 + }, + { + "epoch": 6.985324092947411, + "grad_norm": 6.280322551727295, + "learning_rate": 1.2719119445576848e-06, + "loss": 2.9244, + "step": 102810 + }, + { + "epoch": 6.985663813018073, + "grad_norm": 7.990108966827393, + "learning_rate": 1.2714872944693574e-06, + "loss": 2.7667, + "step": 102815 + }, + { + "epoch": 6.986003533088735, + "grad_norm": 7.890318393707275, + "learning_rate": 1.2710626443810302e-06, + "loss": 2.8591, + "step": 102820 + }, + { + "epoch": 6.986343253159396, + "grad_norm": 9.860941886901855, + "learning_rate": 1.270637994292703e-06, + "loss": 2.8851, + "step": 102825 + }, + { + "epoch": 6.986682973230058, + "grad_norm": 9.2820463180542, + "learning_rate": 1.2702133442043758e-06, + "loss": 2.9299, + "step": 102830 + }, + { + "epoch": 6.9870226933007205, + "grad_norm": 11.29103946685791, + "learning_rate": 1.2697886941160483e-06, + "loss": 2.9325, + "step": 102835 + }, + { + "epoch": 6.987362413371382, + "grad_norm": 9.284786224365234, + "learning_rate": 1.2693640440277214e-06, + "loss": 2.8979, + "step": 102840 + }, + { + "epoch": 6.987702133442044, + "grad_norm": 7.648629665374756, + "learning_rate": 1.268939393939394e-06, + "loss": 2.7384, + "step": 102845 + }, + { + "epoch": 6.988041853512706, + "grad_norm": 6.5826802253723145, + "learning_rate": 1.2685147438510668e-06, + "loss": 2.5238, + "step": 102850 + }, + { + "epoch": 6.988381573583367, + "grad_norm": 9.397858619689941, + "learning_rate": 1.2680900937627398e-06, + "loss": 2.467, + "step": 102855 + }, + { + "epoch": 6.988721293654029, + "grad_norm": 8.206071853637695, + "learning_rate": 1.2676654436744124e-06, + "loss": 2.7266, + "step": 102860 + }, + { + "epoch": 6.989061013724691, + "grad_norm": 6.628955841064453, + "learning_rate": 1.267240793586085e-06, + "loss": 2.5993, + "step": 102865 + }, + { + "epoch": 6.989400733795352, + "grad_norm": 8.551078796386719, + "learning_rate": 1.266816143497758e-06, + "loss": 2.5507, + "step": 102870 + }, + { + "epoch": 6.989740453866014, + "grad_norm": 9.612774848937988, + "learning_rate": 1.2663914934094308e-06, + "loss": 2.6132, + "step": 102875 + }, + { + "epoch": 6.9900801739366765, + "grad_norm": 5.157508850097656, + "learning_rate": 1.2659668433211036e-06, + "loss": 2.8132, + "step": 102880 + }, + { + "epoch": 6.990419894007338, + "grad_norm": 8.57419204711914, + "learning_rate": 1.2655421932327764e-06, + "loss": 2.8658, + "step": 102885 + }, + { + "epoch": 6.990759614078, + "grad_norm": 7.64235258102417, + "learning_rate": 1.265117543144449e-06, + "loss": 2.7323, + "step": 102890 + }, + { + "epoch": 6.991099334148662, + "grad_norm": 7.951957702636719, + "learning_rate": 1.264692893056122e-06, + "loss": 2.6164, + "step": 102895 + }, + { + "epoch": 6.991439054219323, + "grad_norm": 7.900129318237305, + "learning_rate": 1.2642682429677948e-06, + "loss": 2.7384, + "step": 102900 + }, + { + "epoch": 6.991778774289985, + "grad_norm": 9.769871711730957, + "learning_rate": 1.2638435928794673e-06, + "loss": 2.5823, + "step": 102905 + }, + { + "epoch": 6.992118494360647, + "grad_norm": 9.045907020568848, + "learning_rate": 1.2634189427911404e-06, + "loss": 3.03, + "step": 102910 + }, + { + "epoch": 6.992458214431308, + "grad_norm": 8.983277320861816, + "learning_rate": 1.262994292702813e-06, + "loss": 3.0272, + "step": 102915 + }, + { + "epoch": 6.99279793450197, + "grad_norm": 7.641284465789795, + "learning_rate": 1.2625696426144857e-06, + "loss": 2.7874, + "step": 102920 + }, + { + "epoch": 6.9931376545726325, + "grad_norm": 9.980202674865723, + "learning_rate": 1.2621449925261586e-06, + "loss": 2.8867, + "step": 102925 + }, + { + "epoch": 6.993477374643294, + "grad_norm": 8.07332706451416, + "learning_rate": 1.2617203424378314e-06, + "loss": 3.0342, + "step": 102930 + }, + { + "epoch": 6.993817094713956, + "grad_norm": 7.24611234664917, + "learning_rate": 1.261295692349504e-06, + "loss": 2.7281, + "step": 102935 + }, + { + "epoch": 6.994156814784618, + "grad_norm": 7.905289173126221, + "learning_rate": 1.260871042261177e-06, + "loss": 2.7636, + "step": 102940 + }, + { + "epoch": 6.994496534855279, + "grad_norm": 7.64995813369751, + "learning_rate": 1.2604463921728495e-06, + "loss": 2.8956, + "step": 102945 + }, + { + "epoch": 6.994836254925941, + "grad_norm": 10.58121395111084, + "learning_rate": 1.2600217420845223e-06, + "loss": 2.7846, + "step": 102950 + }, + { + "epoch": 6.995175974996603, + "grad_norm": 6.018618583679199, + "learning_rate": 1.2595970919961954e-06, + "loss": 2.7076, + "step": 102955 + }, + { + "epoch": 6.995515695067264, + "grad_norm": 7.520453453063965, + "learning_rate": 1.259172441907868e-06, + "loss": 2.8719, + "step": 102960 + }, + { + "epoch": 6.995855415137926, + "grad_norm": 6.758708477020264, + "learning_rate": 1.258747791819541e-06, + "loss": 2.5333, + "step": 102965 + }, + { + "epoch": 6.9961951352085885, + "grad_norm": 7.997344017028809, + "learning_rate": 1.2583231417312135e-06, + "loss": 2.9214, + "step": 102970 + }, + { + "epoch": 6.99653485527925, + "grad_norm": 6.833018779754639, + "learning_rate": 1.2578984916428863e-06, + "loss": 2.7854, + "step": 102975 + }, + { + "epoch": 6.996874575349912, + "grad_norm": 6.580789089202881, + "learning_rate": 1.2574738415545594e-06, + "loss": 2.9571, + "step": 102980 + }, + { + "epoch": 6.997214295420574, + "grad_norm": 8.053912162780762, + "learning_rate": 1.257049191466232e-06, + "loss": 2.8654, + "step": 102985 + }, + { + "epoch": 6.997554015491235, + "grad_norm": 6.748602390289307, + "learning_rate": 1.2566245413779045e-06, + "loss": 2.7214, + "step": 102990 + }, + { + "epoch": 6.997893735561897, + "grad_norm": 8.632611274719238, + "learning_rate": 1.2561998912895775e-06, + "loss": 2.6613, + "step": 102995 + }, + { + "epoch": 6.998233455632558, + "grad_norm": 10.808577537536621, + "learning_rate": 1.2557752412012504e-06, + "loss": 2.9264, + "step": 103000 + }, + { + "epoch": 6.99857317570322, + "grad_norm": 8.780824661254883, + "learning_rate": 1.255350591112923e-06, + "loss": 2.7033, + "step": 103005 + }, + { + "epoch": 6.998912895773882, + "grad_norm": 9.402179718017578, + "learning_rate": 1.254925941024596e-06, + "loss": 2.6259, + "step": 103010 + }, + { + "epoch": 6.999252615844544, + "grad_norm": 9.345235824584961, + "learning_rate": 1.2545012909362685e-06, + "loss": 2.8015, + "step": 103015 + }, + { + "epoch": 6.999592335915206, + "grad_norm": 7.0279951095581055, + "learning_rate": 1.2540766408479413e-06, + "loss": 2.8156, + "step": 103020 + }, + { + "epoch": 6.999932055985868, + "grad_norm": 8.878084182739258, + "learning_rate": 1.2536519907596141e-06, + "loss": 2.5372, + "step": 103025 + }, + { + "epoch": 7.0, + "eval_bertscore": { + "f1": 0.8417620439263251, + "precision": 0.8426981780611968, + "recall": 0.8416421878579305 + }, + "eval_bleu_4": 0.017746460291639673, + "eval_exact_match": 9.690861517588914e-05, + "eval_loss": 3.344142198562622, + "eval_meteor": 0.1020990541391059, + "eval_rouge": { + "rouge1": 0.1340443960098778, + "rouge2": 0.017595284160277398, + "rougeL": 0.11237285387288437, + "rougeLsum": 0.11238516065302384 + }, + "eval_runtime": 986.0066, + "eval_samples_per_second": 10.465, + "eval_steps_per_second": 1.308, + "step": 103026 + }, + { + "epoch": 7.00027177605653, + "grad_norm": 6.91963529586792, + "learning_rate": 1.253227340671287e-06, + "loss": 2.7765, + "step": 103030 + }, + { + "epoch": 7.000611496127191, + "grad_norm": 12.131243705749512, + "learning_rate": 1.2528026905829595e-06, + "loss": 2.7731, + "step": 103035 + }, + { + "epoch": 7.000951216197853, + "grad_norm": 9.270806312561035, + "learning_rate": 1.2523780404946325e-06, + "loss": 2.691, + "step": 103040 + }, + { + "epoch": 7.001290936268515, + "grad_norm": 8.38871955871582, + "learning_rate": 1.2519533904063053e-06, + "loss": 2.8307, + "step": 103045 + }, + { + "epoch": 7.001630656339176, + "grad_norm": 7.687541484832764, + "learning_rate": 1.2516136703356434e-06, + "loss": 2.7804, + "step": 103050 + }, + { + "epoch": 7.001970376409838, + "grad_norm": 7.637139320373535, + "learning_rate": 1.2511890202473162e-06, + "loss": 2.8712, + "step": 103055 + }, + { + "epoch": 7.0023100964805, + "grad_norm": 9.395818710327148, + "learning_rate": 1.2507643701589892e-06, + "loss": 2.7593, + "step": 103060 + }, + { + "epoch": 7.002649816551162, + "grad_norm": 8.894570350646973, + "learning_rate": 1.2503397200706618e-06, + "loss": 2.8704, + "step": 103065 + }, + { + "epoch": 7.002989536621824, + "grad_norm": 9.129762649536133, + "learning_rate": 1.2499150699823346e-06, + "loss": 2.5583, + "step": 103070 + }, + { + "epoch": 7.003329256692485, + "grad_norm": 9.412446022033691, + "learning_rate": 1.2494904198940074e-06, + "loss": 2.6075, + "step": 103075 + }, + { + "epoch": 7.003668976763147, + "grad_norm": 5.930469989776611, + "learning_rate": 1.2490657698056802e-06, + "loss": 2.3265, + "step": 103080 + }, + { + "epoch": 7.004008696833809, + "grad_norm": 7.0107855796813965, + "learning_rate": 1.248641119717353e-06, + "loss": 2.9279, + "step": 103085 + }, + { + "epoch": 7.00434841690447, + "grad_norm": 9.719318389892578, + "learning_rate": 1.2482164696290258e-06, + "loss": 2.6018, + "step": 103090 + }, + { + "epoch": 7.004688136975132, + "grad_norm": 6.594224452972412, + "learning_rate": 1.2477918195406984e-06, + "loss": 2.525, + "step": 103095 + }, + { + "epoch": 7.0050278570457944, + "grad_norm": 8.369888305664062, + "learning_rate": 1.2473671694523714e-06, + "loss": 2.8155, + "step": 103100 + }, + { + "epoch": 7.005367577116456, + "grad_norm": 7.485729217529297, + "learning_rate": 1.2469425193640442e-06, + "loss": 2.7348, + "step": 103105 + }, + { + "epoch": 7.005707297187118, + "grad_norm": 8.497403144836426, + "learning_rate": 1.246517869275717e-06, + "loss": 2.5127, + "step": 103110 + }, + { + "epoch": 7.00604701725778, + "grad_norm": 9.711892127990723, + "learning_rate": 1.2460932191873896e-06, + "loss": 2.8415, + "step": 103115 + }, + { + "epoch": 7.006386737328441, + "grad_norm": 10.679337501525879, + "learning_rate": 1.2456685690990624e-06, + "loss": 2.6735, + "step": 103120 + }, + { + "epoch": 7.006726457399103, + "grad_norm": 7.0917863845825195, + "learning_rate": 1.2452439190107352e-06, + "loss": 2.5406, + "step": 103125 + }, + { + "epoch": 7.007066177469765, + "grad_norm": 9.444332122802734, + "learning_rate": 1.244819268922408e-06, + "loss": 2.9098, + "step": 103130 + }, + { + "epoch": 7.007405897540426, + "grad_norm": 8.825024604797363, + "learning_rate": 1.2443946188340808e-06, + "loss": 2.6954, + "step": 103135 + }, + { + "epoch": 7.007745617611088, + "grad_norm": 7.796572685241699, + "learning_rate": 1.2439699687457536e-06, + "loss": 2.7263, + "step": 103140 + }, + { + "epoch": 7.0080853376817505, + "grad_norm": 10.402098655700684, + "learning_rate": 1.2435453186574264e-06, + "loss": 2.7533, + "step": 103145 + }, + { + "epoch": 7.008425057752412, + "grad_norm": 8.321833610534668, + "learning_rate": 1.2431206685690992e-06, + "loss": 2.706, + "step": 103150 + }, + { + "epoch": 7.008764777823074, + "grad_norm": 7.384513854980469, + "learning_rate": 1.242696018480772e-06, + "loss": 2.7206, + "step": 103155 + }, + { + "epoch": 7.009104497893736, + "grad_norm": 7.321744918823242, + "learning_rate": 1.2422713683924448e-06, + "loss": 2.7035, + "step": 103160 + }, + { + "epoch": 7.009444217964397, + "grad_norm": 5.827642440795898, + "learning_rate": 1.2418467183041174e-06, + "loss": 2.5514, + "step": 103165 + }, + { + "epoch": 7.009783938035059, + "grad_norm": 7.79904317855835, + "learning_rate": 1.2414220682157902e-06, + "loss": 2.7281, + "step": 103170 + }, + { + "epoch": 7.010123658105721, + "grad_norm": 7.058202743530273, + "learning_rate": 1.240997418127463e-06, + "loss": 2.435, + "step": 103175 + }, + { + "epoch": 7.010463378176382, + "grad_norm": 8.783384323120117, + "learning_rate": 1.2405727680391358e-06, + "loss": 2.9301, + "step": 103180 + }, + { + "epoch": 7.010803098247044, + "grad_norm": 8.43266487121582, + "learning_rate": 1.2401481179508086e-06, + "loss": 2.698, + "step": 103185 + }, + { + "epoch": 7.0111428183177065, + "grad_norm": 9.589219093322754, + "learning_rate": 1.2397234678624814e-06, + "loss": 2.794, + "step": 103190 + }, + { + "epoch": 7.011482538388368, + "grad_norm": 6.264298915863037, + "learning_rate": 1.2392988177741542e-06, + "loss": 2.6954, + "step": 103195 + }, + { + "epoch": 7.01182225845903, + "grad_norm": 7.286409378051758, + "learning_rate": 1.238874167685827e-06, + "loss": 2.8625, + "step": 103200 + }, + { + "epoch": 7.012161978529692, + "grad_norm": 6.6670451164245605, + "learning_rate": 1.2384495175974998e-06, + "loss": 2.6649, + "step": 103205 + }, + { + "epoch": 7.012501698600353, + "grad_norm": 6.0170745849609375, + "learning_rate": 1.2380248675091726e-06, + "loss": 2.639, + "step": 103210 + }, + { + "epoch": 7.012841418671015, + "grad_norm": 7.346533298492432, + "learning_rate": 1.2376002174208452e-06, + "loss": 2.7718, + "step": 103215 + }, + { + "epoch": 7.013181138741677, + "grad_norm": 7.395686149597168, + "learning_rate": 1.237175567332518e-06, + "loss": 2.5884, + "step": 103220 + }, + { + "epoch": 7.013520858812338, + "grad_norm": 5.858913898468018, + "learning_rate": 1.2367509172441908e-06, + "loss": 2.7782, + "step": 103225 + }, + { + "epoch": 7.013860578883, + "grad_norm": 8.256254196166992, + "learning_rate": 1.2363262671558638e-06, + "loss": 2.7375, + "step": 103230 + }, + { + "epoch": 7.0142002989536625, + "grad_norm": 7.9259724617004395, + "learning_rate": 1.2359016170675364e-06, + "loss": 2.7358, + "step": 103235 + }, + { + "epoch": 7.014540019024324, + "grad_norm": 7.010885238647461, + "learning_rate": 1.2354769669792092e-06, + "loss": 2.7658, + "step": 103240 + }, + { + "epoch": 7.014879739094986, + "grad_norm": 8.663174629211426, + "learning_rate": 1.235052316890882e-06, + "loss": 2.7253, + "step": 103245 + }, + { + "epoch": 7.015219459165648, + "grad_norm": 7.737855434417725, + "learning_rate": 1.2346276668025548e-06, + "loss": 2.6818, + "step": 103250 + }, + { + "epoch": 7.015559179236309, + "grad_norm": 8.845963478088379, + "learning_rate": 1.2342030167142276e-06, + "loss": 2.7563, + "step": 103255 + }, + { + "epoch": 7.015898899306971, + "grad_norm": 6.966759204864502, + "learning_rate": 1.2337783666259004e-06, + "loss": 2.8937, + "step": 103260 + }, + { + "epoch": 7.016238619377633, + "grad_norm": 7.552066326141357, + "learning_rate": 1.233353716537573e-06, + "loss": 2.7277, + "step": 103265 + }, + { + "epoch": 7.016578339448294, + "grad_norm": 7.752874851226807, + "learning_rate": 1.2329290664492458e-06, + "loss": 2.5921, + "step": 103270 + }, + { + "epoch": 7.016918059518956, + "grad_norm": 8.531498908996582, + "learning_rate": 1.2325044163609188e-06, + "loss": 2.6303, + "step": 103275 + }, + { + "epoch": 7.0172577795896185, + "grad_norm": 10.012505531311035, + "learning_rate": 1.2320797662725916e-06, + "loss": 2.6698, + "step": 103280 + }, + { + "epoch": 7.01759749966028, + "grad_norm": 8.594461441040039, + "learning_rate": 1.2316551161842642e-06, + "loss": 2.6522, + "step": 103285 + }, + { + "epoch": 7.017937219730942, + "grad_norm": 8.713818550109863, + "learning_rate": 1.231230466095937e-06, + "loss": 2.7782, + "step": 103290 + }, + { + "epoch": 7.018276939801604, + "grad_norm": 6.12217903137207, + "learning_rate": 1.2308058160076098e-06, + "loss": 2.4042, + "step": 103295 + }, + { + "epoch": 7.018616659872265, + "grad_norm": 7.085483074188232, + "learning_rate": 1.2303811659192826e-06, + "loss": 2.5597, + "step": 103300 + }, + { + "epoch": 7.018956379942927, + "grad_norm": 8.387343406677246, + "learning_rate": 1.2299565158309554e-06, + "loss": 2.9033, + "step": 103305 + }, + { + "epoch": 7.019296100013589, + "grad_norm": 9.760148048400879, + "learning_rate": 1.2295318657426282e-06, + "loss": 2.4678, + "step": 103310 + }, + { + "epoch": 7.01963582008425, + "grad_norm": 6.193734645843506, + "learning_rate": 1.229107215654301e-06, + "loss": 2.3461, + "step": 103315 + }, + { + "epoch": 7.019975540154912, + "grad_norm": 9.606021881103516, + "learning_rate": 1.2286825655659738e-06, + "loss": 2.5618, + "step": 103320 + }, + { + "epoch": 7.0203152602255745, + "grad_norm": 7.35476541519165, + "learning_rate": 1.2282579154776466e-06, + "loss": 2.9569, + "step": 103325 + }, + { + "epoch": 7.020654980296236, + "grad_norm": 9.206866264343262, + "learning_rate": 1.2278332653893194e-06, + "loss": 2.9935, + "step": 103330 + }, + { + "epoch": 7.020994700366898, + "grad_norm": 9.672351837158203, + "learning_rate": 1.227408615300992e-06, + "loss": 2.6703, + "step": 103335 + }, + { + "epoch": 7.02133442043756, + "grad_norm": 7.895009517669678, + "learning_rate": 1.2269839652126648e-06, + "loss": 2.6817, + "step": 103340 + }, + { + "epoch": 7.021674140508221, + "grad_norm": 8.235981941223145, + "learning_rate": 1.2265593151243376e-06, + "loss": 2.8405, + "step": 103345 + }, + { + "epoch": 7.022013860578883, + "grad_norm": 8.711468696594238, + "learning_rate": 1.2261346650360104e-06, + "loss": 2.7631, + "step": 103350 + }, + { + "epoch": 7.022353580649545, + "grad_norm": 7.92756462097168, + "learning_rate": 1.2257100149476832e-06, + "loss": 2.6437, + "step": 103355 + }, + { + "epoch": 7.022693300720206, + "grad_norm": 7.2631449699401855, + "learning_rate": 1.225285364859356e-06, + "loss": 2.7669, + "step": 103360 + }, + { + "epoch": 7.023033020790868, + "grad_norm": 7.60557222366333, + "learning_rate": 1.2248607147710288e-06, + "loss": 2.6089, + "step": 103365 + }, + { + "epoch": 7.0233727408615305, + "grad_norm": 7.202353000640869, + "learning_rate": 1.2244360646827016e-06, + "loss": 2.7213, + "step": 103370 + }, + { + "epoch": 7.023712460932192, + "grad_norm": 7.422977447509766, + "learning_rate": 1.2240114145943744e-06, + "loss": 2.4358, + "step": 103375 + }, + { + "epoch": 7.024052181002854, + "grad_norm": 7.569727420806885, + "learning_rate": 1.2235867645060472e-06, + "loss": 2.6793, + "step": 103380 + }, + { + "epoch": 7.024391901073516, + "grad_norm": 8.765396118164062, + "learning_rate": 1.2231621144177198e-06, + "loss": 2.9621, + "step": 103385 + }, + { + "epoch": 7.024731621144177, + "grad_norm": 6.442164421081543, + "learning_rate": 1.2227374643293926e-06, + "loss": 2.6551, + "step": 103390 + }, + { + "epoch": 7.025071341214839, + "grad_norm": 7.156656265258789, + "learning_rate": 1.2223128142410654e-06, + "loss": 2.5857, + "step": 103395 + }, + { + "epoch": 7.025411061285501, + "grad_norm": 9.129670143127441, + "learning_rate": 1.2218881641527384e-06, + "loss": 2.7583, + "step": 103400 + }, + { + "epoch": 7.025750781356162, + "grad_norm": 6.27313756942749, + "learning_rate": 1.221463514064411e-06, + "loss": 2.7316, + "step": 103405 + }, + { + "epoch": 7.0260905014268245, + "grad_norm": 8.873566627502441, + "learning_rate": 1.2210388639760838e-06, + "loss": 2.7295, + "step": 103410 + }, + { + "epoch": 7.026430221497486, + "grad_norm": 8.76534652709961, + "learning_rate": 1.2206142138877566e-06, + "loss": 2.6848, + "step": 103415 + }, + { + "epoch": 7.026769941568148, + "grad_norm": 8.062005043029785, + "learning_rate": 1.2201895637994294e-06, + "loss": 2.8222, + "step": 103420 + }, + { + "epoch": 7.02710966163881, + "grad_norm": 7.454228401184082, + "learning_rate": 1.2197649137111022e-06, + "loss": 2.6594, + "step": 103425 + }, + { + "epoch": 7.027449381709471, + "grad_norm": 8.895292282104492, + "learning_rate": 1.219340263622775e-06, + "loss": 2.6457, + "step": 103430 + }, + { + "epoch": 7.027789101780133, + "grad_norm": 8.46064281463623, + "learning_rate": 1.2189156135344476e-06, + "loss": 2.7771, + "step": 103435 + }, + { + "epoch": 7.028128821850795, + "grad_norm": 8.648619651794434, + "learning_rate": 1.2184909634461204e-06, + "loss": 2.6396, + "step": 103440 + }, + { + "epoch": 7.028468541921456, + "grad_norm": 7.208324909210205, + "learning_rate": 1.2180663133577932e-06, + "loss": 2.8461, + "step": 103445 + }, + { + "epoch": 7.028808261992118, + "grad_norm": 8.329790115356445, + "learning_rate": 1.2176416632694662e-06, + "loss": 2.6941, + "step": 103450 + }, + { + "epoch": 7.0291479820627805, + "grad_norm": 8.523114204406738, + "learning_rate": 1.2172170131811388e-06, + "loss": 2.5599, + "step": 103455 + }, + { + "epoch": 7.029487702133442, + "grad_norm": 8.191163063049316, + "learning_rate": 1.2167923630928116e-06, + "loss": 2.7515, + "step": 103460 + }, + { + "epoch": 7.029827422204104, + "grad_norm": 9.453951835632324, + "learning_rate": 1.2163677130044844e-06, + "loss": 2.8624, + "step": 103465 + }, + { + "epoch": 7.030167142274766, + "grad_norm": 6.46652364730835, + "learning_rate": 1.2159430629161572e-06, + "loss": 2.8147, + "step": 103470 + }, + { + "epoch": 7.030506862345427, + "grad_norm": 6.464824199676514, + "learning_rate": 1.21551841282783e-06, + "loss": 2.4967, + "step": 103475 + }, + { + "epoch": 7.030846582416089, + "grad_norm": 9.505082130432129, + "learning_rate": 1.2150937627395028e-06, + "loss": 2.6302, + "step": 103480 + }, + { + "epoch": 7.031186302486751, + "grad_norm": 9.81102466583252, + "learning_rate": 1.2146691126511756e-06, + "loss": 2.7276, + "step": 103485 + }, + { + "epoch": 7.031526022557412, + "grad_norm": 9.718607902526855, + "learning_rate": 1.2142444625628482e-06, + "loss": 2.7122, + "step": 103490 + }, + { + "epoch": 7.031865742628074, + "grad_norm": 8.245657920837402, + "learning_rate": 1.2138198124745212e-06, + "loss": 2.6953, + "step": 103495 + }, + { + "epoch": 7.0322054626987365, + "grad_norm": 9.258429527282715, + "learning_rate": 1.213395162386194e-06, + "loss": 2.8614, + "step": 103500 + }, + { + "epoch": 7.032545182769398, + "grad_norm": 6.791820049285889, + "learning_rate": 1.2129705122978666e-06, + "loss": 2.5651, + "step": 103505 + }, + { + "epoch": 7.03288490284006, + "grad_norm": 6.800486087799072, + "learning_rate": 1.2125458622095394e-06, + "loss": 2.7706, + "step": 103510 + }, + { + "epoch": 7.033224622910722, + "grad_norm": 5.771674633026123, + "learning_rate": 1.2121212121212122e-06, + "loss": 2.7748, + "step": 103515 + }, + { + "epoch": 7.033564342981383, + "grad_norm": 7.653301239013672, + "learning_rate": 1.211696562032885e-06, + "loss": 2.5641, + "step": 103520 + }, + { + "epoch": 7.033904063052045, + "grad_norm": 7.365122318267822, + "learning_rate": 1.2112719119445578e-06, + "loss": 2.6299, + "step": 103525 + }, + { + "epoch": 7.034243783122707, + "grad_norm": 7.42977237701416, + "learning_rate": 1.2108472618562306e-06, + "loss": 2.8649, + "step": 103530 + }, + { + "epoch": 7.034583503193368, + "grad_norm": 7.1612935066223145, + "learning_rate": 1.2104226117679034e-06, + "loss": 2.759, + "step": 103535 + }, + { + "epoch": 7.03492322326403, + "grad_norm": 7.811254501342773, + "learning_rate": 1.209997961679576e-06, + "loss": 2.8437, + "step": 103540 + }, + { + "epoch": 7.0352629433346925, + "grad_norm": 8.811553955078125, + "learning_rate": 1.209573311591249e-06, + "loss": 2.5698, + "step": 103545 + }, + { + "epoch": 7.035602663405354, + "grad_norm": 6.808506011962891, + "learning_rate": 1.2091486615029218e-06, + "loss": 2.5203, + "step": 103550 + }, + { + "epoch": 7.035942383476016, + "grad_norm": 6.502536296844482, + "learning_rate": 1.2087240114145944e-06, + "loss": 2.93, + "step": 103555 + }, + { + "epoch": 7.036282103546678, + "grad_norm": 7.3601884841918945, + "learning_rate": 1.2082993613262672e-06, + "loss": 2.9374, + "step": 103560 + }, + { + "epoch": 7.036621823617339, + "grad_norm": 6.968810558319092, + "learning_rate": 1.20787471123794e-06, + "loss": 2.8892, + "step": 103565 + }, + { + "epoch": 7.036961543688001, + "grad_norm": 8.042439460754395, + "learning_rate": 1.2074500611496128e-06, + "loss": 2.7503, + "step": 103570 + }, + { + "epoch": 7.037301263758663, + "grad_norm": 6.452536582946777, + "learning_rate": 1.2070254110612856e-06, + "loss": 2.7476, + "step": 103575 + }, + { + "epoch": 7.037640983829324, + "grad_norm": 7.686956882476807, + "learning_rate": 1.2066007609729584e-06, + "loss": 2.5863, + "step": 103580 + }, + { + "epoch": 7.037980703899986, + "grad_norm": 6.733666896820068, + "learning_rate": 1.2061761108846312e-06, + "loss": 2.7394, + "step": 103585 + }, + { + "epoch": 7.0383204239706485, + "grad_norm": 7.621760845184326, + "learning_rate": 1.205751460796304e-06, + "loss": 2.5812, + "step": 103590 + }, + { + "epoch": 7.03866014404131, + "grad_norm": 7.663336277008057, + "learning_rate": 1.2053268107079768e-06, + "loss": 2.8726, + "step": 103595 + }, + { + "epoch": 7.038999864111972, + "grad_norm": 10.064101219177246, + "learning_rate": 1.2049021606196496e-06, + "loss": 2.6325, + "step": 103600 + }, + { + "epoch": 7.039339584182634, + "grad_norm": 6.736776828765869, + "learning_rate": 1.2044775105313222e-06, + "loss": 2.891, + "step": 103605 + }, + { + "epoch": 7.039679304253295, + "grad_norm": 8.487974166870117, + "learning_rate": 1.204052860442995e-06, + "loss": 3.015, + "step": 103610 + }, + { + "epoch": 7.040019024323957, + "grad_norm": 8.583741188049316, + "learning_rate": 1.2036282103546678e-06, + "loss": 2.8929, + "step": 103615 + }, + { + "epoch": 7.040358744394619, + "grad_norm": 7.82539176940918, + "learning_rate": 1.2032035602663406e-06, + "loss": 2.7948, + "step": 103620 + }, + { + "epoch": 7.04069846446528, + "grad_norm": 7.633459091186523, + "learning_rate": 1.2027789101780134e-06, + "loss": 2.7887, + "step": 103625 + }, + { + "epoch": 7.041038184535942, + "grad_norm": 8.401410102844238, + "learning_rate": 1.2023542600896862e-06, + "loss": 2.6705, + "step": 103630 + }, + { + "epoch": 7.0413779046066045, + "grad_norm": 5.794362545013428, + "learning_rate": 1.201929610001359e-06, + "loss": 2.6816, + "step": 103635 + }, + { + "epoch": 7.041717624677266, + "grad_norm": 8.466882705688477, + "learning_rate": 1.2015049599130318e-06, + "loss": 2.659, + "step": 103640 + }, + { + "epoch": 7.042057344747928, + "grad_norm": 7.7287492752075195, + "learning_rate": 1.2010803098247046e-06, + "loss": 2.7866, + "step": 103645 + }, + { + "epoch": 7.04239706481859, + "grad_norm": 11.13013744354248, + "learning_rate": 1.2006556597363774e-06, + "loss": 2.9798, + "step": 103650 + }, + { + "epoch": 7.042736784889251, + "grad_norm": 10.269341468811035, + "learning_rate": 1.2002310096480502e-06, + "loss": 3.0534, + "step": 103655 + }, + { + "epoch": 7.043076504959913, + "grad_norm": 8.749787330627441, + "learning_rate": 1.1998063595597228e-06, + "loss": 2.7962, + "step": 103660 + }, + { + "epoch": 7.043416225030575, + "grad_norm": 7.5270209312438965, + "learning_rate": 1.1993817094713956e-06, + "loss": 2.7467, + "step": 103665 + }, + { + "epoch": 7.043755945101236, + "grad_norm": 7.6988205909729, + "learning_rate": 1.1989570593830686e-06, + "loss": 2.9432, + "step": 103670 + }, + { + "epoch": 7.0440956651718984, + "grad_norm": 10.169244766235352, + "learning_rate": 1.1985324092947412e-06, + "loss": 2.8482, + "step": 103675 + }, + { + "epoch": 7.0444353852425605, + "grad_norm": 8.36737060546875, + "learning_rate": 1.198107759206414e-06, + "loss": 2.7076, + "step": 103680 + }, + { + "epoch": 7.044775105313222, + "grad_norm": 7.98732852935791, + "learning_rate": 1.1976831091180868e-06, + "loss": 2.9141, + "step": 103685 + }, + { + "epoch": 7.045114825383884, + "grad_norm": 9.010825157165527, + "learning_rate": 1.1972584590297596e-06, + "loss": 2.411, + "step": 103690 + }, + { + "epoch": 7.045454545454546, + "grad_norm": 6.853374004364014, + "learning_rate": 1.1968338089414324e-06, + "loss": 2.5627, + "step": 103695 + }, + { + "epoch": 7.045794265525207, + "grad_norm": 7.226500034332275, + "learning_rate": 1.1964091588531052e-06, + "loss": 2.664, + "step": 103700 + }, + { + "epoch": 7.046133985595869, + "grad_norm": 9.407818794250488, + "learning_rate": 1.195984508764778e-06, + "loss": 2.8057, + "step": 103705 + }, + { + "epoch": 7.046473705666531, + "grad_norm": 8.649452209472656, + "learning_rate": 1.1955598586764506e-06, + "loss": 2.925, + "step": 103710 + }, + { + "epoch": 7.046813425737192, + "grad_norm": 8.316137313842773, + "learning_rate": 1.1951352085881236e-06, + "loss": 2.5781, + "step": 103715 + }, + { + "epoch": 7.0471531458078545, + "grad_norm": 7.312090873718262, + "learning_rate": 1.1947105584997964e-06, + "loss": 2.5785, + "step": 103720 + }, + { + "epoch": 7.0474928658785165, + "grad_norm": 8.011280059814453, + "learning_rate": 1.194285908411469e-06, + "loss": 3.0135, + "step": 103725 + }, + { + "epoch": 7.047832585949178, + "grad_norm": 7.739573955535889, + "learning_rate": 1.1938612583231418e-06, + "loss": 2.7102, + "step": 103730 + }, + { + "epoch": 7.04817230601984, + "grad_norm": 7.807209491729736, + "learning_rate": 1.1934366082348146e-06, + "loss": 2.664, + "step": 103735 + }, + { + "epoch": 7.048512026090501, + "grad_norm": 8.286297798156738, + "learning_rate": 1.1930119581464874e-06, + "loss": 2.929, + "step": 103740 + }, + { + "epoch": 7.048851746161163, + "grad_norm": 7.490187168121338, + "learning_rate": 1.1925873080581602e-06, + "loss": 2.858, + "step": 103745 + }, + { + "epoch": 7.049191466231825, + "grad_norm": 7.206377029418945, + "learning_rate": 1.192162657969833e-06, + "loss": 2.597, + "step": 103750 + }, + { + "epoch": 7.049531186302486, + "grad_norm": 7.93928861618042, + "learning_rate": 1.1917380078815058e-06, + "loss": 2.7373, + "step": 103755 + }, + { + "epoch": 7.049870906373148, + "grad_norm": 10.085618019104004, + "learning_rate": 1.1913133577931784e-06, + "loss": 2.6264, + "step": 103760 + }, + { + "epoch": 7.0502106264438105, + "grad_norm": 10.485694885253906, + "learning_rate": 1.1908887077048514e-06, + "loss": 2.42, + "step": 103765 + }, + { + "epoch": 7.050550346514472, + "grad_norm": 8.171640396118164, + "learning_rate": 1.1904640576165242e-06, + "loss": 2.8093, + "step": 103770 + }, + { + "epoch": 7.050890066585134, + "grad_norm": 6.37685489654541, + "learning_rate": 1.1900394075281968e-06, + "loss": 2.5877, + "step": 103775 + }, + { + "epoch": 7.051229786655796, + "grad_norm": 7.7281494140625, + "learning_rate": 1.1896147574398696e-06, + "loss": 2.5887, + "step": 103780 + }, + { + "epoch": 7.051569506726457, + "grad_norm": 7.366497993469238, + "learning_rate": 1.1891901073515424e-06, + "loss": 2.879, + "step": 103785 + }, + { + "epoch": 7.051909226797119, + "grad_norm": 7.509390354156494, + "learning_rate": 1.1887654572632152e-06, + "loss": 2.7129, + "step": 103790 + }, + { + "epoch": 7.052248946867781, + "grad_norm": 6.518222808837891, + "learning_rate": 1.188340807174888e-06, + "loss": 2.7325, + "step": 103795 + }, + { + "epoch": 7.052588666938442, + "grad_norm": 7.751368999481201, + "learning_rate": 1.1879161570865608e-06, + "loss": 2.5956, + "step": 103800 + }, + { + "epoch": 7.052928387009104, + "grad_norm": 6.174520969390869, + "learning_rate": 1.1874915069982336e-06, + "loss": 2.4941, + "step": 103805 + }, + { + "epoch": 7.0532681070797665, + "grad_norm": 6.408194065093994, + "learning_rate": 1.1870668569099064e-06, + "loss": 2.5422, + "step": 103810 + }, + { + "epoch": 7.053607827150428, + "grad_norm": 7.77686071395874, + "learning_rate": 1.1866422068215792e-06, + "loss": 2.8118, + "step": 103815 + }, + { + "epoch": 7.05394754722109, + "grad_norm": 6.966019153594971, + "learning_rate": 1.186217556733252e-06, + "loss": 2.6317, + "step": 103820 + }, + { + "epoch": 7.054287267291752, + "grad_norm": 8.481375694274902, + "learning_rate": 1.1857929066449248e-06, + "loss": 2.5391, + "step": 103825 + }, + { + "epoch": 7.054626987362413, + "grad_norm": 8.055741310119629, + "learning_rate": 1.1853682565565974e-06, + "loss": 2.7209, + "step": 103830 + }, + { + "epoch": 7.054966707433075, + "grad_norm": 10.557083129882812, + "learning_rate": 1.1849436064682702e-06, + "loss": 2.7972, + "step": 103835 + }, + { + "epoch": 7.055306427503737, + "grad_norm": 5.856865882873535, + "learning_rate": 1.184518956379943e-06, + "loss": 2.4198, + "step": 103840 + }, + { + "epoch": 7.055646147574398, + "grad_norm": 6.7474446296691895, + "learning_rate": 1.1840943062916158e-06, + "loss": 2.7057, + "step": 103845 + }, + { + "epoch": 7.05598586764506, + "grad_norm": 7.959518909454346, + "learning_rate": 1.1836696562032886e-06, + "loss": 2.7435, + "step": 103850 + }, + { + "epoch": 7.0563255877157225, + "grad_norm": 7.413493633270264, + "learning_rate": 1.1832450061149614e-06, + "loss": 2.7428, + "step": 103855 + }, + { + "epoch": 7.056665307786384, + "grad_norm": 9.175695419311523, + "learning_rate": 1.1828203560266342e-06, + "loss": 2.8199, + "step": 103860 + }, + { + "epoch": 7.057005027857046, + "grad_norm": 6.593232154846191, + "learning_rate": 1.182395705938307e-06, + "loss": 2.5971, + "step": 103865 + }, + { + "epoch": 7.057344747927708, + "grad_norm": 9.345986366271973, + "learning_rate": 1.1819710558499798e-06, + "loss": 2.7963, + "step": 103870 + }, + { + "epoch": 7.057684467998369, + "grad_norm": 10.543851852416992, + "learning_rate": 1.1815464057616526e-06, + "loss": 2.6635, + "step": 103875 + }, + { + "epoch": 7.058024188069031, + "grad_norm": 9.292976379394531, + "learning_rate": 1.1811217556733252e-06, + "loss": 2.2363, + "step": 103880 + }, + { + "epoch": 7.058363908139693, + "grad_norm": 6.542015075683594, + "learning_rate": 1.180697105584998e-06, + "loss": 2.6731, + "step": 103885 + }, + { + "epoch": 7.058703628210354, + "grad_norm": 7.828426361083984, + "learning_rate": 1.180272455496671e-06, + "loss": 2.9379, + "step": 103890 + }, + { + "epoch": 7.059043348281016, + "grad_norm": 6.138631343841553, + "learning_rate": 1.1798478054083436e-06, + "loss": 2.6627, + "step": 103895 + }, + { + "epoch": 7.0593830683516785, + "grad_norm": 8.490220069885254, + "learning_rate": 1.1794231553200164e-06, + "loss": 2.7684, + "step": 103900 + }, + { + "epoch": 7.05972278842234, + "grad_norm": 7.876797199249268, + "learning_rate": 1.1789985052316892e-06, + "loss": 2.9749, + "step": 103905 + }, + { + "epoch": 7.060062508493002, + "grad_norm": 7.242765426635742, + "learning_rate": 1.178573855143362e-06, + "loss": 2.7838, + "step": 103910 + }, + { + "epoch": 7.060402228563664, + "grad_norm": 7.961067199707031, + "learning_rate": 1.1781492050550348e-06, + "loss": 2.5527, + "step": 103915 + }, + { + "epoch": 7.060741948634325, + "grad_norm": 10.508158683776855, + "learning_rate": 1.1777245549667076e-06, + "loss": 2.5245, + "step": 103920 + }, + { + "epoch": 7.061081668704987, + "grad_norm": 8.183365821838379, + "learning_rate": 1.1772999048783804e-06, + "loss": 2.7505, + "step": 103925 + }, + { + "epoch": 7.061421388775649, + "grad_norm": 10.277576446533203, + "learning_rate": 1.176875254790053e-06, + "loss": 2.7187, + "step": 103930 + }, + { + "epoch": 7.06176110884631, + "grad_norm": 6.79478645324707, + "learning_rate": 1.1764506047017258e-06, + "loss": 2.8762, + "step": 103935 + }, + { + "epoch": 7.062100828916972, + "grad_norm": 9.11600399017334, + "learning_rate": 1.1760259546133988e-06, + "loss": 2.6865, + "step": 103940 + }, + { + "epoch": 7.0624405489876345, + "grad_norm": 7.066452980041504, + "learning_rate": 1.1756013045250714e-06, + "loss": 2.9471, + "step": 103945 + }, + { + "epoch": 7.062780269058296, + "grad_norm": 8.557899475097656, + "learning_rate": 1.1751766544367442e-06, + "loss": 2.6143, + "step": 103950 + }, + { + "epoch": 7.063119989128958, + "grad_norm": 6.840616703033447, + "learning_rate": 1.174752004348417e-06, + "loss": 2.8153, + "step": 103955 + }, + { + "epoch": 7.06345970919962, + "grad_norm": 7.305384159088135, + "learning_rate": 1.1743273542600898e-06, + "loss": 2.7583, + "step": 103960 + }, + { + "epoch": 7.063799429270281, + "grad_norm": 8.130250930786133, + "learning_rate": 1.1739027041717626e-06, + "loss": 2.878, + "step": 103965 + }, + { + "epoch": 7.064139149340943, + "grad_norm": 8.957350730895996, + "learning_rate": 1.1734780540834354e-06, + "loss": 2.8058, + "step": 103970 + }, + { + "epoch": 7.064478869411605, + "grad_norm": 6.125240802764893, + "learning_rate": 1.1730534039951082e-06, + "loss": 2.8726, + "step": 103975 + }, + { + "epoch": 7.064818589482266, + "grad_norm": 8.652688980102539, + "learning_rate": 1.1726287539067808e-06, + "loss": 2.7938, + "step": 103980 + }, + { + "epoch": 7.0651583095529285, + "grad_norm": 8.055071830749512, + "learning_rate": 1.1722041038184538e-06, + "loss": 2.5575, + "step": 103985 + }, + { + "epoch": 7.0654980296235905, + "grad_norm": 8.220398902893066, + "learning_rate": 1.1717794537301266e-06, + "loss": 2.4822, + "step": 103990 + }, + { + "epoch": 7.065837749694252, + "grad_norm": 8.895602226257324, + "learning_rate": 1.1713548036417994e-06, + "loss": 2.7752, + "step": 103995 + }, + { + "epoch": 7.066177469764914, + "grad_norm": 9.77940559387207, + "learning_rate": 1.170930153553472e-06, + "loss": 2.8467, + "step": 104000 + }, + { + "epoch": 7.066517189835576, + "grad_norm": 6.611229419708252, + "learning_rate": 1.1705055034651448e-06, + "loss": 2.659, + "step": 104005 + }, + { + "epoch": 7.066856909906237, + "grad_norm": 8.373123168945312, + "learning_rate": 1.1700808533768176e-06, + "loss": 2.7245, + "step": 104010 + }, + { + "epoch": 7.067196629976899, + "grad_norm": 8.738224983215332, + "learning_rate": 1.1696562032884904e-06, + "loss": 2.6514, + "step": 104015 + }, + { + "epoch": 7.067536350047561, + "grad_norm": 11.058295249938965, + "learning_rate": 1.1692315532001632e-06, + "loss": 2.9395, + "step": 104020 + }, + { + "epoch": 7.067876070118222, + "grad_norm": 8.741022109985352, + "learning_rate": 1.168806903111836e-06, + "loss": 2.6452, + "step": 104025 + }, + { + "epoch": 7.0682157901888845, + "grad_norm": 6.35155725479126, + "learning_rate": 1.1683822530235086e-06, + "loss": 2.754, + "step": 104030 + }, + { + "epoch": 7.0685555102595465, + "grad_norm": 6.793537616729736, + "learning_rate": 1.1679576029351816e-06, + "loss": 2.8313, + "step": 104035 + }, + { + "epoch": 7.068895230330208, + "grad_norm": 7.259872913360596, + "learning_rate": 1.1675329528468544e-06, + "loss": 2.8407, + "step": 104040 + }, + { + "epoch": 7.06923495040087, + "grad_norm": 6.674647808074951, + "learning_rate": 1.1671083027585272e-06, + "loss": 2.7424, + "step": 104045 + }, + { + "epoch": 7.069574670471532, + "grad_norm": 6.793631553649902, + "learning_rate": 1.1666836526701998e-06, + "loss": 2.7891, + "step": 104050 + }, + { + "epoch": 7.069914390542193, + "grad_norm": 9.869624137878418, + "learning_rate": 1.1662590025818726e-06, + "loss": 2.53, + "step": 104055 + }, + { + "epoch": 7.070254110612855, + "grad_norm": 6.943650722503662, + "learning_rate": 1.1658343524935454e-06, + "loss": 2.9153, + "step": 104060 + }, + { + "epoch": 7.070593830683517, + "grad_norm": 6.241462230682373, + "learning_rate": 1.1654097024052182e-06, + "loss": 2.479, + "step": 104065 + }, + { + "epoch": 7.070933550754178, + "grad_norm": 8.640606880187988, + "learning_rate": 1.164985052316891e-06, + "loss": 2.6958, + "step": 104070 + }, + { + "epoch": 7.0712732708248405, + "grad_norm": 5.657482624053955, + "learning_rate": 1.1645604022285638e-06, + "loss": 2.5966, + "step": 104075 + }, + { + "epoch": 7.0716129908955025, + "grad_norm": 10.282247543334961, + "learning_rate": 1.1641357521402366e-06, + "loss": 2.569, + "step": 104080 + }, + { + "epoch": 7.071952710966164, + "grad_norm": 8.088379859924316, + "learning_rate": 1.1637111020519094e-06, + "loss": 2.7728, + "step": 104085 + }, + { + "epoch": 7.072292431036826, + "grad_norm": 9.271677017211914, + "learning_rate": 1.1632864519635822e-06, + "loss": 2.8216, + "step": 104090 + }, + { + "epoch": 7.072632151107487, + "grad_norm": 8.232909202575684, + "learning_rate": 1.162861801875255e-06, + "loss": 2.7525, + "step": 104095 + }, + { + "epoch": 7.072971871178149, + "grad_norm": 10.294151306152344, + "learning_rate": 1.1624371517869276e-06, + "loss": 2.6053, + "step": 104100 + }, + { + "epoch": 7.073311591248811, + "grad_norm": 7.99905252456665, + "learning_rate": 1.1620125016986004e-06, + "loss": 2.7707, + "step": 104105 + }, + { + "epoch": 7.073651311319472, + "grad_norm": 8.352002143859863, + "learning_rate": 1.1615878516102734e-06, + "loss": 2.5724, + "step": 104110 + }, + { + "epoch": 7.073991031390134, + "grad_norm": 8.083425521850586, + "learning_rate": 1.161163201521946e-06, + "loss": 2.8736, + "step": 104115 + }, + { + "epoch": 7.0743307514607965, + "grad_norm": 7.641461372375488, + "learning_rate": 1.1607385514336188e-06, + "loss": 2.8069, + "step": 104120 + }, + { + "epoch": 7.074670471531458, + "grad_norm": 9.929224014282227, + "learning_rate": 1.1603139013452916e-06, + "loss": 2.6001, + "step": 104125 + }, + { + "epoch": 7.07501019160212, + "grad_norm": 7.615929126739502, + "learning_rate": 1.1598892512569644e-06, + "loss": 2.7937, + "step": 104130 + }, + { + "epoch": 7.075349911672782, + "grad_norm": 8.09175968170166, + "learning_rate": 1.1594646011686372e-06, + "loss": 2.8789, + "step": 104135 + }, + { + "epoch": 7.075689631743443, + "grad_norm": 5.880781173706055, + "learning_rate": 1.15903995108031e-06, + "loss": 2.6378, + "step": 104140 + }, + { + "epoch": 7.076029351814105, + "grad_norm": 6.511928081512451, + "learning_rate": 1.1586153009919828e-06, + "loss": 2.8107, + "step": 104145 + }, + { + "epoch": 7.076369071884767, + "grad_norm": 7.647661209106445, + "learning_rate": 1.1581906509036554e-06, + "loss": 2.7974, + "step": 104150 + }, + { + "epoch": 7.076708791955428, + "grad_norm": 7.817718982696533, + "learning_rate": 1.1577660008153282e-06, + "loss": 2.7891, + "step": 104155 + }, + { + "epoch": 7.07704851202609, + "grad_norm": 9.319141387939453, + "learning_rate": 1.1573413507270012e-06, + "loss": 2.8326, + "step": 104160 + }, + { + "epoch": 7.0773882320967525, + "grad_norm": 7.9052348136901855, + "learning_rate": 1.156916700638674e-06, + "loss": 2.541, + "step": 104165 + }, + { + "epoch": 7.077727952167414, + "grad_norm": 8.811529159545898, + "learning_rate": 1.1564920505503466e-06, + "loss": 2.5331, + "step": 104170 + }, + { + "epoch": 7.078067672238076, + "grad_norm": 10.341818809509277, + "learning_rate": 1.1560674004620194e-06, + "loss": 2.762, + "step": 104175 + }, + { + "epoch": 7.078407392308738, + "grad_norm": 6.77129602432251, + "learning_rate": 1.1556427503736922e-06, + "loss": 2.6838, + "step": 104180 + }, + { + "epoch": 7.078747112379399, + "grad_norm": 8.23361587524414, + "learning_rate": 1.155218100285365e-06, + "loss": 2.7759, + "step": 104185 + }, + { + "epoch": 7.079086832450061, + "grad_norm": 8.556734085083008, + "learning_rate": 1.1547934501970378e-06, + "loss": 2.6806, + "step": 104190 + }, + { + "epoch": 7.079426552520723, + "grad_norm": 8.921568870544434, + "learning_rate": 1.1543688001087106e-06, + "loss": 2.7985, + "step": 104195 + }, + { + "epoch": 7.079766272591384, + "grad_norm": 7.472016334533691, + "learning_rate": 1.1539441500203831e-06, + "loss": 2.7538, + "step": 104200 + }, + { + "epoch": 7.080105992662046, + "grad_norm": 5.816795825958252, + "learning_rate": 1.1535194999320562e-06, + "loss": 2.5994, + "step": 104205 + }, + { + "epoch": 7.0804457127327085, + "grad_norm": 9.124732971191406, + "learning_rate": 1.153094849843729e-06, + "loss": 2.7886, + "step": 104210 + }, + { + "epoch": 7.08078543280337, + "grad_norm": 8.581677436828613, + "learning_rate": 1.1526701997554018e-06, + "loss": 2.8225, + "step": 104215 + }, + { + "epoch": 7.081125152874032, + "grad_norm": 7.869369983673096, + "learning_rate": 1.1522455496670744e-06, + "loss": 2.7251, + "step": 104220 + }, + { + "epoch": 7.081464872944694, + "grad_norm": 6.7740864753723145, + "learning_rate": 1.1518208995787472e-06, + "loss": 2.5465, + "step": 104225 + }, + { + "epoch": 7.081804593015355, + "grad_norm": 10.1924467086792, + "learning_rate": 1.15139624949042e-06, + "loss": 2.4958, + "step": 104230 + }, + { + "epoch": 7.082144313086017, + "grad_norm": 9.766861915588379, + "learning_rate": 1.1509715994020928e-06, + "loss": 2.5009, + "step": 104235 + }, + { + "epoch": 7.082484033156679, + "grad_norm": 8.70950698852539, + "learning_rate": 1.1505469493137656e-06, + "loss": 2.6195, + "step": 104240 + }, + { + "epoch": 7.08282375322734, + "grad_norm": 7.063488483428955, + "learning_rate": 1.1501222992254384e-06, + "loss": 2.7491, + "step": 104245 + }, + { + "epoch": 7.083163473298002, + "grad_norm": 10.400564193725586, + "learning_rate": 1.1496976491371112e-06, + "loss": 2.7779, + "step": 104250 + }, + { + "epoch": 7.0835031933686645, + "grad_norm": 7.464344501495361, + "learning_rate": 1.149272999048784e-06, + "loss": 2.6753, + "step": 104255 + }, + { + "epoch": 7.083842913439326, + "grad_norm": 6.309104919433594, + "learning_rate": 1.1488483489604568e-06, + "loss": 2.8602, + "step": 104260 + }, + { + "epoch": 7.084182633509988, + "grad_norm": 8.475736618041992, + "learning_rate": 1.1484236988721296e-06, + "loss": 2.69, + "step": 104265 + }, + { + "epoch": 7.08452235358065, + "grad_norm": 5.738335132598877, + "learning_rate": 1.1479990487838021e-06, + "loss": 2.7557, + "step": 104270 + }, + { + "epoch": 7.084862073651311, + "grad_norm": 7.703033447265625, + "learning_rate": 1.147574398695475e-06, + "loss": 2.651, + "step": 104275 + }, + { + "epoch": 7.085201793721973, + "grad_norm": 6.89184045791626, + "learning_rate": 1.1471497486071477e-06, + "loss": 2.8329, + "step": 104280 + }, + { + "epoch": 7.085541513792635, + "grad_norm": 7.039224624633789, + "learning_rate": 1.1467250985188205e-06, + "loss": 2.7047, + "step": 104285 + }, + { + "epoch": 7.085881233863296, + "grad_norm": 10.244451522827148, + "learning_rate": 1.1463004484304933e-06, + "loss": 2.9034, + "step": 104290 + }, + { + "epoch": 7.0862209539339585, + "grad_norm": 8.539753913879395, + "learning_rate": 1.1458757983421662e-06, + "loss": 2.8719, + "step": 104295 + }, + { + "epoch": 7.0865606740046205, + "grad_norm": 7.523624420166016, + "learning_rate": 1.145451148253839e-06, + "loss": 2.6876, + "step": 104300 + }, + { + "epoch": 7.086900394075282, + "grad_norm": 7.234994888305664, + "learning_rate": 1.1450264981655118e-06, + "loss": 2.7636, + "step": 104305 + }, + { + "epoch": 7.087240114145944, + "grad_norm": 9.300262451171875, + "learning_rate": 1.1446018480771846e-06, + "loss": 2.511, + "step": 104310 + }, + { + "epoch": 7.087579834216606, + "grad_norm": 7.015552520751953, + "learning_rate": 1.1441771979888574e-06, + "loss": 2.6427, + "step": 104315 + }, + { + "epoch": 7.087919554287267, + "grad_norm": 8.505663871765137, + "learning_rate": 1.14375254790053e-06, + "loss": 2.6705, + "step": 104320 + }, + { + "epoch": 7.088259274357929, + "grad_norm": 8.40054702758789, + "learning_rate": 1.1433278978122027e-06, + "loss": 2.9972, + "step": 104325 + }, + { + "epoch": 7.088598994428591, + "grad_norm": 7.457705497741699, + "learning_rate": 1.1429032477238755e-06, + "loss": 2.6521, + "step": 104330 + }, + { + "epoch": 7.088938714499252, + "grad_norm": 8.367969512939453, + "learning_rate": 1.1424785976355486e-06, + "loss": 2.6738, + "step": 104335 + }, + { + "epoch": 7.0892784345699145, + "grad_norm": 6.679562568664551, + "learning_rate": 1.1420539475472211e-06, + "loss": 2.6479, + "step": 104340 + }, + { + "epoch": 7.0896181546405765, + "grad_norm": 9.180099487304688, + "learning_rate": 1.141629297458894e-06, + "loss": 2.6388, + "step": 104345 + }, + { + "epoch": 7.089957874711238, + "grad_norm": 9.084012031555176, + "learning_rate": 1.1412046473705667e-06, + "loss": 2.4786, + "step": 104350 + }, + { + "epoch": 7.0902975947819, + "grad_norm": 6.834775924682617, + "learning_rate": 1.1407799972822395e-06, + "loss": 2.7771, + "step": 104355 + }, + { + "epoch": 7.090637314852562, + "grad_norm": 10.315804481506348, + "learning_rate": 1.1403553471939123e-06, + "loss": 2.7969, + "step": 104360 + }, + { + "epoch": 7.090977034923223, + "grad_norm": 6.377922058105469, + "learning_rate": 1.1399306971055851e-06, + "loss": 2.6938, + "step": 104365 + }, + { + "epoch": 7.091316754993885, + "grad_norm": 7.517040252685547, + "learning_rate": 1.1395060470172577e-06, + "loss": 2.78, + "step": 104370 + }, + { + "epoch": 7.091656475064547, + "grad_norm": 6.748663902282715, + "learning_rate": 1.1390813969289305e-06, + "loss": 2.8154, + "step": 104375 + }, + { + "epoch": 7.091996195135208, + "grad_norm": 7.354791164398193, + "learning_rate": 1.1386567468406036e-06, + "loss": 2.5866, + "step": 104380 + }, + { + "epoch": 7.0923359152058705, + "grad_norm": 7.714321613311768, + "learning_rate": 1.1382320967522764e-06, + "loss": 2.4451, + "step": 104385 + }, + { + "epoch": 7.0926756352765326, + "grad_norm": 7.305339813232422, + "learning_rate": 1.137807446663949e-06, + "loss": 2.7781, + "step": 104390 + }, + { + "epoch": 7.093015355347194, + "grad_norm": 8.330290794372559, + "learning_rate": 1.1373827965756217e-06, + "loss": 2.7926, + "step": 104395 + }, + { + "epoch": 7.093355075417856, + "grad_norm": 6.8638081550598145, + "learning_rate": 1.1369581464872945e-06, + "loss": 2.6879, + "step": 104400 + }, + { + "epoch": 7.093694795488518, + "grad_norm": 8.162582397460938, + "learning_rate": 1.1365334963989673e-06, + "loss": 2.5357, + "step": 104405 + }, + { + "epoch": 7.094034515559179, + "grad_norm": 9.030562400817871, + "learning_rate": 1.1361088463106401e-06, + "loss": 2.7301, + "step": 104410 + }, + { + "epoch": 7.094374235629841, + "grad_norm": 6.360898971557617, + "learning_rate": 1.135684196222313e-06, + "loss": 2.6071, + "step": 104415 + }, + { + "epoch": 7.094713955700502, + "grad_norm": 8.857171058654785, + "learning_rate": 1.1352595461339857e-06, + "loss": 2.5586, + "step": 104420 + }, + { + "epoch": 7.095053675771164, + "grad_norm": 6.897754192352295, + "learning_rate": 1.1348348960456583e-06, + "loss": 2.6401, + "step": 104425 + }, + { + "epoch": 7.0953933958418265, + "grad_norm": 6.600069046020508, + "learning_rate": 1.1344102459573313e-06, + "loss": 2.5497, + "step": 104430 + }, + { + "epoch": 7.095733115912488, + "grad_norm": 7.358003616333008, + "learning_rate": 1.1339855958690041e-06, + "loss": 2.734, + "step": 104435 + }, + { + "epoch": 7.09607283598315, + "grad_norm": 8.216312408447266, + "learning_rate": 1.1335609457806767e-06, + "loss": 2.7582, + "step": 104440 + }, + { + "epoch": 7.096412556053812, + "grad_norm": 6.881643295288086, + "learning_rate": 1.1331362956923495e-06, + "loss": 2.6504, + "step": 104445 + }, + { + "epoch": 7.096752276124473, + "grad_norm": 10.530577659606934, + "learning_rate": 1.1327116456040223e-06, + "loss": 2.653, + "step": 104450 + }, + { + "epoch": 7.097091996195135, + "grad_norm": 7.898348331451416, + "learning_rate": 1.1322869955156951e-06, + "loss": 2.6236, + "step": 104455 + }, + { + "epoch": 7.097431716265797, + "grad_norm": 8.843284606933594, + "learning_rate": 1.131862345427368e-06, + "loss": 2.8018, + "step": 104460 + }, + { + "epoch": 7.097771436336458, + "grad_norm": 7.94812536239624, + "learning_rate": 1.1314376953390407e-06, + "loss": 2.7239, + "step": 104465 + }, + { + "epoch": 7.09811115640712, + "grad_norm": 6.53042459487915, + "learning_rate": 1.1310130452507135e-06, + "loss": 2.3288, + "step": 104470 + }, + { + "epoch": 7.0984508764777825, + "grad_norm": 6.55878210067749, + "learning_rate": 1.1305883951623863e-06, + "loss": 2.5473, + "step": 104475 + }, + { + "epoch": 7.098790596548444, + "grad_norm": 6.824540138244629, + "learning_rate": 1.1301637450740591e-06, + "loss": 2.7541, + "step": 104480 + }, + { + "epoch": 7.099130316619106, + "grad_norm": 7.720854759216309, + "learning_rate": 1.129739094985732e-06, + "loss": 2.475, + "step": 104485 + }, + { + "epoch": 7.099470036689768, + "grad_norm": 6.909055709838867, + "learning_rate": 1.1293144448974045e-06, + "loss": 2.822, + "step": 104490 + }, + { + "epoch": 7.099809756760429, + "grad_norm": 7.7963643074035645, + "learning_rate": 1.1288897948090773e-06, + "loss": 2.4729, + "step": 104495 + }, + { + "epoch": 7.100149476831091, + "grad_norm": 9.273942947387695, + "learning_rate": 1.1284651447207501e-06, + "loss": 2.893, + "step": 104500 + }, + { + "epoch": 7.100489196901753, + "grad_norm": 8.809816360473633, + "learning_rate": 1.1280404946324231e-06, + "loss": 2.7027, + "step": 104505 + }, + { + "epoch": 7.100828916972414, + "grad_norm": 7.410029411315918, + "learning_rate": 1.1276158445440957e-06, + "loss": 2.9011, + "step": 104510 + }, + { + "epoch": 7.101168637043076, + "grad_norm": 7.203756332397461, + "learning_rate": 1.1271911944557685e-06, + "loss": 2.8132, + "step": 104515 + }, + { + "epoch": 7.1015083571137385, + "grad_norm": 8.695901870727539, + "learning_rate": 1.1267665443674413e-06, + "loss": 2.6676, + "step": 104520 + }, + { + "epoch": 7.1018480771844, + "grad_norm": 7.730510711669922, + "learning_rate": 1.1263418942791141e-06, + "loss": 2.5828, + "step": 104525 + }, + { + "epoch": 7.102187797255062, + "grad_norm": 7.781470775604248, + "learning_rate": 1.125917244190787e-06, + "loss": 2.6343, + "step": 104530 + }, + { + "epoch": 7.102527517325724, + "grad_norm": 7.651203632354736, + "learning_rate": 1.1254925941024597e-06, + "loss": 2.6328, + "step": 104535 + }, + { + "epoch": 7.102867237396385, + "grad_norm": 8.21090030670166, + "learning_rate": 1.1250679440141323e-06, + "loss": 2.7645, + "step": 104540 + }, + { + "epoch": 7.103206957467047, + "grad_norm": 8.072559356689453, + "learning_rate": 1.1246432939258051e-06, + "loss": 2.7912, + "step": 104545 + }, + { + "epoch": 7.103546677537709, + "grad_norm": 7.973447799682617, + "learning_rate": 1.124218643837478e-06, + "loss": 2.6926, + "step": 104550 + }, + { + "epoch": 7.10388639760837, + "grad_norm": 7.628564834594727, + "learning_rate": 1.123793993749151e-06, + "loss": 2.5543, + "step": 104555 + }, + { + "epoch": 7.1042261176790324, + "grad_norm": 9.283966064453125, + "learning_rate": 1.1233693436608235e-06, + "loss": 2.787, + "step": 104560 + }, + { + "epoch": 7.1045658377496945, + "grad_norm": 8.071331977844238, + "learning_rate": 1.1229446935724963e-06, + "loss": 2.5578, + "step": 104565 + }, + { + "epoch": 7.104905557820356, + "grad_norm": 6.861661911010742, + "learning_rate": 1.1225200434841691e-06, + "loss": 2.5313, + "step": 104570 + }, + { + "epoch": 7.105245277891018, + "grad_norm": 8.828817367553711, + "learning_rate": 1.122095393395842e-06, + "loss": 2.6688, + "step": 104575 + }, + { + "epoch": 7.10558499796168, + "grad_norm": 10.293119430541992, + "learning_rate": 1.1216707433075147e-06, + "loss": 2.7407, + "step": 104580 + }, + { + "epoch": 7.105924718032341, + "grad_norm": 10.452231407165527, + "learning_rate": 1.1212460932191875e-06, + "loss": 2.8277, + "step": 104585 + }, + { + "epoch": 7.106264438103003, + "grad_norm": 8.203280448913574, + "learning_rate": 1.1208214431308603e-06, + "loss": 2.8832, + "step": 104590 + }, + { + "epoch": 7.106604158173665, + "grad_norm": 8.792510032653809, + "learning_rate": 1.120396793042533e-06, + "loss": 2.5381, + "step": 104595 + }, + { + "epoch": 7.106943878244326, + "grad_norm": 8.962986946105957, + "learning_rate": 1.119972142954206e-06, + "loss": 2.7408, + "step": 104600 + }, + { + "epoch": 7.1072835983149885, + "grad_norm": 9.666723251342773, + "learning_rate": 1.1195474928658787e-06, + "loss": 2.8436, + "step": 104605 + }, + { + "epoch": 7.1076233183856505, + "grad_norm": 7.183034896850586, + "learning_rate": 1.1191228427775513e-06, + "loss": 2.405, + "step": 104610 + }, + { + "epoch": 7.107963038456312, + "grad_norm": 6.412460803985596, + "learning_rate": 1.1186981926892241e-06, + "loss": 2.5649, + "step": 104615 + }, + { + "epoch": 7.108302758526974, + "grad_norm": 10.093032836914062, + "learning_rate": 1.118273542600897e-06, + "loss": 2.8566, + "step": 104620 + }, + { + "epoch": 7.108642478597636, + "grad_norm": 8.012133598327637, + "learning_rate": 1.1178488925125697e-06, + "loss": 2.5358, + "step": 104625 + }, + { + "epoch": 7.108982198668297, + "grad_norm": 7.75939416885376, + "learning_rate": 1.1174242424242425e-06, + "loss": 2.7373, + "step": 104630 + }, + { + "epoch": 7.109321918738959, + "grad_norm": 9.48357105255127, + "learning_rate": 1.1169995923359153e-06, + "loss": 2.7037, + "step": 104635 + }, + { + "epoch": 7.109661638809621, + "grad_norm": 7.843395709991455, + "learning_rate": 1.1165749422475881e-06, + "loss": 2.7629, + "step": 104640 + }, + { + "epoch": 7.110001358880282, + "grad_norm": 8.347086906433105, + "learning_rate": 1.1161502921592607e-06, + "loss": 2.6093, + "step": 104645 + }, + { + "epoch": 7.1103410789509445, + "grad_norm": 8.754720687866211, + "learning_rate": 1.1157256420709337e-06, + "loss": 2.6759, + "step": 104650 + }, + { + "epoch": 7.1106807990216065, + "grad_norm": 8.909736633300781, + "learning_rate": 1.1153009919826065e-06, + "loss": 2.5937, + "step": 104655 + }, + { + "epoch": 7.111020519092268, + "grad_norm": 8.063894271850586, + "learning_rate": 1.1148763418942791e-06, + "loss": 2.7832, + "step": 104660 + }, + { + "epoch": 7.11136023916293, + "grad_norm": 9.936700820922852, + "learning_rate": 1.114451691805952e-06, + "loss": 2.7412, + "step": 104665 + }, + { + "epoch": 7.111699959233592, + "grad_norm": 6.8919453620910645, + "learning_rate": 1.1140270417176247e-06, + "loss": 2.8105, + "step": 104670 + }, + { + "epoch": 7.112039679304253, + "grad_norm": 10.7137451171875, + "learning_rate": 1.1136023916292975e-06, + "loss": 2.8976, + "step": 104675 + }, + { + "epoch": 7.112379399374915, + "grad_norm": 7.983726978302002, + "learning_rate": 1.1131777415409703e-06, + "loss": 2.7229, + "step": 104680 + }, + { + "epoch": 7.112719119445577, + "grad_norm": 8.08758544921875, + "learning_rate": 1.1127530914526431e-06, + "loss": 2.7969, + "step": 104685 + }, + { + "epoch": 7.113058839516238, + "grad_norm": 7.92309045791626, + "learning_rate": 1.112328441364316e-06, + "loss": 2.7934, + "step": 104690 + }, + { + "epoch": 7.1133985595869005, + "grad_norm": 7.7157745361328125, + "learning_rate": 1.1119037912759887e-06, + "loss": 2.7517, + "step": 104695 + }, + { + "epoch": 7.113738279657563, + "grad_norm": 8.15283203125, + "learning_rate": 1.1114791411876615e-06, + "loss": 2.7148, + "step": 104700 + }, + { + "epoch": 7.114077999728224, + "grad_norm": 7.336297988891602, + "learning_rate": 1.1110544910993343e-06, + "loss": 2.7828, + "step": 104705 + }, + { + "epoch": 7.114417719798886, + "grad_norm": 7.468229293823242, + "learning_rate": 1.110629841011007e-06, + "loss": 2.7185, + "step": 104710 + }, + { + "epoch": 7.114757439869548, + "grad_norm": 6.8682990074157715, + "learning_rate": 1.1102051909226797e-06, + "loss": 2.8919, + "step": 104715 + }, + { + "epoch": 7.115097159940209, + "grad_norm": 7.322002410888672, + "learning_rate": 1.1097805408343525e-06, + "loss": 2.6759, + "step": 104720 + }, + { + "epoch": 7.115436880010871, + "grad_norm": 7.892187118530273, + "learning_rate": 1.1093558907460253e-06, + "loss": 2.3907, + "step": 104725 + }, + { + "epoch": 7.115776600081533, + "grad_norm": 7.561103343963623, + "learning_rate": 1.1089312406576981e-06, + "loss": 2.8068, + "step": 104730 + }, + { + "epoch": 7.116116320152194, + "grad_norm": 7.615213871002197, + "learning_rate": 1.108506590569371e-06, + "loss": 2.6277, + "step": 104735 + }, + { + "epoch": 7.1164560402228565, + "grad_norm": 7.969517707824707, + "learning_rate": 1.1080819404810437e-06, + "loss": 2.7295, + "step": 104740 + }, + { + "epoch": 7.116795760293519, + "grad_norm": 11.467740058898926, + "learning_rate": 1.1076572903927165e-06, + "loss": 2.5986, + "step": 104745 + }, + { + "epoch": 7.11713548036418, + "grad_norm": 8.418764114379883, + "learning_rate": 1.1072326403043893e-06, + "loss": 2.6266, + "step": 104750 + }, + { + "epoch": 7.117475200434842, + "grad_norm": 7.2322611808776855, + "learning_rate": 1.1068079902160621e-06, + "loss": 2.7792, + "step": 104755 + }, + { + "epoch": 7.117814920505504, + "grad_norm": 7.66762638092041, + "learning_rate": 1.106383340127735e-06, + "loss": 2.4945, + "step": 104760 + }, + { + "epoch": 7.118154640576165, + "grad_norm": 7.1755452156066895, + "learning_rate": 1.1059586900394075e-06, + "loss": 2.637, + "step": 104765 + }, + { + "epoch": 7.118494360646827, + "grad_norm": 7.3310394287109375, + "learning_rate": 1.1055340399510803e-06, + "loss": 2.9173, + "step": 104770 + }, + { + "epoch": 7.118834080717488, + "grad_norm": 6.689573287963867, + "learning_rate": 1.1051093898627533e-06, + "loss": 2.8658, + "step": 104775 + }, + { + "epoch": 7.11917380078815, + "grad_norm": 8.74635124206543, + "learning_rate": 1.104684739774426e-06, + "loss": 2.8225, + "step": 104780 + }, + { + "epoch": 7.1195135208588125, + "grad_norm": 6.957638263702393, + "learning_rate": 1.1042600896860987e-06, + "loss": 2.8534, + "step": 104785 + }, + { + "epoch": 7.119853240929474, + "grad_norm": 6.621041774749756, + "learning_rate": 1.1038354395977715e-06, + "loss": 2.627, + "step": 104790 + }, + { + "epoch": 7.120192961000136, + "grad_norm": 7.077373027801514, + "learning_rate": 1.1034107895094443e-06, + "loss": 2.7071, + "step": 104795 + }, + { + "epoch": 7.120532681070798, + "grad_norm": 9.531144142150879, + "learning_rate": 1.1029861394211171e-06, + "loss": 2.7212, + "step": 104800 + }, + { + "epoch": 7.120872401141459, + "grad_norm": 6.44842004776001, + "learning_rate": 1.10256148933279e-06, + "loss": 2.8889, + "step": 104805 + }, + { + "epoch": 7.121212121212121, + "grad_norm": 9.491754531860352, + "learning_rate": 1.1021368392444627e-06, + "loss": 2.5689, + "step": 104810 + }, + { + "epoch": 7.121551841282783, + "grad_norm": 8.699389457702637, + "learning_rate": 1.1017121891561353e-06, + "loss": 2.9583, + "step": 104815 + }, + { + "epoch": 7.121891561353444, + "grad_norm": 8.261096954345703, + "learning_rate": 1.1012875390678083e-06, + "loss": 2.6096, + "step": 104820 + }, + { + "epoch": 7.122231281424106, + "grad_norm": 7.59757137298584, + "learning_rate": 1.1008628889794811e-06, + "loss": 2.6857, + "step": 104825 + }, + { + "epoch": 7.1225710014947685, + "grad_norm": 8.109143257141113, + "learning_rate": 1.1004382388911537e-06, + "loss": 2.7258, + "step": 104830 + }, + { + "epoch": 7.12291072156543, + "grad_norm": 7.744998455047607, + "learning_rate": 1.1000135888028265e-06, + "loss": 2.9407, + "step": 104835 + }, + { + "epoch": 7.123250441636092, + "grad_norm": 7.928214073181152, + "learning_rate": 1.0995889387144993e-06, + "loss": 2.6892, + "step": 104840 + }, + { + "epoch": 7.123590161706754, + "grad_norm": 7.413460731506348, + "learning_rate": 1.0991642886261721e-06, + "loss": 2.7007, + "step": 104845 + }, + { + "epoch": 7.123929881777415, + "grad_norm": 7.043473720550537, + "learning_rate": 1.098739638537845e-06, + "loss": 2.7443, + "step": 104850 + }, + { + "epoch": 7.124269601848077, + "grad_norm": 7.148290634155273, + "learning_rate": 1.0983149884495177e-06, + "loss": 2.6197, + "step": 104855 + }, + { + "epoch": 7.124609321918739, + "grad_norm": 8.412796020507812, + "learning_rate": 1.0978903383611905e-06, + "loss": 2.5652, + "step": 104860 + }, + { + "epoch": 7.1249490419894, + "grad_norm": 7.387073516845703, + "learning_rate": 1.0974656882728631e-06, + "loss": 2.8115, + "step": 104865 + }, + { + "epoch": 7.1252887620600625, + "grad_norm": 7.2167067527771, + "learning_rate": 1.0970410381845361e-06, + "loss": 2.612, + "step": 104870 + }, + { + "epoch": 7.1256284821307245, + "grad_norm": 8.133050918579102, + "learning_rate": 1.096616388096209e-06, + "loss": 2.5076, + "step": 104875 + }, + { + "epoch": 7.125968202201386, + "grad_norm": 7.127135753631592, + "learning_rate": 1.0961917380078815e-06, + "loss": 2.7244, + "step": 104880 + }, + { + "epoch": 7.126307922272048, + "grad_norm": 7.979738712310791, + "learning_rate": 1.0957670879195543e-06, + "loss": 2.7486, + "step": 104885 + }, + { + "epoch": 7.12664764234271, + "grad_norm": 7.410349369049072, + "learning_rate": 1.0953424378312271e-06, + "loss": 2.913, + "step": 104890 + }, + { + "epoch": 7.126987362413371, + "grad_norm": 6.373028755187988, + "learning_rate": 1.0949177877429e-06, + "loss": 2.4769, + "step": 104895 + }, + { + "epoch": 7.127327082484033, + "grad_norm": 7.890411376953125, + "learning_rate": 1.0944931376545727e-06, + "loss": 2.8368, + "step": 104900 + }, + { + "epoch": 7.127666802554695, + "grad_norm": 7.747740745544434, + "learning_rate": 1.0940684875662455e-06, + "loss": 2.8509, + "step": 104905 + }, + { + "epoch": 7.128006522625356, + "grad_norm": 5.97813081741333, + "learning_rate": 1.0936438374779183e-06, + "loss": 2.7036, + "step": 104910 + }, + { + "epoch": 7.1283462426960185, + "grad_norm": 9.855740547180176, + "learning_rate": 1.0932191873895911e-06, + "loss": 2.8438, + "step": 104915 + }, + { + "epoch": 7.1286859627666805, + "grad_norm": 8.740962982177734, + "learning_rate": 1.092794537301264e-06, + "loss": 3.034, + "step": 104920 + }, + { + "epoch": 7.129025682837342, + "grad_norm": 7.671245574951172, + "learning_rate": 1.0923698872129367e-06, + "loss": 2.8691, + "step": 104925 + }, + { + "epoch": 7.129365402908004, + "grad_norm": 7.3189005851745605, + "learning_rate": 1.0919452371246093e-06, + "loss": 2.5087, + "step": 104930 + }, + { + "epoch": 7.129705122978666, + "grad_norm": 9.104320526123047, + "learning_rate": 1.0915205870362821e-06, + "loss": 2.7121, + "step": 104935 + }, + { + "epoch": 7.130044843049327, + "grad_norm": 7.202327728271484, + "learning_rate": 1.091095936947955e-06, + "loss": 2.6783, + "step": 104940 + }, + { + "epoch": 7.130384563119989, + "grad_norm": 8.509103775024414, + "learning_rate": 1.0906712868596277e-06, + "loss": 2.8426, + "step": 104945 + }, + { + "epoch": 7.130724283190651, + "grad_norm": 9.083382606506348, + "learning_rate": 1.0902466367713005e-06, + "loss": 2.6661, + "step": 104950 + }, + { + "epoch": 7.131064003261312, + "grad_norm": 7.88314151763916, + "learning_rate": 1.0898219866829733e-06, + "loss": 2.7858, + "step": 104955 + }, + { + "epoch": 7.1314037233319745, + "grad_norm": 7.395561218261719, + "learning_rate": 1.0893973365946461e-06, + "loss": 2.6702, + "step": 104960 + }, + { + "epoch": 7.1317434434026366, + "grad_norm": 7.335485935211182, + "learning_rate": 1.088972686506319e-06, + "loss": 2.6556, + "step": 104965 + }, + { + "epoch": 7.132083163473298, + "grad_norm": 6.126856803894043, + "learning_rate": 1.0885480364179917e-06, + "loss": 2.8325, + "step": 104970 + }, + { + "epoch": 7.13242288354396, + "grad_norm": 8.855899810791016, + "learning_rate": 1.0881233863296645e-06, + "loss": 2.8448, + "step": 104975 + }, + { + "epoch": 7.132762603614622, + "grad_norm": 5.893100261688232, + "learning_rate": 1.0876987362413373e-06, + "loss": 2.6826, + "step": 104980 + }, + { + "epoch": 7.133102323685283, + "grad_norm": 7.5125732421875, + "learning_rate": 1.08727408615301e-06, + "loss": 2.6059, + "step": 104985 + }, + { + "epoch": 7.133442043755945, + "grad_norm": 7.95310640335083, + "learning_rate": 1.0868494360646827e-06, + "loss": 2.7233, + "step": 104990 + }, + { + "epoch": 7.133781763826607, + "grad_norm": 6.305450916290283, + "learning_rate": 1.0864247859763557e-06, + "loss": 2.6925, + "step": 104995 + }, + { + "epoch": 7.134121483897268, + "grad_norm": 11.269280433654785, + "learning_rate": 1.0860001358880283e-06, + "loss": 2.7822, + "step": 105000 + }, + { + "epoch": 7.1344612039679305, + "grad_norm": 6.709474563598633, + "learning_rate": 1.0855754857997011e-06, + "loss": 2.7353, + "step": 105005 + }, + { + "epoch": 7.134800924038593, + "grad_norm": 9.905110359191895, + "learning_rate": 1.085150835711374e-06, + "loss": 2.6117, + "step": 105010 + }, + { + "epoch": 7.135140644109254, + "grad_norm": 5.922102451324463, + "learning_rate": 1.0847261856230467e-06, + "loss": 2.6485, + "step": 105015 + }, + { + "epoch": 7.135480364179916, + "grad_norm": 8.757665634155273, + "learning_rate": 1.0843015355347195e-06, + "loss": 2.5728, + "step": 105020 + }, + { + "epoch": 7.135820084250578, + "grad_norm": 8.430902481079102, + "learning_rate": 1.0838768854463923e-06, + "loss": 2.7746, + "step": 105025 + }, + { + "epoch": 7.136159804321239, + "grad_norm": 9.676690101623535, + "learning_rate": 1.0834522353580651e-06, + "loss": 2.8528, + "step": 105030 + }, + { + "epoch": 7.136499524391901, + "grad_norm": 6.819650173187256, + "learning_rate": 1.0830275852697377e-06, + "loss": 2.6769, + "step": 105035 + }, + { + "epoch": 7.136839244462563, + "grad_norm": 11.475403785705566, + "learning_rate": 1.0826029351814105e-06, + "loss": 2.6486, + "step": 105040 + }, + { + "epoch": 7.137178964533224, + "grad_norm": 6.0492401123046875, + "learning_rate": 1.0821782850930835e-06, + "loss": 2.855, + "step": 105045 + }, + { + "epoch": 7.1375186846038865, + "grad_norm": 8.194437026977539, + "learning_rate": 1.081753635004756e-06, + "loss": 3.0048, + "step": 105050 + }, + { + "epoch": 7.137858404674549, + "grad_norm": 9.580215454101562, + "learning_rate": 1.081328984916429e-06, + "loss": 2.7119, + "step": 105055 + }, + { + "epoch": 7.13819812474521, + "grad_norm": 7.416913032531738, + "learning_rate": 1.0809043348281017e-06, + "loss": 2.7459, + "step": 105060 + }, + { + "epoch": 7.138537844815872, + "grad_norm": 8.258360862731934, + "learning_rate": 1.0804796847397745e-06, + "loss": 2.5777, + "step": 105065 + }, + { + "epoch": 7.138877564886534, + "grad_norm": 10.849705696105957, + "learning_rate": 1.0800550346514473e-06, + "loss": 2.6636, + "step": 105070 + }, + { + "epoch": 7.139217284957195, + "grad_norm": 9.17493724822998, + "learning_rate": 1.0796303845631201e-06, + "loss": 2.7171, + "step": 105075 + }, + { + "epoch": 7.139557005027857, + "grad_norm": 8.536355972290039, + "learning_rate": 1.079205734474793e-06, + "loss": 2.4529, + "step": 105080 + }, + { + "epoch": 7.139896725098519, + "grad_norm": 11.865471839904785, + "learning_rate": 1.0787810843864655e-06, + "loss": 2.564, + "step": 105085 + }, + { + "epoch": 7.14023644516918, + "grad_norm": 8.533295631408691, + "learning_rate": 1.0783564342981385e-06, + "loss": 2.7171, + "step": 105090 + }, + { + "epoch": 7.1405761652398425, + "grad_norm": 8.667363166809082, + "learning_rate": 1.0779317842098113e-06, + "loss": 2.4321, + "step": 105095 + }, + { + "epoch": 7.140915885310504, + "grad_norm": 7.220179080963135, + "learning_rate": 1.077507134121484e-06, + "loss": 3.0126, + "step": 105100 + }, + { + "epoch": 7.141255605381166, + "grad_norm": 9.045069694519043, + "learning_rate": 1.0770824840331567e-06, + "loss": 2.7566, + "step": 105105 + }, + { + "epoch": 7.141595325451828, + "grad_norm": 8.206028938293457, + "learning_rate": 1.0766578339448295e-06, + "loss": 2.8771, + "step": 105110 + }, + { + "epoch": 7.141935045522489, + "grad_norm": 8.365222930908203, + "learning_rate": 1.0762331838565023e-06, + "loss": 2.6235, + "step": 105115 + }, + { + "epoch": 7.142274765593151, + "grad_norm": 7.0074381828308105, + "learning_rate": 1.075808533768175e-06, + "loss": 2.5916, + "step": 105120 + }, + { + "epoch": 7.142614485663813, + "grad_norm": 6.390689373016357, + "learning_rate": 1.075383883679848e-06, + "loss": 2.7435, + "step": 105125 + }, + { + "epoch": 7.142954205734474, + "grad_norm": 7.86121940612793, + "learning_rate": 1.0749592335915207e-06, + "loss": 2.5919, + "step": 105130 + }, + { + "epoch": 7.1432939258051364, + "grad_norm": 9.375768661499023, + "learning_rate": 1.0745345835031933e-06, + "loss": 2.6418, + "step": 105135 + }, + { + "epoch": 7.1436336458757985, + "grad_norm": 8.961603164672852, + "learning_rate": 1.0741948634325316e-06, + "loss": 2.7641, + "step": 105140 + }, + { + "epoch": 7.14397336594646, + "grad_norm": 9.154789924621582, + "learning_rate": 1.0737702133442044e-06, + "loss": 2.3545, + "step": 105145 + }, + { + "epoch": 7.144313086017122, + "grad_norm": 7.337275505065918, + "learning_rate": 1.0733455632558774e-06, + "loss": 2.5793, + "step": 105150 + }, + { + "epoch": 7.144652806087784, + "grad_norm": 7.140657901763916, + "learning_rate": 1.07292091316755e-06, + "loss": 2.8575, + "step": 105155 + }, + { + "epoch": 7.144992526158445, + "grad_norm": 7.137389183044434, + "learning_rate": 1.0724962630792228e-06, + "loss": 2.7299, + "step": 105160 + }, + { + "epoch": 7.145332246229107, + "grad_norm": 7.361810207366943, + "learning_rate": 1.0720716129908956e-06, + "loss": 2.7208, + "step": 105165 + }, + { + "epoch": 7.145671966299769, + "grad_norm": 5.967323303222656, + "learning_rate": 1.0716469629025684e-06, + "loss": 2.846, + "step": 105170 + }, + { + "epoch": 7.14601168637043, + "grad_norm": 7.747457981109619, + "learning_rate": 1.0712223128142412e-06, + "loss": 2.6916, + "step": 105175 + }, + { + "epoch": 7.1463514064410925, + "grad_norm": 8.94692611694336, + "learning_rate": 1.070797662725914e-06, + "loss": 2.6158, + "step": 105180 + }, + { + "epoch": 7.1466911265117545, + "grad_norm": 8.599268913269043, + "learning_rate": 1.0703730126375868e-06, + "loss": 2.7591, + "step": 105185 + }, + { + "epoch": 7.147030846582416, + "grad_norm": 6.9941277503967285, + "learning_rate": 1.0699483625492594e-06, + "loss": 2.7296, + "step": 105190 + }, + { + "epoch": 7.147370566653078, + "grad_norm": 11.323493003845215, + "learning_rate": 1.0695237124609324e-06, + "loss": 2.8227, + "step": 105195 + }, + { + "epoch": 7.14771028672374, + "grad_norm": 8.14979076385498, + "learning_rate": 1.0690990623726052e-06, + "loss": 2.6968, + "step": 105200 + }, + { + "epoch": 7.148050006794401, + "grad_norm": 7.12731409072876, + "learning_rate": 1.0686744122842778e-06, + "loss": 2.4869, + "step": 105205 + }, + { + "epoch": 7.148389726865063, + "grad_norm": 8.092644691467285, + "learning_rate": 1.0682497621959506e-06, + "loss": 2.6731, + "step": 105210 + }, + { + "epoch": 7.148729446935725, + "grad_norm": 7.730327606201172, + "learning_rate": 1.0678251121076234e-06, + "loss": 2.5777, + "step": 105215 + }, + { + "epoch": 7.149069167006386, + "grad_norm": 6.614198207855225, + "learning_rate": 1.0674004620192962e-06, + "loss": 2.6023, + "step": 105220 + }, + { + "epoch": 7.1494088870770485, + "grad_norm": 7.994431018829346, + "learning_rate": 1.066975811930969e-06, + "loss": 2.8756, + "step": 105225 + }, + { + "epoch": 7.1497486071477105, + "grad_norm": 8.863616943359375, + "learning_rate": 1.0665511618426418e-06, + "loss": 2.744, + "step": 105230 + }, + { + "epoch": 7.150088327218372, + "grad_norm": 9.538383483886719, + "learning_rate": 1.0661265117543146e-06, + "loss": 2.811, + "step": 105235 + }, + { + "epoch": 7.150428047289034, + "grad_norm": 7.783647060394287, + "learning_rate": 1.0657018616659872e-06, + "loss": 2.4963, + "step": 105240 + }, + { + "epoch": 7.150767767359696, + "grad_norm": 7.628624439239502, + "learning_rate": 1.0652772115776602e-06, + "loss": 2.8679, + "step": 105245 + }, + { + "epoch": 7.151107487430357, + "grad_norm": 5.506185531616211, + "learning_rate": 1.064852561489333e-06, + "loss": 2.6798, + "step": 105250 + }, + { + "epoch": 7.151447207501019, + "grad_norm": 7.086246967315674, + "learning_rate": 1.0644279114010056e-06, + "loss": 2.9283, + "step": 105255 + }, + { + "epoch": 7.151786927571681, + "grad_norm": 8.730437278747559, + "learning_rate": 1.0640032613126784e-06, + "loss": 2.6858, + "step": 105260 + }, + { + "epoch": 7.152126647642342, + "grad_norm": 8.577476501464844, + "learning_rate": 1.0635786112243512e-06, + "loss": 2.6605, + "step": 105265 + }, + { + "epoch": 7.1524663677130045, + "grad_norm": 5.409427165985107, + "learning_rate": 1.063153961136024e-06, + "loss": 2.8635, + "step": 105270 + }, + { + "epoch": 7.1528060877836666, + "grad_norm": 8.457685470581055, + "learning_rate": 1.0627293110476968e-06, + "loss": 2.7211, + "step": 105275 + }, + { + "epoch": 7.153145807854328, + "grad_norm": 8.426928520202637, + "learning_rate": 1.0623046609593696e-06, + "loss": 2.6112, + "step": 105280 + }, + { + "epoch": 7.15348552792499, + "grad_norm": 7.65524959564209, + "learning_rate": 1.0618800108710424e-06, + "loss": 2.74, + "step": 105285 + }, + { + "epoch": 7.153825247995652, + "grad_norm": 7.900294780731201, + "learning_rate": 1.0614553607827152e-06, + "loss": 2.9575, + "step": 105290 + }, + { + "epoch": 7.154164968066313, + "grad_norm": 8.410309791564941, + "learning_rate": 1.061030710694388e-06, + "loss": 2.8458, + "step": 105295 + }, + { + "epoch": 7.154504688136975, + "grad_norm": 6.873233795166016, + "learning_rate": 1.0606060606060608e-06, + "loss": 2.8327, + "step": 105300 + }, + { + "epoch": 7.154844408207637, + "grad_norm": 6.773321628570557, + "learning_rate": 1.0601814105177334e-06, + "loss": 2.7479, + "step": 105305 + }, + { + "epoch": 7.155184128278298, + "grad_norm": 8.281073570251465, + "learning_rate": 1.0597567604294062e-06, + "loss": 2.8162, + "step": 105310 + }, + { + "epoch": 7.1555238483489605, + "grad_norm": 8.203624725341797, + "learning_rate": 1.059332110341079e-06, + "loss": 2.6146, + "step": 105315 + }, + { + "epoch": 7.155863568419623, + "grad_norm": 8.04532241821289, + "learning_rate": 1.0589074602527518e-06, + "loss": 2.798, + "step": 105320 + }, + { + "epoch": 7.156203288490284, + "grad_norm": 6.846789836883545, + "learning_rate": 1.0584828101644246e-06, + "loss": 2.6999, + "step": 105325 + }, + { + "epoch": 7.156543008560946, + "grad_norm": 9.002914428710938, + "learning_rate": 1.0580581600760974e-06, + "loss": 2.6689, + "step": 105330 + }, + { + "epoch": 7.156882728631608, + "grad_norm": 6.643833160400391, + "learning_rate": 1.0576335099877702e-06, + "loss": 2.5507, + "step": 105335 + }, + { + "epoch": 7.157222448702269, + "grad_norm": 8.442418098449707, + "learning_rate": 1.057208859899443e-06, + "loss": 2.6105, + "step": 105340 + }, + { + "epoch": 7.157562168772931, + "grad_norm": 7.068309307098389, + "learning_rate": 1.0567842098111158e-06, + "loss": 2.592, + "step": 105345 + }, + { + "epoch": 7.157901888843593, + "grad_norm": 7.765908241271973, + "learning_rate": 1.0563595597227886e-06, + "loss": 2.8724, + "step": 105350 + }, + { + "epoch": 7.158241608914254, + "grad_norm": 9.380897521972656, + "learning_rate": 1.0559349096344614e-06, + "loss": 2.7634, + "step": 105355 + }, + { + "epoch": 7.1585813289849165, + "grad_norm": 7.427799701690674, + "learning_rate": 1.055510259546134e-06, + "loss": 2.7391, + "step": 105360 + }, + { + "epoch": 7.158921049055579, + "grad_norm": 7.689691066741943, + "learning_rate": 1.0550856094578068e-06, + "loss": 2.7749, + "step": 105365 + }, + { + "epoch": 7.15926076912624, + "grad_norm": 7.895869731903076, + "learning_rate": 1.0546609593694798e-06, + "loss": 2.6491, + "step": 105370 + }, + { + "epoch": 7.159600489196902, + "grad_norm": 8.097757339477539, + "learning_rate": 1.0542363092811524e-06, + "loss": 2.9004, + "step": 105375 + }, + { + "epoch": 7.159940209267564, + "grad_norm": 7.140805721282959, + "learning_rate": 1.0538116591928252e-06, + "loss": 2.837, + "step": 105380 + }, + { + "epoch": 7.160279929338225, + "grad_norm": 8.813451766967773, + "learning_rate": 1.053387009104498e-06, + "loss": 2.7804, + "step": 105385 + }, + { + "epoch": 7.160619649408887, + "grad_norm": 6.684091091156006, + "learning_rate": 1.0529623590161708e-06, + "loss": 2.8321, + "step": 105390 + }, + { + "epoch": 7.160959369479549, + "grad_norm": 5.999983310699463, + "learning_rate": 1.0525377089278436e-06, + "loss": 2.6018, + "step": 105395 + }, + { + "epoch": 7.16129908955021, + "grad_norm": 6.6705756187438965, + "learning_rate": 1.0521130588395164e-06, + "loss": 2.6756, + "step": 105400 + }, + { + "epoch": 7.1616388096208725, + "grad_norm": 6.435191631317139, + "learning_rate": 1.0516884087511892e-06, + "loss": 2.5187, + "step": 105405 + }, + { + "epoch": 7.161978529691535, + "grad_norm": 10.004772186279297, + "learning_rate": 1.0512637586628618e-06, + "loss": 2.7281, + "step": 105410 + }, + { + "epoch": 7.162318249762196, + "grad_norm": 9.26150894165039, + "learning_rate": 1.0508391085745348e-06, + "loss": 2.6354, + "step": 105415 + }, + { + "epoch": 7.162657969832858, + "grad_norm": 7.674956798553467, + "learning_rate": 1.0504144584862076e-06, + "loss": 2.5945, + "step": 105420 + }, + { + "epoch": 7.16299768990352, + "grad_norm": 11.583255767822266, + "learning_rate": 1.0499898083978802e-06, + "loss": 2.6875, + "step": 105425 + }, + { + "epoch": 7.163337409974181, + "grad_norm": 9.17087459564209, + "learning_rate": 1.049565158309553e-06, + "loss": 2.8015, + "step": 105430 + }, + { + "epoch": 7.163677130044843, + "grad_norm": 7.46390962600708, + "learning_rate": 1.0491405082212258e-06, + "loss": 2.4832, + "step": 105435 + }, + { + "epoch": 7.164016850115505, + "grad_norm": 6.990577220916748, + "learning_rate": 1.0487158581328986e-06, + "loss": 2.618, + "step": 105440 + }, + { + "epoch": 7.1643565701861665, + "grad_norm": 7.4151105880737305, + "learning_rate": 1.0482912080445714e-06, + "loss": 2.7711, + "step": 105445 + }, + { + "epoch": 7.1646962902568285, + "grad_norm": 7.764579772949219, + "learning_rate": 1.0478665579562442e-06, + "loss": 2.7731, + "step": 105450 + }, + { + "epoch": 7.165036010327491, + "grad_norm": 8.469579696655273, + "learning_rate": 1.047441907867917e-06, + "loss": 2.6099, + "step": 105455 + }, + { + "epoch": 7.165375730398152, + "grad_norm": 8.828025817871094, + "learning_rate": 1.0470172577795896e-06, + "loss": 2.7238, + "step": 105460 + }, + { + "epoch": 7.165715450468814, + "grad_norm": 5.8945488929748535, + "learning_rate": 1.0465926076912626e-06, + "loss": 2.3346, + "step": 105465 + }, + { + "epoch": 7.166055170539475, + "grad_norm": 9.981593132019043, + "learning_rate": 1.0461679576029354e-06, + "loss": 2.6524, + "step": 105470 + }, + { + "epoch": 7.166394890610137, + "grad_norm": 8.643203735351562, + "learning_rate": 1.045743307514608e-06, + "loss": 2.725, + "step": 105475 + }, + { + "epoch": 7.166734610680799, + "grad_norm": 7.998159408569336, + "learning_rate": 1.0453186574262808e-06, + "loss": 2.6279, + "step": 105480 + }, + { + "epoch": 7.16707433075146, + "grad_norm": 8.247846603393555, + "learning_rate": 1.0448940073379536e-06, + "loss": 2.4861, + "step": 105485 + }, + { + "epoch": 7.1674140508221225, + "grad_norm": 6.668605327606201, + "learning_rate": 1.0444693572496264e-06, + "loss": 2.5447, + "step": 105490 + }, + { + "epoch": 7.1677537708927845, + "grad_norm": 7.615560054779053, + "learning_rate": 1.0440447071612992e-06, + "loss": 2.6641, + "step": 105495 + }, + { + "epoch": 7.168093490963446, + "grad_norm": 8.646456718444824, + "learning_rate": 1.043620057072972e-06, + "loss": 2.8169, + "step": 105500 + }, + { + "epoch": 7.168433211034108, + "grad_norm": 5.842045783996582, + "learning_rate": 1.0431954069846448e-06, + "loss": 2.7323, + "step": 105505 + }, + { + "epoch": 7.16877293110477, + "grad_norm": 8.680768013000488, + "learning_rate": 1.0427707568963176e-06, + "loss": 2.4604, + "step": 105510 + }, + { + "epoch": 7.169112651175431, + "grad_norm": 10.177083969116211, + "learning_rate": 1.0423461068079904e-06, + "loss": 2.8442, + "step": 105515 + }, + { + "epoch": 7.169452371246093, + "grad_norm": 7.340485572814941, + "learning_rate": 1.0419214567196632e-06, + "loss": 2.7425, + "step": 105520 + }, + { + "epoch": 7.169792091316755, + "grad_norm": 6.9777398109436035, + "learning_rate": 1.041496806631336e-06, + "loss": 2.8633, + "step": 105525 + }, + { + "epoch": 7.170131811387416, + "grad_norm": 7.498989582061768, + "learning_rate": 1.0410721565430086e-06, + "loss": 2.4811, + "step": 105530 + }, + { + "epoch": 7.1704715314580785, + "grad_norm": 6.549466609954834, + "learning_rate": 1.0406475064546814e-06, + "loss": 2.7983, + "step": 105535 + }, + { + "epoch": 7.1708112515287405, + "grad_norm": 9.247330665588379, + "learning_rate": 1.0402228563663542e-06, + "loss": 2.789, + "step": 105540 + }, + { + "epoch": 7.171150971599402, + "grad_norm": 12.10132884979248, + "learning_rate": 1.039798206278027e-06, + "loss": 2.9665, + "step": 105545 + }, + { + "epoch": 7.171490691670064, + "grad_norm": 8.121194839477539, + "learning_rate": 1.0393735561896998e-06, + "loss": 2.6715, + "step": 105550 + }, + { + "epoch": 7.171830411740726, + "grad_norm": 6.964968204498291, + "learning_rate": 1.0389489061013726e-06, + "loss": 2.5053, + "step": 105555 + }, + { + "epoch": 7.172170131811387, + "grad_norm": 13.023079872131348, + "learning_rate": 1.0385242560130454e-06, + "loss": 2.8858, + "step": 105560 + }, + { + "epoch": 7.172509851882049, + "grad_norm": 8.357477188110352, + "learning_rate": 1.0380996059247182e-06, + "loss": 2.8348, + "step": 105565 + }, + { + "epoch": 7.172849571952711, + "grad_norm": 6.533574104309082, + "learning_rate": 1.037674955836391e-06, + "loss": 2.7474, + "step": 105570 + }, + { + "epoch": 7.173189292023372, + "grad_norm": 8.421281814575195, + "learning_rate": 1.0372503057480638e-06, + "loss": 2.4909, + "step": 105575 + }, + { + "epoch": 7.1735290120940345, + "grad_norm": 8.316354751586914, + "learning_rate": 1.0368256556597363e-06, + "loss": 2.6931, + "step": 105580 + }, + { + "epoch": 7.173868732164697, + "grad_norm": 9.238686561584473, + "learning_rate": 1.0364010055714091e-06, + "loss": 2.7828, + "step": 105585 + }, + { + "epoch": 7.174208452235358, + "grad_norm": 6.310760021209717, + "learning_rate": 1.0359763554830822e-06, + "loss": 2.6014, + "step": 105590 + }, + { + "epoch": 7.17454817230602, + "grad_norm": 7.194926738739014, + "learning_rate": 1.0355517053947548e-06, + "loss": 2.8634, + "step": 105595 + }, + { + "epoch": 7.174887892376682, + "grad_norm": 7.444775581359863, + "learning_rate": 1.0351270553064276e-06, + "loss": 2.8162, + "step": 105600 + }, + { + "epoch": 7.175227612447343, + "grad_norm": 8.435542106628418, + "learning_rate": 1.0347024052181004e-06, + "loss": 3.018, + "step": 105605 + }, + { + "epoch": 7.175567332518005, + "grad_norm": 8.992465019226074, + "learning_rate": 1.0342777551297732e-06, + "loss": 2.6511, + "step": 105610 + }, + { + "epoch": 7.175907052588667, + "grad_norm": 8.525819778442383, + "learning_rate": 1.033853105041446e-06, + "loss": 2.8008, + "step": 105615 + }, + { + "epoch": 7.176246772659328, + "grad_norm": 6.4265570640563965, + "learning_rate": 1.0334284549531188e-06, + "loss": 2.636, + "step": 105620 + }, + { + "epoch": 7.1765864927299905, + "grad_norm": 7.393314838409424, + "learning_rate": 1.0330038048647916e-06, + "loss": 2.4656, + "step": 105625 + }, + { + "epoch": 7.176926212800653, + "grad_norm": 7.232936859130859, + "learning_rate": 1.0325791547764641e-06, + "loss": 2.7913, + "step": 105630 + }, + { + "epoch": 7.177265932871314, + "grad_norm": 8.58774185180664, + "learning_rate": 1.032154504688137e-06, + "loss": 2.7316, + "step": 105635 + }, + { + "epoch": 7.177605652941976, + "grad_norm": 12.444374084472656, + "learning_rate": 1.03172985459981e-06, + "loss": 2.7907, + "step": 105640 + }, + { + "epoch": 7.177945373012638, + "grad_norm": 8.296032905578613, + "learning_rate": 1.0313052045114825e-06, + "loss": 2.6754, + "step": 105645 + }, + { + "epoch": 7.178285093083299, + "grad_norm": 7.410162925720215, + "learning_rate": 1.0308805544231553e-06, + "loss": 3.0628, + "step": 105650 + }, + { + "epoch": 7.178624813153961, + "grad_norm": 7.534445762634277, + "learning_rate": 1.0304559043348281e-06, + "loss": 2.774, + "step": 105655 + }, + { + "epoch": 7.178964533224623, + "grad_norm": 6.366321563720703, + "learning_rate": 1.030031254246501e-06, + "loss": 2.8286, + "step": 105660 + }, + { + "epoch": 7.179304253295284, + "grad_norm": 11.182124137878418, + "learning_rate": 1.0296066041581738e-06, + "loss": 2.82, + "step": 105665 + }, + { + "epoch": 7.1796439733659465, + "grad_norm": 6.904507160186768, + "learning_rate": 1.0291819540698466e-06, + "loss": 2.5406, + "step": 105670 + }, + { + "epoch": 7.179983693436609, + "grad_norm": 8.4142427444458, + "learning_rate": 1.0287573039815194e-06, + "loss": 2.8139, + "step": 105675 + }, + { + "epoch": 7.18032341350727, + "grad_norm": 6.2416205406188965, + "learning_rate": 1.028332653893192e-06, + "loss": 2.8165, + "step": 105680 + }, + { + "epoch": 7.180663133577932, + "grad_norm": 6.870877742767334, + "learning_rate": 1.027908003804865e-06, + "loss": 2.7426, + "step": 105685 + }, + { + "epoch": 7.181002853648594, + "grad_norm": 8.504051208496094, + "learning_rate": 1.0274833537165378e-06, + "loss": 2.9588, + "step": 105690 + }, + { + "epoch": 7.181342573719255, + "grad_norm": 7.93702507019043, + "learning_rate": 1.0270587036282106e-06, + "loss": 2.623, + "step": 105695 + }, + { + "epoch": 7.181682293789917, + "grad_norm": 7.164635181427002, + "learning_rate": 1.0266340535398831e-06, + "loss": 2.7492, + "step": 105700 + }, + { + "epoch": 7.182022013860579, + "grad_norm": 8.458118438720703, + "learning_rate": 1.026209403451556e-06, + "loss": 2.5544, + "step": 105705 + }, + { + "epoch": 7.18236173393124, + "grad_norm": 8.240283966064453, + "learning_rate": 1.0257847533632287e-06, + "loss": 2.7155, + "step": 105710 + }, + { + "epoch": 7.1827014540019025, + "grad_norm": 9.305705070495605, + "learning_rate": 1.0253601032749015e-06, + "loss": 2.6822, + "step": 105715 + }, + { + "epoch": 7.183041174072565, + "grad_norm": 10.239106178283691, + "learning_rate": 1.0249354531865743e-06, + "loss": 2.7374, + "step": 105720 + }, + { + "epoch": 7.183380894143226, + "grad_norm": 7.115032196044922, + "learning_rate": 1.0245108030982471e-06, + "loss": 2.6548, + "step": 105725 + }, + { + "epoch": 7.183720614213888, + "grad_norm": 9.209497451782227, + "learning_rate": 1.0240861530099197e-06, + "loss": 2.6735, + "step": 105730 + }, + { + "epoch": 7.18406033428455, + "grad_norm": 8.165367126464844, + "learning_rate": 1.0236615029215927e-06, + "loss": 2.6537, + "step": 105735 + }, + { + "epoch": 7.184400054355211, + "grad_norm": 7.103401184082031, + "learning_rate": 1.0232368528332656e-06, + "loss": 2.6246, + "step": 105740 + }, + { + "epoch": 7.184739774425873, + "grad_norm": 10.457343101501465, + "learning_rate": 1.0228122027449384e-06, + "loss": 2.5337, + "step": 105745 + }, + { + "epoch": 7.185079494496535, + "grad_norm": 8.97873306274414, + "learning_rate": 1.022387552656611e-06, + "loss": 2.4837, + "step": 105750 + }, + { + "epoch": 7.1854192145671965, + "grad_norm": 7.343898773193359, + "learning_rate": 1.0219629025682837e-06, + "loss": 2.7892, + "step": 105755 + }, + { + "epoch": 7.1857589346378585, + "grad_norm": 6.508875370025635, + "learning_rate": 1.0215382524799565e-06, + "loss": 2.5834, + "step": 105760 + }, + { + "epoch": 7.186098654708521, + "grad_norm": 9.523537635803223, + "learning_rate": 1.0211136023916293e-06, + "loss": 2.7556, + "step": 105765 + }, + { + "epoch": 7.186438374779182, + "grad_norm": 7.975299835205078, + "learning_rate": 1.0206889523033021e-06, + "loss": 2.6791, + "step": 105770 + }, + { + "epoch": 7.186778094849844, + "grad_norm": 6.15582799911499, + "learning_rate": 1.020264302214975e-06, + "loss": 2.6439, + "step": 105775 + }, + { + "epoch": 7.187117814920505, + "grad_norm": 9.679908752441406, + "learning_rate": 1.0198396521266477e-06, + "loss": 2.8173, + "step": 105780 + }, + { + "epoch": 7.187457534991167, + "grad_norm": 6.344203948974609, + "learning_rate": 1.0194150020383205e-06, + "loss": 2.6602, + "step": 105785 + }, + { + "epoch": 7.187797255061829, + "grad_norm": 6.4072065353393555, + "learning_rate": 1.0189903519499933e-06, + "loss": 2.7145, + "step": 105790 + }, + { + "epoch": 7.18813697513249, + "grad_norm": 11.746530532836914, + "learning_rate": 1.0185657018616661e-06, + "loss": 2.7504, + "step": 105795 + }, + { + "epoch": 7.1884766952031525, + "grad_norm": 8.326172828674316, + "learning_rate": 1.0181410517733387e-06, + "loss": 2.5071, + "step": 105800 + }, + { + "epoch": 7.1888164152738145, + "grad_norm": 8.23453140258789, + "learning_rate": 1.0177164016850115e-06, + "loss": 2.6651, + "step": 105805 + }, + { + "epoch": 7.189156135344476, + "grad_norm": 5.781088829040527, + "learning_rate": 1.0172917515966845e-06, + "loss": 2.4807, + "step": 105810 + }, + { + "epoch": 7.189495855415138, + "grad_norm": 7.704298496246338, + "learning_rate": 1.0168671015083571e-06, + "loss": 2.9762, + "step": 105815 + }, + { + "epoch": 7.1898355754858, + "grad_norm": 7.209316730499268, + "learning_rate": 1.01644245142003e-06, + "loss": 2.8671, + "step": 105820 + }, + { + "epoch": 7.190175295556461, + "grad_norm": 7.473333835601807, + "learning_rate": 1.0160178013317027e-06, + "loss": 2.8516, + "step": 105825 + }, + { + "epoch": 7.190515015627123, + "grad_norm": 9.386994361877441, + "learning_rate": 1.0155931512433755e-06, + "loss": 2.8661, + "step": 105830 + }, + { + "epoch": 7.190854735697785, + "grad_norm": 10.548104286193848, + "learning_rate": 1.0151685011550483e-06, + "loss": 2.6341, + "step": 105835 + }, + { + "epoch": 7.191194455768446, + "grad_norm": 8.344762802124023, + "learning_rate": 1.0147438510667211e-06, + "loss": 2.5952, + "step": 105840 + }, + { + "epoch": 7.1915341758391085, + "grad_norm": 7.314056396484375, + "learning_rate": 1.014319200978394e-06, + "loss": 2.7029, + "step": 105845 + }, + { + "epoch": 7.1918738959097706, + "grad_norm": 7.056920051574707, + "learning_rate": 1.0138945508900665e-06, + "loss": 2.7177, + "step": 105850 + }, + { + "epoch": 7.192213615980432, + "grad_norm": 7.519902229309082, + "learning_rate": 1.0134699008017393e-06, + "loss": 2.7906, + "step": 105855 + }, + { + "epoch": 7.192553336051094, + "grad_norm": 7.228780269622803, + "learning_rate": 1.0130452507134123e-06, + "loss": 2.5947, + "step": 105860 + }, + { + "epoch": 7.192893056121756, + "grad_norm": 7.9117279052734375, + "learning_rate": 1.0126206006250851e-06, + "loss": 2.7877, + "step": 105865 + }, + { + "epoch": 7.193232776192417, + "grad_norm": 6.877336025238037, + "learning_rate": 1.0121959505367577e-06, + "loss": 2.5146, + "step": 105870 + }, + { + "epoch": 7.193572496263079, + "grad_norm": 8.930508613586426, + "learning_rate": 1.0117713004484305e-06, + "loss": 2.6321, + "step": 105875 + }, + { + "epoch": 7.193912216333741, + "grad_norm": 8.723546028137207, + "learning_rate": 1.0113466503601033e-06, + "loss": 2.6002, + "step": 105880 + }, + { + "epoch": 7.194251936404402, + "grad_norm": 9.524205207824707, + "learning_rate": 1.0109220002717761e-06, + "loss": 2.6119, + "step": 105885 + }, + { + "epoch": 7.1945916564750645, + "grad_norm": 10.782066345214844, + "learning_rate": 1.010497350183449e-06, + "loss": 2.7503, + "step": 105890 + }, + { + "epoch": 7.194931376545727, + "grad_norm": 7.5298380851745605, + "learning_rate": 1.0100727000951217e-06, + "loss": 2.8015, + "step": 105895 + }, + { + "epoch": 7.195271096616388, + "grad_norm": 7.4335222244262695, + "learning_rate": 1.0096480500067943e-06, + "loss": 2.8987, + "step": 105900 + }, + { + "epoch": 7.19561081668705, + "grad_norm": 7.71552038192749, + "learning_rate": 1.0092233999184673e-06, + "loss": 2.5764, + "step": 105905 + }, + { + "epoch": 7.195950536757712, + "grad_norm": 8.270758628845215, + "learning_rate": 1.0087987498301401e-06, + "loss": 2.605, + "step": 105910 + }, + { + "epoch": 7.196290256828373, + "grad_norm": 7.323399066925049, + "learning_rate": 1.008374099741813e-06, + "loss": 2.7267, + "step": 105915 + }, + { + "epoch": 7.196629976899035, + "grad_norm": 6.313686847686768, + "learning_rate": 1.0079494496534855e-06, + "loss": 2.6955, + "step": 105920 + }, + { + "epoch": 7.196969696969697, + "grad_norm": 7.135973930358887, + "learning_rate": 1.0075247995651583e-06, + "loss": 2.7413, + "step": 105925 + }, + { + "epoch": 7.197309417040358, + "grad_norm": 8.86043930053711, + "learning_rate": 1.0071001494768311e-06, + "loss": 2.5488, + "step": 105930 + }, + { + "epoch": 7.1976491371110205, + "grad_norm": 10.530817031860352, + "learning_rate": 1.006675499388504e-06, + "loss": 2.6992, + "step": 105935 + }, + { + "epoch": 7.197988857181683, + "grad_norm": 8.066862106323242, + "learning_rate": 1.0062508493001767e-06, + "loss": 2.9505, + "step": 105940 + }, + { + "epoch": 7.198328577252344, + "grad_norm": 6.61447811126709, + "learning_rate": 1.0058261992118495e-06, + "loss": 2.7623, + "step": 105945 + }, + { + "epoch": 7.198668297323006, + "grad_norm": 7.350464344024658, + "learning_rate": 1.0054015491235223e-06, + "loss": 2.96, + "step": 105950 + }, + { + "epoch": 7.199008017393668, + "grad_norm": 7.881575107574463, + "learning_rate": 1.0049768990351951e-06, + "loss": 2.7327, + "step": 105955 + }, + { + "epoch": 7.199347737464329, + "grad_norm": 7.795426368713379, + "learning_rate": 1.004552248946868e-06, + "loss": 2.8367, + "step": 105960 + }, + { + "epoch": 7.199687457534991, + "grad_norm": 10.13981819152832, + "learning_rate": 1.0041275988585407e-06, + "loss": 2.5093, + "step": 105965 + }, + { + "epoch": 7.200027177605653, + "grad_norm": 8.409256935119629, + "learning_rate": 1.0037029487702133e-06, + "loss": 2.8268, + "step": 105970 + }, + { + "epoch": 7.200366897676314, + "grad_norm": 6.4699482917785645, + "learning_rate": 1.0032782986818861e-06, + "loss": 2.6253, + "step": 105975 + }, + { + "epoch": 7.2007066177469765, + "grad_norm": 9.961845397949219, + "learning_rate": 1.002853648593559e-06, + "loss": 2.8192, + "step": 105980 + }, + { + "epoch": 7.201046337817639, + "grad_norm": 6.617745399475098, + "learning_rate": 1.0024289985052317e-06, + "loss": 2.5994, + "step": 105985 + }, + { + "epoch": 7.2013860578883, + "grad_norm": 8.042280197143555, + "learning_rate": 1.0020043484169045e-06, + "loss": 2.6841, + "step": 105990 + }, + { + "epoch": 7.201725777958962, + "grad_norm": 6.6707892417907715, + "learning_rate": 1.0015796983285773e-06, + "loss": 2.6962, + "step": 105995 + }, + { + "epoch": 7.202065498029624, + "grad_norm": 8.391739845275879, + "learning_rate": 1.0011550482402501e-06, + "loss": 2.5848, + "step": 106000 + }, + { + "epoch": 7.202405218100285, + "grad_norm": 7.1442060470581055, + "learning_rate": 1.000730398151923e-06, + "loss": 2.762, + "step": 106005 + }, + { + "epoch": 7.202744938170947, + "grad_norm": 9.2811861038208, + "learning_rate": 1.0003057480635957e-06, + "loss": 2.5494, + "step": 106010 + }, + { + "epoch": 7.203084658241609, + "grad_norm": 7.561613082885742, + "learning_rate": 9.998810979752685e-07, + "loss": 2.7579, + "step": 106015 + }, + { + "epoch": 7.2034243783122704, + "grad_norm": 8.984397888183594, + "learning_rate": 9.994564478869411e-07, + "loss": 2.8503, + "step": 106020 + }, + { + "epoch": 7.2037640983829325, + "grad_norm": 8.020155906677246, + "learning_rate": 9.99031797798614e-07, + "loss": 2.6566, + "step": 106025 + }, + { + "epoch": 7.204103818453595, + "grad_norm": 6.260512828826904, + "learning_rate": 9.986071477102867e-07, + "loss": 2.7842, + "step": 106030 + }, + { + "epoch": 7.204443538524256, + "grad_norm": 10.021385192871094, + "learning_rate": 9.981824976219597e-07, + "loss": 2.8512, + "step": 106035 + }, + { + "epoch": 7.204783258594918, + "grad_norm": 7.9101762771606445, + "learning_rate": 9.977578475336323e-07, + "loss": 2.46, + "step": 106040 + }, + { + "epoch": 7.20512297866558, + "grad_norm": 7.965175151824951, + "learning_rate": 9.973331974453051e-07, + "loss": 2.7597, + "step": 106045 + }, + { + "epoch": 7.205462698736241, + "grad_norm": 8.641672134399414, + "learning_rate": 9.96908547356978e-07, + "loss": 2.9349, + "step": 106050 + }, + { + "epoch": 7.205802418806903, + "grad_norm": 6.302975177764893, + "learning_rate": 9.964838972686507e-07, + "loss": 2.6683, + "step": 106055 + }, + { + "epoch": 7.206142138877565, + "grad_norm": 7.207335472106934, + "learning_rate": 9.960592471803235e-07, + "loss": 2.9959, + "step": 106060 + }, + { + "epoch": 7.2064818589482265, + "grad_norm": 8.145156860351562, + "learning_rate": 9.956345970919963e-07, + "loss": 2.6842, + "step": 106065 + }, + { + "epoch": 7.2068215790188885, + "grad_norm": 6.901100158691406, + "learning_rate": 9.95209947003669e-07, + "loss": 2.9398, + "step": 106070 + }, + { + "epoch": 7.207161299089551, + "grad_norm": 6.344216346740723, + "learning_rate": 9.947852969153417e-07, + "loss": 2.6636, + "step": 106075 + }, + { + "epoch": 7.207501019160212, + "grad_norm": 6.4112629890441895, + "learning_rate": 9.943606468270147e-07, + "loss": 2.5954, + "step": 106080 + }, + { + "epoch": 7.207840739230874, + "grad_norm": 8.537149429321289, + "learning_rate": 9.939359967386875e-07, + "loss": 2.7952, + "step": 106085 + }, + { + "epoch": 7.208180459301536, + "grad_norm": 8.085062980651855, + "learning_rate": 9.935113466503601e-07, + "loss": 2.5297, + "step": 106090 + }, + { + "epoch": 7.208520179372197, + "grad_norm": 9.526427268981934, + "learning_rate": 9.93086696562033e-07, + "loss": 2.7107, + "step": 106095 + }, + { + "epoch": 7.208859899442859, + "grad_norm": 6.951053619384766, + "learning_rate": 9.926620464737057e-07, + "loss": 2.9514, + "step": 106100 + }, + { + "epoch": 7.209199619513521, + "grad_norm": 7.681023120880127, + "learning_rate": 9.922373963853785e-07, + "loss": 2.8619, + "step": 106105 + }, + { + "epoch": 7.2095393395841825, + "grad_norm": 6.4099016189575195, + "learning_rate": 9.918127462970513e-07, + "loss": 2.8547, + "step": 106110 + }, + { + "epoch": 7.2098790596548445, + "grad_norm": 7.368464946746826, + "learning_rate": 9.913880962087241e-07, + "loss": 2.606, + "step": 106115 + }, + { + "epoch": 7.210218779725507, + "grad_norm": 8.042156219482422, + "learning_rate": 9.90963446120397e-07, + "loss": 2.7653, + "step": 106120 + }, + { + "epoch": 7.210558499796168, + "grad_norm": 8.244956970214844, + "learning_rate": 9.905387960320697e-07, + "loss": 2.8083, + "step": 106125 + }, + { + "epoch": 7.21089821986683, + "grad_norm": 7.6439995765686035, + "learning_rate": 9.901141459437425e-07, + "loss": 2.7648, + "step": 106130 + }, + { + "epoch": 7.211237939937492, + "grad_norm": 10.316307067871094, + "learning_rate": 9.896894958554153e-07, + "loss": 2.641, + "step": 106135 + }, + { + "epoch": 7.211577660008153, + "grad_norm": 8.952571868896484, + "learning_rate": 9.89264845767088e-07, + "loss": 2.5801, + "step": 106140 + }, + { + "epoch": 7.211917380078815, + "grad_norm": 7.332852363586426, + "learning_rate": 9.888401956787607e-07, + "loss": 2.6377, + "step": 106145 + }, + { + "epoch": 7.212257100149476, + "grad_norm": 6.146500587463379, + "learning_rate": 9.884155455904335e-07, + "loss": 2.6757, + "step": 106150 + }, + { + "epoch": 7.2125968202201385, + "grad_norm": 6.903100967407227, + "learning_rate": 9.879908955021063e-07, + "loss": 2.731, + "step": 106155 + }, + { + "epoch": 7.212936540290801, + "grad_norm": 8.558199882507324, + "learning_rate": 9.875662454137791e-07, + "loss": 2.5967, + "step": 106160 + }, + { + "epoch": 7.213276260361462, + "grad_norm": 7.7122979164123535, + "learning_rate": 9.87141595325452e-07, + "loss": 2.6497, + "step": 106165 + }, + { + "epoch": 7.213615980432124, + "grad_norm": 6.3619866371154785, + "learning_rate": 9.867169452371247e-07, + "loss": 2.7279, + "step": 106170 + }, + { + "epoch": 7.213955700502786, + "grad_norm": 7.2272467613220215, + "learning_rate": 9.862922951487975e-07, + "loss": 2.5879, + "step": 106175 + }, + { + "epoch": 7.214295420573447, + "grad_norm": 8.811673164367676, + "learning_rate": 9.858676450604703e-07, + "loss": 2.7203, + "step": 106180 + }, + { + "epoch": 7.214635140644109, + "grad_norm": 7.502418041229248, + "learning_rate": 9.854429949721431e-07, + "loss": 2.7901, + "step": 106185 + }, + { + "epoch": 7.214974860714771, + "grad_norm": 7.659355163574219, + "learning_rate": 9.850183448838157e-07, + "loss": 2.4407, + "step": 106190 + }, + { + "epoch": 7.215314580785432, + "grad_norm": 6.885815620422363, + "learning_rate": 9.845936947954885e-07, + "loss": 2.7227, + "step": 106195 + }, + { + "epoch": 7.2156543008560945, + "grad_norm": 6.760584354400635, + "learning_rate": 9.841690447071613e-07, + "loss": 2.5696, + "step": 106200 + }, + { + "epoch": 7.215994020926757, + "grad_norm": 6.565791606903076, + "learning_rate": 9.837443946188343e-07, + "loss": 2.9709, + "step": 106205 + }, + { + "epoch": 7.216333740997418, + "grad_norm": 7.672346591949463, + "learning_rate": 9.83319744530507e-07, + "loss": 2.6213, + "step": 106210 + }, + { + "epoch": 7.21667346106808, + "grad_norm": 6.574037075042725, + "learning_rate": 9.828950944421797e-07, + "loss": 2.5797, + "step": 106215 + }, + { + "epoch": 7.217013181138742, + "grad_norm": 10.562686920166016, + "learning_rate": 9.824704443538525e-07, + "loss": 2.6575, + "step": 106220 + }, + { + "epoch": 7.217352901209403, + "grad_norm": 7.92944860458374, + "learning_rate": 9.820457942655253e-07, + "loss": 2.6695, + "step": 106225 + }, + { + "epoch": 7.217692621280065, + "grad_norm": 10.391828536987305, + "learning_rate": 9.816211441771981e-07, + "loss": 3.0013, + "step": 106230 + }, + { + "epoch": 7.218032341350727, + "grad_norm": 8.941758155822754, + "learning_rate": 9.81196494088871e-07, + "loss": 2.748, + "step": 106235 + }, + { + "epoch": 7.218372061421388, + "grad_norm": 8.049941062927246, + "learning_rate": 9.807718440005435e-07, + "loss": 2.7284, + "step": 106240 + }, + { + "epoch": 7.2187117814920505, + "grad_norm": 8.547898292541504, + "learning_rate": 9.803471939122163e-07, + "loss": 2.3582, + "step": 106245 + }, + { + "epoch": 7.219051501562713, + "grad_norm": 9.656671524047852, + "learning_rate": 9.799225438238891e-07, + "loss": 2.703, + "step": 106250 + }, + { + "epoch": 7.219391221633374, + "grad_norm": 9.232255935668945, + "learning_rate": 9.794978937355621e-07, + "loss": 2.7142, + "step": 106255 + }, + { + "epoch": 7.219730941704036, + "grad_norm": 7.922831058502197, + "learning_rate": 9.790732436472347e-07, + "loss": 2.868, + "step": 106260 + }, + { + "epoch": 7.220070661774698, + "grad_norm": 6.109257698059082, + "learning_rate": 9.786485935589075e-07, + "loss": 2.8105, + "step": 106265 + }, + { + "epoch": 7.220410381845359, + "grad_norm": 7.500777244567871, + "learning_rate": 9.782239434705803e-07, + "loss": 2.6215, + "step": 106270 + }, + { + "epoch": 7.220750101916021, + "grad_norm": 8.931193351745605, + "learning_rate": 9.777992933822531e-07, + "loss": 2.7964, + "step": 106275 + }, + { + "epoch": 7.221089821986683, + "grad_norm": 9.193473815917969, + "learning_rate": 9.77374643293926e-07, + "loss": 2.9425, + "step": 106280 + }, + { + "epoch": 7.221429542057344, + "grad_norm": 10.748164176940918, + "learning_rate": 9.769499932055987e-07, + "loss": 2.9451, + "step": 106285 + }, + { + "epoch": 7.2217692621280065, + "grad_norm": 12.179668426513672, + "learning_rate": 9.765253431172715e-07, + "loss": 2.6625, + "step": 106290 + }, + { + "epoch": 7.222108982198669, + "grad_norm": 7.952839374542236, + "learning_rate": 9.761006930289441e-07, + "loss": 2.937, + "step": 106295 + }, + { + "epoch": 7.22244870226933, + "grad_norm": 7.227358341217041, + "learning_rate": 9.756760429406171e-07, + "loss": 2.6144, + "step": 106300 + }, + { + "epoch": 7.222788422339992, + "grad_norm": 7.7374749183654785, + "learning_rate": 9.7525139285229e-07, + "loss": 2.6952, + "step": 106305 + }, + { + "epoch": 7.223128142410654, + "grad_norm": 8.299359321594238, + "learning_rate": 9.748267427639625e-07, + "loss": 2.8957, + "step": 106310 + }, + { + "epoch": 7.223467862481315, + "grad_norm": 7.565348148345947, + "learning_rate": 9.744020926756353e-07, + "loss": 2.6254, + "step": 106315 + }, + { + "epoch": 7.223807582551977, + "grad_norm": 8.701656341552734, + "learning_rate": 9.739774425873081e-07, + "loss": 2.7123, + "step": 106320 + }, + { + "epoch": 7.224147302622639, + "grad_norm": 7.963233947753906, + "learning_rate": 9.73552792498981e-07, + "loss": 2.7566, + "step": 106325 + }, + { + "epoch": 7.2244870226933005, + "grad_norm": 8.940755844116211, + "learning_rate": 9.731281424106537e-07, + "loss": 2.5056, + "step": 106330 + }, + { + "epoch": 7.2248267427639625, + "grad_norm": 6.825336456298828, + "learning_rate": 9.727034923223265e-07, + "loss": 2.5873, + "step": 106335 + }, + { + "epoch": 7.225166462834625, + "grad_norm": 7.272945880889893, + "learning_rate": 9.722788422339993e-07, + "loss": 2.8221, + "step": 106340 + }, + { + "epoch": 7.225506182905286, + "grad_norm": 8.618428230285645, + "learning_rate": 9.71854192145672e-07, + "loss": 2.9928, + "step": 106345 + }, + { + "epoch": 7.225845902975948, + "grad_norm": 6.610442161560059, + "learning_rate": 9.71429542057345e-07, + "loss": 2.6727, + "step": 106350 + }, + { + "epoch": 7.22618562304661, + "grad_norm": 10.040436744689941, + "learning_rate": 9.710048919690177e-07, + "loss": 2.6735, + "step": 106355 + }, + { + "epoch": 7.226525343117271, + "grad_norm": 8.146082878112793, + "learning_rate": 9.705802418806903e-07, + "loss": 2.7267, + "step": 106360 + }, + { + "epoch": 7.226865063187933, + "grad_norm": 8.116862297058105, + "learning_rate": 9.70155591792363e-07, + "loss": 2.6847, + "step": 106365 + }, + { + "epoch": 7.227204783258595, + "grad_norm": 7.607934474945068, + "learning_rate": 9.69730941704036e-07, + "loss": 3.0755, + "step": 106370 + }, + { + "epoch": 7.2275445033292565, + "grad_norm": 6.8699140548706055, + "learning_rate": 9.693062916157087e-07, + "loss": 2.6732, + "step": 106375 + }, + { + "epoch": 7.2278842233999185, + "grad_norm": 9.004081726074219, + "learning_rate": 9.688816415273815e-07, + "loss": 2.9998, + "step": 106380 + }, + { + "epoch": 7.228223943470581, + "grad_norm": 8.947263717651367, + "learning_rate": 9.684569914390543e-07, + "loss": 2.3599, + "step": 106385 + }, + { + "epoch": 7.228563663541242, + "grad_norm": 7.914424419403076, + "learning_rate": 9.680323413507271e-07, + "loss": 2.7343, + "step": 106390 + }, + { + "epoch": 7.228903383611904, + "grad_norm": 9.564920425415039, + "learning_rate": 9.676076912624e-07, + "loss": 2.6792, + "step": 106395 + }, + { + "epoch": 7.229243103682566, + "grad_norm": 6.136417865753174, + "learning_rate": 9.671830411740727e-07, + "loss": 2.6504, + "step": 106400 + }, + { + "epoch": 7.229582823753227, + "grad_norm": 8.896201133728027, + "learning_rate": 9.667583910857455e-07, + "loss": 2.6074, + "step": 106405 + }, + { + "epoch": 7.229922543823889, + "grad_norm": 8.411800384521484, + "learning_rate": 9.66333740997418e-07, + "loss": 2.6975, + "step": 106410 + }, + { + "epoch": 7.230262263894551, + "grad_norm": 6.985508918762207, + "learning_rate": 9.65909090909091e-07, + "loss": 2.6695, + "step": 106415 + }, + { + "epoch": 7.2306019839652125, + "grad_norm": 9.800419807434082, + "learning_rate": 9.654844408207637e-07, + "loss": 2.7575, + "step": 106420 + }, + { + "epoch": 7.2309417040358746, + "grad_norm": 8.062477111816406, + "learning_rate": 9.650597907324365e-07, + "loss": 2.7397, + "step": 106425 + }, + { + "epoch": 7.231281424106537, + "grad_norm": 8.080543518066406, + "learning_rate": 9.646351406441093e-07, + "loss": 2.7473, + "step": 106430 + }, + { + "epoch": 7.231621144177198, + "grad_norm": 6.794626712799072, + "learning_rate": 9.64210490555782e-07, + "loss": 2.6967, + "step": 106435 + }, + { + "epoch": 7.23196086424786, + "grad_norm": 8.420416831970215, + "learning_rate": 9.63785840467455e-07, + "loss": 2.8383, + "step": 106440 + }, + { + "epoch": 7.232300584318522, + "grad_norm": 6.8772125244140625, + "learning_rate": 9.633611903791277e-07, + "loss": 2.7652, + "step": 106445 + }, + { + "epoch": 7.232640304389183, + "grad_norm": 7.486163139343262, + "learning_rate": 9.629365402908005e-07, + "loss": 2.8994, + "step": 106450 + }, + { + "epoch": 7.232980024459845, + "grad_norm": 8.378482818603516, + "learning_rate": 9.625118902024733e-07, + "loss": 2.5163, + "step": 106455 + }, + { + "epoch": 7.233319744530506, + "grad_norm": 9.089418411254883, + "learning_rate": 9.620872401141461e-07, + "loss": 2.587, + "step": 106460 + }, + { + "epoch": 7.2336594646011685, + "grad_norm": 7.170839786529541, + "learning_rate": 9.616625900258187e-07, + "loss": 2.7182, + "step": 106465 + }, + { + "epoch": 7.233999184671831, + "grad_norm": 9.7916898727417, + "learning_rate": 9.612379399374915e-07, + "loss": 2.6589, + "step": 106470 + }, + { + "epoch": 7.234338904742492, + "grad_norm": 8.964126586914062, + "learning_rate": 9.608132898491645e-07, + "loss": 2.6375, + "step": 106475 + }, + { + "epoch": 7.234678624813154, + "grad_norm": 7.2866058349609375, + "learning_rate": 9.60388639760837e-07, + "loss": 2.7285, + "step": 106480 + }, + { + "epoch": 7.235018344883816, + "grad_norm": 7.611893177032471, + "learning_rate": 9.5996398967251e-07, + "loss": 2.6646, + "step": 106485 + }, + { + "epoch": 7.235358064954477, + "grad_norm": 8.762697219848633, + "learning_rate": 9.595393395841827e-07, + "loss": 2.5411, + "step": 106490 + }, + { + "epoch": 7.235697785025139, + "grad_norm": 9.111235618591309, + "learning_rate": 9.591146894958555e-07, + "loss": 2.8748, + "step": 106495 + }, + { + "epoch": 7.236037505095801, + "grad_norm": 6.662254333496094, + "learning_rate": 9.586900394075283e-07, + "loss": 2.6443, + "step": 106500 + }, + { + "epoch": 7.236377225166462, + "grad_norm": 9.597932815551758, + "learning_rate": 9.58265389319201e-07, + "loss": 2.6274, + "step": 106505 + }, + { + "epoch": 7.2367169452371245, + "grad_norm": 8.4818696975708, + "learning_rate": 9.57840739230874e-07, + "loss": 2.7702, + "step": 106510 + }, + { + "epoch": 7.237056665307787, + "grad_norm": 8.800204277038574, + "learning_rate": 9.574160891425465e-07, + "loss": 2.6902, + "step": 106515 + }, + { + "epoch": 7.237396385378448, + "grad_norm": 8.189190864562988, + "learning_rate": 9.569914390542195e-07, + "loss": 2.8678, + "step": 106520 + }, + { + "epoch": 7.23773610544911, + "grad_norm": 5.9522385597229, + "learning_rate": 9.565667889658923e-07, + "loss": 2.436, + "step": 106525 + }, + { + "epoch": 7.238075825519772, + "grad_norm": 7.291375637054443, + "learning_rate": 9.56142138877565e-07, + "loss": 2.549, + "step": 106530 + }, + { + "epoch": 7.238415545590433, + "grad_norm": 7.280632495880127, + "learning_rate": 9.557174887892377e-07, + "loss": 2.7532, + "step": 106535 + }, + { + "epoch": 7.238755265661095, + "grad_norm": 7.763545036315918, + "learning_rate": 9.552928387009105e-07, + "loss": 2.9259, + "step": 106540 + }, + { + "epoch": 7.239094985731757, + "grad_norm": 8.89502239227295, + "learning_rate": 9.548681886125833e-07, + "loss": 2.838, + "step": 106545 + }, + { + "epoch": 7.239434705802418, + "grad_norm": 8.426985740661621, + "learning_rate": 9.54443538524256e-07, + "loss": 2.6623, + "step": 106550 + }, + { + "epoch": 7.2397744258730805, + "grad_norm": 8.078146934509277, + "learning_rate": 9.54018888435929e-07, + "loss": 2.6646, + "step": 106555 + }, + { + "epoch": 7.240114145943743, + "grad_norm": 8.733449935913086, + "learning_rate": 9.535942383476017e-07, + "loss": 2.7522, + "step": 106560 + }, + { + "epoch": 7.240453866014404, + "grad_norm": 8.41611099243164, + "learning_rate": 9.531695882592744e-07, + "loss": 2.7288, + "step": 106565 + }, + { + "epoch": 7.240793586085066, + "grad_norm": 7.787777900695801, + "learning_rate": 9.527449381709472e-07, + "loss": 2.8944, + "step": 106570 + }, + { + "epoch": 7.241133306155728, + "grad_norm": 8.09063720703125, + "learning_rate": 9.5232028808262e-07, + "loss": 2.8044, + "step": 106575 + }, + { + "epoch": 7.241473026226389, + "grad_norm": 7.33929443359375, + "learning_rate": 9.518956379942927e-07, + "loss": 2.6295, + "step": 106580 + }, + { + "epoch": 7.241812746297051, + "grad_norm": 7.599001407623291, + "learning_rate": 9.514709879059655e-07, + "loss": 2.7414, + "step": 106585 + }, + { + "epoch": 7.242152466367713, + "grad_norm": 5.862817764282227, + "learning_rate": 9.510463378176384e-07, + "loss": 2.8171, + "step": 106590 + }, + { + "epoch": 7.2424921864383744, + "grad_norm": 8.999138832092285, + "learning_rate": 9.506216877293112e-07, + "loss": 2.6112, + "step": 106595 + }, + { + "epoch": 7.2428319065090365, + "grad_norm": 8.596128463745117, + "learning_rate": 9.501970376409839e-07, + "loss": 2.8668, + "step": 106600 + }, + { + "epoch": 7.243171626579699, + "grad_norm": 9.36375904083252, + "learning_rate": 9.497723875526567e-07, + "loss": 2.5961, + "step": 106605 + }, + { + "epoch": 7.24351134665036, + "grad_norm": 6.925714015960693, + "learning_rate": 9.493477374643295e-07, + "loss": 2.6908, + "step": 106610 + }, + { + "epoch": 7.243851066721022, + "grad_norm": 11.420615196228027, + "learning_rate": 9.489230873760022e-07, + "loss": 2.8532, + "step": 106615 + }, + { + "epoch": 7.244190786791684, + "grad_norm": 8.601739883422852, + "learning_rate": 9.48498437287675e-07, + "loss": 2.4559, + "step": 106620 + }, + { + "epoch": 7.244530506862345, + "grad_norm": 6.583225727081299, + "learning_rate": 9.480737871993478e-07, + "loss": 2.6503, + "step": 106625 + }, + { + "epoch": 7.244870226933007, + "grad_norm": 8.717860221862793, + "learning_rate": 9.476491371110207e-07, + "loss": 2.7401, + "step": 106630 + }, + { + "epoch": 7.245209947003669, + "grad_norm": 6.783697128295898, + "learning_rate": 9.472244870226934e-07, + "loss": 2.9223, + "step": 106635 + }, + { + "epoch": 7.2455496670743305, + "grad_norm": 6.611771583557129, + "learning_rate": 9.467998369343662e-07, + "loss": 2.6511, + "step": 106640 + }, + { + "epoch": 7.2458893871449925, + "grad_norm": 8.025530815124512, + "learning_rate": 9.46375186846039e-07, + "loss": 2.6898, + "step": 106645 + }, + { + "epoch": 7.246229107215655, + "grad_norm": 6.118989944458008, + "learning_rate": 9.459505367577117e-07, + "loss": 2.8448, + "step": 106650 + }, + { + "epoch": 7.246568827286316, + "grad_norm": 8.766334533691406, + "learning_rate": 9.455258866693845e-07, + "loss": 2.4869, + "step": 106655 + }, + { + "epoch": 7.246908547356978, + "grad_norm": 5.77450704574585, + "learning_rate": 9.451012365810573e-07, + "loss": 2.578, + "step": 106660 + }, + { + "epoch": 7.24724826742764, + "grad_norm": 6.797705173492432, + "learning_rate": 9.4467658649273e-07, + "loss": 2.8288, + "step": 106665 + }, + { + "epoch": 7.247587987498301, + "grad_norm": 8.318942070007324, + "learning_rate": 9.442519364044028e-07, + "loss": 2.4778, + "step": 106670 + }, + { + "epoch": 7.247927707568963, + "grad_norm": 6.921727180480957, + "learning_rate": 9.438272863160757e-07, + "loss": 2.6625, + "step": 106675 + }, + { + "epoch": 7.248267427639625, + "grad_norm": 6.878922462463379, + "learning_rate": 9.434026362277485e-07, + "loss": 2.7435, + "step": 106680 + }, + { + "epoch": 7.2486071477102865, + "grad_norm": 8.270376205444336, + "learning_rate": 9.429779861394212e-07, + "loss": 2.9002, + "step": 106685 + }, + { + "epoch": 7.2489468677809485, + "grad_norm": 10.800959587097168, + "learning_rate": 9.42553336051094e-07, + "loss": 2.567, + "step": 106690 + }, + { + "epoch": 7.249286587851611, + "grad_norm": 6.912361145019531, + "learning_rate": 9.421286859627668e-07, + "loss": 2.6891, + "step": 106695 + }, + { + "epoch": 7.249626307922272, + "grad_norm": 7.275270462036133, + "learning_rate": 9.417040358744395e-07, + "loss": 2.7213, + "step": 106700 + }, + { + "epoch": 7.249966027992934, + "grad_norm": 7.373057842254639, + "learning_rate": 9.412793857861123e-07, + "loss": 2.797, + "step": 106705 + }, + { + "epoch": 7.250305748063596, + "grad_norm": 6.452434539794922, + "learning_rate": 9.408547356977851e-07, + "loss": 2.7167, + "step": 106710 + }, + { + "epoch": 7.250645468134257, + "grad_norm": 6.4691009521484375, + "learning_rate": 9.40430085609458e-07, + "loss": 2.6009, + "step": 106715 + }, + { + "epoch": 7.250985188204919, + "grad_norm": 8.162636756896973, + "learning_rate": 9.400054355211306e-07, + "loss": 2.7109, + "step": 106720 + }, + { + "epoch": 7.251324908275581, + "grad_norm": 8.950014114379883, + "learning_rate": 9.395807854328035e-07, + "loss": 2.7983, + "step": 106725 + }, + { + "epoch": 7.2516646283462425, + "grad_norm": 7.878164291381836, + "learning_rate": 9.391561353444763e-07, + "loss": 2.7292, + "step": 106730 + }, + { + "epoch": 7.2520043484169046, + "grad_norm": 10.101112365722656, + "learning_rate": 9.38731485256149e-07, + "loss": 2.6264, + "step": 106735 + }, + { + "epoch": 7.252344068487567, + "grad_norm": 7.923755168914795, + "learning_rate": 9.383068351678218e-07, + "loss": 2.7333, + "step": 106740 + }, + { + "epoch": 7.252683788558228, + "grad_norm": 7.980396747589111, + "learning_rate": 9.378821850794946e-07, + "loss": 2.6898, + "step": 106745 + }, + { + "epoch": 7.25302350862889, + "grad_norm": 8.251115798950195, + "learning_rate": 9.374575349911673e-07, + "loss": 2.5809, + "step": 106750 + }, + { + "epoch": 7.253363228699552, + "grad_norm": 8.409238815307617, + "learning_rate": 9.370328849028401e-07, + "loss": 2.2727, + "step": 106755 + }, + { + "epoch": 7.253702948770213, + "grad_norm": 7.3151021003723145, + "learning_rate": 9.366082348145129e-07, + "loss": 2.7222, + "step": 106760 + }, + { + "epoch": 7.254042668840875, + "grad_norm": NaN, + "learning_rate": 9.362685147438512e-07, + "loss": 2.4414, + "step": 106765 + }, + { + "epoch": 7.254382388911537, + "grad_norm": 9.093890190124512, + "learning_rate": 9.358438646555239e-07, + "loss": 2.5051, + "step": 106770 + }, + { + "epoch": 7.2547221089821985, + "grad_norm": 9.626092910766602, + "learning_rate": 9.354192145671967e-07, + "loss": 2.8117, + "step": 106775 + }, + { + "epoch": 7.255061829052861, + "grad_norm": 8.029305458068848, + "learning_rate": 9.349945644788696e-07, + "loss": 2.7433, + "step": 106780 + }, + { + "epoch": 7.255401549123523, + "grad_norm": 7.883792877197266, + "learning_rate": 9.345699143905422e-07, + "loss": 2.5533, + "step": 106785 + }, + { + "epoch": 7.255741269194184, + "grad_norm": 7.851497173309326, + "learning_rate": 9.341452643022151e-07, + "loss": 2.6738, + "step": 106790 + }, + { + "epoch": 7.256080989264846, + "grad_norm": 7.975062847137451, + "learning_rate": 9.337206142138879e-07, + "loss": 2.6612, + "step": 106795 + }, + { + "epoch": 7.256420709335508, + "grad_norm": 6.921911716461182, + "learning_rate": 9.332959641255607e-07, + "loss": 2.7459, + "step": 106800 + }, + { + "epoch": 7.256760429406169, + "grad_norm": 8.374539375305176, + "learning_rate": 9.328713140372334e-07, + "loss": 2.6453, + "step": 106805 + }, + { + "epoch": 7.257100149476831, + "grad_norm": 10.215455055236816, + "learning_rate": 9.324466639489062e-07, + "loss": 2.6538, + "step": 106810 + }, + { + "epoch": 7.257439869547493, + "grad_norm": 11.824597358703613, + "learning_rate": 9.32022013860579e-07, + "loss": 2.9074, + "step": 106815 + }, + { + "epoch": 7.2577795896181545, + "grad_norm": 8.076260566711426, + "learning_rate": 9.315973637722517e-07, + "loss": 2.8989, + "step": 106820 + }, + { + "epoch": 7.258119309688817, + "grad_norm": 11.220206260681152, + "learning_rate": 9.311727136839245e-07, + "loss": 2.7656, + "step": 106825 + }, + { + "epoch": 7.258459029759479, + "grad_norm": 7.1772284507751465, + "learning_rate": 9.307480635955974e-07, + "loss": 2.7519, + "step": 106830 + }, + { + "epoch": 7.25879874983014, + "grad_norm": 9.077616691589355, + "learning_rate": 9.303234135072702e-07, + "loss": 2.8589, + "step": 106835 + }, + { + "epoch": 7.259138469900802, + "grad_norm": 9.833687782287598, + "learning_rate": 9.298987634189429e-07, + "loss": 2.7503, + "step": 106840 + }, + { + "epoch": 7.259478189971463, + "grad_norm": 7.66193962097168, + "learning_rate": 9.294741133306157e-07, + "loss": 2.9025, + "step": 106845 + }, + { + "epoch": 7.259817910042125, + "grad_norm": 5.917505264282227, + "learning_rate": 9.290494632422885e-07, + "loss": 2.5273, + "step": 106850 + }, + { + "epoch": 7.260157630112787, + "grad_norm": 7.53303861618042, + "learning_rate": 9.286248131539612e-07, + "loss": 2.6958, + "step": 106855 + }, + { + "epoch": 7.260497350183448, + "grad_norm": 8.212607383728027, + "learning_rate": 9.28200163065634e-07, + "loss": 2.6257, + "step": 106860 + }, + { + "epoch": 7.2608370702541105, + "grad_norm": 9.111879348754883, + "learning_rate": 9.277755129773068e-07, + "loss": 2.385, + "step": 106865 + }, + { + "epoch": 7.261176790324773, + "grad_norm": 7.540946960449219, + "learning_rate": 9.273508628889795e-07, + "loss": 2.7669, + "step": 106870 + }, + { + "epoch": 7.261516510395434, + "grad_norm": 5.508481502532959, + "learning_rate": 9.269262128006524e-07, + "loss": 2.6302, + "step": 106875 + }, + { + "epoch": 7.261856230466096, + "grad_norm": 8.283979415893555, + "learning_rate": 9.265015627123252e-07, + "loss": 2.8048, + "step": 106880 + }, + { + "epoch": 7.262195950536758, + "grad_norm": 8.216681480407715, + "learning_rate": 9.26076912623998e-07, + "loss": 3.0852, + "step": 106885 + }, + { + "epoch": 7.262535670607419, + "grad_norm": 9.210396766662598, + "learning_rate": 9.256522625356707e-07, + "loss": 2.5159, + "step": 106890 + }, + { + "epoch": 7.262875390678081, + "grad_norm": 6.959631443023682, + "learning_rate": 9.252276124473435e-07, + "loss": 2.6962, + "step": 106895 + }, + { + "epoch": 7.263215110748743, + "grad_norm": 8.93163013458252, + "learning_rate": 9.248029623590163e-07, + "loss": 2.6665, + "step": 106900 + }, + { + "epoch": 7.2635548308194045, + "grad_norm": 9.095648765563965, + "learning_rate": 9.24378312270689e-07, + "loss": 2.6557, + "step": 106905 + }, + { + "epoch": 7.2638945508900665, + "grad_norm": 8.265477180480957, + "learning_rate": 9.239536621823618e-07, + "loss": 2.6354, + "step": 106910 + }, + { + "epoch": 7.264234270960729, + "grad_norm": 6.961041450500488, + "learning_rate": 9.235290120940347e-07, + "loss": 2.7069, + "step": 106915 + }, + { + "epoch": 7.26457399103139, + "grad_norm": 9.344313621520996, + "learning_rate": 9.231043620057075e-07, + "loss": 2.8483, + "step": 106920 + }, + { + "epoch": 7.264913711102052, + "grad_norm": 7.778683662414551, + "learning_rate": 9.226797119173802e-07, + "loss": 2.7517, + "step": 106925 + }, + { + "epoch": 7.265253431172714, + "grad_norm": 7.516369819641113, + "learning_rate": 9.22255061829053e-07, + "loss": 2.5637, + "step": 106930 + }, + { + "epoch": 7.265593151243375, + "grad_norm": 9.548513412475586, + "learning_rate": 9.218304117407258e-07, + "loss": 2.7579, + "step": 106935 + }, + { + "epoch": 7.265932871314037, + "grad_norm": 5.657835483551025, + "learning_rate": 9.214057616523985e-07, + "loss": 2.6802, + "step": 106940 + }, + { + "epoch": 7.266272591384699, + "grad_norm": 7.610093116760254, + "learning_rate": 9.209811115640713e-07, + "loss": 2.7972, + "step": 106945 + }, + { + "epoch": 7.2666123114553605, + "grad_norm": 9.7278470993042, + "learning_rate": 9.205564614757441e-07, + "loss": 2.8716, + "step": 106950 + }, + { + "epoch": 7.2669520315260225, + "grad_norm": 8.351678848266602, + "learning_rate": 9.201318113874167e-07, + "loss": 2.7322, + "step": 106955 + }, + { + "epoch": 7.267291751596685, + "grad_norm": 7.371947288513184, + "learning_rate": 9.197071612990896e-07, + "loss": 2.6725, + "step": 106960 + }, + { + "epoch": 7.267631471667346, + "grad_norm": 10.78805923461914, + "learning_rate": 9.192825112107625e-07, + "loss": 2.7507, + "step": 106965 + }, + { + "epoch": 7.267971191738008, + "grad_norm": 8.187137603759766, + "learning_rate": 9.188578611224353e-07, + "loss": 2.6358, + "step": 106970 + }, + { + "epoch": 7.26831091180867, + "grad_norm": 9.095118522644043, + "learning_rate": 9.18433211034108e-07, + "loss": 2.806, + "step": 106975 + }, + { + "epoch": 7.268650631879331, + "grad_norm": 9.127182006835938, + "learning_rate": 9.180085609457808e-07, + "loss": 2.8309, + "step": 106980 + }, + { + "epoch": 7.268990351949993, + "grad_norm": 9.55671215057373, + "learning_rate": 9.175839108574536e-07, + "loss": 2.7159, + "step": 106985 + }, + { + "epoch": 7.269330072020655, + "grad_norm": 7.965484142303467, + "learning_rate": 9.171592607691262e-07, + "loss": 2.7882, + "step": 106990 + }, + { + "epoch": 7.2696697920913165, + "grad_norm": 9.766633033752441, + "learning_rate": 9.16734610680799e-07, + "loss": 2.8259, + "step": 106995 + }, + { + "epoch": 7.2700095121619785, + "grad_norm": 8.480485916137695, + "learning_rate": 9.163099605924719e-07, + "loss": 2.6563, + "step": 107000 + }, + { + "epoch": 7.270349232232641, + "grad_norm": 6.831821441650391, + "learning_rate": 9.158853105041448e-07, + "loss": 2.6931, + "step": 107005 + }, + { + "epoch": 7.270688952303302, + "grad_norm": 7.380340576171875, + "learning_rate": 9.154606604158175e-07, + "loss": 2.6541, + "step": 107010 + }, + { + "epoch": 7.271028672373964, + "grad_norm": 7.278806686401367, + "learning_rate": 9.150360103274903e-07, + "loss": 2.8392, + "step": 107015 + }, + { + "epoch": 7.271368392444626, + "grad_norm": 9.742876052856445, + "learning_rate": 9.146113602391631e-07, + "loss": 2.7033, + "step": 107020 + }, + { + "epoch": 7.271708112515287, + "grad_norm": 8.81979751586914, + "learning_rate": 9.141867101508357e-07, + "loss": 2.777, + "step": 107025 + }, + { + "epoch": 7.272047832585949, + "grad_norm": 7.496324062347412, + "learning_rate": 9.137620600625085e-07, + "loss": 2.5635, + "step": 107030 + }, + { + "epoch": 7.272387552656611, + "grad_norm": 7.83656120300293, + "learning_rate": 9.133374099741814e-07, + "loss": 2.5331, + "step": 107035 + }, + { + "epoch": 7.2727272727272725, + "grad_norm": 8.01039981842041, + "learning_rate": 9.12912759885854e-07, + "loss": 2.7102, + "step": 107040 + }, + { + "epoch": 7.273066992797935, + "grad_norm": 7.87438440322876, + "learning_rate": 9.124881097975268e-07, + "loss": 2.8495, + "step": 107045 + }, + { + "epoch": 7.273406712868597, + "grad_norm": 7.079061508178711, + "learning_rate": 9.120634597091998e-07, + "loss": 2.483, + "step": 107050 + }, + { + "epoch": 7.273746432939258, + "grad_norm": 7.652215480804443, + "learning_rate": 9.116388096208726e-07, + "loss": 2.6442, + "step": 107055 + }, + { + "epoch": 7.27408615300992, + "grad_norm": 8.443690299987793, + "learning_rate": 9.112141595325452e-07, + "loss": 2.7929, + "step": 107060 + }, + { + "epoch": 7.274425873080582, + "grad_norm": 7.661357402801514, + "learning_rate": 9.10789509444218e-07, + "loss": 2.5352, + "step": 107065 + }, + { + "epoch": 7.274765593151243, + "grad_norm": 7.082754135131836, + "learning_rate": 9.103648593558908e-07, + "loss": 2.617, + "step": 107070 + }, + { + "epoch": 7.275105313221905, + "grad_norm": 7.36772346496582, + "learning_rate": 9.099402092675635e-07, + "loss": 2.7424, + "step": 107075 + }, + { + "epoch": 7.275445033292567, + "grad_norm": 8.380769729614258, + "learning_rate": 9.095155591792363e-07, + "loss": 2.8218, + "step": 107080 + }, + { + "epoch": 7.2757847533632285, + "grad_norm": 6.34929084777832, + "learning_rate": 9.090909090909091e-07, + "loss": 2.6225, + "step": 107085 + }, + { + "epoch": 7.276124473433891, + "grad_norm": 8.269854545593262, + "learning_rate": 9.08666259002582e-07, + "loss": 2.7324, + "step": 107090 + }, + { + "epoch": 7.276464193504553, + "grad_norm": 7.4502949714660645, + "learning_rate": 9.082416089142547e-07, + "loss": 2.6839, + "step": 107095 + }, + { + "epoch": 7.276803913575214, + "grad_norm": 9.030095100402832, + "learning_rate": 9.078169588259275e-07, + "loss": 2.6355, + "step": 107100 + }, + { + "epoch": 7.277143633645876, + "grad_norm": 12.68734359741211, + "learning_rate": 9.073923087376003e-07, + "loss": 2.9911, + "step": 107105 + }, + { + "epoch": 7.277483353716538, + "grad_norm": 6.953848838806152, + "learning_rate": 9.06967658649273e-07, + "loss": 2.809, + "step": 107110 + }, + { + "epoch": 7.277823073787199, + "grad_norm": 7.060987949371338, + "learning_rate": 9.065430085609458e-07, + "loss": 2.9401, + "step": 107115 + }, + { + "epoch": 7.278162793857861, + "grad_norm": 7.35673189163208, + "learning_rate": 9.061183584726186e-07, + "loss": 2.6861, + "step": 107120 + }, + { + "epoch": 7.278502513928522, + "grad_norm": 7.313431262969971, + "learning_rate": 9.056937083842913e-07, + "loss": 2.7949, + "step": 107125 + }, + { + "epoch": 7.2788422339991845, + "grad_norm": 8.488419532775879, + "learning_rate": 9.052690582959641e-07, + "loss": 2.773, + "step": 107130 + }, + { + "epoch": 7.279181954069847, + "grad_norm": 7.373781204223633, + "learning_rate": 9.04844408207637e-07, + "loss": 2.5651, + "step": 107135 + }, + { + "epoch": 7.279521674140508, + "grad_norm": 9.42931079864502, + "learning_rate": 9.044197581193098e-07, + "loss": 2.5609, + "step": 107140 + }, + { + "epoch": 7.27986139421117, + "grad_norm": 7.881364822387695, + "learning_rate": 9.039951080309825e-07, + "loss": 2.6993, + "step": 107145 + }, + { + "epoch": 7.280201114281832, + "grad_norm": 7.949563503265381, + "learning_rate": 9.035704579426553e-07, + "loss": 2.7448, + "step": 107150 + }, + { + "epoch": 7.280540834352493, + "grad_norm": 7.158080101013184, + "learning_rate": 9.031458078543281e-07, + "loss": 2.6121, + "step": 107155 + }, + { + "epoch": 7.280880554423155, + "grad_norm": 7.012019634246826, + "learning_rate": 9.027211577660008e-07, + "loss": 2.5597, + "step": 107160 + }, + { + "epoch": 7.281220274493817, + "grad_norm": 7.787356853485107, + "learning_rate": 9.022965076776736e-07, + "loss": 2.6268, + "step": 107165 + }, + { + "epoch": 7.2815599945644784, + "grad_norm": 11.037877082824707, + "learning_rate": 9.018718575893464e-07, + "loss": 2.6776, + "step": 107170 + }, + { + "epoch": 7.2818997146351405, + "grad_norm": 7.574409008026123, + "learning_rate": 9.014472075010193e-07, + "loss": 2.8339, + "step": 107175 + }, + { + "epoch": 7.282239434705803, + "grad_norm": 8.864795684814453, + "learning_rate": 9.010225574126919e-07, + "loss": 2.7241, + "step": 107180 + }, + { + "epoch": 7.282579154776464, + "grad_norm": 6.395377159118652, + "learning_rate": 9.005979073243648e-07, + "loss": 2.7532, + "step": 107185 + }, + { + "epoch": 7.282918874847126, + "grad_norm": 9.59575080871582, + "learning_rate": 9.001732572360376e-07, + "loss": 2.731, + "step": 107190 + }, + { + "epoch": 7.283258594917788, + "grad_norm": 8.793062210083008, + "learning_rate": 8.997486071477103e-07, + "loss": 2.6906, + "step": 107195 + }, + { + "epoch": 7.283598314988449, + "grad_norm": 7.989499092102051, + "learning_rate": 8.993239570593831e-07, + "loss": 2.6718, + "step": 107200 + }, + { + "epoch": 7.283938035059111, + "grad_norm": 8.434430122375488, + "learning_rate": 8.988993069710559e-07, + "loss": 2.6701, + "step": 107205 + }, + { + "epoch": 7.284277755129773, + "grad_norm": 6.758728504180908, + "learning_rate": 8.984746568827286e-07, + "loss": 2.6718, + "step": 107210 + }, + { + "epoch": 7.2846174752004345, + "grad_norm": 7.743309497833252, + "learning_rate": 8.980500067944014e-07, + "loss": 2.5637, + "step": 107215 + }, + { + "epoch": 7.2849571952710965, + "grad_norm": 8.881295204162598, + "learning_rate": 8.976253567060742e-07, + "loss": 2.5047, + "step": 107220 + }, + { + "epoch": 7.285296915341759, + "grad_norm": 8.484530448913574, + "learning_rate": 8.972007066177471e-07, + "loss": 2.7719, + "step": 107225 + }, + { + "epoch": 7.28563663541242, + "grad_norm": 9.427203178405762, + "learning_rate": 8.967760565294198e-07, + "loss": 2.9518, + "step": 107230 + }, + { + "epoch": 7.285976355483082, + "grad_norm": 9.972919464111328, + "learning_rate": 8.963514064410926e-07, + "loss": 2.6636, + "step": 107235 + }, + { + "epoch": 7.286316075553744, + "grad_norm": 6.205423355102539, + "learning_rate": 8.959267563527654e-07, + "loss": 2.5866, + "step": 107240 + }, + { + "epoch": 7.286655795624405, + "grad_norm": 9.254122734069824, + "learning_rate": 8.955021062644381e-07, + "loss": 2.5444, + "step": 107245 + }, + { + "epoch": 7.286995515695067, + "grad_norm": 9.082417488098145, + "learning_rate": 8.950774561761109e-07, + "loss": 2.5705, + "step": 107250 + }, + { + "epoch": 7.287335235765729, + "grad_norm": 8.265817642211914, + "learning_rate": 8.946528060877837e-07, + "loss": 2.817, + "step": 107255 + }, + { + "epoch": 7.2876749558363905, + "grad_norm": 7.193954944610596, + "learning_rate": 8.942281559994565e-07, + "loss": 2.9489, + "step": 107260 + }, + { + "epoch": 7.2880146759070525, + "grad_norm": 8.936059951782227, + "learning_rate": 8.938035059111292e-07, + "loss": 2.5737, + "step": 107265 + }, + { + "epoch": 7.288354395977715, + "grad_norm": 6.878145217895508, + "learning_rate": 8.933788558228021e-07, + "loss": 2.6851, + "step": 107270 + }, + { + "epoch": 7.288694116048376, + "grad_norm": 8.650972366333008, + "learning_rate": 8.929542057344749e-07, + "loss": 2.7192, + "step": 107275 + }, + { + "epoch": 7.289033836119038, + "grad_norm": 7.957695960998535, + "learning_rate": 8.925295556461476e-07, + "loss": 2.3495, + "step": 107280 + }, + { + "epoch": 7.2893735561897, + "grad_norm": 8.113895416259766, + "learning_rate": 8.921049055578204e-07, + "loss": 2.7112, + "step": 107285 + }, + { + "epoch": 7.289713276260361, + "grad_norm": 6.531818866729736, + "learning_rate": 8.916802554694932e-07, + "loss": 2.7598, + "step": 107290 + }, + { + "epoch": 7.290052996331023, + "grad_norm": 8.222046852111816, + "learning_rate": 8.912556053811659e-07, + "loss": 2.8097, + "step": 107295 + }, + { + "epoch": 7.290392716401685, + "grad_norm": 7.386383056640625, + "learning_rate": 8.908309552928387e-07, + "loss": 3.0602, + "step": 107300 + }, + { + "epoch": 7.2907324364723465, + "grad_norm": 8.051446914672852, + "learning_rate": 8.904063052045115e-07, + "loss": 2.7633, + "step": 107305 + }, + { + "epoch": 7.2910721565430086, + "grad_norm": 8.318888664245605, + "learning_rate": 8.899816551161844e-07, + "loss": 2.7927, + "step": 107310 + }, + { + "epoch": 7.291411876613671, + "grad_norm": 6.546463966369629, + "learning_rate": 8.89557005027857e-07, + "loss": 2.8394, + "step": 107315 + }, + { + "epoch": 7.291751596684332, + "grad_norm": 8.50214958190918, + "learning_rate": 8.891323549395299e-07, + "loss": 2.8496, + "step": 107320 + }, + { + "epoch": 7.292091316754994, + "grad_norm": 10.202154159545898, + "learning_rate": 8.887077048512027e-07, + "loss": 2.8219, + "step": 107325 + }, + { + "epoch": 7.292431036825656, + "grad_norm": 8.3466215133667, + "learning_rate": 8.882830547628754e-07, + "loss": 2.8433, + "step": 107330 + }, + { + "epoch": 7.292770756896317, + "grad_norm": 7.682055473327637, + "learning_rate": 8.878584046745482e-07, + "loss": 2.6228, + "step": 107335 + }, + { + "epoch": 7.293110476966979, + "grad_norm": 6.940911769866943, + "learning_rate": 8.87433754586221e-07, + "loss": 2.8543, + "step": 107340 + }, + { + "epoch": 7.293450197037641, + "grad_norm": 6.391060829162598, + "learning_rate": 8.870091044978938e-07, + "loss": 2.9402, + "step": 107345 + }, + { + "epoch": 7.2937899171083025, + "grad_norm": 7.2267608642578125, + "learning_rate": 8.865844544095665e-07, + "loss": 2.913, + "step": 107350 + }, + { + "epoch": 7.294129637178965, + "grad_norm": 7.425749778747559, + "learning_rate": 8.861598043212393e-07, + "loss": 2.45, + "step": 107355 + }, + { + "epoch": 7.294469357249627, + "grad_norm": 6.296276569366455, + "learning_rate": 8.857351542329122e-07, + "loss": 2.7816, + "step": 107360 + }, + { + "epoch": 7.294809077320288, + "grad_norm": 7.1641621589660645, + "learning_rate": 8.853105041445849e-07, + "loss": 2.8474, + "step": 107365 + }, + { + "epoch": 7.29514879739095, + "grad_norm": 9.055822372436523, + "learning_rate": 8.848858540562577e-07, + "loss": 2.5398, + "step": 107370 + }, + { + "epoch": 7.295488517461612, + "grad_norm": 8.116527557373047, + "learning_rate": 8.844612039679305e-07, + "loss": 2.6332, + "step": 107375 + }, + { + "epoch": 7.295828237532273, + "grad_norm": 9.131979942321777, + "learning_rate": 8.840365538796032e-07, + "loss": 2.6621, + "step": 107380 + }, + { + "epoch": 7.296167957602935, + "grad_norm": 8.424589157104492, + "learning_rate": 8.83611903791276e-07, + "loss": 2.6745, + "step": 107385 + }, + { + "epoch": 7.296507677673597, + "grad_norm": 6.532317638397217, + "learning_rate": 8.831872537029488e-07, + "loss": 2.9619, + "step": 107390 + }, + { + "epoch": 7.2968473977442585, + "grad_norm": 8.583907127380371, + "learning_rate": 8.827626036146217e-07, + "loss": 2.6986, + "step": 107395 + }, + { + "epoch": 7.297187117814921, + "grad_norm": 8.014986991882324, + "learning_rate": 8.823379535262943e-07, + "loss": 2.9166, + "step": 107400 + }, + { + "epoch": 7.297526837885583, + "grad_norm": 7.111091613769531, + "learning_rate": 8.819133034379672e-07, + "loss": 2.7355, + "step": 107405 + }, + { + "epoch": 7.297866557956244, + "grad_norm": 8.837662696838379, + "learning_rate": 8.8148865334964e-07, + "loss": 2.7831, + "step": 107410 + }, + { + "epoch": 7.298206278026906, + "grad_norm": 7.899513244628906, + "learning_rate": 8.810640032613127e-07, + "loss": 2.7553, + "step": 107415 + }, + { + "epoch": 7.298545998097568, + "grad_norm": 9.010810852050781, + "learning_rate": 8.806393531729855e-07, + "loss": 2.7602, + "step": 107420 + }, + { + "epoch": 7.298885718168229, + "grad_norm": 7.831737995147705, + "learning_rate": 8.802147030846583e-07, + "loss": 2.6402, + "step": 107425 + }, + { + "epoch": 7.299225438238891, + "grad_norm": 7.166820526123047, + "learning_rate": 8.797900529963311e-07, + "loss": 2.7407, + "step": 107430 + }, + { + "epoch": 7.299565158309553, + "grad_norm": 8.437785148620605, + "learning_rate": 8.793654029080038e-07, + "loss": 2.542, + "step": 107435 + }, + { + "epoch": 7.2999048783802145, + "grad_norm": 6.683592796325684, + "learning_rate": 8.789407528196766e-07, + "loss": 2.6617, + "step": 107440 + }, + { + "epoch": 7.300244598450877, + "grad_norm": 7.388490200042725, + "learning_rate": 8.785161027313495e-07, + "loss": 2.4788, + "step": 107445 + }, + { + "epoch": 7.300584318521539, + "grad_norm": 7.588757514953613, + "learning_rate": 8.780914526430222e-07, + "loss": 2.9079, + "step": 107450 + }, + { + "epoch": 7.3009240385922, + "grad_norm": 6.287565231323242, + "learning_rate": 8.77666802554695e-07, + "loss": 2.552, + "step": 107455 + }, + { + "epoch": 7.301263758662862, + "grad_norm": 8.544292449951172, + "learning_rate": 8.772421524663678e-07, + "loss": 2.685, + "step": 107460 + }, + { + "epoch": 7.301603478733524, + "grad_norm": 6.683928966522217, + "learning_rate": 8.768175023780405e-07, + "loss": 2.6491, + "step": 107465 + }, + { + "epoch": 7.301943198804185, + "grad_norm": 10.004426002502441, + "learning_rate": 8.763928522897133e-07, + "loss": 2.9805, + "step": 107470 + }, + { + "epoch": 7.302282918874847, + "grad_norm": 8.808972358703613, + "learning_rate": 8.759682022013861e-07, + "loss": 2.7152, + "step": 107475 + }, + { + "epoch": 7.302622638945509, + "grad_norm": 6.3907952308654785, + "learning_rate": 8.755435521130589e-07, + "loss": 2.7998, + "step": 107480 + }, + { + "epoch": 7.3029623590161705, + "grad_norm": 8.320813179016113, + "learning_rate": 8.751189020247316e-07, + "loss": 2.6543, + "step": 107485 + }, + { + "epoch": 7.303302079086833, + "grad_norm": 7.025247573852539, + "learning_rate": 8.746942519364045e-07, + "loss": 2.5477, + "step": 107490 + }, + { + "epoch": 7.303641799157495, + "grad_norm": 9.289794921875, + "learning_rate": 8.742696018480773e-07, + "loss": 2.5551, + "step": 107495 + }, + { + "epoch": 7.303981519228156, + "grad_norm": 8.621664047241211, + "learning_rate": 8.7384495175975e-07, + "loss": 2.5477, + "step": 107500 + }, + { + "epoch": 7.304321239298818, + "grad_norm": 7.61398458480835, + "learning_rate": 8.734203016714228e-07, + "loss": 3.0445, + "step": 107505 + }, + { + "epoch": 7.30466095936948, + "grad_norm": 8.972440719604492, + "learning_rate": 8.729956515830956e-07, + "loss": 2.4463, + "step": 107510 + }, + { + "epoch": 7.305000679440141, + "grad_norm": 10.837211608886719, + "learning_rate": 8.725710014947684e-07, + "loss": 2.761, + "step": 107515 + }, + { + "epoch": 7.305340399510803, + "grad_norm": 7.587305068969727, + "learning_rate": 8.721463514064411e-07, + "loss": 2.895, + "step": 107520 + }, + { + "epoch": 7.3056801195814645, + "grad_norm": 8.411999702453613, + "learning_rate": 8.717217013181139e-07, + "loss": 2.5788, + "step": 107525 + }, + { + "epoch": 7.3060198396521265, + "grad_norm": 6.671482563018799, + "learning_rate": 8.712970512297868e-07, + "loss": 2.7186, + "step": 107530 + }, + { + "epoch": 7.306359559722789, + "grad_norm": 7.920572757720947, + "learning_rate": 8.708724011414594e-07, + "loss": 3.0461, + "step": 107535 + }, + { + "epoch": 7.30669927979345, + "grad_norm": 10.167068481445312, + "learning_rate": 8.704477510531323e-07, + "loss": 2.5669, + "step": 107540 + }, + { + "epoch": 7.307038999864112, + "grad_norm": 7.243312835693359, + "learning_rate": 8.700231009648051e-07, + "loss": 2.5796, + "step": 107545 + }, + { + "epoch": 7.307378719934774, + "grad_norm": 8.454487800598145, + "learning_rate": 8.695984508764778e-07, + "loss": 2.7432, + "step": 107550 + }, + { + "epoch": 7.307718440005435, + "grad_norm": 8.305373191833496, + "learning_rate": 8.691738007881506e-07, + "loss": 3.0085, + "step": 107555 + }, + { + "epoch": 7.308058160076097, + "grad_norm": 7.973908424377441, + "learning_rate": 8.687491506998234e-07, + "loss": 2.8243, + "step": 107560 + }, + { + "epoch": 7.308397880146759, + "grad_norm": 6.505761623382568, + "learning_rate": 8.683245006114962e-07, + "loss": 2.6806, + "step": 107565 + }, + { + "epoch": 7.3087376002174205, + "grad_norm": 7.546040058135986, + "learning_rate": 8.678998505231689e-07, + "loss": 3.1286, + "step": 107570 + }, + { + "epoch": 7.3090773202880825, + "grad_norm": 9.0552339553833, + "learning_rate": 8.674752004348417e-07, + "loss": 2.814, + "step": 107575 + }, + { + "epoch": 7.309417040358745, + "grad_norm": 7.760507583618164, + "learning_rate": 8.670505503465146e-07, + "loss": 2.792, + "step": 107580 + }, + { + "epoch": 7.309756760429406, + "grad_norm": 7.5030694007873535, + "learning_rate": 8.666259002581873e-07, + "loss": 2.7743, + "step": 107585 + }, + { + "epoch": 7.310096480500068, + "grad_norm": 7.481101989746094, + "learning_rate": 8.662012501698601e-07, + "loss": 2.6152, + "step": 107590 + }, + { + "epoch": 7.31043620057073, + "grad_norm": 8.766133308410645, + "learning_rate": 8.657766000815329e-07, + "loss": 2.6536, + "step": 107595 + }, + { + "epoch": 7.310775920641391, + "grad_norm": 8.90047836303711, + "learning_rate": 8.653519499932057e-07, + "loss": 2.936, + "step": 107600 + }, + { + "epoch": 7.311115640712053, + "grad_norm": 8.259672164916992, + "learning_rate": 8.649272999048784e-07, + "loss": 2.67, + "step": 107605 + }, + { + "epoch": 7.311455360782715, + "grad_norm": 6.759815216064453, + "learning_rate": 8.645026498165512e-07, + "loss": 2.8979, + "step": 107610 + }, + { + "epoch": 7.3117950808533765, + "grad_norm": 8.004960060119629, + "learning_rate": 8.64077999728224e-07, + "loss": 2.7991, + "step": 107615 + }, + { + "epoch": 7.312134800924039, + "grad_norm": 6.939455032348633, + "learning_rate": 8.636533496398967e-07, + "loss": 2.6504, + "step": 107620 + }, + { + "epoch": 7.312474520994701, + "grad_norm": 7.703768253326416, + "learning_rate": 8.632286995515696e-07, + "loss": 2.7398, + "step": 107625 + }, + { + "epoch": 7.312814241065362, + "grad_norm": 7.978891849517822, + "learning_rate": 8.628040494632424e-07, + "loss": 2.8964, + "step": 107630 + }, + { + "epoch": 7.313153961136024, + "grad_norm": 6.877345561981201, + "learning_rate": 8.623793993749151e-07, + "loss": 2.5512, + "step": 107635 + }, + { + "epoch": 7.313493681206686, + "grad_norm": 8.297554969787598, + "learning_rate": 8.619547492865879e-07, + "loss": 2.6048, + "step": 107640 + }, + { + "epoch": 7.313833401277347, + "grad_norm": 6.565570831298828, + "learning_rate": 8.615300991982607e-07, + "loss": 2.8087, + "step": 107645 + }, + { + "epoch": 7.314173121348009, + "grad_norm": 7.5230278968811035, + "learning_rate": 8.611054491099335e-07, + "loss": 2.5769, + "step": 107650 + }, + { + "epoch": 7.314512841418671, + "grad_norm": 8.6853609085083, + "learning_rate": 8.606807990216062e-07, + "loss": 2.562, + "step": 107655 + }, + { + "epoch": 7.3148525614893325, + "grad_norm": 7.211042881011963, + "learning_rate": 8.60256148933279e-07, + "loss": 2.5292, + "step": 107660 + }, + { + "epoch": 7.315192281559995, + "grad_norm": 10.302599906921387, + "learning_rate": 8.598314988449519e-07, + "loss": 2.6886, + "step": 107665 + }, + { + "epoch": 7.315532001630657, + "grad_norm": 6.7324981689453125, + "learning_rate": 8.594068487566245e-07, + "loss": 2.7553, + "step": 107670 + }, + { + "epoch": 7.315871721701318, + "grad_norm": 8.139041900634766, + "learning_rate": 8.589821986682974e-07, + "loss": 2.9389, + "step": 107675 + }, + { + "epoch": 7.31621144177198, + "grad_norm": 7.742859363555908, + "learning_rate": 8.585575485799702e-07, + "loss": 2.7595, + "step": 107680 + }, + { + "epoch": 7.316551161842642, + "grad_norm": 6.420536994934082, + "learning_rate": 8.58132898491643e-07, + "loss": 2.736, + "step": 107685 + }, + { + "epoch": 7.316890881913303, + "grad_norm": 7.163976669311523, + "learning_rate": 8.577082484033157e-07, + "loss": 2.8879, + "step": 107690 + }, + { + "epoch": 7.317230601983965, + "grad_norm": 7.178442001342773, + "learning_rate": 8.572835983149885e-07, + "loss": 2.5512, + "step": 107695 + }, + { + "epoch": 7.317570322054627, + "grad_norm": 7.269235610961914, + "learning_rate": 8.568589482266613e-07, + "loss": 2.7117, + "step": 107700 + }, + { + "epoch": 7.3179100421252885, + "grad_norm": 10.877388954162598, + "learning_rate": 8.56434298138334e-07, + "loss": 2.9686, + "step": 107705 + }, + { + "epoch": 7.318249762195951, + "grad_norm": 8.835049629211426, + "learning_rate": 8.560096480500068e-07, + "loss": 2.5533, + "step": 107710 + }, + { + "epoch": 7.318589482266613, + "grad_norm": 9.50257682800293, + "learning_rate": 8.555849979616797e-07, + "loss": 2.9073, + "step": 107715 + }, + { + "epoch": 7.318929202337274, + "grad_norm": 8.191704750061035, + "learning_rate": 8.551603478733524e-07, + "loss": 2.9234, + "step": 107720 + }, + { + "epoch": 7.319268922407936, + "grad_norm": 8.77236557006836, + "learning_rate": 8.547356977850252e-07, + "loss": 2.7271, + "step": 107725 + }, + { + "epoch": 7.319608642478598, + "grad_norm": 8.328187942504883, + "learning_rate": 8.54311047696698e-07, + "loss": 2.7032, + "step": 107730 + }, + { + "epoch": 7.319948362549259, + "grad_norm": 9.45592212677002, + "learning_rate": 8.538863976083708e-07, + "loss": 2.5302, + "step": 107735 + }, + { + "epoch": 7.320288082619921, + "grad_norm": 9.593728065490723, + "learning_rate": 8.534617475200435e-07, + "loss": 2.8875, + "step": 107740 + }, + { + "epoch": 7.320627802690583, + "grad_norm": 7.041121482849121, + "learning_rate": 8.530370974317163e-07, + "loss": 2.714, + "step": 107745 + }, + { + "epoch": 7.3209675227612445, + "grad_norm": 9.429364204406738, + "learning_rate": 8.526124473433891e-07, + "loss": 2.7345, + "step": 107750 + }, + { + "epoch": 7.321307242831907, + "grad_norm": 10.575763702392578, + "learning_rate": 8.521877972550618e-07, + "loss": 2.4911, + "step": 107755 + }, + { + "epoch": 7.321646962902569, + "grad_norm": 6.7153096199035645, + "learning_rate": 8.517631471667347e-07, + "loss": 2.6012, + "step": 107760 + }, + { + "epoch": 7.32198668297323, + "grad_norm": 7.288552284240723, + "learning_rate": 8.513384970784075e-07, + "loss": 2.6392, + "step": 107765 + }, + { + "epoch": 7.322326403043892, + "grad_norm": 9.170777320861816, + "learning_rate": 8.509138469900803e-07, + "loss": 2.6699, + "step": 107770 + }, + { + "epoch": 7.322666123114554, + "grad_norm": 7.616882801055908, + "learning_rate": 8.50489196901753e-07, + "loss": 2.6871, + "step": 107775 + }, + { + "epoch": 7.323005843185215, + "grad_norm": 7.594313144683838, + "learning_rate": 8.500645468134258e-07, + "loss": 2.7702, + "step": 107780 + }, + { + "epoch": 7.323345563255877, + "grad_norm": 7.355625152587891, + "learning_rate": 8.496398967250986e-07, + "loss": 2.8087, + "step": 107785 + }, + { + "epoch": 7.323685283326539, + "grad_norm": 6.709613800048828, + "learning_rate": 8.492152466367713e-07, + "loss": 2.4882, + "step": 107790 + }, + { + "epoch": 7.3240250033972005, + "grad_norm": 7.076710224151611, + "learning_rate": 8.487905965484441e-07, + "loss": 2.7672, + "step": 107795 + }, + { + "epoch": 7.324364723467863, + "grad_norm": 7.617498874664307, + "learning_rate": 8.48365946460117e-07, + "loss": 2.8588, + "step": 107800 + }, + { + "epoch": 7.324704443538524, + "grad_norm": 8.07226276397705, + "learning_rate": 8.479412963717897e-07, + "loss": 2.7418, + "step": 107805 + }, + { + "epoch": 7.325044163609186, + "grad_norm": 6.965863227844238, + "learning_rate": 8.475166462834625e-07, + "loss": 2.8283, + "step": 107810 + }, + { + "epoch": 7.325383883679848, + "grad_norm": 7.6436638832092285, + "learning_rate": 8.470919961951353e-07, + "loss": 2.71, + "step": 107815 + }, + { + "epoch": 7.325723603750509, + "grad_norm": 6.973076343536377, + "learning_rate": 8.466673461068081e-07, + "loss": 2.7756, + "step": 107820 + }, + { + "epoch": 7.326063323821171, + "grad_norm": 6.943172454833984, + "learning_rate": 8.462426960184808e-07, + "loss": 2.5453, + "step": 107825 + }, + { + "epoch": 7.326403043891833, + "grad_norm": 9.208136558532715, + "learning_rate": 8.458180459301536e-07, + "loss": 2.8196, + "step": 107830 + }, + { + "epoch": 7.3267427639624945, + "grad_norm": 7.040218353271484, + "learning_rate": 8.453933958418264e-07, + "loss": 2.7888, + "step": 107835 + }, + { + "epoch": 7.3270824840331565, + "grad_norm": 7.936530590057373, + "learning_rate": 8.449687457534991e-07, + "loss": 2.6232, + "step": 107840 + }, + { + "epoch": 7.327422204103819, + "grad_norm": 7.7348246574401855, + "learning_rate": 8.44544095665172e-07, + "loss": 2.594, + "step": 107845 + }, + { + "epoch": 7.32776192417448, + "grad_norm": 9.055193901062012, + "learning_rate": 8.441194455768448e-07, + "loss": 2.7973, + "step": 107850 + }, + { + "epoch": 7.328101644245142, + "grad_norm": 8.988910675048828, + "learning_rate": 8.436947954885176e-07, + "loss": 2.8661, + "step": 107855 + }, + { + "epoch": 7.328441364315804, + "grad_norm": 6.312682628631592, + "learning_rate": 8.432701454001903e-07, + "loss": 2.6371, + "step": 107860 + }, + { + "epoch": 7.328781084386465, + "grad_norm": 7.56577730178833, + "learning_rate": 8.428454953118631e-07, + "loss": 2.7314, + "step": 107865 + }, + { + "epoch": 7.329120804457127, + "grad_norm": 8.98923110961914, + "learning_rate": 8.424208452235359e-07, + "loss": 2.8273, + "step": 107870 + }, + { + "epoch": 7.329460524527789, + "grad_norm": 7.1738691329956055, + "learning_rate": 8.419961951352086e-07, + "loss": 2.8604, + "step": 107875 + }, + { + "epoch": 7.3298002445984505, + "grad_norm": 9.393229484558105, + "learning_rate": 8.415715450468814e-07, + "loss": 2.774, + "step": 107880 + }, + { + "epoch": 7.3301399646691126, + "grad_norm": 8.412394523620605, + "learning_rate": 8.411468949585543e-07, + "loss": 2.8351, + "step": 107885 + }, + { + "epoch": 7.330479684739775, + "grad_norm": 6.076410293579102, + "learning_rate": 8.407222448702269e-07, + "loss": 2.7644, + "step": 107890 + }, + { + "epoch": 7.330819404810436, + "grad_norm": 7.297347545623779, + "learning_rate": 8.402975947818998e-07, + "loss": 2.6494, + "step": 107895 + }, + { + "epoch": 7.331159124881098, + "grad_norm": 8.918320655822754, + "learning_rate": 8.398729446935726e-07, + "loss": 3.0793, + "step": 107900 + }, + { + "epoch": 7.33149884495176, + "grad_norm": 7.109703063964844, + "learning_rate": 8.394482946052454e-07, + "loss": 2.7041, + "step": 107905 + }, + { + "epoch": 7.331838565022421, + "grad_norm": 7.106328010559082, + "learning_rate": 8.390236445169181e-07, + "loss": 2.7358, + "step": 107910 + }, + { + "epoch": 7.332178285093083, + "grad_norm": 7.867804527282715, + "learning_rate": 8.385989944285909e-07, + "loss": 2.8883, + "step": 107915 + }, + { + "epoch": 7.332518005163745, + "grad_norm": 9.191204071044922, + "learning_rate": 8.381743443402637e-07, + "loss": 2.7138, + "step": 107920 + }, + { + "epoch": 7.3328577252344065, + "grad_norm": 7.430572032928467, + "learning_rate": 8.377496942519364e-07, + "loss": 2.5062, + "step": 107925 + }, + { + "epoch": 7.333197445305069, + "grad_norm": 8.387548446655273, + "learning_rate": 8.373250441636092e-07, + "loss": 2.5957, + "step": 107930 + }, + { + "epoch": 7.333537165375731, + "grad_norm": 8.08757495880127, + "learning_rate": 8.369003940752821e-07, + "loss": 2.9134, + "step": 107935 + }, + { + "epoch": 7.333876885446392, + "grad_norm": 8.304179191589355, + "learning_rate": 8.364757439869549e-07, + "loss": 2.9734, + "step": 107940 + }, + { + "epoch": 7.334216605517054, + "grad_norm": 8.540249824523926, + "learning_rate": 8.360510938986276e-07, + "loss": 2.7458, + "step": 107945 + }, + { + "epoch": 7.334556325587716, + "grad_norm": 7.473106861114502, + "learning_rate": 8.356264438103004e-07, + "loss": 2.6166, + "step": 107950 + }, + { + "epoch": 7.334896045658377, + "grad_norm": 6.972504615783691, + "learning_rate": 8.352017937219732e-07, + "loss": 2.7374, + "step": 107955 + }, + { + "epoch": 7.335235765729039, + "grad_norm": 7.730543613433838, + "learning_rate": 8.347771436336459e-07, + "loss": 2.6287, + "step": 107960 + }, + { + "epoch": 7.335575485799701, + "grad_norm": 7.939846992492676, + "learning_rate": 8.343524935453187e-07, + "loss": 2.6128, + "step": 107965 + }, + { + "epoch": 7.3359152058703625, + "grad_norm": 7.637646198272705, + "learning_rate": 8.339278434569915e-07, + "loss": 2.4351, + "step": 107970 + }, + { + "epoch": 7.336254925941025, + "grad_norm": 7.453808784484863, + "learning_rate": 8.335031933686642e-07, + "loss": 2.5694, + "step": 107975 + }, + { + "epoch": 7.336594646011687, + "grad_norm": 9.559919357299805, + "learning_rate": 8.330785432803371e-07, + "loss": 2.6562, + "step": 107980 + }, + { + "epoch": 7.336934366082348, + "grad_norm": 7.332932949066162, + "learning_rate": 8.326538931920099e-07, + "loss": 2.966, + "step": 107985 + }, + { + "epoch": 7.33727408615301, + "grad_norm": 6.7604594230651855, + "learning_rate": 8.322292431036827e-07, + "loss": 2.5398, + "step": 107990 + }, + { + "epoch": 7.337613806223672, + "grad_norm": 9.180462837219238, + "learning_rate": 8.318045930153554e-07, + "loss": 2.7583, + "step": 107995 + }, + { + "epoch": 7.337953526294333, + "grad_norm": 6.812851905822754, + "learning_rate": 8.313799429270282e-07, + "loss": 2.6282, + "step": 108000 + }, + { + "epoch": 7.338293246364995, + "grad_norm": 9.12286376953125, + "learning_rate": 8.30955292838701e-07, + "loss": 2.818, + "step": 108005 + }, + { + "epoch": 7.338632966435657, + "grad_norm": 8.11171817779541, + "learning_rate": 8.305306427503737e-07, + "loss": 2.8629, + "step": 108010 + }, + { + "epoch": 7.3389726865063185, + "grad_norm": 6.350249767303467, + "learning_rate": 8.301059926620465e-07, + "loss": 2.7426, + "step": 108015 + }, + { + "epoch": 7.339312406576981, + "grad_norm": 7.979992389678955, + "learning_rate": 8.296813425737194e-07, + "loss": 2.6529, + "step": 108020 + }, + { + "epoch": 7.339652126647643, + "grad_norm": 7.581684112548828, + "learning_rate": 8.292566924853922e-07, + "loss": 2.8592, + "step": 108025 + }, + { + "epoch": 7.339991846718304, + "grad_norm": 7.998531818389893, + "learning_rate": 8.288320423970649e-07, + "loss": 2.9292, + "step": 108030 + }, + { + "epoch": 7.340331566788966, + "grad_norm": 9.407636642456055, + "learning_rate": 8.284073923087377e-07, + "loss": 2.6774, + "step": 108035 + }, + { + "epoch": 7.340671286859628, + "grad_norm": 6.292857646942139, + "learning_rate": 8.279827422204105e-07, + "loss": 2.5688, + "step": 108040 + }, + { + "epoch": 7.341011006930289, + "grad_norm": 8.61185073852539, + "learning_rate": 8.275580921320832e-07, + "loss": 2.8895, + "step": 108045 + }, + { + "epoch": 7.341350727000951, + "grad_norm": 9.781105041503906, + "learning_rate": 8.27133442043756e-07, + "loss": 2.5775, + "step": 108050 + }, + { + "epoch": 7.341690447071613, + "grad_norm": 8.40855598449707, + "learning_rate": 8.267087919554288e-07, + "loss": 2.6825, + "step": 108055 + }, + { + "epoch": 7.3420301671422745, + "grad_norm": 6.972967147827148, + "learning_rate": 8.262841418671015e-07, + "loss": 2.7595, + "step": 108060 + }, + { + "epoch": 7.342369887212937, + "grad_norm": 7.100914001464844, + "learning_rate": 8.258594917787743e-07, + "loss": 2.687, + "step": 108065 + }, + { + "epoch": 7.342709607283599, + "grad_norm": 7.062318325042725, + "learning_rate": 8.254348416904472e-07, + "loss": 2.9235, + "step": 108070 + }, + { + "epoch": 7.34304932735426, + "grad_norm": 6.16217041015625, + "learning_rate": 8.2501019160212e-07, + "loss": 2.6118, + "step": 108075 + }, + { + "epoch": 7.343389047424922, + "grad_norm": 6.811344623565674, + "learning_rate": 8.245855415137927e-07, + "loss": 2.6971, + "step": 108080 + }, + { + "epoch": 7.343728767495584, + "grad_norm": 6.444242477416992, + "learning_rate": 8.241608914254655e-07, + "loss": 2.7276, + "step": 108085 + }, + { + "epoch": 7.344068487566245, + "grad_norm": 8.881088256835938, + "learning_rate": 8.237362413371383e-07, + "loss": 2.6931, + "step": 108090 + }, + { + "epoch": 7.344408207636907, + "grad_norm": 7.3837785720825195, + "learning_rate": 8.23311591248811e-07, + "loss": 2.6097, + "step": 108095 + }, + { + "epoch": 7.344747927707569, + "grad_norm": 6.3960161209106445, + "learning_rate": 8.228869411604838e-07, + "loss": 2.5337, + "step": 108100 + }, + { + "epoch": 7.3450876477782305, + "grad_norm": 6.674572944641113, + "learning_rate": 8.224622910721566e-07, + "loss": 2.6888, + "step": 108105 + }, + { + "epoch": 7.345427367848893, + "grad_norm": 6.631059169769287, + "learning_rate": 8.220376409838295e-07, + "loss": 3.1398, + "step": 108110 + }, + { + "epoch": 7.345767087919555, + "grad_norm": 5.800178050994873, + "learning_rate": 8.216129908955022e-07, + "loss": 2.613, + "step": 108115 + }, + { + "epoch": 7.346106807990216, + "grad_norm": 8.953721046447754, + "learning_rate": 8.21188340807175e-07, + "loss": 2.4031, + "step": 108120 + }, + { + "epoch": 7.346446528060878, + "grad_norm": 9.62155532836914, + "learning_rate": 8.207636907188478e-07, + "loss": 2.8775, + "step": 108125 + }, + { + "epoch": 7.34678624813154, + "grad_norm": 8.689583778381348, + "learning_rate": 8.203390406305205e-07, + "loss": 2.6627, + "step": 108130 + }, + { + "epoch": 7.347125968202201, + "grad_norm": 7.047158241271973, + "learning_rate": 8.199143905421933e-07, + "loss": 2.5021, + "step": 108135 + }, + { + "epoch": 7.347465688272863, + "grad_norm": 7.277770042419434, + "learning_rate": 8.194897404538661e-07, + "loss": 2.7635, + "step": 108140 + }, + { + "epoch": 7.347805408343525, + "grad_norm": 8.766533851623535, + "learning_rate": 8.190650903655388e-07, + "loss": 2.5265, + "step": 108145 + }, + { + "epoch": 7.3481451284141865, + "grad_norm": 9.031854629516602, + "learning_rate": 8.186404402772116e-07, + "loss": 2.877, + "step": 108150 + }, + { + "epoch": 7.348484848484849, + "grad_norm": 9.419276237487793, + "learning_rate": 8.182157901888845e-07, + "loss": 2.7177, + "step": 108155 + }, + { + "epoch": 7.348824568555511, + "grad_norm": 7.084001541137695, + "learning_rate": 8.177911401005573e-07, + "loss": 2.8815, + "step": 108160 + }, + { + "epoch": 7.349164288626172, + "grad_norm": 8.474133491516113, + "learning_rate": 8.1736649001223e-07, + "loss": 2.7056, + "step": 108165 + }, + { + "epoch": 7.349504008696834, + "grad_norm": 8.18535327911377, + "learning_rate": 8.169418399239028e-07, + "loss": 2.6093, + "step": 108170 + }, + { + "epoch": 7.349843728767496, + "grad_norm": 7.656606197357178, + "learning_rate": 8.165171898355756e-07, + "loss": 2.7709, + "step": 108175 + }, + { + "epoch": 7.350183448838157, + "grad_norm": 7.531088829040527, + "learning_rate": 8.160925397472483e-07, + "loss": 2.8958, + "step": 108180 + }, + { + "epoch": 7.350523168908819, + "grad_norm": 6.655362606048584, + "learning_rate": 8.156678896589211e-07, + "loss": 2.4912, + "step": 108185 + }, + { + "epoch": 7.350862888979481, + "grad_norm": 7.313601016998291, + "learning_rate": 8.152432395705939e-07, + "loss": 2.768, + "step": 108190 + }, + { + "epoch": 7.3512026090501426, + "grad_norm": 9.069279670715332, + "learning_rate": 8.148185894822668e-07, + "loss": 2.8463, + "step": 108195 + }, + { + "epoch": 7.351542329120805, + "grad_norm": 9.177742004394531, + "learning_rate": 8.143939393939395e-07, + "loss": 2.5469, + "step": 108200 + }, + { + "epoch": 7.351882049191466, + "grad_norm": 9.624187469482422, + "learning_rate": 8.139692893056123e-07, + "loss": 2.5789, + "step": 108205 + }, + { + "epoch": 7.352221769262128, + "grad_norm": 6.710440158843994, + "learning_rate": 8.135446392172851e-07, + "loss": 2.7411, + "step": 108210 + }, + { + "epoch": 7.35256148933279, + "grad_norm": 7.384397029876709, + "learning_rate": 8.131199891289578e-07, + "loss": 2.9658, + "step": 108215 + }, + { + "epoch": 7.352901209403451, + "grad_norm": 7.517411708831787, + "learning_rate": 8.126953390406306e-07, + "loss": 2.7534, + "step": 108220 + }, + { + "epoch": 7.353240929474113, + "grad_norm": 7.762171268463135, + "learning_rate": 8.122706889523034e-07, + "loss": 2.7912, + "step": 108225 + }, + { + "epoch": 7.353580649544775, + "grad_norm": 10.642465591430664, + "learning_rate": 8.118460388639761e-07, + "loss": 2.8005, + "step": 108230 + }, + { + "epoch": 7.3539203696154365, + "grad_norm": 10.343825340270996, + "learning_rate": 8.114213887756489e-07, + "loss": 2.715, + "step": 108235 + }, + { + "epoch": 7.354260089686099, + "grad_norm": 6.331682205200195, + "learning_rate": 8.109967386873218e-07, + "loss": 2.7898, + "step": 108240 + }, + { + "epoch": 7.354599809756761, + "grad_norm": 7.905826091766357, + "learning_rate": 8.105720885989946e-07, + "loss": 2.9106, + "step": 108245 + }, + { + "epoch": 7.354939529827422, + "grad_norm": 6.84955358505249, + "learning_rate": 8.101474385106673e-07, + "loss": 2.7031, + "step": 108250 + }, + { + "epoch": 7.355279249898084, + "grad_norm": 8.336087226867676, + "learning_rate": 8.097227884223401e-07, + "loss": 2.697, + "step": 108255 + }, + { + "epoch": 7.355618969968746, + "grad_norm": 7.554656982421875, + "learning_rate": 8.092981383340129e-07, + "loss": 2.7314, + "step": 108260 + }, + { + "epoch": 7.355958690039407, + "grad_norm": 7.704533100128174, + "learning_rate": 8.088734882456856e-07, + "loss": 2.881, + "step": 108265 + }, + { + "epoch": 7.356298410110069, + "grad_norm": 6.8892107009887695, + "learning_rate": 8.084488381573584e-07, + "loss": 2.7315, + "step": 108270 + }, + { + "epoch": 7.356638130180731, + "grad_norm": 6.905055046081543, + "learning_rate": 8.080241880690312e-07, + "loss": 2.9275, + "step": 108275 + }, + { + "epoch": 7.3569778502513925, + "grad_norm": 7.213602066040039, + "learning_rate": 8.075995379807041e-07, + "loss": 2.9049, + "step": 108280 + }, + { + "epoch": 7.357317570322055, + "grad_norm": 7.765556812286377, + "learning_rate": 8.071748878923767e-07, + "loss": 2.8575, + "step": 108285 + }, + { + "epoch": 7.357657290392717, + "grad_norm": 8.131658554077148, + "learning_rate": 8.067502378040496e-07, + "loss": 2.4506, + "step": 108290 + }, + { + "epoch": 7.357997010463378, + "grad_norm": 8.391251564025879, + "learning_rate": 8.063255877157224e-07, + "loss": 2.8059, + "step": 108295 + }, + { + "epoch": 7.35833673053404, + "grad_norm": 7.7764058113098145, + "learning_rate": 8.059009376273951e-07, + "loss": 2.6056, + "step": 108300 + }, + { + "epoch": 7.358676450604702, + "grad_norm": 10.229583740234375, + "learning_rate": 8.054762875390679e-07, + "loss": 2.8854, + "step": 108305 + }, + { + "epoch": 7.359016170675363, + "grad_norm": 6.932631969451904, + "learning_rate": 8.050516374507407e-07, + "loss": 2.6581, + "step": 108310 + }, + { + "epoch": 7.359355890746025, + "grad_norm": 6.7900848388671875, + "learning_rate": 8.046269873624134e-07, + "loss": 2.7136, + "step": 108315 + }, + { + "epoch": 7.359695610816687, + "grad_norm": 7.522702693939209, + "learning_rate": 8.042023372740862e-07, + "loss": 2.7091, + "step": 108320 + }, + { + "epoch": 7.3600353308873485, + "grad_norm": 7.741783142089844, + "learning_rate": 8.03777687185759e-07, + "loss": 2.8505, + "step": 108325 + }, + { + "epoch": 7.360375050958011, + "grad_norm": 6.87900447845459, + "learning_rate": 8.033530370974319e-07, + "loss": 2.7275, + "step": 108330 + }, + { + "epoch": 7.360714771028673, + "grad_norm": 5.931906700134277, + "learning_rate": 8.029283870091046e-07, + "loss": 2.9698, + "step": 108335 + }, + { + "epoch": 7.361054491099334, + "grad_norm": 9.731841087341309, + "learning_rate": 8.025037369207774e-07, + "loss": 2.7183, + "step": 108340 + }, + { + "epoch": 7.361394211169996, + "grad_norm": 8.892797470092773, + "learning_rate": 8.020790868324502e-07, + "loss": 2.5158, + "step": 108345 + }, + { + "epoch": 7.361733931240658, + "grad_norm": 11.730889320373535, + "learning_rate": 8.016544367441229e-07, + "loss": 2.5401, + "step": 108350 + }, + { + "epoch": 7.362073651311319, + "grad_norm": 8.581110954284668, + "learning_rate": 8.012297866557957e-07, + "loss": 2.5824, + "step": 108355 + }, + { + "epoch": 7.362413371381981, + "grad_norm": 8.097129821777344, + "learning_rate": 8.008051365674685e-07, + "loss": 2.7708, + "step": 108360 + }, + { + "epoch": 7.362753091452643, + "grad_norm": 7.818551063537598, + "learning_rate": 8.003804864791413e-07, + "loss": 2.8509, + "step": 108365 + }, + { + "epoch": 7.3630928115233045, + "grad_norm": 8.402209281921387, + "learning_rate": 7.99955836390814e-07, + "loss": 2.788, + "step": 108370 + }, + { + "epoch": 7.363432531593967, + "grad_norm": 9.483694076538086, + "learning_rate": 7.995311863024869e-07, + "loss": 2.7593, + "step": 108375 + }, + { + "epoch": 7.363772251664629, + "grad_norm": 8.238534927368164, + "learning_rate": 7.991065362141597e-07, + "loss": 2.9349, + "step": 108380 + }, + { + "epoch": 7.36411197173529, + "grad_norm": 10.222623825073242, + "learning_rate": 7.986818861258324e-07, + "loss": 2.881, + "step": 108385 + }, + { + "epoch": 7.364451691805952, + "grad_norm": 9.364889144897461, + "learning_rate": 7.982572360375052e-07, + "loss": 2.3331, + "step": 108390 + }, + { + "epoch": 7.364791411876614, + "grad_norm": 10.189896583557129, + "learning_rate": 7.97832585949178e-07, + "loss": 2.64, + "step": 108395 + }, + { + "epoch": 7.365131131947275, + "grad_norm": 9.254812240600586, + "learning_rate": 7.974079358608507e-07, + "loss": 2.6699, + "step": 108400 + }, + { + "epoch": 7.365470852017937, + "grad_norm": 7.697299957275391, + "learning_rate": 7.969832857725235e-07, + "loss": 2.5802, + "step": 108405 + }, + { + "epoch": 7.365810572088599, + "grad_norm": 9.28584098815918, + "learning_rate": 7.965586356841963e-07, + "loss": 2.5252, + "step": 108410 + }, + { + "epoch": 7.3661502921592605, + "grad_norm": 8.369637489318848, + "learning_rate": 7.961339855958692e-07, + "loss": 2.5833, + "step": 108415 + }, + { + "epoch": 7.366490012229923, + "grad_norm": 8.499449729919434, + "learning_rate": 7.957093355075418e-07, + "loss": 2.6032, + "step": 108420 + }, + { + "epoch": 7.366829732300585, + "grad_norm": 8.381911277770996, + "learning_rate": 7.952846854192147e-07, + "loss": 2.898, + "step": 108425 + }, + { + "epoch": 7.367169452371246, + "grad_norm": 6.800487995147705, + "learning_rate": 7.948600353308875e-07, + "loss": 2.4782, + "step": 108430 + }, + { + "epoch": 7.367509172441908, + "grad_norm": 8.139395713806152, + "learning_rate": 7.944353852425602e-07, + "loss": 2.9233, + "step": 108435 + }, + { + "epoch": 7.36784889251257, + "grad_norm": 7.969667434692383, + "learning_rate": 7.94010735154233e-07, + "loss": 2.8977, + "step": 108440 + }, + { + "epoch": 7.368188612583231, + "grad_norm": 9.955801010131836, + "learning_rate": 7.935860850659058e-07, + "loss": 2.6355, + "step": 108445 + }, + { + "epoch": 7.368528332653893, + "grad_norm": 8.009778022766113, + "learning_rate": 7.931614349775786e-07, + "loss": 2.8393, + "step": 108450 + }, + { + "epoch": 7.368868052724555, + "grad_norm": 8.033568382263184, + "learning_rate": 7.927367848892513e-07, + "loss": 2.7383, + "step": 108455 + }, + { + "epoch": 7.3692077727952165, + "grad_norm": 8.21921157836914, + "learning_rate": 7.923121348009241e-07, + "loss": 2.7574, + "step": 108460 + }, + { + "epoch": 7.369547492865879, + "grad_norm": 6.682713031768799, + "learning_rate": 7.91887484712597e-07, + "loss": 2.6619, + "step": 108465 + }, + { + "epoch": 7.369887212936541, + "grad_norm": 6.403555393218994, + "learning_rate": 7.914628346242697e-07, + "loss": 2.6995, + "step": 108470 + }, + { + "epoch": 7.370226933007202, + "grad_norm": 6.544936180114746, + "learning_rate": 7.910381845359425e-07, + "loss": 2.6435, + "step": 108475 + }, + { + "epoch": 7.370566653077864, + "grad_norm": 7.091193199157715, + "learning_rate": 7.906135344476153e-07, + "loss": 2.6923, + "step": 108480 + }, + { + "epoch": 7.370906373148525, + "grad_norm": 8.512991905212402, + "learning_rate": 7.90188884359288e-07, + "loss": 2.8983, + "step": 108485 + }, + { + "epoch": 7.371246093219187, + "grad_norm": 8.03117561340332, + "learning_rate": 7.897642342709608e-07, + "loss": 2.5651, + "step": 108490 + }, + { + "epoch": 7.371585813289849, + "grad_norm": 9.336080551147461, + "learning_rate": 7.893395841826336e-07, + "loss": 2.6763, + "step": 108495 + }, + { + "epoch": 7.3719255333605105, + "grad_norm": 7.721248149871826, + "learning_rate": 7.889149340943064e-07, + "loss": 2.651, + "step": 108500 + }, + { + "epoch": 7.372265253431173, + "grad_norm": 7.059929370880127, + "learning_rate": 7.884902840059791e-07, + "loss": 2.8413, + "step": 108505 + }, + { + "epoch": 7.372604973501835, + "grad_norm": 7.877887725830078, + "learning_rate": 7.88065633917652e-07, + "loss": 2.8553, + "step": 108510 + }, + { + "epoch": 7.372944693572496, + "grad_norm": 7.466799736022949, + "learning_rate": 7.876409838293248e-07, + "loss": 2.9261, + "step": 108515 + }, + { + "epoch": 7.373284413643158, + "grad_norm": 6.877745151519775, + "learning_rate": 7.872163337409975e-07, + "loss": 2.7175, + "step": 108520 + }, + { + "epoch": 7.37362413371382, + "grad_norm": 9.25239372253418, + "learning_rate": 7.867916836526703e-07, + "loss": 2.6394, + "step": 108525 + }, + { + "epoch": 7.373963853784481, + "grad_norm": 9.345293045043945, + "learning_rate": 7.863670335643431e-07, + "loss": 2.6221, + "step": 108530 + }, + { + "epoch": 7.374303573855143, + "grad_norm": 6.515368461608887, + "learning_rate": 7.859423834760159e-07, + "loss": 2.605, + "step": 108535 + }, + { + "epoch": 7.374643293925805, + "grad_norm": 6.614431381225586, + "learning_rate": 7.855177333876886e-07, + "loss": 2.7159, + "step": 108540 + }, + { + "epoch": 7.3749830139964665, + "grad_norm": 6.558492660522461, + "learning_rate": 7.850930832993614e-07, + "loss": 2.744, + "step": 108545 + }, + { + "epoch": 7.375322734067129, + "grad_norm": 6.858980655670166, + "learning_rate": 7.846684332110343e-07, + "loss": 2.5071, + "step": 108550 + }, + { + "epoch": 7.375662454137791, + "grad_norm": 6.921413421630859, + "learning_rate": 7.84243783122707e-07, + "loss": 2.8028, + "step": 108555 + }, + { + "epoch": 7.376002174208452, + "grad_norm": 9.661172866821289, + "learning_rate": 7.838191330343798e-07, + "loss": 2.7687, + "step": 108560 + }, + { + "epoch": 7.376341894279114, + "grad_norm": 7.747291564941406, + "learning_rate": 7.833944829460526e-07, + "loss": 2.646, + "step": 108565 + }, + { + "epoch": 7.376681614349776, + "grad_norm": 7.7334794998168945, + "learning_rate": 7.829698328577253e-07, + "loss": 2.8246, + "step": 108570 + }, + { + "epoch": 7.377021334420437, + "grad_norm": 10.319998741149902, + "learning_rate": 7.825451827693981e-07, + "loss": 2.8691, + "step": 108575 + }, + { + "epoch": 7.377361054491099, + "grad_norm": 9.270442962646484, + "learning_rate": 7.821205326810709e-07, + "loss": 2.6409, + "step": 108580 + }, + { + "epoch": 7.377700774561761, + "grad_norm": 7.6081929206848145, + "learning_rate": 7.816958825927437e-07, + "loss": 2.7015, + "step": 108585 + }, + { + "epoch": 7.3780404946324225, + "grad_norm": 9.470988273620605, + "learning_rate": 7.812712325044164e-07, + "loss": 2.8212, + "step": 108590 + }, + { + "epoch": 7.378380214703085, + "grad_norm": 7.807088851928711, + "learning_rate": 7.808465824160893e-07, + "loss": 2.9502, + "step": 108595 + }, + { + "epoch": 7.378719934773747, + "grad_norm": 6.594217300415039, + "learning_rate": 7.804219323277621e-07, + "loss": 2.543, + "step": 108600 + }, + { + "epoch": 7.379059654844408, + "grad_norm": 8.205777168273926, + "learning_rate": 7.799972822394348e-07, + "loss": 2.7897, + "step": 108605 + }, + { + "epoch": 7.37939937491507, + "grad_norm": 6.586516857147217, + "learning_rate": 7.795726321511076e-07, + "loss": 2.6004, + "step": 108610 + }, + { + "epoch": 7.379739094985732, + "grad_norm": 7.597264766693115, + "learning_rate": 7.791479820627804e-07, + "loss": 2.929, + "step": 108615 + }, + { + "epoch": 7.380078815056393, + "grad_norm": 7.487385272979736, + "learning_rate": 7.787233319744531e-07, + "loss": 2.7353, + "step": 108620 + }, + { + "epoch": 7.380418535127055, + "grad_norm": 6.8612446784973145, + "learning_rate": 7.782986818861259e-07, + "loss": 2.7341, + "step": 108625 + }, + { + "epoch": 7.380758255197717, + "grad_norm": 7.250245094299316, + "learning_rate": 7.778740317977987e-07, + "loss": 2.6894, + "step": 108630 + }, + { + "epoch": 7.3810979752683785, + "grad_norm": 7.563803195953369, + "learning_rate": 7.774493817094716e-07, + "loss": 2.7105, + "step": 108635 + }, + { + "epoch": 7.381437695339041, + "grad_norm": 7.954570770263672, + "learning_rate": 7.770247316211442e-07, + "loss": 2.8758, + "step": 108640 + }, + { + "epoch": 7.381777415409703, + "grad_norm": 7.4854254722595215, + "learning_rate": 7.766000815328171e-07, + "loss": 2.7537, + "step": 108645 + }, + { + "epoch": 7.382117135480364, + "grad_norm": 6.468206882476807, + "learning_rate": 7.761754314444899e-07, + "loss": 2.7384, + "step": 108650 + }, + { + "epoch": 7.382456855551026, + "grad_norm": 8.617639541625977, + "learning_rate": 7.757507813561626e-07, + "loss": 2.7245, + "step": 108655 + }, + { + "epoch": 7.382796575621688, + "grad_norm": 7.6216301918029785, + "learning_rate": 7.753261312678354e-07, + "loss": 2.8804, + "step": 108660 + }, + { + "epoch": 7.383136295692349, + "grad_norm": 8.422435760498047, + "learning_rate": 7.749014811795082e-07, + "loss": 2.3164, + "step": 108665 + }, + { + "epoch": 7.383476015763011, + "grad_norm": 7.2596435546875, + "learning_rate": 7.74476831091181e-07, + "loss": 2.5873, + "step": 108670 + }, + { + "epoch": 7.383815735833673, + "grad_norm": 8.019487380981445, + "learning_rate": 7.740521810028537e-07, + "loss": 2.668, + "step": 108675 + }, + { + "epoch": 7.3841554559043345, + "grad_norm": 9.32997989654541, + "learning_rate": 7.736275309145265e-07, + "loss": 2.7342, + "step": 108680 + }, + { + "epoch": 7.384495175974997, + "grad_norm": 6.474757194519043, + "learning_rate": 7.732028808261994e-07, + "loss": 2.9124, + "step": 108685 + }, + { + "epoch": 7.384834896045659, + "grad_norm": 7.896395206451416, + "learning_rate": 7.72778230737872e-07, + "loss": 2.696, + "step": 108690 + }, + { + "epoch": 7.38517461611632, + "grad_norm": 10.12212085723877, + "learning_rate": 7.723535806495449e-07, + "loss": 2.627, + "step": 108695 + }, + { + "epoch": 7.385514336186982, + "grad_norm": 6.739946365356445, + "learning_rate": 7.719289305612177e-07, + "loss": 2.5868, + "step": 108700 + }, + { + "epoch": 7.385854056257644, + "grad_norm": 11.48195743560791, + "learning_rate": 7.715042804728903e-07, + "loss": 2.6535, + "step": 108705 + }, + { + "epoch": 7.386193776328305, + "grad_norm": 7.177083969116211, + "learning_rate": 7.710796303845632e-07, + "loss": 2.7238, + "step": 108710 + }, + { + "epoch": 7.386533496398967, + "grad_norm": 7.7388811111450195, + "learning_rate": 7.70654980296236e-07, + "loss": 2.7381, + "step": 108715 + }, + { + "epoch": 7.386873216469629, + "grad_norm": 8.074382781982422, + "learning_rate": 7.702303302079088e-07, + "loss": 2.8326, + "step": 108720 + }, + { + "epoch": 7.3872129365402905, + "grad_norm": 7.504567623138428, + "learning_rate": 7.698056801195814e-07, + "loss": 2.6911, + "step": 108725 + }, + { + "epoch": 7.387552656610953, + "grad_norm": 9.912631034851074, + "learning_rate": 7.693810300312544e-07, + "loss": 2.7039, + "step": 108730 + }, + { + "epoch": 7.387892376681615, + "grad_norm": 9.32684326171875, + "learning_rate": 7.689563799429272e-07, + "loss": 2.8062, + "step": 108735 + }, + { + "epoch": 7.388232096752276, + "grad_norm": 9.157233238220215, + "learning_rate": 7.685317298545998e-07, + "loss": 2.8393, + "step": 108740 + }, + { + "epoch": 7.388571816822938, + "grad_norm": 7.535455226898193, + "learning_rate": 7.681070797662726e-07, + "loss": 2.8342, + "step": 108745 + }, + { + "epoch": 7.3889115368936, + "grad_norm": 6.533695220947266, + "learning_rate": 7.676824296779455e-07, + "loss": 2.6494, + "step": 108750 + }, + { + "epoch": 7.389251256964261, + "grad_norm": 7.5190582275390625, + "learning_rate": 7.672577795896183e-07, + "loss": 3.0312, + "step": 108755 + }, + { + "epoch": 7.389590977034923, + "grad_norm": 8.864728927612305, + "learning_rate": 7.668331295012909e-07, + "loss": 2.6769, + "step": 108760 + }, + { + "epoch": 7.389930697105585, + "grad_norm": 10.43032455444336, + "learning_rate": 7.664084794129637e-07, + "loss": 2.6173, + "step": 108765 + }, + { + "epoch": 7.3902704171762466, + "grad_norm": 7.760120868682861, + "learning_rate": 7.659838293246367e-07, + "loss": 2.8062, + "step": 108770 + }, + { + "epoch": 7.390610137246909, + "grad_norm": 7.565427780151367, + "learning_rate": 7.655591792363092e-07, + "loss": 2.7622, + "step": 108775 + }, + { + "epoch": 7.390949857317571, + "grad_norm": 10.897457122802734, + "learning_rate": 7.651345291479821e-07, + "loss": 2.9396, + "step": 108780 + }, + { + "epoch": 7.391289577388232, + "grad_norm": 7.680095195770264, + "learning_rate": 7.64709879059655e-07, + "loss": 2.8548, + "step": 108785 + }, + { + "epoch": 7.391629297458894, + "grad_norm": 8.216023445129395, + "learning_rate": 7.642852289713276e-07, + "loss": 2.8063, + "step": 108790 + }, + { + "epoch": 7.391969017529556, + "grad_norm": 9.319355010986328, + "learning_rate": 7.638605788830004e-07, + "loss": 2.7105, + "step": 108795 + }, + { + "epoch": 7.392308737600217, + "grad_norm": 7.201712608337402, + "learning_rate": 7.634359287946732e-07, + "loss": 2.611, + "step": 108800 + }, + { + "epoch": 7.392648457670879, + "grad_norm": 8.635052680969238, + "learning_rate": 7.63011278706346e-07, + "loss": 2.8585, + "step": 108805 + }, + { + "epoch": 7.392988177741541, + "grad_norm": 8.92066478729248, + "learning_rate": 7.625866286180187e-07, + "loss": 2.8018, + "step": 108810 + }, + { + "epoch": 7.393327897812203, + "grad_norm": 7.052112579345703, + "learning_rate": 7.621619785296915e-07, + "loss": 2.7655, + "step": 108815 + }, + { + "epoch": 7.393667617882865, + "grad_norm": 10.179844856262207, + "learning_rate": 7.617373284413644e-07, + "loss": 2.9848, + "step": 108820 + }, + { + "epoch": 7.394007337953527, + "grad_norm": 6.803329944610596, + "learning_rate": 7.613126783530371e-07, + "loss": 2.8124, + "step": 108825 + }, + { + "epoch": 7.394347058024188, + "grad_norm": 6.681632995605469, + "learning_rate": 7.608880282647099e-07, + "loss": 3.0186, + "step": 108830 + }, + { + "epoch": 7.39468677809485, + "grad_norm": 7.907755374908447, + "learning_rate": 7.604633781763827e-07, + "loss": 2.6615, + "step": 108835 + }, + { + "epoch": 7.395026498165512, + "grad_norm": 6.937529563903809, + "learning_rate": 7.600387280880555e-07, + "loss": 2.6768, + "step": 108840 + }, + { + "epoch": 7.395366218236173, + "grad_norm": 9.063668251037598, + "learning_rate": 7.596140779997282e-07, + "loss": 2.742, + "step": 108845 + }, + { + "epoch": 7.395705938306835, + "grad_norm": 7.699275493621826, + "learning_rate": 7.59189427911401e-07, + "loss": 2.6242, + "step": 108850 + }, + { + "epoch": 7.396045658377497, + "grad_norm": 7.403553485870361, + "learning_rate": 7.587647778230738e-07, + "loss": 2.5988, + "step": 108855 + }, + { + "epoch": 7.396385378448159, + "grad_norm": 7.617466926574707, + "learning_rate": 7.583401277347465e-07, + "loss": 2.7174, + "step": 108860 + }, + { + "epoch": 7.396725098518821, + "grad_norm": 7.865771770477295, + "learning_rate": 7.579154776464194e-07, + "loss": 2.5565, + "step": 108865 + }, + { + "epoch": 7.397064818589483, + "grad_norm": 7.948723793029785, + "learning_rate": 7.574908275580922e-07, + "loss": 2.6474, + "step": 108870 + }, + { + "epoch": 7.397404538660144, + "grad_norm": 7.853491306304932, + "learning_rate": 7.570661774697649e-07, + "loss": 2.9642, + "step": 108875 + }, + { + "epoch": 7.397744258730806, + "grad_norm": 7.091930389404297, + "learning_rate": 7.566415273814377e-07, + "loss": 2.8427, + "step": 108880 + }, + { + "epoch": 7.398083978801467, + "grad_norm": 8.452563285827637, + "learning_rate": 7.562168772931105e-07, + "loss": 2.7788, + "step": 108885 + }, + { + "epoch": 7.398423698872129, + "grad_norm": 7.7323527336120605, + "learning_rate": 7.557922272047833e-07, + "loss": 2.6097, + "step": 108890 + }, + { + "epoch": 7.398763418942791, + "grad_norm": 7.587540626525879, + "learning_rate": 7.55367577116456e-07, + "loss": 2.5294, + "step": 108895 + }, + { + "epoch": 7.3991031390134525, + "grad_norm": 9.652206420898438, + "learning_rate": 7.549429270281288e-07, + "loss": 2.5052, + "step": 108900 + }, + { + "epoch": 7.399442859084115, + "grad_norm": 8.68868637084961, + "learning_rate": 7.545182769398017e-07, + "loss": 2.8279, + "step": 108905 + }, + { + "epoch": 7.399782579154777, + "grad_norm": 6.989866256713867, + "learning_rate": 7.540936268514743e-07, + "loss": 2.6956, + "step": 108910 + }, + { + "epoch": 7.400122299225438, + "grad_norm": 8.599563598632812, + "learning_rate": 7.536689767631472e-07, + "loss": 2.6398, + "step": 108915 + }, + { + "epoch": 7.4004620192961, + "grad_norm": 6.632136821746826, + "learning_rate": 7.5324432667482e-07, + "loss": 2.409, + "step": 108920 + }, + { + "epoch": 7.400801739366762, + "grad_norm": 7.937742233276367, + "learning_rate": 7.528196765864928e-07, + "loss": 2.8122, + "step": 108925 + }, + { + "epoch": 7.401141459437423, + "grad_norm": 9.400239944458008, + "learning_rate": 7.523950264981655e-07, + "loss": 2.8601, + "step": 108930 + }, + { + "epoch": 7.401481179508085, + "grad_norm": 9.099034309387207, + "learning_rate": 7.519703764098383e-07, + "loss": 2.7214, + "step": 108935 + }, + { + "epoch": 7.401820899578747, + "grad_norm": 7.91291618347168, + "learning_rate": 7.515457263215111e-07, + "loss": 2.6093, + "step": 108940 + }, + { + "epoch": 7.4021606196494085, + "grad_norm": 8.582574844360352, + "learning_rate": 7.511210762331838e-07, + "loss": 2.6641, + "step": 108945 + }, + { + "epoch": 7.402500339720071, + "grad_norm": 8.083342552185059, + "learning_rate": 7.506964261448567e-07, + "loss": 2.6542, + "step": 108950 + }, + { + "epoch": 7.402840059790733, + "grad_norm": 7.925924777984619, + "learning_rate": 7.502717760565295e-07, + "loss": 2.7196, + "step": 108955 + }, + { + "epoch": 7.403179779861394, + "grad_norm": 6.999407768249512, + "learning_rate": 7.498471259682022e-07, + "loss": 2.8623, + "step": 108960 + }, + { + "epoch": 7.403519499932056, + "grad_norm": 7.814270973205566, + "learning_rate": 7.49422475879875e-07, + "loss": 2.6651, + "step": 108965 + }, + { + "epoch": 7.403859220002718, + "grad_norm": 8.362404823303223, + "learning_rate": 7.489978257915478e-07, + "loss": 2.6895, + "step": 108970 + }, + { + "epoch": 7.404198940073379, + "grad_norm": 8.782601356506348, + "learning_rate": 7.485731757032206e-07, + "loss": 2.8235, + "step": 108975 + }, + { + "epoch": 7.404538660144041, + "grad_norm": 9.20459270477295, + "learning_rate": 7.481485256148933e-07, + "loss": 2.5796, + "step": 108980 + }, + { + "epoch": 7.404878380214703, + "grad_norm": 8.467878341674805, + "learning_rate": 7.477238755265661e-07, + "loss": 2.6652, + "step": 108985 + }, + { + "epoch": 7.4052181002853645, + "grad_norm": 7.594973564147949, + "learning_rate": 7.47299225438239e-07, + "loss": 2.5809, + "step": 108990 + }, + { + "epoch": 7.405557820356027, + "grad_norm": 8.692694664001465, + "learning_rate": 7.468745753499116e-07, + "loss": 2.7134, + "step": 108995 + }, + { + "epoch": 7.405897540426689, + "grad_norm": 7.1581244468688965, + "learning_rate": 7.464499252615845e-07, + "loss": 2.5759, + "step": 109000 + }, + { + "epoch": 7.40623726049735, + "grad_norm": 9.136553764343262, + "learning_rate": 7.460252751732573e-07, + "loss": 2.5537, + "step": 109005 + }, + { + "epoch": 7.406576980568012, + "grad_norm": 7.878888130187988, + "learning_rate": 7.456006250849301e-07, + "loss": 2.7482, + "step": 109010 + }, + { + "epoch": 7.406916700638674, + "grad_norm": 10.084283828735352, + "learning_rate": 7.451759749966028e-07, + "loss": 2.6667, + "step": 109015 + }, + { + "epoch": 7.407256420709335, + "grad_norm": 8.569806098937988, + "learning_rate": 7.447513249082756e-07, + "loss": 2.5513, + "step": 109020 + }, + { + "epoch": 7.407596140779997, + "grad_norm": 8.350968360900879, + "learning_rate": 7.443266748199484e-07, + "loss": 2.7424, + "step": 109025 + }, + { + "epoch": 7.407935860850659, + "grad_norm": 9.972041130065918, + "learning_rate": 7.439020247316211e-07, + "loss": 2.8217, + "step": 109030 + }, + { + "epoch": 7.4082755809213205, + "grad_norm": 6.937572479248047, + "learning_rate": 7.434773746432939e-07, + "loss": 2.5856, + "step": 109035 + }, + { + "epoch": 7.408615300991983, + "grad_norm": 7.873424053192139, + "learning_rate": 7.430527245549668e-07, + "loss": 2.446, + "step": 109040 + }, + { + "epoch": 7.408955021062645, + "grad_norm": 8.218826293945312, + "learning_rate": 7.426280744666395e-07, + "loss": 2.6342, + "step": 109045 + }, + { + "epoch": 7.409294741133306, + "grad_norm": 8.857415199279785, + "learning_rate": 7.422034243783123e-07, + "loss": 2.5502, + "step": 109050 + }, + { + "epoch": 7.409634461203968, + "grad_norm": 8.09196949005127, + "learning_rate": 7.417787742899851e-07, + "loss": 2.6312, + "step": 109055 + }, + { + "epoch": 7.40997418127463, + "grad_norm": 8.327770233154297, + "learning_rate": 7.413541242016579e-07, + "loss": 2.5915, + "step": 109060 + }, + { + "epoch": 7.410313901345291, + "grad_norm": 8.950363159179688, + "learning_rate": 7.409294741133306e-07, + "loss": 2.8477, + "step": 109065 + }, + { + "epoch": 7.410653621415953, + "grad_norm": 8.42910099029541, + "learning_rate": 7.405048240250034e-07, + "loss": 2.5552, + "step": 109070 + }, + { + "epoch": 7.410993341486615, + "grad_norm": 11.177499771118164, + "learning_rate": 7.400801739366762e-07, + "loss": 2.5288, + "step": 109075 + }, + { + "epoch": 7.411333061557277, + "grad_norm": 7.185899257659912, + "learning_rate": 7.396555238483489e-07, + "loss": 2.926, + "step": 109080 + }, + { + "epoch": 7.411672781627939, + "grad_norm": 8.83035945892334, + "learning_rate": 7.392308737600218e-07, + "loss": 2.6192, + "step": 109085 + }, + { + "epoch": 7.412012501698601, + "grad_norm": 8.338017463684082, + "learning_rate": 7.388062236716946e-07, + "loss": 2.4705, + "step": 109090 + }, + { + "epoch": 7.412352221769262, + "grad_norm": 8.649909973144531, + "learning_rate": 7.383815735833674e-07, + "loss": 2.6117, + "step": 109095 + }, + { + "epoch": 7.412691941839924, + "grad_norm": 7.45427131652832, + "learning_rate": 7.379569234950401e-07, + "loss": 2.6096, + "step": 109100 + }, + { + "epoch": 7.413031661910586, + "grad_norm": 10.358201026916504, + "learning_rate": 7.375322734067129e-07, + "loss": 2.6151, + "step": 109105 + }, + { + "epoch": 7.413371381981247, + "grad_norm": 8.538263320922852, + "learning_rate": 7.371076233183857e-07, + "loss": 2.753, + "step": 109110 + }, + { + "epoch": 7.413711102051909, + "grad_norm": 8.827425003051758, + "learning_rate": 7.366829732300584e-07, + "loss": 2.5359, + "step": 109115 + }, + { + "epoch": 7.414050822122571, + "grad_norm": 6.238026142120361, + "learning_rate": 7.362583231417312e-07, + "loss": 2.8116, + "step": 109120 + }, + { + "epoch": 7.414390542193233, + "grad_norm": 7.644581317901611, + "learning_rate": 7.358336730534041e-07, + "loss": 2.8355, + "step": 109125 + }, + { + "epoch": 7.414730262263895, + "grad_norm": 6.9503631591796875, + "learning_rate": 7.354090229650767e-07, + "loss": 2.7217, + "step": 109130 + }, + { + "epoch": 7.415069982334557, + "grad_norm": 7.781391143798828, + "learning_rate": 7.349843728767496e-07, + "loss": 2.8904, + "step": 109135 + }, + { + "epoch": 7.415409702405218, + "grad_norm": 6.268641948699951, + "learning_rate": 7.345597227884224e-07, + "loss": 2.6622, + "step": 109140 + }, + { + "epoch": 7.41574942247588, + "grad_norm": 10.047820091247559, + "learning_rate": 7.341350727000952e-07, + "loss": 2.7924, + "step": 109145 + }, + { + "epoch": 7.416089142546542, + "grad_norm": 9.655617713928223, + "learning_rate": 7.337104226117679e-07, + "loss": 2.3708, + "step": 109150 + }, + { + "epoch": 7.416428862617203, + "grad_norm": 5.985727310180664, + "learning_rate": 7.332857725234407e-07, + "loss": 2.7202, + "step": 109155 + }, + { + "epoch": 7.416768582687865, + "grad_norm": 6.788645267486572, + "learning_rate": 7.328611224351135e-07, + "loss": 2.5679, + "step": 109160 + }, + { + "epoch": 7.4171083027585265, + "grad_norm": 7.640200138092041, + "learning_rate": 7.324364723467862e-07, + "loss": 2.5504, + "step": 109165 + }, + { + "epoch": 7.417448022829189, + "grad_norm": 6.851373672485352, + "learning_rate": 7.32011822258459e-07, + "loss": 2.6322, + "step": 109170 + }, + { + "epoch": 7.417787742899851, + "grad_norm": 7.502577781677246, + "learning_rate": 7.315871721701319e-07, + "loss": 2.8913, + "step": 109175 + }, + { + "epoch": 7.418127462970512, + "grad_norm": 8.630245208740234, + "learning_rate": 7.311625220818047e-07, + "loss": 2.7317, + "step": 109180 + }, + { + "epoch": 7.418467183041174, + "grad_norm": 6.6192169189453125, + "learning_rate": 7.307378719934774e-07, + "loss": 2.6728, + "step": 109185 + }, + { + "epoch": 7.418806903111836, + "grad_norm": 7.414216995239258, + "learning_rate": 7.303132219051502e-07, + "loss": 2.7155, + "step": 109190 + }, + { + "epoch": 7.419146623182497, + "grad_norm": 9.767638206481934, + "learning_rate": 7.29888571816823e-07, + "loss": 2.8676, + "step": 109195 + }, + { + "epoch": 7.419486343253159, + "grad_norm": 8.440362930297852, + "learning_rate": 7.294639217284957e-07, + "loss": 2.6143, + "step": 109200 + }, + { + "epoch": 7.419826063323821, + "grad_norm": 7.34689998626709, + "learning_rate": 7.290392716401685e-07, + "loss": 2.7044, + "step": 109205 + }, + { + "epoch": 7.4201657833944825, + "grad_norm": 8.394227981567383, + "learning_rate": 7.286146215518413e-07, + "loss": 2.5543, + "step": 109210 + }, + { + "epoch": 7.420505503465145, + "grad_norm": 5.348016262054443, + "learning_rate": 7.28189971463514e-07, + "loss": 2.66, + "step": 109215 + }, + { + "epoch": 7.420845223535807, + "grad_norm": 6.874626636505127, + "learning_rate": 7.277653213751869e-07, + "loss": 2.6328, + "step": 109220 + }, + { + "epoch": 7.421184943606468, + "grad_norm": 7.701363563537598, + "learning_rate": 7.273406712868597e-07, + "loss": 2.8933, + "step": 109225 + }, + { + "epoch": 7.42152466367713, + "grad_norm": 8.156770706176758, + "learning_rate": 7.269160211985325e-07, + "loss": 2.603, + "step": 109230 + }, + { + "epoch": 7.421864383747792, + "grad_norm": 6.697198867797852, + "learning_rate": 7.264913711102052e-07, + "loss": 2.7132, + "step": 109235 + }, + { + "epoch": 7.422204103818453, + "grad_norm": 7.782395839691162, + "learning_rate": 7.26066721021878e-07, + "loss": 2.9235, + "step": 109240 + }, + { + "epoch": 7.422543823889115, + "grad_norm": 8.959189414978027, + "learning_rate": 7.256420709335508e-07, + "loss": 2.8303, + "step": 109245 + }, + { + "epoch": 7.422883543959777, + "grad_norm": 8.878335952758789, + "learning_rate": 7.252174208452235e-07, + "loss": 2.6513, + "step": 109250 + }, + { + "epoch": 7.4232232640304385, + "grad_norm": 7.195616245269775, + "learning_rate": 7.247927707568963e-07, + "loss": 2.7632, + "step": 109255 + }, + { + "epoch": 7.423562984101101, + "grad_norm": 8.479022026062012, + "learning_rate": 7.243681206685692e-07, + "loss": 2.5694, + "step": 109260 + }, + { + "epoch": 7.423902704171763, + "grad_norm": 7.139410495758057, + "learning_rate": 7.23943470580242e-07, + "loss": 2.9726, + "step": 109265 + }, + { + "epoch": 7.424242424242424, + "grad_norm": 6.320602893829346, + "learning_rate": 7.235188204919147e-07, + "loss": 2.9658, + "step": 109270 + }, + { + "epoch": 7.424582144313086, + "grad_norm": 8.865583419799805, + "learning_rate": 7.230941704035875e-07, + "loss": 2.8046, + "step": 109275 + }, + { + "epoch": 7.424921864383748, + "grad_norm": 8.993613243103027, + "learning_rate": 7.226695203152603e-07, + "loss": 2.8204, + "step": 109280 + }, + { + "epoch": 7.425261584454409, + "grad_norm": 6.651431560516357, + "learning_rate": 7.22244870226933e-07, + "loss": 2.8342, + "step": 109285 + }, + { + "epoch": 7.425601304525071, + "grad_norm": 7.714261054992676, + "learning_rate": 7.218202201386058e-07, + "loss": 2.7227, + "step": 109290 + }, + { + "epoch": 7.425941024595733, + "grad_norm": 8.649382591247559, + "learning_rate": 7.213955700502786e-07, + "loss": 2.3964, + "step": 109295 + }, + { + "epoch": 7.4262807446663945, + "grad_norm": 7.133782386779785, + "learning_rate": 7.209709199619513e-07, + "loss": 2.8102, + "step": 109300 + }, + { + "epoch": 7.426620464737057, + "grad_norm": 8.383637428283691, + "learning_rate": 7.205462698736242e-07, + "loss": 2.5957, + "step": 109305 + }, + { + "epoch": 7.426960184807719, + "grad_norm": 9.243467330932617, + "learning_rate": 7.20121619785297e-07, + "loss": 2.5497, + "step": 109310 + }, + { + "epoch": 7.42729990487838, + "grad_norm": 7.920694351196289, + "learning_rate": 7.196969696969698e-07, + "loss": 2.7019, + "step": 109315 + }, + { + "epoch": 7.427639624949042, + "grad_norm": 8.2942533493042, + "learning_rate": 7.192723196086425e-07, + "loss": 2.8892, + "step": 109320 + }, + { + "epoch": 7.427979345019704, + "grad_norm": 8.25329303741455, + "learning_rate": 7.188476695203153e-07, + "loss": 2.7554, + "step": 109325 + }, + { + "epoch": 7.428319065090365, + "grad_norm": 5.762096881866455, + "learning_rate": 7.184230194319881e-07, + "loss": 2.8044, + "step": 109330 + }, + { + "epoch": 7.428658785161027, + "grad_norm": 7.738041400909424, + "learning_rate": 7.179983693436608e-07, + "loss": 2.6916, + "step": 109335 + }, + { + "epoch": 7.428998505231689, + "grad_norm": 6.696270942687988, + "learning_rate": 7.175737192553336e-07, + "loss": 2.8965, + "step": 109340 + }, + { + "epoch": 7.4293382253023506, + "grad_norm": 9.190727233886719, + "learning_rate": 7.171490691670065e-07, + "loss": 2.6528, + "step": 109345 + }, + { + "epoch": 7.429677945373013, + "grad_norm": 7.991182327270508, + "learning_rate": 7.167244190786793e-07, + "loss": 2.867, + "step": 109350 + }, + { + "epoch": 7.430017665443675, + "grad_norm": 8.437281608581543, + "learning_rate": 7.16299768990352e-07, + "loss": 2.565, + "step": 109355 + }, + { + "epoch": 7.430357385514336, + "grad_norm": 6.760589599609375, + "learning_rate": 7.158751189020248e-07, + "loss": 2.7247, + "step": 109360 + }, + { + "epoch": 7.430697105584998, + "grad_norm": 8.637972831726074, + "learning_rate": 7.154504688136976e-07, + "loss": 2.9035, + "step": 109365 + }, + { + "epoch": 7.43103682565566, + "grad_norm": 8.967430114746094, + "learning_rate": 7.150258187253703e-07, + "loss": 2.8577, + "step": 109370 + }, + { + "epoch": 7.431376545726321, + "grad_norm": 8.775769233703613, + "learning_rate": 7.146011686370431e-07, + "loss": 2.7138, + "step": 109375 + }, + { + "epoch": 7.431716265796983, + "grad_norm": 7.03678035736084, + "learning_rate": 7.141765185487159e-07, + "loss": 2.8458, + "step": 109380 + }, + { + "epoch": 7.432055985867645, + "grad_norm": 9.185450553894043, + "learning_rate": 7.137518684603886e-07, + "loss": 2.7488, + "step": 109385 + }, + { + "epoch": 7.432395705938307, + "grad_norm": 7.761990547180176, + "learning_rate": 7.133272183720614e-07, + "loss": 2.8402, + "step": 109390 + }, + { + "epoch": 7.432735426008969, + "grad_norm": 8.099312782287598, + "learning_rate": 7.129025682837343e-07, + "loss": 2.6865, + "step": 109395 + }, + { + "epoch": 7.433075146079631, + "grad_norm": 7.811006546020508, + "learning_rate": 7.124779181954071e-07, + "loss": 2.802, + "step": 109400 + }, + { + "epoch": 7.433414866150292, + "grad_norm": 8.456084251403809, + "learning_rate": 7.120532681070798e-07, + "loss": 2.8119, + "step": 109405 + }, + { + "epoch": 7.433754586220954, + "grad_norm": 7.077322483062744, + "learning_rate": 7.116286180187526e-07, + "loss": 2.7287, + "step": 109410 + }, + { + "epoch": 7.434094306291616, + "grad_norm": 6.605988502502441, + "learning_rate": 7.112039679304254e-07, + "loss": 2.7, + "step": 109415 + }, + { + "epoch": 7.434434026362277, + "grad_norm": 7.941909313201904, + "learning_rate": 7.107793178420981e-07, + "loss": 2.8701, + "step": 109420 + }, + { + "epoch": 7.434773746432939, + "grad_norm": 9.82040786743164, + "learning_rate": 7.103546677537709e-07, + "loss": 2.8349, + "step": 109425 + }, + { + "epoch": 7.435113466503601, + "grad_norm": 8.318913459777832, + "learning_rate": 7.099300176654437e-07, + "loss": 2.5346, + "step": 109430 + }, + { + "epoch": 7.435453186574263, + "grad_norm": 7.0778303146362305, + "learning_rate": 7.095053675771166e-07, + "loss": 2.4962, + "step": 109435 + }, + { + "epoch": 7.435792906644925, + "grad_norm": 5.912898063659668, + "learning_rate": 7.090807174887893e-07, + "loss": 2.4395, + "step": 109440 + }, + { + "epoch": 7.436132626715587, + "grad_norm": 7.562562465667725, + "learning_rate": 7.086560674004621e-07, + "loss": 2.7554, + "step": 109445 + }, + { + "epoch": 7.436472346786248, + "grad_norm": 6.76503849029541, + "learning_rate": 7.082314173121349e-07, + "loss": 2.8992, + "step": 109450 + }, + { + "epoch": 7.43681206685691, + "grad_norm": 7.343021869659424, + "learning_rate": 7.078067672238076e-07, + "loss": 2.809, + "step": 109455 + }, + { + "epoch": 7.437151786927572, + "grad_norm": 8.871760368347168, + "learning_rate": 7.073821171354804e-07, + "loss": 2.6083, + "step": 109460 + }, + { + "epoch": 7.437491506998233, + "grad_norm": 9.157471656799316, + "learning_rate": 7.069574670471532e-07, + "loss": 2.5147, + "step": 109465 + }, + { + "epoch": 7.437831227068895, + "grad_norm": 8.270730018615723, + "learning_rate": 7.065328169588259e-07, + "loss": 2.707, + "step": 109470 + }, + { + "epoch": 7.438170947139557, + "grad_norm": 7.296523571014404, + "learning_rate": 7.061081668704987e-07, + "loss": 2.7308, + "step": 109475 + }, + { + "epoch": 7.438510667210219, + "grad_norm": 10.235407829284668, + "learning_rate": 7.056835167821716e-07, + "loss": 3.1034, + "step": 109480 + }, + { + "epoch": 7.438850387280881, + "grad_norm": 7.352067947387695, + "learning_rate": 7.052588666938444e-07, + "loss": 2.8184, + "step": 109485 + }, + { + "epoch": 7.439190107351543, + "grad_norm": 7.429886817932129, + "learning_rate": 7.048342166055171e-07, + "loss": 2.7245, + "step": 109490 + }, + { + "epoch": 7.439529827422204, + "grad_norm": 8.6124267578125, + "learning_rate": 7.044095665171899e-07, + "loss": 3.0132, + "step": 109495 + }, + { + "epoch": 7.439869547492866, + "grad_norm": 10.115928649902344, + "learning_rate": 7.039849164288627e-07, + "loss": 2.6695, + "step": 109500 + }, + { + "epoch": 7.440209267563528, + "grad_norm": 9.733351707458496, + "learning_rate": 7.035602663405354e-07, + "loss": 2.6893, + "step": 109505 + }, + { + "epoch": 7.440548987634189, + "grad_norm": 8.078473091125488, + "learning_rate": 7.031356162522082e-07, + "loss": 2.6027, + "step": 109510 + }, + { + "epoch": 7.440888707704851, + "grad_norm": 8.342901229858398, + "learning_rate": 7.02710966163881e-07, + "loss": 2.7548, + "step": 109515 + }, + { + "epoch": 7.441228427775513, + "grad_norm": 7.925652027130127, + "learning_rate": 7.022863160755539e-07, + "loss": 2.737, + "step": 109520 + }, + { + "epoch": 7.441568147846175, + "grad_norm": 6.454066753387451, + "learning_rate": 7.018616659872265e-07, + "loss": 2.6241, + "step": 109525 + }, + { + "epoch": 7.441907867916837, + "grad_norm": 7.666416168212891, + "learning_rate": 7.014370158988994e-07, + "loss": 2.7583, + "step": 109530 + }, + { + "epoch": 7.442247587987499, + "grad_norm": 8.006790161132812, + "learning_rate": 7.010123658105722e-07, + "loss": 2.7853, + "step": 109535 + }, + { + "epoch": 7.44258730805816, + "grad_norm": 7.4551472663879395, + "learning_rate": 7.005877157222449e-07, + "loss": 2.7054, + "step": 109540 + }, + { + "epoch": 7.442927028128822, + "grad_norm": 6.121517658233643, + "learning_rate": 7.001630656339177e-07, + "loss": 2.6407, + "step": 109545 + }, + { + "epoch": 7.443266748199484, + "grad_norm": 9.146050453186035, + "learning_rate": 6.997384155455905e-07, + "loss": 3.04, + "step": 109550 + }, + { + "epoch": 7.443606468270145, + "grad_norm": 7.152986526489258, + "learning_rate": 6.993137654572632e-07, + "loss": 2.6994, + "step": 109555 + }, + { + "epoch": 7.443946188340807, + "grad_norm": 10.597867012023926, + "learning_rate": 6.98889115368936e-07, + "loss": 2.6298, + "step": 109560 + }, + { + "epoch": 7.444285908411469, + "grad_norm": 6.750720977783203, + "learning_rate": 6.984644652806088e-07, + "loss": 2.6761, + "step": 109565 + }, + { + "epoch": 7.444625628482131, + "grad_norm": 7.304224967956543, + "learning_rate": 6.980398151922817e-07, + "loss": 2.8733, + "step": 109570 + }, + { + "epoch": 7.444965348552793, + "grad_norm": 7.150983810424805, + "learning_rate": 6.976151651039544e-07, + "loss": 2.7512, + "step": 109575 + }, + { + "epoch": 7.445305068623454, + "grad_norm": 9.030220985412598, + "learning_rate": 6.971905150156272e-07, + "loss": 2.8782, + "step": 109580 + }, + { + "epoch": 7.445644788694116, + "grad_norm": 8.015070915222168, + "learning_rate": 6.967658649273e-07, + "loss": 2.8387, + "step": 109585 + }, + { + "epoch": 7.445984508764778, + "grad_norm": 7.153843402862549, + "learning_rate": 6.963412148389727e-07, + "loss": 2.6731, + "step": 109590 + }, + { + "epoch": 7.446324228835439, + "grad_norm": 7.47194766998291, + "learning_rate": 6.959165647506455e-07, + "loss": 2.5896, + "step": 109595 + }, + { + "epoch": 7.446663948906101, + "grad_norm": 10.975841522216797, + "learning_rate": 6.954919146623183e-07, + "loss": 2.7518, + "step": 109600 + }, + { + "epoch": 7.447003668976763, + "grad_norm": 9.89434814453125, + "learning_rate": 6.950672645739911e-07, + "loss": 2.7044, + "step": 109605 + }, + { + "epoch": 7.4473433890474245, + "grad_norm": 8.860107421875, + "learning_rate": 6.946426144856638e-07, + "loss": 2.7056, + "step": 109610 + }, + { + "epoch": 7.447683109118087, + "grad_norm": 7.792104244232178, + "learning_rate": 6.942179643973367e-07, + "loss": 2.6154, + "step": 109615 + }, + { + "epoch": 7.448022829188749, + "grad_norm": 8.898449897766113, + "learning_rate": 6.937933143090095e-07, + "loss": 2.831, + "step": 109620 + }, + { + "epoch": 7.44836254925941, + "grad_norm": 7.437720775604248, + "learning_rate": 6.933686642206822e-07, + "loss": 2.5442, + "step": 109625 + }, + { + "epoch": 7.448702269330072, + "grad_norm": 9.54236125946045, + "learning_rate": 6.92944014132355e-07, + "loss": 2.9089, + "step": 109630 + }, + { + "epoch": 7.449041989400734, + "grad_norm": 9.156172752380371, + "learning_rate": 6.925193640440278e-07, + "loss": 2.8198, + "step": 109635 + }, + { + "epoch": 7.449381709471395, + "grad_norm": 6.257879734039307, + "learning_rate": 6.920947139557005e-07, + "loss": 2.7219, + "step": 109640 + }, + { + "epoch": 7.449721429542057, + "grad_norm": 8.18147087097168, + "learning_rate": 6.916700638673733e-07, + "loss": 2.8614, + "step": 109645 + }, + { + "epoch": 7.450061149612719, + "grad_norm": 7.414788246154785, + "learning_rate": 6.912454137790461e-07, + "loss": 2.5843, + "step": 109650 + }, + { + "epoch": 7.4504008696833806, + "grad_norm": 7.962213516235352, + "learning_rate": 6.90820763690719e-07, + "loss": 2.4835, + "step": 109655 + }, + { + "epoch": 7.450740589754043, + "grad_norm": 7.20475959777832, + "learning_rate": 6.903961136023916e-07, + "loss": 2.7899, + "step": 109660 + }, + { + "epoch": 7.451080309824705, + "grad_norm": 6.682797431945801, + "learning_rate": 6.899714635140645e-07, + "loss": 2.521, + "step": 109665 + }, + { + "epoch": 7.451420029895366, + "grad_norm": 8.97825813293457, + "learning_rate": 6.895468134257373e-07, + "loss": 2.752, + "step": 109670 + }, + { + "epoch": 7.451759749966028, + "grad_norm": 6.8530192375183105, + "learning_rate": 6.8912216333741e-07, + "loss": 2.7537, + "step": 109675 + }, + { + "epoch": 7.45209947003669, + "grad_norm": 6.82707405090332, + "learning_rate": 6.886975132490828e-07, + "loss": 2.5463, + "step": 109680 + }, + { + "epoch": 7.452439190107351, + "grad_norm": 9.4226713180542, + "learning_rate": 6.882728631607556e-07, + "loss": 2.6212, + "step": 109685 + }, + { + "epoch": 7.452778910178013, + "grad_norm": 8.16360092163086, + "learning_rate": 6.878482130724284e-07, + "loss": 2.677, + "step": 109690 + }, + { + "epoch": 7.453118630248675, + "grad_norm": 7.2322916984558105, + "learning_rate": 6.874235629841011e-07, + "loss": 2.6049, + "step": 109695 + }, + { + "epoch": 7.453458350319337, + "grad_norm": 8.927751541137695, + "learning_rate": 6.86998912895774e-07, + "loss": 2.6873, + "step": 109700 + }, + { + "epoch": 7.453798070389999, + "grad_norm": 7.9958062171936035, + "learning_rate": 6.865742628074468e-07, + "loss": 2.6543, + "step": 109705 + }, + { + "epoch": 7.454137790460661, + "grad_norm": 7.099665641784668, + "learning_rate": 6.861496127191195e-07, + "loss": 2.5672, + "step": 109710 + }, + { + "epoch": 7.454477510531322, + "grad_norm": 10.638300895690918, + "learning_rate": 6.857249626307923e-07, + "loss": 2.6369, + "step": 109715 + }, + { + "epoch": 7.454817230601984, + "grad_norm": 8.513601303100586, + "learning_rate": 6.853003125424651e-07, + "loss": 2.7027, + "step": 109720 + }, + { + "epoch": 7.455156950672646, + "grad_norm": 7.461792945861816, + "learning_rate": 6.848756624541378e-07, + "loss": 2.7515, + "step": 109725 + }, + { + "epoch": 7.455496670743307, + "grad_norm": 8.924720764160156, + "learning_rate": 6.844510123658106e-07, + "loss": 2.4643, + "step": 109730 + }, + { + "epoch": 7.455836390813969, + "grad_norm": 7.551925182342529, + "learning_rate": 6.840263622774834e-07, + "loss": 2.7364, + "step": 109735 + }, + { + "epoch": 7.456176110884631, + "grad_norm": 7.401301383972168, + "learning_rate": 6.836017121891563e-07, + "loss": 2.8092, + "step": 109740 + }, + { + "epoch": 7.456515830955293, + "grad_norm": 6.763650417327881, + "learning_rate": 6.831770621008289e-07, + "loss": 2.7158, + "step": 109745 + }, + { + "epoch": 7.456855551025955, + "grad_norm": 6.643920421600342, + "learning_rate": 6.827524120125018e-07, + "loss": 2.7641, + "step": 109750 + }, + { + "epoch": 7.457195271096617, + "grad_norm": 7.487472057342529, + "learning_rate": 6.823277619241746e-07, + "loss": 2.8075, + "step": 109755 + }, + { + "epoch": 7.457534991167278, + "grad_norm": 6.874231815338135, + "learning_rate": 6.819031118358473e-07, + "loss": 2.6145, + "step": 109760 + }, + { + "epoch": 7.45787471123794, + "grad_norm": 8.29895305633545, + "learning_rate": 6.814784617475201e-07, + "loss": 2.6793, + "step": 109765 + }, + { + "epoch": 7.458214431308602, + "grad_norm": 8.052714347839355, + "learning_rate": 6.810538116591929e-07, + "loss": 2.6823, + "step": 109770 + }, + { + "epoch": 7.458554151379263, + "grad_norm": 9.319953918457031, + "learning_rate": 6.806291615708657e-07, + "loss": 2.6265, + "step": 109775 + }, + { + "epoch": 7.458893871449925, + "grad_norm": 7.320334434509277, + "learning_rate": 6.802045114825384e-07, + "loss": 2.446, + "step": 109780 + }, + { + "epoch": 7.459233591520587, + "grad_norm": 10.332253456115723, + "learning_rate": 6.797798613942112e-07, + "loss": 2.6836, + "step": 109785 + }, + { + "epoch": 7.459573311591249, + "grad_norm": 11.278948783874512, + "learning_rate": 6.793552113058841e-07, + "loss": 2.6527, + "step": 109790 + }, + { + "epoch": 7.459913031661911, + "grad_norm": 9.379776954650879, + "learning_rate": 6.789305612175568e-07, + "loss": 2.8362, + "step": 109795 + }, + { + "epoch": 7.460252751732573, + "grad_norm": 8.82522201538086, + "learning_rate": 6.785059111292296e-07, + "loss": 2.6449, + "step": 109800 + }, + { + "epoch": 7.460592471803234, + "grad_norm": 7.458028793334961, + "learning_rate": 6.780812610409024e-07, + "loss": 2.8415, + "step": 109805 + }, + { + "epoch": 7.460932191873896, + "grad_norm": 7.092739582061768, + "learning_rate": 6.776566109525751e-07, + "loss": 2.8052, + "step": 109810 + }, + { + "epoch": 7.461271911944558, + "grad_norm": 6.087337493896484, + "learning_rate": 6.772319608642479e-07, + "loss": 2.9279, + "step": 109815 + }, + { + "epoch": 7.461611632015219, + "grad_norm": 9.681904792785645, + "learning_rate": 6.768073107759207e-07, + "loss": 2.7739, + "step": 109820 + }, + { + "epoch": 7.461951352085881, + "grad_norm": 7.334592342376709, + "learning_rate": 6.763826606875935e-07, + "loss": 2.5598, + "step": 109825 + }, + { + "epoch": 7.462291072156543, + "grad_norm": 7.700098037719727, + "learning_rate": 6.759580105992662e-07, + "loss": 3.01, + "step": 109830 + }, + { + "epoch": 7.462630792227205, + "grad_norm": 7.214632034301758, + "learning_rate": 6.755333605109391e-07, + "loss": 2.6109, + "step": 109835 + }, + { + "epoch": 7.462970512297867, + "grad_norm": 7.583872318267822, + "learning_rate": 6.751087104226119e-07, + "loss": 2.6586, + "step": 109840 + }, + { + "epoch": 7.463310232368528, + "grad_norm": 7.28479528427124, + "learning_rate": 6.746840603342846e-07, + "loss": 2.7035, + "step": 109845 + }, + { + "epoch": 7.46364995243919, + "grad_norm": 5.927567958831787, + "learning_rate": 6.742594102459574e-07, + "loss": 2.6061, + "step": 109850 + }, + { + "epoch": 7.463989672509852, + "grad_norm": 8.536701202392578, + "learning_rate": 6.738347601576302e-07, + "loss": 2.4975, + "step": 109855 + }, + { + "epoch": 7.464329392580513, + "grad_norm": 8.99262523651123, + "learning_rate": 6.73410110069303e-07, + "loss": 2.8519, + "step": 109860 + }, + { + "epoch": 7.464669112651175, + "grad_norm": 7.320992469787598, + "learning_rate": 6.729854599809757e-07, + "loss": 2.5283, + "step": 109865 + }, + { + "epoch": 7.465008832721837, + "grad_norm": 8.901582717895508, + "learning_rate": 6.725608098926485e-07, + "loss": 2.3045, + "step": 109870 + }, + { + "epoch": 7.4653485527924985, + "grad_norm": 6.896786689758301, + "learning_rate": 6.721361598043214e-07, + "loss": 2.8536, + "step": 109875 + }, + { + "epoch": 7.465688272863161, + "grad_norm": 8.881342887878418, + "learning_rate": 6.71711509715994e-07, + "loss": 2.6502, + "step": 109880 + }, + { + "epoch": 7.466027992933823, + "grad_norm": 9.692427635192871, + "learning_rate": 6.712868596276669e-07, + "loss": 2.8606, + "step": 109885 + }, + { + "epoch": 7.466367713004484, + "grad_norm": 10.096088409423828, + "learning_rate": 6.708622095393397e-07, + "loss": 2.6194, + "step": 109890 + }, + { + "epoch": 7.466707433075146, + "grad_norm": 7.535724639892578, + "learning_rate": 6.704375594510124e-07, + "loss": 2.9383, + "step": 109895 + }, + { + "epoch": 7.467047153145808, + "grad_norm": 7.330959796905518, + "learning_rate": 6.700129093626852e-07, + "loss": 2.7917, + "step": 109900 + }, + { + "epoch": 7.467386873216469, + "grad_norm": 8.823891639709473, + "learning_rate": 6.69588259274358e-07, + "loss": 2.6274, + "step": 109905 + }, + { + "epoch": 7.467726593287131, + "grad_norm": 6.400233745574951, + "learning_rate": 6.691636091860308e-07, + "loss": 2.5397, + "step": 109910 + }, + { + "epoch": 7.468066313357793, + "grad_norm": 8.59380054473877, + "learning_rate": 6.687389590977035e-07, + "loss": 2.7606, + "step": 109915 + }, + { + "epoch": 7.4684060334284545, + "grad_norm": 8.447466850280762, + "learning_rate": 6.683143090093763e-07, + "loss": 2.8041, + "step": 109920 + }, + { + "epoch": 7.468745753499117, + "grad_norm": 8.553298950195312, + "learning_rate": 6.678896589210492e-07, + "loss": 2.7339, + "step": 109925 + }, + { + "epoch": 7.469085473569779, + "grad_norm": 7.85037088394165, + "learning_rate": 6.674650088327219e-07, + "loss": 2.6724, + "step": 109930 + }, + { + "epoch": 7.46942519364044, + "grad_norm": 8.571322441101074, + "learning_rate": 6.670403587443947e-07, + "loss": 2.5857, + "step": 109935 + }, + { + "epoch": 7.469764913711102, + "grad_norm": 7.23392391204834, + "learning_rate": 6.666157086560675e-07, + "loss": 2.687, + "step": 109940 + }, + { + "epoch": 7.470104633781764, + "grad_norm": 8.430475234985352, + "learning_rate": 6.661910585677403e-07, + "loss": 2.5693, + "step": 109945 + }, + { + "epoch": 7.470444353852425, + "grad_norm": 8.181297302246094, + "learning_rate": 6.65766408479413e-07, + "loss": 2.6684, + "step": 109950 + }, + { + "epoch": 7.470784073923087, + "grad_norm": 8.477030754089355, + "learning_rate": 6.653417583910858e-07, + "loss": 2.4961, + "step": 109955 + }, + { + "epoch": 7.471123793993749, + "grad_norm": 9.992152214050293, + "learning_rate": 6.649171083027586e-07, + "loss": 2.9703, + "step": 109960 + }, + { + "epoch": 7.471463514064411, + "grad_norm": 9.986894607543945, + "learning_rate": 6.644924582144313e-07, + "loss": 2.7632, + "step": 109965 + }, + { + "epoch": 7.471803234135073, + "grad_norm": 7.957117080688477, + "learning_rate": 6.640678081261042e-07, + "loss": 2.5826, + "step": 109970 + }, + { + "epoch": 7.472142954205735, + "grad_norm": 9.542886734008789, + "learning_rate": 6.63643158037777e-07, + "loss": 2.5899, + "step": 109975 + }, + { + "epoch": 7.472482674276396, + "grad_norm": 9.957964897155762, + "learning_rate": 6.632185079494497e-07, + "loss": 2.8554, + "step": 109980 + }, + { + "epoch": 7.472822394347058, + "grad_norm": 7.5473856925964355, + "learning_rate": 6.627938578611225e-07, + "loss": 2.8573, + "step": 109985 + }, + { + "epoch": 7.47316211441772, + "grad_norm": 8.229348182678223, + "learning_rate": 6.623692077727953e-07, + "loss": 2.4785, + "step": 109990 + }, + { + "epoch": 7.473501834488381, + "grad_norm": 6.412472724914551, + "learning_rate": 6.619445576844681e-07, + "loss": 2.7318, + "step": 109995 + }, + { + "epoch": 7.473841554559043, + "grad_norm": 9.459336280822754, + "learning_rate": 6.615199075961408e-07, + "loss": 2.6339, + "step": 110000 + }, + { + "epoch": 7.474181274629705, + "grad_norm": 8.120963096618652, + "learning_rate": 6.610952575078136e-07, + "loss": 2.7618, + "step": 110005 + }, + { + "epoch": 7.474520994700367, + "grad_norm": 7.368685245513916, + "learning_rate": 6.606706074194865e-07, + "loss": 2.7154, + "step": 110010 + }, + { + "epoch": 7.474860714771029, + "grad_norm": 8.6095609664917, + "learning_rate": 6.602459573311591e-07, + "loss": 2.6068, + "step": 110015 + }, + { + "epoch": 7.475200434841691, + "grad_norm": 7.539052486419678, + "learning_rate": 6.59821307242832e-07, + "loss": 2.4481, + "step": 110020 + }, + { + "epoch": 7.475540154912352, + "grad_norm": 9.853160858154297, + "learning_rate": 6.593966571545048e-07, + "loss": 2.7608, + "step": 110025 + }, + { + "epoch": 7.475879874983014, + "grad_norm": 7.543886184692383, + "learning_rate": 6.589720070661776e-07, + "loss": 2.7499, + "step": 110030 + }, + { + "epoch": 7.476219595053676, + "grad_norm": 8.20207691192627, + "learning_rate": 6.585473569778503e-07, + "loss": 2.6723, + "step": 110035 + }, + { + "epoch": 7.476559315124337, + "grad_norm": 7.392677307128906, + "learning_rate": 6.581227068895231e-07, + "loss": 2.4815, + "step": 110040 + }, + { + "epoch": 7.476899035194999, + "grad_norm": 9.308073043823242, + "learning_rate": 6.576980568011959e-07, + "loss": 2.4645, + "step": 110045 + }, + { + "epoch": 7.477238755265661, + "grad_norm": 6.854190349578857, + "learning_rate": 6.572734067128686e-07, + "loss": 2.6404, + "step": 110050 + }, + { + "epoch": 7.477578475336323, + "grad_norm": 7.359142303466797, + "learning_rate": 6.568487566245415e-07, + "loss": 2.7075, + "step": 110055 + }, + { + "epoch": 7.477918195406985, + "grad_norm": 7.869117259979248, + "learning_rate": 6.564241065362143e-07, + "loss": 2.4786, + "step": 110060 + }, + { + "epoch": 7.478257915477647, + "grad_norm": 7.164360046386719, + "learning_rate": 6.55999456447887e-07, + "loss": 2.8881, + "step": 110065 + }, + { + "epoch": 7.478597635548308, + "grad_norm": 7.825385093688965, + "learning_rate": 6.555748063595598e-07, + "loss": 2.473, + "step": 110070 + }, + { + "epoch": 7.47893735561897, + "grad_norm": 7.633702754974365, + "learning_rate": 6.551501562712326e-07, + "loss": 2.8404, + "step": 110075 + }, + { + "epoch": 7.479277075689632, + "grad_norm": 8.828535079956055, + "learning_rate": 6.547255061829054e-07, + "loss": 2.6963, + "step": 110080 + }, + { + "epoch": 7.479616795760293, + "grad_norm": 8.054842948913574, + "learning_rate": 6.543008560945781e-07, + "loss": 2.5401, + "step": 110085 + }, + { + "epoch": 7.479956515830955, + "grad_norm": 7.5545878410339355, + "learning_rate": 6.538762060062509e-07, + "loss": 2.4801, + "step": 110090 + }, + { + "epoch": 7.480296235901617, + "grad_norm": 7.9318060874938965, + "learning_rate": 6.534515559179238e-07, + "loss": 2.9573, + "step": 110095 + }, + { + "epoch": 7.480635955972279, + "grad_norm": 8.380894660949707, + "learning_rate": 6.530269058295964e-07, + "loss": 2.6779, + "step": 110100 + }, + { + "epoch": 7.480975676042941, + "grad_norm": 8.292238235473633, + "learning_rate": 6.526022557412693e-07, + "loss": 2.6871, + "step": 110105 + }, + { + "epoch": 7.481315396113603, + "grad_norm": 8.208317756652832, + "learning_rate": 6.521776056529421e-07, + "loss": 2.7959, + "step": 110110 + }, + { + "epoch": 7.481655116184264, + "grad_norm": 6.852465629577637, + "learning_rate": 6.517529555646149e-07, + "loss": 2.7814, + "step": 110115 + }, + { + "epoch": 7.481994836254926, + "grad_norm": 8.621064186096191, + "learning_rate": 6.513283054762876e-07, + "loss": 2.8019, + "step": 110120 + }, + { + "epoch": 7.482334556325588, + "grad_norm": 8.425521850585938, + "learning_rate": 6.509036553879604e-07, + "loss": 2.6146, + "step": 110125 + }, + { + "epoch": 7.482674276396249, + "grad_norm": 8.509178161621094, + "learning_rate": 6.504790052996332e-07, + "loss": 2.6042, + "step": 110130 + }, + { + "epoch": 7.483013996466911, + "grad_norm": 8.478939056396484, + "learning_rate": 6.500543552113059e-07, + "loss": 2.6801, + "step": 110135 + }, + { + "epoch": 7.483353716537573, + "grad_norm": 8.220926284790039, + "learning_rate": 6.496297051229787e-07, + "loss": 2.7068, + "step": 110140 + }, + { + "epoch": 7.483693436608235, + "grad_norm": 6.885931968688965, + "learning_rate": 6.492050550346516e-07, + "loss": 2.8408, + "step": 110145 + }, + { + "epoch": 7.484033156678897, + "grad_norm": 6.651174545288086, + "learning_rate": 6.487804049463243e-07, + "loss": 2.7736, + "step": 110150 + }, + { + "epoch": 7.484372876749559, + "grad_norm": 8.069007873535156, + "learning_rate": 6.483557548579971e-07, + "loss": 2.8136, + "step": 110155 + }, + { + "epoch": 7.48471259682022, + "grad_norm": 7.461751937866211, + "learning_rate": 6.479311047696699e-07, + "loss": 2.5271, + "step": 110160 + }, + { + "epoch": 7.485052316890882, + "grad_norm": 7.535569190979004, + "learning_rate": 6.475064546813427e-07, + "loss": 2.5443, + "step": 110165 + }, + { + "epoch": 7.485392036961544, + "grad_norm": 8.207982063293457, + "learning_rate": 6.470818045930154e-07, + "loss": 2.7718, + "step": 110170 + }, + { + "epoch": 7.485731757032205, + "grad_norm": 8.432657241821289, + "learning_rate": 6.466571545046882e-07, + "loss": 2.857, + "step": 110175 + }, + { + "epoch": 7.486071477102867, + "grad_norm": 9.147743225097656, + "learning_rate": 6.46232504416361e-07, + "loss": 2.5263, + "step": 110180 + }, + { + "epoch": 7.486411197173529, + "grad_norm": 7.270946025848389, + "learning_rate": 6.458078543280337e-07, + "loss": 2.6922, + "step": 110185 + }, + { + "epoch": 7.486750917244191, + "grad_norm": 6.8556084632873535, + "learning_rate": 6.453832042397066e-07, + "loss": 2.805, + "step": 110190 + }, + { + "epoch": 7.487090637314853, + "grad_norm": 7.039989471435547, + "learning_rate": 6.449585541513794e-07, + "loss": 2.6797, + "step": 110195 + }, + { + "epoch": 7.487430357385515, + "grad_norm": 9.600274085998535, + "learning_rate": 6.445339040630522e-07, + "loss": 2.4339, + "step": 110200 + }, + { + "epoch": 7.487770077456176, + "grad_norm": 8.777997016906738, + "learning_rate": 6.441092539747249e-07, + "loss": 2.7963, + "step": 110205 + }, + { + "epoch": 7.488109797526838, + "grad_norm": 9.62950325012207, + "learning_rate": 6.436846038863977e-07, + "loss": 2.789, + "step": 110210 + }, + { + "epoch": 7.4884495175975, + "grad_norm": 6.453490257263184, + "learning_rate": 6.432599537980705e-07, + "loss": 2.9321, + "step": 110215 + }, + { + "epoch": 7.488789237668161, + "grad_norm": 9.244599342346191, + "learning_rate": 6.428353037097432e-07, + "loss": 2.3888, + "step": 110220 + }, + { + "epoch": 7.489128957738823, + "grad_norm": 9.280381202697754, + "learning_rate": 6.42410653621416e-07, + "loss": 2.834, + "step": 110225 + }, + { + "epoch": 7.489468677809485, + "grad_norm": 7.087199687957764, + "learning_rate": 6.419860035330889e-07, + "loss": 2.735, + "step": 110230 + }, + { + "epoch": 7.489808397880147, + "grad_norm": 5.486847400665283, + "learning_rate": 6.415613534447615e-07, + "loss": 2.6809, + "step": 110235 + }, + { + "epoch": 7.490148117950809, + "grad_norm": 8.627104759216309, + "learning_rate": 6.411367033564344e-07, + "loss": 2.761, + "step": 110240 + }, + { + "epoch": 7.490487838021471, + "grad_norm": 8.299152374267578, + "learning_rate": 6.407120532681072e-07, + "loss": 2.5217, + "step": 110245 + }, + { + "epoch": 7.490827558092132, + "grad_norm": 8.358835220336914, + "learning_rate": 6.4028740317978e-07, + "loss": 2.6668, + "step": 110250 + }, + { + "epoch": 7.491167278162794, + "grad_norm": 7.710747241973877, + "learning_rate": 6.398627530914527e-07, + "loss": 2.6459, + "step": 110255 + }, + { + "epoch": 7.491506998233455, + "grad_norm": 7.808692455291748, + "learning_rate": 6.394381030031255e-07, + "loss": 2.714, + "step": 110260 + }, + { + "epoch": 7.491846718304117, + "grad_norm": 9.038928031921387, + "learning_rate": 6.390134529147983e-07, + "loss": 2.8598, + "step": 110265 + }, + { + "epoch": 7.492186438374779, + "grad_norm": 8.643088340759277, + "learning_rate": 6.38588802826471e-07, + "loss": 2.8005, + "step": 110270 + }, + { + "epoch": 7.492526158445441, + "grad_norm": 7.325209140777588, + "learning_rate": 6.381641527381438e-07, + "loss": 2.6677, + "step": 110275 + }, + { + "epoch": 7.492865878516103, + "grad_norm": 8.851298332214355, + "learning_rate": 6.377395026498167e-07, + "loss": 2.5801, + "step": 110280 + }, + { + "epoch": 7.493205598586765, + "grad_norm": 8.149370193481445, + "learning_rate": 6.373148525614895e-07, + "loss": 2.8204, + "step": 110285 + }, + { + "epoch": 7.493545318657426, + "grad_norm": 9.055127143859863, + "learning_rate": 6.368902024731622e-07, + "loss": 2.5666, + "step": 110290 + }, + { + "epoch": 7.493885038728088, + "grad_norm": 7.310084819793701, + "learning_rate": 6.36465552384835e-07, + "loss": 2.6795, + "step": 110295 + }, + { + "epoch": 7.49422475879875, + "grad_norm": 7.3816657066345215, + "learning_rate": 6.360409022965078e-07, + "loss": 2.7238, + "step": 110300 + }, + { + "epoch": 7.494564478869411, + "grad_norm": 7.431446552276611, + "learning_rate": 6.356162522081805e-07, + "loss": 2.6183, + "step": 110305 + }, + { + "epoch": 7.494904198940073, + "grad_norm": 7.44704008102417, + "learning_rate": 6.351916021198533e-07, + "loss": 2.6582, + "step": 110310 + }, + { + "epoch": 7.495243919010735, + "grad_norm": 10.55724811553955, + "learning_rate": 6.347669520315261e-07, + "loss": 2.9726, + "step": 110315 + }, + { + "epoch": 7.495583639081397, + "grad_norm": 7.361636161804199, + "learning_rate": 6.343423019431988e-07, + "loss": 2.6297, + "step": 110320 + }, + { + "epoch": 7.495923359152059, + "grad_norm": 6.6251115798950195, + "learning_rate": 6.339176518548717e-07, + "loss": 2.5982, + "step": 110325 + }, + { + "epoch": 7.496263079222721, + "grad_norm": 6.774097919464111, + "learning_rate": 6.334930017665445e-07, + "loss": 2.5184, + "step": 110330 + }, + { + "epoch": 7.496602799293382, + "grad_norm": 8.271601676940918, + "learning_rate": 6.330683516782173e-07, + "loss": 2.9095, + "step": 110335 + }, + { + "epoch": 7.496942519364044, + "grad_norm": 7.5065999031066895, + "learning_rate": 6.3264370158989e-07, + "loss": 2.6564, + "step": 110340 + }, + { + "epoch": 7.497282239434706, + "grad_norm": 8.532506942749023, + "learning_rate": 6.322190515015628e-07, + "loss": 2.5865, + "step": 110345 + }, + { + "epoch": 7.497621959505367, + "grad_norm": 7.21927547454834, + "learning_rate": 6.317944014132356e-07, + "loss": 2.7535, + "step": 110350 + }, + { + "epoch": 7.497961679576029, + "grad_norm": 8.359955787658691, + "learning_rate": 6.313697513249083e-07, + "loss": 2.8769, + "step": 110355 + }, + { + "epoch": 7.498301399646691, + "grad_norm": 7.735912799835205, + "learning_rate": 6.30945101236581e-07, + "loss": 2.8542, + "step": 110360 + }, + { + "epoch": 7.498641119717353, + "grad_norm": 7.261948585510254, + "learning_rate": 6.30520451148254e-07, + "loss": 2.7246, + "step": 110365 + }, + { + "epoch": 7.498980839788015, + "grad_norm": 8.595059394836426, + "learning_rate": 6.300958010599268e-07, + "loss": 2.7085, + "step": 110370 + }, + { + "epoch": 7.499320559858677, + "grad_norm": 7.937162399291992, + "learning_rate": 6.296711509715995e-07, + "loss": 2.604, + "step": 110375 + }, + { + "epoch": 7.499660279929338, + "grad_norm": 7.191903591156006, + "learning_rate": 6.292465008832723e-07, + "loss": 2.773, + "step": 110380 + }, + { + "epoch": 7.5, + "grad_norm": 9.075708389282227, + "learning_rate": 6.288218507949451e-07, + "loss": 2.9871, + "step": 110385 + }, + { + "epoch": 7.500339720070662, + "grad_norm": 6.415442943572998, + "learning_rate": 6.283972007066178e-07, + "loss": 2.5763, + "step": 110390 + }, + { + "epoch": 7.500679440141323, + "grad_norm": 6.907186508178711, + "learning_rate": 6.279725506182906e-07, + "loss": 2.6967, + "step": 110395 + }, + { + "epoch": 7.501019160211985, + "grad_norm": 7.661083221435547, + "learning_rate": 6.275479005299634e-07, + "loss": 2.7631, + "step": 110400 + }, + { + "epoch": 7.501358880282647, + "grad_norm": 8.61470890045166, + "learning_rate": 6.27123250441636e-07, + "loss": 2.5576, + "step": 110405 + }, + { + "epoch": 7.501698600353309, + "grad_norm": 8.01742935180664, + "learning_rate": 6.266986003533088e-07, + "loss": 2.2789, + "step": 110410 + }, + { + "epoch": 7.502038320423971, + "grad_norm": 7.58854341506958, + "learning_rate": 6.262739502649818e-07, + "loss": 2.4033, + "step": 110415 + }, + { + "epoch": 7.502378040494633, + "grad_norm": 8.394571304321289, + "learning_rate": 6.258493001766546e-07, + "loss": 2.7835, + "step": 110420 + }, + { + "epoch": 7.502717760565294, + "grad_norm": 7.371801376342773, + "learning_rate": 6.254246500883273e-07, + "loss": 2.7366, + "step": 110425 + }, + { + "epoch": 7.503057480635956, + "grad_norm": 6.088286399841309, + "learning_rate": 6.25e-07, + "loss": 2.293, + "step": 110430 + }, + { + "epoch": 7.503397200706618, + "grad_norm": 9.78136920928955, + "learning_rate": 6.245753499116729e-07, + "loss": 2.9715, + "step": 110435 + }, + { + "epoch": 7.503736920777279, + "grad_norm": 6.062674522399902, + "learning_rate": 6.241506998233457e-07, + "loss": 2.522, + "step": 110440 + }, + { + "epoch": 7.504076640847941, + "grad_norm": 7.403176784515381, + "learning_rate": 6.237260497350183e-07, + "loss": 2.9252, + "step": 110445 + }, + { + "epoch": 7.504416360918603, + "grad_norm": 7.714274883270264, + "learning_rate": 6.233013996466913e-07, + "loss": 2.7357, + "step": 110450 + }, + { + "epoch": 7.504756080989265, + "grad_norm": 6.640329360961914, + "learning_rate": 6.22876749558364e-07, + "loss": 2.749, + "step": 110455 + }, + { + "epoch": 7.505095801059927, + "grad_norm": 8.02706241607666, + "learning_rate": 6.224520994700367e-07, + "loss": 2.2426, + "step": 110460 + }, + { + "epoch": 7.505435521130589, + "grad_norm": 8.274518966674805, + "learning_rate": 6.220274493817096e-07, + "loss": 2.6752, + "step": 110465 + }, + { + "epoch": 7.50577524120125, + "grad_norm": 6.550164699554443, + "learning_rate": 6.216027992933822e-07, + "loss": 2.5709, + "step": 110470 + }, + { + "epoch": 7.506114961271912, + "grad_norm": 6.889023780822754, + "learning_rate": 6.211781492050552e-07, + "loss": 2.8682, + "step": 110475 + }, + { + "epoch": 7.506454681342574, + "grad_norm": 7.691156387329102, + "learning_rate": 6.207534991167278e-07, + "loss": 2.7709, + "step": 110480 + }, + { + "epoch": 7.506794401413235, + "grad_norm": 9.827272415161133, + "learning_rate": 6.203288490284006e-07, + "loss": 2.7243, + "step": 110485 + }, + { + "epoch": 7.507134121483897, + "grad_norm": 8.05014419555664, + "learning_rate": 6.199041989400734e-07, + "loss": 2.6236, + "step": 110490 + }, + { + "epoch": 7.5074738415545585, + "grad_norm": 6.319286346435547, + "learning_rate": 6.194795488517461e-07, + "loss": 2.5503, + "step": 110495 + }, + { + "epoch": 7.507813561625221, + "grad_norm": 7.677103519439697, + "learning_rate": 6.19054898763419e-07, + "loss": 2.6259, + "step": 110500 + }, + { + "epoch": 7.508153281695883, + "grad_norm": 9.039483070373535, + "learning_rate": 6.186302486750917e-07, + "loss": 2.9277, + "step": 110505 + }, + { + "epoch": 7.508493001766544, + "grad_norm": 7.345539093017578, + "learning_rate": 6.182055985867645e-07, + "loss": 2.5856, + "step": 110510 + }, + { + "epoch": 7.508832721837206, + "grad_norm": 9.969281196594238, + "learning_rate": 6.177809484984373e-07, + "loss": 2.7947, + "step": 110515 + }, + { + "epoch": 7.509172441907868, + "grad_norm": 10.095715522766113, + "learning_rate": 6.1735629841011e-07, + "loss": 2.6423, + "step": 110520 + }, + { + "epoch": 7.509512161978529, + "grad_norm": 8.855118751525879, + "learning_rate": 6.16931648321783e-07, + "loss": 2.6772, + "step": 110525 + }, + { + "epoch": 7.509851882049191, + "grad_norm": 8.761573791503906, + "learning_rate": 6.165069982334556e-07, + "loss": 2.8443, + "step": 110530 + }, + { + "epoch": 7.510191602119853, + "grad_norm": 7.366950511932373, + "learning_rate": 6.160823481451284e-07, + "loss": 2.6204, + "step": 110535 + }, + { + "epoch": 7.510531322190515, + "grad_norm": 7.231166839599609, + "learning_rate": 6.156576980568012e-07, + "loss": 2.8503, + "step": 110540 + }, + { + "epoch": 7.510871042261177, + "grad_norm": 9.83340072631836, + "learning_rate": 6.15233047968474e-07, + "loss": 2.7207, + "step": 110545 + }, + { + "epoch": 7.511210762331839, + "grad_norm": 9.442113876342773, + "learning_rate": 6.148083978801468e-07, + "loss": 2.8095, + "step": 110550 + }, + { + "epoch": 7.5115504824025, + "grad_norm": 8.391064643859863, + "learning_rate": 6.143837477918195e-07, + "loss": 2.6061, + "step": 110555 + }, + { + "epoch": 7.511890202473162, + "grad_norm": 6.230015277862549, + "learning_rate": 6.139590977034923e-07, + "loss": 2.8477, + "step": 110560 + }, + { + "epoch": 7.512229922543824, + "grad_norm": 11.023337364196777, + "learning_rate": 6.135344476151651e-07, + "loss": 2.9067, + "step": 110565 + }, + { + "epoch": 7.512569642614485, + "grad_norm": 7.228876113891602, + "learning_rate": 6.131097975268379e-07, + "loss": 2.5955, + "step": 110570 + }, + { + "epoch": 7.512909362685147, + "grad_norm": 9.334137916564941, + "learning_rate": 6.126851474385107e-07, + "loss": 2.6864, + "step": 110575 + }, + { + "epoch": 7.513249082755809, + "grad_norm": 6.165365695953369, + "learning_rate": 6.122604973501834e-07, + "loss": 2.9521, + "step": 110580 + }, + { + "epoch": 7.513588802826471, + "grad_norm": 7.920706272125244, + "learning_rate": 6.118358472618563e-07, + "loss": 2.7344, + "step": 110585 + }, + { + "epoch": 7.513928522897133, + "grad_norm": 8.684767723083496, + "learning_rate": 6.11411197173529e-07, + "loss": 2.7566, + "step": 110590 + }, + { + "epoch": 7.514268242967795, + "grad_norm": 8.748844146728516, + "learning_rate": 6.109865470852018e-07, + "loss": 2.5935, + "step": 110595 + }, + { + "epoch": 7.514607963038456, + "grad_norm": 6.578708171844482, + "learning_rate": 6.105618969968746e-07, + "loss": 2.8559, + "step": 110600 + }, + { + "epoch": 7.514947683109118, + "grad_norm": 8.08964729309082, + "learning_rate": 6.101372469085473e-07, + "loss": 2.7477, + "step": 110605 + }, + { + "epoch": 7.51528740317978, + "grad_norm": 8.8649320602417, + "learning_rate": 6.097125968202202e-07, + "loss": 2.8615, + "step": 110610 + }, + { + "epoch": 7.515627123250441, + "grad_norm": 8.132830619812012, + "learning_rate": 6.092879467318929e-07, + "loss": 2.6726, + "step": 110615 + }, + { + "epoch": 7.515966843321103, + "grad_norm": 8.08113956451416, + "learning_rate": 6.088632966435657e-07, + "loss": 2.6703, + "step": 110620 + }, + { + "epoch": 7.516306563391765, + "grad_norm": 5.9414167404174805, + "learning_rate": 6.084386465552385e-07, + "loss": 2.6341, + "step": 110625 + }, + { + "epoch": 7.516646283462427, + "grad_norm": 6.177631378173828, + "learning_rate": 6.080139964669112e-07, + "loss": 2.7205, + "step": 110630 + }, + { + "epoch": 7.516986003533089, + "grad_norm": 7.985648155212402, + "learning_rate": 6.075893463785841e-07, + "loss": 2.6177, + "step": 110635 + }, + { + "epoch": 7.517325723603751, + "grad_norm": 8.679648399353027, + "learning_rate": 6.071646962902568e-07, + "loss": 2.5641, + "step": 110640 + }, + { + "epoch": 7.517665443674412, + "grad_norm": 9.498720169067383, + "learning_rate": 6.067400462019296e-07, + "loss": 2.6774, + "step": 110645 + }, + { + "epoch": 7.518005163745074, + "grad_norm": 10.859721183776855, + "learning_rate": 6.063153961136024e-07, + "loss": 2.7275, + "step": 110650 + }, + { + "epoch": 7.518344883815736, + "grad_norm": 12.352694511413574, + "learning_rate": 6.058907460252752e-07, + "loss": 2.6821, + "step": 110655 + }, + { + "epoch": 7.518684603886397, + "grad_norm": 6.978072166442871, + "learning_rate": 6.05466095936948e-07, + "loss": 2.6329, + "step": 110660 + }, + { + "epoch": 7.519024323957059, + "grad_norm": 7.764515399932861, + "learning_rate": 6.050414458486207e-07, + "loss": 2.7906, + "step": 110665 + }, + { + "epoch": 7.519364044027721, + "grad_norm": 7.434322834014893, + "learning_rate": 6.046167957602935e-07, + "loss": 2.8467, + "step": 110670 + }, + { + "epoch": 7.519703764098383, + "grad_norm": 6.981851100921631, + "learning_rate": 6.041921456719663e-07, + "loss": 2.5388, + "step": 110675 + }, + { + "epoch": 7.520043484169045, + "grad_norm": 7.882450580596924, + "learning_rate": 6.037674955836391e-07, + "loss": 2.8457, + "step": 110680 + }, + { + "epoch": 7.520383204239707, + "grad_norm": 7.4685587882995605, + "learning_rate": 6.033428454953119e-07, + "loss": 2.7107, + "step": 110685 + }, + { + "epoch": 7.520722924310368, + "grad_norm": 6.9119038581848145, + "learning_rate": 6.029181954069846e-07, + "loss": 2.6525, + "step": 110690 + }, + { + "epoch": 7.52106264438103, + "grad_norm": 8.597212791442871, + "learning_rate": 6.024935453186575e-07, + "loss": 2.7165, + "step": 110695 + }, + { + "epoch": 7.521402364451692, + "grad_norm": 8.089032173156738, + "learning_rate": 6.020688952303302e-07, + "loss": 2.641, + "step": 110700 + }, + { + "epoch": 7.521742084522353, + "grad_norm": 7.305942535400391, + "learning_rate": 6.01644245142003e-07, + "loss": 2.5938, + "step": 110705 + }, + { + "epoch": 7.522081804593015, + "grad_norm": 8.213868141174316, + "learning_rate": 6.012195950536758e-07, + "loss": 3.0954, + "step": 110710 + }, + { + "epoch": 7.522421524663677, + "grad_norm": 8.034557342529297, + "learning_rate": 6.007949449653485e-07, + "loss": 2.9077, + "step": 110715 + }, + { + "epoch": 7.522761244734339, + "grad_norm": 8.47375202178955, + "learning_rate": 6.003702948770214e-07, + "loss": 2.7069, + "step": 110720 + }, + { + "epoch": 7.523100964805001, + "grad_norm": 10.638311386108398, + "learning_rate": 5.999456447886941e-07, + "loss": 2.7833, + "step": 110725 + }, + { + "epoch": 7.523440684875663, + "grad_norm": 8.804049491882324, + "learning_rate": 5.995209947003669e-07, + "loss": 3.0196, + "step": 110730 + }, + { + "epoch": 7.523780404946324, + "grad_norm": 8.137636184692383, + "learning_rate": 5.990963446120397e-07, + "loss": 2.9212, + "step": 110735 + }, + { + "epoch": 7.524120125016986, + "grad_norm": 9.630269050598145, + "learning_rate": 5.986716945237125e-07, + "loss": 2.7752, + "step": 110740 + }, + { + "epoch": 7.524459845087648, + "grad_norm": 6.139490127563477, + "learning_rate": 5.982470444353853e-07, + "loss": 2.5935, + "step": 110745 + }, + { + "epoch": 7.524799565158309, + "grad_norm": 7.5238261222839355, + "learning_rate": 5.97822394347058e-07, + "loss": 2.6823, + "step": 110750 + }, + { + "epoch": 7.525139285228971, + "grad_norm": 8.376795768737793, + "learning_rate": 5.973977442587308e-07, + "loss": 2.8107, + "step": 110755 + }, + { + "epoch": 7.525479005299633, + "grad_norm": 8.855828285217285, + "learning_rate": 5.969730941704036e-07, + "loss": 2.5421, + "step": 110760 + }, + { + "epoch": 7.525818725370295, + "grad_norm": 6.0814690589904785, + "learning_rate": 5.965484440820764e-07, + "loss": 2.7606, + "step": 110765 + }, + { + "epoch": 7.526158445440957, + "grad_norm": 8.290997505187988, + "learning_rate": 5.961237939937492e-07, + "loss": 2.606, + "step": 110770 + }, + { + "epoch": 7.526498165511619, + "grad_norm": 8.344297409057617, + "learning_rate": 5.956991439054219e-07, + "loss": 2.8581, + "step": 110775 + }, + { + "epoch": 7.52683788558228, + "grad_norm": 11.042706489562988, + "learning_rate": 5.952744938170947e-07, + "loss": 2.7673, + "step": 110780 + }, + { + "epoch": 7.527177605652942, + "grad_norm": 8.990596771240234, + "learning_rate": 5.948498437287675e-07, + "loss": 2.6359, + "step": 110785 + }, + { + "epoch": 7.527517325723604, + "grad_norm": 9.15394115447998, + "learning_rate": 5.944251936404403e-07, + "loss": 2.6323, + "step": 110790 + }, + { + "epoch": 7.527857045794265, + "grad_norm": 7.023731708526611, + "learning_rate": 5.940005435521131e-07, + "loss": 2.7382, + "step": 110795 + }, + { + "epoch": 7.528196765864927, + "grad_norm": 6.537796497344971, + "learning_rate": 5.935758934637858e-07, + "loss": 2.8919, + "step": 110800 + }, + { + "epoch": 7.528536485935589, + "grad_norm": 7.279346466064453, + "learning_rate": 5.931512433754586e-07, + "loss": 2.9594, + "step": 110805 + }, + { + "epoch": 7.528876206006251, + "grad_norm": 6.78327751159668, + "learning_rate": 5.927265932871314e-07, + "loss": 2.6474, + "step": 110810 + }, + { + "epoch": 7.529215926076913, + "grad_norm": Infinity, + "learning_rate": 5.923868732164697e-07, + "loss": 2.7515, + "step": 110815 + }, + { + "epoch": 7.529555646147575, + "grad_norm": 8.454920768737793, + "learning_rate": 5.919622231281424e-07, + "loss": 2.5955, + "step": 110820 + }, + { + "epoch": 7.529895366218236, + "grad_norm": 7.191596031188965, + "learning_rate": 5.915375730398153e-07, + "loss": 2.5036, + "step": 110825 + }, + { + "epoch": 7.530235086288898, + "grad_norm": 7.513804912567139, + "learning_rate": 5.91112922951488e-07, + "loss": 2.5581, + "step": 110830 + }, + { + "epoch": 7.53057480635956, + "grad_norm": 6.828058242797852, + "learning_rate": 5.906882728631608e-07, + "loss": 2.7062, + "step": 110835 + }, + { + "epoch": 7.530914526430221, + "grad_norm": 7.099848747253418, + "learning_rate": 5.902636227748336e-07, + "loss": 2.5834, + "step": 110840 + }, + { + "epoch": 7.531254246500883, + "grad_norm": 10.12086009979248, + "learning_rate": 5.898389726865063e-07, + "loss": 2.7661, + "step": 110845 + }, + { + "epoch": 7.5315939665715455, + "grad_norm": 8.355945587158203, + "learning_rate": 5.894143225981792e-07, + "loss": 2.7745, + "step": 110850 + }, + { + "epoch": 7.531933686642207, + "grad_norm": 7.596131801605225, + "learning_rate": 5.889896725098519e-07, + "loss": 2.5802, + "step": 110855 + }, + { + "epoch": 7.532273406712869, + "grad_norm": 9.943354606628418, + "learning_rate": 5.885650224215247e-07, + "loss": 2.585, + "step": 110860 + }, + { + "epoch": 7.532613126783531, + "grad_norm": 7.629568576812744, + "learning_rate": 5.881403723331975e-07, + "loss": 2.6464, + "step": 110865 + }, + { + "epoch": 7.532952846854192, + "grad_norm": 9.26366138458252, + "learning_rate": 5.877157222448702e-07, + "loss": 2.6752, + "step": 110870 + }, + { + "epoch": 7.533292566924854, + "grad_norm": 5.909780502319336, + "learning_rate": 5.872910721565431e-07, + "loss": 2.8007, + "step": 110875 + }, + { + "epoch": 7.533632286995516, + "grad_norm": 7.4300007820129395, + "learning_rate": 5.868664220682158e-07, + "loss": 2.6647, + "step": 110880 + }, + { + "epoch": 7.533972007066177, + "grad_norm": 7.826601505279541, + "learning_rate": 5.864417719798886e-07, + "loss": 2.8006, + "step": 110885 + }, + { + "epoch": 7.534311727136839, + "grad_norm": 7.487787246704102, + "learning_rate": 5.860171218915614e-07, + "loss": 2.8192, + "step": 110890 + }, + { + "epoch": 7.5346514472075015, + "grad_norm": 8.516627311706543, + "learning_rate": 5.855924718032342e-07, + "loss": 3.0977, + "step": 110895 + }, + { + "epoch": 7.534991167278163, + "grad_norm": 7.341084003448486, + "learning_rate": 5.85167821714907e-07, + "loss": 2.6575, + "step": 110900 + }, + { + "epoch": 7.535330887348825, + "grad_norm": 6.820083141326904, + "learning_rate": 5.847431716265797e-07, + "loss": 2.7776, + "step": 110905 + }, + { + "epoch": 7.535670607419487, + "grad_norm": 9.65720272064209, + "learning_rate": 5.843185215382525e-07, + "loss": 2.9616, + "step": 110910 + }, + { + "epoch": 7.536010327490148, + "grad_norm": 7.698943138122559, + "learning_rate": 5.838938714499253e-07, + "loss": 2.9533, + "step": 110915 + }, + { + "epoch": 7.53635004756081, + "grad_norm": 7.5934906005859375, + "learning_rate": 5.834692213615981e-07, + "loss": 2.7194, + "step": 110920 + }, + { + "epoch": 7.536689767631472, + "grad_norm": 8.060555458068848, + "learning_rate": 5.830445712732709e-07, + "loss": 2.5365, + "step": 110925 + }, + { + "epoch": 7.537029487702133, + "grad_norm": 8.798453330993652, + "learning_rate": 5.826199211849436e-07, + "loss": 2.6979, + "step": 110930 + }, + { + "epoch": 7.537369207772795, + "grad_norm": 5.707899570465088, + "learning_rate": 5.821952710966165e-07, + "loss": 2.9242, + "step": 110935 + }, + { + "epoch": 7.5377089278434575, + "grad_norm": 6.933856010437012, + "learning_rate": 5.817706210082892e-07, + "loss": 2.8872, + "step": 110940 + }, + { + "epoch": 7.538048647914119, + "grad_norm": 8.254125595092773, + "learning_rate": 5.81345970919962e-07, + "loss": 2.8736, + "step": 110945 + }, + { + "epoch": 7.538388367984781, + "grad_norm": 8.488239288330078, + "learning_rate": 5.809213208316348e-07, + "loss": 2.6363, + "step": 110950 + }, + { + "epoch": 7.538728088055443, + "grad_norm": 10.752299308776855, + "learning_rate": 5.804966707433075e-07, + "loss": 2.7737, + "step": 110955 + }, + { + "epoch": 7.539067808126104, + "grad_norm": 6.504798412322998, + "learning_rate": 5.800720206549804e-07, + "loss": 2.5564, + "step": 110960 + }, + { + "epoch": 7.539407528196766, + "grad_norm": 7.857079982757568, + "learning_rate": 5.796473705666531e-07, + "loss": 2.6423, + "step": 110965 + }, + { + "epoch": 7.539747248267427, + "grad_norm": 7.505609035491943, + "learning_rate": 5.792227204783259e-07, + "loss": 2.5233, + "step": 110970 + }, + { + "epoch": 7.540086968338089, + "grad_norm": 6.918153285980225, + "learning_rate": 5.787980703899987e-07, + "loss": 2.7082, + "step": 110975 + }, + { + "epoch": 7.540426688408751, + "grad_norm": 8.605635643005371, + "learning_rate": 5.783734203016714e-07, + "loss": 2.5344, + "step": 110980 + }, + { + "epoch": 7.540766408479413, + "grad_norm": 6.70853853225708, + "learning_rate": 5.779487702133443e-07, + "loss": 2.7059, + "step": 110985 + }, + { + "epoch": 7.541106128550075, + "grad_norm": 7.0924553871154785, + "learning_rate": 5.77524120125017e-07, + "loss": 2.7287, + "step": 110990 + }, + { + "epoch": 7.541445848620737, + "grad_norm": 7.2031025886535645, + "learning_rate": 5.770994700366898e-07, + "loss": 2.6224, + "step": 110995 + }, + { + "epoch": 7.541785568691398, + "grad_norm": 9.13940715789795, + "learning_rate": 5.766748199483626e-07, + "loss": 2.8922, + "step": 111000 + }, + { + "epoch": 7.54212528876206, + "grad_norm": 7.868760585784912, + "learning_rate": 5.762501698600354e-07, + "loss": 2.7509, + "step": 111005 + }, + { + "epoch": 7.542465008832722, + "grad_norm": 7.105201721191406, + "learning_rate": 5.758255197717082e-07, + "loss": 2.8585, + "step": 111010 + }, + { + "epoch": 7.542804728903383, + "grad_norm": 7.388810157775879, + "learning_rate": 5.754008696833809e-07, + "loss": 2.6581, + "step": 111015 + }, + { + "epoch": 7.543144448974045, + "grad_norm": 6.618013381958008, + "learning_rate": 5.749762195950537e-07, + "loss": 2.7987, + "step": 111020 + }, + { + "epoch": 7.543484169044707, + "grad_norm": 8.268585205078125, + "learning_rate": 5.745515695067265e-07, + "loss": 2.7551, + "step": 111025 + }, + { + "epoch": 7.543823889115369, + "grad_norm": 8.067215919494629, + "learning_rate": 5.741269194183993e-07, + "loss": 2.534, + "step": 111030 + }, + { + "epoch": 7.544163609186031, + "grad_norm": 8.580101013183594, + "learning_rate": 5.737022693300721e-07, + "loss": 2.72, + "step": 111035 + }, + { + "epoch": 7.544503329256693, + "grad_norm": 8.810538291931152, + "learning_rate": 5.732776192417448e-07, + "loss": 2.7365, + "step": 111040 + }, + { + "epoch": 7.544843049327354, + "grad_norm": 5.2294440269470215, + "learning_rate": 5.728529691534177e-07, + "loss": 2.4731, + "step": 111045 + }, + { + "epoch": 7.545182769398016, + "grad_norm": 7.0440897941589355, + "learning_rate": 5.724283190650904e-07, + "loss": 2.9516, + "step": 111050 + }, + { + "epoch": 7.545522489468678, + "grad_norm": 9.150626182556152, + "learning_rate": 5.720036689767632e-07, + "loss": 2.8847, + "step": 111055 + }, + { + "epoch": 7.545862209539339, + "grad_norm": 7.428769111633301, + "learning_rate": 5.71579018888436e-07, + "loss": 2.4372, + "step": 111060 + }, + { + "epoch": 7.546201929610001, + "grad_norm": 8.393723487854004, + "learning_rate": 5.711543688001087e-07, + "loss": 2.5966, + "step": 111065 + }, + { + "epoch": 7.546541649680663, + "grad_norm": 7.581338882446289, + "learning_rate": 5.707297187117816e-07, + "loss": 2.7077, + "step": 111070 + }, + { + "epoch": 7.546881369751325, + "grad_norm": 8.124147415161133, + "learning_rate": 5.703050686234543e-07, + "loss": 2.8405, + "step": 111075 + }, + { + "epoch": 7.547221089821987, + "grad_norm": 7.17965030670166, + "learning_rate": 5.698804185351271e-07, + "loss": 2.7036, + "step": 111080 + }, + { + "epoch": 7.547560809892649, + "grad_norm": 9.447061538696289, + "learning_rate": 5.694557684467999e-07, + "loss": 2.6618, + "step": 111085 + }, + { + "epoch": 7.54790052996331, + "grad_norm": 7.7200846672058105, + "learning_rate": 5.690311183584726e-07, + "loss": 2.8957, + "step": 111090 + }, + { + "epoch": 7.548240250033972, + "grad_norm": 7.569753170013428, + "learning_rate": 5.686064682701455e-07, + "loss": 2.717, + "step": 111095 + }, + { + "epoch": 7.548579970104634, + "grad_norm": 8.193463325500488, + "learning_rate": 5.681818181818182e-07, + "loss": 2.8031, + "step": 111100 + }, + { + "epoch": 7.548919690175295, + "grad_norm": 9.253642082214355, + "learning_rate": 5.67757168093491e-07, + "loss": 2.7944, + "step": 111105 + }, + { + "epoch": 7.549259410245957, + "grad_norm": 7.865592956542969, + "learning_rate": 5.673325180051638e-07, + "loss": 2.5685, + "step": 111110 + }, + { + "epoch": 7.5495991303166194, + "grad_norm": 8.438749313354492, + "learning_rate": 5.669078679168366e-07, + "loss": 2.5073, + "step": 111115 + }, + { + "epoch": 7.549938850387281, + "grad_norm": 8.11707878112793, + "learning_rate": 5.664832178285094e-07, + "loss": 2.9293, + "step": 111120 + }, + { + "epoch": 7.550278570457943, + "grad_norm": 8.33649730682373, + "learning_rate": 5.660585677401821e-07, + "loss": 2.7793, + "step": 111125 + }, + { + "epoch": 7.550618290528605, + "grad_norm": 6.429049491882324, + "learning_rate": 5.656339176518549e-07, + "loss": 2.6195, + "step": 111130 + }, + { + "epoch": 7.550958010599266, + "grad_norm": 9.492036819458008, + "learning_rate": 5.652092675635277e-07, + "loss": 2.5831, + "step": 111135 + }, + { + "epoch": 7.551297730669928, + "grad_norm": 8.28927993774414, + "learning_rate": 5.647846174752005e-07, + "loss": 2.6309, + "step": 111140 + }, + { + "epoch": 7.55163745074059, + "grad_norm": 6.5711236000061035, + "learning_rate": 5.643599673868733e-07, + "loss": 2.5597, + "step": 111145 + }, + { + "epoch": 7.551977170811251, + "grad_norm": 7.530154705047607, + "learning_rate": 5.63935317298546e-07, + "loss": 2.8359, + "step": 111150 + }, + { + "epoch": 7.552316890881913, + "grad_norm": 8.029460906982422, + "learning_rate": 5.635106672102189e-07, + "loss": 2.7946, + "step": 111155 + }, + { + "epoch": 7.5526566109525755, + "grad_norm": 7.0175251960754395, + "learning_rate": 5.630860171218916e-07, + "loss": 2.6554, + "step": 111160 + }, + { + "epoch": 7.552996331023237, + "grad_norm": 6.368805885314941, + "learning_rate": 5.626613670335644e-07, + "loss": 2.6852, + "step": 111165 + }, + { + "epoch": 7.553336051093899, + "grad_norm": 9.45346736907959, + "learning_rate": 5.622367169452372e-07, + "loss": 2.7429, + "step": 111170 + }, + { + "epoch": 7.55367577116456, + "grad_norm": 6.265377044677734, + "learning_rate": 5.618120668569099e-07, + "loss": 2.5321, + "step": 111175 + }, + { + "epoch": 7.554015491235222, + "grad_norm": 7.449191570281982, + "learning_rate": 5.613874167685828e-07, + "loss": 2.6558, + "step": 111180 + }, + { + "epoch": 7.554355211305884, + "grad_norm": 7.495030403137207, + "learning_rate": 5.609627666802555e-07, + "loss": 2.7053, + "step": 111185 + }, + { + "epoch": 7.554694931376545, + "grad_norm": 8.267243385314941, + "learning_rate": 5.605381165919283e-07, + "loss": 2.7033, + "step": 111190 + }, + { + "epoch": 7.555034651447207, + "grad_norm": 11.603331565856934, + "learning_rate": 5.601134665036011e-07, + "loss": 2.5754, + "step": 111195 + }, + { + "epoch": 7.555374371517869, + "grad_norm": 8.41443920135498, + "learning_rate": 5.596888164152739e-07, + "loss": 2.8402, + "step": 111200 + }, + { + "epoch": 7.555714091588531, + "grad_norm": 6.805974006652832, + "learning_rate": 5.592641663269467e-07, + "loss": 2.8146, + "step": 111205 + }, + { + "epoch": 7.556053811659193, + "grad_norm": 9.674696922302246, + "learning_rate": 5.588395162386194e-07, + "loss": 2.7524, + "step": 111210 + }, + { + "epoch": 7.556393531729855, + "grad_norm": 10.388762474060059, + "learning_rate": 5.584148661502922e-07, + "loss": 2.5525, + "step": 111215 + }, + { + "epoch": 7.556733251800516, + "grad_norm": 7.368160247802734, + "learning_rate": 5.57990216061965e-07, + "loss": 2.5212, + "step": 111220 + }, + { + "epoch": 7.557072971871178, + "grad_norm": 8.540329933166504, + "learning_rate": 5.575655659736378e-07, + "loss": 2.5347, + "step": 111225 + }, + { + "epoch": 7.55741269194184, + "grad_norm": 8.226265907287598, + "learning_rate": 5.571409158853106e-07, + "loss": 2.6656, + "step": 111230 + }, + { + "epoch": 7.557752412012501, + "grad_norm": 8.68885612487793, + "learning_rate": 5.567162657969833e-07, + "loss": 2.5496, + "step": 111235 + }, + { + "epoch": 7.558092132083163, + "grad_norm": 7.672744274139404, + "learning_rate": 5.562916157086561e-07, + "loss": 2.7503, + "step": 111240 + }, + { + "epoch": 7.558431852153825, + "grad_norm": 13.09041690826416, + "learning_rate": 5.558669656203289e-07, + "loss": 2.8328, + "step": 111245 + }, + { + "epoch": 7.558771572224487, + "grad_norm": 7.635196685791016, + "learning_rate": 5.554423155320017e-07, + "loss": 2.901, + "step": 111250 + }, + { + "epoch": 7.559111292295149, + "grad_norm": 7.458548069000244, + "learning_rate": 5.550176654436745e-07, + "loss": 2.6951, + "step": 111255 + }, + { + "epoch": 7.559451012365811, + "grad_norm": 7.725655555725098, + "learning_rate": 5.545930153553472e-07, + "loss": 2.7315, + "step": 111260 + }, + { + "epoch": 7.559790732436472, + "grad_norm": 7.245729923248291, + "learning_rate": 5.5416836526702e-07, + "loss": 2.7611, + "step": 111265 + }, + { + "epoch": 7.560130452507134, + "grad_norm": 7.970195293426514, + "learning_rate": 5.537437151786928e-07, + "loss": 2.6297, + "step": 111270 + }, + { + "epoch": 7.560470172577796, + "grad_norm": 7.444393634796143, + "learning_rate": 5.533190650903656e-07, + "loss": 2.8295, + "step": 111275 + }, + { + "epoch": 7.560809892648457, + "grad_norm": 6.350983619689941, + "learning_rate": 5.528944150020384e-07, + "loss": 2.8735, + "step": 111280 + }, + { + "epoch": 7.561149612719119, + "grad_norm": 7.2531208992004395, + "learning_rate": 5.524697649137112e-07, + "loss": 2.559, + "step": 111285 + }, + { + "epoch": 7.561489332789781, + "grad_norm": 9.015109062194824, + "learning_rate": 5.52045114825384e-07, + "loss": 2.8279, + "step": 111290 + }, + { + "epoch": 7.561829052860443, + "grad_norm": 7.672097206115723, + "learning_rate": 5.516204647370567e-07, + "loss": 2.6842, + "step": 111295 + }, + { + "epoch": 7.562168772931105, + "grad_norm": 8.043976783752441, + "learning_rate": 5.511958146487295e-07, + "loss": 3.0129, + "step": 111300 + }, + { + "epoch": 7.562508493001767, + "grad_norm": 7.676620006561279, + "learning_rate": 5.507711645604023e-07, + "loss": 2.7869, + "step": 111305 + }, + { + "epoch": 7.562848213072428, + "grad_norm": 7.136316299438477, + "learning_rate": 5.503465144720751e-07, + "loss": 2.8955, + "step": 111310 + }, + { + "epoch": 7.56318793314309, + "grad_norm": 8.180030822753906, + "learning_rate": 5.499218643837479e-07, + "loss": 2.6942, + "step": 111315 + }, + { + "epoch": 7.563527653213752, + "grad_norm": 8.65311336517334, + "learning_rate": 5.494972142954206e-07, + "loss": 2.7421, + "step": 111320 + }, + { + "epoch": 7.563867373284413, + "grad_norm": 9.151578903198242, + "learning_rate": 5.490725642070934e-07, + "loss": 2.6418, + "step": 111325 + }, + { + "epoch": 7.564207093355075, + "grad_norm": 7.773746967315674, + "learning_rate": 5.486479141187662e-07, + "loss": 2.6934, + "step": 111330 + }, + { + "epoch": 7.564546813425737, + "grad_norm": 8.106117248535156, + "learning_rate": 5.48223264030439e-07, + "loss": 2.6024, + "step": 111335 + }, + { + "epoch": 7.564886533496399, + "grad_norm": 8.226522445678711, + "learning_rate": 5.477986139421118e-07, + "loss": 2.6848, + "step": 111340 + }, + { + "epoch": 7.565226253567061, + "grad_norm": 6.922102928161621, + "learning_rate": 5.473739638537845e-07, + "loss": 2.7213, + "step": 111345 + }, + { + "epoch": 7.565565973637723, + "grad_norm": 8.054496765136719, + "learning_rate": 5.469493137654573e-07, + "loss": 2.7887, + "step": 111350 + }, + { + "epoch": 7.565905693708384, + "grad_norm": 7.159262657165527, + "learning_rate": 5.465246636771301e-07, + "loss": 2.6225, + "step": 111355 + }, + { + "epoch": 7.566245413779046, + "grad_norm": 8.169861793518066, + "learning_rate": 5.461000135888029e-07, + "loss": 2.7854, + "step": 111360 + }, + { + "epoch": 7.566585133849708, + "grad_norm": 7.38322639465332, + "learning_rate": 5.456753635004757e-07, + "loss": 2.7166, + "step": 111365 + }, + { + "epoch": 7.566924853920369, + "grad_norm": 8.974421501159668, + "learning_rate": 5.452507134121484e-07, + "loss": 2.9219, + "step": 111370 + }, + { + "epoch": 7.567264573991031, + "grad_norm": 9.081103324890137, + "learning_rate": 5.448260633238212e-07, + "loss": 2.8269, + "step": 111375 + }, + { + "epoch": 7.567604294061693, + "grad_norm": 9.332342147827148, + "learning_rate": 5.44401413235494e-07, + "loss": 2.705, + "step": 111380 + }, + { + "epoch": 7.567944014132355, + "grad_norm": 7.043005466461182, + "learning_rate": 5.439767631471668e-07, + "loss": 2.7466, + "step": 111385 + }, + { + "epoch": 7.568283734203017, + "grad_norm": 7.374774932861328, + "learning_rate": 5.435521130588396e-07, + "loss": 2.7551, + "step": 111390 + }, + { + "epoch": 7.568623454273679, + "grad_norm": 7.4252190589904785, + "learning_rate": 5.431274629705124e-07, + "loss": 2.7203, + "step": 111395 + }, + { + "epoch": 7.56896317434434, + "grad_norm": 7.067168235778809, + "learning_rate": 5.427028128821852e-07, + "loss": 2.9227, + "step": 111400 + }, + { + "epoch": 7.569302894415002, + "grad_norm": 7.388291358947754, + "learning_rate": 5.422781627938579e-07, + "loss": 2.7184, + "step": 111405 + }, + { + "epoch": 7.569642614485664, + "grad_norm": 6.424842834472656, + "learning_rate": 5.418535127055307e-07, + "loss": 2.7735, + "step": 111410 + }, + { + "epoch": 7.569982334556325, + "grad_norm": 7.903252124786377, + "learning_rate": 5.414288626172035e-07, + "loss": 2.3971, + "step": 111415 + }, + { + "epoch": 7.570322054626987, + "grad_norm": 7.017987251281738, + "learning_rate": 5.410042125288763e-07, + "loss": 2.5159, + "step": 111420 + }, + { + "epoch": 7.5706617746976494, + "grad_norm": 7.39009952545166, + "learning_rate": 5.405795624405491e-07, + "loss": 2.879, + "step": 111425 + }, + { + "epoch": 7.571001494768311, + "grad_norm": 8.060490608215332, + "learning_rate": 5.401549123522218e-07, + "loss": 2.8672, + "step": 111430 + }, + { + "epoch": 7.571341214838973, + "grad_norm": 10.845553398132324, + "learning_rate": 5.397302622638946e-07, + "loss": 2.5417, + "step": 111435 + }, + { + "epoch": 7.571680934909635, + "grad_norm": 6.437387466430664, + "learning_rate": 5.393056121755674e-07, + "loss": 2.8506, + "step": 111440 + }, + { + "epoch": 7.572020654980296, + "grad_norm": 7.229641437530518, + "learning_rate": 5.388809620872402e-07, + "loss": 2.9726, + "step": 111445 + }, + { + "epoch": 7.572360375050958, + "grad_norm": 7.618025302886963, + "learning_rate": 5.38456311998913e-07, + "loss": 2.5685, + "step": 111450 + }, + { + "epoch": 7.57270009512162, + "grad_norm": 7.066426753997803, + "learning_rate": 5.380316619105857e-07, + "loss": 2.8625, + "step": 111455 + }, + { + "epoch": 7.573039815192281, + "grad_norm": 7.640340328216553, + "learning_rate": 5.376070118222585e-07, + "loss": 2.5978, + "step": 111460 + }, + { + "epoch": 7.573379535262943, + "grad_norm": 10.000248908996582, + "learning_rate": 5.371823617339313e-07, + "loss": 2.6461, + "step": 111465 + }, + { + "epoch": 7.5737192553336055, + "grad_norm": 7.411088466644287, + "learning_rate": 5.367577116456041e-07, + "loss": 2.8079, + "step": 111470 + }, + { + "epoch": 7.574058975404267, + "grad_norm": 8.710107803344727, + "learning_rate": 5.363330615572769e-07, + "loss": 2.6502, + "step": 111475 + }, + { + "epoch": 7.574398695474929, + "grad_norm": 7.419076442718506, + "learning_rate": 5.359084114689497e-07, + "loss": 2.9162, + "step": 111480 + }, + { + "epoch": 7.574738415545591, + "grad_norm": 6.035788059234619, + "learning_rate": 5.354837613806224e-07, + "loss": 2.5623, + "step": 111485 + }, + { + "epoch": 7.575078135616252, + "grad_norm": 6.993513584136963, + "learning_rate": 5.350591112922952e-07, + "loss": 2.898, + "step": 111490 + }, + { + "epoch": 7.575417855686914, + "grad_norm": 9.021159172058105, + "learning_rate": 5.34634461203968e-07, + "loss": 2.8218, + "step": 111495 + }, + { + "epoch": 7.575757575757576, + "grad_norm": 7.519659519195557, + "learning_rate": 5.342098111156408e-07, + "loss": 2.6737, + "step": 111500 + }, + { + "epoch": 7.576097295828237, + "grad_norm": 7.692965984344482, + "learning_rate": 5.337851610273136e-07, + "loss": 2.6498, + "step": 111505 + }, + { + "epoch": 7.576437015898899, + "grad_norm": 9.264719009399414, + "learning_rate": 5.333605109389863e-07, + "loss": 2.8682, + "step": 111510 + }, + { + "epoch": 7.5767767359695615, + "grad_norm": 7.225400447845459, + "learning_rate": 5.329358608506591e-07, + "loss": 2.6052, + "step": 111515 + }, + { + "epoch": 7.577116456040223, + "grad_norm": 7.982194423675537, + "learning_rate": 5.325112107623319e-07, + "loss": 2.7532, + "step": 111520 + }, + { + "epoch": 7.577456176110885, + "grad_norm": 7.925765514373779, + "learning_rate": 5.320865606740047e-07, + "loss": 2.7764, + "step": 111525 + }, + { + "epoch": 7.577795896181547, + "grad_norm": 9.130925178527832, + "learning_rate": 5.316619105856775e-07, + "loss": 2.7789, + "step": 111530 + }, + { + "epoch": 7.578135616252208, + "grad_norm": 9.09439468383789, + "learning_rate": 5.312372604973503e-07, + "loss": 2.7528, + "step": 111535 + }, + { + "epoch": 7.57847533632287, + "grad_norm": 7.46720027923584, + "learning_rate": 5.30812610409023e-07, + "loss": 2.4559, + "step": 111540 + }, + { + "epoch": 7.578815056393532, + "grad_norm": 7.949573040008545, + "learning_rate": 5.303879603206958e-07, + "loss": 2.5732, + "step": 111545 + }, + { + "epoch": 7.579154776464193, + "grad_norm": 5.95432710647583, + "learning_rate": 5.299633102323686e-07, + "loss": 2.6143, + "step": 111550 + }, + { + "epoch": 7.579494496534855, + "grad_norm": 8.815731048583984, + "learning_rate": 5.295386601440414e-07, + "loss": 2.5267, + "step": 111555 + }, + { + "epoch": 7.5798342166055175, + "grad_norm": 7.247824668884277, + "learning_rate": 5.291140100557142e-07, + "loss": 2.6028, + "step": 111560 + }, + { + "epoch": 7.580173936676179, + "grad_norm": 9.113094329833984, + "learning_rate": 5.28689359967387e-07, + "loss": 2.7698, + "step": 111565 + }, + { + "epoch": 7.580513656746841, + "grad_norm": 7.635958671569824, + "learning_rate": 5.282647098790597e-07, + "loss": 2.3456, + "step": 111570 + }, + { + "epoch": 7.580853376817503, + "grad_norm": 8.286458969116211, + "learning_rate": 5.278400597907325e-07, + "loss": 2.6896, + "step": 111575 + }, + { + "epoch": 7.581193096888164, + "grad_norm": 7.734936714172363, + "learning_rate": 5.274154097024053e-07, + "loss": 2.544, + "step": 111580 + }, + { + "epoch": 7.581532816958826, + "grad_norm": 7.304090976715088, + "learning_rate": 5.269907596140781e-07, + "loss": 2.6227, + "step": 111585 + }, + { + "epoch": 7.581872537029488, + "grad_norm": 6.745754241943359, + "learning_rate": 5.265661095257509e-07, + "loss": 2.8728, + "step": 111590 + }, + { + "epoch": 7.582212257100149, + "grad_norm": 7.282619953155518, + "learning_rate": 5.261414594374236e-07, + "loss": 2.8173, + "step": 111595 + }, + { + "epoch": 7.582551977170811, + "grad_norm": 8.62187671661377, + "learning_rate": 5.257168093490964e-07, + "loss": 2.7343, + "step": 111600 + }, + { + "epoch": 7.5828916972414735, + "grad_norm": 6.740376949310303, + "learning_rate": 5.252921592607692e-07, + "loss": 2.9737, + "step": 111605 + }, + { + "epoch": 7.583231417312135, + "grad_norm": 8.335614204406738, + "learning_rate": 5.24867509172442e-07, + "loss": 2.7289, + "step": 111610 + }, + { + "epoch": 7.583571137382797, + "grad_norm": 9.740936279296875, + "learning_rate": 5.244428590841148e-07, + "loss": 2.858, + "step": 111615 + }, + { + "epoch": 7.583910857453459, + "grad_norm": 9.000419616699219, + "learning_rate": 5.240182089957875e-07, + "loss": 2.7185, + "step": 111620 + }, + { + "epoch": 7.58425057752412, + "grad_norm": 7.7445597648620605, + "learning_rate": 5.235935589074603e-07, + "loss": 2.7253, + "step": 111625 + }, + { + "epoch": 7.584590297594782, + "grad_norm": 7.859252452850342, + "learning_rate": 5.231689088191331e-07, + "loss": 2.7937, + "step": 111630 + }, + { + "epoch": 7.584930017665444, + "grad_norm": 8.586748123168945, + "learning_rate": 5.227442587308059e-07, + "loss": 2.794, + "step": 111635 + }, + { + "epoch": 7.585269737736105, + "grad_norm": 10.266010284423828, + "learning_rate": 5.223196086424787e-07, + "loss": 2.517, + "step": 111640 + }, + { + "epoch": 7.585609457806767, + "grad_norm": 7.750977993011475, + "learning_rate": 5.218949585541515e-07, + "loss": 2.7316, + "step": 111645 + }, + { + "epoch": 7.585949177877429, + "grad_norm": 8.647531509399414, + "learning_rate": 5.214703084658243e-07, + "loss": 2.4658, + "step": 111650 + }, + { + "epoch": 7.586288897948091, + "grad_norm": 9.61570930480957, + "learning_rate": 5.21045658377497e-07, + "loss": 2.763, + "step": 111655 + }, + { + "epoch": 7.586628618018753, + "grad_norm": 9.042078971862793, + "learning_rate": 5.206210082891698e-07, + "loss": 2.9071, + "step": 111660 + }, + { + "epoch": 7.586968338089414, + "grad_norm": 9.208327293395996, + "learning_rate": 5.201963582008426e-07, + "loss": 2.5968, + "step": 111665 + }, + { + "epoch": 7.587308058160076, + "grad_norm": 8.866157531738281, + "learning_rate": 5.197717081125154e-07, + "loss": 2.4889, + "step": 111670 + }, + { + "epoch": 7.587647778230738, + "grad_norm": 7.329458713531494, + "learning_rate": 5.193470580241882e-07, + "loss": 2.8402, + "step": 111675 + }, + { + "epoch": 7.587987498301399, + "grad_norm": 6.5094380378723145, + "learning_rate": 5.189224079358609e-07, + "loss": 2.5808, + "step": 111680 + }, + { + "epoch": 7.588327218372061, + "grad_norm": 8.78499984741211, + "learning_rate": 5.184977578475337e-07, + "loss": 2.6499, + "step": 111685 + }, + { + "epoch": 7.588666938442723, + "grad_norm": 10.347463607788086, + "learning_rate": 5.180731077592065e-07, + "loss": 2.7187, + "step": 111690 + }, + { + "epoch": 7.589006658513385, + "grad_norm": 9.221776008605957, + "learning_rate": 5.176484576708793e-07, + "loss": 2.9313, + "step": 111695 + }, + { + "epoch": 7.589346378584047, + "grad_norm": 8.765559196472168, + "learning_rate": 5.172238075825521e-07, + "loss": 2.5605, + "step": 111700 + }, + { + "epoch": 7.589686098654709, + "grad_norm": 7.170686721801758, + "learning_rate": 5.167991574942248e-07, + "loss": 2.7292, + "step": 111705 + }, + { + "epoch": 7.59002581872537, + "grad_norm": 6.848781585693359, + "learning_rate": 5.163745074058976e-07, + "loss": 2.7521, + "step": 111710 + }, + { + "epoch": 7.590365538796032, + "grad_norm": 9.30940055847168, + "learning_rate": 5.159498573175704e-07, + "loss": 2.6842, + "step": 111715 + }, + { + "epoch": 7.590705258866694, + "grad_norm": 6.906902313232422, + "learning_rate": 5.155252072292432e-07, + "loss": 2.9847, + "step": 111720 + }, + { + "epoch": 7.591044978937355, + "grad_norm": 7.303408622741699, + "learning_rate": 5.15100557140916e-07, + "loss": 2.6221, + "step": 111725 + }, + { + "epoch": 7.591384699008017, + "grad_norm": 9.51347827911377, + "learning_rate": 5.146759070525887e-07, + "loss": 2.5524, + "step": 111730 + }, + { + "epoch": 7.5917244190786795, + "grad_norm": 7.58659029006958, + "learning_rate": 5.142512569642616e-07, + "loss": 2.7424, + "step": 111735 + }, + { + "epoch": 7.592064139149341, + "grad_norm": 7.31355619430542, + "learning_rate": 5.138266068759343e-07, + "loss": 2.8334, + "step": 111740 + }, + { + "epoch": 7.592403859220003, + "grad_norm": 7.988280296325684, + "learning_rate": 5.134019567876071e-07, + "loss": 2.6299, + "step": 111745 + }, + { + "epoch": 7.592743579290665, + "grad_norm": 10.431501388549805, + "learning_rate": 5.129773066992799e-07, + "loss": 2.614, + "step": 111750 + }, + { + "epoch": 7.593083299361326, + "grad_norm": 7.963977336883545, + "learning_rate": 5.125526566109527e-07, + "loss": 2.8292, + "step": 111755 + }, + { + "epoch": 7.593423019431988, + "grad_norm": 8.080605506896973, + "learning_rate": 5.121280065226255e-07, + "loss": 2.6378, + "step": 111760 + }, + { + "epoch": 7.59376273950265, + "grad_norm": 7.219750881195068, + "learning_rate": 5.117033564342982e-07, + "loss": 2.7535, + "step": 111765 + }, + { + "epoch": 7.594102459573311, + "grad_norm": 7.498917579650879, + "learning_rate": 5.11278706345971e-07, + "loss": 2.7338, + "step": 111770 + }, + { + "epoch": 7.594442179643973, + "grad_norm": 7.841579914093018, + "learning_rate": 5.108540562576438e-07, + "loss": 2.6658, + "step": 111775 + }, + { + "epoch": 7.5947818997146355, + "grad_norm": 7.300500869750977, + "learning_rate": 5.104294061693166e-07, + "loss": 2.6915, + "step": 111780 + }, + { + "epoch": 7.595121619785297, + "grad_norm": 9.351295471191406, + "learning_rate": 5.100047560809894e-07, + "loss": 2.6398, + "step": 111785 + }, + { + "epoch": 7.595461339855959, + "grad_norm": 8.100830078125, + "learning_rate": 5.09580105992662e-07, + "loss": 2.5629, + "step": 111790 + }, + { + "epoch": 7.595801059926621, + "grad_norm": 9.2692232131958, + "learning_rate": 5.091554559043349e-07, + "loss": 2.6405, + "step": 111795 + }, + { + "epoch": 7.596140779997282, + "grad_norm": 7.5054168701171875, + "learning_rate": 5.087308058160077e-07, + "loss": 2.7764, + "step": 111800 + }, + { + "epoch": 7.596480500067944, + "grad_norm": 9.408394813537598, + "learning_rate": 5.083061557276805e-07, + "loss": 2.7496, + "step": 111805 + }, + { + "epoch": 7.596820220138606, + "grad_norm": 7.656729698181152, + "learning_rate": 5.078815056393533e-07, + "loss": 2.7067, + "step": 111810 + }, + { + "epoch": 7.597159940209267, + "grad_norm": 7.125969886779785, + "learning_rate": 5.07456855551026e-07, + "loss": 2.5297, + "step": 111815 + }, + { + "epoch": 7.597499660279929, + "grad_norm": 6.7759809494018555, + "learning_rate": 5.070322054626989e-07, + "loss": 2.6097, + "step": 111820 + }, + { + "epoch": 7.5978393803505915, + "grad_norm": 8.459890365600586, + "learning_rate": 5.066075553743715e-07, + "loss": 2.7354, + "step": 111825 + }, + { + "epoch": 7.598179100421253, + "grad_norm": 9.596139907836914, + "learning_rate": 5.061829052860443e-07, + "loss": 2.7617, + "step": 111830 + }, + { + "epoch": 7.598518820491915, + "grad_norm": 7.26711893081665, + "learning_rate": 5.057582551977172e-07, + "loss": 2.8479, + "step": 111835 + }, + { + "epoch": 7.598858540562577, + "grad_norm": 6.979665279388428, + "learning_rate": 5.053336051093898e-07, + "loss": 2.7619, + "step": 111840 + }, + { + "epoch": 7.599198260633238, + "grad_norm": 6.780611038208008, + "learning_rate": 5.049089550210628e-07, + "loss": 2.5389, + "step": 111845 + }, + { + "epoch": 7.5995379807039, + "grad_norm": 8.618825912475586, + "learning_rate": 5.044843049327354e-07, + "loss": 2.4273, + "step": 111850 + }, + { + "epoch": 7.599877700774561, + "grad_norm": 8.723342895507812, + "learning_rate": 5.040596548444082e-07, + "loss": 2.7856, + "step": 111855 + }, + { + "epoch": 7.600217420845223, + "grad_norm": 9.22637939453125, + "learning_rate": 5.03635004756081e-07, + "loss": 2.7065, + "step": 111860 + }, + { + "epoch": 7.600557140915885, + "grad_norm": 9.334600448608398, + "learning_rate": 5.032103546677537e-07, + "loss": 2.716, + "step": 111865 + }, + { + "epoch": 7.600896860986547, + "grad_norm": 6.780307292938232, + "learning_rate": 5.027857045794266e-07, + "loss": 2.7309, + "step": 111870 + }, + { + "epoch": 7.601236581057209, + "grad_norm": 6.385853290557861, + "learning_rate": 5.023610544910993e-07, + "loss": 2.4375, + "step": 111875 + }, + { + "epoch": 7.601576301127871, + "grad_norm": 7.014157295227051, + "learning_rate": 5.019364044027721e-07, + "loss": 2.734, + "step": 111880 + }, + { + "epoch": 7.601916021198532, + "grad_norm": 8.034055709838867, + "learning_rate": 5.015117543144449e-07, + "loss": 2.6522, + "step": 111885 + }, + { + "epoch": 7.602255741269194, + "grad_norm": 9.178336143493652, + "learning_rate": 5.010871042261177e-07, + "loss": 2.8276, + "step": 111890 + }, + { + "epoch": 7.602595461339856, + "grad_norm": 7.4931840896606445, + "learning_rate": 5.006624541377905e-07, + "loss": 2.66, + "step": 111895 + }, + { + "epoch": 7.602935181410517, + "grad_norm": 7.113871097564697, + "learning_rate": 5.002378040494632e-07, + "loss": 2.6532, + "step": 111900 + }, + { + "epoch": 7.603274901481179, + "grad_norm": 7.383458137512207, + "learning_rate": 4.998131539611361e-07, + "loss": 2.5116, + "step": 111905 + }, + { + "epoch": 7.603614621551841, + "grad_norm": 7.147042751312256, + "learning_rate": 4.993885038728088e-07, + "loss": 2.7502, + "step": 111910 + }, + { + "epoch": 7.603954341622503, + "grad_norm": 8.704236030578613, + "learning_rate": 4.989638537844816e-07, + "loss": 2.7559, + "step": 111915 + }, + { + "epoch": 7.604294061693165, + "grad_norm": 9.01180648803711, + "learning_rate": 4.985392036961544e-07, + "loss": 2.7178, + "step": 111920 + }, + { + "epoch": 7.604633781763827, + "grad_norm": 8.680317878723145, + "learning_rate": 4.981145536078271e-07, + "loss": 2.6552, + "step": 111925 + }, + { + "epoch": 7.604973501834488, + "grad_norm": 7.6645660400390625, + "learning_rate": 4.976899035195e-07, + "loss": 2.475, + "step": 111930 + }, + { + "epoch": 7.60531322190515, + "grad_norm": 8.340302467346191, + "learning_rate": 4.972652534311727e-07, + "loss": 2.7345, + "step": 111935 + }, + { + "epoch": 7.605652941975812, + "grad_norm": 8.942254066467285, + "learning_rate": 4.968406033428455e-07, + "loss": 2.9837, + "step": 111940 + }, + { + "epoch": 7.605992662046473, + "grad_norm": 7.980987548828125, + "learning_rate": 4.964159532545183e-07, + "loss": 2.8584, + "step": 111945 + }, + { + "epoch": 7.606332382117135, + "grad_norm": 6.81416130065918, + "learning_rate": 4.95991303166191e-07, + "loss": 2.7674, + "step": 111950 + }, + { + "epoch": 7.606672102187797, + "grad_norm": 7.099669456481934, + "learning_rate": 4.955666530778639e-07, + "loss": 2.7198, + "step": 111955 + }, + { + "epoch": 7.607011822258459, + "grad_norm": 6.233026027679443, + "learning_rate": 4.951420029895366e-07, + "loss": 2.4795, + "step": 111960 + }, + { + "epoch": 7.607351542329121, + "grad_norm": 7.37167501449585, + "learning_rate": 4.947173529012094e-07, + "loss": 2.7583, + "step": 111965 + }, + { + "epoch": 7.607691262399783, + "grad_norm": 7.730803966522217, + "learning_rate": 4.942927028128822e-07, + "loss": 2.8372, + "step": 111970 + }, + { + "epoch": 7.608030982470444, + "grad_norm": 13.06959056854248, + "learning_rate": 4.938680527245549e-07, + "loss": 2.817, + "step": 111975 + }, + { + "epoch": 7.608370702541106, + "grad_norm": 7.056042671203613, + "learning_rate": 4.934434026362278e-07, + "loss": 2.5991, + "step": 111980 + }, + { + "epoch": 7.608710422611768, + "grad_norm": 9.176434516906738, + "learning_rate": 4.930187525479005e-07, + "loss": 2.6349, + "step": 111985 + }, + { + "epoch": 7.609050142682429, + "grad_norm": 6.798896789550781, + "learning_rate": 4.925941024595733e-07, + "loss": 2.7419, + "step": 111990 + }, + { + "epoch": 7.609389862753091, + "grad_norm": 6.679201126098633, + "learning_rate": 4.921694523712461e-07, + "loss": 2.936, + "step": 111995 + }, + { + "epoch": 7.6097295828237534, + "grad_norm": 8.407207489013672, + "learning_rate": 4.917448022829189e-07, + "loss": 2.6952, + "step": 112000 + }, + { + "epoch": 7.610069302894415, + "grad_norm": 6.569107532501221, + "learning_rate": 4.913201521945917e-07, + "loss": 2.6431, + "step": 112005 + }, + { + "epoch": 7.610409022965077, + "grad_norm": 7.207936763763428, + "learning_rate": 4.908955021062644e-07, + "loss": 2.7175, + "step": 112010 + }, + { + "epoch": 7.610748743035739, + "grad_norm": 7.652537822723389, + "learning_rate": 4.904708520179372e-07, + "loss": 2.323, + "step": 112015 + }, + { + "epoch": 7.6110884631064, + "grad_norm": 7.938170909881592, + "learning_rate": 4.9004620192961e-07, + "loss": 2.7082, + "step": 112020 + }, + { + "epoch": 7.611428183177062, + "grad_norm": 6.069085121154785, + "learning_rate": 4.896215518412828e-07, + "loss": 2.8073, + "step": 112025 + }, + { + "epoch": 7.611767903247724, + "grad_norm": 7.40568208694458, + "learning_rate": 4.891969017529556e-07, + "loss": 2.7951, + "step": 112030 + }, + { + "epoch": 7.612107623318385, + "grad_norm": 7.850821018218994, + "learning_rate": 4.887722516646283e-07, + "loss": 2.8519, + "step": 112035 + }, + { + "epoch": 7.612447343389047, + "grad_norm": 8.902141571044922, + "learning_rate": 4.883476015763012e-07, + "loss": 2.4935, + "step": 112040 + }, + { + "epoch": 7.6127870634597095, + "grad_norm": 7.025256156921387, + "learning_rate": 4.879229514879739e-07, + "loss": 2.4906, + "step": 112045 + }, + { + "epoch": 7.613126783530371, + "grad_norm": 6.748612403869629, + "learning_rate": 4.874983013996467e-07, + "loss": 2.7109, + "step": 112050 + }, + { + "epoch": 7.613466503601033, + "grad_norm": 8.275442123413086, + "learning_rate": 4.870736513113195e-07, + "loss": 2.7776, + "step": 112055 + }, + { + "epoch": 7.613806223671695, + "grad_norm": 7.92497444152832, + "learning_rate": 4.866490012229922e-07, + "loss": 2.5377, + "step": 112060 + }, + { + "epoch": 7.614145943742356, + "grad_norm": 7.400636196136475, + "learning_rate": 4.862243511346651e-07, + "loss": 2.5743, + "step": 112065 + }, + { + "epoch": 7.614485663813018, + "grad_norm": 8.259757041931152, + "learning_rate": 4.857997010463378e-07, + "loss": 2.5904, + "step": 112070 + }, + { + "epoch": 7.61482538388368, + "grad_norm": 7.727540016174316, + "learning_rate": 4.853750509580106e-07, + "loss": 2.657, + "step": 112075 + }, + { + "epoch": 7.615165103954341, + "grad_norm": 8.56795883178711, + "learning_rate": 4.849504008696834e-07, + "loss": 2.6891, + "step": 112080 + }, + { + "epoch": 7.615504824025003, + "grad_norm": 9.133292198181152, + "learning_rate": 4.845257507813561e-07, + "loss": 2.7592, + "step": 112085 + }, + { + "epoch": 7.6158445440956655, + "grad_norm": 9.364363670349121, + "learning_rate": 4.84101100693029e-07, + "loss": 2.7769, + "step": 112090 + }, + { + "epoch": 7.616184264166327, + "grad_norm": 7.211044788360596, + "learning_rate": 4.836764506047017e-07, + "loss": 2.7959, + "step": 112095 + }, + { + "epoch": 7.616523984236989, + "grad_norm": 7.8136091232299805, + "learning_rate": 4.832518005163745e-07, + "loss": 2.8273, + "step": 112100 + }, + { + "epoch": 7.616863704307651, + "grad_norm": 8.022795677185059, + "learning_rate": 4.828271504280473e-07, + "loss": 2.8183, + "step": 112105 + }, + { + "epoch": 7.617203424378312, + "grad_norm": 6.854950904846191, + "learning_rate": 4.824025003397201e-07, + "loss": 2.6485, + "step": 112110 + }, + { + "epoch": 7.617543144448974, + "grad_norm": 6.540565013885498, + "learning_rate": 4.819778502513929e-07, + "loss": 2.7047, + "step": 112115 + }, + { + "epoch": 7.617882864519636, + "grad_norm": 7.072689056396484, + "learning_rate": 4.815532001630656e-07, + "loss": 2.72, + "step": 112120 + }, + { + "epoch": 7.618222584590297, + "grad_norm": 6.98837423324585, + "learning_rate": 4.811285500747384e-07, + "loss": 2.9456, + "step": 112125 + }, + { + "epoch": 7.618562304660959, + "grad_norm": 8.731954574584961, + "learning_rate": 4.807038999864112e-07, + "loss": 2.6708, + "step": 112130 + }, + { + "epoch": 7.6189020247316215, + "grad_norm": 8.649048805236816, + "learning_rate": 4.80279249898084e-07, + "loss": 2.8871, + "step": 112135 + }, + { + "epoch": 7.619241744802283, + "grad_norm": 9.040225982666016, + "learning_rate": 4.798545998097568e-07, + "loss": 2.6242, + "step": 112140 + }, + { + "epoch": 7.619581464872945, + "grad_norm": 8.535425186157227, + "learning_rate": 4.794299497214295e-07, + "loss": 2.7602, + "step": 112145 + }, + { + "epoch": 7.619921184943607, + "grad_norm": 8.659055709838867, + "learning_rate": 4.790052996331024e-07, + "loss": 2.5479, + "step": 112150 + }, + { + "epoch": 7.620260905014268, + "grad_norm": 9.997694969177246, + "learning_rate": 4.785806495447751e-07, + "loss": 2.9099, + "step": 112155 + }, + { + "epoch": 7.62060062508493, + "grad_norm": 7.938199520111084, + "learning_rate": 4.781559994564479e-07, + "loss": 2.8546, + "step": 112160 + }, + { + "epoch": 7.620940345155592, + "grad_norm": 8.3782958984375, + "learning_rate": 4.777313493681207e-07, + "loss": 2.7434, + "step": 112165 + }, + { + "epoch": 7.621280065226253, + "grad_norm": 7.425605773925781, + "learning_rate": 4.773066992797934e-07, + "loss": 2.6467, + "step": 112170 + }, + { + "epoch": 7.621619785296915, + "grad_norm": 13.006743431091309, + "learning_rate": 4.768820491914663e-07, + "loss": 2.9067, + "step": 112175 + }, + { + "epoch": 7.6219595053675775, + "grad_norm": 10.49013614654541, + "learning_rate": 4.7645739910313903e-07, + "loss": 2.5344, + "step": 112180 + }, + { + "epoch": 7.622299225438239, + "grad_norm": 6.2577409744262695, + "learning_rate": 4.760327490148119e-07, + "loss": 2.8305, + "step": 112185 + }, + { + "epoch": 7.622638945508901, + "grad_norm": 7.874784469604492, + "learning_rate": 4.7560809892648463e-07, + "loss": 2.8327, + "step": 112190 + }, + { + "epoch": 7.622978665579563, + "grad_norm": 8.18346118927002, + "learning_rate": 4.751834488381574e-07, + "loss": 2.5107, + "step": 112195 + }, + { + "epoch": 7.623318385650224, + "grad_norm": 6.7767109870910645, + "learning_rate": 4.747587987498302e-07, + "loss": 2.662, + "step": 112200 + }, + { + "epoch": 7.623658105720886, + "grad_norm": 8.195930480957031, + "learning_rate": 4.743341486615029e-07, + "loss": 2.5751, + "step": 112205 + }, + { + "epoch": 7.623997825791548, + "grad_norm": 8.147151947021484, + "learning_rate": 4.739094985731758e-07, + "loss": 2.6294, + "step": 112210 + }, + { + "epoch": 7.624337545862209, + "grad_norm": 8.452248573303223, + "learning_rate": 4.7348484848484853e-07, + "loss": 2.9596, + "step": 112215 + }, + { + "epoch": 7.624677265932871, + "grad_norm": 9.254471778869629, + "learning_rate": 4.730601983965213e-07, + "loss": 2.6923, + "step": 112220 + }, + { + "epoch": 7.6250169860035335, + "grad_norm": 7.118449687957764, + "learning_rate": 4.726355483081941e-07, + "loss": 2.6343, + "step": 112225 + }, + { + "epoch": 7.625356706074195, + "grad_norm": 8.77448558807373, + "learning_rate": 4.722108982198668e-07, + "loss": 2.8091, + "step": 112230 + }, + { + "epoch": 7.625696426144857, + "grad_norm": 7.494492053985596, + "learning_rate": 4.717862481315397e-07, + "loss": 2.9004, + "step": 112235 + }, + { + "epoch": 7.626036146215519, + "grad_norm": 7.685522079467773, + "learning_rate": 4.713615980432124e-07, + "loss": 2.8609, + "step": 112240 + }, + { + "epoch": 7.62637586628618, + "grad_norm": 7.088893413543701, + "learning_rate": 4.709369479548852e-07, + "loss": 2.7672, + "step": 112245 + }, + { + "epoch": 7.626715586356842, + "grad_norm": 7.997490882873535, + "learning_rate": 4.7051229786655797e-07, + "loss": 2.7248, + "step": 112250 + }, + { + "epoch": 7.627055306427504, + "grad_norm": 8.97079086303711, + "learning_rate": 4.700876477782308e-07, + "loss": 2.5107, + "step": 112255 + }, + { + "epoch": 7.627395026498165, + "grad_norm": 6.262259483337402, + "learning_rate": 4.696629976899036e-07, + "loss": 2.7146, + "step": 112260 + }, + { + "epoch": 7.627734746568827, + "grad_norm": 10.33644962310791, + "learning_rate": 4.692383476015763e-07, + "loss": 2.8078, + "step": 112265 + }, + { + "epoch": 7.6280744666394895, + "grad_norm": 8.759045600891113, + "learning_rate": 4.688136975132491e-07, + "loss": 2.7261, + "step": 112270 + }, + { + "epoch": 7.628414186710151, + "grad_norm": 6.950026988983154, + "learning_rate": 4.683890474249219e-07, + "loss": 2.508, + "step": 112275 + }, + { + "epoch": 7.628753906780813, + "grad_norm": 7.709418296813965, + "learning_rate": 4.6796439733659467e-07, + "loss": 2.7575, + "step": 112280 + }, + { + "epoch": 7.629093626851475, + "grad_norm": 8.321752548217773, + "learning_rate": 4.6753974724826747e-07, + "loss": 2.8924, + "step": 112285 + }, + { + "epoch": 7.629433346922136, + "grad_norm": 7.866026401519775, + "learning_rate": 4.671150971599402e-07, + "loss": 2.4615, + "step": 112290 + }, + { + "epoch": 7.629773066992798, + "grad_norm": 9.719677925109863, + "learning_rate": 4.666904470716131e-07, + "loss": 2.7047, + "step": 112295 + }, + { + "epoch": 7.63011278706346, + "grad_norm": 7.550126075744629, + "learning_rate": 4.662657969832858e-07, + "loss": 2.7137, + "step": 112300 + }, + { + "epoch": 7.630452507134121, + "grad_norm": 7.926876544952393, + "learning_rate": 4.6584114689495857e-07, + "loss": 2.6136, + "step": 112305 + }, + { + "epoch": 7.6307922272047835, + "grad_norm": 12.006509780883789, + "learning_rate": 4.6541649680663137e-07, + "loss": 2.7958, + "step": 112310 + }, + { + "epoch": 7.6311319472754455, + "grad_norm": 8.963730812072754, + "learning_rate": 4.649918467183041e-07, + "loss": 2.6818, + "step": 112315 + }, + { + "epoch": 7.631471667346107, + "grad_norm": 6.912195205688477, + "learning_rate": 4.6456719662997697e-07, + "loss": 2.7883, + "step": 112320 + }, + { + "epoch": 7.631811387416769, + "grad_norm": 7.225232124328613, + "learning_rate": 4.641425465416497e-07, + "loss": 2.5609, + "step": 112325 + }, + { + "epoch": 7.63215110748743, + "grad_norm": 8.01986026763916, + "learning_rate": 4.637178964533225e-07, + "loss": 2.8479, + "step": 112330 + }, + { + "epoch": 7.632490827558092, + "grad_norm": 8.328531265258789, + "learning_rate": 4.6329324636499527e-07, + "loss": 2.6606, + "step": 112335 + }, + { + "epoch": 7.632830547628754, + "grad_norm": 7.991931438446045, + "learning_rate": 4.62868596276668e-07, + "loss": 2.6706, + "step": 112340 + }, + { + "epoch": 7.633170267699415, + "grad_norm": 8.68513011932373, + "learning_rate": 4.6244394618834087e-07, + "loss": 2.7414, + "step": 112345 + }, + { + "epoch": 7.633509987770077, + "grad_norm": 6.730503082275391, + "learning_rate": 4.620192961000136e-07, + "loss": 2.7626, + "step": 112350 + }, + { + "epoch": 7.6338497078407395, + "grad_norm": 7.595159530639648, + "learning_rate": 4.615946460116864e-07, + "loss": 2.6067, + "step": 112355 + }, + { + "epoch": 7.634189427911401, + "grad_norm": 9.366409301757812, + "learning_rate": 4.6116999592335917e-07, + "loss": 2.8099, + "step": 112360 + }, + { + "epoch": 7.634529147982063, + "grad_norm": 7.315995216369629, + "learning_rate": 4.6074534583503197e-07, + "loss": 2.5867, + "step": 112365 + }, + { + "epoch": 7.634868868052725, + "grad_norm": 6.790494918823242, + "learning_rate": 4.6032069574670477e-07, + "loss": 2.8304, + "step": 112370 + }, + { + "epoch": 7.635208588123386, + "grad_norm": 6.768820285797119, + "learning_rate": 4.598960456583775e-07, + "loss": 2.6541, + "step": 112375 + }, + { + "epoch": 7.635548308194048, + "grad_norm": 7.46260404586792, + "learning_rate": 4.594713955700503e-07, + "loss": 2.8958, + "step": 112380 + }, + { + "epoch": 7.63588802826471, + "grad_norm": 12.297042846679688, + "learning_rate": 4.590467454817231e-07, + "loss": 2.6517, + "step": 112385 + }, + { + "epoch": 7.636227748335371, + "grad_norm": 7.829619407653809, + "learning_rate": 4.5862209539339587e-07, + "loss": 2.6085, + "step": 112390 + }, + { + "epoch": 7.636567468406033, + "grad_norm": 7.726072311401367, + "learning_rate": 4.5819744530506867e-07, + "loss": 2.7714, + "step": 112395 + }, + { + "epoch": 7.6369071884766955, + "grad_norm": 8.16832447052002, + "learning_rate": 4.577727952167414e-07, + "loss": 2.6048, + "step": 112400 + }, + { + "epoch": 7.637246908547357, + "grad_norm": 9.568802833557129, + "learning_rate": 4.5734814512841427e-07, + "loss": 3.0043, + "step": 112405 + }, + { + "epoch": 7.637586628618019, + "grad_norm": 8.028766632080078, + "learning_rate": 4.56923495040087e-07, + "loss": 2.8421, + "step": 112410 + }, + { + "epoch": 7.637926348688681, + "grad_norm": 8.101367950439453, + "learning_rate": 4.564988449517598e-07, + "loss": 2.5735, + "step": 112415 + }, + { + "epoch": 7.638266068759342, + "grad_norm": 7.740286350250244, + "learning_rate": 4.5607419486343256e-07, + "loss": 2.7532, + "step": 112420 + }, + { + "epoch": 7.638605788830004, + "grad_norm": 8.069902420043945, + "learning_rate": 4.556495447751053e-07, + "loss": 2.7609, + "step": 112425 + }, + { + "epoch": 7.638945508900666, + "grad_norm": 7.737462043762207, + "learning_rate": 4.5522489468677817e-07, + "loss": 2.748, + "step": 112430 + }, + { + "epoch": 7.639285228971327, + "grad_norm": 7.025164604187012, + "learning_rate": 4.548002445984509e-07, + "loss": 2.6071, + "step": 112435 + }, + { + "epoch": 7.639624949041989, + "grad_norm": 7.612030029296875, + "learning_rate": 4.543755945101237e-07, + "loss": 2.7047, + "step": 112440 + }, + { + "epoch": 7.6399646691126515, + "grad_norm": 7.966617107391357, + "learning_rate": 4.5395094442179646e-07, + "loss": 2.7816, + "step": 112445 + }, + { + "epoch": 7.640304389183313, + "grad_norm": 10.730411529541016, + "learning_rate": 4.535262943334692e-07, + "loss": 2.7452, + "step": 112450 + }, + { + "epoch": 7.640644109253975, + "grad_norm": 7.720860004425049, + "learning_rate": 4.5310164424514206e-07, + "loss": 2.5995, + "step": 112455 + }, + { + "epoch": 7.640983829324637, + "grad_norm": 8.226264953613281, + "learning_rate": 4.526769941568148e-07, + "loss": 2.5985, + "step": 112460 + }, + { + "epoch": 7.641323549395298, + "grad_norm": 9.805551528930664, + "learning_rate": 4.522523440684876e-07, + "loss": 2.6483, + "step": 112465 + }, + { + "epoch": 7.64166326946596, + "grad_norm": 7.637010097503662, + "learning_rate": 4.5182769398016036e-07, + "loss": 2.6512, + "step": 112470 + }, + { + "epoch": 7.642002989536622, + "grad_norm": 7.277883529663086, + "learning_rate": 4.514030438918331e-07, + "loss": 2.7, + "step": 112475 + }, + { + "epoch": 7.642342709607283, + "grad_norm": 8.027174949645996, + "learning_rate": 4.5097839380350596e-07, + "loss": 2.583, + "step": 112480 + }, + { + "epoch": 7.642682429677945, + "grad_norm": 8.51908016204834, + "learning_rate": 4.505537437151787e-07, + "loss": 2.858, + "step": 112485 + }, + { + "epoch": 7.6430221497486075, + "grad_norm": 9.304633140563965, + "learning_rate": 4.501290936268515e-07, + "loss": 2.3502, + "step": 112490 + }, + { + "epoch": 7.643361869819269, + "grad_norm": 8.233780860900879, + "learning_rate": 4.497044435385243e-07, + "loss": 2.5466, + "step": 112495 + }, + { + "epoch": 7.643701589889931, + "grad_norm": 6.5243730545043945, + "learning_rate": 4.492797934501971e-07, + "loss": 2.8124, + "step": 112500 + }, + { + "epoch": 7.644041309960593, + "grad_norm": 7.116155624389648, + "learning_rate": 4.4885514336186986e-07, + "loss": 2.7412, + "step": 112505 + }, + { + "epoch": 7.644381030031254, + "grad_norm": 6.72390079498291, + "learning_rate": 4.484304932735426e-07, + "loss": 2.6771, + "step": 112510 + }, + { + "epoch": 7.644720750101916, + "grad_norm": 6.513645172119141, + "learning_rate": 4.4800584318521546e-07, + "loss": 2.7832, + "step": 112515 + }, + { + "epoch": 7.645060470172578, + "grad_norm": 9.243252754211426, + "learning_rate": 4.475811930968882e-07, + "loss": 2.6898, + "step": 112520 + }, + { + "epoch": 7.645400190243239, + "grad_norm": 9.278289794921875, + "learning_rate": 4.47156543008561e-07, + "loss": 2.755, + "step": 112525 + }, + { + "epoch": 7.645739910313901, + "grad_norm": 8.057369232177734, + "learning_rate": 4.4673189292023376e-07, + "loss": 2.7203, + "step": 112530 + }, + { + "epoch": 7.6460796303845635, + "grad_norm": 6.450955867767334, + "learning_rate": 4.463072428319065e-07, + "loss": 2.8202, + "step": 112535 + }, + { + "epoch": 7.646419350455225, + "grad_norm": 6.96385383605957, + "learning_rate": 4.4588259274357936e-07, + "loss": 2.5164, + "step": 112540 + }, + { + "epoch": 7.646759070525887, + "grad_norm": 9.861350059509277, + "learning_rate": 4.454579426552521e-07, + "loss": 2.834, + "step": 112545 + }, + { + "epoch": 7.647098790596548, + "grad_norm": 7.761515140533447, + "learning_rate": 4.450332925669249e-07, + "loss": 2.9414, + "step": 112550 + }, + { + "epoch": 7.64743851066721, + "grad_norm": 8.653602600097656, + "learning_rate": 4.4460864247859766e-07, + "loss": 2.5523, + "step": 112555 + }, + { + "epoch": 7.647778230737872, + "grad_norm": 6.332945823669434, + "learning_rate": 4.441839923902704e-07, + "loss": 2.7624, + "step": 112560 + }, + { + "epoch": 7.648117950808533, + "grad_norm": 10.318912506103516, + "learning_rate": 4.4375934230194326e-07, + "loss": 2.4381, + "step": 112565 + }, + { + "epoch": 7.648457670879195, + "grad_norm": 7.503247261047363, + "learning_rate": 4.43334692213616e-07, + "loss": 2.8543, + "step": 112570 + }, + { + "epoch": 7.6487973909498574, + "grad_norm": 8.31501579284668, + "learning_rate": 4.429100421252888e-07, + "loss": 2.7395, + "step": 112575 + }, + { + "epoch": 7.649137111020519, + "grad_norm": 9.915860176086426, + "learning_rate": 4.4248539203696155e-07, + "loss": 2.8501, + "step": 112580 + }, + { + "epoch": 7.649476831091181, + "grad_norm": 7.5353312492370605, + "learning_rate": 4.420607419486344e-07, + "loss": 2.4872, + "step": 112585 + }, + { + "epoch": 7.649816551161843, + "grad_norm": 8.942098617553711, + "learning_rate": 4.4163609186030716e-07, + "loss": 2.7748, + "step": 112590 + }, + { + "epoch": 7.650156271232504, + "grad_norm": 6.8038225173950195, + "learning_rate": 4.412114417719799e-07, + "loss": 2.8975, + "step": 112595 + }, + { + "epoch": 7.650495991303166, + "grad_norm": 7.8059306144714355, + "learning_rate": 4.407867916836527e-07, + "loss": 2.6535, + "step": 112600 + }, + { + "epoch": 7.650835711373828, + "grad_norm": 9.359060287475586, + "learning_rate": 4.4036214159532545e-07, + "loss": 2.7315, + "step": 112605 + }, + { + "epoch": 7.651175431444489, + "grad_norm": 6.2330851554870605, + "learning_rate": 4.399374915069983e-07, + "loss": 2.4117, + "step": 112610 + }, + { + "epoch": 7.651515151515151, + "grad_norm": 8.218414306640625, + "learning_rate": 4.3951284141867105e-07, + "loss": 2.8248, + "step": 112615 + }, + { + "epoch": 7.6518548715858135, + "grad_norm": 7.181587219238281, + "learning_rate": 4.390881913303438e-07, + "loss": 2.4592, + "step": 112620 + }, + { + "epoch": 7.652194591656475, + "grad_norm": 8.997601509094238, + "learning_rate": 4.386635412420166e-07, + "loss": 2.7912, + "step": 112625 + }, + { + "epoch": 7.652534311727137, + "grad_norm": 7.762347221374512, + "learning_rate": 4.382388911536894e-07, + "loss": 2.8089, + "step": 112630 + }, + { + "epoch": 7.652874031797799, + "grad_norm": 8.481740951538086, + "learning_rate": 4.378142410653622e-07, + "loss": 2.562, + "step": 112635 + }, + { + "epoch": 7.65321375186846, + "grad_norm": 7.71415901184082, + "learning_rate": 4.3738959097703495e-07, + "loss": 2.8557, + "step": 112640 + }, + { + "epoch": 7.653553471939122, + "grad_norm": 7.503321170806885, + "learning_rate": 4.369649408887077e-07, + "loss": 2.6367, + "step": 112645 + }, + { + "epoch": 7.653893192009784, + "grad_norm": 8.6915864944458, + "learning_rate": 4.3654029080038055e-07, + "loss": 2.6561, + "step": 112650 + }, + { + "epoch": 7.654232912080445, + "grad_norm": 8.637556076049805, + "learning_rate": 4.361156407120533e-07, + "loss": 2.7775, + "step": 112655 + }, + { + "epoch": 7.654572632151107, + "grad_norm": 9.887212753295898, + "learning_rate": 4.356909906237261e-07, + "loss": 2.68, + "step": 112660 + }, + { + "epoch": 7.6549123522217695, + "grad_norm": 10.8507719039917, + "learning_rate": 4.3526634053539885e-07, + "loss": 2.6429, + "step": 112665 + }, + { + "epoch": 7.655252072292431, + "grad_norm": 7.6906609535217285, + "learning_rate": 4.348416904470717e-07, + "loss": 2.3824, + "step": 112670 + }, + { + "epoch": 7.655591792363093, + "grad_norm": 8.93661880493164, + "learning_rate": 4.3441704035874445e-07, + "loss": 2.9727, + "step": 112675 + }, + { + "epoch": 7.655931512433755, + "grad_norm": 9.284189224243164, + "learning_rate": 4.339923902704172e-07, + "loss": 2.7879, + "step": 112680 + }, + { + "epoch": 7.656271232504416, + "grad_norm": 9.278068542480469, + "learning_rate": 4.3356774018209e-07, + "loss": 2.6737, + "step": 112685 + }, + { + "epoch": 7.656610952575078, + "grad_norm": 7.833571434020996, + "learning_rate": 4.3314309009376275e-07, + "loss": 2.7318, + "step": 112690 + }, + { + "epoch": 7.65695067264574, + "grad_norm": 9.314237594604492, + "learning_rate": 4.327184400054356e-07, + "loss": 2.6654, + "step": 112695 + }, + { + "epoch": 7.657290392716401, + "grad_norm": 7.3200507164001465, + "learning_rate": 4.3229378991710835e-07, + "loss": 2.4251, + "step": 112700 + }, + { + "epoch": 7.657630112787063, + "grad_norm": 8.963197708129883, + "learning_rate": 4.318691398287811e-07, + "loss": 2.586, + "step": 112705 + }, + { + "epoch": 7.6579698328577255, + "grad_norm": 7.687807559967041, + "learning_rate": 4.314444897404539e-07, + "loss": 2.6119, + "step": 112710 + }, + { + "epoch": 7.658309552928387, + "grad_norm": 9.184627532958984, + "learning_rate": 4.3101983965212665e-07, + "loss": 2.9383, + "step": 112715 + }, + { + "epoch": 7.658649272999049, + "grad_norm": 9.330338478088379, + "learning_rate": 4.305951895637995e-07, + "loss": 2.9186, + "step": 112720 + }, + { + "epoch": 7.658988993069711, + "grad_norm": 9.207608222961426, + "learning_rate": 4.3017053947547225e-07, + "loss": 2.5025, + "step": 112725 + }, + { + "epoch": 7.659328713140372, + "grad_norm": 7.995729446411133, + "learning_rate": 4.29745889387145e-07, + "loss": 2.6128, + "step": 112730 + }, + { + "epoch": 7.659668433211034, + "grad_norm": 8.052952766418457, + "learning_rate": 4.293212392988178e-07, + "loss": 2.6976, + "step": 112735 + }, + { + "epoch": 7.660008153281696, + "grad_norm": 7.546281337738037, + "learning_rate": 4.288965892104906e-07, + "loss": 2.9107, + "step": 112740 + }, + { + "epoch": 7.660347873352357, + "grad_norm": 7.119200706481934, + "learning_rate": 4.284719391221634e-07, + "loss": 2.9014, + "step": 112745 + }, + { + "epoch": 7.660687593423019, + "grad_norm": 8.195159912109375, + "learning_rate": 4.2804728903383614e-07, + "loss": 2.7643, + "step": 112750 + }, + { + "epoch": 7.6610273134936815, + "grad_norm": 7.924691677093506, + "learning_rate": 4.2762263894550895e-07, + "loss": 2.8252, + "step": 112755 + }, + { + "epoch": 7.661367033564343, + "grad_norm": 7.359403133392334, + "learning_rate": 4.2719798885718175e-07, + "loss": 2.6734, + "step": 112760 + }, + { + "epoch": 7.661706753635005, + "grad_norm": 7.743636131286621, + "learning_rate": 4.267733387688545e-07, + "loss": 2.6148, + "step": 112765 + }, + { + "epoch": 7.662046473705667, + "grad_norm": 7.323086738586426, + "learning_rate": 4.263486886805273e-07, + "loss": 2.67, + "step": 112770 + }, + { + "epoch": 7.662386193776328, + "grad_norm": 6.617745399475098, + "learning_rate": 4.2592403859220004e-07, + "loss": 2.7626, + "step": 112775 + }, + { + "epoch": 7.66272591384699, + "grad_norm": 8.656390190124512, + "learning_rate": 4.254993885038729e-07, + "loss": 2.5486, + "step": 112780 + }, + { + "epoch": 7.663065633917652, + "grad_norm": 7.135673522949219, + "learning_rate": 4.2507473841554564e-07, + "loss": 2.5889, + "step": 112785 + }, + { + "epoch": 7.663405353988313, + "grad_norm": 6.546563625335693, + "learning_rate": 4.246500883272184e-07, + "loss": 2.5813, + "step": 112790 + }, + { + "epoch": 7.663745074058975, + "grad_norm": 7.320182800292969, + "learning_rate": 4.242254382388912e-07, + "loss": 2.7309, + "step": 112795 + }, + { + "epoch": 7.6640847941296375, + "grad_norm": 6.931706428527832, + "learning_rate": 4.2380078815056394e-07, + "loss": 2.8686, + "step": 112800 + }, + { + "epoch": 7.664424514200299, + "grad_norm": 7.4613189697265625, + "learning_rate": 4.233761380622368e-07, + "loss": 2.949, + "step": 112805 + }, + { + "epoch": 7.664764234270961, + "grad_norm": 8.110925674438477, + "learning_rate": 4.2295148797390954e-07, + "loss": 2.4481, + "step": 112810 + }, + { + "epoch": 7.665103954341623, + "grad_norm": 8.642160415649414, + "learning_rate": 4.225268378855823e-07, + "loss": 2.7156, + "step": 112815 + }, + { + "epoch": 7.665443674412284, + "grad_norm": 8.908197402954102, + "learning_rate": 4.221021877972551e-07, + "loss": 2.735, + "step": 112820 + }, + { + "epoch": 7.665783394482946, + "grad_norm": 7.861430644989014, + "learning_rate": 4.2167753770892784e-07, + "loss": 2.6279, + "step": 112825 + }, + { + "epoch": 7.666123114553608, + "grad_norm": 7.263138294219971, + "learning_rate": 4.212528876206007e-07, + "loss": 2.247, + "step": 112830 + }, + { + "epoch": 7.666462834624269, + "grad_norm": 6.710785388946533, + "learning_rate": 4.2082823753227344e-07, + "loss": 2.7913, + "step": 112835 + }, + { + "epoch": 7.666802554694931, + "grad_norm": 7.768241882324219, + "learning_rate": 4.2040358744394624e-07, + "loss": 2.8523, + "step": 112840 + }, + { + "epoch": 7.6671422747655935, + "grad_norm": 7.366542339324951, + "learning_rate": 4.19978937355619e-07, + "loss": 2.6411, + "step": 112845 + }, + { + "epoch": 7.667481994836255, + "grad_norm": 7.987002849578857, + "learning_rate": 4.1955428726729174e-07, + "loss": 2.6253, + "step": 112850 + }, + { + "epoch": 7.667821714906917, + "grad_norm": 7.492487907409668, + "learning_rate": 4.191296371789646e-07, + "loss": 2.492, + "step": 112855 + }, + { + "epoch": 7.668161434977579, + "grad_norm": 8.747038841247559, + "learning_rate": 4.1870498709063734e-07, + "loss": 2.8649, + "step": 112860 + }, + { + "epoch": 7.66850115504824, + "grad_norm": 8.762309074401855, + "learning_rate": 4.1828033700231014e-07, + "loss": 2.6691, + "step": 112865 + }, + { + "epoch": 7.668840875118902, + "grad_norm": 10.382695198059082, + "learning_rate": 4.1785568691398294e-07, + "loss": 2.6052, + "step": 112870 + }, + { + "epoch": 7.669180595189564, + "grad_norm": 9.336195945739746, + "learning_rate": 4.174310368256557e-07, + "loss": 2.8232, + "step": 112875 + }, + { + "epoch": 7.669520315260225, + "grad_norm": 8.6975736618042, + "learning_rate": 4.170913167549939e-07, + "loss": 2.6195, + "step": 112880 + }, + { + "epoch": 7.6698600353308874, + "grad_norm": 9.433987617492676, + "learning_rate": 4.1666666666666667e-07, + "loss": 2.7593, + "step": 112885 + }, + { + "epoch": 7.6701997554015495, + "grad_norm": 6.683697700500488, + "learning_rate": 4.162420165783395e-07, + "loss": 2.8475, + "step": 112890 + }, + { + "epoch": 7.670539475472211, + "grad_norm": 7.982096195220947, + "learning_rate": 4.1581736649001227e-07, + "loss": 2.6706, + "step": 112895 + }, + { + "epoch": 7.670879195542873, + "grad_norm": 7.262607097625732, + "learning_rate": 4.1539271640168507e-07, + "loss": 2.6337, + "step": 112900 + }, + { + "epoch": 7.671218915613535, + "grad_norm": 7.473952770233154, + "learning_rate": 4.149680663133578e-07, + "loss": 2.4924, + "step": 112905 + }, + { + "epoch": 7.671558635684196, + "grad_norm": 8.030779838562012, + "learning_rate": 4.1454341622503056e-07, + "loss": 2.5177, + "step": 112910 + }, + { + "epoch": 7.671898355754858, + "grad_norm": 7.192797660827637, + "learning_rate": 4.141187661367034e-07, + "loss": 2.8166, + "step": 112915 + }, + { + "epoch": 7.67223807582552, + "grad_norm": 7.808196067810059, + "learning_rate": 4.1369411604837616e-07, + "loss": 2.906, + "step": 112920 + }, + { + "epoch": 7.672577795896181, + "grad_norm": 8.033120155334473, + "learning_rate": 4.1326946596004897e-07, + "loss": 2.8249, + "step": 112925 + }, + { + "epoch": 7.6729175159668435, + "grad_norm": 8.309894561767578, + "learning_rate": 4.128448158717217e-07, + "loss": 2.6118, + "step": 112930 + }, + { + "epoch": 7.6732572360375055, + "grad_norm": 8.360783576965332, + "learning_rate": 4.1242016578339446e-07, + "loss": 2.6103, + "step": 112935 + }, + { + "epoch": 7.673596956108167, + "grad_norm": 7.652331352233887, + "learning_rate": 4.119955156950673e-07, + "loss": 2.7797, + "step": 112940 + }, + { + "epoch": 7.673936676178829, + "grad_norm": 7.814927577972412, + "learning_rate": 4.1157086560674006e-07, + "loss": 2.4403, + "step": 112945 + }, + { + "epoch": 7.674276396249491, + "grad_norm": 8.424772262573242, + "learning_rate": 4.1114621551841286e-07, + "loss": 2.8949, + "step": 112950 + }, + { + "epoch": 7.674616116320152, + "grad_norm": 11.992019653320312, + "learning_rate": 4.107215654300856e-07, + "loss": 2.7791, + "step": 112955 + }, + { + "epoch": 7.674955836390814, + "grad_norm": 9.782203674316406, + "learning_rate": 4.1029691534175847e-07, + "loss": 2.6113, + "step": 112960 + }, + { + "epoch": 7.675295556461476, + "grad_norm": 7.718443393707275, + "learning_rate": 4.098722652534312e-07, + "loss": 2.6847, + "step": 112965 + }, + { + "epoch": 7.675635276532137, + "grad_norm": 9.762266159057617, + "learning_rate": 4.0944761516510396e-07, + "loss": 2.5902, + "step": 112970 + }, + { + "epoch": 7.6759749966027995, + "grad_norm": 7.80546760559082, + "learning_rate": 4.0902296507677676e-07, + "loss": 2.5928, + "step": 112975 + }, + { + "epoch": 7.6763147166734615, + "grad_norm": 8.696622848510742, + "learning_rate": 4.0859831498844956e-07, + "loss": 2.8124, + "step": 112980 + }, + { + "epoch": 7.676654436744123, + "grad_norm": 8.988733291625977, + "learning_rate": 4.0817366490012236e-07, + "loss": 2.5988, + "step": 112985 + }, + { + "epoch": 7.676994156814785, + "grad_norm": 8.293030738830566, + "learning_rate": 4.077490148117951e-07, + "loss": 2.6533, + "step": 112990 + }, + { + "epoch": 7.677333876885447, + "grad_norm": 9.095362663269043, + "learning_rate": 4.0732436472346786e-07, + "loss": 2.7574, + "step": 112995 + }, + { + "epoch": 7.677673596956108, + "grad_norm": 7.538613319396973, + "learning_rate": 4.068997146351407e-07, + "loss": 2.5264, + "step": 113000 + }, + { + "epoch": 7.67801331702677, + "grad_norm": 7.184240818023682, + "learning_rate": 4.0647506454681346e-07, + "loss": 2.7764, + "step": 113005 + }, + { + "epoch": 7.678353037097431, + "grad_norm": 8.949213981628418, + "learning_rate": 4.0605041445848626e-07, + "loss": 2.6863, + "step": 113010 + }, + { + "epoch": 7.678692757168093, + "grad_norm": 7.236576080322266, + "learning_rate": 4.05625764370159e-07, + "loss": 2.8471, + "step": 113015 + }, + { + "epoch": 7.6790324772387555, + "grad_norm": 9.13204574584961, + "learning_rate": 4.0520111428183176e-07, + "loss": 2.681, + "step": 113020 + }, + { + "epoch": 7.679372197309417, + "grad_norm": 6.7861409187316895, + "learning_rate": 4.047764641935046e-07, + "loss": 2.6588, + "step": 113025 + }, + { + "epoch": 7.679711917380079, + "grad_norm": 7.273203372955322, + "learning_rate": 4.0435181410517736e-07, + "loss": 2.6413, + "step": 113030 + }, + { + "epoch": 7.680051637450741, + "grad_norm": 9.446686744689941, + "learning_rate": 4.0392716401685016e-07, + "loss": 2.8243, + "step": 113035 + }, + { + "epoch": 7.680391357521402, + "grad_norm": 8.873855590820312, + "learning_rate": 4.035025139285229e-07, + "loss": 2.7844, + "step": 113040 + }, + { + "epoch": 7.680731077592064, + "grad_norm": 7.830885887145996, + "learning_rate": 4.0307786384019576e-07, + "loss": 2.5126, + "step": 113045 + }, + { + "epoch": 7.681070797662726, + "grad_norm": 8.294107437133789, + "learning_rate": 4.026532137518685e-07, + "loss": 2.6875, + "step": 113050 + }, + { + "epoch": 7.681410517733387, + "grad_norm": 7.399267673492432, + "learning_rate": 4.0222856366354126e-07, + "loss": 2.7447, + "step": 113055 + }, + { + "epoch": 7.681750237804049, + "grad_norm": 8.53316593170166, + "learning_rate": 4.0180391357521406e-07, + "loss": 2.8615, + "step": 113060 + }, + { + "epoch": 7.6820899578747115, + "grad_norm": 6.976664066314697, + "learning_rate": 4.013792634868868e-07, + "loss": 2.6422, + "step": 113065 + }, + { + "epoch": 7.682429677945373, + "grad_norm": 8.808432579040527, + "learning_rate": 4.0095461339855966e-07, + "loss": 2.8464, + "step": 113070 + }, + { + "epoch": 7.682769398016035, + "grad_norm": 7.878657341003418, + "learning_rate": 4.005299633102324e-07, + "loss": 2.7265, + "step": 113075 + }, + { + "epoch": 7.683109118086697, + "grad_norm": 9.209758758544922, + "learning_rate": 4.0010531322190515e-07, + "loss": 2.8146, + "step": 113080 + }, + { + "epoch": 7.683448838157358, + "grad_norm": 7.147486686706543, + "learning_rate": 3.9968066313357796e-07, + "loss": 2.7319, + "step": 113085 + }, + { + "epoch": 7.68378855822802, + "grad_norm": 8.769139289855957, + "learning_rate": 3.9925601304525076e-07, + "loss": 2.573, + "step": 113090 + }, + { + "epoch": 7.684128278298682, + "grad_norm": 8.976667404174805, + "learning_rate": 3.9883136295692356e-07, + "loss": 2.7658, + "step": 113095 + }, + { + "epoch": 7.684467998369343, + "grad_norm": 7.952949523925781, + "learning_rate": 3.984067128685963e-07, + "loss": 2.5719, + "step": 113100 + }, + { + "epoch": 7.684807718440005, + "grad_norm": 6.997871398925781, + "learning_rate": 3.9798206278026905e-07, + "loss": 2.5839, + "step": 113105 + }, + { + "epoch": 7.6851474385106675, + "grad_norm": 6.903526782989502, + "learning_rate": 3.975574126919419e-07, + "loss": 2.6267, + "step": 113110 + }, + { + "epoch": 7.685487158581329, + "grad_norm": 11.12015151977539, + "learning_rate": 3.9713276260361465e-07, + "loss": 2.6453, + "step": 113115 + }, + { + "epoch": 7.685826878651991, + "grad_norm": 8.653717994689941, + "learning_rate": 3.9670811251528745e-07, + "loss": 2.7067, + "step": 113120 + }, + { + "epoch": 7.686166598722653, + "grad_norm": 7.79026460647583, + "learning_rate": 3.962834624269602e-07, + "loss": 2.6707, + "step": 113125 + }, + { + "epoch": 7.686506318793314, + "grad_norm": 7.784934043884277, + "learning_rate": 3.9585881233863306e-07, + "loss": 2.6314, + "step": 113130 + }, + { + "epoch": 7.686846038863976, + "grad_norm": 12.750460624694824, + "learning_rate": 3.954341622503058e-07, + "loss": 2.428, + "step": 113135 + }, + { + "epoch": 7.687185758934638, + "grad_norm": 8.01950454711914, + "learning_rate": 3.9500951216197855e-07, + "loss": 2.7203, + "step": 113140 + }, + { + "epoch": 7.687525479005299, + "grad_norm": 7.041368007659912, + "learning_rate": 3.9458486207365135e-07, + "loss": 2.669, + "step": 113145 + }, + { + "epoch": 7.687865199075961, + "grad_norm": 10.80254077911377, + "learning_rate": 3.941602119853241e-07, + "loss": 2.7153, + "step": 113150 + }, + { + "epoch": 7.6882049191466235, + "grad_norm": 7.795762062072754, + "learning_rate": 3.9373556189699695e-07, + "loss": 2.8937, + "step": 113155 + }, + { + "epoch": 7.688544639217285, + "grad_norm": 9.592793464660645, + "learning_rate": 3.933109118086697e-07, + "loss": 2.6614, + "step": 113160 + }, + { + "epoch": 7.688884359287947, + "grad_norm": 8.420768737792969, + "learning_rate": 3.9288626172034245e-07, + "loss": 2.511, + "step": 113165 + }, + { + "epoch": 7.689224079358609, + "grad_norm": 9.440881729125977, + "learning_rate": 3.9246161163201525e-07, + "loss": 2.6997, + "step": 113170 + }, + { + "epoch": 7.68956379942927, + "grad_norm": 6.717258930206299, + "learning_rate": 3.92036961543688e-07, + "loss": 2.7414, + "step": 113175 + }, + { + "epoch": 7.689903519499932, + "grad_norm": 7.431273460388184, + "learning_rate": 3.9161231145536085e-07, + "loss": 2.8983, + "step": 113180 + }, + { + "epoch": 7.690243239570594, + "grad_norm": 8.100153923034668, + "learning_rate": 3.911876613670336e-07, + "loss": 2.6827, + "step": 113185 + }, + { + "epoch": 7.690582959641255, + "grad_norm": 7.372929096221924, + "learning_rate": 3.9076301127870635e-07, + "loss": 2.8694, + "step": 113190 + }, + { + "epoch": 7.6909226797119175, + "grad_norm": 9.751885414123535, + "learning_rate": 3.9033836119037915e-07, + "loss": 2.5835, + "step": 113195 + }, + { + "epoch": 7.6912623997825795, + "grad_norm": 6.91343355178833, + "learning_rate": 3.899137111020519e-07, + "loss": 2.5188, + "step": 113200 + }, + { + "epoch": 7.691602119853241, + "grad_norm": 8.657690048217773, + "learning_rate": 3.8948906101372475e-07, + "loss": 2.5634, + "step": 113205 + }, + { + "epoch": 7.691941839923903, + "grad_norm": 9.839330673217773, + "learning_rate": 3.890644109253975e-07, + "loss": 2.7478, + "step": 113210 + }, + { + "epoch": 7.692281559994565, + "grad_norm": 7.098385334014893, + "learning_rate": 3.8863976083707025e-07, + "loss": 2.6858, + "step": 113215 + }, + { + "epoch": 7.692621280065226, + "grad_norm": 9.186285018920898, + "learning_rate": 3.882151107487431e-07, + "loss": 2.7406, + "step": 113220 + }, + { + "epoch": 7.692961000135888, + "grad_norm": 7.914554119110107, + "learning_rate": 3.8779046066041585e-07, + "loss": 2.7178, + "step": 113225 + }, + { + "epoch": 7.693300720206549, + "grad_norm": 7.631162166595459, + "learning_rate": 3.8736581057208865e-07, + "loss": 2.6706, + "step": 113230 + }, + { + "epoch": 7.693640440277211, + "grad_norm": 7.797874927520752, + "learning_rate": 3.869411604837614e-07, + "loss": 2.6275, + "step": 113235 + }, + { + "epoch": 7.6939801603478735, + "grad_norm": 7.840734004974365, + "learning_rate": 3.8651651039543425e-07, + "loss": 2.8662, + "step": 113240 + }, + { + "epoch": 7.694319880418535, + "grad_norm": 9.609986305236816, + "learning_rate": 3.86091860307107e-07, + "loss": 2.8632, + "step": 113245 + }, + { + "epoch": 7.694659600489197, + "grad_norm": 8.869410514831543, + "learning_rate": 3.8566721021877975e-07, + "loss": 2.5774, + "step": 113250 + }, + { + "epoch": 7.694999320559859, + "grad_norm": 7.768618106842041, + "learning_rate": 3.8524256013045255e-07, + "loss": 2.7048, + "step": 113255 + }, + { + "epoch": 7.69533904063052, + "grad_norm": 9.041075706481934, + "learning_rate": 3.848179100421253e-07, + "loss": 2.781, + "step": 113260 + }, + { + "epoch": 7.695678760701182, + "grad_norm": 8.572905540466309, + "learning_rate": 3.8439325995379815e-07, + "loss": 2.5439, + "step": 113265 + }, + { + "epoch": 7.696018480771844, + "grad_norm": 9.233549118041992, + "learning_rate": 3.839686098654709e-07, + "loss": 2.7067, + "step": 113270 + }, + { + "epoch": 7.696358200842505, + "grad_norm": 7.352399826049805, + "learning_rate": 3.8354395977714364e-07, + "loss": 2.9769, + "step": 113275 + }, + { + "epoch": 7.696697920913167, + "grad_norm": 8.89102840423584, + "learning_rate": 3.8311930968881644e-07, + "loss": 2.732, + "step": 113280 + }, + { + "epoch": 7.6970376409838295, + "grad_norm": 7.355029582977295, + "learning_rate": 3.826946596004892e-07, + "loss": 2.7324, + "step": 113285 + }, + { + "epoch": 7.697377361054491, + "grad_norm": 8.403160095214844, + "learning_rate": 3.8227000951216205e-07, + "loss": 2.7129, + "step": 113290 + }, + { + "epoch": 7.697717081125153, + "grad_norm": 9.352324485778809, + "learning_rate": 3.818453594238348e-07, + "loss": 2.6786, + "step": 113295 + }, + { + "epoch": 7.698056801195815, + "grad_norm": 6.28654670715332, + "learning_rate": 3.8142070933550754e-07, + "loss": 2.6354, + "step": 113300 + }, + { + "epoch": 7.698396521266476, + "grad_norm": 7.951404094696045, + "learning_rate": 3.8099605924718034e-07, + "loss": 2.7277, + "step": 113305 + }, + { + "epoch": 7.698736241337138, + "grad_norm": 8.676368713378906, + "learning_rate": 3.805714091588531e-07, + "loss": 2.7203, + "step": 113310 + }, + { + "epoch": 7.6990759614078, + "grad_norm": 7.814903736114502, + "learning_rate": 3.8014675907052594e-07, + "loss": 2.7343, + "step": 113315 + }, + { + "epoch": 7.699415681478461, + "grad_norm": 8.859959602355957, + "learning_rate": 3.797221089821987e-07, + "loss": 2.6349, + "step": 113320 + }, + { + "epoch": 7.699755401549123, + "grad_norm": 7.450127601623535, + "learning_rate": 3.792974588938715e-07, + "loss": 2.7405, + "step": 113325 + }, + { + "epoch": 7.7000951216197855, + "grad_norm": 8.971728324890137, + "learning_rate": 3.7887280880554424e-07, + "loss": 2.7238, + "step": 113330 + }, + { + "epoch": 7.700434841690447, + "grad_norm": 8.290078163146973, + "learning_rate": 3.7844815871721704e-07, + "loss": 2.783, + "step": 113335 + }, + { + "epoch": 7.700774561761109, + "grad_norm": 8.221224784851074, + "learning_rate": 3.7802350862888984e-07, + "loss": 2.5987, + "step": 113340 + }, + { + "epoch": 7.701114281831771, + "grad_norm": 8.697818756103516, + "learning_rate": 3.775988585405626e-07, + "loss": 2.6886, + "step": 113345 + }, + { + "epoch": 7.701454001902432, + "grad_norm": 11.05338191986084, + "learning_rate": 3.771742084522354e-07, + "loss": 2.8242, + "step": 113350 + }, + { + "epoch": 7.701793721973094, + "grad_norm": 7.670877456665039, + "learning_rate": 3.767495583639082e-07, + "loss": 2.9731, + "step": 113355 + }, + { + "epoch": 7.702133442043756, + "grad_norm": 7.284666061401367, + "learning_rate": 3.7632490827558094e-07, + "loss": 2.6494, + "step": 113360 + }, + { + "epoch": 7.702473162114417, + "grad_norm": 9.438789367675781, + "learning_rate": 3.7590025818725374e-07, + "loss": 2.4216, + "step": 113365 + }, + { + "epoch": 7.702812882185079, + "grad_norm": 10.488374710083008, + "learning_rate": 3.754756080989265e-07, + "loss": 2.8062, + "step": 113370 + }, + { + "epoch": 7.7031526022557415, + "grad_norm": 8.150050163269043, + "learning_rate": 3.7505095801059934e-07, + "loss": 2.6546, + "step": 113375 + }, + { + "epoch": 7.703492322326403, + "grad_norm": 7.688365459442139, + "learning_rate": 3.746263079222721e-07, + "loss": 2.7281, + "step": 113380 + }, + { + "epoch": 7.703832042397065, + "grad_norm": 8.170842170715332, + "learning_rate": 3.7420165783394484e-07, + "loss": 2.9981, + "step": 113385 + }, + { + "epoch": 7.704171762467727, + "grad_norm": 7.618350505828857, + "learning_rate": 3.7377700774561764e-07, + "loss": 2.9812, + "step": 113390 + }, + { + "epoch": 7.704511482538388, + "grad_norm": 7.107935428619385, + "learning_rate": 3.733523576572904e-07, + "loss": 2.6841, + "step": 113395 + }, + { + "epoch": 7.70485120260905, + "grad_norm": 7.794602394104004, + "learning_rate": 3.7292770756896324e-07, + "loss": 2.6358, + "step": 113400 + }, + { + "epoch": 7.705190922679712, + "grad_norm": 7.341071605682373, + "learning_rate": 3.72503057480636e-07, + "loss": 2.7966, + "step": 113405 + }, + { + "epoch": 7.705530642750373, + "grad_norm": 9.342300415039062, + "learning_rate": 3.720784073923088e-07, + "loss": 2.885, + "step": 113410 + }, + { + "epoch": 7.705870362821035, + "grad_norm": 9.95678997039795, + "learning_rate": 3.7165375730398154e-07, + "loss": 2.6899, + "step": 113415 + }, + { + "epoch": 7.7062100828916975, + "grad_norm": 6.338374614715576, + "learning_rate": 3.712291072156543e-07, + "loss": 2.6365, + "step": 113420 + }, + { + "epoch": 7.706549802962359, + "grad_norm": 8.320364952087402, + "learning_rate": 3.7080445712732714e-07, + "loss": 2.7308, + "step": 113425 + }, + { + "epoch": 7.706889523033021, + "grad_norm": 9.393622398376465, + "learning_rate": 3.703798070389999e-07, + "loss": 2.505, + "step": 113430 + }, + { + "epoch": 7.707229243103683, + "grad_norm": 6.856492042541504, + "learning_rate": 3.699551569506727e-07, + "loss": 2.6472, + "step": 113435 + }, + { + "epoch": 7.707568963174344, + "grad_norm": 9.790009498596191, + "learning_rate": 3.6953050686234543e-07, + "loss": 2.4442, + "step": 113440 + }, + { + "epoch": 7.707908683245006, + "grad_norm": 9.6905517578125, + "learning_rate": 3.6910585677401823e-07, + "loss": 2.7941, + "step": 113445 + }, + { + "epoch": 7.708248403315668, + "grad_norm": 6.860785007476807, + "learning_rate": 3.6868120668569104e-07, + "loss": 2.5114, + "step": 113450 + }, + { + "epoch": 7.708588123386329, + "grad_norm": 6.835826873779297, + "learning_rate": 3.682565565973638e-07, + "loss": 2.5367, + "step": 113455 + }, + { + "epoch": 7.7089278434569914, + "grad_norm": 7.928064823150635, + "learning_rate": 3.678319065090366e-07, + "loss": 2.7434, + "step": 113460 + }, + { + "epoch": 7.7092675635276535, + "grad_norm": 8.102869033813477, + "learning_rate": 3.674072564207094e-07, + "loss": 2.6975, + "step": 113465 + }, + { + "epoch": 7.709607283598315, + "grad_norm": 6.631292343139648, + "learning_rate": 3.6698260633238213e-07, + "loss": 2.8224, + "step": 113470 + }, + { + "epoch": 7.709947003668977, + "grad_norm": 8.14957046508789, + "learning_rate": 3.6655795624405493e-07, + "loss": 2.7681, + "step": 113475 + }, + { + "epoch": 7.710286723739639, + "grad_norm": 8.086451530456543, + "learning_rate": 3.661333061557277e-07, + "loss": 2.7279, + "step": 113480 + }, + { + "epoch": 7.7106264438103, + "grad_norm": 7.7633490562438965, + "learning_rate": 3.6570865606740053e-07, + "loss": 2.6117, + "step": 113485 + }, + { + "epoch": 7.710966163880962, + "grad_norm": 6.8412394523620605, + "learning_rate": 3.652840059790733e-07, + "loss": 2.6189, + "step": 113490 + }, + { + "epoch": 7.711305883951624, + "grad_norm": 6.314999580383301, + "learning_rate": 3.648593558907461e-07, + "loss": 2.763, + "step": 113495 + }, + { + "epoch": 7.711645604022285, + "grad_norm": 10.200093269348145, + "learning_rate": 3.6443470580241883e-07, + "loss": 2.716, + "step": 113500 + }, + { + "epoch": 7.7119853240929475, + "grad_norm": 7.643606185913086, + "learning_rate": 3.640100557140916e-07, + "loss": 2.882, + "step": 113505 + }, + { + "epoch": 7.7123250441636095, + "grad_norm": 7.878694534301758, + "learning_rate": 3.6358540562576443e-07, + "loss": 2.6778, + "step": 113510 + }, + { + "epoch": 7.712664764234271, + "grad_norm": 7.505698204040527, + "learning_rate": 3.631607555374372e-07, + "loss": 2.7084, + "step": 113515 + }, + { + "epoch": 7.713004484304933, + "grad_norm": 9.512552261352539, + "learning_rate": 3.6273610544911e-07, + "loss": 2.5815, + "step": 113520 + }, + { + "epoch": 7.713344204375595, + "grad_norm": 8.273946762084961, + "learning_rate": 3.6231145536078273e-07, + "loss": 2.6694, + "step": 113525 + }, + { + "epoch": 7.713683924446256, + "grad_norm": 7.945606231689453, + "learning_rate": 3.618868052724555e-07, + "loss": 2.6975, + "step": 113530 + }, + { + "epoch": 7.714023644516918, + "grad_norm": 8.745502471923828, + "learning_rate": 3.6146215518412833e-07, + "loss": 2.5999, + "step": 113535 + }, + { + "epoch": 7.71436336458758, + "grad_norm": 8.340679168701172, + "learning_rate": 3.610375050958011e-07, + "loss": 2.5616, + "step": 113540 + }, + { + "epoch": 7.714703084658241, + "grad_norm": 8.814310073852539, + "learning_rate": 3.606128550074739e-07, + "loss": 2.4265, + "step": 113545 + }, + { + "epoch": 7.7150428047289035, + "grad_norm": 8.661347389221191, + "learning_rate": 3.6018820491914663e-07, + "loss": 2.6272, + "step": 113550 + }, + { + "epoch": 7.7153825247995655, + "grad_norm": 10.647444725036621, + "learning_rate": 3.597635548308194e-07, + "loss": 2.6443, + "step": 113555 + }, + { + "epoch": 7.715722244870227, + "grad_norm": 8.166482925415039, + "learning_rate": 3.5933890474249223e-07, + "loss": 2.5907, + "step": 113560 + }, + { + "epoch": 7.716061964940889, + "grad_norm": 6.712428092956543, + "learning_rate": 3.58914254654165e-07, + "loss": 2.4819, + "step": 113565 + }, + { + "epoch": 7.716401685011551, + "grad_norm": 9.475564002990723, + "learning_rate": 3.584896045658378e-07, + "loss": 2.6549, + "step": 113570 + }, + { + "epoch": 7.716741405082212, + "grad_norm": 7.789132595062256, + "learning_rate": 3.580649544775105e-07, + "loss": 2.4289, + "step": 113575 + }, + { + "epoch": 7.717081125152874, + "grad_norm": 9.529703140258789, + "learning_rate": 3.576403043891834e-07, + "loss": 2.6827, + "step": 113580 + }, + { + "epoch": 7.717420845223536, + "grad_norm": 10.506417274475098, + "learning_rate": 3.5721565430085613e-07, + "loss": 2.7304, + "step": 113585 + }, + { + "epoch": 7.717760565294197, + "grad_norm": 9.713706970214844, + "learning_rate": 3.567910042125289e-07, + "loss": 2.5382, + "step": 113590 + }, + { + "epoch": 7.7181002853648595, + "grad_norm": 10.31142520904541, + "learning_rate": 3.5636635412420173e-07, + "loss": 2.5682, + "step": 113595 + }, + { + "epoch": 7.7184400054355216, + "grad_norm": 8.379339218139648, + "learning_rate": 3.559417040358745e-07, + "loss": 2.3397, + "step": 113600 + }, + { + "epoch": 7.718779725506183, + "grad_norm": 7.738608360290527, + "learning_rate": 3.555170539475473e-07, + "loss": 2.5846, + "step": 113605 + }, + { + "epoch": 7.719119445576845, + "grad_norm": 8.691726684570312, + "learning_rate": 3.5509240385922e-07, + "loss": 2.6571, + "step": 113610 + }, + { + "epoch": 7.719459165647507, + "grad_norm": 7.215550899505615, + "learning_rate": 3.5466775377089277e-07, + "loss": 2.4543, + "step": 113615 + }, + { + "epoch": 7.719798885718168, + "grad_norm": 7.263142108917236, + "learning_rate": 3.542431036825656e-07, + "loss": 2.6688, + "step": 113620 + }, + { + "epoch": 7.72013860578883, + "grad_norm": 8.144495964050293, + "learning_rate": 3.538184535942384e-07, + "loss": 2.6906, + "step": 113625 + }, + { + "epoch": 7.720478325859492, + "grad_norm": 6.578419208526611, + "learning_rate": 3.533938035059112e-07, + "loss": 2.7878, + "step": 113630 + }, + { + "epoch": 7.720818045930153, + "grad_norm": 8.113940238952637, + "learning_rate": 3.529691534175839e-07, + "loss": 2.5864, + "step": 113635 + }, + { + "epoch": 7.7211577660008155, + "grad_norm": 7.226139545440674, + "learning_rate": 3.5254450332925667e-07, + "loss": 2.7609, + "step": 113640 + }, + { + "epoch": 7.721497486071478, + "grad_norm": 9.095551490783691, + "learning_rate": 3.521198532409295e-07, + "loss": 2.6475, + "step": 113645 + }, + { + "epoch": 7.721837206142139, + "grad_norm": 9.391404151916504, + "learning_rate": 3.5169520315260227e-07, + "loss": 2.8223, + "step": 113650 + }, + { + "epoch": 7.722176926212801, + "grad_norm": 6.442759037017822, + "learning_rate": 3.5127055306427507e-07, + "loss": 2.6057, + "step": 113655 + }, + { + "epoch": 7.722516646283463, + "grad_norm": 9.419758796691895, + "learning_rate": 3.508459029759478e-07, + "loss": 2.7466, + "step": 113660 + }, + { + "epoch": 7.722856366354124, + "grad_norm": 9.334787368774414, + "learning_rate": 3.504212528876207e-07, + "loss": 2.6375, + "step": 113665 + }, + { + "epoch": 7.723196086424786, + "grad_norm": 8.970197677612305, + "learning_rate": 3.499966027992934e-07, + "loss": 2.6818, + "step": 113670 + }, + { + "epoch": 7.723535806495448, + "grad_norm": 8.166556358337402, + "learning_rate": 3.4957195271096617e-07, + "loss": 2.9509, + "step": 113675 + }, + { + "epoch": 7.723875526566109, + "grad_norm": 8.546700477600098, + "learning_rate": 3.4914730262263897e-07, + "loss": 2.9236, + "step": 113680 + }, + { + "epoch": 7.7242152466367715, + "grad_norm": 7.052271842956543, + "learning_rate": 3.487226525343117e-07, + "loss": 2.8674, + "step": 113685 + }, + { + "epoch": 7.724554966707433, + "grad_norm": 8.637552261352539, + "learning_rate": 3.4829800244598457e-07, + "loss": 2.7544, + "step": 113690 + }, + { + "epoch": 7.724894686778095, + "grad_norm": 9.037741661071777, + "learning_rate": 3.478733523576573e-07, + "loss": 2.7754, + "step": 113695 + }, + { + "epoch": 7.725234406848757, + "grad_norm": 8.119441986083984, + "learning_rate": 3.4744870226933007e-07, + "loss": 2.6987, + "step": 113700 + }, + { + "epoch": 7.725574126919418, + "grad_norm": 8.50890827178955, + "learning_rate": 3.4702405218100287e-07, + "loss": 2.9351, + "step": 113705 + }, + { + "epoch": 7.72591384699008, + "grad_norm": 9.72246265411377, + "learning_rate": 3.4659940209267567e-07, + "loss": 2.5185, + "step": 113710 + }, + { + "epoch": 7.726253567060742, + "grad_norm": 7.691476345062256, + "learning_rate": 3.4617475200434847e-07, + "loss": 2.8892, + "step": 113715 + }, + { + "epoch": 7.726593287131403, + "grad_norm": 7.253110885620117, + "learning_rate": 3.457501019160212e-07, + "loss": 2.818, + "step": 113720 + }, + { + "epoch": 7.726933007202065, + "grad_norm": 9.048832893371582, + "learning_rate": 3.4532545182769397e-07, + "loss": 2.6528, + "step": 113725 + }, + { + "epoch": 7.7272727272727275, + "grad_norm": 8.82658576965332, + "learning_rate": 3.449008017393668e-07, + "loss": 2.9622, + "step": 113730 + }, + { + "epoch": 7.727612447343389, + "grad_norm": 10.015902519226074, + "learning_rate": 3.4447615165103957e-07, + "loss": 2.9099, + "step": 113735 + }, + { + "epoch": 7.727952167414051, + "grad_norm": 6.406132698059082, + "learning_rate": 3.4405150156271237e-07, + "loss": 2.8837, + "step": 113740 + }, + { + "epoch": 7.728291887484713, + "grad_norm": 7.542420387268066, + "learning_rate": 3.436268514743851e-07, + "loss": 2.8263, + "step": 113745 + }, + { + "epoch": 7.728631607555374, + "grad_norm": 10.37404727935791, + "learning_rate": 3.4320220138605797e-07, + "loss": 2.6293, + "step": 113750 + }, + { + "epoch": 7.728971327626036, + "grad_norm": 9.62850284576416, + "learning_rate": 3.427775512977307e-07, + "loss": 2.6582, + "step": 113755 + }, + { + "epoch": 7.729311047696698, + "grad_norm": 9.168839454650879, + "learning_rate": 3.4235290120940347e-07, + "loss": 2.7088, + "step": 113760 + }, + { + "epoch": 7.729650767767359, + "grad_norm": 8.825216293334961, + "learning_rate": 3.4192825112107627e-07, + "loss": 2.845, + "step": 113765 + }, + { + "epoch": 7.7299904878380215, + "grad_norm": 10.049570083618164, + "learning_rate": 3.41503601032749e-07, + "loss": 2.6489, + "step": 113770 + }, + { + "epoch": 7.7303302079086835, + "grad_norm": 7.729151725769043, + "learning_rate": 3.4107895094442187e-07, + "loss": 2.6227, + "step": 113775 + }, + { + "epoch": 7.730669927979345, + "grad_norm": 6.956608772277832, + "learning_rate": 3.406543008560946e-07, + "loss": 2.6847, + "step": 113780 + }, + { + "epoch": 7.731009648050007, + "grad_norm": 8.417196273803711, + "learning_rate": 3.4022965076776736e-07, + "loss": 2.589, + "step": 113785 + }, + { + "epoch": 7.731349368120669, + "grad_norm": 6.906093597412109, + "learning_rate": 3.3980500067944016e-07, + "loss": 2.5971, + "step": 113790 + }, + { + "epoch": 7.73168908819133, + "grad_norm": 6.857492446899414, + "learning_rate": 3.393803505911129e-07, + "loss": 2.6087, + "step": 113795 + }, + { + "epoch": 7.732028808261992, + "grad_norm": 7.817075729370117, + "learning_rate": 3.3895570050278577e-07, + "loss": 2.7654, + "step": 113800 + }, + { + "epoch": 7.732368528332654, + "grad_norm": 7.9623494148254395, + "learning_rate": 3.385310504144585e-07, + "loss": 2.7778, + "step": 113805 + }, + { + "epoch": 7.732708248403315, + "grad_norm": 8.848958969116211, + "learning_rate": 3.3810640032613126e-07, + "loss": 2.3926, + "step": 113810 + }, + { + "epoch": 7.7330479684739775, + "grad_norm": 6.743760585784912, + "learning_rate": 3.3768175023780406e-07, + "loss": 2.7422, + "step": 113815 + }, + { + "epoch": 7.7333876885446395, + "grad_norm": 7.888717174530029, + "learning_rate": 3.3725710014947686e-07, + "loss": 2.7225, + "step": 113820 + }, + { + "epoch": 7.733727408615301, + "grad_norm": 6.817549228668213, + "learning_rate": 3.3683245006114966e-07, + "loss": 2.6788, + "step": 113825 + }, + { + "epoch": 7.734067128685963, + "grad_norm": 8.189140319824219, + "learning_rate": 3.364077999728224e-07, + "loss": 2.5776, + "step": 113830 + }, + { + "epoch": 7.734406848756625, + "grad_norm": 10.705788612365723, + "learning_rate": 3.359831498844952e-07, + "loss": 3.0438, + "step": 113835 + }, + { + "epoch": 7.734746568827286, + "grad_norm": 7.109524250030518, + "learning_rate": 3.35558499796168e-07, + "loss": 2.9678, + "step": 113840 + }, + { + "epoch": 7.735086288897948, + "grad_norm": 8.14693832397461, + "learning_rate": 3.3513384970784076e-07, + "loss": 2.6902, + "step": 113845 + }, + { + "epoch": 7.73542600896861, + "grad_norm": 7.054921627044678, + "learning_rate": 3.3470919961951356e-07, + "loss": 2.6783, + "step": 113850 + }, + { + "epoch": 7.735765729039271, + "grad_norm": 8.562044143676758, + "learning_rate": 3.342845495311863e-07, + "loss": 2.7292, + "step": 113855 + }, + { + "epoch": 7.7361054491099335, + "grad_norm": 7.151323318481445, + "learning_rate": 3.3385989944285916e-07, + "loss": 2.457, + "step": 113860 + }, + { + "epoch": 7.7364451691805955, + "grad_norm": 9.634020805358887, + "learning_rate": 3.334352493545319e-07, + "loss": 2.797, + "step": 113865 + }, + { + "epoch": 7.736784889251257, + "grad_norm": 7.779661655426025, + "learning_rate": 3.3301059926620466e-07, + "loss": 2.7681, + "step": 113870 + }, + { + "epoch": 7.737124609321919, + "grad_norm": 5.748180389404297, + "learning_rate": 3.3258594917787746e-07, + "loss": 2.8405, + "step": 113875 + }, + { + "epoch": 7.737464329392581, + "grad_norm": 7.541518688201904, + "learning_rate": 3.321612990895502e-07, + "loss": 2.8766, + "step": 113880 + }, + { + "epoch": 7.737804049463242, + "grad_norm": 8.793869972229004, + "learning_rate": 3.3173664900122306e-07, + "loss": 2.5098, + "step": 113885 + }, + { + "epoch": 7.738143769533904, + "grad_norm": 10.168913841247559, + "learning_rate": 3.313119989128958e-07, + "loss": 2.6156, + "step": 113890 + }, + { + "epoch": 7.738483489604566, + "grad_norm": 7.989985466003418, + "learning_rate": 3.3088734882456856e-07, + "loss": 2.8141, + "step": 113895 + }, + { + "epoch": 7.738823209675227, + "grad_norm": 9.319067001342773, + "learning_rate": 3.3046269873624136e-07, + "loss": 2.7658, + "step": 113900 + }, + { + "epoch": 7.7391629297458895, + "grad_norm": 7.394050121307373, + "learning_rate": 3.300380486479141e-07, + "loss": 2.6038, + "step": 113905 + }, + { + "epoch": 7.739502649816551, + "grad_norm": 7.238613128662109, + "learning_rate": 3.2961339855958696e-07, + "loss": 2.6874, + "step": 113910 + }, + { + "epoch": 7.739842369887213, + "grad_norm": 9.074837684631348, + "learning_rate": 3.291887484712597e-07, + "loss": 2.7506, + "step": 113915 + }, + { + "epoch": 7.740182089957875, + "grad_norm": 6.522716999053955, + "learning_rate": 3.287640983829325e-07, + "loss": 2.6967, + "step": 113920 + }, + { + "epoch": 7.740521810028536, + "grad_norm": 9.481184959411621, + "learning_rate": 3.2833944829460526e-07, + "loss": 2.8515, + "step": 113925 + }, + { + "epoch": 7.740861530099198, + "grad_norm": 7.803825855255127, + "learning_rate": 3.27914798206278e-07, + "loss": 2.9651, + "step": 113930 + }, + { + "epoch": 7.74120125016986, + "grad_norm": 6.753282070159912, + "learning_rate": 3.2749014811795086e-07, + "loss": 2.6008, + "step": 113935 + }, + { + "epoch": 7.741540970240521, + "grad_norm": 8.537287712097168, + "learning_rate": 3.270654980296236e-07, + "loss": 2.8244, + "step": 113940 + }, + { + "epoch": 7.741880690311183, + "grad_norm": 6.91664981842041, + "learning_rate": 3.266408479412964e-07, + "loss": 2.6492, + "step": 113945 + }, + { + "epoch": 7.7422204103818455, + "grad_norm": 7.614202976226807, + "learning_rate": 3.2621619785296915e-07, + "loss": 2.5525, + "step": 113950 + }, + { + "epoch": 7.742560130452507, + "grad_norm": 7.353086948394775, + "learning_rate": 3.2579154776464195e-07, + "loss": 2.638, + "step": 113955 + }, + { + "epoch": 7.742899850523169, + "grad_norm": 7.96789026260376, + "learning_rate": 3.2536689767631476e-07, + "loss": 2.8706, + "step": 113960 + }, + { + "epoch": 7.743239570593831, + "grad_norm": 9.517501831054688, + "learning_rate": 3.249422475879875e-07, + "loss": 2.7685, + "step": 113965 + }, + { + "epoch": 7.743579290664492, + "grad_norm": 8.555281639099121, + "learning_rate": 3.2451759749966036e-07, + "loss": 2.7396, + "step": 113970 + }, + { + "epoch": 7.743919010735154, + "grad_norm": 6.996113300323486, + "learning_rate": 3.240929474113331e-07, + "loss": 2.6867, + "step": 113975 + }, + { + "epoch": 7.744258730805816, + "grad_norm": 7.567750453948975, + "learning_rate": 3.2366829732300585e-07, + "loss": 2.7762, + "step": 113980 + }, + { + "epoch": 7.744598450876477, + "grad_norm": 7.868231296539307, + "learning_rate": 3.2324364723467865e-07, + "loss": 2.488, + "step": 113985 + }, + { + "epoch": 7.744938170947139, + "grad_norm": 7.257296085357666, + "learning_rate": 3.228189971463514e-07, + "loss": 2.5477, + "step": 113990 + }, + { + "epoch": 7.7452778910178015, + "grad_norm": 6.33450174331665, + "learning_rate": 3.2239434705802425e-07, + "loss": 2.6851, + "step": 113995 + }, + { + "epoch": 7.745617611088463, + "grad_norm": 9.047738075256348, + "learning_rate": 3.21969696969697e-07, + "loss": 2.9905, + "step": 114000 + }, + { + "epoch": 7.745957331159125, + "grad_norm": 6.513221740722656, + "learning_rate": 3.215450468813698e-07, + "loss": 2.5176, + "step": 114005 + }, + { + "epoch": 7.746297051229787, + "grad_norm": 7.31383752822876, + "learning_rate": 3.2112039679304255e-07, + "loss": 2.6192, + "step": 114010 + }, + { + "epoch": 7.746636771300448, + "grad_norm": 8.61780834197998, + "learning_rate": 3.206957467047153e-07, + "loss": 2.7518, + "step": 114015 + }, + { + "epoch": 7.74697649137111, + "grad_norm": 7.068830966949463, + "learning_rate": 3.2027109661638815e-07, + "loss": 2.475, + "step": 114020 + }, + { + "epoch": 7.747316211441772, + "grad_norm": 8.015585899353027, + "learning_rate": 3.198464465280609e-07, + "loss": 2.8129, + "step": 114025 + }, + { + "epoch": 7.747655931512433, + "grad_norm": 7.392707824707031, + "learning_rate": 3.194217964397337e-07, + "loss": 2.6885, + "step": 114030 + }, + { + "epoch": 7.7479956515830954, + "grad_norm": 6.83221435546875, + "learning_rate": 3.1899714635140645e-07, + "loss": 2.7072, + "step": 114035 + }, + { + "epoch": 7.7483353716537575, + "grad_norm": 6.806998252868652, + "learning_rate": 3.185724962630792e-07, + "loss": 2.7978, + "step": 114040 + }, + { + "epoch": 7.748675091724419, + "grad_norm": 5.519594669342041, + "learning_rate": 3.1814784617475205e-07, + "loss": 2.7224, + "step": 114045 + }, + { + "epoch": 7.749014811795081, + "grad_norm": 10.610265731811523, + "learning_rate": 3.177231960864248e-07, + "loss": 2.748, + "step": 114050 + }, + { + "epoch": 7.749354531865743, + "grad_norm": 8.596470832824707, + "learning_rate": 3.172985459980976e-07, + "loss": 2.8115, + "step": 114055 + }, + { + "epoch": 7.749694251936404, + "grad_norm": 7.628328323364258, + "learning_rate": 3.1687389590977035e-07, + "loss": 2.5005, + "step": 114060 + }, + { + "epoch": 7.750033972007066, + "grad_norm": 8.587454795837402, + "learning_rate": 3.1644924582144315e-07, + "loss": 2.6901, + "step": 114065 + }, + { + "epoch": 7.750373692077728, + "grad_norm": 8.674003601074219, + "learning_rate": 3.1602459573311595e-07, + "loss": 2.6899, + "step": 114070 + }, + { + "epoch": 7.750713412148389, + "grad_norm": 7.267258167266846, + "learning_rate": 3.155999456447887e-07, + "loss": 2.5314, + "step": 114075 + }, + { + "epoch": 7.7510531322190515, + "grad_norm": 7.69483757019043, + "learning_rate": 3.151752955564615e-07, + "loss": 2.5697, + "step": 114080 + }, + { + "epoch": 7.7513928522897135, + "grad_norm": 8.500443458557129, + "learning_rate": 3.147506454681343e-07, + "loss": 2.6562, + "step": 114085 + }, + { + "epoch": 7.751732572360375, + "grad_norm": 6.058792591094971, + "learning_rate": 3.143259953798071e-07, + "loss": 2.6629, + "step": 114090 + }, + { + "epoch": 7.752072292431037, + "grad_norm": 8.258993148803711, + "learning_rate": 3.1390134529147985e-07, + "loss": 2.8644, + "step": 114095 + }, + { + "epoch": 7.752412012501699, + "grad_norm": 5.653958797454834, + "learning_rate": 3.134766952031526e-07, + "loss": 2.6093, + "step": 114100 + }, + { + "epoch": 7.75275173257236, + "grad_norm": 7.856331825256348, + "learning_rate": 3.1305204511482545e-07, + "loss": 2.5571, + "step": 114105 + }, + { + "epoch": 7.753091452643022, + "grad_norm": 6.559114456176758, + "learning_rate": 3.126273950264982e-07, + "loss": 2.7728, + "step": 114110 + }, + { + "epoch": 7.753431172713684, + "grad_norm": 9.830819129943848, + "learning_rate": 3.1220274493817094e-07, + "loss": 2.5573, + "step": 114115 + }, + { + "epoch": 7.753770892784345, + "grad_norm": 10.288336753845215, + "learning_rate": 3.1177809484984374e-07, + "loss": 3.146, + "step": 114120 + }, + { + "epoch": 7.7541106128550075, + "grad_norm": 7.625933647155762, + "learning_rate": 3.1135344476151655e-07, + "loss": 2.7783, + "step": 114125 + }, + { + "epoch": 7.7544503329256695, + "grad_norm": 8.078999519348145, + "learning_rate": 3.1092879467318935e-07, + "loss": 2.7037, + "step": 114130 + }, + { + "epoch": 7.754790052996331, + "grad_norm": 7.77820348739624, + "learning_rate": 3.105041445848621e-07, + "loss": 2.7292, + "step": 114135 + }, + { + "epoch": 7.755129773066993, + "grad_norm": 10.724055290222168, + "learning_rate": 3.100794944965349e-07, + "loss": 2.7054, + "step": 114140 + }, + { + "epoch": 7.755469493137655, + "grad_norm": 6.84876823425293, + "learning_rate": 3.0965484440820764e-07, + "loss": 2.7189, + "step": 114145 + }, + { + "epoch": 7.755809213208316, + "grad_norm": 8.711947441101074, + "learning_rate": 3.0923019431988044e-07, + "loss": 2.7199, + "step": 114150 + }, + { + "epoch": 7.756148933278978, + "grad_norm": 6.479823112487793, + "learning_rate": 3.0880554423155324e-07, + "loss": 2.7964, + "step": 114155 + }, + { + "epoch": 7.75648865334964, + "grad_norm": 8.715189933776855, + "learning_rate": 3.0838089414322604e-07, + "loss": 2.8677, + "step": 114160 + }, + { + "epoch": 7.756828373420301, + "grad_norm": 8.350300788879395, + "learning_rate": 3.079562440548988e-07, + "loss": 2.5072, + "step": 114165 + }, + { + "epoch": 7.7571680934909635, + "grad_norm": 8.338738441467285, + "learning_rate": 3.0753159396657154e-07, + "loss": 2.7559, + "step": 114170 + }, + { + "epoch": 7.7575078135616256, + "grad_norm": 7.923107624053955, + "learning_rate": 3.0710694387824434e-07, + "loss": 2.9657, + "step": 114175 + }, + { + "epoch": 7.757847533632287, + "grad_norm": 8.63033676147461, + "learning_rate": 3.0668229378991714e-07, + "loss": 2.8698, + "step": 114180 + }, + { + "epoch": 7.758187253702949, + "grad_norm": 10.213493347167969, + "learning_rate": 3.0625764370158994e-07, + "loss": 2.9106, + "step": 114185 + }, + { + "epoch": 7.758526973773611, + "grad_norm": 7.466827392578125, + "learning_rate": 3.058329936132627e-07, + "loss": 2.858, + "step": 114190 + }, + { + "epoch": 7.758866693844272, + "grad_norm": 6.947376251220703, + "learning_rate": 3.054083435249355e-07, + "loss": 2.7959, + "step": 114195 + }, + { + "epoch": 7.759206413914934, + "grad_norm": 7.178340435028076, + "learning_rate": 3.0498369343660824e-07, + "loss": 2.7073, + "step": 114200 + }, + { + "epoch": 7.759546133985596, + "grad_norm": 8.867268562316895, + "learning_rate": 3.0455904334828104e-07, + "loss": 2.7215, + "step": 114205 + }, + { + "epoch": 7.759885854056257, + "grad_norm": 9.217534065246582, + "learning_rate": 3.0413439325995384e-07, + "loss": 2.8755, + "step": 114210 + }, + { + "epoch": 7.7602255741269195, + "grad_norm": 6.982851982116699, + "learning_rate": 3.0370974317162664e-07, + "loss": 2.7967, + "step": 114215 + }, + { + "epoch": 7.760565294197582, + "grad_norm": 8.528509140014648, + "learning_rate": 3.032850930832994e-07, + "loss": 2.5608, + "step": 114220 + }, + { + "epoch": 7.760905014268243, + "grad_norm": 8.67634105682373, + "learning_rate": 3.0286044299497214e-07, + "loss": 2.6768, + "step": 114225 + }, + { + "epoch": 7.761244734338905, + "grad_norm": 6.476839542388916, + "learning_rate": 3.0243579290664494e-07, + "loss": 2.9267, + "step": 114230 + }, + { + "epoch": 7.761584454409567, + "grad_norm": 9.945734024047852, + "learning_rate": 3.0201114281831774e-07, + "loss": 2.5268, + "step": 114235 + }, + { + "epoch": 7.761924174480228, + "grad_norm": 11.328886985778809, + "learning_rate": 3.0158649272999054e-07, + "loss": 2.6113, + "step": 114240 + }, + { + "epoch": 7.76226389455089, + "grad_norm": 8.071566581726074, + "learning_rate": 3.011618426416633e-07, + "loss": 2.6786, + "step": 114245 + }, + { + "epoch": 7.762603614621552, + "grad_norm": 6.753025531768799, + "learning_rate": 3.007371925533361e-07, + "loss": 2.5186, + "step": 114250 + }, + { + "epoch": 7.762943334692213, + "grad_norm": 8.580702781677246, + "learning_rate": 3.0031254246500884e-07, + "loss": 2.8802, + "step": 114255 + }, + { + "epoch": 7.7632830547628755, + "grad_norm": 6.825517177581787, + "learning_rate": 2.9988789237668164e-07, + "loss": 2.9855, + "step": 114260 + }, + { + "epoch": 7.763622774833538, + "grad_norm": 7.186278343200684, + "learning_rate": 2.9946324228835444e-07, + "loss": 2.7595, + "step": 114265 + }, + { + "epoch": 7.763962494904199, + "grad_norm": 8.17024040222168, + "learning_rate": 2.9903859220002724e-07, + "loss": 2.4685, + "step": 114270 + }, + { + "epoch": 7.764302214974861, + "grad_norm": 8.39708423614502, + "learning_rate": 2.986139421117e-07, + "loss": 2.7189, + "step": 114275 + }, + { + "epoch": 7.764641935045523, + "grad_norm": 7.839599132537842, + "learning_rate": 2.9818929202337273e-07, + "loss": 2.6201, + "step": 114280 + }, + { + "epoch": 7.764981655116184, + "grad_norm": 6.764581203460693, + "learning_rate": 2.9776464193504553e-07, + "loss": 2.6777, + "step": 114285 + }, + { + "epoch": 7.765321375186846, + "grad_norm": 9.578606605529785, + "learning_rate": 2.9733999184671834e-07, + "loss": 2.9455, + "step": 114290 + }, + { + "epoch": 7.765661095257508, + "grad_norm": 7.185581207275391, + "learning_rate": 2.9691534175839114e-07, + "loss": 2.8003, + "step": 114295 + }, + { + "epoch": 7.766000815328169, + "grad_norm": 8.807286262512207, + "learning_rate": 2.964906916700639e-07, + "loss": 2.7241, + "step": 114300 + }, + { + "epoch": 7.7663405353988315, + "grad_norm": 7.441795825958252, + "learning_rate": 2.9606604158173663e-07, + "loss": 2.8268, + "step": 114305 + }, + { + "epoch": 7.766680255469494, + "grad_norm": 6.322577953338623, + "learning_rate": 2.9564139149340943e-07, + "loss": 2.7834, + "step": 114310 + }, + { + "epoch": 7.767019975540155, + "grad_norm": 8.070356369018555, + "learning_rate": 2.9521674140508223e-07, + "loss": 2.5693, + "step": 114315 + }, + { + "epoch": 7.767359695610817, + "grad_norm": 7.196419715881348, + "learning_rate": 2.9479209131675503e-07, + "loss": 2.9188, + "step": 114320 + }, + { + "epoch": 7.767699415681479, + "grad_norm": 7.515167236328125, + "learning_rate": 2.943674412284278e-07, + "loss": 2.9027, + "step": 114325 + }, + { + "epoch": 7.76803913575214, + "grad_norm": 7.287962913513184, + "learning_rate": 2.939427911401006e-07, + "loss": 2.6862, + "step": 114330 + }, + { + "epoch": 7.768378855822802, + "grad_norm": 8.107891082763672, + "learning_rate": 2.9351814105177333e-07, + "loss": 2.573, + "step": 114335 + }, + { + "epoch": 7.768718575893464, + "grad_norm": 5.689860820770264, + "learning_rate": 2.9309349096344613e-07, + "loss": 2.8539, + "step": 114340 + }, + { + "epoch": 7.7690582959641254, + "grad_norm": 7.232652187347412, + "learning_rate": 2.9266884087511893e-07, + "loss": 2.5955, + "step": 114345 + }, + { + "epoch": 7.7693980160347875, + "grad_norm": 9.33568286895752, + "learning_rate": 2.9224419078679173e-07, + "loss": 2.8378, + "step": 114350 + }, + { + "epoch": 7.76973773610545, + "grad_norm": 6.5194926261901855, + "learning_rate": 2.918195406984645e-07, + "loss": 2.5701, + "step": 114355 + }, + { + "epoch": 7.770077456176111, + "grad_norm": 8.227875709533691, + "learning_rate": 2.913948906101373e-07, + "loss": 2.6451, + "step": 114360 + }, + { + "epoch": 7.770417176246773, + "grad_norm": 7.4950432777404785, + "learning_rate": 2.9097024052181003e-07, + "loss": 2.7635, + "step": 114365 + }, + { + "epoch": 7.770756896317434, + "grad_norm": 8.890029907226562, + "learning_rate": 2.9054559043348283e-07, + "loss": 2.5935, + "step": 114370 + }, + { + "epoch": 7.771096616388096, + "grad_norm": 6.941178798675537, + "learning_rate": 2.9012094034515563e-07, + "loss": 2.6895, + "step": 114375 + }, + { + "epoch": 7.771436336458758, + "grad_norm": 8.595246315002441, + "learning_rate": 2.896962902568284e-07, + "loss": 2.7605, + "step": 114380 + }, + { + "epoch": 7.771776056529419, + "grad_norm": 7.906587600708008, + "learning_rate": 2.892716401685012e-07, + "loss": 2.6988, + "step": 114385 + }, + { + "epoch": 7.7721157766000815, + "grad_norm": 6.63019323348999, + "learning_rate": 2.8884699008017393e-07, + "loss": 2.6013, + "step": 114390 + }, + { + "epoch": 7.7724554966707435, + "grad_norm": 7.452862739562988, + "learning_rate": 2.8842233999184673e-07, + "loss": 2.8578, + "step": 114395 + }, + { + "epoch": 7.772795216741405, + "grad_norm": 8.041932106018066, + "learning_rate": 2.8799768990351953e-07, + "loss": 2.5268, + "step": 114400 + }, + { + "epoch": 7.773134936812067, + "grad_norm": 7.260488033294678, + "learning_rate": 2.8757303981519233e-07, + "loss": 2.6773, + "step": 114405 + }, + { + "epoch": 7.773474656882729, + "grad_norm": 6.749757766723633, + "learning_rate": 2.871483897268651e-07, + "loss": 2.5891, + "step": 114410 + }, + { + "epoch": 7.77381437695339, + "grad_norm": 9.226146697998047, + "learning_rate": 2.867237396385379e-07, + "loss": 2.7931, + "step": 114415 + }, + { + "epoch": 7.774154097024052, + "grad_norm": 6.060998916625977, + "learning_rate": 2.862990895502106e-07, + "loss": 2.7781, + "step": 114420 + }, + { + "epoch": 7.774493817094714, + "grad_norm": 8.678848266601562, + "learning_rate": 2.8587443946188343e-07, + "loss": 2.5468, + "step": 114425 + }, + { + "epoch": 7.774833537165375, + "grad_norm": 10.474563598632812, + "learning_rate": 2.8544978937355623e-07, + "loss": 2.6309, + "step": 114430 + }, + { + "epoch": 7.7751732572360375, + "grad_norm": 7.398797512054443, + "learning_rate": 2.85025139285229e-07, + "loss": 2.8345, + "step": 114435 + }, + { + "epoch": 7.7755129773066995, + "grad_norm": 8.072220802307129, + "learning_rate": 2.846004891969018e-07, + "loss": 3.0254, + "step": 114440 + }, + { + "epoch": 7.775852697377361, + "grad_norm": 7.579319953918457, + "learning_rate": 2.841758391085746e-07, + "loss": 2.5383, + "step": 114445 + }, + { + "epoch": 7.776192417448023, + "grad_norm": 8.232376098632812, + "learning_rate": 2.837511890202473e-07, + "loss": 2.7889, + "step": 114450 + }, + { + "epoch": 7.776532137518685, + "grad_norm": 7.995406150817871, + "learning_rate": 2.833265389319201e-07, + "loss": 2.4777, + "step": 114455 + }, + { + "epoch": 7.776871857589346, + "grad_norm": 7.836703777313232, + "learning_rate": 2.8290188884359293e-07, + "loss": 2.5883, + "step": 114460 + }, + { + "epoch": 7.777211577660008, + "grad_norm": 8.265981674194336, + "learning_rate": 2.824772387552657e-07, + "loss": 2.6171, + "step": 114465 + }, + { + "epoch": 7.77755129773067, + "grad_norm": 11.950753211975098, + "learning_rate": 2.820525886669385e-07, + "loss": 2.7747, + "step": 114470 + }, + { + "epoch": 7.777891017801331, + "grad_norm": 8.14674186706543, + "learning_rate": 2.816279385786112e-07, + "loss": 2.5242, + "step": 114475 + }, + { + "epoch": 7.7782307378719935, + "grad_norm": 7.703287124633789, + "learning_rate": 2.81203288490284e-07, + "loss": 2.6161, + "step": 114480 + }, + { + "epoch": 7.778570457942656, + "grad_norm": 7.028209209442139, + "learning_rate": 2.807786384019568e-07, + "loss": 2.911, + "step": 114485 + }, + { + "epoch": 7.778910178013317, + "grad_norm": 11.081083297729492, + "learning_rate": 2.8035398831362957e-07, + "loss": 2.9603, + "step": 114490 + }, + { + "epoch": 7.779249898083979, + "grad_norm": 6.717909336090088, + "learning_rate": 2.7992933822530237e-07, + "loss": 2.8699, + "step": 114495 + }, + { + "epoch": 7.779589618154641, + "grad_norm": 7.715188980102539, + "learning_rate": 2.795046881369752e-07, + "loss": 2.3474, + "step": 114500 + }, + { + "epoch": 7.779929338225302, + "grad_norm": 6.272082805633545, + "learning_rate": 2.790800380486479e-07, + "loss": 2.4386, + "step": 114505 + }, + { + "epoch": 7.780269058295964, + "grad_norm": 6.983165740966797, + "learning_rate": 2.786553879603207e-07, + "loss": 2.9461, + "step": 114510 + }, + { + "epoch": 7.780608778366626, + "grad_norm": 6.633607864379883, + "learning_rate": 2.782307378719935e-07, + "loss": 2.6856, + "step": 114515 + }, + { + "epoch": 7.780948498437287, + "grad_norm": 7.91344690322876, + "learning_rate": 2.7780608778366627e-07, + "loss": 2.6591, + "step": 114520 + }, + { + "epoch": 7.7812882185079495, + "grad_norm": 8.634305953979492, + "learning_rate": 2.7738143769533907e-07, + "loss": 2.8514, + "step": 114525 + }, + { + "epoch": 7.781627938578612, + "grad_norm": 7.637790203094482, + "learning_rate": 2.7695678760701187e-07, + "loss": 2.837, + "step": 114530 + }, + { + "epoch": 7.781967658649273, + "grad_norm": 10.001874923706055, + "learning_rate": 2.765321375186846e-07, + "loss": 2.7081, + "step": 114535 + }, + { + "epoch": 7.782307378719935, + "grad_norm": 7.750490188598633, + "learning_rate": 2.761074874303574e-07, + "loss": 2.6241, + "step": 114540 + }, + { + "epoch": 7.782647098790597, + "grad_norm": 8.286956787109375, + "learning_rate": 2.7568283734203017e-07, + "loss": 2.7514, + "step": 114545 + }, + { + "epoch": 7.782986818861258, + "grad_norm": 9.003355026245117, + "learning_rate": 2.7525818725370297e-07, + "loss": 2.8778, + "step": 114550 + }, + { + "epoch": 7.78332653893192, + "grad_norm": 7.533811092376709, + "learning_rate": 2.7483353716537577e-07, + "loss": 2.5578, + "step": 114555 + }, + { + "epoch": 7.783666259002582, + "grad_norm": 5.811221599578857, + "learning_rate": 2.744088870770485e-07, + "loss": 2.7189, + "step": 114560 + }, + { + "epoch": 7.784005979073243, + "grad_norm": 7.330059051513672, + "learning_rate": 2.739842369887213e-07, + "loss": 2.9586, + "step": 114565 + }, + { + "epoch": 7.7843456991439055, + "grad_norm": 8.785995483398438, + "learning_rate": 2.735595869003941e-07, + "loss": 2.52, + "step": 114570 + }, + { + "epoch": 7.784685419214568, + "grad_norm": 6.882681369781494, + "learning_rate": 2.7313493681206687e-07, + "loss": 2.5896, + "step": 114575 + }, + { + "epoch": 7.785025139285229, + "grad_norm": 8.395081520080566, + "learning_rate": 2.7271028672373967e-07, + "loss": 2.797, + "step": 114580 + }, + { + "epoch": 7.785364859355891, + "grad_norm": 8.198617935180664, + "learning_rate": 2.7228563663541247e-07, + "loss": 2.6365, + "step": 114585 + }, + { + "epoch": 7.785704579426552, + "grad_norm": 7.021304607391357, + "learning_rate": 2.718609865470852e-07, + "loss": 2.7411, + "step": 114590 + }, + { + "epoch": 7.786044299497214, + "grad_norm": 7.730652809143066, + "learning_rate": 2.71436336458758e-07, + "loss": 2.8072, + "step": 114595 + }, + { + "epoch": 7.786384019567876, + "grad_norm": 8.495217323303223, + "learning_rate": 2.7101168637043077e-07, + "loss": 2.7506, + "step": 114600 + }, + { + "epoch": 7.786723739638537, + "grad_norm": 7.446511268615723, + "learning_rate": 2.7058703628210357e-07, + "loss": 2.796, + "step": 114605 + }, + { + "epoch": 7.787063459709199, + "grad_norm": 8.591687202453613, + "learning_rate": 2.7016238619377637e-07, + "loss": 2.5091, + "step": 114610 + }, + { + "epoch": 7.7874031797798615, + "grad_norm": 7.72086763381958, + "learning_rate": 2.697377361054491e-07, + "loss": 2.7371, + "step": 114615 + }, + { + "epoch": 7.787742899850523, + "grad_norm": 6.7513651847839355, + "learning_rate": 2.693130860171219e-07, + "loss": 2.8577, + "step": 114620 + }, + { + "epoch": 7.788082619921185, + "grad_norm": 8.230768203735352, + "learning_rate": 2.6888843592879466e-07, + "loss": 2.6998, + "step": 114625 + }, + { + "epoch": 7.788422339991847, + "grad_norm": 8.042003631591797, + "learning_rate": 2.6846378584046746e-07, + "loss": 2.6281, + "step": 114630 + }, + { + "epoch": 7.788762060062508, + "grad_norm": 6.916992664337158, + "learning_rate": 2.6803913575214027e-07, + "loss": 2.8846, + "step": 114635 + }, + { + "epoch": 7.78910178013317, + "grad_norm": 6.828179836273193, + "learning_rate": 2.6761448566381307e-07, + "loss": 2.789, + "step": 114640 + }, + { + "epoch": 7.789441500203832, + "grad_norm": 7.135223388671875, + "learning_rate": 2.671898355754858e-07, + "loss": 2.5399, + "step": 114645 + }, + { + "epoch": 7.789781220274493, + "grad_norm": 8.276198387145996, + "learning_rate": 2.667651854871586e-07, + "loss": 2.6686, + "step": 114650 + }, + { + "epoch": 7.7901209403451555, + "grad_norm": 7.6797566413879395, + "learning_rate": 2.6634053539883136e-07, + "loss": 2.737, + "step": 114655 + }, + { + "epoch": 7.7904606604158175, + "grad_norm": 11.178754806518555, + "learning_rate": 2.6591588531050416e-07, + "loss": 2.8012, + "step": 114660 + }, + { + "epoch": 7.790800380486479, + "grad_norm": 8.656359672546387, + "learning_rate": 2.6549123522217696e-07, + "loss": 2.7973, + "step": 114665 + }, + { + "epoch": 7.791140100557141, + "grad_norm": 8.273550987243652, + "learning_rate": 2.6506658513384976e-07, + "loss": 2.7665, + "step": 114670 + }, + { + "epoch": 7.791479820627803, + "grad_norm": 7.318753242492676, + "learning_rate": 2.646419350455225e-07, + "loss": 2.7204, + "step": 114675 + }, + { + "epoch": 7.791819540698464, + "grad_norm": 6.777953624725342, + "learning_rate": 2.6421728495719526e-07, + "loss": 2.577, + "step": 114680 + }, + { + "epoch": 7.792159260769126, + "grad_norm": 8.258685111999512, + "learning_rate": 2.6379263486886806e-07, + "loss": 2.5522, + "step": 114685 + }, + { + "epoch": 7.792498980839788, + "grad_norm": 9.228285789489746, + "learning_rate": 2.6336798478054086e-07, + "loss": 2.7049, + "step": 114690 + }, + { + "epoch": 7.792838700910449, + "grad_norm": 6.558497905731201, + "learning_rate": 2.6294333469221366e-07, + "loss": 2.408, + "step": 114695 + }, + { + "epoch": 7.7931784209811115, + "grad_norm": 8.56920051574707, + "learning_rate": 2.625186846038864e-07, + "loss": 2.9002, + "step": 114700 + }, + { + "epoch": 7.7935181410517735, + "grad_norm": 7.183115482330322, + "learning_rate": 2.620940345155592e-07, + "loss": 2.5469, + "step": 114705 + }, + { + "epoch": 7.793857861122435, + "grad_norm": 7.7588090896606445, + "learning_rate": 2.6166938442723196e-07, + "loss": 2.3989, + "step": 114710 + }, + { + "epoch": 7.794197581193097, + "grad_norm": 9.41382122039795, + "learning_rate": 2.6124473433890476e-07, + "loss": 2.6178, + "step": 114715 + }, + { + "epoch": 7.794537301263759, + "grad_norm": 10.125990867614746, + "learning_rate": 2.6082008425057756e-07, + "loss": 2.6891, + "step": 114720 + }, + { + "epoch": 7.79487702133442, + "grad_norm": 6.638819217681885, + "learning_rate": 2.6039543416225036e-07, + "loss": 2.5628, + "step": 114725 + }, + { + "epoch": 7.795216741405082, + "grad_norm": 8.649521827697754, + "learning_rate": 2.599707840739231e-07, + "loss": 2.6379, + "step": 114730 + }, + { + "epoch": 7.795556461475744, + "grad_norm": 9.15991497039795, + "learning_rate": 2.5954613398559586e-07, + "loss": 2.4922, + "step": 114735 + }, + { + "epoch": 7.795896181546405, + "grad_norm": 8.839217185974121, + "learning_rate": 2.5912148389726866e-07, + "loss": 2.5854, + "step": 114740 + }, + { + "epoch": 7.7962359016170675, + "grad_norm": 7.158214569091797, + "learning_rate": 2.5869683380894146e-07, + "loss": 2.8621, + "step": 114745 + }, + { + "epoch": 7.7965756216877296, + "grad_norm": 9.611515045166016, + "learning_rate": 2.5827218372061426e-07, + "loss": 2.8598, + "step": 114750 + }, + { + "epoch": 7.796915341758391, + "grad_norm": 9.681757926940918, + "learning_rate": 2.57847533632287e-07, + "loss": 2.5181, + "step": 114755 + }, + { + "epoch": 7.797255061829053, + "grad_norm": 7.581335067749023, + "learning_rate": 2.574228835439598e-07, + "loss": 2.8031, + "step": 114760 + }, + { + "epoch": 7.797594781899715, + "grad_norm": 7.161927700042725, + "learning_rate": 2.5699823345563256e-07, + "loss": 2.8148, + "step": 114765 + }, + { + "epoch": 7.797934501970376, + "grad_norm": 8.177756309509277, + "learning_rate": 2.5657358336730536e-07, + "loss": 2.7709, + "step": 114770 + }, + { + "epoch": 7.798274222041038, + "grad_norm": 6.649795055389404, + "learning_rate": 2.5614893327897816e-07, + "loss": 2.6836, + "step": 114775 + }, + { + "epoch": 7.7986139421117, + "grad_norm": 8.732820510864258, + "learning_rate": 2.5572428319065096e-07, + "loss": 2.9177, + "step": 114780 + }, + { + "epoch": 7.798953662182361, + "grad_norm": 7.716604232788086, + "learning_rate": 2.552996331023237e-07, + "loss": 2.6548, + "step": 114785 + }, + { + "epoch": 7.7992933822530235, + "grad_norm": 7.125, + "learning_rate": 2.5487498301399645e-07, + "loss": 2.7315, + "step": 114790 + }, + { + "epoch": 7.799633102323686, + "grad_norm": 7.159175395965576, + "learning_rate": 2.5445033292566925e-07, + "loss": 2.8686, + "step": 114795 + }, + { + "epoch": 7.799972822394347, + "grad_norm": 7.228115558624268, + "learning_rate": 2.5402568283734206e-07, + "loss": 2.6515, + "step": 114800 + }, + { + "epoch": 7.800312542465009, + "grad_norm": 7.335932731628418, + "learning_rate": 2.5360103274901486e-07, + "loss": 2.9236, + "step": 114805 + }, + { + "epoch": 7.800652262535671, + "grad_norm": 6.351710319519043, + "learning_rate": 2.531763826606876e-07, + "loss": 2.7703, + "step": 114810 + }, + { + "epoch": 7.800991982606332, + "grad_norm": 9.025604248046875, + "learning_rate": 2.527517325723604e-07, + "loss": 2.6629, + "step": 114815 + }, + { + "epoch": 7.801331702676994, + "grad_norm": 11.29885482788086, + "learning_rate": 2.5232708248403315e-07, + "loss": 2.6963, + "step": 114820 + }, + { + "epoch": 7.801671422747656, + "grad_norm": 7.875865936279297, + "learning_rate": 2.5190243239570595e-07, + "loss": 2.7095, + "step": 114825 + }, + { + "epoch": 7.802011142818317, + "grad_norm": 7.62087345123291, + "learning_rate": 2.5147778230737875e-07, + "loss": 2.6871, + "step": 114830 + }, + { + "epoch": 7.8023508628889795, + "grad_norm": 6.66489315032959, + "learning_rate": 2.5105313221905156e-07, + "loss": 2.7381, + "step": 114835 + }, + { + "epoch": 7.802690582959642, + "grad_norm": 7.924178600311279, + "learning_rate": 2.506284821307243e-07, + "loss": 2.62, + "step": 114840 + }, + { + "epoch": 7.803030303030303, + "grad_norm": 6.951166152954102, + "learning_rate": 2.5020383204239705e-07, + "loss": 2.7387, + "step": 114845 + }, + { + "epoch": 7.803370023100965, + "grad_norm": 7.49631404876709, + "learning_rate": 2.4977918195406985e-07, + "loss": 2.5731, + "step": 114850 + }, + { + "epoch": 7.803709743171627, + "grad_norm": 8.368805885314941, + "learning_rate": 2.4935453186574265e-07, + "loss": 2.7941, + "step": 114855 + }, + { + "epoch": 7.804049463242288, + "grad_norm": 7.134113788604736, + "learning_rate": 2.4892988177741545e-07, + "loss": 2.4128, + "step": 114860 + }, + { + "epoch": 7.80438918331295, + "grad_norm": 10.015911102294922, + "learning_rate": 2.485052316890882e-07, + "loss": 2.892, + "step": 114865 + }, + { + "epoch": 7.804728903383612, + "grad_norm": 8.470890998840332, + "learning_rate": 2.48080581600761e-07, + "loss": 2.5291, + "step": 114870 + }, + { + "epoch": 7.805068623454273, + "grad_norm": 7.785782814025879, + "learning_rate": 2.4765593151243375e-07, + "loss": 2.5192, + "step": 114875 + }, + { + "epoch": 7.8054083435249355, + "grad_norm": 7.993209362030029, + "learning_rate": 2.4723128142410655e-07, + "loss": 2.5262, + "step": 114880 + }, + { + "epoch": 7.805748063595598, + "grad_norm": 7.404713153839111, + "learning_rate": 2.4680663133577935e-07, + "loss": 2.7639, + "step": 114885 + }, + { + "epoch": 7.806087783666259, + "grad_norm": 9.27019214630127, + "learning_rate": 2.4638198124745215e-07, + "loss": 2.4587, + "step": 114890 + }, + { + "epoch": 7.806427503736921, + "grad_norm": 6.859924793243408, + "learning_rate": 2.459573311591249e-07, + "loss": 2.8513, + "step": 114895 + }, + { + "epoch": 7.806767223807583, + "grad_norm": 8.680018424987793, + "learning_rate": 2.4553268107079765e-07, + "loss": 2.8427, + "step": 114900 + }, + { + "epoch": 7.807106943878244, + "grad_norm": 7.622374057769775, + "learning_rate": 2.4510803098247045e-07, + "loss": 2.255, + "step": 114905 + }, + { + "epoch": 7.807446663948906, + "grad_norm": 7.163965225219727, + "learning_rate": 2.4468338089414325e-07, + "loss": 2.8213, + "step": 114910 + }, + { + "epoch": 7.807786384019568, + "grad_norm": 10.087153434753418, + "learning_rate": 2.4425873080581605e-07, + "loss": 2.7921, + "step": 114915 + }, + { + "epoch": 7.8081261040902294, + "grad_norm": 7.6342244148254395, + "learning_rate": 2.438340807174888e-07, + "loss": 2.5029, + "step": 114920 + }, + { + "epoch": 7.8084658241608915, + "grad_norm": 9.323267936706543, + "learning_rate": 2.434094306291616e-07, + "loss": 2.6175, + "step": 114925 + }, + { + "epoch": 7.808805544231554, + "grad_norm": 8.166363716125488, + "learning_rate": 2.4298478054083435e-07, + "loss": 2.8615, + "step": 114930 + }, + { + "epoch": 7.809145264302215, + "grad_norm": 8.897570610046387, + "learning_rate": 2.4256013045250715e-07, + "loss": 2.6323, + "step": 114935 + }, + { + "epoch": 7.809484984372877, + "grad_norm": 7.93409538269043, + "learning_rate": 2.4213548036417995e-07, + "loss": 2.7781, + "step": 114940 + }, + { + "epoch": 7.809824704443539, + "grad_norm": 7.95390510559082, + "learning_rate": 2.4171083027585275e-07, + "loss": 2.62, + "step": 114945 + }, + { + "epoch": 7.8101644245142, + "grad_norm": 7.544737815856934, + "learning_rate": 2.412861801875255e-07, + "loss": 2.6747, + "step": 114950 + }, + { + "epoch": 7.810504144584862, + "grad_norm": 8.93802547454834, + "learning_rate": 2.4086153009919824e-07, + "loss": 2.8645, + "step": 114955 + }, + { + "epoch": 7.810843864655524, + "grad_norm": 8.920289993286133, + "learning_rate": 2.4043688001087104e-07, + "loss": 2.9512, + "step": 114960 + }, + { + "epoch": 7.8111835847261855, + "grad_norm": 8.9852933883667, + "learning_rate": 2.4001222992254385e-07, + "loss": 2.7173, + "step": 114965 + }, + { + "epoch": 7.8115233047968475, + "grad_norm": 7.546746253967285, + "learning_rate": 2.3958757983421665e-07, + "loss": 2.9082, + "step": 114970 + }, + { + "epoch": 7.81186302486751, + "grad_norm": 9.022119522094727, + "learning_rate": 2.391629297458894e-07, + "loss": 2.7664, + "step": 114975 + }, + { + "epoch": 7.812202744938171, + "grad_norm": 9.883119583129883, + "learning_rate": 2.387382796575622e-07, + "loss": 2.4538, + "step": 114980 + }, + { + "epoch": 7.812542465008833, + "grad_norm": 8.157175064086914, + "learning_rate": 2.3831362956923494e-07, + "loss": 2.7209, + "step": 114985 + }, + { + "epoch": 7.812882185079495, + "grad_norm": 7.698624134063721, + "learning_rate": 2.3788897948090774e-07, + "loss": 2.7031, + "step": 114990 + }, + { + "epoch": 7.813221905150156, + "grad_norm": 7.668309211730957, + "learning_rate": 2.3746432939258052e-07, + "loss": 2.7666, + "step": 114995 + }, + { + "epoch": 7.813561625220818, + "grad_norm": 8.867155075073242, + "learning_rate": 2.3703967930425332e-07, + "loss": 2.7341, + "step": 115000 + }, + { + "epoch": 7.81390134529148, + "grad_norm": 5.9789958000183105, + "learning_rate": 2.3661502921592612e-07, + "loss": 2.6156, + "step": 115005 + }, + { + "epoch": 7.8142410653621415, + "grad_norm": 7.596443176269531, + "learning_rate": 2.361903791275989e-07, + "loss": 2.7616, + "step": 115010 + }, + { + "epoch": 7.8145807854328035, + "grad_norm": 7.3708624839782715, + "learning_rate": 2.3576572903927164e-07, + "loss": 2.5889, + "step": 115015 + }, + { + "epoch": 7.814920505503466, + "grad_norm": 7.203526020050049, + "learning_rate": 2.3534107895094444e-07, + "loss": 2.6492, + "step": 115020 + }, + { + "epoch": 7.815260225574127, + "grad_norm": 6.820315361022949, + "learning_rate": 2.3491642886261722e-07, + "loss": 2.5208, + "step": 115025 + }, + { + "epoch": 7.815599945644789, + "grad_norm": 6.990660190582275, + "learning_rate": 2.3449177877429002e-07, + "loss": 2.5915, + "step": 115030 + }, + { + "epoch": 7.815939665715451, + "grad_norm": 8.67441177368164, + "learning_rate": 2.340671286859628e-07, + "loss": 2.7956, + "step": 115035 + }, + { + "epoch": 7.816279385786112, + "grad_norm": 6.782947063446045, + "learning_rate": 2.3364247859763554e-07, + "loss": 2.4854, + "step": 115040 + }, + { + "epoch": 7.816619105856774, + "grad_norm": 7.701325416564941, + "learning_rate": 2.3321782850930834e-07, + "loss": 2.777, + "step": 115045 + }, + { + "epoch": 7.816958825927435, + "grad_norm": 9.66102123260498, + "learning_rate": 2.3279317842098111e-07, + "loss": 2.4504, + "step": 115050 + }, + { + "epoch": 7.8172985459980975, + "grad_norm": 9.411872863769531, + "learning_rate": 2.3236852833265392e-07, + "loss": 2.8758, + "step": 115055 + }, + { + "epoch": 7.81763826606876, + "grad_norm": 7.856961727142334, + "learning_rate": 2.319438782443267e-07, + "loss": 3.0365, + "step": 115060 + }, + { + "epoch": 7.817977986139421, + "grad_norm": 6.037312984466553, + "learning_rate": 2.315192281559995e-07, + "loss": 2.952, + "step": 115065 + }, + { + "epoch": 7.818317706210083, + "grad_norm": 7.160463809967041, + "learning_rate": 2.3109457806767224e-07, + "loss": 2.7694, + "step": 115070 + }, + { + "epoch": 7.818657426280745, + "grad_norm": 7.929126262664795, + "learning_rate": 2.3066992797934504e-07, + "loss": 2.4078, + "step": 115075 + }, + { + "epoch": 7.818997146351406, + "grad_norm": 7.086804389953613, + "learning_rate": 2.3024527789101781e-07, + "loss": 2.7245, + "step": 115080 + }, + { + "epoch": 7.819336866422068, + "grad_norm": 8.266281127929688, + "learning_rate": 2.2982062780269061e-07, + "loss": 2.5649, + "step": 115085 + }, + { + "epoch": 7.81967658649273, + "grad_norm": 7.08773136138916, + "learning_rate": 2.293959777143634e-07, + "loss": 2.673, + "step": 115090 + }, + { + "epoch": 7.820016306563391, + "grad_norm": 9.416447639465332, + "learning_rate": 2.289713276260362e-07, + "loss": 2.8973, + "step": 115095 + }, + { + "epoch": 7.8203560266340535, + "grad_norm": 6.491541385650635, + "learning_rate": 2.2854667753770894e-07, + "loss": 2.5569, + "step": 115100 + }, + { + "epoch": 7.820695746704716, + "grad_norm": 8.881916046142578, + "learning_rate": 2.281220274493817e-07, + "loss": 2.6913, + "step": 115105 + }, + { + "epoch": 7.821035466775377, + "grad_norm": 7.266821384429932, + "learning_rate": 2.276973773610545e-07, + "loss": 2.7222, + "step": 115110 + }, + { + "epoch": 7.821375186846039, + "grad_norm": 7.6400299072265625, + "learning_rate": 2.2727272727272729e-07, + "loss": 2.9741, + "step": 115115 + }, + { + "epoch": 7.821714906916701, + "grad_norm": 6.706860542297363, + "learning_rate": 2.268480771844001e-07, + "loss": 2.7414, + "step": 115120 + }, + { + "epoch": 7.822054626987362, + "grad_norm": 8.006677627563477, + "learning_rate": 2.2642342709607284e-07, + "loss": 2.4312, + "step": 115125 + }, + { + "epoch": 7.822394347058024, + "grad_norm": 9.77651596069336, + "learning_rate": 2.2599877700774564e-07, + "loss": 2.7533, + "step": 115130 + }, + { + "epoch": 7.822734067128686, + "grad_norm": 8.266048431396484, + "learning_rate": 2.255741269194184e-07, + "loss": 2.7129, + "step": 115135 + }, + { + "epoch": 7.823073787199347, + "grad_norm": 7.078948020935059, + "learning_rate": 2.251494768310912e-07, + "loss": 2.6043, + "step": 115140 + }, + { + "epoch": 7.8234135072700095, + "grad_norm": 7.624580383300781, + "learning_rate": 2.2472482674276399e-07, + "loss": 2.577, + "step": 115145 + }, + { + "epoch": 7.823753227340672, + "grad_norm": 7.473845958709717, + "learning_rate": 2.2430017665443679e-07, + "loss": 2.6929, + "step": 115150 + }, + { + "epoch": 7.824092947411333, + "grad_norm": 8.681792259216309, + "learning_rate": 2.2387552656610953e-07, + "loss": 2.6806, + "step": 115155 + }, + { + "epoch": 7.824432667481995, + "grad_norm": 9.01116943359375, + "learning_rate": 2.234508764777823e-07, + "loss": 2.6847, + "step": 115160 + }, + { + "epoch": 7.824772387552657, + "grad_norm": 7.078084945678711, + "learning_rate": 2.230262263894551e-07, + "loss": 2.6794, + "step": 115165 + }, + { + "epoch": 7.825112107623318, + "grad_norm": 6.520614147186279, + "learning_rate": 2.2260157630112788e-07, + "loss": 2.6739, + "step": 115170 + }, + { + "epoch": 7.82545182769398, + "grad_norm": 6.738711833953857, + "learning_rate": 2.2217692621280068e-07, + "loss": 2.5736, + "step": 115175 + }, + { + "epoch": 7.825791547764642, + "grad_norm": 7.326616287231445, + "learning_rate": 2.2175227612447346e-07, + "loss": 2.7709, + "step": 115180 + }, + { + "epoch": 7.826131267835303, + "grad_norm": 7.551150798797607, + "learning_rate": 2.2132762603614623e-07, + "loss": 2.77, + "step": 115185 + }, + { + "epoch": 7.8264709879059655, + "grad_norm": 8.327741622924805, + "learning_rate": 2.20902975947819e-07, + "loss": 2.7843, + "step": 115190 + }, + { + "epoch": 7.826810707976628, + "grad_norm": 6.5582275390625, + "learning_rate": 2.204783258594918e-07, + "loss": 2.9221, + "step": 115195 + }, + { + "epoch": 7.827150428047289, + "grad_norm": 7.362957954406738, + "learning_rate": 2.2005367577116458e-07, + "loss": 2.6663, + "step": 115200 + }, + { + "epoch": 7.827490148117951, + "grad_norm": 8.580950736999512, + "learning_rate": 2.1962902568283738e-07, + "loss": 2.7001, + "step": 115205 + }, + { + "epoch": 7.827829868188613, + "grad_norm": 9.044306755065918, + "learning_rate": 2.1920437559451013e-07, + "loss": 2.6166, + "step": 115210 + }, + { + "epoch": 7.828169588259274, + "grad_norm": 8.310127258300781, + "learning_rate": 2.187797255061829e-07, + "loss": 2.7902, + "step": 115215 + }, + { + "epoch": 7.828509308329936, + "grad_norm": 9.613973617553711, + "learning_rate": 2.183550754178557e-07, + "loss": 2.6485, + "step": 115220 + }, + { + "epoch": 7.828849028400598, + "grad_norm": 8.42148494720459, + "learning_rate": 2.1793042532952848e-07, + "loss": 2.7741, + "step": 115225 + }, + { + "epoch": 7.8291887484712595, + "grad_norm": 8.55122184753418, + "learning_rate": 2.1750577524120128e-07, + "loss": 2.7032, + "step": 115230 + }, + { + "epoch": 7.8295284685419215, + "grad_norm": 6.451178550720215, + "learning_rate": 2.1708112515287406e-07, + "loss": 2.5787, + "step": 115235 + }, + { + "epoch": 7.829868188612584, + "grad_norm": 8.280424118041992, + "learning_rate": 2.1665647506454683e-07, + "loss": 2.7207, + "step": 115240 + }, + { + "epoch": 7.830207908683245, + "grad_norm": 9.79694938659668, + "learning_rate": 2.162318249762196e-07, + "loss": 2.7513, + "step": 115245 + }, + { + "epoch": 7.830547628753907, + "grad_norm": 8.425858497619629, + "learning_rate": 2.158071748878924e-07, + "loss": 2.7881, + "step": 115250 + }, + { + "epoch": 7.830887348824569, + "grad_norm": 7.62663459777832, + "learning_rate": 2.1538252479956518e-07, + "loss": 2.6867, + "step": 115255 + }, + { + "epoch": 7.83122706889523, + "grad_norm": 7.556674957275391, + "learning_rate": 2.1495787471123798e-07, + "loss": 2.7044, + "step": 115260 + }, + { + "epoch": 7.831566788965892, + "grad_norm": 6.646024703979492, + "learning_rate": 2.1453322462291075e-07, + "loss": 2.6239, + "step": 115265 + }, + { + "epoch": 7.831906509036553, + "grad_norm": 8.11947250366211, + "learning_rate": 2.1419350455224898e-07, + "loss": 3.0489, + "step": 115270 + }, + { + "epoch": 7.8322462291072155, + "grad_norm": 6.373020648956299, + "learning_rate": 2.1376885446392173e-07, + "loss": 2.6209, + "step": 115275 + }, + { + "epoch": 7.8325859491778775, + "grad_norm": 7.985374450683594, + "learning_rate": 2.1334420437559453e-07, + "loss": 2.7136, + "step": 115280 + }, + { + "epoch": 7.832925669248539, + "grad_norm": 8.677543640136719, + "learning_rate": 2.129195542872673e-07, + "loss": 2.6721, + "step": 115285 + }, + { + "epoch": 7.833265389319201, + "grad_norm": 6.06914758682251, + "learning_rate": 2.124949041989401e-07, + "loss": 2.5828, + "step": 115290 + }, + { + "epoch": 7.833605109389863, + "grad_norm": 7.4950103759765625, + "learning_rate": 2.1207025411061288e-07, + "loss": 2.7078, + "step": 115295 + }, + { + "epoch": 7.833944829460524, + "grad_norm": 8.966503143310547, + "learning_rate": 2.1164560402228568e-07, + "loss": 2.5309, + "step": 115300 + }, + { + "epoch": 7.834284549531186, + "grad_norm": 5.801728248596191, + "learning_rate": 2.1122095393395843e-07, + "loss": 2.5624, + "step": 115305 + }, + { + "epoch": 7.834624269601848, + "grad_norm": 7.168334484100342, + "learning_rate": 2.107963038456312e-07, + "loss": 2.8853, + "step": 115310 + }, + { + "epoch": 7.834963989672509, + "grad_norm": 8.027728080749512, + "learning_rate": 2.10371653757304e-07, + "loss": 2.5757, + "step": 115315 + }, + { + "epoch": 7.8353037097431715, + "grad_norm": 8.662349700927734, + "learning_rate": 2.0994700366897678e-07, + "loss": 2.8131, + "step": 115320 + }, + { + "epoch": 7.8356434298138335, + "grad_norm": 7.862412452697754, + "learning_rate": 2.0952235358064958e-07, + "loss": 2.8192, + "step": 115325 + }, + { + "epoch": 7.835983149884495, + "grad_norm": 7.605045795440674, + "learning_rate": 2.0909770349232233e-07, + "loss": 2.6944, + "step": 115330 + }, + { + "epoch": 7.836322869955157, + "grad_norm": 7.649329662322998, + "learning_rate": 2.086730534039951e-07, + "loss": 2.7983, + "step": 115335 + }, + { + "epoch": 7.836662590025819, + "grad_norm": 7.7730913162231445, + "learning_rate": 2.082484033156679e-07, + "loss": 2.686, + "step": 115340 + }, + { + "epoch": 7.83700231009648, + "grad_norm": 7.640106678009033, + "learning_rate": 2.078237532273407e-07, + "loss": 2.7654, + "step": 115345 + }, + { + "epoch": 7.837342030167142, + "grad_norm": 9.179606437683105, + "learning_rate": 2.0739910313901348e-07, + "loss": 2.682, + "step": 115350 + }, + { + "epoch": 7.837681750237804, + "grad_norm": 6.93477201461792, + "learning_rate": 2.0697445305068628e-07, + "loss": 2.8884, + "step": 115355 + }, + { + "epoch": 7.838021470308465, + "grad_norm": 9.136988639831543, + "learning_rate": 2.0654980296235903e-07, + "loss": 2.7035, + "step": 115360 + }, + { + "epoch": 7.8383611903791275, + "grad_norm": 9.402434349060059, + "learning_rate": 2.061251528740318e-07, + "loss": 2.9029, + "step": 115365 + }, + { + "epoch": 7.83870091044979, + "grad_norm": 9.27499771118164, + "learning_rate": 2.057005027857046e-07, + "loss": 2.7064, + "step": 115370 + }, + { + "epoch": 7.839040630520451, + "grad_norm": 8.839218139648438, + "learning_rate": 2.0527585269737738e-07, + "loss": 2.4458, + "step": 115375 + }, + { + "epoch": 7.839380350591113, + "grad_norm": 8.651019096374512, + "learning_rate": 2.0485120260905018e-07, + "loss": 2.9238, + "step": 115380 + }, + { + "epoch": 7.839720070661775, + "grad_norm": 8.423755645751953, + "learning_rate": 2.0442655252072295e-07, + "loss": 2.879, + "step": 115385 + }, + { + "epoch": 7.840059790732436, + "grad_norm": 9.347433090209961, + "learning_rate": 2.040019024323957e-07, + "loss": 2.815, + "step": 115390 + }, + { + "epoch": 7.840399510803098, + "grad_norm": 7.815495491027832, + "learning_rate": 2.035772523440685e-07, + "loss": 2.6943, + "step": 115395 + }, + { + "epoch": 7.84073923087376, + "grad_norm": 7.005505084991455, + "learning_rate": 2.0315260225574127e-07, + "loss": 2.6591, + "step": 115400 + }, + { + "epoch": 7.841078950944421, + "grad_norm": 9.258087158203125, + "learning_rate": 2.0272795216741408e-07, + "loss": 2.5508, + "step": 115405 + }, + { + "epoch": 7.8414186710150835, + "grad_norm": 9.561162948608398, + "learning_rate": 2.0230330207908685e-07, + "loss": 2.5336, + "step": 115410 + }, + { + "epoch": 7.841758391085746, + "grad_norm": 8.936165809631348, + "learning_rate": 2.0187865199075962e-07, + "loss": 2.7578, + "step": 115415 + }, + { + "epoch": 7.842098111156407, + "grad_norm": 7.284128189086914, + "learning_rate": 2.014540019024324e-07, + "loss": 2.6355, + "step": 115420 + }, + { + "epoch": 7.842437831227069, + "grad_norm": 7.689365863800049, + "learning_rate": 2.010293518141052e-07, + "loss": 2.8484, + "step": 115425 + }, + { + "epoch": 7.842777551297731, + "grad_norm": 7.184866428375244, + "learning_rate": 2.0060470172577797e-07, + "loss": 2.8548, + "step": 115430 + }, + { + "epoch": 7.843117271368392, + "grad_norm": 7.48171854019165, + "learning_rate": 2.0018005163745077e-07, + "loss": 2.9009, + "step": 115435 + }, + { + "epoch": 7.843456991439054, + "grad_norm": 8.880914688110352, + "learning_rate": 1.9975540154912355e-07, + "loss": 2.6858, + "step": 115440 + }, + { + "epoch": 7.843796711509716, + "grad_norm": 9.5300931930542, + "learning_rate": 1.993307514607963e-07, + "loss": 2.7325, + "step": 115445 + }, + { + "epoch": 7.844136431580377, + "grad_norm": 7.266800403594971, + "learning_rate": 1.989061013724691e-07, + "loss": 2.8895, + "step": 115450 + }, + { + "epoch": 7.8444761516510395, + "grad_norm": 6.432939052581787, + "learning_rate": 1.9848145128414187e-07, + "loss": 2.3357, + "step": 115455 + }, + { + "epoch": 7.844815871721702, + "grad_norm": 7.928047180175781, + "learning_rate": 1.9805680119581467e-07, + "loss": 2.4652, + "step": 115460 + }, + { + "epoch": 7.845155591792363, + "grad_norm": 7.11653470993042, + "learning_rate": 1.9763215110748745e-07, + "loss": 2.6695, + "step": 115465 + }, + { + "epoch": 7.845495311863025, + "grad_norm": 8.597466468811035, + "learning_rate": 1.9720750101916025e-07, + "loss": 2.9502, + "step": 115470 + }, + { + "epoch": 7.845835031933687, + "grad_norm": 6.883492946624756, + "learning_rate": 1.96782850930833e-07, + "loss": 2.6633, + "step": 115475 + }, + { + "epoch": 7.846174752004348, + "grad_norm": 8.176053047180176, + "learning_rate": 1.963582008425058e-07, + "loss": 2.7744, + "step": 115480 + }, + { + "epoch": 7.84651447207501, + "grad_norm": 6.765979290008545, + "learning_rate": 1.9593355075417857e-07, + "loss": 2.7102, + "step": 115485 + }, + { + "epoch": 7.846854192145672, + "grad_norm": 8.971013069152832, + "learning_rate": 1.9550890066585137e-07, + "loss": 2.7414, + "step": 115490 + }, + { + "epoch": 7.8471939122163334, + "grad_norm": 9.088057518005371, + "learning_rate": 1.9508425057752415e-07, + "loss": 2.7982, + "step": 115495 + }, + { + "epoch": 7.8475336322869955, + "grad_norm": 6.478650093078613, + "learning_rate": 1.946596004891969e-07, + "loss": 3.0232, + "step": 115500 + }, + { + "epoch": 7.847873352357658, + "grad_norm": 10.078633308410645, + "learning_rate": 1.942349504008697e-07, + "loss": 2.6639, + "step": 115505 + }, + { + "epoch": 7.848213072428319, + "grad_norm": 9.513819694519043, + "learning_rate": 1.9381030031254247e-07, + "loss": 2.6393, + "step": 115510 + }, + { + "epoch": 7.848552792498981, + "grad_norm": 7.5749664306640625, + "learning_rate": 1.9338565022421527e-07, + "loss": 2.5357, + "step": 115515 + }, + { + "epoch": 7.848892512569643, + "grad_norm": 7.164365291595459, + "learning_rate": 1.9296100013588804e-07, + "loss": 2.8247, + "step": 115520 + }, + { + "epoch": 7.849232232640304, + "grad_norm": 8.216625213623047, + "learning_rate": 1.9253635004756084e-07, + "loss": 2.5059, + "step": 115525 + }, + { + "epoch": 7.849571952710966, + "grad_norm": 9.004756927490234, + "learning_rate": 1.921116999592336e-07, + "loss": 2.5178, + "step": 115530 + }, + { + "epoch": 7.849911672781628, + "grad_norm": 7.754687786102295, + "learning_rate": 1.916870498709064e-07, + "loss": 2.6807, + "step": 115535 + }, + { + "epoch": 7.8502513928522895, + "grad_norm": 7.37281608581543, + "learning_rate": 1.9126239978257917e-07, + "loss": 2.765, + "step": 115540 + }, + { + "epoch": 7.8505911129229515, + "grad_norm": 9.543425559997559, + "learning_rate": 1.9083774969425197e-07, + "loss": 2.8432, + "step": 115545 + }, + { + "epoch": 7.850930832993614, + "grad_norm": 7.721584320068359, + "learning_rate": 1.9041309960592474e-07, + "loss": 2.9066, + "step": 115550 + }, + { + "epoch": 7.851270553064275, + "grad_norm": 9.176070213317871, + "learning_rate": 1.899884495175975e-07, + "loss": 2.6047, + "step": 115555 + }, + { + "epoch": 7.851610273134937, + "grad_norm": 6.017458438873291, + "learning_rate": 1.895637994292703e-07, + "loss": 2.542, + "step": 115560 + }, + { + "epoch": 7.851949993205599, + "grad_norm": 6.787586688995361, + "learning_rate": 1.8913914934094306e-07, + "loss": 2.6773, + "step": 115565 + }, + { + "epoch": 7.85228971327626, + "grad_norm": 8.042539596557617, + "learning_rate": 1.8871449925261587e-07, + "loss": 2.666, + "step": 115570 + }, + { + "epoch": 7.852629433346922, + "grad_norm": 7.208558082580566, + "learning_rate": 1.8828984916428864e-07, + "loss": 3.0154, + "step": 115575 + }, + { + "epoch": 7.852969153417584, + "grad_norm": 6.779191017150879, + "learning_rate": 1.8786519907596144e-07, + "loss": 2.839, + "step": 115580 + }, + { + "epoch": 7.8533088734882455, + "grad_norm": 7.382901668548584, + "learning_rate": 1.874405489876342e-07, + "loss": 2.6171, + "step": 115585 + }, + { + "epoch": 7.8536485935589075, + "grad_norm": 8.467909812927246, + "learning_rate": 1.87015898899307e-07, + "loss": 2.6781, + "step": 115590 + }, + { + "epoch": 7.85398831362957, + "grad_norm": 6.135566711425781, + "learning_rate": 1.8659124881097976e-07, + "loss": 2.6838, + "step": 115595 + }, + { + "epoch": 7.854328033700231, + "grad_norm": 7.966237545013428, + "learning_rate": 1.8616659872265256e-07, + "loss": 2.7527, + "step": 115600 + }, + { + "epoch": 7.854667753770893, + "grad_norm": 6.956146240234375, + "learning_rate": 1.8574194863432534e-07, + "loss": 2.6402, + "step": 115605 + }, + { + "epoch": 7.855007473841555, + "grad_norm": 8.705581665039062, + "learning_rate": 1.8531729854599814e-07, + "loss": 2.6792, + "step": 115610 + }, + { + "epoch": 7.855347193912216, + "grad_norm": 7.348071098327637, + "learning_rate": 1.848926484576709e-07, + "loss": 2.7052, + "step": 115615 + }, + { + "epoch": 7.855686913982878, + "grad_norm": 6.968916893005371, + "learning_rate": 1.8446799836934366e-07, + "loss": 2.5202, + "step": 115620 + }, + { + "epoch": 7.85602663405354, + "grad_norm": 7.48214864730835, + "learning_rate": 1.8404334828101646e-07, + "loss": 2.7861, + "step": 115625 + }, + { + "epoch": 7.8563663541242015, + "grad_norm": 7.784917831420898, + "learning_rate": 1.8361869819268924e-07, + "loss": 2.6441, + "step": 115630 + }, + { + "epoch": 7.8567060741948636, + "grad_norm": 8.04670238494873, + "learning_rate": 1.8319404810436204e-07, + "loss": 2.6971, + "step": 115635 + }, + { + "epoch": 7.857045794265526, + "grad_norm": 6.351220607757568, + "learning_rate": 1.8276939801603479e-07, + "loss": 2.7504, + "step": 115640 + }, + { + "epoch": 7.857385514336187, + "grad_norm": 7.798670768737793, + "learning_rate": 1.8234474792770759e-07, + "loss": 2.4747, + "step": 115645 + }, + { + "epoch": 7.857725234406849, + "grad_norm": 8.101747512817383, + "learning_rate": 1.8192009783938036e-07, + "loss": 2.809, + "step": 115650 + }, + { + "epoch": 7.858064954477511, + "grad_norm": 7.939355850219727, + "learning_rate": 1.8149544775105316e-07, + "loss": 2.7014, + "step": 115655 + }, + { + "epoch": 7.858404674548172, + "grad_norm": 10.298752784729004, + "learning_rate": 1.8107079766272594e-07, + "loss": 2.625, + "step": 115660 + }, + { + "epoch": 7.858744394618834, + "grad_norm": 6.979417324066162, + "learning_rate": 1.8064614757439874e-07, + "loss": 2.6732, + "step": 115665 + }, + { + "epoch": 7.859084114689496, + "grad_norm": 7.695047855377197, + "learning_rate": 1.8022149748607148e-07, + "loss": 2.733, + "step": 115670 + }, + { + "epoch": 7.8594238347601575, + "grad_norm": 7.909720420837402, + "learning_rate": 1.7979684739774426e-07, + "loss": 2.6919, + "step": 115675 + }, + { + "epoch": 7.85976355483082, + "grad_norm": 7.495145797729492, + "learning_rate": 1.7937219730941706e-07, + "loss": 2.749, + "step": 115680 + }, + { + "epoch": 7.860103274901482, + "grad_norm": 8.05528736114502, + "learning_rate": 1.7894754722108983e-07, + "loss": 2.8421, + "step": 115685 + }, + { + "epoch": 7.860442994972143, + "grad_norm": 6.753636837005615, + "learning_rate": 1.7852289713276263e-07, + "loss": 2.7405, + "step": 115690 + }, + { + "epoch": 7.860782715042805, + "grad_norm": 6.312516212463379, + "learning_rate": 1.780982470444354e-07, + "loss": 2.4844, + "step": 115695 + }, + { + "epoch": 7.861122435113467, + "grad_norm": 7.935538291931152, + "learning_rate": 1.7767359695610816e-07, + "loss": 2.6989, + "step": 115700 + }, + { + "epoch": 7.861462155184128, + "grad_norm": 6.086683750152588, + "learning_rate": 1.7724894686778096e-07, + "loss": 2.5151, + "step": 115705 + }, + { + "epoch": 7.86180187525479, + "grad_norm": 6.999774932861328, + "learning_rate": 1.7682429677945373e-07, + "loss": 2.644, + "step": 115710 + }, + { + "epoch": 7.862141595325452, + "grad_norm": 8.326530456542969, + "learning_rate": 1.7639964669112653e-07, + "loss": 2.638, + "step": 115715 + }, + { + "epoch": 7.8624813153961135, + "grad_norm": 7.183421611785889, + "learning_rate": 1.759749966027993e-07, + "loss": 2.637, + "step": 115720 + }, + { + "epoch": 7.862821035466776, + "grad_norm": 8.755902290344238, + "learning_rate": 1.7555034651447208e-07, + "loss": 2.6147, + "step": 115725 + }, + { + "epoch": 7.863160755537437, + "grad_norm": 7.263387680053711, + "learning_rate": 1.7512569642614485e-07, + "loss": 2.7001, + "step": 115730 + }, + { + "epoch": 7.863500475608099, + "grad_norm": 7.301671028137207, + "learning_rate": 1.7470104633781766e-07, + "loss": 2.5756, + "step": 115735 + }, + { + "epoch": 7.863840195678761, + "grad_norm": 8.112516403198242, + "learning_rate": 1.7427639624949043e-07, + "loss": 2.6883, + "step": 115740 + }, + { + "epoch": 7.864179915749422, + "grad_norm": 7.449815273284912, + "learning_rate": 1.7385174616116323e-07, + "loss": 2.751, + "step": 115745 + }, + { + "epoch": 7.864519635820084, + "grad_norm": 7.654869079589844, + "learning_rate": 1.73427096072836e-07, + "loss": 2.5134, + "step": 115750 + }, + { + "epoch": 7.864859355890746, + "grad_norm": 9.434760093688965, + "learning_rate": 1.7300244598450875e-07, + "loss": 2.7902, + "step": 115755 + }, + { + "epoch": 7.865199075961407, + "grad_norm": 7.060345649719238, + "learning_rate": 1.7257779589618155e-07, + "loss": 2.8514, + "step": 115760 + }, + { + "epoch": 7.8655387960320695, + "grad_norm": 7.706247806549072, + "learning_rate": 1.7215314580785433e-07, + "loss": 2.915, + "step": 115765 + }, + { + "epoch": 7.865878516102732, + "grad_norm": 7.3343892097473145, + "learning_rate": 1.7172849571952713e-07, + "loss": 2.743, + "step": 115770 + }, + { + "epoch": 7.866218236173393, + "grad_norm": 8.42363166809082, + "learning_rate": 1.713038456311999e-07, + "loss": 2.6222, + "step": 115775 + }, + { + "epoch": 7.866557956244055, + "grad_norm": 8.933846473693848, + "learning_rate": 1.708791955428727e-07, + "loss": 2.7905, + "step": 115780 + }, + { + "epoch": 7.866897676314717, + "grad_norm": 7.237271785736084, + "learning_rate": 1.7045454545454545e-07, + "loss": 2.7234, + "step": 115785 + }, + { + "epoch": 7.867237396385378, + "grad_norm": 7.990185260772705, + "learning_rate": 1.7002989536621825e-07, + "loss": 2.8076, + "step": 115790 + }, + { + "epoch": 7.86757711645604, + "grad_norm": 9.726520538330078, + "learning_rate": 1.6960524527789103e-07, + "loss": 2.837, + "step": 115795 + }, + { + "epoch": 7.867916836526702, + "grad_norm": 7.215406894683838, + "learning_rate": 1.6918059518956383e-07, + "loss": 2.6801, + "step": 115800 + }, + { + "epoch": 7.8682565565973634, + "grad_norm": 7.746291160583496, + "learning_rate": 1.687559451012366e-07, + "loss": 2.7371, + "step": 115805 + }, + { + "epoch": 7.8685962766680255, + "grad_norm": 10.7564697265625, + "learning_rate": 1.6833129501290935e-07, + "loss": 2.8688, + "step": 115810 + }, + { + "epoch": 7.868935996738688, + "grad_norm": 8.409051895141602, + "learning_rate": 1.6790664492458215e-07, + "loss": 2.8609, + "step": 115815 + }, + { + "epoch": 7.869275716809349, + "grad_norm": 6.856148719787598, + "learning_rate": 1.6748199483625492e-07, + "loss": 2.8086, + "step": 115820 + }, + { + "epoch": 7.869615436880011, + "grad_norm": 6.373811721801758, + "learning_rate": 1.6705734474792773e-07, + "loss": 2.8645, + "step": 115825 + }, + { + "epoch": 7.869955156950673, + "grad_norm": 7.465092658996582, + "learning_rate": 1.666326946596005e-07, + "loss": 2.784, + "step": 115830 + }, + { + "epoch": 7.870294877021334, + "grad_norm": 8.925203323364258, + "learning_rate": 1.662080445712733e-07, + "loss": 2.5876, + "step": 115835 + }, + { + "epoch": 7.870634597091996, + "grad_norm": 6.842958450317383, + "learning_rate": 1.6578339448294605e-07, + "loss": 2.5807, + "step": 115840 + }, + { + "epoch": 7.870974317162658, + "grad_norm": 8.267416000366211, + "learning_rate": 1.6535874439461885e-07, + "loss": 2.6078, + "step": 115845 + }, + { + "epoch": 7.8713140372333195, + "grad_norm": 8.696955680847168, + "learning_rate": 1.6493409430629162e-07, + "loss": 2.5679, + "step": 115850 + }, + { + "epoch": 7.8716537573039815, + "grad_norm": 7.498216152191162, + "learning_rate": 1.6450944421796442e-07, + "loss": 2.6204, + "step": 115855 + }, + { + "epoch": 7.871993477374644, + "grad_norm": 7.7357940673828125, + "learning_rate": 1.640847941296372e-07, + "loss": 2.7524, + "step": 115860 + }, + { + "epoch": 7.872333197445305, + "grad_norm": 7.184012413024902, + "learning_rate": 1.6366014404131e-07, + "loss": 2.7976, + "step": 115865 + }, + { + "epoch": 7.872672917515967, + "grad_norm": 8.783666610717773, + "learning_rate": 1.6323549395298275e-07, + "loss": 2.6777, + "step": 115870 + }, + { + "epoch": 7.873012637586629, + "grad_norm": 6.916706085205078, + "learning_rate": 1.6281084386465552e-07, + "loss": 2.8161, + "step": 115875 + }, + { + "epoch": 7.87335235765729, + "grad_norm": 5.277015686035156, + "learning_rate": 1.6238619377632832e-07, + "loss": 2.8008, + "step": 115880 + }, + { + "epoch": 7.873692077727952, + "grad_norm": 9.463258743286133, + "learning_rate": 1.619615436880011e-07, + "loss": 2.7865, + "step": 115885 + }, + { + "epoch": 7.874031797798614, + "grad_norm": 7.54430627822876, + "learning_rate": 1.615368935996739e-07, + "loss": 2.6938, + "step": 115890 + }, + { + "epoch": 7.8743715178692755, + "grad_norm": 8.558435440063477, + "learning_rate": 1.6111224351134665e-07, + "loss": 2.5941, + "step": 115895 + }, + { + "epoch": 7.8747112379399375, + "grad_norm": 8.933341026306152, + "learning_rate": 1.6068759342301945e-07, + "loss": 2.6532, + "step": 115900 + }, + { + "epoch": 7.8750509580106, + "grad_norm": 7.20840311050415, + "learning_rate": 1.6026294333469222e-07, + "loss": 2.4976, + "step": 115905 + }, + { + "epoch": 7.875390678081261, + "grad_norm": 7.322312355041504, + "learning_rate": 1.5983829324636502e-07, + "loss": 2.8212, + "step": 115910 + }, + { + "epoch": 7.875730398151923, + "grad_norm": 8.47230052947998, + "learning_rate": 1.594136431580378e-07, + "loss": 2.7673, + "step": 115915 + }, + { + "epoch": 7.876070118222585, + "grad_norm": 7.128031253814697, + "learning_rate": 1.589889930697106e-07, + "loss": 2.6155, + "step": 115920 + }, + { + "epoch": 7.876409838293246, + "grad_norm": 7.4002685546875, + "learning_rate": 1.5856434298138334e-07, + "loss": 2.9053, + "step": 115925 + }, + { + "epoch": 7.876749558363908, + "grad_norm": 8.343955993652344, + "learning_rate": 1.5813969289305612e-07, + "loss": 2.8258, + "step": 115930 + }, + { + "epoch": 7.87708927843457, + "grad_norm": 8.631684303283691, + "learning_rate": 1.5771504280472892e-07, + "loss": 2.333, + "step": 115935 + }, + { + "epoch": 7.8774289985052315, + "grad_norm": 9.22761344909668, + "learning_rate": 1.572903927164017e-07, + "loss": 2.453, + "step": 115940 + }, + { + "epoch": 7.877768718575894, + "grad_norm": 6.06572961807251, + "learning_rate": 1.568657426280745e-07, + "loss": 2.6944, + "step": 115945 + }, + { + "epoch": 7.878108438646555, + "grad_norm": 7.36019229888916, + "learning_rate": 1.5644109253974727e-07, + "loss": 2.6179, + "step": 115950 + }, + { + "epoch": 7.878448158717217, + "grad_norm": 9.318496704101562, + "learning_rate": 1.5601644245142004e-07, + "loss": 2.6558, + "step": 115955 + }, + { + "epoch": 7.878787878787879, + "grad_norm": 8.230361938476562, + "learning_rate": 1.5559179236309282e-07, + "loss": 2.7325, + "step": 115960 + }, + { + "epoch": 7.87912759885854, + "grad_norm": 6.818978309631348, + "learning_rate": 1.5516714227476562e-07, + "loss": 2.5936, + "step": 115965 + }, + { + "epoch": 7.879467318929202, + "grad_norm": 7.462869644165039, + "learning_rate": 1.547424921864384e-07, + "loss": 2.6322, + "step": 115970 + }, + { + "epoch": 7.879807038999864, + "grad_norm": 10.214398384094238, + "learning_rate": 1.5431784209811117e-07, + "loss": 2.7625, + "step": 115975 + }, + { + "epoch": 7.880146759070525, + "grad_norm": 8.073953628540039, + "learning_rate": 1.5389319200978394e-07, + "loss": 2.8489, + "step": 115980 + }, + { + "epoch": 7.8804864791411875, + "grad_norm": 7.293981552124023, + "learning_rate": 1.5346854192145674e-07, + "loss": 2.6238, + "step": 115985 + }, + { + "epoch": 7.88082619921185, + "grad_norm": 11.104315757751465, + "learning_rate": 1.5304389183312952e-07, + "loss": 2.734, + "step": 115990 + }, + { + "epoch": 7.881165919282511, + "grad_norm": 9.91050910949707, + "learning_rate": 1.526192417448023e-07, + "loss": 2.6281, + "step": 115995 + }, + { + "epoch": 7.881505639353173, + "grad_norm": 10.614744186401367, + "learning_rate": 1.521945916564751e-07, + "loss": 2.7298, + "step": 116000 + }, + { + "epoch": 7.881845359423835, + "grad_norm": 9.119718551635742, + "learning_rate": 1.5176994156814787e-07, + "loss": 2.5702, + "step": 116005 + }, + { + "epoch": 7.882185079494496, + "grad_norm": 7.7199578285217285, + "learning_rate": 1.5134529147982064e-07, + "loss": 2.6827, + "step": 116010 + }, + { + "epoch": 7.882524799565158, + "grad_norm": 9.685178756713867, + "learning_rate": 1.5092064139149341e-07, + "loss": 2.8694, + "step": 116015 + }, + { + "epoch": 7.88286451963582, + "grad_norm": 7.718935489654541, + "learning_rate": 1.5049599130316621e-07, + "loss": 2.7851, + "step": 116020 + }, + { + "epoch": 7.883204239706481, + "grad_norm": 9.033916473388672, + "learning_rate": 1.50071341214839e-07, + "loss": 2.7647, + "step": 116025 + }, + { + "epoch": 7.8835439597771435, + "grad_norm": 8.677457809448242, + "learning_rate": 1.4964669112651176e-07, + "loss": 2.8824, + "step": 116030 + }, + { + "epoch": 7.883883679847806, + "grad_norm": 6.411273002624512, + "learning_rate": 1.4922204103818454e-07, + "loss": 2.7054, + "step": 116035 + }, + { + "epoch": 7.884223399918467, + "grad_norm": 6.926469326019287, + "learning_rate": 1.4879739094985734e-07, + "loss": 2.7667, + "step": 116040 + }, + { + "epoch": 7.884563119989129, + "grad_norm": 8.14848804473877, + "learning_rate": 1.483727408615301e-07, + "loss": 2.4657, + "step": 116045 + }, + { + "epoch": 7.884902840059791, + "grad_norm": 9.245027542114258, + "learning_rate": 1.4794809077320289e-07, + "loss": 2.8225, + "step": 116050 + }, + { + "epoch": 7.885242560130452, + "grad_norm": 6.931942939758301, + "learning_rate": 1.475234406848757e-07, + "loss": 2.8448, + "step": 116055 + }, + { + "epoch": 7.885582280201114, + "grad_norm": 7.3820600509643555, + "learning_rate": 1.4709879059654846e-07, + "loss": 2.6988, + "step": 116060 + }, + { + "epoch": 7.885922000271776, + "grad_norm": 7.799426078796387, + "learning_rate": 1.4667414050822124e-07, + "loss": 2.7907, + "step": 116065 + }, + { + "epoch": 7.886261720342437, + "grad_norm": 6.822810649871826, + "learning_rate": 1.4624949041989404e-07, + "loss": 2.7942, + "step": 116070 + }, + { + "epoch": 7.8866014404130995, + "grad_norm": 7.058633327484131, + "learning_rate": 1.4582484033156678e-07, + "loss": 2.4789, + "step": 116075 + }, + { + "epoch": 7.886941160483762, + "grad_norm": 10.437260627746582, + "learning_rate": 1.4540019024323959e-07, + "loss": 2.6803, + "step": 116080 + }, + { + "epoch": 7.887280880554423, + "grad_norm": 8.376620292663574, + "learning_rate": 1.4497554015491236e-07, + "loss": 2.7269, + "step": 116085 + }, + { + "epoch": 7.887620600625085, + "grad_norm": 9.144288063049316, + "learning_rate": 1.4455089006658513e-07, + "loss": 2.6893, + "step": 116090 + }, + { + "epoch": 7.887960320695747, + "grad_norm": 7.547861099243164, + "learning_rate": 1.4412623997825793e-07, + "loss": 2.7187, + "step": 116095 + }, + { + "epoch": 7.888300040766408, + "grad_norm": 9.777499198913574, + "learning_rate": 1.437015898899307e-07, + "loss": 2.8216, + "step": 116100 + }, + { + "epoch": 7.88863976083707, + "grad_norm": 7.6278204917907715, + "learning_rate": 1.4327693980160348e-07, + "loss": 2.6514, + "step": 116105 + }, + { + "epoch": 7.888979480907732, + "grad_norm": 7.686793327331543, + "learning_rate": 1.4285228971327628e-07, + "loss": 2.5124, + "step": 116110 + }, + { + "epoch": 7.8893192009783935, + "grad_norm": 8.594939231872559, + "learning_rate": 1.4242763962494906e-07, + "loss": 2.7973, + "step": 116115 + }, + { + "epoch": 7.8896589210490555, + "grad_norm": 8.69112491607666, + "learning_rate": 1.4200298953662183e-07, + "loss": 2.5393, + "step": 116120 + }, + { + "epoch": 7.889998641119718, + "grad_norm": 7.736634731292725, + "learning_rate": 1.4157833944829463e-07, + "loss": 2.6761, + "step": 116125 + }, + { + "epoch": 7.890338361190379, + "grad_norm": 7.31415319442749, + "learning_rate": 1.4115368935996738e-07, + "loss": 2.8052, + "step": 116130 + }, + { + "epoch": 7.890678081261041, + "grad_norm": 9.636890411376953, + "learning_rate": 1.4072903927164018e-07, + "loss": 2.7778, + "step": 116135 + }, + { + "epoch": 7.891017801331703, + "grad_norm": 8.516348838806152, + "learning_rate": 1.4030438918331296e-07, + "loss": 2.514, + "step": 116140 + }, + { + "epoch": 7.891357521402364, + "grad_norm": 7.257963180541992, + "learning_rate": 1.3987973909498573e-07, + "loss": 2.4247, + "step": 116145 + }, + { + "epoch": 7.891697241473026, + "grad_norm": 10.115699768066406, + "learning_rate": 1.3945508900665853e-07, + "loss": 2.7063, + "step": 116150 + }, + { + "epoch": 7.892036961543688, + "grad_norm": 6.752028465270996, + "learning_rate": 1.390304389183313e-07, + "loss": 2.5629, + "step": 116155 + }, + { + "epoch": 7.8923766816143495, + "grad_norm": 8.294439315795898, + "learning_rate": 1.3869071884766954e-07, + "loss": 2.6987, + "step": 116160 + }, + { + "epoch": 7.8927164016850115, + "grad_norm": 8.195055961608887, + "learning_rate": 1.382660687593423e-07, + "loss": 2.7892, + "step": 116165 + }, + { + "epoch": 7.893056121755674, + "grad_norm": 7.066113471984863, + "learning_rate": 1.3784141867101508e-07, + "loss": 2.7307, + "step": 116170 + }, + { + "epoch": 7.893395841826335, + "grad_norm": 6.841455459594727, + "learning_rate": 1.3741676858268789e-07, + "loss": 2.7003, + "step": 116175 + }, + { + "epoch": 7.893735561896997, + "grad_norm": 8.369041442871094, + "learning_rate": 1.3699211849436066e-07, + "loss": 2.7311, + "step": 116180 + }, + { + "epoch": 7.894075281967659, + "grad_norm": 8.620226860046387, + "learning_rate": 1.3656746840603343e-07, + "loss": 2.9544, + "step": 116185 + }, + { + "epoch": 7.89441500203832, + "grad_norm": 7.468212604522705, + "learning_rate": 1.3614281831770623e-07, + "loss": 2.7687, + "step": 116190 + }, + { + "epoch": 7.894754722108982, + "grad_norm": 9.162482261657715, + "learning_rate": 1.35718168229379e-07, + "loss": 2.6776, + "step": 116195 + }, + { + "epoch": 7.895094442179644, + "grad_norm": 8.197978973388672, + "learning_rate": 1.3529351814105178e-07, + "loss": 2.7006, + "step": 116200 + }, + { + "epoch": 7.8954341622503055, + "grad_norm": 8.78309440612793, + "learning_rate": 1.3486886805272456e-07, + "loss": 2.8378, + "step": 116205 + }, + { + "epoch": 7.8957738823209676, + "grad_norm": 7.70646333694458, + "learning_rate": 1.3444421796439733e-07, + "loss": 2.5412, + "step": 116210 + }, + { + "epoch": 7.89611360239163, + "grad_norm": 6.893405914306641, + "learning_rate": 1.3401956787607013e-07, + "loss": 2.7033, + "step": 116215 + }, + { + "epoch": 7.896453322462291, + "grad_norm": 7.124172210693359, + "learning_rate": 1.335949177877429e-07, + "loss": 2.6222, + "step": 116220 + }, + { + "epoch": 7.896793042532953, + "grad_norm": 7.344628810882568, + "learning_rate": 1.3317026769941568e-07, + "loss": 2.8077, + "step": 116225 + }, + { + "epoch": 7.897132762603615, + "grad_norm": 9.43408489227295, + "learning_rate": 1.3274561761108848e-07, + "loss": 2.6299, + "step": 116230 + }, + { + "epoch": 7.897472482674276, + "grad_norm": 8.062580108642578, + "learning_rate": 1.3232096752276126e-07, + "loss": 2.5512, + "step": 116235 + }, + { + "epoch": 7.897812202744938, + "grad_norm": 7.884898662567139, + "learning_rate": 1.3189631743443403e-07, + "loss": 2.5957, + "step": 116240 + }, + { + "epoch": 7.8981519228156, + "grad_norm": 8.268927574157715, + "learning_rate": 1.3147166734610683e-07, + "loss": 2.7775, + "step": 116245 + }, + { + "epoch": 7.8984916428862615, + "grad_norm": 7.876834869384766, + "learning_rate": 1.310470172577796e-07, + "loss": 2.6472, + "step": 116250 + }, + { + "epoch": 7.898831362956924, + "grad_norm": 8.512510299682617, + "learning_rate": 1.3062236716945238e-07, + "loss": 2.5001, + "step": 116255 + }, + { + "epoch": 7.899171083027586, + "grad_norm": 7.070157051086426, + "learning_rate": 1.3019771708112518e-07, + "loss": 2.5152, + "step": 116260 + }, + { + "epoch": 7.899510803098247, + "grad_norm": 7.634885311126709, + "learning_rate": 1.2977306699279793e-07, + "loss": 2.8407, + "step": 116265 + }, + { + "epoch": 7.899850523168909, + "grad_norm": 9.092555046081543, + "learning_rate": 1.2934841690447073e-07, + "loss": 2.9064, + "step": 116270 + }, + { + "epoch": 7.900190243239571, + "grad_norm": 10.711078643798828, + "learning_rate": 1.289237668161435e-07, + "loss": 2.6779, + "step": 116275 + }, + { + "epoch": 7.900529963310232, + "grad_norm": 8.760250091552734, + "learning_rate": 1.2849911672781628e-07, + "loss": 2.7113, + "step": 116280 + }, + { + "epoch": 7.900869683380894, + "grad_norm": 11.024126052856445, + "learning_rate": 1.2807446663948908e-07, + "loss": 2.6732, + "step": 116285 + }, + { + "epoch": 7.901209403451556, + "grad_norm": 8.098424911499023, + "learning_rate": 1.2764981655116185e-07, + "loss": 2.8267, + "step": 116290 + }, + { + "epoch": 7.9015491235222175, + "grad_norm": 8.689288139343262, + "learning_rate": 1.2722516646283463e-07, + "loss": 2.7484, + "step": 116295 + }, + { + "epoch": 7.90188884359288, + "grad_norm": 8.65774917602539, + "learning_rate": 1.2680051637450743e-07, + "loss": 2.6641, + "step": 116300 + }, + { + "epoch": 7.902228563663542, + "grad_norm": 6.177457332611084, + "learning_rate": 1.263758662861802e-07, + "loss": 2.5599, + "step": 116305 + }, + { + "epoch": 7.902568283734203, + "grad_norm": 9.707437515258789, + "learning_rate": 1.2595121619785298e-07, + "loss": 2.6865, + "step": 116310 + }, + { + "epoch": 7.902908003804865, + "grad_norm": 7.930802345275879, + "learning_rate": 1.2552656610952578e-07, + "loss": 2.6884, + "step": 116315 + }, + { + "epoch": 7.903247723875527, + "grad_norm": 7.54844856262207, + "learning_rate": 1.2510191602119853e-07, + "loss": 2.8531, + "step": 116320 + }, + { + "epoch": 7.903587443946188, + "grad_norm": 8.319564819335938, + "learning_rate": 1.2467726593287133e-07, + "loss": 2.6625, + "step": 116325 + }, + { + "epoch": 7.90392716401685, + "grad_norm": 5.482362270355225, + "learning_rate": 1.242526158445441e-07, + "loss": 2.3834, + "step": 116330 + }, + { + "epoch": 7.904266884087512, + "grad_norm": 7.825061798095703, + "learning_rate": 1.2382796575621687e-07, + "loss": 2.8034, + "step": 116335 + }, + { + "epoch": 7.9046066041581735, + "grad_norm": 8.227855682373047, + "learning_rate": 1.2340331566788968e-07, + "loss": 2.6637, + "step": 116340 + }, + { + "epoch": 7.904946324228836, + "grad_norm": 7.304523944854736, + "learning_rate": 1.2297866557956245e-07, + "loss": 2.5567, + "step": 116345 + }, + { + "epoch": 7.905286044299498, + "grad_norm": 8.664576530456543, + "learning_rate": 1.2255401549123522e-07, + "loss": 2.9678, + "step": 116350 + }, + { + "epoch": 7.905625764370159, + "grad_norm": 9.080806732177734, + "learning_rate": 1.2212936540290802e-07, + "loss": 2.7015, + "step": 116355 + }, + { + "epoch": 7.905965484440821, + "grad_norm": 5.97186803817749, + "learning_rate": 1.217047153145808e-07, + "loss": 2.5831, + "step": 116360 + }, + { + "epoch": 7.906305204511483, + "grad_norm": 8.45978832244873, + "learning_rate": 1.2128006522625357e-07, + "loss": 2.8901, + "step": 116365 + }, + { + "epoch": 7.906644924582144, + "grad_norm": 7.036725997924805, + "learning_rate": 1.2085541513792637e-07, + "loss": 2.8366, + "step": 116370 + }, + { + "epoch": 7.906984644652806, + "grad_norm": 7.769842624664307, + "learning_rate": 1.2043076504959912e-07, + "loss": 2.4353, + "step": 116375 + }, + { + "epoch": 7.907324364723468, + "grad_norm": 6.308892250061035, + "learning_rate": 1.2000611496127192e-07, + "loss": 2.5832, + "step": 116380 + }, + { + "epoch": 7.9076640847941295, + "grad_norm": 9.732503890991211, + "learning_rate": 1.195814648729447e-07, + "loss": 2.6992, + "step": 116385 + }, + { + "epoch": 7.908003804864792, + "grad_norm": 7.810967445373535, + "learning_rate": 1.1915681478461747e-07, + "loss": 2.7973, + "step": 116390 + }, + { + "epoch": 7.908343524935454, + "grad_norm": 6.9793782234191895, + "learning_rate": 1.1873216469629026e-07, + "loss": 2.5933, + "step": 116395 + }, + { + "epoch": 7.908683245006115, + "grad_norm": 7.654290199279785, + "learning_rate": 1.1830751460796306e-07, + "loss": 2.5869, + "step": 116400 + }, + { + "epoch": 7.909022965076777, + "grad_norm": 8.005520820617676, + "learning_rate": 1.1788286451963582e-07, + "loss": 2.6563, + "step": 116405 + }, + { + "epoch": 7.909362685147439, + "grad_norm": 9.328704833984375, + "learning_rate": 1.1745821443130861e-07, + "loss": 2.7028, + "step": 116410 + }, + { + "epoch": 7.9097024052181, + "grad_norm": 6.646037578582764, + "learning_rate": 1.170335643429814e-07, + "loss": 2.7132, + "step": 116415 + }, + { + "epoch": 7.910042125288762, + "grad_norm": 8.53167724609375, + "learning_rate": 1.1660891425465417e-07, + "loss": 2.7452, + "step": 116420 + }, + { + "epoch": 7.9103818453594235, + "grad_norm": 10.10449504852295, + "learning_rate": 1.1618426416632696e-07, + "loss": 2.7996, + "step": 116425 + }, + { + "epoch": 7.9107215654300855, + "grad_norm": 9.426116943359375, + "learning_rate": 1.1575961407799975e-07, + "loss": 2.9541, + "step": 116430 + }, + { + "epoch": 7.911061285500748, + "grad_norm": 8.614806175231934, + "learning_rate": 1.1533496398967252e-07, + "loss": 2.7248, + "step": 116435 + }, + { + "epoch": 7.911401005571409, + "grad_norm": 6.477717876434326, + "learning_rate": 1.1491031390134531e-07, + "loss": 2.7232, + "step": 116440 + }, + { + "epoch": 7.911740725642071, + "grad_norm": 7.781088829040527, + "learning_rate": 1.144856638130181e-07, + "loss": 2.6193, + "step": 116445 + }, + { + "epoch": 7.912080445712733, + "grad_norm": 7.522253513336182, + "learning_rate": 1.1406101372469086e-07, + "loss": 2.7342, + "step": 116450 + }, + { + "epoch": 7.912420165783394, + "grad_norm": 8.649595260620117, + "learning_rate": 1.1363636363636364e-07, + "loss": 2.797, + "step": 116455 + }, + { + "epoch": 7.912759885854056, + "grad_norm": 7.715295314788818, + "learning_rate": 1.1321171354803642e-07, + "loss": 2.8522, + "step": 116460 + }, + { + "epoch": 7.913099605924718, + "grad_norm": 7.102340221405029, + "learning_rate": 1.127870634597092e-07, + "loss": 2.6046, + "step": 116465 + }, + { + "epoch": 7.9134393259953795, + "grad_norm": 10.582941055297852, + "learning_rate": 1.1236241337138199e-07, + "loss": 2.6264, + "step": 116470 + }, + { + "epoch": 7.9137790460660415, + "grad_norm": 7.025728702545166, + "learning_rate": 1.1193776328305477e-07, + "loss": 3.0005, + "step": 116475 + }, + { + "epoch": 7.914118766136704, + "grad_norm": 6.857227802276611, + "learning_rate": 1.1151311319472755e-07, + "loss": 2.6331, + "step": 116480 + }, + { + "epoch": 7.914458486207365, + "grad_norm": 8.522265434265137, + "learning_rate": 1.1108846310640034e-07, + "loss": 2.8421, + "step": 116485 + }, + { + "epoch": 7.914798206278027, + "grad_norm": 9.03812313079834, + "learning_rate": 1.1066381301807312e-07, + "loss": 2.7003, + "step": 116490 + }, + { + "epoch": 7.915137926348689, + "grad_norm": 6.75177001953125, + "learning_rate": 1.102391629297459e-07, + "loss": 2.5273, + "step": 116495 + }, + { + "epoch": 7.91547764641935, + "grad_norm": 10.71406078338623, + "learning_rate": 1.0981451284141869e-07, + "loss": 2.7146, + "step": 116500 + }, + { + "epoch": 7.915817366490012, + "grad_norm": 8.750303268432617, + "learning_rate": 1.0938986275309145e-07, + "loss": 2.6059, + "step": 116505 + }, + { + "epoch": 7.916157086560674, + "grad_norm": 6.644298553466797, + "learning_rate": 1.0896521266476424e-07, + "loss": 2.669, + "step": 116510 + }, + { + "epoch": 7.9164968066313355, + "grad_norm": 7.486092567443848, + "learning_rate": 1.0854056257643703e-07, + "loss": 2.7851, + "step": 116515 + }, + { + "epoch": 7.916836526701998, + "grad_norm": 8.600067138671875, + "learning_rate": 1.081159124881098e-07, + "loss": 2.7263, + "step": 116520 + }, + { + "epoch": 7.91717624677266, + "grad_norm": 9.363960266113281, + "learning_rate": 1.0769126239978259e-07, + "loss": 2.7055, + "step": 116525 + }, + { + "epoch": 7.917515966843321, + "grad_norm": 8.314468383789062, + "learning_rate": 1.0726661231145538e-07, + "loss": 2.7244, + "step": 116530 + }, + { + "epoch": 7.917855686913983, + "grad_norm": 8.006844520568848, + "learning_rate": 1.0684196222312815e-07, + "loss": 2.8588, + "step": 116535 + }, + { + "epoch": 7.918195406984645, + "grad_norm": 7.805295467376709, + "learning_rate": 1.0641731213480094e-07, + "loss": 2.7999, + "step": 116540 + }, + { + "epoch": 7.918535127055306, + "grad_norm": 7.098155975341797, + "learning_rate": 1.0599266204647371e-07, + "loss": 2.8013, + "step": 116545 + }, + { + "epoch": 7.918874847125968, + "grad_norm": 6.466693878173828, + "learning_rate": 1.055680119581465e-07, + "loss": 2.5701, + "step": 116550 + }, + { + "epoch": 7.91921456719663, + "grad_norm": 7.430341720581055, + "learning_rate": 1.0514336186981929e-07, + "loss": 2.7395, + "step": 116555 + }, + { + "epoch": 7.9195542872672915, + "grad_norm": 8.767889022827148, + "learning_rate": 1.0471871178149205e-07, + "loss": 2.7456, + "step": 116560 + }, + { + "epoch": 7.919894007337954, + "grad_norm": 8.651026725769043, + "learning_rate": 1.0429406169316484e-07, + "loss": 2.7138, + "step": 116565 + }, + { + "epoch": 7.920233727408616, + "grad_norm": 10.561858177185059, + "learning_rate": 1.0386941160483762e-07, + "loss": 2.6644, + "step": 116570 + }, + { + "epoch": 7.920573447479277, + "grad_norm": 7.171655654907227, + "learning_rate": 1.034447615165104e-07, + "loss": 2.8101, + "step": 116575 + }, + { + "epoch": 7.920913167549939, + "grad_norm": 8.612869262695312, + "learning_rate": 1.0302011142818319e-07, + "loss": 2.5998, + "step": 116580 + }, + { + "epoch": 7.921252887620601, + "grad_norm": 9.426855087280273, + "learning_rate": 1.0259546133985597e-07, + "loss": 2.7206, + "step": 116585 + }, + { + "epoch": 7.921592607691262, + "grad_norm": 10.841236114501953, + "learning_rate": 1.0217081125152875e-07, + "loss": 2.6372, + "step": 116590 + }, + { + "epoch": 7.921932327761924, + "grad_norm": 7.289388179779053, + "learning_rate": 1.0174616116320154e-07, + "loss": 2.6555, + "step": 116595 + }, + { + "epoch": 7.922272047832586, + "grad_norm": 6.142651081085205, + "learning_rate": 1.0132151107487432e-07, + "loss": 2.69, + "step": 116600 + }, + { + "epoch": 7.9226117679032475, + "grad_norm": 6.151421546936035, + "learning_rate": 1.0089686098654708e-07, + "loss": 2.6736, + "step": 116605 + }, + { + "epoch": 7.92295148797391, + "grad_norm": 8.089503288269043, + "learning_rate": 1.0047221089821987e-07, + "loss": 2.509, + "step": 116610 + }, + { + "epoch": 7.923291208044572, + "grad_norm": 8.484230041503906, + "learning_rate": 1.0004756080989266e-07, + "loss": 2.7588, + "step": 116615 + }, + { + "epoch": 7.923630928115233, + "grad_norm": 8.293414115905762, + "learning_rate": 9.962291072156543e-08, + "loss": 2.7699, + "step": 116620 + }, + { + "epoch": 7.923970648185895, + "grad_norm": 8.900044441223145, + "learning_rate": 9.919826063323822e-08, + "loss": 2.6758, + "step": 116625 + }, + { + "epoch": 7.924310368256556, + "grad_norm": 7.023808002471924, + "learning_rate": 9.8773610544911e-08, + "loss": 2.5229, + "step": 116630 + }, + { + "epoch": 7.924650088327218, + "grad_norm": 9.610682487487793, + "learning_rate": 9.834896045658378e-08, + "loss": 2.8216, + "step": 116635 + }, + { + "epoch": 7.92498980839788, + "grad_norm": 8.51916790008545, + "learning_rate": 9.792431036825657e-08, + "loss": 2.9316, + "step": 116640 + }, + { + "epoch": 7.925329528468541, + "grad_norm": 7.967522144317627, + "learning_rate": 9.749966027992934e-08, + "loss": 2.8703, + "step": 116645 + }, + { + "epoch": 7.9256692485392035, + "grad_norm": 7.927243709564209, + "learning_rate": 9.707501019160213e-08, + "loss": 2.8423, + "step": 116650 + }, + { + "epoch": 7.926008968609866, + "grad_norm": 8.922433853149414, + "learning_rate": 9.665036010327492e-08, + "loss": 2.9355, + "step": 116655 + }, + { + "epoch": 7.926348688680527, + "grad_norm": 8.979065895080566, + "learning_rate": 9.622571001494768e-08, + "loss": 2.7371, + "step": 116660 + }, + { + "epoch": 7.926688408751189, + "grad_norm": 6.839495658874512, + "learning_rate": 9.580105992662047e-08, + "loss": 2.8798, + "step": 116665 + }, + { + "epoch": 7.927028128821851, + "grad_norm": 6.5770955085754395, + "learning_rate": 9.537640983829326e-08, + "loss": 2.7489, + "step": 116670 + }, + { + "epoch": 7.927367848892512, + "grad_norm": 7.789799690246582, + "learning_rate": 9.495175974996603e-08, + "loss": 2.8039, + "step": 116675 + }, + { + "epoch": 7.927707568963174, + "grad_norm": 9.547961235046387, + "learning_rate": 9.452710966163882e-08, + "loss": 2.6328, + "step": 116680 + }, + { + "epoch": 7.928047289033836, + "grad_norm": 6.873141288757324, + "learning_rate": 9.41024595733116e-08, + "loss": 2.6886, + "step": 116685 + }, + { + "epoch": 7.9283870091044975, + "grad_norm": 7.870901107788086, + "learning_rate": 9.367780948498438e-08, + "loss": 2.8159, + "step": 116690 + }, + { + "epoch": 7.9287267291751595, + "grad_norm": 7.1297454833984375, + "learning_rate": 9.325315939665717e-08, + "loss": 2.5717, + "step": 116695 + }, + { + "epoch": 7.929066449245822, + "grad_norm": 9.886283874511719, + "learning_rate": 9.282850930832994e-08, + "loss": 2.7232, + "step": 116700 + }, + { + "epoch": 7.929406169316483, + "grad_norm": 8.786969184875488, + "learning_rate": 9.240385922000273e-08, + "loss": 2.6356, + "step": 116705 + }, + { + "epoch": 7.929745889387145, + "grad_norm": 8.696758270263672, + "learning_rate": 9.197920913167552e-08, + "loss": 2.7834, + "step": 116710 + }, + { + "epoch": 7.930085609457807, + "grad_norm": 9.201141357421875, + "learning_rate": 9.155455904334828e-08, + "loss": 2.8973, + "step": 116715 + }, + { + "epoch": 7.930425329528468, + "grad_norm": 7.9111456871032715, + "learning_rate": 9.112990895502107e-08, + "loss": 2.5235, + "step": 116720 + }, + { + "epoch": 7.93076504959913, + "grad_norm": 7.848461151123047, + "learning_rate": 9.070525886669385e-08, + "loss": 2.9815, + "step": 116725 + }, + { + "epoch": 7.931104769669792, + "grad_norm": 8.16352653503418, + "learning_rate": 9.028060877836663e-08, + "loss": 2.7823, + "step": 116730 + }, + { + "epoch": 7.9314444897404535, + "grad_norm": 9.49522590637207, + "learning_rate": 8.985595869003941e-08, + "loss": 2.5589, + "step": 116735 + }, + { + "epoch": 7.9317842098111155, + "grad_norm": 7.082513809204102, + "learning_rate": 8.94313086017122e-08, + "loss": 2.5149, + "step": 116740 + }, + { + "epoch": 7.932123929881778, + "grad_norm": 7.7790398597717285, + "learning_rate": 8.900665851338498e-08, + "loss": 2.6828, + "step": 116745 + }, + { + "epoch": 7.932463649952439, + "grad_norm": 8.418588638305664, + "learning_rate": 8.858200842505776e-08, + "loss": 2.6077, + "step": 116750 + }, + { + "epoch": 7.932803370023101, + "grad_norm": 8.414456367492676, + "learning_rate": 8.815735833673055e-08, + "loss": 2.6907, + "step": 116755 + }, + { + "epoch": 7.933143090093763, + "grad_norm": 8.348052978515625, + "learning_rate": 8.773270824840331e-08, + "loss": 2.68, + "step": 116760 + }, + { + "epoch": 7.933482810164424, + "grad_norm": 7.295603275299072, + "learning_rate": 8.73080581600761e-08, + "loss": 2.7631, + "step": 116765 + }, + { + "epoch": 7.933822530235086, + "grad_norm": 8.347474098205566, + "learning_rate": 8.688340807174889e-08, + "loss": 3.0154, + "step": 116770 + }, + { + "epoch": 7.934162250305748, + "grad_norm": 8.134578704833984, + "learning_rate": 8.645875798342166e-08, + "loss": 2.8046, + "step": 116775 + }, + { + "epoch": 7.9345019703764095, + "grad_norm": 10.149898529052734, + "learning_rate": 8.603410789509445e-08, + "loss": 2.4339, + "step": 116780 + }, + { + "epoch": 7.9348416904470715, + "grad_norm": 8.826155662536621, + "learning_rate": 8.560945780676722e-08, + "loss": 2.9154, + "step": 116785 + }, + { + "epoch": 7.935181410517734, + "grad_norm": 8.378748893737793, + "learning_rate": 8.518480771844001e-08, + "loss": 2.6341, + "step": 116790 + }, + { + "epoch": 7.935521130588395, + "grad_norm": 7.874111652374268, + "learning_rate": 8.47601576301128e-08, + "loss": 2.7184, + "step": 116795 + }, + { + "epoch": 7.935860850659057, + "grad_norm": 7.893584251403809, + "learning_rate": 8.433550754178557e-08, + "loss": 2.6767, + "step": 116800 + }, + { + "epoch": 7.936200570729719, + "grad_norm": 10.22302532196045, + "learning_rate": 8.391085745345836e-08, + "loss": 2.7569, + "step": 116805 + }, + { + "epoch": 7.93654029080038, + "grad_norm": 7.502962112426758, + "learning_rate": 8.348620736513115e-08, + "loss": 2.6866, + "step": 116810 + }, + { + "epoch": 7.936880010871042, + "grad_norm": 7.883656024932861, + "learning_rate": 8.306155727680391e-08, + "loss": 2.8211, + "step": 116815 + }, + { + "epoch": 7.937219730941704, + "grad_norm": 6.120839595794678, + "learning_rate": 8.26369071884767e-08, + "loss": 2.5636, + "step": 116820 + }, + { + "epoch": 7.9375594510123655, + "grad_norm": 8.472553253173828, + "learning_rate": 8.221225710014948e-08, + "loss": 2.6981, + "step": 116825 + }, + { + "epoch": 7.937899171083028, + "grad_norm": 7.013267993927002, + "learning_rate": 8.178760701182226e-08, + "loss": 2.7116, + "step": 116830 + }, + { + "epoch": 7.93823889115369, + "grad_norm": 7.932810306549072, + "learning_rate": 8.136295692349505e-08, + "loss": 2.966, + "step": 116835 + }, + { + "epoch": 7.938578611224351, + "grad_norm": 7.680690288543701, + "learning_rate": 8.093830683516783e-08, + "loss": 2.5083, + "step": 116840 + }, + { + "epoch": 7.938918331295013, + "grad_norm": 7.294590950012207, + "learning_rate": 8.051365674684061e-08, + "loss": 2.7808, + "step": 116845 + }, + { + "epoch": 7.939258051365675, + "grad_norm": 9.601723670959473, + "learning_rate": 8.00890066585134e-08, + "loss": 2.6897, + "step": 116850 + }, + { + "epoch": 7.939597771436336, + "grad_norm": 7.062492847442627, + "learning_rate": 7.966435657018618e-08, + "loss": 2.5199, + "step": 116855 + }, + { + "epoch": 7.939937491506998, + "grad_norm": 6.556288719177246, + "learning_rate": 7.923970648185896e-08, + "loss": 2.7934, + "step": 116860 + }, + { + "epoch": 7.94027721157766, + "grad_norm": 10.145628929138184, + "learning_rate": 7.881505639353174e-08, + "loss": 2.6884, + "step": 116865 + }, + { + "epoch": 7.9406169316483215, + "grad_norm": 8.602707862854004, + "learning_rate": 7.83904063052045e-08, + "loss": 2.69, + "step": 116870 + }, + { + "epoch": 7.940956651718984, + "grad_norm": 9.11845588684082, + "learning_rate": 7.79657562168773e-08, + "loss": 2.8171, + "step": 116875 + }, + { + "epoch": 7.941296371789646, + "grad_norm": 6.923299312591553, + "learning_rate": 7.754110612855008e-08, + "loss": 2.8944, + "step": 116880 + }, + { + "epoch": 7.941636091860307, + "grad_norm": 9.404691696166992, + "learning_rate": 7.711645604022287e-08, + "loss": 2.7214, + "step": 116885 + }, + { + "epoch": 7.941975811930969, + "grad_norm": 8.147955894470215, + "learning_rate": 7.669180595189564e-08, + "loss": 2.8818, + "step": 116890 + }, + { + "epoch": 7.942315532001631, + "grad_norm": 9.006000518798828, + "learning_rate": 7.626715586356842e-08, + "loss": 2.6601, + "step": 116895 + }, + { + "epoch": 7.942655252072292, + "grad_norm": 7.565447807312012, + "learning_rate": 7.58425057752412e-08, + "loss": 2.7648, + "step": 116900 + }, + { + "epoch": 7.942994972142954, + "grad_norm": 5.953572750091553, + "learning_rate": 7.541785568691399e-08, + "loss": 2.726, + "step": 116905 + }, + { + "epoch": 7.943334692213616, + "grad_norm": 5.607428073883057, + "learning_rate": 7.499320559858677e-08, + "loss": 2.6455, + "step": 116910 + }, + { + "epoch": 7.9436744122842775, + "grad_norm": 8.99843692779541, + "learning_rate": 7.456855551025955e-08, + "loss": 2.8269, + "step": 116915 + }, + { + "epoch": 7.94401413235494, + "grad_norm": 8.405858993530273, + "learning_rate": 7.414390542193233e-08, + "loss": 2.8007, + "step": 116920 + }, + { + "epoch": 7.944353852425602, + "grad_norm": 7.146877288818359, + "learning_rate": 7.371925533360512e-08, + "loss": 2.6752, + "step": 116925 + }, + { + "epoch": 7.944693572496263, + "grad_norm": 7.971839427947998, + "learning_rate": 7.329460524527789e-08, + "loss": 2.6134, + "step": 116930 + }, + { + "epoch": 7.945033292566925, + "grad_norm": 7.016451358795166, + "learning_rate": 7.286995515695068e-08, + "loss": 2.9241, + "step": 116935 + }, + { + "epoch": 7.945373012637587, + "grad_norm": 6.655670642852783, + "learning_rate": 7.244530506862347e-08, + "loss": 2.6049, + "step": 116940 + }, + { + "epoch": 7.945712732708248, + "grad_norm": 6.624749660491943, + "learning_rate": 7.202065498029624e-08, + "loss": 2.7096, + "step": 116945 + }, + { + "epoch": 7.94605245277891, + "grad_norm": 8.991133689880371, + "learning_rate": 7.159600489196903e-08, + "loss": 2.7885, + "step": 116950 + }, + { + "epoch": 7.946392172849572, + "grad_norm": 8.156643867492676, + "learning_rate": 7.11713548036418e-08, + "loss": 2.7536, + "step": 116955 + }, + { + "epoch": 7.9467318929202335, + "grad_norm": 8.298099517822266, + "learning_rate": 7.074670471531459e-08, + "loss": 2.7408, + "step": 116960 + }, + { + "epoch": 7.947071612990896, + "grad_norm": 6.64639139175415, + "learning_rate": 7.032205462698738e-08, + "loss": 2.6277, + "step": 116965 + }, + { + "epoch": 7.947411333061558, + "grad_norm": 6.109804630279541, + "learning_rate": 6.989740453866015e-08, + "loss": 2.5878, + "step": 116970 + }, + { + "epoch": 7.947751053132219, + "grad_norm": 7.552306175231934, + "learning_rate": 6.947275445033293e-08, + "loss": 2.5192, + "step": 116975 + }, + { + "epoch": 7.948090773202881, + "grad_norm": 8.865707397460938, + "learning_rate": 6.904810436200571e-08, + "loss": 2.7101, + "step": 116980 + }, + { + "epoch": 7.948430493273543, + "grad_norm": 8.208636283874512, + "learning_rate": 6.86234542736785e-08, + "loss": 2.6229, + "step": 116985 + }, + { + "epoch": 7.948770213344204, + "grad_norm": 12.711633682250977, + "learning_rate": 6.819880418535127e-08, + "loss": 2.8894, + "step": 116990 + }, + { + "epoch": 7.949109933414866, + "grad_norm": 9.716116905212402, + "learning_rate": 6.777415409702406e-08, + "loss": 2.6772, + "step": 116995 + }, + { + "epoch": 7.949449653485528, + "grad_norm": 8.309249877929688, + "learning_rate": 6.734950400869685e-08, + "loss": 2.8254, + "step": 117000 + }, + { + "epoch": 7.9497893735561895, + "grad_norm": 7.017250061035156, + "learning_rate": 6.692485392036962e-08, + "loss": 2.6593, + "step": 117005 + }, + { + "epoch": 7.950129093626852, + "grad_norm": 8.509529113769531, + "learning_rate": 6.65002038320424e-08, + "loss": 2.5094, + "step": 117010 + }, + { + "epoch": 7.950468813697514, + "grad_norm": 6.9272780418396, + "learning_rate": 6.607555374371519e-08, + "loss": 2.5994, + "step": 117015 + }, + { + "epoch": 7.950808533768175, + "grad_norm": 6.153290271759033, + "learning_rate": 6.565090365538797e-08, + "loss": 2.4781, + "step": 117020 + }, + { + "epoch": 7.951148253838837, + "grad_norm": 7.862964630126953, + "learning_rate": 6.522625356706075e-08, + "loss": 2.3388, + "step": 117025 + }, + { + "epoch": 7.951487973909499, + "grad_norm": 6.590076446533203, + "learning_rate": 6.480160347873352e-08, + "loss": 2.6747, + "step": 117030 + }, + { + "epoch": 7.95182769398016, + "grad_norm": 8.107426643371582, + "learning_rate": 6.437695339040631e-08, + "loss": 2.767, + "step": 117035 + }, + { + "epoch": 7.952167414050822, + "grad_norm": 7.910931587219238, + "learning_rate": 6.39523033020791e-08, + "loss": 2.6872, + "step": 117040 + }, + { + "epoch": 7.952507134121484, + "grad_norm": 5.654472351074219, + "learning_rate": 6.352765321375187e-08, + "loss": 2.8053, + "step": 117045 + }, + { + "epoch": 7.9528468541921455, + "grad_norm": 8.848631858825684, + "learning_rate": 6.310300312542465e-08, + "loss": 2.7982, + "step": 117050 + }, + { + "epoch": 7.953186574262808, + "grad_norm": 7.6574625968933105, + "learning_rate": 6.267835303709743e-08, + "loss": 2.5917, + "step": 117055 + }, + { + "epoch": 7.95352629433347, + "grad_norm": 8.736148834228516, + "learning_rate": 6.225370294877022e-08, + "loss": 2.7306, + "step": 117060 + }, + { + "epoch": 7.953866014404131, + "grad_norm": 6.58856725692749, + "learning_rate": 6.1829052860443e-08, + "loss": 2.6628, + "step": 117065 + }, + { + "epoch": 7.954205734474793, + "grad_norm": 7.154913902282715, + "learning_rate": 6.140440277211578e-08, + "loss": 2.7449, + "step": 117070 + }, + { + "epoch": 7.954545454545455, + "grad_norm": 8.155850410461426, + "learning_rate": 6.097975268378857e-08, + "loss": 2.6158, + "step": 117075 + }, + { + "epoch": 7.954885174616116, + "grad_norm": 11.507071495056152, + "learning_rate": 6.055510259546134e-08, + "loss": 2.6762, + "step": 117080 + }, + { + "epoch": 7.955224894686778, + "grad_norm": 8.726320266723633, + "learning_rate": 6.013045250713413e-08, + "loss": 2.5839, + "step": 117085 + }, + { + "epoch": 7.95556461475744, + "grad_norm": 8.92713451385498, + "learning_rate": 5.97058024188069e-08, + "loss": 2.5566, + "step": 117090 + }, + { + "epoch": 7.9559043348281016, + "grad_norm": 6.727827548980713, + "learning_rate": 5.928115233047969e-08, + "loss": 2.7623, + "step": 117095 + }, + { + "epoch": 7.956244054898764, + "grad_norm": 6.000484466552734, + "learning_rate": 5.885650224215247e-08, + "loss": 2.7796, + "step": 117100 + }, + { + "epoch": 7.956583774969425, + "grad_norm": 6.561611652374268, + "learning_rate": 5.8431852153825255e-08, + "loss": 2.7101, + "step": 117105 + }, + { + "epoch": 7.956923495040087, + "grad_norm": 8.937808990478516, + "learning_rate": 5.8007202065498036e-08, + "loss": 2.7185, + "step": 117110 + }, + { + "epoch": 7.957263215110749, + "grad_norm": 8.81234359741211, + "learning_rate": 5.758255197717081e-08, + "loss": 2.7312, + "step": 117115 + }, + { + "epoch": 7.95760293518141, + "grad_norm": 7.926998615264893, + "learning_rate": 5.71579018888436e-08, + "loss": 2.5478, + "step": 117120 + }, + { + "epoch": 7.957942655252072, + "grad_norm": 8.996459007263184, + "learning_rate": 5.673325180051638e-08, + "loss": 2.5368, + "step": 117125 + }, + { + "epoch": 7.958282375322734, + "grad_norm": 9.85107707977295, + "learning_rate": 5.630860171218916e-08, + "loss": 2.2929, + "step": 117130 + }, + { + "epoch": 7.9586220953933955, + "grad_norm": 5.889652252197266, + "learning_rate": 5.588395162386194e-08, + "loss": 2.8164, + "step": 117135 + }, + { + "epoch": 7.958961815464058, + "grad_norm": 7.670100688934326, + "learning_rate": 5.545930153553473e-08, + "loss": 2.8827, + "step": 117140 + }, + { + "epoch": 7.95930153553472, + "grad_norm": 8.316387176513672, + "learning_rate": 5.50346514472075e-08, + "loss": 2.7815, + "step": 117145 + }, + { + "epoch": 7.959641255605381, + "grad_norm": 9.214675903320312, + "learning_rate": 5.4610001358880284e-08, + "loss": 2.7261, + "step": 117150 + }, + { + "epoch": 7.959980975676043, + "grad_norm": 9.790353775024414, + "learning_rate": 5.418535127055307e-08, + "loss": 2.6901, + "step": 117155 + }, + { + "epoch": 7.960320695746705, + "grad_norm": 9.583436012268066, + "learning_rate": 5.376070118222585e-08, + "loss": 2.8241, + "step": 117160 + }, + { + "epoch": 7.960660415817366, + "grad_norm": 6.1456522941589355, + "learning_rate": 5.333605109389863e-08, + "loss": 2.7183, + "step": 117165 + }, + { + "epoch": 7.961000135888028, + "grad_norm": 8.713662147521973, + "learning_rate": 5.291140100557142e-08, + "loss": 2.5822, + "step": 117170 + }, + { + "epoch": 7.96133985595869, + "grad_norm": 9.595746994018555, + "learning_rate": 5.2486750917244195e-08, + "loss": 2.6256, + "step": 117175 + }, + { + "epoch": 7.9616795760293515, + "grad_norm": 7.459692001342773, + "learning_rate": 5.2062100828916976e-08, + "loss": 2.7472, + "step": 117180 + }, + { + "epoch": 7.962019296100014, + "grad_norm": 8.879729270935059, + "learning_rate": 5.163745074058976e-08, + "loss": 2.6344, + "step": 117185 + }, + { + "epoch": 7.962359016170676, + "grad_norm": 7.670926570892334, + "learning_rate": 5.1212800652262544e-08, + "loss": 2.6728, + "step": 117190 + }, + { + "epoch": 7.962698736241337, + "grad_norm": 9.704266548156738, + "learning_rate": 5.078815056393532e-08, + "loss": 2.5722, + "step": 117195 + }, + { + "epoch": 7.963038456311999, + "grad_norm": 6.763222694396973, + "learning_rate": 5.03635004756081e-08, + "loss": 2.8352, + "step": 117200 + }, + { + "epoch": 7.963378176382661, + "grad_norm": 7.50054931640625, + "learning_rate": 4.993885038728089e-08, + "loss": 2.6294, + "step": 117205 + }, + { + "epoch": 7.963717896453322, + "grad_norm": 7.769713401794434, + "learning_rate": 4.951420029895367e-08, + "loss": 2.7976, + "step": 117210 + }, + { + "epoch": 7.964057616523984, + "grad_norm": 8.445963859558105, + "learning_rate": 4.908955021062645e-08, + "loss": 2.7577, + "step": 117215 + }, + { + "epoch": 7.964397336594646, + "grad_norm": 9.371702194213867, + "learning_rate": 4.866490012229922e-08, + "loss": 2.7164, + "step": 117220 + }, + { + "epoch": 7.9647370566653075, + "grad_norm": 8.341949462890625, + "learning_rate": 4.824025003397201e-08, + "loss": 2.7401, + "step": 117225 + }, + { + "epoch": 7.96507677673597, + "grad_norm": 6.83805513381958, + "learning_rate": 4.781559994564479e-08, + "loss": 2.7102, + "step": 117230 + }, + { + "epoch": 7.965416496806632, + "grad_norm": 7.1665849685668945, + "learning_rate": 4.739094985731757e-08, + "loss": 2.5575, + "step": 117235 + }, + { + "epoch": 7.965756216877293, + "grad_norm": 6.609444618225098, + "learning_rate": 4.696629976899036e-08, + "loss": 2.8367, + "step": 117240 + }, + { + "epoch": 7.966095936947955, + "grad_norm": 8.449790000915527, + "learning_rate": 4.654164968066314e-08, + "loss": 3.0096, + "step": 117245 + }, + { + "epoch": 7.966435657018617, + "grad_norm": 6.420809268951416, + "learning_rate": 4.6116999592335915e-08, + "loss": 2.7059, + "step": 117250 + }, + { + "epoch": 7.966775377089278, + "grad_norm": 6.801746368408203, + "learning_rate": 4.5692349504008696e-08, + "loss": 2.6805, + "step": 117255 + }, + { + "epoch": 7.96711509715994, + "grad_norm": 6.963258266448975, + "learning_rate": 4.5267699415681484e-08, + "loss": 2.5139, + "step": 117260 + }, + { + "epoch": 7.967454817230602, + "grad_norm": 7.739333629608154, + "learning_rate": 4.4843049327354265e-08, + "loss": 2.6929, + "step": 117265 + }, + { + "epoch": 7.9677945373012635, + "grad_norm": 8.734375953674316, + "learning_rate": 4.441839923902704e-08, + "loss": 2.8837, + "step": 117270 + }, + { + "epoch": 7.968134257371926, + "grad_norm": 8.395490646362305, + "learning_rate": 4.3993749150699827e-08, + "loss": 2.7375, + "step": 117275 + }, + { + "epoch": 7.968473977442588, + "grad_norm": 7.865385055541992, + "learning_rate": 4.356909906237261e-08, + "loss": 2.5941, + "step": 117280 + }, + { + "epoch": 7.968813697513249, + "grad_norm": 7.546437740325928, + "learning_rate": 4.314444897404539e-08, + "loss": 2.6806, + "step": 117285 + }, + { + "epoch": 7.969153417583911, + "grad_norm": 7.0665283203125, + "learning_rate": 4.2719798885718176e-08, + "loss": 2.8352, + "step": 117290 + }, + { + "epoch": 7.969493137654573, + "grad_norm": 6.610653877258301, + "learning_rate": 4.229514879739096e-08, + "loss": 2.7791, + "step": 117295 + }, + { + "epoch": 7.969832857725234, + "grad_norm": 10.788460731506348, + "learning_rate": 4.187049870906373e-08, + "loss": 2.7089, + "step": 117300 + }, + { + "epoch": 7.970172577795896, + "grad_norm": 9.030673027038574, + "learning_rate": 4.144584862073651e-08, + "loss": 2.78, + "step": 117305 + }, + { + "epoch": 7.9705122978665575, + "grad_norm": 7.8972601890563965, + "learning_rate": 4.10211985324093e-08, + "loss": 2.6083, + "step": 117310 + }, + { + "epoch": 7.9708520179372195, + "grad_norm": 7.932689189910889, + "learning_rate": 4.059654844408208e-08, + "loss": 2.4181, + "step": 117315 + }, + { + "epoch": 7.971191738007882, + "grad_norm": 9.13099193572998, + "learning_rate": 4.017189835575486e-08, + "loss": 2.6889, + "step": 117320 + }, + { + "epoch": 7.971531458078543, + "grad_norm": 9.698552131652832, + "learning_rate": 3.974724826742765e-08, + "loss": 2.7034, + "step": 117325 + }, + { + "epoch": 7.971871178149205, + "grad_norm": 7.856801986694336, + "learning_rate": 3.9322598179100423e-08, + "loss": 2.744, + "step": 117330 + }, + { + "epoch": 7.972210898219867, + "grad_norm": 7.320738792419434, + "learning_rate": 3.8897948090773204e-08, + "loss": 2.8311, + "step": 117335 + }, + { + "epoch": 7.972550618290528, + "grad_norm": 8.034480094909668, + "learning_rate": 3.8473298002445985e-08, + "loss": 2.821, + "step": 117340 + }, + { + "epoch": 7.97289033836119, + "grad_norm": 6.356759548187256, + "learning_rate": 3.804864791411877e-08, + "loss": 2.7957, + "step": 117345 + }, + { + "epoch": 7.973230058431852, + "grad_norm": 8.80492115020752, + "learning_rate": 3.7623997825791554e-08, + "loss": 2.4232, + "step": 117350 + }, + { + "epoch": 7.9735697785025135, + "grad_norm": 8.243036270141602, + "learning_rate": 3.7199347737464335e-08, + "loss": 2.8477, + "step": 117355 + }, + { + "epoch": 7.9739094985731755, + "grad_norm": 9.079806327819824, + "learning_rate": 3.6774697649137115e-08, + "loss": 2.7665, + "step": 117360 + }, + { + "epoch": 7.974249218643838, + "grad_norm": 7.404937744140625, + "learning_rate": 3.6350047560809896e-08, + "loss": 2.6396, + "step": 117365 + }, + { + "epoch": 7.974588938714499, + "grad_norm": 12.558002471923828, + "learning_rate": 3.592539747248268e-08, + "loss": 2.7457, + "step": 117370 + }, + { + "epoch": 7.974928658785161, + "grad_norm": 9.203106880187988, + "learning_rate": 3.550074738415546e-08, + "loss": 2.9395, + "step": 117375 + }, + { + "epoch": 7.975268378855823, + "grad_norm": 7.9250712394714355, + "learning_rate": 3.507609729582824e-08, + "loss": 2.7909, + "step": 117380 + }, + { + "epoch": 7.975608098926484, + "grad_norm": 6.8285322189331055, + "learning_rate": 3.465144720750102e-08, + "loss": 2.7284, + "step": 117385 + }, + { + "epoch": 7.975947818997146, + "grad_norm": 6.746373653411865, + "learning_rate": 3.422679711917381e-08, + "loss": 2.7057, + "step": 117390 + }, + { + "epoch": 7.976287539067808, + "grad_norm": 7.183392524719238, + "learning_rate": 3.380214703084658e-08, + "loss": 2.7158, + "step": 117395 + }, + { + "epoch": 7.9766272591384695, + "grad_norm": 8.599082946777344, + "learning_rate": 3.337749694251937e-08, + "loss": 2.6404, + "step": 117400 + }, + { + "epoch": 7.976966979209132, + "grad_norm": 6.612517833709717, + "learning_rate": 3.295284685419215e-08, + "loss": 2.6783, + "step": 117405 + }, + { + "epoch": 7.977306699279794, + "grad_norm": 8.376697540283203, + "learning_rate": 3.252819676586493e-08, + "loss": 2.8381, + "step": 117410 + }, + { + "epoch": 7.977646419350455, + "grad_norm": 6.88329553604126, + "learning_rate": 3.210354667753771e-08, + "loss": 2.6848, + "step": 117415 + }, + { + "epoch": 7.977986139421117, + "grad_norm": 7.21010684967041, + "learning_rate": 3.167889658921049e-08, + "loss": 2.6618, + "step": 117420 + }, + { + "epoch": 7.978325859491779, + "grad_norm": 7.255218029022217, + "learning_rate": 3.1254246500883274e-08, + "loss": 3.0936, + "step": 117425 + }, + { + "epoch": 7.97866557956244, + "grad_norm": 9.087076187133789, + "learning_rate": 3.0829596412556055e-08, + "loss": 2.6651, + "step": 117430 + }, + { + "epoch": 7.979005299633102, + "grad_norm": 9.329203605651855, + "learning_rate": 3.0404946324228836e-08, + "loss": 2.9195, + "step": 117435 + }, + { + "epoch": 7.979345019703764, + "grad_norm": 9.29466438293457, + "learning_rate": 2.9980296235901623e-08, + "loss": 2.8542, + "step": 117440 + }, + { + "epoch": 7.9796847397744255, + "grad_norm": 7.376945972442627, + "learning_rate": 2.95556461475744e-08, + "loss": 2.9986, + "step": 117445 + }, + { + "epoch": 7.980024459845088, + "grad_norm": 9.788148880004883, + "learning_rate": 2.9130996059247182e-08, + "loss": 2.7095, + "step": 117450 + }, + { + "epoch": 7.98036417991575, + "grad_norm": 6.473140716552734, + "learning_rate": 2.8706345970919963e-08, + "loss": 2.6241, + "step": 117455 + }, + { + "epoch": 7.980703899986411, + "grad_norm": 8.058478355407715, + "learning_rate": 2.8281695882592747e-08, + "loss": 2.8086, + "step": 117460 + }, + { + "epoch": 7.981043620057073, + "grad_norm": 6.924862861633301, + "learning_rate": 2.7857045794265528e-08, + "loss": 2.6586, + "step": 117465 + }, + { + "epoch": 7.981383340127735, + "grad_norm": 8.406458854675293, + "learning_rate": 2.743239570593831e-08, + "loss": 2.7517, + "step": 117470 + }, + { + "epoch": 7.981723060198396, + "grad_norm": 7.860377311706543, + "learning_rate": 2.7007745617611093e-08, + "loss": 2.8099, + "step": 117475 + }, + { + "epoch": 7.982062780269058, + "grad_norm": 7.2896575927734375, + "learning_rate": 2.658309552928387e-08, + "loss": 2.686, + "step": 117480 + }, + { + "epoch": 7.98240250033972, + "grad_norm": 6.623136043548584, + "learning_rate": 2.6158445440956655e-08, + "loss": 2.6308, + "step": 117485 + }, + { + "epoch": 7.9827422204103815, + "grad_norm": 8.467308044433594, + "learning_rate": 2.573379535262944e-08, + "loss": 2.9298, + "step": 117490 + }, + { + "epoch": 7.983081940481044, + "grad_norm": 7.4243574142456055, + "learning_rate": 2.5309145264302217e-08, + "loss": 2.5059, + "step": 117495 + }, + { + "epoch": 7.983421660551706, + "grad_norm": 7.2167768478393555, + "learning_rate": 2.4884495175975e-08, + "loss": 2.805, + "step": 117500 + }, + { + "epoch": 7.983761380622367, + "grad_norm": 7.638394832611084, + "learning_rate": 2.445984508764778e-08, + "loss": 2.7552, + "step": 117505 + }, + { + "epoch": 7.984101100693029, + "grad_norm": 6.325710773468018, + "learning_rate": 2.4035194999320563e-08, + "loss": 2.9109, + "step": 117510 + }, + { + "epoch": 7.984440820763691, + "grad_norm": 6.620457649230957, + "learning_rate": 2.361054491099334e-08, + "loss": 2.7913, + "step": 117515 + }, + { + "epoch": 7.984780540834352, + "grad_norm": 6.925183296203613, + "learning_rate": 2.3185894822666125e-08, + "loss": 2.7856, + "step": 117520 + }, + { + "epoch": 7.985120260905014, + "grad_norm": 6.71173620223999, + "learning_rate": 2.276124473433891e-08, + "loss": 2.6175, + "step": 117525 + }, + { + "epoch": 7.985459980975676, + "grad_norm": 7.833379745483398, + "learning_rate": 2.2336594646011687e-08, + "loss": 2.5054, + "step": 117530 + }, + { + "epoch": 7.9857997010463375, + "grad_norm": 7.317165851593018, + "learning_rate": 2.191194455768447e-08, + "loss": 2.7815, + "step": 117535 + }, + { + "epoch": 7.986139421117, + "grad_norm": 7.710849285125732, + "learning_rate": 2.148729446935725e-08, + "loss": 2.5085, + "step": 117540 + }, + { + "epoch": 7.986479141187662, + "grad_norm": 5.7638163566589355, + "learning_rate": 2.1062644381030033e-08, + "loss": 2.6727, + "step": 117545 + }, + { + "epoch": 7.986818861258323, + "grad_norm": 8.341775894165039, + "learning_rate": 2.0637994292702817e-08, + "loss": 2.6035, + "step": 117550 + }, + { + "epoch": 7.987158581328985, + "grad_norm": 7.098612308502197, + "learning_rate": 2.0213344204375594e-08, + "loss": 2.7074, + "step": 117555 + }, + { + "epoch": 7.987498301399647, + "grad_norm": 6.506921768188477, + "learning_rate": 1.978869411604838e-08, + "loss": 2.8209, + "step": 117560 + }, + { + "epoch": 7.987838021470308, + "grad_norm": 7.6535725593566895, + "learning_rate": 1.936404402772116e-08, + "loss": 2.813, + "step": 117565 + }, + { + "epoch": 7.98817774154097, + "grad_norm": 8.10050106048584, + "learning_rate": 1.893939393939394e-08, + "loss": 2.6507, + "step": 117570 + }, + { + "epoch": 7.988517461611632, + "grad_norm": 9.039199829101562, + "learning_rate": 1.851474385106672e-08, + "loss": 2.5756, + "step": 117575 + }, + { + "epoch": 7.9888571816822935, + "grad_norm": 8.066468238830566, + "learning_rate": 1.8090093762739502e-08, + "loss": 2.5728, + "step": 117580 + }, + { + "epoch": 7.989196901752956, + "grad_norm": 7.8526225090026855, + "learning_rate": 1.7665443674412287e-08, + "loss": 2.5004, + "step": 117585 + }, + { + "epoch": 7.989536621823618, + "grad_norm": 10.502717971801758, + "learning_rate": 1.7240793586085068e-08, + "loss": 2.7066, + "step": 117590 + }, + { + "epoch": 7.989876341894279, + "grad_norm": 7.722026348114014, + "learning_rate": 1.681614349775785e-08, + "loss": 2.7447, + "step": 117595 + }, + { + "epoch": 7.990216061964941, + "grad_norm": 6.8401360511779785, + "learning_rate": 1.639149340943063e-08, + "loss": 2.8142, + "step": 117600 + }, + { + "epoch": 7.990555782035603, + "grad_norm": 7.68062686920166, + "learning_rate": 1.5966843321103414e-08, + "loss": 2.7662, + "step": 117605 + }, + { + "epoch": 7.990895502106264, + "grad_norm": 7.977413654327393, + "learning_rate": 1.5542193232776195e-08, + "loss": 2.6604, + "step": 117610 + }, + { + "epoch": 7.991235222176926, + "grad_norm": 8.149065971374512, + "learning_rate": 1.5117543144448975e-08, + "loss": 2.6405, + "step": 117615 + }, + { + "epoch": 7.991574942247588, + "grad_norm": 8.25558853149414, + "learning_rate": 1.4692893056121756e-08, + "loss": 2.8207, + "step": 117620 + }, + { + "epoch": 7.9919146623182495, + "grad_norm": 9.402177810668945, + "learning_rate": 1.4268242967794539e-08, + "loss": 2.9398, + "step": 117625 + }, + { + "epoch": 7.992254382388912, + "grad_norm": 5.8533616065979, + "learning_rate": 1.384359287946732e-08, + "loss": 2.8, + "step": 117630 + }, + { + "epoch": 7.992594102459574, + "grad_norm": 8.452311515808105, + "learning_rate": 1.3418942791140102e-08, + "loss": 2.6745, + "step": 117635 + }, + { + "epoch": 7.992933822530235, + "grad_norm": 7.856334209442139, + "learning_rate": 1.2994292702812883e-08, + "loss": 2.764, + "step": 117640 + }, + { + "epoch": 7.993273542600897, + "grad_norm": 8.159979820251465, + "learning_rate": 1.2569642614485664e-08, + "loss": 2.5904, + "step": 117645 + }, + { + "epoch": 7.993613262671559, + "grad_norm": 8.21860408782959, + "learning_rate": 1.2144992526158445e-08, + "loss": 2.682, + "step": 117650 + }, + { + "epoch": 7.99395298274222, + "grad_norm": 7.1467461585998535, + "learning_rate": 1.172034243783123e-08, + "loss": 2.6501, + "step": 117655 + }, + { + "epoch": 7.994292702812882, + "grad_norm": 7.677640914916992, + "learning_rate": 1.129569234950401e-08, + "loss": 2.6029, + "step": 117660 + }, + { + "epoch": 7.994632422883544, + "grad_norm": 10.205591201782227, + "learning_rate": 1.0871042261176791e-08, + "loss": 2.6544, + "step": 117665 + }, + { + "epoch": 7.9949721429542056, + "grad_norm": 8.186488151550293, + "learning_rate": 1.0446392172849572e-08, + "loss": 2.4496, + "step": 117670 + }, + { + "epoch": 7.995311863024868, + "grad_norm": 7.125949859619141, + "learning_rate": 1.0021742084522353e-08, + "loss": 2.7133, + "step": 117675 + }, + { + "epoch": 7.99565158309553, + "grad_norm": 7.17349910736084, + "learning_rate": 9.597091996195136e-09, + "loss": 2.5593, + "step": 117680 + }, + { + "epoch": 7.995991303166191, + "grad_norm": 7.541616916656494, + "learning_rate": 9.172441907867917e-09, + "loss": 2.7806, + "step": 117685 + }, + { + "epoch": 7.996331023236853, + "grad_norm": 8.53640079498291, + "learning_rate": 8.747791819540699e-09, + "loss": 2.7262, + "step": 117690 + }, + { + "epoch": 7.996670743307515, + "grad_norm": 6.109592914581299, + "learning_rate": 8.32314173121348e-09, + "loss": 2.6593, + "step": 117695 + }, + { + "epoch": 7.997010463378176, + "grad_norm": 8.709267616271973, + "learning_rate": 7.898491642886263e-09, + "loss": 2.6553, + "step": 117700 + }, + { + "epoch": 7.997350183448838, + "grad_norm": 6.23703145980835, + "learning_rate": 7.473841554559044e-09, + "loss": 2.6675, + "step": 117705 + }, + { + "epoch": 7.9976899035195, + "grad_norm": 8.49058723449707, + "learning_rate": 7.049191466231825e-09, + "loss": 2.6253, + "step": 117710 + }, + { + "epoch": 7.998029623590162, + "grad_norm": 7.567598819732666, + "learning_rate": 6.624541377904607e-09, + "loss": 2.6339, + "step": 117715 + }, + { + "epoch": 7.998369343660824, + "grad_norm": 9.637716293334961, + "learning_rate": 6.199891289577389e-09, + "loss": 2.7549, + "step": 117720 + }, + { + "epoch": 7.998709063731486, + "grad_norm": 8.574246406555176, + "learning_rate": 5.7752412012501706e-09, + "loss": 2.7681, + "step": 117725 + }, + { + "epoch": 7.999048783802147, + "grad_norm": 9.584497451782227, + "learning_rate": 5.3505911129229515e-09, + "loss": 2.8905, + "step": 117730 + }, + { + "epoch": 7.999388503872809, + "grad_norm": 8.832257270812988, + "learning_rate": 4.925941024595734e-09, + "loss": 2.7699, + "step": 117735 + }, + { + "epoch": 7.999728223943471, + "grad_norm": 9.127363204956055, + "learning_rate": 4.501290936268515e-09, + "loss": 2.9768, + "step": 117740 + }, + { + "epoch": 8.0, + "eval_bertscore": { + "f1": 0.8409928807986777, + "precision": 0.8403705409682413, + "recall": 0.8423897626095241 + }, + "eval_bleu_4": 0.017020498946732236, + "eval_exact_match": 0.00019381723035177828, + "eval_loss": 3.3546345233917236, + "eval_meteor": 0.10571031638688935, + "eval_rouge": { + "rouge1": 0.13520845950110405, + "rouge2": 0.017677431908815235, + "rougeL": 0.11232028225023491, + "rougeLsum": 0.11234145920335115 + }, + "eval_runtime": 1060.0895, + "eval_samples_per_second": 9.734, + "eval_steps_per_second": 1.217, + "step": 117744 + } + ], + "logging_steps": 5, + "max_steps": 117744, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.8715904342687744e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}