diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12047 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 8576, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005830223880597014, + "grad_norm": 2.550195533598444, + "learning_rate": 5.827505827505828e-07, + "loss": 0.8852, + "step": 5 + }, + { + "epoch": 0.0011660447761194029, + "grad_norm": 2.2188855019798273, + "learning_rate": 1.1655011655011657e-06, + "loss": 0.8311, + "step": 10 + }, + { + "epoch": 0.0017490671641791045, + "grad_norm": 1.7967346771825998, + "learning_rate": 1.7482517482517485e-06, + "loss": 0.8113, + "step": 15 + }, + { + "epoch": 0.0023320895522388058, + "grad_norm": 1.618619190869765, + "learning_rate": 2.3310023310023313e-06, + "loss": 0.8333, + "step": 20 + }, + { + "epoch": 0.0029151119402985076, + "grad_norm": 1.137673984585037, + "learning_rate": 2.9137529137529138e-06, + "loss": 0.7745, + "step": 25 + }, + { + "epoch": 0.003498134328358209, + "grad_norm": 1.1933766134733839, + "learning_rate": 3.496503496503497e-06, + "loss": 0.7904, + "step": 30 + }, + { + "epoch": 0.00408115671641791, + "grad_norm": 1.0157304336095063, + "learning_rate": 4.079254079254079e-06, + "loss": 0.7624, + "step": 35 + }, + { + "epoch": 0.0046641791044776115, + "grad_norm": 0.9429820002012593, + "learning_rate": 4.662004662004663e-06, + "loss": 0.7285, + "step": 40 + }, + { + "epoch": 0.005247201492537314, + "grad_norm": 0.915430165066572, + "learning_rate": 5.244755244755245e-06, + "loss": 0.7469, + "step": 45 + }, + { + "epoch": 0.005830223880597015, + "grad_norm": 0.8161946795665328, + "learning_rate": 5.8275058275058275e-06, + "loss": 0.707, + "step": 50 + }, + { + "epoch": 0.006413246268656717, + "grad_norm": 0.8339668281290502, + "learning_rate": 6.41025641025641e-06, + "loss": 0.6905, + "step": 55 + }, + { + "epoch": 0.006996268656716418, + "grad_norm": 0.8574171717816127, + "learning_rate": 6.993006993006994e-06, + "loss": 0.7698, + "step": 60 + }, + { + "epoch": 0.00757929104477612, + "grad_norm": 0.8619956369841163, + "learning_rate": 7.5757575757575764e-06, + "loss": 0.7082, + "step": 65 + }, + { + "epoch": 0.00816231343283582, + "grad_norm": 0.8045557883502801, + "learning_rate": 8.158508158508159e-06, + "loss": 0.6587, + "step": 70 + }, + { + "epoch": 0.008745335820895522, + "grad_norm": 0.7946638318702278, + "learning_rate": 8.741258741258741e-06, + "loss": 0.6539, + "step": 75 + }, + { + "epoch": 0.009328358208955223, + "grad_norm": 0.8503979707980048, + "learning_rate": 9.324009324009325e-06, + "loss": 0.6879, + "step": 80 + }, + { + "epoch": 0.009911380597014926, + "grad_norm": 0.804711208532189, + "learning_rate": 9.906759906759908e-06, + "loss": 0.6743, + "step": 85 + }, + { + "epoch": 0.010494402985074628, + "grad_norm": 0.7951085316698083, + "learning_rate": 1.048951048951049e-05, + "loss": 0.6586, + "step": 90 + }, + { + "epoch": 0.011077425373134329, + "grad_norm": 0.793128900429868, + "learning_rate": 1.1072261072261073e-05, + "loss": 0.6842, + "step": 95 + }, + { + "epoch": 0.01166044776119403, + "grad_norm": 0.8075245421009535, + "learning_rate": 1.1655011655011655e-05, + "loss": 0.6684, + "step": 100 + }, + { + "epoch": 0.012243470149253732, + "grad_norm": 0.8581518728101114, + "learning_rate": 1.2237762237762239e-05, + "loss": 0.6207, + "step": 105 + }, + { + "epoch": 0.012826492537313433, + "grad_norm": 0.9082387952747121, + "learning_rate": 1.282051282051282e-05, + "loss": 0.6678, + "step": 110 + }, + { + "epoch": 0.013409514925373135, + "grad_norm": 0.9029958898223162, + "learning_rate": 1.3403263403263406e-05, + "loss": 0.6394, + "step": 115 + }, + { + "epoch": 0.013992537313432836, + "grad_norm": 0.8377367698700525, + "learning_rate": 1.3986013986013988e-05, + "loss": 0.6707, + "step": 120 + }, + { + "epoch": 0.014575559701492538, + "grad_norm": 0.7587966655835441, + "learning_rate": 1.456876456876457e-05, + "loss": 0.6258, + "step": 125 + }, + { + "epoch": 0.01515858208955224, + "grad_norm": 0.8173306054331542, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.6619, + "step": 130 + }, + { + "epoch": 0.01574160447761194, + "grad_norm": 0.9376337343046882, + "learning_rate": 1.5734265734265734e-05, + "loss": 0.6673, + "step": 135 + }, + { + "epoch": 0.01632462686567164, + "grad_norm": 0.9363652268784762, + "learning_rate": 1.6317016317016318e-05, + "loss": 0.668, + "step": 140 + }, + { + "epoch": 0.016907649253731342, + "grad_norm": 0.8794426221093276, + "learning_rate": 1.68997668997669e-05, + "loss": 0.6569, + "step": 145 + }, + { + "epoch": 0.017490671641791043, + "grad_norm": 0.9407491307884387, + "learning_rate": 1.7482517482517483e-05, + "loss": 0.6778, + "step": 150 + }, + { + "epoch": 0.018073694029850745, + "grad_norm": 0.8720667910368793, + "learning_rate": 1.8065268065268067e-05, + "loss": 0.6291, + "step": 155 + }, + { + "epoch": 0.018656716417910446, + "grad_norm": 0.8603987252532928, + "learning_rate": 1.864801864801865e-05, + "loss": 0.6372, + "step": 160 + }, + { + "epoch": 0.019239738805970148, + "grad_norm": 0.8449689793110765, + "learning_rate": 1.923076923076923e-05, + "loss": 0.6586, + "step": 165 + }, + { + "epoch": 0.019822761194029852, + "grad_norm": 1.0325515822144362, + "learning_rate": 1.9813519813519816e-05, + "loss": 0.6386, + "step": 170 + }, + { + "epoch": 0.020405783582089554, + "grad_norm": 0.8513517345308381, + "learning_rate": 2.0396270396270396e-05, + "loss": 0.6311, + "step": 175 + }, + { + "epoch": 0.020988805970149255, + "grad_norm": 0.8847582203013, + "learning_rate": 2.097902097902098e-05, + "loss": 0.6834, + "step": 180 + }, + { + "epoch": 0.021571828358208957, + "grad_norm": 0.9006033563159529, + "learning_rate": 2.156177156177156e-05, + "loss": 0.6381, + "step": 185 + }, + { + "epoch": 0.022154850746268658, + "grad_norm": 0.8147114381410464, + "learning_rate": 2.2144522144522145e-05, + "loss": 0.6432, + "step": 190 + }, + { + "epoch": 0.02273787313432836, + "grad_norm": 0.9560611394473829, + "learning_rate": 2.272727272727273e-05, + "loss": 0.6261, + "step": 195 + }, + { + "epoch": 0.02332089552238806, + "grad_norm": 0.7951106600912393, + "learning_rate": 2.331002331002331e-05, + "loss": 0.6307, + "step": 200 + }, + { + "epoch": 0.023903917910447763, + "grad_norm": 0.855140755847949, + "learning_rate": 2.3892773892773894e-05, + "loss": 0.6213, + "step": 205 + }, + { + "epoch": 0.024486940298507464, + "grad_norm": 1.0420866065102492, + "learning_rate": 2.4475524475524478e-05, + "loss": 0.6275, + "step": 210 + }, + { + "epoch": 0.025069962686567165, + "grad_norm": 0.89019303742937, + "learning_rate": 2.505827505827506e-05, + "loss": 0.6275, + "step": 215 + }, + { + "epoch": 0.025652985074626867, + "grad_norm": 0.9758673014251983, + "learning_rate": 2.564102564102564e-05, + "loss": 0.6631, + "step": 220 + }, + { + "epoch": 0.02623600746268657, + "grad_norm": 0.8596860100346014, + "learning_rate": 2.6223776223776224e-05, + "loss": 0.6216, + "step": 225 + }, + { + "epoch": 0.02681902985074627, + "grad_norm": 0.8908792386022747, + "learning_rate": 2.680652680652681e-05, + "loss": 0.6406, + "step": 230 + }, + { + "epoch": 0.02740205223880597, + "grad_norm": 0.8721610635945679, + "learning_rate": 2.738927738927739e-05, + "loss": 0.6216, + "step": 235 + }, + { + "epoch": 0.027985074626865673, + "grad_norm": 0.8671325117829104, + "learning_rate": 2.7972027972027976e-05, + "loss": 0.6713, + "step": 240 + }, + { + "epoch": 0.028568097014925374, + "grad_norm": 0.8803714665797042, + "learning_rate": 2.8554778554778557e-05, + "loss": 0.629, + "step": 245 + }, + { + "epoch": 0.029151119402985076, + "grad_norm": 0.8308844754653052, + "learning_rate": 2.913752913752914e-05, + "loss": 0.6373, + "step": 250 + }, + { + "epoch": 0.029734141791044777, + "grad_norm": 0.829842731040146, + "learning_rate": 2.972027972027972e-05, + "loss": 0.6163, + "step": 255 + }, + { + "epoch": 0.03031716417910448, + "grad_norm": 0.9311493282737088, + "learning_rate": 3.0303030303030306e-05, + "loss": 0.6448, + "step": 260 + }, + { + "epoch": 0.03090018656716418, + "grad_norm": 0.8380478242539974, + "learning_rate": 3.088578088578088e-05, + "loss": 0.6123, + "step": 265 + }, + { + "epoch": 0.03148320895522388, + "grad_norm": 0.8612483599648676, + "learning_rate": 3.146853146853147e-05, + "loss": 0.6206, + "step": 270 + }, + { + "epoch": 0.03206623134328358, + "grad_norm": 0.7504191034824335, + "learning_rate": 3.205128205128206e-05, + "loss": 0.5956, + "step": 275 + }, + { + "epoch": 0.03264925373134328, + "grad_norm": 0.8435455726913331, + "learning_rate": 3.2634032634032635e-05, + "loss": 0.6147, + "step": 280 + }, + { + "epoch": 0.033232276119402986, + "grad_norm": 0.8228330420668449, + "learning_rate": 3.321678321678322e-05, + "loss": 0.6285, + "step": 285 + }, + { + "epoch": 0.033815298507462684, + "grad_norm": 0.7466739529712338, + "learning_rate": 3.37995337995338e-05, + "loss": 0.62, + "step": 290 + }, + { + "epoch": 0.03439832089552239, + "grad_norm": 0.8056760014927022, + "learning_rate": 3.438228438228439e-05, + "loss": 0.6216, + "step": 295 + }, + { + "epoch": 0.034981343283582086, + "grad_norm": 0.8113797852368729, + "learning_rate": 3.4965034965034965e-05, + "loss": 0.6105, + "step": 300 + }, + { + "epoch": 0.03556436567164179, + "grad_norm": 0.8890502750300378, + "learning_rate": 3.554778554778555e-05, + "loss": 0.6139, + "step": 305 + }, + { + "epoch": 0.03614738805970149, + "grad_norm": 0.7687545593968855, + "learning_rate": 3.613053613053613e-05, + "loss": 0.614, + "step": 310 + }, + { + "epoch": 0.036730410447761194, + "grad_norm": 0.8336903362213683, + "learning_rate": 3.671328671328672e-05, + "loss": 0.6278, + "step": 315 + }, + { + "epoch": 0.03731343283582089, + "grad_norm": 0.869875611794232, + "learning_rate": 3.72960372960373e-05, + "loss": 0.6645, + "step": 320 + }, + { + "epoch": 0.0378964552238806, + "grad_norm": 0.789926244606821, + "learning_rate": 3.787878787878788e-05, + "loss": 0.6027, + "step": 325 + }, + { + "epoch": 0.038479477611940295, + "grad_norm": 0.8392459694319648, + "learning_rate": 3.846153846153846e-05, + "loss": 0.6567, + "step": 330 + }, + { + "epoch": 0.0390625, + "grad_norm": 0.7918852769546142, + "learning_rate": 3.904428904428905e-05, + "loss": 0.6453, + "step": 335 + }, + { + "epoch": 0.039645522388059705, + "grad_norm": 0.7819447011686889, + "learning_rate": 3.962703962703963e-05, + "loss": 0.6, + "step": 340 + }, + { + "epoch": 0.0402285447761194, + "grad_norm": 0.7599671478703276, + "learning_rate": 4.020979020979021e-05, + "loss": 0.6097, + "step": 345 + }, + { + "epoch": 0.04081156716417911, + "grad_norm": 0.8770061789317196, + "learning_rate": 4.079254079254079e-05, + "loss": 0.6199, + "step": 350 + }, + { + "epoch": 0.041394589552238806, + "grad_norm": 0.8228590505028837, + "learning_rate": 4.1375291375291377e-05, + "loss": 0.63, + "step": 355 + }, + { + "epoch": 0.04197761194029851, + "grad_norm": 0.9266679768403295, + "learning_rate": 4.195804195804196e-05, + "loss": 0.6611, + "step": 360 + }, + { + "epoch": 0.04256063432835821, + "grad_norm": 0.776925533918814, + "learning_rate": 4.254079254079254e-05, + "loss": 0.6383, + "step": 365 + }, + { + "epoch": 0.043143656716417914, + "grad_norm": 0.7652459941000134, + "learning_rate": 4.312354312354312e-05, + "loss": 0.5967, + "step": 370 + }, + { + "epoch": 0.04372667910447761, + "grad_norm": 0.8438476797737248, + "learning_rate": 4.370629370629371e-05, + "loss": 0.6146, + "step": 375 + }, + { + "epoch": 0.044309701492537316, + "grad_norm": 0.8158888620815322, + "learning_rate": 4.428904428904429e-05, + "loss": 0.6237, + "step": 380 + }, + { + "epoch": 0.044892723880597014, + "grad_norm": 0.9144427364330033, + "learning_rate": 4.4871794871794874e-05, + "loss": 0.6098, + "step": 385 + }, + { + "epoch": 0.04547574626865672, + "grad_norm": 0.7260277122998825, + "learning_rate": 4.545454545454546e-05, + "loss": 0.6105, + "step": 390 + }, + { + "epoch": 0.04605876865671642, + "grad_norm": 0.774195742981333, + "learning_rate": 4.603729603729604e-05, + "loss": 0.6182, + "step": 395 + }, + { + "epoch": 0.04664179104477612, + "grad_norm": 0.8931158013280471, + "learning_rate": 4.662004662004662e-05, + "loss": 0.6269, + "step": 400 + }, + { + "epoch": 0.04722481343283582, + "grad_norm": 0.7219414153894123, + "learning_rate": 4.7202797202797204e-05, + "loss": 0.6159, + "step": 405 + }, + { + "epoch": 0.047807835820895525, + "grad_norm": 0.7392180125184434, + "learning_rate": 4.778554778554779e-05, + "loss": 0.6597, + "step": 410 + }, + { + "epoch": 0.04839085820895522, + "grad_norm": 0.7724562759848209, + "learning_rate": 4.836829836829837e-05, + "loss": 0.6588, + "step": 415 + }, + { + "epoch": 0.04897388059701493, + "grad_norm": 0.7698999004589875, + "learning_rate": 4.8951048951048956e-05, + "loss": 0.6431, + "step": 420 + }, + { + "epoch": 0.049556902985074626, + "grad_norm": 0.756825275252759, + "learning_rate": 4.9533799533799534e-05, + "loss": 0.6475, + "step": 425 + }, + { + "epoch": 0.05013992537313433, + "grad_norm": 0.7182404605677212, + "learning_rate": 4.9999998327150664e-05, + "loss": 0.6519, + "step": 430 + }, + { + "epoch": 0.05072294776119403, + "grad_norm": 0.7352734921536612, + "learning_rate": 4.999993977744981e-05, + "loss": 0.6367, + "step": 435 + }, + { + "epoch": 0.051305970149253734, + "grad_norm": 0.8663133271054537, + "learning_rate": 4.9999797585530614e-05, + "loss": 0.6367, + "step": 440 + }, + { + "epoch": 0.05188899253731343, + "grad_norm": 0.8007074508867144, + "learning_rate": 4.9999571751921666e-05, + "loss": 0.649, + "step": 445 + }, + { + "epoch": 0.05247201492537314, + "grad_norm": 0.8808725057777399, + "learning_rate": 4.999926227746247e-05, + "loss": 0.6407, + "step": 450 + }, + { + "epoch": 0.053055037313432835, + "grad_norm": 0.8207754194931053, + "learning_rate": 4.999886916330351e-05, + "loss": 0.6664, + "step": 455 + }, + { + "epoch": 0.05363805970149254, + "grad_norm": 0.7200644017362087, + "learning_rate": 4.9998392410906135e-05, + "loss": 0.6264, + "step": 460 + }, + { + "epoch": 0.05422108208955224, + "grad_norm": 0.7655394577960307, + "learning_rate": 4.9997832022042676e-05, + "loss": 0.6854, + "step": 465 + }, + { + "epoch": 0.05480410447761194, + "grad_norm": 1.5845328151194846, + "learning_rate": 4.9997187998796316e-05, + "loss": 0.6092, + "step": 470 + }, + { + "epoch": 0.05538712686567164, + "grad_norm": 0.763948541759687, + "learning_rate": 4.9996460343561184e-05, + "loss": 0.6601, + "step": 475 + }, + { + "epoch": 0.055970149253731345, + "grad_norm": 0.7370601652088519, + "learning_rate": 4.99956490590423e-05, + "loss": 0.6062, + "step": 480 + }, + { + "epoch": 0.05655317164179104, + "grad_norm": 0.6503724111090272, + "learning_rate": 4.9994754148255566e-05, + "loss": 0.597, + "step": 485 + }, + { + "epoch": 0.05713619402985075, + "grad_norm": 0.642633522142368, + "learning_rate": 4.999377561452776e-05, + "loss": 0.633, + "step": 490 + }, + { + "epoch": 0.057719216417910446, + "grad_norm": 0.6866312480811294, + "learning_rate": 4.999271346149652e-05, + "loss": 0.6421, + "step": 495 + }, + { + "epoch": 0.05830223880597015, + "grad_norm": 0.6472205796291965, + "learning_rate": 4.999156769311035e-05, + "loss": 0.615, + "step": 500 + }, + { + "epoch": 0.05888526119402985, + "grad_norm": 0.6977115669022897, + "learning_rate": 4.999033831362857e-05, + "loss": 0.6214, + "step": 505 + }, + { + "epoch": 0.059468283582089554, + "grad_norm": 0.6986043336376292, + "learning_rate": 4.998902532762132e-05, + "loss": 0.6193, + "step": 510 + }, + { + "epoch": 0.06005130597014925, + "grad_norm": 0.7475999210726344, + "learning_rate": 4.9987628739969554e-05, + "loss": 0.6224, + "step": 515 + }, + { + "epoch": 0.06063432835820896, + "grad_norm": 0.6813745517845872, + "learning_rate": 4.9986148555865016e-05, + "loss": 0.6177, + "step": 520 + }, + { + "epoch": 0.061217350746268655, + "grad_norm": 0.7015168996041015, + "learning_rate": 4.9984584780810196e-05, + "loss": 0.6768, + "step": 525 + }, + { + "epoch": 0.06180037313432836, + "grad_norm": 0.7037997780013406, + "learning_rate": 4.998293742061833e-05, + "loss": 0.6742, + "step": 530 + }, + { + "epoch": 0.06238339552238806, + "grad_norm": 0.7156012560309609, + "learning_rate": 4.998120648141338e-05, + "loss": 0.6304, + "step": 535 + }, + { + "epoch": 0.06296641791044776, + "grad_norm": 0.711127994617591, + "learning_rate": 4.997939196963004e-05, + "loss": 0.6765, + "step": 540 + }, + { + "epoch": 0.06354944029850747, + "grad_norm": 0.7367809530722819, + "learning_rate": 4.997749389201363e-05, + "loss": 0.6443, + "step": 545 + }, + { + "epoch": 0.06413246268656717, + "grad_norm": 0.7023254028972605, + "learning_rate": 4.997551225562014e-05, + "loss": 0.614, + "step": 550 + }, + { + "epoch": 0.06471548507462686, + "grad_norm": 0.7934049324407979, + "learning_rate": 4.99734470678162e-05, + "loss": 0.6804, + "step": 555 + }, + { + "epoch": 0.06529850746268656, + "grad_norm": 0.7211790595438915, + "learning_rate": 4.997129833627902e-05, + "loss": 0.6022, + "step": 560 + }, + { + "epoch": 0.06588152985074627, + "grad_norm": 0.7803510446535082, + "learning_rate": 4.996906606899639e-05, + "loss": 0.6324, + "step": 565 + }, + { + "epoch": 0.06646455223880597, + "grad_norm": 0.7268141035030203, + "learning_rate": 4.996675027426662e-05, + "loss": 0.6244, + "step": 570 + }, + { + "epoch": 0.06704757462686567, + "grad_norm": 1.937711458766374, + "learning_rate": 4.9964350960698564e-05, + "loss": 0.637, + "step": 575 + }, + { + "epoch": 0.06763059701492537, + "grad_norm": 0.6807880062940403, + "learning_rate": 4.996186813721152e-05, + "loss": 0.6009, + "step": 580 + }, + { + "epoch": 0.06821361940298508, + "grad_norm": 0.7031632269645188, + "learning_rate": 4.995930181303522e-05, + "loss": 0.6312, + "step": 585 + }, + { + "epoch": 0.06879664179104478, + "grad_norm": 0.6771751142969727, + "learning_rate": 4.995665199770986e-05, + "loss": 0.6604, + "step": 590 + }, + { + "epoch": 0.06937966417910447, + "grad_norm": 0.711794153251972, + "learning_rate": 4.995391870108595e-05, + "loss": 0.6527, + "step": 595 + }, + { + "epoch": 0.06996268656716417, + "grad_norm": 0.7336499566418994, + "learning_rate": 4.9951101933324374e-05, + "loss": 0.6056, + "step": 600 + }, + { + "epoch": 0.07054570895522388, + "grad_norm": 0.9123524114339076, + "learning_rate": 4.994820170489629e-05, + "loss": 0.6351, + "step": 605 + }, + { + "epoch": 0.07112873134328358, + "grad_norm": 0.6937351959939653, + "learning_rate": 4.9945218026583147e-05, + "loss": 0.6415, + "step": 610 + }, + { + "epoch": 0.07171175373134328, + "grad_norm": 0.6701402337273787, + "learning_rate": 4.9942150909476576e-05, + "loss": 0.616, + "step": 615 + }, + { + "epoch": 0.07229477611940298, + "grad_norm": 0.6737823946741938, + "learning_rate": 4.9939000364978424e-05, + "loss": 0.5916, + "step": 620 + }, + { + "epoch": 0.07287779850746269, + "grad_norm": 0.6443035255679863, + "learning_rate": 4.993576640480064e-05, + "loss": 0.593, + "step": 625 + }, + { + "epoch": 0.07346082089552239, + "grad_norm": 0.7194715850493604, + "learning_rate": 4.9932449040965296e-05, + "loss": 0.6537, + "step": 630 + }, + { + "epoch": 0.07404384328358209, + "grad_norm": 0.6638929998798874, + "learning_rate": 4.992904828580449e-05, + "loss": 0.651, + "step": 635 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 0.6403792391822621, + "learning_rate": 4.992556415196035e-05, + "loss": 0.5995, + "step": 640 + }, + { + "epoch": 0.0752098880597015, + "grad_norm": 0.7646937059991467, + "learning_rate": 4.9921996652384915e-05, + "loss": 0.6254, + "step": 645 + }, + { + "epoch": 0.0757929104477612, + "grad_norm": 0.708046339189841, + "learning_rate": 4.991834580034017e-05, + "loss": 0.6285, + "step": 650 + }, + { + "epoch": 0.07637593283582089, + "grad_norm": 0.6787038237176444, + "learning_rate": 4.991461160939795e-05, + "loss": 0.6089, + "step": 655 + }, + { + "epoch": 0.07695895522388059, + "grad_norm": 0.637822387437937, + "learning_rate": 4.991079409343989e-05, + "loss": 0.6339, + "step": 660 + }, + { + "epoch": 0.0775419776119403, + "grad_norm": 0.7266823799603278, + "learning_rate": 4.990689326665738e-05, + "loss": 0.601, + "step": 665 + }, + { + "epoch": 0.078125, + "grad_norm": 0.6501949889104538, + "learning_rate": 4.9902909143551516e-05, + "loss": 0.6016, + "step": 670 + }, + { + "epoch": 0.0787080223880597, + "grad_norm": 0.6820009767011616, + "learning_rate": 4.989884173893305e-05, + "loss": 0.6007, + "step": 675 + }, + { + "epoch": 0.07929104477611941, + "grad_norm": 0.6877686038902945, + "learning_rate": 4.989469106792231e-05, + "loss": 0.6136, + "step": 680 + }, + { + "epoch": 0.07987406716417911, + "grad_norm": 0.6085151630088098, + "learning_rate": 4.9890457145949186e-05, + "loss": 0.5881, + "step": 685 + }, + { + "epoch": 0.0804570895522388, + "grad_norm": 0.6214607197858061, + "learning_rate": 4.988613998875304e-05, + "loss": 0.6053, + "step": 690 + }, + { + "epoch": 0.0810401119402985, + "grad_norm": 0.6744013331308584, + "learning_rate": 4.988173961238264e-05, + "loss": 0.5802, + "step": 695 + }, + { + "epoch": 0.08162313432835822, + "grad_norm": 0.6808032691033227, + "learning_rate": 4.987725603319615e-05, + "loss": 0.6094, + "step": 700 + }, + { + "epoch": 0.08220615671641791, + "grad_norm": 0.6979985310436119, + "learning_rate": 4.987268926786098e-05, + "loss": 0.6323, + "step": 705 + }, + { + "epoch": 0.08278917910447761, + "grad_norm": 0.6226469149492844, + "learning_rate": 4.986803933335385e-05, + "loss": 0.6459, + "step": 710 + }, + { + "epoch": 0.08337220149253731, + "grad_norm": 0.676363656496179, + "learning_rate": 4.9863306246960605e-05, + "loss": 0.5761, + "step": 715 + }, + { + "epoch": 0.08395522388059702, + "grad_norm": 0.6685016854342901, + "learning_rate": 4.9858490026276226e-05, + "loss": 0.5988, + "step": 720 + }, + { + "epoch": 0.08453824626865672, + "grad_norm": 0.6396794014235626, + "learning_rate": 4.9853590689204715e-05, + "loss": 0.5854, + "step": 725 + }, + { + "epoch": 0.08512126865671642, + "grad_norm": 0.720334288248887, + "learning_rate": 4.9848608253959096e-05, + "loss": 0.6207, + "step": 730 + }, + { + "epoch": 0.08570429104477612, + "grad_norm": 0.6079861012766015, + "learning_rate": 4.984354273906127e-05, + "loss": 0.5953, + "step": 735 + }, + { + "epoch": 0.08628731343283583, + "grad_norm": 0.6782045394701461, + "learning_rate": 4.9838394163341993e-05, + "loss": 0.63, + "step": 740 + }, + { + "epoch": 0.08687033582089553, + "grad_norm": 0.6465534111571817, + "learning_rate": 4.983316254594081e-05, + "loss": 0.6006, + "step": 745 + }, + { + "epoch": 0.08745335820895522, + "grad_norm": 0.6048787945996146, + "learning_rate": 4.9827847906305934e-05, + "loss": 0.5937, + "step": 750 + }, + { + "epoch": 0.08803638059701492, + "grad_norm": 0.6704335077914664, + "learning_rate": 4.982245026419424e-05, + "loss": 0.6433, + "step": 755 + }, + { + "epoch": 0.08861940298507463, + "grad_norm": 0.6420162328599116, + "learning_rate": 4.981696963967116e-05, + "loss": 0.5806, + "step": 760 + }, + { + "epoch": 0.08920242537313433, + "grad_norm": 0.589278560052739, + "learning_rate": 4.981140605311057e-05, + "loss": 0.6049, + "step": 765 + }, + { + "epoch": 0.08978544776119403, + "grad_norm": 0.6681567808799939, + "learning_rate": 4.98057595251948e-05, + "loss": 0.6369, + "step": 770 + }, + { + "epoch": 0.09036847014925373, + "grad_norm": 0.5822554638380099, + "learning_rate": 4.980003007691449e-05, + "loss": 0.633, + "step": 775 + }, + { + "epoch": 0.09095149253731344, + "grad_norm": 0.6014893132349064, + "learning_rate": 4.979421772956852e-05, + "loss": 0.637, + "step": 780 + }, + { + "epoch": 0.09153451492537314, + "grad_norm": 0.6190152455325075, + "learning_rate": 4.9788322504763954e-05, + "loss": 0.626, + "step": 785 + }, + { + "epoch": 0.09211753731343283, + "grad_norm": 0.6043190595480243, + "learning_rate": 4.978234442441596e-05, + "loss": 0.5958, + "step": 790 + }, + { + "epoch": 0.09270055970149253, + "grad_norm": 0.7067979054927527, + "learning_rate": 4.977628351074769e-05, + "loss": 0.6391, + "step": 795 + }, + { + "epoch": 0.09328358208955224, + "grad_norm": 1.0900391421932065, + "learning_rate": 4.977013978629025e-05, + "loss": 0.5869, + "step": 800 + }, + { + "epoch": 0.09386660447761194, + "grad_norm": 0.7289575339056653, + "learning_rate": 4.976391327388257e-05, + "loss": 0.5817, + "step": 805 + }, + { + "epoch": 0.09444962686567164, + "grad_norm": 0.6772024103975534, + "learning_rate": 4.9757603996671354e-05, + "loss": 0.644, + "step": 810 + }, + { + "epoch": 0.09503264925373134, + "grad_norm": 0.6273836766329598, + "learning_rate": 4.975121197811096e-05, + "loss": 0.6343, + "step": 815 + }, + { + "epoch": 0.09561567164179105, + "grad_norm": 0.6925933519325139, + "learning_rate": 4.974473724196338e-05, + "loss": 0.6527, + "step": 820 + }, + { + "epoch": 0.09619869402985075, + "grad_norm": 0.7918262789810132, + "learning_rate": 4.973817981229802e-05, + "loss": 0.6043, + "step": 825 + }, + { + "epoch": 0.09678171641791045, + "grad_norm": 0.6915704244723462, + "learning_rate": 4.9731539713491776e-05, + "loss": 0.6101, + "step": 830 + }, + { + "epoch": 0.09736473880597014, + "grad_norm": 0.7026540645380082, + "learning_rate": 4.972481697022883e-05, + "loss": 0.6321, + "step": 835 + }, + { + "epoch": 0.09794776119402986, + "grad_norm": 0.6870682595723567, + "learning_rate": 4.971801160750057e-05, + "loss": 0.6431, + "step": 840 + }, + { + "epoch": 0.09853078358208955, + "grad_norm": 0.6762758634999102, + "learning_rate": 4.971112365060555e-05, + "loss": 0.6212, + "step": 845 + }, + { + "epoch": 0.09911380597014925, + "grad_norm": 1.0041969348534898, + "learning_rate": 4.970415312514936e-05, + "loss": 0.5934, + "step": 850 + }, + { + "epoch": 0.09969682835820895, + "grad_norm": 0.6273960276780831, + "learning_rate": 4.969710005704449e-05, + "loss": 0.6499, + "step": 855 + }, + { + "epoch": 0.10027985074626866, + "grad_norm": 0.6488940069009905, + "learning_rate": 4.9689964472510345e-05, + "loss": 0.5949, + "step": 860 + }, + { + "epoch": 0.10086287313432836, + "grad_norm": 0.763231378289182, + "learning_rate": 4.968274639807304e-05, + "loss": 0.6146, + "step": 865 + }, + { + "epoch": 0.10144589552238806, + "grad_norm": 0.6658920295533095, + "learning_rate": 4.967544586056532e-05, + "loss": 0.6049, + "step": 870 + }, + { + "epoch": 0.10202891791044776, + "grad_norm": 0.7074390644632157, + "learning_rate": 4.966806288712654e-05, + "loss": 0.6175, + "step": 875 + }, + { + "epoch": 0.10261194029850747, + "grad_norm": 0.695343426907801, + "learning_rate": 4.966059750520246e-05, + "loss": 0.591, + "step": 880 + }, + { + "epoch": 0.10319496268656717, + "grad_norm": 0.6418320757008029, + "learning_rate": 4.965304974254521e-05, + "loss": 0.6176, + "step": 885 + }, + { + "epoch": 0.10377798507462686, + "grad_norm": 0.7727923536862525, + "learning_rate": 4.9645419627213155e-05, + "loss": 0.6134, + "step": 890 + }, + { + "epoch": 0.10436100746268656, + "grad_norm": 0.6029656422873543, + "learning_rate": 4.96377071875708e-05, + "loss": 0.5654, + "step": 895 + }, + { + "epoch": 0.10494402985074627, + "grad_norm": 0.6358118150755105, + "learning_rate": 4.9629912452288696e-05, + "loss": 0.6252, + "step": 900 + }, + { + "epoch": 0.10552705223880597, + "grad_norm": 0.61344386800639, + "learning_rate": 4.962203545034332e-05, + "loss": 0.6059, + "step": 905 + }, + { + "epoch": 0.10611007462686567, + "grad_norm": 0.6135409466759565, + "learning_rate": 4.961407621101697e-05, + "loss": 0.5614, + "step": 910 + }, + { + "epoch": 0.10669309701492537, + "grad_norm": 1.0821214504665195, + "learning_rate": 4.960603476389765e-05, + "loss": 0.6162, + "step": 915 + }, + { + "epoch": 0.10727611940298508, + "grad_norm": 0.5759584165021776, + "learning_rate": 4.959791113887898e-05, + "loss": 0.6055, + "step": 920 + }, + { + "epoch": 0.10785914179104478, + "grad_norm": 0.5799139948464339, + "learning_rate": 4.958970536616006e-05, + "loss": 0.6067, + "step": 925 + }, + { + "epoch": 0.10844216417910447, + "grad_norm": 0.6049902178680088, + "learning_rate": 4.9581417476245365e-05, + "loss": 0.5808, + "step": 930 + }, + { + "epoch": 0.10902518656716417, + "grad_norm": 0.6216812883236675, + "learning_rate": 4.957304749994465e-05, + "loss": 0.5644, + "step": 935 + }, + { + "epoch": 0.10960820895522388, + "grad_norm": 0.6264774945250813, + "learning_rate": 4.956459546837283e-05, + "loss": 0.5889, + "step": 940 + }, + { + "epoch": 0.11019123134328358, + "grad_norm": 0.7584079121183279, + "learning_rate": 4.955606141294982e-05, + "loss": 0.6662, + "step": 945 + }, + { + "epoch": 0.11077425373134328, + "grad_norm": 0.6021040722392922, + "learning_rate": 4.954744536540048e-05, + "loss": 0.6075, + "step": 950 + }, + { + "epoch": 0.11135727611940298, + "grad_norm": 0.6516825177813871, + "learning_rate": 4.953874735775448e-05, + "loss": 0.6163, + "step": 955 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 0.6483316850868462, + "learning_rate": 4.9529967422346137e-05, + "loss": 0.6388, + "step": 960 + }, + { + "epoch": 0.11252332089552239, + "grad_norm": 1.0296359523919179, + "learning_rate": 4.952110559181437e-05, + "loss": 0.6187, + "step": 965 + }, + { + "epoch": 0.11310634328358209, + "grad_norm": 0.5963874485076113, + "learning_rate": 4.95121618991025e-05, + "loss": 0.653, + "step": 970 + }, + { + "epoch": 0.11368936567164178, + "grad_norm": 0.5827112927403142, + "learning_rate": 4.950313637745819e-05, + "loss": 0.6013, + "step": 975 + }, + { + "epoch": 0.1142723880597015, + "grad_norm": 0.5978368012555301, + "learning_rate": 4.9494029060433304e-05, + "loss": 0.5854, + "step": 980 + }, + { + "epoch": 0.1148554104477612, + "grad_norm": 0.6061921826097367, + "learning_rate": 4.9484839981883755e-05, + "loss": 0.6042, + "step": 985 + }, + { + "epoch": 0.11543843283582089, + "grad_norm": 0.577591628346454, + "learning_rate": 4.9475569175969414e-05, + "loss": 0.6033, + "step": 990 + }, + { + "epoch": 0.11602145522388059, + "grad_norm": 0.578876456470585, + "learning_rate": 4.9466216677153945e-05, + "loss": 0.5762, + "step": 995 + }, + { + "epoch": 0.1166044776119403, + "grad_norm": 0.6111711530967577, + "learning_rate": 4.945678252020475e-05, + "loss": 0.6078, + "step": 1000 + }, + { + "epoch": 0.1171875, + "grad_norm": 0.5389571001747652, + "learning_rate": 4.9447266740192735e-05, + "loss": 0.586, + "step": 1005 + }, + { + "epoch": 0.1177705223880597, + "grad_norm": 0.5665361736437953, + "learning_rate": 4.943766937249226e-05, + "loss": 0.5896, + "step": 1010 + }, + { + "epoch": 0.11835354477611941, + "grad_norm": 0.6004663875659557, + "learning_rate": 4.942799045278099e-05, + "loss": 0.5931, + "step": 1015 + }, + { + "epoch": 0.11893656716417911, + "grad_norm": 0.5877689595746617, + "learning_rate": 4.941823001703974e-05, + "loss": 0.5985, + "step": 1020 + }, + { + "epoch": 0.1195195895522388, + "grad_norm": 0.5935807095087586, + "learning_rate": 4.940838810155237e-05, + "loss": 0.5991, + "step": 1025 + }, + { + "epoch": 0.1201026119402985, + "grad_norm": 0.6656506474951656, + "learning_rate": 4.939846474290563e-05, + "loss": 0.6148, + "step": 1030 + }, + { + "epoch": 0.12068563432835822, + "grad_norm": 0.6400226054588848, + "learning_rate": 4.9388459977989016e-05, + "loss": 0.6035, + "step": 1035 + }, + { + "epoch": 0.12126865671641791, + "grad_norm": 0.5682054111741724, + "learning_rate": 4.937837384399467e-05, + "loss": 0.5942, + "step": 1040 + }, + { + "epoch": 0.12185167910447761, + "grad_norm": 0.5722092103161287, + "learning_rate": 4.936820637841721e-05, + "loss": 0.6297, + "step": 1045 + }, + { + "epoch": 0.12243470149253731, + "grad_norm": 0.5744583767360888, + "learning_rate": 4.935795761905359e-05, + "loss": 0.5799, + "step": 1050 + }, + { + "epoch": 0.12301772388059702, + "grad_norm": 0.5955258224599197, + "learning_rate": 4.934762760400299e-05, + "loss": 0.5901, + "step": 1055 + }, + { + "epoch": 0.12360074626865672, + "grad_norm": 0.5941450778138045, + "learning_rate": 4.933721637166662e-05, + "loss": 0.5932, + "step": 1060 + }, + { + "epoch": 0.12418376865671642, + "grad_norm": 0.6984953623677035, + "learning_rate": 4.9326723960747655e-05, + "loss": 0.5928, + "step": 1065 + }, + { + "epoch": 0.12476679104477612, + "grad_norm": 0.5556738969991728, + "learning_rate": 4.931615041025101e-05, + "loss": 0.5449, + "step": 1070 + }, + { + "epoch": 0.12534981343283583, + "grad_norm": 0.6202875202064847, + "learning_rate": 4.9305495759483246e-05, + "loss": 0.6245, + "step": 1075 + }, + { + "epoch": 0.1259328358208955, + "grad_norm": 0.6360648582792375, + "learning_rate": 4.929476004805241e-05, + "loss": 0.622, + "step": 1080 + }, + { + "epoch": 0.12651585820895522, + "grad_norm": 0.548961800240671, + "learning_rate": 4.928394331586788e-05, + "loss": 0.5878, + "step": 1085 + }, + { + "epoch": 0.12709888059701493, + "grad_norm": 0.6534887664001585, + "learning_rate": 4.927304560314023e-05, + "loss": 0.5985, + "step": 1090 + }, + { + "epoch": 0.12768190298507462, + "grad_norm": 0.6051258985285789, + "learning_rate": 4.9262066950381074e-05, + "loss": 0.5815, + "step": 1095 + }, + { + "epoch": 0.12826492537313433, + "grad_norm": 0.5940919877769313, + "learning_rate": 4.925100739840293e-05, + "loss": 0.6056, + "step": 1100 + }, + { + "epoch": 0.12884794776119404, + "grad_norm": 0.5795416953387396, + "learning_rate": 4.923986698831902e-05, + "loss": 0.593, + "step": 1105 + }, + { + "epoch": 0.12943097014925373, + "grad_norm": 0.6060003300391443, + "learning_rate": 4.922864576154318e-05, + "loss": 0.6293, + "step": 1110 + }, + { + "epoch": 0.13001399253731344, + "grad_norm": 0.5926697361708111, + "learning_rate": 4.921734375978966e-05, + "loss": 0.5863, + "step": 1115 + }, + { + "epoch": 0.13059701492537312, + "grad_norm": 0.5666399578950052, + "learning_rate": 4.9205961025073005e-05, + "loss": 0.6093, + "step": 1120 + }, + { + "epoch": 0.13118003731343283, + "grad_norm": 0.6290080779373148, + "learning_rate": 4.919449759970787e-05, + "loss": 0.6149, + "step": 1125 + }, + { + "epoch": 0.13176305970149255, + "grad_norm": 0.6030059289618742, + "learning_rate": 4.9182953526308866e-05, + "loss": 0.5983, + "step": 1130 + }, + { + "epoch": 0.13234608208955223, + "grad_norm": 0.6967548151886129, + "learning_rate": 4.9171328847790416e-05, + "loss": 0.5979, + "step": 1135 + }, + { + "epoch": 0.13292910447761194, + "grad_norm": 0.6568770405720038, + "learning_rate": 4.9159623607366587e-05, + "loss": 0.5857, + "step": 1140 + }, + { + "epoch": 0.13351212686567165, + "grad_norm": 0.5840444295366739, + "learning_rate": 4.914783784855093e-05, + "loss": 0.6438, + "step": 1145 + }, + { + "epoch": 0.13409514925373134, + "grad_norm": 0.7004511399791347, + "learning_rate": 4.913597161515633e-05, + "loss": 0.6184, + "step": 1150 + }, + { + "epoch": 0.13467817164179105, + "grad_norm": 0.550373143569304, + "learning_rate": 4.91240249512948e-05, + "loss": 0.567, + "step": 1155 + }, + { + "epoch": 0.13526119402985073, + "grad_norm": 0.5313433759195957, + "learning_rate": 4.9111997901377373e-05, + "loss": 0.5855, + "step": 1160 + }, + { + "epoch": 0.13584421641791045, + "grad_norm": 0.5655705667845206, + "learning_rate": 4.9099890510113924e-05, + "loss": 0.5661, + "step": 1165 + }, + { + "epoch": 0.13642723880597016, + "grad_norm": 0.6195584641586247, + "learning_rate": 4.908770282251296e-05, + "loss": 0.5905, + "step": 1170 + }, + { + "epoch": 0.13701026119402984, + "grad_norm": 0.5583698659457814, + "learning_rate": 4.9075434883881504e-05, + "loss": 0.6002, + "step": 1175 + }, + { + "epoch": 0.13759328358208955, + "grad_norm": 0.6389070121403235, + "learning_rate": 4.906308673982491e-05, + "loss": 0.5607, + "step": 1180 + }, + { + "epoch": 0.13817630597014927, + "grad_norm": 0.59774495236772, + "learning_rate": 4.905065843624668e-05, + "loss": 0.5678, + "step": 1185 + }, + { + "epoch": 0.13875932835820895, + "grad_norm": 0.6173804758323792, + "learning_rate": 4.903815001934832e-05, + "loss": 0.6002, + "step": 1190 + }, + { + "epoch": 0.13934235074626866, + "grad_norm": 0.5469869287414887, + "learning_rate": 4.9025561535629125e-05, + "loss": 0.5977, + "step": 1195 + }, + { + "epoch": 0.13992537313432835, + "grad_norm": 0.5624438700425504, + "learning_rate": 4.9012893031886075e-05, + "loss": 0.582, + "step": 1200 + }, + { + "epoch": 0.14050839552238806, + "grad_norm": 0.5985379982120503, + "learning_rate": 4.9000144555213575e-05, + "loss": 0.5825, + "step": 1205 + }, + { + "epoch": 0.14109141791044777, + "grad_norm": 0.5800377004502044, + "learning_rate": 4.898731615300336e-05, + "loss": 0.5706, + "step": 1210 + }, + { + "epoch": 0.14167444029850745, + "grad_norm": 0.5901666286652832, + "learning_rate": 4.8974407872944263e-05, + "loss": 0.5937, + "step": 1215 + }, + { + "epoch": 0.14225746268656717, + "grad_norm": 0.644009126842579, + "learning_rate": 4.8961419763022065e-05, + "loss": 0.5612, + "step": 1220 + }, + { + "epoch": 0.14284048507462688, + "grad_norm": 0.5922807627602008, + "learning_rate": 4.894835187151931e-05, + "loss": 0.6067, + "step": 1225 + }, + { + "epoch": 0.14342350746268656, + "grad_norm": 0.5722199265482402, + "learning_rate": 4.893520424701513e-05, + "loss": 0.6082, + "step": 1230 + }, + { + "epoch": 0.14400652985074627, + "grad_norm": 0.6126570623385963, + "learning_rate": 4.892197693838504e-05, + "loss": 0.5964, + "step": 1235 + }, + { + "epoch": 0.14458955223880596, + "grad_norm": 0.5973655677327272, + "learning_rate": 4.890866999480082e-05, + "loss": 0.5918, + "step": 1240 + }, + { + "epoch": 0.14517257462686567, + "grad_norm": 0.54393899856667, + "learning_rate": 4.889528346573023e-05, + "loss": 0.5827, + "step": 1245 + }, + { + "epoch": 0.14575559701492538, + "grad_norm": 0.5508599028743483, + "learning_rate": 4.888181740093693e-05, + "loss": 0.5966, + "step": 1250 + }, + { + "epoch": 0.14633861940298507, + "grad_norm": 0.5748810947596694, + "learning_rate": 4.886827185048023e-05, + "loss": 0.6069, + "step": 1255 + }, + { + "epoch": 0.14692164179104478, + "grad_norm": 0.5843720987519714, + "learning_rate": 4.8854646864714906e-05, + "loss": 0.626, + "step": 1260 + }, + { + "epoch": 0.1475046641791045, + "grad_norm": 0.5574624014831214, + "learning_rate": 4.884094249429109e-05, + "loss": 0.5513, + "step": 1265 + }, + { + "epoch": 0.14808768656716417, + "grad_norm": 0.6341906909347658, + "learning_rate": 4.882715879015396e-05, + "loss": 0.597, + "step": 1270 + }, + { + "epoch": 0.14867070895522388, + "grad_norm": 0.575813430423846, + "learning_rate": 4.881329580354363e-05, + "loss": 0.6081, + "step": 1275 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.6445668774873804, + "learning_rate": 4.8799353585994954e-05, + "loss": 0.6087, + "step": 1280 + }, + { + "epoch": 0.14983675373134328, + "grad_norm": 0.5859378252774211, + "learning_rate": 4.8785332189337325e-05, + "loss": 0.5602, + "step": 1285 + }, + { + "epoch": 0.150419776119403, + "grad_norm": 0.6268377383651598, + "learning_rate": 4.877123166569445e-05, + "loss": 0.621, + "step": 1290 + }, + { + "epoch": 0.15100279850746268, + "grad_norm": 0.539111443847871, + "learning_rate": 4.8757052067484216e-05, + "loss": 0.5778, + "step": 1295 + }, + { + "epoch": 0.1515858208955224, + "grad_norm": 0.6230886477061637, + "learning_rate": 4.874279344741846e-05, + "loss": 0.6052, + "step": 1300 + }, + { + "epoch": 0.1521688432835821, + "grad_norm": 0.5804714929817124, + "learning_rate": 4.872845585850277e-05, + "loss": 0.5704, + "step": 1305 + }, + { + "epoch": 0.15275186567164178, + "grad_norm": 0.5644557466436395, + "learning_rate": 4.87140393540363e-05, + "loss": 0.6135, + "step": 1310 + }, + { + "epoch": 0.1533348880597015, + "grad_norm": 0.5979238448583827, + "learning_rate": 4.869954398761156e-05, + "loss": 0.5891, + "step": 1315 + }, + { + "epoch": 0.15391791044776118, + "grad_norm": 0.5684870674237956, + "learning_rate": 4.868496981311424e-05, + "loss": 0.6057, + "step": 1320 + }, + { + "epoch": 0.1545009328358209, + "grad_norm": 0.6084433820062218, + "learning_rate": 4.8670316884722984e-05, + "loss": 0.6183, + "step": 1325 + }, + { + "epoch": 0.1550839552238806, + "grad_norm": 0.5168917985718466, + "learning_rate": 4.86555852569092e-05, + "loss": 0.558, + "step": 1330 + }, + { + "epoch": 0.1556669776119403, + "grad_norm": 0.6169882009143579, + "learning_rate": 4.864077498443687e-05, + "loss": 0.5894, + "step": 1335 + }, + { + "epoch": 0.15625, + "grad_norm": 0.5573154647203309, + "learning_rate": 4.8625886122362305e-05, + "loss": 0.6047, + "step": 1340 + }, + { + "epoch": 0.1568330223880597, + "grad_norm": 0.5430765223123376, + "learning_rate": 4.861091872603399e-05, + "loss": 0.5936, + "step": 1345 + }, + { + "epoch": 0.1574160447761194, + "grad_norm": 0.5952512138917507, + "learning_rate": 4.859587285109235e-05, + "loss": 0.6323, + "step": 1350 + }, + { + "epoch": 0.1579990671641791, + "grad_norm": 0.6257733920419736, + "learning_rate": 4.8580748553469554e-05, + "loss": 0.621, + "step": 1355 + }, + { + "epoch": 0.15858208955223882, + "grad_norm": 0.6484544777362733, + "learning_rate": 4.8565545889389295e-05, + "loss": 0.6225, + "step": 1360 + }, + { + "epoch": 0.1591651119402985, + "grad_norm": 0.6001757890053516, + "learning_rate": 4.85502649153666e-05, + "loss": 0.7485, + "step": 1365 + }, + { + "epoch": 0.15974813432835822, + "grad_norm": 0.661835508508043, + "learning_rate": 4.853490568820759e-05, + "loss": 0.8953, + "step": 1370 + }, + { + "epoch": 0.1603311567164179, + "grad_norm": 0.5424000700147391, + "learning_rate": 4.851946826500932e-05, + "loss": 0.6088, + "step": 1375 + }, + { + "epoch": 0.1609141791044776, + "grad_norm": 0.5642506132746299, + "learning_rate": 4.8503952703159485e-05, + "loss": 0.5788, + "step": 1380 + }, + { + "epoch": 0.16149720149253732, + "grad_norm": 1.5770888453223382, + "learning_rate": 4.8488359060336314e-05, + "loss": 0.6464, + "step": 1385 + }, + { + "epoch": 0.162080223880597, + "grad_norm": 0.5798567582108715, + "learning_rate": 4.847268739450825e-05, + "loss": 0.5962, + "step": 1390 + }, + { + "epoch": 0.16266324626865672, + "grad_norm": 0.6247134338348835, + "learning_rate": 4.84569377639338e-05, + "loss": 0.5762, + "step": 1395 + }, + { + "epoch": 0.16324626865671643, + "grad_norm": 0.6403693981471119, + "learning_rate": 4.84411102271613e-05, + "loss": 0.6033, + "step": 1400 + }, + { + "epoch": 0.16382929104477612, + "grad_norm": 0.6328461284086258, + "learning_rate": 4.842520484302871e-05, + "loss": 0.6124, + "step": 1405 + }, + { + "epoch": 0.16441231343283583, + "grad_norm": 0.5690467644511553, + "learning_rate": 4.840922167066335e-05, + "loss": 0.6058, + "step": 1410 + }, + { + "epoch": 0.1649953358208955, + "grad_norm": 0.5991271256192877, + "learning_rate": 4.8393160769481755e-05, + "loss": 0.5759, + "step": 1415 + }, + { + "epoch": 0.16557835820895522, + "grad_norm": 0.5518688454925015, + "learning_rate": 4.8377022199189374e-05, + "loss": 0.5758, + "step": 1420 + }, + { + "epoch": 0.16616138059701493, + "grad_norm": 0.8358279547212623, + "learning_rate": 4.836080601978043e-05, + "loss": 0.614, + "step": 1425 + }, + { + "epoch": 0.16674440298507462, + "grad_norm": 0.5736501078455153, + "learning_rate": 4.83445122915376e-05, + "loss": 0.6268, + "step": 1430 + }, + { + "epoch": 0.16732742537313433, + "grad_norm": 0.501139191324822, + "learning_rate": 4.832814107503188e-05, + "loss": 0.5522, + "step": 1435 + }, + { + "epoch": 0.16791044776119404, + "grad_norm": 0.5414270989894118, + "learning_rate": 4.8311692431122326e-05, + "loss": 0.5985, + "step": 1440 + }, + { + "epoch": 0.16849347014925373, + "grad_norm": 0.5503417116648404, + "learning_rate": 4.82951664209558e-05, + "loss": 0.5883, + "step": 1445 + }, + { + "epoch": 0.16907649253731344, + "grad_norm": 0.5828254158920338, + "learning_rate": 4.82785631059668e-05, + "loss": 0.5973, + "step": 1450 + }, + { + "epoch": 0.16965951492537312, + "grad_norm": 0.5750003990997431, + "learning_rate": 4.826188254787717e-05, + "loss": 0.5929, + "step": 1455 + }, + { + "epoch": 0.17024253731343283, + "grad_norm": 0.5931754407423996, + "learning_rate": 4.824512480869593e-05, + "loss": 0.584, + "step": 1460 + }, + { + "epoch": 0.17082555970149255, + "grad_norm": 0.5457054499115934, + "learning_rate": 4.822828995071899e-05, + "loss": 0.5711, + "step": 1465 + }, + { + "epoch": 0.17140858208955223, + "grad_norm": 0.5229569450197525, + "learning_rate": 4.821137803652896e-05, + "loss": 0.5459, + "step": 1470 + }, + { + "epoch": 0.17199160447761194, + "grad_norm": 0.6027439958371525, + "learning_rate": 4.819438912899489e-05, + "loss": 0.5814, + "step": 1475 + }, + { + "epoch": 0.17257462686567165, + "grad_norm": 0.5702069062273913, + "learning_rate": 4.8177323291272066e-05, + "loss": 0.6299, + "step": 1480 + }, + { + "epoch": 0.17315764925373134, + "grad_norm": 0.5780484151492952, + "learning_rate": 4.8160180586801744e-05, + "loss": 0.5777, + "step": 1485 + }, + { + "epoch": 0.17374067164179105, + "grad_norm": 0.575917162342149, + "learning_rate": 4.814296107931093e-05, + "loss": 0.5547, + "step": 1490 + }, + { + "epoch": 0.17432369402985073, + "grad_norm": 0.5192602462313004, + "learning_rate": 4.812566483281216e-05, + "loss": 0.5669, + "step": 1495 + }, + { + "epoch": 0.17490671641791045, + "grad_norm": 0.5345650171129616, + "learning_rate": 4.81082919116032e-05, + "loss": 0.5917, + "step": 1500 + }, + { + "epoch": 0.17548973880597016, + "grad_norm": 0.5475736070312467, + "learning_rate": 4.809084238026689e-05, + "loss": 0.602, + "step": 1505 + }, + { + "epoch": 0.17607276119402984, + "grad_norm": 0.5786621619994619, + "learning_rate": 4.8073316303670835e-05, + "loss": 0.5861, + "step": 1510 + }, + { + "epoch": 0.17665578358208955, + "grad_norm": 0.5448850237456587, + "learning_rate": 4.8055713746967216e-05, + "loss": 0.5857, + "step": 1515 + }, + { + "epoch": 0.17723880597014927, + "grad_norm": 0.5779873112523268, + "learning_rate": 4.803803477559252e-05, + "loss": 0.5665, + "step": 1520 + }, + { + "epoch": 0.17782182835820895, + "grad_norm": 0.5652601574196091, + "learning_rate": 4.8020279455267274e-05, + "loss": 0.5794, + "step": 1525 + }, + { + "epoch": 0.17840485074626866, + "grad_norm": 0.5437626429311837, + "learning_rate": 4.800244785199588e-05, + "loss": 0.547, + "step": 1530 + }, + { + "epoch": 0.17898787313432835, + "grad_norm": 0.5657663832577134, + "learning_rate": 4.7984540032066266e-05, + "loss": 0.5645, + "step": 1535 + }, + { + "epoch": 0.17957089552238806, + "grad_norm": 0.5396084565585528, + "learning_rate": 4.796655606204971e-05, + "loss": 0.5917, + "step": 1540 + }, + { + "epoch": 0.18015391791044777, + "grad_norm": 0.5659614973871961, + "learning_rate": 4.794849600880059e-05, + "loss": 0.5888, + "step": 1545 + }, + { + "epoch": 0.18073694029850745, + "grad_norm": 0.610344463972923, + "learning_rate": 4.793035993945609e-05, + "loss": 0.5973, + "step": 1550 + }, + { + "epoch": 0.18131996268656717, + "grad_norm": 0.5315364790581317, + "learning_rate": 4.7912147921436e-05, + "loss": 0.5793, + "step": 1555 + }, + { + "epoch": 0.18190298507462688, + "grad_norm": 0.5215816759572497, + "learning_rate": 4.789386002244244e-05, + "loss": 0.5773, + "step": 1560 + }, + { + "epoch": 0.18248600746268656, + "grad_norm": 0.48268753752530635, + "learning_rate": 4.7875496310459607e-05, + "loss": 0.5439, + "step": 1565 + }, + { + "epoch": 0.18306902985074627, + "grad_norm": 0.5928345158273391, + "learning_rate": 4.7857056853753536e-05, + "loss": 0.5946, + "step": 1570 + }, + { + "epoch": 0.18365205223880596, + "grad_norm": 0.5217322463974674, + "learning_rate": 4.783854172087183e-05, + "loss": 0.5633, + "step": 1575 + }, + { + "epoch": 0.18423507462686567, + "grad_norm": 0.5361438235075061, + "learning_rate": 4.781995098064343e-05, + "loss": 0.5616, + "step": 1580 + }, + { + "epoch": 0.18481809701492538, + "grad_norm": 0.5319409918375809, + "learning_rate": 4.780128470217833e-05, + "loss": 0.5959, + "step": 1585 + }, + { + "epoch": 0.18540111940298507, + "grad_norm": 0.5656220383824416, + "learning_rate": 4.778254295486732e-05, + "loss": 0.5941, + "step": 1590 + }, + { + "epoch": 0.18598414179104478, + "grad_norm": 0.664107972745298, + "learning_rate": 4.7763725808381777e-05, + "loss": 0.5932, + "step": 1595 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 0.5594876020852717, + "learning_rate": 4.7744833332673336e-05, + "loss": 0.5754, + "step": 1600 + }, + { + "epoch": 0.18715018656716417, + "grad_norm": 0.5748188546122887, + "learning_rate": 4.7725865597973684e-05, + "loss": 0.5929, + "step": 1605 + }, + { + "epoch": 0.18773320895522388, + "grad_norm": 0.5608090588962066, + "learning_rate": 4.770682267479427e-05, + "loss": 0.6, + "step": 1610 + }, + { + "epoch": 0.18831623134328357, + "grad_norm": 0.5614561936493092, + "learning_rate": 4.7687704633926056e-05, + "loss": 0.5885, + "step": 1615 + }, + { + "epoch": 0.18889925373134328, + "grad_norm": 0.5020575112535327, + "learning_rate": 4.766851154643924e-05, + "loss": 0.5766, + "step": 1620 + }, + { + "epoch": 0.189482276119403, + "grad_norm": 0.5350491170633971, + "learning_rate": 4.7649243483683015e-05, + "loss": 0.6016, + "step": 1625 + }, + { + "epoch": 0.19006529850746268, + "grad_norm": 0.5723463466212365, + "learning_rate": 4.762990051728529e-05, + "loss": 0.5938, + "step": 1630 + }, + { + "epoch": 0.1906483208955224, + "grad_norm": 0.528038155312228, + "learning_rate": 4.7610482719152404e-05, + "loss": 0.5919, + "step": 1635 + }, + { + "epoch": 0.1912313432835821, + "grad_norm": 0.5195292947087375, + "learning_rate": 4.7590990161468906e-05, + "loss": 0.587, + "step": 1640 + }, + { + "epoch": 0.19181436567164178, + "grad_norm": 0.5309411178378524, + "learning_rate": 4.757142291669724e-05, + "loss": 0.5584, + "step": 1645 + }, + { + "epoch": 0.1923973880597015, + "grad_norm": 0.5235366705843203, + "learning_rate": 4.755178105757751e-05, + "loss": 0.5332, + "step": 1650 + }, + { + "epoch": 0.19298041044776118, + "grad_norm": 0.5662388478996405, + "learning_rate": 4.753206465712717e-05, + "loss": 0.6003, + "step": 1655 + }, + { + "epoch": 0.1935634328358209, + "grad_norm": 0.541515776380167, + "learning_rate": 4.751227378864081e-05, + "loss": 0.6167, + "step": 1660 + }, + { + "epoch": 0.1941464552238806, + "grad_norm": 0.5451263614433529, + "learning_rate": 4.749240852568981e-05, + "loss": 0.5795, + "step": 1665 + }, + { + "epoch": 0.1947294776119403, + "grad_norm": 0.5539737195153389, + "learning_rate": 4.747246894212216e-05, + "loss": 0.6156, + "step": 1670 + }, + { + "epoch": 0.1953125, + "grad_norm": 0.4838135257147904, + "learning_rate": 4.7452455112062076e-05, + "loss": 0.586, + "step": 1675 + }, + { + "epoch": 0.1958955223880597, + "grad_norm": 0.6206912586835625, + "learning_rate": 4.743236710990982e-05, + "loss": 0.5835, + "step": 1680 + }, + { + "epoch": 0.1964785447761194, + "grad_norm": 0.5575344781989854, + "learning_rate": 4.7412205010341385e-05, + "loss": 0.5615, + "step": 1685 + }, + { + "epoch": 0.1970615671641791, + "grad_norm": 0.5465760234872273, + "learning_rate": 4.739196888830818e-05, + "loss": 0.5614, + "step": 1690 + }, + { + "epoch": 0.19764458955223882, + "grad_norm": 0.6116107229280592, + "learning_rate": 4.737165881903683e-05, + "loss": 0.5777, + "step": 1695 + }, + { + "epoch": 0.1982276119402985, + "grad_norm": 0.526352835776055, + "learning_rate": 4.735127487802882e-05, + "loss": 0.5499, + "step": 1700 + }, + { + "epoch": 0.19881063432835822, + "grad_norm": 0.5790768202270913, + "learning_rate": 4.7330817141060284e-05, + "loss": 0.6062, + "step": 1705 + }, + { + "epoch": 0.1993936567164179, + "grad_norm": 0.5786125838532169, + "learning_rate": 4.731028568418167e-05, + "loss": 0.5853, + "step": 1710 + }, + { + "epoch": 0.1999766791044776, + "grad_norm": 0.5220981929969758, + "learning_rate": 4.728968058371746e-05, + "loss": 0.5917, + "step": 1715 + }, + { + "epoch": 0.20055970149253732, + "grad_norm": 0.5962771524705726, + "learning_rate": 4.726900191626592e-05, + "loss": 0.615, + "step": 1720 + }, + { + "epoch": 0.201142723880597, + "grad_norm": 0.614179304629615, + "learning_rate": 4.724824975869881e-05, + "loss": 0.6088, + "step": 1725 + }, + { + "epoch": 0.20172574626865672, + "grad_norm": 0.5339809717905718, + "learning_rate": 4.722742418816106e-05, + "loss": 0.5856, + "step": 1730 + }, + { + "epoch": 0.20230876865671643, + "grad_norm": 0.5775559838523747, + "learning_rate": 4.7206525282070514e-05, + "loss": 0.5525, + "step": 1735 + }, + { + "epoch": 0.20289179104477612, + "grad_norm": 0.5787127160424809, + "learning_rate": 4.718555311811764e-05, + "loss": 0.5889, + "step": 1740 + }, + { + "epoch": 0.20347481343283583, + "grad_norm": 0.5507748006986873, + "learning_rate": 4.716450777426525e-05, + "loss": 0.5811, + "step": 1745 + }, + { + "epoch": 0.2040578358208955, + "grad_norm": 0.5386322774377927, + "learning_rate": 4.7143389328748174e-05, + "loss": 0.5681, + "step": 1750 + }, + { + "epoch": 0.20464085820895522, + "grad_norm": 0.5253847435139612, + "learning_rate": 4.712219786007302e-05, + "loss": 0.5352, + "step": 1755 + }, + { + "epoch": 0.20522388059701493, + "grad_norm": 0.5229542272956548, + "learning_rate": 4.710093344701782e-05, + "loss": 0.5878, + "step": 1760 + }, + { + "epoch": 0.20580690298507462, + "grad_norm": 0.5347624824538455, + "learning_rate": 4.707959616863181e-05, + "loss": 0.5622, + "step": 1765 + }, + { + "epoch": 0.20638992537313433, + "grad_norm": 0.5561607185138324, + "learning_rate": 4.7058186104235086e-05, + "loss": 0.5797, + "step": 1770 + }, + { + "epoch": 0.20697294776119404, + "grad_norm": 0.4892165988190017, + "learning_rate": 4.70367033334183e-05, + "loss": 0.5403, + "step": 1775 + }, + { + "epoch": 0.20755597014925373, + "grad_norm": 0.5510594553982379, + "learning_rate": 4.701514793604242e-05, + "loss": 0.5559, + "step": 1780 + }, + { + "epoch": 0.20813899253731344, + "grad_norm": 0.5473696320978395, + "learning_rate": 4.699351999223838e-05, + "loss": 0.5753, + "step": 1785 + }, + { + "epoch": 0.20872201492537312, + "grad_norm": 0.4873353162223706, + "learning_rate": 4.697181958240679e-05, + "loss": 0.5492, + "step": 1790 + }, + { + "epoch": 0.20930503731343283, + "grad_norm": 0.5064007191967134, + "learning_rate": 4.695004678721768e-05, + "loss": 0.5858, + "step": 1795 + }, + { + "epoch": 0.20988805970149255, + "grad_norm": 0.5667430033926578, + "learning_rate": 4.692820168761014e-05, + "loss": 0.6062, + "step": 1800 + }, + { + "epoch": 0.21047108208955223, + "grad_norm": 0.5212510033253821, + "learning_rate": 4.690628436479206e-05, + "loss": 0.5598, + "step": 1805 + }, + { + "epoch": 0.21105410447761194, + "grad_norm": 0.5588622002334643, + "learning_rate": 4.688429490023982e-05, + "loss": 0.5763, + "step": 1810 + }, + { + "epoch": 0.21163712686567165, + "grad_norm": 0.6512372849264118, + "learning_rate": 4.6862233375697964e-05, + "loss": 0.5808, + "step": 1815 + }, + { + "epoch": 0.21222014925373134, + "grad_norm": 0.600056817158455, + "learning_rate": 4.684009987317894e-05, + "loss": 0.5929, + "step": 1820 + }, + { + "epoch": 0.21280317164179105, + "grad_norm": 0.5525047925178632, + "learning_rate": 4.6817894474962756e-05, + "loss": 0.5598, + "step": 1825 + }, + { + "epoch": 0.21338619402985073, + "grad_norm": 0.49741821795163615, + "learning_rate": 4.679561726359668e-05, + "loss": 0.5689, + "step": 1830 + }, + { + "epoch": 0.21396921641791045, + "grad_norm": 0.5369544968462594, + "learning_rate": 4.677326832189496e-05, + "loss": 0.5846, + "step": 1835 + }, + { + "epoch": 0.21455223880597016, + "grad_norm": 0.578124984252202, + "learning_rate": 4.675084773293848e-05, + "loss": 0.6068, + "step": 1840 + }, + { + "epoch": 0.21513526119402984, + "grad_norm": 0.5489389574421355, + "learning_rate": 4.6728355580074476e-05, + "loss": 0.5799, + "step": 1845 + }, + { + "epoch": 0.21571828358208955, + "grad_norm": 0.5492579167575462, + "learning_rate": 4.6705791946916236e-05, + "loss": 0.5969, + "step": 1850 + }, + { + "epoch": 0.21630130597014927, + "grad_norm": 0.4976854096096666, + "learning_rate": 4.6683156917342726e-05, + "loss": 0.6034, + "step": 1855 + }, + { + "epoch": 0.21688432835820895, + "grad_norm": 0.600940601658767, + "learning_rate": 4.666045057549838e-05, + "loss": 0.5946, + "step": 1860 + }, + { + "epoch": 0.21746735074626866, + "grad_norm": 0.5506775932621145, + "learning_rate": 4.663767300579268e-05, + "loss": 0.5847, + "step": 1865 + }, + { + "epoch": 0.21805037313432835, + "grad_norm": 0.5271315191050433, + "learning_rate": 4.661482429289994e-05, + "loss": 0.5662, + "step": 1870 + }, + { + "epoch": 0.21863339552238806, + "grad_norm": 0.5806047125500812, + "learning_rate": 4.659190452175891e-05, + "loss": 0.5717, + "step": 1875 + }, + { + "epoch": 0.21921641791044777, + "grad_norm": 0.5548551191672546, + "learning_rate": 4.65689137775725e-05, + "loss": 0.5918, + "step": 1880 + }, + { + "epoch": 0.21979944029850745, + "grad_norm": 0.5224077396834346, + "learning_rate": 4.654585214580749e-05, + "loss": 0.5764, + "step": 1885 + }, + { + "epoch": 0.22038246268656717, + "grad_norm": 0.595212892553765, + "learning_rate": 4.652271971219412e-05, + "loss": 0.5718, + "step": 1890 + }, + { + "epoch": 0.22096548507462688, + "grad_norm": 0.5205351842902096, + "learning_rate": 4.6499516562725906e-05, + "loss": 0.5843, + "step": 1895 + }, + { + "epoch": 0.22154850746268656, + "grad_norm": 0.5193771435539847, + "learning_rate": 4.647624278365917e-05, + "loss": 0.6024, + "step": 1900 + }, + { + "epoch": 0.22213152985074627, + "grad_norm": 0.5762529929483823, + "learning_rate": 4.6452898461512866e-05, + "loss": 0.5841, + "step": 1905 + }, + { + "epoch": 0.22271455223880596, + "grad_norm": 0.5613099807801801, + "learning_rate": 4.642948368306814e-05, + "loss": 0.5909, + "step": 1910 + }, + { + "epoch": 0.22329757462686567, + "grad_norm": 0.5857098538355904, + "learning_rate": 4.640599853536806e-05, + "loss": 0.5986, + "step": 1915 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.5346551057187519, + "learning_rate": 4.6382443105717324e-05, + "loss": 0.5651, + "step": 1920 + }, + { + "epoch": 0.22446361940298507, + "grad_norm": 0.6133229780421945, + "learning_rate": 4.635881748168184e-05, + "loss": 0.6051, + "step": 1925 + }, + { + "epoch": 0.22504664179104478, + "grad_norm": 0.564856310131765, + "learning_rate": 4.633512175108851e-05, + "loss": 0.5645, + "step": 1930 + }, + { + "epoch": 0.2256296641791045, + "grad_norm": 0.6086776045173963, + "learning_rate": 4.631135600202485e-05, + "loss": 0.611, + "step": 1935 + }, + { + "epoch": 0.22621268656716417, + "grad_norm": 0.5053374707380298, + "learning_rate": 4.628752032283862e-05, + "loss": 0.6359, + "step": 1940 + }, + { + "epoch": 0.22679570895522388, + "grad_norm": 0.5177309796242289, + "learning_rate": 4.626361480213759e-05, + "loss": 0.5161, + "step": 1945 + }, + { + "epoch": 0.22737873134328357, + "grad_norm": 0.5655297929571091, + "learning_rate": 4.623963952878914e-05, + "loss": 0.5215, + "step": 1950 + }, + { + "epoch": 0.22796175373134328, + "grad_norm": 0.5007101086568317, + "learning_rate": 4.621559459191996e-05, + "loss": 0.5672, + "step": 1955 + }, + { + "epoch": 0.228544776119403, + "grad_norm": 0.5604337094818254, + "learning_rate": 4.619148008091569e-05, + "loss": 0.5495, + "step": 1960 + }, + { + "epoch": 0.22912779850746268, + "grad_norm": 0.5600231646159735, + "learning_rate": 4.616729608542064e-05, + "loss": 0.5914, + "step": 1965 + }, + { + "epoch": 0.2297108208955224, + "grad_norm": 0.5261560439698072, + "learning_rate": 4.61430426953374e-05, + "loss": 0.5761, + "step": 1970 + }, + { + "epoch": 0.2302938432835821, + "grad_norm": 0.5177160101117486, + "learning_rate": 4.611872000082654e-05, + "loss": 0.6105, + "step": 1975 + }, + { + "epoch": 0.23087686567164178, + "grad_norm": 0.4991176586608243, + "learning_rate": 4.609432809230627e-05, + "loss": 0.5576, + "step": 1980 + }, + { + "epoch": 0.2314598880597015, + "grad_norm": 0.49011496350766276, + "learning_rate": 4.606986706045207e-05, + "loss": 0.5217, + "step": 1985 + }, + { + "epoch": 0.23204291044776118, + "grad_norm": 0.49863205095192953, + "learning_rate": 4.604533699619643e-05, + "loss": 0.5606, + "step": 1990 + }, + { + "epoch": 0.2326259328358209, + "grad_norm": 0.5202522274733646, + "learning_rate": 4.602073799072841e-05, + "loss": 0.582, + "step": 1995 + }, + { + "epoch": 0.2332089552238806, + "grad_norm": 0.5778738097193321, + "learning_rate": 4.5996070135493426e-05, + "loss": 0.5608, + "step": 2000 + }, + { + "epoch": 0.2337919776119403, + "grad_norm": 0.5383730742466348, + "learning_rate": 4.597133352219275e-05, + "loss": 0.5833, + "step": 2005 + }, + { + "epoch": 0.234375, + "grad_norm": 0.504768235114523, + "learning_rate": 4.594652824278333e-05, + "loss": 0.5428, + "step": 2010 + }, + { + "epoch": 0.2349580223880597, + "grad_norm": 0.5424141377382832, + "learning_rate": 4.592165438947734e-05, + "loss": 0.5234, + "step": 2015 + }, + { + "epoch": 0.2355410447761194, + "grad_norm": 0.560600079358499, + "learning_rate": 4.589671205474189e-05, + "loss": 0.5949, + "step": 2020 + }, + { + "epoch": 0.2361240671641791, + "grad_norm": 0.5710842213852398, + "learning_rate": 4.587170133129867e-05, + "loss": 0.5675, + "step": 2025 + }, + { + "epoch": 0.23670708955223882, + "grad_norm": 0.6954740937313674, + "learning_rate": 4.5846622312123566e-05, + "loss": 0.5593, + "step": 2030 + }, + { + "epoch": 0.2372901119402985, + "grad_norm": 0.4936861843329207, + "learning_rate": 4.582147509044639e-05, + "loss": 0.544, + "step": 2035 + }, + { + "epoch": 0.23787313432835822, + "grad_norm": 0.5990080756647657, + "learning_rate": 4.579625975975047e-05, + "loss": 0.6093, + "step": 2040 + }, + { + "epoch": 0.2384561567164179, + "grad_norm": 0.46710498554704555, + "learning_rate": 4.577097641377234e-05, + "loss": 0.5421, + "step": 2045 + }, + { + "epoch": 0.2390391791044776, + "grad_norm": 0.6220614730173019, + "learning_rate": 4.574562514650137e-05, + "loss": 0.5959, + "step": 2050 + }, + { + "epoch": 0.23962220149253732, + "grad_norm": 0.5736687871170686, + "learning_rate": 4.572020605217941e-05, + "loss": 0.5705, + "step": 2055 + }, + { + "epoch": 0.240205223880597, + "grad_norm": 0.5700891932767567, + "learning_rate": 4.569471922530048e-05, + "loss": 0.5812, + "step": 2060 + }, + { + "epoch": 0.24078824626865672, + "grad_norm": 0.5076933159103915, + "learning_rate": 4.566916476061036e-05, + "loss": 0.565, + "step": 2065 + }, + { + "epoch": 0.24137126865671643, + "grad_norm": 0.5485662358157128, + "learning_rate": 4.56435427531063e-05, + "loss": 0.576, + "step": 2070 + }, + { + "epoch": 0.24195429104477612, + "grad_norm": 0.5699225728909032, + "learning_rate": 4.5617853298036634e-05, + "loss": 0.5984, + "step": 2075 + }, + { + "epoch": 0.24253731343283583, + "grad_norm": 0.5970990880266768, + "learning_rate": 4.559209649090039e-05, + "loss": 0.5648, + "step": 2080 + }, + { + "epoch": 0.2431203358208955, + "grad_norm": 0.5310811562750187, + "learning_rate": 4.556627242744703e-05, + "loss": 0.5616, + "step": 2085 + }, + { + "epoch": 0.24370335820895522, + "grad_norm": 0.4892958829299151, + "learning_rate": 4.5540381203675994e-05, + "loss": 0.5867, + "step": 2090 + }, + { + "epoch": 0.24428638059701493, + "grad_norm": 0.5227922779707119, + "learning_rate": 4.55144229158364e-05, + "loss": 0.5534, + "step": 2095 + }, + { + "epoch": 0.24486940298507462, + "grad_norm": 0.5041942400636352, + "learning_rate": 4.548839766042668e-05, + "loss": 0.5371, + "step": 2100 + }, + { + "epoch": 0.24545242537313433, + "grad_norm": 0.49660405961707166, + "learning_rate": 4.5462305534194204e-05, + "loss": 0.572, + "step": 2105 + }, + { + "epoch": 0.24603544776119404, + "grad_norm": 0.5499042134726534, + "learning_rate": 4.543614663413493e-05, + "loss": 0.5611, + "step": 2110 + }, + { + "epoch": 0.24661847014925373, + "grad_norm": 0.5496526828713953, + "learning_rate": 4.5409921057493064e-05, + "loss": 0.57, + "step": 2115 + }, + { + "epoch": 0.24720149253731344, + "grad_norm": 0.5599675685862884, + "learning_rate": 4.538362890176066e-05, + "loss": 0.5618, + "step": 2120 + }, + { + "epoch": 0.24778451492537312, + "grad_norm": 0.5579889614380124, + "learning_rate": 4.535727026467727e-05, + "loss": 0.5682, + "step": 2125 + }, + { + "epoch": 0.24836753731343283, + "grad_norm": 0.5300253135611827, + "learning_rate": 4.533084524422959e-05, + "loss": 0.5828, + "step": 2130 + }, + { + "epoch": 0.24895055970149255, + "grad_norm": 0.498730689430679, + "learning_rate": 4.530435393865111e-05, + "loss": 0.5535, + "step": 2135 + }, + { + "epoch": 0.24953358208955223, + "grad_norm": 0.5177126292248756, + "learning_rate": 4.527779644642172e-05, + "loss": 0.5661, + "step": 2140 + }, + { + "epoch": 0.2501166044776119, + "grad_norm": 0.5708740823647661, + "learning_rate": 4.525117286626734e-05, + "loss": 0.5764, + "step": 2145 + }, + { + "epoch": 0.25069962686567165, + "grad_norm": 0.5326668265886716, + "learning_rate": 4.522448329715959e-05, + "loss": 0.562, + "step": 2150 + }, + { + "epoch": 0.25128264925373134, + "grad_norm": 0.4878189090302149, + "learning_rate": 4.51977278383154e-05, + "loss": 0.5389, + "step": 2155 + }, + { + "epoch": 0.251865671641791, + "grad_norm": 0.5459043435498905, + "learning_rate": 4.517090658919662e-05, + "loss": 0.5456, + "step": 2160 + }, + { + "epoch": 0.25244869402985076, + "grad_norm": 0.5451507549234296, + "learning_rate": 4.5144019649509694e-05, + "loss": 0.5619, + "step": 2165 + }, + { + "epoch": 0.25303171641791045, + "grad_norm": 0.48817546876328066, + "learning_rate": 4.5117067119205256e-05, + "loss": 0.5328, + "step": 2170 + }, + { + "epoch": 0.25361473880597013, + "grad_norm": 0.5573481611505946, + "learning_rate": 4.5090049098477756e-05, + "loss": 0.6166, + "step": 2175 + }, + { + "epoch": 0.25419776119402987, + "grad_norm": 0.46166230865153285, + "learning_rate": 4.506296568776513e-05, + "loss": 0.5603, + "step": 2180 + }, + { + "epoch": 0.25478078358208955, + "grad_norm": 0.5255525351801149, + "learning_rate": 4.503581698774838e-05, + "loss": 0.5597, + "step": 2185 + }, + { + "epoch": 0.25536380597014924, + "grad_norm": 0.5036713071359231, + "learning_rate": 4.5008603099351235e-05, + "loss": 0.5572, + "step": 2190 + }, + { + "epoch": 0.255946828358209, + "grad_norm": 0.5629197395168302, + "learning_rate": 4.498132412373972e-05, + "loss": 0.5549, + "step": 2195 + }, + { + "epoch": 0.25652985074626866, + "grad_norm": 0.5281653518122478, + "learning_rate": 4.4953980162321845e-05, + "loss": 0.5783, + "step": 2200 + }, + { + "epoch": 0.25711287313432835, + "grad_norm": 0.52357818056316, + "learning_rate": 4.492657131674722e-05, + "loss": 0.5365, + "step": 2205 + }, + { + "epoch": 0.2576958955223881, + "grad_norm": 0.5581473844002001, + "learning_rate": 4.48990976889066e-05, + "loss": 0.5314, + "step": 2210 + }, + { + "epoch": 0.25827891791044777, + "grad_norm": 0.5463641738254288, + "learning_rate": 4.487155938093163e-05, + "loss": 0.5678, + "step": 2215 + }, + { + "epoch": 0.25886194029850745, + "grad_norm": 0.580427123805623, + "learning_rate": 4.484395649519435e-05, + "loss": 0.5831, + "step": 2220 + }, + { + "epoch": 0.25944496268656714, + "grad_norm": 0.5515157134010146, + "learning_rate": 4.48162891343069e-05, + "loss": 0.5916, + "step": 2225 + }, + { + "epoch": 0.2600279850746269, + "grad_norm": 0.4614774802102389, + "learning_rate": 4.478855740112107e-05, + "loss": 0.5747, + "step": 2230 + }, + { + "epoch": 0.26061100746268656, + "grad_norm": 0.5263544516058696, + "learning_rate": 4.476076139872797e-05, + "loss": 0.5873, + "step": 2235 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 0.5279147571827948, + "learning_rate": 4.473290123045764e-05, + "loss": 0.5694, + "step": 2240 + }, + { + "epoch": 0.261777052238806, + "grad_norm": 0.9742556257776407, + "learning_rate": 4.470497699987861e-05, + "loss": 0.5686, + "step": 2245 + }, + { + "epoch": 0.26236007462686567, + "grad_norm": 0.5328912557304052, + "learning_rate": 4.4676988810797596e-05, + "loss": 0.5591, + "step": 2250 + }, + { + "epoch": 0.26294309701492535, + "grad_norm": 0.5633803039126256, + "learning_rate": 4.464893676725906e-05, + "loss": 0.5728, + "step": 2255 + }, + { + "epoch": 0.2635261194029851, + "grad_norm": 0.5134466833234131, + "learning_rate": 4.4620820973544866e-05, + "loss": 0.588, + "step": 2260 + }, + { + "epoch": 0.2641091417910448, + "grad_norm": 0.5049188829803447, + "learning_rate": 4.459264153417381e-05, + "loss": 0.5973, + "step": 2265 + }, + { + "epoch": 0.26469216417910446, + "grad_norm": 0.5404106159275316, + "learning_rate": 4.4564398553901344e-05, + "loss": 0.5788, + "step": 2270 + }, + { + "epoch": 0.2652751865671642, + "grad_norm": 0.6037489755180414, + "learning_rate": 4.4536092137719094e-05, + "loss": 0.5935, + "step": 2275 + }, + { + "epoch": 0.2658582089552239, + "grad_norm": 0.5161487013459114, + "learning_rate": 4.450772239085452e-05, + "loss": 0.5371, + "step": 2280 + }, + { + "epoch": 0.26644123134328357, + "grad_norm": 0.5278496691940492, + "learning_rate": 4.44792894187705e-05, + "loss": 0.5234, + "step": 2285 + }, + { + "epoch": 0.2670242537313433, + "grad_norm": 0.5092042449789793, + "learning_rate": 4.445079332716497e-05, + "loss": 0.5925, + "step": 2290 + }, + { + "epoch": 0.267607276119403, + "grad_norm": 0.5223498221449892, + "learning_rate": 4.4422234221970475e-05, + "loss": 0.5629, + "step": 2295 + }, + { + "epoch": 0.2681902985074627, + "grad_norm": 0.5456470351851613, + "learning_rate": 4.439361220935385e-05, + "loss": 0.5416, + "step": 2300 + }, + { + "epoch": 0.26877332089552236, + "grad_norm": 0.5476526134142059, + "learning_rate": 4.436492739571575e-05, + "loss": 0.5499, + "step": 2305 + }, + { + "epoch": 0.2693563432835821, + "grad_norm": 0.49505688936501147, + "learning_rate": 4.433617988769031e-05, + "loss": 0.5431, + "step": 2310 + }, + { + "epoch": 0.2699393656716418, + "grad_norm": 0.5342065164923998, + "learning_rate": 4.43073697921447e-05, + "loss": 0.5693, + "step": 2315 + }, + { + "epoch": 0.27052238805970147, + "grad_norm": 0.5145761250584094, + "learning_rate": 4.4278497216178805e-05, + "loss": 0.5795, + "step": 2320 + }, + { + "epoch": 0.2711054104477612, + "grad_norm": 0.5273626768859015, + "learning_rate": 4.4249562267124735e-05, + "loss": 0.5549, + "step": 2325 + }, + { + "epoch": 0.2716884328358209, + "grad_norm": 0.5079624559473638, + "learning_rate": 4.422056505254648e-05, + "loss": 0.5746, + "step": 2330 + }, + { + "epoch": 0.2722714552238806, + "grad_norm": 0.505058134321932, + "learning_rate": 4.4191505680239494e-05, + "loss": 0.5565, + "step": 2335 + }, + { + "epoch": 0.2728544776119403, + "grad_norm": 0.5668879894960839, + "learning_rate": 4.416238425823031e-05, + "loss": 0.5578, + "step": 2340 + }, + { + "epoch": 0.2734375, + "grad_norm": 0.5141499926926323, + "learning_rate": 4.413320089477612e-05, + "loss": 0.5489, + "step": 2345 + }, + { + "epoch": 0.2740205223880597, + "grad_norm": 0.5109670063026779, + "learning_rate": 4.4103955698364394e-05, + "loss": 0.5592, + "step": 2350 + }, + { + "epoch": 0.2746035447761194, + "grad_norm": 0.5082096864806888, + "learning_rate": 4.407464877771243e-05, + "loss": 0.5213, + "step": 2355 + }, + { + "epoch": 0.2751865671641791, + "grad_norm": 0.5083698199510284, + "learning_rate": 4.4045280241767024e-05, + "loss": 0.5493, + "step": 2360 + }, + { + "epoch": 0.2757695895522388, + "grad_norm": 0.5053920381989545, + "learning_rate": 4.401585019970397e-05, + "loss": 0.5686, + "step": 2365 + }, + { + "epoch": 0.27635261194029853, + "grad_norm": 0.4963501376761793, + "learning_rate": 4.3986358760927774e-05, + "loss": 0.5545, + "step": 2370 + }, + { + "epoch": 0.2769356343283582, + "grad_norm": 0.5288831258581409, + "learning_rate": 4.3956806035071123e-05, + "loss": 0.5249, + "step": 2375 + }, + { + "epoch": 0.2775186567164179, + "grad_norm": 0.62308836025263, + "learning_rate": 4.392719213199457e-05, + "loss": 0.6042, + "step": 2380 + }, + { + "epoch": 0.27810167910447764, + "grad_norm": 0.4832296950151894, + "learning_rate": 4.389751716178606e-05, + "loss": 0.5633, + "step": 2385 + }, + { + "epoch": 0.2786847014925373, + "grad_norm": 0.5237890773447569, + "learning_rate": 4.386778123476059e-05, + "loss": 0.5271, + "step": 2390 + }, + { + "epoch": 0.279267723880597, + "grad_norm": 0.5595739424570599, + "learning_rate": 4.383798446145973e-05, + "loss": 0.6001, + "step": 2395 + }, + { + "epoch": 0.2798507462686567, + "grad_norm": 0.5332931182325285, + "learning_rate": 4.380812695265126e-05, + "loss": 0.6024, + "step": 2400 + }, + { + "epoch": 0.28043376865671643, + "grad_norm": 0.4918731480131771, + "learning_rate": 4.3778208819328724e-05, + "loss": 0.56, + "step": 2405 + }, + { + "epoch": 0.2810167910447761, + "grad_norm": 0.4625310842353903, + "learning_rate": 4.374823017271105e-05, + "loss": 0.5161, + "step": 2410 + }, + { + "epoch": 0.2815998134328358, + "grad_norm": 0.5070557648263391, + "learning_rate": 4.371819112424212e-05, + "loss": 0.518, + "step": 2415 + }, + { + "epoch": 0.28218283582089554, + "grad_norm": 0.56213077270057, + "learning_rate": 4.368809178559034e-05, + "loss": 0.5645, + "step": 2420 + }, + { + "epoch": 0.2827658582089552, + "grad_norm": 0.5027850751505164, + "learning_rate": 4.365793226864825e-05, + "loss": 0.5516, + "step": 2425 + }, + { + "epoch": 0.2833488805970149, + "grad_norm": 0.496969778411097, + "learning_rate": 4.3627712685532104e-05, + "loss": 0.5661, + "step": 2430 + }, + { + "epoch": 0.28393190298507465, + "grad_norm": 0.5203901533643535, + "learning_rate": 4.3597433148581465e-05, + "loss": 0.5564, + "step": 2435 + }, + { + "epoch": 0.28451492537313433, + "grad_norm": 0.5761459191507305, + "learning_rate": 4.3567093770358724e-05, + "loss": 0.584, + "step": 2440 + }, + { + "epoch": 0.285097947761194, + "grad_norm": 0.5440010534822834, + "learning_rate": 4.353669466364877e-05, + "loss": 0.5833, + "step": 2445 + }, + { + "epoch": 0.28568097014925375, + "grad_norm": 0.5575628208489772, + "learning_rate": 4.3506235941458516e-05, + "loss": 0.5926, + "step": 2450 + }, + { + "epoch": 0.28626399253731344, + "grad_norm": 0.500211988994469, + "learning_rate": 4.347571771701648e-05, + "loss": 0.5651, + "step": 2455 + }, + { + "epoch": 0.2868470149253731, + "grad_norm": 0.46825863619458763, + "learning_rate": 4.34451401037724e-05, + "loss": 0.5461, + "step": 2460 + }, + { + "epoch": 0.28743003731343286, + "grad_norm": 0.5521309621293441, + "learning_rate": 4.3414503215396776e-05, + "loss": 0.5659, + "step": 2465 + }, + { + "epoch": 0.28801305970149255, + "grad_norm": 0.4609786323278, + "learning_rate": 4.338380716578046e-05, + "loss": 0.5487, + "step": 2470 + }, + { + "epoch": 0.28859608208955223, + "grad_norm": 0.499738534899754, + "learning_rate": 4.3353052069034214e-05, + "loss": 0.5381, + "step": 2475 + }, + { + "epoch": 0.2891791044776119, + "grad_norm": 0.5917418512742165, + "learning_rate": 4.332223803948834e-05, + "loss": 0.5434, + "step": 2480 + }, + { + "epoch": 0.28976212686567165, + "grad_norm": 0.48180474136483686, + "learning_rate": 4.3291365191692204e-05, + "loss": 0.5734, + "step": 2485 + }, + { + "epoch": 0.29034514925373134, + "grad_norm": 0.5007564237127459, + "learning_rate": 4.326043364041381e-05, + "loss": 0.5352, + "step": 2490 + }, + { + "epoch": 0.290928171641791, + "grad_norm": 0.47349703509695523, + "learning_rate": 4.3229443500639414e-05, + "loss": 0.5553, + "step": 2495 + }, + { + "epoch": 0.29151119402985076, + "grad_norm": 0.5192204895800268, + "learning_rate": 4.319839488757305e-05, + "loss": 0.5398, + "step": 2500 + }, + { + "epoch": 0.29209421641791045, + "grad_norm": 0.5757316112230765, + "learning_rate": 4.3167287916636145e-05, + "loss": 0.5795, + "step": 2505 + }, + { + "epoch": 0.29267723880597013, + "grad_norm": 0.5046004476804008, + "learning_rate": 4.3136122703467045e-05, + "loss": 0.5657, + "step": 2510 + }, + { + "epoch": 0.29326026119402987, + "grad_norm": 0.6116298626095651, + "learning_rate": 4.3104899363920616e-05, + "loss": 0.5892, + "step": 2515 + }, + { + "epoch": 0.29384328358208955, + "grad_norm": 0.5028293067243745, + "learning_rate": 4.3073618014067824e-05, + "loss": 0.5614, + "step": 2520 + }, + { + "epoch": 0.29442630597014924, + "grad_norm": 0.5284740774719737, + "learning_rate": 4.304227877019525e-05, + "loss": 0.5509, + "step": 2525 + }, + { + "epoch": 0.295009328358209, + "grad_norm": 0.5043452273361897, + "learning_rate": 4.301088174880472e-05, + "loss": 0.5734, + "step": 2530 + }, + { + "epoch": 0.29559235074626866, + "grad_norm": 0.5491908726146885, + "learning_rate": 4.297942706661283e-05, + "loss": 0.5994, + "step": 2535 + }, + { + "epoch": 0.29617537313432835, + "grad_norm": 0.5967635993887994, + "learning_rate": 4.2947914840550544e-05, + "loss": 0.5895, + "step": 2540 + }, + { + "epoch": 0.2967583955223881, + "grad_norm": 0.49957976391073805, + "learning_rate": 4.291634518776273e-05, + "loss": 0.5559, + "step": 2545 + }, + { + "epoch": 0.29734141791044777, + "grad_norm": 0.586452369328343, + "learning_rate": 4.2884718225607736e-05, + "loss": 0.5987, + "step": 2550 + }, + { + "epoch": 0.29792444029850745, + "grad_norm": 0.5079777653661454, + "learning_rate": 4.285303407165694e-05, + "loss": 0.5892, + "step": 2555 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.45945289033221137, + "learning_rate": 4.282129284369436e-05, + "loss": 0.5353, + "step": 2560 + }, + { + "epoch": 0.2990904850746269, + "grad_norm": 0.47293762371540526, + "learning_rate": 4.278949465971616e-05, + "loss": 0.5451, + "step": 2565 + }, + { + "epoch": 0.29967350746268656, + "grad_norm": 0.5125425687772994, + "learning_rate": 4.2757639637930246e-05, + "loss": 0.588, + "step": 2570 + }, + { + "epoch": 0.30025652985074625, + "grad_norm": 0.560469657657478, + "learning_rate": 4.2725727896755804e-05, + "loss": 0.5353, + "step": 2575 + }, + { + "epoch": 0.300839552238806, + "grad_norm": 0.5254788469907365, + "learning_rate": 4.269375955482287e-05, + "loss": 0.547, + "step": 2580 + }, + { + "epoch": 0.30142257462686567, + "grad_norm": 0.48987507837663824, + "learning_rate": 4.266173473097192e-05, + "loss": 0.5175, + "step": 2585 + }, + { + "epoch": 0.30200559701492535, + "grad_norm": 0.5484732726515896, + "learning_rate": 4.262965354425335e-05, + "loss": 0.6067, + "step": 2590 + }, + { + "epoch": 0.3025886194029851, + "grad_norm": 0.5064490667857149, + "learning_rate": 4.259751611392712e-05, + "loss": 0.533, + "step": 2595 + }, + { + "epoch": 0.3031716417910448, + "grad_norm": 0.5014575792637805, + "learning_rate": 4.256532255946226e-05, + "loss": 0.5913, + "step": 2600 + }, + { + "epoch": 0.30375466417910446, + "grad_norm": 0.48901348419554475, + "learning_rate": 4.253307300053643e-05, + "loss": 0.5543, + "step": 2605 + }, + { + "epoch": 0.3043376865671642, + "grad_norm": 0.5232458296317694, + "learning_rate": 4.25007675570355e-05, + "loss": 0.5759, + "step": 2610 + }, + { + "epoch": 0.3049207089552239, + "grad_norm": 0.5070730983893377, + "learning_rate": 4.246840634905307e-05, + "loss": 0.5515, + "step": 2615 + }, + { + "epoch": 0.30550373134328357, + "grad_norm": 0.4828637362156461, + "learning_rate": 4.2435989496890054e-05, + "loss": 0.5817, + "step": 2620 + }, + { + "epoch": 0.3060867537313433, + "grad_norm": 0.45681007096402865, + "learning_rate": 4.240351712105422e-05, + "loss": 0.5538, + "step": 2625 + }, + { + "epoch": 0.306669776119403, + "grad_norm": 0.5633919156991598, + "learning_rate": 4.237098934225973e-05, + "loss": 0.5619, + "step": 2630 + }, + { + "epoch": 0.3072527985074627, + "grad_norm": 0.4744742163372956, + "learning_rate": 4.233840628142672e-05, + "loss": 0.5749, + "step": 2635 + }, + { + "epoch": 0.30783582089552236, + "grad_norm": 0.47221088669237776, + "learning_rate": 4.2305768059680806e-05, + "loss": 0.5689, + "step": 2640 + }, + { + "epoch": 0.3084188432835821, + "grad_norm": 0.4762515982934417, + "learning_rate": 4.2273074798352706e-05, + "loss": 0.5507, + "step": 2645 + }, + { + "epoch": 0.3090018656716418, + "grad_norm": 0.4718217399644616, + "learning_rate": 4.22403266189777e-05, + "loss": 0.5483, + "step": 2650 + }, + { + "epoch": 0.30958488805970147, + "grad_norm": 0.4671780211152603, + "learning_rate": 4.2207523643295253e-05, + "loss": 0.5463, + "step": 2655 + }, + { + "epoch": 0.3101679104477612, + "grad_norm": 0.49004196187956206, + "learning_rate": 4.2174665993248505e-05, + "loss": 0.5474, + "step": 2660 + }, + { + "epoch": 0.3107509328358209, + "grad_norm": 0.5092130863744895, + "learning_rate": 4.214175379098388e-05, + "loss": 0.5512, + "step": 2665 + }, + { + "epoch": 0.3113339552238806, + "grad_norm": 0.4925159889445034, + "learning_rate": 4.210878715885056e-05, + "loss": 0.5609, + "step": 2670 + }, + { + "epoch": 0.3119169776119403, + "grad_norm": 0.5438536521932689, + "learning_rate": 4.2075766219400095e-05, + "loss": 0.5949, + "step": 2675 + }, + { + "epoch": 0.3125, + "grad_norm": 0.4985765329826545, + "learning_rate": 4.20426910953859e-05, + "loss": 0.5726, + "step": 2680 + }, + { + "epoch": 0.3130830223880597, + "grad_norm": 0.5036063584187528, + "learning_rate": 4.200956190976284e-05, + "loss": 0.5255, + "step": 2685 + }, + { + "epoch": 0.3136660447761194, + "grad_norm": 0.5634159268083127, + "learning_rate": 4.1976378785686715e-05, + "loss": 0.5254, + "step": 2690 + }, + { + "epoch": 0.3142490671641791, + "grad_norm": 0.483875275656377, + "learning_rate": 4.1943141846513886e-05, + "loss": 0.5518, + "step": 2695 + }, + { + "epoch": 0.3148320895522388, + "grad_norm": 0.5510683619678165, + "learning_rate": 4.190985121580071e-05, + "loss": 0.557, + "step": 2700 + }, + { + "epoch": 0.31541511194029853, + "grad_norm": 0.45786711381238016, + "learning_rate": 4.18765070173032e-05, + "loss": 0.521, + "step": 2705 + }, + { + "epoch": 0.3159981343283582, + "grad_norm": 0.47954794442661985, + "learning_rate": 4.184310937497647e-05, + "loss": 0.55, + "step": 2710 + }, + { + "epoch": 0.3165811567164179, + "grad_norm": 0.5348827941441646, + "learning_rate": 4.1809658412974314e-05, + "loss": 0.5668, + "step": 2715 + }, + { + "epoch": 0.31716417910447764, + "grad_norm": 0.5035567889348412, + "learning_rate": 4.177615425564872e-05, + "loss": 0.5584, + "step": 2720 + }, + { + "epoch": 0.3177472014925373, + "grad_norm": 0.5014049010285576, + "learning_rate": 4.174259702754947e-05, + "loss": 0.5538, + "step": 2725 + }, + { + "epoch": 0.318330223880597, + "grad_norm": 0.7108080618041146, + "learning_rate": 4.17089868534236e-05, + "loss": 0.593, + "step": 2730 + }, + { + "epoch": 0.3189132462686567, + "grad_norm": 0.4901344322857948, + "learning_rate": 4.1675323858214975e-05, + "loss": 0.5504, + "step": 2735 + }, + { + "epoch": 0.31949626865671643, + "grad_norm": 0.5455638212871261, + "learning_rate": 4.164160816706383e-05, + "loss": 0.5781, + "step": 2740 + }, + { + "epoch": 0.3200792910447761, + "grad_norm": 0.4691865337288382, + "learning_rate": 4.160783990530629e-05, + "loss": 0.5349, + "step": 2745 + }, + { + "epoch": 0.3206623134328358, + "grad_norm": 0.5397164117756076, + "learning_rate": 4.157401919847389e-05, + "loss": 0.5635, + "step": 2750 + }, + { + "epoch": 0.32124533582089554, + "grad_norm": 0.49203133885978545, + "learning_rate": 4.1540146172293154e-05, + "loss": 0.553, + "step": 2755 + }, + { + "epoch": 0.3218283582089552, + "grad_norm": 0.6956383718262041, + "learning_rate": 4.150622095268508e-05, + "loss": 0.5454, + "step": 2760 + }, + { + "epoch": 0.3224113805970149, + "grad_norm": 0.5236479255481173, + "learning_rate": 4.1472243665764715e-05, + "loss": 0.546, + "step": 2765 + }, + { + "epoch": 0.32299440298507465, + "grad_norm": 0.5226139094665005, + "learning_rate": 4.1438214437840625e-05, + "loss": 0.5685, + "step": 2770 + }, + { + "epoch": 0.32357742537313433, + "grad_norm": 0.519532395995944, + "learning_rate": 4.140413339541451e-05, + "loss": 0.5621, + "step": 2775 + }, + { + "epoch": 0.324160447761194, + "grad_norm": 0.5221117984856967, + "learning_rate": 4.137000066518065e-05, + "loss": 0.5945, + "step": 2780 + }, + { + "epoch": 0.32474347014925375, + "grad_norm": 0.4662764439552845, + "learning_rate": 4.13358163740255e-05, + "loss": 0.5439, + "step": 2785 + }, + { + "epoch": 0.32532649253731344, + "grad_norm": 0.4825709779338081, + "learning_rate": 4.1301580649027154e-05, + "loss": 0.5487, + "step": 2790 + }, + { + "epoch": 0.3259095149253731, + "grad_norm": 0.49595525241615773, + "learning_rate": 4.126729361745495e-05, + "loss": 0.5456, + "step": 2795 + }, + { + "epoch": 0.32649253731343286, + "grad_norm": 0.46259611525800753, + "learning_rate": 4.1232955406768925e-05, + "loss": 0.5385, + "step": 2800 + }, + { + "epoch": 0.32707555970149255, + "grad_norm": 0.48140441921381905, + "learning_rate": 4.119856614461938e-05, + "loss": 0.5289, + "step": 2805 + }, + { + "epoch": 0.32765858208955223, + "grad_norm": 0.4878844018028054, + "learning_rate": 4.11641259588464e-05, + "loss": 0.5698, + "step": 2810 + }, + { + "epoch": 0.3282416044776119, + "grad_norm": 0.521050501093044, + "learning_rate": 4.1129634977479375e-05, + "loss": 0.5609, + "step": 2815 + }, + { + "epoch": 0.32882462686567165, + "grad_norm": 0.49828928878222534, + "learning_rate": 4.109509332873653e-05, + "loss": 0.5421, + "step": 2820 + }, + { + "epoch": 0.32940764925373134, + "grad_norm": 0.5526407883676059, + "learning_rate": 4.106050114102443e-05, + "loss": 0.5789, + "step": 2825 + }, + { + "epoch": 0.329990671641791, + "grad_norm": 0.5516248967750637, + "learning_rate": 4.102585854293751e-05, + "loss": 0.526, + "step": 2830 + }, + { + "epoch": 0.33057369402985076, + "grad_norm": 0.5461716849773546, + "learning_rate": 4.0991165663257636e-05, + "loss": 0.542, + "step": 2835 + }, + { + "epoch": 0.33115671641791045, + "grad_norm": 0.4524230892035543, + "learning_rate": 4.095642263095356e-05, + "loss": 0.5429, + "step": 2840 + }, + { + "epoch": 0.33173973880597013, + "grad_norm": 0.4907608846635581, + "learning_rate": 4.0921629575180485e-05, + "loss": 0.5536, + "step": 2845 + }, + { + "epoch": 0.33232276119402987, + "grad_norm": 0.5284875026264199, + "learning_rate": 4.088678662527959e-05, + "loss": 0.6164, + "step": 2850 + }, + { + "epoch": 0.33290578358208955, + "grad_norm": 0.48638662847344827, + "learning_rate": 4.085189391077749e-05, + "loss": 0.576, + "step": 2855 + }, + { + "epoch": 0.33348880597014924, + "grad_norm": 0.5695349098133777, + "learning_rate": 4.0816951561385836e-05, + "loss": 0.5521, + "step": 2860 + }, + { + "epoch": 0.334071828358209, + "grad_norm": 0.44821729017419637, + "learning_rate": 4.078195970700079e-05, + "loss": 0.526, + "step": 2865 + }, + { + "epoch": 0.33465485074626866, + "grad_norm": 0.5289515772805433, + "learning_rate": 4.074691847770251e-05, + "loss": 0.5505, + "step": 2870 + }, + { + "epoch": 0.33523787313432835, + "grad_norm": 0.5491964687999824, + "learning_rate": 4.0711828003754764e-05, + "loss": 0.5795, + "step": 2875 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.5184535389920294, + "learning_rate": 4.067668841560432e-05, + "loss": 0.5864, + "step": 2880 + }, + { + "epoch": 0.33640391791044777, + "grad_norm": 0.4640726695845034, + "learning_rate": 4.064149984388057e-05, + "loss": 0.5377, + "step": 2885 + }, + { + "epoch": 0.33698694029850745, + "grad_norm": 0.5223555480526277, + "learning_rate": 4.060626241939499e-05, + "loss": 0.5528, + "step": 2890 + }, + { + "epoch": 0.33756996268656714, + "grad_norm": 0.47444935336532806, + "learning_rate": 4.057097627314063e-05, + "loss": 0.5366, + "step": 2895 + }, + { + "epoch": 0.3381529850746269, + "grad_norm": 0.49936360299242055, + "learning_rate": 4.0535641536291725e-05, + "loss": 0.5981, + "step": 2900 + }, + { + "epoch": 0.33873600746268656, + "grad_norm": 0.49567180693590535, + "learning_rate": 4.050025834020307e-05, + "loss": 0.564, + "step": 2905 + }, + { + "epoch": 0.33931902985074625, + "grad_norm": 0.48231305889253284, + "learning_rate": 4.046482681640967e-05, + "loss": 0.5559, + "step": 2910 + }, + { + "epoch": 0.339902052238806, + "grad_norm": 0.5713231807843953, + "learning_rate": 4.042934709662613e-05, + "loss": 0.6046, + "step": 2915 + }, + { + "epoch": 0.34048507462686567, + "grad_norm": 0.5599418550405636, + "learning_rate": 4.039381931274626e-05, + "loss": 0.5459, + "step": 2920 + }, + { + "epoch": 0.34106809701492535, + "grad_norm": 0.5244470856538797, + "learning_rate": 4.035824359684253e-05, + "loss": 0.5559, + "step": 2925 + }, + { + "epoch": 0.3416511194029851, + "grad_norm": 0.46218246493450177, + "learning_rate": 4.032262008116559e-05, + "loss": 0.5496, + "step": 2930 + }, + { + "epoch": 0.3422341417910448, + "grad_norm": 0.5460308556475504, + "learning_rate": 4.02869488981438e-05, + "loss": 0.5726, + "step": 2935 + }, + { + "epoch": 0.34281716417910446, + "grad_norm": 0.4623737344709334, + "learning_rate": 4.025123018038271e-05, + "loss": 0.5422, + "step": 2940 + }, + { + "epoch": 0.3434001865671642, + "grad_norm": 0.5813565724181794, + "learning_rate": 4.0215464060664564e-05, + "loss": 0.5822, + "step": 2945 + }, + { + "epoch": 0.3439832089552239, + "grad_norm": 0.49376149527368923, + "learning_rate": 4.017965067194783e-05, + "loss": 0.5821, + "step": 2950 + }, + { + "epoch": 0.34456623134328357, + "grad_norm": 0.5169325234827248, + "learning_rate": 4.0143790147366724e-05, + "loss": 0.5482, + "step": 2955 + }, + { + "epoch": 0.3451492537313433, + "grad_norm": 0.515737079500457, + "learning_rate": 4.010788262023064e-05, + "loss": 0.5709, + "step": 2960 + }, + { + "epoch": 0.345732276119403, + "grad_norm": 0.5087208722494978, + "learning_rate": 4.007192822402372e-05, + "loss": 0.5565, + "step": 2965 + }, + { + "epoch": 0.3463152985074627, + "grad_norm": 0.4796963642209258, + "learning_rate": 4.003592709240438e-05, + "loss": 0.5494, + "step": 2970 + }, + { + "epoch": 0.34689832089552236, + "grad_norm": 0.49755643832004437, + "learning_rate": 3.9999879359204676e-05, + "loss": 0.5382, + "step": 2975 + }, + { + "epoch": 0.3474813432835821, + "grad_norm": 0.4690304314754959, + "learning_rate": 3.996378515843001e-05, + "loss": 0.5334, + "step": 2980 + }, + { + "epoch": 0.3480643656716418, + "grad_norm": 0.4772189221465707, + "learning_rate": 3.9927644624258445e-05, + "loss": 0.5902, + "step": 2985 + }, + { + "epoch": 0.34864738805970147, + "grad_norm": 0.46404845790530064, + "learning_rate": 3.989145789104033e-05, + "loss": 0.5431, + "step": 2990 + }, + { + "epoch": 0.3492304104477612, + "grad_norm": 0.5340362988246943, + "learning_rate": 3.985522509329775e-05, + "loss": 0.5486, + "step": 2995 + }, + { + "epoch": 0.3498134328358209, + "grad_norm": 0.510771920504794, + "learning_rate": 3.9818946365724004e-05, + "loss": 0.5401, + "step": 3000 + }, + { + "epoch": 0.3503964552238806, + "grad_norm": 0.49718158294490394, + "learning_rate": 3.978262184318317e-05, + "loss": 0.5626, + "step": 3005 + }, + { + "epoch": 0.3509794776119403, + "grad_norm": 0.5158633729934479, + "learning_rate": 3.974625166070953e-05, + "loss": 0.5364, + "step": 3010 + }, + { + "epoch": 0.3515625, + "grad_norm": 0.5204824836614376, + "learning_rate": 3.970983595350714e-05, + "loss": 0.5545, + "step": 3015 + }, + { + "epoch": 0.3521455223880597, + "grad_norm": 0.506085761814899, + "learning_rate": 3.967337485694929e-05, + "loss": 0.5492, + "step": 3020 + }, + { + "epoch": 0.3527285447761194, + "grad_norm": 0.5167191975821872, + "learning_rate": 3.963686850657795e-05, + "loss": 0.5326, + "step": 3025 + }, + { + "epoch": 0.3533115671641791, + "grad_norm": 0.49528479007831655, + "learning_rate": 3.9600317038103385e-05, + "loss": 0.548, + "step": 3030 + }, + { + "epoch": 0.3538945895522388, + "grad_norm": 0.514405428467145, + "learning_rate": 3.956372058740354e-05, + "loss": 0.5708, + "step": 3035 + }, + { + "epoch": 0.35447761194029853, + "grad_norm": 0.4760653718744451, + "learning_rate": 3.952707929052359e-05, + "loss": 0.5385, + "step": 3040 + }, + { + "epoch": 0.3550606343283582, + "grad_norm": 0.5086645138695323, + "learning_rate": 3.9490393283675445e-05, + "loss": 0.5425, + "step": 3045 + }, + { + "epoch": 0.3556436567164179, + "grad_norm": 0.48138382467030305, + "learning_rate": 3.9453662703237186e-05, + "loss": 0.5599, + "step": 3050 + }, + { + "epoch": 0.35622667910447764, + "grad_norm": 0.594129469541421, + "learning_rate": 3.941688768575261e-05, + "loss": 0.558, + "step": 3055 + }, + { + "epoch": 0.3568097014925373, + "grad_norm": 0.4896479074572315, + "learning_rate": 3.938006836793073e-05, + "loss": 0.5399, + "step": 3060 + }, + { + "epoch": 0.357392723880597, + "grad_norm": 0.5430860896504877, + "learning_rate": 3.934320488664519e-05, + "loss": 0.5614, + "step": 3065 + }, + { + "epoch": 0.3579757462686567, + "grad_norm": 0.5171549846963465, + "learning_rate": 3.9306297378933855e-05, + "loss": 0.5338, + "step": 3070 + }, + { + "epoch": 0.35855876865671643, + "grad_norm": 0.5030297563055794, + "learning_rate": 3.926934598199824e-05, + "loss": 0.5671, + "step": 3075 + }, + { + "epoch": 0.3591417910447761, + "grad_norm": 0.4738604088361965, + "learning_rate": 3.923235083320301e-05, + "loss": 0.5204, + "step": 3080 + }, + { + "epoch": 0.3597248134328358, + "grad_norm": 0.5081725848955316, + "learning_rate": 3.919531207007548e-05, + "loss": 0.5382, + "step": 3085 + }, + { + "epoch": 0.36030783582089554, + "grad_norm": 0.5336505974453601, + "learning_rate": 3.915822983030512e-05, + "loss": 0.5498, + "step": 3090 + }, + { + "epoch": 0.3608908582089552, + "grad_norm": 0.45862890034907244, + "learning_rate": 3.912110425174296e-05, + "loss": 0.5478, + "step": 3095 + }, + { + "epoch": 0.3614738805970149, + "grad_norm": 0.5692817255847623, + "learning_rate": 3.9083935472401214e-05, + "loss": 0.5511, + "step": 3100 + }, + { + "epoch": 0.36205690298507465, + "grad_norm": 0.5588630515105258, + "learning_rate": 3.904672363045265e-05, + "loss": 0.5713, + "step": 3105 + }, + { + "epoch": 0.36263992537313433, + "grad_norm": 0.7882164929530883, + "learning_rate": 3.900946886423012e-05, + "loss": 0.5967, + "step": 3110 + }, + { + "epoch": 0.363222947761194, + "grad_norm": 0.44828952211943274, + "learning_rate": 3.897217131222606e-05, + "loss": 0.5117, + "step": 3115 + }, + { + "epoch": 0.36380597014925375, + "grad_norm": 0.492899081711114, + "learning_rate": 3.893483111309196e-05, + "loss": 0.5381, + "step": 3120 + }, + { + "epoch": 0.36438899253731344, + "grad_norm": 0.46780156460670047, + "learning_rate": 3.889744840563781e-05, + "loss": 0.5561, + "step": 3125 + }, + { + "epoch": 0.3649720149253731, + "grad_norm": 0.5062858013949219, + "learning_rate": 3.886002332883169e-05, + "loss": 0.5501, + "step": 3130 + }, + { + "epoch": 0.36555503731343286, + "grad_norm": 0.4693101482730097, + "learning_rate": 3.8822556021799114e-05, + "loss": 0.5193, + "step": 3135 + }, + { + "epoch": 0.36613805970149255, + "grad_norm": 0.5060844807583814, + "learning_rate": 3.878504662382264e-05, + "loss": 0.5532, + "step": 3140 + }, + { + "epoch": 0.36672108208955223, + "grad_norm": 0.5201363699624885, + "learning_rate": 3.8747495274341274e-05, + "loss": 0.5845, + "step": 3145 + }, + { + "epoch": 0.3673041044776119, + "grad_norm": 0.5073064897043211, + "learning_rate": 3.870990211294997e-05, + "loss": 0.5444, + "step": 3150 + }, + { + "epoch": 0.36788712686567165, + "grad_norm": 0.5344016844623599, + "learning_rate": 3.867226727939912e-05, + "loss": 0.5606, + "step": 3155 + }, + { + "epoch": 0.36847014925373134, + "grad_norm": 0.5139307938716214, + "learning_rate": 3.863459091359401e-05, + "loss": 0.5882, + "step": 3160 + }, + { + "epoch": 0.369053171641791, + "grad_norm": 0.4425759816838131, + "learning_rate": 3.8596873155594385e-05, + "loss": 0.5202, + "step": 3165 + }, + { + "epoch": 0.36963619402985076, + "grad_norm": 0.5742393732533151, + "learning_rate": 3.855911414561378e-05, + "loss": 0.5662, + "step": 3170 + }, + { + "epoch": 0.37021921641791045, + "grad_norm": 0.44175421568947043, + "learning_rate": 3.852131402401914e-05, + "loss": 0.4854, + "step": 3175 + }, + { + "epoch": 0.37080223880597013, + "grad_norm": 0.4984055964239209, + "learning_rate": 3.848347293133021e-05, + "loss": 0.573, + "step": 3180 + }, + { + "epoch": 0.37138526119402987, + "grad_norm": 0.49757126890693293, + "learning_rate": 3.844559100821906e-05, + "loss": 0.5549, + "step": 3185 + }, + { + "epoch": 0.37196828358208955, + "grad_norm": 0.57357388001981, + "learning_rate": 3.8407668395509526e-05, + "loss": 0.5574, + "step": 3190 + }, + { + "epoch": 0.37255130597014924, + "grad_norm": 0.5019623696079913, + "learning_rate": 3.8369705234176726e-05, + "loss": 0.5894, + "step": 3195 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.4858155386150297, + "learning_rate": 3.8331701665346495e-05, + "loss": 0.5383, + "step": 3200 + }, + { + "epoch": 0.37371735074626866, + "grad_norm": 0.5624113035875654, + "learning_rate": 3.829365783029492e-05, + "loss": 0.5585, + "step": 3205 + }, + { + "epoch": 0.37430037313432835, + "grad_norm": 0.5052807550374429, + "learning_rate": 3.8255573870447716e-05, + "loss": 0.5439, + "step": 3210 + }, + { + "epoch": 0.3748833955223881, + "grad_norm": 0.5085388458884429, + "learning_rate": 3.82174499273798e-05, + "loss": 0.565, + "step": 3215 + }, + { + "epoch": 0.37546641791044777, + "grad_norm": 0.4989534046655282, + "learning_rate": 3.817928614281471e-05, + "loss": 0.5324, + "step": 3220 + }, + { + "epoch": 0.37604944029850745, + "grad_norm": 0.8403204163889687, + "learning_rate": 3.8141082658624106e-05, + "loss": 0.5626, + "step": 3225 + }, + { + "epoch": 0.37663246268656714, + "grad_norm": 0.5559098643501764, + "learning_rate": 3.810283961682722e-05, + "loss": 0.5583, + "step": 3230 + }, + { + "epoch": 0.3772154850746269, + "grad_norm": 0.47644518540184416, + "learning_rate": 3.806455715959032e-05, + "loss": 0.5233, + "step": 3235 + }, + { + "epoch": 0.37779850746268656, + "grad_norm": 0.4930068420607561, + "learning_rate": 3.8026235429226236e-05, + "loss": 0.5642, + "step": 3240 + }, + { + "epoch": 0.37838152985074625, + "grad_norm": 0.5290426787630793, + "learning_rate": 3.798787456819377e-05, + "loss": 0.5234, + "step": 3245 + }, + { + "epoch": 0.378964552238806, + "grad_norm": 0.4868258441116788, + "learning_rate": 3.794947471909719e-05, + "loss": 0.5395, + "step": 3250 + }, + { + "epoch": 0.37954757462686567, + "grad_norm": 0.47174968576411624, + "learning_rate": 3.791103602468569e-05, + "loss": 0.5395, + "step": 3255 + }, + { + "epoch": 0.38013059701492535, + "grad_norm": 0.5363107908185231, + "learning_rate": 3.7872558627852905e-05, + "loss": 0.5557, + "step": 3260 + }, + { + "epoch": 0.3807136194029851, + "grad_norm": 0.49188998699814623, + "learning_rate": 3.78340426716363e-05, + "loss": 0.5268, + "step": 3265 + }, + { + "epoch": 0.3812966417910448, + "grad_norm": 0.48927529818476645, + "learning_rate": 3.779548829921673e-05, + "loss": 0.5387, + "step": 3270 + }, + { + "epoch": 0.38187966417910446, + "grad_norm": 0.47364382567752494, + "learning_rate": 3.775689565391781e-05, + "loss": 0.5129, + "step": 3275 + }, + { + "epoch": 0.3824626865671642, + "grad_norm": 0.5126016691334612, + "learning_rate": 3.771826487920546e-05, + "loss": 0.5467, + "step": 3280 + }, + { + "epoch": 0.3830457089552239, + "grad_norm": 0.5430085481644645, + "learning_rate": 3.767959611868734e-05, + "loss": 0.5536, + "step": 3285 + }, + { + "epoch": 0.38362873134328357, + "grad_norm": 0.4763687695806715, + "learning_rate": 3.764088951611233e-05, + "loss": 0.5071, + "step": 3290 + }, + { + "epoch": 0.3842117537313433, + "grad_norm": 0.47538986078328227, + "learning_rate": 3.7602145215369965e-05, + "loss": 0.5431, + "step": 3295 + }, + { + "epoch": 0.384794776119403, + "grad_norm": 0.44993211618209894, + "learning_rate": 3.756336336048994e-05, + "loss": 0.5608, + "step": 3300 + }, + { + "epoch": 0.3853777985074627, + "grad_norm": 0.4857510707261703, + "learning_rate": 3.752454409564152e-05, + "loss": 0.5307, + "step": 3305 + }, + { + "epoch": 0.38596082089552236, + "grad_norm": 0.49584291918089757, + "learning_rate": 3.74856875651331e-05, + "loss": 0.6047, + "step": 3310 + }, + { + "epoch": 0.3865438432835821, + "grad_norm": 0.5136523336548113, + "learning_rate": 3.744679391341157e-05, + "loss": 0.5516, + "step": 3315 + }, + { + "epoch": 0.3871268656716418, + "grad_norm": 0.5181136974711912, + "learning_rate": 3.740786328506179e-05, + "loss": 0.5439, + "step": 3320 + }, + { + "epoch": 0.38770988805970147, + "grad_norm": 0.5089785637081085, + "learning_rate": 3.7368895824806146e-05, + "loss": 0.5585, + "step": 3325 + }, + { + "epoch": 0.3882929104477612, + "grad_norm": 0.49682305539963434, + "learning_rate": 3.732989167750388e-05, + "loss": 0.562, + "step": 3330 + }, + { + "epoch": 0.3888759328358209, + "grad_norm": 0.5243718121493276, + "learning_rate": 3.7290850988150644e-05, + "loss": 0.5851, + "step": 3335 + }, + { + "epoch": 0.3894589552238806, + "grad_norm": 0.5098618228761496, + "learning_rate": 3.7251773901877945e-05, + "loss": 0.5191, + "step": 3340 + }, + { + "epoch": 0.3900419776119403, + "grad_norm": 0.47672882853039394, + "learning_rate": 3.721266056395257e-05, + "loss": 0.5284, + "step": 3345 + }, + { + "epoch": 0.390625, + "grad_norm": 0.4822727732005445, + "learning_rate": 3.7173511119776086e-05, + "loss": 0.5794, + "step": 3350 + }, + { + "epoch": 0.3912080223880597, + "grad_norm": 0.5222854783235708, + "learning_rate": 3.713432571488427e-05, + "loss": 0.5391, + "step": 3355 + }, + { + "epoch": 0.3917910447761194, + "grad_norm": 7.9962975681807595, + "learning_rate": 3.70951044949466e-05, + "loss": 0.5534, + "step": 3360 + }, + { + "epoch": 0.3923740671641791, + "grad_norm": 0.4764348433947033, + "learning_rate": 3.705584760576566e-05, + "loss": 0.5452, + "step": 3365 + }, + { + "epoch": 0.3929570895522388, + "grad_norm": 0.49080831286781906, + "learning_rate": 3.7016555193276667e-05, + "loss": 0.5746, + "step": 3370 + }, + { + "epoch": 0.39354011194029853, + "grad_norm": 0.5101400424865905, + "learning_rate": 3.697722740354688e-05, + "loss": 0.5729, + "step": 3375 + }, + { + "epoch": 0.3941231343283582, + "grad_norm": 0.4479587001588852, + "learning_rate": 3.6937864382775076e-05, + "loss": 0.5447, + "step": 3380 + }, + { + "epoch": 0.3947061567164179, + "grad_norm": 0.48466656620240023, + "learning_rate": 3.689846627729098e-05, + "loss": 0.5327, + "step": 3385 + }, + { + "epoch": 0.39528917910447764, + "grad_norm": 0.5157423888893855, + "learning_rate": 3.685903323355477e-05, + "loss": 0.5381, + "step": 3390 + }, + { + "epoch": 0.3958722014925373, + "grad_norm": 0.4820557574738787, + "learning_rate": 3.68195653981565e-05, + "loss": 0.5183, + "step": 3395 + }, + { + "epoch": 0.396455223880597, + "grad_norm": 0.48067690178778916, + "learning_rate": 3.678006291781555e-05, + "loss": 0.6121, + "step": 3400 + }, + { + "epoch": 0.3970382462686567, + "grad_norm": 0.48140274545401546, + "learning_rate": 3.6740525939380084e-05, + "loss": 0.5763, + "step": 3405 + }, + { + "epoch": 0.39762126865671643, + "grad_norm": 0.4821460133421974, + "learning_rate": 3.6700954609826535e-05, + "loss": 0.5498, + "step": 3410 + }, + { + "epoch": 0.3982042910447761, + "grad_norm": 0.5062442615898354, + "learning_rate": 3.6661349076259015e-05, + "loss": 0.5301, + "step": 3415 + }, + { + "epoch": 0.3987873134328358, + "grad_norm": 0.4894842886999361, + "learning_rate": 3.662170948590879e-05, + "loss": 0.5647, + "step": 3420 + }, + { + "epoch": 0.39937033582089554, + "grad_norm": 0.4635809485827499, + "learning_rate": 3.658203598613375e-05, + "loss": 0.5658, + "step": 3425 + }, + { + "epoch": 0.3999533582089552, + "grad_norm": 0.4756142318658808, + "learning_rate": 3.65423287244178e-05, + "loss": 0.5434, + "step": 3430 + }, + { + "epoch": 0.4005363805970149, + "grad_norm": 0.4768793799786962, + "learning_rate": 3.6502587848370395e-05, + "loss": 0.5129, + "step": 3435 + }, + { + "epoch": 0.40111940298507465, + "grad_norm": 0.49735047427469553, + "learning_rate": 3.6462813505725915e-05, + "loss": 0.5693, + "step": 3440 + }, + { + "epoch": 0.40170242537313433, + "grad_norm": 0.46768242382881264, + "learning_rate": 3.642300584434319e-05, + "loss": 0.5424, + "step": 3445 + }, + { + "epoch": 0.402285447761194, + "grad_norm": 0.48929227885921284, + "learning_rate": 3.638316501220487e-05, + "loss": 0.5613, + "step": 3450 + }, + { + "epoch": 0.40286847014925375, + "grad_norm": 0.49035092965063865, + "learning_rate": 3.6343291157416937e-05, + "loss": 0.5747, + "step": 3455 + }, + { + "epoch": 0.40345149253731344, + "grad_norm": 0.4920873718095396, + "learning_rate": 3.630338442820814e-05, + "loss": 0.5621, + "step": 3460 + }, + { + "epoch": 0.4040345149253731, + "grad_norm": 0.5721821448291733, + "learning_rate": 3.6263444972929395e-05, + "loss": 0.5642, + "step": 3465 + }, + { + "epoch": 0.40461753731343286, + "grad_norm": 0.4900664650078509, + "learning_rate": 3.622347294005334e-05, + "loss": 0.5637, + "step": 3470 + }, + { + "epoch": 0.40520055970149255, + "grad_norm": 0.4784540540109224, + "learning_rate": 3.618346847817366e-05, + "loss": 0.5166, + "step": 3475 + }, + { + "epoch": 0.40578358208955223, + "grad_norm": 0.5033729998350225, + "learning_rate": 3.6143431736004636e-05, + "loss": 0.5604, + "step": 3480 + }, + { + "epoch": 0.4063666044776119, + "grad_norm": 0.47667240088779256, + "learning_rate": 3.610336286238051e-05, + "loss": 0.5492, + "step": 3485 + }, + { + "epoch": 0.40694962686567165, + "grad_norm": 0.4874513651352348, + "learning_rate": 3.6063262006255006e-05, + "loss": 0.5563, + "step": 3490 + }, + { + "epoch": 0.40753264925373134, + "grad_norm": 0.523091113003475, + "learning_rate": 3.602312931670073e-05, + "loss": 0.5488, + "step": 3495 + }, + { + "epoch": 0.408115671641791, + "grad_norm": 0.5531785395812051, + "learning_rate": 3.59829649429086e-05, + "loss": 0.6013, + "step": 3500 + }, + { + "epoch": 0.40869869402985076, + "grad_norm": 0.47185275980717517, + "learning_rate": 3.5942769034187354e-05, + "loss": 0.536, + "step": 3505 + }, + { + "epoch": 0.40928171641791045, + "grad_norm": 0.46288526087001597, + "learning_rate": 3.590254173996295e-05, + "loss": 0.5169, + "step": 3510 + }, + { + "epoch": 0.40986473880597013, + "grad_norm": 0.48524872738357405, + "learning_rate": 3.586228320977801e-05, + "loss": 0.5271, + "step": 3515 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 0.5450240726382458, + "learning_rate": 3.582199359329129e-05, + "loss": 0.5427, + "step": 3520 + }, + { + "epoch": 0.41103078358208955, + "grad_norm": 0.48693873450848696, + "learning_rate": 3.5781673040277084e-05, + "loss": 0.5088, + "step": 3525 + }, + { + "epoch": 0.41161380597014924, + "grad_norm": 0.46864955154362703, + "learning_rate": 3.5741321700624726e-05, + "loss": 0.5174, + "step": 3530 + }, + { + "epoch": 0.412196828358209, + "grad_norm": 0.46728302060083887, + "learning_rate": 3.570093972433794e-05, + "loss": 0.5687, + "step": 3535 + }, + { + "epoch": 0.41277985074626866, + "grad_norm": 0.4634797945645771, + "learning_rate": 3.56605272615344e-05, + "loss": 0.528, + "step": 3540 + }, + { + "epoch": 0.41336287313432835, + "grad_norm": 0.5238785746405048, + "learning_rate": 3.562008446244509e-05, + "loss": 0.5375, + "step": 3545 + }, + { + "epoch": 0.4139458955223881, + "grad_norm": 0.5187794803590449, + "learning_rate": 3.557961147741376e-05, + "loss": 0.5985, + "step": 3550 + }, + { + "epoch": 0.41452891791044777, + "grad_norm": 0.4379680762591855, + "learning_rate": 3.553910845689638e-05, + "loss": 0.5554, + "step": 3555 + }, + { + "epoch": 0.41511194029850745, + "grad_norm": 0.46577740202461215, + "learning_rate": 3.549857555146056e-05, + "loss": 0.5472, + "step": 3560 + }, + { + "epoch": 0.41569496268656714, + "grad_norm": 0.4789932179858049, + "learning_rate": 3.5458012911785036e-05, + "loss": 0.5451, + "step": 3565 + }, + { + "epoch": 0.4162779850746269, + "grad_norm": 0.49654652817675987, + "learning_rate": 3.541742068865907e-05, + "loss": 0.5513, + "step": 3570 + }, + { + "epoch": 0.41686100746268656, + "grad_norm": 0.4692894243690009, + "learning_rate": 3.537679903298187e-05, + "loss": 0.5185, + "step": 3575 + }, + { + "epoch": 0.41744402985074625, + "grad_norm": 0.47941339153241536, + "learning_rate": 3.53361480957621e-05, + "loss": 0.5692, + "step": 3580 + }, + { + "epoch": 0.418027052238806, + "grad_norm": 0.46544865766435817, + "learning_rate": 3.529546802811725e-05, + "loss": 0.5405, + "step": 3585 + }, + { + "epoch": 0.41861007462686567, + "grad_norm": 0.46403588392065775, + "learning_rate": 3.5254758981273106e-05, + "loss": 0.5437, + "step": 3590 + }, + { + "epoch": 0.41919309701492535, + "grad_norm": 0.5179906749611947, + "learning_rate": 3.521402110656318e-05, + "loss": 0.5593, + "step": 3595 + }, + { + "epoch": 0.4197761194029851, + "grad_norm": 0.4781025455029735, + "learning_rate": 3.517325455542815e-05, + "loss": 0.5498, + "step": 3600 + }, + { + "epoch": 0.4203591417910448, + "grad_norm": 0.4934656687592932, + "learning_rate": 3.513245947941531e-05, + "loss": 0.5215, + "step": 3605 + }, + { + "epoch": 0.42094216417910446, + "grad_norm": 0.5326144345367541, + "learning_rate": 3.5091636030177995e-05, + "loss": 0.535, + "step": 3610 + }, + { + "epoch": 0.4215251865671642, + "grad_norm": 0.47801066829681754, + "learning_rate": 3.505078435947498e-05, + "loss": 0.5137, + "step": 3615 + }, + { + "epoch": 0.4221082089552239, + "grad_norm": 0.5286215614880156, + "learning_rate": 3.500990461916998e-05, + "loss": 0.5733, + "step": 3620 + }, + { + "epoch": 0.42269123134328357, + "grad_norm": 0.5073258841460547, + "learning_rate": 3.496899696123107e-05, + "loss": 0.5746, + "step": 3625 + }, + { + "epoch": 0.4232742537313433, + "grad_norm": 0.48126213143188057, + "learning_rate": 3.492806153773007e-05, + "loss": 0.559, + "step": 3630 + }, + { + "epoch": 0.423857276119403, + "grad_norm": 0.43727021104667335, + "learning_rate": 3.488709850084206e-05, + "loss": 0.4998, + "step": 3635 + }, + { + "epoch": 0.4244402985074627, + "grad_norm": 0.4897298712066961, + "learning_rate": 3.484610800284473e-05, + "loss": 0.5463, + "step": 3640 + }, + { + "epoch": 0.42502332089552236, + "grad_norm": 0.523356089677995, + "learning_rate": 3.480509019611788e-05, + "loss": 0.5659, + "step": 3645 + }, + { + "epoch": 0.4256063432835821, + "grad_norm": 0.5277720573146358, + "learning_rate": 3.476404523314282e-05, + "loss": 0.5241, + "step": 3650 + }, + { + "epoch": 0.4261893656716418, + "grad_norm": 0.526635621277189, + "learning_rate": 3.472297326650183e-05, + "loss": 0.5543, + "step": 3655 + }, + { + "epoch": 0.42677238805970147, + "grad_norm": 0.45989723897065593, + "learning_rate": 3.468187444887754e-05, + "loss": 0.4939, + "step": 3660 + }, + { + "epoch": 0.4273554104477612, + "grad_norm": 0.4622944623408051, + "learning_rate": 3.464074893305242e-05, + "loss": 0.5297, + "step": 3665 + }, + { + "epoch": 0.4279384328358209, + "grad_norm": 0.4783006627278798, + "learning_rate": 3.45995968719082e-05, + "loss": 0.5255, + "step": 3670 + }, + { + "epoch": 0.4285214552238806, + "grad_norm": 0.5059594374936409, + "learning_rate": 3.455841841842524e-05, + "loss": 0.5773, + "step": 3675 + }, + { + "epoch": 0.4291044776119403, + "grad_norm": 0.44346639279558187, + "learning_rate": 3.4517213725682085e-05, + "loss": 0.5196, + "step": 3680 + }, + { + "epoch": 0.4296875, + "grad_norm": 0.4742923141132099, + "learning_rate": 3.447598294685476e-05, + "loss": 0.5211, + "step": 3685 + }, + { + "epoch": 0.4302705223880597, + "grad_norm": 0.5096994312042691, + "learning_rate": 3.443472623521631e-05, + "loss": 0.5576, + "step": 3690 + }, + { + "epoch": 0.4308535447761194, + "grad_norm": 0.4757579513441239, + "learning_rate": 3.4393443744136136e-05, + "loss": 0.5342, + "step": 3695 + }, + { + "epoch": 0.4314365671641791, + "grad_norm": 0.4525723387319913, + "learning_rate": 3.435213562707953e-05, + "loss": 0.521, + "step": 3700 + }, + { + "epoch": 0.4320195895522388, + "grad_norm": 0.4655324458390628, + "learning_rate": 3.431080203760699e-05, + "loss": 0.5143, + "step": 3705 + }, + { + "epoch": 0.43260261194029853, + "grad_norm": 0.46493164741995624, + "learning_rate": 3.426944312937376e-05, + "loss": 0.5448, + "step": 3710 + }, + { + "epoch": 0.4331856343283582, + "grad_norm": 0.4705834455942846, + "learning_rate": 3.422805905612914e-05, + "loss": 0.5132, + "step": 3715 + }, + { + "epoch": 0.4337686567164179, + "grad_norm": 0.44735510242287085, + "learning_rate": 3.4186649971716044e-05, + "loss": 0.5078, + "step": 3720 + }, + { + "epoch": 0.43435167910447764, + "grad_norm": 0.4248977052828472, + "learning_rate": 3.4145216030070344e-05, + "loss": 0.5224, + "step": 3725 + }, + { + "epoch": 0.4349347014925373, + "grad_norm": 0.48361810889787654, + "learning_rate": 3.410375738522028e-05, + "loss": 0.5696, + "step": 3730 + }, + { + "epoch": 0.435517723880597, + "grad_norm": 0.5169029233248057, + "learning_rate": 3.406227419128596e-05, + "loss": 0.548, + "step": 3735 + }, + { + "epoch": 0.4361007462686567, + "grad_norm": 0.47352743387761187, + "learning_rate": 3.402076660247878e-05, + "loss": 0.5735, + "step": 3740 + }, + { + "epoch": 0.43668376865671643, + "grad_norm": 0.4598793206254637, + "learning_rate": 3.397923477310074e-05, + "loss": 0.5286, + "step": 3745 + }, + { + "epoch": 0.4372667910447761, + "grad_norm": 0.7329855818387587, + "learning_rate": 3.393767885754405e-05, + "loss": 0.5274, + "step": 3750 + }, + { + "epoch": 0.4378498134328358, + "grad_norm": 0.5374277209323949, + "learning_rate": 3.389609901029038e-05, + "loss": 0.569, + "step": 3755 + }, + { + "epoch": 0.43843283582089554, + "grad_norm": 0.4851132196571775, + "learning_rate": 3.38544953859104e-05, + "loss": 0.5658, + "step": 3760 + }, + { + "epoch": 0.4390158582089552, + "grad_norm": 0.4588569006808473, + "learning_rate": 3.381286813906317e-05, + "loss": 0.5463, + "step": 3765 + }, + { + "epoch": 0.4395988805970149, + "grad_norm": 0.49195337073823053, + "learning_rate": 3.3771217424495555e-05, + "loss": 0.5855, + "step": 3770 + }, + { + "epoch": 0.44018190298507465, + "grad_norm": 0.47991606249122304, + "learning_rate": 3.372954339704167e-05, + "loss": 0.5496, + "step": 3775 + }, + { + "epoch": 0.44076492537313433, + "grad_norm": 0.4690325622228386, + "learning_rate": 3.368784621162229e-05, + "loss": 0.5647, + "step": 3780 + }, + { + "epoch": 0.441347947761194, + "grad_norm": 0.5399827946397292, + "learning_rate": 3.364612602324429e-05, + "loss": 0.554, + "step": 3785 + }, + { + "epoch": 0.44193097014925375, + "grad_norm": 0.44956816879532197, + "learning_rate": 3.3604382987000016e-05, + "loss": 0.5033, + "step": 3790 + }, + { + "epoch": 0.44251399253731344, + "grad_norm": 0.49304141117786027, + "learning_rate": 3.356261725806681e-05, + "loss": 0.5651, + "step": 3795 + }, + { + "epoch": 0.4430970149253731, + "grad_norm": 0.4984710533841555, + "learning_rate": 3.352082899170631e-05, + "loss": 0.5238, + "step": 3800 + }, + { + "epoch": 0.44368003731343286, + "grad_norm": 0.5007565543332921, + "learning_rate": 3.3479018343264e-05, + "loss": 0.5519, + "step": 3805 + }, + { + "epoch": 0.44426305970149255, + "grad_norm": 0.5011462730265147, + "learning_rate": 3.343718546816852e-05, + "loss": 0.5523, + "step": 3810 + }, + { + "epoch": 0.44484608208955223, + "grad_norm": 0.49568146210324576, + "learning_rate": 3.339533052193114e-05, + "loss": 0.5132, + "step": 3815 + }, + { + "epoch": 0.4454291044776119, + "grad_norm": 0.45124156599655735, + "learning_rate": 3.335345366014522e-05, + "loss": 0.55, + "step": 3820 + }, + { + "epoch": 0.44601212686567165, + "grad_norm": 0.47223468640192207, + "learning_rate": 3.331155503848553e-05, + "loss": 0.5438, + "step": 3825 + }, + { + "epoch": 0.44659514925373134, + "grad_norm": 0.5287373716087295, + "learning_rate": 3.326963481270778e-05, + "loss": 0.5411, + "step": 3830 + }, + { + "epoch": 0.447178171641791, + "grad_norm": 0.44674599573765417, + "learning_rate": 3.322769313864796e-05, + "loss": 0.5363, + "step": 3835 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.4434254783508246, + "learning_rate": 3.3185730172221814e-05, + "loss": 0.537, + "step": 3840 + }, + { + "epoch": 0.44834421641791045, + "grad_norm": 0.5046794647102569, + "learning_rate": 3.3143746069424215e-05, + "loss": 0.5582, + "step": 3845 + }, + { + "epoch": 0.44892723880597013, + "grad_norm": 0.489005919425733, + "learning_rate": 3.310174098632865e-05, + "loss": 0.5549, + "step": 3850 + }, + { + "epoch": 0.44951026119402987, + "grad_norm": 0.49266732159431287, + "learning_rate": 3.305971507908655e-05, + "loss": 0.5313, + "step": 3855 + }, + { + "epoch": 0.45009328358208955, + "grad_norm": 0.5315237555379242, + "learning_rate": 3.301766850392681e-05, + "loss": 0.5581, + "step": 3860 + }, + { + "epoch": 0.45067630597014924, + "grad_norm": 0.46700902921771087, + "learning_rate": 3.29756014171551e-05, + "loss": 0.5371, + "step": 3865 + }, + { + "epoch": 0.451259328358209, + "grad_norm": 0.4395133061533244, + "learning_rate": 3.2933513975153384e-05, + "loss": 0.5399, + "step": 3870 + }, + { + "epoch": 0.45184235074626866, + "grad_norm": 0.485910728377908, + "learning_rate": 3.2891406334379285e-05, + "loss": 0.525, + "step": 3875 + }, + { + "epoch": 0.45242537313432835, + "grad_norm": 0.46551066433925664, + "learning_rate": 3.284927865136551e-05, + "loss": 0.4913, + "step": 3880 + }, + { + "epoch": 0.4530083955223881, + "grad_norm": 0.4792517888689515, + "learning_rate": 3.280713108271926e-05, + "loss": 0.5237, + "step": 3885 + }, + { + "epoch": 0.45359141791044777, + "grad_norm": 0.44580552095045006, + "learning_rate": 3.276496378512168e-05, + "loss": 0.5716, + "step": 3890 + }, + { + "epoch": 0.45417444029850745, + "grad_norm": 0.46092489328912967, + "learning_rate": 3.272277691532725e-05, + "loss": 0.5402, + "step": 3895 + }, + { + "epoch": 0.45475746268656714, + "grad_norm": 0.4413897932753195, + "learning_rate": 3.268057063016319e-05, + "loss": 0.5305, + "step": 3900 + }, + { + "epoch": 0.4553404850746269, + "grad_norm": 0.4782676785091174, + "learning_rate": 3.263834508652894e-05, + "loss": 0.5946, + "step": 3905 + }, + { + "epoch": 0.45592350746268656, + "grad_norm": 0.5238873675452039, + "learning_rate": 3.259610044139548e-05, + "loss": 0.5478, + "step": 3910 + }, + { + "epoch": 0.45650652985074625, + "grad_norm": 0.5182499166196407, + "learning_rate": 3.255383685180484e-05, + "loss": 0.5454, + "step": 3915 + }, + { + "epoch": 0.457089552238806, + "grad_norm": 0.47638810180727925, + "learning_rate": 3.251155447486945e-05, + "loss": 0.5446, + "step": 3920 + }, + { + "epoch": 0.45767257462686567, + "grad_norm": 0.4523229716036492, + "learning_rate": 3.246925346777158e-05, + "loss": 0.522, + "step": 3925 + }, + { + "epoch": 0.45825559701492535, + "grad_norm": 0.46909930899902835, + "learning_rate": 3.2426933987762785e-05, + "loss": 0.5393, + "step": 3930 + }, + { + "epoch": 0.4588386194029851, + "grad_norm": 0.43989917162010367, + "learning_rate": 3.238459619216326e-05, + "loss": 0.5211, + "step": 3935 + }, + { + "epoch": 0.4594216417910448, + "grad_norm": 0.48026394452289217, + "learning_rate": 3.23422402383613e-05, + "loss": 0.5547, + "step": 3940 + }, + { + "epoch": 0.46000466417910446, + "grad_norm": 0.48185947326085543, + "learning_rate": 3.22998662838127e-05, + "loss": 0.5041, + "step": 3945 + }, + { + "epoch": 0.4605876865671642, + "grad_norm": 0.4734018707747314, + "learning_rate": 3.2257474486040166e-05, + "loss": 0.5038, + "step": 3950 + }, + { + "epoch": 0.4611707089552239, + "grad_norm": 0.5075617095429634, + "learning_rate": 3.221506500263276e-05, + "loss": 0.5447, + "step": 3955 + }, + { + "epoch": 0.46175373134328357, + "grad_norm": 0.5046083432543891, + "learning_rate": 3.217263799124527e-05, + "loss": 0.5772, + "step": 3960 + }, + { + "epoch": 0.4623367537313433, + "grad_norm": 0.47069172866304554, + "learning_rate": 3.213019360959762e-05, + "loss": 0.5341, + "step": 3965 + }, + { + "epoch": 0.462919776119403, + "grad_norm": 0.47866635232292687, + "learning_rate": 3.2087732015474366e-05, + "loss": 0.5208, + "step": 3970 + }, + { + "epoch": 0.4635027985074627, + "grad_norm": 0.4516545061013375, + "learning_rate": 3.204525336672399e-05, + "loss": 0.5382, + "step": 3975 + }, + { + "epoch": 0.46408582089552236, + "grad_norm": 0.4987443516763455, + "learning_rate": 3.200275782125842e-05, + "loss": 0.5319, + "step": 3980 + }, + { + "epoch": 0.4646688432835821, + "grad_norm": 0.5260827157599703, + "learning_rate": 3.196024553705235e-05, + "loss": 0.5355, + "step": 3985 + }, + { + "epoch": 0.4652518656716418, + "grad_norm": 0.4668030990105745, + "learning_rate": 3.1917716672142746e-05, + "loss": 0.5057, + "step": 3990 + }, + { + "epoch": 0.46583488805970147, + "grad_norm": 0.4426983151393079, + "learning_rate": 3.187517138462819e-05, + "loss": 0.5254, + "step": 3995 + }, + { + "epoch": 0.4664179104477612, + "grad_norm": 0.4925988353031197, + "learning_rate": 3.1832609832668314e-05, + "loss": 0.5422, + "step": 4000 + }, + { + "epoch": 0.4670009328358209, + "grad_norm": 0.4560283471593519, + "learning_rate": 3.179003217448321e-05, + "loss": 0.5013, + "step": 4005 + }, + { + "epoch": 0.4675839552238806, + "grad_norm": 0.5244233872415475, + "learning_rate": 3.1747438568352844e-05, + "loss": 0.5736, + "step": 4010 + }, + { + "epoch": 0.4681669776119403, + "grad_norm": 0.5512150192012447, + "learning_rate": 3.170482917261648e-05, + "loss": 0.5688, + "step": 4015 + }, + { + "epoch": 0.46875, + "grad_norm": 0.5009381195241619, + "learning_rate": 3.166220414567206e-05, + "loss": 0.5468, + "step": 4020 + }, + { + "epoch": 0.4693330223880597, + "grad_norm": 0.47235216856933904, + "learning_rate": 3.161956364597566e-05, + "loss": 0.5318, + "step": 4025 + }, + { + "epoch": 0.4699160447761194, + "grad_norm": 0.5634765857129355, + "learning_rate": 3.1576907832040855e-05, + "loss": 0.5188, + "step": 4030 + }, + { + "epoch": 0.4704990671641791, + "grad_norm": 0.48072091370811076, + "learning_rate": 3.153423686243813e-05, + "loss": 0.5313, + "step": 4035 + }, + { + "epoch": 0.4710820895522388, + "grad_norm": 0.5120155067954245, + "learning_rate": 3.149155089579437e-05, + "loss": 0.5572, + "step": 4040 + }, + { + "epoch": 0.47166511194029853, + "grad_norm": 0.4812745114629162, + "learning_rate": 3.144885009079215e-05, + "loss": 0.5578, + "step": 4045 + }, + { + "epoch": 0.4722481343283582, + "grad_norm": 0.5058486137109877, + "learning_rate": 3.140613460616924e-05, + "loss": 0.5199, + "step": 4050 + }, + { + "epoch": 0.4728311567164179, + "grad_norm": 0.48002106325579913, + "learning_rate": 3.1363404600717965e-05, + "loss": 0.5659, + "step": 4055 + }, + { + "epoch": 0.47341417910447764, + "grad_norm": 0.48245091505853244, + "learning_rate": 3.132066023328465e-05, + "loss": 0.533, + "step": 4060 + }, + { + "epoch": 0.4739972014925373, + "grad_norm": 0.47536248109948226, + "learning_rate": 3.1277901662768983e-05, + "loss": 0.5433, + "step": 4065 + }, + { + "epoch": 0.474580223880597, + "grad_norm": 0.47200232932636466, + "learning_rate": 3.123512904812347e-05, + "loss": 0.5322, + "step": 4070 + }, + { + "epoch": 0.4751632462686567, + "grad_norm": 0.44740442637527467, + "learning_rate": 3.119234254835282e-05, + "loss": 0.5107, + "step": 4075 + }, + { + "epoch": 0.47574626865671643, + "grad_norm": 0.46154208469718466, + "learning_rate": 3.114954232251336e-05, + "loss": 0.527, + "step": 4080 + }, + { + "epoch": 0.4763292910447761, + "grad_norm": 0.5341929895695878, + "learning_rate": 3.110672852971243e-05, + "loss": 0.5689, + "step": 4085 + }, + { + "epoch": 0.4769123134328358, + "grad_norm": 0.4652305557977976, + "learning_rate": 3.1063901329107843e-05, + "loss": 0.5167, + "step": 4090 + }, + { + "epoch": 0.47749533582089554, + "grad_norm": 0.44417381469876005, + "learning_rate": 3.10210608799072e-05, + "loss": 0.5413, + "step": 4095 + }, + { + "epoch": 0.4780783582089552, + "grad_norm": 0.4507749342993046, + "learning_rate": 3.097820734136739e-05, + "loss": 0.5282, + "step": 4100 + }, + { + "epoch": 0.4786613805970149, + "grad_norm": 0.48297525105833106, + "learning_rate": 3.093534087279397e-05, + "loss": 0.5347, + "step": 4105 + }, + { + "epoch": 0.47924440298507465, + "grad_norm": 0.5100316904667305, + "learning_rate": 3.089246163354051e-05, + "loss": 0.5395, + "step": 4110 + }, + { + "epoch": 0.47982742537313433, + "grad_norm": 0.4995786265506999, + "learning_rate": 3.084956978300812e-05, + "loss": 0.5311, + "step": 4115 + }, + { + "epoch": 0.480410447761194, + "grad_norm": 0.4471832602242875, + "learning_rate": 3.080666548064475e-05, + "loss": 0.5193, + "step": 4120 + }, + { + "epoch": 0.48099347014925375, + "grad_norm": 0.46140397088213214, + "learning_rate": 3.076374888594464e-05, + "loss": 0.5345, + "step": 4125 + }, + { + "epoch": 0.48157649253731344, + "grad_norm": 0.44371711691404125, + "learning_rate": 3.0720820158447766e-05, + "loss": 0.5072, + "step": 4130 + }, + { + "epoch": 0.4821595149253731, + "grad_norm": 0.4561270995882789, + "learning_rate": 3.067787945773915e-05, + "loss": 0.5181, + "step": 4135 + }, + { + "epoch": 0.48274253731343286, + "grad_norm": 0.4468599316613869, + "learning_rate": 3.063492694344835e-05, + "loss": 0.5286, + "step": 4140 + }, + { + "epoch": 0.48332555970149255, + "grad_norm": 0.4921853335929309, + "learning_rate": 3.059196277524886e-05, + "loss": 0.5075, + "step": 4145 + }, + { + "epoch": 0.48390858208955223, + "grad_norm": 0.47049020704612843, + "learning_rate": 3.054898711285747e-05, + "loss": 0.5475, + "step": 4150 + }, + { + "epoch": 0.4844916044776119, + "grad_norm": 1.0060177708740357, + "learning_rate": 3.05060001160337e-05, + "loss": 0.5293, + "step": 4155 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 0.48784465637028546, + "learning_rate": 3.046300194457923e-05, + "loss": 0.534, + "step": 4160 + }, + { + "epoch": 0.48565764925373134, + "grad_norm": 0.4518879881598059, + "learning_rate": 3.0419992758337235e-05, + "loss": 0.5209, + "step": 4165 + }, + { + "epoch": 0.486240671641791, + "grad_norm": 0.4452974474644216, + "learning_rate": 3.0376972717191894e-05, + "loss": 0.5245, + "step": 4170 + }, + { + "epoch": 0.48682369402985076, + "grad_norm": 0.44833371464722693, + "learning_rate": 3.0333941981067688e-05, + "loss": 0.5156, + "step": 4175 + }, + { + "epoch": 0.48740671641791045, + "grad_norm": 0.44934997583495073, + "learning_rate": 3.029090070992889e-05, + "loss": 0.5144, + "step": 4180 + }, + { + "epoch": 0.48798973880597013, + "grad_norm": 0.47274610781733095, + "learning_rate": 3.0247849063778917e-05, + "loss": 0.5317, + "step": 4185 + }, + { + "epoch": 0.48857276119402987, + "grad_norm": 0.47590787280797764, + "learning_rate": 3.020478720265977e-05, + "loss": 0.51, + "step": 4190 + }, + { + "epoch": 0.48915578358208955, + "grad_norm": 0.44563102351746864, + "learning_rate": 3.01617152866514e-05, + "loss": 0.5301, + "step": 4195 + }, + { + "epoch": 0.48973880597014924, + "grad_norm": 0.4916861483726837, + "learning_rate": 3.0118633475871167e-05, + "loss": 0.5772, + "step": 4200 + }, + { + "epoch": 0.490321828358209, + "grad_norm": 0.4684037067244426, + "learning_rate": 3.0075541930473183e-05, + "loss": 0.4969, + "step": 4205 + }, + { + "epoch": 0.49090485074626866, + "grad_norm": 0.4797812232043466, + "learning_rate": 3.0032440810647783e-05, + "loss": 0.5038, + "step": 4210 + }, + { + "epoch": 0.49148787313432835, + "grad_norm": 0.4525162593921102, + "learning_rate": 2.998933027662086e-05, + "loss": 0.5266, + "step": 4215 + }, + { + "epoch": 0.4920708955223881, + "grad_norm": 0.4822829175013609, + "learning_rate": 2.9946210488653316e-05, + "loss": 0.5243, + "step": 4220 + }, + { + "epoch": 0.49265391791044777, + "grad_norm": 0.45841709338227005, + "learning_rate": 2.9903081607040473e-05, + "loss": 0.4948, + "step": 4225 + }, + { + "epoch": 0.49323694029850745, + "grad_norm": 0.4465507453121582, + "learning_rate": 2.9859943792111422e-05, + "loss": 0.5228, + "step": 4230 + }, + { + "epoch": 0.49381996268656714, + "grad_norm": 0.613185904602815, + "learning_rate": 2.9816797204228497e-05, + "loss": 0.5345, + "step": 4235 + }, + { + "epoch": 0.4944029850746269, + "grad_norm": 0.49396745715444307, + "learning_rate": 2.9773642003786627e-05, + "loss": 0.5412, + "step": 4240 + }, + { + "epoch": 0.49498600746268656, + "grad_norm": 0.5239548031082005, + "learning_rate": 2.9730478351212754e-05, + "loss": 0.5821, + "step": 4245 + }, + { + "epoch": 0.49556902985074625, + "grad_norm": 0.4221114790983556, + "learning_rate": 2.968730640696526e-05, + "loss": 0.5199, + "step": 4250 + }, + { + "epoch": 0.496152052238806, + "grad_norm": 0.480540611050451, + "learning_rate": 2.9644126331533328e-05, + "loss": 0.5149, + "step": 4255 + }, + { + "epoch": 0.49673507462686567, + "grad_norm": 0.4972848047633413, + "learning_rate": 2.9600938285436385e-05, + "loss": 0.5155, + "step": 4260 + }, + { + "epoch": 0.49731809701492535, + "grad_norm": 0.5100758433643319, + "learning_rate": 2.9557742429223478e-05, + "loss": 0.5495, + "step": 4265 + }, + { + "epoch": 0.4979011194029851, + "grad_norm": 0.46169645444430685, + "learning_rate": 2.95145389234727e-05, + "loss": 0.5432, + "step": 4270 + }, + { + "epoch": 0.4984841417910448, + "grad_norm": 0.478683421695544, + "learning_rate": 2.947132792879056e-05, + "loss": 0.5622, + "step": 4275 + }, + { + "epoch": 0.49906716417910446, + "grad_norm": 0.4937574145856353, + "learning_rate": 2.9428109605811427e-05, + "loss": 0.5292, + "step": 4280 + }, + { + "epoch": 0.4996501865671642, + "grad_norm": 0.4482984531417574, + "learning_rate": 2.9384884115196898e-05, + "loss": 0.5115, + "step": 4285 + }, + { + "epoch": 0.5002332089552238, + "grad_norm": 0.4581506836353169, + "learning_rate": 2.9341651617635236e-05, + "loss": 0.5369, + "step": 4290 + }, + { + "epoch": 0.5008162313432836, + "grad_norm": 0.4636121196389006, + "learning_rate": 2.929841227384072e-05, + "loss": 0.563, + "step": 4295 + }, + { + "epoch": 0.5013992537313433, + "grad_norm": 0.49303476266346524, + "learning_rate": 2.925516624455311e-05, + "loss": 0.5323, + "step": 4300 + }, + { + "epoch": 0.5019822761194029, + "grad_norm": 0.4792765198500162, + "learning_rate": 2.9211913690537003e-05, + "loss": 0.5238, + "step": 4305 + }, + { + "epoch": 0.5025652985074627, + "grad_norm": 0.47595384636734767, + "learning_rate": 2.9168654772581257e-05, + "loss": 0.5487, + "step": 4310 + }, + { + "epoch": 0.5031483208955224, + "grad_norm": 0.45475337905445856, + "learning_rate": 2.9125389651498374e-05, + "loss": 0.5551, + "step": 4315 + }, + { + "epoch": 0.503731343283582, + "grad_norm": 0.45368396432436275, + "learning_rate": 2.908211848812394e-05, + "loss": 0.5096, + "step": 4320 + }, + { + "epoch": 0.5043143656716418, + "grad_norm": 0.4720732964199861, + "learning_rate": 2.903884144331598e-05, + "loss": 0.5375, + "step": 4325 + }, + { + "epoch": 0.5048973880597015, + "grad_norm": 0.4609507788137596, + "learning_rate": 2.89955586779544e-05, + "loss": 0.5193, + "step": 4330 + }, + { + "epoch": 0.5054804104477612, + "grad_norm": 0.4437193884044816, + "learning_rate": 2.8952270352940362e-05, + "loss": 0.538, + "step": 4335 + }, + { + "epoch": 0.5060634328358209, + "grad_norm": 0.4686721143893824, + "learning_rate": 2.89089766291957e-05, + "loss": 0.5359, + "step": 4340 + }, + { + "epoch": 0.5066464552238806, + "grad_norm": 0.48522442554794026, + "learning_rate": 2.886567766766231e-05, + "loss": 0.5367, + "step": 4345 + }, + { + "epoch": 0.5072294776119403, + "grad_norm": 0.4682334708596764, + "learning_rate": 2.8822373629301573e-05, + "loss": 0.4979, + "step": 4350 + }, + { + "epoch": 0.5078125, + "grad_norm": 0.4461441009977717, + "learning_rate": 2.8779064675093724e-05, + "loss": 0.5473, + "step": 4355 + }, + { + "epoch": 0.5083955223880597, + "grad_norm": 0.523425795341376, + "learning_rate": 2.8735750966037295e-05, + "loss": 0.592, + "step": 4360 + }, + { + "epoch": 0.5089785447761194, + "grad_norm": 0.41208414865400855, + "learning_rate": 2.869243266314847e-05, + "loss": 0.5146, + "step": 4365 + }, + { + "epoch": 0.5095615671641791, + "grad_norm": 0.45512638703063274, + "learning_rate": 2.8649109927460533e-05, + "loss": 0.517, + "step": 4370 + }, + { + "epoch": 0.5101445895522388, + "grad_norm": 0.449015485615699, + "learning_rate": 2.8605782920023227e-05, + "loss": 0.5235, + "step": 4375 + }, + { + "epoch": 0.5107276119402985, + "grad_norm": 0.49156178005481366, + "learning_rate": 2.8562451801902197e-05, + "loss": 0.5308, + "step": 4380 + }, + { + "epoch": 0.5113106343283582, + "grad_norm": 0.4348710846671963, + "learning_rate": 2.8519116734178336e-05, + "loss": 0.5387, + "step": 4385 + }, + { + "epoch": 0.511893656716418, + "grad_norm": 0.4902492973827986, + "learning_rate": 2.8475777877947264e-05, + "loss": 0.5417, + "step": 4390 + }, + { + "epoch": 0.5124766791044776, + "grad_norm": 0.6547931036800456, + "learning_rate": 2.843243539431863e-05, + "loss": 0.5444, + "step": 4395 + }, + { + "epoch": 0.5130597014925373, + "grad_norm": 0.4876654982418252, + "learning_rate": 2.838908944441562e-05, + "loss": 0.5558, + "step": 4400 + }, + { + "epoch": 0.5136427238805971, + "grad_norm": 0.46099868761271035, + "learning_rate": 2.834574018937428e-05, + "loss": 0.5605, + "step": 4405 + }, + { + "epoch": 0.5142257462686567, + "grad_norm": 0.4545024437386275, + "learning_rate": 2.8302387790342943e-05, + "loss": 0.5175, + "step": 4410 + }, + { + "epoch": 0.5148087686567164, + "grad_norm": 0.4336617180340361, + "learning_rate": 2.8259032408481635e-05, + "loss": 0.5038, + "step": 4415 + }, + { + "epoch": 0.5153917910447762, + "grad_norm": 0.47838755970386043, + "learning_rate": 2.8215674204961462e-05, + "loss": 0.5594, + "step": 4420 + }, + { + "epoch": 0.5159748134328358, + "grad_norm": 0.5142813582002242, + "learning_rate": 2.817231334096403e-05, + "loss": 0.5177, + "step": 4425 + }, + { + "epoch": 0.5165578358208955, + "grad_norm": 0.44541441218084665, + "learning_rate": 2.812894997768083e-05, + "loss": 0.5465, + "step": 4430 + }, + { + "epoch": 0.5171408582089553, + "grad_norm": 0.4560004171104499, + "learning_rate": 2.8085584276312644e-05, + "loss": 0.5201, + "step": 4435 + }, + { + "epoch": 0.5177238805970149, + "grad_norm": 0.47964290891517664, + "learning_rate": 2.8042216398068942e-05, + "loss": 0.5247, + "step": 4440 + }, + { + "epoch": 0.5183069029850746, + "grad_norm": 0.8168752970003079, + "learning_rate": 2.7998846504167308e-05, + "loss": 0.5277, + "step": 4445 + }, + { + "epoch": 0.5188899253731343, + "grad_norm": 2.544656980904975, + "learning_rate": 2.7955474755832784e-05, + "loss": 0.505, + "step": 4450 + }, + { + "epoch": 0.519472947761194, + "grad_norm": 0.4659395065613913, + "learning_rate": 2.7912101314297327e-05, + "loss": 0.5269, + "step": 4455 + }, + { + "epoch": 0.5200559701492538, + "grad_norm": 2.6969407750363943, + "learning_rate": 2.7868726340799184e-05, + "loss": 0.5306, + "step": 4460 + }, + { + "epoch": 0.5206389925373134, + "grad_norm": 0.5270843633040919, + "learning_rate": 2.7825349996582313e-05, + "loss": 0.5711, + "step": 4465 + }, + { + "epoch": 0.5212220149253731, + "grad_norm": 0.5091777799722457, + "learning_rate": 2.7781972442895726e-05, + "loss": 0.5365, + "step": 4470 + }, + { + "epoch": 0.5218050373134329, + "grad_norm": 1.3533639882343287, + "learning_rate": 2.7738593840992975e-05, + "loss": 0.5539, + "step": 4475 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.4547594298673975, + "learning_rate": 2.769521435213149e-05, + "loss": 0.544, + "step": 4480 + }, + { + "epoch": 0.5229710820895522, + "grad_norm": 0.5510873748275549, + "learning_rate": 2.7651834137572003e-05, + "loss": 0.5353, + "step": 4485 + }, + { + "epoch": 0.523554104477612, + "grad_norm": 0.4328373857127542, + "learning_rate": 2.760845335857793e-05, + "loss": 0.5132, + "step": 4490 + }, + { + "epoch": 0.5241371268656716, + "grad_norm": 0.45053079578268007, + "learning_rate": 2.7565072176414803e-05, + "loss": 0.5231, + "step": 4495 + }, + { + "epoch": 0.5247201492537313, + "grad_norm": 0.49741815715434656, + "learning_rate": 2.7521690752349643e-05, + "loss": 0.5281, + "step": 4500 + }, + { + "epoch": 0.5253031716417911, + "grad_norm": 0.4607291755452952, + "learning_rate": 2.7478309247650362e-05, + "loss": 0.5558, + "step": 4505 + }, + { + "epoch": 0.5258861940298507, + "grad_norm": 0.5194917261722904, + "learning_rate": 2.7434927823585206e-05, + "loss": 0.539, + "step": 4510 + }, + { + "epoch": 0.5264692164179104, + "grad_norm": 0.45463126850370955, + "learning_rate": 2.739154664142208e-05, + "loss": 0.5215, + "step": 4515 + }, + { + "epoch": 0.5270522388059702, + "grad_norm": 0.4892107161079775, + "learning_rate": 2.7348165862428e-05, + "loss": 0.4891, + "step": 4520 + }, + { + "epoch": 0.5276352611940298, + "grad_norm": 0.5045457990411999, + "learning_rate": 2.7304785647868507e-05, + "loss": 0.5416, + "step": 4525 + }, + { + "epoch": 0.5282182835820896, + "grad_norm": 0.44346437150982526, + "learning_rate": 2.726140615900703e-05, + "loss": 0.5048, + "step": 4530 + }, + { + "epoch": 0.5288013059701493, + "grad_norm": 0.48666851854991494, + "learning_rate": 2.7218027557104286e-05, + "loss": 0.5034, + "step": 4535 + }, + { + "epoch": 0.5293843283582089, + "grad_norm": 0.4611162809005146, + "learning_rate": 2.7174650003417696e-05, + "loss": 0.5142, + "step": 4540 + }, + { + "epoch": 0.5299673507462687, + "grad_norm": 0.45514376590541833, + "learning_rate": 2.7131273659200818e-05, + "loss": 0.5162, + "step": 4545 + }, + { + "epoch": 0.5305503731343284, + "grad_norm": 0.48960501435122616, + "learning_rate": 2.7087898685702685e-05, + "loss": 0.5069, + "step": 4550 + }, + { + "epoch": 0.531133395522388, + "grad_norm": 0.4797862989191752, + "learning_rate": 2.704452524416722e-05, + "loss": 0.531, + "step": 4555 + }, + { + "epoch": 0.5317164179104478, + "grad_norm": 0.5893011284468495, + "learning_rate": 2.7001153495832697e-05, + "loss": 0.5061, + "step": 4560 + }, + { + "epoch": 0.5322994402985075, + "grad_norm": 0.4461810976300674, + "learning_rate": 2.6957783601931063e-05, + "loss": 0.5203, + "step": 4565 + }, + { + "epoch": 0.5328824626865671, + "grad_norm": 0.49982880523004874, + "learning_rate": 2.691441572368737e-05, + "loss": 0.5284, + "step": 4570 + }, + { + "epoch": 0.5334654850746269, + "grad_norm": 0.8194747676964631, + "learning_rate": 2.6871050022319177e-05, + "loss": 0.5642, + "step": 4575 + }, + { + "epoch": 0.5340485074626866, + "grad_norm": 0.45941020004877126, + "learning_rate": 2.6827686659035983e-05, + "loss": 0.5444, + "step": 4580 + }, + { + "epoch": 0.5346315298507462, + "grad_norm": 0.7401627959971693, + "learning_rate": 2.678432579503855e-05, + "loss": 0.5138, + "step": 4585 + }, + { + "epoch": 0.535214552238806, + "grad_norm": 0.49796442033400656, + "learning_rate": 2.6740967591518374e-05, + "loss": 0.5264, + "step": 4590 + }, + { + "epoch": 0.5357975746268657, + "grad_norm": 1.3853714172153995, + "learning_rate": 2.6697612209657063e-05, + "loss": 0.5375, + "step": 4595 + }, + { + "epoch": 0.5363805970149254, + "grad_norm": 0.4543560797621967, + "learning_rate": 2.665425981062573e-05, + "loss": 0.5155, + "step": 4600 + }, + { + "epoch": 0.5369636194029851, + "grad_norm": 0.484187959090983, + "learning_rate": 2.6610910555584384e-05, + "loss": 0.5478, + "step": 4605 + }, + { + "epoch": 0.5375466417910447, + "grad_norm": 0.49626488411866204, + "learning_rate": 2.6567564605681376e-05, + "loss": 0.5352, + "step": 4610 + }, + { + "epoch": 0.5381296641791045, + "grad_norm": 0.514181721385795, + "learning_rate": 2.652422212205275e-05, + "loss": 0.5387, + "step": 4615 + }, + { + "epoch": 0.5387126865671642, + "grad_norm": 0.5832049097179766, + "learning_rate": 2.6480883265821673e-05, + "loss": 0.5355, + "step": 4620 + }, + { + "epoch": 0.5392957089552238, + "grad_norm": 1.5120035406279293, + "learning_rate": 2.643754819809781e-05, + "loss": 0.5144, + "step": 4625 + }, + { + "epoch": 0.5398787313432836, + "grad_norm": 0.49157920203901545, + "learning_rate": 2.639421707997678e-05, + "loss": 0.5176, + "step": 4630 + }, + { + "epoch": 0.5404617537313433, + "grad_norm": 0.52902066701415, + "learning_rate": 2.6350890072539476e-05, + "loss": 0.5635, + "step": 4635 + }, + { + "epoch": 0.5410447761194029, + "grad_norm": 0.4637572082859708, + "learning_rate": 2.630756733685153e-05, + "loss": 0.5346, + "step": 4640 + }, + { + "epoch": 0.5416277985074627, + "grad_norm": 0.47595145214824397, + "learning_rate": 2.6264249033962713e-05, + "loss": 0.5249, + "step": 4645 + }, + { + "epoch": 0.5422108208955224, + "grad_norm": 1.6764787888943493, + "learning_rate": 2.622093532490628e-05, + "loss": 0.5064, + "step": 4650 + }, + { + "epoch": 0.542793843283582, + "grad_norm": 0.5788528459186878, + "learning_rate": 2.6177626370698443e-05, + "loss": 0.5051, + "step": 4655 + }, + { + "epoch": 0.5433768656716418, + "grad_norm": 0.48162500018331245, + "learning_rate": 2.6134322332337695e-05, + "loss": 0.5043, + "step": 4660 + }, + { + "epoch": 0.5439598880597015, + "grad_norm": 0.4737951941511267, + "learning_rate": 2.6091023370804307e-05, + "loss": 0.5142, + "step": 4665 + }, + { + "epoch": 0.5445429104477612, + "grad_norm": 0.5575644816288676, + "learning_rate": 2.604772964705965e-05, + "loss": 0.5505, + "step": 4670 + }, + { + "epoch": 0.5451259328358209, + "grad_norm": 0.4758372366326835, + "learning_rate": 2.6004441322045603e-05, + "loss": 0.5373, + "step": 4675 + }, + { + "epoch": 0.5457089552238806, + "grad_norm": 0.508958909313539, + "learning_rate": 2.596115855668403e-05, + "loss": 0.5353, + "step": 4680 + }, + { + "epoch": 0.5462919776119403, + "grad_norm": 0.4683786155826753, + "learning_rate": 2.5917881511876073e-05, + "loss": 0.4946, + "step": 4685 + }, + { + "epoch": 0.546875, + "grad_norm": 0.5456678942382134, + "learning_rate": 2.5874610348501632e-05, + "loss": 0.5371, + "step": 4690 + }, + { + "epoch": 0.5474580223880597, + "grad_norm": 0.4552682312653168, + "learning_rate": 2.5831345227418752e-05, + "loss": 0.5605, + "step": 4695 + }, + { + "epoch": 0.5480410447761194, + "grad_norm": 0.4776001785979188, + "learning_rate": 2.5788086309463006e-05, + "loss": 0.5005, + "step": 4700 + }, + { + "epoch": 0.5486240671641791, + "grad_norm": 0.5453229696717531, + "learning_rate": 2.57448337554469e-05, + "loss": 0.5194, + "step": 4705 + }, + { + "epoch": 0.5492070895522388, + "grad_norm": 0.5387817919914101, + "learning_rate": 2.570158772615928e-05, + "loss": 0.5333, + "step": 4710 + }, + { + "epoch": 0.5497901119402985, + "grad_norm": 0.48603015940285904, + "learning_rate": 2.5658348382364773e-05, + "loss": 0.5117, + "step": 4715 + }, + { + "epoch": 0.5503731343283582, + "grad_norm": 0.4789950555392135, + "learning_rate": 2.5615115884803108e-05, + "loss": 0.5168, + "step": 4720 + }, + { + "epoch": 0.550956156716418, + "grad_norm": 0.4593000342095894, + "learning_rate": 2.557189039418858e-05, + "loss": 0.5238, + "step": 4725 + }, + { + "epoch": 0.5515391791044776, + "grad_norm": 0.4671404581672369, + "learning_rate": 2.552867207120945e-05, + "loss": 0.5336, + "step": 4730 + }, + { + "epoch": 0.5521222014925373, + "grad_norm": 0.4867006683126969, + "learning_rate": 2.5485461076527308e-05, + "loss": 0.5432, + "step": 4735 + }, + { + "epoch": 0.5527052238805971, + "grad_norm": 0.4603754841175435, + "learning_rate": 2.5442257570776527e-05, + "loss": 0.5464, + "step": 4740 + }, + { + "epoch": 0.5532882462686567, + "grad_norm": 0.5322095390809611, + "learning_rate": 2.539906171456362e-05, + "loss": 0.5102, + "step": 4745 + }, + { + "epoch": 0.5538712686567164, + "grad_norm": 0.4501063054653902, + "learning_rate": 2.5355873668466677e-05, + "loss": 0.526, + "step": 4750 + }, + { + "epoch": 0.5544542910447762, + "grad_norm": 0.4645529985985766, + "learning_rate": 2.5312693593034746e-05, + "loss": 0.5035, + "step": 4755 + }, + { + "epoch": 0.5550373134328358, + "grad_norm": 0.4764710401107973, + "learning_rate": 2.5269521648787247e-05, + "loss": 0.5458, + "step": 4760 + }, + { + "epoch": 0.5556203358208955, + "grad_norm": 0.4534954950893274, + "learning_rate": 2.5226357996213378e-05, + "loss": 0.5199, + "step": 4765 + }, + { + "epoch": 0.5562033582089553, + "grad_norm": 0.4810706685920549, + "learning_rate": 2.518320279577151e-05, + "loss": 0.5263, + "step": 4770 + }, + { + "epoch": 0.5567863805970149, + "grad_norm": 0.4690472358261824, + "learning_rate": 2.514005620788858e-05, + "loss": 0.5245, + "step": 4775 + }, + { + "epoch": 0.5573694029850746, + "grad_norm": 0.46880508544260313, + "learning_rate": 2.5096918392959532e-05, + "loss": 0.5232, + "step": 4780 + }, + { + "epoch": 0.5579524253731343, + "grad_norm": 0.513925549118589, + "learning_rate": 2.5053789511346693e-05, + "loss": 0.5157, + "step": 4785 + }, + { + "epoch": 0.558535447761194, + "grad_norm": 0.5043455996761089, + "learning_rate": 2.5010669723379154e-05, + "loss": 0.5128, + "step": 4790 + }, + { + "epoch": 0.5591184701492538, + "grad_norm": 0.5924109641064016, + "learning_rate": 2.4967559189352226e-05, + "loss": 0.4921, + "step": 4795 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.47567528373039897, + "learning_rate": 2.492445806952682e-05, + "loss": 0.5406, + "step": 4800 + }, + { + "epoch": 0.5602845149253731, + "grad_norm": 0.4707447199114358, + "learning_rate": 2.4881366524128845e-05, + "loss": 0.5325, + "step": 4805 + }, + { + "epoch": 0.5608675373134329, + "grad_norm": 0.444463840534227, + "learning_rate": 2.4838284713348602e-05, + "loss": 0.4945, + "step": 4810 + }, + { + "epoch": 0.5614505597014925, + "grad_norm": 0.51359075031383, + "learning_rate": 2.479521279734024e-05, + "loss": 0.4986, + "step": 4815 + }, + { + "epoch": 0.5620335820895522, + "grad_norm": 0.508318043421021, + "learning_rate": 2.475215093622109e-05, + "loss": 0.5362, + "step": 4820 + }, + { + "epoch": 0.562616604477612, + "grad_norm": 0.4798125508084004, + "learning_rate": 2.4709099290071126e-05, + "loss": 0.498, + "step": 4825 + }, + { + "epoch": 0.5631996268656716, + "grad_norm": 0.46383793437769893, + "learning_rate": 2.4666058018932314e-05, + "loss": 0.5254, + "step": 4830 + }, + { + "epoch": 0.5637826492537313, + "grad_norm": 0.4825804953446005, + "learning_rate": 2.4623027282808114e-05, + "loss": 0.515, + "step": 4835 + }, + { + "epoch": 0.5643656716417911, + "grad_norm": 0.49356516542621953, + "learning_rate": 2.4580007241662773e-05, + "loss": 0.4913, + "step": 4840 + }, + { + "epoch": 0.5649486940298507, + "grad_norm": 0.46504399968658483, + "learning_rate": 2.4536998055420783e-05, + "loss": 0.5055, + "step": 4845 + }, + { + "epoch": 0.5655317164179104, + "grad_norm": 0.5178266867135491, + "learning_rate": 2.4493999883966308e-05, + "loss": 0.5338, + "step": 4850 + }, + { + "epoch": 0.5661147388059702, + "grad_norm": 0.443773396289914, + "learning_rate": 2.445101288714254e-05, + "loss": 0.527, + "step": 4855 + }, + { + "epoch": 0.5666977611940298, + "grad_norm": 0.4361179409054666, + "learning_rate": 2.440803722475114e-05, + "loss": 0.5115, + "step": 4860 + }, + { + "epoch": 0.5672807835820896, + "grad_norm": 0.4957031944579591, + "learning_rate": 2.436507305655165e-05, + "loss": 0.5224, + "step": 4865 + }, + { + "epoch": 0.5678638059701493, + "grad_norm": 0.5179909459677171, + "learning_rate": 2.4322120542260864e-05, + "loss": 0.5026, + "step": 4870 + }, + { + "epoch": 0.5684468283582089, + "grad_norm": 0.45931411628609564, + "learning_rate": 2.4279179841552246e-05, + "loss": 0.5492, + "step": 4875 + }, + { + "epoch": 0.5690298507462687, + "grad_norm": 0.4718292947061309, + "learning_rate": 2.4236251114055358e-05, + "loss": 0.5485, + "step": 4880 + }, + { + "epoch": 0.5696128731343284, + "grad_norm": 0.4509426009796579, + "learning_rate": 2.419333451935526e-05, + "loss": 0.5262, + "step": 4885 + }, + { + "epoch": 0.570195895522388, + "grad_norm": 0.520970250021266, + "learning_rate": 2.4150430216991888e-05, + "loss": 0.54, + "step": 4890 + }, + { + "epoch": 0.5707789179104478, + "grad_norm": 0.48125165415242044, + "learning_rate": 2.4107538366459494e-05, + "loss": 0.5535, + "step": 4895 + }, + { + "epoch": 0.5713619402985075, + "grad_norm": 0.49924612989919914, + "learning_rate": 2.406465912720604e-05, + "loss": 0.5378, + "step": 4900 + }, + { + "epoch": 0.5719449626865671, + "grad_norm": 0.4618022420362027, + "learning_rate": 2.4021792658632612e-05, + "loss": 0.5385, + "step": 4905 + }, + { + "epoch": 0.5725279850746269, + "grad_norm": 0.43524447366727087, + "learning_rate": 2.3978939120092814e-05, + "loss": 0.5275, + "step": 4910 + }, + { + "epoch": 0.5731110074626866, + "grad_norm": 0.5235831961960491, + "learning_rate": 2.3936098670892165e-05, + "loss": 0.5171, + "step": 4915 + }, + { + "epoch": 0.5736940298507462, + "grad_norm": 0.47253982117527504, + "learning_rate": 2.389327147028757e-05, + "loss": 0.5149, + "step": 4920 + }, + { + "epoch": 0.574277052238806, + "grad_norm": 0.536733235694404, + "learning_rate": 2.3850457677486655e-05, + "loss": 0.5385, + "step": 4925 + }, + { + "epoch": 0.5748600746268657, + "grad_norm": 0.49487264913586265, + "learning_rate": 2.380765745164718e-05, + "loss": 0.5453, + "step": 4930 + }, + { + "epoch": 0.5754430970149254, + "grad_norm": 0.4782990502193716, + "learning_rate": 2.376487095187654e-05, + "loss": 0.496, + "step": 4935 + }, + { + "epoch": 0.5760261194029851, + "grad_norm": 1.5309625007086856, + "learning_rate": 2.3722098337231025e-05, + "loss": 0.507, + "step": 4940 + }, + { + "epoch": 0.5766091417910447, + "grad_norm": 0.4155961423348815, + "learning_rate": 2.3679339766715358e-05, + "loss": 0.5219, + "step": 4945 + }, + { + "epoch": 0.5771921641791045, + "grad_norm": 0.4914382394629777, + "learning_rate": 2.363659539928204e-05, + "loss": 0.5107, + "step": 4950 + }, + { + "epoch": 0.5777751865671642, + "grad_norm": 0.4984691122251259, + "learning_rate": 2.3593865393830766e-05, + "loss": 0.5007, + "step": 4955 + }, + { + "epoch": 0.5783582089552238, + "grad_norm": 0.504442137117383, + "learning_rate": 2.355114990920786e-05, + "loss": 0.5103, + "step": 4960 + }, + { + "epoch": 0.5789412313432836, + "grad_norm": 0.774789963927665, + "learning_rate": 2.3508449104205636e-05, + "loss": 0.516, + "step": 4965 + }, + { + "epoch": 0.5795242537313433, + "grad_norm": 0.49669070623006295, + "learning_rate": 2.3465763137561875e-05, + "loss": 0.4984, + "step": 4970 + }, + { + "epoch": 0.5801072761194029, + "grad_norm": 0.4915699683156642, + "learning_rate": 2.342309216795916e-05, + "loss": 0.5237, + "step": 4975 + }, + { + "epoch": 0.5806902985074627, + "grad_norm": 0.7919278813140391, + "learning_rate": 2.3380436354024338e-05, + "loss": 0.5327, + "step": 4980 + }, + { + "epoch": 0.5812733208955224, + "grad_norm": 0.513367233603117, + "learning_rate": 2.333779585432794e-05, + "loss": 0.545, + "step": 4985 + }, + { + "epoch": 0.581856343283582, + "grad_norm": 0.5185055850159029, + "learning_rate": 2.329517082738353e-05, + "loss": 0.5671, + "step": 4990 + }, + { + "epoch": 0.5824393656716418, + "grad_norm": 0.4535336329617379, + "learning_rate": 2.3252561431647158e-05, + "loss": 0.5072, + "step": 4995 + }, + { + "epoch": 0.5830223880597015, + "grad_norm": 0.4214639260752155, + "learning_rate": 2.32099678255168e-05, + "loss": 0.4765, + "step": 5000 + }, + { + "epoch": 0.5836054104477612, + "grad_norm": 0.4292230083522436, + "learning_rate": 2.316739016733169e-05, + "loss": 0.535, + "step": 5005 + }, + { + "epoch": 0.5841884328358209, + "grad_norm": 0.4500847118246923, + "learning_rate": 2.3124828615371817e-05, + "loss": 0.5579, + "step": 5010 + }, + { + "epoch": 0.5847714552238806, + "grad_norm": 0.4441260617873408, + "learning_rate": 2.3082283327857253e-05, + "loss": 0.502, + "step": 5015 + }, + { + "epoch": 0.5853544776119403, + "grad_norm": 0.4929193442519856, + "learning_rate": 2.3039754462947653e-05, + "loss": 0.5154, + "step": 5020 + }, + { + "epoch": 0.5859375, + "grad_norm": 0.4604596804207028, + "learning_rate": 2.2997242178741596e-05, + "loss": 0.5173, + "step": 5025 + }, + { + "epoch": 0.5865205223880597, + "grad_norm": 0.4671301724369313, + "learning_rate": 2.2954746633276016e-05, + "loss": 0.5022, + "step": 5030 + }, + { + "epoch": 0.5871035447761194, + "grad_norm": 0.4666266550085784, + "learning_rate": 2.2912267984525643e-05, + "loss": 0.5263, + "step": 5035 + }, + { + "epoch": 0.5876865671641791, + "grad_norm": 0.5663717691966537, + "learning_rate": 2.2869806390402384e-05, + "loss": 0.5126, + "step": 5040 + }, + { + "epoch": 0.5882695895522388, + "grad_norm": 0.4709782228008594, + "learning_rate": 2.2827362008754743e-05, + "loss": 0.5374, + "step": 5045 + }, + { + "epoch": 0.5888526119402985, + "grad_norm": 0.523861503593284, + "learning_rate": 2.278493499736724e-05, + "loss": 0.5313, + "step": 5050 + }, + { + "epoch": 0.5894356343283582, + "grad_norm": 0.4797723190933745, + "learning_rate": 2.2742525513959832e-05, + "loss": 0.5382, + "step": 5055 + }, + { + "epoch": 0.590018656716418, + "grad_norm": 0.43860453059040094, + "learning_rate": 2.2700133716187316e-05, + "loss": 0.5265, + "step": 5060 + }, + { + "epoch": 0.5906016791044776, + "grad_norm": 0.4740764262851301, + "learning_rate": 2.2657759761638707e-05, + "loss": 0.5148, + "step": 5065 + }, + { + "epoch": 0.5911847014925373, + "grad_norm": 0.48957608183212625, + "learning_rate": 2.261540380783675e-05, + "loss": 0.4952, + "step": 5070 + }, + { + "epoch": 0.5917677238805971, + "grad_norm": 0.5162045189958194, + "learning_rate": 2.257306601223722e-05, + "loss": 0.5331, + "step": 5075 + }, + { + "epoch": 0.5923507462686567, + "grad_norm": 0.5728273214706431, + "learning_rate": 2.2530746532228413e-05, + "loss": 0.5143, + "step": 5080 + }, + { + "epoch": 0.5929337686567164, + "grad_norm": 0.44294658485783917, + "learning_rate": 2.2488445525130557e-05, + "loss": 0.5385, + "step": 5085 + }, + { + "epoch": 0.5935167910447762, + "grad_norm": 0.4728473251887733, + "learning_rate": 2.2446163148195164e-05, + "loss": 0.5249, + "step": 5090 + }, + { + "epoch": 0.5940998134328358, + "grad_norm": 0.42056687179230184, + "learning_rate": 2.2403899558604525e-05, + "loss": 0.4896, + "step": 5095 + }, + { + "epoch": 0.5946828358208955, + "grad_norm": 0.42312832455359684, + "learning_rate": 2.2361654913471065e-05, + "loss": 0.4831, + "step": 5100 + }, + { + "epoch": 0.5952658582089553, + "grad_norm": 0.4310382361089622, + "learning_rate": 2.2319429369836815e-05, + "loss": 0.5038, + "step": 5105 + }, + { + "epoch": 0.5958488805970149, + "grad_norm": 0.43349085099461926, + "learning_rate": 2.2277223084672765e-05, + "loss": 0.5332, + "step": 5110 + }, + { + "epoch": 0.5964319029850746, + "grad_norm": 0.4358686772263049, + "learning_rate": 2.2235036214878325e-05, + "loss": 0.4716, + "step": 5115 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.4998061706788218, + "learning_rate": 2.2192868917280745e-05, + "loss": 0.5277, + "step": 5120 + }, + { + "epoch": 0.597597947761194, + "grad_norm": 0.4569209342643407, + "learning_rate": 2.21507213486345e-05, + "loss": 0.4995, + "step": 5125 + }, + { + "epoch": 0.5981809701492538, + "grad_norm": 0.45297207736047584, + "learning_rate": 2.2108593665620724e-05, + "loss": 0.5254, + "step": 5130 + }, + { + "epoch": 0.5987639925373134, + "grad_norm": 0.49176995249756655, + "learning_rate": 2.2066486024846615e-05, + "loss": 0.5312, + "step": 5135 + }, + { + "epoch": 0.5993470149253731, + "grad_norm": 0.45352488174846867, + "learning_rate": 2.2024398582844906e-05, + "loss": 0.5183, + "step": 5140 + }, + { + "epoch": 0.5999300373134329, + "grad_norm": 0.4686989486366428, + "learning_rate": 2.19823314960732e-05, + "loss": 0.5337, + "step": 5145 + }, + { + "epoch": 0.6005130597014925, + "grad_norm": 0.4989471659468385, + "learning_rate": 2.1940284920913445e-05, + "loss": 0.513, + "step": 5150 + }, + { + "epoch": 0.6010960820895522, + "grad_norm": 0.7737032679878307, + "learning_rate": 2.1898259013671357e-05, + "loss": 0.4948, + "step": 5155 + }, + { + "epoch": 0.601679104477612, + "grad_norm": 0.6456584381963478, + "learning_rate": 2.1856253930575787e-05, + "loss": 0.5506, + "step": 5160 + }, + { + "epoch": 0.6022621268656716, + "grad_norm": 0.49420633212169496, + "learning_rate": 2.18142698277782e-05, + "loss": 0.534, + "step": 5165 + }, + { + "epoch": 0.6028451492537313, + "grad_norm": 0.43079700140392413, + "learning_rate": 2.1772306861352044e-05, + "loss": 0.4994, + "step": 5170 + }, + { + "epoch": 0.6034281716417911, + "grad_norm": 0.4477340131354866, + "learning_rate": 2.1730365187292228e-05, + "loss": 0.4895, + "step": 5175 + }, + { + "epoch": 0.6040111940298507, + "grad_norm": 0.45974056791460993, + "learning_rate": 2.1688444961514476e-05, + "loss": 0.4963, + "step": 5180 + }, + { + "epoch": 0.6045942164179104, + "grad_norm": 0.4463763393224647, + "learning_rate": 2.1646546339854788e-05, + "loss": 0.5139, + "step": 5185 + }, + { + "epoch": 0.6051772388059702, + "grad_norm": 0.5136717833408538, + "learning_rate": 2.1604669478068863e-05, + "loss": 0.5195, + "step": 5190 + }, + { + "epoch": 0.6057602611940298, + "grad_norm": 0.4554979479736964, + "learning_rate": 2.1562814531831487e-05, + "loss": 0.5062, + "step": 5195 + }, + { + "epoch": 0.6063432835820896, + "grad_norm": 0.45967404846816146, + "learning_rate": 2.1520981656736e-05, + "loss": 0.4984, + "step": 5200 + }, + { + "epoch": 0.6069263059701493, + "grad_norm": 0.44666913701763594, + "learning_rate": 2.1479171008293686e-05, + "loss": 0.4939, + "step": 5205 + }, + { + "epoch": 0.6075093283582089, + "grad_norm": 1.738573008880425, + "learning_rate": 2.1437382741933204e-05, + "loss": 0.542, + "step": 5210 + }, + { + "epoch": 0.6080923507462687, + "grad_norm": 0.45746766067753797, + "learning_rate": 2.1395617012999993e-05, + "loss": 0.5014, + "step": 5215 + }, + { + "epoch": 0.6086753731343284, + "grad_norm": 0.4542305089948559, + "learning_rate": 2.1353873976755716e-05, + "loss": 0.4852, + "step": 5220 + }, + { + "epoch": 0.609258395522388, + "grad_norm": 0.4734086638694241, + "learning_rate": 2.131215378837771e-05, + "loss": 0.5218, + "step": 5225 + }, + { + "epoch": 0.6098414179104478, + "grad_norm": 0.4738565708182124, + "learning_rate": 2.1270456602958332e-05, + "loss": 0.5322, + "step": 5230 + }, + { + "epoch": 0.6104244402985075, + "grad_norm": 0.49359869399689404, + "learning_rate": 2.1228782575504447e-05, + "loss": 0.537, + "step": 5235 + }, + { + "epoch": 0.6110074626865671, + "grad_norm": 0.47548423763939335, + "learning_rate": 2.1187131860936845e-05, + "loss": 0.5183, + "step": 5240 + }, + { + "epoch": 0.6115904850746269, + "grad_norm": 0.6813270131879138, + "learning_rate": 2.114550461408961e-05, + "loss": 0.5097, + "step": 5245 + }, + { + "epoch": 0.6121735074626866, + "grad_norm": 0.4858514966177657, + "learning_rate": 2.1103900989709623e-05, + "loss": 0.5267, + "step": 5250 + }, + { + "epoch": 0.6127565298507462, + "grad_norm": 0.46679943200730223, + "learning_rate": 2.1062321142455953e-05, + "loss": 0.5187, + "step": 5255 + }, + { + "epoch": 0.613339552238806, + "grad_norm": 0.43680321948998807, + "learning_rate": 2.1020765226899257e-05, + "loss": 0.5072, + "step": 5260 + }, + { + "epoch": 0.6139225746268657, + "grad_norm": 0.48765723203673156, + "learning_rate": 2.0979233397521237e-05, + "loss": 0.5172, + "step": 5265 + }, + { + "epoch": 0.6145055970149254, + "grad_norm": 0.5421523270806327, + "learning_rate": 2.0937725808714037e-05, + "loss": 0.5173, + "step": 5270 + }, + { + "epoch": 0.6150886194029851, + "grad_norm": 0.4821842656660482, + "learning_rate": 2.089624261477973e-05, + "loss": 0.522, + "step": 5275 + }, + { + "epoch": 0.6156716417910447, + "grad_norm": 0.473993764265411, + "learning_rate": 2.0854783969929668e-05, + "loss": 0.5394, + "step": 5280 + }, + { + "epoch": 0.6162546641791045, + "grad_norm": 0.4419301919043876, + "learning_rate": 2.0813350028283958e-05, + "loss": 0.5152, + "step": 5285 + }, + { + "epoch": 0.6168376865671642, + "grad_norm": 0.4672525694912507, + "learning_rate": 2.0771940943870866e-05, + "loss": 0.5195, + "step": 5290 + }, + { + "epoch": 0.6174207089552238, + "grad_norm": 0.4420478828415689, + "learning_rate": 2.073055687062625e-05, + "loss": 0.5131, + "step": 5295 + }, + { + "epoch": 0.6180037313432836, + "grad_norm": 0.4626362183961214, + "learning_rate": 2.0689197962393007e-05, + "loss": 0.5162, + "step": 5300 + }, + { + "epoch": 0.6185867537313433, + "grad_norm": 0.49784203270276733, + "learning_rate": 2.0647864372920472e-05, + "loss": 0.5279, + "step": 5305 + }, + { + "epoch": 0.6191697761194029, + "grad_norm": 0.454384681268438, + "learning_rate": 2.0606556255863862e-05, + "loss": 0.5193, + "step": 5310 + }, + { + "epoch": 0.6197527985074627, + "grad_norm": 0.549168830900785, + "learning_rate": 2.05652737647837e-05, + "loss": 0.508, + "step": 5315 + }, + { + "epoch": 0.6203358208955224, + "grad_norm": 0.4790027663801494, + "learning_rate": 2.0524017053145238e-05, + "loss": 0.5009, + "step": 5320 + }, + { + "epoch": 0.620918843283582, + "grad_norm": 0.4858133125717372, + "learning_rate": 2.0482786274317923e-05, + "loss": 0.5257, + "step": 5325 + }, + { + "epoch": 0.6215018656716418, + "grad_norm": 0.4686160974815818, + "learning_rate": 2.0441581581574765e-05, + "loss": 0.5006, + "step": 5330 + }, + { + "epoch": 0.6220848880597015, + "grad_norm": 0.4623666153609126, + "learning_rate": 2.0400403128091812e-05, + "loss": 0.5169, + "step": 5335 + }, + { + "epoch": 0.6226679104477612, + "grad_norm": 0.45877847406013955, + "learning_rate": 2.0359251066947583e-05, + "loss": 0.5334, + "step": 5340 + }, + { + "epoch": 0.6232509328358209, + "grad_norm": 0.4835869467134031, + "learning_rate": 2.0318125551122468e-05, + "loss": 0.4976, + "step": 5345 + }, + { + "epoch": 0.6238339552238806, + "grad_norm": 0.4575233875191603, + "learning_rate": 2.027702673349818e-05, + "loss": 0.5078, + "step": 5350 + }, + { + "epoch": 0.6244169776119403, + "grad_norm": 0.42998314282841227, + "learning_rate": 2.023595476685718e-05, + "loss": 0.5217, + "step": 5355 + }, + { + "epoch": 0.625, + "grad_norm": 0.40746461264097383, + "learning_rate": 2.0194909803882128e-05, + "loss": 0.4901, + "step": 5360 + }, + { + "epoch": 0.6255830223880597, + "grad_norm": 0.4597436174722755, + "learning_rate": 2.0153891997155282e-05, + "loss": 0.5494, + "step": 5365 + }, + { + "epoch": 0.6261660447761194, + "grad_norm": 0.47770020364059385, + "learning_rate": 2.011290149915795e-05, + "loss": 0.5137, + "step": 5370 + }, + { + "epoch": 0.6267490671641791, + "grad_norm": 0.4282249414588399, + "learning_rate": 2.0071938462269936e-05, + "loss": 0.4793, + "step": 5375 + }, + { + "epoch": 0.6273320895522388, + "grad_norm": 0.4509595314631354, + "learning_rate": 2.0031003038768942e-05, + "loss": 0.498, + "step": 5380 + }, + { + "epoch": 0.6279151119402985, + "grad_norm": 0.47714728709347026, + "learning_rate": 1.999009538083003e-05, + "loss": 0.545, + "step": 5385 + }, + { + "epoch": 0.6284981343283582, + "grad_norm": 0.47392502530376346, + "learning_rate": 1.994921564052503e-05, + "loss": 0.5133, + "step": 5390 + }, + { + "epoch": 0.629081156716418, + "grad_norm": 0.44759229419435465, + "learning_rate": 1.990836396982202e-05, + "loss": 0.5262, + "step": 5395 + }, + { + "epoch": 0.6296641791044776, + "grad_norm": 0.4912367043364836, + "learning_rate": 1.9867540520584693e-05, + "loss": 0.5346, + "step": 5400 + }, + { + "epoch": 0.6302472014925373, + "grad_norm": 0.4501000804982757, + "learning_rate": 1.9826745444571853e-05, + "loss": 0.5019, + "step": 5405 + }, + { + "epoch": 0.6308302238805971, + "grad_norm": 0.48836277098183645, + "learning_rate": 1.978597889343683e-05, + "loss": 0.5289, + "step": 5410 + }, + { + "epoch": 0.6314132462686567, + "grad_norm": 0.4625223996087579, + "learning_rate": 1.97452410187269e-05, + "loss": 0.4958, + "step": 5415 + }, + { + "epoch": 0.6319962686567164, + "grad_norm": 0.48092056548731926, + "learning_rate": 1.970453197188275e-05, + "loss": 0.5015, + "step": 5420 + }, + { + "epoch": 0.6325792910447762, + "grad_norm": 0.41113077299689604, + "learning_rate": 1.9663851904237903e-05, + "loss": 0.4989, + "step": 5425 + }, + { + "epoch": 0.6331623134328358, + "grad_norm": 0.43567184997194336, + "learning_rate": 1.9623200967018134e-05, + "loss": 0.5066, + "step": 5430 + }, + { + "epoch": 0.6337453358208955, + "grad_norm": 0.480598507617135, + "learning_rate": 1.9582579311340943e-05, + "loss": 0.5476, + "step": 5435 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 0.4632420890807915, + "learning_rate": 1.9541987088214963e-05, + "loss": 0.514, + "step": 5440 + }, + { + "epoch": 0.6349113805970149, + "grad_norm": 0.461886176881567, + "learning_rate": 1.9501424448539445e-05, + "loss": 0.4993, + "step": 5445 + }, + { + "epoch": 0.6354944029850746, + "grad_norm": 0.4863359836422005, + "learning_rate": 1.946089154310364e-05, + "loss": 0.5349, + "step": 5450 + }, + { + "epoch": 0.6360774253731343, + "grad_norm": 0.44233037285547316, + "learning_rate": 1.9420388522586242e-05, + "loss": 0.5036, + "step": 5455 + }, + { + "epoch": 0.636660447761194, + "grad_norm": 0.4399036507814764, + "learning_rate": 1.937991553755491e-05, + "loss": 0.5198, + "step": 5460 + }, + { + "epoch": 0.6372434701492538, + "grad_norm": 0.4644842288483705, + "learning_rate": 1.9339472738465604e-05, + "loss": 0.468, + "step": 5465 + }, + { + "epoch": 0.6378264925373134, + "grad_norm": 0.5021801630911482, + "learning_rate": 1.929906027566207e-05, + "loss": 0.5046, + "step": 5470 + }, + { + "epoch": 0.6384095149253731, + "grad_norm": 0.4906231561063412, + "learning_rate": 1.9258678299375287e-05, + "loss": 0.5637, + "step": 5475 + }, + { + "epoch": 0.6389925373134329, + "grad_norm": 0.4411840007547938, + "learning_rate": 1.9218326959722915e-05, + "loss": 0.4941, + "step": 5480 + }, + { + "epoch": 0.6395755597014925, + "grad_norm": 0.44492116681266264, + "learning_rate": 1.9178006406708716e-05, + "loss": 0.4999, + "step": 5485 + }, + { + "epoch": 0.6401585820895522, + "grad_norm": 0.45899402350391216, + "learning_rate": 1.913771679022199e-05, + "loss": 0.5143, + "step": 5490 + }, + { + "epoch": 0.640741604477612, + "grad_norm": 0.5002130360013761, + "learning_rate": 1.9097458260037055e-05, + "loss": 0.5223, + "step": 5495 + }, + { + "epoch": 0.6413246268656716, + "grad_norm": 0.4445252627269499, + "learning_rate": 1.9057230965812652e-05, + "loss": 0.5277, + "step": 5500 + }, + { + "epoch": 0.6419076492537313, + "grad_norm": 0.4327961966702548, + "learning_rate": 1.901703505709141e-05, + "loss": 0.4689, + "step": 5505 + }, + { + "epoch": 0.6424906716417911, + "grad_norm": 0.47514425543756855, + "learning_rate": 1.897687068329928e-05, + "loss": 0.499, + "step": 5510 + }, + { + "epoch": 0.6430736940298507, + "grad_norm": 0.4544747659785093, + "learning_rate": 1.8936737993744996e-05, + "loss": 0.5026, + "step": 5515 + }, + { + "epoch": 0.6436567164179104, + "grad_norm": 0.45829680386820026, + "learning_rate": 1.8896637137619495e-05, + "loss": 0.5066, + "step": 5520 + }, + { + "epoch": 0.6442397388059702, + "grad_norm": 0.42928587523633865, + "learning_rate": 1.8856568263995373e-05, + "loss": 0.4651, + "step": 5525 + }, + { + "epoch": 0.6448227611940298, + "grad_norm": 0.4785710795086254, + "learning_rate": 1.8816531521826346e-05, + "loss": 0.5118, + "step": 5530 + }, + { + "epoch": 0.6454057835820896, + "grad_norm": 0.44984484126810564, + "learning_rate": 1.8776527059946676e-05, + "loss": 0.4979, + "step": 5535 + }, + { + "epoch": 0.6459888059701493, + "grad_norm": 0.5084605177342773, + "learning_rate": 1.8736555027070607e-05, + "loss": 0.513, + "step": 5540 + }, + { + "epoch": 0.6465718283582089, + "grad_norm": 0.4587926020250396, + "learning_rate": 1.8696615571791876e-05, + "loss": 0.5056, + "step": 5545 + }, + { + "epoch": 0.6471548507462687, + "grad_norm": 0.5265935639558714, + "learning_rate": 1.865670884258307e-05, + "loss": 0.5328, + "step": 5550 + }, + { + "epoch": 0.6477378731343284, + "grad_norm": 0.48393501644987547, + "learning_rate": 1.861683498779514e-05, + "loss": 0.5409, + "step": 5555 + }, + { + "epoch": 0.648320895522388, + "grad_norm": 0.42845129776042723, + "learning_rate": 1.8576994155656814e-05, + "loss": 0.5036, + "step": 5560 + }, + { + "epoch": 0.6489039179104478, + "grad_norm": 0.45750371585274724, + "learning_rate": 1.853718649427409e-05, + "loss": 0.5458, + "step": 5565 + }, + { + "epoch": 0.6494869402985075, + "grad_norm": 0.5071290594881412, + "learning_rate": 1.8497412151629617e-05, + "loss": 0.5574, + "step": 5570 + }, + { + "epoch": 0.6500699626865671, + "grad_norm": 0.4193141376087725, + "learning_rate": 1.8457671275582202e-05, + "loss": 0.501, + "step": 5575 + }, + { + "epoch": 0.6506529850746269, + "grad_norm": 0.5097091316108154, + "learning_rate": 1.841796401386626e-05, + "loss": 0.5424, + "step": 5580 + }, + { + "epoch": 0.6512360074626866, + "grad_norm": 0.4574079642887348, + "learning_rate": 1.8378290514091214e-05, + "loss": 0.5143, + "step": 5585 + }, + { + "epoch": 0.6518190298507462, + "grad_norm": 0.48232820582938274, + "learning_rate": 1.8338650923740984e-05, + "loss": 0.5068, + "step": 5590 + }, + { + "epoch": 0.652402052238806, + "grad_norm": 0.4295823610973414, + "learning_rate": 1.829904539017347e-05, + "loss": 0.5076, + "step": 5595 + }, + { + "epoch": 0.6529850746268657, + "grad_norm": 0.4893118320191642, + "learning_rate": 1.8259474060619925e-05, + "loss": 0.5219, + "step": 5600 + }, + { + "epoch": 0.6535680970149254, + "grad_norm": 0.44539776731173597, + "learning_rate": 1.8219937082184462e-05, + "loss": 0.4935, + "step": 5605 + }, + { + "epoch": 0.6541511194029851, + "grad_norm": 0.5129311723129513, + "learning_rate": 1.8180434601843505e-05, + "loss": 0.5179, + "step": 5610 + }, + { + "epoch": 0.6547341417910447, + "grad_norm": 0.4864225115581459, + "learning_rate": 1.8140966766445235e-05, + "loss": 0.4969, + "step": 5615 + }, + { + "epoch": 0.6553171641791045, + "grad_norm": 0.47007224857589525, + "learning_rate": 1.8101533722709036e-05, + "loss": 0.5005, + "step": 5620 + }, + { + "epoch": 0.6559001865671642, + "grad_norm": 0.4597974424746557, + "learning_rate": 1.8062135617224933e-05, + "loss": 0.5294, + "step": 5625 + }, + { + "epoch": 0.6564832089552238, + "grad_norm": 0.49917347832714665, + "learning_rate": 1.802277259645313e-05, + "loss": 0.5261, + "step": 5630 + }, + { + "epoch": 0.6570662313432836, + "grad_norm": 0.46525260879023533, + "learning_rate": 1.798344480672334e-05, + "loss": 0.5119, + "step": 5635 + }, + { + "epoch": 0.6576492537313433, + "grad_norm": 0.49319911982873976, + "learning_rate": 1.7944152394234354e-05, + "loss": 0.4884, + "step": 5640 + }, + { + "epoch": 0.6582322761194029, + "grad_norm": 0.5132851806324479, + "learning_rate": 1.7904895505053405e-05, + "loss": 0.5158, + "step": 5645 + }, + { + "epoch": 0.6588152985074627, + "grad_norm": 0.47569949694794605, + "learning_rate": 1.7865674285115735e-05, + "loss": 0.5121, + "step": 5650 + }, + { + "epoch": 0.6593983208955224, + "grad_norm": 0.44552199263042797, + "learning_rate": 1.7826488880223913e-05, + "loss": 0.4761, + "step": 5655 + }, + { + "epoch": 0.659981343283582, + "grad_norm": 0.47720149730201705, + "learning_rate": 1.778733943604743e-05, + "loss": 0.5124, + "step": 5660 + }, + { + "epoch": 0.6605643656716418, + "grad_norm": 0.4432509370944976, + "learning_rate": 1.774822609812205e-05, + "loss": 0.4711, + "step": 5665 + }, + { + "epoch": 0.6611473880597015, + "grad_norm": 0.4450343141290362, + "learning_rate": 1.7709149011849364e-05, + "loss": 0.4761, + "step": 5670 + }, + { + "epoch": 0.6617304104477612, + "grad_norm": 0.4797782997827445, + "learning_rate": 1.767010832249613e-05, + "loss": 0.5175, + "step": 5675 + }, + { + "epoch": 0.6623134328358209, + "grad_norm": 0.47463931278850086, + "learning_rate": 1.7631104175193863e-05, + "loss": 0.5208, + "step": 5680 + }, + { + "epoch": 0.6628964552238806, + "grad_norm": 0.49271476148028914, + "learning_rate": 1.7592136714938206e-05, + "loss": 0.522, + "step": 5685 + }, + { + "epoch": 0.6634794776119403, + "grad_norm": 0.4366587132398326, + "learning_rate": 1.755320608658844e-05, + "loss": 0.4709, + "step": 5690 + }, + { + "epoch": 0.6640625, + "grad_norm": 0.44579614307494786, + "learning_rate": 1.7514312434866904e-05, + "loss": 0.5267, + "step": 5695 + }, + { + "epoch": 0.6646455223880597, + "grad_norm": 0.4487339036982587, + "learning_rate": 1.747545590435848e-05, + "loss": 0.4991, + "step": 5700 + }, + { + "epoch": 0.6652285447761194, + "grad_norm": 0.4383027709136693, + "learning_rate": 1.7436636639510082e-05, + "loss": 0.5141, + "step": 5705 + }, + { + "epoch": 0.6658115671641791, + "grad_norm": 0.4458649709251083, + "learning_rate": 1.739785478463004e-05, + "loss": 0.4921, + "step": 5710 + }, + { + "epoch": 0.6663945895522388, + "grad_norm": 0.49612990301681104, + "learning_rate": 1.735911048388768e-05, + "loss": 0.5081, + "step": 5715 + }, + { + "epoch": 0.6669776119402985, + "grad_norm": 0.4717384137501139, + "learning_rate": 1.7320403881312665e-05, + "loss": 0.4909, + "step": 5720 + }, + { + "epoch": 0.6675606343283582, + "grad_norm": 0.5029540121644192, + "learning_rate": 1.7281735120794555e-05, + "loss": 0.5439, + "step": 5725 + }, + { + "epoch": 0.668143656716418, + "grad_norm": 0.4643667414061285, + "learning_rate": 1.7243104346082194e-05, + "loss": 0.4794, + "step": 5730 + }, + { + "epoch": 0.6687266791044776, + "grad_norm": 0.47417180490760813, + "learning_rate": 1.720451170078328e-05, + "loss": 0.4996, + "step": 5735 + }, + { + "epoch": 0.6693097014925373, + "grad_norm": 0.4814261614829366, + "learning_rate": 1.7165957328363703e-05, + "loss": 0.5027, + "step": 5740 + }, + { + "epoch": 0.6698927238805971, + "grad_norm": 0.45627638279169563, + "learning_rate": 1.71274413721471e-05, + "loss": 0.513, + "step": 5745 + }, + { + "epoch": 0.6704757462686567, + "grad_norm": 0.4334633223796316, + "learning_rate": 1.708896397531431e-05, + "loss": 0.4659, + "step": 5750 + }, + { + "epoch": 0.6710587686567164, + "grad_norm": 0.4579206868255898, + "learning_rate": 1.7050525280902824e-05, + "loss": 0.5158, + "step": 5755 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.4386284661436938, + "learning_rate": 1.701212543180623e-05, + "loss": 0.5206, + "step": 5760 + }, + { + "epoch": 0.6722248134328358, + "grad_norm": 0.45478097582209165, + "learning_rate": 1.6973764570773766e-05, + "loss": 0.4958, + "step": 5765 + }, + { + "epoch": 0.6728078358208955, + "grad_norm": 0.49448818817184126, + "learning_rate": 1.693544284040968e-05, + "loss": 0.532, + "step": 5770 + }, + { + "epoch": 0.6733908582089553, + "grad_norm": 0.44392640275278356, + "learning_rate": 1.6897160383172794e-05, + "loss": 0.5102, + "step": 5775 + }, + { + "epoch": 0.6739738805970149, + "grad_norm": 0.4681803651785721, + "learning_rate": 1.6858917341375893e-05, + "loss": 0.5033, + "step": 5780 + }, + { + "epoch": 0.6745569029850746, + "grad_norm": 0.5012783536207814, + "learning_rate": 1.6820713857185296e-05, + "loss": 0.5261, + "step": 5785 + }, + { + "epoch": 0.6751399253731343, + "grad_norm": 0.5021117339189587, + "learning_rate": 1.6782550072620208e-05, + "loss": 0.4987, + "step": 5790 + }, + { + "epoch": 0.675722947761194, + "grad_norm": 0.48367815094839645, + "learning_rate": 1.674442612955229e-05, + "loss": 0.5231, + "step": 5795 + }, + { + "epoch": 0.6763059701492538, + "grad_norm": 0.42292867586088734, + "learning_rate": 1.6706342169705092e-05, + "loss": 0.4833, + "step": 5800 + }, + { + "epoch": 0.6768889925373134, + "grad_norm": 0.5498079373194198, + "learning_rate": 1.6668298334653504e-05, + "loss": 0.5303, + "step": 5805 + }, + { + "epoch": 0.6774720149253731, + "grad_norm": 0.45613249161600566, + "learning_rate": 1.663029476582328e-05, + "loss": 0.4946, + "step": 5810 + }, + { + "epoch": 0.6780550373134329, + "grad_norm": 0.4300584687753966, + "learning_rate": 1.659233160449048e-05, + "loss": 0.4835, + "step": 5815 + }, + { + "epoch": 0.6786380597014925, + "grad_norm": 0.47672064051122565, + "learning_rate": 1.6554408991780958e-05, + "loss": 0.5239, + "step": 5820 + }, + { + "epoch": 0.6792210820895522, + "grad_norm": 0.4363678330399839, + "learning_rate": 1.65165270686698e-05, + "loss": 0.5095, + "step": 5825 + }, + { + "epoch": 0.679804104477612, + "grad_norm": 0.4386976710048956, + "learning_rate": 1.6478685975980867e-05, + "loss": 0.5037, + "step": 5830 + }, + { + "epoch": 0.6803871268656716, + "grad_norm": 0.4713692579068845, + "learning_rate": 1.6440885854386223e-05, + "loss": 0.4919, + "step": 5835 + }, + { + "epoch": 0.6809701492537313, + "grad_norm": 0.4590294577914819, + "learning_rate": 1.6403126844405627e-05, + "loss": 0.5001, + "step": 5840 + }, + { + "epoch": 0.6815531716417911, + "grad_norm": 0.48010469655748705, + "learning_rate": 1.6365409086405982e-05, + "loss": 0.523, + "step": 5845 + }, + { + "epoch": 0.6821361940298507, + "grad_norm": 0.4572102553531434, + "learning_rate": 1.6327732720600893e-05, + "loss": 0.4981, + "step": 5850 + }, + { + "epoch": 0.6827192164179104, + "grad_norm": 0.42279317577956194, + "learning_rate": 1.6290097887050037e-05, + "loss": 0.4824, + "step": 5855 + }, + { + "epoch": 0.6833022388059702, + "grad_norm": 0.4672338547985673, + "learning_rate": 1.6252504725658738e-05, + "loss": 0.5276, + "step": 5860 + }, + { + "epoch": 0.6838852611940298, + "grad_norm": 0.5057683989918992, + "learning_rate": 1.6214953376177355e-05, + "loss": 0.5108, + "step": 5865 + }, + { + "epoch": 0.6844682835820896, + "grad_norm": 0.4285897123749069, + "learning_rate": 1.617744397820089e-05, + "loss": 0.4943, + "step": 5870 + }, + { + "epoch": 0.6850513059701493, + "grad_norm": 0.47280662245843785, + "learning_rate": 1.613997667116832e-05, + "loss": 0.5127, + "step": 5875 + }, + { + "epoch": 0.6856343283582089, + "grad_norm": 0.467214604689162, + "learning_rate": 1.610255159436219e-05, + "loss": 0.4953, + "step": 5880 + }, + { + "epoch": 0.6862173507462687, + "grad_norm": 0.4452219391184069, + "learning_rate": 1.6065168886908046e-05, + "loss": 0.4957, + "step": 5885 + }, + { + "epoch": 0.6868003731343284, + "grad_norm": 0.44301870809154087, + "learning_rate": 1.6027828687773947e-05, + "loss": 0.4975, + "step": 5890 + }, + { + "epoch": 0.687383395522388, + "grad_norm": 0.4401277833271277, + "learning_rate": 1.5990531135769885e-05, + "loss": 0.5195, + "step": 5895 + }, + { + "epoch": 0.6879664179104478, + "grad_norm": 2.0394489198103662, + "learning_rate": 1.5953276369547356e-05, + "loss": 0.4956, + "step": 5900 + }, + { + "epoch": 0.6885494402985075, + "grad_norm": 0.46836588645884025, + "learning_rate": 1.591606452759879e-05, + "loss": 0.5327, + "step": 5905 + }, + { + "epoch": 0.6891324626865671, + "grad_norm": 0.44598757398103644, + "learning_rate": 1.587889574825705e-05, + "loss": 0.4764, + "step": 5910 + }, + { + "epoch": 0.6897154850746269, + "grad_norm": 0.49742861746687556, + "learning_rate": 1.5841770169694895e-05, + "loss": 0.5111, + "step": 5915 + }, + { + "epoch": 0.6902985074626866, + "grad_norm": 0.47331933429323847, + "learning_rate": 1.5804687929924522e-05, + "loss": 0.5047, + "step": 5920 + }, + { + "epoch": 0.6908815298507462, + "grad_norm": 0.4652913554765252, + "learning_rate": 1.5767649166796995e-05, + "loss": 0.5038, + "step": 5925 + }, + { + "epoch": 0.691464552238806, + "grad_norm": 0.453898586144349, + "learning_rate": 1.573065401800176e-05, + "loss": 0.4755, + "step": 5930 + }, + { + "epoch": 0.6920475746268657, + "grad_norm": 0.43523535343423403, + "learning_rate": 1.569370262106615e-05, + "loss": 0.553, + "step": 5935 + }, + { + "epoch": 0.6926305970149254, + "grad_norm": 0.46238104794971113, + "learning_rate": 1.5656795113354816e-05, + "loss": 0.4496, + "step": 5940 + }, + { + "epoch": 0.6932136194029851, + "grad_norm": 0.4612926915904917, + "learning_rate": 1.5619931632069284e-05, + "loss": 0.5039, + "step": 5945 + }, + { + "epoch": 0.6937966417910447, + "grad_norm": 0.46826494703709465, + "learning_rate": 1.5583112314247386e-05, + "loss": 0.5022, + "step": 5950 + }, + { + "epoch": 0.6943796641791045, + "grad_norm": 0.47987543937588256, + "learning_rate": 1.5546337296762826e-05, + "loss": 0.5097, + "step": 5955 + }, + { + "epoch": 0.6949626865671642, + "grad_norm": 0.43334758409327906, + "learning_rate": 1.5509606716324563e-05, + "loss": 0.4943, + "step": 5960 + }, + { + "epoch": 0.6955457089552238, + "grad_norm": 0.4681763383562353, + "learning_rate": 1.547292070947641e-05, + "loss": 0.5063, + "step": 5965 + }, + { + "epoch": 0.6961287313432836, + "grad_norm": 0.4924802920273831, + "learning_rate": 1.5436279412596466e-05, + "loss": 0.5047, + "step": 5970 + }, + { + "epoch": 0.6967117537313433, + "grad_norm": 0.5347314807147506, + "learning_rate": 1.5399682961896627e-05, + "loss": 0.5254, + "step": 5975 + }, + { + "epoch": 0.6972947761194029, + "grad_norm": 0.46161765208167127, + "learning_rate": 1.5363131493422045e-05, + "loss": 0.5527, + "step": 5980 + }, + { + "epoch": 0.6978777985074627, + "grad_norm": 0.481592995924184, + "learning_rate": 1.5326625143050717e-05, + "loss": 0.5093, + "step": 5985 + }, + { + "epoch": 0.6984608208955224, + "grad_norm": 0.43698502229598946, + "learning_rate": 1.5290164046492855e-05, + "loss": 0.5113, + "step": 5990 + }, + { + "epoch": 0.699043843283582, + "grad_norm": 0.45737599000837936, + "learning_rate": 1.5253748339290478e-05, + "loss": 0.5072, + "step": 5995 + }, + { + "epoch": 0.6996268656716418, + "grad_norm": 0.4632305738785457, + "learning_rate": 1.5217378156816836e-05, + "loss": 0.5092, + "step": 6000 + }, + { + "epoch": 0.7002098880597015, + "grad_norm": 0.5658113543744521, + "learning_rate": 1.5181053634276005e-05, + "loss": 0.4714, + "step": 6005 + }, + { + "epoch": 0.7007929104477612, + "grad_norm": 0.5024081912461719, + "learning_rate": 1.5144774906702261e-05, + "loss": 0.5587, + "step": 6010 + }, + { + "epoch": 0.7013759328358209, + "grad_norm": 0.4346747521893763, + "learning_rate": 1.5108542108959666e-05, + "loss": 0.4874, + "step": 6015 + }, + { + "epoch": 0.7019589552238806, + "grad_norm": 0.48751850242768197, + "learning_rate": 1.5072355375741564e-05, + "loss": 0.5152, + "step": 6020 + }, + { + "epoch": 0.7025419776119403, + "grad_norm": 0.4679030016933106, + "learning_rate": 1.5036214841570002e-05, + "loss": 0.5177, + "step": 6025 + }, + { + "epoch": 0.703125, + "grad_norm": 0.43192813973658906, + "learning_rate": 1.500012064079533e-05, + "loss": 0.5381, + "step": 6030 + }, + { + "epoch": 0.7037080223880597, + "grad_norm": 0.47334729350720534, + "learning_rate": 1.4964072907595633e-05, + "loss": 0.5428, + "step": 6035 + }, + { + "epoch": 0.7042910447761194, + "grad_norm": 0.4625335667175621, + "learning_rate": 1.4928071775976283e-05, + "loss": 0.4965, + "step": 6040 + }, + { + "epoch": 0.7048740671641791, + "grad_norm": 0.4837863609737646, + "learning_rate": 1.489211737976937e-05, + "loss": 0.5201, + "step": 6045 + }, + { + "epoch": 0.7054570895522388, + "grad_norm": 0.45448831055589556, + "learning_rate": 1.4856209852633282e-05, + "loss": 0.4988, + "step": 6050 + }, + { + "epoch": 0.7060401119402985, + "grad_norm": 0.4470633313561567, + "learning_rate": 1.482034932805217e-05, + "loss": 0.4923, + "step": 6055 + }, + { + "epoch": 0.7066231343283582, + "grad_norm": 0.46146957236249214, + "learning_rate": 1.478453593933545e-05, + "loss": 0.4966, + "step": 6060 + }, + { + "epoch": 0.707206156716418, + "grad_norm": 0.44846726296477046, + "learning_rate": 1.4748769819617291e-05, + "loss": 0.5121, + "step": 6065 + }, + { + "epoch": 0.7077891791044776, + "grad_norm": 0.445769696660708, + "learning_rate": 1.47130511018562e-05, + "loss": 0.49, + "step": 6070 + }, + { + "epoch": 0.7083722014925373, + "grad_norm": 0.44424913913430536, + "learning_rate": 1.4677379918834408e-05, + "loss": 0.5154, + "step": 6075 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 0.5025893980492081, + "learning_rate": 1.464175640315748e-05, + "loss": 0.4886, + "step": 6080 + }, + { + "epoch": 0.7095382462686567, + "grad_norm": 0.4287175183865977, + "learning_rate": 1.460618068725374e-05, + "loss": 0.4943, + "step": 6085 + }, + { + "epoch": 0.7101212686567164, + "grad_norm": 0.46657717095673207, + "learning_rate": 1.4570652903373877e-05, + "loss": 0.4866, + "step": 6090 + }, + { + "epoch": 0.7107042910447762, + "grad_norm": 0.44658175711235154, + "learning_rate": 1.453517318359034e-05, + "loss": 0.4952, + "step": 6095 + }, + { + "epoch": 0.7112873134328358, + "grad_norm": 0.5109217950300224, + "learning_rate": 1.4499741659796927e-05, + "loss": 0.5495, + "step": 6100 + }, + { + "epoch": 0.7118703358208955, + "grad_norm": 0.439976204898742, + "learning_rate": 1.4464358463708277e-05, + "loss": 0.4947, + "step": 6105 + }, + { + "epoch": 0.7124533582089553, + "grad_norm": 0.442606078185802, + "learning_rate": 1.442902372685937e-05, + "loss": 0.5125, + "step": 6110 + }, + { + "epoch": 0.7130363805970149, + "grad_norm": 0.5113814435199087, + "learning_rate": 1.4393737580605019e-05, + "loss": 0.5397, + "step": 6115 + }, + { + "epoch": 0.7136194029850746, + "grad_norm": 0.4292248926969336, + "learning_rate": 1.435850015611943e-05, + "loss": 0.5038, + "step": 6120 + }, + { + "epoch": 0.7142024253731343, + "grad_norm": 0.4436015934650438, + "learning_rate": 1.432331158439568e-05, + "loss": 0.5015, + "step": 6125 + }, + { + "epoch": 0.714785447761194, + "grad_norm": 0.4679393946330053, + "learning_rate": 1.4288171996245247e-05, + "loss": 0.4961, + "step": 6130 + }, + { + "epoch": 0.7153684701492538, + "grad_norm": 0.46822261278058663, + "learning_rate": 1.425308152229749e-05, + "loss": 0.5565, + "step": 6135 + }, + { + "epoch": 0.7159514925373134, + "grad_norm": 0.48866887191385927, + "learning_rate": 1.4218040292999221e-05, + "loss": 0.531, + "step": 6140 + }, + { + "epoch": 0.7165345149253731, + "grad_norm": 0.4487435192453521, + "learning_rate": 1.4183048438614166e-05, + "loss": 0.4994, + "step": 6145 + }, + { + "epoch": 0.7171175373134329, + "grad_norm": 0.5036947554976395, + "learning_rate": 1.4148106089222513e-05, + "loss": 0.519, + "step": 6150 + }, + { + "epoch": 0.7177005597014925, + "grad_norm": 0.49765078956448927, + "learning_rate": 1.4113213374720425e-05, + "loss": 0.4825, + "step": 6155 + }, + { + "epoch": 0.7182835820895522, + "grad_norm": 0.49796549587751004, + "learning_rate": 1.4078370424819515e-05, + "loss": 0.5142, + "step": 6160 + }, + { + "epoch": 0.718866604477612, + "grad_norm": 0.46661776742591504, + "learning_rate": 1.404357736904645e-05, + "loss": 0.5121, + "step": 6165 + }, + { + "epoch": 0.7194496268656716, + "grad_norm": 0.4206361496368435, + "learning_rate": 1.4008834336742366e-05, + "loss": 0.5043, + "step": 6170 + }, + { + "epoch": 0.7200326492537313, + "grad_norm": 0.49140533816343374, + "learning_rate": 1.3974141457062498e-05, + "loss": 0.5465, + "step": 6175 + }, + { + "epoch": 0.7206156716417911, + "grad_norm": 0.4730841055221398, + "learning_rate": 1.3939498858975584e-05, + "loss": 0.496, + "step": 6180 + }, + { + "epoch": 0.7211986940298507, + "grad_norm": 0.5165044754977881, + "learning_rate": 1.390490667126348e-05, + "loss": 0.5699, + "step": 6185 + }, + { + "epoch": 0.7217817164179104, + "grad_norm": 0.4574833927556042, + "learning_rate": 1.3870365022520627e-05, + "loss": 0.494, + "step": 6190 + }, + { + "epoch": 0.7223647388059702, + "grad_norm": 0.4797480787975331, + "learning_rate": 1.3835874041153607e-05, + "loss": 0.5, + "step": 6195 + }, + { + "epoch": 0.7229477611940298, + "grad_norm": 0.5266171413115511, + "learning_rate": 1.380143385538063e-05, + "loss": 0.5248, + "step": 6200 + }, + { + "epoch": 0.7235307835820896, + "grad_norm": 0.4787651159553734, + "learning_rate": 1.3767044593231082e-05, + "loss": 0.4999, + "step": 6205 + }, + { + "epoch": 0.7241138059701493, + "grad_norm": 0.42275543868072146, + "learning_rate": 1.3732706382545054e-05, + "loss": 0.4731, + "step": 6210 + }, + { + "epoch": 0.7246968283582089, + "grad_norm": 0.48666619671292066, + "learning_rate": 1.3698419350972851e-05, + "loss": 0.5242, + "step": 6215 + }, + { + "epoch": 0.7252798507462687, + "grad_norm": 0.41898376072456794, + "learning_rate": 1.3664183625974503e-05, + "loss": 0.5104, + "step": 6220 + }, + { + "epoch": 0.7258628731343284, + "grad_norm": 0.4929104768550004, + "learning_rate": 1.362999933481935e-05, + "loss": 0.5206, + "step": 6225 + }, + { + "epoch": 0.726445895522388, + "grad_norm": 0.4308013069303933, + "learning_rate": 1.3595866604585492e-05, + "loss": 0.5074, + "step": 6230 + }, + { + "epoch": 0.7270289179104478, + "grad_norm": 0.4421936343319682, + "learning_rate": 1.3561785562159374e-05, + "loss": 0.4808, + "step": 6235 + }, + { + "epoch": 0.7276119402985075, + "grad_norm": 0.4980669606106666, + "learning_rate": 1.3527756334235288e-05, + "loss": 0.4746, + "step": 6240 + }, + { + "epoch": 0.7281949626865671, + "grad_norm": 0.4364859749653744, + "learning_rate": 1.3493779047314925e-05, + "loss": 0.4967, + "step": 6245 + }, + { + "epoch": 0.7287779850746269, + "grad_norm": 0.4424531855538569, + "learning_rate": 1.3459853827706853e-05, + "loss": 0.4962, + "step": 6250 + }, + { + "epoch": 0.7293610074626866, + "grad_norm": 0.5043638790699465, + "learning_rate": 1.3425980801526118e-05, + "loss": 0.5095, + "step": 6255 + }, + { + "epoch": 0.7299440298507462, + "grad_norm": 0.464384451619953, + "learning_rate": 1.3392160094693724e-05, + "loss": 0.5008, + "step": 6260 + }, + { + "epoch": 0.730527052238806, + "grad_norm": 0.44362217621259836, + "learning_rate": 1.3358391832936174e-05, + "loss": 0.4965, + "step": 6265 + }, + { + "epoch": 0.7311100746268657, + "grad_norm": 0.4092013778366071, + "learning_rate": 1.3324676141785029e-05, + "loss": 0.5133, + "step": 6270 + }, + { + "epoch": 0.7316930970149254, + "grad_norm": 0.4350575485273892, + "learning_rate": 1.3291013146576403e-05, + "loss": 0.5128, + "step": 6275 + }, + { + "epoch": 0.7322761194029851, + "grad_norm": 0.4744197561421012, + "learning_rate": 1.3257402972450539e-05, + "loss": 0.4784, + "step": 6280 + }, + { + "epoch": 0.7328591417910447, + "grad_norm": 0.4633376651924879, + "learning_rate": 1.3223845744351287e-05, + "loss": 0.475, + "step": 6285 + }, + { + "epoch": 0.7334421641791045, + "grad_norm": 0.47064248305550177, + "learning_rate": 1.3190341587025698e-05, + "loss": 0.5147, + "step": 6290 + }, + { + "epoch": 0.7340251865671642, + "grad_norm": 0.4745007087563231, + "learning_rate": 1.3156890625023532e-05, + "loss": 0.5131, + "step": 6295 + }, + { + "epoch": 0.7346082089552238, + "grad_norm": 0.47910451161364465, + "learning_rate": 1.3123492982696806e-05, + "loss": 0.5125, + "step": 6300 + }, + { + "epoch": 0.7351912313432836, + "grad_norm": 0.45150153399392984, + "learning_rate": 1.3090148784199288e-05, + "loss": 0.5195, + "step": 6305 + }, + { + "epoch": 0.7357742537313433, + "grad_norm": 0.4666852788990984, + "learning_rate": 1.305685815348613e-05, + "loss": 0.5348, + "step": 6310 + }, + { + "epoch": 0.7363572761194029, + "grad_norm": 0.43969109543694684, + "learning_rate": 1.3023621214313289e-05, + "loss": 0.4887, + "step": 6315 + }, + { + "epoch": 0.7369402985074627, + "grad_norm": 0.4520730263820748, + "learning_rate": 1.2990438090237167e-05, + "loss": 0.5174, + "step": 6320 + }, + { + "epoch": 0.7375233208955224, + "grad_norm": 0.4531016913171499, + "learning_rate": 1.2957308904614099e-05, + "loss": 0.513, + "step": 6325 + }, + { + "epoch": 0.738106343283582, + "grad_norm": 0.43990578576591227, + "learning_rate": 1.2924233780599915e-05, + "loss": 0.469, + "step": 6330 + }, + { + "epoch": 0.7386893656716418, + "grad_norm": 0.44084700165618407, + "learning_rate": 1.2891212841149447e-05, + "loss": 0.4997, + "step": 6335 + }, + { + "epoch": 0.7392723880597015, + "grad_norm": 0.49100928132633287, + "learning_rate": 1.2858246209016128e-05, + "loss": 0.5187, + "step": 6340 + }, + { + "epoch": 0.7398554104477612, + "grad_norm": 1.5928599989218044, + "learning_rate": 1.2825334006751493e-05, + "loss": 0.4954, + "step": 6345 + }, + { + "epoch": 0.7404384328358209, + "grad_norm": 0.4278160829593741, + "learning_rate": 1.2792476356704759e-05, + "loss": 0.462, + "step": 6350 + }, + { + "epoch": 0.7410214552238806, + "grad_norm": 0.45173765932520604, + "learning_rate": 1.2759673381022305e-05, + "loss": 0.5198, + "step": 6355 + }, + { + "epoch": 0.7416044776119403, + "grad_norm": 0.46060933457902786, + "learning_rate": 1.27269252016473e-05, + "loss": 0.501, + "step": 6360 + }, + { + "epoch": 0.7421875, + "grad_norm": 0.5194412989177508, + "learning_rate": 1.2694231940319192e-05, + "loss": 0.4862, + "step": 6365 + }, + { + "epoch": 0.7427705223880597, + "grad_norm": 0.46637710292865725, + "learning_rate": 1.2661593718573294e-05, + "loss": 0.4913, + "step": 6370 + }, + { + "epoch": 0.7433535447761194, + "grad_norm": 0.49083348251147924, + "learning_rate": 1.2629010657740275e-05, + "loss": 0.5073, + "step": 6375 + }, + { + "epoch": 0.7439365671641791, + "grad_norm": 0.476732025190355, + "learning_rate": 1.2596482878945787e-05, + "loss": 0.5343, + "step": 6380 + }, + { + "epoch": 0.7445195895522388, + "grad_norm": 0.43400279583829604, + "learning_rate": 1.2564010503109952e-05, + "loss": 0.4952, + "step": 6385 + }, + { + "epoch": 0.7451026119402985, + "grad_norm": 0.4474931990549133, + "learning_rate": 1.2531593650946932e-05, + "loss": 0.4966, + "step": 6390 + }, + { + "epoch": 0.7456856343283582, + "grad_norm": 0.4746293119307018, + "learning_rate": 1.2499232442964506e-05, + "loss": 0.5241, + "step": 6395 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.4596957874880727, + "learning_rate": 1.2466926999463575e-05, + "loss": 0.4931, + "step": 6400 + }, + { + "epoch": 0.7468516791044776, + "grad_norm": 0.48590988974018173, + "learning_rate": 1.2434677440537745e-05, + "loss": 0.5498, + "step": 6405 + }, + { + "epoch": 0.7474347014925373, + "grad_norm": 0.43144811701119584, + "learning_rate": 1.2402483886072883e-05, + "loss": 0.4673, + "step": 6410 + }, + { + "epoch": 0.7480177238805971, + "grad_norm": 0.4694370814956435, + "learning_rate": 1.237034645574666e-05, + "loss": 0.5173, + "step": 6415 + }, + { + "epoch": 0.7486007462686567, + "grad_norm": 0.5215126950510659, + "learning_rate": 1.233826526902809e-05, + "loss": 0.5413, + "step": 6420 + }, + { + "epoch": 0.7491837686567164, + "grad_norm": 0.4667853644040054, + "learning_rate": 1.230624044517713e-05, + "loss": 0.5206, + "step": 6425 + }, + { + "epoch": 0.7497667910447762, + "grad_norm": 0.49822013134556736, + "learning_rate": 1.2274272103244201e-05, + "loss": 0.5264, + "step": 6430 + }, + { + "epoch": 0.7503498134328358, + "grad_norm": 0.4924144531446238, + "learning_rate": 1.2242360362069763e-05, + "loss": 0.4843, + "step": 6435 + }, + { + "epoch": 0.7509328358208955, + "grad_norm": 0.47214741828129897, + "learning_rate": 1.2210505340283838e-05, + "loss": 0.529, + "step": 6440 + }, + { + "epoch": 0.7515158582089553, + "grad_norm": 0.4726759175626485, + "learning_rate": 1.2178707156305644e-05, + "loss": 0.4993, + "step": 6445 + }, + { + "epoch": 0.7520988805970149, + "grad_norm": 0.47497735759010773, + "learning_rate": 1.2146965928343062e-05, + "loss": 0.4923, + "step": 6450 + }, + { + "epoch": 0.7526819029850746, + "grad_norm": 0.5282112383115118, + "learning_rate": 1.2115281774392278e-05, + "loss": 0.5043, + "step": 6455 + }, + { + "epoch": 0.7532649253731343, + "grad_norm": 0.4289024052469388, + "learning_rate": 1.208365481223727e-05, + "loss": 0.4915, + "step": 6460 + }, + { + "epoch": 0.753847947761194, + "grad_norm": 0.4527587796867484, + "learning_rate": 1.2052085159449455e-05, + "loss": 0.491, + "step": 6465 + }, + { + "epoch": 0.7544309701492538, + "grad_norm": 0.46413997296204734, + "learning_rate": 1.202057293338717e-05, + "loss": 0.5207, + "step": 6470 + }, + { + "epoch": 0.7550139925373134, + "grad_norm": 0.47431690969030404, + "learning_rate": 1.1989118251195284e-05, + "loss": 0.4807, + "step": 6475 + }, + { + "epoch": 0.7555970149253731, + "grad_norm": 0.6124168305417785, + "learning_rate": 1.1957721229804761e-05, + "loss": 0.4909, + "step": 6480 + }, + { + "epoch": 0.7561800373134329, + "grad_norm": 0.4612146149691418, + "learning_rate": 1.1926381985932186e-05, + "loss": 0.4912, + "step": 6485 + }, + { + "epoch": 0.7567630597014925, + "grad_norm": 0.4602032665320846, + "learning_rate": 1.1895100636079387e-05, + "loss": 0.5287, + "step": 6490 + }, + { + "epoch": 0.7573460820895522, + "grad_norm": 0.44850719377837156, + "learning_rate": 1.186387729653296e-05, + "loss": 0.5147, + "step": 6495 + }, + { + "epoch": 0.757929104477612, + "grad_norm": 0.43547527328367247, + "learning_rate": 1.1832712083363865e-05, + "loss": 0.4774, + "step": 6500 + }, + { + "epoch": 0.7585121268656716, + "grad_norm": 0.48648908094016996, + "learning_rate": 1.1801605112426953e-05, + "loss": 0.4948, + "step": 6505 + }, + { + "epoch": 0.7590951492537313, + "grad_norm": 0.4362070385676467, + "learning_rate": 1.1770556499360593e-05, + "loss": 0.4768, + "step": 6510 + }, + { + "epoch": 0.7596781716417911, + "grad_norm": 0.4594448702342764, + "learning_rate": 1.1739566359586195e-05, + "loss": 0.5192, + "step": 6515 + }, + { + "epoch": 0.7602611940298507, + "grad_norm": 0.4895001977535846, + "learning_rate": 1.170863480830781e-05, + "loss": 0.5029, + "step": 6520 + }, + { + "epoch": 0.7608442164179104, + "grad_norm": 0.4811285447973769, + "learning_rate": 1.167776196051166e-05, + "loss": 0.5089, + "step": 6525 + }, + { + "epoch": 0.7614272388059702, + "grad_norm": 0.4169504765893255, + "learning_rate": 1.1646947930965795e-05, + "loss": 0.4477, + "step": 6530 + }, + { + "epoch": 0.7620102611940298, + "grad_norm": 0.4342791005744433, + "learning_rate": 1.1616192834219553e-05, + "loss": 0.4957, + "step": 6535 + }, + { + "epoch": 0.7625932835820896, + "grad_norm": 0.500766952555154, + "learning_rate": 1.1585496784603234e-05, + "loss": 0.474, + "step": 6540 + }, + { + "epoch": 0.7631763059701493, + "grad_norm": 0.459834987252376, + "learning_rate": 1.15548598962276e-05, + "loss": 0.4653, + "step": 6545 + }, + { + "epoch": 0.7637593283582089, + "grad_norm": 0.4634667523018127, + "learning_rate": 1.1524282282983526e-05, + "loss": 0.4952, + "step": 6550 + }, + { + "epoch": 0.7643423507462687, + "grad_norm": 0.48306231366385766, + "learning_rate": 1.1493764058541493e-05, + "loss": 0.5092, + "step": 6555 + }, + { + "epoch": 0.7649253731343284, + "grad_norm": 0.4412886363099704, + "learning_rate": 1.1463305336351233e-05, + "loss": 0.4836, + "step": 6560 + }, + { + "epoch": 0.765508395522388, + "grad_norm": 0.4594326939876668, + "learning_rate": 1.143290622964128e-05, + "loss": 0.4858, + "step": 6565 + }, + { + "epoch": 0.7660914179104478, + "grad_norm": 0.495560886941702, + "learning_rate": 1.1402566851418545e-05, + "loss": 0.484, + "step": 6570 + }, + { + "epoch": 0.7666744402985075, + "grad_norm": 0.44698231009276274, + "learning_rate": 1.1372287314467896e-05, + "loss": 0.4938, + "step": 6575 + }, + { + "epoch": 0.7672574626865671, + "grad_norm": 0.48489845254460334, + "learning_rate": 1.1342067731351754e-05, + "loss": 0.5349, + "step": 6580 + }, + { + "epoch": 0.7678404850746269, + "grad_norm": 0.3983706507650223, + "learning_rate": 1.1311908214409666e-05, + "loss": 0.4916, + "step": 6585 + }, + { + "epoch": 0.7684235074626866, + "grad_norm": 0.4829203676168014, + "learning_rate": 1.128180887575789e-05, + "loss": 0.5538, + "step": 6590 + }, + { + "epoch": 0.7690065298507462, + "grad_norm": 0.5181252743781259, + "learning_rate": 1.1251769827288953e-05, + "loss": 0.5103, + "step": 6595 + }, + { + "epoch": 0.769589552238806, + "grad_norm": 0.45245249797253495, + "learning_rate": 1.122179118067128e-05, + "loss": 0.4877, + "step": 6600 + }, + { + "epoch": 0.7701725746268657, + "grad_norm": 0.45213952635934074, + "learning_rate": 1.1191873047348743e-05, + "loss": 0.4699, + "step": 6605 + }, + { + "epoch": 0.7707555970149254, + "grad_norm": 0.4384946771009236, + "learning_rate": 1.1162015538540268e-05, + "loss": 0.5199, + "step": 6610 + }, + { + "epoch": 0.7713386194029851, + "grad_norm": 1.4634272550071448, + "learning_rate": 1.1132218765239417e-05, + "loss": 0.5111, + "step": 6615 + }, + { + "epoch": 0.7719216417910447, + "grad_norm": 0.4528179441810264, + "learning_rate": 1.1102482838213945e-05, + "loss": 0.5095, + "step": 6620 + }, + { + "epoch": 0.7725046641791045, + "grad_norm": 0.48790948502665343, + "learning_rate": 1.1072807868005438e-05, + "loss": 0.5375, + "step": 6625 + }, + { + "epoch": 0.7730876865671642, + "grad_norm": 0.43356020787990573, + "learning_rate": 1.104319396492888e-05, + "loss": 0.494, + "step": 6630 + }, + { + "epoch": 0.7736707089552238, + "grad_norm": 0.46568426433428856, + "learning_rate": 1.1013641239072233e-05, + "loss": 0.5089, + "step": 6635 + }, + { + "epoch": 0.7742537313432836, + "grad_norm": 0.5058210347479575, + "learning_rate": 1.098414980029603e-05, + "loss": 0.5278, + "step": 6640 + }, + { + "epoch": 0.7748367537313433, + "grad_norm": 0.46834473900137596, + "learning_rate": 1.0954719758232983e-05, + "loss": 0.5183, + "step": 6645 + }, + { + "epoch": 0.7754197761194029, + "grad_norm": 0.4485355699073728, + "learning_rate": 1.092535122228757e-05, + "loss": 0.5067, + "step": 6650 + }, + { + "epoch": 0.7760027985074627, + "grad_norm": 0.44029418480334825, + "learning_rate": 1.0896044301635616e-05, + "loss": 0.4902, + "step": 6655 + }, + { + "epoch": 0.7765858208955224, + "grad_norm": 0.46043398790080686, + "learning_rate": 1.0866799105223877e-05, + "loss": 0.4752, + "step": 6660 + }, + { + "epoch": 0.777168843283582, + "grad_norm": 0.45411148412679725, + "learning_rate": 1.0837615741769695e-05, + "loss": 0.5027, + "step": 6665 + }, + { + "epoch": 0.7777518656716418, + "grad_norm": 0.46002535963451235, + "learning_rate": 1.0808494319760511e-05, + "loss": 0.4818, + "step": 6670 + }, + { + "epoch": 0.7783348880597015, + "grad_norm": 0.4613160618534238, + "learning_rate": 1.0779434947453531e-05, + "loss": 0.5305, + "step": 6675 + }, + { + "epoch": 0.7789179104477612, + "grad_norm": 0.5192961001346327, + "learning_rate": 1.0750437732875265e-05, + "loss": 0.4909, + "step": 6680 + }, + { + "epoch": 0.7795009328358209, + "grad_norm": 0.47703840671844866, + "learning_rate": 1.0721502783821194e-05, + "loss": 0.5433, + "step": 6685 + }, + { + "epoch": 0.7800839552238806, + "grad_norm": 0.4027202848278968, + "learning_rate": 1.0692630207855296e-05, + "loss": 0.4795, + "step": 6690 + }, + { + "epoch": 0.7806669776119403, + "grad_norm": 0.5254895186437003, + "learning_rate": 1.0663820112309695e-05, + "loss": 0.5234, + "step": 6695 + }, + { + "epoch": 0.78125, + "grad_norm": 0.4714359353636831, + "learning_rate": 1.0635072604284254e-05, + "loss": 0.4837, + "step": 6700 + }, + { + "epoch": 0.7818330223880597, + "grad_norm": 0.4409933416168129, + "learning_rate": 1.0606387790646154e-05, + "loss": 0.5124, + "step": 6705 + }, + { + "epoch": 0.7824160447761194, + "grad_norm": 0.47666045358791953, + "learning_rate": 1.0577765778029525e-05, + "loss": 0.4762, + "step": 6710 + }, + { + "epoch": 0.7829990671641791, + "grad_norm": 0.48287842316913426, + "learning_rate": 1.0549206672835033e-05, + "loss": 0.4879, + "step": 6715 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.4494356289008418, + "learning_rate": 1.0520710581229507e-05, + "loss": 0.4816, + "step": 6720 + }, + { + "epoch": 0.7841651119402985, + "grad_norm": 0.46052908698109446, + "learning_rate": 1.049227760914549e-05, + "loss": 0.4516, + "step": 6725 + }, + { + "epoch": 0.7847481343283582, + "grad_norm": 0.4567314964584747, + "learning_rate": 1.0463907862280913e-05, + "loss": 0.4871, + "step": 6730 + }, + { + "epoch": 0.785331156716418, + "grad_norm": 0.44868088537045625, + "learning_rate": 1.043560144609866e-05, + "loss": 0.4955, + "step": 6735 + }, + { + "epoch": 0.7859141791044776, + "grad_norm": 0.4530134493607295, + "learning_rate": 1.0407358465826198e-05, + "loss": 0.5199, + "step": 6740 + }, + { + "epoch": 0.7864972014925373, + "grad_norm": 0.4463551169201282, + "learning_rate": 1.0379179026455136e-05, + "loss": 0.4913, + "step": 6745 + }, + { + "epoch": 0.7870802238805971, + "grad_norm": 0.47099610185569996, + "learning_rate": 1.0351063232740937e-05, + "loss": 0.5332, + "step": 6750 + }, + { + "epoch": 0.7876632462686567, + "grad_norm": 0.4402447824147342, + "learning_rate": 1.0323011189202408e-05, + "loss": 0.5053, + "step": 6755 + }, + { + "epoch": 0.7882462686567164, + "grad_norm": 0.45593057954899197, + "learning_rate": 1.0295023000121404e-05, + "loss": 0.474, + "step": 6760 + }, + { + "epoch": 0.7888292910447762, + "grad_norm": 0.4374300903516831, + "learning_rate": 1.0267098769542368e-05, + "loss": 0.5427, + "step": 6765 + }, + { + "epoch": 0.7894123134328358, + "grad_norm": 0.4216054919323253, + "learning_rate": 1.0239238601272036e-05, + "loss": 0.4862, + "step": 6770 + }, + { + "epoch": 0.7899953358208955, + "grad_norm": 0.43838979602724326, + "learning_rate": 1.0211442598878936e-05, + "loss": 0.4697, + "step": 6775 + }, + { + "epoch": 0.7905783582089553, + "grad_norm": 0.41927389582266983, + "learning_rate": 1.0183710865693105e-05, + "loss": 0.4731, + "step": 6780 + }, + { + "epoch": 0.7911613805970149, + "grad_norm": 0.4618407627786652, + "learning_rate": 1.0156043504805648e-05, + "loss": 0.4946, + "step": 6785 + }, + { + "epoch": 0.7917444029850746, + "grad_norm": 0.49476862297527385, + "learning_rate": 1.0128440619068379e-05, + "loss": 0.5218, + "step": 6790 + }, + { + "epoch": 0.7923274253731343, + "grad_norm": 0.4878473893991779, + "learning_rate": 1.0100902311093405e-05, + "loss": 0.5127, + "step": 6795 + }, + { + "epoch": 0.792910447761194, + "grad_norm": 0.44555429187543133, + "learning_rate": 1.0073428683252788e-05, + "loss": 0.4893, + "step": 6800 + }, + { + "epoch": 0.7934934701492538, + "grad_norm": 0.44977845000528854, + "learning_rate": 1.0046019837678153e-05, + "loss": 0.4687, + "step": 6805 + }, + { + "epoch": 0.7940764925373134, + "grad_norm": 0.447310487987635, + "learning_rate": 1.001867587626029e-05, + "loss": 0.484, + "step": 6810 + }, + { + "epoch": 0.7946595149253731, + "grad_norm": 0.48144807868357464, + "learning_rate": 9.991396900648774e-06, + "loss": 0.5395, + "step": 6815 + }, + { + "epoch": 0.7952425373134329, + "grad_norm": 0.46182249334458747, + "learning_rate": 9.964183012251619e-06, + "loss": 0.4914, + "step": 6820 + }, + { + "epoch": 0.7958255597014925, + "grad_norm": 0.4389736266429193, + "learning_rate": 9.937034312234872e-06, + "loss": 0.4966, + "step": 6825 + }, + { + "epoch": 0.7964085820895522, + "grad_norm": 0.4683635632125285, + "learning_rate": 9.90995090152225e-06, + "loss": 0.4872, + "step": 6830 + }, + { + "epoch": 0.796991604477612, + "grad_norm": 0.449577983637824, + "learning_rate": 9.88293288079476e-06, + "loss": 0.4775, + "step": 6835 + }, + { + "epoch": 0.7975746268656716, + "grad_norm": 0.4679257203680657, + "learning_rate": 9.855980350490315e-06, + "loss": 0.4628, + "step": 6840 + }, + { + "epoch": 0.7981576492537313, + "grad_norm": 0.4910686168202571, + "learning_rate": 9.82909341080339e-06, + "loss": 0.516, + "step": 6845 + }, + { + "epoch": 0.7987406716417911, + "grad_norm": 0.4328561795827866, + "learning_rate": 9.802272161684601e-06, + "loss": 0.5062, + "step": 6850 + }, + { + "epoch": 0.7993236940298507, + "grad_norm": 0.48810830150277185, + "learning_rate": 9.775516702840411e-06, + "loss": 0.542, + "step": 6855 + }, + { + "epoch": 0.7999067164179104, + "grad_norm": 0.4326359263896414, + "learning_rate": 9.748827133732665e-06, + "loss": 0.491, + "step": 6860 + }, + { + "epoch": 0.8004897388059702, + "grad_norm": 0.45895357662154007, + "learning_rate": 9.722203553578288e-06, + "loss": 0.5017, + "step": 6865 + }, + { + "epoch": 0.8010727611940298, + "grad_norm": 0.4201551976753478, + "learning_rate": 9.695646061348892e-06, + "loss": 0.5233, + "step": 6870 + }, + { + "epoch": 0.8016557835820896, + "grad_norm": 0.4261865295483104, + "learning_rate": 9.669154755770415e-06, + "loss": 0.4615, + "step": 6875 + }, + { + "epoch": 0.8022388059701493, + "grad_norm": 0.7140092170328386, + "learning_rate": 9.642729735322733e-06, + "loss": 0.5097, + "step": 6880 + }, + { + "epoch": 0.8028218283582089, + "grad_norm": 0.4990283440949626, + "learning_rate": 9.616371098239346e-06, + "loss": 0.5716, + "step": 6885 + }, + { + "epoch": 0.8034048507462687, + "grad_norm": 0.4481559993333328, + "learning_rate": 9.590078942506933e-06, + "loss": 0.4856, + "step": 6890 + }, + { + "epoch": 0.8039878731343284, + "grad_norm": 0.4607158596942868, + "learning_rate": 9.56385336586507e-06, + "loss": 0.5107, + "step": 6895 + }, + { + "epoch": 0.804570895522388, + "grad_norm": 0.476387734456041, + "learning_rate": 9.537694465805797e-06, + "loss": 0.4998, + "step": 6900 + }, + { + "epoch": 0.8051539179104478, + "grad_norm": 0.46770623541362477, + "learning_rate": 9.511602339573324e-06, + "loss": 0.4998, + "step": 6905 + }, + { + "epoch": 0.8057369402985075, + "grad_norm": 0.5086514392993742, + "learning_rate": 9.485577084163604e-06, + "loss": 0.5105, + "step": 6910 + }, + { + "epoch": 0.8063199626865671, + "grad_norm": 0.5077908007862963, + "learning_rate": 9.45961879632401e-06, + "loss": 0.5116, + "step": 6915 + }, + { + "epoch": 0.8069029850746269, + "grad_norm": 0.5463748621976762, + "learning_rate": 9.43372757255297e-06, + "loss": 0.4687, + "step": 6920 + }, + { + "epoch": 0.8074860074626866, + "grad_norm": 0.4191317799403366, + "learning_rate": 9.40790350909961e-06, + "loss": 0.498, + "step": 6925 + }, + { + "epoch": 0.8080690298507462, + "grad_norm": 0.4865303702117012, + "learning_rate": 9.382146701963373e-06, + "loss": 0.456, + "step": 6930 + }, + { + "epoch": 0.808652052238806, + "grad_norm": 0.42674988798148156, + "learning_rate": 9.356457246893695e-06, + "loss": 0.5227, + "step": 6935 + }, + { + "epoch": 0.8092350746268657, + "grad_norm": 0.46859664111243005, + "learning_rate": 9.330835239389645e-06, + "loss": 0.5018, + "step": 6940 + }, + { + "epoch": 0.8098180970149254, + "grad_norm": 0.5581605519629456, + "learning_rate": 9.305280774699531e-06, + "loss": 0.4893, + "step": 6945 + }, + { + "epoch": 0.8104011194029851, + "grad_norm": 0.4648844812652044, + "learning_rate": 9.279793947820596e-06, + "loss": 0.5034, + "step": 6950 + }, + { + "epoch": 0.8109841417910447, + "grad_norm": 0.4348511386801257, + "learning_rate": 9.254374853498636e-06, + "loss": 0.476, + "step": 6955 + }, + { + "epoch": 0.8115671641791045, + "grad_norm": 0.4471748979975722, + "learning_rate": 9.229023586227666e-06, + "loss": 0.4746, + "step": 6960 + }, + { + "epoch": 0.8121501865671642, + "grad_norm": 0.49028467746657334, + "learning_rate": 9.203740240249527e-06, + "loss": 0.5075, + "step": 6965 + }, + { + "epoch": 0.8127332089552238, + "grad_norm": 0.45156139381243476, + "learning_rate": 9.178524909553617e-06, + "loss": 0.4997, + "step": 6970 + }, + { + "epoch": 0.8133162313432836, + "grad_norm": 0.4580984480401016, + "learning_rate": 9.153377687876439e-06, + "loss": 0.5098, + "step": 6975 + }, + { + "epoch": 0.8138992537313433, + "grad_norm": 0.4417064208682211, + "learning_rate": 9.128298668701341e-06, + "loss": 0.4866, + "step": 6980 + }, + { + "epoch": 0.8144822761194029, + "grad_norm": 0.4396784431419953, + "learning_rate": 9.103287945258104e-06, + "loss": 0.4843, + "step": 6985 + }, + { + "epoch": 0.8150652985074627, + "grad_norm": 0.42781740173572586, + "learning_rate": 9.078345610522662e-06, + "loss": 0.4864, + "step": 6990 + }, + { + "epoch": 0.8156483208955224, + "grad_norm": 0.4706194313899779, + "learning_rate": 9.053471757216675e-06, + "loss": 0.4829, + "step": 6995 + }, + { + "epoch": 0.816231343283582, + "grad_norm": 0.4373005186110652, + "learning_rate": 9.028666477807253e-06, + "loss": 0.4946, + "step": 7000 + }, + { + "epoch": 0.8168143656716418, + "grad_norm": 0.45822848787961307, + "learning_rate": 9.003929864506583e-06, + "loss": 0.4747, + "step": 7005 + }, + { + "epoch": 0.8173973880597015, + "grad_norm": 0.4376578697732612, + "learning_rate": 8.979262009271589e-06, + "loss": 0.4982, + "step": 7010 + }, + { + "epoch": 0.8179804104477612, + "grad_norm": 0.50115571632827, + "learning_rate": 8.954663003803579e-06, + "loss": 0.4934, + "step": 7015 + }, + { + "epoch": 0.8185634328358209, + "grad_norm": 0.44219752315988364, + "learning_rate": 8.930132939547932e-06, + "loss": 0.4663, + "step": 7020 + }, + { + "epoch": 0.8191464552238806, + "grad_norm": 0.440108685069607, + "learning_rate": 8.905671907693738e-06, + "loss": 0.4856, + "step": 7025 + }, + { + "epoch": 0.8197294776119403, + "grad_norm": 0.5246570100006753, + "learning_rate": 8.881279999173466e-06, + "loss": 0.5021, + "step": 7030 + }, + { + "epoch": 0.8203125, + "grad_norm": 0.4319239074767847, + "learning_rate": 8.856957304662602e-06, + "loss": 0.5123, + "step": 7035 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.4326829341676435, + "learning_rate": 8.832703914579363e-06, + "loss": 0.5021, + "step": 7040 + }, + { + "epoch": 0.8214785447761194, + "grad_norm": 0.4380817083829351, + "learning_rate": 8.80851991908431e-06, + "loss": 0.5044, + "step": 7045 + }, + { + "epoch": 0.8220615671641791, + "grad_norm": 0.4637369898587162, + "learning_rate": 8.784405408080046e-06, + "loss": 0.4852, + "step": 7050 + }, + { + "epoch": 0.8226445895522388, + "grad_norm": 0.4796910879347511, + "learning_rate": 8.760360471210865e-06, + "loss": 0.5338, + "step": 7055 + }, + { + "epoch": 0.8232276119402985, + "grad_norm": 0.4135280946993472, + "learning_rate": 8.736385197862415e-06, + "loss": 0.4824, + "step": 7060 + }, + { + "epoch": 0.8238106343283582, + "grad_norm": 0.4287858690307368, + "learning_rate": 8.712479677161388e-06, + "loss": 0.4869, + "step": 7065 + }, + { + "epoch": 0.824393656716418, + "grad_norm": 0.4714198385972294, + "learning_rate": 8.688643997975156e-06, + "loss": 0.5143, + "step": 7070 + }, + { + "epoch": 0.8249766791044776, + "grad_norm": 0.44395978208148723, + "learning_rate": 8.66487824891149e-06, + "loss": 0.4922, + "step": 7075 + }, + { + "epoch": 0.8255597014925373, + "grad_norm": 0.508754740613158, + "learning_rate": 8.641182518318162e-06, + "loss": 0.5184, + "step": 7080 + }, + { + "epoch": 0.8261427238805971, + "grad_norm": 0.4026422778012762, + "learning_rate": 8.617556894282683e-06, + "loss": 0.4886, + "step": 7085 + }, + { + "epoch": 0.8267257462686567, + "grad_norm": 0.47446388322469524, + "learning_rate": 8.594001464631938e-06, + "loss": 0.5189, + "step": 7090 + }, + { + "epoch": 0.8273087686567164, + "grad_norm": 0.47024774444454565, + "learning_rate": 8.570516316931869e-06, + "loss": 0.5266, + "step": 7095 + }, + { + "epoch": 0.8278917910447762, + "grad_norm": 0.44748825048810753, + "learning_rate": 8.547101538487136e-06, + "loss": 0.4845, + "step": 7100 + }, + { + "epoch": 0.8284748134328358, + "grad_norm": 0.4806024012846129, + "learning_rate": 8.52375721634083e-06, + "loss": 0.5031, + "step": 7105 + }, + { + "epoch": 0.8290578358208955, + "grad_norm": 0.5413375895981546, + "learning_rate": 8.5004834372741e-06, + "loss": 0.5106, + "step": 7110 + }, + { + "epoch": 0.8296408582089553, + "grad_norm": 0.4337024823163042, + "learning_rate": 8.477280287805883e-06, + "loss": 0.4954, + "step": 7115 + }, + { + "epoch": 0.8302238805970149, + "grad_norm": 0.4344528368728774, + "learning_rate": 8.454147854192515e-06, + "loss": 0.5021, + "step": 7120 + }, + { + "epoch": 0.8308069029850746, + "grad_norm": 0.45520925847409455, + "learning_rate": 8.4310862224275e-06, + "loss": 0.4828, + "step": 7125 + }, + { + "epoch": 0.8313899253731343, + "grad_norm": 0.4734386762743593, + "learning_rate": 8.408095478241099e-06, + "loss": 0.5243, + "step": 7130 + }, + { + "epoch": 0.831972947761194, + "grad_norm": 0.4717328221940539, + "learning_rate": 8.385175707100064e-06, + "loss": 0.4907, + "step": 7135 + }, + { + "epoch": 0.8325559701492538, + "grad_norm": 0.41240208092851716, + "learning_rate": 8.36232699420732e-06, + "loss": 0.5031, + "step": 7140 + }, + { + "epoch": 0.8331389925373134, + "grad_norm": 0.41514081930390884, + "learning_rate": 8.33954942450163e-06, + "loss": 0.4896, + "step": 7145 + }, + { + "epoch": 0.8337220149253731, + "grad_norm": 0.4675941109997365, + "learning_rate": 8.316843082657277e-06, + "loss": 0.5009, + "step": 7150 + }, + { + "epoch": 0.8343050373134329, + "grad_norm": 0.4427930397053809, + "learning_rate": 8.294208053083771e-06, + "loss": 0.511, + "step": 7155 + }, + { + "epoch": 0.8348880597014925, + "grad_norm": 0.45949248005156934, + "learning_rate": 8.271644419925526e-06, + "loss": 0.4719, + "step": 7160 + }, + { + "epoch": 0.8354710820895522, + "grad_norm": 0.471413986453529, + "learning_rate": 8.249152267061524e-06, + "loss": 0.4994, + "step": 7165 + }, + { + "epoch": 0.836054104477612, + "grad_norm": 0.44813478705837095, + "learning_rate": 8.226731678105045e-06, + "loss": 0.4804, + "step": 7170 + }, + { + "epoch": 0.8366371268656716, + "grad_norm": 0.4426301509227603, + "learning_rate": 8.20438273640332e-06, + "loss": 0.5143, + "step": 7175 + }, + { + "epoch": 0.8372201492537313, + "grad_norm": 0.4465652598526728, + "learning_rate": 8.18210552503725e-06, + "loss": 0.4871, + "step": 7180 + }, + { + "epoch": 0.8378031716417911, + "grad_norm": 0.47244655194156704, + "learning_rate": 8.159900126821062e-06, + "loss": 0.5003, + "step": 7185 + }, + { + "epoch": 0.8383861940298507, + "grad_norm": 0.4471968512251493, + "learning_rate": 8.137766624302036e-06, + "loss": 0.5142, + "step": 7190 + }, + { + "epoch": 0.8389692164179104, + "grad_norm": 0.4709917176706898, + "learning_rate": 8.115705099760184e-06, + "loss": 0.5195, + "step": 7195 + }, + { + "epoch": 0.8395522388059702, + "grad_norm": 0.4380879394783618, + "learning_rate": 8.093715635207948e-06, + "loss": 0.4885, + "step": 7200 + }, + { + "epoch": 0.8401352611940298, + "grad_norm": 0.41712410093981933, + "learning_rate": 8.071798312389863e-06, + "loss": 0.5116, + "step": 7205 + }, + { + "epoch": 0.8407182835820896, + "grad_norm": 0.4325869434630372, + "learning_rate": 8.049953212782329e-06, + "loss": 0.4478, + "step": 7210 + }, + { + "epoch": 0.8413013059701493, + "grad_norm": 0.4166574600617677, + "learning_rate": 8.028180417593215e-06, + "loss": 0.4824, + "step": 7215 + }, + { + "epoch": 0.8418843283582089, + "grad_norm": 0.43830478937394324, + "learning_rate": 8.006480007761628e-06, + "loss": 0.4674, + "step": 7220 + }, + { + "epoch": 0.8424673507462687, + "grad_norm": 0.4817669338294806, + "learning_rate": 7.98485206395758e-06, + "loss": 0.5077, + "step": 7225 + }, + { + "epoch": 0.8430503731343284, + "grad_norm": 0.4647045998674549, + "learning_rate": 7.963296666581703e-06, + "loss": 0.4702, + "step": 7230 + }, + { + "epoch": 0.843633395522388, + "grad_norm": 0.49174994550827694, + "learning_rate": 7.941813895764919e-06, + "loss": 0.5212, + "step": 7235 + }, + { + "epoch": 0.8442164179104478, + "grad_norm": 0.41757383756848515, + "learning_rate": 7.920403831368189e-06, + "loss": 0.5016, + "step": 7240 + }, + { + "epoch": 0.8447994402985075, + "grad_norm": 0.44263407474750815, + "learning_rate": 7.899066552982179e-06, + "loss": 0.4994, + "step": 7245 + }, + { + "epoch": 0.8453824626865671, + "grad_norm": 0.46533860910272895, + "learning_rate": 7.87780213992699e-06, + "loss": 0.4973, + "step": 7250 + }, + { + "epoch": 0.8459654850746269, + "grad_norm": 0.4396469203604777, + "learning_rate": 7.856610671251826e-06, + "loss": 0.4962, + "step": 7255 + }, + { + "epoch": 0.8465485074626866, + "grad_norm": 0.461412303324744, + "learning_rate": 7.835492225734753e-06, + "loss": 0.4848, + "step": 7260 + }, + { + "epoch": 0.8471315298507462, + "grad_norm": 0.5184880522572166, + "learning_rate": 7.81444688188236e-06, + "loss": 0.5133, + "step": 7265 + }, + { + "epoch": 0.847714552238806, + "grad_norm": 0.47001007740037093, + "learning_rate": 7.793474717929495e-06, + "loss": 0.4852, + "step": 7270 + }, + { + "epoch": 0.8482975746268657, + "grad_norm": 0.47109511656808234, + "learning_rate": 7.772575811838948e-06, + "loss": 0.4961, + "step": 7275 + }, + { + "epoch": 0.8488805970149254, + "grad_norm": 0.44116994424682127, + "learning_rate": 7.751750241301192e-06, + "loss": 0.4972, + "step": 7280 + }, + { + "epoch": 0.8494636194029851, + "grad_norm": 0.4501932519308333, + "learning_rate": 7.730998083734083e-06, + "loss": 0.4859, + "step": 7285 + }, + { + "epoch": 0.8500466417910447, + "grad_norm": 0.48144234778210115, + "learning_rate": 7.710319416282543e-06, + "loss": 0.4984, + "step": 7290 + }, + { + "epoch": 0.8506296641791045, + "grad_norm": 0.456988520365441, + "learning_rate": 7.689714315818339e-06, + "loss": 0.5232, + "step": 7295 + }, + { + "epoch": 0.8512126865671642, + "grad_norm": 0.5275485527755961, + "learning_rate": 7.669182858939715e-06, + "loss": 0.494, + "step": 7300 + }, + { + "epoch": 0.8517957089552238, + "grad_norm": 0.4358686155004703, + "learning_rate": 7.648725121971178e-06, + "loss": 0.4652, + "step": 7305 + }, + { + "epoch": 0.8523787313432836, + "grad_norm": 0.5241681742015615, + "learning_rate": 7.628341180963175e-06, + "loss": 0.5107, + "step": 7310 + }, + { + "epoch": 0.8529617537313433, + "grad_norm": 0.4446315316066114, + "learning_rate": 7.608031111691826e-06, + "loss": 0.4736, + "step": 7315 + }, + { + "epoch": 0.8535447761194029, + "grad_norm": 0.4586742881823037, + "learning_rate": 7.587794989658621e-06, + "loss": 0.4789, + "step": 7320 + }, + { + "epoch": 0.8541277985074627, + "grad_norm": 0.43393817483796693, + "learning_rate": 7.567632890090176e-06, + "loss": 0.4517, + "step": 7325 + }, + { + "epoch": 0.8547108208955224, + "grad_norm": 0.4530263088196386, + "learning_rate": 7.5475448879379255e-06, + "loss": 0.5204, + "step": 7330 + }, + { + "epoch": 0.855293843283582, + "grad_norm": 0.47232925084661864, + "learning_rate": 7.527531057877849e-06, + "loss": 0.5212, + "step": 7335 + }, + { + "epoch": 0.8558768656716418, + "grad_norm": 0.4413601041808957, + "learning_rate": 7.507591474310185e-06, + "loss": 0.4907, + "step": 7340 + }, + { + "epoch": 0.8564598880597015, + "grad_norm": 0.5105056148052758, + "learning_rate": 7.487726211359198e-06, + "loss": 0.5465, + "step": 7345 + }, + { + "epoch": 0.8570429104477612, + "grad_norm": 0.6715647626166893, + "learning_rate": 7.46793534287283e-06, + "loss": 0.5069, + "step": 7350 + }, + { + "epoch": 0.8576259328358209, + "grad_norm": 0.4588809773198785, + "learning_rate": 7.448218942422498e-06, + "loss": 0.5474, + "step": 7355 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 0.4831576147260091, + "learning_rate": 7.428577083302757e-06, + "loss": 0.5093, + "step": 7360 + }, + { + "epoch": 0.8587919776119403, + "grad_norm": 0.5042721886700091, + "learning_rate": 7.409009838531095e-06, + "loss": 0.5375, + "step": 7365 + }, + { + "epoch": 0.859375, + "grad_norm": 0.46284073380005986, + "learning_rate": 7.389517280847598e-06, + "loss": 0.5159, + "step": 7370 + }, + { + "epoch": 0.8599580223880597, + "grad_norm": 0.4661662360141043, + "learning_rate": 7.370099482714715e-06, + "loss": 0.5315, + "step": 7375 + }, + { + "epoch": 0.8605410447761194, + "grad_norm": 0.4476959939432871, + "learning_rate": 7.35075651631699e-06, + "loss": 0.4901, + "step": 7380 + }, + { + "epoch": 0.8611240671641791, + "grad_norm": 0.44717566109177675, + "learning_rate": 7.331488453560767e-06, + "loss": 0.4921, + "step": 7385 + }, + { + "epoch": 0.8617070895522388, + "grad_norm": 0.4476851149766431, + "learning_rate": 7.312295366073952e-06, + "loss": 0.4839, + "step": 7390 + }, + { + "epoch": 0.8622901119402985, + "grad_norm": 0.4656768131692163, + "learning_rate": 7.293177325205734e-06, + "loss": 0.5089, + "step": 7395 + }, + { + "epoch": 0.8628731343283582, + "grad_norm": 0.47867044879346754, + "learning_rate": 7.274134402026321e-06, + "loss": 0.5204, + "step": 7400 + }, + { + "epoch": 0.863456156716418, + "grad_norm": 0.4495376624711596, + "learning_rate": 7.255166667326668e-06, + "loss": 0.4864, + "step": 7405 + }, + { + "epoch": 0.8640391791044776, + "grad_norm": 0.45540868280070557, + "learning_rate": 7.236274191618228e-06, + "loss": 0.4969, + "step": 7410 + }, + { + "epoch": 0.8646222014925373, + "grad_norm": 0.5275767191951953, + "learning_rate": 7.217457045132682e-06, + "loss": 0.5086, + "step": 7415 + }, + { + "epoch": 0.8652052238805971, + "grad_norm": 0.42863000174753835, + "learning_rate": 7.198715297821681e-06, + "loss": 0.5033, + "step": 7420 + }, + { + "epoch": 0.8657882462686567, + "grad_norm": 0.4834849241582333, + "learning_rate": 7.18004901935657e-06, + "loss": 0.5057, + "step": 7425 + }, + { + "epoch": 0.8663712686567164, + "grad_norm": 0.46483441625060457, + "learning_rate": 7.161458279128172e-06, + "loss": 0.5058, + "step": 7430 + }, + { + "epoch": 0.8669542910447762, + "grad_norm": 0.4699147577232594, + "learning_rate": 7.142943146246471e-06, + "loss": 0.5052, + "step": 7435 + }, + { + "epoch": 0.8675373134328358, + "grad_norm": 0.4728336401428503, + "learning_rate": 7.124503689540403e-06, + "loss": 0.4945, + "step": 7440 + }, + { + "epoch": 0.8681203358208955, + "grad_norm": 0.42756247469245834, + "learning_rate": 7.106139977557563e-06, + "loss": 0.4868, + "step": 7445 + }, + { + "epoch": 0.8687033582089553, + "grad_norm": 0.5508454557117077, + "learning_rate": 7.087852078564006e-06, + "loss": 0.5078, + "step": 7450 + }, + { + "epoch": 0.8692863805970149, + "grad_norm": 0.42365611783227247, + "learning_rate": 7.069640060543914e-06, + "loss": 0.4795, + "step": 7455 + }, + { + "epoch": 0.8698694029850746, + "grad_norm": 0.4619373613002068, + "learning_rate": 7.051503991199415e-06, + "loss": 0.5093, + "step": 7460 + }, + { + "epoch": 0.8704524253731343, + "grad_norm": 0.46430333389633666, + "learning_rate": 7.03344393795029e-06, + "loss": 0.4628, + "step": 7465 + }, + { + "epoch": 0.871035447761194, + "grad_norm": 0.46350593595683315, + "learning_rate": 7.0154599679337405e-06, + "loss": 0.4966, + "step": 7470 + }, + { + "epoch": 0.8716184701492538, + "grad_norm": 0.46997441642678967, + "learning_rate": 6.997552148004124e-06, + "loss": 0.4619, + "step": 7475 + }, + { + "epoch": 0.8722014925373134, + "grad_norm": 0.4940809056382957, + "learning_rate": 6.9797205447327236e-06, + "loss": 0.5207, + "step": 7480 + }, + { + "epoch": 0.8727845149253731, + "grad_norm": 0.42737497657553214, + "learning_rate": 6.961965224407487e-06, + "loss": 0.4879, + "step": 7485 + }, + { + "epoch": 0.8733675373134329, + "grad_norm": 0.4420504460833519, + "learning_rate": 6.944286253032789e-06, + "loss": 0.4519, + "step": 7490 + }, + { + "epoch": 0.8739505597014925, + "grad_norm": 0.4482189949894458, + "learning_rate": 6.9266836963291725e-06, + "loss": 0.5216, + "step": 7495 + }, + { + "epoch": 0.8745335820895522, + "grad_norm": 0.48507550585616305, + "learning_rate": 6.90915761973312e-06, + "loss": 0.4904, + "step": 7500 + }, + { + "epoch": 0.875116604477612, + "grad_norm": 0.4845741351343996, + "learning_rate": 6.891708088396803e-06, + "loss": 0.5031, + "step": 7505 + }, + { + "epoch": 0.8756996268656716, + "grad_norm": 0.45661614078718604, + "learning_rate": 6.874335167187844e-06, + "loss": 0.4694, + "step": 7510 + }, + { + "epoch": 0.8762826492537313, + "grad_norm": 0.4378046368918111, + "learning_rate": 6.857038920689068e-06, + "loss": 0.473, + "step": 7515 + }, + { + "epoch": 0.8768656716417911, + "grad_norm": 0.4351278013857574, + "learning_rate": 6.839819413198259e-06, + "loss": 0.4686, + "step": 7520 + }, + { + "epoch": 0.8774486940298507, + "grad_norm": 0.4598040793979348, + "learning_rate": 6.822676708727941e-06, + "loss": 0.5058, + "step": 7525 + }, + { + "epoch": 0.8780317164179104, + "grad_norm": 0.46734999477037775, + "learning_rate": 6.805610871005115e-06, + "loss": 0.5142, + "step": 7530 + }, + { + "epoch": 0.8786147388059702, + "grad_norm": 0.42358187968622485, + "learning_rate": 6.788621963471055e-06, + "loss": 0.4656, + "step": 7535 + }, + { + "epoch": 0.8791977611940298, + "grad_norm": 0.42943564345062557, + "learning_rate": 6.771710049281019e-06, + "loss": 0.4866, + "step": 7540 + }, + { + "epoch": 0.8797807835820896, + "grad_norm": 0.46090527920896157, + "learning_rate": 6.754875191304076e-06, + "loss": 0.5283, + "step": 7545 + }, + { + "epoch": 0.8803638059701493, + "grad_norm": 0.46574541087337956, + "learning_rate": 6.73811745212283e-06, + "loss": 0.5072, + "step": 7550 + }, + { + "epoch": 0.8809468283582089, + "grad_norm": 0.5402513744522806, + "learning_rate": 6.721436894033206e-06, + "loss": 0.4474, + "step": 7555 + }, + { + "epoch": 0.8815298507462687, + "grad_norm": 0.4887426580865765, + "learning_rate": 6.704833579044198e-06, + "loss": 0.4945, + "step": 7560 + }, + { + "epoch": 0.8821128731343284, + "grad_norm": 0.4720599639216187, + "learning_rate": 6.688307568877681e-06, + "loss": 0.4757, + "step": 7565 + }, + { + "epoch": 0.882695895522388, + "grad_norm": 0.44242282186933635, + "learning_rate": 6.6718589249681215e-06, + "loss": 0.5141, + "step": 7570 + }, + { + "epoch": 0.8832789179104478, + "grad_norm": 0.47443100760410045, + "learning_rate": 6.655487708462407e-06, + "loss": 0.479, + "step": 7575 + }, + { + "epoch": 0.8838619402985075, + "grad_norm": 0.4484432447267727, + "learning_rate": 6.639193980219574e-06, + "loss": 0.503, + "step": 7580 + }, + { + "epoch": 0.8844449626865671, + "grad_norm": 0.4544364445156426, + "learning_rate": 6.622977800810626e-06, + "loss": 0.4757, + "step": 7585 + }, + { + "epoch": 0.8850279850746269, + "grad_norm": 0.470997939385338, + "learning_rate": 6.60683923051825e-06, + "loss": 0.4791, + "step": 7590 + }, + { + "epoch": 0.8856110074626866, + "grad_norm": 0.4213706332803066, + "learning_rate": 6.5907783293366525e-06, + "loss": 0.468, + "step": 7595 + }, + { + "epoch": 0.8861940298507462, + "grad_norm": 0.4352892615766016, + "learning_rate": 6.574795156971298e-06, + "loss": 0.4843, + "step": 7600 + }, + { + "epoch": 0.886777052238806, + "grad_norm": 0.45036633475522897, + "learning_rate": 6.5588897728387055e-06, + "loss": 0.4705, + "step": 7605 + }, + { + "epoch": 0.8873600746268657, + "grad_norm": 0.5077288646140908, + "learning_rate": 6.543062236066208e-06, + "loss": 0.4791, + "step": 7610 + }, + { + "epoch": 0.8879430970149254, + "grad_norm": 0.4513500980150284, + "learning_rate": 6.527312605491758e-06, + "loss": 0.5178, + "step": 7615 + }, + { + "epoch": 0.8885261194029851, + "grad_norm": 0.4274182074103898, + "learning_rate": 6.5116409396636935e-06, + "loss": 0.4626, + "step": 7620 + }, + { + "epoch": 0.8891091417910447, + "grad_norm": 0.4667801241554799, + "learning_rate": 6.496047296840513e-06, + "loss": 0.5071, + "step": 7625 + }, + { + "epoch": 0.8896921641791045, + "grad_norm": 0.4123902058374272, + "learning_rate": 6.480531734990686e-06, + "loss": 0.4992, + "step": 7630 + }, + { + "epoch": 0.8902751865671642, + "grad_norm": 0.47897565830736566, + "learning_rate": 6.4650943117924065e-06, + "loss": 0.5153, + "step": 7635 + }, + { + "epoch": 0.8908582089552238, + "grad_norm": 0.47530139972165525, + "learning_rate": 6.449735084633407e-06, + "loss": 0.4857, + "step": 7640 + }, + { + "epoch": 0.8914412313432836, + "grad_norm": 0.4311055331542708, + "learning_rate": 6.4344541106107046e-06, + "loss": 0.4877, + "step": 7645 + }, + { + "epoch": 0.8920242537313433, + "grad_norm": 0.4668273679980216, + "learning_rate": 6.419251446530451e-06, + "loss": 0.5164, + "step": 7650 + }, + { + "epoch": 0.8926072761194029, + "grad_norm": 0.44140094368731336, + "learning_rate": 6.404127148907656e-06, + "loss": 0.4784, + "step": 7655 + }, + { + "epoch": 0.8931902985074627, + "grad_norm": 0.4544488440695652, + "learning_rate": 6.38908127396602e-06, + "loss": 0.4988, + "step": 7660 + }, + { + "epoch": 0.8937733208955224, + "grad_norm": 0.4557410652922852, + "learning_rate": 6.374113877637701e-06, + "loss": 0.4987, + "step": 7665 + }, + { + "epoch": 0.894356343283582, + "grad_norm": 0.4444789436043567, + "learning_rate": 6.359225015563138e-06, + "loss": 0.5243, + "step": 7670 + }, + { + "epoch": 0.8949393656716418, + "grad_norm": 0.4698237483504685, + "learning_rate": 6.3444147430908015e-06, + "loss": 0.4691, + "step": 7675 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.4859998755530479, + "learning_rate": 6.329683115277018e-06, + "loss": 0.4916, + "step": 7680 + }, + { + "epoch": 0.8961054104477612, + "grad_norm": 0.515441325427795, + "learning_rate": 6.315030186885763e-06, + "loss": 0.5116, + "step": 7685 + }, + { + "epoch": 0.8966884328358209, + "grad_norm": 0.5283029015600534, + "learning_rate": 6.300456012388446e-06, + "loss": 0.4934, + "step": 7690 + }, + { + "epoch": 0.8972714552238806, + "grad_norm": 0.4417646004083893, + "learning_rate": 6.285960645963708e-06, + "loss": 0.5026, + "step": 7695 + }, + { + "epoch": 0.8978544776119403, + "grad_norm": 0.4605167879082956, + "learning_rate": 6.271544141497232e-06, + "loss": 0.4901, + "step": 7700 + }, + { + "epoch": 0.8984375, + "grad_norm": 0.4792028373581772, + "learning_rate": 6.257206552581541e-06, + "loss": 0.5118, + "step": 7705 + }, + { + "epoch": 0.8990205223880597, + "grad_norm": 0.4644526939230792, + "learning_rate": 6.242947932515786e-06, + "loss": 0.5282, + "step": 7710 + }, + { + "epoch": 0.8996035447761194, + "grad_norm": 0.5046663803144333, + "learning_rate": 6.228768334305555e-06, + "loss": 0.4946, + "step": 7715 + }, + { + "epoch": 0.9001865671641791, + "grad_norm": 0.43155084823278395, + "learning_rate": 6.214667810662682e-06, + "loss": 0.4794, + "step": 7720 + }, + { + "epoch": 0.9007695895522388, + "grad_norm": 0.4640789279302754, + "learning_rate": 6.200646414005046e-06, + "loss": 0.5239, + "step": 7725 + }, + { + "epoch": 0.9013526119402985, + "grad_norm": 0.45218047321306765, + "learning_rate": 6.18670419645637e-06, + "loss": 0.5015, + "step": 7730 + }, + { + "epoch": 0.9019356343283582, + "grad_norm": 0.4588478078182171, + "learning_rate": 6.172841209846046e-06, + "loss": 0.4913, + "step": 7735 + }, + { + "epoch": 0.902518656716418, + "grad_norm": 0.4433138530597733, + "learning_rate": 6.159057505708912e-06, + "loss": 0.4594, + "step": 7740 + }, + { + "epoch": 0.9031016791044776, + "grad_norm": 0.42245796632484117, + "learning_rate": 6.145353135285091e-06, + "loss": 0.4945, + "step": 7745 + }, + { + "epoch": 0.9036847014925373, + "grad_norm": 0.4756088628902291, + "learning_rate": 6.131728149519778e-06, + "loss": 0.4932, + "step": 7750 + }, + { + "epoch": 0.9042677238805971, + "grad_norm": 0.4597129158786207, + "learning_rate": 6.118182599063075e-06, + "loss": 0.5354, + "step": 7755 + }, + { + "epoch": 0.9048507462686567, + "grad_norm": 0.503389587802853, + "learning_rate": 6.104716534269772e-06, + "loss": 0.5077, + "step": 7760 + }, + { + "epoch": 0.9054337686567164, + "grad_norm": 0.4249365343444884, + "learning_rate": 6.091330005199183e-06, + "loss": 0.4868, + "step": 7765 + }, + { + "epoch": 0.9060167910447762, + "grad_norm": 0.43851878310292347, + "learning_rate": 6.078023061614953e-06, + "loss": 0.5015, + "step": 7770 + }, + { + "epoch": 0.9065998134328358, + "grad_norm": 0.43139132012318054, + "learning_rate": 6.064795752984875e-06, + "loss": 0.4832, + "step": 7775 + }, + { + "epoch": 0.9071828358208955, + "grad_norm": 0.45409545061847195, + "learning_rate": 6.0516481284806885e-06, + "loss": 0.4794, + "step": 7780 + }, + { + "epoch": 0.9077658582089553, + "grad_norm": 0.4411595137434583, + "learning_rate": 6.0385802369779385e-06, + "loss": 0.5183, + "step": 7785 + }, + { + "epoch": 0.9083488805970149, + "grad_norm": 0.45410062982495236, + "learning_rate": 6.025592127055741e-06, + "loss": 0.4736, + "step": 7790 + }, + { + "epoch": 0.9089319029850746, + "grad_norm": 0.42835954681482424, + "learning_rate": 6.012683846996645e-06, + "loss": 0.4541, + "step": 7795 + }, + { + "epoch": 0.9095149253731343, + "grad_norm": 0.42047696751886715, + "learning_rate": 5.999855444786425e-06, + "loss": 0.4973, + "step": 7800 + }, + { + "epoch": 0.910097947761194, + "grad_norm": 0.44914336580430964, + "learning_rate": 5.987106968113928e-06, + "loss": 0.5061, + "step": 7805 + }, + { + "epoch": 0.9106809701492538, + "grad_norm": 0.460206776855078, + "learning_rate": 5.974438464370872e-06, + "loss": 0.4888, + "step": 7810 + }, + { + "epoch": 0.9112639925373134, + "grad_norm": 0.4470862201000093, + "learning_rate": 5.961849980651684e-06, + "loss": 0.4659, + "step": 7815 + }, + { + "epoch": 0.9118470149253731, + "grad_norm": 0.47720200193566115, + "learning_rate": 5.949341563753319e-06, + "loss": 0.4992, + "step": 7820 + }, + { + "epoch": 0.9124300373134329, + "grad_norm": 0.4676169159668866, + "learning_rate": 5.936913260175094e-06, + "loss": 0.5179, + "step": 7825 + }, + { + "epoch": 0.9130130597014925, + "grad_norm": 0.4584704293118159, + "learning_rate": 5.924565116118499e-06, + "loss": 0.4797, + "step": 7830 + }, + { + "epoch": 0.9135960820895522, + "grad_norm": 0.4776754416693919, + "learning_rate": 5.9122971774870435e-06, + "loss": 0.4678, + "step": 7835 + }, + { + "epoch": 0.914179104477612, + "grad_norm": 0.42982811564868484, + "learning_rate": 5.900109489886081e-06, + "loss": 0.4804, + "step": 7840 + }, + { + "epoch": 0.9147621268656716, + "grad_norm": 0.41729830382315986, + "learning_rate": 5.8880020986226285e-06, + "loss": 0.468, + "step": 7845 + }, + { + "epoch": 0.9153451492537313, + "grad_norm": 0.4582133802824904, + "learning_rate": 5.875975048705206e-06, + "loss": 0.4939, + "step": 7850 + }, + { + "epoch": 0.9159281716417911, + "grad_norm": 0.4934433745443256, + "learning_rate": 5.864028384843678e-06, + "loss": 0.512, + "step": 7855 + }, + { + "epoch": 0.9165111940298507, + "grad_norm": 0.4461718822871272, + "learning_rate": 5.8521621514490715e-06, + "loss": 0.5226, + "step": 7860 + }, + { + "epoch": 0.9170942164179104, + "grad_norm": 0.43580261924917685, + "learning_rate": 5.8403763926334146e-06, + "loss": 0.4712, + "step": 7865 + }, + { + "epoch": 0.9176772388059702, + "grad_norm": 0.4289025060075553, + "learning_rate": 5.82867115220959e-06, + "loss": 0.4857, + "step": 7870 + }, + { + "epoch": 0.9182602611940298, + "grad_norm": 0.4471506171646922, + "learning_rate": 5.81704647369114e-06, + "loss": 0.5177, + "step": 7875 + }, + { + "epoch": 0.9188432835820896, + "grad_norm": 0.4349776664841292, + "learning_rate": 5.805502400292137e-06, + "loss": 0.4925, + "step": 7880 + }, + { + "epoch": 0.9194263059701493, + "grad_norm": 0.4524645377436028, + "learning_rate": 5.794038974926995e-06, + "loss": 0.4807, + "step": 7885 + }, + { + "epoch": 0.9200093283582089, + "grad_norm": 0.45618145888853556, + "learning_rate": 5.782656240210343e-06, + "loss": 0.5078, + "step": 7890 + }, + { + "epoch": 0.9205923507462687, + "grad_norm": 0.4316155039975515, + "learning_rate": 5.771354238456828e-06, + "loss": 0.4795, + "step": 7895 + }, + { + "epoch": 0.9211753731343284, + "grad_norm": 0.46008183460748864, + "learning_rate": 5.760133011680985e-06, + "loss": 0.4788, + "step": 7900 + }, + { + "epoch": 0.921758395522388, + "grad_norm": 0.44423836815839907, + "learning_rate": 5.748992601597076e-06, + "loss": 0.4949, + "step": 7905 + }, + { + "epoch": 0.9223414179104478, + "grad_norm": 0.45890602021310334, + "learning_rate": 5.737933049618925e-06, + "loss": 0.4811, + "step": 7910 + }, + { + "epoch": 0.9229244402985075, + "grad_norm": 0.424972332822205, + "learning_rate": 5.726954396859773e-06, + "loss": 0.4854, + "step": 7915 + }, + { + "epoch": 0.9235074626865671, + "grad_norm": 0.42910016132492174, + "learning_rate": 5.7160566841321255e-06, + "loss": 0.4512, + "step": 7920 + }, + { + "epoch": 0.9240904850746269, + "grad_norm": 0.46334797979450737, + "learning_rate": 5.705239951947597e-06, + "loss": 0.4834, + "step": 7925 + }, + { + "epoch": 0.9246735074626866, + "grad_norm": 0.4603535009443525, + "learning_rate": 5.694504240516759e-06, + "loss": 0.4837, + "step": 7930 + }, + { + "epoch": 0.9252565298507462, + "grad_norm": 0.4662682403880509, + "learning_rate": 5.683849589748994e-06, + "loss": 0.4823, + "step": 7935 + }, + { + "epoch": 0.925839552238806, + "grad_norm": 0.44330683887009237, + "learning_rate": 5.673276039252347e-06, + "loss": 0.5095, + "step": 7940 + }, + { + "epoch": 0.9264225746268657, + "grad_norm": 0.43601918566724357, + "learning_rate": 5.662783628333379e-06, + "loss": 0.5165, + "step": 7945 + }, + { + "epoch": 0.9270055970149254, + "grad_norm": 0.41717748530531923, + "learning_rate": 5.652372395997015e-06, + "loss": 0.4911, + "step": 7950 + }, + { + "epoch": 0.9275886194029851, + "grad_norm": 2.401919501282761, + "learning_rate": 5.642042380946412e-06, + "loss": 0.4687, + "step": 7955 + }, + { + "epoch": 0.9281716417910447, + "grad_norm": 0.41624865614532475, + "learning_rate": 5.631793621582793e-06, + "loss": 0.4945, + "step": 7960 + }, + { + "epoch": 0.9287546641791045, + "grad_norm": 0.5068445026903259, + "learning_rate": 5.621626156005335e-06, + "loss": 0.4786, + "step": 7965 + }, + { + "epoch": 0.9293376865671642, + "grad_norm": 0.45630828056750805, + "learning_rate": 5.611540022010985e-06, + "loss": 0.4951, + "step": 7970 + }, + { + "epoch": 0.9299207089552238, + "grad_norm": 0.4956233980595794, + "learning_rate": 5.6015352570943755e-06, + "loss": 0.4843, + "step": 7975 + }, + { + "epoch": 0.9305037313432836, + "grad_norm": 0.4329791152614355, + "learning_rate": 5.591611898447632e-06, + "loss": 0.4634, + "step": 7980 + }, + { + "epoch": 0.9310867537313433, + "grad_norm": 0.4663064517760106, + "learning_rate": 5.581769982960261e-06, + "loss": 0.5264, + "step": 7985 + }, + { + "epoch": 0.9316697761194029, + "grad_norm": 0.4836205261061597, + "learning_rate": 5.572009547219013e-06, + "loss": 0.5156, + "step": 7990 + }, + { + "epoch": 0.9322527985074627, + "grad_norm": 0.4277290525866144, + "learning_rate": 5.5623306275077475e-06, + "loss": 0.481, + "step": 7995 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 0.43341519970296755, + "learning_rate": 5.552733259807276e-06, + "loss": 0.498, + "step": 8000 + }, + { + "epoch": 0.933418843283582, + "grad_norm": 0.43071532419905234, + "learning_rate": 5.543217479795256e-06, + "loss": 0.4958, + "step": 8005 + }, + { + "epoch": 0.9340018656716418, + "grad_norm": 0.46622528251802137, + "learning_rate": 5.533783322846053e-06, + "loss": 0.4814, + "step": 8010 + }, + { + "epoch": 0.9345848880597015, + "grad_norm": 0.5249920187972582, + "learning_rate": 5.524430824030594e-06, + "loss": 0.4907, + "step": 8015 + }, + { + "epoch": 0.9351679104477612, + "grad_norm": 0.45135919668531826, + "learning_rate": 5.515160018116247e-06, + "loss": 0.492, + "step": 8020 + }, + { + "epoch": 0.9357509328358209, + "grad_norm": 0.4928051727444886, + "learning_rate": 5.505970939566699e-06, + "loss": 0.5035, + "step": 8025 + }, + { + "epoch": 0.9363339552238806, + "grad_norm": 0.44235862805439663, + "learning_rate": 5.4968636225418125e-06, + "loss": 0.4783, + "step": 8030 + }, + { + "epoch": 0.9369169776119403, + "grad_norm": 0.41958300409071425, + "learning_rate": 5.487838100897508e-06, + "loss": 0.4789, + "step": 8035 + }, + { + "epoch": 0.9375, + "grad_norm": 0.4570459794031607, + "learning_rate": 5.478894408185641e-06, + "loss": 0.4661, + "step": 8040 + }, + { + "epoch": 0.9380830223880597, + "grad_norm": 0.4783472459507973, + "learning_rate": 5.470032577653869e-06, + "loss": 0.4758, + "step": 8045 + }, + { + "epoch": 0.9386660447761194, + "grad_norm": 0.4549507857184879, + "learning_rate": 5.4612526422455265e-06, + "loss": 0.4961, + "step": 8050 + }, + { + "epoch": 0.9392490671641791, + "grad_norm": 0.4650558485006105, + "learning_rate": 5.452554634599519e-06, + "loss": 0.4891, + "step": 8055 + }, + { + "epoch": 0.9398320895522388, + "grad_norm": 0.580533723917234, + "learning_rate": 5.443938587050186e-06, + "loss": 0.5172, + "step": 8060 + }, + { + "epoch": 0.9404151119402985, + "grad_norm": 0.42296595646423346, + "learning_rate": 5.435404531627176e-06, + "loss": 0.4853, + "step": 8065 + }, + { + "epoch": 0.9409981343283582, + "grad_norm": 0.4573887758625825, + "learning_rate": 5.426952500055348e-06, + "loss": 0.4614, + "step": 8070 + }, + { + "epoch": 0.941581156716418, + "grad_norm": 0.4745039842823071, + "learning_rate": 5.41858252375464e-06, + "loss": 0.5061, + "step": 8075 + }, + { + "epoch": 0.9421641791044776, + "grad_norm": 0.47805447519039157, + "learning_rate": 5.410294633839949e-06, + "loss": 0.4735, + "step": 8080 + }, + { + "epoch": 0.9427472014925373, + "grad_norm": 0.43867449428861033, + "learning_rate": 5.402088861121025e-06, + "loss": 0.4718, + "step": 8085 + }, + { + "epoch": 0.9433302238805971, + "grad_norm": 0.4467252451572471, + "learning_rate": 5.393965236102353e-06, + "loss": 0.4798, + "step": 8090 + }, + { + "epoch": 0.9439132462686567, + "grad_norm": 0.5133360727759508, + "learning_rate": 5.385923788983034e-06, + "loss": 0.4894, + "step": 8095 + }, + { + "epoch": 0.9444962686567164, + "grad_norm": 0.4747210863179483, + "learning_rate": 5.377964549656685e-06, + "loss": 0.5098, + "step": 8100 + }, + { + "epoch": 0.9450792910447762, + "grad_norm": 0.44347953398302503, + "learning_rate": 5.370087547711307e-06, + "loss": 0.5105, + "step": 8105 + }, + { + "epoch": 0.9456623134328358, + "grad_norm": 0.4387472897639345, + "learning_rate": 5.362292812429207e-06, + "loss": 0.4815, + "step": 8110 + }, + { + "epoch": 0.9462453358208955, + "grad_norm": 0.4722026626305582, + "learning_rate": 5.354580372786854e-06, + "loss": 0.4776, + "step": 8115 + }, + { + "epoch": 0.9468283582089553, + "grad_norm": 0.4566856777345421, + "learning_rate": 5.346950257454792e-06, + "loss": 0.5002, + "step": 8120 + }, + { + "epoch": 0.9474113805970149, + "grad_norm": 0.4601768931007766, + "learning_rate": 5.339402494797539e-06, + "loss": 0.4725, + "step": 8125 + }, + { + "epoch": 0.9479944029850746, + "grad_norm": 0.4422563608916944, + "learning_rate": 5.331937112873462e-06, + "loss": 0.4411, + "step": 8130 + }, + { + "epoch": 0.9485774253731343, + "grad_norm": 0.44855925098469346, + "learning_rate": 5.324554139434679e-06, + "loss": 0.4941, + "step": 8135 + }, + { + "epoch": 0.949160447761194, + "grad_norm": 0.4437725309777671, + "learning_rate": 5.317253601926967e-06, + "loss": 0.4836, + "step": 8140 + }, + { + "epoch": 0.9497434701492538, + "grad_norm": 0.4488409563199269, + "learning_rate": 5.310035527489651e-06, + "loss": 0.4958, + "step": 8145 + }, + { + "epoch": 0.9503264925373134, + "grad_norm": 0.46309779837786297, + "learning_rate": 5.3028999429555045e-06, + "loss": 0.493, + "step": 8150 + }, + { + "epoch": 0.9509095149253731, + "grad_norm": 0.45673001540720787, + "learning_rate": 5.295846874850646e-06, + "loss": 0.4945, + "step": 8155 + }, + { + "epoch": 0.9514925373134329, + "grad_norm": 0.4481344375463158, + "learning_rate": 5.288876349394448e-06, + "loss": 0.4957, + "step": 8160 + }, + { + "epoch": 0.9520755597014925, + "grad_norm": 0.45392704978409076, + "learning_rate": 5.281988392499431e-06, + "loss": 0.4721, + "step": 8165 + }, + { + "epoch": 0.9526585820895522, + "grad_norm": 0.45349195761765704, + "learning_rate": 5.275183029771177e-06, + "loss": 0.4741, + "step": 8170 + }, + { + "epoch": 0.953241604477612, + "grad_norm": 0.4411901617730967, + "learning_rate": 5.2684602865082255e-06, + "loss": 0.4966, + "step": 8175 + }, + { + "epoch": 0.9538246268656716, + "grad_norm": 0.4585915951637137, + "learning_rate": 5.261820187701984e-06, + "loss": 0.5453, + "step": 8180 + }, + { + "epoch": 0.9544076492537313, + "grad_norm": 0.4593205732104069, + "learning_rate": 5.2552627580366334e-06, + "loss": 0.5013, + "step": 8185 + }, + { + "epoch": 0.9549906716417911, + "grad_norm": 0.42592095115888917, + "learning_rate": 5.248788021889036e-06, + "loss": 0.4797, + "step": 8190 + }, + { + "epoch": 0.9555736940298507, + "grad_norm": 0.46962819902389563, + "learning_rate": 5.2423960033286505e-06, + "loss": 0.4763, + "step": 8195 + }, + { + "epoch": 0.9561567164179104, + "grad_norm": 0.40855238448922176, + "learning_rate": 5.236086726117433e-06, + "loss": 0.4743, + "step": 8200 + }, + { + "epoch": 0.9567397388059702, + "grad_norm": 0.43575271699567064, + "learning_rate": 5.229860213709753e-06, + "loss": 0.4773, + "step": 8205 + }, + { + "epoch": 0.9573227611940298, + "grad_norm": 0.4688749371758774, + "learning_rate": 5.223716489252311e-06, + "loss": 0.5166, + "step": 8210 + }, + { + "epoch": 0.9579057835820896, + "grad_norm": 0.7670542201127664, + "learning_rate": 5.217655575584045e-06, + "loss": 0.493, + "step": 8215 + }, + { + "epoch": 0.9584888059701493, + "grad_norm": 0.46421302891934385, + "learning_rate": 5.211677495236046e-06, + "loss": 0.5096, + "step": 8220 + }, + { + "epoch": 0.9590718283582089, + "grad_norm": 0.4771862479599736, + "learning_rate": 5.205782270431484e-06, + "loss": 0.4693, + "step": 8225 + }, + { + "epoch": 0.9596548507462687, + "grad_norm": 0.44226860049129957, + "learning_rate": 5.199969923085515e-06, + "loss": 0.4459, + "step": 8230 + }, + { + "epoch": 0.9602378731343284, + "grad_norm": 0.4294703525768145, + "learning_rate": 5.194240474805201e-06, + "loss": 0.481, + "step": 8235 + }, + { + "epoch": 0.960820895522388, + "grad_norm": 0.46086183427389743, + "learning_rate": 5.188593946889429e-06, + "loss": 0.4973, + "step": 8240 + }, + { + "epoch": 0.9614039179104478, + "grad_norm": 0.4280169626175964, + "learning_rate": 5.183030360328846e-06, + "loss": 0.4698, + "step": 8245 + }, + { + "epoch": 0.9619869402985075, + "grad_norm": 0.4753252130786709, + "learning_rate": 5.177549735805758e-06, + "loss": 0.5015, + "step": 8250 + }, + { + "epoch": 0.9625699626865671, + "grad_norm": 0.4620963248632002, + "learning_rate": 5.172152093694067e-06, + "loss": 0.4802, + "step": 8255 + }, + { + "epoch": 0.9631529850746269, + "grad_norm": 0.5332881434749811, + "learning_rate": 5.166837454059193e-06, + "loss": 0.5106, + "step": 8260 + }, + { + "epoch": 0.9637360074626866, + "grad_norm": 0.5025697757795641, + "learning_rate": 5.161605836658004e-06, + "loss": 0.4986, + "step": 8265 + }, + { + "epoch": 0.9643190298507462, + "grad_norm": 0.46525870817249937, + "learning_rate": 5.156457260938732e-06, + "loss": 0.4925, + "step": 8270 + }, + { + "epoch": 0.964902052238806, + "grad_norm": 0.4466358224458004, + "learning_rate": 5.151391746040905e-06, + "loss": 0.4896, + "step": 8275 + }, + { + "epoch": 0.9654850746268657, + "grad_norm": 0.4722232462608345, + "learning_rate": 5.146409310795282e-06, + "loss": 0.5116, + "step": 8280 + }, + { + "epoch": 0.9660680970149254, + "grad_norm": 0.4839383338646517, + "learning_rate": 5.14150997372378e-06, + "loss": 0.4704, + "step": 8285 + }, + { + "epoch": 0.9666511194029851, + "grad_norm": 0.4598082477182381, + "learning_rate": 5.1366937530393955e-06, + "loss": 0.4675, + "step": 8290 + }, + { + "epoch": 0.9672341417910447, + "grad_norm": 0.4571533268091363, + "learning_rate": 5.131960666646149e-06, + "loss": 0.4886, + "step": 8295 + }, + { + "epoch": 0.9678171641791045, + "grad_norm": 0.4826350517333677, + "learning_rate": 5.127310732139018e-06, + "loss": 0.5239, + "step": 8300 + }, + { + "epoch": 0.9684001865671642, + "grad_norm": 0.501035586719562, + "learning_rate": 5.122743966803858e-06, + "loss": 0.4845, + "step": 8305 + }, + { + "epoch": 0.9689832089552238, + "grad_norm": 0.45689428681207833, + "learning_rate": 5.118260387617359e-06, + "loss": 0.4802, + "step": 8310 + }, + { + "epoch": 0.9695662313432836, + "grad_norm": 0.41839841626319924, + "learning_rate": 5.113860011246964e-06, + "loss": 0.4759, + "step": 8315 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.4623224498637685, + "learning_rate": 5.109542854050814e-06, + "loss": 0.5191, + "step": 8320 + }, + { + "epoch": 0.9707322761194029, + "grad_norm": 0.4400774444581771, + "learning_rate": 5.105308932077693e-06, + "loss": 0.4751, + "step": 8325 + }, + { + "epoch": 0.9713152985074627, + "grad_norm": 0.5458461815433695, + "learning_rate": 5.101158261066959e-06, + "loss": 0.4934, + "step": 8330 + }, + { + "epoch": 0.9718983208955224, + "grad_norm": 0.45228504839890776, + "learning_rate": 5.097090856448492e-06, + "loss": 0.5007, + "step": 8335 + }, + { + "epoch": 0.972481343283582, + "grad_norm": 0.467798541042544, + "learning_rate": 5.0931067333426275e-06, + "loss": 0.5082, + "step": 8340 + }, + { + "epoch": 0.9730643656716418, + "grad_norm": 0.4266615826249801, + "learning_rate": 5.0892059065601145e-06, + "loss": 0.4974, + "step": 8345 + }, + { + "epoch": 0.9736473880597015, + "grad_norm": 0.4621554435159516, + "learning_rate": 5.0853883906020525e-06, + "loss": 0.4965, + "step": 8350 + }, + { + "epoch": 0.9742304104477612, + "grad_norm": 0.46073201489628657, + "learning_rate": 5.081654199659831e-06, + "loss": 0.5071, + "step": 8355 + }, + { + "epoch": 0.9748134328358209, + "grad_norm": 0.5462060706132782, + "learning_rate": 5.07800334761509e-06, + "loss": 0.4938, + "step": 8360 + }, + { + "epoch": 0.9753964552238806, + "grad_norm": 0.48250851908785264, + "learning_rate": 5.074435848039658e-06, + "loss": 0.5091, + "step": 8365 + }, + { + "epoch": 0.9759794776119403, + "grad_norm": 0.44847740844622386, + "learning_rate": 5.070951714195508e-06, + "loss": 0.4957, + "step": 8370 + }, + { + "epoch": 0.9765625, + "grad_norm": 0.4467178102848771, + "learning_rate": 5.067550959034707e-06, + "loss": 0.49, + "step": 8375 + }, + { + "epoch": 0.9771455223880597, + "grad_norm": 0.46636700776310513, + "learning_rate": 5.064233595199362e-06, + "loss": 0.4884, + "step": 8380 + }, + { + "epoch": 0.9777285447761194, + "grad_norm": 0.4615425374049883, + "learning_rate": 5.060999635021583e-06, + "loss": 0.4771, + "step": 8385 + }, + { + "epoch": 0.9783115671641791, + "grad_norm": 0.45286825679857706, + "learning_rate": 5.057849090523426e-06, + "loss": 0.4959, + "step": 8390 + }, + { + "epoch": 0.9788945895522388, + "grad_norm": 0.4569055516084358, + "learning_rate": 5.054781973416858e-06, + "loss": 0.4955, + "step": 8395 + }, + { + "epoch": 0.9794776119402985, + "grad_norm": 0.4455395860822749, + "learning_rate": 5.051798295103711e-06, + "loss": 0.4752, + "step": 8400 + }, + { + "epoch": 0.9800606343283582, + "grad_norm": 0.4183019666197293, + "learning_rate": 5.048898066675631e-06, + "loss": 0.4552, + "step": 8405 + }, + { + "epoch": 0.980643656716418, + "grad_norm": 0.43323200908430703, + "learning_rate": 5.046081298914053e-06, + "loss": 0.5092, + "step": 8410 + }, + { + "epoch": 0.9812266791044776, + "grad_norm": 0.46606214824527303, + "learning_rate": 5.043348002290145e-06, + "loss": 0.4639, + "step": 8415 + }, + { + "epoch": 0.9818097014925373, + "grad_norm": 0.4620950107319596, + "learning_rate": 5.0406981869647805e-06, + "loss": 0.5072, + "step": 8420 + }, + { + "epoch": 0.9823927238805971, + "grad_norm": 0.44313034547356717, + "learning_rate": 5.038131862788491e-06, + "loss": 0.4765, + "step": 8425 + }, + { + "epoch": 0.9829757462686567, + "grad_norm": 0.44702546793927656, + "learning_rate": 5.035649039301438e-06, + "loss": 0.4612, + "step": 8430 + }, + { + "epoch": 0.9835587686567164, + "grad_norm": 0.45947101038805505, + "learning_rate": 5.033249725733377e-06, + "loss": 0.4967, + "step": 8435 + }, + { + "epoch": 0.9841417910447762, + "grad_norm": 0.42282849418109447, + "learning_rate": 5.0309339310036125e-06, + "loss": 0.507, + "step": 8440 + }, + { + "epoch": 0.9847248134328358, + "grad_norm": 0.44692770292599415, + "learning_rate": 5.02870166372098e-06, + "loss": 0.4808, + "step": 8445 + }, + { + "epoch": 0.9853078358208955, + "grad_norm": 0.5582818670824954, + "learning_rate": 5.0265529321838004e-06, + "loss": 0.5405, + "step": 8450 + }, + { + "epoch": 0.9858908582089553, + "grad_norm": 0.4665732954055579, + "learning_rate": 5.02448774437986e-06, + "loss": 0.5157, + "step": 8455 + }, + { + "epoch": 0.9864738805970149, + "grad_norm": 0.5202669052061701, + "learning_rate": 5.022506107986374e-06, + "loss": 0.5146, + "step": 8460 + }, + { + "epoch": 0.9870569029850746, + "grad_norm": 0.4563878919577009, + "learning_rate": 5.020608030369962e-06, + "loss": 0.4614, + "step": 8465 + }, + { + "epoch": 0.9876399253731343, + "grad_norm": 0.45761657701588687, + "learning_rate": 5.018793518586616e-06, + "loss": 0.5007, + "step": 8470 + }, + { + "epoch": 0.988222947761194, + "grad_norm": 0.46136936074859125, + "learning_rate": 5.017062579381676e-06, + "loss": 0.5068, + "step": 8475 + }, + { + "epoch": 0.9888059701492538, + "grad_norm": 0.48777910519871126, + "learning_rate": 5.015415219189812e-06, + "loss": 0.4889, + "step": 8480 + }, + { + "epoch": 0.9893889925373134, + "grad_norm": 0.47084471147391416, + "learning_rate": 5.013851444134987e-06, + "loss": 0.5022, + "step": 8485 + }, + { + "epoch": 0.9899720149253731, + "grad_norm": 0.432844624719787, + "learning_rate": 5.012371260030445e-06, + "loss": 0.491, + "step": 8490 + }, + { + "epoch": 0.9905550373134329, + "grad_norm": 0.4649632477223049, + "learning_rate": 5.010974672378682e-06, + "loss": 0.4741, + "step": 8495 + }, + { + "epoch": 0.9911380597014925, + "grad_norm": 0.4764372199483869, + "learning_rate": 5.009661686371434e-06, + "loss": 0.4772, + "step": 8500 + }, + { + "epoch": 0.9917210820895522, + "grad_norm": 0.4868588188840104, + "learning_rate": 5.008432306889652e-06, + "loss": 0.5214, + "step": 8505 + }, + { + "epoch": 0.992304104477612, + "grad_norm": 0.438360158275501, + "learning_rate": 5.0072865385034785e-06, + "loss": 0.4905, + "step": 8510 + }, + { + "epoch": 0.9928871268656716, + "grad_norm": 0.48340344564902316, + "learning_rate": 5.006224385472242e-06, + "loss": 0.4927, + "step": 8515 + }, + { + "epoch": 0.9934701492537313, + "grad_norm": 0.45531955111325645, + "learning_rate": 5.0052458517444364e-06, + "loss": 0.4888, + "step": 8520 + }, + { + "epoch": 0.9940531716417911, + "grad_norm": 0.4362698593831642, + "learning_rate": 5.004350940957703e-06, + "loss": 0.4818, + "step": 8525 + }, + { + "epoch": 0.9946361940298507, + "grad_norm": 0.46810128935885154, + "learning_rate": 5.0035396564388184e-06, + "loss": 0.5101, + "step": 8530 + }, + { + "epoch": 0.9952192164179104, + "grad_norm": 0.8386528644620809, + "learning_rate": 5.00281200120369e-06, + "loss": 0.4791, + "step": 8535 + }, + { + "epoch": 0.9958022388059702, + "grad_norm": 0.4834359055420526, + "learning_rate": 5.00216797795733e-06, + "loss": 0.4913, + "step": 8540 + }, + { + "epoch": 0.9963852611940298, + "grad_norm": 0.4925810326491755, + "learning_rate": 5.001607589093861e-06, + "loss": 0.5076, + "step": 8545 + }, + { + "epoch": 0.9969682835820896, + "grad_norm": 0.44173971825287517, + "learning_rate": 5.001130836696491e-06, + "loss": 0.4809, + "step": 8550 + }, + { + "epoch": 0.9975513059701493, + "grad_norm": 0.48077397748529965, + "learning_rate": 5.000737722537526e-06, + "loss": 0.4974, + "step": 8555 + }, + { + "epoch": 0.9981343283582089, + "grad_norm": 0.6138381620178532, + "learning_rate": 5.00042824807834e-06, + "loss": 0.4925, + "step": 8560 + }, + { + "epoch": 0.9987173507462687, + "grad_norm": 0.43738249534157836, + "learning_rate": 5.000202414469386e-06, + "loss": 0.4573, + "step": 8565 + }, + { + "epoch": 0.9993003731343284, + "grad_norm": 0.4562639975448534, + "learning_rate": 5.0000602225501925e-06, + "loss": 0.4941, + "step": 8570 + }, + { + "epoch": 0.999883395522388, + "grad_norm": 0.4592908023435152, + "learning_rate": 5.0000016728493425e-06, + "loss": 0.4962, + "step": 8575 + }, + { + "epoch": 1.0, + "step": 8576, + "total_flos": 488578517827584.0, + "train_loss": 0.5423156990836472, + "train_runtime": 27783.4456, + "train_samples_per_second": 1.235, + "train_steps_per_second": 0.309 + } + ], + "logging_steps": 5, + "max_steps": 8576, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 488578517827584.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}