{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 42880, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005830223880597014, "grad_norm": 2.550195533598444, "learning_rate": 5.827505827505828e-07, "loss": 0.8852, "step": 5 }, { "epoch": 0.0011660447761194029, "grad_norm": 2.2188855019798273, "learning_rate": 1.1655011655011657e-06, "loss": 0.8311, "step": 10 }, { "epoch": 0.0017490671641791045, "grad_norm": 1.7967346771825998, "learning_rate": 1.7482517482517485e-06, "loss": 0.8113, "step": 15 }, { "epoch": 0.0023320895522388058, "grad_norm": 1.618619190869765, "learning_rate": 2.3310023310023313e-06, "loss": 0.8333, "step": 20 }, { "epoch": 0.0029151119402985076, "grad_norm": 1.137673984585037, "learning_rate": 2.9137529137529138e-06, "loss": 0.7745, "step": 25 }, { "epoch": 0.003498134328358209, "grad_norm": 1.1933766134733839, "learning_rate": 3.496503496503497e-06, "loss": 0.7904, "step": 30 }, { "epoch": 0.00408115671641791, "grad_norm": 1.0157304336095063, "learning_rate": 4.079254079254079e-06, "loss": 0.7624, "step": 35 }, { "epoch": 0.0046641791044776115, "grad_norm": 0.9429820002012593, "learning_rate": 4.662004662004663e-06, "loss": 0.7285, "step": 40 }, { "epoch": 0.005247201492537314, "grad_norm": 0.915430165066572, "learning_rate": 5.244755244755245e-06, "loss": 0.7469, "step": 45 }, { "epoch": 0.005830223880597015, "grad_norm": 0.8161946795665328, "learning_rate": 5.8275058275058275e-06, "loss": 0.707, "step": 50 }, { "epoch": 0.006413246268656717, "grad_norm": 0.8339668281290502, "learning_rate": 6.41025641025641e-06, "loss": 0.6905, "step": 55 }, { "epoch": 0.006996268656716418, "grad_norm": 0.8574171717816127, "learning_rate": 6.993006993006994e-06, "loss": 0.7698, "step": 60 }, { "epoch": 0.00757929104477612, "grad_norm": 0.8619956369841163, "learning_rate": 7.5757575757575764e-06, "loss": 0.7082, "step": 65 }, { "epoch": 0.00816231343283582, "grad_norm": 0.8045557883502801, "learning_rate": 8.158508158508159e-06, "loss": 0.6587, "step": 70 }, { "epoch": 0.008745335820895522, "grad_norm": 0.7946638318702278, "learning_rate": 8.741258741258741e-06, "loss": 0.6539, "step": 75 }, { "epoch": 0.009328358208955223, "grad_norm": 0.8503979707980048, "learning_rate": 9.324009324009325e-06, "loss": 0.6879, "step": 80 }, { "epoch": 0.009911380597014926, "grad_norm": 0.804711208532189, "learning_rate": 9.906759906759908e-06, "loss": 0.6743, "step": 85 }, { "epoch": 0.010494402985074628, "grad_norm": 0.7951085316698083, "learning_rate": 1.048951048951049e-05, "loss": 0.6586, "step": 90 }, { "epoch": 0.011077425373134329, "grad_norm": 0.793128900429868, "learning_rate": 1.1072261072261073e-05, "loss": 0.6842, "step": 95 }, { "epoch": 0.01166044776119403, "grad_norm": 0.8075245421009535, "learning_rate": 1.1655011655011655e-05, "loss": 0.6684, "step": 100 }, { "epoch": 0.012243470149253732, "grad_norm": 0.8581518728101114, "learning_rate": 1.2237762237762239e-05, "loss": 0.6207, "step": 105 }, { "epoch": 0.012826492537313433, "grad_norm": 0.9082387952747121, "learning_rate": 1.282051282051282e-05, "loss": 0.6678, "step": 110 }, { "epoch": 0.013409514925373135, "grad_norm": 0.9029958898223162, "learning_rate": 1.3403263403263406e-05, "loss": 0.6394, "step": 115 }, { "epoch": 0.013992537313432836, "grad_norm": 0.8377367698700525, "learning_rate": 1.3986013986013988e-05, "loss": 0.6707, "step": 120 }, { "epoch": 0.014575559701492538, "grad_norm": 0.7587966655835441, "learning_rate": 1.456876456876457e-05, "loss": 0.6258, "step": 125 }, { "epoch": 0.01515858208955224, "grad_norm": 0.8173306054331542, "learning_rate": 1.5151515151515153e-05, "loss": 0.6619, "step": 130 }, { "epoch": 0.01574160447761194, "grad_norm": 0.9376337343046882, "learning_rate": 1.5734265734265734e-05, "loss": 0.6673, "step": 135 }, { "epoch": 0.01632462686567164, "grad_norm": 0.9363652268784762, "learning_rate": 1.6317016317016318e-05, "loss": 0.668, "step": 140 }, { "epoch": 0.016907649253731342, "grad_norm": 0.8794426221093276, "learning_rate": 1.68997668997669e-05, "loss": 0.6569, "step": 145 }, { "epoch": 0.017490671641791043, "grad_norm": 0.9407491307884387, "learning_rate": 1.7482517482517483e-05, "loss": 0.6778, "step": 150 }, { "epoch": 0.018073694029850745, "grad_norm": 0.8720667910368793, "learning_rate": 1.8065268065268067e-05, "loss": 0.6291, "step": 155 }, { "epoch": 0.018656716417910446, "grad_norm": 0.8603987252532928, "learning_rate": 1.864801864801865e-05, "loss": 0.6372, "step": 160 }, { "epoch": 0.019239738805970148, "grad_norm": 0.8449689793110765, "learning_rate": 1.923076923076923e-05, "loss": 0.6586, "step": 165 }, { "epoch": 0.019822761194029852, "grad_norm": 1.0325515822144362, "learning_rate": 1.9813519813519816e-05, "loss": 0.6386, "step": 170 }, { "epoch": 0.020405783582089554, "grad_norm": 0.8513517345308381, "learning_rate": 2.0396270396270396e-05, "loss": 0.6311, "step": 175 }, { "epoch": 0.020988805970149255, "grad_norm": 0.8847582203013, "learning_rate": 2.097902097902098e-05, "loss": 0.6834, "step": 180 }, { "epoch": 0.021571828358208957, "grad_norm": 0.9006033563159529, "learning_rate": 2.156177156177156e-05, "loss": 0.6381, "step": 185 }, { "epoch": 0.022154850746268658, "grad_norm": 0.8147114381410464, "learning_rate": 2.2144522144522145e-05, "loss": 0.6432, "step": 190 }, { "epoch": 0.02273787313432836, "grad_norm": 0.9560611394473829, "learning_rate": 2.272727272727273e-05, "loss": 0.6261, "step": 195 }, { "epoch": 0.02332089552238806, "grad_norm": 0.7951106600912393, "learning_rate": 2.331002331002331e-05, "loss": 0.6307, "step": 200 }, { "epoch": 0.023903917910447763, "grad_norm": 0.855140755847949, "learning_rate": 2.3892773892773894e-05, "loss": 0.6213, "step": 205 }, { "epoch": 0.024486940298507464, "grad_norm": 1.0420866065102492, "learning_rate": 2.4475524475524478e-05, "loss": 0.6275, "step": 210 }, { "epoch": 0.025069962686567165, "grad_norm": 0.89019303742937, "learning_rate": 2.505827505827506e-05, "loss": 0.6275, "step": 215 }, { "epoch": 0.025652985074626867, "grad_norm": 0.9758673014251983, "learning_rate": 2.564102564102564e-05, "loss": 0.6631, "step": 220 }, { "epoch": 0.02623600746268657, "grad_norm": 0.8596860100346014, "learning_rate": 2.6223776223776224e-05, "loss": 0.6216, "step": 225 }, { "epoch": 0.02681902985074627, "grad_norm": 0.8908792386022747, "learning_rate": 2.680652680652681e-05, "loss": 0.6406, "step": 230 }, { "epoch": 0.02740205223880597, "grad_norm": 0.8721610635945679, "learning_rate": 2.738927738927739e-05, "loss": 0.6216, "step": 235 }, { "epoch": 0.027985074626865673, "grad_norm": 0.8671325117829104, "learning_rate": 2.7972027972027976e-05, "loss": 0.6713, "step": 240 }, { "epoch": 0.028568097014925374, "grad_norm": 0.8803714665797042, "learning_rate": 2.8554778554778557e-05, "loss": 0.629, "step": 245 }, { "epoch": 0.029151119402985076, "grad_norm": 0.8308844754653052, "learning_rate": 2.913752913752914e-05, "loss": 0.6373, "step": 250 }, { "epoch": 0.029734141791044777, "grad_norm": 0.829842731040146, "learning_rate": 2.972027972027972e-05, "loss": 0.6163, "step": 255 }, { "epoch": 0.03031716417910448, "grad_norm": 0.9311493282737088, "learning_rate": 3.0303030303030306e-05, "loss": 0.6448, "step": 260 }, { "epoch": 0.03090018656716418, "grad_norm": 0.8380478242539974, "learning_rate": 3.088578088578088e-05, "loss": 0.6123, "step": 265 }, { "epoch": 0.03148320895522388, "grad_norm": 0.8612483599648676, "learning_rate": 3.146853146853147e-05, "loss": 0.6206, "step": 270 }, { "epoch": 0.03206623134328358, "grad_norm": 0.7504191034824335, "learning_rate": 3.205128205128206e-05, "loss": 0.5956, "step": 275 }, { "epoch": 0.03264925373134328, "grad_norm": 0.8435455726913331, "learning_rate": 3.2634032634032635e-05, "loss": 0.6147, "step": 280 }, { "epoch": 0.033232276119402986, "grad_norm": 0.8228330420668449, "learning_rate": 3.321678321678322e-05, "loss": 0.6285, "step": 285 }, { "epoch": 0.033815298507462684, "grad_norm": 0.7466739529712338, "learning_rate": 3.37995337995338e-05, "loss": 0.62, "step": 290 }, { "epoch": 0.03439832089552239, "grad_norm": 0.8056760014927022, "learning_rate": 3.438228438228439e-05, "loss": 0.6216, "step": 295 }, { "epoch": 0.034981343283582086, "grad_norm": 0.8113797852368729, "learning_rate": 3.4965034965034965e-05, "loss": 0.6105, "step": 300 }, { "epoch": 0.03556436567164179, "grad_norm": 0.8890502750300378, "learning_rate": 3.554778554778555e-05, "loss": 0.6139, "step": 305 }, { "epoch": 0.03614738805970149, "grad_norm": 0.7687545593968855, "learning_rate": 3.613053613053613e-05, "loss": 0.614, "step": 310 }, { "epoch": 0.036730410447761194, "grad_norm": 0.8336903362213683, "learning_rate": 3.671328671328672e-05, "loss": 0.6278, "step": 315 }, { "epoch": 0.03731343283582089, "grad_norm": 0.869875611794232, "learning_rate": 3.72960372960373e-05, "loss": 0.6645, "step": 320 }, { "epoch": 0.0378964552238806, "grad_norm": 0.789926244606821, "learning_rate": 3.787878787878788e-05, "loss": 0.6027, "step": 325 }, { "epoch": 0.038479477611940295, "grad_norm": 0.8392459694319648, "learning_rate": 3.846153846153846e-05, "loss": 0.6567, "step": 330 }, { "epoch": 0.0390625, "grad_norm": 0.7918852769546142, "learning_rate": 3.904428904428905e-05, "loss": 0.6453, "step": 335 }, { "epoch": 0.039645522388059705, "grad_norm": 0.7819447011686889, "learning_rate": 3.962703962703963e-05, "loss": 0.6, "step": 340 }, { "epoch": 0.0402285447761194, "grad_norm": 0.7599671478703276, "learning_rate": 4.020979020979021e-05, "loss": 0.6097, "step": 345 }, { "epoch": 0.04081156716417911, "grad_norm": 0.8770061789317196, "learning_rate": 4.079254079254079e-05, "loss": 0.6199, "step": 350 }, { "epoch": 0.041394589552238806, "grad_norm": 0.8228590505028837, "learning_rate": 4.1375291375291377e-05, "loss": 0.63, "step": 355 }, { "epoch": 0.04197761194029851, "grad_norm": 0.9266679768403295, "learning_rate": 4.195804195804196e-05, "loss": 0.6611, "step": 360 }, { "epoch": 0.04256063432835821, "grad_norm": 0.776925533918814, "learning_rate": 4.254079254079254e-05, "loss": 0.6383, "step": 365 }, { "epoch": 0.043143656716417914, "grad_norm": 0.7652459941000134, "learning_rate": 4.312354312354312e-05, "loss": 0.5967, "step": 370 }, { "epoch": 0.04372667910447761, "grad_norm": 0.8438476797737248, "learning_rate": 4.370629370629371e-05, "loss": 0.6146, "step": 375 }, { "epoch": 0.044309701492537316, "grad_norm": 0.8158888620815322, "learning_rate": 4.428904428904429e-05, "loss": 0.6237, "step": 380 }, { "epoch": 0.044892723880597014, "grad_norm": 0.9144427364330033, "learning_rate": 4.4871794871794874e-05, "loss": 0.6098, "step": 385 }, { "epoch": 0.04547574626865672, "grad_norm": 0.7260277122998825, "learning_rate": 4.545454545454546e-05, "loss": 0.6105, "step": 390 }, { "epoch": 0.04605876865671642, "grad_norm": 0.774195742981333, "learning_rate": 4.603729603729604e-05, "loss": 0.6182, "step": 395 }, { "epoch": 0.04664179104477612, "grad_norm": 0.8931158013280471, "learning_rate": 4.662004662004662e-05, "loss": 0.6269, "step": 400 }, { "epoch": 0.04722481343283582, "grad_norm": 0.7219414153894123, "learning_rate": 4.7202797202797204e-05, "loss": 0.6159, "step": 405 }, { "epoch": 0.047807835820895525, "grad_norm": 0.7392180125184434, "learning_rate": 4.778554778554779e-05, "loss": 0.6597, "step": 410 }, { "epoch": 0.04839085820895522, "grad_norm": 0.7724562759848209, "learning_rate": 4.836829836829837e-05, "loss": 0.6588, "step": 415 }, { "epoch": 0.04897388059701493, "grad_norm": 0.7698999004589875, "learning_rate": 4.8951048951048956e-05, "loss": 0.6431, "step": 420 }, { "epoch": 0.049556902985074626, "grad_norm": 0.756825275252759, "learning_rate": 4.9533799533799534e-05, "loss": 0.6475, "step": 425 }, { "epoch": 0.05013992537313433, "grad_norm": 0.7182404605677212, "learning_rate": 4.9999998327150664e-05, "loss": 0.6519, "step": 430 }, { "epoch": 0.05072294776119403, "grad_norm": 0.7352734921536612, "learning_rate": 4.999993977744981e-05, "loss": 0.6367, "step": 435 }, { "epoch": 0.051305970149253734, "grad_norm": 0.8663133271054537, "learning_rate": 4.9999797585530614e-05, "loss": 0.6367, "step": 440 }, { "epoch": 0.05188899253731343, "grad_norm": 0.8007074508867144, "learning_rate": 4.9999571751921666e-05, "loss": 0.649, "step": 445 }, { "epoch": 0.05247201492537314, "grad_norm": 0.8808725057777399, "learning_rate": 4.999926227746247e-05, "loss": 0.6407, "step": 450 }, { "epoch": 0.053055037313432835, "grad_norm": 0.8207754194931053, "learning_rate": 4.999886916330351e-05, "loss": 0.6664, "step": 455 }, { "epoch": 0.05363805970149254, "grad_norm": 0.7200644017362087, "learning_rate": 4.9998392410906135e-05, "loss": 0.6264, "step": 460 }, { "epoch": 0.05422108208955224, "grad_norm": 0.7655394577960307, "learning_rate": 4.9997832022042676e-05, "loss": 0.6854, "step": 465 }, { "epoch": 0.05480410447761194, "grad_norm": 1.5845328151194846, "learning_rate": 4.9997187998796316e-05, "loss": 0.6092, "step": 470 }, { "epoch": 0.05538712686567164, "grad_norm": 0.763948541759687, "learning_rate": 4.9996460343561184e-05, "loss": 0.6601, "step": 475 }, { "epoch": 0.055970149253731345, "grad_norm": 0.7370601652088519, "learning_rate": 4.99956490590423e-05, "loss": 0.6062, "step": 480 }, { "epoch": 0.05655317164179104, "grad_norm": 0.6503724111090272, "learning_rate": 4.9994754148255566e-05, "loss": 0.597, "step": 485 }, { "epoch": 0.05713619402985075, "grad_norm": 0.642633522142368, "learning_rate": 4.999377561452776e-05, "loss": 0.633, "step": 490 }, { "epoch": 0.057719216417910446, "grad_norm": 0.6866312480811294, "learning_rate": 4.999271346149652e-05, "loss": 0.6421, "step": 495 }, { "epoch": 0.05830223880597015, "grad_norm": 0.6472205796291965, "learning_rate": 4.999156769311035e-05, "loss": 0.615, "step": 500 }, { "epoch": 0.05888526119402985, "grad_norm": 0.6977115669022897, "learning_rate": 4.999033831362857e-05, "loss": 0.6214, "step": 505 }, { "epoch": 0.059468283582089554, "grad_norm": 0.6986043336376292, "learning_rate": 4.998902532762132e-05, "loss": 0.6193, "step": 510 }, { "epoch": 0.06005130597014925, "grad_norm": 0.7475999210726344, "learning_rate": 4.9987628739969554e-05, "loss": 0.6224, "step": 515 }, { "epoch": 0.06063432835820896, "grad_norm": 0.6813745517845872, "learning_rate": 4.9986148555865016e-05, "loss": 0.6177, "step": 520 }, { "epoch": 0.061217350746268655, "grad_norm": 0.7015168996041015, "learning_rate": 4.9984584780810196e-05, "loss": 0.6768, "step": 525 }, { "epoch": 0.06180037313432836, "grad_norm": 0.7037997780013406, "learning_rate": 4.998293742061833e-05, "loss": 0.6742, "step": 530 }, { "epoch": 0.06238339552238806, "grad_norm": 0.7156012560309609, "learning_rate": 4.998120648141338e-05, "loss": 0.6304, "step": 535 }, { "epoch": 0.06296641791044776, "grad_norm": 0.711127994617591, "learning_rate": 4.997939196963004e-05, "loss": 0.6765, "step": 540 }, { "epoch": 0.06354944029850747, "grad_norm": 0.7367809530722819, "learning_rate": 4.997749389201363e-05, "loss": 0.6443, "step": 545 }, { "epoch": 0.06413246268656717, "grad_norm": 0.7023254028972605, "learning_rate": 4.997551225562014e-05, "loss": 0.614, "step": 550 }, { "epoch": 0.06471548507462686, "grad_norm": 0.7934049324407979, "learning_rate": 4.99734470678162e-05, "loss": 0.6804, "step": 555 }, { "epoch": 0.06529850746268656, "grad_norm": 0.7211790595438915, "learning_rate": 4.997129833627902e-05, "loss": 0.6022, "step": 560 }, { "epoch": 0.06588152985074627, "grad_norm": 0.7803510446535082, "learning_rate": 4.996906606899639e-05, "loss": 0.6324, "step": 565 }, { "epoch": 0.06646455223880597, "grad_norm": 0.7268141035030203, "learning_rate": 4.996675027426662e-05, "loss": 0.6244, "step": 570 }, { "epoch": 0.06704757462686567, "grad_norm": 1.937711458766374, "learning_rate": 4.9964350960698564e-05, "loss": 0.637, "step": 575 }, { "epoch": 0.06763059701492537, "grad_norm": 0.6807880062940403, "learning_rate": 4.996186813721152e-05, "loss": 0.6009, "step": 580 }, { "epoch": 0.06821361940298508, "grad_norm": 0.7031632269645188, "learning_rate": 4.995930181303522e-05, "loss": 0.6312, "step": 585 }, { "epoch": 0.06879664179104478, "grad_norm": 0.6771751142969727, "learning_rate": 4.995665199770986e-05, "loss": 0.6604, "step": 590 }, { "epoch": 0.06937966417910447, "grad_norm": 0.711794153251972, "learning_rate": 4.995391870108595e-05, "loss": 0.6527, "step": 595 }, { "epoch": 0.06996268656716417, "grad_norm": 0.7336499566418994, "learning_rate": 4.9951101933324374e-05, "loss": 0.6056, "step": 600 }, { "epoch": 0.07054570895522388, "grad_norm": 0.9123524114339076, "learning_rate": 4.994820170489629e-05, "loss": 0.6351, "step": 605 }, { "epoch": 0.07112873134328358, "grad_norm": 0.6937351959939653, "learning_rate": 4.9945218026583147e-05, "loss": 0.6415, "step": 610 }, { "epoch": 0.07171175373134328, "grad_norm": 0.6701402337273787, "learning_rate": 4.9942150909476576e-05, "loss": 0.616, "step": 615 }, { "epoch": 0.07229477611940298, "grad_norm": 0.6737823946741938, "learning_rate": 4.9939000364978424e-05, "loss": 0.5916, "step": 620 }, { "epoch": 0.07287779850746269, "grad_norm": 0.6443035255679863, "learning_rate": 4.993576640480064e-05, "loss": 0.593, "step": 625 }, { "epoch": 0.07346082089552239, "grad_norm": 0.7194715850493604, "learning_rate": 4.9932449040965296e-05, "loss": 0.6537, "step": 630 }, { "epoch": 0.07404384328358209, "grad_norm": 0.6638929998798874, "learning_rate": 4.992904828580449e-05, "loss": 0.651, "step": 635 }, { "epoch": 0.07462686567164178, "grad_norm": 0.6403792391822621, "learning_rate": 4.992556415196035e-05, "loss": 0.5995, "step": 640 }, { "epoch": 0.0752098880597015, "grad_norm": 0.7646937059991467, "learning_rate": 4.9921996652384915e-05, "loss": 0.6254, "step": 645 }, { "epoch": 0.0757929104477612, "grad_norm": 0.708046339189841, "learning_rate": 4.991834580034017e-05, "loss": 0.6285, "step": 650 }, { "epoch": 0.07637593283582089, "grad_norm": 0.6787038237176444, "learning_rate": 4.991461160939795e-05, "loss": 0.6089, "step": 655 }, { "epoch": 0.07695895522388059, "grad_norm": 0.637822387437937, "learning_rate": 4.991079409343989e-05, "loss": 0.6339, "step": 660 }, { "epoch": 0.0775419776119403, "grad_norm": 0.7266823799603278, "learning_rate": 4.990689326665738e-05, "loss": 0.601, "step": 665 }, { "epoch": 0.078125, "grad_norm": 0.6501949889104538, "learning_rate": 4.9902909143551516e-05, "loss": 0.6016, "step": 670 }, { "epoch": 0.0787080223880597, "grad_norm": 0.6820009767011616, "learning_rate": 4.989884173893305e-05, "loss": 0.6007, "step": 675 }, { "epoch": 0.07929104477611941, "grad_norm": 0.6877686038902945, "learning_rate": 4.989469106792231e-05, "loss": 0.6136, "step": 680 }, { "epoch": 0.07987406716417911, "grad_norm": 0.6085151630088098, "learning_rate": 4.9890457145949186e-05, "loss": 0.5881, "step": 685 }, { "epoch": 0.0804570895522388, "grad_norm": 0.6214607197858061, "learning_rate": 4.988613998875304e-05, "loss": 0.6053, "step": 690 }, { "epoch": 0.0810401119402985, "grad_norm": 0.6744013331308584, "learning_rate": 4.988173961238264e-05, "loss": 0.5802, "step": 695 }, { "epoch": 0.08162313432835822, "grad_norm": 0.6808032691033227, "learning_rate": 4.987725603319615e-05, "loss": 0.6094, "step": 700 }, { "epoch": 0.08220615671641791, "grad_norm": 0.6979985310436119, "learning_rate": 4.987268926786098e-05, "loss": 0.6323, "step": 705 }, { "epoch": 0.08278917910447761, "grad_norm": 0.6226469149492844, "learning_rate": 4.986803933335385e-05, "loss": 0.6459, "step": 710 }, { "epoch": 0.08337220149253731, "grad_norm": 0.676363656496179, "learning_rate": 4.9863306246960605e-05, "loss": 0.5761, "step": 715 }, { "epoch": 0.08395522388059702, "grad_norm": 0.6685016854342901, "learning_rate": 4.9858490026276226e-05, "loss": 0.5988, "step": 720 }, { "epoch": 0.08453824626865672, "grad_norm": 0.6396794014235626, "learning_rate": 4.9853590689204715e-05, "loss": 0.5854, "step": 725 }, { "epoch": 0.08512126865671642, "grad_norm": 0.720334288248887, "learning_rate": 4.9848608253959096e-05, "loss": 0.6207, "step": 730 }, { "epoch": 0.08570429104477612, "grad_norm": 0.6079861012766015, "learning_rate": 4.984354273906127e-05, "loss": 0.5953, "step": 735 }, { "epoch": 0.08628731343283583, "grad_norm": 0.6782045394701461, "learning_rate": 4.9838394163341993e-05, "loss": 0.63, "step": 740 }, { "epoch": 0.08687033582089553, "grad_norm": 0.6465534111571817, "learning_rate": 4.983316254594081e-05, "loss": 0.6006, "step": 745 }, { "epoch": 0.08745335820895522, "grad_norm": 0.6048787945996146, "learning_rate": 4.9827847906305934e-05, "loss": 0.5937, "step": 750 }, { "epoch": 0.08803638059701492, "grad_norm": 0.6704335077914664, "learning_rate": 4.982245026419424e-05, "loss": 0.6433, "step": 755 }, { "epoch": 0.08861940298507463, "grad_norm": 0.6420162328599116, "learning_rate": 4.981696963967116e-05, "loss": 0.5806, "step": 760 }, { "epoch": 0.08920242537313433, "grad_norm": 0.589278560052739, "learning_rate": 4.981140605311057e-05, "loss": 0.6049, "step": 765 }, { "epoch": 0.08978544776119403, "grad_norm": 0.6681567808799939, "learning_rate": 4.98057595251948e-05, "loss": 0.6369, "step": 770 }, { "epoch": 0.09036847014925373, "grad_norm": 0.5822554638380099, "learning_rate": 4.980003007691449e-05, "loss": 0.633, "step": 775 }, { "epoch": 0.09095149253731344, "grad_norm": 0.6014893132349064, "learning_rate": 4.979421772956852e-05, "loss": 0.637, "step": 780 }, { "epoch": 0.09153451492537314, "grad_norm": 0.6190152455325075, "learning_rate": 4.9788322504763954e-05, "loss": 0.626, "step": 785 }, { "epoch": 0.09211753731343283, "grad_norm": 0.6043190595480243, "learning_rate": 4.978234442441596e-05, "loss": 0.5958, "step": 790 }, { "epoch": 0.09270055970149253, "grad_norm": 0.7067979054927527, "learning_rate": 4.977628351074769e-05, "loss": 0.6391, "step": 795 }, { "epoch": 0.09328358208955224, "grad_norm": 1.0900391421932065, "learning_rate": 4.977013978629025e-05, "loss": 0.5869, "step": 800 }, { "epoch": 0.09386660447761194, "grad_norm": 0.7289575339056653, "learning_rate": 4.976391327388257e-05, "loss": 0.5817, "step": 805 }, { "epoch": 0.09444962686567164, "grad_norm": 0.6772024103975534, "learning_rate": 4.9757603996671354e-05, "loss": 0.644, "step": 810 }, { "epoch": 0.09503264925373134, "grad_norm": 0.6273836766329598, "learning_rate": 4.975121197811096e-05, "loss": 0.6343, "step": 815 }, { "epoch": 0.09561567164179105, "grad_norm": 0.6925933519325139, "learning_rate": 4.974473724196338e-05, "loss": 0.6527, "step": 820 }, { "epoch": 0.09619869402985075, "grad_norm": 0.7918262789810132, "learning_rate": 4.973817981229802e-05, "loss": 0.6043, "step": 825 }, { "epoch": 0.09678171641791045, "grad_norm": 0.6915704244723462, "learning_rate": 4.9731539713491776e-05, "loss": 0.6101, "step": 830 }, { "epoch": 0.09736473880597014, "grad_norm": 0.7026540645380082, "learning_rate": 4.972481697022883e-05, "loss": 0.6321, "step": 835 }, { "epoch": 0.09794776119402986, "grad_norm": 0.6870682595723567, "learning_rate": 4.971801160750057e-05, "loss": 0.6431, "step": 840 }, { "epoch": 0.09853078358208955, "grad_norm": 0.6762758634999102, "learning_rate": 4.971112365060555e-05, "loss": 0.6212, "step": 845 }, { "epoch": 0.09911380597014925, "grad_norm": 1.0041969348534898, "learning_rate": 4.970415312514936e-05, "loss": 0.5934, "step": 850 }, { "epoch": 0.09969682835820895, "grad_norm": 0.6273960276780831, "learning_rate": 4.969710005704449e-05, "loss": 0.6499, "step": 855 }, { "epoch": 0.10027985074626866, "grad_norm": 0.6488940069009905, "learning_rate": 4.9689964472510345e-05, "loss": 0.5949, "step": 860 }, { "epoch": 0.10086287313432836, "grad_norm": 0.763231378289182, "learning_rate": 4.968274639807304e-05, "loss": 0.6146, "step": 865 }, { "epoch": 0.10144589552238806, "grad_norm": 0.6658920295533095, "learning_rate": 4.967544586056532e-05, "loss": 0.6049, "step": 870 }, { "epoch": 0.10202891791044776, "grad_norm": 0.7074390644632157, "learning_rate": 4.966806288712654e-05, "loss": 0.6175, "step": 875 }, { "epoch": 0.10261194029850747, "grad_norm": 0.695343426907801, "learning_rate": 4.966059750520246e-05, "loss": 0.591, "step": 880 }, { "epoch": 0.10319496268656717, "grad_norm": 0.6418320757008029, "learning_rate": 4.965304974254521e-05, "loss": 0.6176, "step": 885 }, { "epoch": 0.10377798507462686, "grad_norm": 0.7727923536862525, "learning_rate": 4.9645419627213155e-05, "loss": 0.6134, "step": 890 }, { "epoch": 0.10436100746268656, "grad_norm": 0.6029656422873543, "learning_rate": 4.96377071875708e-05, "loss": 0.5654, "step": 895 }, { "epoch": 0.10494402985074627, "grad_norm": 0.6358118150755105, "learning_rate": 4.9629912452288696e-05, "loss": 0.6252, "step": 900 }, { "epoch": 0.10552705223880597, "grad_norm": 0.61344386800639, "learning_rate": 4.962203545034332e-05, "loss": 0.6059, "step": 905 }, { "epoch": 0.10611007462686567, "grad_norm": 0.6135409466759565, "learning_rate": 4.961407621101697e-05, "loss": 0.5614, "step": 910 }, { "epoch": 0.10669309701492537, "grad_norm": 1.0821214504665195, "learning_rate": 4.960603476389765e-05, "loss": 0.6162, "step": 915 }, { "epoch": 0.10727611940298508, "grad_norm": 0.5759584165021776, "learning_rate": 4.959791113887898e-05, "loss": 0.6055, "step": 920 }, { "epoch": 0.10785914179104478, "grad_norm": 0.5799139948464339, "learning_rate": 4.958970536616006e-05, "loss": 0.6067, "step": 925 }, { "epoch": 0.10844216417910447, "grad_norm": 0.6049902178680088, "learning_rate": 4.9581417476245365e-05, "loss": 0.5808, "step": 930 }, { "epoch": 0.10902518656716417, "grad_norm": 0.6216812883236675, "learning_rate": 4.957304749994465e-05, "loss": 0.5644, "step": 935 }, { "epoch": 0.10960820895522388, "grad_norm": 0.6264774945250813, "learning_rate": 4.956459546837283e-05, "loss": 0.5889, "step": 940 }, { "epoch": 0.11019123134328358, "grad_norm": 0.7584079121183279, "learning_rate": 4.955606141294982e-05, "loss": 0.6662, "step": 945 }, { "epoch": 0.11077425373134328, "grad_norm": 0.6021040722392922, "learning_rate": 4.954744536540048e-05, "loss": 0.6075, "step": 950 }, { "epoch": 0.11135727611940298, "grad_norm": 0.6516825177813871, "learning_rate": 4.953874735775448e-05, "loss": 0.6163, "step": 955 }, { "epoch": 0.11194029850746269, "grad_norm": 0.6483316850868462, "learning_rate": 4.9529967422346137e-05, "loss": 0.6388, "step": 960 }, { "epoch": 0.11252332089552239, "grad_norm": 1.0296359523919179, "learning_rate": 4.952110559181437e-05, "loss": 0.6187, "step": 965 }, { "epoch": 0.11310634328358209, "grad_norm": 0.5963874485076113, "learning_rate": 4.95121618991025e-05, "loss": 0.653, "step": 970 }, { "epoch": 0.11368936567164178, "grad_norm": 0.5827112927403142, "learning_rate": 4.950313637745819e-05, "loss": 0.6013, "step": 975 }, { "epoch": 0.1142723880597015, "grad_norm": 0.5978368012555301, "learning_rate": 4.9494029060433304e-05, "loss": 0.5854, "step": 980 }, { "epoch": 0.1148554104477612, "grad_norm": 0.6061921826097367, "learning_rate": 4.9484839981883755e-05, "loss": 0.6042, "step": 985 }, { "epoch": 0.11543843283582089, "grad_norm": 0.577591628346454, "learning_rate": 4.9475569175969414e-05, "loss": 0.6033, "step": 990 }, { "epoch": 0.11602145522388059, "grad_norm": 0.578876456470585, "learning_rate": 4.9466216677153945e-05, "loss": 0.5762, "step": 995 }, { "epoch": 0.1166044776119403, "grad_norm": 0.6111711530967577, "learning_rate": 4.945678252020475e-05, "loss": 0.6078, "step": 1000 }, { "epoch": 0.1171875, "grad_norm": 0.5389571001747652, "learning_rate": 4.9447266740192735e-05, "loss": 0.586, "step": 1005 }, { "epoch": 0.1177705223880597, "grad_norm": 0.5665361736437953, "learning_rate": 4.943766937249226e-05, "loss": 0.5896, "step": 1010 }, { "epoch": 0.11835354477611941, "grad_norm": 0.6004663875659557, "learning_rate": 4.942799045278099e-05, "loss": 0.5931, "step": 1015 }, { "epoch": 0.11893656716417911, "grad_norm": 0.5877689595746617, "learning_rate": 4.941823001703974e-05, "loss": 0.5985, "step": 1020 }, { "epoch": 0.1195195895522388, "grad_norm": 0.5935807095087586, "learning_rate": 4.940838810155237e-05, "loss": 0.5991, "step": 1025 }, { "epoch": 0.1201026119402985, "grad_norm": 0.6656506474951656, "learning_rate": 4.939846474290563e-05, "loss": 0.6148, "step": 1030 }, { "epoch": 0.12068563432835822, "grad_norm": 0.6400226054588848, "learning_rate": 4.9388459977989016e-05, "loss": 0.6035, "step": 1035 }, { "epoch": 0.12126865671641791, "grad_norm": 0.5682054111741724, "learning_rate": 4.937837384399467e-05, "loss": 0.5942, "step": 1040 }, { "epoch": 0.12185167910447761, "grad_norm": 0.5722092103161287, "learning_rate": 4.936820637841721e-05, "loss": 0.6297, "step": 1045 }, { "epoch": 0.12243470149253731, "grad_norm": 0.5744583767360888, "learning_rate": 4.935795761905359e-05, "loss": 0.5799, "step": 1050 }, { "epoch": 0.12301772388059702, "grad_norm": 0.5955258224599197, "learning_rate": 4.934762760400299e-05, "loss": 0.5901, "step": 1055 }, { "epoch": 0.12360074626865672, "grad_norm": 0.5941450778138045, "learning_rate": 4.933721637166662e-05, "loss": 0.5932, "step": 1060 }, { "epoch": 0.12418376865671642, "grad_norm": 0.6984953623677035, "learning_rate": 4.9326723960747655e-05, "loss": 0.5928, "step": 1065 }, { "epoch": 0.12476679104477612, "grad_norm": 0.5556738969991728, "learning_rate": 4.931615041025101e-05, "loss": 0.5449, "step": 1070 }, { "epoch": 0.12534981343283583, "grad_norm": 0.6202875202064847, "learning_rate": 4.9305495759483246e-05, "loss": 0.6245, "step": 1075 }, { "epoch": 0.1259328358208955, "grad_norm": 0.6360648582792375, "learning_rate": 4.929476004805241e-05, "loss": 0.622, "step": 1080 }, { "epoch": 0.12651585820895522, "grad_norm": 0.548961800240671, "learning_rate": 4.928394331586788e-05, "loss": 0.5878, "step": 1085 }, { "epoch": 0.12709888059701493, "grad_norm": 0.6534887664001585, "learning_rate": 4.927304560314023e-05, "loss": 0.5985, "step": 1090 }, { "epoch": 0.12768190298507462, "grad_norm": 0.6051258985285789, "learning_rate": 4.9262066950381074e-05, "loss": 0.5815, "step": 1095 }, { "epoch": 0.12826492537313433, "grad_norm": 0.5940919877769313, "learning_rate": 4.925100739840293e-05, "loss": 0.6056, "step": 1100 }, { "epoch": 0.12884794776119404, "grad_norm": 0.5795416953387396, "learning_rate": 4.923986698831902e-05, "loss": 0.593, "step": 1105 }, { "epoch": 0.12943097014925373, "grad_norm": 0.6060003300391443, "learning_rate": 4.922864576154318e-05, "loss": 0.6293, "step": 1110 }, { "epoch": 0.13001399253731344, "grad_norm": 0.5926697361708111, "learning_rate": 4.921734375978966e-05, "loss": 0.5863, "step": 1115 }, { "epoch": 0.13059701492537312, "grad_norm": 0.5666399578950052, "learning_rate": 4.9205961025073005e-05, "loss": 0.6093, "step": 1120 }, { "epoch": 0.13118003731343283, "grad_norm": 0.6290080779373148, "learning_rate": 4.919449759970787e-05, "loss": 0.6149, "step": 1125 }, { "epoch": 0.13176305970149255, "grad_norm": 0.6030059289618742, "learning_rate": 4.9182953526308866e-05, "loss": 0.5983, "step": 1130 }, { "epoch": 0.13234608208955223, "grad_norm": 0.6967548151886129, "learning_rate": 4.9171328847790416e-05, "loss": 0.5979, "step": 1135 }, { "epoch": 0.13292910447761194, "grad_norm": 0.6568770405720038, "learning_rate": 4.9159623607366587e-05, "loss": 0.5857, "step": 1140 }, { "epoch": 0.13351212686567165, "grad_norm": 0.5840444295366739, "learning_rate": 4.914783784855093e-05, "loss": 0.6438, "step": 1145 }, { "epoch": 0.13409514925373134, "grad_norm": 0.7004511399791347, "learning_rate": 4.913597161515633e-05, "loss": 0.6184, "step": 1150 }, { "epoch": 0.13467817164179105, "grad_norm": 0.550373143569304, "learning_rate": 4.91240249512948e-05, "loss": 0.567, "step": 1155 }, { "epoch": 0.13526119402985073, "grad_norm": 0.5313433759195957, "learning_rate": 4.9111997901377373e-05, "loss": 0.5855, "step": 1160 }, { "epoch": 0.13584421641791045, "grad_norm": 0.5655705667845206, "learning_rate": 4.9099890510113924e-05, "loss": 0.5661, "step": 1165 }, { "epoch": 0.13642723880597016, "grad_norm": 0.6195584641586247, "learning_rate": 4.908770282251296e-05, "loss": 0.5905, "step": 1170 }, { "epoch": 0.13701026119402984, "grad_norm": 0.5583698659457814, "learning_rate": 4.9075434883881504e-05, "loss": 0.6002, "step": 1175 }, { "epoch": 0.13759328358208955, "grad_norm": 0.6389070121403235, "learning_rate": 4.906308673982491e-05, "loss": 0.5607, "step": 1180 }, { "epoch": 0.13817630597014927, "grad_norm": 0.59774495236772, "learning_rate": 4.905065843624668e-05, "loss": 0.5678, "step": 1185 }, { "epoch": 0.13875932835820895, "grad_norm": 0.6173804758323792, "learning_rate": 4.903815001934832e-05, "loss": 0.6002, "step": 1190 }, { "epoch": 0.13934235074626866, "grad_norm": 0.5469869287414887, "learning_rate": 4.9025561535629125e-05, "loss": 0.5977, "step": 1195 }, { "epoch": 0.13992537313432835, "grad_norm": 0.5624438700425504, "learning_rate": 4.9012893031886075e-05, "loss": 0.582, "step": 1200 }, { "epoch": 0.14050839552238806, "grad_norm": 0.5985379982120503, "learning_rate": 4.9000144555213575e-05, "loss": 0.5825, "step": 1205 }, { "epoch": 0.14109141791044777, "grad_norm": 0.5800377004502044, "learning_rate": 4.898731615300336e-05, "loss": 0.5706, "step": 1210 }, { "epoch": 0.14167444029850745, "grad_norm": 0.5901666286652832, "learning_rate": 4.8974407872944263e-05, "loss": 0.5937, "step": 1215 }, { "epoch": 0.14225746268656717, "grad_norm": 0.644009126842579, "learning_rate": 4.8961419763022065e-05, "loss": 0.5612, "step": 1220 }, { "epoch": 0.14284048507462688, "grad_norm": 0.5922807627602008, "learning_rate": 4.894835187151931e-05, "loss": 0.6067, "step": 1225 }, { "epoch": 0.14342350746268656, "grad_norm": 0.5722199265482402, "learning_rate": 4.893520424701513e-05, "loss": 0.6082, "step": 1230 }, { "epoch": 0.14400652985074627, "grad_norm": 0.6126570623385963, "learning_rate": 4.892197693838504e-05, "loss": 0.5964, "step": 1235 }, { "epoch": 0.14458955223880596, "grad_norm": 0.5973655677327272, "learning_rate": 4.890866999480082e-05, "loss": 0.5918, "step": 1240 }, { "epoch": 0.14517257462686567, "grad_norm": 0.54393899856667, "learning_rate": 4.889528346573023e-05, "loss": 0.5827, "step": 1245 }, { "epoch": 0.14575559701492538, "grad_norm": 0.5508599028743483, "learning_rate": 4.888181740093693e-05, "loss": 0.5966, "step": 1250 }, { "epoch": 0.14633861940298507, "grad_norm": 0.5748810947596694, "learning_rate": 4.886827185048023e-05, "loss": 0.6069, "step": 1255 }, { "epoch": 0.14692164179104478, "grad_norm": 0.5843720987519714, "learning_rate": 4.8854646864714906e-05, "loss": 0.626, "step": 1260 }, { "epoch": 0.1475046641791045, "grad_norm": 0.5574624014831214, "learning_rate": 4.884094249429109e-05, "loss": 0.5513, "step": 1265 }, { "epoch": 0.14808768656716417, "grad_norm": 0.6341906909347658, "learning_rate": 4.882715879015396e-05, "loss": 0.597, "step": 1270 }, { "epoch": 0.14867070895522388, "grad_norm": 0.575813430423846, "learning_rate": 4.881329580354363e-05, "loss": 0.6081, "step": 1275 }, { "epoch": 0.14925373134328357, "grad_norm": 0.6445668774873804, "learning_rate": 4.8799353585994954e-05, "loss": 0.6087, "step": 1280 }, { "epoch": 0.14983675373134328, "grad_norm": 0.5859378252774211, "learning_rate": 4.8785332189337325e-05, "loss": 0.5602, "step": 1285 }, { "epoch": 0.150419776119403, "grad_norm": 0.6268377383651598, "learning_rate": 4.877123166569445e-05, "loss": 0.621, "step": 1290 }, { "epoch": 0.15100279850746268, "grad_norm": 0.539111443847871, "learning_rate": 4.8757052067484216e-05, "loss": 0.5778, "step": 1295 }, { "epoch": 0.1515858208955224, "grad_norm": 0.6230886477061637, "learning_rate": 4.874279344741846e-05, "loss": 0.6052, "step": 1300 }, { "epoch": 0.1521688432835821, "grad_norm": 0.5804714929817124, "learning_rate": 4.872845585850277e-05, "loss": 0.5704, "step": 1305 }, { "epoch": 0.15275186567164178, "grad_norm": 0.5644557466436395, "learning_rate": 4.87140393540363e-05, "loss": 0.6135, "step": 1310 }, { "epoch": 0.1533348880597015, "grad_norm": 0.5979238448583827, "learning_rate": 4.869954398761156e-05, "loss": 0.5891, "step": 1315 }, { "epoch": 0.15391791044776118, "grad_norm": 0.5684870674237956, "learning_rate": 4.868496981311424e-05, "loss": 0.6057, "step": 1320 }, { "epoch": 0.1545009328358209, "grad_norm": 0.6084433820062218, "learning_rate": 4.8670316884722984e-05, "loss": 0.6183, "step": 1325 }, { "epoch": 0.1550839552238806, "grad_norm": 0.5168917985718466, "learning_rate": 4.86555852569092e-05, "loss": 0.558, "step": 1330 }, { "epoch": 0.1556669776119403, "grad_norm": 0.6169882009143579, "learning_rate": 4.864077498443687e-05, "loss": 0.5894, "step": 1335 }, { "epoch": 0.15625, "grad_norm": 0.5573154647203309, "learning_rate": 4.8625886122362305e-05, "loss": 0.6047, "step": 1340 }, { "epoch": 0.1568330223880597, "grad_norm": 0.5430765223123376, "learning_rate": 4.861091872603399e-05, "loss": 0.5936, "step": 1345 }, { "epoch": 0.1574160447761194, "grad_norm": 0.5952512138917507, "learning_rate": 4.859587285109235e-05, "loss": 0.6323, "step": 1350 }, { "epoch": 0.1579990671641791, "grad_norm": 0.6257733920419736, "learning_rate": 4.8580748553469554e-05, "loss": 0.621, "step": 1355 }, { "epoch": 0.15858208955223882, "grad_norm": 0.6484544777362733, "learning_rate": 4.8565545889389295e-05, "loss": 0.6225, "step": 1360 }, { "epoch": 0.1591651119402985, "grad_norm": 0.6001757890053516, "learning_rate": 4.85502649153666e-05, "loss": 0.7485, "step": 1365 }, { "epoch": 0.15974813432835822, "grad_norm": 0.661835508508043, "learning_rate": 4.853490568820759e-05, "loss": 0.8953, "step": 1370 }, { "epoch": 0.1603311567164179, "grad_norm": 0.5424000700147391, "learning_rate": 4.851946826500932e-05, "loss": 0.6088, "step": 1375 }, { "epoch": 0.1609141791044776, "grad_norm": 0.5642506132746299, "learning_rate": 4.8503952703159485e-05, "loss": 0.5788, "step": 1380 }, { "epoch": 0.16149720149253732, "grad_norm": 1.5770888453223382, "learning_rate": 4.8488359060336314e-05, "loss": 0.6464, "step": 1385 }, { "epoch": 0.162080223880597, "grad_norm": 0.5798567582108715, "learning_rate": 4.847268739450825e-05, "loss": 0.5962, "step": 1390 }, { "epoch": 0.16266324626865672, "grad_norm": 0.6247134338348835, "learning_rate": 4.84569377639338e-05, "loss": 0.5762, "step": 1395 }, { "epoch": 0.16324626865671643, "grad_norm": 0.6403693981471119, "learning_rate": 4.84411102271613e-05, "loss": 0.6033, "step": 1400 }, { "epoch": 0.16382929104477612, "grad_norm": 0.6328461284086258, "learning_rate": 4.842520484302871e-05, "loss": 0.6124, "step": 1405 }, { "epoch": 0.16441231343283583, "grad_norm": 0.5690467644511553, "learning_rate": 4.840922167066335e-05, "loss": 0.6058, "step": 1410 }, { "epoch": 0.1649953358208955, "grad_norm": 0.5991271256192877, "learning_rate": 4.8393160769481755e-05, "loss": 0.5759, "step": 1415 }, { "epoch": 0.16557835820895522, "grad_norm": 0.5518688454925015, "learning_rate": 4.8377022199189374e-05, "loss": 0.5758, "step": 1420 }, { "epoch": 0.16616138059701493, "grad_norm": 0.8358279547212623, "learning_rate": 4.836080601978043e-05, "loss": 0.614, "step": 1425 }, { "epoch": 0.16674440298507462, "grad_norm": 0.5736501078455153, "learning_rate": 4.83445122915376e-05, "loss": 0.6268, "step": 1430 }, { "epoch": 0.16732742537313433, "grad_norm": 0.501139191324822, "learning_rate": 4.832814107503188e-05, "loss": 0.5522, "step": 1435 }, { "epoch": 0.16791044776119404, "grad_norm": 0.5414270989894118, "learning_rate": 4.8311692431122326e-05, "loss": 0.5985, "step": 1440 }, { "epoch": 0.16849347014925373, "grad_norm": 0.5503417116648404, "learning_rate": 4.82951664209558e-05, "loss": 0.5883, "step": 1445 }, { "epoch": 0.16907649253731344, "grad_norm": 0.5828254158920338, "learning_rate": 4.82785631059668e-05, "loss": 0.5973, "step": 1450 }, { "epoch": 0.16965951492537312, "grad_norm": 0.5750003990997431, "learning_rate": 4.826188254787717e-05, "loss": 0.5929, "step": 1455 }, { "epoch": 0.17024253731343283, "grad_norm": 0.5931754407423996, "learning_rate": 4.824512480869593e-05, "loss": 0.584, "step": 1460 }, { "epoch": 0.17082555970149255, "grad_norm": 0.5457054499115934, "learning_rate": 4.822828995071899e-05, "loss": 0.5711, "step": 1465 }, { "epoch": 0.17140858208955223, "grad_norm": 0.5229569450197525, "learning_rate": 4.821137803652896e-05, "loss": 0.5459, "step": 1470 }, { "epoch": 0.17199160447761194, "grad_norm": 0.6027439958371525, "learning_rate": 4.819438912899489e-05, "loss": 0.5814, "step": 1475 }, { "epoch": 0.17257462686567165, "grad_norm": 0.5702069062273913, "learning_rate": 4.8177323291272066e-05, "loss": 0.6299, "step": 1480 }, { "epoch": 0.17315764925373134, "grad_norm": 0.5780484151492952, "learning_rate": 4.8160180586801744e-05, "loss": 0.5777, "step": 1485 }, { "epoch": 0.17374067164179105, "grad_norm": 0.575917162342149, "learning_rate": 4.814296107931093e-05, "loss": 0.5547, "step": 1490 }, { "epoch": 0.17432369402985073, "grad_norm": 0.5192602462313004, "learning_rate": 4.812566483281216e-05, "loss": 0.5669, "step": 1495 }, { "epoch": 0.17490671641791045, "grad_norm": 0.5345650171129616, "learning_rate": 4.81082919116032e-05, "loss": 0.5917, "step": 1500 }, { "epoch": 0.17548973880597016, "grad_norm": 0.5475736070312467, "learning_rate": 4.809084238026689e-05, "loss": 0.602, "step": 1505 }, { "epoch": 0.17607276119402984, "grad_norm": 0.5786621619994619, "learning_rate": 4.8073316303670835e-05, "loss": 0.5861, "step": 1510 }, { "epoch": 0.17665578358208955, "grad_norm": 0.5448850237456587, "learning_rate": 4.8055713746967216e-05, "loss": 0.5857, "step": 1515 }, { "epoch": 0.17723880597014927, "grad_norm": 0.5779873112523268, "learning_rate": 4.803803477559252e-05, "loss": 0.5665, "step": 1520 }, { "epoch": 0.17782182835820895, "grad_norm": 0.5652601574196091, "learning_rate": 4.8020279455267274e-05, "loss": 0.5794, "step": 1525 }, { "epoch": 0.17840485074626866, "grad_norm": 0.5437626429311837, "learning_rate": 4.800244785199588e-05, "loss": 0.547, "step": 1530 }, { "epoch": 0.17898787313432835, "grad_norm": 0.5657663832577134, "learning_rate": 4.7984540032066266e-05, "loss": 0.5645, "step": 1535 }, { "epoch": 0.17957089552238806, "grad_norm": 0.5396084565585528, "learning_rate": 4.796655606204971e-05, "loss": 0.5917, "step": 1540 }, { "epoch": 0.18015391791044777, "grad_norm": 0.5659614973871961, "learning_rate": 4.794849600880059e-05, "loss": 0.5888, "step": 1545 }, { "epoch": 0.18073694029850745, "grad_norm": 0.610344463972923, "learning_rate": 4.793035993945609e-05, "loss": 0.5973, "step": 1550 }, { "epoch": 0.18131996268656717, "grad_norm": 0.5315364790581317, "learning_rate": 4.7912147921436e-05, "loss": 0.5793, "step": 1555 }, { "epoch": 0.18190298507462688, "grad_norm": 0.5215816759572497, "learning_rate": 4.789386002244244e-05, "loss": 0.5773, "step": 1560 }, { "epoch": 0.18248600746268656, "grad_norm": 0.48268753752530635, "learning_rate": 4.7875496310459607e-05, "loss": 0.5439, "step": 1565 }, { "epoch": 0.18306902985074627, "grad_norm": 0.5928345158273391, "learning_rate": 4.7857056853753536e-05, "loss": 0.5946, "step": 1570 }, { "epoch": 0.18365205223880596, "grad_norm": 0.5217322463974674, "learning_rate": 4.783854172087183e-05, "loss": 0.5633, "step": 1575 }, { "epoch": 0.18423507462686567, "grad_norm": 0.5361438235075061, "learning_rate": 4.781995098064343e-05, "loss": 0.5616, "step": 1580 }, { "epoch": 0.18481809701492538, "grad_norm": 0.5319409918375809, "learning_rate": 4.780128470217833e-05, "loss": 0.5959, "step": 1585 }, { "epoch": 0.18540111940298507, "grad_norm": 0.5656220383824416, "learning_rate": 4.778254295486732e-05, "loss": 0.5941, "step": 1590 }, { "epoch": 0.18598414179104478, "grad_norm": 0.664107972745298, "learning_rate": 4.7763725808381777e-05, "loss": 0.5932, "step": 1595 }, { "epoch": 0.1865671641791045, "grad_norm": 0.5594876020852717, "learning_rate": 4.7744833332673336e-05, "loss": 0.5754, "step": 1600 }, { "epoch": 0.18715018656716417, "grad_norm": 0.5748188546122887, "learning_rate": 4.7725865597973684e-05, "loss": 0.5929, "step": 1605 }, { "epoch": 0.18773320895522388, "grad_norm": 0.5608090588962066, "learning_rate": 4.770682267479427e-05, "loss": 0.6, "step": 1610 }, { "epoch": 0.18831623134328357, "grad_norm": 0.5614561936493092, "learning_rate": 4.7687704633926056e-05, "loss": 0.5885, "step": 1615 }, { "epoch": 0.18889925373134328, "grad_norm": 0.5020575112535327, "learning_rate": 4.766851154643924e-05, "loss": 0.5766, "step": 1620 }, { "epoch": 0.189482276119403, "grad_norm": 0.5350491170633971, "learning_rate": 4.7649243483683015e-05, "loss": 0.6016, "step": 1625 }, { "epoch": 0.19006529850746268, "grad_norm": 0.5723463466212365, "learning_rate": 4.762990051728529e-05, "loss": 0.5938, "step": 1630 }, { "epoch": 0.1906483208955224, "grad_norm": 0.528038155312228, "learning_rate": 4.7610482719152404e-05, "loss": 0.5919, "step": 1635 }, { "epoch": 0.1912313432835821, "grad_norm": 0.5195292947087375, "learning_rate": 4.7590990161468906e-05, "loss": 0.587, "step": 1640 }, { "epoch": 0.19181436567164178, "grad_norm": 0.5309411178378524, "learning_rate": 4.757142291669724e-05, "loss": 0.5584, "step": 1645 }, { "epoch": 0.1923973880597015, "grad_norm": 0.5235366705843203, "learning_rate": 4.755178105757751e-05, "loss": 0.5332, "step": 1650 }, { "epoch": 0.19298041044776118, "grad_norm": 0.5662388478996405, "learning_rate": 4.753206465712717e-05, "loss": 0.6003, "step": 1655 }, { "epoch": 0.1935634328358209, "grad_norm": 0.541515776380167, "learning_rate": 4.751227378864081e-05, "loss": 0.6167, "step": 1660 }, { "epoch": 0.1941464552238806, "grad_norm": 0.5451263614433529, "learning_rate": 4.749240852568981e-05, "loss": 0.5795, "step": 1665 }, { "epoch": 0.1947294776119403, "grad_norm": 0.5539737195153389, "learning_rate": 4.747246894212216e-05, "loss": 0.6156, "step": 1670 }, { "epoch": 0.1953125, "grad_norm": 0.4838135257147904, "learning_rate": 4.7452455112062076e-05, "loss": 0.586, "step": 1675 }, { "epoch": 0.1958955223880597, "grad_norm": 0.6206912586835625, "learning_rate": 4.743236710990982e-05, "loss": 0.5835, "step": 1680 }, { "epoch": 0.1964785447761194, "grad_norm": 0.5575344781989854, "learning_rate": 4.7412205010341385e-05, "loss": 0.5615, "step": 1685 }, { "epoch": 0.1970615671641791, "grad_norm": 0.5465760234872273, "learning_rate": 4.739196888830818e-05, "loss": 0.5614, "step": 1690 }, { "epoch": 0.19764458955223882, "grad_norm": 0.6116107229280592, "learning_rate": 4.737165881903683e-05, "loss": 0.5777, "step": 1695 }, { "epoch": 0.1982276119402985, "grad_norm": 0.526352835776055, "learning_rate": 4.735127487802882e-05, "loss": 0.5499, "step": 1700 }, { "epoch": 0.19881063432835822, "grad_norm": 0.5790768202270913, "learning_rate": 4.7330817141060284e-05, "loss": 0.6062, "step": 1705 }, { "epoch": 0.1993936567164179, "grad_norm": 0.5786125838532169, "learning_rate": 4.731028568418167e-05, "loss": 0.5853, "step": 1710 }, { "epoch": 0.1999766791044776, "grad_norm": 0.5220981929969758, "learning_rate": 4.728968058371746e-05, "loss": 0.5917, "step": 1715 }, { "epoch": 0.20055970149253732, "grad_norm": 0.5962771524705726, "learning_rate": 4.726900191626592e-05, "loss": 0.615, "step": 1720 }, { "epoch": 0.201142723880597, "grad_norm": 0.614179304629615, "learning_rate": 4.724824975869881e-05, "loss": 0.6088, "step": 1725 }, { "epoch": 0.20172574626865672, "grad_norm": 0.5339809717905718, "learning_rate": 4.722742418816106e-05, "loss": 0.5856, "step": 1730 }, { "epoch": 0.20230876865671643, "grad_norm": 0.5775559838523747, "learning_rate": 4.7206525282070514e-05, "loss": 0.5525, "step": 1735 }, { "epoch": 0.20289179104477612, "grad_norm": 0.5787127160424809, "learning_rate": 4.718555311811764e-05, "loss": 0.5889, "step": 1740 }, { "epoch": 0.20347481343283583, "grad_norm": 0.5507748006986873, "learning_rate": 4.716450777426525e-05, "loss": 0.5811, "step": 1745 }, { "epoch": 0.2040578358208955, "grad_norm": 0.5386322774377927, "learning_rate": 4.7143389328748174e-05, "loss": 0.5681, "step": 1750 }, { "epoch": 0.20464085820895522, "grad_norm": 0.5253847435139612, "learning_rate": 4.712219786007302e-05, "loss": 0.5352, "step": 1755 }, { "epoch": 0.20522388059701493, "grad_norm": 0.5229542272956548, "learning_rate": 4.710093344701782e-05, "loss": 0.5878, "step": 1760 }, { "epoch": 0.20580690298507462, "grad_norm": 0.5347624824538455, "learning_rate": 4.707959616863181e-05, "loss": 0.5622, "step": 1765 }, { "epoch": 0.20638992537313433, "grad_norm": 0.5561607185138324, "learning_rate": 4.7058186104235086e-05, "loss": 0.5797, "step": 1770 }, { "epoch": 0.20697294776119404, "grad_norm": 0.4892165988190017, "learning_rate": 4.70367033334183e-05, "loss": 0.5403, "step": 1775 }, { "epoch": 0.20755597014925373, "grad_norm": 0.5510594553982379, "learning_rate": 4.701514793604242e-05, "loss": 0.5559, "step": 1780 }, { "epoch": 0.20813899253731344, "grad_norm": 0.5473696320978395, "learning_rate": 4.699351999223838e-05, "loss": 0.5753, "step": 1785 }, { "epoch": 0.20872201492537312, "grad_norm": 0.4873353162223706, "learning_rate": 4.697181958240679e-05, "loss": 0.5492, "step": 1790 }, { "epoch": 0.20930503731343283, "grad_norm": 0.5064007191967134, "learning_rate": 4.695004678721768e-05, "loss": 0.5858, "step": 1795 }, { "epoch": 0.20988805970149255, "grad_norm": 0.5667430033926578, "learning_rate": 4.692820168761014e-05, "loss": 0.6062, "step": 1800 }, { "epoch": 0.21047108208955223, "grad_norm": 0.5212510033253821, "learning_rate": 4.690628436479206e-05, "loss": 0.5598, "step": 1805 }, { "epoch": 0.21105410447761194, "grad_norm": 0.5588622002334643, "learning_rate": 4.688429490023982e-05, "loss": 0.5763, "step": 1810 }, { "epoch": 0.21163712686567165, "grad_norm": 0.6512372849264118, "learning_rate": 4.6862233375697964e-05, "loss": 0.5808, "step": 1815 }, { "epoch": 0.21222014925373134, "grad_norm": 0.600056817158455, "learning_rate": 4.684009987317894e-05, "loss": 0.5929, "step": 1820 }, { "epoch": 0.21280317164179105, "grad_norm": 0.5525047925178632, "learning_rate": 4.6817894474962756e-05, "loss": 0.5598, "step": 1825 }, { "epoch": 0.21338619402985073, "grad_norm": 0.49741821795163615, "learning_rate": 4.679561726359668e-05, "loss": 0.5689, "step": 1830 }, { "epoch": 0.21396921641791045, "grad_norm": 0.5369544968462594, "learning_rate": 4.677326832189496e-05, "loss": 0.5846, "step": 1835 }, { "epoch": 0.21455223880597016, "grad_norm": 0.578124984252202, "learning_rate": 4.675084773293848e-05, "loss": 0.6068, "step": 1840 }, { "epoch": 0.21513526119402984, "grad_norm": 0.5489389574421355, "learning_rate": 4.6728355580074476e-05, "loss": 0.5799, "step": 1845 }, { "epoch": 0.21571828358208955, "grad_norm": 0.5492579167575462, "learning_rate": 4.6705791946916236e-05, "loss": 0.5969, "step": 1850 }, { "epoch": 0.21630130597014927, "grad_norm": 0.4976854096096666, "learning_rate": 4.6683156917342726e-05, "loss": 0.6034, "step": 1855 }, { "epoch": 0.21688432835820895, "grad_norm": 0.600940601658767, "learning_rate": 4.666045057549838e-05, "loss": 0.5946, "step": 1860 }, { "epoch": 0.21746735074626866, "grad_norm": 0.5506775932621145, "learning_rate": 4.663767300579268e-05, "loss": 0.5847, "step": 1865 }, { "epoch": 0.21805037313432835, "grad_norm": 0.5271315191050433, "learning_rate": 4.661482429289994e-05, "loss": 0.5662, "step": 1870 }, { "epoch": 0.21863339552238806, "grad_norm": 0.5806047125500812, "learning_rate": 4.659190452175891e-05, "loss": 0.5717, "step": 1875 }, { "epoch": 0.21921641791044777, "grad_norm": 0.5548551191672546, "learning_rate": 4.65689137775725e-05, "loss": 0.5918, "step": 1880 }, { "epoch": 0.21979944029850745, "grad_norm": 0.5224077396834346, "learning_rate": 4.654585214580749e-05, "loss": 0.5764, "step": 1885 }, { "epoch": 0.22038246268656717, "grad_norm": 0.595212892553765, "learning_rate": 4.652271971219412e-05, "loss": 0.5718, "step": 1890 }, { "epoch": 0.22096548507462688, "grad_norm": 0.5205351842902096, "learning_rate": 4.6499516562725906e-05, "loss": 0.5843, "step": 1895 }, { "epoch": 0.22154850746268656, "grad_norm": 0.5193771435539847, "learning_rate": 4.647624278365917e-05, "loss": 0.6024, "step": 1900 }, { "epoch": 0.22213152985074627, "grad_norm": 0.5762529929483823, "learning_rate": 4.6452898461512866e-05, "loss": 0.5841, "step": 1905 }, { "epoch": 0.22271455223880596, "grad_norm": 0.5613099807801801, "learning_rate": 4.642948368306814e-05, "loss": 0.5909, "step": 1910 }, { "epoch": 0.22329757462686567, "grad_norm": 0.5857098538355904, "learning_rate": 4.640599853536806e-05, "loss": 0.5986, "step": 1915 }, { "epoch": 0.22388059701492538, "grad_norm": 0.5346551057187519, "learning_rate": 4.6382443105717324e-05, "loss": 0.5651, "step": 1920 }, { "epoch": 0.22446361940298507, "grad_norm": 0.6133229780421945, "learning_rate": 4.635881748168184e-05, "loss": 0.6051, "step": 1925 }, { "epoch": 0.22504664179104478, "grad_norm": 0.564856310131765, "learning_rate": 4.633512175108851e-05, "loss": 0.5645, "step": 1930 }, { "epoch": 0.2256296641791045, "grad_norm": 0.6086776045173963, "learning_rate": 4.631135600202485e-05, "loss": 0.611, "step": 1935 }, { "epoch": 0.22621268656716417, "grad_norm": 0.5053374707380298, "learning_rate": 4.628752032283862e-05, "loss": 0.6359, "step": 1940 }, { "epoch": 0.22679570895522388, "grad_norm": 0.5177309796242289, "learning_rate": 4.626361480213759e-05, "loss": 0.5161, "step": 1945 }, { "epoch": 0.22737873134328357, "grad_norm": 0.5655297929571091, "learning_rate": 4.623963952878914e-05, "loss": 0.5215, "step": 1950 }, { "epoch": 0.22796175373134328, "grad_norm": 0.5007101086568317, "learning_rate": 4.621559459191996e-05, "loss": 0.5672, "step": 1955 }, { "epoch": 0.228544776119403, "grad_norm": 0.5604337094818254, "learning_rate": 4.619148008091569e-05, "loss": 0.5495, "step": 1960 }, { "epoch": 0.22912779850746268, "grad_norm": 0.5600231646159735, "learning_rate": 4.616729608542064e-05, "loss": 0.5914, "step": 1965 }, { "epoch": 0.2297108208955224, "grad_norm": 0.5261560439698072, "learning_rate": 4.61430426953374e-05, "loss": 0.5761, "step": 1970 }, { "epoch": 0.2302938432835821, "grad_norm": 0.5177160101117486, "learning_rate": 4.611872000082654e-05, "loss": 0.6105, "step": 1975 }, { "epoch": 0.23087686567164178, "grad_norm": 0.4991176586608243, "learning_rate": 4.609432809230627e-05, "loss": 0.5576, "step": 1980 }, { "epoch": 0.2314598880597015, "grad_norm": 0.49011496350766276, "learning_rate": 4.606986706045207e-05, "loss": 0.5217, "step": 1985 }, { "epoch": 0.23204291044776118, "grad_norm": 0.49863205095192953, "learning_rate": 4.604533699619643e-05, "loss": 0.5606, "step": 1990 }, { "epoch": 0.2326259328358209, "grad_norm": 0.5202522274733646, "learning_rate": 4.602073799072841e-05, "loss": 0.582, "step": 1995 }, { "epoch": 0.2332089552238806, "grad_norm": 0.5778738097193321, "learning_rate": 4.5996070135493426e-05, "loss": 0.5608, "step": 2000 }, { "epoch": 0.2337919776119403, "grad_norm": 0.5383730742466348, "learning_rate": 4.597133352219275e-05, "loss": 0.5833, "step": 2005 }, { "epoch": 0.234375, "grad_norm": 0.504768235114523, "learning_rate": 4.594652824278333e-05, "loss": 0.5428, "step": 2010 }, { "epoch": 0.2349580223880597, "grad_norm": 0.5424141377382832, "learning_rate": 4.592165438947734e-05, "loss": 0.5234, "step": 2015 }, { "epoch": 0.2355410447761194, "grad_norm": 0.560600079358499, "learning_rate": 4.589671205474189e-05, "loss": 0.5949, "step": 2020 }, { "epoch": 0.2361240671641791, "grad_norm": 0.5710842213852398, "learning_rate": 4.587170133129867e-05, "loss": 0.5675, "step": 2025 }, { "epoch": 0.23670708955223882, "grad_norm": 0.6954740937313674, "learning_rate": 4.5846622312123566e-05, "loss": 0.5593, "step": 2030 }, { "epoch": 0.2372901119402985, "grad_norm": 0.4936861843329207, "learning_rate": 4.582147509044639e-05, "loss": 0.544, "step": 2035 }, { "epoch": 0.23787313432835822, "grad_norm": 0.5990080756647657, "learning_rate": 4.579625975975047e-05, "loss": 0.6093, "step": 2040 }, { "epoch": 0.2384561567164179, "grad_norm": 0.46710498554704555, "learning_rate": 4.577097641377234e-05, "loss": 0.5421, "step": 2045 }, { "epoch": 0.2390391791044776, "grad_norm": 0.6220614730173019, "learning_rate": 4.574562514650137e-05, "loss": 0.5959, "step": 2050 }, { "epoch": 0.23962220149253732, "grad_norm": 0.5736687871170686, "learning_rate": 4.572020605217941e-05, "loss": 0.5705, "step": 2055 }, { "epoch": 0.240205223880597, "grad_norm": 0.5700891932767567, "learning_rate": 4.569471922530048e-05, "loss": 0.5812, "step": 2060 }, { "epoch": 0.24078824626865672, "grad_norm": 0.5076933159103915, "learning_rate": 4.566916476061036e-05, "loss": 0.565, "step": 2065 }, { "epoch": 0.24137126865671643, "grad_norm": 0.5485662358157128, "learning_rate": 4.56435427531063e-05, "loss": 0.576, "step": 2070 }, { "epoch": 0.24195429104477612, "grad_norm": 0.5699225728909032, "learning_rate": 4.5617853298036634e-05, "loss": 0.5984, "step": 2075 }, { "epoch": 0.24253731343283583, "grad_norm": 0.5970990880266768, "learning_rate": 4.559209649090039e-05, "loss": 0.5648, "step": 2080 }, { "epoch": 0.2431203358208955, "grad_norm": 0.5310811562750187, "learning_rate": 4.556627242744703e-05, "loss": 0.5616, "step": 2085 }, { "epoch": 0.24370335820895522, "grad_norm": 0.4892958829299151, "learning_rate": 4.5540381203675994e-05, "loss": 0.5867, "step": 2090 }, { "epoch": 0.24428638059701493, "grad_norm": 0.5227922779707119, "learning_rate": 4.55144229158364e-05, "loss": 0.5534, "step": 2095 }, { "epoch": 0.24486940298507462, "grad_norm": 0.5041942400636352, "learning_rate": 4.548839766042668e-05, "loss": 0.5371, "step": 2100 }, { "epoch": 0.24545242537313433, "grad_norm": 0.49660405961707166, "learning_rate": 4.5462305534194204e-05, "loss": 0.572, "step": 2105 }, { "epoch": 0.24603544776119404, "grad_norm": 0.5499042134726534, "learning_rate": 4.543614663413493e-05, "loss": 0.5611, "step": 2110 }, { "epoch": 0.24661847014925373, "grad_norm": 0.5496526828713953, "learning_rate": 4.5409921057493064e-05, "loss": 0.57, "step": 2115 }, { "epoch": 0.24720149253731344, "grad_norm": 0.5599675685862884, "learning_rate": 4.538362890176066e-05, "loss": 0.5618, "step": 2120 }, { "epoch": 0.24778451492537312, "grad_norm": 0.5579889614380124, "learning_rate": 4.535727026467727e-05, "loss": 0.5682, "step": 2125 }, { "epoch": 0.24836753731343283, "grad_norm": 0.5300253135611827, "learning_rate": 4.533084524422959e-05, "loss": 0.5828, "step": 2130 }, { "epoch": 0.24895055970149255, "grad_norm": 0.498730689430679, "learning_rate": 4.530435393865111e-05, "loss": 0.5535, "step": 2135 }, { "epoch": 0.24953358208955223, "grad_norm": 0.5177126292248756, "learning_rate": 4.527779644642172e-05, "loss": 0.5661, "step": 2140 }, { "epoch": 0.2501166044776119, "grad_norm": 0.5708740823647661, "learning_rate": 4.525117286626734e-05, "loss": 0.5764, "step": 2145 }, { "epoch": 0.25069962686567165, "grad_norm": 0.5326668265886716, "learning_rate": 4.522448329715959e-05, "loss": 0.562, "step": 2150 }, { "epoch": 0.25128264925373134, "grad_norm": 0.4878189090302149, "learning_rate": 4.51977278383154e-05, "loss": 0.5389, "step": 2155 }, { "epoch": 0.251865671641791, "grad_norm": 0.5459043435498905, "learning_rate": 4.517090658919662e-05, "loss": 0.5456, "step": 2160 }, { "epoch": 0.25244869402985076, "grad_norm": 0.5451507549234296, "learning_rate": 4.5144019649509694e-05, "loss": 0.5619, "step": 2165 }, { "epoch": 0.25303171641791045, "grad_norm": 0.48817546876328066, "learning_rate": 4.5117067119205256e-05, "loss": 0.5328, "step": 2170 }, { "epoch": 0.25361473880597013, "grad_norm": 0.5573481611505946, "learning_rate": 4.5090049098477756e-05, "loss": 0.6166, "step": 2175 }, { "epoch": 0.25419776119402987, "grad_norm": 0.46166230865153285, "learning_rate": 4.506296568776513e-05, "loss": 0.5603, "step": 2180 }, { "epoch": 0.25478078358208955, "grad_norm": 0.5255525351801149, "learning_rate": 4.503581698774838e-05, "loss": 0.5597, "step": 2185 }, { "epoch": 0.25536380597014924, "grad_norm": 0.5036713071359231, "learning_rate": 4.5008603099351235e-05, "loss": 0.5572, "step": 2190 }, { "epoch": 0.255946828358209, "grad_norm": 0.5629197395168302, "learning_rate": 4.498132412373972e-05, "loss": 0.5549, "step": 2195 }, { "epoch": 0.25652985074626866, "grad_norm": 0.5281653518122478, "learning_rate": 4.4953980162321845e-05, "loss": 0.5783, "step": 2200 }, { "epoch": 0.25711287313432835, "grad_norm": 0.52357818056316, "learning_rate": 4.492657131674722e-05, "loss": 0.5365, "step": 2205 }, { "epoch": 0.2576958955223881, "grad_norm": 0.5581473844002001, "learning_rate": 4.48990976889066e-05, "loss": 0.5314, "step": 2210 }, { "epoch": 0.25827891791044777, "grad_norm": 0.5463641738254288, "learning_rate": 4.487155938093163e-05, "loss": 0.5678, "step": 2215 }, { "epoch": 0.25886194029850745, "grad_norm": 0.580427123805623, "learning_rate": 4.484395649519435e-05, "loss": 0.5831, "step": 2220 }, { "epoch": 0.25944496268656714, "grad_norm": 0.5515157134010146, "learning_rate": 4.48162891343069e-05, "loss": 0.5916, "step": 2225 }, { "epoch": 0.2600279850746269, "grad_norm": 0.4614774802102389, "learning_rate": 4.478855740112107e-05, "loss": 0.5747, "step": 2230 }, { "epoch": 0.26061100746268656, "grad_norm": 0.5263544516058696, "learning_rate": 4.476076139872797e-05, "loss": 0.5873, "step": 2235 }, { "epoch": 0.26119402985074625, "grad_norm": 0.5279147571827948, "learning_rate": 4.473290123045764e-05, "loss": 0.5694, "step": 2240 }, { "epoch": 0.261777052238806, "grad_norm": 0.9742556257776407, "learning_rate": 4.470497699987861e-05, "loss": 0.5686, "step": 2245 }, { "epoch": 0.26236007462686567, "grad_norm": 0.5328912557304052, "learning_rate": 4.4676988810797596e-05, "loss": 0.5591, "step": 2250 }, { "epoch": 0.26294309701492535, "grad_norm": 0.5633803039126256, "learning_rate": 4.464893676725906e-05, "loss": 0.5728, "step": 2255 }, { "epoch": 0.2635261194029851, "grad_norm": 0.5134466833234131, "learning_rate": 4.4620820973544866e-05, "loss": 0.588, "step": 2260 }, { "epoch": 0.2641091417910448, "grad_norm": 0.5049188829803447, "learning_rate": 4.459264153417381e-05, "loss": 0.5973, "step": 2265 }, { "epoch": 0.26469216417910446, "grad_norm": 0.5404106159275316, "learning_rate": 4.4564398553901344e-05, "loss": 0.5788, "step": 2270 }, { "epoch": 0.2652751865671642, "grad_norm": 0.6037489755180414, "learning_rate": 4.4536092137719094e-05, "loss": 0.5935, "step": 2275 }, { "epoch": 0.2658582089552239, "grad_norm": 0.5161487013459114, "learning_rate": 4.450772239085452e-05, "loss": 0.5371, "step": 2280 }, { "epoch": 0.26644123134328357, "grad_norm": 0.5278496691940492, "learning_rate": 4.44792894187705e-05, "loss": 0.5234, "step": 2285 }, { "epoch": 0.2670242537313433, "grad_norm": 0.5092042449789793, "learning_rate": 4.445079332716497e-05, "loss": 0.5925, "step": 2290 }, { "epoch": 0.267607276119403, "grad_norm": 0.5223498221449892, "learning_rate": 4.4422234221970475e-05, "loss": 0.5629, "step": 2295 }, { "epoch": 0.2681902985074627, "grad_norm": 0.5456470351851613, "learning_rate": 4.439361220935385e-05, "loss": 0.5416, "step": 2300 }, { "epoch": 0.26877332089552236, "grad_norm": 0.5476526134142059, "learning_rate": 4.436492739571575e-05, "loss": 0.5499, "step": 2305 }, { "epoch": 0.2693563432835821, "grad_norm": 0.49505688936501147, "learning_rate": 4.433617988769031e-05, "loss": 0.5431, "step": 2310 }, { "epoch": 0.2699393656716418, "grad_norm": 0.5342065164923998, "learning_rate": 4.43073697921447e-05, "loss": 0.5693, "step": 2315 }, { "epoch": 0.27052238805970147, "grad_norm": 0.5145761250584094, "learning_rate": 4.4278497216178805e-05, "loss": 0.5795, "step": 2320 }, { "epoch": 0.2711054104477612, "grad_norm": 0.5273626768859015, "learning_rate": 4.4249562267124735e-05, "loss": 0.5549, "step": 2325 }, { "epoch": 0.2716884328358209, "grad_norm": 0.5079624559473638, "learning_rate": 4.422056505254648e-05, "loss": 0.5746, "step": 2330 }, { "epoch": 0.2722714552238806, "grad_norm": 0.505058134321932, "learning_rate": 4.4191505680239494e-05, "loss": 0.5565, "step": 2335 }, { "epoch": 0.2728544776119403, "grad_norm": 0.5668879894960839, "learning_rate": 4.416238425823031e-05, "loss": 0.5578, "step": 2340 }, { "epoch": 0.2734375, "grad_norm": 0.5141499926926323, "learning_rate": 4.413320089477612e-05, "loss": 0.5489, "step": 2345 }, { "epoch": 0.2740205223880597, "grad_norm": 0.5109670063026779, "learning_rate": 4.4103955698364394e-05, "loss": 0.5592, "step": 2350 }, { "epoch": 0.2746035447761194, "grad_norm": 0.5082096864806888, "learning_rate": 4.407464877771243e-05, "loss": 0.5213, "step": 2355 }, { "epoch": 0.2751865671641791, "grad_norm": 0.5083698199510284, "learning_rate": 4.4045280241767024e-05, "loss": 0.5493, "step": 2360 }, { "epoch": 0.2757695895522388, "grad_norm": 0.5053920381989545, "learning_rate": 4.401585019970397e-05, "loss": 0.5686, "step": 2365 }, { "epoch": 0.27635261194029853, "grad_norm": 0.4963501376761793, "learning_rate": 4.3986358760927774e-05, "loss": 0.5545, "step": 2370 }, { "epoch": 0.2769356343283582, "grad_norm": 0.5288831258581409, "learning_rate": 4.3956806035071123e-05, "loss": 0.5249, "step": 2375 }, { "epoch": 0.2775186567164179, "grad_norm": 0.62308836025263, "learning_rate": 4.392719213199457e-05, "loss": 0.6042, "step": 2380 }, { "epoch": 0.27810167910447764, "grad_norm": 0.4832296950151894, "learning_rate": 4.389751716178606e-05, "loss": 0.5633, "step": 2385 }, { "epoch": 0.2786847014925373, "grad_norm": 0.5237890773447569, "learning_rate": 4.386778123476059e-05, "loss": 0.5271, "step": 2390 }, { "epoch": 0.279267723880597, "grad_norm": 0.5595739424570599, "learning_rate": 4.383798446145973e-05, "loss": 0.6001, "step": 2395 }, { "epoch": 0.2798507462686567, "grad_norm": 0.5332931182325285, "learning_rate": 4.380812695265126e-05, "loss": 0.6024, "step": 2400 }, { "epoch": 0.28043376865671643, "grad_norm": 0.4918731480131771, "learning_rate": 4.3778208819328724e-05, "loss": 0.56, "step": 2405 }, { "epoch": 0.2810167910447761, "grad_norm": 0.4625310842353903, "learning_rate": 4.374823017271105e-05, "loss": 0.5161, "step": 2410 }, { "epoch": 0.2815998134328358, "grad_norm": 0.5070557648263391, "learning_rate": 4.371819112424212e-05, "loss": 0.518, "step": 2415 }, { "epoch": 0.28218283582089554, "grad_norm": 0.56213077270057, "learning_rate": 4.368809178559034e-05, "loss": 0.5645, "step": 2420 }, { "epoch": 0.2827658582089552, "grad_norm": 0.5027850751505164, "learning_rate": 4.365793226864825e-05, "loss": 0.5516, "step": 2425 }, { "epoch": 0.2833488805970149, "grad_norm": 0.496969778411097, "learning_rate": 4.3627712685532104e-05, "loss": 0.5661, "step": 2430 }, { "epoch": 0.28393190298507465, "grad_norm": 0.5203901533643535, "learning_rate": 4.3597433148581465e-05, "loss": 0.5564, "step": 2435 }, { "epoch": 0.28451492537313433, "grad_norm": 0.5761459191507305, "learning_rate": 4.3567093770358724e-05, "loss": 0.584, "step": 2440 }, { "epoch": 0.285097947761194, "grad_norm": 0.5440010534822834, "learning_rate": 4.353669466364877e-05, "loss": 0.5833, "step": 2445 }, { "epoch": 0.28568097014925375, "grad_norm": 0.5575628208489772, "learning_rate": 4.3506235941458516e-05, "loss": 0.5926, "step": 2450 }, { "epoch": 0.28626399253731344, "grad_norm": 0.500211988994469, "learning_rate": 4.347571771701648e-05, "loss": 0.5651, "step": 2455 }, { "epoch": 0.2868470149253731, "grad_norm": 0.46825863619458763, "learning_rate": 4.34451401037724e-05, "loss": 0.5461, "step": 2460 }, { "epoch": 0.28743003731343286, "grad_norm": 0.5521309621293441, "learning_rate": 4.3414503215396776e-05, "loss": 0.5659, "step": 2465 }, { "epoch": 0.28801305970149255, "grad_norm": 0.4609786323278, "learning_rate": 4.338380716578046e-05, "loss": 0.5487, "step": 2470 }, { "epoch": 0.28859608208955223, "grad_norm": 0.499738534899754, "learning_rate": 4.3353052069034214e-05, "loss": 0.5381, "step": 2475 }, { "epoch": 0.2891791044776119, "grad_norm": 0.5917418512742165, "learning_rate": 4.332223803948834e-05, "loss": 0.5434, "step": 2480 }, { "epoch": 0.28976212686567165, "grad_norm": 0.48180474136483686, "learning_rate": 4.3291365191692204e-05, "loss": 0.5734, "step": 2485 }, { "epoch": 0.29034514925373134, "grad_norm": 0.5007564237127459, "learning_rate": 4.326043364041381e-05, "loss": 0.5352, "step": 2490 }, { "epoch": 0.290928171641791, "grad_norm": 0.47349703509695523, "learning_rate": 4.3229443500639414e-05, "loss": 0.5553, "step": 2495 }, { "epoch": 0.29151119402985076, "grad_norm": 0.5192204895800268, "learning_rate": 4.319839488757305e-05, "loss": 0.5398, "step": 2500 }, { "epoch": 0.29209421641791045, "grad_norm": 0.5757316112230765, "learning_rate": 4.3167287916636145e-05, "loss": 0.5795, "step": 2505 }, { "epoch": 0.29267723880597013, "grad_norm": 0.5046004476804008, "learning_rate": 4.3136122703467045e-05, "loss": 0.5657, "step": 2510 }, { "epoch": 0.29326026119402987, "grad_norm": 0.6116298626095651, "learning_rate": 4.3104899363920616e-05, "loss": 0.5892, "step": 2515 }, { "epoch": 0.29384328358208955, "grad_norm": 0.5028293067243745, "learning_rate": 4.3073618014067824e-05, "loss": 0.5614, "step": 2520 }, { "epoch": 0.29442630597014924, "grad_norm": 0.5284740774719737, "learning_rate": 4.304227877019525e-05, "loss": 0.5509, "step": 2525 }, { "epoch": 0.295009328358209, "grad_norm": 0.5043452273361897, "learning_rate": 4.301088174880472e-05, "loss": 0.5734, "step": 2530 }, { "epoch": 0.29559235074626866, "grad_norm": 0.5491908726146885, "learning_rate": 4.297942706661283e-05, "loss": 0.5994, "step": 2535 }, { "epoch": 0.29617537313432835, "grad_norm": 0.5967635993887994, "learning_rate": 4.2947914840550544e-05, "loss": 0.5895, "step": 2540 }, { "epoch": 0.2967583955223881, "grad_norm": 0.49957976391073805, "learning_rate": 4.291634518776273e-05, "loss": 0.5559, "step": 2545 }, { "epoch": 0.29734141791044777, "grad_norm": 0.586452369328343, "learning_rate": 4.2884718225607736e-05, "loss": 0.5987, "step": 2550 }, { "epoch": 0.29792444029850745, "grad_norm": 0.5079777653661454, "learning_rate": 4.285303407165694e-05, "loss": 0.5892, "step": 2555 }, { "epoch": 0.29850746268656714, "grad_norm": 0.45945289033221137, "learning_rate": 4.282129284369436e-05, "loss": 0.5353, "step": 2560 }, { "epoch": 0.2990904850746269, "grad_norm": 0.47293762371540526, "learning_rate": 4.278949465971616e-05, "loss": 0.5451, "step": 2565 }, { "epoch": 0.29967350746268656, "grad_norm": 0.5125425687772994, "learning_rate": 4.2757639637930246e-05, "loss": 0.588, "step": 2570 }, { "epoch": 0.30025652985074625, "grad_norm": 0.560469657657478, "learning_rate": 4.2725727896755804e-05, "loss": 0.5353, "step": 2575 }, { "epoch": 0.300839552238806, "grad_norm": 0.5254788469907365, "learning_rate": 4.269375955482287e-05, "loss": 0.547, "step": 2580 }, { "epoch": 0.30142257462686567, "grad_norm": 0.48987507837663824, "learning_rate": 4.266173473097192e-05, "loss": 0.5175, "step": 2585 }, { "epoch": 0.30200559701492535, "grad_norm": 0.5484732726515896, "learning_rate": 4.262965354425335e-05, "loss": 0.6067, "step": 2590 }, { "epoch": 0.3025886194029851, "grad_norm": 0.5064490667857149, "learning_rate": 4.259751611392712e-05, "loss": 0.533, "step": 2595 }, { "epoch": 0.3031716417910448, "grad_norm": 0.5014575792637805, "learning_rate": 4.256532255946226e-05, "loss": 0.5913, "step": 2600 }, { "epoch": 0.30375466417910446, "grad_norm": 0.48901348419554475, "learning_rate": 4.253307300053643e-05, "loss": 0.5543, "step": 2605 }, { "epoch": 0.3043376865671642, "grad_norm": 0.5232458296317694, "learning_rate": 4.25007675570355e-05, "loss": 0.5759, "step": 2610 }, { "epoch": 0.3049207089552239, "grad_norm": 0.5070730983893377, "learning_rate": 4.246840634905307e-05, "loss": 0.5515, "step": 2615 }, { "epoch": 0.30550373134328357, "grad_norm": 0.4828637362156461, "learning_rate": 4.2435989496890054e-05, "loss": 0.5817, "step": 2620 }, { "epoch": 0.3060867537313433, "grad_norm": 0.45681007096402865, "learning_rate": 4.240351712105422e-05, "loss": 0.5538, "step": 2625 }, { "epoch": 0.306669776119403, "grad_norm": 0.5633919156991598, "learning_rate": 4.237098934225973e-05, "loss": 0.5619, "step": 2630 }, { "epoch": 0.3072527985074627, "grad_norm": 0.4744742163372956, "learning_rate": 4.233840628142672e-05, "loss": 0.5749, "step": 2635 }, { "epoch": 0.30783582089552236, "grad_norm": 0.47221088669237776, "learning_rate": 4.2305768059680806e-05, "loss": 0.5689, "step": 2640 }, { "epoch": 0.3084188432835821, "grad_norm": 0.4762515982934417, "learning_rate": 4.2273074798352706e-05, "loss": 0.5507, "step": 2645 }, { "epoch": 0.3090018656716418, "grad_norm": 0.4718217399644616, "learning_rate": 4.22403266189777e-05, "loss": 0.5483, "step": 2650 }, { "epoch": 0.30958488805970147, "grad_norm": 0.4671780211152603, "learning_rate": 4.2207523643295253e-05, "loss": 0.5463, "step": 2655 }, { "epoch": 0.3101679104477612, "grad_norm": 0.49004196187956206, "learning_rate": 4.2174665993248505e-05, "loss": 0.5474, "step": 2660 }, { "epoch": 0.3107509328358209, "grad_norm": 0.5092130863744895, "learning_rate": 4.214175379098388e-05, "loss": 0.5512, "step": 2665 }, { "epoch": 0.3113339552238806, "grad_norm": 0.4925159889445034, "learning_rate": 4.210878715885056e-05, "loss": 0.5609, "step": 2670 }, { "epoch": 0.3119169776119403, "grad_norm": 0.5438536521932689, "learning_rate": 4.2075766219400095e-05, "loss": 0.5949, "step": 2675 }, { "epoch": 0.3125, "grad_norm": 0.4985765329826545, "learning_rate": 4.20426910953859e-05, "loss": 0.5726, "step": 2680 }, { "epoch": 0.3130830223880597, "grad_norm": 0.5036063584187528, "learning_rate": 4.200956190976284e-05, "loss": 0.5255, "step": 2685 }, { "epoch": 0.3136660447761194, "grad_norm": 0.5634159268083127, "learning_rate": 4.1976378785686715e-05, "loss": 0.5254, "step": 2690 }, { "epoch": 0.3142490671641791, "grad_norm": 0.483875275656377, "learning_rate": 4.1943141846513886e-05, "loss": 0.5518, "step": 2695 }, { "epoch": 0.3148320895522388, "grad_norm": 0.5510683619678165, "learning_rate": 4.190985121580071e-05, "loss": 0.557, "step": 2700 }, { "epoch": 0.31541511194029853, "grad_norm": 0.45786711381238016, "learning_rate": 4.18765070173032e-05, "loss": 0.521, "step": 2705 }, { "epoch": 0.3159981343283582, "grad_norm": 0.47954794442661985, "learning_rate": 4.184310937497647e-05, "loss": 0.55, "step": 2710 }, { "epoch": 0.3165811567164179, "grad_norm": 0.5348827941441646, "learning_rate": 4.1809658412974314e-05, "loss": 0.5668, "step": 2715 }, { "epoch": 0.31716417910447764, "grad_norm": 0.5035567889348412, "learning_rate": 4.177615425564872e-05, "loss": 0.5584, "step": 2720 }, { "epoch": 0.3177472014925373, "grad_norm": 0.5014049010285576, "learning_rate": 4.174259702754947e-05, "loss": 0.5538, "step": 2725 }, { "epoch": 0.318330223880597, "grad_norm": 0.7108080618041146, "learning_rate": 4.17089868534236e-05, "loss": 0.593, "step": 2730 }, { "epoch": 0.3189132462686567, "grad_norm": 0.4901344322857948, "learning_rate": 4.1675323858214975e-05, "loss": 0.5504, "step": 2735 }, { "epoch": 0.31949626865671643, "grad_norm": 0.5455638212871261, "learning_rate": 4.164160816706383e-05, "loss": 0.5781, "step": 2740 }, { "epoch": 0.3200792910447761, "grad_norm": 0.4691865337288382, "learning_rate": 4.160783990530629e-05, "loss": 0.5349, "step": 2745 }, { "epoch": 0.3206623134328358, "grad_norm": 0.5397164117756076, "learning_rate": 4.157401919847389e-05, "loss": 0.5635, "step": 2750 }, { "epoch": 0.32124533582089554, "grad_norm": 0.49203133885978545, "learning_rate": 4.1540146172293154e-05, "loss": 0.553, "step": 2755 }, { "epoch": 0.3218283582089552, "grad_norm": 0.6956383718262041, "learning_rate": 4.150622095268508e-05, "loss": 0.5454, "step": 2760 }, { "epoch": 0.3224113805970149, "grad_norm": 0.5236479255481173, "learning_rate": 4.1472243665764715e-05, "loss": 0.546, "step": 2765 }, { "epoch": 0.32299440298507465, "grad_norm": 0.5226139094665005, "learning_rate": 4.1438214437840625e-05, "loss": 0.5685, "step": 2770 }, { "epoch": 0.32357742537313433, "grad_norm": 0.519532395995944, "learning_rate": 4.140413339541451e-05, "loss": 0.5621, "step": 2775 }, { "epoch": 0.324160447761194, "grad_norm": 0.5221117984856967, "learning_rate": 4.137000066518065e-05, "loss": 0.5945, "step": 2780 }, { "epoch": 0.32474347014925375, "grad_norm": 0.4662764439552845, "learning_rate": 4.13358163740255e-05, "loss": 0.5439, "step": 2785 }, { "epoch": 0.32532649253731344, "grad_norm": 0.4825709779338081, "learning_rate": 4.1301580649027154e-05, "loss": 0.5487, "step": 2790 }, { "epoch": 0.3259095149253731, "grad_norm": 0.49595525241615773, "learning_rate": 4.126729361745495e-05, "loss": 0.5456, "step": 2795 }, { "epoch": 0.32649253731343286, "grad_norm": 0.46259611525800753, "learning_rate": 4.1232955406768925e-05, "loss": 0.5385, "step": 2800 }, { "epoch": 0.32707555970149255, "grad_norm": 0.48140441921381905, "learning_rate": 4.119856614461938e-05, "loss": 0.5289, "step": 2805 }, { "epoch": 0.32765858208955223, "grad_norm": 0.4878844018028054, "learning_rate": 4.11641259588464e-05, "loss": 0.5698, "step": 2810 }, { "epoch": 0.3282416044776119, "grad_norm": 0.521050501093044, "learning_rate": 4.1129634977479375e-05, "loss": 0.5609, "step": 2815 }, { "epoch": 0.32882462686567165, "grad_norm": 0.49828928878222534, "learning_rate": 4.109509332873653e-05, "loss": 0.5421, "step": 2820 }, { "epoch": 0.32940764925373134, "grad_norm": 0.5526407883676059, "learning_rate": 4.106050114102443e-05, "loss": 0.5789, "step": 2825 }, { "epoch": 0.329990671641791, "grad_norm": 0.5516248967750637, "learning_rate": 4.102585854293751e-05, "loss": 0.526, "step": 2830 }, { "epoch": 0.33057369402985076, "grad_norm": 0.5461716849773546, "learning_rate": 4.0991165663257636e-05, "loss": 0.542, "step": 2835 }, { "epoch": 0.33115671641791045, "grad_norm": 0.4524230892035543, "learning_rate": 4.095642263095356e-05, "loss": 0.5429, "step": 2840 }, { "epoch": 0.33173973880597013, "grad_norm": 0.4907608846635581, "learning_rate": 4.0921629575180485e-05, "loss": 0.5536, "step": 2845 }, { "epoch": 0.33232276119402987, "grad_norm": 0.5284875026264199, "learning_rate": 4.088678662527959e-05, "loss": 0.6164, "step": 2850 }, { "epoch": 0.33290578358208955, "grad_norm": 0.48638662847344827, "learning_rate": 4.085189391077749e-05, "loss": 0.576, "step": 2855 }, { "epoch": 0.33348880597014924, "grad_norm": 0.5695349098133777, "learning_rate": 4.0816951561385836e-05, "loss": 0.5521, "step": 2860 }, { "epoch": 0.334071828358209, "grad_norm": 0.44821729017419637, "learning_rate": 4.078195970700079e-05, "loss": 0.526, "step": 2865 }, { "epoch": 0.33465485074626866, "grad_norm": 0.5289515772805433, "learning_rate": 4.074691847770251e-05, "loss": 0.5505, "step": 2870 }, { "epoch": 0.33523787313432835, "grad_norm": 0.5491964687999824, "learning_rate": 4.0711828003754764e-05, "loss": 0.5795, "step": 2875 }, { "epoch": 0.3358208955223881, "grad_norm": 0.5184535389920294, "learning_rate": 4.067668841560432e-05, "loss": 0.5864, "step": 2880 }, { "epoch": 0.33640391791044777, "grad_norm": 0.4640726695845034, "learning_rate": 4.064149984388057e-05, "loss": 0.5377, "step": 2885 }, { "epoch": 0.33698694029850745, "grad_norm": 0.5223555480526277, "learning_rate": 4.060626241939499e-05, "loss": 0.5528, "step": 2890 }, { "epoch": 0.33756996268656714, "grad_norm": 0.47444935336532806, "learning_rate": 4.057097627314063e-05, "loss": 0.5366, "step": 2895 }, { "epoch": 0.3381529850746269, "grad_norm": 0.49936360299242055, "learning_rate": 4.0535641536291725e-05, "loss": 0.5981, "step": 2900 }, { "epoch": 0.33873600746268656, "grad_norm": 0.49567180693590535, "learning_rate": 4.050025834020307e-05, "loss": 0.564, "step": 2905 }, { "epoch": 0.33931902985074625, "grad_norm": 0.48231305889253284, "learning_rate": 4.046482681640967e-05, "loss": 0.5559, "step": 2910 }, { "epoch": 0.339902052238806, "grad_norm": 0.5713231807843953, "learning_rate": 4.042934709662613e-05, "loss": 0.6046, "step": 2915 }, { "epoch": 0.34048507462686567, "grad_norm": 0.5599418550405636, "learning_rate": 4.039381931274626e-05, "loss": 0.5459, "step": 2920 }, { "epoch": 0.34106809701492535, "grad_norm": 0.5244470856538797, "learning_rate": 4.035824359684253e-05, "loss": 0.5559, "step": 2925 }, { "epoch": 0.3416511194029851, "grad_norm": 0.46218246493450177, "learning_rate": 4.032262008116559e-05, "loss": 0.5496, "step": 2930 }, { "epoch": 0.3422341417910448, "grad_norm": 0.5460308556475504, "learning_rate": 4.02869488981438e-05, "loss": 0.5726, "step": 2935 }, { "epoch": 0.34281716417910446, "grad_norm": 0.4623737344709334, "learning_rate": 4.025123018038271e-05, "loss": 0.5422, "step": 2940 }, { "epoch": 0.3434001865671642, "grad_norm": 0.5813565724181794, "learning_rate": 4.0215464060664564e-05, "loss": 0.5822, "step": 2945 }, { "epoch": 0.3439832089552239, "grad_norm": 0.49376149527368923, "learning_rate": 4.017965067194783e-05, "loss": 0.5821, "step": 2950 }, { "epoch": 0.34456623134328357, "grad_norm": 0.5169325234827248, "learning_rate": 4.0143790147366724e-05, "loss": 0.5482, "step": 2955 }, { "epoch": 0.3451492537313433, "grad_norm": 0.515737079500457, "learning_rate": 4.010788262023064e-05, "loss": 0.5709, "step": 2960 }, { "epoch": 0.345732276119403, "grad_norm": 0.5087208722494978, "learning_rate": 4.007192822402372e-05, "loss": 0.5565, "step": 2965 }, { "epoch": 0.3463152985074627, "grad_norm": 0.4796963642209258, "learning_rate": 4.003592709240438e-05, "loss": 0.5494, "step": 2970 }, { "epoch": 0.34689832089552236, "grad_norm": 0.49755643832004437, "learning_rate": 3.9999879359204676e-05, "loss": 0.5382, "step": 2975 }, { "epoch": 0.3474813432835821, "grad_norm": 0.4690304314754959, "learning_rate": 3.996378515843001e-05, "loss": 0.5334, "step": 2980 }, { "epoch": 0.3480643656716418, "grad_norm": 0.4772189221465707, "learning_rate": 3.9927644624258445e-05, "loss": 0.5902, "step": 2985 }, { "epoch": 0.34864738805970147, "grad_norm": 0.46404845790530064, "learning_rate": 3.989145789104033e-05, "loss": 0.5431, "step": 2990 }, { "epoch": 0.3492304104477612, "grad_norm": 0.5340362988246943, "learning_rate": 3.985522509329775e-05, "loss": 0.5486, "step": 2995 }, { "epoch": 0.3498134328358209, "grad_norm": 0.510771920504794, "learning_rate": 3.9818946365724004e-05, "loss": 0.5401, "step": 3000 }, { "epoch": 0.3503964552238806, "grad_norm": 0.49718158294490394, "learning_rate": 3.978262184318317e-05, "loss": 0.5626, "step": 3005 }, { "epoch": 0.3509794776119403, "grad_norm": 0.5158633729934479, "learning_rate": 3.974625166070953e-05, "loss": 0.5364, "step": 3010 }, { "epoch": 0.3515625, "grad_norm": 0.5204824836614376, "learning_rate": 3.970983595350714e-05, "loss": 0.5545, "step": 3015 }, { "epoch": 0.3521455223880597, "grad_norm": 0.506085761814899, "learning_rate": 3.967337485694929e-05, "loss": 0.5492, "step": 3020 }, { "epoch": 0.3527285447761194, "grad_norm": 0.5167191975821872, "learning_rate": 3.963686850657795e-05, "loss": 0.5326, "step": 3025 }, { "epoch": 0.3533115671641791, "grad_norm": 0.49528479007831655, "learning_rate": 3.9600317038103385e-05, "loss": 0.548, "step": 3030 }, { "epoch": 0.3538945895522388, "grad_norm": 0.514405428467145, "learning_rate": 3.956372058740354e-05, "loss": 0.5708, "step": 3035 }, { "epoch": 0.35447761194029853, "grad_norm": 0.4760653718744451, "learning_rate": 3.952707929052359e-05, "loss": 0.5385, "step": 3040 }, { "epoch": 0.3550606343283582, "grad_norm": 0.5086645138695323, "learning_rate": 3.9490393283675445e-05, "loss": 0.5425, "step": 3045 }, { "epoch": 0.3556436567164179, "grad_norm": 0.48138382467030305, "learning_rate": 3.9453662703237186e-05, "loss": 0.5599, "step": 3050 }, { "epoch": 0.35622667910447764, "grad_norm": 0.594129469541421, "learning_rate": 3.941688768575261e-05, "loss": 0.558, "step": 3055 }, { "epoch": 0.3568097014925373, "grad_norm": 0.4896479074572315, "learning_rate": 3.938006836793073e-05, "loss": 0.5399, "step": 3060 }, { "epoch": 0.357392723880597, "grad_norm": 0.5430860896504877, "learning_rate": 3.934320488664519e-05, "loss": 0.5614, "step": 3065 }, { "epoch": 0.3579757462686567, "grad_norm": 0.5171549846963465, "learning_rate": 3.9306297378933855e-05, "loss": 0.5338, "step": 3070 }, { "epoch": 0.35855876865671643, "grad_norm": 0.5030297563055794, "learning_rate": 3.926934598199824e-05, "loss": 0.5671, "step": 3075 }, { "epoch": 0.3591417910447761, "grad_norm": 0.4738604088361965, "learning_rate": 3.923235083320301e-05, "loss": 0.5204, "step": 3080 }, { "epoch": 0.3597248134328358, "grad_norm": 0.5081725848955316, "learning_rate": 3.919531207007548e-05, "loss": 0.5382, "step": 3085 }, { "epoch": 0.36030783582089554, "grad_norm": 0.5336505974453601, "learning_rate": 3.915822983030512e-05, "loss": 0.5498, "step": 3090 }, { "epoch": 0.3608908582089552, "grad_norm": 0.45862890034907244, "learning_rate": 3.912110425174296e-05, "loss": 0.5478, "step": 3095 }, { "epoch": 0.3614738805970149, "grad_norm": 0.5692817255847623, "learning_rate": 3.9083935472401214e-05, "loss": 0.5511, "step": 3100 }, { "epoch": 0.36205690298507465, "grad_norm": 0.5588630515105258, "learning_rate": 3.904672363045265e-05, "loss": 0.5713, "step": 3105 }, { "epoch": 0.36263992537313433, "grad_norm": 0.7882164929530883, "learning_rate": 3.900946886423012e-05, "loss": 0.5967, "step": 3110 }, { "epoch": 0.363222947761194, "grad_norm": 0.44828952211943274, "learning_rate": 3.897217131222606e-05, "loss": 0.5117, "step": 3115 }, { "epoch": 0.36380597014925375, "grad_norm": 0.492899081711114, "learning_rate": 3.893483111309196e-05, "loss": 0.5381, "step": 3120 }, { "epoch": 0.36438899253731344, "grad_norm": 0.46780156460670047, "learning_rate": 3.889744840563781e-05, "loss": 0.5561, "step": 3125 }, { "epoch": 0.3649720149253731, "grad_norm": 0.5062858013949219, "learning_rate": 3.886002332883169e-05, "loss": 0.5501, "step": 3130 }, { "epoch": 0.36555503731343286, "grad_norm": 0.4693101482730097, "learning_rate": 3.8822556021799114e-05, "loss": 0.5193, "step": 3135 }, { "epoch": 0.36613805970149255, "grad_norm": 0.5060844807583814, "learning_rate": 3.878504662382264e-05, "loss": 0.5532, "step": 3140 }, { "epoch": 0.36672108208955223, "grad_norm": 0.5201363699624885, "learning_rate": 3.8747495274341274e-05, "loss": 0.5845, "step": 3145 }, { "epoch": 0.3673041044776119, "grad_norm": 0.5073064897043211, "learning_rate": 3.870990211294997e-05, "loss": 0.5444, "step": 3150 }, { "epoch": 0.36788712686567165, "grad_norm": 0.5344016844623599, "learning_rate": 3.867226727939912e-05, "loss": 0.5606, "step": 3155 }, { "epoch": 0.36847014925373134, "grad_norm": 0.5139307938716214, "learning_rate": 3.863459091359401e-05, "loss": 0.5882, "step": 3160 }, { "epoch": 0.369053171641791, "grad_norm": 0.4425759816838131, "learning_rate": 3.8596873155594385e-05, "loss": 0.5202, "step": 3165 }, { "epoch": 0.36963619402985076, "grad_norm": 0.5742393732533151, "learning_rate": 3.855911414561378e-05, "loss": 0.5662, "step": 3170 }, { "epoch": 0.37021921641791045, "grad_norm": 0.44175421568947043, "learning_rate": 3.852131402401914e-05, "loss": 0.4854, "step": 3175 }, { "epoch": 0.37080223880597013, "grad_norm": 0.4984055964239209, "learning_rate": 3.848347293133021e-05, "loss": 0.573, "step": 3180 }, { "epoch": 0.37138526119402987, "grad_norm": 0.49757126890693293, "learning_rate": 3.844559100821906e-05, "loss": 0.5549, "step": 3185 }, { "epoch": 0.37196828358208955, "grad_norm": 0.57357388001981, "learning_rate": 3.8407668395509526e-05, "loss": 0.5574, "step": 3190 }, { "epoch": 0.37255130597014924, "grad_norm": 0.5019623696079913, "learning_rate": 3.8369705234176726e-05, "loss": 0.5894, "step": 3195 }, { "epoch": 0.373134328358209, "grad_norm": 0.4858155386150297, "learning_rate": 3.8331701665346495e-05, "loss": 0.5383, "step": 3200 }, { "epoch": 0.37371735074626866, "grad_norm": 0.5624113035875654, "learning_rate": 3.829365783029492e-05, "loss": 0.5585, "step": 3205 }, { "epoch": 0.37430037313432835, "grad_norm": 0.5052807550374429, "learning_rate": 3.8255573870447716e-05, "loss": 0.5439, "step": 3210 }, { "epoch": 0.3748833955223881, "grad_norm": 0.5085388458884429, "learning_rate": 3.82174499273798e-05, "loss": 0.565, "step": 3215 }, { "epoch": 0.37546641791044777, "grad_norm": 0.4989534046655282, "learning_rate": 3.817928614281471e-05, "loss": 0.5324, "step": 3220 }, { "epoch": 0.37604944029850745, "grad_norm": 0.8403204163889687, "learning_rate": 3.8141082658624106e-05, "loss": 0.5626, "step": 3225 }, { "epoch": 0.37663246268656714, "grad_norm": 0.5559098643501764, "learning_rate": 3.810283961682722e-05, "loss": 0.5583, "step": 3230 }, { "epoch": 0.3772154850746269, "grad_norm": 0.47644518540184416, "learning_rate": 3.806455715959032e-05, "loss": 0.5233, "step": 3235 }, { "epoch": 0.37779850746268656, "grad_norm": 0.4930068420607561, "learning_rate": 3.8026235429226236e-05, "loss": 0.5642, "step": 3240 }, { "epoch": 0.37838152985074625, "grad_norm": 0.5290426787630793, "learning_rate": 3.798787456819377e-05, "loss": 0.5234, "step": 3245 }, { "epoch": 0.378964552238806, "grad_norm": 0.4868258441116788, "learning_rate": 3.794947471909719e-05, "loss": 0.5395, "step": 3250 }, { "epoch": 0.37954757462686567, "grad_norm": 0.47174968576411624, "learning_rate": 3.791103602468569e-05, "loss": 0.5395, "step": 3255 }, { "epoch": 0.38013059701492535, "grad_norm": 0.5363107908185231, "learning_rate": 3.7872558627852905e-05, "loss": 0.5557, "step": 3260 }, { "epoch": 0.3807136194029851, "grad_norm": 0.49188998699814623, "learning_rate": 3.78340426716363e-05, "loss": 0.5268, "step": 3265 }, { "epoch": 0.3812966417910448, "grad_norm": 0.48927529818476645, "learning_rate": 3.779548829921673e-05, "loss": 0.5387, "step": 3270 }, { "epoch": 0.38187966417910446, "grad_norm": 0.47364382567752494, "learning_rate": 3.775689565391781e-05, "loss": 0.5129, "step": 3275 }, { "epoch": 0.3824626865671642, "grad_norm": 0.5126016691334612, "learning_rate": 3.771826487920546e-05, "loss": 0.5467, "step": 3280 }, { "epoch": 0.3830457089552239, "grad_norm": 0.5430085481644645, "learning_rate": 3.767959611868734e-05, "loss": 0.5536, "step": 3285 }, { "epoch": 0.38362873134328357, "grad_norm": 0.4763687695806715, "learning_rate": 3.764088951611233e-05, "loss": 0.5071, "step": 3290 }, { "epoch": 0.3842117537313433, "grad_norm": 0.47538986078328227, "learning_rate": 3.7602145215369965e-05, "loss": 0.5431, "step": 3295 }, { "epoch": 0.384794776119403, "grad_norm": 0.44993211618209894, "learning_rate": 3.756336336048994e-05, "loss": 0.5608, "step": 3300 }, { "epoch": 0.3853777985074627, "grad_norm": 0.4857510707261703, "learning_rate": 3.752454409564152e-05, "loss": 0.5307, "step": 3305 }, { "epoch": 0.38596082089552236, "grad_norm": 0.49584291918089757, "learning_rate": 3.74856875651331e-05, "loss": 0.6047, "step": 3310 }, { "epoch": 0.3865438432835821, "grad_norm": 0.5136523336548113, "learning_rate": 3.744679391341157e-05, "loss": 0.5516, "step": 3315 }, { "epoch": 0.3871268656716418, "grad_norm": 0.5181136974711912, "learning_rate": 3.740786328506179e-05, "loss": 0.5439, "step": 3320 }, { "epoch": 0.38770988805970147, "grad_norm": 0.5089785637081085, "learning_rate": 3.7368895824806146e-05, "loss": 0.5585, "step": 3325 }, { "epoch": 0.3882929104477612, "grad_norm": 0.49682305539963434, "learning_rate": 3.732989167750388e-05, "loss": 0.562, "step": 3330 }, { "epoch": 0.3888759328358209, "grad_norm": 0.5243718121493276, "learning_rate": 3.7290850988150644e-05, "loss": 0.5851, "step": 3335 }, { "epoch": 0.3894589552238806, "grad_norm": 0.5098618228761496, "learning_rate": 3.7251773901877945e-05, "loss": 0.5191, "step": 3340 }, { "epoch": 0.3900419776119403, "grad_norm": 0.47672882853039394, "learning_rate": 3.721266056395257e-05, "loss": 0.5284, "step": 3345 }, { "epoch": 0.390625, "grad_norm": 0.4822727732005445, "learning_rate": 3.7173511119776086e-05, "loss": 0.5794, "step": 3350 }, { "epoch": 0.3912080223880597, "grad_norm": 0.5222854783235708, "learning_rate": 3.713432571488427e-05, "loss": 0.5391, "step": 3355 }, { "epoch": 0.3917910447761194, "grad_norm": 7.9962975681807595, "learning_rate": 3.70951044949466e-05, "loss": 0.5534, "step": 3360 }, { "epoch": 0.3923740671641791, "grad_norm": 0.4764348433947033, "learning_rate": 3.705584760576566e-05, "loss": 0.5452, "step": 3365 }, { "epoch": 0.3929570895522388, "grad_norm": 0.49080831286781906, "learning_rate": 3.7016555193276667e-05, "loss": 0.5746, "step": 3370 }, { "epoch": 0.39354011194029853, "grad_norm": 0.5101400424865905, "learning_rate": 3.697722740354688e-05, "loss": 0.5729, "step": 3375 }, { "epoch": 0.3941231343283582, "grad_norm": 0.4479587001588852, "learning_rate": 3.6937864382775076e-05, "loss": 0.5447, "step": 3380 }, { "epoch": 0.3947061567164179, "grad_norm": 0.48466656620240023, "learning_rate": 3.689846627729098e-05, "loss": 0.5327, "step": 3385 }, { "epoch": 0.39528917910447764, "grad_norm": 0.5157423888893855, "learning_rate": 3.685903323355477e-05, "loss": 0.5381, "step": 3390 }, { "epoch": 0.3958722014925373, "grad_norm": 0.4820557574738787, "learning_rate": 3.68195653981565e-05, "loss": 0.5183, "step": 3395 }, { "epoch": 0.396455223880597, "grad_norm": 0.48067690178778916, "learning_rate": 3.678006291781555e-05, "loss": 0.6121, "step": 3400 }, { "epoch": 0.3970382462686567, "grad_norm": 0.48140274545401546, "learning_rate": 3.6740525939380084e-05, "loss": 0.5763, "step": 3405 }, { "epoch": 0.39762126865671643, "grad_norm": 0.4821460133421974, "learning_rate": 3.6700954609826535e-05, "loss": 0.5498, "step": 3410 }, { "epoch": 0.3982042910447761, "grad_norm": 0.5062442615898354, "learning_rate": 3.6661349076259015e-05, "loss": 0.5301, "step": 3415 }, { "epoch": 0.3987873134328358, "grad_norm": 0.4894842886999361, "learning_rate": 3.662170948590879e-05, "loss": 0.5647, "step": 3420 }, { "epoch": 0.39937033582089554, "grad_norm": 0.4635809485827499, "learning_rate": 3.658203598613375e-05, "loss": 0.5658, "step": 3425 }, { "epoch": 0.3999533582089552, "grad_norm": 0.4756142318658808, "learning_rate": 3.65423287244178e-05, "loss": 0.5434, "step": 3430 }, { "epoch": 0.4005363805970149, "grad_norm": 0.4768793799786962, "learning_rate": 3.6502587848370395e-05, "loss": 0.5129, "step": 3435 }, { "epoch": 0.40111940298507465, "grad_norm": 0.49735047427469553, "learning_rate": 3.6462813505725915e-05, "loss": 0.5693, "step": 3440 }, { "epoch": 0.40170242537313433, "grad_norm": 0.46768242382881264, "learning_rate": 3.642300584434319e-05, "loss": 0.5424, "step": 3445 }, { "epoch": 0.402285447761194, "grad_norm": 0.48929227885921284, "learning_rate": 3.638316501220487e-05, "loss": 0.5613, "step": 3450 }, { "epoch": 0.40286847014925375, "grad_norm": 0.49035092965063865, "learning_rate": 3.6343291157416937e-05, "loss": 0.5747, "step": 3455 }, { "epoch": 0.40345149253731344, "grad_norm": 0.4920873718095396, "learning_rate": 3.630338442820814e-05, "loss": 0.5621, "step": 3460 }, { "epoch": 0.4040345149253731, "grad_norm": 0.5721821448291733, "learning_rate": 3.6263444972929395e-05, "loss": 0.5642, "step": 3465 }, { "epoch": 0.40461753731343286, "grad_norm": 0.4900664650078509, "learning_rate": 3.622347294005334e-05, "loss": 0.5637, "step": 3470 }, { "epoch": 0.40520055970149255, "grad_norm": 0.4784540540109224, "learning_rate": 3.618346847817366e-05, "loss": 0.5166, "step": 3475 }, { "epoch": 0.40578358208955223, "grad_norm": 0.5033729998350225, "learning_rate": 3.6143431736004636e-05, "loss": 0.5604, "step": 3480 }, { "epoch": 0.4063666044776119, "grad_norm": 0.47667240088779256, "learning_rate": 3.610336286238051e-05, "loss": 0.5492, "step": 3485 }, { "epoch": 0.40694962686567165, "grad_norm": 0.4874513651352348, "learning_rate": 3.6063262006255006e-05, "loss": 0.5563, "step": 3490 }, { "epoch": 0.40753264925373134, "grad_norm": 0.523091113003475, "learning_rate": 3.602312931670073e-05, "loss": 0.5488, "step": 3495 }, { "epoch": 0.408115671641791, "grad_norm": 0.5531785395812051, "learning_rate": 3.59829649429086e-05, "loss": 0.6013, "step": 3500 }, { "epoch": 0.40869869402985076, "grad_norm": 0.47185275980717517, "learning_rate": 3.5942769034187354e-05, "loss": 0.536, "step": 3505 }, { "epoch": 0.40928171641791045, "grad_norm": 0.46288526087001597, "learning_rate": 3.590254173996295e-05, "loss": 0.5169, "step": 3510 }, { "epoch": 0.40986473880597013, "grad_norm": 0.48524872738357405, "learning_rate": 3.586228320977801e-05, "loss": 0.5271, "step": 3515 }, { "epoch": 0.41044776119402987, "grad_norm": 0.5450240726382458, "learning_rate": 3.582199359329129e-05, "loss": 0.5427, "step": 3520 }, { "epoch": 0.41103078358208955, "grad_norm": 0.48693873450848696, "learning_rate": 3.5781673040277084e-05, "loss": 0.5088, "step": 3525 }, { "epoch": 0.41161380597014924, "grad_norm": 0.46864955154362703, "learning_rate": 3.5741321700624726e-05, "loss": 0.5174, "step": 3530 }, { "epoch": 0.412196828358209, "grad_norm": 0.46728302060083887, "learning_rate": 3.570093972433794e-05, "loss": 0.5687, "step": 3535 }, { "epoch": 0.41277985074626866, "grad_norm": 0.4634797945645771, "learning_rate": 3.56605272615344e-05, "loss": 0.528, "step": 3540 }, { "epoch": 0.41336287313432835, "grad_norm": 0.5238785746405048, "learning_rate": 3.562008446244509e-05, "loss": 0.5375, "step": 3545 }, { "epoch": 0.4139458955223881, "grad_norm": 0.5187794803590449, "learning_rate": 3.557961147741376e-05, "loss": 0.5985, "step": 3550 }, { "epoch": 0.41452891791044777, "grad_norm": 0.4379680762591855, "learning_rate": 3.553910845689638e-05, "loss": 0.5554, "step": 3555 }, { "epoch": 0.41511194029850745, "grad_norm": 0.46577740202461215, "learning_rate": 3.549857555146056e-05, "loss": 0.5472, "step": 3560 }, { "epoch": 0.41569496268656714, "grad_norm": 0.4789932179858049, "learning_rate": 3.5458012911785036e-05, "loss": 0.5451, "step": 3565 }, { "epoch": 0.4162779850746269, "grad_norm": 0.49654652817675987, "learning_rate": 3.541742068865907e-05, "loss": 0.5513, "step": 3570 }, { "epoch": 0.41686100746268656, "grad_norm": 0.4692894243690009, "learning_rate": 3.537679903298187e-05, "loss": 0.5185, "step": 3575 }, { "epoch": 0.41744402985074625, "grad_norm": 0.47941339153241536, "learning_rate": 3.53361480957621e-05, "loss": 0.5692, "step": 3580 }, { "epoch": 0.418027052238806, "grad_norm": 0.46544865766435817, "learning_rate": 3.529546802811725e-05, "loss": 0.5405, "step": 3585 }, { "epoch": 0.41861007462686567, "grad_norm": 0.46403588392065775, "learning_rate": 3.5254758981273106e-05, "loss": 0.5437, "step": 3590 }, { "epoch": 0.41919309701492535, "grad_norm": 0.5179906749611947, "learning_rate": 3.521402110656318e-05, "loss": 0.5593, "step": 3595 }, { "epoch": 0.4197761194029851, "grad_norm": 0.4781025455029735, "learning_rate": 3.517325455542815e-05, "loss": 0.5498, "step": 3600 }, { "epoch": 0.4203591417910448, "grad_norm": 0.4934656687592932, "learning_rate": 3.513245947941531e-05, "loss": 0.5215, "step": 3605 }, { "epoch": 0.42094216417910446, "grad_norm": 0.5326144345367541, "learning_rate": 3.5091636030177995e-05, "loss": 0.535, "step": 3610 }, { "epoch": 0.4215251865671642, "grad_norm": 0.47801066829681754, "learning_rate": 3.505078435947498e-05, "loss": 0.5137, "step": 3615 }, { "epoch": 0.4221082089552239, "grad_norm": 0.5286215614880156, "learning_rate": 3.500990461916998e-05, "loss": 0.5733, "step": 3620 }, { "epoch": 0.42269123134328357, "grad_norm": 0.5073258841460547, "learning_rate": 3.496899696123107e-05, "loss": 0.5746, "step": 3625 }, { "epoch": 0.4232742537313433, "grad_norm": 0.48126213143188057, "learning_rate": 3.492806153773007e-05, "loss": 0.559, "step": 3630 }, { "epoch": 0.423857276119403, "grad_norm": 0.43727021104667335, "learning_rate": 3.488709850084206e-05, "loss": 0.4998, "step": 3635 }, { "epoch": 0.4244402985074627, "grad_norm": 0.4897298712066961, "learning_rate": 3.484610800284473e-05, "loss": 0.5463, "step": 3640 }, { "epoch": 0.42502332089552236, "grad_norm": 0.523356089677995, "learning_rate": 3.480509019611788e-05, "loss": 0.5659, "step": 3645 }, { "epoch": 0.4256063432835821, "grad_norm": 0.5277720573146358, "learning_rate": 3.476404523314282e-05, "loss": 0.5241, "step": 3650 }, { "epoch": 0.4261893656716418, "grad_norm": 0.526635621277189, "learning_rate": 3.472297326650183e-05, "loss": 0.5543, "step": 3655 }, { "epoch": 0.42677238805970147, "grad_norm": 0.45989723897065593, "learning_rate": 3.468187444887754e-05, "loss": 0.4939, "step": 3660 }, { "epoch": 0.4273554104477612, "grad_norm": 0.4622944623408051, "learning_rate": 3.464074893305242e-05, "loss": 0.5297, "step": 3665 }, { "epoch": 0.4279384328358209, "grad_norm": 0.4783006627278798, "learning_rate": 3.45995968719082e-05, "loss": 0.5255, "step": 3670 }, { "epoch": 0.4285214552238806, "grad_norm": 0.5059594374936409, "learning_rate": 3.455841841842524e-05, "loss": 0.5773, "step": 3675 }, { "epoch": 0.4291044776119403, "grad_norm": 0.44346639279558187, "learning_rate": 3.4517213725682085e-05, "loss": 0.5196, "step": 3680 }, { "epoch": 0.4296875, "grad_norm": 0.4742923141132099, "learning_rate": 3.447598294685476e-05, "loss": 0.5211, "step": 3685 }, { "epoch": 0.4302705223880597, "grad_norm": 0.5096994312042691, "learning_rate": 3.443472623521631e-05, "loss": 0.5576, "step": 3690 }, { "epoch": 0.4308535447761194, "grad_norm": 0.4757579513441239, "learning_rate": 3.4393443744136136e-05, "loss": 0.5342, "step": 3695 }, { "epoch": 0.4314365671641791, "grad_norm": 0.4525723387319913, "learning_rate": 3.435213562707953e-05, "loss": 0.521, "step": 3700 }, { "epoch": 0.4320195895522388, "grad_norm": 0.4655324458390628, "learning_rate": 3.431080203760699e-05, "loss": 0.5143, "step": 3705 }, { "epoch": 0.43260261194029853, "grad_norm": 0.46493164741995624, "learning_rate": 3.426944312937376e-05, "loss": 0.5448, "step": 3710 }, { "epoch": 0.4331856343283582, "grad_norm": 0.4705834455942846, "learning_rate": 3.422805905612914e-05, "loss": 0.5132, "step": 3715 }, { "epoch": 0.4337686567164179, "grad_norm": 0.44735510242287085, "learning_rate": 3.4186649971716044e-05, "loss": 0.5078, "step": 3720 }, { "epoch": 0.43435167910447764, "grad_norm": 0.4248977052828472, "learning_rate": 3.4145216030070344e-05, "loss": 0.5224, "step": 3725 }, { "epoch": 0.4349347014925373, "grad_norm": 0.48361810889787654, "learning_rate": 3.410375738522028e-05, "loss": 0.5696, "step": 3730 }, { "epoch": 0.435517723880597, "grad_norm": 0.5169029233248057, "learning_rate": 3.406227419128596e-05, "loss": 0.548, "step": 3735 }, { "epoch": 0.4361007462686567, "grad_norm": 0.47352743387761187, "learning_rate": 3.402076660247878e-05, "loss": 0.5735, "step": 3740 }, { "epoch": 0.43668376865671643, "grad_norm": 0.4598793206254637, "learning_rate": 3.397923477310074e-05, "loss": 0.5286, "step": 3745 }, { "epoch": 0.4372667910447761, "grad_norm": 0.7329855818387587, "learning_rate": 3.393767885754405e-05, "loss": 0.5274, "step": 3750 }, { "epoch": 0.4378498134328358, "grad_norm": 0.5374277209323949, "learning_rate": 3.389609901029038e-05, "loss": 0.569, "step": 3755 }, { "epoch": 0.43843283582089554, "grad_norm": 0.4851132196571775, "learning_rate": 3.38544953859104e-05, "loss": 0.5658, "step": 3760 }, { "epoch": 0.4390158582089552, "grad_norm": 0.4588569006808473, "learning_rate": 3.381286813906317e-05, "loss": 0.5463, "step": 3765 }, { "epoch": 0.4395988805970149, "grad_norm": 0.49195337073823053, "learning_rate": 3.3771217424495555e-05, "loss": 0.5855, "step": 3770 }, { "epoch": 0.44018190298507465, "grad_norm": 0.47991606249122304, "learning_rate": 3.372954339704167e-05, "loss": 0.5496, "step": 3775 }, { "epoch": 0.44076492537313433, "grad_norm": 0.4690325622228386, "learning_rate": 3.368784621162229e-05, "loss": 0.5647, "step": 3780 }, { "epoch": 0.441347947761194, "grad_norm": 0.5399827946397292, "learning_rate": 3.364612602324429e-05, "loss": 0.554, "step": 3785 }, { "epoch": 0.44193097014925375, "grad_norm": 0.44956816879532197, "learning_rate": 3.3604382987000016e-05, "loss": 0.5033, "step": 3790 }, { "epoch": 0.44251399253731344, "grad_norm": 0.49304141117786027, "learning_rate": 3.356261725806681e-05, "loss": 0.5651, "step": 3795 }, { "epoch": 0.4430970149253731, "grad_norm": 0.4984710533841555, "learning_rate": 3.352082899170631e-05, "loss": 0.5238, "step": 3800 }, { "epoch": 0.44368003731343286, "grad_norm": 0.5007565543332921, "learning_rate": 3.3479018343264e-05, "loss": 0.5519, "step": 3805 }, { "epoch": 0.44426305970149255, "grad_norm": 0.5011462730265147, "learning_rate": 3.343718546816852e-05, "loss": 0.5523, "step": 3810 }, { "epoch": 0.44484608208955223, "grad_norm": 0.49568146210324576, "learning_rate": 3.339533052193114e-05, "loss": 0.5132, "step": 3815 }, { "epoch": 0.4454291044776119, "grad_norm": 0.45124156599655735, "learning_rate": 3.335345366014522e-05, "loss": 0.55, "step": 3820 }, { "epoch": 0.44601212686567165, "grad_norm": 0.47223468640192207, "learning_rate": 3.331155503848553e-05, "loss": 0.5438, "step": 3825 }, { "epoch": 0.44659514925373134, "grad_norm": 0.5287373716087295, "learning_rate": 3.326963481270778e-05, "loss": 0.5411, "step": 3830 }, { "epoch": 0.447178171641791, "grad_norm": 0.44674599573765417, "learning_rate": 3.322769313864796e-05, "loss": 0.5363, "step": 3835 }, { "epoch": 0.44776119402985076, "grad_norm": 0.4434254783508246, "learning_rate": 3.3185730172221814e-05, "loss": 0.537, "step": 3840 }, { "epoch": 0.44834421641791045, "grad_norm": 0.5046794647102569, "learning_rate": 3.3143746069424215e-05, "loss": 0.5582, "step": 3845 }, { "epoch": 0.44892723880597013, "grad_norm": 0.489005919425733, "learning_rate": 3.310174098632865e-05, "loss": 0.5549, "step": 3850 }, { "epoch": 0.44951026119402987, "grad_norm": 0.49266732159431287, "learning_rate": 3.305971507908655e-05, "loss": 0.5313, "step": 3855 }, { "epoch": 0.45009328358208955, "grad_norm": 0.5315237555379242, "learning_rate": 3.301766850392681e-05, "loss": 0.5581, "step": 3860 }, { "epoch": 0.45067630597014924, "grad_norm": 0.46700902921771087, "learning_rate": 3.29756014171551e-05, "loss": 0.5371, "step": 3865 }, { "epoch": 0.451259328358209, "grad_norm": 0.4395133061533244, "learning_rate": 3.2933513975153384e-05, "loss": 0.5399, "step": 3870 }, { "epoch": 0.45184235074626866, "grad_norm": 0.485910728377908, "learning_rate": 3.2891406334379285e-05, "loss": 0.525, "step": 3875 }, { "epoch": 0.45242537313432835, "grad_norm": 0.46551066433925664, "learning_rate": 3.284927865136551e-05, "loss": 0.4913, "step": 3880 }, { "epoch": 0.4530083955223881, "grad_norm": 0.4792517888689515, "learning_rate": 3.280713108271926e-05, "loss": 0.5237, "step": 3885 }, { "epoch": 0.45359141791044777, "grad_norm": 0.44580552095045006, "learning_rate": 3.276496378512168e-05, "loss": 0.5716, "step": 3890 }, { "epoch": 0.45417444029850745, "grad_norm": 0.46092489328912967, "learning_rate": 3.272277691532725e-05, "loss": 0.5402, "step": 3895 }, { "epoch": 0.45475746268656714, "grad_norm": 0.4413897932753195, "learning_rate": 3.268057063016319e-05, "loss": 0.5305, "step": 3900 }, { "epoch": 0.4553404850746269, "grad_norm": 0.4782676785091174, "learning_rate": 3.263834508652894e-05, "loss": 0.5946, "step": 3905 }, { "epoch": 0.45592350746268656, "grad_norm": 0.5238873675452039, "learning_rate": 3.259610044139548e-05, "loss": 0.5478, "step": 3910 }, { "epoch": 0.45650652985074625, "grad_norm": 0.5182499166196407, "learning_rate": 3.255383685180484e-05, "loss": 0.5454, "step": 3915 }, { "epoch": 0.457089552238806, "grad_norm": 0.47638810180727925, "learning_rate": 3.251155447486945e-05, "loss": 0.5446, "step": 3920 }, { "epoch": 0.45767257462686567, "grad_norm": 0.4523229716036492, "learning_rate": 3.246925346777158e-05, "loss": 0.522, "step": 3925 }, { "epoch": 0.45825559701492535, "grad_norm": 0.46909930899902835, "learning_rate": 3.2426933987762785e-05, "loss": 0.5393, "step": 3930 }, { "epoch": 0.4588386194029851, "grad_norm": 0.43989917162010367, "learning_rate": 3.238459619216326e-05, "loss": 0.5211, "step": 3935 }, { "epoch": 0.4594216417910448, "grad_norm": 0.48026394452289217, "learning_rate": 3.23422402383613e-05, "loss": 0.5547, "step": 3940 }, { "epoch": 0.46000466417910446, "grad_norm": 0.48185947326085543, "learning_rate": 3.22998662838127e-05, "loss": 0.5041, "step": 3945 }, { "epoch": 0.4605876865671642, "grad_norm": 0.4734018707747314, "learning_rate": 3.2257474486040166e-05, "loss": 0.5038, "step": 3950 }, { "epoch": 0.4611707089552239, "grad_norm": 0.5075617095429634, "learning_rate": 3.221506500263276e-05, "loss": 0.5447, "step": 3955 }, { "epoch": 0.46175373134328357, "grad_norm": 0.5046083432543891, "learning_rate": 3.217263799124527e-05, "loss": 0.5772, "step": 3960 }, { "epoch": 0.4623367537313433, "grad_norm": 0.47069172866304554, "learning_rate": 3.213019360959762e-05, "loss": 0.5341, "step": 3965 }, { "epoch": 0.462919776119403, "grad_norm": 0.47866635232292687, "learning_rate": 3.2087732015474366e-05, "loss": 0.5208, "step": 3970 }, { "epoch": 0.4635027985074627, "grad_norm": 0.4516545061013375, "learning_rate": 3.204525336672399e-05, "loss": 0.5382, "step": 3975 }, { "epoch": 0.46408582089552236, "grad_norm": 0.4987443516763455, "learning_rate": 3.200275782125842e-05, "loss": 0.5319, "step": 3980 }, { "epoch": 0.4646688432835821, "grad_norm": 0.5260827157599703, "learning_rate": 3.196024553705235e-05, "loss": 0.5355, "step": 3985 }, { "epoch": 0.4652518656716418, "grad_norm": 0.4668030990105745, "learning_rate": 3.1917716672142746e-05, "loss": 0.5057, "step": 3990 }, { "epoch": 0.46583488805970147, "grad_norm": 0.4426983151393079, "learning_rate": 3.187517138462819e-05, "loss": 0.5254, "step": 3995 }, { "epoch": 0.4664179104477612, "grad_norm": 0.4925988353031197, "learning_rate": 3.1832609832668314e-05, "loss": 0.5422, "step": 4000 }, { "epoch": 0.4670009328358209, "grad_norm": 0.4560283471593519, "learning_rate": 3.179003217448321e-05, "loss": 0.5013, "step": 4005 }, { "epoch": 0.4675839552238806, "grad_norm": 0.5244233872415475, "learning_rate": 3.1747438568352844e-05, "loss": 0.5736, "step": 4010 }, { "epoch": 0.4681669776119403, "grad_norm": 0.5512150192012447, "learning_rate": 3.170482917261648e-05, "loss": 0.5688, "step": 4015 }, { "epoch": 0.46875, "grad_norm": 0.5009381195241619, "learning_rate": 3.166220414567206e-05, "loss": 0.5468, "step": 4020 }, { "epoch": 0.4693330223880597, "grad_norm": 0.47235216856933904, "learning_rate": 3.161956364597566e-05, "loss": 0.5318, "step": 4025 }, { "epoch": 0.4699160447761194, "grad_norm": 0.5634765857129355, "learning_rate": 3.1576907832040855e-05, "loss": 0.5188, "step": 4030 }, { "epoch": 0.4704990671641791, "grad_norm": 0.48072091370811076, "learning_rate": 3.153423686243813e-05, "loss": 0.5313, "step": 4035 }, { "epoch": 0.4710820895522388, "grad_norm": 0.5120155067954245, "learning_rate": 3.149155089579437e-05, "loss": 0.5572, "step": 4040 }, { "epoch": 0.47166511194029853, "grad_norm": 0.4812745114629162, "learning_rate": 3.144885009079215e-05, "loss": 0.5578, "step": 4045 }, { "epoch": 0.4722481343283582, "grad_norm": 0.5058486137109877, "learning_rate": 3.140613460616924e-05, "loss": 0.5199, "step": 4050 }, { "epoch": 0.4728311567164179, "grad_norm": 0.48002106325579913, "learning_rate": 3.1363404600717965e-05, "loss": 0.5659, "step": 4055 }, { "epoch": 0.47341417910447764, "grad_norm": 0.48245091505853244, "learning_rate": 3.132066023328465e-05, "loss": 0.533, "step": 4060 }, { "epoch": 0.4739972014925373, "grad_norm": 0.47536248109948226, "learning_rate": 3.1277901662768983e-05, "loss": 0.5433, "step": 4065 }, { "epoch": 0.474580223880597, "grad_norm": 0.47200232932636466, "learning_rate": 3.123512904812347e-05, "loss": 0.5322, "step": 4070 }, { "epoch": 0.4751632462686567, "grad_norm": 0.44740442637527467, "learning_rate": 3.119234254835282e-05, "loss": 0.5107, "step": 4075 }, { "epoch": 0.47574626865671643, "grad_norm": 0.46154208469718466, "learning_rate": 3.114954232251336e-05, "loss": 0.527, "step": 4080 }, { "epoch": 0.4763292910447761, "grad_norm": 0.5341929895695878, "learning_rate": 3.110672852971243e-05, "loss": 0.5689, "step": 4085 }, { "epoch": 0.4769123134328358, "grad_norm": 0.4652305557977976, "learning_rate": 3.1063901329107843e-05, "loss": 0.5167, "step": 4090 }, { "epoch": 0.47749533582089554, "grad_norm": 0.44417381469876005, "learning_rate": 3.10210608799072e-05, "loss": 0.5413, "step": 4095 }, { "epoch": 0.4780783582089552, "grad_norm": 0.4507749342993046, "learning_rate": 3.097820734136739e-05, "loss": 0.5282, "step": 4100 }, { "epoch": 0.4786613805970149, "grad_norm": 0.48297525105833106, "learning_rate": 3.093534087279397e-05, "loss": 0.5347, "step": 4105 }, { "epoch": 0.47924440298507465, "grad_norm": 0.5100316904667305, "learning_rate": 3.089246163354051e-05, "loss": 0.5395, "step": 4110 }, { "epoch": 0.47982742537313433, "grad_norm": 0.4995786265506999, "learning_rate": 3.084956978300812e-05, "loss": 0.5311, "step": 4115 }, { "epoch": 0.480410447761194, "grad_norm": 0.4471832602242875, "learning_rate": 3.080666548064475e-05, "loss": 0.5193, "step": 4120 }, { "epoch": 0.48099347014925375, "grad_norm": 0.46140397088213214, "learning_rate": 3.076374888594464e-05, "loss": 0.5345, "step": 4125 }, { "epoch": 0.48157649253731344, "grad_norm": 0.44371711691404125, "learning_rate": 3.0720820158447766e-05, "loss": 0.5072, "step": 4130 }, { "epoch": 0.4821595149253731, "grad_norm": 0.4561270995882789, "learning_rate": 3.067787945773915e-05, "loss": 0.5181, "step": 4135 }, { "epoch": 0.48274253731343286, "grad_norm": 0.4468599316613869, "learning_rate": 3.063492694344835e-05, "loss": 0.5286, "step": 4140 }, { "epoch": 0.48332555970149255, "grad_norm": 0.4921853335929309, "learning_rate": 3.059196277524886e-05, "loss": 0.5075, "step": 4145 }, { "epoch": 0.48390858208955223, "grad_norm": 0.47049020704612843, "learning_rate": 3.054898711285747e-05, "loss": 0.5475, "step": 4150 }, { "epoch": 0.4844916044776119, "grad_norm": 1.0060177708740357, "learning_rate": 3.05060001160337e-05, "loss": 0.5293, "step": 4155 }, { "epoch": 0.48507462686567165, "grad_norm": 0.48784465637028546, "learning_rate": 3.046300194457923e-05, "loss": 0.534, "step": 4160 }, { "epoch": 0.48565764925373134, "grad_norm": 0.4518879881598059, "learning_rate": 3.0419992758337235e-05, "loss": 0.5209, "step": 4165 }, { "epoch": 0.486240671641791, "grad_norm": 0.4452974474644216, "learning_rate": 3.0376972717191894e-05, "loss": 0.5245, "step": 4170 }, { "epoch": 0.48682369402985076, "grad_norm": 0.44833371464722693, "learning_rate": 3.0333941981067688e-05, "loss": 0.5156, "step": 4175 }, { "epoch": 0.48740671641791045, "grad_norm": 0.44934997583495073, "learning_rate": 3.029090070992889e-05, "loss": 0.5144, "step": 4180 }, { "epoch": 0.48798973880597013, "grad_norm": 0.47274610781733095, "learning_rate": 3.0247849063778917e-05, "loss": 0.5317, "step": 4185 }, { "epoch": 0.48857276119402987, "grad_norm": 0.47590787280797764, "learning_rate": 3.020478720265977e-05, "loss": 0.51, "step": 4190 }, { "epoch": 0.48915578358208955, "grad_norm": 0.44563102351746864, "learning_rate": 3.01617152866514e-05, "loss": 0.5301, "step": 4195 }, { "epoch": 0.48973880597014924, "grad_norm": 0.4916861483726837, "learning_rate": 3.0118633475871167e-05, "loss": 0.5772, "step": 4200 }, { "epoch": 0.490321828358209, "grad_norm": 0.4684037067244426, "learning_rate": 3.0075541930473183e-05, "loss": 0.4969, "step": 4205 }, { "epoch": 0.49090485074626866, "grad_norm": 0.4797812232043466, "learning_rate": 3.0032440810647783e-05, "loss": 0.5038, "step": 4210 }, { "epoch": 0.49148787313432835, "grad_norm": 0.4525162593921102, "learning_rate": 2.998933027662086e-05, "loss": 0.5266, "step": 4215 }, { "epoch": 0.4920708955223881, "grad_norm": 0.4822829175013609, "learning_rate": 2.9946210488653316e-05, "loss": 0.5243, "step": 4220 }, { "epoch": 0.49265391791044777, "grad_norm": 0.45841709338227005, "learning_rate": 2.9903081607040473e-05, "loss": 0.4948, "step": 4225 }, { "epoch": 0.49323694029850745, "grad_norm": 0.4465507453121582, "learning_rate": 2.9859943792111422e-05, "loss": 0.5228, "step": 4230 }, { "epoch": 0.49381996268656714, "grad_norm": 0.613185904602815, "learning_rate": 2.9816797204228497e-05, "loss": 0.5345, "step": 4235 }, { "epoch": 0.4944029850746269, "grad_norm": 0.49396745715444307, "learning_rate": 2.9773642003786627e-05, "loss": 0.5412, "step": 4240 }, { "epoch": 0.49498600746268656, "grad_norm": 0.5239548031082005, "learning_rate": 2.9730478351212754e-05, "loss": 0.5821, "step": 4245 }, { "epoch": 0.49556902985074625, "grad_norm": 0.4221114790983556, "learning_rate": 2.968730640696526e-05, "loss": 0.5199, "step": 4250 }, { "epoch": 0.496152052238806, "grad_norm": 0.480540611050451, "learning_rate": 2.9644126331533328e-05, "loss": 0.5149, "step": 4255 }, { "epoch": 0.49673507462686567, "grad_norm": 0.4972848047633413, "learning_rate": 2.9600938285436385e-05, "loss": 0.5155, "step": 4260 }, { "epoch": 0.49731809701492535, "grad_norm": 0.5100758433643319, "learning_rate": 2.9557742429223478e-05, "loss": 0.5495, "step": 4265 }, { "epoch": 0.4979011194029851, "grad_norm": 0.46169645444430685, "learning_rate": 2.95145389234727e-05, "loss": 0.5432, "step": 4270 }, { "epoch": 0.4984841417910448, "grad_norm": 0.478683421695544, "learning_rate": 2.947132792879056e-05, "loss": 0.5622, "step": 4275 }, { "epoch": 0.49906716417910446, "grad_norm": 0.4937574145856353, "learning_rate": 2.9428109605811427e-05, "loss": 0.5292, "step": 4280 }, { "epoch": 0.4996501865671642, "grad_norm": 0.4482984531417574, "learning_rate": 2.9384884115196898e-05, "loss": 0.5115, "step": 4285 }, { "epoch": 0.5002332089552238, "grad_norm": 0.4581506836353169, "learning_rate": 2.9341651617635236e-05, "loss": 0.5369, "step": 4290 }, { "epoch": 0.5008162313432836, "grad_norm": 0.4636121196389006, "learning_rate": 2.929841227384072e-05, "loss": 0.563, "step": 4295 }, { "epoch": 0.5013992537313433, "grad_norm": 0.49303476266346524, "learning_rate": 2.925516624455311e-05, "loss": 0.5323, "step": 4300 }, { "epoch": 0.5019822761194029, "grad_norm": 0.4792765198500162, "learning_rate": 2.9211913690537003e-05, "loss": 0.5238, "step": 4305 }, { "epoch": 0.5025652985074627, "grad_norm": 0.47595384636734767, "learning_rate": 2.9168654772581257e-05, "loss": 0.5487, "step": 4310 }, { "epoch": 0.5031483208955224, "grad_norm": 0.45475337905445856, "learning_rate": 2.9125389651498374e-05, "loss": 0.5551, "step": 4315 }, { "epoch": 0.503731343283582, "grad_norm": 0.45368396432436275, "learning_rate": 2.908211848812394e-05, "loss": 0.5096, "step": 4320 }, { "epoch": 0.5043143656716418, "grad_norm": 0.4720732964199861, "learning_rate": 2.903884144331598e-05, "loss": 0.5375, "step": 4325 }, { "epoch": 0.5048973880597015, "grad_norm": 0.4609507788137596, "learning_rate": 2.89955586779544e-05, "loss": 0.5193, "step": 4330 }, { "epoch": 0.5054804104477612, "grad_norm": 0.4437193884044816, "learning_rate": 2.8952270352940362e-05, "loss": 0.538, "step": 4335 }, { "epoch": 0.5060634328358209, "grad_norm": 0.4686721143893824, "learning_rate": 2.89089766291957e-05, "loss": 0.5359, "step": 4340 }, { "epoch": 0.5066464552238806, "grad_norm": 0.48522442554794026, "learning_rate": 2.886567766766231e-05, "loss": 0.5367, "step": 4345 }, { "epoch": 0.5072294776119403, "grad_norm": 0.4682334708596764, "learning_rate": 2.8822373629301573e-05, "loss": 0.4979, "step": 4350 }, { "epoch": 0.5078125, "grad_norm": 0.4461441009977717, "learning_rate": 2.8779064675093724e-05, "loss": 0.5473, "step": 4355 }, { "epoch": 0.5083955223880597, "grad_norm": 0.523425795341376, "learning_rate": 2.8735750966037295e-05, "loss": 0.592, "step": 4360 }, { "epoch": 0.5089785447761194, "grad_norm": 0.41208414865400855, "learning_rate": 2.869243266314847e-05, "loss": 0.5146, "step": 4365 }, { "epoch": 0.5095615671641791, "grad_norm": 0.45512638703063274, "learning_rate": 2.8649109927460533e-05, "loss": 0.517, "step": 4370 }, { "epoch": 0.5101445895522388, "grad_norm": 0.449015485615699, "learning_rate": 2.8605782920023227e-05, "loss": 0.5235, "step": 4375 }, { "epoch": 0.5107276119402985, "grad_norm": 0.49156178005481366, "learning_rate": 2.8562451801902197e-05, "loss": 0.5308, "step": 4380 }, { "epoch": 0.5113106343283582, "grad_norm": 0.4348710846671963, "learning_rate": 2.8519116734178336e-05, "loss": 0.5387, "step": 4385 }, { "epoch": 0.511893656716418, "grad_norm": 0.4902492973827986, "learning_rate": 2.8475777877947264e-05, "loss": 0.5417, "step": 4390 }, { "epoch": 0.5124766791044776, "grad_norm": 0.6547931036800456, "learning_rate": 2.843243539431863e-05, "loss": 0.5444, "step": 4395 }, { "epoch": 0.5130597014925373, "grad_norm": 0.4876654982418252, "learning_rate": 2.838908944441562e-05, "loss": 0.5558, "step": 4400 }, { "epoch": 0.5136427238805971, "grad_norm": 0.46099868761271035, "learning_rate": 2.834574018937428e-05, "loss": 0.5605, "step": 4405 }, { "epoch": 0.5142257462686567, "grad_norm": 0.4545024437386275, "learning_rate": 2.8302387790342943e-05, "loss": 0.5175, "step": 4410 }, { "epoch": 0.5148087686567164, "grad_norm": 0.4336617180340361, "learning_rate": 2.8259032408481635e-05, "loss": 0.5038, "step": 4415 }, { "epoch": 0.5153917910447762, "grad_norm": 0.47838755970386043, "learning_rate": 2.8215674204961462e-05, "loss": 0.5594, "step": 4420 }, { "epoch": 0.5159748134328358, "grad_norm": 0.5142813582002242, "learning_rate": 2.817231334096403e-05, "loss": 0.5177, "step": 4425 }, { "epoch": 0.5165578358208955, "grad_norm": 0.44541441218084665, "learning_rate": 2.812894997768083e-05, "loss": 0.5465, "step": 4430 }, { "epoch": 0.5171408582089553, "grad_norm": 0.4560004171104499, "learning_rate": 2.8085584276312644e-05, "loss": 0.5201, "step": 4435 }, { "epoch": 0.5177238805970149, "grad_norm": 0.47964290891517664, "learning_rate": 2.8042216398068942e-05, "loss": 0.5247, "step": 4440 }, { "epoch": 0.5183069029850746, "grad_norm": 0.8168752970003079, "learning_rate": 2.7998846504167308e-05, "loss": 0.5277, "step": 4445 }, { "epoch": 0.5188899253731343, "grad_norm": 2.544656980904975, "learning_rate": 2.7955474755832784e-05, "loss": 0.505, "step": 4450 }, { "epoch": 0.519472947761194, "grad_norm": 0.4659395065613913, "learning_rate": 2.7912101314297327e-05, "loss": 0.5269, "step": 4455 }, { "epoch": 0.5200559701492538, "grad_norm": 2.6969407750363943, "learning_rate": 2.7868726340799184e-05, "loss": 0.5306, "step": 4460 }, { "epoch": 0.5206389925373134, "grad_norm": 0.5270843633040919, "learning_rate": 2.7825349996582313e-05, "loss": 0.5711, "step": 4465 }, { "epoch": 0.5212220149253731, "grad_norm": 0.5091777799722457, "learning_rate": 2.7781972442895726e-05, "loss": 0.5365, "step": 4470 }, { "epoch": 0.5218050373134329, "grad_norm": 1.3533639882343287, "learning_rate": 2.7738593840992975e-05, "loss": 0.5539, "step": 4475 }, { "epoch": 0.5223880597014925, "grad_norm": 0.4547594298673975, "learning_rate": 2.769521435213149e-05, "loss": 0.544, "step": 4480 }, { "epoch": 0.5229710820895522, "grad_norm": 0.5510873748275549, "learning_rate": 2.7651834137572003e-05, "loss": 0.5353, "step": 4485 }, { "epoch": 0.523554104477612, "grad_norm": 0.4328373857127542, "learning_rate": 2.760845335857793e-05, "loss": 0.5132, "step": 4490 }, { "epoch": 0.5241371268656716, "grad_norm": 0.45053079578268007, "learning_rate": 2.7565072176414803e-05, "loss": 0.5231, "step": 4495 }, { "epoch": 0.5247201492537313, "grad_norm": 0.49741815715434656, "learning_rate": 2.7521690752349643e-05, "loss": 0.5281, "step": 4500 }, { "epoch": 0.5253031716417911, "grad_norm": 0.4607291755452952, "learning_rate": 2.7478309247650362e-05, "loss": 0.5558, "step": 4505 }, { "epoch": 0.5258861940298507, "grad_norm": 0.5194917261722904, "learning_rate": 2.7434927823585206e-05, "loss": 0.539, "step": 4510 }, { "epoch": 0.5264692164179104, "grad_norm": 0.45463126850370955, "learning_rate": 2.739154664142208e-05, "loss": 0.5215, "step": 4515 }, { "epoch": 0.5270522388059702, "grad_norm": 0.4892107161079775, "learning_rate": 2.7348165862428e-05, "loss": 0.4891, "step": 4520 }, { "epoch": 0.5276352611940298, "grad_norm": 0.5045457990411999, "learning_rate": 2.7304785647868507e-05, "loss": 0.5416, "step": 4525 }, { "epoch": 0.5282182835820896, "grad_norm": 0.44346437150982526, "learning_rate": 2.726140615900703e-05, "loss": 0.5048, "step": 4530 }, { "epoch": 0.5288013059701493, "grad_norm": 0.48666851854991494, "learning_rate": 2.7218027557104286e-05, "loss": 0.5034, "step": 4535 }, { "epoch": 0.5293843283582089, "grad_norm": 0.4611162809005146, "learning_rate": 2.7174650003417696e-05, "loss": 0.5142, "step": 4540 }, { "epoch": 0.5299673507462687, "grad_norm": 0.45514376590541833, "learning_rate": 2.7131273659200818e-05, "loss": 0.5162, "step": 4545 }, { "epoch": 0.5305503731343284, "grad_norm": 0.48960501435122616, "learning_rate": 2.7087898685702685e-05, "loss": 0.5069, "step": 4550 }, { "epoch": 0.531133395522388, "grad_norm": 0.4797862989191752, "learning_rate": 2.704452524416722e-05, "loss": 0.531, "step": 4555 }, { "epoch": 0.5317164179104478, "grad_norm": 0.5893011284468495, "learning_rate": 2.7001153495832697e-05, "loss": 0.5061, "step": 4560 }, { "epoch": 0.5322994402985075, "grad_norm": 0.4461810976300674, "learning_rate": 2.6957783601931063e-05, "loss": 0.5203, "step": 4565 }, { "epoch": 0.5328824626865671, "grad_norm": 0.49982880523004874, "learning_rate": 2.691441572368737e-05, "loss": 0.5284, "step": 4570 }, { "epoch": 0.5334654850746269, "grad_norm": 0.8194747676964631, "learning_rate": 2.6871050022319177e-05, "loss": 0.5642, "step": 4575 }, { "epoch": 0.5340485074626866, "grad_norm": 0.45941020004877126, "learning_rate": 2.6827686659035983e-05, "loss": 0.5444, "step": 4580 }, { "epoch": 0.5346315298507462, "grad_norm": 0.7401627959971693, "learning_rate": 2.678432579503855e-05, "loss": 0.5138, "step": 4585 }, { "epoch": 0.535214552238806, "grad_norm": 0.49796442033400656, "learning_rate": 2.6740967591518374e-05, "loss": 0.5264, "step": 4590 }, { "epoch": 0.5357975746268657, "grad_norm": 1.3853714172153995, "learning_rate": 2.6697612209657063e-05, "loss": 0.5375, "step": 4595 }, { "epoch": 0.5363805970149254, "grad_norm": 0.4543560797621967, "learning_rate": 2.665425981062573e-05, "loss": 0.5155, "step": 4600 }, { "epoch": 0.5369636194029851, "grad_norm": 0.484187959090983, "learning_rate": 2.6610910555584384e-05, "loss": 0.5478, "step": 4605 }, { "epoch": 0.5375466417910447, "grad_norm": 0.49626488411866204, "learning_rate": 2.6567564605681376e-05, "loss": 0.5352, "step": 4610 }, { "epoch": 0.5381296641791045, "grad_norm": 0.514181721385795, "learning_rate": 2.652422212205275e-05, "loss": 0.5387, "step": 4615 }, { "epoch": 0.5387126865671642, "grad_norm": 0.5832049097179766, "learning_rate": 2.6480883265821673e-05, "loss": 0.5355, "step": 4620 }, { "epoch": 0.5392957089552238, "grad_norm": 1.5120035406279293, "learning_rate": 2.643754819809781e-05, "loss": 0.5144, "step": 4625 }, { "epoch": 0.5398787313432836, "grad_norm": 0.49157920203901545, "learning_rate": 2.639421707997678e-05, "loss": 0.5176, "step": 4630 }, { "epoch": 0.5404617537313433, "grad_norm": 0.52902066701415, "learning_rate": 2.6350890072539476e-05, "loss": 0.5635, "step": 4635 }, { "epoch": 0.5410447761194029, "grad_norm": 0.4637572082859708, "learning_rate": 2.630756733685153e-05, "loss": 0.5346, "step": 4640 }, { "epoch": 0.5416277985074627, "grad_norm": 0.47595145214824397, "learning_rate": 2.6264249033962713e-05, "loss": 0.5249, "step": 4645 }, { "epoch": 0.5422108208955224, "grad_norm": 1.6764787888943493, "learning_rate": 2.622093532490628e-05, "loss": 0.5064, "step": 4650 }, { "epoch": 0.542793843283582, "grad_norm": 0.5788528459186878, "learning_rate": 2.6177626370698443e-05, "loss": 0.5051, "step": 4655 }, { "epoch": 0.5433768656716418, "grad_norm": 0.48162500018331245, "learning_rate": 2.6134322332337695e-05, "loss": 0.5043, "step": 4660 }, { "epoch": 0.5439598880597015, "grad_norm": 0.4737951941511267, "learning_rate": 2.6091023370804307e-05, "loss": 0.5142, "step": 4665 }, { "epoch": 0.5445429104477612, "grad_norm": 0.5575644816288676, "learning_rate": 2.604772964705965e-05, "loss": 0.5505, "step": 4670 }, { "epoch": 0.5451259328358209, "grad_norm": 0.4758372366326835, "learning_rate": 2.6004441322045603e-05, "loss": 0.5373, "step": 4675 }, { "epoch": 0.5457089552238806, "grad_norm": 0.508958909313539, "learning_rate": 2.596115855668403e-05, "loss": 0.5353, "step": 4680 }, { "epoch": 0.5462919776119403, "grad_norm": 0.4683786155826753, "learning_rate": 2.5917881511876073e-05, "loss": 0.4946, "step": 4685 }, { "epoch": 0.546875, "grad_norm": 0.5456678942382134, "learning_rate": 2.5874610348501632e-05, "loss": 0.5371, "step": 4690 }, { "epoch": 0.5474580223880597, "grad_norm": 0.4552682312653168, "learning_rate": 2.5831345227418752e-05, "loss": 0.5605, "step": 4695 }, { "epoch": 0.5480410447761194, "grad_norm": 0.4776001785979188, "learning_rate": 2.5788086309463006e-05, "loss": 0.5005, "step": 4700 }, { "epoch": 0.5486240671641791, "grad_norm": 0.5453229696717531, "learning_rate": 2.57448337554469e-05, "loss": 0.5194, "step": 4705 }, { "epoch": 0.5492070895522388, "grad_norm": 0.5387817919914101, "learning_rate": 2.570158772615928e-05, "loss": 0.5333, "step": 4710 }, { "epoch": 0.5497901119402985, "grad_norm": 0.48603015940285904, "learning_rate": 2.5658348382364773e-05, "loss": 0.5117, "step": 4715 }, { "epoch": 0.5503731343283582, "grad_norm": 0.4789950555392135, "learning_rate": 2.5615115884803108e-05, "loss": 0.5168, "step": 4720 }, { "epoch": 0.550956156716418, "grad_norm": 0.4593000342095894, "learning_rate": 2.557189039418858e-05, "loss": 0.5238, "step": 4725 }, { "epoch": 0.5515391791044776, "grad_norm": 0.4671404581672369, "learning_rate": 2.552867207120945e-05, "loss": 0.5336, "step": 4730 }, { "epoch": 0.5521222014925373, "grad_norm": 0.4867006683126969, "learning_rate": 2.5485461076527308e-05, "loss": 0.5432, "step": 4735 }, { "epoch": 0.5527052238805971, "grad_norm": 0.4603754841175435, "learning_rate": 2.5442257570776527e-05, "loss": 0.5464, "step": 4740 }, { "epoch": 0.5532882462686567, "grad_norm": 0.5322095390809611, "learning_rate": 2.539906171456362e-05, "loss": 0.5102, "step": 4745 }, { "epoch": 0.5538712686567164, "grad_norm": 0.4501063054653902, "learning_rate": 2.5355873668466677e-05, "loss": 0.526, "step": 4750 }, { "epoch": 0.5544542910447762, "grad_norm": 0.4645529985985766, "learning_rate": 2.5312693593034746e-05, "loss": 0.5035, "step": 4755 }, { "epoch": 0.5550373134328358, "grad_norm": 0.4764710401107973, "learning_rate": 2.5269521648787247e-05, "loss": 0.5458, "step": 4760 }, { "epoch": 0.5556203358208955, "grad_norm": 0.4534954950893274, "learning_rate": 2.5226357996213378e-05, "loss": 0.5199, "step": 4765 }, { "epoch": 0.5562033582089553, "grad_norm": 0.4810706685920549, "learning_rate": 2.518320279577151e-05, "loss": 0.5263, "step": 4770 }, { "epoch": 0.5567863805970149, "grad_norm": 0.4690472358261824, "learning_rate": 2.514005620788858e-05, "loss": 0.5245, "step": 4775 }, { "epoch": 0.5573694029850746, "grad_norm": 0.46880508544260313, "learning_rate": 2.5096918392959532e-05, "loss": 0.5232, "step": 4780 }, { "epoch": 0.5579524253731343, "grad_norm": 0.513925549118589, "learning_rate": 2.5053789511346693e-05, "loss": 0.5157, "step": 4785 }, { "epoch": 0.558535447761194, "grad_norm": 0.5043455996761089, "learning_rate": 2.5010669723379154e-05, "loss": 0.5128, "step": 4790 }, { "epoch": 0.5591184701492538, "grad_norm": 0.5924109641064016, "learning_rate": 2.4967559189352226e-05, "loss": 0.4921, "step": 4795 }, { "epoch": 0.5597014925373134, "grad_norm": 0.47567528373039897, "learning_rate": 2.492445806952682e-05, "loss": 0.5406, "step": 4800 }, { "epoch": 0.5602845149253731, "grad_norm": 0.4707447199114358, "learning_rate": 2.4881366524128845e-05, "loss": 0.5325, "step": 4805 }, { "epoch": 0.5608675373134329, "grad_norm": 0.444463840534227, "learning_rate": 2.4838284713348602e-05, "loss": 0.4945, "step": 4810 }, { "epoch": 0.5614505597014925, "grad_norm": 0.51359075031383, "learning_rate": 2.479521279734024e-05, "loss": 0.4986, "step": 4815 }, { "epoch": 0.5620335820895522, "grad_norm": 0.508318043421021, "learning_rate": 2.475215093622109e-05, "loss": 0.5362, "step": 4820 }, { "epoch": 0.562616604477612, "grad_norm": 0.4798125508084004, "learning_rate": 2.4709099290071126e-05, "loss": 0.498, "step": 4825 }, { "epoch": 0.5631996268656716, "grad_norm": 0.46383793437769893, "learning_rate": 2.4666058018932314e-05, "loss": 0.5254, "step": 4830 }, { "epoch": 0.5637826492537313, "grad_norm": 0.4825804953446005, "learning_rate": 2.4623027282808114e-05, "loss": 0.515, "step": 4835 }, { "epoch": 0.5643656716417911, "grad_norm": 0.49356516542621953, "learning_rate": 2.4580007241662773e-05, "loss": 0.4913, "step": 4840 }, { "epoch": 0.5649486940298507, "grad_norm": 0.46504399968658483, "learning_rate": 2.4536998055420783e-05, "loss": 0.5055, "step": 4845 }, { "epoch": 0.5655317164179104, "grad_norm": 0.5178266867135491, "learning_rate": 2.4493999883966308e-05, "loss": 0.5338, "step": 4850 }, { "epoch": 0.5661147388059702, "grad_norm": 0.443773396289914, "learning_rate": 2.445101288714254e-05, "loss": 0.527, "step": 4855 }, { "epoch": 0.5666977611940298, "grad_norm": 0.4361179409054666, "learning_rate": 2.440803722475114e-05, "loss": 0.5115, "step": 4860 }, { "epoch": 0.5672807835820896, "grad_norm": 0.4957031944579591, "learning_rate": 2.436507305655165e-05, "loss": 0.5224, "step": 4865 }, { "epoch": 0.5678638059701493, "grad_norm": 0.5179909459677171, "learning_rate": 2.4322120542260864e-05, "loss": 0.5026, "step": 4870 }, { "epoch": 0.5684468283582089, "grad_norm": 0.45931411628609564, "learning_rate": 2.4279179841552246e-05, "loss": 0.5492, "step": 4875 }, { "epoch": 0.5690298507462687, "grad_norm": 0.4718292947061309, "learning_rate": 2.4236251114055358e-05, "loss": 0.5485, "step": 4880 }, { "epoch": 0.5696128731343284, "grad_norm": 0.4509426009796579, "learning_rate": 2.419333451935526e-05, "loss": 0.5262, "step": 4885 }, { "epoch": 0.570195895522388, "grad_norm": 0.520970250021266, "learning_rate": 2.4150430216991888e-05, "loss": 0.54, "step": 4890 }, { "epoch": 0.5707789179104478, "grad_norm": 0.48125165415242044, "learning_rate": 2.4107538366459494e-05, "loss": 0.5535, "step": 4895 }, { "epoch": 0.5713619402985075, "grad_norm": 0.49924612989919914, "learning_rate": 2.406465912720604e-05, "loss": 0.5378, "step": 4900 }, { "epoch": 0.5719449626865671, "grad_norm": 0.4618022420362027, "learning_rate": 2.4021792658632612e-05, "loss": 0.5385, "step": 4905 }, { "epoch": 0.5725279850746269, "grad_norm": 0.43524447366727087, "learning_rate": 2.3978939120092814e-05, "loss": 0.5275, "step": 4910 }, { "epoch": 0.5731110074626866, "grad_norm": 0.5235831961960491, "learning_rate": 2.3936098670892165e-05, "loss": 0.5171, "step": 4915 }, { "epoch": 0.5736940298507462, "grad_norm": 0.47253982117527504, "learning_rate": 2.389327147028757e-05, "loss": 0.5149, "step": 4920 }, { "epoch": 0.574277052238806, "grad_norm": 0.536733235694404, "learning_rate": 2.3850457677486655e-05, "loss": 0.5385, "step": 4925 }, { "epoch": 0.5748600746268657, "grad_norm": 0.49487264913586265, "learning_rate": 2.380765745164718e-05, "loss": 0.5453, "step": 4930 }, { "epoch": 0.5754430970149254, "grad_norm": 0.4782990502193716, "learning_rate": 2.376487095187654e-05, "loss": 0.496, "step": 4935 }, { "epoch": 0.5760261194029851, "grad_norm": 1.5309625007086856, "learning_rate": 2.3722098337231025e-05, "loss": 0.507, "step": 4940 }, { "epoch": 0.5766091417910447, "grad_norm": 0.4155961423348815, "learning_rate": 2.3679339766715358e-05, "loss": 0.5219, "step": 4945 }, { "epoch": 0.5771921641791045, "grad_norm": 0.4914382394629777, "learning_rate": 2.363659539928204e-05, "loss": 0.5107, "step": 4950 }, { "epoch": 0.5777751865671642, "grad_norm": 0.4984691122251259, "learning_rate": 2.3593865393830766e-05, "loss": 0.5007, "step": 4955 }, { "epoch": 0.5783582089552238, "grad_norm": 0.504442137117383, "learning_rate": 2.355114990920786e-05, "loss": 0.5103, "step": 4960 }, { "epoch": 0.5789412313432836, "grad_norm": 0.774789963927665, "learning_rate": 2.3508449104205636e-05, "loss": 0.516, "step": 4965 }, { "epoch": 0.5795242537313433, "grad_norm": 0.49669070623006295, "learning_rate": 2.3465763137561875e-05, "loss": 0.4984, "step": 4970 }, { "epoch": 0.5801072761194029, "grad_norm": 0.4915699683156642, "learning_rate": 2.342309216795916e-05, "loss": 0.5237, "step": 4975 }, { "epoch": 0.5806902985074627, "grad_norm": 0.7919278813140391, "learning_rate": 2.3380436354024338e-05, "loss": 0.5327, "step": 4980 }, { "epoch": 0.5812733208955224, "grad_norm": 0.513367233603117, "learning_rate": 2.333779585432794e-05, "loss": 0.545, "step": 4985 }, { "epoch": 0.581856343283582, "grad_norm": 0.5185055850159029, "learning_rate": 2.329517082738353e-05, "loss": 0.5671, "step": 4990 }, { "epoch": 0.5824393656716418, "grad_norm": 0.4535336329617379, "learning_rate": 2.3252561431647158e-05, "loss": 0.5072, "step": 4995 }, { "epoch": 0.5830223880597015, "grad_norm": 0.4214639260752155, "learning_rate": 2.32099678255168e-05, "loss": 0.4765, "step": 5000 }, { "epoch": 0.5836054104477612, "grad_norm": 0.4292230083522436, "learning_rate": 2.316739016733169e-05, "loss": 0.535, "step": 5005 }, { "epoch": 0.5841884328358209, "grad_norm": 0.4500847118246923, "learning_rate": 2.3124828615371817e-05, "loss": 0.5579, "step": 5010 }, { "epoch": 0.5847714552238806, "grad_norm": 0.4441260617873408, "learning_rate": 2.3082283327857253e-05, "loss": 0.502, "step": 5015 }, { "epoch": 0.5853544776119403, "grad_norm": 0.4929193442519856, "learning_rate": 2.3039754462947653e-05, "loss": 0.5154, "step": 5020 }, { "epoch": 0.5859375, "grad_norm": 0.4604596804207028, "learning_rate": 2.2997242178741596e-05, "loss": 0.5173, "step": 5025 }, { "epoch": 0.5865205223880597, "grad_norm": 0.4671301724369313, "learning_rate": 2.2954746633276016e-05, "loss": 0.5022, "step": 5030 }, { "epoch": 0.5871035447761194, "grad_norm": 0.4666266550085784, "learning_rate": 2.2912267984525643e-05, "loss": 0.5263, "step": 5035 }, { "epoch": 0.5876865671641791, "grad_norm": 0.5663717691966537, "learning_rate": 2.2869806390402384e-05, "loss": 0.5126, "step": 5040 }, { "epoch": 0.5882695895522388, "grad_norm": 0.4709782228008594, "learning_rate": 2.2827362008754743e-05, "loss": 0.5374, "step": 5045 }, { "epoch": 0.5888526119402985, "grad_norm": 0.523861503593284, "learning_rate": 2.278493499736724e-05, "loss": 0.5313, "step": 5050 }, { "epoch": 0.5894356343283582, "grad_norm": 0.4797723190933745, "learning_rate": 2.2742525513959832e-05, "loss": 0.5382, "step": 5055 }, { "epoch": 0.590018656716418, "grad_norm": 0.43860453059040094, "learning_rate": 2.2700133716187316e-05, "loss": 0.5265, "step": 5060 }, { "epoch": 0.5906016791044776, "grad_norm": 0.4740764262851301, "learning_rate": 2.2657759761638707e-05, "loss": 0.5148, "step": 5065 }, { "epoch": 0.5911847014925373, "grad_norm": 0.48957608183212625, "learning_rate": 2.261540380783675e-05, "loss": 0.4952, "step": 5070 }, { "epoch": 0.5917677238805971, "grad_norm": 0.5162045189958194, "learning_rate": 2.257306601223722e-05, "loss": 0.5331, "step": 5075 }, { "epoch": 0.5923507462686567, "grad_norm": 0.5728273214706431, "learning_rate": 2.2530746532228413e-05, "loss": 0.5143, "step": 5080 }, { "epoch": 0.5929337686567164, "grad_norm": 0.44294658485783917, "learning_rate": 2.2488445525130557e-05, "loss": 0.5385, "step": 5085 }, { "epoch": 0.5935167910447762, "grad_norm": 0.4728473251887733, "learning_rate": 2.2446163148195164e-05, "loss": 0.5249, "step": 5090 }, { "epoch": 0.5940998134328358, "grad_norm": 0.42056687179230184, "learning_rate": 2.2403899558604525e-05, "loss": 0.4896, "step": 5095 }, { "epoch": 0.5946828358208955, "grad_norm": 0.42312832455359684, "learning_rate": 2.2361654913471065e-05, "loss": 0.4831, "step": 5100 }, { "epoch": 0.5952658582089553, "grad_norm": 0.4310382361089622, "learning_rate": 2.2319429369836815e-05, "loss": 0.5038, "step": 5105 }, { "epoch": 0.5958488805970149, "grad_norm": 0.43349085099461926, "learning_rate": 2.2277223084672765e-05, "loss": 0.5332, "step": 5110 }, { "epoch": 0.5964319029850746, "grad_norm": 0.4358686772263049, "learning_rate": 2.2235036214878325e-05, "loss": 0.4716, "step": 5115 }, { "epoch": 0.5970149253731343, "grad_norm": 0.4998061706788218, "learning_rate": 2.2192868917280745e-05, "loss": 0.5277, "step": 5120 }, { "epoch": 0.597597947761194, "grad_norm": 0.4569209342643407, "learning_rate": 2.21507213486345e-05, "loss": 0.4995, "step": 5125 }, { "epoch": 0.5981809701492538, "grad_norm": 0.45297207736047584, "learning_rate": 2.2108593665620724e-05, "loss": 0.5254, "step": 5130 }, { "epoch": 0.5987639925373134, "grad_norm": 0.49176995249756655, "learning_rate": 2.2066486024846615e-05, "loss": 0.5312, "step": 5135 }, { "epoch": 0.5993470149253731, "grad_norm": 0.45352488174846867, "learning_rate": 2.2024398582844906e-05, "loss": 0.5183, "step": 5140 }, { "epoch": 0.5999300373134329, "grad_norm": 0.4686989486366428, "learning_rate": 2.19823314960732e-05, "loss": 0.5337, "step": 5145 }, { "epoch": 0.6005130597014925, "grad_norm": 0.4989471659468385, "learning_rate": 2.1940284920913445e-05, "loss": 0.513, "step": 5150 }, { "epoch": 0.6010960820895522, "grad_norm": 0.7737032679878307, "learning_rate": 2.1898259013671357e-05, "loss": 0.4948, "step": 5155 }, { "epoch": 0.601679104477612, "grad_norm": 0.6456584381963478, "learning_rate": 2.1856253930575787e-05, "loss": 0.5506, "step": 5160 }, { "epoch": 0.6022621268656716, "grad_norm": 0.49420633212169496, "learning_rate": 2.18142698277782e-05, "loss": 0.534, "step": 5165 }, { "epoch": 0.6028451492537313, "grad_norm": 0.43079700140392413, "learning_rate": 2.1772306861352044e-05, "loss": 0.4994, "step": 5170 }, { "epoch": 0.6034281716417911, "grad_norm": 0.4477340131354866, "learning_rate": 2.1730365187292228e-05, "loss": 0.4895, "step": 5175 }, { "epoch": 0.6040111940298507, "grad_norm": 0.45974056791460993, "learning_rate": 2.1688444961514476e-05, "loss": 0.4963, "step": 5180 }, { "epoch": 0.6045942164179104, "grad_norm": 0.4463763393224647, "learning_rate": 2.1646546339854788e-05, "loss": 0.5139, "step": 5185 }, { "epoch": 0.6051772388059702, "grad_norm": 0.5136717833408538, "learning_rate": 2.1604669478068863e-05, "loss": 0.5195, "step": 5190 }, { "epoch": 0.6057602611940298, "grad_norm": 0.4554979479736964, "learning_rate": 2.1562814531831487e-05, "loss": 0.5062, "step": 5195 }, { "epoch": 0.6063432835820896, "grad_norm": 0.45967404846816146, "learning_rate": 2.1520981656736e-05, "loss": 0.4984, "step": 5200 }, { "epoch": 0.6069263059701493, "grad_norm": 0.44666913701763594, "learning_rate": 2.1479171008293686e-05, "loss": 0.4939, "step": 5205 }, { "epoch": 0.6075093283582089, "grad_norm": 1.738573008880425, "learning_rate": 2.1437382741933204e-05, "loss": 0.542, "step": 5210 }, { "epoch": 0.6080923507462687, "grad_norm": 0.45746766067753797, "learning_rate": 2.1395617012999993e-05, "loss": 0.5014, "step": 5215 }, { "epoch": 0.6086753731343284, "grad_norm": 0.4542305089948559, "learning_rate": 2.1353873976755716e-05, "loss": 0.4852, "step": 5220 }, { "epoch": 0.609258395522388, "grad_norm": 0.4734086638694241, "learning_rate": 2.131215378837771e-05, "loss": 0.5218, "step": 5225 }, { "epoch": 0.6098414179104478, "grad_norm": 0.4738565708182124, "learning_rate": 2.1270456602958332e-05, "loss": 0.5322, "step": 5230 }, { "epoch": 0.6104244402985075, "grad_norm": 0.49359869399689404, "learning_rate": 2.1228782575504447e-05, "loss": 0.537, "step": 5235 }, { "epoch": 0.6110074626865671, "grad_norm": 0.47548423763939335, "learning_rate": 2.1187131860936845e-05, "loss": 0.5183, "step": 5240 }, { "epoch": 0.6115904850746269, "grad_norm": 0.6813270131879138, "learning_rate": 2.114550461408961e-05, "loss": 0.5097, "step": 5245 }, { "epoch": 0.6121735074626866, "grad_norm": 0.4858514966177657, "learning_rate": 2.1103900989709623e-05, "loss": 0.5267, "step": 5250 }, { "epoch": 0.6127565298507462, "grad_norm": 0.46679943200730223, "learning_rate": 2.1062321142455953e-05, "loss": 0.5187, "step": 5255 }, { "epoch": 0.613339552238806, "grad_norm": 0.43680321948998807, "learning_rate": 2.1020765226899257e-05, "loss": 0.5072, "step": 5260 }, { "epoch": 0.6139225746268657, "grad_norm": 0.48765723203673156, "learning_rate": 2.0979233397521237e-05, "loss": 0.5172, "step": 5265 }, { "epoch": 0.6145055970149254, "grad_norm": 0.5421523270806327, "learning_rate": 2.0937725808714037e-05, "loss": 0.5173, "step": 5270 }, { "epoch": 0.6150886194029851, "grad_norm": 0.4821842656660482, "learning_rate": 2.089624261477973e-05, "loss": 0.522, "step": 5275 }, { "epoch": 0.6156716417910447, "grad_norm": 0.473993764265411, "learning_rate": 2.0854783969929668e-05, "loss": 0.5394, "step": 5280 }, { "epoch": 0.6162546641791045, "grad_norm": 0.4419301919043876, "learning_rate": 2.0813350028283958e-05, "loss": 0.5152, "step": 5285 }, { "epoch": 0.6168376865671642, "grad_norm": 0.4672525694912507, "learning_rate": 2.0771940943870866e-05, "loss": 0.5195, "step": 5290 }, { "epoch": 0.6174207089552238, "grad_norm": 0.4420478828415689, "learning_rate": 2.073055687062625e-05, "loss": 0.5131, "step": 5295 }, { "epoch": 0.6180037313432836, "grad_norm": 0.4626362183961214, "learning_rate": 2.0689197962393007e-05, "loss": 0.5162, "step": 5300 }, { "epoch": 0.6185867537313433, "grad_norm": 0.49784203270276733, "learning_rate": 2.0647864372920472e-05, "loss": 0.5279, "step": 5305 }, { "epoch": 0.6191697761194029, "grad_norm": 0.454384681268438, "learning_rate": 2.0606556255863862e-05, "loss": 0.5193, "step": 5310 }, { "epoch": 0.6197527985074627, "grad_norm": 0.549168830900785, "learning_rate": 2.05652737647837e-05, "loss": 0.508, "step": 5315 }, { "epoch": 0.6203358208955224, "grad_norm": 0.4790027663801494, "learning_rate": 2.0524017053145238e-05, "loss": 0.5009, "step": 5320 }, { "epoch": 0.620918843283582, "grad_norm": 0.4858133125717372, "learning_rate": 2.0482786274317923e-05, "loss": 0.5257, "step": 5325 }, { "epoch": 0.6215018656716418, "grad_norm": 0.4686160974815818, "learning_rate": 2.0441581581574765e-05, "loss": 0.5006, "step": 5330 }, { "epoch": 0.6220848880597015, "grad_norm": 0.4623666153609126, "learning_rate": 2.0400403128091812e-05, "loss": 0.5169, "step": 5335 }, { "epoch": 0.6226679104477612, "grad_norm": 0.45877847406013955, "learning_rate": 2.0359251066947583e-05, "loss": 0.5334, "step": 5340 }, { "epoch": 0.6232509328358209, "grad_norm": 0.4835869467134031, "learning_rate": 2.0318125551122468e-05, "loss": 0.4976, "step": 5345 }, { "epoch": 0.6238339552238806, "grad_norm": 0.4575233875191603, "learning_rate": 2.027702673349818e-05, "loss": 0.5078, "step": 5350 }, { "epoch": 0.6244169776119403, "grad_norm": 0.42998314282841227, "learning_rate": 2.023595476685718e-05, "loss": 0.5217, "step": 5355 }, { "epoch": 0.625, "grad_norm": 0.40746461264097383, "learning_rate": 2.0194909803882128e-05, "loss": 0.4901, "step": 5360 }, { "epoch": 0.6255830223880597, "grad_norm": 0.4597436174722755, "learning_rate": 2.0153891997155282e-05, "loss": 0.5494, "step": 5365 }, { "epoch": 0.6261660447761194, "grad_norm": 0.47770020364059385, "learning_rate": 2.011290149915795e-05, "loss": 0.5137, "step": 5370 }, { "epoch": 0.6267490671641791, "grad_norm": 0.4282249414588399, "learning_rate": 2.0071938462269936e-05, "loss": 0.4793, "step": 5375 }, { "epoch": 0.6273320895522388, "grad_norm": 0.4509595314631354, "learning_rate": 2.0031003038768942e-05, "loss": 0.498, "step": 5380 }, { "epoch": 0.6279151119402985, "grad_norm": 0.47714728709347026, "learning_rate": 1.999009538083003e-05, "loss": 0.545, "step": 5385 }, { "epoch": 0.6284981343283582, "grad_norm": 0.47392502530376346, "learning_rate": 1.994921564052503e-05, "loss": 0.5133, "step": 5390 }, { "epoch": 0.629081156716418, "grad_norm": 0.44759229419435465, "learning_rate": 1.990836396982202e-05, "loss": 0.5262, "step": 5395 }, { "epoch": 0.6296641791044776, "grad_norm": 0.4912367043364836, "learning_rate": 1.9867540520584693e-05, "loss": 0.5346, "step": 5400 }, { "epoch": 0.6302472014925373, "grad_norm": 0.4501000804982757, "learning_rate": 1.9826745444571853e-05, "loss": 0.5019, "step": 5405 }, { "epoch": 0.6308302238805971, "grad_norm": 0.48836277098183645, "learning_rate": 1.978597889343683e-05, "loss": 0.5289, "step": 5410 }, { "epoch": 0.6314132462686567, "grad_norm": 0.4625223996087579, "learning_rate": 1.97452410187269e-05, "loss": 0.4958, "step": 5415 }, { "epoch": 0.6319962686567164, "grad_norm": 0.48092056548731926, "learning_rate": 1.970453197188275e-05, "loss": 0.5015, "step": 5420 }, { "epoch": 0.6325792910447762, "grad_norm": 0.41113077299689604, "learning_rate": 1.9663851904237903e-05, "loss": 0.4989, "step": 5425 }, { "epoch": 0.6331623134328358, "grad_norm": 0.43567184997194336, "learning_rate": 1.9623200967018134e-05, "loss": 0.5066, "step": 5430 }, { "epoch": 0.6337453358208955, "grad_norm": 0.480598507617135, "learning_rate": 1.9582579311340943e-05, "loss": 0.5476, "step": 5435 }, { "epoch": 0.6343283582089553, "grad_norm": 0.4632420890807915, "learning_rate": 1.9541987088214963e-05, "loss": 0.514, "step": 5440 }, { "epoch": 0.6349113805970149, "grad_norm": 0.461886176881567, "learning_rate": 1.9501424448539445e-05, "loss": 0.4993, "step": 5445 }, { "epoch": 0.6354944029850746, "grad_norm": 0.4863359836422005, "learning_rate": 1.946089154310364e-05, "loss": 0.5349, "step": 5450 }, { "epoch": 0.6360774253731343, "grad_norm": 0.44233037285547316, "learning_rate": 1.9420388522586242e-05, "loss": 0.5036, "step": 5455 }, { "epoch": 0.636660447761194, "grad_norm": 0.4399036507814764, "learning_rate": 1.937991553755491e-05, "loss": 0.5198, "step": 5460 }, { "epoch": 0.6372434701492538, "grad_norm": 0.4644842288483705, "learning_rate": 1.9339472738465604e-05, "loss": 0.468, "step": 5465 }, { "epoch": 0.6378264925373134, "grad_norm": 0.5021801630911482, "learning_rate": 1.929906027566207e-05, "loss": 0.5046, "step": 5470 }, { "epoch": 0.6384095149253731, "grad_norm": 0.4906231561063412, "learning_rate": 1.9258678299375287e-05, "loss": 0.5637, "step": 5475 }, { "epoch": 0.6389925373134329, "grad_norm": 0.4411840007547938, "learning_rate": 1.9218326959722915e-05, "loss": 0.4941, "step": 5480 }, { "epoch": 0.6395755597014925, "grad_norm": 0.44492116681266264, "learning_rate": 1.9178006406708716e-05, "loss": 0.4999, "step": 5485 }, { "epoch": 0.6401585820895522, "grad_norm": 0.45899402350391216, "learning_rate": 1.913771679022199e-05, "loss": 0.5143, "step": 5490 }, { "epoch": 0.640741604477612, "grad_norm": 0.5002130360013761, "learning_rate": 1.9097458260037055e-05, "loss": 0.5223, "step": 5495 }, { "epoch": 0.6413246268656716, "grad_norm": 0.4445252627269499, "learning_rate": 1.9057230965812652e-05, "loss": 0.5277, "step": 5500 }, { "epoch": 0.6419076492537313, "grad_norm": 0.4327961966702548, "learning_rate": 1.901703505709141e-05, "loss": 0.4689, "step": 5505 }, { "epoch": 0.6424906716417911, "grad_norm": 0.47514425543756855, "learning_rate": 1.897687068329928e-05, "loss": 0.499, "step": 5510 }, { "epoch": 0.6430736940298507, "grad_norm": 0.4544747659785093, "learning_rate": 1.8936737993744996e-05, "loss": 0.5026, "step": 5515 }, { "epoch": 0.6436567164179104, "grad_norm": 0.45829680386820026, "learning_rate": 1.8896637137619495e-05, "loss": 0.5066, "step": 5520 }, { "epoch": 0.6442397388059702, "grad_norm": 0.42928587523633865, "learning_rate": 1.8856568263995373e-05, "loss": 0.4651, "step": 5525 }, { "epoch": 0.6448227611940298, "grad_norm": 0.4785710795086254, "learning_rate": 1.8816531521826346e-05, "loss": 0.5118, "step": 5530 }, { "epoch": 0.6454057835820896, "grad_norm": 0.44984484126810564, "learning_rate": 1.8776527059946676e-05, "loss": 0.4979, "step": 5535 }, { "epoch": 0.6459888059701493, "grad_norm": 0.5084605177342773, "learning_rate": 1.8736555027070607e-05, "loss": 0.513, "step": 5540 }, { "epoch": 0.6465718283582089, "grad_norm": 0.4587926020250396, "learning_rate": 1.8696615571791876e-05, "loss": 0.5056, "step": 5545 }, { "epoch": 0.6471548507462687, "grad_norm": 0.5265935639558714, "learning_rate": 1.865670884258307e-05, "loss": 0.5328, "step": 5550 }, { "epoch": 0.6477378731343284, "grad_norm": 0.48393501644987547, "learning_rate": 1.861683498779514e-05, "loss": 0.5409, "step": 5555 }, { "epoch": 0.648320895522388, "grad_norm": 0.42845129776042723, "learning_rate": 1.8576994155656814e-05, "loss": 0.5036, "step": 5560 }, { "epoch": 0.6489039179104478, "grad_norm": 0.45750371585274724, "learning_rate": 1.853718649427409e-05, "loss": 0.5458, "step": 5565 }, { "epoch": 0.6494869402985075, "grad_norm": 0.5071290594881412, "learning_rate": 1.8497412151629617e-05, "loss": 0.5574, "step": 5570 }, { "epoch": 0.6500699626865671, "grad_norm": 0.4193141376087725, "learning_rate": 1.8457671275582202e-05, "loss": 0.501, "step": 5575 }, { "epoch": 0.6506529850746269, "grad_norm": 0.5097091316108154, "learning_rate": 1.841796401386626e-05, "loss": 0.5424, "step": 5580 }, { "epoch": 0.6512360074626866, "grad_norm": 0.4574079642887348, "learning_rate": 1.8378290514091214e-05, "loss": 0.5143, "step": 5585 }, { "epoch": 0.6518190298507462, "grad_norm": 0.48232820582938274, "learning_rate": 1.8338650923740984e-05, "loss": 0.5068, "step": 5590 }, { "epoch": 0.652402052238806, "grad_norm": 0.4295823610973414, "learning_rate": 1.829904539017347e-05, "loss": 0.5076, "step": 5595 }, { "epoch": 0.6529850746268657, "grad_norm": 0.4893118320191642, "learning_rate": 1.8259474060619925e-05, "loss": 0.5219, "step": 5600 }, { "epoch": 0.6535680970149254, "grad_norm": 0.44539776731173597, "learning_rate": 1.8219937082184462e-05, "loss": 0.4935, "step": 5605 }, { "epoch": 0.6541511194029851, "grad_norm": 0.5129311723129513, "learning_rate": 1.8180434601843505e-05, "loss": 0.5179, "step": 5610 }, { "epoch": 0.6547341417910447, "grad_norm": 0.4864225115581459, "learning_rate": 1.8140966766445235e-05, "loss": 0.4969, "step": 5615 }, { "epoch": 0.6553171641791045, "grad_norm": 0.47007224857589525, "learning_rate": 1.8101533722709036e-05, "loss": 0.5005, "step": 5620 }, { "epoch": 0.6559001865671642, "grad_norm": 0.4597974424746557, "learning_rate": 1.8062135617224933e-05, "loss": 0.5294, "step": 5625 }, { "epoch": 0.6564832089552238, "grad_norm": 0.49917347832714665, "learning_rate": 1.802277259645313e-05, "loss": 0.5261, "step": 5630 }, { "epoch": 0.6570662313432836, "grad_norm": 0.46525260879023533, "learning_rate": 1.798344480672334e-05, "loss": 0.5119, "step": 5635 }, { "epoch": 0.6576492537313433, "grad_norm": 0.49319911982873976, "learning_rate": 1.7944152394234354e-05, "loss": 0.4884, "step": 5640 }, { "epoch": 0.6582322761194029, "grad_norm": 0.5132851806324479, "learning_rate": 1.7904895505053405e-05, "loss": 0.5158, "step": 5645 }, { "epoch": 0.6588152985074627, "grad_norm": 0.47569949694794605, "learning_rate": 1.7865674285115735e-05, "loss": 0.5121, "step": 5650 }, { "epoch": 0.6593983208955224, "grad_norm": 0.44552199263042797, "learning_rate": 1.7826488880223913e-05, "loss": 0.4761, "step": 5655 }, { "epoch": 0.659981343283582, "grad_norm": 0.47720149730201705, "learning_rate": 1.778733943604743e-05, "loss": 0.5124, "step": 5660 }, { "epoch": 0.6605643656716418, "grad_norm": 0.4432509370944976, "learning_rate": 1.774822609812205e-05, "loss": 0.4711, "step": 5665 }, { "epoch": 0.6611473880597015, "grad_norm": 0.4450343141290362, "learning_rate": 1.7709149011849364e-05, "loss": 0.4761, "step": 5670 }, { "epoch": 0.6617304104477612, "grad_norm": 0.4797782997827445, "learning_rate": 1.767010832249613e-05, "loss": 0.5175, "step": 5675 }, { "epoch": 0.6623134328358209, "grad_norm": 0.47463931278850086, "learning_rate": 1.7631104175193863e-05, "loss": 0.5208, "step": 5680 }, { "epoch": 0.6628964552238806, "grad_norm": 0.49271476148028914, "learning_rate": 1.7592136714938206e-05, "loss": 0.522, "step": 5685 }, { "epoch": 0.6634794776119403, "grad_norm": 0.4366587132398326, "learning_rate": 1.755320608658844e-05, "loss": 0.4709, "step": 5690 }, { "epoch": 0.6640625, "grad_norm": 0.44579614307494786, "learning_rate": 1.7514312434866904e-05, "loss": 0.5267, "step": 5695 }, { "epoch": 0.6646455223880597, "grad_norm": 0.4487339036982587, "learning_rate": 1.747545590435848e-05, "loss": 0.4991, "step": 5700 }, { "epoch": 0.6652285447761194, "grad_norm": 0.4383027709136693, "learning_rate": 1.7436636639510082e-05, "loss": 0.5141, "step": 5705 }, { "epoch": 0.6658115671641791, "grad_norm": 0.4458649709251083, "learning_rate": 1.739785478463004e-05, "loss": 0.4921, "step": 5710 }, { "epoch": 0.6663945895522388, "grad_norm": 0.49612990301681104, "learning_rate": 1.735911048388768e-05, "loss": 0.5081, "step": 5715 }, { "epoch": 0.6669776119402985, "grad_norm": 0.4717384137501139, "learning_rate": 1.7320403881312665e-05, "loss": 0.4909, "step": 5720 }, { "epoch": 0.6675606343283582, "grad_norm": 0.5029540121644192, "learning_rate": 1.7281735120794555e-05, "loss": 0.5439, "step": 5725 }, { "epoch": 0.668143656716418, "grad_norm": 0.4643667414061285, "learning_rate": 1.7243104346082194e-05, "loss": 0.4794, "step": 5730 }, { "epoch": 0.6687266791044776, "grad_norm": 0.47417180490760813, "learning_rate": 1.720451170078328e-05, "loss": 0.4996, "step": 5735 }, { "epoch": 0.6693097014925373, "grad_norm": 0.4814261614829366, "learning_rate": 1.7165957328363703e-05, "loss": 0.5027, "step": 5740 }, { "epoch": 0.6698927238805971, "grad_norm": 0.45627638279169563, "learning_rate": 1.71274413721471e-05, "loss": 0.513, "step": 5745 }, { "epoch": 0.6704757462686567, "grad_norm": 0.4334633223796316, "learning_rate": 1.708896397531431e-05, "loss": 0.4659, "step": 5750 }, { "epoch": 0.6710587686567164, "grad_norm": 0.4579206868255898, "learning_rate": 1.7050525280902824e-05, "loss": 0.5158, "step": 5755 }, { "epoch": 0.6716417910447762, "grad_norm": 0.4386284661436938, "learning_rate": 1.701212543180623e-05, "loss": 0.5206, "step": 5760 }, { "epoch": 0.6722248134328358, "grad_norm": 0.45478097582209165, "learning_rate": 1.6973764570773766e-05, "loss": 0.4958, "step": 5765 }, { "epoch": 0.6728078358208955, "grad_norm": 0.49448818817184126, "learning_rate": 1.693544284040968e-05, "loss": 0.532, "step": 5770 }, { "epoch": 0.6733908582089553, "grad_norm": 0.44392640275278356, "learning_rate": 1.6897160383172794e-05, "loss": 0.5102, "step": 5775 }, { "epoch": 0.6739738805970149, "grad_norm": 0.4681803651785721, "learning_rate": 1.6858917341375893e-05, "loss": 0.5033, "step": 5780 }, { "epoch": 0.6745569029850746, "grad_norm": 0.5012783536207814, "learning_rate": 1.6820713857185296e-05, "loss": 0.5261, "step": 5785 }, { "epoch": 0.6751399253731343, "grad_norm": 0.5021117339189587, "learning_rate": 1.6782550072620208e-05, "loss": 0.4987, "step": 5790 }, { "epoch": 0.675722947761194, "grad_norm": 0.48367815094839645, "learning_rate": 1.674442612955229e-05, "loss": 0.5231, "step": 5795 }, { "epoch": 0.6763059701492538, "grad_norm": 0.42292867586088734, "learning_rate": 1.6706342169705092e-05, "loss": 0.4833, "step": 5800 }, { "epoch": 0.6768889925373134, "grad_norm": 0.5498079373194198, "learning_rate": 1.6668298334653504e-05, "loss": 0.5303, "step": 5805 }, { "epoch": 0.6774720149253731, "grad_norm": 0.45613249161600566, "learning_rate": 1.663029476582328e-05, "loss": 0.4946, "step": 5810 }, { "epoch": 0.6780550373134329, "grad_norm": 0.4300584687753966, "learning_rate": 1.659233160449048e-05, "loss": 0.4835, "step": 5815 }, { "epoch": 0.6786380597014925, "grad_norm": 0.47672064051122565, "learning_rate": 1.6554408991780958e-05, "loss": 0.5239, "step": 5820 }, { "epoch": 0.6792210820895522, "grad_norm": 0.4363678330399839, "learning_rate": 1.65165270686698e-05, "loss": 0.5095, "step": 5825 }, { "epoch": 0.679804104477612, "grad_norm": 0.4386976710048956, "learning_rate": 1.6478685975980867e-05, "loss": 0.5037, "step": 5830 }, { "epoch": 0.6803871268656716, "grad_norm": 0.4713692579068845, "learning_rate": 1.6440885854386223e-05, "loss": 0.4919, "step": 5835 }, { "epoch": 0.6809701492537313, "grad_norm": 0.4590294577914819, "learning_rate": 1.6403126844405627e-05, "loss": 0.5001, "step": 5840 }, { "epoch": 0.6815531716417911, "grad_norm": 0.48010469655748705, "learning_rate": 1.6365409086405982e-05, "loss": 0.523, "step": 5845 }, { "epoch": 0.6821361940298507, "grad_norm": 0.4572102553531434, "learning_rate": 1.6327732720600893e-05, "loss": 0.4981, "step": 5850 }, { "epoch": 0.6827192164179104, "grad_norm": 0.42279317577956194, "learning_rate": 1.6290097887050037e-05, "loss": 0.4824, "step": 5855 }, { "epoch": 0.6833022388059702, "grad_norm": 0.4672338547985673, "learning_rate": 1.6252504725658738e-05, "loss": 0.5276, "step": 5860 }, { "epoch": 0.6838852611940298, "grad_norm": 0.5057683989918992, "learning_rate": 1.6214953376177355e-05, "loss": 0.5108, "step": 5865 }, { "epoch": 0.6844682835820896, "grad_norm": 0.4285897123749069, "learning_rate": 1.617744397820089e-05, "loss": 0.4943, "step": 5870 }, { "epoch": 0.6850513059701493, "grad_norm": 0.47280662245843785, "learning_rate": 1.613997667116832e-05, "loss": 0.5127, "step": 5875 }, { "epoch": 0.6856343283582089, "grad_norm": 0.467214604689162, "learning_rate": 1.610255159436219e-05, "loss": 0.4953, "step": 5880 }, { "epoch": 0.6862173507462687, "grad_norm": 0.4452219391184069, "learning_rate": 1.6065168886908046e-05, "loss": 0.4957, "step": 5885 }, { "epoch": 0.6868003731343284, "grad_norm": 0.44301870809154087, "learning_rate": 1.6027828687773947e-05, "loss": 0.4975, "step": 5890 }, { "epoch": 0.687383395522388, "grad_norm": 0.4401277833271277, "learning_rate": 1.5990531135769885e-05, "loss": 0.5195, "step": 5895 }, { "epoch": 0.6879664179104478, "grad_norm": 2.0394489198103662, "learning_rate": 1.5953276369547356e-05, "loss": 0.4956, "step": 5900 }, { "epoch": 0.6885494402985075, "grad_norm": 0.46836588645884025, "learning_rate": 1.591606452759879e-05, "loss": 0.5327, "step": 5905 }, { "epoch": 0.6891324626865671, "grad_norm": 0.44598757398103644, "learning_rate": 1.587889574825705e-05, "loss": 0.4764, "step": 5910 }, { "epoch": 0.6897154850746269, "grad_norm": 0.49742861746687556, "learning_rate": 1.5841770169694895e-05, "loss": 0.5111, "step": 5915 }, { "epoch": 0.6902985074626866, "grad_norm": 0.47331933429323847, "learning_rate": 1.5804687929924522e-05, "loss": 0.5047, "step": 5920 }, { "epoch": 0.6908815298507462, "grad_norm": 0.4652913554765252, "learning_rate": 1.5767649166796995e-05, "loss": 0.5038, "step": 5925 }, { "epoch": 0.691464552238806, "grad_norm": 0.453898586144349, "learning_rate": 1.573065401800176e-05, "loss": 0.4755, "step": 5930 }, { "epoch": 0.6920475746268657, "grad_norm": 0.43523535343423403, "learning_rate": 1.569370262106615e-05, "loss": 0.553, "step": 5935 }, { "epoch": 0.6926305970149254, "grad_norm": 0.46238104794971113, "learning_rate": 1.5656795113354816e-05, "loss": 0.4496, "step": 5940 }, { "epoch": 0.6932136194029851, "grad_norm": 0.4612926915904917, "learning_rate": 1.5619931632069284e-05, "loss": 0.5039, "step": 5945 }, { "epoch": 0.6937966417910447, "grad_norm": 0.46826494703709465, "learning_rate": 1.5583112314247386e-05, "loss": 0.5022, "step": 5950 }, { "epoch": 0.6943796641791045, "grad_norm": 0.47987543937588256, "learning_rate": 1.5546337296762826e-05, "loss": 0.5097, "step": 5955 }, { "epoch": 0.6949626865671642, "grad_norm": 0.43334758409327906, "learning_rate": 1.5509606716324563e-05, "loss": 0.4943, "step": 5960 }, { "epoch": 0.6955457089552238, "grad_norm": 0.4681763383562353, "learning_rate": 1.547292070947641e-05, "loss": 0.5063, "step": 5965 }, { "epoch": 0.6961287313432836, "grad_norm": 0.4924802920273831, "learning_rate": 1.5436279412596466e-05, "loss": 0.5047, "step": 5970 }, { "epoch": 0.6967117537313433, "grad_norm": 0.5347314807147506, "learning_rate": 1.5399682961896627e-05, "loss": 0.5254, "step": 5975 }, { "epoch": 0.6972947761194029, "grad_norm": 0.46161765208167127, "learning_rate": 1.5363131493422045e-05, "loss": 0.5527, "step": 5980 }, { "epoch": 0.6978777985074627, "grad_norm": 0.481592995924184, "learning_rate": 1.5326625143050717e-05, "loss": 0.5093, "step": 5985 }, { "epoch": 0.6984608208955224, "grad_norm": 0.43698502229598946, "learning_rate": 1.5290164046492855e-05, "loss": 0.5113, "step": 5990 }, { "epoch": 0.699043843283582, "grad_norm": 0.45737599000837936, "learning_rate": 1.5253748339290478e-05, "loss": 0.5072, "step": 5995 }, { "epoch": 0.6996268656716418, "grad_norm": 0.4632305738785457, "learning_rate": 1.5217378156816836e-05, "loss": 0.5092, "step": 6000 }, { "epoch": 0.7002098880597015, "grad_norm": 0.5658113543744521, "learning_rate": 1.5181053634276005e-05, "loss": 0.4714, "step": 6005 }, { "epoch": 0.7007929104477612, "grad_norm": 0.5024081912461719, "learning_rate": 1.5144774906702261e-05, "loss": 0.5587, "step": 6010 }, { "epoch": 0.7013759328358209, "grad_norm": 0.4346747521893763, "learning_rate": 1.5108542108959666e-05, "loss": 0.4874, "step": 6015 }, { "epoch": 0.7019589552238806, "grad_norm": 0.48751850242768197, "learning_rate": 1.5072355375741564e-05, "loss": 0.5152, "step": 6020 }, { "epoch": 0.7025419776119403, "grad_norm": 0.4679030016933106, "learning_rate": 1.5036214841570002e-05, "loss": 0.5177, "step": 6025 }, { "epoch": 0.703125, "grad_norm": 0.43192813973658906, "learning_rate": 1.500012064079533e-05, "loss": 0.5381, "step": 6030 }, { "epoch": 0.7037080223880597, "grad_norm": 0.47334729350720534, "learning_rate": 1.4964072907595633e-05, "loss": 0.5428, "step": 6035 }, { "epoch": 0.7042910447761194, "grad_norm": 0.4625335667175621, "learning_rate": 1.4928071775976283e-05, "loss": 0.4965, "step": 6040 }, { "epoch": 0.7048740671641791, "grad_norm": 0.4837863609737646, "learning_rate": 1.489211737976937e-05, "loss": 0.5201, "step": 6045 }, { "epoch": 0.7054570895522388, "grad_norm": 0.45448831055589556, "learning_rate": 1.4856209852633282e-05, "loss": 0.4988, "step": 6050 }, { "epoch": 0.7060401119402985, "grad_norm": 0.4470633313561567, "learning_rate": 1.482034932805217e-05, "loss": 0.4923, "step": 6055 }, { "epoch": 0.7066231343283582, "grad_norm": 0.46146957236249214, "learning_rate": 1.478453593933545e-05, "loss": 0.4966, "step": 6060 }, { "epoch": 0.707206156716418, "grad_norm": 0.44846726296477046, "learning_rate": 1.4748769819617291e-05, "loss": 0.5121, "step": 6065 }, { "epoch": 0.7077891791044776, "grad_norm": 0.445769696660708, "learning_rate": 1.47130511018562e-05, "loss": 0.49, "step": 6070 }, { "epoch": 0.7083722014925373, "grad_norm": 0.44424913913430536, "learning_rate": 1.4677379918834408e-05, "loss": 0.5154, "step": 6075 }, { "epoch": 0.7089552238805971, "grad_norm": 0.5025893980492081, "learning_rate": 1.464175640315748e-05, "loss": 0.4886, "step": 6080 }, { "epoch": 0.7095382462686567, "grad_norm": 0.4287175183865977, "learning_rate": 1.460618068725374e-05, "loss": 0.4943, "step": 6085 }, { "epoch": 0.7101212686567164, "grad_norm": 0.46657717095673207, "learning_rate": 1.4570652903373877e-05, "loss": 0.4866, "step": 6090 }, { "epoch": 0.7107042910447762, "grad_norm": 0.44658175711235154, "learning_rate": 1.453517318359034e-05, "loss": 0.4952, "step": 6095 }, { "epoch": 0.7112873134328358, "grad_norm": 0.5109217950300224, "learning_rate": 1.4499741659796927e-05, "loss": 0.5495, "step": 6100 }, { "epoch": 0.7118703358208955, "grad_norm": 0.439976204898742, "learning_rate": 1.4464358463708277e-05, "loss": 0.4947, "step": 6105 }, { "epoch": 0.7124533582089553, "grad_norm": 0.442606078185802, "learning_rate": 1.442902372685937e-05, "loss": 0.5125, "step": 6110 }, { "epoch": 0.7130363805970149, "grad_norm": 0.5113814435199087, "learning_rate": 1.4393737580605019e-05, "loss": 0.5397, "step": 6115 }, { "epoch": 0.7136194029850746, "grad_norm": 0.4292248926969336, "learning_rate": 1.435850015611943e-05, "loss": 0.5038, "step": 6120 }, { "epoch": 0.7142024253731343, "grad_norm": 0.4436015934650438, "learning_rate": 1.432331158439568e-05, "loss": 0.5015, "step": 6125 }, { "epoch": 0.714785447761194, "grad_norm": 0.4679393946330053, "learning_rate": 1.4288171996245247e-05, "loss": 0.4961, "step": 6130 }, { "epoch": 0.7153684701492538, "grad_norm": 0.46822261278058663, "learning_rate": 1.425308152229749e-05, "loss": 0.5565, "step": 6135 }, { "epoch": 0.7159514925373134, "grad_norm": 0.48866887191385927, "learning_rate": 1.4218040292999221e-05, "loss": 0.531, "step": 6140 }, { "epoch": 0.7165345149253731, "grad_norm": 0.4487435192453521, "learning_rate": 1.4183048438614166e-05, "loss": 0.4994, "step": 6145 }, { "epoch": 0.7171175373134329, "grad_norm": 0.5036947554976395, "learning_rate": 1.4148106089222513e-05, "loss": 0.519, "step": 6150 }, { "epoch": 0.7177005597014925, "grad_norm": 0.49765078956448927, "learning_rate": 1.4113213374720425e-05, "loss": 0.4825, "step": 6155 }, { "epoch": 0.7182835820895522, "grad_norm": 0.49796549587751004, "learning_rate": 1.4078370424819515e-05, "loss": 0.5142, "step": 6160 }, { "epoch": 0.718866604477612, "grad_norm": 0.46661776742591504, "learning_rate": 1.404357736904645e-05, "loss": 0.5121, "step": 6165 }, { "epoch": 0.7194496268656716, "grad_norm": 0.4206361496368435, "learning_rate": 1.4008834336742366e-05, "loss": 0.5043, "step": 6170 }, { "epoch": 0.7200326492537313, "grad_norm": 0.49140533816343374, "learning_rate": 1.3974141457062498e-05, "loss": 0.5465, "step": 6175 }, { "epoch": 0.7206156716417911, "grad_norm": 0.4730841055221398, "learning_rate": 1.3939498858975584e-05, "loss": 0.496, "step": 6180 }, { "epoch": 0.7211986940298507, "grad_norm": 0.5165044754977881, "learning_rate": 1.390490667126348e-05, "loss": 0.5699, "step": 6185 }, { "epoch": 0.7217817164179104, "grad_norm": 0.4574833927556042, "learning_rate": 1.3870365022520627e-05, "loss": 0.494, "step": 6190 }, { "epoch": 0.7223647388059702, "grad_norm": 0.4797480787975331, "learning_rate": 1.3835874041153607e-05, "loss": 0.5, "step": 6195 }, { "epoch": 0.7229477611940298, "grad_norm": 0.5266171413115511, "learning_rate": 1.380143385538063e-05, "loss": 0.5248, "step": 6200 }, { "epoch": 0.7235307835820896, "grad_norm": 0.4787651159553734, "learning_rate": 1.3767044593231082e-05, "loss": 0.4999, "step": 6205 }, { "epoch": 0.7241138059701493, "grad_norm": 0.42275543868072146, "learning_rate": 1.3732706382545054e-05, "loss": 0.4731, "step": 6210 }, { "epoch": 0.7246968283582089, "grad_norm": 0.48666619671292066, "learning_rate": 1.3698419350972851e-05, "loss": 0.5242, "step": 6215 }, { "epoch": 0.7252798507462687, "grad_norm": 0.41898376072456794, "learning_rate": 1.3664183625974503e-05, "loss": 0.5104, "step": 6220 }, { "epoch": 0.7258628731343284, "grad_norm": 0.4929104768550004, "learning_rate": 1.362999933481935e-05, "loss": 0.5206, "step": 6225 }, { "epoch": 0.726445895522388, "grad_norm": 0.4308013069303933, "learning_rate": 1.3595866604585492e-05, "loss": 0.5074, "step": 6230 }, { "epoch": 0.7270289179104478, "grad_norm": 0.4421936343319682, "learning_rate": 1.3561785562159374e-05, "loss": 0.4808, "step": 6235 }, { "epoch": 0.7276119402985075, "grad_norm": 0.4980669606106666, "learning_rate": 1.3527756334235288e-05, "loss": 0.4746, "step": 6240 }, { "epoch": 0.7281949626865671, "grad_norm": 0.4364859749653744, "learning_rate": 1.3493779047314925e-05, "loss": 0.4967, "step": 6245 }, { "epoch": 0.7287779850746269, "grad_norm": 0.4424531855538569, "learning_rate": 1.3459853827706853e-05, "loss": 0.4962, "step": 6250 }, { "epoch": 0.7293610074626866, "grad_norm": 0.5043638790699465, "learning_rate": 1.3425980801526118e-05, "loss": 0.5095, "step": 6255 }, { "epoch": 0.7299440298507462, "grad_norm": 0.464384451619953, "learning_rate": 1.3392160094693724e-05, "loss": 0.5008, "step": 6260 }, { "epoch": 0.730527052238806, "grad_norm": 0.44362217621259836, "learning_rate": 1.3358391832936174e-05, "loss": 0.4965, "step": 6265 }, { "epoch": 0.7311100746268657, "grad_norm": 0.4092013778366071, "learning_rate": 1.3324676141785029e-05, "loss": 0.5133, "step": 6270 }, { "epoch": 0.7316930970149254, "grad_norm": 0.4350575485273892, "learning_rate": 1.3291013146576403e-05, "loss": 0.5128, "step": 6275 }, { "epoch": 0.7322761194029851, "grad_norm": 0.4744197561421012, "learning_rate": 1.3257402972450539e-05, "loss": 0.4784, "step": 6280 }, { "epoch": 0.7328591417910447, "grad_norm": 0.4633376651924879, "learning_rate": 1.3223845744351287e-05, "loss": 0.475, "step": 6285 }, { "epoch": 0.7334421641791045, "grad_norm": 0.47064248305550177, "learning_rate": 1.3190341587025698e-05, "loss": 0.5147, "step": 6290 }, { "epoch": 0.7340251865671642, "grad_norm": 0.4745007087563231, "learning_rate": 1.3156890625023532e-05, "loss": 0.5131, "step": 6295 }, { "epoch": 0.7346082089552238, "grad_norm": 0.47910451161364465, "learning_rate": 1.3123492982696806e-05, "loss": 0.5125, "step": 6300 }, { "epoch": 0.7351912313432836, "grad_norm": 0.45150153399392984, "learning_rate": 1.3090148784199288e-05, "loss": 0.5195, "step": 6305 }, { "epoch": 0.7357742537313433, "grad_norm": 0.4666852788990984, "learning_rate": 1.305685815348613e-05, "loss": 0.5348, "step": 6310 }, { "epoch": 0.7363572761194029, "grad_norm": 0.43969109543694684, "learning_rate": 1.3023621214313289e-05, "loss": 0.4887, "step": 6315 }, { "epoch": 0.7369402985074627, "grad_norm": 0.4520730263820748, "learning_rate": 1.2990438090237167e-05, "loss": 0.5174, "step": 6320 }, { "epoch": 0.7375233208955224, "grad_norm": 0.4531016913171499, "learning_rate": 1.2957308904614099e-05, "loss": 0.513, "step": 6325 }, { "epoch": 0.738106343283582, "grad_norm": 0.43990578576591227, "learning_rate": 1.2924233780599915e-05, "loss": 0.469, "step": 6330 }, { "epoch": 0.7386893656716418, "grad_norm": 0.44084700165618407, "learning_rate": 1.2891212841149447e-05, "loss": 0.4997, "step": 6335 }, { "epoch": 0.7392723880597015, "grad_norm": 0.49100928132633287, "learning_rate": 1.2858246209016128e-05, "loss": 0.5187, "step": 6340 }, { "epoch": 0.7398554104477612, "grad_norm": 1.5928599989218044, "learning_rate": 1.2825334006751493e-05, "loss": 0.4954, "step": 6345 }, { "epoch": 0.7404384328358209, "grad_norm": 0.4278160829593741, "learning_rate": 1.2792476356704759e-05, "loss": 0.462, "step": 6350 }, { "epoch": 0.7410214552238806, "grad_norm": 0.45173765932520604, "learning_rate": 1.2759673381022305e-05, "loss": 0.5198, "step": 6355 }, { "epoch": 0.7416044776119403, "grad_norm": 0.46060933457902786, "learning_rate": 1.27269252016473e-05, "loss": 0.501, "step": 6360 }, { "epoch": 0.7421875, "grad_norm": 0.5194412989177508, "learning_rate": 1.2694231940319192e-05, "loss": 0.4862, "step": 6365 }, { "epoch": 0.7427705223880597, "grad_norm": 0.46637710292865725, "learning_rate": 1.2661593718573294e-05, "loss": 0.4913, "step": 6370 }, { "epoch": 0.7433535447761194, "grad_norm": 0.49083348251147924, "learning_rate": 1.2629010657740275e-05, "loss": 0.5073, "step": 6375 }, { "epoch": 0.7439365671641791, "grad_norm": 0.476732025190355, "learning_rate": 1.2596482878945787e-05, "loss": 0.5343, "step": 6380 }, { "epoch": 0.7445195895522388, "grad_norm": 0.43400279583829604, "learning_rate": 1.2564010503109952e-05, "loss": 0.4952, "step": 6385 }, { "epoch": 0.7451026119402985, "grad_norm": 0.4474931990549133, "learning_rate": 1.2531593650946932e-05, "loss": 0.4966, "step": 6390 }, { "epoch": 0.7456856343283582, "grad_norm": 0.4746293119307018, "learning_rate": 1.2499232442964506e-05, "loss": 0.5241, "step": 6395 }, { "epoch": 0.746268656716418, "grad_norm": 0.4596957874880727, "learning_rate": 1.2466926999463575e-05, "loss": 0.4931, "step": 6400 }, { "epoch": 0.7468516791044776, "grad_norm": 0.48590988974018173, "learning_rate": 1.2434677440537745e-05, "loss": 0.5498, "step": 6405 }, { "epoch": 0.7474347014925373, "grad_norm": 0.43144811701119584, "learning_rate": 1.2402483886072883e-05, "loss": 0.4673, "step": 6410 }, { "epoch": 0.7480177238805971, "grad_norm": 0.4694370814956435, "learning_rate": 1.237034645574666e-05, "loss": 0.5173, "step": 6415 }, { "epoch": 0.7486007462686567, "grad_norm": 0.5215126950510659, "learning_rate": 1.233826526902809e-05, "loss": 0.5413, "step": 6420 }, { "epoch": 0.7491837686567164, "grad_norm": 0.4667853644040054, "learning_rate": 1.230624044517713e-05, "loss": 0.5206, "step": 6425 }, { "epoch": 0.7497667910447762, "grad_norm": 0.49822013134556736, "learning_rate": 1.2274272103244201e-05, "loss": 0.5264, "step": 6430 }, { "epoch": 0.7503498134328358, "grad_norm": 0.4924144531446238, "learning_rate": 1.2242360362069763e-05, "loss": 0.4843, "step": 6435 }, { "epoch": 0.7509328358208955, "grad_norm": 0.47214741828129897, "learning_rate": 1.2210505340283838e-05, "loss": 0.529, "step": 6440 }, { "epoch": 0.7515158582089553, "grad_norm": 0.4726759175626485, "learning_rate": 1.2178707156305644e-05, "loss": 0.4993, "step": 6445 }, { "epoch": 0.7520988805970149, "grad_norm": 0.47497735759010773, "learning_rate": 1.2146965928343062e-05, "loss": 0.4923, "step": 6450 }, { "epoch": 0.7526819029850746, "grad_norm": 0.5282112383115118, "learning_rate": 1.2115281774392278e-05, "loss": 0.5043, "step": 6455 }, { "epoch": 0.7532649253731343, "grad_norm": 0.4289024052469388, "learning_rate": 1.208365481223727e-05, "loss": 0.4915, "step": 6460 }, { "epoch": 0.753847947761194, "grad_norm": 0.4527587796867484, "learning_rate": 1.2052085159449455e-05, "loss": 0.491, "step": 6465 }, { "epoch": 0.7544309701492538, "grad_norm": 0.46413997296204734, "learning_rate": 1.202057293338717e-05, "loss": 0.5207, "step": 6470 }, { "epoch": 0.7550139925373134, "grad_norm": 0.47431690969030404, "learning_rate": 1.1989118251195284e-05, "loss": 0.4807, "step": 6475 }, { "epoch": 0.7555970149253731, "grad_norm": 0.6124168305417785, "learning_rate": 1.1957721229804761e-05, "loss": 0.4909, "step": 6480 }, { "epoch": 0.7561800373134329, "grad_norm": 0.4612146149691418, "learning_rate": 1.1926381985932186e-05, "loss": 0.4912, "step": 6485 }, { "epoch": 0.7567630597014925, "grad_norm": 0.4602032665320846, "learning_rate": 1.1895100636079387e-05, "loss": 0.5287, "step": 6490 }, { "epoch": 0.7573460820895522, "grad_norm": 0.44850719377837156, "learning_rate": 1.186387729653296e-05, "loss": 0.5147, "step": 6495 }, { "epoch": 0.757929104477612, "grad_norm": 0.43547527328367247, "learning_rate": 1.1832712083363865e-05, "loss": 0.4774, "step": 6500 }, { "epoch": 0.7585121268656716, "grad_norm": 0.48648908094016996, "learning_rate": 1.1801605112426953e-05, "loss": 0.4948, "step": 6505 }, { "epoch": 0.7590951492537313, "grad_norm": 0.4362070385676467, "learning_rate": 1.1770556499360593e-05, "loss": 0.4768, "step": 6510 }, { "epoch": 0.7596781716417911, "grad_norm": 0.4594448702342764, "learning_rate": 1.1739566359586195e-05, "loss": 0.5192, "step": 6515 }, { "epoch": 0.7602611940298507, "grad_norm": 0.4895001977535846, "learning_rate": 1.170863480830781e-05, "loss": 0.5029, "step": 6520 }, { "epoch": 0.7608442164179104, "grad_norm": 0.4811285447973769, "learning_rate": 1.167776196051166e-05, "loss": 0.5089, "step": 6525 }, { "epoch": 0.7614272388059702, "grad_norm": 0.4169504765893255, "learning_rate": 1.1646947930965795e-05, "loss": 0.4477, "step": 6530 }, { "epoch": 0.7620102611940298, "grad_norm": 0.4342791005744433, "learning_rate": 1.1616192834219553e-05, "loss": 0.4957, "step": 6535 }, { "epoch": 0.7625932835820896, "grad_norm": 0.500766952555154, "learning_rate": 1.1585496784603234e-05, "loss": 0.474, "step": 6540 }, { "epoch": 0.7631763059701493, "grad_norm": 0.459834987252376, "learning_rate": 1.15548598962276e-05, "loss": 0.4653, "step": 6545 }, { "epoch": 0.7637593283582089, "grad_norm": 0.4634667523018127, "learning_rate": 1.1524282282983526e-05, "loss": 0.4952, "step": 6550 }, { "epoch": 0.7643423507462687, "grad_norm": 0.48306231366385766, "learning_rate": 1.1493764058541493e-05, "loss": 0.5092, "step": 6555 }, { "epoch": 0.7649253731343284, "grad_norm": 0.4412886363099704, "learning_rate": 1.1463305336351233e-05, "loss": 0.4836, "step": 6560 }, { "epoch": 0.765508395522388, "grad_norm": 0.4594326939876668, "learning_rate": 1.143290622964128e-05, "loss": 0.4858, "step": 6565 }, { "epoch": 0.7660914179104478, "grad_norm": 0.495560886941702, "learning_rate": 1.1402566851418545e-05, "loss": 0.484, "step": 6570 }, { "epoch": 0.7666744402985075, "grad_norm": 0.44698231009276274, "learning_rate": 1.1372287314467896e-05, "loss": 0.4938, "step": 6575 }, { "epoch": 0.7672574626865671, "grad_norm": 0.48489845254460334, "learning_rate": 1.1342067731351754e-05, "loss": 0.5349, "step": 6580 }, { "epoch": 0.7678404850746269, "grad_norm": 0.3983706507650223, "learning_rate": 1.1311908214409666e-05, "loss": 0.4916, "step": 6585 }, { "epoch": 0.7684235074626866, "grad_norm": 0.4829203676168014, "learning_rate": 1.128180887575789e-05, "loss": 0.5538, "step": 6590 }, { "epoch": 0.7690065298507462, "grad_norm": 0.5181252743781259, "learning_rate": 1.1251769827288953e-05, "loss": 0.5103, "step": 6595 }, { "epoch": 0.769589552238806, "grad_norm": 0.45245249797253495, "learning_rate": 1.122179118067128e-05, "loss": 0.4877, "step": 6600 }, { "epoch": 0.7701725746268657, "grad_norm": 0.45213952635934074, "learning_rate": 1.1191873047348743e-05, "loss": 0.4699, "step": 6605 }, { "epoch": 0.7707555970149254, "grad_norm": 0.4384946771009236, "learning_rate": 1.1162015538540268e-05, "loss": 0.5199, "step": 6610 }, { "epoch": 0.7713386194029851, "grad_norm": 1.4634272550071448, "learning_rate": 1.1132218765239417e-05, "loss": 0.5111, "step": 6615 }, { "epoch": 0.7719216417910447, "grad_norm": 0.4528179441810264, "learning_rate": 1.1102482838213945e-05, "loss": 0.5095, "step": 6620 }, { "epoch": 0.7725046641791045, "grad_norm": 0.48790948502665343, "learning_rate": 1.1072807868005438e-05, "loss": 0.5375, "step": 6625 }, { "epoch": 0.7730876865671642, "grad_norm": 0.43356020787990573, "learning_rate": 1.104319396492888e-05, "loss": 0.494, "step": 6630 }, { "epoch": 0.7736707089552238, "grad_norm": 0.46568426433428856, "learning_rate": 1.1013641239072233e-05, "loss": 0.5089, "step": 6635 }, { "epoch": 0.7742537313432836, "grad_norm": 0.5058210347479575, "learning_rate": 1.098414980029603e-05, "loss": 0.5278, "step": 6640 }, { "epoch": 0.7748367537313433, "grad_norm": 0.46834473900137596, "learning_rate": 1.0954719758232983e-05, "loss": 0.5183, "step": 6645 }, { "epoch": 0.7754197761194029, "grad_norm": 0.4485355699073728, "learning_rate": 1.092535122228757e-05, "loss": 0.5067, "step": 6650 }, { "epoch": 0.7760027985074627, "grad_norm": 0.44029418480334825, "learning_rate": 1.0896044301635616e-05, "loss": 0.4902, "step": 6655 }, { "epoch": 0.7765858208955224, "grad_norm": 0.46043398790080686, "learning_rate": 1.0866799105223877e-05, "loss": 0.4752, "step": 6660 }, { "epoch": 0.777168843283582, "grad_norm": 0.45411148412679725, "learning_rate": 1.0837615741769695e-05, "loss": 0.5027, "step": 6665 }, { "epoch": 0.7777518656716418, "grad_norm": 0.46002535963451235, "learning_rate": 1.0808494319760511e-05, "loss": 0.4818, "step": 6670 }, { "epoch": 0.7783348880597015, "grad_norm": 0.4613160618534238, "learning_rate": 1.0779434947453531e-05, "loss": 0.5305, "step": 6675 }, { "epoch": 0.7789179104477612, "grad_norm": 0.5192961001346327, "learning_rate": 1.0750437732875265e-05, "loss": 0.4909, "step": 6680 }, { "epoch": 0.7795009328358209, "grad_norm": 0.47703840671844866, "learning_rate": 1.0721502783821194e-05, "loss": 0.5433, "step": 6685 }, { "epoch": 0.7800839552238806, "grad_norm": 0.4027202848278968, "learning_rate": 1.0692630207855296e-05, "loss": 0.4795, "step": 6690 }, { "epoch": 0.7806669776119403, "grad_norm": 0.5254895186437003, "learning_rate": 1.0663820112309695e-05, "loss": 0.5234, "step": 6695 }, { "epoch": 0.78125, "grad_norm": 0.4714359353636831, "learning_rate": 1.0635072604284254e-05, "loss": 0.4837, "step": 6700 }, { "epoch": 0.7818330223880597, "grad_norm": 0.4409933416168129, "learning_rate": 1.0606387790646154e-05, "loss": 0.5124, "step": 6705 }, { "epoch": 0.7824160447761194, "grad_norm": 0.47666045358791953, "learning_rate": 1.0577765778029525e-05, "loss": 0.4762, "step": 6710 }, { "epoch": 0.7829990671641791, "grad_norm": 0.48287842316913426, "learning_rate": 1.0549206672835033e-05, "loss": 0.4879, "step": 6715 }, { "epoch": 0.7835820895522388, "grad_norm": 0.4494356289008418, "learning_rate": 1.0520710581229507e-05, "loss": 0.4816, "step": 6720 }, { "epoch": 0.7841651119402985, "grad_norm": 0.46052908698109446, "learning_rate": 1.049227760914549e-05, "loss": 0.4516, "step": 6725 }, { "epoch": 0.7847481343283582, "grad_norm": 0.4567314964584747, "learning_rate": 1.0463907862280913e-05, "loss": 0.4871, "step": 6730 }, { "epoch": 0.785331156716418, "grad_norm": 0.44868088537045625, "learning_rate": 1.043560144609866e-05, "loss": 0.4955, "step": 6735 }, { "epoch": 0.7859141791044776, "grad_norm": 0.4530134493607295, "learning_rate": 1.0407358465826198e-05, "loss": 0.5199, "step": 6740 }, { "epoch": 0.7864972014925373, "grad_norm": 0.4463551169201282, "learning_rate": 1.0379179026455136e-05, "loss": 0.4913, "step": 6745 }, { "epoch": 0.7870802238805971, "grad_norm": 0.47099610185569996, "learning_rate": 1.0351063232740937e-05, "loss": 0.5332, "step": 6750 }, { "epoch": 0.7876632462686567, "grad_norm": 0.4402447824147342, "learning_rate": 1.0323011189202408e-05, "loss": 0.5053, "step": 6755 }, { "epoch": 0.7882462686567164, "grad_norm": 0.45593057954899197, "learning_rate": 1.0295023000121404e-05, "loss": 0.474, "step": 6760 }, { "epoch": 0.7888292910447762, "grad_norm": 0.4374300903516831, "learning_rate": 1.0267098769542368e-05, "loss": 0.5427, "step": 6765 }, { "epoch": 0.7894123134328358, "grad_norm": 0.4216054919323253, "learning_rate": 1.0239238601272036e-05, "loss": 0.4862, "step": 6770 }, { "epoch": 0.7899953358208955, "grad_norm": 0.43838979602724326, "learning_rate": 1.0211442598878936e-05, "loss": 0.4697, "step": 6775 }, { "epoch": 0.7905783582089553, "grad_norm": 0.41927389582266983, "learning_rate": 1.0183710865693105e-05, "loss": 0.4731, "step": 6780 }, { "epoch": 0.7911613805970149, "grad_norm": 0.4618407627786652, "learning_rate": 1.0156043504805648e-05, "loss": 0.4946, "step": 6785 }, { "epoch": 0.7917444029850746, "grad_norm": 0.49476862297527385, "learning_rate": 1.0128440619068379e-05, "loss": 0.5218, "step": 6790 }, { "epoch": 0.7923274253731343, "grad_norm": 0.4878473893991779, "learning_rate": 1.0100902311093405e-05, "loss": 0.5127, "step": 6795 }, { "epoch": 0.792910447761194, "grad_norm": 0.44555429187543133, "learning_rate": 1.0073428683252788e-05, "loss": 0.4893, "step": 6800 }, { "epoch": 0.7934934701492538, "grad_norm": 0.44977845000528854, "learning_rate": 1.0046019837678153e-05, "loss": 0.4687, "step": 6805 }, { "epoch": 0.7940764925373134, "grad_norm": 0.447310487987635, "learning_rate": 1.001867587626029e-05, "loss": 0.484, "step": 6810 }, { "epoch": 0.7946595149253731, "grad_norm": 0.48144807868357464, "learning_rate": 9.991396900648774e-06, "loss": 0.5395, "step": 6815 }, { "epoch": 0.7952425373134329, "grad_norm": 0.46182249334458747, "learning_rate": 9.964183012251619e-06, "loss": 0.4914, "step": 6820 }, { "epoch": 0.7958255597014925, "grad_norm": 0.4389736266429193, "learning_rate": 9.937034312234872e-06, "loss": 0.4966, "step": 6825 }, { "epoch": 0.7964085820895522, "grad_norm": 0.4683635632125285, "learning_rate": 9.90995090152225e-06, "loss": 0.4872, "step": 6830 }, { "epoch": 0.796991604477612, "grad_norm": 0.449577983637824, "learning_rate": 9.88293288079476e-06, "loss": 0.4775, "step": 6835 }, { "epoch": 0.7975746268656716, "grad_norm": 0.4679257203680657, "learning_rate": 9.855980350490315e-06, "loss": 0.4628, "step": 6840 }, { "epoch": 0.7981576492537313, "grad_norm": 0.4910686168202571, "learning_rate": 9.82909341080339e-06, "loss": 0.516, "step": 6845 }, { "epoch": 0.7987406716417911, "grad_norm": 0.4328561795827866, "learning_rate": 9.802272161684601e-06, "loss": 0.5062, "step": 6850 }, { "epoch": 0.7993236940298507, "grad_norm": 0.48810830150277185, "learning_rate": 9.775516702840411e-06, "loss": 0.542, "step": 6855 }, { "epoch": 0.7999067164179104, "grad_norm": 0.4326359263896414, "learning_rate": 9.748827133732665e-06, "loss": 0.491, "step": 6860 }, { "epoch": 0.8004897388059702, "grad_norm": 0.45895357662154007, "learning_rate": 9.722203553578288e-06, "loss": 0.5017, "step": 6865 }, { "epoch": 0.8010727611940298, "grad_norm": 0.4201551976753478, "learning_rate": 9.695646061348892e-06, "loss": 0.5233, "step": 6870 }, { "epoch": 0.8016557835820896, "grad_norm": 0.4261865295483104, "learning_rate": 9.669154755770415e-06, "loss": 0.4615, "step": 6875 }, { "epoch": 0.8022388059701493, "grad_norm": 0.7140092170328386, "learning_rate": 9.642729735322733e-06, "loss": 0.5097, "step": 6880 }, { "epoch": 0.8028218283582089, "grad_norm": 0.4990283440949626, "learning_rate": 9.616371098239346e-06, "loss": 0.5716, "step": 6885 }, { "epoch": 0.8034048507462687, "grad_norm": 0.4481559993333328, "learning_rate": 9.590078942506933e-06, "loss": 0.4856, "step": 6890 }, { "epoch": 0.8039878731343284, "grad_norm": 0.4607158596942868, "learning_rate": 9.56385336586507e-06, "loss": 0.5107, "step": 6895 }, { "epoch": 0.804570895522388, "grad_norm": 0.476387734456041, "learning_rate": 9.537694465805797e-06, "loss": 0.4998, "step": 6900 }, { "epoch": 0.8051539179104478, "grad_norm": 0.46770623541362477, "learning_rate": 9.511602339573324e-06, "loss": 0.4998, "step": 6905 }, { "epoch": 0.8057369402985075, "grad_norm": 0.5086514392993742, "learning_rate": 9.485577084163604e-06, "loss": 0.5105, "step": 6910 }, { "epoch": 0.8063199626865671, "grad_norm": 0.5077908007862963, "learning_rate": 9.45961879632401e-06, "loss": 0.5116, "step": 6915 }, { "epoch": 0.8069029850746269, "grad_norm": 0.5463748621976762, "learning_rate": 9.43372757255297e-06, "loss": 0.4687, "step": 6920 }, { "epoch": 0.8074860074626866, "grad_norm": 0.4191317799403366, "learning_rate": 9.40790350909961e-06, "loss": 0.498, "step": 6925 }, { "epoch": 0.8080690298507462, "grad_norm": 0.4865303702117012, "learning_rate": 9.382146701963373e-06, "loss": 0.456, "step": 6930 }, { "epoch": 0.808652052238806, "grad_norm": 0.42674988798148156, "learning_rate": 9.356457246893695e-06, "loss": 0.5227, "step": 6935 }, { "epoch": 0.8092350746268657, "grad_norm": 0.46859664111243005, "learning_rate": 9.330835239389645e-06, "loss": 0.5018, "step": 6940 }, { "epoch": 0.8098180970149254, "grad_norm": 0.5581605519629456, "learning_rate": 9.305280774699531e-06, "loss": 0.4893, "step": 6945 }, { "epoch": 0.8104011194029851, "grad_norm": 0.4648844812652044, "learning_rate": 9.279793947820596e-06, "loss": 0.5034, "step": 6950 }, { "epoch": 0.8109841417910447, "grad_norm": 0.4348511386801257, "learning_rate": 9.254374853498636e-06, "loss": 0.476, "step": 6955 }, { "epoch": 0.8115671641791045, "grad_norm": 0.4471748979975722, "learning_rate": 9.229023586227666e-06, "loss": 0.4746, "step": 6960 }, { "epoch": 0.8121501865671642, "grad_norm": 0.49028467746657334, "learning_rate": 9.203740240249527e-06, "loss": 0.5075, "step": 6965 }, { "epoch": 0.8127332089552238, "grad_norm": 0.45156139381243476, "learning_rate": 9.178524909553617e-06, "loss": 0.4997, "step": 6970 }, { "epoch": 0.8133162313432836, "grad_norm": 0.4580984480401016, "learning_rate": 9.153377687876439e-06, "loss": 0.5098, "step": 6975 }, { "epoch": 0.8138992537313433, "grad_norm": 0.4417064208682211, "learning_rate": 9.128298668701341e-06, "loss": 0.4866, "step": 6980 }, { "epoch": 0.8144822761194029, "grad_norm": 0.4396784431419953, "learning_rate": 9.103287945258104e-06, "loss": 0.4843, "step": 6985 }, { "epoch": 0.8150652985074627, "grad_norm": 0.42781740173572586, "learning_rate": 9.078345610522662e-06, "loss": 0.4864, "step": 6990 }, { "epoch": 0.8156483208955224, "grad_norm": 0.4706194313899779, "learning_rate": 9.053471757216675e-06, "loss": 0.4829, "step": 6995 }, { "epoch": 0.816231343283582, "grad_norm": 0.4373005186110652, "learning_rate": 9.028666477807253e-06, "loss": 0.4946, "step": 7000 }, { "epoch": 0.8168143656716418, "grad_norm": 0.45822848787961307, "learning_rate": 9.003929864506583e-06, "loss": 0.4747, "step": 7005 }, { "epoch": 0.8173973880597015, "grad_norm": 0.4376578697732612, "learning_rate": 8.979262009271589e-06, "loss": 0.4982, "step": 7010 }, { "epoch": 0.8179804104477612, "grad_norm": 0.50115571632827, "learning_rate": 8.954663003803579e-06, "loss": 0.4934, "step": 7015 }, { "epoch": 0.8185634328358209, "grad_norm": 0.44219752315988364, "learning_rate": 8.930132939547932e-06, "loss": 0.4663, "step": 7020 }, { "epoch": 0.8191464552238806, "grad_norm": 0.440108685069607, "learning_rate": 8.905671907693738e-06, "loss": 0.4856, "step": 7025 }, { "epoch": 0.8197294776119403, "grad_norm": 0.5246570100006753, "learning_rate": 8.881279999173466e-06, "loss": 0.5021, "step": 7030 }, { "epoch": 0.8203125, "grad_norm": 0.4319239074767847, "learning_rate": 8.856957304662602e-06, "loss": 0.5123, "step": 7035 }, { "epoch": 0.8208955223880597, "grad_norm": 0.4326829341676435, "learning_rate": 8.832703914579363e-06, "loss": 0.5021, "step": 7040 }, { "epoch": 0.8214785447761194, "grad_norm": 0.4380817083829351, "learning_rate": 8.80851991908431e-06, "loss": 0.5044, "step": 7045 }, { "epoch": 0.8220615671641791, "grad_norm": 0.4637369898587162, "learning_rate": 8.784405408080046e-06, "loss": 0.4852, "step": 7050 }, { "epoch": 0.8226445895522388, "grad_norm": 0.4796910879347511, "learning_rate": 8.760360471210865e-06, "loss": 0.5338, "step": 7055 }, { "epoch": 0.8232276119402985, "grad_norm": 0.4135280946993472, "learning_rate": 8.736385197862415e-06, "loss": 0.4824, "step": 7060 }, { "epoch": 0.8238106343283582, "grad_norm": 0.4287858690307368, "learning_rate": 8.712479677161388e-06, "loss": 0.4869, "step": 7065 }, { "epoch": 0.824393656716418, "grad_norm": 0.4714198385972294, "learning_rate": 8.688643997975156e-06, "loss": 0.5143, "step": 7070 }, { "epoch": 0.8249766791044776, "grad_norm": 0.44395978208148723, "learning_rate": 8.66487824891149e-06, "loss": 0.4922, "step": 7075 }, { "epoch": 0.8255597014925373, "grad_norm": 0.508754740613158, "learning_rate": 8.641182518318162e-06, "loss": 0.5184, "step": 7080 }, { "epoch": 0.8261427238805971, "grad_norm": 0.4026422778012762, "learning_rate": 8.617556894282683e-06, "loss": 0.4886, "step": 7085 }, { "epoch": 0.8267257462686567, "grad_norm": 0.47446388322469524, "learning_rate": 8.594001464631938e-06, "loss": 0.5189, "step": 7090 }, { "epoch": 0.8273087686567164, "grad_norm": 0.47024774444454565, "learning_rate": 8.570516316931869e-06, "loss": 0.5266, "step": 7095 }, { "epoch": 0.8278917910447762, "grad_norm": 0.44748825048810753, "learning_rate": 8.547101538487136e-06, "loss": 0.4845, "step": 7100 }, { "epoch": 0.8284748134328358, "grad_norm": 0.4806024012846129, "learning_rate": 8.52375721634083e-06, "loss": 0.5031, "step": 7105 }, { "epoch": 0.8290578358208955, "grad_norm": 0.5413375895981546, "learning_rate": 8.5004834372741e-06, "loss": 0.5106, "step": 7110 }, { "epoch": 0.8296408582089553, "grad_norm": 0.4337024823163042, "learning_rate": 8.477280287805883e-06, "loss": 0.4954, "step": 7115 }, { "epoch": 0.8302238805970149, "grad_norm": 0.4344528368728774, "learning_rate": 8.454147854192515e-06, "loss": 0.5021, "step": 7120 }, { "epoch": 0.8308069029850746, "grad_norm": 0.45520925847409455, "learning_rate": 8.4310862224275e-06, "loss": 0.4828, "step": 7125 }, { "epoch": 0.8313899253731343, "grad_norm": 0.4734386762743593, "learning_rate": 8.408095478241099e-06, "loss": 0.5243, "step": 7130 }, { "epoch": 0.831972947761194, "grad_norm": 0.4717328221940539, "learning_rate": 8.385175707100064e-06, "loss": 0.4907, "step": 7135 }, { "epoch": 0.8325559701492538, "grad_norm": 0.41240208092851716, "learning_rate": 8.36232699420732e-06, "loss": 0.5031, "step": 7140 }, { "epoch": 0.8331389925373134, "grad_norm": 0.41514081930390884, "learning_rate": 8.33954942450163e-06, "loss": 0.4896, "step": 7145 }, { "epoch": 0.8337220149253731, "grad_norm": 0.4675941109997365, "learning_rate": 8.316843082657277e-06, "loss": 0.5009, "step": 7150 }, { "epoch": 0.8343050373134329, "grad_norm": 0.4427930397053809, "learning_rate": 8.294208053083771e-06, "loss": 0.511, "step": 7155 }, { "epoch": 0.8348880597014925, "grad_norm": 0.45949248005156934, "learning_rate": 8.271644419925526e-06, "loss": 0.4719, "step": 7160 }, { "epoch": 0.8354710820895522, "grad_norm": 0.471413986453529, "learning_rate": 8.249152267061524e-06, "loss": 0.4994, "step": 7165 }, { "epoch": 0.836054104477612, "grad_norm": 0.44813478705837095, "learning_rate": 8.226731678105045e-06, "loss": 0.4804, "step": 7170 }, { "epoch": 0.8366371268656716, "grad_norm": 0.4426301509227603, "learning_rate": 8.20438273640332e-06, "loss": 0.5143, "step": 7175 }, { "epoch": 0.8372201492537313, "grad_norm": 0.4465652598526728, "learning_rate": 8.18210552503725e-06, "loss": 0.4871, "step": 7180 }, { "epoch": 0.8378031716417911, "grad_norm": 0.47244655194156704, "learning_rate": 8.159900126821062e-06, "loss": 0.5003, "step": 7185 }, { "epoch": 0.8383861940298507, "grad_norm": 0.4471968512251493, "learning_rate": 8.137766624302036e-06, "loss": 0.5142, "step": 7190 }, { "epoch": 0.8389692164179104, "grad_norm": 0.4709917176706898, "learning_rate": 8.115705099760184e-06, "loss": 0.5195, "step": 7195 }, { "epoch": 0.8395522388059702, "grad_norm": 0.4380879394783618, "learning_rate": 8.093715635207948e-06, "loss": 0.4885, "step": 7200 }, { "epoch": 0.8401352611940298, "grad_norm": 0.41712410093981933, "learning_rate": 8.071798312389863e-06, "loss": 0.5116, "step": 7205 }, { "epoch": 0.8407182835820896, "grad_norm": 0.4325869434630372, "learning_rate": 8.049953212782329e-06, "loss": 0.4478, "step": 7210 }, { "epoch": 0.8413013059701493, "grad_norm": 0.4166574600617677, "learning_rate": 8.028180417593215e-06, "loss": 0.4824, "step": 7215 }, { "epoch": 0.8418843283582089, "grad_norm": 0.43830478937394324, "learning_rate": 8.006480007761628e-06, "loss": 0.4674, "step": 7220 }, { "epoch": 0.8424673507462687, "grad_norm": 0.4817669338294806, "learning_rate": 7.98485206395758e-06, "loss": 0.5077, "step": 7225 }, { "epoch": 0.8430503731343284, "grad_norm": 0.4647045998674549, "learning_rate": 7.963296666581703e-06, "loss": 0.4702, "step": 7230 }, { "epoch": 0.843633395522388, "grad_norm": 0.49174994550827694, "learning_rate": 7.941813895764919e-06, "loss": 0.5212, "step": 7235 }, { "epoch": 0.8442164179104478, "grad_norm": 0.41757383756848515, "learning_rate": 7.920403831368189e-06, "loss": 0.5016, "step": 7240 }, { "epoch": 0.8447994402985075, "grad_norm": 0.44263407474750815, "learning_rate": 7.899066552982179e-06, "loss": 0.4994, "step": 7245 }, { "epoch": 0.8453824626865671, "grad_norm": 0.46533860910272895, "learning_rate": 7.87780213992699e-06, "loss": 0.4973, "step": 7250 }, { "epoch": 0.8459654850746269, "grad_norm": 0.4396469203604777, "learning_rate": 7.856610671251826e-06, "loss": 0.4962, "step": 7255 }, { "epoch": 0.8465485074626866, "grad_norm": 0.461412303324744, "learning_rate": 7.835492225734753e-06, "loss": 0.4848, "step": 7260 }, { "epoch": 0.8471315298507462, "grad_norm": 0.5184880522572166, "learning_rate": 7.81444688188236e-06, "loss": 0.5133, "step": 7265 }, { "epoch": 0.847714552238806, "grad_norm": 0.47001007740037093, "learning_rate": 7.793474717929495e-06, "loss": 0.4852, "step": 7270 }, { "epoch": 0.8482975746268657, "grad_norm": 0.47109511656808234, "learning_rate": 7.772575811838948e-06, "loss": 0.4961, "step": 7275 }, { "epoch": 0.8488805970149254, "grad_norm": 0.44116994424682127, "learning_rate": 7.751750241301192e-06, "loss": 0.4972, "step": 7280 }, { "epoch": 0.8494636194029851, "grad_norm": 0.4501932519308333, "learning_rate": 7.730998083734083e-06, "loss": 0.4859, "step": 7285 }, { "epoch": 0.8500466417910447, "grad_norm": 0.48144234778210115, "learning_rate": 7.710319416282543e-06, "loss": 0.4984, "step": 7290 }, { "epoch": 0.8506296641791045, "grad_norm": 0.456988520365441, "learning_rate": 7.689714315818339e-06, "loss": 0.5232, "step": 7295 }, { "epoch": 0.8512126865671642, "grad_norm": 0.5275485527755961, "learning_rate": 7.669182858939715e-06, "loss": 0.494, "step": 7300 }, { "epoch": 0.8517957089552238, "grad_norm": 0.4358686155004703, "learning_rate": 7.648725121971178e-06, "loss": 0.4652, "step": 7305 }, { "epoch": 0.8523787313432836, "grad_norm": 0.5241681742015615, "learning_rate": 7.628341180963175e-06, "loss": 0.5107, "step": 7310 }, { "epoch": 0.8529617537313433, "grad_norm": 0.4446315316066114, "learning_rate": 7.608031111691826e-06, "loss": 0.4736, "step": 7315 }, { "epoch": 0.8535447761194029, "grad_norm": 0.4586742881823037, "learning_rate": 7.587794989658621e-06, "loss": 0.4789, "step": 7320 }, { "epoch": 0.8541277985074627, "grad_norm": 0.43393817483796693, "learning_rate": 7.567632890090176e-06, "loss": 0.4517, "step": 7325 }, { "epoch": 0.8547108208955224, "grad_norm": 0.4530263088196386, "learning_rate": 7.5475448879379255e-06, "loss": 0.5204, "step": 7330 }, { "epoch": 0.855293843283582, "grad_norm": 0.47232925084661864, "learning_rate": 7.527531057877849e-06, "loss": 0.5212, "step": 7335 }, { "epoch": 0.8558768656716418, "grad_norm": 0.4413601041808957, "learning_rate": 7.507591474310185e-06, "loss": 0.4907, "step": 7340 }, { "epoch": 0.8564598880597015, "grad_norm": 0.5105056148052758, "learning_rate": 7.487726211359198e-06, "loss": 0.5465, "step": 7345 }, { "epoch": 0.8570429104477612, "grad_norm": 0.6715647626166893, "learning_rate": 7.46793534287283e-06, "loss": 0.5069, "step": 7350 }, { "epoch": 0.8576259328358209, "grad_norm": 0.4588809773198785, "learning_rate": 7.448218942422498e-06, "loss": 0.5474, "step": 7355 }, { "epoch": 0.8582089552238806, "grad_norm": 0.4831576147260091, "learning_rate": 7.428577083302757e-06, "loss": 0.5093, "step": 7360 }, { "epoch": 0.8587919776119403, "grad_norm": 0.5042721886700091, "learning_rate": 7.409009838531095e-06, "loss": 0.5375, "step": 7365 }, { "epoch": 0.859375, "grad_norm": 0.46284073380005986, "learning_rate": 7.389517280847598e-06, "loss": 0.5159, "step": 7370 }, { "epoch": 0.8599580223880597, "grad_norm": 0.4661662360141043, "learning_rate": 7.370099482714715e-06, "loss": 0.5315, "step": 7375 }, { "epoch": 0.8605410447761194, "grad_norm": 0.4476959939432871, "learning_rate": 7.35075651631699e-06, "loss": 0.4901, "step": 7380 }, { "epoch": 0.8611240671641791, "grad_norm": 0.44717566109177675, "learning_rate": 7.331488453560767e-06, "loss": 0.4921, "step": 7385 }, { "epoch": 0.8617070895522388, "grad_norm": 0.4476851149766431, "learning_rate": 7.312295366073952e-06, "loss": 0.4839, "step": 7390 }, { "epoch": 0.8622901119402985, "grad_norm": 0.4656768131692163, "learning_rate": 7.293177325205734e-06, "loss": 0.5089, "step": 7395 }, { "epoch": 0.8628731343283582, "grad_norm": 0.47867044879346754, "learning_rate": 7.274134402026321e-06, "loss": 0.5204, "step": 7400 }, { "epoch": 0.863456156716418, "grad_norm": 0.4495376624711596, "learning_rate": 7.255166667326668e-06, "loss": 0.4864, "step": 7405 }, { "epoch": 0.8640391791044776, "grad_norm": 0.45540868280070557, "learning_rate": 7.236274191618228e-06, "loss": 0.4969, "step": 7410 }, { "epoch": 0.8646222014925373, "grad_norm": 0.5275767191951953, "learning_rate": 7.217457045132682e-06, "loss": 0.5086, "step": 7415 }, { "epoch": 0.8652052238805971, "grad_norm": 0.42863000174753835, "learning_rate": 7.198715297821681e-06, "loss": 0.5033, "step": 7420 }, { "epoch": 0.8657882462686567, "grad_norm": 0.4834849241582333, "learning_rate": 7.18004901935657e-06, "loss": 0.5057, "step": 7425 }, { "epoch": 0.8663712686567164, "grad_norm": 0.46483441625060457, "learning_rate": 7.161458279128172e-06, "loss": 0.5058, "step": 7430 }, { "epoch": 0.8669542910447762, "grad_norm": 0.4699147577232594, "learning_rate": 7.142943146246471e-06, "loss": 0.5052, "step": 7435 }, { "epoch": 0.8675373134328358, "grad_norm": 0.4728336401428503, "learning_rate": 7.124503689540403e-06, "loss": 0.4945, "step": 7440 }, { "epoch": 0.8681203358208955, "grad_norm": 0.42756247469245834, "learning_rate": 7.106139977557563e-06, "loss": 0.4868, "step": 7445 }, { "epoch": 0.8687033582089553, "grad_norm": 0.5508454557117077, "learning_rate": 7.087852078564006e-06, "loss": 0.5078, "step": 7450 }, { "epoch": 0.8692863805970149, "grad_norm": 0.42365611783227247, "learning_rate": 7.069640060543914e-06, "loss": 0.4795, "step": 7455 }, { "epoch": 0.8698694029850746, "grad_norm": 0.4619373613002068, "learning_rate": 7.051503991199415e-06, "loss": 0.5093, "step": 7460 }, { "epoch": 0.8704524253731343, "grad_norm": 0.46430333389633666, "learning_rate": 7.03344393795029e-06, "loss": 0.4628, "step": 7465 }, { "epoch": 0.871035447761194, "grad_norm": 0.46350593595683315, "learning_rate": 7.0154599679337405e-06, "loss": 0.4966, "step": 7470 }, { "epoch": 0.8716184701492538, "grad_norm": 0.46997441642678967, "learning_rate": 6.997552148004124e-06, "loss": 0.4619, "step": 7475 }, { "epoch": 0.8722014925373134, "grad_norm": 0.4940809056382957, "learning_rate": 6.9797205447327236e-06, "loss": 0.5207, "step": 7480 }, { "epoch": 0.8727845149253731, "grad_norm": 0.42737497657553214, "learning_rate": 6.961965224407487e-06, "loss": 0.4879, "step": 7485 }, { "epoch": 0.8733675373134329, "grad_norm": 0.4420504460833519, "learning_rate": 6.944286253032789e-06, "loss": 0.4519, "step": 7490 }, { "epoch": 0.8739505597014925, "grad_norm": 0.4482189949894458, "learning_rate": 6.9266836963291725e-06, "loss": 0.5216, "step": 7495 }, { "epoch": 0.8745335820895522, "grad_norm": 0.48507550585616305, "learning_rate": 6.90915761973312e-06, "loss": 0.4904, "step": 7500 }, { "epoch": 0.875116604477612, "grad_norm": 0.4845741351343996, "learning_rate": 6.891708088396803e-06, "loss": 0.5031, "step": 7505 }, { "epoch": 0.8756996268656716, "grad_norm": 0.45661614078718604, "learning_rate": 6.874335167187844e-06, "loss": 0.4694, "step": 7510 }, { "epoch": 0.8762826492537313, "grad_norm": 0.4378046368918111, "learning_rate": 6.857038920689068e-06, "loss": 0.473, "step": 7515 }, { "epoch": 0.8768656716417911, "grad_norm": 0.4351278013857574, "learning_rate": 6.839819413198259e-06, "loss": 0.4686, "step": 7520 }, { "epoch": 0.8774486940298507, "grad_norm": 0.4598040793979348, "learning_rate": 6.822676708727941e-06, "loss": 0.5058, "step": 7525 }, { "epoch": 0.8780317164179104, "grad_norm": 0.46734999477037775, "learning_rate": 6.805610871005115e-06, "loss": 0.5142, "step": 7530 }, { "epoch": 0.8786147388059702, "grad_norm": 0.42358187968622485, "learning_rate": 6.788621963471055e-06, "loss": 0.4656, "step": 7535 }, { "epoch": 0.8791977611940298, "grad_norm": 0.42943564345062557, "learning_rate": 6.771710049281019e-06, "loss": 0.4866, "step": 7540 }, { "epoch": 0.8797807835820896, "grad_norm": 0.46090527920896157, "learning_rate": 6.754875191304076e-06, "loss": 0.5283, "step": 7545 }, { "epoch": 0.8803638059701493, "grad_norm": 0.46574541087337956, "learning_rate": 6.73811745212283e-06, "loss": 0.5072, "step": 7550 }, { "epoch": 0.8809468283582089, "grad_norm": 0.5402513744522806, "learning_rate": 6.721436894033206e-06, "loss": 0.4474, "step": 7555 }, { "epoch": 0.8815298507462687, "grad_norm": 0.4887426580865765, "learning_rate": 6.704833579044198e-06, "loss": 0.4945, "step": 7560 }, { "epoch": 0.8821128731343284, "grad_norm": 0.4720599639216187, "learning_rate": 6.688307568877681e-06, "loss": 0.4757, "step": 7565 }, { "epoch": 0.882695895522388, "grad_norm": 0.44242282186933635, "learning_rate": 6.6718589249681215e-06, "loss": 0.5141, "step": 7570 }, { "epoch": 0.8832789179104478, "grad_norm": 0.47443100760410045, "learning_rate": 6.655487708462407e-06, "loss": 0.479, "step": 7575 }, { "epoch": 0.8838619402985075, "grad_norm": 0.4484432447267727, "learning_rate": 6.639193980219574e-06, "loss": 0.503, "step": 7580 }, { "epoch": 0.8844449626865671, "grad_norm": 0.4544364445156426, "learning_rate": 6.622977800810626e-06, "loss": 0.4757, "step": 7585 }, { "epoch": 0.8850279850746269, "grad_norm": 0.470997939385338, "learning_rate": 6.60683923051825e-06, "loss": 0.4791, "step": 7590 }, { "epoch": 0.8856110074626866, "grad_norm": 0.4213706332803066, "learning_rate": 6.5907783293366525e-06, "loss": 0.468, "step": 7595 }, { "epoch": 0.8861940298507462, "grad_norm": 0.4352892615766016, "learning_rate": 6.574795156971298e-06, "loss": 0.4843, "step": 7600 }, { "epoch": 0.886777052238806, "grad_norm": 0.45036633475522897, "learning_rate": 6.5588897728387055e-06, "loss": 0.4705, "step": 7605 }, { "epoch": 0.8873600746268657, "grad_norm": 0.5077288646140908, "learning_rate": 6.543062236066208e-06, "loss": 0.4791, "step": 7610 }, { "epoch": 0.8879430970149254, "grad_norm": 0.4513500980150284, "learning_rate": 6.527312605491758e-06, "loss": 0.5178, "step": 7615 }, { "epoch": 0.8885261194029851, "grad_norm": 0.4274182074103898, "learning_rate": 6.5116409396636935e-06, "loss": 0.4626, "step": 7620 }, { "epoch": 0.8891091417910447, "grad_norm": 0.4667801241554799, "learning_rate": 6.496047296840513e-06, "loss": 0.5071, "step": 7625 }, { "epoch": 0.8896921641791045, "grad_norm": 0.4123902058374272, "learning_rate": 6.480531734990686e-06, "loss": 0.4992, "step": 7630 }, { "epoch": 0.8902751865671642, "grad_norm": 0.47897565830736566, "learning_rate": 6.4650943117924065e-06, "loss": 0.5153, "step": 7635 }, { "epoch": 0.8908582089552238, "grad_norm": 0.47530139972165525, "learning_rate": 6.449735084633407e-06, "loss": 0.4857, "step": 7640 }, { "epoch": 0.8914412313432836, "grad_norm": 0.4311055331542708, "learning_rate": 6.4344541106107046e-06, "loss": 0.4877, "step": 7645 }, { "epoch": 0.8920242537313433, "grad_norm": 0.4668273679980216, "learning_rate": 6.419251446530451e-06, "loss": 0.5164, "step": 7650 }, { "epoch": 0.8926072761194029, "grad_norm": 0.44140094368731336, "learning_rate": 6.404127148907656e-06, "loss": 0.4784, "step": 7655 }, { "epoch": 0.8931902985074627, "grad_norm": 0.4544488440695652, "learning_rate": 6.38908127396602e-06, "loss": 0.4988, "step": 7660 }, { "epoch": 0.8937733208955224, "grad_norm": 0.4557410652922852, "learning_rate": 6.374113877637701e-06, "loss": 0.4987, "step": 7665 }, { "epoch": 0.894356343283582, "grad_norm": 0.4444789436043567, "learning_rate": 6.359225015563138e-06, "loss": 0.5243, "step": 7670 }, { "epoch": 0.8949393656716418, "grad_norm": 0.4698237483504685, "learning_rate": 6.3444147430908015e-06, "loss": 0.4691, "step": 7675 }, { "epoch": 0.8955223880597015, "grad_norm": 0.4859998755530479, "learning_rate": 6.329683115277018e-06, "loss": 0.4916, "step": 7680 }, { "epoch": 0.8961054104477612, "grad_norm": 0.515441325427795, "learning_rate": 6.315030186885763e-06, "loss": 0.5116, "step": 7685 }, { "epoch": 0.8966884328358209, "grad_norm": 0.5283029015600534, "learning_rate": 6.300456012388446e-06, "loss": 0.4934, "step": 7690 }, { "epoch": 0.8972714552238806, "grad_norm": 0.4417646004083893, "learning_rate": 6.285960645963708e-06, "loss": 0.5026, "step": 7695 }, { "epoch": 0.8978544776119403, "grad_norm": 0.4605167879082956, "learning_rate": 6.271544141497232e-06, "loss": 0.4901, "step": 7700 }, { "epoch": 0.8984375, "grad_norm": 0.4792028373581772, "learning_rate": 6.257206552581541e-06, "loss": 0.5118, "step": 7705 }, { "epoch": 0.8990205223880597, "grad_norm": 0.4644526939230792, "learning_rate": 6.242947932515786e-06, "loss": 0.5282, "step": 7710 }, { "epoch": 0.8996035447761194, "grad_norm": 0.5046663803144333, "learning_rate": 6.228768334305555e-06, "loss": 0.4946, "step": 7715 }, { "epoch": 0.9001865671641791, "grad_norm": 0.43155084823278395, "learning_rate": 6.214667810662682e-06, "loss": 0.4794, "step": 7720 }, { "epoch": 0.9007695895522388, "grad_norm": 0.4640789279302754, "learning_rate": 6.200646414005046e-06, "loss": 0.5239, "step": 7725 }, { "epoch": 0.9013526119402985, "grad_norm": 0.45218047321306765, "learning_rate": 6.18670419645637e-06, "loss": 0.5015, "step": 7730 }, { "epoch": 0.9019356343283582, "grad_norm": 0.4588478078182171, "learning_rate": 6.172841209846046e-06, "loss": 0.4913, "step": 7735 }, { "epoch": 0.902518656716418, "grad_norm": 0.4433138530597733, "learning_rate": 6.159057505708912e-06, "loss": 0.4594, "step": 7740 }, { "epoch": 0.9031016791044776, "grad_norm": 0.42245796632484117, "learning_rate": 6.145353135285091e-06, "loss": 0.4945, "step": 7745 }, { "epoch": 0.9036847014925373, "grad_norm": 0.4756088628902291, "learning_rate": 6.131728149519778e-06, "loss": 0.4932, "step": 7750 }, { "epoch": 0.9042677238805971, "grad_norm": 0.4597129158786207, "learning_rate": 6.118182599063075e-06, "loss": 0.5354, "step": 7755 }, { "epoch": 0.9048507462686567, "grad_norm": 0.503389587802853, "learning_rate": 6.104716534269772e-06, "loss": 0.5077, "step": 7760 }, { "epoch": 0.9054337686567164, "grad_norm": 0.4249365343444884, "learning_rate": 6.091330005199183e-06, "loss": 0.4868, "step": 7765 }, { "epoch": 0.9060167910447762, "grad_norm": 0.43851878310292347, "learning_rate": 6.078023061614953e-06, "loss": 0.5015, "step": 7770 }, { "epoch": 0.9065998134328358, "grad_norm": 0.43139132012318054, "learning_rate": 6.064795752984875e-06, "loss": 0.4832, "step": 7775 }, { "epoch": 0.9071828358208955, "grad_norm": 0.45409545061847195, "learning_rate": 6.0516481284806885e-06, "loss": 0.4794, "step": 7780 }, { "epoch": 0.9077658582089553, "grad_norm": 0.4411595137434583, "learning_rate": 6.0385802369779385e-06, "loss": 0.5183, "step": 7785 }, { "epoch": 0.9083488805970149, "grad_norm": 0.45410062982495236, "learning_rate": 6.025592127055741e-06, "loss": 0.4736, "step": 7790 }, { "epoch": 0.9089319029850746, "grad_norm": 0.42835954681482424, "learning_rate": 6.012683846996645e-06, "loss": 0.4541, "step": 7795 }, { "epoch": 0.9095149253731343, "grad_norm": 0.42047696751886715, "learning_rate": 5.999855444786425e-06, "loss": 0.4973, "step": 7800 }, { "epoch": 0.910097947761194, "grad_norm": 0.44914336580430964, "learning_rate": 5.987106968113928e-06, "loss": 0.5061, "step": 7805 }, { "epoch": 0.9106809701492538, "grad_norm": 0.460206776855078, "learning_rate": 5.974438464370872e-06, "loss": 0.4888, "step": 7810 }, { "epoch": 0.9112639925373134, "grad_norm": 0.4470862201000093, "learning_rate": 5.961849980651684e-06, "loss": 0.4659, "step": 7815 }, { "epoch": 0.9118470149253731, "grad_norm": 0.47720200193566115, "learning_rate": 5.949341563753319e-06, "loss": 0.4992, "step": 7820 }, { "epoch": 0.9124300373134329, "grad_norm": 0.4676169159668866, "learning_rate": 5.936913260175094e-06, "loss": 0.5179, "step": 7825 }, { "epoch": 0.9130130597014925, "grad_norm": 0.4584704293118159, "learning_rate": 5.924565116118499e-06, "loss": 0.4797, "step": 7830 }, { "epoch": 0.9135960820895522, "grad_norm": 0.4776754416693919, "learning_rate": 5.9122971774870435e-06, "loss": 0.4678, "step": 7835 }, { "epoch": 0.914179104477612, "grad_norm": 0.42982811564868484, "learning_rate": 5.900109489886081e-06, "loss": 0.4804, "step": 7840 }, { "epoch": 0.9147621268656716, "grad_norm": 0.41729830382315986, "learning_rate": 5.8880020986226285e-06, "loss": 0.468, "step": 7845 }, { "epoch": 0.9153451492537313, "grad_norm": 0.4582133802824904, "learning_rate": 5.875975048705206e-06, "loss": 0.4939, "step": 7850 }, { "epoch": 0.9159281716417911, "grad_norm": 0.4934433745443256, "learning_rate": 5.864028384843678e-06, "loss": 0.512, "step": 7855 }, { "epoch": 0.9165111940298507, "grad_norm": 0.4461718822871272, "learning_rate": 5.8521621514490715e-06, "loss": 0.5226, "step": 7860 }, { "epoch": 0.9170942164179104, "grad_norm": 0.43580261924917685, "learning_rate": 5.8403763926334146e-06, "loss": 0.4712, "step": 7865 }, { "epoch": 0.9176772388059702, "grad_norm": 0.4289025060075553, "learning_rate": 5.82867115220959e-06, "loss": 0.4857, "step": 7870 }, { "epoch": 0.9182602611940298, "grad_norm": 0.4471506171646922, "learning_rate": 5.81704647369114e-06, "loss": 0.5177, "step": 7875 }, { "epoch": 0.9188432835820896, "grad_norm": 0.4349776664841292, "learning_rate": 5.805502400292137e-06, "loss": 0.4925, "step": 7880 }, { "epoch": 0.9194263059701493, "grad_norm": 0.4524645377436028, "learning_rate": 5.794038974926995e-06, "loss": 0.4807, "step": 7885 }, { "epoch": 0.9200093283582089, "grad_norm": 0.45618145888853556, "learning_rate": 5.782656240210343e-06, "loss": 0.5078, "step": 7890 }, { "epoch": 0.9205923507462687, "grad_norm": 0.4316155039975515, "learning_rate": 5.771354238456828e-06, "loss": 0.4795, "step": 7895 }, { "epoch": 0.9211753731343284, "grad_norm": 0.46008183460748864, "learning_rate": 5.760133011680985e-06, "loss": 0.4788, "step": 7900 }, { "epoch": 0.921758395522388, "grad_norm": 0.44423836815839907, "learning_rate": 5.748992601597076e-06, "loss": 0.4949, "step": 7905 }, { "epoch": 0.9223414179104478, "grad_norm": 0.45890602021310334, "learning_rate": 5.737933049618925e-06, "loss": 0.4811, "step": 7910 }, { "epoch": 0.9229244402985075, "grad_norm": 0.424972332822205, "learning_rate": 5.726954396859773e-06, "loss": 0.4854, "step": 7915 }, { "epoch": 0.9235074626865671, "grad_norm": 0.42910016132492174, "learning_rate": 5.7160566841321255e-06, "loss": 0.4512, "step": 7920 }, { "epoch": 0.9240904850746269, "grad_norm": 0.46334797979450737, "learning_rate": 5.705239951947597e-06, "loss": 0.4834, "step": 7925 }, { "epoch": 0.9246735074626866, "grad_norm": 0.4603535009443525, "learning_rate": 5.694504240516759e-06, "loss": 0.4837, "step": 7930 }, { "epoch": 0.9252565298507462, "grad_norm": 0.4662682403880509, "learning_rate": 5.683849589748994e-06, "loss": 0.4823, "step": 7935 }, { "epoch": 0.925839552238806, "grad_norm": 0.44330683887009237, "learning_rate": 5.673276039252347e-06, "loss": 0.5095, "step": 7940 }, { "epoch": 0.9264225746268657, "grad_norm": 0.43601918566724357, "learning_rate": 5.662783628333379e-06, "loss": 0.5165, "step": 7945 }, { "epoch": 0.9270055970149254, "grad_norm": 0.41717748530531923, "learning_rate": 5.652372395997015e-06, "loss": 0.4911, "step": 7950 }, { "epoch": 0.9275886194029851, "grad_norm": 2.401919501282761, "learning_rate": 5.642042380946412e-06, "loss": 0.4687, "step": 7955 }, { "epoch": 0.9281716417910447, "grad_norm": 0.41624865614532475, "learning_rate": 5.631793621582793e-06, "loss": 0.4945, "step": 7960 }, { "epoch": 0.9287546641791045, "grad_norm": 0.5068445026903259, "learning_rate": 5.621626156005335e-06, "loss": 0.4786, "step": 7965 }, { "epoch": 0.9293376865671642, "grad_norm": 0.45630828056750805, "learning_rate": 5.611540022010985e-06, "loss": 0.4951, "step": 7970 }, { "epoch": 0.9299207089552238, "grad_norm": 0.4956233980595794, "learning_rate": 5.6015352570943755e-06, "loss": 0.4843, "step": 7975 }, { "epoch": 0.9305037313432836, "grad_norm": 0.4329791152614355, "learning_rate": 5.591611898447632e-06, "loss": 0.4634, "step": 7980 }, { "epoch": 0.9310867537313433, "grad_norm": 0.4663064517760106, "learning_rate": 5.581769982960261e-06, "loss": 0.5264, "step": 7985 }, { "epoch": 0.9316697761194029, "grad_norm": 0.4836205261061597, "learning_rate": 5.572009547219013e-06, "loss": 0.5156, "step": 7990 }, { "epoch": 0.9322527985074627, "grad_norm": 0.4277290525866144, "learning_rate": 5.5623306275077475e-06, "loss": 0.481, "step": 7995 }, { "epoch": 0.9328358208955224, "grad_norm": 0.43341519970296755, "learning_rate": 5.552733259807276e-06, "loss": 0.498, "step": 8000 }, { "epoch": 0.933418843283582, "grad_norm": 0.43071532419905234, "learning_rate": 5.543217479795256e-06, "loss": 0.4958, "step": 8005 }, { "epoch": 0.9340018656716418, "grad_norm": 0.46622528251802137, "learning_rate": 5.533783322846053e-06, "loss": 0.4814, "step": 8010 }, { "epoch": 0.9345848880597015, "grad_norm": 0.5249920187972582, "learning_rate": 5.524430824030594e-06, "loss": 0.4907, "step": 8015 }, { "epoch": 0.9351679104477612, "grad_norm": 0.45135919668531826, "learning_rate": 5.515160018116247e-06, "loss": 0.492, "step": 8020 }, { "epoch": 0.9357509328358209, "grad_norm": 0.4928051727444886, "learning_rate": 5.505970939566699e-06, "loss": 0.5035, "step": 8025 }, { "epoch": 0.9363339552238806, "grad_norm": 0.44235862805439663, "learning_rate": 5.4968636225418125e-06, "loss": 0.4783, "step": 8030 }, { "epoch": 0.9369169776119403, "grad_norm": 0.41958300409071425, "learning_rate": 5.487838100897508e-06, "loss": 0.4789, "step": 8035 }, { "epoch": 0.9375, "grad_norm": 0.4570459794031607, "learning_rate": 5.478894408185641e-06, "loss": 0.4661, "step": 8040 }, { "epoch": 0.9380830223880597, "grad_norm": 0.4783472459507973, "learning_rate": 5.470032577653869e-06, "loss": 0.4758, "step": 8045 }, { "epoch": 0.9386660447761194, "grad_norm": 0.4549507857184879, "learning_rate": 5.4612526422455265e-06, "loss": 0.4961, "step": 8050 }, { "epoch": 0.9392490671641791, "grad_norm": 0.4650558485006105, "learning_rate": 5.452554634599519e-06, "loss": 0.4891, "step": 8055 }, { "epoch": 0.9398320895522388, "grad_norm": 0.580533723917234, "learning_rate": 5.443938587050186e-06, "loss": 0.5172, "step": 8060 }, { "epoch": 0.9404151119402985, "grad_norm": 0.42296595646423346, "learning_rate": 5.435404531627176e-06, "loss": 0.4853, "step": 8065 }, { "epoch": 0.9409981343283582, "grad_norm": 0.4573887758625825, "learning_rate": 5.426952500055348e-06, "loss": 0.4614, "step": 8070 }, { "epoch": 0.941581156716418, "grad_norm": 0.4745039842823071, "learning_rate": 5.41858252375464e-06, "loss": 0.5061, "step": 8075 }, { "epoch": 0.9421641791044776, "grad_norm": 0.47805447519039157, "learning_rate": 5.410294633839949e-06, "loss": 0.4735, "step": 8080 }, { "epoch": 0.9427472014925373, "grad_norm": 0.43867449428861033, "learning_rate": 5.402088861121025e-06, "loss": 0.4718, "step": 8085 }, { "epoch": 0.9433302238805971, "grad_norm": 0.4467252451572471, "learning_rate": 5.393965236102353e-06, "loss": 0.4798, "step": 8090 }, { "epoch": 0.9439132462686567, "grad_norm": 0.5133360727759508, "learning_rate": 5.385923788983034e-06, "loss": 0.4894, "step": 8095 }, { "epoch": 0.9444962686567164, "grad_norm": 0.4747210863179483, "learning_rate": 5.377964549656685e-06, "loss": 0.5098, "step": 8100 }, { "epoch": 0.9450792910447762, "grad_norm": 0.44347953398302503, "learning_rate": 5.370087547711307e-06, "loss": 0.5105, "step": 8105 }, { "epoch": 0.9456623134328358, "grad_norm": 0.4387472897639345, "learning_rate": 5.362292812429207e-06, "loss": 0.4815, "step": 8110 }, { "epoch": 0.9462453358208955, "grad_norm": 0.4722026626305582, "learning_rate": 5.354580372786854e-06, "loss": 0.4776, "step": 8115 }, { "epoch": 0.9468283582089553, "grad_norm": 0.4566856777345421, "learning_rate": 5.346950257454792e-06, "loss": 0.5002, "step": 8120 }, { "epoch": 0.9474113805970149, "grad_norm": 0.4601768931007766, "learning_rate": 5.339402494797539e-06, "loss": 0.4725, "step": 8125 }, { "epoch": 0.9479944029850746, "grad_norm": 0.4422563608916944, "learning_rate": 5.331937112873462e-06, "loss": 0.4411, "step": 8130 }, { "epoch": 0.9485774253731343, "grad_norm": 0.44855925098469346, "learning_rate": 5.324554139434679e-06, "loss": 0.4941, "step": 8135 }, { "epoch": 0.949160447761194, "grad_norm": 0.4437725309777671, "learning_rate": 5.317253601926967e-06, "loss": 0.4836, "step": 8140 }, { "epoch": 0.9497434701492538, "grad_norm": 0.4488409563199269, "learning_rate": 5.310035527489651e-06, "loss": 0.4958, "step": 8145 }, { "epoch": 0.9503264925373134, "grad_norm": 0.46309779837786297, "learning_rate": 5.3028999429555045e-06, "loss": 0.493, "step": 8150 }, { "epoch": 0.9509095149253731, "grad_norm": 0.45673001540720787, "learning_rate": 5.295846874850646e-06, "loss": 0.4945, "step": 8155 }, { "epoch": 0.9514925373134329, "grad_norm": 0.4481344375463158, "learning_rate": 5.288876349394448e-06, "loss": 0.4957, "step": 8160 }, { "epoch": 0.9520755597014925, "grad_norm": 0.45392704978409076, "learning_rate": 5.281988392499431e-06, "loss": 0.4721, "step": 8165 }, { "epoch": 0.9526585820895522, "grad_norm": 0.45349195761765704, "learning_rate": 5.275183029771177e-06, "loss": 0.4741, "step": 8170 }, { "epoch": 0.953241604477612, "grad_norm": 0.4411901617730967, "learning_rate": 5.2684602865082255e-06, "loss": 0.4966, "step": 8175 }, { "epoch": 0.9538246268656716, "grad_norm": 0.4585915951637137, "learning_rate": 5.261820187701984e-06, "loss": 0.5453, "step": 8180 }, { "epoch": 0.9544076492537313, "grad_norm": 0.4593205732104069, "learning_rate": 5.2552627580366334e-06, "loss": 0.5013, "step": 8185 }, { "epoch": 0.9549906716417911, "grad_norm": 0.42592095115888917, "learning_rate": 5.248788021889036e-06, "loss": 0.4797, "step": 8190 }, { "epoch": 0.9555736940298507, "grad_norm": 0.46962819902389563, "learning_rate": 5.2423960033286505e-06, "loss": 0.4763, "step": 8195 }, { "epoch": 0.9561567164179104, "grad_norm": 0.40855238448922176, "learning_rate": 5.236086726117433e-06, "loss": 0.4743, "step": 8200 }, { "epoch": 0.9567397388059702, "grad_norm": 0.43575271699567064, "learning_rate": 5.229860213709753e-06, "loss": 0.4773, "step": 8205 }, { "epoch": 0.9573227611940298, "grad_norm": 0.4688749371758774, "learning_rate": 5.223716489252311e-06, "loss": 0.5166, "step": 8210 }, { "epoch": 0.9579057835820896, "grad_norm": 0.7670542201127664, "learning_rate": 5.217655575584045e-06, "loss": 0.493, "step": 8215 }, { "epoch": 0.9584888059701493, "grad_norm": 0.46421302891934385, "learning_rate": 5.211677495236046e-06, "loss": 0.5096, "step": 8220 }, { "epoch": 0.9590718283582089, "grad_norm": 0.4771862479599736, "learning_rate": 5.205782270431484e-06, "loss": 0.4693, "step": 8225 }, { "epoch": 0.9596548507462687, "grad_norm": 0.44226860049129957, "learning_rate": 5.199969923085515e-06, "loss": 0.4459, "step": 8230 }, { "epoch": 0.9602378731343284, "grad_norm": 0.4294703525768145, "learning_rate": 5.194240474805201e-06, "loss": 0.481, "step": 8235 }, { "epoch": 0.960820895522388, "grad_norm": 0.46086183427389743, "learning_rate": 5.188593946889429e-06, "loss": 0.4973, "step": 8240 }, { "epoch": 0.9614039179104478, "grad_norm": 0.4280169626175964, "learning_rate": 5.183030360328846e-06, "loss": 0.4698, "step": 8245 }, { "epoch": 0.9619869402985075, "grad_norm": 0.4753252130786709, "learning_rate": 5.177549735805758e-06, "loss": 0.5015, "step": 8250 }, { "epoch": 0.9625699626865671, "grad_norm": 0.4620963248632002, "learning_rate": 5.172152093694067e-06, "loss": 0.4802, "step": 8255 }, { "epoch": 0.9631529850746269, "grad_norm": 0.5332881434749811, "learning_rate": 5.166837454059193e-06, "loss": 0.5106, "step": 8260 }, { "epoch": 0.9637360074626866, "grad_norm": 0.5025697757795641, "learning_rate": 5.161605836658004e-06, "loss": 0.4986, "step": 8265 }, { "epoch": 0.9643190298507462, "grad_norm": 0.46525870817249937, "learning_rate": 5.156457260938732e-06, "loss": 0.4925, "step": 8270 }, { "epoch": 0.964902052238806, "grad_norm": 0.4466358224458004, "learning_rate": 5.151391746040905e-06, "loss": 0.4896, "step": 8275 }, { "epoch": 0.9654850746268657, "grad_norm": 0.4722232462608345, "learning_rate": 5.146409310795282e-06, "loss": 0.5116, "step": 8280 }, { "epoch": 0.9660680970149254, "grad_norm": 0.4839383338646517, "learning_rate": 5.14150997372378e-06, "loss": 0.4704, "step": 8285 }, { "epoch": 0.9666511194029851, "grad_norm": 0.4598082477182381, "learning_rate": 5.1366937530393955e-06, "loss": 0.4675, "step": 8290 }, { "epoch": 0.9672341417910447, "grad_norm": 0.4571533268091363, "learning_rate": 5.131960666646149e-06, "loss": 0.4886, "step": 8295 }, { "epoch": 0.9678171641791045, "grad_norm": 0.4826350517333677, "learning_rate": 5.127310732139018e-06, "loss": 0.5239, "step": 8300 }, { "epoch": 0.9684001865671642, "grad_norm": 0.501035586719562, "learning_rate": 5.122743966803858e-06, "loss": 0.4845, "step": 8305 }, { "epoch": 0.9689832089552238, "grad_norm": 0.45689428681207833, "learning_rate": 5.118260387617359e-06, "loss": 0.4802, "step": 8310 }, { "epoch": 0.9695662313432836, "grad_norm": 0.41839841626319924, "learning_rate": 5.113860011246964e-06, "loss": 0.4759, "step": 8315 }, { "epoch": 0.9701492537313433, "grad_norm": 0.4623224498637685, "learning_rate": 5.109542854050814e-06, "loss": 0.5191, "step": 8320 }, { "epoch": 0.9707322761194029, "grad_norm": 0.4400774444581771, "learning_rate": 5.105308932077693e-06, "loss": 0.4751, "step": 8325 }, { "epoch": 0.9713152985074627, "grad_norm": 0.5458461815433695, "learning_rate": 5.101158261066959e-06, "loss": 0.4934, "step": 8330 }, { "epoch": 0.9718983208955224, "grad_norm": 0.45228504839890776, "learning_rate": 5.097090856448492e-06, "loss": 0.5007, "step": 8335 }, { "epoch": 0.972481343283582, "grad_norm": 0.467798541042544, "learning_rate": 5.0931067333426275e-06, "loss": 0.5082, "step": 8340 }, { "epoch": 0.9730643656716418, "grad_norm": 0.4266615826249801, "learning_rate": 5.0892059065601145e-06, "loss": 0.4974, "step": 8345 }, { "epoch": 0.9736473880597015, "grad_norm": 0.4621554435159516, "learning_rate": 5.0853883906020525e-06, "loss": 0.4965, "step": 8350 }, { "epoch": 0.9742304104477612, "grad_norm": 0.46073201489628657, "learning_rate": 5.081654199659831e-06, "loss": 0.5071, "step": 8355 }, { "epoch": 0.9748134328358209, "grad_norm": 0.5462060706132782, "learning_rate": 5.07800334761509e-06, "loss": 0.4938, "step": 8360 }, { "epoch": 0.9753964552238806, "grad_norm": 0.48250851908785264, "learning_rate": 5.074435848039658e-06, "loss": 0.5091, "step": 8365 }, { "epoch": 0.9759794776119403, "grad_norm": 0.44847740844622386, "learning_rate": 5.070951714195508e-06, "loss": 0.4957, "step": 8370 }, { "epoch": 0.9765625, "grad_norm": 0.4467178102848771, "learning_rate": 5.067550959034707e-06, "loss": 0.49, "step": 8375 }, { "epoch": 0.9771455223880597, "grad_norm": 0.46636700776310513, "learning_rate": 5.064233595199362e-06, "loss": 0.4884, "step": 8380 }, { "epoch": 0.9777285447761194, "grad_norm": 0.4615425374049883, "learning_rate": 5.060999635021583e-06, "loss": 0.4771, "step": 8385 }, { "epoch": 0.9783115671641791, "grad_norm": 0.45286825679857706, "learning_rate": 5.057849090523426e-06, "loss": 0.4959, "step": 8390 }, { "epoch": 0.9788945895522388, "grad_norm": 0.4569055516084358, "learning_rate": 5.054781973416858e-06, "loss": 0.4955, "step": 8395 }, { "epoch": 0.9794776119402985, "grad_norm": 0.4455395860822749, "learning_rate": 5.051798295103711e-06, "loss": 0.4752, "step": 8400 }, { "epoch": 0.9800606343283582, "grad_norm": 0.4183019666197293, "learning_rate": 5.048898066675631e-06, "loss": 0.4552, "step": 8405 }, { "epoch": 0.980643656716418, "grad_norm": 0.43323200908430703, "learning_rate": 5.046081298914053e-06, "loss": 0.5092, "step": 8410 }, { "epoch": 0.9812266791044776, "grad_norm": 0.46606214824527303, "learning_rate": 5.043348002290145e-06, "loss": 0.4639, "step": 8415 }, { "epoch": 0.9818097014925373, "grad_norm": 0.4620950107319596, "learning_rate": 5.0406981869647805e-06, "loss": 0.5072, "step": 8420 }, { "epoch": 0.9823927238805971, "grad_norm": 0.44313034547356717, "learning_rate": 5.038131862788491e-06, "loss": 0.4765, "step": 8425 }, { "epoch": 0.9829757462686567, "grad_norm": 0.44702546793927656, "learning_rate": 5.035649039301438e-06, "loss": 0.4612, "step": 8430 }, { "epoch": 0.9835587686567164, "grad_norm": 0.45947101038805505, "learning_rate": 5.033249725733377e-06, "loss": 0.4967, "step": 8435 }, { "epoch": 0.9841417910447762, "grad_norm": 0.42282849418109447, "learning_rate": 5.0309339310036125e-06, "loss": 0.507, "step": 8440 }, { "epoch": 0.9847248134328358, "grad_norm": 0.44692770292599415, "learning_rate": 5.02870166372098e-06, "loss": 0.4808, "step": 8445 }, { "epoch": 0.9853078358208955, "grad_norm": 0.5582818670824954, "learning_rate": 5.0265529321838004e-06, "loss": 0.5405, "step": 8450 }, { "epoch": 0.9858908582089553, "grad_norm": 0.4665732954055579, "learning_rate": 5.02448774437986e-06, "loss": 0.5157, "step": 8455 }, { "epoch": 0.9864738805970149, "grad_norm": 0.5202669052061701, "learning_rate": 5.022506107986374e-06, "loss": 0.5146, "step": 8460 }, { "epoch": 0.9870569029850746, "grad_norm": 0.4563878919577009, "learning_rate": 5.020608030369962e-06, "loss": 0.4614, "step": 8465 }, { "epoch": 0.9876399253731343, "grad_norm": 0.45761657701588687, "learning_rate": 5.018793518586616e-06, "loss": 0.5007, "step": 8470 }, { "epoch": 0.988222947761194, "grad_norm": 0.46136936074859125, "learning_rate": 5.017062579381676e-06, "loss": 0.5068, "step": 8475 }, { "epoch": 0.9888059701492538, "grad_norm": 0.48777910519871126, "learning_rate": 5.015415219189812e-06, "loss": 0.4889, "step": 8480 }, { "epoch": 0.9893889925373134, "grad_norm": 0.47084471147391416, "learning_rate": 5.013851444134987e-06, "loss": 0.5022, "step": 8485 }, { "epoch": 0.9899720149253731, "grad_norm": 0.432844624719787, "learning_rate": 5.012371260030445e-06, "loss": 0.491, "step": 8490 }, { "epoch": 0.9905550373134329, "grad_norm": 0.4649632477223049, "learning_rate": 5.010974672378682e-06, "loss": 0.4741, "step": 8495 }, { "epoch": 0.9911380597014925, "grad_norm": 0.4764372199483869, "learning_rate": 5.009661686371434e-06, "loss": 0.4772, "step": 8500 }, { "epoch": 0.9917210820895522, "grad_norm": 0.4868588188840104, "learning_rate": 5.008432306889652e-06, "loss": 0.5214, "step": 8505 }, { "epoch": 0.992304104477612, "grad_norm": 0.438360158275501, "learning_rate": 5.0072865385034785e-06, "loss": 0.4905, "step": 8510 }, { "epoch": 0.9928871268656716, "grad_norm": 0.48340344564902316, "learning_rate": 5.006224385472242e-06, "loss": 0.4927, "step": 8515 }, { "epoch": 0.9934701492537313, "grad_norm": 0.45531955111325645, "learning_rate": 5.0052458517444364e-06, "loss": 0.4888, "step": 8520 }, { "epoch": 0.9940531716417911, "grad_norm": 0.4362698593831642, "learning_rate": 5.004350940957703e-06, "loss": 0.4818, "step": 8525 }, { "epoch": 0.9946361940298507, "grad_norm": 0.46810128935885154, "learning_rate": 5.0035396564388184e-06, "loss": 0.5101, "step": 8530 }, { "epoch": 0.9952192164179104, "grad_norm": 0.8386528644620809, "learning_rate": 5.00281200120369e-06, "loss": 0.4791, "step": 8535 }, { "epoch": 0.9958022388059702, "grad_norm": 0.4834359055420526, "learning_rate": 5.00216797795733e-06, "loss": 0.4913, "step": 8540 }, { "epoch": 0.9963852611940298, "grad_norm": 0.4925810326491755, "learning_rate": 5.001607589093861e-06, "loss": 0.5076, "step": 8545 }, { "epoch": 0.9969682835820896, "grad_norm": 0.44173971825287517, "learning_rate": 5.001130836696491e-06, "loss": 0.4809, "step": 8550 }, { "epoch": 0.9975513059701493, "grad_norm": 0.48077397748529965, "learning_rate": 5.000737722537526e-06, "loss": 0.4974, "step": 8555 }, { "epoch": 0.9981343283582089, "grad_norm": 0.6138381620178532, "learning_rate": 5.00042824807834e-06, "loss": 0.4925, "step": 8560 }, { "epoch": 0.9987173507462687, "grad_norm": 0.43738249534157836, "learning_rate": 5.000202414469386e-06, "loss": 0.4573, "step": 8565 }, { "epoch": 0.9993003731343284, "grad_norm": 0.4562639975448534, "learning_rate": 5.0000602225501925e-06, "loss": 0.4941, "step": 8570 }, { "epoch": 0.999883395522388, "grad_norm": 0.4592908023435152, "learning_rate": 5.0000016728493425e-06, "loss": 0.4962, "step": 8575 }, { "epoch": 1.0004664179104477, "grad_norm": 0.47426058627957746, "learning_rate": 4.728485497099385e-05, "loss": 0.4391, "step": 8580 }, { "epoch": 1.0010494402985075, "grad_norm": 0.5320004656516157, "learning_rate": 4.728072178445397e-05, "loss": 0.4522, "step": 8585 }, { "epoch": 1.0016324626865671, "grad_norm": 0.5024183427100246, "learning_rate": 4.7276585656710295e-05, "loss": 0.4439, "step": 8590 }, { "epoch": 1.0022154850746268, "grad_norm": 0.5574190311460309, "learning_rate": 4.7272446588377794e-05, "loss": 0.4286, "step": 8595 }, { "epoch": 1.0027985074626866, "grad_norm": 0.48789901216879533, "learning_rate": 4.726830458007194e-05, "loss": 0.4493, "step": 8600 }, { "epoch": 1.0033815298507462, "grad_norm": 0.5860716293937468, "learning_rate": 4.7264159632408603e-05, "loss": 0.4677, "step": 8605 }, { "epoch": 1.0039645522388059, "grad_norm": 0.5517100367748053, "learning_rate": 4.72600117460041e-05, "loss": 0.4521, "step": 8610 }, { "epoch": 1.0045475746268657, "grad_norm": 0.486171634292782, "learning_rate": 4.7255860921475156e-05, "loss": 0.485, "step": 8615 }, { "epoch": 1.0051305970149254, "grad_norm": 0.531279693662836, "learning_rate": 4.725170715943898e-05, "loss": 0.4816, "step": 8620 }, { "epoch": 1.005713619402985, "grad_norm": 0.5180505587047434, "learning_rate": 4.724755046051321e-05, "loss": 0.4304, "step": 8625 }, { "epoch": 1.0062966417910448, "grad_norm": 0.5284778037290939, "learning_rate": 4.724339082531588e-05, "loss": 0.4632, "step": 8630 }, { "epoch": 1.0068796641791045, "grad_norm": 0.5374534908810564, "learning_rate": 4.72392282544655e-05, "loss": 0.4742, "step": 8635 }, { "epoch": 1.007462686567164, "grad_norm": 0.49462912725198666, "learning_rate": 4.723506274858101e-05, "loss": 0.4861, "step": 8640 }, { "epoch": 1.008045708955224, "grad_norm": 0.487854302513415, "learning_rate": 4.723089430828177e-05, "loss": 0.4928, "step": 8645 }, { "epoch": 1.0086287313432836, "grad_norm": 0.5019200031208862, "learning_rate": 4.722672293418759e-05, "loss": 0.493, "step": 8650 }, { "epoch": 1.0092117537313432, "grad_norm": 0.582688345784807, "learning_rate": 4.722254862691871e-05, "loss": 0.5059, "step": 8655 }, { "epoch": 1.009794776119403, "grad_norm": 0.5238946658768917, "learning_rate": 4.721837138709582e-05, "loss": 0.5027, "step": 8660 }, { "epoch": 1.0103777985074627, "grad_norm": 0.5185530471749461, "learning_rate": 4.7214191215340036e-05, "loss": 0.5167, "step": 8665 }, { "epoch": 1.0109608208955223, "grad_norm": 0.5974306068998476, "learning_rate": 4.7210008112272895e-05, "loss": 0.4869, "step": 8670 }, { "epoch": 1.0115438432835822, "grad_norm": 0.6107926371037933, "learning_rate": 4.72058220785164e-05, "loss": 0.5024, "step": 8675 }, { "epoch": 1.0121268656716418, "grad_norm": 0.5282921823444569, "learning_rate": 4.720163311469296e-05, "loss": 0.4619, "step": 8680 }, { "epoch": 1.0127098880597014, "grad_norm": 0.6003339223461499, "learning_rate": 4.7197441221425446e-05, "loss": 0.5173, "step": 8685 }, { "epoch": 1.0132929104477613, "grad_norm": 0.4730173461537816, "learning_rate": 4.7193246399337146e-05, "loss": 0.4813, "step": 8690 }, { "epoch": 1.013875932835821, "grad_norm": 0.5071456563461645, "learning_rate": 4.7189048649051786e-05, "loss": 0.4793, "step": 8695 }, { "epoch": 1.0144589552238805, "grad_norm": 0.5015048651689884, "learning_rate": 4.718484797119355e-05, "loss": 0.5245, "step": 8700 }, { "epoch": 1.0150419776119404, "grad_norm": 0.526074227917695, "learning_rate": 4.718064436638701e-05, "loss": 0.475, "step": 8705 }, { "epoch": 1.015625, "grad_norm": 0.5009454385549627, "learning_rate": 4.717643783525722e-05, "loss": 0.4524, "step": 8710 }, { "epoch": 1.0162080223880596, "grad_norm": 0.5373569510563955, "learning_rate": 4.717222837842965e-05, "loss": 0.4908, "step": 8715 }, { "epoch": 1.0167910447761195, "grad_norm": 0.4843490024297434, "learning_rate": 4.7168015996530204e-05, "loss": 0.4695, "step": 8720 }, { "epoch": 1.017374067164179, "grad_norm": 0.4665154044423995, "learning_rate": 4.716380069018523e-05, "loss": 0.4744, "step": 8725 }, { "epoch": 1.0179570895522387, "grad_norm": 0.5304849338647405, "learning_rate": 4.715958246002148e-05, "loss": 0.4814, "step": 8730 }, { "epoch": 1.0185401119402986, "grad_norm": 0.48442401715883604, "learning_rate": 4.715536130666619e-05, "loss": 0.4741, "step": 8735 }, { "epoch": 1.0191231343283582, "grad_norm": 0.5382707553704985, "learning_rate": 4.715113723074699e-05, "loss": 0.4884, "step": 8740 }, { "epoch": 1.0197061567164178, "grad_norm": 0.5264939747473608, "learning_rate": 4.7146910232891975e-05, "loss": 0.4908, "step": 8745 }, { "epoch": 1.0202891791044777, "grad_norm": 0.46111084467153807, "learning_rate": 4.714268031372964e-05, "loss": 0.4963, "step": 8750 }, { "epoch": 1.0208722014925373, "grad_norm": 0.5123460394242938, "learning_rate": 4.7138447473888945e-05, "loss": 0.452, "step": 8755 }, { "epoch": 1.021455223880597, "grad_norm": 0.468514834125539, "learning_rate": 4.7134211713999264e-05, "loss": 0.4578, "step": 8760 }, { "epoch": 1.0220382462686568, "grad_norm": 0.4684210782948509, "learning_rate": 4.712997303469043e-05, "loss": 0.4705, "step": 8765 }, { "epoch": 1.0226212686567164, "grad_norm": 0.5073812307209864, "learning_rate": 4.712573143659268e-05, "loss": 0.5215, "step": 8770 }, { "epoch": 1.023204291044776, "grad_norm": 0.5322751538831147, "learning_rate": 4.71214869203367e-05, "loss": 0.5065, "step": 8775 }, { "epoch": 1.023787313432836, "grad_norm": 0.49987343777690785, "learning_rate": 4.711723948655362e-05, "loss": 0.4676, "step": 8780 }, { "epoch": 1.0243703358208955, "grad_norm": 0.5284446960377239, "learning_rate": 4.711298913587497e-05, "loss": 0.4509, "step": 8785 }, { "epoch": 1.0249533582089552, "grad_norm": 0.47657330330882275, "learning_rate": 4.710873586893276e-05, "loss": 0.4871, "step": 8790 }, { "epoch": 1.025536380597015, "grad_norm": 0.4842840362473736, "learning_rate": 4.710447968635941e-05, "loss": 0.4951, "step": 8795 }, { "epoch": 1.0261194029850746, "grad_norm": 0.4510225196671863, "learning_rate": 4.7100220588787755e-05, "loss": 0.4686, "step": 8800 }, { "epoch": 1.0267024253731343, "grad_norm": 0.49756046747429755, "learning_rate": 4.709595857685109e-05, "loss": 0.5167, "step": 8805 }, { "epoch": 1.0272854477611941, "grad_norm": 0.4744972653076067, "learning_rate": 4.7091693651183144e-05, "loss": 0.487, "step": 8810 }, { "epoch": 1.0278684701492538, "grad_norm": 0.5150713800815724, "learning_rate": 4.708742581241806e-05, "loss": 0.5044, "step": 8815 }, { "epoch": 1.0284514925373134, "grad_norm": 0.557518317918137, "learning_rate": 4.7083155061190426e-05, "loss": 0.5088, "step": 8820 }, { "epoch": 1.0290345149253732, "grad_norm": 0.5309722610343153, "learning_rate": 4.707888139813527e-05, "loss": 0.4847, "step": 8825 }, { "epoch": 1.0296175373134329, "grad_norm": 0.5018724058089924, "learning_rate": 4.707460482388804e-05, "loss": 0.4847, "step": 8830 }, { "epoch": 1.0302005597014925, "grad_norm": 0.5097255324921017, "learning_rate": 4.707032533908461e-05, "loss": 0.4737, "step": 8835 }, { "epoch": 1.0307835820895523, "grad_norm": 0.6170849116718103, "learning_rate": 4.706604294436132e-05, "loss": 0.4815, "step": 8840 }, { "epoch": 1.031366604477612, "grad_norm": 0.5081805654355479, "learning_rate": 4.706175764035491e-05, "loss": 0.519, "step": 8845 }, { "epoch": 1.0319496268656716, "grad_norm": 0.5130221033518372, "learning_rate": 4.705746942770255e-05, "loss": 0.4659, "step": 8850 }, { "epoch": 1.0325326492537314, "grad_norm": 0.5066243868822757, "learning_rate": 4.7053178307041886e-05, "loss": 0.4911, "step": 8855 }, { "epoch": 1.033115671641791, "grad_norm": 0.4661575061349955, "learning_rate": 4.704888427901094e-05, "loss": 0.4657, "step": 8860 }, { "epoch": 1.0336986940298507, "grad_norm": 0.47132915772418305, "learning_rate": 4.704458734424821e-05, "loss": 0.512, "step": 8865 }, { "epoch": 1.0342817164179103, "grad_norm": 0.4675671747669863, "learning_rate": 4.70402875033926e-05, "loss": 0.4833, "step": 8870 }, { "epoch": 1.0348647388059702, "grad_norm": 0.5619087684145831, "learning_rate": 4.7035984757083454e-05, "loss": 0.5305, "step": 8875 }, { "epoch": 1.0354477611940298, "grad_norm": 0.49373872507593697, "learning_rate": 4.703167910596055e-05, "loss": 0.5055, "step": 8880 }, { "epoch": 1.0360307835820897, "grad_norm": 0.47489827739888235, "learning_rate": 4.70273705506641e-05, "loss": 0.4775, "step": 8885 }, { "epoch": 1.0366138059701493, "grad_norm": 0.4971757998829849, "learning_rate": 4.702305909183475e-05, "loss": 0.4826, "step": 8890 }, { "epoch": 1.037196828358209, "grad_norm": 0.5422128300971927, "learning_rate": 4.7018744730113565e-05, "loss": 0.5111, "step": 8895 }, { "epoch": 1.0377798507462686, "grad_norm": 0.5012066261927423, "learning_rate": 4.701442746614206e-05, "loss": 0.5076, "step": 8900 }, { "epoch": 1.0383628731343284, "grad_norm": 0.5258269384569444, "learning_rate": 4.701010730056216e-05, "loss": 0.4817, "step": 8905 }, { "epoch": 1.038945895522388, "grad_norm": 0.5029440378162809, "learning_rate": 4.700578423401622e-05, "loss": 0.4841, "step": 8910 }, { "epoch": 1.0395289179104477, "grad_norm": 0.5032231245192578, "learning_rate": 4.7001458267147055e-05, "loss": 0.4596, "step": 8915 }, { "epoch": 1.0401119402985075, "grad_norm": 0.5668474501893467, "learning_rate": 4.699712940059791e-05, "loss": 0.5245, "step": 8920 }, { "epoch": 1.0406949626865671, "grad_norm": 0.7205300131824032, "learning_rate": 4.6992797635012415e-05, "loss": 0.4943, "step": 8925 }, { "epoch": 1.0412779850746268, "grad_norm": 0.46430225487702537, "learning_rate": 4.6988462971034676e-05, "loss": 0.4969, "step": 8930 }, { "epoch": 1.0418610074626866, "grad_norm": 0.508570057790338, "learning_rate": 4.698412540930922e-05, "loss": 0.4993, "step": 8935 }, { "epoch": 1.0424440298507462, "grad_norm": 0.4725765308686987, "learning_rate": 4.697978495048099e-05, "loss": 0.4678, "step": 8940 }, { "epoch": 1.0430270522388059, "grad_norm": 0.5665033300096499, "learning_rate": 4.697544159519539e-05, "loss": 0.4983, "step": 8945 }, { "epoch": 1.0436100746268657, "grad_norm": 0.5501878853405641, "learning_rate": 4.697109534409821e-05, "loss": 0.5027, "step": 8950 }, { "epoch": 1.0441930970149254, "grad_norm": 0.5139618047452844, "learning_rate": 4.696674619783572e-05, "loss": 0.4927, "step": 8955 }, { "epoch": 1.044776119402985, "grad_norm": 0.5540540968653056, "learning_rate": 4.696239415705458e-05, "loss": 0.5114, "step": 8960 }, { "epoch": 1.0453591417910448, "grad_norm": 0.5196349914754745, "learning_rate": 4.69580392224019e-05, "loss": 0.4993, "step": 8965 }, { "epoch": 1.0459421641791045, "grad_norm": 0.6616451345728891, "learning_rate": 4.695368139452521e-05, "loss": 0.472, "step": 8970 }, { "epoch": 1.046525186567164, "grad_norm": 0.4858247139067617, "learning_rate": 4.69493206740725e-05, "loss": 0.515, "step": 8975 }, { "epoch": 1.047108208955224, "grad_norm": 0.48350960828960377, "learning_rate": 4.694495706169214e-05, "loss": 0.4762, "step": 8980 }, { "epoch": 1.0476912313432836, "grad_norm": 0.5122511024079021, "learning_rate": 4.6940590558032985e-05, "loss": 0.4952, "step": 8985 }, { "epoch": 1.0482742537313432, "grad_norm": 1.3959823952147061, "learning_rate": 4.693622116374427e-05, "loss": 0.5212, "step": 8990 }, { "epoch": 1.048857276119403, "grad_norm": 0.5401382717722655, "learning_rate": 4.693184887947569e-05, "loss": 0.4941, "step": 8995 }, { "epoch": 1.0494402985074627, "grad_norm": 0.4507778073049574, "learning_rate": 4.692747370587737e-05, "loss": 0.499, "step": 9000 }, { "epoch": 1.0500233208955223, "grad_norm": 0.48797343069389976, "learning_rate": 4.692309564359984e-05, "loss": 0.461, "step": 9005 }, { "epoch": 1.0506063432835822, "grad_norm": 0.47561161106654043, "learning_rate": 4.691871469329408e-05, "loss": 0.5224, "step": 9010 }, { "epoch": 1.0511893656716418, "grad_norm": 0.5634009367566019, "learning_rate": 4.69143308556115e-05, "loss": 0.4768, "step": 9015 }, { "epoch": 1.0517723880597014, "grad_norm": 0.49734215708222496, "learning_rate": 4.690994413120394e-05, "loss": 0.4823, "step": 9020 }, { "epoch": 1.0523554104477613, "grad_norm": 0.4665279232609275, "learning_rate": 4.690555452072366e-05, "loss": 0.4763, "step": 9025 }, { "epoch": 1.052938432835821, "grad_norm": 0.5190852875616855, "learning_rate": 4.690116202482335e-05, "loss": 0.4974, "step": 9030 }, { "epoch": 1.0535214552238805, "grad_norm": 0.46566803498581916, "learning_rate": 4.689676664415613e-05, "loss": 0.4779, "step": 9035 }, { "epoch": 1.0541044776119404, "grad_norm": 0.5319093271984696, "learning_rate": 4.689236837937556e-05, "loss": 0.4909, "step": 9040 }, { "epoch": 1.0546875, "grad_norm": 0.4767808134962894, "learning_rate": 4.6887967231135605e-05, "loss": 0.5057, "step": 9045 }, { "epoch": 1.0552705223880596, "grad_norm": 0.47198420277762565, "learning_rate": 4.688356320009069e-05, "loss": 0.5153, "step": 9050 }, { "epoch": 1.0558535447761195, "grad_norm": 0.4972543814636995, "learning_rate": 4.687915628689564e-05, "loss": 0.5068, "step": 9055 }, { "epoch": 1.056436567164179, "grad_norm": 0.5146050784315456, "learning_rate": 4.687474649220573e-05, "loss": 0.4925, "step": 9060 }, { "epoch": 1.0570195895522387, "grad_norm": 0.5259065163440027, "learning_rate": 4.687033381667664e-05, "loss": 0.5216, "step": 9065 }, { "epoch": 1.0576026119402986, "grad_norm": 0.4843993142938887, "learning_rate": 4.6865918260964506e-05, "loss": 0.4815, "step": 9070 }, { "epoch": 1.0581856343283582, "grad_norm": 0.48264634391304173, "learning_rate": 4.6861499825725877e-05, "loss": 0.472, "step": 9075 }, { "epoch": 1.0587686567164178, "grad_norm": 0.4916707467335896, "learning_rate": 4.685707851161773e-05, "loss": 0.5039, "step": 9080 }, { "epoch": 1.0593516791044777, "grad_norm": 0.5395782527592068, "learning_rate": 4.685265431929747e-05, "loss": 0.5192, "step": 9085 }, { "epoch": 1.0599347014925373, "grad_norm": 0.48955754808478563, "learning_rate": 4.6848227249422936e-05, "loss": 0.4976, "step": 9090 }, { "epoch": 1.060517723880597, "grad_norm": 0.5173966026954939, "learning_rate": 4.684379730265239e-05, "loss": 0.4904, "step": 9095 }, { "epoch": 1.0611007462686568, "grad_norm": 0.5586831591167815, "learning_rate": 4.683936447964452e-05, "loss": 0.5159, "step": 9100 }, { "epoch": 1.0616837686567164, "grad_norm": 0.5306411920252676, "learning_rate": 4.683492878105844e-05, "loss": 0.5023, "step": 9105 }, { "epoch": 1.062266791044776, "grad_norm": 0.5394987090128364, "learning_rate": 4.683049020755372e-05, "loss": 0.5145, "step": 9110 }, { "epoch": 1.062849813432836, "grad_norm": 0.49914789057311915, "learning_rate": 4.6826048759790295e-05, "loss": 0.4957, "step": 9115 }, { "epoch": 1.0634328358208955, "grad_norm": 0.5095562857323067, "learning_rate": 4.6821604438428594e-05, "loss": 0.52, "step": 9120 }, { "epoch": 1.0640158582089552, "grad_norm": 0.4947345455878514, "learning_rate": 4.6817157244129436e-05, "loss": 0.5119, "step": 9125 }, { "epoch": 1.064598880597015, "grad_norm": 0.49090098726653636, "learning_rate": 4.681270717755409e-05, "loss": 0.5096, "step": 9130 }, { "epoch": 1.0651819029850746, "grad_norm": 0.5338327678334841, "learning_rate": 4.680825423936421e-05, "loss": 0.4926, "step": 9135 }, { "epoch": 1.0657649253731343, "grad_norm": 0.5090081376265945, "learning_rate": 4.680379843022192e-05, "loss": 0.4686, "step": 9140 }, { "epoch": 1.0663479477611941, "grad_norm": 0.44239826305309765, "learning_rate": 4.679933975078976e-05, "loss": 0.4555, "step": 9145 }, { "epoch": 1.0669309701492538, "grad_norm": 0.45188824230008673, "learning_rate": 4.679487820173069e-05, "loss": 0.4937, "step": 9150 }, { "epoch": 1.0675139925373134, "grad_norm": 0.49566573297809374, "learning_rate": 4.67904137837081e-05, "loss": 0.452, "step": 9155 }, { "epoch": 1.0680970149253732, "grad_norm": 0.5076839916250646, "learning_rate": 4.678594649738581e-05, "loss": 0.4913, "step": 9160 }, { "epoch": 1.0686800373134329, "grad_norm": 0.4373870137902814, "learning_rate": 4.678147634342805e-05, "loss": 0.4681, "step": 9165 }, { "epoch": 1.0692630597014925, "grad_norm": 0.4800654095382689, "learning_rate": 4.67770033224995e-05, "loss": 0.4948, "step": 9170 }, { "epoch": 1.0698460820895523, "grad_norm": 0.46964119819112704, "learning_rate": 4.677252743526525e-05, "loss": 0.4559, "step": 9175 }, { "epoch": 1.070429104477612, "grad_norm": 0.49650644344484723, "learning_rate": 4.676804868239083e-05, "loss": 0.4652, "step": 9180 }, { "epoch": 1.0710121268656716, "grad_norm": 0.8449820819897319, "learning_rate": 4.676356706454217e-05, "loss": 0.4713, "step": 9185 }, { "epoch": 1.0715951492537314, "grad_norm": 0.48019854470117596, "learning_rate": 4.675908258238567e-05, "loss": 0.501, "step": 9190 }, { "epoch": 1.072178171641791, "grad_norm": 0.510626881014733, "learning_rate": 4.6754595236588114e-05, "loss": 0.4853, "step": 9195 }, { "epoch": 1.0727611940298507, "grad_norm": 0.4874317455515729, "learning_rate": 4.6750105027816716e-05, "loss": 0.5114, "step": 9200 }, { "epoch": 1.0733442164179103, "grad_norm": 0.49008478006262846, "learning_rate": 4.674561195673914e-05, "loss": 0.5026, "step": 9205 }, { "epoch": 1.0739272388059702, "grad_norm": 0.45606704402648035, "learning_rate": 4.6741116024023476e-05, "loss": 0.4815, "step": 9210 }, { "epoch": 1.0745102611940298, "grad_norm": 0.5470864580462397, "learning_rate": 4.6736617230338205e-05, "loss": 0.5029, "step": 9215 }, { "epoch": 1.0750932835820897, "grad_norm": 1.0582793581481065, "learning_rate": 4.673211557635225e-05, "loss": 0.4717, "step": 9220 }, { "epoch": 1.0756763059701493, "grad_norm": 0.5869186280878189, "learning_rate": 4.6727611062734994e-05, "loss": 0.5123, "step": 9225 }, { "epoch": 1.076259328358209, "grad_norm": 0.48811171212689763, "learning_rate": 4.672310369015619e-05, "loss": 0.5022, "step": 9230 }, { "epoch": 1.0768423507462686, "grad_norm": 0.4379874815657095, "learning_rate": 4.6718593459286036e-05, "loss": 0.4868, "step": 9235 }, { "epoch": 1.0774253731343284, "grad_norm": 0.536314202527189, "learning_rate": 4.671408037079519e-05, "loss": 0.5014, "step": 9240 }, { "epoch": 1.078008395522388, "grad_norm": 0.455274704274001, "learning_rate": 4.670956442535467e-05, "loss": 0.4843, "step": 9245 }, { "epoch": 1.0785914179104477, "grad_norm": 0.4838644013521913, "learning_rate": 4.670504562363598e-05, "loss": 0.4902, "step": 9250 }, { "epoch": 1.0791744402985075, "grad_norm": 0.4735942629468596, "learning_rate": 4.6700523966311e-05, "loss": 0.536, "step": 9255 }, { "epoch": 1.0797574626865671, "grad_norm": 0.5403926324710048, "learning_rate": 4.669599945405208e-05, "loss": 0.5086, "step": 9260 }, { "epoch": 1.0803404850746268, "grad_norm": 0.43208196517726205, "learning_rate": 4.6691472087531967e-05, "loss": 0.4789, "step": 9265 }, { "epoch": 1.0809235074626866, "grad_norm": 0.4463915398795724, "learning_rate": 4.668694186742383e-05, "loss": 0.4841, "step": 9270 }, { "epoch": 1.0815065298507462, "grad_norm": 0.4637201156142629, "learning_rate": 4.668240879440127e-05, "loss": 0.4747, "step": 9275 }, { "epoch": 1.0820895522388059, "grad_norm": 0.5083155084770778, "learning_rate": 4.6677872869138304e-05, "loss": 0.5122, "step": 9280 }, { "epoch": 1.0826725746268657, "grad_norm": 0.4567415229261271, "learning_rate": 4.66733340923094e-05, "loss": 0.4732, "step": 9285 }, { "epoch": 1.0832555970149254, "grad_norm": 0.46826732560954215, "learning_rate": 4.666879246458941e-05, "loss": 0.5041, "step": 9290 }, { "epoch": 1.083838619402985, "grad_norm": 0.4764779152646372, "learning_rate": 4.6664247986653645e-05, "loss": 0.5059, "step": 9295 }, { "epoch": 1.0844216417910448, "grad_norm": 0.46620112639686195, "learning_rate": 4.6659700659177814e-05, "loss": 0.4608, "step": 9300 }, { "epoch": 1.0850046641791045, "grad_norm": 0.4915813706267885, "learning_rate": 4.665515048283808e-05, "loss": 0.4997, "step": 9305 }, { "epoch": 1.085587686567164, "grad_norm": 0.43796257819202183, "learning_rate": 4.665059745831098e-05, "loss": 0.4994, "step": 9310 }, { "epoch": 1.086170708955224, "grad_norm": 0.5151808827501788, "learning_rate": 4.664604158627355e-05, "loss": 0.4713, "step": 9315 }, { "epoch": 1.0867537313432836, "grad_norm": 0.49502837394742893, "learning_rate": 4.6641482867403156e-05, "loss": 0.5048, "step": 9320 }, { "epoch": 1.0873367537313432, "grad_norm": 0.4620786512707241, "learning_rate": 4.663692130237766e-05, "loss": 0.4756, "step": 9325 }, { "epoch": 1.087919776119403, "grad_norm": 0.4758291599811604, "learning_rate": 4.6632356891875336e-05, "loss": 0.5135, "step": 9330 }, { "epoch": 1.0885027985074627, "grad_norm": 0.47571548818133175, "learning_rate": 4.6627789636574836e-05, "loss": 0.4952, "step": 9335 }, { "epoch": 1.0890858208955223, "grad_norm": 0.474514825567827, "learning_rate": 4.662321953715529e-05, "loss": 0.4858, "step": 9340 }, { "epoch": 1.0896688432835822, "grad_norm": 0.469298753664266, "learning_rate": 4.6618646594296226e-05, "loss": 0.4772, "step": 9345 }, { "epoch": 1.0902518656716418, "grad_norm": 0.46481163188836494, "learning_rate": 4.661407080867759e-05, "loss": 0.4887, "step": 9350 }, { "epoch": 1.0908348880597014, "grad_norm": 0.5020235935240444, "learning_rate": 4.6609492180979756e-05, "loss": 0.4904, "step": 9355 }, { "epoch": 1.0914179104477613, "grad_norm": 0.47503692296447025, "learning_rate": 4.660491071188353e-05, "loss": 0.5197, "step": 9360 }, { "epoch": 1.092000932835821, "grad_norm": 0.5448342002718358, "learning_rate": 4.660032640207013e-05, "loss": 0.5347, "step": 9365 }, { "epoch": 1.0925839552238805, "grad_norm": 0.472936178069635, "learning_rate": 4.6595739252221196e-05, "loss": 0.5326, "step": 9370 }, { "epoch": 1.0931669776119404, "grad_norm": 0.48811140194669395, "learning_rate": 4.6591149263018796e-05, "loss": 0.5094, "step": 9375 }, { "epoch": 1.09375, "grad_norm": 0.46926160934689126, "learning_rate": 4.658655643514541e-05, "loss": 0.4928, "step": 9380 }, { "epoch": 1.0943330223880596, "grad_norm": 0.4603620397682986, "learning_rate": 4.6581960769283955e-05, "loss": 0.4901, "step": 9385 }, { "epoch": 1.0949160447761195, "grad_norm": 0.4775351626053154, "learning_rate": 4.657736226611778e-05, "loss": 0.5341, "step": 9390 }, { "epoch": 1.095499067164179, "grad_norm": 0.5354532994408574, "learning_rate": 4.657276092633059e-05, "loss": 0.5089, "step": 9395 }, { "epoch": 1.0960820895522387, "grad_norm": 0.46464617476535186, "learning_rate": 4.656815675060662e-05, "loss": 0.485, "step": 9400 }, { "epoch": 1.0966651119402986, "grad_norm": 0.4646395350154372, "learning_rate": 4.656354973963041e-05, "loss": 0.4913, "step": 9405 }, { "epoch": 1.0972481343283582, "grad_norm": 0.5303126801182483, "learning_rate": 4.655893989408702e-05, "loss": 0.5052, "step": 9410 }, { "epoch": 1.0978311567164178, "grad_norm": 0.4792925399666974, "learning_rate": 4.6554327214661876e-05, "loss": 0.4784, "step": 9415 }, { "epoch": 1.0984141791044777, "grad_norm": 0.48310458072243645, "learning_rate": 4.654971170204083e-05, "loss": 0.4818, "step": 9420 }, { "epoch": 1.0989972014925373, "grad_norm": 0.49198574089775615, "learning_rate": 4.654509335691018e-05, "loss": 0.478, "step": 9425 }, { "epoch": 1.099580223880597, "grad_norm": 0.4933335405522241, "learning_rate": 4.6540472179956625e-05, "loss": 0.5031, "step": 9430 }, { "epoch": 1.1001632462686568, "grad_norm": 0.45486513478535884, "learning_rate": 4.6535848171867275e-05, "loss": 0.5033, "step": 9435 }, { "epoch": 1.1007462686567164, "grad_norm": 0.49315534436056047, "learning_rate": 4.6531221333329694e-05, "loss": 0.5081, "step": 9440 }, { "epoch": 1.101329291044776, "grad_norm": 0.4587755290744345, "learning_rate": 4.652659166503184e-05, "loss": 0.481, "step": 9445 }, { "epoch": 1.101912313432836, "grad_norm": 0.4697565098870484, "learning_rate": 4.652195916766211e-05, "loss": 0.4809, "step": 9450 }, { "epoch": 1.1024953358208955, "grad_norm": 0.5226619085445289, "learning_rate": 4.651732384190929e-05, "loss": 0.4955, "step": 9455 }, { "epoch": 1.1030783582089552, "grad_norm": 0.48377912133133427, "learning_rate": 4.6512685688462645e-05, "loss": 0.5489, "step": 9460 }, { "epoch": 1.103661380597015, "grad_norm": 0.5086592264829859, "learning_rate": 4.6508044708011784e-05, "loss": 0.4849, "step": 9465 }, { "epoch": 1.1042444029850746, "grad_norm": 0.4609475650380094, "learning_rate": 4.65034009012468e-05, "loss": 0.4928, "step": 9470 }, { "epoch": 1.1048274253731343, "grad_norm": 0.4959771192749763, "learning_rate": 4.649875426885818e-05, "loss": 0.5119, "step": 9475 }, { "epoch": 1.1054104477611941, "grad_norm": 0.48799886078119564, "learning_rate": 4.649410481153683e-05, "loss": 0.4918, "step": 9480 }, { "epoch": 1.1059934701492538, "grad_norm": 0.6184568571875035, "learning_rate": 4.648945252997407e-05, "loss": 0.4846, "step": 9485 }, { "epoch": 1.1065764925373134, "grad_norm": 0.5193725706342754, "learning_rate": 4.6484797424861675e-05, "loss": 0.4931, "step": 9490 }, { "epoch": 1.1071595149253732, "grad_norm": 0.44442788847116255, "learning_rate": 4.6480139496891796e-05, "loss": 0.4562, "step": 9495 }, { "epoch": 1.1077425373134329, "grad_norm": 0.4449516559335734, "learning_rate": 4.6475478746757025e-05, "loss": 0.5007, "step": 9500 }, { "epoch": 1.1083255597014925, "grad_norm": 0.5289263524242726, "learning_rate": 4.6470815175150364e-05, "loss": 0.5147, "step": 9505 }, { "epoch": 1.1089085820895523, "grad_norm": 0.4972928998457614, "learning_rate": 4.646614878276526e-05, "loss": 0.5089, "step": 9510 }, { "epoch": 1.109491604477612, "grad_norm": 0.446913491949988, "learning_rate": 4.646147957029555e-05, "loss": 0.4915, "step": 9515 }, { "epoch": 1.1100746268656716, "grad_norm": 0.4775093930913278, "learning_rate": 4.64568075384355e-05, "loss": 0.4999, "step": 9520 }, { "epoch": 1.1106576492537314, "grad_norm": 0.44866621767628573, "learning_rate": 4.6452132687879796e-05, "loss": 0.4638, "step": 9525 }, { "epoch": 1.111240671641791, "grad_norm": 0.46156410846716345, "learning_rate": 4.644745501932355e-05, "loss": 0.48, "step": 9530 }, { "epoch": 1.1118236940298507, "grad_norm": 0.4564165396246152, "learning_rate": 4.644277453346227e-05, "loss": 0.4955, "step": 9535 }, { "epoch": 1.1124067164179103, "grad_norm": 0.48881473046306984, "learning_rate": 4.643809123099192e-05, "loss": 0.525, "step": 9540 }, { "epoch": 1.1129897388059702, "grad_norm": 0.5063410142886016, "learning_rate": 4.6433405112608845e-05, "loss": 0.5017, "step": 9545 }, { "epoch": 1.1135727611940298, "grad_norm": 0.4634913419143995, "learning_rate": 4.6428716179009844e-05, "loss": 0.4923, "step": 9550 }, { "epoch": 1.1141557835820897, "grad_norm": 0.44747230770129015, "learning_rate": 4.6424024430892105e-05, "loss": 0.4753, "step": 9555 }, { "epoch": 1.1147388059701493, "grad_norm": 0.5267843304204154, "learning_rate": 4.641932986895325e-05, "loss": 0.4864, "step": 9560 }, { "epoch": 1.115321828358209, "grad_norm": 0.49303473319964075, "learning_rate": 4.64146324938913e-05, "loss": 0.5239, "step": 9565 }, { "epoch": 1.1159048507462686, "grad_norm": 0.4536781686999443, "learning_rate": 4.6409932306404735e-05, "loss": 0.4804, "step": 9570 }, { "epoch": 1.1164878731343284, "grad_norm": 0.5465837228080015, "learning_rate": 4.640522930719241e-05, "loss": 0.5159, "step": 9575 }, { "epoch": 1.117070895522388, "grad_norm": 0.4206690194245236, "learning_rate": 4.640052349695363e-05, "loss": 0.44, "step": 9580 }, { "epoch": 1.1176539179104477, "grad_norm": 0.46474965912968363, "learning_rate": 4.6395814876388086e-05, "loss": 0.483, "step": 9585 }, { "epoch": 1.1182369402985075, "grad_norm": 0.49977594667791597, "learning_rate": 4.6391103446195915e-05, "loss": 0.4804, "step": 9590 }, { "epoch": 1.1188199626865671, "grad_norm": 0.4470192727431772, "learning_rate": 4.6386389207077665e-05, "loss": 0.482, "step": 9595 }, { "epoch": 1.1194029850746268, "grad_norm": 0.4619398096288415, "learning_rate": 4.6381672159734287e-05, "loss": 0.4891, "step": 9600 }, { "epoch": 1.1199860074626866, "grad_norm": 0.45123039183594194, "learning_rate": 4.637695230486718e-05, "loss": 0.5042, "step": 9605 }, { "epoch": 1.1205690298507462, "grad_norm": 0.7988979042706097, "learning_rate": 4.637222964317811e-05, "loss": 0.5056, "step": 9610 }, { "epoch": 1.1211520522388059, "grad_norm": 0.48769226712085023, "learning_rate": 4.6367504175369326e-05, "loss": 0.4817, "step": 9615 }, { "epoch": 1.1217350746268657, "grad_norm": 0.45837775913119466, "learning_rate": 4.636277590214344e-05, "loss": 0.4797, "step": 9620 }, { "epoch": 1.1223180970149254, "grad_norm": 0.4736479089789474, "learning_rate": 4.63580448242035e-05, "loss": 0.4854, "step": 9625 }, { "epoch": 1.122901119402985, "grad_norm": 0.4734355258184259, "learning_rate": 4.6353310942252986e-05, "loss": 0.5026, "step": 9630 }, { "epoch": 1.1234841417910448, "grad_norm": 0.4531022842477336, "learning_rate": 4.634857425699577e-05, "loss": 0.4939, "step": 9635 }, { "epoch": 1.1240671641791045, "grad_norm": 0.45650809373210965, "learning_rate": 4.634383476913615e-05, "loss": 0.4905, "step": 9640 }, { "epoch": 1.124650186567164, "grad_norm": 0.5049363292458079, "learning_rate": 4.633909247937884e-05, "loss": 0.5124, "step": 9645 }, { "epoch": 1.125233208955224, "grad_norm": 0.4666273322582206, "learning_rate": 4.6334347388429e-05, "loss": 0.444, "step": 9650 }, { "epoch": 1.1258162313432836, "grad_norm": 0.44415022743546667, "learning_rate": 4.6329599496992145e-05, "loss": 0.4767, "step": 9655 }, { "epoch": 1.1263992537313432, "grad_norm": 0.4194373685239034, "learning_rate": 4.632484880577425e-05, "loss": 0.4825, "step": 9660 }, { "epoch": 1.126982276119403, "grad_norm": 0.46274489368091026, "learning_rate": 4.632009531548171e-05, "loss": 0.4918, "step": 9665 }, { "epoch": 1.1275652985074627, "grad_norm": 0.46087571366096425, "learning_rate": 4.6315339026821305e-05, "loss": 0.5047, "step": 9670 }, { "epoch": 1.1281483208955223, "grad_norm": 0.4785351679933754, "learning_rate": 4.631057994050027e-05, "loss": 0.4671, "step": 9675 }, { "epoch": 1.1287313432835822, "grad_norm": 0.4536972049315929, "learning_rate": 4.6305818057226226e-05, "loss": 0.5097, "step": 9680 }, { "epoch": 1.1293143656716418, "grad_norm": 0.4963284888796665, "learning_rate": 4.630105337770722e-05, "loss": 0.5098, "step": 9685 }, { "epoch": 1.1298973880597014, "grad_norm": 0.4814699797414588, "learning_rate": 4.62962859026517e-05, "loss": 0.5175, "step": 9690 }, { "epoch": 1.1304804104477613, "grad_norm": 0.4545111543543466, "learning_rate": 4.629151563276857e-05, "loss": 0.4986, "step": 9695 }, { "epoch": 1.131063432835821, "grad_norm": 0.5441314432893252, "learning_rate": 4.62867425687671e-05, "loss": 0.5291, "step": 9700 }, { "epoch": 1.1316464552238805, "grad_norm": 0.465387680919809, "learning_rate": 4.6281966711357014e-05, "loss": 0.5149, "step": 9705 }, { "epoch": 1.1322294776119404, "grad_norm": 0.4594632439309518, "learning_rate": 4.6277188061248436e-05, "loss": 0.4979, "step": 9710 }, { "epoch": 1.1328125, "grad_norm": 0.4946851483949794, "learning_rate": 4.6272406619151896e-05, "loss": 0.491, "step": 9715 }, { "epoch": 1.1333955223880596, "grad_norm": 0.47943186970197044, "learning_rate": 4.626762238577836e-05, "loss": 0.4894, "step": 9720 }, { "epoch": 1.1339785447761195, "grad_norm": 0.44710867224321554, "learning_rate": 4.626283536183918e-05, "loss": 0.5072, "step": 9725 }, { "epoch": 1.134561567164179, "grad_norm": 0.462289128611941, "learning_rate": 4.6258045548046166e-05, "loss": 0.4788, "step": 9730 }, { "epoch": 1.1351445895522387, "grad_norm": 0.4868105685200524, "learning_rate": 4.625325294511149e-05, "loss": 0.4868, "step": 9735 }, { "epoch": 1.1357276119402986, "grad_norm": 0.4790346195426802, "learning_rate": 4.624845755374779e-05, "loss": 0.5232, "step": 9740 }, { "epoch": 1.1363106343283582, "grad_norm": 0.5622037868349276, "learning_rate": 4.624365937466808e-05, "loss": 0.4915, "step": 9745 }, { "epoch": 1.1368936567164178, "grad_norm": 0.551869800635871, "learning_rate": 4.6238858408585804e-05, "loss": 0.4657, "step": 9750 }, { "epoch": 1.1374766791044777, "grad_norm": 0.550117149877168, "learning_rate": 4.623405465621483e-05, "loss": 0.493, "step": 9755 }, { "epoch": 1.1380597014925373, "grad_norm": 0.4748486725342485, "learning_rate": 4.622924811826942e-05, "loss": 0.5158, "step": 9760 }, { "epoch": 1.138642723880597, "grad_norm": 0.5332978068300538, "learning_rate": 4.622443879546426e-05, "loss": 0.5166, "step": 9765 }, { "epoch": 1.1392257462686568, "grad_norm": 0.47769296938995587, "learning_rate": 4.6219626688514456e-05, "loss": 0.5003, "step": 9770 }, { "epoch": 1.1398087686567164, "grad_norm": 0.4560542794837011, "learning_rate": 4.621481179813552e-05, "loss": 0.4695, "step": 9775 }, { "epoch": 1.140391791044776, "grad_norm": 0.5074775456178793, "learning_rate": 4.620999412504338e-05, "loss": 0.4987, "step": 9780 }, { "epoch": 1.140974813432836, "grad_norm": 0.4837252945781173, "learning_rate": 4.620517366995437e-05, "loss": 0.4963, "step": 9785 }, { "epoch": 1.1415578358208955, "grad_norm": 0.45856568740720954, "learning_rate": 4.620035043358526e-05, "loss": 0.4961, "step": 9790 }, { "epoch": 1.1421408582089552, "grad_norm": 0.4698757059244029, "learning_rate": 4.619552441665322e-05, "loss": 0.5111, "step": 9795 }, { "epoch": 1.142723880597015, "grad_norm": 0.4499050961202368, "learning_rate": 4.619069561987581e-05, "loss": 0.4926, "step": 9800 }, { "epoch": 1.1433069029850746, "grad_norm": 0.45156172108012554, "learning_rate": 4.618586404397104e-05, "loss": 0.4798, "step": 9805 }, { "epoch": 1.1438899253731343, "grad_norm": 0.45580586613281165, "learning_rate": 4.618102968965733e-05, "loss": 0.4725, "step": 9810 }, { "epoch": 1.1444729477611941, "grad_norm": 0.43980063096394745, "learning_rate": 4.617619255765349e-05, "loss": 0.4752, "step": 9815 }, { "epoch": 1.1450559701492538, "grad_norm": 0.48147377861958357, "learning_rate": 4.6171352648678755e-05, "loss": 0.4981, "step": 9820 }, { "epoch": 1.1456389925373134, "grad_norm": 0.5069422112555739, "learning_rate": 4.616650996345277e-05, "loss": 0.4734, "step": 9825 }, { "epoch": 1.1462220149253732, "grad_norm": 0.5087637858166896, "learning_rate": 4.6161664502695606e-05, "loss": 0.5172, "step": 9830 }, { "epoch": 1.1468050373134329, "grad_norm": 0.42833596634640814, "learning_rate": 4.6156816267127726e-05, "loss": 0.4649, "step": 9835 }, { "epoch": 1.1473880597014925, "grad_norm": 0.4598225831428788, "learning_rate": 4.615196525747003e-05, "loss": 0.46, "step": 9840 }, { "epoch": 1.1479710820895521, "grad_norm": 0.5264734272354958, "learning_rate": 4.61471114744438e-05, "loss": 0.5014, "step": 9845 }, { "epoch": 1.148554104477612, "grad_norm": 0.48798600824327, "learning_rate": 4.6142254918770764e-05, "loss": 0.5134, "step": 9850 }, { "epoch": 1.1491371268656716, "grad_norm": 0.48672018339280493, "learning_rate": 4.6137395591173035e-05, "loss": 0.5069, "step": 9855 }, { "epoch": 1.1497201492537314, "grad_norm": 0.4470801393970488, "learning_rate": 4.613253349237314e-05, "loss": 0.4745, "step": 9860 }, { "epoch": 1.150303171641791, "grad_norm": 0.6393897239521419, "learning_rate": 4.612766862309404e-05, "loss": 0.5135, "step": 9865 }, { "epoch": 1.1508861940298507, "grad_norm": 0.4702345469985217, "learning_rate": 4.612280098405909e-05, "loss": 0.4825, "step": 9870 }, { "epoch": 1.1514692164179103, "grad_norm": 0.45536375824284325, "learning_rate": 4.611793057599208e-05, "loss": 0.4509, "step": 9875 }, { "epoch": 1.1520522388059702, "grad_norm": 0.6847591734638865, "learning_rate": 4.611305739961715e-05, "loss": 0.4968, "step": 9880 }, { "epoch": 1.1526352611940298, "grad_norm": 0.4881444154577869, "learning_rate": 4.6108181455658936e-05, "loss": 0.5217, "step": 9885 }, { "epoch": 1.1532182835820897, "grad_norm": 0.47170189984374566, "learning_rate": 4.610330274484242e-05, "loss": 0.4984, "step": 9890 }, { "epoch": 1.1538013059701493, "grad_norm": 0.47449564973164, "learning_rate": 4.6098421267893024e-05, "loss": 0.4729, "step": 9895 }, { "epoch": 1.154384328358209, "grad_norm": 0.45892838721809553, "learning_rate": 4.609353702553659e-05, "loss": 0.4877, "step": 9900 }, { "epoch": 1.1549673507462686, "grad_norm": 0.49238188160505747, "learning_rate": 4.608865001849935e-05, "loss": 0.4879, "step": 9905 }, { "epoch": 1.1555503731343284, "grad_norm": 0.6389954991876162, "learning_rate": 4.6083760247507945e-05, "loss": 0.5487, "step": 9910 }, { "epoch": 1.156133395522388, "grad_norm": 0.48889944206681957, "learning_rate": 4.607886771328945e-05, "loss": 0.4958, "step": 9915 }, { "epoch": 1.1567164179104479, "grad_norm": 0.5675028453045466, "learning_rate": 4.607397241657133e-05, "loss": 0.4861, "step": 9920 }, { "epoch": 1.1572994402985075, "grad_norm": 0.45399241888001685, "learning_rate": 4.6069074358081476e-05, "loss": 0.4797, "step": 9925 }, { "epoch": 1.1578824626865671, "grad_norm": 0.5737213260956343, "learning_rate": 4.606417353854818e-05, "loss": 0.484, "step": 9930 }, { "epoch": 1.1584654850746268, "grad_norm": 0.47210844957751424, "learning_rate": 4.6059269958700136e-05, "loss": 0.4956, "step": 9935 }, { "epoch": 1.1590485074626866, "grad_norm": 0.4452252202160641, "learning_rate": 4.605436361926648e-05, "loss": 0.4561, "step": 9940 }, { "epoch": 1.1596315298507462, "grad_norm": 0.48156847504108874, "learning_rate": 4.604945452097672e-05, "loss": 0.5153, "step": 9945 }, { "epoch": 1.1602145522388059, "grad_norm": 0.44527665917724885, "learning_rate": 4.6044542664560804e-05, "loss": 0.4955, "step": 9950 }, { "epoch": 1.1607975746268657, "grad_norm": 0.4377330122208343, "learning_rate": 4.6039628050749066e-05, "loss": 0.4412, "step": 9955 }, { "epoch": 1.1613805970149254, "grad_norm": 0.44907976595838806, "learning_rate": 4.6034710680272274e-05, "loss": 0.4628, "step": 9960 }, { "epoch": 1.161963619402985, "grad_norm": 0.4848892879121351, "learning_rate": 4.6029790553861594e-05, "loss": 0.5327, "step": 9965 }, { "epoch": 1.1625466417910448, "grad_norm": 0.47865209843468254, "learning_rate": 4.602486767224858e-05, "loss": 0.4881, "step": 9970 }, { "epoch": 1.1631296641791045, "grad_norm": 0.5065935109536762, "learning_rate": 4.601994203616525e-05, "loss": 0.5038, "step": 9975 }, { "epoch": 1.163712686567164, "grad_norm": 0.5058923277966703, "learning_rate": 4.601501364634397e-05, "loss": 0.4909, "step": 9980 }, { "epoch": 1.164295708955224, "grad_norm": 0.5358765833523091, "learning_rate": 4.601008250351756e-05, "loss": 0.54, "step": 9985 }, { "epoch": 1.1648787313432836, "grad_norm": 0.44762389969899724, "learning_rate": 4.600514860841923e-05, "loss": 0.503, "step": 9990 }, { "epoch": 1.1654617537313432, "grad_norm": 0.5131406173451447, "learning_rate": 4.6000211961782605e-05, "loss": 0.5139, "step": 9995 }, { "epoch": 1.166044776119403, "grad_norm": 0.5104173496537336, "learning_rate": 4.599527256434171e-05, "loss": 0.48, "step": 10000 }, { "epoch": 1.1666277985074627, "grad_norm": 0.4886514119195971, "learning_rate": 4.599033041683099e-05, "loss": 0.5176, "step": 10005 }, { "epoch": 1.1672108208955223, "grad_norm": 0.42725696967219473, "learning_rate": 4.598538551998531e-05, "loss": 0.4679, "step": 10010 }, { "epoch": 1.1677938432835822, "grad_norm": 0.43566282015581653, "learning_rate": 4.59804378745399e-05, "loss": 0.4853, "step": 10015 }, { "epoch": 1.1683768656716418, "grad_norm": 0.5137803755067021, "learning_rate": 4.597548748123046e-05, "loss": 0.5042, "step": 10020 }, { "epoch": 1.1689598880597014, "grad_norm": 0.4609910866389386, "learning_rate": 4.597053434079303e-05, "loss": 0.4835, "step": 10025 }, { "epoch": 1.1695429104477613, "grad_norm": 0.48248991604882147, "learning_rate": 4.596557845396412e-05, "loss": 0.509, "step": 10030 }, { "epoch": 1.170125932835821, "grad_norm": 0.4857742343614341, "learning_rate": 4.596061982148062e-05, "loss": 0.478, "step": 10035 }, { "epoch": 1.1707089552238805, "grad_norm": 0.4685134256969988, "learning_rate": 4.595565844407982e-05, "loss": 0.5073, "step": 10040 }, { "epoch": 1.1712919776119404, "grad_norm": 0.4552557913062714, "learning_rate": 4.5950694322499444e-05, "loss": 0.5152, "step": 10045 }, { "epoch": 1.171875, "grad_norm": 0.4901456457078471, "learning_rate": 4.59457274574776e-05, "loss": 0.4881, "step": 10050 }, { "epoch": 1.1724580223880596, "grad_norm": 0.4775643180270164, "learning_rate": 4.5940757849752805e-05, "loss": 0.4607, "step": 10055 }, { "epoch": 1.1730410447761195, "grad_norm": 0.46063185695387576, "learning_rate": 4.5935785500064014e-05, "loss": 0.487, "step": 10060 }, { "epoch": 1.173624067164179, "grad_norm": 0.4746823163282632, "learning_rate": 4.5930810409150556e-05, "loss": 0.5182, "step": 10065 }, { "epoch": 1.1742070895522387, "grad_norm": 0.43843195262337814, "learning_rate": 4.5925832577752175e-05, "loss": 0.4881, "step": 10070 }, { "epoch": 1.1747901119402986, "grad_norm": 0.4685303294788509, "learning_rate": 4.592085200660903e-05, "loss": 0.4951, "step": 10075 }, { "epoch": 1.1753731343283582, "grad_norm": 0.4749093447944963, "learning_rate": 4.5915868696461685e-05, "loss": 0.5261, "step": 10080 }, { "epoch": 1.1759561567164178, "grad_norm": 0.4795736672954712, "learning_rate": 4.591088264805111e-05, "loss": 0.5027, "step": 10085 }, { "epoch": 1.1765391791044777, "grad_norm": 0.48217126639017427, "learning_rate": 4.590589386211869e-05, "loss": 0.4937, "step": 10090 }, { "epoch": 1.1771222014925373, "grad_norm": 0.420400842266455, "learning_rate": 4.5900902339406195e-05, "loss": 0.4879, "step": 10095 }, { "epoch": 1.177705223880597, "grad_norm": 0.4889175318908055, "learning_rate": 4.589590808065583e-05, "loss": 0.4891, "step": 10100 }, { "epoch": 1.1782882462686568, "grad_norm": 0.4699220474921735, "learning_rate": 4.5890911086610184e-05, "loss": 0.4997, "step": 10105 }, { "epoch": 1.1788712686567164, "grad_norm": 0.4499121851481335, "learning_rate": 4.588591135801227e-05, "loss": 0.4942, "step": 10110 }, { "epoch": 1.179454291044776, "grad_norm": 0.48039361463042335, "learning_rate": 4.588090889560549e-05, "loss": 0.5262, "step": 10115 }, { "epoch": 1.180037313432836, "grad_norm": 0.4870244514079405, "learning_rate": 4.587590370013367e-05, "loss": 0.5012, "step": 10120 }, { "epoch": 1.1806203358208955, "grad_norm": 0.4359608570010937, "learning_rate": 4.587089577234104e-05, "loss": 0.4856, "step": 10125 }, { "epoch": 1.1812033582089552, "grad_norm": 0.466445375839257, "learning_rate": 4.5865885112972216e-05, "loss": 0.4906, "step": 10130 }, { "epoch": 1.181786380597015, "grad_norm": 0.42268739888448525, "learning_rate": 4.5860871722772246e-05, "loss": 0.4782, "step": 10135 }, { "epoch": 1.1823694029850746, "grad_norm": 0.4966303201182526, "learning_rate": 4.585585560248657e-05, "loss": 0.4948, "step": 10140 }, { "epoch": 1.1829524253731343, "grad_norm": 0.49723599768519844, "learning_rate": 4.5850836752861034e-05, "loss": 0.4967, "step": 10145 }, { "epoch": 1.1835354477611941, "grad_norm": 0.47220426148758693, "learning_rate": 4.58458151746419e-05, "loss": 0.5122, "step": 10150 }, { "epoch": 1.1841184701492538, "grad_norm": 0.44873656844364723, "learning_rate": 4.584079086857582e-05, "loss": 0.4919, "step": 10155 }, { "epoch": 1.1847014925373134, "grad_norm": 0.7909442158986997, "learning_rate": 4.5835763835409864e-05, "loss": 0.512, "step": 10160 }, { "epoch": 1.1852845149253732, "grad_norm": 0.4579010837343018, "learning_rate": 4.583073407589151e-05, "loss": 0.4753, "step": 10165 }, { "epoch": 1.1858675373134329, "grad_norm": 0.47469471616317205, "learning_rate": 4.5825701590768625e-05, "loss": 0.4933, "step": 10170 }, { "epoch": 1.1864505597014925, "grad_norm": 0.4495999689572246, "learning_rate": 4.582066638078949e-05, "loss": 0.4934, "step": 10175 }, { "epoch": 1.1870335820895521, "grad_norm": 0.4796448933384789, "learning_rate": 4.58156284467028e-05, "loss": 0.4762, "step": 10180 }, { "epoch": 1.187616604477612, "grad_norm": 0.46714972366630153, "learning_rate": 4.5810587789257646e-05, "loss": 0.4858, "step": 10185 }, { "epoch": 1.1881996268656716, "grad_norm": 0.4889030838131866, "learning_rate": 4.5805544409203535e-05, "loss": 0.5097, "step": 10190 }, { "epoch": 1.1887826492537314, "grad_norm": 0.47705359892658183, "learning_rate": 4.5800498307290344e-05, "loss": 0.509, "step": 10195 }, { "epoch": 1.189365671641791, "grad_norm": 0.4612757329558082, "learning_rate": 4.579544948426841e-05, "loss": 0.4907, "step": 10200 }, { "epoch": 1.1899486940298507, "grad_norm": 0.4392831600817961, "learning_rate": 4.579039794088842e-05, "loss": 0.5172, "step": 10205 }, { "epoch": 1.1905317164179103, "grad_norm": 0.5130394151474353, "learning_rate": 4.57853436779015e-05, "loss": 0.5189, "step": 10210 }, { "epoch": 1.1911147388059702, "grad_norm": 0.4736920577937518, "learning_rate": 4.578028669605918e-05, "loss": 0.4695, "step": 10215 }, { "epoch": 1.1916977611940298, "grad_norm": 0.5675131307737508, "learning_rate": 4.577522699611336e-05, "loss": 0.4985, "step": 10220 }, { "epoch": 1.1922807835820897, "grad_norm": 0.4390341606630438, "learning_rate": 4.577016457881639e-05, "loss": 0.5118, "step": 10225 }, { "epoch": 1.1928638059701493, "grad_norm": 0.45624624457570856, "learning_rate": 4.576509944492101e-05, "loss": 0.487, "step": 10230 }, { "epoch": 1.193446828358209, "grad_norm": 0.41894433880336407, "learning_rate": 4.5760031595180325e-05, "loss": 0.4869, "step": 10235 }, { "epoch": 1.1940298507462686, "grad_norm": 0.436370405965647, "learning_rate": 4.57549610303479e-05, "loss": 0.4878, "step": 10240 }, { "epoch": 1.1946128731343284, "grad_norm": 0.49373600917312327, "learning_rate": 4.574988775117767e-05, "loss": 0.5055, "step": 10245 }, { "epoch": 1.195195895522388, "grad_norm": 0.5090184054460075, "learning_rate": 4.5744811758424e-05, "loss": 0.5114, "step": 10250 }, { "epoch": 1.1957789179104479, "grad_norm": 0.4521890891822228, "learning_rate": 4.573973305284162e-05, "loss": 0.4756, "step": 10255 }, { "epoch": 1.1963619402985075, "grad_norm": 0.48661728690265815, "learning_rate": 4.573465163518569e-05, "loss": 0.4901, "step": 10260 }, { "epoch": 1.1969449626865671, "grad_norm": 0.4562515996289815, "learning_rate": 4.572956750621178e-05, "loss": 0.4752, "step": 10265 }, { "epoch": 1.1975279850746268, "grad_norm": 0.48159859341372235, "learning_rate": 4.572448066667584e-05, "loss": 0.5142, "step": 10270 }, { "epoch": 1.1981110074626866, "grad_norm": 0.4175615384612034, "learning_rate": 4.571939111733423e-05, "loss": 0.4917, "step": 10275 }, { "epoch": 1.1986940298507462, "grad_norm": 0.48514238856571956, "learning_rate": 4.571429885894373e-05, "loss": 0.5033, "step": 10280 }, { "epoch": 1.1992770522388059, "grad_norm": 0.4700533064137317, "learning_rate": 4.5709203892261506e-05, "loss": 0.5039, "step": 10285 }, { "epoch": 1.1998600746268657, "grad_norm": 0.5155371027252977, "learning_rate": 4.5704106218045124e-05, "loss": 0.5267, "step": 10290 }, { "epoch": 1.2004430970149254, "grad_norm": 0.4258916802960759, "learning_rate": 4.569900583705257e-05, "loss": 0.5128, "step": 10295 }, { "epoch": 1.201026119402985, "grad_norm": 0.5218094530712158, "learning_rate": 4.569390275004221e-05, "loss": 0.4913, "step": 10300 }, { "epoch": 1.2016091417910448, "grad_norm": 0.458383381872571, "learning_rate": 4.568879695777283e-05, "loss": 0.5071, "step": 10305 }, { "epoch": 1.2021921641791045, "grad_norm": 0.447897704047899, "learning_rate": 4.568368846100363e-05, "loss": 0.4877, "step": 10310 }, { "epoch": 1.202775186567164, "grad_norm": 0.4595544275808344, "learning_rate": 4.567857726049415e-05, "loss": 0.4911, "step": 10315 }, { "epoch": 1.203358208955224, "grad_norm": 0.45837361012593975, "learning_rate": 4.567346335700442e-05, "loss": 0.5353, "step": 10320 }, { "epoch": 1.2039412313432836, "grad_norm": 0.44439826613487565, "learning_rate": 4.56683467512948e-05, "loss": 0.4808, "step": 10325 }, { "epoch": 1.2045242537313432, "grad_norm": 0.46198420918559796, "learning_rate": 4.5663227444126114e-05, "loss": 0.524, "step": 10330 }, { "epoch": 1.205107276119403, "grad_norm": 0.45273321897247865, "learning_rate": 4.565810543625952e-05, "loss": 0.4827, "step": 10335 }, { "epoch": 1.2056902985074627, "grad_norm": 0.4130813589196938, "learning_rate": 4.565298072845662e-05, "loss": 0.4525, "step": 10340 }, { "epoch": 1.2062733208955223, "grad_norm": 0.4501895517862483, "learning_rate": 4.5647853321479414e-05, "loss": 0.4815, "step": 10345 }, { "epoch": 1.2068563432835822, "grad_norm": 0.4850929960512088, "learning_rate": 4.564272321609031e-05, "loss": 0.4847, "step": 10350 }, { "epoch": 1.2074393656716418, "grad_norm": 0.47342463711706323, "learning_rate": 4.563759041305207e-05, "loss": 0.5328, "step": 10355 }, { "epoch": 1.2080223880597014, "grad_norm": 0.4591790988243215, "learning_rate": 4.563245491312793e-05, "loss": 0.5116, "step": 10360 }, { "epoch": 1.2086054104477613, "grad_norm": 0.48391887993801336, "learning_rate": 4.562731671708147e-05, "loss": 0.475, "step": 10365 }, { "epoch": 1.209188432835821, "grad_norm": 0.5940498487621698, "learning_rate": 4.5622175825676695e-05, "loss": 0.4691, "step": 10370 }, { "epoch": 1.2097714552238805, "grad_norm": 0.45160586473877545, "learning_rate": 4.5617032239678016e-05, "loss": 0.4887, "step": 10375 }, { "epoch": 1.2103544776119404, "grad_norm": 0.5472177361751651, "learning_rate": 4.5611885959850216e-05, "loss": 0.5053, "step": 10380 }, { "epoch": 1.2109375, "grad_norm": 0.4361554855779335, "learning_rate": 4.5606736986958504e-05, "loss": 0.4706, "step": 10385 }, { "epoch": 1.2115205223880596, "grad_norm": 0.48091608079580567, "learning_rate": 4.560158532176849e-05, "loss": 0.4846, "step": 10390 }, { "epoch": 1.2121035447761195, "grad_norm": 0.5277938909221369, "learning_rate": 4.559643096504618e-05, "loss": 0.4529, "step": 10395 }, { "epoch": 1.212686567164179, "grad_norm": 0.47110973401201595, "learning_rate": 4.559127391755796e-05, "loss": 0.4843, "step": 10400 }, { "epoch": 1.2132695895522387, "grad_norm": 0.46646043048754804, "learning_rate": 4.558611418007065e-05, "loss": 0.5225, "step": 10405 }, { "epoch": 1.2138526119402986, "grad_norm": 0.4117714748940162, "learning_rate": 4.558095175335145e-05, "loss": 0.4972, "step": 10410 }, { "epoch": 1.2144356343283582, "grad_norm": 0.473548178220035, "learning_rate": 4.557578663816796e-05, "loss": 0.4888, "step": 10415 }, { "epoch": 1.2150186567164178, "grad_norm": 0.44588850897264254, "learning_rate": 4.557061883528818e-05, "loss": 0.4889, "step": 10420 }, { "epoch": 1.2156016791044777, "grad_norm": 0.4357468417538078, "learning_rate": 4.556544834548052e-05, "loss": 0.4935, "step": 10425 }, { "epoch": 1.2161847014925373, "grad_norm": 0.47214744820744703, "learning_rate": 4.5560275169513786e-05, "loss": 0.4969, "step": 10430 }, { "epoch": 1.216767723880597, "grad_norm": 0.5052146704131883, "learning_rate": 4.5555099308157163e-05, "loss": 0.4797, "step": 10435 }, { "epoch": 1.2173507462686568, "grad_norm": 0.5143329850027023, "learning_rate": 4.554992076218026e-05, "loss": 0.4744, "step": 10440 }, { "epoch": 1.2179337686567164, "grad_norm": 0.5905636689742819, "learning_rate": 4.554473953235309e-05, "loss": 0.4858, "step": 10445 }, { "epoch": 1.218516791044776, "grad_norm": 0.4868778049016568, "learning_rate": 4.553955561944603e-05, "loss": 0.4674, "step": 10450 }, { "epoch": 1.219099813432836, "grad_norm": 0.4735704731328373, "learning_rate": 4.5534369024229903e-05, "loss": 0.4945, "step": 10455 }, { "epoch": 1.2196828358208955, "grad_norm": 0.4217849941090126, "learning_rate": 4.552917974747588e-05, "loss": 0.5251, "step": 10460 }, { "epoch": 1.2202658582089552, "grad_norm": 0.4381496438873959, "learning_rate": 4.552398778995558e-05, "loss": 0.4855, "step": 10465 }, { "epoch": 1.220848880597015, "grad_norm": 0.4682518660001828, "learning_rate": 4.551879315244098e-05, "loss": 0.4612, "step": 10470 }, { "epoch": 1.2214319029850746, "grad_norm": 0.4711869099835559, "learning_rate": 4.551359583570448e-05, "loss": 0.4624, "step": 10475 }, { "epoch": 1.2220149253731343, "grad_norm": 0.452571746667093, "learning_rate": 4.5508395840518884e-05, "loss": 0.4821, "step": 10480 }, { "epoch": 1.2225979477611941, "grad_norm": 0.49725953336704015, "learning_rate": 4.550319316765735e-05, "loss": 0.4707, "step": 10485 }, { "epoch": 1.2231809701492538, "grad_norm": 0.484195499245689, "learning_rate": 4.549798781789349e-05, "loss": 0.4914, "step": 10490 }, { "epoch": 1.2237639925373134, "grad_norm": 0.47164140799331555, "learning_rate": 4.5492779792001286e-05, "loss": 0.5137, "step": 10495 }, { "epoch": 1.2243470149253732, "grad_norm": 0.4764823439535703, "learning_rate": 4.548756909075511e-05, "loss": 0.4996, "step": 10500 }, { "epoch": 1.2249300373134329, "grad_norm": 0.4481478128548299, "learning_rate": 4.5482355714929764e-05, "loss": 0.4917, "step": 10505 }, { "epoch": 1.2255130597014925, "grad_norm": 0.45412757674717086, "learning_rate": 4.5477139665300414e-05, "loss": 0.4782, "step": 10510 }, { "epoch": 1.2260960820895521, "grad_norm": 0.510120546472683, "learning_rate": 4.5471920942642634e-05, "loss": 0.4597, "step": 10515 }, { "epoch": 1.226679104477612, "grad_norm": 0.472860169073306, "learning_rate": 4.5466699547732405e-05, "loss": 0.5071, "step": 10520 }, { "epoch": 1.2272621268656716, "grad_norm": 0.48174398275339125, "learning_rate": 4.5461475481346086e-05, "loss": 0.5015, "step": 10525 }, { "epoch": 1.2278451492537314, "grad_norm": 0.45223858027921016, "learning_rate": 4.545624874426047e-05, "loss": 0.4694, "step": 10530 }, { "epoch": 1.228428171641791, "grad_norm": 0.4703622316483385, "learning_rate": 4.54510193372527e-05, "loss": 0.5083, "step": 10535 }, { "epoch": 1.2290111940298507, "grad_norm": 0.4766899395190844, "learning_rate": 4.544578726110035e-05, "loss": 0.4635, "step": 10540 }, { "epoch": 1.2295942164179103, "grad_norm": 0.4805029162802611, "learning_rate": 4.544055251658137e-05, "loss": 0.5375, "step": 10545 }, { "epoch": 1.2301772388059702, "grad_norm": 0.4455488814122577, "learning_rate": 4.5435315104474124e-05, "loss": 0.4855, "step": 10550 }, { "epoch": 1.2307602611940298, "grad_norm": 0.42671789157033496, "learning_rate": 4.5430075025557365e-05, "loss": 0.5019, "step": 10555 }, { "epoch": 1.2313432835820897, "grad_norm": 0.46701158462531245, "learning_rate": 4.5424832280610245e-05, "loss": 0.5012, "step": 10560 }, { "epoch": 1.2319263059701493, "grad_norm": 0.5297688795382418, "learning_rate": 4.5419586870412306e-05, "loss": 0.4908, "step": 10565 }, { "epoch": 1.232509328358209, "grad_norm": 0.4204261464822169, "learning_rate": 4.541433879574348e-05, "loss": 0.4693, "step": 10570 }, { "epoch": 1.2330923507462686, "grad_norm": 0.4673047492978749, "learning_rate": 4.540908805738412e-05, "loss": 0.4689, "step": 10575 }, { "epoch": 1.2336753731343284, "grad_norm": 0.44842552442527106, "learning_rate": 4.540383465611496e-05, "loss": 0.4921, "step": 10580 }, { "epoch": 1.234258395522388, "grad_norm": 0.46336131437386063, "learning_rate": 4.5398578592717135e-05, "loss": 0.5065, "step": 10585 }, { "epoch": 1.2348414179104479, "grad_norm": 0.4759828002351157, "learning_rate": 4.539331986797215e-05, "loss": 0.5122, "step": 10590 }, { "epoch": 1.2354244402985075, "grad_norm": 0.4717537952461409, "learning_rate": 4.5388058482661944e-05, "loss": 0.5031, "step": 10595 }, { "epoch": 1.2360074626865671, "grad_norm": 0.4609477364032761, "learning_rate": 4.5382794437568824e-05, "loss": 0.487, "step": 10600 }, { "epoch": 1.2365904850746268, "grad_norm": 0.5120862465684253, "learning_rate": 4.537752773347552e-05, "loss": 0.4904, "step": 10605 }, { "epoch": 1.2371735074626866, "grad_norm": 0.45861223756305225, "learning_rate": 4.537225837116512e-05, "loss": 0.5004, "step": 10610 }, { "epoch": 1.2377565298507462, "grad_norm": 0.47356352919508377, "learning_rate": 4.5366986351421145e-05, "loss": 0.4942, "step": 10615 }, { "epoch": 1.2383395522388059, "grad_norm": 0.4919271378489869, "learning_rate": 4.5361711675027484e-05, "loss": 0.4831, "step": 10620 }, { "epoch": 1.2389225746268657, "grad_norm": 0.4637408665064941, "learning_rate": 4.5356434342768434e-05, "loss": 0.4851, "step": 10625 }, { "epoch": 1.2395055970149254, "grad_norm": 0.4879180836517928, "learning_rate": 4.535115435542868e-05, "loss": 0.491, "step": 10630 }, { "epoch": 1.240088619402985, "grad_norm": 0.484147000323564, "learning_rate": 4.5345871713793306e-05, "loss": 0.532, "step": 10635 }, { "epoch": 1.2406716417910448, "grad_norm": 0.48885676700578345, "learning_rate": 4.53405864186478e-05, "loss": 0.4839, "step": 10640 }, { "epoch": 1.2412546641791045, "grad_norm": 0.500355902387549, "learning_rate": 4.533529847077803e-05, "loss": 0.4965, "step": 10645 }, { "epoch": 1.241837686567164, "grad_norm": 0.45464876801725207, "learning_rate": 4.5330007870970255e-05, "loss": 0.4859, "step": 10650 }, { "epoch": 1.242420708955224, "grad_norm": 0.41211951322449497, "learning_rate": 4.532471462001114e-05, "loss": 0.5091, "step": 10655 }, { "epoch": 1.2430037313432836, "grad_norm": 0.4532085017482041, "learning_rate": 4.531941871868775e-05, "loss": 0.5016, "step": 10660 }, { "epoch": 1.2435867537313432, "grad_norm": 0.5200222182217336, "learning_rate": 4.531412016778752e-05, "loss": 0.494, "step": 10665 }, { "epoch": 1.244169776119403, "grad_norm": 0.44998083416964635, "learning_rate": 4.530881896809831e-05, "loss": 0.4777, "step": 10670 }, { "epoch": 1.2447527985074627, "grad_norm": 0.46243724465256153, "learning_rate": 4.530351512040834e-05, "loss": 0.4682, "step": 10675 }, { "epoch": 1.2453358208955223, "grad_norm": 0.4463317078925723, "learning_rate": 4.5298208625506253e-05, "loss": 0.5188, "step": 10680 }, { "epoch": 1.2459188432835822, "grad_norm": 0.4957783469036879, "learning_rate": 4.5292899484181074e-05, "loss": 0.5024, "step": 10685 }, { "epoch": 1.2465018656716418, "grad_norm": 0.47347227953314347, "learning_rate": 4.5287587697222215e-05, "loss": 0.4755, "step": 10690 }, { "epoch": 1.2470848880597014, "grad_norm": 0.46933916348082055, "learning_rate": 4.528227326541949e-05, "loss": 0.5216, "step": 10695 }, { "epoch": 1.2476679104477613, "grad_norm": 0.4274168227368018, "learning_rate": 4.527695618956312e-05, "loss": 0.4911, "step": 10700 }, { "epoch": 1.248250932835821, "grad_norm": 0.47907368979204124, "learning_rate": 4.527163647044367e-05, "loss": 0.4884, "step": 10705 }, { "epoch": 1.2488339552238805, "grad_norm": 0.46085266857825197, "learning_rate": 4.5266314108852166e-05, "loss": 0.5122, "step": 10710 }, { "epoch": 1.2494169776119404, "grad_norm": 0.45342521267091646, "learning_rate": 4.526098910557996e-05, "loss": 0.5044, "step": 10715 }, { "epoch": 1.25, "grad_norm": 0.5852842506751587, "learning_rate": 4.5255661461418854e-05, "loss": 0.5238, "step": 10720 }, { "epoch": 1.2505830223880596, "grad_norm": 0.4673897355491722, "learning_rate": 4.5250331177161014e-05, "loss": 0.4876, "step": 10725 }, { "epoch": 1.2511660447761195, "grad_norm": 0.45628166086918426, "learning_rate": 4.5244998253598994e-05, "loss": 0.4871, "step": 10730 }, { "epoch": 1.251749067164179, "grad_norm": 0.47124101261746165, "learning_rate": 4.5239662691525744e-05, "loss": 0.5156, "step": 10735 }, { "epoch": 1.2523320895522387, "grad_norm": 0.44725519298792205, "learning_rate": 4.5234324491734624e-05, "loss": 0.4716, "step": 10740 }, { "epoch": 1.2529151119402986, "grad_norm": 0.42690219529381024, "learning_rate": 4.522898365501938e-05, "loss": 0.4991, "step": 10745 }, { "epoch": 1.2534981343283582, "grad_norm": 0.4916189926473275, "learning_rate": 4.5223640182174115e-05, "loss": 0.5192, "step": 10750 }, { "epoch": 1.2540811567164178, "grad_norm": 0.47347920108375546, "learning_rate": 4.5218294073993374e-05, "loss": 0.4843, "step": 10755 }, { "epoch": 1.2546641791044777, "grad_norm": 0.5188258241202645, "learning_rate": 4.521294533127206e-05, "loss": 0.5152, "step": 10760 }, { "epoch": 1.2552472014925373, "grad_norm": 0.4391126986556853, "learning_rate": 4.5207593954805494e-05, "loss": 0.4726, "step": 10765 }, { "epoch": 1.255830223880597, "grad_norm": 0.4608047643671716, "learning_rate": 4.520223994538937e-05, "loss": 0.4968, "step": 10770 }, { "epoch": 1.2564132462686568, "grad_norm": 0.5149266987509121, "learning_rate": 4.519688330381976e-05, "loss": 0.5188, "step": 10775 }, { "epoch": 1.2569962686567164, "grad_norm": 0.4408037918106659, "learning_rate": 4.519152403089317e-05, "loss": 0.4712, "step": 10780 }, { "epoch": 1.257579291044776, "grad_norm": 0.48078540476076054, "learning_rate": 4.518616212740647e-05, "loss": 0.5117, "step": 10785 }, { "epoch": 1.2581623134328357, "grad_norm": 0.4937330498447094, "learning_rate": 4.51807975941569e-05, "loss": 0.4937, "step": 10790 }, { "epoch": 1.2587453358208955, "grad_norm": 0.3956058070444886, "learning_rate": 4.517543043194214e-05, "loss": 0.448, "step": 10795 }, { "epoch": 1.2593283582089552, "grad_norm": 0.46438661170530343, "learning_rate": 4.517006064156023e-05, "loss": 0.4839, "step": 10800 }, { "epoch": 1.259911380597015, "grad_norm": 0.4576805182366751, "learning_rate": 4.516468822380959e-05, "loss": 0.5221, "step": 10805 }, { "epoch": 1.2604944029850746, "grad_norm": 0.424073212385048, "learning_rate": 4.515931317948907e-05, "loss": 0.4614, "step": 10810 }, { "epoch": 1.2610774253731343, "grad_norm": 0.4358863186762239, "learning_rate": 4.515393550939787e-05, "loss": 0.4648, "step": 10815 }, { "epoch": 1.261660447761194, "grad_norm": 0.45400319344417495, "learning_rate": 4.5148555214335616e-05, "loss": 0.5128, "step": 10820 }, { "epoch": 1.2622434701492538, "grad_norm": 0.4441332948804677, "learning_rate": 4.514317229510228e-05, "loss": 0.468, "step": 10825 }, { "epoch": 1.2628264925373134, "grad_norm": 0.47556452899330476, "learning_rate": 4.5137786752498285e-05, "loss": 0.4899, "step": 10830 }, { "epoch": 1.2634095149253732, "grad_norm": 0.4358375800095142, "learning_rate": 4.513239858732438e-05, "loss": 0.4863, "step": 10835 }, { "epoch": 1.2639925373134329, "grad_norm": 0.4571425793619909, "learning_rate": 4.512700780038174e-05, "loss": 0.4896, "step": 10840 }, { "epoch": 1.2645755597014925, "grad_norm": 0.5151550921023409, "learning_rate": 4.5121614392471934e-05, "loss": 0.4849, "step": 10845 }, { "epoch": 1.2651585820895521, "grad_norm": 0.4603106071865496, "learning_rate": 4.5116218364396904e-05, "loss": 0.4666, "step": 10850 }, { "epoch": 1.265741604477612, "grad_norm": 0.4211185066790496, "learning_rate": 4.511081971695899e-05, "loss": 0.4641, "step": 10855 }, { "epoch": 1.2663246268656716, "grad_norm": 0.4678760624094688, "learning_rate": 4.510541845096091e-05, "loss": 0.4711, "step": 10860 }, { "epoch": 1.2669076492537314, "grad_norm": 0.47620077865417704, "learning_rate": 4.510001456720579e-05, "loss": 0.4807, "step": 10865 }, { "epoch": 1.267490671641791, "grad_norm": 0.46355439011575994, "learning_rate": 4.509460806649714e-05, "loss": 0.4881, "step": 10870 }, { "epoch": 1.2680736940298507, "grad_norm": 0.4914208310417759, "learning_rate": 4.508919894963884e-05, "loss": 0.4937, "step": 10875 }, { "epoch": 1.2686567164179103, "grad_norm": 0.4305195635584616, "learning_rate": 4.5083787217435175e-05, "loss": 0.4902, "step": 10880 }, { "epoch": 1.2692397388059702, "grad_norm": 0.44759494114245063, "learning_rate": 4.507837287069083e-05, "loss": 0.4961, "step": 10885 }, { "epoch": 1.2698227611940298, "grad_norm": 0.4440267264043612, "learning_rate": 4.507295591021087e-05, "loss": 0.509, "step": 10890 }, { "epoch": 1.2704057835820897, "grad_norm": 0.43935234739243484, "learning_rate": 4.5067536336800724e-05, "loss": 0.4726, "step": 10895 }, { "epoch": 1.2709888059701493, "grad_norm": 0.4573323836385464, "learning_rate": 4.506211415126624e-05, "loss": 0.5018, "step": 10900 }, { "epoch": 1.271571828358209, "grad_norm": 0.46045982197691526, "learning_rate": 4.5056689354413664e-05, "loss": 0.4728, "step": 10905 }, { "epoch": 1.2721548507462686, "grad_norm": 0.4630717093874329, "learning_rate": 4.505126194704958e-05, "loss": 0.4958, "step": 10910 }, { "epoch": 1.2727378731343284, "grad_norm": 0.48470057016551354, "learning_rate": 4.504583192998101e-05, "loss": 0.5273, "step": 10915 }, { "epoch": 1.273320895522388, "grad_norm": 0.45280566005549905, "learning_rate": 4.504039930401535e-05, "loss": 0.4964, "step": 10920 }, { "epoch": 1.2739039179104479, "grad_norm": 0.5037372516068997, "learning_rate": 4.503496406996037e-05, "loss": 0.4999, "step": 10925 }, { "epoch": 1.2744869402985075, "grad_norm": 0.47606837622249965, "learning_rate": 4.5029526228624226e-05, "loss": 0.5029, "step": 10930 }, { "epoch": 1.2750699626865671, "grad_norm": 0.5703203195144877, "learning_rate": 4.50240857808155e-05, "loss": 0.538, "step": 10935 }, { "epoch": 1.2756529850746268, "grad_norm": 0.45251604419229047, "learning_rate": 4.501864272734311e-05, "loss": 0.5015, "step": 10940 }, { "epoch": 1.2762360074626866, "grad_norm": 0.49655362487006866, "learning_rate": 4.50131970690164e-05, "loss": 0.4847, "step": 10945 }, { "epoch": 1.2768190298507462, "grad_norm": 0.4613013724647206, "learning_rate": 4.500774880664508e-05, "loss": 0.4787, "step": 10950 }, { "epoch": 1.277402052238806, "grad_norm": 0.4302691188658478, "learning_rate": 4.500229794103925e-05, "loss": 0.5333, "step": 10955 }, { "epoch": 1.2779850746268657, "grad_norm": 0.42502232464484513, "learning_rate": 4.4996844473009425e-05, "loss": 0.491, "step": 10960 }, { "epoch": 1.2785680970149254, "grad_norm": 0.40601973604606284, "learning_rate": 4.499138840336646e-05, "loss": 0.4629, "step": 10965 }, { "epoch": 1.279151119402985, "grad_norm": 0.45669189552203826, "learning_rate": 4.498592973292162e-05, "loss": 0.4843, "step": 10970 }, { "epoch": 1.2797341417910448, "grad_norm": 0.4340655439220024, "learning_rate": 4.498046846248658e-05, "loss": 0.4678, "step": 10975 }, { "epoch": 1.2803171641791045, "grad_norm": 0.4729483198135297, "learning_rate": 4.497500459287335e-05, "loss": 0.5303, "step": 10980 }, { "epoch": 1.2809001865671643, "grad_norm": 0.4553839692139655, "learning_rate": 4.496953812489438e-05, "loss": 0.4713, "step": 10985 }, { "epoch": 1.281483208955224, "grad_norm": 0.43086952416463076, "learning_rate": 4.496406905936246e-05, "loss": 0.4945, "step": 10990 }, { "epoch": 1.2820662313432836, "grad_norm": 0.4957276784882462, "learning_rate": 4.49585973970908e-05, "loss": 0.4715, "step": 10995 }, { "epoch": 1.2826492537313432, "grad_norm": 0.44036399286463346, "learning_rate": 4.4953123138892984e-05, "loss": 0.4556, "step": 11000 }, { "epoch": 1.283232276119403, "grad_norm": 0.40191787688124264, "learning_rate": 4.4947646285582974e-05, "loss": 0.4984, "step": 11005 }, { "epoch": 1.2838152985074627, "grad_norm": 0.5236556766781192, "learning_rate": 4.4942166837975134e-05, "loss": 0.5033, "step": 11010 }, { "epoch": 1.2843983208955223, "grad_norm": 0.4324916399134003, "learning_rate": 4.49366847968842e-05, "loss": 0.5058, "step": 11015 }, { "epoch": 1.2849813432835822, "grad_norm": 0.416311459433444, "learning_rate": 4.4931200163125306e-05, "loss": 0.4637, "step": 11020 }, { "epoch": 1.2855643656716418, "grad_norm": 0.41026736261558155, "learning_rate": 4.492571293751395e-05, "loss": 0.4862, "step": 11025 }, { "epoch": 1.2861473880597014, "grad_norm": 0.44497130836664395, "learning_rate": 4.492022312086605e-05, "loss": 0.4772, "step": 11030 }, { "epoch": 1.2867304104477613, "grad_norm": 0.4469991233871229, "learning_rate": 4.491473071399787e-05, "loss": 0.5133, "step": 11035 }, { "epoch": 1.287313432835821, "grad_norm": 0.5031693931998229, "learning_rate": 4.4909235717726086e-05, "loss": 0.4877, "step": 11040 }, { "epoch": 1.2878964552238805, "grad_norm": 0.44744449115948626, "learning_rate": 4.490373813286776e-05, "loss": 0.4886, "step": 11045 }, { "epoch": 1.2884794776119404, "grad_norm": 0.4715880128998123, "learning_rate": 4.4898237960240315e-05, "loss": 0.5093, "step": 11050 }, { "epoch": 1.2890625, "grad_norm": 0.4623706251903199, "learning_rate": 4.4892735200661584e-05, "loss": 0.473, "step": 11055 }, { "epoch": 1.2896455223880596, "grad_norm": 0.4076569329168991, "learning_rate": 4.488722985494978e-05, "loss": 0.5065, "step": 11060 }, { "epoch": 1.2902285447761195, "grad_norm": 0.4563730078630026, "learning_rate": 4.488172192392347e-05, "loss": 0.4682, "step": 11065 }, { "epoch": 1.290811567164179, "grad_norm": 0.4627222760790777, "learning_rate": 4.487621140840165e-05, "loss": 0.5048, "step": 11070 }, { "epoch": 1.2913945895522387, "grad_norm": 0.47548018063252756, "learning_rate": 4.487069830920369e-05, "loss": 0.511, "step": 11075 }, { "epoch": 1.2919776119402986, "grad_norm": 0.4303449084265729, "learning_rate": 4.486518262714931e-05, "loss": 0.4873, "step": 11080 }, { "epoch": 1.2925606343283582, "grad_norm": 0.42835590214107866, "learning_rate": 4.4859664363058665e-05, "loss": 0.4907, "step": 11085 }, { "epoch": 1.2931436567164178, "grad_norm": 0.4547377226856405, "learning_rate": 4.485414351775224e-05, "loss": 0.4938, "step": 11090 }, { "epoch": 1.2937266791044777, "grad_norm": 0.4927578436352473, "learning_rate": 4.484862009205096e-05, "loss": 0.491, "step": 11095 }, { "epoch": 1.2943097014925373, "grad_norm": 0.4525594158585282, "learning_rate": 4.484309408677609e-05, "loss": 0.4882, "step": 11100 }, { "epoch": 1.294892723880597, "grad_norm": 0.4662919621023449, "learning_rate": 4.48375655027493e-05, "loss": 0.4851, "step": 11105 }, { "epoch": 1.2954757462686568, "grad_norm": 0.4944056942567037, "learning_rate": 4.483203434079263e-05, "loss": 0.5345, "step": 11110 }, { "epoch": 1.2960587686567164, "grad_norm": 0.4394858066693536, "learning_rate": 4.4826500601728515e-05, "loss": 0.4991, "step": 11115 }, { "epoch": 1.296641791044776, "grad_norm": 0.48205648299926923, "learning_rate": 4.4820964286379764e-05, "loss": 0.5118, "step": 11120 }, { "epoch": 1.2972248134328357, "grad_norm": 0.4460294687516735, "learning_rate": 4.481542539556959e-05, "loss": 0.4754, "step": 11125 }, { "epoch": 1.2978078358208955, "grad_norm": 0.46649810464317104, "learning_rate": 4.480988393012155e-05, "loss": 0.4977, "step": 11130 }, { "epoch": 1.2983908582089552, "grad_norm": 0.4543096879064287, "learning_rate": 4.4804339890859625e-05, "loss": 0.5205, "step": 11135 }, { "epoch": 1.298973880597015, "grad_norm": 0.49816168005660294, "learning_rate": 4.479879327860816e-05, "loss": 0.5077, "step": 11140 }, { "epoch": 1.2995569029850746, "grad_norm": 0.48361299010120495, "learning_rate": 4.479324409419186e-05, "loss": 0.513, "step": 11145 }, { "epoch": 1.3001399253731343, "grad_norm": 0.48096288151197436, "learning_rate": 4.478769233843587e-05, "loss": 0.4814, "step": 11150 }, { "epoch": 1.300722947761194, "grad_norm": 0.44871839828132487, "learning_rate": 4.478213801216566e-05, "loss": 0.4846, "step": 11155 }, { "epoch": 1.3013059701492538, "grad_norm": 0.475520923780983, "learning_rate": 4.477658111620711e-05, "loss": 0.4792, "step": 11160 }, { "epoch": 1.3018889925373134, "grad_norm": 0.45266920995262744, "learning_rate": 4.477102165138648e-05, "loss": 0.4876, "step": 11165 }, { "epoch": 1.3024720149253732, "grad_norm": 0.5301680764440551, "learning_rate": 4.4765459618530405e-05, "loss": 0.4966, "step": 11170 }, { "epoch": 1.3030550373134329, "grad_norm": 0.4280386589494294, "learning_rate": 4.4759895018465906e-05, "loss": 0.478, "step": 11175 }, { "epoch": 1.3036380597014925, "grad_norm": 0.4016001646885821, "learning_rate": 4.47543278520204e-05, "loss": 0.469, "step": 11180 }, { "epoch": 1.3042210820895521, "grad_norm": 0.4199568094935958, "learning_rate": 4.474875812002165e-05, "loss": 0.4642, "step": 11185 }, { "epoch": 1.304804104477612, "grad_norm": 0.47030310198436404, "learning_rate": 4.474318582329783e-05, "loss": 0.4556, "step": 11190 }, { "epoch": 1.3053871268656716, "grad_norm": 0.42443805232822085, "learning_rate": 4.473761096267749e-05, "loss": 0.4874, "step": 11195 }, { "epoch": 1.3059701492537314, "grad_norm": 0.4709462751586026, "learning_rate": 4.4732033538989556e-05, "loss": 0.4638, "step": 11200 }, { "epoch": 1.306553171641791, "grad_norm": 0.4222680732756702, "learning_rate": 4.4726453553063343e-05, "loss": 0.488, "step": 11205 }, { "epoch": 1.3071361940298507, "grad_norm": 0.47741257493485134, "learning_rate": 4.4720871005728526e-05, "loss": 0.5127, "step": 11210 }, { "epoch": 1.3077192164179103, "grad_norm": 0.5247863791189061, "learning_rate": 4.4715285897815196e-05, "loss": 0.4891, "step": 11215 }, { "epoch": 1.3083022388059702, "grad_norm": 0.4515647058817243, "learning_rate": 4.47096982301538e-05, "loss": 0.487, "step": 11220 }, { "epoch": 1.3088852611940298, "grad_norm": 0.403232965036066, "learning_rate": 4.470410800357515e-05, "loss": 0.5052, "step": 11225 }, { "epoch": 1.3094682835820897, "grad_norm": 0.4426716841265954, "learning_rate": 4.469851521891049e-05, "loss": 0.4755, "step": 11230 }, { "epoch": 1.3100513059701493, "grad_norm": 0.5160837707568388, "learning_rate": 4.469291987699139e-05, "loss": 0.5067, "step": 11235 }, { "epoch": 1.310634328358209, "grad_norm": 0.4574976120420014, "learning_rate": 4.468732197864984e-05, "loss": 0.5015, "step": 11240 }, { "epoch": 1.3112173507462686, "grad_norm": 0.5195077800060661, "learning_rate": 4.468172152471818e-05, "loss": 0.4991, "step": 11245 }, { "epoch": 1.3118003731343284, "grad_norm": 0.46232057951489985, "learning_rate": 4.467611851602916e-05, "loss": 0.5187, "step": 11250 }, { "epoch": 1.312383395522388, "grad_norm": 0.47008596948770354, "learning_rate": 4.467051295341587e-05, "loss": 0.5048, "step": 11255 }, { "epoch": 1.3129664179104479, "grad_norm": 0.45102971581296836, "learning_rate": 4.4664904837711835e-05, "loss": 0.4824, "step": 11260 }, { "epoch": 1.3135494402985075, "grad_norm": 0.4259458579905399, "learning_rate": 4.4659294169750896e-05, "loss": 0.4819, "step": 11265 }, { "epoch": 1.3141324626865671, "grad_norm": 0.404768642006727, "learning_rate": 4.465368095036733e-05, "loss": 0.4568, "step": 11270 }, { "epoch": 1.3147154850746268, "grad_norm": 0.4869900553207631, "learning_rate": 4.464806518039575e-05, "loss": 0.5059, "step": 11275 }, { "epoch": 1.3152985074626866, "grad_norm": 0.46355528098325016, "learning_rate": 4.4642446860671185e-05, "loss": 0.4913, "step": 11280 }, { "epoch": 1.3158815298507462, "grad_norm": 0.47138496613455305, "learning_rate": 4.463682599202902e-05, "loss": 0.4866, "step": 11285 }, { "epoch": 1.316464552238806, "grad_norm": 0.47576375715614827, "learning_rate": 4.463120257530501e-05, "loss": 0.4941, "step": 11290 }, { "epoch": 1.3170475746268657, "grad_norm": 0.4506773331630608, "learning_rate": 4.462557661133532e-05, "loss": 0.4827, "step": 11295 }, { "epoch": 1.3176305970149254, "grad_norm": 0.5246787882695987, "learning_rate": 4.461994810095647e-05, "loss": 0.4724, "step": 11300 }, { "epoch": 1.318213619402985, "grad_norm": 0.46038738940449725, "learning_rate": 4.4614317045005365e-05, "loss": 0.4945, "step": 11305 }, { "epoch": 1.3187966417910448, "grad_norm": 0.46730360950405997, "learning_rate": 4.46086834443193e-05, "loss": 0.4796, "step": 11310 }, { "epoch": 1.3193796641791045, "grad_norm": 0.5075247511570078, "learning_rate": 4.460304729973592e-05, "loss": 0.494, "step": 11315 }, { "epoch": 1.3199626865671643, "grad_norm": 0.41706468419557796, "learning_rate": 4.4597408612093265e-05, "loss": 0.5149, "step": 11320 }, { "epoch": 1.320545708955224, "grad_norm": 0.40811472809077004, "learning_rate": 4.4591767382229776e-05, "loss": 0.4671, "step": 11325 }, { "epoch": 1.3211287313432836, "grad_norm": 0.4594248970927176, "learning_rate": 4.458612361098423e-05, "loss": 0.4783, "step": 11330 }, { "epoch": 1.3217117537313432, "grad_norm": 1.024719586676958, "learning_rate": 4.458047729919581e-05, "loss": 0.5189, "step": 11335 }, { "epoch": 1.322294776119403, "grad_norm": 0.4277748471869806, "learning_rate": 4.457482844770408e-05, "loss": 0.4929, "step": 11340 }, { "epoch": 1.3228777985074627, "grad_norm": 0.5040113578496384, "learning_rate": 4.456917705734894e-05, "loss": 0.5262, "step": 11345 }, { "epoch": 1.3234608208955223, "grad_norm": 0.43865739047204216, "learning_rate": 4.456352312897072e-05, "loss": 0.5148, "step": 11350 }, { "epoch": 1.3240438432835822, "grad_norm": 0.44935657578945365, "learning_rate": 4.45578666634101e-05, "loss": 0.4761, "step": 11355 }, { "epoch": 1.3246268656716418, "grad_norm": 0.5147493131186344, "learning_rate": 4.455220766150814e-05, "loss": 0.5198, "step": 11360 }, { "epoch": 1.3252098880597014, "grad_norm": 0.4818581080990096, "learning_rate": 4.454654612410628e-05, "loss": 0.4745, "step": 11365 }, { "epoch": 1.3257929104477613, "grad_norm": 0.4579540692943894, "learning_rate": 4.454088205204634e-05, "loss": 0.5275, "step": 11370 }, { "epoch": 1.326375932835821, "grad_norm": 0.4177917530534575, "learning_rate": 4.453521544617051e-05, "loss": 0.472, "step": 11375 }, { "epoch": 1.3269589552238805, "grad_norm": 0.4135399462613423, "learning_rate": 4.452954630732136e-05, "loss": 0.4785, "step": 11380 }, { "epoch": 1.3275419776119404, "grad_norm": 0.43489658697969774, "learning_rate": 4.452387463634185e-05, "loss": 0.5032, "step": 11385 }, { "epoch": 1.328125, "grad_norm": 0.49028362359902466, "learning_rate": 4.451820043407527e-05, "loss": 0.4864, "step": 11390 }, { "epoch": 1.3287080223880596, "grad_norm": 0.4210675259494679, "learning_rate": 4.451252370136536e-05, "loss": 0.5168, "step": 11395 }, { "epoch": 1.3292910447761195, "grad_norm": 0.4353151647796139, "learning_rate": 4.450684443905615e-05, "loss": 0.4912, "step": 11400 }, { "epoch": 1.329874067164179, "grad_norm": 0.4876532002745093, "learning_rate": 4.450116264799214e-05, "loss": 0.4966, "step": 11405 }, { "epoch": 1.3304570895522387, "grad_norm": 0.4373095960768425, "learning_rate": 4.4495478329018125e-05, "loss": 0.5092, "step": 11410 }, { "epoch": 1.3310401119402986, "grad_norm": 0.4627316321181924, "learning_rate": 4.448979148297932e-05, "loss": 0.4972, "step": 11415 }, { "epoch": 1.3316231343283582, "grad_norm": 1.252558091692985, "learning_rate": 4.44841021107213e-05, "loss": 0.5056, "step": 11420 }, { "epoch": 1.3322061567164178, "grad_norm": 0.48649675930152275, "learning_rate": 4.4478410213090035e-05, "loss": 0.4868, "step": 11425 }, { "epoch": 1.3327891791044777, "grad_norm": 0.4738155854412053, "learning_rate": 4.447271579093185e-05, "loss": 0.4811, "step": 11430 }, { "epoch": 1.3333722014925373, "grad_norm": 0.42461686825587563, "learning_rate": 4.446701884509343e-05, "loss": 0.4794, "step": 11435 }, { "epoch": 1.333955223880597, "grad_norm": 0.4596175102509013, "learning_rate": 4.4461319376421875e-05, "loss": 0.4838, "step": 11440 }, { "epoch": 1.3345382462686568, "grad_norm": 0.4398843309190659, "learning_rate": 4.445561738576464e-05, "loss": 0.4937, "step": 11445 }, { "epoch": 1.3351212686567164, "grad_norm": 0.4893163040733851, "learning_rate": 4.444991287396955e-05, "loss": 0.4924, "step": 11450 }, { "epoch": 1.335704291044776, "grad_norm": 0.437191903940666, "learning_rate": 4.444420584188482e-05, "loss": 0.4925, "step": 11455 }, { "epoch": 1.3362873134328357, "grad_norm": 0.417694375488598, "learning_rate": 4.443849629035903e-05, "loss": 0.4461, "step": 11460 }, { "epoch": 1.3368703358208955, "grad_norm": 0.4417471806673306, "learning_rate": 4.443278422024113e-05, "loss": 0.4595, "step": 11465 }, { "epoch": 1.3374533582089552, "grad_norm": 0.48774322715083146, "learning_rate": 4.4427069632380455e-05, "loss": 0.5065, "step": 11470 }, { "epoch": 1.338036380597015, "grad_norm": 0.41939788274877826, "learning_rate": 4.4421352527626706e-05, "loss": 0.4693, "step": 11475 }, { "epoch": 1.3386194029850746, "grad_norm": 0.4440878300703214, "learning_rate": 4.441563290682996e-05, "loss": 0.4914, "step": 11480 }, { "epoch": 1.3392024253731343, "grad_norm": 0.45833074723308814, "learning_rate": 4.440991077084067e-05, "loss": 0.4647, "step": 11485 }, { "epoch": 1.339785447761194, "grad_norm": 0.4256435153711335, "learning_rate": 4.4404186120509674e-05, "loss": 0.5241, "step": 11490 }, { "epoch": 1.3403684701492538, "grad_norm": 0.4502983106443609, "learning_rate": 4.4398458956688156e-05, "loss": 0.4878, "step": 11495 }, { "epoch": 1.3409514925373134, "grad_norm": 0.5696469219475851, "learning_rate": 4.43927292802277e-05, "loss": 0.5033, "step": 11500 }, { "epoch": 1.3415345149253732, "grad_norm": 0.4461570791917921, "learning_rate": 4.4386997091980255e-05, "loss": 0.484, "step": 11505 }, { "epoch": 1.3421175373134329, "grad_norm": 0.4340282222712749, "learning_rate": 4.438126239279814e-05, "loss": 0.5006, "step": 11510 }, { "epoch": 1.3427005597014925, "grad_norm": 0.4837057378517508, "learning_rate": 4.437552518353405e-05, "loss": 0.4833, "step": 11515 }, { "epoch": 1.3432835820895521, "grad_norm": 0.4750137544995223, "learning_rate": 4.436978546504105e-05, "loss": 0.4787, "step": 11520 }, { "epoch": 1.343866604477612, "grad_norm": 0.4518116308598689, "learning_rate": 4.436404323817258e-05, "loss": 0.4907, "step": 11525 }, { "epoch": 1.3444496268656716, "grad_norm": 0.44161839241271017, "learning_rate": 4.435829850378247e-05, "loss": 0.5339, "step": 11530 }, { "epoch": 1.3450326492537314, "grad_norm": 0.46451523565596275, "learning_rate": 4.435255126272489e-05, "loss": 0.4786, "step": 11535 }, { "epoch": 1.345615671641791, "grad_norm": 0.472507616974173, "learning_rate": 4.43468015158544e-05, "loss": 0.4932, "step": 11540 }, { "epoch": 1.3461986940298507, "grad_norm": 0.44181411074076127, "learning_rate": 4.434104926402594e-05, "loss": 0.4824, "step": 11545 }, { "epoch": 1.3467817164179103, "grad_norm": 0.5750209645654945, "learning_rate": 4.433529450809481e-05, "loss": 0.5099, "step": 11550 }, { "epoch": 1.3473647388059702, "grad_norm": 0.43343667019679816, "learning_rate": 4.432953724891669e-05, "loss": 0.4618, "step": 11555 }, { "epoch": 1.3479477611940298, "grad_norm": 0.6911166321822115, "learning_rate": 4.432377748734763e-05, "loss": 0.5044, "step": 11560 }, { "epoch": 1.3485307835820897, "grad_norm": 0.4039695330997034, "learning_rate": 4.4318015224244044e-05, "loss": 0.464, "step": 11565 }, { "epoch": 1.3491138059701493, "grad_norm": 0.45677998741177633, "learning_rate": 4.431225046046274e-05, "loss": 0.4892, "step": 11570 }, { "epoch": 1.349696828358209, "grad_norm": 0.42028095255888276, "learning_rate": 4.4306483196860866e-05, "loss": 0.4714, "step": 11575 }, { "epoch": 1.3502798507462686, "grad_norm": 0.43454719039788725, "learning_rate": 4.430071343429597e-05, "loss": 0.4839, "step": 11580 }, { "epoch": 1.3508628731343284, "grad_norm": 0.4270067192100428, "learning_rate": 4.429494117362595e-05, "loss": 0.4919, "step": 11585 }, { "epoch": 1.351445895522388, "grad_norm": 0.47601183858432783, "learning_rate": 4.4289166415709096e-05, "loss": 0.4867, "step": 11590 }, { "epoch": 1.3520289179104479, "grad_norm": 0.39834803271753444, "learning_rate": 4.428338916140406e-05, "loss": 0.4529, "step": 11595 }, { "epoch": 1.3526119402985075, "grad_norm": 0.41806798471822515, "learning_rate": 4.427760941156986e-05, "loss": 0.469, "step": 11600 }, { "epoch": 1.3531949626865671, "grad_norm": 0.4704193815184823, "learning_rate": 4.427182716706589e-05, "loss": 0.4891, "step": 11605 }, { "epoch": 1.3537779850746268, "grad_norm": 0.4476846974897753, "learning_rate": 4.426604242875191e-05, "loss": 0.4983, "step": 11610 }, { "epoch": 1.3543610074626866, "grad_norm": 0.4472584935699144, "learning_rate": 4.426025519748807e-05, "loss": 0.4858, "step": 11615 }, { "epoch": 1.3549440298507462, "grad_norm": 0.4288138419577691, "learning_rate": 4.4254465474134856e-05, "loss": 0.4909, "step": 11620 }, { "epoch": 1.355527052238806, "grad_norm": 0.43688944499628235, "learning_rate": 4.424867325955315e-05, "loss": 0.4484, "step": 11625 }, { "epoch": 1.3561100746268657, "grad_norm": 0.4467119070895311, "learning_rate": 4.42428785546042e-05, "loss": 0.4572, "step": 11630 }, { "epoch": 1.3566930970149254, "grad_norm": 0.48775234180390487, "learning_rate": 4.4237081360149646e-05, "loss": 0.5326, "step": 11635 }, { "epoch": 1.357276119402985, "grad_norm": 0.42440937988251515, "learning_rate": 4.423128167705144e-05, "loss": 0.5245, "step": 11640 }, { "epoch": 1.3578591417910448, "grad_norm": 0.4265868048302539, "learning_rate": 4.4225479506171956e-05, "loss": 0.4683, "step": 11645 }, { "epoch": 1.3584421641791045, "grad_norm": 0.4185160086710533, "learning_rate": 4.4219674848373924e-05, "loss": 0.4848, "step": 11650 }, { "epoch": 1.3590251865671643, "grad_norm": 0.4282718371282175, "learning_rate": 4.421386770452042e-05, "loss": 0.4682, "step": 11655 }, { "epoch": 1.359608208955224, "grad_norm": 0.3966715398630358, "learning_rate": 4.4208058075474945e-05, "loss": 0.4707, "step": 11660 }, { "epoch": 1.3601912313432836, "grad_norm": 0.43928634287198337, "learning_rate": 4.4202245962101314e-05, "loss": 0.4829, "step": 11665 }, { "epoch": 1.3607742537313432, "grad_norm": 0.4079043636556758, "learning_rate": 4.419643136526373e-05, "loss": 0.53, "step": 11670 }, { "epoch": 1.361357276119403, "grad_norm": 0.47151409999679794, "learning_rate": 4.419061428582678e-05, "loss": 0.488, "step": 11675 }, { "epoch": 1.3619402985074627, "grad_norm": 0.4928984564012108, "learning_rate": 4.418479472465539e-05, "loss": 0.4987, "step": 11680 }, { "epoch": 1.3625233208955223, "grad_norm": 0.4344552366085095, "learning_rate": 4.41789726826149e-05, "loss": 0.4862, "step": 11685 }, { "epoch": 1.3631063432835822, "grad_norm": 0.4668029110763993, "learning_rate": 4.417314816057096e-05, "loss": 0.5062, "step": 11690 }, { "epoch": 1.3636893656716418, "grad_norm": 0.49558056969821035, "learning_rate": 4.416732115938965e-05, "loss": 0.5118, "step": 11695 }, { "epoch": 1.3642723880597014, "grad_norm": 0.5214969389536915, "learning_rate": 4.416149167993737e-05, "loss": 0.4765, "step": 11700 }, { "epoch": 1.3648554104477613, "grad_norm": 0.5157968315431264, "learning_rate": 4.415565972308092e-05, "loss": 0.5147, "step": 11705 }, { "epoch": 1.365438432835821, "grad_norm": 0.42535300596999137, "learning_rate": 4.4149825289687454e-05, "loss": 0.4939, "step": 11710 }, { "epoch": 1.3660214552238805, "grad_norm": 0.44669677244769357, "learning_rate": 4.414398838062448e-05, "loss": 0.4892, "step": 11715 }, { "epoch": 1.3666044776119404, "grad_norm": 0.4703220090528917, "learning_rate": 4.413814899675991e-05, "loss": 0.514, "step": 11720 }, { "epoch": 1.3671875, "grad_norm": 0.427440575861135, "learning_rate": 4.413230713896199e-05, "loss": 0.4975, "step": 11725 }, { "epoch": 1.3677705223880596, "grad_norm": 0.4328688500353565, "learning_rate": 4.4126462808099364e-05, "loss": 0.5014, "step": 11730 }, { "epoch": 1.3683535447761195, "grad_norm": 0.4368082006934825, "learning_rate": 4.4120616005041014e-05, "loss": 0.4863, "step": 11735 }, { "epoch": 1.368936567164179, "grad_norm": 0.42516872447155635, "learning_rate": 4.411476673065631e-05, "loss": 0.4882, "step": 11740 }, { "epoch": 1.3695195895522387, "grad_norm": 0.40472021682783205, "learning_rate": 4.4108914985814985e-05, "loss": 0.4384, "step": 11745 }, { "epoch": 1.3701026119402986, "grad_norm": 0.4600007872153849, "learning_rate": 4.410306077138713e-05, "loss": 0.5051, "step": 11750 }, { "epoch": 1.3706856343283582, "grad_norm": 0.5136663989613519, "learning_rate": 4.409720408824323e-05, "loss": 0.4675, "step": 11755 }, { "epoch": 1.3712686567164178, "grad_norm": 0.4400619155182538, "learning_rate": 4.409134493725409e-05, "loss": 0.4662, "step": 11760 }, { "epoch": 1.3718516791044777, "grad_norm": 0.449065541332509, "learning_rate": 4.408548331929092e-05, "loss": 0.5552, "step": 11765 }, { "epoch": 1.3724347014925373, "grad_norm": 0.47887706466721547, "learning_rate": 4.407961923522529e-05, "loss": 0.5058, "step": 11770 }, { "epoch": 1.373017723880597, "grad_norm": 0.38601814241288335, "learning_rate": 4.407375268592914e-05, "loss": 0.4809, "step": 11775 }, { "epoch": 1.3736007462686568, "grad_norm": 0.45229299340709755, "learning_rate": 4.406788367227475e-05, "loss": 0.4887, "step": 11780 }, { "epoch": 1.3741837686567164, "grad_norm": 0.49749738471497534, "learning_rate": 4.4062012195134814e-05, "loss": 0.5211, "step": 11785 }, { "epoch": 1.374766791044776, "grad_norm": 0.4574798612280582, "learning_rate": 4.4056138255382335e-05, "loss": 0.4905, "step": 11790 }, { "epoch": 1.3753498134328357, "grad_norm": 0.4533972511175499, "learning_rate": 4.405026185389073e-05, "loss": 0.536, "step": 11795 }, { "epoch": 1.3759328358208955, "grad_norm": 0.4716256768316318, "learning_rate": 4.404438299153376e-05, "loss": 0.542, "step": 11800 }, { "epoch": 1.3765158582089552, "grad_norm": 0.41848445317544863, "learning_rate": 4.4038501669185544e-05, "loss": 0.5097, "step": 11805 }, { "epoch": 1.377098880597015, "grad_norm": 0.46718965756679864, "learning_rate": 4.4032617887720604e-05, "loss": 0.5251, "step": 11810 }, { "epoch": 1.3776819029850746, "grad_norm": 0.40800098396507833, "learning_rate": 4.402673164801377e-05, "loss": 0.4919, "step": 11815 }, { "epoch": 1.3782649253731343, "grad_norm": 0.45963121993501865, "learning_rate": 4.4020842950940294e-05, "loss": 0.527, "step": 11820 }, { "epoch": 1.378847947761194, "grad_norm": 0.49038282501383, "learning_rate": 4.401495179737576e-05, "loss": 0.4581, "step": 11825 }, { "epoch": 1.3794309701492538, "grad_norm": 0.44009117483164156, "learning_rate": 4.400905818819613e-05, "loss": 0.4752, "step": 11830 }, { "epoch": 1.3800139925373134, "grad_norm": 0.4590450905749542, "learning_rate": 4.400316212427772e-05, "loss": 0.5268, "step": 11835 }, { "epoch": 1.3805970149253732, "grad_norm": 0.48248426252968263, "learning_rate": 4.3997263606497225e-05, "loss": 0.4727, "step": 11840 }, { "epoch": 1.3811800373134329, "grad_norm": 0.4814309262867952, "learning_rate": 4.3991362635731684e-05, "loss": 0.4886, "step": 11845 }, { "epoch": 1.3817630597014925, "grad_norm": 0.44899737588705313, "learning_rate": 4.3985459212858535e-05, "loss": 0.5019, "step": 11850 }, { "epoch": 1.3823460820895521, "grad_norm": 0.4414302378152841, "learning_rate": 4.397955333875555e-05, "loss": 0.5178, "step": 11855 }, { "epoch": 1.382929104477612, "grad_norm": 0.6612564285628922, "learning_rate": 4.397364501430088e-05, "loss": 0.5117, "step": 11860 }, { "epoch": 1.3835121268656716, "grad_norm": 0.4337120359984851, "learning_rate": 4.3967734240373025e-05, "loss": 0.5125, "step": 11865 }, { "epoch": 1.3840951492537314, "grad_norm": 0.4440649198676861, "learning_rate": 4.396182101785089e-05, "loss": 0.4958, "step": 11870 }, { "epoch": 1.384678171641791, "grad_norm": 9.663684236466022, "learning_rate": 4.3955905347613666e-05, "loss": 0.5006, "step": 11875 }, { "epoch": 1.3852611940298507, "grad_norm": 0.44755912533370196, "learning_rate": 4.3949987230541e-05, "loss": 0.4754, "step": 11880 }, { "epoch": 1.3858442164179103, "grad_norm": 0.5148467190767787, "learning_rate": 4.394406666751284e-05, "loss": 0.4958, "step": 11885 }, { "epoch": 1.3864272388059702, "grad_norm": 0.43418661721534973, "learning_rate": 4.3938143659409515e-05, "loss": 0.4815, "step": 11890 }, { "epoch": 1.3870102611940298, "grad_norm": 0.4691447765306588, "learning_rate": 4.393221820711173e-05, "loss": 0.5257, "step": 11895 }, { "epoch": 1.3875932835820897, "grad_norm": 0.4291236683000934, "learning_rate": 4.392629031150054e-05, "loss": 0.4615, "step": 11900 }, { "epoch": 1.3881763059701493, "grad_norm": 0.4901153343134047, "learning_rate": 4.392035997345736e-05, "loss": 0.4896, "step": 11905 }, { "epoch": 1.388759328358209, "grad_norm": 0.3982941566431026, "learning_rate": 4.391442719386398e-05, "loss": 0.4629, "step": 11910 }, { "epoch": 1.3893423507462686, "grad_norm": 0.4117537859151202, "learning_rate": 4.390849197360254e-05, "loss": 0.4508, "step": 11915 }, { "epoch": 1.3899253731343284, "grad_norm": 0.43084463356368424, "learning_rate": 4.390255431355557e-05, "loss": 0.5008, "step": 11920 }, { "epoch": 1.390508395522388, "grad_norm": 0.42556976076020264, "learning_rate": 4.389661421460592e-05, "loss": 0.4614, "step": 11925 }, { "epoch": 1.3910914179104479, "grad_norm": 0.46479192102788114, "learning_rate": 4.389067167763683e-05, "loss": 0.4864, "step": 11930 }, { "epoch": 1.3916744402985075, "grad_norm": 0.4816085181354391, "learning_rate": 4.388472670353191e-05, "loss": 0.5218, "step": 11935 }, { "epoch": 1.3922574626865671, "grad_norm": 0.4610453279542012, "learning_rate": 4.387877929317512e-05, "loss": 0.5116, "step": 11940 }, { "epoch": 1.3928404850746268, "grad_norm": 0.4504152664235486, "learning_rate": 4.387282944745077e-05, "loss": 0.4922, "step": 11945 }, { "epoch": 1.3934235074626866, "grad_norm": 0.42146949445467374, "learning_rate": 4.3866877167243554e-05, "loss": 0.4677, "step": 11950 }, { "epoch": 1.3940065298507462, "grad_norm": 0.44820586510689486, "learning_rate": 4.3860922453438515e-05, "loss": 0.5098, "step": 11955 }, { "epoch": 1.394589552238806, "grad_norm": 0.4775795192769077, "learning_rate": 4.3854965306921064e-05, "loss": 0.5096, "step": 11960 }, { "epoch": 1.3951725746268657, "grad_norm": 0.4322143493768893, "learning_rate": 4.3849005728576975e-05, "loss": 0.481, "step": 11965 }, { "epoch": 1.3957555970149254, "grad_norm": 0.4594739362514963, "learning_rate": 4.384304371929238e-05, "loss": 0.4891, "step": 11970 }, { "epoch": 1.396338619402985, "grad_norm": 0.38526098315876905, "learning_rate": 4.383707927995377e-05, "loss": 0.4581, "step": 11975 }, { "epoch": 1.3969216417910448, "grad_norm": 0.4428604325260078, "learning_rate": 4.383111241144798e-05, "loss": 0.4766, "step": 11980 }, { "epoch": 1.3975046641791045, "grad_norm": 0.6088745381734858, "learning_rate": 4.3825143114662266e-05, "loss": 0.472, "step": 11985 }, { "epoch": 1.3980876865671643, "grad_norm": 0.4331264380996842, "learning_rate": 4.3819171390484184e-05, "loss": 0.4965, "step": 11990 }, { "epoch": 1.398670708955224, "grad_norm": 0.45889896992386375, "learning_rate": 4.381319723980167e-05, "loss": 0.5156, "step": 11995 }, { "epoch": 1.3992537313432836, "grad_norm": 0.4993820307760769, "learning_rate": 4.380722066350303e-05, "loss": 0.4816, "step": 12000 }, { "epoch": 1.3998367537313432, "grad_norm": 0.44968773852326066, "learning_rate": 4.380124166247691e-05, "loss": 0.4763, "step": 12005 }, { "epoch": 1.400419776119403, "grad_norm": 0.45066812781558496, "learning_rate": 4.3795260237612353e-05, "loss": 0.5029, "step": 12010 }, { "epoch": 1.4010027985074627, "grad_norm": 0.44516074290492613, "learning_rate": 4.378927638979871e-05, "loss": 0.5189, "step": 12015 }, { "epoch": 1.4015858208955223, "grad_norm": 0.4560257185162567, "learning_rate": 4.378329011992575e-05, "loss": 0.4886, "step": 12020 }, { "epoch": 1.4021688432835822, "grad_norm": 0.451992209920357, "learning_rate": 4.377730142888356e-05, "loss": 0.484, "step": 12025 }, { "epoch": 1.4027518656716418, "grad_norm": 0.42224508768129243, "learning_rate": 4.37713103175626e-05, "loss": 0.4594, "step": 12030 }, { "epoch": 1.4033348880597014, "grad_norm": 0.4777349919451058, "learning_rate": 4.376531678685369e-05, "loss": 0.4843, "step": 12035 }, { "epoch": 1.4039179104477613, "grad_norm": 0.45962348039814177, "learning_rate": 4.375932083764803e-05, "loss": 0.4696, "step": 12040 }, { "epoch": 1.404500932835821, "grad_norm": 0.4355260970118706, "learning_rate": 4.3753322470837135e-05, "loss": 0.4979, "step": 12045 }, { "epoch": 1.4050839552238805, "grad_norm": 0.4601965696034231, "learning_rate": 4.3747321687312916e-05, "loss": 0.4809, "step": 12050 }, { "epoch": 1.4056669776119404, "grad_norm": 0.4828330309644055, "learning_rate": 4.3741318487967634e-05, "loss": 0.5025, "step": 12055 }, { "epoch": 1.40625, "grad_norm": 0.4890520866160836, "learning_rate": 4.37353128736939e-05, "loss": 0.509, "step": 12060 }, { "epoch": 1.4068330223880596, "grad_norm": 0.4064163487682252, "learning_rate": 4.3729304845384695e-05, "loss": 0.4689, "step": 12065 }, { "epoch": 1.4074160447761195, "grad_norm": 0.4149499769771672, "learning_rate": 4.3723294403933355e-05, "loss": 0.5198, "step": 12070 }, { "epoch": 1.407999067164179, "grad_norm": 0.43687821599607873, "learning_rate": 4.371728155023358e-05, "loss": 0.4837, "step": 12075 }, { "epoch": 1.4085820895522387, "grad_norm": 0.45448131654117757, "learning_rate": 4.3711266285179415e-05, "loss": 0.4978, "step": 12080 }, { "epoch": 1.4091651119402986, "grad_norm": 0.4723176347005134, "learning_rate": 4.370524860966529e-05, "loss": 0.5008, "step": 12085 }, { "epoch": 1.4097481343283582, "grad_norm": 0.46347806983282575, "learning_rate": 4.369922852458594e-05, "loss": 0.515, "step": 12090 }, { "epoch": 1.4103311567164178, "grad_norm": 0.4436342437266037, "learning_rate": 4.369320603083653e-05, "loss": 0.5111, "step": 12095 }, { "epoch": 1.4109141791044777, "grad_norm": 0.418830107444406, "learning_rate": 4.3687181129312534e-05, "loss": 0.4707, "step": 12100 }, { "epoch": 1.4114972014925373, "grad_norm": 0.4707556769100974, "learning_rate": 4.368115382090979e-05, "loss": 0.5223, "step": 12105 }, { "epoch": 1.412080223880597, "grad_norm": 0.539125567160963, "learning_rate": 4.3675124106524514e-05, "loss": 0.4929, "step": 12110 }, { "epoch": 1.4126632462686568, "grad_norm": 0.44387996002816776, "learning_rate": 4.366909198705325e-05, "loss": 0.4558, "step": 12115 }, { "epoch": 1.4132462686567164, "grad_norm": 0.4469636312909113, "learning_rate": 4.366305746339293e-05, "loss": 0.5214, "step": 12120 }, { "epoch": 1.413829291044776, "grad_norm": 0.4029416312443115, "learning_rate": 4.365702053644083e-05, "loss": 0.4463, "step": 12125 }, { "epoch": 1.4144123134328357, "grad_norm": 0.4456710663609492, "learning_rate": 4.365098120709458e-05, "loss": 0.4572, "step": 12130 }, { "epoch": 1.4149953358208955, "grad_norm": 0.44908954981495897, "learning_rate": 4.364493947625217e-05, "loss": 0.4933, "step": 12135 }, { "epoch": 1.4155783582089552, "grad_norm": 0.4503655079561281, "learning_rate": 4.363889534481195e-05, "loss": 0.5046, "step": 12140 }, { "epoch": 1.416161380597015, "grad_norm": 0.424261930398403, "learning_rate": 4.3632848813672614e-05, "loss": 0.4653, "step": 12145 }, { "epoch": 1.4167444029850746, "grad_norm": 0.44105778628436115, "learning_rate": 4.3626799883733236e-05, "loss": 0.4921, "step": 12150 }, { "epoch": 1.4173274253731343, "grad_norm": 0.42006012534571524, "learning_rate": 4.362074855589322e-05, "loss": 0.4848, "step": 12155 }, { "epoch": 1.417910447761194, "grad_norm": 0.40687698386764304, "learning_rate": 4.361469483105236e-05, "loss": 0.4841, "step": 12160 }, { "epoch": 1.4184934701492538, "grad_norm": 0.4102416331704393, "learning_rate": 4.3608638710110775e-05, "loss": 0.4639, "step": 12165 }, { "epoch": 1.4190764925373134, "grad_norm": 0.42134394967904526, "learning_rate": 4.360258019396895e-05, "loss": 0.483, "step": 12170 }, { "epoch": 1.4196595149253732, "grad_norm": 0.5353276112439099, "learning_rate": 4.3596519283527745e-05, "loss": 0.5029, "step": 12175 }, { "epoch": 1.4202425373134329, "grad_norm": 0.47287256377441444, "learning_rate": 4.3590455979688335e-05, "loss": 0.5101, "step": 12180 }, { "epoch": 1.4208255597014925, "grad_norm": 0.4626159700923588, "learning_rate": 4.358439028335229e-05, "loss": 0.5342, "step": 12185 }, { "epoch": 1.4214085820895521, "grad_norm": 0.4720855733706812, "learning_rate": 4.357832219542151e-05, "loss": 0.5208, "step": 12190 }, { "epoch": 1.421991604477612, "grad_norm": 0.4492415730868526, "learning_rate": 4.357225171679828e-05, "loss": 0.5087, "step": 12195 }, { "epoch": 1.4225746268656716, "grad_norm": 0.4644605710513964, "learning_rate": 4.3566178848385194e-05, "loss": 0.4889, "step": 12200 }, { "epoch": 1.4231576492537314, "grad_norm": 0.41588190425927396, "learning_rate": 4.3560103591085264e-05, "loss": 0.4761, "step": 12205 }, { "epoch": 1.423740671641791, "grad_norm": 0.4786554490248903, "learning_rate": 4.35540259458018e-05, "loss": 0.4776, "step": 12210 }, { "epoch": 1.4243236940298507, "grad_norm": 0.44410318260591897, "learning_rate": 4.3547945913438494e-05, "loss": 0.489, "step": 12215 }, { "epoch": 1.4249067164179103, "grad_norm": 0.39959270238242506, "learning_rate": 4.3541863494899385e-05, "loss": 0.484, "step": 12220 }, { "epoch": 1.4254897388059702, "grad_norm": 0.43498444964457617, "learning_rate": 4.353577869108887e-05, "loss": 0.4863, "step": 12225 }, { "epoch": 1.4260727611940298, "grad_norm": 0.42608426819692075, "learning_rate": 4.352969150291172e-05, "loss": 0.4623, "step": 12230 }, { "epoch": 1.4266557835820897, "grad_norm": 0.5302877096128493, "learning_rate": 4.3523601931273024e-05, "loss": 0.4656, "step": 12235 }, { "epoch": 1.4272388059701493, "grad_norm": 0.43611984433348255, "learning_rate": 4.351750997707824e-05, "loss": 0.4614, "step": 12240 }, { "epoch": 1.427821828358209, "grad_norm": 0.4261003310907538, "learning_rate": 4.351141564123319e-05, "loss": 0.4668, "step": 12245 }, { "epoch": 1.4284048507462686, "grad_norm": 0.4483288490424969, "learning_rate": 4.3505318924644036e-05, "loss": 0.4852, "step": 12250 }, { "epoch": 1.4289878731343284, "grad_norm": 0.4745429308517631, "learning_rate": 4.349921982821732e-05, "loss": 0.5018, "step": 12255 }, { "epoch": 1.429570895522388, "grad_norm": 0.40722023696620646, "learning_rate": 4.34931183528599e-05, "loss": 0.4365, "step": 12260 }, { "epoch": 1.4301539179104479, "grad_norm": 0.4028239085418656, "learning_rate": 4.3487014499479016e-05, "loss": 0.4778, "step": 12265 }, { "epoch": 1.4307369402985075, "grad_norm": 0.4247854997712988, "learning_rate": 4.348090826898225e-05, "loss": 0.4988, "step": 12270 }, { "epoch": 1.4313199626865671, "grad_norm": 0.4048901928518633, "learning_rate": 4.3474799662277534e-05, "loss": 0.4755, "step": 12275 }, { "epoch": 1.4319029850746268, "grad_norm": 0.4437514833245814, "learning_rate": 4.346868868027318e-05, "loss": 0.495, "step": 12280 }, { "epoch": 1.4324860074626866, "grad_norm": 0.4756215405863023, "learning_rate": 4.3462575323877804e-05, "loss": 0.4735, "step": 12285 }, { "epoch": 1.4330690298507462, "grad_norm": 0.42528856885323113, "learning_rate": 4.345645959400043e-05, "loss": 0.4771, "step": 12290 }, { "epoch": 1.433652052238806, "grad_norm": 0.4391604171730095, "learning_rate": 4.345034149155039e-05, "loss": 0.4647, "step": 12295 }, { "epoch": 1.4342350746268657, "grad_norm": 0.40144749141967323, "learning_rate": 4.344422101743739e-05, "loss": 0.4597, "step": 12300 }, { "epoch": 1.4348180970149254, "grad_norm": 0.7966750253828946, "learning_rate": 4.343809817257149e-05, "loss": 0.4805, "step": 12305 }, { "epoch": 1.435401119402985, "grad_norm": 0.47601640019814095, "learning_rate": 4.3431972957863106e-05, "loss": 0.4944, "step": 12310 }, { "epoch": 1.4359841417910448, "grad_norm": 0.4536554294663507, "learning_rate": 4.342584537422298e-05, "loss": 0.5054, "step": 12315 }, { "epoch": 1.4365671641791045, "grad_norm": 0.4887767983955777, "learning_rate": 4.341971542256225e-05, "loss": 0.492, "step": 12320 }, { "epoch": 1.4371501865671643, "grad_norm": 0.3975790565593417, "learning_rate": 4.341358310379235e-05, "loss": 0.4937, "step": 12325 }, { "epoch": 1.437733208955224, "grad_norm": 0.44357373763971175, "learning_rate": 4.340744841882512e-05, "loss": 0.4881, "step": 12330 }, { "epoch": 1.4383162313432836, "grad_norm": 0.3987880605562992, "learning_rate": 4.3401311368572723e-05, "loss": 0.4645, "step": 12335 }, { "epoch": 1.4388992537313432, "grad_norm": 0.40039316439532135, "learning_rate": 4.339517195394768e-05, "loss": 0.4984, "step": 12340 }, { "epoch": 1.439482276119403, "grad_norm": 0.43680532542532763, "learning_rate": 4.3389030175862854e-05, "loss": 0.4876, "step": 12345 }, { "epoch": 1.4400652985074627, "grad_norm": 0.4777486287265028, "learning_rate": 4.3382886035231484e-05, "loss": 0.5109, "step": 12350 }, { "epoch": 1.4406483208955223, "grad_norm": 0.39871466038545517, "learning_rate": 4.337673953296714e-05, "loss": 0.4875, "step": 12355 }, { "epoch": 1.4412313432835822, "grad_norm": 0.41247498145818423, "learning_rate": 4.3370590669983736e-05, "loss": 0.4728, "step": 12360 }, { "epoch": 1.4418143656716418, "grad_norm": 0.4447465949127571, "learning_rate": 4.3364439447195565e-05, "loss": 0.4983, "step": 12365 }, { "epoch": 1.4423973880597014, "grad_norm": 0.4382953649379091, "learning_rate": 4.335828586551725e-05, "loss": 0.484, "step": 12370 }, { "epoch": 1.4429804104477613, "grad_norm": 0.43182985591318496, "learning_rate": 4.335212992586376e-05, "loss": 0.5007, "step": 12375 }, { "epoch": 1.443563432835821, "grad_norm": 0.41190256419156196, "learning_rate": 4.334597162915045e-05, "loss": 0.4622, "step": 12380 }, { "epoch": 1.4441464552238805, "grad_norm": 0.5890419819477933, "learning_rate": 4.333981097629296e-05, "loss": 0.4612, "step": 12385 }, { "epoch": 1.4447294776119404, "grad_norm": 0.4122620827750522, "learning_rate": 4.333364796820735e-05, "loss": 0.5001, "step": 12390 }, { "epoch": 1.4453125, "grad_norm": 0.4424009618506142, "learning_rate": 4.332748260580999e-05, "loss": 0.5064, "step": 12395 }, { "epoch": 1.4458955223880596, "grad_norm": 0.5333277773391598, "learning_rate": 4.332131489001762e-05, "loss": 0.5058, "step": 12400 }, { "epoch": 1.4464785447761195, "grad_norm": 0.4252961144391402, "learning_rate": 4.331514482174731e-05, "loss": 0.4903, "step": 12405 }, { "epoch": 1.447061567164179, "grad_norm": 0.5145602598846947, "learning_rate": 4.3308972401916495e-05, "loss": 0.4512, "step": 12410 }, { "epoch": 1.4476445895522387, "grad_norm": 0.44184089921348135, "learning_rate": 4.330279763144296e-05, "loss": 0.4768, "step": 12415 }, { "epoch": 1.4482276119402986, "grad_norm": 0.42191608325707347, "learning_rate": 4.3296620511244804e-05, "loss": 0.5121, "step": 12420 }, { "epoch": 1.4488106343283582, "grad_norm": 0.48904339278157805, "learning_rate": 4.3290441042240544e-05, "loss": 0.4953, "step": 12425 }, { "epoch": 1.4493936567164178, "grad_norm": 0.45283412501132253, "learning_rate": 4.3284259225348985e-05, "loss": 0.4775, "step": 12430 }, { "epoch": 1.4499766791044777, "grad_norm": 0.44979751951002644, "learning_rate": 4.327807506148931e-05, "loss": 0.4713, "step": 12435 }, { "epoch": 1.4505597014925373, "grad_norm": 0.4444340926415052, "learning_rate": 4.327188855158106e-05, "loss": 0.5068, "step": 12440 }, { "epoch": 1.451142723880597, "grad_norm": 0.4318952777015103, "learning_rate": 4.3265699696544085e-05, "loss": 0.5089, "step": 12445 }, { "epoch": 1.4517257462686568, "grad_norm": 0.4653500509979724, "learning_rate": 4.325950849729862e-05, "loss": 0.4885, "step": 12450 }, { "epoch": 1.4523087686567164, "grad_norm": 0.4878397446682021, "learning_rate": 4.325331495476523e-05, "loss": 0.4948, "step": 12455 }, { "epoch": 1.452891791044776, "grad_norm": 0.4724080194199092, "learning_rate": 4.3247119069864856e-05, "loss": 0.4807, "step": 12460 }, { "epoch": 1.4534748134328357, "grad_norm": 0.42491183132207805, "learning_rate": 4.3240920843518746e-05, "loss": 0.482, "step": 12465 }, { "epoch": 1.4540578358208955, "grad_norm": 0.40164134969279697, "learning_rate": 4.323472027664852e-05, "loss": 0.4506, "step": 12470 }, { "epoch": 1.4546408582089552, "grad_norm": 0.4186169308135839, "learning_rate": 4.322851737017615e-05, "loss": 0.4801, "step": 12475 }, { "epoch": 1.455223880597015, "grad_norm": 0.41053533762928546, "learning_rate": 4.322231212502394e-05, "loss": 0.475, "step": 12480 }, { "epoch": 1.4558069029850746, "grad_norm": 0.43523829404928854, "learning_rate": 4.321610454211456e-05, "loss": 0.4734, "step": 12485 }, { "epoch": 1.4563899253731343, "grad_norm": 0.4235532721449954, "learning_rate": 4.320989462237101e-05, "loss": 0.4718, "step": 12490 }, { "epoch": 1.456972947761194, "grad_norm": 0.5057469098686633, "learning_rate": 4.3203682366716645e-05, "loss": 0.4987, "step": 12495 }, { "epoch": 1.4575559701492538, "grad_norm": 0.4421119102163701, "learning_rate": 4.3197467776075185e-05, "loss": 0.4994, "step": 12500 }, { "epoch": 1.4581389925373134, "grad_norm": 0.40877775171569397, "learning_rate": 4.3191250851370655e-05, "loss": 0.492, "step": 12505 }, { "epoch": 1.4587220149253732, "grad_norm": 0.47115934720890046, "learning_rate": 4.318503159352748e-05, "loss": 0.4853, "step": 12510 }, { "epoch": 1.4593050373134329, "grad_norm": 0.40289925144193833, "learning_rate": 4.317881000347037e-05, "loss": 0.475, "step": 12515 }, { "epoch": 1.4598880597014925, "grad_norm": 0.4168163898786155, "learning_rate": 4.317258608212444e-05, "loss": 0.4831, "step": 12520 }, { "epoch": 1.4604710820895521, "grad_norm": 0.40185861798071193, "learning_rate": 4.316635983041512e-05, "loss": 0.4604, "step": 12525 }, { "epoch": 1.461054104477612, "grad_norm": 0.39226299307634493, "learning_rate": 4.31601312492682e-05, "loss": 0.4625, "step": 12530 }, { "epoch": 1.4616371268656716, "grad_norm": 0.45982557786300016, "learning_rate": 4.3153900339609804e-05, "loss": 0.5358, "step": 12535 }, { "epoch": 1.4622201492537314, "grad_norm": 0.4649164262902816, "learning_rate": 4.3147667102366415e-05, "loss": 0.4701, "step": 12540 }, { "epoch": 1.462803171641791, "grad_norm": 0.5759458752065588, "learning_rate": 4.3141431538464846e-05, "loss": 0.5232, "step": 12545 }, { "epoch": 1.4633861940298507, "grad_norm": 0.4097720584944929, "learning_rate": 4.313519364883227e-05, "loss": 0.4726, "step": 12550 }, { "epoch": 1.4639692164179103, "grad_norm": 0.428403317278586, "learning_rate": 4.31289534343962e-05, "loss": 0.4703, "step": 12555 }, { "epoch": 1.4645522388059702, "grad_norm": 0.4817780524968433, "learning_rate": 4.3122710896084504e-05, "loss": 0.4996, "step": 12560 }, { "epoch": 1.4651352611940298, "grad_norm": 0.44708410299744195, "learning_rate": 4.311646603482538e-05, "loss": 0.4941, "step": 12565 }, { "epoch": 1.4657182835820897, "grad_norm": 0.4474854917989366, "learning_rate": 4.3110218851547384e-05, "loss": 0.4667, "step": 12570 }, { "epoch": 1.4663013059701493, "grad_norm": 0.4232582351074315, "learning_rate": 4.31039693471794e-05, "loss": 0.4855, "step": 12575 }, { "epoch": 1.466884328358209, "grad_norm": 0.41313500688774263, "learning_rate": 4.309771752265069e-05, "loss": 0.4631, "step": 12580 }, { "epoch": 1.4674673507462686, "grad_norm": 0.43845478336810845, "learning_rate": 4.309146337889082e-05, "loss": 0.4786, "step": 12585 }, { "epoch": 1.4680503731343284, "grad_norm": 0.40820439801952435, "learning_rate": 4.308520691682974e-05, "loss": 0.4524, "step": 12590 }, { "epoch": 1.468633395522388, "grad_norm": 0.4259621029588171, "learning_rate": 4.30789481373977e-05, "loss": 0.4863, "step": 12595 }, { "epoch": 1.4692164179104479, "grad_norm": 0.4133570436825587, "learning_rate": 4.307268704152535e-05, "loss": 0.4689, "step": 12600 }, { "epoch": 1.4697994402985075, "grad_norm": 0.4457935140957306, "learning_rate": 4.306642363014363e-05, "loss": 0.5014, "step": 12605 }, { "epoch": 1.4703824626865671, "grad_norm": 0.46010583854052445, "learning_rate": 4.3060157904183873e-05, "loss": 0.4938, "step": 12610 }, { "epoch": 1.4709654850746268, "grad_norm": 0.4006398533843468, "learning_rate": 4.305388986457772e-05, "loss": 0.4763, "step": 12615 }, { "epoch": 1.4715485074626866, "grad_norm": 0.4234382305757754, "learning_rate": 4.3047619512257164e-05, "loss": 0.4519, "step": 12620 }, { "epoch": 1.4721315298507462, "grad_norm": 0.4093525102315727, "learning_rate": 4.304134684815455e-05, "loss": 0.4689, "step": 12625 }, { "epoch": 1.472714552238806, "grad_norm": 0.4196275704466523, "learning_rate": 4.3035071873202563e-05, "loss": 0.4616, "step": 12630 }, { "epoch": 1.4732975746268657, "grad_norm": 0.4259015394284289, "learning_rate": 4.3028794588334246e-05, "loss": 0.5045, "step": 12635 }, { "epoch": 1.4738805970149254, "grad_norm": 0.3939914414962169, "learning_rate": 4.302251499448294e-05, "loss": 0.4961, "step": 12640 }, { "epoch": 1.474463619402985, "grad_norm": 0.44185520364444403, "learning_rate": 4.301623309258239e-05, "loss": 0.4972, "step": 12645 }, { "epoch": 1.4750466417910448, "grad_norm": 0.46608646485523214, "learning_rate": 4.3009948883566645e-05, "loss": 0.5024, "step": 12650 }, { "epoch": 1.4756296641791045, "grad_norm": 0.39388424659760235, "learning_rate": 4.3003662368370094e-05, "loss": 0.5013, "step": 12655 }, { "epoch": 1.4762126865671643, "grad_norm": 0.40116437547289896, "learning_rate": 4.29973735479275e-05, "loss": 0.4845, "step": 12660 }, { "epoch": 1.476795708955224, "grad_norm": 0.43411032490509016, "learning_rate": 4.299108242317393e-05, "loss": 0.4605, "step": 12665 }, { "epoch": 1.4773787313432836, "grad_norm": 0.39980492762307934, "learning_rate": 4.298478899504485e-05, "loss": 0.4931, "step": 12670 }, { "epoch": 1.4779617537313432, "grad_norm": 0.4688618156846162, "learning_rate": 4.297849326447599e-05, "loss": 0.4978, "step": 12675 }, { "epoch": 1.478544776119403, "grad_norm": 0.40564826416738364, "learning_rate": 4.297219523240349e-05, "loss": 0.4496, "step": 12680 }, { "epoch": 1.4791277985074627, "grad_norm": 0.4345714909631159, "learning_rate": 4.2965894899763796e-05, "loss": 0.4752, "step": 12685 }, { "epoch": 1.4797108208955223, "grad_norm": 0.4334778318451127, "learning_rate": 4.2959592267493715e-05, "loss": 0.49, "step": 12690 }, { "epoch": 1.4802938432835822, "grad_norm": 0.4166685929591067, "learning_rate": 4.29532873365304e-05, "loss": 0.4756, "step": 12695 }, { "epoch": 1.4808768656716418, "grad_norm": 0.5209174926132473, "learning_rate": 4.2946980107811295e-05, "loss": 0.5279, "step": 12700 }, { "epoch": 1.4814598880597014, "grad_norm": 0.4827579151766335, "learning_rate": 4.2940670582274265e-05, "loss": 0.4841, "step": 12705 }, { "epoch": 1.4820429104477613, "grad_norm": 0.48644754238159954, "learning_rate": 4.2934358760857454e-05, "loss": 0.5016, "step": 12710 }, { "epoch": 1.482625932835821, "grad_norm": 0.4335735653052962, "learning_rate": 4.2928044644499374e-05, "loss": 0.4904, "step": 12715 }, { "epoch": 1.4832089552238805, "grad_norm": 0.4125137397259088, "learning_rate": 4.292172823413887e-05, "loss": 0.4414, "step": 12720 }, { "epoch": 1.4837919776119404, "grad_norm": 0.46539183622297464, "learning_rate": 4.2915409530715144e-05, "loss": 0.5214, "step": 12725 }, { "epoch": 1.484375, "grad_norm": 0.4300560406379161, "learning_rate": 4.2909088535167714e-05, "loss": 0.445, "step": 12730 }, { "epoch": 1.4849580223880596, "grad_norm": 0.4360665003036945, "learning_rate": 4.2902765248436454e-05, "loss": 0.5046, "step": 12735 }, { "epoch": 1.4855410447761195, "grad_norm": 0.4335094594302782, "learning_rate": 4.289643967146158e-05, "loss": 0.4974, "step": 12740 }, { "epoch": 1.486124067164179, "grad_norm": 0.48488705242221036, "learning_rate": 4.2890111805183646e-05, "loss": 0.5096, "step": 12745 }, { "epoch": 1.4867070895522387, "grad_norm": 0.40918438376476834, "learning_rate": 4.288378165054354e-05, "loss": 0.4499, "step": 12750 }, { "epoch": 1.4872901119402986, "grad_norm": 0.39800136322363067, "learning_rate": 4.28774492084825e-05, "loss": 0.4899, "step": 12755 }, { "epoch": 1.4878731343283582, "grad_norm": 0.41219574591314934, "learning_rate": 4.28711144799421e-05, "loss": 0.4824, "step": 12760 }, { "epoch": 1.4884561567164178, "grad_norm": 0.4484850868217721, "learning_rate": 4.286477746586424e-05, "loss": 0.4791, "step": 12765 }, { "epoch": 1.4890391791044777, "grad_norm": 0.49051255211363426, "learning_rate": 4.2858438167191185e-05, "loss": 0.4777, "step": 12770 }, { "epoch": 1.4896222014925373, "grad_norm": 0.425993495221689, "learning_rate": 4.285209658486553e-05, "loss": 0.4787, "step": 12775 }, { "epoch": 1.490205223880597, "grad_norm": 0.4847765436901161, "learning_rate": 4.2845752719830206e-05, "loss": 0.4705, "step": 12780 }, { "epoch": 1.4907882462686568, "grad_norm": 0.4175921333004011, "learning_rate": 4.283940657302848e-05, "loss": 0.5082, "step": 12785 }, { "epoch": 1.4913712686567164, "grad_norm": 0.4735360584268163, "learning_rate": 4.283305814540397e-05, "loss": 0.4641, "step": 12790 }, { "epoch": 1.491954291044776, "grad_norm": 0.4278079405722102, "learning_rate": 4.282670743790062e-05, "loss": 0.4927, "step": 12795 }, { "epoch": 1.4925373134328357, "grad_norm": 0.43228098072331395, "learning_rate": 4.282035445146272e-05, "loss": 0.4979, "step": 12800 }, { "epoch": 1.4931203358208955, "grad_norm": 0.4924860189029467, "learning_rate": 4.28139991870349e-05, "loss": 0.4795, "step": 12805 }, { "epoch": 1.4937033582089552, "grad_norm": 0.3976744876424136, "learning_rate": 4.2807641645562134e-05, "loss": 0.5012, "step": 12810 }, { "epoch": 1.494286380597015, "grad_norm": 0.4800407795785969, "learning_rate": 4.280128182798972e-05, "loss": 0.4818, "step": 12815 }, { "epoch": 1.4948694029850746, "grad_norm": 0.4455319688757278, "learning_rate": 4.2794919735263295e-05, "loss": 0.4624, "step": 12820 }, { "epoch": 1.4954524253731343, "grad_norm": 0.41882899861591605, "learning_rate": 4.278855536832885e-05, "loss": 0.5013, "step": 12825 }, { "epoch": 1.496035447761194, "grad_norm": 0.4421966254620205, "learning_rate": 4.278218872813271e-05, "loss": 0.4942, "step": 12830 }, { "epoch": 1.4966184701492538, "grad_norm": 0.45412038119801124, "learning_rate": 4.277581981562152e-05, "loss": 0.4706, "step": 12835 }, { "epoch": 1.4972014925373134, "grad_norm": 0.4367038149200649, "learning_rate": 4.276944863174229e-05, "loss": 0.4855, "step": 12840 }, { "epoch": 1.4977845149253732, "grad_norm": 0.4494163430328603, "learning_rate": 4.2763075177442346e-05, "loss": 0.4746, "step": 12845 }, { "epoch": 1.4983675373134329, "grad_norm": 0.427417625421576, "learning_rate": 4.275669945366936e-05, "loss": 0.4898, "step": 12850 }, { "epoch": 1.4989505597014925, "grad_norm": 0.4887656627468298, "learning_rate": 4.275032146137135e-05, "loss": 0.4808, "step": 12855 }, { "epoch": 1.4995335820895521, "grad_norm": 0.42689802776408003, "learning_rate": 4.2743941201496644e-05, "loss": 0.4847, "step": 12860 }, { "epoch": 1.500116604477612, "grad_norm": 0.43598094745387306, "learning_rate": 4.2737558674993936e-05, "loss": 0.477, "step": 12865 }, { "epoch": 1.5006996268656716, "grad_norm": 0.4238323232177558, "learning_rate": 4.2731173882812264e-05, "loss": 0.4926, "step": 12870 }, { "epoch": 1.5012826492537314, "grad_norm": 0.46203244450943115, "learning_rate": 4.2724786825900955e-05, "loss": 0.4728, "step": 12875 }, { "epoch": 1.501865671641791, "grad_norm": 0.4587859587588373, "learning_rate": 4.271839750520972e-05, "loss": 0.4925, "step": 12880 }, { "epoch": 1.5024486940298507, "grad_norm": 0.42984369024104474, "learning_rate": 4.271200592168858e-05, "loss": 0.4799, "step": 12885 }, { "epoch": 1.5030317164179103, "grad_norm": 0.4231394348503417, "learning_rate": 4.2705612076287907e-05, "loss": 0.4559, "step": 12890 }, { "epoch": 1.5036147388059702, "grad_norm": 0.3989365282022972, "learning_rate": 4.269921596995842e-05, "loss": 0.4657, "step": 12895 }, { "epoch": 1.5041977611940298, "grad_norm": 0.4384376186803543, "learning_rate": 4.2692817603651134e-05, "loss": 0.4655, "step": 12900 }, { "epoch": 1.5047807835820897, "grad_norm": 0.38533933255167363, "learning_rate": 4.268641697831744e-05, "loss": 0.4671, "step": 12905 }, { "epoch": 1.5053638059701493, "grad_norm": 0.46050400043600404, "learning_rate": 4.2680014094909035e-05, "loss": 0.4811, "step": 12910 }, { "epoch": 1.505946828358209, "grad_norm": 0.43695900398680027, "learning_rate": 4.267360895437799e-05, "loss": 0.484, "step": 12915 }, { "epoch": 1.5065298507462686, "grad_norm": 0.42712376354348724, "learning_rate": 4.2667201557676673e-05, "loss": 0.4784, "step": 12920 }, { "epoch": 1.5071128731343284, "grad_norm": 0.4445094978595546, "learning_rate": 4.2660791905757794e-05, "loss": 0.4795, "step": 12925 }, { "epoch": 1.507695895522388, "grad_norm": 0.417555805193044, "learning_rate": 4.2654379999574425e-05, "loss": 0.4923, "step": 12930 }, { "epoch": 1.5082789179104479, "grad_norm": 0.4406546422247791, "learning_rate": 4.2647965840079945e-05, "loss": 0.5034, "step": 12935 }, { "epoch": 1.5088619402985075, "grad_norm": 0.44448449882347235, "learning_rate": 4.2641549428228087e-05, "loss": 0.4666, "step": 12940 }, { "epoch": 1.5094449626865671, "grad_norm": 0.39874278334141017, "learning_rate": 4.263513076497289e-05, "loss": 0.4714, "step": 12945 }, { "epoch": 1.5100279850746268, "grad_norm": 0.4479067494744725, "learning_rate": 4.2628709851268775e-05, "loss": 0.4956, "step": 12950 }, { "epoch": 1.5106110074626866, "grad_norm": 0.44966984070826277, "learning_rate": 4.262228668807044e-05, "loss": 0.4794, "step": 12955 }, { "epoch": 1.5111940298507462, "grad_norm": 0.4119622476774015, "learning_rate": 4.261586127633297e-05, "loss": 0.4726, "step": 12960 }, { "epoch": 1.511777052238806, "grad_norm": 0.40137599613034686, "learning_rate": 4.260943361701176e-05, "loss": 0.4591, "step": 12965 }, { "epoch": 1.5123600746268657, "grad_norm": 0.42023299547437754, "learning_rate": 4.2603003711062536e-05, "loss": 0.4836, "step": 12970 }, { "epoch": 1.5129430970149254, "grad_norm": 0.40037349675158795, "learning_rate": 4.259657155944136e-05, "loss": 0.4896, "step": 12975 }, { "epoch": 1.513526119402985, "grad_norm": 0.4145572583666219, "learning_rate": 4.259013716310465e-05, "loss": 0.4834, "step": 12980 }, { "epoch": 1.5141091417910446, "grad_norm": 0.4198272987992974, "learning_rate": 4.258370052300911e-05, "loss": 0.4792, "step": 12985 }, { "epoch": 1.5146921641791045, "grad_norm": 0.44070396389547273, "learning_rate": 4.2577261640111834e-05, "loss": 0.4668, "step": 12990 }, { "epoch": 1.5152751865671643, "grad_norm": 0.4232891194001117, "learning_rate": 4.25708205153702e-05, "loss": 0.4957, "step": 12995 }, { "epoch": 1.515858208955224, "grad_norm": 0.39904857993881904, "learning_rate": 4.256437714974196e-05, "loss": 0.5017, "step": 13000 }, { "epoch": 1.5164412313432836, "grad_norm": 0.4353750148819155, "learning_rate": 4.2557931544185166e-05, "loss": 0.5133, "step": 13005 }, { "epoch": 1.5170242537313432, "grad_norm": 0.41500382537486163, "learning_rate": 4.255148369965822e-05, "loss": 0.4685, "step": 13010 }, { "epoch": 1.5176072761194028, "grad_norm": 0.43078388455839467, "learning_rate": 4.254503361711987e-05, "loss": 0.4516, "step": 13015 }, { "epoch": 1.5181902985074627, "grad_norm": 0.5704149589769468, "learning_rate": 4.253858129752916e-05, "loss": 0.5069, "step": 13020 }, { "epoch": 1.5187733208955225, "grad_norm": 0.43024437900631485, "learning_rate": 4.2532126741845506e-05, "loss": 0.491, "step": 13025 }, { "epoch": 1.5193563432835822, "grad_norm": 0.4484607017917007, "learning_rate": 4.252566995102864e-05, "loss": 0.4886, "step": 13030 }, { "epoch": 1.5199393656716418, "grad_norm": 0.44070631539484884, "learning_rate": 4.25192109260386e-05, "loss": 0.4941, "step": 13035 }, { "epoch": 1.5205223880597014, "grad_norm": 0.4316579967713402, "learning_rate": 4.251274966783579e-05, "loss": 0.4646, "step": 13040 }, { "epoch": 1.521105410447761, "grad_norm": 0.44767588050786555, "learning_rate": 4.250628617738096e-05, "loss": 0.4648, "step": 13045 }, { "epoch": 1.521688432835821, "grad_norm": 0.40604247085739564, "learning_rate": 4.2499820455635154e-05, "loss": 0.4713, "step": 13050 }, { "epoch": 1.5222714552238807, "grad_norm": 0.45662084505548567, "learning_rate": 4.2493352503559756e-05, "loss": 0.4988, "step": 13055 }, { "epoch": 1.5228544776119404, "grad_norm": 0.4490900961893545, "learning_rate": 4.24868823221165e-05, "loss": 0.4959, "step": 13060 }, { "epoch": 1.5234375, "grad_norm": 0.4289758911518763, "learning_rate": 4.248040991226743e-05, "loss": 0.4898, "step": 13065 }, { "epoch": 1.5240205223880596, "grad_norm": 0.4416847350835621, "learning_rate": 4.2473935274974944e-05, "loss": 0.492, "step": 13070 }, { "epoch": 1.5246035447761193, "grad_norm": 0.391925996428675, "learning_rate": 4.246745841120174e-05, "loss": 0.4779, "step": 13075 }, { "epoch": 1.525186567164179, "grad_norm": 0.3826988879592668, "learning_rate": 4.246097932191088e-05, "loss": 0.4809, "step": 13080 }, { "epoch": 1.525769589552239, "grad_norm": 0.39125499815155884, "learning_rate": 4.245449800806574e-05, "loss": 0.4665, "step": 13085 }, { "epoch": 1.5263526119402986, "grad_norm": 0.42676458586294946, "learning_rate": 4.2448014470630034e-05, "loss": 0.479, "step": 13090 }, { "epoch": 1.5269356343283582, "grad_norm": 0.4118712526151781, "learning_rate": 4.244152871056779e-05, "loss": 0.5024, "step": 13095 }, { "epoch": 1.5275186567164178, "grad_norm": 0.39857692702972825, "learning_rate": 4.2435040728843376e-05, "loss": 0.4644, "step": 13100 }, { "epoch": 1.5281016791044775, "grad_norm": 0.4219119952764566, "learning_rate": 4.242855052642151e-05, "loss": 0.4909, "step": 13105 }, { "epoch": 1.5286847014925373, "grad_norm": 0.39937988060481167, "learning_rate": 4.2422058104267215e-05, "loss": 0.4833, "step": 13110 }, { "epoch": 1.5292677238805972, "grad_norm": 0.4118277229315712, "learning_rate": 4.241556346334584e-05, "loss": 0.4853, "step": 13115 }, { "epoch": 1.5298507462686568, "grad_norm": 0.4493140604958407, "learning_rate": 4.2409066604623096e-05, "loss": 0.5012, "step": 13120 }, { "epoch": 1.5304337686567164, "grad_norm": 0.44284647507189534, "learning_rate": 4.2402567529065e-05, "loss": 0.4952, "step": 13125 }, { "epoch": 1.531016791044776, "grad_norm": 0.40025342570562444, "learning_rate": 4.239606623763789e-05, "loss": 0.4714, "step": 13130 }, { "epoch": 1.5315998134328357, "grad_norm": 0.46613399151775736, "learning_rate": 4.2389562731308454e-05, "loss": 0.4748, "step": 13135 }, { "epoch": 1.5321828358208955, "grad_norm": 0.47135795761210825, "learning_rate": 4.23830570110437e-05, "loss": 0.4826, "step": 13140 }, { "epoch": 1.5327658582089554, "grad_norm": 0.42414701916509373, "learning_rate": 4.237654907781096e-05, "loss": 0.5079, "step": 13145 }, { "epoch": 1.533348880597015, "grad_norm": 0.421407479474492, "learning_rate": 4.237003893257791e-05, "loss": 0.4905, "step": 13150 }, { "epoch": 1.5339319029850746, "grad_norm": 0.4728117585754357, "learning_rate": 4.236352657631254e-05, "loss": 0.5496, "step": 13155 }, { "epoch": 1.5345149253731343, "grad_norm": 0.40357298044420986, "learning_rate": 4.2357012009983185e-05, "loss": 0.4637, "step": 13160 }, { "epoch": 1.535097947761194, "grad_norm": 0.4009201035522565, "learning_rate": 4.2350495234558494e-05, "loss": 0.4566, "step": 13165 }, { "epoch": 1.5356809701492538, "grad_norm": 0.43495102739745845, "learning_rate": 4.234397625100745e-05, "loss": 0.4985, "step": 13170 }, { "epoch": 1.5362639925373134, "grad_norm": 0.4207053021942057, "learning_rate": 4.233745506029934e-05, "loss": 0.4779, "step": 13175 }, { "epoch": 1.5368470149253732, "grad_norm": 0.46483422993637863, "learning_rate": 4.2330931663403844e-05, "loss": 0.491, "step": 13180 }, { "epoch": 1.5374300373134329, "grad_norm": 0.3954227522834553, "learning_rate": 4.232440606129089e-05, "loss": 0.472, "step": 13185 }, { "epoch": 1.5380130597014925, "grad_norm": 0.41282541519258287, "learning_rate": 4.231787825493081e-05, "loss": 0.4617, "step": 13190 }, { "epoch": 1.5385960820895521, "grad_norm": 0.4130982502495838, "learning_rate": 4.231134824529419e-05, "loss": 0.4601, "step": 13195 }, { "epoch": 1.539179104477612, "grad_norm": 0.43435287131953826, "learning_rate": 4.230481603335201e-05, "loss": 0.5106, "step": 13200 }, { "epoch": 1.5397621268656716, "grad_norm": 0.38619999090987356, "learning_rate": 4.229828162007553e-05, "loss": 0.474, "step": 13205 }, { "epoch": 1.5403451492537314, "grad_norm": 0.426167049344755, "learning_rate": 4.229174500643634e-05, "loss": 0.4757, "step": 13210 }, { "epoch": 1.540928171641791, "grad_norm": 0.40446093120877646, "learning_rate": 4.228520619340641e-05, "loss": 0.4907, "step": 13215 }, { "epoch": 1.5415111940298507, "grad_norm": 0.3992756880668716, "learning_rate": 4.227866518195797e-05, "loss": 0.467, "step": 13220 }, { "epoch": 1.5420942164179103, "grad_norm": 0.43829585582456626, "learning_rate": 4.227212197306362e-05, "loss": 0.5113, "step": 13225 }, { "epoch": 1.5426772388059702, "grad_norm": 0.4573036781634611, "learning_rate": 4.226557656769626e-05, "loss": 0.4979, "step": 13230 }, { "epoch": 1.5432602611940298, "grad_norm": 0.47132710634129954, "learning_rate": 4.225902896682914e-05, "loss": 0.4863, "step": 13235 }, { "epoch": 1.5438432835820897, "grad_norm": 0.4285096899328013, "learning_rate": 4.225247917143582e-05, "loss": 0.5048, "step": 13240 }, { "epoch": 1.5444263059701493, "grad_norm": 0.4601748805852775, "learning_rate": 4.2245927182490194e-05, "loss": 0.4989, "step": 13245 }, { "epoch": 1.545009328358209, "grad_norm": 0.46120571320041653, "learning_rate": 4.223937300096648e-05, "loss": 0.492, "step": 13250 }, { "epoch": 1.5455923507462686, "grad_norm": 0.40037712446225193, "learning_rate": 4.223281662783922e-05, "loss": 0.4699, "step": 13255 }, { "epoch": 1.5461753731343284, "grad_norm": 0.43357360655332144, "learning_rate": 4.22262580640833e-05, "loss": 0.5139, "step": 13260 }, { "epoch": 1.546758395522388, "grad_norm": 0.4578799403414573, "learning_rate": 4.221969731067388e-05, "loss": 0.4858, "step": 13265 }, { "epoch": 1.5473414179104479, "grad_norm": 0.4294249725349592, "learning_rate": 4.221313436858651e-05, "loss": 0.4817, "step": 13270 }, { "epoch": 1.5479244402985075, "grad_norm": 0.4127021933030226, "learning_rate": 4.2206569238797025e-05, "loss": 0.4616, "step": 13275 }, { "epoch": 1.5485074626865671, "grad_norm": 0.40377848223242957, "learning_rate": 4.220000192228161e-05, "loss": 0.5366, "step": 13280 }, { "epoch": 1.5490904850746268, "grad_norm": 0.4267006224788294, "learning_rate": 4.2193432420016746e-05, "loss": 0.5087, "step": 13285 }, { "epoch": 1.5496735074626866, "grad_norm": 0.4346604585728533, "learning_rate": 4.218686073297926e-05, "loss": 0.511, "step": 13290 }, { "epoch": 1.5502565298507462, "grad_norm": 0.41716535906131436, "learning_rate": 4.218028686214631e-05, "loss": 0.4603, "step": 13295 }, { "epoch": 1.550839552238806, "grad_norm": 0.4126400492390039, "learning_rate": 4.217371080849535e-05, "loss": 0.4532, "step": 13300 }, { "epoch": 1.5514225746268657, "grad_norm": 0.408431332457747, "learning_rate": 4.216713257300418e-05, "loss": 0.4623, "step": 13305 }, { "epoch": 1.5520055970149254, "grad_norm": 0.4087969742335221, "learning_rate": 4.216055215665093e-05, "loss": 0.4751, "step": 13310 }, { "epoch": 1.552588619402985, "grad_norm": 0.44399390016750945, "learning_rate": 4.215396956041404e-05, "loss": 0.4946, "step": 13315 }, { "epoch": 1.5531716417910446, "grad_norm": 0.40275491386457934, "learning_rate": 4.2147384785272284e-05, "loss": 0.4548, "step": 13320 }, { "epoch": 1.5537546641791045, "grad_norm": 0.4225983009878632, "learning_rate": 4.214079783220474e-05, "loss": 0.4889, "step": 13325 }, { "epoch": 1.5543376865671643, "grad_norm": 0.4424375061579874, "learning_rate": 4.213420870219084e-05, "loss": 0.522, "step": 13330 }, { "epoch": 1.554920708955224, "grad_norm": 0.41521237834904035, "learning_rate": 4.212761739621032e-05, "loss": 0.4647, "step": 13335 }, { "epoch": 1.5555037313432836, "grad_norm": 0.41541356196497864, "learning_rate": 4.212102391524324e-05, "loss": 0.4896, "step": 13340 }, { "epoch": 1.5560867537313432, "grad_norm": 0.43157699479722045, "learning_rate": 4.211442826027e-05, "loss": 0.4708, "step": 13345 }, { "epoch": 1.5566697761194028, "grad_norm": 0.41054688958194885, "learning_rate": 4.210783043227129e-05, "loss": 0.4784, "step": 13350 }, { "epoch": 1.5572527985074627, "grad_norm": 0.5577820332160767, "learning_rate": 4.210123043222816e-05, "loss": 0.5368, "step": 13355 }, { "epoch": 1.5578358208955225, "grad_norm": 0.42009766884515093, "learning_rate": 4.209462826112195e-05, "loss": 0.4593, "step": 13360 }, { "epoch": 1.5584188432835822, "grad_norm": 0.40169251834467307, "learning_rate": 4.2088023919934366e-05, "loss": 0.4704, "step": 13365 }, { "epoch": 1.5590018656716418, "grad_norm": 0.44031000256184927, "learning_rate": 4.2081417409647386e-05, "loss": 0.5025, "step": 13370 }, { "epoch": 1.5595848880597014, "grad_norm": 0.3983519697254828, "learning_rate": 4.207480873124335e-05, "loss": 0.5129, "step": 13375 }, { "epoch": 1.560167910447761, "grad_norm": 0.4168826424645943, "learning_rate": 4.2068197885704904e-05, "loss": 0.4901, "step": 13380 }, { "epoch": 1.560750932835821, "grad_norm": 0.4247715644740658, "learning_rate": 4.2061584874015006e-05, "loss": 0.4944, "step": 13385 }, { "epoch": 1.5613339552238807, "grad_norm": 0.4488066024401581, "learning_rate": 4.205496969715696e-05, "loss": 0.5082, "step": 13390 }, { "epoch": 1.5619169776119404, "grad_norm": 0.6085524137681178, "learning_rate": 4.2048352356114366e-05, "loss": 0.4868, "step": 13395 }, { "epoch": 1.5625, "grad_norm": 0.44574728325793944, "learning_rate": 4.204173285187117e-05, "loss": 0.4842, "step": 13400 }, { "epoch": 1.5630830223880596, "grad_norm": 0.4283934712785062, "learning_rate": 4.203511118541163e-05, "loss": 0.5008, "step": 13405 }, { "epoch": 1.5636660447761193, "grad_norm": 0.46648021154345765, "learning_rate": 4.202848735772031e-05, "loss": 0.4999, "step": 13410 }, { "epoch": 1.564249067164179, "grad_norm": 0.38962086511204524, "learning_rate": 4.202186136978213e-05, "loss": 0.4569, "step": 13415 }, { "epoch": 1.564832089552239, "grad_norm": 0.4053152724284662, "learning_rate": 4.201523322258231e-05, "loss": 0.4452, "step": 13420 }, { "epoch": 1.5654151119402986, "grad_norm": 0.45365460844311817, "learning_rate": 4.2008602917106365e-05, "loss": 0.5115, "step": 13425 }, { "epoch": 1.5659981343283582, "grad_norm": 0.41269832929867595, "learning_rate": 4.2001970454340185e-05, "loss": 0.4741, "step": 13430 }, { "epoch": 1.5665811567164178, "grad_norm": 0.39565240262036605, "learning_rate": 4.199533583526994e-05, "loss": 0.4491, "step": 13435 }, { "epoch": 1.5671641791044775, "grad_norm": 0.4410938378323605, "learning_rate": 4.1988699060882144e-05, "loss": 0.4826, "step": 13440 }, { "epoch": 1.5677472014925373, "grad_norm": 0.39842097053892017, "learning_rate": 4.198206013216361e-05, "loss": 0.507, "step": 13445 }, { "epoch": 1.5683302238805972, "grad_norm": 0.45863932701511173, "learning_rate": 4.197541905010149e-05, "loss": 0.4471, "step": 13450 }, { "epoch": 1.5689132462686568, "grad_norm": 0.4311043355544529, "learning_rate": 4.196877581568326e-05, "loss": 0.5115, "step": 13455 }, { "epoch": 1.5694962686567164, "grad_norm": 0.43044704887228286, "learning_rate": 4.196213042989668e-05, "loss": 0.4931, "step": 13460 }, { "epoch": 1.570079291044776, "grad_norm": 0.40671533407476856, "learning_rate": 4.195548289372988e-05, "loss": 0.5098, "step": 13465 }, { "epoch": 1.5706623134328357, "grad_norm": 0.43825156815662814, "learning_rate": 4.194883320817127e-05, "loss": 0.4953, "step": 13470 }, { "epoch": 1.5712453358208955, "grad_norm": 0.4444194837704474, "learning_rate": 4.1942181374209596e-05, "loss": 0.4999, "step": 13475 }, { "epoch": 1.5718283582089554, "grad_norm": 0.4009905992752678, "learning_rate": 4.193552739283393e-05, "loss": 0.4662, "step": 13480 }, { "epoch": 1.572411380597015, "grad_norm": 0.4316134200753667, "learning_rate": 4.192887126503364e-05, "loss": 0.5024, "step": 13485 }, { "epoch": 1.5729944029850746, "grad_norm": 0.40927502514366515, "learning_rate": 4.192221299179845e-05, "loss": 0.4842, "step": 13490 }, { "epoch": 1.5735774253731343, "grad_norm": 0.40540407523137834, "learning_rate": 4.191555257411837e-05, "loss": 0.4879, "step": 13495 }, { "epoch": 1.574160447761194, "grad_norm": 0.403099969585178, "learning_rate": 4.190889001298373e-05, "loss": 0.4629, "step": 13500 }, { "epoch": 1.5747434701492538, "grad_norm": 0.434966788230841, "learning_rate": 4.190222530938521e-05, "loss": 0.4871, "step": 13505 }, { "epoch": 1.5753264925373134, "grad_norm": 0.4122870548217484, "learning_rate": 4.189555846431377e-05, "loss": 0.4661, "step": 13510 }, { "epoch": 1.5759095149253732, "grad_norm": 0.5319251320420735, "learning_rate": 4.188888947876071e-05, "loss": 0.5356, "step": 13515 }, { "epoch": 1.5764925373134329, "grad_norm": 0.42569822190937806, "learning_rate": 4.188221835371766e-05, "loss": 0.4437, "step": 13520 }, { "epoch": 1.5770755597014925, "grad_norm": 0.45460568678761587, "learning_rate": 4.187554509017653e-05, "loss": 0.4794, "step": 13525 }, { "epoch": 1.5776585820895521, "grad_norm": 0.4077073604912347, "learning_rate": 4.1868869689129584e-05, "loss": 0.4625, "step": 13530 }, { "epoch": 1.578241604477612, "grad_norm": 0.4283745446098321, "learning_rate": 4.186219215156938e-05, "loss": 0.486, "step": 13535 }, { "epoch": 1.5788246268656716, "grad_norm": 0.4748030475902487, "learning_rate": 4.1855512478488816e-05, "loss": 0.4668, "step": 13540 }, { "epoch": 1.5794076492537314, "grad_norm": 0.4240629031299508, "learning_rate": 4.184883067088108e-05, "loss": 0.4829, "step": 13545 }, { "epoch": 1.579990671641791, "grad_norm": 0.40511524150360195, "learning_rate": 4.184214672973971e-05, "loss": 0.5097, "step": 13550 }, { "epoch": 1.5805736940298507, "grad_norm": 0.41235092373023413, "learning_rate": 4.183546065605855e-05, "loss": 0.5137, "step": 13555 }, { "epoch": 1.5811567164179103, "grad_norm": 0.41714248354462624, "learning_rate": 4.182877245083172e-05, "loss": 0.4558, "step": 13560 }, { "epoch": 1.5817397388059702, "grad_norm": 0.403392553276603, "learning_rate": 4.1822082115053717e-05, "loss": 0.4481, "step": 13565 }, { "epoch": 1.5823227611940298, "grad_norm": 0.4508819632643197, "learning_rate": 4.181538964971933e-05, "loss": 0.4628, "step": 13570 }, { "epoch": 1.5829057835820897, "grad_norm": 0.4525378549777149, "learning_rate": 4.180869505582366e-05, "loss": 0.4909, "step": 13575 }, { "epoch": 1.5834888059701493, "grad_norm": 0.44597128262577185, "learning_rate": 4.180199833436213e-05, "loss": 0.4769, "step": 13580 }, { "epoch": 1.584071828358209, "grad_norm": 0.4459012583943023, "learning_rate": 4.179529948633047e-05, "loss": 0.5133, "step": 13585 }, { "epoch": 1.5846548507462686, "grad_norm": 0.4679210869513856, "learning_rate": 4.178859851272475e-05, "loss": 0.5075, "step": 13590 }, { "epoch": 1.5852378731343284, "grad_norm": 0.4317612218037428, "learning_rate": 4.1781895414541326e-05, "loss": 0.5058, "step": 13595 }, { "epoch": 1.585820895522388, "grad_norm": 0.455624049071983, "learning_rate": 4.1775190192776905e-05, "loss": 0.4975, "step": 13600 }, { "epoch": 1.5864039179104479, "grad_norm": 0.431954202780353, "learning_rate": 4.176848284842847e-05, "loss": 0.5046, "step": 13605 }, { "epoch": 1.5869869402985075, "grad_norm": 0.39093059851720136, "learning_rate": 4.176177338249334e-05, "loss": 0.4473, "step": 13610 }, { "epoch": 1.5875699626865671, "grad_norm": 0.4572824921584941, "learning_rate": 4.1755061795969155e-05, "loss": 0.5043, "step": 13615 }, { "epoch": 1.5881529850746268, "grad_norm": 0.39997096727612613, "learning_rate": 4.1748348089853864e-05, "loss": 0.473, "step": 13620 }, { "epoch": 1.5887360074626866, "grad_norm": 0.4122896828831148, "learning_rate": 4.1741632265145715e-05, "loss": 0.4832, "step": 13625 }, { "epoch": 1.5893190298507462, "grad_norm": 0.44332962502958334, "learning_rate": 4.173491432284332e-05, "loss": 0.5335, "step": 13630 }, { "epoch": 1.589902052238806, "grad_norm": 0.4460466797414813, "learning_rate": 4.172819426394554e-05, "loss": 0.4858, "step": 13635 }, { "epoch": 1.5904850746268657, "grad_norm": 0.43173460502304956, "learning_rate": 4.172147208945159e-05, "loss": 0.5037, "step": 13640 }, { "epoch": 1.5910680970149254, "grad_norm": 0.4068645141608874, "learning_rate": 4.171474780036101e-05, "loss": 0.4539, "step": 13645 }, { "epoch": 1.591651119402985, "grad_norm": 0.40161314212077937, "learning_rate": 4.170802139767362e-05, "loss": 0.5051, "step": 13650 }, { "epoch": 1.5922341417910446, "grad_norm": 0.4212810978511474, "learning_rate": 4.170129288238958e-05, "loss": 0.4782, "step": 13655 }, { "epoch": 1.5928171641791045, "grad_norm": 0.4672279762345317, "learning_rate": 4.1694562255509354e-05, "loss": 0.4761, "step": 13660 }, { "epoch": 1.5934001865671643, "grad_norm": 0.4096412974311588, "learning_rate": 4.1687829518033726e-05, "loss": 0.4878, "step": 13665 }, { "epoch": 1.593983208955224, "grad_norm": 0.43522674885978196, "learning_rate": 4.168109467096378e-05, "loss": 0.4661, "step": 13670 }, { "epoch": 1.5945662313432836, "grad_norm": 0.4211645815058826, "learning_rate": 4.1674357715300924e-05, "loss": 0.4589, "step": 13675 }, { "epoch": 1.5951492537313432, "grad_norm": 0.466936568795633, "learning_rate": 4.1667618652046894e-05, "loss": 0.4982, "step": 13680 }, { "epoch": 1.5957322761194028, "grad_norm": 0.4488551762263432, "learning_rate": 4.1660877482203704e-05, "loss": 0.4972, "step": 13685 }, { "epoch": 1.5963152985074627, "grad_norm": 0.3994647058487909, "learning_rate": 4.165413420677372e-05, "loss": 0.4738, "step": 13690 }, { "epoch": 1.5968983208955225, "grad_norm": 0.4688525595679455, "learning_rate": 4.164738882675958e-05, "loss": 0.5398, "step": 13695 }, { "epoch": 1.5974813432835822, "grad_norm": 0.45941567626509505, "learning_rate": 4.164064134316428e-05, "loss": 0.5057, "step": 13700 }, { "epoch": 1.5980643656716418, "grad_norm": 0.47364171736560806, "learning_rate": 4.163389175699109e-05, "loss": 0.4975, "step": 13705 }, { "epoch": 1.5986473880597014, "grad_norm": 0.439734025007231, "learning_rate": 4.162714006924362e-05, "loss": 0.5169, "step": 13710 }, { "epoch": 1.599230410447761, "grad_norm": 0.4587953563310536, "learning_rate": 4.1620386280925776e-05, "loss": 0.4744, "step": 13715 }, { "epoch": 1.599813432835821, "grad_norm": 0.5455847649812249, "learning_rate": 4.161363039304177e-05, "loss": 0.5009, "step": 13720 }, { "epoch": 1.6003964552238807, "grad_norm": 0.4583565860258099, "learning_rate": 4.160687240659616e-05, "loss": 0.4638, "step": 13725 }, { "epoch": 1.6009794776119404, "grad_norm": 0.41780067818965255, "learning_rate": 4.160011232259378e-05, "loss": 0.4963, "step": 13730 }, { "epoch": 1.6015625, "grad_norm": 0.36360567577215513, "learning_rate": 4.1593350142039806e-05, "loss": 0.4521, "step": 13735 }, { "epoch": 1.6021455223880596, "grad_norm": 0.5549813184778564, "learning_rate": 4.158658586593969e-05, "loss": 0.4994, "step": 13740 }, { "epoch": 1.6027285447761193, "grad_norm": 0.3982818850740962, "learning_rate": 4.157981949529922e-05, "loss": 0.4716, "step": 13745 }, { "epoch": 1.603311567164179, "grad_norm": 0.428213403831558, "learning_rate": 4.1573051031124486e-05, "loss": 0.4404, "step": 13750 }, { "epoch": 1.603894589552239, "grad_norm": 0.43629496576454474, "learning_rate": 4.156628047442191e-05, "loss": 0.4643, "step": 13755 }, { "epoch": 1.6044776119402986, "grad_norm": 0.4193637142109093, "learning_rate": 4.155950782619819e-05, "loss": 0.5045, "step": 13760 }, { "epoch": 1.6050606343283582, "grad_norm": 0.41187735553233135, "learning_rate": 4.155273308746037e-05, "loss": 0.4731, "step": 13765 }, { "epoch": 1.6056436567164178, "grad_norm": 0.4153061528694636, "learning_rate": 4.1545956259215776e-05, "loss": 0.4604, "step": 13770 }, { "epoch": 1.6062266791044775, "grad_norm": 0.4265804750716642, "learning_rate": 4.153917734247208e-05, "loss": 0.4966, "step": 13775 }, { "epoch": 1.6068097014925373, "grad_norm": 0.4247938848939225, "learning_rate": 4.153239633823721e-05, "loss": 0.4826, "step": 13780 }, { "epoch": 1.6073927238805972, "grad_norm": 0.4080890456726183, "learning_rate": 4.152561324751945e-05, "loss": 0.4953, "step": 13785 }, { "epoch": 1.6079757462686568, "grad_norm": 0.46540044999851854, "learning_rate": 4.151882807132739e-05, "loss": 0.5214, "step": 13790 }, { "epoch": 1.6085587686567164, "grad_norm": 0.4374526811163078, "learning_rate": 4.1512040810669905e-05, "loss": 0.4926, "step": 13795 }, { "epoch": 1.609141791044776, "grad_norm": 0.4312462430872498, "learning_rate": 4.1505251466556206e-05, "loss": 0.4902, "step": 13800 }, { "epoch": 1.6097248134328357, "grad_norm": 0.45494365001895526, "learning_rate": 4.1498460039995805e-05, "loss": 0.4832, "step": 13805 }, { "epoch": 1.6103078358208955, "grad_norm": 0.44416438041500533, "learning_rate": 4.149166653199852e-05, "loss": 0.4958, "step": 13810 }, { "epoch": 1.6108908582089554, "grad_norm": 0.4476916723216493, "learning_rate": 4.148487094357447e-05, "loss": 0.4874, "step": 13815 }, { "epoch": 1.611473880597015, "grad_norm": 0.44998270039657035, "learning_rate": 4.1478073275734105e-05, "loss": 0.472, "step": 13820 }, { "epoch": 1.6120569029850746, "grad_norm": 0.414424612331312, "learning_rate": 4.147127352948817e-05, "loss": 0.4639, "step": 13825 }, { "epoch": 1.6126399253731343, "grad_norm": 0.4235139396928849, "learning_rate": 4.146447170584772e-05, "loss": 0.4438, "step": 13830 }, { "epoch": 1.613222947761194, "grad_norm": 0.4558452516562571, "learning_rate": 4.145766780582413e-05, "loss": 0.4894, "step": 13835 }, { "epoch": 1.6138059701492538, "grad_norm": 0.45802437895486725, "learning_rate": 4.145086183042907e-05, "loss": 0.4825, "step": 13840 }, { "epoch": 1.6143889925373134, "grad_norm": 0.4051697239513389, "learning_rate": 4.1444053780674506e-05, "loss": 0.4763, "step": 13845 }, { "epoch": 1.6149720149253732, "grad_norm": 0.3891587238042614, "learning_rate": 4.143724365757275e-05, "loss": 0.4492, "step": 13850 }, { "epoch": 1.6155550373134329, "grad_norm": 0.42980012673803525, "learning_rate": 4.143043146213641e-05, "loss": 0.4734, "step": 13855 }, { "epoch": 1.6161380597014925, "grad_norm": 0.3918017871738858, "learning_rate": 4.142361719537838e-05, "loss": 0.478, "step": 13860 }, { "epoch": 1.6167210820895521, "grad_norm": 0.38440071087680916, "learning_rate": 4.1416800858311875e-05, "loss": 0.4541, "step": 13865 }, { "epoch": 1.617304104477612, "grad_norm": 0.4022474722165739, "learning_rate": 4.140998245195042e-05, "loss": 0.4635, "step": 13870 }, { "epoch": 1.6178871268656716, "grad_norm": 0.394886036018443, "learning_rate": 4.1403161977307845e-05, "loss": 0.4969, "step": 13875 }, { "epoch": 1.6184701492537314, "grad_norm": 0.4155495143145242, "learning_rate": 4.13963394353983e-05, "loss": 0.5149, "step": 13880 }, { "epoch": 1.619053171641791, "grad_norm": 0.425780163466277, "learning_rate": 4.1389514827236214e-05, "loss": 0.5023, "step": 13885 }, { "epoch": 1.6196361940298507, "grad_norm": 0.3970847797166461, "learning_rate": 4.138268815383636e-05, "loss": 0.4614, "step": 13890 }, { "epoch": 1.6202192164179103, "grad_norm": 0.4841170861966004, "learning_rate": 4.137585941621379e-05, "loss": 0.5141, "step": 13895 }, { "epoch": 1.6208022388059702, "grad_norm": 0.4321492633657125, "learning_rate": 4.136902861538387e-05, "loss": 0.4941, "step": 13900 }, { "epoch": 1.6213852611940298, "grad_norm": 0.4193613731390034, "learning_rate": 4.136219575236228e-05, "loss": 0.4832, "step": 13905 }, { "epoch": 1.6219682835820897, "grad_norm": 0.478542902077223, "learning_rate": 4.135536082816499e-05, "loss": 0.4919, "step": 13910 }, { "epoch": 1.6225513059701493, "grad_norm": 0.3910423437514934, "learning_rate": 4.13485238438083e-05, "loss": 0.4782, "step": 13915 }, { "epoch": 1.623134328358209, "grad_norm": 0.39792175646311384, "learning_rate": 4.13416848003088e-05, "loss": 0.4701, "step": 13920 }, { "epoch": 1.6237173507462686, "grad_norm": 0.42639038291147685, "learning_rate": 4.133484369868339e-05, "loss": 0.4865, "step": 13925 }, { "epoch": 1.6243003731343284, "grad_norm": 0.4834550137525021, "learning_rate": 4.132800053994927e-05, "loss": 0.5021, "step": 13930 }, { "epoch": 1.624883395522388, "grad_norm": 0.40030111596840384, "learning_rate": 4.132115532512397e-05, "loss": 0.4933, "step": 13935 }, { "epoch": 1.6254664179104479, "grad_norm": 0.40158229111374893, "learning_rate": 4.1314308055225295e-05, "loss": 0.5126, "step": 13940 }, { "epoch": 1.6260494402985075, "grad_norm": 0.38529150774322646, "learning_rate": 4.130745873127136e-05, "loss": 0.4645, "step": 13945 }, { "epoch": 1.6266324626865671, "grad_norm": 0.39397074973711016, "learning_rate": 4.1300607354280605e-05, "loss": 0.4933, "step": 13950 }, { "epoch": 1.6272154850746268, "grad_norm": 0.4310925387629514, "learning_rate": 4.129375392527177e-05, "loss": 0.4634, "step": 13955 }, { "epoch": 1.6277985074626866, "grad_norm": 0.4049957818820292, "learning_rate": 4.128689844526388e-05, "loss": 0.4614, "step": 13960 }, { "epoch": 1.6283815298507462, "grad_norm": 0.4206548881995724, "learning_rate": 4.128004091527629e-05, "loss": 0.5058, "step": 13965 }, { "epoch": 1.628964552238806, "grad_norm": 0.42818619835778277, "learning_rate": 4.1273181336328646e-05, "loss": 0.4768, "step": 13970 }, { "epoch": 1.6295475746268657, "grad_norm": 0.49216574144431363, "learning_rate": 4.1266319709440895e-05, "loss": 0.4693, "step": 13975 }, { "epoch": 1.6301305970149254, "grad_norm": 0.3772910316702474, "learning_rate": 4.125945603563331e-05, "loss": 0.4832, "step": 13980 }, { "epoch": 1.630713619402985, "grad_norm": 0.3775254497117327, "learning_rate": 4.1252590315926435e-05, "loss": 0.4835, "step": 13985 }, { "epoch": 1.6312966417910446, "grad_norm": 0.4604685839675388, "learning_rate": 4.124572255134115e-05, "loss": 0.4558, "step": 13990 }, { "epoch": 1.6318796641791045, "grad_norm": 0.3961489769456557, "learning_rate": 4.123885274289862e-05, "loss": 0.4543, "step": 13995 }, { "epoch": 1.6324626865671643, "grad_norm": 0.39566019294349775, "learning_rate": 4.123198089162033e-05, "loss": 0.4669, "step": 14000 }, { "epoch": 1.633045708955224, "grad_norm": 0.4577549701799179, "learning_rate": 4.122510699852803e-05, "loss": 0.4815, "step": 14005 }, { "epoch": 1.6336287313432836, "grad_norm": 0.4296076529412509, "learning_rate": 4.121823106464384e-05, "loss": 0.5045, "step": 14010 }, { "epoch": 1.6342117537313432, "grad_norm": 0.412674549575472, "learning_rate": 4.121135309099013e-05, "loss": 0.4939, "step": 14015 }, { "epoch": 1.6347947761194028, "grad_norm": 0.45241629533800665, "learning_rate": 4.1204473078589575e-05, "loss": 0.4878, "step": 14020 }, { "epoch": 1.6353777985074627, "grad_norm": 0.5013913112036309, "learning_rate": 4.119759102846518e-05, "loss": 0.4996, "step": 14025 }, { "epoch": 1.6359608208955225, "grad_norm": 0.3951691037062018, "learning_rate": 4.119070694164024e-05, "loss": 0.4782, "step": 14030 }, { "epoch": 1.6365438432835822, "grad_norm": 0.4589785308608991, "learning_rate": 4.1183820819138355e-05, "loss": 0.4651, "step": 14035 }, { "epoch": 1.6371268656716418, "grad_norm": 0.45276924257140067, "learning_rate": 4.117693266198342e-05, "loss": 0.4734, "step": 14040 }, { "epoch": 1.6377098880597014, "grad_norm": 0.44295878617434087, "learning_rate": 4.117004247119964e-05, "loss": 0.4831, "step": 14045 }, { "epoch": 1.638292910447761, "grad_norm": 0.410851387489969, "learning_rate": 4.116315024781152e-05, "loss": 0.4834, "step": 14050 }, { "epoch": 1.638875932835821, "grad_norm": 0.4164900346469144, "learning_rate": 4.115625599284386e-05, "loss": 0.489, "step": 14055 }, { "epoch": 1.6394589552238807, "grad_norm": 0.4459067435418286, "learning_rate": 4.114935970732178e-05, "loss": 0.5067, "step": 14060 }, { "epoch": 1.6400419776119404, "grad_norm": 0.4291682494710737, "learning_rate": 4.114246139227069e-05, "loss": 0.5263, "step": 14065 }, { "epoch": 1.640625, "grad_norm": 0.4216746640472223, "learning_rate": 4.113556104871631e-05, "loss": 0.4964, "step": 14070 }, { "epoch": 1.6412080223880596, "grad_norm": 0.4211314766595737, "learning_rate": 4.112865867768464e-05, "loss": 0.4808, "step": 14075 }, { "epoch": 1.6417910447761193, "grad_norm": 0.4053437292411652, "learning_rate": 4.112175428020199e-05, "loss": 0.4461, "step": 14080 }, { "epoch": 1.642374067164179, "grad_norm": 0.396670578607788, "learning_rate": 4.1114847857295006e-05, "loss": 0.4605, "step": 14085 }, { "epoch": 1.642957089552239, "grad_norm": 0.3995791356492982, "learning_rate": 4.110793940999059e-05, "loss": 0.4711, "step": 14090 }, { "epoch": 1.6435401119402986, "grad_norm": 0.3954757796377986, "learning_rate": 4.110102893931597e-05, "loss": 0.4994, "step": 14095 }, { "epoch": 1.6441231343283582, "grad_norm": 0.435294454301607, "learning_rate": 4.1094116446298645e-05, "loss": 0.4766, "step": 14100 }, { "epoch": 1.6447061567164178, "grad_norm": 0.44310065079643096, "learning_rate": 4.1087201931966463e-05, "loss": 0.5113, "step": 14105 }, { "epoch": 1.6452891791044775, "grad_norm": 0.3607871264105027, "learning_rate": 4.108028539734753e-05, "loss": 0.4855, "step": 14110 }, { "epoch": 1.6458722014925373, "grad_norm": 0.42095392810100446, "learning_rate": 4.1073366843470285e-05, "loss": 0.4639, "step": 14115 }, { "epoch": 1.6464552238805972, "grad_norm": 0.3766418494165451, "learning_rate": 4.1066446271363426e-05, "loss": 0.4926, "step": 14120 }, { "epoch": 1.6470382462686568, "grad_norm": 0.4443662116425456, "learning_rate": 4.1059523682056e-05, "loss": 0.4919, "step": 14125 }, { "epoch": 1.6476212686567164, "grad_norm": 0.44464467585543765, "learning_rate": 4.1052599076577306e-05, "loss": 0.5104, "step": 14130 }, { "epoch": 1.648204291044776, "grad_norm": 0.4047224424351832, "learning_rate": 4.104567245595699e-05, "loss": 0.4564, "step": 14135 }, { "epoch": 1.6487873134328357, "grad_norm": 0.42026335706897944, "learning_rate": 4.103874382122496e-05, "loss": 0.4832, "step": 14140 }, { "epoch": 1.6493703358208955, "grad_norm": 0.4107997789237667, "learning_rate": 4.103181317341144e-05, "loss": 0.467, "step": 14145 }, { "epoch": 1.6499533582089554, "grad_norm": 0.39889095294561816, "learning_rate": 4.1024880513546955e-05, "loss": 0.4714, "step": 14150 }, { "epoch": 1.650536380597015, "grad_norm": 0.46962418152356855, "learning_rate": 4.101794584266232e-05, "loss": 0.4707, "step": 14155 }, { "epoch": 1.6511194029850746, "grad_norm": 0.45018302237534263, "learning_rate": 4.1011009161788655e-05, "loss": 0.4952, "step": 14160 }, { "epoch": 1.6517024253731343, "grad_norm": 0.3961606609630567, "learning_rate": 4.100407047195738e-05, "loss": 0.4712, "step": 14165 }, { "epoch": 1.652285447761194, "grad_norm": 0.42032988863236986, "learning_rate": 4.099712977420021e-05, "loss": 0.4548, "step": 14170 }, { "epoch": 1.6528684701492538, "grad_norm": 0.3703946722633511, "learning_rate": 4.099018706954916e-05, "loss": 0.4416, "step": 14175 }, { "epoch": 1.6534514925373134, "grad_norm": 0.4098114340694798, "learning_rate": 4.098324235903655e-05, "loss": 0.4745, "step": 14180 }, { "epoch": 1.6540345149253732, "grad_norm": 0.40705381450134037, "learning_rate": 4.0976295643694986e-05, "loss": 0.5113, "step": 14185 }, { "epoch": 1.6546175373134329, "grad_norm": 0.43040331236645074, "learning_rate": 4.0969346924557374e-05, "loss": 0.5035, "step": 14190 }, { "epoch": 1.6552005597014925, "grad_norm": 0.4405551738954265, "learning_rate": 4.096239620265693e-05, "loss": 0.4878, "step": 14195 }, { "epoch": 1.6557835820895521, "grad_norm": 0.4967926559533798, "learning_rate": 4.095544347902715e-05, "loss": 0.5059, "step": 14200 }, { "epoch": 1.656366604477612, "grad_norm": 0.4095083869310005, "learning_rate": 4.0948488754701846e-05, "loss": 0.4885, "step": 14205 }, { "epoch": 1.6569496268656716, "grad_norm": 0.47061671668377314, "learning_rate": 4.094153203071512e-05, "loss": 0.5638, "step": 14210 }, { "epoch": 1.6575326492537314, "grad_norm": 0.4081933179265426, "learning_rate": 4.0934573308101376e-05, "loss": 0.4959, "step": 14215 }, { "epoch": 1.658115671641791, "grad_norm": 0.3996955520067459, "learning_rate": 4.092761258789529e-05, "loss": 0.4474, "step": 14220 }, { "epoch": 1.6586986940298507, "grad_norm": 0.4314479874396754, "learning_rate": 4.092064987113186e-05, "loss": 0.4718, "step": 14225 }, { "epoch": 1.6592817164179103, "grad_norm": 0.4438299110909517, "learning_rate": 4.091368515884638e-05, "loss": 0.5341, "step": 14230 }, { "epoch": 1.6598647388059702, "grad_norm": 0.41651006798955637, "learning_rate": 4.0906718452074435e-05, "loss": 0.4727, "step": 14235 }, { "epoch": 1.6604477611940298, "grad_norm": 0.4331091906462682, "learning_rate": 4.089974975185192e-05, "loss": 0.4836, "step": 14240 }, { "epoch": 1.6610307835820897, "grad_norm": 0.4021683622488243, "learning_rate": 4.0892779059214994e-05, "loss": 0.497, "step": 14245 }, { "epoch": 1.6616138059701493, "grad_norm": 0.4117795790481671, "learning_rate": 4.088580637520015e-05, "loss": 0.4425, "step": 14250 }, { "epoch": 1.662196828358209, "grad_norm": 0.3895811651306167, "learning_rate": 4.087883170084414e-05, "loss": 0.4824, "step": 14255 }, { "epoch": 1.6627798507462686, "grad_norm": 0.37307335490523996, "learning_rate": 4.087185503718404e-05, "loss": 0.4462, "step": 14260 }, { "epoch": 1.6633628731343284, "grad_norm": 0.48478137335603105, "learning_rate": 4.0864876385257225e-05, "loss": 0.4726, "step": 14265 }, { "epoch": 1.663945895522388, "grad_norm": 0.4395735769638866, "learning_rate": 4.0857895746101335e-05, "loss": 0.5045, "step": 14270 }, { "epoch": 1.6645289179104479, "grad_norm": 0.4395663397458483, "learning_rate": 4.085091312075434e-05, "loss": 0.4661, "step": 14275 }, { "epoch": 1.6651119402985075, "grad_norm": 0.3518701273075788, "learning_rate": 4.084392851025447e-05, "loss": 0.432, "step": 14280 }, { "epoch": 1.6656949626865671, "grad_norm": 0.3887442406304548, "learning_rate": 4.083694191564028e-05, "loss": 0.4702, "step": 14285 }, { "epoch": 1.6662779850746268, "grad_norm": 0.4184054471272332, "learning_rate": 4.082995333795063e-05, "loss": 0.4889, "step": 14290 }, { "epoch": 1.6668610074626866, "grad_norm": 0.4173370886409284, "learning_rate": 4.0822962778224613e-05, "loss": 0.4917, "step": 14295 }, { "epoch": 1.6674440298507462, "grad_norm": 0.4078412810184441, "learning_rate": 4.081597023750169e-05, "loss": 0.4649, "step": 14300 }, { "epoch": 1.668027052238806, "grad_norm": 0.42994252550274004, "learning_rate": 4.0808975716821574e-05, "loss": 0.4807, "step": 14305 }, { "epoch": 1.6686100746268657, "grad_norm": 0.39193144870866914, "learning_rate": 4.0801979217224285e-05, "loss": 0.4667, "step": 14310 }, { "epoch": 1.6691930970149254, "grad_norm": 0.438265726796618, "learning_rate": 4.079498073975013e-05, "loss": 0.4994, "step": 14315 }, { "epoch": 1.669776119402985, "grad_norm": 0.3858463575868289, "learning_rate": 4.078798028543974e-05, "loss": 0.4716, "step": 14320 }, { "epoch": 1.6703591417910446, "grad_norm": 0.4501937346570606, "learning_rate": 4.078097785533398e-05, "loss": 0.4999, "step": 14325 }, { "epoch": 1.6709421641791045, "grad_norm": 0.5008841356624519, "learning_rate": 4.0773973450474055e-05, "loss": 0.4859, "step": 14330 }, { "epoch": 1.6715251865671643, "grad_norm": 0.42419460599067715, "learning_rate": 4.076696707190147e-05, "loss": 0.4636, "step": 14335 }, { "epoch": 1.672108208955224, "grad_norm": 0.4280915136097193, "learning_rate": 4.0759958720658e-05, "loss": 0.4702, "step": 14340 }, { "epoch": 1.6726912313432836, "grad_norm": 0.4195309591052, "learning_rate": 4.07529483977857e-05, "loss": 0.4785, "step": 14345 }, { "epoch": 1.6732742537313432, "grad_norm": 0.4035026247735428, "learning_rate": 4.074593610432695e-05, "loss": 0.4677, "step": 14350 }, { "epoch": 1.6738572761194028, "grad_norm": 0.37961678043272695, "learning_rate": 4.073892184132442e-05, "loss": 0.478, "step": 14355 }, { "epoch": 1.6744402985074627, "grad_norm": 0.395602393674479, "learning_rate": 4.073190560982106e-05, "loss": 0.4684, "step": 14360 }, { "epoch": 1.6750233208955225, "grad_norm": 0.42332378607544247, "learning_rate": 4.072488741086011e-05, "loss": 0.4963, "step": 14365 }, { "epoch": 1.6756063432835822, "grad_norm": 0.469879001605267, "learning_rate": 4.071786724548511e-05, "loss": 0.4671, "step": 14370 }, { "epoch": 1.6761893656716418, "grad_norm": 0.3733727950920635, "learning_rate": 4.07108451147399e-05, "loss": 0.4872, "step": 14375 }, { "epoch": 1.6767723880597014, "grad_norm": 0.46457269040707083, "learning_rate": 4.07038210196686e-05, "loss": 0.4792, "step": 14380 }, { "epoch": 1.677355410447761, "grad_norm": 0.429251035541327, "learning_rate": 4.0696794961315605e-05, "loss": 0.5056, "step": 14385 }, { "epoch": 1.677938432835821, "grad_norm": 0.3983369316978038, "learning_rate": 4.068976694072565e-05, "loss": 0.4526, "step": 14390 }, { "epoch": 1.6785214552238807, "grad_norm": 0.40331575606174813, "learning_rate": 4.068273695894373e-05, "loss": 0.471, "step": 14395 }, { "epoch": 1.6791044776119404, "grad_norm": 0.40640282769794667, "learning_rate": 4.067570501701513e-05, "loss": 0.4954, "step": 14400 }, { "epoch": 1.6796875, "grad_norm": 0.40774451617277263, "learning_rate": 4.066867111598542e-05, "loss": 0.4865, "step": 14405 }, { "epoch": 1.6802705223880596, "grad_norm": 0.36969606212151795, "learning_rate": 4.0661635256900505e-05, "loss": 0.4753, "step": 14410 }, { "epoch": 1.6808535447761193, "grad_norm": 0.42003634711138027, "learning_rate": 4.065459744080652e-05, "loss": 0.4599, "step": 14415 }, { "epoch": 1.681436567164179, "grad_norm": 0.4042331899360192, "learning_rate": 4.064755766874993e-05, "loss": 0.462, "step": 14420 }, { "epoch": 1.682019589552239, "grad_norm": 0.4189680958836512, "learning_rate": 4.064051594177749e-05, "loss": 0.486, "step": 14425 }, { "epoch": 1.6826026119402986, "grad_norm": 0.43460367229952235, "learning_rate": 4.0633472260936224e-05, "loss": 0.4778, "step": 14430 }, { "epoch": 1.6831856343283582, "grad_norm": 0.4118904647153084, "learning_rate": 4.0626426627273474e-05, "loss": 0.4786, "step": 14435 }, { "epoch": 1.6837686567164178, "grad_norm": 0.37638816320378793, "learning_rate": 4.061937904183685e-05, "loss": 0.4494, "step": 14440 }, { "epoch": 1.6843516791044775, "grad_norm": 0.3805988153836944, "learning_rate": 4.061232950567427e-05, "loss": 0.5114, "step": 14445 }, { "epoch": 1.6849347014925373, "grad_norm": 0.4358490040848953, "learning_rate": 4.060527801983391e-05, "loss": 0.4923, "step": 14450 }, { "epoch": 1.6855177238805972, "grad_norm": 0.42111081737665557, "learning_rate": 4.0598224585364276e-05, "loss": 0.4918, "step": 14455 }, { "epoch": 1.6861007462686568, "grad_norm": 0.4466995410046454, "learning_rate": 4.0591169203314145e-05, "loss": 0.4887, "step": 14460 }, { "epoch": 1.6866837686567164, "grad_norm": 0.427094980705651, "learning_rate": 4.058411187473257e-05, "loss": 0.4939, "step": 14465 }, { "epoch": 1.687266791044776, "grad_norm": 0.44279907161841936, "learning_rate": 4.057705260066894e-05, "loss": 0.5105, "step": 14470 }, { "epoch": 1.6878498134328357, "grad_norm": 0.5717252706474644, "learning_rate": 4.056999138217287e-05, "loss": 0.4705, "step": 14475 }, { "epoch": 1.6884328358208955, "grad_norm": 0.4971844718005374, "learning_rate": 4.056292822029432e-05, "loss": 0.5335, "step": 14480 }, { "epoch": 1.6890158582089554, "grad_norm": 0.4067642994557471, "learning_rate": 4.055586311608349e-05, "loss": 0.4599, "step": 14485 }, { "epoch": 1.689598880597015, "grad_norm": 0.395307690446041, "learning_rate": 4.05487960705909e-05, "loss": 0.4821, "step": 14490 }, { "epoch": 1.6901819029850746, "grad_norm": 0.42006160426752454, "learning_rate": 4.054172708486737e-05, "loss": 0.489, "step": 14495 }, { "epoch": 1.6907649253731343, "grad_norm": 0.41681042217877773, "learning_rate": 4.053465615996397e-05, "loss": 0.4973, "step": 14500 }, { "epoch": 1.691347947761194, "grad_norm": 0.4317642030407513, "learning_rate": 4.05275832969321e-05, "loss": 0.4785, "step": 14505 }, { "epoch": 1.6919309701492538, "grad_norm": 0.4183629465011244, "learning_rate": 4.0520508496823395e-05, "loss": 0.5063, "step": 14510 }, { "epoch": 1.6925139925373134, "grad_norm": 0.4227104803821206, "learning_rate": 4.051343176068984e-05, "loss": 0.512, "step": 14515 }, { "epoch": 1.6930970149253732, "grad_norm": 0.39716646367954495, "learning_rate": 4.050635308958366e-05, "loss": 0.4791, "step": 14520 }, { "epoch": 1.6936800373134329, "grad_norm": 0.4315919077606542, "learning_rate": 4.04992724845574e-05, "loss": 0.496, "step": 14525 }, { "epoch": 1.6942630597014925, "grad_norm": 0.4190476408707576, "learning_rate": 4.0492189946663864e-05, "loss": 0.4841, "step": 14530 }, { "epoch": 1.6948460820895521, "grad_norm": 0.39975994216386457, "learning_rate": 4.048510547695616e-05, "loss": 0.4635, "step": 14535 }, { "epoch": 1.695429104477612, "grad_norm": 0.4021707350007361, "learning_rate": 4.047801907648769e-05, "loss": 0.4667, "step": 14540 }, { "epoch": 1.6960121268656716, "grad_norm": 0.5246904053875769, "learning_rate": 4.047093074631213e-05, "loss": 0.471, "step": 14545 }, { "epoch": 1.6965951492537314, "grad_norm": 0.41206111863478134, "learning_rate": 4.046384048748344e-05, "loss": 0.4911, "step": 14550 }, { "epoch": 1.697178171641791, "grad_norm": 0.3873183615800148, "learning_rate": 4.045674830105587e-05, "loss": 0.4604, "step": 14555 }, { "epoch": 1.6977611940298507, "grad_norm": 0.4486891743823356, "learning_rate": 4.0449654188083985e-05, "loss": 0.4897, "step": 14560 }, { "epoch": 1.6983442164179103, "grad_norm": 0.44464525267632954, "learning_rate": 4.0442558149622586e-05, "loss": 0.4916, "step": 14565 }, { "epoch": 1.6989272388059702, "grad_norm": 0.4310393568480325, "learning_rate": 4.04354601867268e-05, "loss": 0.4586, "step": 14570 }, { "epoch": 1.6995102611940298, "grad_norm": 0.4134083766327794, "learning_rate": 4.0428360300452024e-05, "loss": 0.4872, "step": 14575 }, { "epoch": 1.7000932835820897, "grad_norm": 0.44318935135672344, "learning_rate": 4.042125849185394e-05, "loss": 0.5347, "step": 14580 }, { "epoch": 1.7006763059701493, "grad_norm": 0.44538180422620816, "learning_rate": 4.0414154761988506e-05, "loss": 0.4964, "step": 14585 }, { "epoch": 1.701259328358209, "grad_norm": 0.4207916509197397, "learning_rate": 4.040704911191201e-05, "loss": 0.4929, "step": 14590 }, { "epoch": 1.7018423507462686, "grad_norm": 0.42148426727773364, "learning_rate": 4.0399941542680956e-05, "loss": 0.4994, "step": 14595 }, { "epoch": 1.7024253731343284, "grad_norm": 0.4197532755663317, "learning_rate": 4.0392832055352205e-05, "loss": 0.4699, "step": 14600 }, { "epoch": 1.703008395522388, "grad_norm": 0.39751059891449686, "learning_rate": 4.038572065098286e-05, "loss": 0.5227, "step": 14605 }, { "epoch": 1.7035914179104479, "grad_norm": 0.40816550933831536, "learning_rate": 4.0378607330630304e-05, "loss": 0.4642, "step": 14610 }, { "epoch": 1.7041744402985075, "grad_norm": 0.3755709499880748, "learning_rate": 4.037149209535222e-05, "loss": 0.4619, "step": 14615 }, { "epoch": 1.7047574626865671, "grad_norm": 0.41100106429253497, "learning_rate": 4.036437494620661e-05, "loss": 0.4661, "step": 14620 }, { "epoch": 1.7053404850746268, "grad_norm": 0.44133548984910614, "learning_rate": 4.0357255884251694e-05, "loss": 0.4946, "step": 14625 }, { "epoch": 1.7059235074626866, "grad_norm": 0.40752151523015684, "learning_rate": 4.0350134910546e-05, "loss": 0.4846, "step": 14630 }, { "epoch": 1.7065065298507462, "grad_norm": 0.47322208410241956, "learning_rate": 4.0343012026148384e-05, "loss": 0.4784, "step": 14635 }, { "epoch": 1.707089552238806, "grad_norm": 0.40403758744911183, "learning_rate": 4.033588723211793e-05, "loss": 0.4802, "step": 14640 }, { "epoch": 1.7076725746268657, "grad_norm": 0.4412551393571504, "learning_rate": 4.032876052951402e-05, "loss": 0.5116, "step": 14645 }, { "epoch": 1.7082555970149254, "grad_norm": 0.4082133615947316, "learning_rate": 4.032163191939633e-05, "loss": 0.4833, "step": 14650 }, { "epoch": 1.708838619402985, "grad_norm": 0.44662383748583084, "learning_rate": 4.0314501402824825e-05, "loss": 0.4675, "step": 14655 }, { "epoch": 1.7094216417910446, "grad_norm": 0.456074311014277, "learning_rate": 4.030736898085974e-05, "loss": 0.4761, "step": 14660 }, { "epoch": 1.7100046641791045, "grad_norm": 0.42416393425691723, "learning_rate": 4.03002346545616e-05, "loss": 0.4865, "step": 14665 }, { "epoch": 1.7105876865671643, "grad_norm": 0.40905104456516456, "learning_rate": 4.02930984249912e-05, "loss": 0.4791, "step": 14670 }, { "epoch": 1.711170708955224, "grad_norm": 0.38741083727404313, "learning_rate": 4.028596029320965e-05, "loss": 0.4933, "step": 14675 }, { "epoch": 1.7117537313432836, "grad_norm": 0.3984672434837992, "learning_rate": 4.02788202602783e-05, "loss": 0.4811, "step": 14680 }, { "epoch": 1.7123367537313432, "grad_norm": 0.42142094914020906, "learning_rate": 4.027167832725882e-05, "loss": 0.4735, "step": 14685 }, { "epoch": 1.7129197761194028, "grad_norm": 0.39812681935895405, "learning_rate": 4.026453449521313e-05, "loss": 0.5039, "step": 14690 }, { "epoch": 1.7135027985074627, "grad_norm": 0.3855065197526314, "learning_rate": 4.025738876520347e-05, "loss": 0.4829, "step": 14695 }, { "epoch": 1.7140858208955225, "grad_norm": 0.44644473762754755, "learning_rate": 4.025024113829233e-05, "loss": 0.4941, "step": 14700 }, { "epoch": 1.7146688432835822, "grad_norm": 0.37300083226666375, "learning_rate": 4.024309161554249e-05, "loss": 0.4784, "step": 14705 }, { "epoch": 1.7152518656716418, "grad_norm": 0.40617514898541857, "learning_rate": 4.023594019801702e-05, "loss": 0.4744, "step": 14710 }, { "epoch": 1.7158348880597014, "grad_norm": 0.3840399029109539, "learning_rate": 4.022878688677927e-05, "loss": 0.4706, "step": 14715 }, { "epoch": 1.716417910447761, "grad_norm": 0.36819401392874374, "learning_rate": 4.022163168289287e-05, "loss": 0.4714, "step": 14720 }, { "epoch": 1.717000932835821, "grad_norm": 0.4237902145668965, "learning_rate": 4.021447458742172e-05, "loss": 0.4819, "step": 14725 }, { "epoch": 1.7175839552238807, "grad_norm": 0.6219554874705321, "learning_rate": 4.020731560143002e-05, "loss": 0.4804, "step": 14730 }, { "epoch": 1.7181669776119404, "grad_norm": 0.5096288424326689, "learning_rate": 4.0200154725982245e-05, "loss": 0.5171, "step": 14735 }, { "epoch": 1.71875, "grad_norm": 0.46719677801906906, "learning_rate": 4.019299196214315e-05, "loss": 0.4781, "step": 14740 }, { "epoch": 1.7193330223880596, "grad_norm": 0.41664870706254153, "learning_rate": 4.0185827310977756e-05, "loss": 0.4694, "step": 14745 }, { "epoch": 1.7199160447761193, "grad_norm": 0.42107627767474415, "learning_rate": 4.017866077355139e-05, "loss": 0.4978, "step": 14750 }, { "epoch": 1.720499067164179, "grad_norm": 0.4541677559740301, "learning_rate": 4.0171492350929644e-05, "loss": 0.4864, "step": 14755 }, { "epoch": 1.721082089552239, "grad_norm": 0.4448482281379494, "learning_rate": 4.016432204417839e-05, "loss": 0.5001, "step": 14760 }, { "epoch": 1.7216651119402986, "grad_norm": 0.4382211385559882, "learning_rate": 4.015714985436379e-05, "loss": 0.4768, "step": 14765 }, { "epoch": 1.7222481343283582, "grad_norm": 0.43206927833480646, "learning_rate": 4.014997578255227e-05, "loss": 0.4862, "step": 14770 }, { "epoch": 1.7228311567164178, "grad_norm": 0.36936715040033663, "learning_rate": 4.014279982981057e-05, "loss": 0.4378, "step": 14775 }, { "epoch": 1.7234141791044775, "grad_norm": 0.4120603192454238, "learning_rate": 4.0135621997205654e-05, "loss": 0.4609, "step": 14780 }, { "epoch": 1.7239972014925373, "grad_norm": 0.40226922820462935, "learning_rate": 4.0128442285804815e-05, "loss": 0.4681, "step": 14785 }, { "epoch": 1.7245802238805972, "grad_norm": 0.4159642213364248, "learning_rate": 4.01212606966756e-05, "loss": 0.4813, "step": 14790 }, { "epoch": 1.7251632462686568, "grad_norm": 0.4192546917299525, "learning_rate": 4.0114077230885847e-05, "loss": 0.5048, "step": 14795 }, { "epoch": 1.7257462686567164, "grad_norm": 0.4259810736862134, "learning_rate": 4.010689188950367e-05, "loss": 0.5066, "step": 14800 }, { "epoch": 1.726329291044776, "grad_norm": 0.38815377067775536, "learning_rate": 4.009970467359746e-05, "loss": 0.4957, "step": 14805 }, { "epoch": 1.7269123134328357, "grad_norm": 0.4709225043046578, "learning_rate": 4.009251558423588e-05, "loss": 0.4877, "step": 14810 }, { "epoch": 1.7274953358208955, "grad_norm": 0.4376423324231026, "learning_rate": 4.008532462248789e-05, "loss": 0.4778, "step": 14815 }, { "epoch": 1.7280783582089554, "grad_norm": 0.4088402780943302, "learning_rate": 4.00781317894227e-05, "loss": 0.4791, "step": 14820 }, { "epoch": 1.728661380597015, "grad_norm": 0.42516785610240804, "learning_rate": 4.007093708610984e-05, "loss": 0.4941, "step": 14825 }, { "epoch": 1.7292444029850746, "grad_norm": 0.4350060525008231, "learning_rate": 4.006374051361907e-05, "loss": 0.4695, "step": 14830 }, { "epoch": 1.7298274253731343, "grad_norm": 0.4314369115933184, "learning_rate": 4.005654207302047e-05, "loss": 0.4801, "step": 14835 }, { "epoch": 1.730410447761194, "grad_norm": 0.3774773619724921, "learning_rate": 4.004934176538436e-05, "loss": 0.4677, "step": 14840 }, { "epoch": 1.7309934701492538, "grad_norm": 0.4382139271394396, "learning_rate": 4.004213959178137e-05, "loss": 0.4693, "step": 14845 }, { "epoch": 1.7315764925373134, "grad_norm": 0.44729161745585944, "learning_rate": 4.0034935553282396e-05, "loss": 0.5012, "step": 14850 }, { "epoch": 1.7321595149253732, "grad_norm": 0.46312961873331987, "learning_rate": 4.00277296509586e-05, "loss": 0.4723, "step": 14855 }, { "epoch": 1.7327425373134329, "grad_norm": 0.4268816266860399, "learning_rate": 4.002052188588144e-05, "loss": 0.5003, "step": 14860 }, { "epoch": 1.7333255597014925, "grad_norm": 0.47299567394079345, "learning_rate": 4.001331225912263e-05, "loss": 0.4902, "step": 14865 }, { "epoch": 1.7339085820895521, "grad_norm": 0.45090820016715927, "learning_rate": 4.000610077175419e-05, "loss": 0.5036, "step": 14870 }, { "epoch": 1.734491604477612, "grad_norm": 0.4188897717540988, "learning_rate": 3.999888742484838e-05, "loss": 0.4765, "step": 14875 }, { "epoch": 1.7350746268656716, "grad_norm": 0.38053316117089947, "learning_rate": 3.999167221947777e-05, "loss": 0.4722, "step": 14880 }, { "epoch": 1.7356576492537314, "grad_norm": 0.4385997518493541, "learning_rate": 3.9984455156715176e-05, "loss": 0.4869, "step": 14885 }, { "epoch": 1.736240671641791, "grad_norm": 0.44990818417600253, "learning_rate": 3.997723623763372e-05, "loss": 0.4819, "step": 14890 }, { "epoch": 1.7368236940298507, "grad_norm": 0.4126902232835701, "learning_rate": 3.997001546330679e-05, "loss": 0.4975, "step": 14895 }, { "epoch": 1.7374067164179103, "grad_norm": 0.4465958329751703, "learning_rate": 3.9962792834808034e-05, "loss": 0.4966, "step": 14900 }, { "epoch": 1.7379897388059702, "grad_norm": 0.4092713940163117, "learning_rate": 3.9955568353211384e-05, "loss": 0.4841, "step": 14905 }, { "epoch": 1.7385727611940298, "grad_norm": 0.39784336874622206, "learning_rate": 3.9948342019591066e-05, "loss": 0.4578, "step": 14910 }, { "epoch": 1.7391557835820897, "grad_norm": 0.4304337157634813, "learning_rate": 3.994111383502156e-05, "loss": 0.508, "step": 14915 }, { "epoch": 1.7397388059701493, "grad_norm": 0.7312527517732008, "learning_rate": 3.993388380057763e-05, "loss": 0.4789, "step": 14920 }, { "epoch": 1.740321828358209, "grad_norm": 0.4550878792432819, "learning_rate": 3.992665191733431e-05, "loss": 0.4904, "step": 14925 }, { "epoch": 1.7409048507462686, "grad_norm": 0.3994822719676129, "learning_rate": 3.9919418186366905e-05, "loss": 0.4608, "step": 14930 }, { "epoch": 1.7414878731343284, "grad_norm": 0.4625317998459724, "learning_rate": 3.991218260875101e-05, "loss": 0.5032, "step": 14935 }, { "epoch": 1.742070895522388, "grad_norm": 0.38077012965226736, "learning_rate": 3.9904945185562484e-05, "loss": 0.4565, "step": 14940 }, { "epoch": 1.7426539179104479, "grad_norm": 0.4480502431336596, "learning_rate": 3.989770591787747e-05, "loss": 0.522, "step": 14945 }, { "epoch": 1.7432369402985075, "grad_norm": 0.398867521435284, "learning_rate": 3.989046480677236e-05, "loss": 0.465, "step": 14950 }, { "epoch": 1.7438199626865671, "grad_norm": 0.43754836115769846, "learning_rate": 3.988322185332386e-05, "loss": 0.484, "step": 14955 }, { "epoch": 1.7444029850746268, "grad_norm": 0.41737523281789957, "learning_rate": 3.987597705860891e-05, "loss": 0.4685, "step": 14960 }, { "epoch": 1.7449860074626866, "grad_norm": 0.43664087867694085, "learning_rate": 3.9868730423704754e-05, "loss": 0.4881, "step": 14965 }, { "epoch": 1.7455690298507462, "grad_norm": 0.42139600304341013, "learning_rate": 3.986148194968888e-05, "loss": 0.4855, "step": 14970 }, { "epoch": 1.746152052238806, "grad_norm": 0.5222225528877735, "learning_rate": 3.985423163763909e-05, "loss": 0.4786, "step": 14975 }, { "epoch": 1.7467350746268657, "grad_norm": 0.4322503978118561, "learning_rate": 3.9846979488633415e-05, "loss": 0.4846, "step": 14980 }, { "epoch": 1.7473180970149254, "grad_norm": 0.41162470239800886, "learning_rate": 3.9839725503750185e-05, "loss": 0.4843, "step": 14985 }, { "epoch": 1.747901119402985, "grad_norm": 0.4231023984292742, "learning_rate": 3.9832469684068007e-05, "loss": 0.4647, "step": 14990 }, { "epoch": 1.7484841417910446, "grad_norm": 0.4306464601193167, "learning_rate": 3.982521203066575e-05, "loss": 0.4401, "step": 14995 }, { "epoch": 1.7490671641791045, "grad_norm": 0.40830830901483156, "learning_rate": 3.9817952544622554e-05, "loss": 0.4667, "step": 15000 }, { "epoch": 1.7496501865671643, "grad_norm": 0.44644806445507973, "learning_rate": 3.981069122701784e-05, "loss": 0.5094, "step": 15005 }, { "epoch": 1.750233208955224, "grad_norm": 0.4111577936580399, "learning_rate": 3.9803428078931276e-05, "loss": 0.5006, "step": 15010 }, { "epoch": 1.7508162313432836, "grad_norm": 0.42627297333764413, "learning_rate": 3.979616310144284e-05, "loss": 0.4789, "step": 15015 }, { "epoch": 1.7513992537313432, "grad_norm": 0.4058739814998778, "learning_rate": 3.978889629563277e-05, "loss": 0.4479, "step": 15020 }, { "epoch": 1.7519822761194028, "grad_norm": 0.38798700870798036, "learning_rate": 3.9781627662581575e-05, "loss": 0.4926, "step": 15025 }, { "epoch": 1.7525652985074627, "grad_norm": 0.4221107232729366, "learning_rate": 3.977435720337e-05, "loss": 0.483, "step": 15030 }, { "epoch": 1.7531483208955225, "grad_norm": 0.45583926648182216, "learning_rate": 3.976708491907912e-05, "loss": 0.4824, "step": 15035 }, { "epoch": 1.7537313432835822, "grad_norm": 0.4169946301163216, "learning_rate": 3.9759810810790236e-05, "loss": 0.4917, "step": 15040 }, { "epoch": 1.7543143656716418, "grad_norm": 0.4210033692145867, "learning_rate": 3.9752534879584954e-05, "loss": 0.4882, "step": 15045 }, { "epoch": 1.7548973880597014, "grad_norm": 0.466250165685237, "learning_rate": 3.9745257126545146e-05, "loss": 0.4934, "step": 15050 }, { "epoch": 1.755480410447761, "grad_norm": 0.43642385025225544, "learning_rate": 3.973797755275291e-05, "loss": 0.4547, "step": 15055 }, { "epoch": 1.756063432835821, "grad_norm": 0.4667562211514214, "learning_rate": 3.9730696159290656e-05, "loss": 0.4952, "step": 15060 }, { "epoch": 1.7566464552238807, "grad_norm": 0.44193754300987165, "learning_rate": 3.9723412947241085e-05, "loss": 0.4613, "step": 15065 }, { "epoch": 1.7572294776119404, "grad_norm": 0.41844044588590157, "learning_rate": 3.971612791768712e-05, "loss": 0.4676, "step": 15070 }, { "epoch": 1.7578125, "grad_norm": 0.39445835046273797, "learning_rate": 3.970884107171198e-05, "loss": 0.4851, "step": 15075 }, { "epoch": 1.7583955223880596, "grad_norm": 0.405745262922451, "learning_rate": 3.970155241039914e-05, "loss": 0.4777, "step": 15080 }, { "epoch": 1.7589785447761193, "grad_norm": 0.40216231559136584, "learning_rate": 3.969426193483237e-05, "loss": 0.4627, "step": 15085 }, { "epoch": 1.759561567164179, "grad_norm": 0.45390641354272226, "learning_rate": 3.968696964609568e-05, "loss": 0.4844, "step": 15090 }, { "epoch": 1.760144589552239, "grad_norm": 0.43424950082620944, "learning_rate": 3.967967554527338e-05, "loss": 0.5153, "step": 15095 }, { "epoch": 1.7607276119402986, "grad_norm": 0.4313331182458118, "learning_rate": 3.967237963345001e-05, "loss": 0.4671, "step": 15100 }, { "epoch": 1.7613106343283582, "grad_norm": 0.4189378906681823, "learning_rate": 3.966508191171041e-05, "loss": 0.4915, "step": 15105 }, { "epoch": 1.7618936567164178, "grad_norm": 0.4284044452739007, "learning_rate": 3.9657782381139696e-05, "loss": 0.4774, "step": 15110 }, { "epoch": 1.7624766791044775, "grad_norm": 0.3976581151912186, "learning_rate": 3.965048104282323e-05, "loss": 0.5076, "step": 15115 }, { "epoch": 1.7630597014925373, "grad_norm": 0.4121697800037609, "learning_rate": 3.964317789784664e-05, "loss": 0.4844, "step": 15120 }, { "epoch": 1.7636427238805972, "grad_norm": 0.40692894613398783, "learning_rate": 3.963587294729585e-05, "loss": 0.4911, "step": 15125 }, { "epoch": 1.7642257462686568, "grad_norm": 0.40246451021405094, "learning_rate": 3.962856619225703e-05, "loss": 0.488, "step": 15130 }, { "epoch": 1.7648087686567164, "grad_norm": 0.393330174653249, "learning_rate": 3.962125763381661e-05, "loss": 0.4561, "step": 15135 }, { "epoch": 1.765391791044776, "grad_norm": 0.3870639900750078, "learning_rate": 3.961394727306133e-05, "loss": 0.4619, "step": 15140 }, { "epoch": 1.7659748134328357, "grad_norm": 0.4089934690121375, "learning_rate": 3.9606635111078156e-05, "loss": 0.4582, "step": 15145 }, { "epoch": 1.7665578358208955, "grad_norm": 0.44166689072835347, "learning_rate": 3.9599321148954325e-05, "loss": 0.4706, "step": 15150 }, { "epoch": 1.7671408582089554, "grad_norm": 0.4774148419314475, "learning_rate": 3.959200538777738e-05, "loss": 0.4628, "step": 15155 }, { "epoch": 1.767723880597015, "grad_norm": 0.4079011504725099, "learning_rate": 3.958468782863508e-05, "loss": 0.4752, "step": 15160 }, { "epoch": 1.7683069029850746, "grad_norm": 0.45201689672551193, "learning_rate": 3.957736847261548e-05, "loss": 0.5049, "step": 15165 }, { "epoch": 1.7688899253731343, "grad_norm": 0.3802703247205141, "learning_rate": 3.9570047320806916e-05, "loss": 0.4596, "step": 15170 }, { "epoch": 1.769472947761194, "grad_norm": 0.39404343060002167, "learning_rate": 3.956272437429796e-05, "loss": 0.4836, "step": 15175 }, { "epoch": 1.7700559701492538, "grad_norm": 0.44542854788396297, "learning_rate": 3.955539963417746e-05, "loss": 0.4605, "step": 15180 }, { "epoch": 1.7706389925373134, "grad_norm": 0.39794735367770373, "learning_rate": 3.954807310153454e-05, "loss": 0.4695, "step": 15185 }, { "epoch": 1.7712220149253732, "grad_norm": 0.41241430426744424, "learning_rate": 3.954074477745859e-05, "loss": 0.4811, "step": 15190 }, { "epoch": 1.7718050373134329, "grad_norm": 0.41950410364972074, "learning_rate": 3.9533414663039246e-05, "loss": 0.492, "step": 15195 }, { "epoch": 1.7723880597014925, "grad_norm": 0.4198224492459816, "learning_rate": 3.952608275936644e-05, "loss": 0.5086, "step": 15200 }, { "epoch": 1.7729710820895521, "grad_norm": 0.4022553273442121, "learning_rate": 3.951874906753035e-05, "loss": 0.4395, "step": 15205 }, { "epoch": 1.773554104477612, "grad_norm": 0.4300154004601707, "learning_rate": 3.9511413588621435e-05, "loss": 0.4696, "step": 15210 }, { "epoch": 1.7741371268656716, "grad_norm": 0.39241273429664947, "learning_rate": 3.9504076323730396e-05, "loss": 0.4657, "step": 15215 }, { "epoch": 1.7747201492537314, "grad_norm": 0.3888676516483212, "learning_rate": 3.949673727394823e-05, "loss": 0.4709, "step": 15220 }, { "epoch": 1.775303171641791, "grad_norm": 0.40229431096323487, "learning_rate": 3.948939644036616e-05, "loss": 0.4924, "step": 15225 }, { "epoch": 1.7758861940298507, "grad_norm": 0.6464933825933378, "learning_rate": 3.9482053824075716e-05, "loss": 0.4674, "step": 15230 }, { "epoch": 1.7764692164179103, "grad_norm": 0.4043491830389959, "learning_rate": 3.947470942616868e-05, "loss": 0.4623, "step": 15235 }, { "epoch": 1.7770522388059702, "grad_norm": 0.38156501215349453, "learning_rate": 3.946736324773707e-05, "loss": 0.5024, "step": 15240 }, { "epoch": 1.7776352611940298, "grad_norm": 0.39245586227753537, "learning_rate": 3.946001528987322e-05, "loss": 0.4861, "step": 15245 }, { "epoch": 1.7782182835820897, "grad_norm": 0.44125935346496437, "learning_rate": 3.945266555366968e-05, "loss": 0.4624, "step": 15250 }, { "epoch": 1.7788013059701493, "grad_norm": 0.37553696918559865, "learning_rate": 3.94453140402193e-05, "loss": 0.4735, "step": 15255 }, { "epoch": 1.779384328358209, "grad_norm": 0.425390219319215, "learning_rate": 3.943796075061517e-05, "loss": 0.4895, "step": 15260 }, { "epoch": 1.7799673507462686, "grad_norm": 0.416099722707022, "learning_rate": 3.943060568595065e-05, "loss": 0.482, "step": 15265 }, { "epoch": 1.7805503731343284, "grad_norm": 0.4170100272677797, "learning_rate": 3.942324884731938e-05, "loss": 0.4697, "step": 15270 }, { "epoch": 1.781133395522388, "grad_norm": 0.36539660037054905, "learning_rate": 3.941589023581524e-05, "loss": 0.4403, "step": 15275 }, { "epoch": 1.7817164179104479, "grad_norm": 0.3867176150081152, "learning_rate": 3.940852985253239e-05, "loss": 0.4376, "step": 15280 }, { "epoch": 1.7822994402985075, "grad_norm": 0.45738766430041133, "learning_rate": 3.940116769856526e-05, "loss": 0.493, "step": 15285 }, { "epoch": 1.7828824626865671, "grad_norm": 0.4251208871596025, "learning_rate": 3.9393803775008506e-05, "loss": 0.4974, "step": 15290 }, { "epoch": 1.7834654850746268, "grad_norm": 0.40352873209568857, "learning_rate": 3.9386438082957096e-05, "loss": 0.4754, "step": 15295 }, { "epoch": 1.7840485074626866, "grad_norm": 0.3986071012356413, "learning_rate": 3.937907062350622e-05, "loss": 0.487, "step": 15300 }, { "epoch": 1.7846315298507462, "grad_norm": 0.41332502961987394, "learning_rate": 3.937170139775137e-05, "loss": 0.4915, "step": 15305 }, { "epoch": 1.785214552238806, "grad_norm": 0.43103245611688024, "learning_rate": 3.9364330406788265e-05, "loss": 0.4801, "step": 15310 }, { "epoch": 1.7857975746268657, "grad_norm": 0.41205735119186915, "learning_rate": 3.9356957651712894e-05, "loss": 0.4816, "step": 15315 }, { "epoch": 1.7863805970149254, "grad_norm": 0.48840694762325626, "learning_rate": 3.9349583133621535e-05, "loss": 0.4694, "step": 15320 }, { "epoch": 1.786963619402985, "grad_norm": 0.4131915852321686, "learning_rate": 3.934220685361069e-05, "loss": 0.4827, "step": 15325 }, { "epoch": 1.7875466417910446, "grad_norm": 0.3894300207628937, "learning_rate": 3.933482881277715e-05, "loss": 0.4892, "step": 15330 }, { "epoch": 1.7881296641791045, "grad_norm": 0.38210279764096605, "learning_rate": 3.9327449012217955e-05, "loss": 0.4549, "step": 15335 }, { "epoch": 1.7887126865671643, "grad_norm": 0.40537470988711033, "learning_rate": 3.9320067453030415e-05, "loss": 0.485, "step": 15340 }, { "epoch": 1.789295708955224, "grad_norm": 0.4089454829387791, "learning_rate": 3.9312684136312094e-05, "loss": 0.4737, "step": 15345 }, { "epoch": 1.7898787313432836, "grad_norm": 0.4372054009396863, "learning_rate": 3.930529906316083e-05, "loss": 0.5003, "step": 15350 }, { "epoch": 1.7904617537313432, "grad_norm": 0.40955980587827856, "learning_rate": 3.9297912234674694e-05, "loss": 0.4944, "step": 15355 }, { "epoch": 1.7910447761194028, "grad_norm": 0.400700542691212, "learning_rate": 3.9290523651952046e-05, "loss": 0.4647, "step": 15360 }, { "epoch": 1.7916277985074627, "grad_norm": 0.40549421369688765, "learning_rate": 3.92831333160915e-05, "loss": 0.4676, "step": 15365 }, { "epoch": 1.7922108208955225, "grad_norm": 0.45638556477073006, "learning_rate": 3.927574122819193e-05, "loss": 0.4765, "step": 15370 }, { "epoch": 1.7927938432835822, "grad_norm": 0.4246020398214672, "learning_rate": 3.9268347389352464e-05, "loss": 0.4908, "step": 15375 }, { "epoch": 1.7933768656716418, "grad_norm": 0.3833487818350352, "learning_rate": 3.926095180067249e-05, "loss": 0.4488, "step": 15380 }, { "epoch": 1.7939598880597014, "grad_norm": 0.4093574048152263, "learning_rate": 3.925355446325167e-05, "loss": 0.5032, "step": 15385 }, { "epoch": 1.794542910447761, "grad_norm": 0.3986170325896095, "learning_rate": 3.924615537818992e-05, "loss": 0.4791, "step": 15390 }, { "epoch": 1.795125932835821, "grad_norm": 0.420527866036737, "learning_rate": 3.92387545465874e-05, "loss": 0.4917, "step": 15395 }, { "epoch": 1.7957089552238807, "grad_norm": 0.45521502374903666, "learning_rate": 3.923135196954456e-05, "loss": 0.5045, "step": 15400 }, { "epoch": 1.7962919776119404, "grad_norm": 0.41107859823130743, "learning_rate": 3.922394764816208e-05, "loss": 0.4777, "step": 15405 }, { "epoch": 1.796875, "grad_norm": 0.40650658242433396, "learning_rate": 3.92165415835409e-05, "loss": 0.4683, "step": 15410 }, { "epoch": 1.7974580223880596, "grad_norm": 0.3849555799439404, "learning_rate": 3.920913377678226e-05, "loss": 0.4823, "step": 15415 }, { "epoch": 1.7980410447761193, "grad_norm": 0.418578702384259, "learning_rate": 3.92017242289876e-05, "loss": 0.4686, "step": 15420 }, { "epoch": 1.798624067164179, "grad_norm": 0.43812039694586474, "learning_rate": 3.919431294125868e-05, "loss": 0.4732, "step": 15425 }, { "epoch": 1.799207089552239, "grad_norm": 0.41127898907608035, "learning_rate": 3.918689991469746e-05, "loss": 0.4592, "step": 15430 }, { "epoch": 1.7997901119402986, "grad_norm": 0.44816753133483084, "learning_rate": 3.91794851504062e-05, "loss": 0.475, "step": 15435 }, { "epoch": 1.8003731343283582, "grad_norm": 0.40643338863374023, "learning_rate": 3.9172068649487405e-05, "loss": 0.4557, "step": 15440 }, { "epoch": 1.8009561567164178, "grad_norm": 0.37782727725264037, "learning_rate": 3.916465041304383e-05, "loss": 0.478, "step": 15445 }, { "epoch": 1.8015391791044775, "grad_norm": 0.44244631303917853, "learning_rate": 3.91572304421785e-05, "loss": 0.487, "step": 15450 }, { "epoch": 1.8021222014925373, "grad_norm": 0.4084939226282715, "learning_rate": 3.9149808737994705e-05, "loss": 0.4687, "step": 15455 }, { "epoch": 1.8027052238805972, "grad_norm": 0.4341262271358096, "learning_rate": 3.914238530159595e-05, "loss": 0.5069, "step": 15460 }, { "epoch": 1.8032882462686568, "grad_norm": 0.4667319157767231, "learning_rate": 3.9134960134086055e-05, "loss": 0.4913, "step": 15465 }, { "epoch": 1.8038712686567164, "grad_norm": 0.38542046105302263, "learning_rate": 3.9127533236569077e-05, "loss": 0.479, "step": 15470 }, { "epoch": 1.804454291044776, "grad_norm": 0.3983104149375585, "learning_rate": 3.91201046101493e-05, "loss": 0.4763, "step": 15475 }, { "epoch": 1.8050373134328357, "grad_norm": 0.4557044991237112, "learning_rate": 3.9112674255931294e-05, "loss": 0.4729, "step": 15480 }, { "epoch": 1.8056203358208955, "grad_norm": 0.398605111774296, "learning_rate": 3.9105242175019905e-05, "loss": 0.4576, "step": 15485 }, { "epoch": 1.8062033582089554, "grad_norm": 0.4020143944218762, "learning_rate": 3.909780836852019e-05, "loss": 0.493, "step": 15490 }, { "epoch": 1.806786380597015, "grad_norm": 0.4258847032896706, "learning_rate": 3.9090372837537496e-05, "loss": 0.4779, "step": 15495 }, { "epoch": 1.8073694029850746, "grad_norm": 0.3788449876872275, "learning_rate": 3.908293558317741e-05, "loss": 0.462, "step": 15500 }, { "epoch": 1.8079524253731343, "grad_norm": 0.41534740906913603, "learning_rate": 3.907549660654577e-05, "loss": 0.476, "step": 15505 }, { "epoch": 1.808535447761194, "grad_norm": 0.4332899675796817, "learning_rate": 3.9068055908748706e-05, "loss": 0.5103, "step": 15510 }, { "epoch": 1.8091184701492538, "grad_norm": 0.4030914140670705, "learning_rate": 3.9060613490892556e-05, "loss": 0.4584, "step": 15515 }, { "epoch": 1.8097014925373134, "grad_norm": 0.39597160258735686, "learning_rate": 3.9053169354083946e-05, "loss": 0.4421, "step": 15520 }, { "epoch": 1.8102845149253732, "grad_norm": 0.398649547369236, "learning_rate": 3.904572349942974e-05, "loss": 0.4864, "step": 15525 }, { "epoch": 1.8108675373134329, "grad_norm": 0.43211937478635676, "learning_rate": 3.903827592803708e-05, "loss": 0.4928, "step": 15530 }, { "epoch": 1.8114505597014925, "grad_norm": 0.39353992590112763, "learning_rate": 3.903082664101334e-05, "loss": 0.4405, "step": 15535 }, { "epoch": 1.8120335820895521, "grad_norm": 0.41038084236870254, "learning_rate": 3.9023375639466156e-05, "loss": 0.4444, "step": 15540 }, { "epoch": 1.812616604477612, "grad_norm": 0.40088923901412543, "learning_rate": 3.901592292450342e-05, "loss": 0.4708, "step": 15545 }, { "epoch": 1.8131996268656716, "grad_norm": 0.3747100514690436, "learning_rate": 3.900846849723328e-05, "loss": 0.4713, "step": 15550 }, { "epoch": 1.8137826492537314, "grad_norm": 0.4339908537179978, "learning_rate": 3.9001012358764146e-05, "loss": 0.4815, "step": 15555 }, { "epoch": 1.814365671641791, "grad_norm": 0.38610085982358383, "learning_rate": 3.8993554510204664e-05, "loss": 0.4761, "step": 15560 }, { "epoch": 1.8149486940298507, "grad_norm": 0.42773321572752476, "learning_rate": 3.898609495266375e-05, "loss": 0.4818, "step": 15565 }, { "epoch": 1.8155317164179103, "grad_norm": 0.45083045474627925, "learning_rate": 3.897863368725056e-05, "loss": 0.5221, "step": 15570 }, { "epoch": 1.8161147388059702, "grad_norm": 0.36013209274777985, "learning_rate": 3.8971170715074526e-05, "loss": 0.4544, "step": 15575 }, { "epoch": 1.8166977611940298, "grad_norm": 0.41184520107378214, "learning_rate": 3.896370603724531e-05, "loss": 0.4854, "step": 15580 }, { "epoch": 1.8172807835820897, "grad_norm": 0.3885048648718672, "learning_rate": 3.895623965487284e-05, "loss": 0.4589, "step": 15585 }, { "epoch": 1.8178638059701493, "grad_norm": 0.3986924147173458, "learning_rate": 3.8948771569067305e-05, "loss": 0.4751, "step": 15590 }, { "epoch": 1.818446828358209, "grad_norm": 0.40984748035131313, "learning_rate": 3.8941301780939124e-05, "loss": 0.474, "step": 15595 }, { "epoch": 1.8190298507462686, "grad_norm": 0.366818795293633, "learning_rate": 3.893383029159899e-05, "loss": 0.4709, "step": 15600 }, { "epoch": 1.8196128731343284, "grad_norm": 0.4036551530085086, "learning_rate": 3.892635710215785e-05, "loss": 0.5008, "step": 15605 }, { "epoch": 1.820195895522388, "grad_norm": 0.42129360937557636, "learning_rate": 3.891888221372688e-05, "loss": 0.4685, "step": 15610 }, { "epoch": 1.8207789179104479, "grad_norm": 0.4444262082361587, "learning_rate": 3.891140562741753e-05, "loss": 0.4679, "step": 15615 }, { "epoch": 1.8213619402985075, "grad_norm": 0.413109811823458, "learning_rate": 3.89039273443415e-05, "loss": 0.4626, "step": 15620 }, { "epoch": 1.8219449626865671, "grad_norm": 0.37574586024072143, "learning_rate": 3.889644736561073e-05, "loss": 0.4729, "step": 15625 }, { "epoch": 1.8225279850746268, "grad_norm": 0.3900772734777929, "learning_rate": 3.888896569233744e-05, "loss": 0.4716, "step": 15630 }, { "epoch": 1.8231110074626866, "grad_norm": 0.4268307621995299, "learning_rate": 3.888148232563407e-05, "loss": 0.4949, "step": 15635 }, { "epoch": 1.8236940298507462, "grad_norm": 0.4358830677717346, "learning_rate": 3.887399726661332e-05, "loss": 0.4726, "step": 15640 }, { "epoch": 1.824277052238806, "grad_norm": 0.39883007170076384, "learning_rate": 3.886651051638815e-05, "loss": 0.4685, "step": 15645 }, { "epoch": 1.8248600746268657, "grad_norm": 0.40396056332788266, "learning_rate": 3.885902207607178e-05, "loss": 0.4735, "step": 15650 }, { "epoch": 1.8254430970149254, "grad_norm": 0.4167157619548304, "learning_rate": 3.885153194677766e-05, "loss": 0.4938, "step": 15655 }, { "epoch": 1.826026119402985, "grad_norm": 0.4203187960069392, "learning_rate": 3.88440401296195e-05, "loss": 0.4929, "step": 15660 }, { "epoch": 1.8266091417910446, "grad_norm": 0.4341211867568005, "learning_rate": 3.8836546625711254e-05, "loss": 0.5087, "step": 15665 }, { "epoch": 1.8271921641791045, "grad_norm": 0.39075385186204453, "learning_rate": 3.8829051436167144e-05, "loss": 0.4711, "step": 15670 }, { "epoch": 1.8277751865671643, "grad_norm": 0.41422888614841374, "learning_rate": 3.882155456210164e-05, "loss": 0.4642, "step": 15675 }, { "epoch": 1.828358208955224, "grad_norm": 0.37785499219074814, "learning_rate": 3.881405600462943e-05, "loss": 0.4893, "step": 15680 }, { "epoch": 1.8289412313432836, "grad_norm": 0.3927210965136199, "learning_rate": 3.880655576486551e-05, "loss": 0.445, "step": 15685 }, { "epoch": 1.8295242537313432, "grad_norm": 0.41703084100536003, "learning_rate": 3.879905384392508e-05, "loss": 0.5177, "step": 15690 }, { "epoch": 1.8301072761194028, "grad_norm": 0.41137230623778376, "learning_rate": 3.8791550242923594e-05, "loss": 0.4823, "step": 15695 }, { "epoch": 1.8306902985074627, "grad_norm": 0.38682073394946265, "learning_rate": 3.8784044962976776e-05, "loss": 0.4707, "step": 15700 }, { "epoch": 1.8312733208955225, "grad_norm": 0.40629038056312605, "learning_rate": 3.877653800520058e-05, "loss": 0.4682, "step": 15705 }, { "epoch": 1.8318563432835822, "grad_norm": 0.41598586100196094, "learning_rate": 3.8769029370711234e-05, "loss": 0.5037, "step": 15710 }, { "epoch": 1.8324393656716418, "grad_norm": 0.46420430672556257, "learning_rate": 3.876151906062519e-05, "loss": 0.4625, "step": 15715 }, { "epoch": 1.8330223880597014, "grad_norm": 0.46369679722699775, "learning_rate": 3.8754007076059155e-05, "loss": 0.5175, "step": 15720 }, { "epoch": 1.833605410447761, "grad_norm": 0.4181383714240629, "learning_rate": 3.8746493418130096e-05, "loss": 0.4867, "step": 15725 }, { "epoch": 1.834188432835821, "grad_norm": 0.40288133499025486, "learning_rate": 3.873897808795522e-05, "loss": 0.4839, "step": 15730 }, { "epoch": 1.8347714552238807, "grad_norm": 0.4205814700827323, "learning_rate": 3.8731461086652e-05, "loss": 0.5, "step": 15735 }, { "epoch": 1.8353544776119404, "grad_norm": 0.411766350596247, "learning_rate": 3.8723942415338105e-05, "loss": 0.4873, "step": 15740 }, { "epoch": 1.8359375, "grad_norm": 0.38163342055425953, "learning_rate": 3.871642207513152e-05, "loss": 0.451, "step": 15745 }, { "epoch": 1.8365205223880596, "grad_norm": 0.39414867687151395, "learning_rate": 3.870890006715044e-05, "loss": 0.4919, "step": 15750 }, { "epoch": 1.8371035447761193, "grad_norm": 0.4551187782615539, "learning_rate": 3.870137639251331e-05, "loss": 0.4682, "step": 15755 }, { "epoch": 1.837686567164179, "grad_norm": 0.4591681715825297, "learning_rate": 3.869385105233884e-05, "loss": 0.4877, "step": 15760 }, { "epoch": 1.838269589552239, "grad_norm": 0.4118018694693055, "learning_rate": 3.868632404774597e-05, "loss": 0.4968, "step": 15765 }, { "epoch": 1.8388526119402986, "grad_norm": 0.509013827239908, "learning_rate": 3.867879537985388e-05, "loss": 0.4833, "step": 15770 }, { "epoch": 1.8394356343283582, "grad_norm": 0.48060406980698844, "learning_rate": 3.867126504978204e-05, "loss": 0.4879, "step": 15775 }, { "epoch": 1.8400186567164178, "grad_norm": 0.3938739727043748, "learning_rate": 3.8663733058650104e-05, "loss": 0.5192, "step": 15780 }, { "epoch": 1.8406016791044775, "grad_norm": 0.38171962022765954, "learning_rate": 3.865619940757804e-05, "loss": 0.4585, "step": 15785 }, { "epoch": 1.8411847014925373, "grad_norm": 0.3789930067999332, "learning_rate": 3.8648664097686e-05, "loss": 0.4511, "step": 15790 }, { "epoch": 1.8417677238805972, "grad_norm": 0.3702657242133424, "learning_rate": 3.8641127130094434e-05, "loss": 0.4756, "step": 15795 }, { "epoch": 1.8423507462686568, "grad_norm": 0.4039677147752383, "learning_rate": 3.8633588505924e-05, "loss": 0.5078, "step": 15800 }, { "epoch": 1.8429337686567164, "grad_norm": 0.44270006231799147, "learning_rate": 3.862604822629564e-05, "loss": 0.4749, "step": 15805 }, { "epoch": 1.843516791044776, "grad_norm": 0.4071226582925008, "learning_rate": 3.861850629233051e-05, "loss": 0.4973, "step": 15810 }, { "epoch": 1.8440998134328357, "grad_norm": 0.3844192653576694, "learning_rate": 3.861096270515001e-05, "loss": 0.4487, "step": 15815 }, { "epoch": 1.8446828358208955, "grad_norm": 0.46835696367368285, "learning_rate": 3.8603417465875816e-05, "loss": 0.5028, "step": 15820 }, { "epoch": 1.8452658582089554, "grad_norm": 0.417841442575684, "learning_rate": 3.8595870575629834e-05, "loss": 0.4793, "step": 15825 }, { "epoch": 1.845848880597015, "grad_norm": 0.4381855887470024, "learning_rate": 3.858832203553421e-05, "loss": 0.5293, "step": 15830 }, { "epoch": 1.8464319029850746, "grad_norm": 0.3992981993529362, "learning_rate": 3.8580771846711335e-05, "loss": 0.4555, "step": 15835 }, { "epoch": 1.8470149253731343, "grad_norm": 0.4040824685345217, "learning_rate": 3.857322001028385e-05, "loss": 0.4593, "step": 15840 }, { "epoch": 1.847597947761194, "grad_norm": 0.38293322273225205, "learning_rate": 3.856566652737465e-05, "loss": 0.5019, "step": 15845 }, { "epoch": 1.8481809701492538, "grad_norm": 0.4585816035516511, "learning_rate": 3.855811139910686e-05, "loss": 0.4875, "step": 15850 }, { "epoch": 1.8487639925373134, "grad_norm": 0.41162663830745716, "learning_rate": 3.855055462660385e-05, "loss": 0.4919, "step": 15855 }, { "epoch": 1.8493470149253732, "grad_norm": 0.39170798846721, "learning_rate": 3.854299621098925e-05, "loss": 0.4544, "step": 15860 }, { "epoch": 1.8499300373134329, "grad_norm": 0.37926089062207197, "learning_rate": 3.853543615338691e-05, "loss": 0.4406, "step": 15865 }, { "epoch": 1.8505130597014925, "grad_norm": 0.37298974771470295, "learning_rate": 3.8527874454920955e-05, "loss": 0.4886, "step": 15870 }, { "epoch": 1.8510960820895521, "grad_norm": 0.3870604348064698, "learning_rate": 3.852031111671573e-05, "loss": 0.459, "step": 15875 }, { "epoch": 1.851679104477612, "grad_norm": 0.44136326104268875, "learning_rate": 3.851274613989582e-05, "loss": 0.5028, "step": 15880 }, { "epoch": 1.8522621268656716, "grad_norm": 0.3667399614732067, "learning_rate": 3.850517952558608e-05, "loss": 0.4462, "step": 15885 }, { "epoch": 1.8528451492537314, "grad_norm": 0.40808488522894626, "learning_rate": 3.849761127491158e-05, "loss": 0.4807, "step": 15890 }, { "epoch": 1.853428171641791, "grad_norm": 0.38135841075395843, "learning_rate": 3.849004138899766e-05, "loss": 0.4928, "step": 15895 }, { "epoch": 1.8540111940298507, "grad_norm": 0.38045258820885103, "learning_rate": 3.848246986896989e-05, "loss": 0.481, "step": 15900 }, { "epoch": 1.8545942164179103, "grad_norm": 0.40655612248620926, "learning_rate": 3.847489671595406e-05, "loss": 0.4395, "step": 15905 }, { "epoch": 1.8551772388059702, "grad_norm": 0.3829921354976652, "learning_rate": 3.8467321931076255e-05, "loss": 0.4578, "step": 15910 }, { "epoch": 1.8557602611940298, "grad_norm": 0.39279724721647, "learning_rate": 3.845974551546276e-05, "loss": 0.5077, "step": 15915 }, { "epoch": 1.8563432835820897, "grad_norm": 0.38717973434618574, "learning_rate": 3.84521674702401e-05, "loss": 0.5018, "step": 15920 }, { "epoch": 1.8569263059701493, "grad_norm": 0.3753542167148014, "learning_rate": 3.844458779653508e-05, "loss": 0.4613, "step": 15925 }, { "epoch": 1.857509328358209, "grad_norm": 0.36248882376059743, "learning_rate": 3.8437006495474716e-05, "loss": 0.4514, "step": 15930 }, { "epoch": 1.8580923507462686, "grad_norm": 0.4130159463676241, "learning_rate": 3.842942356818627e-05, "loss": 0.5111, "step": 15935 }, { "epoch": 1.8586753731343284, "grad_norm": 0.4022933677414021, "learning_rate": 3.8421839015797265e-05, "loss": 0.4664, "step": 15940 }, { "epoch": 1.859258395522388, "grad_norm": 0.45940652562024537, "learning_rate": 3.841425283943544e-05, "loss": 0.5038, "step": 15945 }, { "epoch": 1.8598414179104479, "grad_norm": 0.4338299573989669, "learning_rate": 3.840666504022879e-05, "loss": 0.469, "step": 15950 }, { "epoch": 1.8604244402985075, "grad_norm": 0.38921153278825393, "learning_rate": 3.839907561930554e-05, "loss": 0.4724, "step": 15955 }, { "epoch": 1.8610074626865671, "grad_norm": 0.4120809967947985, "learning_rate": 3.839148457779418e-05, "loss": 0.5102, "step": 15960 }, { "epoch": 1.8615904850746268, "grad_norm": 0.43073826253995934, "learning_rate": 3.838389191682341e-05, "loss": 0.4868, "step": 15965 }, { "epoch": 1.8621735074626866, "grad_norm": 0.37518589498092525, "learning_rate": 3.837629763752219e-05, "loss": 0.4519, "step": 15970 }, { "epoch": 1.8627565298507462, "grad_norm": 0.42065130984639537, "learning_rate": 3.8368701741019734e-05, "loss": 0.5179, "step": 15975 }, { "epoch": 1.863339552238806, "grad_norm": 0.44034209990802575, "learning_rate": 3.8361104228445455e-05, "loss": 0.5088, "step": 15980 }, { "epoch": 1.8639225746268657, "grad_norm": 0.428726844646854, "learning_rate": 3.835350510092904e-05, "loss": 0.5116, "step": 15985 }, { "epoch": 1.8645055970149254, "grad_norm": 0.41290111731726753, "learning_rate": 3.834590435960041e-05, "loss": 0.4702, "step": 15990 }, { "epoch": 1.865088619402985, "grad_norm": 0.38778917040761307, "learning_rate": 3.833830200558971e-05, "loss": 0.4761, "step": 15995 }, { "epoch": 1.8656716417910446, "grad_norm": 0.44238489509858847, "learning_rate": 3.8330698040027345e-05, "loss": 0.5411, "step": 16000 }, { "epoch": 1.8662546641791045, "grad_norm": 0.42042102256172875, "learning_rate": 3.832309246404396e-05, "loss": 0.5205, "step": 16005 }, { "epoch": 1.8668376865671643, "grad_norm": 0.42171279923682037, "learning_rate": 3.8315485278770423e-05, "loss": 0.4779, "step": 16010 }, { "epoch": 1.867420708955224, "grad_norm": 0.4325489253113547, "learning_rate": 3.830787648533785e-05, "loss": 0.5003, "step": 16015 }, { "epoch": 1.8680037313432836, "grad_norm": 0.44449707743834876, "learning_rate": 3.83002660848776e-05, "loss": 0.4702, "step": 16020 }, { "epoch": 1.8685867537313432, "grad_norm": 0.4057244191845612, "learning_rate": 3.829265407852125e-05, "loss": 0.4906, "step": 16025 }, { "epoch": 1.8691697761194028, "grad_norm": 0.4047231713285812, "learning_rate": 3.828504046740065e-05, "loss": 0.4838, "step": 16030 }, { "epoch": 1.8697527985074627, "grad_norm": 0.4402545345321185, "learning_rate": 3.827742525264787e-05, "loss": 0.506, "step": 16035 }, { "epoch": 1.8703358208955225, "grad_norm": 0.4318181827689146, "learning_rate": 3.826980843539521e-05, "loss": 0.4912, "step": 16040 }, { "epoch": 1.8709188432835822, "grad_norm": 0.39281509103457857, "learning_rate": 3.826219001677523e-05, "loss": 0.4507, "step": 16045 }, { "epoch": 1.8715018656716418, "grad_norm": 0.3790746086135691, "learning_rate": 3.82545699979207e-05, "loss": 0.4501, "step": 16050 }, { "epoch": 1.8720848880597014, "grad_norm": 0.4044025067278778, "learning_rate": 3.824694837996466e-05, "loss": 0.4661, "step": 16055 }, { "epoch": 1.872667910447761, "grad_norm": 0.42653108142887636, "learning_rate": 3.823932516404036e-05, "loss": 0.4689, "step": 16060 }, { "epoch": 1.873250932835821, "grad_norm": 0.38397790509604734, "learning_rate": 3.823170035128131e-05, "loss": 0.4521, "step": 16065 }, { "epoch": 1.8738339552238807, "grad_norm": 0.4002935434985125, "learning_rate": 3.822407394282123e-05, "loss": 0.4773, "step": 16070 }, { "epoch": 1.8744169776119404, "grad_norm": 0.41580866271495587, "learning_rate": 3.821644593979411e-05, "loss": 0.5112, "step": 16075 }, { "epoch": 1.875, "grad_norm": 0.38766810421634024, "learning_rate": 3.8208816343334156e-05, "loss": 0.4724, "step": 16080 }, { "epoch": 1.8755830223880596, "grad_norm": 0.40069384581867035, "learning_rate": 3.820118515457582e-05, "loss": 0.4846, "step": 16085 }, { "epoch": 1.8761660447761193, "grad_norm": 0.3903370227525892, "learning_rate": 3.819355237465377e-05, "loss": 0.4481, "step": 16090 }, { "epoch": 1.876749067164179, "grad_norm": 0.4096276880014734, "learning_rate": 3.818591800470295e-05, "loss": 0.5202, "step": 16095 }, { "epoch": 1.877332089552239, "grad_norm": 0.39952781068074394, "learning_rate": 3.81782820458585e-05, "loss": 0.4386, "step": 16100 }, { "epoch": 1.8779151119402986, "grad_norm": 0.4743106559132605, "learning_rate": 3.817064449925582e-05, "loss": 0.4819, "step": 16105 }, { "epoch": 1.8784981343283582, "grad_norm": 0.4121861252761789, "learning_rate": 3.816300536603054e-05, "loss": 0.4789, "step": 16110 }, { "epoch": 1.8790811567164178, "grad_norm": 0.41347188150712333, "learning_rate": 3.815536464731853e-05, "loss": 0.4799, "step": 16115 }, { "epoch": 1.8796641791044775, "grad_norm": 0.4027894743276831, "learning_rate": 3.814772234425588e-05, "loss": 0.4731, "step": 16120 }, { "epoch": 1.8802472014925373, "grad_norm": 0.40856626217364383, "learning_rate": 3.814007845797894e-05, "loss": 0.4692, "step": 16125 }, { "epoch": 1.8808302238805972, "grad_norm": 0.4391793128455858, "learning_rate": 3.813243298962428e-05, "loss": 0.4712, "step": 16130 }, { "epoch": 1.8814132462686568, "grad_norm": 0.4023196419084873, "learning_rate": 3.8124785940328704e-05, "loss": 0.4837, "step": 16135 }, { "epoch": 1.8819962686567164, "grad_norm": 0.4051129806594245, "learning_rate": 3.8117137311229255e-05, "loss": 0.4606, "step": 16140 }, { "epoch": 1.882579291044776, "grad_norm": 0.384969224693189, "learning_rate": 3.810948710346322e-05, "loss": 0.4887, "step": 16145 }, { "epoch": 1.8831623134328357, "grad_norm": 0.39416545187061863, "learning_rate": 3.81018353181681e-05, "loss": 0.4448, "step": 16150 }, { "epoch": 1.8837453358208955, "grad_norm": 0.39071190107945314, "learning_rate": 3.8094181956481645e-05, "loss": 0.4937, "step": 16155 }, { "epoch": 1.8843283582089554, "grad_norm": 0.40976718895408326, "learning_rate": 3.808652701954183e-05, "loss": 0.4899, "step": 16160 }, { "epoch": 1.884911380597015, "grad_norm": 0.39986071039677495, "learning_rate": 3.807887050848689e-05, "loss": 0.4747, "step": 16165 }, { "epoch": 1.8854944029850746, "grad_norm": 0.397199438043576, "learning_rate": 3.807121242445526e-05, "loss": 0.4768, "step": 16170 }, { "epoch": 1.8860774253731343, "grad_norm": 0.37446958271136305, "learning_rate": 3.806355276858562e-05, "loss": 0.4702, "step": 16175 }, { "epoch": 1.886660447761194, "grad_norm": 0.42441869032333956, "learning_rate": 3.805589154201691e-05, "loss": 0.462, "step": 16180 }, { "epoch": 1.8872434701492538, "grad_norm": 0.41373456326126706, "learning_rate": 3.804822874588825e-05, "loss": 0.4603, "step": 16185 }, { "epoch": 1.8878264925373134, "grad_norm": 0.4063084742024033, "learning_rate": 3.804056438133905e-05, "loss": 0.4649, "step": 16190 }, { "epoch": 1.8884095149253732, "grad_norm": 0.42477797034907416, "learning_rate": 3.803289844950893e-05, "loss": 0.4836, "step": 16195 }, { "epoch": 1.8889925373134329, "grad_norm": 0.3874744527316154, "learning_rate": 3.80252309515377e-05, "loss": 0.4607, "step": 16200 }, { "epoch": 1.8895755597014925, "grad_norm": 0.3893900370916626, "learning_rate": 3.801756188856549e-05, "loss": 0.4657, "step": 16205 }, { "epoch": 1.8901585820895521, "grad_norm": 0.3892493641847063, "learning_rate": 3.800989126173259e-05, "loss": 0.472, "step": 16210 }, { "epoch": 1.890741604477612, "grad_norm": 0.3738549215638216, "learning_rate": 3.800221907217956e-05, "loss": 0.4704, "step": 16215 }, { "epoch": 1.8913246268656716, "grad_norm": 0.408209912848128, "learning_rate": 3.799454532104718e-05, "loss": 0.473, "step": 16220 }, { "epoch": 1.8919076492537314, "grad_norm": 0.42433965709059185, "learning_rate": 3.7986870009476454e-05, "loss": 0.4623, "step": 16225 }, { "epoch": 1.892490671641791, "grad_norm": 0.3914089243364931, "learning_rate": 3.7979193138608646e-05, "loss": 0.4924, "step": 16230 }, { "epoch": 1.8930736940298507, "grad_norm": 0.485375920959211, "learning_rate": 3.797151470958521e-05, "loss": 0.5154, "step": 16235 }, { "epoch": 1.8936567164179103, "grad_norm": 0.38590075567283577, "learning_rate": 3.7963834723547866e-05, "loss": 0.4696, "step": 16240 }, { "epoch": 1.8942397388059702, "grad_norm": 0.410772058988447, "learning_rate": 3.795615318163856e-05, "loss": 0.4443, "step": 16245 }, { "epoch": 1.8948227611940298, "grad_norm": 0.38317535209122194, "learning_rate": 3.794847008499946e-05, "loss": 0.4993, "step": 16250 }, { "epoch": 1.8954057835820897, "grad_norm": 0.4064916270947185, "learning_rate": 3.7940785434772965e-05, "loss": 0.4767, "step": 16255 }, { "epoch": 1.8959888059701493, "grad_norm": 0.40411293880729643, "learning_rate": 3.793309923210171e-05, "loss": 0.4625, "step": 16260 }, { "epoch": 1.896571828358209, "grad_norm": 0.37456650735591096, "learning_rate": 3.792541147812856e-05, "loss": 0.5228, "step": 16265 }, { "epoch": 1.8971548507462686, "grad_norm": 0.4190371199656764, "learning_rate": 3.791772217399661e-05, "loss": 0.4706, "step": 16270 }, { "epoch": 1.8977378731343284, "grad_norm": 0.3922319515428183, "learning_rate": 3.791003132084919e-05, "loss": 0.4767, "step": 16275 }, { "epoch": 1.898320895522388, "grad_norm": 0.5470149003980315, "learning_rate": 3.7902338919829854e-05, "loss": 0.4731, "step": 16280 }, { "epoch": 1.8989039179104479, "grad_norm": 0.40976174662007087, "learning_rate": 3.789464497208238e-05, "loss": 0.4642, "step": 16285 }, { "epoch": 1.8994869402985075, "grad_norm": 0.43326296131089675, "learning_rate": 3.788694947875079e-05, "loss": 0.5053, "step": 16290 }, { "epoch": 1.9000699626865671, "grad_norm": 0.3930508352423981, "learning_rate": 3.787925244097935e-05, "loss": 0.4555, "step": 16295 }, { "epoch": 1.9006529850746268, "grad_norm": 0.3689438893532244, "learning_rate": 3.78715538599125e-05, "loss": 0.468, "step": 16300 }, { "epoch": 1.9012360074626866, "grad_norm": 0.43438128865830056, "learning_rate": 3.786385373669497e-05, "loss": 0.4738, "step": 16305 }, { "epoch": 1.9018190298507462, "grad_norm": 0.41853652061756247, "learning_rate": 3.7856152072471686e-05, "loss": 0.4671, "step": 16310 }, { "epoch": 1.902402052238806, "grad_norm": 0.3945411702472311, "learning_rate": 3.7848448868387806e-05, "loss": 0.4479, "step": 16315 }, { "epoch": 1.9029850746268657, "grad_norm": 0.38768017734875904, "learning_rate": 3.784074412558875e-05, "loss": 0.4574, "step": 16320 }, { "epoch": 1.9035680970149254, "grad_norm": 0.39455269546284544, "learning_rate": 3.7833037845220097e-05, "loss": 0.4786, "step": 16325 }, { "epoch": 1.904151119402985, "grad_norm": 0.4239773973420398, "learning_rate": 3.782533002842773e-05, "loss": 0.528, "step": 16330 }, { "epoch": 1.9047341417910446, "grad_norm": 0.42244613712490187, "learning_rate": 3.781762067635771e-05, "loss": 0.4987, "step": 16335 }, { "epoch": 1.9053171641791045, "grad_norm": 0.44852303417013406, "learning_rate": 3.7809909790156355e-05, "loss": 0.4746, "step": 16340 }, { "epoch": 1.9059001865671643, "grad_norm": 0.4393086527481142, "learning_rate": 3.78021973709702e-05, "loss": 0.4542, "step": 16345 }, { "epoch": 1.906483208955224, "grad_norm": 0.402224723983359, "learning_rate": 3.7794483419946e-05, "loss": 0.4717, "step": 16350 }, { "epoch": 1.9070662313432836, "grad_norm": 0.4243706638845106, "learning_rate": 3.778676793823075e-05, "loss": 0.4572, "step": 16355 }, { "epoch": 1.9076492537313432, "grad_norm": 0.41599027838302116, "learning_rate": 3.777905092697166e-05, "loss": 0.4829, "step": 16360 }, { "epoch": 1.9082322761194028, "grad_norm": 0.4017264788022485, "learning_rate": 3.7771332387316186e-05, "loss": 0.4711, "step": 16365 }, { "epoch": 1.9088152985074627, "grad_norm": 0.4550572468239916, "learning_rate": 3.7763612320412e-05, "loss": 0.4829, "step": 16370 }, { "epoch": 1.9093983208955225, "grad_norm": 0.41334212997226194, "learning_rate": 3.7755890727406994e-05, "loss": 0.455, "step": 16375 }, { "epoch": 1.9099813432835822, "grad_norm": 0.4022630693327699, "learning_rate": 3.77481676094493e-05, "loss": 0.4953, "step": 16380 }, { "epoch": 1.9105643656716418, "grad_norm": 0.4484057334472698, "learning_rate": 3.774044296768728e-05, "loss": 0.4769, "step": 16385 }, { "epoch": 1.9111473880597014, "grad_norm": 0.40114234694902606, "learning_rate": 3.77327168032695e-05, "loss": 0.4843, "step": 16390 }, { "epoch": 1.911730410447761, "grad_norm": 0.42756820757515207, "learning_rate": 3.772498911734478e-05, "loss": 0.465, "step": 16395 }, { "epoch": 1.912313432835821, "grad_norm": 0.4081033664707095, "learning_rate": 3.771725991106214e-05, "loss": 0.4989, "step": 16400 }, { "epoch": 1.9128964552238807, "grad_norm": 0.39990963402946317, "learning_rate": 3.7709529185570846e-05, "loss": 0.4549, "step": 16405 }, { "epoch": 1.9134794776119404, "grad_norm": 0.39415693532083024, "learning_rate": 3.770179694202038e-05, "loss": 0.5289, "step": 16410 }, { "epoch": 1.9140625, "grad_norm": 0.36870474335573705, "learning_rate": 3.7694063181560456e-05, "loss": 0.4529, "step": 16415 }, { "epoch": 1.9146455223880596, "grad_norm": 0.3811879050835146, "learning_rate": 3.7686327905341014e-05, "loss": 0.4886, "step": 16420 }, { "epoch": 1.9152285447761193, "grad_norm": 0.42225081839415346, "learning_rate": 3.76785911145122e-05, "loss": 0.4569, "step": 16425 }, { "epoch": 1.915811567164179, "grad_norm": 0.3893868986907824, "learning_rate": 3.767085281022441e-05, "loss": 0.5049, "step": 16430 }, { "epoch": 1.916394589552239, "grad_norm": 0.3857552525470896, "learning_rate": 3.766311299362826e-05, "loss": 0.485, "step": 16435 }, { "epoch": 1.9169776119402986, "grad_norm": 0.41590242594626253, "learning_rate": 3.765537166587458e-05, "loss": 0.4761, "step": 16440 }, { "epoch": 1.9175606343283582, "grad_norm": 0.41966464258161296, "learning_rate": 3.764762882811444e-05, "loss": 0.4939, "step": 16445 }, { "epoch": 1.9181436567164178, "grad_norm": 0.37378791935038896, "learning_rate": 3.763988448149912e-05, "loss": 0.4749, "step": 16450 }, { "epoch": 1.9187266791044775, "grad_norm": 0.3623872216384651, "learning_rate": 3.763213862718012e-05, "loss": 0.4621, "step": 16455 }, { "epoch": 1.9193097014925373, "grad_norm": 0.4088015820211004, "learning_rate": 3.762439126630919e-05, "loss": 0.4975, "step": 16460 }, { "epoch": 1.9198927238805972, "grad_norm": 0.41167934288178915, "learning_rate": 3.761664240003828e-05, "loss": 0.483, "step": 16465 }, { "epoch": 1.9204757462686568, "grad_norm": 0.3566403186749648, "learning_rate": 3.7608892029519576e-05, "loss": 0.4467, "step": 16470 }, { "epoch": 1.9210587686567164, "grad_norm": 0.3960928211013188, "learning_rate": 3.7601140155905485e-05, "loss": 0.4905, "step": 16475 }, { "epoch": 1.921641791044776, "grad_norm": 0.42244900696929966, "learning_rate": 3.7593386780348625e-05, "loss": 0.4826, "step": 16480 }, { "epoch": 1.9222248134328357, "grad_norm": 0.41264798541535225, "learning_rate": 3.758563190400187e-05, "loss": 0.4481, "step": 16485 }, { "epoch": 1.9228078358208955, "grad_norm": 0.4784051652523608, "learning_rate": 3.757787552801827e-05, "loss": 0.5125, "step": 16490 }, { "epoch": 1.9233908582089554, "grad_norm": 0.40478268729486794, "learning_rate": 3.757011765355115e-05, "loss": 0.4877, "step": 16495 }, { "epoch": 1.923973880597015, "grad_norm": 0.43157962225933183, "learning_rate": 3.756235828175401e-05, "loss": 0.4886, "step": 16500 }, { "epoch": 1.9245569029850746, "grad_norm": 0.3870430698134395, "learning_rate": 3.755459741378061e-05, "loss": 0.4539, "step": 16505 }, { "epoch": 1.9251399253731343, "grad_norm": 0.5381877988217303, "learning_rate": 3.75468350507849e-05, "loss": 0.5197, "step": 16510 }, { "epoch": 1.925722947761194, "grad_norm": 0.42249259646697085, "learning_rate": 3.753907119392108e-05, "loss": 0.5015, "step": 16515 }, { "epoch": 1.9263059701492538, "grad_norm": 0.4390311414973388, "learning_rate": 3.753130584434357e-05, "loss": 0.4683, "step": 16520 }, { "epoch": 1.9268889925373134, "grad_norm": 0.4090149980450418, "learning_rate": 3.752353900320698e-05, "loss": 0.452, "step": 16525 }, { "epoch": 1.9274720149253732, "grad_norm": 0.4061074236998217, "learning_rate": 3.7515770671666175e-05, "loss": 0.4854, "step": 16530 }, { "epoch": 1.9280550373134329, "grad_norm": 0.4290232571924102, "learning_rate": 3.750800085087625e-05, "loss": 0.5024, "step": 16535 }, { "epoch": 1.9286380597014925, "grad_norm": 0.5160988567004748, "learning_rate": 3.750022954199248e-05, "loss": 0.5046, "step": 16540 }, { "epoch": 1.9292210820895521, "grad_norm": 0.38032340028583833, "learning_rate": 3.7492456746170385e-05, "loss": 0.4712, "step": 16545 }, { "epoch": 1.929804104477612, "grad_norm": 0.41077435748539576, "learning_rate": 3.748468246456572e-05, "loss": 0.4761, "step": 16550 }, { "epoch": 1.9303871268656716, "grad_norm": 0.4700124737492129, "learning_rate": 3.747690669833443e-05, "loss": 0.4658, "step": 16555 }, { "epoch": 1.9309701492537314, "grad_norm": 0.46311118729692075, "learning_rate": 3.7469129448632704e-05, "loss": 0.4817, "step": 16560 }, { "epoch": 1.931553171641791, "grad_norm": 0.42113606356414157, "learning_rate": 3.7461350716616955e-05, "loss": 0.4866, "step": 16565 }, { "epoch": 1.9321361940298507, "grad_norm": 0.4025147398337016, "learning_rate": 3.7453570503443785e-05, "loss": 0.4595, "step": 16570 }, { "epoch": 1.9327192164179103, "grad_norm": 0.4229540767535665, "learning_rate": 3.744578881027006e-05, "loss": 0.4806, "step": 16575 }, { "epoch": 1.9333022388059702, "grad_norm": 0.45409889851507895, "learning_rate": 3.743800563825283e-05, "loss": 0.5039, "step": 16580 }, { "epoch": 1.9338852611940298, "grad_norm": 0.38530571510428296, "learning_rate": 3.743022098854937e-05, "loss": 0.4766, "step": 16585 }, { "epoch": 1.9344682835820897, "grad_norm": 0.46896570244232355, "learning_rate": 3.742243486231719e-05, "loss": 0.4815, "step": 16590 }, { "epoch": 1.9350513059701493, "grad_norm": 0.43267921963119627, "learning_rate": 3.7414647260714025e-05, "loss": 0.4887, "step": 16595 }, { "epoch": 1.935634328358209, "grad_norm": 0.7028618230877963, "learning_rate": 3.74068581848978e-05, "loss": 0.4806, "step": 16600 }, { "epoch": 1.9362173507462686, "grad_norm": 0.3861540918854558, "learning_rate": 3.739906763602669e-05, "loss": 0.4705, "step": 16605 }, { "epoch": 1.9368003731343284, "grad_norm": 2.3870802765535304, "learning_rate": 3.7391275615259065e-05, "loss": 0.4961, "step": 16610 }, { "epoch": 1.937383395522388, "grad_norm": 0.3844269511493217, "learning_rate": 3.738348212375352e-05, "loss": 0.4621, "step": 16615 }, { "epoch": 1.9379664179104479, "grad_norm": 0.4041414263644477, "learning_rate": 3.737568716266888e-05, "loss": 0.4719, "step": 16620 }, { "epoch": 1.9385494402985075, "grad_norm": 0.37973447998386956, "learning_rate": 3.7367890733164185e-05, "loss": 0.4713, "step": 16625 }, { "epoch": 1.9391324626865671, "grad_norm": 0.39106083913968837, "learning_rate": 3.7360092836398686e-05, "loss": 0.4714, "step": 16630 }, { "epoch": 1.9397154850746268, "grad_norm": 0.39565639330982066, "learning_rate": 3.7352293473531844e-05, "loss": 0.4771, "step": 16635 }, { "epoch": 1.9402985074626866, "grad_norm": 0.3578259873475878, "learning_rate": 3.734449264572336e-05, "loss": 0.459, "step": 16640 }, { "epoch": 1.9408815298507462, "grad_norm": 0.3787654282979704, "learning_rate": 3.733669035413315e-05, "loss": 0.4745, "step": 16645 }, { "epoch": 1.941464552238806, "grad_norm": 0.39718938432333034, "learning_rate": 3.7328886599921327e-05, "loss": 0.466, "step": 16650 }, { "epoch": 1.9420475746268657, "grad_norm": 0.4102061499601817, "learning_rate": 3.732108138424824e-05, "loss": 0.5032, "step": 16655 }, { "epoch": 1.9426305970149254, "grad_norm": 0.3774949062632342, "learning_rate": 3.7313274708274445e-05, "loss": 0.4674, "step": 16660 }, { "epoch": 1.943213619402985, "grad_norm": 0.3984575933091318, "learning_rate": 3.7305466573160725e-05, "loss": 0.4679, "step": 16665 }, { "epoch": 1.9437966417910446, "grad_norm": 0.40535943928527784, "learning_rate": 3.729765698006808e-05, "loss": 0.4664, "step": 16670 }, { "epoch": 1.9443796641791045, "grad_norm": 0.5454599589638158, "learning_rate": 3.7289845930157704e-05, "loss": 0.4414, "step": 16675 }, { "epoch": 1.9449626865671643, "grad_norm": 0.3859731888383419, "learning_rate": 3.7282033424591043e-05, "loss": 0.4554, "step": 16680 }, { "epoch": 1.945545708955224, "grad_norm": 0.3685656495645585, "learning_rate": 3.7274219464529736e-05, "loss": 0.4528, "step": 16685 }, { "epoch": 1.9461287313432836, "grad_norm": 0.4588171481071328, "learning_rate": 3.726640405113564e-05, "loss": 0.4844, "step": 16690 }, { "epoch": 1.9467117537313432, "grad_norm": 0.36823038271837566, "learning_rate": 3.725858718557084e-05, "loss": 0.4454, "step": 16695 }, { "epoch": 1.9472947761194028, "grad_norm": 0.40010987425600264, "learning_rate": 3.725076886899763e-05, "loss": 0.4504, "step": 16700 }, { "epoch": 1.9478777985074627, "grad_norm": 0.38973950050762535, "learning_rate": 3.72429491025785e-05, "loss": 0.5048, "step": 16705 }, { "epoch": 1.9484608208955225, "grad_norm": 0.4310358335759325, "learning_rate": 3.723512788747619e-05, "loss": 0.475, "step": 16710 }, { "epoch": 1.9490438432835822, "grad_norm": 0.46712116479614596, "learning_rate": 3.722730522485364e-05, "loss": 0.541, "step": 16715 }, { "epoch": 1.9496268656716418, "grad_norm": 0.4316970096467175, "learning_rate": 3.721948111587399e-05, "loss": 0.5075, "step": 16720 }, { "epoch": 1.9502098880597014, "grad_norm": 0.4205955396297808, "learning_rate": 3.721165556170065e-05, "loss": 0.4548, "step": 16725 }, { "epoch": 1.950792910447761, "grad_norm": 0.4203871211897958, "learning_rate": 3.720382856349715e-05, "loss": 0.4675, "step": 16730 }, { "epoch": 1.951375932835821, "grad_norm": 0.41475197902523725, "learning_rate": 3.719600012242733e-05, "loss": 0.4433, "step": 16735 }, { "epoch": 1.9519589552238807, "grad_norm": 0.4081830980890671, "learning_rate": 3.718817023965519e-05, "loss": 0.463, "step": 16740 }, { "epoch": 1.9525419776119404, "grad_norm": 0.4211230101753503, "learning_rate": 3.718033891634496e-05, "loss": 0.4878, "step": 16745 }, { "epoch": 1.953125, "grad_norm": 0.42193399504459156, "learning_rate": 3.717250615366108e-05, "loss": 0.5, "step": 16750 }, { "epoch": 1.9537080223880596, "grad_norm": 0.4219503411049521, "learning_rate": 3.7164671952768206e-05, "loss": 0.4733, "step": 16755 }, { "epoch": 1.9542910447761193, "grad_norm": 0.406463599638427, "learning_rate": 3.715683631483121e-05, "loss": 0.4603, "step": 16760 }, { "epoch": 1.954874067164179, "grad_norm": 0.3855559528822891, "learning_rate": 3.7148999241015185e-05, "loss": 0.4534, "step": 16765 }, { "epoch": 1.955457089552239, "grad_norm": 0.37082440935305794, "learning_rate": 3.714116073248542e-05, "loss": 0.4427, "step": 16770 }, { "epoch": 1.9560401119402986, "grad_norm": 0.3607069535459937, "learning_rate": 3.713332079040743e-05, "loss": 0.4756, "step": 16775 }, { "epoch": 1.9566231343283582, "grad_norm": 0.4334772889695182, "learning_rate": 3.712547941594693e-05, "loss": 0.4924, "step": 16780 }, { "epoch": 1.9572061567164178, "grad_norm": 0.3856978224354025, "learning_rate": 3.7117636610269875e-05, "loss": 0.4632, "step": 16785 }, { "epoch": 1.9577891791044775, "grad_norm": 0.4645421829101539, "learning_rate": 3.71097923745424e-05, "loss": 0.5006, "step": 16790 }, { "epoch": 1.9583722014925373, "grad_norm": 0.3868172011914484, "learning_rate": 3.710194670993087e-05, "loss": 0.4852, "step": 16795 }, { "epoch": 1.9589552238805972, "grad_norm": 0.42532981954584903, "learning_rate": 3.709409961760186e-05, "loss": 0.4869, "step": 16800 }, { "epoch": 1.9595382462686568, "grad_norm": 0.39036454374697166, "learning_rate": 3.708625109872217e-05, "loss": 0.4682, "step": 16805 }, { "epoch": 1.9601212686567164, "grad_norm": 0.9115506210889813, "learning_rate": 3.707840115445877e-05, "loss": 0.4796, "step": 16810 }, { "epoch": 1.960704291044776, "grad_norm": 0.3994822012039142, "learning_rate": 3.707054978597891e-05, "loss": 0.4547, "step": 16815 }, { "epoch": 1.9612873134328357, "grad_norm": 0.3920868940709815, "learning_rate": 3.706269699444998e-05, "loss": 0.4774, "step": 16820 }, { "epoch": 1.9618703358208955, "grad_norm": 0.45976672383860007, "learning_rate": 3.705484278103964e-05, "loss": 0.482, "step": 16825 }, { "epoch": 1.9624533582089554, "grad_norm": 0.40408221217538515, "learning_rate": 3.704698714691572e-05, "loss": 0.4559, "step": 16830 }, { "epoch": 1.963036380597015, "grad_norm": 0.38867974780837394, "learning_rate": 3.703913009324628e-05, "loss": 0.4788, "step": 16835 }, { "epoch": 1.9636194029850746, "grad_norm": 0.4074948137795385, "learning_rate": 3.703127162119959e-05, "loss": 0.4924, "step": 16840 }, { "epoch": 1.9642024253731343, "grad_norm": 0.3532292695939483, "learning_rate": 3.702341173194413e-05, "loss": 0.4759, "step": 16845 }, { "epoch": 1.964785447761194, "grad_norm": 0.3673161523930121, "learning_rate": 3.701555042664861e-05, "loss": 0.4246, "step": 16850 }, { "epoch": 1.9653684701492538, "grad_norm": 0.3983167128787513, "learning_rate": 3.7007687706481896e-05, "loss": 0.4742, "step": 16855 }, { "epoch": 1.9659514925373134, "grad_norm": 0.44069995874263695, "learning_rate": 3.699982357261312e-05, "loss": 0.4747, "step": 16860 }, { "epoch": 1.9665345149253732, "grad_norm": 0.4179449484696286, "learning_rate": 3.699195802621159e-05, "loss": 0.4814, "step": 16865 }, { "epoch": 1.9671175373134329, "grad_norm": 0.5313306303372765, "learning_rate": 3.6984091068446855e-05, "loss": 0.4724, "step": 16870 }, { "epoch": 1.9677005597014925, "grad_norm": 0.4041824205186734, "learning_rate": 3.697622270048864e-05, "loss": 0.4871, "step": 16875 }, { "epoch": 1.9682835820895521, "grad_norm": 0.38779126275640496, "learning_rate": 3.69683529235069e-05, "loss": 0.5014, "step": 16880 }, { "epoch": 1.968866604477612, "grad_norm": 0.4087047082682217, "learning_rate": 3.6960481738671806e-05, "loss": 0.4835, "step": 16885 }, { "epoch": 1.9694496268656716, "grad_norm": 0.4257340759685614, "learning_rate": 3.695260914715372e-05, "loss": 0.4789, "step": 16890 }, { "epoch": 1.9700326492537314, "grad_norm": 0.3888475404211362, "learning_rate": 3.6944735150123215e-05, "loss": 0.4573, "step": 16895 }, { "epoch": 1.970615671641791, "grad_norm": 0.43264952576993276, "learning_rate": 3.693685974875109e-05, "loss": 0.5198, "step": 16900 }, { "epoch": 1.9711986940298507, "grad_norm": 0.375427913943805, "learning_rate": 3.6928982944208344e-05, "loss": 0.4761, "step": 16905 }, { "epoch": 1.9717817164179103, "grad_norm": 0.39791450964579966, "learning_rate": 3.692110473766616e-05, "loss": 0.5142, "step": 16910 }, { "epoch": 1.9723647388059702, "grad_norm": 0.38141927095940475, "learning_rate": 3.6913225130295974e-05, "loss": 0.4504, "step": 16915 }, { "epoch": 1.9729477611940298, "grad_norm": 0.4014015389936595, "learning_rate": 3.69053441232694e-05, "loss": 0.4564, "step": 16920 }, { "epoch": 1.9735307835820897, "grad_norm": 0.3993117903736875, "learning_rate": 3.689746171775827e-05, "loss": 0.4577, "step": 16925 }, { "epoch": 1.9741138059701493, "grad_norm": 0.40597494605440465, "learning_rate": 3.688957791493462e-05, "loss": 0.4999, "step": 16930 }, { "epoch": 1.974696828358209, "grad_norm": 0.40406283019005323, "learning_rate": 3.68816927159707e-05, "loss": 0.4712, "step": 16935 }, { "epoch": 1.9752798507462686, "grad_norm": 0.368111443636303, "learning_rate": 3.6873806122038964e-05, "loss": 0.4703, "step": 16940 }, { "epoch": 1.9758628731343284, "grad_norm": 0.406242724610839, "learning_rate": 3.686591813431206e-05, "loss": 0.4891, "step": 16945 }, { "epoch": 1.976445895522388, "grad_norm": 0.39794611594957685, "learning_rate": 3.685802875396287e-05, "loss": 0.5092, "step": 16950 }, { "epoch": 1.9770289179104479, "grad_norm": 0.47672970484842125, "learning_rate": 3.6850137982164466e-05, "loss": 0.5087, "step": 16955 }, { "epoch": 1.9776119402985075, "grad_norm": 0.38184874607776165, "learning_rate": 3.684224582009014e-05, "loss": 0.4717, "step": 16960 }, { "epoch": 1.9781949626865671, "grad_norm": 0.4241920379079499, "learning_rate": 3.683435226891335e-05, "loss": 0.4584, "step": 16965 }, { "epoch": 1.9787779850746268, "grad_norm": 0.3959445216740194, "learning_rate": 3.682645732980783e-05, "loss": 0.4829, "step": 16970 }, { "epoch": 1.9793610074626866, "grad_norm": 0.39963720749487497, "learning_rate": 3.681856100394745e-05, "loss": 0.4618, "step": 16975 }, { "epoch": 1.9799440298507462, "grad_norm": 0.42456237621721227, "learning_rate": 3.6810663292506344e-05, "loss": 0.4766, "step": 16980 }, { "epoch": 1.980527052238806, "grad_norm": 0.41505505807337784, "learning_rate": 3.68027641966588e-05, "loss": 0.4658, "step": 16985 }, { "epoch": 1.9811100746268657, "grad_norm": 0.4117451536711639, "learning_rate": 3.6794863717579365e-05, "loss": 0.4696, "step": 16990 }, { "epoch": 1.9816930970149254, "grad_norm": 0.37960314538128104, "learning_rate": 3.6786961856442737e-05, "loss": 0.4542, "step": 16995 }, { "epoch": 1.982276119402985, "grad_norm": 0.3902955640541868, "learning_rate": 3.677905861442387e-05, "loss": 0.4879, "step": 17000 }, { "epoch": 1.9828591417910446, "grad_norm": 0.41164180183844934, "learning_rate": 3.677115399269789e-05, "loss": 0.4881, "step": 17005 }, { "epoch": 1.9834421641791045, "grad_norm": 0.3998624672180739, "learning_rate": 3.676324799244014e-05, "loss": 0.4764, "step": 17010 }, { "epoch": 1.9840251865671643, "grad_norm": 0.4166789335922044, "learning_rate": 3.675534061482617e-05, "loss": 0.4781, "step": 17015 }, { "epoch": 1.984608208955224, "grad_norm": 0.44753527894552464, "learning_rate": 3.6747431861031716e-05, "loss": 0.48, "step": 17020 }, { "epoch": 1.9851912313432836, "grad_norm": 0.4377000277323754, "learning_rate": 3.6739521732232756e-05, "loss": 0.4719, "step": 17025 }, { "epoch": 1.9857742537313432, "grad_norm": 0.4556311746195216, "learning_rate": 3.673161022960544e-05, "loss": 0.4919, "step": 17030 }, { "epoch": 1.9863572761194028, "grad_norm": 0.39199825786880405, "learning_rate": 3.6723697354326134e-05, "loss": 0.4587, "step": 17035 }, { "epoch": 1.9869402985074627, "grad_norm": 0.3974044923023687, "learning_rate": 3.67157831075714e-05, "loss": 0.4645, "step": 17040 }, { "epoch": 1.9875233208955225, "grad_norm": 0.3778225135522206, "learning_rate": 3.670786749051801e-05, "loss": 0.466, "step": 17045 }, { "epoch": 1.9881063432835822, "grad_norm": 0.40936053779607, "learning_rate": 3.6699950504342954e-05, "loss": 0.4591, "step": 17050 }, { "epoch": 1.9886893656716418, "grad_norm": 0.44904800916291815, "learning_rate": 3.669203215022341e-05, "loss": 0.4915, "step": 17055 }, { "epoch": 1.9892723880597014, "grad_norm": 0.39843809526649276, "learning_rate": 3.6684112429336745e-05, "loss": 0.4855, "step": 17060 }, { "epoch": 1.989855410447761, "grad_norm": 0.3620310448140616, "learning_rate": 3.667619134286057e-05, "loss": 0.4501, "step": 17065 }, { "epoch": 1.990438432835821, "grad_norm": 0.444308166017696, "learning_rate": 3.666826889197265e-05, "loss": 0.472, "step": 17070 }, { "epoch": 1.9910214552238807, "grad_norm": 0.37952757162829676, "learning_rate": 3.666034507785098e-05, "loss": 0.4652, "step": 17075 }, { "epoch": 1.9916044776119404, "grad_norm": 0.4185276423874473, "learning_rate": 3.665241990167378e-05, "loss": 0.4616, "step": 17080 }, { "epoch": 1.9921875, "grad_norm": 0.38571830135742774, "learning_rate": 3.664449336461943e-05, "loss": 0.4805, "step": 17085 }, { "epoch": 1.9927705223880596, "grad_norm": 0.42090409971706727, "learning_rate": 3.663656546786653e-05, "loss": 0.4862, "step": 17090 }, { "epoch": 1.9933535447761193, "grad_norm": 0.40892314384825473, "learning_rate": 3.6628636212593874e-05, "loss": 0.4472, "step": 17095 }, { "epoch": 1.993936567164179, "grad_norm": 0.4050926400176725, "learning_rate": 3.6620705599980494e-05, "loss": 0.4617, "step": 17100 }, { "epoch": 1.994519589552239, "grad_norm": 0.37504799105020076, "learning_rate": 3.6612773631205567e-05, "loss": 0.432, "step": 17105 }, { "epoch": 1.9951026119402986, "grad_norm": 0.3890783052111123, "learning_rate": 3.660484030744852e-05, "loss": 0.4245, "step": 17110 }, { "epoch": 1.9956856343283582, "grad_norm": 0.5019978499285157, "learning_rate": 3.659690562988894e-05, "loss": 0.4986, "step": 17115 }, { "epoch": 1.9962686567164178, "grad_norm": 0.38062845774729975, "learning_rate": 3.6588969599706665e-05, "loss": 0.4555, "step": 17120 }, { "epoch": 1.9968516791044775, "grad_norm": 0.3986977104412468, "learning_rate": 3.6581032218081685e-05, "loss": 0.4601, "step": 17125 }, { "epoch": 1.9974347014925373, "grad_norm": 0.41108104833354003, "learning_rate": 3.6573093486194226e-05, "loss": 0.4832, "step": 17130 }, { "epoch": 1.9980177238805972, "grad_norm": 0.39946669843141763, "learning_rate": 3.65651534052247e-05, "loss": 0.4761, "step": 17135 }, { "epoch": 1.9986007462686568, "grad_norm": 0.617041064444864, "learning_rate": 3.655721197635371e-05, "loss": 0.4633, "step": 17140 }, { "epoch": 1.9991837686567164, "grad_norm": 0.3566803157427374, "learning_rate": 3.654926920076208e-05, "loss": 0.4548, "step": 17145 }, { "epoch": 1.999766791044776, "grad_norm": 0.39221319928845394, "learning_rate": 3.654132507963083e-05, "loss": 0.4758, "step": 17150 }, { "epoch": 2.0003498134328357, "grad_norm": 0.35974425418608424, "learning_rate": 3.653337961414116e-05, "loss": 0.4572, "step": 17155 }, { "epoch": 2.0009328358208953, "grad_norm": 0.41393158661494905, "learning_rate": 3.652543280547449e-05, "loss": 0.4038, "step": 17160 }, { "epoch": 2.0015158582089554, "grad_norm": 0.4318684000835193, "learning_rate": 3.651748465481245e-05, "loss": 0.4066, "step": 17165 }, { "epoch": 2.002098880597015, "grad_norm": 0.42383911903824045, "learning_rate": 3.650953516333682e-05, "loss": 0.4109, "step": 17170 }, { "epoch": 2.0026819029850746, "grad_norm": 0.4168315446980631, "learning_rate": 3.6501584332229645e-05, "loss": 0.3801, "step": 17175 }, { "epoch": 2.0032649253731343, "grad_norm": 0.44231291127034256, "learning_rate": 3.6493632162673125e-05, "loss": 0.4155, "step": 17180 }, { "epoch": 2.003847947761194, "grad_norm": 0.4186092913713343, "learning_rate": 3.648567865584967e-05, "loss": 0.4237, "step": 17185 }, { "epoch": 2.0044309701492535, "grad_norm": 0.3977411578691464, "learning_rate": 3.647772381294189e-05, "loss": 0.4077, "step": 17190 }, { "epoch": 2.0050139925373136, "grad_norm": 0.4615182032330395, "learning_rate": 3.64697676351326e-05, "loss": 0.3831, "step": 17195 }, { "epoch": 2.0055970149253732, "grad_norm": 0.4155941705113631, "learning_rate": 3.6461810123604805e-05, "loss": 0.4059, "step": 17200 }, { "epoch": 2.006180037313433, "grad_norm": 0.4474256705323867, "learning_rate": 3.645385127954171e-05, "loss": 0.4716, "step": 17205 }, { "epoch": 2.0067630597014925, "grad_norm": 0.4054934065107215, "learning_rate": 3.6445891104126714e-05, "loss": 0.4249, "step": 17210 }, { "epoch": 2.007346082089552, "grad_norm": 0.3976611767707526, "learning_rate": 3.643792959854342e-05, "loss": 0.4094, "step": 17215 }, { "epoch": 2.0079291044776117, "grad_norm": 0.44449371418132566, "learning_rate": 3.6429966763975636e-05, "loss": 0.4026, "step": 17220 }, { "epoch": 2.008512126865672, "grad_norm": 0.43500057395824304, "learning_rate": 3.642200260160735e-05, "loss": 0.4202, "step": 17225 }, { "epoch": 2.0090951492537314, "grad_norm": 0.4155992350702752, "learning_rate": 3.641403711262277e-05, "loss": 0.3987, "step": 17230 }, { "epoch": 2.009678171641791, "grad_norm": 0.41312574634007154, "learning_rate": 3.6406070298206265e-05, "loss": 0.418, "step": 17235 }, { "epoch": 2.0102611940298507, "grad_norm": 0.446437880517514, "learning_rate": 3.639810215954245e-05, "loss": 0.4117, "step": 17240 }, { "epoch": 2.0108442164179103, "grad_norm": 0.38507786194850685, "learning_rate": 3.639013269781609e-05, "loss": 0.4016, "step": 17245 }, { "epoch": 2.01142723880597, "grad_norm": 0.4094945005900366, "learning_rate": 3.638216191421218e-05, "loss": 0.3944, "step": 17250 }, { "epoch": 2.01201026119403, "grad_norm": 0.4471885484091221, "learning_rate": 3.637418980991589e-05, "loss": 0.4132, "step": 17255 }, { "epoch": 2.0125932835820897, "grad_norm": 0.4416582578431907, "learning_rate": 3.6366216386112605e-05, "loss": 0.4041, "step": 17260 }, { "epoch": 2.0131763059701493, "grad_norm": 0.44700347809049584, "learning_rate": 3.635824164398789e-05, "loss": 0.414, "step": 17265 }, { "epoch": 2.013759328358209, "grad_norm": 0.4085205455503069, "learning_rate": 3.635026558472752e-05, "loss": 0.4136, "step": 17270 }, { "epoch": 2.0143423507462686, "grad_norm": 0.3860550897181453, "learning_rate": 3.634228820951744e-05, "loss": 0.3729, "step": 17275 }, { "epoch": 2.014925373134328, "grad_norm": 0.413879237651807, "learning_rate": 3.633430951954383e-05, "loss": 0.4095, "step": 17280 }, { "epoch": 2.0155083955223883, "grad_norm": 0.4380213833210687, "learning_rate": 3.6326329515993055e-05, "loss": 0.4137, "step": 17285 }, { "epoch": 2.016091417910448, "grad_norm": 0.4050950469907184, "learning_rate": 3.631834820005163e-05, "loss": 0.3836, "step": 17290 }, { "epoch": 2.0166744402985075, "grad_norm": 0.4484025706465973, "learning_rate": 3.6310365572906314e-05, "loss": 0.3881, "step": 17295 }, { "epoch": 2.017257462686567, "grad_norm": 0.3826212858166743, "learning_rate": 3.6302381635744056e-05, "loss": 0.3814, "step": 17300 }, { "epoch": 2.0178404850746268, "grad_norm": 0.43005810923375426, "learning_rate": 3.629439638975199e-05, "loss": 0.4003, "step": 17305 }, { "epoch": 2.0184235074626864, "grad_norm": 0.410666288794066, "learning_rate": 3.628640983611744e-05, "loss": 0.408, "step": 17310 }, { "epoch": 2.0190065298507465, "grad_norm": 0.4918541323710734, "learning_rate": 3.6278421976027926e-05, "loss": 0.4103, "step": 17315 }, { "epoch": 2.019589552238806, "grad_norm": 0.390538141531701, "learning_rate": 3.6270432810671176e-05, "loss": 0.4314, "step": 17320 }, { "epoch": 2.0201725746268657, "grad_norm": 0.4457795006339435, "learning_rate": 3.6262442341235105e-05, "loss": 0.4374, "step": 17325 }, { "epoch": 2.0207555970149254, "grad_norm": 0.380497746108089, "learning_rate": 3.62544505689078e-05, "loss": 0.3921, "step": 17330 }, { "epoch": 2.021338619402985, "grad_norm": 5.372828015954768, "learning_rate": 3.6246457494877585e-05, "loss": 0.3873, "step": 17335 }, { "epoch": 2.0219216417910446, "grad_norm": 0.45641737776863933, "learning_rate": 3.623846312033294e-05, "loss": 0.4123, "step": 17340 }, { "epoch": 2.0225046641791047, "grad_norm": 0.43416818803809887, "learning_rate": 3.623046744646254e-05, "loss": 0.424, "step": 17345 }, { "epoch": 2.0230876865671643, "grad_norm": 0.4204419218483119, "learning_rate": 3.622247047445529e-05, "loss": 0.3923, "step": 17350 }, { "epoch": 2.023670708955224, "grad_norm": 0.49827384849949685, "learning_rate": 3.6214472205500256e-05, "loss": 0.4653, "step": 17355 }, { "epoch": 2.0242537313432836, "grad_norm": 0.430083259667377, "learning_rate": 3.6206472640786696e-05, "loss": 0.4285, "step": 17360 }, { "epoch": 2.024836753731343, "grad_norm": 0.4167994726809484, "learning_rate": 3.6198471781504076e-05, "loss": 0.399, "step": 17365 }, { "epoch": 2.025419776119403, "grad_norm": 0.3992188423609367, "learning_rate": 3.619046962884204e-05, "loss": 0.4007, "step": 17370 }, { "epoch": 2.026002798507463, "grad_norm": 0.40863350972882084, "learning_rate": 3.618246618399043e-05, "loss": 0.3853, "step": 17375 }, { "epoch": 2.0265858208955225, "grad_norm": 0.4439383038448917, "learning_rate": 3.617446144813929e-05, "loss": 0.4105, "step": 17380 }, { "epoch": 2.027168843283582, "grad_norm": 0.4482173126534646, "learning_rate": 3.616645542247885e-05, "loss": 0.389, "step": 17385 }, { "epoch": 2.027751865671642, "grad_norm": 0.40762063689592337, "learning_rate": 3.6158448108199515e-05, "loss": 0.4125, "step": 17390 }, { "epoch": 2.0283348880597014, "grad_norm": 0.41610468738070305, "learning_rate": 3.6150439506491915e-05, "loss": 0.3968, "step": 17395 }, { "epoch": 2.028917910447761, "grad_norm": 0.4490666758996821, "learning_rate": 3.614242961854683e-05, "loss": 0.3972, "step": 17400 }, { "epoch": 2.029500932835821, "grad_norm": 0.707190655829145, "learning_rate": 3.6134418445555275e-05, "loss": 0.4201, "step": 17405 }, { "epoch": 2.0300839552238807, "grad_norm": 0.4761853979925262, "learning_rate": 3.6126405988708424e-05, "loss": 0.4067, "step": 17410 }, { "epoch": 2.0306669776119404, "grad_norm": 0.4126466981845711, "learning_rate": 3.611839224919765e-05, "loss": 0.4175, "step": 17415 }, { "epoch": 2.03125, "grad_norm": 0.4036158369871603, "learning_rate": 3.611037722821452e-05, "loss": 0.3865, "step": 17420 }, { "epoch": 2.0318330223880596, "grad_norm": 0.4277080064913996, "learning_rate": 3.6102360926950796e-05, "loss": 0.3837, "step": 17425 }, { "epoch": 2.0324160447761193, "grad_norm": 0.4345897946626314, "learning_rate": 3.609434334659842e-05, "loss": 0.4133, "step": 17430 }, { "epoch": 2.0329990671641793, "grad_norm": 0.5322124187508095, "learning_rate": 3.608632448834954e-05, "loss": 0.4251, "step": 17435 }, { "epoch": 2.033582089552239, "grad_norm": 0.4320842291475738, "learning_rate": 3.607830435339648e-05, "loss": 0.4183, "step": 17440 }, { "epoch": 2.0341651119402986, "grad_norm": 0.37156559821800766, "learning_rate": 3.6070282942931744e-05, "loss": 0.4307, "step": 17445 }, { "epoch": 2.034748134328358, "grad_norm": 0.5350659867774648, "learning_rate": 3.606226025814805e-05, "loss": 0.4295, "step": 17450 }, { "epoch": 2.035331156716418, "grad_norm": 0.44133508577698394, "learning_rate": 3.605423630023829e-05, "loss": 0.4014, "step": 17455 }, { "epoch": 2.0359141791044775, "grad_norm": 0.43885446890854674, "learning_rate": 3.604621107039555e-05, "loss": 0.4162, "step": 17460 }, { "epoch": 2.036497201492537, "grad_norm": 0.48108449465160613, "learning_rate": 3.603818456981312e-05, "loss": 0.4243, "step": 17465 }, { "epoch": 2.037080223880597, "grad_norm": 0.4169175488663904, "learning_rate": 3.6030156799684435e-05, "loss": 0.4115, "step": 17470 }, { "epoch": 2.037663246268657, "grad_norm": 0.4833172600967252, "learning_rate": 3.602212776120317e-05, "loss": 0.4363, "step": 17475 }, { "epoch": 2.0382462686567164, "grad_norm": 0.46381822008841517, "learning_rate": 3.601409745556315e-05, "loss": 0.4045, "step": 17480 }, { "epoch": 2.038829291044776, "grad_norm": 0.43876403353906346, "learning_rate": 3.6006065883958425e-05, "loss": 0.3992, "step": 17485 }, { "epoch": 2.0394123134328357, "grad_norm": 0.44697008478487776, "learning_rate": 3.5998033047583194e-05, "loss": 0.4048, "step": 17490 }, { "epoch": 2.0399953358208953, "grad_norm": 0.37822668527686987, "learning_rate": 3.598999894763187e-05, "loss": 0.3751, "step": 17495 }, { "epoch": 2.0405783582089554, "grad_norm": 0.4064353495153569, "learning_rate": 3.598196358529906e-05, "loss": 0.4138, "step": 17500 }, { "epoch": 2.041161380597015, "grad_norm": 0.3799857869374489, "learning_rate": 3.597392696177953e-05, "loss": 0.3881, "step": 17505 }, { "epoch": 2.0417444029850746, "grad_norm": 0.40655421341103043, "learning_rate": 3.596588907826824e-05, "loss": 0.4058, "step": 17510 }, { "epoch": 2.0423274253731343, "grad_norm": 0.445390124206304, "learning_rate": 3.595784993596036e-05, "loss": 0.395, "step": 17515 }, { "epoch": 2.042910447761194, "grad_norm": 0.41662013675369436, "learning_rate": 3.5949809536051235e-05, "loss": 0.4273, "step": 17520 }, { "epoch": 2.0434934701492535, "grad_norm": 0.4259131767978619, "learning_rate": 3.594176787973638e-05, "loss": 0.4092, "step": 17525 }, { "epoch": 2.0440764925373136, "grad_norm": 0.3834830846569681, "learning_rate": 3.593372496821154e-05, "loss": 0.404, "step": 17530 }, { "epoch": 2.0446595149253732, "grad_norm": 0.3996890940345802, "learning_rate": 3.5925680802672584e-05, "loss": 0.4006, "step": 17535 }, { "epoch": 2.045242537313433, "grad_norm": 0.413506389504434, "learning_rate": 3.591763538431563e-05, "loss": 0.3943, "step": 17540 }, { "epoch": 2.0458255597014925, "grad_norm": 1.3631019268732503, "learning_rate": 3.590958871433695e-05, "loss": 0.3917, "step": 17545 }, { "epoch": 2.046408582089552, "grad_norm": 0.4650214211421635, "learning_rate": 3.5901540793933e-05, "loss": 0.3964, "step": 17550 }, { "epoch": 2.0469916044776117, "grad_norm": 0.4004480748536821, "learning_rate": 3.5893491624300416e-05, "loss": 0.4033, "step": 17555 }, { "epoch": 2.047574626865672, "grad_norm": 0.427543704792774, "learning_rate": 3.5885441206636065e-05, "loss": 0.3948, "step": 17560 }, { "epoch": 2.0481576492537314, "grad_norm": 0.44134543159574047, "learning_rate": 3.587738954213694e-05, "loss": 0.4285, "step": 17565 }, { "epoch": 2.048740671641791, "grad_norm": 0.4966737984480068, "learning_rate": 3.586933663200026e-05, "loss": 0.4287, "step": 17570 }, { "epoch": 2.0493236940298507, "grad_norm": 0.41118745931821765, "learning_rate": 3.586128247742341e-05, "loss": 0.4186, "step": 17575 }, { "epoch": 2.0499067164179103, "grad_norm": 0.4024292533172103, "learning_rate": 3.585322707960397e-05, "loss": 0.3918, "step": 17580 }, { "epoch": 2.05048973880597, "grad_norm": 0.4367564233329026, "learning_rate": 3.584517043973969e-05, "loss": 0.3929, "step": 17585 }, { "epoch": 2.05107276119403, "grad_norm": 0.39290442314481433, "learning_rate": 3.583711255902853e-05, "loss": 0.3763, "step": 17590 }, { "epoch": 2.0516557835820897, "grad_norm": 0.4096315820544621, "learning_rate": 3.58290534386686e-05, "loss": 0.4168, "step": 17595 }, { "epoch": 2.0522388059701493, "grad_norm": 0.4237540152963559, "learning_rate": 3.5820993079858235e-05, "loss": 0.4077, "step": 17600 }, { "epoch": 2.052821828358209, "grad_norm": 0.41721732565734926, "learning_rate": 3.581293148379592e-05, "loss": 0.4256, "step": 17605 }, { "epoch": 2.0534048507462686, "grad_norm": 0.427197849133855, "learning_rate": 3.580486865168034e-05, "loss": 0.4099, "step": 17610 }, { "epoch": 2.053987873134328, "grad_norm": 0.44396780922032564, "learning_rate": 3.579680458471037e-05, "loss": 0.4134, "step": 17615 }, { "epoch": 2.0545708955223883, "grad_norm": 0.40085397386916805, "learning_rate": 3.5788739284085044e-05, "loss": 0.4077, "step": 17620 }, { "epoch": 2.055153917910448, "grad_norm": 0.4183804859387029, "learning_rate": 3.57806727510036e-05, "loss": 0.3991, "step": 17625 }, { "epoch": 2.0557369402985075, "grad_norm": 0.422113085026162, "learning_rate": 3.577260498666546e-05, "loss": 0.4177, "step": 17630 }, { "epoch": 2.056319962686567, "grad_norm": 0.4010894845996832, "learning_rate": 3.5764535992270226e-05, "loss": 0.4245, "step": 17635 }, { "epoch": 2.0569029850746268, "grad_norm": 0.4358002446247879, "learning_rate": 3.575646576901767e-05, "loss": 0.3888, "step": 17640 }, { "epoch": 2.0574860074626864, "grad_norm": 0.417588300480475, "learning_rate": 3.5748394318107765e-05, "loss": 0.4181, "step": 17645 }, { "epoch": 2.0580690298507465, "grad_norm": 0.3896131004903475, "learning_rate": 3.5740321640740646e-05, "loss": 0.3995, "step": 17650 }, { "epoch": 2.058652052238806, "grad_norm": 0.44817209090164584, "learning_rate": 3.573224773811665e-05, "loss": 0.4132, "step": 17655 }, { "epoch": 2.0592350746268657, "grad_norm": 0.4601241879579617, "learning_rate": 3.57241726114363e-05, "loss": 0.4123, "step": 17660 }, { "epoch": 2.0598180970149254, "grad_norm": 0.4332211901795128, "learning_rate": 3.5716096261900274e-05, "loss": 0.4097, "step": 17665 }, { "epoch": 2.060401119402985, "grad_norm": 0.4435205427580933, "learning_rate": 3.570801869070945e-05, "loss": 0.4233, "step": 17670 }, { "epoch": 2.0609841417910446, "grad_norm": 0.4089469394108028, "learning_rate": 3.5699939899064894e-05, "loss": 0.3906, "step": 17675 }, { "epoch": 2.0615671641791047, "grad_norm": 0.39929033875936315, "learning_rate": 3.5691859888167846e-05, "loss": 0.3697, "step": 17680 }, { "epoch": 2.0621501865671643, "grad_norm": 0.4032698121477907, "learning_rate": 3.5683778659219714e-05, "loss": 0.3849, "step": 17685 }, { "epoch": 2.062733208955224, "grad_norm": 0.39957157338321503, "learning_rate": 3.5675696213422105e-05, "loss": 0.4, "step": 17690 }, { "epoch": 2.0633162313432836, "grad_norm": 0.47575181622872736, "learning_rate": 3.56676125519768e-05, "loss": 0.3997, "step": 17695 }, { "epoch": 2.063899253731343, "grad_norm": 0.42660736711985703, "learning_rate": 3.5659527676085774e-05, "loss": 0.4026, "step": 17700 }, { "epoch": 2.064482276119403, "grad_norm": 0.42536589886353837, "learning_rate": 3.565144158695115e-05, "loss": 0.4148, "step": 17705 }, { "epoch": 2.065065298507463, "grad_norm": 0.420815781715264, "learning_rate": 3.564335428577526e-05, "loss": 0.4157, "step": 17710 }, { "epoch": 2.0656483208955225, "grad_norm": 0.37727500584762214, "learning_rate": 3.5635265773760625e-05, "loss": 0.3936, "step": 17715 }, { "epoch": 2.066231343283582, "grad_norm": 0.38246861442370994, "learning_rate": 3.56271760521099e-05, "loss": 0.3855, "step": 17720 }, { "epoch": 2.066814365671642, "grad_norm": 0.4083430209998539, "learning_rate": 3.561908512202597e-05, "loss": 0.4223, "step": 17725 }, { "epoch": 2.0673973880597014, "grad_norm": 0.40292647230277046, "learning_rate": 3.561099298471187e-05, "loss": 0.3884, "step": 17730 }, { "epoch": 2.067980410447761, "grad_norm": 0.406003317049021, "learning_rate": 3.560289964137083e-05, "loss": 0.4112, "step": 17735 }, { "epoch": 2.0685634328358207, "grad_norm": 0.4459596903475761, "learning_rate": 3.559480509320625e-05, "loss": 0.4178, "step": 17740 }, { "epoch": 2.0691464552238807, "grad_norm": 0.42667037304533173, "learning_rate": 3.558670934142171e-05, "loss": 0.3871, "step": 17745 }, { "epoch": 2.0697294776119404, "grad_norm": 0.41694123362390095, "learning_rate": 3.557861238722097e-05, "loss": 0.3812, "step": 17750 }, { "epoch": 2.0703125, "grad_norm": 0.4639670327141468, "learning_rate": 3.557051423180797e-05, "loss": 0.406, "step": 17755 }, { "epoch": 2.0708955223880596, "grad_norm": 0.4536747585597657, "learning_rate": 3.556241487638682e-05, "loss": 0.4184, "step": 17760 }, { "epoch": 2.0714785447761193, "grad_norm": 0.463341281056315, "learning_rate": 3.5554314322161844e-05, "loss": 0.4251, "step": 17765 }, { "epoch": 2.0720615671641793, "grad_norm": 0.4236756562769349, "learning_rate": 3.554621257033749e-05, "loss": 0.4342, "step": 17770 }, { "epoch": 2.072644589552239, "grad_norm": 0.40098119433640333, "learning_rate": 3.5538109622118414e-05, "loss": 0.4252, "step": 17775 }, { "epoch": 2.0732276119402986, "grad_norm": 0.5462436217767143, "learning_rate": 3.5530005478709446e-05, "loss": 0.4228, "step": 17780 }, { "epoch": 2.073810634328358, "grad_norm": 0.5377323817355546, "learning_rate": 3.552190014131562e-05, "loss": 0.4102, "step": 17785 }, { "epoch": 2.074393656716418, "grad_norm": 0.4414133041774776, "learning_rate": 3.551379361114209e-05, "loss": 0.3974, "step": 17790 }, { "epoch": 2.0749766791044775, "grad_norm": 0.43093720318960205, "learning_rate": 3.550568588939423e-05, "loss": 0.3919, "step": 17795 }, { "epoch": 2.075559701492537, "grad_norm": 0.3987041971189923, "learning_rate": 3.549757697727759e-05, "loss": 0.4058, "step": 17800 }, { "epoch": 2.076142723880597, "grad_norm": 0.46671630573375056, "learning_rate": 3.5489466875997876e-05, "loss": 0.4105, "step": 17805 }, { "epoch": 2.076725746268657, "grad_norm": 0.42786744581372393, "learning_rate": 3.548135558676098e-05, "loss": 0.3887, "step": 17810 }, { "epoch": 2.0773087686567164, "grad_norm": 0.41607289536770725, "learning_rate": 3.547324311077299e-05, "loss": 0.4048, "step": 17815 }, { "epoch": 2.077891791044776, "grad_norm": 0.45844722867997517, "learning_rate": 3.546512944924014e-05, "loss": 0.4312, "step": 17820 }, { "epoch": 2.0784748134328357, "grad_norm": 0.6436067039745295, "learning_rate": 3.5457014603368844e-05, "loss": 0.3995, "step": 17825 }, { "epoch": 2.0790578358208953, "grad_norm": 0.40376232253464883, "learning_rate": 3.544889857436573e-05, "loss": 0.4047, "step": 17830 }, { "epoch": 2.0796408582089554, "grad_norm": 0.4424008134172488, "learning_rate": 3.544078136343755e-05, "loss": 0.3986, "step": 17835 }, { "epoch": 2.080223880597015, "grad_norm": 0.45334469023841945, "learning_rate": 3.5432662971791264e-05, "loss": 0.4157, "step": 17840 }, { "epoch": 2.0808069029850746, "grad_norm": 0.4197736763225393, "learning_rate": 3.5424543400634e-05, "loss": 0.4154, "step": 17845 }, { "epoch": 2.0813899253731343, "grad_norm": 1.919560356302374, "learning_rate": 3.541642265117306e-05, "loss": 0.4086, "step": 17850 }, { "epoch": 2.081972947761194, "grad_norm": 0.4376616144845669, "learning_rate": 3.5408300724615915e-05, "loss": 0.3762, "step": 17855 }, { "epoch": 2.0825559701492535, "grad_norm": 0.394848239549364, "learning_rate": 3.540017762217023e-05, "loss": 0.405, "step": 17860 }, { "epoch": 2.0831389925373136, "grad_norm": 0.39351720162653736, "learning_rate": 3.539205334504382e-05, "loss": 0.3845, "step": 17865 }, { "epoch": 2.0837220149253732, "grad_norm": 0.4296218352693514, "learning_rate": 3.5383927894444694e-05, "loss": 0.4308, "step": 17870 }, { "epoch": 2.084305037313433, "grad_norm": 0.44104359649301106, "learning_rate": 3.537580127158103e-05, "loss": 0.442, "step": 17875 }, { "epoch": 2.0848880597014925, "grad_norm": 0.3921743818551184, "learning_rate": 3.5367673477661174e-05, "loss": 0.3775, "step": 17880 }, { "epoch": 2.085471082089552, "grad_norm": 0.45131654218140227, "learning_rate": 3.5359544513893655e-05, "loss": 0.4044, "step": 17885 }, { "epoch": 2.0860541044776117, "grad_norm": 0.4537637907819424, "learning_rate": 3.535141438148717e-05, "loss": 0.4257, "step": 17890 }, { "epoch": 2.086637126865672, "grad_norm": 0.4064131288923466, "learning_rate": 3.53432830816506e-05, "loss": 0.414, "step": 17895 }, { "epoch": 2.0872201492537314, "grad_norm": 0.3900103655232811, "learning_rate": 3.533515061559297e-05, "loss": 0.396, "step": 17900 }, { "epoch": 2.087803171641791, "grad_norm": 0.42363353505397877, "learning_rate": 3.532701698452352e-05, "loss": 0.4104, "step": 17905 }, { "epoch": 2.0883861940298507, "grad_norm": 0.4775771231746664, "learning_rate": 3.5318882189651635e-05, "loss": 0.4063, "step": 17910 }, { "epoch": 2.0889692164179103, "grad_norm": 0.45616241876431546, "learning_rate": 3.531074623218689e-05, "loss": 0.4081, "step": 17915 }, { "epoch": 2.08955223880597, "grad_norm": 0.42170797688086853, "learning_rate": 3.5302609113339e-05, "loss": 0.3844, "step": 17920 }, { "epoch": 2.09013526119403, "grad_norm": 0.439242908636829, "learning_rate": 3.52944708343179e-05, "loss": 0.4012, "step": 17925 }, { "epoch": 2.0907182835820897, "grad_norm": 0.46414434867866833, "learning_rate": 3.5286331396333675e-05, "loss": 0.4167, "step": 17930 }, { "epoch": 2.0913013059701493, "grad_norm": 0.42598616305498765, "learning_rate": 3.527819080059657e-05, "loss": 0.433, "step": 17935 }, { "epoch": 2.091884328358209, "grad_norm": 1.9646733034362873, "learning_rate": 3.5270049048317016e-05, "loss": 0.3971, "step": 17940 }, { "epoch": 2.0924673507462686, "grad_norm": 0.442469170939552, "learning_rate": 3.5261906140705615e-05, "loss": 0.4112, "step": 17945 }, { "epoch": 2.093050373134328, "grad_norm": 0.4102310568186522, "learning_rate": 3.525376207897314e-05, "loss": 0.3888, "step": 17950 }, { "epoch": 2.0936333955223883, "grad_norm": 0.40470558046344235, "learning_rate": 3.524561686433053e-05, "loss": 0.39, "step": 17955 }, { "epoch": 2.094216417910448, "grad_norm": 0.4257906695133037, "learning_rate": 3.5237470497988905e-05, "loss": 0.4057, "step": 17960 }, { "epoch": 2.0947994402985075, "grad_norm": 0.4570530733280104, "learning_rate": 3.522932298115955e-05, "loss": 0.3993, "step": 17965 }, { "epoch": 2.095382462686567, "grad_norm": 0.43448859420515695, "learning_rate": 3.5221174315053935e-05, "loss": 0.388, "step": 17970 }, { "epoch": 2.0959654850746268, "grad_norm": 0.45632890224050665, "learning_rate": 3.5213024500883666e-05, "loss": 0.3922, "step": 17975 }, { "epoch": 2.0965485074626864, "grad_norm": 0.40012892603825206, "learning_rate": 3.520487353986056e-05, "loss": 0.4059, "step": 17980 }, { "epoch": 2.0971315298507465, "grad_norm": 0.41112655096189904, "learning_rate": 3.5196721433196575e-05, "loss": 0.4021, "step": 17985 }, { "epoch": 2.097714552238806, "grad_norm": 0.41389615796919993, "learning_rate": 3.518856818210387e-05, "loss": 0.3994, "step": 17990 }, { "epoch": 2.0982975746268657, "grad_norm": 0.4131593212558457, "learning_rate": 3.5180413787794724e-05, "loss": 0.451, "step": 17995 }, { "epoch": 2.0988805970149254, "grad_norm": 0.42752541290912965, "learning_rate": 3.517225825148164e-05, "loss": 0.4025, "step": 18000 }, { "epoch": 2.099463619402985, "grad_norm": 0.502384225015268, "learning_rate": 3.516410157437727e-05, "loss": 0.4255, "step": 18005 }, { "epoch": 2.1000466417910446, "grad_norm": 0.42592470933952536, "learning_rate": 3.515594375769442e-05, "loss": 0.4134, "step": 18010 }, { "epoch": 2.1006296641791047, "grad_norm": 0.4755593904042131, "learning_rate": 3.514778480264609e-05, "loss": 0.4399, "step": 18015 }, { "epoch": 2.1012126865671643, "grad_norm": 0.39159887557747713, "learning_rate": 3.513962471044543e-05, "loss": 0.4093, "step": 18020 }, { "epoch": 2.101795708955224, "grad_norm": 0.44229008867276415, "learning_rate": 3.513146348230578e-05, "loss": 0.4095, "step": 18025 }, { "epoch": 2.1023787313432836, "grad_norm": 0.4463152827359944, "learning_rate": 3.512330111944062e-05, "loss": 0.4183, "step": 18030 }, { "epoch": 2.102961753731343, "grad_norm": 0.4263982192743119, "learning_rate": 3.511513762306363e-05, "loss": 0.4039, "step": 18035 }, { "epoch": 2.103544776119403, "grad_norm": 0.3764688587303262, "learning_rate": 3.510697299438864e-05, "loss": 0.3985, "step": 18040 }, { "epoch": 2.104127798507463, "grad_norm": 0.7883615400214776, "learning_rate": 3.509880723462965e-05, "loss": 0.4069, "step": 18045 }, { "epoch": 2.1047108208955225, "grad_norm": 0.45316947689705445, "learning_rate": 3.509064034500082e-05, "loss": 0.4279, "step": 18050 }, { "epoch": 2.105293843283582, "grad_norm": 0.4323162177598834, "learning_rate": 3.50824723267165e-05, "loss": 0.4259, "step": 18055 }, { "epoch": 2.105876865671642, "grad_norm": 0.4157293684577551, "learning_rate": 3.50743031809912e-05, "loss": 0.4008, "step": 18060 }, { "epoch": 2.1064598880597014, "grad_norm": 0.41443560930906936, "learning_rate": 3.506613290903958e-05, "loss": 0.4091, "step": 18065 }, { "epoch": 2.107042910447761, "grad_norm": 1.6540605463548794, "learning_rate": 3.505796151207651e-05, "loss": 0.4126, "step": 18070 }, { "epoch": 2.1076259328358207, "grad_norm": 0.45561508384631727, "learning_rate": 3.504978899131696e-05, "loss": 0.3996, "step": 18075 }, { "epoch": 2.1082089552238807, "grad_norm": 0.4434447916074442, "learning_rate": 3.504161534797612e-05, "loss": 0.4, "step": 18080 }, { "epoch": 2.1087919776119404, "grad_norm": 0.4279181497681623, "learning_rate": 3.503344058326934e-05, "loss": 0.4212, "step": 18085 }, { "epoch": 2.109375, "grad_norm": 0.4186326705057981, "learning_rate": 3.5025264698412126e-05, "loss": 0.4048, "step": 18090 }, { "epoch": 2.1099580223880596, "grad_norm": 0.42558836872774103, "learning_rate": 3.5017087694620154e-05, "loss": 0.4056, "step": 18095 }, { "epoch": 2.1105410447761193, "grad_norm": 0.4585705588952108, "learning_rate": 3.500890957310926e-05, "loss": 0.449, "step": 18100 }, { "epoch": 2.1111240671641793, "grad_norm": 0.39868856032720257, "learning_rate": 3.500073033509546e-05, "loss": 0.3963, "step": 18105 }, { "epoch": 2.111707089552239, "grad_norm": 0.4518611231636433, "learning_rate": 3.4992549981794915e-05, "loss": 0.4087, "step": 18110 }, { "epoch": 2.1122901119402986, "grad_norm": 0.41093552739258665, "learning_rate": 3.498436851442398e-05, "loss": 0.4011, "step": 18115 }, { "epoch": 2.112873134328358, "grad_norm": 0.4551342557687998, "learning_rate": 3.497618593419916e-05, "loss": 0.4195, "step": 18120 }, { "epoch": 2.113456156716418, "grad_norm": 0.4249640753172037, "learning_rate": 3.496800224233713e-05, "loss": 0.4222, "step": 18125 }, { "epoch": 2.1140391791044775, "grad_norm": 0.4317873459107346, "learning_rate": 3.495981744005471e-05, "loss": 0.3798, "step": 18130 }, { "epoch": 2.114622201492537, "grad_norm": 0.43776493112473797, "learning_rate": 3.49516315285689e-05, "loss": 0.4436, "step": 18135 }, { "epoch": 2.115205223880597, "grad_norm": 0.4536385624082108, "learning_rate": 3.494344450909689e-05, "loss": 0.4279, "step": 18140 }, { "epoch": 2.115788246268657, "grad_norm": 0.45679071602941196, "learning_rate": 3.4935256382855996e-05, "loss": 0.4218, "step": 18145 }, { "epoch": 2.1163712686567164, "grad_norm": 0.43460120047194506, "learning_rate": 3.492706715106372e-05, "loss": 0.3872, "step": 18150 }, { "epoch": 2.116954291044776, "grad_norm": 0.43466514815981416, "learning_rate": 3.491887681493771e-05, "loss": 0.4079, "step": 18155 }, { "epoch": 2.1175373134328357, "grad_norm": 0.43627926272393935, "learning_rate": 3.491068537569581e-05, "loss": 0.4141, "step": 18160 }, { "epoch": 2.1181203358208953, "grad_norm": 0.400505012948606, "learning_rate": 3.4902492834555994e-05, "loss": 0.4082, "step": 18165 }, { "epoch": 2.1187033582089554, "grad_norm": 0.40465376681171294, "learning_rate": 3.489429919273642e-05, "loss": 0.3956, "step": 18170 }, { "epoch": 2.119286380597015, "grad_norm": 0.4656852789402069, "learning_rate": 3.488610445145539e-05, "loss": 0.4346, "step": 18175 }, { "epoch": 2.1198694029850746, "grad_norm": 0.41113986376765876, "learning_rate": 3.4877908611931406e-05, "loss": 0.4083, "step": 18180 }, { "epoch": 2.1204524253731343, "grad_norm": 0.42259438586971515, "learning_rate": 3.48697116753831e-05, "loss": 0.4146, "step": 18185 }, { "epoch": 2.121035447761194, "grad_norm": 0.4004898003730871, "learning_rate": 3.486151364302928e-05, "loss": 0.392, "step": 18190 }, { "epoch": 2.1216184701492535, "grad_norm": 0.4492710402394599, "learning_rate": 3.485331451608891e-05, "loss": 0.406, "step": 18195 }, { "epoch": 2.1222014925373136, "grad_norm": 0.41841886702855924, "learning_rate": 3.484511429578113e-05, "loss": 0.4094, "step": 18200 }, { "epoch": 2.1227845149253732, "grad_norm": 0.383432454482924, "learning_rate": 3.483691298332522e-05, "loss": 0.3925, "step": 18205 }, { "epoch": 2.123367537313433, "grad_norm": 0.40871040660467367, "learning_rate": 3.482871057994065e-05, "loss": 0.4174, "step": 18210 }, { "epoch": 2.1239505597014925, "grad_norm": 0.41203869297679074, "learning_rate": 3.4820507086847036e-05, "loss": 0.4183, "step": 18215 }, { "epoch": 2.124533582089552, "grad_norm": 0.39172999651212204, "learning_rate": 3.481230250526416e-05, "loss": 0.3882, "step": 18220 }, { "epoch": 2.1251166044776117, "grad_norm": 0.39979542942187735, "learning_rate": 3.480409683641196e-05, "loss": 0.3902, "step": 18225 }, { "epoch": 2.125699626865672, "grad_norm": 0.4188582512860355, "learning_rate": 3.479589008151054e-05, "loss": 0.4008, "step": 18230 }, { "epoch": 2.1262826492537314, "grad_norm": 0.4018983973265039, "learning_rate": 3.4787682241780164e-05, "loss": 0.4247, "step": 18235 }, { "epoch": 2.126865671641791, "grad_norm": 0.492136567359692, "learning_rate": 3.477947331844127e-05, "loss": 0.4028, "step": 18240 }, { "epoch": 2.1274486940298507, "grad_norm": 0.4746116975916419, "learning_rate": 3.477126331271445e-05, "loss": 0.4032, "step": 18245 }, { "epoch": 2.1280317164179103, "grad_norm": 0.46327889070060085, "learning_rate": 3.476305222582042e-05, "loss": 0.4051, "step": 18250 }, { "epoch": 2.12861473880597, "grad_norm": 0.9946701863145109, "learning_rate": 3.475484005898013e-05, "loss": 0.419, "step": 18255 }, { "epoch": 2.12919776119403, "grad_norm": 0.457118037673843, "learning_rate": 3.4746626813414624e-05, "loss": 0.4194, "step": 18260 }, { "epoch": 2.1297807835820897, "grad_norm": 0.42676681216035084, "learning_rate": 3.473841249034514e-05, "loss": 0.3873, "step": 18265 }, { "epoch": 2.1303638059701493, "grad_norm": 0.4175294629764706, "learning_rate": 3.4730197090993084e-05, "loss": 0.3949, "step": 18270 }, { "epoch": 2.130946828358209, "grad_norm": 0.4429123065062491, "learning_rate": 3.4721980616579984e-05, "loss": 0.3859, "step": 18275 }, { "epoch": 2.1315298507462686, "grad_norm": 0.42960011018651256, "learning_rate": 3.471376306832756e-05, "loss": 0.3879, "step": 18280 }, { "epoch": 2.132112873134328, "grad_norm": 0.3977545876838716, "learning_rate": 3.4705544447457686e-05, "loss": 0.3659, "step": 18285 }, { "epoch": 2.1326958955223883, "grad_norm": 0.4071063253380223, "learning_rate": 3.4697324755192387e-05, "loss": 0.4226, "step": 18290 }, { "epoch": 2.133278917910448, "grad_norm": 0.3928332736265074, "learning_rate": 3.468910399275387e-05, "loss": 0.4177, "step": 18295 }, { "epoch": 2.1338619402985075, "grad_norm": 0.4426832092037558, "learning_rate": 3.468088216136445e-05, "loss": 0.4196, "step": 18300 }, { "epoch": 2.134444962686567, "grad_norm": 0.4412295617512612, "learning_rate": 3.467265926224667e-05, "loss": 0.4098, "step": 18305 }, { "epoch": 2.1350279850746268, "grad_norm": 0.5185023740026237, "learning_rate": 3.466443529662317e-05, "loss": 0.4195, "step": 18310 }, { "epoch": 2.1356110074626864, "grad_norm": 0.4505364691967898, "learning_rate": 3.465621026571679e-05, "loss": 0.4118, "step": 18315 }, { "epoch": 2.1361940298507465, "grad_norm": 0.40175822152730395, "learning_rate": 3.4647984170750506e-05, "loss": 0.3971, "step": 18320 }, { "epoch": 2.136777052238806, "grad_norm": 0.4230475965578641, "learning_rate": 3.463975701294747e-05, "loss": 0.4035, "step": 18325 }, { "epoch": 2.1373600746268657, "grad_norm": 0.46159035331101195, "learning_rate": 3.463152879353097e-05, "loss": 0.4105, "step": 18330 }, { "epoch": 2.1379430970149254, "grad_norm": 0.4081762936664928, "learning_rate": 3.462329951372446e-05, "loss": 0.3938, "step": 18335 }, { "epoch": 2.138526119402985, "grad_norm": 0.4162526437244533, "learning_rate": 3.4615069174751566e-05, "loss": 0.4065, "step": 18340 }, { "epoch": 2.1391091417910446, "grad_norm": 0.41395818841603066, "learning_rate": 3.460683777783605e-05, "loss": 0.3902, "step": 18345 }, { "epoch": 2.1396921641791047, "grad_norm": 0.4000044137100815, "learning_rate": 3.459860532420186e-05, "loss": 0.4215, "step": 18350 }, { "epoch": 2.1402751865671643, "grad_norm": 0.46193139026077534, "learning_rate": 3.459037181507307e-05, "loss": 0.3989, "step": 18355 }, { "epoch": 2.140858208955224, "grad_norm": 0.38117231649973093, "learning_rate": 3.4582137251673916e-05, "loss": 0.3833, "step": 18360 }, { "epoch": 2.1414412313432836, "grad_norm": 0.4004118722461508, "learning_rate": 3.4573901635228815e-05, "loss": 0.4215, "step": 18365 }, { "epoch": 2.142024253731343, "grad_norm": 0.4255852414170699, "learning_rate": 3.456566496696232e-05, "loss": 0.4079, "step": 18370 }, { "epoch": 2.142607276119403, "grad_norm": 0.4694852826837166, "learning_rate": 3.455742724809914e-05, "loss": 0.4061, "step": 18375 }, { "epoch": 2.143190298507463, "grad_norm": 0.40555512916946207, "learning_rate": 3.454918847986414e-05, "loss": 0.3846, "step": 18380 }, { "epoch": 2.1437733208955225, "grad_norm": 0.4686029056448322, "learning_rate": 3.4540948663482356e-05, "loss": 0.4174, "step": 18385 }, { "epoch": 2.144356343283582, "grad_norm": 0.4259521402032897, "learning_rate": 3.453270780017897e-05, "loss": 0.4211, "step": 18390 }, { "epoch": 2.144939365671642, "grad_norm": 0.42338287414652503, "learning_rate": 3.452446589117932e-05, "loss": 0.4064, "step": 18395 }, { "epoch": 2.1455223880597014, "grad_norm": 0.45445782962606823, "learning_rate": 3.451622293770889e-05, "loss": 0.4315, "step": 18400 }, { "epoch": 2.146105410447761, "grad_norm": 0.4193864890396944, "learning_rate": 3.450797894099332e-05, "loss": 0.4213, "step": 18405 }, { "epoch": 2.1466884328358207, "grad_norm": 0.4045938647292487, "learning_rate": 3.4499733902258446e-05, "loss": 0.3765, "step": 18410 }, { "epoch": 2.1472714552238807, "grad_norm": 0.4375966310441991, "learning_rate": 3.4491487822730194e-05, "loss": 0.4165, "step": 18415 }, { "epoch": 2.1478544776119404, "grad_norm": 0.4021322452390154, "learning_rate": 3.448324070363469e-05, "loss": 0.3987, "step": 18420 }, { "epoch": 2.1484375, "grad_norm": 0.656108292610777, "learning_rate": 3.447499254619821e-05, "loss": 0.4061, "step": 18425 }, { "epoch": 2.1490205223880596, "grad_norm": 0.4662551126290562, "learning_rate": 3.446674335164716e-05, "loss": 0.416, "step": 18430 }, { "epoch": 2.1496035447761193, "grad_norm": 0.4203431257433708, "learning_rate": 3.445849312120813e-05, "loss": 0.3833, "step": 18435 }, { "epoch": 2.1501865671641793, "grad_norm": 0.43701350646468623, "learning_rate": 3.445024185610783e-05, "loss": 0.4031, "step": 18440 }, { "epoch": 2.150769589552239, "grad_norm": 0.41464833300465653, "learning_rate": 3.444198955757316e-05, "loss": 0.4291, "step": 18445 }, { "epoch": 2.1513526119402986, "grad_norm": 0.38800303739700287, "learning_rate": 3.443373622683116e-05, "loss": 0.3794, "step": 18450 }, { "epoch": 2.151935634328358, "grad_norm": 0.5015747533688855, "learning_rate": 3.442548186510902e-05, "loss": 0.4224, "step": 18455 }, { "epoch": 2.152518656716418, "grad_norm": 0.44318396438444974, "learning_rate": 3.441722647363408e-05, "loss": 0.3991, "step": 18460 }, { "epoch": 2.1531016791044775, "grad_norm": 0.4629739351457824, "learning_rate": 3.4408970053633835e-05, "loss": 0.4237, "step": 18465 }, { "epoch": 2.153684701492537, "grad_norm": 0.46756060010063044, "learning_rate": 3.440071260633594e-05, "loss": 0.4228, "step": 18470 }, { "epoch": 2.154267723880597, "grad_norm": 0.43654862859242105, "learning_rate": 3.43924541329682e-05, "loss": 0.4125, "step": 18475 }, { "epoch": 2.154850746268657, "grad_norm": 0.3975759879864766, "learning_rate": 3.438419463475857e-05, "loss": 0.3953, "step": 18480 }, { "epoch": 2.1554337686567164, "grad_norm": 0.43559577817094774, "learning_rate": 3.437593411293516e-05, "loss": 0.4237, "step": 18485 }, { "epoch": 2.156016791044776, "grad_norm": 0.4153504610787961, "learning_rate": 3.436767256872621e-05, "loss": 0.4057, "step": 18490 }, { "epoch": 2.1565998134328357, "grad_norm": 0.39063787940198125, "learning_rate": 3.435941000336016e-05, "loss": 0.399, "step": 18495 }, { "epoch": 2.1571828358208953, "grad_norm": 0.41088878460192013, "learning_rate": 3.435114641806557e-05, "loss": 0.4191, "step": 18500 }, { "epoch": 2.1577658582089554, "grad_norm": 0.40787535265956926, "learning_rate": 3.434288181407114e-05, "loss": 0.4269, "step": 18505 }, { "epoch": 2.158348880597015, "grad_norm": 0.42008146833643994, "learning_rate": 3.433461619260575e-05, "loss": 0.4151, "step": 18510 }, { "epoch": 2.1589319029850746, "grad_norm": 0.4719089973719463, "learning_rate": 3.432634955489841e-05, "loss": 0.435, "step": 18515 }, { "epoch": 2.1595149253731343, "grad_norm": 0.4340721315473417, "learning_rate": 3.43180819021783e-05, "loss": 0.4143, "step": 18520 }, { "epoch": 2.160097947761194, "grad_norm": 0.412517457467355, "learning_rate": 3.430981323567475e-05, "loss": 0.4301, "step": 18525 }, { "epoch": 2.1606809701492535, "grad_norm": 0.4357379435528156, "learning_rate": 3.4301543556617206e-05, "loss": 0.396, "step": 18530 }, { "epoch": 2.1612639925373136, "grad_norm": 0.4456223599662238, "learning_rate": 3.42932728662353e-05, "loss": 0.4141, "step": 18535 }, { "epoch": 2.1618470149253732, "grad_norm": 0.4304404852290806, "learning_rate": 3.428500116575881e-05, "loss": 0.4279, "step": 18540 }, { "epoch": 2.162430037313433, "grad_norm": 0.4180683020284989, "learning_rate": 3.427672845641765e-05, "loss": 0.4214, "step": 18545 }, { "epoch": 2.1630130597014925, "grad_norm": 0.4013336763453828, "learning_rate": 3.42684547394419e-05, "loss": 0.391, "step": 18550 }, { "epoch": 2.163596082089552, "grad_norm": 0.4293341803829475, "learning_rate": 3.4260180016061784e-05, "loss": 0.4085, "step": 18555 }, { "epoch": 2.1641791044776117, "grad_norm": 0.4078104883279643, "learning_rate": 3.425190428750767e-05, "loss": 0.3774, "step": 18560 }, { "epoch": 2.164762126865672, "grad_norm": 0.6462307153545634, "learning_rate": 3.424362755501007e-05, "loss": 0.3849, "step": 18565 }, { "epoch": 2.1653451492537314, "grad_norm": 0.3935470087549329, "learning_rate": 3.423534981979968e-05, "loss": 0.3944, "step": 18570 }, { "epoch": 2.165928171641791, "grad_norm": 0.7172474120312462, "learning_rate": 3.422707108310729e-05, "loss": 0.4279, "step": 18575 }, { "epoch": 2.1665111940298507, "grad_norm": 0.4118325792008938, "learning_rate": 3.4218791346163894e-05, "loss": 0.4032, "step": 18580 }, { "epoch": 2.1670942164179103, "grad_norm": 0.4233889408435394, "learning_rate": 3.421051061020059e-05, "loss": 0.4088, "step": 18585 }, { "epoch": 2.16767723880597, "grad_norm": 0.4072578863626613, "learning_rate": 3.420222887644866e-05, "loss": 0.4148, "step": 18590 }, { "epoch": 2.16826026119403, "grad_norm": 0.4626653985981102, "learning_rate": 3.419394614613951e-05, "loss": 0.4195, "step": 18595 }, { "epoch": 2.1688432835820897, "grad_norm": 0.4089173557620577, "learning_rate": 3.41856624205047e-05, "loss": 0.4023, "step": 18600 }, { "epoch": 2.1694263059701493, "grad_norm": 0.5967881339025768, "learning_rate": 3.417737770077595e-05, "loss": 0.4372, "step": 18605 }, { "epoch": 2.170009328358209, "grad_norm": 0.4581423130322887, "learning_rate": 3.4169091988185106e-05, "loss": 0.4144, "step": 18610 }, { "epoch": 2.1705923507462686, "grad_norm": 0.43372339823006584, "learning_rate": 3.4160805283964184e-05, "loss": 0.4088, "step": 18615 }, { "epoch": 2.171175373134328, "grad_norm": 0.4317014186436621, "learning_rate": 3.415251758934534e-05, "loss": 0.4311, "step": 18620 }, { "epoch": 2.1717583955223883, "grad_norm": 0.43081952318192973, "learning_rate": 3.414422890556087e-05, "loss": 0.3964, "step": 18625 }, { "epoch": 2.172341417910448, "grad_norm": 0.43556154011233644, "learning_rate": 3.413593923384321e-05, "loss": 0.4198, "step": 18630 }, { "epoch": 2.1729244402985075, "grad_norm": 0.40153583373509305, "learning_rate": 3.4127648575424975e-05, "loss": 0.3962, "step": 18635 }, { "epoch": 2.173507462686567, "grad_norm": 0.39965651070488356, "learning_rate": 3.4119356931538894e-05, "loss": 0.3813, "step": 18640 }, { "epoch": 2.1740904850746268, "grad_norm": 0.42253742875349837, "learning_rate": 3.411106430341786e-05, "loss": 0.394, "step": 18645 }, { "epoch": 2.1746735074626864, "grad_norm": 0.467515445902988, "learning_rate": 3.410277069229491e-05, "loss": 0.4244, "step": 18650 }, { "epoch": 2.1752565298507465, "grad_norm": 0.3986272829489681, "learning_rate": 3.409447609940322e-05, "loss": 0.4061, "step": 18655 }, { "epoch": 2.175839552238806, "grad_norm": 0.40896378231540925, "learning_rate": 3.408618052597611e-05, "loss": 0.4067, "step": 18660 }, { "epoch": 2.1764225746268657, "grad_norm": 0.459152029100428, "learning_rate": 3.407788397324706e-05, "loss": 0.446, "step": 18665 }, { "epoch": 2.1770055970149254, "grad_norm": 0.4216994814627903, "learning_rate": 3.4069586442449684e-05, "loss": 0.3913, "step": 18670 }, { "epoch": 2.177588619402985, "grad_norm": 0.4434271126358292, "learning_rate": 3.406128793481776e-05, "loss": 0.41, "step": 18675 }, { "epoch": 2.1781716417910446, "grad_norm": 0.4231177774602946, "learning_rate": 3.405298845158518e-05, "loss": 0.3915, "step": 18680 }, { "epoch": 2.1787546641791047, "grad_norm": 0.3950903294347333, "learning_rate": 3.4044687993985995e-05, "loss": 0.4027, "step": 18685 }, { "epoch": 2.1793376865671643, "grad_norm": 0.4311258737949541, "learning_rate": 3.403638656325442e-05, "loss": 0.3971, "step": 18690 }, { "epoch": 2.179920708955224, "grad_norm": 0.3987552988837497, "learning_rate": 3.402808416062479e-05, "loss": 0.4345, "step": 18695 }, { "epoch": 2.1805037313432836, "grad_norm": 0.3977106718559317, "learning_rate": 3.4019780787331586e-05, "loss": 0.4056, "step": 18700 }, { "epoch": 2.181086753731343, "grad_norm": 0.4691687312056667, "learning_rate": 3.4011476444609456e-05, "loss": 0.4051, "step": 18705 }, { "epoch": 2.181669776119403, "grad_norm": 0.44967777394079034, "learning_rate": 3.4003171133693154e-05, "loss": 0.4354, "step": 18710 }, { "epoch": 2.182252798507463, "grad_norm": 0.40526689548409556, "learning_rate": 3.399486485581762e-05, "loss": 0.4035, "step": 18715 }, { "epoch": 2.1828358208955225, "grad_norm": 0.4395555616835956, "learning_rate": 3.3986557612217904e-05, "loss": 0.392, "step": 18720 }, { "epoch": 2.183418843283582, "grad_norm": 0.4498947483029495, "learning_rate": 3.3978249404129224e-05, "loss": 0.3945, "step": 18725 }, { "epoch": 2.184001865671642, "grad_norm": 0.4318157927154423, "learning_rate": 3.396994023278693e-05, "loss": 0.4375, "step": 18730 }, { "epoch": 2.1845848880597014, "grad_norm": 0.4374380435409525, "learning_rate": 3.396163009942651e-05, "loss": 0.3999, "step": 18735 }, { "epoch": 2.185167910447761, "grad_norm": 0.4468219550216439, "learning_rate": 3.3953319005283606e-05, "loss": 0.4004, "step": 18740 }, { "epoch": 2.1857509328358207, "grad_norm": 0.425195951885405, "learning_rate": 3.3945006951594e-05, "loss": 0.4284, "step": 18745 }, { "epoch": 2.1863339552238807, "grad_norm": 0.44941379499567136, "learning_rate": 3.393669393959361e-05, "loss": 0.4061, "step": 18750 }, { "epoch": 2.1869169776119404, "grad_norm": 0.5269031717232117, "learning_rate": 3.39283799705185e-05, "loss": 0.4194, "step": 18755 }, { "epoch": 2.1875, "grad_norm": 0.46465051136448565, "learning_rate": 3.392006504560487e-05, "loss": 0.4118, "step": 18760 }, { "epoch": 2.1880830223880596, "grad_norm": 0.3792927685182771, "learning_rate": 3.391174916608909e-05, "loss": 0.3958, "step": 18765 }, { "epoch": 2.1886660447761193, "grad_norm": 0.4860036129164203, "learning_rate": 3.390343233320764e-05, "loss": 0.4069, "step": 18770 }, { "epoch": 2.1892490671641793, "grad_norm": 0.44189085409945256, "learning_rate": 3.3895114548197154e-05, "loss": 0.4101, "step": 18775 }, { "epoch": 2.189832089552239, "grad_norm": 0.43937345374335685, "learning_rate": 3.388679581229441e-05, "loss": 0.411, "step": 18780 }, { "epoch": 2.1904151119402986, "grad_norm": 0.4116553804776797, "learning_rate": 3.3878476126736314e-05, "loss": 0.4091, "step": 18785 }, { "epoch": 2.190998134328358, "grad_norm": 0.3982822877797406, "learning_rate": 3.3870155492759936e-05, "loss": 0.3865, "step": 18790 }, { "epoch": 2.191581156716418, "grad_norm": 0.4679843239066103, "learning_rate": 3.386183391160248e-05, "loss": 0.4097, "step": 18795 }, { "epoch": 2.1921641791044775, "grad_norm": 0.41721294970723116, "learning_rate": 3.3853511384501256e-05, "loss": 0.3947, "step": 18800 }, { "epoch": 2.192747201492537, "grad_norm": 0.5285480534010507, "learning_rate": 3.384518791269378e-05, "loss": 0.4121, "step": 18805 }, { "epoch": 2.193330223880597, "grad_norm": 0.38832671898642174, "learning_rate": 3.3836863497417645e-05, "loss": 0.3911, "step": 18810 }, { "epoch": 2.193913246268657, "grad_norm": 0.39413625020813214, "learning_rate": 3.3828538139910626e-05, "loss": 0.3847, "step": 18815 }, { "epoch": 2.1944962686567164, "grad_norm": 0.40769783959180844, "learning_rate": 3.382021184141062e-05, "loss": 0.3942, "step": 18820 }, { "epoch": 2.195079291044776, "grad_norm": 0.42504062253604796, "learning_rate": 3.3811884603155665e-05, "loss": 0.3862, "step": 18825 }, { "epoch": 2.1956623134328357, "grad_norm": 0.46077084076022645, "learning_rate": 3.3803556426383954e-05, "loss": 0.4021, "step": 18830 }, { "epoch": 2.1962453358208953, "grad_norm": 0.4951549657984212, "learning_rate": 3.379522731233379e-05, "loss": 0.4217, "step": 18835 }, { "epoch": 2.1968283582089554, "grad_norm": 0.4149451628834867, "learning_rate": 3.378689726224364e-05, "loss": 0.4025, "step": 18840 }, { "epoch": 2.197411380597015, "grad_norm": 0.4442642390103348, "learning_rate": 3.377856627735211e-05, "loss": 0.4019, "step": 18845 }, { "epoch": 2.1979944029850746, "grad_norm": 0.4153845645412846, "learning_rate": 3.3770234358897926e-05, "loss": 0.41, "step": 18850 }, { "epoch": 2.1985774253731343, "grad_norm": 0.41527227718583565, "learning_rate": 3.376190150811997e-05, "loss": 0.4008, "step": 18855 }, { "epoch": 2.199160447761194, "grad_norm": 0.4702777032654794, "learning_rate": 3.3753567726257255e-05, "loss": 0.4328, "step": 18860 }, { "epoch": 2.1997434701492535, "grad_norm": 0.43001582614714884, "learning_rate": 3.3745233014548936e-05, "loss": 0.4016, "step": 18865 }, { "epoch": 2.2003264925373136, "grad_norm": 0.4475735867598758, "learning_rate": 3.373689737423431e-05, "loss": 0.4133, "step": 18870 }, { "epoch": 2.2009095149253732, "grad_norm": 0.4049807990171297, "learning_rate": 3.372856080655279e-05, "loss": 0.4036, "step": 18875 }, { "epoch": 2.201492537313433, "grad_norm": 0.4073750550140848, "learning_rate": 3.372022331274397e-05, "loss": 0.4137, "step": 18880 }, { "epoch": 2.2020755597014925, "grad_norm": 0.4655568861000359, "learning_rate": 3.3711884894047526e-05, "loss": 0.4142, "step": 18885 }, { "epoch": 2.202658582089552, "grad_norm": 0.4272531176487613, "learning_rate": 3.3703545551703326e-05, "loss": 0.403, "step": 18890 }, { "epoch": 2.2032416044776117, "grad_norm": 0.37553555316986753, "learning_rate": 3.369520528695133e-05, "loss": 0.376, "step": 18895 }, { "epoch": 2.203824626865672, "grad_norm": 0.39230467817125414, "learning_rate": 3.368686410103167e-05, "loss": 0.4148, "step": 18900 }, { "epoch": 2.2044076492537314, "grad_norm": 0.4030207298330371, "learning_rate": 3.36785219951846e-05, "loss": 0.3791, "step": 18905 }, { "epoch": 2.204990671641791, "grad_norm": 0.4027915953070148, "learning_rate": 3.367017897065051e-05, "loss": 0.3822, "step": 18910 }, { "epoch": 2.2055736940298507, "grad_norm": 0.38773701267559524, "learning_rate": 3.366183502866991e-05, "loss": 0.4065, "step": 18915 }, { "epoch": 2.2061567164179103, "grad_norm": 0.40914304941626306, "learning_rate": 3.3653490170483485e-05, "loss": 0.3976, "step": 18920 }, { "epoch": 2.20673973880597, "grad_norm": 0.3684464651454588, "learning_rate": 3.364514439733203e-05, "loss": 0.3978, "step": 18925 }, { "epoch": 2.20732276119403, "grad_norm": 0.45339020959733384, "learning_rate": 3.363679771045648e-05, "loss": 0.3868, "step": 18930 }, { "epoch": 2.2079057835820897, "grad_norm": 0.4640714288006895, "learning_rate": 3.3628450111097914e-05, "loss": 0.427, "step": 18935 }, { "epoch": 2.2084888059701493, "grad_norm": 0.3832431727307536, "learning_rate": 3.3620101600497526e-05, "loss": 0.3794, "step": 18940 }, { "epoch": 2.209071828358209, "grad_norm": 0.41826149036899857, "learning_rate": 3.361175217989668e-05, "loss": 0.3823, "step": 18945 }, { "epoch": 2.2096548507462686, "grad_norm": 0.43461855868836347, "learning_rate": 3.360340185053683e-05, "loss": 0.385, "step": 18950 }, { "epoch": 2.210237873134328, "grad_norm": 0.4498995074752811, "learning_rate": 3.3595050613659605e-05, "loss": 0.4362, "step": 18955 }, { "epoch": 2.2108208955223883, "grad_norm": 0.4538539856134972, "learning_rate": 3.358669847050676e-05, "loss": 0.4102, "step": 18960 }, { "epoch": 2.211403917910448, "grad_norm": 0.48452969901849496, "learning_rate": 3.3578345422320165e-05, "loss": 0.4232, "step": 18965 }, { "epoch": 2.2119869402985075, "grad_norm": 0.40585507907959584, "learning_rate": 3.356999147034184e-05, "loss": 0.3751, "step": 18970 }, { "epoch": 2.212569962686567, "grad_norm": 0.6602423945002934, "learning_rate": 3.3561636615813945e-05, "loss": 0.394, "step": 18975 }, { "epoch": 2.2131529850746268, "grad_norm": 0.3995041964148151, "learning_rate": 3.355328085997876e-05, "loss": 0.4013, "step": 18980 }, { "epoch": 2.2137360074626864, "grad_norm": 0.4493928819783629, "learning_rate": 3.3544924204078715e-05, "loss": 0.4081, "step": 18985 }, { "epoch": 2.2143190298507465, "grad_norm": 0.4599416014816642, "learning_rate": 3.3536566649356356e-05, "loss": 0.4008, "step": 18990 }, { "epoch": 2.214902052238806, "grad_norm": 0.42168687357584556, "learning_rate": 3.352820819705437e-05, "loss": 0.4092, "step": 18995 }, { "epoch": 2.2154850746268657, "grad_norm": 0.4315475803234669, "learning_rate": 3.351984884841558e-05, "loss": 0.411, "step": 19000 }, { "epoch": 2.2160680970149254, "grad_norm": 0.4085084805547176, "learning_rate": 3.351148860468295e-05, "loss": 0.404, "step": 19005 }, { "epoch": 2.216651119402985, "grad_norm": 0.4224188899426308, "learning_rate": 3.350312746709956e-05, "loss": 0.4209, "step": 19010 }, { "epoch": 2.2172341417910446, "grad_norm": 0.3903041081792129, "learning_rate": 3.3494765436908635e-05, "loss": 0.3881, "step": 19015 }, { "epoch": 2.2178171641791047, "grad_norm": 0.4060882490180791, "learning_rate": 3.348640251535352e-05, "loss": 0.4431, "step": 19020 }, { "epoch": 2.2184001865671643, "grad_norm": 0.4287928617213922, "learning_rate": 3.3478038703677714e-05, "loss": 0.4297, "step": 19025 }, { "epoch": 2.218983208955224, "grad_norm": 0.4685125464577112, "learning_rate": 3.346967400312482e-05, "loss": 0.416, "step": 19030 }, { "epoch": 2.2195662313432836, "grad_norm": 0.4538311902780141, "learning_rate": 3.346130841493861e-05, "loss": 0.4115, "step": 19035 }, { "epoch": 2.220149253731343, "grad_norm": 0.41675800397388363, "learning_rate": 3.3452941940362946e-05, "loss": 0.4098, "step": 19040 }, { "epoch": 2.220732276119403, "grad_norm": 0.4115809497046314, "learning_rate": 3.3444574580641855e-05, "loss": 0.389, "step": 19045 }, { "epoch": 2.221315298507463, "grad_norm": 0.411072064864703, "learning_rate": 3.343620633701948e-05, "loss": 0.3959, "step": 19050 }, { "epoch": 2.2218983208955225, "grad_norm": 0.4553105626693565, "learning_rate": 3.3427837210740096e-05, "loss": 0.407, "step": 19055 }, { "epoch": 2.222481343283582, "grad_norm": 0.45285812900354233, "learning_rate": 3.341946720304812e-05, "loss": 0.4268, "step": 19060 }, { "epoch": 2.223064365671642, "grad_norm": 0.418828723073918, "learning_rate": 3.341109631518809e-05, "loss": 0.4411, "step": 19065 }, { "epoch": 2.2236473880597014, "grad_norm": 0.3826327345459468, "learning_rate": 3.340272454840466e-05, "loss": 0.4077, "step": 19070 }, { "epoch": 2.224230410447761, "grad_norm": 0.4019863082348389, "learning_rate": 3.339435190394266e-05, "loss": 0.3925, "step": 19075 }, { "epoch": 2.2248134328358207, "grad_norm": 0.38540552264775296, "learning_rate": 3.3385978383046996e-05, "loss": 0.3667, "step": 19080 }, { "epoch": 2.2253964552238807, "grad_norm": 0.40850546390600095, "learning_rate": 3.337760398696275e-05, "loss": 0.4317, "step": 19085 }, { "epoch": 2.2259794776119404, "grad_norm": 0.4075709876647318, "learning_rate": 3.336922871693509e-05, "loss": 0.4119, "step": 19090 }, { "epoch": 2.2265625, "grad_norm": 0.4676102128976199, "learning_rate": 3.3360852574209364e-05, "loss": 0.401, "step": 19095 }, { "epoch": 2.2271455223880596, "grad_norm": 0.42331464158872506, "learning_rate": 3.335247556003101e-05, "loss": 0.4329, "step": 19100 }, { "epoch": 2.2277285447761193, "grad_norm": 0.43176856400343006, "learning_rate": 3.334409767564562e-05, "loss": 0.3929, "step": 19105 }, { "epoch": 2.2283115671641793, "grad_norm": 0.4546741054489476, "learning_rate": 3.33357189222989e-05, "loss": 0.4298, "step": 19110 }, { "epoch": 2.228894589552239, "grad_norm": 0.4222722235222902, "learning_rate": 3.3327339301236685e-05, "loss": 0.3984, "step": 19115 }, { "epoch": 2.2294776119402986, "grad_norm": 0.41156536253900705, "learning_rate": 3.331895881370495e-05, "loss": 0.3993, "step": 19120 }, { "epoch": 2.230060634328358, "grad_norm": 0.42397863871609975, "learning_rate": 3.33105774609498e-05, "loss": 0.4251, "step": 19125 }, { "epoch": 2.230643656716418, "grad_norm": 0.4094806636161526, "learning_rate": 3.3302195244217435e-05, "loss": 0.395, "step": 19130 }, { "epoch": 2.2312266791044775, "grad_norm": 0.4387702217365145, "learning_rate": 3.329381216475424e-05, "loss": 0.4125, "step": 19135 }, { "epoch": 2.231809701492537, "grad_norm": 0.44313493486655786, "learning_rate": 3.32854282238067e-05, "loss": 0.4234, "step": 19140 }, { "epoch": 2.232392723880597, "grad_norm": 0.42304706417295024, "learning_rate": 3.327704342262139e-05, "loss": 0.405, "step": 19145 }, { "epoch": 2.232975746268657, "grad_norm": 0.43077227644053545, "learning_rate": 3.326865776244509e-05, "loss": 0.4015, "step": 19150 }, { "epoch": 2.2335587686567164, "grad_norm": 0.40206455569293875, "learning_rate": 3.326027124452464e-05, "loss": 0.4033, "step": 19155 }, { "epoch": 2.234141791044776, "grad_norm": 0.3837895983092989, "learning_rate": 3.3251883870107066e-05, "loss": 0.386, "step": 19160 }, { "epoch": 2.2347248134328357, "grad_norm": 0.44978271368359235, "learning_rate": 3.324349564043945e-05, "loss": 0.4055, "step": 19165 }, { "epoch": 2.2353078358208953, "grad_norm": 0.47717898363254724, "learning_rate": 3.323510655676906e-05, "loss": 0.4109, "step": 19170 }, { "epoch": 2.2358908582089554, "grad_norm": 0.44735628156790574, "learning_rate": 3.322671662034329e-05, "loss": 0.4376, "step": 19175 }, { "epoch": 2.236473880597015, "grad_norm": 0.4502648812475679, "learning_rate": 3.3218325832409616e-05, "loss": 0.4336, "step": 19180 }, { "epoch": 2.2370569029850746, "grad_norm": 0.4267332030835794, "learning_rate": 3.3209934194215685e-05, "loss": 0.3839, "step": 19185 }, { "epoch": 2.2376399253731343, "grad_norm": 0.4899680668403373, "learning_rate": 3.320154170700925e-05, "loss": 0.4115, "step": 19190 }, { "epoch": 2.238222947761194, "grad_norm": 0.4372955169554078, "learning_rate": 3.3193148372038186e-05, "loss": 0.4369, "step": 19195 }, { "epoch": 2.2388059701492535, "grad_norm": 0.40104463684894454, "learning_rate": 3.3184754190550506e-05, "loss": 0.3798, "step": 19200 }, { "epoch": 2.2393889925373136, "grad_norm": 0.4458017841998383, "learning_rate": 3.317635916379434e-05, "loss": 0.4321, "step": 19205 }, { "epoch": 2.2399720149253732, "grad_norm": 0.46399213984482857, "learning_rate": 3.316796329301796e-05, "loss": 0.4135, "step": 19210 }, { "epoch": 2.240555037313433, "grad_norm": 0.470985654242471, "learning_rate": 3.315956657946975e-05, "loss": 0.4332, "step": 19215 }, { "epoch": 2.2411380597014925, "grad_norm": 0.4012839214934904, "learning_rate": 3.31511690243982e-05, "loss": 0.387, "step": 19220 }, { "epoch": 2.241721082089552, "grad_norm": 0.37874214937375866, "learning_rate": 3.314277062905197e-05, "loss": 0.4104, "step": 19225 }, { "epoch": 2.2423041044776117, "grad_norm": 0.38189244637664627, "learning_rate": 3.3134371394679806e-05, "loss": 0.406, "step": 19230 }, { "epoch": 2.242887126865672, "grad_norm": 0.4073493151668433, "learning_rate": 3.3125971322530605e-05, "loss": 0.3941, "step": 19235 }, { "epoch": 2.2434701492537314, "grad_norm": 0.41295329233596706, "learning_rate": 3.3117570413853373e-05, "loss": 0.4375, "step": 19240 }, { "epoch": 2.244053171641791, "grad_norm": 0.4113591289942066, "learning_rate": 3.3109168669897234e-05, "loss": 0.4334, "step": 19245 }, { "epoch": 2.2446361940298507, "grad_norm": 0.3948951613485122, "learning_rate": 3.3100766091911464e-05, "loss": 0.4223, "step": 19250 }, { "epoch": 2.2452192164179103, "grad_norm": 0.47305209701104506, "learning_rate": 3.309236268114543e-05, "loss": 0.3993, "step": 19255 }, { "epoch": 2.24580223880597, "grad_norm": 0.4063313365683372, "learning_rate": 3.308395843884866e-05, "loss": 0.3894, "step": 19260 }, { "epoch": 2.24638526119403, "grad_norm": 0.4966297547212204, "learning_rate": 3.3075553366270765e-05, "loss": 0.4113, "step": 19265 }, { "epoch": 2.2469682835820897, "grad_norm": 0.42660858880974023, "learning_rate": 3.30671474646615e-05, "loss": 0.4103, "step": 19270 }, { "epoch": 2.2475513059701493, "grad_norm": 0.3921026144813095, "learning_rate": 3.305874073527076e-05, "loss": 0.3976, "step": 19275 }, { "epoch": 2.248134328358209, "grad_norm": 0.40947245891887135, "learning_rate": 3.305033317934852e-05, "loss": 0.4195, "step": 19280 }, { "epoch": 2.2487173507462686, "grad_norm": 0.39824547295686524, "learning_rate": 3.304192479814492e-05, "loss": 0.3951, "step": 19285 }, { "epoch": 2.249300373134328, "grad_norm": 0.4605956512328903, "learning_rate": 3.30335155929102e-05, "loss": 0.4179, "step": 19290 }, { "epoch": 2.2498833955223883, "grad_norm": 0.4235960354555694, "learning_rate": 3.3025105564894726e-05, "loss": 0.4074, "step": 19295 }, { "epoch": 2.250466417910448, "grad_norm": 0.7789025397419326, "learning_rate": 3.301669471534899e-05, "loss": 0.4059, "step": 19300 }, { "epoch": 2.2510494402985075, "grad_norm": 0.41723162788172374, "learning_rate": 3.300828304552362e-05, "loss": 0.4225, "step": 19305 }, { "epoch": 2.251632462686567, "grad_norm": 0.4106349523920227, "learning_rate": 3.299987055666932e-05, "loss": 0.4052, "step": 19310 }, { "epoch": 2.2522154850746268, "grad_norm": 0.4283657768425172, "learning_rate": 3.299145725003698e-05, "loss": 0.4132, "step": 19315 }, { "epoch": 2.2527985074626864, "grad_norm": 0.42135169549612134, "learning_rate": 3.298304312687754e-05, "loss": 0.4057, "step": 19320 }, { "epoch": 2.2533815298507465, "grad_norm": 0.4343994879957478, "learning_rate": 3.2974628188442135e-05, "loss": 0.4162, "step": 19325 }, { "epoch": 2.253964552238806, "grad_norm": 0.38418503116395103, "learning_rate": 3.2966212435981975e-05, "loss": 0.4028, "step": 19330 }, { "epoch": 2.2545475746268657, "grad_norm": 0.4386121996784451, "learning_rate": 3.295779587074839e-05, "loss": 0.4076, "step": 19335 }, { "epoch": 2.2551305970149254, "grad_norm": 0.4261264480921298, "learning_rate": 3.2949378493992854e-05, "loss": 0.4434, "step": 19340 }, { "epoch": 2.255713619402985, "grad_norm": 0.4202596274567211, "learning_rate": 3.294096030696695e-05, "loss": 0.4288, "step": 19345 }, { "epoch": 2.2562966417910446, "grad_norm": 0.4023981550455131, "learning_rate": 3.293254131092238e-05, "loss": 0.3901, "step": 19350 }, { "epoch": 2.2568796641791042, "grad_norm": 0.43842212014715376, "learning_rate": 3.292412150711096e-05, "loss": 0.4076, "step": 19355 }, { "epoch": 2.2574626865671643, "grad_norm": 0.4235541619911344, "learning_rate": 3.2915700896784655e-05, "loss": 0.41, "step": 19360 }, { "epoch": 2.258045708955224, "grad_norm": 0.3924033689792878, "learning_rate": 3.29072794811955e-05, "loss": 0.4163, "step": 19365 }, { "epoch": 2.2586287313432836, "grad_norm": 0.5602310911698453, "learning_rate": 3.28988572615957e-05, "loss": 0.4159, "step": 19370 }, { "epoch": 2.259211753731343, "grad_norm": 0.44134197341260173, "learning_rate": 3.289043423923756e-05, "loss": 0.4145, "step": 19375 }, { "epoch": 2.259794776119403, "grad_norm": 0.4355846163833659, "learning_rate": 3.288201041537348e-05, "loss": 0.4081, "step": 19380 }, { "epoch": 2.260377798507463, "grad_norm": 0.37596200221279935, "learning_rate": 3.2873585791256026e-05, "loss": 0.4149, "step": 19385 }, { "epoch": 2.2609608208955225, "grad_norm": 0.429895697615993, "learning_rate": 3.286516036813785e-05, "loss": 0.4211, "step": 19390 }, { "epoch": 2.261543843283582, "grad_norm": 0.4110188978590114, "learning_rate": 3.2856734147271734e-05, "loss": 0.4115, "step": 19395 }, { "epoch": 2.262126865671642, "grad_norm": 0.4219592976595445, "learning_rate": 3.284830712991057e-05, "loss": 0.4149, "step": 19400 }, { "epoch": 2.2627098880597014, "grad_norm": 0.3974568668430782, "learning_rate": 3.283987931730738e-05, "loss": 0.4092, "step": 19405 }, { "epoch": 2.263292910447761, "grad_norm": 0.41645173989415424, "learning_rate": 3.28314507107153e-05, "loss": 0.4021, "step": 19410 }, { "epoch": 2.2638759328358207, "grad_norm": 0.41635428855051715, "learning_rate": 3.282302131138758e-05, "loss": 0.3773, "step": 19415 }, { "epoch": 2.2644589552238807, "grad_norm": 0.3923843382277778, "learning_rate": 3.281459112057759e-05, "loss": 0.4427, "step": 19420 }, { "epoch": 2.2650419776119404, "grad_norm": 0.40199122879201077, "learning_rate": 3.280616013953882e-05, "loss": 0.3949, "step": 19425 }, { "epoch": 2.265625, "grad_norm": 0.47769311176420254, "learning_rate": 3.2797728369524875e-05, "loss": 0.3995, "step": 19430 }, { "epoch": 2.2662080223880596, "grad_norm": 0.4425848431188337, "learning_rate": 3.2789295811789486e-05, "loss": 0.4143, "step": 19435 }, { "epoch": 2.2667910447761193, "grad_norm": 0.4324374383716641, "learning_rate": 3.2780862467586486e-05, "loss": 0.4028, "step": 19440 }, { "epoch": 2.2673740671641793, "grad_norm": 0.43786139719568246, "learning_rate": 3.2772428338169835e-05, "loss": 0.4266, "step": 19445 }, { "epoch": 2.267957089552239, "grad_norm": 0.5747528957501068, "learning_rate": 3.2763993424793604e-05, "loss": 0.4308, "step": 19450 }, { "epoch": 2.2685401119402986, "grad_norm": 0.3870883426706857, "learning_rate": 3.275555772871198e-05, "loss": 0.3695, "step": 19455 }, { "epoch": 2.269123134328358, "grad_norm": 0.44081356534981314, "learning_rate": 3.2747121251179294e-05, "loss": 0.4262, "step": 19460 }, { "epoch": 2.269706156716418, "grad_norm": 4.048079343708381, "learning_rate": 3.2738683993449937e-05, "loss": 0.424, "step": 19465 }, { "epoch": 2.2702891791044775, "grad_norm": 0.44792548660329046, "learning_rate": 3.273024595677846e-05, "loss": 0.4231, "step": 19470 }, { "epoch": 2.270872201492537, "grad_norm": 0.4550582978354379, "learning_rate": 3.272180714241954e-05, "loss": 0.3964, "step": 19475 }, { "epoch": 2.271455223880597, "grad_norm": 0.4093528141036578, "learning_rate": 3.271336755162792e-05, "loss": 0.3874, "step": 19480 }, { "epoch": 2.272038246268657, "grad_norm": 0.4258837139029889, "learning_rate": 3.270492718565848e-05, "loss": 0.4188, "step": 19485 }, { "epoch": 2.2726212686567164, "grad_norm": 0.4492579967645142, "learning_rate": 3.269648604576625e-05, "loss": 0.4329, "step": 19490 }, { "epoch": 2.273204291044776, "grad_norm": 0.4299121285612161, "learning_rate": 3.2688044133206336e-05, "loss": 0.3899, "step": 19495 }, { "epoch": 2.2737873134328357, "grad_norm": 0.4475759072761559, "learning_rate": 3.267960144923397e-05, "loss": 0.4271, "step": 19500 }, { "epoch": 2.2743703358208958, "grad_norm": 0.47668206129734714, "learning_rate": 3.267115799510448e-05, "loss": 0.4259, "step": 19505 }, { "epoch": 2.2749533582089554, "grad_norm": 0.42450594454386936, "learning_rate": 3.266271377207335e-05, "loss": 0.4242, "step": 19510 }, { "epoch": 2.275536380597015, "grad_norm": 0.431683491675114, "learning_rate": 3.265426878139615e-05, "loss": 0.4076, "step": 19515 }, { "epoch": 2.2761194029850746, "grad_norm": 0.43830855687851744, "learning_rate": 3.264582302432856e-05, "loss": 0.4024, "step": 19520 }, { "epoch": 2.2767024253731343, "grad_norm": 0.4386244681679389, "learning_rate": 3.263737650212639e-05, "loss": 0.3781, "step": 19525 }, { "epoch": 2.277285447761194, "grad_norm": 0.43014543463612237, "learning_rate": 3.262892921604556e-05, "loss": 0.4089, "step": 19530 }, { "epoch": 2.2778684701492535, "grad_norm": 0.4343757511308402, "learning_rate": 3.262048116734209e-05, "loss": 0.4129, "step": 19535 }, { "epoch": 2.2784514925373136, "grad_norm": 0.40237337145907276, "learning_rate": 3.261203235727214e-05, "loss": 0.3891, "step": 19540 }, { "epoch": 2.2790345149253732, "grad_norm": 0.45091683200498744, "learning_rate": 3.2603582787091954e-05, "loss": 0.4146, "step": 19545 }, { "epoch": 2.279617537313433, "grad_norm": 0.3974180501483524, "learning_rate": 3.259513245805791e-05, "loss": 0.3896, "step": 19550 }, { "epoch": 2.2802005597014925, "grad_norm": 0.3903372551099371, "learning_rate": 3.258668137142648e-05, "loss": 0.4153, "step": 19555 }, { "epoch": 2.280783582089552, "grad_norm": 0.44473419209702447, "learning_rate": 3.2578229528454266e-05, "loss": 0.4477, "step": 19560 }, { "epoch": 2.2813666044776117, "grad_norm": 0.42338089189107436, "learning_rate": 3.2569776930397986e-05, "loss": 0.4107, "step": 19565 }, { "epoch": 2.281949626865672, "grad_norm": 0.4271095839124686, "learning_rate": 3.256132357851445e-05, "loss": 0.4111, "step": 19570 }, { "epoch": 2.2825326492537314, "grad_norm": 0.4240361012694304, "learning_rate": 3.255286947406058e-05, "loss": 0.4256, "step": 19575 }, { "epoch": 2.283115671641791, "grad_norm": 0.43112816927698716, "learning_rate": 3.254441461829344e-05, "loss": 0.4476, "step": 19580 }, { "epoch": 2.2836986940298507, "grad_norm": 0.4452325290170443, "learning_rate": 3.253595901247017e-05, "loss": 0.4232, "step": 19585 }, { "epoch": 2.2842817164179103, "grad_norm": 0.422485076532205, "learning_rate": 3.252750265784806e-05, "loss": 0.3957, "step": 19590 }, { "epoch": 2.28486473880597, "grad_norm": 0.44535347407515513, "learning_rate": 3.251904555568446e-05, "loss": 0.3979, "step": 19595 }, { "epoch": 2.28544776119403, "grad_norm": 0.442577024845191, "learning_rate": 3.251058770723688e-05, "loss": 0.3925, "step": 19600 }, { "epoch": 2.2860307835820897, "grad_norm": 0.4029571005295842, "learning_rate": 3.2502129113762916e-05, "loss": 0.3969, "step": 19605 }, { "epoch": 2.2866138059701493, "grad_norm": 0.4409108505001848, "learning_rate": 3.249366977652028e-05, "loss": 0.3964, "step": 19610 }, { "epoch": 2.287196828358209, "grad_norm": 0.43439371315893266, "learning_rate": 3.248520969676679e-05, "loss": 0.4023, "step": 19615 }, { "epoch": 2.2877798507462686, "grad_norm": 0.45017403790988, "learning_rate": 3.247674887576038e-05, "loss": 0.4053, "step": 19620 }, { "epoch": 2.288362873134328, "grad_norm": 0.41301240031394465, "learning_rate": 3.24682873147591e-05, "loss": 0.4034, "step": 19625 }, { "epoch": 2.2889458955223883, "grad_norm": 0.43559973356243425, "learning_rate": 3.24598250150211e-05, "loss": 0.4009, "step": 19630 }, { "epoch": 2.289528917910448, "grad_norm": 0.3955797638011971, "learning_rate": 3.245136197780464e-05, "loss": 0.4003, "step": 19635 }, { "epoch": 2.2901119402985075, "grad_norm": 0.5378091429648244, "learning_rate": 3.2442898204368086e-05, "loss": 0.412, "step": 19640 }, { "epoch": 2.290694962686567, "grad_norm": 0.41044291869817934, "learning_rate": 3.243443369596994e-05, "loss": 0.4, "step": 19645 }, { "epoch": 2.2912779850746268, "grad_norm": 0.5155671788654731, "learning_rate": 3.242596845386878e-05, "loss": 0.4272, "step": 19650 }, { "epoch": 2.2918610074626864, "grad_norm": 0.46689772618771636, "learning_rate": 3.24175024793233e-05, "loss": 0.4553, "step": 19655 }, { "epoch": 2.2924440298507465, "grad_norm": 0.44880295534744935, "learning_rate": 3.240903577359232e-05, "loss": 0.3942, "step": 19660 }, { "epoch": 2.293027052238806, "grad_norm": 0.4386249304117945, "learning_rate": 3.2400568337934753e-05, "loss": 0.4148, "step": 19665 }, { "epoch": 2.2936100746268657, "grad_norm": 0.3841084140882872, "learning_rate": 3.239210017360963e-05, "loss": 0.4006, "step": 19670 }, { "epoch": 2.2941930970149254, "grad_norm": 0.4270348488186706, "learning_rate": 3.238363128187609e-05, "loss": 0.4257, "step": 19675 }, { "epoch": 2.294776119402985, "grad_norm": 0.48423965482498266, "learning_rate": 3.237516166399336e-05, "loss": 0.3961, "step": 19680 }, { "epoch": 2.2953591417910446, "grad_norm": 0.39634698752788855, "learning_rate": 3.23666913212208e-05, "loss": 0.3969, "step": 19685 }, { "epoch": 2.2959421641791042, "grad_norm": 0.45634558141168363, "learning_rate": 3.2358220254817874e-05, "loss": 0.3999, "step": 19690 }, { "epoch": 2.2965251865671643, "grad_norm": 0.3964112052788775, "learning_rate": 3.234974846604414e-05, "loss": 0.3696, "step": 19695 }, { "epoch": 2.297108208955224, "grad_norm": 0.4906474377698196, "learning_rate": 3.234127595615927e-05, "loss": 0.4101, "step": 19700 }, { "epoch": 2.2976912313432836, "grad_norm": 0.4264425204370442, "learning_rate": 3.233280272642306e-05, "loss": 0.4013, "step": 19705 }, { "epoch": 2.298274253731343, "grad_norm": 0.43759040567702656, "learning_rate": 3.232432877809538e-05, "loss": 0.4417, "step": 19710 }, { "epoch": 2.298857276119403, "grad_norm": 0.3948352991061603, "learning_rate": 3.231585411243625e-05, "loss": 0.4268, "step": 19715 }, { "epoch": 2.299440298507463, "grad_norm": 0.4726401560011215, "learning_rate": 3.230737873070574e-05, "loss": 0.4239, "step": 19720 }, { "epoch": 2.3000233208955225, "grad_norm": 0.4633160273854487, "learning_rate": 3.2298902634164076e-05, "loss": 0.4466, "step": 19725 }, { "epoch": 2.300606343283582, "grad_norm": 0.3866952226690696, "learning_rate": 3.229042582407157e-05, "loss": 0.4167, "step": 19730 }, { "epoch": 2.301189365671642, "grad_norm": 0.4311186362441962, "learning_rate": 3.2281948301688644e-05, "loss": 0.4065, "step": 19735 }, { "epoch": 2.3017723880597014, "grad_norm": 0.4348609745059275, "learning_rate": 3.2273470068275816e-05, "loss": 0.3859, "step": 19740 }, { "epoch": 2.302355410447761, "grad_norm": 0.40747328514484904, "learning_rate": 3.2264991125093734e-05, "loss": 0.4135, "step": 19745 }, { "epoch": 2.3029384328358207, "grad_norm": 0.4222247568672412, "learning_rate": 3.225651147340312e-05, "loss": 0.4136, "step": 19750 }, { "epoch": 2.3035214552238807, "grad_norm": 0.4285394092682662, "learning_rate": 3.224803111446482e-05, "loss": 0.4054, "step": 19755 }, { "epoch": 2.3041044776119404, "grad_norm": 0.4484351780405356, "learning_rate": 3.223955004953979e-05, "loss": 0.4058, "step": 19760 }, { "epoch": 2.3046875, "grad_norm": 0.7772680349991993, "learning_rate": 3.223106827988908e-05, "loss": 0.4178, "step": 19765 }, { "epoch": 2.3052705223880596, "grad_norm": 0.42053916584824075, "learning_rate": 3.222258580677385e-05, "loss": 0.424, "step": 19770 }, { "epoch": 2.3058535447761193, "grad_norm": 0.4617068198682325, "learning_rate": 3.221410263145534e-05, "loss": 0.3985, "step": 19775 }, { "epoch": 2.3064365671641793, "grad_norm": 0.4161972544147272, "learning_rate": 3.220561875519495e-05, "loss": 0.4045, "step": 19780 }, { "epoch": 2.307019589552239, "grad_norm": 0.45098332375509076, "learning_rate": 3.219713417925414e-05, "loss": 0.3895, "step": 19785 }, { "epoch": 2.3076026119402986, "grad_norm": 0.39429093395594705, "learning_rate": 3.218864890489446e-05, "loss": 0.4099, "step": 19790 }, { "epoch": 2.308185634328358, "grad_norm": 0.4441370005884668, "learning_rate": 3.218016293337763e-05, "loss": 0.4148, "step": 19795 }, { "epoch": 2.308768656716418, "grad_norm": 0.5063774882183846, "learning_rate": 3.2171676265965415e-05, "loss": 0.3917, "step": 19800 }, { "epoch": 2.3093516791044775, "grad_norm": 0.41287549927129186, "learning_rate": 3.216318890391969e-05, "loss": 0.4159, "step": 19805 }, { "epoch": 2.309934701492537, "grad_norm": 0.39958654092936824, "learning_rate": 3.2154700848502454e-05, "loss": 0.3951, "step": 19810 }, { "epoch": 2.310517723880597, "grad_norm": 0.4457276425526031, "learning_rate": 3.21462121009758e-05, "loss": 0.3834, "step": 19815 }, { "epoch": 2.311100746268657, "grad_norm": 0.3999870840682877, "learning_rate": 3.2137722662601934e-05, "loss": 0.3951, "step": 19820 }, { "epoch": 2.3116837686567164, "grad_norm": 0.45784680226701713, "learning_rate": 3.212923253464314e-05, "loss": 0.4181, "step": 19825 }, { "epoch": 2.312266791044776, "grad_norm": 0.4197518751738806, "learning_rate": 3.212074171836181e-05, "loss": 0.4343, "step": 19830 }, { "epoch": 2.3128498134328357, "grad_norm": 0.42816680566531606, "learning_rate": 3.211225021502047e-05, "loss": 0.3756, "step": 19835 }, { "epoch": 2.3134328358208958, "grad_norm": 0.40880001600894555, "learning_rate": 3.21037580258817e-05, "loss": 0.3867, "step": 19840 }, { "epoch": 2.3140158582089554, "grad_norm": 0.3973850915227583, "learning_rate": 3.209526515220824e-05, "loss": 0.4087, "step": 19845 }, { "epoch": 2.314598880597015, "grad_norm": 0.43025167101520595, "learning_rate": 3.208677159526287e-05, "loss": 0.4039, "step": 19850 }, { "epoch": 2.3151819029850746, "grad_norm": 0.43292192466058066, "learning_rate": 3.207827735630851e-05, "loss": 0.4189, "step": 19855 }, { "epoch": 2.3157649253731343, "grad_norm": 0.3870544571777672, "learning_rate": 3.206978243660817e-05, "loss": 0.3742, "step": 19860 }, { "epoch": 2.316347947761194, "grad_norm": 0.40163379164131247, "learning_rate": 3.206128683742497e-05, "loss": 0.381, "step": 19865 }, { "epoch": 2.3169309701492535, "grad_norm": 0.436758508652368, "learning_rate": 3.205279056002212e-05, "loss": 0.4012, "step": 19870 }, { "epoch": 2.3175139925373136, "grad_norm": 0.39860721757852535, "learning_rate": 3.204429360566293e-05, "loss": 0.3733, "step": 19875 }, { "epoch": 2.3180970149253732, "grad_norm": 0.42656335971174303, "learning_rate": 3.203579597561082e-05, "loss": 0.395, "step": 19880 }, { "epoch": 2.318680037313433, "grad_norm": 0.4497528973413777, "learning_rate": 3.202729767112931e-05, "loss": 0.3933, "step": 19885 }, { "epoch": 2.3192630597014925, "grad_norm": 0.48962635772772106, "learning_rate": 3.2018798693482015e-05, "loss": 0.4005, "step": 19890 }, { "epoch": 2.319846082089552, "grad_norm": 0.43087445360519433, "learning_rate": 3.201029904393265e-05, "loss": 0.3916, "step": 19895 }, { "epoch": 2.3204291044776117, "grad_norm": 0.4290395291230286, "learning_rate": 3.200179872374503e-05, "loss": 0.4111, "step": 19900 }, { "epoch": 2.321012126865672, "grad_norm": 0.43822200286103785, "learning_rate": 3.199329773418307e-05, "loss": 0.4117, "step": 19905 }, { "epoch": 2.3215951492537314, "grad_norm": 0.4313418370656167, "learning_rate": 3.198479607651079e-05, "loss": 0.4198, "step": 19910 }, { "epoch": 2.322178171641791, "grad_norm": 0.4505798881150213, "learning_rate": 3.1976293751992295e-05, "loss": 0.4264, "step": 19915 }, { "epoch": 2.3227611940298507, "grad_norm": 0.4342457919802341, "learning_rate": 3.1967790761891826e-05, "loss": 0.3948, "step": 19920 }, { "epoch": 2.3233442164179103, "grad_norm": 0.43034316120632954, "learning_rate": 3.195928710747366e-05, "loss": 0.3945, "step": 19925 }, { "epoch": 2.32392723880597, "grad_norm": 0.3794488846030449, "learning_rate": 3.1950782790002236e-05, "loss": 0.3983, "step": 19930 }, { "epoch": 2.32451026119403, "grad_norm": 0.43556043158438223, "learning_rate": 3.194227781074205e-05, "loss": 0.4076, "step": 19935 }, { "epoch": 2.3250932835820897, "grad_norm": 0.42871794072011205, "learning_rate": 3.1933772170957716e-05, "loss": 0.404, "step": 19940 }, { "epoch": 2.3256763059701493, "grad_norm": 0.44074119013671076, "learning_rate": 3.192526587191395e-05, "loss": 0.4035, "step": 19945 }, { "epoch": 2.326259328358209, "grad_norm": 0.4359971713746372, "learning_rate": 3.191675891487554e-05, "loss": 0.3908, "step": 19950 }, { "epoch": 2.3268423507462686, "grad_norm": 0.45675933115702533, "learning_rate": 3.190825130110739e-05, "loss": 0.3975, "step": 19955 }, { "epoch": 2.327425373134328, "grad_norm": 0.4129999570499641, "learning_rate": 3.189974303187452e-05, "loss": 0.3941, "step": 19960 }, { "epoch": 2.3280083955223883, "grad_norm": 0.473532918140509, "learning_rate": 3.1891234108442007e-05, "loss": 0.3914, "step": 19965 }, { "epoch": 2.328591417910448, "grad_norm": 0.3960313726920244, "learning_rate": 3.188272453207507e-05, "loss": 0.3835, "step": 19970 }, { "epoch": 2.3291744402985075, "grad_norm": 0.4139472126774143, "learning_rate": 3.1874214304038965e-05, "loss": 0.3835, "step": 19975 }, { "epoch": 2.329757462686567, "grad_norm": 0.41494669031781867, "learning_rate": 3.186570342559912e-05, "loss": 0.4275, "step": 19980 }, { "epoch": 2.3303404850746268, "grad_norm": 0.4178039006108638, "learning_rate": 3.1857191898020996e-05, "loss": 0.3807, "step": 19985 }, { "epoch": 2.3309235074626864, "grad_norm": 0.44004003162212, "learning_rate": 3.184867972257019e-05, "loss": 0.4094, "step": 19990 }, { "epoch": 2.3315065298507465, "grad_norm": 0.4388893382580093, "learning_rate": 3.184016690051237e-05, "loss": 0.4354, "step": 19995 }, { "epoch": 2.332089552238806, "grad_norm": 0.4123871451644767, "learning_rate": 3.1831653433113317e-05, "loss": 0.3927, "step": 20000 }, { "epoch": 2.3326725746268657, "grad_norm": 0.5086366333897813, "learning_rate": 3.18231393216389e-05, "loss": 0.4365, "step": 20005 }, { "epoch": 2.3332555970149254, "grad_norm": 0.4244222110279215, "learning_rate": 3.1814624567355087e-05, "loss": 0.4105, "step": 20010 }, { "epoch": 2.333838619402985, "grad_norm": 0.483139716666001, "learning_rate": 3.180610917152795e-05, "loss": 0.4058, "step": 20015 }, { "epoch": 2.3344216417910446, "grad_norm": 0.47751784102029826, "learning_rate": 3.179759313542362e-05, "loss": 0.4326, "step": 20020 }, { "epoch": 2.3350046641791042, "grad_norm": 0.4530366603227296, "learning_rate": 3.178907646030838e-05, "loss": 0.427, "step": 20025 }, { "epoch": 2.3355876865671643, "grad_norm": 0.4131765323720608, "learning_rate": 3.1780559147448554e-05, "loss": 0.3869, "step": 20030 }, { "epoch": 2.336170708955224, "grad_norm": 0.3909592535404698, "learning_rate": 3.1772041198110604e-05, "loss": 0.4155, "step": 20035 }, { "epoch": 2.3367537313432836, "grad_norm": 0.392337753019418, "learning_rate": 3.176352261356105e-05, "loss": 0.403, "step": 20040 }, { "epoch": 2.337336753731343, "grad_norm": 0.40748285662078076, "learning_rate": 3.1755003395066546e-05, "loss": 0.3931, "step": 20045 }, { "epoch": 2.337919776119403, "grad_norm": 0.4215948719992, "learning_rate": 3.17464835438938e-05, "loss": 0.3992, "step": 20050 }, { "epoch": 2.338502798507463, "grad_norm": 0.4107600451026482, "learning_rate": 3.173796306130963e-05, "loss": 0.385, "step": 20055 }, { "epoch": 2.3390858208955225, "grad_norm": 0.4112967006629501, "learning_rate": 3.172944194858096e-05, "loss": 0.3813, "step": 20060 }, { "epoch": 2.339668843283582, "grad_norm": 0.4050618756862867, "learning_rate": 3.17209202069748e-05, "loss": 0.4194, "step": 20065 }, { "epoch": 2.340251865671642, "grad_norm": 0.4395181870541969, "learning_rate": 3.171239783775825e-05, "loss": 0.4291, "step": 20070 }, { "epoch": 2.3408348880597014, "grad_norm": 0.41947129455782817, "learning_rate": 3.170387484219849e-05, "loss": 0.4266, "step": 20075 }, { "epoch": 2.341417910447761, "grad_norm": 0.4102958164069878, "learning_rate": 3.169535122156283e-05, "loss": 0.4082, "step": 20080 }, { "epoch": 2.3420009328358207, "grad_norm": 0.4671712493198971, "learning_rate": 3.1686826977118635e-05, "loss": 0.4312, "step": 20085 }, { "epoch": 2.3425839552238807, "grad_norm": 0.4552248865176427, "learning_rate": 3.167830211013338e-05, "loss": 0.3904, "step": 20090 }, { "epoch": 2.3431669776119404, "grad_norm": 0.4303843947549127, "learning_rate": 3.166977662187464e-05, "loss": 0.3861, "step": 20095 }, { "epoch": 2.34375, "grad_norm": 0.4309237307936687, "learning_rate": 3.166125051361007e-05, "loss": 0.3873, "step": 20100 }, { "epoch": 2.3443330223880596, "grad_norm": 0.3994652686476947, "learning_rate": 3.165272378660741e-05, "loss": 0.3848, "step": 20105 }, { "epoch": 2.3449160447761193, "grad_norm": 0.4284802061767505, "learning_rate": 3.164419644213451e-05, "loss": 0.4149, "step": 20110 }, { "epoch": 2.3454990671641793, "grad_norm": 0.42777765319194583, "learning_rate": 3.1635668481459306e-05, "loss": 0.4066, "step": 20115 }, { "epoch": 2.346082089552239, "grad_norm": 0.44010404715579654, "learning_rate": 3.162713990584983e-05, "loss": 0.425, "step": 20120 }, { "epoch": 2.3466651119402986, "grad_norm": 0.4205388245648607, "learning_rate": 3.1618610716574196e-05, "loss": 0.4062, "step": 20125 }, { "epoch": 2.347248134328358, "grad_norm": 0.437076758831132, "learning_rate": 3.1610080914900604e-05, "loss": 0.4409, "step": 20130 }, { "epoch": 2.347831156716418, "grad_norm": 0.4623130929754608, "learning_rate": 3.1601550502097356e-05, "loss": 0.4162, "step": 20135 }, { "epoch": 2.3484141791044775, "grad_norm": 0.40550332345951196, "learning_rate": 3.159301947943285e-05, "loss": 0.3974, "step": 20140 }, { "epoch": 2.348997201492537, "grad_norm": 0.427271702948713, "learning_rate": 3.158448784817556e-05, "loss": 0.4172, "step": 20145 }, { "epoch": 2.349580223880597, "grad_norm": 0.406859620580034, "learning_rate": 3.157595560959407e-05, "loss": 0.3981, "step": 20150 }, { "epoch": 2.350163246268657, "grad_norm": 0.39776364849500506, "learning_rate": 3.156742276495702e-05, "loss": 0.3899, "step": 20155 }, { "epoch": 2.3507462686567164, "grad_norm": 0.4510281901026045, "learning_rate": 3.155888931553319e-05, "loss": 0.4406, "step": 20160 }, { "epoch": 2.351329291044776, "grad_norm": 0.4365965942805441, "learning_rate": 3.155035526259139e-05, "loss": 0.3817, "step": 20165 }, { "epoch": 2.3519123134328357, "grad_norm": 0.4545022679422716, "learning_rate": 3.154182060740058e-05, "loss": 0.4329, "step": 20170 }, { "epoch": 2.3524953358208958, "grad_norm": 0.3934986770325854, "learning_rate": 3.153328535122978e-05, "loss": 0.3961, "step": 20175 }, { "epoch": 2.3530783582089554, "grad_norm": 0.4282677145322305, "learning_rate": 3.152474949534808e-05, "loss": 0.4105, "step": 20180 }, { "epoch": 2.353661380597015, "grad_norm": 0.38176053687568606, "learning_rate": 3.15162130410247e-05, "loss": 0.3885, "step": 20185 }, { "epoch": 2.3542444029850746, "grad_norm": 0.4681680530843617, "learning_rate": 3.1507675989528915e-05, "loss": 0.4028, "step": 20190 }, { "epoch": 2.3548274253731343, "grad_norm": 0.43310488195784286, "learning_rate": 3.1499138342130114e-05, "loss": 0.3849, "step": 20195 }, { "epoch": 2.355410447761194, "grad_norm": 0.39181491044361416, "learning_rate": 3.1490600100097746e-05, "loss": 0.3933, "step": 20200 }, { "epoch": 2.3559934701492535, "grad_norm": 0.3984123549110816, "learning_rate": 3.148206126470138e-05, "loss": 0.3849, "step": 20205 }, { "epoch": 2.3565764925373136, "grad_norm": 0.4185418729161508, "learning_rate": 3.147352183721067e-05, "loss": 0.4053, "step": 20210 }, { "epoch": 2.3571595149253732, "grad_norm": 0.4153936523137314, "learning_rate": 3.1464981818895325e-05, "loss": 0.4045, "step": 20215 }, { "epoch": 2.357742537313433, "grad_norm": 0.4215852959550067, "learning_rate": 3.145644121102517e-05, "loss": 0.4319, "step": 20220 }, { "epoch": 2.3583255597014925, "grad_norm": 0.3932563933434031, "learning_rate": 3.1447900014870125e-05, "loss": 0.4, "step": 20225 }, { "epoch": 2.358908582089552, "grad_norm": 0.42232225989119876, "learning_rate": 3.1439358231700165e-05, "loss": 0.4123, "step": 20230 }, { "epoch": 2.3594916044776117, "grad_norm": 0.38271574851859447, "learning_rate": 3.143081586278539e-05, "loss": 0.4145, "step": 20235 }, { "epoch": 2.360074626865672, "grad_norm": 0.39680852459169036, "learning_rate": 3.142227290939595e-05, "loss": 0.4048, "step": 20240 }, { "epoch": 2.3606576492537314, "grad_norm": 0.3908631021160493, "learning_rate": 3.14137293728021e-05, "loss": 0.3967, "step": 20245 }, { "epoch": 2.361240671641791, "grad_norm": 0.38489036732200466, "learning_rate": 3.14051852542742e-05, "loss": 0.3806, "step": 20250 }, { "epoch": 2.3618236940298507, "grad_norm": 0.414997237373693, "learning_rate": 3.1396640555082665e-05, "loss": 0.4315, "step": 20255 }, { "epoch": 2.3624067164179103, "grad_norm": 0.399401219366884, "learning_rate": 3.1388095276498013e-05, "loss": 0.4272, "step": 20260 }, { "epoch": 2.36298973880597, "grad_norm": 0.43657244502649967, "learning_rate": 3.137954941979085e-05, "loss": 0.3999, "step": 20265 }, { "epoch": 2.36357276119403, "grad_norm": 0.5097443478752106, "learning_rate": 3.1371002986231855e-05, "loss": 0.3874, "step": 20270 }, { "epoch": 2.3641557835820897, "grad_norm": 0.4216243846227505, "learning_rate": 3.136245597709181e-05, "loss": 0.4382, "step": 20275 }, { "epoch": 2.3647388059701493, "grad_norm": 0.3834158761183634, "learning_rate": 3.1353908393641574e-05, "loss": 0.3912, "step": 20280 }, { "epoch": 2.365321828358209, "grad_norm": 0.4730070704366648, "learning_rate": 3.134536023715207e-05, "loss": 0.4071, "step": 20285 }, { "epoch": 2.3659048507462686, "grad_norm": 0.42336283969259897, "learning_rate": 3.133681150889434e-05, "loss": 0.4033, "step": 20290 }, { "epoch": 2.366487873134328, "grad_norm": 0.41487489597089183, "learning_rate": 3.1328262210139515e-05, "loss": 0.4041, "step": 20295 }, { "epoch": 2.3670708955223883, "grad_norm": 0.4143196669283807, "learning_rate": 3.131971234215877e-05, "loss": 0.4204, "step": 20300 }, { "epoch": 2.367653917910448, "grad_norm": 0.4192282635638447, "learning_rate": 3.13111619062234e-05, "loss": 0.4249, "step": 20305 }, { "epoch": 2.3682369402985075, "grad_norm": 0.4281030980622985, "learning_rate": 3.1302610903604775e-05, "loss": 0.3837, "step": 20310 }, { "epoch": 2.368819962686567, "grad_norm": 0.42012529862171527, "learning_rate": 3.129405933557433e-05, "loss": 0.4065, "step": 20315 }, { "epoch": 2.3694029850746268, "grad_norm": 0.4280964987224729, "learning_rate": 3.128550720340362e-05, "loss": 0.4199, "step": 20320 }, { "epoch": 2.3699860074626864, "grad_norm": 0.40572184861344535, "learning_rate": 3.127695450836426e-05, "loss": 0.386, "step": 20325 }, { "epoch": 2.3705690298507465, "grad_norm": 0.3987095331284861, "learning_rate": 3.126840125172795e-05, "loss": 0.4256, "step": 20330 }, { "epoch": 2.371152052238806, "grad_norm": 0.4041729121660031, "learning_rate": 3.125984743476648e-05, "loss": 0.4079, "step": 20335 }, { "epoch": 2.3717350746268657, "grad_norm": 0.40515378758058834, "learning_rate": 3.125129305875172e-05, "loss": 0.4276, "step": 20340 }, { "epoch": 2.3723180970149254, "grad_norm": 0.4367449436443719, "learning_rate": 3.1242738124955624e-05, "loss": 0.4252, "step": 20345 }, { "epoch": 2.372901119402985, "grad_norm": 0.4232215775395212, "learning_rate": 3.1234182634650234e-05, "loss": 0.4135, "step": 20350 }, { "epoch": 2.3734841417910446, "grad_norm": 0.49297842768262384, "learning_rate": 3.122562658910765e-05, "loss": 0.4111, "step": 20355 }, { "epoch": 2.3740671641791042, "grad_norm": 0.4097608023873178, "learning_rate": 3.1217069989600097e-05, "loss": 0.4213, "step": 20360 }, { "epoch": 2.3746501865671643, "grad_norm": 0.4085027008391505, "learning_rate": 3.1208512837399856e-05, "loss": 0.3976, "step": 20365 }, { "epoch": 2.375233208955224, "grad_norm": 0.4185490070440442, "learning_rate": 3.119995513377928e-05, "loss": 0.4022, "step": 20370 }, { "epoch": 2.3758162313432836, "grad_norm": 0.3782251945072346, "learning_rate": 3.119139688001082e-05, "loss": 0.4073, "step": 20375 }, { "epoch": 2.376399253731343, "grad_norm": 0.43857261590896524, "learning_rate": 3.118283807736703e-05, "loss": 0.4336, "step": 20380 }, { "epoch": 2.376982276119403, "grad_norm": 0.4625141637612915, "learning_rate": 3.1174278727120496e-05, "loss": 0.405, "step": 20385 }, { "epoch": 2.377565298507463, "grad_norm": 0.4151689515085908, "learning_rate": 3.1165718830543914e-05, "loss": 0.4239, "step": 20390 }, { "epoch": 2.3781483208955225, "grad_norm": 0.41081737455149997, "learning_rate": 3.115715838891007e-05, "loss": 0.4171, "step": 20395 }, { "epoch": 2.378731343283582, "grad_norm": 0.4750578098368241, "learning_rate": 3.1148597403491816e-05, "loss": 0.4378, "step": 20400 }, { "epoch": 2.379314365671642, "grad_norm": 0.4357376291208132, "learning_rate": 3.114003587556208e-05, "loss": 0.4245, "step": 20405 }, { "epoch": 2.3798973880597014, "grad_norm": 0.4011573642351387, "learning_rate": 3.1131473806393876e-05, "loss": 0.3972, "step": 20410 }, { "epoch": 2.380480410447761, "grad_norm": 0.397300079143492, "learning_rate": 3.112291119726032e-05, "loss": 0.4254, "step": 20415 }, { "epoch": 2.3810634328358207, "grad_norm": 0.4069344376856726, "learning_rate": 3.1114348049434583e-05, "loss": 0.3848, "step": 20420 }, { "epoch": 2.3816464552238807, "grad_norm": 0.3974674651436214, "learning_rate": 3.110578436418992e-05, "loss": 0.3926, "step": 20425 }, { "epoch": 2.3822294776119404, "grad_norm": 0.4521801060123563, "learning_rate": 3.109722014279967e-05, "loss": 0.4047, "step": 20430 }, { "epoch": 2.3828125, "grad_norm": 0.39620834393615184, "learning_rate": 3.108865538653725e-05, "loss": 0.3763, "step": 20435 }, { "epoch": 2.3833955223880596, "grad_norm": 0.4379716188799694, "learning_rate": 3.108009009667615e-05, "loss": 0.3971, "step": 20440 }, { "epoch": 2.3839785447761193, "grad_norm": 0.40168351093725196, "learning_rate": 3.1071524274489966e-05, "loss": 0.4279, "step": 20445 }, { "epoch": 2.3845615671641793, "grad_norm": 0.4470968221721542, "learning_rate": 3.106295792125233e-05, "loss": 0.4263, "step": 20450 }, { "epoch": 2.385144589552239, "grad_norm": 0.45990971240797274, "learning_rate": 3.1054391038237e-05, "loss": 0.4429, "step": 20455 }, { "epoch": 2.3857276119402986, "grad_norm": 0.42201629816681707, "learning_rate": 3.104582362671778e-05, "loss": 0.4311, "step": 20460 }, { "epoch": 2.386310634328358, "grad_norm": 0.3751079860934871, "learning_rate": 3.103725568796854e-05, "loss": 0.388, "step": 20465 }, { "epoch": 2.386893656716418, "grad_norm": 0.4332472386759332, "learning_rate": 3.102868722326328e-05, "loss": 0.4032, "step": 20470 }, { "epoch": 2.3874766791044775, "grad_norm": 0.45049407808338693, "learning_rate": 3.102011823387605e-05, "loss": 0.401, "step": 20475 }, { "epoch": 2.388059701492537, "grad_norm": 0.4071153872700178, "learning_rate": 3.1011548721080955e-05, "loss": 0.4025, "step": 20480 }, { "epoch": 2.388642723880597, "grad_norm": 0.4776890821807428, "learning_rate": 3.100297868615222e-05, "loss": 0.4094, "step": 20485 }, { "epoch": 2.389225746268657, "grad_norm": 0.42967822194216654, "learning_rate": 3.099440813036411e-05, "loss": 0.4142, "step": 20490 }, { "epoch": 2.3898087686567164, "grad_norm": 0.39970921355549716, "learning_rate": 3.0985837054990983e-05, "loss": 0.4014, "step": 20495 }, { "epoch": 2.390391791044776, "grad_norm": 0.40907692527245637, "learning_rate": 3.097726546130729e-05, "loss": 0.3832, "step": 20500 }, { "epoch": 2.3909748134328357, "grad_norm": 0.42182387212132105, "learning_rate": 3.096869335058755e-05, "loss": 0.4126, "step": 20505 }, { "epoch": 2.3915578358208958, "grad_norm": 0.3907488142464014, "learning_rate": 3.096012072410633e-05, "loss": 0.4078, "step": 20510 }, { "epoch": 2.3921408582089554, "grad_norm": 0.42905064578366786, "learning_rate": 3.095154758313831e-05, "loss": 0.4209, "step": 20515 }, { "epoch": 2.392723880597015, "grad_norm": 0.42243047199337486, "learning_rate": 3.094297392895825e-05, "loss": 0.3888, "step": 20520 }, { "epoch": 2.3933069029850746, "grad_norm": 0.45036621292222845, "learning_rate": 3.093439976284094e-05, "loss": 0.4333, "step": 20525 }, { "epoch": 2.3938899253731343, "grad_norm": 0.4137249071571011, "learning_rate": 3.0925825086061295e-05, "loss": 0.4103, "step": 20530 }, { "epoch": 2.394472947761194, "grad_norm": 0.463925441043269, "learning_rate": 3.0917249899894285e-05, "loss": 0.403, "step": 20535 }, { "epoch": 2.3950559701492535, "grad_norm": 0.44466459696270055, "learning_rate": 3.090867420561495e-05, "loss": 0.4264, "step": 20540 }, { "epoch": 2.3956389925373136, "grad_norm": 0.4789706272314867, "learning_rate": 3.090009800449842e-05, "loss": 0.4343, "step": 20545 }, { "epoch": 2.3962220149253732, "grad_norm": 0.4677081585453051, "learning_rate": 3.0891521297819906e-05, "loss": 0.4168, "step": 20550 }, { "epoch": 2.396805037313433, "grad_norm": 0.4394275863034301, "learning_rate": 3.088294408685466e-05, "loss": 0.4149, "step": 20555 }, { "epoch": 2.3973880597014925, "grad_norm": 0.44940923406140465, "learning_rate": 3.0874366372878036e-05, "loss": 0.4437, "step": 20560 }, { "epoch": 2.397971082089552, "grad_norm": 0.4048519685878374, "learning_rate": 3.086578815716548e-05, "loss": 0.4055, "step": 20565 }, { "epoch": 2.3985541044776117, "grad_norm": 0.39322982180209676, "learning_rate": 3.085720944099246e-05, "loss": 0.3879, "step": 20570 }, { "epoch": 2.399137126865672, "grad_norm": 0.4292387757119189, "learning_rate": 3.0848630225634564e-05, "loss": 0.3966, "step": 20575 }, { "epoch": 2.3997201492537314, "grad_norm": 0.39519598752823326, "learning_rate": 3.0840050512367444e-05, "loss": 0.3793, "step": 20580 }, { "epoch": 2.400303171641791, "grad_norm": 0.4441287436004875, "learning_rate": 3.08314703024668e-05, "loss": 0.39, "step": 20585 }, { "epoch": 2.4008861940298507, "grad_norm": 0.3926175895055523, "learning_rate": 3.082288959720845e-05, "loss": 0.4133, "step": 20590 }, { "epoch": 2.4014692164179103, "grad_norm": 0.452675435761681, "learning_rate": 3.081430839786825e-05, "loss": 0.4103, "step": 20595 }, { "epoch": 2.40205223880597, "grad_norm": 0.4211404487727711, "learning_rate": 3.0805726705722156e-05, "loss": 0.42, "step": 20600 }, { "epoch": 2.40263526119403, "grad_norm": 0.42564064678254915, "learning_rate": 3.079714452204617e-05, "loss": 0.3877, "step": 20605 }, { "epoch": 2.4032182835820897, "grad_norm": 0.3929680554251429, "learning_rate": 3.078856184811638e-05, "loss": 0.3891, "step": 20610 }, { "epoch": 2.4038013059701493, "grad_norm": 0.4530599295376277, "learning_rate": 3.0779978685208956e-05, "loss": 0.3993, "step": 20615 }, { "epoch": 2.404384328358209, "grad_norm": 0.4036391934769732, "learning_rate": 3.077139503460012e-05, "loss": 0.4197, "step": 20620 }, { "epoch": 2.4049673507462686, "grad_norm": 0.40603035443045027, "learning_rate": 3.0762810897566184e-05, "loss": 0.3807, "step": 20625 }, { "epoch": 2.405550373134328, "grad_norm": 0.48643919266582186, "learning_rate": 3.0754226275383546e-05, "loss": 0.4486, "step": 20630 }, { "epoch": 2.4061333955223883, "grad_norm": 0.44987864557727897, "learning_rate": 3.0745641169328627e-05, "loss": 0.4137, "step": 20635 }, { "epoch": 2.406716417910448, "grad_norm": 0.3967160264721621, "learning_rate": 3.073705558067797e-05, "loss": 0.4116, "step": 20640 }, { "epoch": 2.4072994402985075, "grad_norm": 0.43400882918751754, "learning_rate": 3.072846951070816e-05, "loss": 0.4211, "step": 20645 }, { "epoch": 2.407882462686567, "grad_norm": 0.43332322513148747, "learning_rate": 3.071988296069586e-05, "loss": 0.4159, "step": 20650 }, { "epoch": 2.4084654850746268, "grad_norm": 0.40809729263624933, "learning_rate": 3.071129593191783e-05, "loss": 0.4201, "step": 20655 }, { "epoch": 2.4090485074626864, "grad_norm": 0.441248257372105, "learning_rate": 3.070270842565084e-05, "loss": 0.4051, "step": 20660 }, { "epoch": 2.4096315298507465, "grad_norm": 0.4013710020223029, "learning_rate": 3.069412044317181e-05, "loss": 0.3836, "step": 20665 }, { "epoch": 2.410214552238806, "grad_norm": 0.41433068535226436, "learning_rate": 3.068553198575767e-05, "loss": 0.4328, "step": 20670 }, { "epoch": 2.4107975746268657, "grad_norm": 0.4493111978580927, "learning_rate": 3.0676943054685445e-05, "loss": 0.3994, "step": 20675 }, { "epoch": 2.4113805970149254, "grad_norm": 0.46036109843736306, "learning_rate": 3.0668353651232226e-05, "loss": 0.417, "step": 20680 }, { "epoch": 2.411963619402985, "grad_norm": 0.41046467832975453, "learning_rate": 3.065976377667517e-05, "loss": 0.3921, "step": 20685 }, { "epoch": 2.4125466417910446, "grad_norm": 0.41393769625874877, "learning_rate": 3.065117343229153e-05, "loss": 0.3999, "step": 20690 }, { "epoch": 2.4131296641791042, "grad_norm": 0.4683255889770547, "learning_rate": 3.0642582619358576e-05, "loss": 0.4093, "step": 20695 }, { "epoch": 2.4137126865671643, "grad_norm": 0.43977471510009114, "learning_rate": 3.063399133915371e-05, "loss": 0.4025, "step": 20700 }, { "epoch": 2.414295708955224, "grad_norm": 0.41258514848682676, "learning_rate": 3.0625399592954346e-05, "loss": 0.3768, "step": 20705 }, { "epoch": 2.4148787313432836, "grad_norm": 0.4084008696090954, "learning_rate": 3.0616807382038016e-05, "loss": 0.3983, "step": 20710 }, { "epoch": 2.415461753731343, "grad_norm": 0.4795776151542908, "learning_rate": 3.0608214707682286e-05, "loss": 0.3944, "step": 20715 }, { "epoch": 2.416044776119403, "grad_norm": 0.43468743305130253, "learning_rate": 3.059962157116481e-05, "loss": 0.3776, "step": 20720 }, { "epoch": 2.416627798507463, "grad_norm": 0.39826732371846674, "learning_rate": 3.059102797376331e-05, "loss": 0.4066, "step": 20725 }, { "epoch": 2.4172108208955225, "grad_norm": 0.4738603577348725, "learning_rate": 3.058243391675557e-05, "loss": 0.4203, "step": 20730 }, { "epoch": 2.417793843283582, "grad_norm": 0.4621376024557958, "learning_rate": 3.0573839401419426e-05, "loss": 0.4172, "step": 20735 }, { "epoch": 2.418376865671642, "grad_norm": 0.4307459637540691, "learning_rate": 3.056524442903282e-05, "loss": 0.4082, "step": 20740 }, { "epoch": 2.4189598880597014, "grad_norm": 0.41965207734947385, "learning_rate": 3.055664900087374e-05, "loss": 0.4192, "step": 20745 }, { "epoch": 2.419542910447761, "grad_norm": 0.4079039339608424, "learning_rate": 3.054805311822023e-05, "loss": 0.4219, "step": 20750 }, { "epoch": 2.4201259328358207, "grad_norm": 0.4156154188703871, "learning_rate": 3.0539456782350436e-05, "loss": 0.3912, "step": 20755 }, { "epoch": 2.4207089552238807, "grad_norm": 0.37832895205507644, "learning_rate": 3.053085999454254e-05, "loss": 0.4041, "step": 20760 }, { "epoch": 2.4212919776119404, "grad_norm": 0.4387467174365725, "learning_rate": 3.0522262756074796e-05, "loss": 0.4251, "step": 20765 }, { "epoch": 2.421875, "grad_norm": 0.4026709072983937, "learning_rate": 3.051366506822554e-05, "loss": 0.4225, "step": 20770 }, { "epoch": 2.4224580223880596, "grad_norm": 0.399075731089508, "learning_rate": 3.0505066932273157e-05, "loss": 0.3929, "step": 20775 }, { "epoch": 2.4230410447761193, "grad_norm": 0.4104673891549445, "learning_rate": 3.0496468349496115e-05, "loss": 0.4161, "step": 20780 }, { "epoch": 2.4236240671641793, "grad_norm": 0.44960082229866133, "learning_rate": 3.0487869321172947e-05, "loss": 0.4242, "step": 20785 }, { "epoch": 2.424207089552239, "grad_norm": 0.46803264518164983, "learning_rate": 3.047926984858223e-05, "loss": 0.4248, "step": 20790 }, { "epoch": 2.4247901119402986, "grad_norm": 0.4458071023091181, "learning_rate": 3.047066993300264e-05, "loss": 0.4182, "step": 20795 }, { "epoch": 2.425373134328358, "grad_norm": 0.4146868850445553, "learning_rate": 3.046206957571288e-05, "loss": 0.3912, "step": 20800 }, { "epoch": 2.425956156716418, "grad_norm": 0.39914570066824695, "learning_rate": 3.0453468777991768e-05, "loss": 0.4008, "step": 20805 }, { "epoch": 2.4265391791044775, "grad_norm": 0.4182645339921344, "learning_rate": 3.0444867541118145e-05, "loss": 0.4193, "step": 20810 }, { "epoch": 2.427122201492537, "grad_norm": 0.4320660646550807, "learning_rate": 3.0436265866370922e-05, "loss": 0.4025, "step": 20815 }, { "epoch": 2.427705223880597, "grad_norm": 0.4129958951278084, "learning_rate": 3.0427663755029108e-05, "loss": 0.3835, "step": 20820 }, { "epoch": 2.428288246268657, "grad_norm": 0.4526823821048915, "learning_rate": 3.041906120837174e-05, "loss": 0.4081, "step": 20825 }, { "epoch": 2.4288712686567164, "grad_norm": 0.43042241753095895, "learning_rate": 3.0410458227677934e-05, "loss": 0.4141, "step": 20830 }, { "epoch": 2.429454291044776, "grad_norm": 0.4220190332194512, "learning_rate": 3.040185481422689e-05, "loss": 0.4066, "step": 20835 }, { "epoch": 2.4300373134328357, "grad_norm": 0.4289775459163234, "learning_rate": 3.0393250969297826e-05, "loss": 0.4104, "step": 20840 }, { "epoch": 2.4306203358208958, "grad_norm": 0.4199362912881698, "learning_rate": 3.0384646694170073e-05, "loss": 0.4292, "step": 20845 }, { "epoch": 2.4312033582089554, "grad_norm": 0.41385623867411475, "learning_rate": 3.0376041990122983e-05, "loss": 0.404, "step": 20850 }, { "epoch": 2.431786380597015, "grad_norm": 0.40814181771152663, "learning_rate": 3.036743685843601e-05, "loss": 0.439, "step": 20855 }, { "epoch": 2.4323694029850746, "grad_norm": 0.4162244955391998, "learning_rate": 3.0358831300388657e-05, "loss": 0.4025, "step": 20860 }, { "epoch": 2.4329524253731343, "grad_norm": 0.4072332592261159, "learning_rate": 3.035022531726047e-05, "loss": 0.3916, "step": 20865 }, { "epoch": 2.433535447761194, "grad_norm": 0.4274678052433978, "learning_rate": 3.0341618910331093e-05, "loss": 0.407, "step": 20870 }, { "epoch": 2.4341184701492535, "grad_norm": 0.4094881177576757, "learning_rate": 3.0333012080880207e-05, "loss": 0.3993, "step": 20875 }, { "epoch": 2.4347014925373136, "grad_norm": 0.4313901078848929, "learning_rate": 3.0324404830187564e-05, "loss": 0.42, "step": 20880 }, { "epoch": 2.4352845149253732, "grad_norm": 0.4549308035397354, "learning_rate": 3.0315797159532995e-05, "loss": 0.4303, "step": 20885 }, { "epoch": 2.435867537313433, "grad_norm": 0.40035506096441525, "learning_rate": 3.0307189070196358e-05, "loss": 0.3914, "step": 20890 }, { "epoch": 2.4364505597014925, "grad_norm": 0.39996703863221555, "learning_rate": 3.0298580563457606e-05, "loss": 0.3742, "step": 20895 }, { "epoch": 2.437033582089552, "grad_norm": 0.4314370923694186, "learning_rate": 3.0289971640596737e-05, "loss": 0.409, "step": 20900 }, { "epoch": 2.4376166044776117, "grad_norm": 0.42296438302299455, "learning_rate": 3.0281362302893822e-05, "loss": 0.4043, "step": 20905 }, { "epoch": 2.438199626865672, "grad_norm": 0.43148416696910497, "learning_rate": 3.0272752551628975e-05, "loss": 0.3851, "step": 20910 }, { "epoch": 2.4387826492537314, "grad_norm": 0.4305457157633837, "learning_rate": 3.026414238808239e-05, "loss": 0.3934, "step": 20915 }, { "epoch": 2.439365671641791, "grad_norm": 0.4237369702223498, "learning_rate": 3.0255531813534322e-05, "loss": 0.4, "step": 20920 }, { "epoch": 2.4399486940298507, "grad_norm": 0.44021761828619876, "learning_rate": 3.0246920829265067e-05, "loss": 0.4381, "step": 20925 }, { "epoch": 2.4405317164179103, "grad_norm": 0.4255673618813955, "learning_rate": 3.0238309436555e-05, "loss": 0.4051, "step": 20930 }, { "epoch": 2.44111473880597, "grad_norm": 0.39474819727816296, "learning_rate": 3.0229697636684568e-05, "loss": 0.389, "step": 20935 }, { "epoch": 2.44169776119403, "grad_norm": 0.4081173436729887, "learning_rate": 3.022108543093425e-05, "loss": 0.4282, "step": 20940 }, { "epoch": 2.4422807835820897, "grad_norm": 0.44756554485405237, "learning_rate": 3.0212472820584587e-05, "loss": 0.3982, "step": 20945 }, { "epoch": 2.4428638059701493, "grad_norm": 0.4379530237401978, "learning_rate": 3.020385980691621e-05, "loss": 0.4291, "step": 20950 }, { "epoch": 2.443446828358209, "grad_norm": 0.38755292941765784, "learning_rate": 3.019524639120979e-05, "loss": 0.4263, "step": 20955 }, { "epoch": 2.4440298507462686, "grad_norm": 0.4410023360525595, "learning_rate": 3.0186632574746055e-05, "loss": 0.3812, "step": 20960 }, { "epoch": 2.444612873134328, "grad_norm": 0.41835013471968363, "learning_rate": 3.0178018358805793e-05, "loss": 0.3962, "step": 20965 }, { "epoch": 2.4451958955223883, "grad_norm": 0.4288347118406861, "learning_rate": 3.016940374466986e-05, "loss": 0.407, "step": 20970 }, { "epoch": 2.445778917910448, "grad_norm": 0.40810092040353635, "learning_rate": 3.0160788733619167e-05, "loss": 0.409, "step": 20975 }, { "epoch": 2.4463619402985075, "grad_norm": 0.38803359970739254, "learning_rate": 3.0152173326934692e-05, "loss": 0.4163, "step": 20980 }, { "epoch": 2.446944962686567, "grad_norm": 0.44456045122082916, "learning_rate": 3.0143557525897444e-05, "loss": 0.4199, "step": 20985 }, { "epoch": 2.4475279850746268, "grad_norm": 0.5765165346297749, "learning_rate": 3.0134941331788525e-05, "loss": 0.3912, "step": 20990 }, { "epoch": 2.4481110074626864, "grad_norm": 0.39749837135133814, "learning_rate": 3.0126324745889067e-05, "loss": 0.3937, "step": 20995 }, { "epoch": 2.4486940298507465, "grad_norm": 0.41658626971422624, "learning_rate": 3.0117707769480285e-05, "loss": 0.3956, "step": 21000 }, { "epoch": 2.449277052238806, "grad_norm": 0.4174717512204487, "learning_rate": 3.0109090403843448e-05, "loss": 0.3994, "step": 21005 }, { "epoch": 2.4498600746268657, "grad_norm": 0.3668055516956151, "learning_rate": 3.0100472650259866e-05, "loss": 0.383, "step": 21010 }, { "epoch": 2.4504430970149254, "grad_norm": 0.46167332370755626, "learning_rate": 3.0091854510010907e-05, "loss": 0.4617, "step": 21015 }, { "epoch": 2.451026119402985, "grad_norm": 0.41320660992640296, "learning_rate": 3.008323598437802e-05, "loss": 0.3861, "step": 21020 }, { "epoch": 2.4516091417910446, "grad_norm": 0.4173103635473449, "learning_rate": 3.0074617074642693e-05, "loss": 0.4062, "step": 21025 }, { "epoch": 2.4521921641791042, "grad_norm": 0.415988371828538, "learning_rate": 3.006599778208647e-05, "loss": 0.3943, "step": 21030 }, { "epoch": 2.4527751865671643, "grad_norm": 0.43831470089612806, "learning_rate": 3.005737810799097e-05, "loss": 0.3961, "step": 21035 }, { "epoch": 2.453358208955224, "grad_norm": 0.44906331098002017, "learning_rate": 3.0048758053637844e-05, "loss": 0.4064, "step": 21040 }, { "epoch": 2.4539412313432836, "grad_norm": 0.39363363505423504, "learning_rate": 3.0040137620308812e-05, "loss": 0.4004, "step": 21045 }, { "epoch": 2.454524253731343, "grad_norm": 0.4259684359070059, "learning_rate": 3.0031516809285658e-05, "loss": 0.4237, "step": 21050 }, { "epoch": 2.455107276119403, "grad_norm": 0.4536016468944285, "learning_rate": 3.0022895621850207e-05, "loss": 0.4114, "step": 21055 }, { "epoch": 2.455690298507463, "grad_norm": 0.43620568966654055, "learning_rate": 3.001427405928435e-05, "loss": 0.4059, "step": 21060 }, { "epoch": 2.4562733208955225, "grad_norm": 0.4021623529310093, "learning_rate": 3.0005652122870032e-05, "loss": 0.4138, "step": 21065 }, { "epoch": 2.456856343283582, "grad_norm": 0.441213535522327, "learning_rate": 2.999702981388925e-05, "loss": 0.4089, "step": 21070 }, { "epoch": 2.457439365671642, "grad_norm": 0.46302774269362845, "learning_rate": 2.9988407133624057e-05, "loss": 0.4504, "step": 21075 }, { "epoch": 2.4580223880597014, "grad_norm": 0.40340865628322886, "learning_rate": 2.9979784083356567e-05, "loss": 0.3934, "step": 21080 }, { "epoch": 2.458605410447761, "grad_norm": 0.4825397672970459, "learning_rate": 2.9971160664368946e-05, "loss": 0.4174, "step": 21085 }, { "epoch": 2.4591884328358207, "grad_norm": 0.4433655216883402, "learning_rate": 2.996253687794341e-05, "loss": 0.4306, "step": 21090 }, { "epoch": 2.4597714552238807, "grad_norm": 0.3525179960536248, "learning_rate": 2.9953912725362225e-05, "loss": 0.3879, "step": 21095 }, { "epoch": 2.4603544776119404, "grad_norm": 0.45390894009834903, "learning_rate": 2.994528820790774e-05, "loss": 0.4135, "step": 21100 }, { "epoch": 2.4609375, "grad_norm": 0.4805990267234585, "learning_rate": 2.9936663326862323e-05, "loss": 0.4009, "step": 21105 }, { "epoch": 2.4615205223880596, "grad_norm": 0.45127157246378136, "learning_rate": 2.9928038083508415e-05, "loss": 0.4159, "step": 21110 }, { "epoch": 2.4621035447761193, "grad_norm": 0.3990167290637741, "learning_rate": 2.9919412479128513e-05, "loss": 0.4213, "step": 21115 }, { "epoch": 2.4626865671641793, "grad_norm": 0.4237021041873273, "learning_rate": 2.9910786515005146e-05, "loss": 0.3822, "step": 21120 }, { "epoch": 2.463269589552239, "grad_norm": 0.4116002355791405, "learning_rate": 2.990216019242093e-05, "loss": 0.4163, "step": 21125 }, { "epoch": 2.4638526119402986, "grad_norm": 0.43391250160781775, "learning_rate": 2.9893533512658507e-05, "loss": 0.4002, "step": 21130 }, { "epoch": 2.464435634328358, "grad_norm": 0.40413378811255374, "learning_rate": 2.988490647700058e-05, "loss": 0.4124, "step": 21135 }, { "epoch": 2.465018656716418, "grad_norm": 0.38966204506682905, "learning_rate": 2.987627908672992e-05, "loss": 0.3958, "step": 21140 }, { "epoch": 2.4656016791044775, "grad_norm": 0.4629556379149426, "learning_rate": 2.9867651343129315e-05, "loss": 0.4225, "step": 21145 }, { "epoch": 2.466184701492537, "grad_norm": 0.394750285059733, "learning_rate": 2.9859023247481644e-05, "loss": 0.4123, "step": 21150 }, { "epoch": 2.466767723880597, "grad_norm": 0.3900887470754676, "learning_rate": 2.985039480106982e-05, "loss": 0.3894, "step": 21155 }, { "epoch": 2.467350746268657, "grad_norm": 0.44726068748955106, "learning_rate": 2.9841766005176808e-05, "loss": 0.4169, "step": 21160 }, { "epoch": 2.4679337686567164, "grad_norm": 0.40366600935594904, "learning_rate": 2.983313686108563e-05, "loss": 0.4198, "step": 21165 }, { "epoch": 2.468516791044776, "grad_norm": 0.4251537162507513, "learning_rate": 2.982450737007935e-05, "loss": 0.4009, "step": 21170 }, { "epoch": 2.4690998134328357, "grad_norm": 0.413453973855533, "learning_rate": 2.9815877533441107e-05, "loss": 0.3987, "step": 21175 }, { "epoch": 2.4696828358208958, "grad_norm": 0.4164854573390885, "learning_rate": 2.9807247352454055e-05, "loss": 0.3866, "step": 21180 }, { "epoch": 2.4702658582089554, "grad_norm": 0.42380988885977333, "learning_rate": 2.9798616828401428e-05, "loss": 0.4139, "step": 21185 }, { "epoch": 2.470848880597015, "grad_norm": 0.3943042370713538, "learning_rate": 2.9789985962566503e-05, "loss": 0.4144, "step": 21190 }, { "epoch": 2.4714319029850746, "grad_norm": 0.43740794486958573, "learning_rate": 2.9781354756232604e-05, "loss": 0.4135, "step": 21195 }, { "epoch": 2.4720149253731343, "grad_norm": 0.41163586037059013, "learning_rate": 2.977272321068311e-05, "loss": 0.3922, "step": 21200 }, { "epoch": 2.472597947761194, "grad_norm": 0.4222108796080354, "learning_rate": 2.9764091327201456e-05, "loss": 0.3814, "step": 21205 }, { "epoch": 2.4731809701492535, "grad_norm": 0.4245857820065939, "learning_rate": 2.975545910707111e-05, "loss": 0.3924, "step": 21210 }, { "epoch": 2.4737639925373136, "grad_norm": 0.468031541203365, "learning_rate": 2.9746826551575606e-05, "loss": 0.4196, "step": 21215 }, { "epoch": 2.4743470149253732, "grad_norm": 0.47749102565899015, "learning_rate": 2.9738193661998526e-05, "loss": 0.4182, "step": 21220 }, { "epoch": 2.474930037313433, "grad_norm": 0.41588474180728796, "learning_rate": 2.9729560439623484e-05, "loss": 0.3983, "step": 21225 }, { "epoch": 2.4755130597014925, "grad_norm": 0.4155839342233435, "learning_rate": 2.9720926885734167e-05, "loss": 0.4143, "step": 21230 }, { "epoch": 2.476096082089552, "grad_norm": 0.4085486152492274, "learning_rate": 2.97122930016143e-05, "loss": 0.3968, "step": 21235 }, { "epoch": 2.4766791044776117, "grad_norm": 0.4216718598522093, "learning_rate": 2.9703658788547674e-05, "loss": 0.3694, "step": 21240 }, { "epoch": 2.477262126865672, "grad_norm": 0.4102876666531931, "learning_rate": 2.9695024247818088e-05, "loss": 0.42, "step": 21245 }, { "epoch": 2.4778451492537314, "grad_norm": 0.40350132209784795, "learning_rate": 2.968638938070942e-05, "loss": 0.4171, "step": 21250 }, { "epoch": 2.478428171641791, "grad_norm": 0.40859983213586976, "learning_rate": 2.9677754188505614e-05, "loss": 0.4041, "step": 21255 }, { "epoch": 2.4790111940298507, "grad_norm": 0.3986772690785753, "learning_rate": 2.9669118672490627e-05, "loss": 0.395, "step": 21260 }, { "epoch": 2.4795942164179103, "grad_norm": 0.46926280945104276, "learning_rate": 2.9660482833948466e-05, "loss": 0.4449, "step": 21265 }, { "epoch": 2.48017723880597, "grad_norm": 0.42229644993874677, "learning_rate": 2.9651846674163208e-05, "loss": 0.4274, "step": 21270 }, { "epoch": 2.48076026119403, "grad_norm": 0.4801079403961941, "learning_rate": 2.964321019441898e-05, "loss": 0.4282, "step": 21275 }, { "epoch": 2.4813432835820897, "grad_norm": 0.4190103918754208, "learning_rate": 2.9634573395999916e-05, "loss": 0.4421, "step": 21280 }, { "epoch": 2.4819263059701493, "grad_norm": 0.3943065446933593, "learning_rate": 2.962593628019024e-05, "loss": 0.3797, "step": 21285 }, { "epoch": 2.482509328358209, "grad_norm": 0.5370051602481157, "learning_rate": 2.9617298848274223e-05, "loss": 0.4006, "step": 21290 }, { "epoch": 2.4830923507462686, "grad_norm": 0.7458422677624169, "learning_rate": 2.960866110153614e-05, "loss": 0.4127, "step": 21295 }, { "epoch": 2.483675373134328, "grad_norm": 0.3882811930877767, "learning_rate": 2.9600023041260355e-05, "loss": 0.3721, "step": 21300 }, { "epoch": 2.4842583955223883, "grad_norm": 0.44801344023130407, "learning_rate": 2.9591384668731264e-05, "loss": 0.402, "step": 21305 }, { "epoch": 2.484841417910448, "grad_norm": 0.4531884106552282, "learning_rate": 2.9582745985233312e-05, "loss": 0.43, "step": 21310 }, { "epoch": 2.4854244402985075, "grad_norm": 0.38787451045843185, "learning_rate": 2.9574106992050993e-05, "loss": 0.4139, "step": 21315 }, { "epoch": 2.486007462686567, "grad_norm": 0.4231218200420361, "learning_rate": 2.9565467690468834e-05, "loss": 0.3824, "step": 21320 }, { "epoch": 2.4865904850746268, "grad_norm": 0.4236931079009935, "learning_rate": 2.9556828081771413e-05, "loss": 0.3923, "step": 21325 }, { "epoch": 2.4871735074626864, "grad_norm": 0.41850927778702807, "learning_rate": 2.9548188167243372e-05, "loss": 0.3824, "step": 21330 }, { "epoch": 2.4877565298507465, "grad_norm": 0.4570955094695527, "learning_rate": 2.953954794816937e-05, "loss": 0.4263, "step": 21335 }, { "epoch": 2.488339552238806, "grad_norm": 0.41066886065497704, "learning_rate": 2.953090742583413e-05, "loss": 0.3996, "step": 21340 }, { "epoch": 2.4889225746268657, "grad_norm": 0.4023001356178296, "learning_rate": 2.952226660152242e-05, "loss": 0.4348, "step": 21345 }, { "epoch": 2.4895055970149254, "grad_norm": 0.37656867274528566, "learning_rate": 2.951362547651903e-05, "loss": 0.418, "step": 21350 }, { "epoch": 2.490088619402985, "grad_norm": 0.4609871369769133, "learning_rate": 2.950498405210883e-05, "loss": 0.4038, "step": 21355 }, { "epoch": 2.4906716417910446, "grad_norm": 0.40441805010449483, "learning_rate": 2.949634232957671e-05, "loss": 0.4375, "step": 21360 }, { "epoch": 2.4912546641791042, "grad_norm": 0.3979780416205592, "learning_rate": 2.9487700310207618e-05, "loss": 0.4188, "step": 21365 }, { "epoch": 2.4918376865671643, "grad_norm": 0.41771420923256114, "learning_rate": 2.9479057995286528e-05, "loss": 0.4149, "step": 21370 }, { "epoch": 2.492420708955224, "grad_norm": 0.5541303106519052, "learning_rate": 2.947041538609848e-05, "loss": 0.385, "step": 21375 }, { "epoch": 2.4930037313432836, "grad_norm": 0.4486629116161178, "learning_rate": 2.9461772483928547e-05, "loss": 0.411, "step": 21380 }, { "epoch": 2.493586753731343, "grad_norm": 0.3995070406681024, "learning_rate": 2.9453129290061832e-05, "loss": 0.4122, "step": 21385 }, { "epoch": 2.494169776119403, "grad_norm": 0.4204055617537802, "learning_rate": 2.944448580578351e-05, "loss": 0.408, "step": 21390 }, { "epoch": 2.494752798507463, "grad_norm": 0.40183549257717155, "learning_rate": 2.9435842032378778e-05, "loss": 0.3994, "step": 21395 }, { "epoch": 2.4953358208955225, "grad_norm": 0.4664775080112823, "learning_rate": 2.9427197971132886e-05, "loss": 0.4167, "step": 21400 }, { "epoch": 2.495918843283582, "grad_norm": 0.4184877981207242, "learning_rate": 2.941855362333112e-05, "loss": 0.396, "step": 21405 }, { "epoch": 2.496501865671642, "grad_norm": 0.41181796305763557, "learning_rate": 2.9409908990258812e-05, "loss": 0.4087, "step": 21410 }, { "epoch": 2.4970848880597014, "grad_norm": 0.430701979025966, "learning_rate": 2.9401264073201333e-05, "loss": 0.4299, "step": 21415 }, { "epoch": 2.497667910447761, "grad_norm": 0.4027967505927966, "learning_rate": 2.9392618873444112e-05, "loss": 0.39, "step": 21420 }, { "epoch": 2.4982509328358207, "grad_norm": 0.4355200232474978, "learning_rate": 2.938397339227259e-05, "loss": 0.4288, "step": 21425 }, { "epoch": 2.4988339552238807, "grad_norm": 0.4619928752591043, "learning_rate": 2.937532763097227e-05, "loss": 0.4215, "step": 21430 }, { "epoch": 2.4994169776119404, "grad_norm": 0.4095081145525622, "learning_rate": 2.936668159082871e-05, "loss": 0.3808, "step": 21435 }, { "epoch": 2.5, "grad_norm": 0.4215106073434386, "learning_rate": 2.9358035273127483e-05, "loss": 0.3971, "step": 21440 }, { "epoch": 2.5005830223880596, "grad_norm": 0.38973346358103333, "learning_rate": 2.9349388679154206e-05, "loss": 0.3973, "step": 21445 }, { "epoch": 2.5011660447761193, "grad_norm": 0.41835149099010577, "learning_rate": 2.934074181019455e-05, "loss": 0.3942, "step": 21450 }, { "epoch": 2.5017490671641793, "grad_norm": 0.4046609572605303, "learning_rate": 2.9332094667534238e-05, "loss": 0.3975, "step": 21455 }, { "epoch": 2.502332089552239, "grad_norm": 0.39363303197131777, "learning_rate": 2.9323447252458986e-05, "loss": 0.4111, "step": 21460 }, { "epoch": 2.5029151119402986, "grad_norm": 0.4110521497065824, "learning_rate": 2.9314799566254603e-05, "loss": 0.4108, "step": 21465 }, { "epoch": 2.503498134328358, "grad_norm": 0.43182190112430263, "learning_rate": 2.9306151610206916e-05, "loss": 0.4267, "step": 21470 }, { "epoch": 2.504081156716418, "grad_norm": 0.4301973632379895, "learning_rate": 2.9297503385601788e-05, "loss": 0.3874, "step": 21475 }, { "epoch": 2.5046641791044775, "grad_norm": 0.4293896023715753, "learning_rate": 2.9288854893725128e-05, "loss": 0.4208, "step": 21480 }, { "epoch": 2.505247201492537, "grad_norm": 0.4416509369399998, "learning_rate": 2.928020613586288e-05, "loss": 0.4042, "step": 21485 }, { "epoch": 2.505830223880597, "grad_norm": 0.41110271954851013, "learning_rate": 2.9271557113301047e-05, "loss": 0.4046, "step": 21490 }, { "epoch": 2.506413246268657, "grad_norm": 0.4247547878930185, "learning_rate": 2.9262907827325638e-05, "loss": 0.4093, "step": 21495 }, { "epoch": 2.5069962686567164, "grad_norm": 0.46742801105554627, "learning_rate": 2.9254258279222724e-05, "loss": 0.4068, "step": 21500 }, { "epoch": 2.507579291044776, "grad_norm": 0.43897513187606063, "learning_rate": 2.9245608470278417e-05, "loss": 0.3935, "step": 21505 }, { "epoch": 2.5081623134328357, "grad_norm": 0.3855522137341133, "learning_rate": 2.9236958401778854e-05, "loss": 0.4267, "step": 21510 }, { "epoch": 2.5087453358208958, "grad_norm": 0.4411823143289941, "learning_rate": 2.9228308075010213e-05, "loss": 0.441, "step": 21515 }, { "epoch": 2.5093283582089554, "grad_norm": 0.4196947934879396, "learning_rate": 2.921965749125873e-05, "loss": 0.4117, "step": 21520 }, { "epoch": 2.509911380597015, "grad_norm": 0.44916711873645215, "learning_rate": 2.9211006651810645e-05, "loss": 0.4058, "step": 21525 }, { "epoch": 2.5104944029850746, "grad_norm": 0.46830549267752497, "learning_rate": 2.920235555795227e-05, "loss": 0.4155, "step": 21530 }, { "epoch": 2.5110774253731343, "grad_norm": 0.4322880218878378, "learning_rate": 2.919370421096993e-05, "loss": 0.3937, "step": 21535 }, { "epoch": 2.511660447761194, "grad_norm": 0.41689030667715465, "learning_rate": 2.9185052612150004e-05, "loss": 0.4028, "step": 21540 }, { "epoch": 2.5122434701492535, "grad_norm": 0.44283367320919953, "learning_rate": 2.9176400762778906e-05, "loss": 0.4324, "step": 21545 }, { "epoch": 2.5128264925373136, "grad_norm": 0.4462371229480639, "learning_rate": 2.9167748664143067e-05, "loss": 0.3823, "step": 21550 }, { "epoch": 2.5134095149253732, "grad_norm": 0.36489194518166884, "learning_rate": 2.9159096317528985e-05, "loss": 0.3788, "step": 21555 }, { "epoch": 2.513992537313433, "grad_norm": 0.5003846722632077, "learning_rate": 2.9150443724223174e-05, "loss": 0.4258, "step": 21560 }, { "epoch": 2.5145755597014925, "grad_norm": 0.4205153403610952, "learning_rate": 2.91417908855122e-05, "loss": 0.4135, "step": 21565 }, { "epoch": 2.515158582089552, "grad_norm": 0.4036636588531879, "learning_rate": 2.9133137802682646e-05, "loss": 0.3993, "step": 21570 }, { "epoch": 2.515741604477612, "grad_norm": 0.41459542944198186, "learning_rate": 2.912448447702115e-05, "loss": 0.4058, "step": 21575 }, { "epoch": 2.5163246268656714, "grad_norm": 0.40454346028708793, "learning_rate": 2.9115830909814374e-05, "loss": 0.4187, "step": 21580 }, { "epoch": 2.5169076492537314, "grad_norm": 0.4227574492714799, "learning_rate": 2.9107177102349026e-05, "loss": 0.412, "step": 21585 }, { "epoch": 2.517490671641791, "grad_norm": 0.3820073360239675, "learning_rate": 2.909852305591184e-05, "loss": 0.3882, "step": 21590 }, { "epoch": 2.5180736940298507, "grad_norm": 0.41820926089204913, "learning_rate": 2.9089868771789598e-05, "loss": 0.4028, "step": 21595 }, { "epoch": 2.5186567164179103, "grad_norm": 0.4368346520661654, "learning_rate": 2.9081214251269095e-05, "loss": 0.4418, "step": 21600 }, { "epoch": 2.51923973880597, "grad_norm": 0.5258252316148309, "learning_rate": 2.9072559495637187e-05, "loss": 0.4111, "step": 21605 }, { "epoch": 2.51982276119403, "grad_norm": 0.4398525033401105, "learning_rate": 2.9063904506180746e-05, "loss": 0.4026, "step": 21610 }, { "epoch": 2.5204057835820897, "grad_norm": 0.4429162914251529, "learning_rate": 2.9055249284186686e-05, "loss": 0.4085, "step": 21615 }, { "epoch": 2.5209888059701493, "grad_norm": 0.42203105398839463, "learning_rate": 2.904659383094197e-05, "loss": 0.4344, "step": 21620 }, { "epoch": 2.521571828358209, "grad_norm": 0.40620566810256226, "learning_rate": 2.9037938147733557e-05, "loss": 0.3826, "step": 21625 }, { "epoch": 2.5221548507462686, "grad_norm": 0.41829004160654043, "learning_rate": 2.902928223584848e-05, "loss": 0.4044, "step": 21630 }, { "epoch": 2.5227378731343286, "grad_norm": 0.42124429434039856, "learning_rate": 2.9020626096573793e-05, "loss": 0.4184, "step": 21635 }, { "epoch": 2.523320895522388, "grad_norm": 0.41292177384683004, "learning_rate": 2.9011969731196565e-05, "loss": 0.3945, "step": 21640 }, { "epoch": 2.523903917910448, "grad_norm": 0.42538107659879604, "learning_rate": 2.9003313141003934e-05, "loss": 0.3956, "step": 21645 }, { "epoch": 2.5244869402985075, "grad_norm": 0.4263449207400961, "learning_rate": 2.8994656327283036e-05, "loss": 0.388, "step": 21650 }, { "epoch": 2.525069962686567, "grad_norm": 0.43570400848495505, "learning_rate": 2.898599929132107e-05, "loss": 0.4091, "step": 21655 }, { "epoch": 2.5256529850746268, "grad_norm": 0.3685028513907922, "learning_rate": 2.897734203440524e-05, "loss": 0.3859, "step": 21660 }, { "epoch": 2.5262360074626864, "grad_norm": 0.42144367982032993, "learning_rate": 2.8968684557822806e-05, "loss": 0.3989, "step": 21665 }, { "epoch": 2.5268190298507465, "grad_norm": 0.42879147975007814, "learning_rate": 2.8960026862861057e-05, "loss": 0.416, "step": 21670 }, { "epoch": 2.527402052238806, "grad_norm": 0.42972426861921337, "learning_rate": 2.8951368950807288e-05, "loss": 0.416, "step": 21675 }, { "epoch": 2.5279850746268657, "grad_norm": 0.41098597858844427, "learning_rate": 2.894271082294887e-05, "loss": 0.4061, "step": 21680 }, { "epoch": 2.5285680970149254, "grad_norm": 0.41669927137715934, "learning_rate": 2.8934052480573175e-05, "loss": 0.3919, "step": 21685 }, { "epoch": 2.529151119402985, "grad_norm": 0.5075636109195275, "learning_rate": 2.8925393924967615e-05, "loss": 0.3878, "step": 21690 }, { "epoch": 2.5297341417910446, "grad_norm": 0.43172849798857615, "learning_rate": 2.891673515741964e-05, "loss": 0.4003, "step": 21695 }, { "epoch": 2.5303171641791042, "grad_norm": 0.4138559841501898, "learning_rate": 2.8908076179216715e-05, "loss": 0.398, "step": 21700 }, { "epoch": 2.5309001865671643, "grad_norm": 0.4433376163763771, "learning_rate": 2.8899416991646354e-05, "loss": 0.4245, "step": 21705 }, { "epoch": 2.531483208955224, "grad_norm": 0.4345974572000521, "learning_rate": 2.88907575959961e-05, "loss": 0.3999, "step": 21710 }, { "epoch": 2.5320662313432836, "grad_norm": 0.40454268596395815, "learning_rate": 2.8882097993553504e-05, "loss": 0.41, "step": 21715 }, { "epoch": 2.532649253731343, "grad_norm": 0.3980572891511395, "learning_rate": 2.8873438185606194e-05, "loss": 0.403, "step": 21720 }, { "epoch": 2.533232276119403, "grad_norm": 0.42317910525564006, "learning_rate": 2.8864778173441775e-05, "loss": 0.3967, "step": 21725 }, { "epoch": 2.533815298507463, "grad_norm": 0.4563297646016866, "learning_rate": 2.8856117958347923e-05, "loss": 0.4272, "step": 21730 }, { "epoch": 2.5343983208955225, "grad_norm": 0.4018884227123111, "learning_rate": 2.884745754161232e-05, "loss": 0.4066, "step": 21735 }, { "epoch": 2.534981343283582, "grad_norm": 0.4242066161997057, "learning_rate": 2.8838796924522694e-05, "loss": 0.3932, "step": 21740 }, { "epoch": 2.535564365671642, "grad_norm": 0.401542307992814, "learning_rate": 2.883013610836679e-05, "loss": 0.4204, "step": 21745 }, { "epoch": 2.5361473880597014, "grad_norm": 0.4216314766589857, "learning_rate": 2.8821475094432393e-05, "loss": 0.4115, "step": 21750 }, { "epoch": 2.536730410447761, "grad_norm": 0.38714484182977127, "learning_rate": 2.8812813884007306e-05, "loss": 0.4157, "step": 21755 }, { "epoch": 2.5373134328358207, "grad_norm": 0.401423487689331, "learning_rate": 2.8804152478379377e-05, "loss": 0.4004, "step": 21760 }, { "epoch": 2.5378964552238807, "grad_norm": 0.39028263032288135, "learning_rate": 2.8795490878836468e-05, "loss": 0.3918, "step": 21765 }, { "epoch": 2.5384794776119404, "grad_norm": 0.4065366877513189, "learning_rate": 2.8786829086666483e-05, "loss": 0.3895, "step": 21770 }, { "epoch": 2.5390625, "grad_norm": 0.38717226865681914, "learning_rate": 2.877816710315734e-05, "loss": 0.3943, "step": 21775 }, { "epoch": 2.5396455223880596, "grad_norm": 0.42458318342762813, "learning_rate": 2.8769504929596986e-05, "loss": 0.4244, "step": 21780 }, { "epoch": 2.5402285447761193, "grad_norm": 0.5479384180330955, "learning_rate": 2.876084256727342e-05, "loss": 0.462, "step": 21785 }, { "epoch": 2.5408115671641793, "grad_norm": 0.41665436063921335, "learning_rate": 2.8752180017474646e-05, "loss": 0.4203, "step": 21790 }, { "epoch": 2.541394589552239, "grad_norm": 0.42340080438974775, "learning_rate": 2.8743517281488703e-05, "loss": 0.417, "step": 21795 }, { "epoch": 2.5419776119402986, "grad_norm": 0.4255330334928534, "learning_rate": 2.8734854360603646e-05, "loss": 0.4148, "step": 21800 }, { "epoch": 2.542560634328358, "grad_norm": 0.45592826028439914, "learning_rate": 2.8726191256107582e-05, "loss": 0.3922, "step": 21805 }, { "epoch": 2.543143656716418, "grad_norm": 0.403101059759272, "learning_rate": 2.8717527969288632e-05, "loss": 0.387, "step": 21810 }, { "epoch": 2.5437266791044775, "grad_norm": 0.4021919699524545, "learning_rate": 2.870886450143493e-05, "loss": 0.4124, "step": 21815 }, { "epoch": 2.544309701492537, "grad_norm": 0.422454164200704, "learning_rate": 2.870020085383466e-05, "loss": 0.4013, "step": 21820 }, { "epoch": 2.544892723880597, "grad_norm": 0.45287185780633143, "learning_rate": 2.8691537027776022e-05, "loss": 0.4303, "step": 21825 }, { "epoch": 2.545475746268657, "grad_norm": 0.4108350468375693, "learning_rate": 2.868287302454725e-05, "loss": 0.4208, "step": 21830 }, { "epoch": 2.5460587686567164, "grad_norm": 0.42982432907901225, "learning_rate": 2.867420884543659e-05, "loss": 0.4029, "step": 21835 }, { "epoch": 2.546641791044776, "grad_norm": 0.4086358957532511, "learning_rate": 2.8665544491732315e-05, "loss": 0.4133, "step": 21840 }, { "epoch": 2.5472248134328357, "grad_norm": 0.45077563953403066, "learning_rate": 2.8656879964722753e-05, "loss": 0.4225, "step": 21845 }, { "epoch": 2.5478078358208958, "grad_norm": 0.49688042020521955, "learning_rate": 2.8648215265696227e-05, "loss": 0.4206, "step": 21850 }, { "epoch": 2.5483908582089554, "grad_norm": 0.4286001722564965, "learning_rate": 2.8639550395941085e-05, "loss": 0.4105, "step": 21855 }, { "epoch": 2.548973880597015, "grad_norm": 0.39585660153808594, "learning_rate": 2.8630885356745716e-05, "loss": 0.4275, "step": 21860 }, { "epoch": 2.5495569029850746, "grad_norm": 0.39916539236865917, "learning_rate": 2.8622220149398533e-05, "loss": 0.3737, "step": 21865 }, { "epoch": 2.5501399253731343, "grad_norm": 0.3998433747834968, "learning_rate": 2.8613554775187962e-05, "loss": 0.3922, "step": 21870 }, { "epoch": 2.550722947761194, "grad_norm": 0.37614164356161306, "learning_rate": 2.860488923540247e-05, "loss": 0.4006, "step": 21875 }, { "epoch": 2.5513059701492535, "grad_norm": 0.4446740096753151, "learning_rate": 2.859622353133054e-05, "loss": 0.4203, "step": 21880 }, { "epoch": 2.5518889925373136, "grad_norm": 0.42220583522566685, "learning_rate": 2.8587557664260662e-05, "loss": 0.4038, "step": 21885 }, { "epoch": 2.5524720149253732, "grad_norm": 0.4180017378797911, "learning_rate": 2.8578891635481387e-05, "loss": 0.429, "step": 21890 }, { "epoch": 2.553055037313433, "grad_norm": 0.42970196216904194, "learning_rate": 2.857022544628126e-05, "loss": 0.4237, "step": 21895 }, { "epoch": 2.5536380597014925, "grad_norm": 0.407121658567729, "learning_rate": 2.8561559097948863e-05, "loss": 0.401, "step": 21900 }, { "epoch": 2.554221082089552, "grad_norm": 0.3922610519380279, "learning_rate": 2.8552892591772806e-05, "loss": 0.3913, "step": 21905 }, { "epoch": 2.554804104477612, "grad_norm": 0.4098739412970042, "learning_rate": 2.8544225929041697e-05, "loss": 0.4026, "step": 21910 }, { "epoch": 2.5553871268656714, "grad_norm": 0.40623477535739433, "learning_rate": 2.8535559111044206e-05, "loss": 0.4082, "step": 21915 }, { "epoch": 2.5559701492537314, "grad_norm": 0.4016199061212181, "learning_rate": 2.852689213906899e-05, "loss": 0.4331, "step": 21920 }, { "epoch": 2.556553171641791, "grad_norm": 0.4455436046189285, "learning_rate": 2.851822501440476e-05, "loss": 0.382, "step": 21925 }, { "epoch": 2.5571361940298507, "grad_norm": 0.409219008525592, "learning_rate": 2.850955773834022e-05, "loss": 0.3984, "step": 21930 }, { "epoch": 2.5577192164179103, "grad_norm": 0.39634426680336415, "learning_rate": 2.850089031216412e-05, "loss": 0.3869, "step": 21935 }, { "epoch": 2.55830223880597, "grad_norm": 0.46628954030010084, "learning_rate": 2.849222273716522e-05, "loss": 0.395, "step": 21940 }, { "epoch": 2.55888526119403, "grad_norm": 0.4401779440320999, "learning_rate": 2.84835550146323e-05, "loss": 0.4189, "step": 21945 }, { "epoch": 2.5594682835820897, "grad_norm": 0.44388153967136124, "learning_rate": 2.8474887145854183e-05, "loss": 0.408, "step": 21950 }, { "epoch": 2.5600513059701493, "grad_norm": 0.4036687461098782, "learning_rate": 2.8466219132119688e-05, "loss": 0.4039, "step": 21955 }, { "epoch": 2.560634328358209, "grad_norm": 0.4032150130986403, "learning_rate": 2.8457550974717655e-05, "loss": 0.4208, "step": 21960 }, { "epoch": 2.5612173507462686, "grad_norm": 0.4125258192596562, "learning_rate": 2.8448882674936973e-05, "loss": 0.3951, "step": 21965 }, { "epoch": 2.5618003731343286, "grad_norm": 0.4104849801208035, "learning_rate": 2.8440214234066524e-05, "loss": 0.4074, "step": 21970 }, { "epoch": 2.562383395522388, "grad_norm": 0.39023451394189007, "learning_rate": 2.8431545653395236e-05, "loss": 0.3778, "step": 21975 }, { "epoch": 2.562966417910448, "grad_norm": 0.43210189811340993, "learning_rate": 2.8422876934212027e-05, "loss": 0.4341, "step": 21980 }, { "epoch": 2.5635494402985075, "grad_norm": 0.4380309452516506, "learning_rate": 2.841420807780586e-05, "loss": 0.4221, "step": 21985 }, { "epoch": 2.564132462686567, "grad_norm": 0.4332141557386373, "learning_rate": 2.8405539085465717e-05, "loss": 0.4323, "step": 21990 }, { "epoch": 2.5647154850746268, "grad_norm": 0.46081240369833126, "learning_rate": 2.8396869958480587e-05, "loss": 0.3924, "step": 21995 }, { "epoch": 2.5652985074626864, "grad_norm": 0.4397813502612307, "learning_rate": 2.8388200698139484e-05, "loss": 0.4371, "step": 22000 }, { "epoch": 2.5658815298507465, "grad_norm": 0.4130395762991295, "learning_rate": 2.837953130573145e-05, "loss": 0.3803, "step": 22005 }, { "epoch": 2.566464552238806, "grad_norm": 0.46598689677584026, "learning_rate": 2.8370861782545537e-05, "loss": 0.4045, "step": 22010 }, { "epoch": 2.5670475746268657, "grad_norm": 0.36310078298500686, "learning_rate": 2.8362192129870817e-05, "loss": 0.4044, "step": 22015 }, { "epoch": 2.5676305970149254, "grad_norm": 0.4385515606137188, "learning_rate": 2.8353522348996388e-05, "loss": 0.4217, "step": 22020 }, { "epoch": 2.568213619402985, "grad_norm": 0.40335608705844234, "learning_rate": 2.8344852441211367e-05, "loss": 0.4213, "step": 22025 }, { "epoch": 2.5687966417910446, "grad_norm": 0.42055530755961007, "learning_rate": 2.8336182407804886e-05, "loss": 0.4073, "step": 22030 }, { "epoch": 2.5693796641791042, "grad_norm": 0.4021555512781502, "learning_rate": 2.8327512250066083e-05, "loss": 0.433, "step": 22035 }, { "epoch": 2.5699626865671643, "grad_norm": 0.4013356366914958, "learning_rate": 2.8318841969284145e-05, "loss": 0.3689, "step": 22040 }, { "epoch": 2.570545708955224, "grad_norm": 0.40531603837072133, "learning_rate": 2.8310171566748243e-05, "loss": 0.4037, "step": 22045 }, { "epoch": 2.5711287313432836, "grad_norm": 0.41115410771966315, "learning_rate": 2.8301501043747608e-05, "loss": 0.4164, "step": 22050 }, { "epoch": 2.571711753731343, "grad_norm": 0.42752996731676995, "learning_rate": 2.829283040157143e-05, "loss": 0.4021, "step": 22055 }, { "epoch": 2.572294776119403, "grad_norm": 0.44069654554966997, "learning_rate": 2.8284159641508972e-05, "loss": 0.4024, "step": 22060 }, { "epoch": 2.572877798507463, "grad_norm": 0.4311906079588823, "learning_rate": 2.827548876484949e-05, "loss": 0.4185, "step": 22065 }, { "epoch": 2.5734608208955225, "grad_norm": 0.39731550212635675, "learning_rate": 2.826681777288226e-05, "loss": 0.3763, "step": 22070 }, { "epoch": 2.574043843283582, "grad_norm": 0.42071353294179487, "learning_rate": 2.825814666689658e-05, "loss": 0.394, "step": 22075 }, { "epoch": 2.574626865671642, "grad_norm": 0.42675361843367593, "learning_rate": 2.824947544818175e-05, "loss": 0.422, "step": 22080 }, { "epoch": 2.5752098880597014, "grad_norm": 0.4205779890422091, "learning_rate": 2.8240804118027092e-05, "loss": 0.3994, "step": 22085 }, { "epoch": 2.575792910447761, "grad_norm": 0.3833468747199891, "learning_rate": 2.8232132677721972e-05, "loss": 0.406, "step": 22090 }, { "epoch": 2.5763759328358207, "grad_norm": 0.4054730357193972, "learning_rate": 2.8223461128555727e-05, "loss": 0.3922, "step": 22095 }, { "epoch": 2.5769589552238807, "grad_norm": 0.4460472381553438, "learning_rate": 2.8214789471817754e-05, "loss": 0.4172, "step": 22100 }, { "epoch": 2.5775419776119404, "grad_norm": 0.4247270949273739, "learning_rate": 2.8206117708797432e-05, "loss": 0.4344, "step": 22105 }, { "epoch": 2.578125, "grad_norm": 0.4021692662185986, "learning_rate": 2.819744584078417e-05, "loss": 0.4083, "step": 22110 }, { "epoch": 2.5787080223880596, "grad_norm": 0.4014781071940521, "learning_rate": 2.818877386906739e-05, "loss": 0.4134, "step": 22115 }, { "epoch": 2.5792910447761193, "grad_norm": 0.40459568724162415, "learning_rate": 2.8180101794936542e-05, "loss": 0.4081, "step": 22120 }, { "epoch": 2.5798740671641793, "grad_norm": 0.41212136983585784, "learning_rate": 2.8171429619681073e-05, "loss": 0.4111, "step": 22125 }, { "epoch": 2.580457089552239, "grad_norm": 0.4577787954720089, "learning_rate": 2.8162757344590445e-05, "loss": 0.4203, "step": 22130 }, { "epoch": 2.5810401119402986, "grad_norm": 0.39595967976502433, "learning_rate": 2.815408497095416e-05, "loss": 0.4018, "step": 22135 }, { "epoch": 2.581623134328358, "grad_norm": 0.39472005263470467, "learning_rate": 2.8145412500061702e-05, "loss": 0.3734, "step": 22140 }, { "epoch": 2.582206156716418, "grad_norm": 0.4172146465184219, "learning_rate": 2.813673993320259e-05, "loss": 0.4212, "step": 22145 }, { "epoch": 2.5827891791044775, "grad_norm": 0.3902573137975402, "learning_rate": 2.812806727166635e-05, "loss": 0.4147, "step": 22150 }, { "epoch": 2.583372201492537, "grad_norm": 0.43001278044711233, "learning_rate": 2.811939451674252e-05, "loss": 0.4102, "step": 22155 }, { "epoch": 2.583955223880597, "grad_norm": 0.3730688765244184, "learning_rate": 2.8110721669720663e-05, "loss": 0.3709, "step": 22160 }, { "epoch": 2.584538246268657, "grad_norm": 0.4299995367289496, "learning_rate": 2.8102048731890345e-05, "loss": 0.391, "step": 22165 }, { "epoch": 2.5851212686567164, "grad_norm": 0.4118428597575234, "learning_rate": 2.8093375704541158e-05, "loss": 0.3908, "step": 22170 }, { "epoch": 2.585704291044776, "grad_norm": 0.43560160129228553, "learning_rate": 2.808470258896268e-05, "loss": 0.4025, "step": 22175 }, { "epoch": 2.5862873134328357, "grad_norm": 0.41686734968264216, "learning_rate": 2.8076029386444524e-05, "loss": 0.4024, "step": 22180 }, { "epoch": 2.5868703358208958, "grad_norm": 0.37646393896873553, "learning_rate": 2.806735609827633e-05, "loss": 0.3895, "step": 22185 }, { "epoch": 2.5874533582089554, "grad_norm": 0.41617694394390714, "learning_rate": 2.805868272574771e-05, "loss": 0.3886, "step": 22190 }, { "epoch": 2.588036380597015, "grad_norm": 0.4455298031267967, "learning_rate": 2.8050009270148326e-05, "loss": 0.432, "step": 22195 }, { "epoch": 2.5886194029850746, "grad_norm": 0.44424990770711303, "learning_rate": 2.804133573276783e-05, "loss": 0.4439, "step": 22200 }, { "epoch": 2.5892024253731343, "grad_norm": 0.4357978692348376, "learning_rate": 2.803266211489591e-05, "loss": 0.4076, "step": 22205 }, { "epoch": 2.589785447761194, "grad_norm": 0.4062729433625467, "learning_rate": 2.8023988417822222e-05, "loss": 0.3905, "step": 22210 }, { "epoch": 2.5903684701492535, "grad_norm": 0.4152482478748045, "learning_rate": 2.8015314642836476e-05, "loss": 0.3906, "step": 22215 }, { "epoch": 2.5909514925373136, "grad_norm": 0.42204159354622933, "learning_rate": 2.800664079122839e-05, "loss": 0.423, "step": 22220 }, { "epoch": 2.5915345149253732, "grad_norm": 0.40821190774312066, "learning_rate": 2.799796686428766e-05, "loss": 0.4179, "step": 22225 }, { "epoch": 2.592117537313433, "grad_norm": 0.4052933237396047, "learning_rate": 2.7989292863304045e-05, "loss": 0.3832, "step": 22230 }, { "epoch": 2.5927005597014925, "grad_norm": 0.4229739013081971, "learning_rate": 2.7980618789567258e-05, "loss": 0.4054, "step": 22235 }, { "epoch": 2.593283582089552, "grad_norm": 0.42874283632249127, "learning_rate": 2.7971944644367066e-05, "loss": 0.4123, "step": 22240 }, { "epoch": 2.593866604477612, "grad_norm": 0.3909236414679042, "learning_rate": 2.796327042899322e-05, "loss": 0.4103, "step": 22245 }, { "epoch": 2.5944496268656714, "grad_norm": 0.3975325102935005, "learning_rate": 2.7954596144735512e-05, "loss": 0.382, "step": 22250 }, { "epoch": 2.5950326492537314, "grad_norm": 0.4088920813258165, "learning_rate": 2.7945921792883707e-05, "loss": 0.3917, "step": 22255 }, { "epoch": 2.595615671641791, "grad_norm": 0.4530264932143675, "learning_rate": 2.79372473747276e-05, "loss": 0.4198, "step": 22260 }, { "epoch": 2.5961986940298507, "grad_norm": 0.4434793559794076, "learning_rate": 2.7928572891557003e-05, "loss": 0.4291, "step": 22265 }, { "epoch": 2.5967817164179103, "grad_norm": 0.40488246331631406, "learning_rate": 2.7919898344661723e-05, "loss": 0.4139, "step": 22270 }, { "epoch": 2.59736473880597, "grad_norm": 0.3983270611357439, "learning_rate": 2.791122373533157e-05, "loss": 0.4037, "step": 22275 }, { "epoch": 2.59794776119403, "grad_norm": 0.45657914992253346, "learning_rate": 2.7902549064856405e-05, "loss": 0.4098, "step": 22280 }, { "epoch": 2.5985307835820897, "grad_norm": 0.4575063731920724, "learning_rate": 2.7893874334526043e-05, "loss": 0.423, "step": 22285 }, { "epoch": 2.5991138059701493, "grad_norm": 0.4133352840595252, "learning_rate": 2.7885199545630343e-05, "loss": 0.4223, "step": 22290 }, { "epoch": 2.599696828358209, "grad_norm": 0.42850769313939163, "learning_rate": 2.7876524699459163e-05, "loss": 0.4342, "step": 22295 }, { "epoch": 2.6002798507462686, "grad_norm": 0.4077259449427239, "learning_rate": 2.7867849797302357e-05, "loss": 0.3952, "step": 22300 }, { "epoch": 2.6008628731343286, "grad_norm": 0.4841522503520805, "learning_rate": 2.7859174840449826e-05, "loss": 0.4095, "step": 22305 }, { "epoch": 2.601445895522388, "grad_norm": 0.4242614014407421, "learning_rate": 2.785049983019143e-05, "loss": 0.4199, "step": 22310 }, { "epoch": 2.602028917910448, "grad_norm": 0.4716478244582767, "learning_rate": 2.7841824767817065e-05, "loss": 0.403, "step": 22315 }, { "epoch": 2.6026119402985075, "grad_norm": 0.43420482892718326, "learning_rate": 2.7833149654616637e-05, "loss": 0.4232, "step": 22320 }, { "epoch": 2.603194962686567, "grad_norm": 0.40058425131317626, "learning_rate": 2.782447449188004e-05, "loss": 0.3793, "step": 22325 }, { "epoch": 2.6037779850746268, "grad_norm": 0.39819558111581616, "learning_rate": 2.7815799280897202e-05, "loss": 0.4095, "step": 22330 }, { "epoch": 2.6043610074626864, "grad_norm": 0.39928022261189117, "learning_rate": 2.780712402295803e-05, "loss": 0.3788, "step": 22335 }, { "epoch": 2.6049440298507465, "grad_norm": 0.44112096336092194, "learning_rate": 2.7798448719352467e-05, "loss": 0.4381, "step": 22340 }, { "epoch": 2.605527052238806, "grad_norm": 0.3987578265750149, "learning_rate": 2.778977337137044e-05, "loss": 0.3785, "step": 22345 }, { "epoch": 2.6061100746268657, "grad_norm": 0.4212063703933318, "learning_rate": 2.7781097980301878e-05, "loss": 0.4011, "step": 22350 }, { "epoch": 2.6066930970149254, "grad_norm": 0.44446745041834496, "learning_rate": 2.777242254743675e-05, "loss": 0.4113, "step": 22355 }, { "epoch": 2.607276119402985, "grad_norm": 0.4460450398839979, "learning_rate": 2.7763747074065e-05, "loss": 0.4231, "step": 22360 }, { "epoch": 2.6078591417910446, "grad_norm": 0.4233552010343455, "learning_rate": 2.775507156147658e-05, "loss": 0.4226, "step": 22365 }, { "epoch": 2.6084421641791042, "grad_norm": 0.4212752781517472, "learning_rate": 2.7746396010961462e-05, "loss": 0.3786, "step": 22370 }, { "epoch": 2.6090251865671643, "grad_norm": 0.4767806851482472, "learning_rate": 2.773772042380962e-05, "loss": 0.4075, "step": 22375 }, { "epoch": 2.609608208955224, "grad_norm": 0.48360042803705594, "learning_rate": 2.7729044801311032e-05, "loss": 0.3988, "step": 22380 }, { "epoch": 2.6101912313432836, "grad_norm": 0.38229719290658104, "learning_rate": 2.772036914475567e-05, "loss": 0.3905, "step": 22385 }, { "epoch": 2.610774253731343, "grad_norm": 0.4133302946992925, "learning_rate": 2.7711693455433534e-05, "loss": 0.3953, "step": 22390 }, { "epoch": 2.611357276119403, "grad_norm": 0.4165574527008948, "learning_rate": 2.7703017734634608e-05, "loss": 0.3945, "step": 22395 }, { "epoch": 2.611940298507463, "grad_norm": 0.4210804912903456, "learning_rate": 2.7694341983648884e-05, "loss": 0.4313, "step": 22400 }, { "epoch": 2.6125233208955225, "grad_norm": 0.4367388363947852, "learning_rate": 2.768566620376638e-05, "loss": 0.4329, "step": 22405 }, { "epoch": 2.613106343283582, "grad_norm": 0.424959420072146, "learning_rate": 2.7676990396277085e-05, "loss": 0.4094, "step": 22410 }, { "epoch": 2.613689365671642, "grad_norm": 0.39468901509057047, "learning_rate": 2.7668314562471008e-05, "loss": 0.3875, "step": 22415 }, { "epoch": 2.6142723880597014, "grad_norm": 0.4262353191509631, "learning_rate": 2.7659638703638173e-05, "loss": 0.4233, "step": 22420 }, { "epoch": 2.614855410447761, "grad_norm": 0.4076459301141559, "learning_rate": 2.7650962821068595e-05, "loss": 0.3985, "step": 22425 }, { "epoch": 2.6154384328358207, "grad_norm": 0.46846289670042857, "learning_rate": 2.764228691605229e-05, "loss": 0.4245, "step": 22430 }, { "epoch": 2.6160214552238807, "grad_norm": 0.3901685618490997, "learning_rate": 2.7633610989879284e-05, "loss": 0.3936, "step": 22435 }, { "epoch": 2.6166044776119404, "grad_norm": 0.40547142558021154, "learning_rate": 2.76249350438396e-05, "loss": 0.3963, "step": 22440 }, { "epoch": 2.6171875, "grad_norm": 0.4070049551386447, "learning_rate": 2.761625907922328e-05, "loss": 0.4323, "step": 22445 }, { "epoch": 2.6177705223880596, "grad_norm": 0.40403875423687535, "learning_rate": 2.7607583097320345e-05, "loss": 0.393, "step": 22450 }, { "epoch": 2.6183535447761193, "grad_norm": 0.3826446833329266, "learning_rate": 2.7598907099420835e-05, "loss": 0.3775, "step": 22455 }, { "epoch": 2.6189365671641793, "grad_norm": 0.472760481953975, "learning_rate": 2.7590231086814782e-05, "loss": 0.4497, "step": 22460 }, { "epoch": 2.619519589552239, "grad_norm": 0.4931381898829188, "learning_rate": 2.758155506079223e-05, "loss": 0.4046, "step": 22465 }, { "epoch": 2.6201026119402986, "grad_norm": 0.3828620701724777, "learning_rate": 2.7572879022643228e-05, "loss": 0.3692, "step": 22470 }, { "epoch": 2.620685634328358, "grad_norm": 0.4408182639542891, "learning_rate": 2.7564202973657815e-05, "loss": 0.3907, "step": 22475 }, { "epoch": 2.621268656716418, "grad_norm": 0.39359600706846304, "learning_rate": 2.7555526915126033e-05, "loss": 0.4072, "step": 22480 }, { "epoch": 2.6218516791044775, "grad_norm": 0.38439887698226066, "learning_rate": 2.754685084833793e-05, "loss": 0.3863, "step": 22485 }, { "epoch": 2.622434701492537, "grad_norm": 0.4157945776321559, "learning_rate": 2.7538174774583552e-05, "loss": 0.3726, "step": 22490 }, { "epoch": 2.623017723880597, "grad_norm": 0.49145189075517376, "learning_rate": 2.752949869515295e-05, "loss": 0.4228, "step": 22495 }, { "epoch": 2.623600746268657, "grad_norm": 0.4141780869591111, "learning_rate": 2.7520822611336176e-05, "loss": 0.4133, "step": 22500 }, { "epoch": 2.6241837686567164, "grad_norm": 0.4119506326344177, "learning_rate": 2.7512146524423288e-05, "loss": 0.3876, "step": 22505 }, { "epoch": 2.624766791044776, "grad_norm": 0.4187381085954075, "learning_rate": 2.7503470435704322e-05, "loss": 0.394, "step": 22510 }, { "epoch": 2.6253498134328357, "grad_norm": 0.4090135178615731, "learning_rate": 2.7494794346469327e-05, "loss": 0.4223, "step": 22515 }, { "epoch": 2.6259328358208958, "grad_norm": 0.4895679200073971, "learning_rate": 2.7486118258008374e-05, "loss": 0.391, "step": 22520 }, { "epoch": 2.6265158582089554, "grad_norm": 0.41769312414868964, "learning_rate": 2.74774421716115e-05, "loss": 0.4008, "step": 22525 }, { "epoch": 2.627098880597015, "grad_norm": 0.43231608436340224, "learning_rate": 2.746876608856876e-05, "loss": 0.4026, "step": 22530 }, { "epoch": 2.6276819029850746, "grad_norm": 0.37945509445185577, "learning_rate": 2.7460090010170197e-05, "loss": 0.3772, "step": 22535 }, { "epoch": 2.6282649253731343, "grad_norm": 0.41771793432698484, "learning_rate": 2.7451413937705878e-05, "loss": 0.3921, "step": 22540 }, { "epoch": 2.628847947761194, "grad_norm": 0.4172517343337237, "learning_rate": 2.7442737872465835e-05, "loss": 0.4105, "step": 22545 }, { "epoch": 2.6294309701492535, "grad_norm": 0.40019931215224414, "learning_rate": 2.743406181574012e-05, "loss": 0.4056, "step": 22550 }, { "epoch": 2.6300139925373136, "grad_norm": 0.3876915080045103, "learning_rate": 2.7425385768818784e-05, "loss": 0.4061, "step": 22555 }, { "epoch": 2.6305970149253732, "grad_norm": 0.4244692317339967, "learning_rate": 2.7416709732991863e-05, "loss": 0.4166, "step": 22560 }, { "epoch": 2.631180037313433, "grad_norm": 0.4714360469640444, "learning_rate": 2.7408033709549413e-05, "loss": 0.4269, "step": 22565 }, { "epoch": 2.6317630597014925, "grad_norm": 0.4583986896646736, "learning_rate": 2.7399357699781477e-05, "loss": 0.4063, "step": 22570 }, { "epoch": 2.632346082089552, "grad_norm": 0.4053176001235881, "learning_rate": 2.739068170497807e-05, "loss": 0.4134, "step": 22575 }, { "epoch": 2.632929104477612, "grad_norm": 0.38513463011362853, "learning_rate": 2.7382005726429256e-05, "loss": 0.3723, "step": 22580 }, { "epoch": 2.6335121268656714, "grad_norm": 0.4170962101723745, "learning_rate": 2.7373329765425053e-05, "loss": 0.4114, "step": 22585 }, { "epoch": 2.6340951492537314, "grad_norm": 0.42612121152467314, "learning_rate": 2.736465382325551e-05, "loss": 0.3895, "step": 22590 }, { "epoch": 2.634678171641791, "grad_norm": 0.44297163836302395, "learning_rate": 2.7355977901210646e-05, "loss": 0.4147, "step": 22595 }, { "epoch": 2.6352611940298507, "grad_norm": 0.3967897810112757, "learning_rate": 2.7347302000580475e-05, "loss": 0.4174, "step": 22600 }, { "epoch": 2.6358442164179103, "grad_norm": 0.43077554945593344, "learning_rate": 2.7338626122655047e-05, "loss": 0.394, "step": 22605 }, { "epoch": 2.63642723880597, "grad_norm": 0.7404115339398403, "learning_rate": 2.7329950268724358e-05, "loss": 0.4053, "step": 22610 }, { "epoch": 2.63701026119403, "grad_norm": 0.40749654219568165, "learning_rate": 2.7321274440078442e-05, "loss": 0.3827, "step": 22615 }, { "epoch": 2.6375932835820897, "grad_norm": 0.453240222390189, "learning_rate": 2.7312598638007308e-05, "loss": 0.4143, "step": 22620 }, { "epoch": 2.6381763059701493, "grad_norm": 0.40924706417700796, "learning_rate": 2.7303922863800952e-05, "loss": 0.406, "step": 22625 }, { "epoch": 2.638759328358209, "grad_norm": 0.4287701356191956, "learning_rate": 2.7295247118749395e-05, "loss": 0.4277, "step": 22630 }, { "epoch": 2.6393423507462686, "grad_norm": 0.4429404967354247, "learning_rate": 2.728657140414262e-05, "loss": 0.4176, "step": 22635 }, { "epoch": 2.6399253731343286, "grad_norm": 0.42899805695313875, "learning_rate": 2.727789572127064e-05, "loss": 0.4077, "step": 22640 }, { "epoch": 2.640508395522388, "grad_norm": 0.4433381158083287, "learning_rate": 2.726922007142344e-05, "loss": 0.4034, "step": 22645 }, { "epoch": 2.641091417910448, "grad_norm": 0.4250267655283454, "learning_rate": 2.7260544455890996e-05, "loss": 0.3969, "step": 22650 }, { "epoch": 2.6416744402985075, "grad_norm": 0.4217829392477543, "learning_rate": 2.7251868875963312e-05, "loss": 0.4214, "step": 22655 }, { "epoch": 2.642257462686567, "grad_norm": 0.39776670231864214, "learning_rate": 2.724319333293033e-05, "loss": 0.375, "step": 22660 }, { "epoch": 2.6428404850746268, "grad_norm": 0.4490704964111275, "learning_rate": 2.723451782808205e-05, "loss": 0.4137, "step": 22665 }, { "epoch": 2.6434235074626864, "grad_norm": 0.42191925987309814, "learning_rate": 2.7225842362708427e-05, "loss": 0.3901, "step": 22670 }, { "epoch": 2.6440065298507465, "grad_norm": 0.38597234858085877, "learning_rate": 2.7217166938099402e-05, "loss": 0.4004, "step": 22675 }, { "epoch": 2.644589552238806, "grad_norm": 0.4017520282470813, "learning_rate": 2.7208491555544964e-05, "loss": 0.3971, "step": 22680 }, { "epoch": 2.6451725746268657, "grad_norm": 0.4006865608636846, "learning_rate": 2.7199816216335024e-05, "loss": 0.4131, "step": 22685 }, { "epoch": 2.6457555970149254, "grad_norm": 0.4380554997393179, "learning_rate": 2.7191140921759546e-05, "loss": 0.4302, "step": 22690 }, { "epoch": 2.646338619402985, "grad_norm": 0.43207188614631753, "learning_rate": 2.7182465673108454e-05, "loss": 0.4141, "step": 22695 }, { "epoch": 2.6469216417910446, "grad_norm": 0.42491075705628634, "learning_rate": 2.7173790471671662e-05, "loss": 0.3825, "step": 22700 }, { "epoch": 2.6475046641791042, "grad_norm": 0.40472752807362644, "learning_rate": 2.7165115318739114e-05, "loss": 0.3965, "step": 22705 }, { "epoch": 2.6480876865671643, "grad_norm": 0.41736521217127226, "learning_rate": 2.7156440215600703e-05, "loss": 0.4186, "step": 22710 }, { "epoch": 2.648670708955224, "grad_norm": 0.41016883747121546, "learning_rate": 2.7147765163546347e-05, "loss": 0.4239, "step": 22715 }, { "epoch": 2.6492537313432836, "grad_norm": 0.3966850260485352, "learning_rate": 2.7139090163865932e-05, "loss": 0.394, "step": 22720 }, { "epoch": 2.649836753731343, "grad_norm": 0.4372589987881457, "learning_rate": 2.7130415217849354e-05, "loss": 0.3926, "step": 22725 }, { "epoch": 2.650419776119403, "grad_norm": 0.4133710259665845, "learning_rate": 2.712174032678648e-05, "loss": 0.4281, "step": 22730 }, { "epoch": 2.651002798507463, "grad_norm": 0.42125956473126536, "learning_rate": 2.711306549196721e-05, "loss": 0.415, "step": 22735 }, { "epoch": 2.6515858208955225, "grad_norm": 0.42151885559638086, "learning_rate": 2.7104390714681393e-05, "loss": 0.4255, "step": 22740 }, { "epoch": 2.652168843283582, "grad_norm": 0.4312968876015643, "learning_rate": 2.7095715996218875e-05, "loss": 0.4187, "step": 22745 }, { "epoch": 2.652751865671642, "grad_norm": 0.39117445238789644, "learning_rate": 2.7087041337869522e-05, "loss": 0.4134, "step": 22750 }, { "epoch": 2.6533348880597014, "grad_norm": 0.4040788468456531, "learning_rate": 2.7078366740923173e-05, "loss": 0.3892, "step": 22755 }, { "epoch": 2.653917910447761, "grad_norm": 0.48078558835702323, "learning_rate": 2.7069692206669633e-05, "loss": 0.4212, "step": 22760 }, { "epoch": 2.6545009328358207, "grad_norm": 0.4110706737895837, "learning_rate": 2.7061017736398752e-05, "loss": 0.4115, "step": 22765 }, { "epoch": 2.6550839552238807, "grad_norm": 0.47303699942361366, "learning_rate": 2.7052343331400322e-05, "loss": 0.4071, "step": 22770 }, { "epoch": 2.6556669776119404, "grad_norm": 0.4362663665645726, "learning_rate": 2.7043668992964157e-05, "loss": 0.4389, "step": 22775 }, { "epoch": 2.65625, "grad_norm": 0.41176850375937024, "learning_rate": 2.7034994722380036e-05, "loss": 0.3869, "step": 22780 }, { "epoch": 2.6568330223880596, "grad_norm": 0.4634401237951629, "learning_rate": 2.7026320520937738e-05, "loss": 0.4003, "step": 22785 }, { "epoch": 2.6574160447761193, "grad_norm": 0.390507280135157, "learning_rate": 2.701764638992705e-05, "loss": 0.3876, "step": 22790 }, { "epoch": 2.6579990671641793, "grad_norm": 0.45156977381832825, "learning_rate": 2.700897233063771e-05, "loss": 0.3873, "step": 22795 }, { "epoch": 2.658582089552239, "grad_norm": 0.44211322039557294, "learning_rate": 2.7000298344359494e-05, "loss": 0.3986, "step": 22800 }, { "epoch": 2.6591651119402986, "grad_norm": 0.40439680647060877, "learning_rate": 2.6991624432382123e-05, "loss": 0.4075, "step": 22805 }, { "epoch": 2.659748134328358, "grad_norm": 0.4232521145995904, "learning_rate": 2.6982950595995315e-05, "loss": 0.4343, "step": 22810 }, { "epoch": 2.660331156716418, "grad_norm": 0.4007371468963951, "learning_rate": 2.6974276836488815e-05, "loss": 0.3863, "step": 22815 }, { "epoch": 2.6609141791044775, "grad_norm": 0.44755798649366285, "learning_rate": 2.6965603155152302e-05, "loss": 0.4087, "step": 22820 }, { "epoch": 2.661497201492537, "grad_norm": 0.40849892329817666, "learning_rate": 2.695692955327549e-05, "loss": 0.4253, "step": 22825 }, { "epoch": 2.662080223880597, "grad_norm": 0.3991343216495092, "learning_rate": 2.6948256032148052e-05, "loss": 0.4024, "step": 22830 }, { "epoch": 2.662663246268657, "grad_norm": 0.4280138181137737, "learning_rate": 2.6939582593059638e-05, "loss": 0.4184, "step": 22835 }, { "epoch": 2.6632462686567164, "grad_norm": 0.417897238898508, "learning_rate": 2.6930909237299934e-05, "loss": 0.4023, "step": 22840 }, { "epoch": 2.663829291044776, "grad_norm": 0.4099105982377711, "learning_rate": 2.692223596615857e-05, "loss": 0.3971, "step": 22845 }, { "epoch": 2.6644123134328357, "grad_norm": 0.4247214587174987, "learning_rate": 2.691356278092519e-05, "loss": 0.3981, "step": 22850 }, { "epoch": 2.6649953358208958, "grad_norm": 0.42057720588431785, "learning_rate": 2.6904889682889404e-05, "loss": 0.4088, "step": 22855 }, { "epoch": 2.6655783582089554, "grad_norm": 0.40337693004799835, "learning_rate": 2.6896216673340814e-05, "loss": 0.4156, "step": 22860 }, { "epoch": 2.666161380597015, "grad_norm": 0.44041729987186246, "learning_rate": 2.6887543753569022e-05, "loss": 0.3958, "step": 22865 }, { "epoch": 2.6667444029850746, "grad_norm": 0.3992974735678961, "learning_rate": 2.687887092486361e-05, "loss": 0.3951, "step": 22870 }, { "epoch": 2.6673274253731343, "grad_norm": 0.49044311970394866, "learning_rate": 2.6870198188514135e-05, "loss": 0.4412, "step": 22875 }, { "epoch": 2.667910447761194, "grad_norm": 0.45914136697215857, "learning_rate": 2.686152554581016e-05, "loss": 0.4483, "step": 22880 }, { "epoch": 2.6684934701492535, "grad_norm": 0.42236530528271476, "learning_rate": 2.6852852998041216e-05, "loss": 0.3997, "step": 22885 }, { "epoch": 2.6690764925373136, "grad_norm": 0.4006640978829436, "learning_rate": 2.6844180546496833e-05, "loss": 0.4116, "step": 22890 }, { "epoch": 2.6696595149253732, "grad_norm": 0.4244299860535475, "learning_rate": 2.6835508192466512e-05, "loss": 0.4146, "step": 22895 }, { "epoch": 2.670242537313433, "grad_norm": 0.43229327696854875, "learning_rate": 2.682683593723977e-05, "loss": 0.4009, "step": 22900 }, { "epoch": 2.6708255597014925, "grad_norm": 0.4255475594996611, "learning_rate": 2.6818163782106065e-05, "loss": 0.3926, "step": 22905 }, { "epoch": 2.671408582089552, "grad_norm": 0.41387623183724126, "learning_rate": 2.680949172835487e-05, "loss": 0.3794, "step": 22910 }, { "epoch": 2.671991604477612, "grad_norm": 0.39916971201033524, "learning_rate": 2.680081977727565e-05, "loss": 0.4106, "step": 22915 }, { "epoch": 2.6725746268656714, "grad_norm": 0.4225506725510269, "learning_rate": 2.6792147930157812e-05, "loss": 0.4047, "step": 22920 }, { "epoch": 2.6731576492537314, "grad_norm": 0.4441389573144737, "learning_rate": 2.6783476188290813e-05, "loss": 0.4155, "step": 22925 }, { "epoch": 2.673740671641791, "grad_norm": 0.5024459162447518, "learning_rate": 2.6774804552964034e-05, "loss": 0.4014, "step": 22930 }, { "epoch": 2.6743236940298507, "grad_norm": 0.45680744890787905, "learning_rate": 2.676613302546686e-05, "loss": 0.4145, "step": 22935 }, { "epoch": 2.6749067164179103, "grad_norm": 0.4168266138016353, "learning_rate": 2.6757461607088692e-05, "loss": 0.4058, "step": 22940 }, { "epoch": 2.67548973880597, "grad_norm": 0.41624027321880297, "learning_rate": 2.6748790299118846e-05, "loss": 0.4018, "step": 22945 }, { "epoch": 2.67607276119403, "grad_norm": 0.41227579319805346, "learning_rate": 2.6740119102846707e-05, "loss": 0.4348, "step": 22950 }, { "epoch": 2.6766557835820897, "grad_norm": 0.40388939645917765, "learning_rate": 2.6731448019561566e-05, "loss": 0.4115, "step": 22955 }, { "epoch": 2.6772388059701493, "grad_norm": 0.449562641306428, "learning_rate": 2.6722777050552737e-05, "loss": 0.4224, "step": 22960 }, { "epoch": 2.677821828358209, "grad_norm": 0.42693867980569006, "learning_rate": 2.6714106197109522e-05, "loss": 0.4092, "step": 22965 }, { "epoch": 2.6784048507462686, "grad_norm": 0.43130612603178214, "learning_rate": 2.6705435460521177e-05, "loss": 0.4101, "step": 22970 }, { "epoch": 2.6789878731343286, "grad_norm": 0.4370067229452783, "learning_rate": 2.6696764842076978e-05, "loss": 0.4065, "step": 22975 }, { "epoch": 2.679570895522388, "grad_norm": 0.4143678191065438, "learning_rate": 2.668809434306615e-05, "loss": 0.4137, "step": 22980 }, { "epoch": 2.680153917910448, "grad_norm": 0.43136723228740675, "learning_rate": 2.66794239647779e-05, "loss": 0.3968, "step": 22985 }, { "epoch": 2.6807369402985075, "grad_norm": 0.40280454269379085, "learning_rate": 2.6670753708501454e-05, "loss": 0.4125, "step": 22990 }, { "epoch": 2.681319962686567, "grad_norm": 0.391809902282067, "learning_rate": 2.6662083575525986e-05, "loss": 0.3959, "step": 22995 }, { "epoch": 2.6819029850746268, "grad_norm": 0.5427471445296358, "learning_rate": 2.6653413567140668e-05, "loss": 0.4388, "step": 23000 }, { "epoch": 2.6824860074626864, "grad_norm": 0.49937611659196496, "learning_rate": 2.6644743684634637e-05, "loss": 0.4058, "step": 23005 }, { "epoch": 2.6830690298507465, "grad_norm": 0.4167771466896609, "learning_rate": 2.6636073929297018e-05, "loss": 0.3937, "step": 23010 }, { "epoch": 2.683652052238806, "grad_norm": 0.4024336174643316, "learning_rate": 2.6627404302416935e-05, "loss": 0.3864, "step": 23015 }, { "epoch": 2.6842350746268657, "grad_norm": 0.4391183889059748, "learning_rate": 2.661873480528347e-05, "loss": 0.4209, "step": 23020 }, { "epoch": 2.6848180970149254, "grad_norm": 0.44775313421072555, "learning_rate": 2.66100654391857e-05, "loss": 0.4197, "step": 23025 }, { "epoch": 2.685401119402985, "grad_norm": 0.4284044805741468, "learning_rate": 2.660139620541267e-05, "loss": 0.4227, "step": 23030 }, { "epoch": 2.6859841417910446, "grad_norm": 0.4200272710208144, "learning_rate": 2.6592727105253413e-05, "loss": 0.4262, "step": 23035 }, { "epoch": 2.6865671641791042, "grad_norm": 0.4354503482949555, "learning_rate": 2.6584058139996942e-05, "loss": 0.4063, "step": 23040 }, { "epoch": 2.6871501865671643, "grad_norm": 0.48701117877310707, "learning_rate": 2.6575389310932252e-05, "loss": 0.4203, "step": 23045 }, { "epoch": 2.687733208955224, "grad_norm": 0.39316205216981354, "learning_rate": 2.656672061934831e-05, "loss": 0.3962, "step": 23050 }, { "epoch": 2.6883162313432836, "grad_norm": 0.4453379823230853, "learning_rate": 2.6558052066534077e-05, "loss": 0.4195, "step": 23055 }, { "epoch": 2.688899253731343, "grad_norm": 0.40011088033144115, "learning_rate": 2.654938365377847e-05, "loss": 0.3816, "step": 23060 }, { "epoch": 2.689482276119403, "grad_norm": 0.4048436964264881, "learning_rate": 2.6540715382370407e-05, "loss": 0.4114, "step": 23065 }, { "epoch": 2.690065298507463, "grad_norm": 0.45858943482750675, "learning_rate": 2.6532047253598776e-05, "loss": 0.4112, "step": 23070 }, { "epoch": 2.6906483208955225, "grad_norm": 0.4093823948188578, "learning_rate": 2.6523379268752448e-05, "loss": 0.3849, "step": 23075 }, { "epoch": 2.691231343283582, "grad_norm": 0.4727014551006822, "learning_rate": 2.651471142912026e-05, "loss": 0.4203, "step": 23080 }, { "epoch": 2.691814365671642, "grad_norm": 0.37511059443170225, "learning_rate": 2.6506043735991042e-05, "loss": 0.4003, "step": 23085 }, { "epoch": 2.6923973880597014, "grad_norm": 0.40664811860268496, "learning_rate": 2.6497376190653607e-05, "loss": 0.4036, "step": 23090 }, { "epoch": 2.692980410447761, "grad_norm": 0.43437066417470704, "learning_rate": 2.6488708794396712e-05, "loss": 0.4038, "step": 23095 }, { "epoch": 2.6935634328358207, "grad_norm": 0.41141835216852596, "learning_rate": 2.6480041548509137e-05, "loss": 0.3972, "step": 23100 }, { "epoch": 2.6941464552238807, "grad_norm": 0.4041710313748638, "learning_rate": 2.6471374454279617e-05, "loss": 0.4119, "step": 23105 }, { "epoch": 2.6947294776119404, "grad_norm": 0.41295264329339787, "learning_rate": 2.6462707512996847e-05, "loss": 0.3743, "step": 23110 }, { "epoch": 2.6953125, "grad_norm": 0.3754314035467755, "learning_rate": 2.645404072594954e-05, "loss": 0.3946, "step": 23115 }, { "epoch": 2.6958955223880596, "grad_norm": 0.39122158739748497, "learning_rate": 2.644537409442635e-05, "loss": 0.3883, "step": 23120 }, { "epoch": 2.6964785447761193, "grad_norm": 0.3733786295671506, "learning_rate": 2.6436707619715935e-05, "loss": 0.3917, "step": 23125 }, { "epoch": 2.6970615671641793, "grad_norm": 0.41979800453545263, "learning_rate": 2.642804130310691e-05, "loss": 0.4082, "step": 23130 }, { "epoch": 2.697644589552239, "grad_norm": 0.46204630505241356, "learning_rate": 2.641937514588786e-05, "loss": 0.4238, "step": 23135 }, { "epoch": 2.6982276119402986, "grad_norm": 0.4057527433244061, "learning_rate": 2.6410709149347385e-05, "loss": 0.3896, "step": 23140 }, { "epoch": 2.698810634328358, "grad_norm": 0.39929869805403395, "learning_rate": 2.640204331477401e-05, "loss": 0.3862, "step": 23145 }, { "epoch": 2.699393656716418, "grad_norm": 0.43679517893116127, "learning_rate": 2.6393377643456284e-05, "loss": 0.4329, "step": 23150 }, { "epoch": 2.6999766791044775, "grad_norm": 0.4134303595321092, "learning_rate": 2.6384712136682706e-05, "loss": 0.3848, "step": 23155 }, { "epoch": 2.700559701492537, "grad_norm": 0.40540442514472547, "learning_rate": 2.6376046795741733e-05, "loss": 0.4139, "step": 23160 }, { "epoch": 2.701142723880597, "grad_norm": 0.40554525658034907, "learning_rate": 2.6367381621921845e-05, "loss": 0.4031, "step": 23165 }, { "epoch": 2.701725746268657, "grad_norm": 0.435115298585795, "learning_rate": 2.6358716616511446e-05, "loss": 0.4148, "step": 23170 }, { "epoch": 2.7023087686567164, "grad_norm": 0.3862926045381509, "learning_rate": 2.635005178079896e-05, "loss": 0.372, "step": 23175 }, { "epoch": 2.702891791044776, "grad_norm": 0.40096781880358884, "learning_rate": 2.6341387116072763e-05, "loss": 0.3937, "step": 23180 }, { "epoch": 2.7034748134328357, "grad_norm": 0.40341537040171827, "learning_rate": 2.633272262362118e-05, "loss": 0.3907, "step": 23185 }, { "epoch": 2.7040578358208958, "grad_norm": 0.4253080292369856, "learning_rate": 2.6324058304732574e-05, "loss": 0.3989, "step": 23190 }, { "epoch": 2.7046408582089554, "grad_norm": 0.4261722316781176, "learning_rate": 2.6315394160695227e-05, "loss": 0.4108, "step": 23195 }, { "epoch": 2.705223880597015, "grad_norm": 0.40714567920789657, "learning_rate": 2.630673019279742e-05, "loss": 0.3914, "step": 23200 }, { "epoch": 2.7058069029850746, "grad_norm": 0.388495867404363, "learning_rate": 2.6298066402327404e-05, "loss": 0.4058, "step": 23205 }, { "epoch": 2.7063899253731343, "grad_norm": 0.4337222302072616, "learning_rate": 2.6289402790573392e-05, "loss": 0.4004, "step": 23210 }, { "epoch": 2.706972947761194, "grad_norm": 0.44018535449347046, "learning_rate": 2.6280739358823588e-05, "loss": 0.4089, "step": 23215 }, { "epoch": 2.7075559701492535, "grad_norm": 0.39877524769895384, "learning_rate": 2.6272076108366163e-05, "loss": 0.3916, "step": 23220 }, { "epoch": 2.7081389925373136, "grad_norm": 0.45642928017337175, "learning_rate": 2.6263413040489248e-05, "loss": 0.3958, "step": 23225 }, { "epoch": 2.7087220149253732, "grad_norm": 0.47021556383537044, "learning_rate": 2.6254750156480973e-05, "loss": 0.4572, "step": 23230 }, { "epoch": 2.709305037313433, "grad_norm": 0.416273335439482, "learning_rate": 2.6246087457629408e-05, "loss": 0.391, "step": 23235 }, { "epoch": 2.7098880597014925, "grad_norm": 0.3923420184469295, "learning_rate": 2.623742494522264e-05, "loss": 0.4073, "step": 23240 }, { "epoch": 2.710471082089552, "grad_norm": 0.3877627467831005, "learning_rate": 2.6228762620548675e-05, "loss": 0.4021, "step": 23245 }, { "epoch": 2.711054104477612, "grad_norm": 0.42093254685600145, "learning_rate": 2.6220100484895527e-05, "loss": 0.402, "step": 23250 }, { "epoch": 2.7116371268656714, "grad_norm": 0.40869295789702276, "learning_rate": 2.6211438539551187e-05, "loss": 0.4239, "step": 23255 }, { "epoch": 2.7122201492537314, "grad_norm": 0.43352026639842933, "learning_rate": 2.620277678580358e-05, "loss": 0.4344, "step": 23260 }, { "epoch": 2.712803171641791, "grad_norm": 0.4345051480468157, "learning_rate": 2.619411522494063e-05, "loss": 0.4127, "step": 23265 }, { "epoch": 2.7133861940298507, "grad_norm": 0.4123537753096545, "learning_rate": 2.6185453858250242e-05, "loss": 0.4251, "step": 23270 }, { "epoch": 2.7139692164179103, "grad_norm": 0.4250618106607531, "learning_rate": 2.6176792687020273e-05, "loss": 0.369, "step": 23275 }, { "epoch": 2.71455223880597, "grad_norm": 0.4151329426607874, "learning_rate": 2.616813171253855e-05, "loss": 0.4199, "step": 23280 }, { "epoch": 2.71513526119403, "grad_norm": 0.38972508331106953, "learning_rate": 2.615947093609288e-05, "loss": 0.3615, "step": 23285 }, { "epoch": 2.7157182835820897, "grad_norm": 0.45250019937154007, "learning_rate": 2.615081035897104e-05, "loss": 0.4137, "step": 23290 }, { "epoch": 2.7163013059701493, "grad_norm": 0.4011154676934871, "learning_rate": 2.614214998246077e-05, "loss": 0.3812, "step": 23295 }, { "epoch": 2.716884328358209, "grad_norm": 0.4040557255747825, "learning_rate": 2.6133489807849786e-05, "loss": 0.3944, "step": 23300 }, { "epoch": 2.7174673507462686, "grad_norm": 0.39984922383296745, "learning_rate": 2.6124829836425784e-05, "loss": 0.3781, "step": 23305 }, { "epoch": 2.7180503731343286, "grad_norm": 0.4112675512275878, "learning_rate": 2.6116170069476397e-05, "loss": 0.3903, "step": 23310 }, { "epoch": 2.718633395522388, "grad_norm": 0.43349766500278497, "learning_rate": 2.6107510508289274e-05, "loss": 0.4204, "step": 23315 }, { "epoch": 2.719216417910448, "grad_norm": 0.3985340664955324, "learning_rate": 2.609885115415198e-05, "loss": 0.3938, "step": 23320 }, { "epoch": 2.7197994402985075, "grad_norm": 0.4119274080405407, "learning_rate": 2.6090192008352115e-05, "loss": 0.4101, "step": 23325 }, { "epoch": 2.720382462686567, "grad_norm": 0.4231135539136476, "learning_rate": 2.6081533072177183e-05, "loss": 0.428, "step": 23330 }, { "epoch": 2.7209654850746268, "grad_norm": 0.3865666551000909, "learning_rate": 2.6072874346914688e-05, "loss": 0.3987, "step": 23335 }, { "epoch": 2.7215485074626864, "grad_norm": 0.43601065595854366, "learning_rate": 2.6064215833852113e-05, "loss": 0.3994, "step": 23340 }, { "epoch": 2.7221315298507465, "grad_norm": 0.4505806506956824, "learning_rate": 2.605555753427687e-05, "loss": 0.4118, "step": 23345 }, { "epoch": 2.722714552238806, "grad_norm": 0.4082580930394578, "learning_rate": 2.6046899449476397e-05, "loss": 0.4095, "step": 23350 }, { "epoch": 2.7232975746268657, "grad_norm": 0.38769732223260783, "learning_rate": 2.6038241580738053e-05, "loss": 0.3926, "step": 23355 }, { "epoch": 2.7238805970149254, "grad_norm": 0.4068613521392573, "learning_rate": 2.602958392934917e-05, "loss": 0.3833, "step": 23360 }, { "epoch": 2.724463619402985, "grad_norm": 0.5974696661302847, "learning_rate": 2.6020926496597082e-05, "loss": 0.4046, "step": 23365 }, { "epoch": 2.7250466417910446, "grad_norm": 0.4807652994716567, "learning_rate": 2.601226928376904e-05, "loss": 0.4328, "step": 23370 }, { "epoch": 2.7256296641791042, "grad_norm": 0.40072634460980483, "learning_rate": 2.600361229215231e-05, "loss": 0.3638, "step": 23375 }, { "epoch": 2.7262126865671643, "grad_norm": 0.4211773845340952, "learning_rate": 2.5994955523034098e-05, "loss": 0.4046, "step": 23380 }, { "epoch": 2.726795708955224, "grad_norm": 0.39596086036641953, "learning_rate": 2.598629897770157e-05, "loss": 0.418, "step": 23385 }, { "epoch": 2.7273787313432836, "grad_norm": 0.5036133136705389, "learning_rate": 2.5977642657441893e-05, "loss": 0.4182, "step": 23390 }, { "epoch": 2.727961753731343, "grad_norm": 0.45720145788244165, "learning_rate": 2.5968986563542153e-05, "loss": 0.4097, "step": 23395 }, { "epoch": 2.728544776119403, "grad_norm": 0.4589595028804862, "learning_rate": 2.5960330697289447e-05, "loss": 0.4094, "step": 23400 }, { "epoch": 2.729127798507463, "grad_norm": 0.4067884402041407, "learning_rate": 2.595167505997082e-05, "loss": 0.3952, "step": 23405 }, { "epoch": 2.7297108208955225, "grad_norm": 0.40815615139786815, "learning_rate": 2.5943019652873267e-05, "loss": 0.4175, "step": 23410 }, { "epoch": 2.730293843283582, "grad_norm": 0.37160135826095847, "learning_rate": 2.5934364477283786e-05, "loss": 0.3883, "step": 23415 }, { "epoch": 2.730876865671642, "grad_norm": 0.3894050185239279, "learning_rate": 2.5925709534489295e-05, "loss": 0.3664, "step": 23420 }, { "epoch": 2.7314598880597014, "grad_norm": 0.4120147513178943, "learning_rate": 2.5917054825776717e-05, "loss": 0.3872, "step": 23425 }, { "epoch": 2.732042910447761, "grad_norm": 0.4335842493381205, "learning_rate": 2.5908400352432927e-05, "loss": 0.4175, "step": 23430 }, { "epoch": 2.7326259328358207, "grad_norm": 0.4138931421552768, "learning_rate": 2.5899746115744743e-05, "loss": 0.4105, "step": 23435 }, { "epoch": 2.7332089552238807, "grad_norm": 0.740558249887021, "learning_rate": 2.589109211699899e-05, "loss": 0.4032, "step": 23440 }, { "epoch": 2.7337919776119404, "grad_norm": 0.40855631044561286, "learning_rate": 2.5882438357482414e-05, "loss": 0.4267, "step": 23445 }, { "epoch": 2.734375, "grad_norm": 0.4489663532548628, "learning_rate": 2.5873784838481762e-05, "loss": 0.3851, "step": 23450 }, { "epoch": 2.7349580223880596, "grad_norm": 0.39658794798836106, "learning_rate": 2.5865131561283718e-05, "loss": 0.3966, "step": 23455 }, { "epoch": 2.7355410447761193, "grad_norm": 0.4150587942209322, "learning_rate": 2.5856478527174955e-05, "loss": 0.3985, "step": 23460 }, { "epoch": 2.7361240671641793, "grad_norm": 0.41526330251693905, "learning_rate": 2.5847825737442088e-05, "loss": 0.3993, "step": 23465 }, { "epoch": 2.736707089552239, "grad_norm": 0.44656130500655006, "learning_rate": 2.5839173193371697e-05, "loss": 0.4259, "step": 23470 }, { "epoch": 2.7372901119402986, "grad_norm": 0.40721729631087433, "learning_rate": 2.583052089625035e-05, "loss": 0.4037, "step": 23475 }, { "epoch": 2.737873134328358, "grad_norm": 0.4165401357075778, "learning_rate": 2.5821868847364534e-05, "loss": 0.4097, "step": 23480 }, { "epoch": 2.738456156716418, "grad_norm": 0.40950560526214086, "learning_rate": 2.5813217048000753e-05, "loss": 0.3985, "step": 23485 }, { "epoch": 2.7390391791044775, "grad_norm": 0.43524417905378904, "learning_rate": 2.5804565499445437e-05, "loss": 0.4198, "step": 23490 }, { "epoch": 2.739622201492537, "grad_norm": 0.4239421544275635, "learning_rate": 2.5795914202984978e-05, "loss": 0.3871, "step": 23495 }, { "epoch": 2.740205223880597, "grad_norm": 0.4118146009144225, "learning_rate": 2.578726315990576e-05, "loss": 0.4263, "step": 23500 }, { "epoch": 2.740788246268657, "grad_norm": 0.4187728226608671, "learning_rate": 2.577861237149409e-05, "loss": 0.4042, "step": 23505 }, { "epoch": 2.7413712686567164, "grad_norm": 0.4064348141925177, "learning_rate": 2.5769961839036277e-05, "loss": 0.3832, "step": 23510 }, { "epoch": 2.741954291044776, "grad_norm": 0.44349461881866076, "learning_rate": 2.5761311563818564e-05, "loss": 0.4232, "step": 23515 }, { "epoch": 2.7425373134328357, "grad_norm": 0.40294997704414226, "learning_rate": 2.575266154712715e-05, "loss": 0.3995, "step": 23520 }, { "epoch": 2.7431203358208958, "grad_norm": 0.4570692373043179, "learning_rate": 2.574401179024823e-05, "loss": 0.4493, "step": 23525 }, { "epoch": 2.7437033582089554, "grad_norm": 0.39395181895347137, "learning_rate": 2.5735362294467928e-05, "loss": 0.4364, "step": 23530 }, { "epoch": 2.744286380597015, "grad_norm": 0.38849911166313594, "learning_rate": 2.5726713061072354e-05, "loss": 0.404, "step": 23535 }, { "epoch": 2.7448694029850746, "grad_norm": 0.4125750457549172, "learning_rate": 2.571806409134756e-05, "loss": 0.4082, "step": 23540 }, { "epoch": 2.7454524253731343, "grad_norm": 0.3888922840518757, "learning_rate": 2.5709415386579548e-05, "loss": 0.4092, "step": 23545 }, { "epoch": 2.746035447761194, "grad_norm": 0.4023705829034372, "learning_rate": 2.570076694805432e-05, "loss": 0.4065, "step": 23550 }, { "epoch": 2.7466184701492535, "grad_norm": 0.5430678955248344, "learning_rate": 2.5692118777057804e-05, "loss": 0.4228, "step": 23555 }, { "epoch": 2.7472014925373136, "grad_norm": 0.42836701838939134, "learning_rate": 2.5683470874875913e-05, "loss": 0.4181, "step": 23560 }, { "epoch": 2.7477845149253732, "grad_norm": 0.39499707975033255, "learning_rate": 2.56748232427945e-05, "loss": 0.4027, "step": 23565 }, { "epoch": 2.748367537313433, "grad_norm": 0.4048341595713592, "learning_rate": 2.566617588209937e-05, "loss": 0.3795, "step": 23570 }, { "epoch": 2.7489505597014925, "grad_norm": 0.384570534778654, "learning_rate": 2.5657528794076325e-05, "loss": 0.3799, "step": 23575 }, { "epoch": 2.749533582089552, "grad_norm": 0.4182858760300387, "learning_rate": 2.564888198001109e-05, "loss": 0.4312, "step": 23580 }, { "epoch": 2.750116604477612, "grad_norm": 0.41137050486749877, "learning_rate": 2.5640235441189374e-05, "loss": 0.4163, "step": 23585 }, { "epoch": 2.7506996268656714, "grad_norm": 0.3925721547324965, "learning_rate": 2.563158917889683e-05, "loss": 0.4183, "step": 23590 }, { "epoch": 2.7512826492537314, "grad_norm": 0.4179279447881019, "learning_rate": 2.562294319441907e-05, "loss": 0.4082, "step": 23595 }, { "epoch": 2.751865671641791, "grad_norm": 0.4357732605721109, "learning_rate": 2.5614297489041673e-05, "loss": 0.4213, "step": 23600 }, { "epoch": 2.7524486940298507, "grad_norm": 0.4263211744746736, "learning_rate": 2.5605652064050168e-05, "loss": 0.4061, "step": 23605 }, { "epoch": 2.7530317164179103, "grad_norm": 0.4279222660525636, "learning_rate": 2.559700692073006e-05, "loss": 0.3761, "step": 23610 }, { "epoch": 2.75361473880597, "grad_norm": 0.417219076894595, "learning_rate": 2.5588362060366788e-05, "loss": 0.3771, "step": 23615 }, { "epoch": 2.75419776119403, "grad_norm": 0.402566514565224, "learning_rate": 2.5579717484245756e-05, "loss": 0.3855, "step": 23620 }, { "epoch": 2.7547807835820897, "grad_norm": 0.3921980022841419, "learning_rate": 2.5571073193652344e-05, "loss": 0.4033, "step": 23625 }, { "epoch": 2.7553638059701493, "grad_norm": 0.4264633432451599, "learning_rate": 2.556242918987185e-05, "loss": 0.4137, "step": 23630 }, { "epoch": 2.755946828358209, "grad_norm": 0.4329588491768397, "learning_rate": 2.555378547418959e-05, "loss": 0.4162, "step": 23635 }, { "epoch": 2.7565298507462686, "grad_norm": 0.41367723409575946, "learning_rate": 2.554514204789078e-05, "loss": 0.3996, "step": 23640 }, { "epoch": 2.7571128731343286, "grad_norm": 0.4052815832329333, "learning_rate": 2.553649891226061e-05, "loss": 0.4136, "step": 23645 }, { "epoch": 2.757695895522388, "grad_norm": 0.39155926007138503, "learning_rate": 2.5527856068584244e-05, "loss": 0.3746, "step": 23650 }, { "epoch": 2.758278917910448, "grad_norm": 0.4476494032793297, "learning_rate": 2.5519213518146778e-05, "loss": 0.4151, "step": 23655 }, { "epoch": 2.7588619402985075, "grad_norm": 0.42318808533098395, "learning_rate": 2.551057126223329e-05, "loss": 0.4079, "step": 23660 }, { "epoch": 2.759444962686567, "grad_norm": 0.419430489222338, "learning_rate": 2.5501929302128795e-05, "loss": 0.3974, "step": 23665 }, { "epoch": 2.7600279850746268, "grad_norm": 0.43282775381006433, "learning_rate": 2.5493287639118265e-05, "loss": 0.4262, "step": 23670 }, { "epoch": 2.7606110074626864, "grad_norm": 0.4323264282523569, "learning_rate": 2.548464627448664e-05, "loss": 0.4272, "step": 23675 }, { "epoch": 2.7611940298507465, "grad_norm": 0.4353983566542406, "learning_rate": 2.54760052095188e-05, "loss": 0.3751, "step": 23680 }, { "epoch": 2.761777052238806, "grad_norm": 0.39523963202263834, "learning_rate": 2.54673644454996e-05, "loss": 0.3989, "step": 23685 }, { "epoch": 2.7623600746268657, "grad_norm": 0.4330013473539442, "learning_rate": 2.545872398371383e-05, "loss": 0.4311, "step": 23690 }, { "epoch": 2.7629430970149254, "grad_norm": 0.41197784556490724, "learning_rate": 2.5450083825446237e-05, "loss": 0.3924, "step": 23695 }, { "epoch": 2.763526119402985, "grad_norm": 0.4038801679236969, "learning_rate": 2.544144397198155e-05, "loss": 0.3977, "step": 23700 }, { "epoch": 2.7641091417910446, "grad_norm": 0.45941878876760606, "learning_rate": 2.5432804424604412e-05, "loss": 0.4289, "step": 23705 }, { "epoch": 2.7646921641791042, "grad_norm": 0.3773822659259436, "learning_rate": 2.5424165184599457e-05, "loss": 0.3872, "step": 23710 }, { "epoch": 2.7652751865671643, "grad_norm": 0.4318969237293315, "learning_rate": 2.541552625325125e-05, "loss": 0.412, "step": 23715 }, { "epoch": 2.765858208955224, "grad_norm": 0.40242158228990554, "learning_rate": 2.5406887631844312e-05, "loss": 0.4161, "step": 23720 }, { "epoch": 2.7664412313432836, "grad_norm": 0.41685747516691346, "learning_rate": 2.539824932166313e-05, "loss": 0.3989, "step": 23725 }, { "epoch": 2.767024253731343, "grad_norm": 0.5283443603392355, "learning_rate": 2.5389611323992134e-05, "loss": 0.4217, "step": 23730 }, { "epoch": 2.767607276119403, "grad_norm": 0.4431415757580844, "learning_rate": 2.538097364011572e-05, "loss": 0.3959, "step": 23735 }, { "epoch": 2.768190298507463, "grad_norm": 0.4708376365779287, "learning_rate": 2.5372336271318225e-05, "loss": 0.4275, "step": 23740 }, { "epoch": 2.7687733208955225, "grad_norm": 0.3879746198686029, "learning_rate": 2.5363699218883928e-05, "loss": 0.4, "step": 23745 }, { "epoch": 2.769356343283582, "grad_norm": 0.40532301443873264, "learning_rate": 2.5355062484097103e-05, "loss": 0.4123, "step": 23750 }, { "epoch": 2.769939365671642, "grad_norm": 0.3868527746931474, "learning_rate": 2.534642606824193e-05, "loss": 0.4059, "step": 23755 }, { "epoch": 2.7705223880597014, "grad_norm": 0.39234138283164033, "learning_rate": 2.5337789972602566e-05, "loss": 0.3997, "step": 23760 }, { "epoch": 2.771105410447761, "grad_norm": 0.40105674009452147, "learning_rate": 2.5329154198463122e-05, "loss": 0.4219, "step": 23765 }, { "epoch": 2.7716884328358207, "grad_norm": 0.4119086203387013, "learning_rate": 2.5320518747107646e-05, "loss": 0.4056, "step": 23770 }, { "epoch": 2.7722714552238807, "grad_norm": 0.4351952534923432, "learning_rate": 2.5311883619820148e-05, "loss": 0.4275, "step": 23775 }, { "epoch": 2.7728544776119404, "grad_norm": 0.4391144020073444, "learning_rate": 2.530324881788459e-05, "loss": 0.4205, "step": 23780 }, { "epoch": 2.7734375, "grad_norm": 0.42911783395826564, "learning_rate": 2.529461434258489e-05, "loss": 0.3701, "step": 23785 }, { "epoch": 2.7740205223880596, "grad_norm": 0.38659198334080536, "learning_rate": 2.5285980195204906e-05, "loss": 0.4171, "step": 23790 }, { "epoch": 2.7746035447761193, "grad_norm": 0.4259465737601676, "learning_rate": 2.5277346377028448e-05, "loss": 0.4051, "step": 23795 }, { "epoch": 2.7751865671641793, "grad_norm": 0.3839863991363711, "learning_rate": 2.5268712889339296e-05, "loss": 0.3869, "step": 23800 }, { "epoch": 2.775769589552239, "grad_norm": 0.4374632446399871, "learning_rate": 2.5260079733421155e-05, "loss": 0.4058, "step": 23805 }, { "epoch": 2.7763526119402986, "grad_norm": 0.43809648237627147, "learning_rate": 2.5251446910557704e-05, "loss": 0.4092, "step": 23810 }, { "epoch": 2.776935634328358, "grad_norm": 0.43408596055973, "learning_rate": 2.524281442203254e-05, "loss": 0.4072, "step": 23815 }, { "epoch": 2.777518656716418, "grad_norm": 0.4070905572092175, "learning_rate": 2.5234182269129253e-05, "loss": 0.3964, "step": 23820 }, { "epoch": 2.7781016791044775, "grad_norm": 0.42091320895090806, "learning_rate": 2.522555045313136e-05, "loss": 0.3994, "step": 23825 }, { "epoch": 2.778684701492537, "grad_norm": 0.40957912989869033, "learning_rate": 2.5216918975322303e-05, "loss": 0.4005, "step": 23830 }, { "epoch": 2.779267723880597, "grad_norm": 0.42193901325138783, "learning_rate": 2.520828783698554e-05, "loss": 0.3921, "step": 23835 }, { "epoch": 2.779850746268657, "grad_norm": 0.43234762135101035, "learning_rate": 2.519965703940441e-05, "loss": 0.3834, "step": 23840 }, { "epoch": 2.7804337686567164, "grad_norm": 0.42060154814410344, "learning_rate": 2.5191026583862237e-05, "loss": 0.4232, "step": 23845 }, { "epoch": 2.781016791044776, "grad_norm": 0.7083708125721425, "learning_rate": 2.5182396471642287e-05, "loss": 0.4051, "step": 23850 }, { "epoch": 2.7815998134328357, "grad_norm": 0.387630736355572, "learning_rate": 2.5173766704027775e-05, "loss": 0.4121, "step": 23855 }, { "epoch": 2.7821828358208958, "grad_norm": 0.4102084264934228, "learning_rate": 2.5165137282301877e-05, "loss": 0.4042, "step": 23860 }, { "epoch": 2.7827658582089554, "grad_norm": 0.45040483006911913, "learning_rate": 2.5156508207747686e-05, "loss": 0.3834, "step": 23865 }, { "epoch": 2.783348880597015, "grad_norm": 0.44049350009547306, "learning_rate": 2.5147879481648266e-05, "loss": 0.4058, "step": 23870 }, { "epoch": 2.7839319029850746, "grad_norm": 0.41386777302135547, "learning_rate": 2.5139251105286644e-05, "loss": 0.3861, "step": 23875 }, { "epoch": 2.7845149253731343, "grad_norm": 0.39641096536614573, "learning_rate": 2.5130623079945754e-05, "loss": 0.4048, "step": 23880 }, { "epoch": 2.785097947761194, "grad_norm": 0.427528561224656, "learning_rate": 2.5121995406908516e-05, "loss": 0.3854, "step": 23885 }, { "epoch": 2.7856809701492535, "grad_norm": 0.38980591755760313, "learning_rate": 2.511336808745778e-05, "loss": 0.3876, "step": 23890 }, { "epoch": 2.7862639925373136, "grad_norm": 0.417593608155887, "learning_rate": 2.510474112287633e-05, "loss": 0.4048, "step": 23895 }, { "epoch": 2.7868470149253732, "grad_norm": 0.3897415184789091, "learning_rate": 2.5096114514446934e-05, "loss": 0.3997, "step": 23900 }, { "epoch": 2.787430037313433, "grad_norm": 0.4135504576927545, "learning_rate": 2.508748826345227e-05, "loss": 0.4142, "step": 23905 }, { "epoch": 2.7880130597014925, "grad_norm": 0.4454685363310767, "learning_rate": 2.5078862371175e-05, "loss": 0.4176, "step": 23910 }, { "epoch": 2.788596082089552, "grad_norm": 0.408765100671447, "learning_rate": 2.507023683889769e-05, "loss": 0.404, "step": 23915 }, { "epoch": 2.789179104477612, "grad_norm": 2.566801092984802, "learning_rate": 2.5061611667902878e-05, "loss": 0.4013, "step": 23920 }, { "epoch": 2.7897621268656714, "grad_norm": 0.6071793929067169, "learning_rate": 2.505298685947306e-05, "loss": 0.3831, "step": 23925 }, { "epoch": 2.7903451492537314, "grad_norm": 0.42578873893408165, "learning_rate": 2.504436241489064e-05, "loss": 0.3892, "step": 23930 }, { "epoch": 2.790928171641791, "grad_norm": 0.39103629170693305, "learning_rate": 2.503573833543801e-05, "loss": 0.424, "step": 23935 }, { "epoch": 2.7915111940298507, "grad_norm": 0.47204331901509455, "learning_rate": 2.5027114622397473e-05, "loss": 0.4448, "step": 23940 }, { "epoch": 2.7920942164179103, "grad_norm": 0.39664647146837567, "learning_rate": 2.5018491277051297e-05, "loss": 0.4265, "step": 23945 }, { "epoch": 2.79267723880597, "grad_norm": 0.3965111167317444, "learning_rate": 2.50098683006817e-05, "loss": 0.4209, "step": 23950 }, { "epoch": 2.79326026119403, "grad_norm": 0.42966543838020704, "learning_rate": 2.500124569457082e-05, "loss": 0.4326, "step": 23955 }, { "epoch": 2.7938432835820897, "grad_norm": 0.39810968385462714, "learning_rate": 2.4992623460000763e-05, "loss": 0.3838, "step": 23960 }, { "epoch": 2.7944263059701493, "grad_norm": 0.45218513820816264, "learning_rate": 2.4984001598253586e-05, "loss": 0.405, "step": 23965 }, { "epoch": 2.795009328358209, "grad_norm": 0.4656333542507009, "learning_rate": 2.497538011061125e-05, "loss": 0.4084, "step": 23970 }, { "epoch": 2.7955923507462686, "grad_norm": 0.4112521610946821, "learning_rate": 2.4966758998355717e-05, "loss": 0.4116, "step": 23975 }, { "epoch": 2.7961753731343286, "grad_norm": 0.4288806020954192, "learning_rate": 2.495813826276884e-05, "loss": 0.403, "step": 23980 }, { "epoch": 2.796758395522388, "grad_norm": 0.4224093193422178, "learning_rate": 2.4949517905132454e-05, "loss": 0.4129, "step": 23985 }, { "epoch": 2.797341417910448, "grad_norm": 0.4205978319383233, "learning_rate": 2.4940897926728314e-05, "loss": 0.3884, "step": 23990 }, { "epoch": 2.7979244402985075, "grad_norm": 0.3986308142627854, "learning_rate": 2.4932278328838138e-05, "loss": 0.3889, "step": 23995 }, { "epoch": 2.798507462686567, "grad_norm": 0.4322059827673749, "learning_rate": 2.4923659112743576e-05, "loss": 0.3988, "step": 24000 }, { "epoch": 2.7990904850746268, "grad_norm": 0.4144370652573071, "learning_rate": 2.4915040279726204e-05, "loss": 0.4069, "step": 24005 }, { "epoch": 2.7996735074626864, "grad_norm": 0.3952033697853189, "learning_rate": 2.490642183106759e-05, "loss": 0.3922, "step": 24010 }, { "epoch": 2.8002565298507465, "grad_norm": 0.45683165418522576, "learning_rate": 2.48978037680492e-05, "loss": 0.4473, "step": 24015 }, { "epoch": 2.800839552238806, "grad_norm": 0.44948343362373777, "learning_rate": 2.4889186091952444e-05, "loss": 0.4055, "step": 24020 }, { "epoch": 2.8014225746268657, "grad_norm": 0.47008610412249724, "learning_rate": 2.4880568804058713e-05, "loss": 0.381, "step": 24025 }, { "epoch": 2.8020055970149254, "grad_norm": 0.4719458016582111, "learning_rate": 2.48719519056493e-05, "loss": 0.4198, "step": 24030 }, { "epoch": 2.802588619402985, "grad_norm": 0.41391021805188194, "learning_rate": 2.486333539800546e-05, "loss": 0.4273, "step": 24035 }, { "epoch": 2.8031716417910446, "grad_norm": 0.44497227363620745, "learning_rate": 2.485471928240839e-05, "loss": 0.3848, "step": 24040 }, { "epoch": 2.8037546641791042, "grad_norm": 0.43542455319928247, "learning_rate": 2.4846103560139206e-05, "loss": 0.4007, "step": 24045 }, { "epoch": 2.8043376865671643, "grad_norm": 0.382972498062499, "learning_rate": 2.4837488232479005e-05, "loss": 0.364, "step": 24050 }, { "epoch": 2.804920708955224, "grad_norm": 0.41513659207359027, "learning_rate": 2.4828873300708783e-05, "loss": 0.4198, "step": 24055 }, { "epoch": 2.8055037313432836, "grad_norm": 0.4152909338801933, "learning_rate": 2.4820258766109515e-05, "loss": 0.394, "step": 24060 }, { "epoch": 2.806086753731343, "grad_norm": 0.41993421455879915, "learning_rate": 2.48116446299621e-05, "loss": 0.4269, "step": 24065 }, { "epoch": 2.806669776119403, "grad_norm": 0.41329144124473244, "learning_rate": 2.4803030893547357e-05, "loss": 0.3952, "step": 24070 }, { "epoch": 2.807252798507463, "grad_norm": 0.4189307879832587, "learning_rate": 2.4794417558146093e-05, "loss": 0.3887, "step": 24075 }, { "epoch": 2.8078358208955225, "grad_norm": 0.4596873833252724, "learning_rate": 2.4785804625039005e-05, "loss": 0.387, "step": 24080 }, { "epoch": 2.808418843283582, "grad_norm": 0.44059313885271306, "learning_rate": 2.4777192095506775e-05, "loss": 0.4037, "step": 24085 }, { "epoch": 2.809001865671642, "grad_norm": 0.4489989021415904, "learning_rate": 2.4768579970829985e-05, "loss": 0.4079, "step": 24090 }, { "epoch": 2.8095848880597014, "grad_norm": 0.46829103663309113, "learning_rate": 2.4759968252289183e-05, "loss": 0.4293, "step": 24095 }, { "epoch": 2.810167910447761, "grad_norm": 0.4263120061814068, "learning_rate": 2.4751356941164855e-05, "loss": 0.3909, "step": 24100 }, { "epoch": 2.8107509328358207, "grad_norm": 0.38124904574498125, "learning_rate": 2.4742746038737404e-05, "loss": 0.3833, "step": 24105 }, { "epoch": 2.8113339552238807, "grad_norm": 0.4117583097365348, "learning_rate": 2.4734135546287208e-05, "loss": 0.4136, "step": 24110 }, { "epoch": 2.8119169776119404, "grad_norm": 0.3947387742870654, "learning_rate": 2.4725525465094557e-05, "loss": 0.3972, "step": 24115 }, { "epoch": 2.8125, "grad_norm": 0.4668608715868671, "learning_rate": 2.4716915796439678e-05, "loss": 0.4328, "step": 24120 }, { "epoch": 2.8130830223880596, "grad_norm": 0.4278804795115331, "learning_rate": 2.4708306541602765e-05, "loss": 0.3988, "step": 24125 }, { "epoch": 2.8136660447761193, "grad_norm": 0.44823585767839486, "learning_rate": 2.4699697701863916e-05, "loss": 0.4414, "step": 24130 }, { "epoch": 2.8142490671641793, "grad_norm": 0.3792493693971136, "learning_rate": 2.4691089278503188e-05, "loss": 0.3826, "step": 24135 }, { "epoch": 2.814832089552239, "grad_norm": 0.4250893569584375, "learning_rate": 2.4682481272800572e-05, "loss": 0.3922, "step": 24140 }, { "epoch": 2.8154151119402986, "grad_norm": 0.529672446563032, "learning_rate": 2.4673873686035996e-05, "loss": 0.3969, "step": 24145 }, { "epoch": 2.815998134328358, "grad_norm": 0.44283279861816877, "learning_rate": 2.4665266519489328e-05, "loss": 0.4131, "step": 24150 }, { "epoch": 2.816581156716418, "grad_norm": 0.3953088226301666, "learning_rate": 2.4656659774440365e-05, "loss": 0.3914, "step": 24155 }, { "epoch": 2.8171641791044775, "grad_norm": 0.42262510875290904, "learning_rate": 2.4648053452168857e-05, "loss": 0.3999, "step": 24160 }, { "epoch": 2.817747201492537, "grad_norm": 0.39812095649226686, "learning_rate": 2.4639447553954464e-05, "loss": 0.3922, "step": 24165 }, { "epoch": 2.818330223880597, "grad_norm": 0.4020501363321913, "learning_rate": 2.463084208107682e-05, "loss": 0.4158, "step": 24170 }, { "epoch": 2.818913246268657, "grad_norm": 0.3872723225939495, "learning_rate": 2.462223703481547e-05, "loss": 0.3971, "step": 24175 }, { "epoch": 2.8194962686567164, "grad_norm": 0.4555501116525867, "learning_rate": 2.4613632416449893e-05, "loss": 0.394, "step": 24180 }, { "epoch": 2.820079291044776, "grad_norm": 0.4259640134600756, "learning_rate": 2.4605028227259535e-05, "loss": 0.4212, "step": 24185 }, { "epoch": 2.8206623134328357, "grad_norm": 0.420385036695274, "learning_rate": 2.4596424468523728e-05, "loss": 0.3927, "step": 24190 }, { "epoch": 2.8212453358208958, "grad_norm": 0.3909871637600437, "learning_rate": 2.458782114152179e-05, "loss": 0.4026, "step": 24195 }, { "epoch": 2.8218283582089554, "grad_norm": 0.40224347395428345, "learning_rate": 2.4579218247532947e-05, "loss": 0.3792, "step": 24200 }, { "epoch": 2.822411380597015, "grad_norm": 0.40495405228442655, "learning_rate": 2.4570615787836358e-05, "loss": 0.4073, "step": 24205 }, { "epoch": 2.8229944029850746, "grad_norm": 0.42990772331215554, "learning_rate": 2.4562013763711145e-05, "loss": 0.4101, "step": 24210 }, { "epoch": 2.8235774253731343, "grad_norm": 0.38516181306189795, "learning_rate": 2.4553412176436323e-05, "loss": 0.419, "step": 24215 }, { "epoch": 2.824160447761194, "grad_norm": 0.4147705650040051, "learning_rate": 2.4544811027290893e-05, "loss": 0.3911, "step": 24220 }, { "epoch": 2.8247434701492535, "grad_norm": 0.3942782293350187, "learning_rate": 2.4536210317553744e-05, "loss": 0.4037, "step": 24225 }, { "epoch": 2.8253264925373136, "grad_norm": 0.4407757655623333, "learning_rate": 2.452761004850371e-05, "loss": 0.4152, "step": 24230 }, { "epoch": 2.8259095149253732, "grad_norm": 0.4092463833129098, "learning_rate": 2.4519010221419604e-05, "loss": 0.3924, "step": 24235 }, { "epoch": 2.826492537313433, "grad_norm": 0.4417188417776021, "learning_rate": 2.4510410837580106e-05, "loss": 0.4024, "step": 24240 }, { "epoch": 2.8270755597014925, "grad_norm": 0.3988022692782764, "learning_rate": 2.4501811898263878e-05, "loss": 0.3807, "step": 24245 }, { "epoch": 2.827658582089552, "grad_norm": 0.41403541909483943, "learning_rate": 2.4493213404749493e-05, "loss": 0.4077, "step": 24250 }, { "epoch": 2.828241604477612, "grad_norm": 0.4526465180974963, "learning_rate": 2.4484615358315456e-05, "loss": 0.3814, "step": 24255 }, { "epoch": 2.8288246268656714, "grad_norm": 0.4090896726844662, "learning_rate": 2.447601776024024e-05, "loss": 0.3579, "step": 24260 }, { "epoch": 2.8294076492537314, "grad_norm": 0.389366581958165, "learning_rate": 2.4467420611802194e-05, "loss": 0.4005, "step": 24265 }, { "epoch": 2.829990671641791, "grad_norm": 0.4193735141283348, "learning_rate": 2.4458823914279662e-05, "loss": 0.4058, "step": 24270 }, { "epoch": 2.8305736940298507, "grad_norm": 0.4072016054515705, "learning_rate": 2.445022766895087e-05, "loss": 0.3911, "step": 24275 }, { "epoch": 2.8311567164179103, "grad_norm": 0.3726787597832839, "learning_rate": 2.4441631877093995e-05, "loss": 0.3953, "step": 24280 }, { "epoch": 2.83173973880597, "grad_norm": 0.4384868084146645, "learning_rate": 2.443303653998717e-05, "loss": 0.4294, "step": 24285 }, { "epoch": 2.83232276119403, "grad_norm": 0.40754963352572265, "learning_rate": 2.442444165890842e-05, "loss": 0.4082, "step": 24290 }, { "epoch": 2.8329057835820897, "grad_norm": 0.3817317415158813, "learning_rate": 2.441584723513573e-05, "loss": 0.397, "step": 24295 }, { "epoch": 2.8334888059701493, "grad_norm": 0.4685953873915643, "learning_rate": 2.4407253269947006e-05, "loss": 0.4315, "step": 24300 }, { "epoch": 2.834071828358209, "grad_norm": 0.43119704571582806, "learning_rate": 2.4398659764620086e-05, "loss": 0.4131, "step": 24305 }, { "epoch": 2.8346548507462686, "grad_norm": 0.44133039518900996, "learning_rate": 2.4390066720432746e-05, "loss": 0.4377, "step": 24310 }, { "epoch": 2.8352378731343286, "grad_norm": 0.40069317868975407, "learning_rate": 2.438147413866269e-05, "loss": 0.4012, "step": 24315 }, { "epoch": 2.835820895522388, "grad_norm": 0.4130630853182529, "learning_rate": 2.437288202058755e-05, "loss": 0.4127, "step": 24320 }, { "epoch": 2.836403917910448, "grad_norm": 0.40646135376030335, "learning_rate": 2.4364290367484888e-05, "loss": 0.4012, "step": 24325 }, { "epoch": 2.8369869402985075, "grad_norm": 0.4158453066326914, "learning_rate": 2.4355699180632207e-05, "loss": 0.4292, "step": 24330 }, { "epoch": 2.837569962686567, "grad_norm": 0.39902524368616477, "learning_rate": 2.4347108461306935e-05, "loss": 0.414, "step": 24335 }, { "epoch": 2.8381529850746268, "grad_norm": 0.42744606003488217, "learning_rate": 2.4338518210786416e-05, "loss": 0.4204, "step": 24340 }, { "epoch": 2.8387360074626864, "grad_norm": 0.41169833535803, "learning_rate": 2.432992843034796e-05, "loss": 0.431, "step": 24345 }, { "epoch": 2.8393190298507465, "grad_norm": 0.43257474420696607, "learning_rate": 2.4321339121268766e-05, "loss": 0.3942, "step": 24350 }, { "epoch": 2.839902052238806, "grad_norm": 0.419108016090665, "learning_rate": 2.431275028482599e-05, "loss": 0.4003, "step": 24355 }, { "epoch": 2.8404850746268657, "grad_norm": 0.3856661560241362, "learning_rate": 2.430416192229672e-05, "loss": 0.4097, "step": 24360 }, { "epoch": 2.8410680970149254, "grad_norm": 0.4326362279524336, "learning_rate": 2.4295574034957935e-05, "loss": 0.4102, "step": 24365 }, { "epoch": 2.841651119402985, "grad_norm": 0.3791202902718328, "learning_rate": 2.42869866240866e-05, "loss": 0.3939, "step": 24370 }, { "epoch": 2.8422341417910446, "grad_norm": 0.42997859107080455, "learning_rate": 2.427839969095957e-05, "loss": 0.3957, "step": 24375 }, { "epoch": 2.8428171641791042, "grad_norm": 0.4363154249413559, "learning_rate": 2.4269813236853632e-05, "loss": 0.4104, "step": 24380 }, { "epoch": 2.8434001865671643, "grad_norm": 0.43630810723657876, "learning_rate": 2.4261227263045527e-05, "loss": 0.3868, "step": 24385 }, { "epoch": 2.843983208955224, "grad_norm": 0.4242797707850999, "learning_rate": 2.4252641770811886e-05, "loss": 0.3888, "step": 24390 }, { "epoch": 2.8445662313432836, "grad_norm": 0.3786323125165545, "learning_rate": 2.4244056761429307e-05, "loss": 0.4153, "step": 24395 }, { "epoch": 2.845149253731343, "grad_norm": 0.392344661545243, "learning_rate": 2.423547223617429e-05, "loss": 0.4219, "step": 24400 }, { "epoch": 2.845732276119403, "grad_norm": 0.39341726507792363, "learning_rate": 2.4226888196323268e-05, "loss": 0.3955, "step": 24405 }, { "epoch": 2.846315298507463, "grad_norm": 0.4365540625801256, "learning_rate": 2.4218304643152617e-05, "loss": 0.3962, "step": 24410 }, { "epoch": 2.8468983208955225, "grad_norm": 0.41052541016137806, "learning_rate": 2.4209721577938617e-05, "loss": 0.394, "step": 24415 }, { "epoch": 2.847481343283582, "grad_norm": 0.41405925493167905, "learning_rate": 2.42011390019575e-05, "loss": 0.4275, "step": 24420 }, { "epoch": 2.848064365671642, "grad_norm": 0.3900653204452662, "learning_rate": 2.419255691648541e-05, "loss": 0.3977, "step": 24425 }, { "epoch": 2.8486473880597014, "grad_norm": 0.4053471709622438, "learning_rate": 2.4183975322798407e-05, "loss": 0.3752, "step": 24430 }, { "epoch": 2.849230410447761, "grad_norm": 0.39641963474272346, "learning_rate": 2.417539422217251e-05, "loss": 0.3933, "step": 24435 }, { "epoch": 2.8498134328358207, "grad_norm": 0.4072103759711589, "learning_rate": 2.4166813615883625e-05, "loss": 0.4002, "step": 24440 }, { "epoch": 2.8503964552238807, "grad_norm": 0.3890747829198251, "learning_rate": 2.415823350520764e-05, "loss": 0.4037, "step": 24445 }, { "epoch": 2.8509794776119404, "grad_norm": 0.4028973809674288, "learning_rate": 2.4149653891420304e-05, "loss": 0.4016, "step": 24450 }, { "epoch": 2.8515625, "grad_norm": 0.4037724399117923, "learning_rate": 2.4141074775797335e-05, "loss": 0.4039, "step": 24455 }, { "epoch": 2.8521455223880596, "grad_norm": 0.5343163758156394, "learning_rate": 2.4132496159614366e-05, "loss": 0.4258, "step": 24460 }, { "epoch": 2.8527285447761193, "grad_norm": 0.4234157935262023, "learning_rate": 2.412391804414694e-05, "loss": 0.3924, "step": 24465 }, { "epoch": 2.8533115671641793, "grad_norm": 0.3733049551298622, "learning_rate": 2.4115340430670574e-05, "loss": 0.3863, "step": 24470 }, { "epoch": 2.853894589552239, "grad_norm": 0.39046157315620744, "learning_rate": 2.410676332046065e-05, "loss": 0.4081, "step": 24475 }, { "epoch": 2.8544776119402986, "grad_norm": 0.4394507404345183, "learning_rate": 2.4098186714792504e-05, "loss": 0.4132, "step": 24480 }, { "epoch": 2.855060634328358, "grad_norm": 0.43282596564227105, "learning_rate": 2.408961061494141e-05, "loss": 0.4294, "step": 24485 }, { "epoch": 2.855643656716418, "grad_norm": 0.4368857860325901, "learning_rate": 2.408103502218253e-05, "loss": 0.4134, "step": 24490 }, { "epoch": 2.8562266791044775, "grad_norm": 0.40514376788318834, "learning_rate": 2.407245993779099e-05, "loss": 0.4126, "step": 24495 }, { "epoch": 2.856809701492537, "grad_norm": 0.4689409168661143, "learning_rate": 2.4063885363041822e-05, "loss": 0.4228, "step": 24500 }, { "epoch": 2.857392723880597, "grad_norm": 0.42742228212990213, "learning_rate": 2.405531129920997e-05, "loss": 0.4172, "step": 24505 }, { "epoch": 2.857975746268657, "grad_norm": 0.41398247557989326, "learning_rate": 2.4046737747570326e-05, "loss": 0.4053, "step": 24510 }, { "epoch": 2.8585587686567164, "grad_norm": 0.3850251313792589, "learning_rate": 2.4038164709397694e-05, "loss": 0.4174, "step": 24515 }, { "epoch": 2.859141791044776, "grad_norm": 0.4101770377436773, "learning_rate": 2.4029592185966804e-05, "loss": 0.4024, "step": 24520 }, { "epoch": 2.8597248134328357, "grad_norm": 0.389933969483989, "learning_rate": 2.40210201785523e-05, "loss": 0.4039, "step": 24525 }, { "epoch": 2.8603078358208958, "grad_norm": 0.3864105506674258, "learning_rate": 2.4012448688428768e-05, "loss": 0.3806, "step": 24530 }, { "epoch": 2.8608908582089554, "grad_norm": 0.41064619983657913, "learning_rate": 2.4003877716870698e-05, "loss": 0.3943, "step": 24535 }, { "epoch": 2.861473880597015, "grad_norm": 0.41526288263186806, "learning_rate": 2.399530726515251e-05, "loss": 0.413, "step": 24540 }, { "epoch": 2.8620569029850746, "grad_norm": 0.37817741893593837, "learning_rate": 2.3986737334548564e-05, "loss": 0.3732, "step": 24545 }, { "epoch": 2.8626399253731343, "grad_norm": 0.4381206894440303, "learning_rate": 2.397816792633311e-05, "loss": 0.4261, "step": 24550 }, { "epoch": 2.863222947761194, "grad_norm": 0.40738716108025813, "learning_rate": 2.396959904178034e-05, "loss": 0.4174, "step": 24555 }, { "epoch": 2.8638059701492535, "grad_norm": 0.390537642181435, "learning_rate": 2.396103068216437e-05, "loss": 0.3995, "step": 24560 }, { "epoch": 2.8643889925373136, "grad_norm": 0.4159336953346919, "learning_rate": 2.395246284875922e-05, "loss": 0.4041, "step": 24565 }, { "epoch": 2.8649720149253732, "grad_norm": 0.3936853002100043, "learning_rate": 2.3943895542838868e-05, "loss": 0.3947, "step": 24570 }, { "epoch": 2.865555037313433, "grad_norm": 0.40261557417640986, "learning_rate": 2.3935328765677172e-05, "loss": 0.3773, "step": 24575 }, { "epoch": 2.8661380597014925, "grad_norm": 0.42449360027466265, "learning_rate": 2.3926762518547928e-05, "loss": 0.3903, "step": 24580 }, { "epoch": 2.866721082089552, "grad_norm": 0.41778284206752914, "learning_rate": 2.3918196802724874e-05, "loss": 0.4046, "step": 24585 }, { "epoch": 2.867304104477612, "grad_norm": 0.43191991233987104, "learning_rate": 2.3909631619481626e-05, "loss": 0.4109, "step": 24590 }, { "epoch": 2.8678871268656714, "grad_norm": 0.4133181416603124, "learning_rate": 2.390106697009176e-05, "loss": 0.4089, "step": 24595 }, { "epoch": 2.8684701492537314, "grad_norm": 0.44767587322082275, "learning_rate": 2.3892502855828762e-05, "loss": 0.3982, "step": 24600 }, { "epoch": 2.869053171641791, "grad_norm": 0.4109104703305783, "learning_rate": 2.388393927796601e-05, "loss": 0.4083, "step": 24605 }, { "epoch": 2.8696361940298507, "grad_norm": 0.4054090044863133, "learning_rate": 2.387537623777686e-05, "loss": 0.414, "step": 24610 }, { "epoch": 2.8702192164179103, "grad_norm": 0.5099691428529806, "learning_rate": 2.3866813736534517e-05, "loss": 0.4245, "step": 24615 }, { "epoch": 2.87080223880597, "grad_norm": 0.47498753884610195, "learning_rate": 2.3858251775512176e-05, "loss": 0.3947, "step": 24620 }, { "epoch": 2.87138526119403, "grad_norm": 0.41510822650922674, "learning_rate": 2.3849690355982895e-05, "loss": 0.4114, "step": 24625 }, { "epoch": 2.8719682835820897, "grad_norm": 0.3892083976859538, "learning_rate": 2.384112947921968e-05, "loss": 0.4255, "step": 24630 }, { "epoch": 2.8725513059701493, "grad_norm": 0.38156201149519997, "learning_rate": 2.3832569146495464e-05, "loss": 0.4061, "step": 24635 }, { "epoch": 2.873134328358209, "grad_norm": 0.3981291909865196, "learning_rate": 2.3824009359083073e-05, "loss": 0.38, "step": 24640 }, { "epoch": 2.8737173507462686, "grad_norm": 0.4616752471859324, "learning_rate": 2.3815450118255272e-05, "loss": 0.4084, "step": 24645 }, { "epoch": 2.8743003731343286, "grad_norm": 0.4272833105107833, "learning_rate": 2.380689142528474e-05, "loss": 0.4165, "step": 24650 }, { "epoch": 2.874883395522388, "grad_norm": 0.4121014836198729, "learning_rate": 2.3798333281444057e-05, "loss": 0.4342, "step": 24655 }, { "epoch": 2.875466417910448, "grad_norm": 0.42304423381586476, "learning_rate": 2.378977568800576e-05, "loss": 0.3939, "step": 24660 }, { "epoch": 2.8760494402985075, "grad_norm": 0.405604211463261, "learning_rate": 2.3781218646242264e-05, "loss": 0.4179, "step": 24665 }, { "epoch": 2.876632462686567, "grad_norm": 0.410351260508775, "learning_rate": 2.3772662157425925e-05, "loss": 0.427, "step": 24670 }, { "epoch": 2.8772154850746268, "grad_norm": 0.4444117347524044, "learning_rate": 2.376410622282902e-05, "loss": 0.4125, "step": 24675 }, { "epoch": 2.8777985074626864, "grad_norm": 0.4016325681634763, "learning_rate": 2.375555084372371e-05, "loss": 0.3851, "step": 24680 }, { "epoch": 2.8783815298507465, "grad_norm": 0.3821765509428547, "learning_rate": 2.3746996021382124e-05, "loss": 0.4251, "step": 24685 }, { "epoch": 2.878964552238806, "grad_norm": 0.4076947203602498, "learning_rate": 2.3738441757076268e-05, "loss": 0.3982, "step": 24690 }, { "epoch": 2.8795475746268657, "grad_norm": 0.4005527493041794, "learning_rate": 2.3729888052078088e-05, "loss": 0.394, "step": 24695 }, { "epoch": 2.8801305970149254, "grad_norm": 0.4060689225266477, "learning_rate": 2.3721334907659424e-05, "loss": 0.4106, "step": 24700 }, { "epoch": 2.880713619402985, "grad_norm": 0.42718017392739294, "learning_rate": 2.371278232509206e-05, "loss": 0.3948, "step": 24705 }, { "epoch": 2.8812966417910446, "grad_norm": 0.41383665802467334, "learning_rate": 2.370423030564768e-05, "loss": 0.387, "step": 24710 }, { "epoch": 2.8818796641791042, "grad_norm": 0.4451802175506528, "learning_rate": 2.3695678850597876e-05, "loss": 0.3882, "step": 24715 }, { "epoch": 2.8824626865671643, "grad_norm": 0.4055142167603417, "learning_rate": 2.368712796121419e-05, "loss": 0.3815, "step": 24720 }, { "epoch": 2.883045708955224, "grad_norm": 0.43359950461642005, "learning_rate": 2.3678577638768035e-05, "loss": 0.3981, "step": 24725 }, { "epoch": 2.8836287313432836, "grad_norm": 0.39948141435362394, "learning_rate": 2.367002788453077e-05, "loss": 0.4103, "step": 24730 }, { "epoch": 2.884211753731343, "grad_norm": 0.4229619925725585, "learning_rate": 2.366147869977367e-05, "loss": 0.4014, "step": 24735 }, { "epoch": 2.884794776119403, "grad_norm": 0.4028658884119076, "learning_rate": 2.3652930085767904e-05, "loss": 0.4064, "step": 24740 }, { "epoch": 2.885377798507463, "grad_norm": 0.5643632036476247, "learning_rate": 2.364438204378458e-05, "loss": 0.3807, "step": 24745 }, { "epoch": 2.8859608208955225, "grad_norm": 0.4195003817703374, "learning_rate": 2.3635834575094705e-05, "loss": 0.4188, "step": 24750 }, { "epoch": 2.886543843283582, "grad_norm": 0.41802101909621364, "learning_rate": 2.3627287680969195e-05, "loss": 0.4077, "step": 24755 }, { "epoch": 2.887126865671642, "grad_norm": 0.4061500737991145, "learning_rate": 2.3618741362678915e-05, "loss": 0.3806, "step": 24760 }, { "epoch": 2.8877098880597014, "grad_norm": 0.40416655755195474, "learning_rate": 2.3610195621494595e-05, "loss": 0.386, "step": 24765 }, { "epoch": 2.888292910447761, "grad_norm": 0.4368691942878869, "learning_rate": 2.360165045868693e-05, "loss": 0.3968, "step": 24770 }, { "epoch": 2.8888759328358207, "grad_norm": 0.37835566532333276, "learning_rate": 2.3593105875526488e-05, "loss": 0.3757, "step": 24775 }, { "epoch": 2.8894589552238807, "grad_norm": 0.4364902532815583, "learning_rate": 2.358456187328376e-05, "loss": 0.4083, "step": 24780 }, { "epoch": 2.8900419776119404, "grad_norm": 0.3782607959674674, "learning_rate": 2.3576018453229182e-05, "loss": 0.3691, "step": 24785 }, { "epoch": 2.890625, "grad_norm": 0.3797036800433863, "learning_rate": 2.3567475616633046e-05, "loss": 0.3827, "step": 24790 }, { "epoch": 2.8912080223880596, "grad_norm": 0.40776317385220623, "learning_rate": 2.3558933364765617e-05, "loss": 0.4085, "step": 24795 }, { "epoch": 2.8917910447761193, "grad_norm": 0.43262439495951677, "learning_rate": 2.355039169889704e-05, "loss": 0.395, "step": 24800 }, { "epoch": 2.8923740671641793, "grad_norm": 0.4180800612198426, "learning_rate": 2.3541850620297368e-05, "loss": 0.392, "step": 24805 }, { "epoch": 2.892957089552239, "grad_norm": 0.48634111522693574, "learning_rate": 2.3533310130236592e-05, "loss": 0.4063, "step": 24810 }, { "epoch": 2.8935401119402986, "grad_norm": 0.41627741997295004, "learning_rate": 2.3524770229984587e-05, "loss": 0.4063, "step": 24815 }, { "epoch": 2.894123134328358, "grad_norm": 0.41709604498467673, "learning_rate": 2.3516230920811166e-05, "loss": 0.3991, "step": 24820 }, { "epoch": 2.894706156716418, "grad_norm": 0.40823029921163884, "learning_rate": 2.3507692203986042e-05, "loss": 0.3722, "step": 24825 }, { "epoch": 2.8952891791044775, "grad_norm": 0.42155178738534094, "learning_rate": 2.3499154080778823e-05, "loss": 0.4107, "step": 24830 }, { "epoch": 2.895872201492537, "grad_norm": 0.4166192474304213, "learning_rate": 2.349061655245907e-05, "loss": 0.4029, "step": 24835 }, { "epoch": 2.896455223880597, "grad_norm": 0.3954396494556229, "learning_rate": 2.3482079620296223e-05, "loss": 0.3856, "step": 24840 }, { "epoch": 2.897038246268657, "grad_norm": 0.3977847598870714, "learning_rate": 2.347354328555963e-05, "loss": 0.3888, "step": 24845 }, { "epoch": 2.8976212686567164, "grad_norm": 0.4046284318378812, "learning_rate": 2.3465007549518576e-05, "loss": 0.3964, "step": 24850 }, { "epoch": 2.898204291044776, "grad_norm": 0.433611208865864, "learning_rate": 2.3456472413442248e-05, "loss": 0.4367, "step": 24855 }, { "epoch": 2.8987873134328357, "grad_norm": 0.3976024854002812, "learning_rate": 2.3447937878599725e-05, "loss": 0.4025, "step": 24860 }, { "epoch": 2.8993703358208958, "grad_norm": 0.3989868868191118, "learning_rate": 2.3439403946260007e-05, "loss": 0.4096, "step": 24865 }, { "epoch": 2.8999533582089554, "grad_norm": 0.42939386585697015, "learning_rate": 2.343087061769203e-05, "loss": 0.3946, "step": 24870 }, { "epoch": 2.900536380597015, "grad_norm": 0.44495417024776424, "learning_rate": 2.3422337894164594e-05, "loss": 0.4117, "step": 24875 }, { "epoch": 2.9011194029850746, "grad_norm": 0.41446939070729183, "learning_rate": 2.3413805776946453e-05, "loss": 0.4124, "step": 24880 }, { "epoch": 2.9017024253731343, "grad_norm": 0.4057898275011049, "learning_rate": 2.3405274267306244e-05, "loss": 0.4058, "step": 24885 }, { "epoch": 2.902285447761194, "grad_norm": 0.4097613234138069, "learning_rate": 2.3396743366512508e-05, "loss": 0.4219, "step": 24890 }, { "epoch": 2.9028684701492535, "grad_norm": 0.4509067786161942, "learning_rate": 2.338821307583373e-05, "loss": 0.3958, "step": 24895 }, { "epoch": 2.9034514925373136, "grad_norm": 0.4277230430884208, "learning_rate": 2.337968339653826e-05, "loss": 0.405, "step": 24900 }, { "epoch": 2.9040345149253732, "grad_norm": 0.4297228967842881, "learning_rate": 2.33711543298944e-05, "loss": 0.423, "step": 24905 }, { "epoch": 2.904617537313433, "grad_norm": 0.4076491715263942, "learning_rate": 2.3362625877170336e-05, "loss": 0.3952, "step": 24910 }, { "epoch": 2.9052005597014925, "grad_norm": 0.42301387777350424, "learning_rate": 2.3354098039634154e-05, "loss": 0.4155, "step": 24915 }, { "epoch": 2.905783582089552, "grad_norm": 0.4015962877581815, "learning_rate": 2.3345570818553874e-05, "loss": 0.4018, "step": 24920 }, { "epoch": 2.906366604477612, "grad_norm": 0.41774728592458704, "learning_rate": 2.3337044215197408e-05, "loss": 0.3948, "step": 24925 }, { "epoch": 2.9069496268656714, "grad_norm": 0.44990262245478574, "learning_rate": 2.3328518230832587e-05, "loss": 0.424, "step": 24930 }, { "epoch": 2.9075326492537314, "grad_norm": 0.4413178522099354, "learning_rate": 2.3319992866727136e-05, "loss": 0.3854, "step": 24935 }, { "epoch": 2.908115671641791, "grad_norm": 0.4161768623084127, "learning_rate": 2.331146812414869e-05, "loss": 0.3973, "step": 24940 }, { "epoch": 2.9086986940298507, "grad_norm": 0.4568466173744008, "learning_rate": 2.330294400436481e-05, "loss": 0.4282, "step": 24945 }, { "epoch": 2.9092817164179103, "grad_norm": 0.43582365445869903, "learning_rate": 2.329442050864293e-05, "loss": 0.3934, "step": 24950 }, { "epoch": 2.90986473880597, "grad_norm": 0.41216198065629644, "learning_rate": 2.3285897638250437e-05, "loss": 0.3868, "step": 24955 }, { "epoch": 2.91044776119403, "grad_norm": 0.4521166492531563, "learning_rate": 2.3277375394454594e-05, "loss": 0.4322, "step": 24960 }, { "epoch": 2.9110307835820897, "grad_norm": 0.37478266654025316, "learning_rate": 2.326885377852256e-05, "loss": 0.3909, "step": 24965 }, { "epoch": 2.9116138059701493, "grad_norm": 0.39825062099728903, "learning_rate": 2.326033279172144e-05, "loss": 0.4127, "step": 24970 }, { "epoch": 2.912196828358209, "grad_norm": 0.4310244343023203, "learning_rate": 2.32518124353182e-05, "loss": 0.411, "step": 24975 }, { "epoch": 2.9127798507462686, "grad_norm": 0.40561810191760284, "learning_rate": 2.324329271057976e-05, "loss": 0.3954, "step": 24980 }, { "epoch": 2.9133628731343286, "grad_norm": 0.3886447880786942, "learning_rate": 2.3234773618772916e-05, "loss": 0.4035, "step": 24985 }, { "epoch": 2.913945895522388, "grad_norm": 0.39291361858950735, "learning_rate": 2.322625516116435e-05, "loss": 0.3977, "step": 24990 }, { "epoch": 2.914528917910448, "grad_norm": 0.4289910244574198, "learning_rate": 2.3217737339020706e-05, "loss": 0.4142, "step": 24995 }, { "epoch": 2.9151119402985075, "grad_norm": 0.41804607838764657, "learning_rate": 2.3209220153608486e-05, "loss": 0.4253, "step": 25000 }, { "epoch": 2.915694962686567, "grad_norm": 0.380525476332318, "learning_rate": 2.3200703606194126e-05, "loss": 0.3988, "step": 25005 }, { "epoch": 2.9162779850746268, "grad_norm": 0.4146357362301396, "learning_rate": 2.3192187698043944e-05, "loss": 0.4056, "step": 25010 }, { "epoch": 2.9168610074626864, "grad_norm": 0.4419473513927475, "learning_rate": 2.3183672430424163e-05, "loss": 0.4335, "step": 25015 }, { "epoch": 2.9174440298507465, "grad_norm": 0.4497441028504768, "learning_rate": 2.3175157804600954e-05, "loss": 0.4157, "step": 25020 }, { "epoch": 2.918027052238806, "grad_norm": 0.4231823433948403, "learning_rate": 2.3166643821840338e-05, "loss": 0.3964, "step": 25025 }, { "epoch": 2.9186100746268657, "grad_norm": 0.40692064793035276, "learning_rate": 2.3158130483408262e-05, "loss": 0.384, "step": 25030 }, { "epoch": 2.9191930970149254, "grad_norm": 0.43646246071138917, "learning_rate": 2.3149617790570587e-05, "loss": 0.4171, "step": 25035 }, { "epoch": 2.919776119402985, "grad_norm": 0.41868527060090205, "learning_rate": 2.3141105744593065e-05, "loss": 0.3638, "step": 25040 }, { "epoch": 2.9203591417910446, "grad_norm": 0.3950288733939245, "learning_rate": 2.3132594346741353e-05, "loss": 0.3996, "step": 25045 }, { "epoch": 2.9209421641791042, "grad_norm": 0.3900419470979898, "learning_rate": 2.3124083598281022e-05, "loss": 0.3701, "step": 25050 }, { "epoch": 2.9215251865671643, "grad_norm": 0.3757075298206466, "learning_rate": 2.3115573500477534e-05, "loss": 0.3826, "step": 25055 }, { "epoch": 2.922108208955224, "grad_norm": 0.43993979242127856, "learning_rate": 2.310706405459625e-05, "loss": 0.3769, "step": 25060 }, { "epoch": 2.9226912313432836, "grad_norm": 0.42997594377083986, "learning_rate": 2.3098555261902455e-05, "loss": 0.4117, "step": 25065 }, { "epoch": 2.923274253731343, "grad_norm": 0.3630604906423459, "learning_rate": 2.3090047123661324e-05, "loss": 0.3755, "step": 25070 }, { "epoch": 2.923857276119403, "grad_norm": 0.3984197416436645, "learning_rate": 2.308153964113793e-05, "loss": 0.4096, "step": 25075 }, { "epoch": 2.924440298507463, "grad_norm": 0.4005189029459354, "learning_rate": 2.3073032815597263e-05, "loss": 0.3732, "step": 25080 }, { "epoch": 2.9250233208955225, "grad_norm": 0.42754499865938367, "learning_rate": 2.3064526648304195e-05, "loss": 0.4026, "step": 25085 }, { "epoch": 2.925606343283582, "grad_norm": 0.39659037663555, "learning_rate": 2.3056021140523516e-05, "loss": 0.4012, "step": 25090 }, { "epoch": 2.926189365671642, "grad_norm": 0.3857244207492492, "learning_rate": 2.304751629351992e-05, "loss": 0.376, "step": 25095 }, { "epoch": 2.9267723880597014, "grad_norm": 0.4091159243869105, "learning_rate": 2.3039012108557982e-05, "loss": 0.3889, "step": 25100 }, { "epoch": 2.927355410447761, "grad_norm": 0.4027380642874116, "learning_rate": 2.3030508586902215e-05, "loss": 0.413, "step": 25105 }, { "epoch": 2.9279384328358207, "grad_norm": 0.38937355535511614, "learning_rate": 2.3022005729817e-05, "loss": 0.3832, "step": 25110 }, { "epoch": 2.9285214552238807, "grad_norm": 0.41596054645460284, "learning_rate": 2.301350353856661e-05, "loss": 0.4048, "step": 25115 }, { "epoch": 2.9291044776119404, "grad_norm": 0.4161171037388307, "learning_rate": 2.3005002014415274e-05, "loss": 0.4167, "step": 25120 }, { "epoch": 2.9296875, "grad_norm": 0.3833302448550085, "learning_rate": 2.2996501158627054e-05, "loss": 0.3982, "step": 25125 }, { "epoch": 2.9302705223880596, "grad_norm": 0.4438540143346973, "learning_rate": 2.2988000972465978e-05, "loss": 0.4306, "step": 25130 }, { "epoch": 2.9308535447761193, "grad_norm": 0.43239374779471784, "learning_rate": 2.297950145719593e-05, "loss": 0.4431, "step": 25135 }, { "epoch": 2.9314365671641793, "grad_norm": 0.47247131842109463, "learning_rate": 2.297100261408069e-05, "loss": 0.3979, "step": 25140 }, { "epoch": 2.932019589552239, "grad_norm": 0.40091967591663, "learning_rate": 2.2962504444383974e-05, "loss": 0.3908, "step": 25145 }, { "epoch": 2.9326026119402986, "grad_norm": 0.3794271000855443, "learning_rate": 2.295400694936937e-05, "loss": 0.3978, "step": 25150 }, { "epoch": 2.933185634328358, "grad_norm": 0.4095246121458622, "learning_rate": 2.2945510130300386e-05, "loss": 0.4215, "step": 25155 }, { "epoch": 2.933768656716418, "grad_norm": 0.4064170878907522, "learning_rate": 2.2937013988440405e-05, "loss": 0.4178, "step": 25160 }, { "epoch": 2.9343516791044775, "grad_norm": 0.4118465548109363, "learning_rate": 2.2928518525052717e-05, "loss": 0.4167, "step": 25165 }, { "epoch": 2.934934701492537, "grad_norm": 0.41806234804780734, "learning_rate": 2.2920023741400533e-05, "loss": 0.4042, "step": 25170 }, { "epoch": 2.935517723880597, "grad_norm": 0.42840618861931157, "learning_rate": 2.2911529638746927e-05, "loss": 0.3817, "step": 25175 }, { "epoch": 2.936100746268657, "grad_norm": 0.41502354390655183, "learning_rate": 2.2903036218354912e-05, "loss": 0.3994, "step": 25180 }, { "epoch": 2.9366837686567164, "grad_norm": 0.42591051491474585, "learning_rate": 2.289454348148736e-05, "loss": 0.4108, "step": 25185 }, { "epoch": 2.937266791044776, "grad_norm": 0.41491598081280956, "learning_rate": 2.288605142940707e-05, "loss": 0.3917, "step": 25190 }, { "epoch": 2.9378498134328357, "grad_norm": 0.3745700162861839, "learning_rate": 2.2877560063376723e-05, "loss": 0.4062, "step": 25195 }, { "epoch": 2.9384328358208958, "grad_norm": 0.41726980475028025, "learning_rate": 2.2869069384658908e-05, "loss": 0.4291, "step": 25200 }, { "epoch": 2.9390158582089554, "grad_norm": 0.5516879898324801, "learning_rate": 2.2860579394516107e-05, "loss": 0.419, "step": 25205 }, { "epoch": 2.939598880597015, "grad_norm": 0.3848375565823875, "learning_rate": 2.2852090094210698e-05, "loss": 0.4064, "step": 25210 }, { "epoch": 2.9401819029850746, "grad_norm": 0.4109948064761437, "learning_rate": 2.2843601485004955e-05, "loss": 0.3992, "step": 25215 }, { "epoch": 2.9407649253731343, "grad_norm": 0.3927265043294491, "learning_rate": 2.283511356816106e-05, "loss": 0.4042, "step": 25220 }, { "epoch": 2.941347947761194, "grad_norm": 0.48121892872118954, "learning_rate": 2.2826626344941083e-05, "loss": 0.3892, "step": 25225 }, { "epoch": 2.9419309701492535, "grad_norm": 0.37256851975056027, "learning_rate": 2.2818139816607e-05, "loss": 0.3747, "step": 25230 }, { "epoch": 2.9425139925373136, "grad_norm": 0.3921876104660543, "learning_rate": 2.2809653984420652e-05, "loss": 0.3831, "step": 25235 }, { "epoch": 2.9430970149253732, "grad_norm": 0.4266465103397456, "learning_rate": 2.280116884964383e-05, "loss": 0.4239, "step": 25240 }, { "epoch": 2.943680037313433, "grad_norm": 0.3947150176195217, "learning_rate": 2.279268441353818e-05, "loss": 0.3994, "step": 25245 }, { "epoch": 2.9442630597014925, "grad_norm": 0.43091467973147646, "learning_rate": 2.2784200677365242e-05, "loss": 0.4288, "step": 25250 }, { "epoch": 2.944846082089552, "grad_norm": 0.4142103610308874, "learning_rate": 2.2775717642386497e-05, "loss": 0.4131, "step": 25255 }, { "epoch": 2.945429104477612, "grad_norm": 0.4215738737978956, "learning_rate": 2.276723530986327e-05, "loss": 0.413, "step": 25260 }, { "epoch": 2.9460121268656714, "grad_norm": 0.4043071366559255, "learning_rate": 2.27587536810568e-05, "loss": 0.4046, "step": 25265 }, { "epoch": 2.9465951492537314, "grad_norm": 0.39963288308005285, "learning_rate": 2.2750272757228235e-05, "loss": 0.3928, "step": 25270 }, { "epoch": 2.947178171641791, "grad_norm": 0.4260079204293212, "learning_rate": 2.27417925396386e-05, "loss": 0.3976, "step": 25275 }, { "epoch": 2.9477611940298507, "grad_norm": 0.4026489185676219, "learning_rate": 2.273331302954883e-05, "loss": 0.418, "step": 25280 }, { "epoch": 2.9483442164179103, "grad_norm": 0.4131040866705125, "learning_rate": 2.2724834228219742e-05, "loss": 0.3959, "step": 25285 }, { "epoch": 2.94892723880597, "grad_norm": 0.4178283482729982, "learning_rate": 2.271635613691205e-05, "loss": 0.3839, "step": 25290 }, { "epoch": 2.94951026119403, "grad_norm": 0.419560877318633, "learning_rate": 2.2707878756886368e-05, "loss": 0.4201, "step": 25295 }, { "epoch": 2.9500932835820897, "grad_norm": 0.42295331082686, "learning_rate": 2.26994020894032e-05, "loss": 0.4135, "step": 25300 }, { "epoch": 2.9506763059701493, "grad_norm": 0.4007879495508421, "learning_rate": 2.2690926135722946e-05, "loss": 0.4063, "step": 25305 }, { "epoch": 2.951259328358209, "grad_norm": 0.40328354566363045, "learning_rate": 2.2682450897105905e-05, "loss": 0.3961, "step": 25310 }, { "epoch": 2.9518423507462686, "grad_norm": 0.38707822579641404, "learning_rate": 2.2673976374812246e-05, "loss": 0.3886, "step": 25315 }, { "epoch": 2.9524253731343286, "grad_norm": 0.4193700698413242, "learning_rate": 2.266550257010207e-05, "loss": 0.4045, "step": 25320 }, { "epoch": 2.953008395522388, "grad_norm": 0.3828309533087151, "learning_rate": 2.265702948423534e-05, "loss": 0.3874, "step": 25325 }, { "epoch": 2.953591417910448, "grad_norm": 0.4138945989886876, "learning_rate": 2.2648557118471918e-05, "loss": 0.4097, "step": 25330 }, { "epoch": 2.9541744402985075, "grad_norm": 0.39832165005252257, "learning_rate": 2.264008547407158e-05, "loss": 0.3731, "step": 25335 }, { "epoch": 2.954757462686567, "grad_norm": 0.4221079463034623, "learning_rate": 2.2631614552293963e-05, "loss": 0.4295, "step": 25340 }, { "epoch": 2.9553404850746268, "grad_norm": 0.4399266627497746, "learning_rate": 2.2623144354398623e-05, "loss": 0.4045, "step": 25345 }, { "epoch": 2.9559235074626864, "grad_norm": 0.4028460067458374, "learning_rate": 2.2614674881644974e-05, "loss": 0.4032, "step": 25350 }, { "epoch": 2.9565065298507465, "grad_norm": 0.40086976663568546, "learning_rate": 2.2606206135292385e-05, "loss": 0.3883, "step": 25355 }, { "epoch": 2.957089552238806, "grad_norm": 0.40263627128283286, "learning_rate": 2.2597738116600048e-05, "loss": 0.4199, "step": 25360 }, { "epoch": 2.9576725746268657, "grad_norm": 0.46214462516979704, "learning_rate": 2.2589270826827073e-05, "loss": 0.4086, "step": 25365 }, { "epoch": 2.9582555970149254, "grad_norm": 0.39466493675004233, "learning_rate": 2.2580804267232484e-05, "loss": 0.3769, "step": 25370 }, { "epoch": 2.958838619402985, "grad_norm": 0.4021769345625866, "learning_rate": 2.257233843907517e-05, "loss": 0.3894, "step": 25375 }, { "epoch": 2.9594216417910446, "grad_norm": 0.38507883277133476, "learning_rate": 2.2563873343613916e-05, "loss": 0.4235, "step": 25380 }, { "epoch": 2.9600046641791042, "grad_norm": 0.41325075109978926, "learning_rate": 2.2555408982107407e-05, "loss": 0.413, "step": 25385 }, { "epoch": 2.9605876865671643, "grad_norm": 0.44675494871627636, "learning_rate": 2.2546945355814196e-05, "loss": 0.3839, "step": 25390 }, { "epoch": 2.961170708955224, "grad_norm": 0.39067882987873165, "learning_rate": 2.2538482465992762e-05, "loss": 0.4286, "step": 25395 }, { "epoch": 2.9617537313432836, "grad_norm": 0.43093610928308723, "learning_rate": 2.2530020313901446e-05, "loss": 0.3864, "step": 25400 }, { "epoch": 2.962336753731343, "grad_norm": 0.39795231184485813, "learning_rate": 2.2521558900798495e-05, "loss": 0.3943, "step": 25405 }, { "epoch": 2.962919776119403, "grad_norm": 0.38531472883509843, "learning_rate": 2.2513098227942032e-05, "loss": 0.3795, "step": 25410 }, { "epoch": 2.963502798507463, "grad_norm": 0.42197054661839406, "learning_rate": 2.250463829659008e-05, "loss": 0.3997, "step": 25415 }, { "epoch": 2.9640858208955225, "grad_norm": 0.42359026424902746, "learning_rate": 2.249617910800056e-05, "loss": 0.3913, "step": 25420 }, { "epoch": 2.964668843283582, "grad_norm": 0.41927514123524673, "learning_rate": 2.248772066343125e-05, "loss": 0.4009, "step": 25425 }, { "epoch": 2.965251865671642, "grad_norm": 0.44238902546771147, "learning_rate": 2.2479262964139863e-05, "loss": 0.4091, "step": 25430 }, { "epoch": 2.9658348880597014, "grad_norm": 0.4055168047041742, "learning_rate": 2.2470806011383972e-05, "loss": 0.3854, "step": 25435 }, { "epoch": 2.966417910447761, "grad_norm": 0.44548897158782064, "learning_rate": 2.2462349806421035e-05, "loss": 0.4153, "step": 25440 }, { "epoch": 2.9670009328358207, "grad_norm": 0.40984566677196516, "learning_rate": 2.245389435050842e-05, "loss": 0.4085, "step": 25445 }, { "epoch": 2.9675839552238807, "grad_norm": 0.37770958816231737, "learning_rate": 2.244543964490336e-05, "loss": 0.3793, "step": 25450 }, { "epoch": 2.9681669776119404, "grad_norm": 0.40014186988741113, "learning_rate": 2.2436985690863004e-05, "loss": 0.4069, "step": 25455 }, { "epoch": 2.96875, "grad_norm": 0.4271740503864899, "learning_rate": 2.2428532489644368e-05, "loss": 0.4004, "step": 25460 }, { "epoch": 2.9693330223880596, "grad_norm": 0.4287412853262523, "learning_rate": 2.2420080042504348e-05, "loss": 0.4088, "step": 25465 }, { "epoch": 2.9699160447761193, "grad_norm": 0.47085217031650956, "learning_rate": 2.2411628350699766e-05, "loss": 0.4047, "step": 25470 }, { "epoch": 2.9704990671641793, "grad_norm": 0.4236414913472614, "learning_rate": 2.2403177415487285e-05, "loss": 0.3855, "step": 25475 }, { "epoch": 2.971082089552239, "grad_norm": 0.44921836393514036, "learning_rate": 2.2394727238123497e-05, "loss": 0.4238, "step": 25480 }, { "epoch": 2.9716651119402986, "grad_norm": 0.43653869539233847, "learning_rate": 2.2386277819864853e-05, "loss": 0.4049, "step": 25485 }, { "epoch": 2.972248134328358, "grad_norm": 0.4453227108407065, "learning_rate": 2.23778291619677e-05, "loss": 0.4113, "step": 25490 }, { "epoch": 2.972831156716418, "grad_norm": 0.5524259039595061, "learning_rate": 2.2369381265688277e-05, "loss": 0.4009, "step": 25495 }, { "epoch": 2.9734141791044775, "grad_norm": 0.4118860361096215, "learning_rate": 2.236093413228269e-05, "loss": 0.4271, "step": 25500 }, { "epoch": 2.973997201492537, "grad_norm": 0.40652814888997635, "learning_rate": 2.2352487763006975e-05, "loss": 0.4235, "step": 25505 }, { "epoch": 2.974580223880597, "grad_norm": 0.4374563967976496, "learning_rate": 2.2344042159117006e-05, "loss": 0.4284, "step": 25510 }, { "epoch": 2.975163246268657, "grad_norm": 0.42160551742662283, "learning_rate": 2.2335597321868568e-05, "loss": 0.4074, "step": 25515 }, { "epoch": 2.9757462686567164, "grad_norm": 0.38660997279551207, "learning_rate": 2.2327153252517323e-05, "loss": 0.3876, "step": 25520 }, { "epoch": 2.976329291044776, "grad_norm": 0.4126166538415622, "learning_rate": 2.2318709952318822e-05, "loss": 0.4142, "step": 25525 }, { "epoch": 2.9769123134328357, "grad_norm": 0.3872941861425115, "learning_rate": 2.2310267422528523e-05, "loss": 0.3934, "step": 25530 }, { "epoch": 2.9774953358208958, "grad_norm": 0.40355541577838977, "learning_rate": 2.2301825664401733e-05, "loss": 0.4182, "step": 25535 }, { "epoch": 2.9780783582089554, "grad_norm": 0.483726374003381, "learning_rate": 2.2293384679193645e-05, "loss": 0.4213, "step": 25540 }, { "epoch": 2.978661380597015, "grad_norm": 0.43331721004443324, "learning_rate": 2.228494446815939e-05, "loss": 0.4137, "step": 25545 }, { "epoch": 2.9792444029850746, "grad_norm": 0.4004588618503616, "learning_rate": 2.2276505032553912e-05, "loss": 0.3982, "step": 25550 }, { "epoch": 2.9798274253731343, "grad_norm": 0.4222765181742688, "learning_rate": 2.2268066373632096e-05, "loss": 0.4272, "step": 25555 }, { "epoch": 2.980410447761194, "grad_norm": 0.45113868243041527, "learning_rate": 2.2259628492648676e-05, "loss": 0.4128, "step": 25560 }, { "epoch": 2.9809934701492535, "grad_norm": 0.4342499839855839, "learning_rate": 2.2251191390858295e-05, "loss": 0.3826, "step": 25565 }, { "epoch": 2.9815764925373136, "grad_norm": 0.4266366236984472, "learning_rate": 2.224275506951547e-05, "loss": 0.4174, "step": 25570 }, { "epoch": 2.9821595149253732, "grad_norm": 0.407381288290989, "learning_rate": 2.2234319529874586e-05, "loss": 0.3967, "step": 25575 }, { "epoch": 2.982742537313433, "grad_norm": 0.3899195848917077, "learning_rate": 2.2225884773189936e-05, "loss": 0.4168, "step": 25580 }, { "epoch": 2.9833255597014925, "grad_norm": 0.3990786929492579, "learning_rate": 2.221745080071569e-05, "loss": 0.3774, "step": 25585 }, { "epoch": 2.983908582089552, "grad_norm": 0.4012067435433367, "learning_rate": 2.2209017613705908e-05, "loss": 0.4093, "step": 25590 }, { "epoch": 2.984491604477612, "grad_norm": 0.4829709734822829, "learning_rate": 2.22005852134145e-05, "loss": 0.4028, "step": 25595 }, { "epoch": 2.9850746268656714, "grad_norm": 0.42554208792579185, "learning_rate": 2.2192153601095293e-05, "loss": 0.4084, "step": 25600 }, { "epoch": 2.9856576492537314, "grad_norm": 0.3953817743750672, "learning_rate": 2.2183722778002004e-05, "loss": 0.377, "step": 25605 }, { "epoch": 2.986240671641791, "grad_norm": 0.4166501312880979, "learning_rate": 2.2175292745388186e-05, "loss": 0.3996, "step": 25610 }, { "epoch": 2.9868236940298507, "grad_norm": 0.4538563083315286, "learning_rate": 2.2166863504507336e-05, "loss": 0.4162, "step": 25615 }, { "epoch": 2.9874067164179103, "grad_norm": 0.42389743691607706, "learning_rate": 2.2158435056612775e-05, "loss": 0.4036, "step": 25620 }, { "epoch": 2.98798973880597, "grad_norm": 0.4116072944395864, "learning_rate": 2.215000740295774e-05, "loss": 0.4144, "step": 25625 }, { "epoch": 2.98857276119403, "grad_norm": 0.40491099460562413, "learning_rate": 2.2141580544795353e-05, "loss": 0.3677, "step": 25630 }, { "epoch": 2.9891557835820897, "grad_norm": 0.41021211807311764, "learning_rate": 2.2133154483378587e-05, "loss": 0.3931, "step": 25635 }, { "epoch": 2.9897388059701493, "grad_norm": 0.4142741077006574, "learning_rate": 2.2124729219960343e-05, "loss": 0.4042, "step": 25640 }, { "epoch": 2.990321828358209, "grad_norm": 0.5130815958463901, "learning_rate": 2.211630475579336e-05, "loss": 0.4139, "step": 25645 }, { "epoch": 2.9909048507462686, "grad_norm": 0.41636737963451764, "learning_rate": 2.2107881092130266e-05, "loss": 0.4167, "step": 25650 }, { "epoch": 2.9914878731343286, "grad_norm": 0.37704740496619205, "learning_rate": 2.209945823022361e-05, "loss": 0.4094, "step": 25655 }, { "epoch": 2.992070895522388, "grad_norm": 0.44399220632690417, "learning_rate": 2.2091036171325754e-05, "loss": 0.4024, "step": 25660 }, { "epoch": 2.992653917910448, "grad_norm": 0.40764856190725607, "learning_rate": 2.2082614916689002e-05, "loss": 0.3627, "step": 25665 }, { "epoch": 2.9932369402985075, "grad_norm": 0.4239153676933328, "learning_rate": 2.2074194467565514e-05, "loss": 0.3837, "step": 25670 }, { "epoch": 2.993819962686567, "grad_norm": 0.40367519234787475, "learning_rate": 2.2065774825207304e-05, "loss": 0.4064, "step": 25675 }, { "epoch": 2.9944029850746268, "grad_norm": 0.41618454761265616, "learning_rate": 2.2057355990866328e-05, "loss": 0.3786, "step": 25680 }, { "epoch": 2.9949860074626864, "grad_norm": 0.4684558839151656, "learning_rate": 2.204893796579436e-05, "loss": 0.4113, "step": 25685 }, { "epoch": 2.9955690298507465, "grad_norm": 0.39257373267619683, "learning_rate": 2.2040520751243094e-05, "loss": 0.4094, "step": 25690 }, { "epoch": 2.996152052238806, "grad_norm": 0.421877977446944, "learning_rate": 2.2032104348464082e-05, "loss": 0.4149, "step": 25695 }, { "epoch": 2.9967350746268657, "grad_norm": 0.4045253191912498, "learning_rate": 2.2023688758708767e-05, "loss": 0.4285, "step": 25700 }, { "epoch": 2.9973180970149254, "grad_norm": 0.4073067509537844, "learning_rate": 2.201527398322846e-05, "loss": 0.3997, "step": 25705 }, { "epoch": 2.997901119402985, "grad_norm": 0.4153333223415684, "learning_rate": 2.2006860023274363e-05, "loss": 0.412, "step": 25710 }, { "epoch": 2.9984841417910446, "grad_norm": 0.38451643205960345, "learning_rate": 2.199844688009755e-05, "loss": 0.4017, "step": 25715 }, { "epoch": 2.9990671641791042, "grad_norm": 0.3894626368127005, "learning_rate": 2.199003455494898e-05, "loss": 0.431, "step": 25720 }, { "epoch": 2.9996501865671643, "grad_norm": 0.413333139108324, "learning_rate": 2.198162304907947e-05, "loss": 0.3862, "step": 25725 }, { "epoch": 3.000233208955224, "grad_norm": 0.3709667014045933, "learning_rate": 2.1973212363739747e-05, "loss": 0.3349, "step": 25730 }, { "epoch": 3.0008162313432836, "grad_norm": 0.37324013937538697, "learning_rate": 2.1964802500180388e-05, "loss": 0.3258, "step": 25735 }, { "epoch": 3.001399253731343, "grad_norm": 0.4321794169270504, "learning_rate": 2.1956393459651864e-05, "loss": 0.306, "step": 25740 }, { "epoch": 3.001982276119403, "grad_norm": 0.43584264007268964, "learning_rate": 2.1947985243404522e-05, "loss": 0.3349, "step": 25745 }, { "epoch": 3.002565298507463, "grad_norm": 0.4946303778086085, "learning_rate": 2.1939577852688576e-05, "loss": 0.3399, "step": 25750 }, { "epoch": 3.0031483208955225, "grad_norm": 0.478556817161445, "learning_rate": 2.1931171288754133e-05, "loss": 0.3145, "step": 25755 }, { "epoch": 3.003731343283582, "grad_norm": 0.43115696590550234, "learning_rate": 2.1922765552851155e-05, "loss": 0.324, "step": 25760 }, { "epoch": 3.004314365671642, "grad_norm": 0.41381544876817744, "learning_rate": 2.1914360646229508e-05, "loss": 0.3308, "step": 25765 }, { "epoch": 3.0048973880597014, "grad_norm": 0.45448327625698814, "learning_rate": 2.190595657013892e-05, "loss": 0.3501, "step": 25770 }, { "epoch": 3.005480410447761, "grad_norm": 0.44213183445629095, "learning_rate": 2.1897553325828984e-05, "loss": 0.3219, "step": 25775 }, { "epoch": 3.006063432835821, "grad_norm": 0.4284389496074295, "learning_rate": 2.1889150914549195e-05, "loss": 0.3102, "step": 25780 }, { "epoch": 3.0066464552238807, "grad_norm": 0.4281305962350065, "learning_rate": 2.18807493375489e-05, "loss": 0.3164, "step": 25785 }, { "epoch": 3.0072294776119404, "grad_norm": 0.46760793885757, "learning_rate": 2.1872348596077348e-05, "loss": 0.3303, "step": 25790 }, { "epoch": 3.0078125, "grad_norm": 0.48810245439796657, "learning_rate": 2.186394869138364e-05, "loss": 0.3626, "step": 25795 }, { "epoch": 3.0083955223880596, "grad_norm": 0.4188558881957621, "learning_rate": 2.1855549624716755e-05, "loss": 0.3079, "step": 25800 }, { "epoch": 3.0089785447761193, "grad_norm": 0.6868156571860319, "learning_rate": 2.1847151397325567e-05, "loss": 0.3506, "step": 25805 }, { "epoch": 3.009561567164179, "grad_norm": 0.4166324683203662, "learning_rate": 2.1838754010458796e-05, "loss": 0.3395, "step": 25810 }, { "epoch": 3.010144589552239, "grad_norm": 0.439798060859837, "learning_rate": 2.183035746536507e-05, "loss": 0.3185, "step": 25815 }, { "epoch": 3.0107276119402986, "grad_norm": 0.4368121556123018, "learning_rate": 2.182196176329287e-05, "loss": 0.3215, "step": 25820 }, { "epoch": 3.011310634328358, "grad_norm": 0.5375571776187688, "learning_rate": 2.181356690549054e-05, "loss": 0.3328, "step": 25825 }, { "epoch": 3.011893656716418, "grad_norm": 0.5128792384264712, "learning_rate": 2.1805172893206342e-05, "loss": 0.3191, "step": 25830 }, { "epoch": 3.0124766791044775, "grad_norm": 0.5022495614254291, "learning_rate": 2.179677972768836e-05, "loss": 0.3202, "step": 25835 }, { "epoch": 3.013059701492537, "grad_norm": 0.44919366412330874, "learning_rate": 2.1788387410184603e-05, "loss": 0.3083, "step": 25840 }, { "epoch": 3.013642723880597, "grad_norm": 0.44961762537675826, "learning_rate": 2.177999594194291e-05, "loss": 0.3243, "step": 25845 }, { "epoch": 3.014225746268657, "grad_norm": 0.4497450788881165, "learning_rate": 2.177160532421101e-05, "loss": 0.3275, "step": 25850 }, { "epoch": 3.0148087686567164, "grad_norm": 0.5022477741310479, "learning_rate": 2.1763215558236515e-05, "loss": 0.3167, "step": 25855 }, { "epoch": 3.015391791044776, "grad_norm": 0.4308664530537648, "learning_rate": 2.1754826645266895e-05, "loss": 0.328, "step": 25860 }, { "epoch": 3.0159748134328357, "grad_norm": 0.5031641219848102, "learning_rate": 2.1746438586549516e-05, "loss": 0.3348, "step": 25865 }, { "epoch": 3.0165578358208953, "grad_norm": 0.4583637418903681, "learning_rate": 2.1738051383331598e-05, "loss": 0.3192, "step": 25870 }, { "epoch": 3.0171408582089554, "grad_norm": 0.47224843054081067, "learning_rate": 2.1729665036860225e-05, "loss": 0.3544, "step": 25875 }, { "epoch": 3.017723880597015, "grad_norm": 0.4515220245909902, "learning_rate": 2.172127954838238e-05, "loss": 0.3094, "step": 25880 }, { "epoch": 3.0183069029850746, "grad_norm": 0.45586070303977266, "learning_rate": 2.1712894919144888e-05, "loss": 0.3384, "step": 25885 }, { "epoch": 3.0188899253731343, "grad_norm": 0.5169637575283627, "learning_rate": 2.1704511150394486e-05, "loss": 0.3156, "step": 25890 }, { "epoch": 3.019472947761194, "grad_norm": 0.4461111433917812, "learning_rate": 2.1696128243377743e-05, "loss": 0.3282, "step": 25895 }, { "epoch": 3.0200559701492535, "grad_norm": 0.437128585129177, "learning_rate": 2.1687746199341118e-05, "loss": 0.3028, "step": 25900 }, { "epoch": 3.0206389925373136, "grad_norm": 0.4353962297447659, "learning_rate": 2.1679365019530956e-05, "loss": 0.3234, "step": 25905 }, { "epoch": 3.0212220149253732, "grad_norm": 0.4144946526033545, "learning_rate": 2.167098470519344e-05, "loss": 0.3134, "step": 25910 }, { "epoch": 3.021805037313433, "grad_norm": 0.45415991580330156, "learning_rate": 2.1662605257574647e-05, "loss": 0.3226, "step": 25915 }, { "epoch": 3.0223880597014925, "grad_norm": 0.4494434819632714, "learning_rate": 2.165422667792053e-05, "loss": 0.3232, "step": 25920 }, { "epoch": 3.022971082089552, "grad_norm": 0.4633580086275821, "learning_rate": 2.1645848967476895e-05, "loss": 0.3484, "step": 25925 }, { "epoch": 3.0235541044776117, "grad_norm": 0.41772851509616826, "learning_rate": 2.1637472127489427e-05, "loss": 0.3049, "step": 25930 }, { "epoch": 3.024137126865672, "grad_norm": 0.5298250438349478, "learning_rate": 2.1629096159203683e-05, "loss": 0.3369, "step": 25935 }, { "epoch": 3.0247201492537314, "grad_norm": 0.47392793455278825, "learning_rate": 2.162072106386509e-05, "loss": 0.3282, "step": 25940 }, { "epoch": 3.025303171641791, "grad_norm": 0.4667852627077425, "learning_rate": 2.161234684271895e-05, "loss": 0.3259, "step": 25945 }, { "epoch": 3.0258861940298507, "grad_norm": 0.46375690535376113, "learning_rate": 2.1603973497010417e-05, "loss": 0.3221, "step": 25950 }, { "epoch": 3.0264692164179103, "grad_norm": 0.4814207884770054, "learning_rate": 2.1595601027984535e-05, "loss": 0.3336, "step": 25955 }, { "epoch": 3.02705223880597, "grad_norm": 0.4285397278536263, "learning_rate": 2.158722943688621e-05, "loss": 0.325, "step": 25960 }, { "epoch": 3.02763526119403, "grad_norm": 0.5222797070512876, "learning_rate": 2.1578858724960215e-05, "loss": 0.3369, "step": 25965 }, { "epoch": 3.0282182835820897, "grad_norm": 0.46558662752257896, "learning_rate": 2.1570488893451203e-05, "loss": 0.3177, "step": 25970 }, { "epoch": 3.0288013059701493, "grad_norm": 0.4740422033882584, "learning_rate": 2.1562119943603672e-05, "loss": 0.3298, "step": 25975 }, { "epoch": 3.029384328358209, "grad_norm": 0.48152240232416327, "learning_rate": 2.1553751876662014e-05, "loss": 0.3435, "step": 25980 }, { "epoch": 3.0299673507462686, "grad_norm": 0.49326475205634374, "learning_rate": 2.154538469387048e-05, "loss": 0.3295, "step": 25985 }, { "epoch": 3.030550373134328, "grad_norm": 0.4587260614381078, "learning_rate": 2.1537018396473195e-05, "loss": 0.3103, "step": 25990 }, { "epoch": 3.0311333955223883, "grad_norm": 0.4509155814863767, "learning_rate": 2.1528652985714143e-05, "loss": 0.3269, "step": 25995 }, { "epoch": 3.031716417910448, "grad_norm": 0.4828891286812978, "learning_rate": 2.1520288462837175e-05, "loss": 0.3316, "step": 26000 }, { "epoch": 3.0322994402985075, "grad_norm": 0.48231842060370533, "learning_rate": 2.1511924829086015e-05, "loss": 0.3108, "step": 26005 }, { "epoch": 3.032882462686567, "grad_norm": 0.5775698144252633, "learning_rate": 2.1503562085704265e-05, "loss": 0.3236, "step": 26010 }, { "epoch": 3.0334654850746268, "grad_norm": 0.46151414345237757, "learning_rate": 2.149520023393538e-05, "loss": 0.3317, "step": 26015 }, { "epoch": 3.0340485074626864, "grad_norm": 0.46242092784899796, "learning_rate": 2.148683927502269e-05, "loss": 0.3306, "step": 26020 }, { "epoch": 3.0346315298507465, "grad_norm": 0.4189566355856064, "learning_rate": 2.1478479210209383e-05, "loss": 0.3335, "step": 26025 }, { "epoch": 3.035214552238806, "grad_norm": 0.442510967396775, "learning_rate": 2.147012004073853e-05, "loss": 0.3416, "step": 26030 }, { "epoch": 3.0357975746268657, "grad_norm": 0.48666760042409635, "learning_rate": 2.1461761767853038e-05, "loss": 0.3099, "step": 26035 }, { "epoch": 3.0363805970149254, "grad_norm": 0.4111072274366574, "learning_rate": 2.1453404392795735e-05, "loss": 0.3184, "step": 26040 }, { "epoch": 3.036963619402985, "grad_norm": 0.4640345669066334, "learning_rate": 2.1445047916809262e-05, "loss": 0.3181, "step": 26045 }, { "epoch": 3.0375466417910446, "grad_norm": 0.46677437904450414, "learning_rate": 2.143669234113614e-05, "loss": 0.3174, "step": 26050 }, { "epoch": 3.0381296641791047, "grad_norm": 0.4626266546083552, "learning_rate": 2.1428337667018782e-05, "loss": 0.3388, "step": 26055 }, { "epoch": 3.0387126865671643, "grad_norm": 0.4956156984419573, "learning_rate": 2.1419983895699437e-05, "loss": 0.3387, "step": 26060 }, { "epoch": 3.039295708955224, "grad_norm": 0.441904913513983, "learning_rate": 2.141163102842023e-05, "loss": 0.3292, "step": 26065 }, { "epoch": 3.0398787313432836, "grad_norm": 0.4340883696843336, "learning_rate": 2.1403279066423166e-05, "loss": 0.3259, "step": 26070 }, { "epoch": 3.040461753731343, "grad_norm": 0.44951348637735855, "learning_rate": 2.1394928010950077e-05, "loss": 0.3407, "step": 26075 }, { "epoch": 3.041044776119403, "grad_norm": 0.48862955978877737, "learning_rate": 2.1386577863242708e-05, "loss": 0.3338, "step": 26080 }, { "epoch": 3.041627798507463, "grad_norm": 0.44175615140140967, "learning_rate": 2.1378228624542628e-05, "loss": 0.3361, "step": 26085 }, { "epoch": 3.0422108208955225, "grad_norm": 0.4990818637282398, "learning_rate": 2.136988029609131e-05, "loss": 0.3255, "step": 26090 }, { "epoch": 3.042793843283582, "grad_norm": 0.4916890579858683, "learning_rate": 2.1361532879130058e-05, "loss": 0.3269, "step": 26095 }, { "epoch": 3.043376865671642, "grad_norm": 0.44104377142597967, "learning_rate": 2.135318637490004e-05, "loss": 0.3116, "step": 26100 }, { "epoch": 3.0439598880597014, "grad_norm": 0.46567665822910337, "learning_rate": 2.1344840784642322e-05, "loss": 0.3194, "step": 26105 }, { "epoch": 3.044542910447761, "grad_norm": 0.46562424960840615, "learning_rate": 2.1336496109597804e-05, "loss": 0.3118, "step": 26110 }, { "epoch": 3.0451259328358207, "grad_norm": 0.4892840179391156, "learning_rate": 2.1328152351007264e-05, "loss": 0.3103, "step": 26115 }, { "epoch": 3.0457089552238807, "grad_norm": 0.43899662845298115, "learning_rate": 2.131980951011134e-05, "loss": 0.3369, "step": 26120 }, { "epoch": 3.0462919776119404, "grad_norm": 0.513675059625149, "learning_rate": 2.1311467588150518e-05, "loss": 0.3341, "step": 26125 }, { "epoch": 3.046875, "grad_norm": 0.47734407906100274, "learning_rate": 2.1303126586365175e-05, "loss": 0.3417, "step": 26130 }, { "epoch": 3.0474580223880596, "grad_norm": 0.43312626180627223, "learning_rate": 2.1294786505995534e-05, "loss": 0.3144, "step": 26135 }, { "epoch": 3.0480410447761193, "grad_norm": 0.45351095507528777, "learning_rate": 2.1286447348281695e-05, "loss": 0.3246, "step": 26140 }, { "epoch": 3.0486240671641793, "grad_norm": 0.550977527627621, "learning_rate": 2.1278109114463594e-05, "loss": 0.343, "step": 26145 }, { "epoch": 3.049207089552239, "grad_norm": 0.4570110420316436, "learning_rate": 2.126977180578106e-05, "loss": 0.3456, "step": 26150 }, { "epoch": 3.0497901119402986, "grad_norm": 0.4726144359521414, "learning_rate": 2.1261435423473765e-05, "loss": 0.3282, "step": 26155 }, { "epoch": 3.050373134328358, "grad_norm": 0.4383123748438573, "learning_rate": 2.1253099968781237e-05, "loss": 0.3162, "step": 26160 }, { "epoch": 3.050956156716418, "grad_norm": 0.4733717130768152, "learning_rate": 2.1244765442942904e-05, "loss": 0.3178, "step": 26165 }, { "epoch": 3.0515391791044775, "grad_norm": 0.46542673525491274, "learning_rate": 2.1236431847198017e-05, "loss": 0.3281, "step": 26170 }, { "epoch": 3.052122201492537, "grad_norm": 0.4735626309677211, "learning_rate": 2.1228099182785693e-05, "loss": 0.3389, "step": 26175 }, { "epoch": 3.052705223880597, "grad_norm": 0.4818373430655894, "learning_rate": 2.1219767450944938e-05, "loss": 0.3179, "step": 26180 }, { "epoch": 3.053288246268657, "grad_norm": 0.4681946119389254, "learning_rate": 2.1211436652914585e-05, "loss": 0.3105, "step": 26185 }, { "epoch": 3.0538712686567164, "grad_norm": 0.49628334189273177, "learning_rate": 2.1203106789933352e-05, "loss": 0.3468, "step": 26190 }, { "epoch": 3.054454291044776, "grad_norm": 0.4760500825592149, "learning_rate": 2.119477786323981e-05, "loss": 0.3201, "step": 26195 }, { "epoch": 3.0550373134328357, "grad_norm": 0.45477546124887575, "learning_rate": 2.1186449874072385e-05, "loss": 0.341, "step": 26200 }, { "epoch": 3.0556203358208953, "grad_norm": 0.5026800137037384, "learning_rate": 2.1178122823669373e-05, "loss": 0.3393, "step": 26205 }, { "epoch": 3.0562033582089554, "grad_norm": 0.5065218787007753, "learning_rate": 2.116979671326892e-05, "loss": 0.3267, "step": 26210 }, { "epoch": 3.056786380597015, "grad_norm": 0.41895410179357156, "learning_rate": 2.1161471544109057e-05, "loss": 0.3151, "step": 26215 }, { "epoch": 3.0573694029850746, "grad_norm": 0.40538522270532434, "learning_rate": 2.115314731742764e-05, "loss": 0.3071, "step": 26220 }, { "epoch": 3.0579524253731343, "grad_norm": 0.48454399106534213, "learning_rate": 2.1144824034462403e-05, "loss": 0.337, "step": 26225 }, { "epoch": 3.058535447761194, "grad_norm": 0.4370705887097963, "learning_rate": 2.1136501696450943e-05, "loss": 0.3302, "step": 26230 }, { "epoch": 3.0591184701492535, "grad_norm": 0.4445676743234194, "learning_rate": 2.112818030463071e-05, "loss": 0.3185, "step": 26235 }, { "epoch": 3.0597014925373136, "grad_norm": 0.4991765898821802, "learning_rate": 2.1119859860239023e-05, "loss": 0.3275, "step": 26240 }, { "epoch": 3.0602845149253732, "grad_norm": 0.46111349895725334, "learning_rate": 2.1111540364513045e-05, "loss": 0.3186, "step": 26245 }, { "epoch": 3.060867537313433, "grad_norm": 0.4484815999642295, "learning_rate": 2.1103221818689794e-05, "loss": 0.3072, "step": 26250 }, { "epoch": 3.0614505597014925, "grad_norm": 0.43613201402576735, "learning_rate": 2.1094904224006185e-05, "loss": 0.3189, "step": 26255 }, { "epoch": 3.062033582089552, "grad_norm": 0.4368901513564297, "learning_rate": 2.108658758169893e-05, "loss": 0.3168, "step": 26260 }, { "epoch": 3.0626166044776117, "grad_norm": 0.46999070952488564, "learning_rate": 2.107827189300467e-05, "loss": 0.3177, "step": 26265 }, { "epoch": 3.063199626865672, "grad_norm": 0.47318691393207823, "learning_rate": 2.1069957159159848e-05, "loss": 0.3332, "step": 26270 }, { "epoch": 3.0637826492537314, "grad_norm": 0.46268788338947603, "learning_rate": 2.1061643381400785e-05, "loss": 0.3084, "step": 26275 }, { "epoch": 3.064365671641791, "grad_norm": 0.44748836691943034, "learning_rate": 2.105333056096367e-05, "loss": 0.3388, "step": 26280 }, { "epoch": 3.0649486940298507, "grad_norm": 0.41553750338952344, "learning_rate": 2.104501869908453e-05, "loss": 0.312, "step": 26285 }, { "epoch": 3.0655317164179103, "grad_norm": 0.48119437755123573, "learning_rate": 2.1036707796999267e-05, "loss": 0.315, "step": 26290 }, { "epoch": 3.06611473880597, "grad_norm": 0.5485148429526312, "learning_rate": 2.102839785594362e-05, "loss": 0.3217, "step": 26295 }, { "epoch": 3.06669776119403, "grad_norm": 0.4813421692880979, "learning_rate": 2.1020088877153215e-05, "loss": 0.3154, "step": 26300 }, { "epoch": 3.0672807835820897, "grad_norm": 0.45170470024821985, "learning_rate": 2.1011780861863504e-05, "loss": 0.3131, "step": 26305 }, { "epoch": 3.0678638059701493, "grad_norm": 0.45835875961148714, "learning_rate": 2.100347381130982e-05, "loss": 0.3368, "step": 26310 }, { "epoch": 3.068446828358209, "grad_norm": 0.510026992930468, "learning_rate": 2.099516772672733e-05, "loss": 0.3332, "step": 26315 }, { "epoch": 3.0690298507462686, "grad_norm": 0.5120793089999462, "learning_rate": 2.0986862609351077e-05, "loss": 0.3464, "step": 26320 }, { "epoch": 3.069612873134328, "grad_norm": 0.46710572074543905, "learning_rate": 2.0978558460415954e-05, "loss": 0.3372, "step": 26325 }, { "epoch": 3.0701958955223883, "grad_norm": 0.6917733491212216, "learning_rate": 2.09702552811567e-05, "loss": 0.3305, "step": 26330 }, { "epoch": 3.070778917910448, "grad_norm": 0.44714880732994, "learning_rate": 2.096195307280792e-05, "loss": 0.3166, "step": 26335 }, { "epoch": 3.0713619402985075, "grad_norm": 0.4460138250360666, "learning_rate": 2.0953651836604083e-05, "loss": 0.3315, "step": 26340 }, { "epoch": 3.071944962686567, "grad_norm": 0.43659882259326416, "learning_rate": 2.094535157377949e-05, "loss": 0.3325, "step": 26345 }, { "epoch": 3.0725279850746268, "grad_norm": 0.4523932005657126, "learning_rate": 2.093705228556832e-05, "loss": 0.325, "step": 26350 }, { "epoch": 3.0731110074626864, "grad_norm": 0.4360071900869937, "learning_rate": 2.0928753973204597e-05, "loss": 0.3187, "step": 26355 }, { "epoch": 3.0736940298507465, "grad_norm": 0.515786247549207, "learning_rate": 2.0920456637922194e-05, "loss": 0.3373, "step": 26360 }, { "epoch": 3.074277052238806, "grad_norm": 0.4664235027987304, "learning_rate": 2.0912160280954852e-05, "loss": 0.3293, "step": 26365 }, { "epoch": 3.0748600746268657, "grad_norm": 0.5105316255986599, "learning_rate": 2.0903864903536147e-05, "loss": 0.332, "step": 26370 }, { "epoch": 3.0754430970149254, "grad_norm": 0.465501450938767, "learning_rate": 2.0895570506899544e-05, "loss": 0.3122, "step": 26375 }, { "epoch": 3.076026119402985, "grad_norm": 0.4784328153797881, "learning_rate": 2.088727709227833e-05, "loss": 0.3366, "step": 26380 }, { "epoch": 3.0766091417910446, "grad_norm": 0.4562543660615144, "learning_rate": 2.0878984660905644e-05, "loss": 0.3149, "step": 26385 }, { "epoch": 3.0771921641791047, "grad_norm": 0.4496520926125867, "learning_rate": 2.087069321401451e-05, "loss": 0.3207, "step": 26390 }, { "epoch": 3.0777751865671643, "grad_norm": 0.4712252095464664, "learning_rate": 2.0862402752837768e-05, "loss": 0.3332, "step": 26395 }, { "epoch": 3.078358208955224, "grad_norm": 0.4569205082117521, "learning_rate": 2.085411327860815e-05, "loss": 0.3405, "step": 26400 }, { "epoch": 3.0789412313432836, "grad_norm": 0.4602314078830094, "learning_rate": 2.084582479255821e-05, "loss": 0.3318, "step": 26405 }, { "epoch": 3.079524253731343, "grad_norm": 0.4696749255322814, "learning_rate": 2.083753729592037e-05, "loss": 0.321, "step": 26410 }, { "epoch": 3.080107276119403, "grad_norm": 0.4352764869390288, "learning_rate": 2.0829250789926898e-05, "loss": 0.3098, "step": 26415 }, { "epoch": 3.080690298507463, "grad_norm": 0.48320746852302104, "learning_rate": 2.0820965275809913e-05, "loss": 0.3328, "step": 26420 }, { "epoch": 3.0812733208955225, "grad_norm": 0.46097197214179425, "learning_rate": 2.081268075480141e-05, "loss": 0.3121, "step": 26425 }, { "epoch": 3.081856343283582, "grad_norm": 0.4922339007608091, "learning_rate": 2.0804397228133205e-05, "loss": 0.3418, "step": 26430 }, { "epoch": 3.082439365671642, "grad_norm": 0.46339810671505294, "learning_rate": 2.0796114697036972e-05, "loss": 0.3308, "step": 26435 }, { "epoch": 3.0830223880597014, "grad_norm": 0.574397678354726, "learning_rate": 2.0787833162744257e-05, "loss": 0.3119, "step": 26440 }, { "epoch": 3.083605410447761, "grad_norm": 0.4282297159701541, "learning_rate": 2.0779552626486444e-05, "loss": 0.347, "step": 26445 }, { "epoch": 3.0841884328358207, "grad_norm": 0.5042237481556545, "learning_rate": 2.077127308949476e-05, "loss": 0.3299, "step": 26450 }, { "epoch": 3.0847714552238807, "grad_norm": 0.4351639604910172, "learning_rate": 2.0762994553000304e-05, "loss": 0.3214, "step": 26455 }, { "epoch": 3.0853544776119404, "grad_norm": 0.4558374981572576, "learning_rate": 2.0754717018234003e-05, "loss": 0.3151, "step": 26460 }, { "epoch": 3.0859375, "grad_norm": 0.44193020864322696, "learning_rate": 2.074644048642666e-05, "loss": 0.3301, "step": 26465 }, { "epoch": 3.0865205223880596, "grad_norm": 0.4409441882722866, "learning_rate": 2.0738164958808905e-05, "loss": 0.3045, "step": 26470 }, { "epoch": 3.0871035447761193, "grad_norm": 0.46245906743626414, "learning_rate": 2.072989043661124e-05, "loss": 0.3305, "step": 26475 }, { "epoch": 3.0876865671641793, "grad_norm": 0.5313669338688435, "learning_rate": 2.072161692106399e-05, "loss": 0.3753, "step": 26480 }, { "epoch": 3.088269589552239, "grad_norm": 0.43890620433513766, "learning_rate": 2.0713344413397368e-05, "loss": 0.3048, "step": 26485 }, { "epoch": 3.0888526119402986, "grad_norm": 0.45550645734538264, "learning_rate": 2.0705072914841407e-05, "loss": 0.3254, "step": 26490 }, { "epoch": 3.089435634328358, "grad_norm": 0.45127483412657704, "learning_rate": 2.0696802426625993e-05, "loss": 0.314, "step": 26495 }, { "epoch": 3.090018656716418, "grad_norm": 0.47559466833847036, "learning_rate": 2.0688532949980882e-05, "loss": 0.3491, "step": 26500 }, { "epoch": 3.0906016791044775, "grad_norm": 0.4560452850152337, "learning_rate": 2.0680264486135665e-05, "loss": 0.3025, "step": 26505 }, { "epoch": 3.091184701492537, "grad_norm": 0.44289312672337844, "learning_rate": 2.0671997036319763e-05, "loss": 0.3096, "step": 26510 }, { "epoch": 3.091767723880597, "grad_norm": 0.44977244207477446, "learning_rate": 2.0663730601762494e-05, "loss": 0.3249, "step": 26515 }, { "epoch": 3.092350746268657, "grad_norm": 0.4944824796926875, "learning_rate": 2.0655465183692972e-05, "loss": 0.3373, "step": 26520 }, { "epoch": 3.0929337686567164, "grad_norm": 0.4602852658140172, "learning_rate": 2.0647200783340214e-05, "loss": 0.3265, "step": 26525 }, { "epoch": 3.093516791044776, "grad_norm": 0.43619491761256923, "learning_rate": 2.063893740193304e-05, "loss": 0.3409, "step": 26530 }, { "epoch": 3.0940998134328357, "grad_norm": 0.49781783714355804, "learning_rate": 2.063067504070012e-05, "loss": 0.35, "step": 26535 }, { "epoch": 3.0946828358208953, "grad_norm": 0.49332653496439094, "learning_rate": 2.0622413700870026e-05, "loss": 0.3266, "step": 26540 }, { "epoch": 3.0952658582089554, "grad_norm": 0.45841570511793567, "learning_rate": 2.0614153383671103e-05, "loss": 0.3234, "step": 26545 }, { "epoch": 3.095848880597015, "grad_norm": 0.4661202506373183, "learning_rate": 2.0605894090331607e-05, "loss": 0.3386, "step": 26550 }, { "epoch": 3.0964319029850746, "grad_norm": 0.43819664647364176, "learning_rate": 2.0597635822079607e-05, "loss": 0.3205, "step": 26555 }, { "epoch": 3.0970149253731343, "grad_norm": 0.4437153488494203, "learning_rate": 2.0589378580143016e-05, "loss": 0.3369, "step": 26560 }, { "epoch": 3.097597947761194, "grad_norm": 0.48069583190874654, "learning_rate": 2.058112236574963e-05, "loss": 0.3218, "step": 26565 }, { "epoch": 3.0981809701492535, "grad_norm": 0.47697809357117477, "learning_rate": 2.057286718012705e-05, "loss": 0.3231, "step": 26570 }, { "epoch": 3.0987639925373136, "grad_norm": 0.4840688119797522, "learning_rate": 2.0564613024502754e-05, "loss": 0.3395, "step": 26575 }, { "epoch": 3.0993470149253732, "grad_norm": 0.603029117989374, "learning_rate": 2.0556359900104054e-05, "loss": 0.3311, "step": 26580 }, { "epoch": 3.099930037313433, "grad_norm": 0.47697535251778106, "learning_rate": 2.0548107808158102e-05, "loss": 0.3093, "step": 26585 }, { "epoch": 3.1005130597014925, "grad_norm": 0.4882263624088247, "learning_rate": 2.0539856749891918e-05, "loss": 0.3437, "step": 26590 }, { "epoch": 3.101096082089552, "grad_norm": 0.451119064870134, "learning_rate": 2.0531606726532344e-05, "loss": 0.3493, "step": 26595 }, { "epoch": 3.1016791044776117, "grad_norm": 0.4338425342728425, "learning_rate": 2.0523357739306087e-05, "loss": 0.3246, "step": 26600 }, { "epoch": 3.102262126865672, "grad_norm": 0.4448999392854656, "learning_rate": 2.0515109789439695e-05, "loss": 0.3395, "step": 26605 }, { "epoch": 3.1028451492537314, "grad_norm": 0.4976464626815555, "learning_rate": 2.050686287815954e-05, "loss": 0.3382, "step": 26610 }, { "epoch": 3.103428171641791, "grad_norm": 0.4663663206133034, "learning_rate": 2.049861700669189e-05, "loss": 0.3335, "step": 26615 }, { "epoch": 3.1040111940298507, "grad_norm": 0.4542148386148399, "learning_rate": 2.049037217626279e-05, "loss": 0.3412, "step": 26620 }, { "epoch": 3.1045942164179103, "grad_norm": 0.46942413203721617, "learning_rate": 2.0482128388098202e-05, "loss": 0.3191, "step": 26625 }, { "epoch": 3.10517723880597, "grad_norm": 0.4688295158411415, "learning_rate": 2.0473885643423885e-05, "loss": 0.3352, "step": 26630 }, { "epoch": 3.10576026119403, "grad_norm": 0.47340585860574863, "learning_rate": 2.046564394346544e-05, "loss": 0.3296, "step": 26635 }, { "epoch": 3.1063432835820897, "grad_norm": 0.43339045008618204, "learning_rate": 2.0457403289448353e-05, "loss": 0.3305, "step": 26640 }, { "epoch": 3.1069263059701493, "grad_norm": 0.4527274948867366, "learning_rate": 2.0449163682597915e-05, "loss": 0.3239, "step": 26645 }, { "epoch": 3.107509328358209, "grad_norm": 0.46326112077044784, "learning_rate": 2.0440925124139286e-05, "loss": 0.3337, "step": 26650 }, { "epoch": 3.1080923507462686, "grad_norm": 0.4754515700991158, "learning_rate": 2.0432687615297458e-05, "loss": 0.3164, "step": 26655 }, { "epoch": 3.108675373134328, "grad_norm": 0.45125925500992764, "learning_rate": 2.0424451157297264e-05, "loss": 0.3166, "step": 26660 }, { "epoch": 3.1092583955223883, "grad_norm": 0.4711447156286339, "learning_rate": 2.0416215751363392e-05, "loss": 0.3157, "step": 26665 }, { "epoch": 3.109841417910448, "grad_norm": 0.4641385040928347, "learning_rate": 2.040798139872037e-05, "loss": 0.3308, "step": 26670 }, { "epoch": 3.1104244402985075, "grad_norm": 0.5248646209056408, "learning_rate": 2.0399748100592564e-05, "loss": 0.3325, "step": 26675 }, { "epoch": 3.111007462686567, "grad_norm": 0.4591315961710023, "learning_rate": 2.0391515858204184e-05, "loss": 0.3104, "step": 26680 }, { "epoch": 3.1115904850746268, "grad_norm": 0.4807515110675291, "learning_rate": 2.038328467277929e-05, "loss": 0.3196, "step": 26685 }, { "epoch": 3.1121735074626864, "grad_norm": 0.457473043091165, "learning_rate": 2.0375054545541776e-05, "loss": 0.3348, "step": 26690 }, { "epoch": 3.1127565298507465, "grad_norm": 0.488233815914533, "learning_rate": 2.0366825477715386e-05, "loss": 0.338, "step": 26695 }, { "epoch": 3.113339552238806, "grad_norm": 1.2441433195164318, "learning_rate": 2.0358597470523706e-05, "loss": 0.3272, "step": 26700 }, { "epoch": 3.1139225746268657, "grad_norm": 0.5547517656519116, "learning_rate": 2.035037052519016e-05, "loss": 0.3199, "step": 26705 }, { "epoch": 3.1145055970149254, "grad_norm": 0.5006793922568553, "learning_rate": 2.034214464293801e-05, "loss": 0.326, "step": 26710 }, { "epoch": 3.115088619402985, "grad_norm": 0.46256078339483764, "learning_rate": 2.0333919824990372e-05, "loss": 0.3262, "step": 26715 }, { "epoch": 3.1156716417910446, "grad_norm": 0.4248843856616436, "learning_rate": 2.0325696072570195e-05, "loss": 0.3116, "step": 26720 }, { "epoch": 3.1162546641791047, "grad_norm": 0.46781738586616756, "learning_rate": 2.0317473386900275e-05, "loss": 0.321, "step": 26725 }, { "epoch": 3.1168376865671643, "grad_norm": 0.48694496352209315, "learning_rate": 2.0309251769203252e-05, "loss": 0.3293, "step": 26730 }, { "epoch": 3.117420708955224, "grad_norm": 0.4822676988381848, "learning_rate": 2.0301031220701582e-05, "loss": 0.3224, "step": 26735 }, { "epoch": 3.1180037313432836, "grad_norm": 0.4377358685137589, "learning_rate": 2.0292811742617607e-05, "loss": 0.3327, "step": 26740 }, { "epoch": 3.118586753731343, "grad_norm": 0.4404678461258958, "learning_rate": 2.028459333617346e-05, "loss": 0.3119, "step": 26745 }, { "epoch": 3.119169776119403, "grad_norm": 0.494397776515022, "learning_rate": 2.0276376002591164e-05, "loss": 0.351, "step": 26750 }, { "epoch": 3.119752798507463, "grad_norm": 0.44359941570618844, "learning_rate": 2.0268159743092546e-05, "loss": 0.3227, "step": 26755 }, { "epoch": 3.1203358208955225, "grad_norm": 0.4705878230779239, "learning_rate": 2.0259944558899274e-05, "loss": 0.3375, "step": 26760 }, { "epoch": 3.120918843283582, "grad_norm": 0.4397419378532711, "learning_rate": 2.0251730451232886e-05, "loss": 0.3172, "step": 26765 }, { "epoch": 3.121501865671642, "grad_norm": 0.4672998399885591, "learning_rate": 2.0243517421314727e-05, "loss": 0.3377, "step": 26770 }, { "epoch": 3.1220848880597014, "grad_norm": 0.4488275991075581, "learning_rate": 2.0235305470366008e-05, "loss": 0.3341, "step": 26775 }, { "epoch": 3.122667910447761, "grad_norm": 0.45724136835508916, "learning_rate": 2.022709459960776e-05, "loss": 0.3263, "step": 26780 }, { "epoch": 3.1232509328358207, "grad_norm": 0.4303589813302327, "learning_rate": 2.021888481026086e-05, "loss": 0.3258, "step": 26785 }, { "epoch": 3.1238339552238807, "grad_norm": 0.44367388925675355, "learning_rate": 2.0210676103546028e-05, "loss": 0.3302, "step": 26790 }, { "epoch": 3.1244169776119404, "grad_norm": 0.5671474336419254, "learning_rate": 2.0202468480683812e-05, "loss": 0.3226, "step": 26795 }, { "epoch": 3.125, "grad_norm": 0.4693549663720372, "learning_rate": 2.0194261942894628e-05, "loss": 0.3249, "step": 26800 }, { "epoch": 3.1255830223880596, "grad_norm": 0.47169687262536697, "learning_rate": 2.0186056491398686e-05, "loss": 0.3251, "step": 26805 }, { "epoch": 3.1261660447761193, "grad_norm": 0.5245771146045501, "learning_rate": 2.0177852127416063e-05, "loss": 0.3275, "step": 26810 }, { "epoch": 3.1267490671641793, "grad_norm": 0.44686541430709736, "learning_rate": 2.0169648852166684e-05, "loss": 0.3225, "step": 26815 }, { "epoch": 3.127332089552239, "grad_norm": 0.4740393682290807, "learning_rate": 2.016144666687029e-05, "loss": 0.3379, "step": 26820 }, { "epoch": 3.1279151119402986, "grad_norm": 0.41623195515592065, "learning_rate": 2.015324557274645e-05, "loss": 0.316, "step": 26825 }, { "epoch": 3.128498134328358, "grad_norm": 0.4294606089025559, "learning_rate": 2.0145045571014614e-05, "loss": 0.2945, "step": 26830 }, { "epoch": 3.129081156716418, "grad_norm": 0.454012613019494, "learning_rate": 2.0136846662894028e-05, "loss": 0.3257, "step": 26835 }, { "epoch": 3.1296641791044775, "grad_norm": 0.5001273339562864, "learning_rate": 2.0128648849603798e-05, "loss": 0.3357, "step": 26840 }, { "epoch": 3.130247201492537, "grad_norm": 0.4811349204320928, "learning_rate": 2.0120452132362854e-05, "loss": 0.3325, "step": 26845 }, { "epoch": 3.130830223880597, "grad_norm": 0.4938092125575546, "learning_rate": 2.0112256512389976e-05, "loss": 0.3091, "step": 26850 }, { "epoch": 3.131413246268657, "grad_norm": 0.42774595838386315, "learning_rate": 2.0104061990903773e-05, "loss": 0.3103, "step": 26855 }, { "epoch": 3.1319962686567164, "grad_norm": 0.4660227477236933, "learning_rate": 2.009586856912269e-05, "loss": 0.3312, "step": 26860 }, { "epoch": 3.132579291044776, "grad_norm": 0.4510791455861746, "learning_rate": 2.008767624826501e-05, "loss": 0.3213, "step": 26865 }, { "epoch": 3.1331623134328357, "grad_norm": 0.48343884159203576, "learning_rate": 2.0079485029548838e-05, "loss": 0.3293, "step": 26870 }, { "epoch": 3.1337453358208953, "grad_norm": 0.4567351985253224, "learning_rate": 2.007129491419217e-05, "loss": 0.3147, "step": 26875 }, { "epoch": 3.1343283582089554, "grad_norm": 0.46898131402176935, "learning_rate": 2.006310590341276e-05, "loss": 0.3176, "step": 26880 }, { "epoch": 3.134911380597015, "grad_norm": 0.4289934744637185, "learning_rate": 2.005491799842824e-05, "loss": 0.3313, "step": 26885 }, { "epoch": 3.1354944029850746, "grad_norm": 0.6515331892054995, "learning_rate": 2.0046731200456097e-05, "loss": 0.3547, "step": 26890 }, { "epoch": 3.1360774253731343, "grad_norm": 0.4230308599433454, "learning_rate": 2.0038545510713597e-05, "loss": 0.3303, "step": 26895 }, { "epoch": 3.136660447761194, "grad_norm": 0.47907343313743017, "learning_rate": 2.00303609304179e-05, "loss": 0.3365, "step": 26900 }, { "epoch": 3.1372434701492535, "grad_norm": 0.46736099186068, "learning_rate": 2.0022177460785974e-05, "loss": 0.3423, "step": 26905 }, { "epoch": 3.1378264925373136, "grad_norm": 0.4512622531085032, "learning_rate": 2.0013995103034594e-05, "loss": 0.332, "step": 26910 }, { "epoch": 3.1384095149253732, "grad_norm": 0.4825903985622654, "learning_rate": 2.000581385838043e-05, "loss": 0.3213, "step": 26915 }, { "epoch": 3.138992537313433, "grad_norm": 0.5276528592938703, "learning_rate": 1.9997633728039933e-05, "loss": 0.319, "step": 26920 }, { "epoch": 3.1395755597014925, "grad_norm": 0.4775936717912453, "learning_rate": 1.9989454713229426e-05, "loss": 0.3271, "step": 26925 }, { "epoch": 3.140158582089552, "grad_norm": 0.4603167929873936, "learning_rate": 1.9981276815165046e-05, "loss": 0.3216, "step": 26930 }, { "epoch": 3.1407416044776117, "grad_norm": 0.452610017241828, "learning_rate": 1.9973100035062753e-05, "loss": 0.3216, "step": 26935 }, { "epoch": 3.141324626865672, "grad_norm": 0.43628172518109953, "learning_rate": 1.996492437413838e-05, "loss": 0.3111, "step": 26940 }, { "epoch": 3.1419076492537314, "grad_norm": 0.44650003921095405, "learning_rate": 1.9956749833607545e-05, "loss": 0.3218, "step": 26945 }, { "epoch": 3.142490671641791, "grad_norm": 0.44428101590584085, "learning_rate": 1.994857641468575e-05, "loss": 0.3357, "step": 26950 }, { "epoch": 3.1430736940298507, "grad_norm": 0.4624194022412386, "learning_rate": 1.994040411858829e-05, "loss": 0.3158, "step": 26955 }, { "epoch": 3.1436567164179103, "grad_norm": 0.44922257500149254, "learning_rate": 1.99322329465303e-05, "loss": 0.311, "step": 26960 }, { "epoch": 3.14423973880597, "grad_norm": 0.4556748287330155, "learning_rate": 1.992406289972677e-05, "loss": 0.3252, "step": 26965 }, { "epoch": 3.14482276119403, "grad_norm": 0.47534573579196415, "learning_rate": 1.9915893979392492e-05, "loss": 0.3524, "step": 26970 }, { "epoch": 3.1454057835820897, "grad_norm": 0.4296917439311925, "learning_rate": 1.9907726186742122e-05, "loss": 0.327, "step": 26975 }, { "epoch": 3.1459888059701493, "grad_norm": 0.4707200928426098, "learning_rate": 1.989955952299012e-05, "loss": 0.3179, "step": 26980 }, { "epoch": 3.146571828358209, "grad_norm": 0.4610040585110528, "learning_rate": 1.9891393989350794e-05, "loss": 0.3198, "step": 26985 }, { "epoch": 3.1471548507462686, "grad_norm": 0.5120188222955415, "learning_rate": 1.9883229587038287e-05, "loss": 0.3403, "step": 26990 }, { "epoch": 3.147737873134328, "grad_norm": 0.44834359511844246, "learning_rate": 1.987506631726656e-05, "loss": 0.3194, "step": 26995 }, { "epoch": 3.1483208955223883, "grad_norm": 0.4544558546793936, "learning_rate": 1.986690418124942e-05, "loss": 0.3417, "step": 27000 }, { "epoch": 3.148903917910448, "grad_norm": 0.4207714545786385, "learning_rate": 1.985874318020049e-05, "loss": 0.3174, "step": 27005 }, { "epoch": 3.1494869402985075, "grad_norm": 0.4823490868960121, "learning_rate": 1.9850583315333242e-05, "loss": 0.3287, "step": 27010 }, { "epoch": 3.150069962686567, "grad_norm": 0.47785976388211493, "learning_rate": 1.9842424587860958e-05, "loss": 0.3246, "step": 27015 }, { "epoch": 3.1506529850746268, "grad_norm": 0.4452797578776353, "learning_rate": 1.983426699899677e-05, "loss": 0.3156, "step": 27020 }, { "epoch": 3.1512360074626864, "grad_norm": 0.457534383185208, "learning_rate": 1.982611054995364e-05, "loss": 0.3279, "step": 27025 }, { "epoch": 3.1518190298507465, "grad_norm": 0.47946140418120975, "learning_rate": 1.9817955241944335e-05, "loss": 0.3366, "step": 27030 }, { "epoch": 3.152402052238806, "grad_norm": 0.47569245373062846, "learning_rate": 1.980980107618149e-05, "loss": 0.328, "step": 27035 }, { "epoch": 3.1529850746268657, "grad_norm": 0.44578237018932065, "learning_rate": 1.9801648053877548e-05, "loss": 0.3306, "step": 27040 }, { "epoch": 3.1535680970149254, "grad_norm": 0.479070478783956, "learning_rate": 1.979349617624477e-05, "loss": 0.3062, "step": 27045 }, { "epoch": 3.154151119402985, "grad_norm": 0.4470158079249052, "learning_rate": 1.978534544449528e-05, "loss": 0.306, "step": 27050 }, { "epoch": 3.1547341417910446, "grad_norm": 0.44846548897816296, "learning_rate": 1.9777195859840997e-05, "loss": 0.3253, "step": 27055 }, { "epoch": 3.1553171641791047, "grad_norm": 0.4385400951501217, "learning_rate": 1.9769047423493707e-05, "loss": 0.3361, "step": 27060 }, { "epoch": 3.1559001865671643, "grad_norm": 0.46022618469429677, "learning_rate": 1.9760900136664994e-05, "loss": 0.3278, "step": 27065 }, { "epoch": 3.156483208955224, "grad_norm": 0.5262601960800007, "learning_rate": 1.975275400056627e-05, "loss": 0.3476, "step": 27070 }, { "epoch": 3.1570662313432836, "grad_norm": 0.4211487289158979, "learning_rate": 1.974460901640881e-05, "loss": 0.3067, "step": 27075 }, { "epoch": 3.157649253731343, "grad_norm": 0.45755982586604127, "learning_rate": 1.9736465185403675e-05, "loss": 0.3272, "step": 27080 }, { "epoch": 3.158232276119403, "grad_norm": 0.46597636071099746, "learning_rate": 1.9728322508761794e-05, "loss": 0.3336, "step": 27085 }, { "epoch": 3.158815298507463, "grad_norm": 0.44247473549671384, "learning_rate": 1.9720180987693888e-05, "loss": 0.3336, "step": 27090 }, { "epoch": 3.1593983208955225, "grad_norm": 0.47766538399245295, "learning_rate": 1.9712040623410523e-05, "loss": 0.3046, "step": 27095 }, { "epoch": 3.159981343283582, "grad_norm": 0.4968219837604121, "learning_rate": 1.9703901417122106e-05, "loss": 0.3449, "step": 27100 }, { "epoch": 3.160564365671642, "grad_norm": 0.43325286695602533, "learning_rate": 1.9695763370038846e-05, "loss": 0.318, "step": 27105 }, { "epoch": 3.1611473880597014, "grad_norm": 0.427183151884983, "learning_rate": 1.968762648337081e-05, "loss": 0.3099, "step": 27110 }, { "epoch": 3.161730410447761, "grad_norm": 0.43966371576803287, "learning_rate": 1.9679490758327862e-05, "loss": 0.3254, "step": 27115 }, { "epoch": 3.1623134328358207, "grad_norm": 0.5057308075063571, "learning_rate": 1.96713561961197e-05, "loss": 0.3507, "step": 27120 }, { "epoch": 3.1628964552238807, "grad_norm": 0.47777550297006854, "learning_rate": 1.966322279795587e-05, "loss": 0.3463, "step": 27125 }, { "epoch": 3.1634794776119404, "grad_norm": 0.4559489386403158, "learning_rate": 1.9655090565045718e-05, "loss": 0.3263, "step": 27130 }, { "epoch": 3.1640625, "grad_norm": 0.4474189629131407, "learning_rate": 1.9646959498598444e-05, "loss": 0.3275, "step": 27135 }, { "epoch": 3.1646455223880596, "grad_norm": 0.4247199729692992, "learning_rate": 1.9638829599823056e-05, "loss": 0.3183, "step": 27140 }, { "epoch": 3.1652285447761193, "grad_norm": 0.43245919663374977, "learning_rate": 1.963070086992837e-05, "loss": 0.3276, "step": 27145 }, { "epoch": 3.1658115671641793, "grad_norm": 0.4972264534956099, "learning_rate": 1.9622573310123082e-05, "loss": 0.3491, "step": 27150 }, { "epoch": 3.166394589552239, "grad_norm": 0.4730961918270202, "learning_rate": 1.9614446921615654e-05, "loss": 0.3375, "step": 27155 }, { "epoch": 3.1669776119402986, "grad_norm": 0.4486450978632667, "learning_rate": 1.9606321705614427e-05, "loss": 0.3268, "step": 27160 }, { "epoch": 3.167560634328358, "grad_norm": 0.4488841836518065, "learning_rate": 1.9598197663327534e-05, "loss": 0.3204, "step": 27165 }, { "epoch": 3.168143656716418, "grad_norm": 0.4696329381013253, "learning_rate": 1.9590074795962925e-05, "loss": 0.312, "step": 27170 }, { "epoch": 3.1687266791044775, "grad_norm": 0.48907045259273, "learning_rate": 1.9581953104728422e-05, "loss": 0.3477, "step": 27175 }, { "epoch": 3.169309701492537, "grad_norm": 0.4805331764337033, "learning_rate": 1.957383259083162e-05, "loss": 0.3229, "step": 27180 }, { "epoch": 3.169892723880597, "grad_norm": 0.4450143918692602, "learning_rate": 1.9565713255479974e-05, "loss": 0.3353, "step": 27185 }, { "epoch": 3.170475746268657, "grad_norm": 0.4812438140518185, "learning_rate": 1.955759509988075e-05, "loss": 0.3281, "step": 27190 }, { "epoch": 3.1710587686567164, "grad_norm": 0.44373473747948927, "learning_rate": 1.9549478125241034e-05, "loss": 0.3445, "step": 27195 }, { "epoch": 3.171641791044776, "grad_norm": 0.4824582958979891, "learning_rate": 1.9541362332767737e-05, "loss": 0.3304, "step": 27200 }, { "epoch": 3.1722248134328357, "grad_norm": 0.4719028439016698, "learning_rate": 1.9533247723667613e-05, "loss": 0.3346, "step": 27205 }, { "epoch": 3.1728078358208953, "grad_norm": 0.4394771670223666, "learning_rate": 1.952513429914723e-05, "loss": 0.3389, "step": 27210 }, { "epoch": 3.1733908582089554, "grad_norm": 0.46966246137225504, "learning_rate": 1.9517022060412958e-05, "loss": 0.3287, "step": 27215 }, { "epoch": 3.173973880597015, "grad_norm": 0.46628037746950846, "learning_rate": 1.950891100867102e-05, "loss": 0.3299, "step": 27220 }, { "epoch": 3.1745569029850746, "grad_norm": 0.4785703641201323, "learning_rate": 1.950080114512746e-05, "loss": 0.3184, "step": 27225 }, { "epoch": 3.1751399253731343, "grad_norm": 0.43906165693485444, "learning_rate": 1.9492692470988115e-05, "loss": 0.3165, "step": 27230 }, { "epoch": 3.175722947761194, "grad_norm": 0.4621376834633981, "learning_rate": 1.9484584987458693e-05, "loss": 0.3241, "step": 27235 }, { "epoch": 3.1763059701492535, "grad_norm": 0.44326331540932834, "learning_rate": 1.9476478695744683e-05, "loss": 0.3209, "step": 27240 }, { "epoch": 3.1768889925373136, "grad_norm": 0.46263449751192637, "learning_rate": 1.9468373597051404e-05, "loss": 0.3106, "step": 27245 }, { "epoch": 3.1774720149253732, "grad_norm": 0.4287359278303057, "learning_rate": 1.9460269692584034e-05, "loss": 0.3025, "step": 27250 }, { "epoch": 3.178055037313433, "grad_norm": 0.4681246345170257, "learning_rate": 1.9452166983547516e-05, "loss": 0.3215, "step": 27255 }, { "epoch": 3.1786380597014925, "grad_norm": 0.4006218678599732, "learning_rate": 1.944406547114667e-05, "loss": 0.3101, "step": 27260 }, { "epoch": 3.179221082089552, "grad_norm": 0.4778409445664328, "learning_rate": 1.9435965156586105e-05, "loss": 0.3495, "step": 27265 }, { "epoch": 3.1798041044776117, "grad_norm": 0.4521839032924177, "learning_rate": 1.9427866041070254e-05, "loss": 0.3302, "step": 27270 }, { "epoch": 3.180387126865672, "grad_norm": 0.451989568859708, "learning_rate": 1.9419768125803382e-05, "loss": 0.307, "step": 27275 }, { "epoch": 3.1809701492537314, "grad_norm": 0.5400648266188888, "learning_rate": 1.9411671411989568e-05, "loss": 0.3593, "step": 27280 }, { "epoch": 3.181553171641791, "grad_norm": 0.4667878863039788, "learning_rate": 1.9403575900832726e-05, "loss": 0.3377, "step": 27285 }, { "epoch": 3.1821361940298507, "grad_norm": 0.4724285012856178, "learning_rate": 1.9395481593536575e-05, "loss": 0.3347, "step": 27290 }, { "epoch": 3.1827192164179103, "grad_norm": 0.45862786465029165, "learning_rate": 1.9387388491304646e-05, "loss": 0.3074, "step": 27295 }, { "epoch": 3.18330223880597, "grad_norm": 0.47227524445544855, "learning_rate": 1.937929659534034e-05, "loss": 0.3252, "step": 27300 }, { "epoch": 3.18388526119403, "grad_norm": 0.43162785406712434, "learning_rate": 1.9371205906846808e-05, "loss": 0.3083, "step": 27305 }, { "epoch": 3.1844682835820897, "grad_norm": 0.4876120836497574, "learning_rate": 1.9363116427027084e-05, "loss": 0.3162, "step": 27310 }, { "epoch": 3.1850513059701493, "grad_norm": 0.46307715110791076, "learning_rate": 1.9355028157083988e-05, "loss": 0.3501, "step": 27315 }, { "epoch": 3.185634328358209, "grad_norm": 0.46941187892021596, "learning_rate": 1.9346941098220157e-05, "loss": 0.3266, "step": 27320 }, { "epoch": 3.1862173507462686, "grad_norm": 0.47493550264768114, "learning_rate": 1.933885525163807e-05, "loss": 0.3392, "step": 27325 }, { "epoch": 3.186800373134328, "grad_norm": 0.47281274512403776, "learning_rate": 1.933077061854002e-05, "loss": 0.3309, "step": 27330 }, { "epoch": 3.1873833955223883, "grad_norm": 0.4349454178005739, "learning_rate": 1.9322687200128103e-05, "loss": 0.3313, "step": 27335 }, { "epoch": 3.187966417910448, "grad_norm": 0.44484068267761423, "learning_rate": 1.931460499760426e-05, "loss": 0.3116, "step": 27340 }, { "epoch": 3.1885494402985075, "grad_norm": 0.4631234982395724, "learning_rate": 1.930652401217021e-05, "loss": 0.3332, "step": 27345 }, { "epoch": 3.189132462686567, "grad_norm": 0.44363469283627516, "learning_rate": 1.929844424502755e-05, "loss": 0.32, "step": 27350 }, { "epoch": 3.1897154850746268, "grad_norm": 0.5142219286656142, "learning_rate": 1.929036569737765e-05, "loss": 0.333, "step": 27355 }, { "epoch": 3.1902985074626864, "grad_norm": 0.41970889457686733, "learning_rate": 1.9282288370421708e-05, "loss": 0.3154, "step": 27360 }, { "epoch": 3.1908815298507465, "grad_norm": 0.4469941346947236, "learning_rate": 1.9274212265360757e-05, "loss": 0.3214, "step": 27365 }, { "epoch": 3.191464552238806, "grad_norm": 0.4692755715896205, "learning_rate": 1.9266137383395626e-05, "loss": 0.311, "step": 27370 }, { "epoch": 3.1920475746268657, "grad_norm": 0.46469323415769187, "learning_rate": 1.925806372572697e-05, "loss": 0.3258, "step": 27375 }, { "epoch": 3.1926305970149254, "grad_norm": 0.4573763955838952, "learning_rate": 1.9249991293555276e-05, "loss": 0.3233, "step": 27380 }, { "epoch": 3.193213619402985, "grad_norm": 0.43970584272676727, "learning_rate": 1.9241920088080833e-05, "loss": 0.3098, "step": 27385 }, { "epoch": 3.1937966417910446, "grad_norm": 0.43208933972253044, "learning_rate": 1.9233850110503748e-05, "loss": 0.3066, "step": 27390 }, { "epoch": 3.1943796641791047, "grad_norm": 0.47906991700293433, "learning_rate": 1.9225781362023955e-05, "loss": 0.3139, "step": 27395 }, { "epoch": 3.1949626865671643, "grad_norm": 0.4930373154110795, "learning_rate": 1.9217713843841195e-05, "loss": 0.3228, "step": 27400 }, { "epoch": 3.195545708955224, "grad_norm": 0.44358866677645276, "learning_rate": 1.9209647557155025e-05, "loss": 0.33, "step": 27405 }, { "epoch": 3.1961287313432836, "grad_norm": 0.49269024353102203, "learning_rate": 1.9201582503164845e-05, "loss": 0.3356, "step": 27410 }, { "epoch": 3.196711753731343, "grad_norm": 0.4696815736959036, "learning_rate": 1.9193518683069833e-05, "loss": 0.301, "step": 27415 }, { "epoch": 3.197294776119403, "grad_norm": 0.4871495656179168, "learning_rate": 1.9185456098068998e-05, "loss": 0.3408, "step": 27420 }, { "epoch": 3.197877798507463, "grad_norm": 0.4213310904332378, "learning_rate": 1.9177394749361193e-05, "loss": 0.3099, "step": 27425 }, { "epoch": 3.1984608208955225, "grad_norm": 0.47862103029689884, "learning_rate": 1.9169334638145037e-05, "loss": 0.2995, "step": 27430 }, { "epoch": 3.199043843283582, "grad_norm": 0.44777242909659876, "learning_rate": 1.9161275765619007e-05, "loss": 0.3192, "step": 27435 }, { "epoch": 3.199626865671642, "grad_norm": 0.47406176834663966, "learning_rate": 1.9153218132981375e-05, "loss": 0.3321, "step": 27440 }, { "epoch": 3.2002098880597014, "grad_norm": 0.47048401955783853, "learning_rate": 1.9145161741430234e-05, "loss": 0.3217, "step": 27445 }, { "epoch": 3.200792910447761, "grad_norm": 0.4531181811978807, "learning_rate": 1.9137106592163495e-05, "loss": 0.3308, "step": 27450 }, { "epoch": 3.2013759328358207, "grad_norm": 0.45942249968126514, "learning_rate": 1.9129052686378873e-05, "loss": 0.3182, "step": 27455 }, { "epoch": 3.2019589552238807, "grad_norm": 0.42095683723958044, "learning_rate": 1.912100002527392e-05, "loss": 0.3243, "step": 27460 }, { "epoch": 3.2025419776119404, "grad_norm": 0.45713255314341616, "learning_rate": 1.9112948610045982e-05, "loss": 0.3339, "step": 27465 }, { "epoch": 3.203125, "grad_norm": 0.47613917907366915, "learning_rate": 1.9104898441892222e-05, "loss": 0.3394, "step": 27470 }, { "epoch": 3.2037080223880596, "grad_norm": 0.4594895192147124, "learning_rate": 1.909684952200964e-05, "loss": 0.3252, "step": 27475 }, { "epoch": 3.2042910447761193, "grad_norm": 0.962513676859494, "learning_rate": 1.9088801851595008e-05, "loss": 0.3173, "step": 27480 }, { "epoch": 3.2048740671641793, "grad_norm": 0.43709419203162797, "learning_rate": 1.908075543184496e-05, "loss": 0.3018, "step": 27485 }, { "epoch": 3.205457089552239, "grad_norm": 0.440565691190322, "learning_rate": 1.907271026395592e-05, "loss": 0.3129, "step": 27490 }, { "epoch": 3.2060401119402986, "grad_norm": 0.5112532590069527, "learning_rate": 1.90646663491241e-05, "loss": 0.3485, "step": 27495 }, { "epoch": 3.206623134328358, "grad_norm": 0.47127425580398696, "learning_rate": 1.9056623688545588e-05, "loss": 0.3242, "step": 27500 }, { "epoch": 3.207206156716418, "grad_norm": 0.4464146934009306, "learning_rate": 1.904858228341623e-05, "loss": 0.339, "step": 27505 }, { "epoch": 3.2077891791044775, "grad_norm": 0.46980098724748476, "learning_rate": 1.9040542134931715e-05, "loss": 0.3097, "step": 27510 }, { "epoch": 3.208372201492537, "grad_norm": 0.440758758693857, "learning_rate": 1.9032503244287537e-05, "loss": 0.3312, "step": 27515 }, { "epoch": 3.208955223880597, "grad_norm": 0.47824340765477763, "learning_rate": 1.9024465612678993e-05, "loss": 0.3373, "step": 27520 }, { "epoch": 3.209538246268657, "grad_norm": 0.46386513781546374, "learning_rate": 1.901642924130121e-05, "loss": 0.3404, "step": 27525 }, { "epoch": 3.2101212686567164, "grad_norm": 0.47197157493893405, "learning_rate": 1.900839413134911e-05, "loss": 0.3304, "step": 27530 }, { "epoch": 3.210704291044776, "grad_norm": 0.43944612589420196, "learning_rate": 1.9000360284017448e-05, "loss": 0.3278, "step": 27535 }, { "epoch": 3.2112873134328357, "grad_norm": 0.46241840789993244, "learning_rate": 1.8992327700500772e-05, "loss": 0.3174, "step": 27540 }, { "epoch": 3.2118703358208953, "grad_norm": 0.4233215422194263, "learning_rate": 1.8984296381993454e-05, "loss": 0.3249, "step": 27545 }, { "epoch": 3.2124533582089554, "grad_norm": 0.4165212543451875, "learning_rate": 1.897626632968968e-05, "loss": 0.2915, "step": 27550 }, { "epoch": 3.213036380597015, "grad_norm": 0.4079197063432625, "learning_rate": 1.8968237544783423e-05, "loss": 0.3321, "step": 27555 }, { "epoch": 3.2136194029850746, "grad_norm": 0.5014130420173903, "learning_rate": 1.8960210028468512e-05, "loss": 0.3392, "step": 27560 }, { "epoch": 3.2142024253731343, "grad_norm": 0.47986743675377647, "learning_rate": 1.8952183781938538e-05, "loss": 0.3203, "step": 27565 }, { "epoch": 3.214785447761194, "grad_norm": 0.4808351354951185, "learning_rate": 1.8944158806386942e-05, "loss": 0.3286, "step": 27570 }, { "epoch": 3.2153684701492535, "grad_norm": 0.47026680395669446, "learning_rate": 1.8936135103006957e-05, "loss": 0.3394, "step": 27575 }, { "epoch": 3.2159514925373136, "grad_norm": 0.44150994897363893, "learning_rate": 1.8928112672991626e-05, "loss": 0.3101, "step": 27580 }, { "epoch": 3.2165345149253732, "grad_norm": 0.4828026626153867, "learning_rate": 1.8920091517533818e-05, "loss": 0.306, "step": 27585 }, { "epoch": 3.217117537313433, "grad_norm": 0.8046588090155792, "learning_rate": 1.8912071637826196e-05, "loss": 0.3207, "step": 27590 }, { "epoch": 3.2177005597014925, "grad_norm": 0.4941742594832407, "learning_rate": 1.890405303506123e-05, "loss": 0.3495, "step": 27595 }, { "epoch": 3.218283582089552, "grad_norm": 0.4703362026510565, "learning_rate": 1.8896035710431225e-05, "loss": 0.3565, "step": 27600 }, { "epoch": 3.2188666044776117, "grad_norm": 0.495189929149459, "learning_rate": 1.888801966512827e-05, "loss": 0.3453, "step": 27605 }, { "epoch": 3.219449626865672, "grad_norm": 0.4719978449184704, "learning_rate": 1.8880004900344283e-05, "loss": 0.3315, "step": 27610 }, { "epoch": 3.2200326492537314, "grad_norm": 0.49674022335482504, "learning_rate": 1.8871991417270978e-05, "loss": 0.3552, "step": 27615 }, { "epoch": 3.220615671641791, "grad_norm": 0.4393130089474178, "learning_rate": 1.8863979217099874e-05, "loss": 0.3151, "step": 27620 }, { "epoch": 3.2211986940298507, "grad_norm": 0.4523050799744314, "learning_rate": 1.8855968301022326e-05, "loss": 0.3213, "step": 27625 }, { "epoch": 3.2217817164179103, "grad_norm": 0.6886153682176874, "learning_rate": 1.8847958670229465e-05, "loss": 0.3294, "step": 27630 }, { "epoch": 3.22236473880597, "grad_norm": 0.44038399138162393, "learning_rate": 1.883995032591226e-05, "loss": 0.3286, "step": 27635 }, { "epoch": 3.22294776119403, "grad_norm": 0.43311241772978903, "learning_rate": 1.8831943269261467e-05, "loss": 0.3283, "step": 27640 }, { "epoch": 3.2235307835820897, "grad_norm": 0.4388420255717518, "learning_rate": 1.8823937501467648e-05, "loss": 0.3235, "step": 27645 }, { "epoch": 3.2241138059701493, "grad_norm": 0.44877364529184444, "learning_rate": 1.8815933023721206e-05, "loss": 0.3396, "step": 27650 }, { "epoch": 3.224696828358209, "grad_norm": 0.44087057548560593, "learning_rate": 1.880792983721231e-05, "loss": 0.3292, "step": 27655 }, { "epoch": 3.2252798507462686, "grad_norm": 0.4567254636672784, "learning_rate": 1.8799927943130986e-05, "loss": 0.3317, "step": 27660 }, { "epoch": 3.225862873134328, "grad_norm": 0.43552549652640776, "learning_rate": 1.879192734266701e-05, "loss": 0.3236, "step": 27665 }, { "epoch": 3.2264458955223883, "grad_norm": 0.4799781666815651, "learning_rate": 1.878392803701e-05, "loss": 0.3337, "step": 27670 }, { "epoch": 3.227028917910448, "grad_norm": 0.4468328174907116, "learning_rate": 1.8775930027349386e-05, "loss": 0.3259, "step": 27675 }, { "epoch": 3.2276119402985075, "grad_norm": 0.470926021553355, "learning_rate": 1.8767933314874382e-05, "loss": 0.3499, "step": 27680 }, { "epoch": 3.228194962686567, "grad_norm": 0.4926736932412245, "learning_rate": 1.8759937900774038e-05, "loss": 0.3498, "step": 27685 }, { "epoch": 3.2287779850746268, "grad_norm": 0.4725154869096927, "learning_rate": 1.875194378623718e-05, "loss": 0.3272, "step": 27690 }, { "epoch": 3.2293610074626864, "grad_norm": 0.47622368975208734, "learning_rate": 1.8743950972452477e-05, "loss": 0.3238, "step": 27695 }, { "epoch": 3.2299440298507465, "grad_norm": 0.5072568601536372, "learning_rate": 1.8735959460608364e-05, "loss": 0.3509, "step": 27700 }, { "epoch": 3.230527052238806, "grad_norm": 0.5023368884886755, "learning_rate": 1.8727969251893107e-05, "loss": 0.3198, "step": 27705 }, { "epoch": 3.2311100746268657, "grad_norm": 0.5256277684008088, "learning_rate": 1.871998034749478e-05, "loss": 0.3401, "step": 27710 }, { "epoch": 3.2316930970149254, "grad_norm": 0.5022381624468324, "learning_rate": 1.8711992748601252e-05, "loss": 0.3458, "step": 27715 }, { "epoch": 3.232276119402985, "grad_norm": 0.4608319003836731, "learning_rate": 1.8704006456400202e-05, "loss": 0.3321, "step": 27720 }, { "epoch": 3.2328591417910446, "grad_norm": 0.4706421932327294, "learning_rate": 1.8696021472079118e-05, "loss": 0.3253, "step": 27725 }, { "epoch": 3.2334421641791047, "grad_norm": 0.49681056697961345, "learning_rate": 1.8688037796825285e-05, "loss": 0.3299, "step": 27730 }, { "epoch": 3.2340251865671643, "grad_norm": 0.4590968039711858, "learning_rate": 1.8680055431825804e-05, "loss": 0.3209, "step": 27735 }, { "epoch": 3.234608208955224, "grad_norm": 0.44550716999248235, "learning_rate": 1.8672074378267573e-05, "loss": 0.3324, "step": 27740 }, { "epoch": 3.2351912313432836, "grad_norm": 0.4470425587062118, "learning_rate": 1.8664094637337303e-05, "loss": 0.3317, "step": 27745 }, { "epoch": 3.235774253731343, "grad_norm": 0.48446303867359625, "learning_rate": 1.8656116210221502e-05, "loss": 0.3259, "step": 27750 }, { "epoch": 3.236357276119403, "grad_norm": 0.4840083407763402, "learning_rate": 1.8648139098106482e-05, "loss": 0.3369, "step": 27755 }, { "epoch": 3.236940298507463, "grad_norm": 0.44067379565281556, "learning_rate": 1.8640163302178377e-05, "loss": 0.3355, "step": 27760 }, { "epoch": 3.2375233208955225, "grad_norm": 0.49928397720096795, "learning_rate": 1.8632188823623086e-05, "loss": 0.3429, "step": 27765 }, { "epoch": 3.238106343283582, "grad_norm": 0.49338482903304315, "learning_rate": 1.8624215663626365e-05, "loss": 0.3184, "step": 27770 }, { "epoch": 3.238689365671642, "grad_norm": 0.4649169797107502, "learning_rate": 1.861624382337373e-05, "loss": 0.3704, "step": 27775 }, { "epoch": 3.2392723880597014, "grad_norm": 0.45303669376757155, "learning_rate": 1.8608273304050515e-05, "loss": 0.3168, "step": 27780 }, { "epoch": 3.239855410447761, "grad_norm": 0.7655008250028477, "learning_rate": 1.8600304106841876e-05, "loss": 0.3337, "step": 27785 }, { "epoch": 3.2404384328358207, "grad_norm": 0.4386919846379524, "learning_rate": 1.859233623293274e-05, "loss": 0.3179, "step": 27790 }, { "epoch": 3.2410214552238807, "grad_norm": 0.4965513787903615, "learning_rate": 1.8584369683507863e-05, "loss": 0.3437, "step": 27795 }, { "epoch": 3.2416044776119404, "grad_norm": 0.46268733115301086, "learning_rate": 1.8576404459751796e-05, "loss": 0.3239, "step": 27800 }, { "epoch": 3.2421875, "grad_norm": 0.4545564327310699, "learning_rate": 1.8568440562848876e-05, "loss": 0.3211, "step": 27805 }, { "epoch": 3.2427705223880596, "grad_norm": 0.4934358035272012, "learning_rate": 1.8560477993983284e-05, "loss": 0.343, "step": 27810 }, { "epoch": 3.2433535447761193, "grad_norm": 0.422853285427749, "learning_rate": 1.855251675433895e-05, "loss": 0.3235, "step": 27815 }, { "epoch": 3.2439365671641793, "grad_norm": 0.4751754389204498, "learning_rate": 1.8544556845099657e-05, "loss": 0.3305, "step": 27820 }, { "epoch": 3.244519589552239, "grad_norm": 0.4647433580608973, "learning_rate": 1.8536598267448958e-05, "loss": 0.3384, "step": 27825 }, { "epoch": 3.2451026119402986, "grad_norm": 0.4540116861372445, "learning_rate": 1.8528641022570202e-05, "loss": 0.3324, "step": 27830 }, { "epoch": 3.245685634328358, "grad_norm": 0.521097018048201, "learning_rate": 1.8520685111646585e-05, "loss": 0.3368, "step": 27835 }, { "epoch": 3.246268656716418, "grad_norm": 0.46427046263451505, "learning_rate": 1.851273053586105e-05, "loss": 0.3115, "step": 27840 }, { "epoch": 3.2468516791044775, "grad_norm": 0.4662757007968462, "learning_rate": 1.850477729639638e-05, "loss": 0.3373, "step": 27845 }, { "epoch": 3.247434701492537, "grad_norm": 0.4410616567724678, "learning_rate": 1.8496825394435146e-05, "loss": 0.3283, "step": 27850 }, { "epoch": 3.248017723880597, "grad_norm": 0.4669657380849723, "learning_rate": 1.8488874831159703e-05, "loss": 0.3401, "step": 27855 }, { "epoch": 3.248600746268657, "grad_norm": 0.5169708732045697, "learning_rate": 1.8480925607752248e-05, "loss": 0.3328, "step": 27860 }, { "epoch": 3.2491837686567164, "grad_norm": 0.4820043528671602, "learning_rate": 1.847297772539473e-05, "loss": 0.319, "step": 27865 }, { "epoch": 3.249766791044776, "grad_norm": 0.48121569675310194, "learning_rate": 1.8465031185268943e-05, "loss": 0.3708, "step": 27870 }, { "epoch": 3.2503498134328357, "grad_norm": 0.4156510208095905, "learning_rate": 1.845708598855645e-05, "loss": 0.3327, "step": 27875 }, { "epoch": 3.2509328358208958, "grad_norm": 0.44055453521381466, "learning_rate": 1.8449142136438628e-05, "loss": 0.3186, "step": 27880 }, { "epoch": 3.2515158582089554, "grad_norm": 0.46750972785844375, "learning_rate": 1.8441199630096655e-05, "loss": 0.333, "step": 27885 }, { "epoch": 3.252098880597015, "grad_norm": 0.4822002310139299, "learning_rate": 1.84332584707115e-05, "loss": 0.3268, "step": 27890 }, { "epoch": 3.2526819029850746, "grad_norm": 0.45841592667995923, "learning_rate": 1.8425318659463942e-05, "loss": 0.3139, "step": 27895 }, { "epoch": 3.2532649253731343, "grad_norm": 0.46469078456422636, "learning_rate": 1.8417380197534558e-05, "loss": 0.3392, "step": 27900 }, { "epoch": 3.253847947761194, "grad_norm": 0.44008698671573293, "learning_rate": 1.8409443086103713e-05, "loss": 0.3288, "step": 27905 }, { "epoch": 3.2544309701492535, "grad_norm": 0.42758635449004245, "learning_rate": 1.8401507326351575e-05, "loss": 0.3285, "step": 27910 }, { "epoch": 3.2550139925373136, "grad_norm": 0.45371761745377537, "learning_rate": 1.839357291945813e-05, "loss": 0.3325, "step": 27915 }, { "epoch": 3.2555970149253732, "grad_norm": 0.42570080917107816, "learning_rate": 1.8385639866603144e-05, "loss": 0.3385, "step": 27920 }, { "epoch": 3.256180037313433, "grad_norm": 0.47613616679112847, "learning_rate": 1.837770816896618e-05, "loss": 0.3464, "step": 27925 }, { "epoch": 3.2567630597014925, "grad_norm": 0.4875952718236274, "learning_rate": 1.836977782772661e-05, "loss": 0.3269, "step": 27930 }, { "epoch": 3.257346082089552, "grad_norm": 0.4574369613739823, "learning_rate": 1.8361848844063594e-05, "loss": 0.3313, "step": 27935 }, { "epoch": 3.2579291044776117, "grad_norm": 0.4703903863214078, "learning_rate": 1.8353921219156102e-05, "loss": 0.3297, "step": 27940 }, { "epoch": 3.258512126865672, "grad_norm": 0.47177636995396366, "learning_rate": 1.83459949541829e-05, "loss": 0.3202, "step": 27945 }, { "epoch": 3.2590951492537314, "grad_norm": 0.45047096290180144, "learning_rate": 1.8338070050322544e-05, "loss": 0.3508, "step": 27950 }, { "epoch": 3.259678171641791, "grad_norm": 0.4765900942250275, "learning_rate": 1.8330146508753377e-05, "loss": 0.338, "step": 27955 }, { "epoch": 3.2602611940298507, "grad_norm": 0.46011900679469064, "learning_rate": 1.8322224330653576e-05, "loss": 0.3252, "step": 27960 }, { "epoch": 3.2608442164179103, "grad_norm": 0.4515748400652757, "learning_rate": 1.8314303517201076e-05, "loss": 0.3283, "step": 27965 }, { "epoch": 3.26142723880597, "grad_norm": 0.46490340258957585, "learning_rate": 1.830638406957364e-05, "loss": 0.328, "step": 27970 }, { "epoch": 3.26201026119403, "grad_norm": 0.43386741663300754, "learning_rate": 1.829846598894881e-05, "loss": 0.3362, "step": 27975 }, { "epoch": 3.2625932835820897, "grad_norm": 0.45571419509069383, "learning_rate": 1.8290549276503915e-05, "loss": 0.3349, "step": 27980 }, { "epoch": 3.2631763059701493, "grad_norm": 0.4826439613582884, "learning_rate": 1.8282633933416115e-05, "loss": 0.3425, "step": 27985 }, { "epoch": 3.263759328358209, "grad_norm": 0.45438105183907856, "learning_rate": 1.8274719960862325e-05, "loss": 0.3282, "step": 27990 }, { "epoch": 3.2643423507462686, "grad_norm": 0.4754247902304436, "learning_rate": 1.82668073600193e-05, "loss": 0.3252, "step": 27995 }, { "epoch": 3.264925373134328, "grad_norm": 0.46722667906052695, "learning_rate": 1.825889613206355e-05, "loss": 0.335, "step": 28000 }, { "epoch": 3.2655083955223883, "grad_norm": 0.46493889162506147, "learning_rate": 1.82509862781714e-05, "loss": 0.3212, "step": 28005 }, { "epoch": 3.266091417910448, "grad_norm": 0.4612213513933666, "learning_rate": 1.824307779951898e-05, "loss": 0.3217, "step": 28010 }, { "epoch": 3.2666744402985075, "grad_norm": 0.4643578057614236, "learning_rate": 1.8235170697282194e-05, "loss": 0.3108, "step": 28015 }, { "epoch": 3.267257462686567, "grad_norm": 0.44275932576560134, "learning_rate": 1.8227264972636758e-05, "loss": 0.3018, "step": 28020 }, { "epoch": 3.2678404850746268, "grad_norm": 0.46553997310047907, "learning_rate": 1.8219360626758185e-05, "loss": 0.3301, "step": 28025 }, { "epoch": 3.2684235074626864, "grad_norm": 0.46688242085330117, "learning_rate": 1.821145766082176e-05, "loss": 0.3389, "step": 28030 }, { "epoch": 3.2690065298507465, "grad_norm": 0.45708609894384883, "learning_rate": 1.820355607600259e-05, "loss": 0.3459, "step": 28035 }, { "epoch": 3.269589552238806, "grad_norm": 0.4591085831877663, "learning_rate": 1.8195655873475554e-05, "loss": 0.3447, "step": 28040 }, { "epoch": 3.2701725746268657, "grad_norm": 0.43371501436425636, "learning_rate": 1.818775705441535e-05, "loss": 0.3085, "step": 28045 }, { "epoch": 3.2707555970149254, "grad_norm": 0.4565167376854876, "learning_rate": 1.8179859619996448e-05, "loss": 0.3408, "step": 28050 }, { "epoch": 3.271338619402985, "grad_norm": 0.44336035486171527, "learning_rate": 1.8171963571393112e-05, "loss": 0.333, "step": 28055 }, { "epoch": 3.2719216417910446, "grad_norm": 0.451954050568279, "learning_rate": 1.8164068909779437e-05, "loss": 0.3195, "step": 28060 }, { "epoch": 3.2725046641791042, "grad_norm": 0.5202880557337407, "learning_rate": 1.8156175636329252e-05, "loss": 0.3344, "step": 28065 }, { "epoch": 3.2730876865671643, "grad_norm": 0.5296368128506362, "learning_rate": 1.814828375221623e-05, "loss": 0.3298, "step": 28070 }, { "epoch": 3.273670708955224, "grad_norm": 0.43793489149007375, "learning_rate": 1.814039325861382e-05, "loss": 0.3353, "step": 28075 }, { "epoch": 3.2742537313432836, "grad_norm": 0.4584593923335097, "learning_rate": 1.8132504156695245e-05, "loss": 0.3334, "step": 28080 }, { "epoch": 3.274836753731343, "grad_norm": 0.4861907953454029, "learning_rate": 1.812461644763355e-05, "loss": 0.3429, "step": 28085 }, { "epoch": 3.275419776119403, "grad_norm": 0.46246625320099805, "learning_rate": 1.8116730132601565e-05, "loss": 0.3217, "step": 28090 }, { "epoch": 3.276002798507463, "grad_norm": 0.47876767332398457, "learning_rate": 1.8108845212771913e-05, "loss": 0.3137, "step": 28095 }, { "epoch": 3.2765858208955225, "grad_norm": 0.44409562799986846, "learning_rate": 1.8100961689317003e-05, "loss": 0.3318, "step": 28100 }, { "epoch": 3.277168843283582, "grad_norm": 0.40478347277432103, "learning_rate": 1.8093079563409017e-05, "loss": 0.3146, "step": 28105 }, { "epoch": 3.277751865671642, "grad_norm": 0.47570380198610074, "learning_rate": 1.808519883621999e-05, "loss": 0.3245, "step": 28110 }, { "epoch": 3.2783348880597014, "grad_norm": 0.45409502237373334, "learning_rate": 1.807731950892167e-05, "loss": 0.3191, "step": 28115 }, { "epoch": 3.278917910447761, "grad_norm": 0.47488286340939584, "learning_rate": 1.806944158268568e-05, "loss": 0.3346, "step": 28120 }, { "epoch": 3.2795009328358207, "grad_norm": 0.4421012793522906, "learning_rate": 1.8061565058683362e-05, "loss": 0.3258, "step": 28125 }, { "epoch": 3.2800839552238807, "grad_norm": 0.4359737526431151, "learning_rate": 1.805368993808589e-05, "loss": 0.3119, "step": 28130 }, { "epoch": 3.2806669776119404, "grad_norm": 0.4714093924509547, "learning_rate": 1.8045816222064212e-05, "loss": 0.3295, "step": 28135 }, { "epoch": 3.28125, "grad_norm": 0.48000776117763994, "learning_rate": 1.803794391178908e-05, "loss": 0.3311, "step": 28140 }, { "epoch": 3.2818330223880596, "grad_norm": 0.45661806602343735, "learning_rate": 1.8030073008431032e-05, "loss": 0.3432, "step": 28145 }, { "epoch": 3.2824160447761193, "grad_norm": 0.4398196977266842, "learning_rate": 1.8022203513160406e-05, "loss": 0.3021, "step": 28150 }, { "epoch": 3.2829990671641793, "grad_norm": 0.48306013572476025, "learning_rate": 1.8014335427147288e-05, "loss": 0.3325, "step": 28155 }, { "epoch": 3.283582089552239, "grad_norm": 0.44625314047636055, "learning_rate": 1.8006468751561628e-05, "loss": 0.3257, "step": 28160 }, { "epoch": 3.2841651119402986, "grad_norm": 0.4466349940081072, "learning_rate": 1.7998603487573095e-05, "loss": 0.3206, "step": 28165 }, { "epoch": 3.284748134328358, "grad_norm": 0.4676684095924915, "learning_rate": 1.7990739636351188e-05, "loss": 0.3321, "step": 28170 }, { "epoch": 3.285331156716418, "grad_norm": 0.46672320227419334, "learning_rate": 1.798287719906519e-05, "loss": 0.307, "step": 28175 }, { "epoch": 3.2859141791044775, "grad_norm": 0.44967161705789455, "learning_rate": 1.797501617688417e-05, "loss": 0.3159, "step": 28180 }, { "epoch": 3.286497201492537, "grad_norm": 0.4531141777991533, "learning_rate": 1.7967156570976974e-05, "loss": 0.3355, "step": 28185 }, { "epoch": 3.287080223880597, "grad_norm": 0.41605086071344555, "learning_rate": 1.795929838251227e-05, "loss": 0.3208, "step": 28190 }, { "epoch": 3.287663246268657, "grad_norm": 0.4785387238146306, "learning_rate": 1.7951441612658488e-05, "loss": 0.3427, "step": 28195 }, { "epoch": 3.2882462686567164, "grad_norm": 0.4416147481573904, "learning_rate": 1.7943586262583846e-05, "loss": 0.3247, "step": 28200 }, { "epoch": 3.288829291044776, "grad_norm": 0.48887574830993924, "learning_rate": 1.7935732333456362e-05, "loss": 0.3474, "step": 28205 }, { "epoch": 3.2894123134328357, "grad_norm": 0.4048883473534703, "learning_rate": 1.7927879826443844e-05, "loss": 0.3243, "step": 28210 }, { "epoch": 3.2899953358208958, "grad_norm": 0.4871027223708983, "learning_rate": 1.7920028742713885e-05, "loss": 0.3483, "step": 28215 }, { "epoch": 3.2905783582089554, "grad_norm": 0.4446307158366329, "learning_rate": 1.791217908343386e-05, "loss": 0.3227, "step": 28220 }, { "epoch": 3.291161380597015, "grad_norm": 0.46756764098881587, "learning_rate": 1.7904330849770957e-05, "loss": 0.3263, "step": 28225 }, { "epoch": 3.2917444029850746, "grad_norm": 0.48562959405852096, "learning_rate": 1.78964840428921e-05, "loss": 0.334, "step": 28230 }, { "epoch": 3.2923274253731343, "grad_norm": 0.47108291908589084, "learning_rate": 1.788863866396407e-05, "loss": 0.3198, "step": 28235 }, { "epoch": 3.292910447761194, "grad_norm": 0.45797650806212836, "learning_rate": 1.7880794714153366e-05, "loss": 0.3291, "step": 28240 }, { "epoch": 3.2934934701492535, "grad_norm": 0.46790517134111836, "learning_rate": 1.7872952194626337e-05, "loss": 0.3249, "step": 28245 }, { "epoch": 3.2940764925373136, "grad_norm": 0.4753403323837971, "learning_rate": 1.786511110654907e-05, "loss": 0.3446, "step": 28250 }, { "epoch": 3.2946595149253732, "grad_norm": 0.44769364250340016, "learning_rate": 1.785727145108747e-05, "loss": 0.3241, "step": 28255 }, { "epoch": 3.295242537313433, "grad_norm": 0.45298169045203396, "learning_rate": 1.784943322940722e-05, "loss": 0.3369, "step": 28260 }, { "epoch": 3.2958255597014925, "grad_norm": 0.44704865585719183, "learning_rate": 1.7841596442673785e-05, "loss": 0.3225, "step": 28265 }, { "epoch": 3.296408582089552, "grad_norm": 0.4229007651588271, "learning_rate": 1.7833761092052415e-05, "loss": 0.3268, "step": 28270 }, { "epoch": 3.2969916044776117, "grad_norm": 0.4693133143323309, "learning_rate": 1.782592717870817e-05, "loss": 0.3252, "step": 28275 }, { "epoch": 3.297574626865672, "grad_norm": 0.45258144180134846, "learning_rate": 1.7818094703805837e-05, "loss": 0.3169, "step": 28280 }, { "epoch": 3.2981576492537314, "grad_norm": 0.47177011099188704, "learning_rate": 1.7810263668510084e-05, "loss": 0.3338, "step": 28285 }, { "epoch": 3.298740671641791, "grad_norm": 0.4894847741427928, "learning_rate": 1.780243407398527e-05, "loss": 0.3199, "step": 28290 }, { "epoch": 3.2993236940298507, "grad_norm": 0.5215752438023614, "learning_rate": 1.7794605921395607e-05, "loss": 0.3502, "step": 28295 }, { "epoch": 3.2999067164179103, "grad_norm": 0.5104828930950608, "learning_rate": 1.7786779211905048e-05, "loss": 0.3469, "step": 28300 }, { "epoch": 3.30048973880597, "grad_norm": 0.4475261165375108, "learning_rate": 1.7778953946677353e-05, "loss": 0.3341, "step": 28305 }, { "epoch": 3.30107276119403, "grad_norm": 0.4698180033120021, "learning_rate": 1.7771130126876068e-05, "loss": 0.3309, "step": 28310 }, { "epoch": 3.3016557835820897, "grad_norm": 0.45325965435071763, "learning_rate": 1.776330775366452e-05, "loss": 0.3092, "step": 28315 }, { "epoch": 3.3022388059701493, "grad_norm": 0.465617570605433, "learning_rate": 1.775548682820582e-05, "loss": 0.3468, "step": 28320 }, { "epoch": 3.302821828358209, "grad_norm": 0.4542573607391547, "learning_rate": 1.774766735166287e-05, "loss": 0.3085, "step": 28325 }, { "epoch": 3.3034048507462686, "grad_norm": 0.4345564294184516, "learning_rate": 1.7739849325198334e-05, "loss": 0.3117, "step": 28330 }, { "epoch": 3.303987873134328, "grad_norm": 0.4843823089731792, "learning_rate": 1.7732032749974704e-05, "loss": 0.2899, "step": 28335 }, { "epoch": 3.3045708955223883, "grad_norm": 0.4660624778568792, "learning_rate": 1.7724217627154204e-05, "loss": 0.3313, "step": 28340 }, { "epoch": 3.305153917910448, "grad_norm": 0.4837352189507676, "learning_rate": 1.771640395789888e-05, "loss": 0.3398, "step": 28345 }, { "epoch": 3.3057369402985075, "grad_norm": 0.5750150917239619, "learning_rate": 1.7708591743370555e-05, "loss": 0.3346, "step": 28350 }, { "epoch": 3.306319962686567, "grad_norm": 0.4942291051775827, "learning_rate": 1.7700780984730818e-05, "loss": 0.3194, "step": 28355 }, { "epoch": 3.3069029850746268, "grad_norm": 0.44461377262811785, "learning_rate": 1.7692971683141063e-05, "loss": 0.3116, "step": 28360 }, { "epoch": 3.3074860074626864, "grad_norm": 0.40068238418777885, "learning_rate": 1.7685163839762457e-05, "loss": 0.3085, "step": 28365 }, { "epoch": 3.3080690298507465, "grad_norm": 0.46351437324640127, "learning_rate": 1.7677357455755954e-05, "loss": 0.3155, "step": 28370 }, { "epoch": 3.308652052238806, "grad_norm": 0.4289306287963423, "learning_rate": 1.7669552532282287e-05, "loss": 0.3295, "step": 28375 }, { "epoch": 3.3092350746268657, "grad_norm": 0.46317987590943466, "learning_rate": 1.766174907050196e-05, "loss": 0.326, "step": 28380 }, { "epoch": 3.3098180970149254, "grad_norm": 0.511695342506279, "learning_rate": 1.765394707157529e-05, "loss": 0.3434, "step": 28385 }, { "epoch": 3.310401119402985, "grad_norm": 0.4556319825438499, "learning_rate": 1.764614653666235e-05, "loss": 0.3212, "step": 28390 }, { "epoch": 3.3109841417910446, "grad_norm": 0.439370769368817, "learning_rate": 1.763834746692301e-05, "loss": 0.3132, "step": 28395 }, { "epoch": 3.3115671641791042, "grad_norm": 0.5276460677109263, "learning_rate": 1.7630549863516914e-05, "loss": 0.3447, "step": 28400 }, { "epoch": 3.3121501865671643, "grad_norm": 0.44358787596302135, "learning_rate": 1.76227537276035e-05, "loss": 0.3127, "step": 28405 }, { "epoch": 3.312733208955224, "grad_norm": 0.5168235568497626, "learning_rate": 1.7614959060341968e-05, "loss": 0.3381, "step": 28410 }, { "epoch": 3.3133162313432836, "grad_norm": 0.46956042215881105, "learning_rate": 1.7607165862891302e-05, "loss": 0.3257, "step": 28415 }, { "epoch": 3.313899253731343, "grad_norm": 0.473856643686133, "learning_rate": 1.75993741364103e-05, "loss": 0.3171, "step": 28420 }, { "epoch": 3.314482276119403, "grad_norm": 0.48769321188112036, "learning_rate": 1.7591583882057495e-05, "loss": 0.3339, "step": 28425 }, { "epoch": 3.315065298507463, "grad_norm": 0.43452129837794445, "learning_rate": 1.7583795100991246e-05, "loss": 0.3246, "step": 28430 }, { "epoch": 3.3156483208955225, "grad_norm": 0.42564160138718315, "learning_rate": 1.7576007794369648e-05, "loss": 0.3266, "step": 28435 }, { "epoch": 3.316231343283582, "grad_norm": 0.44775557185309367, "learning_rate": 1.7568221963350605e-05, "loss": 0.31, "step": 28440 }, { "epoch": 3.316814365671642, "grad_norm": 0.443918874549826, "learning_rate": 1.75604376090918e-05, "loss": 0.3422, "step": 28445 }, { "epoch": 3.3173973880597014, "grad_norm": 0.48853054823756, "learning_rate": 1.755265473275069e-05, "loss": 0.3398, "step": 28450 }, { "epoch": 3.317980410447761, "grad_norm": 0.4453288753925691, "learning_rate": 1.7544873335484514e-05, "loss": 0.3315, "step": 28455 }, { "epoch": 3.3185634328358207, "grad_norm": 0.4706736696432995, "learning_rate": 1.7537093418450294e-05, "loss": 0.3323, "step": 28460 }, { "epoch": 3.3191464552238807, "grad_norm": 0.4622062786366639, "learning_rate": 1.7529314982804817e-05, "loss": 0.32, "step": 28465 }, { "epoch": 3.3197294776119404, "grad_norm": 0.5128681127573129, "learning_rate": 1.7521538029704682e-05, "loss": 0.3417, "step": 28470 }, { "epoch": 3.3203125, "grad_norm": 0.42812012078225126, "learning_rate": 1.7513762560306224e-05, "loss": 0.3266, "step": 28475 }, { "epoch": 3.3208955223880596, "grad_norm": 0.4179109474240003, "learning_rate": 1.750598857576561e-05, "loss": 0.332, "step": 28480 }, { "epoch": 3.3214785447761193, "grad_norm": 0.43273395310547047, "learning_rate": 1.749821607723873e-05, "loss": 0.3304, "step": 28485 }, { "epoch": 3.3220615671641793, "grad_norm": 0.45259230208095724, "learning_rate": 1.749044506588129e-05, "loss": 0.3101, "step": 28490 }, { "epoch": 3.322644589552239, "grad_norm": 0.5092566706794512, "learning_rate": 1.748267554284877e-05, "loss": 0.3426, "step": 28495 }, { "epoch": 3.3232276119402986, "grad_norm": 0.46642444532670313, "learning_rate": 1.7474907509296412e-05, "loss": 0.3219, "step": 28500 }, { "epoch": 3.323810634328358, "grad_norm": 0.48434216570939764, "learning_rate": 1.746714096637926e-05, "loss": 0.3509, "step": 28505 }, { "epoch": 3.324393656716418, "grad_norm": 0.4625398070916998, "learning_rate": 1.7459375915252123e-05, "loss": 0.3382, "step": 28510 }, { "epoch": 3.3249766791044775, "grad_norm": 0.4731801618433754, "learning_rate": 1.745161235706958e-05, "loss": 0.3149, "step": 28515 }, { "epoch": 3.325559701492537, "grad_norm": 0.5480336339026786, "learning_rate": 1.7443850292986007e-05, "loss": 0.321, "step": 28520 }, { "epoch": 3.326142723880597, "grad_norm": 0.4731093715929599, "learning_rate": 1.7436089724155545e-05, "loss": 0.3352, "step": 28525 }, { "epoch": 3.326725746268657, "grad_norm": 0.482348860518997, "learning_rate": 1.742833065173212e-05, "loss": 0.3195, "step": 28530 }, { "epoch": 3.3273087686567164, "grad_norm": 0.512404518918994, "learning_rate": 1.7420573076869422e-05, "loss": 0.3419, "step": 28535 }, { "epoch": 3.327891791044776, "grad_norm": 0.44950430326831653, "learning_rate": 1.7412817000720937e-05, "loss": 0.3461, "step": 28540 }, { "epoch": 3.3284748134328357, "grad_norm": 0.4675428940933235, "learning_rate": 1.7405062424439916e-05, "loss": 0.3174, "step": 28545 }, { "epoch": 3.3290578358208958, "grad_norm": 0.46836753540610626, "learning_rate": 1.7397309349179393e-05, "loss": 0.311, "step": 28550 }, { "epoch": 3.3296408582089554, "grad_norm": 0.4612559364998148, "learning_rate": 1.7389557776092182e-05, "loss": 0.3388, "step": 28555 }, { "epoch": 3.330223880597015, "grad_norm": 0.4710098687914657, "learning_rate": 1.738180770633085e-05, "loss": 0.3294, "step": 28560 }, { "epoch": 3.3308069029850746, "grad_norm": 0.48319244955653473, "learning_rate": 1.7374059141047772e-05, "loss": 0.3349, "step": 28565 }, { "epoch": 3.3313899253731343, "grad_norm": 0.4742826927232586, "learning_rate": 1.7366312081395075e-05, "loss": 0.3496, "step": 28570 }, { "epoch": 3.331972947761194, "grad_norm": 0.4679596663176974, "learning_rate": 1.735856652852468e-05, "loss": 0.3378, "step": 28575 }, { "epoch": 3.3325559701492535, "grad_norm": 0.4700004437056801, "learning_rate": 1.7350822483588277e-05, "loss": 0.3407, "step": 28580 }, { "epoch": 3.3331389925373136, "grad_norm": 0.4661625701935188, "learning_rate": 1.734307994773734e-05, "loss": 0.3146, "step": 28585 }, { "epoch": 3.3337220149253732, "grad_norm": 0.4274293603330983, "learning_rate": 1.7335338922123076e-05, "loss": 0.3395, "step": 28590 }, { "epoch": 3.334305037313433, "grad_norm": 0.4357412023859455, "learning_rate": 1.732759940789655e-05, "loss": 0.3066, "step": 28595 }, { "epoch": 3.3348880597014925, "grad_norm": 0.47209808031962, "learning_rate": 1.7319861406208504e-05, "loss": 0.3258, "step": 28600 }, { "epoch": 3.335471082089552, "grad_norm": 0.4310846757244315, "learning_rate": 1.7312124918209548e-05, "loss": 0.337, "step": 28605 }, { "epoch": 3.3360541044776117, "grad_norm": 0.45143116184202875, "learning_rate": 1.7304389945050004e-05, "loss": 0.3227, "step": 28610 }, { "epoch": 3.336637126865672, "grad_norm": 0.4718825945233989, "learning_rate": 1.729665648787998e-05, "loss": 0.335, "step": 28615 }, { "epoch": 3.3372201492537314, "grad_norm": 0.45236652474602207, "learning_rate": 1.728892454784938e-05, "loss": 0.3228, "step": 28620 }, { "epoch": 3.337803171641791, "grad_norm": 0.4756684838279807, "learning_rate": 1.7281194126107864e-05, "loss": 0.337, "step": 28625 }, { "epoch": 3.3383861940298507, "grad_norm": 0.4832103297157127, "learning_rate": 1.7273465223804876e-05, "loss": 0.3235, "step": 28630 }, { "epoch": 3.3389692164179103, "grad_norm": 0.4620669106805667, "learning_rate": 1.726573784208963e-05, "loss": 0.3352, "step": 28635 }, { "epoch": 3.33955223880597, "grad_norm": 0.457379511350836, "learning_rate": 1.7258011982111094e-05, "loss": 0.3165, "step": 28640 }, { "epoch": 3.34013526119403, "grad_norm": 0.4837806776741993, "learning_rate": 1.7250287645018067e-05, "loss": 0.3398, "step": 28645 }, { "epoch": 3.3407182835820897, "grad_norm": 0.47797831299703214, "learning_rate": 1.7242564831959045e-05, "loss": 0.3164, "step": 28650 }, { "epoch": 3.3413013059701493, "grad_norm": 0.45061023593329286, "learning_rate": 1.7234843544082368e-05, "loss": 0.3114, "step": 28655 }, { "epoch": 3.341884328358209, "grad_norm": 0.4699462388404545, "learning_rate": 1.72271237825361e-05, "loss": 0.3261, "step": 28660 }, { "epoch": 3.3424673507462686, "grad_norm": 0.4408518335167113, "learning_rate": 1.7219405548468098e-05, "loss": 0.3109, "step": 28665 }, { "epoch": 3.343050373134328, "grad_norm": 0.47090059737791345, "learning_rate": 1.7211688843025987e-05, "loss": 0.3419, "step": 28670 }, { "epoch": 3.3436333955223883, "grad_norm": 0.4644265693422495, "learning_rate": 1.7203973667357177e-05, "loss": 0.3281, "step": 28675 }, { "epoch": 3.344216417910448, "grad_norm": 0.4373562328813315, "learning_rate": 1.7196260022608828e-05, "loss": 0.3275, "step": 28680 }, { "epoch": 3.3447994402985075, "grad_norm": 0.47346008251378013, "learning_rate": 1.7188547909927903e-05, "loss": 0.3282, "step": 28685 }, { "epoch": 3.345382462686567, "grad_norm": 1.0564611428526927, "learning_rate": 1.7180837330461093e-05, "loss": 0.3364, "step": 28690 }, { "epoch": 3.3459654850746268, "grad_norm": 0.4913014920273401, "learning_rate": 1.7173128285354917e-05, "loss": 0.3347, "step": 28695 }, { "epoch": 3.3465485074626864, "grad_norm": 0.4357942703481503, "learning_rate": 1.716542077575561e-05, "loss": 0.3373, "step": 28700 }, { "epoch": 3.3471315298507465, "grad_norm": 0.47395184014634933, "learning_rate": 1.7157714802809223e-05, "loss": 0.3308, "step": 28705 }, { "epoch": 3.347714552238806, "grad_norm": 0.4802861614279903, "learning_rate": 1.7150010367661546e-05, "loss": 0.3464, "step": 28710 }, { "epoch": 3.3482975746268657, "grad_norm": 0.47638191405072194, "learning_rate": 1.7142307471458165e-05, "loss": 0.3293, "step": 28715 }, { "epoch": 3.3488805970149254, "grad_norm": 0.48583329962606997, "learning_rate": 1.7134606115344427e-05, "loss": 0.3377, "step": 28720 }, { "epoch": 3.349463619402985, "grad_norm": 0.4708102851131365, "learning_rate": 1.7126906300465446e-05, "loss": 0.3311, "step": 28725 }, { "epoch": 3.3500466417910446, "grad_norm": 0.44741603367765165, "learning_rate": 1.7119208027966116e-05, "loss": 0.3022, "step": 28730 }, { "epoch": 3.3506296641791042, "grad_norm": 0.4369750820887205, "learning_rate": 1.7111511298991088e-05, "loss": 0.315, "step": 28735 }, { "epoch": 3.3512126865671643, "grad_norm": 0.4635991699528975, "learning_rate": 1.710381611468479e-05, "loss": 0.3441, "step": 28740 }, { "epoch": 3.351795708955224, "grad_norm": 0.4393989496610577, "learning_rate": 1.7096122476191434e-05, "loss": 0.3375, "step": 28745 }, { "epoch": 3.3523787313432836, "grad_norm": 0.4575591715744827, "learning_rate": 1.7088430384654984e-05, "loss": 0.3183, "step": 28750 }, { "epoch": 3.352961753731343, "grad_norm": 0.4837015506161303, "learning_rate": 1.7080739841219175e-05, "loss": 0.3005, "step": 28755 }, { "epoch": 3.353544776119403, "grad_norm": 0.41873851176301535, "learning_rate": 1.7073050847027537e-05, "loss": 0.3254, "step": 28760 }, { "epoch": 3.354127798507463, "grad_norm": 0.4385907189743988, "learning_rate": 1.7065363403223316e-05, "loss": 0.3221, "step": 28765 }, { "epoch": 3.3547108208955225, "grad_norm": 0.4703976876800958, "learning_rate": 1.7057677510949598e-05, "loss": 0.3454, "step": 28770 }, { "epoch": 3.355293843283582, "grad_norm": 0.44711954650808494, "learning_rate": 1.704999317134917e-05, "loss": 0.3384, "step": 28775 }, { "epoch": 3.355876865671642, "grad_norm": 0.45746519877296554, "learning_rate": 1.704231038556465e-05, "loss": 0.323, "step": 28780 }, { "epoch": 3.3564598880597014, "grad_norm": 0.48997222303227145, "learning_rate": 1.7034629154738368e-05, "loss": 0.3455, "step": 28785 }, { "epoch": 3.357042910447761, "grad_norm": 0.44365154822478664, "learning_rate": 1.702694948001246e-05, "loss": 0.3289, "step": 28790 }, { "epoch": 3.3576259328358207, "grad_norm": 0.4535111888588438, "learning_rate": 1.7019271362528823e-05, "loss": 0.3118, "step": 28795 }, { "epoch": 3.3582089552238807, "grad_norm": 0.47714760336776346, "learning_rate": 1.701159480342911e-05, "loss": 0.3335, "step": 28800 }, { "epoch": 3.3587919776119404, "grad_norm": 0.48959669410442797, "learning_rate": 1.7003919803854765e-05, "loss": 0.3325, "step": 28805 }, { "epoch": 3.359375, "grad_norm": 0.46285775475973334, "learning_rate": 1.6996246364946985e-05, "loss": 0.3116, "step": 28810 }, { "epoch": 3.3599580223880596, "grad_norm": 0.4169969152918247, "learning_rate": 1.698857448784672e-05, "loss": 0.3417, "step": 28815 }, { "epoch": 3.3605410447761193, "grad_norm": 0.42493875863731995, "learning_rate": 1.6980904173694727e-05, "loss": 0.3176, "step": 28820 }, { "epoch": 3.3611240671641793, "grad_norm": 0.479417795940034, "learning_rate": 1.6973235423631488e-05, "loss": 0.3339, "step": 28825 }, { "epoch": 3.361707089552239, "grad_norm": 0.46063227863602507, "learning_rate": 1.69655682387973e-05, "loss": 0.3213, "step": 28830 }, { "epoch": 3.3622901119402986, "grad_norm": 0.42695906558339464, "learning_rate": 1.6957902620332163e-05, "loss": 0.3249, "step": 28835 }, { "epoch": 3.362873134328358, "grad_norm": 0.46454051173634997, "learning_rate": 1.695023856937591e-05, "loss": 0.3426, "step": 28840 }, { "epoch": 3.363456156716418, "grad_norm": 0.42512367792774186, "learning_rate": 1.6942576087068096e-05, "loss": 0.3224, "step": 28845 }, { "epoch": 3.3640391791044775, "grad_norm": 0.43984928949360896, "learning_rate": 1.6934915174548073e-05, "loss": 0.3272, "step": 28850 }, { "epoch": 3.364622201492537, "grad_norm": 0.4761012354743501, "learning_rate": 1.6927255832954934e-05, "loss": 0.3332, "step": 28855 }, { "epoch": 3.365205223880597, "grad_norm": 0.4349741218695636, "learning_rate": 1.691959806342756e-05, "loss": 0.3166, "step": 28860 }, { "epoch": 3.365788246268657, "grad_norm": 0.4941038573767465, "learning_rate": 1.691194186710457e-05, "loss": 0.3307, "step": 28865 }, { "epoch": 3.3663712686567164, "grad_norm": 0.46814148887806617, "learning_rate": 1.690428724512439e-05, "loss": 0.3354, "step": 28870 }, { "epoch": 3.366954291044776, "grad_norm": 0.4927434354158718, "learning_rate": 1.6896634198625167e-05, "loss": 0.3062, "step": 28875 }, { "epoch": 3.3675373134328357, "grad_norm": 0.4495077343515924, "learning_rate": 1.688898272874485e-05, "loss": 0.3162, "step": 28880 }, { "epoch": 3.3681203358208958, "grad_norm": 0.42772372072909065, "learning_rate": 1.688133283662113e-05, "loss": 0.3207, "step": 28885 }, { "epoch": 3.3687033582089554, "grad_norm": 0.506251918914207, "learning_rate": 1.6873684523391487e-05, "loss": 0.3232, "step": 28890 }, { "epoch": 3.369286380597015, "grad_norm": 0.44161011006478523, "learning_rate": 1.6866037790193135e-05, "loss": 0.3217, "step": 28895 }, { "epoch": 3.3698694029850746, "grad_norm": 0.47497485510111964, "learning_rate": 1.685839263816308e-05, "loss": 0.3391, "step": 28900 }, { "epoch": 3.3704524253731343, "grad_norm": 0.4446775217842558, "learning_rate": 1.6850749068438092e-05, "loss": 0.3113, "step": 28905 }, { "epoch": 3.371035447761194, "grad_norm": 0.4456686993412567, "learning_rate": 1.6843107082154675e-05, "loss": 0.3224, "step": 28910 }, { "epoch": 3.3716184701492535, "grad_norm": 0.4640905670462758, "learning_rate": 1.6835466680449128e-05, "loss": 0.339, "step": 28915 }, { "epoch": 3.3722014925373136, "grad_norm": 0.4449103955020881, "learning_rate": 1.68278278644575e-05, "loss": 0.3248, "step": 28920 }, { "epoch": 3.3727845149253732, "grad_norm": 0.4272115502334145, "learning_rate": 1.6820190635315627e-05, "loss": 0.3202, "step": 28925 }, { "epoch": 3.373367537313433, "grad_norm": 0.4984313639098382, "learning_rate": 1.6812554994159073e-05, "loss": 0.3489, "step": 28930 }, { "epoch": 3.3739505597014925, "grad_norm": 0.45636220850360476, "learning_rate": 1.6804920942123207e-05, "loss": 0.3609, "step": 28935 }, { "epoch": 3.374533582089552, "grad_norm": 0.5119129821437987, "learning_rate": 1.679728848034311e-05, "loss": 0.3361, "step": 28940 }, { "epoch": 3.3751166044776117, "grad_norm": 0.4832296650669876, "learning_rate": 1.6789657609953686e-05, "loss": 0.2973, "step": 28945 }, { "epoch": 3.375699626865672, "grad_norm": 0.49182622850313595, "learning_rate": 1.678202833208954e-05, "loss": 0.3598, "step": 28950 }, { "epoch": 3.3762826492537314, "grad_norm": 0.5078875959243815, "learning_rate": 1.6774400647885106e-05, "loss": 0.3187, "step": 28955 }, { "epoch": 3.376865671641791, "grad_norm": 0.48009172975848813, "learning_rate": 1.6766774558474523e-05, "loss": 0.3383, "step": 28960 }, { "epoch": 3.3774486940298507, "grad_norm": 0.45320603960838435, "learning_rate": 1.675915006499173e-05, "loss": 0.3169, "step": 28965 }, { "epoch": 3.3780317164179103, "grad_norm": 0.4372366280181476, "learning_rate": 1.675152716857041e-05, "loss": 0.3334, "step": 28970 }, { "epoch": 3.37861473880597, "grad_norm": 0.5026424127544947, "learning_rate": 1.6743905870344017e-05, "loss": 0.32, "step": 28975 }, { "epoch": 3.37919776119403, "grad_norm": 0.4433482644885729, "learning_rate": 1.6736286171445763e-05, "loss": 0.3242, "step": 28980 }, { "epoch": 3.3797807835820897, "grad_norm": 0.4607096737178669, "learning_rate": 1.672866807300864e-05, "loss": 0.3153, "step": 28985 }, { "epoch": 3.3803638059701493, "grad_norm": 0.4534037189966212, "learning_rate": 1.672105157616535e-05, "loss": 0.3156, "step": 28990 }, { "epoch": 3.380946828358209, "grad_norm": 0.4754535574299298, "learning_rate": 1.6713436682048436e-05, "loss": 0.3397, "step": 28995 }, { "epoch": 3.3815298507462686, "grad_norm": 0.48982348927044955, "learning_rate": 1.670582339179012e-05, "loss": 0.3392, "step": 29000 }, { "epoch": 3.382112873134328, "grad_norm": 0.5006803940184319, "learning_rate": 1.669821170652246e-05, "loss": 0.3392, "step": 29005 }, { "epoch": 3.3826958955223883, "grad_norm": 0.43469884603822234, "learning_rate": 1.669060162737722e-05, "loss": 0.3239, "step": 29010 }, { "epoch": 3.383278917910448, "grad_norm": 0.43993247285998877, "learning_rate": 1.6682993155485956e-05, "loss": 0.3302, "step": 29015 }, { "epoch": 3.3838619402985075, "grad_norm": 0.45745638126081706, "learning_rate": 1.667538629197996e-05, "loss": 0.325, "step": 29020 }, { "epoch": 3.384444962686567, "grad_norm": 0.4904067030829676, "learning_rate": 1.6667781037990317e-05, "loss": 0.3493, "step": 29025 }, { "epoch": 3.3850279850746268, "grad_norm": 0.47994232510089185, "learning_rate": 1.666017739464784e-05, "loss": 0.3294, "step": 29030 }, { "epoch": 3.3856110074626864, "grad_norm": 0.43084287031412444, "learning_rate": 1.6652575363083145e-05, "loss": 0.3228, "step": 29035 }, { "epoch": 3.3861940298507465, "grad_norm": 0.5108201858031536, "learning_rate": 1.664497494442654e-05, "loss": 0.3402, "step": 29040 }, { "epoch": 3.386777052238806, "grad_norm": 0.4516662511890705, "learning_rate": 1.6637376139808174e-05, "loss": 0.3215, "step": 29045 }, { "epoch": 3.3873600746268657, "grad_norm": 0.47598794694666274, "learning_rate": 1.6629778950357883e-05, "loss": 0.3293, "step": 29050 }, { "epoch": 3.3879430970149254, "grad_norm": 0.4634828057975539, "learning_rate": 1.6622183377205325e-05, "loss": 0.3522, "step": 29055 }, { "epoch": 3.388526119402985, "grad_norm": 0.4800220506569302, "learning_rate": 1.6614589421479876e-05, "loss": 0.3033, "step": 29060 }, { "epoch": 3.3891091417910446, "grad_norm": 0.45987315002339646, "learning_rate": 1.660699708431068e-05, "loss": 0.3451, "step": 29065 }, { "epoch": 3.3896921641791042, "grad_norm": 0.4840242205468901, "learning_rate": 1.6599406366826648e-05, "loss": 0.3189, "step": 29070 }, { "epoch": 3.3902751865671643, "grad_norm": 0.4681427252517476, "learning_rate": 1.659181727015645e-05, "loss": 0.3242, "step": 29075 }, { "epoch": 3.390858208955224, "grad_norm": 0.5206917501264354, "learning_rate": 1.6584229795428514e-05, "loss": 0.364, "step": 29080 }, { "epoch": 3.3914412313432836, "grad_norm": 0.45132905670678686, "learning_rate": 1.6576643943771026e-05, "loss": 0.3198, "step": 29085 }, { "epoch": 3.392024253731343, "grad_norm": 0.42703489141135753, "learning_rate": 1.656905971631192e-05, "loss": 0.3282, "step": 29090 }, { "epoch": 3.392607276119403, "grad_norm": 0.4956592487889643, "learning_rate": 1.65614771141789e-05, "loss": 0.3318, "step": 29095 }, { "epoch": 3.393190298507463, "grad_norm": 0.4597032552965804, "learning_rate": 1.655389613849943e-05, "loss": 0.3192, "step": 29100 }, { "epoch": 3.3937733208955225, "grad_norm": 0.47238754573752373, "learning_rate": 1.6546316790400726e-05, "loss": 0.3238, "step": 29105 }, { "epoch": 3.394356343283582, "grad_norm": 0.45524961260409447, "learning_rate": 1.653873907100977e-05, "loss": 0.3242, "step": 29110 }, { "epoch": 3.394939365671642, "grad_norm": 0.44529409247332336, "learning_rate": 1.6531162981453286e-05, "loss": 0.3147, "step": 29115 }, { "epoch": 3.3955223880597014, "grad_norm": 0.4393766798726157, "learning_rate": 1.6523588522857784e-05, "loss": 0.3321, "step": 29120 }, { "epoch": 3.396105410447761, "grad_norm": 0.48351579617423796, "learning_rate": 1.6516015696349486e-05, "loss": 0.3219, "step": 29125 }, { "epoch": 3.3966884328358207, "grad_norm": 0.48109320080614015, "learning_rate": 1.6508444503054432e-05, "loss": 0.3297, "step": 29130 }, { "epoch": 3.3972714552238807, "grad_norm": 0.43121269326400713, "learning_rate": 1.6500874944098348e-05, "loss": 0.3341, "step": 29135 }, { "epoch": 3.3978544776119404, "grad_norm": 0.45953798551611186, "learning_rate": 1.6493307020606796e-05, "loss": 0.3231, "step": 29140 }, { "epoch": 3.3984375, "grad_norm": 0.4968957971702939, "learning_rate": 1.648574073370502e-05, "loss": 0.3204, "step": 29145 }, { "epoch": 3.3990205223880596, "grad_norm": 0.484987188142252, "learning_rate": 1.647817608451807e-05, "loss": 0.3483, "step": 29150 }, { "epoch": 3.3996035447761193, "grad_norm": 0.453128891702804, "learning_rate": 1.647061307417073e-05, "loss": 0.3305, "step": 29155 }, { "epoch": 3.4001865671641793, "grad_norm": 0.46707430914885883, "learning_rate": 1.6463051703787557e-05, "loss": 0.3289, "step": 29160 }, { "epoch": 3.400769589552239, "grad_norm": 0.442771230547862, "learning_rate": 1.645549197449285e-05, "loss": 0.3317, "step": 29165 }, { "epoch": 3.4013526119402986, "grad_norm": 0.4518401278396903, "learning_rate": 1.644793388741067e-05, "loss": 0.3192, "step": 29170 }, { "epoch": 3.401935634328358, "grad_norm": 0.46346999965867547, "learning_rate": 1.6440377443664816e-05, "loss": 0.3284, "step": 29175 }, { "epoch": 3.402518656716418, "grad_norm": 0.48560352645991145, "learning_rate": 1.6432822644378888e-05, "loss": 0.3321, "step": 29180 }, { "epoch": 3.4031016791044775, "grad_norm": 0.49327932470834185, "learning_rate": 1.6425269490676176e-05, "loss": 0.3271, "step": 29185 }, { "epoch": 3.403684701492537, "grad_norm": 0.455797669188555, "learning_rate": 1.64177179836798e-05, "loss": 0.3284, "step": 29190 }, { "epoch": 3.404267723880597, "grad_norm": 0.49251452493343373, "learning_rate": 1.641016812451257e-05, "loss": 0.3472, "step": 29195 }, { "epoch": 3.404850746268657, "grad_norm": 0.4423612059084429, "learning_rate": 1.6402619914297087e-05, "loss": 0.3268, "step": 29200 }, { "epoch": 3.4054337686567164, "grad_norm": 0.41882583544444013, "learning_rate": 1.6395073354155694e-05, "loss": 0.3192, "step": 29205 }, { "epoch": 3.406016791044776, "grad_norm": 0.4694412268068585, "learning_rate": 1.6387528445210497e-05, "loss": 0.3452, "step": 29210 }, { "epoch": 3.4065998134328357, "grad_norm": 0.4370163867794354, "learning_rate": 1.6379985188583352e-05, "loss": 0.3067, "step": 29215 }, { "epoch": 3.4071828358208958, "grad_norm": 0.6320314077603403, "learning_rate": 1.6372443585395875e-05, "loss": 0.3453, "step": 29220 }, { "epoch": 3.4077658582089554, "grad_norm": 0.4622433438894852, "learning_rate": 1.6364903636769403e-05, "loss": 0.3216, "step": 29225 }, { "epoch": 3.408348880597015, "grad_norm": 0.4562735122226254, "learning_rate": 1.6357365343825088e-05, "loss": 0.3128, "step": 29230 }, { "epoch": 3.4089319029850746, "grad_norm": 0.43711650765912896, "learning_rate": 1.634982870768378e-05, "loss": 0.3337, "step": 29235 }, { "epoch": 3.4095149253731343, "grad_norm": 0.4934253764714628, "learning_rate": 1.634229372946611e-05, "loss": 0.3184, "step": 29240 }, { "epoch": 3.410097947761194, "grad_norm": 0.47260380063181173, "learning_rate": 1.633476041029246e-05, "loss": 0.3434, "step": 29245 }, { "epoch": 3.4106809701492535, "grad_norm": 0.45886172596055086, "learning_rate": 1.632722875128296e-05, "loss": 0.3261, "step": 29250 }, { "epoch": 3.4112639925373136, "grad_norm": 0.49068242514814053, "learning_rate": 1.6319698753557498e-05, "loss": 0.3669, "step": 29255 }, { "epoch": 3.4118470149253732, "grad_norm": 0.4826921491070021, "learning_rate": 1.6312170418235705e-05, "loss": 0.3275, "step": 29260 }, { "epoch": 3.412430037313433, "grad_norm": 0.7730794447605902, "learning_rate": 1.6304643746436994e-05, "loss": 0.3234, "step": 29265 }, { "epoch": 3.4130130597014925, "grad_norm": 0.4460789197147561, "learning_rate": 1.6297118739280483e-05, "loss": 0.3239, "step": 29270 }, { "epoch": 3.413596082089552, "grad_norm": 0.47225575055167696, "learning_rate": 1.6289595397885074e-05, "loss": 0.3353, "step": 29275 }, { "epoch": 3.4141791044776117, "grad_norm": 0.4689193918446093, "learning_rate": 1.6282073723369427e-05, "loss": 0.3117, "step": 29280 }, { "epoch": 3.414762126865672, "grad_norm": 0.6572589456764506, "learning_rate": 1.6274553716851928e-05, "loss": 0.3172, "step": 29285 }, { "epoch": 3.4153451492537314, "grad_norm": 0.48956681254627316, "learning_rate": 1.6267035379450744e-05, "loss": 0.3249, "step": 29290 }, { "epoch": 3.415928171641791, "grad_norm": 0.4307089857003831, "learning_rate": 1.625951871228378e-05, "loss": 0.3179, "step": 29295 }, { "epoch": 3.4165111940298507, "grad_norm": 0.4394328605862848, "learning_rate": 1.625200371646867e-05, "loss": 0.3266, "step": 29300 }, { "epoch": 3.4170942164179103, "grad_norm": 0.5080366533717237, "learning_rate": 1.6244490393122857e-05, "loss": 0.3227, "step": 29305 }, { "epoch": 3.41767723880597, "grad_norm": 0.472944969837989, "learning_rate": 1.6236978743363464e-05, "loss": 0.3309, "step": 29310 }, { "epoch": 3.41826026119403, "grad_norm": 0.4679474789501229, "learning_rate": 1.622946876830744e-05, "loss": 0.3217, "step": 29315 }, { "epoch": 3.4188432835820897, "grad_norm": 0.46585469354169445, "learning_rate": 1.622196046907141e-05, "loss": 0.3383, "step": 29320 }, { "epoch": 3.4194263059701493, "grad_norm": 0.47036204047896363, "learning_rate": 1.62144538467718e-05, "loss": 0.3158, "step": 29325 }, { "epoch": 3.420009328358209, "grad_norm": 0.4460527976475676, "learning_rate": 1.6206948902524783e-05, "loss": 0.3047, "step": 29330 }, { "epoch": 3.4205923507462686, "grad_norm": 0.4662483884995518, "learning_rate": 1.6199445637446255e-05, "loss": 0.321, "step": 29335 }, { "epoch": 3.421175373134328, "grad_norm": 0.4653190427968612, "learning_rate": 1.619194405265189e-05, "loss": 0.3214, "step": 29340 }, { "epoch": 3.4217583955223883, "grad_norm": 0.4173920207221273, "learning_rate": 1.618444414925711e-05, "loss": 0.3136, "step": 29345 }, { "epoch": 3.422341417910448, "grad_norm": 0.4654712782520686, "learning_rate": 1.617694592837705e-05, "loss": 0.3128, "step": 29350 }, { "epoch": 3.4229244402985075, "grad_norm": 0.48019523489175087, "learning_rate": 1.616944939112666e-05, "loss": 0.3112, "step": 29355 }, { "epoch": 3.423507462686567, "grad_norm": 0.4820369050056036, "learning_rate": 1.616195453862057e-05, "loss": 0.3253, "step": 29360 }, { "epoch": 3.4240904850746268, "grad_norm": 0.4337344580595188, "learning_rate": 1.615446137197322e-05, "loss": 0.3208, "step": 29365 }, { "epoch": 3.4246735074626864, "grad_norm": 0.4857873891669292, "learning_rate": 1.614696989229876e-05, "loss": 0.3318, "step": 29370 }, { "epoch": 3.4252565298507465, "grad_norm": 0.44245351929052856, "learning_rate": 1.613948010071109e-05, "loss": 0.3149, "step": 29375 }, { "epoch": 3.425839552238806, "grad_norm": 0.4523023080052427, "learning_rate": 1.6131991998323893e-05, "loss": 0.3151, "step": 29380 }, { "epoch": 3.4264225746268657, "grad_norm": 0.46242880869989617, "learning_rate": 1.612450558625056e-05, "loss": 0.3332, "step": 29385 }, { "epoch": 3.4270055970149254, "grad_norm": 0.4335355944581716, "learning_rate": 1.611702086560426e-05, "loss": 0.3205, "step": 29390 }, { "epoch": 3.427588619402985, "grad_norm": 0.4714346715641154, "learning_rate": 1.61095378374979e-05, "loss": 0.3111, "step": 29395 }, { "epoch": 3.4281716417910446, "grad_norm": 0.48579038471627284, "learning_rate": 1.6102056503044115e-05, "loss": 0.3342, "step": 29400 }, { "epoch": 3.4287546641791042, "grad_norm": 0.4680627208635204, "learning_rate": 1.6094576863355338e-05, "loss": 0.347, "step": 29405 }, { "epoch": 3.4293376865671643, "grad_norm": 0.4452049869579787, "learning_rate": 1.6087098919543696e-05, "loss": 0.3125, "step": 29410 }, { "epoch": 3.429920708955224, "grad_norm": 0.4465235262099271, "learning_rate": 1.6079622672721093e-05, "loss": 0.3431, "step": 29415 }, { "epoch": 3.4305037313432836, "grad_norm": 0.46229598497620156, "learning_rate": 1.6072148123999182e-05, "loss": 0.3264, "step": 29420 }, { "epoch": 3.431086753731343, "grad_norm": 0.45152214672311974, "learning_rate": 1.6064675274489348e-05, "loss": 0.3378, "step": 29425 }, { "epoch": 3.431669776119403, "grad_norm": 0.4256730359566115, "learning_rate": 1.605720412530274e-05, "loss": 0.3107, "step": 29430 }, { "epoch": 3.432252798507463, "grad_norm": 0.47213526214375995, "learning_rate": 1.6049734677550243e-05, "loss": 0.3287, "step": 29435 }, { "epoch": 3.4328358208955225, "grad_norm": 0.5274518664129215, "learning_rate": 1.6042266932342498e-05, "loss": 0.3304, "step": 29440 }, { "epoch": 3.433418843283582, "grad_norm": 0.4606776918727309, "learning_rate": 1.6034800890789874e-05, "loss": 0.3319, "step": 29445 }, { "epoch": 3.434001865671642, "grad_norm": 0.4743678258705525, "learning_rate": 1.6027336554002512e-05, "loss": 0.328, "step": 29450 }, { "epoch": 3.4345848880597014, "grad_norm": 0.47545050039761877, "learning_rate": 1.6019873923090277e-05, "loss": 0.3182, "step": 29455 }, { "epoch": 3.435167910447761, "grad_norm": 0.4523005954933371, "learning_rate": 1.60124129991628e-05, "loss": 0.3239, "step": 29460 }, { "epoch": 3.4357509328358207, "grad_norm": 0.43291743277189226, "learning_rate": 1.600495378332944e-05, "loss": 0.3173, "step": 29465 }, { "epoch": 3.4363339552238807, "grad_norm": 0.4127627299113383, "learning_rate": 1.599749627669933e-05, "loss": 0.3095, "step": 29470 }, { "epoch": 3.4369169776119404, "grad_norm": 0.467592471972227, "learning_rate": 1.59900404803813e-05, "loss": 0.3324, "step": 29475 }, { "epoch": 3.4375, "grad_norm": 0.44253230695768114, "learning_rate": 1.5982586395483983e-05, "loss": 0.3379, "step": 29480 }, { "epoch": 3.4380830223880596, "grad_norm": 0.4901313291509628, "learning_rate": 1.5975134023115707e-05, "loss": 0.3136, "step": 29485 }, { "epoch": 3.4386660447761193, "grad_norm": 0.46239650748248434, "learning_rate": 1.5967683364384595e-05, "loss": 0.327, "step": 29490 }, { "epoch": 3.4392490671641793, "grad_norm": 0.43061478261902475, "learning_rate": 1.5960234420398464e-05, "loss": 0.3255, "step": 29495 }, { "epoch": 3.439832089552239, "grad_norm": 0.4798479801501889, "learning_rate": 1.595278719226491e-05, "loss": 0.3261, "step": 29500 }, { "epoch": 3.4404151119402986, "grad_norm": 0.49377906906887215, "learning_rate": 1.5945341681091268e-05, "loss": 0.3273, "step": 29505 }, { "epoch": 3.440998134328358, "grad_norm": 0.42993794672775065, "learning_rate": 1.5937897887984605e-05, "loss": 0.3354, "step": 29510 }, { "epoch": 3.441581156716418, "grad_norm": 0.469948013542681, "learning_rate": 1.593045581405175e-05, "loss": 0.3409, "step": 29515 }, { "epoch": 3.4421641791044775, "grad_norm": 0.4391459720289155, "learning_rate": 1.5923015460399277e-05, "loss": 0.3304, "step": 29520 }, { "epoch": 3.442747201492537, "grad_norm": 0.47788552398134604, "learning_rate": 1.5915576828133462e-05, "loss": 0.3184, "step": 29525 }, { "epoch": 3.443330223880597, "grad_norm": 0.45281405309650546, "learning_rate": 1.59081399183604e-05, "loss": 0.3406, "step": 29530 }, { "epoch": 3.443913246268657, "grad_norm": 0.49233220368932884, "learning_rate": 1.590070473218585e-05, "loss": 0.3513, "step": 29535 }, { "epoch": 3.4444962686567164, "grad_norm": 0.4282769663966438, "learning_rate": 1.589327127071539e-05, "loss": 0.33, "step": 29540 }, { "epoch": 3.445079291044776, "grad_norm": 0.4549963003710725, "learning_rate": 1.5885839535054277e-05, "loss": 0.3145, "step": 29545 }, { "epoch": 3.4456623134328357, "grad_norm": 0.4561056686906441, "learning_rate": 1.587840952630755e-05, "loss": 0.3257, "step": 29550 }, { "epoch": 3.4462453358208958, "grad_norm": 0.5272509953017616, "learning_rate": 1.587098124557997e-05, "loss": 0.3249, "step": 29555 }, { "epoch": 3.4468283582089554, "grad_norm": 0.46858598486506753, "learning_rate": 1.5863554693976065e-05, "loss": 0.339, "step": 29560 }, { "epoch": 3.447411380597015, "grad_norm": 0.47203929165066233, "learning_rate": 1.5856129872600083e-05, "loss": 0.3391, "step": 29565 }, { "epoch": 3.4479944029850746, "grad_norm": 0.4713804789655109, "learning_rate": 1.584870678255604e-05, "loss": 0.3143, "step": 29570 }, { "epoch": 3.4485774253731343, "grad_norm": 0.4645038934380423, "learning_rate": 1.5841285424947643e-05, "loss": 0.3246, "step": 29575 }, { "epoch": 3.449160447761194, "grad_norm": 0.44099698390558617, "learning_rate": 1.5833865800878422e-05, "loss": 0.348, "step": 29580 }, { "epoch": 3.4497434701492535, "grad_norm": 0.4629400844409978, "learning_rate": 1.5826447911451568e-05, "loss": 0.3366, "step": 29585 }, { "epoch": 3.4503264925373136, "grad_norm": 0.44877295900291836, "learning_rate": 1.5819031757770064e-05, "loss": 0.3243, "step": 29590 }, { "epoch": 3.4509095149253732, "grad_norm": 0.4761971918320478, "learning_rate": 1.5811617340936622e-05, "loss": 0.3347, "step": 29595 }, { "epoch": 3.451492537313433, "grad_norm": 0.47429383320207846, "learning_rate": 1.580420466205369e-05, "loss": 0.3327, "step": 29600 }, { "epoch": 3.4520755597014925, "grad_norm": 0.4325222611319617, "learning_rate": 1.5796793722223464e-05, "loss": 0.3084, "step": 29605 }, { "epoch": 3.452658582089552, "grad_norm": 0.49804279155651243, "learning_rate": 1.5789384522547888e-05, "loss": 0.3529, "step": 29610 }, { "epoch": 3.4532416044776117, "grad_norm": 0.4403294750127433, "learning_rate": 1.5781977064128627e-05, "loss": 0.3129, "step": 29615 }, { "epoch": 3.453824626865672, "grad_norm": 0.466385874116567, "learning_rate": 1.577457134806711e-05, "loss": 0.3067, "step": 29620 }, { "epoch": 3.4544076492537314, "grad_norm": 0.4554909670671076, "learning_rate": 1.5767167375464488e-05, "loss": 0.3243, "step": 29625 }, { "epoch": 3.454990671641791, "grad_norm": 0.47142982851626164, "learning_rate": 1.5759765147421658e-05, "loss": 0.3299, "step": 29630 }, { "epoch": 3.4555736940298507, "grad_norm": 0.4820811528926111, "learning_rate": 1.5752364665039265e-05, "loss": 0.3243, "step": 29635 }, { "epoch": 3.4561567164179103, "grad_norm": 0.44116226153905386, "learning_rate": 1.5744965929417693e-05, "loss": 0.3171, "step": 29640 }, { "epoch": 3.45673973880597, "grad_norm": 0.43959800142075794, "learning_rate": 1.5737568941657053e-05, "loss": 0.3121, "step": 29645 }, { "epoch": 3.45732276119403, "grad_norm": 0.4270239335081458, "learning_rate": 1.573017370285722e-05, "loss": 0.3207, "step": 29650 }, { "epoch": 3.4579057835820897, "grad_norm": 0.4715974742793349, "learning_rate": 1.57227802141178e-05, "loss": 0.3356, "step": 29655 }, { "epoch": 3.4584888059701493, "grad_norm": 0.46583689868027905, "learning_rate": 1.57153884765381e-05, "loss": 0.3284, "step": 29660 }, { "epoch": 3.459071828358209, "grad_norm": 0.45235961163983923, "learning_rate": 1.570799849121724e-05, "loss": 0.3299, "step": 29665 }, { "epoch": 3.4596548507462686, "grad_norm": 0.45464411904765856, "learning_rate": 1.5700610259254018e-05, "loss": 0.3324, "step": 29670 }, { "epoch": 3.460237873134328, "grad_norm": 0.4450421944133654, "learning_rate": 1.5693223781746996e-05, "loss": 0.3334, "step": 29675 }, { "epoch": 3.4608208955223883, "grad_norm": 0.4594034531240603, "learning_rate": 1.5685839059794476e-05, "loss": 0.3229, "step": 29680 }, { "epoch": 3.461403917910448, "grad_norm": 0.41895290725439016, "learning_rate": 1.5678456094494493e-05, "loss": 0.31, "step": 29685 }, { "epoch": 3.4619869402985075, "grad_norm": 0.44935042825201343, "learning_rate": 1.5671074886944823e-05, "loss": 0.3257, "step": 29690 }, { "epoch": 3.462569962686567, "grad_norm": 0.46514451343414803, "learning_rate": 1.5663695438242992e-05, "loss": 0.3295, "step": 29695 }, { "epoch": 3.4631529850746268, "grad_norm": 0.4602897596220332, "learning_rate": 1.5656317749486225e-05, "loss": 0.325, "step": 29700 }, { "epoch": 3.4637360074626864, "grad_norm": 0.47791320148340927, "learning_rate": 1.5648941821771552e-05, "loss": 0.3124, "step": 29705 }, { "epoch": 3.4643190298507465, "grad_norm": 0.4429455185747638, "learning_rate": 1.5641567656195664e-05, "loss": 0.3463, "step": 29710 }, { "epoch": 3.464902052238806, "grad_norm": 0.5332490658060334, "learning_rate": 1.563419525385507e-05, "loss": 0.3343, "step": 29715 }, { "epoch": 3.4654850746268657, "grad_norm": 0.4872252237806146, "learning_rate": 1.562682461584594e-05, "loss": 0.3534, "step": 29720 }, { "epoch": 3.4660680970149254, "grad_norm": 0.4136662662407533, "learning_rate": 1.5619455743264227e-05, "loss": 0.334, "step": 29725 }, { "epoch": 3.466651119402985, "grad_norm": 0.4599090427951163, "learning_rate": 1.561208863720562e-05, "loss": 0.3183, "step": 29730 }, { "epoch": 3.4672341417910446, "grad_norm": 0.4915351526363331, "learning_rate": 1.5604723298765532e-05, "loss": 0.3139, "step": 29735 }, { "epoch": 3.4678171641791042, "grad_norm": 0.5549869449077802, "learning_rate": 1.559735972903912e-05, "loss": 0.3345, "step": 29740 }, { "epoch": 3.4684001865671643, "grad_norm": 0.4140216835519108, "learning_rate": 1.5589997929121282e-05, "loss": 0.3321, "step": 29745 }, { "epoch": 3.468983208955224, "grad_norm": 0.4644103030378211, "learning_rate": 1.5582637900106622e-05, "loss": 0.3338, "step": 29750 }, { "epoch": 3.4695662313432836, "grad_norm": 0.4392730181361793, "learning_rate": 1.557527964308954e-05, "loss": 0.3445, "step": 29755 }, { "epoch": 3.470149253731343, "grad_norm": 0.46176465643720466, "learning_rate": 1.5567923159164108e-05, "loss": 0.3208, "step": 29760 }, { "epoch": 3.470732276119403, "grad_norm": 0.4878151437183506, "learning_rate": 1.5560568449424197e-05, "loss": 0.336, "step": 29765 }, { "epoch": 3.471315298507463, "grad_norm": 0.5059817850100247, "learning_rate": 1.555321551496335e-05, "loss": 0.3379, "step": 29770 }, { "epoch": 3.4718983208955225, "grad_norm": 0.4449441568580068, "learning_rate": 1.5545864356874894e-05, "loss": 0.3694, "step": 29775 }, { "epoch": 3.472481343283582, "grad_norm": 0.4667909260521094, "learning_rate": 1.553851497625187e-05, "loss": 0.3264, "step": 29780 }, { "epoch": 3.473064365671642, "grad_norm": 0.4646907193799085, "learning_rate": 1.5531167374187068e-05, "loss": 0.3064, "step": 29785 }, { "epoch": 3.4736473880597014, "grad_norm": 0.4966638032944251, "learning_rate": 1.5523821551773006e-05, "loss": 0.3505, "step": 29790 }, { "epoch": 3.474230410447761, "grad_norm": 0.46284592261560986, "learning_rate": 1.5516477510101933e-05, "loss": 0.3303, "step": 29795 }, { "epoch": 3.4748134328358207, "grad_norm": 0.459641710190575, "learning_rate": 1.5509135250265835e-05, "loss": 0.3198, "step": 29800 }, { "epoch": 3.4753964552238807, "grad_norm": 0.41530490147794286, "learning_rate": 1.5501794773356437e-05, "loss": 0.3179, "step": 29805 }, { "epoch": 3.4759794776119404, "grad_norm": 0.4827602927154679, "learning_rate": 1.5494456080465198e-05, "loss": 0.3173, "step": 29810 }, { "epoch": 3.4765625, "grad_norm": 0.45721814461824417, "learning_rate": 1.548711917268331e-05, "loss": 0.3362, "step": 29815 }, { "epoch": 3.4771455223880596, "grad_norm": 0.4500721077058827, "learning_rate": 1.547978405110171e-05, "loss": 0.3157, "step": 29820 }, { "epoch": 3.4777285447761193, "grad_norm": 0.43683248938066466, "learning_rate": 1.5472450716811052e-05, "loss": 0.3502, "step": 29825 }, { "epoch": 3.4783115671641793, "grad_norm": 0.44719437684965135, "learning_rate": 1.5465119170901742e-05, "loss": 0.3132, "step": 29830 }, { "epoch": 3.478894589552239, "grad_norm": 0.468349601267782, "learning_rate": 1.5457789414463886e-05, "loss": 0.3233, "step": 29835 }, { "epoch": 3.4794776119402986, "grad_norm": 0.44337347085310647, "learning_rate": 1.545046144858738e-05, "loss": 0.3499, "step": 29840 }, { "epoch": 3.480060634328358, "grad_norm": 0.4269928071599079, "learning_rate": 1.5443135274361792e-05, "loss": 0.3156, "step": 29845 }, { "epoch": 3.480643656716418, "grad_norm": 0.5329365590231341, "learning_rate": 1.543581089287649e-05, "loss": 0.3273, "step": 29850 }, { "epoch": 3.4812266791044775, "grad_norm": 0.4883476737585097, "learning_rate": 1.5428488305220506e-05, "loss": 0.3291, "step": 29855 }, { "epoch": 3.481809701492537, "grad_norm": 0.4440475681494304, "learning_rate": 1.5421167512482655e-05, "loss": 0.3462, "step": 29860 }, { "epoch": 3.482392723880597, "grad_norm": 0.5066557853889835, "learning_rate": 1.5413848515751463e-05, "loss": 0.3184, "step": 29865 }, { "epoch": 3.482975746268657, "grad_norm": 0.45645957901086526, "learning_rate": 1.5406531316115197e-05, "loss": 0.3312, "step": 29870 }, { "epoch": 3.4835587686567164, "grad_norm": 0.421373702662619, "learning_rate": 1.539921591466185e-05, "loss": 0.3092, "step": 29875 }, { "epoch": 3.484141791044776, "grad_norm": 0.4782413018340401, "learning_rate": 1.539190231247917e-05, "loss": 0.3322, "step": 29880 }, { "epoch": 3.4847248134328357, "grad_norm": 0.43459425670708246, "learning_rate": 1.5384590510654596e-05, "loss": 0.3197, "step": 29885 }, { "epoch": 3.4853078358208958, "grad_norm": 0.45425958228353447, "learning_rate": 1.5377280510275342e-05, "loss": 0.3108, "step": 29890 }, { "epoch": 3.4858908582089554, "grad_norm": 0.4905339271792473, "learning_rate": 1.536997231242831e-05, "loss": 0.3283, "step": 29895 }, { "epoch": 3.486473880597015, "grad_norm": 0.4527586055954125, "learning_rate": 1.5362665918200193e-05, "loss": 0.3278, "step": 29900 }, { "epoch": 3.4870569029850746, "grad_norm": 0.4509179327793743, "learning_rate": 1.5355361328677358e-05, "loss": 0.3379, "step": 29905 }, { "epoch": 3.4876399253731343, "grad_norm": 0.4625325556708729, "learning_rate": 1.534805854494593e-05, "loss": 0.3269, "step": 29910 }, { "epoch": 3.488222947761194, "grad_norm": 0.4589192547153669, "learning_rate": 1.5340757568091767e-05, "loss": 0.3247, "step": 29915 }, { "epoch": 3.4888059701492535, "grad_norm": 0.461786600897186, "learning_rate": 1.533345839920045e-05, "loss": 0.3238, "step": 29920 }, { "epoch": 3.4893889925373136, "grad_norm": 0.4643999625352688, "learning_rate": 1.5326161039357307e-05, "loss": 0.3206, "step": 29925 }, { "epoch": 3.4899720149253732, "grad_norm": 0.4475201148208013, "learning_rate": 1.5318865489647383e-05, "loss": 0.3264, "step": 29930 }, { "epoch": 3.490555037313433, "grad_norm": 0.43609363372306625, "learning_rate": 1.531157175115543e-05, "loss": 0.3342, "step": 29935 }, { "epoch": 3.4911380597014925, "grad_norm": 0.4828681148141982, "learning_rate": 1.5304279824966e-05, "loss": 0.3253, "step": 29940 }, { "epoch": 3.491721082089552, "grad_norm": 0.47411229592868015, "learning_rate": 1.5296989712163302e-05, "loss": 0.3472, "step": 29945 }, { "epoch": 3.4923041044776117, "grad_norm": 0.473296964249336, "learning_rate": 1.528970141383131e-05, "loss": 0.3473, "step": 29950 }, { "epoch": 3.492887126865672, "grad_norm": 0.442274355032023, "learning_rate": 1.528241493105373e-05, "loss": 0.3314, "step": 29955 }, { "epoch": 3.4934701492537314, "grad_norm": 0.4745079198954923, "learning_rate": 1.5275130264913994e-05, "loss": 0.3317, "step": 29960 }, { "epoch": 3.494053171641791, "grad_norm": 0.4150336984853833, "learning_rate": 1.5267847416495257e-05, "loss": 0.3329, "step": 29965 }, { "epoch": 3.4946361940298507, "grad_norm": 0.4879438571232143, "learning_rate": 1.5260566386880413e-05, "loss": 0.3257, "step": 29970 }, { "epoch": 3.4952192164179103, "grad_norm": 0.46425872436194615, "learning_rate": 1.5253287177152082e-05, "loss": 0.3226, "step": 29975 }, { "epoch": 3.49580223880597, "grad_norm": 0.46554802973905623, "learning_rate": 1.5246009788392606e-05, "loss": 0.3133, "step": 29980 }, { "epoch": 3.49638526119403, "grad_norm": 0.44823348537045765, "learning_rate": 1.5238734221684065e-05, "loss": 0.3392, "step": 29985 }, { "epoch": 3.4969682835820897, "grad_norm": 0.5319361981527446, "learning_rate": 1.5231460478108268e-05, "loss": 0.3351, "step": 29990 }, { "epoch": 3.4975513059701493, "grad_norm": 0.4458190794231886, "learning_rate": 1.5224188558746749e-05, "loss": 0.3111, "step": 29995 }, { "epoch": 3.498134328358209, "grad_norm": 0.4677344169174092, "learning_rate": 1.5216918464680776e-05, "loss": 0.3488, "step": 30000 }, { "epoch": 3.4987173507462686, "grad_norm": 0.47493244189895345, "learning_rate": 1.5209650196991352e-05, "loss": 0.3187, "step": 30005 }, { "epoch": 3.499300373134328, "grad_norm": 0.427928725696231, "learning_rate": 1.520238375675917e-05, "loss": 0.3145, "step": 30010 }, { "epoch": 3.4998833955223883, "grad_norm": 0.479999978785708, "learning_rate": 1.5195119145064712e-05, "loss": 0.3459, "step": 30015 }, { "epoch": 3.500466417910448, "grad_norm": 0.5049413499590955, "learning_rate": 1.5187856362988123e-05, "loss": 0.334, "step": 30020 }, { "epoch": 3.5010494402985075, "grad_norm": 0.4606207617182415, "learning_rate": 1.5180595411609347e-05, "loss": 0.3508, "step": 30025 }, { "epoch": 3.501632462686567, "grad_norm": 0.4245892946045796, "learning_rate": 1.5173336292007994e-05, "loss": 0.3256, "step": 30030 }, { "epoch": 3.5022154850746268, "grad_norm": 0.4788508571711781, "learning_rate": 1.5166079005263425e-05, "loss": 0.3323, "step": 30035 }, { "epoch": 3.5027985074626864, "grad_norm": 0.453151954371947, "learning_rate": 1.5158823552454737e-05, "loss": 0.3138, "step": 30040 }, { "epoch": 3.5033815298507465, "grad_norm": 0.46792014684429817, "learning_rate": 1.5151569934660743e-05, "loss": 0.3496, "step": 30045 }, { "epoch": 3.503964552238806, "grad_norm": 0.452465615558095, "learning_rate": 1.5144318152959985e-05, "loss": 0.3266, "step": 30050 }, { "epoch": 3.5045475746268657, "grad_norm": 0.4647037726873906, "learning_rate": 1.513706820843075e-05, "loss": 0.313, "step": 30055 }, { "epoch": 3.5051305970149254, "grad_norm": 0.45918369004542436, "learning_rate": 1.5129820102151e-05, "loss": 0.335, "step": 30060 }, { "epoch": 3.505713619402985, "grad_norm": 0.46661663913134543, "learning_rate": 1.5122573835198495e-05, "loss": 0.3286, "step": 30065 }, { "epoch": 3.5062966417910446, "grad_norm": 0.4441609496785673, "learning_rate": 1.5115329408650658e-05, "loss": 0.3148, "step": 30070 }, { "epoch": 3.5068796641791042, "grad_norm": 0.4603939051198401, "learning_rate": 1.5108086823584692e-05, "loss": 0.3215, "step": 30075 }, { "epoch": 3.5074626865671643, "grad_norm": 0.4531797376247531, "learning_rate": 1.5100846081077479e-05, "loss": 0.3264, "step": 30080 }, { "epoch": 3.508045708955224, "grad_norm": 0.4564771523303664, "learning_rate": 1.5093607182205652e-05, "loss": 0.3194, "step": 30085 }, { "epoch": 3.5086287313432836, "grad_norm": 0.4714967782602235, "learning_rate": 1.5086370128045574e-05, "loss": 0.3348, "step": 30090 }, { "epoch": 3.509211753731343, "grad_norm": 0.4945879123232498, "learning_rate": 1.5079134919673316e-05, "loss": 0.3488, "step": 30095 }, { "epoch": 3.509794776119403, "grad_norm": 0.4529553874663103, "learning_rate": 1.5071901558164692e-05, "loss": 0.3291, "step": 30100 }, { "epoch": 3.510377798507463, "grad_norm": 0.4340529939580081, "learning_rate": 1.5064670044595236e-05, "loss": 0.3111, "step": 30105 }, { "epoch": 3.5109608208955225, "grad_norm": 0.4640821458973144, "learning_rate": 1.5057440380040184e-05, "loss": 0.3219, "step": 30110 }, { "epoch": 3.511543843283582, "grad_norm": 0.4273474161407204, "learning_rate": 1.5050212565574551e-05, "loss": 0.3287, "step": 30115 }, { "epoch": 3.512126865671642, "grad_norm": 0.4766857444880997, "learning_rate": 1.5042986602273017e-05, "loss": 0.3351, "step": 30120 }, { "epoch": 3.5127098880597014, "grad_norm": 0.5169461119406662, "learning_rate": 1.503576249121002e-05, "loss": 0.3506, "step": 30125 }, { "epoch": 3.513292910447761, "grad_norm": 0.45750200079140124, "learning_rate": 1.502854023345972e-05, "loss": 0.3167, "step": 30130 }, { "epoch": 3.5138759328358207, "grad_norm": 2.7689476172114067, "learning_rate": 1.5021319830095995e-05, "loss": 0.3211, "step": 30135 }, { "epoch": 3.5144589552238807, "grad_norm": 0.45714542015338194, "learning_rate": 1.5014101282192452e-05, "loss": 0.3313, "step": 30140 }, { "epoch": 3.5150419776119404, "grad_norm": 0.5064495487185795, "learning_rate": 1.5006884590822418e-05, "loss": 0.3443, "step": 30145 }, { "epoch": 3.515625, "grad_norm": 0.4677117280305794, "learning_rate": 1.4999669757058956e-05, "loss": 0.3397, "step": 30150 }, { "epoch": 3.5162080223880596, "grad_norm": 0.4425881887654731, "learning_rate": 1.4992456781974826e-05, "loss": 0.3244, "step": 30155 }, { "epoch": 3.5167910447761193, "grad_norm": 0.44083925066694307, "learning_rate": 1.498524566664253e-05, "loss": 0.3022, "step": 30160 }, { "epoch": 3.5173740671641793, "grad_norm": 0.4348221106452468, "learning_rate": 1.4978036412134305e-05, "loss": 0.3084, "step": 30165 }, { "epoch": 3.517957089552239, "grad_norm": 0.4275335750701321, "learning_rate": 1.4970829019522083e-05, "loss": 0.3135, "step": 30170 }, { "epoch": 3.5185401119402986, "grad_norm": 0.4693869970062322, "learning_rate": 1.4963623489877546e-05, "loss": 0.3225, "step": 30175 }, { "epoch": 3.519123134328358, "grad_norm": 0.46492413750613176, "learning_rate": 1.4956419824272083e-05, "loss": 0.3391, "step": 30180 }, { "epoch": 3.519706156716418, "grad_norm": 0.48108138015879837, "learning_rate": 1.4949218023776806e-05, "loss": 0.3229, "step": 30185 }, { "epoch": 3.5202891791044775, "grad_norm": 0.48711724947857066, "learning_rate": 1.4942018089462567e-05, "loss": 0.3317, "step": 30190 }, { "epoch": 3.520872201492537, "grad_norm": 0.4570418012517138, "learning_rate": 1.4934820022399901e-05, "loss": 0.3409, "step": 30195 }, { "epoch": 3.521455223880597, "grad_norm": 0.469818870151866, "learning_rate": 1.4927623823659126e-05, "loss": 0.3076, "step": 30200 }, { "epoch": 3.522038246268657, "grad_norm": 0.4428288836390382, "learning_rate": 1.4920429494310223e-05, "loss": 0.3203, "step": 30205 }, { "epoch": 3.5226212686567164, "grad_norm": 0.4896116815348177, "learning_rate": 1.4913237035422926e-05, "loss": 0.3407, "step": 30210 }, { "epoch": 3.523204291044776, "grad_norm": 0.44644334194840807, "learning_rate": 1.4906046448066685e-05, "loss": 0.3107, "step": 30215 }, { "epoch": 3.5237873134328357, "grad_norm": 0.44030758438026024, "learning_rate": 1.4898857733310673e-05, "loss": 0.3453, "step": 30220 }, { "epoch": 3.5243703358208958, "grad_norm": 0.45797676287323474, "learning_rate": 1.4891670892223781e-05, "loss": 0.3245, "step": 30225 }, { "epoch": 3.5249533582089554, "grad_norm": 0.49234730565533935, "learning_rate": 1.4884485925874634e-05, "loss": 0.3398, "step": 30230 }, { "epoch": 3.525536380597015, "grad_norm": 0.47360282632648765, "learning_rate": 1.487730283533154e-05, "loss": 0.3264, "step": 30235 }, { "epoch": 3.5261194029850746, "grad_norm": 0.615572611208453, "learning_rate": 1.4870121621662594e-05, "loss": 0.3528, "step": 30240 }, { "epoch": 3.5267024253731343, "grad_norm": 0.4720230725004802, "learning_rate": 1.4862942285935536e-05, "loss": 0.3225, "step": 30245 }, { "epoch": 3.527285447761194, "grad_norm": 0.48480881634838957, "learning_rate": 1.4855764829217894e-05, "loss": 0.3283, "step": 30250 }, { "epoch": 3.5278684701492535, "grad_norm": 0.46537338033193987, "learning_rate": 1.4848589252576872e-05, "loss": 0.3295, "step": 30255 }, { "epoch": 3.5284514925373136, "grad_norm": 0.4932477814602127, "learning_rate": 1.4841415557079413e-05, "loss": 0.3148, "step": 30260 }, { "epoch": 3.5290345149253732, "grad_norm": 0.48945859058918917, "learning_rate": 1.4834243743792176e-05, "loss": 0.3484, "step": 30265 }, { "epoch": 3.529617537313433, "grad_norm": 0.4515938381967078, "learning_rate": 1.482707381378154e-05, "loss": 0.3408, "step": 30270 }, { "epoch": 3.5302005597014925, "grad_norm": 0.44603366316159504, "learning_rate": 1.4819905768113604e-05, "loss": 0.3345, "step": 30275 }, { "epoch": 3.530783582089552, "grad_norm": 0.4716672497914246, "learning_rate": 1.4812739607854199e-05, "loss": 0.3315, "step": 30280 }, { "epoch": 3.531366604477612, "grad_norm": 0.456842676782559, "learning_rate": 1.4805575334068842e-05, "loss": 0.3132, "step": 30285 }, { "epoch": 3.5319496268656714, "grad_norm": 0.4483232566313984, "learning_rate": 1.479841294782282e-05, "loss": 0.3102, "step": 30290 }, { "epoch": 3.5325326492537314, "grad_norm": 0.43596943271537114, "learning_rate": 1.4791252450181081e-05, "loss": 0.3397, "step": 30295 }, { "epoch": 3.533115671641791, "grad_norm": 0.4586826391998859, "learning_rate": 1.4784093842208351e-05, "loss": 0.315, "step": 30300 }, { "epoch": 3.5336986940298507, "grad_norm": 0.4916541342786979, "learning_rate": 1.4776937124969026e-05, "loss": 0.3592, "step": 30305 }, { "epoch": 3.5342817164179103, "grad_norm": 0.46179133577272885, "learning_rate": 1.4769782299527252e-05, "loss": 0.3102, "step": 30310 }, { "epoch": 3.53486473880597, "grad_norm": 0.4340595619890763, "learning_rate": 1.4762629366946876e-05, "loss": 0.3209, "step": 30315 }, { "epoch": 3.53544776119403, "grad_norm": 0.4743104374022143, "learning_rate": 1.4755478328291476e-05, "loss": 0.3168, "step": 30320 }, { "epoch": 3.5360307835820897, "grad_norm": 0.4791837885523034, "learning_rate": 1.4748329184624341e-05, "loss": 0.3032, "step": 30325 }, { "epoch": 3.5366138059701493, "grad_norm": 0.4803759644825129, "learning_rate": 1.4741181937008485e-05, "loss": 0.3191, "step": 30330 }, { "epoch": 3.537196828358209, "grad_norm": 0.45903502528960044, "learning_rate": 1.4734036586506628e-05, "loss": 0.3203, "step": 30335 }, { "epoch": 3.5377798507462686, "grad_norm": 0.48116858842136384, "learning_rate": 1.4726893134181214e-05, "loss": 0.3293, "step": 30340 }, { "epoch": 3.5383628731343286, "grad_norm": 0.44050830677185454, "learning_rate": 1.4719751581094409e-05, "loss": 0.3323, "step": 30345 }, { "epoch": 3.538945895522388, "grad_norm": 0.4587786175505095, "learning_rate": 1.4712611928308095e-05, "loss": 0.3318, "step": 30350 }, { "epoch": 3.539528917910448, "grad_norm": 0.5043457729237092, "learning_rate": 1.4705474176883865e-05, "loss": 0.3353, "step": 30355 }, { "epoch": 3.5401119402985075, "grad_norm": 0.4629321792633427, "learning_rate": 1.4698338327883044e-05, "loss": 0.3162, "step": 30360 }, { "epoch": 3.540694962686567, "grad_norm": 0.4791692020631821, "learning_rate": 1.4691204382366666e-05, "loss": 0.3339, "step": 30365 }, { "epoch": 3.5412779850746268, "grad_norm": 0.4579125183709961, "learning_rate": 1.4684072341395454e-05, "loss": 0.3167, "step": 30370 }, { "epoch": 3.5418610074626864, "grad_norm": 0.42304258163289626, "learning_rate": 1.4676942206029909e-05, "loss": 0.3082, "step": 30375 }, { "epoch": 3.5424440298507465, "grad_norm": 0.4724063661286411, "learning_rate": 1.4669813977330193e-05, "loss": 0.3227, "step": 30380 }, { "epoch": 3.543027052238806, "grad_norm": 0.4686993826196236, "learning_rate": 1.466268765635621e-05, "loss": 0.3192, "step": 30385 }, { "epoch": 3.5436100746268657, "grad_norm": 0.453485392281836, "learning_rate": 1.4655563244167572e-05, "loss": 0.3116, "step": 30390 }, { "epoch": 3.5441930970149254, "grad_norm": 0.4519716041384925, "learning_rate": 1.4648440741823616e-05, "loss": 0.3285, "step": 30395 }, { "epoch": 3.544776119402985, "grad_norm": 0.4876679115355501, "learning_rate": 1.4641320150383391e-05, "loss": 0.3209, "step": 30400 }, { "epoch": 3.5453591417910446, "grad_norm": 0.46832032629958203, "learning_rate": 1.4634201470905665e-05, "loss": 0.3363, "step": 30405 }, { "epoch": 3.5459421641791042, "grad_norm": 0.4133999250795664, "learning_rate": 1.4627084704448895e-05, "loss": 0.3084, "step": 30410 }, { "epoch": 3.5465251865671643, "grad_norm": 0.5371970916230431, "learning_rate": 1.4619969852071305e-05, "loss": 0.3281, "step": 30415 }, { "epoch": 3.547108208955224, "grad_norm": 0.490413599502768, "learning_rate": 1.461285691483078e-05, "loss": 0.3253, "step": 30420 }, { "epoch": 3.5476912313432836, "grad_norm": 0.5005322929727773, "learning_rate": 1.4605745893784973e-05, "loss": 0.3389, "step": 30425 }, { "epoch": 3.548274253731343, "grad_norm": 0.4313245945801489, "learning_rate": 1.4598636789991199e-05, "loss": 0.328, "step": 30430 }, { "epoch": 3.548857276119403, "grad_norm": 0.4674055972639654, "learning_rate": 1.4591529604506527e-05, "loss": 0.3295, "step": 30435 }, { "epoch": 3.549440298507463, "grad_norm": 0.4098801845115704, "learning_rate": 1.458442433838772e-05, "loss": 0.3314, "step": 30440 }, { "epoch": 3.5500233208955225, "grad_norm": 0.4410726552405741, "learning_rate": 1.4577320992691269e-05, "loss": 0.3223, "step": 30445 }, { "epoch": 3.550606343283582, "grad_norm": 0.4652104596232166, "learning_rate": 1.4570219568473376e-05, "loss": 0.3189, "step": 30450 }, { "epoch": 3.551189365671642, "grad_norm": 0.45065671865836054, "learning_rate": 1.4563120066789953e-05, "loss": 0.3268, "step": 30455 }, { "epoch": 3.5517723880597014, "grad_norm": 0.47720768682642084, "learning_rate": 1.4556022488696614e-05, "loss": 0.3201, "step": 30460 }, { "epoch": 3.552355410447761, "grad_norm": 0.46531219747261804, "learning_rate": 1.4548926835248722e-05, "loss": 0.3165, "step": 30465 }, { "epoch": 3.5529384328358207, "grad_norm": 0.4644734524214457, "learning_rate": 1.4541833107501312e-05, "loss": 0.3441, "step": 30470 }, { "epoch": 3.5535214552238807, "grad_norm": 0.5028933687869508, "learning_rate": 1.453474130650918e-05, "loss": 0.3354, "step": 30475 }, { "epoch": 3.5541044776119404, "grad_norm": 0.44612356750149024, "learning_rate": 1.4527651433326786e-05, "loss": 0.3283, "step": 30480 }, { "epoch": 3.5546875, "grad_norm": 0.4747894134423976, "learning_rate": 1.4520563489008333e-05, "loss": 0.3125, "step": 30485 }, { "epoch": 3.5552705223880596, "grad_norm": 0.48242015364768864, "learning_rate": 1.4513477474607729e-05, "loss": 0.3382, "step": 30490 }, { "epoch": 3.5558535447761193, "grad_norm": 0.459976643763842, "learning_rate": 1.4506393391178597e-05, "loss": 0.3309, "step": 30495 }, { "epoch": 3.5564365671641793, "grad_norm": 0.4353100122061999, "learning_rate": 1.4499311239774277e-05, "loss": 0.3443, "step": 30500 }, { "epoch": 3.557019589552239, "grad_norm": 0.4668148688982469, "learning_rate": 1.449223102144781e-05, "loss": 0.3356, "step": 30505 }, { "epoch": 3.5576026119402986, "grad_norm": 0.42776316532602726, "learning_rate": 1.4485152737251972e-05, "loss": 0.3319, "step": 30510 }, { "epoch": 3.558185634328358, "grad_norm": 0.532413465286466, "learning_rate": 1.4478076388239214e-05, "loss": 0.3419, "step": 30515 }, { "epoch": 3.558768656716418, "grad_norm": 0.4452158253276118, "learning_rate": 1.4471001975461735e-05, "loss": 0.34, "step": 30520 }, { "epoch": 3.5593516791044775, "grad_norm": 0.45648202388103803, "learning_rate": 1.4463929499971424e-05, "loss": 0.3331, "step": 30525 }, { "epoch": 3.559934701492537, "grad_norm": 0.45474354594715216, "learning_rate": 1.4456858962819897e-05, "loss": 0.3307, "step": 30530 }, { "epoch": 3.560517723880597, "grad_norm": 0.5143629576840156, "learning_rate": 1.4449790365058474e-05, "loss": 0.3213, "step": 30535 }, { "epoch": 3.561100746268657, "grad_norm": 0.48541349392452526, "learning_rate": 1.4442723707738199e-05, "loss": 0.3321, "step": 30540 }, { "epoch": 3.5616837686567164, "grad_norm": 0.4448177983046585, "learning_rate": 1.4435658991909784e-05, "loss": 0.3389, "step": 30545 }, { "epoch": 3.562266791044776, "grad_norm": 0.4373481608964463, "learning_rate": 1.4428596218623722e-05, "loss": 0.3381, "step": 30550 }, { "epoch": 3.5628498134328357, "grad_norm": 0.46052473260830273, "learning_rate": 1.4421535388930146e-05, "loss": 0.3417, "step": 30555 }, { "epoch": 3.5634328358208958, "grad_norm": 0.4926637056965854, "learning_rate": 1.4414476503878968e-05, "loss": 0.3301, "step": 30560 }, { "epoch": 3.5640158582089554, "grad_norm": 0.46336727427198904, "learning_rate": 1.440741956451975e-05, "loss": 0.3209, "step": 30565 }, { "epoch": 3.564598880597015, "grad_norm": 0.453847692957194, "learning_rate": 1.4400364571901803e-05, "loss": 0.3384, "step": 30570 }, { "epoch": 3.5651819029850746, "grad_norm": 0.4747549448739118, "learning_rate": 1.4393311527074132e-05, "loss": 0.3478, "step": 30575 }, { "epoch": 3.5657649253731343, "grad_norm": 0.4363988368659044, "learning_rate": 1.4386260431085457e-05, "loss": 0.3205, "step": 30580 }, { "epoch": 3.566347947761194, "grad_norm": 0.47010044342853596, "learning_rate": 1.4379211284984213e-05, "loss": 0.3349, "step": 30585 }, { "epoch": 3.5669309701492535, "grad_norm": 0.45336617538481033, "learning_rate": 1.4372164089818546e-05, "loss": 0.3463, "step": 30590 }, { "epoch": 3.5675139925373136, "grad_norm": 0.5103809496340458, "learning_rate": 1.4365118846636283e-05, "loss": 0.3386, "step": 30595 }, { "epoch": 3.5680970149253732, "grad_norm": 0.48231877924813954, "learning_rate": 1.4358075556485016e-05, "loss": 0.3322, "step": 30600 }, { "epoch": 3.568680037313433, "grad_norm": 0.4284399873001586, "learning_rate": 1.4351034220411983e-05, "loss": 0.3121, "step": 30605 }, { "epoch": 3.5692630597014925, "grad_norm": 0.4665428934088034, "learning_rate": 1.4343994839464192e-05, "loss": 0.3221, "step": 30610 }, { "epoch": 3.569846082089552, "grad_norm": 0.500284749218801, "learning_rate": 1.4336957414688313e-05, "loss": 0.3292, "step": 30615 }, { "epoch": 3.570429104477612, "grad_norm": 0.444873404022475, "learning_rate": 1.4329921947130748e-05, "loss": 0.308, "step": 30620 }, { "epoch": 3.5710121268656714, "grad_norm": 0.4377960289701257, "learning_rate": 1.4322888437837604e-05, "loss": 0.3139, "step": 30625 }, { "epoch": 3.5715951492537314, "grad_norm": 0.49744359940608707, "learning_rate": 1.43158568878547e-05, "loss": 0.3313, "step": 30630 }, { "epoch": 3.572178171641791, "grad_norm": 0.463954140641339, "learning_rate": 1.4308827298227551e-05, "loss": 0.3263, "step": 30635 }, { "epoch": 3.5727611940298507, "grad_norm": 0.45837969153942526, "learning_rate": 1.430179967000141e-05, "loss": 0.325, "step": 30640 }, { "epoch": 3.5733442164179103, "grad_norm": 0.451186575985876, "learning_rate": 1.4294774004221185e-05, "loss": 0.3294, "step": 30645 }, { "epoch": 3.57392723880597, "grad_norm": 0.4580687873676085, "learning_rate": 1.4287750301931557e-05, "loss": 0.3306, "step": 30650 }, { "epoch": 3.57451026119403, "grad_norm": 0.47140307322636815, "learning_rate": 1.4280728564176866e-05, "loss": 0.3389, "step": 30655 }, { "epoch": 3.5750932835820897, "grad_norm": 0.4410827472739807, "learning_rate": 1.4273708792001182e-05, "loss": 0.3164, "step": 30660 }, { "epoch": 3.5756763059701493, "grad_norm": 0.4440255101002687, "learning_rate": 1.4266690986448273e-05, "loss": 0.3394, "step": 30665 }, { "epoch": 3.576259328358209, "grad_norm": 0.5035070979999087, "learning_rate": 1.4259675148561627e-05, "loss": 0.3518, "step": 30670 }, { "epoch": 3.5768423507462686, "grad_norm": 0.454344166676145, "learning_rate": 1.4252661279384427e-05, "loss": 0.3319, "step": 30675 }, { "epoch": 3.5774253731343286, "grad_norm": 0.5376653647152443, "learning_rate": 1.424564937995957e-05, "loss": 0.3532, "step": 30680 }, { "epoch": 3.578008395522388, "grad_norm": 0.5173844403556396, "learning_rate": 1.4238639451329666e-05, "loss": 0.3395, "step": 30685 }, { "epoch": 3.578591417910448, "grad_norm": 0.47301287423528704, "learning_rate": 1.4231631494537007e-05, "loss": 0.3208, "step": 30690 }, { "epoch": 3.5791744402985075, "grad_norm": 0.4635506259497985, "learning_rate": 1.4224625510623618e-05, "loss": 0.3272, "step": 30695 }, { "epoch": 3.579757462686567, "grad_norm": 0.5085480217776777, "learning_rate": 1.4217621500631222e-05, "loss": 0.3359, "step": 30700 }, { "epoch": 3.5803404850746268, "grad_norm": 0.5790643628800386, "learning_rate": 1.4210619465601249e-05, "loss": 0.3283, "step": 30705 }, { "epoch": 3.5809235074626864, "grad_norm": 0.4351335248079688, "learning_rate": 1.4203619406574833e-05, "loss": 0.3152, "step": 30710 }, { "epoch": 3.5815065298507465, "grad_norm": 0.5269015701887055, "learning_rate": 1.4196621324592818e-05, "loss": 0.34, "step": 30715 }, { "epoch": 3.582089552238806, "grad_norm": 0.480888650874225, "learning_rate": 1.4189625220695746e-05, "loss": 0.3245, "step": 30720 }, { "epoch": 3.5826725746268657, "grad_norm": 0.41132331450331927, "learning_rate": 1.4182631095923884e-05, "loss": 0.3014, "step": 30725 }, { "epoch": 3.5832555970149254, "grad_norm": 0.4517273439773864, "learning_rate": 1.4175638951317166e-05, "loss": 0.3526, "step": 30730 }, { "epoch": 3.583838619402985, "grad_norm": 0.47586947349794534, "learning_rate": 1.4168648787915289e-05, "loss": 0.3388, "step": 30735 }, { "epoch": 3.5844216417910446, "grad_norm": 0.4399132035009382, "learning_rate": 1.41616606067576e-05, "loss": 0.3391, "step": 30740 }, { "epoch": 3.5850046641791042, "grad_norm": 0.4704910767583765, "learning_rate": 1.4154674408883179e-05, "loss": 0.324, "step": 30745 }, { "epoch": 3.5855876865671643, "grad_norm": 0.5182597288204652, "learning_rate": 1.4147690195330815e-05, "loss": 0.3237, "step": 30750 }, { "epoch": 3.586170708955224, "grad_norm": 0.4700820162789669, "learning_rate": 1.4140707967138983e-05, "loss": 0.3167, "step": 30755 }, { "epoch": 3.5867537313432836, "grad_norm": 0.5084414884924573, "learning_rate": 1.413372772534588e-05, "loss": 0.3284, "step": 30760 }, { "epoch": 3.587336753731343, "grad_norm": 0.4427609210397223, "learning_rate": 1.4126749470989408e-05, "loss": 0.349, "step": 30765 }, { "epoch": 3.587919776119403, "grad_norm": 0.45517836181530963, "learning_rate": 1.411977320510714e-05, "loss": 0.3343, "step": 30770 }, { "epoch": 3.588502798507463, "grad_norm": 0.48551626657276176, "learning_rate": 1.4112798928736415e-05, "loss": 0.3268, "step": 30775 }, { "epoch": 3.5890858208955225, "grad_norm": 0.465285706126102, "learning_rate": 1.410582664291421e-05, "loss": 0.3262, "step": 30780 }, { "epoch": 3.589668843283582, "grad_norm": 0.43831796135475604, "learning_rate": 1.4098856348677263e-05, "loss": 0.3371, "step": 30785 }, { "epoch": 3.590251865671642, "grad_norm": 0.4495296181918966, "learning_rate": 1.4091888047061974e-05, "loss": 0.3249, "step": 30790 }, { "epoch": 3.5908348880597014, "grad_norm": 0.4730025916124849, "learning_rate": 1.4084921739104465e-05, "loss": 0.3173, "step": 30795 }, { "epoch": 3.591417910447761, "grad_norm": 0.446813303352069, "learning_rate": 1.4077957425840563e-05, "loss": 0.3148, "step": 30800 }, { "epoch": 3.5920009328358207, "grad_norm": 0.43225037294348756, "learning_rate": 1.407099510830579e-05, "loss": 0.3097, "step": 30805 }, { "epoch": 3.5925839552238807, "grad_norm": 0.4547572890001812, "learning_rate": 1.406403478753538e-05, "loss": 0.324, "step": 30810 }, { "epoch": 3.5931669776119404, "grad_norm": 0.47153439536213787, "learning_rate": 1.4057076464564272e-05, "loss": 0.3182, "step": 30815 }, { "epoch": 3.59375, "grad_norm": 0.44347109812292, "learning_rate": 1.4050120140427081e-05, "loss": 0.3239, "step": 30820 }, { "epoch": 3.5943330223880596, "grad_norm": 0.44664336753678724, "learning_rate": 1.4043165816158172e-05, "loss": 0.3211, "step": 30825 }, { "epoch": 3.5949160447761193, "grad_norm": 0.4332024839803423, "learning_rate": 1.4036213492791561e-05, "loss": 0.3201, "step": 30830 }, { "epoch": 3.5954990671641793, "grad_norm": 0.4974923109860408, "learning_rate": 1.4029263171361018e-05, "loss": 0.3223, "step": 30835 }, { "epoch": 3.596082089552239, "grad_norm": 0.459723389493139, "learning_rate": 1.4022314852899968e-05, "loss": 0.3309, "step": 30840 }, { "epoch": 3.5966651119402986, "grad_norm": 0.4928228806000406, "learning_rate": 1.4015368538441567e-05, "loss": 0.3431, "step": 30845 }, { "epoch": 3.597248134328358, "grad_norm": 0.46940709708411177, "learning_rate": 1.4008424229018668e-05, "loss": 0.3212, "step": 30850 }, { "epoch": 3.597831156716418, "grad_norm": 0.45432976582878304, "learning_rate": 1.4001481925663818e-05, "loss": 0.3122, "step": 30855 }, { "epoch": 3.5984141791044775, "grad_norm": 0.48216334391261273, "learning_rate": 1.3994541629409275e-05, "loss": 0.2973, "step": 30860 }, { "epoch": 3.598997201492537, "grad_norm": 0.4598175603708983, "learning_rate": 1.3987603341287003e-05, "loss": 0.3325, "step": 30865 }, { "epoch": 3.599580223880597, "grad_norm": 0.42948881682522444, "learning_rate": 1.398066706232864e-05, "loss": 0.3295, "step": 30870 }, { "epoch": 3.600163246268657, "grad_norm": 0.5255344992401619, "learning_rate": 1.3973732793565553e-05, "loss": 0.343, "step": 30875 }, { "epoch": 3.6007462686567164, "grad_norm": 0.4937335780902401, "learning_rate": 1.3966800536028802e-05, "loss": 0.3469, "step": 30880 }, { "epoch": 3.601329291044776, "grad_norm": 0.4300854945636151, "learning_rate": 1.3959870290749148e-05, "loss": 0.2882, "step": 30885 }, { "epoch": 3.6019123134328357, "grad_norm": 0.4111604436999921, "learning_rate": 1.395294205875705e-05, "loss": 0.3022, "step": 30890 }, { "epoch": 3.6024953358208958, "grad_norm": 0.44701929226917175, "learning_rate": 1.394601584108267e-05, "loss": 0.3225, "step": 30895 }, { "epoch": 3.6030783582089554, "grad_norm": 0.4527450630108651, "learning_rate": 1.3939091638755882e-05, "loss": 0.3108, "step": 30900 }, { "epoch": 3.603661380597015, "grad_norm": 0.45689457674625045, "learning_rate": 1.3932169452806221e-05, "loss": 0.3384, "step": 30905 }, { "epoch": 3.6042444029850746, "grad_norm": 0.4739513240901544, "learning_rate": 1.3925249284262984e-05, "loss": 0.3261, "step": 30910 }, { "epoch": 3.6048274253731343, "grad_norm": 0.46259649913304157, "learning_rate": 1.3918331134155104e-05, "loss": 0.3382, "step": 30915 }, { "epoch": 3.605410447761194, "grad_norm": 0.5476694026778616, "learning_rate": 1.3911415003511258e-05, "loss": 0.3241, "step": 30920 }, { "epoch": 3.6059934701492535, "grad_norm": 0.5187947756610837, "learning_rate": 1.3904500893359809e-05, "loss": 0.3443, "step": 30925 }, { "epoch": 3.6065764925373136, "grad_norm": 0.44187033269340575, "learning_rate": 1.3897588804728818e-05, "loss": 0.3271, "step": 30930 }, { "epoch": 3.6071595149253732, "grad_norm": 0.42632349752822357, "learning_rate": 1.3890678738646038e-05, "loss": 0.3356, "step": 30935 }, { "epoch": 3.607742537313433, "grad_norm": 0.5299985861528723, "learning_rate": 1.3883770696138946e-05, "loss": 0.3532, "step": 30940 }, { "epoch": 3.6083255597014925, "grad_norm": 0.4936740664032516, "learning_rate": 1.3876864678234674e-05, "loss": 0.3329, "step": 30945 }, { "epoch": 3.608908582089552, "grad_norm": 0.47305652469131637, "learning_rate": 1.3869960685960118e-05, "loss": 0.3295, "step": 30950 }, { "epoch": 3.609491604477612, "grad_norm": 0.4784904276737314, "learning_rate": 1.3863058720341799e-05, "loss": 0.3189, "step": 30955 }, { "epoch": 3.6100746268656714, "grad_norm": 0.4753044017218112, "learning_rate": 1.3856158782406007e-05, "loss": 0.3272, "step": 30960 }, { "epoch": 3.6106576492537314, "grad_norm": 0.5188411601643111, "learning_rate": 1.3849260873178671e-05, "loss": 0.3478, "step": 30965 }, { "epoch": 3.611240671641791, "grad_norm": 0.47575059374767503, "learning_rate": 1.3842364993685453e-05, "loss": 0.3105, "step": 30970 }, { "epoch": 3.6118236940298507, "grad_norm": 0.4746714512523541, "learning_rate": 1.3835471144951705e-05, "loss": 0.3342, "step": 30975 }, { "epoch": 3.6124067164179103, "grad_norm": 0.4252946453989581, "learning_rate": 1.3828579328002473e-05, "loss": 0.3216, "step": 30980 }, { "epoch": 3.61298973880597, "grad_norm": 0.4708342921734325, "learning_rate": 1.382168954386251e-05, "loss": 0.3368, "step": 30985 }, { "epoch": 3.61357276119403, "grad_norm": 0.44174710006528406, "learning_rate": 1.3814801793556264e-05, "loss": 0.3133, "step": 30990 }, { "epoch": 3.6141557835820897, "grad_norm": 0.4498021523154961, "learning_rate": 1.3807916078107851e-05, "loss": 0.3114, "step": 30995 }, { "epoch": 3.6147388059701493, "grad_norm": 0.44302185317222054, "learning_rate": 1.3801032398541153e-05, "loss": 0.3347, "step": 31000 }, { "epoch": 3.615321828358209, "grad_norm": 0.4790054963282025, "learning_rate": 1.3794150755879665e-05, "loss": 0.3236, "step": 31005 }, { "epoch": 3.6159048507462686, "grad_norm": 0.47125434231183194, "learning_rate": 1.3787271151146658e-05, "loss": 0.3239, "step": 31010 }, { "epoch": 3.6164878731343286, "grad_norm": 0.46755831006417453, "learning_rate": 1.3780393585365043e-05, "loss": 0.3219, "step": 31015 }, { "epoch": 3.617070895522388, "grad_norm": 0.4784430740098405, "learning_rate": 1.3773518059557445e-05, "loss": 0.3306, "step": 31020 }, { "epoch": 3.617653917910448, "grad_norm": 0.44043478119241186, "learning_rate": 1.3766644574746202e-05, "loss": 0.3205, "step": 31025 }, { "epoch": 3.6182369402985075, "grad_norm": 0.4292599765900938, "learning_rate": 1.3759773131953321e-05, "loss": 0.3284, "step": 31030 }, { "epoch": 3.618819962686567, "grad_norm": 0.4783147794772432, "learning_rate": 1.3752903732200535e-05, "loss": 0.326, "step": 31035 }, { "epoch": 3.6194029850746268, "grad_norm": 0.4561539018021913, "learning_rate": 1.3746036376509252e-05, "loss": 0.3302, "step": 31040 }, { "epoch": 3.6199860074626864, "grad_norm": 0.46498336262089046, "learning_rate": 1.3739171065900577e-05, "loss": 0.3182, "step": 31045 }, { "epoch": 3.6205690298507465, "grad_norm": 0.45313508210464637, "learning_rate": 1.3732307801395322e-05, "loss": 0.3251, "step": 31050 }, { "epoch": 3.621152052238806, "grad_norm": 0.4563700798823811, "learning_rate": 1.3725446584013984e-05, "loss": 0.3142, "step": 31055 }, { "epoch": 3.6217350746268657, "grad_norm": 0.45174279163432074, "learning_rate": 1.3718587414776756e-05, "loss": 0.3241, "step": 31060 }, { "epoch": 3.6223180970149254, "grad_norm": 0.49544856606238696, "learning_rate": 1.3711730294703545e-05, "loss": 0.3444, "step": 31065 }, { "epoch": 3.622901119402985, "grad_norm": 0.4744435909483976, "learning_rate": 1.3704875224813928e-05, "loss": 0.3338, "step": 31070 }, { "epoch": 3.6234841417910446, "grad_norm": 0.4682710577465102, "learning_rate": 1.3698022206127201e-05, "loss": 0.3306, "step": 31075 }, { "epoch": 3.6240671641791042, "grad_norm": 0.46582122906724877, "learning_rate": 1.3691171239662315e-05, "loss": 0.3412, "step": 31080 }, { "epoch": 3.6246501865671643, "grad_norm": 0.5003394888146652, "learning_rate": 1.3684322326437976e-05, "loss": 0.3349, "step": 31085 }, { "epoch": 3.625233208955224, "grad_norm": 0.4492810392951438, "learning_rate": 1.3677475467472534e-05, "loss": 0.3329, "step": 31090 }, { "epoch": 3.6258162313432836, "grad_norm": 0.4272140190782571, "learning_rate": 1.367063066378405e-05, "loss": 0.314, "step": 31095 }, { "epoch": 3.626399253731343, "grad_norm": 0.45743973011498534, "learning_rate": 1.366378791639028e-05, "loss": 0.3143, "step": 31100 }, { "epoch": 3.626982276119403, "grad_norm": 0.4905799603140713, "learning_rate": 1.3656947226308686e-05, "loss": 0.3515, "step": 31105 }, { "epoch": 3.627565298507463, "grad_norm": 0.5056921514198293, "learning_rate": 1.3650108594556404e-05, "loss": 0.3273, "step": 31110 }, { "epoch": 3.6281483208955225, "grad_norm": 0.4460771657180042, "learning_rate": 1.3643272022150286e-05, "loss": 0.3027, "step": 31115 }, { "epoch": 3.628731343283582, "grad_norm": 0.46123327257003033, "learning_rate": 1.3636437510106836e-05, "loss": 0.3407, "step": 31120 }, { "epoch": 3.629314365671642, "grad_norm": 0.45680882201875966, "learning_rate": 1.3629605059442315e-05, "loss": 0.3225, "step": 31125 }, { "epoch": 3.6298973880597014, "grad_norm": 0.4351080943532293, "learning_rate": 1.362277467117261e-05, "loss": 0.3198, "step": 31130 }, { "epoch": 3.630480410447761, "grad_norm": 0.45381964677387365, "learning_rate": 1.3615946346313368e-05, "loss": 0.3414, "step": 31135 }, { "epoch": 3.6310634328358207, "grad_norm": 0.5275309351178956, "learning_rate": 1.3609120085879872e-05, "loss": 0.3313, "step": 31140 }, { "epoch": 3.6316464552238807, "grad_norm": 0.4898873747004423, "learning_rate": 1.360229589088713e-05, "loss": 0.3227, "step": 31145 }, { "epoch": 3.6322294776119404, "grad_norm": 0.4507389438058164, "learning_rate": 1.3595473762349825e-05, "loss": 0.3068, "step": 31150 }, { "epoch": 3.6328125, "grad_norm": 0.5061611948054943, "learning_rate": 1.3588653701282359e-05, "loss": 0.344, "step": 31155 }, { "epoch": 3.6333955223880596, "grad_norm": 0.48318510481799876, "learning_rate": 1.3581835708698796e-05, "loss": 0.3343, "step": 31160 }, { "epoch": 3.6339785447761193, "grad_norm": 0.48286218202518044, "learning_rate": 1.357501978561292e-05, "loss": 0.3227, "step": 31165 }, { "epoch": 3.6345615671641793, "grad_norm": 0.5035539243855311, "learning_rate": 1.3568205933038164e-05, "loss": 0.3423, "step": 31170 }, { "epoch": 3.635144589552239, "grad_norm": 0.4665797479912174, "learning_rate": 1.3561394151987722e-05, "loss": 0.3301, "step": 31175 }, { "epoch": 3.6357276119402986, "grad_norm": 0.4761977558837576, "learning_rate": 1.3554584443474405e-05, "loss": 0.3318, "step": 31180 }, { "epoch": 3.636310634328358, "grad_norm": 0.43260977443560905, "learning_rate": 1.3547776808510782e-05, "loss": 0.3284, "step": 31185 }, { "epoch": 3.636893656716418, "grad_norm": 0.4189791268300447, "learning_rate": 1.3540971248109063e-05, "loss": 0.3031, "step": 31190 }, { "epoch": 3.6374766791044775, "grad_norm": 0.481164745706147, "learning_rate": 1.3534167763281175e-05, "loss": 0.3234, "step": 31195 }, { "epoch": 3.638059701492537, "grad_norm": 0.49315789036743013, "learning_rate": 1.352736635503873e-05, "loss": 0.3188, "step": 31200 }, { "epoch": 3.638642723880597, "grad_norm": 0.455394463954907, "learning_rate": 1.3520567024393033e-05, "loss": 0.3302, "step": 31205 }, { "epoch": 3.639225746268657, "grad_norm": 0.4533384008353099, "learning_rate": 1.3513769772355083e-05, "loss": 0.3245, "step": 31210 }, { "epoch": 3.6398087686567164, "grad_norm": 0.4542700047601782, "learning_rate": 1.350697459993556e-05, "loss": 0.3247, "step": 31215 }, { "epoch": 3.640391791044776, "grad_norm": 0.4506638594465722, "learning_rate": 1.3500181508144855e-05, "loss": 0.3276, "step": 31220 }, { "epoch": 3.6409748134328357, "grad_norm": 0.4736122106785712, "learning_rate": 1.3493390497993014e-05, "loss": 0.3346, "step": 31225 }, { "epoch": 3.6415578358208958, "grad_norm": 0.45184832963680616, "learning_rate": 1.3486601570489809e-05, "loss": 0.3294, "step": 31230 }, { "epoch": 3.6421408582089554, "grad_norm": 0.47620322229028306, "learning_rate": 1.3479814726644686e-05, "loss": 0.3391, "step": 31235 }, { "epoch": 3.642723880597015, "grad_norm": 0.44130327890535087, "learning_rate": 1.3473029967466779e-05, "loss": 0.3102, "step": 31240 }, { "epoch": 3.6433069029850746, "grad_norm": 0.4927547189276676, "learning_rate": 1.3466247293964922e-05, "loss": 0.3244, "step": 31245 }, { "epoch": 3.6438899253731343, "grad_norm": 0.4761645226753004, "learning_rate": 1.3459466707147644e-05, "loss": 0.3433, "step": 31250 }, { "epoch": 3.644472947761194, "grad_norm": 0.43866264751097683, "learning_rate": 1.3452688208023123e-05, "loss": 0.3218, "step": 31255 }, { "epoch": 3.6450559701492535, "grad_norm": 0.4646522793504169, "learning_rate": 1.3445911797599293e-05, "loss": 0.3431, "step": 31260 }, { "epoch": 3.6456389925373136, "grad_norm": 0.48836516077234127, "learning_rate": 1.3439137476883706e-05, "loss": 0.3368, "step": 31265 }, { "epoch": 3.6462220149253732, "grad_norm": 0.46470621472139234, "learning_rate": 1.3432365246883671e-05, "loss": 0.3181, "step": 31270 }, { "epoch": 3.646805037313433, "grad_norm": 0.44878768581362055, "learning_rate": 1.3425595108606138e-05, "loss": 0.3261, "step": 31275 }, { "epoch": 3.6473880597014925, "grad_norm": 0.4500183550241171, "learning_rate": 1.3418827063057754e-05, "loss": 0.3094, "step": 31280 }, { "epoch": 3.647971082089552, "grad_norm": 0.4651200613887222, "learning_rate": 1.3412061111244877e-05, "loss": 0.3234, "step": 31285 }, { "epoch": 3.648554104477612, "grad_norm": 0.4346814567803131, "learning_rate": 1.3405297254173532e-05, "loss": 0.3361, "step": 31290 }, { "epoch": 3.6491371268656714, "grad_norm": 0.4945965432343019, "learning_rate": 1.3398535492849442e-05, "loss": 0.317, "step": 31295 }, { "epoch": 3.6497201492537314, "grad_norm": 0.5081834516913957, "learning_rate": 1.3391775828278023e-05, "loss": 0.3253, "step": 31300 }, { "epoch": 3.650303171641791, "grad_norm": 0.4759108436963413, "learning_rate": 1.338501826146435e-05, "loss": 0.3233, "step": 31305 }, { "epoch": 3.6508861940298507, "grad_norm": 0.5018965996529728, "learning_rate": 1.3378262793413237e-05, "loss": 0.3347, "step": 31310 }, { "epoch": 3.6514692164179103, "grad_norm": 0.4455252560222746, "learning_rate": 1.3371509425129131e-05, "loss": 0.3245, "step": 31315 }, { "epoch": 3.65205223880597, "grad_norm": 0.47711378350122946, "learning_rate": 1.3364758157616219e-05, "loss": 0.3129, "step": 31320 }, { "epoch": 3.65263526119403, "grad_norm": 0.4418243434050547, "learning_rate": 1.3358008991878329e-05, "loss": 0.3417, "step": 31325 }, { "epoch": 3.6532182835820897, "grad_norm": 0.5501608681321434, "learning_rate": 1.3351261928919007e-05, "loss": 0.3616, "step": 31330 }, { "epoch": 3.6538013059701493, "grad_norm": 0.5282777264117906, "learning_rate": 1.3344516969741471e-05, "loss": 0.3321, "step": 31335 }, { "epoch": 3.654384328358209, "grad_norm": 0.4498665078075985, "learning_rate": 1.3337774115348639e-05, "loss": 0.3001, "step": 31340 }, { "epoch": 3.6549673507462686, "grad_norm": 0.4629014153670532, "learning_rate": 1.3331033366743101e-05, "loss": 0.3079, "step": 31345 }, { "epoch": 3.6555503731343286, "grad_norm": 0.46894722870541605, "learning_rate": 1.3324294724927155e-05, "loss": 0.3003, "step": 31350 }, { "epoch": 3.656133395522388, "grad_norm": 0.49917555020652415, "learning_rate": 1.3317558190902752e-05, "loss": 0.3379, "step": 31355 }, { "epoch": 3.656716417910448, "grad_norm": 0.5067293504331861, "learning_rate": 1.3310823765671571e-05, "loss": 0.3567, "step": 31360 }, { "epoch": 3.6572994402985075, "grad_norm": 0.4914156511449992, "learning_rate": 1.3304091450234934e-05, "loss": 0.3388, "step": 31365 }, { "epoch": 3.657882462686567, "grad_norm": 0.4553188970482324, "learning_rate": 1.32973612455939e-05, "loss": 0.3098, "step": 31370 }, { "epoch": 3.6584654850746268, "grad_norm": 0.47881768265702407, "learning_rate": 1.3290633152749166e-05, "loss": 0.3346, "step": 31375 }, { "epoch": 3.6590485074626864, "grad_norm": 0.48698493411426985, "learning_rate": 1.3283907172701135e-05, "loss": 0.3395, "step": 31380 }, { "epoch": 3.6596315298507465, "grad_norm": 0.4406234083551808, "learning_rate": 1.3277183306449902e-05, "loss": 0.3039, "step": 31385 }, { "epoch": 3.660214552238806, "grad_norm": 0.4615525504738181, "learning_rate": 1.3270461554995243e-05, "loss": 0.3311, "step": 31390 }, { "epoch": 3.6607975746268657, "grad_norm": 0.4256954983134391, "learning_rate": 1.3263741919336608e-05, "loss": 0.3226, "step": 31395 }, { "epoch": 3.6613805970149254, "grad_norm": 0.45771343012051446, "learning_rate": 1.3257024400473162e-05, "loss": 0.3087, "step": 31400 }, { "epoch": 3.661963619402985, "grad_norm": 0.458134612632429, "learning_rate": 1.3250308999403715e-05, "loss": 0.3087, "step": 31405 }, { "epoch": 3.6625466417910446, "grad_norm": 0.4895246636519347, "learning_rate": 1.3243595717126792e-05, "loss": 0.3318, "step": 31410 }, { "epoch": 3.6631296641791042, "grad_norm": 0.4704772905461017, "learning_rate": 1.3236884554640589e-05, "loss": 0.3164, "step": 31415 }, { "epoch": 3.6637126865671643, "grad_norm": 0.43385243655184663, "learning_rate": 1.3230175512943e-05, "loss": 0.3141, "step": 31420 }, { "epoch": 3.664295708955224, "grad_norm": 0.44902480668516553, "learning_rate": 1.3223468593031585e-05, "loss": 0.3347, "step": 31425 }, { "epoch": 3.6648787313432836, "grad_norm": 0.441123561924162, "learning_rate": 1.3216763795903608e-05, "loss": 0.2972, "step": 31430 }, { "epoch": 3.665461753731343, "grad_norm": 0.4451649744099363, "learning_rate": 1.321006112255601e-05, "loss": 0.323, "step": 31435 }, { "epoch": 3.666044776119403, "grad_norm": 0.4718936590855251, "learning_rate": 1.3203360573985394e-05, "loss": 0.3391, "step": 31440 }, { "epoch": 3.666627798507463, "grad_norm": 0.48261580115801245, "learning_rate": 1.31966621511881e-05, "loss": 0.3145, "step": 31445 }, { "epoch": 3.6672108208955225, "grad_norm": 0.44949290096900263, "learning_rate": 1.3189965855160088e-05, "loss": 0.315, "step": 31450 }, { "epoch": 3.667793843283582, "grad_norm": 0.45535760627703, "learning_rate": 1.318327168689705e-05, "loss": 0.3195, "step": 31455 }, { "epoch": 3.668376865671642, "grad_norm": 0.5133884073554863, "learning_rate": 1.3176579647394338e-05, "loss": 0.3213, "step": 31460 }, { "epoch": 3.6689598880597014, "grad_norm": 0.48407952311970437, "learning_rate": 1.3169889737647001e-05, "loss": 0.3238, "step": 31465 }, { "epoch": 3.669542910447761, "grad_norm": 0.4712862288057401, "learning_rate": 1.3163201958649757e-05, "loss": 0.3113, "step": 31470 }, { "epoch": 3.6701259328358207, "grad_norm": 0.4468691551751411, "learning_rate": 1.315651631139703e-05, "loss": 0.3152, "step": 31475 }, { "epoch": 3.6707089552238807, "grad_norm": 0.45416081387858553, "learning_rate": 1.314983279688288e-05, "loss": 0.3303, "step": 31480 }, { "epoch": 3.6712919776119404, "grad_norm": 0.46364841940380214, "learning_rate": 1.3143151416101118e-05, "loss": 0.3342, "step": 31485 }, { "epoch": 3.671875, "grad_norm": 0.4699111081809452, "learning_rate": 1.3136472170045171e-05, "loss": 0.3244, "step": 31490 }, { "epoch": 3.6724580223880596, "grad_norm": 0.4511674288857646, "learning_rate": 1.3129795059708209e-05, "loss": 0.3346, "step": 31495 }, { "epoch": 3.6730410447761193, "grad_norm": 0.5140414045658062, "learning_rate": 1.3123120086083026e-05, "loss": 0.319, "step": 31500 }, { "epoch": 3.6736240671641793, "grad_norm": 0.49386677317891153, "learning_rate": 1.3116447250162145e-05, "loss": 0.3391, "step": 31505 }, { "epoch": 3.674207089552239, "grad_norm": 0.4724215098111002, "learning_rate": 1.3109776552937742e-05, "loss": 0.3327, "step": 31510 }, { "epoch": 3.6747901119402986, "grad_norm": 0.48079247659077834, "learning_rate": 1.3103107995401689e-05, "loss": 0.3278, "step": 31515 }, { "epoch": 3.675373134328358, "grad_norm": 0.4673231523048643, "learning_rate": 1.3096441578545544e-05, "loss": 0.3396, "step": 31520 }, { "epoch": 3.675956156716418, "grad_norm": 0.49948194068544777, "learning_rate": 1.3089777303360534e-05, "loss": 0.3243, "step": 31525 }, { "epoch": 3.6765391791044775, "grad_norm": 0.46695038827477636, "learning_rate": 1.308311517083756e-05, "loss": 0.3123, "step": 31530 }, { "epoch": 3.677122201492537, "grad_norm": 0.5344194598937325, "learning_rate": 1.3076455181967246e-05, "loss": 0.3344, "step": 31535 }, { "epoch": 3.677705223880597, "grad_norm": 0.4784688103852506, "learning_rate": 1.306979733773983e-05, "loss": 0.3169, "step": 31540 }, { "epoch": 3.678288246268657, "grad_norm": 0.48524859817677335, "learning_rate": 1.3063141639145313e-05, "loss": 0.3281, "step": 31545 }, { "epoch": 3.6788712686567164, "grad_norm": 0.420086663938523, "learning_rate": 1.3056488087173302e-05, "loss": 0.3211, "step": 31550 }, { "epoch": 3.679454291044776, "grad_norm": 0.47507384117275503, "learning_rate": 1.3049836682813127e-05, "loss": 0.3246, "step": 31555 }, { "epoch": 3.6800373134328357, "grad_norm": 0.43644833261680144, "learning_rate": 1.3043187427053788e-05, "loss": 0.312, "step": 31560 }, { "epoch": 3.6806203358208958, "grad_norm": 0.5023075759999288, "learning_rate": 1.3036540320883967e-05, "loss": 0.3392, "step": 31565 }, { "epoch": 3.6812033582089554, "grad_norm": 0.49592396130851113, "learning_rate": 1.3029895365292018e-05, "loss": 0.3275, "step": 31570 }, { "epoch": 3.681786380597015, "grad_norm": 0.48003364363140033, "learning_rate": 1.3023252561266e-05, "loss": 0.3341, "step": 31575 }, { "epoch": 3.6823694029850746, "grad_norm": 0.45634224992258754, "learning_rate": 1.3016611909793613e-05, "loss": 0.3255, "step": 31580 }, { "epoch": 3.6829524253731343, "grad_norm": 0.4701313857081696, "learning_rate": 1.3009973411862264e-05, "loss": 0.3177, "step": 31585 }, { "epoch": 3.683535447761194, "grad_norm": 0.4344048690540488, "learning_rate": 1.3003337068459037e-05, "loss": 0.328, "step": 31590 }, { "epoch": 3.6841184701492535, "grad_norm": 0.49223030335934287, "learning_rate": 1.29967028805707e-05, "loss": 0.3466, "step": 31595 }, { "epoch": 3.6847014925373136, "grad_norm": 0.41996922306859436, "learning_rate": 1.2990070849183678e-05, "loss": 0.3282, "step": 31600 }, { "epoch": 3.6852845149253732, "grad_norm": 0.5101676986445276, "learning_rate": 1.2983440975284101e-05, "loss": 0.3302, "step": 31605 }, { "epoch": 3.685867537313433, "grad_norm": 0.49682199517574394, "learning_rate": 1.2976813259857773e-05, "loss": 0.2948, "step": 31610 }, { "epoch": 3.6864505597014925, "grad_norm": 0.4631508954087268, "learning_rate": 1.2970187703890149e-05, "loss": 0.3347, "step": 31615 }, { "epoch": 3.687033582089552, "grad_norm": 0.47885669120682994, "learning_rate": 1.2963564308366416e-05, "loss": 0.3261, "step": 31620 }, { "epoch": 3.687616604477612, "grad_norm": 0.48543010067184783, "learning_rate": 1.2956943074271388e-05, "loss": 0.3414, "step": 31625 }, { "epoch": 3.6881996268656714, "grad_norm": 0.4984091597801686, "learning_rate": 1.295032400258958e-05, "loss": 0.3308, "step": 31630 }, { "epoch": 3.6887826492537314, "grad_norm": 0.46677066628033437, "learning_rate": 1.2943707094305196e-05, "loss": 0.3312, "step": 31635 }, { "epoch": 3.689365671641791, "grad_norm": 0.4773263411727992, "learning_rate": 1.2937092350402097e-05, "loss": 0.3155, "step": 31640 }, { "epoch": 3.6899486940298507, "grad_norm": 0.42832969149943423, "learning_rate": 1.2930479771863835e-05, "loss": 0.3408, "step": 31645 }, { "epoch": 3.6905317164179103, "grad_norm": 0.45644740360498687, "learning_rate": 1.2923869359673646e-05, "loss": 0.3264, "step": 31650 }, { "epoch": 3.69111473880597, "grad_norm": 0.44582819806361973, "learning_rate": 1.2917261114814414e-05, "loss": 0.3336, "step": 31655 }, { "epoch": 3.69169776119403, "grad_norm": 0.45150250658632707, "learning_rate": 1.2910655038268749e-05, "loss": 0.3073, "step": 31660 }, { "epoch": 3.6922807835820897, "grad_norm": 0.4326064459735902, "learning_rate": 1.2904051131018877e-05, "loss": 0.3327, "step": 31665 }, { "epoch": 3.6928638059701493, "grad_norm": 0.45941836798825053, "learning_rate": 1.2897449394046773e-05, "loss": 0.327, "step": 31670 }, { "epoch": 3.693446828358209, "grad_norm": 0.5060818866839104, "learning_rate": 1.2890849828334029e-05, "loss": 0.3183, "step": 31675 }, { "epoch": 3.6940298507462686, "grad_norm": 0.48612612030918756, "learning_rate": 1.2884252434861938e-05, "loss": 0.2979, "step": 31680 }, { "epoch": 3.6946128731343286, "grad_norm": 0.49800443359831537, "learning_rate": 1.2877657214611477e-05, "loss": 0.3155, "step": 31685 }, { "epoch": 3.695195895522388, "grad_norm": 0.4948702647432376, "learning_rate": 1.2871064168563291e-05, "loss": 0.3196, "step": 31690 }, { "epoch": 3.695778917910448, "grad_norm": 0.44893136195759986, "learning_rate": 1.2864473297697693e-05, "loss": 0.322, "step": 31695 }, { "epoch": 3.6963619402985075, "grad_norm": 0.45850928885668435, "learning_rate": 1.2857884602994706e-05, "loss": 0.3275, "step": 31700 }, { "epoch": 3.696944962686567, "grad_norm": 0.4672951709579247, "learning_rate": 1.2851298085433972e-05, "loss": 0.3318, "step": 31705 }, { "epoch": 3.6975279850746268, "grad_norm": 0.5054266321428105, "learning_rate": 1.2844713745994871e-05, "loss": 0.3474, "step": 31710 }, { "epoch": 3.6981110074626864, "grad_norm": 0.49467511543653125, "learning_rate": 1.283813158565641e-05, "loss": 0.3328, "step": 31715 }, { "epoch": 3.6986940298507465, "grad_norm": 0.5031012319202796, "learning_rate": 1.2831551605397321e-05, "loss": 0.3221, "step": 31720 }, { "epoch": 3.699277052238806, "grad_norm": 0.45112489659844346, "learning_rate": 1.2824973806195955e-05, "loss": 0.3006, "step": 31725 }, { "epoch": 3.6998600746268657, "grad_norm": 0.4464073300376547, "learning_rate": 1.2818398189030383e-05, "loss": 0.3196, "step": 31730 }, { "epoch": 3.7004430970149254, "grad_norm": 0.429807181591747, "learning_rate": 1.2811824754878331e-05, "loss": 0.326, "step": 31735 }, { "epoch": 3.701026119402985, "grad_norm": 0.4507615629116169, "learning_rate": 1.2805253504717213e-05, "loss": 0.3188, "step": 31740 }, { "epoch": 3.7016091417910446, "grad_norm": 0.45645840827116146, "learning_rate": 1.27986844395241e-05, "loss": 0.3308, "step": 31745 }, { "epoch": 3.7021921641791042, "grad_norm": 0.4385297201540902, "learning_rate": 1.2792117560275766e-05, "loss": 0.3368, "step": 31750 }, { "epoch": 3.7027751865671643, "grad_norm": 0.5116371698941201, "learning_rate": 1.2785552867948625e-05, "loss": 0.3425, "step": 31755 }, { "epoch": 3.703358208955224, "grad_norm": 0.4744805080188085, "learning_rate": 1.2778990363518785e-05, "loss": 0.3261, "step": 31760 }, { "epoch": 3.7039412313432836, "grad_norm": 0.4938922902251075, "learning_rate": 1.277243004796204e-05, "loss": 0.3517, "step": 31765 }, { "epoch": 3.704524253731343, "grad_norm": 0.4846248190781154, "learning_rate": 1.2765871922253835e-05, "loss": 0.3265, "step": 31770 }, { "epoch": 3.705107276119403, "grad_norm": 0.4366723589008353, "learning_rate": 1.2759315987369306e-05, "loss": 0.3361, "step": 31775 }, { "epoch": 3.705690298507463, "grad_norm": 0.4675989131888052, "learning_rate": 1.2752762244283255e-05, "loss": 0.3484, "step": 31780 }, { "epoch": 3.7062733208955225, "grad_norm": 0.46144740806748774, "learning_rate": 1.2746210693970171e-05, "loss": 0.3296, "step": 31785 }, { "epoch": 3.706856343283582, "grad_norm": 0.49553855145425413, "learning_rate": 1.2739661337404185e-05, "loss": 0.331, "step": 31790 }, { "epoch": 3.707439365671642, "grad_norm": 0.4482578805259561, "learning_rate": 1.2733114175559149e-05, "loss": 0.3293, "step": 31795 }, { "epoch": 3.7080223880597014, "grad_norm": 0.4791044612725869, "learning_rate": 1.2726569209408545e-05, "loss": 0.3103, "step": 31800 }, { "epoch": 3.708605410447761, "grad_norm": 0.4638006436199728, "learning_rate": 1.272002643992555e-05, "loss": 0.3319, "step": 31805 }, { "epoch": 3.7091884328358207, "grad_norm": 0.4608440531145325, "learning_rate": 1.2713485868083014e-05, "loss": 0.3418, "step": 31810 }, { "epoch": 3.7097714552238807, "grad_norm": 0.47277131107852466, "learning_rate": 1.270694749485346e-05, "loss": 0.3258, "step": 31815 }, { "epoch": 3.7103544776119404, "grad_norm": 0.5124155258559975, "learning_rate": 1.2700411321209078e-05, "loss": 0.3298, "step": 31820 }, { "epoch": 3.7109375, "grad_norm": 0.49050737819867396, "learning_rate": 1.2693877348121742e-05, "loss": 0.3475, "step": 31825 }, { "epoch": 3.7115205223880596, "grad_norm": 0.46803011403512135, "learning_rate": 1.2687345576562965e-05, "loss": 0.3299, "step": 31830 }, { "epoch": 3.7121035447761193, "grad_norm": 0.46483350384816363, "learning_rate": 1.2680816007503998e-05, "loss": 0.3118, "step": 31835 }, { "epoch": 3.7126865671641793, "grad_norm": 0.42148401608250813, "learning_rate": 1.2674288641915688e-05, "loss": 0.304, "step": 31840 }, { "epoch": 3.713269589552239, "grad_norm": 0.46259206012009374, "learning_rate": 1.2667763480768625e-05, "loss": 0.3316, "step": 31845 }, { "epoch": 3.7138526119402986, "grad_norm": 0.48018614383398484, "learning_rate": 1.2661240525033016e-05, "loss": 0.3368, "step": 31850 }, { "epoch": 3.714435634328358, "grad_norm": 0.4609350877561927, "learning_rate": 1.2654719775678764e-05, "loss": 0.339, "step": 31855 }, { "epoch": 3.715018656716418, "grad_norm": 0.4546027197923379, "learning_rate": 1.264820123367545e-05, "loss": 0.3319, "step": 31860 }, { "epoch": 3.7156016791044775, "grad_norm": 0.45717221896297866, "learning_rate": 1.2641684899992314e-05, "loss": 0.3372, "step": 31865 }, { "epoch": 3.716184701492537, "grad_norm": 0.4670576032693017, "learning_rate": 1.2635170775598271e-05, "loss": 0.3186, "step": 31870 }, { "epoch": 3.716767723880597, "grad_norm": 0.4617922031560612, "learning_rate": 1.2628658861461923e-05, "loss": 0.322, "step": 31875 }, { "epoch": 3.717350746268657, "grad_norm": 0.48166851839959623, "learning_rate": 1.26221491585515e-05, "loss": 0.3331, "step": 31880 }, { "epoch": 3.7179337686567164, "grad_norm": 0.4985631006094774, "learning_rate": 1.2615641667834962e-05, "loss": 0.3168, "step": 31885 }, { "epoch": 3.718516791044776, "grad_norm": 0.519262460358683, "learning_rate": 1.2609136390279886e-05, "loss": 0.3537, "step": 31890 }, { "epoch": 3.7190998134328357, "grad_norm": 0.47523741070448344, "learning_rate": 1.2602633326853569e-05, "loss": 0.3387, "step": 31895 }, { "epoch": 3.7196828358208958, "grad_norm": 0.4900817921587001, "learning_rate": 1.2596132478522938e-05, "loss": 0.3295, "step": 31900 }, { "epoch": 3.7202658582089554, "grad_norm": 0.4657727449280486, "learning_rate": 1.258963384625461e-05, "loss": 0.3437, "step": 31905 }, { "epoch": 3.720848880597015, "grad_norm": 0.42124359394423505, "learning_rate": 1.258313743101487e-05, "loss": 0.3117, "step": 31910 }, { "epoch": 3.7214319029850746, "grad_norm": 0.4599675295215162, "learning_rate": 1.2576643233769669e-05, "loss": 0.3389, "step": 31915 }, { "epoch": 3.7220149253731343, "grad_norm": 0.46923324974649605, "learning_rate": 1.2570151255484639e-05, "loss": 0.3201, "step": 31920 }, { "epoch": 3.722597947761194, "grad_norm": 0.4591247602003022, "learning_rate": 1.2563661497125073e-05, "loss": 0.3171, "step": 31925 }, { "epoch": 3.7231809701492535, "grad_norm": 0.4339479858104873, "learning_rate": 1.2557173959655932e-05, "loss": 0.3247, "step": 31930 }, { "epoch": 3.7237639925373136, "grad_norm": 0.49242388227375533, "learning_rate": 1.2550688644041861e-05, "loss": 0.3337, "step": 31935 }, { "epoch": 3.7243470149253732, "grad_norm": 0.4409374650015577, "learning_rate": 1.2544205551247148e-05, "loss": 0.3198, "step": 31940 }, { "epoch": 3.724930037313433, "grad_norm": 0.4929748884710064, "learning_rate": 1.2537724682235776e-05, "loss": 0.339, "step": 31945 }, { "epoch": 3.7255130597014925, "grad_norm": 0.4542569726267021, "learning_rate": 1.253124603797139e-05, "loss": 0.3068, "step": 31950 }, { "epoch": 3.726096082089552, "grad_norm": 0.47521964948638573, "learning_rate": 1.2524769619417297e-05, "loss": 0.3217, "step": 31955 }, { "epoch": 3.726679104477612, "grad_norm": 0.46060947529613755, "learning_rate": 1.251829542753648e-05, "loss": 0.3367, "step": 31960 }, { "epoch": 3.7272621268656714, "grad_norm": 0.46462461609170735, "learning_rate": 1.2511823463291595e-05, "loss": 0.3162, "step": 31965 }, { "epoch": 3.7278451492537314, "grad_norm": 0.513210399503573, "learning_rate": 1.2505353727644958e-05, "loss": 0.3318, "step": 31970 }, { "epoch": 3.728428171641791, "grad_norm": 0.4789985779106394, "learning_rate": 1.2498886221558547e-05, "loss": 0.3435, "step": 31975 }, { "epoch": 3.7290111940298507, "grad_norm": 0.4860609060195966, "learning_rate": 1.249242094599404e-05, "loss": 0.3366, "step": 31980 }, { "epoch": 3.7295942164179103, "grad_norm": 0.42586287306760157, "learning_rate": 1.2485957901912736e-05, "loss": 0.328, "step": 31985 }, { "epoch": 3.73017723880597, "grad_norm": 0.46580715460925837, "learning_rate": 1.2479497090275643e-05, "loss": 0.3263, "step": 31990 }, { "epoch": 3.73076026119403, "grad_norm": 0.46649738570614, "learning_rate": 1.2473038512043417e-05, "loss": 0.3414, "step": 31995 }, { "epoch": 3.7313432835820897, "grad_norm": 0.47731033900291076, "learning_rate": 1.246658216817639e-05, "loss": 0.3403, "step": 32000 }, { "epoch": 3.7319263059701493, "grad_norm": 0.4817616983967962, "learning_rate": 1.2460128059634556e-05, "loss": 0.3123, "step": 32005 }, { "epoch": 3.732509328358209, "grad_norm": 0.5627787936908184, "learning_rate": 1.2453676187377588e-05, "loss": 0.3207, "step": 32010 }, { "epoch": 3.7330923507462686, "grad_norm": 0.49174042244665306, "learning_rate": 1.2447226552364792e-05, "loss": 0.3363, "step": 32015 }, { "epoch": 3.7336753731343286, "grad_norm": 0.4882290714936515, "learning_rate": 1.2440779155555202e-05, "loss": 0.3167, "step": 32020 }, { "epoch": 3.734258395522388, "grad_norm": 0.45821859591672176, "learning_rate": 1.2434333997907448e-05, "loss": 0.316, "step": 32025 }, { "epoch": 3.734841417910448, "grad_norm": 0.4181102718574339, "learning_rate": 1.24278910803799e-05, "loss": 0.3339, "step": 32030 }, { "epoch": 3.7354244402985075, "grad_norm": 0.4701952700834935, "learning_rate": 1.2421450403930529e-05, "loss": 0.3516, "step": 32035 }, { "epoch": 3.736007462686567, "grad_norm": 0.4469984835355792, "learning_rate": 1.2415011969517016e-05, "loss": 0.3227, "step": 32040 }, { "epoch": 3.7365904850746268, "grad_norm": 0.4605442280735747, "learning_rate": 1.2408575778096686e-05, "loss": 0.3127, "step": 32045 }, { "epoch": 3.7371735074626864, "grad_norm": 0.4756742820583785, "learning_rate": 1.2402141830626547e-05, "loss": 0.3315, "step": 32050 }, { "epoch": 3.7377565298507465, "grad_norm": 0.4675689486501223, "learning_rate": 1.2395710128063263e-05, "loss": 0.3299, "step": 32055 }, { "epoch": 3.738339552238806, "grad_norm": 0.5287087304801841, "learning_rate": 1.2389280671363175e-05, "loss": 0.3029, "step": 32060 }, { "epoch": 3.7389225746268657, "grad_norm": 0.4764204681060229, "learning_rate": 1.2382853461482255e-05, "loss": 0.3203, "step": 32065 }, { "epoch": 3.7395055970149254, "grad_norm": 0.47976503952454813, "learning_rate": 1.2376428499376201e-05, "loss": 0.3254, "step": 32070 }, { "epoch": 3.740088619402985, "grad_norm": 0.44102137168488564, "learning_rate": 1.2370005786000314e-05, "loss": 0.3242, "step": 32075 }, { "epoch": 3.7406716417910446, "grad_norm": 0.448509879831629, "learning_rate": 1.2363585322309615e-05, "loss": 0.328, "step": 32080 }, { "epoch": 3.7412546641791042, "grad_norm": 0.4638741438273131, "learning_rate": 1.2357167109258748e-05, "loss": 0.327, "step": 32085 }, { "epoch": 3.7418376865671643, "grad_norm": 0.457911526246863, "learning_rate": 1.2350751147802047e-05, "loss": 0.3355, "step": 32090 }, { "epoch": 3.742420708955224, "grad_norm": 0.4553662582416192, "learning_rate": 1.23443374388935e-05, "loss": 0.3056, "step": 32095 }, { "epoch": 3.7430037313432836, "grad_norm": 0.4553970154189041, "learning_rate": 1.2337925983486768e-05, "loss": 0.338, "step": 32100 }, { "epoch": 3.743586753731343, "grad_norm": 0.47841457127273723, "learning_rate": 1.2331516782535172e-05, "loss": 0.3099, "step": 32105 }, { "epoch": 3.744169776119403, "grad_norm": 0.4315875427584779, "learning_rate": 1.2325109836991703e-05, "loss": 0.3212, "step": 32110 }, { "epoch": 3.744752798507463, "grad_norm": 0.5325946492349596, "learning_rate": 1.2318705147809006e-05, "loss": 0.343, "step": 32115 }, { "epoch": 3.7453358208955225, "grad_norm": 0.483822559676552, "learning_rate": 1.2312302715939394e-05, "loss": 0.3036, "step": 32120 }, { "epoch": 3.745918843283582, "grad_norm": 0.4743002706450139, "learning_rate": 1.2305902542334854e-05, "loss": 0.3088, "step": 32125 }, { "epoch": 3.746501865671642, "grad_norm": 2.044759677569916, "learning_rate": 1.2299504627947029e-05, "loss": 0.3454, "step": 32130 }, { "epoch": 3.7470848880597014, "grad_norm": 0.4217148784562264, "learning_rate": 1.2293108973727224e-05, "loss": 0.3337, "step": 32135 }, { "epoch": 3.747667910447761, "grad_norm": 0.46320034017587036, "learning_rate": 1.2286715580626418e-05, "loss": 0.3375, "step": 32140 }, { "epoch": 3.7482509328358207, "grad_norm": 0.4409040520149116, "learning_rate": 1.228032444959525e-05, "loss": 0.3284, "step": 32145 }, { "epoch": 3.7488339552238807, "grad_norm": 0.5085748463935102, "learning_rate": 1.2273935581584e-05, "loss": 0.331, "step": 32150 }, { "epoch": 3.7494169776119404, "grad_norm": 0.4331591632851916, "learning_rate": 1.2267548977542656e-05, "loss": 0.3183, "step": 32155 }, { "epoch": 3.75, "grad_norm": 0.449574801438169, "learning_rate": 1.2261164638420832e-05, "loss": 0.3103, "step": 32160 }, { "epoch": 3.7505830223880596, "grad_norm": 0.48238367955161604, "learning_rate": 1.2254782565167817e-05, "loss": 0.3207, "step": 32165 }, { "epoch": 3.7511660447761193, "grad_norm": 0.47403035375974273, "learning_rate": 1.2248402758732568e-05, "loss": 0.3324, "step": 32170 }, { "epoch": 3.7517490671641793, "grad_norm": 0.45266625798584026, "learning_rate": 1.22420252200637e-05, "loss": 0.3365, "step": 32175 }, { "epoch": 3.752332089552239, "grad_norm": 0.46073069022823204, "learning_rate": 1.2235649950109492e-05, "loss": 0.319, "step": 32180 }, { "epoch": 3.7529151119402986, "grad_norm": 0.48925534472831805, "learning_rate": 1.2229276949817894e-05, "loss": 0.3276, "step": 32185 }, { "epoch": 3.753498134328358, "grad_norm": 0.4443209482708789, "learning_rate": 1.222290622013649e-05, "loss": 0.3184, "step": 32190 }, { "epoch": 3.754081156716418, "grad_norm": 0.5255221148089022, "learning_rate": 1.221653776201257e-05, "loss": 0.341, "step": 32195 }, { "epoch": 3.7546641791044775, "grad_norm": 0.4793235964058894, "learning_rate": 1.2210171576393037e-05, "loss": 0.3383, "step": 32200 }, { "epoch": 3.755247201492537, "grad_norm": 0.4938430126386554, "learning_rate": 1.220380766422451e-05, "loss": 0.3197, "step": 32205 }, { "epoch": 3.755830223880597, "grad_norm": 0.4765637872045808, "learning_rate": 1.2197446026453219e-05, "loss": 0.3262, "step": 32210 }, { "epoch": 3.756413246268657, "grad_norm": 0.4683225388276627, "learning_rate": 1.219108666402509e-05, "loss": 0.3651, "step": 32215 }, { "epoch": 3.7569962686567164, "grad_norm": 0.45420723492269827, "learning_rate": 1.2184729577885695e-05, "loss": 0.3377, "step": 32220 }, { "epoch": 3.757579291044776, "grad_norm": 0.43816465509182606, "learning_rate": 1.2178374768980275e-05, "loss": 0.3232, "step": 32225 }, { "epoch": 3.7581623134328357, "grad_norm": 0.44977683835597543, "learning_rate": 1.2172022238253727e-05, "loss": 0.3268, "step": 32230 }, { "epoch": 3.7587453358208958, "grad_norm": 0.4541173885854834, "learning_rate": 1.2165671986650618e-05, "loss": 0.3276, "step": 32235 }, { "epoch": 3.7593283582089554, "grad_norm": 0.5592475270580155, "learning_rate": 1.2159324015115148e-05, "loss": 0.3519, "step": 32240 }, { "epoch": 3.759911380597015, "grad_norm": 0.44534625705153374, "learning_rate": 1.2152978324591233e-05, "loss": 0.3384, "step": 32245 }, { "epoch": 3.7604944029850746, "grad_norm": 0.44667025876219807, "learning_rate": 1.2146634916022383e-05, "loss": 0.3404, "step": 32250 }, { "epoch": 3.7610774253731343, "grad_norm": 0.43161392006810906, "learning_rate": 1.214029379035183e-05, "loss": 0.3141, "step": 32255 }, { "epoch": 3.761660447761194, "grad_norm": 0.4984373533108043, "learning_rate": 1.2133954948522423e-05, "loss": 0.343, "step": 32260 }, { "epoch": 3.7622434701492535, "grad_norm": 0.4875573311512853, "learning_rate": 1.2127618391476688e-05, "loss": 0.3329, "step": 32265 }, { "epoch": 3.7628264925373136, "grad_norm": 0.4676666518858329, "learning_rate": 1.2121284120156812e-05, "loss": 0.3124, "step": 32270 }, { "epoch": 3.7634095149253732, "grad_norm": 0.46558430530465317, "learning_rate": 1.2114952135504642e-05, "loss": 0.3157, "step": 32275 }, { "epoch": 3.763992537313433, "grad_norm": 0.444194527943588, "learning_rate": 1.210862243846168e-05, "loss": 0.3348, "step": 32280 }, { "epoch": 3.7645755597014925, "grad_norm": 0.4774140083616483, "learning_rate": 1.2102295029969104e-05, "loss": 0.3334, "step": 32285 }, { "epoch": 3.765158582089552, "grad_norm": 0.4887350469136481, "learning_rate": 1.209596991096772e-05, "loss": 0.3266, "step": 32290 }, { "epoch": 3.765741604477612, "grad_norm": 0.4697718880107236, "learning_rate": 1.2089647082398022e-05, "loss": 0.3297, "step": 32295 }, { "epoch": 3.7663246268656714, "grad_norm": 0.48329095287260077, "learning_rate": 1.2083326545200154e-05, "loss": 0.3494, "step": 32300 }, { "epoch": 3.7669076492537314, "grad_norm": 0.4906534119761042, "learning_rate": 1.2077008300313915e-05, "loss": 0.3346, "step": 32305 }, { "epoch": 3.767490671641791, "grad_norm": 0.459370468202657, "learning_rate": 1.2070692348678776e-05, "loss": 0.3137, "step": 32310 }, { "epoch": 3.7680736940298507, "grad_norm": 0.41422423338049147, "learning_rate": 1.2064378691233851e-05, "loss": 0.3051, "step": 32315 }, { "epoch": 3.7686567164179103, "grad_norm": 0.5547554233691531, "learning_rate": 1.205806732891793e-05, "loss": 0.3369, "step": 32320 }, { "epoch": 3.76923973880597, "grad_norm": 0.44835155064718296, "learning_rate": 1.205175826266943e-05, "loss": 0.3165, "step": 32325 }, { "epoch": 3.76982276119403, "grad_norm": 0.44559403853101776, "learning_rate": 1.2045451493426483e-05, "loss": 0.3352, "step": 32330 }, { "epoch": 3.7704057835820897, "grad_norm": 0.4433241737057994, "learning_rate": 1.2039147022126815e-05, "loss": 0.3137, "step": 32335 }, { "epoch": 3.7709888059701493, "grad_norm": 0.4367302619279283, "learning_rate": 1.2032844849707853e-05, "loss": 0.3003, "step": 32340 }, { "epoch": 3.771571828358209, "grad_norm": 0.44623453743128016, "learning_rate": 1.2026544977106669e-05, "loss": 0.3216, "step": 32345 }, { "epoch": 3.7721548507462686, "grad_norm": 0.47066659887948226, "learning_rate": 1.202024740525999e-05, "loss": 0.3343, "step": 32350 }, { "epoch": 3.7727378731343286, "grad_norm": 0.4729895133636887, "learning_rate": 1.2013952135104209e-05, "loss": 0.3244, "step": 32355 }, { "epoch": 3.773320895522388, "grad_norm": 0.42145515561435953, "learning_rate": 1.2007659167575377e-05, "loss": 0.3002, "step": 32360 }, { "epoch": 3.773903917910448, "grad_norm": 0.4740490777351826, "learning_rate": 1.2001368503609179e-05, "loss": 0.3385, "step": 32365 }, { "epoch": 3.7744869402985075, "grad_norm": 0.4311640262971568, "learning_rate": 1.1995080144141004e-05, "loss": 0.3092, "step": 32370 }, { "epoch": 3.775069962686567, "grad_norm": 0.43809698981656, "learning_rate": 1.1988794090105842e-05, "loss": 0.3111, "step": 32375 }, { "epoch": 3.7756529850746268, "grad_norm": 0.47687228833874734, "learning_rate": 1.1982510342438395e-05, "loss": 0.3334, "step": 32380 }, { "epoch": 3.7762360074626864, "grad_norm": 0.4240495396943132, "learning_rate": 1.1976228902072981e-05, "loss": 0.3063, "step": 32385 }, { "epoch": 3.7768190298507465, "grad_norm": 0.47655406701332764, "learning_rate": 1.1969949769943587e-05, "loss": 0.3324, "step": 32390 }, { "epoch": 3.777402052238806, "grad_norm": 0.466356782032414, "learning_rate": 1.196367294698387e-05, "loss": 0.3244, "step": 32395 }, { "epoch": 3.7779850746268657, "grad_norm": 0.48898401058095925, "learning_rate": 1.195739843412713e-05, "loss": 0.3292, "step": 32400 }, { "epoch": 3.7785680970149254, "grad_norm": 0.45772056201959277, "learning_rate": 1.1951126232306325e-05, "loss": 0.3227, "step": 32405 }, { "epoch": 3.779151119402985, "grad_norm": 0.4373854684473265, "learning_rate": 1.1944856342454078e-05, "loss": 0.3375, "step": 32410 }, { "epoch": 3.7797341417910446, "grad_norm": 0.44894386778120476, "learning_rate": 1.1938588765502643e-05, "loss": 0.3385, "step": 32415 }, { "epoch": 3.7803171641791042, "grad_norm": 0.5119645475216401, "learning_rate": 1.1932323502383978e-05, "loss": 0.3209, "step": 32420 }, { "epoch": 3.7809001865671643, "grad_norm": 0.4764171671627143, "learning_rate": 1.1926060554029636e-05, "loss": 0.3235, "step": 32425 }, { "epoch": 3.781483208955224, "grad_norm": 0.5176406273951388, "learning_rate": 1.1919799921370888e-05, "loss": 0.3365, "step": 32430 }, { "epoch": 3.7820662313432836, "grad_norm": 0.5064991390036546, "learning_rate": 1.1913541605338605e-05, "loss": 0.3134, "step": 32435 }, { "epoch": 3.782649253731343, "grad_norm": 0.49993873940907857, "learning_rate": 1.1907285606863351e-05, "loss": 0.3226, "step": 32440 }, { "epoch": 3.783232276119403, "grad_norm": 0.5135022484796244, "learning_rate": 1.190103192687533e-05, "loss": 0.3342, "step": 32445 }, { "epoch": 3.783815298507463, "grad_norm": 0.4597044748870591, "learning_rate": 1.1894780566304406e-05, "loss": 0.3255, "step": 32450 }, { "epoch": 3.7843983208955225, "grad_norm": 0.46585929236718493, "learning_rate": 1.1888531526080095e-05, "loss": 0.3458, "step": 32455 }, { "epoch": 3.784981343283582, "grad_norm": 0.4673428208687415, "learning_rate": 1.1882284807131576e-05, "loss": 0.3338, "step": 32460 }, { "epoch": 3.785564365671642, "grad_norm": 0.43258356419151617, "learning_rate": 1.1876040410387653e-05, "loss": 0.318, "step": 32465 }, { "epoch": 3.7861473880597014, "grad_norm": 0.41681281349956106, "learning_rate": 1.1869798336776845e-05, "loss": 0.3294, "step": 32470 }, { "epoch": 3.786730410447761, "grad_norm": 0.4674194498301845, "learning_rate": 1.1863558587227256e-05, "loss": 0.3152, "step": 32475 }, { "epoch": 3.7873134328358207, "grad_norm": 0.4513428447564988, "learning_rate": 1.1857321162666692e-05, "loss": 0.3341, "step": 32480 }, { "epoch": 3.7878964552238807, "grad_norm": 0.42564313689155603, "learning_rate": 1.1851086064022596e-05, "loss": 0.329, "step": 32485 }, { "epoch": 3.7884794776119404, "grad_norm": 0.43746810166454736, "learning_rate": 1.1844853292222066e-05, "loss": 0.3249, "step": 32490 }, { "epoch": 3.7890625, "grad_norm": 0.45328960422956577, "learning_rate": 1.1838622848191857e-05, "loss": 0.331, "step": 32495 }, { "epoch": 3.7896455223880596, "grad_norm": 0.44084270016519495, "learning_rate": 1.1832394732858377e-05, "loss": 0.2871, "step": 32500 }, { "epoch": 3.7902285447761193, "grad_norm": 0.47713810295263487, "learning_rate": 1.1826168947147693e-05, "loss": 0.3416, "step": 32505 }, { "epoch": 3.7908115671641793, "grad_norm": 0.4655338329444578, "learning_rate": 1.1819945491985504e-05, "loss": 0.3449, "step": 32510 }, { "epoch": 3.791394589552239, "grad_norm": 0.488389264161232, "learning_rate": 1.1813724368297187e-05, "loss": 0.3268, "step": 32515 }, { "epoch": 3.7919776119402986, "grad_norm": 0.46751089493196396, "learning_rate": 1.1807505577007765e-05, "loss": 0.3333, "step": 32520 }, { "epoch": 3.792560634328358, "grad_norm": 0.48137074011958036, "learning_rate": 1.1801289119041909e-05, "loss": 0.3356, "step": 32525 }, { "epoch": 3.793143656716418, "grad_norm": 0.4797731430916417, "learning_rate": 1.179507499532395e-05, "loss": 0.3448, "step": 32530 }, { "epoch": 3.7937266791044775, "grad_norm": 0.48443341575171117, "learning_rate": 1.1788863206777875e-05, "loss": 0.3379, "step": 32535 }, { "epoch": 3.794309701492537, "grad_norm": 0.43114955714166386, "learning_rate": 1.1782653754327295e-05, "loss": 0.3249, "step": 32540 }, { "epoch": 3.794892723880597, "grad_norm": 0.5020365866277783, "learning_rate": 1.1776446638895523e-05, "loss": 0.3399, "step": 32545 }, { "epoch": 3.795475746268657, "grad_norm": 0.45577173307467767, "learning_rate": 1.1770241861405475e-05, "loss": 0.3126, "step": 32550 }, { "epoch": 3.7960587686567164, "grad_norm": 0.4358709337137338, "learning_rate": 1.1764039422779765e-05, "loss": 0.3233, "step": 32555 }, { "epoch": 3.796641791044776, "grad_norm": 0.4940993977369946, "learning_rate": 1.1757839323940616e-05, "loss": 0.331, "step": 32560 }, { "epoch": 3.7972248134328357, "grad_norm": 0.4909816286910687, "learning_rate": 1.1751641565809931e-05, "loss": 0.3388, "step": 32565 }, { "epoch": 3.7978078358208958, "grad_norm": 0.4524352131225283, "learning_rate": 1.1745446149309257e-05, "loss": 0.3031, "step": 32570 }, { "epoch": 3.7983908582089554, "grad_norm": 0.4580994040900444, "learning_rate": 1.1739253075359796e-05, "loss": 0.3296, "step": 32575 }, { "epoch": 3.798973880597015, "grad_norm": 0.4580068147597908, "learning_rate": 1.1733062344882396e-05, "loss": 0.3531, "step": 32580 }, { "epoch": 3.7995569029850746, "grad_norm": 0.46620247870181725, "learning_rate": 1.1726873958797565e-05, "loss": 0.3321, "step": 32585 }, { "epoch": 3.8001399253731343, "grad_norm": 0.5971353874873305, "learning_rate": 1.1720687918025434e-05, "loss": 0.3126, "step": 32590 }, { "epoch": 3.800722947761194, "grad_norm": 0.4390347678214194, "learning_rate": 1.1714504223485844e-05, "loss": 0.3202, "step": 32595 }, { "epoch": 3.8013059701492535, "grad_norm": 0.48452309368607444, "learning_rate": 1.1708322876098215e-05, "loss": 0.3277, "step": 32600 }, { "epoch": 3.8018889925373136, "grad_norm": 0.4593093720055001, "learning_rate": 1.1702143876781687e-05, "loss": 0.3377, "step": 32605 }, { "epoch": 3.8024720149253732, "grad_norm": 0.42618677367571434, "learning_rate": 1.1695967226454996e-05, "loss": 0.3168, "step": 32610 }, { "epoch": 3.803055037313433, "grad_norm": 0.4414796121902606, "learning_rate": 1.1689792926036555e-05, "loss": 0.3271, "step": 32615 }, { "epoch": 3.8036380597014925, "grad_norm": 0.4733229542104905, "learning_rate": 1.1683620976444426e-05, "loss": 0.3463, "step": 32620 }, { "epoch": 3.804221082089552, "grad_norm": 0.46288076213503515, "learning_rate": 1.1677451378596318e-05, "loss": 0.3546, "step": 32625 }, { "epoch": 3.804804104477612, "grad_norm": 0.44912203881342816, "learning_rate": 1.1671284133409592e-05, "loss": 0.3125, "step": 32630 }, { "epoch": 3.8053871268656714, "grad_norm": 0.4297942344755935, "learning_rate": 1.1665119241801257e-05, "loss": 0.3278, "step": 32635 }, { "epoch": 3.8059701492537314, "grad_norm": 0.41257148676184574, "learning_rate": 1.1658956704687974e-05, "loss": 0.3151, "step": 32640 }, { "epoch": 3.806553171641791, "grad_norm": 0.454786923252004, "learning_rate": 1.165279652298606e-05, "loss": 0.3416, "step": 32645 }, { "epoch": 3.8071361940298507, "grad_norm": 0.4857749800750693, "learning_rate": 1.1646638697611462e-05, "loss": 0.3351, "step": 32650 }, { "epoch": 3.8077192164179103, "grad_norm": 0.45067500520051373, "learning_rate": 1.1640483229479794e-05, "loss": 0.3439, "step": 32655 }, { "epoch": 3.80830223880597, "grad_norm": 0.4823448438799639, "learning_rate": 1.1634330119506317e-05, "loss": 0.3296, "step": 32660 }, { "epoch": 3.80888526119403, "grad_norm": 0.47307038382014316, "learning_rate": 1.162817936860594e-05, "loss": 0.3521, "step": 32665 }, { "epoch": 3.8094682835820897, "grad_norm": 0.44750460757509564, "learning_rate": 1.1622030977693221e-05, "loss": 0.3021, "step": 32670 }, { "epoch": 3.8100513059701493, "grad_norm": 0.4693317663106042, "learning_rate": 1.1615884947682364e-05, "loss": 0.34, "step": 32675 }, { "epoch": 3.810634328358209, "grad_norm": 0.4615728105995682, "learning_rate": 1.1609741279487236e-05, "loss": 0.3472, "step": 32680 }, { "epoch": 3.8112173507462686, "grad_norm": 0.4486343742363925, "learning_rate": 1.1603599974021317e-05, "loss": 0.3005, "step": 32685 }, { "epoch": 3.8118003731343286, "grad_norm": 0.5027177392080457, "learning_rate": 1.1597461032197788e-05, "loss": 0.3401, "step": 32690 }, { "epoch": 3.812383395522388, "grad_norm": 0.44719318802271446, "learning_rate": 1.1591324454929433e-05, "loss": 0.3341, "step": 32695 }, { "epoch": 3.812966417910448, "grad_norm": 0.45142182524978125, "learning_rate": 1.1585190243128707e-05, "loss": 0.3241, "step": 32700 }, { "epoch": 3.8135494402985075, "grad_norm": 0.4769584469070269, "learning_rate": 1.1579058397707707e-05, "loss": 0.3164, "step": 32705 }, { "epoch": 3.814132462686567, "grad_norm": 0.4761466214469882, "learning_rate": 1.1572928919578186e-05, "loss": 0.3173, "step": 32710 }, { "epoch": 3.8147154850746268, "grad_norm": 0.43987912155833664, "learning_rate": 1.1566801809651532e-05, "loss": 0.3259, "step": 32715 }, { "epoch": 3.8152985074626864, "grad_norm": 0.44197945084467494, "learning_rate": 1.15606770688388e-05, "loss": 0.3343, "step": 32720 }, { "epoch": 3.8158815298507465, "grad_norm": 0.490769456821821, "learning_rate": 1.1554554698050652e-05, "loss": 0.3221, "step": 32725 }, { "epoch": 3.816464552238806, "grad_norm": 0.4370166373735806, "learning_rate": 1.154843469819746e-05, "loss": 0.3146, "step": 32730 }, { "epoch": 3.8170475746268657, "grad_norm": 0.4094388795690992, "learning_rate": 1.1542317070189181e-05, "loss": 0.3257, "step": 32735 }, { "epoch": 3.8176305970149254, "grad_norm": 0.5294554908674985, "learning_rate": 1.1536201814935473e-05, "loss": 0.2999, "step": 32740 }, { "epoch": 3.818213619402985, "grad_norm": 0.3973754804614025, "learning_rate": 1.1530088933345595e-05, "loss": 0.3059, "step": 32745 }, { "epoch": 3.8187966417910446, "grad_norm": 0.4460908479567948, "learning_rate": 1.152397842632848e-05, "loss": 0.3399, "step": 32750 }, { "epoch": 3.8193796641791042, "grad_norm": 0.45097839645187715, "learning_rate": 1.1517870294792709e-05, "loss": 0.3293, "step": 32755 }, { "epoch": 3.8199626865671643, "grad_norm": 0.4501929828562514, "learning_rate": 1.1511764539646494e-05, "loss": 0.3331, "step": 32760 }, { "epoch": 3.820545708955224, "grad_norm": 0.4679727027485936, "learning_rate": 1.1505661161797707e-05, "loss": 0.315, "step": 32765 }, { "epoch": 3.8211287313432836, "grad_norm": 0.47639889209726183, "learning_rate": 1.1499560162153866e-05, "loss": 0.3297, "step": 32770 }, { "epoch": 3.821711753731343, "grad_norm": 0.5031297037643795, "learning_rate": 1.1493461541622114e-05, "loss": 0.3245, "step": 32775 }, { "epoch": 3.822294776119403, "grad_norm": 0.5005275814073943, "learning_rate": 1.1487365301109281e-05, "loss": 0.3494, "step": 32780 }, { "epoch": 3.822877798507463, "grad_norm": 0.4678153501476471, "learning_rate": 1.1481271441521796e-05, "loss": 0.3315, "step": 32785 }, { "epoch": 3.8234608208955225, "grad_norm": 0.47138667915063925, "learning_rate": 1.147517996376578e-05, "loss": 0.3147, "step": 32790 }, { "epoch": 3.824043843283582, "grad_norm": 0.4446152429445292, "learning_rate": 1.1469090868746961e-05, "loss": 0.3107, "step": 32795 }, { "epoch": 3.824626865671642, "grad_norm": 0.4628791326023051, "learning_rate": 1.1463004157370735e-05, "loss": 0.3368, "step": 32800 }, { "epoch": 3.8252098880597014, "grad_norm": 0.47672171053911006, "learning_rate": 1.1456919830542138e-05, "loss": 0.3302, "step": 32805 }, { "epoch": 3.825792910447761, "grad_norm": 0.4833603423512201, "learning_rate": 1.1450837889165852e-05, "loss": 0.3266, "step": 32810 }, { "epoch": 3.8263759328358207, "grad_norm": 0.5082904135940178, "learning_rate": 1.14447583341462e-05, "loss": 0.3489, "step": 32815 }, { "epoch": 3.8269589552238807, "grad_norm": 0.4746803078733811, "learning_rate": 1.1438681166387162e-05, "loss": 0.3418, "step": 32820 }, { "epoch": 3.8275419776119404, "grad_norm": 0.46944595952476803, "learning_rate": 1.1432606386792341e-05, "loss": 0.3126, "step": 32825 }, { "epoch": 3.828125, "grad_norm": 0.45320397573807447, "learning_rate": 1.1426533996265008e-05, "loss": 0.3284, "step": 32830 }, { "epoch": 3.8287080223880596, "grad_norm": 0.4815212010253675, "learning_rate": 1.1420463995708064e-05, "loss": 0.3301, "step": 32835 }, { "epoch": 3.8292910447761193, "grad_norm": 0.4455388520982315, "learning_rate": 1.1414396386024064e-05, "loss": 0.2994, "step": 32840 }, { "epoch": 3.8298740671641793, "grad_norm": 0.5022441632403734, "learning_rate": 1.1408331168115203e-05, "loss": 0.3471, "step": 32845 }, { "epoch": 3.830457089552239, "grad_norm": 0.487466620138546, "learning_rate": 1.140226834288332e-05, "loss": 0.3252, "step": 32850 }, { "epoch": 3.8310401119402986, "grad_norm": 0.4577422473409993, "learning_rate": 1.1396207911229903e-05, "loss": 0.3442, "step": 32855 }, { "epoch": 3.831623134328358, "grad_norm": 0.4097207699401092, "learning_rate": 1.1390149874056065e-05, "loss": 0.3358, "step": 32860 }, { "epoch": 3.832206156716418, "grad_norm": 0.4877820747731223, "learning_rate": 1.1384094232262602e-05, "loss": 0.3215, "step": 32865 }, { "epoch": 3.8327891791044775, "grad_norm": 0.4833693289223179, "learning_rate": 1.1378040986749912e-05, "loss": 0.3364, "step": 32870 }, { "epoch": 3.833372201492537, "grad_norm": 0.461400786860475, "learning_rate": 1.137199013841806e-05, "loss": 0.3283, "step": 32875 }, { "epoch": 3.833955223880597, "grad_norm": 0.47158608400881774, "learning_rate": 1.1365941688166747e-05, "loss": 0.3344, "step": 32880 }, { "epoch": 3.834538246268657, "grad_norm": 0.4680783563143886, "learning_rate": 1.1359895636895324e-05, "loss": 0.3336, "step": 32885 }, { "epoch": 3.8351212686567164, "grad_norm": 0.45279641518224417, "learning_rate": 1.1353851985502777e-05, "loss": 0.3208, "step": 32890 }, { "epoch": 3.835704291044776, "grad_norm": 0.4574662744031216, "learning_rate": 1.1347810734887747e-05, "loss": 0.3305, "step": 32895 }, { "epoch": 3.8362873134328357, "grad_norm": 0.4328061385653668, "learning_rate": 1.134177188594849e-05, "loss": 0.3209, "step": 32900 }, { "epoch": 3.8368703358208958, "grad_norm": 0.4886698936922826, "learning_rate": 1.1335735439582952e-05, "loss": 0.3554, "step": 32905 }, { "epoch": 3.8374533582089554, "grad_norm": 0.48878378205522216, "learning_rate": 1.1329701396688669e-05, "loss": 0.3381, "step": 32910 }, { "epoch": 3.838036380597015, "grad_norm": 0.4803671045616649, "learning_rate": 1.132366975816287e-05, "loss": 0.3268, "step": 32915 }, { "epoch": 3.8386194029850746, "grad_norm": 0.503144613393508, "learning_rate": 1.1317640524902383e-05, "loss": 0.3464, "step": 32920 }, { "epoch": 3.8392024253731343, "grad_norm": 0.4587130891790112, "learning_rate": 1.1311613697803703e-05, "loss": 0.3239, "step": 32925 }, { "epoch": 3.839785447761194, "grad_norm": 0.4539351130193337, "learning_rate": 1.1305589277762965e-05, "loss": 0.3472, "step": 32930 }, { "epoch": 3.8403684701492535, "grad_norm": 0.450259232588213, "learning_rate": 1.1299567265675939e-05, "loss": 0.3076, "step": 32935 }, { "epoch": 3.8409514925373136, "grad_norm": 0.42848948841402396, "learning_rate": 1.129354766243804e-05, "loss": 0.3174, "step": 32940 }, { "epoch": 3.8415345149253732, "grad_norm": 0.4619131535601183, "learning_rate": 1.1287530468944332e-05, "loss": 0.3107, "step": 32945 }, { "epoch": 3.842117537313433, "grad_norm": 0.5382638189612068, "learning_rate": 1.1281515686089497e-05, "loss": 0.3219, "step": 32950 }, { "epoch": 3.8427005597014925, "grad_norm": 0.4759530067107466, "learning_rate": 1.1275503314767901e-05, "loss": 0.3239, "step": 32955 }, { "epoch": 3.843283582089552, "grad_norm": 0.46973304991536674, "learning_rate": 1.1269493355873498e-05, "loss": 0.3408, "step": 32960 }, { "epoch": 3.843866604477612, "grad_norm": 0.44085510836492364, "learning_rate": 1.126348581029994e-05, "loss": 0.3293, "step": 32965 }, { "epoch": 3.8444496268656714, "grad_norm": 0.449990620750623, "learning_rate": 1.1257480678940469e-05, "loss": 0.3026, "step": 32970 }, { "epoch": 3.8450326492537314, "grad_norm": 0.4514383287544397, "learning_rate": 1.1251477962687998e-05, "loss": 0.3115, "step": 32975 }, { "epoch": 3.845615671641791, "grad_norm": 0.43436002421258607, "learning_rate": 1.1245477662435076e-05, "loss": 0.3152, "step": 32980 }, { "epoch": 3.8461986940298507, "grad_norm": 0.48597649213632177, "learning_rate": 1.1239479779073885e-05, "loss": 0.3169, "step": 32985 }, { "epoch": 3.8467817164179103, "grad_norm": 0.44564284318614067, "learning_rate": 1.123348431349626e-05, "loss": 0.3332, "step": 32990 }, { "epoch": 3.84736473880597, "grad_norm": 0.4102286231617167, "learning_rate": 1.1227491266593668e-05, "loss": 0.3261, "step": 32995 }, { "epoch": 3.84794776119403, "grad_norm": 0.4651315474062523, "learning_rate": 1.1221500639257204e-05, "loss": 0.3179, "step": 33000 }, { "epoch": 3.8485307835820897, "grad_norm": 0.4872571147764733, "learning_rate": 1.121551243237764e-05, "loss": 0.3287, "step": 33005 }, { "epoch": 3.8491138059701493, "grad_norm": 0.4675344374427744, "learning_rate": 1.1209526646845346e-05, "loss": 0.3345, "step": 33010 }, { "epoch": 3.849696828358209, "grad_norm": 0.4400261379252843, "learning_rate": 1.1203543283550355e-05, "loss": 0.3108, "step": 33015 }, { "epoch": 3.8502798507462686, "grad_norm": 0.4430965742244254, "learning_rate": 1.1197562343382341e-05, "loss": 0.3395, "step": 33020 }, { "epoch": 3.8508628731343286, "grad_norm": 0.45735532099931, "learning_rate": 1.119158382723061e-05, "loss": 0.3278, "step": 33025 }, { "epoch": 3.851445895522388, "grad_norm": 0.4687951359586848, "learning_rate": 1.118560773598411e-05, "loss": 0.3231, "step": 33030 }, { "epoch": 3.852028917910448, "grad_norm": 0.46140836800880514, "learning_rate": 1.1179634070531427e-05, "loss": 0.3398, "step": 33035 }, { "epoch": 3.8526119402985075, "grad_norm": 0.4615435808577384, "learning_rate": 1.1173662831760798e-05, "loss": 0.336, "step": 33040 }, { "epoch": 3.853194962686567, "grad_norm": 0.498488469657942, "learning_rate": 1.1167694020560071e-05, "loss": 0.3461, "step": 33045 }, { "epoch": 3.8537779850746268, "grad_norm": 0.5473689760854433, "learning_rate": 1.1161727637816762e-05, "loss": 0.3294, "step": 33050 }, { "epoch": 3.8543610074626864, "grad_norm": 0.4668962224968966, "learning_rate": 1.1155763684418013e-05, "loss": 0.3426, "step": 33055 }, { "epoch": 3.8549440298507465, "grad_norm": 0.4405721965189786, "learning_rate": 1.1149802161250607e-05, "loss": 0.3221, "step": 33060 }, { "epoch": 3.855527052238806, "grad_norm": 0.47858635454662596, "learning_rate": 1.1143843069200965e-05, "loss": 0.323, "step": 33065 }, { "epoch": 3.8561100746268657, "grad_norm": 0.43614195310448467, "learning_rate": 1.1137886409155158e-05, "loss": 0.3183, "step": 33070 }, { "epoch": 3.8566930970149254, "grad_norm": 0.5362034447270819, "learning_rate": 1.1131932181998856e-05, "loss": 0.3537, "step": 33075 }, { "epoch": 3.857276119402985, "grad_norm": 0.4455931691239378, "learning_rate": 1.1125980388617425e-05, "loss": 0.3312, "step": 33080 }, { "epoch": 3.8578591417910446, "grad_norm": 0.46523903730440463, "learning_rate": 1.1120031029895816e-05, "loss": 0.3251, "step": 33085 }, { "epoch": 3.8584421641791042, "grad_norm": 0.5282719718396226, "learning_rate": 1.1114084106718667e-05, "loss": 0.3405, "step": 33090 }, { "epoch": 3.8590251865671643, "grad_norm": 0.45428995715896703, "learning_rate": 1.1108139619970207e-05, "loss": 0.3399, "step": 33095 }, { "epoch": 3.859608208955224, "grad_norm": 0.4469221003103343, "learning_rate": 1.1102197570534334e-05, "loss": 0.3184, "step": 33100 }, { "epoch": 3.8601912313432836, "grad_norm": 0.4305500784793551, "learning_rate": 1.1096257959294572e-05, "loss": 0.3296, "step": 33105 }, { "epoch": 3.860774253731343, "grad_norm": 0.49759181520564827, "learning_rate": 1.1090320787134085e-05, "loss": 0.3293, "step": 33110 }, { "epoch": 3.861357276119403, "grad_norm": 0.48592985818559, "learning_rate": 1.1084386054935669e-05, "loss": 0.3186, "step": 33115 }, { "epoch": 3.861940298507463, "grad_norm": 0.45724255869740565, "learning_rate": 1.1078453763581776e-05, "loss": 0.321, "step": 33120 }, { "epoch": 3.8625233208955225, "grad_norm": 0.459645642938318, "learning_rate": 1.1072523913954455e-05, "loss": 0.3245, "step": 33125 }, { "epoch": 3.863106343283582, "grad_norm": 0.5223399497373401, "learning_rate": 1.1066596506935447e-05, "loss": 0.3346, "step": 33130 }, { "epoch": 3.863689365671642, "grad_norm": 0.47712012114642316, "learning_rate": 1.1060671543406074e-05, "loss": 0.327, "step": 33135 }, { "epoch": 3.8642723880597014, "grad_norm": 0.48271037227275715, "learning_rate": 1.1054749024247348e-05, "loss": 0.3324, "step": 33140 }, { "epoch": 3.864855410447761, "grad_norm": 0.4315080688638868, "learning_rate": 1.1048828950339867e-05, "loss": 0.3079, "step": 33145 }, { "epoch": 3.8654384328358207, "grad_norm": 0.44450042431301534, "learning_rate": 1.1042911322563903e-05, "loss": 0.3259, "step": 33150 }, { "epoch": 3.8660214552238807, "grad_norm": 0.4503506105615425, "learning_rate": 1.1036996141799347e-05, "loss": 0.3399, "step": 33155 }, { "epoch": 3.8666044776119404, "grad_norm": 0.4775715826622604, "learning_rate": 1.103108340892573e-05, "loss": 0.3366, "step": 33160 }, { "epoch": 3.8671875, "grad_norm": 0.4747788035082357, "learning_rate": 1.1025173124822213e-05, "loss": 0.3184, "step": 33165 }, { "epoch": 3.8677705223880596, "grad_norm": 0.46296396496129416, "learning_rate": 1.1019265290367616e-05, "loss": 0.3401, "step": 33170 }, { "epoch": 3.8683535447761193, "grad_norm": 0.430501096671592, "learning_rate": 1.1013359906440353e-05, "loss": 0.3008, "step": 33175 }, { "epoch": 3.8689365671641793, "grad_norm": 0.437483851632213, "learning_rate": 1.100745697391852e-05, "loss": 0.3122, "step": 33180 }, { "epoch": 3.869519589552239, "grad_norm": 0.4713102482347539, "learning_rate": 1.1001556493679812e-05, "loss": 0.3223, "step": 33185 }, { "epoch": 3.8701026119402986, "grad_norm": 0.4496255994957661, "learning_rate": 1.099565846660158e-05, "loss": 0.3342, "step": 33190 }, { "epoch": 3.870685634328358, "grad_norm": 0.48147533231731876, "learning_rate": 1.0989762893560798e-05, "loss": 0.3372, "step": 33195 }, { "epoch": 3.871268656716418, "grad_norm": 0.44251577833344247, "learning_rate": 1.0983869775434091e-05, "loss": 0.3323, "step": 33200 }, { "epoch": 3.8718516791044775, "grad_norm": 0.48410716018750416, "learning_rate": 1.0977979113097702e-05, "loss": 0.324, "step": 33205 }, { "epoch": 3.872434701492537, "grad_norm": 0.44439583909512353, "learning_rate": 1.097209090742752e-05, "loss": 0.3061, "step": 33210 }, { "epoch": 3.873017723880597, "grad_norm": 0.47159658683048755, "learning_rate": 1.096620515929907e-05, "loss": 0.3182, "step": 33215 }, { "epoch": 3.873600746268657, "grad_norm": 0.4701875766362371, "learning_rate": 1.096032186958749e-05, "loss": 0.33, "step": 33220 }, { "epoch": 3.8741837686567164, "grad_norm": 0.48776139257811485, "learning_rate": 1.095444103916758e-05, "loss": 0.3306, "step": 33225 }, { "epoch": 3.874766791044776, "grad_norm": 0.48917754383258233, "learning_rate": 1.0948562668913763e-05, "loss": 0.3315, "step": 33230 }, { "epoch": 3.8753498134328357, "grad_norm": 0.42845203482849836, "learning_rate": 1.0942686759700092e-05, "loss": 0.3361, "step": 33235 }, { "epoch": 3.8759328358208958, "grad_norm": 0.4521128584339785, "learning_rate": 1.0936813312400263e-05, "loss": 0.3432, "step": 33240 }, { "epoch": 3.8765158582089554, "grad_norm": 0.46657270299240866, "learning_rate": 1.0930942327887605e-05, "loss": 0.3333, "step": 33245 }, { "epoch": 3.877098880597015, "grad_norm": 0.42492942434563746, "learning_rate": 1.092507380703506e-05, "loss": 0.3254, "step": 33250 }, { "epoch": 3.8776819029850746, "grad_norm": 0.44468334518498764, "learning_rate": 1.0919207750715243e-05, "loss": 0.314, "step": 33255 }, { "epoch": 3.8782649253731343, "grad_norm": 0.4651406759628273, "learning_rate": 1.091334415980036e-05, "loss": 0.3153, "step": 33260 }, { "epoch": 3.878847947761194, "grad_norm": 0.4458658650524649, "learning_rate": 1.0907483035162291e-05, "loss": 0.3217, "step": 33265 }, { "epoch": 3.8794309701492535, "grad_norm": 0.4663762736179635, "learning_rate": 1.0901624377672513e-05, "loss": 0.3061, "step": 33270 }, { "epoch": 3.8800139925373136, "grad_norm": 0.4238179445095949, "learning_rate": 1.0895768188202158e-05, "loss": 0.3382, "step": 33275 }, { "epoch": 3.8805970149253732, "grad_norm": 0.47716965895054936, "learning_rate": 1.0889914467621986e-05, "loss": 0.3275, "step": 33280 }, { "epoch": 3.881180037313433, "grad_norm": 0.45739653512658146, "learning_rate": 1.0884063216802388e-05, "loss": 0.3217, "step": 33285 }, { "epoch": 3.8817630597014925, "grad_norm": 0.48181114217634596, "learning_rate": 1.0878214436613387e-05, "loss": 0.3087, "step": 33290 }, { "epoch": 3.882346082089552, "grad_norm": 0.4721078496089997, "learning_rate": 1.0872368127924654e-05, "loss": 0.3287, "step": 33295 }, { "epoch": 3.882929104477612, "grad_norm": 0.474256957024035, "learning_rate": 1.0866524291605452e-05, "loss": 0.3301, "step": 33300 }, { "epoch": 3.8835121268656714, "grad_norm": 0.43908918833917066, "learning_rate": 1.0860682928524732e-05, "loss": 0.3148, "step": 33305 }, { "epoch": 3.8840951492537314, "grad_norm": 0.4688668078959364, "learning_rate": 1.0854844039551023e-05, "loss": 0.3072, "step": 33310 }, { "epoch": 3.884678171641791, "grad_norm": 0.49775451058081527, "learning_rate": 1.0849007625552539e-05, "loss": 0.311, "step": 33315 }, { "epoch": 3.8852611940298507, "grad_norm": 0.4565491640661001, "learning_rate": 1.0843173687397079e-05, "loss": 0.3254, "step": 33320 }, { "epoch": 3.8858442164179103, "grad_norm": 0.5054496025514279, "learning_rate": 1.0837342225952097e-05, "loss": 0.3444, "step": 33325 }, { "epoch": 3.88642723880597, "grad_norm": 0.4641649546547003, "learning_rate": 1.0831513242084681e-05, "loss": 0.3271, "step": 33330 }, { "epoch": 3.88701026119403, "grad_norm": 0.43977199230377334, "learning_rate": 1.0825686736661541e-05, "loss": 0.3058, "step": 33335 }, { "epoch": 3.8875932835820897, "grad_norm": 0.43046203770627206, "learning_rate": 1.0819862710549025e-05, "loss": 0.3295, "step": 33340 }, { "epoch": 3.8881763059701493, "grad_norm": 0.4502064285248263, "learning_rate": 1.0814041164613107e-05, "loss": 0.3107, "step": 33345 }, { "epoch": 3.888759328358209, "grad_norm": 0.49018547403922796, "learning_rate": 1.0808222099719396e-05, "loss": 0.33, "step": 33350 }, { "epoch": 3.8893423507462686, "grad_norm": 0.44719309919402267, "learning_rate": 1.0802405516733138e-05, "loss": 0.3327, "step": 33355 }, { "epoch": 3.8899253731343286, "grad_norm": 0.4349662933456471, "learning_rate": 1.0796591416519192e-05, "loss": 0.3073, "step": 33360 }, { "epoch": 3.890508395522388, "grad_norm": 0.4769310745163692, "learning_rate": 1.0790779799942063e-05, "loss": 0.3294, "step": 33365 }, { "epoch": 3.891091417910448, "grad_norm": 0.4633243435475389, "learning_rate": 1.0784970667865882e-05, "loss": 0.3389, "step": 33370 }, { "epoch": 3.8916744402985075, "grad_norm": 0.4348569876234952, "learning_rate": 1.0779164021154417e-05, "loss": 0.3206, "step": 33375 }, { "epoch": 3.892257462686567, "grad_norm": 0.4508919294527058, "learning_rate": 1.0773359860671054e-05, "loss": 0.3236, "step": 33380 }, { "epoch": 3.8928404850746268, "grad_norm": 0.4176659453803, "learning_rate": 1.0767558187278817e-05, "loss": 0.3059, "step": 33385 }, { "epoch": 3.8934235074626864, "grad_norm": 0.4936745756161462, "learning_rate": 1.0761759001840371e-05, "loss": 0.3656, "step": 33390 }, { "epoch": 3.8940065298507465, "grad_norm": 0.4864860346779509, "learning_rate": 1.0755962305217973e-05, "loss": 0.3161, "step": 33395 }, { "epoch": 3.894589552238806, "grad_norm": 0.47301983061706776, "learning_rate": 1.0750168098273569e-05, "loss": 0.3316, "step": 33400 }, { "epoch": 3.8951725746268657, "grad_norm": 0.44945043215503067, "learning_rate": 1.074437638186868e-05, "loss": 0.3235, "step": 33405 }, { "epoch": 3.8957555970149254, "grad_norm": 0.5170131988652965, "learning_rate": 1.073858715686448e-05, "loss": 0.3535, "step": 33410 }, { "epoch": 3.896338619402985, "grad_norm": 0.44865396114012535, "learning_rate": 1.0732800424121779e-05, "loss": 0.3252, "step": 33415 }, { "epoch": 3.8969216417910446, "grad_norm": 0.49849390150774087, "learning_rate": 1.0727016184501e-05, "loss": 0.3494, "step": 33420 }, { "epoch": 3.8975046641791042, "grad_norm": 0.4575638153722119, "learning_rate": 1.0721234438862213e-05, "loss": 0.3244, "step": 33425 }, { "epoch": 3.8980876865671643, "grad_norm": 0.4972029790663982, "learning_rate": 1.0715455188065112e-05, "loss": 0.3216, "step": 33430 }, { "epoch": 3.898670708955224, "grad_norm": 0.4997182569608143, "learning_rate": 1.0709678432968995e-05, "loss": 0.3526, "step": 33435 }, { "epoch": 3.8992537313432836, "grad_norm": 0.4745691563520591, "learning_rate": 1.0703904174432836e-05, "loss": 0.3265, "step": 33440 }, { "epoch": 3.899836753731343, "grad_norm": 0.5168420977177586, "learning_rate": 1.0698132413315188e-05, "loss": 0.3311, "step": 33445 }, { "epoch": 3.900419776119403, "grad_norm": 0.4455063996811109, "learning_rate": 1.069236315047428e-05, "loss": 0.3419, "step": 33450 }, { "epoch": 3.901002798507463, "grad_norm": 0.47340676730818415, "learning_rate": 1.0686596386767928e-05, "loss": 0.3333, "step": 33455 }, { "epoch": 3.9015858208955225, "grad_norm": 0.4809870026301231, "learning_rate": 1.0680832123053603e-05, "loss": 0.3274, "step": 33460 }, { "epoch": 3.902168843283582, "grad_norm": 0.4465458990605889, "learning_rate": 1.067507036018839e-05, "loss": 0.3447, "step": 33465 }, { "epoch": 3.902751865671642, "grad_norm": 0.4465058729904804, "learning_rate": 1.0669311099029014e-05, "loss": 0.3475, "step": 33470 }, { "epoch": 3.9033348880597014, "grad_norm": 0.5116129137143816, "learning_rate": 1.066355434043182e-05, "loss": 0.3439, "step": 33475 }, { "epoch": 3.903917910447761, "grad_norm": 0.4717166940202731, "learning_rate": 1.0657800085252789e-05, "loss": 0.3349, "step": 33480 }, { "epoch": 3.9045009328358207, "grad_norm": 0.49214369973716376, "learning_rate": 1.0652048334347503e-05, "loss": 0.3384, "step": 33485 }, { "epoch": 3.9050839552238807, "grad_norm": 0.4754204891436952, "learning_rate": 1.064629908857122e-05, "loss": 0.3252, "step": 33490 }, { "epoch": 3.9056669776119404, "grad_norm": 0.4368586840816342, "learning_rate": 1.0640552348778772e-05, "loss": 0.3375, "step": 33495 }, { "epoch": 3.90625, "grad_norm": 0.4529978997743157, "learning_rate": 1.0634808115824668e-05, "loss": 0.3234, "step": 33500 }, { "epoch": 3.9068330223880596, "grad_norm": 0.49149399879697336, "learning_rate": 1.0629066390563002e-05, "loss": 0.321, "step": 33505 }, { "epoch": 3.9074160447761193, "grad_norm": 0.4509824332176206, "learning_rate": 1.062332717384752e-05, "loss": 0.3581, "step": 33510 }, { "epoch": 3.9079990671641793, "grad_norm": 0.4537482827730715, "learning_rate": 1.061759046653159e-05, "loss": 0.3393, "step": 33515 }, { "epoch": 3.908582089552239, "grad_norm": 0.5105268910887601, "learning_rate": 1.0611856269468203e-05, "loss": 0.3447, "step": 33520 }, { "epoch": 3.9091651119402986, "grad_norm": 0.4901727461174683, "learning_rate": 1.0606124583509983e-05, "loss": 0.3255, "step": 33525 }, { "epoch": 3.909748134328358, "grad_norm": 0.4340808643488463, "learning_rate": 1.0600395409509177e-05, "loss": 0.3421, "step": 33530 }, { "epoch": 3.910331156716418, "grad_norm": 0.484371758609476, "learning_rate": 1.0594668748317643e-05, "loss": 0.3099, "step": 33535 }, { "epoch": 3.9109141791044775, "grad_norm": 0.4553888233254177, "learning_rate": 1.0588944600786907e-05, "loss": 0.2952, "step": 33540 }, { "epoch": 3.911497201492537, "grad_norm": 0.482461618635314, "learning_rate": 1.0583222967768076e-05, "loss": 0.3072, "step": 33545 }, { "epoch": 3.912080223880597, "grad_norm": 0.4991090354947881, "learning_rate": 1.0577503850111903e-05, "loss": 0.3241, "step": 33550 }, { "epoch": 3.912663246268657, "grad_norm": 0.432300133663098, "learning_rate": 1.0571787248668774e-05, "loss": 0.3403, "step": 33555 }, { "epoch": 3.9132462686567164, "grad_norm": 0.48530227575772117, "learning_rate": 1.0566073164288687e-05, "loss": 0.3301, "step": 33560 }, { "epoch": 3.913829291044776, "grad_norm": 0.4873409308216426, "learning_rate": 1.0560361597821273e-05, "loss": 0.3247, "step": 33565 }, { "epoch": 3.9144123134328357, "grad_norm": 0.4080850363735003, "learning_rate": 1.0554652550115788e-05, "loss": 0.3137, "step": 33570 }, { "epoch": 3.9149953358208958, "grad_norm": 0.4453352465572526, "learning_rate": 1.054894602202112e-05, "loss": 0.3084, "step": 33575 }, { "epoch": 3.9155783582089554, "grad_norm": 0.46849618867634657, "learning_rate": 1.0543242014385758e-05, "loss": 0.3101, "step": 33580 }, { "epoch": 3.916161380597015, "grad_norm": 0.501994941153192, "learning_rate": 1.0537540528057844e-05, "loss": 0.3325, "step": 33585 }, { "epoch": 3.9167444029850746, "grad_norm": 0.44566099017892913, "learning_rate": 1.0531841563885134e-05, "loss": 0.3099, "step": 33590 }, { "epoch": 3.9173274253731343, "grad_norm": 0.5558749333263552, "learning_rate": 1.0526145122715007e-05, "loss": 0.326, "step": 33595 }, { "epoch": 3.917910447761194, "grad_norm": 0.43950209811951646, "learning_rate": 1.052045120539447e-05, "loss": 0.3151, "step": 33600 }, { "epoch": 3.9184934701492535, "grad_norm": 0.5072366338265899, "learning_rate": 1.051475981277016e-05, "loss": 0.3456, "step": 33605 }, { "epoch": 3.9190764925373136, "grad_norm": 0.5196266090165022, "learning_rate": 1.050907094568832e-05, "loss": 0.3609, "step": 33610 }, { "epoch": 3.9196595149253732, "grad_norm": 0.4711784241295961, "learning_rate": 1.0503384604994846e-05, "loss": 0.323, "step": 33615 }, { "epoch": 3.920242537313433, "grad_norm": 0.4854892163180719, "learning_rate": 1.0497700791535221e-05, "loss": 0.3353, "step": 33620 }, { "epoch": 3.9208255597014925, "grad_norm": 0.5257714374863696, "learning_rate": 1.04920195061546e-05, "loss": 0.3431, "step": 33625 }, { "epoch": 3.921408582089552, "grad_norm": 0.5062598555312798, "learning_rate": 1.0486340749697716e-05, "loss": 0.3495, "step": 33630 }, { "epoch": 3.921991604477612, "grad_norm": 0.48392908035583404, "learning_rate": 1.0480664523008948e-05, "loss": 0.3264, "step": 33635 }, { "epoch": 3.9225746268656714, "grad_norm": 0.4650271530262864, "learning_rate": 1.0474990826932301e-05, "loss": 0.3253, "step": 33640 }, { "epoch": 3.9231576492537314, "grad_norm": 0.5065185766738228, "learning_rate": 1.0469319662311403e-05, "loss": 0.3442, "step": 33645 }, { "epoch": 3.923740671641791, "grad_norm": 0.4508854584980229, "learning_rate": 1.0463651029989492e-05, "loss": 0.335, "step": 33650 }, { "epoch": 3.9243236940298507, "grad_norm": 0.5485095643962583, "learning_rate": 1.0457984930809452e-05, "loss": 0.3683, "step": 33655 }, { "epoch": 3.9249067164179103, "grad_norm": 0.5218275787456073, "learning_rate": 1.0452321365613758e-05, "loss": 0.3506, "step": 33660 }, { "epoch": 3.92548973880597, "grad_norm": 0.4757228072173987, "learning_rate": 1.0446660335244551e-05, "loss": 0.328, "step": 33665 }, { "epoch": 3.92607276119403, "grad_norm": 0.49953106921690893, "learning_rate": 1.0441001840543548e-05, "loss": 0.3359, "step": 33670 }, { "epoch": 3.9266557835820897, "grad_norm": 0.453266974029994, "learning_rate": 1.0435345882352144e-05, "loss": 0.3173, "step": 33675 }, { "epoch": 3.9272388059701493, "grad_norm": 0.5206236913269021, "learning_rate": 1.0429692461511298e-05, "loss": 0.3386, "step": 33680 }, { "epoch": 3.927821828358209, "grad_norm": 0.47800687038688183, "learning_rate": 1.0424041578861626e-05, "loss": 0.322, "step": 33685 }, { "epoch": 3.9284048507462686, "grad_norm": 0.44966064859975596, "learning_rate": 1.041839323524337e-05, "loss": 0.3324, "step": 33690 }, { "epoch": 3.9289878731343286, "grad_norm": 0.48893228822481516, "learning_rate": 1.0412747431496372e-05, "loss": 0.3328, "step": 33695 }, { "epoch": 3.929570895522388, "grad_norm": 0.42419307008604823, "learning_rate": 1.0407104168460116e-05, "loss": 0.319, "step": 33700 }, { "epoch": 3.930153917910448, "grad_norm": 0.47660750708650085, "learning_rate": 1.0401463446973708e-05, "loss": 0.3419, "step": 33705 }, { "epoch": 3.9307369402985075, "grad_norm": 0.43585377683092885, "learning_rate": 1.0395825267875846e-05, "loss": 0.3083, "step": 33710 }, { "epoch": 3.931319962686567, "grad_norm": 0.4780579368292095, "learning_rate": 1.0390189632004905e-05, "loss": 0.3239, "step": 33715 }, { "epoch": 3.9319029850746268, "grad_norm": 0.44920571184233704, "learning_rate": 1.0384556540198825e-05, "loss": 0.3262, "step": 33720 }, { "epoch": 3.9324860074626864, "grad_norm": 0.47175046583080993, "learning_rate": 1.0378925993295202e-05, "loss": 0.3293, "step": 33725 }, { "epoch": 3.9330690298507465, "grad_norm": 0.46480283167238134, "learning_rate": 1.0373297992131242e-05, "loss": 0.3242, "step": 33730 }, { "epoch": 3.933652052238806, "grad_norm": 0.4175856958165451, "learning_rate": 1.0367672537543777e-05, "loss": 0.3083, "step": 33735 }, { "epoch": 3.9342350746268657, "grad_norm": 0.41018167877598666, "learning_rate": 1.0362049630369259e-05, "loss": 0.309, "step": 33740 }, { "epoch": 3.9348180970149254, "grad_norm": 0.4955778837967272, "learning_rate": 1.0356429271443757e-05, "loss": 0.3282, "step": 33745 }, { "epoch": 3.935401119402985, "grad_norm": 0.4265611463098694, "learning_rate": 1.0350811461602974e-05, "loss": 0.3255, "step": 33750 }, { "epoch": 3.9359841417910446, "grad_norm": 0.5561878417502005, "learning_rate": 1.0345196201682212e-05, "loss": 0.3307, "step": 33755 }, { "epoch": 3.9365671641791042, "grad_norm": 0.44307740143964736, "learning_rate": 1.033958349251641e-05, "loss": 0.3163, "step": 33760 }, { "epoch": 3.9371501865671643, "grad_norm": 0.5090739166525471, "learning_rate": 1.0333973334940125e-05, "loss": 0.3483, "step": 33765 }, { "epoch": 3.937733208955224, "grad_norm": 0.4525542709236959, "learning_rate": 1.0328365729787536e-05, "loss": 0.323, "step": 33770 }, { "epoch": 3.9383162313432836, "grad_norm": 0.46050542647327514, "learning_rate": 1.0322760677892437e-05, "loss": 0.3288, "step": 33775 }, { "epoch": 3.938899253731343, "grad_norm": 0.45907426668515966, "learning_rate": 1.0317158180088254e-05, "loss": 0.3445, "step": 33780 }, { "epoch": 3.939482276119403, "grad_norm": 0.5230425517412971, "learning_rate": 1.0311558237208006e-05, "loss": 0.3262, "step": 33785 }, { "epoch": 3.940065298507463, "grad_norm": 0.4715806732489677, "learning_rate": 1.0305960850084373e-05, "loss": 0.3278, "step": 33790 }, { "epoch": 3.9406483208955225, "grad_norm": 0.4801936056919124, "learning_rate": 1.030036601954961e-05, "loss": 0.3471, "step": 33795 }, { "epoch": 3.941231343283582, "grad_norm": 0.4793655410602897, "learning_rate": 1.0294773746435638e-05, "loss": 0.3509, "step": 33800 }, { "epoch": 3.941814365671642, "grad_norm": 0.4383407971147484, "learning_rate": 1.028918403157396e-05, "loss": 0.3416, "step": 33805 }, { "epoch": 3.9423973880597014, "grad_norm": 0.47283544342158557, "learning_rate": 1.0283596875795718e-05, "loss": 0.3367, "step": 33810 }, { "epoch": 3.942980410447761, "grad_norm": 0.5137500935771644, "learning_rate": 1.0278012279931665e-05, "loss": 0.3569, "step": 33815 }, { "epoch": 3.9435634328358207, "grad_norm": 0.4524030901963742, "learning_rate": 1.0272430244812175e-05, "loss": 0.3186, "step": 33820 }, { "epoch": 3.9441464552238807, "grad_norm": 0.5053690630460168, "learning_rate": 1.0266850771267253e-05, "loss": 0.3424, "step": 33825 }, { "epoch": 3.9447294776119404, "grad_norm": 0.4200738173375548, "learning_rate": 1.0261273860126514e-05, "loss": 0.3028, "step": 33830 }, { "epoch": 3.9453125, "grad_norm": 0.467065619389046, "learning_rate": 1.0255699512219166e-05, "loss": 0.3232, "step": 33835 }, { "epoch": 3.9458955223880596, "grad_norm": 0.46116333523776293, "learning_rate": 1.0250127728374098e-05, "loss": 0.3136, "step": 33840 }, { "epoch": 3.9464785447761193, "grad_norm": 0.45348991820287743, "learning_rate": 1.0244558509419748e-05, "loss": 0.315, "step": 33845 }, { "epoch": 3.9470615671641793, "grad_norm": 0.4517076456612718, "learning_rate": 1.023899185618423e-05, "loss": 0.3208, "step": 33850 }, { "epoch": 3.947644589552239, "grad_norm": 0.44252536059953496, "learning_rate": 1.023342776949524e-05, "loss": 0.323, "step": 33855 }, { "epoch": 3.9482276119402986, "grad_norm": 0.4811269923189408, "learning_rate": 1.0227866250180105e-05, "loss": 0.3246, "step": 33860 }, { "epoch": 3.948810634328358, "grad_norm": 0.45067652056352503, "learning_rate": 1.022230729906577e-05, "loss": 0.3308, "step": 33865 }, { "epoch": 3.949393656716418, "grad_norm": 0.42938991082270744, "learning_rate": 1.02167509169788e-05, "loss": 0.3154, "step": 33870 }, { "epoch": 3.9499766791044775, "grad_norm": 0.4637541076276976, "learning_rate": 1.0211197104745373e-05, "loss": 0.3501, "step": 33875 }, { "epoch": 3.950559701492537, "grad_norm": 0.4505223220118978, "learning_rate": 1.02056458631913e-05, "loss": 0.3286, "step": 33880 }, { "epoch": 3.951142723880597, "grad_norm": 0.47479324238117204, "learning_rate": 1.020009719314197e-05, "loss": 0.3388, "step": 33885 }, { "epoch": 3.951725746268657, "grad_norm": 0.44894587425131394, "learning_rate": 1.0194551095422447e-05, "loss": 0.323, "step": 33890 }, { "epoch": 3.9523087686567164, "grad_norm": 0.46095727378674345, "learning_rate": 1.0189007570857363e-05, "loss": 0.3222, "step": 33895 }, { "epoch": 3.952891791044776, "grad_norm": 0.4632299502106948, "learning_rate": 1.0183466620270996e-05, "loss": 0.3386, "step": 33900 }, { "epoch": 3.9534748134328357, "grad_norm": 0.46950464682753884, "learning_rate": 1.0177928244487225e-05, "loss": 0.3335, "step": 33905 }, { "epoch": 3.9540578358208958, "grad_norm": 0.4300936607612698, "learning_rate": 1.0172392444329561e-05, "loss": 0.3211, "step": 33910 }, { "epoch": 3.9546408582089554, "grad_norm": 0.42780701592683107, "learning_rate": 1.0166859220621122e-05, "loss": 0.3147, "step": 33915 }, { "epoch": 3.955223880597015, "grad_norm": 0.46160418788896695, "learning_rate": 1.0161328574184645e-05, "loss": 0.3255, "step": 33920 }, { "epoch": 3.9558069029850746, "grad_norm": 0.47917701468417334, "learning_rate": 1.015580050584249e-05, "loss": 0.3258, "step": 33925 }, { "epoch": 3.9563899253731343, "grad_norm": 0.4850823979231493, "learning_rate": 1.0150275016416613e-05, "loss": 0.3076, "step": 33930 }, { "epoch": 3.956972947761194, "grad_norm": 0.49446454624379393, "learning_rate": 1.0144752106728613e-05, "loss": 0.3455, "step": 33935 }, { "epoch": 3.9575559701492535, "grad_norm": 0.45225342777195443, "learning_rate": 1.0139231777599689e-05, "loss": 0.3278, "step": 33940 }, { "epoch": 3.9581389925373136, "grad_norm": 0.4417728280258304, "learning_rate": 1.0133714029850667e-05, "loss": 0.3338, "step": 33945 }, { "epoch": 3.9587220149253732, "grad_norm": 0.47526603632268744, "learning_rate": 1.0128198864301976e-05, "loss": 0.3188, "step": 33950 }, { "epoch": 3.959305037313433, "grad_norm": 0.4566346351343713, "learning_rate": 1.0122686281773674e-05, "loss": 0.3214, "step": 33955 }, { "epoch": 3.9598880597014925, "grad_norm": 0.46242712128959035, "learning_rate": 1.0117176283085419e-05, "loss": 0.3208, "step": 33960 }, { "epoch": 3.960471082089552, "grad_norm": 0.4421784234059182, "learning_rate": 1.0111668869056515e-05, "loss": 0.3322, "step": 33965 }, { "epoch": 3.961054104477612, "grad_norm": 0.44062643183665456, "learning_rate": 1.0106164040505835e-05, "loss": 0.3305, "step": 33970 }, { "epoch": 3.9616371268656714, "grad_norm": 0.47666506355668165, "learning_rate": 1.0100661798251923e-05, "loss": 0.3329, "step": 33975 }, { "epoch": 3.9622201492537314, "grad_norm": 0.4696572293050511, "learning_rate": 1.009516214311289e-05, "loss": 0.3291, "step": 33980 }, { "epoch": 3.962803171641791, "grad_norm": 0.44796711592830507, "learning_rate": 1.0089665075906485e-05, "loss": 0.3021, "step": 33985 }, { "epoch": 3.9633861940298507, "grad_norm": 0.4215620720448473, "learning_rate": 1.0084170597450073e-05, "loss": 0.3357, "step": 33990 }, { "epoch": 3.9639692164179103, "grad_norm": 0.5135757878352224, "learning_rate": 1.0078678708560627e-05, "loss": 0.3279, "step": 33995 }, { "epoch": 3.96455223880597, "grad_norm": 0.4382478647685149, "learning_rate": 1.0073189410054742e-05, "loss": 0.3329, "step": 34000 }, { "epoch": 3.96513526119403, "grad_norm": 0.47441238354061055, "learning_rate": 1.0067702702748627e-05, "loss": 0.3113, "step": 34005 }, { "epoch": 3.9657182835820897, "grad_norm": 0.47045652294160595, "learning_rate": 1.0062218587458085e-05, "loss": 0.3277, "step": 34010 }, { "epoch": 3.9663013059701493, "grad_norm": 0.46033420707207967, "learning_rate": 1.005673706499858e-05, "loss": 0.3344, "step": 34015 }, { "epoch": 3.966884328358209, "grad_norm": 0.48294006873083045, "learning_rate": 1.0051258136185132e-05, "loss": 0.3236, "step": 34020 }, { "epoch": 3.9674673507462686, "grad_norm": 0.48696980348070035, "learning_rate": 1.004578180183243e-05, "loss": 0.3407, "step": 34025 }, { "epoch": 3.9680503731343286, "grad_norm": 0.45801264492862054, "learning_rate": 1.0040308062754738e-05, "loss": 0.332, "step": 34030 }, { "epoch": 3.968633395522388, "grad_norm": 0.45810702477293136, "learning_rate": 1.0034836919765953e-05, "loss": 0.32, "step": 34035 }, { "epoch": 3.969216417910448, "grad_norm": 0.48611654623430944, "learning_rate": 1.0029368373679583e-05, "loss": 0.3286, "step": 34040 }, { "epoch": 3.9697994402985075, "grad_norm": 0.4428633768030733, "learning_rate": 1.002390242530874e-05, "loss": 0.3099, "step": 34045 }, { "epoch": 3.970382462686567, "grad_norm": 0.4519262796809441, "learning_rate": 1.001843907546617e-05, "loss": 0.3242, "step": 34050 }, { "epoch": 3.9709654850746268, "grad_norm": 0.4952882194557926, "learning_rate": 1.0012978324964214e-05, "loss": 0.3361, "step": 34055 }, { "epoch": 3.9715485074626864, "grad_norm": 0.45588317582345006, "learning_rate": 1.0007520174614836e-05, "loss": 0.2993, "step": 34060 }, { "epoch": 3.9721315298507465, "grad_norm": 0.4690490984838446, "learning_rate": 1.0002064625229613e-05, "loss": 0.3176, "step": 34065 }, { "epoch": 3.972714552238806, "grad_norm": 0.47169734458793017, "learning_rate": 9.996611677619719e-06, "loss": 0.3294, "step": 34070 }, { "epoch": 3.9732975746268657, "grad_norm": 0.4356789270614016, "learning_rate": 9.991161332595978e-06, "loss": 0.3101, "step": 34075 }, { "epoch": 3.9738805970149254, "grad_norm": 0.4398300251087639, "learning_rate": 9.98571359096878e-06, "loss": 0.3216, "step": 34080 }, { "epoch": 3.974463619402985, "grad_norm": 0.5044326152170167, "learning_rate": 9.980268453548172e-06, "loss": 0.3348, "step": 34085 }, { "epoch": 3.9750466417910446, "grad_norm": 0.4908949137397319, "learning_rate": 9.97482592114378e-06, "loss": 0.3452, "step": 34090 }, { "epoch": 3.9756296641791042, "grad_norm": 0.4388141472417088, "learning_rate": 9.969385994564862e-06, "loss": 0.3186, "step": 34095 }, { "epoch": 3.9762126865671643, "grad_norm": 0.4979654560935064, "learning_rate": 9.96394867462028e-06, "loss": 0.3412, "step": 34100 }, { "epoch": 3.976795708955224, "grad_norm": 0.5235472401417905, "learning_rate": 9.958513962118521e-06, "loss": 0.32, "step": 34105 }, { "epoch": 3.9773787313432836, "grad_norm": 0.4253250863988798, "learning_rate": 9.953081857867665e-06, "loss": 0.3323, "step": 34110 }, { "epoch": 3.977961753731343, "grad_norm": 0.44316158786191384, "learning_rate": 9.947652362675418e-06, "loss": 0.2972, "step": 34115 }, { "epoch": 3.978544776119403, "grad_norm": 0.4600520895546006, "learning_rate": 9.94222547734909e-06, "loss": 0.3256, "step": 34120 }, { "epoch": 3.979127798507463, "grad_norm": 0.45442856356849437, "learning_rate": 9.936801202695607e-06, "loss": 0.3435, "step": 34125 }, { "epoch": 3.9797108208955225, "grad_norm": 0.4611334100716397, "learning_rate": 9.93137953952151e-06, "loss": 0.3181, "step": 34130 }, { "epoch": 3.980293843283582, "grad_norm": 0.4535727228122911, "learning_rate": 9.925960488632948e-06, "loss": 0.3193, "step": 34135 }, { "epoch": 3.980876865671642, "grad_norm": 0.4688141137143561, "learning_rate": 9.92054405083569e-06, "loss": 0.3346, "step": 34140 }, { "epoch": 3.9814598880597014, "grad_norm": 0.4533556132404464, "learning_rate": 9.915130226935081e-06, "loss": 0.3392, "step": 34145 }, { "epoch": 3.982042910447761, "grad_norm": 0.5034582760137185, "learning_rate": 9.90971901773614e-06, "loss": 0.318, "step": 34150 }, { "epoch": 3.9826259328358207, "grad_norm": 0.44084630090610327, "learning_rate": 9.904310424043432e-06, "loss": 0.3228, "step": 34155 }, { "epoch": 3.9832089552238807, "grad_norm": 0.6637125022574998, "learning_rate": 9.898904446661188e-06, "loss": 0.3248, "step": 34160 }, { "epoch": 3.9837919776119404, "grad_norm": 0.46728347647795127, "learning_rate": 9.89350108639321e-06, "loss": 0.3165, "step": 34165 }, { "epoch": 3.984375, "grad_norm": 0.4702358150362368, "learning_rate": 9.888100344042926e-06, "loss": 0.3313, "step": 34170 }, { "epoch": 3.9849580223880596, "grad_norm": 0.43476736172092595, "learning_rate": 9.88270222041338e-06, "loss": 0.3159, "step": 34175 }, { "epoch": 3.9855410447761193, "grad_norm": 0.4531721630039909, "learning_rate": 9.87730671630722e-06, "loss": 0.3148, "step": 34180 }, { "epoch": 3.9861240671641793, "grad_norm": 0.46481035080290545, "learning_rate": 9.871913832526702e-06, "loss": 0.3294, "step": 34185 }, { "epoch": 3.986707089552239, "grad_norm": 0.4524958217254779, "learning_rate": 9.866523569873708e-06, "loss": 0.3376, "step": 34190 }, { "epoch": 3.9872901119402986, "grad_norm": 0.4877276855479251, "learning_rate": 9.861135929149695e-06, "loss": 0.3211, "step": 34195 }, { "epoch": 3.987873134328358, "grad_norm": 0.4217362588631423, "learning_rate": 9.855750911155784e-06, "loss": 0.3225, "step": 34200 }, { "epoch": 3.988456156716418, "grad_norm": 4.920054886385086, "learning_rate": 9.850368516692643e-06, "loss": 0.3257, "step": 34205 }, { "epoch": 3.9890391791044775, "grad_norm": 0.4452963636002685, "learning_rate": 9.844988746560615e-06, "loss": 0.3187, "step": 34210 }, { "epoch": 3.989622201492537, "grad_norm": 0.4222014745462168, "learning_rate": 9.839611601559597e-06, "loss": 0.2971, "step": 34215 }, { "epoch": 3.990205223880597, "grad_norm": 0.4926216363486367, "learning_rate": 9.834237082489126e-06, "loss": 0.3473, "step": 34220 }, { "epoch": 3.990788246268657, "grad_norm": 0.4813548007313419, "learning_rate": 9.828865190148342e-06, "loss": 0.3265, "step": 34225 }, { "epoch": 3.9913712686567164, "grad_norm": 0.47602805355492867, "learning_rate": 9.823495925335995e-06, "loss": 0.3292, "step": 34230 }, { "epoch": 3.991954291044776, "grad_norm": 0.4573633687387112, "learning_rate": 9.81812928885044e-06, "loss": 0.3297, "step": 34235 }, { "epoch": 3.9925373134328357, "grad_norm": 0.4959606630490096, "learning_rate": 9.812765281489655e-06, "loss": 0.3663, "step": 34240 }, { "epoch": 3.9931203358208958, "grad_norm": 0.5063115538753646, "learning_rate": 9.807403904051194e-06, "loss": 0.3234, "step": 34245 }, { "epoch": 3.9937033582089554, "grad_norm": 0.4838249620180325, "learning_rate": 9.802045157332269e-06, "loss": 0.3288, "step": 34250 }, { "epoch": 3.994286380597015, "grad_norm": 0.46547509225290806, "learning_rate": 9.796689042129652e-06, "loss": 0.3084, "step": 34255 }, { "epoch": 3.9948694029850746, "grad_norm": 0.5084441558896075, "learning_rate": 9.79133555923976e-06, "loss": 0.3341, "step": 34260 }, { "epoch": 3.9954524253731343, "grad_norm": 0.4782270556677637, "learning_rate": 9.785984709458602e-06, "loss": 0.3358, "step": 34265 }, { "epoch": 3.996035447761194, "grad_norm": 0.4669360357065058, "learning_rate": 9.780636493581797e-06, "loss": 0.3349, "step": 34270 }, { "epoch": 3.9966184701492535, "grad_norm": 0.47936948411575514, "learning_rate": 9.775290912404569e-06, "loss": 0.3174, "step": 34275 }, { "epoch": 3.9972014925373136, "grad_norm": 0.4750311734313681, "learning_rate": 9.76994796672176e-06, "loss": 0.3357, "step": 34280 }, { "epoch": 3.9977845149253732, "grad_norm": 0.43884871232542555, "learning_rate": 9.764607657327818e-06, "loss": 0.3256, "step": 34285 }, { "epoch": 3.998367537313433, "grad_norm": 0.5001795660217874, "learning_rate": 9.759269985016786e-06, "loss": 0.3546, "step": 34290 }, { "epoch": 3.9989505597014925, "grad_norm": 0.42042507950707164, "learning_rate": 9.753934950582333e-06, "loss": 0.341, "step": 34295 }, { "epoch": 3.999533582089552, "grad_norm": 0.4523835748984335, "learning_rate": 9.748602554817721e-06, "loss": 0.321, "step": 34300 }, { "epoch": 4.000116604477612, "grad_norm": 0.4531600037167153, "learning_rate": 9.743272798515829e-06, "loss": 0.3042, "step": 34305 }, { "epoch": 4.000699626865671, "grad_norm": 0.4108782613340139, "learning_rate": 9.737945682469145e-06, "loss": 0.2542, "step": 34310 }, { "epoch": 4.0012826492537314, "grad_norm": 0.4349396955742546, "learning_rate": 9.732621207469761e-06, "loss": 0.2688, "step": 34315 }, { "epoch": 4.001865671641791, "grad_norm": 0.4804686197522778, "learning_rate": 9.72729937430936e-06, "loss": 0.2487, "step": 34320 }, { "epoch": 4.002448694029851, "grad_norm": 0.5048592222069496, "learning_rate": 9.72198018377927e-06, "loss": 0.2587, "step": 34325 }, { "epoch": 4.003031716417911, "grad_norm": 0.46600498950028896, "learning_rate": 9.716663636670375e-06, "loss": 0.2505, "step": 34330 }, { "epoch": 4.00361473880597, "grad_norm": 0.5158150099952746, "learning_rate": 9.71134973377323e-06, "loss": 0.2639, "step": 34335 }, { "epoch": 4.00419776119403, "grad_norm": 0.5012120609047553, "learning_rate": 9.706038475877938e-06, "loss": 0.2581, "step": 34340 }, { "epoch": 4.004780783582089, "grad_norm": 0.5258548596606887, "learning_rate": 9.700729863774233e-06, "loss": 0.2626, "step": 34345 }, { "epoch": 4.005363805970149, "grad_norm": 0.4691002753690056, "learning_rate": 9.69542389825146e-06, "loss": 0.2607, "step": 34350 }, { "epoch": 4.005946828358209, "grad_norm": 0.5207991899052077, "learning_rate": 9.690120580098566e-06, "loss": 0.254, "step": 34355 }, { "epoch": 4.0065298507462686, "grad_norm": 0.4865108266583932, "learning_rate": 9.6848199101041e-06, "loss": 0.2678, "step": 34360 }, { "epoch": 4.007112873134329, "grad_norm": 0.5057030285566712, "learning_rate": 9.67952188905623e-06, "loss": 0.2625, "step": 34365 }, { "epoch": 4.007695895522388, "grad_norm": 0.4991036815732012, "learning_rate": 9.674226517742705e-06, "loss": 0.2421, "step": 34370 }, { "epoch": 4.008278917910448, "grad_norm": 0.5259789567259351, "learning_rate": 9.668933796950913e-06, "loss": 0.2616, "step": 34375 }, { "epoch": 4.008861940298507, "grad_norm": 0.4905941013631162, "learning_rate": 9.66364372746781e-06, "loss": 0.2439, "step": 34380 }, { "epoch": 4.009444962686567, "grad_norm": 0.511369017220589, "learning_rate": 9.658356310080007e-06, "loss": 0.2561, "step": 34385 }, { "epoch": 4.010027985074627, "grad_norm": 0.5533099144993038, "learning_rate": 9.653071545573667e-06, "loss": 0.2776, "step": 34390 }, { "epoch": 4.010611007462686, "grad_norm": 0.5128247997774668, "learning_rate": 9.647789434734594e-06, "loss": 0.2625, "step": 34395 }, { "epoch": 4.0111940298507465, "grad_norm": 0.4701134532348846, "learning_rate": 9.64250997834819e-06, "loss": 0.2418, "step": 34400 }, { "epoch": 4.011777052238806, "grad_norm": 0.45810143573615186, "learning_rate": 9.637233177199452e-06, "loss": 0.2579, "step": 34405 }, { "epoch": 4.012360074626866, "grad_norm": 0.4867578503297538, "learning_rate": 9.631959032072997e-06, "loss": 0.2444, "step": 34410 }, { "epoch": 4.012943097014926, "grad_norm": 0.49294893674379886, "learning_rate": 9.626687543753041e-06, "loss": 0.2415, "step": 34415 }, { "epoch": 4.013526119402985, "grad_norm": 0.5085206872787971, "learning_rate": 9.621418713023389e-06, "loss": 0.2729, "step": 34420 }, { "epoch": 4.014109141791045, "grad_norm": 0.5347517791479994, "learning_rate": 9.616152540667488e-06, "loss": 0.2492, "step": 34425 }, { "epoch": 4.014692164179104, "grad_norm": 0.5074309706958307, "learning_rate": 9.61088902746835e-06, "loss": 0.2531, "step": 34430 }, { "epoch": 4.015275186567164, "grad_norm": 0.4975697039738303, "learning_rate": 9.605628174208617e-06, "loss": 0.2357, "step": 34435 }, { "epoch": 4.0158582089552235, "grad_norm": 0.4977799738759334, "learning_rate": 9.60036998167052e-06, "loss": 0.2356, "step": 34440 }, { "epoch": 4.016441231343284, "grad_norm": 0.534198214468689, "learning_rate": 9.595114450635911e-06, "loss": 0.2589, "step": 34445 }, { "epoch": 4.017024253731344, "grad_norm": 0.5444308932824548, "learning_rate": 9.589861581886232e-06, "loss": 0.2846, "step": 34450 }, { "epoch": 4.017607276119403, "grad_norm": 0.5074586532370339, "learning_rate": 9.584611376202534e-06, "loss": 0.2475, "step": 34455 }, { "epoch": 4.018190298507463, "grad_norm": 0.4941333458896794, "learning_rate": 9.579363834365484e-06, "loss": 0.2468, "step": 34460 }, { "epoch": 4.018773320895522, "grad_norm": 0.5256476046275693, "learning_rate": 9.574118957155321e-06, "loss": 0.2653, "step": 34465 }, { "epoch": 4.019356343283582, "grad_norm": 0.4948114399941902, "learning_rate": 9.568876745351919e-06, "loss": 0.2585, "step": 34470 }, { "epoch": 4.019939365671642, "grad_norm": 0.5382404439562486, "learning_rate": 9.563637199734744e-06, "loss": 0.2647, "step": 34475 }, { "epoch": 4.020522388059701, "grad_norm": 1.3552595252469355, "learning_rate": 9.558400321082863e-06, "loss": 0.2543, "step": 34480 }, { "epoch": 4.0211054104477615, "grad_norm": 0.4937450707308806, "learning_rate": 9.553166110174957e-06, "loss": 0.2652, "step": 34485 }, { "epoch": 4.021688432835821, "grad_norm": 0.5103417878634484, "learning_rate": 9.547934567789302e-06, "loss": 0.2631, "step": 34490 }, { "epoch": 4.022271455223881, "grad_norm": 0.5142267726205416, "learning_rate": 9.542705694703763e-06, "loss": 0.2681, "step": 34495 }, { "epoch": 4.02285447761194, "grad_norm": 0.5602783254589139, "learning_rate": 9.537479491695845e-06, "loss": 0.2621, "step": 34500 }, { "epoch": 4.0234375, "grad_norm": 0.5185598440764575, "learning_rate": 9.532255959542616e-06, "loss": 0.2651, "step": 34505 }, { "epoch": 4.02402052238806, "grad_norm": 0.488475073935665, "learning_rate": 9.527035099020784e-06, "loss": 0.2541, "step": 34510 }, { "epoch": 4.024603544776119, "grad_norm": 0.516711784846163, "learning_rate": 9.521816910906626e-06, "loss": 0.2579, "step": 34515 }, { "epoch": 4.025186567164179, "grad_norm": 0.5128036867723536, "learning_rate": 9.516601395976038e-06, "loss": 0.2706, "step": 34520 }, { "epoch": 4.0257695895522385, "grad_norm": 0.4998739514067558, "learning_rate": 9.511388555004523e-06, "loss": 0.2674, "step": 34525 }, { "epoch": 4.026352611940299, "grad_norm": 0.48745000252020015, "learning_rate": 9.506178388767176e-06, "loss": 0.2447, "step": 34530 }, { "epoch": 4.026935634328358, "grad_norm": 0.5045935369935165, "learning_rate": 9.5009708980387e-06, "loss": 0.2447, "step": 34535 }, { "epoch": 4.027518656716418, "grad_norm": 0.5237258930097034, "learning_rate": 9.495766083593407e-06, "loss": 0.2598, "step": 34540 }, { "epoch": 4.028101679104478, "grad_norm": 0.4925259908992184, "learning_rate": 9.490563946205183e-06, "loss": 0.2577, "step": 34545 }, { "epoch": 4.028684701492537, "grad_norm": 0.5131302100958036, "learning_rate": 9.485364486647561e-06, "loss": 0.2569, "step": 34550 }, { "epoch": 4.029267723880597, "grad_norm": 0.5145603467207116, "learning_rate": 9.480167705693624e-06, "loss": 0.2496, "step": 34555 }, { "epoch": 4.029850746268656, "grad_norm": 0.5158991442898166, "learning_rate": 9.474973604116112e-06, "loss": 0.2634, "step": 34560 }, { "epoch": 4.030433768656716, "grad_norm": 0.4986558917596338, "learning_rate": 9.469782182687317e-06, "loss": 0.2552, "step": 34565 }, { "epoch": 4.0310167910447765, "grad_norm": 0.5042877087851613, "learning_rate": 9.464593442179162e-06, "loss": 0.2552, "step": 34570 }, { "epoch": 4.031599813432836, "grad_norm": 0.48701866303121916, "learning_rate": 9.459407383363158e-06, "loss": 0.3002, "step": 34575 }, { "epoch": 4.032182835820896, "grad_norm": 0.5834265076023694, "learning_rate": 9.454224007010428e-06, "loss": 0.2663, "step": 34580 }, { "epoch": 4.032765858208955, "grad_norm": 0.5239221149577419, "learning_rate": 9.449043313891692e-06, "loss": 0.2638, "step": 34585 }, { "epoch": 4.033348880597015, "grad_norm": 0.5236996970874352, "learning_rate": 9.443865304777266e-06, "loss": 0.2383, "step": 34590 }, { "epoch": 4.033931902985074, "grad_norm": 0.49535759671095636, "learning_rate": 9.438689980437062e-06, "loss": 0.2517, "step": 34595 }, { "epoch": 4.034514925373134, "grad_norm": 0.5470458516533174, "learning_rate": 9.433517341640621e-06, "loss": 0.2567, "step": 34600 }, { "epoch": 4.035097947761194, "grad_norm": 0.48917462514613874, "learning_rate": 9.428347389157039e-06, "loss": 0.2603, "step": 34605 }, { "epoch": 4.0356809701492535, "grad_norm": 0.5428754555939528, "learning_rate": 9.423180123755064e-06, "loss": 0.2545, "step": 34610 }, { "epoch": 4.036263992537314, "grad_norm": 0.5234581583690686, "learning_rate": 9.418015546203002e-06, "loss": 0.2468, "step": 34615 }, { "epoch": 4.036847014925373, "grad_norm": 0.4985094927754438, "learning_rate": 9.41285365726878e-06, "loss": 0.2669, "step": 34620 }, { "epoch": 4.037430037313433, "grad_norm": 0.5568416674224604, "learning_rate": 9.407694457719925e-06, "loss": 0.2747, "step": 34625 }, { "epoch": 4.038013059701493, "grad_norm": 0.5069542080613687, "learning_rate": 9.40253794832356e-06, "loss": 0.2582, "step": 34630 }, { "epoch": 4.038596082089552, "grad_norm": 0.5699199513363726, "learning_rate": 9.397384129846404e-06, "loss": 0.2633, "step": 34635 }, { "epoch": 4.039179104477612, "grad_norm": 0.5138557167585569, "learning_rate": 9.39223300305479e-06, "loss": 0.2449, "step": 34640 }, { "epoch": 4.039762126865671, "grad_norm": 0.5114334898282349, "learning_rate": 9.387084568714628e-06, "loss": 0.2666, "step": 34645 }, { "epoch": 4.0403451492537314, "grad_norm": 0.5581905434816621, "learning_rate": 9.381938827591447e-06, "loss": 0.2644, "step": 34650 }, { "epoch": 4.040928171641791, "grad_norm": 0.5073140461448422, "learning_rate": 9.376795780450373e-06, "loss": 0.2624, "step": 34655 }, { "epoch": 4.041511194029851, "grad_norm": 0.5021790408826153, "learning_rate": 9.371655428056122e-06, "loss": 0.2499, "step": 34660 }, { "epoch": 4.042094216417911, "grad_norm": 0.5024641361719697, "learning_rate": 9.36651777117302e-06, "loss": 0.2525, "step": 34665 }, { "epoch": 4.04267723880597, "grad_norm": 0.5461884943720894, "learning_rate": 9.361382810564984e-06, "loss": 0.263, "step": 34670 }, { "epoch": 4.04326026119403, "grad_norm": 0.5027030776687684, "learning_rate": 9.35625054699554e-06, "loss": 0.2728, "step": 34675 }, { "epoch": 4.043843283582089, "grad_norm": 0.5268213655634778, "learning_rate": 9.351120981227788e-06, "loss": 0.254, "step": 34680 }, { "epoch": 4.044426305970149, "grad_norm": 0.49028374930341145, "learning_rate": 9.345994114024472e-06, "loss": 0.2585, "step": 34685 }, { "epoch": 4.045009328358209, "grad_norm": 0.5218721168480488, "learning_rate": 9.34086994614789e-06, "loss": 0.2575, "step": 34690 }, { "epoch": 4.0455923507462686, "grad_norm": 0.5264320449452657, "learning_rate": 9.33574847835996e-06, "loss": 0.262, "step": 34695 }, { "epoch": 4.046175373134329, "grad_norm": 0.5478511395750972, "learning_rate": 9.330629711422196e-06, "loss": 0.2543, "step": 34700 }, { "epoch": 4.046758395522388, "grad_norm": 0.5732543892508573, "learning_rate": 9.325513646095707e-06, "loss": 0.2595, "step": 34705 }, { "epoch": 4.047341417910448, "grad_norm": 0.5335244097695253, "learning_rate": 9.320400283141208e-06, "loss": 0.2703, "step": 34710 }, { "epoch": 4.047924440298507, "grad_norm": 0.5026202484787695, "learning_rate": 9.315289623319012e-06, "loss": 0.2642, "step": 34715 }, { "epoch": 4.048507462686567, "grad_norm": 0.5292062634479631, "learning_rate": 9.310181667389003e-06, "loss": 0.2591, "step": 34720 }, { "epoch": 4.049090485074627, "grad_norm": 0.5535990342992809, "learning_rate": 9.305076416110715e-06, "loss": 0.2603, "step": 34725 }, { "epoch": 4.049673507462686, "grad_norm": 0.5069663116362183, "learning_rate": 9.299973870243222e-06, "loss": 0.245, "step": 34730 }, { "epoch": 4.0502565298507465, "grad_norm": 0.5701850619343685, "learning_rate": 9.294874030545247e-06, "loss": 0.2581, "step": 34735 }, { "epoch": 4.050839552238806, "grad_norm": 0.5031707224572886, "learning_rate": 9.289776897775074e-06, "loss": 0.2623, "step": 34740 }, { "epoch": 4.051422574626866, "grad_norm": 0.5023734778381901, "learning_rate": 9.284682472690599e-06, "loss": 0.2477, "step": 34745 }, { "epoch": 4.052005597014926, "grad_norm": 0.515147751103087, "learning_rate": 9.279590756049316e-06, "loss": 0.2675, "step": 34750 }, { "epoch": 4.052588619402985, "grad_norm": 0.5423363249426748, "learning_rate": 9.274501748608314e-06, "loss": 0.2649, "step": 34755 }, { "epoch": 4.053171641791045, "grad_norm": 0.5474114394899581, "learning_rate": 9.269415451124283e-06, "loss": 0.2551, "step": 34760 }, { "epoch": 4.053754664179104, "grad_norm": 0.5277244774771899, "learning_rate": 9.2643318643535e-06, "loss": 0.2452, "step": 34765 }, { "epoch": 4.054337686567164, "grad_norm": 0.5352283483668832, "learning_rate": 9.25925098905185e-06, "loss": 0.2596, "step": 34770 }, { "epoch": 4.0549207089552235, "grad_norm": 0.5429303684414251, "learning_rate": 9.254172825974823e-06, "loss": 0.2599, "step": 34775 }, { "epoch": 4.055503731343284, "grad_norm": 0.5139700143368325, "learning_rate": 9.249097375877458e-06, "loss": 0.2434, "step": 34780 }, { "epoch": 4.056086753731344, "grad_norm": 0.50005271179117, "learning_rate": 9.244024639514465e-06, "loss": 0.252, "step": 34785 }, { "epoch": 4.056669776119403, "grad_norm": 0.4997944637264341, "learning_rate": 9.23895461764009e-06, "loss": 0.2511, "step": 34790 }, { "epoch": 4.057252798507463, "grad_norm": 0.49561392020243344, "learning_rate": 9.233887311008197e-06, "loss": 0.2654, "step": 34795 }, { "epoch": 4.057835820895522, "grad_norm": 0.5220940410770386, "learning_rate": 9.22882272037225e-06, "loss": 0.2624, "step": 34800 }, { "epoch": 4.058418843283582, "grad_norm": 0.49265761402501357, "learning_rate": 9.223760846485307e-06, "loss": 0.2577, "step": 34805 }, { "epoch": 4.059001865671642, "grad_norm": 0.5295448198844314, "learning_rate": 9.218701690100017e-06, "loss": 0.2806, "step": 34810 }, { "epoch": 4.059584888059701, "grad_norm": 0.4915080457248284, "learning_rate": 9.21364525196863e-06, "loss": 0.2506, "step": 34815 }, { "epoch": 4.0601679104477615, "grad_norm": 0.5096665029389867, "learning_rate": 9.208591532842995e-06, "loss": 0.2594, "step": 34820 }, { "epoch": 4.060750932835821, "grad_norm": 0.582525339114675, "learning_rate": 9.203540533474537e-06, "loss": 0.2667, "step": 34825 }, { "epoch": 4.061333955223881, "grad_norm": 0.5878438127457113, "learning_rate": 9.198492254614302e-06, "loss": 0.2566, "step": 34830 }, { "epoch": 4.06191697761194, "grad_norm": 0.5481312822590019, "learning_rate": 9.193446697012921e-06, "loss": 0.2664, "step": 34835 }, { "epoch": 4.0625, "grad_norm": 0.5204472798397481, "learning_rate": 9.188403861420615e-06, "loss": 0.2755, "step": 34840 }, { "epoch": 4.06308302238806, "grad_norm": 0.6124691704600715, "learning_rate": 9.183363748587207e-06, "loss": 0.2847, "step": 34845 }, { "epoch": 4.063666044776119, "grad_norm": 0.5240735363343201, "learning_rate": 9.178326359262124e-06, "loss": 0.2408, "step": 34850 }, { "epoch": 4.064249067164179, "grad_norm": 0.5162043833097014, "learning_rate": 9.173291694194356e-06, "loss": 0.2636, "step": 34855 }, { "epoch": 4.0648320895522385, "grad_norm": 0.4776228176419194, "learning_rate": 9.16825975413253e-06, "loss": 0.2685, "step": 34860 }, { "epoch": 4.065415111940299, "grad_norm": 0.5115555275572906, "learning_rate": 9.163230539824829e-06, "loss": 0.2666, "step": 34865 }, { "epoch": 4.065998134328359, "grad_norm": 0.4938491423241653, "learning_rate": 9.158204052019069e-06, "loss": 0.2576, "step": 34870 }, { "epoch": 4.066581156716418, "grad_norm": 0.5304078190670312, "learning_rate": 9.153180291462627e-06, "loss": 0.2563, "step": 34875 }, { "epoch": 4.067164179104478, "grad_norm": 0.5088828115285172, "learning_rate": 9.148159258902488e-06, "loss": 0.2629, "step": 34880 }, { "epoch": 4.067747201492537, "grad_norm": 0.5073729369251818, "learning_rate": 9.143140955085239e-06, "loss": 0.2438, "step": 34885 }, { "epoch": 4.068330223880597, "grad_norm": 0.5083760831908246, "learning_rate": 9.138125380757046e-06, "loss": 0.2589, "step": 34890 }, { "epoch": 4.068913246268656, "grad_norm": 0.4919762102173519, "learning_rate": 9.133112536663682e-06, "loss": 0.2452, "step": 34895 }, { "epoch": 4.069496268656716, "grad_norm": 0.507306349249229, "learning_rate": 9.128102423550511e-06, "loss": 0.2649, "step": 34900 }, { "epoch": 4.0700792910447765, "grad_norm": 0.5389362937652415, "learning_rate": 9.123095042162477e-06, "loss": 0.2725, "step": 34905 }, { "epoch": 4.070662313432836, "grad_norm": 0.5657063443375089, "learning_rate": 9.118090393244147e-06, "loss": 0.2694, "step": 34910 }, { "epoch": 4.071245335820896, "grad_norm": 0.5223457192265057, "learning_rate": 9.113088477539643e-06, "loss": 0.2585, "step": 34915 }, { "epoch": 4.071828358208955, "grad_norm": 0.5630649644285136, "learning_rate": 9.108089295792726e-06, "loss": 0.2672, "step": 34920 }, { "epoch": 4.072411380597015, "grad_norm": 0.4750214373260717, "learning_rate": 9.10309284874671e-06, "loss": 0.2511, "step": 34925 }, { "epoch": 4.072994402985074, "grad_norm": 0.4986075941132954, "learning_rate": 9.098099137144522e-06, "loss": 0.2537, "step": 34930 }, { "epoch": 4.073577425373134, "grad_norm": 0.5086579097697231, "learning_rate": 9.093108161728683e-06, "loss": 0.2789, "step": 34935 }, { "epoch": 4.074160447761194, "grad_norm": 0.4987041903205375, "learning_rate": 9.088119923241295e-06, "loss": 0.2557, "step": 34940 }, { "epoch": 4.0747434701492535, "grad_norm": 0.5132993305722506, "learning_rate": 9.083134422424073e-06, "loss": 0.2746, "step": 34945 }, { "epoch": 4.075326492537314, "grad_norm": 0.510535415784068, "learning_rate": 9.07815166001831e-06, "loss": 0.256, "step": 34950 }, { "epoch": 4.075909514925373, "grad_norm": 0.4978048118559493, "learning_rate": 9.073171636764879e-06, "loss": 0.2548, "step": 34955 }, { "epoch": 4.076492537313433, "grad_norm": 0.5377280077379235, "learning_rate": 9.068194353404288e-06, "loss": 0.275, "step": 34960 }, { "epoch": 4.077075559701493, "grad_norm": 0.5426270596291648, "learning_rate": 9.063219810676593e-06, "loss": 0.2599, "step": 34965 }, { "epoch": 4.077658582089552, "grad_norm": 0.505525866696443, "learning_rate": 9.058248009321464e-06, "loss": 0.2644, "step": 34970 }, { "epoch": 4.078241604477612, "grad_norm": 0.4948310624798179, "learning_rate": 9.053278950078163e-06, "loss": 0.2516, "step": 34975 }, { "epoch": 4.078824626865671, "grad_norm": 0.5417889814619405, "learning_rate": 9.04831263368554e-06, "loss": 0.2514, "step": 34980 }, { "epoch": 4.0794076492537314, "grad_norm": 0.5249793428135052, "learning_rate": 9.043349060882039e-06, "loss": 0.2631, "step": 34985 }, { "epoch": 4.079990671641791, "grad_norm": 0.5414372285067178, "learning_rate": 9.038388232405699e-06, "loss": 0.2514, "step": 34990 }, { "epoch": 4.080573694029851, "grad_norm": 0.5426317620989723, "learning_rate": 9.033430148994148e-06, "loss": 0.2619, "step": 34995 }, { "epoch": 4.081156716417911, "grad_norm": 0.5269626295122668, "learning_rate": 9.028474811384597e-06, "loss": 0.2587, "step": 35000 }, { "epoch": 4.08173973880597, "grad_norm": 0.5347272041813942, "learning_rate": 9.023522220313865e-06, "loss": 0.2422, "step": 35005 }, { "epoch": 4.08232276119403, "grad_norm": 0.5424201318175285, "learning_rate": 9.01857237651835e-06, "loss": 0.2706, "step": 35010 }, { "epoch": 4.082905783582089, "grad_norm": 0.5145158271415362, "learning_rate": 9.013625280734047e-06, "loss": 0.269, "step": 35015 }, { "epoch": 4.083488805970149, "grad_norm": 0.5478132605753813, "learning_rate": 9.008680933696545e-06, "loss": 0.2797, "step": 35020 }, { "epoch": 4.084071828358209, "grad_norm": 0.523611251601591, "learning_rate": 9.003739336141025e-06, "loss": 0.2642, "step": 35025 }, { "epoch": 4.0846548507462686, "grad_norm": 0.5363592239524483, "learning_rate": 8.998800488802239e-06, "loss": 0.2656, "step": 35030 }, { "epoch": 4.085237873134329, "grad_norm": 1.8497445741110459, "learning_rate": 8.99386439241457e-06, "loss": 0.2652, "step": 35035 }, { "epoch": 4.085820895522388, "grad_norm": 0.5228606562217458, "learning_rate": 8.98893104771194e-06, "loss": 0.2652, "step": 35040 }, { "epoch": 4.086403917910448, "grad_norm": 0.5169279851288193, "learning_rate": 8.984000455427917e-06, "loss": 0.2529, "step": 35045 }, { "epoch": 4.086986940298507, "grad_norm": 0.5121517409717713, "learning_rate": 8.979072616295616e-06, "loss": 0.2728, "step": 35050 }, { "epoch": 4.087569962686567, "grad_norm": 0.5454166426967931, "learning_rate": 8.974147531047763e-06, "loss": 0.2618, "step": 35055 }, { "epoch": 4.088152985074627, "grad_norm": 0.5288732534324025, "learning_rate": 8.969225200416678e-06, "loss": 0.2529, "step": 35060 }, { "epoch": 4.088736007462686, "grad_norm": 0.5261887860723182, "learning_rate": 8.964305625134254e-06, "loss": 0.2591, "step": 35065 }, { "epoch": 4.0893190298507465, "grad_norm": 0.5270812824511006, "learning_rate": 8.959388805931993e-06, "loss": 0.2765, "step": 35070 }, { "epoch": 4.089902052238806, "grad_norm": 0.5693926518404087, "learning_rate": 8.954474743540979e-06, "loss": 0.2689, "step": 35075 }, { "epoch": 4.090485074626866, "grad_norm": 0.5230461666827151, "learning_rate": 8.94956343869187e-06, "loss": 0.2535, "step": 35080 }, { "epoch": 4.091068097014926, "grad_norm": 0.5007784450998025, "learning_rate": 8.944654892114956e-06, "loss": 0.2444, "step": 35085 }, { "epoch": 4.091651119402985, "grad_norm": 0.5004196138596179, "learning_rate": 8.939749104540065e-06, "loss": 0.2274, "step": 35090 }, { "epoch": 4.092234141791045, "grad_norm": 0.49148938073063914, "learning_rate": 8.934846076696665e-06, "loss": 0.2554, "step": 35095 }, { "epoch": 4.092817164179104, "grad_norm": 0.5472066988355871, "learning_rate": 8.929945809313773e-06, "loss": 0.2568, "step": 35100 }, { "epoch": 4.093400186567164, "grad_norm": 0.4833868095424993, "learning_rate": 8.925048303120012e-06, "loss": 0.234, "step": 35105 }, { "epoch": 4.0939832089552235, "grad_norm": 0.49536379685887266, "learning_rate": 8.9201535588436e-06, "loss": 0.2684, "step": 35110 }, { "epoch": 4.094566231343284, "grad_norm": 0.47392014318420916, "learning_rate": 8.915261577212337e-06, "loss": 0.2445, "step": 35115 }, { "epoch": 4.095149253731344, "grad_norm": 0.46564338813188333, "learning_rate": 8.910372358953614e-06, "loss": 0.2591, "step": 35120 }, { "epoch": 4.095732276119403, "grad_norm": 0.49233180038956115, "learning_rate": 8.905485904794416e-06, "loss": 0.2489, "step": 35125 }, { "epoch": 4.096315298507463, "grad_norm": 0.5479573179986573, "learning_rate": 8.900602215461297e-06, "loss": 0.2444, "step": 35130 }, { "epoch": 4.096898320895522, "grad_norm": 0.6122752404359795, "learning_rate": 8.895721291680433e-06, "loss": 0.2621, "step": 35135 }, { "epoch": 4.097481343283582, "grad_norm": 0.5597183955613113, "learning_rate": 8.890843134177555e-06, "loss": 0.2639, "step": 35140 }, { "epoch": 4.098064365671641, "grad_norm": 0.5315320998410075, "learning_rate": 8.885967743678011e-06, "loss": 0.2656, "step": 35145 }, { "epoch": 4.098647388059701, "grad_norm": 0.5353394340704639, "learning_rate": 8.881095120906716e-06, "loss": 0.2561, "step": 35150 }, { "epoch": 4.0992304104477615, "grad_norm": 0.5291361825474706, "learning_rate": 8.876225266588184e-06, "loss": 0.2519, "step": 35155 }, { "epoch": 4.099813432835821, "grad_norm": 0.5036639626548466, "learning_rate": 8.871358181446519e-06, "loss": 0.2467, "step": 35160 }, { "epoch": 4.100396455223881, "grad_norm": 0.47752774448835666, "learning_rate": 8.866493866205407e-06, "loss": 0.2632, "step": 35165 }, { "epoch": 4.10097947761194, "grad_norm": 0.5115050932562567, "learning_rate": 8.861632321588126e-06, "loss": 0.2542, "step": 35170 }, { "epoch": 4.1015625, "grad_norm": 0.49680977551903693, "learning_rate": 8.856773548317545e-06, "loss": 0.2761, "step": 35175 }, { "epoch": 4.10214552238806, "grad_norm": 0.5129949691060091, "learning_rate": 8.851917547116111e-06, "loss": 0.2524, "step": 35180 }, { "epoch": 4.102728544776119, "grad_norm": 0.5144168250882896, "learning_rate": 8.847064318705864e-06, "loss": 0.2616, "step": 35185 }, { "epoch": 4.103311567164179, "grad_norm": 0.48219095547975166, "learning_rate": 8.842213863808439e-06, "loss": 0.2734, "step": 35190 }, { "epoch": 4.1038945895522385, "grad_norm": 0.5426833429370523, "learning_rate": 8.837366183145044e-06, "loss": 0.2586, "step": 35195 }, { "epoch": 4.104477611940299, "grad_norm": 0.5278663639380166, "learning_rate": 8.83252127743649e-06, "loss": 0.2507, "step": 35200 }, { "epoch": 4.105060634328359, "grad_norm": 0.5078270817437557, "learning_rate": 8.827679147403167e-06, "loss": 0.2771, "step": 35205 }, { "epoch": 4.105643656716418, "grad_norm": 0.47929031798955013, "learning_rate": 8.822839793765056e-06, "loss": 0.2546, "step": 35210 }, { "epoch": 4.106226679104478, "grad_norm": 0.5516138359484303, "learning_rate": 8.818003217241707e-06, "loss": 0.2689, "step": 35215 }, { "epoch": 4.106809701492537, "grad_norm": 0.5316918362563171, "learning_rate": 8.813169418552294e-06, "loss": 0.2601, "step": 35220 }, { "epoch": 4.107392723880597, "grad_norm": 0.5002464993683433, "learning_rate": 8.808338398415544e-06, "loss": 0.2669, "step": 35225 }, { "epoch": 4.107975746268656, "grad_norm": 0.5432643899604875, "learning_rate": 8.803510157549785e-06, "loss": 0.2606, "step": 35230 }, { "epoch": 4.108558768656716, "grad_norm": 0.49159574710362086, "learning_rate": 8.79868469667293e-06, "loss": 0.2455, "step": 35235 }, { "epoch": 4.1091417910447765, "grad_norm": 0.5350433730708042, "learning_rate": 8.793862016502477e-06, "loss": 0.2635, "step": 35240 }, { "epoch": 4.109724813432836, "grad_norm": 0.591245801575081, "learning_rate": 8.789042117755521e-06, "loss": 0.2566, "step": 35245 }, { "epoch": 4.110307835820896, "grad_norm": 0.5652227713274541, "learning_rate": 8.78422500114873e-06, "loss": 0.2586, "step": 35250 }, { "epoch": 4.110890858208955, "grad_norm": 0.49119906635330823, "learning_rate": 8.779410667398352e-06, "loss": 0.2692, "step": 35255 }, { "epoch": 4.111473880597015, "grad_norm": 0.48519076071323763, "learning_rate": 8.774599117220254e-06, "loss": 0.2637, "step": 35260 }, { "epoch": 4.112056902985074, "grad_norm": 0.4868122980881773, "learning_rate": 8.769790351329847e-06, "loss": 0.249, "step": 35265 }, { "epoch": 4.112639925373134, "grad_norm": 0.5577564289977741, "learning_rate": 8.764984370442166e-06, "loss": 0.2828, "step": 35270 }, { "epoch": 4.113222947761194, "grad_norm": 0.5479227529823502, "learning_rate": 8.7601811752718e-06, "loss": 0.2436, "step": 35275 }, { "epoch": 4.1138059701492535, "grad_norm": 0.4942580182722624, "learning_rate": 8.755380766532945e-06, "loss": 0.2588, "step": 35280 }, { "epoch": 4.114388992537314, "grad_norm": 0.5452136170015037, "learning_rate": 8.750583144939373e-06, "loss": 0.2747, "step": 35285 }, { "epoch": 4.114972014925373, "grad_norm": 0.48304799967687756, "learning_rate": 8.745788311204444e-06, "loss": 0.2428, "step": 35290 }, { "epoch": 4.115555037313433, "grad_norm": 0.538047653761096, "learning_rate": 8.740996266041108e-06, "loss": 0.2612, "step": 35295 }, { "epoch": 4.116138059701493, "grad_norm": 0.5271145665435983, "learning_rate": 8.736207010161899e-06, "loss": 0.2672, "step": 35300 }, { "epoch": 4.116721082089552, "grad_norm": 0.5181804869567275, "learning_rate": 8.731420544278913e-06, "loss": 0.2543, "step": 35305 }, { "epoch": 4.117304104477612, "grad_norm": 0.5048284816398966, "learning_rate": 8.726636869103884e-06, "loss": 0.2536, "step": 35310 }, { "epoch": 4.117887126865671, "grad_norm": 0.485744198870478, "learning_rate": 8.721855985348068e-06, "loss": 0.2475, "step": 35315 }, { "epoch": 4.1184701492537314, "grad_norm": 0.5223579357266295, "learning_rate": 8.71707789372236e-06, "loss": 0.2602, "step": 35320 }, { "epoch": 4.119053171641791, "grad_norm": 0.5097406934491621, "learning_rate": 8.712302594937202e-06, "loss": 0.2492, "step": 35325 }, { "epoch": 4.119636194029851, "grad_norm": 0.5365099424282715, "learning_rate": 8.70753008970264e-06, "loss": 0.2719, "step": 35330 }, { "epoch": 4.120219216417911, "grad_norm": 0.4953737631268252, "learning_rate": 8.7027603787283e-06, "loss": 0.2667, "step": 35335 }, { "epoch": 4.12080223880597, "grad_norm": 0.47923786373376404, "learning_rate": 8.697993462723392e-06, "loss": 0.2663, "step": 35340 }, { "epoch": 4.12138526119403, "grad_norm": 0.49940924183785756, "learning_rate": 8.69322934239671e-06, "loss": 0.2481, "step": 35345 }, { "epoch": 4.121968283582089, "grad_norm": 0.5273696178399012, "learning_rate": 8.688468018456639e-06, "loss": 0.243, "step": 35350 }, { "epoch": 4.122551305970149, "grad_norm": 0.5667749246880608, "learning_rate": 8.68370949161113e-06, "loss": 0.2579, "step": 35355 }, { "epoch": 4.123134328358209, "grad_norm": 0.5426726008060032, "learning_rate": 8.678953762567739e-06, "loss": 0.2662, "step": 35360 }, { "epoch": 4.1237173507462686, "grad_norm": 0.5864143060614948, "learning_rate": 8.674200832033595e-06, "loss": 0.2526, "step": 35365 }, { "epoch": 4.124300373134329, "grad_norm": 0.5247040389500216, "learning_rate": 8.669450700715414e-06, "loss": 0.2562, "step": 35370 }, { "epoch": 4.124883395522388, "grad_norm": 0.534521634852648, "learning_rate": 8.664703369319496e-06, "loss": 0.2436, "step": 35375 }, { "epoch": 4.125466417910448, "grad_norm": 0.5334707822368108, "learning_rate": 8.659958838551722e-06, "loss": 0.2587, "step": 35380 }, { "epoch": 4.126049440298507, "grad_norm": 0.4776951742348572, "learning_rate": 8.655217109117564e-06, "loss": 0.2417, "step": 35385 }, { "epoch": 4.126632462686567, "grad_norm": 0.5784760840471114, "learning_rate": 8.650478181722055e-06, "loss": 0.2829, "step": 35390 }, { "epoch": 4.127215485074627, "grad_norm": 0.5494177092041136, "learning_rate": 8.64574205706985e-06, "loss": 0.2563, "step": 35395 }, { "epoch": 4.127798507462686, "grad_norm": 0.530356170485387, "learning_rate": 8.641008735865153e-06, "loss": 0.2688, "step": 35400 }, { "epoch": 4.1283815298507465, "grad_norm": 0.5049060522300168, "learning_rate": 8.63627821881176e-06, "loss": 0.2536, "step": 35405 }, { "epoch": 4.128964552238806, "grad_norm": 0.5553285155181097, "learning_rate": 8.631550506613062e-06, "loss": 0.2663, "step": 35410 }, { "epoch": 4.129547574626866, "grad_norm": 0.52838836040884, "learning_rate": 8.626825599972022e-06, "loss": 0.2538, "step": 35415 }, { "epoch": 4.130130597014926, "grad_norm": 0.4762703054443566, "learning_rate": 8.62210349959119e-06, "loss": 0.2521, "step": 35420 }, { "epoch": 4.130713619402985, "grad_norm": 0.5301260185468138, "learning_rate": 8.617384206172696e-06, "loss": 0.267, "step": 35425 }, { "epoch": 4.131296641791045, "grad_norm": 0.5584153545755062, "learning_rate": 8.612667720418243e-06, "loss": 0.2513, "step": 35430 }, { "epoch": 4.131879664179104, "grad_norm": 0.5203538176060201, "learning_rate": 8.60795404302915e-06, "loss": 0.2555, "step": 35435 }, { "epoch": 4.132462686567164, "grad_norm": 0.5207548491855819, "learning_rate": 8.60324317470627e-06, "loss": 0.2606, "step": 35440 }, { "epoch": 4.1330457089552235, "grad_norm": 0.5627320403489383, "learning_rate": 8.598535116150086e-06, "loss": 0.255, "step": 35445 }, { "epoch": 4.133628731343284, "grad_norm": 0.5160085587749553, "learning_rate": 8.593829868060632e-06, "loss": 0.2479, "step": 35450 }, { "epoch": 4.134211753731344, "grad_norm": 0.4992461902352645, "learning_rate": 8.589127431137527e-06, "loss": 0.2738, "step": 35455 }, { "epoch": 4.134794776119403, "grad_norm": 0.5119714860980036, "learning_rate": 8.584427806079988e-06, "loss": 0.2449, "step": 35460 }, { "epoch": 4.135377798507463, "grad_norm": 0.5312475240753449, "learning_rate": 8.579730993586798e-06, "loss": 0.2657, "step": 35465 }, { "epoch": 4.135960820895522, "grad_norm": 0.5619289480912323, "learning_rate": 8.575036994356334e-06, "loss": 0.2599, "step": 35470 }, { "epoch": 4.136543843283582, "grad_norm": 0.552781408980997, "learning_rate": 8.570345809086543e-06, "loss": 0.2507, "step": 35475 }, { "epoch": 4.137126865671641, "grad_norm": 0.533390892931154, "learning_rate": 8.565657438474963e-06, "loss": 0.271, "step": 35480 }, { "epoch": 4.137709888059701, "grad_norm": 0.5415315416003454, "learning_rate": 8.560971883218714e-06, "loss": 0.261, "step": 35485 }, { "epoch": 4.1382929104477615, "grad_norm": 0.5073952471708189, "learning_rate": 8.556289144014474e-06, "loss": 0.2433, "step": 35490 }, { "epoch": 4.138875932835821, "grad_norm": 0.5165418492461755, "learning_rate": 8.551609221558548e-06, "loss": 0.2542, "step": 35495 }, { "epoch": 4.139458955223881, "grad_norm": 0.5293658484286743, "learning_rate": 8.546932116546775e-06, "loss": 0.2742, "step": 35500 }, { "epoch": 4.14004197761194, "grad_norm": 0.5646742393884477, "learning_rate": 8.542257829674608e-06, "loss": 0.2482, "step": 35505 }, { "epoch": 4.140625, "grad_norm": 0.5458670461221791, "learning_rate": 8.53758636163706e-06, "loss": 0.2517, "step": 35510 }, { "epoch": 4.14120802238806, "grad_norm": 0.533126428830021, "learning_rate": 8.53291771312874e-06, "loss": 0.256, "step": 35515 }, { "epoch": 4.141791044776119, "grad_norm": 0.5984955716892207, "learning_rate": 8.528251884843829e-06, "loss": 0.2604, "step": 35520 }, { "epoch": 4.142374067164179, "grad_norm": 0.5401377877161797, "learning_rate": 8.523588877476089e-06, "loss": 0.2781, "step": 35525 }, { "epoch": 4.1429570895522385, "grad_norm": 0.5158414589426041, "learning_rate": 8.518928691718872e-06, "loss": 0.2731, "step": 35530 }, { "epoch": 4.143540111940299, "grad_norm": 0.514799122132554, "learning_rate": 8.514271328265094e-06, "loss": 0.2579, "step": 35535 }, { "epoch": 4.144123134328359, "grad_norm": 0.5085066014358727, "learning_rate": 8.509616787807263e-06, "loss": 0.2603, "step": 35540 }, { "epoch": 4.144706156716418, "grad_norm": 0.5303468068209967, "learning_rate": 8.504965071037465e-06, "loss": 0.2718, "step": 35545 }, { "epoch": 4.145289179104478, "grad_norm": 0.5138223155373116, "learning_rate": 8.500316178647366e-06, "loss": 0.2534, "step": 35550 }, { "epoch": 4.145872201492537, "grad_norm": 0.5569196620903132, "learning_rate": 8.495670111328214e-06, "loss": 0.2574, "step": 35555 }, { "epoch": 4.146455223880597, "grad_norm": 0.5760705410733047, "learning_rate": 8.491026869770832e-06, "loss": 0.2742, "step": 35560 }, { "epoch": 4.147038246268656, "grad_norm": 0.5006517435205796, "learning_rate": 8.486386454665621e-06, "loss": 0.2541, "step": 35565 }, { "epoch": 4.147621268656716, "grad_norm": 0.5181378253670358, "learning_rate": 8.48174886670258e-06, "loss": 0.247, "step": 35570 }, { "epoch": 4.1482042910447765, "grad_norm": 0.494115163963477, "learning_rate": 8.477114106571255e-06, "loss": 0.2661, "step": 35575 }, { "epoch": 4.148787313432836, "grad_norm": 0.5578647736221412, "learning_rate": 8.472482174960808e-06, "loss": 0.265, "step": 35580 }, { "epoch": 4.149370335820896, "grad_norm": 0.5384815577421193, "learning_rate": 8.467853072559953e-06, "loss": 0.2485, "step": 35585 }, { "epoch": 4.149953358208955, "grad_norm": 0.5278795431000908, "learning_rate": 8.463226800056995e-06, "loss": 0.2559, "step": 35590 }, { "epoch": 4.150536380597015, "grad_norm": 0.5360810310239597, "learning_rate": 8.458603358139818e-06, "loss": 0.2653, "step": 35595 }, { "epoch": 4.151119402985074, "grad_norm": 0.4977160792205137, "learning_rate": 8.453982747495881e-06, "loss": 0.2501, "step": 35600 }, { "epoch": 4.151702425373134, "grad_norm": 0.536231841668788, "learning_rate": 8.449364968812228e-06, "loss": 0.2622, "step": 35605 }, { "epoch": 4.152285447761194, "grad_norm": 0.5467799897844653, "learning_rate": 8.44475002277548e-06, "loss": 0.2565, "step": 35610 }, { "epoch": 4.1528684701492535, "grad_norm": 0.5569443669655396, "learning_rate": 8.440137910071821e-06, "loss": 0.267, "step": 35615 }, { "epoch": 4.153451492537314, "grad_norm": 0.4949963350018481, "learning_rate": 8.435528631387052e-06, "loss": 0.2521, "step": 35620 }, { "epoch": 4.154034514925373, "grad_norm": 0.5487988694955066, "learning_rate": 8.430922187406501e-06, "loss": 0.2591, "step": 35625 }, { "epoch": 4.154617537313433, "grad_norm": 0.6077030405410011, "learning_rate": 8.426318578815128e-06, "loss": 0.2613, "step": 35630 }, { "epoch": 4.155200559701493, "grad_norm": 0.5748776319339398, "learning_rate": 8.421717806297431e-06, "loss": 0.2789, "step": 35635 }, { "epoch": 4.155783582089552, "grad_norm": 0.5384423704349776, "learning_rate": 8.417119870537503e-06, "loss": 0.2573, "step": 35640 }, { "epoch": 4.156366604477612, "grad_norm": 0.508814057748118, "learning_rate": 8.41252477221901e-06, "loss": 0.2522, "step": 35645 }, { "epoch": 4.156949626865671, "grad_norm": 0.48773488979698165, "learning_rate": 8.407932512025207e-06, "loss": 0.2487, "step": 35650 }, { "epoch": 4.1575326492537314, "grad_norm": 0.4687360391225597, "learning_rate": 8.403343090638914e-06, "loss": 0.2337, "step": 35655 }, { "epoch": 4.158115671641791, "grad_norm": 0.5245902120953518, "learning_rate": 8.398756508742536e-06, "loss": 0.267, "step": 35660 }, { "epoch": 4.158698694029851, "grad_norm": 0.7145286657079379, "learning_rate": 8.394172767018048e-06, "loss": 0.2605, "step": 35665 }, { "epoch": 4.159281716417911, "grad_norm": 0.5005320632691611, "learning_rate": 8.38959186614702e-06, "loss": 0.2506, "step": 35670 }, { "epoch": 4.15986473880597, "grad_norm": 0.5446403276226495, "learning_rate": 8.385013806810577e-06, "loss": 0.2688, "step": 35675 }, { "epoch": 4.16044776119403, "grad_norm": 0.5128631372842963, "learning_rate": 8.380438589689438e-06, "loss": 0.2592, "step": 35680 }, { "epoch": 4.161030783582089, "grad_norm": 0.5541889766125377, "learning_rate": 8.37586621546389e-06, "loss": 0.2552, "step": 35685 }, { "epoch": 4.161613805970149, "grad_norm": 0.5238246783833841, "learning_rate": 8.371296684813806e-06, "loss": 0.2522, "step": 35690 }, { "epoch": 4.162196828358209, "grad_norm": 0.5621821440614606, "learning_rate": 8.36672999841863e-06, "loss": 0.2691, "step": 35695 }, { "epoch": 4.1627798507462686, "grad_norm": 0.579634596573173, "learning_rate": 8.36216615695738e-06, "loss": 0.2676, "step": 35700 }, { "epoch": 4.163362873134329, "grad_norm": 0.528976953956014, "learning_rate": 8.357605161108663e-06, "loss": 0.2661, "step": 35705 }, { "epoch": 4.163945895522388, "grad_norm": 0.49251543743971965, "learning_rate": 8.353047011550654e-06, "loss": 0.2589, "step": 35710 }, { "epoch": 4.164528917910448, "grad_norm": 0.5098277825232737, "learning_rate": 8.348491708961102e-06, "loss": 0.2689, "step": 35715 }, { "epoch": 4.165111940298507, "grad_norm": 0.5330813333767002, "learning_rate": 8.343939254017336e-06, "loss": 0.2522, "step": 35720 }, { "epoch": 4.165694962686567, "grad_norm": 0.5957115286883581, "learning_rate": 8.339389647396265e-06, "loss": 0.255, "step": 35725 }, { "epoch": 4.166277985074627, "grad_norm": 0.5451929472592748, "learning_rate": 8.334842889774374e-06, "loss": 0.2574, "step": 35730 }, { "epoch": 4.166861007462686, "grad_norm": 0.5571371475507364, "learning_rate": 8.330298981827719e-06, "loss": 0.2503, "step": 35735 }, { "epoch": 4.1674440298507465, "grad_norm": 0.5588413498691291, "learning_rate": 8.325757924231938e-06, "loss": 0.2682, "step": 35740 }, { "epoch": 4.168027052238806, "grad_norm": 0.4987987772903924, "learning_rate": 8.321219717662249e-06, "loss": 0.2478, "step": 35745 }, { "epoch": 4.168610074626866, "grad_norm": 0.49153653353859766, "learning_rate": 8.31668436279342e-06, "loss": 0.2608, "step": 35750 }, { "epoch": 4.169193097014926, "grad_norm": 0.4910461507026503, "learning_rate": 8.312151860299835e-06, "loss": 0.2365, "step": 35755 }, { "epoch": 4.169776119402985, "grad_norm": 0.5808500192652937, "learning_rate": 8.307622210855425e-06, "loss": 0.2564, "step": 35760 }, { "epoch": 4.170359141791045, "grad_norm": 0.5116169196975757, "learning_rate": 8.303095415133703e-06, "loss": 0.2638, "step": 35765 }, { "epoch": 4.170942164179104, "grad_norm": 0.5097325625916911, "learning_rate": 8.298571473807767e-06, "loss": 0.266, "step": 35770 }, { "epoch": 4.171525186567164, "grad_norm": 0.5642663632160959, "learning_rate": 8.29405038755028e-06, "loss": 0.2666, "step": 35775 }, { "epoch": 4.1721082089552235, "grad_norm": 0.5147817847658372, "learning_rate": 8.289532157033481e-06, "loss": 0.2669, "step": 35780 }, { "epoch": 4.172691231343284, "grad_norm": 0.5371217907838081, "learning_rate": 8.2850167829292e-06, "loss": 0.2564, "step": 35785 }, { "epoch": 4.173274253731344, "grad_norm": 0.5246730258831356, "learning_rate": 8.28050426590881e-06, "loss": 0.2457, "step": 35790 }, { "epoch": 4.173857276119403, "grad_norm": 0.494737036273425, "learning_rate": 8.2759946066433e-06, "loss": 0.2542, "step": 35795 }, { "epoch": 4.174440298507463, "grad_norm": 0.5091178946485959, "learning_rate": 8.271487805803193e-06, "loss": 0.2493, "step": 35800 }, { "epoch": 4.175023320895522, "grad_norm": 0.5218100567720455, "learning_rate": 8.26698386405863e-06, "loss": 0.2606, "step": 35805 }, { "epoch": 4.175606343283582, "grad_norm": 0.5331823430293728, "learning_rate": 8.262482782079281e-06, "loss": 0.2493, "step": 35810 }, { "epoch": 4.176189365671641, "grad_norm": 0.5205613862996497, "learning_rate": 8.25798456053443e-06, "loss": 0.2517, "step": 35815 }, { "epoch": 4.176772388059701, "grad_norm": 0.5626865062257371, "learning_rate": 8.253489200092912e-06, "loss": 0.2456, "step": 35820 }, { "epoch": 4.1773554104477615, "grad_norm": 0.592312622009633, "learning_rate": 8.248996701423141e-06, "loss": 0.2604, "step": 35825 }, { "epoch": 4.177938432835821, "grad_norm": 0.4899871092553764, "learning_rate": 8.244507065193117e-06, "loss": 0.253, "step": 35830 }, { "epoch": 4.178521455223881, "grad_norm": 0.4930121388676626, "learning_rate": 8.240020292070408e-06, "loss": 0.2609, "step": 35835 }, { "epoch": 4.17910447761194, "grad_norm": 0.5052795083406623, "learning_rate": 8.235536382722133e-06, "loss": 0.2417, "step": 35840 }, { "epoch": 4.1796875, "grad_norm": 0.49997651985199454, "learning_rate": 8.231055337815039e-06, "loss": 0.2529, "step": 35845 }, { "epoch": 4.18027052238806, "grad_norm": 0.5588037312255781, "learning_rate": 8.226577158015383e-06, "loss": 0.2588, "step": 35850 }, { "epoch": 4.180853544776119, "grad_norm": 0.5031696482630953, "learning_rate": 8.22210184398905e-06, "loss": 0.2784, "step": 35855 }, { "epoch": 4.181436567164179, "grad_norm": 0.5097831655399934, "learning_rate": 8.217629396401465e-06, "loss": 0.2656, "step": 35860 }, { "epoch": 4.1820195895522385, "grad_norm": 0.5219448757506173, "learning_rate": 8.21315981591764e-06, "loss": 0.2604, "step": 35865 }, { "epoch": 4.182602611940299, "grad_norm": 0.5800918170815204, "learning_rate": 8.208693103202158e-06, "loss": 0.2532, "step": 35870 }, { "epoch": 4.183185634328359, "grad_norm": 0.5219678785499666, "learning_rate": 8.20422925891918e-06, "loss": 0.2435, "step": 35875 }, { "epoch": 4.183768656716418, "grad_norm": 0.5939386831359745, "learning_rate": 8.199768283732432e-06, "loss": 0.2668, "step": 35880 }, { "epoch": 4.184351679104478, "grad_norm": 0.4817233331041549, "learning_rate": 8.19531017830523e-06, "loss": 0.2689, "step": 35885 }, { "epoch": 4.184934701492537, "grad_norm": 0.5186357667568724, "learning_rate": 8.190854943300436e-06, "loss": 0.2584, "step": 35890 }, { "epoch": 4.185517723880597, "grad_norm": 0.5439076379046998, "learning_rate": 8.18640257938051e-06, "loss": 0.2544, "step": 35895 }, { "epoch": 4.186100746268656, "grad_norm": 0.49688221461825877, "learning_rate": 8.181953087207467e-06, "loss": 0.2483, "step": 35900 }, { "epoch": 4.186683768656716, "grad_norm": 0.49769815620246266, "learning_rate": 8.177506467442915e-06, "loss": 0.2428, "step": 35905 }, { "epoch": 4.1872667910447765, "grad_norm": 0.5374612550401273, "learning_rate": 8.17306272074802e-06, "loss": 0.2597, "step": 35910 }, { "epoch": 4.187849813432836, "grad_norm": 0.48935843101678395, "learning_rate": 8.16862184778352e-06, "loss": 0.2486, "step": 35915 }, { "epoch": 4.188432835820896, "grad_norm": 0.5140564850219497, "learning_rate": 8.164183849209741e-06, "loss": 0.2554, "step": 35920 }, { "epoch": 4.189015858208955, "grad_norm": 0.5513480145355107, "learning_rate": 8.159748725686554e-06, "loss": 0.2741, "step": 35925 }, { "epoch": 4.189598880597015, "grad_norm": 0.5193608240444869, "learning_rate": 8.155316477873438e-06, "loss": 0.2587, "step": 35930 }, { "epoch": 4.190181902985074, "grad_norm": 0.5942447434682527, "learning_rate": 8.150887106429412e-06, "loss": 0.2578, "step": 35935 }, { "epoch": 4.190764925373134, "grad_norm": 0.501049514911053, "learning_rate": 8.146460612013083e-06, "loss": 0.2667, "step": 35940 }, { "epoch": 4.191347947761194, "grad_norm": 0.4857907189274466, "learning_rate": 8.142036995282633e-06, "loss": 0.2488, "step": 35945 }, { "epoch": 4.1919309701492535, "grad_norm": 0.6670315784476127, "learning_rate": 8.137616256895811e-06, "loss": 0.272, "step": 35950 }, { "epoch": 4.192513992537314, "grad_norm": 0.5773504915296385, "learning_rate": 8.133198397509936e-06, "loss": 0.2598, "step": 35955 }, { "epoch": 4.193097014925373, "grad_norm": 0.5206693955814928, "learning_rate": 8.128783417781909e-06, "loss": 0.2525, "step": 35960 }, { "epoch": 4.193680037313433, "grad_norm": 0.533228174492143, "learning_rate": 8.124371318368176e-06, "loss": 0.2506, "step": 35965 }, { "epoch": 4.194263059701493, "grad_norm": 0.5342964299565146, "learning_rate": 8.119962099924797e-06, "loss": 0.2609, "step": 35970 }, { "epoch": 4.194846082089552, "grad_norm": 0.5392135108758016, "learning_rate": 8.115555763107362e-06, "loss": 0.2616, "step": 35975 }, { "epoch": 4.195429104477612, "grad_norm": 0.5595098596245798, "learning_rate": 8.111152308571065e-06, "loss": 0.2564, "step": 35980 }, { "epoch": 4.196012126865671, "grad_norm": 0.5760912824826399, "learning_rate": 8.10675173697065e-06, "loss": 0.2704, "step": 35985 }, { "epoch": 4.1965951492537314, "grad_norm": 0.4998090745247113, "learning_rate": 8.10235404896044e-06, "loss": 0.2553, "step": 35990 }, { "epoch": 4.197178171641791, "grad_norm": 0.5243605883965704, "learning_rate": 8.097959245194333e-06, "loss": 0.2643, "step": 35995 }, { "epoch": 4.197761194029851, "grad_norm": 0.5270577400307964, "learning_rate": 8.09356732632579e-06, "loss": 0.2559, "step": 36000 }, { "epoch": 4.198344216417911, "grad_norm": 0.49781530633714827, "learning_rate": 8.089178293007848e-06, "loss": 0.2441, "step": 36005 }, { "epoch": 4.19892723880597, "grad_norm": 0.4833727758202483, "learning_rate": 8.084792145893122e-06, "loss": 0.2437, "step": 36010 }, { "epoch": 4.19951026119403, "grad_norm": 0.48774601752300895, "learning_rate": 8.08040888563377e-06, "loss": 0.2614, "step": 36015 }, { "epoch": 4.200093283582089, "grad_norm": 0.527381214528893, "learning_rate": 8.07602851288157e-06, "loss": 0.2617, "step": 36020 }, { "epoch": 4.200676305970149, "grad_norm": 0.5536445360071436, "learning_rate": 8.07165102828781e-06, "loss": 0.2617, "step": 36025 }, { "epoch": 4.201259328358209, "grad_norm": 0.5053238389327633, "learning_rate": 8.067276432503406e-06, "loss": 0.2589, "step": 36030 }, { "epoch": 4.2018423507462686, "grad_norm": 0.4972513306891196, "learning_rate": 8.062904726178806e-06, "loss": 0.2523, "step": 36035 }, { "epoch": 4.202425373134329, "grad_norm": 0.5571497706103212, "learning_rate": 8.058535909964041e-06, "loss": 0.261, "step": 36040 }, { "epoch": 4.203008395522388, "grad_norm": 0.5216726436146869, "learning_rate": 8.054169984508714e-06, "loss": 0.2473, "step": 36045 }, { "epoch": 4.203591417910448, "grad_norm": 0.5058800923696642, "learning_rate": 8.049806950461996e-06, "loss": 0.2727, "step": 36050 }, { "epoch": 4.204174440298507, "grad_norm": 0.509577924902328, "learning_rate": 8.045446808472628e-06, "loss": 0.2558, "step": 36055 }, { "epoch": 4.204757462686567, "grad_norm": 0.5448385570748286, "learning_rate": 8.041089559188929e-06, "loss": 0.2437, "step": 36060 }, { "epoch": 4.205340485074627, "grad_norm": 0.5009247022073703, "learning_rate": 8.036735203258766e-06, "loss": 0.2547, "step": 36065 }, { "epoch": 4.205923507462686, "grad_norm": 0.5114127604823394, "learning_rate": 8.032383741329598e-06, "loss": 0.2564, "step": 36070 }, { "epoch": 4.2065065298507465, "grad_norm": 0.5215847196762179, "learning_rate": 8.028035174048446e-06, "loss": 0.2452, "step": 36075 }, { "epoch": 4.207089552238806, "grad_norm": 0.5376417982056957, "learning_rate": 8.023689502061897e-06, "loss": 0.2431, "step": 36080 }, { "epoch": 4.207672574626866, "grad_norm": 0.5197812530448618, "learning_rate": 8.019346726016116e-06, "loss": 0.2471, "step": 36085 }, { "epoch": 4.208255597014926, "grad_norm": 0.5202704728429384, "learning_rate": 8.015006846556825e-06, "loss": 0.2503, "step": 36090 }, { "epoch": 4.208838619402985, "grad_norm": 0.49235619354545446, "learning_rate": 8.010669864329334e-06, "loss": 0.268, "step": 36095 }, { "epoch": 4.209421641791045, "grad_norm": 0.5410523126980088, "learning_rate": 8.006335779978494e-06, "loss": 0.2569, "step": 36100 }, { "epoch": 4.210004664179104, "grad_norm": 0.5192116504706309, "learning_rate": 8.00200459414876e-06, "loss": 0.2681, "step": 36105 }, { "epoch": 4.210587686567164, "grad_norm": 0.4638425184517283, "learning_rate": 7.997676307484123e-06, "loss": 0.2438, "step": 36110 }, { "epoch": 4.2111707089552235, "grad_norm": 0.5130433508193969, "learning_rate": 7.993350920628164e-06, "loss": 0.2555, "step": 36115 }, { "epoch": 4.211753731343284, "grad_norm": 0.5825557038472461, "learning_rate": 7.989028434224028e-06, "loss": 0.2724, "step": 36120 }, { "epoch": 4.212336753731344, "grad_norm": 0.5518343328605017, "learning_rate": 7.984708848914426e-06, "loss": 0.2479, "step": 36125 }, { "epoch": 4.212919776119403, "grad_norm": 0.5050534075937266, "learning_rate": 7.980392165341636e-06, "loss": 0.2692, "step": 36130 }, { "epoch": 4.213502798507463, "grad_norm": 0.5382208455206013, "learning_rate": 7.976078384147515e-06, "loss": 0.2651, "step": 36135 }, { "epoch": 4.214085820895522, "grad_norm": 0.5552846334550404, "learning_rate": 7.971767505973468e-06, "loss": 0.2654, "step": 36140 }, { "epoch": 4.214668843283582, "grad_norm": 0.5018471496633139, "learning_rate": 7.9674595314605e-06, "loss": 0.2588, "step": 36145 }, { "epoch": 4.215251865671641, "grad_norm": 0.5182138992083678, "learning_rate": 7.963154461249143e-06, "loss": 0.2656, "step": 36150 }, { "epoch": 4.215834888059701, "grad_norm": 0.48346843633341396, "learning_rate": 7.958852295979542e-06, "loss": 0.2574, "step": 36155 }, { "epoch": 4.2164179104477615, "grad_norm": 0.46271003948790973, "learning_rate": 7.95455303629137e-06, "loss": 0.2633, "step": 36160 }, { "epoch": 4.217000932835821, "grad_norm": 0.5246992653702127, "learning_rate": 7.950256682823895e-06, "loss": 0.2587, "step": 36165 }, { "epoch": 4.217583955223881, "grad_norm": 0.46752826909368617, "learning_rate": 7.945963236215944e-06, "loss": 0.2526, "step": 36170 }, { "epoch": 4.21816697761194, "grad_norm": 0.49985760934816387, "learning_rate": 7.941672697105905e-06, "loss": 0.2756, "step": 36175 }, { "epoch": 4.21875, "grad_norm": 0.5480026483849813, "learning_rate": 7.937385066131745e-06, "loss": 0.2779, "step": 36180 }, { "epoch": 4.21933302238806, "grad_norm": 0.5205461784668023, "learning_rate": 7.933100343930995e-06, "loss": 0.2547, "step": 36185 }, { "epoch": 4.219916044776119, "grad_norm": 0.5359242257528088, "learning_rate": 7.928818531140748e-06, "loss": 0.2614, "step": 36190 }, { "epoch": 4.220499067164179, "grad_norm": 0.5172170281092153, "learning_rate": 7.924539628397675e-06, "loss": 0.2572, "step": 36195 }, { "epoch": 4.2210820895522385, "grad_norm": 0.48467656439608553, "learning_rate": 7.920263636337994e-06, "loss": 0.2537, "step": 36200 }, { "epoch": 4.221665111940299, "grad_norm": 0.4963780572434781, "learning_rate": 7.915990555597522e-06, "loss": 0.2616, "step": 36205 }, { "epoch": 4.222248134328359, "grad_norm": 0.5338221364458754, "learning_rate": 7.911720386811613e-06, "loss": 0.2627, "step": 36210 }, { "epoch": 4.222831156716418, "grad_norm": 0.49942605853754896, "learning_rate": 7.907453130615203e-06, "loss": 0.2568, "step": 36215 }, { "epoch": 4.223414179104478, "grad_norm": 0.5328164426898738, "learning_rate": 7.90318878764279e-06, "loss": 0.2603, "step": 36220 }, { "epoch": 4.223997201492537, "grad_norm": 0.503178550188616, "learning_rate": 7.898927358528447e-06, "loss": 0.2568, "step": 36225 }, { "epoch": 4.224580223880597, "grad_norm": 0.5251883305365175, "learning_rate": 7.894668843905803e-06, "loss": 0.2684, "step": 36230 }, { "epoch": 4.225163246268656, "grad_norm": 0.5260399225353416, "learning_rate": 7.890413244408059e-06, "loss": 0.2634, "step": 36235 }, { "epoch": 4.225746268656716, "grad_norm": 0.5101823924393167, "learning_rate": 7.886160560667984e-06, "loss": 0.2741, "step": 36240 }, { "epoch": 4.2263292910447765, "grad_norm": 0.5553837194000492, "learning_rate": 7.881910793317915e-06, "loss": 0.2478, "step": 36245 }, { "epoch": 4.226912313432836, "grad_norm": 0.5605173196582268, "learning_rate": 7.87766394298974e-06, "loss": 0.2704, "step": 36250 }, { "epoch": 4.227495335820896, "grad_norm": 0.5432033405633592, "learning_rate": 7.873420010314933e-06, "loss": 0.2542, "step": 36255 }, { "epoch": 4.228078358208955, "grad_norm": 0.48881589685516663, "learning_rate": 7.869178995924525e-06, "loss": 0.2591, "step": 36260 }, { "epoch": 4.228661380597015, "grad_norm": 0.5909026498093377, "learning_rate": 7.864940900449109e-06, "loss": 0.2609, "step": 36265 }, { "epoch": 4.229244402985074, "grad_norm": 0.5149006012989761, "learning_rate": 7.860705724518857e-06, "loss": 0.2587, "step": 36270 }, { "epoch": 4.229827425373134, "grad_norm": 0.5347576530594315, "learning_rate": 7.8564734687635e-06, "loss": 0.2529, "step": 36275 }, { "epoch": 4.230410447761194, "grad_norm": 0.5373956815286194, "learning_rate": 7.852244133812332e-06, "loss": 0.2681, "step": 36280 }, { "epoch": 4.2309934701492535, "grad_norm": 0.5963324877668421, "learning_rate": 7.8480177202942e-06, "loss": 0.279, "step": 36285 }, { "epoch": 4.231576492537314, "grad_norm": 0.5333019181317546, "learning_rate": 7.843794228837556e-06, "loss": 0.267, "step": 36290 }, { "epoch": 4.232159514925373, "grad_norm": 0.5076514033378163, "learning_rate": 7.839573660070373e-06, "loss": 0.242, "step": 36295 }, { "epoch": 4.232742537313433, "grad_norm": 0.5521856639246142, "learning_rate": 7.83535601462022e-06, "loss": 0.2633, "step": 36300 }, { "epoch": 4.233325559701493, "grad_norm": 0.518017595350679, "learning_rate": 7.831141293114216e-06, "loss": 0.267, "step": 36305 }, { "epoch": 4.233908582089552, "grad_norm": 0.5082093658424798, "learning_rate": 7.82692949617905e-06, "loss": 0.2749, "step": 36310 }, { "epoch": 4.234491604477612, "grad_norm": 0.5159511799153876, "learning_rate": 7.822720624440978e-06, "loss": 0.2662, "step": 36315 }, { "epoch": 4.235074626865671, "grad_norm": 0.562555628318104, "learning_rate": 7.818514678525822e-06, "loss": 0.2481, "step": 36320 }, { "epoch": 4.2356576492537314, "grad_norm": 0.5499192977993825, "learning_rate": 7.814311659058951e-06, "loss": 0.2635, "step": 36325 }, { "epoch": 4.236240671641791, "grad_norm": 0.5075712256569114, "learning_rate": 7.810111566665333e-06, "loss": 0.2609, "step": 36330 }, { "epoch": 4.236823694029851, "grad_norm": 0.538459860137172, "learning_rate": 7.805914401969466e-06, "loss": 0.2648, "step": 36335 }, { "epoch": 4.237406716417911, "grad_norm": 0.5356031378837667, "learning_rate": 7.80172016559544e-06, "loss": 0.2527, "step": 36340 }, { "epoch": 4.23798973880597, "grad_norm": 0.4841147045807619, "learning_rate": 7.797528858166891e-06, "loss": 0.2402, "step": 36345 }, { "epoch": 4.23857276119403, "grad_norm": 0.5339756626276096, "learning_rate": 7.793340480307027e-06, "loss": 0.2574, "step": 36350 }, { "epoch": 4.239155783582089, "grad_norm": 0.5409122599507549, "learning_rate": 7.789155032638619e-06, "loss": 0.2519, "step": 36355 }, { "epoch": 4.239738805970149, "grad_norm": 0.527606702037047, "learning_rate": 7.784972515784004e-06, "loss": 0.2667, "step": 36360 }, { "epoch": 4.240321828358209, "grad_norm": 0.5457238162669897, "learning_rate": 7.780792930365085e-06, "loss": 0.2601, "step": 36365 }, { "epoch": 4.2409048507462686, "grad_norm": 0.5322066682409444, "learning_rate": 7.776616277003328e-06, "loss": 0.2694, "step": 36370 }, { "epoch": 4.241487873134329, "grad_norm": 0.5124879069234782, "learning_rate": 7.772442556319747e-06, "loss": 0.2366, "step": 36375 }, { "epoch": 4.242070895522388, "grad_norm": 0.5589255865140835, "learning_rate": 7.768271768934955e-06, "loss": 0.2778, "step": 36380 }, { "epoch": 4.242653917910448, "grad_norm": 0.5092111567258133, "learning_rate": 7.76410391546909e-06, "loss": 0.2603, "step": 36385 }, { "epoch": 4.243236940298507, "grad_norm": 0.5717120076961497, "learning_rate": 7.759938996541886e-06, "loss": 0.2761, "step": 36390 }, { "epoch": 4.243819962686567, "grad_norm": 0.4567926897637574, "learning_rate": 7.755777012772615e-06, "loss": 0.2576, "step": 36395 }, { "epoch": 4.244402985074627, "grad_norm": 0.49641405661123544, "learning_rate": 7.751617964780131e-06, "loss": 0.2568, "step": 36400 }, { "epoch": 4.244986007462686, "grad_norm": 0.5271208730438519, "learning_rate": 7.747461853182842e-06, "loss": 0.2649, "step": 36405 }, { "epoch": 4.2455690298507465, "grad_norm": 0.5332217545849521, "learning_rate": 7.743308678598722e-06, "loss": 0.2712, "step": 36410 }, { "epoch": 4.246152052238806, "grad_norm": 0.503291882174182, "learning_rate": 7.73915844164531e-06, "loss": 0.2592, "step": 36415 }, { "epoch": 4.246735074626866, "grad_norm": 0.5499570386458553, "learning_rate": 7.73501114293971e-06, "loss": 0.2626, "step": 36420 }, { "epoch": 4.247318097014926, "grad_norm": 0.5075973972142243, "learning_rate": 7.730866783098576e-06, "loss": 0.25, "step": 36425 }, { "epoch": 4.247901119402985, "grad_norm": 0.5677800484300837, "learning_rate": 7.726725362738141e-06, "loss": 0.2873, "step": 36430 }, { "epoch": 4.248484141791045, "grad_norm": 0.5039882471901479, "learning_rate": 7.722586882474191e-06, "loss": 0.2634, "step": 36435 }, { "epoch": 4.249067164179104, "grad_norm": 0.4832140453251526, "learning_rate": 7.71845134292208e-06, "loss": 0.2545, "step": 36440 }, { "epoch": 4.249650186567164, "grad_norm": 0.49668851944428616, "learning_rate": 7.714318744696728e-06, "loss": 0.2624, "step": 36445 }, { "epoch": 4.2502332089552235, "grad_norm": 0.5196422995791229, "learning_rate": 7.710189088412604e-06, "loss": 0.2636, "step": 36450 }, { "epoch": 4.250816231343284, "grad_norm": 0.5591389791142553, "learning_rate": 7.706062374683757e-06, "loss": 0.262, "step": 36455 }, { "epoch": 4.251399253731344, "grad_norm": 0.5311377058182638, "learning_rate": 7.70193860412378e-06, "loss": 0.2682, "step": 36460 }, { "epoch": 4.251982276119403, "grad_norm": 0.5239124817838773, "learning_rate": 7.697817777345852e-06, "loss": 0.2564, "step": 36465 }, { "epoch": 4.252565298507463, "grad_norm": 0.5315985409727104, "learning_rate": 7.693699894962686e-06, "loss": 0.2633, "step": 36470 }, { "epoch": 4.253148320895522, "grad_norm": 0.5551970215290292, "learning_rate": 7.689584957586578e-06, "loss": 0.2682, "step": 36475 }, { "epoch": 4.253731343283582, "grad_norm": 0.5200032944295727, "learning_rate": 7.68547296582938e-06, "loss": 0.2473, "step": 36480 }, { "epoch": 4.254314365671641, "grad_norm": 0.5082693332036062, "learning_rate": 7.681363920302506e-06, "loss": 0.2606, "step": 36485 }, { "epoch": 4.254897388059701, "grad_norm": 0.49708545300777185, "learning_rate": 7.67725782161693e-06, "loss": 0.2549, "step": 36490 }, { "epoch": 4.2554804104477615, "grad_norm": 0.5354157310233019, "learning_rate": 7.673154670383195e-06, "loss": 0.2594, "step": 36495 }, { "epoch": 4.256063432835821, "grad_norm": 0.5855095665349248, "learning_rate": 7.669054467211388e-06, "loss": 0.25, "step": 36500 }, { "epoch": 4.256646455223881, "grad_norm": 0.5123276101728429, "learning_rate": 7.664957212711187e-06, "loss": 0.2676, "step": 36505 }, { "epoch": 4.25722947761194, "grad_norm": 0.5419371953353457, "learning_rate": 7.660862907491795e-06, "loss": 0.2583, "step": 36510 }, { "epoch": 4.2578125, "grad_norm": 0.5071122813583242, "learning_rate": 7.656771552162015e-06, "loss": 0.2549, "step": 36515 }, { "epoch": 4.25839552238806, "grad_norm": 0.5314706075957834, "learning_rate": 7.652683147330177e-06, "loss": 0.2612, "step": 36520 }, { "epoch": 4.258978544776119, "grad_norm": 0.5254949390459906, "learning_rate": 7.6485976936042e-06, "loss": 0.2753, "step": 36525 }, { "epoch": 4.259561567164179, "grad_norm": 0.518860620361999, "learning_rate": 7.644515191591542e-06, "loss": 0.2573, "step": 36530 }, { "epoch": 4.2601445895522385, "grad_norm": 0.5139194775905636, "learning_rate": 7.640435641899236e-06, "loss": 0.2733, "step": 36535 }, { "epoch": 4.260727611940299, "grad_norm": 0.4904973044462139, "learning_rate": 7.636359045133873e-06, "loss": 0.2436, "step": 36540 }, { "epoch": 4.261310634328359, "grad_norm": 0.4845043649626959, "learning_rate": 7.632285401901606e-06, "loss": 0.2521, "step": 36545 }, { "epoch": 4.261893656716418, "grad_norm": 0.5427050745701884, "learning_rate": 7.6282147128081364e-06, "loss": 0.2578, "step": 36550 }, { "epoch": 4.262476679104478, "grad_norm": 0.4804396739028346, "learning_rate": 7.624146978458754e-06, "loss": 0.2494, "step": 36555 }, { "epoch": 4.263059701492537, "grad_norm": 0.524789611447541, "learning_rate": 7.620082199458269e-06, "loss": 0.2719, "step": 36560 }, { "epoch": 4.263642723880597, "grad_norm": 0.5390021497011211, "learning_rate": 7.616020376411098e-06, "loss": 0.2544, "step": 36565 }, { "epoch": 4.264225746268656, "grad_norm": 0.5455055751894204, "learning_rate": 7.611961509921182e-06, "loss": 0.265, "step": 36570 }, { "epoch": 4.264808768656716, "grad_norm": 0.5206793504486821, "learning_rate": 7.6079056005920375e-06, "loss": 0.2488, "step": 36575 }, { "epoch": 4.2653917910447765, "grad_norm": 0.5502080598660063, "learning_rate": 7.603852649026738e-06, "loss": 0.2652, "step": 36580 }, { "epoch": 4.265974813432836, "grad_norm": 0.5539548194259345, "learning_rate": 7.599802655827924e-06, "loss": 0.2557, "step": 36585 }, { "epoch": 4.266557835820896, "grad_norm": 0.4687970332586143, "learning_rate": 7.595755621597788e-06, "loss": 0.2534, "step": 36590 }, { "epoch": 4.267140858208955, "grad_norm": 0.676537775642717, "learning_rate": 7.591711546938086e-06, "loss": 0.2492, "step": 36595 }, { "epoch": 4.267723880597015, "grad_norm": 0.5022555909708639, "learning_rate": 7.587670432450131e-06, "loss": 0.2655, "step": 36600 }, { "epoch": 4.268306902985074, "grad_norm": 0.5339158540274818, "learning_rate": 7.583632278734798e-06, "loss": 0.2622, "step": 36605 }, { "epoch": 4.268889925373134, "grad_norm": 0.5299828743541835, "learning_rate": 7.57959708639252e-06, "loss": 0.2605, "step": 36610 }, { "epoch": 4.269472947761194, "grad_norm": 0.5067527837897201, "learning_rate": 7.575564856023298e-06, "loss": 0.2622, "step": 36615 }, { "epoch": 4.2700559701492535, "grad_norm": 0.5871851959846127, "learning_rate": 7.5715355882266815e-06, "loss": 0.2635, "step": 36620 }, { "epoch": 4.270638992537314, "grad_norm": 0.5452975219675954, "learning_rate": 7.567509283601784e-06, "loss": 0.2634, "step": 36625 }, { "epoch": 4.271222014925373, "grad_norm": 0.5425556897223894, "learning_rate": 7.5634859427472835e-06, "loss": 0.2463, "step": 36630 }, { "epoch": 4.271805037313433, "grad_norm": 0.5659494273780724, "learning_rate": 7.5594655662613995e-06, "loss": 0.2524, "step": 36635 }, { "epoch": 4.272388059701493, "grad_norm": 0.5783645379273165, "learning_rate": 7.5554481547419395e-06, "loss": 0.2734, "step": 36640 }, { "epoch": 4.272971082089552, "grad_norm": 0.5542099066930374, "learning_rate": 7.551433708786243e-06, "loss": 0.2619, "step": 36645 }, { "epoch": 4.273554104477612, "grad_norm": 0.5261202362681132, "learning_rate": 7.547422228991223e-06, "loss": 0.2733, "step": 36650 }, { "epoch": 4.274137126865671, "grad_norm": 0.5093685875923936, "learning_rate": 7.543413715953347e-06, "loss": 0.2824, "step": 36655 }, { "epoch": 4.2747201492537314, "grad_norm": 0.5460849079015364, "learning_rate": 7.539408170268644e-06, "loss": 0.2553, "step": 36660 }, { "epoch": 4.275303171641791, "grad_norm": 0.5560161818099602, "learning_rate": 7.535405592532703e-06, "loss": 0.2553, "step": 36665 }, { "epoch": 4.275886194029851, "grad_norm": 0.4952917383487221, "learning_rate": 7.531405983340668e-06, "loss": 0.2519, "step": 36670 }, { "epoch": 4.276469216417911, "grad_norm": 0.5111450651997773, "learning_rate": 7.527409343287231e-06, "loss": 0.2384, "step": 36675 }, { "epoch": 4.27705223880597, "grad_norm": 0.4905017963777417, "learning_rate": 7.523415672966675e-06, "loss": 0.2559, "step": 36680 }, { "epoch": 4.27763526119403, "grad_norm": 0.4983806910551259, "learning_rate": 7.519424972972797e-06, "loss": 0.2429, "step": 36685 }, { "epoch": 4.278218283582089, "grad_norm": 0.5434853616544607, "learning_rate": 7.515437243898998e-06, "loss": 0.2663, "step": 36690 }, { "epoch": 4.278801305970149, "grad_norm": 0.5324952085708943, "learning_rate": 7.511452486338202e-06, "loss": 0.2679, "step": 36695 }, { "epoch": 4.279384328358209, "grad_norm": 0.5207583171148841, "learning_rate": 7.507470700882905e-06, "loss": 0.2495, "step": 36700 }, { "epoch": 4.2799673507462686, "grad_norm": 0.5184628033405693, "learning_rate": 7.503491888125165e-06, "loss": 0.2523, "step": 36705 }, { "epoch": 4.280550373134329, "grad_norm": 0.5457139838206316, "learning_rate": 7.499516048656589e-06, "loss": 0.2507, "step": 36710 }, { "epoch": 4.281133395522388, "grad_norm": 0.5268301525162578, "learning_rate": 7.495543183068349e-06, "loss": 0.2538, "step": 36715 }, { "epoch": 4.281716417910448, "grad_norm": 0.557522503250984, "learning_rate": 7.491573291951176e-06, "loss": 0.2472, "step": 36720 }, { "epoch": 4.282299440298507, "grad_norm": 0.5443142569282987, "learning_rate": 7.487606375895343e-06, "loss": 0.2773, "step": 36725 }, { "epoch": 4.282882462686567, "grad_norm": 0.4833465040881014, "learning_rate": 7.483642435490706e-06, "loss": 0.2717, "step": 36730 }, { "epoch": 4.283465485074627, "grad_norm": 0.5174779099609201, "learning_rate": 7.479681471326648e-06, "loss": 0.2467, "step": 36735 }, { "epoch": 4.284048507462686, "grad_norm": 0.5953972225493099, "learning_rate": 7.475723483992149e-06, "loss": 0.2769, "step": 36740 }, { "epoch": 4.2846315298507465, "grad_norm": 0.48209904232999795, "learning_rate": 7.471768474075706e-06, "loss": 0.257, "step": 36745 }, { "epoch": 4.285214552238806, "grad_norm": 0.5132628295947472, "learning_rate": 7.467816442165397e-06, "loss": 0.2642, "step": 36750 }, { "epoch": 4.285797574626866, "grad_norm": 0.5161882336247029, "learning_rate": 7.463867388848851e-06, "loss": 0.2647, "step": 36755 }, { "epoch": 4.286380597014926, "grad_norm": 0.5326378076947488, "learning_rate": 7.459921314713253e-06, "loss": 0.2708, "step": 36760 }, { "epoch": 4.286963619402985, "grad_norm": 0.551228320162258, "learning_rate": 7.4559782203453485e-06, "loss": 0.2542, "step": 36765 }, { "epoch": 4.287546641791045, "grad_norm": 0.5006053211843197, "learning_rate": 7.452038106331442e-06, "loss": 0.2403, "step": 36770 }, { "epoch": 4.288129664179104, "grad_norm": 0.5245442263991795, "learning_rate": 7.448100973257381e-06, "loss": 0.2683, "step": 36775 }, { "epoch": 4.288712686567164, "grad_norm": 0.5337926743044439, "learning_rate": 7.444166821708584e-06, "loss": 0.2503, "step": 36780 }, { "epoch": 4.2892957089552235, "grad_norm": 0.4863058576843455, "learning_rate": 7.440235652270024e-06, "loss": 0.2552, "step": 36785 }, { "epoch": 4.289878731343284, "grad_norm": 0.5028778268065867, "learning_rate": 7.436307465526224e-06, "loss": 0.2641, "step": 36790 }, { "epoch": 4.290461753731344, "grad_norm": 0.507916076154588, "learning_rate": 7.432382262061271e-06, "loss": 0.2671, "step": 36795 }, { "epoch": 4.291044776119403, "grad_norm": 0.5394543154021711, "learning_rate": 7.4284600424588045e-06, "loss": 0.2547, "step": 36800 }, { "epoch": 4.291627798507463, "grad_norm": 0.4966968471220312, "learning_rate": 7.424540807302019e-06, "loss": 0.2512, "step": 36805 }, { "epoch": 4.292210820895522, "grad_norm": 0.4970972940700219, "learning_rate": 7.42062455717367e-06, "loss": 0.2599, "step": 36810 }, { "epoch": 4.292793843283582, "grad_norm": 0.5317555340317982, "learning_rate": 7.4167112926560714e-06, "loss": 0.257, "step": 36815 }, { "epoch": 4.293376865671641, "grad_norm": 0.47957480433862504, "learning_rate": 7.412801014331075e-06, "loss": 0.2495, "step": 36820 }, { "epoch": 4.293959888059701, "grad_norm": 0.5040812377491417, "learning_rate": 7.408893722780108e-06, "loss": 0.2669, "step": 36825 }, { "epoch": 4.2945429104477615, "grad_norm": 0.5694451210267427, "learning_rate": 7.4049894185841476e-06, "loss": 0.2567, "step": 36830 }, { "epoch": 4.295125932835821, "grad_norm": 0.5503887213795378, "learning_rate": 7.401088102323729e-06, "loss": 0.253, "step": 36835 }, { "epoch": 4.295708955223881, "grad_norm": 0.5149263772610403, "learning_rate": 7.397189774578939e-06, "loss": 0.2532, "step": 36840 }, { "epoch": 4.29629197761194, "grad_norm": 0.5036942272318263, "learning_rate": 7.393294435929424e-06, "loss": 0.2643, "step": 36845 }, { "epoch": 4.296875, "grad_norm": 0.529239131044159, "learning_rate": 7.389402086954368e-06, "loss": 0.2518, "step": 36850 }, { "epoch": 4.29745802238806, "grad_norm": 0.5311848561685814, "learning_rate": 7.385512728232552e-06, "loss": 0.2537, "step": 36855 }, { "epoch": 4.298041044776119, "grad_norm": 0.5349998590566727, "learning_rate": 7.38162636034226e-06, "loss": 0.268, "step": 36860 }, { "epoch": 4.298624067164179, "grad_norm": 0.5211252168730743, "learning_rate": 7.37774298386138e-06, "loss": 0.261, "step": 36865 }, { "epoch": 4.2992070895522385, "grad_norm": 0.5270155424654448, "learning_rate": 7.373862599367316e-06, "loss": 0.2654, "step": 36870 }, { "epoch": 4.299790111940299, "grad_norm": 0.5244230534494915, "learning_rate": 7.3699852074370605e-06, "loss": 0.262, "step": 36875 }, { "epoch": 4.300373134328359, "grad_norm": 0.5377841525788232, "learning_rate": 7.366110808647128e-06, "loss": 0.2751, "step": 36880 }, { "epoch": 4.300956156716418, "grad_norm": 0.4975736802719757, "learning_rate": 7.362239403573614e-06, "loss": 0.2518, "step": 36885 }, { "epoch": 4.301539179104478, "grad_norm": 0.5036460743072355, "learning_rate": 7.3583709927921574e-06, "loss": 0.2587, "step": 36890 }, { "epoch": 4.302122201492537, "grad_norm": 0.5484439567299549, "learning_rate": 7.3545055768779526e-06, "loss": 0.2598, "step": 36895 }, { "epoch": 4.302705223880597, "grad_norm": 0.4981576744832116, "learning_rate": 7.350643156405751e-06, "loss": 0.2466, "step": 36900 }, { "epoch": 4.303288246268656, "grad_norm": 0.4871133271522615, "learning_rate": 7.346783731949864e-06, "loss": 0.2612, "step": 36905 }, { "epoch": 4.303871268656716, "grad_norm": 0.5570397093413274, "learning_rate": 7.342927304084132e-06, "loss": 0.2583, "step": 36910 }, { "epoch": 4.3044542910447765, "grad_norm": 0.5341611951795412, "learning_rate": 7.339073873381991e-06, "loss": 0.2463, "step": 36915 }, { "epoch": 4.305037313432836, "grad_norm": 0.5083869474138867, "learning_rate": 7.335223440416391e-06, "loss": 0.2592, "step": 36920 }, { "epoch": 4.305620335820896, "grad_norm": 0.49069501591445763, "learning_rate": 7.3313760057598715e-06, "loss": 0.261, "step": 36925 }, { "epoch": 4.306203358208955, "grad_norm": 0.5490578683808961, "learning_rate": 7.327531569984497e-06, "loss": 0.2597, "step": 36930 }, { "epoch": 4.306786380597015, "grad_norm": 0.53560865433887, "learning_rate": 7.3236901336619024e-06, "loss": 0.2525, "step": 36935 }, { "epoch": 4.307369402985074, "grad_norm": 0.5552324863820831, "learning_rate": 7.319851697363271e-06, "loss": 0.2585, "step": 36940 }, { "epoch": 4.307952425373134, "grad_norm": 0.5362159037180242, "learning_rate": 7.316016261659342e-06, "loss": 0.2607, "step": 36945 }, { "epoch": 4.308535447761194, "grad_norm": 0.47378728948205756, "learning_rate": 7.31218382712041e-06, "loss": 0.2435, "step": 36950 }, { "epoch": 4.3091184701492535, "grad_norm": 0.541388164400969, "learning_rate": 7.308354394316322e-06, "loss": 0.2523, "step": 36955 }, { "epoch": 4.309701492537314, "grad_norm": 0.5202445569550581, "learning_rate": 7.304527963816472e-06, "loss": 0.2588, "step": 36960 }, { "epoch": 4.310284514925373, "grad_norm": 0.5339968171744611, "learning_rate": 7.300704536189819e-06, "loss": 0.2585, "step": 36965 }, { "epoch": 4.310867537313433, "grad_norm": 0.5024294225573035, "learning_rate": 7.2968841120048666e-06, "loss": 0.2579, "step": 36970 }, { "epoch": 4.311450559701493, "grad_norm": 0.5578889793339405, "learning_rate": 7.293066691829676e-06, "loss": 0.2499, "step": 36975 }, { "epoch": 4.312033582089552, "grad_norm": 0.5927410826730686, "learning_rate": 7.289252276231863e-06, "loss": 0.2569, "step": 36980 }, { "epoch": 4.312616604477612, "grad_norm": 0.5151664246773885, "learning_rate": 7.285440865778594e-06, "loss": 0.2432, "step": 36985 }, { "epoch": 4.313199626865671, "grad_norm": 0.5322956216373098, "learning_rate": 7.281632461036594e-06, "loss": 0.2595, "step": 36990 }, { "epoch": 4.3137826492537314, "grad_norm": 0.5074734962833999, "learning_rate": 7.277827062572121e-06, "loss": 0.263, "step": 36995 }, { "epoch": 4.314365671641791, "grad_norm": 0.5277409660614346, "learning_rate": 7.27402467095102e-06, "loss": 0.2653, "step": 37000 }, { "epoch": 4.314948694029851, "grad_norm": 0.548905404143263, "learning_rate": 7.2702252867386575e-06, "loss": 0.2684, "step": 37005 }, { "epoch": 4.315531716417911, "grad_norm": 0.5514609071435579, "learning_rate": 7.266428910499971e-06, "loss": 0.2735, "step": 37010 }, { "epoch": 4.31611473880597, "grad_norm": 0.5429758104363809, "learning_rate": 7.262635542799444e-06, "loss": 0.2652, "step": 37015 }, { "epoch": 4.31669776119403, "grad_norm": 0.5303149317593546, "learning_rate": 7.258845184201111e-06, "loss": 0.264, "step": 37020 }, { "epoch": 4.317280783582089, "grad_norm": 0.5060160025827667, "learning_rate": 7.255057835268567e-06, "loss": 0.2548, "step": 37025 }, { "epoch": 4.317863805970149, "grad_norm": 0.5277354228805766, "learning_rate": 7.251273496564957e-06, "loss": 0.2623, "step": 37030 }, { "epoch": 4.318446828358209, "grad_norm": 0.5563520398657388, "learning_rate": 7.2474921686529625e-06, "loss": 0.2716, "step": 37035 }, { "epoch": 4.3190298507462686, "grad_norm": 0.5057708969419261, "learning_rate": 7.243713852094848e-06, "loss": 0.2426, "step": 37040 }, { "epoch": 4.319612873134329, "grad_norm": 0.5564721538521776, "learning_rate": 7.239938547452394e-06, "loss": 0.2677, "step": 37045 }, { "epoch": 4.320195895522388, "grad_norm": 0.5034968721136049, "learning_rate": 7.2361662552869734e-06, "loss": 0.2505, "step": 37050 }, { "epoch": 4.320778917910448, "grad_norm": 0.5818160184561831, "learning_rate": 7.232396976159475e-06, "loss": 0.2579, "step": 37055 }, { "epoch": 4.321361940298507, "grad_norm": 0.5601487613092481, "learning_rate": 7.228630710630356e-06, "loss": 0.2725, "step": 37060 }, { "epoch": 4.321944962686567, "grad_norm": 0.4964651675982269, "learning_rate": 7.224867459259628e-06, "loss": 0.2591, "step": 37065 }, { "epoch": 4.322527985074627, "grad_norm": 0.4994096964254231, "learning_rate": 7.221107222606851e-06, "loss": 0.2415, "step": 37070 }, { "epoch": 4.323111007462686, "grad_norm": 0.503197828587518, "learning_rate": 7.217350001231131e-06, "loss": 0.2603, "step": 37075 }, { "epoch": 4.3236940298507465, "grad_norm": 0.5240381374738424, "learning_rate": 7.21359579569114e-06, "loss": 0.2603, "step": 37080 }, { "epoch": 4.324277052238806, "grad_norm": 0.5543165349117316, "learning_rate": 7.2098446065450795e-06, "loss": 0.2647, "step": 37085 }, { "epoch": 4.324860074626866, "grad_norm": 0.5156631769536357, "learning_rate": 7.206096434350728e-06, "loss": 0.2573, "step": 37090 }, { "epoch": 4.325443097014926, "grad_norm": 0.5425625059272369, "learning_rate": 7.202351279665391e-06, "loss": 0.2706, "step": 37095 }, { "epoch": 4.326026119402985, "grad_norm": 0.5329765676788932, "learning_rate": 7.198609143045948e-06, "loss": 0.2468, "step": 37100 }, { "epoch": 4.326609141791045, "grad_norm": 0.5254958625670477, "learning_rate": 7.194870025048812e-06, "loss": 0.2498, "step": 37105 }, { "epoch": 4.327192164179104, "grad_norm": 0.50014405669136, "learning_rate": 7.191133926229957e-06, "loss": 0.2613, "step": 37110 }, { "epoch": 4.327775186567164, "grad_norm": 0.5080791032131875, "learning_rate": 7.187400847144904e-06, "loss": 0.2636, "step": 37115 }, { "epoch": 4.3283582089552235, "grad_norm": 0.4947112924898851, "learning_rate": 7.183670788348726e-06, "loss": 0.241, "step": 37120 }, { "epoch": 4.328941231343284, "grad_norm": 0.5120615479347017, "learning_rate": 7.1799437503960465e-06, "loss": 0.2366, "step": 37125 }, { "epoch": 4.329524253731344, "grad_norm": 0.495302330072865, "learning_rate": 7.176219733841047e-06, "loss": 0.2547, "step": 37130 }, { "epoch": 4.330107276119403, "grad_norm": 0.5100176383568668, "learning_rate": 7.17249873923744e-06, "loss": 0.2394, "step": 37135 }, { "epoch": 4.330690298507463, "grad_norm": 0.5075922533880023, "learning_rate": 7.168780767138512e-06, "loss": 0.2528, "step": 37140 }, { "epoch": 4.331273320895522, "grad_norm": 0.5235847227785626, "learning_rate": 7.165065818097086e-06, "loss": 0.2542, "step": 37145 }, { "epoch": 4.331856343283582, "grad_norm": 0.5491017386277993, "learning_rate": 7.161353892665538e-06, "loss": 0.2592, "step": 37150 }, { "epoch": 4.332439365671641, "grad_norm": 0.5430027835984829, "learning_rate": 7.157644991395801e-06, "loss": 0.2729, "step": 37155 }, { "epoch": 4.333022388059701, "grad_norm": 0.4914792423545939, "learning_rate": 7.1539391148393474e-06, "loss": 0.2511, "step": 37160 }, { "epoch": 4.3336054104477615, "grad_norm": 0.551495047944463, "learning_rate": 7.1502362635472135e-06, "loss": 0.271, "step": 37165 }, { "epoch": 4.334188432835821, "grad_norm": 0.5804205968353219, "learning_rate": 7.146536438069963e-06, "loss": 0.2753, "step": 37170 }, { "epoch": 4.334771455223881, "grad_norm": 0.5042503043643048, "learning_rate": 7.142839638957743e-06, "loss": 0.2556, "step": 37175 }, { "epoch": 4.33535447761194, "grad_norm": 0.5813491586359381, "learning_rate": 7.139145866760217e-06, "loss": 0.2858, "step": 37180 }, { "epoch": 4.3359375, "grad_norm": 0.485487703216082, "learning_rate": 7.1354551220266216e-06, "loss": 0.2598, "step": 37185 }, { "epoch": 4.33652052238806, "grad_norm": 0.48698592173326216, "learning_rate": 7.1317674053057335e-06, "loss": 0.2445, "step": 37190 }, { "epoch": 4.337103544776119, "grad_norm": 0.5703412584081826, "learning_rate": 7.128082717145881e-06, "loss": 0.2636, "step": 37195 }, { "epoch": 4.337686567164179, "grad_norm": 0.5710686569373065, "learning_rate": 7.124401058094938e-06, "loss": 0.2658, "step": 37200 }, { "epoch": 4.3382695895522385, "grad_norm": 0.5424660992459078, "learning_rate": 7.120722428700342e-06, "loss": 0.2525, "step": 37205 }, { "epoch": 4.338852611940299, "grad_norm": 0.5074965430480484, "learning_rate": 7.117046829509057e-06, "loss": 0.2566, "step": 37210 }, { "epoch": 4.339435634328359, "grad_norm": 0.5156586706368018, "learning_rate": 7.1133742610676196e-06, "loss": 0.2725, "step": 37215 }, { "epoch": 4.340018656716418, "grad_norm": 0.5239252788403638, "learning_rate": 7.109704723922094e-06, "loss": 0.2687, "step": 37220 }, { "epoch": 4.340601679104478, "grad_norm": 0.5231514571935841, "learning_rate": 7.106038218618125e-06, "loss": 0.2562, "step": 37225 }, { "epoch": 4.341184701492537, "grad_norm": 0.4985416893829604, "learning_rate": 7.102374745700866e-06, "loss": 0.2724, "step": 37230 }, { "epoch": 4.341767723880597, "grad_norm": 0.5856112213107807, "learning_rate": 7.098714305715051e-06, "loss": 0.2606, "step": 37235 }, { "epoch": 4.342350746268656, "grad_norm": 0.5305915503730493, "learning_rate": 7.0950568992049494e-06, "loss": 0.2516, "step": 37240 }, { "epoch": 4.342933768656716, "grad_norm": 0.506075485166218, "learning_rate": 7.091402526714383e-06, "loss": 0.2598, "step": 37245 }, { "epoch": 4.3435167910447765, "grad_norm": 0.5338640013791274, "learning_rate": 7.087751188786723e-06, "loss": 0.2665, "step": 37250 }, { "epoch": 4.344099813432836, "grad_norm": 0.5025960931021678, "learning_rate": 7.084102885964892e-06, "loss": 0.2612, "step": 37255 }, { "epoch": 4.344682835820896, "grad_norm": 0.550167818814279, "learning_rate": 7.080457618791344e-06, "loss": 0.2632, "step": 37260 }, { "epoch": 4.345265858208955, "grad_norm": 0.5413176458958009, "learning_rate": 7.076815387808115e-06, "loss": 0.2762, "step": 37265 }, { "epoch": 4.345848880597015, "grad_norm": 0.5073742095205621, "learning_rate": 7.0731761935567495e-06, "loss": 0.2718, "step": 37270 }, { "epoch": 4.346431902985074, "grad_norm": 0.5089404131817162, "learning_rate": 7.0695400365783784e-06, "loss": 0.243, "step": 37275 }, { "epoch": 4.347014925373134, "grad_norm": 0.4872564680605731, "learning_rate": 7.0659069174136544e-06, "loss": 0.2399, "step": 37280 }, { "epoch": 4.347597947761194, "grad_norm": 0.5262780654065554, "learning_rate": 7.062276836602786e-06, "loss": 0.257, "step": 37285 }, { "epoch": 4.3481809701492535, "grad_norm": 0.5568681795604292, "learning_rate": 7.058649794685537e-06, "loss": 0.2523, "step": 37290 }, { "epoch": 4.348763992537314, "grad_norm": 0.5185483182676475, "learning_rate": 7.055025792201212e-06, "loss": 0.2591, "step": 37295 }, { "epoch": 4.349347014925373, "grad_norm": 0.5385127587917762, "learning_rate": 7.051404829688663e-06, "loss": 0.2492, "step": 37300 }, { "epoch": 4.349930037313433, "grad_norm": 0.5419968278587466, "learning_rate": 7.047786907686296e-06, "loss": 0.2554, "step": 37305 }, { "epoch": 4.350513059701493, "grad_norm": 0.5825834198515136, "learning_rate": 7.044172026732059e-06, "loss": 0.2765, "step": 37310 }, { "epoch": 4.351096082089552, "grad_norm": 0.5198659855109098, "learning_rate": 7.040560187363447e-06, "loss": 0.2597, "step": 37315 }, { "epoch": 4.351679104477612, "grad_norm": 0.5494992849010548, "learning_rate": 7.036951390117512e-06, "loss": 0.2596, "step": 37320 }, { "epoch": 4.352262126865671, "grad_norm": 0.5021895742655129, "learning_rate": 7.033345635530844e-06, "loss": 0.2605, "step": 37325 }, { "epoch": 4.3528451492537314, "grad_norm": 0.5077176016169493, "learning_rate": 7.029742924139586e-06, "loss": 0.2593, "step": 37330 }, { "epoch": 4.353428171641791, "grad_norm": 0.5192865679477117, "learning_rate": 7.0261432564794255e-06, "loss": 0.2536, "step": 37335 }, { "epoch": 4.354011194029851, "grad_norm": 0.4977951065713548, "learning_rate": 7.022546633085604e-06, "loss": 0.2628, "step": 37340 }, { "epoch": 4.354594216417911, "grad_norm": 0.9155990939611681, "learning_rate": 7.018953054492889e-06, "loss": 0.2565, "step": 37345 }, { "epoch": 4.35517723880597, "grad_norm": 0.525334257689392, "learning_rate": 7.015362521235632e-06, "loss": 0.2483, "step": 37350 }, { "epoch": 4.35576026119403, "grad_norm": 0.5142064584916372, "learning_rate": 7.011775033847698e-06, "loss": 0.2713, "step": 37355 }, { "epoch": 4.356343283582089, "grad_norm": 0.5175803551347481, "learning_rate": 7.008190592862514e-06, "loss": 0.2664, "step": 37360 }, { "epoch": 4.356926305970149, "grad_norm": 0.5101006717702697, "learning_rate": 7.004609198813053e-06, "loss": 0.2556, "step": 37365 }, { "epoch": 4.357509328358209, "grad_norm": 0.5424198649205665, "learning_rate": 7.0010308522318355e-06, "loss": 0.2663, "step": 37370 }, { "epoch": 4.3580923507462686, "grad_norm": 0.49956238051452523, "learning_rate": 6.997455553650924e-06, "loss": 0.252, "step": 37375 }, { "epoch": 4.358675373134329, "grad_norm": 0.5042968582061769, "learning_rate": 6.9938833036019365e-06, "loss": 0.2585, "step": 37380 }, { "epoch": 4.359258395522388, "grad_norm": 0.5095650472894148, "learning_rate": 6.990314102616022e-06, "loss": 0.2693, "step": 37385 }, { "epoch": 4.359841417910448, "grad_norm": 0.4672276363984919, "learning_rate": 6.9867479512239e-06, "loss": 0.2693, "step": 37390 }, { "epoch": 4.360424440298507, "grad_norm": 0.4978536274002948, "learning_rate": 6.98318484995581e-06, "loss": 0.2423, "step": 37395 }, { "epoch": 4.361007462686567, "grad_norm": 0.515237297087287, "learning_rate": 6.979624799341565e-06, "loss": 0.2673, "step": 37400 }, { "epoch": 4.361590485074627, "grad_norm": 0.5222560195887122, "learning_rate": 6.976067799910499e-06, "loss": 0.2682, "step": 37405 }, { "epoch": 4.362173507462686, "grad_norm": 0.5307135877330541, "learning_rate": 6.972513852191508e-06, "loss": 0.2688, "step": 37410 }, { "epoch": 4.3627565298507465, "grad_norm": 0.5217438038504087, "learning_rate": 6.968962956713028e-06, "loss": 0.2642, "step": 37415 }, { "epoch": 4.363339552238806, "grad_norm": 0.5155046445501493, "learning_rate": 6.965415114003046e-06, "loss": 0.2494, "step": 37420 }, { "epoch": 4.363922574626866, "grad_norm": 0.5647541931193226, "learning_rate": 6.96187032458909e-06, "loss": 0.2619, "step": 37425 }, { "epoch": 4.364505597014926, "grad_norm": 0.5424547728578071, "learning_rate": 6.958328588998242e-06, "loss": 0.2639, "step": 37430 }, { "epoch": 4.365088619402985, "grad_norm": 0.5574838744443145, "learning_rate": 6.954789907757112e-06, "loss": 0.2459, "step": 37435 }, { "epoch": 4.365671641791045, "grad_norm": 0.5332672524908634, "learning_rate": 6.951254281391881e-06, "loss": 0.2557, "step": 37440 }, { "epoch": 4.366254664179104, "grad_norm": 0.512641420673527, "learning_rate": 6.947721710428251e-06, "loss": 0.2584, "step": 37445 }, { "epoch": 4.366837686567164, "grad_norm": 0.503181365629127, "learning_rate": 6.944192195391494e-06, "loss": 0.2521, "step": 37450 }, { "epoch": 4.3674207089552235, "grad_norm": 0.5355075621911289, "learning_rate": 6.9406657368064055e-06, "loss": 0.2672, "step": 37455 }, { "epoch": 4.368003731343284, "grad_norm": 0.5356084017821451, "learning_rate": 6.937142335197338e-06, "loss": 0.2667, "step": 37460 }, { "epoch": 4.368586753731344, "grad_norm": 0.5057706029927774, "learning_rate": 6.93362199108819e-06, "loss": 0.2513, "step": 37465 }, { "epoch": 4.369169776119403, "grad_norm": 0.5140002040872937, "learning_rate": 6.930104705002403e-06, "loss": 0.2517, "step": 37470 }, { "epoch": 4.369752798507463, "grad_norm": 0.5497836785973316, "learning_rate": 6.9265904774629585e-06, "loss": 0.2552, "step": 37475 }, { "epoch": 4.370335820895522, "grad_norm": 0.5651483183149191, "learning_rate": 6.9230793089924005e-06, "loss": 0.2688, "step": 37480 }, { "epoch": 4.370918843283582, "grad_norm": 0.5277319736781791, "learning_rate": 6.919571200112787e-06, "loss": 0.2613, "step": 37485 }, { "epoch": 4.371501865671641, "grad_norm": 0.5174185756631917, "learning_rate": 6.916066151345761e-06, "loss": 0.2588, "step": 37490 }, { "epoch": 4.372084888059701, "grad_norm": 0.5221756811171331, "learning_rate": 6.912564163212476e-06, "loss": 0.2617, "step": 37495 }, { "epoch": 4.3726679104477615, "grad_norm": 0.5485975180262437, "learning_rate": 6.909065236233644e-06, "loss": 0.2635, "step": 37500 }, { "epoch": 4.373250932835821, "grad_norm": 0.4977335097480914, "learning_rate": 6.90556937092953e-06, "loss": 0.2471, "step": 37505 }, { "epoch": 4.373833955223881, "grad_norm": 0.5403404673699739, "learning_rate": 6.90207656781993e-06, "loss": 0.2619, "step": 37510 }, { "epoch": 4.37441697761194, "grad_norm": 0.5049022953370631, "learning_rate": 6.898586827424194e-06, "loss": 0.2455, "step": 37515 }, { "epoch": 4.375, "grad_norm": 0.5599830111419839, "learning_rate": 6.8951001502612065e-06, "loss": 0.2542, "step": 37520 }, { "epoch": 4.37558302238806, "grad_norm": 0.48238430975932345, "learning_rate": 6.891616536849416e-06, "loss": 0.2606, "step": 37525 }, { "epoch": 4.376166044776119, "grad_norm": 0.5316769961969444, "learning_rate": 6.888135987706787e-06, "loss": 0.2825, "step": 37530 }, { "epoch": 4.376749067164179, "grad_norm": 0.5871819644670407, "learning_rate": 6.884658503350851e-06, "loss": 0.2695, "step": 37535 }, { "epoch": 4.3773320895522385, "grad_norm": 0.5949800330782344, "learning_rate": 6.881184084298675e-06, "loss": 0.2643, "step": 37540 }, { "epoch": 4.377915111940299, "grad_norm": 0.5479774258194761, "learning_rate": 6.877712731066875e-06, "loss": 0.2632, "step": 37545 }, { "epoch": 4.378498134328359, "grad_norm": 0.47928653674232297, "learning_rate": 6.874244444171607e-06, "loss": 0.2489, "step": 37550 }, { "epoch": 4.379081156716418, "grad_norm": 0.7645855332023864, "learning_rate": 6.870779224128571e-06, "loss": 0.2432, "step": 37555 }, { "epoch": 4.379664179104478, "grad_norm": 0.4941948185524788, "learning_rate": 6.867317071453007e-06, "loss": 0.2599, "step": 37560 }, { "epoch": 4.380247201492537, "grad_norm": 0.5260540431972204, "learning_rate": 6.863857986659716e-06, "loss": 0.2419, "step": 37565 }, { "epoch": 4.380830223880597, "grad_norm": 0.48896765210516463, "learning_rate": 6.860401970263017e-06, "loss": 0.2547, "step": 37570 }, { "epoch": 4.381413246268656, "grad_norm": 0.5288192628090171, "learning_rate": 6.856949022776798e-06, "loss": 0.2626, "step": 37575 }, { "epoch": 4.381996268656716, "grad_norm": 0.48623900419051974, "learning_rate": 6.8534991447144706e-06, "loss": 0.2365, "step": 37580 }, { "epoch": 4.3825792910447765, "grad_norm": 0.554592369507337, "learning_rate": 6.850052336589008e-06, "loss": 0.2523, "step": 37585 }, { "epoch": 4.383162313432836, "grad_norm": 0.4765066061767422, "learning_rate": 6.8466085989129066e-06, "loss": 0.2388, "step": 37590 }, { "epoch": 4.383745335820896, "grad_norm": 0.49015261367610646, "learning_rate": 6.843167932198227e-06, "loss": 0.2608, "step": 37595 }, { "epoch": 4.384328358208955, "grad_norm": 0.5079185723432356, "learning_rate": 6.839730336956554e-06, "loss": 0.2529, "step": 37600 }, { "epoch": 4.384911380597015, "grad_norm": 0.5183434635482761, "learning_rate": 6.836295813699031e-06, "loss": 0.2556, "step": 37605 }, { "epoch": 4.385494402985074, "grad_norm": 0.5254416040626212, "learning_rate": 6.83286436293634e-06, "loss": 0.269, "step": 37610 }, { "epoch": 4.386077425373134, "grad_norm": 0.5626541633596372, "learning_rate": 6.829435985178708e-06, "loss": 0.253, "step": 37615 }, { "epoch": 4.386660447761194, "grad_norm": 0.5549683727914495, "learning_rate": 6.826010680935886e-06, "loss": 0.2691, "step": 37620 }, { "epoch": 4.3872434701492535, "grad_norm": 0.49828536853764854, "learning_rate": 6.8225884507172005e-06, "loss": 0.25, "step": 37625 }, { "epoch": 4.387826492537314, "grad_norm": 0.4931098129488875, "learning_rate": 6.819169295031493e-06, "loss": 0.2566, "step": 37630 }, { "epoch": 4.388409514925373, "grad_norm": 0.5532586822967704, "learning_rate": 6.815753214387172e-06, "loss": 0.2801, "step": 37635 }, { "epoch": 4.388992537313433, "grad_norm": 0.5817928904723288, "learning_rate": 6.812340209292164e-06, "loss": 0.2634, "step": 37640 }, { "epoch": 4.389575559701493, "grad_norm": 0.496738795223984, "learning_rate": 6.808930280253956e-06, "loss": 0.2596, "step": 37645 }, { "epoch": 4.390158582089552, "grad_norm": 0.5210410477638936, "learning_rate": 6.80552342777957e-06, "loss": 0.2594, "step": 37650 }, { "epoch": 4.390741604477612, "grad_norm": 0.5606455872288548, "learning_rate": 6.80211965237557e-06, "loss": 0.2729, "step": 37655 }, { "epoch": 4.391324626865671, "grad_norm": 0.5408706260037408, "learning_rate": 6.79871895454807e-06, "loss": 0.2674, "step": 37660 }, { "epoch": 4.3919076492537314, "grad_norm": 0.6566004981267262, "learning_rate": 6.7953213348027235e-06, "loss": 0.2562, "step": 37665 }, { "epoch": 4.392490671641791, "grad_norm": 0.4786957306654934, "learning_rate": 6.791926793644713e-06, "loss": 0.2554, "step": 37670 }, { "epoch": 4.393073694029851, "grad_norm": 0.5420558106985854, "learning_rate": 6.7885353315787825e-06, "loss": 0.2707, "step": 37675 }, { "epoch": 4.393656716417911, "grad_norm": 0.5141114362646728, "learning_rate": 6.785146949109206e-06, "loss": 0.2483, "step": 37680 }, { "epoch": 4.39423973880597, "grad_norm": 0.49279316333047224, "learning_rate": 6.781761646739805e-06, "loss": 0.253, "step": 37685 }, { "epoch": 4.39482276119403, "grad_norm": 0.5237432022683991, "learning_rate": 6.778379424973943e-06, "loss": 0.2467, "step": 37690 }, { "epoch": 4.395405783582089, "grad_norm": 0.528539998651686, "learning_rate": 6.775000284314523e-06, "loss": 0.2415, "step": 37695 }, { "epoch": 4.395988805970149, "grad_norm": 0.5355518181400558, "learning_rate": 6.771624225263994e-06, "loss": 0.2586, "step": 37700 }, { "epoch": 4.396571828358209, "grad_norm": 0.5007886297156655, "learning_rate": 6.768251248324333e-06, "loss": 0.26, "step": 37705 }, { "epoch": 4.3971548507462686, "grad_norm": 0.5096148207366898, "learning_rate": 6.764881353997082e-06, "loss": 0.2583, "step": 37710 }, { "epoch": 4.397737873134329, "grad_norm": 0.5326093420108741, "learning_rate": 6.761514542783308e-06, "loss": 0.2721, "step": 37715 }, { "epoch": 4.398320895522388, "grad_norm": 0.5451094943829453, "learning_rate": 6.758150815183618e-06, "loss": 0.2727, "step": 37720 }, { "epoch": 4.398903917910448, "grad_norm": 0.51444907180675, "learning_rate": 6.7547901716981704e-06, "loss": 0.2681, "step": 37725 }, { "epoch": 4.399486940298507, "grad_norm": 0.5171071872102907, "learning_rate": 6.751432612826664e-06, "loss": 0.2643, "step": 37730 }, { "epoch": 4.400069962686567, "grad_norm": 0.5994903914611903, "learning_rate": 6.748078139068327e-06, "loss": 0.2643, "step": 37735 }, { "epoch": 4.400652985074627, "grad_norm": 0.5187717746664459, "learning_rate": 6.7447267509219494e-06, "loss": 0.2472, "step": 37740 }, { "epoch": 4.401236007462686, "grad_norm": 0.5098504560103423, "learning_rate": 6.741378448885838e-06, "loss": 0.2722, "step": 37745 }, { "epoch": 4.4018190298507465, "grad_norm": 0.5292380796106622, "learning_rate": 6.738033233457863e-06, "loss": 0.2529, "step": 37750 }, { "epoch": 4.402402052238806, "grad_norm": 0.5311520117308983, "learning_rate": 6.734691105135417e-06, "loss": 0.2552, "step": 37755 }, { "epoch": 4.402985074626866, "grad_norm": 0.5129788185616609, "learning_rate": 6.7313520644154555e-06, "loss": 0.248, "step": 37760 }, { "epoch": 4.403568097014926, "grad_norm": 0.48753517716534006, "learning_rate": 6.7280161117944495e-06, "loss": 0.2562, "step": 37765 }, { "epoch": 4.404151119402985, "grad_norm": 0.4782181838575392, "learning_rate": 6.724683247768427e-06, "loss": 0.2416, "step": 37770 }, { "epoch": 4.404734141791045, "grad_norm": 0.5240340461399262, "learning_rate": 6.721353472832953e-06, "loss": 0.2448, "step": 37775 }, { "epoch": 4.405317164179104, "grad_norm": 0.545465169069081, "learning_rate": 6.718026787483131e-06, "loss": 0.2622, "step": 37780 }, { "epoch": 4.405900186567164, "grad_norm": 0.5545305276793467, "learning_rate": 6.714703192213614e-06, "loss": 0.2803, "step": 37785 }, { "epoch": 4.4064832089552235, "grad_norm": 0.5516414717751895, "learning_rate": 6.7113826875185885e-06, "loss": 0.2627, "step": 37790 }, { "epoch": 4.407066231343284, "grad_norm": 0.5450123974641333, "learning_rate": 6.7080652738917655e-06, "loss": 0.2617, "step": 37795 }, { "epoch": 4.407649253731344, "grad_norm": 0.503846458792622, "learning_rate": 6.704750951826438e-06, "loss": 0.2571, "step": 37800 }, { "epoch": 4.408232276119403, "grad_norm": 0.5109976775523121, "learning_rate": 6.701439721815391e-06, "loss": 0.2755, "step": 37805 }, { "epoch": 4.408815298507463, "grad_norm": 0.5148988499513758, "learning_rate": 6.698131584350989e-06, "loss": 0.2503, "step": 37810 }, { "epoch": 4.409398320895522, "grad_norm": 0.5485123842856282, "learning_rate": 6.69482653992511e-06, "loss": 0.2774, "step": 37815 }, { "epoch": 4.409981343283582, "grad_norm": 0.5346591042205974, "learning_rate": 6.691524589029188e-06, "loss": 0.2549, "step": 37820 }, { "epoch": 4.410564365671641, "grad_norm": 0.4997498350650287, "learning_rate": 6.688225732154189e-06, "loss": 0.2543, "step": 37825 }, { "epoch": 4.411147388059701, "grad_norm": 0.4910896586352394, "learning_rate": 6.684929969790622e-06, "loss": 0.2528, "step": 37830 }, { "epoch": 4.4117304104477615, "grad_norm": 0.5400898482405075, "learning_rate": 6.6816373024285365e-06, "loss": 0.2726, "step": 37835 }, { "epoch": 4.412313432835821, "grad_norm": 0.5161745050709917, "learning_rate": 6.6783477305575215e-06, "loss": 0.2599, "step": 37840 }, { "epoch": 4.412896455223881, "grad_norm": 0.5276429255682854, "learning_rate": 6.675061254666702e-06, "loss": 0.2581, "step": 37845 }, { "epoch": 4.41347947761194, "grad_norm": 0.4720099071014724, "learning_rate": 6.671777875244745e-06, "loss": 0.2557, "step": 37850 }, { "epoch": 4.4140625, "grad_norm": 0.5678187682080887, "learning_rate": 6.668497592779857e-06, "loss": 0.2711, "step": 37855 }, { "epoch": 4.41464552238806, "grad_norm": 0.5280209993381787, "learning_rate": 6.665220407759788e-06, "loss": 0.2647, "step": 37860 }, { "epoch": 4.415228544776119, "grad_norm": 0.5225296458493709, "learning_rate": 6.661946320671822e-06, "loss": 0.2525, "step": 37865 }, { "epoch": 4.415811567164179, "grad_norm": 0.5453569617633768, "learning_rate": 6.658675332002787e-06, "loss": 0.2592, "step": 37870 }, { "epoch": 4.4163945895522385, "grad_norm": 0.5199782382147519, "learning_rate": 6.655407442239047e-06, "loss": 0.2426, "step": 37875 }, { "epoch": 4.416977611940299, "grad_norm": 0.5335249567633386, "learning_rate": 6.652142651866497e-06, "loss": 0.2642, "step": 37880 }, { "epoch": 4.417560634328359, "grad_norm": 0.5715476011975159, "learning_rate": 6.648880961370593e-06, "loss": 0.261, "step": 37885 }, { "epoch": 4.418143656716418, "grad_norm": 0.4740952002785889, "learning_rate": 6.645622371236314e-06, "loss": 0.254, "step": 37890 }, { "epoch": 4.418726679104478, "grad_norm": 0.5514382405706211, "learning_rate": 6.642366881948173e-06, "loss": 0.2622, "step": 37895 }, { "epoch": 4.419309701492537, "grad_norm": 0.4822002285688902, "learning_rate": 6.639114493990238e-06, "loss": 0.2498, "step": 37900 }, { "epoch": 4.419892723880597, "grad_norm": 0.5564968018417733, "learning_rate": 6.635865207846106e-06, "loss": 0.263, "step": 37905 }, { "epoch": 4.420475746268656, "grad_norm": 0.5677505103823726, "learning_rate": 6.6326190239989135e-06, "loss": 0.2576, "step": 37910 }, { "epoch": 4.421058768656716, "grad_norm": 0.5207445653359911, "learning_rate": 6.629375942931345e-06, "loss": 0.2482, "step": 37915 }, { "epoch": 4.4216417910447765, "grad_norm": 0.5403295097755192, "learning_rate": 6.626135965125597e-06, "loss": 0.2512, "step": 37920 }, { "epoch": 4.422224813432836, "grad_norm": 0.6004710947281864, "learning_rate": 6.622899091063442e-06, "loss": 0.2824, "step": 37925 }, { "epoch": 4.422807835820896, "grad_norm": 0.49440557059003526, "learning_rate": 6.61966532122616e-06, "loss": 0.2607, "step": 37930 }, { "epoch": 4.423390858208955, "grad_norm": 0.5045026705601957, "learning_rate": 6.6164346560945935e-06, "loss": 0.2319, "step": 37935 }, { "epoch": 4.423973880597015, "grad_norm": 0.5545947136972534, "learning_rate": 6.613207096149099e-06, "loss": 0.2706, "step": 37940 }, { "epoch": 4.424556902985074, "grad_norm": 0.5364397445538137, "learning_rate": 6.609982641869591e-06, "loss": 0.2636, "step": 37945 }, { "epoch": 4.425139925373134, "grad_norm": 0.49659530270108676, "learning_rate": 6.606761293735513e-06, "loss": 0.2552, "step": 37950 }, { "epoch": 4.425722947761194, "grad_norm": 0.46371638977704815, "learning_rate": 6.6035430522258455e-06, "loss": 0.2522, "step": 37955 }, { "epoch": 4.4263059701492535, "grad_norm": 0.5048728294319823, "learning_rate": 6.600327917819114e-06, "loss": 0.2493, "step": 37960 }, { "epoch": 4.426888992537314, "grad_norm": 0.5910748256828204, "learning_rate": 6.597115890993383e-06, "loss": 0.2545, "step": 37965 }, { "epoch": 4.427472014925373, "grad_norm": 0.5140618458048315, "learning_rate": 6.593906972226238e-06, "loss": 0.2373, "step": 37970 }, { "epoch": 4.428055037313433, "grad_norm": 0.5607261448709014, "learning_rate": 6.590701161994828e-06, "loss": 0.2636, "step": 37975 }, { "epoch": 4.428638059701493, "grad_norm": 0.5190995583520444, "learning_rate": 6.587498460775811e-06, "loss": 0.2669, "step": 37980 }, { "epoch": 4.429221082089552, "grad_norm": 0.5015700867310398, "learning_rate": 6.5842988690454135e-06, "loss": 0.2624, "step": 37985 }, { "epoch": 4.429804104477612, "grad_norm": 0.5565323088619271, "learning_rate": 6.581102387279374e-06, "loss": 0.2737, "step": 37990 }, { "epoch": 4.430387126865671, "grad_norm": 0.5543711186726025, "learning_rate": 6.577909015952982e-06, "loss": 0.2716, "step": 37995 }, { "epoch": 4.4309701492537314, "grad_norm": 0.5544970341913353, "learning_rate": 6.574718755541061e-06, "loss": 0.2605, "step": 38000 }, { "epoch": 4.431553171641791, "grad_norm": 0.554823940207235, "learning_rate": 6.571531606517972e-06, "loss": 0.2634, "step": 38005 }, { "epoch": 4.432136194029851, "grad_norm": 0.5510808119909901, "learning_rate": 6.568347569357611e-06, "loss": 0.246, "step": 38010 }, { "epoch": 4.432719216417911, "grad_norm": 0.5381041837346422, "learning_rate": 6.565166644533424e-06, "loss": 0.2614, "step": 38015 }, { "epoch": 4.43330223880597, "grad_norm": 0.4887734043557818, "learning_rate": 6.561988832518367e-06, "loss": 0.2555, "step": 38020 }, { "epoch": 4.43388526119403, "grad_norm": 0.5137112665327495, "learning_rate": 6.558814133784966e-06, "loss": 0.272, "step": 38025 }, { "epoch": 4.434468283582089, "grad_norm": 0.5898989785214453, "learning_rate": 6.555642548805262e-06, "loss": 0.2818, "step": 38030 }, { "epoch": 4.435051305970149, "grad_norm": 0.5579589351581093, "learning_rate": 6.552474078050835e-06, "loss": 0.2596, "step": 38035 }, { "epoch": 4.435634328358209, "grad_norm": 0.5401232346234285, "learning_rate": 6.5493087219928114e-06, "loss": 0.2668, "step": 38040 }, { "epoch": 4.4362173507462686, "grad_norm": 0.49770926007981614, "learning_rate": 6.54614648110185e-06, "loss": 0.2446, "step": 38045 }, { "epoch": 4.436800373134329, "grad_norm": 0.49130703990350055, "learning_rate": 6.542987355848144e-06, "loss": 0.2533, "step": 38050 }, { "epoch": 4.437383395522388, "grad_norm": 0.5101533520889702, "learning_rate": 6.539831346701426e-06, "loss": 0.255, "step": 38055 }, { "epoch": 4.437966417910448, "grad_norm": 0.5144750523870821, "learning_rate": 6.536678454130965e-06, "loss": 0.2804, "step": 38060 }, { "epoch": 4.438549440298507, "grad_norm": 0.5594827325821637, "learning_rate": 6.53352867860556e-06, "loss": 0.2564, "step": 38065 }, { "epoch": 4.439132462686567, "grad_norm": 0.5343688389768886, "learning_rate": 6.530382020593559e-06, "loss": 0.2604, "step": 38070 }, { "epoch": 4.439715485074627, "grad_norm": 0.5539208089956309, "learning_rate": 6.527238480562838e-06, "loss": 0.2551, "step": 38075 }, { "epoch": 4.440298507462686, "grad_norm": 0.49755846486433397, "learning_rate": 6.52409805898081e-06, "loss": 0.2571, "step": 38080 }, { "epoch": 4.4408815298507465, "grad_norm": 0.5458675830435409, "learning_rate": 6.520960756314427e-06, "loss": 0.2634, "step": 38085 }, { "epoch": 4.441464552238806, "grad_norm": 0.5535817242089792, "learning_rate": 6.517826573030178e-06, "loss": 0.2733, "step": 38090 }, { "epoch": 4.442047574626866, "grad_norm": 0.5784905661614941, "learning_rate": 6.514695509594076e-06, "loss": 0.2654, "step": 38095 }, { "epoch": 4.442630597014926, "grad_norm": 0.5430423361040847, "learning_rate": 6.511567566471697e-06, "loss": 0.2588, "step": 38100 }, { "epoch": 4.443213619402985, "grad_norm": 0.5339731848505547, "learning_rate": 6.50844274412812e-06, "loss": 0.2606, "step": 38105 }, { "epoch": 4.443796641791045, "grad_norm": 0.5265808645716251, "learning_rate": 6.50532104302799e-06, "loss": 0.2495, "step": 38110 }, { "epoch": 4.444379664179104, "grad_norm": 0.5436833957811948, "learning_rate": 6.5022024636354605e-06, "loss": 0.2536, "step": 38115 }, { "epoch": 4.444962686567164, "grad_norm": 0.47978330218350496, "learning_rate": 6.499087006414245e-06, "loss": 0.2654, "step": 38120 }, { "epoch": 4.4455457089552235, "grad_norm": 0.5125491666211928, "learning_rate": 6.495974671827574e-06, "loss": 0.2461, "step": 38125 }, { "epoch": 4.446128731343284, "grad_norm": 0.536205379662219, "learning_rate": 6.492865460338228e-06, "loss": 0.2439, "step": 38130 }, { "epoch": 4.446711753731344, "grad_norm": 0.5100336962710121, "learning_rate": 6.489759372408514e-06, "loss": 0.2605, "step": 38135 }, { "epoch": 4.447294776119403, "grad_norm": 0.5500325924585305, "learning_rate": 6.4866564085002826e-06, "loss": 0.262, "step": 38140 }, { "epoch": 4.447877798507463, "grad_norm": 0.5329048459627673, "learning_rate": 6.483556569074904e-06, "loss": 0.2682, "step": 38145 }, { "epoch": 4.448460820895522, "grad_norm": 0.5663551121334356, "learning_rate": 6.480459854593305e-06, "loss": 0.2626, "step": 38150 }, { "epoch": 4.449043843283582, "grad_norm": 0.5232106945335262, "learning_rate": 6.477366265515931e-06, "loss": 0.2641, "step": 38155 }, { "epoch": 4.449626865671641, "grad_norm": 0.5389140609005297, "learning_rate": 6.474275802302776e-06, "loss": 0.2713, "step": 38160 }, { "epoch": 4.450209888059701, "grad_norm": 0.5200787546279895, "learning_rate": 6.471188465413355e-06, "loss": 0.2444, "step": 38165 }, { "epoch": 4.4507929104477615, "grad_norm": 0.5309435731380026, "learning_rate": 6.468104255306728e-06, "loss": 0.2638, "step": 38170 }, { "epoch": 4.451375932835821, "grad_norm": 0.49511464563093877, "learning_rate": 6.465023172441489e-06, "loss": 0.2556, "step": 38175 }, { "epoch": 4.451958955223881, "grad_norm": 0.5245319387742695, "learning_rate": 6.461945217275761e-06, "loss": 0.2643, "step": 38180 }, { "epoch": 4.45254197761194, "grad_norm": 0.48381562244638704, "learning_rate": 6.458870390267213e-06, "loss": 0.2367, "step": 38185 }, { "epoch": 4.453125, "grad_norm": 0.4945572418335977, "learning_rate": 6.455798691873042e-06, "loss": 0.2516, "step": 38190 }, { "epoch": 4.45370802238806, "grad_norm": 0.5119796477199425, "learning_rate": 6.45273012254997e-06, "loss": 0.2451, "step": 38195 }, { "epoch": 4.454291044776119, "grad_norm": 0.5380360042826006, "learning_rate": 6.449664682754278e-06, "loss": 0.272, "step": 38200 }, { "epoch": 4.454874067164179, "grad_norm": 0.5197699392470599, "learning_rate": 6.446602372941756e-06, "loss": 0.277, "step": 38205 }, { "epoch": 4.4554570895522385, "grad_norm": 0.5280041655845327, "learning_rate": 6.443543193567745e-06, "loss": 0.2538, "step": 38210 }, { "epoch": 4.456040111940299, "grad_norm": 0.5303693518474754, "learning_rate": 6.440487145087116e-06, "loss": 0.2748, "step": 38215 }, { "epoch": 4.456623134328359, "grad_norm": 0.5591970954489205, "learning_rate": 6.4374342279542726e-06, "loss": 0.2637, "step": 38220 }, { "epoch": 4.457206156716418, "grad_norm": 0.530588507584021, "learning_rate": 6.434384442623156e-06, "loss": 0.2499, "step": 38225 }, { "epoch": 4.457789179104478, "grad_norm": 0.5370039562897317, "learning_rate": 6.431337789547239e-06, "loss": 0.2626, "step": 38230 }, { "epoch": 4.458372201492537, "grad_norm": 0.5046364169965343, "learning_rate": 6.428294269179531e-06, "loss": 0.255, "step": 38235 }, { "epoch": 4.458955223880597, "grad_norm": 0.561326171816721, "learning_rate": 6.425253881972573e-06, "loss": 0.242, "step": 38240 }, { "epoch": 4.459538246268656, "grad_norm": 0.5725584683557882, "learning_rate": 6.4222166283784415e-06, "loss": 0.2591, "step": 38245 }, { "epoch": 4.460121268656716, "grad_norm": 0.5328326967447862, "learning_rate": 6.419182508848745e-06, "loss": 0.2544, "step": 38250 }, { "epoch": 4.4607042910447765, "grad_norm": 0.5443908173188287, "learning_rate": 6.416151523834632e-06, "loss": 0.2626, "step": 38255 }, { "epoch": 4.461287313432836, "grad_norm": 0.5190674201133977, "learning_rate": 6.4131236737867795e-06, "loss": 0.2438, "step": 38260 }, { "epoch": 4.461870335820896, "grad_norm": 0.5566166517489323, "learning_rate": 6.4100989591554026e-06, "loss": 0.2647, "step": 38265 }, { "epoch": 4.462453358208955, "grad_norm": 0.5444030570437454, "learning_rate": 6.407077380390236e-06, "loss": 0.2546, "step": 38270 }, { "epoch": 4.463036380597015, "grad_norm": 0.5236342301254023, "learning_rate": 6.4040589379405765e-06, "loss": 0.2515, "step": 38275 }, { "epoch": 4.463619402985074, "grad_norm": 0.5036287820632196, "learning_rate": 6.4010436322552204e-06, "loss": 0.2482, "step": 38280 }, { "epoch": 4.464202425373134, "grad_norm": 0.5342037509395847, "learning_rate": 6.39803146378253e-06, "loss": 0.2547, "step": 38285 }, { "epoch": 4.464785447761194, "grad_norm": 0.5877368789124545, "learning_rate": 6.395022432970375e-06, "loss": 0.2455, "step": 38290 }, { "epoch": 4.4653684701492535, "grad_norm": 0.5203001059160115, "learning_rate": 6.392016540266179e-06, "loss": 0.2627, "step": 38295 }, { "epoch": 4.465951492537314, "grad_norm": 0.5434466886587166, "learning_rate": 6.389013786116878e-06, "loss": 0.2708, "step": 38300 }, { "epoch": 4.466534514925373, "grad_norm": 0.5172868659600586, "learning_rate": 6.3860141709689615e-06, "loss": 0.259, "step": 38305 }, { "epoch": 4.467117537313433, "grad_norm": 0.5048021760692184, "learning_rate": 6.383017695268441e-06, "loss": 0.2486, "step": 38310 }, { "epoch": 4.467700559701493, "grad_norm": 0.5377071496318871, "learning_rate": 6.380024359460862e-06, "loss": 0.2563, "step": 38315 }, { "epoch": 4.468283582089552, "grad_norm": 0.49424572546955253, "learning_rate": 6.377034163991308e-06, "loss": 0.2696, "step": 38320 }, { "epoch": 4.468866604477612, "grad_norm": 0.5026516637006107, "learning_rate": 6.3740471093043914e-06, "loss": 0.2521, "step": 38325 }, { "epoch": 4.469449626865671, "grad_norm": 0.5236120140353857, "learning_rate": 6.3710631958442524e-06, "loss": 0.2438, "step": 38330 }, { "epoch": 4.4700326492537314, "grad_norm": 0.5368892754182085, "learning_rate": 6.3680824240545835e-06, "loss": 0.2617, "step": 38335 }, { "epoch": 4.470615671641791, "grad_norm": 0.5631534410705222, "learning_rate": 6.365104794378582e-06, "loss": 0.257, "step": 38340 }, { "epoch": 4.471198694029851, "grad_norm": 0.4969788859972526, "learning_rate": 6.362130307259008e-06, "loss": 0.2465, "step": 38345 }, { "epoch": 4.471781716417911, "grad_norm": 0.5802544230889424, "learning_rate": 6.3591589631381286e-06, "loss": 0.2641, "step": 38350 }, { "epoch": 4.47236473880597, "grad_norm": 0.5536864995144586, "learning_rate": 6.356190762457753e-06, "loss": 0.2657, "step": 38355 }, { "epoch": 4.47294776119403, "grad_norm": 0.5275125308796202, "learning_rate": 6.353225705659234e-06, "loss": 0.2542, "step": 38360 }, { "epoch": 4.473530783582089, "grad_norm": 0.5219923673534463, "learning_rate": 6.350263793183439e-06, "loss": 0.2687, "step": 38365 }, { "epoch": 4.474113805970149, "grad_norm": 0.5082496370709878, "learning_rate": 6.347305025470776e-06, "loss": 0.2518, "step": 38370 }, { "epoch": 4.474696828358209, "grad_norm": 0.5135128368336545, "learning_rate": 6.344349402961194e-06, "loss": 0.2694, "step": 38375 }, { "epoch": 4.4752798507462686, "grad_norm": 0.5466314389638472, "learning_rate": 6.341396926094155e-06, "loss": 0.2733, "step": 38380 }, { "epoch": 4.475862873134329, "grad_norm": 0.5845324971003246, "learning_rate": 6.338447595308671e-06, "loss": 0.2811, "step": 38385 }, { "epoch": 4.476445895522388, "grad_norm": 0.5562813248321179, "learning_rate": 6.335501411043274e-06, "loss": 0.2784, "step": 38390 }, { "epoch": 4.477028917910448, "grad_norm": 0.5183979516181434, "learning_rate": 6.3325583737360376e-06, "loss": 0.2606, "step": 38395 }, { "epoch": 4.477611940298507, "grad_norm": 0.5237406722626377, "learning_rate": 6.329618483824559e-06, "loss": 0.2719, "step": 38400 }, { "epoch": 4.478194962686567, "grad_norm": 0.4957426119917759, "learning_rate": 6.326681741745978e-06, "loss": 0.2471, "step": 38405 }, { "epoch": 4.478777985074627, "grad_norm": 0.5375130796660804, "learning_rate": 6.323748147936959e-06, "loss": 0.2895, "step": 38410 }, { "epoch": 4.479361007462686, "grad_norm": 0.5542027303220478, "learning_rate": 6.320817702833689e-06, "loss": 0.2648, "step": 38415 }, { "epoch": 4.4799440298507465, "grad_norm": 0.5824272515172335, "learning_rate": 6.317890406871914e-06, "loss": 0.273, "step": 38420 }, { "epoch": 4.480527052238806, "grad_norm": 0.5603501358301488, "learning_rate": 6.314966260486882e-06, "loss": 0.2716, "step": 38425 }, { "epoch": 4.481110074626866, "grad_norm": 0.48460259345144363, "learning_rate": 6.312045264113388e-06, "loss": 0.2579, "step": 38430 }, { "epoch": 4.481693097014926, "grad_norm": 0.5626337980726406, "learning_rate": 6.309127418185755e-06, "loss": 0.267, "step": 38435 }, { "epoch": 4.482276119402985, "grad_norm": 0.5715045452812639, "learning_rate": 6.306212723137846e-06, "loss": 0.2565, "step": 38440 }, { "epoch": 4.482859141791045, "grad_norm": 0.5188720938242106, "learning_rate": 6.3033011794030416e-06, "loss": 0.2516, "step": 38445 }, { "epoch": 4.483442164179104, "grad_norm": 0.5138320356119981, "learning_rate": 6.300392787414265e-06, "loss": 0.2754, "step": 38450 }, { "epoch": 4.484025186567164, "grad_norm": 0.5215779000371581, "learning_rate": 6.297487547603958e-06, "loss": 0.2622, "step": 38455 }, { "epoch": 4.4846082089552235, "grad_norm": 0.5231899814597589, "learning_rate": 6.2945854604041135e-06, "loss": 0.2567, "step": 38460 }, { "epoch": 4.485191231343284, "grad_norm": 0.5313579085351516, "learning_rate": 6.291686526246232e-06, "loss": 0.2723, "step": 38465 }, { "epoch": 4.485774253731344, "grad_norm": 0.4949843565279442, "learning_rate": 6.28879074556137e-06, "loss": 0.2521, "step": 38470 }, { "epoch": 4.486357276119403, "grad_norm": 0.5064034473611037, "learning_rate": 6.2858981187800915e-06, "loss": 0.2578, "step": 38475 }, { "epoch": 4.486940298507463, "grad_norm": 0.4877199417010325, "learning_rate": 6.283008646332507e-06, "loss": 0.2597, "step": 38480 }, { "epoch": 4.487523320895522, "grad_norm": 0.5049309841260072, "learning_rate": 6.280122328648254e-06, "loss": 0.241, "step": 38485 }, { "epoch": 4.488106343283582, "grad_norm": 0.5282900932518092, "learning_rate": 6.277239166156497e-06, "loss": 0.2613, "step": 38490 }, { "epoch": 4.488689365671641, "grad_norm": 0.5384873757949727, "learning_rate": 6.274359159285937e-06, "loss": 0.2677, "step": 38495 }, { "epoch": 4.489272388059701, "grad_norm": 0.5147663737460121, "learning_rate": 6.271482308464807e-06, "loss": 0.2569, "step": 38500 }, { "epoch": 4.4898554104477615, "grad_norm": 0.4827779590524776, "learning_rate": 6.268608614120858e-06, "loss": 0.259, "step": 38505 }, { "epoch": 4.490438432835821, "grad_norm": 0.5095805849321287, "learning_rate": 6.265738076681392e-06, "loss": 0.2493, "step": 38510 }, { "epoch": 4.491021455223881, "grad_norm": 0.6266904725920435, "learning_rate": 6.262870696573219e-06, "loss": 0.2782, "step": 38515 }, { "epoch": 4.49160447761194, "grad_norm": 0.5694132285290291, "learning_rate": 6.2600064742227e-06, "loss": 0.2754, "step": 38520 }, { "epoch": 4.4921875, "grad_norm": 0.5567702063766407, "learning_rate": 6.2571454100557136e-06, "loss": 0.2632, "step": 38525 }, { "epoch": 4.49277052238806, "grad_norm": 0.5085069735159217, "learning_rate": 6.254287504497672e-06, "loss": 0.272, "step": 38530 }, { "epoch": 4.493353544776119, "grad_norm": 0.5060260712801773, "learning_rate": 6.251432757973519e-06, "loss": 0.2522, "step": 38535 }, { "epoch": 4.493936567164179, "grad_norm": 0.5539886671173206, "learning_rate": 6.248581170907729e-06, "loss": 0.2592, "step": 38540 }, { "epoch": 4.4945195895522385, "grad_norm": 0.5150664889160734, "learning_rate": 6.245732743724305e-06, "loss": 0.2703, "step": 38545 }, { "epoch": 4.495102611940299, "grad_norm": 0.531805818535227, "learning_rate": 6.242887476846785e-06, "loss": 0.2457, "step": 38550 }, { "epoch": 4.495685634328359, "grad_norm": 0.5292251713955413, "learning_rate": 6.2400453706982216e-06, "loss": 0.2606, "step": 38555 }, { "epoch": 4.496268656716418, "grad_norm": 0.6074805083067017, "learning_rate": 6.237206425701223e-06, "loss": 0.2844, "step": 38560 }, { "epoch": 4.496851679104478, "grad_norm": 0.537703527035354, "learning_rate": 6.234370642277903e-06, "loss": 0.2656, "step": 38565 }, { "epoch": 4.497434701492537, "grad_norm": 0.5654128975770102, "learning_rate": 6.231538020849919e-06, "loss": 0.2699, "step": 38570 }, { "epoch": 4.498017723880597, "grad_norm": 0.5471811973711892, "learning_rate": 6.228708561838452e-06, "loss": 0.276, "step": 38575 }, { "epoch": 4.498600746268656, "grad_norm": 0.5438608880087065, "learning_rate": 6.225882265664218e-06, "loss": 0.2602, "step": 38580 }, { "epoch": 4.499183768656716, "grad_norm": 0.5218914053297654, "learning_rate": 6.223059132747463e-06, "loss": 0.2618, "step": 38585 }, { "epoch": 4.4997667910447765, "grad_norm": 0.5101625333495837, "learning_rate": 6.220239163507955e-06, "loss": 0.2639, "step": 38590 }, { "epoch": 4.500349813432836, "grad_norm": 0.5687057578995803, "learning_rate": 6.217422358364999e-06, "loss": 0.2683, "step": 38595 }, { "epoch": 4.500932835820896, "grad_norm": 0.5443420946317586, "learning_rate": 6.214608717737426e-06, "loss": 0.2725, "step": 38600 }, { "epoch": 4.501515858208955, "grad_norm": 0.5070065451751753, "learning_rate": 6.211798242043596e-06, "loss": 0.2487, "step": 38605 }, { "epoch": 4.502098880597015, "grad_norm": 0.50271908357633, "learning_rate": 6.2089909317014e-06, "loss": 0.259, "step": 38610 }, { "epoch": 4.502681902985074, "grad_norm": 0.5320041945977301, "learning_rate": 6.206186787128262e-06, "loss": 0.26, "step": 38615 }, { "epoch": 4.503264925373134, "grad_norm": 0.5167408209298835, "learning_rate": 6.2033858087411275e-06, "loss": 0.2349, "step": 38620 }, { "epoch": 4.503847947761194, "grad_norm": 0.5417274684958774, "learning_rate": 6.200587996956478e-06, "loss": 0.2674, "step": 38625 }, { "epoch": 4.5044309701492535, "grad_norm": 0.5170128925942264, "learning_rate": 6.197793352190316e-06, "loss": 0.2652, "step": 38630 }, { "epoch": 4.505013992537314, "grad_norm": 0.5346000401229939, "learning_rate": 6.1950018748581865e-06, "loss": 0.2597, "step": 38635 }, { "epoch": 4.505597014925373, "grad_norm": 0.5317872923214294, "learning_rate": 6.192213565375147e-06, "loss": 0.2641, "step": 38640 }, { "epoch": 4.506180037313433, "grad_norm": 0.5248799865586731, "learning_rate": 6.189428424155801e-06, "loss": 0.2574, "step": 38645 }, { "epoch": 4.506763059701493, "grad_norm": 0.5159466862846975, "learning_rate": 6.186646451614265e-06, "loss": 0.256, "step": 38650 }, { "epoch": 4.507346082089552, "grad_norm": 0.5355305657805994, "learning_rate": 6.1838676481641945e-06, "loss": 0.26, "step": 38655 }, { "epoch": 4.507929104477612, "grad_norm": 0.5311840720051096, "learning_rate": 6.1810920142187726e-06, "loss": 0.2661, "step": 38660 }, { "epoch": 4.508512126865671, "grad_norm": 0.5523166105701923, "learning_rate": 6.178319550190705e-06, "loss": 0.2805, "step": 38665 }, { "epoch": 4.5090951492537314, "grad_norm": 0.5460956837272631, "learning_rate": 6.175550256492235e-06, "loss": 0.2661, "step": 38670 }, { "epoch": 4.5096781716417915, "grad_norm": 0.5221584225934912, "learning_rate": 6.172784133535133e-06, "loss": 0.2671, "step": 38675 }, { "epoch": 4.510261194029851, "grad_norm": 0.5289324027768694, "learning_rate": 6.170021181730681e-06, "loss": 0.2608, "step": 38680 }, { "epoch": 4.510844216417911, "grad_norm": 0.5025094205111148, "learning_rate": 6.167261401489721e-06, "loss": 0.2634, "step": 38685 }, { "epoch": 4.51142723880597, "grad_norm": 0.5486400780433371, "learning_rate": 6.164504793222589e-06, "loss": 0.2594, "step": 38690 }, { "epoch": 4.51201026119403, "grad_norm": 0.5323475647368213, "learning_rate": 6.161751357339184e-06, "loss": 0.2835, "step": 38695 }, { "epoch": 4.512593283582089, "grad_norm": 0.5173297707671094, "learning_rate": 6.159001094248904e-06, "loss": 0.2598, "step": 38700 }, { "epoch": 4.513176305970149, "grad_norm": 0.514495731933102, "learning_rate": 6.156254004360687e-06, "loss": 0.2523, "step": 38705 }, { "epoch": 4.5137593283582085, "grad_norm": 0.49633841240966786, "learning_rate": 6.153510088083e-06, "loss": 0.2605, "step": 38710 }, { "epoch": 4.5143423507462686, "grad_norm": 0.5445902745091213, "learning_rate": 6.150769345823841e-06, "loss": 0.2502, "step": 38715 }, { "epoch": 4.514925373134329, "grad_norm": 0.5397016488949635, "learning_rate": 6.1480317779907285e-06, "loss": 0.2627, "step": 38720 }, { "epoch": 4.515508395522388, "grad_norm": 0.552039542472079, "learning_rate": 6.145297384990715e-06, "loss": 0.2437, "step": 38725 }, { "epoch": 4.516091417910448, "grad_norm": 0.5078529363100549, "learning_rate": 6.1425661672303735e-06, "loss": 0.2622, "step": 38730 }, { "epoch": 4.516674440298507, "grad_norm": 0.5381155791170246, "learning_rate": 6.139838125115818e-06, "loss": 0.2522, "step": 38735 }, { "epoch": 4.517257462686567, "grad_norm": 0.5464931315692569, "learning_rate": 6.1371132590526744e-06, "loss": 0.2599, "step": 38740 }, { "epoch": 4.517840485074627, "grad_norm": 0.49781898414356895, "learning_rate": 6.134391569446108e-06, "loss": 0.2629, "step": 38745 }, { "epoch": 4.518423507462686, "grad_norm": 0.5566657570479573, "learning_rate": 6.1316730567008086e-06, "loss": 0.2667, "step": 38750 }, { "epoch": 4.5190065298507465, "grad_norm": 0.5569678812886943, "learning_rate": 6.12895772122099e-06, "loss": 0.2726, "step": 38755 }, { "epoch": 4.519589552238806, "grad_norm": 0.4969833583679429, "learning_rate": 6.126245563410399e-06, "loss": 0.2599, "step": 38760 }, { "epoch": 4.520172574626866, "grad_norm": 0.4764080174611328, "learning_rate": 6.1235365836723054e-06, "loss": 0.2675, "step": 38765 }, { "epoch": 4.520755597014926, "grad_norm": 0.5149988727702287, "learning_rate": 6.120830782409515e-06, "loss": 0.257, "step": 38770 }, { "epoch": 4.521338619402985, "grad_norm": 0.5156047545093498, "learning_rate": 6.118128160024346e-06, "loss": 0.2623, "step": 38775 }, { "epoch": 4.521921641791045, "grad_norm": 0.543937669874722, "learning_rate": 6.115428716918657e-06, "loss": 0.2604, "step": 38780 }, { "epoch": 4.522504664179104, "grad_norm": 0.5511882106781363, "learning_rate": 6.112732453493826e-06, "loss": 0.2576, "step": 38785 }, { "epoch": 4.523087686567164, "grad_norm": 0.5425246178117245, "learning_rate": 6.110039370150765e-06, "loss": 0.2508, "step": 38790 }, { "epoch": 4.5236707089552235, "grad_norm": 0.5376242323411334, "learning_rate": 6.107349467289907e-06, "loss": 0.2485, "step": 38795 }, { "epoch": 4.524253731343284, "grad_norm": 0.5620467692409846, "learning_rate": 6.104662745311222e-06, "loss": 0.2771, "step": 38800 }, { "epoch": 4.524836753731344, "grad_norm": 0.5267031593834537, "learning_rate": 6.1019792046141875e-06, "loss": 0.2734, "step": 38805 }, { "epoch": 4.525419776119403, "grad_norm": 0.5384371673164725, "learning_rate": 6.099298845597832e-06, "loss": 0.2593, "step": 38810 }, { "epoch": 4.526002798507463, "grad_norm": 0.5125346701370309, "learning_rate": 6.096621668660686e-06, "loss": 0.267, "step": 38815 }, { "epoch": 4.526585820895522, "grad_norm": 0.524323265736002, "learning_rate": 6.093947674200838e-06, "loss": 0.2678, "step": 38820 }, { "epoch": 4.527168843283582, "grad_norm": 0.5278763593059844, "learning_rate": 6.09127686261587e-06, "loss": 0.246, "step": 38825 }, { "epoch": 4.527751865671641, "grad_norm": 0.5702414796266169, "learning_rate": 6.088609234302912e-06, "loss": 0.2645, "step": 38830 }, { "epoch": 4.528334888059701, "grad_norm": 0.5289656081427371, "learning_rate": 6.085944789658615e-06, "loss": 0.2526, "step": 38835 }, { "epoch": 4.5289179104477615, "grad_norm": 0.5157784900885519, "learning_rate": 6.083283529079157e-06, "loss": 0.26, "step": 38840 }, { "epoch": 4.529500932835821, "grad_norm": 0.5782867595801789, "learning_rate": 6.08062545296024e-06, "loss": 0.2778, "step": 38845 }, { "epoch": 4.530083955223881, "grad_norm": 0.47423350452700835, "learning_rate": 6.077970561697095e-06, "loss": 0.2687, "step": 38850 }, { "epoch": 4.53066697761194, "grad_norm": 0.5199150777035527, "learning_rate": 6.075318855684477e-06, "loss": 0.2562, "step": 38855 }, { "epoch": 4.53125, "grad_norm": 0.5399124222227776, "learning_rate": 6.072670335316676e-06, "loss": 0.2578, "step": 38860 }, { "epoch": 4.53183302238806, "grad_norm": 0.5150499155954146, "learning_rate": 6.070025000987492e-06, "loss": 0.2606, "step": 38865 }, { "epoch": 4.532416044776119, "grad_norm": 0.5004514576096859, "learning_rate": 6.067382853090269e-06, "loss": 0.2607, "step": 38870 }, { "epoch": 4.532999067164179, "grad_norm": 0.5304019536781089, "learning_rate": 6.064743892017864e-06, "loss": 0.2638, "step": 38875 }, { "epoch": 4.5335820895522385, "grad_norm": 0.5084621632745274, "learning_rate": 6.062108118162669e-06, "loss": 0.2485, "step": 38880 }, { "epoch": 4.534165111940299, "grad_norm": 0.5108568675959437, "learning_rate": 6.059475531916595e-06, "loss": 0.2452, "step": 38885 }, { "epoch": 4.534748134328359, "grad_norm": 0.6152099512145118, "learning_rate": 6.056846133671083e-06, "loss": 0.2623, "step": 38890 }, { "epoch": 4.535331156716418, "grad_norm": 0.4994264834604792, "learning_rate": 6.054219923817101e-06, "loss": 0.2628, "step": 38895 }, { "epoch": 4.535914179104478, "grad_norm": 0.5113986800087381, "learning_rate": 6.051596902745143e-06, "loss": 0.2576, "step": 38900 }, { "epoch": 4.536497201492537, "grad_norm": 0.5001538741025344, "learning_rate": 6.048977070845219e-06, "loss": 0.2488, "step": 38905 }, { "epoch": 4.537080223880597, "grad_norm": 0.47927372428680814, "learning_rate": 6.0463604285068834e-06, "loss": 0.2513, "step": 38910 }, { "epoch": 4.537663246268656, "grad_norm": 0.529013659568405, "learning_rate": 6.043746976119201e-06, "loss": 0.2773, "step": 38915 }, { "epoch": 4.538246268656716, "grad_norm": 0.4906537140004885, "learning_rate": 6.0411367140707625e-06, "loss": 0.2416, "step": 38920 }, { "epoch": 4.5388292910447765, "grad_norm": 0.5059263225489081, "learning_rate": 6.038529642749697e-06, "loss": 0.2428, "step": 38925 }, { "epoch": 4.539412313432836, "grad_norm": 0.523676580280404, "learning_rate": 6.035925762543644e-06, "loss": 0.2595, "step": 38930 }, { "epoch": 4.539995335820896, "grad_norm": 0.4983571917110067, "learning_rate": 6.03332507383978e-06, "loss": 0.2505, "step": 38935 }, { "epoch": 4.540578358208955, "grad_norm": 0.5118483537725239, "learning_rate": 6.030727577024802e-06, "loss": 0.2593, "step": 38940 }, { "epoch": 4.541161380597015, "grad_norm": 0.546423329616185, "learning_rate": 6.028133272484936e-06, "loss": 0.2727, "step": 38945 }, { "epoch": 4.541744402985074, "grad_norm": 0.4903725648707753, "learning_rate": 6.025542160605923e-06, "loss": 0.2436, "step": 38950 }, { "epoch": 4.542327425373134, "grad_norm": 0.5455420134323096, "learning_rate": 6.022954241773038e-06, "loss": 0.2761, "step": 38955 }, { "epoch": 4.542910447761194, "grad_norm": 0.4585826547870705, "learning_rate": 6.020369516371085e-06, "loss": 0.2668, "step": 38960 }, { "epoch": 4.5434934701492535, "grad_norm": 0.5265273982628189, "learning_rate": 6.017787984784381e-06, "loss": 0.2746, "step": 38965 }, { "epoch": 4.544076492537314, "grad_norm": 0.5020942041450563, "learning_rate": 6.015209647396781e-06, "loss": 0.27, "step": 38970 }, { "epoch": 4.544659514925373, "grad_norm": 0.5044068757198353, "learning_rate": 6.012634504591658e-06, "loss": 0.2688, "step": 38975 }, { "epoch": 4.545242537313433, "grad_norm": 0.5721697002713411, "learning_rate": 6.010062556751906e-06, "loss": 0.262, "step": 38980 }, { "epoch": 4.545825559701493, "grad_norm": 0.4880644698432569, "learning_rate": 6.0074938042599574e-06, "loss": 0.2422, "step": 38985 }, { "epoch": 4.546408582089552, "grad_norm": 0.5263997706877247, "learning_rate": 6.00492824749775e-06, "loss": 0.271, "step": 38990 }, { "epoch": 4.546991604477612, "grad_norm": 0.5517625766551889, "learning_rate": 6.00236588684677e-06, "loss": 0.2658, "step": 38995 }, { "epoch": 4.547574626865671, "grad_norm": 0.49354683745418637, "learning_rate": 5.999806722688007e-06, "loss": 0.2471, "step": 39000 }, { "epoch": 4.5481576492537314, "grad_norm": 0.5178997733040462, "learning_rate": 5.9972507554019895e-06, "loss": 0.2605, "step": 39005 }, { "epoch": 4.5487406716417915, "grad_norm": 0.5001909529200346, "learning_rate": 5.994697985368761e-06, "loss": 0.2638, "step": 39010 }, { "epoch": 4.549323694029851, "grad_norm": 0.5364586643874559, "learning_rate": 5.992148412967895e-06, "loss": 0.2609, "step": 39015 }, { "epoch": 4.549906716417911, "grad_norm": 0.5668501260028482, "learning_rate": 5.98960203857849e-06, "loss": 0.2613, "step": 39020 }, { "epoch": 4.55048973880597, "grad_norm": 1.7483447793114588, "learning_rate": 5.987058862579167e-06, "loss": 0.2594, "step": 39025 }, { "epoch": 4.55107276119403, "grad_norm": 0.537973349519233, "learning_rate": 5.98451888534807e-06, "loss": 0.2804, "step": 39030 }, { "epoch": 4.551655783582089, "grad_norm": 0.5651328937551121, "learning_rate": 5.981982107262877e-06, "loss": 0.2657, "step": 39035 }, { "epoch": 4.552238805970149, "grad_norm": 0.5165974934499695, "learning_rate": 5.9794485287007696e-06, "loss": 0.2512, "step": 39040 }, { "epoch": 4.5528218283582085, "grad_norm": 0.5823602307496405, "learning_rate": 5.976918150038478e-06, "loss": 0.2584, "step": 39045 }, { "epoch": 4.5534048507462686, "grad_norm": 0.628072248282222, "learning_rate": 5.974390971652237e-06, "loss": 0.2774, "step": 39050 }, { "epoch": 4.553987873134329, "grad_norm": 0.5251264104114984, "learning_rate": 5.971866993917821e-06, "loss": 0.2654, "step": 39055 }, { "epoch": 4.554570895522388, "grad_norm": 0.5521170164983134, "learning_rate": 5.9693462172105165e-06, "loss": 0.2692, "step": 39060 }, { "epoch": 4.555153917910448, "grad_norm": 0.4998549724119816, "learning_rate": 5.966828641905142e-06, "loss": 0.2438, "step": 39065 }, { "epoch": 4.555736940298507, "grad_norm": 0.4897087808573781, "learning_rate": 5.964314268376031e-06, "loss": 0.2653, "step": 39070 }, { "epoch": 4.556319962686567, "grad_norm": 0.5317645488175172, "learning_rate": 5.961803096997056e-06, "loss": 0.2691, "step": 39075 }, { "epoch": 4.556902985074627, "grad_norm": 0.5072629913145804, "learning_rate": 5.959295128141596e-06, "loss": 0.2555, "step": 39080 }, { "epoch": 4.557486007462686, "grad_norm": 0.5194778596385488, "learning_rate": 5.956790362182567e-06, "loss": 0.2648, "step": 39085 }, { "epoch": 4.5580690298507465, "grad_norm": 0.4896791599803869, "learning_rate": 5.9542887994923985e-06, "loss": 0.2449, "step": 39090 }, { "epoch": 4.558652052238806, "grad_norm": 0.518112465898371, "learning_rate": 5.951790440443055e-06, "loss": 0.2812, "step": 39095 }, { "epoch": 4.559235074626866, "grad_norm": 0.5467229113899952, "learning_rate": 5.949295285406015e-06, "loss": 0.2664, "step": 39100 }, { "epoch": 4.559818097014926, "grad_norm": 0.5920242852575323, "learning_rate": 5.946803334752285e-06, "loss": 0.2805, "step": 39105 }, { "epoch": 4.560401119402985, "grad_norm": 0.5489787113365736, "learning_rate": 5.944314588852393e-06, "loss": 0.261, "step": 39110 }, { "epoch": 4.560984141791045, "grad_norm": 0.5148234612456437, "learning_rate": 5.941829048076392e-06, "loss": 0.2573, "step": 39115 }, { "epoch": 4.561567164179104, "grad_norm": 0.5057751472170834, "learning_rate": 5.93934671279386e-06, "loss": 0.258, "step": 39120 }, { "epoch": 4.562150186567164, "grad_norm": 0.5035074594308664, "learning_rate": 5.936867583373895e-06, "loss": 0.263, "step": 39125 }, { "epoch": 4.5627332089552235, "grad_norm": 0.5165259452147263, "learning_rate": 5.934391660185121e-06, "loss": 0.2554, "step": 39130 }, { "epoch": 4.563316231343284, "grad_norm": 0.5311455792598649, "learning_rate": 5.931918943595682e-06, "loss": 0.2353, "step": 39135 }, { "epoch": 4.563899253731344, "grad_norm": 0.5718147441246065, "learning_rate": 5.929449433973249e-06, "loss": 0.2623, "step": 39140 }, { "epoch": 4.564482276119403, "grad_norm": 0.5146604972963873, "learning_rate": 5.926983131685012e-06, "loss": 0.2526, "step": 39145 }, { "epoch": 4.565065298507463, "grad_norm": 0.5789918241569233, "learning_rate": 5.924520037097688e-06, "loss": 0.2751, "step": 39150 }, { "epoch": 4.565648320895522, "grad_norm": 0.5570935922664527, "learning_rate": 5.922060150577517e-06, "loss": 0.2723, "step": 39155 }, { "epoch": 4.566231343283582, "grad_norm": 0.48832190251301205, "learning_rate": 5.919603472490263e-06, "loss": 0.2858, "step": 39160 }, { "epoch": 4.566814365671641, "grad_norm": 0.5002046914943303, "learning_rate": 5.917150003201201e-06, "loss": 0.2345, "step": 39165 }, { "epoch": 4.567397388059701, "grad_norm": 0.5286197129348891, "learning_rate": 5.914699743075149e-06, "loss": 0.2592, "step": 39170 }, { "epoch": 4.5679804104477615, "grad_norm": 0.5092297450544302, "learning_rate": 5.9122526924764264e-06, "loss": 0.2483, "step": 39175 }, { "epoch": 4.568563432835821, "grad_norm": 0.5739015669452459, "learning_rate": 5.909808851768898e-06, "loss": 0.269, "step": 39180 }, { "epoch": 4.569146455223881, "grad_norm": 0.5731974873144853, "learning_rate": 5.9073682213159325e-06, "loss": 0.2503, "step": 39185 }, { "epoch": 4.56972947761194, "grad_norm": 0.5813716778069474, "learning_rate": 5.904930801480427e-06, "loss": 0.2734, "step": 39190 }, { "epoch": 4.5703125, "grad_norm": 0.5598980422996925, "learning_rate": 5.902496592624808e-06, "loss": 0.2659, "step": 39195 }, { "epoch": 4.57089552238806, "grad_norm": 0.5153256785923885, "learning_rate": 5.900065595111014e-06, "loss": 0.2523, "step": 39200 }, { "epoch": 4.571478544776119, "grad_norm": 0.5204926567812194, "learning_rate": 5.897637809300514e-06, "loss": 0.2657, "step": 39205 }, { "epoch": 4.572061567164179, "grad_norm": 0.5635535313547579, "learning_rate": 5.895213235554298e-06, "loss": 0.2768, "step": 39210 }, { "epoch": 4.5726445895522385, "grad_norm": 0.5049764843459283, "learning_rate": 5.892791874232868e-06, "loss": 0.2458, "step": 39215 }, { "epoch": 4.573227611940299, "grad_norm": 0.5021265636482644, "learning_rate": 5.890373725696271e-06, "loss": 0.2525, "step": 39220 }, { "epoch": 4.573810634328359, "grad_norm": 0.5211571852581737, "learning_rate": 5.887958790304047e-06, "loss": 0.2618, "step": 39225 }, { "epoch": 4.574393656716418, "grad_norm": 0.5501783296978301, "learning_rate": 5.885547068415289e-06, "loss": 0.2711, "step": 39230 }, { "epoch": 4.574976679104478, "grad_norm": 0.47179845819744737, "learning_rate": 5.883138560388587e-06, "loss": 0.2447, "step": 39235 }, { "epoch": 4.575559701492537, "grad_norm": 0.5209049140406725, "learning_rate": 5.880733266582066e-06, "loss": 0.2702, "step": 39240 }, { "epoch": 4.576142723880597, "grad_norm": 0.5230158490968697, "learning_rate": 5.878331187353371e-06, "loss": 0.2569, "step": 39245 }, { "epoch": 4.576725746268656, "grad_norm": 0.5148379583291693, "learning_rate": 5.875932323059667e-06, "loss": 0.2649, "step": 39250 }, { "epoch": 4.577308768656716, "grad_norm": 0.5432816101988381, "learning_rate": 5.87353667405764e-06, "loss": 0.2542, "step": 39255 }, { "epoch": 4.5778917910447765, "grad_norm": 0.5310160749752704, "learning_rate": 5.871144240703507e-06, "loss": 0.2482, "step": 39260 }, { "epoch": 4.578474813432836, "grad_norm": 0.5192906911805769, "learning_rate": 5.86875502335299e-06, "loss": 0.2531, "step": 39265 }, { "epoch": 4.579057835820896, "grad_norm": 0.5434345680182487, "learning_rate": 5.866369022361354e-06, "loss": 0.2462, "step": 39270 }, { "epoch": 4.579640858208955, "grad_norm": 0.47507817563686167, "learning_rate": 5.863986238083367e-06, "loss": 0.2735, "step": 39275 }, { "epoch": 4.580223880597015, "grad_norm": 0.5349078706462872, "learning_rate": 5.8616066708733255e-06, "loss": 0.2523, "step": 39280 }, { "epoch": 4.580806902985074, "grad_norm": 0.47091628432142624, "learning_rate": 5.859230321085049e-06, "loss": 0.2529, "step": 39285 }, { "epoch": 4.581389925373134, "grad_norm": 0.528743361178868, "learning_rate": 5.856857189071884e-06, "loss": 0.2704, "step": 39290 }, { "epoch": 4.581972947761194, "grad_norm": 0.4990121507064239, "learning_rate": 5.8544872751866845e-06, "loss": 0.2472, "step": 39295 }, { "epoch": 4.5825559701492535, "grad_norm": 0.5645313197773756, "learning_rate": 5.852120579781838e-06, "loss": 0.2839, "step": 39300 }, { "epoch": 4.583138992537314, "grad_norm": 0.5226436084754454, "learning_rate": 5.849757103209252e-06, "loss": 0.2734, "step": 39305 }, { "epoch": 4.583722014925373, "grad_norm": 0.5463664391906513, "learning_rate": 5.847396845820349e-06, "loss": 0.2855, "step": 39310 }, { "epoch": 4.584305037313433, "grad_norm": 0.48099862258521753, "learning_rate": 5.845039807966074e-06, "loss": 0.253, "step": 39315 }, { "epoch": 4.584888059701493, "grad_norm": 0.5409449082799026, "learning_rate": 5.8426859899969034e-06, "loss": 0.2741, "step": 39320 }, { "epoch": 4.585471082089552, "grad_norm": 0.5182304216740492, "learning_rate": 5.84033539226282e-06, "loss": 0.2624, "step": 39325 }, { "epoch": 4.586054104477612, "grad_norm": 0.5048264798103715, "learning_rate": 5.83798801511334e-06, "loss": 0.2601, "step": 39330 }, { "epoch": 4.586637126865671, "grad_norm": 0.5511959412336976, "learning_rate": 5.835643858897498e-06, "loss": 0.263, "step": 39335 }, { "epoch": 4.5872201492537314, "grad_norm": 0.5006116307130094, "learning_rate": 5.833302923963837e-06, "loss": 0.2699, "step": 39340 }, { "epoch": 4.5878031716417915, "grad_norm": 0.5539465121332253, "learning_rate": 5.830965210660445e-06, "loss": 0.2727, "step": 39345 }, { "epoch": 4.588386194029851, "grad_norm": 0.527718508349746, "learning_rate": 5.828630719334905e-06, "loss": 0.2554, "step": 39350 }, { "epoch": 4.588969216417911, "grad_norm": 0.5229234931270784, "learning_rate": 5.826299450334345e-06, "loss": 0.2783, "step": 39355 }, { "epoch": 4.58955223880597, "grad_norm": 0.5365676057638463, "learning_rate": 5.8239714040053936e-06, "loss": 0.2715, "step": 39360 }, { "epoch": 4.59013526119403, "grad_norm": 0.5171971752161777, "learning_rate": 5.821646580694214e-06, "loss": 0.2675, "step": 39365 }, { "epoch": 4.590718283582089, "grad_norm": 0.48569033045468135, "learning_rate": 5.819324980746483e-06, "loss": 0.2552, "step": 39370 }, { "epoch": 4.591301305970149, "grad_norm": 0.5151000453096796, "learning_rate": 5.817006604507401e-06, "loss": 0.2567, "step": 39375 }, { "epoch": 4.5918843283582085, "grad_norm": 0.49170361831576925, "learning_rate": 5.814691452321687e-06, "loss": 0.244, "step": 39380 }, { "epoch": 4.5924673507462686, "grad_norm": 0.5706880060010474, "learning_rate": 5.812379524533587e-06, "loss": 0.2733, "step": 39385 }, { "epoch": 4.593050373134329, "grad_norm": 0.5273620636702008, "learning_rate": 5.810070821486854e-06, "loss": 0.2607, "step": 39390 }, { "epoch": 4.593633395522388, "grad_norm": 0.5088253491924825, "learning_rate": 5.8077653435247774e-06, "loss": 0.2708, "step": 39395 }, { "epoch": 4.594216417910448, "grad_norm": 0.5192738221976703, "learning_rate": 5.805463090990154e-06, "loss": 0.2505, "step": 39400 }, { "epoch": 4.594799440298507, "grad_norm": 0.5848398046537732, "learning_rate": 5.803164064225313e-06, "loss": 0.2686, "step": 39405 }, { "epoch": 4.595382462686567, "grad_norm": 0.5026316392413409, "learning_rate": 5.800868263572093e-06, "loss": 0.2508, "step": 39410 }, { "epoch": 4.595965485074627, "grad_norm": 0.5073754432419348, "learning_rate": 5.7985756893718585e-06, "loss": 0.2525, "step": 39415 }, { "epoch": 4.596548507462686, "grad_norm": 0.4958416479262377, "learning_rate": 5.796286341965492e-06, "loss": 0.245, "step": 39420 }, { "epoch": 4.5971315298507465, "grad_norm": 0.49618230305373573, "learning_rate": 5.794000221693403e-06, "loss": 0.257, "step": 39425 }, { "epoch": 4.597714552238806, "grad_norm": 0.5003617310823768, "learning_rate": 5.7917173288955105e-06, "loss": 0.2569, "step": 39430 }, { "epoch": 4.598297574626866, "grad_norm": 0.5744962967027848, "learning_rate": 5.789437663911261e-06, "loss": 0.2717, "step": 39435 }, { "epoch": 4.598880597014926, "grad_norm": 0.5351010564525503, "learning_rate": 5.787161227079613e-06, "loss": 0.2697, "step": 39440 }, { "epoch": 4.599463619402985, "grad_norm": 0.5091439653815344, "learning_rate": 5.7848880187390615e-06, "loss": 0.2643, "step": 39445 }, { "epoch": 4.600046641791045, "grad_norm": 0.519037907834975, "learning_rate": 5.782618039227603e-06, "loss": 0.2481, "step": 39450 }, { "epoch": 4.600629664179104, "grad_norm": 0.5497073186414385, "learning_rate": 5.7803512888827626e-06, "loss": 0.258, "step": 39455 }, { "epoch": 4.601212686567164, "grad_norm": 0.56716755693992, "learning_rate": 5.778087768041589e-06, "loss": 0.2768, "step": 39460 }, { "epoch": 4.6017957089552235, "grad_norm": 0.5217926690145527, "learning_rate": 5.7758274770406375e-06, "loss": 0.2476, "step": 39465 }, { "epoch": 4.602378731343284, "grad_norm": 0.4986334629889643, "learning_rate": 5.7735704162160005e-06, "loss": 0.2693, "step": 39470 }, { "epoch": 4.602961753731344, "grad_norm": 0.4957356408163735, "learning_rate": 5.771316585903276e-06, "loss": 0.2819, "step": 39475 }, { "epoch": 4.603544776119403, "grad_norm": 0.5330093711736085, "learning_rate": 5.769065986437591e-06, "loss": 0.2672, "step": 39480 }, { "epoch": 4.604127798507463, "grad_norm": 0.5645434647863821, "learning_rate": 5.766818618153584e-06, "loss": 0.2551, "step": 39485 }, { "epoch": 4.604710820895522, "grad_norm": 0.4946595097058112, "learning_rate": 5.764574481385419e-06, "loss": 0.2454, "step": 39490 }, { "epoch": 4.605293843283582, "grad_norm": 0.5224712282544505, "learning_rate": 5.762333576466778e-06, "loss": 0.2513, "step": 39495 }, { "epoch": 4.605876865671641, "grad_norm": 0.563427600903738, "learning_rate": 5.7600959037308626e-06, "loss": 0.2733, "step": 39500 }, { "epoch": 4.606459888059701, "grad_norm": 0.5579432122104788, "learning_rate": 5.75786146351039e-06, "loss": 0.2662, "step": 39505 }, { "epoch": 4.6070429104477615, "grad_norm": 0.5590879807675503, "learning_rate": 5.755630256137605e-06, "loss": 0.2775, "step": 39510 }, { "epoch": 4.607625932835821, "grad_norm": 0.4880211162127533, "learning_rate": 5.753402281944261e-06, "loss": 0.2511, "step": 39515 }, { "epoch": 4.608208955223881, "grad_norm": 0.5488164496375905, "learning_rate": 5.7511775412616415e-06, "loss": 0.2558, "step": 39520 }, { "epoch": 4.60879197761194, "grad_norm": 0.5506795924858428, "learning_rate": 5.748956034420539e-06, "loss": 0.2496, "step": 39525 }, { "epoch": 4.609375, "grad_norm": 0.5605310164010058, "learning_rate": 5.74673776175128e-06, "loss": 0.2593, "step": 39530 }, { "epoch": 4.60995802238806, "grad_norm": 0.5033277726183638, "learning_rate": 5.744522723583689e-06, "loss": 0.2426, "step": 39535 }, { "epoch": 4.610541044776119, "grad_norm": 0.5027357838767988, "learning_rate": 5.742310920247127e-06, "loss": 0.2507, "step": 39540 }, { "epoch": 4.611124067164179, "grad_norm": 0.5567561304167239, "learning_rate": 5.740102352070463e-06, "loss": 0.2844, "step": 39545 }, { "epoch": 4.6117070895522385, "grad_norm": 0.564171089492422, "learning_rate": 5.737897019382098e-06, "loss": 0.2635, "step": 39550 }, { "epoch": 4.612290111940299, "grad_norm": 0.5233508193926742, "learning_rate": 5.735694922509938e-06, "loss": 0.2519, "step": 39555 }, { "epoch": 4.612873134328359, "grad_norm": 0.5339753828795404, "learning_rate": 5.733496061781418e-06, "loss": 0.2648, "step": 39560 }, { "epoch": 4.613456156716418, "grad_norm": 0.48995155329985074, "learning_rate": 5.73130043752348e-06, "loss": 0.246, "step": 39565 }, { "epoch": 4.614039179104478, "grad_norm": 0.5177319985740271, "learning_rate": 5.729108050062603e-06, "loss": 0.2659, "step": 39570 }, { "epoch": 4.614622201492537, "grad_norm": 0.5445035797507868, "learning_rate": 5.726918899724759e-06, "loss": 0.2629, "step": 39575 }, { "epoch": 4.615205223880597, "grad_norm": 0.49770199544713895, "learning_rate": 5.7247329868354705e-06, "loss": 0.2587, "step": 39580 }, { "epoch": 4.615788246268656, "grad_norm": 0.5026847329904678, "learning_rate": 5.722550311719753e-06, "loss": 0.2548, "step": 39585 }, { "epoch": 4.616371268656716, "grad_norm": 0.5446304167463728, "learning_rate": 5.720370874702148e-06, "loss": 0.2852, "step": 39590 }, { "epoch": 4.6169542910447765, "grad_norm": 0.5383502217358929, "learning_rate": 5.7181946761067205e-06, "loss": 0.2648, "step": 39595 }, { "epoch": 4.617537313432836, "grad_norm": 0.5350750319413246, "learning_rate": 5.716021716257047e-06, "loss": 0.2593, "step": 39600 }, { "epoch": 4.618120335820896, "grad_norm": 0.5112603395958923, "learning_rate": 5.71385199547623e-06, "loss": 0.2599, "step": 39605 }, { "epoch": 4.618703358208955, "grad_norm": 0.5400977023317569, "learning_rate": 5.7116855140868874e-06, "loss": 0.243, "step": 39610 }, { "epoch": 4.619286380597015, "grad_norm": 0.600609893489187, "learning_rate": 5.709522272411145e-06, "loss": 0.2611, "step": 39615 }, { "epoch": 4.619869402985074, "grad_norm": 0.5343060392091388, "learning_rate": 5.707362270770665e-06, "loss": 0.2549, "step": 39620 }, { "epoch": 4.620452425373134, "grad_norm": 0.5099039748144819, "learning_rate": 5.705205509486613e-06, "loss": 0.2557, "step": 39625 }, { "epoch": 4.621035447761194, "grad_norm": 0.5196756335227851, "learning_rate": 5.703051988879689e-06, "loss": 0.2509, "step": 39630 }, { "epoch": 4.6216184701492535, "grad_norm": 0.5176297657516744, "learning_rate": 5.700901709270088e-06, "loss": 0.2606, "step": 39635 }, { "epoch": 4.622201492537314, "grad_norm": 0.48431253013961323, "learning_rate": 5.698754670977544e-06, "loss": 0.2455, "step": 39640 }, { "epoch": 4.622784514925373, "grad_norm": 0.5279944256153754, "learning_rate": 5.696610874321296e-06, "loss": 0.2685, "step": 39645 }, { "epoch": 4.623367537313433, "grad_norm": 0.7043291715170641, "learning_rate": 5.69447031962011e-06, "loss": 0.2755, "step": 39650 }, { "epoch": 4.623950559701493, "grad_norm": 0.5241944783645631, "learning_rate": 5.6923330071922634e-06, "loss": 0.2603, "step": 39655 }, { "epoch": 4.624533582089552, "grad_norm": 0.4551091243035712, "learning_rate": 5.690198937355561e-06, "loss": 0.2653, "step": 39660 }, { "epoch": 4.625116604477612, "grad_norm": 0.54382786301182, "learning_rate": 5.68806811042731e-06, "loss": 0.2636, "step": 39665 }, { "epoch": 4.625699626865671, "grad_norm": 0.5312020355109183, "learning_rate": 5.685940526724344e-06, "loss": 0.2512, "step": 39670 }, { "epoch": 4.6262826492537314, "grad_norm": 0.5030153366075172, "learning_rate": 5.683816186563018e-06, "loss": 0.2487, "step": 39675 }, { "epoch": 4.6268656716417915, "grad_norm": 0.5052790696536368, "learning_rate": 5.6816950902592005e-06, "loss": 0.2615, "step": 39680 }, { "epoch": 4.627448694029851, "grad_norm": 0.5152226706868136, "learning_rate": 5.6795772381282785e-06, "loss": 0.2535, "step": 39685 }, { "epoch": 4.628031716417911, "grad_norm": 0.5340266358428221, "learning_rate": 5.6774626304851555e-06, "loss": 0.2528, "step": 39690 }, { "epoch": 4.62861473880597, "grad_norm": 0.4777349890163476, "learning_rate": 5.675351267644256e-06, "loss": 0.2506, "step": 39695 }, { "epoch": 4.62919776119403, "grad_norm": 0.5433246293517688, "learning_rate": 5.673243149919512e-06, "loss": 0.2574, "step": 39700 }, { "epoch": 4.629780783582089, "grad_norm": 0.553171690141395, "learning_rate": 5.671138277624391e-06, "loss": 0.2646, "step": 39705 }, { "epoch": 4.630363805970149, "grad_norm": 0.5170862670736807, "learning_rate": 5.669036651071857e-06, "loss": 0.2543, "step": 39710 }, { "epoch": 4.6309468283582085, "grad_norm": 0.5189499541189913, "learning_rate": 5.66693827057441e-06, "loss": 0.2725, "step": 39715 }, { "epoch": 4.6315298507462686, "grad_norm": 0.5240036738205557, "learning_rate": 5.664843136444054e-06, "loss": 0.2564, "step": 39720 }, { "epoch": 4.632112873134329, "grad_norm": 0.5094303603605583, "learning_rate": 5.662751248992315e-06, "loss": 0.249, "step": 39725 }, { "epoch": 4.632695895522388, "grad_norm": 0.5419601046635678, "learning_rate": 5.660662608530239e-06, "loss": 0.2534, "step": 39730 }, { "epoch": 4.633278917910448, "grad_norm": 0.47543703416020394, "learning_rate": 5.658577215368389e-06, "loss": 0.2496, "step": 39735 }, { "epoch": 4.633861940298507, "grad_norm": 0.5580631260534408, "learning_rate": 5.6564950698168385e-06, "loss": 0.2709, "step": 39740 }, { "epoch": 4.634444962686567, "grad_norm": 0.531560243555757, "learning_rate": 5.654416172185187e-06, "loss": 0.2619, "step": 39745 }, { "epoch": 4.635027985074627, "grad_norm": 0.5205084469010693, "learning_rate": 5.652340522782542e-06, "loss": 0.253, "step": 39750 }, { "epoch": 4.635611007462686, "grad_norm": 0.5214763362082411, "learning_rate": 5.6502681219175355e-06, "loss": 0.2612, "step": 39755 }, { "epoch": 4.6361940298507465, "grad_norm": 0.5504813105626072, "learning_rate": 5.648198969898311e-06, "loss": 0.2495, "step": 39760 }, { "epoch": 4.636777052238806, "grad_norm": 0.5247833267231861, "learning_rate": 5.646133067032536e-06, "loss": 0.2481, "step": 39765 }, { "epoch": 4.637360074626866, "grad_norm": 0.4800118967667409, "learning_rate": 5.644070413627386e-06, "loss": 0.2439, "step": 39770 }, { "epoch": 4.637943097014926, "grad_norm": 0.5486091169962901, "learning_rate": 5.642011009989562e-06, "loss": 0.2735, "step": 39775 }, { "epoch": 4.638526119402985, "grad_norm": 0.5304177171428159, "learning_rate": 5.639954856425273e-06, "loss": 0.2762, "step": 39780 }, { "epoch": 4.639109141791045, "grad_norm": 0.5431082782054679, "learning_rate": 5.6379019532402554e-06, "loss": 0.2725, "step": 39785 }, { "epoch": 4.639692164179104, "grad_norm": 0.46956555725103133, "learning_rate": 5.6358523007397485e-06, "loss": 0.2535, "step": 39790 }, { "epoch": 4.640275186567164, "grad_norm": 0.576866891327412, "learning_rate": 5.633805899228524e-06, "loss": 0.2547, "step": 39795 }, { "epoch": 4.6408582089552235, "grad_norm": 0.5083759259602593, "learning_rate": 5.631762749010855e-06, "loss": 0.2382, "step": 39800 }, { "epoch": 4.641441231343284, "grad_norm": 0.582557041488828, "learning_rate": 5.629722850390544e-06, "loss": 0.2731, "step": 39805 }, { "epoch": 4.642024253731344, "grad_norm": 0.5333557531674228, "learning_rate": 5.6276862036709e-06, "loss": 0.2618, "step": 39810 }, { "epoch": 4.642607276119403, "grad_norm": 0.5261551531608604, "learning_rate": 5.625652809154753e-06, "loss": 0.2456, "step": 39815 }, { "epoch": 4.643190298507463, "grad_norm": 0.57089408956915, "learning_rate": 5.6236226671444555e-06, "loss": 0.2758, "step": 39820 }, { "epoch": 4.643773320895522, "grad_norm": 0.5279218139839572, "learning_rate": 5.6215957779418624e-06, "loss": 0.2547, "step": 39825 }, { "epoch": 4.644356343283582, "grad_norm": 0.47708116054931277, "learning_rate": 5.619572141848358e-06, "loss": 0.2551, "step": 39830 }, { "epoch": 4.644939365671641, "grad_norm": 0.4899332252791134, "learning_rate": 5.617551759164836e-06, "loss": 0.2452, "step": 39835 }, { "epoch": 4.645522388059701, "grad_norm": 0.5156010775338732, "learning_rate": 5.615534630191708e-06, "loss": 0.2583, "step": 39840 }, { "epoch": 4.6461054104477615, "grad_norm": 0.541749571599729, "learning_rate": 5.613520755228901e-06, "loss": 0.2509, "step": 39845 }, { "epoch": 4.646688432835821, "grad_norm": 0.5306833646724142, "learning_rate": 5.611510134575859e-06, "loss": 0.2629, "step": 39850 }, { "epoch": 4.647271455223881, "grad_norm": 0.5218121839872212, "learning_rate": 5.609502768531541e-06, "loss": 0.2563, "step": 39855 }, { "epoch": 4.64785447761194, "grad_norm": 0.5340644359523524, "learning_rate": 5.607498657394424e-06, "loss": 0.2782, "step": 39860 }, { "epoch": 4.6484375, "grad_norm": 0.5616557381089768, "learning_rate": 5.605497801462503e-06, "loss": 0.2869, "step": 39865 }, { "epoch": 4.64902052238806, "grad_norm": 0.5233009605449933, "learning_rate": 5.603500201033285e-06, "loss": 0.2647, "step": 39870 }, { "epoch": 4.649603544776119, "grad_norm": 0.5072572205369043, "learning_rate": 5.601505856403786e-06, "loss": 0.2506, "step": 39875 }, { "epoch": 4.650186567164179, "grad_norm": 0.520608341845551, "learning_rate": 5.59951476787056e-06, "loss": 0.2534, "step": 39880 }, { "epoch": 4.6507695895522385, "grad_norm": 0.5259777167662049, "learning_rate": 5.59752693572965e-06, "loss": 0.2608, "step": 39885 }, { "epoch": 4.651352611940299, "grad_norm": 0.5452055480624008, "learning_rate": 5.595542360276636e-06, "loss": 0.2482, "step": 39890 }, { "epoch": 4.651935634328359, "grad_norm": 0.5443251819133219, "learning_rate": 5.593561041806601e-06, "loss": 0.2603, "step": 39895 }, { "epoch": 4.652518656716418, "grad_norm": 0.5457749401429179, "learning_rate": 5.591582980614151e-06, "loss": 0.253, "step": 39900 }, { "epoch": 4.653101679104478, "grad_norm": 0.4980339071127979, "learning_rate": 5.589608176993401e-06, "loss": 0.2617, "step": 39905 }, { "epoch": 4.653684701492537, "grad_norm": 0.5607813456754676, "learning_rate": 5.587636631237991e-06, "loss": 0.2512, "step": 39910 }, { "epoch": 4.654267723880597, "grad_norm": 0.5211003881829674, "learning_rate": 5.585668343641064e-06, "loss": 0.2416, "step": 39915 }, { "epoch": 4.654850746268656, "grad_norm": 0.5170232868854111, "learning_rate": 5.583703314495294e-06, "loss": 0.2628, "step": 39920 }, { "epoch": 4.655433768656716, "grad_norm": 0.5249780373897078, "learning_rate": 5.58174154409285e-06, "loss": 0.2637, "step": 39925 }, { "epoch": 4.6560167910447765, "grad_norm": 0.5088062574359778, "learning_rate": 5.579783032725441e-06, "loss": 0.2746, "step": 39930 }, { "epoch": 4.656599813432836, "grad_norm": 0.4989325567462243, "learning_rate": 5.577827780684269e-06, "loss": 0.2541, "step": 39935 }, { "epoch": 4.657182835820896, "grad_norm": 0.49775649080391965, "learning_rate": 5.5758757882600706e-06, "loss": 0.2568, "step": 39940 }, { "epoch": 4.657765858208955, "grad_norm": 0.5303577158898439, "learning_rate": 5.573927055743082e-06, "loss": 0.2704, "step": 39945 }, { "epoch": 4.658348880597015, "grad_norm": 0.5266344318925089, "learning_rate": 5.57198158342306e-06, "loss": 0.2594, "step": 39950 }, { "epoch": 4.658931902985074, "grad_norm": 0.5586617486799256, "learning_rate": 5.5700393715892815e-06, "loss": 0.2585, "step": 39955 }, { "epoch": 4.659514925373134, "grad_norm": 0.5464402018931301, "learning_rate": 5.568100420530533e-06, "loss": 0.2639, "step": 39960 }, { "epoch": 4.660097947761194, "grad_norm": 0.49288058807736795, "learning_rate": 5.566164730535119e-06, "loss": 0.2647, "step": 39965 }, { "epoch": 4.6606809701492535, "grad_norm": 0.5437758561165839, "learning_rate": 5.5642323018908595e-06, "loss": 0.2703, "step": 39970 }, { "epoch": 4.661263992537314, "grad_norm": 0.5346299303623872, "learning_rate": 5.5623031348850815e-06, "loss": 0.2569, "step": 39975 }, { "epoch": 4.661847014925373, "grad_norm": 0.533172516862136, "learning_rate": 5.560377229804644e-06, "loss": 0.2582, "step": 39980 }, { "epoch": 4.662430037313433, "grad_norm": 0.5399634745002946, "learning_rate": 5.558454586935901e-06, "loss": 0.2805, "step": 39985 }, { "epoch": 4.663013059701493, "grad_norm": 0.47633814068425956, "learning_rate": 5.556535206564733e-06, "loss": 0.255, "step": 39990 }, { "epoch": 4.663596082089552, "grad_norm": 0.518734948250553, "learning_rate": 5.554619088976538e-06, "loss": 0.2497, "step": 39995 }, { "epoch": 4.664179104477612, "grad_norm": 0.5554992653820158, "learning_rate": 5.55270623445622e-06, "loss": 0.2492, "step": 40000 }, { "epoch": 4.664762126865671, "grad_norm": 0.5494475968014342, "learning_rate": 5.5507966432882056e-06, "loss": 0.2588, "step": 40005 }, { "epoch": 4.6653451492537314, "grad_norm": 0.5386402157373839, "learning_rate": 5.548890315756433e-06, "loss": 0.2579, "step": 40010 }, { "epoch": 4.6659281716417915, "grad_norm": 0.5444706039562287, "learning_rate": 5.546987252144351e-06, "loss": 0.2893, "step": 40015 }, { "epoch": 4.666511194029851, "grad_norm": 0.4879983142995859, "learning_rate": 5.545087452734928e-06, "loss": 0.2458, "step": 40020 }, { "epoch": 4.667094216417911, "grad_norm": 0.5557708654373923, "learning_rate": 5.543190917810647e-06, "loss": 0.2515, "step": 40025 }, { "epoch": 4.66767723880597, "grad_norm": 0.5626302490494139, "learning_rate": 5.541297647653505e-06, "loss": 0.2566, "step": 40030 }, { "epoch": 4.66826026119403, "grad_norm": 0.5570188393236565, "learning_rate": 5.539407642545012e-06, "loss": 0.2789, "step": 40035 }, { "epoch": 4.668843283582089, "grad_norm": 0.5201331559099625, "learning_rate": 5.537520902766193e-06, "loss": 0.268, "step": 40040 }, { "epoch": 4.669426305970149, "grad_norm": 0.543373508115044, "learning_rate": 5.535637428597591e-06, "loss": 0.269, "step": 40045 }, { "epoch": 4.6700093283582085, "grad_norm": 0.5568551024166752, "learning_rate": 5.533757220319257e-06, "loss": 0.2549, "step": 40050 }, { "epoch": 4.6705923507462686, "grad_norm": 0.5361543937930828, "learning_rate": 5.531880278210764e-06, "loss": 0.2474, "step": 40055 }, { "epoch": 4.671175373134329, "grad_norm": 0.5332695866731167, "learning_rate": 5.5300066025511885e-06, "loss": 0.2569, "step": 40060 }, { "epoch": 4.671758395522388, "grad_norm": 0.589542147153373, "learning_rate": 5.528136193619137e-06, "loss": 0.2773, "step": 40065 }, { "epoch": 4.672341417910448, "grad_norm": 0.5398247393515335, "learning_rate": 5.526269051692717e-06, "loss": 0.2604, "step": 40070 }, { "epoch": 4.672924440298507, "grad_norm": 0.5602495866514481, "learning_rate": 5.524405177049553e-06, "loss": 0.2557, "step": 40075 }, { "epoch": 4.673507462686567, "grad_norm": 0.5057896777527263, "learning_rate": 5.522544569966786e-06, "loss": 0.2585, "step": 40080 }, { "epoch": 4.674090485074627, "grad_norm": 0.5047447240054863, "learning_rate": 5.520687230721073e-06, "loss": 0.2586, "step": 40085 }, { "epoch": 4.674673507462686, "grad_norm": 0.5719492302799094, "learning_rate": 5.518833159588582e-06, "loss": 0.2566, "step": 40090 }, { "epoch": 4.6752565298507465, "grad_norm": 0.5195506542381323, "learning_rate": 5.516982356844994e-06, "loss": 0.2547, "step": 40095 }, { "epoch": 4.675839552238806, "grad_norm": 0.4927618908976009, "learning_rate": 5.515134822765504e-06, "loss": 0.2472, "step": 40100 }, { "epoch": 4.676422574626866, "grad_norm": 12.42300546948787, "learning_rate": 5.513290557624827e-06, "loss": 0.265, "step": 40105 }, { "epoch": 4.677005597014926, "grad_norm": 0.5843261635336789, "learning_rate": 5.511449561697183e-06, "loss": 0.256, "step": 40110 }, { "epoch": 4.677588619402985, "grad_norm": 0.6219886003787144, "learning_rate": 5.509611835256317e-06, "loss": 0.263, "step": 40115 }, { "epoch": 4.678171641791045, "grad_norm": 0.563527995817142, "learning_rate": 5.507777378575474e-06, "loss": 0.2835, "step": 40120 }, { "epoch": 4.678754664179104, "grad_norm": 0.4971100095783303, "learning_rate": 5.505946191927424e-06, "loss": 0.2659, "step": 40125 }, { "epoch": 4.679337686567164, "grad_norm": 0.535047842222231, "learning_rate": 5.504118275584444e-06, "loss": 0.2419, "step": 40130 }, { "epoch": 4.6799207089552235, "grad_norm": 0.5134808224822797, "learning_rate": 5.5022936298183316e-06, "loss": 0.2634, "step": 40135 }, { "epoch": 4.680503731343284, "grad_norm": 0.5704217200750399, "learning_rate": 5.500472254900392e-06, "loss": 0.2861, "step": 40140 }, { "epoch": 4.681086753731344, "grad_norm": 0.5068672631787788, "learning_rate": 5.49865415110145e-06, "loss": 0.2542, "step": 40145 }, { "epoch": 4.681669776119403, "grad_norm": 0.5313202522974988, "learning_rate": 5.49683931869183e-06, "loss": 0.241, "step": 40150 }, { "epoch": 4.682252798507463, "grad_norm": 0.5380947184806353, "learning_rate": 5.495027757941394e-06, "loss": 0.2525, "step": 40155 }, { "epoch": 4.682835820895522, "grad_norm": 0.5295733825833271, "learning_rate": 5.4932194691194905e-06, "loss": 0.247, "step": 40160 }, { "epoch": 4.683418843283582, "grad_norm": 0.5395528578987526, "learning_rate": 5.491414452495006e-06, "loss": 0.2479, "step": 40165 }, { "epoch": 4.684001865671641, "grad_norm": 0.5966249657406705, "learning_rate": 5.489612708336324e-06, "loss": 0.274, "step": 40170 }, { "epoch": 4.684584888059701, "grad_norm": 0.563176780579703, "learning_rate": 5.487814236911344e-06, "loss": 0.2607, "step": 40175 }, { "epoch": 4.6851679104477615, "grad_norm": 0.5203259541293299, "learning_rate": 5.486019038487483e-06, "loss": 0.2772, "step": 40180 }, { "epoch": 4.685750932835821, "grad_norm": 0.5344403281531664, "learning_rate": 5.484227113331673e-06, "loss": 0.2677, "step": 40185 }, { "epoch": 4.686333955223881, "grad_norm": 0.5177440774023253, "learning_rate": 5.482438461710355e-06, "loss": 0.2613, "step": 40190 }, { "epoch": 4.68691697761194, "grad_norm": 0.5305643996489512, "learning_rate": 5.480653083889483e-06, "loss": 0.2558, "step": 40195 }, { "epoch": 4.6875, "grad_norm": 0.5786857544930206, "learning_rate": 5.4788709801345244e-06, "loss": 0.2625, "step": 40200 }, { "epoch": 4.68808302238806, "grad_norm": 0.5069624380940012, "learning_rate": 5.477092150710465e-06, "loss": 0.2569, "step": 40205 }, { "epoch": 4.688666044776119, "grad_norm": 0.5489895315358849, "learning_rate": 5.475316595881796e-06, "loss": 0.262, "step": 40210 }, { "epoch": 4.689249067164179, "grad_norm": 0.535416688505873, "learning_rate": 5.473544315912525e-06, "loss": 0.2806, "step": 40215 }, { "epoch": 4.6898320895522385, "grad_norm": 0.5196999033028088, "learning_rate": 5.471775311066177e-06, "loss": 0.2654, "step": 40220 }, { "epoch": 4.690415111940299, "grad_norm": 0.5365107668452866, "learning_rate": 5.470009581605784e-06, "loss": 0.2651, "step": 40225 }, { "epoch": 4.690998134328359, "grad_norm": 0.4878512621687332, "learning_rate": 5.468247127793893e-06, "loss": 0.2718, "step": 40230 }, { "epoch": 4.691581156716418, "grad_norm": 0.5093462351218783, "learning_rate": 5.46648794989256e-06, "loss": 0.245, "step": 40235 }, { "epoch": 4.692164179104478, "grad_norm": 0.5283044061072845, "learning_rate": 5.464732048163365e-06, "loss": 0.2652, "step": 40240 }, { "epoch": 4.692747201492537, "grad_norm": 0.570138435529778, "learning_rate": 5.462979422867388e-06, "loss": 0.2958, "step": 40245 }, { "epoch": 4.693330223880597, "grad_norm": 0.4825031535208165, "learning_rate": 5.461230074265233e-06, "loss": 0.2523, "step": 40250 }, { "epoch": 4.693913246268656, "grad_norm": 0.5455881606816707, "learning_rate": 5.459484002617008e-06, "loss": 0.2535, "step": 40255 }, { "epoch": 4.694496268656716, "grad_norm": 0.4903244304349601, "learning_rate": 5.4577412081823355e-06, "loss": 0.2615, "step": 40260 }, { "epoch": 4.6950792910447765, "grad_norm": 0.5377195012743091, "learning_rate": 5.456001691220357e-06, "loss": 0.2653, "step": 40265 }, { "epoch": 4.695662313432836, "grad_norm": 0.5292935130242334, "learning_rate": 5.45426545198972e-06, "loss": 0.2742, "step": 40270 }, { "epoch": 4.696245335820896, "grad_norm": 0.5449815900092477, "learning_rate": 5.452532490748581e-06, "loss": 0.2791, "step": 40275 }, { "epoch": 4.696828358208955, "grad_norm": 0.5402211664905027, "learning_rate": 5.450802807754625e-06, "loss": 0.2683, "step": 40280 }, { "epoch": 4.697411380597015, "grad_norm": 0.5602627461356222, "learning_rate": 5.449076403265029e-06, "loss": 0.2609, "step": 40285 }, { "epoch": 4.697994402985074, "grad_norm": 0.5371661097644681, "learning_rate": 5.4473532775365026e-06, "loss": 0.2652, "step": 40290 }, { "epoch": 4.698577425373134, "grad_norm": 0.5055991793671961, "learning_rate": 5.44563343082525e-06, "loss": 0.2453, "step": 40295 }, { "epoch": 4.699160447761194, "grad_norm": 0.5283521426803366, "learning_rate": 5.443916863387002e-06, "loss": 0.2695, "step": 40300 }, { "epoch": 4.6997434701492535, "grad_norm": 0.5347185758970066, "learning_rate": 5.4422035754769915e-06, "loss": 0.2692, "step": 40305 }, { "epoch": 4.700326492537314, "grad_norm": 0.5416159480228936, "learning_rate": 5.4404935673499685e-06, "loss": 0.2593, "step": 40310 }, { "epoch": 4.700909514925373, "grad_norm": 0.511953850952864, "learning_rate": 5.438786839260197e-06, "loss": 0.2576, "step": 40315 }, { "epoch": 4.701492537313433, "grad_norm": 0.5346523548979524, "learning_rate": 5.437083391461452e-06, "loss": 0.2751, "step": 40320 }, { "epoch": 4.702075559701493, "grad_norm": 0.585314966757987, "learning_rate": 5.4353832242070155e-06, "loss": 0.257, "step": 40325 }, { "epoch": 4.702658582089552, "grad_norm": 0.5304246627779913, "learning_rate": 5.43368633774969e-06, "loss": 0.2773, "step": 40330 }, { "epoch": 4.703241604477612, "grad_norm": 0.5276605729069379, "learning_rate": 5.43199273234178e-06, "loss": 0.2613, "step": 40335 }, { "epoch": 4.703824626865671, "grad_norm": 0.5367997118697415, "learning_rate": 5.43030240823512e-06, "loss": 0.2578, "step": 40340 }, { "epoch": 4.7044076492537314, "grad_norm": 0.5098762450553499, "learning_rate": 5.428615365681034e-06, "loss": 0.2576, "step": 40345 }, { "epoch": 4.7049906716417915, "grad_norm": 0.4949497556650158, "learning_rate": 5.426931604930375e-06, "loss": 0.2399, "step": 40350 }, { "epoch": 4.705573694029851, "grad_norm": 0.5827573260925429, "learning_rate": 5.425251126233498e-06, "loss": 0.2609, "step": 40355 }, { "epoch": 4.706156716417911, "grad_norm": 0.5425470511542679, "learning_rate": 5.423573929840277e-06, "loss": 0.2593, "step": 40360 }, { "epoch": 4.70673973880597, "grad_norm": 0.5230301784054733, "learning_rate": 5.421900016000093e-06, "loss": 0.2694, "step": 40365 }, { "epoch": 4.70732276119403, "grad_norm": 0.5335007537950297, "learning_rate": 5.420229384961847e-06, "loss": 0.2526, "step": 40370 }, { "epoch": 4.707905783582089, "grad_norm": 0.5097797666119723, "learning_rate": 5.418562036973937e-06, "loss": 0.2686, "step": 40375 }, { "epoch": 4.708488805970149, "grad_norm": 0.518227965926884, "learning_rate": 5.416897972284287e-06, "loss": 0.2537, "step": 40380 }, { "epoch": 4.7090718283582085, "grad_norm": 0.532995801550675, "learning_rate": 5.415237191140326e-06, "loss": 0.2619, "step": 40385 }, { "epoch": 4.7096548507462686, "grad_norm": 0.5120121487426791, "learning_rate": 5.413579693788995e-06, "loss": 0.2478, "step": 40390 }, { "epoch": 4.710237873134329, "grad_norm": 0.5699124059926832, "learning_rate": 5.411925480476752e-06, "loss": 0.2814, "step": 40395 }, { "epoch": 4.710820895522388, "grad_norm": 0.4817653417686525, "learning_rate": 5.410274551449559e-06, "loss": 0.2579, "step": 40400 }, { "epoch": 4.711403917910448, "grad_norm": 0.5000307120977543, "learning_rate": 5.408626906952895e-06, "loss": 0.254, "step": 40405 }, { "epoch": 4.711986940298507, "grad_norm": 0.5539859309501919, "learning_rate": 5.406982547231746e-06, "loss": 0.2586, "step": 40410 }, { "epoch": 4.712569962686567, "grad_norm": 0.5030322900301132, "learning_rate": 5.40534147253062e-06, "loss": 0.2531, "step": 40415 }, { "epoch": 4.713152985074627, "grad_norm": 0.5064228139112594, "learning_rate": 5.403703683093517e-06, "loss": 0.2694, "step": 40420 }, { "epoch": 4.713736007462686, "grad_norm": 0.6053394100947187, "learning_rate": 5.402069179163974e-06, "loss": 0.2794, "step": 40425 }, { "epoch": 4.7143190298507465, "grad_norm": 0.5284915173440232, "learning_rate": 5.400437960985017e-06, "loss": 0.2582, "step": 40430 }, { "epoch": 4.714902052238806, "grad_norm": 0.5162057214643305, "learning_rate": 5.398810028799196e-06, "loss": 0.2695, "step": 40435 }, { "epoch": 4.715485074626866, "grad_norm": 0.5218416661881635, "learning_rate": 5.397185382848568e-06, "loss": 0.2617, "step": 40440 }, { "epoch": 4.716068097014926, "grad_norm": 0.5124408552690876, "learning_rate": 5.3955640233747e-06, "loss": 0.2708, "step": 40445 }, { "epoch": 4.716651119402985, "grad_norm": 0.4967211274025528, "learning_rate": 5.393945950618678e-06, "loss": 0.2612, "step": 40450 }, { "epoch": 4.717234141791045, "grad_norm": 0.5516360632320493, "learning_rate": 5.392331164821091e-06, "loss": 0.2588, "step": 40455 }, { "epoch": 4.717817164179104, "grad_norm": 0.5251034968437159, "learning_rate": 5.39071966622204e-06, "loss": 0.2647, "step": 40460 }, { "epoch": 4.718400186567164, "grad_norm": 0.5323528852485695, "learning_rate": 5.3891114550611434e-06, "loss": 0.279, "step": 40465 }, { "epoch": 4.7189832089552235, "grad_norm": 0.5446225597407718, "learning_rate": 5.387506531577523e-06, "loss": 0.2574, "step": 40470 }, { "epoch": 4.719566231343284, "grad_norm": 0.519963815275516, "learning_rate": 5.385904896009821e-06, "loss": 0.2644, "step": 40475 }, { "epoch": 4.720149253731344, "grad_norm": 0.5711317539586047, "learning_rate": 5.384306548596178e-06, "loss": 0.2637, "step": 40480 }, { "epoch": 4.720732276119403, "grad_norm": 0.5320934777742895, "learning_rate": 5.382711489574259e-06, "loss": 0.2633, "step": 40485 }, { "epoch": 4.721315298507463, "grad_norm": 0.5258731490344641, "learning_rate": 5.3811197191812296e-06, "loss": 0.2523, "step": 40490 }, { "epoch": 4.721898320895522, "grad_norm": 0.5204609157125093, "learning_rate": 5.379531237653774e-06, "loss": 0.2613, "step": 40495 }, { "epoch": 4.722481343283582, "grad_norm": 0.49552434747541724, "learning_rate": 5.377946045228084e-06, "loss": 0.2448, "step": 40500 }, { "epoch": 4.723064365671641, "grad_norm": 0.5685699400908784, "learning_rate": 5.376364142139862e-06, "loss": 0.291, "step": 40505 }, { "epoch": 4.723647388059701, "grad_norm": 0.5229595114578796, "learning_rate": 5.374785528624317e-06, "loss": 0.2706, "step": 40510 }, { "epoch": 4.7242304104477615, "grad_norm": 0.5530791731934469, "learning_rate": 5.3732102049161845e-06, "loss": 0.2662, "step": 40515 }, { "epoch": 4.724813432835821, "grad_norm": 0.5389337714303444, "learning_rate": 5.37163817124969e-06, "loss": 0.2543, "step": 40520 }, { "epoch": 4.725396455223881, "grad_norm": 0.5106618022840242, "learning_rate": 5.370069427858584e-06, "loss": 0.2566, "step": 40525 }, { "epoch": 4.72597947761194, "grad_norm": 0.5233962639447487, "learning_rate": 5.368503974976122e-06, "loss": 0.2609, "step": 40530 }, { "epoch": 4.7265625, "grad_norm": 0.5187126215016823, "learning_rate": 5.366941812835075e-06, "loss": 0.255, "step": 40535 }, { "epoch": 4.72714552238806, "grad_norm": 0.5812894275984566, "learning_rate": 5.36538294166772e-06, "loss": 0.2774, "step": 40540 }, { "epoch": 4.727728544776119, "grad_norm": 0.524220816828342, "learning_rate": 5.363827361705844e-06, "loss": 0.2556, "step": 40545 }, { "epoch": 4.728311567164179, "grad_norm": 0.524510929715867, "learning_rate": 5.362275073180749e-06, "loss": 0.2737, "step": 40550 }, { "epoch": 4.7288945895522385, "grad_norm": 0.5310585456415964, "learning_rate": 5.360726076323246e-06, "loss": 0.26, "step": 40555 }, { "epoch": 4.729477611940299, "grad_norm": 0.5829572995701353, "learning_rate": 5.3591803713636545e-06, "loss": 0.2532, "step": 40560 }, { "epoch": 4.730060634328359, "grad_norm": 0.49653735151619316, "learning_rate": 5.357637958531805e-06, "loss": 0.2677, "step": 40565 }, { "epoch": 4.730643656716418, "grad_norm": 0.5483304065130487, "learning_rate": 5.3560988380570405e-06, "loss": 0.2671, "step": 40570 }, { "epoch": 4.731226679104478, "grad_norm": 0.5614680943080873, "learning_rate": 5.3545630101682155e-06, "loss": 0.2667, "step": 40575 }, { "epoch": 4.731809701492537, "grad_norm": 0.5035387139025418, "learning_rate": 5.353030475093694e-06, "loss": 0.2511, "step": 40580 }, { "epoch": 4.732392723880597, "grad_norm": 0.5161985002683129, "learning_rate": 5.351501233061343e-06, "loss": 0.2738, "step": 40585 }, { "epoch": 4.732975746268656, "grad_norm": 0.5531027710723204, "learning_rate": 5.349975284298552e-06, "loss": 0.2613, "step": 40590 }, { "epoch": 4.733558768656716, "grad_norm": 0.5022586841054332, "learning_rate": 5.348452629032209e-06, "loss": 0.2692, "step": 40595 }, { "epoch": 4.7341417910447765, "grad_norm": 0.5253995608443942, "learning_rate": 5.346933267488726e-06, "loss": 0.2648, "step": 40600 }, { "epoch": 4.734724813432836, "grad_norm": 0.5104069186945384, "learning_rate": 5.345417199894012e-06, "loss": 0.2516, "step": 40605 }, { "epoch": 4.735307835820896, "grad_norm": 0.559296865010764, "learning_rate": 5.343904426473493e-06, "loss": 0.2544, "step": 40610 }, { "epoch": 4.735890858208955, "grad_norm": 0.5548292756496112, "learning_rate": 5.342394947452106e-06, "loss": 0.2689, "step": 40615 }, { "epoch": 4.736473880597015, "grad_norm": 0.5239717307752404, "learning_rate": 5.340888763054291e-06, "loss": 0.2654, "step": 40620 }, { "epoch": 4.737056902985074, "grad_norm": 0.5688670435061347, "learning_rate": 5.3393858735040074e-06, "loss": 0.268, "step": 40625 }, { "epoch": 4.737639925373134, "grad_norm": 0.5198015841583459, "learning_rate": 5.337886279024722e-06, "loss": 0.2487, "step": 40630 }, { "epoch": 4.738222947761194, "grad_norm": 0.5503052586607899, "learning_rate": 5.336389979839405e-06, "loss": 0.2556, "step": 40635 }, { "epoch": 4.7388059701492535, "grad_norm": 0.4806789494119374, "learning_rate": 5.3348969761705446e-06, "loss": 0.2594, "step": 40640 }, { "epoch": 4.739388992537314, "grad_norm": 0.49998377720445447, "learning_rate": 5.3334072682401365e-06, "loss": 0.2524, "step": 40645 }, { "epoch": 4.739972014925373, "grad_norm": 0.5404004453849443, "learning_rate": 5.331920856269686e-06, "loss": 0.2525, "step": 40650 }, { "epoch": 4.740555037313433, "grad_norm": 0.5412806575945461, "learning_rate": 5.330437740480206e-06, "loss": 0.2661, "step": 40655 }, { "epoch": 4.741138059701493, "grad_norm": 0.4893659442503186, "learning_rate": 5.328957921092224e-06, "loss": 0.2619, "step": 40660 }, { "epoch": 4.741721082089552, "grad_norm": 0.5552724233256167, "learning_rate": 5.327481398325775e-06, "loss": 0.265, "step": 40665 }, { "epoch": 4.742304104477612, "grad_norm": 0.5489696726807467, "learning_rate": 5.326008172400402e-06, "loss": 0.2758, "step": 40670 }, { "epoch": 4.742887126865671, "grad_norm": 0.500392286479777, "learning_rate": 5.324538243535162e-06, "loss": 0.2522, "step": 40675 }, { "epoch": 4.7434701492537314, "grad_norm": 0.5139908114288545, "learning_rate": 5.323071611948619e-06, "loss": 0.2562, "step": 40680 }, { "epoch": 4.7440531716417915, "grad_norm": 0.5034898497943342, "learning_rate": 5.3216082778588426e-06, "loss": 0.2526, "step": 40685 }, { "epoch": 4.744636194029851, "grad_norm": 0.5437215997709941, "learning_rate": 5.320148241483422e-06, "loss": 0.2584, "step": 40690 }, { "epoch": 4.745219216417911, "grad_norm": 0.5085616862564696, "learning_rate": 5.318691503039448e-06, "loss": 0.2821, "step": 40695 }, { "epoch": 4.74580223880597, "grad_norm": 0.5749620527731026, "learning_rate": 5.317238062743527e-06, "loss": 0.2702, "step": 40700 }, { "epoch": 4.74638526119403, "grad_norm": 0.5476980176559368, "learning_rate": 5.315787920811766e-06, "loss": 0.2519, "step": 40705 }, { "epoch": 4.746968283582089, "grad_norm": 0.4931259630495937, "learning_rate": 5.31434107745979e-06, "loss": 0.245, "step": 40710 }, { "epoch": 4.747551305970149, "grad_norm": 0.5631639328610191, "learning_rate": 5.312897532902733e-06, "loss": 0.2585, "step": 40715 }, { "epoch": 4.7481343283582085, "grad_norm": 0.5407894786123406, "learning_rate": 5.311457287355232e-06, "loss": 0.2596, "step": 40720 }, { "epoch": 4.7487173507462686, "grad_norm": 0.525482713469301, "learning_rate": 5.310020341031439e-06, "loss": 0.2536, "step": 40725 }, { "epoch": 4.749300373134329, "grad_norm": 0.5247967627650835, "learning_rate": 5.3085866941450185e-06, "loss": 0.2442, "step": 40730 }, { "epoch": 4.749883395522388, "grad_norm": 0.5372888627745188, "learning_rate": 5.307156346909135e-06, "loss": 0.2777, "step": 40735 }, { "epoch": 4.750466417910448, "grad_norm": 0.509979551698758, "learning_rate": 5.3057292995364695e-06, "loss": 0.2643, "step": 40740 }, { "epoch": 4.751049440298507, "grad_norm": 0.5673777341030805, "learning_rate": 5.304305552239209e-06, "loss": 0.2719, "step": 40745 }, { "epoch": 4.751632462686567, "grad_norm": 0.6138984591484716, "learning_rate": 5.302885105229052e-06, "loss": 0.2473, "step": 40750 }, { "epoch": 4.752215485074627, "grad_norm": 0.5511838873187485, "learning_rate": 5.301467958717205e-06, "loss": 0.2635, "step": 40755 }, { "epoch": 4.752798507462686, "grad_norm": 0.5626985726020425, "learning_rate": 5.300054112914385e-06, "loss": 0.2637, "step": 40760 }, { "epoch": 4.7533815298507465, "grad_norm": 0.5653948297545796, "learning_rate": 5.298643568030817e-06, "loss": 0.2602, "step": 40765 }, { "epoch": 4.753964552238806, "grad_norm": 0.5995559376975006, "learning_rate": 5.297236324276231e-06, "loss": 0.2619, "step": 40770 }, { "epoch": 4.754547574626866, "grad_norm": 0.4862563833282966, "learning_rate": 5.295832381859881e-06, "loss": 0.2473, "step": 40775 }, { "epoch": 4.755130597014926, "grad_norm": 0.5391813031082727, "learning_rate": 5.294431740990509e-06, "loss": 0.251, "step": 40780 }, { "epoch": 4.755713619402985, "grad_norm": 0.5059009637354451, "learning_rate": 5.293034401876384e-06, "loss": 0.2494, "step": 40785 }, { "epoch": 4.756296641791045, "grad_norm": 0.5855608841540562, "learning_rate": 5.291640364725272e-06, "loss": 0.2731, "step": 40790 }, { "epoch": 4.756879664179104, "grad_norm": 0.5309524178372007, "learning_rate": 5.290249629744457e-06, "loss": 0.2557, "step": 40795 }, { "epoch": 4.757462686567164, "grad_norm": 0.5965301157496198, "learning_rate": 5.288862197140726e-06, "loss": 0.2693, "step": 40800 }, { "epoch": 4.7580457089552235, "grad_norm": 0.5934960101275178, "learning_rate": 5.28747806712038e-06, "loss": 0.2659, "step": 40805 }, { "epoch": 4.758628731343284, "grad_norm": 0.565589469279002, "learning_rate": 5.286097239889219e-06, "loss": 0.2785, "step": 40810 }, { "epoch": 4.759211753731344, "grad_norm": 0.5046709999882028, "learning_rate": 5.284719715652565e-06, "loss": 0.2567, "step": 40815 }, { "epoch": 4.759794776119403, "grad_norm": 0.5118365450693864, "learning_rate": 5.283345494615238e-06, "loss": 0.2557, "step": 40820 }, { "epoch": 4.760377798507463, "grad_norm": 0.5205303636235599, "learning_rate": 5.281974576981579e-06, "loss": 0.256, "step": 40825 }, { "epoch": 4.760960820895522, "grad_norm": 0.5395217583502233, "learning_rate": 5.280606962955423e-06, "loss": 0.2661, "step": 40830 }, { "epoch": 4.761543843283582, "grad_norm": 0.5485553093877918, "learning_rate": 5.279242652740121e-06, "loss": 0.2776, "step": 40835 }, { "epoch": 4.762126865671641, "grad_norm": 0.5254161397828818, "learning_rate": 5.277881646538537e-06, "loss": 0.2517, "step": 40840 }, { "epoch": 4.762709888059701, "grad_norm": 0.5385952564235336, "learning_rate": 5.276523944553039e-06, "loss": 0.269, "step": 40845 }, { "epoch": 4.7632929104477615, "grad_norm": 0.5207889980547087, "learning_rate": 5.275169546985502e-06, "loss": 0.249, "step": 40850 }, { "epoch": 4.763875932835821, "grad_norm": 0.4855190115549899, "learning_rate": 5.2738184540373165e-06, "loss": 0.2468, "step": 40855 }, { "epoch": 4.764458955223881, "grad_norm": 0.6091454621186038, "learning_rate": 5.272470665909368e-06, "loss": 0.271, "step": 40860 }, { "epoch": 4.76504197761194, "grad_norm": 0.49247077988336013, "learning_rate": 5.271126182802072e-06, "loss": 0.2644, "step": 40865 }, { "epoch": 4.765625, "grad_norm": 0.5095254342060529, "learning_rate": 5.269785004915328e-06, "loss": 0.2556, "step": 40870 }, { "epoch": 4.76620802238806, "grad_norm": 0.5959620475722028, "learning_rate": 5.268447132448565e-06, "loss": 0.2763, "step": 40875 }, { "epoch": 4.766791044776119, "grad_norm": 0.4929494771026853, "learning_rate": 5.267112565600707e-06, "loss": 0.2531, "step": 40880 }, { "epoch": 4.767374067164179, "grad_norm": 0.5381638579536344, "learning_rate": 5.265781304570194e-06, "loss": 0.2544, "step": 40885 }, { "epoch": 4.7679570895522385, "grad_norm": 0.5320593743143683, "learning_rate": 5.26445334955497e-06, "loss": 0.2494, "step": 40890 }, { "epoch": 4.768540111940299, "grad_norm": 0.5045017763591447, "learning_rate": 5.263128700752493e-06, "loss": 0.2483, "step": 40895 }, { "epoch": 4.769123134328359, "grad_norm": 0.5700101588266805, "learning_rate": 5.261807358359719e-06, "loss": 0.2723, "step": 40900 }, { "epoch": 4.769706156716418, "grad_norm": 0.5635033094370631, "learning_rate": 5.260489322573125e-06, "loss": 0.2725, "step": 40905 }, { "epoch": 4.770289179104478, "grad_norm": 0.6278980798894483, "learning_rate": 5.259174593588688e-06, "loss": 0.2538, "step": 40910 }, { "epoch": 4.770872201492537, "grad_norm": 0.5626121390410826, "learning_rate": 5.257863171601895e-06, "loss": 0.2507, "step": 40915 }, { "epoch": 4.771455223880597, "grad_norm": 0.5656869405961013, "learning_rate": 5.25655505680774e-06, "loss": 0.2576, "step": 40920 }, { "epoch": 4.772038246268656, "grad_norm": 0.5341669013729338, "learning_rate": 5.255250249400732e-06, "loss": 0.2667, "step": 40925 }, { "epoch": 4.772621268656716, "grad_norm": 0.5691003546065335, "learning_rate": 5.253948749574879e-06, "loss": 0.2784, "step": 40930 }, { "epoch": 4.7732042910447765, "grad_norm": 0.5362236931275812, "learning_rate": 5.252650557523707e-06, "loss": 0.2567, "step": 40935 }, { "epoch": 4.773787313432836, "grad_norm": 0.5153875571339367, "learning_rate": 5.2513556734402384e-06, "loss": 0.2547, "step": 40940 }, { "epoch": 4.774370335820896, "grad_norm": 0.5544388593610338, "learning_rate": 5.2500640975170116e-06, "loss": 0.2742, "step": 40945 }, { "epoch": 4.774953358208955, "grad_norm": 0.5186078722498859, "learning_rate": 5.248775829946076e-06, "loss": 0.26, "step": 40950 }, { "epoch": 4.775536380597015, "grad_norm": 0.5039981307918615, "learning_rate": 5.247490870918979e-06, "loss": 0.2615, "step": 40955 }, { "epoch": 4.776119402985074, "grad_norm": 0.5411918132359785, "learning_rate": 5.2462092206267864e-06, "loss": 0.2665, "step": 40960 }, { "epoch": 4.776702425373134, "grad_norm": 0.5229118820120464, "learning_rate": 5.244930879260062e-06, "loss": 0.2624, "step": 40965 }, { "epoch": 4.777285447761194, "grad_norm": 0.5592016814123764, "learning_rate": 5.243655847008888e-06, "loss": 0.2617, "step": 40970 }, { "epoch": 4.7778684701492535, "grad_norm": 0.58770568209626, "learning_rate": 5.242384124062848e-06, "loss": 0.2651, "step": 40975 }, { "epoch": 4.778451492537314, "grad_norm": 0.5386650433973039, "learning_rate": 5.241115710611033e-06, "loss": 0.24, "step": 40980 }, { "epoch": 4.779034514925373, "grad_norm": 0.5263463685860263, "learning_rate": 5.239850606842045e-06, "loss": 0.2522, "step": 40985 }, { "epoch": 4.779617537313433, "grad_norm": 0.49546510753620826, "learning_rate": 5.2385888129439934e-06, "loss": 0.2808, "step": 40990 }, { "epoch": 4.780200559701493, "grad_norm": 0.5160900866158508, "learning_rate": 5.237330329104494e-06, "loss": 0.268, "step": 40995 }, { "epoch": 4.780783582089552, "grad_norm": 0.5393541779676191, "learning_rate": 5.236075155510675e-06, "loss": 0.2731, "step": 41000 }, { "epoch": 4.781366604477612, "grad_norm": 0.5075386549133473, "learning_rate": 5.234823292349164e-06, "loss": 0.2517, "step": 41005 }, { "epoch": 4.781949626865671, "grad_norm": 0.5220473258091382, "learning_rate": 5.2335747398061e-06, "loss": 0.2504, "step": 41010 }, { "epoch": 4.7825326492537314, "grad_norm": 0.5231653854748372, "learning_rate": 5.2323294980671375e-06, "loss": 0.2535, "step": 41015 }, { "epoch": 4.7831156716417915, "grad_norm": 0.5170973159021661, "learning_rate": 5.231087567317425e-06, "loss": 0.2579, "step": 41020 }, { "epoch": 4.783698694029851, "grad_norm": 0.4787404341100575, "learning_rate": 5.229848947741629e-06, "loss": 0.271, "step": 41025 }, { "epoch": 4.784281716417911, "grad_norm": 0.5512414149848892, "learning_rate": 5.228613639523922e-06, "loss": 0.2701, "step": 41030 }, { "epoch": 4.78486473880597, "grad_norm": 0.5135871981620789, "learning_rate": 5.2273816428479785e-06, "loss": 0.2448, "step": 41035 }, { "epoch": 4.78544776119403, "grad_norm": 0.49858436633604963, "learning_rate": 5.2261529578969905e-06, "loss": 0.2445, "step": 41040 }, { "epoch": 4.786030783582089, "grad_norm": 0.5510471618902676, "learning_rate": 5.224927584853641e-06, "loss": 0.2497, "step": 41045 }, { "epoch": 4.786613805970149, "grad_norm": 0.8113177787151018, "learning_rate": 5.223705523900145e-06, "loss": 0.2595, "step": 41050 }, { "epoch": 4.7871968283582085, "grad_norm": 0.5028236402233456, "learning_rate": 5.2224867752181995e-06, "loss": 0.2622, "step": 41055 }, { "epoch": 4.7877798507462686, "grad_norm": 0.5069885429192494, "learning_rate": 5.22127133898903e-06, "loss": 0.2522, "step": 41060 }, { "epoch": 4.788362873134329, "grad_norm": 0.5164495970262721, "learning_rate": 5.220059215393352e-06, "loss": 0.2786, "step": 41065 }, { "epoch": 4.788945895522388, "grad_norm": 0.4950288137723687, "learning_rate": 5.2188504046114005e-06, "loss": 0.2586, "step": 41070 }, { "epoch": 4.789528917910448, "grad_norm": 0.546256706251749, "learning_rate": 5.217644906822914e-06, "loss": 0.2624, "step": 41075 }, { "epoch": 4.790111940298507, "grad_norm": 0.5324037872923388, "learning_rate": 5.216442722207141e-06, "loss": 0.2586, "step": 41080 }, { "epoch": 4.790694962686567, "grad_norm": 0.5215843825027218, "learning_rate": 5.215243850942831e-06, "loss": 0.2544, "step": 41085 }, { "epoch": 4.791277985074627, "grad_norm": 0.5247919475166944, "learning_rate": 5.214048293208246e-06, "loss": 0.2689, "step": 41090 }, { "epoch": 4.791861007462686, "grad_norm": 0.5457803597659311, "learning_rate": 5.212856049181154e-06, "loss": 0.2775, "step": 41095 }, { "epoch": 4.7924440298507465, "grad_norm": 0.5500037743985086, "learning_rate": 5.211667119038829e-06, "loss": 0.2669, "step": 41100 }, { "epoch": 4.793027052238806, "grad_norm": 0.5141040864901906, "learning_rate": 5.210481502958057e-06, "loss": 0.2412, "step": 41105 }, { "epoch": 4.793610074626866, "grad_norm": 0.5085772527750257, "learning_rate": 5.209299201115125e-06, "loss": 0.2677, "step": 41110 }, { "epoch": 4.794193097014926, "grad_norm": 0.5400621924271326, "learning_rate": 5.2081202136858296e-06, "loss": 0.2602, "step": 41115 }, { "epoch": 4.794776119402985, "grad_norm": 0.5204024070669308, "learning_rate": 5.206944540845476e-06, "loss": 0.2545, "step": 41120 }, { "epoch": 4.795359141791045, "grad_norm": 0.5752393003235707, "learning_rate": 5.205772182768876e-06, "loss": 0.2572, "step": 41125 }, { "epoch": 4.795942164179104, "grad_norm": 0.5273295672290484, "learning_rate": 5.204603139630345e-06, "loss": 0.2741, "step": 41130 }, { "epoch": 4.796525186567164, "grad_norm": 0.5809780830806073, "learning_rate": 5.2034374116037146e-06, "loss": 0.2711, "step": 41135 }, { "epoch": 4.7971082089552235, "grad_norm": 0.5082961951923609, "learning_rate": 5.202274998862312e-06, "loss": 0.2605, "step": 41140 }, { "epoch": 4.797691231343284, "grad_norm": 0.5000814687993748, "learning_rate": 5.2011159015789775e-06, "loss": 0.2626, "step": 41145 }, { "epoch": 4.798274253731344, "grad_norm": 0.5607675581199246, "learning_rate": 5.199960119926059e-06, "loss": 0.265, "step": 41150 }, { "epoch": 4.798857276119403, "grad_norm": 0.5137968339531936, "learning_rate": 5.19880765407541e-06, "loss": 0.2494, "step": 41155 }, { "epoch": 4.799440298507463, "grad_norm": 0.593887723657077, "learning_rate": 5.197658504198392e-06, "loss": 0.2768, "step": 41160 }, { "epoch": 4.800023320895522, "grad_norm": 0.5074639536519121, "learning_rate": 5.19651267046587e-06, "loss": 0.2584, "step": 41165 }, { "epoch": 4.800606343283582, "grad_norm": 0.5227648618254918, "learning_rate": 5.1953701530482215e-06, "loss": 0.2636, "step": 41170 }, { "epoch": 4.801189365671641, "grad_norm": 0.5476621550591564, "learning_rate": 5.194230952115327e-06, "loss": 0.2688, "step": 41175 }, { "epoch": 4.801772388059701, "grad_norm": 0.5210770379297213, "learning_rate": 5.1930950678365715e-06, "loss": 0.2854, "step": 41180 }, { "epoch": 4.8023554104477615, "grad_norm": 0.5216575254526342, "learning_rate": 5.191962500380854e-06, "loss": 0.262, "step": 41185 }, { "epoch": 4.802938432835821, "grad_norm": 0.5171537386595773, "learning_rate": 5.190833249916577e-06, "loss": 0.2733, "step": 41190 }, { "epoch": 4.803521455223881, "grad_norm": 0.5813908164077521, "learning_rate": 5.189707316611646e-06, "loss": 0.2495, "step": 41195 }, { "epoch": 4.80410447761194, "grad_norm": 0.5586557732836941, "learning_rate": 5.188584700633478e-06, "loss": 0.241, "step": 41200 }, { "epoch": 4.8046875, "grad_norm": 0.5704331772628655, "learning_rate": 5.187465402148996e-06, "loss": 0.2714, "step": 41205 }, { "epoch": 4.80527052238806, "grad_norm": 0.5038053712504418, "learning_rate": 5.186349421324627e-06, "loss": 0.2521, "step": 41210 }, { "epoch": 4.805853544776119, "grad_norm": 0.48991005015762856, "learning_rate": 5.185236758326307e-06, "loss": 0.2465, "step": 41215 }, { "epoch": 4.806436567164179, "grad_norm": 0.5590184867533937, "learning_rate": 5.184127413319482e-06, "loss": 0.2564, "step": 41220 }, { "epoch": 4.8070195895522385, "grad_norm": 0.5390856299523609, "learning_rate": 5.183021386469096e-06, "loss": 0.2651, "step": 41225 }, { "epoch": 4.807602611940299, "grad_norm": 0.5153705347905203, "learning_rate": 5.181918677939608e-06, "loss": 0.2394, "step": 41230 }, { "epoch": 4.808185634328359, "grad_norm": 0.5204901147551954, "learning_rate": 5.180819287894979e-06, "loss": 0.2488, "step": 41235 }, { "epoch": 4.808768656716418, "grad_norm": 0.5024091636388212, "learning_rate": 5.179723216498677e-06, "loss": 0.2562, "step": 41240 }, { "epoch": 4.809351679104478, "grad_norm": 0.5366827271698726, "learning_rate": 5.178630463913678e-06, "loss": 0.2663, "step": 41245 }, { "epoch": 4.809934701492537, "grad_norm": 0.5671695408736948, "learning_rate": 5.177541030302462e-06, "loss": 0.2702, "step": 41250 }, { "epoch": 4.810517723880597, "grad_norm": 0.5516268347808859, "learning_rate": 5.176454915827022e-06, "loss": 0.2582, "step": 41255 }, { "epoch": 4.811100746268656, "grad_norm": 0.5179267846099194, "learning_rate": 5.17537212064885e-06, "loss": 0.2809, "step": 41260 }, { "epoch": 4.811683768656716, "grad_norm": 0.5722368640770885, "learning_rate": 5.174292644928947e-06, "loss": 0.2552, "step": 41265 }, { "epoch": 4.8122667910447765, "grad_norm": 0.5146685148838908, "learning_rate": 5.173216488827822e-06, "loss": 0.2505, "step": 41270 }, { "epoch": 4.812849813432836, "grad_norm": 0.5008876262394624, "learning_rate": 5.172143652505484e-06, "loss": 0.2469, "step": 41275 }, { "epoch": 4.813432835820896, "grad_norm": 0.5546237872143527, "learning_rate": 5.171074136121461e-06, "loss": 0.2642, "step": 41280 }, { "epoch": 4.814015858208955, "grad_norm": 0.5142335572112503, "learning_rate": 5.170007939834775e-06, "loss": 0.26, "step": 41285 }, { "epoch": 4.814598880597015, "grad_norm": 0.49684680340233983, "learning_rate": 5.168945063803962e-06, "loss": 0.2638, "step": 41290 }, { "epoch": 4.815181902985074, "grad_norm": 0.5244666706893247, "learning_rate": 5.167885508187059e-06, "loss": 0.2554, "step": 41295 }, { "epoch": 4.815764925373134, "grad_norm": 0.6252559958215652, "learning_rate": 5.166829273141612e-06, "loss": 0.2836, "step": 41300 }, { "epoch": 4.816347947761194, "grad_norm": 0.4941692603058814, "learning_rate": 5.165776358824675e-06, "loss": 0.2483, "step": 41305 }, { "epoch": 4.8169309701492535, "grad_norm": 0.490494548706234, "learning_rate": 5.164726765392805e-06, "loss": 0.2504, "step": 41310 }, { "epoch": 4.817513992537314, "grad_norm": 0.5106466529862049, "learning_rate": 5.163680493002067e-06, "loss": 0.25, "step": 41315 }, { "epoch": 4.818097014925373, "grad_norm": 0.5760332400543562, "learning_rate": 5.162637541808031e-06, "loss": 0.2742, "step": 41320 }, { "epoch": 4.818680037313433, "grad_norm": 0.5549388778893086, "learning_rate": 5.161597911965775e-06, "loss": 0.2693, "step": 41325 }, { "epoch": 4.819263059701493, "grad_norm": 0.5452292126226116, "learning_rate": 5.16056160362988e-06, "loss": 0.2658, "step": 41330 }, { "epoch": 4.819846082089552, "grad_norm": 0.49410615896944976, "learning_rate": 5.159528616954435e-06, "loss": 0.2529, "step": 41335 }, { "epoch": 4.820429104477612, "grad_norm": 0.5279706776887558, "learning_rate": 5.158498952093038e-06, "loss": 0.2605, "step": 41340 }, { "epoch": 4.821012126865671, "grad_norm": 0.6082692977800148, "learning_rate": 5.157472609198789e-06, "loss": 0.2657, "step": 41345 }, { "epoch": 4.8215951492537314, "grad_norm": 0.5572523430582565, "learning_rate": 5.156449588424295e-06, "loss": 0.2571, "step": 41350 }, { "epoch": 4.8221781716417915, "grad_norm": 0.5309507134319285, "learning_rate": 5.155429889921669e-06, "loss": 0.2394, "step": 41355 }, { "epoch": 4.822761194029851, "grad_norm": 0.538160114032342, "learning_rate": 5.154413513842533e-06, "loss": 0.2489, "step": 41360 }, { "epoch": 4.823344216417911, "grad_norm": 0.5687588679402671, "learning_rate": 5.153400460338007e-06, "loss": 0.2599, "step": 41365 }, { "epoch": 4.82392723880597, "grad_norm": 0.5386766325848653, "learning_rate": 5.152390729558727e-06, "loss": 0.2595, "step": 41370 }, { "epoch": 4.82451026119403, "grad_norm": 0.5107716584296756, "learning_rate": 5.151384321654828e-06, "loss": 0.2608, "step": 41375 }, { "epoch": 4.825093283582089, "grad_norm": 0.5359963200171824, "learning_rate": 5.1503812367759575e-06, "loss": 0.2594, "step": 41380 }, { "epoch": 4.825676305970149, "grad_norm": 0.53938224010963, "learning_rate": 5.149381475071259e-06, "loss": 0.2704, "step": 41385 }, { "epoch": 4.8262593283582085, "grad_norm": 0.523481136237025, "learning_rate": 5.148385036689391e-06, "loss": 0.2655, "step": 41390 }, { "epoch": 4.8268423507462686, "grad_norm": 0.5531309878440063, "learning_rate": 5.14739192177851e-06, "loss": 0.2629, "step": 41395 }, { "epoch": 4.827425373134329, "grad_norm": 0.5487188548709266, "learning_rate": 5.146402130486288e-06, "loss": 0.2598, "step": 41400 }, { "epoch": 4.828008395522388, "grad_norm": 0.483868133541718, "learning_rate": 5.145415662959895e-06, "loss": 0.2519, "step": 41405 }, { "epoch": 4.828591417910448, "grad_norm": 0.5460956093552999, "learning_rate": 5.144432519346011e-06, "loss": 0.2628, "step": 41410 }, { "epoch": 4.829174440298507, "grad_norm": 0.4684887677128535, "learning_rate": 5.143452699790817e-06, "loss": 0.2573, "step": 41415 }, { "epoch": 4.829757462686567, "grad_norm": 0.5176221523453539, "learning_rate": 5.142476204440002e-06, "loss": 0.2541, "step": 41420 }, { "epoch": 4.830340485074627, "grad_norm": 0.542325095255919, "learning_rate": 5.141503033438769e-06, "loss": 0.271, "step": 41425 }, { "epoch": 4.830923507462686, "grad_norm": 0.5277846578558064, "learning_rate": 5.140533186931809e-06, "loss": 0.2643, "step": 41430 }, { "epoch": 4.8315065298507465, "grad_norm": 0.5046873588315237, "learning_rate": 5.139566665063337e-06, "loss": 0.261, "step": 41435 }, { "epoch": 4.832089552238806, "grad_norm": 0.5400667280080967, "learning_rate": 5.138603467977062e-06, "loss": 0.2538, "step": 41440 }, { "epoch": 4.832672574626866, "grad_norm": 0.5617397499863674, "learning_rate": 5.137643595816202e-06, "loss": 0.2638, "step": 41445 }, { "epoch": 4.833255597014926, "grad_norm": 0.5542382982549092, "learning_rate": 5.136687048723483e-06, "loss": 0.2605, "step": 41450 }, { "epoch": 4.833838619402985, "grad_norm": 0.5600659457148531, "learning_rate": 5.135733826841131e-06, "loss": 0.2695, "step": 41455 }, { "epoch": 4.834421641791045, "grad_norm": 0.5662640252754004, "learning_rate": 5.134783930310883e-06, "loss": 0.2864, "step": 41460 }, { "epoch": 4.835004664179104, "grad_norm": 0.5118629372594454, "learning_rate": 5.13383735927398e-06, "loss": 0.251, "step": 41465 }, { "epoch": 4.835587686567164, "grad_norm": 0.5466234570907842, "learning_rate": 5.132894113871167e-06, "loss": 0.2764, "step": 41470 }, { "epoch": 4.8361707089552235, "grad_norm": 0.4682026764983122, "learning_rate": 5.131954194242696e-06, "loss": 0.2511, "step": 41475 }, { "epoch": 4.836753731343284, "grad_norm": 0.5064540164484578, "learning_rate": 5.131017600528324e-06, "loss": 0.2595, "step": 41480 }, { "epoch": 4.837336753731344, "grad_norm": 0.5180824294976669, "learning_rate": 5.130084332867315e-06, "loss": 0.2643, "step": 41485 }, { "epoch": 4.837919776119403, "grad_norm": 0.5387925620436101, "learning_rate": 5.129154391398433e-06, "loss": 0.2577, "step": 41490 }, { "epoch": 4.838502798507463, "grad_norm": 0.5343000066978911, "learning_rate": 5.128227776259953e-06, "loss": 0.2637, "step": 41495 }, { "epoch": 4.839085820895522, "grad_norm": 0.5690264309006462, "learning_rate": 5.127304487589658e-06, "loss": 0.273, "step": 41500 }, { "epoch": 4.839668843283582, "grad_norm": 0.5326039894970697, "learning_rate": 5.126384525524826e-06, "loss": 0.2596, "step": 41505 }, { "epoch": 4.840251865671641, "grad_norm": 0.5213989444377917, "learning_rate": 5.12546789020225e-06, "loss": 0.2576, "step": 41510 }, { "epoch": 4.840834888059701, "grad_norm": 0.5202532610547281, "learning_rate": 5.124554581758225e-06, "loss": 0.2698, "step": 41515 }, { "epoch": 4.8414179104477615, "grad_norm": 0.5381840289433081, "learning_rate": 5.123644600328549e-06, "loss": 0.2522, "step": 41520 }, { "epoch": 4.842000932835821, "grad_norm": 0.5360853614588602, "learning_rate": 5.122737946048531e-06, "loss": 0.2514, "step": 41525 }, { "epoch": 4.842583955223881, "grad_norm": 0.5440661029265731, "learning_rate": 5.121834619052979e-06, "loss": 0.2667, "step": 41530 }, { "epoch": 4.84316697761194, "grad_norm": 0.6222529165472798, "learning_rate": 5.1209346194762086e-06, "loss": 0.2767, "step": 41535 }, { "epoch": 4.84375, "grad_norm": 0.5104595406737876, "learning_rate": 5.120037947452043e-06, "loss": 0.2536, "step": 41540 }, { "epoch": 4.84433302238806, "grad_norm": 0.5454977030369529, "learning_rate": 5.119144603113809e-06, "loss": 0.2633, "step": 41545 }, { "epoch": 4.844916044776119, "grad_norm": 0.4894074696870294, "learning_rate": 5.118254586594335e-06, "loss": 0.239, "step": 41550 }, { "epoch": 4.845499067164179, "grad_norm": 0.5171996109115735, "learning_rate": 5.117367898025964e-06, "loss": 0.258, "step": 41555 }, { "epoch": 4.8460820895522385, "grad_norm": 0.4713522480727568, "learning_rate": 5.116484537540532e-06, "loss": 0.2538, "step": 41560 }, { "epoch": 4.846665111940299, "grad_norm": 0.5275016619452988, "learning_rate": 5.115604505269388e-06, "loss": 0.2529, "step": 41565 }, { "epoch": 4.847248134328359, "grad_norm": 0.5513663378704136, "learning_rate": 5.114727801343385e-06, "loss": 0.2599, "step": 41570 }, { "epoch": 4.847831156716418, "grad_norm": 0.4682971768023134, "learning_rate": 5.113854425892884e-06, "loss": 0.2617, "step": 41575 }, { "epoch": 4.848414179104478, "grad_norm": 0.5358951952426247, "learning_rate": 5.11298437904774e-06, "loss": 0.2743, "step": 41580 }, { "epoch": 4.848997201492537, "grad_norm": 0.5157316494416937, "learning_rate": 5.112117660937328e-06, "loss": 0.2599, "step": 41585 }, { "epoch": 4.849580223880597, "grad_norm": 0.5944301217961508, "learning_rate": 5.111254271690516e-06, "loss": 0.2837, "step": 41590 }, { "epoch": 4.850163246268656, "grad_norm": 0.5256746516596107, "learning_rate": 5.110394211435682e-06, "loss": 0.2594, "step": 41595 }, { "epoch": 4.850746268656716, "grad_norm": 0.4762454762299095, "learning_rate": 5.1095374803007115e-06, "loss": 0.2433, "step": 41600 }, { "epoch": 4.8513292910447765, "grad_norm": 0.5734092388377269, "learning_rate": 5.1086840784129895e-06, "loss": 0.257, "step": 41605 }, { "epoch": 4.851912313432836, "grad_norm": 0.5445861592902486, "learning_rate": 5.107834005899409e-06, "loss": 0.2732, "step": 41610 }, { "epoch": 4.852495335820896, "grad_norm": 0.5224875973279762, "learning_rate": 5.106987262886371e-06, "loss": 0.2631, "step": 41615 }, { "epoch": 4.853078358208955, "grad_norm": 0.5095682730083747, "learning_rate": 5.1061438494997726e-06, "loss": 0.2466, "step": 41620 }, { "epoch": 4.853661380597015, "grad_norm": 0.4893920407590629, "learning_rate": 5.105303765865026e-06, "loss": 0.267, "step": 41625 }, { "epoch": 4.854244402985074, "grad_norm": 0.5288169382606015, "learning_rate": 5.104467012107041e-06, "loss": 0.27, "step": 41630 }, { "epoch": 4.854827425373134, "grad_norm": 0.47419288178148694, "learning_rate": 5.103633588350236e-06, "loss": 0.2412, "step": 41635 }, { "epoch": 4.855410447761194, "grad_norm": 0.5475481127412534, "learning_rate": 5.102803494718532e-06, "loss": 0.2803, "step": 41640 }, { "epoch": 4.8559934701492535, "grad_norm": 0.5035109227563629, "learning_rate": 5.1019767313353564e-06, "loss": 0.254, "step": 41645 }, { "epoch": 4.856576492537314, "grad_norm": 0.5848075197635482, "learning_rate": 5.101153298323643e-06, "loss": 0.269, "step": 41650 }, { "epoch": 4.857159514925373, "grad_norm": 0.5471125756188394, "learning_rate": 5.100333195805823e-06, "loss": 0.2648, "step": 41655 }, { "epoch": 4.857742537313433, "grad_norm": 0.5511916141410351, "learning_rate": 5.099516423903844e-06, "loss": 0.2685, "step": 41660 }, { "epoch": 4.858325559701493, "grad_norm": 0.5121656009662893, "learning_rate": 5.098702982739151e-06, "loss": 0.2532, "step": 41665 }, { "epoch": 4.858908582089552, "grad_norm": 0.4893711976943801, "learning_rate": 5.097892872432691e-06, "loss": 0.2389, "step": 41670 }, { "epoch": 4.859491604477612, "grad_norm": 0.5103260050459835, "learning_rate": 5.097086093104924e-06, "loss": 0.2606, "step": 41675 }, { "epoch": 4.860074626865671, "grad_norm": 0.5045149224836615, "learning_rate": 5.096282644875807e-06, "loss": 0.2411, "step": 41680 }, { "epoch": 4.8606576492537314, "grad_norm": 0.546932084264033, "learning_rate": 5.095482527864808e-06, "loss": 0.2622, "step": 41685 }, { "epoch": 4.8612406716417915, "grad_norm": 0.5905858554988268, "learning_rate": 5.094685742190896e-06, "loss": 0.2475, "step": 41690 }, { "epoch": 4.861823694029851, "grad_norm": 0.5635884405524049, "learning_rate": 5.09389228797254e-06, "loss": 0.2578, "step": 41695 }, { "epoch": 4.862406716417911, "grad_norm": 0.5446643924445254, "learning_rate": 5.093102165327729e-06, "loss": 0.2711, "step": 41700 }, { "epoch": 4.86298973880597, "grad_norm": 0.5388030732595104, "learning_rate": 5.092315374373937e-06, "loss": 0.2605, "step": 41705 }, { "epoch": 4.86357276119403, "grad_norm": 0.5104379618728769, "learning_rate": 5.09153191522816e-06, "loss": 0.2506, "step": 41710 }, { "epoch": 4.864155783582089, "grad_norm": 0.5442195807888268, "learning_rate": 5.090751788006885e-06, "loss": 0.2565, "step": 41715 }, { "epoch": 4.864738805970149, "grad_norm": 0.5151659097617564, "learning_rate": 5.089974992826117e-06, "loss": 0.2546, "step": 41720 }, { "epoch": 4.8653218283582085, "grad_norm": 0.5475812818819479, "learning_rate": 5.0892015298013485e-06, "loss": 0.2628, "step": 41725 }, { "epoch": 4.8659048507462686, "grad_norm": 0.5611781041571859, "learning_rate": 5.08843139904759e-06, "loss": 0.2607, "step": 41730 }, { "epoch": 4.866487873134329, "grad_norm": 0.562604530991125, "learning_rate": 5.087664600679356e-06, "loss": 0.2703, "step": 41735 }, { "epoch": 4.867070895522388, "grad_norm": 0.4998378795657582, "learning_rate": 5.086901134810658e-06, "loss": 0.2408, "step": 41740 }, { "epoch": 4.867653917910448, "grad_norm": 0.4891612223302177, "learning_rate": 5.086141001555016e-06, "loss": 0.2463, "step": 41745 }, { "epoch": 4.868236940298507, "grad_norm": 0.49336471996126907, "learning_rate": 5.085384201025457e-06, "loss": 0.2739, "step": 41750 }, { "epoch": 4.868819962686567, "grad_norm": 0.5629493957193611, "learning_rate": 5.084630733334508e-06, "loss": 0.2798, "step": 41755 }, { "epoch": 4.869402985074627, "grad_norm": 0.5516219780327767, "learning_rate": 5.083880598594204e-06, "loss": 0.2698, "step": 41760 }, { "epoch": 4.869986007462686, "grad_norm": 0.5332212008821925, "learning_rate": 5.083133796916081e-06, "loss": 0.2549, "step": 41765 }, { "epoch": 4.8705690298507465, "grad_norm": 0.5009266210637915, "learning_rate": 5.082390328411184e-06, "loss": 0.2572, "step": 41770 }, { "epoch": 4.871152052238806, "grad_norm": 0.526802423608002, "learning_rate": 5.081650193190057e-06, "loss": 0.2671, "step": 41775 }, { "epoch": 4.871735074626866, "grad_norm": 0.5137792697456302, "learning_rate": 5.080913391362749e-06, "loss": 0.2627, "step": 41780 }, { "epoch": 4.872318097014926, "grad_norm": 0.5984306399966873, "learning_rate": 5.080179923038822e-06, "loss": 0.2764, "step": 41785 }, { "epoch": 4.872901119402985, "grad_norm": 0.5094423400085093, "learning_rate": 5.079449788327332e-06, "loss": 0.2661, "step": 41790 }, { "epoch": 4.873484141791045, "grad_norm": 0.5427727261012149, "learning_rate": 5.078722987336843e-06, "loss": 0.2755, "step": 41795 }, { "epoch": 4.874067164179104, "grad_norm": 0.49115019336966714, "learning_rate": 5.0779995201754225e-06, "loss": 0.2501, "step": 41800 }, { "epoch": 4.874650186567164, "grad_norm": 0.4980343372980356, "learning_rate": 5.077279386950642e-06, "loss": 0.2569, "step": 41805 }, { "epoch": 4.8752332089552235, "grad_norm": 0.579200878426523, "learning_rate": 5.076562587769584e-06, "loss": 0.2848, "step": 41810 }, { "epoch": 4.875816231343284, "grad_norm": 0.5313901159630037, "learning_rate": 5.0758491227388235e-06, "loss": 0.2568, "step": 41815 }, { "epoch": 4.876399253731344, "grad_norm": 0.5160576663677819, "learning_rate": 5.07513899196445e-06, "loss": 0.2552, "step": 41820 }, { "epoch": 4.876982276119403, "grad_norm": 0.5357377913236523, "learning_rate": 5.074432195552053e-06, "loss": 0.2615, "step": 41825 }, { "epoch": 4.877565298507463, "grad_norm": 0.5228674064442037, "learning_rate": 5.073728733606722e-06, "loss": 0.2558, "step": 41830 }, { "epoch": 4.878148320895522, "grad_norm": 0.5488209572438764, "learning_rate": 5.073028606233059e-06, "loss": 0.2543, "step": 41835 }, { "epoch": 4.878731343283582, "grad_norm": 0.5475836927974559, "learning_rate": 5.072331813535166e-06, "loss": 0.2472, "step": 41840 }, { "epoch": 4.879314365671641, "grad_norm": 0.5122104063146348, "learning_rate": 5.071638355616648e-06, "loss": 0.2588, "step": 41845 }, { "epoch": 4.879897388059701, "grad_norm": 0.5236345664813228, "learning_rate": 5.070948232580618e-06, "loss": 0.2564, "step": 41850 }, { "epoch": 4.8804804104477615, "grad_norm": 0.5460049029970603, "learning_rate": 5.070261444529688e-06, "loss": 0.2517, "step": 41855 }, { "epoch": 4.881063432835821, "grad_norm": 0.5282890574808324, "learning_rate": 5.069577991565977e-06, "loss": 0.2445, "step": 41860 }, { "epoch": 4.881646455223881, "grad_norm": 0.5527711274915231, "learning_rate": 5.0688978737911085e-06, "loss": 0.2751, "step": 41865 }, { "epoch": 4.88222947761194, "grad_norm": 0.5873242083073615, "learning_rate": 5.06822109130621e-06, "loss": 0.2641, "step": 41870 }, { "epoch": 4.8828125, "grad_norm": 0.6454795207223442, "learning_rate": 5.067547644211914e-06, "loss": 0.2661, "step": 41875 }, { "epoch": 4.88339552238806, "grad_norm": 0.515405993205954, "learning_rate": 5.066877532608349e-06, "loss": 0.257, "step": 41880 }, { "epoch": 4.883978544776119, "grad_norm": 0.5775533229734363, "learning_rate": 5.066210756595164e-06, "loss": 0.2666, "step": 41885 }, { "epoch": 4.884561567164179, "grad_norm": 0.5682993188365679, "learning_rate": 5.065547316271494e-06, "loss": 0.2716, "step": 41890 }, { "epoch": 4.8851445895522385, "grad_norm": 0.5021547582884069, "learning_rate": 5.064887211735991e-06, "loss": 0.247, "step": 41895 }, { "epoch": 4.885727611940299, "grad_norm": 0.5351609081219216, "learning_rate": 5.064230443086805e-06, "loss": 0.2656, "step": 41900 }, { "epoch": 4.886310634328359, "grad_norm": 0.5185797141181296, "learning_rate": 5.0635770104215915e-06, "loss": 0.2601, "step": 41905 }, { "epoch": 4.886893656716418, "grad_norm": 0.4897415944298069, "learning_rate": 5.062926913837507e-06, "loss": 0.2647, "step": 41910 }, { "epoch": 4.887476679104478, "grad_norm": 0.5146341622967224, "learning_rate": 5.062280153431218e-06, "loss": 0.2483, "step": 41915 }, { "epoch": 4.888059701492537, "grad_norm": 0.5201069230866949, "learning_rate": 5.06163672929889e-06, "loss": 0.2639, "step": 41920 }, { "epoch": 4.888642723880597, "grad_norm": 0.49058031879353137, "learning_rate": 5.060996641536193e-06, "loss": 0.2613, "step": 41925 }, { "epoch": 4.889225746268656, "grad_norm": 0.5195891301651294, "learning_rate": 5.060359890238305e-06, "loss": 0.2673, "step": 41930 }, { "epoch": 4.889808768656716, "grad_norm": 0.5534183022778316, "learning_rate": 5.059726475499902e-06, "loss": 0.2527, "step": 41935 }, { "epoch": 4.8903917910447765, "grad_norm": 0.5331781774186816, "learning_rate": 5.059096397415167e-06, "loss": 0.2754, "step": 41940 }, { "epoch": 4.890974813432836, "grad_norm": 0.5142689622820151, "learning_rate": 5.058469656077789e-06, "loss": 0.2557, "step": 41945 }, { "epoch": 4.891557835820896, "grad_norm": 0.4939092021509841, "learning_rate": 5.057846251580957e-06, "loss": 0.2442, "step": 41950 }, { "epoch": 4.892140858208955, "grad_norm": 0.5580170055851186, "learning_rate": 5.057226184017362e-06, "loss": 0.2732, "step": 41955 }, { "epoch": 4.892723880597015, "grad_norm": 0.5560264363819188, "learning_rate": 5.056609453479208e-06, "loss": 0.2551, "step": 41960 }, { "epoch": 4.893306902985074, "grad_norm": 0.5396528546006861, "learning_rate": 5.055996060058192e-06, "loss": 0.2646, "step": 41965 }, { "epoch": 4.893889925373134, "grad_norm": 0.5068940114371213, "learning_rate": 5.055386003845524e-06, "loss": 0.2549, "step": 41970 }, { "epoch": 4.894472947761194, "grad_norm": 0.5745579405298369, "learning_rate": 5.054779284931909e-06, "loss": 0.277, "step": 41975 }, { "epoch": 4.8950559701492535, "grad_norm": 0.5806705260974102, "learning_rate": 5.0541759034075645e-06, "loss": 0.2495, "step": 41980 }, { "epoch": 4.895638992537314, "grad_norm": 0.5562877907163609, "learning_rate": 5.053575859362203e-06, "loss": 0.2797, "step": 41985 }, { "epoch": 4.896222014925373, "grad_norm": 0.50697607397542, "learning_rate": 5.0529791528850515e-06, "loss": 0.2529, "step": 41990 }, { "epoch": 4.896805037313433, "grad_norm": 0.5483589167480247, "learning_rate": 5.052385784064827e-06, "loss": 0.2516, "step": 41995 }, { "epoch": 4.897388059701493, "grad_norm": 0.5100587783263615, "learning_rate": 5.051795752989764e-06, "loss": 0.2535, "step": 42000 }, { "epoch": 4.897971082089552, "grad_norm": 0.5757002834872101, "learning_rate": 5.051209059747594e-06, "loss": 0.2696, "step": 42005 }, { "epoch": 4.898554104477612, "grad_norm": 0.5331593458553955, "learning_rate": 5.050625704425547e-06, "loss": 0.2623, "step": 42010 }, { "epoch": 4.899137126865671, "grad_norm": 0.5283321720941034, "learning_rate": 5.0500456871103686e-06, "loss": 0.2446, "step": 42015 }, { "epoch": 4.8997201492537314, "grad_norm": 0.5114177573958205, "learning_rate": 5.049469007888298e-06, "loss": 0.2493, "step": 42020 }, { "epoch": 4.9003031716417915, "grad_norm": 0.5679235645395738, "learning_rate": 5.048895666845084e-06, "loss": 0.2668, "step": 42025 }, { "epoch": 4.900886194029851, "grad_norm": 0.5652236312690536, "learning_rate": 5.048325664065975e-06, "loss": 0.2706, "step": 42030 }, { "epoch": 4.901469216417911, "grad_norm": 0.5449377388643637, "learning_rate": 5.047758999635728e-06, "loss": 0.2699, "step": 42035 }, { "epoch": 4.90205223880597, "grad_norm": 0.5568006658896507, "learning_rate": 5.047195673638596e-06, "loss": 0.277, "step": 42040 }, { "epoch": 4.90263526119403, "grad_norm": 0.5024409873880541, "learning_rate": 5.0466356861583445e-06, "loss": 0.2498, "step": 42045 }, { "epoch": 4.903218283582089, "grad_norm": 0.5619462307067028, "learning_rate": 5.046079037278237e-06, "loss": 0.2529, "step": 42050 }, { "epoch": 4.903801305970149, "grad_norm": 0.5265679100431098, "learning_rate": 5.0455257270810425e-06, "loss": 0.2604, "step": 42055 }, { "epoch": 4.9043843283582085, "grad_norm": 0.518928623322086, "learning_rate": 5.044975755649028e-06, "loss": 0.2544, "step": 42060 }, { "epoch": 4.9049673507462686, "grad_norm": 0.5215674212626602, "learning_rate": 5.044429123063977e-06, "loss": 0.2721, "step": 42065 }, { "epoch": 4.905550373134329, "grad_norm": 0.5200189509891983, "learning_rate": 5.043885829407164e-06, "loss": 0.259, "step": 42070 }, { "epoch": 4.906133395522388, "grad_norm": 0.5527420023829943, "learning_rate": 5.043345874759371e-06, "loss": 0.2555, "step": 42075 }, { "epoch": 4.906716417910448, "grad_norm": 0.5342382947782167, "learning_rate": 5.042809259200885e-06, "loss": 0.256, "step": 42080 }, { "epoch": 4.907299440298507, "grad_norm": 0.5141683380541643, "learning_rate": 5.042275982811495e-06, "loss": 0.267, "step": 42085 }, { "epoch": 4.907882462686567, "grad_norm": 0.5766851960493508, "learning_rate": 5.041746045670495e-06, "loss": 0.265, "step": 42090 }, { "epoch": 4.908465485074627, "grad_norm": 0.5319560178381133, "learning_rate": 5.041219447856681e-06, "loss": 0.2586, "step": 42095 }, { "epoch": 4.909048507462686, "grad_norm": 0.551495036550619, "learning_rate": 5.040696189448356e-06, "loss": 0.2508, "step": 42100 }, { "epoch": 4.9096315298507465, "grad_norm": 0.5028715651681404, "learning_rate": 5.040176270523318e-06, "loss": 0.2572, "step": 42105 }, { "epoch": 4.910214552238806, "grad_norm": 0.5854116582149756, "learning_rate": 5.039659691158878e-06, "loss": 0.2441, "step": 42110 }, { "epoch": 4.910797574626866, "grad_norm": 0.5241056006342661, "learning_rate": 5.039146451431845e-06, "loss": 0.2485, "step": 42115 }, { "epoch": 4.911380597014926, "grad_norm": 0.5299036291724838, "learning_rate": 5.038636551418533e-06, "loss": 0.26, "step": 42120 }, { "epoch": 4.911963619402985, "grad_norm": 0.5693228608809118, "learning_rate": 5.038129991194761e-06, "loss": 0.2702, "step": 42125 }, { "epoch": 4.912546641791045, "grad_norm": 0.540432097247365, "learning_rate": 5.0376267708358455e-06, "loss": 0.271, "step": 42130 }, { "epoch": 4.913129664179104, "grad_norm": 0.565345592234274, "learning_rate": 5.037126890416614e-06, "loss": 0.2781, "step": 42135 }, { "epoch": 4.913712686567164, "grad_norm": 0.5143268463409374, "learning_rate": 5.036630350011395e-06, "loss": 0.2669, "step": 42140 }, { "epoch": 4.9142957089552235, "grad_norm": 0.5714145458883612, "learning_rate": 5.036137149694013e-06, "loss": 0.275, "step": 42145 }, { "epoch": 4.914878731343284, "grad_norm": 0.510914157357852, "learning_rate": 5.03564728953781e-06, "loss": 0.2632, "step": 42150 }, { "epoch": 4.915461753731344, "grad_norm": 0.5008171389831337, "learning_rate": 5.035160769615618e-06, "loss": 0.2508, "step": 42155 }, { "epoch": 4.916044776119403, "grad_norm": 0.5749675729501972, "learning_rate": 5.034677589999783e-06, "loss": 0.2664, "step": 42160 }, { "epoch": 4.916627798507463, "grad_norm": 0.5233900101671892, "learning_rate": 5.034197750762141e-06, "loss": 0.2527, "step": 42165 }, { "epoch": 4.917210820895522, "grad_norm": 0.5376105637275604, "learning_rate": 5.033721251974047e-06, "loss": 0.2505, "step": 42170 }, { "epoch": 4.917793843283582, "grad_norm": 0.5028727183182046, "learning_rate": 5.03324809370635e-06, "loss": 0.2587, "step": 42175 }, { "epoch": 4.918376865671641, "grad_norm": 0.5708070256275176, "learning_rate": 5.032778276029403e-06, "loss": 0.2492, "step": 42180 }, { "epoch": 4.918959888059701, "grad_norm": 0.5284780496823562, "learning_rate": 5.032311799013064e-06, "loss": 0.237, "step": 42185 }, { "epoch": 4.9195429104477615, "grad_norm": 0.5437624915243243, "learning_rate": 5.031848662726692e-06, "loss": 0.2527, "step": 42190 }, { "epoch": 4.920125932835821, "grad_norm": 0.5272630207823388, "learning_rate": 5.031388867239153e-06, "loss": 0.2516, "step": 42195 }, { "epoch": 4.920708955223881, "grad_norm": 0.5609438463293948, "learning_rate": 5.030932412618815e-06, "loss": 0.2645, "step": 42200 }, { "epoch": 4.92129197761194, "grad_norm": 0.5523835138093406, "learning_rate": 5.030479298933544e-06, "loss": 0.2527, "step": 42205 }, { "epoch": 4.921875, "grad_norm": 0.5448438770973634, "learning_rate": 5.030029526250719e-06, "loss": 0.2696, "step": 42210 }, { "epoch": 4.92245802238806, "grad_norm": 0.5334394635727333, "learning_rate": 5.029583094637212e-06, "loss": 0.2708, "step": 42215 }, { "epoch": 4.923041044776119, "grad_norm": 0.5473799855053304, "learning_rate": 5.029140004159409e-06, "loss": 0.2665, "step": 42220 }, { "epoch": 4.923624067164179, "grad_norm": 0.5088585387852517, "learning_rate": 5.028700254883189e-06, "loss": 0.2617, "step": 42225 }, { "epoch": 4.9242070895522385, "grad_norm": 0.5526294910242032, "learning_rate": 5.028263846873938e-06, "loss": 0.2625, "step": 42230 }, { "epoch": 4.924790111940299, "grad_norm": 0.5488602735603476, "learning_rate": 5.027830780196549e-06, "loss": 0.2553, "step": 42235 }, { "epoch": 4.925373134328359, "grad_norm": 0.5256559656515214, "learning_rate": 5.02740105491541e-06, "loss": 0.2538, "step": 42240 }, { "epoch": 4.925956156716418, "grad_norm": 0.49572639691711673, "learning_rate": 5.026974671094422e-06, "loss": 0.2777, "step": 42245 }, { "epoch": 4.926539179104478, "grad_norm": 0.5130528437133531, "learning_rate": 5.026551628796982e-06, "loss": 0.276, "step": 42250 }, { "epoch": 4.927122201492537, "grad_norm": 0.5252623718875515, "learning_rate": 5.026131928085994e-06, "loss": 0.2675, "step": 42255 }, { "epoch": 4.927705223880597, "grad_norm": 0.5249616603820997, "learning_rate": 5.025715569023859e-06, "loss": 0.2512, "step": 42260 }, { "epoch": 4.928288246268656, "grad_norm": 0.49308468732337574, "learning_rate": 5.025302551672492e-06, "loss": 0.2573, "step": 42265 }, { "epoch": 4.928871268656716, "grad_norm": 0.5241375322385623, "learning_rate": 5.024892876093299e-06, "loss": 0.251, "step": 42270 }, { "epoch": 4.9294542910447765, "grad_norm": 0.536547646668262, "learning_rate": 5.024486542347199e-06, "loss": 0.2529, "step": 42275 }, { "epoch": 4.930037313432836, "grad_norm": 0.5432071331597679, "learning_rate": 5.024083550494606e-06, "loss": 0.2706, "step": 42280 }, { "epoch": 4.930620335820896, "grad_norm": 0.519888884714539, "learning_rate": 5.023683900595444e-06, "loss": 0.2606, "step": 42285 }, { "epoch": 4.931203358208955, "grad_norm": 0.5150429884830602, "learning_rate": 5.023287592709136e-06, "loss": 0.2551, "step": 42290 }, { "epoch": 4.931786380597015, "grad_norm": 0.5381892098962242, "learning_rate": 5.02289462689461e-06, "loss": 0.2575, "step": 42295 }, { "epoch": 4.932369402985074, "grad_norm": 0.5129812840405475, "learning_rate": 5.0225050032102965e-06, "loss": 0.2645, "step": 42300 }, { "epoch": 4.932952425373134, "grad_norm": 0.5270017766303224, "learning_rate": 5.022118721714127e-06, "loss": 0.2562, "step": 42305 }, { "epoch": 4.933535447761194, "grad_norm": 0.5607984557942124, "learning_rate": 5.021735782463537e-06, "loss": 0.2687, "step": 42310 }, { "epoch": 4.9341184701492535, "grad_norm": 0.48585619759926013, "learning_rate": 5.02135618551547e-06, "loss": 0.2686, "step": 42315 }, { "epoch": 4.934701492537314, "grad_norm": 0.5343965401543147, "learning_rate": 5.020979930926365e-06, "loss": 0.2575, "step": 42320 }, { "epoch": 4.935284514925373, "grad_norm": 0.5572109602036827, "learning_rate": 5.02060701875217e-06, "loss": 0.2587, "step": 42325 }, { "epoch": 4.935867537313433, "grad_norm": 0.4902847984858535, "learning_rate": 5.020237449048333e-06, "loss": 0.2561, "step": 42330 }, { "epoch": 4.936450559701493, "grad_norm": 0.5390598769697923, "learning_rate": 5.019871221869802e-06, "loss": 0.2684, "step": 42335 }, { "epoch": 4.937033582089552, "grad_norm": 0.5147941260934705, "learning_rate": 5.0195083372710345e-06, "loss": 0.2664, "step": 42340 }, { "epoch": 4.937616604477612, "grad_norm": 0.548980577215786, "learning_rate": 5.019148795305989e-06, "loss": 0.2526, "step": 42345 }, { "epoch": 4.938199626865671, "grad_norm": 0.5035908145770805, "learning_rate": 5.018792596028123e-06, "loss": 0.2587, "step": 42350 }, { "epoch": 4.9387826492537314, "grad_norm": 0.5386835530616152, "learning_rate": 5.018439739490402e-06, "loss": 0.2459, "step": 42355 }, { "epoch": 4.9393656716417915, "grad_norm": 0.5672610362941495, "learning_rate": 5.018090225745291e-06, "loss": 0.2659, "step": 42360 }, { "epoch": 4.939948694029851, "grad_norm": 0.5227959253547065, "learning_rate": 5.017744054844761e-06, "loss": 0.2733, "step": 42365 }, { "epoch": 4.940531716417911, "grad_norm": 0.5482815190946052, "learning_rate": 5.017401226840284e-06, "loss": 0.276, "step": 42370 }, { "epoch": 4.94111473880597, "grad_norm": 0.6095589249221928, "learning_rate": 5.017061741782833e-06, "loss": 0.2588, "step": 42375 }, { "epoch": 4.94169776119403, "grad_norm": 0.5800286959569063, "learning_rate": 5.016725599722889e-06, "loss": 0.2872, "step": 42380 }, { "epoch": 4.942280783582089, "grad_norm": 0.5277976550400842, "learning_rate": 5.016392800710434e-06, "loss": 0.2592, "step": 42385 }, { "epoch": 4.942863805970149, "grad_norm": 0.4926808393479529, "learning_rate": 5.016063344794947e-06, "loss": 0.2586, "step": 42390 }, { "epoch": 4.9434468283582085, "grad_norm": 0.5243174413474881, "learning_rate": 5.015737232025418e-06, "loss": 0.2509, "step": 42395 }, { "epoch": 4.9440298507462686, "grad_norm": 0.5444091475374001, "learning_rate": 5.0154144624503365e-06, "loss": 0.2645, "step": 42400 }, { "epoch": 4.944612873134329, "grad_norm": 0.5396246731925429, "learning_rate": 5.015095036117697e-06, "loss": 0.2712, "step": 42405 }, { "epoch": 4.945195895522388, "grad_norm": 0.5737818317841215, "learning_rate": 5.014778953074992e-06, "loss": 0.2674, "step": 42410 }, { "epoch": 4.945778917910448, "grad_norm": 0.5765872960092024, "learning_rate": 5.014466213369223e-06, "loss": 0.2559, "step": 42415 }, { "epoch": 4.946361940298507, "grad_norm": 0.5236453221462053, "learning_rate": 5.014156817046891e-06, "loss": 0.2715, "step": 42420 }, { "epoch": 4.946944962686567, "grad_norm": 0.5116473112764134, "learning_rate": 5.013850764153996e-06, "loss": 0.2594, "step": 42425 }, { "epoch": 4.947527985074627, "grad_norm": 0.5441658732781992, "learning_rate": 5.013548054736049e-06, "loss": 0.2743, "step": 42430 }, { "epoch": 4.948111007462686, "grad_norm": 0.5245587783681419, "learning_rate": 5.013248688838061e-06, "loss": 0.253, "step": 42435 }, { "epoch": 4.9486940298507465, "grad_norm": 0.5122101874152416, "learning_rate": 5.012952666504542e-06, "loss": 0.2705, "step": 42440 }, { "epoch": 4.949277052238806, "grad_norm": 0.5570048059800029, "learning_rate": 5.012659987779512e-06, "loss": 0.2562, "step": 42445 }, { "epoch": 4.949860074626866, "grad_norm": 0.5608195747463256, "learning_rate": 5.012370652706484e-06, "loss": 0.2738, "step": 42450 }, { "epoch": 4.950443097014926, "grad_norm": 0.49721929201135545, "learning_rate": 5.012084661328482e-06, "loss": 0.2539, "step": 42455 }, { "epoch": 4.951026119402985, "grad_norm": 0.5130788204130693, "learning_rate": 5.011802013688029e-06, "loss": 0.2817, "step": 42460 }, { "epoch": 4.951609141791045, "grad_norm": 0.5395718479680132, "learning_rate": 5.011522709827154e-06, "loss": 0.2619, "step": 42465 }, { "epoch": 4.952192164179104, "grad_norm": 0.5558579165824954, "learning_rate": 5.011246749787385e-06, "loss": 0.2768, "step": 42470 }, { "epoch": 4.952775186567164, "grad_norm": 0.48807134518878653, "learning_rate": 5.010974133609758e-06, "loss": 0.2497, "step": 42475 }, { "epoch": 4.9533582089552235, "grad_norm": 0.5410353509505209, "learning_rate": 5.010704861334803e-06, "loss": 0.2462, "step": 42480 }, { "epoch": 4.953941231343284, "grad_norm": 0.5297584301230719, "learning_rate": 5.010438933002563e-06, "loss": 0.2772, "step": 42485 }, { "epoch": 4.954524253731344, "grad_norm": 0.4885389228572895, "learning_rate": 5.010176348652576e-06, "loss": 0.2618, "step": 42490 }, { "epoch": 4.955107276119403, "grad_norm": 0.5265683653471985, "learning_rate": 5.009917108323885e-06, "loss": 0.2589, "step": 42495 }, { "epoch": 4.955690298507463, "grad_norm": 0.5488495005880858, "learning_rate": 5.0096612120550436e-06, "loss": 0.2698, "step": 42500 }, { "epoch": 4.956273320895522, "grad_norm": 0.500672598203943, "learning_rate": 5.009408659884092e-06, "loss": 0.2725, "step": 42505 }, { "epoch": 4.956856343283582, "grad_norm": 0.5649699889084325, "learning_rate": 5.009159451848587e-06, "loss": 0.2475, "step": 42510 }, { "epoch": 4.957439365671641, "grad_norm": 0.5068588579455506, "learning_rate": 5.008913587985581e-06, "loss": 0.2531, "step": 42515 }, { "epoch": 4.958022388059701, "grad_norm": 0.4904332591542818, "learning_rate": 5.008671068331634e-06, "loss": 0.2752, "step": 42520 }, { "epoch": 4.9586054104477615, "grad_norm": 0.5292395690628368, "learning_rate": 5.008431892922808e-06, "loss": 0.2569, "step": 42525 }, { "epoch": 4.959188432835821, "grad_norm": 0.5162847077380787, "learning_rate": 5.00819606179466e-06, "loss": 0.2517, "step": 42530 }, { "epoch": 4.959771455223881, "grad_norm": 0.511489903051284, "learning_rate": 5.007963574982264e-06, "loss": 0.2658, "step": 42535 }, { "epoch": 4.96035447761194, "grad_norm": 0.5597316656999656, "learning_rate": 5.007734432520179e-06, "loss": 0.2874, "step": 42540 }, { "epoch": 4.9609375, "grad_norm": 0.5351076117828275, "learning_rate": 5.0075086344424855e-06, "loss": 0.2746, "step": 42545 }, { "epoch": 4.96152052238806, "grad_norm": 0.49566378784578796, "learning_rate": 5.0072861807827505e-06, "loss": 0.2522, "step": 42550 }, { "epoch": 4.962103544776119, "grad_norm": 0.5434957468307663, "learning_rate": 5.007067071574053e-06, "loss": 0.2583, "step": 42555 }, { "epoch": 4.962686567164179, "grad_norm": 0.5753697977872646, "learning_rate": 5.0068513068489765e-06, "loss": 0.2733, "step": 42560 }, { "epoch": 4.9632695895522385, "grad_norm": 0.5272924317732932, "learning_rate": 5.006638886639597e-06, "loss": 0.2576, "step": 42565 }, { "epoch": 4.963852611940299, "grad_norm": 0.5191068335476884, "learning_rate": 5.0064298109775035e-06, "loss": 0.2493, "step": 42570 }, { "epoch": 4.964435634328359, "grad_norm": 0.5195820566025755, "learning_rate": 5.00622407989378e-06, "loss": 0.2593, "step": 42575 }, { "epoch": 4.965018656716418, "grad_norm": 0.5156457041364598, "learning_rate": 5.006021693419021e-06, "loss": 0.2662, "step": 42580 }, { "epoch": 4.965601679104478, "grad_norm": 0.49984629226854865, "learning_rate": 5.005822651583317e-06, "loss": 0.2709, "step": 42585 }, { "epoch": 4.966184701492537, "grad_norm": 0.5638317611211411, "learning_rate": 5.0056269544162635e-06, "loss": 0.2483, "step": 42590 }, { "epoch": 4.966767723880597, "grad_norm": 0.5253048930761857, "learning_rate": 5.005434601946959e-06, "loss": 0.2757, "step": 42595 }, { "epoch": 4.967350746268656, "grad_norm": 0.5406929590852747, "learning_rate": 5.0052455942040045e-06, "loss": 0.2537, "step": 42600 }, { "epoch": 4.967933768656716, "grad_norm": 0.5350349817512208, "learning_rate": 5.005059931215503e-06, "loss": 0.2689, "step": 42605 }, { "epoch": 4.9685167910447765, "grad_norm": 0.5309767938037251, "learning_rate": 5.004877613009064e-06, "loss": 0.2651, "step": 42610 }, { "epoch": 4.969099813432836, "grad_norm": 0.5355737554112076, "learning_rate": 5.004698639611792e-06, "loss": 0.253, "step": 42615 }, { "epoch": 4.969682835820896, "grad_norm": 0.5290801600001703, "learning_rate": 5.0045230110503e-06, "loss": 0.2555, "step": 42620 }, { "epoch": 4.970265858208955, "grad_norm": 0.4998835620223301, "learning_rate": 5.004350727350704e-06, "loss": 0.2478, "step": 42625 }, { "epoch": 4.970848880597015, "grad_norm": 0.5661328413618032, "learning_rate": 5.00418178853862e-06, "loss": 0.2726, "step": 42630 }, { "epoch": 4.971431902985074, "grad_norm": 0.5681877161289965, "learning_rate": 5.004016194639169e-06, "loss": 0.2646, "step": 42635 }, { "epoch": 4.972014925373134, "grad_norm": 0.5706374125383276, "learning_rate": 5.003853945676969e-06, "loss": 0.2676, "step": 42640 }, { "epoch": 4.972597947761194, "grad_norm": 0.5537135888616044, "learning_rate": 5.0036950416761485e-06, "loss": 0.2819, "step": 42645 }, { "epoch": 4.9731809701492535, "grad_norm": 0.5217565278454588, "learning_rate": 5.0035394826603345e-06, "loss": 0.2539, "step": 42650 }, { "epoch": 4.973763992537314, "grad_norm": 0.5104889223282257, "learning_rate": 5.003387268652657e-06, "loss": 0.25, "step": 42655 }, { "epoch": 4.974347014925373, "grad_norm": 0.535969910630488, "learning_rate": 5.003238399675746e-06, "loss": 0.2599, "step": 42660 }, { "epoch": 4.974930037313433, "grad_norm": 0.513665049975225, "learning_rate": 5.003092875751742e-06, "loss": 0.2537, "step": 42665 }, { "epoch": 4.975513059701493, "grad_norm": 0.5649291292140013, "learning_rate": 5.002950696902278e-06, "loss": 0.2632, "step": 42670 }, { "epoch": 4.976096082089552, "grad_norm": 0.507554516214919, "learning_rate": 5.0028118631485e-06, "loss": 0.2526, "step": 42675 }, { "epoch": 4.976679104477612, "grad_norm": 0.4737579168132423, "learning_rate": 5.002676374511046e-06, "loss": 0.251, "step": 42680 }, { "epoch": 4.977262126865671, "grad_norm": 0.47485280932488766, "learning_rate": 5.002544231010064e-06, "loss": 0.2561, "step": 42685 }, { "epoch": 4.9778451492537314, "grad_norm": 0.5233764045395329, "learning_rate": 5.0024154326652044e-06, "loss": 0.2726, "step": 42690 }, { "epoch": 4.9784281716417915, "grad_norm": 0.5136717520620079, "learning_rate": 5.002289979495614e-06, "loss": 0.2532, "step": 42695 }, { "epoch": 4.979011194029851, "grad_norm": 0.5438972770455104, "learning_rate": 5.002167871519951e-06, "loss": 0.2627, "step": 42700 }, { "epoch": 4.979594216417911, "grad_norm": 0.5374919797374826, "learning_rate": 5.00204910875637e-06, "loss": 0.2637, "step": 42705 }, { "epoch": 4.98017723880597, "grad_norm": 0.5618090151137566, "learning_rate": 5.001933691222527e-06, "loss": 0.2704, "step": 42710 }, { "epoch": 4.98076026119403, "grad_norm": 0.5400968553646867, "learning_rate": 5.001821618935589e-06, "loss": 0.2656, "step": 42715 }, { "epoch": 4.981343283582089, "grad_norm": 0.5356939870669652, "learning_rate": 5.001712891912217e-06, "loss": 0.2746, "step": 42720 }, { "epoch": 4.981926305970149, "grad_norm": 0.5589525144895989, "learning_rate": 5.001607510168576e-06, "loss": 0.253, "step": 42725 }, { "epoch": 4.9825093283582085, "grad_norm": 0.51423787731282, "learning_rate": 5.001505473720337e-06, "loss": 0.2588, "step": 42730 }, { "epoch": 4.9830923507462686, "grad_norm": 0.5416516453895129, "learning_rate": 5.001406782582673e-06, "loss": 0.2767, "step": 42735 }, { "epoch": 4.983675373134329, "grad_norm": 0.5138911139053113, "learning_rate": 5.001311436770255e-06, "loss": 0.2645, "step": 42740 }, { "epoch": 4.984258395522388, "grad_norm": 0.5367203943984868, "learning_rate": 5.001219436297262e-06, "loss": 0.2568, "step": 42745 }, { "epoch": 4.984841417910448, "grad_norm": 0.5501690738444324, "learning_rate": 5.001130781177377e-06, "loss": 0.2685, "step": 42750 }, { "epoch": 4.985424440298507, "grad_norm": 0.5373366414628004, "learning_rate": 5.0010454714237786e-06, "loss": 0.2662, "step": 42755 }, { "epoch": 4.986007462686567, "grad_norm": 0.5347366551370059, "learning_rate": 5.000963507049151e-06, "loss": 0.266, "step": 42760 }, { "epoch": 4.986590485074627, "grad_norm": 0.546458556108298, "learning_rate": 5.000884888065682e-06, "loss": 0.2691, "step": 42765 }, { "epoch": 4.987173507462686, "grad_norm": 0.5032987363405463, "learning_rate": 5.000809614485062e-06, "loss": 0.2651, "step": 42770 }, { "epoch": 4.9877565298507465, "grad_norm": 0.5424532656947771, "learning_rate": 5.0007376863184835e-06, "loss": 0.2492, "step": 42775 }, { "epoch": 4.988339552238806, "grad_norm": 0.5043402438100439, "learning_rate": 5.000669103576643e-06, "loss": 0.2589, "step": 42780 }, { "epoch": 4.988922574626866, "grad_norm": 0.5343459268973346, "learning_rate": 5.000603866269734e-06, "loss": 0.2615, "step": 42785 }, { "epoch": 4.989505597014926, "grad_norm": 0.5689999173772574, "learning_rate": 5.000541974407462e-06, "loss": 0.2596, "step": 42790 }, { "epoch": 4.990088619402985, "grad_norm": 0.5861951582292237, "learning_rate": 5.0004834279990245e-06, "loss": 0.2659, "step": 42795 }, { "epoch": 4.990671641791045, "grad_norm": 0.5023106712695729, "learning_rate": 5.000428227053131e-06, "loss": 0.26, "step": 42800 }, { "epoch": 4.991254664179104, "grad_norm": 0.5257901894197187, "learning_rate": 5.000376371577987e-06, "loss": 0.2604, "step": 42805 }, { "epoch": 4.991837686567164, "grad_norm": 0.4763993165887568, "learning_rate": 5.000327861581302e-06, "loss": 0.2486, "step": 42810 }, { "epoch": 4.9924207089552235, "grad_norm": 0.5167447453038132, "learning_rate": 5.000282697070291e-06, "loss": 0.2747, "step": 42815 }, { "epoch": 4.993003731343284, "grad_norm": 0.4866714281289037, "learning_rate": 5.000240878051671e-06, "loss": 0.2499, "step": 42820 }, { "epoch": 4.993586753731344, "grad_norm": 0.5005658570867595, "learning_rate": 5.000202404531656e-06, "loss": 0.2503, "step": 42825 }, { "epoch": 4.994169776119403, "grad_norm": 0.5380777958946619, "learning_rate": 5.0001672765159696e-06, "loss": 0.2529, "step": 42830 }, { "epoch": 4.994752798507463, "grad_norm": 0.5187253248059627, "learning_rate": 5.000135494009835e-06, "loss": 0.2748, "step": 42835 }, { "epoch": 4.995335820895522, "grad_norm": 0.5165834922078414, "learning_rate": 5.000107057017976e-06, "loss": 0.2754, "step": 42840 }, { "epoch": 4.995918843283582, "grad_norm": 0.5600181004782937, "learning_rate": 5.000081965544622e-06, "loss": 0.2637, "step": 42845 }, { "epoch": 4.996501865671641, "grad_norm": 0.5397015314691016, "learning_rate": 5.0000602195935046e-06, "loss": 0.2614, "step": 42850 }, { "epoch": 4.997084888059701, "grad_norm": 0.4935744319945381, "learning_rate": 5.000041819167857e-06, "loss": 0.252, "step": 42855 }, { "epoch": 4.9976679104477615, "grad_norm": 0.5591119193292369, "learning_rate": 5.000026764270413e-06, "loss": 0.2561, "step": 42860 }, { "epoch": 4.998250932835821, "grad_norm": 0.5626430914617118, "learning_rate": 5.000015054903415e-06, "loss": 0.2765, "step": 42865 }, { "epoch": 4.998833955223881, "grad_norm": 0.5479808963178789, "learning_rate": 5.0000066910686e-06, "loss": 0.2616, "step": 42870 }, { "epoch": 4.99941697761194, "grad_norm": 0.4935988139893575, "learning_rate": 5.0000016727672125e-06, "loss": 0.2403, "step": 42875 }, { "epoch": 5.0, "grad_norm": 1.2398633217504624, "learning_rate": 5e-06, "loss": 0.2729, "step": 42880 }, { "epoch": 5.0, "step": 42880, "total_flos": 2442892589137920.0, "train_loss": 0.2958939010054985, "train_runtime": 111352.539, "train_samples_per_second": 1.54, "train_steps_per_second": 0.385 } ], "logging_steps": 5, "max_steps": 42880, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2442892589137920.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }