diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": 0.6779661016949152, "best_model_checkpoint": "DF_Image_VIT_V1/checkpoint-13812", - "epoch": 4.0, + "epoch": 8.0, "eval_steps": 500, - "global_step": 18416, + "global_step": 36832, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -12932,19 +12932,12949 @@ "step": 18416 }, { - "epoch": 4.0, - "step": 18416, - "total_flos": 2.2832239820043387e+19, - "train_loss": 0.018179216772772535, - "train_runtime": 6460.7853, - "train_samples_per_second": 45.604, - "train_steps_per_second": 2.85 + "epoch": 4.000868809730669, + "grad_norm": 0.0007167569710873067, + "learning_rate": 2.5002715030408342e-05, + "loss": 0.0004, + "step": 18420 + }, + { + "epoch": 4.003040834057342, + "grad_norm": 0.0007184027344919741, + "learning_rate": 2.498913987836664e-05, + "loss": 0.0044, + "step": 18430 + }, + { + "epoch": 4.005212858384014, + "grad_norm": 0.0007228711619973183, + "learning_rate": 2.4975564726324935e-05, + "loss": 0.0001, + "step": 18440 + }, + { + "epoch": 4.007384882710686, + "grad_norm": 0.0007069294806569815, + "learning_rate": 2.4961989574283235e-05, + "loss": 0.004, + "step": 18450 + }, + { + "epoch": 4.009556907037359, + "grad_norm": 0.006892836652696133, + "learning_rate": 2.494841442224153e-05, + "loss": 0.0002, + "step": 18460 + }, + { + "epoch": 4.011728931364031, + "grad_norm": 0.0007597632356919348, + "learning_rate": 2.4934839270199828e-05, + "loss": 0.0415, + "step": 18470 + }, + { + "epoch": 4.013900955690704, + "grad_norm": 0.0008095133816823363, + "learning_rate": 2.4921264118158124e-05, + "loss": 0.0001, + "step": 18480 + }, + { + "epoch": 4.016072980017376, + "grad_norm": 0.0007956316112540662, + "learning_rate": 2.490768896611642e-05, + "loss": 0.0002, + "step": 18490 + }, + { + "epoch": 4.018245004344049, + "grad_norm": 0.000739986018743366, + "learning_rate": 2.4894113814074717e-05, + "loss": 0.0001, + "step": 18500 + }, + { + "epoch": 4.020417028670721, + "grad_norm": 0.000718973926268518, + "learning_rate": 2.4880538662033014e-05, + "loss": 0.0003, + "step": 18510 + }, + { + "epoch": 4.022589052997394, + "grad_norm": 0.00071329454658553, + "learning_rate": 2.486696350999131e-05, + "loss": 0.0001, + "step": 18520 + }, + { + "epoch": 4.024761077324066, + "grad_norm": 0.0007385624339804053, + "learning_rate": 2.485338835794961e-05, + "loss": 0.0001, + "step": 18530 + }, + { + "epoch": 4.026933101650738, + "grad_norm": 0.004547603894025087, + "learning_rate": 2.4839813205907907e-05, + "loss": 0.0001, + "step": 18540 + }, + { + "epoch": 4.029105125977411, + "grad_norm": 0.0011677941074594855, + "learning_rate": 2.4826238053866203e-05, + "loss": 0.0001, + "step": 18550 + }, + { + "epoch": 4.031277150304083, + "grad_norm": 0.004432608839124441, + "learning_rate": 2.4812662901824503e-05, + "loss": 0.0025, + "step": 18560 + }, + { + "epoch": 4.033449174630756, + "grad_norm": 0.0007672629435546696, + "learning_rate": 2.47990877497828e-05, + "loss": 0.0001, + "step": 18570 + }, + { + "epoch": 4.035621198957428, + "grad_norm": 0.09396348148584366, + "learning_rate": 2.4785512597741096e-05, + "loss": 0.0254, + "step": 18580 + }, + { + "epoch": 4.037793223284101, + "grad_norm": 0.0008097750251181424, + "learning_rate": 2.4771937445699393e-05, + "loss": 0.0006, + "step": 18590 + }, + { + "epoch": 4.039965247610773, + "grad_norm": 0.0007146692369133234, + "learning_rate": 2.4758362293657693e-05, + "loss": 0.0001, + "step": 18600 + }, + { + "epoch": 4.042137271937445, + "grad_norm": 0.0007175743812695146, + "learning_rate": 2.474478714161599e-05, + "loss": 0.001, + "step": 18610 + }, + { + "epoch": 4.044309296264118, + "grad_norm": 0.0007156123756431043, + "learning_rate": 2.4731211989574286e-05, + "loss": 0.0009, + "step": 18620 + }, + { + "epoch": 4.04648132059079, + "grad_norm": 0.0006997164455242455, + "learning_rate": 2.4717636837532582e-05, + "loss": 0.0005, + "step": 18630 + }, + { + "epoch": 4.048653344917463, + "grad_norm": 0.0007682847790420055, + "learning_rate": 2.470406168549088e-05, + "loss": 0.0006, + "step": 18640 + }, + { + "epoch": 4.050825369244135, + "grad_norm": 0.0006895299884490669, + "learning_rate": 2.4690486533449175e-05, + "loss": 0.0001, + "step": 18650 + }, + { + "epoch": 4.052997393570808, + "grad_norm": 0.0007155478233471513, + "learning_rate": 2.467691138140747e-05, + "loss": 0.0006, + "step": 18660 + }, + { + "epoch": 4.05516941789748, + "grad_norm": 0.012197580188512802, + "learning_rate": 2.4663336229365768e-05, + "loss": 0.0004, + "step": 18670 + }, + { + "epoch": 4.0573414422241525, + "grad_norm": 0.011634773574769497, + "learning_rate": 2.4649761077324068e-05, + "loss": 0.0088, + "step": 18680 + }, + { + "epoch": 4.0595134665508255, + "grad_norm": 0.0007289189961738884, + "learning_rate": 2.4636185925282365e-05, + "loss": 0.0005, + "step": 18690 + }, + { + "epoch": 4.0616854908774975, + "grad_norm": 0.0007233425858430564, + "learning_rate": 2.462261077324066e-05, + "loss": 0.0001, + "step": 18700 + }, + { + "epoch": 4.0638575152041705, + "grad_norm": 0.001152144162915647, + "learning_rate": 2.4609035621198958e-05, + "loss": 0.0115, + "step": 18710 + }, + { + "epoch": 4.066029539530843, + "grad_norm": 0.0007558057550340891, + "learning_rate": 2.4595460469157254e-05, + "loss": 0.0006, + "step": 18720 + }, + { + "epoch": 4.0682015638575155, + "grad_norm": 0.0007093999884091318, + "learning_rate": 2.458188531711555e-05, + "loss": 0.0002, + "step": 18730 + }, + { + "epoch": 4.070373588184188, + "grad_norm": 0.0007253269432112575, + "learning_rate": 2.456831016507385e-05, + "loss": 0.0005, + "step": 18740 + }, + { + "epoch": 4.072545612510861, + "grad_norm": 0.0006909583462402225, + "learning_rate": 2.4554735013032147e-05, + "loss": 0.0001, + "step": 18750 + }, + { + "epoch": 4.074717636837533, + "grad_norm": 0.0007387499208562076, + "learning_rate": 2.4541159860990447e-05, + "loss": 0.0002, + "step": 18760 + }, + { + "epoch": 4.076889661164205, + "grad_norm": 0.0006695879274047911, + "learning_rate": 2.4527584708948743e-05, + "loss": 0.0001, + "step": 18770 + }, + { + "epoch": 4.079061685490878, + "grad_norm": 0.0006833495572209358, + "learning_rate": 2.451400955690704e-05, + "loss": 0.0002, + "step": 18780 + }, + { + "epoch": 4.08123370981755, + "grad_norm": 0.0006686097476631403, + "learning_rate": 2.4500434404865336e-05, + "loss": 0.0052, + "step": 18790 + }, + { + "epoch": 4.083405734144223, + "grad_norm": 0.03649875894188881, + "learning_rate": 2.4486859252823633e-05, + "loss": 0.0003, + "step": 18800 + }, + { + "epoch": 4.085577758470895, + "grad_norm": 0.0007832238334231079, + "learning_rate": 2.447328410078193e-05, + "loss": 0.0041, + "step": 18810 + }, + { + "epoch": 4.087749782797568, + "grad_norm": 15.030851364135742, + "learning_rate": 2.4459708948740226e-05, + "loss": 0.0476, + "step": 18820 + }, + { + "epoch": 4.08992180712424, + "grad_norm": 0.007664125878363848, + "learning_rate": 2.4446133796698526e-05, + "loss": 0.0004, + "step": 18830 + }, + { + "epoch": 4.092093831450912, + "grad_norm": 0.0014251531101763248, + "learning_rate": 2.4432558644656822e-05, + "loss": 0.0103, + "step": 18840 + }, + { + "epoch": 4.094265855777585, + "grad_norm": 0.0033034952357411385, + "learning_rate": 2.441898349261512e-05, + "loss": 0.005, + "step": 18850 + }, + { + "epoch": 4.096437880104257, + "grad_norm": 0.0006803958094678819, + "learning_rate": 2.4405408340573415e-05, + "loss": 0.0001, + "step": 18860 + }, + { + "epoch": 4.09860990443093, + "grad_norm": 0.0006750720203854144, + "learning_rate": 2.4391833188531712e-05, + "loss": 0.0002, + "step": 18870 + }, + { + "epoch": 4.100781928757602, + "grad_norm": 0.0007022693753242493, + "learning_rate": 2.437825803649001e-05, + "loss": 0.0677, + "step": 18880 + }, + { + "epoch": 4.102953953084275, + "grad_norm": 0.000759601651225239, + "learning_rate": 2.4364682884448305e-05, + "loss": 0.0002, + "step": 18890 + }, + { + "epoch": 4.105125977410947, + "grad_norm": 0.0007145005511119962, + "learning_rate": 2.43511077324066e-05, + "loss": 0.0003, + "step": 18900 + }, + { + "epoch": 4.107298001737619, + "grad_norm": 0.008006428368389606, + "learning_rate": 2.43375325803649e-05, + "loss": 0.0315, + "step": 18910 + }, + { + "epoch": 4.109470026064292, + "grad_norm": 0.0008494790527038276, + "learning_rate": 2.4323957428323198e-05, + "loss": 0.0026, + "step": 18920 + }, + { + "epoch": 4.111642050390964, + "grad_norm": 0.0010820915922522545, + "learning_rate": 2.4310382276281494e-05, + "loss": 0.0001, + "step": 18930 + }, + { + "epoch": 4.113814074717637, + "grad_norm": 0.0007636768277734518, + "learning_rate": 2.4296807124239794e-05, + "loss": 0.0004, + "step": 18940 + }, + { + "epoch": 4.115986099044309, + "grad_norm": 0.0008369954884983599, + "learning_rate": 2.428323197219809e-05, + "loss": 0.0001, + "step": 18950 + }, + { + "epoch": 4.118158123370982, + "grad_norm": 0.0007214748184196651, + "learning_rate": 2.4269656820156387e-05, + "loss": 0.0002, + "step": 18960 + }, + { + "epoch": 4.120330147697654, + "grad_norm": 0.0007030196720734239, + "learning_rate": 2.4256081668114684e-05, + "loss": 0.0002, + "step": 18970 + }, + { + "epoch": 4.122502172024326, + "grad_norm": 0.0007046711980365217, + "learning_rate": 2.4242506516072984e-05, + "loss": 0.0001, + "step": 18980 + }, + { + "epoch": 4.124674196350999, + "grad_norm": 0.00932464748620987, + "learning_rate": 2.422893136403128e-05, + "loss": 0.0527, + "step": 18990 + }, + { + "epoch": 4.126846220677671, + "grad_norm": 0.04322976619005203, + "learning_rate": 2.4215356211989577e-05, + "loss": 0.0109, + "step": 19000 + }, + { + "epoch": 4.129018245004344, + "grad_norm": 0.0007810614770278335, + "learning_rate": 2.4201781059947873e-05, + "loss": 0.0013, + "step": 19010 + }, + { + "epoch": 4.131190269331016, + "grad_norm": 0.0007010008557699621, + "learning_rate": 2.418820590790617e-05, + "loss": 0.0122, + "step": 19020 + }, + { + "epoch": 4.133362293657689, + "grad_norm": 0.0007034821319393814, + "learning_rate": 2.4174630755864466e-05, + "loss": 0.0001, + "step": 19030 + }, + { + "epoch": 4.135534317984361, + "grad_norm": 0.0008014214690774679, + "learning_rate": 2.4161055603822763e-05, + "loss": 0.1098, + "step": 19040 + }, + { + "epoch": 4.137706342311034, + "grad_norm": 12.395751953125, + "learning_rate": 2.414748045178106e-05, + "loss": 0.0169, + "step": 19050 + }, + { + "epoch": 4.139878366637706, + "grad_norm": 0.0018553922418504953, + "learning_rate": 2.413390529973936e-05, + "loss": 0.01, + "step": 19060 + }, + { + "epoch": 4.142050390964378, + "grad_norm": 0.0010227859020233154, + "learning_rate": 2.4120330147697656e-05, + "loss": 0.0001, + "step": 19070 + }, + { + "epoch": 4.144222415291051, + "grad_norm": 0.0009623629739508033, + "learning_rate": 2.4106754995655952e-05, + "loss": 0.0016, + "step": 19080 + }, + { + "epoch": 4.1463944396177235, + "grad_norm": 0.0010376714635640383, + "learning_rate": 2.409317984361425e-05, + "loss": 0.0214, + "step": 19090 + }, + { + "epoch": 4.148566463944396, + "grad_norm": 0.0009919269941747189, + "learning_rate": 2.4079604691572545e-05, + "loss": 0.0001, + "step": 19100 + }, + { + "epoch": 4.1507384882710685, + "grad_norm": 0.0010005880612879992, + "learning_rate": 2.406602953953084e-05, + "loss": 0.0001, + "step": 19110 + }, + { + "epoch": 4.1529105125977415, + "grad_norm": 2.53873872756958, + "learning_rate": 2.405245438748914e-05, + "loss": 0.1289, + "step": 19120 + }, + { + "epoch": 4.1550825369244135, + "grad_norm": 0.04172717407345772, + "learning_rate": 2.4038879235447438e-05, + "loss": 0.0016, + "step": 19130 + }, + { + "epoch": 4.157254561251086, + "grad_norm": 0.019300812855362892, + "learning_rate": 2.4025304083405738e-05, + "loss": 0.0013, + "step": 19140 + }, + { + "epoch": 4.159426585577759, + "grad_norm": 0.036429353058338165, + "learning_rate": 2.4011728931364034e-05, + "loss": 0.0046, + "step": 19150 + }, + { + "epoch": 4.161598609904431, + "grad_norm": 20.21198081970215, + "learning_rate": 2.399815377932233e-05, + "loss": 0.0074, + "step": 19160 + }, + { + "epoch": 4.163770634231104, + "grad_norm": 0.009878740645945072, + "learning_rate": 2.3984578627280627e-05, + "loss": 0.0065, + "step": 19170 + }, + { + "epoch": 4.165942658557776, + "grad_norm": 0.005423377268016338, + "learning_rate": 2.3971003475238924e-05, + "loss": 0.0002, + "step": 19180 + }, + { + "epoch": 4.168114682884449, + "grad_norm": 0.00920186284929514, + "learning_rate": 2.395742832319722e-05, + "loss": 0.0012, + "step": 19190 + }, + { + "epoch": 4.170286707211121, + "grad_norm": 0.05071718618273735, + "learning_rate": 2.3943853171155517e-05, + "loss": 0.0005, + "step": 19200 + }, + { + "epoch": 4.172458731537793, + "grad_norm": 0.0018713108729571104, + "learning_rate": 2.3930278019113817e-05, + "loss": 0.0003, + "step": 19210 + }, + { + "epoch": 4.174630755864466, + "grad_norm": 0.0014962096465751529, + "learning_rate": 2.3916702867072113e-05, + "loss": 0.0003, + "step": 19220 + }, + { + "epoch": 4.176802780191138, + "grad_norm": 0.0009879091521725059, + "learning_rate": 2.390312771503041e-05, + "loss": 0.0002, + "step": 19230 + }, + { + "epoch": 4.178974804517811, + "grad_norm": 0.0009537252481095493, + "learning_rate": 2.3889552562988706e-05, + "loss": 0.0001, + "step": 19240 + }, + { + "epoch": 4.181146828844483, + "grad_norm": 0.000836131046526134, + "learning_rate": 2.3875977410947003e-05, + "loss": 0.0002, + "step": 19250 + }, + { + "epoch": 4.183318853171156, + "grad_norm": 0.0021473176311701536, + "learning_rate": 2.38624022589053e-05, + "loss": 0.0085, + "step": 19260 + }, + { + "epoch": 4.185490877497828, + "grad_norm": 0.02601959928870201, + "learning_rate": 2.3848827106863596e-05, + "loss": 0.0011, + "step": 19270 + }, + { + "epoch": 4.187662901824501, + "grad_norm": 0.0008759573684073985, + "learning_rate": 2.3835251954821892e-05, + "loss": 0.0001, + "step": 19280 + }, + { + "epoch": 4.189834926151173, + "grad_norm": 0.0007665369194000959, + "learning_rate": 2.3821676802780192e-05, + "loss": 0.0001, + "step": 19290 + }, + { + "epoch": 4.192006950477845, + "grad_norm": 0.000761428673285991, + "learning_rate": 2.380810165073849e-05, + "loss": 0.0001, + "step": 19300 + }, + { + "epoch": 4.194178974804518, + "grad_norm": 1.9355708360671997, + "learning_rate": 2.3794526498696785e-05, + "loss": 0.0131, + "step": 19310 + }, + { + "epoch": 4.19635099913119, + "grad_norm": 0.0007248061010614038, + "learning_rate": 2.3780951346655085e-05, + "loss": 0.0001, + "step": 19320 + }, + { + "epoch": 4.198523023457863, + "grad_norm": 0.0007215312216430902, + "learning_rate": 2.3767376194613382e-05, + "loss": 0.0508, + "step": 19330 + }, + { + "epoch": 4.200695047784535, + "grad_norm": 0.002852953039109707, + "learning_rate": 2.3753801042571678e-05, + "loss": 0.0355, + "step": 19340 + }, + { + "epoch": 4.202867072111208, + "grad_norm": 0.004599974490702152, + "learning_rate": 2.3740225890529975e-05, + "loss": 0.0486, + "step": 19350 + }, + { + "epoch": 4.20503909643788, + "grad_norm": 0.002695480128750205, + "learning_rate": 2.3726650738488275e-05, + "loss": 0.0221, + "step": 19360 + }, + { + "epoch": 4.207211120764552, + "grad_norm": 0.0010665751760825515, + "learning_rate": 2.371307558644657e-05, + "loss": 0.0002, + "step": 19370 + }, + { + "epoch": 4.209383145091225, + "grad_norm": 0.0010960167273879051, + "learning_rate": 2.3699500434404868e-05, + "loss": 0.0185, + "step": 19380 + }, + { + "epoch": 4.211555169417897, + "grad_norm": 0.4971291422843933, + "learning_rate": 2.3685925282363164e-05, + "loss": 0.0009, + "step": 19390 + }, + { + "epoch": 4.21372719374457, + "grad_norm": 0.010001223534345627, + "learning_rate": 2.367235013032146e-05, + "loss": 0.0021, + "step": 19400 + }, + { + "epoch": 4.215899218071242, + "grad_norm": 0.006596973165869713, + "learning_rate": 2.3658774978279757e-05, + "loss": 0.0093, + "step": 19410 + }, + { + "epoch": 4.218071242397915, + "grad_norm": 0.44958433508872986, + "learning_rate": 2.3645199826238054e-05, + "loss": 0.0005, + "step": 19420 + }, + { + "epoch": 4.220243266724587, + "grad_norm": 0.0007452222635038197, + "learning_rate": 2.363162467419635e-05, + "loss": 0.0008, + "step": 19430 + }, + { + "epoch": 4.222415291051259, + "grad_norm": 0.0007038828334771097, + "learning_rate": 2.361804952215465e-05, + "loss": 0.0002, + "step": 19440 + }, + { + "epoch": 4.224587315377932, + "grad_norm": 0.0006963639170862734, + "learning_rate": 2.3604474370112947e-05, + "loss": 0.007, + "step": 19450 + }, + { + "epoch": 4.226759339704604, + "grad_norm": 0.0007029336411505938, + "learning_rate": 2.3590899218071243e-05, + "loss": 0.0001, + "step": 19460 + }, + { + "epoch": 4.228931364031277, + "grad_norm": 0.0007282626465894282, + "learning_rate": 2.357732406602954e-05, + "loss": 0.0309, + "step": 19470 + }, + { + "epoch": 4.231103388357949, + "grad_norm": 0.0007143176626414061, + "learning_rate": 2.3563748913987836e-05, + "loss": 0.0005, + "step": 19480 + }, + { + "epoch": 4.233275412684622, + "grad_norm": 0.5573042631149292, + "learning_rate": 2.3550173761946133e-05, + "loss": 0.0034, + "step": 19490 + }, + { + "epoch": 4.2354474370112944, + "grad_norm": 0.0006998754688538611, + "learning_rate": 2.3536598609904433e-05, + "loss": 0.0008, + "step": 19500 + }, + { + "epoch": 4.237619461337967, + "grad_norm": 0.01692361570894718, + "learning_rate": 2.352302345786273e-05, + "loss": 0.0296, + "step": 19510 + }, + { + "epoch": 4.2397914856646395, + "grad_norm": 0.000765336852055043, + "learning_rate": 2.350944830582103e-05, + "loss": 0.0005, + "step": 19520 + }, + { + "epoch": 4.2419635099913116, + "grad_norm": 0.25539854168891907, + "learning_rate": 2.3495873153779326e-05, + "loss": 0.0013, + "step": 19530 + }, + { + "epoch": 4.2441355343179845, + "grad_norm": 0.0006899808067828417, + "learning_rate": 2.3482298001737622e-05, + "loss": 0.001, + "step": 19540 + }, + { + "epoch": 4.246307558644657, + "grad_norm": 0.0007053895969875157, + "learning_rate": 2.346872284969592e-05, + "loss": 0.0004, + "step": 19550 + }, + { + "epoch": 4.2484795829713295, + "grad_norm": 0.0007296680123545229, + "learning_rate": 2.3455147697654215e-05, + "loss": 0.0011, + "step": 19560 + }, + { + "epoch": 4.250651607298002, + "grad_norm": 0.02057144045829773, + "learning_rate": 2.344157254561251e-05, + "loss": 0.0339, + "step": 19570 + }, + { + "epoch": 4.252823631624675, + "grad_norm": 0.0007652404601685703, + "learning_rate": 2.3427997393570808e-05, + "loss": 0.0001, + "step": 19580 + }, + { + "epoch": 4.254995655951347, + "grad_norm": 0.0007344402838498354, + "learning_rate": 2.3414422241529108e-05, + "loss": 0.0005, + "step": 19590 + }, + { + "epoch": 4.257167680278019, + "grad_norm": 0.0010844263015314937, + "learning_rate": 2.3400847089487404e-05, + "loss": 0.0006, + "step": 19600 + }, + { + "epoch": 4.259339704604692, + "grad_norm": 0.0356300063431263, + "learning_rate": 2.33872719374457e-05, + "loss": 0.0004, + "step": 19610 + }, + { + "epoch": 4.261511728931364, + "grad_norm": 0.0006806873134337366, + "learning_rate": 2.3373696785403997e-05, + "loss": 0.0065, + "step": 19620 + }, + { + "epoch": 4.263683753258037, + "grad_norm": 0.0006880282890051603, + "learning_rate": 2.3360121633362294e-05, + "loss": 0.0175, + "step": 19630 + }, + { + "epoch": 4.265855777584709, + "grad_norm": 0.0006852375227026641, + "learning_rate": 2.334654648132059e-05, + "loss": 0.0234, + "step": 19640 + }, + { + "epoch": 4.268027801911382, + "grad_norm": 0.04227971285581589, + "learning_rate": 2.3332971329278887e-05, + "loss": 0.0322, + "step": 19650 + }, + { + "epoch": 4.270199826238054, + "grad_norm": 0.03739573061466217, + "learning_rate": 2.3319396177237184e-05, + "loss": 0.012, + "step": 19660 + }, + { + "epoch": 4.272371850564726, + "grad_norm": 0.032801300287246704, + "learning_rate": 2.3305821025195483e-05, + "loss": 0.0095, + "step": 19670 + }, + { + "epoch": 4.274543874891399, + "grad_norm": 0.0007418083841912448, + "learning_rate": 2.329224587315378e-05, + "loss": 0.0013, + "step": 19680 + }, + { + "epoch": 4.276715899218071, + "grad_norm": 0.0007542024250142276, + "learning_rate": 2.3278670721112076e-05, + "loss": 0.0001, + "step": 19690 + }, + { + "epoch": 4.278887923544744, + "grad_norm": 0.0008709717076271772, + "learning_rate": 2.3265095569070376e-05, + "loss": 0.0001, + "step": 19700 + }, + { + "epoch": 4.281059947871416, + "grad_norm": 0.0007557451608590782, + "learning_rate": 2.3251520417028673e-05, + "loss": 0.0001, + "step": 19710 + }, + { + "epoch": 4.283231972198089, + "grad_norm": 0.24801254272460938, + "learning_rate": 2.323794526498697e-05, + "loss": 0.0041, + "step": 19720 + }, + { + "epoch": 4.285403996524761, + "grad_norm": 0.0008331090793944895, + "learning_rate": 2.3224370112945266e-05, + "loss": 0.0037, + "step": 19730 + }, + { + "epoch": 4.287576020851434, + "grad_norm": 0.0016378792934119701, + "learning_rate": 2.3210794960903562e-05, + "loss": 0.0001, + "step": 19740 + }, + { + "epoch": 4.289748045178106, + "grad_norm": 0.0007555813062936068, + "learning_rate": 2.3197219808861862e-05, + "loss": 0.0022, + "step": 19750 + }, + { + "epoch": 4.291920069504778, + "grad_norm": 0.2303730845451355, + "learning_rate": 2.318364465682016e-05, + "loss": 0.007, + "step": 19760 + }, + { + "epoch": 4.294092093831451, + "grad_norm": 0.0008710839902050793, + "learning_rate": 2.3170069504778455e-05, + "loss": 0.0262, + "step": 19770 + }, + { + "epoch": 4.296264118158123, + "grad_norm": 0.0009850760689005256, + "learning_rate": 2.3156494352736752e-05, + "loss": 0.0016, + "step": 19780 + }, + { + "epoch": 4.298436142484796, + "grad_norm": 0.0006828588084317744, + "learning_rate": 2.3142919200695048e-05, + "loss": 0.0018, + "step": 19790 + }, + { + "epoch": 4.300608166811468, + "grad_norm": 0.0008635299745947123, + "learning_rate": 2.3129344048653345e-05, + "loss": 0.0595, + "step": 19800 + }, + { + "epoch": 4.302780191138141, + "grad_norm": 0.0011844933032989502, + "learning_rate": 2.311576889661164e-05, + "loss": 0.0533, + "step": 19810 + }, + { + "epoch": 4.304952215464813, + "grad_norm": 0.0021339692175388336, + "learning_rate": 2.310219374456994e-05, + "loss": 0.0003, + "step": 19820 + }, + { + "epoch": 4.307124239791485, + "grad_norm": 0.0015525285853073, + "learning_rate": 2.3088618592528238e-05, + "loss": 0.0016, + "step": 19830 + }, + { + "epoch": 4.309296264118158, + "grad_norm": 0.0011952450731769204, + "learning_rate": 2.3075043440486534e-05, + "loss": 0.0095, + "step": 19840 + }, + { + "epoch": 4.31146828844483, + "grad_norm": 0.0019283192232251167, + "learning_rate": 2.306146828844483e-05, + "loss": 0.0003, + "step": 19850 + }, + { + "epoch": 4.313640312771503, + "grad_norm": 0.0012698841746896505, + "learning_rate": 2.3047893136403127e-05, + "loss": 0.0056, + "step": 19860 + }, + { + "epoch": 4.315812337098175, + "grad_norm": 0.0013187688309699297, + "learning_rate": 2.3034317984361424e-05, + "loss": 0.0003, + "step": 19870 + }, + { + "epoch": 4.317984361424848, + "grad_norm": 0.0029401553329080343, + "learning_rate": 2.3020742832319724e-05, + "loss": 0.0002, + "step": 19880 + }, + { + "epoch": 4.32015638575152, + "grad_norm": 0.010737915523350239, + "learning_rate": 2.300716768027802e-05, + "loss": 0.008, + "step": 19890 + }, + { + "epoch": 4.3223284100781925, + "grad_norm": 0.00099332130048424, + "learning_rate": 2.299359252823632e-05, + "loss": 0.0003, + "step": 19900 + }, + { + "epoch": 4.324500434404865, + "grad_norm": 0.0011381141375750303, + "learning_rate": 2.2980017376194617e-05, + "loss": 0.0103, + "step": 19910 + }, + { + "epoch": 4.3266724587315375, + "grad_norm": 0.001223794766701758, + "learning_rate": 2.2966442224152913e-05, + "loss": 0.0007, + "step": 19920 + }, + { + "epoch": 4.3288444830582105, + "grad_norm": 0.006975673139095306, + "learning_rate": 2.295286707211121e-05, + "loss": 0.0014, + "step": 19930 + }, + { + "epoch": 4.3310165073848825, + "grad_norm": 0.02788945473730564, + "learning_rate": 2.2939291920069506e-05, + "loss": 0.0107, + "step": 19940 + }, + { + "epoch": 4.3331885317115555, + "grad_norm": 0.0013920166529715061, + "learning_rate": 2.2925716768027803e-05, + "loss": 0.0002, + "step": 19950 + }, + { + "epoch": 4.335360556038228, + "grad_norm": 0.0010082477238029242, + "learning_rate": 2.29121416159861e-05, + "loss": 0.0156, + "step": 19960 + }, + { + "epoch": 4.3375325803649005, + "grad_norm": 0.6021537184715271, + "learning_rate": 2.28985664639444e-05, + "loss": 0.0117, + "step": 19970 + }, + { + "epoch": 4.339704604691573, + "grad_norm": 0.013891434296965599, + "learning_rate": 2.2884991311902695e-05, + "loss": 0.0014, + "step": 19980 + }, + { + "epoch": 4.341876629018245, + "grad_norm": 0.0009469124488532543, + "learning_rate": 2.2871416159860992e-05, + "loss": 0.0038, + "step": 19990 + }, + { + "epoch": 4.344048653344918, + "grad_norm": 0.0008154679671861231, + "learning_rate": 2.285784100781929e-05, + "loss": 0.0003, + "step": 20000 + }, + { + "epoch": 4.34622067767159, + "grad_norm": 0.0008383361855521798, + "learning_rate": 2.2844265855777585e-05, + "loss": 0.0012, + "step": 20010 + }, + { + "epoch": 4.348392701998263, + "grad_norm": 0.000763860356528312, + "learning_rate": 2.283069070373588e-05, + "loss": 0.0001, + "step": 20020 + }, + { + "epoch": 4.350564726324935, + "grad_norm": 0.0007653414504602551, + "learning_rate": 2.2817115551694178e-05, + "loss": 0.0015, + "step": 20030 + }, + { + "epoch": 4.352736750651608, + "grad_norm": 0.000781756651122123, + "learning_rate": 2.2803540399652475e-05, + "loss": 0.0001, + "step": 20040 + }, + { + "epoch": 4.35490877497828, + "grad_norm": 0.0007571419118903577, + "learning_rate": 2.2789965247610774e-05, + "loss": 0.0009, + "step": 20050 + }, + { + "epoch": 4.357080799304952, + "grad_norm": 0.04005056619644165, + "learning_rate": 2.277639009556907e-05, + "loss": 0.0002, + "step": 20060 + }, + { + "epoch": 4.359252823631625, + "grad_norm": 0.0007382580661214888, + "learning_rate": 2.2762814943527367e-05, + "loss": 0.0001, + "step": 20070 + }, + { + "epoch": 4.361424847958297, + "grad_norm": 0.0007434505387209356, + "learning_rate": 2.2749239791485667e-05, + "loss": 0.0002, + "step": 20080 + }, + { + "epoch": 4.36359687228497, + "grad_norm": 0.0007536330376751721, + "learning_rate": 2.2735664639443964e-05, + "loss": 0.0001, + "step": 20090 + }, + { + "epoch": 4.365768896611642, + "grad_norm": 0.0007379274466075003, + "learning_rate": 2.272208948740226e-05, + "loss": 0.0001, + "step": 20100 + }, + { + "epoch": 4.367940920938315, + "grad_norm": 0.0007571052410639822, + "learning_rate": 2.2708514335360557e-05, + "loss": 0.056, + "step": 20110 + }, + { + "epoch": 4.370112945264987, + "grad_norm": 0.0013724949676543474, + "learning_rate": 2.2694939183318853e-05, + "loss": 0.0003, + "step": 20120 + }, + { + "epoch": 4.372284969591659, + "grad_norm": 0.0017404978862032294, + "learning_rate": 2.2681364031277153e-05, + "loss": 0.0002, + "step": 20130 + }, + { + "epoch": 4.374456993918332, + "grad_norm": 0.002438147785142064, + "learning_rate": 2.266778887923545e-05, + "loss": 0.0004, + "step": 20140 + }, + { + "epoch": 4.376629018245004, + "grad_norm": 0.0024324068799614906, + "learning_rate": 2.2654213727193746e-05, + "loss": 0.0003, + "step": 20150 + }, + { + "epoch": 4.378801042571677, + "grad_norm": 0.001972701633349061, + "learning_rate": 2.2640638575152043e-05, + "loss": 0.0004, + "step": 20160 + }, + { + "epoch": 4.380973066898349, + "grad_norm": 0.002721251919865608, + "learning_rate": 2.262706342311034e-05, + "loss": 0.0003, + "step": 20170 + }, + { + "epoch": 4.383145091225022, + "grad_norm": 0.0017292031552642584, + "learning_rate": 2.2613488271068636e-05, + "loss": 0.0002, + "step": 20180 + }, + { + "epoch": 4.385317115551694, + "grad_norm": 0.001443073502741754, + "learning_rate": 2.2599913119026932e-05, + "loss": 0.0004, + "step": 20190 + }, + { + "epoch": 4.387489139878367, + "grad_norm": 0.0014947176678106189, + "learning_rate": 2.2586337966985232e-05, + "loss": 0.0343, + "step": 20200 + }, + { + "epoch": 4.389661164205039, + "grad_norm": 0.001758243073709309, + "learning_rate": 2.257276281494353e-05, + "loss": 0.0002, + "step": 20210 + }, + { + "epoch": 4.391833188531711, + "grad_norm": 0.0025202189572155476, + "learning_rate": 2.2559187662901825e-05, + "loss": 0.0225, + "step": 20220 + }, + { + "epoch": 4.394005212858384, + "grad_norm": 0.005367154721170664, + "learning_rate": 2.2545612510860122e-05, + "loss": 0.0185, + "step": 20230 + }, + { + "epoch": 4.396177237185056, + "grad_norm": 0.011141028255224228, + "learning_rate": 2.2532037358818418e-05, + "loss": 0.0007, + "step": 20240 + }, + { + "epoch": 4.398349261511729, + "grad_norm": 0.003209081245586276, + "learning_rate": 2.2518462206776718e-05, + "loss": 0.0005, + "step": 20250 + }, + { + "epoch": 4.400521285838401, + "grad_norm": 0.0013920213095843792, + "learning_rate": 2.2504887054735015e-05, + "loss": 0.0003, + "step": 20260 + }, + { + "epoch": 4.402693310165074, + "grad_norm": 0.004766841884702444, + "learning_rate": 2.249131190269331e-05, + "loss": 0.0141, + "step": 20270 + }, + { + "epoch": 4.404865334491746, + "grad_norm": 0.002749124076217413, + "learning_rate": 2.247773675065161e-05, + "loss": 0.0192, + "step": 20280 + }, + { + "epoch": 4.407037358818418, + "grad_norm": 0.07083216309547424, + "learning_rate": 2.2464161598609908e-05, + "loss": 0.0002, + "step": 20290 + }, + { + "epoch": 4.409209383145091, + "grad_norm": 0.0012828879989683628, + "learning_rate": 2.2450586446568204e-05, + "loss": 0.0004, + "step": 20300 + }, + { + "epoch": 4.411381407471763, + "grad_norm": 4.6498332023620605, + "learning_rate": 2.24370112945265e-05, + "loss": 0.0449, + "step": 20310 + }, + { + "epoch": 4.413553431798436, + "grad_norm": 0.0018220586935058236, + "learning_rate": 2.2423436142484797e-05, + "loss": 0.0303, + "step": 20320 + }, + { + "epoch": 4.4157254561251085, + "grad_norm": 0.018667029216885567, + "learning_rate": 2.2409860990443094e-05, + "loss": 0.0041, + "step": 20330 + }, + { + "epoch": 4.417897480451781, + "grad_norm": 0.04699333757162094, + "learning_rate": 2.239628583840139e-05, + "loss": 0.0172, + "step": 20340 + }, + { + "epoch": 4.4200695047784535, + "grad_norm": 0.006092607043683529, + "learning_rate": 2.2382710686359687e-05, + "loss": 0.0004, + "step": 20350 + }, + { + "epoch": 4.422241529105126, + "grad_norm": 0.36696863174438477, + "learning_rate": 2.2369135534317987e-05, + "loss": 0.0006, + "step": 20360 + }, + { + "epoch": 4.4244135534317985, + "grad_norm": 0.0011871858732774854, + "learning_rate": 2.2355560382276283e-05, + "loss": 0.0004, + "step": 20370 + }, + { + "epoch": 4.426585577758471, + "grad_norm": 0.0013889227993786335, + "learning_rate": 2.234198523023458e-05, + "loss": 0.0002, + "step": 20380 + }, + { + "epoch": 4.428757602085144, + "grad_norm": 0.002782667288556695, + "learning_rate": 2.2328410078192876e-05, + "loss": 0.0001, + "step": 20390 + }, + { + "epoch": 4.430929626411816, + "grad_norm": 0.008937436155974865, + "learning_rate": 2.2314834926151173e-05, + "loss": 0.0003, + "step": 20400 + }, + { + "epoch": 4.433101650738489, + "grad_norm": 0.00113490573130548, + "learning_rate": 2.230125977410947e-05, + "loss": 0.0001, + "step": 20410 + }, + { + "epoch": 4.435273675065161, + "grad_norm": 0.0009732726030051708, + "learning_rate": 2.2287684622067766e-05, + "loss": 0.0003, + "step": 20420 + }, + { + "epoch": 4.437445699391834, + "grad_norm": 0.0016077302861958742, + "learning_rate": 2.2274109470026065e-05, + "loss": 0.0003, + "step": 20430 + }, + { + "epoch": 4.439617723718506, + "grad_norm": 0.0008823130046948791, + "learning_rate": 2.2260534317984362e-05, + "loss": 0.0142, + "step": 20440 + }, + { + "epoch": 4.441789748045178, + "grad_norm": 0.0009418035624548793, + "learning_rate": 2.224695916594266e-05, + "loss": 0.0264, + "step": 20450 + }, + { + "epoch": 4.443961772371851, + "grad_norm": 0.0008949940092861652, + "learning_rate": 2.223338401390096e-05, + "loss": 0.0005, + "step": 20460 + }, + { + "epoch": 4.446133796698523, + "grad_norm": 0.0009129407699219882, + "learning_rate": 2.2219808861859255e-05, + "loss": 0.0001, + "step": 20470 + }, + { + "epoch": 4.448305821025196, + "grad_norm": 0.0014547642786055803, + "learning_rate": 2.220623370981755e-05, + "loss": 0.0003, + "step": 20480 + }, + { + "epoch": 4.450477845351868, + "grad_norm": 0.00568614574149251, + "learning_rate": 2.2192658557775848e-05, + "loss": 0.0002, + "step": 20490 + }, + { + "epoch": 4.45264986967854, + "grad_norm": 0.0009105128119699657, + "learning_rate": 2.2179083405734144e-05, + "loss": 0.022, + "step": 20500 + }, + { + "epoch": 4.454821894005213, + "grad_norm": 0.0072452593594789505, + "learning_rate": 2.2165508253692444e-05, + "loss": 0.0022, + "step": 20510 + }, + { + "epoch": 4.456993918331885, + "grad_norm": 0.0010036842431873083, + "learning_rate": 2.215193310165074e-05, + "loss": 0.0001, + "step": 20520 + }, + { + "epoch": 4.459165942658558, + "grad_norm": 0.0008698371821083128, + "learning_rate": 2.2138357949609037e-05, + "loss": 0.0007, + "step": 20530 + }, + { + "epoch": 4.46133796698523, + "grad_norm": 0.0008447846048511565, + "learning_rate": 2.2124782797567334e-05, + "loss": 0.0004, + "step": 20540 + }, + { + "epoch": 4.463509991311903, + "grad_norm": 0.0008029814343899488, + "learning_rate": 2.211120764552563e-05, + "loss": 0.0001, + "step": 20550 + }, + { + "epoch": 4.465682015638575, + "grad_norm": 0.008035254664719105, + "learning_rate": 2.2097632493483927e-05, + "loss": 0.0003, + "step": 20560 + }, + { + "epoch": 4.467854039965248, + "grad_norm": 0.19235126674175262, + "learning_rate": 2.2084057341442223e-05, + "loss": 0.0413, + "step": 20570 + }, + { + "epoch": 4.47002606429192, + "grad_norm": 0.007838092744350433, + "learning_rate": 2.2070482189400523e-05, + "loss": 0.0219, + "step": 20580 + }, + { + "epoch": 4.472198088618592, + "grad_norm": 0.0008193932590074837, + "learning_rate": 2.205690703735882e-05, + "loss": 0.0002, + "step": 20590 + }, + { + "epoch": 4.474370112945265, + "grad_norm": 0.000779150053858757, + "learning_rate": 2.2043331885317116e-05, + "loss": 0.0009, + "step": 20600 + }, + { + "epoch": 4.476542137271937, + "grad_norm": 0.0013178132940083742, + "learning_rate": 2.2029756733275413e-05, + "loss": 0.0001, + "step": 20610 + }, + { + "epoch": 4.47871416159861, + "grad_norm": 0.0007607596344314516, + "learning_rate": 2.201618158123371e-05, + "loss": 0.0004, + "step": 20620 + }, + { + "epoch": 4.480886185925282, + "grad_norm": 0.0007874126313254237, + "learning_rate": 2.200260642919201e-05, + "loss": 0.0001, + "step": 20630 + }, + { + "epoch": 4.483058210251955, + "grad_norm": 0.0007739612483419478, + "learning_rate": 2.1989031277150306e-05, + "loss": 0.0004, + "step": 20640 + }, + { + "epoch": 4.485230234578627, + "grad_norm": 0.0008215973502956331, + "learning_rate": 2.1975456125108602e-05, + "loss": 0.0002, + "step": 20650 + }, + { + "epoch": 4.4874022589053, + "grad_norm": 0.0007648586761206388, + "learning_rate": 2.1961880973066902e-05, + "loss": 0.0002, + "step": 20660 + }, + { + "epoch": 4.489574283231972, + "grad_norm": 0.0007663946016691625, + "learning_rate": 2.19483058210252e-05, + "loss": 0.0444, + "step": 20670 + }, + { + "epoch": 4.491746307558644, + "grad_norm": 0.0007827861700206995, + "learning_rate": 2.1934730668983495e-05, + "loss": 0.0006, + "step": 20680 + }, + { + "epoch": 4.493918331885317, + "grad_norm": 0.693448007106781, + "learning_rate": 2.192115551694179e-05, + "loss": 0.0024, + "step": 20690 + }, + { + "epoch": 4.496090356211989, + "grad_norm": 0.0008732756250537932, + "learning_rate": 2.1907580364900088e-05, + "loss": 0.0465, + "step": 20700 + }, + { + "epoch": 4.498262380538662, + "grad_norm": 0.0008466057479381561, + "learning_rate": 2.1894005212858385e-05, + "loss": 0.0004, + "step": 20710 + }, + { + "epoch": 4.500434404865334, + "grad_norm": 0.012937244027853012, + "learning_rate": 2.188043006081668e-05, + "loss": 0.0005, + "step": 20720 + }, + { + "epoch": 4.5026064291920065, + "grad_norm": 1.4481043815612793, + "learning_rate": 2.1866854908774978e-05, + "loss": 0.0143, + "step": 20730 + }, + { + "epoch": 4.5047784535186794, + "grad_norm": 0.008231800980865955, + "learning_rate": 2.1853279756733278e-05, + "loss": 0.0021, + "step": 20740 + }, + { + "epoch": 4.5069504778453515, + "grad_norm": 0.0010032965801656246, + "learning_rate": 2.1839704604691574e-05, + "loss": 0.0014, + "step": 20750 + }, + { + "epoch": 4.5091225021720245, + "grad_norm": 0.0007704606978222728, + "learning_rate": 2.182612945264987e-05, + "loss": 0.0062, + "step": 20760 + }, + { + "epoch": 4.5112945264986966, + "grad_norm": 0.000827732787001878, + "learning_rate": 2.1812554300608167e-05, + "loss": 0.0001, + "step": 20770 + }, + { + "epoch": 4.5134665508253695, + "grad_norm": 0.0018680243520066142, + "learning_rate": 2.1798979148566464e-05, + "loss": 0.0294, + "step": 20780 + }, + { + "epoch": 4.515638575152042, + "grad_norm": 0.0008118122932501137, + "learning_rate": 2.178540399652476e-05, + "loss": 0.0002, + "step": 20790 + }, + { + "epoch": 4.5178105994787146, + "grad_norm": 0.0030436657834798098, + "learning_rate": 2.1771828844483057e-05, + "loss": 0.0002, + "step": 20800 + }, + { + "epoch": 4.519982623805387, + "grad_norm": 0.0008835258195176721, + "learning_rate": 2.1758253692441357e-05, + "loss": 0.0123, + "step": 20810 + }, + { + "epoch": 4.522154648132059, + "grad_norm": 0.005247740540653467, + "learning_rate": 2.1744678540399653e-05, + "loss": 0.0004, + "step": 20820 + }, + { + "epoch": 4.524326672458732, + "grad_norm": 0.0017484568525105715, + "learning_rate": 2.173110338835795e-05, + "loss": 0.022, + "step": 20830 + }, + { + "epoch": 4.526498696785404, + "grad_norm": 0.0007496718899346888, + "learning_rate": 2.171752823631625e-05, + "loss": 0.0002, + "step": 20840 + }, + { + "epoch": 4.528670721112077, + "grad_norm": 5.449917316436768, + "learning_rate": 2.1703953084274546e-05, + "loss": 0.0168, + "step": 20850 + }, + { + "epoch": 4.530842745438749, + "grad_norm": 0.0012228295672684908, + "learning_rate": 2.1690377932232842e-05, + "loss": 0.0002, + "step": 20860 + }, + { + "epoch": 4.533014769765422, + "grad_norm": 5.398144721984863, + "learning_rate": 2.167680278019114e-05, + "loss": 0.0191, + "step": 20870 + }, + { + "epoch": 4.535186794092094, + "grad_norm": 0.0008976564276963472, + "learning_rate": 2.1663227628149435e-05, + "loss": 0.0001, + "step": 20880 + }, + { + "epoch": 4.537358818418767, + "grad_norm": 0.01168507058173418, + "learning_rate": 2.1649652476107735e-05, + "loss": 0.0005, + "step": 20890 + }, + { + "epoch": 4.539530842745439, + "grad_norm": 0.0032003382220864296, + "learning_rate": 2.1636077324066032e-05, + "loss": 0.0036, + "step": 20900 + }, + { + "epoch": 4.541702867072111, + "grad_norm": 4.298925399780273, + "learning_rate": 2.162250217202433e-05, + "loss": 0.0271, + "step": 20910 + }, + { + "epoch": 4.543874891398784, + "grad_norm": 0.0012041199952363968, + "learning_rate": 2.1608927019982625e-05, + "loss": 0.0383, + "step": 20920 + }, + { + "epoch": 4.546046915725456, + "grad_norm": 0.056777678430080414, + "learning_rate": 2.159535186794092e-05, + "loss": 0.0063, + "step": 20930 + }, + { + "epoch": 4.548218940052129, + "grad_norm": 0.0007683674339205027, + "learning_rate": 2.1581776715899218e-05, + "loss": 0.0076, + "step": 20940 + }, + { + "epoch": 4.550390964378801, + "grad_norm": 0.0007942827069200575, + "learning_rate": 2.1568201563857514e-05, + "loss": 0.0009, + "step": 20950 + }, + { + "epoch": 4.552562988705473, + "grad_norm": 0.0007410432444885373, + "learning_rate": 2.155462641181581e-05, + "loss": 0.0001, + "step": 20960 + }, + { + "epoch": 4.554735013032146, + "grad_norm": 0.0018079435685649514, + "learning_rate": 2.154105125977411e-05, + "loss": 0.0002, + "step": 20970 + }, + { + "epoch": 4.556907037358818, + "grad_norm": 0.0007396311848424375, + "learning_rate": 2.1527476107732407e-05, + "loss": 0.0004, + "step": 20980 + }, + { + "epoch": 4.559079061685491, + "grad_norm": 0.0008238620939664543, + "learning_rate": 2.1513900955690704e-05, + "loss": 0.0103, + "step": 20990 + }, + { + "epoch": 4.561251086012163, + "grad_norm": 0.0007384721538983285, + "learning_rate": 2.1500325803649e-05, + "loss": 0.0002, + "step": 21000 + }, + { + "epoch": 4.563423110338836, + "grad_norm": 0.0007408479577861726, + "learning_rate": 2.14867506516073e-05, + "loss": 0.0003, + "step": 21010 + }, + { + "epoch": 4.565595134665508, + "grad_norm": 0.0007491989526897669, + "learning_rate": 2.1474533014769767e-05, + "loss": 0.0219, + "step": 21020 + }, + { + "epoch": 4.567767158992181, + "grad_norm": 0.0007304720929823816, + "learning_rate": 2.1460957862728064e-05, + "loss": 0.0004, + "step": 21030 + }, + { + "epoch": 4.569939183318853, + "grad_norm": 0.0007323980098590255, + "learning_rate": 2.144738271068636e-05, + "loss": 0.0002, + "step": 21040 + }, + { + "epoch": 4.572111207645525, + "grad_norm": 0.0008021637913770974, + "learning_rate": 2.143380755864466e-05, + "loss": 0.0001, + "step": 21050 + }, + { + "epoch": 4.574283231972198, + "grad_norm": 0.0009776867227628827, + "learning_rate": 2.1420232406602957e-05, + "loss": 0.0001, + "step": 21060 + }, + { + "epoch": 4.57645525629887, + "grad_norm": 0.0009424020536243916, + "learning_rate": 2.1406657254561253e-05, + "loss": 0.0033, + "step": 21070 + }, + { + "epoch": 4.578627280625543, + "grad_norm": 0.08862542361021042, + "learning_rate": 2.139308210251955e-05, + "loss": 0.0004, + "step": 21080 + }, + { + "epoch": 4.580799304952215, + "grad_norm": 0.000843246525619179, + "learning_rate": 2.1379506950477846e-05, + "loss": 0.0029, + "step": 21090 + }, + { + "epoch": 4.582971329278888, + "grad_norm": 0.0007149993907660246, + "learning_rate": 2.1365931798436143e-05, + "loss": 0.001, + "step": 21100 + }, + { + "epoch": 4.58514335360556, + "grad_norm": 0.0006953048286959529, + "learning_rate": 2.1352356646394442e-05, + "loss": 0.0002, + "step": 21110 + }, + { + "epoch": 4.587315377932233, + "grad_norm": 0.018970614299178123, + "learning_rate": 2.133878149435274e-05, + "loss": 0.0012, + "step": 21120 + }, + { + "epoch": 4.589487402258905, + "grad_norm": 0.0007085061515681446, + "learning_rate": 2.1325206342311035e-05, + "loss": 0.0002, + "step": 21130 + }, + { + "epoch": 4.5916594265855775, + "grad_norm": 0.0007259439444169402, + "learning_rate": 2.1311631190269332e-05, + "loss": 0.0001, + "step": 21140 + }, + { + "epoch": 4.59383145091225, + "grad_norm": 0.0007674265652894974, + "learning_rate": 2.129805603822763e-05, + "loss": 0.0001, + "step": 21150 + }, + { + "epoch": 4.5960034752389225, + "grad_norm": 0.006665141321718693, + "learning_rate": 2.1284480886185925e-05, + "loss": 0.0001, + "step": 21160 + }, + { + "epoch": 4.5981754995655955, + "grad_norm": 0.0006893404060974717, + "learning_rate": 2.127090573414422e-05, + "loss": 0.0001, + "step": 21170 + }, + { + "epoch": 4.6003475238922675, + "grad_norm": 0.0035702604800462723, + "learning_rate": 2.1257330582102518e-05, + "loss": 0.0002, + "step": 21180 + }, + { + "epoch": 4.60251954821894, + "grad_norm": 0.000676013296470046, + "learning_rate": 2.1243755430060818e-05, + "loss": 0.0008, + "step": 21190 + }, + { + "epoch": 4.604691572545613, + "grad_norm": 0.000665114785078913, + "learning_rate": 2.1230180278019114e-05, + "loss": 0.0009, + "step": 21200 + }, + { + "epoch": 4.606863596872285, + "grad_norm": 0.0031976511236280203, + "learning_rate": 2.121660512597741e-05, + "loss": 0.0779, + "step": 21210 + }, + { + "epoch": 4.609035621198958, + "grad_norm": 0.6927196979522705, + "learning_rate": 2.1203029973935707e-05, + "loss": 0.0006, + "step": 21220 + }, + { + "epoch": 4.61120764552563, + "grad_norm": 0.0027231345884501934, + "learning_rate": 2.1189454821894007e-05, + "loss": 0.0003, + "step": 21230 + }, + { + "epoch": 4.613379669852303, + "grad_norm": 0.01008631195873022, + "learning_rate": 2.1175879669852304e-05, + "loss": 0.0515, + "step": 21240 + }, + { + "epoch": 4.615551694178975, + "grad_norm": 0.0007393760024569929, + "learning_rate": 2.11623045178106e-05, + "loss": 0.0001, + "step": 21250 + }, + { + "epoch": 4.617723718505648, + "grad_norm": 0.000740601506549865, + "learning_rate": 2.11487293657689e-05, + "loss": 0.0006, + "step": 21260 + }, + { + "epoch": 4.61989574283232, + "grad_norm": 0.0007298871059902012, + "learning_rate": 2.1135154213727197e-05, + "loss": 0.0001, + "step": 21270 + }, + { + "epoch": 4.622067767158992, + "grad_norm": 0.0009730908204801381, + "learning_rate": 2.1121579061685493e-05, + "loss": 0.0194, + "step": 21280 + }, + { + "epoch": 4.624239791485665, + "grad_norm": 0.006183923222124577, + "learning_rate": 2.110800390964379e-05, + "loss": 0.0006, + "step": 21290 + }, + { + "epoch": 4.626411815812337, + "grad_norm": 0.0007730096112936735, + "learning_rate": 2.1094428757602086e-05, + "loss": 0.0003, + "step": 21300 + }, + { + "epoch": 4.62858384013901, + "grad_norm": 0.0007310515502467752, + "learning_rate": 2.1080853605560383e-05, + "loss": 0.0001, + "step": 21310 + }, + { + "epoch": 4.630755864465682, + "grad_norm": 0.0019475659355521202, + "learning_rate": 2.106727845351868e-05, + "loss": 0.0001, + "step": 21320 + }, + { + "epoch": 4.632927888792355, + "grad_norm": 0.0008489437168464065, + "learning_rate": 2.1053703301476976e-05, + "loss": 0.0001, + "step": 21330 + }, + { + "epoch": 4.635099913119027, + "grad_norm": 0.0007811547257006168, + "learning_rate": 2.1040128149435276e-05, + "loss": 0.0001, + "step": 21340 + }, + { + "epoch": 4.6372719374457, + "grad_norm": 0.0007179775275290012, + "learning_rate": 2.1026552997393572e-05, + "loss": 0.0003, + "step": 21350 + }, + { + "epoch": 4.639443961772372, + "grad_norm": 0.0007291779038496315, + "learning_rate": 2.101297784535187e-05, + "loss": 0.0001, + "step": 21360 + }, + { + "epoch": 4.641615986099044, + "grad_norm": 6.902067184448242, + "learning_rate": 2.0999402693310165e-05, + "loss": 0.0221, + "step": 21370 + }, + { + "epoch": 4.643788010425717, + "grad_norm": 0.000777409237343818, + "learning_rate": 2.0985827541268462e-05, + "loss": 0.0002, + "step": 21380 + }, + { + "epoch": 4.645960034752389, + "grad_norm": 0.039273735135793686, + "learning_rate": 2.0972252389226758e-05, + "loss": 0.0003, + "step": 21390 + }, + { + "epoch": 4.648132059079062, + "grad_norm": 0.0009249201975762844, + "learning_rate": 2.0958677237185058e-05, + "loss": 0.0002, + "step": 21400 + }, + { + "epoch": 4.650304083405734, + "grad_norm": 0.004159982316195965, + "learning_rate": 2.0945102085143355e-05, + "loss": 0.0002, + "step": 21410 + }, + { + "epoch": 4.652476107732406, + "grad_norm": 0.0012220778735354543, + "learning_rate": 2.093152693310165e-05, + "loss": 0.0006, + "step": 21420 + }, + { + "epoch": 4.654648132059079, + "grad_norm": 0.000692892586812377, + "learning_rate": 2.091795178105995e-05, + "loss": 0.0198, + "step": 21430 + }, + { + "epoch": 4.656820156385751, + "grad_norm": 0.0007744540343992412, + "learning_rate": 2.0904376629018248e-05, + "loss": 0.0529, + "step": 21440 + }, + { + "epoch": 4.658992180712424, + "grad_norm": 0.03046293742954731, + "learning_rate": 2.0890801476976544e-05, + "loss": 0.0004, + "step": 21450 + }, + { + "epoch": 4.661164205039096, + "grad_norm": 0.0007374830893240869, + "learning_rate": 2.087722632493484e-05, + "loss": 0.0003, + "step": 21460 + }, + { + "epoch": 4.663336229365769, + "grad_norm": 0.023736968636512756, + "learning_rate": 2.0863651172893137e-05, + "loss": 0.0001, + "step": 21470 + }, + { + "epoch": 4.665508253692441, + "grad_norm": 0.0007188957533799112, + "learning_rate": 2.0850076020851434e-05, + "loss": 0.0002, + "step": 21480 + }, + { + "epoch": 4.667680278019114, + "grad_norm": 0.0006954037235118449, + "learning_rate": 2.0836500868809734e-05, + "loss": 0.0003, + "step": 21490 + }, + { + "epoch": 4.669852302345786, + "grad_norm": 0.0007659996044822037, + "learning_rate": 2.082292571676803e-05, + "loss": 0.0014, + "step": 21500 + }, + { + "epoch": 4.672024326672458, + "grad_norm": 0.02373771369457245, + "learning_rate": 2.0809350564726327e-05, + "loss": 0.0002, + "step": 21510 + }, + { + "epoch": 4.674196350999131, + "grad_norm": 0.0015294611221179366, + "learning_rate": 2.0795775412684623e-05, + "loss": 0.0002, + "step": 21520 + }, + { + "epoch": 4.676368375325803, + "grad_norm": 0.000917143770493567, + "learning_rate": 2.078220026064292e-05, + "loss": 0.0006, + "step": 21530 + }, + { + "epoch": 4.678540399652476, + "grad_norm": 0.021886199712753296, + "learning_rate": 2.0768625108601216e-05, + "loss": 0.0002, + "step": 21540 + }, + { + "epoch": 4.680712423979148, + "grad_norm": 0.0006681543891318142, + "learning_rate": 2.0755049956559513e-05, + "loss": 0.0001, + "step": 21550 + }, + { + "epoch": 4.682884448305821, + "grad_norm": 0.0006924382178112864, + "learning_rate": 2.074147480451781e-05, + "loss": 0.0001, + "step": 21560 + }, + { + "epoch": 4.6850564726324935, + "grad_norm": 0.0007467414252460003, + "learning_rate": 2.072789965247611e-05, + "loss": 0.0001, + "step": 21570 + }, + { + "epoch": 4.687228496959166, + "grad_norm": 0.0007178894011303782, + "learning_rate": 2.0714324500434405e-05, + "loss": 0.0006, + "step": 21580 + }, + { + "epoch": 4.6894005212858385, + "grad_norm": 0.0006496147834695876, + "learning_rate": 2.0700749348392702e-05, + "loss": 0.0241, + "step": 21590 + }, + { + "epoch": 4.691572545612511, + "grad_norm": 0.0006787081365473568, + "learning_rate": 2.0687174196351e-05, + "loss": 0.0143, + "step": 21600 + }, + { + "epoch": 4.6937445699391835, + "grad_norm": 0.0007202685810625553, + "learning_rate": 2.06735990443093e-05, + "loss": 0.0001, + "step": 21610 + }, + { + "epoch": 4.695916594265856, + "grad_norm": 0.0007853159331716597, + "learning_rate": 2.0660023892267595e-05, + "loss": 0.0008, + "step": 21620 + }, + { + "epoch": 4.698088618592529, + "grad_norm": 0.03189823031425476, + "learning_rate": 2.064644874022589e-05, + "loss": 0.0064, + "step": 21630 + }, + { + "epoch": 4.700260642919201, + "grad_norm": 0.12528634071350098, + "learning_rate": 2.063287358818419e-05, + "loss": 0.0232, + "step": 21640 + }, + { + "epoch": 4.702432667245873, + "grad_norm": 0.0006917872815392911, + "learning_rate": 2.0619298436142488e-05, + "loss": 0.0073, + "step": 21650 + }, + { + "epoch": 4.704604691572546, + "grad_norm": 0.0006776847876608372, + "learning_rate": 2.0605723284100784e-05, + "loss": 0.0001, + "step": 21660 + }, + { + "epoch": 4.706776715899218, + "grad_norm": 0.0006595394806936383, + "learning_rate": 2.059214813205908e-05, + "loss": 0.0002, + "step": 21670 + }, + { + "epoch": 4.708948740225891, + "grad_norm": 0.0007088473648764193, + "learning_rate": 2.0578572980017377e-05, + "loss": 0.0003, + "step": 21680 + }, + { + "epoch": 4.711120764552563, + "grad_norm": 0.0006470125517807901, + "learning_rate": 2.0564997827975674e-05, + "loss": 0.0001, + "step": 21690 + }, + { + "epoch": 4.713292788879236, + "grad_norm": 0.0244167298078537, + "learning_rate": 2.055142267593397e-05, + "loss": 0.012, + "step": 21700 + }, + { + "epoch": 4.715464813205908, + "grad_norm": 0.0006437553092837334, + "learning_rate": 2.0537847523892267e-05, + "loss": 0.0017, + "step": 21710 + }, + { + "epoch": 4.717636837532581, + "grad_norm": 0.6032142043113708, + "learning_rate": 2.0524272371850567e-05, + "loss": 0.0091, + "step": 21720 + }, + { + "epoch": 4.719808861859253, + "grad_norm": 0.0015431154752150178, + "learning_rate": 2.0510697219808863e-05, + "loss": 0.0011, + "step": 21730 + }, + { + "epoch": 4.721980886185925, + "grad_norm": 0.031862836331129074, + "learning_rate": 2.049712206776716e-05, + "loss": 0.0346, + "step": 21740 + }, + { + "epoch": 4.724152910512598, + "grad_norm": 0.0006862595910206437, + "learning_rate": 2.0483546915725456e-05, + "loss": 0.0006, + "step": 21750 + }, + { + "epoch": 4.72632493483927, + "grad_norm": 0.0006549872341565788, + "learning_rate": 2.0469971763683753e-05, + "loss": 0.0017, + "step": 21760 + }, + { + "epoch": 4.728496959165943, + "grad_norm": 0.0008586323237977922, + "learning_rate": 2.045639661164205e-05, + "loss": 0.0533, + "step": 21770 + }, + { + "epoch": 4.730668983492615, + "grad_norm": 0.0010252386564388871, + "learning_rate": 2.044282145960035e-05, + "loss": 0.0001, + "step": 21780 + }, + { + "epoch": 4.732841007819288, + "grad_norm": 0.05718378722667694, + "learning_rate": 2.0429246307558646e-05, + "loss": 0.0003, + "step": 21790 + }, + { + "epoch": 4.73501303214596, + "grad_norm": 0.0011250174138695002, + "learning_rate": 2.0415671155516942e-05, + "loss": 0.0107, + "step": 21800 + }, + { + "epoch": 4.737185056472632, + "grad_norm": 0.0008177366689778864, + "learning_rate": 2.0402096003475242e-05, + "loss": 0.0001, + "step": 21810 + }, + { + "epoch": 4.739357080799305, + "grad_norm": 0.0008273087441921234, + "learning_rate": 2.038852085143354e-05, + "loss": 0.0085, + "step": 21820 + }, + { + "epoch": 4.741529105125977, + "grad_norm": 0.0027850535698235035, + "learning_rate": 2.0374945699391835e-05, + "loss": 0.0112, + "step": 21830 + }, + { + "epoch": 4.74370112945265, + "grad_norm": 0.0014280122704803944, + "learning_rate": 2.036137054735013e-05, + "loss": 0.001, + "step": 21840 + }, + { + "epoch": 4.745873153779322, + "grad_norm": 0.0009550989489071071, + "learning_rate": 2.0347795395308428e-05, + "loss": 0.0021, + "step": 21850 + }, + { + "epoch": 4.748045178105995, + "grad_norm": 0.0006730654276907444, + "learning_rate": 2.0334220243266725e-05, + "loss": 0.0062, + "step": 21860 + }, + { + "epoch": 4.750217202432667, + "grad_norm": 0.0006814012303948402, + "learning_rate": 2.0320645091225025e-05, + "loss": 0.0007, + "step": 21870 + }, + { + "epoch": 4.752389226759339, + "grad_norm": 0.00271434779278934, + "learning_rate": 2.030706993918332e-05, + "loss": 0.0001, + "step": 21880 + }, + { + "epoch": 4.754561251086012, + "grad_norm": 0.0006929839146323502, + "learning_rate": 2.0293494787141618e-05, + "loss": 0.0063, + "step": 21890 + }, + { + "epoch": 4.756733275412684, + "grad_norm": 0.0015105424681678414, + "learning_rate": 2.0279919635099914e-05, + "loss": 0.0001, + "step": 21900 + }, + { + "epoch": 4.758905299739357, + "grad_norm": 0.0006794344517402351, + "learning_rate": 2.026634448305821e-05, + "loss": 0.0453, + "step": 21910 + }, + { + "epoch": 4.761077324066029, + "grad_norm": 0.0010067267576232553, + "learning_rate": 2.0252769331016507e-05, + "loss": 0.0016, + "step": 21920 + }, + { + "epoch": 4.763249348392702, + "grad_norm": 0.014931906014680862, + "learning_rate": 2.0239194178974804e-05, + "loss": 0.0003, + "step": 21930 + }, + { + "epoch": 4.765421372719374, + "grad_norm": 0.07368484139442444, + "learning_rate": 2.02256190269331e-05, + "loss": 0.0005, + "step": 21940 + }, + { + "epoch": 4.767593397046047, + "grad_norm": 0.0007706546457484365, + "learning_rate": 2.02120438748914e-05, + "loss": 0.0081, + "step": 21950 + }, + { + "epoch": 4.769765421372719, + "grad_norm": 0.0007759315776638687, + "learning_rate": 2.0198468722849697e-05, + "loss": 0.0001, + "step": 21960 + }, + { + "epoch": 4.7719374456993915, + "grad_norm": 0.02588563784956932, + "learning_rate": 2.0184893570807993e-05, + "loss": 0.0004, + "step": 21970 + }, + { + "epoch": 4.7741094700260645, + "grad_norm": 0.0009247218258678913, + "learning_rate": 2.0171318418766293e-05, + "loss": 0.0218, + "step": 21980 + }, + { + "epoch": 4.7762814943527365, + "grad_norm": 0.0019565566908568144, + "learning_rate": 2.015774326672459e-05, + "loss": 0.0012, + "step": 21990 + }, + { + "epoch": 4.7784535186794095, + "grad_norm": 0.3613233268260956, + "learning_rate": 2.0144168114682886e-05, + "loss": 0.0005, + "step": 22000 + }, + { + "epoch": 4.780625543006082, + "grad_norm": 0.0011166412150487304, + "learning_rate": 2.0130592962641182e-05, + "loss": 0.0004, + "step": 22010 + }, + { + "epoch": 4.782797567332754, + "grad_norm": 0.00089460943127051, + "learning_rate": 2.011701781059948e-05, + "loss": 0.0043, + "step": 22020 + }, + { + "epoch": 4.784969591659427, + "grad_norm": 0.00839716475456953, + "learning_rate": 2.010344265855778e-05, + "loss": 0.0164, + "step": 22030 + }, + { + "epoch": 4.787141615986099, + "grad_norm": 0.0009814859367907047, + "learning_rate": 2.0089867506516075e-05, + "loss": 0.0001, + "step": 22040 + }, + { + "epoch": 4.789313640312772, + "grad_norm": 0.000751690415199846, + "learning_rate": 2.0076292354474372e-05, + "loss": 0.0002, + "step": 22050 + }, + { + "epoch": 4.791485664639444, + "grad_norm": 0.08599035441875458, + "learning_rate": 2.006271720243267e-05, + "loss": 0.0002, + "step": 22060 + }, + { + "epoch": 4.793657688966117, + "grad_norm": 0.0007369867525994778, + "learning_rate": 2.0049142050390965e-05, + "loss": 0.0006, + "step": 22070 + }, + { + "epoch": 4.795829713292789, + "grad_norm": 0.0006486243801191449, + "learning_rate": 2.003556689834926e-05, + "loss": 0.0001, + "step": 22080 + }, + { + "epoch": 4.798001737619462, + "grad_norm": 0.0007731476216576993, + "learning_rate": 2.0021991746307558e-05, + "loss": 0.0137, + "step": 22090 + }, + { + "epoch": 4.800173761946134, + "grad_norm": 0.0006871931254863739, + "learning_rate": 2.0008416594265858e-05, + "loss": 0.0001, + "step": 22100 + }, + { + "epoch": 4.802345786272806, + "grad_norm": 0.0170272383838892, + "learning_rate": 1.9994841442224154e-05, + "loss": 0.0003, + "step": 22110 + }, + { + "epoch": 4.804517810599479, + "grad_norm": 0.000689572305418551, + "learning_rate": 1.998126629018245e-05, + "loss": 0.0007, + "step": 22120 + }, + { + "epoch": 4.806689834926151, + "grad_norm": 0.0006891828961670399, + "learning_rate": 1.9967691138140747e-05, + "loss": 0.0001, + "step": 22130 + }, + { + "epoch": 4.808861859252824, + "grad_norm": 0.005554310977458954, + "learning_rate": 1.9954115986099044e-05, + "loss": 0.0225, + "step": 22140 + }, + { + "epoch": 4.811033883579496, + "grad_norm": 0.0006960682803764939, + "learning_rate": 1.994054083405734e-05, + "loss": 0.0099, + "step": 22150 + }, + { + "epoch": 4.813205907906169, + "grad_norm": 0.001702047768048942, + "learning_rate": 1.992696568201564e-05, + "loss": 0.0003, + "step": 22160 + }, + { + "epoch": 4.815377932232841, + "grad_norm": 0.0007203352870419621, + "learning_rate": 1.9913390529973937e-05, + "loss": 0.0071, + "step": 22170 + }, + { + "epoch": 4.817549956559514, + "grad_norm": 0.0006288993754424155, + "learning_rate": 1.9899815377932233e-05, + "loss": 0.0001, + "step": 22180 + }, + { + "epoch": 4.819721980886186, + "grad_norm": 0.0006460346630774438, + "learning_rate": 1.9886240225890533e-05, + "loss": 0.0001, + "step": 22190 + }, + { + "epoch": 4.821894005212858, + "grad_norm": 0.0006239477661438286, + "learning_rate": 1.987266507384883e-05, + "loss": 0.0001, + "step": 22200 + }, + { + "epoch": 4.824066029539531, + "grad_norm": 0.0006247904966585338, + "learning_rate": 1.9859089921807126e-05, + "loss": 0.0147, + "step": 22210 + }, + { + "epoch": 4.826238053866203, + "grad_norm": 0.0006836046231910586, + "learning_rate": 1.9845514769765423e-05, + "loss": 0.0001, + "step": 22220 + }, + { + "epoch": 4.828410078192876, + "grad_norm": 0.00064464146271348, + "learning_rate": 1.983193961772372e-05, + "loss": 0.0011, + "step": 22230 + }, + { + "epoch": 4.830582102519548, + "grad_norm": 0.0006400091806426644, + "learning_rate": 1.9818364465682016e-05, + "loss": 0.0001, + "step": 22240 + }, + { + "epoch": 4.83275412684622, + "grad_norm": 0.0007611711043864489, + "learning_rate": 1.9804789313640316e-05, + "loss": 0.0001, + "step": 22250 + }, + { + "epoch": 4.834926151172893, + "grad_norm": 0.0007814390119165182, + "learning_rate": 1.9791214161598612e-05, + "loss": 0.0001, + "step": 22260 + }, + { + "epoch": 4.837098175499565, + "grad_norm": 0.0021302583627402782, + "learning_rate": 1.977763900955691e-05, + "loss": 0.0078, + "step": 22270 + }, + { + "epoch": 4.839270199826238, + "grad_norm": 0.0006355360383167863, + "learning_rate": 1.9764063857515205e-05, + "loss": 0.0013, + "step": 22280 + }, + { + "epoch": 4.84144222415291, + "grad_norm": 0.006262447685003281, + "learning_rate": 1.97504887054735e-05, + "loss": 0.0022, + "step": 22290 + }, + { + "epoch": 4.843614248479583, + "grad_norm": 0.12360040843486786, + "learning_rate": 1.9736913553431798e-05, + "loss": 0.0007, + "step": 22300 + }, + { + "epoch": 4.845786272806255, + "grad_norm": 0.013160685077309608, + "learning_rate": 1.9723338401390095e-05, + "loss": 0.0001, + "step": 22310 + }, + { + "epoch": 4.847958297132928, + "grad_norm": 0.006475683301687241, + "learning_rate": 1.970976324934839e-05, + "loss": 0.0001, + "step": 22320 + }, + { + "epoch": 4.8501303214596, + "grad_norm": 2.062697410583496, + "learning_rate": 1.969618809730669e-05, + "loss": 0.0092, + "step": 22330 + }, + { + "epoch": 4.852302345786272, + "grad_norm": 0.0044073620811104774, + "learning_rate": 1.9682612945264988e-05, + "loss": 0.0001, + "step": 22340 + }, + { + "epoch": 4.854474370112945, + "grad_norm": 0.0005856105126440525, + "learning_rate": 1.9669037793223284e-05, + "loss": 0.0339, + "step": 22350 + }, + { + "epoch": 4.856646394439617, + "grad_norm": 0.0006312388577498496, + "learning_rate": 1.9655462641181584e-05, + "loss": 0.0058, + "step": 22360 + }, + { + "epoch": 4.85881841876629, + "grad_norm": 0.0006607999093830585, + "learning_rate": 1.964188748913988e-05, + "loss": 0.0005, + "step": 22370 + }, + { + "epoch": 4.8609904430929625, + "grad_norm": 0.0006427134503610432, + "learning_rate": 1.9628312337098177e-05, + "loss": 0.0094, + "step": 22380 + }, + { + "epoch": 4.863162467419635, + "grad_norm": 0.000609448819886893, + "learning_rate": 1.9614737185056473e-05, + "loss": 0.0001, + "step": 22390 + }, + { + "epoch": 4.8653344917463075, + "grad_norm": 0.4275817573070526, + "learning_rate": 1.960116203301477e-05, + "loss": 0.0097, + "step": 22400 + }, + { + "epoch": 4.8675065160729805, + "grad_norm": 0.0006039931322447956, + "learning_rate": 1.958758688097307e-05, + "loss": 0.0001, + "step": 22410 + }, + { + "epoch": 4.8696785403996525, + "grad_norm": 0.0006329385214485228, + "learning_rate": 1.9574011728931366e-05, + "loss": 0.004, + "step": 22420 + }, + { + "epoch": 4.871850564726325, + "grad_norm": 0.0005937905516475439, + "learning_rate": 1.9560436576889663e-05, + "loss": 0.0001, + "step": 22430 + }, + { + "epoch": 4.874022589052998, + "grad_norm": 0.0005967863835394382, + "learning_rate": 1.954686142484796e-05, + "loss": 0.0261, + "step": 22440 + }, + { + "epoch": 4.87619461337967, + "grad_norm": 0.008394586853682995, + "learning_rate": 1.9533286272806256e-05, + "loss": 0.0001, + "step": 22450 + }, + { + "epoch": 4.878366637706343, + "grad_norm": 0.0006573013961315155, + "learning_rate": 1.9519711120764552e-05, + "loss": 0.0079, + "step": 22460 + }, + { + "epoch": 4.880538662033015, + "grad_norm": 0.0006314768688753247, + "learning_rate": 1.950613596872285e-05, + "loss": 0.0003, + "step": 22470 + }, + { + "epoch": 4.882710686359687, + "grad_norm": 0.0006235949695110321, + "learning_rate": 1.949256081668115e-05, + "loss": 0.0001, + "step": 22480 + }, + { + "epoch": 4.88488271068636, + "grad_norm": 0.000721083371900022, + "learning_rate": 1.9478985664639445e-05, + "loss": 0.0156, + "step": 22490 + }, + { + "epoch": 4.887054735013032, + "grad_norm": 0.0006140803452581167, + "learning_rate": 1.9465410512597742e-05, + "loss": 0.0002, + "step": 22500 + }, + { + "epoch": 4.889226759339705, + "grad_norm": 0.455748975276947, + "learning_rate": 1.945183536055604e-05, + "loss": 0.0042, + "step": 22510 + }, + { + "epoch": 4.891398783666377, + "grad_norm": 0.0006114744464866817, + "learning_rate": 1.9438260208514335e-05, + "loss": 0.0001, + "step": 22520 + }, + { + "epoch": 4.89357080799305, + "grad_norm": 0.000603028922341764, + "learning_rate": 1.942468505647263e-05, + "loss": 0.0001, + "step": 22530 + }, + { + "epoch": 4.895742832319722, + "grad_norm": 0.0005933665088377893, + "learning_rate": 1.941110990443093e-05, + "loss": 0.0001, + "step": 22540 + }, + { + "epoch": 4.897914856646395, + "grad_norm": 0.0005882186815142632, + "learning_rate": 1.9397534752389228e-05, + "loss": 0.0001, + "step": 22550 + }, + { + "epoch": 4.900086880973067, + "grad_norm": 0.0006071260431781411, + "learning_rate": 1.9383959600347528e-05, + "loss": 0.0023, + "step": 22560 + }, + { + "epoch": 4.902258905299739, + "grad_norm": 0.0005930233746767044, + "learning_rate": 1.9370384448305824e-05, + "loss": 0.0001, + "step": 22570 + }, + { + "epoch": 4.904430929626412, + "grad_norm": 0.0007253455114550889, + "learning_rate": 1.935680929626412e-05, + "loss": 0.0001, + "step": 22580 + }, + { + "epoch": 4.906602953953084, + "grad_norm": 0.0006349599570967257, + "learning_rate": 1.9343234144222417e-05, + "loss": 0.0001, + "step": 22590 + }, + { + "epoch": 4.908774978279757, + "grad_norm": 0.000644247978925705, + "learning_rate": 1.9329658992180714e-05, + "loss": 0.0143, + "step": 22600 + }, + { + "epoch": 4.910947002606429, + "grad_norm": 0.020878519862890244, + "learning_rate": 1.931608384013901e-05, + "loss": 0.0002, + "step": 22610 + }, + { + "epoch": 4.913119026933102, + "grad_norm": 0.0006180730415508151, + "learning_rate": 1.9302508688097307e-05, + "loss": 0.0019, + "step": 22620 + }, + { + "epoch": 4.915291051259774, + "grad_norm": 0.005366886965930462, + "learning_rate": 1.9288933536055603e-05, + "loss": 0.0002, + "step": 22630 + }, + { + "epoch": 4.917463075586447, + "grad_norm": 0.0005870962049812078, + "learning_rate": 1.9275358384013903e-05, + "loss": 0.0001, + "step": 22640 + }, + { + "epoch": 4.919635099913119, + "grad_norm": 0.000580133986659348, + "learning_rate": 1.92617832319722e-05, + "loss": 0.0001, + "step": 22650 + }, + { + "epoch": 4.921807124239791, + "grad_norm": 0.0006379460101015866, + "learning_rate": 1.9248208079930496e-05, + "loss": 0.0322, + "step": 22660 + }, + { + "epoch": 4.923979148566464, + "grad_norm": 0.0008098538964986801, + "learning_rate": 1.9234632927888793e-05, + "loss": 0.0001, + "step": 22670 + }, + { + "epoch": 4.926151172893136, + "grad_norm": 0.0011943280696868896, + "learning_rate": 1.922105777584709e-05, + "loss": 0.0009, + "step": 22680 + }, + { + "epoch": 4.928323197219809, + "grad_norm": 0.01928878016769886, + "learning_rate": 1.9207482623805386e-05, + "loss": 0.0001, + "step": 22690 + }, + { + "epoch": 4.930495221546481, + "grad_norm": 0.0005802169325761497, + "learning_rate": 1.9193907471763682e-05, + "loss": 0.0001, + "step": 22700 + }, + { + "epoch": 4.932667245873153, + "grad_norm": 0.0059051173739135265, + "learning_rate": 1.9180332319721982e-05, + "loss": 0.0001, + "step": 22710 + }, + { + "epoch": 4.934839270199826, + "grad_norm": 0.0032551810145378113, + "learning_rate": 1.916675716768028e-05, + "loss": 0.0012, + "step": 22720 + }, + { + "epoch": 4.937011294526498, + "grad_norm": 0.0008213729597628117, + "learning_rate": 1.9153182015638575e-05, + "loss": 0.0001, + "step": 22730 + }, + { + "epoch": 4.939183318853171, + "grad_norm": 0.0005722604691982269, + "learning_rate": 1.9139606863596875e-05, + "loss": 0.0001, + "step": 22740 + }, + { + "epoch": 4.941355343179843, + "grad_norm": 0.0005691969417966902, + "learning_rate": 1.912603171155517e-05, + "loss": 0.0001, + "step": 22750 + }, + { + "epoch": 4.943527367506516, + "grad_norm": 0.0005698164459317923, + "learning_rate": 1.9112456559513468e-05, + "loss": 0.0001, + "step": 22760 + }, + { + "epoch": 4.945699391833188, + "grad_norm": 0.0005637307767756283, + "learning_rate": 1.9098881407471765e-05, + "loss": 0.0022, + "step": 22770 + }, + { + "epoch": 4.947871416159861, + "grad_norm": 0.0005699021276086569, + "learning_rate": 1.908530625543006e-05, + "loss": 0.0001, + "step": 22780 + }, + { + "epoch": 4.950043440486533, + "grad_norm": 0.0005615533445961773, + "learning_rate": 1.907173110338836e-05, + "loss": 0.004, + "step": 22790 + }, + { + "epoch": 4.9522154648132055, + "grad_norm": 0.0005626556230708957, + "learning_rate": 1.9058155951346657e-05, + "loss": 0.0274, + "step": 22800 + }, + { + "epoch": 4.9543874891398785, + "grad_norm": 0.0006404595333151519, + "learning_rate": 1.9044580799304954e-05, + "loss": 0.0001, + "step": 22810 + }, + { + "epoch": 4.9565595134665505, + "grad_norm": 0.0005639271112158895, + "learning_rate": 1.903100564726325e-05, + "loss": 0.0111, + "step": 22820 + }, + { + "epoch": 4.9587315377932235, + "grad_norm": 0.011098935268819332, + "learning_rate": 1.9017430495221547e-05, + "loss": 0.0001, + "step": 22830 + }, + { + "epoch": 4.960903562119896, + "grad_norm": 0.0006036728736944497, + "learning_rate": 1.9003855343179843e-05, + "loss": 0.0014, + "step": 22840 + }, + { + "epoch": 4.9630755864465685, + "grad_norm": 0.006722571793943644, + "learning_rate": 1.899028019113814e-05, + "loss": 0.0001, + "step": 22850 + }, + { + "epoch": 4.965247610773241, + "grad_norm": 0.0006086063804104924, + "learning_rate": 1.897670503909644e-05, + "loss": 0.0001, + "step": 22860 + }, + { + "epoch": 4.967419635099914, + "grad_norm": 0.001963576767593622, + "learning_rate": 1.8963129887054736e-05, + "loss": 0.0001, + "step": 22870 + }, + { + "epoch": 4.969591659426586, + "grad_norm": 0.0009560598991811275, + "learning_rate": 1.8949554735013033e-05, + "loss": 0.0296, + "step": 22880 + }, + { + "epoch": 4.971763683753258, + "grad_norm": 0.0029919431544840336, + "learning_rate": 1.893597958297133e-05, + "loss": 0.0001, + "step": 22890 + }, + { + "epoch": 4.973935708079931, + "grad_norm": 0.0005524032167159021, + "learning_rate": 1.8922404430929626e-05, + "loss": 0.0001, + "step": 22900 + }, + { + "epoch": 4.976107732406603, + "grad_norm": 0.0005599394789896905, + "learning_rate": 1.8908829278887922e-05, + "loss": 0.0038, + "step": 22910 + }, + { + "epoch": 4.978279756733276, + "grad_norm": 0.003142011584714055, + "learning_rate": 1.8895254126846222e-05, + "loss": 0.0001, + "step": 22920 + }, + { + "epoch": 4.980451781059948, + "grad_norm": 0.0005478629609569907, + "learning_rate": 1.888167897480452e-05, + "loss": 0.003, + "step": 22930 + }, + { + "epoch": 4.98262380538662, + "grad_norm": 0.0005452021723613143, + "learning_rate": 1.886810382276282e-05, + "loss": 0.0003, + "step": 22940 + }, + { + "epoch": 4.984795829713293, + "grad_norm": 0.000553121033590287, + "learning_rate": 1.8854528670721115e-05, + "loss": 0.046, + "step": 22950 + }, + { + "epoch": 4.986967854039965, + "grad_norm": 0.002784899901598692, + "learning_rate": 1.8840953518679412e-05, + "loss": 0.0001, + "step": 22960 + }, + { + "epoch": 4.989139878366638, + "grad_norm": 0.0037593538872897625, + "learning_rate": 1.8827378366637708e-05, + "loss": 0.0154, + "step": 22970 + }, + { + "epoch": 4.99131190269331, + "grad_norm": 0.0006829476333223283, + "learning_rate": 1.8813803214596005e-05, + "loss": 0.0001, + "step": 22980 + }, + { + "epoch": 4.993483927019983, + "grad_norm": 0.0006080670282244682, + "learning_rate": 1.88002280625543e-05, + "loss": 0.0132, + "step": 22990 + }, + { + "epoch": 4.995655951346655, + "grad_norm": 0.0028045615181326866, + "learning_rate": 1.8786652910512598e-05, + "loss": 0.0364, + "step": 23000 + }, + { + "epoch": 4.997827975673328, + "grad_norm": 8.378561019897461, + "learning_rate": 1.8773077758470894e-05, + "loss": 0.0076, + "step": 23010 + }, + { + "epoch": 5.0, + "grad_norm": 0.0020884855184704065, + "learning_rate": 1.8759502606429194e-05, + "loss": 0.0001, + "step": 23020 + }, + { + "epoch": 5.0, + "eval_f1": 0.575875486381323, + "eval_loss": 0.08333506435155869, + "eval_runtime": 82.6846, + "eval_samples_per_second": 120.639, + "eval_steps_per_second": 7.547, + "step": 23020 + }, + { + "epoch": 5.002172024326672, + "grad_norm": 0.0006133164861239493, + "learning_rate": 1.874592745438749e-05, + "loss": 0.0355, + "step": 23030 + }, + { + "epoch": 5.004344048653345, + "grad_norm": 0.0006215014145709574, + "learning_rate": 1.8732352302345787e-05, + "loss": 0.0001, + "step": 23040 + }, + { + "epoch": 5.006516072980017, + "grad_norm": 0.0006174238515086472, + "learning_rate": 1.8718777150304084e-05, + "loss": 0.0001, + "step": 23050 + }, + { + "epoch": 5.00868809730669, + "grad_norm": 0.0008445650455541909, + "learning_rate": 1.870520199826238e-05, + "loss": 0.0001, + "step": 23060 + }, + { + "epoch": 5.010860121633362, + "grad_norm": 0.0005881586112082005, + "learning_rate": 1.8691626846220677e-05, + "loss": 0.0001, + "step": 23070 + }, + { + "epoch": 5.013032145960035, + "grad_norm": 0.0006194966263137758, + "learning_rate": 1.8678051694178973e-05, + "loss": 0.0001, + "step": 23080 + }, + { + "epoch": 5.015204170286707, + "grad_norm": 0.0007004704675637186, + "learning_rate": 1.8664476542137273e-05, + "loss": 0.0001, + "step": 23090 + }, + { + "epoch": 5.017376194613379, + "grad_norm": 0.030953222885727882, + "learning_rate": 1.865090139009557e-05, + "loss": 0.0545, + "step": 23100 + }, + { + "epoch": 5.019548218940052, + "grad_norm": 0.0037942533381283283, + "learning_rate": 1.8637326238053866e-05, + "loss": 0.0003, + "step": 23110 + }, + { + "epoch": 5.021720243266724, + "grad_norm": 0.004638850688934326, + "learning_rate": 1.8623751086012166e-05, + "loss": 0.0052, + "step": 23120 + }, + { + "epoch": 5.023892267593397, + "grad_norm": 0.006132997572422028, + "learning_rate": 1.8610175933970463e-05, + "loss": 0.0008, + "step": 23130 + }, + { + "epoch": 5.026064291920069, + "grad_norm": 0.031111733987927437, + "learning_rate": 1.859660078192876e-05, + "loss": 0.0004, + "step": 23140 + }, + { + "epoch": 5.028236316246742, + "grad_norm": 0.019011540338397026, + "learning_rate": 1.8583025629887056e-05, + "loss": 0.0003, + "step": 23150 + }, + { + "epoch": 5.030408340573414, + "grad_norm": 0.001691153272986412, + "learning_rate": 1.8569450477845352e-05, + "loss": 0.0004, + "step": 23160 + }, + { + "epoch": 5.032580364900087, + "grad_norm": 0.0015675558242946863, + "learning_rate": 1.8555875325803652e-05, + "loss": 0.0002, + "step": 23170 + }, + { + "epoch": 5.034752389226759, + "grad_norm": 0.0054156165570020676, + "learning_rate": 1.854230017376195e-05, + "loss": 0.0002, + "step": 23180 + }, + { + "epoch": 5.0369244135534315, + "grad_norm": 0.0011364802485331893, + "learning_rate": 1.8528725021720245e-05, + "loss": 0.0002, + "step": 23190 + }, + { + "epoch": 5.039096437880104, + "grad_norm": 0.001140189589932561, + "learning_rate": 1.851514986967854e-05, + "loss": 0.0002, + "step": 23200 + }, + { + "epoch": 5.0412684622067765, + "grad_norm": 0.030046336352825165, + "learning_rate": 1.8501574717636838e-05, + "loss": 0.0002, + "step": 23210 + }, + { + "epoch": 5.0434404865334495, + "grad_norm": 0.0009006512118503451, + "learning_rate": 1.8487999565595135e-05, + "loss": 0.0002, + "step": 23220 + }, + { + "epoch": 5.0456125108601215, + "grad_norm": 0.0010009000543504953, + "learning_rate": 1.847442441355343e-05, + "loss": 0.0003, + "step": 23230 + }, + { + "epoch": 5.0477845351867945, + "grad_norm": 0.0016936525935307145, + "learning_rate": 1.8460849261511728e-05, + "loss": 0.04, + "step": 23240 + }, + { + "epoch": 5.049956559513467, + "grad_norm": 0.001645937329158187, + "learning_rate": 1.8447274109470027e-05, + "loss": 0.0065, + "step": 23250 + }, + { + "epoch": 5.052128583840139, + "grad_norm": 0.0010947795817628503, + "learning_rate": 1.8433698957428324e-05, + "loss": 0.0003, + "step": 23260 + }, + { + "epoch": 5.054300608166812, + "grad_norm": 0.0009428582852706313, + "learning_rate": 1.842012380538662e-05, + "loss": 0.0003, + "step": 23270 + }, + { + "epoch": 5.056472632493484, + "grad_norm": 0.02614458091557026, + "learning_rate": 1.8406548653344917e-05, + "loss": 0.0002, + "step": 23280 + }, + { + "epoch": 5.058644656820157, + "grad_norm": 0.0008888828451745212, + "learning_rate": 1.8392973501303213e-05, + "loss": 0.0001, + "step": 23290 + }, + { + "epoch": 5.060816681146829, + "grad_norm": 0.01737385056912899, + "learning_rate": 1.8379398349261513e-05, + "loss": 0.0003, + "step": 23300 + }, + { + "epoch": 5.062988705473502, + "grad_norm": 0.0010263947769999504, + "learning_rate": 1.836582319721981e-05, + "loss": 0.0002, + "step": 23310 + }, + { + "epoch": 5.065160729800174, + "grad_norm": 0.000807849457487464, + "learning_rate": 1.835224804517811e-05, + "loss": 0.0002, + "step": 23320 + }, + { + "epoch": 5.067332754126846, + "grad_norm": 0.0008964362787082791, + "learning_rate": 1.8338672893136406e-05, + "loss": 0.0003, + "step": 23330 + }, + { + "epoch": 5.069504778453519, + "grad_norm": 0.008567686192691326, + "learning_rate": 1.8325097741094703e-05, + "loss": 0.0004, + "step": 23340 + }, + { + "epoch": 5.071676802780191, + "grad_norm": 0.0010218977695330977, + "learning_rate": 1.8311522589053e-05, + "loss": 0.0031, + "step": 23350 + }, + { + "epoch": 5.073848827106864, + "grad_norm": 0.0007686801254749298, + "learning_rate": 1.8297947437011296e-05, + "loss": 0.0015, + "step": 23360 + }, + { + "epoch": 5.076020851433536, + "grad_norm": 0.004290709272027016, + "learning_rate": 1.8284372284969592e-05, + "loss": 0.0002, + "step": 23370 + }, + { + "epoch": 5.078192875760209, + "grad_norm": 0.0007577894139103591, + "learning_rate": 1.827079713292789e-05, + "loss": 0.0001, + "step": 23380 + }, + { + "epoch": 5.080364900086881, + "grad_norm": 0.0007783273467794061, + "learning_rate": 1.8257221980886185e-05, + "loss": 0.0041, + "step": 23390 + }, + { + "epoch": 5.082536924413553, + "grad_norm": 0.0024375859647989273, + "learning_rate": 1.8243646828844485e-05, + "loss": 0.0113, + "step": 23400 + }, + { + "epoch": 5.084708948740226, + "grad_norm": 0.0007347238133661449, + "learning_rate": 1.8230071676802782e-05, + "loss": 0.0002, + "step": 23410 + }, + { + "epoch": 5.086880973066898, + "grad_norm": 0.01347825676202774, + "learning_rate": 1.8216496524761078e-05, + "loss": 0.0004, + "step": 23420 + }, + { + "epoch": 5.089052997393571, + "grad_norm": 0.0006978354067541659, + "learning_rate": 1.8202921372719375e-05, + "loss": 0.0145, + "step": 23430 + }, + { + "epoch": 5.091225021720243, + "grad_norm": 0.0006992680137045681, + "learning_rate": 1.818934622067767e-05, + "loss": 0.0002, + "step": 23440 + }, + { + "epoch": 5.093397046046916, + "grad_norm": 0.0029848958365619183, + "learning_rate": 1.8175771068635968e-05, + "loss": 0.0003, + "step": 23450 + }, + { + "epoch": 5.095569070373588, + "grad_norm": 0.0031678786035627127, + "learning_rate": 1.8162195916594264e-05, + "loss": 0.0001, + "step": 23460 + }, + { + "epoch": 5.097741094700261, + "grad_norm": 0.0006989357643760741, + "learning_rate": 1.8148620764552564e-05, + "loss": 0.0002, + "step": 23470 + }, + { + "epoch": 5.099913119026933, + "grad_norm": 0.010042333044111729, + "learning_rate": 1.813504561251086e-05, + "loss": 0.0071, + "step": 23480 + }, + { + "epoch": 5.102085143353605, + "grad_norm": 0.0007547451532445848, + "learning_rate": 1.8121470460469157e-05, + "loss": 0.0001, + "step": 23490 + }, + { + "epoch": 5.104257167680278, + "grad_norm": 0.001228576060384512, + "learning_rate": 1.8107895308427457e-05, + "loss": 0.0002, + "step": 23500 + }, + { + "epoch": 5.10642919200695, + "grad_norm": 0.011400981806218624, + "learning_rate": 1.8094320156385754e-05, + "loss": 0.0003, + "step": 23510 + }, + { + "epoch": 5.108601216333623, + "grad_norm": 0.0007331727538257837, + "learning_rate": 1.808074500434405e-05, + "loss": 0.0001, + "step": 23520 + }, + { + "epoch": 5.110773240660295, + "grad_norm": 0.20844632387161255, + "learning_rate": 1.8067169852302347e-05, + "loss": 0.0002, + "step": 23530 + }, + { + "epoch": 5.112945264986968, + "grad_norm": 0.0008100902196019888, + "learning_rate": 1.8053594700260643e-05, + "loss": 0.0098, + "step": 23540 + }, + { + "epoch": 5.11511728931364, + "grad_norm": 0.0008311902638524771, + "learning_rate": 1.8040019548218943e-05, + "loss": 0.0002, + "step": 23550 + }, + { + "epoch": 5.117289313640312, + "grad_norm": 0.0008574782987125218, + "learning_rate": 1.802644439617724e-05, + "loss": 0.0002, + "step": 23560 + }, + { + "epoch": 5.119461337966985, + "grad_norm": 0.0006869113421998918, + "learning_rate": 1.8012869244135536e-05, + "loss": 0.0018, + "step": 23570 + }, + { + "epoch": 5.121633362293657, + "grad_norm": 0.0008113477961160243, + "learning_rate": 1.7999294092093833e-05, + "loss": 0.0001, + "step": 23580 + }, + { + "epoch": 5.12380538662033, + "grad_norm": 0.0038217424880713224, + "learning_rate": 1.798571894005213e-05, + "loss": 0.0002, + "step": 23590 + }, + { + "epoch": 5.125977410947002, + "grad_norm": 0.0006686209235340357, + "learning_rate": 1.7972143788010426e-05, + "loss": 0.0001, + "step": 23600 + }, + { + "epoch": 5.128149435273675, + "grad_norm": 0.032629430294036865, + "learning_rate": 1.7958568635968722e-05, + "loss": 0.0002, + "step": 23610 + }, + { + "epoch": 5.1303214596003475, + "grad_norm": 0.0008298791362904012, + "learning_rate": 1.794499348392702e-05, + "loss": 0.0004, + "step": 23620 + }, + { + "epoch": 5.1324934839270195, + "grad_norm": 0.008889926597476006, + "learning_rate": 1.793141833188532e-05, + "loss": 0.0001, + "step": 23630 + }, + { + "epoch": 5.1346655082536925, + "grad_norm": 0.0006458312273025513, + "learning_rate": 1.7917843179843615e-05, + "loss": 0.0053, + "step": 23640 + }, + { + "epoch": 5.136837532580365, + "grad_norm": 0.002981486963108182, + "learning_rate": 1.790426802780191e-05, + "loss": 0.0001, + "step": 23650 + }, + { + "epoch": 5.1390095569070375, + "grad_norm": 19.913515090942383, + "learning_rate": 1.7890692875760208e-05, + "loss": 0.0131, + "step": 23660 + }, + { + "epoch": 5.14118158123371, + "grad_norm": 0.0006599088083021343, + "learning_rate": 1.7877117723718505e-05, + "loss": 0.0013, + "step": 23670 + }, + { + "epoch": 5.143353605560383, + "grad_norm": 0.0008784077363088727, + "learning_rate": 1.7863542571676804e-05, + "loss": 0.0001, + "step": 23680 + }, + { + "epoch": 5.145525629887055, + "grad_norm": 0.0006363080465234816, + "learning_rate": 1.78499674196351e-05, + "loss": 0.0001, + "step": 23690 + }, + { + "epoch": 5.147697654213728, + "grad_norm": 0.0006334060453809798, + "learning_rate": 1.78363922675934e-05, + "loss": 0.0025, + "step": 23700 + }, + { + "epoch": 5.1498696785404, + "grad_norm": 0.0006302617839537561, + "learning_rate": 1.7822817115551697e-05, + "loss": 0.0025, + "step": 23710 + }, + { + "epoch": 5.152041702867072, + "grad_norm": 0.0006213096203282475, + "learning_rate": 1.7809241963509994e-05, + "loss": 0.0034, + "step": 23720 + }, + { + "epoch": 5.154213727193745, + "grad_norm": 0.0006246123812161386, + "learning_rate": 1.779566681146829e-05, + "loss": 0.0001, + "step": 23730 + }, + { + "epoch": 5.156385751520417, + "grad_norm": 0.0006313191843219101, + "learning_rate": 1.7782091659426587e-05, + "loss": 0.0001, + "step": 23740 + }, + { + "epoch": 5.15855777584709, + "grad_norm": 0.0014395987382158637, + "learning_rate": 1.7768516507384883e-05, + "loss": 0.0008, + "step": 23750 + }, + { + "epoch": 5.160729800173762, + "grad_norm": 0.002919491846114397, + "learning_rate": 1.775494135534318e-05, + "loss": 0.0331, + "step": 23760 + }, + { + "epoch": 5.162901824500435, + "grad_norm": 0.004078062251210213, + "learning_rate": 1.7741366203301476e-05, + "loss": 0.0003, + "step": 23770 + }, + { + "epoch": 5.165073848827107, + "grad_norm": 0.0006804884178563952, + "learning_rate": 1.7727791051259776e-05, + "loss": 0.0066, + "step": 23780 + }, + { + "epoch": 5.167245873153779, + "grad_norm": 0.006823851726949215, + "learning_rate": 1.7714215899218073e-05, + "loss": 0.0004, + "step": 23790 + }, + { + "epoch": 5.169417897480452, + "grad_norm": 0.006236494984477758, + "learning_rate": 1.770064074717637e-05, + "loss": 0.0001, + "step": 23800 + }, + { + "epoch": 5.171589921807124, + "grad_norm": 0.0006333237397484481, + "learning_rate": 1.7687065595134666e-05, + "loss": 0.0002, + "step": 23810 + }, + { + "epoch": 5.173761946133797, + "grad_norm": 0.000644736282993108, + "learning_rate": 1.7673490443092962e-05, + "loss": 0.0003, + "step": 23820 + }, + { + "epoch": 5.175933970460469, + "grad_norm": 0.0006276658968999982, + "learning_rate": 1.765991529105126e-05, + "loss": 0.0138, + "step": 23830 + }, + { + "epoch": 5.178105994787142, + "grad_norm": 0.003340308554470539, + "learning_rate": 1.7646340139009555e-05, + "loss": 0.0065, + "step": 23840 + }, + { + "epoch": 5.180278019113814, + "grad_norm": 0.7302592396736145, + "learning_rate": 1.7632764986967852e-05, + "loss": 0.0131, + "step": 23850 + }, + { + "epoch": 5.182450043440486, + "grad_norm": 0.0007103821262717247, + "learning_rate": 1.7619189834926152e-05, + "loss": 0.0003, + "step": 23860 + }, + { + "epoch": 5.184622067767159, + "grad_norm": 0.0006702254759147763, + "learning_rate": 1.7605614682884448e-05, + "loss": 0.0006, + "step": 23870 + }, + { + "epoch": 5.186794092093831, + "grad_norm": 0.0006133012939244509, + "learning_rate": 1.7592039530842748e-05, + "loss": 0.0002, + "step": 23880 + }, + { + "epoch": 5.188966116420504, + "grad_norm": 0.0006230532308109105, + "learning_rate": 1.7578464378801045e-05, + "loss": 0.0003, + "step": 23890 + }, + { + "epoch": 5.191138140747176, + "grad_norm": 0.0006302871042862535, + "learning_rate": 1.756488922675934e-05, + "loss": 0.0009, + "step": 23900 + }, + { + "epoch": 5.193310165073849, + "grad_norm": 0.0007144107366912067, + "learning_rate": 1.7551314074717638e-05, + "loss": 0.0001, + "step": 23910 + }, + { + "epoch": 5.195482189400521, + "grad_norm": 3.326815128326416, + "learning_rate": 1.7537738922675934e-05, + "loss": 0.014, + "step": 23920 + }, + { + "epoch": 5.197654213727194, + "grad_norm": 0.0007625357247889042, + "learning_rate": 1.7524163770634234e-05, + "loss": 0.0002, + "step": 23930 + }, + { + "epoch": 5.199826238053866, + "grad_norm": 0.0006717897485941648, + "learning_rate": 1.751058861859253e-05, + "loss": 0.0005, + "step": 23940 + }, + { + "epoch": 5.201998262380538, + "grad_norm": 0.0006527569494210184, + "learning_rate": 1.7497013466550827e-05, + "loss": 0.0004, + "step": 23950 + }, + { + "epoch": 5.204170286707211, + "grad_norm": 0.0006034320103935897, + "learning_rate": 1.7483438314509124e-05, + "loss": 0.0002, + "step": 23960 + }, + { + "epoch": 5.206342311033883, + "grad_norm": 0.0006436226540245116, + "learning_rate": 1.746986316246742e-05, + "loss": 0.0001, + "step": 23970 + }, + { + "epoch": 5.208514335360556, + "grad_norm": 0.0006348975584842265, + "learning_rate": 1.7456288010425717e-05, + "loss": 0.0005, + "step": 23980 + }, + { + "epoch": 5.210686359687228, + "grad_norm": 1.1442747116088867, + "learning_rate": 1.7442712858384013e-05, + "loss": 0.0174, + "step": 23990 + }, + { + "epoch": 5.212858384013901, + "grad_norm": 0.0006141769699752331, + "learning_rate": 1.742913770634231e-05, + "loss": 0.0026, + "step": 24000 + }, + { + "epoch": 5.215030408340573, + "grad_norm": 0.0015413524815812707, + "learning_rate": 1.741556255430061e-05, + "loss": 0.0007, + "step": 24010 + }, + { + "epoch": 5.2172024326672455, + "grad_norm": 0.0006206813850440085, + "learning_rate": 1.7401987402258906e-05, + "loss": 0.0002, + "step": 24020 + }, + { + "epoch": 5.219374456993918, + "grad_norm": 0.0006547339726239443, + "learning_rate": 1.7388412250217203e-05, + "loss": 0.0013, + "step": 24030 + }, + { + "epoch": 5.2215464813205905, + "grad_norm": 0.0006602110806852579, + "learning_rate": 1.73748370981755e-05, + "loss": 0.0013, + "step": 24040 + }, + { + "epoch": 5.2237185056472635, + "grad_norm": 0.0006027701310813427, + "learning_rate": 1.7361261946133796e-05, + "loss": 0.0005, + "step": 24050 + }, + { + "epoch": 5.2258905299739355, + "grad_norm": 0.000601082225330174, + "learning_rate": 1.7347686794092095e-05, + "loss": 0.0003, + "step": 24060 + }, + { + "epoch": 5.2280625543006085, + "grad_norm": 0.000594939396250993, + "learning_rate": 1.7334111642050392e-05, + "loss": 0.0193, + "step": 24070 + }, + { + "epoch": 5.230234578627281, + "grad_norm": 0.0006422780570574105, + "learning_rate": 1.7320536490008692e-05, + "loss": 0.0006, + "step": 24080 + }, + { + "epoch": 5.232406602953953, + "grad_norm": 0.0006172214052639902, + "learning_rate": 1.730696133796699e-05, + "loss": 0.0023, + "step": 24090 + }, + { + "epoch": 5.234578627280626, + "grad_norm": 0.0006009265780448914, + "learning_rate": 1.7293386185925285e-05, + "loss": 0.016, + "step": 24100 + }, + { + "epoch": 5.236750651607298, + "grad_norm": 0.0006303676636889577, + "learning_rate": 1.727981103388358e-05, + "loss": 0.0041, + "step": 24110 + }, + { + "epoch": 5.238922675933971, + "grad_norm": 0.0005849118460901082, + "learning_rate": 1.7266235881841878e-05, + "loss": 0.0002, + "step": 24120 + }, + { + "epoch": 5.241094700260643, + "grad_norm": 0.001120755448937416, + "learning_rate": 1.7252660729800174e-05, + "loss": 0.0002, + "step": 24130 + }, + { + "epoch": 5.243266724587316, + "grad_norm": 0.0005915990332141519, + "learning_rate": 1.723908557775847e-05, + "loss": 0.0001, + "step": 24140 + }, + { + "epoch": 5.245438748913988, + "grad_norm": 0.0006512019317597151, + "learning_rate": 1.7225510425716767e-05, + "loss": 0.0011, + "step": 24150 + }, + { + "epoch": 5.247610773240661, + "grad_norm": 0.0005761512438766658, + "learning_rate": 1.7211935273675067e-05, + "loss": 0.0044, + "step": 24160 + }, + { + "epoch": 5.249782797567333, + "grad_norm": 0.0006002063164487481, + "learning_rate": 1.7198360121633364e-05, + "loss": 0.0003, + "step": 24170 + }, + { + "epoch": 5.251954821894005, + "grad_norm": 0.0006288880831561983, + "learning_rate": 1.718478496959166e-05, + "loss": 0.0005, + "step": 24180 + }, + { + "epoch": 5.254126846220678, + "grad_norm": 0.0006209379062056541, + "learning_rate": 1.7171209817549957e-05, + "loss": 0.0063, + "step": 24190 + }, + { + "epoch": 5.25629887054735, + "grad_norm": 0.000875494908541441, + "learning_rate": 1.7157634665508253e-05, + "loss": 0.0001, + "step": 24200 + }, + { + "epoch": 5.258470894874023, + "grad_norm": 0.0006017699488438666, + "learning_rate": 1.714405951346655e-05, + "loss": 0.0006, + "step": 24210 + }, + { + "epoch": 5.260642919200695, + "grad_norm": 0.0005895401118323207, + "learning_rate": 1.7130484361424846e-05, + "loss": 0.0097, + "step": 24220 + }, + { + "epoch": 5.262814943527368, + "grad_norm": 0.0006465850165113807, + "learning_rate": 1.7116909209383143e-05, + "loss": 0.0076, + "step": 24230 + }, + { + "epoch": 5.26498696785404, + "grad_norm": 0.0005974260275252163, + "learning_rate": 1.7103334057341443e-05, + "loss": 0.0001, + "step": 24240 + }, + { + "epoch": 5.267158992180712, + "grad_norm": 0.0005888324230909348, + "learning_rate": 1.708975890529974e-05, + "loss": 0.0002, + "step": 24250 + }, + { + "epoch": 5.269331016507385, + "grad_norm": 0.0007290198700502515, + "learning_rate": 1.707618375325804e-05, + "loss": 0.0002, + "step": 24260 + }, + { + "epoch": 5.271503040834057, + "grad_norm": 0.0016792593523859978, + "learning_rate": 1.7062608601216336e-05, + "loss": 0.0009, + "step": 24270 + }, + { + "epoch": 5.27367506516073, + "grad_norm": 0.0006172910216264427, + "learning_rate": 1.7049033449174632e-05, + "loss": 0.0002, + "step": 24280 + }, + { + "epoch": 5.275847089487402, + "grad_norm": 0.0010846874210983515, + "learning_rate": 1.703545829713293e-05, + "loss": 0.0002, + "step": 24290 + }, + { + "epoch": 5.278019113814075, + "grad_norm": 0.0006275677005760372, + "learning_rate": 1.7021883145091225e-05, + "loss": 0.0002, + "step": 24300 + }, + { + "epoch": 5.280191138140747, + "grad_norm": 0.0005748061230406165, + "learning_rate": 1.7008307993049525e-05, + "loss": 0.0002, + "step": 24310 + }, + { + "epoch": 5.282363162467419, + "grad_norm": 0.0006720026140101254, + "learning_rate": 1.699473284100782e-05, + "loss": 0.0001, + "step": 24320 + }, + { + "epoch": 5.284535186794092, + "grad_norm": 0.0007187994779087603, + "learning_rate": 1.6981157688966118e-05, + "loss": 0.0001, + "step": 24330 + }, + { + "epoch": 5.286707211120764, + "grad_norm": 0.0005684763309545815, + "learning_rate": 1.6967582536924415e-05, + "loss": 0.0001, + "step": 24340 + }, + { + "epoch": 5.288879235447437, + "grad_norm": 0.0005650657112710178, + "learning_rate": 1.695400738488271e-05, + "loss": 0.0028, + "step": 24350 + }, + { + "epoch": 5.291051259774109, + "grad_norm": 0.0005644088960252702, + "learning_rate": 1.6940432232841008e-05, + "loss": 0.0001, + "step": 24360 + }, + { + "epoch": 5.293223284100782, + "grad_norm": 0.0005540383281186223, + "learning_rate": 1.6926857080799304e-05, + "loss": 0.0001, + "step": 24370 + }, + { + "epoch": 5.295395308427454, + "grad_norm": 0.000572359946090728, + "learning_rate": 1.69132819287576e-05, + "loss": 0.0001, + "step": 24380 + }, + { + "epoch": 5.297567332754127, + "grad_norm": 0.0006077389698475599, + "learning_rate": 1.68997067767159e-05, + "loss": 0.0001, + "step": 24390 + }, + { + "epoch": 5.299739357080799, + "grad_norm": 0.0005711704143323004, + "learning_rate": 1.6886131624674197e-05, + "loss": 0.0003, + "step": 24400 + }, + { + "epoch": 5.301911381407471, + "grad_norm": 0.0005502477288246155, + "learning_rate": 1.6872556472632494e-05, + "loss": 0.0001, + "step": 24410 + }, + { + "epoch": 5.304083405734144, + "grad_norm": 0.0005449872696772218, + "learning_rate": 1.685898132059079e-05, + "loss": 0.0001, + "step": 24420 + }, + { + "epoch": 5.3062554300608165, + "grad_norm": 0.0005534543306566775, + "learning_rate": 1.6845406168549087e-05, + "loss": 0.0001, + "step": 24430 + }, + { + "epoch": 5.308427454387489, + "grad_norm": 0.0005486282170750201, + "learning_rate": 1.6831831016507387e-05, + "loss": 0.0001, + "step": 24440 + }, + { + "epoch": 5.3105994787141615, + "grad_norm": 0.0005433621699921787, + "learning_rate": 1.6818255864465683e-05, + "loss": 0.021, + "step": 24450 + }, + { + "epoch": 5.3127715030408345, + "grad_norm": 0.0005527997273020446, + "learning_rate": 1.680468071242398e-05, + "loss": 0.002, + "step": 24460 + }, + { + "epoch": 5.3149435273675065, + "grad_norm": 0.0005365969846025109, + "learning_rate": 1.679110556038228e-05, + "loss": 0.0001, + "step": 24470 + }, + { + "epoch": 5.317115551694179, + "grad_norm": 0.0005567724583670497, + "learning_rate": 1.6777530408340576e-05, + "loss": 0.0025, + "step": 24480 + }, + { + "epoch": 5.319287576020852, + "grad_norm": 0.0007047782419249415, + "learning_rate": 1.6763955256298872e-05, + "loss": 0.0011, + "step": 24490 + }, + { + "epoch": 5.321459600347524, + "grad_norm": 0.0005513280630111694, + "learning_rate": 1.675038010425717e-05, + "loss": 0.0001, + "step": 24500 + }, + { + "epoch": 5.323631624674197, + "grad_norm": 0.04640301689505577, + "learning_rate": 1.6736804952215465e-05, + "loss": 0.0002, + "step": 24510 + }, + { + "epoch": 5.325803649000869, + "grad_norm": 0.0005909107276238501, + "learning_rate": 1.6723229800173762e-05, + "loss": 0.0001, + "step": 24520 + }, + { + "epoch": 5.327975673327542, + "grad_norm": 0.0005448001902550459, + "learning_rate": 1.670965464813206e-05, + "loss": 0.0065, + "step": 24530 + }, + { + "epoch": 5.330147697654214, + "grad_norm": 0.002460588002577424, + "learning_rate": 1.669607949609036e-05, + "loss": 0.0001, + "step": 24540 + }, + { + "epoch": 5.332319721980886, + "grad_norm": 0.0005325566744431853, + "learning_rate": 1.6682504344048655e-05, + "loss": 0.0001, + "step": 24550 + }, + { + "epoch": 5.334491746307559, + "grad_norm": 0.000526305811945349, + "learning_rate": 1.666892919200695e-05, + "loss": 0.0002, + "step": 24560 + }, + { + "epoch": 5.336663770634231, + "grad_norm": 0.005369687918573618, + "learning_rate": 1.6655354039965248e-05, + "loss": 0.0001, + "step": 24570 + }, + { + "epoch": 5.338835794960904, + "grad_norm": 0.0005400310037657619, + "learning_rate": 1.6641778887923544e-05, + "loss": 0.0068, + "step": 24580 + }, + { + "epoch": 5.341007819287576, + "grad_norm": 0.0005356398760341108, + "learning_rate": 1.662820373588184e-05, + "loss": 0.0024, + "step": 24590 + }, + { + "epoch": 5.343179843614249, + "grad_norm": 0.01858246512711048, + "learning_rate": 1.6614628583840137e-05, + "loss": 0.0001, + "step": 24600 + }, + { + "epoch": 5.345351867940921, + "grad_norm": 0.000568159855902195, + "learning_rate": 1.6601053431798437e-05, + "loss": 0.0001, + "step": 24610 + }, + { + "epoch": 5.347523892267594, + "grad_norm": 0.0005330589483492076, + "learning_rate": 1.6587478279756734e-05, + "loss": 0.0001, + "step": 24620 + }, + { + "epoch": 5.349695916594266, + "grad_norm": 0.0006389102200046182, + "learning_rate": 1.657390312771503e-05, + "loss": 0.0607, + "step": 24630 + }, + { + "epoch": 5.351867940920938, + "grad_norm": 0.0015882436418905854, + "learning_rate": 1.656032797567333e-05, + "loss": 0.001, + "step": 24640 + }, + { + "epoch": 5.354039965247611, + "grad_norm": 0.0010832021944224834, + "learning_rate": 1.6546752823631627e-05, + "loss": 0.0002, + "step": 24650 + }, + { + "epoch": 5.356211989574283, + "grad_norm": 0.007294220384210348, + "learning_rate": 1.6533177671589923e-05, + "loss": 0.0049, + "step": 24660 + }, + { + "epoch": 5.358384013900956, + "grad_norm": 0.0007425106014125049, + "learning_rate": 1.651960251954822e-05, + "loss": 0.02, + "step": 24670 + }, + { + "epoch": 5.360556038227628, + "grad_norm": 0.000792081409599632, + "learning_rate": 1.6506027367506516e-05, + "loss": 0.0001, + "step": 24680 + }, + { + "epoch": 5.362728062554301, + "grad_norm": 0.017137495800852776, + "learning_rate": 1.6492452215464816e-05, + "loss": 0.0169, + "step": 24690 + }, + { + "epoch": 5.364900086880973, + "grad_norm": 0.0006564608193002641, + "learning_rate": 1.6478877063423113e-05, + "loss": 0.0002, + "step": 24700 + }, + { + "epoch": 5.367072111207645, + "grad_norm": 0.0006727157742716372, + "learning_rate": 1.646530191138141e-05, + "loss": 0.0001, + "step": 24710 + }, + { + "epoch": 5.369244135534318, + "grad_norm": 0.0006364115397445858, + "learning_rate": 1.6451726759339706e-05, + "loss": 0.0001, + "step": 24720 + }, + { + "epoch": 5.37141615986099, + "grad_norm": 0.0006498926086351275, + "learning_rate": 1.6438151607298002e-05, + "loss": 0.0034, + "step": 24730 + }, + { + "epoch": 5.373588184187663, + "grad_norm": 0.0006514867418445647, + "learning_rate": 1.64245764552563e-05, + "loss": 0.0004, + "step": 24740 + }, + { + "epoch": 5.375760208514335, + "grad_norm": 0.000643563864286989, + "learning_rate": 1.6412358818418766e-05, + "loss": 0.0194, + "step": 24750 + }, + { + "epoch": 5.377932232841008, + "grad_norm": 0.0006255882908590138, + "learning_rate": 1.6398783666377065e-05, + "loss": 0.0001, + "step": 24760 + }, + { + "epoch": 5.38010425716768, + "grad_norm": 0.0006202755030244589, + "learning_rate": 1.6385208514335362e-05, + "loss": 0.0001, + "step": 24770 + }, + { + "epoch": 5.382276281494352, + "grad_norm": 0.0006363045540638268, + "learning_rate": 1.637163336229366e-05, + "loss": 0.0001, + "step": 24780 + }, + { + "epoch": 5.384448305821025, + "grad_norm": 0.0007371389074251056, + "learning_rate": 1.6358058210251955e-05, + "loss": 0.0559, + "step": 24790 + }, + { + "epoch": 5.386620330147697, + "grad_norm": 0.0025946300011128187, + "learning_rate": 1.634448305821025e-05, + "loss": 0.0001, + "step": 24800 + }, + { + "epoch": 5.38879235447437, + "grad_norm": 0.0008282421040348709, + "learning_rate": 1.6330907906168548e-05, + "loss": 0.0001, + "step": 24810 + }, + { + "epoch": 5.390964378801042, + "grad_norm": 0.0008425298728980124, + "learning_rate": 1.6317332754126845e-05, + "loss": 0.0001, + "step": 24820 + }, + { + "epoch": 5.393136403127715, + "grad_norm": 0.0007536081247963011, + "learning_rate": 1.6303757602085144e-05, + "loss": 0.0001, + "step": 24830 + }, + { + "epoch": 5.395308427454387, + "grad_norm": 0.0009574370342306793, + "learning_rate": 1.629018245004344e-05, + "loss": 0.0001, + "step": 24840 + }, + { + "epoch": 5.39748045178106, + "grad_norm": 0.0009711087332107127, + "learning_rate": 1.627660729800174e-05, + "loss": 0.0001, + "step": 24850 + }, + { + "epoch": 5.3996524761077325, + "grad_norm": 0.0007579062366858125, + "learning_rate": 1.6263032145960037e-05, + "loss": 0.0001, + "step": 24860 + }, + { + "epoch": 5.4018245004344045, + "grad_norm": 0.0007839860627427697, + "learning_rate": 1.6249456993918334e-05, + "loss": 0.0001, + "step": 24870 + }, + { + "epoch": 5.4039965247610775, + "grad_norm": 0.00206410582177341, + "learning_rate": 1.623588184187663e-05, + "loss": 0.0001, + "step": 24880 + }, + { + "epoch": 5.40616854908775, + "grad_norm": 0.0009084375342354178, + "learning_rate": 1.6222306689834927e-05, + "loss": 0.008, + "step": 24890 + }, + { + "epoch": 5.4083405734144225, + "grad_norm": 0.0027578831650316715, + "learning_rate": 1.6208731537793223e-05, + "loss": 0.0052, + "step": 24900 + }, + { + "epoch": 5.410512597741095, + "grad_norm": 0.018679317086935043, + "learning_rate": 1.619515638575152e-05, + "loss": 0.012, + "step": 24910 + }, + { + "epoch": 5.412684622067768, + "grad_norm": 0.0007577328360639513, + "learning_rate": 1.618158123370982e-05, + "loss": 0.003, + "step": 24920 + }, + { + "epoch": 5.41485664639444, + "grad_norm": 0.0007110742153599858, + "learning_rate": 1.6168006081668116e-05, + "loss": 0.0002, + "step": 24930 + }, + { + "epoch": 5.417028670721112, + "grad_norm": 0.0006982145132496953, + "learning_rate": 1.6154430929626413e-05, + "loss": 0.0001, + "step": 24940 + }, + { + "epoch": 5.419200695047785, + "grad_norm": 0.9744926691055298, + "learning_rate": 1.614085577758471e-05, + "loss": 0.0024, + "step": 24950 + }, + { + "epoch": 5.421372719374457, + "grad_norm": 0.3139134645462036, + "learning_rate": 1.6127280625543006e-05, + "loss": 0.0073, + "step": 24960 + }, + { + "epoch": 5.42354474370113, + "grad_norm": 0.0016664909198880196, + "learning_rate": 1.6113705473501302e-05, + "loss": 0.0025, + "step": 24970 + }, + { + "epoch": 5.425716768027802, + "grad_norm": 0.0018273413879796863, + "learning_rate": 1.61001303214596e-05, + "loss": 0.0001, + "step": 24980 + }, + { + "epoch": 5.427888792354475, + "grad_norm": 0.009619076736271381, + "learning_rate": 1.60865551694179e-05, + "loss": 0.0036, + "step": 24990 + }, + { + "epoch": 5.430060816681147, + "grad_norm": 0.0006586744566448033, + "learning_rate": 1.6072980017376195e-05, + "loss": 0.0001, + "step": 25000 + }, + { + "epoch": 5.432232841007819, + "grad_norm": 0.0006743803387507796, + "learning_rate": 1.6059404865334492e-05, + "loss": 0.0001, + "step": 25010 + }, + { + "epoch": 5.434404865334492, + "grad_norm": 0.0007701460854150355, + "learning_rate": 1.6045829713292788e-05, + "loss": 0.0001, + "step": 25020 + }, + { + "epoch": 5.436576889661164, + "grad_norm": 0.003409789642319083, + "learning_rate": 1.6032254561251088e-05, + "loss": 0.0143, + "step": 25030 + }, + { + "epoch": 5.438748913987837, + "grad_norm": 0.0014798047486692667, + "learning_rate": 1.6018679409209385e-05, + "loss": 0.0002, + "step": 25040 + }, + { + "epoch": 5.440920938314509, + "grad_norm": 0.002662486396729946, + "learning_rate": 1.600510425716768e-05, + "loss": 0.0083, + "step": 25050 + }, + { + "epoch": 5.443092962641182, + "grad_norm": 0.0006909591029398143, + "learning_rate": 1.5991529105125978e-05, + "loss": 0.0037, + "step": 25060 + }, + { + "epoch": 5.445264986967854, + "grad_norm": 0.0006745163118466735, + "learning_rate": 1.5977953953084278e-05, + "loss": 0.0001, + "step": 25070 + }, + { + "epoch": 5.447437011294527, + "grad_norm": 0.23285184800624847, + "learning_rate": 1.5964378801042574e-05, + "loss": 0.0047, + "step": 25080 + }, + { + "epoch": 5.449609035621199, + "grad_norm": 0.0006249352591112256, + "learning_rate": 1.595080364900087e-05, + "loss": 0.0001, + "step": 25090 + }, + { + "epoch": 5.451781059947871, + "grad_norm": 0.0006067950162105262, + "learning_rate": 1.5937228496959167e-05, + "loss": 0.0013, + "step": 25100 + }, + { + "epoch": 5.453953084274544, + "grad_norm": 5.546998500823975, + "learning_rate": 1.5923653344917464e-05, + "loss": 0.0486, + "step": 25110 + }, + { + "epoch": 5.456125108601216, + "grad_norm": 0.0011793351732194424, + "learning_rate": 1.591007819287576e-05, + "loss": 0.0135, + "step": 25120 + }, + { + "epoch": 5.458297132927889, + "grad_norm": 0.0008291056728921831, + "learning_rate": 1.5896503040834057e-05, + "loss": 0.0001, + "step": 25130 + }, + { + "epoch": 5.460469157254561, + "grad_norm": 0.0031490300316363573, + "learning_rate": 1.5882927888792356e-05, + "loss": 0.0002, + "step": 25140 + }, + { + "epoch": 5.462641181581233, + "grad_norm": 0.0009691480663605034, + "learning_rate": 1.5869352736750653e-05, + "loss": 0.0021, + "step": 25150 + }, + { + "epoch": 5.464813205907906, + "grad_norm": 0.002213733736425638, + "learning_rate": 1.585577758470895e-05, + "loss": 0.0059, + "step": 25160 + }, + { + "epoch": 5.466985230234578, + "grad_norm": 0.0017564035952091217, + "learning_rate": 1.5842202432667246e-05, + "loss": 0.0001, + "step": 25170 + }, + { + "epoch": 5.469157254561251, + "grad_norm": 0.0007754802936688066, + "learning_rate": 1.5828627280625543e-05, + "loss": 0.0061, + "step": 25180 + }, + { + "epoch": 5.471329278887923, + "grad_norm": 0.0006599400658160448, + "learning_rate": 1.581505212858384e-05, + "loss": 0.0001, + "step": 25190 + }, + { + "epoch": 5.473501303214596, + "grad_norm": 0.0007077509071677923, + "learning_rate": 1.5801476976542136e-05, + "loss": 0.0001, + "step": 25200 + }, + { + "epoch": 5.475673327541268, + "grad_norm": 0.0008078523096628487, + "learning_rate": 1.5787901824500435e-05, + "loss": 0.0001, + "step": 25210 + }, + { + "epoch": 5.477845351867941, + "grad_norm": 0.0027974487747997046, + "learning_rate": 1.5774326672458732e-05, + "loss": 0.0001, + "step": 25220 + }, + { + "epoch": 5.480017376194613, + "grad_norm": 0.00064576615113765, + "learning_rate": 1.5760751520417032e-05, + "loss": 0.0001, + "step": 25230 + }, + { + "epoch": 5.4821894005212854, + "grad_norm": 0.0006342840497381985, + "learning_rate": 1.574717636837533e-05, + "loss": 0.0001, + "step": 25240 + }, + { + "epoch": 5.484361424847958, + "grad_norm": 0.0006040172302164137, + "learning_rate": 1.5733601216333625e-05, + "loss": 0.0001, + "step": 25250 + }, + { + "epoch": 5.4865334491746305, + "grad_norm": 0.0007217189413495362, + "learning_rate": 1.572002606429192e-05, + "loss": 0.0015, + "step": 25260 + }, + { + "epoch": 5.4887054735013034, + "grad_norm": 1.1857420206069946, + "learning_rate": 1.5706450912250218e-05, + "loss": 0.0371, + "step": 25270 + }, + { + "epoch": 5.4908774978279755, + "grad_norm": 0.0006299542728811502, + "learning_rate": 1.5692875760208514e-05, + "loss": 0.0001, + "step": 25280 + }, + { + "epoch": 5.4930495221546485, + "grad_norm": 0.0006110373069532216, + "learning_rate": 1.567930060816681e-05, + "loss": 0.0086, + "step": 25290 + }, + { + "epoch": 5.4952215464813206, + "grad_norm": 0.0006376666133292019, + "learning_rate": 1.566572545612511e-05, + "loss": 0.0001, + "step": 25300 + }, + { + "epoch": 5.4973935708079935, + "grad_norm": 0.000611914845649153, + "learning_rate": 1.5652150304083407e-05, + "loss": 0.0034, + "step": 25310 + }, + { + "epoch": 5.499565595134666, + "grad_norm": 0.0006409480702131987, + "learning_rate": 1.5638575152041704e-05, + "loss": 0.0001, + "step": 25320 + }, + { + "epoch": 5.501737619461338, + "grad_norm": 0.0006519390735775232, + "learning_rate": 1.5625e-05, + "loss": 0.0349, + "step": 25330 + }, + { + "epoch": 5.503909643788011, + "grad_norm": 0.004979894496500492, + "learning_rate": 1.5611424847958297e-05, + "loss": 0.0001, + "step": 25340 + }, + { + "epoch": 5.506081668114683, + "grad_norm": 0.003943281248211861, + "learning_rate": 1.5597849695916593e-05, + "loss": 0.0001, + "step": 25350 + }, + { + "epoch": 5.508253692441356, + "grad_norm": 0.0005990856443531811, + "learning_rate": 1.558427454387489e-05, + "loss": 0.0001, + "step": 25360 + }, + { + "epoch": 5.510425716768028, + "grad_norm": 0.0006143053760752082, + "learning_rate": 1.557069939183319e-05, + "loss": 0.0001, + "step": 25370 + }, + { + "epoch": 5.5125977410947, + "grad_norm": 0.000608195026870817, + "learning_rate": 1.5557124239791486e-05, + "loss": 0.0001, + "step": 25380 + }, + { + "epoch": 5.514769765421373, + "grad_norm": 0.003303609788417816, + "learning_rate": 1.5543549087749783e-05, + "loss": 0.0002, + "step": 25390 + }, + { + "epoch": 5.516941789748045, + "grad_norm": 0.0005992467049509287, + "learning_rate": 1.552997393570808e-05, + "loss": 0.0001, + "step": 25400 + }, + { + "epoch": 5.519113814074718, + "grad_norm": 0.0005971924983896315, + "learning_rate": 1.551639878366638e-05, + "loss": 0.0001, + "step": 25410 + }, + { + "epoch": 5.52128583840139, + "grad_norm": 0.17835816740989685, + "learning_rate": 1.5502823631624676e-05, + "loss": 0.0314, + "step": 25420 + }, + { + "epoch": 5.523457862728063, + "grad_norm": 0.0006768538733012974, + "learning_rate": 1.5489248479582972e-05, + "loss": 0.0001, + "step": 25430 + }, + { + "epoch": 5.525629887054735, + "grad_norm": 0.0006399670382961631, + "learning_rate": 1.547567332754127e-05, + "loss": 0.0001, + "step": 25440 + }, + { + "epoch": 5.527801911381408, + "grad_norm": 0.0006358442478813231, + "learning_rate": 1.546209817549957e-05, + "loss": 0.0001, + "step": 25450 + }, + { + "epoch": 5.52997393570808, + "grad_norm": 0.0006128513487055898, + "learning_rate": 1.5448523023457865e-05, + "loss": 0.0052, + "step": 25460 + }, + { + "epoch": 5.532145960034752, + "grad_norm": 0.0007636906229890883, + "learning_rate": 1.543494787141616e-05, + "loss": 0.0001, + "step": 25470 + }, + { + "epoch": 5.534317984361425, + "grad_norm": 0.0006304889684543014, + "learning_rate": 1.5421372719374458e-05, + "loss": 0.0001, + "step": 25480 + }, + { + "epoch": 5.536490008688097, + "grad_norm": 0.03822551667690277, + "learning_rate": 1.5407797567332755e-05, + "loss": 0.027, + "step": 25490 + }, + { + "epoch": 5.53866203301477, + "grad_norm": 0.0006133398273959756, + "learning_rate": 1.539422241529105e-05, + "loss": 0.0001, + "step": 25500 + }, + { + "epoch": 5.540834057341442, + "grad_norm": 0.0024534007534384727, + "learning_rate": 1.5380647263249348e-05, + "loss": 0.0014, + "step": 25510 + }, + { + "epoch": 5.543006081668115, + "grad_norm": 0.0006024197209626436, + "learning_rate": 1.5367072111207644e-05, + "loss": 0.0001, + "step": 25520 + }, + { + "epoch": 5.545178105994787, + "grad_norm": 0.0012399987317621708, + "learning_rate": 1.5353496959165944e-05, + "loss": 0.0082, + "step": 25530 + }, + { + "epoch": 5.54735013032146, + "grad_norm": 0.0005957476096227765, + "learning_rate": 1.533992180712424e-05, + "loss": 0.0001, + "step": 25540 + }, + { + "epoch": 5.549522154648132, + "grad_norm": 0.0005901667755097151, + "learning_rate": 1.5326346655082537e-05, + "loss": 0.0061, + "step": 25550 + }, + { + "epoch": 5.551694178974804, + "grad_norm": 0.0007840208127163351, + "learning_rate": 1.5312771503040834e-05, + "loss": 0.0001, + "step": 25560 + }, + { + "epoch": 5.553866203301477, + "grad_norm": 0.0030813610646873713, + "learning_rate": 1.529919635099913e-05, + "loss": 0.0001, + "step": 25570 + }, + { + "epoch": 5.556038227628149, + "grad_norm": 0.0005978733533993363, + "learning_rate": 1.5285621198957427e-05, + "loss": 0.0001, + "step": 25580 + }, + { + "epoch": 5.558210251954822, + "grad_norm": 0.0018240666249766946, + "learning_rate": 1.5272046046915726e-05, + "loss": 0.0001, + "step": 25590 + }, + { + "epoch": 5.560382276281494, + "grad_norm": 0.003863741410896182, + "learning_rate": 1.5258470894874025e-05, + "loss": 0.0001, + "step": 25600 + }, + { + "epoch": 5.562554300608166, + "grad_norm": 0.0005884706042706966, + "learning_rate": 1.5244895742832321e-05, + "loss": 0.0166, + "step": 25610 + }, + { + "epoch": 5.564726324934839, + "grad_norm": 0.0005884277052246034, + "learning_rate": 1.5231320590790618e-05, + "loss": 0.0001, + "step": 25620 + }, + { + "epoch": 5.566898349261511, + "grad_norm": 0.0018177337478846312, + "learning_rate": 1.5217745438748914e-05, + "loss": 0.0002, + "step": 25630 + }, + { + "epoch": 5.569070373588184, + "grad_norm": 0.0005721076158806682, + "learning_rate": 1.5204170286707212e-05, + "loss": 0.0001, + "step": 25640 + }, + { + "epoch": 5.571242397914856, + "grad_norm": 3.635812997817993, + "learning_rate": 1.5190595134665509e-05, + "loss": 0.0493, + "step": 25650 + }, + { + "epoch": 5.573414422241529, + "grad_norm": 0.0006589922704733908, + "learning_rate": 1.5177019982623805e-05, + "loss": 0.0001, + "step": 25660 + }, + { + "epoch": 5.5755864465682015, + "grad_norm": 0.0009152049897238612, + "learning_rate": 1.5163444830582102e-05, + "loss": 0.0001, + "step": 25670 + }, + { + "epoch": 5.577758470894874, + "grad_norm": 0.0008373759919777513, + "learning_rate": 1.5149869678540402e-05, + "loss": 0.0001, + "step": 25680 + }, + { + "epoch": 5.5799304952215465, + "grad_norm": 0.0024861509446054697, + "learning_rate": 1.5136294526498698e-05, + "loss": 0.0001, + "step": 25690 + }, + { + "epoch": 5.582102519548219, + "grad_norm": 0.0006951112300157547, + "learning_rate": 1.5122719374456995e-05, + "loss": 0.0005, + "step": 25700 + }, + { + "epoch": 5.5842745438748915, + "grad_norm": 0.0007802178151905537, + "learning_rate": 1.5109144222415291e-05, + "loss": 0.0001, + "step": 25710 + }, + { + "epoch": 5.586446568201564, + "grad_norm": 0.002290277276188135, + "learning_rate": 1.5095569070373588e-05, + "loss": 0.0247, + "step": 25720 + }, + { + "epoch": 5.588618592528237, + "grad_norm": 0.0008800785290077329, + "learning_rate": 1.5081993918331886e-05, + "loss": 0.0001, + "step": 25730 + }, + { + "epoch": 5.590790616854909, + "grad_norm": 0.013386576436460018, + "learning_rate": 1.5068418766290183e-05, + "loss": 0.0001, + "step": 25740 + }, + { + "epoch": 5.592962641181582, + "grad_norm": 0.0014391910517588258, + "learning_rate": 1.5054843614248482e-05, + "loss": 0.0004, + "step": 25750 + }, + { + "epoch": 5.595134665508254, + "grad_norm": 0.0017789318226277828, + "learning_rate": 1.5041268462206779e-05, + "loss": 0.0001, + "step": 25760 + }, + { + "epoch": 5.597306689834927, + "grad_norm": 0.0007133111357688904, + "learning_rate": 1.5027693310165076e-05, + "loss": 0.0052, + "step": 25770 + }, + { + "epoch": 5.599478714161599, + "grad_norm": 0.0008147881599143147, + "learning_rate": 1.5014118158123372e-05, + "loss": 0.0001, + "step": 25780 + }, + { + "epoch": 5.601650738488271, + "grad_norm": 0.0018234155140817165, + "learning_rate": 1.5000543006081669e-05, + "loss": 0.0002, + "step": 25790 + }, + { + "epoch": 5.603822762814944, + "grad_norm": 0.0006993436836637557, + "learning_rate": 1.4986967854039965e-05, + "loss": 0.0001, + "step": 25800 + }, + { + "epoch": 5.605994787141616, + "grad_norm": 0.0007021636120043695, + "learning_rate": 1.4973392701998262e-05, + "loss": 0.0055, + "step": 25810 + }, + { + "epoch": 5.608166811468289, + "grad_norm": 0.0006139131146483123, + "learning_rate": 1.495981754995656e-05, + "loss": 0.0001, + "step": 25820 + }, + { + "epoch": 5.610338835794961, + "grad_norm": 0.0006054277182556689, + "learning_rate": 1.4946242397914858e-05, + "loss": 0.0001, + "step": 25830 + }, + { + "epoch": 5.612510860121633, + "grad_norm": 0.0033255741000175476, + "learning_rate": 1.4932667245873156e-05, + "loss": 0.047, + "step": 25840 + }, + { + "epoch": 5.614682884448306, + "grad_norm": 0.0007535509066656232, + "learning_rate": 1.4919092093831453e-05, + "loss": 0.0004, + "step": 25850 + }, + { + "epoch": 5.616854908774978, + "grad_norm": 0.0018026516772806644, + "learning_rate": 1.490551694178975e-05, + "loss": 0.0002, + "step": 25860 + }, + { + "epoch": 5.619026933101651, + "grad_norm": 0.03184160590171814, + "learning_rate": 1.4891941789748046e-05, + "loss": 0.0049, + "step": 25870 + }, + { + "epoch": 5.621198957428323, + "grad_norm": 0.0012461596634238958, + "learning_rate": 1.4878366637706342e-05, + "loss": 0.0001, + "step": 25880 + }, + { + "epoch": 5.623370981754996, + "grad_norm": 0.0024950348306447268, + "learning_rate": 1.4864791485664639e-05, + "loss": 0.0001, + "step": 25890 + }, + { + "epoch": 5.625543006081668, + "grad_norm": 0.006434556096792221, + "learning_rate": 1.4851216333622935e-05, + "loss": 0.0001, + "step": 25900 + }, + { + "epoch": 5.627715030408341, + "grad_norm": 0.0007220886182039976, + "learning_rate": 1.4837641181581235e-05, + "loss": 0.0001, + "step": 25910 + }, + { + "epoch": 5.629887054735013, + "grad_norm": 0.0008968530455604196, + "learning_rate": 1.4824066029539532e-05, + "loss": 0.0265, + "step": 25920 + }, + { + "epoch": 5.632059079061685, + "grad_norm": 0.011500025168061256, + "learning_rate": 1.481049087749783e-05, + "loss": 0.0002, + "step": 25930 + }, + { + "epoch": 5.634231103388358, + "grad_norm": 0.0007694981177337468, + "learning_rate": 1.4796915725456126e-05, + "loss": 0.0007, + "step": 25940 + }, + { + "epoch": 5.63640312771503, + "grad_norm": 0.0009162534261122346, + "learning_rate": 1.4783340573414423e-05, + "loss": 0.0001, + "step": 25950 + }, + { + "epoch": 5.638575152041703, + "grad_norm": 0.0019229091703891754, + "learning_rate": 1.476976542137272e-05, + "loss": 0.0002, + "step": 25960 + }, + { + "epoch": 5.640747176368375, + "grad_norm": 0.0014625373296439648, + "learning_rate": 1.4756190269331016e-05, + "loss": 0.0001, + "step": 25970 + }, + { + "epoch": 5.642919200695048, + "grad_norm": 0.0010808553779497743, + "learning_rate": 1.4742615117289316e-05, + "loss": 0.0012, + "step": 25980 + }, + { + "epoch": 5.64509122502172, + "grad_norm": 0.00990188866853714, + "learning_rate": 1.4729039965247612e-05, + "loss": 0.0002, + "step": 25990 + }, + { + "epoch": 5.647263249348393, + "grad_norm": 0.056729722768068314, + "learning_rate": 1.4715464813205909e-05, + "loss": 0.0002, + "step": 26000 + }, + { + "epoch": 5.649435273675065, + "grad_norm": 0.0019011656986549497, + "learning_rate": 1.4701889661164205e-05, + "loss": 0.0366, + "step": 26010 + }, + { + "epoch": 5.651607298001737, + "grad_norm": 0.0027743070386350155, + "learning_rate": 1.4688314509122503e-05, + "loss": 0.0003, + "step": 26020 + }, + { + "epoch": 5.65377932232841, + "grad_norm": 0.004712869878858328, + "learning_rate": 1.46747393570808e-05, + "loss": 0.0002, + "step": 26030 + }, + { + "epoch": 5.655951346655082, + "grad_norm": 0.0006625755340792239, + "learning_rate": 1.4661164205039096e-05, + "loss": 0.0004, + "step": 26040 + }, + { + "epoch": 5.658123370981755, + "grad_norm": 0.026191281154751778, + "learning_rate": 1.4647589052997393e-05, + "loss": 0.0005, + "step": 26050 + }, + { + "epoch": 5.660295395308427, + "grad_norm": 0.000669986882712692, + "learning_rate": 1.4634013900955693e-05, + "loss": 0.0181, + "step": 26060 + }, + { + "epoch": 5.6624674196350995, + "grad_norm": 0.0005998788401484489, + "learning_rate": 1.462043874891399e-05, + "loss": 0.0001, + "step": 26070 + }, + { + "epoch": 5.664639443961772, + "grad_norm": 0.0005905433208681643, + "learning_rate": 1.4606863596872286e-05, + "loss": 0.0001, + "step": 26080 + }, + { + "epoch": 5.6668114682884445, + "grad_norm": 0.0005546990432776511, + "learning_rate": 1.4593288444830582e-05, + "loss": 0.0181, + "step": 26090 + }, + { + "epoch": 5.6689834926151175, + "grad_norm": 0.0005743417423218489, + "learning_rate": 1.4579713292788879e-05, + "loss": 0.0003, + "step": 26100 + }, + { + "epoch": 5.6711555169417895, + "grad_norm": 0.0017763548530638218, + "learning_rate": 1.4566138140747177e-05, + "loss": 0.0001, + "step": 26110 + }, + { + "epoch": 5.6733275412684625, + "grad_norm": 0.0005718552274629474, + "learning_rate": 1.4552562988705474e-05, + "loss": 0.0001, + "step": 26120 + }, + { + "epoch": 5.675499565595135, + "grad_norm": 0.0005786096444353461, + "learning_rate": 1.453898783666377e-05, + "loss": 0.0001, + "step": 26130 + }, + { + "epoch": 5.6776715899218075, + "grad_norm": 0.0006460116128437221, + "learning_rate": 1.452541268462207e-05, + "loss": 0.0234, + "step": 26140 + }, + { + "epoch": 5.67984361424848, + "grad_norm": 0.0006802164134569466, + "learning_rate": 1.4511837532580367e-05, + "loss": 0.0077, + "step": 26150 + }, + { + "epoch": 5.682015638575152, + "grad_norm": 0.017687102779746056, + "learning_rate": 1.4498262380538663e-05, + "loss": 0.0003, + "step": 26160 + }, + { + "epoch": 5.684187662901825, + "grad_norm": 0.000584051595069468, + "learning_rate": 1.448468722849696e-05, + "loss": 0.0003, + "step": 26170 + }, + { + "epoch": 5.686359687228497, + "grad_norm": 0.004040115978568792, + "learning_rate": 1.4471112076455256e-05, + "loss": 0.0015, + "step": 26180 + }, + { + "epoch": 5.68853171155517, + "grad_norm": 0.019951876252889633, + "learning_rate": 1.4457536924413553e-05, + "loss": 0.0004, + "step": 26190 + }, + { + "epoch": 5.690703735881842, + "grad_norm": 0.0008795502944849432, + "learning_rate": 1.444396177237185e-05, + "loss": 0.0004, + "step": 26200 + }, + { + "epoch": 5.692875760208515, + "grad_norm": 0.004787113983184099, + "learning_rate": 1.4430386620330149e-05, + "loss": 0.0002, + "step": 26210 + }, + { + "epoch": 5.695047784535187, + "grad_norm": 0.0067748213186860085, + "learning_rate": 1.4416811468288447e-05, + "loss": 0.0004, + "step": 26220 + }, + { + "epoch": 5.69721980886186, + "grad_norm": 0.008101826533675194, + "learning_rate": 1.4403236316246744e-05, + "loss": 0.0002, + "step": 26230 + }, + { + "epoch": 5.699391833188532, + "grad_norm": 0.0006347130984067917, + "learning_rate": 1.438966116420504e-05, + "loss": 0.0001, + "step": 26240 + }, + { + "epoch": 5.701563857515204, + "grad_norm": 0.0006116887088865042, + "learning_rate": 1.4376086012163337e-05, + "loss": 0.0001, + "step": 26250 + }, + { + "epoch": 5.703735881841877, + "grad_norm": 0.0005490531329996884, + "learning_rate": 1.4362510860121633e-05, + "loss": 0.0003, + "step": 26260 + }, + { + "epoch": 5.705907906168549, + "grad_norm": 0.0005719310138374567, + "learning_rate": 1.434893570807993e-05, + "loss": 0.0001, + "step": 26270 + }, + { + "epoch": 5.708079930495222, + "grad_norm": 0.01422004122287035, + "learning_rate": 1.4335360556038226e-05, + "loss": 0.0228, + "step": 26280 + }, + { + "epoch": 5.710251954821894, + "grad_norm": 0.0006018744898028672, + "learning_rate": 1.4321785403996526e-05, + "loss": 0.0002, + "step": 26290 + }, + { + "epoch": 5.712423979148566, + "grad_norm": 0.0006384833832271397, + "learning_rate": 1.4308210251954823e-05, + "loss": 0.0001, + "step": 26300 + }, + { + "epoch": 5.714596003475239, + "grad_norm": 0.006105415057390928, + "learning_rate": 1.4294635099913121e-05, + "loss": 0.001, + "step": 26310 + }, + { + "epoch": 5.716768027801911, + "grad_norm": 0.0006127614760771394, + "learning_rate": 1.4281059947871417e-05, + "loss": 0.0047, + "step": 26320 + }, + { + "epoch": 5.718940052128584, + "grad_norm": 0.0006579833570867777, + "learning_rate": 1.4267484795829714e-05, + "loss": 0.0007, + "step": 26330 + }, + { + "epoch": 5.721112076455256, + "grad_norm": 0.0005666995421051979, + "learning_rate": 1.425390964378801e-05, + "loss": 0.0065, + "step": 26340 + }, + { + "epoch": 5.723284100781929, + "grad_norm": 0.0005409110453911126, + "learning_rate": 1.4240334491746307e-05, + "loss": 0.0003, + "step": 26350 + }, + { + "epoch": 5.725456125108601, + "grad_norm": 0.0005358332418836653, + "learning_rate": 1.4226759339704607e-05, + "loss": 0.0067, + "step": 26360 + }, + { + "epoch": 5.727628149435274, + "grad_norm": 0.007814955897629261, + "learning_rate": 1.4213184187662903e-05, + "loss": 0.0007, + "step": 26370 + }, + { + "epoch": 5.729800173761946, + "grad_norm": 0.0005765220848843455, + "learning_rate": 1.41996090356212e-05, + "loss": 0.0005, + "step": 26380 + }, + { + "epoch": 5.731972198088618, + "grad_norm": 0.006794157437980175, + "learning_rate": 1.4186033883579496e-05, + "loss": 0.0002, + "step": 26390 + }, + { + "epoch": 5.734144222415291, + "grad_norm": 0.000567117880564183, + "learning_rate": 1.4172458731537795e-05, + "loss": 0.0002, + "step": 26400 + }, + { + "epoch": 5.736316246741963, + "grad_norm": 0.0006249643629416823, + "learning_rate": 1.4158883579496091e-05, + "loss": 0.0001, + "step": 26410 + }, + { + "epoch": 5.738488271068636, + "grad_norm": 0.0005337175680324435, + "learning_rate": 1.4145308427454388e-05, + "loss": 0.0001, + "step": 26420 + }, + { + "epoch": 5.740660295395308, + "grad_norm": 0.0005305648664943874, + "learning_rate": 1.4131733275412684e-05, + "loss": 0.0004, + "step": 26430 + }, + { + "epoch": 5.742832319721981, + "grad_norm": 0.0005393430474214256, + "learning_rate": 1.4118158123370984e-05, + "loss": 0.0063, + "step": 26440 + }, + { + "epoch": 5.745004344048653, + "grad_norm": 0.0005549487541429698, + "learning_rate": 1.410458297132928e-05, + "loss": 0.0001, + "step": 26450 + }, + { + "epoch": 5.747176368375325, + "grad_norm": 0.02173413708806038, + "learning_rate": 1.4091007819287577e-05, + "loss": 0.0003, + "step": 26460 + }, + { + "epoch": 5.749348392701998, + "grad_norm": 0.008116286247968674, + "learning_rate": 1.4077432667245873e-05, + "loss": 0.0005, + "step": 26470 + }, + { + "epoch": 5.7515204170286705, + "grad_norm": 0.0005138172418810427, + "learning_rate": 1.406385751520417e-05, + "loss": 0.0002, + "step": 26480 + }, + { + "epoch": 5.753692441355343, + "grad_norm": 0.0005188611685298383, + "learning_rate": 1.4050282363162468e-05, + "loss": 0.0014, + "step": 26490 + }, + { + "epoch": 5.7558644656820155, + "grad_norm": 0.0005352711887098849, + "learning_rate": 1.4036707211120765e-05, + "loss": 0.0039, + "step": 26500 + }, + { + "epoch": 5.7580364900086884, + "grad_norm": 0.0005132692167535424, + "learning_rate": 1.4023132059079061e-05, + "loss": 0.007, + "step": 26510 + }, + { + "epoch": 5.7602085143353605, + "grad_norm": 0.000535594008397311, + "learning_rate": 1.4009556907037361e-05, + "loss": 0.0001, + "step": 26520 + }, + { + "epoch": 5.762380538662033, + "grad_norm": 0.0005138751002959907, + "learning_rate": 1.3995981754995658e-05, + "loss": 0.0001, + "step": 26530 + }, + { + "epoch": 5.7645525629887056, + "grad_norm": 0.0005084550939500332, + "learning_rate": 1.3982406602953954e-05, + "loss": 0.0001, + "step": 26540 + }, + { + "epoch": 5.766724587315378, + "grad_norm": 0.000515693100169301, + "learning_rate": 1.396883145091225e-05, + "loss": 0.0001, + "step": 26550 + }, + { + "epoch": 5.768896611642051, + "grad_norm": 0.0007487752009183168, + "learning_rate": 1.3955256298870547e-05, + "loss": 0.0003, + "step": 26560 + }, + { + "epoch": 5.771068635968723, + "grad_norm": 0.0005719950422644615, + "learning_rate": 1.3941681146828844e-05, + "loss": 0.0001, + "step": 26570 + }, + { + "epoch": 5.773240660295396, + "grad_norm": 0.0005108347395434976, + "learning_rate": 1.3928105994787142e-05, + "loss": 0.0001, + "step": 26580 + }, + { + "epoch": 5.775412684622068, + "grad_norm": 0.0005025700083933771, + "learning_rate": 1.391453084274544e-05, + "loss": 0.0001, + "step": 26590 + }, + { + "epoch": 5.777584708948741, + "grad_norm": 0.0005161292501725256, + "learning_rate": 1.3900955690703738e-05, + "loss": 0.0001, + "step": 26600 + }, + { + "epoch": 5.779756733275413, + "grad_norm": 0.000515054736752063, + "learning_rate": 1.3887380538662035e-05, + "loss": 0.0001, + "step": 26610 + }, + { + "epoch": 5.781928757602085, + "grad_norm": 0.0005010199383832514, + "learning_rate": 1.3873805386620331e-05, + "loss": 0.0001, + "step": 26620 + }, + { + "epoch": 5.784100781928758, + "grad_norm": 0.0005252442206256092, + "learning_rate": 1.3860230234578628e-05, + "loss": 0.0001, + "step": 26630 + }, + { + "epoch": 5.78627280625543, + "grad_norm": 0.0005354645545594394, + "learning_rate": 1.3846655082536924e-05, + "loss": 0.0001, + "step": 26640 + }, + { + "epoch": 5.788444830582103, + "grad_norm": 0.0004989413428120315, + "learning_rate": 1.383307993049522e-05, + "loss": 0.003, + "step": 26650 + }, + { + "epoch": 5.790616854908775, + "grad_norm": 0.0004969520960003138, + "learning_rate": 1.3819504778453517e-05, + "loss": 0.0001, + "step": 26660 + }, + { + "epoch": 5.792788879235447, + "grad_norm": 0.0005011470057070255, + "learning_rate": 1.3805929626411817e-05, + "loss": 0.0001, + "step": 26670 + }, + { + "epoch": 5.79496090356212, + "grad_norm": 0.0004997824435122311, + "learning_rate": 1.3792354474370114e-05, + "loss": 0.0001, + "step": 26680 + }, + { + "epoch": 5.797132927888792, + "grad_norm": 0.0004935134784318507, + "learning_rate": 1.3778779322328412e-05, + "loss": 0.001, + "step": 26690 + }, + { + "epoch": 5.799304952215465, + "grad_norm": 0.000501400965731591, + "learning_rate": 1.3765204170286708e-05, + "loss": 0.0001, + "step": 26700 + }, + { + "epoch": 5.801476976542137, + "grad_norm": 0.004372824914753437, + "learning_rate": 1.3751629018245005e-05, + "loss": 0.0002, + "step": 26710 + }, + { + "epoch": 5.80364900086881, + "grad_norm": 0.0005527559551410377, + "learning_rate": 1.3738053866203301e-05, + "loss": 0.0607, + "step": 26720 + }, + { + "epoch": 5.805821025195482, + "grad_norm": 0.0007034401642158628, + "learning_rate": 1.3724478714161598e-05, + "loss": 0.0001, + "step": 26730 + }, + { + "epoch": 5.807993049522155, + "grad_norm": 0.0029618972912430763, + "learning_rate": 1.3710903562119894e-05, + "loss": 0.0001, + "step": 26740 + }, + { + "epoch": 5.810165073848827, + "grad_norm": 0.0008377031190320849, + "learning_rate": 1.3697328410078194e-05, + "loss": 0.023, + "step": 26750 + }, + { + "epoch": 5.812337098175499, + "grad_norm": 0.0009827688336372375, + "learning_rate": 1.368375325803649e-05, + "loss": 0.0001, + "step": 26760 + }, + { + "epoch": 5.814509122502172, + "grad_norm": 0.0007727128686383367, + "learning_rate": 1.3670178105994787e-05, + "loss": 0.0006, + "step": 26770 + }, + { + "epoch": 5.816681146828844, + "grad_norm": 0.0007160462555475533, + "learning_rate": 1.3656602953953086e-05, + "loss": 0.0001, + "step": 26780 + }, + { + "epoch": 5.818853171155517, + "grad_norm": 0.0021952898241579533, + "learning_rate": 1.3643027801911382e-05, + "loss": 0.0514, + "step": 26790 + }, + { + "epoch": 5.821025195482189, + "grad_norm": 0.008272473700344563, + "learning_rate": 1.3629452649869679e-05, + "loss": 0.0007, + "step": 26800 + }, + { + "epoch": 5.823197219808862, + "grad_norm": 0.0034944340586662292, + "learning_rate": 1.3615877497827975e-05, + "loss": 0.0028, + "step": 26810 + }, + { + "epoch": 5.825369244135534, + "grad_norm": 0.015482169575989246, + "learning_rate": 1.3602302345786275e-05, + "loss": 0.0005, + "step": 26820 + }, + { + "epoch": 5.827541268462207, + "grad_norm": 0.0030670249834656715, + "learning_rate": 1.3588727193744571e-05, + "loss": 0.0012, + "step": 26830 + }, + { + "epoch": 5.829713292788879, + "grad_norm": 0.17072609066963196, + "learning_rate": 1.3575152041702868e-05, + "loss": 0.0005, + "step": 26840 + }, + { + "epoch": 5.831885317115551, + "grad_norm": 0.0022793428506702185, + "learning_rate": 1.3561576889661164e-05, + "loss": 0.0071, + "step": 26850 + }, + { + "epoch": 5.834057341442224, + "grad_norm": 0.0021639687474817038, + "learning_rate": 1.3548001737619461e-05, + "loss": 0.0002, + "step": 26860 + }, + { + "epoch": 5.836229365768896, + "grad_norm": 0.0018504821928218007, + "learning_rate": 1.353442658557776e-05, + "loss": 0.0002, + "step": 26870 + }, + { + "epoch": 5.838401390095569, + "grad_norm": 0.0019850528333336115, + "learning_rate": 1.3520851433536056e-05, + "loss": 0.0002, + "step": 26880 + }, + { + "epoch": 5.840573414422241, + "grad_norm": 0.002545407973229885, + "learning_rate": 1.3507276281494352e-05, + "loss": 0.0002, + "step": 26890 + }, + { + "epoch": 5.8427454387489135, + "grad_norm": 0.0014092468190938234, + "learning_rate": 1.3493701129452652e-05, + "loss": 0.0003, + "step": 26900 + }, + { + "epoch": 5.8449174630755865, + "grad_norm": 0.0015003462322056293, + "learning_rate": 1.3480125977410949e-05, + "loss": 0.0001, + "step": 26910 + }, + { + "epoch": 5.8470894874022585, + "grad_norm": 0.0012168603716418147, + "learning_rate": 1.3466550825369245e-05, + "loss": 0.0003, + "step": 26920 + }, + { + "epoch": 5.8492615117289315, + "grad_norm": 0.0010929395211860538, + "learning_rate": 1.3452975673327542e-05, + "loss": 0.0002, + "step": 26930 + }, + { + "epoch": 5.851433536055604, + "grad_norm": 0.001105991075746715, + "learning_rate": 1.3439400521285838e-05, + "loss": 0.0028, + "step": 26940 + }, + { + "epoch": 5.8536055603822765, + "grad_norm": 0.0009902853053063154, + "learning_rate": 1.3425825369244135e-05, + "loss": 0.0001, + "step": 26950 + }, + { + "epoch": 5.855777584708949, + "grad_norm": 0.0010550229344516993, + "learning_rate": 1.3412250217202433e-05, + "loss": 0.0061, + "step": 26960 + }, + { + "epoch": 5.857949609035622, + "grad_norm": 0.0009797523962333798, + "learning_rate": 1.3398675065160731e-05, + "loss": 0.0001, + "step": 26970 + }, + { + "epoch": 5.860121633362294, + "grad_norm": 0.0010357762221246958, + "learning_rate": 1.338509991311903e-05, + "loss": 0.0001, + "step": 26980 + }, + { + "epoch": 5.862293657688966, + "grad_norm": 0.0010978849604725838, + "learning_rate": 1.3371524761077326e-05, + "loss": 0.0001, + "step": 26990 + }, + { + "epoch": 5.864465682015639, + "grad_norm": 0.0009231647709384561, + "learning_rate": 1.3357949609035622e-05, + "loss": 0.0001, + "step": 27000 + }, + { + "epoch": 5.866637706342311, + "grad_norm": 0.000857473467476666, + "learning_rate": 1.3344374456993919e-05, + "loss": 0.0001, + "step": 27010 + }, + { + "epoch": 5.868809730668984, + "grad_norm": 0.0011523263528943062, + "learning_rate": 1.3330799304952215e-05, + "loss": 0.0003, + "step": 27020 + }, + { + "epoch": 5.870981754995656, + "grad_norm": 0.002420986071228981, + "learning_rate": 1.3317224152910512e-05, + "loss": 0.0537, + "step": 27030 + }, + { + "epoch": 5.873153779322329, + "grad_norm": 0.007347964681684971, + "learning_rate": 1.3303649000868808e-05, + "loss": 0.0003, + "step": 27040 + }, + { + "epoch": 5.875325803649001, + "grad_norm": 0.002605201443657279, + "learning_rate": 1.3290073848827108e-05, + "loss": 0.0004, + "step": 27050 + }, + { + "epoch": 5.877497827975674, + "grad_norm": 0.0027562305331230164, + "learning_rate": 1.3276498696785405e-05, + "loss": 0.0004, + "step": 27060 + }, + { + "epoch": 5.879669852302346, + "grad_norm": 0.5885543823242188, + "learning_rate": 1.3262923544743703e-05, + "loss": 0.0023, + "step": 27070 + }, + { + "epoch": 5.881841876629018, + "grad_norm": 0.0020992341451346874, + "learning_rate": 1.3249348392702e-05, + "loss": 0.0002, + "step": 27080 + }, + { + "epoch": 5.884013900955691, + "grad_norm": 0.0018192260758951306, + "learning_rate": 1.3235773240660296e-05, + "loss": 0.0003, + "step": 27090 + }, + { + "epoch": 5.886185925282363, + "grad_norm": 0.0018530006054788828, + "learning_rate": 1.3222198088618592e-05, + "loss": 0.0105, + "step": 27100 + }, + { + "epoch": 5.888357949609036, + "grad_norm": 0.002540824469178915, + "learning_rate": 1.3208622936576889e-05, + "loss": 0.0002, + "step": 27110 + }, + { + "epoch": 5.890529973935708, + "grad_norm": 0.0017492754850536585, + "learning_rate": 1.3195047784535185e-05, + "loss": 0.0002, + "step": 27120 + }, + { + "epoch": 5.89270199826238, + "grad_norm": 0.001549109467305243, + "learning_rate": 1.3181472632493485e-05, + "loss": 0.0002, + "step": 27130 + }, + { + "epoch": 5.894874022589053, + "grad_norm": 0.0017339596524834633, + "learning_rate": 1.3167897480451782e-05, + "loss": 0.0064, + "step": 27140 + }, + { + "epoch": 5.897046046915725, + "grad_norm": 0.0021280774381011724, + "learning_rate": 1.3154322328410078e-05, + "loss": 0.0398, + "step": 27150 + }, + { + "epoch": 5.899218071242398, + "grad_norm": 0.004335555247962475, + "learning_rate": 1.3140747176368377e-05, + "loss": 0.0002, + "step": 27160 + }, + { + "epoch": 5.90139009556907, + "grad_norm": 0.011722725816071033, + "learning_rate": 1.3127172024326673e-05, + "loss": 0.0004, + "step": 27170 + }, + { + "epoch": 5.903562119895743, + "grad_norm": 0.0019563138484954834, + "learning_rate": 1.311359687228497e-05, + "loss": 0.0004, + "step": 27180 + }, + { + "epoch": 5.905734144222415, + "grad_norm": 0.002340024570003152, + "learning_rate": 1.3100021720243266e-05, + "loss": 0.0003, + "step": 27190 + }, + { + "epoch": 5.907906168549088, + "grad_norm": 0.0030410848557949066, + "learning_rate": 1.3086446568201566e-05, + "loss": 0.0002, + "step": 27200 + }, + { + "epoch": 5.91007819287576, + "grad_norm": 0.0047626374289393425, + "learning_rate": 1.3072871416159863e-05, + "loss": 0.0002, + "step": 27210 + }, + { + "epoch": 5.912250217202432, + "grad_norm": 0.001380560570396483, + "learning_rate": 1.3059296264118159e-05, + "loss": 0.0004, + "step": 27220 + }, + { + "epoch": 5.914422241529105, + "grad_norm": 0.0029822904616594315, + "learning_rate": 1.3045721112076456e-05, + "loss": 0.0002, + "step": 27230 + }, + { + "epoch": 5.916594265855777, + "grad_norm": 0.002323774853721261, + "learning_rate": 1.3032145960034752e-05, + "loss": 0.0002, + "step": 27240 + }, + { + "epoch": 5.91876629018245, + "grad_norm": 0.004266597796231508, + "learning_rate": 1.301857080799305e-05, + "loss": 0.0001, + "step": 27250 + }, + { + "epoch": 5.920938314509122, + "grad_norm": 0.0013143382966518402, + "learning_rate": 1.3004995655951347e-05, + "loss": 0.0066, + "step": 27260 + }, + { + "epoch": 5.923110338835795, + "grad_norm": 0.001244851271621883, + "learning_rate": 1.2991420503909643e-05, + "loss": 0.031, + "step": 27270 + }, + { + "epoch": 5.925282363162467, + "grad_norm": 0.000747871061321348, + "learning_rate": 1.2977845351867943e-05, + "loss": 0.0001, + "step": 27280 + }, + { + "epoch": 5.92745438748914, + "grad_norm": 0.0012308891164138913, + "learning_rate": 1.296427019982624e-05, + "loss": 0.0002, + "step": 27290 + }, + { + "epoch": 5.929626411815812, + "grad_norm": 0.0008152248337864876, + "learning_rate": 1.2950695047784536e-05, + "loss": 0.0001, + "step": 27300 + }, + { + "epoch": 5.9317984361424845, + "grad_norm": 0.000931603426579386, + "learning_rate": 1.2937119895742833e-05, + "loss": 0.0041, + "step": 27310 + }, + { + "epoch": 5.933970460469157, + "grad_norm": 0.0010958875063806772, + "learning_rate": 1.292354474370113e-05, + "loss": 0.0001, + "step": 27320 + }, + { + "epoch": 5.9361424847958295, + "grad_norm": 0.001938451430760324, + "learning_rate": 1.2909969591659426e-05, + "loss": 0.0001, + "step": 27330 + }, + { + "epoch": 5.9383145091225025, + "grad_norm": 0.0010382839245721698, + "learning_rate": 1.2896394439617724e-05, + "loss": 0.0034, + "step": 27340 + }, + { + "epoch": 5.9404865334491745, + "grad_norm": 0.0007932804292067885, + "learning_rate": 1.2882819287576022e-05, + "loss": 0.0001, + "step": 27350 + }, + { + "epoch": 5.942658557775847, + "grad_norm": 0.0008296747109852731, + "learning_rate": 1.286924413553432e-05, + "loss": 0.0002, + "step": 27360 + }, + { + "epoch": 5.94483058210252, + "grad_norm": 0.0008137888507917523, + "learning_rate": 1.2855668983492617e-05, + "loss": 0.0001, + "step": 27370 + }, + { + "epoch": 5.947002606429192, + "grad_norm": 0.0007017937605269253, + "learning_rate": 1.2842093831450913e-05, + "loss": 0.0001, + "step": 27380 + }, + { + "epoch": 5.949174630755865, + "grad_norm": 0.001600543037056923, + "learning_rate": 1.282851867940921e-05, + "loss": 0.0001, + "step": 27390 + }, + { + "epoch": 5.951346655082537, + "grad_norm": 0.001051751896739006, + "learning_rate": 1.2814943527367506e-05, + "loss": 0.0001, + "step": 27400 + }, + { + "epoch": 5.95351867940921, + "grad_norm": 0.000667224929202348, + "learning_rate": 1.2801368375325803e-05, + "loss": 0.0001, + "step": 27410 + }, + { + "epoch": 5.955690703735882, + "grad_norm": 0.0007074500899761915, + "learning_rate": 1.2787793223284101e-05, + "loss": 0.0002, + "step": 27420 + }, + { + "epoch": 5.957862728062555, + "grad_norm": 0.0007487855036742985, + "learning_rate": 1.27742180712424e-05, + "loss": 0.0019, + "step": 27430 + }, + { + "epoch": 5.960034752389227, + "grad_norm": 0.0010091594886034727, + "learning_rate": 1.2760642919200696e-05, + "loss": 0.0491, + "step": 27440 + }, + { + "epoch": 5.962206776715899, + "grad_norm": 0.0017638842109590769, + "learning_rate": 1.2747067767158994e-05, + "loss": 0.0001, + "step": 27450 + }, + { + "epoch": 5.964378801042572, + "grad_norm": 0.0033730980940163136, + "learning_rate": 1.273349261511729e-05, + "loss": 0.0001, + "step": 27460 + }, + { + "epoch": 5.966550825369244, + "grad_norm": 0.0007800173480063677, + "learning_rate": 1.2719917463075587e-05, + "loss": 0.0001, + "step": 27470 + }, + { + "epoch": 5.968722849695917, + "grad_norm": 0.6365306973457336, + "learning_rate": 1.2706342311033884e-05, + "loss": 0.0344, + "step": 27480 + }, + { + "epoch": 5.970894874022589, + "grad_norm": 0.0016547476407140493, + "learning_rate": 1.269276715899218e-05, + "loss": 0.0005, + "step": 27490 + }, + { + "epoch": 5.973066898349262, + "grad_norm": 0.003191007999703288, + "learning_rate": 1.2679192006950477e-05, + "loss": 0.0007, + "step": 27500 + }, + { + "epoch": 5.975238922675934, + "grad_norm": 1.2369848489761353, + "learning_rate": 1.2665616854908776e-05, + "loss": 0.0343, + "step": 27510 + }, + { + "epoch": 5.977410947002607, + "grad_norm": 0.0017436889465898275, + "learning_rate": 1.2652041702867073e-05, + "loss": 0.0002, + "step": 27520 + }, + { + "epoch": 5.979582971329279, + "grad_norm": 0.002784136915579438, + "learning_rate": 1.263846655082537e-05, + "loss": 0.0007, + "step": 27530 + }, + { + "epoch": 5.981754995655951, + "grad_norm": 0.003783125663176179, + "learning_rate": 1.2624891398783668e-05, + "loss": 0.0004, + "step": 27540 + }, + { + "epoch": 5.983927019982624, + "grad_norm": 0.0025968304835259914, + "learning_rate": 1.2611316246741964e-05, + "loss": 0.0065, + "step": 27550 + }, + { + "epoch": 5.986099044309296, + "grad_norm": 0.004436062183231115, + "learning_rate": 1.259774109470026e-05, + "loss": 0.0004, + "step": 27560 + }, + { + "epoch": 5.988271068635969, + "grad_norm": 0.001537824748083949, + "learning_rate": 1.2584165942658557e-05, + "loss": 0.0095, + "step": 27570 + }, + { + "epoch": 5.990443092962641, + "grad_norm": 0.001534433220513165, + "learning_rate": 1.2570590790616857e-05, + "loss": 0.0143, + "step": 27580 + }, + { + "epoch": 5.992615117289313, + "grad_norm": 0.004164781887084246, + "learning_rate": 1.2557015638575154e-05, + "loss": 0.0046, + "step": 27590 + }, + { + "epoch": 5.994787141615986, + "grad_norm": 0.004971928428858519, + "learning_rate": 1.254344048653345e-05, + "loss": 0.0005, + "step": 27600 + }, + { + "epoch": 5.996959165942658, + "grad_norm": 0.007716748397797346, + "learning_rate": 1.2529865334491747e-05, + "loss": 0.0005, + "step": 27610 + }, + { + "epoch": 5.999131190269331, + "grad_norm": 0.0008516062516719103, + "learning_rate": 1.2516290182450043e-05, + "loss": 0.0154, + "step": 27620 + }, + { + "epoch": 6.0, + "eval_f1": 0.6761565836298933, + "eval_loss": 0.06111188605427742, + "eval_runtime": 81.7432, + "eval_samples_per_second": 122.029, + "eval_steps_per_second": 7.634, + "step": 27624 + }, + { + "epoch": 6.001303214596003, + "grad_norm": 0.006091386545449495, + "learning_rate": 1.2502715030408341e-05, + "loss": 0.0001, + "step": 27630 + }, + { + "epoch": 6.003475238922676, + "grad_norm": 0.0009421011782251298, + "learning_rate": 1.248913987836664e-05, + "loss": 0.0002, + "step": 27640 + }, + { + "epoch": 6.005647263249348, + "grad_norm": 0.007014581002295017, + "learning_rate": 1.2475564726324936e-05, + "loss": 0.0002, + "step": 27650 + }, + { + "epoch": 6.007819287576021, + "grad_norm": 0.0006896135164424777, + "learning_rate": 1.2461989574283233e-05, + "loss": 0.0001, + "step": 27660 + }, + { + "epoch": 6.009991311902693, + "grad_norm": 0.0006911637610755861, + "learning_rate": 1.2448414422241529e-05, + "loss": 0.0002, + "step": 27670 + }, + { + "epoch": 6.012163336229365, + "grad_norm": 0.0013413167325779796, + "learning_rate": 1.2434839270199827e-05, + "loss": 0.0001, + "step": 27680 + }, + { + "epoch": 6.014335360556038, + "grad_norm": 0.0009952643886208534, + "learning_rate": 1.2421264118158124e-05, + "loss": 0.0001, + "step": 27690 + }, + { + "epoch": 6.01650738488271, + "grad_norm": 0.0007215281366370618, + "learning_rate": 1.240768896611642e-05, + "loss": 0.0061, + "step": 27700 + }, + { + "epoch": 6.018679409209383, + "grad_norm": 0.0007175153587013483, + "learning_rate": 1.2394113814074718e-05, + "loss": 0.0005, + "step": 27710 + }, + { + "epoch": 6.0208514335360555, + "grad_norm": 0.0008878212538547814, + "learning_rate": 1.2380538662033015e-05, + "loss": 0.0002, + "step": 27720 + }, + { + "epoch": 6.023023457862728, + "grad_norm": 0.0006465452606789768, + "learning_rate": 1.2366963509991313e-05, + "loss": 0.0001, + "step": 27730 + }, + { + "epoch": 6.0251954821894005, + "grad_norm": 0.004886427894234657, + "learning_rate": 1.235338835794961e-05, + "loss": 0.0001, + "step": 27740 + }, + { + "epoch": 6.027367506516073, + "grad_norm": 0.000832171062938869, + "learning_rate": 1.2339813205907906e-05, + "loss": 0.0136, + "step": 27750 + }, + { + "epoch": 6.0295395308427455, + "grad_norm": 0.0007295712712220848, + "learning_rate": 1.2326238053866204e-05, + "loss": 0.0001, + "step": 27760 + }, + { + "epoch": 6.031711555169418, + "grad_norm": 0.0024271977599710226, + "learning_rate": 1.2312662901824501e-05, + "loss": 0.0002, + "step": 27770 + }, + { + "epoch": 6.0338835794960906, + "grad_norm": 0.02930435724556446, + "learning_rate": 1.2299087749782797e-05, + "loss": 0.0002, + "step": 27780 + }, + { + "epoch": 6.036055603822763, + "grad_norm": 0.0006333515630103648, + "learning_rate": 1.2285512597741096e-05, + "loss": 0.0048, + "step": 27790 + }, + { + "epoch": 6.038227628149436, + "grad_norm": 0.003724567359313369, + "learning_rate": 1.2271937445699392e-05, + "loss": 0.0001, + "step": 27800 + }, + { + "epoch": 6.040399652476108, + "grad_norm": 0.0007120940135791898, + "learning_rate": 1.2258362293657689e-05, + "loss": 0.006, + "step": 27810 + }, + { + "epoch": 6.042571676802781, + "grad_norm": 0.000745340483263135, + "learning_rate": 1.2244787141615987e-05, + "loss": 0.0001, + "step": 27820 + }, + { + "epoch": 6.044743701129453, + "grad_norm": 0.0007420461624860764, + "learning_rate": 1.2231211989574285e-05, + "loss": 0.0001, + "step": 27830 + }, + { + "epoch": 6.046915725456125, + "grad_norm": 0.0011481235269457102, + "learning_rate": 1.2217636837532582e-05, + "loss": 0.0001, + "step": 27840 + }, + { + "epoch": 6.049087749782798, + "grad_norm": 0.007169825490564108, + "learning_rate": 1.2204061685490878e-05, + "loss": 0.0002, + "step": 27850 + }, + { + "epoch": 6.05125977410947, + "grad_norm": 0.003404540941119194, + "learning_rate": 1.2190486533449175e-05, + "loss": 0.0047, + "step": 27860 + }, + { + "epoch": 6.053431798436143, + "grad_norm": 0.0006610918790102005, + "learning_rate": 1.2176911381407473e-05, + "loss": 0.0026, + "step": 27870 + }, + { + "epoch": 6.055603822762815, + "grad_norm": 0.0006066480418667197, + "learning_rate": 1.216333622936577e-05, + "loss": 0.0001, + "step": 27880 + }, + { + "epoch": 6.057775847089488, + "grad_norm": 0.0006539212772622705, + "learning_rate": 1.2149761077324066e-05, + "loss": 0.0001, + "step": 27890 + }, + { + "epoch": 6.05994787141616, + "grad_norm": 0.0006049839430488646, + "learning_rate": 1.2136185925282364e-05, + "loss": 0.0002, + "step": 27900 + }, + { + "epoch": 6.062119895742832, + "grad_norm": 0.3469615578651428, + "learning_rate": 1.212261077324066e-05, + "loss": 0.0039, + "step": 27910 + }, + { + "epoch": 6.064291920069505, + "grad_norm": 0.0005610610824078321, + "learning_rate": 1.2109035621198959e-05, + "loss": 0.0001, + "step": 27920 + }, + { + "epoch": 6.066463944396177, + "grad_norm": 0.003698694286867976, + "learning_rate": 1.2095460469157255e-05, + "loss": 0.0004, + "step": 27930 + }, + { + "epoch": 6.06863596872285, + "grad_norm": 0.0033073897939175367, + "learning_rate": 1.2081885317115552e-05, + "loss": 0.0002, + "step": 27940 + }, + { + "epoch": 6.070807993049522, + "grad_norm": 0.0005659732269123197, + "learning_rate": 1.206831016507385e-05, + "loss": 0.0001, + "step": 27950 + }, + { + "epoch": 6.072980017376195, + "grad_norm": 0.002684724284335971, + "learning_rate": 1.2054735013032146e-05, + "loss": 0.0001, + "step": 27960 + }, + { + "epoch": 6.075152041702867, + "grad_norm": 0.0005653385887853801, + "learning_rate": 1.2041159860990443e-05, + "loss": 0.0001, + "step": 27970 + }, + { + "epoch": 6.077324066029539, + "grad_norm": 0.000581434287596494, + "learning_rate": 1.2027584708948741e-05, + "loss": 0.0002, + "step": 27980 + }, + { + "epoch": 6.079496090356212, + "grad_norm": 0.0017103266436606646, + "learning_rate": 1.2014009556907038e-05, + "loss": 0.0001, + "step": 27990 + }, + { + "epoch": 6.081668114682884, + "grad_norm": 0.0005672717234119773, + "learning_rate": 1.2000434404865336e-05, + "loss": 0.0001, + "step": 28000 + }, + { + "epoch": 6.083840139009557, + "grad_norm": 0.0005900085088796914, + "learning_rate": 1.1986859252823632e-05, + "loss": 0.0001, + "step": 28010 + }, + { + "epoch": 6.086012163336229, + "grad_norm": 0.0005386321572586894, + "learning_rate": 1.197328410078193e-05, + "loss": 0.0001, + "step": 28020 + }, + { + "epoch": 6.088184187662902, + "grad_norm": 0.0009092948166653514, + "learning_rate": 1.1959708948740227e-05, + "loss": 0.0001, + "step": 28030 + }, + { + "epoch": 6.090356211989574, + "grad_norm": 0.0005588334170170128, + "learning_rate": 1.1946133796698524e-05, + "loss": 0.0001, + "step": 28040 + }, + { + "epoch": 6.092528236316246, + "grad_norm": 0.0015306295827031136, + "learning_rate": 1.193255864465682e-05, + "loss": 0.0001, + "step": 28050 + }, + { + "epoch": 6.094700260642919, + "grad_norm": 0.0005438084481284022, + "learning_rate": 1.1918983492615118e-05, + "loss": 0.0001, + "step": 28060 + }, + { + "epoch": 6.096872284969591, + "grad_norm": 0.0005323129007592797, + "learning_rate": 1.1905408340573415e-05, + "loss": 0.0001, + "step": 28070 + }, + { + "epoch": 6.099044309296264, + "grad_norm": 0.0005739867337979376, + "learning_rate": 1.1891833188531711e-05, + "loss": 0.0001, + "step": 28080 + }, + { + "epoch": 6.101216333622936, + "grad_norm": 0.004397066310048103, + "learning_rate": 1.187825803649001e-05, + "loss": 0.0001, + "step": 28090 + }, + { + "epoch": 6.103388357949609, + "grad_norm": 0.00933290459215641, + "learning_rate": 1.1864682884448306e-05, + "loss": 0.0001, + "step": 28100 + }, + { + "epoch": 6.105560382276281, + "grad_norm": 0.0016863916534930468, + "learning_rate": 1.1851107732406604e-05, + "loss": 0.0001, + "step": 28110 + }, + { + "epoch": 6.107732406602954, + "grad_norm": 0.0005723837530240417, + "learning_rate": 1.18375325803649e-05, + "loss": 0.0001, + "step": 28120 + }, + { + "epoch": 6.109904430929626, + "grad_norm": 0.0005782105727121234, + "learning_rate": 1.1823957428323197e-05, + "loss": 0.0001, + "step": 28130 + }, + { + "epoch": 6.1120764552562985, + "grad_norm": 0.001487166155129671, + "learning_rate": 1.1810382276281495e-05, + "loss": 0.0047, + "step": 28140 + }, + { + "epoch": 6.1142484795829715, + "grad_norm": 0.0005293136346153915, + "learning_rate": 1.1796807124239792e-05, + "loss": 0.0001, + "step": 28150 + }, + { + "epoch": 6.1164205039096435, + "grad_norm": 0.0006125413347035646, + "learning_rate": 1.1783231972198088e-05, + "loss": 0.0001, + "step": 28160 + }, + { + "epoch": 6.1185925282363165, + "grad_norm": 0.0005350797437131405, + "learning_rate": 1.1769656820156387e-05, + "loss": 0.0001, + "step": 28170 + }, + { + "epoch": 6.120764552562989, + "grad_norm": 0.0005251934053376317, + "learning_rate": 1.1756081668114683e-05, + "loss": 0.0001, + "step": 28180 + }, + { + "epoch": 6.1229365768896615, + "grad_norm": 0.0005300256889313459, + "learning_rate": 1.1742506516072981e-05, + "loss": 0.0049, + "step": 28190 + }, + { + "epoch": 6.125108601216334, + "grad_norm": 0.0026551426853984594, + "learning_rate": 1.1728931364031278e-05, + "loss": 0.0041, + "step": 28200 + }, + { + "epoch": 6.127280625543006, + "grad_norm": 0.000529299140907824, + "learning_rate": 1.1715356211989576e-05, + "loss": 0.0004, + "step": 28210 + }, + { + "epoch": 6.129452649869679, + "grad_norm": 0.0005411395686678588, + "learning_rate": 1.1701781059947873e-05, + "loss": 0.0001, + "step": 28220 + }, + { + "epoch": 6.131624674196351, + "grad_norm": 0.0013515339232981205, + "learning_rate": 1.1688205907906169e-05, + "loss": 0.0001, + "step": 28230 + }, + { + "epoch": 6.133796698523024, + "grad_norm": 0.0005237034056335688, + "learning_rate": 1.1674630755864466e-05, + "loss": 0.0001, + "step": 28240 + }, + { + "epoch": 6.135968722849696, + "grad_norm": 0.0005243255873210728, + "learning_rate": 1.1661055603822764e-05, + "loss": 0.0001, + "step": 28250 + }, + { + "epoch": 6.138140747176369, + "grad_norm": 0.0018816670635715127, + "learning_rate": 1.164748045178106e-05, + "loss": 0.0051, + "step": 28260 + }, + { + "epoch": 6.140312771503041, + "grad_norm": 0.015584097243845463, + "learning_rate": 1.1633905299739357e-05, + "loss": 0.0001, + "step": 28270 + }, + { + "epoch": 6.142484795829713, + "grad_norm": 0.0005188002251088619, + "learning_rate": 1.1621687662901825e-05, + "loss": 0.0136, + "step": 28280 + }, + { + "epoch": 6.144656820156386, + "grad_norm": 0.0005213937256485224, + "learning_rate": 1.1608112510860122e-05, + "loss": 0.0001, + "step": 28290 + }, + { + "epoch": 6.146828844483058, + "grad_norm": 0.0005156263941898942, + "learning_rate": 1.1594537358818418e-05, + "loss": 0.0001, + "step": 28300 + }, + { + "epoch": 6.149000868809731, + "grad_norm": 0.0005170433432795107, + "learning_rate": 1.1580962206776717e-05, + "loss": 0.0001, + "step": 28310 + }, + { + "epoch": 6.151172893136403, + "grad_norm": 0.0017105289734899998, + "learning_rate": 1.1567387054735015e-05, + "loss": 0.0001, + "step": 28320 + }, + { + "epoch": 6.153344917463076, + "grad_norm": 0.0005193065735511482, + "learning_rate": 1.1553811902693311e-05, + "loss": 0.0001, + "step": 28330 + }, + { + "epoch": 6.155516941789748, + "grad_norm": 0.0005105392774567008, + "learning_rate": 1.1540236750651608e-05, + "loss": 0.0001, + "step": 28340 + }, + { + "epoch": 6.157688966116421, + "grad_norm": 0.0005192344542592764, + "learning_rate": 1.1526661598609904e-05, + "loss": 0.0001, + "step": 28350 + }, + { + "epoch": 6.159860990443093, + "grad_norm": 0.0018821783596649766, + "learning_rate": 1.1513086446568203e-05, + "loss": 0.0001, + "step": 28360 + }, + { + "epoch": 6.162033014769765, + "grad_norm": 0.0005060366238467395, + "learning_rate": 1.1499511294526499e-05, + "loss": 0.0001, + "step": 28370 + }, + { + "epoch": 6.164205039096438, + "grad_norm": 0.0005061374395154417, + "learning_rate": 1.1485936142484796e-05, + "loss": 0.0001, + "step": 28380 + }, + { + "epoch": 6.16637706342311, + "grad_norm": 0.000506606069393456, + "learning_rate": 1.1472360990443092e-05, + "loss": 0.0098, + "step": 28390 + }, + { + "epoch": 6.168549087749783, + "grad_norm": 0.001095798914320767, + "learning_rate": 1.145878583840139e-05, + "loss": 0.0001, + "step": 28400 + }, + { + "epoch": 6.170721112076455, + "grad_norm": 0.0005390364676713943, + "learning_rate": 1.1445210686359688e-05, + "loss": 0.0001, + "step": 28410 + }, + { + "epoch": 6.172893136403128, + "grad_norm": 0.020479867234826088, + "learning_rate": 1.1431635534317985e-05, + "loss": 0.0001, + "step": 28420 + }, + { + "epoch": 6.1750651607298, + "grad_norm": 0.0005304127698764205, + "learning_rate": 1.1418060382276283e-05, + "loss": 0.0046, + "step": 28430 + }, + { + "epoch": 6.177237185056472, + "grad_norm": 0.0005232515395618975, + "learning_rate": 1.140448523023458e-05, + "loss": 0.0001, + "step": 28440 + }, + { + "epoch": 6.179409209383145, + "grad_norm": 0.0005304008373059332, + "learning_rate": 1.1390910078192876e-05, + "loss": 0.0001, + "step": 28450 + }, + { + "epoch": 6.181581233709817, + "grad_norm": 0.0015208062250167131, + "learning_rate": 1.1377334926151173e-05, + "loss": 0.0057, + "step": 28460 + }, + { + "epoch": 6.18375325803649, + "grad_norm": 0.0005598796997219324, + "learning_rate": 1.1363759774109471e-05, + "loss": 0.0001, + "step": 28470 + }, + { + "epoch": 6.185925282363162, + "grad_norm": 0.0006372761563397944, + "learning_rate": 1.1350184622067767e-05, + "loss": 0.0001, + "step": 28480 + }, + { + "epoch": 6.188097306689835, + "grad_norm": 0.0023240367881953716, + "learning_rate": 1.1336609470026064e-05, + "loss": 0.0001, + "step": 28490 + }, + { + "epoch": 6.190269331016507, + "grad_norm": 0.0013823268236592412, + "learning_rate": 1.1323034317984362e-05, + "loss": 0.0001, + "step": 28500 + }, + { + "epoch": 6.192441355343179, + "grad_norm": 0.0006541903712786734, + "learning_rate": 1.130945916594266e-05, + "loss": 0.0001, + "step": 28510 + }, + { + "epoch": 6.194613379669852, + "grad_norm": 0.0005835950723849237, + "learning_rate": 1.1295884013900957e-05, + "loss": 0.0001, + "step": 28520 + }, + { + "epoch": 6.196785403996524, + "grad_norm": 0.001270595588721335, + "learning_rate": 1.1282308861859253e-05, + "loss": 0.0001, + "step": 28530 + }, + { + "epoch": 6.198957428323197, + "grad_norm": 0.0005248417728580534, + "learning_rate": 1.126873370981755e-05, + "loss": 0.0001, + "step": 28540 + }, + { + "epoch": 6.2011294526498695, + "grad_norm": 0.732323944568634, + "learning_rate": 1.1255158557775848e-05, + "loss": 0.0021, + "step": 28550 + }, + { + "epoch": 6.203301476976542, + "grad_norm": 0.0004987027496099472, + "learning_rate": 1.1241583405734145e-05, + "loss": 0.0001, + "step": 28560 + }, + { + "epoch": 6.2054735013032145, + "grad_norm": 0.0007200418040156364, + "learning_rate": 1.1228008253692441e-05, + "loss": 0.0001, + "step": 28570 + }, + { + "epoch": 6.2076455256298875, + "grad_norm": 0.0005158480489626527, + "learning_rate": 1.1214433101650738e-05, + "loss": 0.0089, + "step": 28580 + }, + { + "epoch": 6.2098175499565595, + "grad_norm": 0.0005337757174856961, + "learning_rate": 1.1200857949609036e-05, + "loss": 0.0001, + "step": 28590 + }, + { + "epoch": 6.211989574283232, + "grad_norm": 0.0005403549293987453, + "learning_rate": 1.1187282797567334e-05, + "loss": 0.0002, + "step": 28600 + }, + { + "epoch": 6.214161598609905, + "grad_norm": 0.0020134146325290203, + "learning_rate": 1.117370764552563e-05, + "loss": 0.0031, + "step": 28610 + }, + { + "epoch": 6.216333622936577, + "grad_norm": 0.0007066564867272973, + "learning_rate": 1.1160132493483927e-05, + "loss": 0.0001, + "step": 28620 + }, + { + "epoch": 6.21850564726325, + "grad_norm": 0.0005003396654501557, + "learning_rate": 1.1146557341442225e-05, + "loss": 0.0001, + "step": 28630 + }, + { + "epoch": 6.220677671589922, + "grad_norm": 0.0017538318643346429, + "learning_rate": 1.1132982189400522e-05, + "loss": 0.0001, + "step": 28640 + }, + { + "epoch": 6.222849695916595, + "grad_norm": 0.0005141882575117052, + "learning_rate": 1.1119407037358818e-05, + "loss": 0.0001, + "step": 28650 + }, + { + "epoch": 6.225021720243267, + "grad_norm": 0.0005083966534584761, + "learning_rate": 1.1105831885317116e-05, + "loss": 0.0001, + "step": 28660 + }, + { + "epoch": 6.227193744569939, + "grad_norm": 0.000546907598618418, + "learning_rate": 1.1092256733275413e-05, + "loss": 0.0001, + "step": 28670 + }, + { + "epoch": 6.229365768896612, + "grad_norm": 0.0015611139824613929, + "learning_rate": 1.107868158123371e-05, + "loss": 0.006, + "step": 28680 + }, + { + "epoch": 6.231537793223284, + "grad_norm": 0.0006153453723527491, + "learning_rate": 1.1065106429192008e-05, + "loss": 0.0044, + "step": 28690 + }, + { + "epoch": 6.233709817549957, + "grad_norm": 0.000602956220973283, + "learning_rate": 1.1051531277150306e-05, + "loss": 0.0003, + "step": 28700 + }, + { + "epoch": 6.235881841876629, + "grad_norm": 0.001490755588747561, + "learning_rate": 1.1037956125108602e-05, + "loss": 0.0001, + "step": 28710 + }, + { + "epoch": 6.238053866203302, + "grad_norm": 0.0006966798682697117, + "learning_rate": 1.1024380973066899e-05, + "loss": 0.0001, + "step": 28720 + }, + { + "epoch": 6.240225890529974, + "grad_norm": 0.0005024041165597737, + "learning_rate": 1.1010805821025195e-05, + "loss": 0.0001, + "step": 28730 + }, + { + "epoch": 6.242397914856646, + "grad_norm": 0.0009793771896511316, + "learning_rate": 1.0997230668983494e-05, + "loss": 0.0001, + "step": 28740 + }, + { + "epoch": 6.244569939183319, + "grad_norm": 0.0005014459602534771, + "learning_rate": 1.098365551694179e-05, + "loss": 0.0001, + "step": 28750 + }, + { + "epoch": 6.246741963509991, + "grad_norm": 0.0006804326549172401, + "learning_rate": 1.0970080364900087e-05, + "loss": 0.0001, + "step": 28760 + }, + { + "epoch": 6.248913987836664, + "grad_norm": 0.0005381435621529818, + "learning_rate": 1.0956505212858385e-05, + "loss": 0.0001, + "step": 28770 + }, + { + "epoch": 6.251086012163336, + "grad_norm": 0.00048754297313280404, + "learning_rate": 1.0942930060816681e-05, + "loss": 0.002, + "step": 28780 + }, + { + "epoch": 6.253258036490009, + "grad_norm": 0.000489606405608356, + "learning_rate": 1.092935490877498e-05, + "loss": 0.0061, + "step": 28790 + }, + { + "epoch": 6.255430060816681, + "grad_norm": 0.0004804472264368087, + "learning_rate": 1.0915779756733276e-05, + "loss": 0.0001, + "step": 28800 + }, + { + "epoch": 6.257602085143354, + "grad_norm": 0.0005286936648190022, + "learning_rate": 1.0902204604691573e-05, + "loss": 0.0001, + "step": 28810 + }, + { + "epoch": 6.259774109470026, + "grad_norm": 0.00048563070595264435, + "learning_rate": 1.088862945264987e-05, + "loss": 0.0001, + "step": 28820 + }, + { + "epoch": 6.261946133796698, + "grad_norm": 0.0004783800686709583, + "learning_rate": 1.0875054300608167e-05, + "loss": 0.0001, + "step": 28830 + }, + { + "epoch": 6.264118158123371, + "grad_norm": 0.00048122688895091414, + "learning_rate": 1.0861479148566464e-05, + "loss": 0.0046, + "step": 28840 + }, + { + "epoch": 6.266290182450043, + "grad_norm": 0.00048009914462454617, + "learning_rate": 1.0847903996524762e-05, + "loss": 0.0143, + "step": 28850 + }, + { + "epoch": 6.268462206776716, + "grad_norm": 0.0004848612006753683, + "learning_rate": 1.0834328844483058e-05, + "loss": 0.0001, + "step": 28860 + }, + { + "epoch": 6.270634231103388, + "grad_norm": 0.0009488983778283, + "learning_rate": 1.0820753692441355e-05, + "loss": 0.0001, + "step": 28870 + }, + { + "epoch": 6.272806255430061, + "grad_norm": 0.0010840209433808923, + "learning_rate": 1.0807178540399653e-05, + "loss": 0.0054, + "step": 28880 + }, + { + "epoch": 6.274978279756733, + "grad_norm": 0.00047432768042199314, + "learning_rate": 1.0793603388357951e-05, + "loss": 0.0001, + "step": 28890 + }, + { + "epoch": 6.277150304083405, + "grad_norm": 0.0009752907208167017, + "learning_rate": 1.0780028236316248e-05, + "loss": 0.0041, + "step": 28900 + }, + { + "epoch": 6.279322328410078, + "grad_norm": 0.02803998626768589, + "learning_rate": 1.0766453084274544e-05, + "loss": 0.0001, + "step": 28910 + }, + { + "epoch": 6.28149435273675, + "grad_norm": 0.0023725698702037334, + "learning_rate": 1.0752877932232841e-05, + "loss": 0.0001, + "step": 28920 + }, + { + "epoch": 6.283666377063423, + "grad_norm": 0.00048725263332016766, + "learning_rate": 1.0739302780191139e-05, + "loss": 0.0001, + "step": 28930 + }, + { + "epoch": 6.285838401390095, + "grad_norm": 0.0009969660313799977, + "learning_rate": 1.0725727628149436e-05, + "loss": 0.0001, + "step": 28940 + }, + { + "epoch": 6.288010425716768, + "grad_norm": 0.00048657169099897146, + "learning_rate": 1.0712152476107732e-05, + "loss": 0.0001, + "step": 28950 + }, + { + "epoch": 6.2901824500434405, + "grad_norm": 0.0012719962978735566, + "learning_rate": 1.069857732406603e-05, + "loss": 0.0001, + "step": 28960 + }, + { + "epoch": 6.2923544743701125, + "grad_norm": 0.00047141601680777967, + "learning_rate": 1.0685002172024327e-05, + "loss": 0.0001, + "step": 28970 + }, + { + "epoch": 6.2945264986967855, + "grad_norm": 0.0004745768674183637, + "learning_rate": 1.0671427019982625e-05, + "loss": 0.0, + "step": 28980 + }, + { + "epoch": 6.296698523023458, + "grad_norm": 0.0005675083375535905, + "learning_rate": 1.0657851867940922e-05, + "loss": 0.0001, + "step": 28990 + }, + { + "epoch": 6.2988705473501305, + "grad_norm": 0.00048619258450344205, + "learning_rate": 1.0644276715899218e-05, + "loss": 0.0001, + "step": 29000 + }, + { + "epoch": 6.301042571676803, + "grad_norm": 0.0004687570908572525, + "learning_rate": 1.0630701563857516e-05, + "loss": 0.0001, + "step": 29010 + }, + { + "epoch": 6.303214596003476, + "grad_norm": 0.0004763658216688782, + "learning_rate": 1.0617126411815813e-05, + "loss": 0.0001, + "step": 29020 + }, + { + "epoch": 6.305386620330148, + "grad_norm": 0.0004660674021579325, + "learning_rate": 1.060355125977411e-05, + "loss": 0.0001, + "step": 29030 + }, + { + "epoch": 6.307558644656821, + "grad_norm": 0.0004942914238199592, + "learning_rate": 1.0589976107732407e-05, + "loss": 0.0001, + "step": 29040 + }, + { + "epoch": 6.309730668983493, + "grad_norm": 0.0004738739226013422, + "learning_rate": 1.0576400955690704e-05, + "loss": 0.0001, + "step": 29050 + }, + { + "epoch": 6.311902693310165, + "grad_norm": 0.0004960225778631866, + "learning_rate": 1.0562825803649002e-05, + "loss": 0.0001, + "step": 29060 + }, + { + "epoch": 6.314074717636838, + "grad_norm": 0.00047240074491128325, + "learning_rate": 1.0549250651607299e-05, + "loss": 0.0001, + "step": 29070 + }, + { + "epoch": 6.31624674196351, + "grad_norm": 0.0005025041755288839, + "learning_rate": 1.0535675499565597e-05, + "loss": 0.0001, + "step": 29080 + }, + { + "epoch": 6.318418766290183, + "grad_norm": 0.0004930326831527054, + "learning_rate": 1.0522100347523893e-05, + "loss": 0.0001, + "step": 29090 + }, + { + "epoch": 6.320590790616855, + "grad_norm": 0.0010761783923953772, + "learning_rate": 1.050852519548219e-05, + "loss": 0.0001, + "step": 29100 + }, + { + "epoch": 6.322762814943528, + "grad_norm": 0.00047367255319841206, + "learning_rate": 1.0494950043440486e-05, + "loss": 0.0001, + "step": 29110 + }, + { + "epoch": 6.3249348392702, + "grad_norm": 0.0015355387004092336, + "learning_rate": 1.0481374891398785e-05, + "loss": 0.0607, + "step": 29120 + }, + { + "epoch": 6.327106863596872, + "grad_norm": 0.0014881700044497848, + "learning_rate": 1.0467799739357081e-05, + "loss": 0.0045, + "step": 29130 + }, + { + "epoch": 6.329278887923545, + "grad_norm": 0.0017769057303667068, + "learning_rate": 1.0454224587315378e-05, + "loss": 0.0002, + "step": 29140 + }, + { + "epoch": 6.331450912250217, + "grad_norm": 0.23785489797592163, + "learning_rate": 1.0440649435273676e-05, + "loss": 0.0054, + "step": 29150 + }, + { + "epoch": 6.33362293657689, + "grad_norm": 2.4570157527923584, + "learning_rate": 1.0427074283231972e-05, + "loss": 0.0405, + "step": 29160 + }, + { + "epoch": 6.335794960903562, + "grad_norm": 0.001386605086736381, + "learning_rate": 1.041349913119027e-05, + "loss": 0.0002, + "step": 29170 + }, + { + "epoch": 6.337966985230235, + "grad_norm": 0.006186521612107754, + "learning_rate": 1.0399923979148567e-05, + "loss": 0.0003, + "step": 29180 + }, + { + "epoch": 6.340139009556907, + "grad_norm": 0.002584697911515832, + "learning_rate": 1.0386348827106864e-05, + "loss": 0.0006, + "step": 29190 + }, + { + "epoch": 6.342311033883579, + "grad_norm": 0.0030540975276380777, + "learning_rate": 1.0372773675065162e-05, + "loss": 0.0003, + "step": 29200 + }, + { + "epoch": 6.344483058210252, + "grad_norm": 0.001179517013952136, + "learning_rate": 1.0359198523023458e-05, + "loss": 0.001, + "step": 29210 + }, + { + "epoch": 6.346655082536924, + "grad_norm": 0.007270792964845896, + "learning_rate": 1.0345623370981755e-05, + "loss": 0.0001, + "step": 29220 + }, + { + "epoch": 6.348827106863597, + "grad_norm": 0.001838905387558043, + "learning_rate": 1.0332048218940053e-05, + "loss": 0.0001, + "step": 29230 + }, + { + "epoch": 6.350999131190269, + "grad_norm": 0.2083090841770172, + "learning_rate": 1.031847306689835e-05, + "loss": 0.0052, + "step": 29240 + }, + { + "epoch": 6.353171155516942, + "grad_norm": 0.005161400884389877, + "learning_rate": 1.0304897914856648e-05, + "loss": 0.0001, + "step": 29250 + }, + { + "epoch": 6.355343179843614, + "grad_norm": 0.0009388396283611655, + "learning_rate": 1.0291322762814944e-05, + "loss": 0.0001, + "step": 29260 + }, + { + "epoch": 6.357515204170287, + "grad_norm": 0.001116700004786253, + "learning_rate": 1.0277747610773242e-05, + "loss": 0.0001, + "step": 29270 + }, + { + "epoch": 6.359687228496959, + "grad_norm": 0.0007811775431036949, + "learning_rate": 1.0264172458731539e-05, + "loss": 0.0001, + "step": 29280 + }, + { + "epoch": 6.361859252823631, + "grad_norm": 0.0009371462510898709, + "learning_rate": 1.0250597306689835e-05, + "loss": 0.0043, + "step": 29290 + }, + { + "epoch": 6.364031277150304, + "grad_norm": 0.0007141511887311935, + "learning_rate": 1.0237022154648132e-05, + "loss": 0.0001, + "step": 29300 + }, + { + "epoch": 6.366203301476976, + "grad_norm": 0.0007462035282514989, + "learning_rate": 1.022344700260643e-05, + "loss": 0.0002, + "step": 29310 + }, + { + "epoch": 6.368375325803649, + "grad_norm": 0.0007523217936977744, + "learning_rate": 1.0209871850564727e-05, + "loss": 0.0001, + "step": 29320 + }, + { + "epoch": 6.370547350130321, + "grad_norm": 0.0008268319652415812, + "learning_rate": 1.0196296698523023e-05, + "loss": 0.0001, + "step": 29330 + }, + { + "epoch": 6.372719374456994, + "grad_norm": 0.0009327057632617652, + "learning_rate": 1.0182721546481321e-05, + "loss": 0.0002, + "step": 29340 + }, + { + "epoch": 6.374891398783666, + "grad_norm": 0.0016702677821740508, + "learning_rate": 1.0169146394439618e-05, + "loss": 0.0046, + "step": 29350 + }, + { + "epoch": 6.3770634231103385, + "grad_norm": 0.0011580121936276555, + "learning_rate": 1.0155571242397916e-05, + "loss": 0.0222, + "step": 29360 + }, + { + "epoch": 6.379235447437011, + "grad_norm": 0.0007400794420391321, + "learning_rate": 1.0141996090356213e-05, + "loss": 0.0001, + "step": 29370 + }, + { + "epoch": 6.3814074717636835, + "grad_norm": 0.0006812246283516288, + "learning_rate": 1.0128420938314509e-05, + "loss": 0.0001, + "step": 29380 + }, + { + "epoch": 6.3835794960903565, + "grad_norm": 0.0010162891121581197, + "learning_rate": 1.0114845786272807e-05, + "loss": 0.0002, + "step": 29390 + }, + { + "epoch": 6.3857515204170285, + "grad_norm": 0.0008094085496850312, + "learning_rate": 1.0101270634231104e-05, + "loss": 0.0001, + "step": 29400 + }, + { + "epoch": 6.3879235447437015, + "grad_norm": 0.0006741775432601571, + "learning_rate": 1.00876954821894e-05, + "loss": 0.016, + "step": 29410 + }, + { + "epoch": 6.390095569070374, + "grad_norm": 0.0005891403998248279, + "learning_rate": 1.0074120330147697e-05, + "loss": 0.0001, + "step": 29420 + }, + { + "epoch": 6.392267593397046, + "grad_norm": 0.0006258541252464056, + "learning_rate": 1.0060545178105995e-05, + "loss": 0.0001, + "step": 29430 + }, + { + "epoch": 6.394439617723719, + "grad_norm": 0.0006183306686580181, + "learning_rate": 1.0046970026064293e-05, + "loss": 0.0001, + "step": 29440 + }, + { + "epoch": 6.396611642050391, + "grad_norm": 0.0007246190216392279, + "learning_rate": 1.003339487402259e-05, + "loss": 0.0586, + "step": 29450 + }, + { + "epoch": 6.398783666377064, + "grad_norm": 0.002597115933895111, + "learning_rate": 1.0019819721980888e-05, + "loss": 0.0002, + "step": 29460 + }, + { + "epoch": 6.400955690703736, + "grad_norm": 0.0027950238436460495, + "learning_rate": 1.0006244569939184e-05, + "loss": 0.0005, + "step": 29470 + }, + { + "epoch": 6.403127715030409, + "grad_norm": 0.004178280476480722, + "learning_rate": 9.992669417897481e-06, + "loss": 0.029, + "step": 29480 + }, + { + "epoch": 6.405299739357081, + "grad_norm": 0.02065322920680046, + "learning_rate": 9.979094265855777e-06, + "loss": 0.0009, + "step": 29490 + }, + { + "epoch": 6.407471763683754, + "grad_norm": 0.012738760560750961, + "learning_rate": 9.965519113814076e-06, + "loss": 0.0016, + "step": 29500 + }, + { + "epoch": 6.409643788010426, + "grad_norm": 0.019186438992619514, + "learning_rate": 9.951943961772372e-06, + "loss": 0.0011, + "step": 29510 + }, + { + "epoch": 6.411815812337098, + "grad_norm": 0.0061889952048659325, + "learning_rate": 9.938368809730669e-06, + "loss": 0.0004, + "step": 29520 + }, + { + "epoch": 6.413987836663771, + "grad_norm": 0.004089644178748131, + "learning_rate": 9.924793657688967e-06, + "loss": 0.0003, + "step": 29530 + }, + { + "epoch": 6.416159860990443, + "grad_norm": 0.003858107840642333, + "learning_rate": 9.911218505647265e-06, + "loss": 0.0003, + "step": 29540 + }, + { + "epoch": 6.418331885317116, + "grad_norm": 0.8940383791923523, + "learning_rate": 9.897643353605562e-06, + "loss": 0.0059, + "step": 29550 + }, + { + "epoch": 6.420503909643788, + "grad_norm": 0.001113207545131445, + "learning_rate": 9.884068201563858e-06, + "loss": 0.0002, + "step": 29560 + }, + { + "epoch": 6.422675933970461, + "grad_norm": 0.0023445405531674623, + "learning_rate": 9.870493049522155e-06, + "loss": 0.0003, + "step": 29570 + }, + { + "epoch": 6.424847958297133, + "grad_norm": 0.0012578286696225405, + "learning_rate": 9.856917897480453e-06, + "loss": 0.0087, + "step": 29580 + }, + { + "epoch": 6.427019982623805, + "grad_norm": 0.01483129058033228, + "learning_rate": 9.84334274543875e-06, + "loss": 0.0002, + "step": 29590 + }, + { + "epoch": 6.429192006950478, + "grad_norm": 0.0018328166333958507, + "learning_rate": 9.829767593397046e-06, + "loss": 0.0002, + "step": 29600 + }, + { + "epoch": 6.43136403127715, + "grad_norm": 0.0012817789101973176, + "learning_rate": 9.816192441355342e-06, + "loss": 0.0001, + "step": 29610 + }, + { + "epoch": 6.433536055603823, + "grad_norm": 0.002289244905114174, + "learning_rate": 9.80261728931364e-06, + "loss": 0.0002, + "step": 29620 + }, + { + "epoch": 6.435708079930495, + "grad_norm": 0.0060974303632974625, + "learning_rate": 9.789042137271939e-06, + "loss": 0.0054, + "step": 29630 + }, + { + "epoch": 6.437880104257168, + "grad_norm": 0.0017827164847403765, + "learning_rate": 9.775466985230235e-06, + "loss": 0.0002, + "step": 29640 + }, + { + "epoch": 6.44005212858384, + "grad_norm": 0.002662285231053829, + "learning_rate": 9.761891833188533e-06, + "loss": 0.0002, + "step": 29650 + }, + { + "epoch": 6.442224152910512, + "grad_norm": 0.0029989073518663645, + "learning_rate": 9.74831668114683e-06, + "loss": 0.0005, + "step": 29660 + }, + { + "epoch": 6.444396177237185, + "grad_norm": 0.0008592927479185164, + "learning_rate": 9.734741529105126e-06, + "loss": 0.0001, + "step": 29670 + }, + { + "epoch": 6.446568201563857, + "grad_norm": 0.646806001663208, + "learning_rate": 9.721166377063423e-06, + "loss": 0.0091, + "step": 29680 + }, + { + "epoch": 6.44874022589053, + "grad_norm": 0.0013074136804789305, + "learning_rate": 9.707591225021721e-06, + "loss": 0.0002, + "step": 29690 + }, + { + "epoch": 6.450912250217202, + "grad_norm": 0.0017774419393390417, + "learning_rate": 9.694016072980018e-06, + "loss": 0.0002, + "step": 29700 + }, + { + "epoch": 6.453084274543875, + "grad_norm": 0.0008454259368591011, + "learning_rate": 9.680440920938314e-06, + "loss": 0.0001, + "step": 29710 + }, + { + "epoch": 6.455256298870547, + "grad_norm": 0.0038401109632104635, + "learning_rate": 9.666865768896612e-06, + "loss": 0.0048, + "step": 29720 + }, + { + "epoch": 6.45742832319722, + "grad_norm": 0.001266291132196784, + "learning_rate": 9.65329061685491e-06, + "loss": 0.0001, + "step": 29730 + }, + { + "epoch": 6.459600347523892, + "grad_norm": 0.002196115907281637, + "learning_rate": 9.639715464813207e-06, + "loss": 0.0036, + "step": 29740 + }, + { + "epoch": 6.461772371850564, + "grad_norm": 0.000684195663779974, + "learning_rate": 9.626140312771504e-06, + "loss": 0.0001, + "step": 29750 + }, + { + "epoch": 6.463944396177237, + "grad_norm": 0.0014452398754656315, + "learning_rate": 9.6125651607298e-06, + "loss": 0.0001, + "step": 29760 + }, + { + "epoch": 6.4661164205039094, + "grad_norm": 0.04958047717809677, + "learning_rate": 9.598990008688098e-06, + "loss": 0.0002, + "step": 29770 + }, + { + "epoch": 6.468288444830582, + "grad_norm": 0.0006793006905354559, + "learning_rate": 9.585414856646395e-06, + "loss": 0.0005, + "step": 29780 + }, + { + "epoch": 6.4704604691572545, + "grad_norm": 0.0006012742524035275, + "learning_rate": 9.571839704604691e-06, + "loss": 0.0001, + "step": 29790 + }, + { + "epoch": 6.4726324934839266, + "grad_norm": 0.0005449285381473601, + "learning_rate": 9.558264552562988e-06, + "loss": 0.0001, + "step": 29800 + }, + { + "epoch": 6.4748045178105995, + "grad_norm": 0.0005929931649006903, + "learning_rate": 9.544689400521286e-06, + "loss": 0.0002, + "step": 29810 + }, + { + "epoch": 6.476976542137272, + "grad_norm": 0.0006561490008607507, + "learning_rate": 9.531114248479584e-06, + "loss": 0.0001, + "step": 29820 + }, + { + "epoch": 6.4791485664639445, + "grad_norm": 0.0006006149342283607, + "learning_rate": 9.51753909643788e-06, + "loss": 0.0001, + "step": 29830 + }, + { + "epoch": 6.481320590790617, + "grad_norm": 0.000574872363358736, + "learning_rate": 9.503963944396179e-06, + "loss": 0.0001, + "step": 29840 + }, + { + "epoch": 6.48349261511729, + "grad_norm": 0.0010642276611179113, + "learning_rate": 9.490388792354475e-06, + "loss": 0.0001, + "step": 29850 + }, + { + "epoch": 6.485664639443962, + "grad_norm": 0.0005911542684771121, + "learning_rate": 9.476813640312772e-06, + "loss": 0.0001, + "step": 29860 + }, + { + "epoch": 6.487836663770635, + "grad_norm": 0.0006189719424583018, + "learning_rate": 9.463238488271068e-06, + "loss": 0.0001, + "step": 29870 + }, + { + "epoch": 6.490008688097307, + "grad_norm": 0.0006683343090116978, + "learning_rate": 9.449663336229367e-06, + "loss": 0.0001, + "step": 29880 + }, + { + "epoch": 6.492180712423979, + "grad_norm": 0.0014524429570883512, + "learning_rate": 9.436088184187663e-06, + "loss": 0.0039, + "step": 29890 + }, + { + "epoch": 6.494352736750652, + "grad_norm": 0.0006171105778776109, + "learning_rate": 9.42251303214596e-06, + "loss": 0.0001, + "step": 29900 + }, + { + "epoch": 6.496524761077324, + "grad_norm": 0.0005288756219670177, + "learning_rate": 9.408937880104258e-06, + "loss": 0.0001, + "step": 29910 + }, + { + "epoch": 6.498696785403997, + "grad_norm": 0.0005219160229898989, + "learning_rate": 9.395362728062556e-06, + "loss": 0.0001, + "step": 29920 + }, + { + "epoch": 6.500868809730669, + "grad_norm": 0.0006902749300934374, + "learning_rate": 9.381787576020853e-06, + "loss": 0.0087, + "step": 29930 + }, + { + "epoch": 6.503040834057342, + "grad_norm": 0.0009760453249327838, + "learning_rate": 9.368212423979149e-06, + "loss": 0.0001, + "step": 29940 + }, + { + "epoch": 6.505212858384014, + "grad_norm": 0.0005377961206249893, + "learning_rate": 9.354637271937446e-06, + "loss": 0.0001, + "step": 29950 + }, + { + "epoch": 6.507384882710687, + "grad_norm": 0.0018513593822717667, + "learning_rate": 9.341062119895744e-06, + "loss": 0.0001, + "step": 29960 + }, + { + "epoch": 6.509556907037359, + "grad_norm": 0.0005402403767220676, + "learning_rate": 9.32748696785404e-06, + "loss": 0.0001, + "step": 29970 + }, + { + "epoch": 6.511728931364031, + "grad_norm": 0.0006104527274146676, + "learning_rate": 9.313911815812337e-06, + "loss": 0.0048, + "step": 29980 + }, + { + "epoch": 6.513900955690704, + "grad_norm": 0.0010075703030452132, + "learning_rate": 9.300336663770633e-06, + "loss": 0.0001, + "step": 29990 + }, + { + "epoch": 6.516072980017376, + "grad_norm": 0.0005355360917747021, + "learning_rate": 9.286761511728932e-06, + "loss": 0.0001, + "step": 30000 + }, + { + "epoch": 6.518245004344049, + "grad_norm": 0.0005514567019417882, + "learning_rate": 9.27318635968723e-06, + "loss": 0.0001, + "step": 30010 + }, + { + "epoch": 6.520417028670721, + "grad_norm": 0.0005905954749323428, + "learning_rate": 9.259611207645526e-06, + "loss": 0.0001, + "step": 30020 + }, + { + "epoch": 6.522589052997393, + "grad_norm": 0.0005409869481809437, + "learning_rate": 9.246036055603823e-06, + "loss": 0.0121, + "step": 30030 + }, + { + "epoch": 6.524761077324066, + "grad_norm": 0.0005206941277720034, + "learning_rate": 9.232460903562121e-06, + "loss": 0.0001, + "step": 30040 + }, + { + "epoch": 6.526933101650738, + "grad_norm": 0.0010512637672945857, + "learning_rate": 9.218885751520417e-06, + "loss": 0.0001, + "step": 30050 + }, + { + "epoch": 6.529105125977411, + "grad_norm": 0.0009512483957223594, + "learning_rate": 9.205310599478714e-06, + "loss": 0.0001, + "step": 30060 + }, + { + "epoch": 6.531277150304083, + "grad_norm": 0.000541405170224607, + "learning_rate": 9.191735447437012e-06, + "loss": 0.0001, + "step": 30070 + }, + { + "epoch": 6.533449174630756, + "grad_norm": 0.0005479655810631812, + "learning_rate": 9.178160295395309e-06, + "loss": 0.0001, + "step": 30080 + }, + { + "epoch": 6.535621198957428, + "grad_norm": 0.0006360400002449751, + "learning_rate": 9.164585143353605e-06, + "loss": 0.0042, + "step": 30090 + }, + { + "epoch": 6.537793223284101, + "grad_norm": 0.0021848746109753847, + "learning_rate": 9.151009991311903e-06, + "loss": 0.0001, + "step": 30100 + }, + { + "epoch": 6.539965247610773, + "grad_norm": 0.0005829419824294746, + "learning_rate": 9.137434839270202e-06, + "loss": 0.0001, + "step": 30110 + }, + { + "epoch": 6.542137271937445, + "grad_norm": 0.0009212974109686911, + "learning_rate": 9.123859687228498e-06, + "loss": 0.0001, + "step": 30120 + }, + { + "epoch": 6.544309296264118, + "grad_norm": 0.000987934647127986, + "learning_rate": 9.110284535186795e-06, + "loss": 0.0001, + "step": 30130 + }, + { + "epoch": 6.54648132059079, + "grad_norm": 0.0005119699635542929, + "learning_rate": 9.096709383145091e-06, + "loss": 0.0001, + "step": 30140 + }, + { + "epoch": 6.548653344917463, + "grad_norm": 0.0005486300215125084, + "learning_rate": 9.08313423110339e-06, + "loss": 0.0041, + "step": 30150 + }, + { + "epoch": 6.550825369244135, + "grad_norm": 0.0005901909316889942, + "learning_rate": 9.069559079061686e-06, + "loss": 0.0001, + "step": 30160 + }, + { + "epoch": 6.552997393570808, + "grad_norm": 0.0005802253726869822, + "learning_rate": 9.055983927019982e-06, + "loss": 0.0001, + "step": 30170 + }, + { + "epoch": 6.55516941789748, + "grad_norm": 0.0005272625712677836, + "learning_rate": 9.042408774978279e-06, + "loss": 0.0001, + "step": 30180 + }, + { + "epoch": 6.557341442224153, + "grad_norm": 0.0018508587963879108, + "learning_rate": 9.028833622936577e-06, + "loss": 0.0021, + "step": 30190 + }, + { + "epoch": 6.5595134665508255, + "grad_norm": 0.0005285025690682232, + "learning_rate": 9.015258470894875e-06, + "loss": 0.0001, + "step": 30200 + }, + { + "epoch": 6.5616854908774975, + "grad_norm": 0.0005890244501642883, + "learning_rate": 9.001683318853172e-06, + "loss": 0.0001, + "step": 30210 + }, + { + "epoch": 6.5638575152041705, + "grad_norm": 0.00271811755374074, + "learning_rate": 8.988108166811468e-06, + "loss": 0.0001, + "step": 30220 + }, + { + "epoch": 6.566029539530843, + "grad_norm": 0.0010901761706918478, + "learning_rate": 8.974533014769767e-06, + "loss": 0.0001, + "step": 30230 + }, + { + "epoch": 6.5682015638575155, + "grad_norm": 0.0005904104909859598, + "learning_rate": 8.960957862728063e-06, + "loss": 0.0001, + "step": 30240 + }, + { + "epoch": 6.570373588184188, + "grad_norm": 0.0005831909948028624, + "learning_rate": 8.94738271068636e-06, + "loss": 0.0001, + "step": 30250 + }, + { + "epoch": 6.57254561251086, + "grad_norm": 0.20518867671489716, + "learning_rate": 8.933807558644658e-06, + "loss": 0.005, + "step": 30260 + }, + { + "epoch": 6.574717636837533, + "grad_norm": 0.0007069736020639539, + "learning_rate": 8.920232406602954e-06, + "loss": 0.0038, + "step": 30270 + }, + { + "epoch": 6.576889661164205, + "grad_norm": 0.0006283735274337232, + "learning_rate": 8.90665725456125e-06, + "loss": 0.0001, + "step": 30280 + }, + { + "epoch": 6.579061685490878, + "grad_norm": 0.0012741173850372434, + "learning_rate": 8.893082102519549e-06, + "loss": 0.0001, + "step": 30290 + }, + { + "epoch": 6.58123370981755, + "grad_norm": 0.0006880299188196659, + "learning_rate": 8.879506950477847e-06, + "loss": 0.0046, + "step": 30300 + }, + { + "epoch": 6.583405734144223, + "grad_norm": 0.0005726420204155147, + "learning_rate": 8.865931798436144e-06, + "loss": 0.0001, + "step": 30310 + }, + { + "epoch": 6.585577758470895, + "grad_norm": 0.0005194940022192895, + "learning_rate": 8.85235664639444e-06, + "loss": 0.0001, + "step": 30320 + }, + { + "epoch": 6.587749782797568, + "grad_norm": 0.0005502538406290114, + "learning_rate": 8.838781494352737e-06, + "loss": 0.0001, + "step": 30330 + }, + { + "epoch": 6.58992180712424, + "grad_norm": 0.0007209287723526359, + "learning_rate": 8.825206342311035e-06, + "loss": 0.0001, + "step": 30340 + }, + { + "epoch": 6.592093831450912, + "grad_norm": 0.0005370117723941803, + "learning_rate": 8.811631190269331e-06, + "loss": 0.0001, + "step": 30350 + }, + { + "epoch": 6.594265855777585, + "grad_norm": 0.0005737512256018817, + "learning_rate": 8.798056038227628e-06, + "loss": 0.0001, + "step": 30360 + }, + { + "epoch": 6.596437880104257, + "grad_norm": 0.0005802140804007649, + "learning_rate": 8.784480886185924e-06, + "loss": 0.0001, + "step": 30370 + }, + { + "epoch": 6.59860990443093, + "grad_norm": 0.0005447014700621367, + "learning_rate": 8.770905734144223e-06, + "loss": 0.0001, + "step": 30380 + }, + { + "epoch": 6.600781928757602, + "grad_norm": 0.0006462910096161067, + "learning_rate": 8.75733058210252e-06, + "loss": 0.0034, + "step": 30390 + }, + { + "epoch": 6.602953953084275, + "grad_norm": 0.0005304300575517118, + "learning_rate": 8.743755430060817e-06, + "loss": 0.0001, + "step": 30400 + }, + { + "epoch": 6.605125977410947, + "grad_norm": 0.000523955503012985, + "learning_rate": 8.730180278019114e-06, + "loss": 0.0001, + "step": 30410 + }, + { + "epoch": 6.60729800173762, + "grad_norm": 0.0006388821639120579, + "learning_rate": 8.716605125977412e-06, + "loss": 0.0001, + "step": 30420 + }, + { + "epoch": 6.609470026064292, + "grad_norm": 0.0006965682841837406, + "learning_rate": 8.703029973935709e-06, + "loss": 0.0001, + "step": 30430 + }, + { + "epoch": 6.611642050390964, + "grad_norm": 0.0008022001711651683, + "learning_rate": 8.689454821894005e-06, + "loss": 0.0001, + "step": 30440 + }, + { + "epoch": 6.613814074717637, + "grad_norm": 0.0005276188021525741, + "learning_rate": 8.675879669852303e-06, + "loss": 0.0001, + "step": 30450 + }, + { + "epoch": 6.615986099044309, + "grad_norm": 0.0004995689378120005, + "learning_rate": 8.6623045178106e-06, + "loss": 0.0001, + "step": 30460 + }, + { + "epoch": 6.618158123370982, + "grad_norm": 0.0006201511714607477, + "learning_rate": 8.648729365768896e-06, + "loss": 0.0001, + "step": 30470 + }, + { + "epoch": 6.620330147697654, + "grad_norm": 0.0005640414892695844, + "learning_rate": 8.635154213727194e-06, + "loss": 0.0004, + "step": 30480 + }, + { + "epoch": 6.622502172024326, + "grad_norm": 0.000546163646504283, + "learning_rate": 8.621579061685493e-06, + "loss": 0.0065, + "step": 30490 + }, + { + "epoch": 6.624674196350999, + "grad_norm": 0.004325380548834801, + "learning_rate": 8.60800390964379e-06, + "loss": 0.0001, + "step": 30500 + }, + { + "epoch": 6.626846220677671, + "grad_norm": 0.0005421187379397452, + "learning_rate": 8.594428757602086e-06, + "loss": 0.0056, + "step": 30510 + }, + { + "epoch": 6.629018245004344, + "grad_norm": 0.0007157633081078529, + "learning_rate": 8.580853605560382e-06, + "loss": 0.0001, + "step": 30520 + }, + { + "epoch": 6.631190269331016, + "grad_norm": 0.0009098859154619277, + "learning_rate": 8.56727845351868e-06, + "loss": 0.0001, + "step": 30530 + }, + { + "epoch": 6.633362293657689, + "grad_norm": 0.0010034176521003246, + "learning_rate": 8.553703301476977e-06, + "loss": 0.0001, + "step": 30540 + }, + { + "epoch": 6.635534317984361, + "grad_norm": 0.0005072278436273336, + "learning_rate": 8.540128149435273e-06, + "loss": 0.0001, + "step": 30550 + }, + { + "epoch": 6.637706342311034, + "grad_norm": 0.0008486681617796421, + "learning_rate": 8.52655299739357e-06, + "loss": 0.0001, + "step": 30560 + }, + { + "epoch": 6.639878366637706, + "grad_norm": 0.0005881373072043061, + "learning_rate": 8.512977845351868e-06, + "loss": 0.0185, + "step": 30570 + }, + { + "epoch": 6.642050390964378, + "grad_norm": 0.0005281348712742329, + "learning_rate": 8.499402693310166e-06, + "loss": 0.0001, + "step": 30580 + }, + { + "epoch": 6.644222415291051, + "grad_norm": 0.0017992773791775107, + "learning_rate": 8.485827541268463e-06, + "loss": 0.0001, + "step": 30590 + }, + { + "epoch": 6.6463944396177235, + "grad_norm": 0.0006575282313860953, + "learning_rate": 8.47225238922676e-06, + "loss": 0.0055, + "step": 30600 + }, + { + "epoch": 6.648566463944396, + "grad_norm": 0.004106589592993259, + "learning_rate": 8.458677237185058e-06, + "loss": 0.0001, + "step": 30610 + }, + { + "epoch": 6.6507384882710685, + "grad_norm": 0.004435055423527956, + "learning_rate": 8.445102085143354e-06, + "loss": 0.0001, + "step": 30620 + }, + { + "epoch": 6.6529105125977415, + "grad_norm": 0.0005864517297595739, + "learning_rate": 8.43152693310165e-06, + "loss": 0.0001, + "step": 30630 + }, + { + "epoch": 6.6550825369244135, + "grad_norm": 0.002506996737793088, + "learning_rate": 8.417951781059947e-06, + "loss": 0.0001, + "step": 30640 + }, + { + "epoch": 6.6572545612510865, + "grad_norm": 0.0006056345882825553, + "learning_rate": 8.404376629018245e-06, + "loss": 0.008, + "step": 30650 + }, + { + "epoch": 6.659426585577759, + "grad_norm": 0.0007190427859313786, + "learning_rate": 8.390801476976542e-06, + "loss": 0.0072, + "step": 30660 + }, + { + "epoch": 6.661598609904431, + "grad_norm": 0.0009075113339349627, + "learning_rate": 8.37722632493484e-06, + "loss": 0.0001, + "step": 30670 + }, + { + "epoch": 6.663770634231104, + "grad_norm": 0.00090401666238904, + "learning_rate": 8.363651172893138e-06, + "loss": 0.0001, + "step": 30680 + }, + { + "epoch": 6.665942658557776, + "grad_norm": 0.0028792324010282755, + "learning_rate": 8.350076020851435e-06, + "loss": 0.0001, + "step": 30690 + }, + { + "epoch": 6.668114682884449, + "grad_norm": 0.0006368290050886571, + "learning_rate": 8.336500868809731e-06, + "loss": 0.0001, + "step": 30700 + }, + { + "epoch": 6.670286707211121, + "grad_norm": 0.002956211566925049, + "learning_rate": 8.322925716768028e-06, + "loss": 0.0001, + "step": 30710 + }, + { + "epoch": 6.672458731537793, + "grad_norm": 0.0004954506293870509, + "learning_rate": 8.309350564726326e-06, + "loss": 0.0001, + "step": 30720 + }, + { + "epoch": 6.674630755864466, + "grad_norm": 0.010645301081240177, + "learning_rate": 8.295775412684622e-06, + "loss": 0.0001, + "step": 30730 + }, + { + "epoch": 6.676802780191138, + "grad_norm": 0.0008476504008285701, + "learning_rate": 8.282200260642919e-06, + "loss": 0.0413, + "step": 30740 + }, + { + "epoch": 6.678974804517811, + "grad_norm": 0.0004940642975270748, + "learning_rate": 8.268625108601215e-06, + "loss": 0.0068, + "step": 30750 + }, + { + "epoch": 6.681146828844483, + "grad_norm": 0.0005225742934271693, + "learning_rate": 8.255049956559514e-06, + "loss": 0.0051, + "step": 30760 + }, + { + "epoch": 6.683318853171156, + "grad_norm": 0.0010523422388359904, + "learning_rate": 8.241474804517812e-06, + "loss": 0.0001, + "step": 30770 + }, + { + "epoch": 6.685490877497828, + "grad_norm": 0.0005202463362365961, + "learning_rate": 8.227899652476108e-06, + "loss": 0.0001, + "step": 30780 + }, + { + "epoch": 6.687662901824501, + "grad_norm": 0.000996004673652351, + "learning_rate": 8.214324500434405e-06, + "loss": 0.0001, + "step": 30790 + }, + { + "epoch": 6.689834926151173, + "grad_norm": 0.0006468400824815035, + "learning_rate": 8.200749348392703e-06, + "loss": 0.0001, + "step": 30800 + }, + { + "epoch": 6.692006950477845, + "grad_norm": 0.03909625858068466, + "learning_rate": 8.187174196351e-06, + "loss": 0.0002, + "step": 30810 + }, + { + "epoch": 6.694178974804518, + "grad_norm": 0.000541096436791122, + "learning_rate": 8.173599044309296e-06, + "loss": 0.0001, + "step": 30820 + }, + { + "epoch": 6.69635099913119, + "grad_norm": 0.0005183752509765327, + "learning_rate": 8.161381407471765e-06, + "loss": 0.0071, + "step": 30830 + }, + { + "epoch": 6.698523023457863, + "grad_norm": 0.0005007189465686679, + "learning_rate": 8.147806255430061e-06, + "loss": 0.0272, + "step": 30840 + }, + { + "epoch": 6.700695047784535, + "grad_norm": 0.2259320616722107, + "learning_rate": 8.134231103388358e-06, + "loss": 0.0102, + "step": 30850 + }, + { + "epoch": 6.702867072111208, + "grad_norm": 0.0012630257988348603, + "learning_rate": 8.120655951346654e-06, + "loss": 0.0001, + "step": 30860 + }, + { + "epoch": 6.70503909643788, + "grad_norm": 0.001169295865111053, + "learning_rate": 8.107080799304952e-06, + "loss": 0.0001, + "step": 30870 + }, + { + "epoch": 6.707211120764553, + "grad_norm": 0.0005535431555472314, + "learning_rate": 8.09350564726325e-06, + "loss": 0.0003, + "step": 30880 + }, + { + "epoch": 6.709383145091225, + "grad_norm": 0.0005410081357695162, + "learning_rate": 8.079930495221547e-06, + "loss": 0.026, + "step": 30890 + }, + { + "epoch": 6.711555169417897, + "grad_norm": 0.0007783859618939459, + "learning_rate": 8.066355343179844e-06, + "loss": 0.0001, + "step": 30900 + }, + { + "epoch": 6.71372719374457, + "grad_norm": 0.0005165360635146499, + "learning_rate": 8.052780191138142e-06, + "loss": 0.0002, + "step": 30910 + }, + { + "epoch": 6.715899218071242, + "grad_norm": 0.0021058362908661366, + "learning_rate": 8.039205039096438e-06, + "loss": 0.0001, + "step": 30920 + }, + { + "epoch": 6.718071242397915, + "grad_norm": 0.0005603966419585049, + "learning_rate": 8.025629887054735e-06, + "loss": 0.009, + "step": 30930 + }, + { + "epoch": 6.720243266724587, + "grad_norm": 0.0006074358243495226, + "learning_rate": 8.012054735013033e-06, + "loss": 0.0001, + "step": 30940 + }, + { + "epoch": 6.722415291051259, + "grad_norm": 0.0005121738649904728, + "learning_rate": 7.99847958297133e-06, + "loss": 0.0052, + "step": 30950 + }, + { + "epoch": 6.724587315377932, + "grad_norm": 0.0005047316080890596, + "learning_rate": 7.984904430929626e-06, + "loss": 0.0001, + "step": 30960 + }, + { + "epoch": 6.726759339704604, + "grad_norm": 0.0005447333678603172, + "learning_rate": 7.971329278887924e-06, + "loss": 0.0003, + "step": 30970 + }, + { + "epoch": 6.728931364031277, + "grad_norm": 0.0005434873746708035, + "learning_rate": 7.957754126846222e-06, + "loss": 0.0001, + "step": 30980 + }, + { + "epoch": 6.731103388357949, + "grad_norm": 0.0005110373022034764, + "learning_rate": 7.944178974804519e-06, + "loss": 0.0001, + "step": 30990 + }, + { + "epoch": 6.733275412684622, + "grad_norm": 0.0005496907979249954, + "learning_rate": 7.930603822762815e-06, + "loss": 0.0001, + "step": 31000 + }, + { + "epoch": 6.7354474370112944, + "grad_norm": 0.000641629914753139, + "learning_rate": 7.917028670721112e-06, + "loss": 0.0001, + "step": 31010 + }, + { + "epoch": 6.737619461337967, + "grad_norm": 0.22070743143558502, + "learning_rate": 7.90345351867941e-06, + "loss": 0.0052, + "step": 31020 + }, + { + "epoch": 6.7397914856646395, + "grad_norm": 0.0005148387281224132, + "learning_rate": 7.889878366637707e-06, + "loss": 0.0001, + "step": 31030 + }, + { + "epoch": 6.7419635099913116, + "grad_norm": 0.0005770947900600731, + "learning_rate": 7.876303214596003e-06, + "loss": 0.0244, + "step": 31040 + }, + { + "epoch": 6.7441355343179845, + "grad_norm": 0.0010065827518701553, + "learning_rate": 7.8627280625543e-06, + "loss": 0.0001, + "step": 31050 + }, + { + "epoch": 6.746307558644657, + "grad_norm": 0.0007362100295722485, + "learning_rate": 7.849152910512598e-06, + "loss": 0.0001, + "step": 31060 + }, + { + "epoch": 6.7484795829713295, + "grad_norm": 0.0005651618121191859, + "learning_rate": 7.835577758470896e-06, + "loss": 0.0001, + "step": 31070 + }, + { + "epoch": 6.750651607298002, + "grad_norm": 0.002264307113364339, + "learning_rate": 7.822002606429193e-06, + "loss": 0.0002, + "step": 31080 + }, + { + "epoch": 6.752823631624675, + "grad_norm": 0.0014880020171403885, + "learning_rate": 7.808427454387489e-06, + "loss": 0.0002, + "step": 31090 + }, + { + "epoch": 6.754995655951347, + "grad_norm": 0.002035699551925063, + "learning_rate": 7.794852302345787e-06, + "loss": 0.0001, + "step": 31100 + }, + { + "epoch": 6.757167680278019, + "grad_norm": 0.0005644381162710488, + "learning_rate": 7.781277150304084e-06, + "loss": 0.0002, + "step": 31110 + }, + { + "epoch": 6.759339704604692, + "grad_norm": 0.0005862244288437068, + "learning_rate": 7.76770199826238e-06, + "loss": 0.0001, + "step": 31120 + }, + { + "epoch": 6.761511728931364, + "grad_norm": 0.0005447894800454378, + "learning_rate": 7.754126846220679e-06, + "loss": 0.0001, + "step": 31130 + }, + { + "epoch": 6.763683753258037, + "grad_norm": 0.0011917410884052515, + "learning_rate": 7.740551694178975e-06, + "loss": 0.0001, + "step": 31140 + }, + { + "epoch": 6.765855777584709, + "grad_norm": 0.000644481391645968, + "learning_rate": 7.726976542137272e-06, + "loss": 0.0001, + "step": 31150 + }, + { + "epoch": 6.768027801911382, + "grad_norm": 0.0005681001930497587, + "learning_rate": 7.71340139009557e-06, + "loss": 0.0001, + "step": 31160 + }, + { + "epoch": 6.770199826238054, + "grad_norm": 0.013895424082875252, + "learning_rate": 7.699826238053868e-06, + "loss": 0.0001, + "step": 31170 + }, + { + "epoch": 6.772371850564726, + "grad_norm": 0.0007106468547135592, + "learning_rate": 7.686251086012164e-06, + "loss": 0.0003, + "step": 31180 + }, + { + "epoch": 6.774543874891399, + "grad_norm": 0.000547458475921303, + "learning_rate": 7.672675933970461e-06, + "loss": 0.0001, + "step": 31190 + }, + { + "epoch": 6.776715899218071, + "grad_norm": 0.0023357209283858538, + "learning_rate": 7.659100781928757e-06, + "loss": 0.0222, + "step": 31200 + }, + { + "epoch": 6.778887923544744, + "grad_norm": 0.013180797919631004, + "learning_rate": 7.645525629887056e-06, + "loss": 0.017, + "step": 31210 + }, + { + "epoch": 6.781059947871416, + "grad_norm": 0.03352075815200806, + "learning_rate": 7.631950477845352e-06, + "loss": 0.0003, + "step": 31220 + }, + { + "epoch": 6.783231972198089, + "grad_norm": 0.00327364937402308, + "learning_rate": 7.6183753258036496e-06, + "loss": 0.0002, + "step": 31230 + }, + { + "epoch": 6.785403996524761, + "grad_norm": 0.004868528805673122, + "learning_rate": 7.604800173761946e-06, + "loss": 0.0001, + "step": 31240 + }, + { + "epoch": 6.787576020851434, + "grad_norm": 0.0014305302174761891, + "learning_rate": 7.591225021720244e-06, + "loss": 0.0042, + "step": 31250 + }, + { + "epoch": 6.789748045178106, + "grad_norm": 0.001355145126581192, + "learning_rate": 7.577649869678541e-06, + "loss": 0.0002, + "step": 31260 + }, + { + "epoch": 6.791920069504778, + "grad_norm": 0.0006391478236764669, + "learning_rate": 7.564074717636837e-06, + "loss": 0.0004, + "step": 31270 + }, + { + "epoch": 6.794092093831451, + "grad_norm": 0.0006392408395186067, + "learning_rate": 7.550499565595135e-06, + "loss": 0.0001, + "step": 31280 + }, + { + "epoch": 6.796264118158123, + "grad_norm": 0.0009195109596475959, + "learning_rate": 7.536924413553433e-06, + "loss": 0.0001, + "step": 31290 + }, + { + "epoch": 6.798436142484796, + "grad_norm": 0.0015891172224655747, + "learning_rate": 7.523349261511729e-06, + "loss": 0.0001, + "step": 31300 + }, + { + "epoch": 6.800608166811468, + "grad_norm": 0.0005748227122239769, + "learning_rate": 7.509774109470026e-06, + "loss": 0.0001, + "step": 31310 + }, + { + "epoch": 6.80278019113814, + "grad_norm": 0.0006642222870141268, + "learning_rate": 7.496198957428324e-06, + "loss": 0.0001, + "step": 31320 + }, + { + "epoch": 6.804952215464813, + "grad_norm": 0.0006176797323860228, + "learning_rate": 7.482623805386621e-06, + "loss": 0.0001, + "step": 31330 + }, + { + "epoch": 6.807124239791485, + "grad_norm": 0.0005560345598496497, + "learning_rate": 7.469048653344918e-06, + "loss": 0.0003, + "step": 31340 + }, + { + "epoch": 6.809296264118158, + "grad_norm": 0.0021519416477531195, + "learning_rate": 7.4554735013032144e-06, + "loss": 0.0036, + "step": 31350 + }, + { + "epoch": 6.81146828844483, + "grad_norm": 0.0005443825502879918, + "learning_rate": 7.441898349261513e-06, + "loss": 0.0001, + "step": 31360 + }, + { + "epoch": 6.813640312771503, + "grad_norm": 0.0007841295446269214, + "learning_rate": 7.428323197219809e-06, + "loss": 0.0001, + "step": 31370 + }, + { + "epoch": 6.815812337098175, + "grad_norm": 0.0018480330472812057, + "learning_rate": 7.4147480451781065e-06, + "loss": 0.0001, + "step": 31380 + }, + { + "epoch": 6.817984361424848, + "grad_norm": 0.0018647913821041584, + "learning_rate": 7.401172893136403e-06, + "loss": 0.0001, + "step": 31390 + }, + { + "epoch": 6.82015638575152, + "grad_norm": 0.0005843811668455601, + "learning_rate": 7.387597741094701e-06, + "loss": 0.0002, + "step": 31400 + }, + { + "epoch": 6.8223284100781925, + "grad_norm": 0.0010984676191583276, + "learning_rate": 7.374022589052998e-06, + "loss": 0.0005, + "step": 31410 + }, + { + "epoch": 6.824500434404865, + "grad_norm": 0.0005059109535068274, + "learning_rate": 7.360447437011295e-06, + "loss": 0.0001, + "step": 31420 + }, + { + "epoch": 6.8266724587315375, + "grad_norm": 0.0004944863030686975, + "learning_rate": 7.346872284969592e-06, + "loss": 0.0017, + "step": 31430 + }, + { + "epoch": 6.8288444830582105, + "grad_norm": 0.0004914201563224196, + "learning_rate": 7.33329713292789e-06, + "loss": 0.0001, + "step": 31440 + }, + { + "epoch": 6.8310165073848825, + "grad_norm": 0.0005860158707946539, + "learning_rate": 7.319721980886186e-06, + "loss": 0.0038, + "step": 31450 + }, + { + "epoch": 6.8331885317115555, + "grad_norm": 0.0004897097824141383, + "learning_rate": 7.306146828844483e-06, + "loss": 0.0001, + "step": 31460 + }, + { + "epoch": 6.835360556038228, + "grad_norm": 0.0013720918213948607, + "learning_rate": 7.29257167680278e-06, + "loss": 0.0001, + "step": 31470 + }, + { + "epoch": 6.8375325803649005, + "grad_norm": 0.0004990609013475478, + "learning_rate": 7.278996524761078e-06, + "loss": 0.0001, + "step": 31480 + }, + { + "epoch": 6.839704604691573, + "grad_norm": 0.0004813902487512678, + "learning_rate": 7.265421372719375e-06, + "loss": 0.0001, + "step": 31490 + }, + { + "epoch": 6.841876629018245, + "grad_norm": 0.0009401888237334788, + "learning_rate": 7.251846220677671e-06, + "loss": 0.0001, + "step": 31500 + }, + { + "epoch": 6.844048653344918, + "grad_norm": 0.0006055055418983102, + "learning_rate": 7.238271068635969e-06, + "loss": 0.0001, + "step": 31510 + }, + { + "epoch": 6.84622067767159, + "grad_norm": 0.010489613749086857, + "learning_rate": 7.224695916594267e-06, + "loss": 0.0069, + "step": 31520 + }, + { + "epoch": 6.848392701998263, + "grad_norm": 0.000997158931568265, + "learning_rate": 7.2111207645525634e-06, + "loss": 0.0001, + "step": 31530 + }, + { + "epoch": 6.850564726324935, + "grad_norm": 0.00048104513552971184, + "learning_rate": 7.19754561251086e-06, + "loss": 0.0001, + "step": 31540 + }, + { + "epoch": 6.852736750651607, + "grad_norm": 0.002893621800467372, + "learning_rate": 7.183970460469158e-06, + "loss": 0.0001, + "step": 31550 + }, + { + "epoch": 6.85490877497828, + "grad_norm": 0.0006328218150883913, + "learning_rate": 7.170395308427455e-06, + "loss": 0.0027, + "step": 31560 + }, + { + "epoch": 6.857080799304952, + "grad_norm": 0.0006695294287055731, + "learning_rate": 7.156820156385752e-06, + "loss": 0.0343, + "step": 31570 + }, + { + "epoch": 6.859252823631625, + "grad_norm": 0.00702779833227396, + "learning_rate": 7.1432450043440485e-06, + "loss": 0.0001, + "step": 31580 + }, + { + "epoch": 6.861424847958297, + "grad_norm": 0.0006968594971112907, + "learning_rate": 7.129669852302347e-06, + "loss": 0.0055, + "step": 31590 + }, + { + "epoch": 6.86359687228497, + "grad_norm": 0.0021681380458176136, + "learning_rate": 7.116094700260643e-06, + "loss": 0.0002, + "step": 31600 + }, + { + "epoch": 6.865768896611642, + "grad_norm": 0.007272107060998678, + "learning_rate": 7.102519548218941e-06, + "loss": 0.0001, + "step": 31610 + }, + { + "epoch": 6.867940920938315, + "grad_norm": 0.0004872163408435881, + "learning_rate": 7.088944396177237e-06, + "loss": 0.0001, + "step": 31620 + }, + { + "epoch": 6.870112945264987, + "grad_norm": 0.0007561290985904634, + "learning_rate": 7.075369244135535e-06, + "loss": 0.0001, + "step": 31630 + }, + { + "epoch": 6.872284969591659, + "grad_norm": 0.10669023543596268, + "learning_rate": 7.061794092093832e-06, + "loss": 0.0002, + "step": 31640 + }, + { + "epoch": 6.874456993918332, + "grad_norm": 0.0030692359432578087, + "learning_rate": 7.048218940052128e-06, + "loss": 0.0001, + "step": 31650 + }, + { + "epoch": 6.876629018245004, + "grad_norm": 0.0014511916087940335, + "learning_rate": 7.034643788010426e-06, + "loss": 0.0001, + "step": 31660 + }, + { + "epoch": 6.878801042571677, + "grad_norm": 0.0005103896837681532, + "learning_rate": 7.021068635968724e-06, + "loss": 0.0001, + "step": 31670 + }, + { + "epoch": 6.880973066898349, + "grad_norm": 0.0005026645958423615, + "learning_rate": 7.00749348392702e-06, + "loss": 0.0001, + "step": 31680 + }, + { + "epoch": 6.883145091225022, + "grad_norm": 0.00869758054614067, + "learning_rate": 6.993918331885317e-06, + "loss": 0.0001, + "step": 31690 + }, + { + "epoch": 6.885317115551694, + "grad_norm": 0.00047411341802217066, + "learning_rate": 6.980343179843614e-06, + "loss": 0.0001, + "step": 31700 + }, + { + "epoch": 6.887489139878367, + "grad_norm": 0.00048073820653371513, + "learning_rate": 6.9667680278019125e-06, + "loss": 0.0001, + "step": 31710 + }, + { + "epoch": 6.889661164205039, + "grad_norm": 0.0004761155869346112, + "learning_rate": 6.953192875760209e-06, + "loss": 0.0001, + "step": 31720 + }, + { + "epoch": 6.891833188531711, + "grad_norm": 0.0005412718746811152, + "learning_rate": 6.9396177237185055e-06, + "loss": 0.0064, + "step": 31730 + }, + { + "epoch": 6.894005212858384, + "grad_norm": 0.0005142592126503587, + "learning_rate": 6.926042571676804e-06, + "loss": 0.0044, + "step": 31740 + }, + { + "epoch": 6.896177237185056, + "grad_norm": 0.013025223277509212, + "learning_rate": 6.9124674196351e-06, + "loss": 0.0173, + "step": 31750 + }, + { + "epoch": 6.898349261511729, + "grad_norm": 0.21733896434307098, + "learning_rate": 6.8988922675933975e-06, + "loss": 0.001, + "step": 31760 + }, + { + "epoch": 6.900521285838401, + "grad_norm": 0.0006785244331695139, + "learning_rate": 6.885317115551694e-06, + "loss": 0.0004, + "step": 31770 + }, + { + "epoch": 6.902693310165073, + "grad_norm": 0.0012234164169058204, + "learning_rate": 6.871741963509992e-06, + "loss": 0.0001, + "step": 31780 + }, + { + "epoch": 6.904865334491746, + "grad_norm": 0.0004961843369528651, + "learning_rate": 6.858166811468289e-06, + "loss": 0.0039, + "step": 31790 + }, + { + "epoch": 6.907037358818418, + "grad_norm": 0.0005016808281652629, + "learning_rate": 6.844591659426586e-06, + "loss": 0.0041, + "step": 31800 + }, + { + "epoch": 6.909209383145091, + "grad_norm": 0.0005186402704566717, + "learning_rate": 6.831016507384883e-06, + "loss": 0.0001, + "step": 31810 + }, + { + "epoch": 6.911381407471763, + "grad_norm": 0.05258096754550934, + "learning_rate": 6.817441355343181e-06, + "loss": 0.0002, + "step": 31820 + }, + { + "epoch": 6.913553431798436, + "grad_norm": 0.000563104753382504, + "learning_rate": 6.803866203301477e-06, + "loss": 0.0001, + "step": 31830 + }, + { + "epoch": 6.9157254561251085, + "grad_norm": 0.0004913609591312706, + "learning_rate": 6.790291051259774e-06, + "loss": 0.0003, + "step": 31840 + }, + { + "epoch": 6.917897480451781, + "grad_norm": 0.0004866503004450351, + "learning_rate": 6.776715899218071e-06, + "loss": 0.0001, + "step": 31850 + }, + { + "epoch": 6.9200695047784535, + "grad_norm": 0.0004830710240639746, + "learning_rate": 6.763140747176369e-06, + "loss": 0.0001, + "step": 31860 + }, + { + "epoch": 6.922241529105126, + "grad_norm": 0.0005364646785892546, + "learning_rate": 6.749565595134666e-06, + "loss": 0.0002, + "step": 31870 + }, + { + "epoch": 6.9244135534317985, + "grad_norm": 0.010428816080093384, + "learning_rate": 6.735990443092962e-06, + "loss": 0.0001, + "step": 31880 + }, + { + "epoch": 6.926585577758471, + "grad_norm": 0.0004830741381738335, + "learning_rate": 6.72241529105126e-06, + "loss": 0.0051, + "step": 31890 + }, + { + "epoch": 6.928757602085144, + "grad_norm": 0.000482941948575899, + "learning_rate": 6.708840139009558e-06, + "loss": 0.0001, + "step": 31900 + }, + { + "epoch": 6.930929626411816, + "grad_norm": 0.0005031520850025117, + "learning_rate": 6.6952649869678545e-06, + "loss": 0.0001, + "step": 31910 + }, + { + "epoch": 6.933101650738489, + "grad_norm": 0.0004937806515954435, + "learning_rate": 6.681689834926151e-06, + "loss": 0.0001, + "step": 31920 + }, + { + "epoch": 6.935273675065161, + "grad_norm": 0.0004919038037769496, + "learning_rate": 6.668114682884449e-06, + "loss": 0.0001, + "step": 31930 + }, + { + "epoch": 6.937445699391834, + "grad_norm": 0.0016344421310350299, + "learning_rate": 6.654539530842746e-06, + "loss": 0.0157, + "step": 31940 + }, + { + "epoch": 6.939617723718506, + "grad_norm": 0.0005624077748507261, + "learning_rate": 6.640964378801043e-06, + "loss": 0.0002, + "step": 31950 + }, + { + "epoch": 6.941789748045178, + "grad_norm": 0.0005970174679532647, + "learning_rate": 6.6273892267593396e-06, + "loss": 0.0001, + "step": 31960 + }, + { + "epoch": 6.943961772371851, + "grad_norm": 0.0005279725883156061, + "learning_rate": 6.613814074717638e-06, + "loss": 0.0001, + "step": 31970 + }, + { + "epoch": 6.946133796698523, + "grad_norm": 0.0004851980193052441, + "learning_rate": 6.600238922675934e-06, + "loss": 0.0004, + "step": 31980 + }, + { + "epoch": 6.948305821025196, + "grad_norm": 0.0004759115108754486, + "learning_rate": 6.586663770634232e-06, + "loss": 0.0001, + "step": 31990 + }, + { + "epoch": 6.950477845351868, + "grad_norm": 0.0005062356358394027, + "learning_rate": 6.573088618592528e-06, + "loss": 0.0001, + "step": 32000 + }, + { + "epoch": 6.95264986967854, + "grad_norm": 0.00047750433441251516, + "learning_rate": 6.559513466550826e-06, + "loss": 0.0001, + "step": 32010 + }, + { + "epoch": 6.954821894005213, + "grad_norm": 0.49857795238494873, + "learning_rate": 6.545938314509123e-06, + "loss": 0.0056, + "step": 32020 + }, + { + "epoch": 6.956993918331885, + "grad_norm": 0.015077983029186726, + "learning_rate": 6.532363162467419e-06, + "loss": 0.0125, + "step": 32030 + }, + { + "epoch": 6.959165942658558, + "grad_norm": 0.0009956338908523321, + "learning_rate": 6.518788010425717e-06, + "loss": 0.0052, + "step": 32040 + }, + { + "epoch": 6.96133796698523, + "grad_norm": 0.0005113192601129413, + "learning_rate": 6.505212858384015e-06, + "loss": 0.0001, + "step": 32050 + }, + { + "epoch": 6.963509991311903, + "grad_norm": 0.000508747179992497, + "learning_rate": 6.4916377063423114e-06, + "loss": 0.0001, + "step": 32060 + }, + { + "epoch": 6.965682015638575, + "grad_norm": 0.0006362470448948443, + "learning_rate": 6.478062554300608e-06, + "loss": 0.0001, + "step": 32070 + }, + { + "epoch": 6.967854039965248, + "grad_norm": 0.000799038796685636, + "learning_rate": 6.464487402258905e-06, + "loss": 0.0001, + "step": 32080 + }, + { + "epoch": 6.97002606429192, + "grad_norm": 0.0016933096339926124, + "learning_rate": 6.4509122502172035e-06, + "loss": 0.0001, + "step": 32090 + }, + { + "epoch": 6.972198088618592, + "grad_norm": 0.016275865957140923, + "learning_rate": 6.4373370981755e-06, + "loss": 0.0001, + "step": 32100 + }, + { + "epoch": 6.974370112945265, + "grad_norm": 0.0004907046677544713, + "learning_rate": 6.4237619461337965e-06, + "loss": 0.0001, + "step": 32110 + }, + { + "epoch": 6.976542137271937, + "grad_norm": 0.0004914928576909006, + "learning_rate": 6.410186794092095e-06, + "loss": 0.0002, + "step": 32120 + }, + { + "epoch": 6.97871416159861, + "grad_norm": 0.0004871877026744187, + "learning_rate": 6.396611642050391e-06, + "loss": 0.005, + "step": 32130 + }, + { + "epoch": 6.980886185925282, + "grad_norm": 0.0012598761823028326, + "learning_rate": 6.383036490008689e-06, + "loss": 0.0001, + "step": 32140 + }, + { + "epoch": 6.983058210251955, + "grad_norm": 0.0013038625475019217, + "learning_rate": 6.369461337966985e-06, + "loss": 0.0001, + "step": 32150 + }, + { + "epoch": 6.985230234578627, + "grad_norm": 0.0005218314472585917, + "learning_rate": 6.355886185925283e-06, + "loss": 0.0001, + "step": 32160 + }, + { + "epoch": 6.9874022589053, + "grad_norm": 0.0005293041467666626, + "learning_rate": 6.34231103388358e-06, + "loss": 0.0107, + "step": 32170 + }, + { + "epoch": 6.989574283231972, + "grad_norm": 0.006864586845040321, + "learning_rate": 6.328735881841877e-06, + "loss": 0.0161, + "step": 32180 + }, + { + "epoch": 6.991746307558644, + "grad_norm": 0.0005798496422357857, + "learning_rate": 6.315160729800174e-06, + "loss": 0.0014, + "step": 32190 + }, + { + "epoch": 6.993918331885317, + "grad_norm": 0.20845529437065125, + "learning_rate": 6.301585577758472e-06, + "loss": 0.0051, + "step": 32200 + }, + { + "epoch": 6.996090356211989, + "grad_norm": 0.004219398833811283, + "learning_rate": 6.288010425716768e-06, + "loss": 0.0003, + "step": 32210 + }, + { + "epoch": 6.998262380538662, + "grad_norm": 0.0004969558212906122, + "learning_rate": 6.274435273675065e-06, + "loss": 0.0001, + "step": 32220 + }, + { + "epoch": 7.0, + "eval_f1": 0.6423357664233577, + "eval_loss": 0.08360765874385834, + "eval_runtime": 82.7975, + "eval_samples_per_second": 120.475, + "eval_steps_per_second": 7.536, + "step": 32228 + }, + { + "epoch": 7.000434404865334, + "grad_norm": 0.000587178859859705, + "learning_rate": 6.260860121633362e-06, + "loss": 0.0003, + "step": 32230 + }, + { + "epoch": 7.002606429192007, + "grad_norm": 0.0005318076582625508, + "learning_rate": 6.24728496959166e-06, + "loss": 0.0001, + "step": 32240 + }, + { + "epoch": 7.0047784535186794, + "grad_norm": 0.0005707778618671, + "learning_rate": 6.233709817549957e-06, + "loss": 0.0001, + "step": 32250 + }, + { + "epoch": 7.0069504778453515, + "grad_norm": 0.0004858635657001287, + "learning_rate": 6.2201346655082535e-06, + "loss": 0.0001, + "step": 32260 + }, + { + "epoch": 7.0091225021720245, + "grad_norm": 0.0015563979977741838, + "learning_rate": 6.206559513466551e-06, + "loss": 0.0001, + "step": 32270 + }, + { + "epoch": 7.0112945264986966, + "grad_norm": 0.0009650535066612065, + "learning_rate": 6.192984361424848e-06, + "loss": 0.0001, + "step": 32280 + }, + { + "epoch": 7.0134665508253695, + "grad_norm": 0.0005269676330499351, + "learning_rate": 6.1794092093831455e-06, + "loss": 0.0001, + "step": 32290 + }, + { + "epoch": 7.015638575152042, + "grad_norm": 0.0009328114101663232, + "learning_rate": 6.165834057341442e-06, + "loss": 0.0001, + "step": 32300 + }, + { + "epoch": 7.0178105994787146, + "grad_norm": 0.0004974757903255522, + "learning_rate": 6.152258905299739e-06, + "loss": 0.0046, + "step": 32310 + }, + { + "epoch": 7.019982623805387, + "grad_norm": 0.005733178462833166, + "learning_rate": 6.138683753258037e-06, + "loss": 0.0001, + "step": 32320 + }, + { + "epoch": 7.022154648132059, + "grad_norm": 0.0009101475006900728, + "learning_rate": 6.125108601216334e-06, + "loss": 0.0042, + "step": 32330 + }, + { + "epoch": 7.024326672458732, + "grad_norm": 0.00048514435184188187, + "learning_rate": 6.1115334491746315e-06, + "loss": 0.0, + "step": 32340 + }, + { + "epoch": 7.026498696785404, + "grad_norm": 0.0004828857199754566, + "learning_rate": 6.097958297132928e-06, + "loss": 0.0001, + "step": 32350 + }, + { + "epoch": 7.028670721112077, + "grad_norm": 0.00048441332182846963, + "learning_rate": 6.084383145091225e-06, + "loss": 0.0041, + "step": 32360 + }, + { + "epoch": 7.030842745438749, + "grad_norm": 0.0004826818476431072, + "learning_rate": 6.070807993049523e-06, + "loss": 0.0, + "step": 32370 + }, + { + "epoch": 7.033014769765422, + "grad_norm": 0.0004850332625210285, + "learning_rate": 6.05723284100782e-06, + "loss": 0.0001, + "step": 32380 + }, + { + "epoch": 7.035186794092094, + "grad_norm": 0.0004887464456260204, + "learning_rate": 6.0436576889661165e-06, + "loss": 0.0061, + "step": 32390 + }, + { + "epoch": 7.037358818418766, + "grad_norm": 0.00047691402141936123, + "learning_rate": 6.030082536924414e-06, + "loss": 0.0001, + "step": 32400 + }, + { + "epoch": 7.039530842745439, + "grad_norm": 0.0014148523332551122, + "learning_rate": 6.01650738488271e-06, + "loss": 0.0001, + "step": 32410 + }, + { + "epoch": 7.041702867072111, + "grad_norm": 0.0004779113514814526, + "learning_rate": 6.002932232841009e-06, + "loss": 0.0077, + "step": 32420 + }, + { + "epoch": 7.043874891398784, + "grad_norm": 0.00047428891411982477, + "learning_rate": 5.989357080799305e-06, + "loss": 0.0001, + "step": 32430 + }, + { + "epoch": 7.046046915725456, + "grad_norm": 0.0006224108510650694, + "learning_rate": 5.9757819287576025e-06, + "loss": 0.006, + "step": 32440 + }, + { + "epoch": 7.048218940052129, + "grad_norm": 0.0004947124980390072, + "learning_rate": 5.962206776715899e-06, + "loss": 0.0001, + "step": 32450 + }, + { + "epoch": 7.050390964378801, + "grad_norm": 0.0007785210036672652, + "learning_rate": 5.948631624674196e-06, + "loss": 0.0001, + "step": 32460 + }, + { + "epoch": 7.052562988705474, + "grad_norm": 0.0006155652808956802, + "learning_rate": 5.935056472632494e-06, + "loss": 0.0001, + "step": 32470 + }, + { + "epoch": 7.054735013032146, + "grad_norm": 0.0004885067464783788, + "learning_rate": 5.921481320590791e-06, + "loss": 0.0001, + "step": 32480 + }, + { + "epoch": 7.056907037358818, + "grad_norm": 0.0008703715284354985, + "learning_rate": 5.9079061685490876e-06, + "loss": 0.0001, + "step": 32490 + }, + { + "epoch": 7.059079061685491, + "grad_norm": 0.0004715079558081925, + "learning_rate": 5.894331016507385e-06, + "loss": 0.0033, + "step": 32500 + }, + { + "epoch": 7.061251086012163, + "grad_norm": 0.4291040599346161, + "learning_rate": 5.880755864465682e-06, + "loss": 0.0059, + "step": 32510 + }, + { + "epoch": 7.063423110338836, + "grad_norm": 0.0004748949722852558, + "learning_rate": 5.86718071242398e-06, + "loss": 0.0001, + "step": 32520 + }, + { + "epoch": 7.065595134665508, + "grad_norm": 0.00047266524052247405, + "learning_rate": 5.853605560382277e-06, + "loss": 0.0001, + "step": 32530 + }, + { + "epoch": 7.067767158992181, + "grad_norm": 0.0004755932022817433, + "learning_rate": 5.8400304083405735e-06, + "loss": 0.0001, + "step": 32540 + }, + { + "epoch": 7.069939183318853, + "grad_norm": 0.0011011961614713073, + "learning_rate": 5.826455256298871e-06, + "loss": 0.0001, + "step": 32550 + }, + { + "epoch": 7.072111207645525, + "grad_norm": 0.0004747462226077914, + "learning_rate": 5.812880104257168e-06, + "loss": 0.0001, + "step": 32560 + }, + { + "epoch": 7.074283231972198, + "grad_norm": 0.0008834132459014654, + "learning_rate": 5.7993049522154656e-06, + "loss": 0.0002, + "step": 32570 + }, + { + "epoch": 7.07645525629887, + "grad_norm": 0.00047799412277527153, + "learning_rate": 5.785729800173762e-06, + "loss": 0.0001, + "step": 32580 + }, + { + "epoch": 7.078627280625543, + "grad_norm": 0.00047306338092312217, + "learning_rate": 5.7721546481320594e-06, + "loss": 0.0008, + "step": 32590 + }, + { + "epoch": 7.080799304952215, + "grad_norm": 0.00047740963054820895, + "learning_rate": 5.758579496090356e-06, + "loss": 0.009, + "step": 32600 + }, + { + "epoch": 7.082971329278888, + "grad_norm": 0.0004777971771545708, + "learning_rate": 5.745004344048654e-06, + "loss": 0.0001, + "step": 32610 + }, + { + "epoch": 7.08514335360556, + "grad_norm": 0.005595957860350609, + "learning_rate": 5.731429192006951e-06, + "loss": 0.0001, + "step": 32620 + }, + { + "epoch": 7.087315377932232, + "grad_norm": 0.00048442769912071526, + "learning_rate": 5.717854039965248e-06, + "loss": 0.0001, + "step": 32630 + }, + { + "epoch": 7.089487402258905, + "grad_norm": 0.00048627701471559703, + "learning_rate": 5.7042788879235445e-06, + "loss": 0.0001, + "step": 32640 + }, + { + "epoch": 7.0916594265855775, + "grad_norm": 0.0004757237038575113, + "learning_rate": 5.690703735881842e-06, + "loss": 0.0041, + "step": 32650 + }, + { + "epoch": 7.09383145091225, + "grad_norm": 0.0006175344460643828, + "learning_rate": 5.677128583840139e-06, + "loss": 0.0261, + "step": 32660 + }, + { + "epoch": 7.0960034752389225, + "grad_norm": 0.0004718822310678661, + "learning_rate": 5.6635534317984366e-06, + "loss": 0.0001, + "step": 32670 + }, + { + "epoch": 7.0981754995655955, + "grad_norm": 0.00047846182133071125, + "learning_rate": 5.649978279756733e-06, + "loss": 0.0, + "step": 32680 + }, + { + "epoch": 7.1003475238922675, + "grad_norm": 0.001272359979338944, + "learning_rate": 5.6364031277150304e-06, + "loss": 0.0001, + "step": 32690 + }, + { + "epoch": 7.10251954821894, + "grad_norm": 0.00047480466309934855, + "learning_rate": 5.622827975673328e-06, + "loss": 0.0001, + "step": 32700 + }, + { + "epoch": 7.104691572545613, + "grad_norm": 0.0008412067545577884, + "learning_rate": 5.609252823631625e-06, + "loss": 0.0001, + "step": 32710 + }, + { + "epoch": 7.106863596872285, + "grad_norm": 0.0004680381389334798, + "learning_rate": 5.595677671589922e-06, + "loss": 0.0, + "step": 32720 + }, + { + "epoch": 7.109035621198958, + "grad_norm": 0.000462341500679031, + "learning_rate": 5.582102519548219e-06, + "loss": 0.0052, + "step": 32730 + }, + { + "epoch": 7.11120764552563, + "grad_norm": 0.00048780019278638065, + "learning_rate": 5.568527367506516e-06, + "loss": 0.0001, + "step": 32740 + }, + { + "epoch": 7.113379669852303, + "grad_norm": 0.0004658056132029742, + "learning_rate": 5.554952215464814e-06, + "loss": 0.0001, + "step": 32750 + }, + { + "epoch": 7.115551694178975, + "grad_norm": 0.0010021216003224254, + "learning_rate": 5.541377063423111e-06, + "loss": 0.0047, + "step": 32760 + }, + { + "epoch": 7.117723718505648, + "grad_norm": 0.00046110479161143303, + "learning_rate": 5.527801911381408e-06, + "loss": 0.0043, + "step": 32770 + }, + { + "epoch": 7.11989574283232, + "grad_norm": 0.0004618534876499325, + "learning_rate": 5.514226759339705e-06, + "loss": 0.0, + "step": 32780 + }, + { + "epoch": 7.122067767158992, + "grad_norm": 0.0004645264125429094, + "learning_rate": 5.500651607298002e-06, + "loss": 0.0001, + "step": 32790 + }, + { + "epoch": 7.124239791485665, + "grad_norm": 0.00046783083234913647, + "learning_rate": 5.4870764552563e-06, + "loss": 0.0001, + "step": 32800 + }, + { + "epoch": 7.126411815812337, + "grad_norm": 0.0021647040266543627, + "learning_rate": 5.473501303214596e-06, + "loss": 0.0001, + "step": 32810 + }, + { + "epoch": 7.12858384013901, + "grad_norm": 0.00046100158942863345, + "learning_rate": 5.4599261511728935e-06, + "loss": 0.0076, + "step": 32820 + }, + { + "epoch": 7.130755864465682, + "grad_norm": 0.0004608416638802737, + "learning_rate": 5.44635099913119e-06, + "loss": 0.0001, + "step": 32830 + }, + { + "epoch": 7.132927888792355, + "grad_norm": 0.00045467689051292837, + "learning_rate": 5.432775847089487e-06, + "loss": 0.0001, + "step": 32840 + }, + { + "epoch": 7.135099913119027, + "grad_norm": 0.0013107570121064782, + "learning_rate": 5.419200695047785e-06, + "loss": 0.0052, + "step": 32850 + }, + { + "epoch": 7.137271937445699, + "grad_norm": 0.0004807122459169477, + "learning_rate": 5.405625543006082e-06, + "loss": 0.0046, + "step": 32860 + }, + { + "epoch": 7.139443961772372, + "grad_norm": 0.0009218992199748755, + "learning_rate": 5.392050390964379e-06, + "loss": 0.0043, + "step": 32870 + }, + { + "epoch": 7.141615986099044, + "grad_norm": 0.0004588186275213957, + "learning_rate": 5.378475238922676e-06, + "loss": 0.0001, + "step": 32880 + }, + { + "epoch": 7.143788010425717, + "grad_norm": 0.0008795844041742384, + "learning_rate": 5.364900086880973e-06, + "loss": 0.0067, + "step": 32890 + }, + { + "epoch": 7.145960034752389, + "grad_norm": 0.0004757294664159417, + "learning_rate": 5.351324934839271e-06, + "loss": 0.0001, + "step": 32900 + }, + { + "epoch": 7.148132059079062, + "grad_norm": 0.00046267296420410275, + "learning_rate": 5.337749782797567e-06, + "loss": 0.0047, + "step": 32910 + }, + { + "epoch": 7.150304083405734, + "grad_norm": 0.000463083473732695, + "learning_rate": 5.3241746307558645e-06, + "loss": 0.0001, + "step": 32920 + }, + { + "epoch": 7.152476107732406, + "grad_norm": 0.0004575937055051327, + "learning_rate": 5.310599478714162e-06, + "loss": 0.0001, + "step": 32930 + }, + { + "epoch": 7.154648132059079, + "grad_norm": 0.0041653853841125965, + "learning_rate": 5.297024326672459e-06, + "loss": 0.0001, + "step": 32940 + }, + { + "epoch": 7.156820156385751, + "grad_norm": 0.00046218730858527124, + "learning_rate": 5.283449174630757e-06, + "loss": 0.0, + "step": 32950 + }, + { + "epoch": 7.158992180712424, + "grad_norm": 0.0015792973572388291, + "learning_rate": 5.269874022589053e-06, + "loss": 0.0001, + "step": 32960 + }, + { + "epoch": 7.161164205039096, + "grad_norm": 0.0004536303167697042, + "learning_rate": 5.2562988705473505e-06, + "loss": 0.0001, + "step": 32970 + }, + { + "epoch": 7.163336229365769, + "grad_norm": 0.00048602151218801737, + "learning_rate": 5.242723718505648e-06, + "loss": 0.003, + "step": 32980 + }, + { + "epoch": 7.165508253692441, + "grad_norm": 0.00046395332901738584, + "learning_rate": 5.229148566463945e-06, + "loss": 0.0001, + "step": 32990 + }, + { + "epoch": 7.167680278019114, + "grad_norm": 0.0008334364974871278, + "learning_rate": 5.215573414422242e-06, + "loss": 0.0041, + "step": 33000 + }, + { + "epoch": 7.169852302345786, + "grad_norm": 0.00045471618068404496, + "learning_rate": 5.201998262380539e-06, + "loss": 0.0043, + "step": 33010 + }, + { + "epoch": 7.172024326672458, + "grad_norm": 0.00046859317808412015, + "learning_rate": 5.1884231103388356e-06, + "loss": 0.0063, + "step": 33020 + }, + { + "epoch": 7.174196350999131, + "grad_norm": 0.000619838887359947, + "learning_rate": 5.174847958297134e-06, + "loss": 0.0001, + "step": 33030 + }, + { + "epoch": 7.176368375325803, + "grad_norm": 0.0004607265873346478, + "learning_rate": 5.16127280625543e-06, + "loss": 0.0001, + "step": 33040 + }, + { + "epoch": 7.178540399652476, + "grad_norm": 0.0004701963916886598, + "learning_rate": 5.147697654213728e-06, + "loss": 0.0001, + "step": 33050 + }, + { + "epoch": 7.180712423979148, + "grad_norm": 0.0004519505600910634, + "learning_rate": 5.134122502172024e-06, + "loss": 0.0035, + "step": 33060 + }, + { + "epoch": 7.182884448305821, + "grad_norm": 0.00045899933320470154, + "learning_rate": 5.1205473501303215e-06, + "loss": 0.0, + "step": 33070 + }, + { + "epoch": 7.1850564726324935, + "grad_norm": 0.00045867246808484197, + "learning_rate": 5.106972198088619e-06, + "loss": 0.0001, + "step": 33080 + }, + { + "epoch": 7.1872284969591655, + "grad_norm": 0.0005122332950122654, + "learning_rate": 5.093397046046916e-06, + "loss": 0.0001, + "step": 33090 + }, + { + "epoch": 7.1894005212858385, + "grad_norm": 0.0004529119178187102, + "learning_rate": 5.079821894005213e-06, + "loss": 0.0001, + "step": 33100 + }, + { + "epoch": 7.191572545612511, + "grad_norm": 0.00045432275510393083, + "learning_rate": 5.06624674196351e-06, + "loss": 0.0001, + "step": 33110 + }, + { + "epoch": 7.1937445699391835, + "grad_norm": 0.0006407810724340379, + "learning_rate": 5.052671589921807e-06, + "loss": 0.0001, + "step": 33120 + }, + { + "epoch": 7.195916594265856, + "grad_norm": 0.00045336701441556215, + "learning_rate": 5.039096437880105e-06, + "loss": 0.0, + "step": 33130 + }, + { + "epoch": 7.198088618592529, + "grad_norm": 0.0004545553238131106, + "learning_rate": 5.025521285838402e-06, + "loss": 0.0001, + "step": 33140 + }, + { + "epoch": 7.200260642919201, + "grad_norm": 0.00044919364154338837, + "learning_rate": 5.011946133796699e-06, + "loss": 0.0001, + "step": 33150 + }, + { + "epoch": 7.202432667245873, + "grad_norm": 0.00045379812945611775, + "learning_rate": 4.998370981754996e-06, + "loss": 0.0001, + "step": 33160 + }, + { + "epoch": 7.204604691572546, + "grad_norm": 0.0004533766768872738, + "learning_rate": 4.984795829713293e-06, + "loss": 0.0001, + "step": 33170 + }, + { + "epoch": 7.206776715899218, + "grad_norm": 0.0009349008905701339, + "learning_rate": 4.971220677671591e-06, + "loss": 0.0001, + "step": 33180 + }, + { + "epoch": 7.208948740225891, + "grad_norm": 0.0005631750682368875, + "learning_rate": 4.957645525629887e-06, + "loss": 0.0, + "step": 33190 + }, + { + "epoch": 7.211120764552563, + "grad_norm": 0.00044737185817211866, + "learning_rate": 4.9440703735881846e-06, + "loss": 0.0001, + "step": 33200 + }, + { + "epoch": 7.213292788879236, + "grad_norm": 0.00045651328400708735, + "learning_rate": 4.930495221546481e-06, + "loss": 0.0054, + "step": 33210 + }, + { + "epoch": 7.215464813205908, + "grad_norm": 0.0006020652363076806, + "learning_rate": 4.916920069504779e-06, + "loss": 0.0001, + "step": 33220 + }, + { + "epoch": 7.217636837532581, + "grad_norm": 0.0004551556194201112, + "learning_rate": 4.903344917463076e-06, + "loss": 0.0001, + "step": 33230 + }, + { + "epoch": 7.219808861859253, + "grad_norm": 0.0004467817780096084, + "learning_rate": 4.889769765421373e-06, + "loss": 0.0001, + "step": 33240 + }, + { + "epoch": 7.221980886185925, + "grad_norm": 0.0004483225638978183, + "learning_rate": 4.87619461337967e-06, + "loss": 0.0001, + "step": 33250 + }, + { + "epoch": 7.224152910512598, + "grad_norm": 0.004001400899142027, + "learning_rate": 4.862619461337967e-06, + "loss": 0.0049, + "step": 33260 + }, + { + "epoch": 7.22632493483927, + "grad_norm": 0.00044646792230196297, + "learning_rate": 4.849044309296264e-06, + "loss": 0.0001, + "step": 33270 + }, + { + "epoch": 7.228496959165943, + "grad_norm": 0.00044519882067106664, + "learning_rate": 4.835469157254562e-06, + "loss": 0.0001, + "step": 33280 + }, + { + "epoch": 7.230668983492615, + "grad_norm": 0.0004504416137933731, + "learning_rate": 4.821894005212858e-06, + "loss": 0.0001, + "step": 33290 + }, + { + "epoch": 7.232841007819288, + "grad_norm": 0.0007989492733031511, + "learning_rate": 4.808318853171156e-06, + "loss": 0.0001, + "step": 33300 + }, + { + "epoch": 7.23501303214596, + "grad_norm": 0.00045564546599052846, + "learning_rate": 4.794743701129453e-06, + "loss": 0.0, + "step": 33310 + }, + { + "epoch": 7.237185056472632, + "grad_norm": 0.0008873406331986189, + "learning_rate": 4.78116854908775e-06, + "loss": 0.0001, + "step": 33320 + }, + { + "epoch": 7.239357080799305, + "grad_norm": 0.00045151382801122963, + "learning_rate": 4.767593397046048e-06, + "loss": 0.0001, + "step": 33330 + }, + { + "epoch": 7.241529105125977, + "grad_norm": 0.0004555900814011693, + "learning_rate": 4.754018245004344e-06, + "loss": 0.0001, + "step": 33340 + }, + { + "epoch": 7.24370112945265, + "grad_norm": 0.0004449512925930321, + "learning_rate": 4.7404430929626415e-06, + "loss": 0.0002, + "step": 33350 + }, + { + "epoch": 7.245873153779322, + "grad_norm": 0.00044150289613753557, + "learning_rate": 4.726867940920939e-06, + "loss": 0.0001, + "step": 33360 + }, + { + "epoch": 7.248045178105995, + "grad_norm": 0.0004383629420772195, + "learning_rate": 4.713292788879236e-06, + "loss": 0.005, + "step": 33370 + }, + { + "epoch": 7.250217202432667, + "grad_norm": 0.004365169908851385, + "learning_rate": 4.699717636837533e-06, + "loss": 0.0001, + "step": 33380 + }, + { + "epoch": 7.252389226759339, + "grad_norm": 0.0004420215846039355, + "learning_rate": 4.68614248479583e-06, + "loss": 0.0045, + "step": 33390 + }, + { + "epoch": 7.254561251086012, + "grad_norm": 0.00043925183126702905, + "learning_rate": 4.672567332754127e-06, + "loss": 0.0001, + "step": 33400 + }, + { + "epoch": 7.256733275412684, + "grad_norm": 0.00044884896487928927, + "learning_rate": 4.658992180712425e-06, + "loss": 0.0001, + "step": 33410 + }, + { + "epoch": 7.258905299739357, + "grad_norm": 0.00044087052810937166, + "learning_rate": 4.645417028670721e-06, + "loss": 0.0001, + "step": 33420 + }, + { + "epoch": 7.261077324066029, + "grad_norm": 0.3532722294330597, + "learning_rate": 4.631841876629019e-06, + "loss": 0.0055, + "step": 33430 + }, + { + "epoch": 7.263249348392702, + "grad_norm": 0.00044010658166371286, + "learning_rate": 4.618266724587315e-06, + "loss": 0.0, + "step": 33440 + }, + { + "epoch": 7.265421372719374, + "grad_norm": 0.00044461956713348627, + "learning_rate": 4.6046915725456125e-06, + "loss": 0.0001, + "step": 33450 + }, + { + "epoch": 7.267593397046047, + "grad_norm": 0.0010231295600533485, + "learning_rate": 4.59111642050391e-06, + "loss": 0.0001, + "step": 33460 + }, + { + "epoch": 7.269765421372719, + "grad_norm": 0.00043528215610422194, + "learning_rate": 4.577541268462207e-06, + "loss": 0.0001, + "step": 33470 + }, + { + "epoch": 7.2719374456993915, + "grad_norm": 0.0004665793967433274, + "learning_rate": 4.563966116420504e-06, + "loss": 0.0001, + "step": 33480 + }, + { + "epoch": 7.2741094700260645, + "grad_norm": 0.0008605076000094414, + "learning_rate": 4.550390964378801e-06, + "loss": 0.0001, + "step": 33490 + }, + { + "epoch": 7.2762814943527365, + "grad_norm": 0.0004442806530278176, + "learning_rate": 4.5368158123370985e-06, + "loss": 0.0001, + "step": 33500 + }, + { + "epoch": 7.2784535186794095, + "grad_norm": 0.0004634595534298569, + "learning_rate": 4.523240660295396e-06, + "loss": 0.0001, + "step": 33510 + }, + { + "epoch": 7.280625543006082, + "grad_norm": 0.002221224131062627, + "learning_rate": 4.509665508253692e-06, + "loss": 0.0001, + "step": 33520 + }, + { + "epoch": 7.2827975673327545, + "grad_norm": 0.0004456366295926273, + "learning_rate": 4.49609035621199e-06, + "loss": 0.0001, + "step": 33530 + }, + { + "epoch": 7.284969591659427, + "grad_norm": 0.000450130901299417, + "learning_rate": 4.482515204170287e-06, + "loss": 0.0, + "step": 33540 + }, + { + "epoch": 7.287141615986099, + "grad_norm": 0.00043738310341723263, + "learning_rate": 4.468940052128584e-06, + "loss": 0.0, + "step": 33550 + }, + { + "epoch": 7.289313640312772, + "grad_norm": 0.0006523529882542789, + "learning_rate": 4.455364900086882e-06, + "loss": 0.0041, + "step": 33560 + }, + { + "epoch": 7.291485664639444, + "grad_norm": 0.0004349502851255238, + "learning_rate": 4.441789748045178e-06, + "loss": 0.0, + "step": 33570 + }, + { + "epoch": 7.293657688966117, + "grad_norm": 0.0004345090710557997, + "learning_rate": 4.428214596003476e-06, + "loss": 0.0, + "step": 33580 + }, + { + "epoch": 7.295829713292789, + "grad_norm": 0.4664416015148163, + "learning_rate": 4.414639443961772e-06, + "loss": 0.0065, + "step": 33590 + }, + { + "epoch": 7.298001737619462, + "grad_norm": 0.0004413572605699301, + "learning_rate": 4.40106429192007e-06, + "loss": 0.0034, + "step": 33600 + }, + { + "epoch": 7.300173761946134, + "grad_norm": 0.4366700351238251, + "learning_rate": 4.387489139878367e-06, + "loss": 0.0048, + "step": 33610 + }, + { + "epoch": 7.302345786272806, + "grad_norm": 0.0004471038409974426, + "learning_rate": 4.373913987836664e-06, + "loss": 0.0001, + "step": 33620 + }, + { + "epoch": 7.304517810599479, + "grad_norm": 0.0004463079967536032, + "learning_rate": 4.360338835794961e-06, + "loss": 0.0001, + "step": 33630 + }, + { + "epoch": 7.306689834926151, + "grad_norm": 0.0004372438706923276, + "learning_rate": 4.346763683753258e-06, + "loss": 0.0, + "step": 33640 + }, + { + "epoch": 7.308861859252824, + "grad_norm": 0.0018223769729956985, + "learning_rate": 4.333188531711555e-06, + "loss": 0.0001, + "step": 33650 + }, + { + "epoch": 7.311033883579496, + "grad_norm": 0.00043676598579622805, + "learning_rate": 4.319613379669853e-06, + "loss": 0.0, + "step": 33660 + }, + { + "epoch": 7.313205907906169, + "grad_norm": 0.0015346236759796739, + "learning_rate": 4.306038227628149e-06, + "loss": 0.0001, + "step": 33670 + }, + { + "epoch": 7.315377932232841, + "grad_norm": 0.001203192281536758, + "learning_rate": 4.292463075586447e-06, + "loss": 0.0001, + "step": 33680 + }, + { + "epoch": 7.317549956559514, + "grad_norm": 0.00043293728958815336, + "learning_rate": 4.278887923544744e-06, + "loss": 0.0001, + "step": 33690 + }, + { + "epoch": 7.319721980886186, + "grad_norm": 0.000446510297479108, + "learning_rate": 4.265312771503041e-06, + "loss": 0.0043, + "step": 33700 + }, + { + "epoch": 7.321894005212858, + "grad_norm": 0.0004393671406432986, + "learning_rate": 4.251737619461338e-06, + "loss": 0.0, + "step": 33710 + }, + { + "epoch": 7.324066029539531, + "grad_norm": 0.0005656811990775168, + "learning_rate": 4.238162467419635e-06, + "loss": 0.0001, + "step": 33720 + }, + { + "epoch": 7.326238053866203, + "grad_norm": 0.00045842313556931913, + "learning_rate": 4.224587315377932e-06, + "loss": 0.0001, + "step": 33730 + }, + { + "epoch": 7.328410078192876, + "grad_norm": 0.00043327995808795094, + "learning_rate": 4.21101216333623e-06, + "loss": 0.0, + "step": 33740 + }, + { + "epoch": 7.330582102519548, + "grad_norm": 0.0004443236975930631, + "learning_rate": 4.197437011294527e-06, + "loss": 0.0038, + "step": 33750 + }, + { + "epoch": 7.332754126846221, + "grad_norm": 0.000434982095612213, + "learning_rate": 4.183861859252824e-06, + "loss": 0.0001, + "step": 33760 + }, + { + "epoch": 7.334926151172893, + "grad_norm": 0.00046286580618470907, + "learning_rate": 4.170286707211121e-06, + "loss": 0.0001, + "step": 33770 + }, + { + "epoch": 7.337098175499565, + "grad_norm": 0.0004482912190724164, + "learning_rate": 4.156711555169418e-06, + "loss": 0.0001, + "step": 33780 + }, + { + "epoch": 7.339270199826238, + "grad_norm": 0.00043282005935907364, + "learning_rate": 4.143136403127716e-06, + "loss": 0.0001, + "step": 33790 + }, + { + "epoch": 7.34144222415291, + "grad_norm": 0.0020901900716125965, + "learning_rate": 4.129561251086012e-06, + "loss": 0.0001, + "step": 33800 + }, + { + "epoch": 7.343614248479583, + "grad_norm": 0.0004341735620982945, + "learning_rate": 4.11598609904431e-06, + "loss": 0.0041, + "step": 33810 + }, + { + "epoch": 7.345786272806255, + "grad_norm": 0.00043011820525862277, + "learning_rate": 4.102410947002606e-06, + "loss": 0.0, + "step": 33820 + }, + { + "epoch": 7.347958297132928, + "grad_norm": 0.00043060528696514666, + "learning_rate": 4.0888357949609036e-06, + "loss": 0.0047, + "step": 33830 + }, + { + "epoch": 7.3501303214596, + "grad_norm": 0.00042701844358816743, + "learning_rate": 4.075260642919201e-06, + "loss": 0.0036, + "step": 33840 + }, + { + "epoch": 7.352302345786272, + "grad_norm": 0.0009441542206332088, + "learning_rate": 4.061685490877498e-06, + "loss": 0.0001, + "step": 33850 + }, + { + "epoch": 7.354474370112945, + "grad_norm": 0.0007577169453725219, + "learning_rate": 4.048110338835795e-06, + "loss": 0.0001, + "step": 33860 + }, + { + "epoch": 7.356646394439617, + "grad_norm": 0.00048064981820061803, + "learning_rate": 4.034535186794092e-06, + "loss": 0.0001, + "step": 33870 + }, + { + "epoch": 7.35881841876629, + "grad_norm": 0.00044433947186917067, + "learning_rate": 4.0209600347523895e-06, + "loss": 0.0, + "step": 33880 + }, + { + "epoch": 7.3609904430929625, + "grad_norm": 0.0004222741990815848, + "learning_rate": 4.007384882710687e-06, + "loss": 0.0, + "step": 33890 + }, + { + "epoch": 7.363162467419635, + "grad_norm": 0.0008366369875147939, + "learning_rate": 3.993809730668983e-06, + "loss": 0.0416, + "step": 33900 + }, + { + "epoch": 7.3653344917463075, + "grad_norm": 0.0004299264110159129, + "learning_rate": 3.980234578627281e-06, + "loss": 0.0001, + "step": 33910 + }, + { + "epoch": 7.3675065160729805, + "grad_norm": 0.0007753439131192863, + "learning_rate": 3.966659426585577e-06, + "loss": 0.0001, + "step": 33920 + }, + { + "epoch": 7.3696785403996525, + "grad_norm": 0.0007699221605435014, + "learning_rate": 3.9530842745438754e-06, + "loss": 0.0001, + "step": 33930 + }, + { + "epoch": 7.371850564726325, + "grad_norm": 0.0004444130463525653, + "learning_rate": 3.939509122502173e-06, + "loss": 0.0001, + "step": 33940 + }, + { + "epoch": 7.374022589052998, + "grad_norm": 0.0004256993706803769, + "learning_rate": 3.925933970460469e-06, + "loss": 0.0, + "step": 33950 + }, + { + "epoch": 7.37619461337967, + "grad_norm": 0.00043124344665557146, + "learning_rate": 3.912358818418767e-06, + "loss": 0.0, + "step": 33960 + }, + { + "epoch": 7.378366637706343, + "grad_norm": 0.0004286852781660855, + "learning_rate": 3.898783666377063e-06, + "loss": 0.0, + "step": 33970 + }, + { + "epoch": 7.380538662033015, + "grad_norm": 0.0004296654078643769, + "learning_rate": 3.885208514335361e-06, + "loss": 0.0, + "step": 33980 + }, + { + "epoch": 7.382710686359688, + "grad_norm": 0.0004529616271611303, + "learning_rate": 3.871633362293658e-06, + "loss": 0.0, + "step": 33990 + }, + { + "epoch": 7.38488271068636, + "grad_norm": 0.00044139905367046595, + "learning_rate": 3.858058210251955e-06, + "loss": 0.0001, + "step": 34000 + }, + { + "epoch": 7.387054735013032, + "grad_norm": 0.00042843882692977786, + "learning_rate": 3.844483058210252e-06, + "loss": 0.0001, + "step": 34010 + }, + { + "epoch": 7.389226759339705, + "grad_norm": 0.00043310338514856994, + "learning_rate": 3.830907906168549e-06, + "loss": 0.0, + "step": 34020 + }, + { + "epoch": 7.391398783666377, + "grad_norm": 0.0007619079551659524, + "learning_rate": 3.8173327541268464e-06, + "loss": 0.0001, + "step": 34030 + }, + { + "epoch": 7.39357080799305, + "grad_norm": 0.00042568857315927744, + "learning_rate": 3.803757602085144e-06, + "loss": 0.0001, + "step": 34040 + }, + { + "epoch": 7.395742832319722, + "grad_norm": 0.00042350677540525794, + "learning_rate": 3.7901824500434403e-06, + "loss": 0.0, + "step": 34050 + }, + { + "epoch": 7.397914856646395, + "grad_norm": 0.0004444782971404493, + "learning_rate": 3.776607298001738e-06, + "loss": 0.0, + "step": 34060 + }, + { + "epoch": 7.400086880973067, + "grad_norm": 0.0015245000831782818, + "learning_rate": 3.7630321459600346e-06, + "loss": 0.0061, + "step": 34070 + }, + { + "epoch": 7.402258905299739, + "grad_norm": 0.0004239232512190938, + "learning_rate": 3.749456993918332e-06, + "loss": 0.0001, + "step": 34080 + }, + { + "epoch": 7.404430929626412, + "grad_norm": 0.0025918257888406515, + "learning_rate": 3.735881841876629e-06, + "loss": 0.0001, + "step": 34090 + }, + { + "epoch": 7.406602953953084, + "grad_norm": 0.002145191188901663, + "learning_rate": 3.7223066898349262e-06, + "loss": 0.0, + "step": 34100 + }, + { + "epoch": 7.408774978279757, + "grad_norm": 0.0013365385821089149, + "learning_rate": 3.708731537793223e-06, + "loss": 0.0001, + "step": 34110 + }, + { + "epoch": 7.410947002606429, + "grad_norm": 0.20957709848880768, + "learning_rate": 3.6951563857515205e-06, + "loss": 0.0053, + "step": 34120 + }, + { + "epoch": 7.413119026933102, + "grad_norm": 0.0004579925735015422, + "learning_rate": 3.6815812337098175e-06, + "loss": 0.0001, + "step": 34130 + }, + { + "epoch": 7.415291051259774, + "grad_norm": 0.00043436442501842976, + "learning_rate": 3.668006081668115e-06, + "loss": 0.0001, + "step": 34140 + }, + { + "epoch": 7.417463075586447, + "grad_norm": 0.00043290789471939206, + "learning_rate": 3.654430929626412e-06, + "loss": 0.0001, + "step": 34150 + }, + { + "epoch": 7.419635099913119, + "grad_norm": 0.0004213610081933439, + "learning_rate": 3.640855777584709e-06, + "loss": 0.0037, + "step": 34160 + }, + { + "epoch": 7.421807124239791, + "grad_norm": 0.0004223252472002059, + "learning_rate": 3.6272806255430065e-06, + "loss": 0.0001, + "step": 34170 + }, + { + "epoch": 7.423979148566464, + "grad_norm": 0.00043533978168852627, + "learning_rate": 3.6137054735013034e-06, + "loss": 0.0, + "step": 34180 + }, + { + "epoch": 7.426151172893136, + "grad_norm": 0.0019435312133282423, + "learning_rate": 3.6001303214596007e-06, + "loss": 0.0001, + "step": 34190 + }, + { + "epoch": 7.428323197219809, + "grad_norm": 0.00043479521991685033, + "learning_rate": 3.5865551694178977e-06, + "loss": 0.0, + "step": 34200 + }, + { + "epoch": 7.430495221546481, + "grad_norm": 0.000424668105551973, + "learning_rate": 3.572980017376195e-06, + "loss": 0.0001, + "step": 34210 + }, + { + "epoch": 7.432667245873154, + "grad_norm": 0.0010326962219551206, + "learning_rate": 3.559404865334492e-06, + "loss": 0.0001, + "step": 34220 + }, + { + "epoch": 7.434839270199826, + "grad_norm": 0.0004303526075091213, + "learning_rate": 3.5458297132927893e-06, + "loss": 0.0001, + "step": 34230 + }, + { + "epoch": 7.437011294526498, + "grad_norm": 0.0009326340514235198, + "learning_rate": 3.532254561251086e-06, + "loss": 0.0001, + "step": 34240 + }, + { + "epoch": 7.439183318853171, + "grad_norm": 0.16912616789340973, + "learning_rate": 3.5186794092093836e-06, + "loss": 0.0035, + "step": 34250 + }, + { + "epoch": 7.441355343179843, + "grad_norm": 0.0004265088646207005, + "learning_rate": 3.50510425716768e-06, + "loss": 0.0001, + "step": 34260 + }, + { + "epoch": 7.443527367506516, + "grad_norm": 0.00042686297092586756, + "learning_rate": 3.491529105125978e-06, + "loss": 0.0001, + "step": 34270 + }, + { + "epoch": 7.445699391833188, + "grad_norm": 0.00045173687976785004, + "learning_rate": 3.4779539530842744e-06, + "loss": 0.0001, + "step": 34280 + }, + { + "epoch": 7.447871416159861, + "grad_norm": 0.0004314729885663837, + "learning_rate": 3.4643788010425718e-06, + "loss": 0.0001, + "step": 34290 + }, + { + "epoch": 7.450043440486533, + "grad_norm": 0.0004578959196805954, + "learning_rate": 3.4508036490008687e-06, + "loss": 0.0001, + "step": 34300 + }, + { + "epoch": 7.4522154648132055, + "grad_norm": 0.0007546443957835436, + "learning_rate": 3.437228496959166e-06, + "loss": 0.0042, + "step": 34310 + }, + { + "epoch": 7.4543874891398785, + "grad_norm": 0.0004262760339770466, + "learning_rate": 3.423653344917463e-06, + "loss": 0.0001, + "step": 34320 + }, + { + "epoch": 7.4565595134665505, + "grad_norm": 0.0004256195097696036, + "learning_rate": 3.4100781928757603e-06, + "loss": 0.0, + "step": 34330 + }, + { + "epoch": 7.4587315377932235, + "grad_norm": 0.0004252404614817351, + "learning_rate": 3.3965030408340577e-06, + "loss": 0.0001, + "step": 34340 + }, + { + "epoch": 7.460903562119896, + "grad_norm": 0.0004220245173200965, + "learning_rate": 3.3829278887923546e-06, + "loss": 0.0, + "step": 34350 + }, + { + "epoch": 7.4630755864465685, + "grad_norm": 0.00042326172115281224, + "learning_rate": 3.369352736750652e-06, + "loss": 0.0001, + "step": 34360 + }, + { + "epoch": 7.465247610773241, + "grad_norm": 0.000528325152117759, + "learning_rate": 3.355777584708949e-06, + "loss": 0.0, + "step": 34370 + }, + { + "epoch": 7.467419635099914, + "grad_norm": 0.00047799237654544413, + "learning_rate": 3.3422024326672463e-06, + "loss": 0.0061, + "step": 34380 + }, + { + "epoch": 7.469591659426586, + "grad_norm": 0.00041863773367367685, + "learning_rate": 3.328627280625543e-06, + "loss": 0.0001, + "step": 34390 + }, + { + "epoch": 7.471763683753258, + "grad_norm": 0.0021037731785327196, + "learning_rate": 3.3150521285838406e-06, + "loss": 0.0001, + "step": 34400 + }, + { + "epoch": 7.473935708079931, + "grad_norm": 0.00042101697181351483, + "learning_rate": 3.3014769765421375e-06, + "loss": 0.0, + "step": 34410 + }, + { + "epoch": 7.476107732406603, + "grad_norm": 0.0004177862429060042, + "learning_rate": 3.287901824500435e-06, + "loss": 0.0001, + "step": 34420 + }, + { + "epoch": 7.478279756733276, + "grad_norm": 0.000812828540802002, + "learning_rate": 3.2743266724587314e-06, + "loss": 0.0001, + "step": 34430 + }, + { + "epoch": 7.480451781059948, + "grad_norm": 0.00043613568414002657, + "learning_rate": 3.260751520417029e-06, + "loss": 0.0068, + "step": 34440 + }, + { + "epoch": 7.48262380538662, + "grad_norm": 0.0004363965417724103, + "learning_rate": 3.2471763683753256e-06, + "loss": 0.0, + "step": 34450 + }, + { + "epoch": 7.484795829713293, + "grad_norm": 0.0020837662741541862, + "learning_rate": 3.2336012163336234e-06, + "loss": 0.0001, + "step": 34460 + }, + { + "epoch": 7.486967854039965, + "grad_norm": 0.0008054388454183936, + "learning_rate": 3.22002606429192e-06, + "loss": 0.0001, + "step": 34470 + }, + { + "epoch": 7.489139878366638, + "grad_norm": 0.0004963803221471608, + "learning_rate": 3.2064509122502173e-06, + "loss": 0.0, + "step": 34480 + }, + { + "epoch": 7.49131190269331, + "grad_norm": 0.0004414473660290241, + "learning_rate": 3.1928757602085142e-06, + "loss": 0.0073, + "step": 34490 + }, + { + "epoch": 7.493483927019983, + "grad_norm": 0.00043333263602107763, + "learning_rate": 3.1793006081668116e-06, + "loss": 0.0001, + "step": 34500 + }, + { + "epoch": 7.495655951346655, + "grad_norm": 0.00043653626926243305, + "learning_rate": 3.1657254561251085e-06, + "loss": 0.0005, + "step": 34510 + }, + { + "epoch": 7.497827975673328, + "grad_norm": 0.00042178662260994315, + "learning_rate": 3.152150304083406e-06, + "loss": 0.0001, + "step": 34520 + }, + { + "epoch": 7.5, + "grad_norm": 0.0012135066790506244, + "learning_rate": 3.138575152041703e-06, + "loss": 0.0001, + "step": 34530 + }, + { + "epoch": 7.502172024326672, + "grad_norm": 0.00042127963388338685, + "learning_rate": 3.125e-06, + "loss": 0.0001, + "step": 34540 + }, + { + "epoch": 7.504344048653345, + "grad_norm": 0.00042868778109550476, + "learning_rate": 3.111424847958297e-06, + "loss": 0.0041, + "step": 34550 + }, + { + "epoch": 7.506516072980017, + "grad_norm": 0.00043172913137823343, + "learning_rate": 3.0978496959165944e-06, + "loss": 0.0, + "step": 34560 + }, + { + "epoch": 7.50868809730669, + "grad_norm": 0.00043645326513797045, + "learning_rate": 3.0842745438748914e-06, + "loss": 0.0047, + "step": 34570 + }, + { + "epoch": 7.510860121633362, + "grad_norm": 0.00042725811363197863, + "learning_rate": 3.0706993918331887e-06, + "loss": 0.0, + "step": 34580 + }, + { + "epoch": 7.513032145960035, + "grad_norm": 0.0009361863485537469, + "learning_rate": 3.0571242397914857e-06, + "loss": 0.0001, + "step": 34590 + }, + { + "epoch": 7.515204170286707, + "grad_norm": 0.0004268494958523661, + "learning_rate": 3.043549087749783e-06, + "loss": 0.0, + "step": 34600 + }, + { + "epoch": 7.51737619461338, + "grad_norm": 0.22192446887493134, + "learning_rate": 3.02997393570808e-06, + "loss": 0.0052, + "step": 34610 + }, + { + "epoch": 7.519548218940052, + "grad_norm": 0.0004318969149608165, + "learning_rate": 3.016398783666377e-06, + "loss": 0.0001, + "step": 34620 + }, + { + "epoch": 7.521720243266724, + "grad_norm": 0.00042987792403437197, + "learning_rate": 3.0028236316246742e-06, + "loss": 0.0055, + "step": 34630 + }, + { + "epoch": 7.523892267593397, + "grad_norm": 0.0035631656646728516, + "learning_rate": 2.989248479582971e-06, + "loss": 0.0001, + "step": 34640 + }, + { + "epoch": 7.526064291920069, + "grad_norm": 0.00042544747702777386, + "learning_rate": 2.975673327541269e-06, + "loss": 0.0001, + "step": 34650 + }, + { + "epoch": 7.528236316246742, + "grad_norm": 0.0004204209253657609, + "learning_rate": 2.962098175499566e-06, + "loss": 0.0001, + "step": 34660 + }, + { + "epoch": 7.530408340573414, + "grad_norm": 0.0004276661202311516, + "learning_rate": 2.948523023457863e-06, + "loss": 0.0, + "step": 34670 + }, + { + "epoch": 7.532580364900086, + "grad_norm": 0.00043516955338418484, + "learning_rate": 2.93494787141616e-06, + "loss": 0.0, + "step": 34680 + }, + { + "epoch": 7.534752389226759, + "grad_norm": 0.000489513564389199, + "learning_rate": 2.921372719374457e-06, + "loss": 0.0001, + "step": 34690 + }, + { + "epoch": 7.5369244135534315, + "grad_norm": 0.000425524078309536, + "learning_rate": 2.9077975673327545e-06, + "loss": 0.0, + "step": 34700 + }, + { + "epoch": 7.539096437880104, + "grad_norm": 0.0004260186688043177, + "learning_rate": 2.8942224152910514e-06, + "loss": 0.0194, + "step": 34710 + }, + { + "epoch": 7.5412684622067765, + "grad_norm": 0.0004336040292400867, + "learning_rate": 2.8806472632493487e-06, + "loss": 0.0, + "step": 34720 + }, + { + "epoch": 7.5434404865334495, + "grad_norm": 0.00045299273915588856, + "learning_rate": 2.8670721112076457e-06, + "loss": 0.0048, + "step": 34730 + }, + { + "epoch": 7.5456125108601215, + "grad_norm": 0.00044971954775974154, + "learning_rate": 2.8534969591659426e-06, + "loss": 0.0, + "step": 34740 + }, + { + "epoch": 7.5477845351867945, + "grad_norm": 0.0007768584764562547, + "learning_rate": 2.83992180712424e-06, + "loss": 0.0001, + "step": 34750 + }, + { + "epoch": 7.549956559513467, + "grad_norm": 0.0005349684506654739, + "learning_rate": 2.826346655082537e-06, + "loss": 0.0001, + "step": 34760 + }, + { + "epoch": 7.552128583840139, + "grad_norm": 0.0004436474700924009, + "learning_rate": 2.8127715030408342e-06, + "loss": 0.0001, + "step": 34770 + }, + { + "epoch": 7.554300608166812, + "grad_norm": 0.00044845970114693046, + "learning_rate": 2.799196350999131e-06, + "loss": 0.0, + "step": 34780 + }, + { + "epoch": 7.556472632493484, + "grad_norm": 0.0012216611066833138, + "learning_rate": 2.7856211989574285e-06, + "loss": 0.0001, + "step": 34790 + }, + { + "epoch": 7.558644656820157, + "grad_norm": 0.0007345577469095588, + "learning_rate": 2.7720460469157255e-06, + "loss": 0.0001, + "step": 34800 + }, + { + "epoch": 7.560816681146829, + "grad_norm": 0.00043187016854062676, + "learning_rate": 2.7584708948740224e-06, + "loss": 0.0001, + "step": 34810 + }, + { + "epoch": 7.562988705473502, + "grad_norm": 0.0004403771017678082, + "learning_rate": 2.7448957428323198e-06, + "loss": 0.0001, + "step": 34820 + }, + { + "epoch": 7.565160729800174, + "grad_norm": 0.18770098686218262, + "learning_rate": 2.7313205907906167e-06, + "loss": 0.0043, + "step": 34830 + }, + { + "epoch": 7.567332754126847, + "grad_norm": 0.000443107943283394, + "learning_rate": 2.717745438748914e-06, + "loss": 0.0051, + "step": 34840 + }, + { + "epoch": 7.569504778453519, + "grad_norm": 0.0004438256728462875, + "learning_rate": 2.7041702867072114e-06, + "loss": 0.0001, + "step": 34850 + }, + { + "epoch": 7.571676802780191, + "grad_norm": 0.001046639634296298, + "learning_rate": 2.6905951346655083e-06, + "loss": 0.0001, + "step": 34860 + }, + { + "epoch": 7.573848827106864, + "grad_norm": 0.0004500243521761149, + "learning_rate": 2.6770199826238057e-06, + "loss": 0.0042, + "step": 34870 + }, + { + "epoch": 7.576020851433536, + "grad_norm": 0.0009341577533632517, + "learning_rate": 2.6634448305821026e-06, + "loss": 0.0001, + "step": 34880 + }, + { + "epoch": 7.578192875760209, + "grad_norm": 0.00043487714719958603, + "learning_rate": 2.6498696785404e-06, + "loss": 0.0, + "step": 34890 + }, + { + "epoch": 7.580364900086881, + "grad_norm": 0.0004606035363394767, + "learning_rate": 2.636294526498697e-06, + "loss": 0.0004, + "step": 34900 + }, + { + "epoch": 7.582536924413553, + "grad_norm": 0.0005366328987292945, + "learning_rate": 2.6227193744569943e-06, + "loss": 0.0002, + "step": 34910 + }, + { + "epoch": 7.584708948740226, + "grad_norm": 0.0004259934357833117, + "learning_rate": 2.609144222415291e-06, + "loss": 0.0001, + "step": 34920 + }, + { + "epoch": 7.586880973066898, + "grad_norm": 0.00042626031790859997, + "learning_rate": 2.595569070373588e-06, + "loss": 0.0001, + "step": 34930 + }, + { + "epoch": 7.589052997393571, + "grad_norm": 0.009608348831534386, + "learning_rate": 2.5819939183318855e-06, + "loss": 0.0001, + "step": 34940 + }, + { + "epoch": 7.591225021720243, + "grad_norm": 0.00042460497934371233, + "learning_rate": 2.5684187662901824e-06, + "loss": 0.0, + "step": 34950 + }, + { + "epoch": 7.593397046046916, + "grad_norm": 0.0004247224424034357, + "learning_rate": 2.5548436142484798e-06, + "loss": 0.0, + "step": 34960 + }, + { + "epoch": 7.595569070373588, + "grad_norm": 0.00045198260340839624, + "learning_rate": 2.5412684622067767e-06, + "loss": 0.0001, + "step": 34970 + }, + { + "epoch": 7.597741094700261, + "grad_norm": 0.00043047365033999085, + "learning_rate": 2.527693310165074e-06, + "loss": 0.0, + "step": 34980 + }, + { + "epoch": 7.599913119026933, + "grad_norm": 0.000425986509071663, + "learning_rate": 2.514118158123371e-06, + "loss": 0.0001, + "step": 34990 + }, + { + "epoch": 7.602085143353605, + "grad_norm": 0.0004410984693095088, + "learning_rate": 2.5005430060816683e-06, + "loss": 0.0001, + "step": 35000 + }, + { + "epoch": 7.604257167680278, + "grad_norm": 0.0004189326718915254, + "learning_rate": 2.4869678540399653e-06, + "loss": 0.0, + "step": 35010 + }, + { + "epoch": 7.60642919200695, + "grad_norm": 0.0004251878126524389, + "learning_rate": 2.473392701998262e-06, + "loss": 0.0001, + "step": 35020 + }, + { + "epoch": 7.608601216333623, + "grad_norm": 0.00041855184826999903, + "learning_rate": 2.4598175499565596e-06, + "loss": 0.0001, + "step": 35030 + }, + { + "epoch": 7.610773240660295, + "grad_norm": 0.00042429458699189126, + "learning_rate": 2.4462423979148565e-06, + "loss": 0.0001, + "step": 35040 + }, + { + "epoch": 7.612945264986968, + "grad_norm": 0.0004268327902536839, + "learning_rate": 2.432667245873154e-06, + "loss": 0.0001, + "step": 35050 + }, + { + "epoch": 7.61511728931364, + "grad_norm": 0.014470481313765049, + "learning_rate": 2.419092093831451e-06, + "loss": 0.0001, + "step": 35060 + }, + { + "epoch": 7.617289313640313, + "grad_norm": 0.00042645548819564283, + "learning_rate": 2.405516941789748e-06, + "loss": 0.0001, + "step": 35070 + }, + { + "epoch": 7.619461337966985, + "grad_norm": 0.0004529398283921182, + "learning_rate": 2.3919417897480455e-06, + "loss": 0.0001, + "step": 35080 + }, + { + "epoch": 7.621633362293657, + "grad_norm": 0.0004270094505045563, + "learning_rate": 2.3783666377063424e-06, + "loss": 0.0001, + "step": 35090 + }, + { + "epoch": 7.62380538662033, + "grad_norm": 0.0005416472558863461, + "learning_rate": 2.3647914856646398e-06, + "loss": 0.0, + "step": 35100 + }, + { + "epoch": 7.625977410947002, + "grad_norm": 0.00043836349504999816, + "learning_rate": 2.3512163336229367e-06, + "loss": 0.0047, + "step": 35110 + }, + { + "epoch": 7.628149435273675, + "grad_norm": 0.00043431203812360764, + "learning_rate": 2.337641181581234e-06, + "loss": 0.0, + "step": 35120 + }, + { + "epoch": 7.6303214596003475, + "grad_norm": 0.0008394691394641995, + "learning_rate": 2.324066029539531e-06, + "loss": 0.0001, + "step": 35130 + }, + { + "epoch": 7.6324934839270195, + "grad_norm": 0.00043706296128220856, + "learning_rate": 2.310490877497828e-06, + "loss": 0.0, + "step": 35140 + }, + { + "epoch": 7.6346655082536925, + "grad_norm": 0.012723601423203945, + "learning_rate": 2.2969157254561253e-06, + "loss": 0.0001, + "step": 35150 + }, + { + "epoch": 7.636837532580365, + "grad_norm": 0.0009298368240706623, + "learning_rate": 2.2833405734144222e-06, + "loss": 0.0001, + "step": 35160 + }, + { + "epoch": 7.6390095569070375, + "grad_norm": 0.000419062766013667, + "learning_rate": 2.2697654213727196e-06, + "loss": 0.0, + "step": 35170 + }, + { + "epoch": 7.64118158123371, + "grad_norm": 0.0004180653195362538, + "learning_rate": 2.2561902693310165e-06, + "loss": 0.0047, + "step": 35180 + }, + { + "epoch": 7.643353605560383, + "grad_norm": 0.0006930680829100311, + "learning_rate": 2.242615117289314e-06, + "loss": 0.0001, + "step": 35190 + }, + { + "epoch": 7.645525629887055, + "grad_norm": 0.0004183394485153258, + "learning_rate": 2.229039965247611e-06, + "loss": 0.0001, + "step": 35200 + }, + { + "epoch": 7.647697654213728, + "grad_norm": 0.00047878033365122974, + "learning_rate": 2.2154648132059077e-06, + "loss": 0.0001, + "step": 35210 + }, + { + "epoch": 7.6498696785404, + "grad_norm": 0.0004129901062697172, + "learning_rate": 2.201889661164205e-06, + "loss": 0.0001, + "step": 35220 + }, + { + "epoch": 7.652041702867072, + "grad_norm": 0.0007938410853967071, + "learning_rate": 2.188314509122502e-06, + "loss": 0.0, + "step": 35230 + }, + { + "epoch": 7.654213727193745, + "grad_norm": 0.0012432819930836558, + "learning_rate": 2.1747393570807994e-06, + "loss": 0.0001, + "step": 35240 + }, + { + "epoch": 7.656385751520417, + "grad_norm": 0.00042003538692370057, + "learning_rate": 2.1611642050390967e-06, + "loss": 0.0, + "step": 35250 + }, + { + "epoch": 7.65855777584709, + "grad_norm": 0.0012501387391239405, + "learning_rate": 2.1475890529973937e-06, + "loss": 0.0049, + "step": 35260 + }, + { + "epoch": 7.660729800173762, + "grad_norm": 0.00042245659278705716, + "learning_rate": 2.134013900955691e-06, + "loss": 0.0, + "step": 35270 + }, + { + "epoch": 7.662901824500435, + "grad_norm": 0.00041626309393905103, + "learning_rate": 2.120438748913988e-06, + "loss": 0.0, + "step": 35280 + }, + { + "epoch": 7.665073848827107, + "grad_norm": 0.0007968571735545993, + "learning_rate": 2.1068635968722853e-06, + "loss": 0.0, + "step": 35290 + }, + { + "epoch": 7.66724587315378, + "grad_norm": 0.00041930555016733706, + "learning_rate": 2.0932884448305822e-06, + "loss": 0.0, + "step": 35300 + }, + { + "epoch": 7.669417897480452, + "grad_norm": 0.00045020724064670503, + "learning_rate": 2.0797132927888796e-06, + "loss": 0.0, + "step": 35310 + }, + { + "epoch": 7.671589921807124, + "grad_norm": 0.0006811002967879176, + "learning_rate": 2.0661381407471765e-06, + "loss": 0.0001, + "step": 35320 + }, + { + "epoch": 7.673761946133797, + "grad_norm": 0.3847941756248474, + "learning_rate": 2.0525629887054735e-06, + "loss": 0.0053, + "step": 35330 + }, + { + "epoch": 7.675933970460469, + "grad_norm": 0.0004168343439232558, + "learning_rate": 2.038987836663771e-06, + "loss": 0.0, + "step": 35340 + }, + { + "epoch": 7.678105994787142, + "grad_norm": 0.0004148240841459483, + "learning_rate": 2.0254126846220677e-06, + "loss": 0.0003, + "step": 35350 + }, + { + "epoch": 7.680278019113814, + "grad_norm": 0.0004149214073549956, + "learning_rate": 2.011837532580365e-06, + "loss": 0.0001, + "step": 35360 + }, + { + "epoch": 7.682450043440486, + "grad_norm": 0.0011588763445615768, + "learning_rate": 1.998262380538662e-06, + "loss": 0.0001, + "step": 35370 + }, + { + "epoch": 7.684622067767159, + "grad_norm": 0.0008375580073334277, + "learning_rate": 1.9846872284969594e-06, + "loss": 0.0001, + "step": 35380 + }, + { + "epoch": 7.686794092093831, + "grad_norm": 0.0009266881970688701, + "learning_rate": 1.9711120764552563e-06, + "loss": 0.0001, + "step": 35390 + }, + { + "epoch": 7.688966116420504, + "grad_norm": 0.0004148316220380366, + "learning_rate": 1.9575369244135533e-06, + "loss": 0.0001, + "step": 35400 + }, + { + "epoch": 7.691138140747176, + "grad_norm": 0.00043054655543528497, + "learning_rate": 1.9439617723718506e-06, + "loss": 0.0, + "step": 35410 + }, + { + "epoch": 7.693310165073849, + "grad_norm": 0.0004217229434289038, + "learning_rate": 1.9303866203301475e-06, + "loss": 0.0001, + "step": 35420 + }, + { + "epoch": 7.695482189400521, + "grad_norm": 0.00040780851850286126, + "learning_rate": 1.916811468288445e-06, + "loss": 0.0001, + "step": 35430 + }, + { + "epoch": 7.697654213727194, + "grad_norm": 0.0004132968606427312, + "learning_rate": 1.9032363162467418e-06, + "loss": 0.0001, + "step": 35440 + }, + { + "epoch": 7.699826238053866, + "grad_norm": 0.0004265105235390365, + "learning_rate": 1.8896611642050394e-06, + "loss": 0.0058, + "step": 35450 + }, + { + "epoch": 7.701998262380538, + "grad_norm": 0.0004204078286420554, + "learning_rate": 1.8760860121633365e-06, + "loss": 0.0001, + "step": 35460 + }, + { + "epoch": 7.704170286707211, + "grad_norm": 0.0004155028727836907, + "learning_rate": 1.8625108601216337e-06, + "loss": 0.0, + "step": 35470 + }, + { + "epoch": 7.706342311033883, + "grad_norm": 0.00041444695671088994, + "learning_rate": 1.8489357080799306e-06, + "loss": 0.0054, + "step": 35480 + }, + { + "epoch": 7.708514335360556, + "grad_norm": 0.0004336989077273756, + "learning_rate": 1.8353605560382278e-06, + "loss": 0.0, + "step": 35490 + }, + { + "epoch": 7.710686359687228, + "grad_norm": 0.0004059735219925642, + "learning_rate": 1.821785403996525e-06, + "loss": 0.0001, + "step": 35500 + }, + { + "epoch": 7.712858384013901, + "grad_norm": 0.0004083770327270031, + "learning_rate": 1.808210251954822e-06, + "loss": 0.0, + "step": 35510 + }, + { + "epoch": 7.715030408340573, + "grad_norm": 0.0004174058558419347, + "learning_rate": 1.7946350999131192e-06, + "loss": 0.0001, + "step": 35520 + }, + { + "epoch": 7.717202432667246, + "grad_norm": 0.00041737474384717643, + "learning_rate": 1.7810599478714163e-06, + "loss": 0.0045, + "step": 35530 + }, + { + "epoch": 7.719374456993918, + "grad_norm": 0.00040731337503530085, + "learning_rate": 1.7674847958297135e-06, + "loss": 0.0, + "step": 35540 + }, + { + "epoch": 7.7215464813205905, + "grad_norm": 0.0004167399602010846, + "learning_rate": 1.7539096437880104e-06, + "loss": 0.0, + "step": 35550 + }, + { + "epoch": 7.7237185056472635, + "grad_norm": 0.0004124731640331447, + "learning_rate": 1.7403344917463076e-06, + "loss": 0.0045, + "step": 35560 + }, + { + "epoch": 7.7258905299739355, + "grad_norm": 0.0004200803814455867, + "learning_rate": 1.7267593397046047e-06, + "loss": 0.0038, + "step": 35570 + }, + { + "epoch": 7.7280625543006085, + "grad_norm": 0.0004072072042617947, + "learning_rate": 1.7131841876629018e-06, + "loss": 0.0001, + "step": 35580 + }, + { + "epoch": 7.730234578627281, + "grad_norm": 0.0004259563866071403, + "learning_rate": 1.699609035621199e-06, + "loss": 0.0001, + "step": 35590 + }, + { + "epoch": 7.732406602953953, + "grad_norm": 0.00041089204023592174, + "learning_rate": 1.6860338835794961e-06, + "loss": 0.0001, + "step": 35600 + }, + { + "epoch": 7.734578627280626, + "grad_norm": 0.0007605640566907823, + "learning_rate": 1.6724587315377933e-06, + "loss": 0.0, + "step": 35610 + }, + { + "epoch": 7.736750651607298, + "grad_norm": 0.0004129525041207671, + "learning_rate": 1.6588835794960904e-06, + "loss": 0.0001, + "step": 35620 + }, + { + "epoch": 7.738922675933971, + "grad_norm": 0.00040990023990161717, + "learning_rate": 1.6453084274543873e-06, + "loss": 0.0, + "step": 35630 + }, + { + "epoch": 7.741094700260643, + "grad_norm": 0.0004054093733429909, + "learning_rate": 1.6317332754126845e-06, + "loss": 0.0, + "step": 35640 + }, + { + "epoch": 7.743266724587316, + "grad_norm": 0.00047282635932788253, + "learning_rate": 1.6181581233709816e-06, + "loss": 0.0, + "step": 35650 + }, + { + "epoch": 7.745438748913988, + "grad_norm": 0.0004282636509742588, + "learning_rate": 1.6045829713292792e-06, + "loss": 0.0001, + "step": 35660 + }, + { + "epoch": 7.747610773240661, + "grad_norm": 0.0004196454829070717, + "learning_rate": 1.5910078192875761e-06, + "loss": 0.0, + "step": 35670 + }, + { + "epoch": 7.749782797567333, + "grad_norm": 0.00041647738544270396, + "learning_rate": 1.5774326672458733e-06, + "loss": 0.0052, + "step": 35680 + }, + { + "epoch": 7.751954821894005, + "grad_norm": 0.00042042401037178934, + "learning_rate": 1.5638575152041704e-06, + "loss": 0.0, + "step": 35690 + }, + { + "epoch": 7.754126846220678, + "grad_norm": 0.00040815555257722735, + "learning_rate": 1.5502823631624674e-06, + "loss": 0.0109, + "step": 35700 + }, + { + "epoch": 7.75629887054735, + "grad_norm": 0.0008912209304980934, + "learning_rate": 1.5367072111207647e-06, + "loss": 0.0001, + "step": 35710 + }, + { + "epoch": 7.758470894874023, + "grad_norm": 0.0004114516486879438, + "learning_rate": 1.5231320590790619e-06, + "loss": 0.0001, + "step": 35720 + }, + { + "epoch": 7.760642919200695, + "grad_norm": 0.0004173968336544931, + "learning_rate": 1.509556907037359e-06, + "loss": 0.0, + "step": 35730 + }, + { + "epoch": 7.762814943527368, + "grad_norm": 0.0004173132765572518, + "learning_rate": 1.4959817549956561e-06, + "loss": 0.0001, + "step": 35740 + }, + { + "epoch": 7.76498696785404, + "grad_norm": 0.0004356455756351352, + "learning_rate": 1.482406602953953e-06, + "loss": 0.0001, + "step": 35750 + }, + { + "epoch": 7.767158992180712, + "grad_norm": 0.00044494314352050424, + "learning_rate": 1.4688314509122502e-06, + "loss": 0.0001, + "step": 35760 + }, + { + "epoch": 7.769331016507385, + "grad_norm": 0.0004122602695133537, + "learning_rate": 1.4552562988705474e-06, + "loss": 0.0, + "step": 35770 + }, + { + "epoch": 7.771503040834057, + "grad_norm": 0.00040964328218251467, + "learning_rate": 1.4416811468288445e-06, + "loss": 0.0001, + "step": 35780 + }, + { + "epoch": 7.77367506516073, + "grad_norm": 0.0005291461129672825, + "learning_rate": 1.4281059947871417e-06, + "loss": 0.0, + "step": 35790 + }, + { + "epoch": 7.775847089487402, + "grad_norm": 0.0004087795678060502, + "learning_rate": 1.4145308427454388e-06, + "loss": 0.0, + "step": 35800 + }, + { + "epoch": 7.778019113814075, + "grad_norm": 0.0006858239066787064, + "learning_rate": 1.400955690703736e-06, + "loss": 0.0049, + "step": 35810 + }, + { + "epoch": 7.780191138140747, + "grad_norm": 0.00041144140413962305, + "learning_rate": 1.387380538662033e-06, + "loss": 0.0001, + "step": 35820 + }, + { + "epoch": 7.782363162467419, + "grad_norm": 0.00048641389003023505, + "learning_rate": 1.3738053866203302e-06, + "loss": 0.0001, + "step": 35830 + }, + { + "epoch": 7.784535186794092, + "grad_norm": 0.0004075106990057975, + "learning_rate": 1.3602302345786274e-06, + "loss": 0.0034, + "step": 35840 + }, + { + "epoch": 7.786707211120764, + "grad_norm": 0.0004153190820943564, + "learning_rate": 1.3466550825369245e-06, + "loss": 0.0001, + "step": 35850 + }, + { + "epoch": 7.788879235447437, + "grad_norm": 0.0008529599872417748, + "learning_rate": 1.3330799304952217e-06, + "loss": 0.0001, + "step": 35860 + }, + { + "epoch": 7.791051259774109, + "grad_norm": 0.00041363760828971863, + "learning_rate": 1.3195047784535188e-06, + "loss": 0.0, + "step": 35870 + }, + { + "epoch": 7.793223284100782, + "grad_norm": 0.00042468035826459527, + "learning_rate": 1.3059296264118157e-06, + "loss": 0.0001, + "step": 35880 + }, + { + "epoch": 7.795395308427454, + "grad_norm": 0.0013285571476444602, + "learning_rate": 1.2923544743701129e-06, + "loss": 0.0001, + "step": 35890 + }, + { + "epoch": 7.797567332754127, + "grad_norm": 0.00040576845640316606, + "learning_rate": 1.27877932232841e-06, + "loss": 0.0, + "step": 35900 + }, + { + "epoch": 7.799739357080799, + "grad_norm": 0.0004062264342792332, + "learning_rate": 1.2652041702867074e-06, + "loss": 0.0, + "step": 35910 + }, + { + "epoch": 7.801911381407471, + "grad_norm": 0.0004351633251644671, + "learning_rate": 1.2516290182450045e-06, + "loss": 0.0, + "step": 35920 + }, + { + "epoch": 7.804083405734144, + "grad_norm": 0.00041670544305816293, + "learning_rate": 1.2380538662033017e-06, + "loss": 0.0, + "step": 35930 + }, + { + "epoch": 7.8062554300608165, + "grad_norm": 0.0004084085812792182, + "learning_rate": 1.2244787141615986e-06, + "loss": 0.0, + "step": 35940 + }, + { + "epoch": 7.808427454387489, + "grad_norm": 0.0011126359459012747, + "learning_rate": 1.2109035621198957e-06, + "loss": 0.0049, + "step": 35950 + }, + { + "epoch": 7.8105994787141615, + "grad_norm": 0.00042124345782212913, + "learning_rate": 1.1973284100781929e-06, + "loss": 0.0, + "step": 35960 + }, + { + "epoch": 7.812771503040834, + "grad_norm": 0.00041270238580182195, + "learning_rate": 1.18375325803649e-06, + "loss": 0.0001, + "step": 35970 + }, + { + "epoch": 7.8149435273675065, + "grad_norm": 0.00040624625398777425, + "learning_rate": 1.1701781059947872e-06, + "loss": 0.0001, + "step": 35980 + }, + { + "epoch": 7.817115551694179, + "grad_norm": 0.0004104378167539835, + "learning_rate": 1.1566029539530843e-06, + "loss": 0.0001, + "step": 35990 + }, + { + "epoch": 7.819287576020852, + "grad_norm": 0.0007016230374574661, + "learning_rate": 1.1430278019113815e-06, + "loss": 0.0047, + "step": 36000 + }, + { + "epoch": 7.821459600347524, + "grad_norm": 0.0005074520013295114, + "learning_rate": 1.1294526498696786e-06, + "loss": 0.0001, + "step": 36010 + }, + { + "epoch": 7.823631624674197, + "grad_norm": 0.0009288796572946012, + "learning_rate": 1.1158774978279757e-06, + "loss": 0.0001, + "step": 36020 + }, + { + "epoch": 7.825803649000869, + "grad_norm": 0.0008313364232890308, + "learning_rate": 1.1023023457862729e-06, + "loss": 0.0, + "step": 36030 + }, + { + "epoch": 7.827975673327542, + "grad_norm": 0.00041966530261561275, + "learning_rate": 1.08872719374457e-06, + "loss": 0.0001, + "step": 36040 + }, + { + "epoch": 7.830147697654214, + "grad_norm": 0.0004062599618919194, + "learning_rate": 1.0751520417028672e-06, + "loss": 0.0, + "step": 36050 + }, + { + "epoch": 7.832319721980886, + "grad_norm": 0.0004279286367818713, + "learning_rate": 1.0615768896611643e-06, + "loss": 0.0001, + "step": 36060 + }, + { + "epoch": 7.834491746307559, + "grad_norm": 0.00041747227078303695, + "learning_rate": 1.0480017376194613e-06, + "loss": 0.0, + "step": 36070 + }, + { + "epoch": 7.836663770634231, + "grad_norm": 0.0004111778107471764, + "learning_rate": 1.0344265855777584e-06, + "loss": 0.0, + "step": 36080 + }, + { + "epoch": 7.838835794960904, + "grad_norm": 0.18399830162525177, + "learning_rate": 1.0208514335360555e-06, + "loss": 0.009, + "step": 36090 + }, + { + "epoch": 7.841007819287576, + "grad_norm": 0.0004036433238070458, + "learning_rate": 1.0072762814943527e-06, + "loss": 0.0, + "step": 36100 + }, + { + "epoch": 7.843179843614249, + "grad_norm": 0.0004178290255367756, + "learning_rate": 9.9370112945265e-07, + "loss": 0.0001, + "step": 36110 + }, + { + "epoch": 7.845351867940921, + "grad_norm": 0.00040722807170823216, + "learning_rate": 9.801259774109472e-07, + "loss": 0.0, + "step": 36120 + }, + { + "epoch": 7.847523892267594, + "grad_norm": 0.00040777475805953145, + "learning_rate": 9.665508253692443e-07, + "loss": 0.006, + "step": 36130 + }, + { + "epoch": 7.849695916594266, + "grad_norm": 0.00040998373879119754, + "learning_rate": 9.529756733275414e-07, + "loss": 0.0001, + "step": 36140 + }, + { + "epoch": 7.851867940920938, + "grad_norm": 0.00065061473287642, + "learning_rate": 9.394005212858384e-07, + "loss": 0.0001, + "step": 36150 + }, + { + "epoch": 7.854039965247611, + "grad_norm": 0.0006581631605513394, + "learning_rate": 9.258253692441356e-07, + "loss": 0.0001, + "step": 36160 + }, + { + "epoch": 7.856211989574283, + "grad_norm": 0.00040802801959216595, + "learning_rate": 9.122502172024327e-07, + "loss": 0.0, + "step": 36170 + }, + { + "epoch": 7.858384013900956, + "grad_norm": 0.0004146042338106781, + "learning_rate": 8.986750651607298e-07, + "loss": 0.0051, + "step": 36180 + }, + { + "epoch": 7.860556038227628, + "grad_norm": 0.0004056603938806802, + "learning_rate": 8.850999131190269e-07, + "loss": 0.0067, + "step": 36190 + }, + { + "epoch": 7.8627280625543, + "grad_norm": 0.0006893807440064847, + "learning_rate": 8.71524761077324e-07, + "loss": 0.0001, + "step": 36200 + }, + { + "epoch": 7.864900086880973, + "grad_norm": 0.00041687351767905056, + "learning_rate": 8.579496090356213e-07, + "loss": 0.0, + "step": 36210 + }, + { + "epoch": 7.867072111207645, + "grad_norm": 0.0004174001805949956, + "learning_rate": 8.443744569939184e-07, + "loss": 0.0001, + "step": 36220 + }, + { + "epoch": 7.869244135534318, + "grad_norm": 0.00040999799966812134, + "learning_rate": 8.307993049522156e-07, + "loss": 0.0, + "step": 36230 + }, + { + "epoch": 7.87141615986099, + "grad_norm": 0.0004175525682512671, + "learning_rate": 8.172241529105127e-07, + "loss": 0.0, + "step": 36240 + }, + { + "epoch": 7.873588184187663, + "grad_norm": 0.0010128975845873356, + "learning_rate": 8.036490008688097e-07, + "loss": 0.0001, + "step": 36250 + }, + { + "epoch": 7.875760208514335, + "grad_norm": 0.0007056336617097259, + "learning_rate": 7.900738488271069e-07, + "loss": 0.0051, + "step": 36260 + }, + { + "epoch": 7.877932232841008, + "grad_norm": 0.0004121058445889503, + "learning_rate": 7.76498696785404e-07, + "loss": 0.0001, + "step": 36270 + }, + { + "epoch": 7.88010425716768, + "grad_norm": 0.0004146612773183733, + "learning_rate": 7.629235447437012e-07, + "loss": 0.0, + "step": 36280 + }, + { + "epoch": 7.882276281494352, + "grad_norm": 0.00040536164306104183, + "learning_rate": 7.493483927019983e-07, + "loss": 0.0001, + "step": 36290 + }, + { + "epoch": 7.884448305821025, + "grad_norm": 0.0005756246391683817, + "learning_rate": 7.357732406602955e-07, + "loss": 0.0001, + "step": 36300 + }, + { + "epoch": 7.886620330147697, + "grad_norm": 0.0009296848438680172, + "learning_rate": 7.221980886185926e-07, + "loss": 0.0001, + "step": 36310 + }, + { + "epoch": 7.88879235447437, + "grad_norm": 0.000410063483286649, + "learning_rate": 7.086229365768896e-07, + "loss": 0.0001, + "step": 36320 + }, + { + "epoch": 7.890964378801042, + "grad_norm": 0.0007830615504644811, + "learning_rate": 6.950477845351868e-07, + "loss": 0.0, + "step": 36330 + }, + { + "epoch": 7.893136403127715, + "grad_norm": 0.00046702235704287887, + "learning_rate": 6.81472632493484e-07, + "loss": 0.0, + "step": 36340 + }, + { + "epoch": 7.895308427454387, + "grad_norm": 0.00041825513471849263, + "learning_rate": 6.678974804517811e-07, + "loss": 0.0, + "step": 36350 + }, + { + "epoch": 7.89748045178106, + "grad_norm": 0.0004378632584121078, + "learning_rate": 6.543223284100782e-07, + "loss": 0.0, + "step": 36360 + }, + { + "epoch": 7.8996524761077325, + "grad_norm": 0.00043384116725064814, + "learning_rate": 6.407471763683754e-07, + "loss": 0.0001, + "step": 36370 + }, + { + "epoch": 7.9018245004344045, + "grad_norm": 0.00041194152436219156, + "learning_rate": 6.271720243266724e-07, + "loss": 0.0, + "step": 36380 + }, + { + "epoch": 7.9039965247610775, + "grad_norm": 0.00041512143798172474, + "learning_rate": 6.135968722849696e-07, + "loss": 0.0001, + "step": 36390 + }, + { + "epoch": 7.90616854908775, + "grad_norm": 0.0007027858518995345, + "learning_rate": 6.000217202432668e-07, + "loss": 0.0001, + "step": 36400 + }, + { + "epoch": 7.9083405734144225, + "grad_norm": 0.0007233781507238746, + "learning_rate": 5.864465682015638e-07, + "loss": 0.0001, + "step": 36410 + }, + { + "epoch": 7.910512597741095, + "grad_norm": 0.00040531178819946945, + "learning_rate": 5.72871416159861e-07, + "loss": 0.0063, + "step": 36420 + }, + { + "epoch": 7.912684622067767, + "grad_norm": 0.0006673138123005629, + "learning_rate": 5.592962641181581e-07, + "loss": 0.0054, + "step": 36430 + }, + { + "epoch": 7.91485664639444, + "grad_norm": 0.0004134076298214495, + "learning_rate": 5.457211120764553e-07, + "loss": 0.0, + "step": 36440 + }, + { + "epoch": 7.917028670721112, + "grad_norm": 0.0006809644983150065, + "learning_rate": 5.321459600347524e-07, + "loss": 0.0001, + "step": 36450 + }, + { + "epoch": 7.919200695047785, + "grad_norm": 0.0010297917760908604, + "learning_rate": 5.185708079930495e-07, + "loss": 0.0001, + "step": 36460 + }, + { + "epoch": 7.921372719374457, + "grad_norm": 0.00040686517604626715, + "learning_rate": 5.049956559513467e-07, + "loss": 0.0, + "step": 36470 + }, + { + "epoch": 7.92354474370113, + "grad_norm": 0.00040707376319915056, + "learning_rate": 4.914205039096437e-07, + "loss": 0.0, + "step": 36480 + }, + { + "epoch": 7.925716768027802, + "grad_norm": 0.00041068848804570735, + "learning_rate": 4.77845351867941e-07, + "loss": 0.0, + "step": 36490 + }, + { + "epoch": 7.927888792354475, + "grad_norm": 0.00040782021824270487, + "learning_rate": 4.6427019982623807e-07, + "loss": 0.0001, + "step": 36500 + }, + { + "epoch": 7.930060816681147, + "grad_norm": 0.000491233600769192, + "learning_rate": 4.506950477845352e-07, + "loss": 0.0001, + "step": 36510 + }, + { + "epoch": 7.932232841007819, + "grad_norm": 0.0004565462877508253, + "learning_rate": 4.371198957428323e-07, + "loss": 0.0042, + "step": 36520 + }, + { + "epoch": 7.934404865334492, + "grad_norm": 0.00040316057857126, + "learning_rate": 4.2354474370112945e-07, + "loss": 0.0001, + "step": 36530 + }, + { + "epoch": 7.936576889661164, + "grad_norm": 0.0011551212519407272, + "learning_rate": 4.0996959165942665e-07, + "loss": 0.0088, + "step": 36540 + }, + { + "epoch": 7.938748913987837, + "grad_norm": 0.00041238832636736333, + "learning_rate": 3.9639443961772374e-07, + "loss": 0.0, + "step": 36550 + }, + { + "epoch": 7.940920938314509, + "grad_norm": 0.0004119895747862756, + "learning_rate": 3.828192875760209e-07, + "loss": 0.0, + "step": 36560 + }, + { + "epoch": 7.943092962641182, + "grad_norm": 0.00041164946742355824, + "learning_rate": 3.6924413553431797e-07, + "loss": 0.0001, + "step": 36570 + }, + { + "epoch": 7.945264986967854, + "grad_norm": 0.00064078503055498, + "learning_rate": 3.556689834926151e-07, + "loss": 0.0001, + "step": 36580 + }, + { + "epoch": 7.947437011294527, + "grad_norm": 0.00040421413723379374, + "learning_rate": 3.4209383145091226e-07, + "loss": 0.0001, + "step": 36590 + }, + { + "epoch": 7.949609035621199, + "grad_norm": 0.0010148753644898534, + "learning_rate": 3.285186794092094e-07, + "loss": 0.0001, + "step": 36600 + }, + { + "epoch": 7.951781059947871, + "grad_norm": 0.0004108991415705532, + "learning_rate": 3.1494352736750655e-07, + "loss": 0.0, + "step": 36610 + }, + { + "epoch": 7.953953084274544, + "grad_norm": 0.0004116395430173725, + "learning_rate": 3.0136837532580364e-07, + "loss": 0.0, + "step": 36620 + }, + { + "epoch": 7.956125108601216, + "grad_norm": 0.00040760665433481336, + "learning_rate": 2.877932232841008e-07, + "loss": 0.0, + "step": 36630 + }, + { + "epoch": 7.958297132927889, + "grad_norm": 0.000540865701623261, + "learning_rate": 2.7421807124239793e-07, + "loss": 0.0, + "step": 36640 + }, + { + "epoch": 7.960469157254561, + "grad_norm": 0.0004090243310201913, + "learning_rate": 2.6064291920069507e-07, + "loss": 0.0, + "step": 36650 + }, + { + "epoch": 7.962641181581233, + "grad_norm": 0.0004173625202383846, + "learning_rate": 2.470677671589922e-07, + "loss": 0.0001, + "step": 36660 + }, + { + "epoch": 7.964813205907906, + "grad_norm": 0.00040882351459003985, + "learning_rate": 2.334926151172893e-07, + "loss": 0.0, + "step": 36670 + }, + { + "epoch": 7.966985230234578, + "grad_norm": 0.0013148378347977996, + "learning_rate": 2.1991746307558648e-07, + "loss": 0.0001, + "step": 36680 + }, + { + "epoch": 7.969157254561251, + "grad_norm": 0.00040579578489996493, + "learning_rate": 2.063423110338836e-07, + "loss": 0.0049, + "step": 36690 + }, + { + "epoch": 7.971329278887923, + "grad_norm": 0.0004032440483570099, + "learning_rate": 1.927671589921807e-07, + "loss": 0.0, + "step": 36700 + }, + { + "epoch": 7.973501303214596, + "grad_norm": 0.00041121779941022396, + "learning_rate": 1.7919200695047785e-07, + "loss": 0.0001, + "step": 36710 + }, + { + "epoch": 7.975673327541268, + "grad_norm": 0.000417822360759601, + "learning_rate": 1.65616854908775e-07, + "loss": 0.004, + "step": 36720 + }, + { + "epoch": 7.977845351867941, + "grad_norm": 0.0004095873446203768, + "learning_rate": 1.5204170286707212e-07, + "loss": 0.0, + "step": 36730 + }, + { + "epoch": 7.980017376194613, + "grad_norm": 0.0004114782204851508, + "learning_rate": 1.3846655082536923e-07, + "loss": 0.0101, + "step": 36740 + }, + { + "epoch": 7.9821894005212854, + "grad_norm": 0.0004039192572236061, + "learning_rate": 1.2489139878366638e-07, + "loss": 0.0001, + "step": 36750 + }, + { + "epoch": 7.984361424847958, + "grad_norm": 0.0007135707419365644, + "learning_rate": 1.1131624674196352e-07, + "loss": 0.0001, + "step": 36760 + }, + { + "epoch": 7.9865334491746305, + "grad_norm": 0.0004933361196890473, + "learning_rate": 9.774109470026065e-08, + "loss": 0.0, + "step": 36770 + }, + { + "epoch": 7.9887054735013034, + "grad_norm": 0.23448993265628815, + "learning_rate": 8.416594265855778e-08, + "loss": 0.0055, + "step": 36780 + }, + { + "epoch": 7.9908774978279755, + "grad_norm": 0.0004083155654370785, + "learning_rate": 7.059079061685491e-08, + "loss": 0.0001, + "step": 36790 + }, + { + "epoch": 7.9930495221546485, + "grad_norm": 0.000955652620177716, + "learning_rate": 5.7015638575152043e-08, + "loss": 0.0001, + "step": 36800 + }, + { + "epoch": 7.9952215464813206, + "grad_norm": 0.0004135241615585983, + "learning_rate": 4.3440486533449174e-08, + "loss": 0.0001, + "step": 36810 + }, + { + "epoch": 7.9973935708079935, + "grad_norm": 0.00041545607382431626, + "learning_rate": 2.986533449174631e-08, + "loss": 0.0001, + "step": 36820 + }, + { + "epoch": 7.999565595134666, + "grad_norm": 0.00040375188109464943, + "learning_rate": 1.6290182450043442e-08, + "loss": 0.0, + "step": 36830 + }, + { + "epoch": 8.0, + "eval_f1": 0.5962264150943396, + "eval_loss": 0.08886083960533142, + "eval_runtime": 84.061, + "eval_samples_per_second": 118.664, + "eval_steps_per_second": 7.423, + "step": 36832 + }, + { + "epoch": 8.0, + "step": 36832, + "total_flos": 4.566447964008677e+19, + "train_loss": 0.0017740661595287623, + "train_runtime": 6442.5194, + "train_samples_per_second": 91.467, + "train_steps_per_second": 5.717 } ], "logging_steps": 10, - "max_steps": 18416, + "max_steps": 36832, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -12958,7 +25888,7 @@ "attributes": {} } }, - "total_flos": 2.2832239820043387e+19, + "total_flos": 4.566447964008677e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null