diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5038 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 1431, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011180992313067784, + "grad_norm": 0.0, + "learning_rate": 5e-06, + "loss": 1.2878, + "step": 2 + }, + { + "epoch": 0.02236198462613557, + "grad_norm": 0.0, + "learning_rate": 5e-06, + "loss": 1.392, + "step": 4 + }, + { + "epoch": 0.033542976939203356, + "grad_norm": 0.0, + "learning_rate": 5e-06, + "loss": 1.3594, + "step": 6 + }, + { + "epoch": 0.04472396925227114, + "grad_norm": 0.0, + "learning_rate": 5e-06, + "loss": 1.2958, + "step": 8 + }, + { + "epoch": 0.055904961565338925, + "grad_norm": 0.0, + "learning_rate": 5e-06, + "loss": 1.3475, + "step": 10 + }, + { + "epoch": 0.06708595387840671, + "grad_norm": 0.0, + "learning_rate": 5e-06, + "loss": 1.2303, + "step": 12 + }, + { + "epoch": 0.07826694619147449, + "grad_norm": 0.0, + "learning_rate": 5e-06, + "loss": 1.1964, + "step": 14 + }, + { + "epoch": 0.08944793850454227, + "grad_norm": 0.0, + "learning_rate": 5e-06, + "loss": 1.3328, + "step": 16 + }, + { + "epoch": 0.10062893081761007, + "grad_norm": 6.944002672340658, + "learning_rate": 4.999996106235862e-06, + "loss": 1.3134, + "step": 18 + }, + { + "epoch": 0.11180992313067785, + "grad_norm": 7.800497498064014, + "learning_rate": 4.999964956195521e-06, + "loss": 1.1147, + "step": 20 + }, + { + "epoch": 0.12299091544374563, + "grad_norm": 4.4662495771497355, + "learning_rate": 4.999902656502973e-06, + "loss": 1.025, + "step": 22 + }, + { + "epoch": 0.13417190775681342, + "grad_norm": 4.026851738528776, + "learning_rate": 4.999809207934472e-06, + "loss": 1.0448, + "step": 24 + }, + { + "epoch": 0.1453529000698812, + "grad_norm": 5.658278761851693, + "learning_rate": 4.999684611654392e-06, + "loss": 0.9826, + "step": 26 + }, + { + "epoch": 0.15653389238294899, + "grad_norm": 3.9275877006609505, + "learning_rate": 4.9995288692152046e-06, + "loss": 0.9627, + "step": 28 + }, + { + "epoch": 0.16771488469601678, + "grad_norm": 3.634771950296262, + "learning_rate": 4.9993419825574686e-06, + "loss": 0.9476, + "step": 30 + }, + { + "epoch": 0.17889587700908455, + "grad_norm": 4.604406424526374, + "learning_rate": 4.9992368608591775e-06, + "loss": 0.9414, + "step": 32 + }, + { + "epoch": 0.19007686932215234, + "grad_norm": 5.708200502114745, + "learning_rate": 4.999003262361029e-06, + "loss": 0.9572, + "step": 34 + }, + { + "epoch": 0.20125786163522014, + "grad_norm": 5.020134712294459, + "learning_rate": 4.998738526193412e-06, + "loss": 0.9544, + "step": 36 + }, + { + "epoch": 0.2124388539482879, + "grad_norm": 4.643332496496484, + "learning_rate": 4.998442655654946e-06, + "loss": 0.8504, + "step": 38 + }, + { + "epoch": 0.2236198462613557, + "grad_norm": 4.7843514072232125, + "learning_rate": 4.998115654432191e-06, + "loss": 0.914, + "step": 40 + }, + { + "epoch": 0.2348008385744235, + "grad_norm": 3.973113705087721, + "learning_rate": 4.997757526599592e-06, + "loss": 0.8303, + "step": 42 + }, + { + "epoch": 0.24598183088749126, + "grad_norm": 5.753323652117126, + "learning_rate": 4.9973682766194355e-06, + "loss": 0.8916, + "step": 44 + }, + { + "epoch": 0.25716282320055905, + "grad_norm": 4.00607759948128, + "learning_rate": 4.996947909341789e-06, + "loss": 0.9391, + "step": 46 + }, + { + "epoch": 0.26834381551362685, + "grad_norm": 4.73751358896988, + "learning_rate": 4.996496430004446e-06, + "loss": 0.8445, + "step": 48 + }, + { + "epoch": 0.27952480782669464, + "grad_norm": 3.801634673248135, + "learning_rate": 4.9960138442328535e-06, + "loss": 0.8354, + "step": 50 + }, + { + "epoch": 0.2907058001397624, + "grad_norm": 4.998706656181077, + "learning_rate": 4.9955001580400475e-06, + "loss": 0.8556, + "step": 52 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 5.235396952388322, + "learning_rate": 4.994955377826577e-06, + "loss": 0.8821, + "step": 54 + }, + { + "epoch": 0.31306778476589797, + "grad_norm": 4.593843550283633, + "learning_rate": 4.994379510380421e-06, + "loss": 0.7965, + "step": 56 + }, + { + "epoch": 0.32424877707896577, + "grad_norm": 4.636040406542864, + "learning_rate": 4.993772562876909e-06, + "loss": 0.8576, + "step": 58 + }, + { + "epoch": 0.33542976939203356, + "grad_norm": 4.422458900120915, + "learning_rate": 4.993134542878631e-06, + "loss": 0.8388, + "step": 60 + }, + { + "epoch": 0.3466107617051013, + "grad_norm": 4.88515796654498, + "learning_rate": 4.992465458335335e-06, + "loss": 0.8427, + "step": 62 + }, + { + "epoch": 0.3577917540181691, + "grad_norm": 4.620642626620232, + "learning_rate": 4.991765317583841e-06, + "loss": 0.8088, + "step": 64 + }, + { + "epoch": 0.3689727463312369, + "grad_norm": 3.0164501013815146, + "learning_rate": 4.991034129347927e-06, + "loss": 0.7643, + "step": 66 + }, + { + "epoch": 0.3801537386443047, + "grad_norm": 4.0807085306410915, + "learning_rate": 4.990271902738223e-06, + "loss": 0.8304, + "step": 68 + }, + { + "epoch": 0.3913347309573725, + "grad_norm": 4.913983348963418, + "learning_rate": 4.989478647252101e-06, + "loss": 0.8694, + "step": 70 + }, + { + "epoch": 0.4025157232704403, + "grad_norm": 5.427166275548586, + "learning_rate": 4.988654372773552e-06, + "loss": 0.8031, + "step": 72 + }, + { + "epoch": 0.413696715583508, + "grad_norm": 4.976699288607289, + "learning_rate": 4.987799089573066e-06, + "loss": 0.7548, + "step": 74 + }, + { + "epoch": 0.4248777078965758, + "grad_norm": 5.035712861337141, + "learning_rate": 4.986912808307502e-06, + "loss": 0.7769, + "step": 76 + }, + { + "epoch": 0.4360587002096436, + "grad_norm": 5.703104314189732, + "learning_rate": 4.985995540019956e-06, + "loss": 0.7744, + "step": 78 + }, + { + "epoch": 0.4472396925227114, + "grad_norm": 3.6174332203212938, + "learning_rate": 4.985047296139622e-06, + "loss": 0.7215, + "step": 80 + }, + { + "epoch": 0.4584206848357792, + "grad_norm": 5.084461038739496, + "learning_rate": 4.984068088481654e-06, + "loss": 0.7462, + "step": 82 + }, + { + "epoch": 0.469601677148847, + "grad_norm": 5.500722673783384, + "learning_rate": 4.983057929247014e-06, + "loss": 0.7937, + "step": 84 + }, + { + "epoch": 0.4807826694619147, + "grad_norm": 5.76928743736382, + "learning_rate": 4.9820168310223215e-06, + "loss": 0.7701, + "step": 86 + }, + { + "epoch": 0.4919636617749825, + "grad_norm": 4.3638410984754366, + "learning_rate": 4.980944806779698e-06, + "loss": 0.7063, + "step": 88 + }, + { + "epoch": 0.5031446540880503, + "grad_norm": 6.6022312070502664, + "learning_rate": 4.979841869876603e-06, + "loss": 0.7829, + "step": 90 + }, + { + "epoch": 0.5143256464011181, + "grad_norm": 5.114853414480892, + "learning_rate": 4.97870803405567e-06, + "loss": 0.7419, + "step": 92 + }, + { + "epoch": 0.5255066387141859, + "grad_norm": 5.450293615821356, + "learning_rate": 4.977543313444534e-06, + "loss": 0.7428, + "step": 94 + }, + { + "epoch": 0.5366876310272537, + "grad_norm": 3.888671786201343, + "learning_rate": 4.976347722555655e-06, + "loss": 0.763, + "step": 96 + }, + { + "epoch": 0.5478686233403215, + "grad_norm": 5.580018062591517, + "learning_rate": 4.975121276286136e-06, + "loss": 0.7451, + "step": 98 + }, + { + "epoch": 0.5590496156533893, + "grad_norm": 5.244409209125885, + "learning_rate": 4.973863989917545e-06, + "loss": 0.6658, + "step": 100 + }, + { + "epoch": 0.570230607966457, + "grad_norm": 6.341201782490113, + "learning_rate": 4.9725758791157105e-06, + "loss": 0.7042, + "step": 102 + }, + { + "epoch": 0.5814116002795248, + "grad_norm": 3.63864440598579, + "learning_rate": 4.9712569599305415e-06, + "loss": 0.6859, + "step": 104 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 5.643540415249962, + "learning_rate": 4.9699072487958185e-06, + "loss": 0.7072, + "step": 106 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 4.518214836889502, + "learning_rate": 4.968526762528988e-06, + "loss": 0.6989, + "step": 108 + }, + { + "epoch": 0.6149545772187281, + "grad_norm": 4.813780988459217, + "learning_rate": 4.96711551833096e-06, + "loss": 0.6213, + "step": 110 + }, + { + "epoch": 0.6261355695317959, + "grad_norm": 6.534716960952802, + "learning_rate": 4.965673533785887e-06, + "loss": 0.6603, + "step": 112 + }, + { + "epoch": 0.6373165618448637, + "grad_norm": 4.694700268634709, + "learning_rate": 4.9642008268609455e-06, + "loss": 0.6458, + "step": 114 + }, + { + "epoch": 0.6484975541579315, + "grad_norm": 3.797163997052886, + "learning_rate": 4.962697415906118e-06, + "loss": 0.6208, + "step": 116 + }, + { + "epoch": 0.6596785464709993, + "grad_norm": 5.303604758140139, + "learning_rate": 4.961163319653959e-06, + "loss": 0.6175, + "step": 118 + }, + { + "epoch": 0.6708595387840671, + "grad_norm": 3.8308857949946398, + "learning_rate": 4.959598557219361e-06, + "loss": 0.6178, + "step": 120 + }, + { + "epoch": 0.6820405310971349, + "grad_norm": 5.611339241664303, + "learning_rate": 4.95800314809932e-06, + "loss": 0.617, + "step": 122 + }, + { + "epoch": 0.6932215234102026, + "grad_norm": 5.234511261826922, + "learning_rate": 4.956377112172691e-06, + "loss": 0.6557, + "step": 124 + }, + { + "epoch": 0.7044025157232704, + "grad_norm": 4.381066733905507, + "learning_rate": 4.954720469699939e-06, + "loss": 0.6343, + "step": 126 + }, + { + "epoch": 0.7155835080363382, + "grad_norm": 5.113989443684452, + "learning_rate": 4.953033241322887e-06, + "loss": 0.6135, + "step": 128 + }, + { + "epoch": 0.726764500349406, + "grad_norm": 5.138987950069777, + "learning_rate": 4.951315448064462e-06, + "loss": 0.6403, + "step": 130 + }, + { + "epoch": 0.7379454926624738, + "grad_norm": 4.43583718290579, + "learning_rate": 4.949567111328428e-06, + "loss": 0.6226, + "step": 132 + }, + { + "epoch": 0.7491264849755416, + "grad_norm": 4.391597448273059, + "learning_rate": 4.947788252899124e-06, + "loss": 0.6333, + "step": 134 + }, + { + "epoch": 0.7603074772886094, + "grad_norm": 4.193385817962468, + "learning_rate": 4.945978894941189e-06, + "loss": 0.6884, + "step": 136 + }, + { + "epoch": 0.7714884696016772, + "grad_norm": 5.03154779607414, + "learning_rate": 4.944139059999286e-06, + "loss": 0.5783, + "step": 138 + }, + { + "epoch": 0.782669461914745, + "grad_norm": 6.345004441163444, + "learning_rate": 4.942268770997825e-06, + "loss": 0.5314, + "step": 140 + }, + { + "epoch": 0.7938504542278128, + "grad_norm": 4.800013540838224, + "learning_rate": 4.940368051240675e-06, + "loss": 0.5876, + "step": 142 + }, + { + "epoch": 0.8050314465408805, + "grad_norm": 5.229387760297341, + "learning_rate": 4.938436924410869e-06, + "loss": 0.6266, + "step": 144 + }, + { + "epoch": 0.8162124388539483, + "grad_norm": 5.663117027843187, + "learning_rate": 4.936475414570317e-06, + "loss": 0.5407, + "step": 146 + }, + { + "epoch": 0.827393431167016, + "grad_norm": 4.355698674662869, + "learning_rate": 4.9344835461595016e-06, + "loss": 0.5757, + "step": 148 + }, + { + "epoch": 0.8385744234800838, + "grad_norm": 3.73012661577406, + "learning_rate": 4.932461343997174e-06, + "loss": 0.5671, + "step": 150 + }, + { + "epoch": 0.8497554157931516, + "grad_norm": 5.17610307953933, + "learning_rate": 4.930408833280044e-06, + "loss": 0.5552, + "step": 152 + }, + { + "epoch": 0.8609364081062194, + "grad_norm": 4.8108290286110575, + "learning_rate": 4.928326039582468e-06, + "loss": 0.5455, + "step": 154 + }, + { + "epoch": 0.8721174004192872, + "grad_norm": 4.143977047297293, + "learning_rate": 4.926212988856131e-06, + "loss": 0.5865, + "step": 156 + }, + { + "epoch": 0.883298392732355, + "grad_norm": 4.809016102192773, + "learning_rate": 4.9240697074297205e-06, + "loss": 0.5904, + "step": 158 + }, + { + "epoch": 0.8944793850454228, + "grad_norm": 4.329310274878485, + "learning_rate": 4.921896222008598e-06, + "loss": 0.5213, + "step": 160 + }, + { + "epoch": 0.9056603773584906, + "grad_norm": 6.082276125346202, + "learning_rate": 4.919692559674469e-06, + "loss": 0.5321, + "step": 162 + }, + { + "epoch": 0.9168413696715584, + "grad_norm": 3.595682377289556, + "learning_rate": 4.917458747885045e-06, + "loss": 0.5589, + "step": 164 + }, + { + "epoch": 0.9280223619846262, + "grad_norm": 4.759398027424621, + "learning_rate": 4.9151948144737e-06, + "loss": 0.5252, + "step": 166 + }, + { + "epoch": 0.939203354297694, + "grad_norm": 4.925856740501272, + "learning_rate": 4.912900787649124e-06, + "loss": 0.5688, + "step": 168 + }, + { + "epoch": 0.9503843466107617, + "grad_norm": 4.9751554778931695, + "learning_rate": 4.910576695994976e-06, + "loss": 0.49, + "step": 170 + }, + { + "epoch": 0.9615653389238294, + "grad_norm": 4.404002437196143, + "learning_rate": 4.908222568469516e-06, + "loss": 0.5031, + "step": 172 + }, + { + "epoch": 0.9727463312368972, + "grad_norm": 4.438458089119356, + "learning_rate": 4.905838434405259e-06, + "loss": 0.5015, + "step": 174 + }, + { + "epoch": 0.983927323549965, + "grad_norm": 3.7675300141289205, + "learning_rate": 4.903424323508601e-06, + "loss": 0.5133, + "step": 176 + }, + { + "epoch": 0.9951083158630328, + "grad_norm": 5.557474516168906, + "learning_rate": 4.900980265859449e-06, + "loss": 0.4913, + "step": 178 + }, + { + "epoch": 1.0062893081761006, + "grad_norm": 4.4806858821540585, + "learning_rate": 4.898506291910847e-06, + "loss": 0.4446, + "step": 180 + }, + { + "epoch": 1.0174703004891683, + "grad_norm": 4.605929975666356, + "learning_rate": 4.896002432488599e-06, + "loss": 0.3632, + "step": 182 + }, + { + "epoch": 1.0286512928022362, + "grad_norm": 4.9794341930411665, + "learning_rate": 4.893468718790883e-06, + "loss": 0.3868, + "step": 184 + }, + { + "epoch": 1.039832285115304, + "grad_norm": 3.5317296745452733, + "learning_rate": 4.890905182387862e-06, + "loss": 0.4334, + "step": 186 + }, + { + "epoch": 1.0510132774283718, + "grad_norm": 4.568181420141649, + "learning_rate": 4.88831185522129e-06, + "loss": 0.456, + "step": 188 + }, + { + "epoch": 1.0621942697414395, + "grad_norm": 3.570260813698039, + "learning_rate": 4.885688769604115e-06, + "loss": 0.3846, + "step": 190 + }, + { + "epoch": 1.0733752620545074, + "grad_norm": 3.639759353451614, + "learning_rate": 4.883035958220077e-06, + "loss": 0.4363, + "step": 192 + }, + { + "epoch": 1.084556254367575, + "grad_norm": 4.074741691986429, + "learning_rate": 4.8803534541233016e-06, + "loss": 0.3782, + "step": 194 + }, + { + "epoch": 1.095737246680643, + "grad_norm": 4.875221867832197, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.3815, + "step": 196 + }, + { + "epoch": 1.1069182389937107, + "grad_norm": 3.575182053435755, + "learning_rate": 4.874899501857477e-06, + "loss": 0.4023, + "step": 198 + }, + { + "epoch": 1.1180992313067786, + "grad_norm": 3.984785984285916, + "learning_rate": 4.8721281216448675e-06, + "loss": 0.305, + "step": 200 + }, + { + "epoch": 1.1292802236198463, + "grad_norm": 3.997235184408756, + "learning_rate": 4.869327184631552e-06, + "loss": 0.3896, + "step": 202 + }, + { + "epoch": 1.140461215932914, + "grad_norm": 3.403723018382878, + "learning_rate": 4.866496725717304e-06, + "loss": 0.3332, + "step": 204 + }, + { + "epoch": 1.1516422082459818, + "grad_norm": 3.5740869992425917, + "learning_rate": 4.8636367801697415e-06, + "loss": 0.3299, + "step": 206 + }, + { + "epoch": 1.1628232005590495, + "grad_norm": 3.8789874672120033, + "learning_rate": 4.860747383623889e-06, + "loss": 0.4145, + "step": 208 + }, + { + "epoch": 1.1740041928721174, + "grad_norm": 3.8038820435820084, + "learning_rate": 4.857828572081731e-06, + "loss": 0.3171, + "step": 210 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 3.260333619392394, + "learning_rate": 4.854880381911762e-06, + "loss": 0.3474, + "step": 212 + }, + { + "epoch": 1.196366177498253, + "grad_norm": 2.8989963280714925, + "learning_rate": 4.851902849848536e-06, + "loss": 0.3931, + "step": 214 + }, + { + "epoch": 1.2075471698113207, + "grad_norm": 3.6383247911373773, + "learning_rate": 4.848896012992208e-06, + "loss": 0.3822, + "step": 216 + }, + { + "epoch": 1.2187281621243886, + "grad_norm": 3.0864181531286734, + "learning_rate": 4.845859908808074e-06, + "loss": 0.378, + "step": 218 + }, + { + "epoch": 1.2299091544374563, + "grad_norm": 2.494513481207721, + "learning_rate": 4.842794575126099e-06, + "loss": 0.3655, + "step": 220 + }, + { + "epoch": 1.2410901467505242, + "grad_norm": 2.6074910342756334, + "learning_rate": 4.839700050140448e-06, + "loss": 0.3973, + "step": 222 + }, + { + "epoch": 1.2522711390635919, + "grad_norm": 2.2421870374103285, + "learning_rate": 4.836576372409015e-06, + "loss": 0.3784, + "step": 224 + }, + { + "epoch": 1.2634521313766598, + "grad_norm": 2.451559449193117, + "learning_rate": 4.833423580852933e-06, + "loss": 0.3805, + "step": 226 + }, + { + "epoch": 1.2746331236897275, + "grad_norm": 2.5374184019501285, + "learning_rate": 4.830241714756099e-06, + "loss": 0.293, + "step": 228 + }, + { + "epoch": 1.2858141160027952, + "grad_norm": 2.525807489259318, + "learning_rate": 4.827030813764677e-06, + "loss": 0.2665, + "step": 230 + }, + { + "epoch": 1.296995108315863, + "grad_norm": 2.3755504317471523, + "learning_rate": 4.8237909178866075e-06, + "loss": 0.4108, + "step": 232 + }, + { + "epoch": 1.3081761006289307, + "grad_norm": 2.7662660096000793, + "learning_rate": 4.8205220674911075e-06, + "loss": 0.3928, + "step": 234 + }, + { + "epoch": 1.3193570929419987, + "grad_norm": 2.245517906271987, + "learning_rate": 4.81722430330817e-06, + "loss": 0.355, + "step": 236 + }, + { + "epoch": 1.3305380852550663, + "grad_norm": 2.684087860818518, + "learning_rate": 4.813897666428054e-06, + "loss": 0.3624, + "step": 238 + }, + { + "epoch": 1.3417190775681342, + "grad_norm": 2.5507370157459865, + "learning_rate": 4.810542198300772e-06, + "loss": 0.3494, + "step": 240 + }, + { + "epoch": 1.352900069881202, + "grad_norm": 2.157612559104276, + "learning_rate": 4.807157940735577e-06, + "loss": 0.3064, + "step": 242 + }, + { + "epoch": 1.3640810621942698, + "grad_norm": 1.9389355017962189, + "learning_rate": 4.803744935900439e-06, + "loss": 0.3331, + "step": 244 + }, + { + "epoch": 1.3752620545073375, + "grad_norm": 2.3147558047608867, + "learning_rate": 4.8003032263215185e-06, + "loss": 0.3538, + "step": 246 + }, + { + "epoch": 1.3864430468204052, + "grad_norm": 2.414181223767401, + "learning_rate": 4.79683285488264e-06, + "loss": 0.3237, + "step": 248 + }, + { + "epoch": 1.397624039133473, + "grad_norm": 2.0498128676624368, + "learning_rate": 4.793333864824756e-06, + "loss": 0.3742, + "step": 250 + }, + { + "epoch": 1.408805031446541, + "grad_norm": 2.2294049255917416, + "learning_rate": 4.789806299745405e-06, + "loss": 0.2948, + "step": 252 + }, + { + "epoch": 1.4199860237596087, + "grad_norm": 2.2210196470155923, + "learning_rate": 4.786250203598174e-06, + "loss": 0.28, + "step": 254 + }, + { + "epoch": 1.4311670160726764, + "grad_norm": 2.6896787603814816, + "learning_rate": 4.782665620692147e-06, + "loss": 0.3513, + "step": 256 + }, + { + "epoch": 1.4423480083857443, + "grad_norm": 2.1151921249556644, + "learning_rate": 4.779052595691355e-06, + "loss": 0.3598, + "step": 258 + }, + { + "epoch": 1.453529000698812, + "grad_norm": 2.6404538176276047, + "learning_rate": 4.775411173614218e-06, + "loss": 0.3075, + "step": 260 + }, + { + "epoch": 1.4647099930118799, + "grad_norm": 1.9888888421343762, + "learning_rate": 4.771741399832984e-06, + "loss": 0.356, + "step": 262 + }, + { + "epoch": 1.4758909853249476, + "grad_norm": 2.284642426340359, + "learning_rate": 4.768043320073165e-06, + "loss": 0.2765, + "step": 264 + }, + { + "epoch": 1.4870719776380152, + "grad_norm": 2.135563450656965, + "learning_rate": 4.764316980412966e-06, + "loss": 0.2825, + "step": 266 + }, + { + "epoch": 1.4982529699510831, + "grad_norm": 1.8267552790003188, + "learning_rate": 4.7605624272827125e-06, + "loss": 0.3915, + "step": 268 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 2.26569092336033, + "learning_rate": 4.75677970746427e-06, + "loss": 0.3859, + "step": 270 + }, + { + "epoch": 1.5206149545772187, + "grad_norm": 2.3510908940666346, + "learning_rate": 4.75296886809046e-06, + "loss": 0.312, + "step": 272 + }, + { + "epoch": 1.5317959468902864, + "grad_norm": 2.1562478846600883, + "learning_rate": 4.749129956644477e-06, + "loss": 0.4398, + "step": 274 + }, + { + "epoch": 1.5429769392033543, + "grad_norm": 2.1811966726037655, + "learning_rate": 4.745263020959296e-06, + "loss": 0.3221, + "step": 276 + }, + { + "epoch": 1.5541579315164222, + "grad_norm": 2.035643810106488, + "learning_rate": 4.741368109217072e-06, + "loss": 0.3317, + "step": 278 + }, + { + "epoch": 1.56533892382949, + "grad_norm": 2.0722038381676824, + "learning_rate": 4.737445269948543e-06, + "loss": 0.4627, + "step": 280 + }, + { + "epoch": 1.5765199161425576, + "grad_norm": 2.2584403073433212, + "learning_rate": 4.733494552032426e-06, + "loss": 0.352, + "step": 282 + }, + { + "epoch": 1.5877009084556253, + "grad_norm": 3.1127410509937783, + "learning_rate": 4.729516004694808e-06, + "loss": 0.3109, + "step": 284 + }, + { + "epoch": 1.5988819007686932, + "grad_norm": 1.6930738402579835, + "learning_rate": 4.725509677508528e-06, + "loss": 0.3723, + "step": 286 + }, + { + "epoch": 1.610062893081761, + "grad_norm": 2.6225330496610573, + "learning_rate": 4.721475620392567e-06, + "loss": 0.2853, + "step": 288 + }, + { + "epoch": 1.6212438853948288, + "grad_norm": 1.998954970455011, + "learning_rate": 4.71741388361142e-06, + "loss": 0.323, + "step": 290 + }, + { + "epoch": 1.6324248777078965, + "grad_norm": 2.3952745413220677, + "learning_rate": 4.713324517774471e-06, + "loss": 0.4057, + "step": 292 + }, + { + "epoch": 1.6436058700209644, + "grad_norm": 1.7339961999135642, + "learning_rate": 4.7092075738353625e-06, + "loss": 0.2855, + "step": 294 + }, + { + "epoch": 1.6547868623340323, + "grad_norm": 2.3672466509243075, + "learning_rate": 4.705063103091365e-06, + "loss": 0.277, + "step": 296 + }, + { + "epoch": 1.6659678546471, + "grad_norm": 1.92096238087282, + "learning_rate": 4.700891157182729e-06, + "loss": 0.2699, + "step": 298 + }, + { + "epoch": 1.6771488469601676, + "grad_norm": 1.6478187267877538, + "learning_rate": 4.696691788092049e-06, + "loss": 0.2875, + "step": 300 + }, + { + "epoch": 1.6883298392732355, + "grad_norm": 2.6637144089516545, + "learning_rate": 4.692465048143615e-06, + "loss": 0.3229, + "step": 302 + }, + { + "epoch": 1.6995108315863034, + "grad_norm": 2.0530281428374084, + "learning_rate": 4.688210990002755e-06, + "loss": 0.3546, + "step": 304 + }, + { + "epoch": 1.7106918238993711, + "grad_norm": 2.150198399781322, + "learning_rate": 4.683929666675185e-06, + "loss": 0.4021, + "step": 306 + }, + { + "epoch": 1.7218728162124388, + "grad_norm": 2.1752313572704542, + "learning_rate": 4.679621131506347e-06, + "loss": 0.3299, + "step": 308 + }, + { + "epoch": 1.7330538085255065, + "grad_norm": 1.9055889494341978, + "learning_rate": 4.6752854381807414e-06, + "loss": 0.2514, + "step": 310 + }, + { + "epoch": 1.7442348008385744, + "grad_norm": 2.469483649303522, + "learning_rate": 4.670922640721261e-06, + "loss": 0.332, + "step": 312 + }, + { + "epoch": 1.7554157931516423, + "grad_norm": 2.327049750502898, + "learning_rate": 4.666532793488518e-06, + "loss": 0.3482, + "step": 314 + }, + { + "epoch": 1.76659678546471, + "grad_norm": 2.0224582609864674, + "learning_rate": 4.662115951180164e-06, + "loss": 0.3192, + "step": 316 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 1.9568416201882894, + "learning_rate": 4.657672168830211e-06, + "loss": 0.2682, + "step": 318 + }, + { + "epoch": 1.7889587700908456, + "grad_norm": 1.919410926201314, + "learning_rate": 4.653201501808346e-06, + "loss": 0.3602, + "step": 320 + }, + { + "epoch": 1.8001397624039135, + "grad_norm": 2.239752835185363, + "learning_rate": 4.6487040058192385e-06, + "loss": 0.346, + "step": 322 + }, + { + "epoch": 1.8113207547169812, + "grad_norm": 2.3820790461811643, + "learning_rate": 4.644179736901848e-06, + "loss": 0.393, + "step": 324 + }, + { + "epoch": 1.8225017470300489, + "grad_norm": 2.100652056063807, + "learning_rate": 4.639628751428728e-06, + "loss": 0.3348, + "step": 326 + }, + { + "epoch": 1.8336827393431165, + "grad_norm": 1.839587786014522, + "learning_rate": 4.635051106105316e-06, + "loss": 0.297, + "step": 328 + }, + { + "epoch": 1.8448637316561844, + "grad_norm": 1.460937373317575, + "learning_rate": 4.630446857969238e-06, + "loss": 0.3291, + "step": 330 + }, + { + "epoch": 1.8560447239692524, + "grad_norm": 3.066440662132836, + "learning_rate": 4.625816064389589e-06, + "loss": 0.2752, + "step": 332 + }, + { + "epoch": 1.86722571628232, + "grad_norm": 1.9596525632755366, + "learning_rate": 4.62115878306622e-06, + "loss": 0.3444, + "step": 334 + }, + { + "epoch": 1.8784067085953877, + "grad_norm": 2.2835299782118335, + "learning_rate": 4.616475072029024e-06, + "loss": 0.3013, + "step": 336 + }, + { + "epoch": 1.8895877009084556, + "grad_norm": 2.1330589159921756, + "learning_rate": 4.6117649896372055e-06, + "loss": 0.3811, + "step": 338 + }, + { + "epoch": 1.9007686932215235, + "grad_norm": 2.28792058261577, + "learning_rate": 4.607028594578559e-06, + "loss": 0.304, + "step": 340 + }, + { + "epoch": 1.9119496855345912, + "grad_norm": 1.8457539990364031, + "learning_rate": 4.602265945868735e-06, + "loss": 0.2817, + "step": 342 + }, + { + "epoch": 1.923130677847659, + "grad_norm": 1.7860630390403116, + "learning_rate": 4.597477102850506e-06, + "loss": 0.3166, + "step": 344 + }, + { + "epoch": 1.9343116701607268, + "grad_norm": 1.988441202911347, + "learning_rate": 4.592662125193027e-06, + "loss": 0.2881, + "step": 346 + }, + { + "epoch": 1.9454926624737947, + "grad_norm": 1.7341207391896365, + "learning_rate": 4.587821072891089e-06, + "loss": 0.3126, + "step": 348 + }, + { + "epoch": 1.9566736547868624, + "grad_norm": 1.8960045369195677, + "learning_rate": 4.582954006264377e-06, + "loss": 0.32, + "step": 350 + }, + { + "epoch": 1.96785464709993, + "grad_norm": 1.8028316706058551, + "learning_rate": 4.578060985956714e-06, + "loss": 0.3308, + "step": 352 + }, + { + "epoch": 1.9790356394129978, + "grad_norm": 1.7537644172052635, + "learning_rate": 4.573142072935307e-06, + "loss": 0.325, + "step": 354 + }, + { + "epoch": 1.9902166317260657, + "grad_norm": 1.5291097261080726, + "learning_rate": 4.568197328489986e-06, + "loss": 0.3418, + "step": 356 + }, + { + "epoch": 2.0013976240391336, + "grad_norm": 2.703429613422267, + "learning_rate": 4.563226814232444e-06, + "loss": 0.316, + "step": 358 + }, + { + "epoch": 2.0125786163522013, + "grad_norm": 1.6677019482039983, + "learning_rate": 4.558230592095465e-06, + "loss": 0.2242, + "step": 360 + }, + { + "epoch": 2.023759608665269, + "grad_norm": 2.1855279147060527, + "learning_rate": 4.5532087243321536e-06, + "loss": 0.1706, + "step": 362 + }, + { + "epoch": 2.0349406009783366, + "grad_norm": 1.433260386596143, + "learning_rate": 4.548161273515161e-06, + "loss": 0.2597, + "step": 364 + }, + { + "epoch": 2.0461215932914047, + "grad_norm": 1.9528007044032762, + "learning_rate": 4.543088302535903e-06, + "loss": 0.2321, + "step": 366 + }, + { + "epoch": 2.0573025856044724, + "grad_norm": 1.508509476663671, + "learning_rate": 4.53798987460378e-06, + "loss": 0.1975, + "step": 368 + }, + { + "epoch": 2.06848357791754, + "grad_norm": 1.4870411030447606, + "learning_rate": 4.532866053245385e-06, + "loss": 0.218, + "step": 370 + }, + { + "epoch": 2.079664570230608, + "grad_norm": 1.984299603467917, + "learning_rate": 4.527716902303713e-06, + "loss": 0.1866, + "step": 372 + }, + { + "epoch": 2.090845562543676, + "grad_norm": 1.7502708144873231, + "learning_rate": 4.522542485937369e-06, + "loss": 0.2128, + "step": 374 + }, + { + "epoch": 2.1020265548567436, + "grad_norm": 1.131006072907252, + "learning_rate": 4.517342868619764e-06, + "loss": 0.2418, + "step": 376 + }, + { + "epoch": 2.1132075471698113, + "grad_norm": 2.365723778930082, + "learning_rate": 4.512118115138315e-06, + "loss": 0.2249, + "step": 378 + }, + { + "epoch": 2.124388539482879, + "grad_norm": 1.7739738087900154, + "learning_rate": 4.506868290593635e-06, + "loss": 0.225, + "step": 380 + }, + { + "epoch": 2.135569531795947, + "grad_norm": 2.3920039733015197, + "learning_rate": 4.501593460398726e-06, + "loss": 0.207, + "step": 382 + }, + { + "epoch": 2.146750524109015, + "grad_norm": 1.3961875749075527, + "learning_rate": 4.49629369027816e-06, + "loss": 0.1847, + "step": 384 + }, + { + "epoch": 2.1579315164220825, + "grad_norm": 1.740079266616333, + "learning_rate": 4.490969046267258e-06, + "loss": 0.2092, + "step": 386 + }, + { + "epoch": 2.16911250873515, + "grad_norm": 1.716849109423316, + "learning_rate": 4.485619594711278e-06, + "loss": 0.2512, + "step": 388 + }, + { + "epoch": 2.180293501048218, + "grad_norm": 2.2256205473256836, + "learning_rate": 4.4802454022645725e-06, + "loss": 0.2212, + "step": 390 + }, + { + "epoch": 2.191474493361286, + "grad_norm": 1.5080548485099736, + "learning_rate": 4.474846535889773e-06, + "loss": 0.2577, + "step": 392 + }, + { + "epoch": 2.2026554856743537, + "grad_norm": 1.849350001917602, + "learning_rate": 4.469423062856946e-06, + "loss": 0.2518, + "step": 394 + }, + { + "epoch": 2.2138364779874213, + "grad_norm": 2.0456903454646937, + "learning_rate": 4.463975050742757e-06, + "loss": 0.2666, + "step": 396 + }, + { + "epoch": 2.225017470300489, + "grad_norm": 2.1576955140860172, + "learning_rate": 4.4585025674296315e-06, + "loss": 0.1881, + "step": 398 + }, + { + "epoch": 2.236198462613557, + "grad_norm": 1.959825305986428, + "learning_rate": 4.453005681104906e-06, + "loss": 0.1912, + "step": 400 + }, + { + "epoch": 2.247379454926625, + "grad_norm": 1.8263078605633967, + "learning_rate": 4.44748446025998e-06, + "loss": 0.177, + "step": 402 + }, + { + "epoch": 2.2585604472396925, + "grad_norm": 1.3737693376807456, + "learning_rate": 4.44193897368946e-06, + "loss": 0.2083, + "step": 404 + }, + { + "epoch": 2.26974143955276, + "grad_norm": 1.9216745648550881, + "learning_rate": 4.436369290490307e-06, + "loss": 0.269, + "step": 406 + }, + { + "epoch": 2.280922431865828, + "grad_norm": 1.5225068983698562, + "learning_rate": 4.430775480060973e-06, + "loss": 0.2043, + "step": 408 + }, + { + "epoch": 2.292103424178896, + "grad_norm": 1.958524495155971, + "learning_rate": 4.425157612100531e-06, + "loss": 0.2735, + "step": 410 + }, + { + "epoch": 2.3032844164919637, + "grad_norm": 2.020109840115744, + "learning_rate": 4.419515756607819e-06, + "loss": 0.2623, + "step": 412 + }, + { + "epoch": 2.3144654088050314, + "grad_norm": 1.6832635446278787, + "learning_rate": 4.413849983880554e-06, + "loss": 0.2122, + "step": 414 + }, + { + "epoch": 2.325646401118099, + "grad_norm": 1.8238819367042174, + "learning_rate": 4.4081603645144685e-06, + "loss": 0.2141, + "step": 416 + }, + { + "epoch": 2.336827393431167, + "grad_norm": 1.636664838162331, + "learning_rate": 4.4024469694024194e-06, + "loss": 0.2159, + "step": 418 + }, + { + "epoch": 2.348008385744235, + "grad_norm": 1.563361723149053, + "learning_rate": 4.396709869733515e-06, + "loss": 0.2636, + "step": 420 + }, + { + "epoch": 2.3591893780573026, + "grad_norm": 1.7104549540666967, + "learning_rate": 4.39094913699222e-06, + "loss": 0.2059, + "step": 422 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 1.7448299629844894, + "learning_rate": 4.385164842957469e-06, + "loss": 0.2076, + "step": 424 + }, + { + "epoch": 2.381551362683438, + "grad_norm": 2.0760771369111812, + "learning_rate": 4.379357059701771e-06, + "loss": 0.2241, + "step": 426 + }, + { + "epoch": 2.392732354996506, + "grad_norm": 1.4610379659131663, + "learning_rate": 4.373525859590313e-06, + "loss": 0.2135, + "step": 428 + }, + { + "epoch": 2.4039133473095737, + "grad_norm": 1.9763200369365506, + "learning_rate": 4.367671315280055e-06, + "loss": 0.2225, + "step": 430 + }, + { + "epoch": 2.4150943396226414, + "grad_norm": 2.138415914668256, + "learning_rate": 4.3617934997188274e-06, + "loss": 0.2618, + "step": 432 + }, + { + "epoch": 2.426275331935709, + "grad_norm": 1.6842725394389781, + "learning_rate": 4.355892486144419e-06, + "loss": 0.1691, + "step": 434 + }, + { + "epoch": 2.4374563242487772, + "grad_norm": 2.056626946764254, + "learning_rate": 4.349968348083673e-06, + "loss": 0.1922, + "step": 436 + }, + { + "epoch": 2.448637316561845, + "grad_norm": 1.2423274511146358, + "learning_rate": 4.3440211593515556e-06, + "loss": 0.2061, + "step": 438 + }, + { + "epoch": 2.4598183088749126, + "grad_norm": 1.465237522133527, + "learning_rate": 4.338050994050253e-06, + "loss": 0.1996, + "step": 440 + }, + { + "epoch": 2.4709993011879803, + "grad_norm": 2.1451900105983315, + "learning_rate": 4.332057926568235e-06, + "loss": 0.2441, + "step": 442 + }, + { + "epoch": 2.4821802935010484, + "grad_norm": 1.5259606296511572, + "learning_rate": 4.326042031579337e-06, + "loss": 0.2066, + "step": 444 + }, + { + "epoch": 2.493361285814116, + "grad_norm": 2.4163109674867784, + "learning_rate": 4.320003384041823e-06, + "loss": 0.2393, + "step": 446 + }, + { + "epoch": 2.5045422781271838, + "grad_norm": 2.1518283309231907, + "learning_rate": 4.313942059197457e-06, + "loss": 0.2467, + "step": 448 + }, + { + "epoch": 2.5157232704402515, + "grad_norm": 1.6715387204280183, + "learning_rate": 4.3078581325705614e-06, + "loss": 0.2495, + "step": 450 + }, + { + "epoch": 2.5269042627533196, + "grad_norm": 1.7729216990478125, + "learning_rate": 4.3017516799670785e-06, + "loss": 0.1586, + "step": 452 + }, + { + "epoch": 2.5380852550663873, + "grad_norm": 1.7853923740535589, + "learning_rate": 4.295622777473625e-06, + "loss": 0.2216, + "step": 454 + }, + { + "epoch": 2.549266247379455, + "grad_norm": 1.7001940457803237, + "learning_rate": 4.289471501456543e-06, + "loss": 0.2288, + "step": 456 + }, + { + "epoch": 2.5604472396925226, + "grad_norm": 2.5868877625212354, + "learning_rate": 4.283297928560951e-06, + "loss": 0.2075, + "step": 458 + }, + { + "epoch": 2.5716282320055903, + "grad_norm": 2.1990912649669823, + "learning_rate": 4.277102135709786e-06, + "loss": 0.2017, + "step": 460 + }, + { + "epoch": 2.582809224318658, + "grad_norm": 2.2627396419665273, + "learning_rate": 4.270884200102848e-06, + "loss": 0.2144, + "step": 462 + }, + { + "epoch": 2.593990216631726, + "grad_norm": 2.2283930780278505, + "learning_rate": 4.2646441992158356e-06, + "loss": 0.3, + "step": 464 + }, + { + "epoch": 2.605171208944794, + "grad_norm": 2.6765537923336087, + "learning_rate": 4.258382210799381e-06, + "loss": 0.2441, + "step": 466 + }, + { + "epoch": 2.6163522012578615, + "grad_norm": 2.0124117535310706, + "learning_rate": 4.252098312878083e-06, + "loss": 0.2667, + "step": 468 + }, + { + "epoch": 2.6275331935709296, + "grad_norm": 2.0622543839995586, + "learning_rate": 4.245792583749533e-06, + "loss": 0.2209, + "step": 470 + }, + { + "epoch": 2.6387141858839973, + "grad_norm": 1.7479329049755916, + "learning_rate": 4.2394651019833385e-06, + "loss": 0.2045, + "step": 472 + }, + { + "epoch": 2.649895178197065, + "grad_norm": 2.223724201139868, + "learning_rate": 4.23311594642015e-06, + "loss": 0.2283, + "step": 474 + }, + { + "epoch": 2.6610761705101327, + "grad_norm": 1.8280919056271019, + "learning_rate": 4.226745196170669e-06, + "loss": 0.2319, + "step": 476 + }, + { + "epoch": 2.6722571628232004, + "grad_norm": 1.6911807333452673, + "learning_rate": 4.220352930614672e-06, + "loss": 0.232, + "step": 478 + }, + { + "epoch": 2.6834381551362685, + "grad_norm": 1.9242468593637576, + "learning_rate": 4.213939229400014e-06, + "loss": 0.2733, + "step": 480 + }, + { + "epoch": 2.694619147449336, + "grad_norm": 2.1223012349945254, + "learning_rate": 4.20750417244164e-06, + "loss": 0.2529, + "step": 482 + }, + { + "epoch": 2.705800139762404, + "grad_norm": 2.1921742273194313, + "learning_rate": 4.201047839920589e-06, + "loss": 0.257, + "step": 484 + }, + { + "epoch": 2.7169811320754715, + "grad_norm": 2.118251084662083, + "learning_rate": 4.194570312282993e-06, + "loss": 0.235, + "step": 486 + }, + { + "epoch": 2.7281621243885397, + "grad_norm": 1.9816644323530734, + "learning_rate": 4.1880716702390764e-06, + "loss": 0.1839, + "step": 488 + }, + { + "epoch": 2.7393431167016074, + "grad_norm": 1.8891363830208663, + "learning_rate": 4.181551994762151e-06, + "loss": 0.2301, + "step": 490 + }, + { + "epoch": 2.750524109014675, + "grad_norm": 1.7502840233703516, + "learning_rate": 4.1750113670876045e-06, + "loss": 0.1883, + "step": 492 + }, + { + "epoch": 2.7617051013277427, + "grad_norm": 1.5627429248705165, + "learning_rate": 4.16844986871189e-06, + "loss": 0.2042, + "step": 494 + }, + { + "epoch": 2.7728860936408104, + "grad_norm": 1.8631447011251083, + "learning_rate": 4.161867581391511e-06, + "loss": 0.2018, + "step": 496 + }, + { + "epoch": 2.7840670859538785, + "grad_norm": 2.0906363974353765, + "learning_rate": 4.155264587142002e-06, + "loss": 0.2319, + "step": 498 + }, + { + "epoch": 2.795248078266946, + "grad_norm": 1.7819164584799931, + "learning_rate": 4.148640968236903e-06, + "loss": 0.1703, + "step": 500 + }, + { + "epoch": 2.806429070580014, + "grad_norm": 1.7607086842324982, + "learning_rate": 4.141996807206745e-06, + "loss": 0.2264, + "step": 502 + }, + { + "epoch": 2.817610062893082, + "grad_norm": 1.5277530729360727, + "learning_rate": 4.135332186838008e-06, + "loss": 0.2134, + "step": 504 + }, + { + "epoch": 2.8287910552061497, + "grad_norm": 1.739277840645659, + "learning_rate": 4.128647190172099e-06, + "loss": 0.1952, + "step": 506 + }, + { + "epoch": 2.8399720475192174, + "grad_norm": 1.9987218712547774, + "learning_rate": 4.121941900504316e-06, + "loss": 0.2364, + "step": 508 + }, + { + "epoch": 2.851153039832285, + "grad_norm": 2.2244662318443225, + "learning_rate": 4.1152164013828035e-06, + "loss": 0.2072, + "step": 510 + }, + { + "epoch": 2.8623340321453528, + "grad_norm": 1.526547678145968, + "learning_rate": 4.108470776607521e-06, + "loss": 0.2047, + "step": 512 + }, + { + "epoch": 2.8735150244584204, + "grad_norm": 2.005093613185987, + "learning_rate": 4.1017051102291946e-06, + "loss": 0.2789, + "step": 514 + }, + { + "epoch": 2.8846960167714886, + "grad_norm": 2.2990829029486624, + "learning_rate": 4.094919486548266e-06, + "loss": 0.2414, + "step": 516 + }, + { + "epoch": 2.8958770090845563, + "grad_norm": 2.13743283403912, + "learning_rate": 4.088113990113846e-06, + "loss": 0.2029, + "step": 518 + }, + { + "epoch": 2.907058001397624, + "grad_norm": 1.9027626030017704, + "learning_rate": 4.081288705722666e-06, + "loss": 0.2229, + "step": 520 + }, + { + "epoch": 2.918238993710692, + "grad_norm": 2.0076859155071745, + "learning_rate": 4.074443718418009e-06, + "loss": 0.1995, + "step": 522 + }, + { + "epoch": 2.9294199860237597, + "grad_norm": 1.7985240007466619, + "learning_rate": 4.067579113488661e-06, + "loss": 0.1807, + "step": 524 + }, + { + "epoch": 2.9406009783368274, + "grad_norm": 2.140934337000471, + "learning_rate": 4.060694976467844e-06, + "loss": 0.2532, + "step": 526 + }, + { + "epoch": 2.951781970649895, + "grad_norm": 2.323003193893417, + "learning_rate": 4.0537913931321495e-06, + "loss": 0.2421, + "step": 528 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 1.4532319163010707, + "learning_rate": 4.04686844950047e-06, + "loss": 0.2267, + "step": 530 + }, + { + "epoch": 2.9741439552760305, + "grad_norm": 2.0854922336923023, + "learning_rate": 4.039926231832931e-06, + "loss": 0.266, + "step": 532 + }, + { + "epoch": 2.9853249475890986, + "grad_norm": 2.882533995321225, + "learning_rate": 4.032964826629811e-06, + "loss": 0.2079, + "step": 534 + }, + { + "epoch": 2.9965059399021663, + "grad_norm": 2.7236955724192873, + "learning_rate": 4.025984320630465e-06, + "loss": 0.1657, + "step": 536 + }, + { + "epoch": 3.007686932215234, + "grad_norm": 1.8432900490614266, + "learning_rate": 4.018984800812248e-06, + "loss": 0.1354, + "step": 538 + }, + { + "epoch": 3.018867924528302, + "grad_norm": 2.0142515580054017, + "learning_rate": 4.011966354389424e-06, + "loss": 0.1542, + "step": 540 + }, + { + "epoch": 3.03004891684137, + "grad_norm": 2.756352182005047, + "learning_rate": 4.004929068812086e-06, + "loss": 0.1638, + "step": 542 + }, + { + "epoch": 3.0412299091544375, + "grad_norm": 2.048077691313813, + "learning_rate": 3.997873031765061e-06, + "loss": 0.156, + "step": 544 + }, + { + "epoch": 3.052410901467505, + "grad_norm": 1.7442233155652336, + "learning_rate": 3.990798331166822e-06, + "loss": 0.1095, + "step": 546 + }, + { + "epoch": 3.063591893780573, + "grad_norm": 1.826861973142375, + "learning_rate": 3.983705055168391e-06, + "loss": 0.1195, + "step": 548 + }, + { + "epoch": 3.074772886093641, + "grad_norm": 1.943175517862748, + "learning_rate": 3.976593292152238e-06, + "loss": 0.1638, + "step": 550 + }, + { + "epoch": 3.0859538784067087, + "grad_norm": 1.5477727978546996, + "learning_rate": 3.969463130731183e-06, + "loss": 0.1291, + "step": 552 + }, + { + "epoch": 3.0971348707197763, + "grad_norm": 2.3918080397656034, + "learning_rate": 3.9623146597472915e-06, + "loss": 0.1333, + "step": 554 + }, + { + "epoch": 3.108315863032844, + "grad_norm": 2.0592865934704, + "learning_rate": 3.955147968270764e-06, + "loss": 0.1692, + "step": 556 + }, + { + "epoch": 3.119496855345912, + "grad_norm": 1.280306245998938, + "learning_rate": 3.947963145598833e-06, + "loss": 0.1695, + "step": 558 + }, + { + "epoch": 3.13067784765898, + "grad_norm": 1.5568837418874426, + "learning_rate": 3.940760281254645e-06, + "loss": 0.1614, + "step": 560 + }, + { + "epoch": 3.1418588399720475, + "grad_norm": 1.6248982612645957, + "learning_rate": 3.933539464986143e-06, + "loss": 0.1184, + "step": 562 + }, + { + "epoch": 3.153039832285115, + "grad_norm": 1.657284019650329, + "learning_rate": 3.926300786764957e-06, + "loss": 0.1523, + "step": 564 + }, + { + "epoch": 3.164220824598183, + "grad_norm": 1.9315037734198213, + "learning_rate": 3.919044336785274e-06, + "loss": 0.1411, + "step": 566 + }, + { + "epoch": 3.175401816911251, + "grad_norm": 1.7456382044347782, + "learning_rate": 3.911770205462717e-06, + "loss": 0.1764, + "step": 568 + }, + { + "epoch": 3.1865828092243187, + "grad_norm": 1.4045398532057205, + "learning_rate": 3.904478483433223e-06, + "loss": 0.1241, + "step": 570 + }, + { + "epoch": 3.1977638015373864, + "grad_norm": 2.0886459168414895, + "learning_rate": 3.897169261551907e-06, + "loss": 0.1475, + "step": 572 + }, + { + "epoch": 3.208944793850454, + "grad_norm": 1.9098750157027404, + "learning_rate": 3.889842630891934e-06, + "loss": 0.138, + "step": 574 + }, + { + "epoch": 3.220125786163522, + "grad_norm": 2.184899827108709, + "learning_rate": 3.8824986827433804e-06, + "loss": 0.1315, + "step": 576 + }, + { + "epoch": 3.23130677847659, + "grad_norm": 1.528868394326383, + "learning_rate": 3.875137508612104e-06, + "loss": 0.1447, + "step": 578 + }, + { + "epoch": 3.2424877707896576, + "grad_norm": 1.6893708687857107, + "learning_rate": 3.867759200218594e-06, + "loss": 0.1746, + "step": 580 + }, + { + "epoch": 3.2536687631027252, + "grad_norm": 1.2610411246909474, + "learning_rate": 3.860363849496836e-06, + "loss": 0.1301, + "step": 582 + }, + { + "epoch": 3.264849755415793, + "grad_norm": 1.397542140556738, + "learning_rate": 3.852951548593161e-06, + "loss": 0.1373, + "step": 584 + }, + { + "epoch": 3.276030747728861, + "grad_norm": 1.9903353672741917, + "learning_rate": 3.845522389865106e-06, + "loss": 0.1609, + "step": 586 + }, + { + "epoch": 3.2872117400419287, + "grad_norm": 1.8370941337314268, + "learning_rate": 3.838076465880248e-06, + "loss": 0.148, + "step": 588 + }, + { + "epoch": 3.2983927323549964, + "grad_norm": 2.058865100613852, + "learning_rate": 3.830613869415069e-06, + "loss": 0.1483, + "step": 590 + }, + { + "epoch": 3.309573724668064, + "grad_norm": 1.5232253694216566, + "learning_rate": 3.823134693453782e-06, + "loss": 0.1621, + "step": 592 + }, + { + "epoch": 3.3207547169811322, + "grad_norm": 1.4993049111722665, + "learning_rate": 3.8156390311871885e-06, + "loss": 0.1433, + "step": 594 + }, + { + "epoch": 3.3319357092942, + "grad_norm": 1.555934394379587, + "learning_rate": 3.808126976011505e-06, + "loss": 0.1426, + "step": 596 + }, + { + "epoch": 3.3431167016072676, + "grad_norm": 1.3356473446523094, + "learning_rate": 3.8005986215272056e-06, + "loss": 0.1706, + "step": 598 + }, + { + "epoch": 3.3542976939203353, + "grad_norm": 1.9137688829035275, + "learning_rate": 3.7930540615378565e-06, + "loss": 0.1268, + "step": 600 + }, + { + "epoch": 3.3654786862334034, + "grad_norm": 1.5344748040953766, + "learning_rate": 3.785493390048942e-06, + "loss": 0.1458, + "step": 602 + }, + { + "epoch": 3.376659678546471, + "grad_norm": 1.602087497610558, + "learning_rate": 3.777916701266699e-06, + "loss": 0.1697, + "step": 604 + }, + { + "epoch": 3.3878406708595388, + "grad_norm": 1.4842568873334896, + "learning_rate": 3.7703240895969373e-06, + "loss": 0.1519, + "step": 606 + }, + { + "epoch": 3.3990216631726065, + "grad_norm": 1.53860971256147, + "learning_rate": 3.7627156496438686e-06, + "loss": 0.1691, + "step": 608 + }, + { + "epoch": 3.4102026554856746, + "grad_norm": 1.4193083610134813, + "learning_rate": 3.755091476208925e-06, + "loss": 0.1211, + "step": 610 + }, + { + "epoch": 3.4213836477987423, + "grad_norm": 1.8053625548432577, + "learning_rate": 3.7474516642895804e-06, + "loss": 0.131, + "step": 612 + }, + { + "epoch": 3.43256464011181, + "grad_norm": 1.9235537907938398, + "learning_rate": 3.7397963090781606e-06, + "loss": 0.163, + "step": 614 + }, + { + "epoch": 3.4437456324248776, + "grad_norm": 1.6022979215271898, + "learning_rate": 3.732125505960665e-06, + "loss": 0.1479, + "step": 616 + }, + { + "epoch": 3.4549266247379453, + "grad_norm": 1.663918706474492, + "learning_rate": 3.7244393505155713e-06, + "loss": 0.1376, + "step": 618 + }, + { + "epoch": 3.4661076170510134, + "grad_norm": 1.7974067820999995, + "learning_rate": 3.716737938512651e-06, + "loss": 0.1281, + "step": 620 + }, + { + "epoch": 3.477288609364081, + "grad_norm": 2.10108609081228, + "learning_rate": 3.709021365911772e-06, + "loss": 0.1388, + "step": 622 + }, + { + "epoch": 3.488469601677149, + "grad_norm": 1.367826215107555, + "learning_rate": 3.701289728861701e-06, + "loss": 0.1191, + "step": 624 + }, + { + "epoch": 3.4996505939902165, + "grad_norm": 1.7959553374302317, + "learning_rate": 3.693543123698913e-06, + "loss": 0.1758, + "step": 626 + }, + { + "epoch": 3.5108315863032846, + "grad_norm": 1.7389366148854988, + "learning_rate": 3.6857816469463806e-06, + "loss": 0.1405, + "step": 628 + }, + { + "epoch": 3.5220125786163523, + "grad_norm": 2.871162474790627, + "learning_rate": 3.6780053953123836e-06, + "loss": 0.1549, + "step": 630 + }, + { + "epoch": 3.53319357092942, + "grad_norm": 1.478751565339363, + "learning_rate": 3.6702144656892907e-06, + "loss": 0.1759, + "step": 632 + }, + { + "epoch": 3.5443745632424877, + "grad_norm": 1.4974413518112613, + "learning_rate": 3.662408955152364e-06, + "loss": 0.1078, + "step": 634 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 1.7006067350332152, + "learning_rate": 3.6545889609585405e-06, + "loss": 0.1427, + "step": 636 + }, + { + "epoch": 3.5667365478686235, + "grad_norm": 1.8754398825641954, + "learning_rate": 3.6467545805452266e-06, + "loss": 0.1893, + "step": 638 + }, + { + "epoch": 3.577917540181691, + "grad_norm": 1.7762501705151392, + "learning_rate": 3.6389059115290813e-06, + "loss": 0.1109, + "step": 640 + }, + { + "epoch": 3.589098532494759, + "grad_norm": 2.0251975300449327, + "learning_rate": 3.631043051704799e-06, + "loss": 0.121, + "step": 642 + }, + { + "epoch": 3.6002795248078265, + "grad_norm": 1.3531681902278672, + "learning_rate": 3.6231660990438922e-06, + "loss": 0.1348, + "step": 644 + }, + { + "epoch": 3.6114605171208947, + "grad_norm": 1.9724391202631109, + "learning_rate": 3.615275151693471e-06, + "loss": 0.1449, + "step": 646 + }, + { + "epoch": 3.6226415094339623, + "grad_norm": 1.785158595271644, + "learning_rate": 3.6073703079750204e-06, + "loss": 0.1485, + "step": 648 + }, + { + "epoch": 3.63382250174703, + "grad_norm": 1.829166278099355, + "learning_rate": 3.5994516663831734e-06, + "loss": 0.1192, + "step": 650 + }, + { + "epoch": 3.6450034940600977, + "grad_norm": 1.9222881871208803, + "learning_rate": 3.591519325584487e-06, + "loss": 0.1635, + "step": 652 + }, + { + "epoch": 3.6561844863731654, + "grad_norm": 2.052453811112636, + "learning_rate": 3.583573384416209e-06, + "loss": 0.1561, + "step": 654 + }, + { + "epoch": 3.6673654786862335, + "grad_norm": 1.9190051036571132, + "learning_rate": 3.575613941885047e-06, + "loss": 0.1051, + "step": 656 + }, + { + "epoch": 3.678546470999301, + "grad_norm": 1.4736638642637576, + "learning_rate": 3.5676410971659404e-06, + "loss": 0.123, + "step": 658 + }, + { + "epoch": 3.689727463312369, + "grad_norm": 1.7325761695268906, + "learning_rate": 3.5596549496008165e-06, + "loss": 0.1446, + "step": 660 + }, + { + "epoch": 3.700908455625437, + "grad_norm": 2.0344810615726288, + "learning_rate": 3.551655598697358e-06, + "loss": 0.1629, + "step": 662 + }, + { + "epoch": 3.7120894479385047, + "grad_norm": 1.936581123166174, + "learning_rate": 3.54364314412776e-06, + "loss": 0.1569, + "step": 664 + }, + { + "epoch": 3.7232704402515724, + "grad_norm": 1.3525874354992642, + "learning_rate": 3.535617685727494e-06, + "loss": 0.1082, + "step": 666 + }, + { + "epoch": 3.73445143256464, + "grad_norm": 1.6514309403224916, + "learning_rate": 3.527579323494055e-06, + "loss": 0.1431, + "step": 668 + }, + { + "epoch": 3.7456324248777078, + "grad_norm": 1.8602451468342234, + "learning_rate": 3.5195281575857228e-06, + "loss": 0.1639, + "step": 670 + }, + { + "epoch": 3.7568134171907754, + "grad_norm": 1.4731268992440232, + "learning_rate": 3.511464288320311e-06, + "loss": 0.1271, + "step": 672 + }, + { + "epoch": 3.7679944095038436, + "grad_norm": 1.37724516129253, + "learning_rate": 3.503387816173916e-06, + "loss": 0.1597, + "step": 674 + }, + { + "epoch": 3.7791754018169113, + "grad_norm": 1.7200144334067748, + "learning_rate": 3.495298841779669e-06, + "loss": 0.117, + "step": 676 + }, + { + "epoch": 3.790356394129979, + "grad_norm": 1.92538314164391, + "learning_rate": 3.4871974659264786e-06, + "loss": 0.1584, + "step": 678 + }, + { + "epoch": 3.801537386443047, + "grad_norm": 1.4718208788605616, + "learning_rate": 3.4790837895577752e-06, + "loss": 0.1333, + "step": 680 + }, + { + "epoch": 3.8127183787561147, + "grad_norm": 1.5582481918696203, + "learning_rate": 3.470957913770255e-06, + "loss": 0.1464, + "step": 682 + }, + { + "epoch": 3.8238993710691824, + "grad_norm": 1.4618275028428347, + "learning_rate": 3.462819939812618e-06, + "loss": 0.0995, + "step": 684 + }, + { + "epoch": 3.83508036338225, + "grad_norm": 1.3366351935592664, + "learning_rate": 3.4546699690843123e-06, + "loss": 0.1204, + "step": 686 + }, + { + "epoch": 3.846261355695318, + "grad_norm": 1.3780079667316787, + "learning_rate": 3.446508103134259e-06, + "loss": 0.1701, + "step": 688 + }, + { + "epoch": 3.8574423480083855, + "grad_norm": 1.7451718870626607, + "learning_rate": 3.4383344436595992e-06, + "loss": 0.1158, + "step": 690 + }, + { + "epoch": 3.8686233403214536, + "grad_norm": 2.019474198008684, + "learning_rate": 3.430149092504422e-06, + "loss": 0.1304, + "step": 692 + }, + { + "epoch": 3.8798043326345213, + "grad_norm": 1.6820935429062616, + "learning_rate": 3.4219521516584912e-06, + "loss": 0.1334, + "step": 694 + }, + { + "epoch": 3.890985324947589, + "grad_norm": 2.2578057319721236, + "learning_rate": 3.4137437232559834e-06, + "loss": 0.1557, + "step": 696 + }, + { + "epoch": 3.902166317260657, + "grad_norm": 1.3610116271561221, + "learning_rate": 3.4055239095742067e-06, + "loss": 0.1644, + "step": 698 + }, + { + "epoch": 3.913347309573725, + "grad_norm": 1.3397050224861815, + "learning_rate": 3.3972928130323322e-06, + "loss": 0.1471, + "step": 700 + }, + { + "epoch": 3.9245283018867925, + "grad_norm": 1.5234658664307734, + "learning_rate": 3.3890505361901153e-06, + "loss": 0.1195, + "step": 702 + }, + { + "epoch": 3.93570929419986, + "grad_norm": 1.763362220735128, + "learning_rate": 3.380797181746619e-06, + "loss": 0.1363, + "step": 704 + }, + { + "epoch": 3.946890286512928, + "grad_norm": 2.038986301246902, + "learning_rate": 3.3725328525389324e-06, + "loss": 0.1203, + "step": 706 + }, + { + "epoch": 3.958071278825996, + "grad_norm": 1.9046513315579439, + "learning_rate": 3.364257651540891e-06, + "loss": 0.1578, + "step": 708 + }, + { + "epoch": 3.9692522711390636, + "grad_norm": 1.423399143627221, + "learning_rate": 3.355971681861794e-06, + "loss": 0.1211, + "step": 710 + }, + { + "epoch": 3.9804332634521313, + "grad_norm": 1.5586817639667492, + "learning_rate": 3.3476750467451176e-06, + "loss": 0.153, + "step": 712 + }, + { + "epoch": 3.991614255765199, + "grad_norm": 1.4814888460752178, + "learning_rate": 3.33936784956723e-06, + "loss": 0.1288, + "step": 714 + }, + { + "epoch": 4.002795248078267, + "grad_norm": 1.6561127976965244, + "learning_rate": 3.331050193836104e-06, + "loss": 0.1196, + "step": 716 + }, + { + "epoch": 4.013976240391335, + "grad_norm": 1.8246755797846792, + "learning_rate": 3.322722183190025e-06, + "loss": 0.0983, + "step": 718 + }, + { + "epoch": 4.0251572327044025, + "grad_norm": 1.2508646883720782, + "learning_rate": 3.3143839213963026e-06, + "loss": 0.1132, + "step": 720 + }, + { + "epoch": 4.03633822501747, + "grad_norm": 1.3174073933660169, + "learning_rate": 3.306035512349974e-06, + "loss": 0.0886, + "step": 722 + }, + { + "epoch": 4.047519217330538, + "grad_norm": 1.4006843207756257, + "learning_rate": 3.297677060072513e-06, + "loss": 0.0907, + "step": 724 + }, + { + "epoch": 4.058700209643606, + "grad_norm": 2.147633002379955, + "learning_rate": 3.2893086687105324e-06, + "loss": 0.0814, + "step": 726 + }, + { + "epoch": 4.069881201956673, + "grad_norm": 1.8499679148666142, + "learning_rate": 3.280930442534486e-06, + "loss": 0.0916, + "step": 728 + }, + { + "epoch": 4.081062194269742, + "grad_norm": 1.5576608674855401, + "learning_rate": 3.272542485937369e-06, + "loss": 0.0814, + "step": 730 + }, + { + "epoch": 4.0922431865828095, + "grad_norm": 1.5258204722757824, + "learning_rate": 3.264144903433419e-06, + "loss": 0.0929, + "step": 732 + }, + { + "epoch": 4.103424178895877, + "grad_norm": 1.2377371189448831, + "learning_rate": 3.2557377996568135e-06, + "loss": 0.0933, + "step": 734 + }, + { + "epoch": 4.114605171208945, + "grad_norm": 1.6706792363129992, + "learning_rate": 3.247321279360363e-06, + "loss": 0.0957, + "step": 736 + }, + { + "epoch": 4.1257861635220126, + "grad_norm": 1.5205095000978939, + "learning_rate": 3.238895447414211e-06, + "loss": 0.1094, + "step": 738 + }, + { + "epoch": 4.13696715583508, + "grad_norm": 1.8218111131497405, + "learning_rate": 3.2304604088045206e-06, + "loss": 0.0866, + "step": 740 + }, + { + "epoch": 4.148148148148148, + "grad_norm": 1.5060146063158792, + "learning_rate": 3.222016268632175e-06, + "loss": 0.0974, + "step": 742 + }, + { + "epoch": 4.159329140461216, + "grad_norm": 2.33394735696618, + "learning_rate": 3.2135631321114603e-06, + "loss": 0.0767, + "step": 744 + }, + { + "epoch": 4.170510132774284, + "grad_norm": 1.8304481485687374, + "learning_rate": 3.2051011045687574e-06, + "loss": 0.1027, + "step": 746 + }, + { + "epoch": 4.181691125087352, + "grad_norm": 1.4496933516097028, + "learning_rate": 3.196630291441231e-06, + "loss": 0.073, + "step": 748 + }, + { + "epoch": 4.1928721174004195, + "grad_norm": 1.5989097781751378, + "learning_rate": 3.1881507982755126e-06, + "loss": 0.074, + "step": 750 + }, + { + "epoch": 4.204053109713487, + "grad_norm": 1.5479651084913313, + "learning_rate": 3.17966273072639e-06, + "loss": 0.0941, + "step": 752 + }, + { + "epoch": 4.215234102026555, + "grad_norm": 1.4844971201883568, + "learning_rate": 3.1711661945554857e-06, + "loss": 0.1171, + "step": 754 + }, + { + "epoch": 4.226415094339623, + "grad_norm": 1.538555100844062, + "learning_rate": 3.162661295629942e-06, + "loss": 0.0839, + "step": 756 + }, + { + "epoch": 4.23759608665269, + "grad_norm": 1.511356916861757, + "learning_rate": 3.154148139921102e-06, + "loss": 0.1039, + "step": 758 + }, + { + "epoch": 4.248777078965758, + "grad_norm": 1.811476489190878, + "learning_rate": 3.1456268335031886e-06, + "loss": 0.0794, + "step": 760 + }, + { + "epoch": 4.259958071278826, + "grad_norm": 1.6229333309674812, + "learning_rate": 3.137097482551983e-06, + "loss": 0.1152, + "step": 762 + }, + { + "epoch": 4.271139063591894, + "grad_norm": 1.4723017587041405, + "learning_rate": 3.128560193343501e-06, + "loss": 0.0944, + "step": 764 + }, + { + "epoch": 4.282320055904962, + "grad_norm": 1.0034690245189755, + "learning_rate": 3.1200150722526693e-06, + "loss": 0.0663, + "step": 766 + }, + { + "epoch": 4.29350104821803, + "grad_norm": 1.5551415143149132, + "learning_rate": 3.1114622257520004e-06, + "loss": 0.1021, + "step": 768 + }, + { + "epoch": 4.304682040531097, + "grad_norm": 1.836559018121584, + "learning_rate": 3.1029017604102655e-06, + "loss": 0.099, + "step": 770 + }, + { + "epoch": 4.315863032844165, + "grad_norm": 1.0818921388079483, + "learning_rate": 3.0943337828911673e-06, + "loss": 0.0899, + "step": 772 + }, + { + "epoch": 4.327044025157233, + "grad_norm": 0.9784785751112162, + "learning_rate": 3.085758399952011e-06, + "loss": 0.1016, + "step": 774 + }, + { + "epoch": 4.3382250174703, + "grad_norm": 1.348338975607883, + "learning_rate": 3.0771757184423716e-06, + "loss": 0.1063, + "step": 776 + }, + { + "epoch": 4.349406009783368, + "grad_norm": 2.1529902019434455, + "learning_rate": 3.0685858453027668e-06, + "loss": 0.089, + "step": 778 + }, + { + "epoch": 4.360587002096436, + "grad_norm": 1.3031273077449874, + "learning_rate": 3.0599888875633192e-06, + "loss": 0.1077, + "step": 780 + }, + { + "epoch": 4.371767994409504, + "grad_norm": 1.3772043306307704, + "learning_rate": 3.0513849523424298e-06, + "loss": 0.0879, + "step": 782 + }, + { + "epoch": 4.382948986722572, + "grad_norm": 1.7829225937512299, + "learning_rate": 3.0427741468454375e-06, + "loss": 0.1099, + "step": 784 + }, + { + "epoch": 4.39412997903564, + "grad_norm": 1.1143653742483424, + "learning_rate": 3.034156578363284e-06, + "loss": 0.0908, + "step": 786 + }, + { + "epoch": 4.405310971348707, + "grad_norm": 1.9841896768408593, + "learning_rate": 3.0255323542711784e-06, + "loss": 0.0846, + "step": 788 + }, + { + "epoch": 4.416491963661775, + "grad_norm": 1.1622503242476587, + "learning_rate": 3.0169015820272595e-06, + "loss": 0.0809, + "step": 790 + }, + { + "epoch": 4.427672955974843, + "grad_norm": 1.4138977756081776, + "learning_rate": 3.0082643691712572e-06, + "loss": 0.0832, + "step": 792 + }, + { + "epoch": 4.43885394828791, + "grad_norm": 1.3694425414816003, + "learning_rate": 2.9996208233231506e-06, + "loss": 0.1015, + "step": 794 + }, + { + "epoch": 4.450034940600978, + "grad_norm": 1.8252502558409327, + "learning_rate": 2.9909710521818265e-06, + "loss": 0.1049, + "step": 796 + }, + { + "epoch": 4.461215932914046, + "grad_norm": 1.4396307405101365, + "learning_rate": 2.9823151635237424e-06, + "loss": 0.0613, + "step": 798 + }, + { + "epoch": 4.472396925227114, + "grad_norm": 1.3667673153541864, + "learning_rate": 2.973653265201578e-06, + "loss": 0.1081, + "step": 800 + }, + { + "epoch": 4.483577917540182, + "grad_norm": 1.761976942384573, + "learning_rate": 2.964985465142895e-06, + "loss": 0.1002, + "step": 802 + }, + { + "epoch": 4.49475890985325, + "grad_norm": 1.6343471974417978, + "learning_rate": 2.9563118713487895e-06, + "loss": 0.0749, + "step": 804 + }, + { + "epoch": 4.505939902166317, + "grad_norm": 2.0454570442431046, + "learning_rate": 2.9476325918925484e-06, + "loss": 0.0857, + "step": 806 + }, + { + "epoch": 4.517120894479385, + "grad_norm": 1.7007295640066746, + "learning_rate": 2.938947734918302e-06, + "loss": 0.1085, + "step": 808 + }, + { + "epoch": 4.528301886792453, + "grad_norm": 1.5611422829954795, + "learning_rate": 2.9302574086396774e-06, + "loss": 0.0775, + "step": 810 + }, + { + "epoch": 4.53948287910552, + "grad_norm": 1.7913016893140525, + "learning_rate": 2.9215617213384494e-06, + "loss": 0.0875, + "step": 812 + }, + { + "epoch": 4.550663871418588, + "grad_norm": 1.5753063947599002, + "learning_rate": 2.91286078136319e-06, + "loss": 0.0805, + "step": 814 + }, + { + "epoch": 4.561844863731656, + "grad_norm": 1.8942921897754963, + "learning_rate": 2.904154697127921e-06, + "loss": 0.0806, + "step": 816 + }, + { + "epoch": 4.573025856044724, + "grad_norm": 1.791394910046461, + "learning_rate": 2.8954435771107604e-06, + "loss": 0.0992, + "step": 818 + }, + { + "epoch": 4.584206848357792, + "grad_norm": 1.245790765054016, + "learning_rate": 2.8867275298525743e-06, + "loss": 0.0886, + "step": 820 + }, + { + "epoch": 4.59538784067086, + "grad_norm": 1.5133863011334676, + "learning_rate": 2.878006663955621e-06, + "loss": 0.0886, + "step": 822 + }, + { + "epoch": 4.606568832983927, + "grad_norm": 2.0502622868705993, + "learning_rate": 2.8692810880821997e-06, + "loss": 0.0716, + "step": 824 + }, + { + "epoch": 4.617749825296995, + "grad_norm": 1.2876873289352964, + "learning_rate": 2.860550910953296e-06, + "loss": 0.0943, + "step": 826 + }, + { + "epoch": 4.628930817610063, + "grad_norm": 1.440475980645125, + "learning_rate": 2.8518162413472266e-06, + "loss": 0.1083, + "step": 828 + }, + { + "epoch": 4.64011180992313, + "grad_norm": 1.3754262878787067, + "learning_rate": 2.843077188098286e-06, + "loss": 0.1041, + "step": 830 + }, + { + "epoch": 4.651292802236198, + "grad_norm": 1.4424213259038674, + "learning_rate": 2.834333860095388e-06, + "loss": 0.0807, + "step": 832 + }, + { + "epoch": 4.662473794549266, + "grad_norm": 1.994638545215632, + "learning_rate": 2.8255863662807097e-06, + "loss": 0.0819, + "step": 834 + }, + { + "epoch": 4.673654786862334, + "grad_norm": 1.5478645240921063, + "learning_rate": 2.8168348156483356e-06, + "loss": 0.113, + "step": 836 + }, + { + "epoch": 4.684835779175402, + "grad_norm": 1.324879005941319, + "learning_rate": 2.8124575531000226e-06, + "loss": 0.11, + "step": 838 + }, + { + "epoch": 4.69601677148847, + "grad_norm": 1.5993247352100177, + "learning_rate": 2.803700121715214e-06, + "loss": 0.0903, + "step": 840 + }, + { + "epoch": 4.707197763801537, + "grad_norm": 1.256541482417978, + "learning_rate": 2.7949389062160946e-06, + "loss": 0.0925, + "step": 842 + }, + { + "epoch": 4.718378756114605, + "grad_norm": 2.706891920194882, + "learning_rate": 2.786174015767721e-06, + "loss": 0.084, + "step": 844 + }, + { + "epoch": 4.729559748427673, + "grad_norm": 1.3220515828132557, + "learning_rate": 2.7774055595809395e-06, + "loss": 0.0801, + "step": 846 + }, + { + "epoch": 4.7407407407407405, + "grad_norm": 1.5911477732332153, + "learning_rate": 2.768633646911027e-06, + "loss": 0.0938, + "step": 848 + }, + { + "epoch": 4.751921733053808, + "grad_norm": 1.1333988378482527, + "learning_rate": 2.759858387056325e-06, + "loss": 0.0721, + "step": 850 + }, + { + "epoch": 4.763102725366876, + "grad_norm": 1.4690260920140663, + "learning_rate": 2.7510798893568846e-06, + "loss": 0.0769, + "step": 852 + }, + { + "epoch": 4.774283717679944, + "grad_norm": 1.3785131166774844, + "learning_rate": 2.742298263193099e-06, + "loss": 0.1064, + "step": 854 + }, + { + "epoch": 4.785464709993012, + "grad_norm": 1.39128795327872, + "learning_rate": 2.733513617984342e-06, + "loss": 0.075, + "step": 856 + }, + { + "epoch": 4.79664570230608, + "grad_norm": 1.6826021403482612, + "learning_rate": 2.724726063187605e-06, + "loss": 0.1175, + "step": 858 + }, + { + "epoch": 4.8078266946191475, + "grad_norm": 1.353741266830404, + "learning_rate": 2.715935708296134e-06, + "loss": 0.1146, + "step": 860 + }, + { + "epoch": 4.819007686932215, + "grad_norm": 1.4488179633464906, + "learning_rate": 2.707142662838062e-06, + "loss": 0.1033, + "step": 862 + }, + { + "epoch": 4.830188679245283, + "grad_norm": 1.307354977462126, + "learning_rate": 2.6983470363750497e-06, + "loss": 0.093, + "step": 864 + }, + { + "epoch": 4.8413696715583505, + "grad_norm": 1.4753004858703918, + "learning_rate": 2.689548938500914e-06, + "loss": 0.0905, + "step": 866 + }, + { + "epoch": 4.852550663871418, + "grad_norm": 1.551558439927485, + "learning_rate": 2.6807484788402676e-06, + "loss": 0.075, + "step": 868 + }, + { + "epoch": 4.863731656184486, + "grad_norm": 1.499892261020302, + "learning_rate": 2.67194576704715e-06, + "loss": 0.0876, + "step": 870 + }, + { + "epoch": 4.8749126484975545, + "grad_norm": 1.82643381640813, + "learning_rate": 2.6631409128036637e-06, + "loss": 0.0892, + "step": 872 + }, + { + "epoch": 4.886093640810622, + "grad_norm": 1.3480606493487655, + "learning_rate": 2.6543340258186063e-06, + "loss": 0.0816, + "step": 874 + }, + { + "epoch": 4.89727463312369, + "grad_norm": 2.2307067144092407, + "learning_rate": 2.6455252158261015e-06, + "loss": 0.0994, + "step": 876 + }, + { + "epoch": 4.9084556254367575, + "grad_norm": 1.8646868858712458, + "learning_rate": 2.636714592584235e-06, + "loss": 0.0902, + "step": 878 + }, + { + "epoch": 4.919636617749825, + "grad_norm": 1.535171207325978, + "learning_rate": 2.6279022658736856e-06, + "loss": 0.0911, + "step": 880 + }, + { + "epoch": 4.930817610062893, + "grad_norm": 1.1594360070916991, + "learning_rate": 2.619088345496358e-06, + "loss": 0.066, + "step": 882 + }, + { + "epoch": 4.941998602375961, + "grad_norm": 1.6526631394475477, + "learning_rate": 2.610272941274012e-06, + "loss": 0.1014, + "step": 884 + }, + { + "epoch": 4.953179594689029, + "grad_norm": 1.8240816325874138, + "learning_rate": 2.6014561630468993e-06, + "loss": 0.0928, + "step": 886 + }, + { + "epoch": 4.964360587002097, + "grad_norm": 1.3816438884334348, + "learning_rate": 2.5926381206723885e-06, + "loss": 0.088, + "step": 888 + }, + { + "epoch": 4.9755415793151645, + "grad_norm": 1.3157397283692482, + "learning_rate": 2.583818924023601e-06, + "loss": 0.0938, + "step": 890 + }, + { + "epoch": 4.986722571628232, + "grad_norm": 1.464557516575305, + "learning_rate": 2.5749986829880423e-06, + "loss": 0.0781, + "step": 892 + }, + { + "epoch": 4.9979035639413, + "grad_norm": 1.8481309973872981, + "learning_rate": 2.5661775074662276e-06, + "loss": 0.0708, + "step": 894 + }, + { + "epoch": 5.0090845562543675, + "grad_norm": 1.3777408578534927, + "learning_rate": 2.5573555073703172e-06, + "loss": 0.0574, + "step": 896 + }, + { + "epoch": 5.020265548567435, + "grad_norm": 1.5585565063610693, + "learning_rate": 2.5485327926227464e-06, + "loss": 0.0533, + "step": 898 + }, + { + "epoch": 5.031446540880503, + "grad_norm": 3.8488829032344403, + "learning_rate": 2.539709473154855e-06, + "loss": 0.0524, + "step": 900 + }, + { + "epoch": 5.042627533193571, + "grad_norm": 1.360678519326562, + "learning_rate": 2.5308856589055164e-06, + "loss": 0.0608, + "step": 902 + }, + { + "epoch": 5.053808525506638, + "grad_norm": 1.4720850175627471, + "learning_rate": 2.5220614598197708e-06, + "loss": 0.0527, + "step": 904 + }, + { + "epoch": 5.064989517819707, + "grad_norm": 1.2412662972591795, + "learning_rate": 2.513236985847451e-06, + "loss": 0.0488, + "step": 906 + }, + { + "epoch": 5.0761705101327745, + "grad_norm": 1.3236580966844242, + "learning_rate": 2.5044123469418174e-06, + "loss": 0.0638, + "step": 908 + }, + { + "epoch": 5.087351502445842, + "grad_norm": 1.8348241342651854, + "learning_rate": 2.495587653058184e-06, + "loss": 0.0629, + "step": 910 + }, + { + "epoch": 5.09853249475891, + "grad_norm": 0.9662213920921242, + "learning_rate": 2.4867630141525493e-06, + "loss": 0.0722, + "step": 912 + }, + { + "epoch": 5.109713487071978, + "grad_norm": 1.6784486385619315, + "learning_rate": 2.477938540180231e-06, + "loss": 0.0482, + "step": 914 + }, + { + "epoch": 5.120894479385045, + "grad_norm": 1.386742744607905, + "learning_rate": 2.4691143410944844e-06, + "loss": 0.0596, + "step": 916 + }, + { + "epoch": 5.132075471698113, + "grad_norm": 1.5375835898995094, + "learning_rate": 2.4602905268451455e-06, + "loss": 0.0592, + "step": 918 + }, + { + "epoch": 5.143256464011181, + "grad_norm": 1.334707574114043, + "learning_rate": 2.451467207377254e-06, + "loss": 0.0493, + "step": 920 + }, + { + "epoch": 5.154437456324249, + "grad_norm": 1.018606004126685, + "learning_rate": 2.442644492629683e-06, + "loss": 0.0544, + "step": 922 + }, + { + "epoch": 5.165618448637317, + "grad_norm": 1.0236510244569192, + "learning_rate": 2.433822492533774e-06, + "loss": 0.0501, + "step": 924 + }, + { + "epoch": 5.176799440950385, + "grad_norm": 0.8191759766926784, + "learning_rate": 2.4250013170119585e-06, + "loss": 0.0594, + "step": 926 + }, + { + "epoch": 5.187980433263452, + "grad_norm": 1.0938612787512558, + "learning_rate": 2.4161810759763993e-06, + "loss": 0.0544, + "step": 928 + }, + { + "epoch": 5.19916142557652, + "grad_norm": 1.3602285379082586, + "learning_rate": 2.407361879327612e-06, + "loss": 0.0442, + "step": 930 + }, + { + "epoch": 5.210342417889588, + "grad_norm": 1.1380441045618945, + "learning_rate": 2.398543836953101e-06, + "loss": 0.0563, + "step": 932 + }, + { + "epoch": 5.221523410202655, + "grad_norm": 1.1080478505241853, + "learning_rate": 2.389727058725989e-06, + "loss": 0.0515, + "step": 934 + }, + { + "epoch": 5.232704402515723, + "grad_norm": 1.2558697950305333, + "learning_rate": 2.380911654503643e-06, + "loss": 0.0507, + "step": 936 + }, + { + "epoch": 5.243885394828791, + "grad_norm": 1.2293644348010904, + "learning_rate": 2.3720977341263152e-06, + "loss": 0.0607, + "step": 938 + }, + { + "epoch": 5.255066387141859, + "grad_norm": 1.292488994918762, + "learning_rate": 2.3632854074157653e-06, + "loss": 0.0474, + "step": 940 + }, + { + "epoch": 5.266247379454927, + "grad_norm": 1.2671492916227067, + "learning_rate": 2.3544747841738998e-06, + "loss": 0.0769, + "step": 942 + }, + { + "epoch": 5.277428371767995, + "grad_norm": 1.6102887076835615, + "learning_rate": 2.3456659741813945e-06, + "loss": 0.0496, + "step": 944 + }, + { + "epoch": 5.288609364081062, + "grad_norm": 1.577997048333656, + "learning_rate": 2.3368590871963367e-06, + "loss": 0.0796, + "step": 946 + }, + { + "epoch": 5.29979035639413, + "grad_norm": 2.278441135480121, + "learning_rate": 2.328054232952851e-06, + "loss": 0.0679, + "step": 948 + }, + { + "epoch": 5.310971348707198, + "grad_norm": 1.1443796744340577, + "learning_rate": 2.3192515211597332e-06, + "loss": 0.0589, + "step": 950 + }, + { + "epoch": 5.322152341020265, + "grad_norm": 1.3246252050774938, + "learning_rate": 2.3104510614990875e-06, + "loss": 0.0711, + "step": 952 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 2.3404125762291574, + "learning_rate": 2.301652963624951e-06, + "loss": 0.0571, + "step": 954 + }, + { + "epoch": 5.344514325646401, + "grad_norm": 1.6173224098499974, + "learning_rate": 2.292857337161938e-06, + "loss": 0.0715, + "step": 956 + }, + { + "epoch": 5.355695317959469, + "grad_norm": 1.416375080557459, + "learning_rate": 2.2840642917038666e-06, + "loss": 0.0555, + "step": 958 + }, + { + "epoch": 5.366876310272537, + "grad_norm": 1.2819320119071211, + "learning_rate": 2.2752739368123948e-06, + "loss": 0.0486, + "step": 960 + }, + { + "epoch": 5.378057302585605, + "grad_norm": 1.1198977788924485, + "learning_rate": 2.2664863820156593e-06, + "loss": 0.0408, + "step": 962 + }, + { + "epoch": 5.389238294898672, + "grad_norm": 1.1451798114445098, + "learning_rate": 2.2577017368069017e-06, + "loss": 0.0626, + "step": 964 + }, + { + "epoch": 5.40041928721174, + "grad_norm": 1.3380127274735694, + "learning_rate": 2.248920110643116e-06, + "loss": 0.0568, + "step": 966 + }, + { + "epoch": 5.411600279524808, + "grad_norm": 1.4489239240672898, + "learning_rate": 2.2401416129436753e-06, + "loss": 0.059, + "step": 968 + }, + { + "epoch": 5.422781271837875, + "grad_norm": 1.3130908635170957, + "learning_rate": 2.2313663530889734e-06, + "loss": 0.0444, + "step": 970 + }, + { + "epoch": 5.433962264150943, + "grad_norm": 1.2045728193533076, + "learning_rate": 2.222594440419061e-06, + "loss": 0.0952, + "step": 972 + }, + { + "epoch": 5.445143256464011, + "grad_norm": 1.1505612686257871, + "learning_rate": 2.2138259842322794e-06, + "loss": 0.0536, + "step": 974 + }, + { + "epoch": 5.456324248777079, + "grad_norm": 1.521719008832957, + "learning_rate": 2.2050610937839058e-06, + "loss": 0.073, + "step": 976 + }, + { + "epoch": 5.467505241090147, + "grad_norm": 1.3381824532405695, + "learning_rate": 2.1962998782847863e-06, + "loss": 0.0583, + "step": 978 + }, + { + "epoch": 5.478686233403215, + "grad_norm": 1.1782879600371732, + "learning_rate": 2.1875424468999787e-06, + "loss": 0.052, + "step": 980 + }, + { + "epoch": 5.489867225716282, + "grad_norm": 1.1689516819440322, + "learning_rate": 2.178788908747387e-06, + "loss": 0.0515, + "step": 982 + }, + { + "epoch": 5.50104821802935, + "grad_norm": 1.1479989981730907, + "learning_rate": 2.170039372896409e-06, + "loss": 0.055, + "step": 984 + }, + { + "epoch": 5.512229210342418, + "grad_norm": 1.3922562574409854, + "learning_rate": 2.161293948366573e-06, + "loss": 0.0554, + "step": 986 + }, + { + "epoch": 5.523410202655485, + "grad_norm": 1.409490849880991, + "learning_rate": 2.152552744126178e-06, + "loss": 0.0392, + "step": 988 + }, + { + "epoch": 5.534591194968553, + "grad_norm": 1.2479629003574995, + "learning_rate": 2.1438158690909413e-06, + "loss": 0.0599, + "step": 990 + }, + { + "epoch": 5.545772187281621, + "grad_norm": 1.2371376050465024, + "learning_rate": 2.1350834321226344e-06, + "loss": 0.0664, + "step": 992 + }, + { + "epoch": 5.556953179594689, + "grad_norm": 1.593505278104288, + "learning_rate": 2.126355542027734e-06, + "loss": 0.0479, + "step": 994 + }, + { + "epoch": 5.568134171907757, + "grad_norm": 1.2742537988695015, + "learning_rate": 2.117632307556059e-06, + "loss": 0.0803, + "step": 996 + }, + { + "epoch": 5.579315164220825, + "grad_norm": 1.3748039610126324, + "learning_rate": 2.1089138373994226e-06, + "loss": 0.0416, + "step": 998 + }, + { + "epoch": 5.590496156533892, + "grad_norm": 2.4084571636039755, + "learning_rate": 2.100200240190273e-06, + "loss": 0.0514, + "step": 1000 + }, + { + "epoch": 5.60167714884696, + "grad_norm": 1.1933752040503858, + "learning_rate": 2.09149162450034e-06, + "loss": 0.0625, + "step": 1002 + }, + { + "epoch": 5.612858141160028, + "grad_norm": 1.037709039674537, + "learning_rate": 2.0827880988392856e-06, + "loss": 0.0514, + "step": 1004 + }, + { + "epoch": 5.6240391334730955, + "grad_norm": 1.315142680072312, + "learning_rate": 2.0740897716533475e-06, + "loss": 0.0593, + "step": 1006 + }, + { + "epoch": 5.635220125786163, + "grad_norm": 1.0531660230737552, + "learning_rate": 2.0653967513239934e-06, + "loss": 0.0543, + "step": 1008 + }, + { + "epoch": 5.646401118099231, + "grad_norm": 1.2633776013551097, + "learning_rate": 2.0567091461665636e-06, + "loss": 0.0431, + "step": 1010 + }, + { + "epoch": 5.657582110412299, + "grad_norm": 1.449959564050197, + "learning_rate": 2.0480270644289282e-06, + "loss": 0.0482, + "step": 1012 + }, + { + "epoch": 5.668763102725367, + "grad_norm": 1.1071912059302882, + "learning_rate": 2.0393506142901347e-06, + "loss": 0.0564, + "step": 1014 + }, + { + "epoch": 5.679944095038435, + "grad_norm": 0.9876137346535111, + "learning_rate": 2.0306799038590595e-06, + "loss": 0.0391, + "step": 1016 + }, + { + "epoch": 5.6911250873515025, + "grad_norm": 1.1071464038310999, + "learning_rate": 2.0220150411730638e-06, + "loss": 0.0636, + "step": 1018 + }, + { + "epoch": 5.70230607966457, + "grad_norm": 1.0473491285671832, + "learning_rate": 2.013356134196643e-06, + "loss": 0.0581, + "step": 1020 + }, + { + "epoch": 5.713487071977638, + "grad_norm": 1.1296902267336801, + "learning_rate": 2.004703290820086e-06, + "loss": 0.0604, + "step": 1022 + }, + { + "epoch": 5.7246680642907055, + "grad_norm": 1.309317661735025, + "learning_rate": 1.9960566188581306e-06, + "loss": 0.0438, + "step": 1024 + }, + { + "epoch": 5.735849056603773, + "grad_norm": 0.8918766336417149, + "learning_rate": 1.9874162260486146e-06, + "loss": 0.0475, + "step": 1026 + }, + { + "epoch": 5.747030048916841, + "grad_norm": 1.2095534019736167, + "learning_rate": 1.978782220051142e-06, + "loss": 0.0454, + "step": 1028 + }, + { + "epoch": 5.7582110412299095, + "grad_norm": 1.1967009451687045, + "learning_rate": 1.9701547084457314e-06, + "loss": 0.0697, + "step": 1030 + }, + { + "epoch": 5.769392033542977, + "grad_norm": 1.8160556667087309, + "learning_rate": 1.961533798731486e-06, + "loss": 0.0422, + "step": 1032 + }, + { + "epoch": 5.780573025856045, + "grad_norm": 1.590627053883797, + "learning_rate": 1.952919598325247e-06, + "loss": 0.0602, + "step": 1034 + }, + { + "epoch": 5.7917540181691125, + "grad_norm": 1.4584761134724722, + "learning_rate": 1.944312214560256e-06, + "loss": 0.0575, + "step": 1036 + }, + { + "epoch": 5.80293501048218, + "grad_norm": 1.6093909025543798, + "learning_rate": 1.935711754684824e-06, + "loss": 0.0814, + "step": 1038 + }, + { + "epoch": 5.814116002795248, + "grad_norm": 1.7715253484509736, + "learning_rate": 1.9271183258609836e-06, + "loss": 0.0608, + "step": 1040 + }, + { + "epoch": 5.825296995108316, + "grad_norm": 0.850327251905485, + "learning_rate": 1.9185320351631654e-06, + "loss": 0.0388, + "step": 1042 + }, + { + "epoch": 5.836477987421384, + "grad_norm": 1.4837292387797913, + "learning_rate": 1.9099529895768552e-06, + "loss": 0.0567, + "step": 1044 + }, + { + "epoch": 5.847658979734452, + "grad_norm": 1.0384213631474088, + "learning_rate": 1.901381295997267e-06, + "loss": 0.0661, + "step": 1046 + }, + { + "epoch": 5.8588399720475195, + "grad_norm": 1.2071171218984706, + "learning_rate": 1.8928170612280067e-06, + "loss": 0.0665, + "step": 1048 + }, + { + "epoch": 5.870020964360587, + "grad_norm": 1.2020194163974407, + "learning_rate": 1.8842603919797436e-06, + "loss": 0.0466, + "step": 1050 + }, + { + "epoch": 5.881201956673655, + "grad_norm": 1.141150946131999, + "learning_rate": 1.8757113948688827e-06, + "loss": 0.0562, + "step": 1052 + }, + { + "epoch": 5.8923829489867225, + "grad_norm": 1.583487458549684, + "learning_rate": 1.8671701764162287e-06, + "loss": 0.0589, + "step": 1054 + }, + { + "epoch": 5.90356394129979, + "grad_norm": 1.3417276690702418, + "learning_rate": 1.8586368430456708e-06, + "loss": 0.0604, + "step": 1056 + }, + { + "epoch": 5.914744933612858, + "grad_norm": 1.3294273305641617, + "learning_rate": 1.8501115010828423e-06, + "loss": 0.0628, + "step": 1058 + }, + { + "epoch": 5.925925925925926, + "grad_norm": 1.2448945324282268, + "learning_rate": 1.8415942567538106e-06, + "loss": 0.0554, + "step": 1060 + }, + { + "epoch": 5.937106918238994, + "grad_norm": 0.960687093766239, + "learning_rate": 1.8330852161837399e-06, + "loss": 0.0532, + "step": 1062 + }, + { + "epoch": 5.948287910552062, + "grad_norm": 1.4656893110825278, + "learning_rate": 1.8245844853955786e-06, + "loss": 0.0719, + "step": 1064 + }, + { + "epoch": 5.9594689028651295, + "grad_norm": 1.6634277575338297, + "learning_rate": 1.8160921703087368e-06, + "loss": 0.0565, + "step": 1066 + }, + { + "epoch": 5.970649895178197, + "grad_norm": 1.7257111050609335, + "learning_rate": 1.8076083767377595e-06, + "loss": 0.068, + "step": 1068 + }, + { + "epoch": 5.981830887491265, + "grad_norm": 1.42483183153276, + "learning_rate": 1.7991332103910184e-06, + "loss": 0.0613, + "step": 1070 + }, + { + "epoch": 5.993011879804333, + "grad_norm": 1.4316025881020678, + "learning_rate": 1.7906667768693853e-06, + "loss": 0.0481, + "step": 1072 + }, + { + "epoch": 6.0041928721174, + "grad_norm": 1.037376667784287, + "learning_rate": 1.782209181664924e-06, + "loss": 0.0483, + "step": 1074 + }, + { + "epoch": 6.015373864430468, + "grad_norm": 1.0336168566598631, + "learning_rate": 1.773760530159571e-06, + "loss": 0.0347, + "step": 1076 + }, + { + "epoch": 6.026554856743536, + "grad_norm": 0.7872905184564322, + "learning_rate": 1.7653209276238242e-06, + "loss": 0.0355, + "step": 1078 + }, + { + "epoch": 6.037735849056604, + "grad_norm": 1.772389302776251, + "learning_rate": 1.7568904792154328e-06, + "loss": 0.0542, + "step": 1080 + }, + { + "epoch": 6.048916841369672, + "grad_norm": 1.3577848873845724, + "learning_rate": 1.7484692899780812e-06, + "loss": 0.0583, + "step": 1082 + }, + { + "epoch": 6.06009783368274, + "grad_norm": 0.7840766650439943, + "learning_rate": 1.740057464840088e-06, + "loss": 0.0289, + "step": 1084 + }, + { + "epoch": 6.071278825995807, + "grad_norm": 0.9255675051401594, + "learning_rate": 1.7316551086130925e-06, + "loss": 0.0417, + "step": 1086 + }, + { + "epoch": 6.082459818308875, + "grad_norm": 0.9107219582827843, + "learning_rate": 1.7232623259907538e-06, + "loss": 0.0429, + "step": 1088 + }, + { + "epoch": 6.093640810621943, + "grad_norm": 1.0296310110561282, + "learning_rate": 1.714879221547439e-06, + "loss": 0.0362, + "step": 1090 + }, + { + "epoch": 6.10482180293501, + "grad_norm": 0.9575340239366315, + "learning_rate": 1.7065058997369288e-06, + "loss": 0.0471, + "step": 1092 + }, + { + "epoch": 6.116002795248078, + "grad_norm": 0.7430183397758778, + "learning_rate": 1.6981424648911112e-06, + "loss": 0.0351, + "step": 1094 + }, + { + "epoch": 6.127183787561146, + "grad_norm": 0.9807593854080312, + "learning_rate": 1.6897890212186804e-06, + "loss": 0.0334, + "step": 1096 + }, + { + "epoch": 6.138364779874214, + "grad_norm": 1.2961448011313597, + "learning_rate": 1.6814456728038431e-06, + "loss": 0.025, + "step": 1098 + }, + { + "epoch": 6.149545772187282, + "grad_norm": 0.961636779671174, + "learning_rate": 1.673112523605015e-06, + "loss": 0.0285, + "step": 1100 + }, + { + "epoch": 6.16072676450035, + "grad_norm": 0.9647606646620928, + "learning_rate": 1.6647896774535324e-06, + "loss": 0.0303, + "step": 1102 + }, + { + "epoch": 6.171907756813417, + "grad_norm": 1.1381988477100318, + "learning_rate": 1.6564772380523546e-06, + "loss": 0.0358, + "step": 1104 + }, + { + "epoch": 6.183088749126485, + "grad_norm": 0.7901346245952422, + "learning_rate": 1.648175308974771e-06, + "loss": 0.0279, + "step": 1106 + }, + { + "epoch": 6.194269741439553, + "grad_norm": 1.2717247572933381, + "learning_rate": 1.6398839936631142e-06, + "loss": 0.0328, + "step": 1108 + }, + { + "epoch": 6.20545073375262, + "grad_norm": 1.2916496315117834, + "learning_rate": 1.631603395427466e-06, + "loss": 0.055, + "step": 1110 + }, + { + "epoch": 6.216631726065688, + "grad_norm": 0.9740099844597652, + "learning_rate": 1.6233336174443762e-06, + "loss": 0.048, + "step": 1112 + }, + { + "epoch": 6.227812718378756, + "grad_norm": 1.0103830292004847, + "learning_rate": 1.6150747627555713e-06, + "loss": 0.0434, + "step": 1114 + }, + { + "epoch": 6.238993710691824, + "grad_norm": 1.1350854047223082, + "learning_rate": 1.6068269342666749e-06, + "loss": 0.0389, + "step": 1116 + }, + { + "epoch": 6.250174703004892, + "grad_norm": 0.7884154494279628, + "learning_rate": 1.5985902347459239e-06, + "loss": 0.0432, + "step": 1118 + }, + { + "epoch": 6.26135569531796, + "grad_norm": 0.8788178903528164, + "learning_rate": 1.5903647668228855e-06, + "loss": 0.0432, + "step": 1120 + }, + { + "epoch": 6.272536687631027, + "grad_norm": 0.6393918351108393, + "learning_rate": 1.5821506329871834e-06, + "loss": 0.0253, + "step": 1122 + }, + { + "epoch": 6.283717679944095, + "grad_norm": 1.0870268262489273, + "learning_rate": 1.5739479355872162e-06, + "loss": 0.0364, + "step": 1124 + }, + { + "epoch": 6.294898672257163, + "grad_norm": 1.1679875063936556, + "learning_rate": 1.5657567768288868e-06, + "loss": 0.0333, + "step": 1126 + }, + { + "epoch": 6.30607966457023, + "grad_norm": 0.8388447320245327, + "learning_rate": 1.5575772587743222e-06, + "loss": 0.0316, + "step": 1128 + }, + { + "epoch": 6.317260656883298, + "grad_norm": 0.7710273725047172, + "learning_rate": 1.5494094833406092e-06, + "loss": 0.0308, + "step": 1130 + }, + { + "epoch": 6.328441649196366, + "grad_norm": 1.3107972415612894, + "learning_rate": 1.5412535522985205e-06, + "loss": 0.0186, + "step": 1132 + }, + { + "epoch": 6.339622641509434, + "grad_norm": 0.8488196487806184, + "learning_rate": 1.5331095672712463e-06, + "loss": 0.023, + "step": 1134 + }, + { + "epoch": 6.350803633822502, + "grad_norm": 1.014050814471419, + "learning_rate": 1.5249776297331302e-06, + "loss": 0.0425, + "step": 1136 + }, + { + "epoch": 6.36198462613557, + "grad_norm": 0.8160528908459946, + "learning_rate": 1.516857841008401e-06, + "loss": 0.0407, + "step": 1138 + }, + { + "epoch": 6.373165618448637, + "grad_norm": 0.6924190623075557, + "learning_rate": 1.5087503022699168e-06, + "loss": 0.0527, + "step": 1140 + }, + { + "epoch": 6.384346610761705, + "grad_norm": 1.0149043689805195, + "learning_rate": 1.5006551145378967e-06, + "loss": 0.0367, + "step": 1142 + }, + { + "epoch": 6.395527603074773, + "grad_norm": 1.5920991707794845, + "learning_rate": 1.4925723786786691e-06, + "loss": 0.0319, + "step": 1144 + }, + { + "epoch": 6.40670859538784, + "grad_norm": 0.8834798218634231, + "learning_rate": 1.4845021954034106e-06, + "loss": 0.0372, + "step": 1146 + }, + { + "epoch": 6.417889587700908, + "grad_norm": 1.072104658850445, + "learning_rate": 1.476444665266889e-06, + "loss": 0.0413, + "step": 1148 + }, + { + "epoch": 6.429070580013976, + "grad_norm": 1.1893734124292998, + "learning_rate": 1.4683998886662187e-06, + "loss": 0.0307, + "step": 1150 + }, + { + "epoch": 6.440251572327044, + "grad_norm": 1.1513167005422524, + "learning_rate": 1.4603679658396006e-06, + "loss": 0.0402, + "step": 1152 + }, + { + "epoch": 6.451432564640112, + "grad_norm": 1.0586602700365229, + "learning_rate": 1.4523489968650795e-06, + "loss": 0.0303, + "step": 1154 + }, + { + "epoch": 6.46261355695318, + "grad_norm": 0.7650987855999634, + "learning_rate": 1.4443430816592936e-06, + "loss": 0.0312, + "step": 1156 + }, + { + "epoch": 6.473794549266247, + "grad_norm": 0.7470083708652993, + "learning_rate": 1.4363503199762296e-06, + "loss": 0.0298, + "step": 1158 + }, + { + "epoch": 6.484975541579315, + "grad_norm": 1.2247183517462086, + "learning_rate": 1.4283708114059853e-06, + "loss": 0.0476, + "step": 1160 + }, + { + "epoch": 6.496156533892383, + "grad_norm": 1.0042001049340177, + "learning_rate": 1.4204046553735174e-06, + "loss": 0.0421, + "step": 1162 + }, + { + "epoch": 6.5073375262054505, + "grad_norm": 1.0066856707214424, + "learning_rate": 1.4124519511374158e-06, + "loss": 0.0277, + "step": 1164 + }, + { + "epoch": 6.518518518518518, + "grad_norm": 1.3761888161849996, + "learning_rate": 1.404512797788657e-06, + "loss": 0.0251, + "step": 1166 + }, + { + "epoch": 6.529699510831586, + "grad_norm": 0.7445041473181229, + "learning_rate": 1.396587294249374e-06, + "loss": 0.0383, + "step": 1168 + }, + { + "epoch": 6.540880503144654, + "grad_norm": 1.0231799225570892, + "learning_rate": 1.3886755392716225e-06, + "loss": 0.0289, + "step": 1170 + }, + { + "epoch": 6.552061495457722, + "grad_norm": 1.0842064444530823, + "learning_rate": 1.3807776314361498e-06, + "loss": 0.0341, + "step": 1172 + }, + { + "epoch": 6.56324248777079, + "grad_norm": 0.9409388421938562, + "learning_rate": 1.3728936691511704e-06, + "loss": 0.0413, + "step": 1174 + }, + { + "epoch": 6.5744234800838575, + "grad_norm": 0.8052329748698783, + "learning_rate": 1.3650237506511333e-06, + "loss": 0.0399, + "step": 1176 + }, + { + "epoch": 6.585604472396925, + "grad_norm": 0.6879172446908371, + "learning_rate": 1.3571679739955029e-06, + "loss": 0.0288, + "step": 1178 + }, + { + "epoch": 6.596785464709993, + "grad_norm": 0.8737080494275846, + "learning_rate": 1.3493264370675352e-06, + "loss": 0.0181, + "step": 1180 + }, + { + "epoch": 6.6079664570230605, + "grad_norm": 0.8744184416405667, + "learning_rate": 1.3414992375730587e-06, + "loss": 0.0432, + "step": 1182 + }, + { + "epoch": 6.619147449336128, + "grad_norm": 0.9265074156931595, + "learning_rate": 1.3336864730392587e-06, + "loss": 0.0464, + "step": 1184 + }, + { + "epoch": 6.630328441649196, + "grad_norm": 1.14003149718633, + "learning_rate": 1.3258882408134582e-06, + "loss": 0.0271, + "step": 1186 + }, + { + "epoch": 6.6415094339622645, + "grad_norm": 0.8949105583359471, + "learning_rate": 1.3181046380619078e-06, + "loss": 0.0276, + "step": 1188 + }, + { + "epoch": 6.652690426275332, + "grad_norm": 1.0602768370905677, + "learning_rate": 1.3103357617685746e-06, + "loss": 0.0352, + "step": 1190 + }, + { + "epoch": 6.6638714185884, + "grad_norm": 1.187406942024327, + "learning_rate": 1.3025817087339335e-06, + "loss": 0.0597, + "step": 1192 + }, + { + "epoch": 6.6750524109014675, + "grad_norm": 0.8451020033143687, + "learning_rate": 1.2948425755737592e-06, + "loss": 0.0359, + "step": 1194 + }, + { + "epoch": 6.686233403214535, + "grad_norm": 1.2760921925255864, + "learning_rate": 1.2871184587179286e-06, + "loss": 0.0285, + "step": 1196 + }, + { + "epoch": 6.697414395527603, + "grad_norm": 0.7781748766075295, + "learning_rate": 1.2794094544092111e-06, + "loss": 0.0346, + "step": 1198 + }, + { + "epoch": 6.7085953878406706, + "grad_norm": 1.1832623077309767, + "learning_rate": 1.2717156587020746e-06, + "loss": 0.041, + "step": 1200 + }, + { + "epoch": 6.719776380153739, + "grad_norm": 1.3133094357866473, + "learning_rate": 1.2640371674614866e-06, + "loss": 0.0629, + "step": 1202 + }, + { + "epoch": 6.730957372466807, + "grad_norm": 0.7218331862903847, + "learning_rate": 1.2563740763617198e-06, + "loss": 0.0366, + "step": 1204 + }, + { + "epoch": 6.7421383647798745, + "grad_norm": 0.9560652150388108, + "learning_rate": 1.2487264808851654e-06, + "loss": 0.044, + "step": 1206 + }, + { + "epoch": 6.753319357092942, + "grad_norm": 1.1190106870390395, + "learning_rate": 1.2410944763211302e-06, + "loss": 0.0517, + "step": 1208 + }, + { + "epoch": 6.76450034940601, + "grad_norm": 0.7835985914687663, + "learning_rate": 1.2334781577646653e-06, + "loss": 0.0272, + "step": 1210 + }, + { + "epoch": 6.7756813417190775, + "grad_norm": 2.056446636497986, + "learning_rate": 1.2258776201153702e-06, + "loss": 0.0239, + "step": 1212 + }, + { + "epoch": 6.786862334032145, + "grad_norm": 0.8485551422736736, + "learning_rate": 1.218292958076213e-06, + "loss": 0.0206, + "step": 1214 + }, + { + "epoch": 6.798043326345213, + "grad_norm": 1.2531964534501892, + "learning_rate": 1.2107242661523544e-06, + "loss": 0.0254, + "step": 1216 + }, + { + "epoch": 6.809224318658281, + "grad_norm": 1.269537638790587, + "learning_rate": 1.203171638649962e-06, + "loss": 0.0299, + "step": 1218 + }, + { + "epoch": 6.820405310971349, + "grad_norm": 1.1178764385402225, + "learning_rate": 1.195635169675045e-06, + "loss": 0.0396, + "step": 1220 + }, + { + "epoch": 6.831586303284417, + "grad_norm": 0.6920818283019613, + "learning_rate": 1.1881149531322744e-06, + "loss": 0.0268, + "step": 1222 + }, + { + "epoch": 6.8427672955974845, + "grad_norm": 0.80369354175751, + "learning_rate": 1.180611082723814e-06, + "loss": 0.031, + "step": 1224 + }, + { + "epoch": 6.853948287910552, + "grad_norm": 0.7447389756775401, + "learning_rate": 1.1731236519481593e-06, + "loss": 0.0345, + "step": 1226 + }, + { + "epoch": 6.86512928022362, + "grad_norm": 1.1115305000722167, + "learning_rate": 1.1656527540989595e-06, + "loss": 0.0283, + "step": 1228 + }, + { + "epoch": 6.876310272536688, + "grad_norm": 1.2279572164110593, + "learning_rate": 1.1581984822638706e-06, + "loss": 0.0452, + "step": 1230 + }, + { + "epoch": 6.887491264849755, + "grad_norm": 0.8467749629186313, + "learning_rate": 1.1507609293233837e-06, + "loss": 0.0283, + "step": 1232 + }, + { + "epoch": 6.898672257162823, + "grad_norm": 1.355703618365484, + "learning_rate": 1.1433401879496723e-06, + "loss": 0.0366, + "step": 1234 + }, + { + "epoch": 6.909853249475891, + "grad_norm": 1.004917827499692, + "learning_rate": 1.135936350605438e-06, + "loss": 0.0496, + "step": 1236 + }, + { + "epoch": 6.921034241788959, + "grad_norm": 1.2615070307313305, + "learning_rate": 1.1285495095427563e-06, + "loss": 0.0461, + "step": 1238 + }, + { + "epoch": 6.932215234102027, + "grad_norm": 0.9861185460727813, + "learning_rate": 1.1211797568019312e-06, + "loss": 0.0366, + "step": 1240 + }, + { + "epoch": 6.943396226415095, + "grad_norm": 1.6576290169923233, + "learning_rate": 1.113827184210343e-06, + "loss": 0.0337, + "step": 1242 + }, + { + "epoch": 6.954577218728162, + "grad_norm": 1.1363579065284033, + "learning_rate": 1.1064918833813073e-06, + "loss": 0.0406, + "step": 1244 + }, + { + "epoch": 6.96575821104123, + "grad_norm": 1.3125191134965577, + "learning_rate": 1.0991739457129333e-06, + "loss": 0.0397, + "step": 1246 + }, + { + "epoch": 6.976939203354298, + "grad_norm": 0.8904462468667067, + "learning_rate": 1.0918734623869835e-06, + "loss": 0.0407, + "step": 1248 + }, + { + "epoch": 6.988120195667365, + "grad_norm": 2.263233580582389, + "learning_rate": 1.0845905243677416e-06, + "loss": 0.0307, + "step": 1250 + }, + { + "epoch": 6.999301187980433, + "grad_norm": 0.791294534235276, + "learning_rate": 1.0773252224008726e-06, + "loss": 0.0387, + "step": 1252 + }, + { + "epoch": 7.010482180293501, + "grad_norm": 0.76599595030522, + "learning_rate": 1.0700776470122981e-06, + "loss": 0.0269, + "step": 1254 + }, + { + "epoch": 7.021663172606569, + "grad_norm": 0.7331796337642835, + "learning_rate": 1.0628478885070647e-06, + "loss": 0.0221, + "step": 1256 + }, + { + "epoch": 7.032844164919637, + "grad_norm": 0.6845784469587074, + "learning_rate": 1.05563603696822e-06, + "loss": 0.0291, + "step": 1258 + }, + { + "epoch": 7.044025157232705, + "grad_norm": 0.8176233505690059, + "learning_rate": 1.0484421822556904e-06, + "loss": 0.0364, + "step": 1260 + }, + { + "epoch": 7.055206149545772, + "grad_norm": 0.8629657573128657, + "learning_rate": 1.041266414005162e-06, + "loss": 0.0265, + "step": 1262 + }, + { + "epoch": 7.06638714185884, + "grad_norm": 1.1172499462707595, + "learning_rate": 1.0341088216269625e-06, + "loss": 0.0157, + "step": 1264 + }, + { + "epoch": 7.077568134171908, + "grad_norm": 0.5230775744769823, + "learning_rate": 1.0269694943049462e-06, + "loss": 0.0157, + "step": 1266 + }, + { + "epoch": 7.088749126484975, + "grad_norm": 0.8978199171663125, + "learning_rate": 1.0198485209953865e-06, + "loss": 0.0275, + "step": 1268 + }, + { + "epoch": 7.099930118798043, + "grad_norm": 0.815308309594077, + "learning_rate": 1.0127459904258621e-06, + "loss": 0.0237, + "step": 1270 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 0.8967571058386815, + "learning_rate": 1.0056619910941592e-06, + "loss": 0.019, + "step": 1272 + }, + { + "epoch": 7.122292103424179, + "grad_norm": 0.7843358442700527, + "learning_rate": 9.98596611267158e-07, + "loss": 0.021, + "step": 1274 + }, + { + "epoch": 7.133473095737247, + "grad_norm": 0.6797830063456453, + "learning_rate": 9.915499389797444e-07, + "loss": 0.0316, + "step": 1276 + }, + { + "epoch": 7.144654088050315, + "grad_norm": 0.6688875199025872, + "learning_rate": 9.845220620337054e-07, + "loss": 0.0303, + "step": 1278 + }, + { + "epoch": 7.155835080363382, + "grad_norm": 0.6664970872749731, + "learning_rate": 9.77513067996636e-07, + "loss": 0.0219, + "step": 1280 + }, + { + "epoch": 7.16701607267645, + "grad_norm": 0.7973098520727987, + "learning_rate": 9.705230442008542e-07, + "loss": 0.0376, + "step": 1282 + }, + { + "epoch": 7.178197064989518, + "grad_norm": 0.8759703504057706, + "learning_rate": 9.63552077742301e-07, + "loss": 0.0385, + "step": 1284 + }, + { + "epoch": 7.189378057302585, + "grad_norm": 1.0267904937054426, + "learning_rate": 9.56600255479469e-07, + "loss": 0.0222, + "step": 1286 + }, + { + "epoch": 7.200559049615653, + "grad_norm": 0.6389768145894307, + "learning_rate": 9.4966766403231e-07, + "loss": 0.018, + "step": 1288 + }, + { + "epoch": 7.211740041928721, + "grad_norm": 0.5762313893158477, + "learning_rate": 9.427543897811584e-07, + "loss": 0.0165, + "step": 1290 + }, + { + "epoch": 7.222921034241789, + "grad_norm": 0.5902518126138557, + "learning_rate": 9.358605188656603e-07, + "loss": 0.02, + "step": 1292 + }, + { + "epoch": 7.234102026554857, + "grad_norm": 0.824105561963567, + "learning_rate": 9.289861371836886e-07, + "loss": 0.0337, + "step": 1294 + }, + { + "epoch": 7.245283018867925, + "grad_norm": 0.504698332550927, + "learning_rate": 9.22131330390286e-07, + "loss": 0.0283, + "step": 1296 + }, + { + "epoch": 7.256464011180992, + "grad_norm": 0.5789695393721453, + "learning_rate": 9.152961838965879e-07, + "loss": 0.0169, + "step": 1298 + }, + { + "epoch": 7.26764500349406, + "grad_norm": 1.4892687104014115, + "learning_rate": 9.084807828687628e-07, + "loss": 0.0314, + "step": 1300 + }, + { + "epoch": 7.278825995807128, + "grad_norm": 1.0727067281323632, + "learning_rate": 9.016852122269493e-07, + "loss": 0.0274, + "step": 1302 + }, + { + "epoch": 7.290006988120195, + "grad_norm": 0.7309629553367788, + "learning_rate": 8.949095566441985e-07, + "loss": 0.0219, + "step": 1304 + }, + { + "epoch": 7.301187980433263, + "grad_norm": 0.6871990809680889, + "learning_rate": 8.881539005454215e-07, + "loss": 0.0339, + "step": 1306 + }, + { + "epoch": 7.312368972746331, + "grad_norm": 0.8530617423198913, + "learning_rate": 8.814183281063326e-07, + "loss": 0.0248, + "step": 1308 + }, + { + "epoch": 7.323549965059399, + "grad_norm": 0.76651991997128, + "learning_rate": 8.747029232524037e-07, + "loss": 0.023, + "step": 1310 + }, + { + "epoch": 7.334730957372467, + "grad_norm": 0.6966547986519114, + "learning_rate": 8.680077696578182e-07, + "loss": 0.0332, + "step": 1312 + }, + { + "epoch": 7.345911949685535, + "grad_norm": 1.0873098335521205, + "learning_rate": 8.613329507444274e-07, + "loss": 0.0234, + "step": 1314 + }, + { + "epoch": 7.357092941998602, + "grad_norm": 0.6461932986017782, + "learning_rate": 8.546785496807116e-07, + "loss": 0.0242, + "step": 1316 + }, + { + "epoch": 7.36827393431167, + "grad_norm": 0.7614414460885182, + "learning_rate": 8.480446493807464e-07, + "loss": 0.031, + "step": 1318 + }, + { + "epoch": 7.379454926624738, + "grad_norm": 0.641294466328584, + "learning_rate": 8.414313325031642e-07, + "loss": 0.028, + "step": 1320 + }, + { + "epoch": 7.3906359189378055, + "grad_norm": 0.47088954187562415, + "learning_rate": 8.348386814501286e-07, + "loss": 0.0186, + "step": 1322 + }, + { + "epoch": 7.401816911250873, + "grad_norm": 0.7909087034714356, + "learning_rate": 8.282667783663056e-07, + "loss": 0.0212, + "step": 1324 + }, + { + "epoch": 7.412997903563941, + "grad_norm": 0.8059238279425677, + "learning_rate": 8.217157051378411e-07, + "loss": 0.0239, + "step": 1326 + }, + { + "epoch": 7.424178895877009, + "grad_norm": 0.788531385863816, + "learning_rate": 8.151855433913414e-07, + "loss": 0.0199, + "step": 1328 + }, + { + "epoch": 7.435359888190077, + "grad_norm": 1.1393964476120448, + "learning_rate": 8.086763744928536e-07, + "loss": 0.0292, + "step": 1330 + }, + { + "epoch": 7.446540880503145, + "grad_norm": 0.5408108502649198, + "learning_rate": 8.02188279546853e-07, + "loss": 0.0146, + "step": 1332 + }, + { + "epoch": 7.4577218728162125, + "grad_norm": 0.8749206113652656, + "learning_rate": 7.957213393952335e-07, + "loss": 0.0247, + "step": 1334 + }, + { + "epoch": 7.46890286512928, + "grad_norm": 0.7053824386402378, + "learning_rate": 7.892756346162986e-07, + "loss": 0.02, + "step": 1336 + }, + { + "epoch": 7.480083857442348, + "grad_norm": 0.6965900833846856, + "learning_rate": 7.82851245523761e-07, + "loss": 0.0315, + "step": 1338 + }, + { + "epoch": 7.4912648497554155, + "grad_norm": 0.9392067120327887, + "learning_rate": 7.764482521657343e-07, + "loss": 0.0308, + "step": 1340 + }, + { + "epoch": 7.502445842068483, + "grad_norm": 0.7074561491918046, + "learning_rate": 7.700667343237453e-07, + "loss": 0.0171, + "step": 1342 + }, + { + "epoch": 7.513626834381551, + "grad_norm": 0.7697005768650605, + "learning_rate": 7.637067715117327e-07, + "loss": 0.0302, + "step": 1344 + }, + { + "epoch": 7.5248078266946195, + "grad_norm": 1.176668146060272, + "learning_rate": 7.573684429750583e-07, + "loss": 0.0265, + "step": 1346 + }, + { + "epoch": 7.535988819007687, + "grad_norm": 0.7258573280389607, + "learning_rate": 7.510518276895234e-07, + "loss": 0.0257, + "step": 1348 + }, + { + "epoch": 7.547169811320755, + "grad_norm": 1.1195611459347754, + "learning_rate": 7.447570043603755e-07, + "loss": 0.0261, + "step": 1350 + }, + { + "epoch": 7.5583508036338225, + "grad_norm": 0.9527258409378455, + "learning_rate": 7.384840514213404e-07, + "loss": 0.0524, + "step": 1352 + }, + { + "epoch": 7.56953179594689, + "grad_norm": 0.7074898357644916, + "learning_rate": 7.322330470336314e-07, + "loss": 0.0205, + "step": 1354 + }, + { + "epoch": 7.580712788259958, + "grad_norm": 0.9361424266631929, + "learning_rate": 7.26004069084987e-07, + "loss": 0.0217, + "step": 1356 + }, + { + "epoch": 7.5918937805730256, + "grad_norm": 1.7048958108176762, + "learning_rate": 7.197971951886956e-07, + "loss": 0.0225, + "step": 1358 + }, + { + "epoch": 7.603074772886094, + "grad_norm": 0.8812767707258257, + "learning_rate": 7.13612502682623e-07, + "loss": 0.0196, + "step": 1360 + }, + { + "epoch": 7.614255765199162, + "grad_norm": 0.5682027618905875, + "learning_rate": 7.074500686282609e-07, + "loss": 0.019, + "step": 1362 + }, + { + "epoch": 7.6254367575122295, + "grad_norm": 0.4475598932931596, + "learning_rate": 7.013099698097539e-07, + "loss": 0.0171, + "step": 1364 + }, + { + "epoch": 7.636617749825297, + "grad_norm": 0.5527498039813922, + "learning_rate": 6.951922827329535e-07, + "loss": 0.0217, + "step": 1366 + }, + { + "epoch": 7.647798742138365, + "grad_norm": 0.7984442985333638, + "learning_rate": 6.890970836244574e-07, + "loss": 0.0361, + "step": 1368 + }, + { + "epoch": 7.6589797344514325, + "grad_norm": 0.624268450810696, + "learning_rate": 6.830244484306623e-07, + "loss": 0.0158, + "step": 1370 + }, + { + "epoch": 7.6701607267645, + "grad_norm": 0.7493822409267487, + "learning_rate": 6.769744528168207e-07, + "loss": 0.0286, + "step": 1372 + }, + { + "epoch": 7.681341719077568, + "grad_norm": 0.6787647092695418, + "learning_rate": 6.709471721660904e-07, + "loss": 0.0215, + "step": 1374 + }, + { + "epoch": 7.692522711390636, + "grad_norm": 0.7321502006735149, + "learning_rate": 6.649426815786045e-07, + "loss": 0.0311, + "step": 1376 + }, + { + "epoch": 7.703703703703704, + "grad_norm": 0.701610396870259, + "learning_rate": 6.589610558705284e-07, + "loss": 0.0235, + "step": 1378 + }, + { + "epoch": 7.714884696016772, + "grad_norm": 0.6530846520546149, + "learning_rate": 6.53002369573131e-07, + "loss": 0.0245, + "step": 1380 + }, + { + "epoch": 7.7260656883298395, + "grad_norm": 0.7531427984254183, + "learning_rate": 6.470666969318554e-07, + "loss": 0.0315, + "step": 1382 + }, + { + "epoch": 7.737246680642907, + "grad_norm": 0.7301669272251805, + "learning_rate": 6.41154111905393e-07, + "loss": 0.0225, + "step": 1384 + }, + { + "epoch": 7.748427672955975, + "grad_norm": 0.8707140120777088, + "learning_rate": 6.352646881647647e-07, + "loss": 0.0259, + "step": 1386 + }, + { + "epoch": 7.759608665269043, + "grad_norm": 0.837200588883093, + "learning_rate": 6.29398499092399e-07, + "loss": 0.0474, + "step": 1388 + }, + { + "epoch": 7.77078965758211, + "grad_norm": 0.973530488120086, + "learning_rate": 6.235556177812205e-07, + "loss": 0.0329, + "step": 1390 + }, + { + "epoch": 7.781970649895178, + "grad_norm": 0.5813627298678434, + "learning_rate": 6.177361170337376e-07, + "loss": 0.0194, + "step": 1392 + }, + { + "epoch": 7.793151642208246, + "grad_norm": 0.8597088367336019, + "learning_rate": 6.119400693611358e-07, + "loss": 0.0123, + "step": 1394 + }, + { + "epoch": 7.804332634521314, + "grad_norm": 0.8368570476462492, + "learning_rate": 6.061675469823763e-07, + "loss": 0.0227, + "step": 1396 + }, + { + "epoch": 7.815513626834382, + "grad_norm": 0.5203392914919558, + "learning_rate": 6.004186218232933e-07, + "loss": 0.0217, + "step": 1398 + }, + { + "epoch": 7.82669461914745, + "grad_norm": 0.8572153440435842, + "learning_rate": 5.946933655156976e-07, + "loss": 0.0294, + "step": 1400 + }, + { + "epoch": 7.837875611460517, + "grad_norm": 0.6862577628733875, + "learning_rate": 5.889918493964869e-07, + "loss": 0.0228, + "step": 1402 + }, + { + "epoch": 7.849056603773585, + "grad_norm": 0.7097594226614418, + "learning_rate": 5.833141445067541e-07, + "loss": 0.0113, + "step": 1404 + }, + { + "epoch": 7.860237596086653, + "grad_norm": 0.6322499286175502, + "learning_rate": 5.776603215909041e-07, + "loss": 0.0229, + "step": 1406 + }, + { + "epoch": 7.87141858839972, + "grad_norm": 0.6798739232739857, + "learning_rate": 5.720304510957722e-07, + "loss": 0.0257, + "step": 1408 + }, + { + "epoch": 7.882599580712788, + "grad_norm": 0.6568708401714163, + "learning_rate": 5.66424603169744e-07, + "loss": 0.0285, + "step": 1410 + }, + { + "epoch": 7.893780573025856, + "grad_norm": 1.1483908878505031, + "learning_rate": 5.608428476618843e-07, + "loss": 0.0235, + "step": 1412 + }, + { + "epoch": 7.904961565338924, + "grad_norm": 0.9297111790590921, + "learning_rate": 5.552852541210651e-07, + "loss": 0.022, + "step": 1414 + }, + { + "epoch": 7.916142557651992, + "grad_norm": 0.7288896652277049, + "learning_rate": 5.497518917950986e-07, + "loss": 0.033, + "step": 1416 + }, + { + "epoch": 7.92732354996506, + "grad_norm": 1.3241630685241197, + "learning_rate": 5.44242829629878e-07, + "loss": 0.0236, + "step": 1418 + }, + { + "epoch": 7.938504542278127, + "grad_norm": 0.6616696784338312, + "learning_rate": 5.387581362685112e-07, + "loss": 0.03, + "step": 1420 + }, + { + "epoch": 7.949685534591195, + "grad_norm": 0.9223806906428696, + "learning_rate": 5.332978800504742e-07, + "loss": 0.0234, + "step": 1422 + }, + { + "epoch": 7.960866526904263, + "grad_norm": 1.1302104401143789, + "learning_rate": 5.278621290107533e-07, + "loss": 0.0334, + "step": 1424 + }, + { + "epoch": 7.97204751921733, + "grad_norm": 0.6145924647383543, + "learning_rate": 5.224509508789987e-07, + "loss": 0.0205, + "step": 1426 + }, + { + "epoch": 7.983228511530398, + "grad_norm": 0.6724718918142113, + "learning_rate": 5.170644130786842e-07, + "loss": 0.0315, + "step": 1428 + }, + { + "epoch": 7.994409503843466, + "grad_norm": 0.5897709957691004, + "learning_rate": 5.117025827262598e-07, + "loss": 0.0189, + "step": 1430 + } + ], + "logging_steps": 2, + "max_steps": 1780, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 598197676277760.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}