{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9993836671802774, "eval_steps": 500, "global_step": 3244, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006163328197226503, "grad_norm": 18.283123016357422, "learning_rate": 6.153846153846154e-08, "loss": 1.9367, "step": 1 }, { "epoch": 0.0012326656394453005, "grad_norm": 18.946231842041016, "learning_rate": 1.2307692307692308e-07, "loss": 1.9557, "step": 2 }, { "epoch": 0.0018489984591679508, "grad_norm": 17.863901138305664, "learning_rate": 1.8461538461538464e-07, "loss": 1.9502, "step": 3 }, { "epoch": 0.002465331278890601, "grad_norm": 17.759328842163086, "learning_rate": 2.4615384615384616e-07, "loss": 1.9743, "step": 4 }, { "epoch": 0.0030816640986132513, "grad_norm": 17.28434181213379, "learning_rate": 3.0769230769230774e-07, "loss": 1.9215, "step": 5 }, { "epoch": 0.0036979969183359015, "grad_norm": 16.458284378051758, "learning_rate": 3.6923076923076927e-07, "loss": 1.96, "step": 6 }, { "epoch": 0.004314329738058551, "grad_norm": 15.54858112335205, "learning_rate": 4.307692307692308e-07, "loss": 1.9378, "step": 7 }, { "epoch": 0.004930662557781202, "grad_norm": 13.557045936584473, "learning_rate": 4.923076923076923e-07, "loss": 1.9347, "step": 8 }, { "epoch": 0.005546995377503852, "grad_norm": 12.082256317138672, "learning_rate": 5.53846153846154e-07, "loss": 1.8656, "step": 9 }, { "epoch": 0.0061633281972265025, "grad_norm": 9.061657905578613, "learning_rate": 6.153846153846155e-07, "loss": 1.8648, "step": 10 }, { "epoch": 0.006779661016949152, "grad_norm": 6.246723175048828, "learning_rate": 6.769230769230769e-07, "loss": 1.9179, "step": 11 }, { "epoch": 0.007395993836671803, "grad_norm": 7.135113716125488, "learning_rate": 7.384615384615385e-07, "loss": 1.8729, "step": 12 }, { "epoch": 0.008012326656394453, "grad_norm": 8.535818099975586, "learning_rate": 8.000000000000001e-07, "loss": 1.7982, "step": 13 }, { "epoch": 0.008628659476117103, "grad_norm": 11.601216316223145, "learning_rate": 8.615384615384616e-07, "loss": 1.9343, "step": 14 }, { "epoch": 0.009244992295839754, "grad_norm": 9.720977783203125, "learning_rate": 9.230769230769232e-07, "loss": 1.8258, "step": 15 }, { "epoch": 0.009861325115562404, "grad_norm": 8.743847846984863, "learning_rate": 9.846153846153847e-07, "loss": 1.8506, "step": 16 }, { "epoch": 0.010477657935285054, "grad_norm": 6.64108419418335, "learning_rate": 1.0461538461538463e-06, "loss": 1.8593, "step": 17 }, { "epoch": 0.011093990755007704, "grad_norm": 4.951357841491699, "learning_rate": 1.107692307692308e-06, "loss": 1.8172, "step": 18 }, { "epoch": 0.011710323574730355, "grad_norm": 4.0312819480896, "learning_rate": 1.1692307692307693e-06, "loss": 1.8289, "step": 19 }, { "epoch": 0.012326656394453005, "grad_norm": 4.08912992477417, "learning_rate": 1.230769230769231e-06, "loss": 1.7789, "step": 20 }, { "epoch": 0.012942989214175655, "grad_norm": 3.7903993129730225, "learning_rate": 1.2923076923076924e-06, "loss": 1.8056, "step": 21 }, { "epoch": 0.013559322033898305, "grad_norm": 3.1444079875946045, "learning_rate": 1.3538461538461538e-06, "loss": 1.7244, "step": 22 }, { "epoch": 0.014175654853620955, "grad_norm": 3.313861131668091, "learning_rate": 1.4153846153846155e-06, "loss": 1.7461, "step": 23 }, { "epoch": 0.014791987673343606, "grad_norm": 2.7172651290893555, "learning_rate": 1.476923076923077e-06, "loss": 1.7905, "step": 24 }, { "epoch": 0.015408320493066256, "grad_norm": 2.6085026264190674, "learning_rate": 1.5384615384615387e-06, "loss": 1.8212, "step": 25 }, { "epoch": 0.016024653312788906, "grad_norm": 2.484985113143921, "learning_rate": 1.6000000000000001e-06, "loss": 1.8062, "step": 26 }, { "epoch": 0.016640986132511557, "grad_norm": 2.6136233806610107, "learning_rate": 1.6615384615384616e-06, "loss": 1.7101, "step": 27 }, { "epoch": 0.017257318952234205, "grad_norm": 2.9175522327423096, "learning_rate": 1.7230769230769232e-06, "loss": 1.7577, "step": 28 }, { "epoch": 0.017873651771956857, "grad_norm": 2.6146135330200195, "learning_rate": 1.7846153846153846e-06, "loss": 1.775, "step": 29 }, { "epoch": 0.01848998459167951, "grad_norm": 2.8538455963134766, "learning_rate": 1.8461538461538465e-06, "loss": 1.7441, "step": 30 }, { "epoch": 0.019106317411402157, "grad_norm": 2.6471569538116455, "learning_rate": 1.907692307692308e-06, "loss": 1.7708, "step": 31 }, { "epoch": 0.019722650231124808, "grad_norm": 2.458494186401367, "learning_rate": 1.9692307692307693e-06, "loss": 1.7105, "step": 32 }, { "epoch": 0.020338983050847456, "grad_norm": 2.479487895965576, "learning_rate": 2.030769230769231e-06, "loss": 1.7297, "step": 33 }, { "epoch": 0.020955315870570108, "grad_norm": 2.721555709838867, "learning_rate": 2.0923076923076926e-06, "loss": 1.8167, "step": 34 }, { "epoch": 0.02157164869029276, "grad_norm": 2.142219066619873, "learning_rate": 2.153846153846154e-06, "loss": 1.7339, "step": 35 }, { "epoch": 0.022187981510015407, "grad_norm": 2.2020745277404785, "learning_rate": 2.215384615384616e-06, "loss": 1.7379, "step": 36 }, { "epoch": 0.02280431432973806, "grad_norm": 2.1203973293304443, "learning_rate": 2.276923076923077e-06, "loss": 1.7364, "step": 37 }, { "epoch": 0.02342064714946071, "grad_norm": 1.9414070844650269, "learning_rate": 2.3384615384615387e-06, "loss": 1.6987, "step": 38 }, { "epoch": 0.02403697996918336, "grad_norm": 2.148810386657715, "learning_rate": 2.4000000000000003e-06, "loss": 1.7789, "step": 39 }, { "epoch": 0.02465331278890601, "grad_norm": 2.27530574798584, "learning_rate": 2.461538461538462e-06, "loss": 1.6623, "step": 40 }, { "epoch": 0.025269645608628658, "grad_norm": 2.198756694793701, "learning_rate": 2.523076923076923e-06, "loss": 1.7178, "step": 41 }, { "epoch": 0.02588597842835131, "grad_norm": 2.029705047607422, "learning_rate": 2.584615384615385e-06, "loss": 1.7701, "step": 42 }, { "epoch": 0.02650231124807396, "grad_norm": 2.2430293560028076, "learning_rate": 2.6461538461538464e-06, "loss": 1.7731, "step": 43 }, { "epoch": 0.02711864406779661, "grad_norm": 1.9643375873565674, "learning_rate": 2.7076923076923076e-06, "loss": 1.7308, "step": 44 }, { "epoch": 0.02773497688751926, "grad_norm": 1.9275065660476685, "learning_rate": 2.7692307692307697e-06, "loss": 1.7122, "step": 45 }, { "epoch": 0.02835130970724191, "grad_norm": 2.019418954849243, "learning_rate": 2.830769230769231e-06, "loss": 1.6499, "step": 46 }, { "epoch": 0.02896764252696456, "grad_norm": 1.8190032243728638, "learning_rate": 2.8923076923076925e-06, "loss": 1.7056, "step": 47 }, { "epoch": 0.029583975346687212, "grad_norm": 2.035541296005249, "learning_rate": 2.953846153846154e-06, "loss": 1.7581, "step": 48 }, { "epoch": 0.03020030816640986, "grad_norm": 1.7920801639556885, "learning_rate": 3.0153846153846154e-06, "loss": 1.7001, "step": 49 }, { "epoch": 0.030816640986132512, "grad_norm": 1.994023084640503, "learning_rate": 3.0769230769230774e-06, "loss": 1.7239, "step": 50 }, { "epoch": 0.03143297380585516, "grad_norm": 1.997628927230835, "learning_rate": 3.1384615384615386e-06, "loss": 1.7965, "step": 51 }, { "epoch": 0.03204930662557781, "grad_norm": 1.9447797536849976, "learning_rate": 3.2000000000000003e-06, "loss": 1.7471, "step": 52 }, { "epoch": 0.03266563944530046, "grad_norm": 2.0363621711730957, "learning_rate": 3.2615384615384615e-06, "loss": 1.6917, "step": 53 }, { "epoch": 0.033281972265023115, "grad_norm": 1.9079309701919556, "learning_rate": 3.323076923076923e-06, "loss": 1.7219, "step": 54 }, { "epoch": 0.03389830508474576, "grad_norm": 1.8836143016815186, "learning_rate": 3.384615384615385e-06, "loss": 1.6925, "step": 55 }, { "epoch": 0.03451463790446841, "grad_norm": 1.8466883897781372, "learning_rate": 3.4461538461538464e-06, "loss": 1.7067, "step": 56 }, { "epoch": 0.035130970724191066, "grad_norm": 2.005443572998047, "learning_rate": 3.507692307692308e-06, "loss": 1.7282, "step": 57 }, { "epoch": 0.035747303543913714, "grad_norm": 1.9682645797729492, "learning_rate": 3.5692307692307692e-06, "loss": 1.6803, "step": 58 }, { "epoch": 0.03636363636363636, "grad_norm": 1.9016236066818237, "learning_rate": 3.630769230769231e-06, "loss": 1.7499, "step": 59 }, { "epoch": 0.03697996918335902, "grad_norm": 1.9661368131637573, "learning_rate": 3.692307692307693e-06, "loss": 1.7559, "step": 60 }, { "epoch": 0.037596302003081665, "grad_norm": 2.085322141647339, "learning_rate": 3.753846153846154e-06, "loss": 1.7302, "step": 61 }, { "epoch": 0.03821263482280431, "grad_norm": 1.9989895820617676, "learning_rate": 3.815384615384616e-06, "loss": 1.7078, "step": 62 }, { "epoch": 0.03882896764252696, "grad_norm": 1.825740933418274, "learning_rate": 3.876923076923077e-06, "loss": 1.7043, "step": 63 }, { "epoch": 0.039445300462249616, "grad_norm": 2.006073474884033, "learning_rate": 3.938461538461539e-06, "loss": 1.7016, "step": 64 }, { "epoch": 0.040061633281972264, "grad_norm": 1.9252952337265015, "learning_rate": 4.000000000000001e-06, "loss": 1.746, "step": 65 }, { "epoch": 0.04067796610169491, "grad_norm": 1.960313320159912, "learning_rate": 4.061538461538462e-06, "loss": 1.7106, "step": 66 }, { "epoch": 0.04129429892141757, "grad_norm": 1.9012987613677979, "learning_rate": 4.123076923076924e-06, "loss": 1.6545, "step": 67 }, { "epoch": 0.041910631741140215, "grad_norm": 2.0893990993499756, "learning_rate": 4.184615384615385e-06, "loss": 1.6829, "step": 68 }, { "epoch": 0.042526964560862864, "grad_norm": 1.980640172958374, "learning_rate": 4.246153846153846e-06, "loss": 1.7721, "step": 69 }, { "epoch": 0.04314329738058552, "grad_norm": 1.882873296737671, "learning_rate": 4.307692307692308e-06, "loss": 1.6841, "step": 70 }, { "epoch": 0.04375963020030817, "grad_norm": 1.8705075979232788, "learning_rate": 4.36923076923077e-06, "loss": 1.709, "step": 71 }, { "epoch": 0.044375963020030815, "grad_norm": 2.001063585281372, "learning_rate": 4.430769230769232e-06, "loss": 1.7173, "step": 72 }, { "epoch": 0.04499229583975347, "grad_norm": 1.961666464805603, "learning_rate": 4.492307692307693e-06, "loss": 1.6596, "step": 73 }, { "epoch": 0.04560862865947612, "grad_norm": 1.894849181175232, "learning_rate": 4.553846153846154e-06, "loss": 1.6286, "step": 74 }, { "epoch": 0.046224961479198766, "grad_norm": 2.0797834396362305, "learning_rate": 4.615384615384616e-06, "loss": 1.6802, "step": 75 }, { "epoch": 0.04684129429892142, "grad_norm": 1.7766090631484985, "learning_rate": 4.676923076923077e-06, "loss": 1.6067, "step": 76 }, { "epoch": 0.04745762711864407, "grad_norm": 1.9216660261154175, "learning_rate": 4.738461538461539e-06, "loss": 1.6541, "step": 77 }, { "epoch": 0.04807395993836672, "grad_norm": 1.8954193592071533, "learning_rate": 4.800000000000001e-06, "loss": 1.6725, "step": 78 }, { "epoch": 0.048690292758089365, "grad_norm": 2.0106375217437744, "learning_rate": 4.861538461538462e-06, "loss": 1.6328, "step": 79 }, { "epoch": 0.04930662557781202, "grad_norm": 1.8704357147216797, "learning_rate": 4.923076923076924e-06, "loss": 1.6116, "step": 80 }, { "epoch": 0.04992295839753467, "grad_norm": 2.1793038845062256, "learning_rate": 4.984615384615385e-06, "loss": 1.6703, "step": 81 }, { "epoch": 0.050539291217257316, "grad_norm": 1.966057538986206, "learning_rate": 5.046153846153846e-06, "loss": 1.6565, "step": 82 }, { "epoch": 0.05115562403697997, "grad_norm": 2.1816067695617676, "learning_rate": 5.1076923076923075e-06, "loss": 1.7122, "step": 83 }, { "epoch": 0.05177195685670262, "grad_norm": 1.8907414674758911, "learning_rate": 5.16923076923077e-06, "loss": 1.633, "step": 84 }, { "epoch": 0.05238828967642527, "grad_norm": 1.957214117050171, "learning_rate": 5.230769230769232e-06, "loss": 1.6571, "step": 85 }, { "epoch": 0.05300462249614792, "grad_norm": 1.983808994293213, "learning_rate": 5.292307692307693e-06, "loss": 1.7002, "step": 86 }, { "epoch": 0.05362095531587057, "grad_norm": 2.0325801372528076, "learning_rate": 5.353846153846154e-06, "loss": 1.6982, "step": 87 }, { "epoch": 0.05423728813559322, "grad_norm": 2.0177230834960938, "learning_rate": 5.415384615384615e-06, "loss": 1.6717, "step": 88 }, { "epoch": 0.054853620955315874, "grad_norm": 2.146578788757324, "learning_rate": 5.476923076923077e-06, "loss": 1.7126, "step": 89 }, { "epoch": 0.05546995377503852, "grad_norm": 1.9034457206726074, "learning_rate": 5.538461538461539e-06, "loss": 1.6631, "step": 90 }, { "epoch": 0.05608628659476117, "grad_norm": 2.109832286834717, "learning_rate": 5.600000000000001e-06, "loss": 1.7038, "step": 91 }, { "epoch": 0.05670261941448382, "grad_norm": 1.89127516746521, "learning_rate": 5.661538461538462e-06, "loss": 1.6389, "step": 92 }, { "epoch": 0.05731895223420647, "grad_norm": 1.980305790901184, "learning_rate": 5.723076923076923e-06, "loss": 1.7275, "step": 93 }, { "epoch": 0.05793528505392912, "grad_norm": 1.9199168682098389, "learning_rate": 5.784615384615385e-06, "loss": 1.6594, "step": 94 }, { "epoch": 0.05855161787365177, "grad_norm": 1.990004301071167, "learning_rate": 5.846153846153847e-06, "loss": 1.623, "step": 95 }, { "epoch": 0.059167950693374424, "grad_norm": 2.0493931770324707, "learning_rate": 5.907692307692308e-06, "loss": 1.6637, "step": 96 }, { "epoch": 0.05978428351309707, "grad_norm": 1.9087932109832764, "learning_rate": 5.9692307692307695e-06, "loss": 1.7582, "step": 97 }, { "epoch": 0.06040061633281972, "grad_norm": 2.247390031814575, "learning_rate": 6.030769230769231e-06, "loss": 1.755, "step": 98 }, { "epoch": 0.061016949152542375, "grad_norm": 1.8937528133392334, "learning_rate": 6.092307692307693e-06, "loss": 1.6809, "step": 99 }, { "epoch": 0.061633281972265024, "grad_norm": 2.01607608795166, "learning_rate": 6.153846153846155e-06, "loss": 1.6627, "step": 100 }, { "epoch": 0.06224961479198767, "grad_norm": 1.9090261459350586, "learning_rate": 6.215384615384615e-06, "loss": 1.6922, "step": 101 }, { "epoch": 0.06286594761171033, "grad_norm": 1.9379462003707886, "learning_rate": 6.276923076923077e-06, "loss": 1.7481, "step": 102 }, { "epoch": 0.06348228043143297, "grad_norm": 2.0171878337860107, "learning_rate": 6.3384615384615385e-06, "loss": 1.616, "step": 103 }, { "epoch": 0.06409861325115562, "grad_norm": 2.0259618759155273, "learning_rate": 6.4000000000000006e-06, "loss": 1.6998, "step": 104 }, { "epoch": 0.06471494607087827, "grad_norm": 1.912516713142395, "learning_rate": 6.461538461538463e-06, "loss": 1.761, "step": 105 }, { "epoch": 0.06533127889060092, "grad_norm": 1.8912975788116455, "learning_rate": 6.523076923076923e-06, "loss": 1.6717, "step": 106 }, { "epoch": 0.06594761171032358, "grad_norm": 1.8712334632873535, "learning_rate": 6.584615384615385e-06, "loss": 1.6727, "step": 107 }, { "epoch": 0.06656394453004623, "grad_norm": 2.164719581604004, "learning_rate": 6.646153846153846e-06, "loss": 1.7102, "step": 108 }, { "epoch": 0.06718027734976888, "grad_norm": 1.9096243381500244, "learning_rate": 6.707692307692308e-06, "loss": 1.6774, "step": 109 }, { "epoch": 0.06779661016949153, "grad_norm": 1.9874730110168457, "learning_rate": 6.76923076923077e-06, "loss": 1.6728, "step": 110 }, { "epoch": 0.06841294298921417, "grad_norm": 1.9436291456222534, "learning_rate": 6.830769230769231e-06, "loss": 1.7058, "step": 111 }, { "epoch": 0.06902927580893682, "grad_norm": 1.894932508468628, "learning_rate": 6.892307692307693e-06, "loss": 1.7173, "step": 112 }, { "epoch": 0.06964560862865947, "grad_norm": 1.938931941986084, "learning_rate": 6.953846153846154e-06, "loss": 1.6985, "step": 113 }, { "epoch": 0.07026194144838213, "grad_norm": 1.9768579006195068, "learning_rate": 7.015384615384616e-06, "loss": 1.6411, "step": 114 }, { "epoch": 0.07087827426810478, "grad_norm": 1.8404351472854614, "learning_rate": 7.076923076923078e-06, "loss": 1.6333, "step": 115 }, { "epoch": 0.07149460708782743, "grad_norm": 1.9638370275497437, "learning_rate": 7.1384615384615385e-06, "loss": 1.6913, "step": 116 }, { "epoch": 0.07211093990755008, "grad_norm": 1.8498730659484863, "learning_rate": 7.2000000000000005e-06, "loss": 1.6164, "step": 117 }, { "epoch": 0.07272727272727272, "grad_norm": 1.9184679985046387, "learning_rate": 7.261538461538462e-06, "loss": 1.7153, "step": 118 }, { "epoch": 0.07334360554699537, "grad_norm": 1.7618049383163452, "learning_rate": 7.323076923076924e-06, "loss": 1.6561, "step": 119 }, { "epoch": 0.07395993836671803, "grad_norm": 1.894404649734497, "learning_rate": 7.384615384615386e-06, "loss": 1.6705, "step": 120 }, { "epoch": 0.07457627118644068, "grad_norm": 1.813152551651001, "learning_rate": 7.446153846153846e-06, "loss": 1.7003, "step": 121 }, { "epoch": 0.07519260400616333, "grad_norm": 1.8237017393112183, "learning_rate": 7.507692307692308e-06, "loss": 1.6992, "step": 122 }, { "epoch": 0.07580893682588598, "grad_norm": 1.883804202079773, "learning_rate": 7.5692307692307695e-06, "loss": 1.6957, "step": 123 }, { "epoch": 0.07642526964560863, "grad_norm": 1.7882566452026367, "learning_rate": 7.630769230769232e-06, "loss": 1.6152, "step": 124 }, { "epoch": 0.07704160246533127, "grad_norm": 1.9625492095947266, "learning_rate": 7.692307692307694e-06, "loss": 1.6875, "step": 125 }, { "epoch": 0.07765793528505392, "grad_norm": 1.8058832883834839, "learning_rate": 7.753846153846155e-06, "loss": 1.6658, "step": 126 }, { "epoch": 0.07827426810477658, "grad_norm": 1.9218006134033203, "learning_rate": 7.815384615384616e-06, "loss": 1.6772, "step": 127 }, { "epoch": 0.07889060092449923, "grad_norm": 1.9369494915008545, "learning_rate": 7.876923076923077e-06, "loss": 1.6381, "step": 128 }, { "epoch": 0.07950693374422188, "grad_norm": 2.0018672943115234, "learning_rate": 7.93846153846154e-06, "loss": 1.7309, "step": 129 }, { "epoch": 0.08012326656394453, "grad_norm": 1.785011649131775, "learning_rate": 8.000000000000001e-06, "loss": 1.6822, "step": 130 }, { "epoch": 0.08073959938366718, "grad_norm": 1.791548490524292, "learning_rate": 8.061538461538463e-06, "loss": 1.666, "step": 131 }, { "epoch": 0.08135593220338982, "grad_norm": 2.0245301723480225, "learning_rate": 8.123076923076924e-06, "loss": 1.6309, "step": 132 }, { "epoch": 0.08197226502311249, "grad_norm": 1.8726131916046143, "learning_rate": 8.184615384615385e-06, "loss": 1.6149, "step": 133 }, { "epoch": 0.08258859784283513, "grad_norm": 1.8501847982406616, "learning_rate": 8.246153846153848e-06, "loss": 1.6536, "step": 134 }, { "epoch": 0.08320493066255778, "grad_norm": 2.203555107116699, "learning_rate": 8.307692307692309e-06, "loss": 1.6539, "step": 135 }, { "epoch": 0.08382126348228043, "grad_norm": 1.9053564071655273, "learning_rate": 8.36923076923077e-06, "loss": 1.7, "step": 136 }, { "epoch": 0.08443759630200308, "grad_norm": 1.8981451988220215, "learning_rate": 8.430769230769231e-06, "loss": 1.6684, "step": 137 }, { "epoch": 0.08505392912172573, "grad_norm": 2.0209877490997314, "learning_rate": 8.492307692307693e-06, "loss": 1.6921, "step": 138 }, { "epoch": 0.08567026194144839, "grad_norm": 1.7909303903579712, "learning_rate": 8.553846153846156e-06, "loss": 1.6617, "step": 139 }, { "epoch": 0.08628659476117104, "grad_norm": 1.912179708480835, "learning_rate": 8.615384615384617e-06, "loss": 1.6708, "step": 140 }, { "epoch": 0.08690292758089369, "grad_norm": 1.8235849142074585, "learning_rate": 8.676923076923078e-06, "loss": 1.7039, "step": 141 }, { "epoch": 0.08751926040061633, "grad_norm": 1.9197677373886108, "learning_rate": 8.73846153846154e-06, "loss": 1.7472, "step": 142 }, { "epoch": 0.08813559322033898, "grad_norm": 1.9008625745773315, "learning_rate": 8.8e-06, "loss": 1.7531, "step": 143 }, { "epoch": 0.08875192604006163, "grad_norm": 1.8651905059814453, "learning_rate": 8.861538461538463e-06, "loss": 1.7262, "step": 144 }, { "epoch": 0.08936825885978428, "grad_norm": 2.1121091842651367, "learning_rate": 8.923076923076925e-06, "loss": 1.6441, "step": 145 }, { "epoch": 0.08998459167950694, "grad_norm": 1.7415376901626587, "learning_rate": 8.984615384615386e-06, "loss": 1.6712, "step": 146 }, { "epoch": 0.09060092449922959, "grad_norm": 1.9746865034103394, "learning_rate": 9.046153846153847e-06, "loss": 1.6518, "step": 147 }, { "epoch": 0.09121725731895224, "grad_norm": 1.9373316764831543, "learning_rate": 9.107692307692308e-06, "loss": 1.6361, "step": 148 }, { "epoch": 0.09183359013867488, "grad_norm": 1.9080982208251953, "learning_rate": 9.169230769230771e-06, "loss": 1.6265, "step": 149 }, { "epoch": 0.09244992295839753, "grad_norm": 1.7338404655456543, "learning_rate": 9.230769230769232e-06, "loss": 1.6829, "step": 150 }, { "epoch": 0.09306625577812018, "grad_norm": 2.0784664154052734, "learning_rate": 9.292307692307694e-06, "loss": 1.66, "step": 151 }, { "epoch": 0.09368258859784284, "grad_norm": 1.8862324953079224, "learning_rate": 9.353846153846155e-06, "loss": 1.6137, "step": 152 }, { "epoch": 0.09429892141756549, "grad_norm": 1.9064992666244507, "learning_rate": 9.415384615384616e-06, "loss": 1.6307, "step": 153 }, { "epoch": 0.09491525423728814, "grad_norm": 1.8554366827011108, "learning_rate": 9.476923076923079e-06, "loss": 1.6703, "step": 154 }, { "epoch": 0.09553158705701079, "grad_norm": 1.771305799484253, "learning_rate": 9.53846153846154e-06, "loss": 1.6755, "step": 155 }, { "epoch": 0.09614791987673343, "grad_norm": 1.8853998184204102, "learning_rate": 9.600000000000001e-06, "loss": 1.6707, "step": 156 }, { "epoch": 0.09676425269645608, "grad_norm": 1.9528155326843262, "learning_rate": 9.661538461538462e-06, "loss": 1.7336, "step": 157 }, { "epoch": 0.09738058551617873, "grad_norm": 1.7515678405761719, "learning_rate": 9.723076923076924e-06, "loss": 1.6043, "step": 158 }, { "epoch": 0.09799691833590139, "grad_norm": 1.7086735963821411, "learning_rate": 9.784615384615387e-06, "loss": 1.6793, "step": 159 }, { "epoch": 0.09861325115562404, "grad_norm": 1.699203610420227, "learning_rate": 9.846153846153848e-06, "loss": 1.6565, "step": 160 }, { "epoch": 0.09922958397534669, "grad_norm": 1.6627018451690674, "learning_rate": 9.907692307692309e-06, "loss": 1.6271, "step": 161 }, { "epoch": 0.09984591679506934, "grad_norm": 1.6584526300430298, "learning_rate": 9.96923076923077e-06, "loss": 1.6329, "step": 162 }, { "epoch": 0.10046224961479198, "grad_norm": 1.6759963035583496, "learning_rate": 1.0030769230769231e-05, "loss": 1.6385, "step": 163 }, { "epoch": 0.10107858243451463, "grad_norm": 1.793668508529663, "learning_rate": 1.0092307692307693e-05, "loss": 1.6845, "step": 164 }, { "epoch": 0.1016949152542373, "grad_norm": 1.764163851737976, "learning_rate": 1.0153846153846154e-05, "loss": 1.7034, "step": 165 }, { "epoch": 0.10231124807395994, "grad_norm": 1.7434356212615967, "learning_rate": 1.0215384615384615e-05, "loss": 1.6406, "step": 166 }, { "epoch": 0.10292758089368259, "grad_norm": 1.6339836120605469, "learning_rate": 1.0276923076923078e-05, "loss": 1.6519, "step": 167 }, { "epoch": 0.10354391371340524, "grad_norm": 1.8387812376022339, "learning_rate": 1.033846153846154e-05, "loss": 1.5937, "step": 168 }, { "epoch": 0.10416024653312789, "grad_norm": 1.7533265352249146, "learning_rate": 1.04e-05, "loss": 1.6583, "step": 169 }, { "epoch": 0.10477657935285054, "grad_norm": 1.729021668434143, "learning_rate": 1.0461538461538463e-05, "loss": 1.6407, "step": 170 }, { "epoch": 0.10539291217257318, "grad_norm": 1.8352646827697754, "learning_rate": 1.0523076923076924e-05, "loss": 1.6755, "step": 171 }, { "epoch": 0.10600924499229585, "grad_norm": 1.6396636962890625, "learning_rate": 1.0584615384615386e-05, "loss": 1.6169, "step": 172 }, { "epoch": 0.1066255778120185, "grad_norm": 1.7374365329742432, "learning_rate": 1.0646153846153845e-05, "loss": 1.7692, "step": 173 }, { "epoch": 0.10724191063174114, "grad_norm": 1.720800518989563, "learning_rate": 1.0707692307692308e-05, "loss": 1.6125, "step": 174 }, { "epoch": 0.10785824345146379, "grad_norm": 1.74252450466156, "learning_rate": 1.076923076923077e-05, "loss": 1.6468, "step": 175 }, { "epoch": 0.10847457627118644, "grad_norm": 1.7793411016464233, "learning_rate": 1.083076923076923e-05, "loss": 1.6277, "step": 176 }, { "epoch": 0.10909090909090909, "grad_norm": 1.7517365217208862, "learning_rate": 1.0892307692307693e-05, "loss": 1.6613, "step": 177 }, { "epoch": 0.10970724191063175, "grad_norm": 1.6644283533096313, "learning_rate": 1.0953846153846155e-05, "loss": 1.6787, "step": 178 }, { "epoch": 0.1103235747303544, "grad_norm": 1.829753041267395, "learning_rate": 1.1015384615384616e-05, "loss": 1.6597, "step": 179 }, { "epoch": 0.11093990755007704, "grad_norm": 1.7197543382644653, "learning_rate": 1.1076923076923079e-05, "loss": 1.6225, "step": 180 }, { "epoch": 0.11155624036979969, "grad_norm": 1.6154543161392212, "learning_rate": 1.113846153846154e-05, "loss": 1.6493, "step": 181 }, { "epoch": 0.11217257318952234, "grad_norm": 1.7888453006744385, "learning_rate": 1.1200000000000001e-05, "loss": 1.7947, "step": 182 }, { "epoch": 0.11278890600924499, "grad_norm": 1.7465466260910034, "learning_rate": 1.126153846153846e-05, "loss": 1.6935, "step": 183 }, { "epoch": 0.11340523882896764, "grad_norm": 1.6538766622543335, "learning_rate": 1.1323076923076924e-05, "loss": 1.6137, "step": 184 }, { "epoch": 0.1140215716486903, "grad_norm": 1.871999740600586, "learning_rate": 1.1384615384615385e-05, "loss": 1.6067, "step": 185 }, { "epoch": 0.11463790446841295, "grad_norm": 1.6542673110961914, "learning_rate": 1.1446153846153846e-05, "loss": 1.6342, "step": 186 }, { "epoch": 0.1152542372881356, "grad_norm": 1.742908239364624, "learning_rate": 1.1507692307692309e-05, "loss": 1.6082, "step": 187 }, { "epoch": 0.11587057010785824, "grad_norm": 1.7705897092819214, "learning_rate": 1.156923076923077e-05, "loss": 1.6477, "step": 188 }, { "epoch": 0.11648690292758089, "grad_norm": 1.6397745609283447, "learning_rate": 1.1630769230769231e-05, "loss": 1.6578, "step": 189 }, { "epoch": 0.11710323574730354, "grad_norm": 1.781221628189087, "learning_rate": 1.1692307692307694e-05, "loss": 1.6463, "step": 190 }, { "epoch": 0.1177195685670262, "grad_norm": 1.6814526319503784, "learning_rate": 1.1753846153846155e-05, "loss": 1.7068, "step": 191 }, { "epoch": 0.11833590138674885, "grad_norm": 2.04496431350708, "learning_rate": 1.1815384615384617e-05, "loss": 1.6708, "step": 192 }, { "epoch": 0.1189522342064715, "grad_norm": 1.6994445323944092, "learning_rate": 1.1876923076923076e-05, "loss": 1.687, "step": 193 }, { "epoch": 0.11956856702619414, "grad_norm": 1.555846929550171, "learning_rate": 1.1938461538461539e-05, "loss": 1.5689, "step": 194 }, { "epoch": 0.12018489984591679, "grad_norm": 1.586464285850525, "learning_rate": 1.2e-05, "loss": 1.5716, "step": 195 }, { "epoch": 0.12080123266563944, "grad_norm": 1.8393712043762207, "learning_rate": 1.2061538461538462e-05, "loss": 1.6435, "step": 196 }, { "epoch": 0.12141756548536209, "grad_norm": 1.6719666719436646, "learning_rate": 1.2123076923076924e-05, "loss": 1.6788, "step": 197 }, { "epoch": 0.12203389830508475, "grad_norm": 1.5561131238937378, "learning_rate": 1.2184615384615386e-05, "loss": 1.6612, "step": 198 }, { "epoch": 0.1226502311248074, "grad_norm": 1.618941307067871, "learning_rate": 1.2246153846153847e-05, "loss": 1.6165, "step": 199 }, { "epoch": 0.12326656394453005, "grad_norm": 1.6859337091445923, "learning_rate": 1.230769230769231e-05, "loss": 1.6222, "step": 200 }, { "epoch": 0.1238828967642527, "grad_norm": 1.6043035984039307, "learning_rate": 1.2369230769230771e-05, "loss": 1.5917, "step": 201 }, { "epoch": 0.12449922958397534, "grad_norm": 1.6755459308624268, "learning_rate": 1.243076923076923e-05, "loss": 1.6903, "step": 202 }, { "epoch": 0.125115562403698, "grad_norm": 1.629734754562378, "learning_rate": 1.2492307692307692e-05, "loss": 1.5891, "step": 203 }, { "epoch": 0.12573189522342065, "grad_norm": 1.5861685276031494, "learning_rate": 1.2553846153846155e-05, "loss": 1.6449, "step": 204 }, { "epoch": 0.1263482280431433, "grad_norm": 1.7582676410675049, "learning_rate": 1.2615384615384616e-05, "loss": 1.6413, "step": 205 }, { "epoch": 0.12696456086286595, "grad_norm": 1.671907901763916, "learning_rate": 1.2676923076923077e-05, "loss": 1.665, "step": 206 }, { "epoch": 0.1275808936825886, "grad_norm": 1.5016672611236572, "learning_rate": 1.273846153846154e-05, "loss": 1.5984, "step": 207 }, { "epoch": 0.12819722650231125, "grad_norm": 1.8237395286560059, "learning_rate": 1.2800000000000001e-05, "loss": 1.6277, "step": 208 }, { "epoch": 0.1288135593220339, "grad_norm": 1.6548969745635986, "learning_rate": 1.2861538461538462e-05, "loss": 1.6669, "step": 209 }, { "epoch": 0.12942989214175654, "grad_norm": 1.676841378211975, "learning_rate": 1.2923076923076925e-05, "loss": 1.6438, "step": 210 }, { "epoch": 0.1300462249614792, "grad_norm": 1.6994240283966064, "learning_rate": 1.2984615384615386e-05, "loss": 1.6235, "step": 211 }, { "epoch": 0.13066255778120184, "grad_norm": 1.649195909500122, "learning_rate": 1.3046153846153846e-05, "loss": 1.6541, "step": 212 }, { "epoch": 0.13127889060092449, "grad_norm": 1.574959397315979, "learning_rate": 1.3107692307692307e-05, "loss": 1.6366, "step": 213 }, { "epoch": 0.13189522342064716, "grad_norm": 1.736695408821106, "learning_rate": 1.316923076923077e-05, "loss": 1.6522, "step": 214 }, { "epoch": 0.1325115562403698, "grad_norm": 1.607884168624878, "learning_rate": 1.3230769230769231e-05, "loss": 1.6279, "step": 215 }, { "epoch": 0.13312788906009246, "grad_norm": 1.5464998483657837, "learning_rate": 1.3292307692307692e-05, "loss": 1.6424, "step": 216 }, { "epoch": 0.1337442218798151, "grad_norm": 1.6006733179092407, "learning_rate": 1.3353846153846155e-05, "loss": 1.628, "step": 217 }, { "epoch": 0.13436055469953775, "grad_norm": 1.74956476688385, "learning_rate": 1.3415384615384617e-05, "loss": 1.6504, "step": 218 }, { "epoch": 0.1349768875192604, "grad_norm": 1.6868929862976074, "learning_rate": 1.3476923076923078e-05, "loss": 1.6724, "step": 219 }, { "epoch": 0.13559322033898305, "grad_norm": 1.535910725593567, "learning_rate": 1.353846153846154e-05, "loss": 1.6288, "step": 220 }, { "epoch": 0.1362095531587057, "grad_norm": 1.6340734958648682, "learning_rate": 1.3600000000000002e-05, "loss": 1.6178, "step": 221 }, { "epoch": 0.13682588597842835, "grad_norm": 1.6039944887161255, "learning_rate": 1.3661538461538461e-05, "loss": 1.714, "step": 222 }, { "epoch": 0.137442218798151, "grad_norm": 1.6758949756622314, "learning_rate": 1.3723076923076923e-05, "loss": 1.5851, "step": 223 }, { "epoch": 0.13805855161787364, "grad_norm": 1.5623993873596191, "learning_rate": 1.3784615384615386e-05, "loss": 1.6269, "step": 224 }, { "epoch": 0.1386748844375963, "grad_norm": 1.5382038354873657, "learning_rate": 1.3846153846153847e-05, "loss": 1.5651, "step": 225 }, { "epoch": 0.13929121725731894, "grad_norm": 1.5378073453903198, "learning_rate": 1.3907692307692308e-05, "loss": 1.6517, "step": 226 }, { "epoch": 0.13990755007704161, "grad_norm": 1.558674931526184, "learning_rate": 1.3969230769230771e-05, "loss": 1.65, "step": 227 }, { "epoch": 0.14052388289676426, "grad_norm": 1.6412041187286377, "learning_rate": 1.4030769230769232e-05, "loss": 1.6279, "step": 228 }, { "epoch": 0.1411402157164869, "grad_norm": 1.5764039754867554, "learning_rate": 1.4092307692307693e-05, "loss": 1.638, "step": 229 }, { "epoch": 0.14175654853620956, "grad_norm": 1.5572047233581543, "learning_rate": 1.4153846153846156e-05, "loss": 1.5969, "step": 230 }, { "epoch": 0.1423728813559322, "grad_norm": 1.74053955078125, "learning_rate": 1.4215384615384617e-05, "loss": 1.6423, "step": 231 }, { "epoch": 0.14298921417565486, "grad_norm": 1.7418849468231201, "learning_rate": 1.4276923076923077e-05, "loss": 1.6462, "step": 232 }, { "epoch": 0.1436055469953775, "grad_norm": 1.503605604171753, "learning_rate": 1.4338461538461538e-05, "loss": 1.642, "step": 233 }, { "epoch": 0.14422187981510015, "grad_norm": 1.5570054054260254, "learning_rate": 1.4400000000000001e-05, "loss": 1.6609, "step": 234 }, { "epoch": 0.1448382126348228, "grad_norm": 1.4636863470077515, "learning_rate": 1.4461538461538462e-05, "loss": 1.5636, "step": 235 }, { "epoch": 0.14545454545454545, "grad_norm": 1.6985546350479126, "learning_rate": 1.4523076923076923e-05, "loss": 1.5973, "step": 236 }, { "epoch": 0.1460708782742681, "grad_norm": 1.6968873739242554, "learning_rate": 1.4584615384615386e-05, "loss": 1.6671, "step": 237 }, { "epoch": 0.14668721109399074, "grad_norm": 1.7380080223083496, "learning_rate": 1.4646153846153848e-05, "loss": 1.6417, "step": 238 }, { "epoch": 0.1473035439137134, "grad_norm": 1.5503101348876953, "learning_rate": 1.4707692307692309e-05, "loss": 1.5822, "step": 239 }, { "epoch": 0.14791987673343607, "grad_norm": 1.5093276500701904, "learning_rate": 1.4769230769230772e-05, "loss": 1.6187, "step": 240 }, { "epoch": 0.14853620955315872, "grad_norm": 1.664287805557251, "learning_rate": 1.4830769230769233e-05, "loss": 1.6451, "step": 241 }, { "epoch": 0.14915254237288136, "grad_norm": 1.5850409269332886, "learning_rate": 1.4892307692307692e-05, "loss": 1.6461, "step": 242 }, { "epoch": 0.149768875192604, "grad_norm": 1.5930163860321045, "learning_rate": 1.4953846153846154e-05, "loss": 1.6342, "step": 243 }, { "epoch": 0.15038520801232666, "grad_norm": 1.4707285165786743, "learning_rate": 1.5015384615384617e-05, "loss": 1.6295, "step": 244 }, { "epoch": 0.1510015408320493, "grad_norm": 1.805741786956787, "learning_rate": 1.5076923076923078e-05, "loss": 1.6892, "step": 245 }, { "epoch": 0.15161787365177196, "grad_norm": 1.5307151079177856, "learning_rate": 1.5138461538461539e-05, "loss": 1.636, "step": 246 }, { "epoch": 0.1522342064714946, "grad_norm": 1.4822207689285278, "learning_rate": 1.5200000000000002e-05, "loss": 1.5938, "step": 247 }, { "epoch": 0.15285053929121725, "grad_norm": 1.64521324634552, "learning_rate": 1.5261538461538465e-05, "loss": 1.6216, "step": 248 }, { "epoch": 0.1534668721109399, "grad_norm": 1.508833646774292, "learning_rate": 1.5323076923076926e-05, "loss": 1.5582, "step": 249 }, { "epoch": 0.15408320493066255, "grad_norm": 1.56405508518219, "learning_rate": 1.5384615384615387e-05, "loss": 1.6127, "step": 250 }, { "epoch": 0.1546995377503852, "grad_norm": 1.4395506381988525, "learning_rate": 1.544615384615385e-05, "loss": 1.6121, "step": 251 }, { "epoch": 0.15531587057010784, "grad_norm": 1.5709831714630127, "learning_rate": 1.550769230769231e-05, "loss": 1.7473, "step": 252 }, { "epoch": 0.15593220338983052, "grad_norm": 1.4856370687484741, "learning_rate": 1.556923076923077e-05, "loss": 1.6983, "step": 253 }, { "epoch": 0.15654853620955317, "grad_norm": 1.5266668796539307, "learning_rate": 1.5630769230769232e-05, "loss": 1.6868, "step": 254 }, { "epoch": 0.15716486902927582, "grad_norm": 1.4639393091201782, "learning_rate": 1.5692307692307693e-05, "loss": 1.5946, "step": 255 }, { "epoch": 0.15778120184899846, "grad_norm": 1.4167178869247437, "learning_rate": 1.5753846153846154e-05, "loss": 1.5721, "step": 256 }, { "epoch": 0.1583975346687211, "grad_norm": 1.550665259361267, "learning_rate": 1.5815384615384616e-05, "loss": 1.6882, "step": 257 }, { "epoch": 0.15901386748844376, "grad_norm": 1.3775755167007446, "learning_rate": 1.587692307692308e-05, "loss": 1.6099, "step": 258 }, { "epoch": 0.1596302003081664, "grad_norm": 1.5821239948272705, "learning_rate": 1.593846153846154e-05, "loss": 1.6625, "step": 259 }, { "epoch": 0.16024653312788906, "grad_norm": 1.4148032665252686, "learning_rate": 1.6000000000000003e-05, "loss": 1.5674, "step": 260 }, { "epoch": 0.1608628659476117, "grad_norm": 1.558368444442749, "learning_rate": 1.606153846153846e-05, "loss": 1.6189, "step": 261 }, { "epoch": 0.16147919876733435, "grad_norm": 1.5412561893463135, "learning_rate": 1.6123076923076925e-05, "loss": 1.6856, "step": 262 }, { "epoch": 0.162095531587057, "grad_norm": 1.441593050956726, "learning_rate": 1.6184615384615386e-05, "loss": 1.5986, "step": 263 }, { "epoch": 0.16271186440677965, "grad_norm": 1.616611361503601, "learning_rate": 1.6246153846153848e-05, "loss": 1.6249, "step": 264 }, { "epoch": 0.1633281972265023, "grad_norm": 1.4335848093032837, "learning_rate": 1.630769230769231e-05, "loss": 1.6237, "step": 265 }, { "epoch": 0.16394453004622497, "grad_norm": 1.6855947971343994, "learning_rate": 1.636923076923077e-05, "loss": 1.6828, "step": 266 }, { "epoch": 0.16456086286594762, "grad_norm": 1.4556145668029785, "learning_rate": 1.643076923076923e-05, "loss": 1.6199, "step": 267 }, { "epoch": 0.16517719568567027, "grad_norm": 1.4786442518234253, "learning_rate": 1.6492307692307696e-05, "loss": 1.6326, "step": 268 }, { "epoch": 0.16579352850539292, "grad_norm": 1.4075632095336914, "learning_rate": 1.6553846153846157e-05, "loss": 1.6305, "step": 269 }, { "epoch": 0.16640986132511557, "grad_norm": 1.4632195234298706, "learning_rate": 1.6615384615384618e-05, "loss": 1.5632, "step": 270 }, { "epoch": 0.1670261941448382, "grad_norm": 1.3902661800384521, "learning_rate": 1.6676923076923076e-05, "loss": 1.5927, "step": 271 }, { "epoch": 0.16764252696456086, "grad_norm": 1.4905269145965576, "learning_rate": 1.673846153846154e-05, "loss": 1.6183, "step": 272 }, { "epoch": 0.1682588597842835, "grad_norm": 1.4212580919265747, "learning_rate": 1.6800000000000002e-05, "loss": 1.689, "step": 273 }, { "epoch": 0.16887519260400616, "grad_norm": 1.443870186805725, "learning_rate": 1.6861538461538463e-05, "loss": 1.6318, "step": 274 }, { "epoch": 0.1694915254237288, "grad_norm": 1.3670133352279663, "learning_rate": 1.6923076923076924e-05, "loss": 1.6088, "step": 275 }, { "epoch": 0.17010785824345145, "grad_norm": 1.529133915901184, "learning_rate": 1.6984615384615385e-05, "loss": 1.69, "step": 276 }, { "epoch": 0.1707241910631741, "grad_norm": 1.4097617864608765, "learning_rate": 1.7046153846153847e-05, "loss": 1.6318, "step": 277 }, { "epoch": 0.17134052388289678, "grad_norm": 1.4894518852233887, "learning_rate": 1.710769230769231e-05, "loss": 1.5926, "step": 278 }, { "epoch": 0.17195685670261943, "grad_norm": 1.4361993074417114, "learning_rate": 1.7169230769230772e-05, "loss": 1.5765, "step": 279 }, { "epoch": 0.17257318952234207, "grad_norm": 1.5445034503936768, "learning_rate": 1.7230769230769234e-05, "loss": 1.6191, "step": 280 }, { "epoch": 0.17318952234206472, "grad_norm": 1.3923152685165405, "learning_rate": 1.729230769230769e-05, "loss": 1.6101, "step": 281 }, { "epoch": 0.17380585516178737, "grad_norm": 1.4457353353500366, "learning_rate": 1.7353846153846156e-05, "loss": 1.6825, "step": 282 }, { "epoch": 0.17442218798151002, "grad_norm": 1.4225143194198608, "learning_rate": 1.7415384615384617e-05, "loss": 1.5981, "step": 283 }, { "epoch": 0.17503852080123267, "grad_norm": 1.3846886157989502, "learning_rate": 1.747692307692308e-05, "loss": 1.5473, "step": 284 }, { "epoch": 0.17565485362095531, "grad_norm": 1.428889274597168, "learning_rate": 1.753846153846154e-05, "loss": 1.6512, "step": 285 }, { "epoch": 0.17627118644067796, "grad_norm": 1.426888346672058, "learning_rate": 1.76e-05, "loss": 1.6474, "step": 286 }, { "epoch": 0.1768875192604006, "grad_norm": 1.4983881711959839, "learning_rate": 1.7661538461538462e-05, "loss": 1.6472, "step": 287 }, { "epoch": 0.17750385208012326, "grad_norm": 1.3838081359863281, "learning_rate": 1.7723076923076927e-05, "loss": 1.6553, "step": 288 }, { "epoch": 0.1781201848998459, "grad_norm": 1.3625962734222412, "learning_rate": 1.7784615384615388e-05, "loss": 1.6178, "step": 289 }, { "epoch": 0.17873651771956856, "grad_norm": 1.442751407623291, "learning_rate": 1.784615384615385e-05, "loss": 1.6365, "step": 290 }, { "epoch": 0.17935285053929123, "grad_norm": 1.4086443185806274, "learning_rate": 1.7907692307692307e-05, "loss": 1.6235, "step": 291 }, { "epoch": 0.17996918335901388, "grad_norm": 1.4534282684326172, "learning_rate": 1.796923076923077e-05, "loss": 1.717, "step": 292 }, { "epoch": 0.18058551617873653, "grad_norm": 1.3573397397994995, "learning_rate": 1.8030769230769233e-05, "loss": 1.634, "step": 293 }, { "epoch": 0.18120184899845918, "grad_norm": 1.421881914138794, "learning_rate": 1.8092307692307694e-05, "loss": 1.6785, "step": 294 }, { "epoch": 0.18181818181818182, "grad_norm": 1.3907103538513184, "learning_rate": 1.8153846153846155e-05, "loss": 1.657, "step": 295 }, { "epoch": 0.18243451463790447, "grad_norm": 1.4932299852371216, "learning_rate": 1.8215384615384616e-05, "loss": 1.6257, "step": 296 }, { "epoch": 0.18305084745762712, "grad_norm": 1.3402001857757568, "learning_rate": 1.8276923076923078e-05, "loss": 1.5934, "step": 297 }, { "epoch": 0.18366718027734977, "grad_norm": 1.457749366760254, "learning_rate": 1.8338461538461542e-05, "loss": 1.5782, "step": 298 }, { "epoch": 0.18428351309707242, "grad_norm": 1.4130915403366089, "learning_rate": 1.8400000000000003e-05, "loss": 1.5963, "step": 299 }, { "epoch": 0.18489984591679506, "grad_norm": 1.5611200332641602, "learning_rate": 1.8461538461538465e-05, "loss": 1.642, "step": 300 }, { "epoch": 0.1855161787365177, "grad_norm": 1.3795658349990845, "learning_rate": 1.8523076923076922e-05, "loss": 1.6563, "step": 301 }, { "epoch": 0.18613251155624036, "grad_norm": 1.4305459260940552, "learning_rate": 1.8584615384615387e-05, "loss": 1.6444, "step": 302 }, { "epoch": 0.186748844375963, "grad_norm": 1.3612898588180542, "learning_rate": 1.8646153846153848e-05, "loss": 1.6443, "step": 303 }, { "epoch": 0.18736517719568568, "grad_norm": 1.487234115600586, "learning_rate": 1.870769230769231e-05, "loss": 1.6719, "step": 304 }, { "epoch": 0.18798151001540833, "grad_norm": 1.3911621570587158, "learning_rate": 1.876923076923077e-05, "loss": 1.6248, "step": 305 }, { "epoch": 0.18859784283513098, "grad_norm": 1.3716368675231934, "learning_rate": 1.8830769230769232e-05, "loss": 1.5801, "step": 306 }, { "epoch": 0.18921417565485363, "grad_norm": 1.409889817237854, "learning_rate": 1.8892307692307693e-05, "loss": 1.6528, "step": 307 }, { "epoch": 0.18983050847457628, "grad_norm": 1.3550827503204346, "learning_rate": 1.8953846153846158e-05, "loss": 1.6326, "step": 308 }, { "epoch": 0.19044684129429892, "grad_norm": 1.3632606267929077, "learning_rate": 1.901538461538462e-05, "loss": 1.6014, "step": 309 }, { "epoch": 0.19106317411402157, "grad_norm": 1.4157477617263794, "learning_rate": 1.907692307692308e-05, "loss": 1.5859, "step": 310 }, { "epoch": 0.19167950693374422, "grad_norm": 1.381125569343567, "learning_rate": 1.9138461538461538e-05, "loss": 1.6281, "step": 311 }, { "epoch": 0.19229583975346687, "grad_norm": 1.3800766468048096, "learning_rate": 1.9200000000000003e-05, "loss": 1.6348, "step": 312 }, { "epoch": 0.19291217257318952, "grad_norm": 1.3425391912460327, "learning_rate": 1.9261538461538464e-05, "loss": 1.5904, "step": 313 }, { "epoch": 0.19352850539291216, "grad_norm": 1.358258605003357, "learning_rate": 1.9323076923076925e-05, "loss": 1.5912, "step": 314 }, { "epoch": 0.1941448382126348, "grad_norm": 1.3165721893310547, "learning_rate": 1.9384615384615386e-05, "loss": 1.5857, "step": 315 }, { "epoch": 0.19476117103235746, "grad_norm": 1.3274505138397217, "learning_rate": 1.9446153846153847e-05, "loss": 1.5857, "step": 316 }, { "epoch": 0.19537750385208014, "grad_norm": 1.3422726392745972, "learning_rate": 1.950769230769231e-05, "loss": 1.6138, "step": 317 }, { "epoch": 0.19599383667180278, "grad_norm": 1.3407599925994873, "learning_rate": 1.9569230769230773e-05, "loss": 1.6187, "step": 318 }, { "epoch": 0.19661016949152543, "grad_norm": 1.323610782623291, "learning_rate": 1.9630769230769234e-05, "loss": 1.6656, "step": 319 }, { "epoch": 0.19722650231124808, "grad_norm": 1.330980658531189, "learning_rate": 1.9692307692307696e-05, "loss": 1.588, "step": 320 }, { "epoch": 0.19784283513097073, "grad_norm": 1.4092925786972046, "learning_rate": 1.9753846153846153e-05, "loss": 1.6261, "step": 321 }, { "epoch": 0.19845916795069338, "grad_norm": 1.3522353172302246, "learning_rate": 1.9815384615384618e-05, "loss": 1.6458, "step": 322 }, { "epoch": 0.19907550077041603, "grad_norm": 1.3303834199905396, "learning_rate": 1.987692307692308e-05, "loss": 1.586, "step": 323 }, { "epoch": 0.19969183359013867, "grad_norm": 1.3448110818862915, "learning_rate": 1.993846153846154e-05, "loss": 1.6216, "step": 324 }, { "epoch": 0.20030816640986132, "grad_norm": 1.3356963396072388, "learning_rate": 2e-05, "loss": 1.5337, "step": 325 }, { "epoch": 0.20092449922958397, "grad_norm": 1.406883716583252, "learning_rate": 1.999999420836055e-05, "loss": 1.5738, "step": 326 }, { "epoch": 0.20154083204930662, "grad_norm": 1.3668279647827148, "learning_rate": 1.9999976833448903e-05, "loss": 1.5782, "step": 327 }, { "epoch": 0.20215716486902927, "grad_norm": 1.3366916179656982, "learning_rate": 1.9999947875285182e-05, "loss": 1.6068, "step": 328 }, { "epoch": 0.2027734976887519, "grad_norm": 1.3295947313308716, "learning_rate": 1.999990733390294e-05, "loss": 1.5614, "step": 329 }, { "epoch": 0.2033898305084746, "grad_norm": 1.3339965343475342, "learning_rate": 1.9999855209349128e-05, "loss": 1.6347, "step": 330 }, { "epoch": 0.20400616332819724, "grad_norm": 1.3420106172561646, "learning_rate": 1.9999791501684126e-05, "loss": 1.5883, "step": 331 }, { "epoch": 0.20462249614791989, "grad_norm": 1.4211523532867432, "learning_rate": 1.9999716210981736e-05, "loss": 1.6155, "step": 332 }, { "epoch": 0.20523882896764253, "grad_norm": 1.3816264867782593, "learning_rate": 1.9999629337329157e-05, "loss": 1.5964, "step": 333 }, { "epoch": 0.20585516178736518, "grad_norm": 1.425441026687622, "learning_rate": 1.999953088082702e-05, "loss": 1.627, "step": 334 }, { "epoch": 0.20647149460708783, "grad_norm": 1.3583567142486572, "learning_rate": 1.9999420841589377e-05, "loss": 1.6291, "step": 335 }, { "epoch": 0.20708782742681048, "grad_norm": 1.3295503854751587, "learning_rate": 1.9999299219743685e-05, "loss": 1.6068, "step": 336 }, { "epoch": 0.20770416024653313, "grad_norm": 1.3268381357192993, "learning_rate": 1.9999166015430816e-05, "loss": 1.5798, "step": 337 }, { "epoch": 0.20832049306625577, "grad_norm": 1.306710124015808, "learning_rate": 1.9999021228805077e-05, "loss": 1.5941, "step": 338 }, { "epoch": 0.20893682588597842, "grad_norm": 1.3095345497131348, "learning_rate": 1.999886486003417e-05, "loss": 1.5769, "step": 339 }, { "epoch": 0.20955315870570107, "grad_norm": 1.3329263925552368, "learning_rate": 1.999869690929922e-05, "loss": 1.6584, "step": 340 }, { "epoch": 0.21016949152542372, "grad_norm": 1.2900240421295166, "learning_rate": 1.9998517376794778e-05, "loss": 1.6344, "step": 341 }, { "epoch": 0.21078582434514637, "grad_norm": 1.3315945863723755, "learning_rate": 1.9998326262728792e-05, "loss": 1.6012, "step": 342 }, { "epoch": 0.21140215716486904, "grad_norm": 1.3844521045684814, "learning_rate": 1.9998123567322638e-05, "loss": 1.6281, "step": 343 }, { "epoch": 0.2120184899845917, "grad_norm": 1.2778468132019043, "learning_rate": 1.9997909290811105e-05, "loss": 1.5638, "step": 344 }, { "epoch": 0.21263482280431434, "grad_norm": 1.3523645401000977, "learning_rate": 1.9997683433442392e-05, "loss": 1.5883, "step": 345 }, { "epoch": 0.213251155624037, "grad_norm": 1.3121861219406128, "learning_rate": 1.999744599547812e-05, "loss": 1.6073, "step": 346 }, { "epoch": 0.21386748844375963, "grad_norm": 1.3167142868041992, "learning_rate": 1.9997196977193317e-05, "loss": 1.6022, "step": 347 }, { "epoch": 0.21448382126348228, "grad_norm": 1.3357988595962524, "learning_rate": 1.999693637887643e-05, "loss": 1.5933, "step": 348 }, { "epoch": 0.21510015408320493, "grad_norm": 1.2766566276550293, "learning_rate": 1.9996664200829322e-05, "loss": 1.542, "step": 349 }, { "epoch": 0.21571648690292758, "grad_norm": 1.3904541730880737, "learning_rate": 1.9996380443367252e-05, "loss": 1.6302, "step": 350 }, { "epoch": 0.21633281972265023, "grad_norm": 1.3405559062957764, "learning_rate": 1.9996085106818914e-05, "loss": 1.6005, "step": 351 }, { "epoch": 0.21694915254237288, "grad_norm": 1.348298192024231, "learning_rate": 1.99957781915264e-05, "loss": 1.6909, "step": 352 }, { "epoch": 0.21756548536209552, "grad_norm": 1.3109397888183594, "learning_rate": 1.999545969784522e-05, "loss": 1.534, "step": 353 }, { "epoch": 0.21818181818181817, "grad_norm": 1.2591431140899658, "learning_rate": 1.99951296261443e-05, "loss": 1.5943, "step": 354 }, { "epoch": 0.21879815100154082, "grad_norm": 1.3665069341659546, "learning_rate": 1.999478797680596e-05, "loss": 1.6444, "step": 355 }, { "epoch": 0.2194144838212635, "grad_norm": 1.3854387998580933, "learning_rate": 1.9994434750225947e-05, "loss": 1.6518, "step": 356 }, { "epoch": 0.22003081664098614, "grad_norm": 1.3044852018356323, "learning_rate": 1.9994069946813416e-05, "loss": 1.5971, "step": 357 }, { "epoch": 0.2206471494607088, "grad_norm": 1.3915233612060547, "learning_rate": 1.9993693566990925e-05, "loss": 1.633, "step": 358 }, { "epoch": 0.22126348228043144, "grad_norm": 1.3137407302856445, "learning_rate": 1.999330561119445e-05, "loss": 1.5717, "step": 359 }, { "epoch": 0.2218798151001541, "grad_norm": 1.3558845520019531, "learning_rate": 1.9992906079873363e-05, "loss": 1.6082, "step": 360 }, { "epoch": 0.22249614791987674, "grad_norm": 1.3518322706222534, "learning_rate": 1.999249497349046e-05, "loss": 1.6021, "step": 361 }, { "epoch": 0.22311248073959938, "grad_norm": 1.2853856086730957, "learning_rate": 1.9992072292521935e-05, "loss": 1.6153, "step": 362 }, { "epoch": 0.22372881355932203, "grad_norm": 1.3123093843460083, "learning_rate": 1.999163803745739e-05, "loss": 1.5856, "step": 363 }, { "epoch": 0.22434514637904468, "grad_norm": 1.3063842058181763, "learning_rate": 1.9991192208799836e-05, "loss": 1.6184, "step": 364 }, { "epoch": 0.22496147919876733, "grad_norm": 1.2838196754455566, "learning_rate": 1.9990734807065685e-05, "loss": 1.5878, "step": 365 }, { "epoch": 0.22557781201848998, "grad_norm": 1.3251609802246094, "learning_rate": 1.9990265832784764e-05, "loss": 1.6071, "step": 366 }, { "epoch": 0.22619414483821262, "grad_norm": 1.3536404371261597, "learning_rate": 1.9989785286500294e-05, "loss": 1.6189, "step": 367 }, { "epoch": 0.22681047765793527, "grad_norm": 1.2755295038223267, "learning_rate": 1.9989293168768912e-05, "loss": 1.5751, "step": 368 }, { "epoch": 0.22742681047765795, "grad_norm": 1.3265677690505981, "learning_rate": 1.998878948016064e-05, "loss": 1.6166, "step": 369 }, { "epoch": 0.2280431432973806, "grad_norm": 1.28499174118042, "learning_rate": 1.9988274221258926e-05, "loss": 1.5752, "step": 370 }, { "epoch": 0.22865947611710324, "grad_norm": 1.2985507249832153, "learning_rate": 1.9987747392660602e-05, "loss": 1.6127, "step": 371 }, { "epoch": 0.2292758089368259, "grad_norm": 1.3845150470733643, "learning_rate": 1.9987208994975914e-05, "loss": 1.6928, "step": 372 }, { "epoch": 0.22989214175654854, "grad_norm": 1.2768959999084473, "learning_rate": 1.9986659028828496e-05, "loss": 1.5763, "step": 373 }, { "epoch": 0.2305084745762712, "grad_norm": 1.284400463104248, "learning_rate": 1.998609749485539e-05, "loss": 1.5995, "step": 374 }, { "epoch": 0.23112480739599384, "grad_norm": 1.325767993927002, "learning_rate": 1.9985524393707046e-05, "loss": 1.5651, "step": 375 }, { "epoch": 0.23174114021571648, "grad_norm": 1.2712185382843018, "learning_rate": 1.9984939726047293e-05, "loss": 1.6215, "step": 376 }, { "epoch": 0.23235747303543913, "grad_norm": 1.3037066459655762, "learning_rate": 1.9984343492553374e-05, "loss": 1.6053, "step": 377 }, { "epoch": 0.23297380585516178, "grad_norm": 1.2590421438217163, "learning_rate": 1.9983735693915915e-05, "loss": 1.5933, "step": 378 }, { "epoch": 0.23359013867488443, "grad_norm": 1.323660969734192, "learning_rate": 1.9983116330838956e-05, "loss": 1.6098, "step": 379 }, { "epoch": 0.23420647149460708, "grad_norm": 1.2697356939315796, "learning_rate": 1.9982485404039913e-05, "loss": 1.6167, "step": 380 }, { "epoch": 0.23482280431432973, "grad_norm": 1.2741032838821411, "learning_rate": 1.9981842914249613e-05, "loss": 1.5941, "step": 381 }, { "epoch": 0.2354391371340524, "grad_norm": 1.3603581190109253, "learning_rate": 1.9981188862212267e-05, "loss": 1.6302, "step": 382 }, { "epoch": 0.23605546995377505, "grad_norm": 1.2568930387496948, "learning_rate": 1.998052324868548e-05, "loss": 1.5736, "step": 383 }, { "epoch": 0.2366718027734977, "grad_norm": 1.2936395406723022, "learning_rate": 1.9979846074440258e-05, "loss": 1.5967, "step": 384 }, { "epoch": 0.23728813559322035, "grad_norm": 1.2825345993041992, "learning_rate": 1.997915734026098e-05, "loss": 1.5754, "step": 385 }, { "epoch": 0.237904468412943, "grad_norm": 1.279841423034668, "learning_rate": 1.9978457046945437e-05, "loss": 1.574, "step": 386 }, { "epoch": 0.23852080123266564, "grad_norm": 1.295731544494629, "learning_rate": 1.997774519530479e-05, "loss": 1.5752, "step": 387 }, { "epoch": 0.2391371340523883, "grad_norm": 1.296617031097412, "learning_rate": 1.99770217861636e-05, "loss": 1.5669, "step": 388 }, { "epoch": 0.23975346687211094, "grad_norm": 1.3367897272109985, "learning_rate": 1.9976286820359814e-05, "loss": 1.6258, "step": 389 }, { "epoch": 0.24036979969183359, "grad_norm": 1.288947582244873, "learning_rate": 1.997554029874476e-05, "loss": 1.6328, "step": 390 }, { "epoch": 0.24098613251155623, "grad_norm": 1.290277361869812, "learning_rate": 1.9974782222183157e-05, "loss": 1.6135, "step": 391 }, { "epoch": 0.24160246533127888, "grad_norm": 1.2760404348373413, "learning_rate": 1.99740125915531e-05, "loss": 1.6506, "step": 392 }, { "epoch": 0.24221879815100153, "grad_norm": 1.2709912061691284, "learning_rate": 1.9973231407746084e-05, "loss": 1.6179, "step": 393 }, { "epoch": 0.24283513097072418, "grad_norm": 1.2396341562271118, "learning_rate": 1.997243867166697e-05, "loss": 1.6173, "step": 394 }, { "epoch": 0.24345146379044685, "grad_norm": 1.2633081674575806, "learning_rate": 1.9971634384234003e-05, "loss": 1.5467, "step": 395 }, { "epoch": 0.2440677966101695, "grad_norm": 1.2624627351760864, "learning_rate": 1.9970818546378817e-05, "loss": 1.544, "step": 396 }, { "epoch": 0.24468412942989215, "grad_norm": 1.2690812349319458, "learning_rate": 1.9969991159046423e-05, "loss": 1.5554, "step": 397 }, { "epoch": 0.2453004622496148, "grad_norm": 1.2628960609436035, "learning_rate": 1.9969152223195198e-05, "loss": 1.6391, "step": 398 }, { "epoch": 0.24591679506933745, "grad_norm": 1.2520534992218018, "learning_rate": 1.996830173979691e-05, "loss": 1.5608, "step": 399 }, { "epoch": 0.2465331278890601, "grad_norm": 1.2633273601531982, "learning_rate": 1.9967439709836694e-05, "loss": 1.5879, "step": 400 }, { "epoch": 0.24714946070878274, "grad_norm": 1.2758275270462036, "learning_rate": 1.996656613431307e-05, "loss": 1.5794, "step": 401 }, { "epoch": 0.2477657935285054, "grad_norm": 1.2048323154449463, "learning_rate": 1.9965681014237918e-05, "loss": 1.5352, "step": 402 }, { "epoch": 0.24838212634822804, "grad_norm": 1.2261072397232056, "learning_rate": 1.99647843506365e-05, "loss": 1.5147, "step": 403 }, { "epoch": 0.2489984591679507, "grad_norm": 1.3187477588653564, "learning_rate": 1.9963876144547448e-05, "loss": 1.626, "step": 404 }, { "epoch": 0.24961479198767333, "grad_norm": 1.303612232208252, "learning_rate": 1.9962956397022762e-05, "loss": 1.4956, "step": 405 }, { "epoch": 0.250231124807396, "grad_norm": 1.388525128364563, "learning_rate": 1.996202510912781e-05, "loss": 1.6097, "step": 406 }, { "epoch": 0.25084745762711863, "grad_norm": 1.2807857990264893, "learning_rate": 1.9961082281941325e-05, "loss": 1.6056, "step": 407 }, { "epoch": 0.2514637904468413, "grad_norm": 1.378028392791748, "learning_rate": 1.996012791655542e-05, "loss": 1.6161, "step": 408 }, { "epoch": 0.2520801232665639, "grad_norm": 1.271227478981018, "learning_rate": 1.9959162014075553e-05, "loss": 1.5674, "step": 409 }, { "epoch": 0.2526964560862866, "grad_norm": 1.3147228956222534, "learning_rate": 1.995818457562056e-05, "loss": 1.5622, "step": 410 }, { "epoch": 0.2533127889060092, "grad_norm": 1.4041773080825806, "learning_rate": 1.9957195602322637e-05, "loss": 1.621, "step": 411 }, { "epoch": 0.2539291217257319, "grad_norm": 1.3648581504821777, "learning_rate": 1.9956195095327334e-05, "loss": 1.6503, "step": 412 }, { "epoch": 0.2545454545454545, "grad_norm": 1.3132625818252563, "learning_rate": 1.995518305579357e-05, "loss": 1.5849, "step": 413 }, { "epoch": 0.2551617873651772, "grad_norm": 1.2900166511535645, "learning_rate": 1.9954159484893623e-05, "loss": 1.589, "step": 414 }, { "epoch": 0.25577812018489987, "grad_norm": 1.3345228433609009, "learning_rate": 1.9953124383813115e-05, "loss": 1.5829, "step": 415 }, { "epoch": 0.2563944530046225, "grad_norm": 1.2869850397109985, "learning_rate": 1.9952077753751037e-05, "loss": 1.5619, "step": 416 }, { "epoch": 0.25701078582434517, "grad_norm": 1.3288335800170898, "learning_rate": 1.9951019595919728e-05, "loss": 1.6031, "step": 417 }, { "epoch": 0.2576271186440678, "grad_norm": 1.2664883136749268, "learning_rate": 1.9949949911544885e-05, "loss": 1.6285, "step": 418 }, { "epoch": 0.25824345146379046, "grad_norm": 1.3004200458526611, "learning_rate": 1.9948868701865548e-05, "loss": 1.5755, "step": 419 }, { "epoch": 0.2588597842835131, "grad_norm": 1.2347030639648438, "learning_rate": 1.994777596813412e-05, "loss": 1.5205, "step": 420 }, { "epoch": 0.25947611710323576, "grad_norm": 1.2397122383117676, "learning_rate": 1.9946671711616337e-05, "loss": 1.5403, "step": 421 }, { "epoch": 0.2600924499229584, "grad_norm": 1.2547035217285156, "learning_rate": 1.9945555933591293e-05, "loss": 1.5082, "step": 422 }, { "epoch": 0.26070878274268106, "grad_norm": 1.3593640327453613, "learning_rate": 1.9944428635351426e-05, "loss": 1.6173, "step": 423 }, { "epoch": 0.2613251155624037, "grad_norm": 1.2993645668029785, "learning_rate": 1.9943289818202517e-05, "loss": 1.6217, "step": 424 }, { "epoch": 0.26194144838212635, "grad_norm": 1.2936800718307495, "learning_rate": 1.994213948346369e-05, "loss": 1.6201, "step": 425 }, { "epoch": 0.26255778120184897, "grad_norm": 1.2759329080581665, "learning_rate": 1.9940977632467408e-05, "loss": 1.6408, "step": 426 }, { "epoch": 0.26317411402157165, "grad_norm": 1.2894319295883179, "learning_rate": 1.9939804266559476e-05, "loss": 1.5712, "step": 427 }, { "epoch": 0.2637904468412943, "grad_norm": 1.3158437013626099, "learning_rate": 1.9938619387099035e-05, "loss": 1.6046, "step": 428 }, { "epoch": 0.26440677966101694, "grad_norm": 1.243251085281372, "learning_rate": 1.9937422995458568e-05, "loss": 1.5346, "step": 429 }, { "epoch": 0.2650231124807396, "grad_norm": 1.3151570558547974, "learning_rate": 1.9936215093023884e-05, "loss": 1.5658, "step": 430 }, { "epoch": 0.26563944530046224, "grad_norm": 1.2230660915374756, "learning_rate": 1.9934995681194133e-05, "loss": 1.5779, "step": 431 }, { "epoch": 0.2662557781201849, "grad_norm": 1.245911717414856, "learning_rate": 1.9933764761381792e-05, "loss": 1.6458, "step": 432 }, { "epoch": 0.26687211093990754, "grad_norm": 1.3029462099075317, "learning_rate": 1.9932522335012674e-05, "loss": 1.6242, "step": 433 }, { "epoch": 0.2674884437596302, "grad_norm": 1.267608642578125, "learning_rate": 1.9931268403525913e-05, "loss": 1.5705, "step": 434 }, { "epoch": 0.26810477657935283, "grad_norm": 1.2604044675827026, "learning_rate": 1.993000296837397e-05, "loss": 1.5525, "step": 435 }, { "epoch": 0.2687211093990755, "grad_norm": 1.3126895427703857, "learning_rate": 1.992872603102264e-05, "loss": 1.6054, "step": 436 }, { "epoch": 0.26933744221879813, "grad_norm": 1.2944269180297852, "learning_rate": 1.992743759295103e-05, "loss": 1.645, "step": 437 }, { "epoch": 0.2699537750385208, "grad_norm": 1.2287088632583618, "learning_rate": 1.9926137655651575e-05, "loss": 1.5964, "step": 438 }, { "epoch": 0.2705701078582434, "grad_norm": 1.2598721981048584, "learning_rate": 1.992482622063003e-05, "loss": 1.5895, "step": 439 }, { "epoch": 0.2711864406779661, "grad_norm": 1.2679294347763062, "learning_rate": 1.9923503289405467e-05, "loss": 1.6266, "step": 440 }, { "epoch": 0.2718027734976888, "grad_norm": 1.223596453666687, "learning_rate": 1.9922168863510275e-05, "loss": 1.5459, "step": 441 }, { "epoch": 0.2724191063174114, "grad_norm": 1.267927646636963, "learning_rate": 1.992082294449015e-05, "loss": 1.6334, "step": 442 }, { "epoch": 0.2730354391371341, "grad_norm": 1.243190050125122, "learning_rate": 1.9919465533904113e-05, "loss": 1.645, "step": 443 }, { "epoch": 0.2736517719568567, "grad_norm": 1.2241793870925903, "learning_rate": 1.9918096633324492e-05, "loss": 1.5469, "step": 444 }, { "epoch": 0.27426810477657937, "grad_norm": 1.2556524276733398, "learning_rate": 1.991671624433692e-05, "loss": 1.5667, "step": 445 }, { "epoch": 0.274884437596302, "grad_norm": 1.2456376552581787, "learning_rate": 1.991532436854034e-05, "loss": 1.5163, "step": 446 }, { "epoch": 0.27550077041602467, "grad_norm": 1.2353805303573608, "learning_rate": 1.9913921007547003e-05, "loss": 1.5856, "step": 447 }, { "epoch": 0.2761171032357473, "grad_norm": 1.2632793188095093, "learning_rate": 1.991250616298246e-05, "loss": 1.5893, "step": 448 }, { "epoch": 0.27673343605546996, "grad_norm": 1.3193575143814087, "learning_rate": 1.991107983648556e-05, "loss": 1.5571, "step": 449 }, { "epoch": 0.2773497688751926, "grad_norm": 1.2398138046264648, "learning_rate": 1.9909642029708466e-05, "loss": 1.5896, "step": 450 }, { "epoch": 0.27796610169491526, "grad_norm": 1.24996817111969, "learning_rate": 1.990819274431662e-05, "loss": 1.5628, "step": 451 }, { "epoch": 0.2785824345146379, "grad_norm": 1.226418375968933, "learning_rate": 1.990673198198878e-05, "loss": 1.5645, "step": 452 }, { "epoch": 0.27919876733436055, "grad_norm": 1.2438184022903442, "learning_rate": 1.9905259744416978e-05, "loss": 1.5599, "step": 453 }, { "epoch": 0.27981510015408323, "grad_norm": 1.25833261013031, "learning_rate": 1.9903776033306554e-05, "loss": 1.5899, "step": 454 }, { "epoch": 0.28043143297380585, "grad_norm": 1.280225396156311, "learning_rate": 1.990228085037613e-05, "loss": 1.5965, "step": 455 }, { "epoch": 0.2810477657935285, "grad_norm": 1.2606980800628662, "learning_rate": 1.9900774197357616e-05, "loss": 1.5974, "step": 456 }, { "epoch": 0.28166409861325115, "grad_norm": 1.2578256130218506, "learning_rate": 1.9899256075996213e-05, "loss": 1.6011, "step": 457 }, { "epoch": 0.2822804314329738, "grad_norm": 1.302246332168579, "learning_rate": 1.9897726488050405e-05, "loss": 1.562, "step": 458 }, { "epoch": 0.28289676425269644, "grad_norm": 1.2207574844360352, "learning_rate": 1.9896185435291953e-05, "loss": 1.5063, "step": 459 }, { "epoch": 0.2835130970724191, "grad_norm": 1.2724609375, "learning_rate": 1.9894632919505905e-05, "loss": 1.6667, "step": 460 }, { "epoch": 0.28412942989214174, "grad_norm": 1.2021819353103638, "learning_rate": 1.9893068942490576e-05, "loss": 1.5572, "step": 461 }, { "epoch": 0.2847457627118644, "grad_norm": 1.2160234451293945, "learning_rate": 1.9891493506057573e-05, "loss": 1.5693, "step": 462 }, { "epoch": 0.28536209553158703, "grad_norm": 1.2717446088790894, "learning_rate": 1.988990661203176e-05, "loss": 1.5821, "step": 463 }, { "epoch": 0.2859784283513097, "grad_norm": 1.2610220909118652, "learning_rate": 1.9888308262251286e-05, "loss": 1.5741, "step": 464 }, { "epoch": 0.28659476117103233, "grad_norm": 1.2978510856628418, "learning_rate": 1.9886698458567563e-05, "loss": 1.5888, "step": 465 }, { "epoch": 0.287211093990755, "grad_norm": 1.3481318950653076, "learning_rate": 1.9885077202845272e-05, "loss": 1.5477, "step": 466 }, { "epoch": 0.2878274268104777, "grad_norm": 1.2139685153961182, "learning_rate": 1.9883444496962355e-05, "loss": 1.5309, "step": 467 }, { "epoch": 0.2884437596302003, "grad_norm": 1.292353630065918, "learning_rate": 1.988180034281003e-05, "loss": 1.5003, "step": 468 }, { "epoch": 0.289060092449923, "grad_norm": 1.2215583324432373, "learning_rate": 1.9880144742292756e-05, "loss": 1.5637, "step": 469 }, { "epoch": 0.2896764252696456, "grad_norm": 1.2642847299575806, "learning_rate": 1.9878477697328266e-05, "loss": 1.5009, "step": 470 }, { "epoch": 0.2902927580893683, "grad_norm": 1.320966362953186, "learning_rate": 1.9876799209847544e-05, "loss": 1.532, "step": 471 }, { "epoch": 0.2909090909090909, "grad_norm": 1.2611565589904785, "learning_rate": 1.9875109281794828e-05, "loss": 1.5374, "step": 472 }, { "epoch": 0.29152542372881357, "grad_norm": 1.2412095069885254, "learning_rate": 1.987340791512761e-05, "loss": 1.5967, "step": 473 }, { "epoch": 0.2921417565485362, "grad_norm": 1.2784647941589355, "learning_rate": 1.987169511181663e-05, "loss": 1.5829, "step": 474 }, { "epoch": 0.29275808936825887, "grad_norm": 1.229587435722351, "learning_rate": 1.986997087384588e-05, "loss": 1.5635, "step": 475 }, { "epoch": 0.2933744221879815, "grad_norm": 1.299207329750061, "learning_rate": 1.9868235203212585e-05, "loss": 1.6308, "step": 476 }, { "epoch": 0.29399075500770416, "grad_norm": 1.3318393230438232, "learning_rate": 1.9866488101927226e-05, "loss": 1.6149, "step": 477 }, { "epoch": 0.2946070878274268, "grad_norm": 1.2325513362884521, "learning_rate": 1.986472957201352e-05, "loss": 1.5621, "step": 478 }, { "epoch": 0.29522342064714946, "grad_norm": 1.2294633388519287, "learning_rate": 1.9862959615508418e-05, "loss": 1.5363, "step": 479 }, { "epoch": 0.29583975346687214, "grad_norm": 1.2546963691711426, "learning_rate": 1.986117823446211e-05, "loss": 1.5244, "step": 480 }, { "epoch": 0.29645608628659476, "grad_norm": 1.2527563571929932, "learning_rate": 1.985938543093802e-05, "loss": 1.59, "step": 481 }, { "epoch": 0.29707241910631743, "grad_norm": 1.284782886505127, "learning_rate": 1.9857581207012805e-05, "loss": 1.5553, "step": 482 }, { "epoch": 0.29768875192604005, "grad_norm": 1.3089871406555176, "learning_rate": 1.9855765564776343e-05, "loss": 1.5725, "step": 483 }, { "epoch": 0.2983050847457627, "grad_norm": 1.256629467010498, "learning_rate": 1.985393850633175e-05, "loss": 1.5009, "step": 484 }, { "epoch": 0.29892141756548535, "grad_norm": 1.2267287969589233, "learning_rate": 1.985210003379535e-05, "loss": 1.5447, "step": 485 }, { "epoch": 0.299537750385208, "grad_norm": 1.2312675714492798, "learning_rate": 1.9850250149296704e-05, "loss": 1.563, "step": 486 }, { "epoch": 0.30015408320493064, "grad_norm": 1.240964412689209, "learning_rate": 1.9848388854978582e-05, "loss": 1.6205, "step": 487 }, { "epoch": 0.3007704160246533, "grad_norm": 1.2435818910598755, "learning_rate": 1.9846516152996974e-05, "loss": 1.5569, "step": 488 }, { "epoch": 0.30138674884437594, "grad_norm": 1.255699872970581, "learning_rate": 1.9844632045521083e-05, "loss": 1.6527, "step": 489 }, { "epoch": 0.3020030816640986, "grad_norm": 1.2148468494415283, "learning_rate": 1.984273653473332e-05, "loss": 1.5274, "step": 490 }, { "epoch": 0.30261941448382124, "grad_norm": 1.2130820751190186, "learning_rate": 1.984082962282931e-05, "loss": 1.5744, "step": 491 }, { "epoch": 0.3032357473035439, "grad_norm": 1.2665996551513672, "learning_rate": 1.983891131201789e-05, "loss": 1.5881, "step": 492 }, { "epoch": 0.3038520801232666, "grad_norm": 1.200944185256958, "learning_rate": 1.9836981604521077e-05, "loss": 1.5802, "step": 493 }, { "epoch": 0.3044684129429892, "grad_norm": 1.2124292850494385, "learning_rate": 1.983504050257412e-05, "loss": 1.5589, "step": 494 }, { "epoch": 0.3050847457627119, "grad_norm": 1.2255533933639526, "learning_rate": 1.9833088008425442e-05, "loss": 1.5986, "step": 495 }, { "epoch": 0.3057010785824345, "grad_norm": 1.2360104322433472, "learning_rate": 1.9831124124336677e-05, "loss": 1.6239, "step": 496 }, { "epoch": 0.3063174114021572, "grad_norm": 1.2451645135879517, "learning_rate": 1.9829148852582644e-05, "loss": 1.5553, "step": 497 }, { "epoch": 0.3069337442218798, "grad_norm": 1.2352529764175415, "learning_rate": 1.9827162195451353e-05, "loss": 1.6061, "step": 498 }, { "epoch": 0.3075500770416025, "grad_norm": 1.2615388631820679, "learning_rate": 1.982516415524401e-05, "loss": 1.5012, "step": 499 }, { "epoch": 0.3081664098613251, "grad_norm": 1.2173899412155151, "learning_rate": 1.9823154734274997e-05, "loss": 1.5109, "step": 500 }, { "epoch": 0.3087827426810478, "grad_norm": 1.1987580060958862, "learning_rate": 1.9821133934871883e-05, "loss": 1.5361, "step": 501 }, { "epoch": 0.3093990755007704, "grad_norm": 1.2084031105041504, "learning_rate": 1.981910175937542e-05, "loss": 1.4903, "step": 502 }, { "epoch": 0.31001540832049307, "grad_norm": 1.275362491607666, "learning_rate": 1.981705821013953e-05, "loss": 1.5575, "step": 503 }, { "epoch": 0.3106317411402157, "grad_norm": 1.2545727491378784, "learning_rate": 1.981500328953131e-05, "loss": 1.5957, "step": 504 }, { "epoch": 0.31124807395993837, "grad_norm": 1.2482452392578125, "learning_rate": 1.9812936999931034e-05, "loss": 1.5904, "step": 505 }, { "epoch": 0.31186440677966104, "grad_norm": 1.2415482997894287, "learning_rate": 1.9810859343732145e-05, "loss": 1.562, "step": 506 }, { "epoch": 0.31248073959938366, "grad_norm": 1.2128409147262573, "learning_rate": 1.9808770323341252e-05, "loss": 1.5355, "step": 507 }, { "epoch": 0.31309707241910634, "grad_norm": 1.3116565942764282, "learning_rate": 1.980666994117812e-05, "loss": 1.5479, "step": 508 }, { "epoch": 0.31371340523882896, "grad_norm": 1.2857582569122314, "learning_rate": 1.9804558199675683e-05, "loss": 1.5864, "step": 509 }, { "epoch": 0.31432973805855163, "grad_norm": 1.277963399887085, "learning_rate": 1.980243510128003e-05, "loss": 1.5655, "step": 510 }, { "epoch": 0.31494607087827425, "grad_norm": 1.2479331493377686, "learning_rate": 1.9800300648450407e-05, "loss": 1.564, "step": 511 }, { "epoch": 0.31556240369799693, "grad_norm": 1.2554208040237427, "learning_rate": 1.9798154843659205e-05, "loss": 1.5821, "step": 512 }, { "epoch": 0.31617873651771955, "grad_norm": 1.2144819498062134, "learning_rate": 1.9795997689391974e-05, "loss": 1.508, "step": 513 }, { "epoch": 0.3167950693374422, "grad_norm": 1.2817399501800537, "learning_rate": 1.9793829188147406e-05, "loss": 1.5145, "step": 514 }, { "epoch": 0.31741140215716485, "grad_norm": 1.3228282928466797, "learning_rate": 1.9791649342437334e-05, "loss": 1.6075, "step": 515 }, { "epoch": 0.3180277349768875, "grad_norm": 4.738117218017578, "learning_rate": 1.9789458154786736e-05, "loss": 1.5334, "step": 516 }, { "epoch": 0.31864406779661014, "grad_norm": 1.4434213638305664, "learning_rate": 1.9787255627733728e-05, "loss": 1.5197, "step": 517 }, { "epoch": 0.3192604006163328, "grad_norm": 1.2672399282455444, "learning_rate": 1.978504176382955e-05, "loss": 1.6119, "step": 518 }, { "epoch": 0.3198767334360555, "grad_norm": 1.298224925994873, "learning_rate": 1.978281656563859e-05, "loss": 1.6241, "step": 519 }, { "epoch": 0.3204930662557781, "grad_norm": 1.2640724182128906, "learning_rate": 1.978058003573836e-05, "loss": 1.5353, "step": 520 }, { "epoch": 0.3211093990755008, "grad_norm": 1.3141738176345825, "learning_rate": 1.9778332176719483e-05, "loss": 1.5921, "step": 521 }, { "epoch": 0.3217257318952234, "grad_norm": 1.2383464574813843, "learning_rate": 1.9776072991185726e-05, "loss": 1.573, "step": 522 }, { "epoch": 0.3223420647149461, "grad_norm": 1.329386830329895, "learning_rate": 1.9773802481753966e-05, "loss": 1.526, "step": 523 }, { "epoch": 0.3229583975346687, "grad_norm": 1.2244924306869507, "learning_rate": 1.9771520651054196e-05, "loss": 1.5665, "step": 524 }, { "epoch": 0.3235747303543914, "grad_norm": 1.2765448093414307, "learning_rate": 1.9769227501729525e-05, "loss": 1.4767, "step": 525 }, { "epoch": 0.324191063174114, "grad_norm": 1.2142835855484009, "learning_rate": 1.9766923036436164e-05, "loss": 1.523, "step": 526 }, { "epoch": 0.3248073959938367, "grad_norm": 1.3011589050292969, "learning_rate": 1.9764607257843453e-05, "loss": 1.5796, "step": 527 }, { "epoch": 0.3254237288135593, "grad_norm": 1.2366423606872559, "learning_rate": 1.9762280168633815e-05, "loss": 1.5646, "step": 528 }, { "epoch": 0.326040061633282, "grad_norm": 1.2498053312301636, "learning_rate": 1.975994177150278e-05, "loss": 1.5839, "step": 529 }, { "epoch": 0.3266563944530046, "grad_norm": 1.2395589351654053, "learning_rate": 1.975759206915899e-05, "loss": 1.5645, "step": 530 }, { "epoch": 0.32727272727272727, "grad_norm": 1.2005155086517334, "learning_rate": 1.9755231064324155e-05, "loss": 1.4686, "step": 531 }, { "epoch": 0.32788906009244995, "grad_norm": 1.2245957851409912, "learning_rate": 1.9752858759733104e-05, "loss": 1.5045, "step": 532 }, { "epoch": 0.32850539291217257, "grad_norm": 1.365159034729004, "learning_rate": 1.9750475158133738e-05, "loss": 1.5804, "step": 533 }, { "epoch": 0.32912172573189524, "grad_norm": 1.2590810060501099, "learning_rate": 1.9748080262287052e-05, "loss": 1.5909, "step": 534 }, { "epoch": 0.32973805855161786, "grad_norm": 1.1912745237350464, "learning_rate": 1.974567407496712e-05, "loss": 1.4674, "step": 535 }, { "epoch": 0.33035439137134054, "grad_norm": 1.2382985353469849, "learning_rate": 1.9743256598961098e-05, "loss": 1.5309, "step": 536 }, { "epoch": 0.33097072419106316, "grad_norm": 1.2057017087936401, "learning_rate": 1.9740827837069214e-05, "loss": 1.5348, "step": 537 }, { "epoch": 0.33158705701078584, "grad_norm": 1.226598858833313, "learning_rate": 1.973838779210477e-05, "loss": 1.5343, "step": 538 }, { "epoch": 0.33220338983050846, "grad_norm": 1.2329484224319458, "learning_rate": 1.9735936466894136e-05, "loss": 1.5229, "step": 539 }, { "epoch": 0.33281972265023113, "grad_norm": 1.2674819231033325, "learning_rate": 1.9733473864276752e-05, "loss": 1.5262, "step": 540 }, { "epoch": 0.33343605546995375, "grad_norm": 1.2331186532974243, "learning_rate": 1.9730999987105127e-05, "loss": 1.5125, "step": 541 }, { "epoch": 0.3340523882896764, "grad_norm": 1.2392030954360962, "learning_rate": 1.972851483824481e-05, "loss": 1.5472, "step": 542 }, { "epoch": 0.3346687211093991, "grad_norm": 1.2197105884552002, "learning_rate": 1.9726018420574423e-05, "loss": 1.5754, "step": 543 }, { "epoch": 0.3352850539291217, "grad_norm": 1.2638585567474365, "learning_rate": 1.972351073698564e-05, "loss": 1.5665, "step": 544 }, { "epoch": 0.3359013867488444, "grad_norm": 1.2815210819244385, "learning_rate": 1.9720991790383175e-05, "loss": 1.5271, "step": 545 }, { "epoch": 0.336517719568567, "grad_norm": 1.2389568090438843, "learning_rate": 1.9718461583684794e-05, "loss": 1.5475, "step": 546 }, { "epoch": 0.3371340523882897, "grad_norm": 1.2511855363845825, "learning_rate": 1.9715920119821315e-05, "loss": 1.6067, "step": 547 }, { "epoch": 0.3377503852080123, "grad_norm": 1.258094072341919, "learning_rate": 1.9713367401736576e-05, "loss": 1.5579, "step": 548 }, { "epoch": 0.338366718027735, "grad_norm": 1.2243613004684448, "learning_rate": 1.9710803432387466e-05, "loss": 1.6078, "step": 549 }, { "epoch": 0.3389830508474576, "grad_norm": 1.2125722169876099, "learning_rate": 1.97082282147439e-05, "loss": 1.5274, "step": 550 }, { "epoch": 0.3395993836671803, "grad_norm": 1.2413229942321777, "learning_rate": 1.970564175178883e-05, "loss": 1.5662, "step": 551 }, { "epoch": 0.3402157164869029, "grad_norm": 1.2161388397216797, "learning_rate": 1.9703044046518224e-05, "loss": 1.499, "step": 552 }, { "epoch": 0.3408320493066256, "grad_norm": 1.1758136749267578, "learning_rate": 1.9700435101941077e-05, "loss": 1.5068, "step": 553 }, { "epoch": 0.3414483821263482, "grad_norm": 1.222847580909729, "learning_rate": 1.9697814921079397e-05, "loss": 1.4852, "step": 554 }, { "epoch": 0.3420647149460709, "grad_norm": 1.2643941640853882, "learning_rate": 1.9695183506968224e-05, "loss": 1.5771, "step": 555 }, { "epoch": 0.34268104776579356, "grad_norm": 1.222047209739685, "learning_rate": 1.9692540862655587e-05, "loss": 1.5549, "step": 556 }, { "epoch": 0.3432973805855162, "grad_norm": 1.2197353839874268, "learning_rate": 1.9689886991202543e-05, "loss": 1.5692, "step": 557 }, { "epoch": 0.34391371340523885, "grad_norm": 1.2177941799163818, "learning_rate": 1.9687221895683143e-05, "loss": 1.5267, "step": 558 }, { "epoch": 0.3445300462249615, "grad_norm": 1.241468906402588, "learning_rate": 1.9684545579184437e-05, "loss": 1.5314, "step": 559 }, { "epoch": 0.34514637904468415, "grad_norm": 1.1903144121170044, "learning_rate": 1.968185804480648e-05, "loss": 1.5021, "step": 560 }, { "epoch": 0.34576271186440677, "grad_norm": 1.2305514812469482, "learning_rate": 1.9679159295662323e-05, "loss": 1.5364, "step": 561 }, { "epoch": 0.34637904468412944, "grad_norm": 1.2058769464492798, "learning_rate": 1.9676449334877993e-05, "loss": 1.557, "step": 562 }, { "epoch": 0.34699537750385206, "grad_norm": 1.2326509952545166, "learning_rate": 1.967372816559252e-05, "loss": 1.5573, "step": 563 }, { "epoch": 0.34761171032357474, "grad_norm": 1.2430949211120605, "learning_rate": 1.967099579095791e-05, "loss": 1.5715, "step": 564 }, { "epoch": 0.34822804314329736, "grad_norm": 1.2398585081100464, "learning_rate": 1.9668252214139146e-05, "loss": 1.5436, "step": 565 }, { "epoch": 0.34884437596302004, "grad_norm": 1.2173937559127808, "learning_rate": 1.966549743831419e-05, "loss": 1.523, "step": 566 }, { "epoch": 0.34946070878274266, "grad_norm": 1.229384422302246, "learning_rate": 1.9662731466673977e-05, "loss": 1.5181, "step": 567 }, { "epoch": 0.35007704160246533, "grad_norm": 1.2473266124725342, "learning_rate": 1.965995430242241e-05, "loss": 1.5383, "step": 568 }, { "epoch": 0.350693374422188, "grad_norm": 1.2145967483520508, "learning_rate": 1.9657165948776352e-05, "loss": 1.5401, "step": 569 }, { "epoch": 0.35130970724191063, "grad_norm": 1.2233625650405884, "learning_rate": 1.9654366408965637e-05, "loss": 1.5427, "step": 570 }, { "epoch": 0.3519260400616333, "grad_norm": 1.2210474014282227, "learning_rate": 1.965155568623304e-05, "loss": 1.5548, "step": 571 }, { "epoch": 0.3525423728813559, "grad_norm": 1.2687908411026, "learning_rate": 1.964873378383431e-05, "loss": 1.562, "step": 572 }, { "epoch": 0.3531587057010786, "grad_norm": 1.211151361465454, "learning_rate": 1.9645900705038127e-05, "loss": 1.581, "step": 573 }, { "epoch": 0.3537750385208012, "grad_norm": 1.2371565103530884, "learning_rate": 1.964305645312613e-05, "loss": 1.5341, "step": 574 }, { "epoch": 0.3543913713405239, "grad_norm": 1.2216075658798218, "learning_rate": 1.96402010313929e-05, "loss": 1.5946, "step": 575 }, { "epoch": 0.3550077041602465, "grad_norm": 1.2023626565933228, "learning_rate": 1.9637334443145936e-05, "loss": 1.5257, "step": 576 }, { "epoch": 0.3556240369799692, "grad_norm": 1.2776899337768555, "learning_rate": 1.9634456691705705e-05, "loss": 1.5502, "step": 577 }, { "epoch": 0.3562403697996918, "grad_norm": 1.2291333675384521, "learning_rate": 1.9631567780405576e-05, "loss": 1.523, "step": 578 }, { "epoch": 0.3568567026194145, "grad_norm": 1.2471462488174438, "learning_rate": 1.962866771259186e-05, "loss": 1.5351, "step": 579 }, { "epoch": 0.3574730354391371, "grad_norm": 1.2396150827407837, "learning_rate": 1.9625756491623785e-05, "loss": 1.4878, "step": 580 }, { "epoch": 0.3580893682588598, "grad_norm": 1.2014983892440796, "learning_rate": 1.9622834120873496e-05, "loss": 1.4731, "step": 581 }, { "epoch": 0.35870570107858246, "grad_norm": 1.25382399559021, "learning_rate": 1.9619900603726062e-05, "loss": 1.5337, "step": 582 }, { "epoch": 0.3593220338983051, "grad_norm": 1.265248417854309, "learning_rate": 1.9616955943579456e-05, "loss": 1.5683, "step": 583 }, { "epoch": 0.35993836671802776, "grad_norm": 1.2474901676177979, "learning_rate": 1.9614000143844557e-05, "loss": 1.5499, "step": 584 }, { "epoch": 0.3605546995377504, "grad_norm": 1.2424912452697754, "learning_rate": 1.9611033207945155e-05, "loss": 1.544, "step": 585 }, { "epoch": 0.36117103235747305, "grad_norm": 1.2285486459732056, "learning_rate": 1.9608055139317932e-05, "loss": 1.6155, "step": 586 }, { "epoch": 0.3617873651771957, "grad_norm": 1.2138084173202515, "learning_rate": 1.9605065941412473e-05, "loss": 1.5065, "step": 587 }, { "epoch": 0.36240369799691835, "grad_norm": 1.2348237037658691, "learning_rate": 1.9602065617691236e-05, "loss": 1.5046, "step": 588 }, { "epoch": 0.36302003081664097, "grad_norm": 1.2499668598175049, "learning_rate": 1.9599054171629594e-05, "loss": 1.5751, "step": 589 }, { "epoch": 0.36363636363636365, "grad_norm": 1.2013397216796875, "learning_rate": 1.959603160671578e-05, "loss": 1.5008, "step": 590 }, { "epoch": 0.36425269645608627, "grad_norm": 1.2554980516433716, "learning_rate": 1.959299792645092e-05, "loss": 1.5413, "step": 591 }, { "epoch": 0.36486902927580894, "grad_norm": 1.2143399715423584, "learning_rate": 1.9589953134349012e-05, "loss": 1.5882, "step": 592 }, { "epoch": 0.36548536209553156, "grad_norm": 1.1908376216888428, "learning_rate": 1.9586897233936916e-05, "loss": 1.5255, "step": 593 }, { "epoch": 0.36610169491525424, "grad_norm": 1.1920303106307983, "learning_rate": 1.9583830228754372e-05, "loss": 1.4941, "step": 594 }, { "epoch": 0.3667180277349769, "grad_norm": 1.2120412588119507, "learning_rate": 1.958075212235398e-05, "loss": 1.4638, "step": 595 }, { "epoch": 0.36733436055469953, "grad_norm": 1.244823694229126, "learning_rate": 1.957766291830119e-05, "loss": 1.4791, "step": 596 }, { "epoch": 0.3679506933744222, "grad_norm": 1.245976448059082, "learning_rate": 1.957456262017432e-05, "loss": 1.5523, "step": 597 }, { "epoch": 0.36856702619414483, "grad_norm": 1.2313226461410522, "learning_rate": 1.9571451231564523e-05, "loss": 1.5857, "step": 598 }, { "epoch": 0.3691833590138675, "grad_norm": 1.2284907102584839, "learning_rate": 1.9568328756075815e-05, "loss": 1.5816, "step": 599 }, { "epoch": 0.3697996918335901, "grad_norm": 1.2191033363342285, "learning_rate": 1.956519519732505e-05, "loss": 1.5546, "step": 600 }, { "epoch": 0.3704160246533128, "grad_norm": 1.2538461685180664, "learning_rate": 1.95620505589419e-05, "loss": 1.5947, "step": 601 }, { "epoch": 0.3710323574730354, "grad_norm": 1.2250356674194336, "learning_rate": 1.9558894844568905e-05, "loss": 1.4679, "step": 602 }, { "epoch": 0.3716486902927581, "grad_norm": 1.2412315607070923, "learning_rate": 1.955572805786141e-05, "loss": 1.5407, "step": 603 }, { "epoch": 0.3722650231124807, "grad_norm": 1.2216013669967651, "learning_rate": 1.955255020248759e-05, "loss": 1.4559, "step": 604 }, { "epoch": 0.3728813559322034, "grad_norm": 1.2542505264282227, "learning_rate": 1.9549361282128446e-05, "loss": 1.5075, "step": 605 }, { "epoch": 0.373497688751926, "grad_norm": 1.2043108940124512, "learning_rate": 1.9546161300477792e-05, "loss": 1.5267, "step": 606 }, { "epoch": 0.3741140215716487, "grad_norm": 1.2397077083587646, "learning_rate": 1.954295026124226e-05, "loss": 1.5211, "step": 607 }, { "epoch": 0.37473035439137137, "grad_norm": 1.3175978660583496, "learning_rate": 1.953972816814128e-05, "loss": 1.5431, "step": 608 }, { "epoch": 0.375346687211094, "grad_norm": 1.261094093322754, "learning_rate": 1.9536495024907097e-05, "loss": 1.5903, "step": 609 }, { "epoch": 0.37596302003081666, "grad_norm": 1.536584734916687, "learning_rate": 1.953325083528475e-05, "loss": 1.5284, "step": 610 }, { "epoch": 0.3765793528505393, "grad_norm": 1.2516734600067139, "learning_rate": 1.9529995603032077e-05, "loss": 1.5232, "step": 611 }, { "epoch": 0.37719568567026196, "grad_norm": 1.2510435581207275, "learning_rate": 1.95267293319197e-05, "loss": 1.5771, "step": 612 }, { "epoch": 0.3778120184899846, "grad_norm": 1.3033338785171509, "learning_rate": 1.952345202573103e-05, "loss": 1.5095, "step": 613 }, { "epoch": 0.37842835130970726, "grad_norm": 1.2102240324020386, "learning_rate": 1.9520163688262265e-05, "loss": 1.4538, "step": 614 }, { "epoch": 0.3790446841294299, "grad_norm": 1.2830308675765991, "learning_rate": 1.951686432332238e-05, "loss": 1.5334, "step": 615 }, { "epoch": 0.37966101694915255, "grad_norm": 1.2124521732330322, "learning_rate": 1.9513553934733123e-05, "loss": 1.5357, "step": 616 }, { "epoch": 0.3802773497688752, "grad_norm": 1.2969774007797241, "learning_rate": 1.9510232526329003e-05, "loss": 1.593, "step": 617 }, { "epoch": 0.38089368258859785, "grad_norm": 1.2636611461639404, "learning_rate": 1.9506900101957303e-05, "loss": 1.5209, "step": 618 }, { "epoch": 0.38151001540832047, "grad_norm": 1.2533831596374512, "learning_rate": 1.9503556665478066e-05, "loss": 1.5667, "step": 619 }, { "epoch": 0.38212634822804314, "grad_norm": 1.271254301071167, "learning_rate": 1.9500202220764083e-05, "loss": 1.5409, "step": 620 }, { "epoch": 0.3827426810477658, "grad_norm": 1.2194855213165283, "learning_rate": 1.9496836771700908e-05, "loss": 1.5235, "step": 621 }, { "epoch": 0.38335901386748844, "grad_norm": 1.2275644540786743, "learning_rate": 1.9493460322186823e-05, "loss": 1.5088, "step": 622 }, { "epoch": 0.3839753466872111, "grad_norm": 1.2805346250534058, "learning_rate": 1.949007287613287e-05, "loss": 1.5795, "step": 623 }, { "epoch": 0.38459167950693374, "grad_norm": 1.3345251083374023, "learning_rate": 1.9486674437462827e-05, "loss": 1.5836, "step": 624 }, { "epoch": 0.3852080123266564, "grad_norm": 1.2426317930221558, "learning_rate": 1.9483265010113192e-05, "loss": 1.5658, "step": 625 }, { "epoch": 0.38582434514637903, "grad_norm": 355.6366882324219, "learning_rate": 1.9479844598033202e-05, "loss": 1.6222, "step": 626 }, { "epoch": 0.3864406779661017, "grad_norm": 1.3632776737213135, "learning_rate": 1.947641320518482e-05, "loss": 1.5833, "step": 627 }, { "epoch": 0.38705701078582433, "grad_norm": 1.257728099822998, "learning_rate": 1.9472970835542724e-05, "loss": 1.5327, "step": 628 }, { "epoch": 0.387673343605547, "grad_norm": 1.5337251424789429, "learning_rate": 1.94695174930943e-05, "loss": 1.5269, "step": 629 }, { "epoch": 0.3882896764252696, "grad_norm": 1.3116291761398315, "learning_rate": 1.946605318183966e-05, "loss": 1.5364, "step": 630 }, { "epoch": 0.3889060092449923, "grad_norm": 1.261034369468689, "learning_rate": 1.9462577905791602e-05, "loss": 1.5399, "step": 631 }, { "epoch": 0.3895223420647149, "grad_norm": 1.3153637647628784, "learning_rate": 1.945909166897564e-05, "loss": 1.5751, "step": 632 }, { "epoch": 0.3901386748844376, "grad_norm": 1.2312641143798828, "learning_rate": 1.9455594475429982e-05, "loss": 1.5231, "step": 633 }, { "epoch": 0.3907550077041603, "grad_norm": 1.2884538173675537, "learning_rate": 1.9452086329205523e-05, "loss": 1.5298, "step": 634 }, { "epoch": 0.3913713405238829, "grad_norm": 1.2451438903808594, "learning_rate": 1.9448567234365843e-05, "loss": 1.589, "step": 635 }, { "epoch": 0.39198767334360557, "grad_norm": 1.295436978340149, "learning_rate": 1.9445037194987214e-05, "loss": 1.5045, "step": 636 }, { "epoch": 0.3926040061633282, "grad_norm": 1.299060344696045, "learning_rate": 1.9441496215158572e-05, "loss": 1.5568, "step": 637 }, { "epoch": 0.39322033898305087, "grad_norm": 1.247084379196167, "learning_rate": 1.9437944298981542e-05, "loss": 1.5233, "step": 638 }, { "epoch": 0.3938366718027735, "grad_norm": 1.2891147136688232, "learning_rate": 1.9434381450570397e-05, "loss": 1.5491, "step": 639 }, { "epoch": 0.39445300462249616, "grad_norm": 1.2120251655578613, "learning_rate": 1.9430807674052092e-05, "loss": 1.4826, "step": 640 }, { "epoch": 0.3950693374422188, "grad_norm": 1.270560383796692, "learning_rate": 1.9427222973566228e-05, "loss": 1.4867, "step": 641 }, { "epoch": 0.39568567026194146, "grad_norm": 1.227052092552185, "learning_rate": 1.9423627353265064e-05, "loss": 1.5354, "step": 642 }, { "epoch": 0.3963020030816641, "grad_norm": 1.2582279443740845, "learning_rate": 1.942002081731351e-05, "loss": 1.5084, "step": 643 }, { "epoch": 0.39691833590138675, "grad_norm": 1.291737675666809, "learning_rate": 1.941640336988911e-05, "loss": 1.5372, "step": 644 }, { "epoch": 0.3975346687211094, "grad_norm": 1.2285447120666504, "learning_rate": 1.9412775015182066e-05, "loss": 1.5557, "step": 645 }, { "epoch": 0.39815100154083205, "grad_norm": 1.212854266166687, "learning_rate": 1.940913575739519e-05, "loss": 1.4822, "step": 646 }, { "epoch": 0.3987673343605547, "grad_norm": 1.1870572566986084, "learning_rate": 1.9405485600743943e-05, "loss": 1.4074, "step": 647 }, { "epoch": 0.39938366718027735, "grad_norm": 1.3796590566635132, "learning_rate": 1.9401824549456402e-05, "loss": 1.6074, "step": 648 }, { "epoch": 0.4, "grad_norm": 1.3018184900283813, "learning_rate": 1.9398152607773264e-05, "loss": 1.5233, "step": 649 }, { "epoch": 0.40061633281972264, "grad_norm": 1.2377570867538452, "learning_rate": 1.9394469779947844e-05, "loss": 1.5456, "step": 650 }, { "epoch": 0.4012326656394453, "grad_norm": 1.2446471452713013, "learning_rate": 1.939077607024606e-05, "loss": 1.5312, "step": 651 }, { "epoch": 0.40184899845916794, "grad_norm": 1.2142305374145508, "learning_rate": 1.938707148294644e-05, "loss": 1.5241, "step": 652 }, { "epoch": 0.4024653312788906, "grad_norm": 1.2282423973083496, "learning_rate": 1.9383356022340116e-05, "loss": 1.5093, "step": 653 }, { "epoch": 0.40308166409861323, "grad_norm": 1.2484543323516846, "learning_rate": 1.9379629692730798e-05, "loss": 1.5271, "step": 654 }, { "epoch": 0.4036979969183359, "grad_norm": 1.2265640497207642, "learning_rate": 1.9375892498434813e-05, "loss": 1.49, "step": 655 }, { "epoch": 0.40431432973805853, "grad_norm": 1.2515850067138672, "learning_rate": 1.9372144443781043e-05, "loss": 1.568, "step": 656 }, { "epoch": 0.4049306625577812, "grad_norm": 1.2206379175186157, "learning_rate": 1.9368385533110975e-05, "loss": 1.4866, "step": 657 }, { "epoch": 0.4055469953775038, "grad_norm": 1.2374520301818848, "learning_rate": 1.9364615770778656e-05, "loss": 1.5523, "step": 658 }, { "epoch": 0.4061633281972265, "grad_norm": 1.2331488132476807, "learning_rate": 1.9360835161150703e-05, "loss": 1.4896, "step": 659 }, { "epoch": 0.4067796610169492, "grad_norm": 1.2167038917541504, "learning_rate": 1.9357043708606305e-05, "loss": 1.4734, "step": 660 }, { "epoch": 0.4073959938366718, "grad_norm": 1.2293692827224731, "learning_rate": 1.9353241417537216e-05, "loss": 1.4582, "step": 661 }, { "epoch": 0.4080123266563945, "grad_norm": 1.2981430292129517, "learning_rate": 1.9349428292347718e-05, "loss": 1.5297, "step": 662 }, { "epoch": 0.4086286594761171, "grad_norm": 1.2317942380905151, "learning_rate": 1.934560433745467e-05, "loss": 1.58, "step": 663 }, { "epoch": 0.40924499229583977, "grad_norm": 1.2100396156311035, "learning_rate": 1.934176955728747e-05, "loss": 1.4769, "step": 664 }, { "epoch": 0.4098613251155624, "grad_norm": 1.2224452495574951, "learning_rate": 1.933792395628804e-05, "loss": 1.4683, "step": 665 }, { "epoch": 0.41047765793528507, "grad_norm": 1.2032585144042969, "learning_rate": 1.9334067538910857e-05, "loss": 1.5282, "step": 666 }, { "epoch": 0.4110939907550077, "grad_norm": 1.2654169797897339, "learning_rate": 1.933020030962291e-05, "loss": 1.6033, "step": 667 }, { "epoch": 0.41171032357473036, "grad_norm": 1.361639380455017, "learning_rate": 1.9326322272903724e-05, "loss": 1.5843, "step": 668 }, { "epoch": 0.412326656394453, "grad_norm": 1.2395968437194824, "learning_rate": 1.932243343324533e-05, "loss": 1.5685, "step": 669 }, { "epoch": 0.41294298921417566, "grad_norm": 1.2128207683563232, "learning_rate": 1.931853379515229e-05, "loss": 1.5244, "step": 670 }, { "epoch": 0.4135593220338983, "grad_norm": 1.2616959810256958, "learning_rate": 1.931462336314165e-05, "loss": 1.579, "step": 671 }, { "epoch": 0.41417565485362096, "grad_norm": 1.222118616104126, "learning_rate": 1.931070214174298e-05, "loss": 1.5016, "step": 672 }, { "epoch": 0.41479198767334363, "grad_norm": 1.2500362396240234, "learning_rate": 1.9306770135498344e-05, "loss": 1.5919, "step": 673 }, { "epoch": 0.41540832049306625, "grad_norm": 1.265579104423523, "learning_rate": 1.930282734896229e-05, "loss": 1.5234, "step": 674 }, { "epoch": 0.41602465331278893, "grad_norm": 1.2557156085968018, "learning_rate": 1.929887378670186e-05, "loss": 1.5688, "step": 675 }, { "epoch": 0.41664098613251155, "grad_norm": 1.2902251482009888, "learning_rate": 1.9294909453296568e-05, "loss": 1.5018, "step": 676 }, { "epoch": 0.4172573189522342, "grad_norm": 1.2182374000549316, "learning_rate": 1.929093435333842e-05, "loss": 1.5251, "step": 677 }, { "epoch": 0.41787365177195684, "grad_norm": 1.2206605672836304, "learning_rate": 1.928694849143188e-05, "loss": 1.5036, "step": 678 }, { "epoch": 0.4184899845916795, "grad_norm": 1.219704270362854, "learning_rate": 1.9282951872193886e-05, "loss": 1.5159, "step": 679 }, { "epoch": 0.41910631741140214, "grad_norm": 1.3067165613174438, "learning_rate": 1.9278944500253837e-05, "loss": 1.5491, "step": 680 }, { "epoch": 0.4197226502311248, "grad_norm": 1.2167636156082153, "learning_rate": 1.9274926380253573e-05, "loss": 1.4905, "step": 681 }, { "epoch": 0.42033898305084744, "grad_norm": 1.2884389162063599, "learning_rate": 1.9270897516847406e-05, "loss": 1.5907, "step": 682 }, { "epoch": 0.4209553158705701, "grad_norm": 1.1723458766937256, "learning_rate": 1.9266857914702073e-05, "loss": 1.4426, "step": 683 }, { "epoch": 0.42157164869029273, "grad_norm": 1.194990634918213, "learning_rate": 1.9262807578496764e-05, "loss": 1.4695, "step": 684 }, { "epoch": 0.4221879815100154, "grad_norm": 1.260567545890808, "learning_rate": 1.925874651292309e-05, "loss": 1.5396, "step": 685 }, { "epoch": 0.4228043143297381, "grad_norm": 1.2608911991119385, "learning_rate": 1.92546747226851e-05, "loss": 1.5719, "step": 686 }, { "epoch": 0.4234206471494607, "grad_norm": 1.2482186555862427, "learning_rate": 1.9250592212499265e-05, "loss": 1.5608, "step": 687 }, { "epoch": 0.4240369799691834, "grad_norm": 1.2240350246429443, "learning_rate": 1.9246498987094464e-05, "loss": 1.6029, "step": 688 }, { "epoch": 0.424653312788906, "grad_norm": 1.2281354665756226, "learning_rate": 1.9242395051212e-05, "loss": 1.562, "step": 689 }, { "epoch": 0.4252696456086287, "grad_norm": 1.2392401695251465, "learning_rate": 1.923828040960557e-05, "loss": 1.4422, "step": 690 }, { "epoch": 0.4258859784283513, "grad_norm": 1.3233660459518433, "learning_rate": 1.9234155067041285e-05, "loss": 1.4675, "step": 691 }, { "epoch": 0.426502311248074, "grad_norm": 1.3640377521514893, "learning_rate": 1.9230019028297642e-05, "loss": 1.4715, "step": 692 }, { "epoch": 0.4271186440677966, "grad_norm": 1.3324940204620361, "learning_rate": 1.9225872298165526e-05, "loss": 1.525, "step": 693 }, { "epoch": 0.42773497688751927, "grad_norm": 1.2549407482147217, "learning_rate": 1.9221714881448215e-05, "loss": 1.5239, "step": 694 }, { "epoch": 0.4283513097072419, "grad_norm": 1.3053219318389893, "learning_rate": 1.9217546782961362e-05, "loss": 1.5187, "step": 695 }, { "epoch": 0.42896764252696457, "grad_norm": 1.215789556503296, "learning_rate": 1.9213368007532986e-05, "loss": 1.4635, "step": 696 }, { "epoch": 0.4295839753466872, "grad_norm": 1.2223495244979858, "learning_rate": 1.9209178560003482e-05, "loss": 1.4384, "step": 697 }, { "epoch": 0.43020030816640986, "grad_norm": 1.2833138704299927, "learning_rate": 1.9204978445225606e-05, "loss": 1.4981, "step": 698 }, { "epoch": 0.43081664098613254, "grad_norm": 1.2799115180969238, "learning_rate": 1.9200767668064464e-05, "loss": 1.5805, "step": 699 }, { "epoch": 0.43143297380585516, "grad_norm": 1.2546586990356445, "learning_rate": 1.919654623339752e-05, "loss": 1.4948, "step": 700 }, { "epoch": 0.43204930662557783, "grad_norm": 1.232107400894165, "learning_rate": 1.9192314146114577e-05, "loss": 1.4686, "step": 701 }, { "epoch": 0.43266563944530045, "grad_norm": 1.2199714183807373, "learning_rate": 1.9188071411117782e-05, "loss": 1.4924, "step": 702 }, { "epoch": 0.43328197226502313, "grad_norm": 1.2320053577423096, "learning_rate": 1.9183818033321612e-05, "loss": 1.5273, "step": 703 }, { "epoch": 0.43389830508474575, "grad_norm": 1.3008888959884644, "learning_rate": 1.9179554017652874e-05, "loss": 1.5312, "step": 704 }, { "epoch": 0.4345146379044684, "grad_norm": 1.2132518291473389, "learning_rate": 1.9175279369050695e-05, "loss": 1.4734, "step": 705 }, { "epoch": 0.43513097072419105, "grad_norm": 1.2187168598175049, "learning_rate": 1.917099409246652e-05, "loss": 1.5499, "step": 706 }, { "epoch": 0.4357473035439137, "grad_norm": 1.217174768447876, "learning_rate": 1.9166698192864105e-05, "loss": 1.4889, "step": 707 }, { "epoch": 0.43636363636363634, "grad_norm": 1.3033735752105713, "learning_rate": 1.916239167521951e-05, "loss": 1.597, "step": 708 }, { "epoch": 0.436979969183359, "grad_norm": 1.2284462451934814, "learning_rate": 1.9158074544521094e-05, "loss": 1.4886, "step": 709 }, { "epoch": 0.43759630200308164, "grad_norm": 1.2238386869430542, "learning_rate": 1.9153746805769512e-05, "loss": 1.4728, "step": 710 }, { "epoch": 0.4382126348228043, "grad_norm": 1.217970609664917, "learning_rate": 1.9149408463977705e-05, "loss": 1.478, "step": 711 }, { "epoch": 0.438828967642527, "grad_norm": 1.2461726665496826, "learning_rate": 1.9145059524170888e-05, "loss": 1.5, "step": 712 }, { "epoch": 0.4394453004622496, "grad_norm": 1.2552093267440796, "learning_rate": 1.9140699991386567e-05, "loss": 1.5054, "step": 713 }, { "epoch": 0.4400616332819723, "grad_norm": 1.2725965976715088, "learning_rate": 1.913632987067451e-05, "loss": 1.5268, "step": 714 }, { "epoch": 0.4406779661016949, "grad_norm": 1.265761375427246, "learning_rate": 1.9131949167096744e-05, "loss": 1.5456, "step": 715 }, { "epoch": 0.4412942989214176, "grad_norm": 1.2406741380691528, "learning_rate": 1.9127557885727562e-05, "loss": 1.4638, "step": 716 }, { "epoch": 0.4419106317411402, "grad_norm": 1.2415400743484497, "learning_rate": 1.9123156031653516e-05, "loss": 1.5269, "step": 717 }, { "epoch": 0.4425269645608629, "grad_norm": 1.2385265827178955, "learning_rate": 1.9118743609973384e-05, "loss": 1.5034, "step": 718 }, { "epoch": 0.4431432973805855, "grad_norm": 1.244320034980774, "learning_rate": 1.9114320625798206e-05, "loss": 1.5234, "step": 719 }, { "epoch": 0.4437596302003082, "grad_norm": 1.2252706289291382, "learning_rate": 1.9109887084251246e-05, "loss": 1.491, "step": 720 }, { "epoch": 0.4443759630200308, "grad_norm": 1.2738279104232788, "learning_rate": 1.9105442990467995e-05, "loss": 1.5197, "step": 721 }, { "epoch": 0.44499229583975347, "grad_norm": 1.2052665948867798, "learning_rate": 1.9100988349596175e-05, "loss": 1.4716, "step": 722 }, { "epoch": 0.4456086286594761, "grad_norm": 1.3171828985214233, "learning_rate": 1.909652316679572e-05, "loss": 1.5342, "step": 723 }, { "epoch": 0.44622496147919877, "grad_norm": 1.267164945602417, "learning_rate": 1.9092047447238775e-05, "loss": 1.4668, "step": 724 }, { "epoch": 0.44684129429892144, "grad_norm": 1.252143144607544, "learning_rate": 1.9087561196109686e-05, "loss": 1.517, "step": 725 }, { "epoch": 0.44745762711864406, "grad_norm": 1.24161958694458, "learning_rate": 1.908306441860501e-05, "loss": 1.566, "step": 726 }, { "epoch": 0.44807395993836674, "grad_norm": 1.225793480873108, "learning_rate": 1.907855711993349e-05, "loss": 1.5128, "step": 727 }, { "epoch": 0.44869029275808936, "grad_norm": 1.2523088455200195, "learning_rate": 1.907403930531605e-05, "loss": 1.5281, "step": 728 }, { "epoch": 0.44930662557781204, "grad_norm": 1.2475779056549072, "learning_rate": 1.9069510979985805e-05, "loss": 1.556, "step": 729 }, { "epoch": 0.44992295839753466, "grad_norm": 1.2963918447494507, "learning_rate": 1.906497214918804e-05, "loss": 1.4805, "step": 730 }, { "epoch": 0.45053929121725733, "grad_norm": 1.2220375537872314, "learning_rate": 1.9060422818180208e-05, "loss": 1.5009, "step": 731 }, { "epoch": 0.45115562403697995, "grad_norm": 1.2733973264694214, "learning_rate": 1.9055862992231926e-05, "loss": 1.5875, "step": 732 }, { "epoch": 0.45177195685670263, "grad_norm": 1.2013468742370605, "learning_rate": 1.9051292676624968e-05, "loss": 1.4633, "step": 733 }, { "epoch": 0.45238828967642525, "grad_norm": 1.2420295476913452, "learning_rate": 1.904671187665326e-05, "loss": 1.5404, "step": 734 }, { "epoch": 0.4530046224961479, "grad_norm": 1.2528700828552246, "learning_rate": 1.9042120597622866e-05, "loss": 1.4761, "step": 735 }, { "epoch": 0.45362095531587054, "grad_norm": 1.229811191558838, "learning_rate": 1.9037518844851998e-05, "loss": 1.4589, "step": 736 }, { "epoch": 0.4542372881355932, "grad_norm": 1.2342045307159424, "learning_rate": 1.903290662367099e-05, "loss": 1.4733, "step": 737 }, { "epoch": 0.4548536209553159, "grad_norm": 1.3630375862121582, "learning_rate": 1.902828393942231e-05, "loss": 1.5306, "step": 738 }, { "epoch": 0.4554699537750385, "grad_norm": 1.2195000648498535, "learning_rate": 1.902365079746054e-05, "loss": 1.4496, "step": 739 }, { "epoch": 0.4560862865947612, "grad_norm": 1.2455213069915771, "learning_rate": 1.9019007203152376e-05, "loss": 1.46, "step": 740 }, { "epoch": 0.4567026194144838, "grad_norm": 1.2461849451065063, "learning_rate": 1.9014353161876626e-05, "loss": 1.5298, "step": 741 }, { "epoch": 0.4573189522342065, "grad_norm": 1.2942029237747192, "learning_rate": 1.900968867902419e-05, "loss": 1.47, "step": 742 }, { "epoch": 0.4579352850539291, "grad_norm": 1.2524033784866333, "learning_rate": 1.900501375999808e-05, "loss": 1.503, "step": 743 }, { "epoch": 0.4585516178736518, "grad_norm": 1.262398362159729, "learning_rate": 1.9000328410213376e-05, "loss": 1.5193, "step": 744 }, { "epoch": 0.4591679506933744, "grad_norm": 1.284395694732666, "learning_rate": 1.899563263509725e-05, "loss": 1.523, "step": 745 }, { "epoch": 0.4597842835130971, "grad_norm": 1.2116655111312866, "learning_rate": 1.899092644008895e-05, "loss": 1.478, "step": 746 }, { "epoch": 0.4604006163328197, "grad_norm": 1.2691497802734375, "learning_rate": 1.8986209830639796e-05, "loss": 1.5552, "step": 747 }, { "epoch": 0.4610169491525424, "grad_norm": 1.2738350629806519, "learning_rate": 1.8981482812213163e-05, "loss": 1.5144, "step": 748 }, { "epoch": 0.461633281972265, "grad_norm": 1.21067214012146, "learning_rate": 1.8976745390284495e-05, "loss": 1.4255, "step": 749 }, { "epoch": 0.4622496147919877, "grad_norm": 1.2532901763916016, "learning_rate": 1.8971997570341272e-05, "loss": 1.5135, "step": 750 }, { "epoch": 0.46286594761171035, "grad_norm": 1.2445391416549683, "learning_rate": 1.896723935788303e-05, "loss": 1.5479, "step": 751 }, { "epoch": 0.46348228043143297, "grad_norm": 1.20797860622406, "learning_rate": 1.8962470758421342e-05, "loss": 1.4697, "step": 752 }, { "epoch": 0.46409861325115565, "grad_norm": 1.2362041473388672, "learning_rate": 1.895769177747981e-05, "loss": 1.4917, "step": 753 }, { "epoch": 0.46471494607087827, "grad_norm": 1.2625436782836914, "learning_rate": 1.8952902420594056e-05, "loss": 1.5496, "step": 754 }, { "epoch": 0.46533127889060094, "grad_norm": 1.2581541538238525, "learning_rate": 1.894810269331173e-05, "loss": 1.4492, "step": 755 }, { "epoch": 0.46594761171032356, "grad_norm": 1.2668228149414062, "learning_rate": 1.8943292601192483e-05, "loss": 1.5416, "step": 756 }, { "epoch": 0.46656394453004624, "grad_norm": 1.2598587274551392, "learning_rate": 1.8938472149807987e-05, "loss": 1.5068, "step": 757 }, { "epoch": 0.46718027734976886, "grad_norm": 1.251828670501709, "learning_rate": 1.89336413447419e-05, "loss": 1.4706, "step": 758 }, { "epoch": 0.46779661016949153, "grad_norm": 1.259675145149231, "learning_rate": 1.892880019158988e-05, "loss": 1.5043, "step": 759 }, { "epoch": 0.46841294298921415, "grad_norm": 1.2647371292114258, "learning_rate": 1.8923948695959574e-05, "loss": 1.5338, "step": 760 }, { "epoch": 0.46902927580893683, "grad_norm": 1.25161874294281, "learning_rate": 1.8919086863470596e-05, "loss": 1.4422, "step": 761 }, { "epoch": 0.46964560862865945, "grad_norm": 1.2323579788208008, "learning_rate": 1.891421469975455e-05, "loss": 1.4965, "step": 762 }, { "epoch": 0.4702619414483821, "grad_norm": 1.2396897077560425, "learning_rate": 1.8909332210454995e-05, "loss": 1.5351, "step": 763 }, { "epoch": 0.4708782742681048, "grad_norm": 1.2444100379943848, "learning_rate": 1.8904439401227457e-05, "loss": 1.4476, "step": 764 }, { "epoch": 0.4714946070878274, "grad_norm": 1.2893779277801514, "learning_rate": 1.8899536277739413e-05, "loss": 1.4322, "step": 765 }, { "epoch": 0.4721109399075501, "grad_norm": 1.271478295326233, "learning_rate": 1.8894622845670282e-05, "loss": 1.4601, "step": 766 }, { "epoch": 0.4727272727272727, "grad_norm": 1.2277095317840576, "learning_rate": 1.8889699110711435e-05, "loss": 1.4391, "step": 767 }, { "epoch": 0.4733436055469954, "grad_norm": 1.2456262111663818, "learning_rate": 1.8884765078566172e-05, "loss": 1.4615, "step": 768 }, { "epoch": 0.473959938366718, "grad_norm": 1.2485253810882568, "learning_rate": 1.887982075494972e-05, "loss": 1.4502, "step": 769 }, { "epoch": 0.4745762711864407, "grad_norm": 1.2888206243515015, "learning_rate": 1.8874866145589224e-05, "loss": 1.5287, "step": 770 }, { "epoch": 0.4751926040061633, "grad_norm": 1.276939034461975, "learning_rate": 1.886990125622375e-05, "loss": 1.5038, "step": 771 }, { "epoch": 0.475808936825886, "grad_norm": 1.2767584323883057, "learning_rate": 1.886492609260426e-05, "loss": 1.4341, "step": 772 }, { "epoch": 0.4764252696456086, "grad_norm": 1.2863210439682007, "learning_rate": 1.8859940660493634e-05, "loss": 1.556, "step": 773 }, { "epoch": 0.4770416024653313, "grad_norm": 1.2184972763061523, "learning_rate": 1.8854944965666632e-05, "loss": 1.5081, "step": 774 }, { "epoch": 0.4776579352850539, "grad_norm": 1.1967580318450928, "learning_rate": 1.884993901390991e-05, "loss": 1.4311, "step": 775 }, { "epoch": 0.4782742681047766, "grad_norm": 1.2631529569625854, "learning_rate": 1.8844922811022e-05, "loss": 1.4769, "step": 776 }, { "epoch": 0.47889060092449925, "grad_norm": 1.3077120780944824, "learning_rate": 1.8839896362813307e-05, "loss": 1.5289, "step": 777 }, { "epoch": 0.4795069337442219, "grad_norm": 1.2082256078720093, "learning_rate": 1.883485967510611e-05, "loss": 1.4397, "step": 778 }, { "epoch": 0.48012326656394455, "grad_norm": 1.2581230401992798, "learning_rate": 1.8829812753734538e-05, "loss": 1.4932, "step": 779 }, { "epoch": 0.48073959938366717, "grad_norm": 1.2624733448028564, "learning_rate": 1.8824755604544592e-05, "loss": 1.4866, "step": 780 }, { "epoch": 0.48135593220338985, "grad_norm": 1.270365834236145, "learning_rate": 1.8819688233394104e-05, "loss": 1.5877, "step": 781 }, { "epoch": 0.48197226502311247, "grad_norm": 1.256030559539795, "learning_rate": 1.881461064615275e-05, "loss": 1.4761, "step": 782 }, { "epoch": 0.48258859784283514, "grad_norm": 1.240537405014038, "learning_rate": 1.8809522848702036e-05, "loss": 1.4787, "step": 783 }, { "epoch": 0.48320493066255776, "grad_norm": 1.243285059928894, "learning_rate": 1.880442484693531e-05, "loss": 1.5122, "step": 784 }, { "epoch": 0.48382126348228044, "grad_norm": 1.2694751024246216, "learning_rate": 1.8799316646757725e-05, "loss": 1.5, "step": 785 }, { "epoch": 0.48443759630200306, "grad_norm": 1.2533049583435059, "learning_rate": 1.8794198254086247e-05, "loss": 1.5099, "step": 786 }, { "epoch": 0.48505392912172574, "grad_norm": 1.226676344871521, "learning_rate": 1.878906967484966e-05, "loss": 1.4711, "step": 787 }, { "epoch": 0.48567026194144836, "grad_norm": 1.2050113677978516, "learning_rate": 1.8783930914988537e-05, "loss": 1.4198, "step": 788 }, { "epoch": 0.48628659476117103, "grad_norm": 1.3092329502105713, "learning_rate": 1.8778781980455248e-05, "loss": 1.5241, "step": 789 }, { "epoch": 0.4869029275808937, "grad_norm": 1.2390162944793701, "learning_rate": 1.8773622877213947e-05, "loss": 1.4772, "step": 790 }, { "epoch": 0.4875192604006163, "grad_norm": 1.2183234691619873, "learning_rate": 1.876845361124057e-05, "loss": 1.4801, "step": 791 }, { "epoch": 0.488135593220339, "grad_norm": 1.222615122795105, "learning_rate": 1.8763274188522815e-05, "loss": 1.4104, "step": 792 }, { "epoch": 0.4887519260400616, "grad_norm": 1.2432262897491455, "learning_rate": 1.8758084615060156e-05, "loss": 1.4591, "step": 793 }, { "epoch": 0.4893682588597843, "grad_norm": 1.304048776626587, "learning_rate": 1.8752884896863822e-05, "loss": 1.502, "step": 794 }, { "epoch": 0.4899845916795069, "grad_norm": 1.233538269996643, "learning_rate": 1.874767503995679e-05, "loss": 1.5089, "step": 795 }, { "epoch": 0.4906009244992296, "grad_norm": 1.2568275928497314, "learning_rate": 1.8742455050373784e-05, "loss": 1.5089, "step": 796 }, { "epoch": 0.4912172573189522, "grad_norm": 1.2185765504837036, "learning_rate": 1.8737224934161263e-05, "loss": 1.4679, "step": 797 }, { "epoch": 0.4918335901386749, "grad_norm": 1.2645776271820068, "learning_rate": 1.8731984697377414e-05, "loss": 1.4747, "step": 798 }, { "epoch": 0.4924499229583975, "grad_norm": 1.2635529041290283, "learning_rate": 1.8726734346092148e-05, "loss": 1.5522, "step": 799 }, { "epoch": 0.4930662557781202, "grad_norm": 1.2735905647277832, "learning_rate": 1.8721473886387104e-05, "loss": 1.5524, "step": 800 }, { "epoch": 0.4936825885978428, "grad_norm": 1.2712733745574951, "learning_rate": 1.8716203324355608e-05, "loss": 1.5618, "step": 801 }, { "epoch": 0.4942989214175655, "grad_norm": 1.2424737215042114, "learning_rate": 1.87109226661027e-05, "loss": 1.483, "step": 802 }, { "epoch": 0.49491525423728816, "grad_norm": 1.2573866844177246, "learning_rate": 1.870563191774512e-05, "loss": 1.479, "step": 803 }, { "epoch": 0.4955315870570108, "grad_norm": 1.2829042673110962, "learning_rate": 1.8700331085411285e-05, "loss": 1.458, "step": 804 }, { "epoch": 0.49614791987673346, "grad_norm": 1.28048837184906, "learning_rate": 1.8695020175241296e-05, "loss": 1.5395, "step": 805 }, { "epoch": 0.4967642526964561, "grad_norm": 1.2624245882034302, "learning_rate": 1.868969919338693e-05, "loss": 1.4809, "step": 806 }, { "epoch": 0.49738058551617875, "grad_norm": 1.2632625102996826, "learning_rate": 1.8684368146011634e-05, "loss": 1.4968, "step": 807 }, { "epoch": 0.4979969183359014, "grad_norm": 1.6144354343414307, "learning_rate": 1.86790270392905e-05, "loss": 1.4869, "step": 808 }, { "epoch": 0.49861325115562405, "grad_norm": 1.2252204418182373, "learning_rate": 1.8673675879410282e-05, "loss": 1.389, "step": 809 }, { "epoch": 0.49922958397534667, "grad_norm": 1.2563177347183228, "learning_rate": 1.8668314672569385e-05, "loss": 1.5852, "step": 810 }, { "epoch": 0.49984591679506934, "grad_norm": 1.291148066520691, "learning_rate": 1.866294342497784e-05, "loss": 1.4793, "step": 811 }, { "epoch": 0.500462249614792, "grad_norm": 1.267246961593628, "learning_rate": 1.865756214285731e-05, "loss": 1.4314, "step": 812 }, { "epoch": 0.5010785824345146, "grad_norm": 1.2456042766571045, "learning_rate": 1.8652170832441086e-05, "loss": 1.4928, "step": 813 }, { "epoch": 0.5016949152542373, "grad_norm": 1.2584848403930664, "learning_rate": 1.8646769499974076e-05, "loss": 1.4457, "step": 814 }, { "epoch": 0.50231124807396, "grad_norm": 1.259623646736145, "learning_rate": 1.8641358151712792e-05, "loss": 1.4081, "step": 815 }, { "epoch": 0.5029275808936826, "grad_norm": 1.2222211360931396, "learning_rate": 1.8635936793925348e-05, "loss": 1.4555, "step": 816 }, { "epoch": 0.5035439137134052, "grad_norm": 1.3254154920578003, "learning_rate": 1.8630505432891456e-05, "loss": 1.5596, "step": 817 }, { "epoch": 0.5041602465331279, "grad_norm": 1.256636619567871, "learning_rate": 1.8625064074902416e-05, "loss": 1.4883, "step": 818 }, { "epoch": 0.5047765793528506, "grad_norm": 1.3108055591583252, "learning_rate": 1.86196127262611e-05, "loss": 1.5174, "step": 819 }, { "epoch": 0.5053929121725732, "grad_norm": 1.2449524402618408, "learning_rate": 1.8614151393281952e-05, "loss": 1.4493, "step": 820 }, { "epoch": 0.5060092449922958, "grad_norm": 1.2811285257339478, "learning_rate": 1.8608680082291e-05, "loss": 1.509, "step": 821 }, { "epoch": 0.5066255778120184, "grad_norm": 1.2476929426193237, "learning_rate": 1.8603198799625807e-05, "loss": 1.4688, "step": 822 }, { "epoch": 0.5072419106317412, "grad_norm": 1.316117286682129, "learning_rate": 1.8597707551635498e-05, "loss": 1.4734, "step": 823 }, { "epoch": 0.5078582434514638, "grad_norm": 1.3023074865341187, "learning_rate": 1.859220634468074e-05, "loss": 1.503, "step": 824 }, { "epoch": 0.5084745762711864, "grad_norm": 1.2876421213150024, "learning_rate": 1.8586695185133727e-05, "loss": 1.5322, "step": 825 }, { "epoch": 0.509090909090909, "grad_norm": 1.238061547279358, "learning_rate": 1.8581174079378198e-05, "loss": 1.5098, "step": 826 }, { "epoch": 0.5097072419106318, "grad_norm": 1.2876527309417725, "learning_rate": 1.85756430338094e-05, "loss": 1.4809, "step": 827 }, { "epoch": 0.5103235747303544, "grad_norm": 1.2882412672042847, "learning_rate": 1.8570102054834097e-05, "loss": 1.4486, "step": 828 }, { "epoch": 0.510939907550077, "grad_norm": 1.2883590459823608, "learning_rate": 1.856455114887056e-05, "loss": 1.4944, "step": 829 }, { "epoch": 0.5115562403697997, "grad_norm": 1.2597795724868774, "learning_rate": 1.855899032234856e-05, "loss": 1.568, "step": 830 }, { "epoch": 0.5121725731895224, "grad_norm": 1.2574599981307983, "learning_rate": 1.8553419581709356e-05, "loss": 1.481, "step": 831 }, { "epoch": 0.512788906009245, "grad_norm": 1.2375128269195557, "learning_rate": 1.854783893340569e-05, "loss": 1.4613, "step": 832 }, { "epoch": 0.5134052388289676, "grad_norm": 1.2782753705978394, "learning_rate": 1.8542248383901784e-05, "loss": 1.4309, "step": 833 }, { "epoch": 0.5140215716486903, "grad_norm": 1.3465543985366821, "learning_rate": 1.853664793967333e-05, "loss": 1.4315, "step": 834 }, { "epoch": 0.514637904468413, "grad_norm": 1.3034608364105225, "learning_rate": 1.8531037607207475e-05, "loss": 1.4816, "step": 835 }, { "epoch": 0.5152542372881356, "grad_norm": 1.2883905172348022, "learning_rate": 1.8525417393002824e-05, "loss": 1.509, "step": 836 }, { "epoch": 0.5158705701078582, "grad_norm": 1.2504594326019287, "learning_rate": 1.8519787303569435e-05, "loss": 1.4923, "step": 837 }, { "epoch": 0.5164869029275809, "grad_norm": 1.297469973564148, "learning_rate": 1.8514147345428785e-05, "loss": 1.5435, "step": 838 }, { "epoch": 0.5171032357473035, "grad_norm": 1.2614152431488037, "learning_rate": 1.8508497525113806e-05, "loss": 1.5002, "step": 839 }, { "epoch": 0.5177195685670262, "grad_norm": 1.3115254640579224, "learning_rate": 1.8502837849168835e-05, "loss": 1.4365, "step": 840 }, { "epoch": 0.5183359013867489, "grad_norm": 1.3797346353530884, "learning_rate": 1.8497168324149638e-05, "loss": 1.5441, "step": 841 }, { "epoch": 0.5189522342064715, "grad_norm": 1.3391892910003662, "learning_rate": 1.849148895662338e-05, "loss": 1.4927, "step": 842 }, { "epoch": 0.5195685670261941, "grad_norm": 1.2681193351745605, "learning_rate": 1.8485799753168634e-05, "loss": 1.4759, "step": 843 }, { "epoch": 0.5201848998459168, "grad_norm": 1.244890570640564, "learning_rate": 1.848010072037536e-05, "loss": 1.4473, "step": 844 }, { "epoch": 0.5208012326656395, "grad_norm": 1.260015845298767, "learning_rate": 1.8474391864844908e-05, "loss": 1.4864, "step": 845 }, { "epoch": 0.5214175654853621, "grad_norm": 1.2666666507720947, "learning_rate": 1.8468673193190002e-05, "loss": 1.5054, "step": 846 }, { "epoch": 0.5220338983050847, "grad_norm": 1.2543989419937134, "learning_rate": 1.8462944712034746e-05, "loss": 1.527, "step": 847 }, { "epoch": 0.5226502311248074, "grad_norm": 1.3276927471160889, "learning_rate": 1.845720642801459e-05, "loss": 1.4716, "step": 848 }, { "epoch": 0.5232665639445301, "grad_norm": 1.2448278665542603, "learning_rate": 1.8451458347776356e-05, "loss": 1.4264, "step": 849 }, { "epoch": 0.5238828967642527, "grad_norm": 1.2584590911865234, "learning_rate": 1.8445700477978207e-05, "loss": 1.4754, "step": 850 }, { "epoch": 0.5244992295839753, "grad_norm": 1.2292466163635254, "learning_rate": 1.8439932825289632e-05, "loss": 1.4599, "step": 851 }, { "epoch": 0.5251155624036979, "grad_norm": 1.2278330326080322, "learning_rate": 1.8434155396391477e-05, "loss": 1.4727, "step": 852 }, { "epoch": 0.5257318952234207, "grad_norm": 1.2465629577636719, "learning_rate": 1.8428368197975893e-05, "loss": 1.4788, "step": 853 }, { "epoch": 0.5263482280431433, "grad_norm": 1.2919851541519165, "learning_rate": 1.8422571236746355e-05, "loss": 1.4827, "step": 854 }, { "epoch": 0.5269645608628659, "grad_norm": 1.265480399131775, "learning_rate": 1.8416764519417643e-05, "loss": 1.3896, "step": 855 }, { "epoch": 0.5275808936825886, "grad_norm": 1.2633627653121948, "learning_rate": 1.8410948052715846e-05, "loss": 1.4049, "step": 856 }, { "epoch": 0.5281972265023113, "grad_norm": 1.2592886686325073, "learning_rate": 1.840512184337833e-05, "loss": 1.4663, "step": 857 }, { "epoch": 0.5288135593220339, "grad_norm": 1.3050137758255005, "learning_rate": 1.839928589815376e-05, "loss": 1.5236, "step": 858 }, { "epoch": 0.5294298921417565, "grad_norm": 1.2733769416809082, "learning_rate": 1.8393440223802078e-05, "loss": 1.4359, "step": 859 }, { "epoch": 0.5300462249614792, "grad_norm": 1.2855013608932495, "learning_rate": 1.8387584827094486e-05, "loss": 1.5144, "step": 860 }, { "epoch": 0.5306625577812019, "grad_norm": 1.262732982635498, "learning_rate": 1.838171971481345e-05, "loss": 1.4463, "step": 861 }, { "epoch": 0.5312788906009245, "grad_norm": 1.2458401918411255, "learning_rate": 1.8375844893752708e-05, "loss": 1.4711, "step": 862 }, { "epoch": 0.5318952234206471, "grad_norm": 1.2196964025497437, "learning_rate": 1.8369960370717216e-05, "loss": 1.4487, "step": 863 }, { "epoch": 0.5325115562403698, "grad_norm": 1.2991254329681396, "learning_rate": 1.8364066152523183e-05, "loss": 1.465, "step": 864 }, { "epoch": 0.5331278890600925, "grad_norm": 1.2846981287002563, "learning_rate": 1.835816224599805e-05, "loss": 1.4458, "step": 865 }, { "epoch": 0.5337442218798151, "grad_norm": 1.2920598983764648, "learning_rate": 1.8352248657980474e-05, "loss": 1.486, "step": 866 }, { "epoch": 0.5343605546995378, "grad_norm": 1.2542165517807007, "learning_rate": 1.834632539532033e-05, "loss": 1.469, "step": 867 }, { "epoch": 0.5349768875192604, "grad_norm": 1.235954761505127, "learning_rate": 1.8340392464878695e-05, "loss": 1.4539, "step": 868 }, { "epoch": 0.535593220338983, "grad_norm": 1.256439447402954, "learning_rate": 1.8334449873527856e-05, "loss": 1.4988, "step": 869 }, { "epoch": 0.5362095531587057, "grad_norm": 1.3356431722640991, "learning_rate": 1.8328497628151273e-05, "loss": 1.48, "step": 870 }, { "epoch": 0.5368258859784284, "grad_norm": 1.3085469007492065, "learning_rate": 1.8322535735643604e-05, "loss": 1.4927, "step": 871 }, { "epoch": 0.537442218798151, "grad_norm": 1.2832350730895996, "learning_rate": 1.8316564202910674e-05, "loss": 1.5026, "step": 872 }, { "epoch": 0.5380585516178736, "grad_norm": 1.2771832942962646, "learning_rate": 1.8310583036869475e-05, "loss": 1.4649, "step": 873 }, { "epoch": 0.5386748844375963, "grad_norm": 1.2699801921844482, "learning_rate": 1.8304592244448154e-05, "loss": 1.4966, "step": 874 }, { "epoch": 0.539291217257319, "grad_norm": 1.2624456882476807, "learning_rate": 1.8298591832586024e-05, "loss": 1.5052, "step": 875 }, { "epoch": 0.5399075500770416, "grad_norm": 1.255231261253357, "learning_rate": 1.8292581808233522e-05, "loss": 1.4501, "step": 876 }, { "epoch": 0.5405238828967642, "grad_norm": 1.349666714668274, "learning_rate": 1.8286562178352224e-05, "loss": 1.4877, "step": 877 }, { "epoch": 0.5411402157164868, "grad_norm": 1.2734099626541138, "learning_rate": 1.8280532949914843e-05, "loss": 1.4526, "step": 878 }, { "epoch": 0.5417565485362096, "grad_norm": 1.2511786222457886, "learning_rate": 1.8274494129905197e-05, "loss": 1.4448, "step": 879 }, { "epoch": 0.5423728813559322, "grad_norm": 1.2781952619552612, "learning_rate": 1.826844572531822e-05, "loss": 1.5, "step": 880 }, { "epoch": 0.5429892141756548, "grad_norm": 1.229331374168396, "learning_rate": 1.826238774315995e-05, "loss": 1.3904, "step": 881 }, { "epoch": 0.5436055469953776, "grad_norm": 1.2203567028045654, "learning_rate": 1.8256320190447516e-05, "loss": 1.4475, "step": 882 }, { "epoch": 0.5442218798151002, "grad_norm": 1.2885953187942505, "learning_rate": 1.8250243074209134e-05, "loss": 1.484, "step": 883 }, { "epoch": 0.5448382126348228, "grad_norm": 1.3245861530303955, "learning_rate": 1.8244156401484093e-05, "loss": 1.5352, "step": 884 }, { "epoch": 0.5454545454545454, "grad_norm": 1.2564862966537476, "learning_rate": 1.823806017932276e-05, "loss": 1.4251, "step": 885 }, { "epoch": 0.5460708782742681, "grad_norm": 1.2464264631271362, "learning_rate": 1.823195441478656e-05, "loss": 1.4394, "step": 886 }, { "epoch": 0.5466872110939908, "grad_norm": 1.2749861478805542, "learning_rate": 1.822583911494797e-05, "loss": 1.4892, "step": 887 }, { "epoch": 0.5473035439137134, "grad_norm": 1.249792218208313, "learning_rate": 1.8219714286890506e-05, "loss": 1.4853, "step": 888 }, { "epoch": 0.547919876733436, "grad_norm": 1.3281162977218628, "learning_rate": 1.8213579937708736e-05, "loss": 1.4661, "step": 889 }, { "epoch": 0.5485362095531587, "grad_norm": 1.2616132497787476, "learning_rate": 1.8207436074508243e-05, "loss": 1.4713, "step": 890 }, { "epoch": 0.5491525423728814, "grad_norm": 1.2525548934936523, "learning_rate": 1.8201282704405635e-05, "loss": 1.4904, "step": 891 }, { "epoch": 0.549768875192604, "grad_norm": 1.242445707321167, "learning_rate": 1.8195119834528535e-05, "loss": 1.4398, "step": 892 }, { "epoch": 0.5503852080123267, "grad_norm": 1.2310469150543213, "learning_rate": 1.8188947472015564e-05, "loss": 1.43, "step": 893 }, { "epoch": 0.5510015408320493, "grad_norm": 1.2463264465332031, "learning_rate": 1.8182765624016344e-05, "loss": 1.4616, "step": 894 }, { "epoch": 0.551617873651772, "grad_norm": 1.339301586151123, "learning_rate": 1.817657429769148e-05, "loss": 1.5234, "step": 895 }, { "epoch": 0.5522342064714946, "grad_norm": 1.2821612358093262, "learning_rate": 1.8170373500212557e-05, "loss": 1.4755, "step": 896 }, { "epoch": 0.5528505392912173, "grad_norm": 1.287657380104065, "learning_rate": 1.8164163238762135e-05, "loss": 1.4966, "step": 897 }, { "epoch": 0.5534668721109399, "grad_norm": 1.2771079540252686, "learning_rate": 1.8157943520533732e-05, "loss": 1.5199, "step": 898 }, { "epoch": 0.5540832049306625, "grad_norm": 1.2397319078445435, "learning_rate": 1.815171435273182e-05, "loss": 1.4812, "step": 899 }, { "epoch": 0.5546995377503852, "grad_norm": 1.2858364582061768, "learning_rate": 1.8145475742571822e-05, "loss": 1.4484, "step": 900 }, { "epoch": 0.5553158705701079, "grad_norm": 1.2449160814285278, "learning_rate": 1.8139227697280085e-05, "loss": 1.4736, "step": 901 }, { "epoch": 0.5559322033898305, "grad_norm": 1.2913939952850342, "learning_rate": 1.81329702240939e-05, "loss": 1.4556, "step": 902 }, { "epoch": 0.5565485362095531, "grad_norm": 1.2859578132629395, "learning_rate": 1.812670333026148e-05, "loss": 1.5124, "step": 903 }, { "epoch": 0.5571648690292758, "grad_norm": 1.2694047689437866, "learning_rate": 1.8120427023041926e-05, "loss": 1.4657, "step": 904 }, { "epoch": 0.5577812018489985, "grad_norm": 1.3014659881591797, "learning_rate": 1.8114141309705273e-05, "loss": 1.5037, "step": 905 }, { "epoch": 0.5583975346687211, "grad_norm": 1.259659767150879, "learning_rate": 1.8107846197532436e-05, "loss": 1.4883, "step": 906 }, { "epoch": 0.5590138674884437, "grad_norm": 1.23667311668396, "learning_rate": 1.8101541693815212e-05, "loss": 1.4693, "step": 907 }, { "epoch": 0.5596302003081665, "grad_norm": 1.2499157190322876, "learning_rate": 1.8095227805856294e-05, "loss": 1.4448, "step": 908 }, { "epoch": 0.5602465331278891, "grad_norm": 1.245858907699585, "learning_rate": 1.8088904540969226e-05, "loss": 1.4445, "step": 909 }, { "epoch": 0.5608628659476117, "grad_norm": 1.2809274196624756, "learning_rate": 1.8082571906478427e-05, "loss": 1.4326, "step": 910 }, { "epoch": 0.5614791987673343, "grad_norm": 1.2645031213760376, "learning_rate": 1.807622990971916e-05, "loss": 1.4864, "step": 911 }, { "epoch": 0.562095531587057, "grad_norm": 1.2725409269332886, "learning_rate": 1.806987855803754e-05, "loss": 1.4585, "step": 912 }, { "epoch": 0.5627118644067797, "grad_norm": 1.2623176574707031, "learning_rate": 1.8063517858790517e-05, "loss": 1.3968, "step": 913 }, { "epoch": 0.5633281972265023, "grad_norm": 1.2946065664291382, "learning_rate": 1.8057147819345858e-05, "loss": 1.4755, "step": 914 }, { "epoch": 0.5639445300462249, "grad_norm": 1.3034807443618774, "learning_rate": 1.8050768447082166e-05, "loss": 1.5204, "step": 915 }, { "epoch": 0.5645608628659476, "grad_norm": 1.2962021827697754, "learning_rate": 1.804437974938884e-05, "loss": 1.4217, "step": 916 }, { "epoch": 0.5651771956856703, "grad_norm": 1.2734880447387695, "learning_rate": 1.803798173366609e-05, "loss": 1.4696, "step": 917 }, { "epoch": 0.5657935285053929, "grad_norm": 1.3022407293319702, "learning_rate": 1.8031574407324918e-05, "loss": 1.428, "step": 918 }, { "epoch": 0.5664098613251156, "grad_norm": 1.2504037618637085, "learning_rate": 1.80251577777871e-05, "loss": 1.4666, "step": 919 }, { "epoch": 0.5670261941448382, "grad_norm": 1.4484992027282715, "learning_rate": 1.8018731852485206e-05, "loss": 1.4218, "step": 920 }, { "epoch": 0.5676425269645609, "grad_norm": 2.6547293663024902, "learning_rate": 1.8012296638862562e-05, "loss": 1.4498, "step": 921 }, { "epoch": 0.5682588597842835, "grad_norm": 1.3064091205596924, "learning_rate": 1.8005852144373254e-05, "loss": 1.4632, "step": 922 }, { "epoch": 0.5688751926040062, "grad_norm": 1.3208037614822388, "learning_rate": 1.7999398376482124e-05, "loss": 1.3692, "step": 923 }, { "epoch": 0.5694915254237288, "grad_norm": 1.2979129552841187, "learning_rate": 1.7992935342664744e-05, "loss": 1.4668, "step": 924 }, { "epoch": 0.5701078582434514, "grad_norm": 1.271518349647522, "learning_rate": 1.798646305040743e-05, "loss": 1.4579, "step": 925 }, { "epoch": 0.5707241910631741, "grad_norm": 1.269794225692749, "learning_rate": 1.797998150720722e-05, "loss": 1.4795, "step": 926 }, { "epoch": 0.5713405238828968, "grad_norm": 1.2728458642959595, "learning_rate": 1.7973490720571866e-05, "loss": 1.4376, "step": 927 }, { "epoch": 0.5719568567026194, "grad_norm": 1.264875888824463, "learning_rate": 1.7966990698019823e-05, "loss": 1.4896, "step": 928 }, { "epoch": 0.572573189522342, "grad_norm": 1.3039665222167969, "learning_rate": 1.7960481447080256e-05, "loss": 1.5014, "step": 929 }, { "epoch": 0.5731895223420647, "grad_norm": 1.2540168762207031, "learning_rate": 1.7953962975293008e-05, "loss": 1.4283, "step": 930 }, { "epoch": 0.5738058551617874, "grad_norm": 1.298341155052185, "learning_rate": 1.79474352902086e-05, "loss": 1.4812, "step": 931 }, { "epoch": 0.57442218798151, "grad_norm": 1.2632654905319214, "learning_rate": 1.7940898399388243e-05, "loss": 1.3947, "step": 932 }, { "epoch": 0.5750385208012326, "grad_norm": 1.292464256286621, "learning_rate": 1.7934352310403793e-05, "loss": 1.4455, "step": 933 }, { "epoch": 0.5756548536209554, "grad_norm": 1.2651805877685547, "learning_rate": 1.792779703083777e-05, "loss": 1.4063, "step": 934 }, { "epoch": 0.576271186440678, "grad_norm": 1.272404670715332, "learning_rate": 1.7921232568283337e-05, "loss": 1.4479, "step": 935 }, { "epoch": 0.5768875192604006, "grad_norm": 1.2771462202072144, "learning_rate": 1.7914658930344294e-05, "loss": 1.4692, "step": 936 }, { "epoch": 0.5775038520801232, "grad_norm": 1.2511054277420044, "learning_rate": 1.7908076124635067e-05, "loss": 1.4244, "step": 937 }, { "epoch": 0.578120184899846, "grad_norm": 1.255060076713562, "learning_rate": 1.7901484158780708e-05, "loss": 1.4582, "step": 938 }, { "epoch": 0.5787365177195686, "grad_norm": 1.2633306980133057, "learning_rate": 1.789488304041687e-05, "loss": 1.4685, "step": 939 }, { "epoch": 0.5793528505392912, "grad_norm": 1.2722944021224976, "learning_rate": 1.7888272777189812e-05, "loss": 1.4828, "step": 940 }, { "epoch": 0.5799691833590138, "grad_norm": 1.275268316268921, "learning_rate": 1.7881653376756392e-05, "loss": 1.4394, "step": 941 }, { "epoch": 0.5805855161787365, "grad_norm": 1.2793056964874268, "learning_rate": 1.7875024846784045e-05, "loss": 1.4381, "step": 942 }, { "epoch": 0.5812018489984592, "grad_norm": 1.2965726852416992, "learning_rate": 1.7868387194950777e-05, "loss": 1.428, "step": 943 }, { "epoch": 0.5818181818181818, "grad_norm": 1.2830195426940918, "learning_rate": 1.786174042894517e-05, "loss": 1.4647, "step": 944 }, { "epoch": 0.5824345146379045, "grad_norm": 1.2669392824172974, "learning_rate": 1.7855084556466356e-05, "loss": 1.3959, "step": 945 }, { "epoch": 0.5830508474576271, "grad_norm": 1.271543264389038, "learning_rate": 1.7848419585224017e-05, "loss": 1.405, "step": 946 }, { "epoch": 0.5836671802773498, "grad_norm": 1.280552864074707, "learning_rate": 1.7841745522938377e-05, "loss": 1.4046, "step": 947 }, { "epoch": 0.5842835130970724, "grad_norm": 1.2883890867233276, "learning_rate": 1.7835062377340192e-05, "loss": 1.387, "step": 948 }, { "epoch": 0.5848998459167951, "grad_norm": 1.2794532775878906, "learning_rate": 1.782837015617073e-05, "loss": 1.4882, "step": 949 }, { "epoch": 0.5855161787365177, "grad_norm": 1.309252381324768, "learning_rate": 1.782166886718178e-05, "loss": 1.4795, "step": 950 }, { "epoch": 0.5861325115562404, "grad_norm": 1.2647117376327515, "learning_rate": 1.7814958518135627e-05, "loss": 1.4353, "step": 951 }, { "epoch": 0.586748844375963, "grad_norm": 1.3099619150161743, "learning_rate": 1.7808239116805068e-05, "loss": 1.4662, "step": 952 }, { "epoch": 0.5873651771956857, "grad_norm": 1.2780290842056274, "learning_rate": 1.7801510670973357e-05, "loss": 1.4641, "step": 953 }, { "epoch": 0.5879815100154083, "grad_norm": 1.2789610624313354, "learning_rate": 1.779477318843425e-05, "loss": 1.4822, "step": 954 }, { "epoch": 0.588597842835131, "grad_norm": 1.3234350681304932, "learning_rate": 1.778802667699196e-05, "loss": 1.3918, "step": 955 }, { "epoch": 0.5892141756548536, "grad_norm": 1.3588504791259766, "learning_rate": 1.7781271144461164e-05, "loss": 1.4383, "step": 956 }, { "epoch": 0.5898305084745763, "grad_norm": 1.2964680194854736, "learning_rate": 1.7774506598666973e-05, "loss": 1.418, "step": 957 }, { "epoch": 0.5904468412942989, "grad_norm": 1.287760615348816, "learning_rate": 1.7767733047444954e-05, "loss": 1.4863, "step": 958 }, { "epoch": 0.5910631741140215, "grad_norm": 1.2561811208724976, "learning_rate": 1.77609504986411e-05, "loss": 1.4829, "step": 959 }, { "epoch": 0.5916795069337443, "grad_norm": 1.2881163358688354, "learning_rate": 1.775415896011183e-05, "loss": 1.417, "step": 960 }, { "epoch": 0.5922958397534669, "grad_norm": 1.3055890798568726, "learning_rate": 1.774735843972397e-05, "loss": 1.4554, "step": 961 }, { "epoch": 0.5929121725731895, "grad_norm": 1.298957347869873, "learning_rate": 1.774054894535475e-05, "loss": 1.46, "step": 962 }, { "epoch": 0.5935285053929121, "grad_norm": 1.2974475622177124, "learning_rate": 1.77337304848918e-05, "loss": 1.3998, "step": 963 }, { "epoch": 0.5941448382126349, "grad_norm": 1.2996740341186523, "learning_rate": 1.7726903066233134e-05, "loss": 1.3618, "step": 964 }, { "epoch": 0.5947611710323575, "grad_norm": 1.282745361328125, "learning_rate": 1.7720066697287138e-05, "loss": 1.4157, "step": 965 }, { "epoch": 0.5953775038520801, "grad_norm": 1.3274214267730713, "learning_rate": 1.771322138597257e-05, "loss": 1.4554, "step": 966 }, { "epoch": 0.5959938366718027, "grad_norm": 1.252694845199585, "learning_rate": 1.770636714021855e-05, "loss": 1.4714, "step": 967 }, { "epoch": 0.5966101694915255, "grad_norm": 1.3512969017028809, "learning_rate": 1.7699503967964533e-05, "loss": 1.4599, "step": 968 }, { "epoch": 0.5972265023112481, "grad_norm": 1.28632652759552, "learning_rate": 1.7692631877160326e-05, "loss": 1.4522, "step": 969 }, { "epoch": 0.5978428351309707, "grad_norm": 1.3569471836090088, "learning_rate": 1.768575087576607e-05, "loss": 1.4434, "step": 970 }, { "epoch": 0.5984591679506934, "grad_norm": 1.301344394683838, "learning_rate": 1.767886097175221e-05, "loss": 1.4774, "step": 971 }, { "epoch": 0.599075500770416, "grad_norm": 1.2891639471054077, "learning_rate": 1.7671962173099527e-05, "loss": 1.4445, "step": 972 }, { "epoch": 0.5996918335901387, "grad_norm": 1.2719417810440063, "learning_rate": 1.7665054487799077e-05, "loss": 1.4546, "step": 973 }, { "epoch": 0.6003081664098613, "grad_norm": 1.3011873960494995, "learning_rate": 1.7658137923852236e-05, "loss": 1.4382, "step": 974 }, { "epoch": 0.600924499229584, "grad_norm": 1.3045809268951416, "learning_rate": 1.765121248927065e-05, "loss": 1.4831, "step": 975 }, { "epoch": 0.6015408320493066, "grad_norm": 1.2637208700180054, "learning_rate": 1.764427819207624e-05, "loss": 1.4634, "step": 976 }, { "epoch": 0.6021571648690293, "grad_norm": 1.28716242313385, "learning_rate": 1.7637335040301198e-05, "loss": 1.4798, "step": 977 }, { "epoch": 0.6027734976887519, "grad_norm": 1.2964173555374146, "learning_rate": 1.7630383041987972e-05, "loss": 1.4342, "step": 978 }, { "epoch": 0.6033898305084746, "grad_norm": 1.28719961643219, "learning_rate": 1.7623422205189254e-05, "loss": 1.4484, "step": 979 }, { "epoch": 0.6040061633281972, "grad_norm": 1.2890859842300415, "learning_rate": 1.761645253796797e-05, "loss": 1.4184, "step": 980 }, { "epoch": 0.6046224961479199, "grad_norm": 1.2922401428222656, "learning_rate": 1.760947404839729e-05, "loss": 1.42, "step": 981 }, { "epoch": 0.6052388289676425, "grad_norm": 1.2837836742401123, "learning_rate": 1.760248674456059e-05, "loss": 1.4881, "step": 982 }, { "epoch": 0.6058551617873652, "grad_norm": 1.2654691934585571, "learning_rate": 1.7595490634551453e-05, "loss": 1.4458, "step": 983 }, { "epoch": 0.6064714946070878, "grad_norm": 1.2789521217346191, "learning_rate": 1.7588485726473676e-05, "loss": 1.4261, "step": 984 }, { "epoch": 0.6070878274268104, "grad_norm": 1.3136115074157715, "learning_rate": 1.758147202844123e-05, "loss": 1.5045, "step": 985 }, { "epoch": 0.6077041602465332, "grad_norm": 1.3418716192245483, "learning_rate": 1.757444954857829e-05, "loss": 1.4961, "step": 986 }, { "epoch": 0.6083204930662558, "grad_norm": 1.2966772317886353, "learning_rate": 1.756741829501918e-05, "loss": 1.4954, "step": 987 }, { "epoch": 0.6089368258859784, "grad_norm": 1.289447546005249, "learning_rate": 1.7560378275908398e-05, "loss": 1.3974, "step": 988 }, { "epoch": 0.609553158705701, "grad_norm": 1.3367266654968262, "learning_rate": 1.75533294994006e-05, "loss": 1.4104, "step": 989 }, { "epoch": 0.6101694915254238, "grad_norm": 1.4096240997314453, "learning_rate": 1.7546271973660577e-05, "loss": 1.469, "step": 990 }, { "epoch": 0.6107858243451464, "grad_norm": 1.3186215162277222, "learning_rate": 1.7539205706863255e-05, "loss": 1.4171, "step": 991 }, { "epoch": 0.611402157164869, "grad_norm": 1.340733528137207, "learning_rate": 1.7532130707193696e-05, "loss": 1.4817, "step": 992 }, { "epoch": 0.6120184899845916, "grad_norm": 1.3081716299057007, "learning_rate": 1.7525046982847062e-05, "loss": 1.443, "step": 993 }, { "epoch": 0.6126348228043144, "grad_norm": 1.2978266477584839, "learning_rate": 1.751795454202863e-05, "loss": 1.3804, "step": 994 }, { "epoch": 0.613251155624037, "grad_norm": 1.2926156520843506, "learning_rate": 1.7510853392953773e-05, "loss": 1.5062, "step": 995 }, { "epoch": 0.6138674884437596, "grad_norm": 1.3179266452789307, "learning_rate": 1.7503743543847954e-05, "loss": 1.4396, "step": 996 }, { "epoch": 0.6144838212634823, "grad_norm": 1.3290834426879883, "learning_rate": 1.7496625002946702e-05, "loss": 1.4109, "step": 997 }, { "epoch": 0.615100154083205, "grad_norm": 1.3353898525238037, "learning_rate": 1.7489497778495623e-05, "loss": 1.4308, "step": 998 }, { "epoch": 0.6157164869029276, "grad_norm": 1.2900705337524414, "learning_rate": 1.7482361878750388e-05, "loss": 1.4245, "step": 999 }, { "epoch": 0.6163328197226502, "grad_norm": 1.2731356620788574, "learning_rate": 1.7475217311976697e-05, "loss": 1.4197, "step": 1000 }, { "epoch": 0.6169491525423729, "grad_norm": 1.2875813245773315, "learning_rate": 1.7468064086450308e-05, "loss": 1.4641, "step": 1001 }, { "epoch": 0.6175654853620955, "grad_norm": 1.2781028747558594, "learning_rate": 1.7460902210457004e-05, "loss": 1.4729, "step": 1002 }, { "epoch": 0.6181818181818182, "grad_norm": 1.3109949827194214, "learning_rate": 1.7453731692292578e-05, "loss": 1.4783, "step": 1003 }, { "epoch": 0.6187981510015408, "grad_norm": 1.3525522947311401, "learning_rate": 1.7446552540262844e-05, "loss": 1.4504, "step": 1004 }, { "epoch": 0.6194144838212635, "grad_norm": 1.3489145040512085, "learning_rate": 1.7439364762683613e-05, "loss": 1.5043, "step": 1005 }, { "epoch": 0.6200308166409861, "grad_norm": 1.3232150077819824, "learning_rate": 1.7432168367880698e-05, "loss": 1.5343, "step": 1006 }, { "epoch": 0.6206471494607088, "grad_norm": 1.324433445930481, "learning_rate": 1.742496336418987e-05, "loss": 1.4634, "step": 1007 }, { "epoch": 0.6212634822804314, "grad_norm": 1.3114863634109497, "learning_rate": 1.7417749759956898e-05, "loss": 1.3818, "step": 1008 }, { "epoch": 0.6218798151001541, "grad_norm": 1.3406624794006348, "learning_rate": 1.741052756353749e-05, "loss": 1.5066, "step": 1009 }, { "epoch": 0.6224961479198767, "grad_norm": 1.3135074377059937, "learning_rate": 1.7403296783297322e-05, "loss": 1.4888, "step": 1010 }, { "epoch": 0.6231124807395994, "grad_norm": 1.2870179414749146, "learning_rate": 1.739605742761201e-05, "loss": 1.372, "step": 1011 }, { "epoch": 0.6237288135593221, "grad_norm": 1.2946306467056274, "learning_rate": 1.73888095048671e-05, "loss": 1.5086, "step": 1012 }, { "epoch": 0.6243451463790447, "grad_norm": 1.2745262384414673, "learning_rate": 1.7381553023458064e-05, "loss": 1.4279, "step": 1013 }, { "epoch": 0.6249614791987673, "grad_norm": 1.3256237506866455, "learning_rate": 1.7374287991790283e-05, "loss": 1.5078, "step": 1014 }, { "epoch": 0.6255778120184899, "grad_norm": 1.2990922927856445, "learning_rate": 1.7367014418279055e-05, "loss": 1.3981, "step": 1015 }, { "epoch": 0.6261941448382127, "grad_norm": 1.389955759048462, "learning_rate": 1.7359732311349552e-05, "loss": 1.5264, "step": 1016 }, { "epoch": 0.6268104776579353, "grad_norm": 1.351433515548706, "learning_rate": 1.7352441679436852e-05, "loss": 1.4815, "step": 1017 }, { "epoch": 0.6274268104776579, "grad_norm": 1.3153966665267944, "learning_rate": 1.734514253098589e-05, "loss": 1.4777, "step": 1018 }, { "epoch": 0.6280431432973805, "grad_norm": 1.283727765083313, "learning_rate": 1.733783487445147e-05, "loss": 1.4899, "step": 1019 }, { "epoch": 0.6286594761171033, "grad_norm": 1.3266637325286865, "learning_rate": 1.7330518718298263e-05, "loss": 1.4943, "step": 1020 }, { "epoch": 0.6292758089368259, "grad_norm": 1.3029146194458008, "learning_rate": 1.7323194071000776e-05, "loss": 1.47, "step": 1021 }, { "epoch": 0.6298921417565485, "grad_norm": 1.3112081289291382, "learning_rate": 1.7315860941043347e-05, "loss": 1.4342, "step": 1022 }, { "epoch": 0.6305084745762712, "grad_norm": 1.2910702228546143, "learning_rate": 1.730851933692015e-05, "loss": 1.3816, "step": 1023 }, { "epoch": 0.6311248073959939, "grad_norm": 1.3513481616973877, "learning_rate": 1.7301169267135162e-05, "loss": 1.4324, "step": 1024 }, { "epoch": 0.6317411402157165, "grad_norm": 1.3509823083877563, "learning_rate": 1.729381074020218e-05, "loss": 1.453, "step": 1025 }, { "epoch": 0.6323574730354391, "grad_norm": 1.2922526597976685, "learning_rate": 1.7286443764644796e-05, "loss": 1.4086, "step": 1026 }, { "epoch": 0.6329738058551618, "grad_norm": 1.318701148033142, "learning_rate": 1.7279068348996376e-05, "loss": 1.4527, "step": 1027 }, { "epoch": 0.6335901386748845, "grad_norm": 1.3059755563735962, "learning_rate": 1.7271684501800067e-05, "loss": 1.4533, "step": 1028 }, { "epoch": 0.6342064714946071, "grad_norm": 1.3281493186950684, "learning_rate": 1.726429223160879e-05, "loss": 1.4422, "step": 1029 }, { "epoch": 0.6348228043143297, "grad_norm": 1.290744662284851, "learning_rate": 1.7256891546985217e-05, "loss": 1.463, "step": 1030 }, { "epoch": 0.6354391371340524, "grad_norm": 1.3279908895492554, "learning_rate": 1.724948245650177e-05, "loss": 1.382, "step": 1031 }, { "epoch": 0.636055469953775, "grad_norm": 1.3572450876235962, "learning_rate": 1.7242064968740598e-05, "loss": 1.4548, "step": 1032 }, { "epoch": 0.6366718027734977, "grad_norm": 1.2883111238479614, "learning_rate": 1.7234639092293594e-05, "loss": 1.4256, "step": 1033 }, { "epoch": 0.6372881355932203, "grad_norm": 1.3109279870986938, "learning_rate": 1.7227204835762348e-05, "loss": 1.5113, "step": 1034 }, { "epoch": 0.637904468412943, "grad_norm": 1.282270073890686, "learning_rate": 1.721976220775817e-05, "loss": 1.3814, "step": 1035 }, { "epoch": 0.6385208012326656, "grad_norm": 1.3193929195404053, "learning_rate": 1.721231121690207e-05, "loss": 1.4284, "step": 1036 }, { "epoch": 0.6391371340523883, "grad_norm": 1.3761621713638306, "learning_rate": 1.720485187182473e-05, "loss": 1.4239, "step": 1037 }, { "epoch": 0.639753466872111, "grad_norm": 1.3555552959442139, "learning_rate": 1.7197384181166516e-05, "loss": 1.4803, "step": 1038 }, { "epoch": 0.6403697996918336, "grad_norm": 1.3150558471679688, "learning_rate": 1.7189908153577473e-05, "loss": 1.405, "step": 1039 }, { "epoch": 0.6409861325115562, "grad_norm": 1.3542011976242065, "learning_rate": 1.7182423797717286e-05, "loss": 1.4317, "step": 1040 }, { "epoch": 0.6416024653312788, "grad_norm": 1.266971230506897, "learning_rate": 1.717493112225529e-05, "loss": 1.4081, "step": 1041 }, { "epoch": 0.6422187981510016, "grad_norm": 1.3353286981582642, "learning_rate": 1.7167430135870464e-05, "loss": 1.4692, "step": 1042 }, { "epoch": 0.6428351309707242, "grad_norm": 1.3516252040863037, "learning_rate": 1.7159920847251407e-05, "loss": 1.3859, "step": 1043 }, { "epoch": 0.6434514637904468, "grad_norm": 1.3363137245178223, "learning_rate": 1.7152403265096342e-05, "loss": 1.394, "step": 1044 }, { "epoch": 0.6440677966101694, "grad_norm": 1.3625383377075195, "learning_rate": 1.7144877398113094e-05, "loss": 1.4381, "step": 1045 }, { "epoch": 0.6446841294298922, "grad_norm": 1.3290737867355347, "learning_rate": 1.713734325501908e-05, "loss": 1.4065, "step": 1046 }, { "epoch": 0.6453004622496148, "grad_norm": 1.3321038484573364, "learning_rate": 1.7129800844541308e-05, "loss": 1.4068, "step": 1047 }, { "epoch": 0.6459167950693374, "grad_norm": 1.3138689994812012, "learning_rate": 1.7122250175416363e-05, "loss": 1.4152, "step": 1048 }, { "epoch": 0.6465331278890601, "grad_norm": 1.3002607822418213, "learning_rate": 1.7114691256390405e-05, "loss": 1.4163, "step": 1049 }, { "epoch": 0.6471494607087828, "grad_norm": 1.3316622972488403, "learning_rate": 1.7107124096219124e-05, "loss": 1.4068, "step": 1050 }, { "epoch": 0.6477657935285054, "grad_norm": 1.3321202993392944, "learning_rate": 1.7099548703667785e-05, "loss": 1.3591, "step": 1051 }, { "epoch": 0.648382126348228, "grad_norm": 1.3037430047988892, "learning_rate": 1.7091965087511174e-05, "loss": 1.4411, "step": 1052 }, { "epoch": 0.6489984591679507, "grad_norm": 1.3212056159973145, "learning_rate": 1.7084373256533603e-05, "loss": 1.4342, "step": 1053 }, { "epoch": 0.6496147919876734, "grad_norm": 1.3281359672546387, "learning_rate": 1.7076773219528905e-05, "loss": 1.4187, "step": 1054 }, { "epoch": 0.650231124807396, "grad_norm": 1.308585524559021, "learning_rate": 1.706916498530041e-05, "loss": 1.4604, "step": 1055 }, { "epoch": 0.6508474576271186, "grad_norm": 1.286362886428833, "learning_rate": 1.706154856266095e-05, "loss": 1.455, "step": 1056 }, { "epoch": 0.6514637904468413, "grad_norm": 1.2927582263946533, "learning_rate": 1.7053923960432847e-05, "loss": 1.5197, "step": 1057 }, { "epoch": 0.652080123266564, "grad_norm": 1.2999407052993774, "learning_rate": 1.7046291187447878e-05, "loss": 1.4954, "step": 1058 }, { "epoch": 0.6526964560862866, "grad_norm": 1.3148698806762695, "learning_rate": 1.70386502525473e-05, "loss": 1.5155, "step": 1059 }, { "epoch": 0.6533127889060092, "grad_norm": 1.2802486419677734, "learning_rate": 1.7031001164581828e-05, "loss": 1.455, "step": 1060 }, { "epoch": 0.6539291217257319, "grad_norm": 1.305747151374817, "learning_rate": 1.702334393241161e-05, "loss": 1.4276, "step": 1061 }, { "epoch": 0.6545454545454545, "grad_norm": 1.2846449613571167, "learning_rate": 1.701567856490623e-05, "loss": 1.4244, "step": 1062 }, { "epoch": 0.6551617873651772, "grad_norm": 1.2861919403076172, "learning_rate": 1.7008005070944695e-05, "loss": 1.4181, "step": 1063 }, { "epoch": 0.6557781201848999, "grad_norm": 1.3186559677124023, "learning_rate": 1.7000323459415434e-05, "loss": 1.4413, "step": 1064 }, { "epoch": 0.6563944530046225, "grad_norm": 1.334093451499939, "learning_rate": 1.6992633739216266e-05, "loss": 1.4448, "step": 1065 }, { "epoch": 0.6570107858243451, "grad_norm": 1.3095637559890747, "learning_rate": 1.6984935919254408e-05, "loss": 1.4763, "step": 1066 }, { "epoch": 0.6576271186440678, "grad_norm": 1.2848647832870483, "learning_rate": 1.6977230008446466e-05, "loss": 1.4261, "step": 1067 }, { "epoch": 0.6582434514637905, "grad_norm": 1.269071102142334, "learning_rate": 1.6969516015718405e-05, "loss": 1.3611, "step": 1068 }, { "epoch": 0.6588597842835131, "grad_norm": 1.3244129419326782, "learning_rate": 1.696179395000556e-05, "loss": 1.4081, "step": 1069 }, { "epoch": 0.6594761171032357, "grad_norm": 1.3379712104797363, "learning_rate": 1.6954063820252615e-05, "loss": 1.4782, "step": 1070 }, { "epoch": 0.6600924499229583, "grad_norm": 1.3701303005218506, "learning_rate": 1.6946325635413596e-05, "loss": 1.471, "step": 1071 }, { "epoch": 0.6607087827426811, "grad_norm": 1.3067114353179932, "learning_rate": 1.693857940445186e-05, "loss": 1.3945, "step": 1072 }, { "epoch": 0.6613251155624037, "grad_norm": 1.3085665702819824, "learning_rate": 1.6930825136340076e-05, "loss": 1.4025, "step": 1073 }, { "epoch": 0.6619414483821263, "grad_norm": 1.3130125999450684, "learning_rate": 1.6923062840060233e-05, "loss": 1.437, "step": 1074 }, { "epoch": 0.662557781201849, "grad_norm": 1.308841586112976, "learning_rate": 1.6915292524603617e-05, "loss": 1.4018, "step": 1075 }, { "epoch": 0.6631741140215717, "grad_norm": 1.3269802331924438, "learning_rate": 1.69075141989708e-05, "loss": 1.3699, "step": 1076 }, { "epoch": 0.6637904468412943, "grad_norm": 1.3659073114395142, "learning_rate": 1.6899727872171632e-05, "loss": 1.5031, "step": 1077 }, { "epoch": 0.6644067796610169, "grad_norm": 1.334735631942749, "learning_rate": 1.689193355322523e-05, "loss": 1.4455, "step": 1078 }, { "epoch": 0.6650231124807396, "grad_norm": 1.3292573690414429, "learning_rate": 1.6884131251159978e-05, "loss": 1.4063, "step": 1079 }, { "epoch": 0.6656394453004623, "grad_norm": 1.4586622714996338, "learning_rate": 1.6876320975013498e-05, "loss": 1.4197, "step": 1080 }, { "epoch": 0.6662557781201849, "grad_norm": 1.323284387588501, "learning_rate": 1.6868502733832647e-05, "loss": 1.4331, "step": 1081 }, { "epoch": 0.6668721109399075, "grad_norm": 1.3338954448699951, "learning_rate": 1.6860676536673513e-05, "loss": 1.4308, "step": 1082 }, { "epoch": 0.6674884437596302, "grad_norm": 1.322880744934082, "learning_rate": 1.68528423926014e-05, "loss": 1.4751, "step": 1083 }, { "epoch": 0.6681047765793529, "grad_norm": 1.3367668390274048, "learning_rate": 1.6845000310690817e-05, "loss": 1.3907, "step": 1084 }, { "epoch": 0.6687211093990755, "grad_norm": 1.3415685892105103, "learning_rate": 1.683715030002546e-05, "loss": 1.3859, "step": 1085 }, { "epoch": 0.6693374422187982, "grad_norm": 1.2982367277145386, "learning_rate": 1.682929236969822e-05, "loss": 1.4506, "step": 1086 }, { "epoch": 0.6699537750385208, "grad_norm": 1.2846623659133911, "learning_rate": 1.682142652881116e-05, "loss": 1.4218, "step": 1087 }, { "epoch": 0.6705701078582434, "grad_norm": 1.3145357370376587, "learning_rate": 1.6813552786475495e-05, "loss": 1.4286, "step": 1088 }, { "epoch": 0.6711864406779661, "grad_norm": 1.2875252962112427, "learning_rate": 1.6805671151811606e-05, "loss": 1.4228, "step": 1089 }, { "epoch": 0.6718027734976888, "grad_norm": 1.4280080795288086, "learning_rate": 1.679778163394901e-05, "loss": 1.553, "step": 1090 }, { "epoch": 0.6724191063174114, "grad_norm": 1.319756269454956, "learning_rate": 1.6789884242026352e-05, "loss": 1.3581, "step": 1091 }, { "epoch": 0.673035439137134, "grad_norm": 1.3090920448303223, "learning_rate": 1.6781978985191406e-05, "loss": 1.4325, "step": 1092 }, { "epoch": 0.6736517719568567, "grad_norm": 1.3277099132537842, "learning_rate": 1.677406587260105e-05, "loss": 1.4288, "step": 1093 }, { "epoch": 0.6742681047765794, "grad_norm": 1.3114728927612305, "learning_rate": 1.676614491342126e-05, "loss": 1.4057, "step": 1094 }, { "epoch": 0.674884437596302, "grad_norm": 1.319132685661316, "learning_rate": 1.6758216116827106e-05, "loss": 1.4066, "step": 1095 }, { "epoch": 0.6755007704160246, "grad_norm": 1.3682420253753662, "learning_rate": 1.6750279492002737e-05, "loss": 1.4053, "step": 1096 }, { "epoch": 0.6761171032357473, "grad_norm": 1.4799227714538574, "learning_rate": 1.674233504814136e-05, "loss": 1.4855, "step": 1097 }, { "epoch": 0.67673343605547, "grad_norm": 1.3237099647521973, "learning_rate": 1.6734382794445253e-05, "loss": 1.4083, "step": 1098 }, { "epoch": 0.6773497688751926, "grad_norm": 1.3436493873596191, "learning_rate": 1.672642274012573e-05, "loss": 1.4439, "step": 1099 }, { "epoch": 0.6779661016949152, "grad_norm": 1.3135432004928589, "learning_rate": 1.6718454894403143e-05, "loss": 1.3968, "step": 1100 }, { "epoch": 0.678582434514638, "grad_norm": 1.3426473140716553, "learning_rate": 1.6710479266506873e-05, "loss": 1.4624, "step": 1101 }, { "epoch": 0.6791987673343606, "grad_norm": 1.4133937358856201, "learning_rate": 1.670249586567531e-05, "loss": 1.421, "step": 1102 }, { "epoch": 0.6798151001540832, "grad_norm": 1.3523914813995361, "learning_rate": 1.669450470115585e-05, "loss": 1.3592, "step": 1103 }, { "epoch": 0.6804314329738058, "grad_norm": 1.3294241428375244, "learning_rate": 1.668650578220488e-05, "loss": 1.4357, "step": 1104 }, { "epoch": 0.6810477657935285, "grad_norm": 1.3072315454483032, "learning_rate": 1.6678499118087777e-05, "loss": 1.4329, "step": 1105 }, { "epoch": 0.6816640986132512, "grad_norm": 1.3670566082000732, "learning_rate": 1.6670484718078878e-05, "loss": 1.5, "step": 1106 }, { "epoch": 0.6822804314329738, "grad_norm": 1.3196446895599365, "learning_rate": 1.6662462591461485e-05, "loss": 1.4262, "step": 1107 }, { "epoch": 0.6828967642526964, "grad_norm": 1.3277305364608765, "learning_rate": 1.6654432747527855e-05, "loss": 1.4207, "step": 1108 }, { "epoch": 0.6835130970724191, "grad_norm": 1.3459912538528442, "learning_rate": 1.664639519557918e-05, "loss": 1.4218, "step": 1109 }, { "epoch": 0.6841294298921418, "grad_norm": 1.3876893520355225, "learning_rate": 1.663834994492558e-05, "loss": 1.4324, "step": 1110 }, { "epoch": 0.6847457627118644, "grad_norm": 1.3100184202194214, "learning_rate": 1.6630297004886087e-05, "loss": 1.3616, "step": 1111 }, { "epoch": 0.6853620955315871, "grad_norm": 1.3379758596420288, "learning_rate": 1.6622236384788653e-05, "loss": 1.4224, "step": 1112 }, { "epoch": 0.6859784283513097, "grad_norm": 1.3118106126785278, "learning_rate": 1.6614168093970123e-05, "loss": 1.4312, "step": 1113 }, { "epoch": 0.6865947611710324, "grad_norm": 1.3353965282440186, "learning_rate": 1.660609214177621e-05, "loss": 1.4237, "step": 1114 }, { "epoch": 0.687211093990755, "grad_norm": 1.3618073463439941, "learning_rate": 1.6598008537561526e-05, "loss": 1.4676, "step": 1115 }, { "epoch": 0.6878274268104777, "grad_norm": 1.3534114360809326, "learning_rate": 1.6589917290689532e-05, "loss": 1.4006, "step": 1116 }, { "epoch": 0.6884437596302003, "grad_norm": 1.3765275478363037, "learning_rate": 1.6581818410532546e-05, "loss": 1.4377, "step": 1117 }, { "epoch": 0.689060092449923, "grad_norm": 1.3998385667800903, "learning_rate": 1.6573711906471723e-05, "loss": 1.4751, "step": 1118 }, { "epoch": 0.6896764252696456, "grad_norm": 1.3291836977005005, "learning_rate": 1.6565597787897055e-05, "loss": 1.3834, "step": 1119 }, { "epoch": 0.6902927580893683, "grad_norm": 1.3467388153076172, "learning_rate": 1.6557476064207354e-05, "loss": 1.3693, "step": 1120 }, { "epoch": 0.6909090909090909, "grad_norm": 1.3882273435592651, "learning_rate": 1.6549346744810236e-05, "loss": 1.3702, "step": 1121 }, { "epoch": 0.6915254237288135, "grad_norm": 1.4133625030517578, "learning_rate": 1.654120983912212e-05, "loss": 1.4387, "step": 1122 }, { "epoch": 0.6921417565485362, "grad_norm": 1.3705497980117798, "learning_rate": 1.6533065356568206e-05, "loss": 1.3665, "step": 1123 }, { "epoch": 0.6927580893682589, "grad_norm": 1.3537425994873047, "learning_rate": 1.6524913306582485e-05, "loss": 1.3907, "step": 1124 }, { "epoch": 0.6933744221879815, "grad_norm": 1.3726212978363037, "learning_rate": 1.6516753698607698e-05, "loss": 1.3866, "step": 1125 }, { "epoch": 0.6939907550077041, "grad_norm": 1.370794653892517, "learning_rate": 1.650858654209534e-05, "loss": 1.4486, "step": 1126 }, { "epoch": 0.6946070878274269, "grad_norm": 1.319109559059143, "learning_rate": 1.650041184650567e-05, "loss": 1.3534, "step": 1127 }, { "epoch": 0.6952234206471495, "grad_norm": 1.3743484020233154, "learning_rate": 1.649222962130766e-05, "loss": 1.4316, "step": 1128 }, { "epoch": 0.6958397534668721, "grad_norm": 1.3410758972167969, "learning_rate": 1.6484039875979004e-05, "loss": 1.3804, "step": 1129 }, { "epoch": 0.6964560862865947, "grad_norm": 1.4622619152069092, "learning_rate": 1.647584262000612e-05, "loss": 1.4098, "step": 1130 }, { "epoch": 0.6970724191063175, "grad_norm": 1.4402393102645874, "learning_rate": 1.6467637862884114e-05, "loss": 1.4853, "step": 1131 }, { "epoch": 0.6976887519260401, "grad_norm": 1.3858689069747925, "learning_rate": 1.6459425614116785e-05, "loss": 1.4193, "step": 1132 }, { "epoch": 0.6983050847457627, "grad_norm": 1.3480373620986938, "learning_rate": 1.6451205883216612e-05, "loss": 1.339, "step": 1133 }, { "epoch": 0.6989214175654853, "grad_norm": 1.312269687652588, "learning_rate": 1.6442978679704738e-05, "loss": 1.3665, "step": 1134 }, { "epoch": 0.699537750385208, "grad_norm": 1.3623703718185425, "learning_rate": 1.643474401311096e-05, "loss": 1.4147, "step": 1135 }, { "epoch": 0.7001540832049307, "grad_norm": 1.3509254455566406, "learning_rate": 1.6426501892973726e-05, "loss": 1.42, "step": 1136 }, { "epoch": 0.7007704160246533, "grad_norm": 1.3435120582580566, "learning_rate": 1.6418252328840112e-05, "loss": 1.4804, "step": 1137 }, { "epoch": 0.701386748844376, "grad_norm": 1.3655997514724731, "learning_rate": 1.6409995330265817e-05, "loss": 1.4709, "step": 1138 }, { "epoch": 0.7020030816640986, "grad_norm": 1.3416478633880615, "learning_rate": 1.640173090681515e-05, "loss": 1.4579, "step": 1139 }, { "epoch": 0.7026194144838213, "grad_norm": 1.3486171960830688, "learning_rate": 1.639345906806103e-05, "loss": 1.4693, "step": 1140 }, { "epoch": 0.7032357473035439, "grad_norm": 1.3015707731246948, "learning_rate": 1.6385179823584957e-05, "loss": 1.4163, "step": 1141 }, { "epoch": 0.7038520801232666, "grad_norm": 1.3379905223846436, "learning_rate": 1.6376893182977006e-05, "loss": 1.3602, "step": 1142 }, { "epoch": 0.7044684129429892, "grad_norm": 1.3435595035552979, "learning_rate": 1.6368599155835826e-05, "loss": 1.4084, "step": 1143 }, { "epoch": 0.7050847457627119, "grad_norm": 1.346488118171692, "learning_rate": 1.636029775176862e-05, "loss": 1.3837, "step": 1144 }, { "epoch": 0.7057010785824345, "grad_norm": 1.3288805484771729, "learning_rate": 1.6351988980391142e-05, "loss": 1.4711, "step": 1145 }, { "epoch": 0.7063174114021572, "grad_norm": 1.3497323989868164, "learning_rate": 1.6343672851327666e-05, "loss": 1.3483, "step": 1146 }, { "epoch": 0.7069337442218798, "grad_norm": 1.358965516090393, "learning_rate": 1.6335349374210997e-05, "loss": 1.4658, "step": 1147 }, { "epoch": 0.7075500770416024, "grad_norm": 1.3880150318145752, "learning_rate": 1.632701855868245e-05, "loss": 1.4341, "step": 1148 }, { "epoch": 0.7081664098613251, "grad_norm": 1.3684008121490479, "learning_rate": 1.631868041439185e-05, "loss": 1.3713, "step": 1149 }, { "epoch": 0.7087827426810478, "grad_norm": 1.3648697137832642, "learning_rate": 1.6310334950997492e-05, "loss": 1.428, "step": 1150 }, { "epoch": 0.7093990755007704, "grad_norm": 1.3747222423553467, "learning_rate": 1.6301982178166162e-05, "loss": 1.3868, "step": 1151 }, { "epoch": 0.710015408320493, "grad_norm": 1.3591516017913818, "learning_rate": 1.629362210557311e-05, "loss": 1.4682, "step": 1152 }, { "epoch": 0.7106317411402158, "grad_norm": 1.3448100090026855, "learning_rate": 1.628525474290204e-05, "loss": 1.4336, "step": 1153 }, { "epoch": 0.7112480739599384, "grad_norm": 1.34591543674469, "learning_rate": 1.62768800998451e-05, "loss": 1.3906, "step": 1154 }, { "epoch": 0.711864406779661, "grad_norm": 1.348944902420044, "learning_rate": 1.6268498186102882e-05, "loss": 1.4159, "step": 1155 }, { "epoch": 0.7124807395993836, "grad_norm": 1.4031671285629272, "learning_rate": 1.6260109011384382e-05, "loss": 1.3995, "step": 1156 }, { "epoch": 0.7130970724191064, "grad_norm": 1.3662149906158447, "learning_rate": 1.6251712585407016e-05, "loss": 1.4399, "step": 1157 }, { "epoch": 0.713713405238829, "grad_norm": 1.3691681623458862, "learning_rate": 1.62433089178966e-05, "loss": 1.433, "step": 1158 }, { "epoch": 0.7143297380585516, "grad_norm": 1.3465464115142822, "learning_rate": 1.6234898018587336e-05, "loss": 1.4762, "step": 1159 }, { "epoch": 0.7149460708782742, "grad_norm": 1.377349853515625, "learning_rate": 1.62264798972218e-05, "loss": 1.4748, "step": 1160 }, { "epoch": 0.715562403697997, "grad_norm": 1.3775911331176758, "learning_rate": 1.6218054563550946e-05, "loss": 1.4147, "step": 1161 }, { "epoch": 0.7161787365177196, "grad_norm": 1.3441201448440552, "learning_rate": 1.6209622027334063e-05, "loss": 1.5023, "step": 1162 }, { "epoch": 0.7167950693374422, "grad_norm": 1.3580741882324219, "learning_rate": 1.6201182298338798e-05, "loss": 1.3606, "step": 1163 }, { "epoch": 0.7174114021571649, "grad_norm": 1.3541022539138794, "learning_rate": 1.6192735386341123e-05, "loss": 1.3834, "step": 1164 }, { "epoch": 0.7180277349768875, "grad_norm": 1.3307015895843506, "learning_rate": 1.618428130112533e-05, "loss": 1.3617, "step": 1165 }, { "epoch": 0.7186440677966102, "grad_norm": 1.384247064590454, "learning_rate": 1.6175820052484025e-05, "loss": 1.3595, "step": 1166 }, { "epoch": 0.7192604006163328, "grad_norm": 1.367929458618164, "learning_rate": 1.616735165021811e-05, "loss": 1.3911, "step": 1167 }, { "epoch": 0.7198767334360555, "grad_norm": 1.3549861907958984, "learning_rate": 1.6158876104136765e-05, "loss": 1.4308, "step": 1168 }, { "epoch": 0.7204930662557781, "grad_norm": 1.3546504974365234, "learning_rate": 1.615039342405745e-05, "loss": 1.4307, "step": 1169 }, { "epoch": 0.7211093990755008, "grad_norm": 1.3728663921356201, "learning_rate": 1.6141903619805904e-05, "loss": 1.4187, "step": 1170 }, { "epoch": 0.7217257318952234, "grad_norm": 1.3984835147857666, "learning_rate": 1.613340670121609e-05, "loss": 1.4761, "step": 1171 }, { "epoch": 0.7223420647149461, "grad_norm": 1.3855217695236206, "learning_rate": 1.612490267813023e-05, "loss": 1.3772, "step": 1172 }, { "epoch": 0.7229583975346687, "grad_norm": 1.357847809791565, "learning_rate": 1.6116391560398776e-05, "loss": 1.3913, "step": 1173 }, { "epoch": 0.7235747303543913, "grad_norm": 1.352582335472107, "learning_rate": 1.6107873357880387e-05, "loss": 1.3946, "step": 1174 }, { "epoch": 0.724191063174114, "grad_norm": 1.3529841899871826, "learning_rate": 1.6099348080441934e-05, "loss": 1.4135, "step": 1175 }, { "epoch": 0.7248073959938367, "grad_norm": 1.3303114175796509, "learning_rate": 1.6090815737958488e-05, "loss": 1.3781, "step": 1176 }, { "epoch": 0.7254237288135593, "grad_norm": 1.3634905815124512, "learning_rate": 1.6082276340313296e-05, "loss": 1.3495, "step": 1177 }, { "epoch": 0.7260400616332819, "grad_norm": 1.3385804891586304, "learning_rate": 1.607372989739778e-05, "loss": 1.3583, "step": 1178 }, { "epoch": 0.7266563944530047, "grad_norm": 1.3620234727859497, "learning_rate": 1.606517641911153e-05, "loss": 1.4263, "step": 1179 }, { "epoch": 0.7272727272727273, "grad_norm": 1.3477967977523804, "learning_rate": 1.605661591536227e-05, "loss": 1.3293, "step": 1180 }, { "epoch": 0.7278890600924499, "grad_norm": 1.3738712072372437, "learning_rate": 1.6048048396065875e-05, "loss": 1.4275, "step": 1181 }, { "epoch": 0.7285053929121725, "grad_norm": 1.4421792030334473, "learning_rate": 1.603947387114634e-05, "loss": 1.3872, "step": 1182 }, { "epoch": 0.7291217257318953, "grad_norm": 1.3777658939361572, "learning_rate": 1.6030892350535773e-05, "loss": 1.3881, "step": 1183 }, { "epoch": 0.7297380585516179, "grad_norm": 1.328555941581726, "learning_rate": 1.60223038441744e-05, "loss": 1.3747, "step": 1184 }, { "epoch": 0.7303543913713405, "grad_norm": 1.3404127359390259, "learning_rate": 1.6013708362010512e-05, "loss": 1.4297, "step": 1185 }, { "epoch": 0.7309707241910631, "grad_norm": 1.3050881624221802, "learning_rate": 1.6005105914000508e-05, "loss": 1.3647, "step": 1186 }, { "epoch": 0.7315870570107859, "grad_norm": 1.3262568712234497, "learning_rate": 1.5996496510108836e-05, "loss": 1.4004, "step": 1187 }, { "epoch": 0.7322033898305085, "grad_norm": 1.3432468175888062, "learning_rate": 1.598788016030801e-05, "loss": 1.4275, "step": 1188 }, { "epoch": 0.7328197226502311, "grad_norm": 1.3584766387939453, "learning_rate": 1.5979256874578595e-05, "loss": 1.3929, "step": 1189 }, { "epoch": 0.7334360554699538, "grad_norm": 1.400837779045105, "learning_rate": 1.5970626662909174e-05, "loss": 1.46, "step": 1190 }, { "epoch": 0.7340523882896764, "grad_norm": 1.3563096523284912, "learning_rate": 1.596198953529637e-05, "loss": 1.3671, "step": 1191 }, { "epoch": 0.7346687211093991, "grad_norm": 1.3891572952270508, "learning_rate": 1.5953345501744803e-05, "loss": 1.4469, "step": 1192 }, { "epoch": 0.7352850539291217, "grad_norm": 1.3301055431365967, "learning_rate": 1.5944694572267098e-05, "loss": 1.3798, "step": 1193 }, { "epoch": 0.7359013867488444, "grad_norm": 1.3338972330093384, "learning_rate": 1.593603675688387e-05, "loss": 1.3985, "step": 1194 }, { "epoch": 0.736517719568567, "grad_norm": 1.3893210887908936, "learning_rate": 1.592737206562371e-05, "loss": 1.3904, "step": 1195 }, { "epoch": 0.7371340523882897, "grad_norm": 1.3857276439666748, "learning_rate": 1.591870050852317e-05, "loss": 1.4301, "step": 1196 }, { "epoch": 0.7377503852080123, "grad_norm": 1.3413461446762085, "learning_rate": 1.5910022095626752e-05, "loss": 1.4022, "step": 1197 }, { "epoch": 0.738366718027735, "grad_norm": 1.3509676456451416, "learning_rate": 1.5901336836986908e-05, "loss": 1.3769, "step": 1198 }, { "epoch": 0.7389830508474576, "grad_norm": 1.3821078538894653, "learning_rate": 1.589264474266402e-05, "loss": 1.4488, "step": 1199 }, { "epoch": 0.7395993836671803, "grad_norm": 1.3913413286209106, "learning_rate": 1.5883945822726373e-05, "loss": 1.4404, "step": 1200 }, { "epoch": 0.7402157164869029, "grad_norm": 1.4336520433425903, "learning_rate": 1.5875240087250177e-05, "loss": 1.3708, "step": 1201 }, { "epoch": 0.7408320493066256, "grad_norm": 1.410294532775879, "learning_rate": 1.5866527546319526e-05, "loss": 1.3549, "step": 1202 }, { "epoch": 0.7414483821263482, "grad_norm": 1.3986623287200928, "learning_rate": 1.5857808210026394e-05, "loss": 1.3558, "step": 1203 }, { "epoch": 0.7420647149460708, "grad_norm": 1.3681679964065552, "learning_rate": 1.5849082088470638e-05, "loss": 1.409, "step": 1204 }, { "epoch": 0.7426810477657936, "grad_norm": 1.3765243291854858, "learning_rate": 1.5840349191759964e-05, "loss": 1.404, "step": 1205 }, { "epoch": 0.7432973805855162, "grad_norm": 1.4221912622451782, "learning_rate": 1.5831609530009932e-05, "loss": 1.4265, "step": 1206 }, { "epoch": 0.7439137134052388, "grad_norm": 1.3863918781280518, "learning_rate": 1.5822863113343934e-05, "loss": 1.4538, "step": 1207 }, { "epoch": 0.7445300462249614, "grad_norm": 1.3673828840255737, "learning_rate": 1.5814109951893193e-05, "loss": 1.3628, "step": 1208 }, { "epoch": 0.7451463790446842, "grad_norm": 1.3595763444900513, "learning_rate": 1.5805350055796736e-05, "loss": 1.3924, "step": 1209 }, { "epoch": 0.7457627118644068, "grad_norm": 1.3429622650146484, "learning_rate": 1.5796583435201392e-05, "loss": 1.3822, "step": 1210 }, { "epoch": 0.7463790446841294, "grad_norm": 1.3468122482299805, "learning_rate": 1.5787810100261788e-05, "loss": 1.3903, "step": 1211 }, { "epoch": 0.746995377503852, "grad_norm": 1.3832107782363892, "learning_rate": 1.5779030061140323e-05, "loss": 1.4212, "step": 1212 }, { "epoch": 0.7476117103235748, "grad_norm": 1.411205768585205, "learning_rate": 1.5770243328007157e-05, "loss": 1.4503, "step": 1213 }, { "epoch": 0.7482280431432974, "grad_norm": 1.355100154876709, "learning_rate": 1.576144991104021e-05, "loss": 1.4021, "step": 1214 }, { "epoch": 0.74884437596302, "grad_norm": 1.3819501399993896, "learning_rate": 1.5752649820425138e-05, "loss": 1.3887, "step": 1215 }, { "epoch": 0.7494607087827427, "grad_norm": 1.384469985961914, "learning_rate": 1.574384306635534e-05, "loss": 1.3864, "step": 1216 }, { "epoch": 0.7500770416024654, "grad_norm": 1.3583180904388428, "learning_rate": 1.5735029659031918e-05, "loss": 1.3625, "step": 1217 }, { "epoch": 0.750693374422188, "grad_norm": 1.3643970489501953, "learning_rate": 1.5726209608663686e-05, "loss": 1.403, "step": 1218 }, { "epoch": 0.7513097072419106, "grad_norm": 1.3932344913482666, "learning_rate": 1.5717382925467162e-05, "loss": 1.4323, "step": 1219 }, { "epoch": 0.7519260400616333, "grad_norm": 1.4196738004684448, "learning_rate": 1.5708549619666534e-05, "loss": 1.3805, "step": 1220 }, { "epoch": 0.752542372881356, "grad_norm": 1.3787742853164673, "learning_rate": 1.569970970149367e-05, "loss": 1.3516, "step": 1221 }, { "epoch": 0.7531587057010786, "grad_norm": 1.372910499572754, "learning_rate": 1.5690863181188087e-05, "loss": 1.3953, "step": 1222 }, { "epoch": 0.7537750385208012, "grad_norm": 1.3647782802581787, "learning_rate": 1.5682010068996963e-05, "loss": 1.3226, "step": 1223 }, { "epoch": 0.7543913713405239, "grad_norm": 1.378604769706726, "learning_rate": 1.5673150375175105e-05, "loss": 1.4305, "step": 1224 }, { "epoch": 0.7550077041602465, "grad_norm": 1.3769879341125488, "learning_rate": 1.5664284109984936e-05, "loss": 1.3983, "step": 1225 }, { "epoch": 0.7556240369799692, "grad_norm": 1.3746600151062012, "learning_rate": 1.5655411283696507e-05, "loss": 1.4041, "step": 1226 }, { "epoch": 0.7562403697996918, "grad_norm": 1.4013216495513916, "learning_rate": 1.5646531906587456e-05, "loss": 1.4147, "step": 1227 }, { "epoch": 0.7568567026194145, "grad_norm": 1.4366151094436646, "learning_rate": 1.5637645988943008e-05, "loss": 1.4732, "step": 1228 }, { "epoch": 0.7574730354391371, "grad_norm": 1.3573259115219116, "learning_rate": 1.562875354105598e-05, "loss": 1.3766, "step": 1229 }, { "epoch": 0.7580893682588598, "grad_norm": 1.348633050918579, "learning_rate": 1.5619854573226735e-05, "loss": 1.4404, "step": 1230 }, { "epoch": 0.7587057010785825, "grad_norm": 1.3839634656906128, "learning_rate": 1.5610949095763198e-05, "loss": 1.4225, "step": 1231 }, { "epoch": 0.7593220338983051, "grad_norm": 1.35868239402771, "learning_rate": 1.5602037118980833e-05, "loss": 1.402, "step": 1232 }, { "epoch": 0.7599383667180277, "grad_norm": 1.3832217454910278, "learning_rate": 1.5593118653202628e-05, "loss": 1.3846, "step": 1233 }, { "epoch": 0.7605546995377503, "grad_norm": 1.3473392724990845, "learning_rate": 1.5584193708759094e-05, "loss": 1.3497, "step": 1234 }, { "epoch": 0.7611710323574731, "grad_norm": 1.3098177909851074, "learning_rate": 1.557526229598824e-05, "loss": 1.3364, "step": 1235 }, { "epoch": 0.7617873651771957, "grad_norm": 1.34636390209198, "learning_rate": 1.5566324425235574e-05, "loss": 1.3956, "step": 1236 }, { "epoch": 0.7624036979969183, "grad_norm": 1.3529012203216553, "learning_rate": 1.5557380106854077e-05, "loss": 1.3578, "step": 1237 }, { "epoch": 0.7630200308166409, "grad_norm": 1.4177062511444092, "learning_rate": 1.5548429351204205e-05, "loss": 1.4145, "step": 1238 }, { "epoch": 0.7636363636363637, "grad_norm": 1.3614940643310547, "learning_rate": 1.553947216865387e-05, "loss": 1.3916, "step": 1239 }, { "epoch": 0.7642526964560863, "grad_norm": 1.3795031309127808, "learning_rate": 1.553050856957842e-05, "loss": 1.3719, "step": 1240 }, { "epoch": 0.7648690292758089, "grad_norm": 1.418516993522644, "learning_rate": 1.552153856436065e-05, "loss": 1.3442, "step": 1241 }, { "epoch": 0.7654853620955316, "grad_norm": 1.4852447509765625, "learning_rate": 1.551256216339076e-05, "loss": 1.4791, "step": 1242 }, { "epoch": 0.7661016949152543, "grad_norm": 1.382890224456787, "learning_rate": 1.5503579377066367e-05, "loss": 1.4076, "step": 1243 }, { "epoch": 0.7667180277349769, "grad_norm": 1.3794680833816528, "learning_rate": 1.5494590215792484e-05, "loss": 1.4013, "step": 1244 }, { "epoch": 0.7673343605546995, "grad_norm": 1.3885034322738647, "learning_rate": 1.5485594689981512e-05, "loss": 1.4125, "step": 1245 }, { "epoch": 0.7679506933744222, "grad_norm": 1.359109878540039, "learning_rate": 1.547659281005321e-05, "loss": 1.3595, "step": 1246 }, { "epoch": 0.7685670261941449, "grad_norm": 1.5537559986114502, "learning_rate": 1.5467584586434713e-05, "loss": 1.4392, "step": 1247 }, { "epoch": 0.7691833590138675, "grad_norm": 1.4052560329437256, "learning_rate": 1.54585700295605e-05, "loss": 1.3676, "step": 1248 }, { "epoch": 0.7697996918335901, "grad_norm": 1.4142606258392334, "learning_rate": 1.544954914987238e-05, "loss": 1.342, "step": 1249 }, { "epoch": 0.7704160246533128, "grad_norm": 1.3751095533370972, "learning_rate": 1.5440521957819483e-05, "loss": 1.3423, "step": 1250 }, { "epoch": 0.7710323574730354, "grad_norm": 1.3883116245269775, "learning_rate": 1.5431488463858268e-05, "loss": 1.3999, "step": 1251 }, { "epoch": 0.7716486902927581, "grad_norm": 1.3512605428695679, "learning_rate": 1.542244867845248e-05, "loss": 1.4079, "step": 1252 }, { "epoch": 0.7722650231124807, "grad_norm": 1.3959479331970215, "learning_rate": 1.541340261207315e-05, "loss": 1.466, "step": 1253 }, { "epoch": 0.7728813559322034, "grad_norm": 1.3695547580718994, "learning_rate": 1.5404350275198593e-05, "loss": 1.3939, "step": 1254 }, { "epoch": 0.773497688751926, "grad_norm": 1.3965188264846802, "learning_rate": 1.5395291678314386e-05, "loss": 1.4144, "step": 1255 }, { "epoch": 0.7741140215716487, "grad_norm": 1.3554106950759888, "learning_rate": 1.5386226831913348e-05, "loss": 1.3843, "step": 1256 }, { "epoch": 0.7747303543913714, "grad_norm": 1.3952405452728271, "learning_rate": 1.5377155746495547e-05, "loss": 1.4189, "step": 1257 }, { "epoch": 0.775346687211094, "grad_norm": 1.3519076108932495, "learning_rate": 1.5368078432568274e-05, "loss": 1.3435, "step": 1258 }, { "epoch": 0.7759630200308166, "grad_norm": 1.433582067489624, "learning_rate": 1.5358994900646033e-05, "loss": 1.4627, "step": 1259 }, { "epoch": 0.7765793528505393, "grad_norm": 1.371935248374939, "learning_rate": 1.5349905161250537e-05, "loss": 1.396, "step": 1260 }, { "epoch": 0.777195685670262, "grad_norm": 1.3637937307357788, "learning_rate": 1.534080922491068e-05, "loss": 1.4028, "step": 1261 }, { "epoch": 0.7778120184899846, "grad_norm": 1.3429019451141357, "learning_rate": 1.5331707102162538e-05, "loss": 1.3822, "step": 1262 }, { "epoch": 0.7784283513097072, "grad_norm": 1.3694963455200195, "learning_rate": 1.5322598803549355e-05, "loss": 1.4195, "step": 1263 }, { "epoch": 0.7790446841294298, "grad_norm": 1.389094352722168, "learning_rate": 1.5313484339621534e-05, "loss": 1.4141, "step": 1264 }, { "epoch": 0.7796610169491526, "grad_norm": 1.3838677406311035, "learning_rate": 1.5304363720936603e-05, "loss": 1.3751, "step": 1265 }, { "epoch": 0.7802773497688752, "grad_norm": 1.3857319355010986, "learning_rate": 1.5295236958059235e-05, "loss": 1.38, "step": 1266 }, { "epoch": 0.7808936825885978, "grad_norm": 1.3965563774108887, "learning_rate": 1.528610406156121e-05, "loss": 1.3736, "step": 1267 }, { "epoch": 0.7815100154083205, "grad_norm": 1.3244354724884033, "learning_rate": 1.527696504202142e-05, "loss": 1.3778, "step": 1268 }, { "epoch": 0.7821263482280432, "grad_norm": 1.3821669816970825, "learning_rate": 1.5267819910025847e-05, "loss": 1.3483, "step": 1269 }, { "epoch": 0.7827426810477658, "grad_norm": 1.4164375066757202, "learning_rate": 1.5258668676167548e-05, "loss": 1.3895, "step": 1270 }, { "epoch": 0.7833590138674884, "grad_norm": 1.394556999206543, "learning_rate": 1.5249511351046656e-05, "loss": 1.3528, "step": 1271 }, { "epoch": 0.7839753466872111, "grad_norm": 1.474249005317688, "learning_rate": 1.5240347945270354e-05, "loss": 1.4683, "step": 1272 }, { "epoch": 0.7845916795069338, "grad_norm": 1.3885775804519653, "learning_rate": 1.523117846945287e-05, "loss": 1.4544, "step": 1273 }, { "epoch": 0.7852080123266564, "grad_norm": 1.3791872262954712, "learning_rate": 1.5222002934215468e-05, "loss": 1.367, "step": 1274 }, { "epoch": 0.785824345146379, "grad_norm": 1.3802798986434937, "learning_rate": 1.5212821350186424e-05, "loss": 1.4159, "step": 1275 }, { "epoch": 0.7864406779661017, "grad_norm": 1.4177604913711548, "learning_rate": 1.5203633728001016e-05, "loss": 1.378, "step": 1276 }, { "epoch": 0.7870570107858244, "grad_norm": 1.3435091972351074, "learning_rate": 1.5194440078301536e-05, "loss": 1.3385, "step": 1277 }, { "epoch": 0.787673343605547, "grad_norm": 1.4210253953933716, "learning_rate": 1.5185240411737236e-05, "loss": 1.4149, "step": 1278 }, { "epoch": 0.7882896764252696, "grad_norm": 1.3709641695022583, "learning_rate": 1.517603473896435e-05, "loss": 1.3729, "step": 1279 }, { "epoch": 0.7889060092449923, "grad_norm": 1.4279632568359375, "learning_rate": 1.5166823070646061e-05, "loss": 1.4442, "step": 1280 }, { "epoch": 0.7895223420647149, "grad_norm": 1.372291088104248, "learning_rate": 1.5157605417452506e-05, "loss": 1.3856, "step": 1281 }, { "epoch": 0.7901386748844376, "grad_norm": 1.3758490085601807, "learning_rate": 1.5148381790060749e-05, "loss": 1.3974, "step": 1282 }, { "epoch": 0.7907550077041603, "grad_norm": 1.4097537994384766, "learning_rate": 1.5139152199154774e-05, "loss": 1.4064, "step": 1283 }, { "epoch": 0.7913713405238829, "grad_norm": 1.3887323141098022, "learning_rate": 1.512991665542547e-05, "loss": 1.3429, "step": 1284 }, { "epoch": 0.7919876733436055, "grad_norm": 1.3858720064163208, "learning_rate": 1.5120675169570632e-05, "loss": 1.3684, "step": 1285 }, { "epoch": 0.7926040061633282, "grad_norm": 1.3858073949813843, "learning_rate": 1.5111427752294927e-05, "loss": 1.3775, "step": 1286 }, { "epoch": 0.7932203389830509, "grad_norm": 1.3987418413162231, "learning_rate": 1.5102174414309894e-05, "loss": 1.4474, "step": 1287 }, { "epoch": 0.7938366718027735, "grad_norm": 1.41084885597229, "learning_rate": 1.5092915166333937e-05, "loss": 1.4514, "step": 1288 }, { "epoch": 0.7944530046224961, "grad_norm": 1.3799611330032349, "learning_rate": 1.5083650019092301e-05, "loss": 1.3932, "step": 1289 }, { "epoch": 0.7950693374422187, "grad_norm": 1.3947360515594482, "learning_rate": 1.507437898331706e-05, "loss": 1.4465, "step": 1290 }, { "epoch": 0.7956856702619415, "grad_norm": 1.4010576009750366, "learning_rate": 1.5065102069747117e-05, "loss": 1.3953, "step": 1291 }, { "epoch": 0.7963020030816641, "grad_norm": 1.4063360691070557, "learning_rate": 1.5055819289128179e-05, "loss": 1.4568, "step": 1292 }, { "epoch": 0.7969183359013867, "grad_norm": 1.3783200979232788, "learning_rate": 1.504653065221275e-05, "loss": 1.35, "step": 1293 }, { "epoch": 0.7975346687211095, "grad_norm": 1.3875716924667358, "learning_rate": 1.5037236169760111e-05, "loss": 1.3762, "step": 1294 }, { "epoch": 0.7981510015408321, "grad_norm": 1.4101182222366333, "learning_rate": 1.5027935852536333e-05, "loss": 1.3805, "step": 1295 }, { "epoch": 0.7987673343605547, "grad_norm": 1.3442564010620117, "learning_rate": 1.5018629711314223e-05, "loss": 1.3126, "step": 1296 }, { "epoch": 0.7993836671802773, "grad_norm": 1.4397971630096436, "learning_rate": 1.5009317756873348e-05, "loss": 1.4344, "step": 1297 }, { "epoch": 0.8, "grad_norm": 1.4211087226867676, "learning_rate": 1.5000000000000002e-05, "loss": 1.4206, "step": 1298 }, { "epoch": 0.8006163328197227, "grad_norm": 1.3679343461990356, "learning_rate": 1.4990676451487202e-05, "loss": 1.442, "step": 1299 }, { "epoch": 0.8012326656394453, "grad_norm": 1.4321902990341187, "learning_rate": 1.4981347122134679e-05, "loss": 1.4353, "step": 1300 }, { "epoch": 0.8018489984591679, "grad_norm": 1.3676947355270386, "learning_rate": 1.4972012022748851e-05, "loss": 1.4112, "step": 1301 }, { "epoch": 0.8024653312788906, "grad_norm": 1.3920872211456299, "learning_rate": 1.4962671164142827e-05, "loss": 1.4319, "step": 1302 }, { "epoch": 0.8030816640986133, "grad_norm": 1.392578125, "learning_rate": 1.4953324557136384e-05, "loss": 1.3837, "step": 1303 }, { "epoch": 0.8036979969183359, "grad_norm": 1.408814787864685, "learning_rate": 1.494397221255595e-05, "loss": 1.425, "step": 1304 }, { "epoch": 0.8043143297380585, "grad_norm": 1.389100432395935, "learning_rate": 1.4934614141234618e-05, "loss": 1.3556, "step": 1305 }, { "epoch": 0.8049306625577812, "grad_norm": 1.3787171840667725, "learning_rate": 1.4925250354012097e-05, "loss": 1.3714, "step": 1306 }, { "epoch": 0.8055469953775038, "grad_norm": 1.3640087842941284, "learning_rate": 1.4915880861734721e-05, "loss": 1.3487, "step": 1307 }, { "epoch": 0.8061633281972265, "grad_norm": 1.3745168447494507, "learning_rate": 1.4906505675255439e-05, "loss": 1.3192, "step": 1308 }, { "epoch": 0.8067796610169492, "grad_norm": 1.4770541191101074, "learning_rate": 1.489712480543379e-05, "loss": 1.4225, "step": 1309 }, { "epoch": 0.8073959938366718, "grad_norm": 1.416696548461914, "learning_rate": 1.4887738263135893e-05, "loss": 1.3925, "step": 1310 }, { "epoch": 0.8080123266563944, "grad_norm": 1.405380368232727, "learning_rate": 1.4878346059234446e-05, "loss": 1.386, "step": 1311 }, { "epoch": 0.8086286594761171, "grad_norm": 1.4075982570648193, "learning_rate": 1.48689482046087e-05, "loss": 1.3545, "step": 1312 }, { "epoch": 0.8092449922958398, "grad_norm": 1.4123165607452393, "learning_rate": 1.4859544710144453e-05, "loss": 1.3995, "step": 1313 }, { "epoch": 0.8098613251155624, "grad_norm": 1.406624674797058, "learning_rate": 1.4850135586734027e-05, "loss": 1.3788, "step": 1314 }, { "epoch": 0.810477657935285, "grad_norm": 1.404159426689148, "learning_rate": 1.4840720845276284e-05, "loss": 1.39, "step": 1315 }, { "epoch": 0.8110939907550077, "grad_norm": 1.3947949409484863, "learning_rate": 1.4831300496676578e-05, "loss": 1.3792, "step": 1316 }, { "epoch": 0.8117103235747304, "grad_norm": 1.4087709188461304, "learning_rate": 1.4821874551846756e-05, "loss": 1.3719, "step": 1317 }, { "epoch": 0.812326656394453, "grad_norm": 1.4019376039505005, "learning_rate": 1.4812443021705157e-05, "loss": 1.3957, "step": 1318 }, { "epoch": 0.8129429892141756, "grad_norm": 1.4130116701126099, "learning_rate": 1.4803005917176585e-05, "loss": 1.3232, "step": 1319 }, { "epoch": 0.8135593220338984, "grad_norm": 1.3892890214920044, "learning_rate": 1.47935632491923e-05, "loss": 1.3701, "step": 1320 }, { "epoch": 0.814175654853621, "grad_norm": 1.407993197441101, "learning_rate": 1.4784115028690011e-05, "loss": 1.4015, "step": 1321 }, { "epoch": 0.8147919876733436, "grad_norm": 1.4025367498397827, "learning_rate": 1.4774661266613852e-05, "loss": 1.4229, "step": 1322 }, { "epoch": 0.8154083204930662, "grad_norm": 1.413794994354248, "learning_rate": 1.4765201973914383e-05, "loss": 1.3768, "step": 1323 }, { "epoch": 0.816024653312789, "grad_norm": 1.424534797668457, "learning_rate": 1.475573716154856e-05, "loss": 1.3565, "step": 1324 }, { "epoch": 0.8166409861325116, "grad_norm": 1.4204684495925903, "learning_rate": 1.4746266840479746e-05, "loss": 1.3481, "step": 1325 }, { "epoch": 0.8172573189522342, "grad_norm": 1.389485478401184, "learning_rate": 1.4736791021677677e-05, "loss": 1.3456, "step": 1326 }, { "epoch": 0.8178736517719568, "grad_norm": 1.4082889556884766, "learning_rate": 1.4727309716118452e-05, "loss": 1.424, "step": 1327 }, { "epoch": 0.8184899845916795, "grad_norm": 1.4207428693771362, "learning_rate": 1.4717822934784537e-05, "loss": 1.4518, "step": 1328 }, { "epoch": 0.8191063174114022, "grad_norm": 1.4084069728851318, "learning_rate": 1.4708330688664739e-05, "loss": 1.3446, "step": 1329 }, { "epoch": 0.8197226502311248, "grad_norm": 1.4078699350357056, "learning_rate": 1.4698832988754182e-05, "loss": 1.3813, "step": 1330 }, { "epoch": 0.8203389830508474, "grad_norm": 1.4303404092788696, "learning_rate": 1.4689329846054324e-05, "loss": 1.3991, "step": 1331 }, { "epoch": 0.8209553158705701, "grad_norm": 1.427810549736023, "learning_rate": 1.4679821271572916e-05, "loss": 1.3781, "step": 1332 }, { "epoch": 0.8215716486902928, "grad_norm": 1.4285391569137573, "learning_rate": 1.467030727632401e-05, "loss": 1.3351, "step": 1333 }, { "epoch": 0.8221879815100154, "grad_norm": 1.418769121170044, "learning_rate": 1.4660787871327924e-05, "loss": 1.3041, "step": 1334 }, { "epoch": 0.8228043143297381, "grad_norm": 1.434212327003479, "learning_rate": 1.4651263067611261e-05, "loss": 1.4122, "step": 1335 }, { "epoch": 0.8234206471494607, "grad_norm": 1.4391118288040161, "learning_rate": 1.4641732876206857e-05, "loss": 1.4078, "step": 1336 }, { "epoch": 0.8240369799691833, "grad_norm": 1.423215389251709, "learning_rate": 1.4632197308153804e-05, "loss": 1.3223, "step": 1337 }, { "epoch": 0.824653312788906, "grad_norm": 1.4083201885223389, "learning_rate": 1.4622656374497414e-05, "loss": 1.3529, "step": 1338 }, { "epoch": 0.8252696456086287, "grad_norm": 1.42240571975708, "learning_rate": 1.461311008628922e-05, "loss": 1.3894, "step": 1339 }, { "epoch": 0.8258859784283513, "grad_norm": 1.428430438041687, "learning_rate": 1.460355845458695e-05, "loss": 1.4154, "step": 1340 }, { "epoch": 0.8265023112480739, "grad_norm": 1.4398504495620728, "learning_rate": 1.4594001490454525e-05, "loss": 1.3765, "step": 1341 }, { "epoch": 0.8271186440677966, "grad_norm": 1.4611661434173584, "learning_rate": 1.4584439204962046e-05, "loss": 1.3782, "step": 1342 }, { "epoch": 0.8277349768875193, "grad_norm": 1.4274184703826904, "learning_rate": 1.4574871609185776e-05, "loss": 1.3609, "step": 1343 }, { "epoch": 0.8283513097072419, "grad_norm": 1.4081215858459473, "learning_rate": 1.4565298714208126e-05, "loss": 1.4647, "step": 1344 }, { "epoch": 0.8289676425269645, "grad_norm": 1.4000768661499023, "learning_rate": 1.4555720531117646e-05, "loss": 1.3294, "step": 1345 }, { "epoch": 0.8295839753466873, "grad_norm": 1.4113953113555908, "learning_rate": 1.4546137071009012e-05, "loss": 1.3598, "step": 1346 }, { "epoch": 0.8302003081664099, "grad_norm": 1.4383559226989746, "learning_rate": 1.4536548344983016e-05, "loss": 1.3987, "step": 1347 }, { "epoch": 0.8308166409861325, "grad_norm": 1.4376552104949951, "learning_rate": 1.4526954364146546e-05, "loss": 1.3226, "step": 1348 }, { "epoch": 0.8314329738058551, "grad_norm": 1.4254200458526611, "learning_rate": 1.451735513961258e-05, "loss": 1.3886, "step": 1349 }, { "epoch": 0.8320493066255779, "grad_norm": 1.4671714305877686, "learning_rate": 1.4507750682500162e-05, "loss": 1.41, "step": 1350 }, { "epoch": 0.8326656394453005, "grad_norm": 1.4334582090377808, "learning_rate": 1.4498141003934403e-05, "loss": 1.3882, "step": 1351 }, { "epoch": 0.8332819722650231, "grad_norm": 1.4583344459533691, "learning_rate": 1.4488526115046468e-05, "loss": 1.4052, "step": 1352 }, { "epoch": 0.8338983050847457, "grad_norm": 1.4547970294952393, "learning_rate": 1.4478906026973547e-05, "loss": 1.3979, "step": 1353 }, { "epoch": 0.8345146379044684, "grad_norm": 1.4366494417190552, "learning_rate": 1.4469280750858854e-05, "loss": 1.3444, "step": 1354 }, { "epoch": 0.8351309707241911, "grad_norm": 1.3889515399932861, "learning_rate": 1.4459650297851617e-05, "loss": 1.2855, "step": 1355 }, { "epoch": 0.8357473035439137, "grad_norm": 1.4684112071990967, "learning_rate": 1.4450014679107063e-05, "loss": 1.3969, "step": 1356 }, { "epoch": 0.8363636363636363, "grad_norm": 1.437579870223999, "learning_rate": 1.4440373905786389e-05, "loss": 1.4025, "step": 1357 }, { "epoch": 0.836979969183359, "grad_norm": 1.4207462072372437, "learning_rate": 1.4430727989056777e-05, "loss": 1.3565, "step": 1358 }, { "epoch": 0.8375963020030817, "grad_norm": 1.4660311937332153, "learning_rate": 1.442107694009136e-05, "loss": 1.431, "step": 1359 }, { "epoch": 0.8382126348228043, "grad_norm": 1.435903549194336, "learning_rate": 1.4411420770069218e-05, "loss": 1.4226, "step": 1360 }, { "epoch": 0.838828967642527, "grad_norm": 1.3640625476837158, "learning_rate": 1.4401759490175362e-05, "loss": 1.3524, "step": 1361 }, { "epoch": 0.8394453004622496, "grad_norm": 1.3695400953292847, "learning_rate": 1.439209311160072e-05, "loss": 1.3214, "step": 1362 }, { "epoch": 0.8400616332819723, "grad_norm": 1.4442394971847534, "learning_rate": 1.4382421645542133e-05, "loss": 1.3884, "step": 1363 }, { "epoch": 0.8406779661016949, "grad_norm": 1.4502166509628296, "learning_rate": 1.4372745103202322e-05, "loss": 1.3946, "step": 1364 }, { "epoch": 0.8412942989214176, "grad_norm": 1.39769446849823, "learning_rate": 1.4363063495789904e-05, "loss": 1.3027, "step": 1365 }, { "epoch": 0.8419106317411402, "grad_norm": 1.3702818155288696, "learning_rate": 1.435337683451935e-05, "loss": 1.3009, "step": 1366 }, { "epoch": 0.8425269645608628, "grad_norm": 1.4326624870300293, "learning_rate": 1.434368513061099e-05, "loss": 1.3968, "step": 1367 }, { "epoch": 0.8431432973805855, "grad_norm": 1.4271602630615234, "learning_rate": 1.4333988395290994e-05, "loss": 1.4192, "step": 1368 }, { "epoch": 0.8437596302003082, "grad_norm": 1.431688904762268, "learning_rate": 1.4324286639791367e-05, "loss": 1.3155, "step": 1369 }, { "epoch": 0.8443759630200308, "grad_norm": 1.4562785625457764, "learning_rate": 1.4314579875349917e-05, "loss": 1.359, "step": 1370 }, { "epoch": 0.8449922958397534, "grad_norm": 1.3993419408798218, "learning_rate": 1.4304868113210261e-05, "loss": 1.4371, "step": 1371 }, { "epoch": 0.8456086286594762, "grad_norm": 1.4102058410644531, "learning_rate": 1.4295151364621806e-05, "loss": 1.3713, "step": 1372 }, { "epoch": 0.8462249614791988, "grad_norm": 1.4262763261795044, "learning_rate": 1.4285429640839732e-05, "loss": 1.3949, "step": 1373 }, { "epoch": 0.8468412942989214, "grad_norm": 1.431217074394226, "learning_rate": 1.4275702953124981e-05, "loss": 1.3838, "step": 1374 }, { "epoch": 0.847457627118644, "grad_norm": 1.3891165256500244, "learning_rate": 1.4265971312744252e-05, "loss": 1.3519, "step": 1375 }, { "epoch": 0.8480739599383668, "grad_norm": 1.4371888637542725, "learning_rate": 1.4256234730969967e-05, "loss": 1.3758, "step": 1376 }, { "epoch": 0.8486902927580894, "grad_norm": 1.4106435775756836, "learning_rate": 1.4246493219080289e-05, "loss": 1.3495, "step": 1377 }, { "epoch": 0.849306625577812, "grad_norm": 1.4571994543075562, "learning_rate": 1.4236746788359078e-05, "loss": 1.4014, "step": 1378 }, { "epoch": 0.8499229583975346, "grad_norm": 1.4796806573867798, "learning_rate": 1.4226995450095892e-05, "loss": 1.3535, "step": 1379 }, { "epoch": 0.8505392912172574, "grad_norm": 1.400122046470642, "learning_rate": 1.4217239215585989e-05, "loss": 1.3946, "step": 1380 }, { "epoch": 0.85115562403698, "grad_norm": 1.3967502117156982, "learning_rate": 1.420747809613028e-05, "loss": 1.3748, "step": 1381 }, { "epoch": 0.8517719568567026, "grad_norm": 1.4143245220184326, "learning_rate": 1.4197712103035347e-05, "loss": 1.3765, "step": 1382 }, { "epoch": 0.8523882896764252, "grad_norm": 1.4179598093032837, "learning_rate": 1.4187941247613406e-05, "loss": 1.343, "step": 1383 }, { "epoch": 0.853004622496148, "grad_norm": 1.458194375038147, "learning_rate": 1.4178165541182313e-05, "loss": 1.3973, "step": 1384 }, { "epoch": 0.8536209553158706, "grad_norm": 1.4350707530975342, "learning_rate": 1.4168384995065546e-05, "loss": 1.4292, "step": 1385 }, { "epoch": 0.8542372881355932, "grad_norm": 1.4513510465621948, "learning_rate": 1.415859962059218e-05, "loss": 1.4152, "step": 1386 }, { "epoch": 0.8548536209553159, "grad_norm": 1.520041823387146, "learning_rate": 1.414880942909689e-05, "loss": 1.4375, "step": 1387 }, { "epoch": 0.8554699537750385, "grad_norm": 1.427418828010559, "learning_rate": 1.4139014431919923e-05, "loss": 1.3919, "step": 1388 }, { "epoch": 0.8560862865947612, "grad_norm": 1.438768744468689, "learning_rate": 1.4129214640407103e-05, "loss": 1.3435, "step": 1389 }, { "epoch": 0.8567026194144838, "grad_norm": 1.4289673566818237, "learning_rate": 1.41194100659098e-05, "loss": 1.3816, "step": 1390 }, { "epoch": 0.8573189522342065, "grad_norm": 1.4598023891448975, "learning_rate": 1.4109600719784922e-05, "loss": 1.336, "step": 1391 }, { "epoch": 0.8579352850539291, "grad_norm": 1.3857595920562744, "learning_rate": 1.4099786613394913e-05, "loss": 1.3819, "step": 1392 }, { "epoch": 0.8585516178736518, "grad_norm": 1.4400789737701416, "learning_rate": 1.4089967758107727e-05, "loss": 1.4063, "step": 1393 }, { "epoch": 0.8591679506933744, "grad_norm": 1.450950264930725, "learning_rate": 1.4080144165296814e-05, "loss": 1.3622, "step": 1394 }, { "epoch": 0.8597842835130971, "grad_norm": 1.399819016456604, "learning_rate": 1.4070315846341119e-05, "loss": 1.3326, "step": 1395 }, { "epoch": 0.8604006163328197, "grad_norm": 1.4255651235580444, "learning_rate": 1.4060482812625055e-05, "loss": 1.3157, "step": 1396 }, { "epoch": 0.8610169491525423, "grad_norm": 1.4652704000473022, "learning_rate": 1.40506450755385e-05, "loss": 1.4, "step": 1397 }, { "epoch": 0.8616332819722651, "grad_norm": 1.4135456085205078, "learning_rate": 1.4040802646476779e-05, "loss": 1.3485, "step": 1398 }, { "epoch": 0.8622496147919877, "grad_norm": 1.4687283039093018, "learning_rate": 1.4030955536840656e-05, "loss": 1.3886, "step": 1399 }, { "epoch": 0.8628659476117103, "grad_norm": 1.4153722524642944, "learning_rate": 1.402110375803631e-05, "loss": 1.3371, "step": 1400 }, { "epoch": 0.8634822804314329, "grad_norm": 1.4193997383117676, "learning_rate": 1.4011247321475325e-05, "loss": 1.3274, "step": 1401 }, { "epoch": 0.8640986132511557, "grad_norm": 1.42475426197052, "learning_rate": 1.40013862385747e-05, "loss": 1.3702, "step": 1402 }, { "epoch": 0.8647149460708783, "grad_norm": 1.4450716972351074, "learning_rate": 1.3991520520756789e-05, "loss": 1.4136, "step": 1403 }, { "epoch": 0.8653312788906009, "grad_norm": 1.4527150392532349, "learning_rate": 1.3981650179449336e-05, "loss": 1.394, "step": 1404 }, { "epoch": 0.8659476117103235, "grad_norm": 1.4465434551239014, "learning_rate": 1.3971775226085427e-05, "loss": 1.3916, "step": 1405 }, { "epoch": 0.8665639445300463, "grad_norm": 1.3918025493621826, "learning_rate": 1.3961895672103502e-05, "loss": 1.3865, "step": 1406 }, { "epoch": 0.8671802773497689, "grad_norm": 1.4197882413864136, "learning_rate": 1.395201152894732e-05, "loss": 1.4556, "step": 1407 }, { "epoch": 0.8677966101694915, "grad_norm": 1.411867618560791, "learning_rate": 1.394212280806596e-05, "loss": 1.3607, "step": 1408 }, { "epoch": 0.8684129429892141, "grad_norm": 1.402275562286377, "learning_rate": 1.3932229520913807e-05, "loss": 1.4059, "step": 1409 }, { "epoch": 0.8690292758089369, "grad_norm": 1.4054244756698608, "learning_rate": 1.3922331678950525e-05, "loss": 1.3475, "step": 1410 }, { "epoch": 0.8696456086286595, "grad_norm": 1.3978021144866943, "learning_rate": 1.3912429293641066e-05, "loss": 1.2802, "step": 1411 }, { "epoch": 0.8702619414483821, "grad_norm": 1.4561734199523926, "learning_rate": 1.3902522376455636e-05, "loss": 1.429, "step": 1412 }, { "epoch": 0.8708782742681048, "grad_norm": 1.4732402563095093, "learning_rate": 1.3892610938869693e-05, "loss": 1.3833, "step": 1413 }, { "epoch": 0.8714946070878274, "grad_norm": 1.440327763557434, "learning_rate": 1.3882694992363937e-05, "loss": 1.3241, "step": 1414 }, { "epoch": 0.8721109399075501, "grad_norm": 1.4418187141418457, "learning_rate": 1.3872774548424276e-05, "loss": 1.3799, "step": 1415 }, { "epoch": 0.8727272727272727, "grad_norm": 1.4671108722686768, "learning_rate": 1.3862849618541845e-05, "loss": 1.3512, "step": 1416 }, { "epoch": 0.8733436055469954, "grad_norm": 1.4645318984985352, "learning_rate": 1.3852920214212966e-05, "loss": 1.3858, "step": 1417 }, { "epoch": 0.873959938366718, "grad_norm": 1.4563814401626587, "learning_rate": 1.384298634693914e-05, "loss": 1.3819, "step": 1418 }, { "epoch": 0.8745762711864407, "grad_norm": 1.4273557662963867, "learning_rate": 1.3833048028227045e-05, "loss": 1.371, "step": 1419 }, { "epoch": 0.8751926040061633, "grad_norm": 1.4330974817276, "learning_rate": 1.3823105269588517e-05, "loss": 1.3608, "step": 1420 }, { "epoch": 0.875808936825886, "grad_norm": 1.4288973808288574, "learning_rate": 1.3813158082540525e-05, "loss": 1.355, "step": 1421 }, { "epoch": 0.8764252696456086, "grad_norm": 1.501861333847046, "learning_rate": 1.3803206478605176e-05, "loss": 1.4014, "step": 1422 }, { "epoch": 0.8770416024653312, "grad_norm": 1.445556402206421, "learning_rate": 1.3793250469309691e-05, "loss": 1.3343, "step": 1423 }, { "epoch": 0.877657935285054, "grad_norm": 1.4241622686386108, "learning_rate": 1.3783290066186392e-05, "loss": 1.3245, "step": 1424 }, { "epoch": 0.8782742681047766, "grad_norm": 1.4375249147415161, "learning_rate": 1.3773325280772692e-05, "loss": 1.3585, "step": 1425 }, { "epoch": 0.8788906009244992, "grad_norm": 1.3963353633880615, "learning_rate": 1.3763356124611077e-05, "loss": 1.3901, "step": 1426 }, { "epoch": 0.8795069337442218, "grad_norm": 1.3867874145507812, "learning_rate": 1.3753382609249103e-05, "loss": 1.3241, "step": 1427 }, { "epoch": 0.8801232665639446, "grad_norm": 1.3975516557693481, "learning_rate": 1.3743404746239368e-05, "loss": 1.3404, "step": 1428 }, { "epoch": 0.8807395993836672, "grad_norm": 1.4413161277770996, "learning_rate": 1.3733422547139509e-05, "loss": 1.3457, "step": 1429 }, { "epoch": 0.8813559322033898, "grad_norm": 1.506523847579956, "learning_rate": 1.3723436023512191e-05, "loss": 1.2928, "step": 1430 }, { "epoch": 0.8819722650231124, "grad_norm": 1.4723223447799683, "learning_rate": 1.3713445186925077e-05, "loss": 1.3487, "step": 1431 }, { "epoch": 0.8825885978428352, "grad_norm": 1.4872512817382812, "learning_rate": 1.3703450048950832e-05, "loss": 1.3143, "step": 1432 }, { "epoch": 0.8832049306625578, "grad_norm": 1.5259560346603394, "learning_rate": 1.3693450621167108e-05, "loss": 1.3869, "step": 1433 }, { "epoch": 0.8838212634822804, "grad_norm": 1.482622742652893, "learning_rate": 1.3683446915156515e-05, "loss": 1.3968, "step": 1434 }, { "epoch": 0.884437596302003, "grad_norm": 1.4638423919677734, "learning_rate": 1.3673438942506624e-05, "loss": 1.3213, "step": 1435 }, { "epoch": 0.8850539291217258, "grad_norm": 1.4277124404907227, "learning_rate": 1.3663426714809957e-05, "loss": 1.3101, "step": 1436 }, { "epoch": 0.8856702619414484, "grad_norm": 1.455073595046997, "learning_rate": 1.3653410243663953e-05, "loss": 1.3332, "step": 1437 }, { "epoch": 0.886286594761171, "grad_norm": 1.4494318962097168, "learning_rate": 1.3643389540670963e-05, "loss": 1.386, "step": 1438 }, { "epoch": 0.8869029275808937, "grad_norm": 1.4430327415466309, "learning_rate": 1.3633364617438255e-05, "loss": 1.3601, "step": 1439 }, { "epoch": 0.8875192604006163, "grad_norm": 1.475830316543579, "learning_rate": 1.3623335485577978e-05, "loss": 1.3969, "step": 1440 }, { "epoch": 0.888135593220339, "grad_norm": 1.4503047466278076, "learning_rate": 1.3613302156707146e-05, "loss": 1.3837, "step": 1441 }, { "epoch": 0.8887519260400616, "grad_norm": 1.488672137260437, "learning_rate": 1.360326464244765e-05, "loss": 1.3871, "step": 1442 }, { "epoch": 0.8893682588597843, "grad_norm": 1.4458681344985962, "learning_rate": 1.3593222954426226e-05, "loss": 1.3565, "step": 1443 }, { "epoch": 0.8899845916795069, "grad_norm": 1.4315749406814575, "learning_rate": 1.3583177104274436e-05, "loss": 1.3714, "step": 1444 }, { "epoch": 0.8906009244992296, "grad_norm": 1.4398359060287476, "learning_rate": 1.3573127103628666e-05, "loss": 1.3438, "step": 1445 }, { "epoch": 0.8912172573189522, "grad_norm": 1.4264541864395142, "learning_rate": 1.3563072964130122e-05, "loss": 1.3325, "step": 1446 }, { "epoch": 0.8918335901386749, "grad_norm": 1.4562867879867554, "learning_rate": 1.3553014697424784e-05, "loss": 1.3448, "step": 1447 }, { "epoch": 0.8924499229583975, "grad_norm": 1.4449052810668945, "learning_rate": 1.3542952315163427e-05, "loss": 1.3727, "step": 1448 }, { "epoch": 0.8930662557781202, "grad_norm": 1.439606785774231, "learning_rate": 1.3532885829001589e-05, "loss": 1.3571, "step": 1449 }, { "epoch": 0.8936825885978429, "grad_norm": 1.5166465044021606, "learning_rate": 1.3522815250599564e-05, "loss": 1.4099, "step": 1450 }, { "epoch": 0.8942989214175655, "grad_norm": 1.4896795749664307, "learning_rate": 1.351274059162238e-05, "loss": 1.3928, "step": 1451 }, { "epoch": 0.8949152542372881, "grad_norm": 1.4396259784698486, "learning_rate": 1.3502661863739795e-05, "loss": 1.4177, "step": 1452 }, { "epoch": 0.8955315870570107, "grad_norm": 1.4605425596237183, "learning_rate": 1.3492579078626285e-05, "loss": 1.3565, "step": 1453 }, { "epoch": 0.8961479198767335, "grad_norm": 1.4905821084976196, "learning_rate": 1.348249224796102e-05, "loss": 1.3721, "step": 1454 }, { "epoch": 0.8967642526964561, "grad_norm": 1.4254074096679688, "learning_rate": 1.347240138342785e-05, "loss": 1.3191, "step": 1455 }, { "epoch": 0.8973805855161787, "grad_norm": 1.4337348937988281, "learning_rate": 1.3462306496715314e-05, "loss": 1.3577, "step": 1456 }, { "epoch": 0.8979969183359013, "grad_norm": 1.5039030313491821, "learning_rate": 1.3452207599516598e-05, "loss": 1.4088, "step": 1457 }, { "epoch": 0.8986132511556241, "grad_norm": 1.4888246059417725, "learning_rate": 1.3442104703529537e-05, "loss": 1.371, "step": 1458 }, { "epoch": 0.8992295839753467, "grad_norm": 1.4920920133590698, "learning_rate": 1.3431997820456592e-05, "loss": 1.3962, "step": 1459 }, { "epoch": 0.8998459167950693, "grad_norm": 1.450639009475708, "learning_rate": 1.3421886962004857e-05, "loss": 1.3102, "step": 1460 }, { "epoch": 0.900462249614792, "grad_norm": 1.441603422164917, "learning_rate": 1.3411772139886017e-05, "loss": 1.3021, "step": 1461 }, { "epoch": 0.9010785824345147, "grad_norm": 1.4647035598754883, "learning_rate": 1.3401653365816344e-05, "loss": 1.4028, "step": 1462 }, { "epoch": 0.9016949152542373, "grad_norm": 1.5427651405334473, "learning_rate": 1.3391530651516709e-05, "loss": 1.4406, "step": 1463 }, { "epoch": 0.9023112480739599, "grad_norm": 1.4374496936798096, "learning_rate": 1.3381404008712531e-05, "loss": 1.3592, "step": 1464 }, { "epoch": 0.9029275808936826, "grad_norm": 1.458971381187439, "learning_rate": 1.3371273449133777e-05, "loss": 1.3699, "step": 1465 }, { "epoch": 0.9035439137134053, "grad_norm": 1.4602665901184082, "learning_rate": 1.336113898451496e-05, "loss": 1.3762, "step": 1466 }, { "epoch": 0.9041602465331279, "grad_norm": 1.4302271604537964, "learning_rate": 1.3351000626595116e-05, "loss": 1.3339, "step": 1467 }, { "epoch": 0.9047765793528505, "grad_norm": 1.4215492010116577, "learning_rate": 1.3340858387117785e-05, "loss": 1.3349, "step": 1468 }, { "epoch": 0.9053929121725732, "grad_norm": 1.5016993284225464, "learning_rate": 1.3330712277831003e-05, "loss": 1.3772, "step": 1469 }, { "epoch": 0.9060092449922958, "grad_norm": 1.4864174127578735, "learning_rate": 1.3320562310487301e-05, "loss": 1.3113, "step": 1470 }, { "epoch": 0.9066255778120185, "grad_norm": 1.4842710494995117, "learning_rate": 1.3310408496843661e-05, "loss": 1.3618, "step": 1471 }, { "epoch": 0.9072419106317411, "grad_norm": 1.4856656789779663, "learning_rate": 1.3300250848661524e-05, "loss": 1.4242, "step": 1472 }, { "epoch": 0.9078582434514638, "grad_norm": 1.4431753158569336, "learning_rate": 1.3290089377706792e-05, "loss": 1.3784, "step": 1473 }, { "epoch": 0.9084745762711864, "grad_norm": 1.5202155113220215, "learning_rate": 1.3279924095749769e-05, "loss": 1.4538, "step": 1474 }, { "epoch": 0.9090909090909091, "grad_norm": 1.4641852378845215, "learning_rate": 1.3269755014565184e-05, "loss": 1.3551, "step": 1475 }, { "epoch": 0.9097072419106318, "grad_norm": 1.4415433406829834, "learning_rate": 1.3259582145932174e-05, "loss": 1.3547, "step": 1476 }, { "epoch": 0.9103235747303544, "grad_norm": 1.4508389234542847, "learning_rate": 1.3249405501634254e-05, "loss": 1.334, "step": 1477 }, { "epoch": 0.910939907550077, "grad_norm": 1.4589800834655762, "learning_rate": 1.3239225093459312e-05, "loss": 1.3504, "step": 1478 }, { "epoch": 0.9115562403697997, "grad_norm": 1.473504662513733, "learning_rate": 1.3229040933199603e-05, "loss": 1.3707, "step": 1479 }, { "epoch": 0.9121725731895224, "grad_norm": 1.4704688787460327, "learning_rate": 1.3218853032651719e-05, "loss": 1.3387, "step": 1480 }, { "epoch": 0.912788906009245, "grad_norm": 1.4657350778579712, "learning_rate": 1.3208661403616594e-05, "loss": 1.2995, "step": 1481 }, { "epoch": 0.9134052388289676, "grad_norm": 1.4823287725448608, "learning_rate": 1.3198466057899474e-05, "loss": 1.3394, "step": 1482 }, { "epoch": 0.9140215716486902, "grad_norm": 1.4987499713897705, "learning_rate": 1.3188267007309911e-05, "loss": 1.3052, "step": 1483 }, { "epoch": 0.914637904468413, "grad_norm": 1.5599194765090942, "learning_rate": 1.3178064263661751e-05, "loss": 1.4072, "step": 1484 }, { "epoch": 0.9152542372881356, "grad_norm": 1.4872277975082397, "learning_rate": 1.316785783877312e-05, "loss": 1.3185, "step": 1485 }, { "epoch": 0.9158705701078582, "grad_norm": 1.4908335208892822, "learning_rate": 1.31576477444664e-05, "loss": 1.3656, "step": 1486 }, { "epoch": 0.916486902927581, "grad_norm": 1.4821746349334717, "learning_rate": 1.3147433992568228e-05, "loss": 1.3511, "step": 1487 }, { "epoch": 0.9171032357473036, "grad_norm": 1.4660634994506836, "learning_rate": 1.3137216594909483e-05, "loss": 1.3726, "step": 1488 }, { "epoch": 0.9177195685670262, "grad_norm": 1.521837830543518, "learning_rate": 1.3126995563325253e-05, "loss": 1.3389, "step": 1489 }, { "epoch": 0.9183359013867488, "grad_norm": 1.4630707502365112, "learning_rate": 1.311677090965485e-05, "loss": 1.3234, "step": 1490 }, { "epoch": 0.9189522342064715, "grad_norm": 1.441649317741394, "learning_rate": 1.3106542645741778e-05, "loss": 1.2926, "step": 1491 }, { "epoch": 0.9195685670261942, "grad_norm": 1.46121346950531, "learning_rate": 1.3096310783433711e-05, "loss": 1.39, "step": 1492 }, { "epoch": 0.9201848998459168, "grad_norm": 1.4443892240524292, "learning_rate": 1.3086075334582508e-05, "loss": 1.2713, "step": 1493 }, { "epoch": 0.9208012326656394, "grad_norm": 1.479066252708435, "learning_rate": 1.3075836311044177e-05, "loss": 1.3468, "step": 1494 }, { "epoch": 0.9214175654853621, "grad_norm": 1.4777826070785522, "learning_rate": 1.3065593724678855e-05, "loss": 1.3782, "step": 1495 }, { "epoch": 0.9220338983050848, "grad_norm": 1.5030559301376343, "learning_rate": 1.3055347587350822e-05, "loss": 1.3795, "step": 1496 }, { "epoch": 0.9226502311248074, "grad_norm": 1.466654896736145, "learning_rate": 1.3045097910928468e-05, "loss": 1.3128, "step": 1497 }, { "epoch": 0.92326656394453, "grad_norm": 1.4720027446746826, "learning_rate": 1.303484470728427e-05, "loss": 1.3744, "step": 1498 }, { "epoch": 0.9238828967642527, "grad_norm": 1.4461647272109985, "learning_rate": 1.3024587988294807e-05, "loss": 1.3535, "step": 1499 }, { "epoch": 0.9244992295839753, "grad_norm": 1.5089902877807617, "learning_rate": 1.301432776584072e-05, "loss": 1.3812, "step": 1500 }, { "epoch": 0.925115562403698, "grad_norm": 1.4923036098480225, "learning_rate": 1.3004064051806712e-05, "loss": 1.3286, "step": 1501 }, { "epoch": 0.9257318952234207, "grad_norm": 1.5019680261611938, "learning_rate": 1.2993796858081525e-05, "loss": 1.3419, "step": 1502 }, { "epoch": 0.9263482280431433, "grad_norm": 1.4703623056411743, "learning_rate": 1.2983526196557939e-05, "loss": 1.3388, "step": 1503 }, { "epoch": 0.9269645608628659, "grad_norm": 1.5223686695098877, "learning_rate": 1.297325207913275e-05, "loss": 1.3665, "step": 1504 }, { "epoch": 0.9275808936825886, "grad_norm": 1.4941604137420654, "learning_rate": 1.296297451770675e-05, "loss": 1.3809, "step": 1505 }, { "epoch": 0.9281972265023113, "grad_norm": 1.5102417469024658, "learning_rate": 1.295269352418473e-05, "loss": 1.345, "step": 1506 }, { "epoch": 0.9288135593220339, "grad_norm": 1.4968394041061401, "learning_rate": 1.2942409110475449e-05, "loss": 1.3434, "step": 1507 }, { "epoch": 0.9294298921417565, "grad_norm": 1.4783694744110107, "learning_rate": 1.2932121288491631e-05, "loss": 1.3509, "step": 1508 }, { "epoch": 0.9300462249614792, "grad_norm": 1.530909776687622, "learning_rate": 1.2921830070149941e-05, "loss": 1.3662, "step": 1509 }, { "epoch": 0.9306625577812019, "grad_norm": 1.5014904737472534, "learning_rate": 1.2911535467370995e-05, "loss": 1.4098, "step": 1510 }, { "epoch": 0.9312788906009245, "grad_norm": 1.489490270614624, "learning_rate": 1.2901237492079312e-05, "loss": 1.3388, "step": 1511 }, { "epoch": 0.9318952234206471, "grad_norm": 1.463609218597412, "learning_rate": 1.2890936156203323e-05, "loss": 1.4, "step": 1512 }, { "epoch": 0.9325115562403699, "grad_norm": 1.4386941194534302, "learning_rate": 1.2880631471675358e-05, "loss": 1.2751, "step": 1513 }, { "epoch": 0.9331278890600925, "grad_norm": 1.48918879032135, "learning_rate": 1.2870323450431617e-05, "loss": 1.3363, "step": 1514 }, { "epoch": 0.9337442218798151, "grad_norm": 1.4786691665649414, "learning_rate": 1.2860012104412166e-05, "loss": 1.2788, "step": 1515 }, { "epoch": 0.9343605546995377, "grad_norm": 1.5887880325317383, "learning_rate": 1.2849697445560926e-05, "loss": 1.3919, "step": 1516 }, { "epoch": 0.9349768875192604, "grad_norm": 1.4889891147613525, "learning_rate": 1.2839379485825663e-05, "loss": 1.3539, "step": 1517 }, { "epoch": 0.9355932203389831, "grad_norm": 1.492180347442627, "learning_rate": 1.2829058237157943e-05, "loss": 1.3421, "step": 1518 }, { "epoch": 0.9362095531587057, "grad_norm": 1.4641648530960083, "learning_rate": 1.2818733711513165e-05, "loss": 1.3389, "step": 1519 }, { "epoch": 0.9368258859784283, "grad_norm": 1.4852454662322998, "learning_rate": 1.2808405920850518e-05, "loss": 1.3898, "step": 1520 }, { "epoch": 0.937442218798151, "grad_norm": 1.4880000352859497, "learning_rate": 1.279807487713296e-05, "loss": 1.3601, "step": 1521 }, { "epoch": 0.9380585516178737, "grad_norm": 1.476817011833191, "learning_rate": 1.2787740592327232e-05, "loss": 1.3452, "step": 1522 }, { "epoch": 0.9386748844375963, "grad_norm": 1.493040919303894, "learning_rate": 1.2777403078403827e-05, "loss": 1.3065, "step": 1523 }, { "epoch": 0.9392912172573189, "grad_norm": 1.5109727382659912, "learning_rate": 1.2767062347336974e-05, "loss": 1.3875, "step": 1524 }, { "epoch": 0.9399075500770416, "grad_norm": 1.4838725328445435, "learning_rate": 1.2756718411104627e-05, "loss": 1.2822, "step": 1525 }, { "epoch": 0.9405238828967643, "grad_norm": 1.4793325662612915, "learning_rate": 1.2746371281688462e-05, "loss": 1.3835, "step": 1526 }, { "epoch": 0.9411402157164869, "grad_norm": 1.486796259880066, "learning_rate": 1.2736020971073839e-05, "loss": 1.3211, "step": 1527 }, { "epoch": 0.9417565485362096, "grad_norm": 1.4298349618911743, "learning_rate": 1.272566749124982e-05, "loss": 1.2616, "step": 1528 }, { "epoch": 0.9423728813559322, "grad_norm": 1.460893154144287, "learning_rate": 1.2715310854209125e-05, "loss": 1.3724, "step": 1529 }, { "epoch": 0.9429892141756548, "grad_norm": 1.5055670738220215, "learning_rate": 1.2704951071948134e-05, "loss": 1.3784, "step": 1530 }, { "epoch": 0.9436055469953775, "grad_norm": 1.5397368669509888, "learning_rate": 1.2694588156466878e-05, "loss": 1.3306, "step": 1531 }, { "epoch": 0.9442218798151002, "grad_norm": 1.4848099946975708, "learning_rate": 1.2684222119769005e-05, "loss": 1.3207, "step": 1532 }, { "epoch": 0.9448382126348228, "grad_norm": 1.463877558708191, "learning_rate": 1.2673852973861789e-05, "loss": 1.2508, "step": 1533 }, { "epoch": 0.9454545454545454, "grad_norm": 1.512209177017212, "learning_rate": 1.2663480730756095e-05, "loss": 1.351, "step": 1534 }, { "epoch": 0.9460708782742681, "grad_norm": 1.5086302757263184, "learning_rate": 1.2653105402466388e-05, "loss": 1.3562, "step": 1535 }, { "epoch": 0.9466872110939908, "grad_norm": 1.4995537996292114, "learning_rate": 1.2642727001010696e-05, "loss": 1.3442, "step": 1536 }, { "epoch": 0.9473035439137134, "grad_norm": 1.45460844039917, "learning_rate": 1.2632345538410609e-05, "loss": 1.2555, "step": 1537 }, { "epoch": 0.947919876733436, "grad_norm": 1.492310881614685, "learning_rate": 1.2621961026691272e-05, "loss": 1.3645, "step": 1538 }, { "epoch": 0.9485362095531588, "grad_norm": 1.4821099042892456, "learning_rate": 1.2611573477881346e-05, "loss": 1.3621, "step": 1539 }, { "epoch": 0.9491525423728814, "grad_norm": 1.4976590871810913, "learning_rate": 1.2601182904013024e-05, "loss": 1.3352, "step": 1540 }, { "epoch": 0.949768875192604, "grad_norm": 1.4502075910568237, "learning_rate": 1.2590789317121998e-05, "loss": 1.3199, "step": 1541 }, { "epoch": 0.9503852080123266, "grad_norm": 1.518487811088562, "learning_rate": 1.2580392729247448e-05, "loss": 1.2828, "step": 1542 }, { "epoch": 0.9510015408320494, "grad_norm": 1.448959231376648, "learning_rate": 1.2569993152432028e-05, "loss": 1.2683, "step": 1543 }, { "epoch": 0.951617873651772, "grad_norm": 1.512891411781311, "learning_rate": 1.2559590598721864e-05, "loss": 1.3876, "step": 1544 }, { "epoch": 0.9522342064714946, "grad_norm": 1.479915738105774, "learning_rate": 1.2549185080166524e-05, "loss": 1.3743, "step": 1545 }, { "epoch": 0.9528505392912172, "grad_norm": 1.4823411703109741, "learning_rate": 1.2538776608819e-05, "loss": 1.3786, "step": 1546 }, { "epoch": 0.95346687211094, "grad_norm": 1.497011423110962, "learning_rate": 1.2528365196735731e-05, "loss": 1.3162, "step": 1547 }, { "epoch": 0.9540832049306626, "grad_norm": 1.4856659173965454, "learning_rate": 1.2517950855976535e-05, "loss": 1.3289, "step": 1548 }, { "epoch": 0.9546995377503852, "grad_norm": 1.442374348640442, "learning_rate": 1.2507533598604634e-05, "loss": 1.288, "step": 1549 }, { "epoch": 0.9553158705701078, "grad_norm": 1.5086760520935059, "learning_rate": 1.2497113436686628e-05, "loss": 1.3261, "step": 1550 }, { "epoch": 0.9559322033898305, "grad_norm": 1.541587471961975, "learning_rate": 1.2486690382292486e-05, "loss": 1.3317, "step": 1551 }, { "epoch": 0.9565485362095532, "grad_norm": 1.5415881872177124, "learning_rate": 1.2476264447495517e-05, "loss": 1.323, "step": 1552 }, { "epoch": 0.9571648690292758, "grad_norm": 1.5076320171356201, "learning_rate": 1.2465835644372375e-05, "loss": 1.2495, "step": 1553 }, { "epoch": 0.9577812018489985, "grad_norm": 1.4670486450195312, "learning_rate": 1.2455403985003035e-05, "loss": 1.3346, "step": 1554 }, { "epoch": 0.9583975346687211, "grad_norm": 1.5284167528152466, "learning_rate": 1.2444969481470775e-05, "loss": 1.3255, "step": 1555 }, { "epoch": 0.9590138674884437, "grad_norm": 1.4835635423660278, "learning_rate": 1.2434532145862172e-05, "loss": 1.3724, "step": 1556 }, { "epoch": 0.9596302003081664, "grad_norm": 1.4784793853759766, "learning_rate": 1.2424091990267087e-05, "loss": 1.319, "step": 1557 }, { "epoch": 0.9602465331278891, "grad_norm": 1.517376184463501, "learning_rate": 1.2413649026778638e-05, "loss": 1.3336, "step": 1558 }, { "epoch": 0.9608628659476117, "grad_norm": 1.4965075254440308, "learning_rate": 1.2403203267493205e-05, "loss": 1.3242, "step": 1559 }, { "epoch": 0.9614791987673343, "grad_norm": 1.5396623611450195, "learning_rate": 1.2392754724510402e-05, "loss": 1.3348, "step": 1560 }, { "epoch": 0.962095531587057, "grad_norm": 1.5243395566940308, "learning_rate": 1.2382303409933065e-05, "loss": 1.4097, "step": 1561 }, { "epoch": 0.9627118644067797, "grad_norm": 1.5539473295211792, "learning_rate": 1.2371849335867247e-05, "loss": 1.3541, "step": 1562 }, { "epoch": 0.9633281972265023, "grad_norm": 1.509760856628418, "learning_rate": 1.236139251442219e-05, "loss": 1.2951, "step": 1563 }, { "epoch": 0.9639445300462249, "grad_norm": 1.544867753982544, "learning_rate": 1.2350932957710322e-05, "loss": 1.3753, "step": 1564 }, { "epoch": 0.9645608628659477, "grad_norm": 1.4922329187393188, "learning_rate": 1.2340470677847243e-05, "loss": 1.3504, "step": 1565 }, { "epoch": 0.9651771956856703, "grad_norm": 1.5165804624557495, "learning_rate": 1.2330005686951698e-05, "loss": 1.3727, "step": 1566 }, { "epoch": 0.9657935285053929, "grad_norm": 1.451600193977356, "learning_rate": 1.2319537997145584e-05, "loss": 1.3781, "step": 1567 }, { "epoch": 0.9664098613251155, "grad_norm": 1.4710602760314941, "learning_rate": 1.2309067620553913e-05, "loss": 1.3405, "step": 1568 }, { "epoch": 0.9670261941448383, "grad_norm": 1.4638984203338623, "learning_rate": 1.2298594569304818e-05, "loss": 1.3161, "step": 1569 }, { "epoch": 0.9676425269645609, "grad_norm": 1.4884973764419556, "learning_rate": 1.2288118855529526e-05, "loss": 1.2913, "step": 1570 }, { "epoch": 0.9682588597842835, "grad_norm": 1.5139981508255005, "learning_rate": 1.227764049136234e-05, "loss": 1.3924, "step": 1571 }, { "epoch": 0.9688751926040061, "grad_norm": 1.5501998662948608, "learning_rate": 1.2267159488940656e-05, "loss": 1.3893, "step": 1572 }, { "epoch": 0.9694915254237289, "grad_norm": 1.4986555576324463, "learning_rate": 1.2256675860404901e-05, "loss": 1.4026, "step": 1573 }, { "epoch": 0.9701078582434515, "grad_norm": 1.4892547130584717, "learning_rate": 1.2246189617898556e-05, "loss": 1.3071, "step": 1574 }, { "epoch": 0.9707241910631741, "grad_norm": 1.4597586393356323, "learning_rate": 1.2235700773568134e-05, "loss": 1.3143, "step": 1575 }, { "epoch": 0.9713405238828967, "grad_norm": 1.5127228498458862, "learning_rate": 1.2225209339563144e-05, "loss": 1.3425, "step": 1576 }, { "epoch": 0.9719568567026194, "grad_norm": 1.5094207525253296, "learning_rate": 1.2214715328036117e-05, "loss": 1.3173, "step": 1577 }, { "epoch": 0.9725731895223421, "grad_norm": 1.5327695608139038, "learning_rate": 1.220421875114256e-05, "loss": 1.4044, "step": 1578 }, { "epoch": 0.9731895223420647, "grad_norm": 1.5420618057250977, "learning_rate": 1.2193719621040942e-05, "loss": 1.3454, "step": 1579 }, { "epoch": 0.9738058551617874, "grad_norm": 1.500922679901123, "learning_rate": 1.2183217949892705e-05, "loss": 1.2499, "step": 1580 }, { "epoch": 0.97442218798151, "grad_norm": 1.4858174324035645, "learning_rate": 1.2172713749862229e-05, "loss": 1.3073, "step": 1581 }, { "epoch": 0.9750385208012327, "grad_norm": 1.5245347023010254, "learning_rate": 1.2162207033116816e-05, "loss": 1.3304, "step": 1582 }, { "epoch": 0.9756548536209553, "grad_norm": 1.50897216796875, "learning_rate": 1.2151697811826688e-05, "loss": 1.3096, "step": 1583 }, { "epoch": 0.976271186440678, "grad_norm": 1.542466640472412, "learning_rate": 1.214118609816498e-05, "loss": 1.358, "step": 1584 }, { "epoch": 0.9768875192604006, "grad_norm": 1.5227092504501343, "learning_rate": 1.2130671904307692e-05, "loss": 1.334, "step": 1585 }, { "epoch": 0.9775038520801232, "grad_norm": 1.5541828870773315, "learning_rate": 1.2120155242433711e-05, "loss": 1.3247, "step": 1586 }, { "epoch": 0.9781201848998459, "grad_norm": 1.4885971546173096, "learning_rate": 1.210963612472478e-05, "loss": 1.3033, "step": 1587 }, { "epoch": 0.9787365177195686, "grad_norm": 1.4917629957199097, "learning_rate": 1.209911456336549e-05, "loss": 1.2655, "step": 1588 }, { "epoch": 0.9793528505392912, "grad_norm": 1.6037946939468384, "learning_rate": 1.2088590570543252e-05, "loss": 1.3271, "step": 1589 }, { "epoch": 0.9799691833590138, "grad_norm": 1.5261393785476685, "learning_rate": 1.2078064158448308e-05, "loss": 1.3091, "step": 1590 }, { "epoch": 0.9805855161787366, "grad_norm": 1.5644488334655762, "learning_rate": 1.206753533927369e-05, "loss": 1.3218, "step": 1591 }, { "epoch": 0.9812018489984592, "grad_norm": 1.5540693998336792, "learning_rate": 1.2057004125215223e-05, "loss": 1.3493, "step": 1592 }, { "epoch": 0.9818181818181818, "grad_norm": 1.4877400398254395, "learning_rate": 1.2046470528471508e-05, "loss": 1.2666, "step": 1593 }, { "epoch": 0.9824345146379044, "grad_norm": 1.524936318397522, "learning_rate": 1.2035934561243905e-05, "loss": 1.3564, "step": 1594 }, { "epoch": 0.9830508474576272, "grad_norm": 1.4811499118804932, "learning_rate": 1.2025396235736516e-05, "loss": 1.315, "step": 1595 }, { "epoch": 0.9836671802773498, "grad_norm": 1.469612956047058, "learning_rate": 1.2014855564156175e-05, "loss": 1.291, "step": 1596 }, { "epoch": 0.9842835130970724, "grad_norm": 1.522155523300171, "learning_rate": 1.2004312558712442e-05, "loss": 1.2982, "step": 1597 }, { "epoch": 0.984899845916795, "grad_norm": 1.517244815826416, "learning_rate": 1.1993767231617568e-05, "loss": 1.3448, "step": 1598 }, { "epoch": 0.9855161787365178, "grad_norm": 1.551138997077942, "learning_rate": 1.1983219595086507e-05, "loss": 1.3294, "step": 1599 }, { "epoch": 0.9861325115562404, "grad_norm": 1.5549120903015137, "learning_rate": 1.1972669661336872e-05, "loss": 1.3565, "step": 1600 }, { "epoch": 0.986748844375963, "grad_norm": 1.5333266258239746, "learning_rate": 1.196211744258895e-05, "loss": 1.3081, "step": 1601 }, { "epoch": 0.9873651771956856, "grad_norm": 1.5496264696121216, "learning_rate": 1.1951562951065675e-05, "loss": 1.3368, "step": 1602 }, { "epoch": 0.9879815100154083, "grad_norm": 1.4952367544174194, "learning_rate": 1.1941006198992598e-05, "loss": 1.2788, "step": 1603 }, { "epoch": 0.988597842835131, "grad_norm": 1.5539542436599731, "learning_rate": 1.1930447198597905e-05, "loss": 1.3676, "step": 1604 }, { "epoch": 0.9892141756548536, "grad_norm": 1.518926978111267, "learning_rate": 1.1919885962112385e-05, "loss": 1.2683, "step": 1605 }, { "epoch": 0.9898305084745763, "grad_norm": 1.5524379014968872, "learning_rate": 1.1909322501769407e-05, "loss": 1.3415, "step": 1606 }, { "epoch": 0.9904468412942989, "grad_norm": 1.5077852010726929, "learning_rate": 1.1898756829804922e-05, "loss": 1.364, "step": 1607 }, { "epoch": 0.9910631741140216, "grad_norm": 1.5172991752624512, "learning_rate": 1.1888188958457444e-05, "loss": 1.3309, "step": 1608 }, { "epoch": 0.9916795069337442, "grad_norm": 1.5227206945419312, "learning_rate": 1.1877618899968038e-05, "loss": 1.3682, "step": 1609 }, { "epoch": 0.9922958397534669, "grad_norm": 1.5306718349456787, "learning_rate": 1.1867046666580287e-05, "loss": 1.2956, "step": 1610 }, { "epoch": 0.9929121725731895, "grad_norm": 1.6123008728027344, "learning_rate": 1.1856472270540312e-05, "loss": 1.2646, "step": 1611 }, { "epoch": 0.9935285053929122, "grad_norm": 1.4876329898834229, "learning_rate": 1.1845895724096727e-05, "loss": 1.3004, "step": 1612 }, { "epoch": 0.9941448382126348, "grad_norm": 1.5014806985855103, "learning_rate": 1.183531703950064e-05, "loss": 1.3112, "step": 1613 }, { "epoch": 0.9947611710323575, "grad_norm": 1.4946600198745728, "learning_rate": 1.182473622900564e-05, "loss": 1.2785, "step": 1614 }, { "epoch": 0.9953775038520801, "grad_norm": 1.5371549129486084, "learning_rate": 1.1814153304867775e-05, "loss": 1.3204, "step": 1615 }, { "epoch": 0.9959938366718027, "grad_norm": 1.5026248693466187, "learning_rate": 1.180356827934554e-05, "loss": 1.3339, "step": 1616 }, { "epoch": 0.9966101694915255, "grad_norm": 1.55011785030365, "learning_rate": 1.179298116469986e-05, "loss": 1.2808, "step": 1617 }, { "epoch": 0.9972265023112481, "grad_norm": 1.5160036087036133, "learning_rate": 1.1782391973194095e-05, "loss": 1.3392, "step": 1618 }, { "epoch": 0.9978428351309707, "grad_norm": 1.53262197971344, "learning_rate": 1.1771800717093995e-05, "loss": 1.3381, "step": 1619 }, { "epoch": 0.9984591679506933, "grad_norm": 1.527487874031067, "learning_rate": 1.1761207408667702e-05, "loss": 1.2764, "step": 1620 }, { "epoch": 0.9990755007704161, "grad_norm": 1.5237523317337036, "learning_rate": 1.1750612060185755e-05, "loss": 1.2828, "step": 1621 }, { "epoch": 0.9996918335901387, "grad_norm": 1.558052897453308, "learning_rate": 1.174001468392103e-05, "loss": 1.3627, "step": 1622 }, { "epoch": 1.0003081664098614, "grad_norm": 1.532157301902771, "learning_rate": 1.1729415292148768e-05, "loss": 1.2051, "step": 1623 }, { "epoch": 1.000924499229584, "grad_norm": 1.5478503704071045, "learning_rate": 1.1718813897146534e-05, "loss": 1.1418, "step": 1624 }, { "epoch": 1.0015408320493067, "grad_norm": 1.5938588380813599, "learning_rate": 1.1708210511194231e-05, "loss": 1.2372, "step": 1625 }, { "epoch": 1.0021571648690293, "grad_norm": 1.6270500421524048, "learning_rate": 1.1697605146574047e-05, "loss": 1.1936, "step": 1626 }, { "epoch": 1.002773497688752, "grad_norm": 1.621793270111084, "learning_rate": 1.1686997815570473e-05, "loss": 1.158, "step": 1627 }, { "epoch": 1.0033898305084745, "grad_norm": 1.677977442741394, "learning_rate": 1.1676388530470281e-05, "loss": 1.1623, "step": 1628 }, { "epoch": 1.0040061633281971, "grad_norm": 1.6872823238372803, "learning_rate": 1.1665777303562498e-05, "loss": 1.1995, "step": 1629 }, { "epoch": 1.00462249614792, "grad_norm": 1.6343873739242554, "learning_rate": 1.1655164147138405e-05, "loss": 1.1069, "step": 1630 }, { "epoch": 1.0052388289676426, "grad_norm": 1.7186012268066406, "learning_rate": 1.1644549073491518e-05, "loss": 1.2511, "step": 1631 }, { "epoch": 1.0058551617873652, "grad_norm": 1.7156963348388672, "learning_rate": 1.163393209491757e-05, "loss": 1.2065, "step": 1632 }, { "epoch": 1.0064714946070878, "grad_norm": 1.6278984546661377, "learning_rate": 1.162331322371451e-05, "loss": 1.1498, "step": 1633 }, { "epoch": 1.0070878274268105, "grad_norm": 1.613917589187622, "learning_rate": 1.1612692472182463e-05, "loss": 1.1289, "step": 1634 }, { "epoch": 1.007704160246533, "grad_norm": 1.6489546298980713, "learning_rate": 1.1602069852623748e-05, "loss": 1.2254, "step": 1635 }, { "epoch": 1.0083204930662557, "grad_norm": 1.6430319547653198, "learning_rate": 1.1591445377342843e-05, "loss": 1.1738, "step": 1636 }, { "epoch": 1.0089368258859783, "grad_norm": 1.6729966402053833, "learning_rate": 1.1580819058646369e-05, "loss": 1.1956, "step": 1637 }, { "epoch": 1.0095531587057012, "grad_norm": 1.666024923324585, "learning_rate": 1.1570190908843089e-05, "loss": 1.1455, "step": 1638 }, { "epoch": 1.0101694915254238, "grad_norm": 1.7424362897872925, "learning_rate": 1.1559560940243887e-05, "loss": 1.1794, "step": 1639 }, { "epoch": 1.0107858243451464, "grad_norm": 1.6913353204727173, "learning_rate": 1.1548929165161751e-05, "loss": 1.1656, "step": 1640 }, { "epoch": 1.011402157164869, "grad_norm": 1.653147578239441, "learning_rate": 1.1538295595911764e-05, "loss": 1.1664, "step": 1641 }, { "epoch": 1.0120184899845917, "grad_norm": 1.6270132064819336, "learning_rate": 1.152766024481108e-05, "loss": 1.1909, "step": 1642 }, { "epoch": 1.0126348228043143, "grad_norm": 1.6727157831192017, "learning_rate": 1.1517023124178933e-05, "loss": 1.2041, "step": 1643 }, { "epoch": 1.013251155624037, "grad_norm": 1.6736795902252197, "learning_rate": 1.1506384246336588e-05, "loss": 1.1245, "step": 1644 }, { "epoch": 1.0138674884437597, "grad_norm": 1.6414756774902344, "learning_rate": 1.1495743623607357e-05, "loss": 1.1508, "step": 1645 }, { "epoch": 1.0144838212634824, "grad_norm": 1.6608948707580566, "learning_rate": 1.1485101268316572e-05, "loss": 1.1935, "step": 1646 }, { "epoch": 1.015100154083205, "grad_norm": 1.6625101566314697, "learning_rate": 1.1474457192791564e-05, "loss": 1.1105, "step": 1647 }, { "epoch": 1.0157164869029276, "grad_norm": 1.6754577159881592, "learning_rate": 1.1463811409361667e-05, "loss": 1.2027, "step": 1648 }, { "epoch": 1.0163328197226502, "grad_norm": 1.6138331890106201, "learning_rate": 1.1453163930358192e-05, "loss": 1.1315, "step": 1649 }, { "epoch": 1.0169491525423728, "grad_norm": 1.637281894683838, "learning_rate": 1.1442514768114404e-05, "loss": 1.1813, "step": 1650 }, { "epoch": 1.0175654853620955, "grad_norm": 1.6954149007797241, "learning_rate": 1.1431863934965527e-05, "loss": 1.178, "step": 1651 }, { "epoch": 1.018181818181818, "grad_norm": 1.6617443561553955, "learning_rate": 1.1421211443248721e-05, "loss": 1.2177, "step": 1652 }, { "epoch": 1.018798151001541, "grad_norm": 1.6696064472198486, "learning_rate": 1.1410557305303057e-05, "loss": 1.1826, "step": 1653 }, { "epoch": 1.0194144838212635, "grad_norm": 1.6819738149642944, "learning_rate": 1.1399901533469531e-05, "loss": 1.1935, "step": 1654 }, { "epoch": 1.0200308166409862, "grad_norm": 1.6750751733779907, "learning_rate": 1.1389244140091014e-05, "loss": 1.1503, "step": 1655 }, { "epoch": 1.0206471494607088, "grad_norm": 1.6721802949905396, "learning_rate": 1.1378585137512264e-05, "loss": 1.1703, "step": 1656 }, { "epoch": 1.0212634822804314, "grad_norm": 1.6635315418243408, "learning_rate": 1.1367924538079895e-05, "loss": 1.164, "step": 1657 }, { "epoch": 1.021879815100154, "grad_norm": 1.6570863723754883, "learning_rate": 1.1357262354142385e-05, "loss": 1.0956, "step": 1658 }, { "epoch": 1.0224961479198766, "grad_norm": 1.6601768732070923, "learning_rate": 1.1346598598050039e-05, "loss": 1.1285, "step": 1659 }, { "epoch": 1.0231124807395995, "grad_norm": 1.7553274631500244, "learning_rate": 1.1335933282154974e-05, "loss": 1.2277, "step": 1660 }, { "epoch": 1.023728813559322, "grad_norm": 1.6631989479064941, "learning_rate": 1.132526641881113e-05, "loss": 1.1422, "step": 1661 }, { "epoch": 1.0243451463790447, "grad_norm": 1.6264389753341675, "learning_rate": 1.1314598020374232e-05, "loss": 1.1413, "step": 1662 }, { "epoch": 1.0249614791987673, "grad_norm": 1.6422289609909058, "learning_rate": 1.130392809920178e-05, "loss": 1.1565, "step": 1663 }, { "epoch": 1.02557781201849, "grad_norm": 1.6504275798797607, "learning_rate": 1.1293256667653043e-05, "loss": 1.2024, "step": 1664 }, { "epoch": 1.0261941448382126, "grad_norm": 1.659157156944275, "learning_rate": 1.1282583738089043e-05, "loss": 1.2028, "step": 1665 }, { "epoch": 1.0268104776579352, "grad_norm": 1.6415371894836426, "learning_rate": 1.1271909322872523e-05, "loss": 1.2064, "step": 1666 }, { "epoch": 1.027426810477658, "grad_norm": 1.6763290166854858, "learning_rate": 1.1261233434367962e-05, "loss": 1.2227, "step": 1667 }, { "epoch": 1.0280431432973807, "grad_norm": 1.671391248703003, "learning_rate": 1.125055608494154e-05, "loss": 1.2224, "step": 1668 }, { "epoch": 1.0286594761171033, "grad_norm": 1.680293321609497, "learning_rate": 1.1239877286961123e-05, "loss": 1.2023, "step": 1669 }, { "epoch": 1.029275808936826, "grad_norm": 1.6694871187210083, "learning_rate": 1.1229197052796266e-05, "loss": 1.1966, "step": 1670 }, { "epoch": 1.0298921417565485, "grad_norm": 1.6822479963302612, "learning_rate": 1.1218515394818176e-05, "loss": 1.2137, "step": 1671 }, { "epoch": 1.0305084745762711, "grad_norm": 1.6610949039459229, "learning_rate": 1.1207832325399722e-05, "loss": 1.1381, "step": 1672 }, { "epoch": 1.0311248073959938, "grad_norm": 1.6747246980667114, "learning_rate": 1.11971478569154e-05, "loss": 1.1507, "step": 1673 }, { "epoch": 1.0317411402157164, "grad_norm": 1.674744725227356, "learning_rate": 1.1186462001741323e-05, "loss": 1.1385, "step": 1674 }, { "epoch": 1.0323574730354392, "grad_norm": 1.7140374183654785, "learning_rate": 1.117577477225522e-05, "loss": 1.1871, "step": 1675 }, { "epoch": 1.0329738058551619, "grad_norm": 1.7452441453933716, "learning_rate": 1.1165086180836406e-05, "loss": 1.1999, "step": 1676 }, { "epoch": 1.0335901386748845, "grad_norm": 1.7006239891052246, "learning_rate": 1.1154396239865775e-05, "loss": 1.1372, "step": 1677 }, { "epoch": 1.034206471494607, "grad_norm": 1.6415176391601562, "learning_rate": 1.1143704961725781e-05, "loss": 1.147, "step": 1678 }, { "epoch": 1.0348228043143297, "grad_norm": 1.6549248695373535, "learning_rate": 1.1133012358800432e-05, "loss": 1.1257, "step": 1679 }, { "epoch": 1.0354391371340523, "grad_norm": 1.6558971405029297, "learning_rate": 1.1122318443475268e-05, "loss": 1.1877, "step": 1680 }, { "epoch": 1.036055469953775, "grad_norm": 1.6760103702545166, "learning_rate": 1.1111623228137347e-05, "loss": 1.2516, "step": 1681 }, { "epoch": 1.0366718027734976, "grad_norm": 1.674357295036316, "learning_rate": 1.1100926725175239e-05, "loss": 1.1446, "step": 1682 }, { "epoch": 1.0372881355932204, "grad_norm": 1.6593265533447266, "learning_rate": 1.1090228946979e-05, "loss": 1.2176, "step": 1683 }, { "epoch": 1.037904468412943, "grad_norm": 1.7100541591644287, "learning_rate": 1.1079529905940162e-05, "loss": 1.2292, "step": 1684 }, { "epoch": 1.0385208012326657, "grad_norm": 1.610440731048584, "learning_rate": 1.1068829614451728e-05, "loss": 1.1095, "step": 1685 }, { "epoch": 1.0391371340523883, "grad_norm": 1.7121686935424805, "learning_rate": 1.1058128084908145e-05, "loss": 1.1596, "step": 1686 }, { "epoch": 1.039753466872111, "grad_norm": 1.7559598684310913, "learning_rate": 1.1047425329705283e-05, "loss": 1.1255, "step": 1687 }, { "epoch": 1.0403697996918335, "grad_norm": 1.729260802268982, "learning_rate": 1.103672136124045e-05, "loss": 1.1982, "step": 1688 }, { "epoch": 1.0409861325115561, "grad_norm": 1.6465709209442139, "learning_rate": 1.1026016191912353e-05, "loss": 1.1022, "step": 1689 }, { "epoch": 1.041602465331279, "grad_norm": 1.6972978115081787, "learning_rate": 1.1015309834121083e-05, "loss": 1.1638, "step": 1690 }, { "epoch": 1.0422187981510016, "grad_norm": 1.727441668510437, "learning_rate": 1.1004602300268113e-05, "loss": 1.1946, "step": 1691 }, { "epoch": 1.0428351309707242, "grad_norm": 1.7458136081695557, "learning_rate": 1.0993893602756283e-05, "loss": 1.2014, "step": 1692 }, { "epoch": 1.0434514637904468, "grad_norm": 1.7200745344161987, "learning_rate": 1.0983183753989772e-05, "loss": 1.2158, "step": 1693 }, { "epoch": 1.0440677966101695, "grad_norm": 1.7580360174179077, "learning_rate": 1.0972472766374091e-05, "loss": 1.2384, "step": 1694 }, { "epoch": 1.044684129429892, "grad_norm": 1.665393590927124, "learning_rate": 1.0961760652316091e-05, "loss": 1.157, "step": 1695 }, { "epoch": 1.0453004622496147, "grad_norm": 1.6485153436660767, "learning_rate": 1.09510474242239e-05, "loss": 1.1995, "step": 1696 }, { "epoch": 1.0459167950693375, "grad_norm": 1.6452207565307617, "learning_rate": 1.0940333094506952e-05, "loss": 1.1542, "step": 1697 }, { "epoch": 1.0465331278890602, "grad_norm": 1.627976417541504, "learning_rate": 1.0929617675575955e-05, "loss": 1.161, "step": 1698 }, { "epoch": 1.0471494607087828, "grad_norm": 1.6823524236679077, "learning_rate": 1.0918901179842878e-05, "loss": 1.1602, "step": 1699 }, { "epoch": 1.0477657935285054, "grad_norm": 1.698856234550476, "learning_rate": 1.0908183619720933e-05, "loss": 1.1704, "step": 1700 }, { "epoch": 1.048382126348228, "grad_norm": 1.6878761053085327, "learning_rate": 1.0897465007624572e-05, "loss": 1.1167, "step": 1701 }, { "epoch": 1.0489984591679506, "grad_norm": 1.7008631229400635, "learning_rate": 1.0886745355969464e-05, "loss": 1.1712, "step": 1702 }, { "epoch": 1.0496147919876733, "grad_norm": 1.6876412630081177, "learning_rate": 1.0876024677172476e-05, "loss": 1.1502, "step": 1703 }, { "epoch": 1.0502311248073959, "grad_norm": 1.6719645261764526, "learning_rate": 1.0865302983651674e-05, "loss": 1.1948, "step": 1704 }, { "epoch": 1.0508474576271187, "grad_norm": 1.6950397491455078, "learning_rate": 1.0854580287826293e-05, "loss": 1.1822, "step": 1705 }, { "epoch": 1.0514637904468414, "grad_norm": 1.7246159315109253, "learning_rate": 1.0843856602116727e-05, "loss": 1.2471, "step": 1706 }, { "epoch": 1.052080123266564, "grad_norm": 1.749547004699707, "learning_rate": 1.0833131938944526e-05, "loss": 1.1849, "step": 1707 }, { "epoch": 1.0526964560862866, "grad_norm": 1.7358349561691284, "learning_rate": 1.0822406310732363e-05, "loss": 1.245, "step": 1708 }, { "epoch": 1.0533127889060092, "grad_norm": 1.7031949758529663, "learning_rate": 1.0811679729904031e-05, "loss": 1.1643, "step": 1709 }, { "epoch": 1.0539291217257318, "grad_norm": 1.8071651458740234, "learning_rate": 1.0800952208884433e-05, "loss": 1.2073, "step": 1710 }, { "epoch": 1.0545454545454545, "grad_norm": 1.6986761093139648, "learning_rate": 1.079022376009955e-05, "loss": 1.1748, "step": 1711 }, { "epoch": 1.0551617873651773, "grad_norm": 1.7395572662353516, "learning_rate": 1.0779494395976447e-05, "loss": 1.2177, "step": 1712 }, { "epoch": 1.0557781201849, "grad_norm": 1.7205052375793457, "learning_rate": 1.0768764128943245e-05, "loss": 1.1734, "step": 1713 }, { "epoch": 1.0563944530046225, "grad_norm": 1.6641335487365723, "learning_rate": 1.075803297142911e-05, "loss": 1.1822, "step": 1714 }, { "epoch": 1.0570107858243452, "grad_norm": 1.668532371520996, "learning_rate": 1.0747300935864245e-05, "loss": 1.1366, "step": 1715 }, { "epoch": 1.0576271186440678, "grad_norm": 1.6654084920883179, "learning_rate": 1.0736568034679862e-05, "loss": 1.2022, "step": 1716 }, { "epoch": 1.0582434514637904, "grad_norm": 1.6544806957244873, "learning_rate": 1.072583428030818e-05, "loss": 1.1668, "step": 1717 }, { "epoch": 1.058859784283513, "grad_norm": 1.691140055656433, "learning_rate": 1.0715099685182409e-05, "loss": 1.1784, "step": 1718 }, { "epoch": 1.0594761171032356, "grad_norm": 1.7504491806030273, "learning_rate": 1.0704364261736727e-05, "loss": 1.1426, "step": 1719 }, { "epoch": 1.0600924499229585, "grad_norm": 1.7295321226119995, "learning_rate": 1.0693628022406277e-05, "loss": 1.1583, "step": 1720 }, { "epoch": 1.060708782742681, "grad_norm": 1.7236430644989014, "learning_rate": 1.0682890979627141e-05, "loss": 1.2097, "step": 1721 }, { "epoch": 1.0613251155624037, "grad_norm": 1.7205568552017212, "learning_rate": 1.0672153145836336e-05, "loss": 1.2208, "step": 1722 }, { "epoch": 1.0619414483821263, "grad_norm": 1.7419980764389038, "learning_rate": 1.0661414533471802e-05, "loss": 1.2056, "step": 1723 }, { "epoch": 1.062557781201849, "grad_norm": 1.742187261581421, "learning_rate": 1.0650675154972361e-05, "loss": 1.2043, "step": 1724 }, { "epoch": 1.0631741140215716, "grad_norm": 1.7303779125213623, "learning_rate": 1.0639935022777741e-05, "loss": 1.1991, "step": 1725 }, { "epoch": 1.0637904468412942, "grad_norm": 1.6789500713348389, "learning_rate": 1.0629194149328537e-05, "loss": 1.1253, "step": 1726 }, { "epoch": 1.064406779661017, "grad_norm": 1.6874445676803589, "learning_rate": 1.0618452547066202e-05, "loss": 1.2073, "step": 1727 }, { "epoch": 1.0650231124807397, "grad_norm": 1.7014974355697632, "learning_rate": 1.060771022843303e-05, "loss": 1.1588, "step": 1728 }, { "epoch": 1.0656394453004623, "grad_norm": 1.7384965419769287, "learning_rate": 1.0596967205872156e-05, "loss": 1.1502, "step": 1729 }, { "epoch": 1.066255778120185, "grad_norm": 1.7259947061538696, "learning_rate": 1.0586223491827514e-05, "loss": 1.1538, "step": 1730 }, { "epoch": 1.0668721109399075, "grad_norm": 1.7045420408248901, "learning_rate": 1.057547909874385e-05, "loss": 1.1832, "step": 1731 }, { "epoch": 1.0674884437596301, "grad_norm": 1.7573102712631226, "learning_rate": 1.05647340390667e-05, "loss": 1.2041, "step": 1732 }, { "epoch": 1.0681047765793528, "grad_norm": 1.7170424461364746, "learning_rate": 1.055398832524236e-05, "loss": 1.1586, "step": 1733 }, { "epoch": 1.0687211093990756, "grad_norm": 1.684150218963623, "learning_rate": 1.0543241969717892e-05, "loss": 1.1441, "step": 1734 }, { "epoch": 1.0693374422187982, "grad_norm": 1.6515668630599976, "learning_rate": 1.0532494984941097e-05, "loss": 1.097, "step": 1735 }, { "epoch": 1.0699537750385208, "grad_norm": 1.6909493207931519, "learning_rate": 1.0521747383360515e-05, "loss": 1.1379, "step": 1736 }, { "epoch": 1.0705701078582435, "grad_norm": 1.7506537437438965, "learning_rate": 1.0510999177425383e-05, "loss": 1.2048, "step": 1737 }, { "epoch": 1.071186440677966, "grad_norm": 1.7358918190002441, "learning_rate": 1.050025037958565e-05, "loss": 1.1773, "step": 1738 }, { "epoch": 1.0718027734976887, "grad_norm": 1.702453851699829, "learning_rate": 1.0489501002291954e-05, "loss": 1.1635, "step": 1739 }, { "epoch": 1.0724191063174113, "grad_norm": 1.767865538597107, "learning_rate": 1.0478751057995593e-05, "loss": 1.167, "step": 1740 }, { "epoch": 1.073035439137134, "grad_norm": 1.676203727722168, "learning_rate": 1.0468000559148528e-05, "loss": 1.171, "step": 1741 }, { "epoch": 1.0736517719568568, "grad_norm": 1.7054195404052734, "learning_rate": 1.0457249518203366e-05, "loss": 1.1645, "step": 1742 }, { "epoch": 1.0742681047765794, "grad_norm": 1.7286897897720337, "learning_rate": 1.044649794761333e-05, "loss": 1.178, "step": 1743 }, { "epoch": 1.074884437596302, "grad_norm": 1.718239426612854, "learning_rate": 1.0435745859832271e-05, "loss": 1.1943, "step": 1744 }, { "epoch": 1.0755007704160247, "grad_norm": 1.679903507232666, "learning_rate": 1.0424993267314627e-05, "loss": 1.1646, "step": 1745 }, { "epoch": 1.0761171032357473, "grad_norm": 1.6798471212387085, "learning_rate": 1.041424018251543e-05, "loss": 1.096, "step": 1746 }, { "epoch": 1.07673343605547, "grad_norm": 1.7211341857910156, "learning_rate": 1.0403486617890277e-05, "loss": 1.1449, "step": 1747 }, { "epoch": 1.0773497688751925, "grad_norm": 1.7579010725021362, "learning_rate": 1.039273258589532e-05, "loss": 1.1802, "step": 1748 }, { "epoch": 1.0779661016949154, "grad_norm": 1.7467325925827026, "learning_rate": 1.0381978098987255e-05, "loss": 1.1367, "step": 1749 }, { "epoch": 1.078582434514638, "grad_norm": 1.7347357273101807, "learning_rate": 1.0371223169623306e-05, "loss": 1.187, "step": 1750 }, { "epoch": 1.0791987673343606, "grad_norm": 1.768505334854126, "learning_rate": 1.0360467810261204e-05, "loss": 1.141, "step": 1751 }, { "epoch": 1.0798151001540832, "grad_norm": 1.7369158267974854, "learning_rate": 1.0349712033359184e-05, "loss": 1.1547, "step": 1752 }, { "epoch": 1.0804314329738058, "grad_norm": 1.681262493133545, "learning_rate": 1.0338955851375962e-05, "loss": 1.1252, "step": 1753 }, { "epoch": 1.0810477657935285, "grad_norm": 1.7266089916229248, "learning_rate": 1.0328199276770727e-05, "loss": 1.1637, "step": 1754 }, { "epoch": 1.081664098613251, "grad_norm": 1.7784591913223267, "learning_rate": 1.0317442322003113e-05, "loss": 1.2053, "step": 1755 }, { "epoch": 1.0822804314329737, "grad_norm": 1.6345996856689453, "learning_rate": 1.0306684999533203e-05, "loss": 1.1106, "step": 1756 }, { "epoch": 1.0828967642526965, "grad_norm": 1.6926023960113525, "learning_rate": 1.0295927321821506e-05, "loss": 1.1517, "step": 1757 }, { "epoch": 1.0835130970724192, "grad_norm": 1.687718391418457, "learning_rate": 1.0285169301328938e-05, "loss": 1.1554, "step": 1758 }, { "epoch": 1.0841294298921418, "grad_norm": 1.703687071800232, "learning_rate": 1.0274410950516816e-05, "loss": 1.1568, "step": 1759 }, { "epoch": 1.0847457627118644, "grad_norm": 1.736072063446045, "learning_rate": 1.0263652281846837e-05, "loss": 1.1721, "step": 1760 }, { "epoch": 1.085362095531587, "grad_norm": 1.7136366367340088, "learning_rate": 1.0252893307781064e-05, "loss": 1.1913, "step": 1761 }, { "epoch": 1.0859784283513096, "grad_norm": 1.7000943422317505, "learning_rate": 1.0242134040781919e-05, "loss": 1.1583, "step": 1762 }, { "epoch": 1.0865947611710323, "grad_norm": 1.7180479764938354, "learning_rate": 1.0231374493312165e-05, "loss": 1.1719, "step": 1763 }, { "epoch": 1.087211093990755, "grad_norm": 1.703515887260437, "learning_rate": 1.0220614677834882e-05, "loss": 1.1122, "step": 1764 }, { "epoch": 1.0878274268104777, "grad_norm": 1.7440391778945923, "learning_rate": 1.020985460681346e-05, "loss": 1.1873, "step": 1765 }, { "epoch": 1.0884437596302003, "grad_norm": 1.7445173263549805, "learning_rate": 1.0199094292711597e-05, "loss": 1.1625, "step": 1766 }, { "epoch": 1.089060092449923, "grad_norm": 1.7920652627944946, "learning_rate": 1.0188333747993265e-05, "loss": 1.1736, "step": 1767 }, { "epoch": 1.0896764252696456, "grad_norm": 1.7361292839050293, "learning_rate": 1.0177572985122693e-05, "loss": 1.1528, "step": 1768 }, { "epoch": 1.0902927580893682, "grad_norm": 1.7838810682296753, "learning_rate": 1.0166812016564385e-05, "loss": 1.1594, "step": 1769 }, { "epoch": 1.0909090909090908, "grad_norm": 1.7376344203948975, "learning_rate": 1.0156050854783064e-05, "loss": 1.1752, "step": 1770 }, { "epoch": 1.0915254237288137, "grad_norm": 1.7934634685516357, "learning_rate": 1.0145289512243686e-05, "loss": 1.201, "step": 1771 }, { "epoch": 1.0921417565485363, "grad_norm": 1.7551517486572266, "learning_rate": 1.0134528001411412e-05, "loss": 1.1715, "step": 1772 }, { "epoch": 1.092758089368259, "grad_norm": 1.744391679763794, "learning_rate": 1.0123766334751605e-05, "loss": 1.1688, "step": 1773 }, { "epoch": 1.0933744221879815, "grad_norm": 1.7181929349899292, "learning_rate": 1.01130045247298e-05, "loss": 1.1554, "step": 1774 }, { "epoch": 1.0939907550077042, "grad_norm": 1.746791124343872, "learning_rate": 1.0102242583811699e-05, "loss": 1.1768, "step": 1775 }, { "epoch": 1.0946070878274268, "grad_norm": 1.8035285472869873, "learning_rate": 1.0091480524463167e-05, "loss": 1.1724, "step": 1776 }, { "epoch": 1.0952234206471494, "grad_norm": 1.7622153759002686, "learning_rate": 1.0080718359150188e-05, "loss": 1.1618, "step": 1777 }, { "epoch": 1.095839753466872, "grad_norm": 1.83767569065094, "learning_rate": 1.0069956100338882e-05, "loss": 1.1739, "step": 1778 }, { "epoch": 1.0964560862865949, "grad_norm": 1.7849615812301636, "learning_rate": 1.0059193760495477e-05, "loss": 1.1838, "step": 1779 }, { "epoch": 1.0970724191063175, "grad_norm": 1.7381643056869507, "learning_rate": 1.0048431352086285e-05, "loss": 1.1649, "step": 1780 }, { "epoch": 1.09768875192604, "grad_norm": 1.7230746746063232, "learning_rate": 1.003766888757771e-05, "loss": 1.0994, "step": 1781 }, { "epoch": 1.0983050847457627, "grad_norm": 1.7473838329315186, "learning_rate": 1.002690637943621e-05, "loss": 1.1545, "step": 1782 }, { "epoch": 1.0989214175654853, "grad_norm": 1.7174322605133057, "learning_rate": 1.0016143840128299e-05, "loss": 1.1749, "step": 1783 }, { "epoch": 1.099537750385208, "grad_norm": 1.7550326585769653, "learning_rate": 1.0005381282120532e-05, "loss": 1.2128, "step": 1784 }, { "epoch": 1.1001540832049306, "grad_norm": 1.7421151399612427, "learning_rate": 9.994618717879473e-06, "loss": 1.1808, "step": 1785 }, { "epoch": 1.1007704160246532, "grad_norm": 1.8551565408706665, "learning_rate": 9.983856159871701e-06, "loss": 1.1867, "step": 1786 }, { "epoch": 1.101386748844376, "grad_norm": 1.751265048980713, "learning_rate": 9.973093620563795e-06, "loss": 1.0931, "step": 1787 }, { "epoch": 1.1020030816640987, "grad_norm": 1.7458369731903076, "learning_rate": 9.962331112422293e-06, "loss": 1.1255, "step": 1788 }, { "epoch": 1.1026194144838213, "grad_norm": 1.7775629758834839, "learning_rate": 9.951568647913718e-06, "loss": 1.2343, "step": 1789 }, { "epoch": 1.103235747303544, "grad_norm": 1.8691169023513794, "learning_rate": 9.940806239504528e-06, "loss": 1.2195, "step": 1790 }, { "epoch": 1.1038520801232665, "grad_norm": 1.7444987297058105, "learning_rate": 9.93004389966112e-06, "loss": 1.1886, "step": 1791 }, { "epoch": 1.1044684129429891, "grad_norm": 1.74837064743042, "learning_rate": 9.919281640849813e-06, "loss": 1.1414, "step": 1792 }, { "epoch": 1.1050847457627118, "grad_norm": 1.7520579099655151, "learning_rate": 9.908519475536838e-06, "loss": 1.2339, "step": 1793 }, { "epoch": 1.1057010785824346, "grad_norm": 1.6795350313186646, "learning_rate": 9.897757416188303e-06, "loss": 1.1317, "step": 1794 }, { "epoch": 1.1063174114021572, "grad_norm": 1.7371749877929688, "learning_rate": 9.886995475270205e-06, "loss": 1.1875, "step": 1795 }, { "epoch": 1.1069337442218798, "grad_norm": 1.75054931640625, "learning_rate": 9.876233665248398e-06, "loss": 1.1083, "step": 1796 }, { "epoch": 1.1075500770416025, "grad_norm": 1.7307846546173096, "learning_rate": 9.865471998588588e-06, "loss": 1.125, "step": 1797 }, { "epoch": 1.108166409861325, "grad_norm": 1.7600802183151245, "learning_rate": 9.854710487756318e-06, "loss": 1.1454, "step": 1798 }, { "epoch": 1.1087827426810477, "grad_norm": 1.7342371940612793, "learning_rate": 9.843949145216938e-06, "loss": 1.2014, "step": 1799 }, { "epoch": 1.1093990755007703, "grad_norm": 1.7079304456710815, "learning_rate": 9.83318798343562e-06, "loss": 1.1457, "step": 1800 }, { "epoch": 1.1100154083204932, "grad_norm": 1.7480080127716064, "learning_rate": 9.82242701487731e-06, "loss": 1.2165, "step": 1801 }, { "epoch": 1.1106317411402158, "grad_norm": 1.7480674982070923, "learning_rate": 9.811666252006742e-06, "loss": 1.2194, "step": 1802 }, { "epoch": 1.1112480739599384, "grad_norm": 1.7608528137207031, "learning_rate": 9.800905707288408e-06, "loss": 1.1865, "step": 1803 }, { "epoch": 1.111864406779661, "grad_norm": 1.7697417736053467, "learning_rate": 9.790145393186542e-06, "loss": 1.1243, "step": 1804 }, { "epoch": 1.1124807395993837, "grad_norm": 1.7686156034469604, "learning_rate": 9.779385322165121e-06, "loss": 1.2208, "step": 1805 }, { "epoch": 1.1130970724191063, "grad_norm": 1.7410653829574585, "learning_rate": 9.768625506687838e-06, "loss": 1.102, "step": 1806 }, { "epoch": 1.113713405238829, "grad_norm": 1.746486783027649, "learning_rate": 9.757865959218083e-06, "loss": 1.1408, "step": 1807 }, { "epoch": 1.1143297380585517, "grad_norm": 1.7874982357025146, "learning_rate": 9.74710669221894e-06, "loss": 1.1756, "step": 1808 }, { "epoch": 1.1149460708782744, "grad_norm": 1.666343331336975, "learning_rate": 9.73634771815317e-06, "loss": 1.1227, "step": 1809 }, { "epoch": 1.115562403697997, "grad_norm": 1.7368969917297363, "learning_rate": 9.725589049483186e-06, "loss": 1.1381, "step": 1810 }, { "epoch": 1.1161787365177196, "grad_norm": 1.7172225713729858, "learning_rate": 9.714830698671065e-06, "loss": 1.2389, "step": 1811 }, { "epoch": 1.1167950693374422, "grad_norm": 1.7442600727081299, "learning_rate": 9.704072678178495e-06, "loss": 1.1419, "step": 1812 }, { "epoch": 1.1174114021571648, "grad_norm": 1.7055429220199585, "learning_rate": 9.6933150004668e-06, "loss": 1.1554, "step": 1813 }, { "epoch": 1.1180277349768875, "grad_norm": 1.8012456893920898, "learning_rate": 9.682557677996893e-06, "loss": 1.2002, "step": 1814 }, { "epoch": 1.11864406779661, "grad_norm": 1.794052243232727, "learning_rate": 9.67180072322928e-06, "loss": 1.143, "step": 1815 }, { "epoch": 1.119260400616333, "grad_norm": 1.8666329383850098, "learning_rate": 9.661044148624038e-06, "loss": 1.2665, "step": 1816 }, { "epoch": 1.1198767334360555, "grad_norm": 1.8370088338851929, "learning_rate": 9.650287966640817e-06, "loss": 1.2296, "step": 1817 }, { "epoch": 1.1204930662557782, "grad_norm": 1.7418245077133179, "learning_rate": 9.639532189738801e-06, "loss": 1.133, "step": 1818 }, { "epoch": 1.1211093990755008, "grad_norm": 1.741837501525879, "learning_rate": 9.628776830376698e-06, "loss": 1.0871, "step": 1819 }, { "epoch": 1.1217257318952234, "grad_norm": 1.7431105375289917, "learning_rate": 9.61802190101275e-06, "loss": 1.1046, "step": 1820 }, { "epoch": 1.122342064714946, "grad_norm": 1.8066598176956177, "learning_rate": 9.607267414104684e-06, "loss": 1.1706, "step": 1821 }, { "epoch": 1.1229583975346686, "grad_norm": 1.770628571510315, "learning_rate": 9.596513382109726e-06, "loss": 1.1385, "step": 1822 }, { "epoch": 1.1235747303543913, "grad_norm": 1.7723420858383179, "learning_rate": 9.58575981748457e-06, "loss": 1.2206, "step": 1823 }, { "epoch": 1.124191063174114, "grad_norm": 1.7622805833816528, "learning_rate": 9.575006732685375e-06, "loss": 1.1303, "step": 1824 }, { "epoch": 1.1248073959938367, "grad_norm": 1.7585525512695312, "learning_rate": 9.564254140167732e-06, "loss": 1.1241, "step": 1825 }, { "epoch": 1.1254237288135593, "grad_norm": 1.7792818546295166, "learning_rate": 9.553502052386673e-06, "loss": 1.2119, "step": 1826 }, { "epoch": 1.126040061633282, "grad_norm": 1.7811164855957031, "learning_rate": 9.542750481796641e-06, "loss": 1.1666, "step": 1827 }, { "epoch": 1.1266563944530046, "grad_norm": 1.772862195968628, "learning_rate": 9.531999440851474e-06, "loss": 1.1568, "step": 1828 }, { "epoch": 1.1272727272727272, "grad_norm": 1.7361563444137573, "learning_rate": 9.521248942004408e-06, "loss": 1.1459, "step": 1829 }, { "epoch": 1.1278890600924498, "grad_norm": 1.7397193908691406, "learning_rate": 9.510498997708048e-06, "loss": 1.1491, "step": 1830 }, { "epoch": 1.1285053929121727, "grad_norm": 1.8290362358093262, "learning_rate": 9.499749620414353e-06, "loss": 1.1508, "step": 1831 }, { "epoch": 1.1291217257318953, "grad_norm": 1.7763103246688843, "learning_rate": 9.48900082257462e-06, "loss": 1.0956, "step": 1832 }, { "epoch": 1.129738058551618, "grad_norm": 1.8184707164764404, "learning_rate": 9.47825261663949e-06, "loss": 1.1944, "step": 1833 }, { "epoch": 1.1303543913713405, "grad_norm": 1.7699685096740723, "learning_rate": 9.467505015058901e-06, "loss": 1.1311, "step": 1834 }, { "epoch": 1.1309707241910631, "grad_norm": 1.784938097000122, "learning_rate": 9.45675803028211e-06, "loss": 1.134, "step": 1835 }, { "epoch": 1.1315870570107858, "grad_norm": 1.7624138593673706, "learning_rate": 9.446011674757641e-06, "loss": 1.1594, "step": 1836 }, { "epoch": 1.1322033898305084, "grad_norm": 1.8356959819793701, "learning_rate": 9.435265960933304e-06, "loss": 1.1893, "step": 1837 }, { "epoch": 1.1328197226502312, "grad_norm": 1.761669397354126, "learning_rate": 9.424520901256152e-06, "loss": 1.1071, "step": 1838 }, { "epoch": 1.1334360554699539, "grad_norm": 1.8249238729476929, "learning_rate": 9.41377650817249e-06, "loss": 1.1396, "step": 1839 }, { "epoch": 1.1340523882896765, "grad_norm": 1.8210721015930176, "learning_rate": 9.40303279412785e-06, "loss": 1.1586, "step": 1840 }, { "epoch": 1.134668721109399, "grad_norm": 1.7769678831100464, "learning_rate": 9.39228977156697e-06, "loss": 1.1154, "step": 1841 }, { "epoch": 1.1352850539291217, "grad_norm": 1.7728196382522583, "learning_rate": 9.381547452933801e-06, "loss": 1.156, "step": 1842 }, { "epoch": 1.1359013867488443, "grad_norm": 1.7344157695770264, "learning_rate": 9.370805850671464e-06, "loss": 1.1547, "step": 1843 }, { "epoch": 1.136517719568567, "grad_norm": 1.7826488018035889, "learning_rate": 9.360064977222262e-06, "loss": 1.166, "step": 1844 }, { "epoch": 1.1371340523882898, "grad_norm": 1.7950408458709717, "learning_rate": 9.349324845027644e-06, "loss": 1.1285, "step": 1845 }, { "epoch": 1.1377503852080124, "grad_norm": 1.747721791267395, "learning_rate": 9.338585466528205e-06, "loss": 1.1129, "step": 1846 }, { "epoch": 1.138366718027735, "grad_norm": 1.800897479057312, "learning_rate": 9.327846854163663e-06, "loss": 1.174, "step": 1847 }, { "epoch": 1.1389830508474577, "grad_norm": 1.7975921630859375, "learning_rate": 9.317109020372862e-06, "loss": 1.1258, "step": 1848 }, { "epoch": 1.1395993836671803, "grad_norm": 1.8284772634506226, "learning_rate": 9.306371977593727e-06, "loss": 1.1215, "step": 1849 }, { "epoch": 1.140215716486903, "grad_norm": 1.87111234664917, "learning_rate": 9.295635738263277e-06, "loss": 1.2039, "step": 1850 }, { "epoch": 1.1408320493066255, "grad_norm": 1.826024055480957, "learning_rate": 9.284900314817596e-06, "loss": 1.1676, "step": 1851 }, { "epoch": 1.1414483821263481, "grad_norm": 1.8051592111587524, "learning_rate": 9.274165719691824e-06, "loss": 1.0962, "step": 1852 }, { "epoch": 1.1420647149460708, "grad_norm": 1.8918606042861938, "learning_rate": 9.26343196532014e-06, "loss": 1.2298, "step": 1853 }, { "epoch": 1.1426810477657936, "grad_norm": 1.81473970413208, "learning_rate": 9.252699064135759e-06, "loss": 1.0513, "step": 1854 }, { "epoch": 1.1432973805855162, "grad_norm": 1.8110792636871338, "learning_rate": 9.241967028570893e-06, "loss": 1.1481, "step": 1855 }, { "epoch": 1.1439137134052388, "grad_norm": 1.7994881868362427, "learning_rate": 9.231235871056758e-06, "loss": 1.1017, "step": 1856 }, { "epoch": 1.1445300462249615, "grad_norm": 1.856048822402954, "learning_rate": 9.220505604023558e-06, "loss": 1.193, "step": 1857 }, { "epoch": 1.145146379044684, "grad_norm": 1.849790334701538, "learning_rate": 9.209776239900453e-06, "loss": 1.1728, "step": 1858 }, { "epoch": 1.1457627118644067, "grad_norm": 1.7638351917266846, "learning_rate": 9.199047791115569e-06, "loss": 1.1397, "step": 1859 }, { "epoch": 1.1463790446841293, "grad_norm": 1.789586067199707, "learning_rate": 9.18832027009597e-06, "loss": 1.106, "step": 1860 }, { "epoch": 1.1469953775038522, "grad_norm": 1.8176939487457275, "learning_rate": 9.17759368926764e-06, "loss": 1.1297, "step": 1861 }, { "epoch": 1.1476117103235748, "grad_norm": 1.7629731893539429, "learning_rate": 9.166868061055476e-06, "loss": 1.1517, "step": 1862 }, { "epoch": 1.1482280431432974, "grad_norm": 1.8033028841018677, "learning_rate": 9.156143397883276e-06, "loss": 1.0981, "step": 1863 }, { "epoch": 1.14884437596302, "grad_norm": 1.8348621129989624, "learning_rate": 9.145419712173714e-06, "loss": 1.2065, "step": 1864 }, { "epoch": 1.1494607087827426, "grad_norm": 1.788564920425415, "learning_rate": 9.134697016348328e-06, "loss": 1.1237, "step": 1865 }, { "epoch": 1.1500770416024653, "grad_norm": 1.8244892358779907, "learning_rate": 9.123975322827524e-06, "loss": 1.1572, "step": 1866 }, { "epoch": 1.1506933744221879, "grad_norm": 1.7659651041030884, "learning_rate": 9.113254644030538e-06, "loss": 1.1588, "step": 1867 }, { "epoch": 1.1513097072419107, "grad_norm": 1.7614327669143677, "learning_rate": 9.102534992375432e-06, "loss": 1.1385, "step": 1868 }, { "epoch": 1.1519260400616333, "grad_norm": 1.8464422225952148, "learning_rate": 9.09181638027907e-06, "loss": 1.1921, "step": 1869 }, { "epoch": 1.152542372881356, "grad_norm": 1.8207978010177612, "learning_rate": 9.081098820157129e-06, "loss": 1.1312, "step": 1870 }, { "epoch": 1.1531587057010786, "grad_norm": 1.786438226699829, "learning_rate": 9.070382324424047e-06, "loss": 1.1394, "step": 1871 }, { "epoch": 1.1537750385208012, "grad_norm": 1.7761427164077759, "learning_rate": 9.059666905493051e-06, "loss": 1.1608, "step": 1872 }, { "epoch": 1.1543913713405238, "grad_norm": 1.7810614109039307, "learning_rate": 9.048952575776102e-06, "loss": 1.1538, "step": 1873 }, { "epoch": 1.1550077041602465, "grad_norm": 1.7345532178878784, "learning_rate": 9.038239347683912e-06, "loss": 1.1238, "step": 1874 }, { "epoch": 1.1556240369799693, "grad_norm": 1.80329430103302, "learning_rate": 9.02752723362591e-06, "loss": 1.1624, "step": 1875 }, { "epoch": 1.156240369799692, "grad_norm": 1.7871588468551636, "learning_rate": 9.016816246010233e-06, "loss": 1.1301, "step": 1876 }, { "epoch": 1.1568567026194145, "grad_norm": 1.7864456176757812, "learning_rate": 9.006106397243724e-06, "loss": 1.2297, "step": 1877 }, { "epoch": 1.1574730354391372, "grad_norm": 1.782387137413025, "learning_rate": 8.995397699731889e-06, "loss": 1.1756, "step": 1878 }, { "epoch": 1.1580893682588598, "grad_norm": 1.7530053853988647, "learning_rate": 8.98469016587892e-06, "loss": 1.1537, "step": 1879 }, { "epoch": 1.1587057010785824, "grad_norm": 1.7345830202102661, "learning_rate": 8.97398380808765e-06, "loss": 1.0717, "step": 1880 }, { "epoch": 1.159322033898305, "grad_norm": 1.889941930770874, "learning_rate": 8.963278638759552e-06, "loss": 1.2046, "step": 1881 }, { "epoch": 1.1599383667180279, "grad_norm": 1.8020962476730347, "learning_rate": 8.95257467029472e-06, "loss": 1.1432, "step": 1882 }, { "epoch": 1.1605546995377505, "grad_norm": 1.7620140314102173, "learning_rate": 8.941871915091862e-06, "loss": 1.0928, "step": 1883 }, { "epoch": 1.161171032357473, "grad_norm": 1.82870614528656, "learning_rate": 8.931170385548272e-06, "loss": 1.1198, "step": 1884 }, { "epoch": 1.1617873651771957, "grad_norm": 1.789713740348816, "learning_rate": 8.92047009405984e-06, "loss": 1.1195, "step": 1885 }, { "epoch": 1.1624036979969183, "grad_norm": 1.8409415483474731, "learning_rate": 8.909771053021003e-06, "loss": 1.2081, "step": 1886 }, { "epoch": 1.163020030816641, "grad_norm": 1.961564540863037, "learning_rate": 8.899073274824765e-06, "loss": 1.1774, "step": 1887 }, { "epoch": 1.1636363636363636, "grad_norm": 1.7843871116638184, "learning_rate": 8.888376771862656e-06, "loss": 1.0963, "step": 1888 }, { "epoch": 1.1642526964560862, "grad_norm": 1.8358018398284912, "learning_rate": 8.877681556524737e-06, "loss": 1.113, "step": 1889 }, { "epoch": 1.1648690292758088, "grad_norm": 1.7934008836746216, "learning_rate": 8.86698764119957e-06, "loss": 1.1643, "step": 1890 }, { "epoch": 1.1654853620955317, "grad_norm": 1.7898404598236084, "learning_rate": 8.85629503827422e-06, "loss": 1.168, "step": 1891 }, { "epoch": 1.1661016949152543, "grad_norm": 1.8463091850280762, "learning_rate": 8.84560376013423e-06, "loss": 1.1394, "step": 1892 }, { "epoch": 1.166718027734977, "grad_norm": 1.8168009519577026, "learning_rate": 8.834913819163596e-06, "loss": 1.1665, "step": 1893 }, { "epoch": 1.1673343605546995, "grad_norm": 1.8142045736312866, "learning_rate": 8.824225227744782e-06, "loss": 1.1435, "step": 1894 }, { "epoch": 1.1679506933744221, "grad_norm": 1.7923039197921753, "learning_rate": 8.813537998258682e-06, "loss": 1.0966, "step": 1895 }, { "epoch": 1.1685670261941448, "grad_norm": 1.8147804737091064, "learning_rate": 8.802852143084604e-06, "loss": 1.0902, "step": 1896 }, { "epoch": 1.1691833590138674, "grad_norm": 1.837048053741455, "learning_rate": 8.79216767460028e-06, "loss": 1.1944, "step": 1897 }, { "epoch": 1.1697996918335902, "grad_norm": 1.7532716989517212, "learning_rate": 8.781484605181825e-06, "loss": 1.1117, "step": 1898 }, { "epoch": 1.1704160246533128, "grad_norm": 1.7426851987838745, "learning_rate": 8.770802947203737e-06, "loss": 1.0765, "step": 1899 }, { "epoch": 1.1710323574730355, "grad_norm": 1.8615989685058594, "learning_rate": 8.76012271303888e-06, "loss": 1.1037, "step": 1900 }, { "epoch": 1.171648690292758, "grad_norm": 1.7652592658996582, "learning_rate": 8.749443915058467e-06, "loss": 1.0899, "step": 1901 }, { "epoch": 1.1722650231124807, "grad_norm": 1.694689154624939, "learning_rate": 8.738766565632038e-06, "loss": 1.0853, "step": 1902 }, { "epoch": 1.1728813559322033, "grad_norm": 1.8205093145370483, "learning_rate": 8.728090677127477e-06, "loss": 1.2141, "step": 1903 }, { "epoch": 1.173497688751926, "grad_norm": 1.7508158683776855, "learning_rate": 8.71741626191096e-06, "loss": 1.158, "step": 1904 }, { "epoch": 1.1741140215716488, "grad_norm": 1.796246886253357, "learning_rate": 8.706743332346958e-06, "loss": 1.1161, "step": 1905 }, { "epoch": 1.1747303543913714, "grad_norm": 1.8634192943572998, "learning_rate": 8.696071900798223e-06, "loss": 1.1774, "step": 1906 }, { "epoch": 1.175346687211094, "grad_norm": 1.8833073377609253, "learning_rate": 8.685401979625773e-06, "loss": 1.1787, "step": 1907 }, { "epoch": 1.1759630200308167, "grad_norm": 1.890223741531372, "learning_rate": 8.674733581188872e-06, "loss": 1.2277, "step": 1908 }, { "epoch": 1.1765793528505393, "grad_norm": 1.8512643575668335, "learning_rate": 8.66406671784503e-06, "loss": 1.1338, "step": 1909 }, { "epoch": 1.177195685670262, "grad_norm": 1.8430699110031128, "learning_rate": 8.653401401949966e-06, "loss": 1.1825, "step": 1910 }, { "epoch": 1.1778120184899845, "grad_norm": 1.7726476192474365, "learning_rate": 8.642737645857617e-06, "loss": 1.1213, "step": 1911 }, { "epoch": 1.1784283513097074, "grad_norm": 1.8244842290878296, "learning_rate": 8.632075461920109e-06, "loss": 1.1685, "step": 1912 }, { "epoch": 1.17904468412943, "grad_norm": 1.866361379623413, "learning_rate": 8.621414862487741e-06, "loss": 1.1446, "step": 1913 }, { "epoch": 1.1796610169491526, "grad_norm": 1.830185055732727, "learning_rate": 8.61075585990899e-06, "loss": 1.1149, "step": 1914 }, { "epoch": 1.1802773497688752, "grad_norm": 1.8399386405944824, "learning_rate": 8.60009846653047e-06, "loss": 1.1814, "step": 1915 }, { "epoch": 1.1808936825885978, "grad_norm": 1.8324203491210938, "learning_rate": 8.589442694696944e-06, "loss": 1.1564, "step": 1916 }, { "epoch": 1.1815100154083205, "grad_norm": 1.817185640335083, "learning_rate": 8.578788556751282e-06, "loss": 1.1639, "step": 1917 }, { "epoch": 1.182126348228043, "grad_norm": 1.7755348682403564, "learning_rate": 8.568136065034479e-06, "loss": 1.1863, "step": 1918 }, { "epoch": 1.1827426810477657, "grad_norm": 1.776236653327942, "learning_rate": 8.557485231885601e-06, "loss": 1.1137, "step": 1919 }, { "epoch": 1.1833590138674885, "grad_norm": 1.7917600870132446, "learning_rate": 8.546836069641815e-06, "loss": 1.0892, "step": 1920 }, { "epoch": 1.1839753466872112, "grad_norm": 1.9051663875579834, "learning_rate": 8.536188590638334e-06, "loss": 1.1788, "step": 1921 }, { "epoch": 1.1845916795069338, "grad_norm": 1.8405485153198242, "learning_rate": 8.52554280720844e-06, "loss": 1.1429, "step": 1922 }, { "epoch": 1.1852080123266564, "grad_norm": 1.8784586191177368, "learning_rate": 8.514898731683431e-06, "loss": 1.1841, "step": 1923 }, { "epoch": 1.185824345146379, "grad_norm": 1.8526698350906372, "learning_rate": 8.504256376392647e-06, "loss": 1.2118, "step": 1924 }, { "epoch": 1.1864406779661016, "grad_norm": 1.8220551013946533, "learning_rate": 8.493615753663417e-06, "loss": 1.1099, "step": 1925 }, { "epoch": 1.1870570107858243, "grad_norm": 1.8374228477478027, "learning_rate": 8.482976875821072e-06, "loss": 1.1382, "step": 1926 }, { "epoch": 1.1876733436055469, "grad_norm": 1.7971607446670532, "learning_rate": 8.47233975518892e-06, "loss": 1.1333, "step": 1927 }, { "epoch": 1.1882896764252697, "grad_norm": 1.8098211288452148, "learning_rate": 8.46170440408824e-06, "loss": 1.1584, "step": 1928 }, { "epoch": 1.1889060092449923, "grad_norm": 1.8533012866973877, "learning_rate": 8.451070834838254e-06, "loss": 1.1697, "step": 1929 }, { "epoch": 1.189522342064715, "grad_norm": 1.8473211526870728, "learning_rate": 8.440439059756116e-06, "loss": 1.1156, "step": 1930 }, { "epoch": 1.1901386748844376, "grad_norm": 1.803095817565918, "learning_rate": 8.429809091156915e-06, "loss": 1.1283, "step": 1931 }, { "epoch": 1.1907550077041602, "grad_norm": 1.8986005783081055, "learning_rate": 8.419180941353638e-06, "loss": 1.1508, "step": 1932 }, { "epoch": 1.1913713405238828, "grad_norm": 1.846991777420044, "learning_rate": 8.40855462265716e-06, "loss": 1.107, "step": 1933 }, { "epoch": 1.1919876733436054, "grad_norm": 1.8725359439849854, "learning_rate": 8.397930147376252e-06, "loss": 1.1358, "step": 1934 }, { "epoch": 1.1926040061633283, "grad_norm": 1.8242146968841553, "learning_rate": 8.38730752781754e-06, "loss": 1.1172, "step": 1935 }, { "epoch": 1.193220338983051, "grad_norm": 1.8275387287139893, "learning_rate": 8.376686776285493e-06, "loss": 1.1537, "step": 1936 }, { "epoch": 1.1938366718027735, "grad_norm": 1.7885950803756714, "learning_rate": 8.366067905082432e-06, "loss": 1.1038, "step": 1937 }, { "epoch": 1.1944530046224962, "grad_norm": 1.8306902647018433, "learning_rate": 8.355450926508487e-06, "loss": 1.1238, "step": 1938 }, { "epoch": 1.1950693374422188, "grad_norm": 1.8852002620697021, "learning_rate": 8.344835852861595e-06, "loss": 1.1539, "step": 1939 }, { "epoch": 1.1956856702619414, "grad_norm": 1.8789499998092651, "learning_rate": 8.334222696437502e-06, "loss": 1.1667, "step": 1940 }, { "epoch": 1.196302003081664, "grad_norm": 1.8008283376693726, "learning_rate": 8.32361146952972e-06, "loss": 1.1073, "step": 1941 }, { "epoch": 1.1969183359013869, "grad_norm": 1.7733309268951416, "learning_rate": 8.313002184429529e-06, "loss": 1.0809, "step": 1942 }, { "epoch": 1.1975346687211095, "grad_norm": 1.8066035509109497, "learning_rate": 8.302394853425956e-06, "loss": 1.1216, "step": 1943 }, { "epoch": 1.198151001540832, "grad_norm": 1.8129280805587769, "learning_rate": 8.291789488805776e-06, "loss": 1.1543, "step": 1944 }, { "epoch": 1.1987673343605547, "grad_norm": 1.8172587156295776, "learning_rate": 8.281186102853466e-06, "loss": 1.1472, "step": 1945 }, { "epoch": 1.1993836671802773, "grad_norm": 1.7605327367782593, "learning_rate": 8.270584707851237e-06, "loss": 1.1403, "step": 1946 }, { "epoch": 1.2, "grad_norm": 1.8558827638626099, "learning_rate": 8.259985316078972e-06, "loss": 1.1452, "step": 1947 }, { "epoch": 1.2006163328197226, "grad_norm": 1.8512718677520752, "learning_rate": 8.249387939814248e-06, "loss": 1.1469, "step": 1948 }, { "epoch": 1.2012326656394454, "grad_norm": 1.8816415071487427, "learning_rate": 8.2387925913323e-06, "loss": 1.152, "step": 1949 }, { "epoch": 1.201848998459168, "grad_norm": 1.8391399383544922, "learning_rate": 8.22819928290601e-06, "loss": 1.1232, "step": 1950 }, { "epoch": 1.2024653312788907, "grad_norm": 1.8567638397216797, "learning_rate": 8.21760802680591e-06, "loss": 1.1654, "step": 1951 }, { "epoch": 1.2030816640986133, "grad_norm": 1.9173164367675781, "learning_rate": 8.20701883530014e-06, "loss": 1.1762, "step": 1952 }, { "epoch": 1.203697996918336, "grad_norm": 1.9190682172775269, "learning_rate": 8.196431720654466e-06, "loss": 1.2301, "step": 1953 }, { "epoch": 1.2043143297380585, "grad_norm": 1.8192946910858154, "learning_rate": 8.185846695132226e-06, "loss": 1.1376, "step": 1954 }, { "epoch": 1.2049306625577811, "grad_norm": 1.823595404624939, "learning_rate": 8.175263770994362e-06, "loss": 1.1558, "step": 1955 }, { "epoch": 1.2055469953775038, "grad_norm": 1.7951470613479614, "learning_rate": 8.164682960499362e-06, "loss": 1.1968, "step": 1956 }, { "epoch": 1.2061633281972264, "grad_norm": 1.8093249797821045, "learning_rate": 8.15410427590328e-06, "loss": 1.1294, "step": 1957 }, { "epoch": 1.2067796610169492, "grad_norm": 1.8136457204818726, "learning_rate": 8.14352772945969e-06, "loss": 1.1392, "step": 1958 }, { "epoch": 1.2073959938366718, "grad_norm": 1.7840667963027954, "learning_rate": 8.132953333419715e-06, "loss": 1.121, "step": 1959 }, { "epoch": 1.2080123266563945, "grad_norm": 1.8186137676239014, "learning_rate": 8.122381100031967e-06, "loss": 1.089, "step": 1960 }, { "epoch": 1.208628659476117, "grad_norm": 1.813576102256775, "learning_rate": 8.111811041542557e-06, "loss": 1.1817, "step": 1961 }, { "epoch": 1.2092449922958397, "grad_norm": 1.8262306451797485, "learning_rate": 8.101243170195081e-06, "loss": 1.0775, "step": 1962 }, { "epoch": 1.2098613251155623, "grad_norm": 1.861322045326233, "learning_rate": 8.090677498230598e-06, "loss": 1.135, "step": 1963 }, { "epoch": 1.210477657935285, "grad_norm": 1.8071485757827759, "learning_rate": 8.080114037887615e-06, "loss": 1.1129, "step": 1964 }, { "epoch": 1.2110939907550078, "grad_norm": 1.9230101108551025, "learning_rate": 8.069552801402095e-06, "loss": 1.1663, "step": 1965 }, { "epoch": 1.2117103235747304, "grad_norm": 1.830327033996582, "learning_rate": 8.058993801007406e-06, "loss": 1.0966, "step": 1966 }, { "epoch": 1.212326656394453, "grad_norm": 1.8074411153793335, "learning_rate": 8.04843704893433e-06, "loss": 1.0904, "step": 1967 }, { "epoch": 1.2129429892141756, "grad_norm": 1.8767496347427368, "learning_rate": 8.037882557411051e-06, "loss": 1.1237, "step": 1968 }, { "epoch": 1.2135593220338983, "grad_norm": 1.909767985343933, "learning_rate": 8.027330338663133e-06, "loss": 1.1245, "step": 1969 }, { "epoch": 1.214175654853621, "grad_norm": 1.912820816040039, "learning_rate": 8.016780404913497e-06, "loss": 1.1812, "step": 1970 }, { "epoch": 1.2147919876733435, "grad_norm": 1.929832935333252, "learning_rate": 8.006232768382432e-06, "loss": 1.2204, "step": 1971 }, { "epoch": 1.2154083204930664, "grad_norm": 1.892865538597107, "learning_rate": 7.99568744128756e-06, "loss": 1.1698, "step": 1972 }, { "epoch": 1.216024653312789, "grad_norm": 1.886177897453308, "learning_rate": 7.985144435843826e-06, "loss": 1.1689, "step": 1973 }, { "epoch": 1.2166409861325116, "grad_norm": 1.8224637508392334, "learning_rate": 7.97460376426349e-06, "loss": 1.137, "step": 1974 }, { "epoch": 1.2172573189522342, "grad_norm": 1.8537793159484863, "learning_rate": 7.964065438756099e-06, "loss": 1.1097, "step": 1975 }, { "epoch": 1.2178736517719568, "grad_norm": 1.8391259908676147, "learning_rate": 7.953529471528492e-06, "loss": 1.1142, "step": 1976 }, { "epoch": 1.2184899845916795, "grad_norm": 1.8463839292526245, "learning_rate": 7.942995874784775e-06, "loss": 1.1226, "step": 1977 }, { "epoch": 1.219106317411402, "grad_norm": 1.7552274465560913, "learning_rate": 7.932464660726311e-06, "loss": 1.0874, "step": 1978 }, { "epoch": 1.219722650231125, "grad_norm": 1.8354761600494385, "learning_rate": 7.921935841551696e-06, "loss": 1.1625, "step": 1979 }, { "epoch": 1.2203389830508475, "grad_norm": 1.8153932094573975, "learning_rate": 7.91140942945675e-06, "loss": 1.1338, "step": 1980 }, { "epoch": 1.2209553158705702, "grad_norm": 1.8420171737670898, "learning_rate": 7.900885436634515e-06, "loss": 1.0879, "step": 1981 }, { "epoch": 1.2215716486902928, "grad_norm": 1.8687227964401245, "learning_rate": 7.890363875275222e-06, "loss": 1.18, "step": 1982 }, { "epoch": 1.2221879815100154, "grad_norm": 1.9812421798706055, "learning_rate": 7.879844757566292e-06, "loss": 1.1578, "step": 1983 }, { "epoch": 1.222804314329738, "grad_norm": 1.8564053773880005, "learning_rate": 7.869328095692313e-06, "loss": 1.1728, "step": 1984 }, { "epoch": 1.2234206471494606, "grad_norm": 1.836758017539978, "learning_rate": 7.858813901835026e-06, "loss": 1.1274, "step": 1985 }, { "epoch": 1.2240369799691835, "grad_norm": 1.8581491708755493, "learning_rate": 7.848302188173314e-06, "loss": 1.1349, "step": 1986 }, { "epoch": 1.224653312788906, "grad_norm": 1.8910939693450928, "learning_rate": 7.83779296688319e-06, "loss": 1.111, "step": 1987 }, { "epoch": 1.2252696456086287, "grad_norm": 1.8790580034255981, "learning_rate": 7.827286250137776e-06, "loss": 1.1242, "step": 1988 }, { "epoch": 1.2258859784283513, "grad_norm": 1.8605077266693115, "learning_rate": 7.816782050107295e-06, "loss": 1.1468, "step": 1989 }, { "epoch": 1.226502311248074, "grad_norm": 1.9218624830245972, "learning_rate": 7.80628037895906e-06, "loss": 1.1437, "step": 1990 }, { "epoch": 1.2271186440677966, "grad_norm": 1.868048071861267, "learning_rate": 7.795781248857442e-06, "loss": 1.137, "step": 1991 }, { "epoch": 1.2277349768875192, "grad_norm": 1.8974031209945679, "learning_rate": 7.785284671963884e-06, "loss": 1.1908, "step": 1992 }, { "epoch": 1.2283513097072418, "grad_norm": 1.8901944160461426, "learning_rate": 7.774790660436857e-06, "loss": 1.1805, "step": 1993 }, { "epoch": 1.2289676425269644, "grad_norm": 1.8425767421722412, "learning_rate": 7.764299226431873e-06, "loss": 1.153, "step": 1994 }, { "epoch": 1.2295839753466873, "grad_norm": 1.852847695350647, "learning_rate": 7.753810382101444e-06, "loss": 1.1143, "step": 1995 }, { "epoch": 1.23020030816641, "grad_norm": 1.8267360925674438, "learning_rate": 7.743324139595102e-06, "loss": 1.083, "step": 1996 }, { "epoch": 1.2308166409861325, "grad_norm": 1.824185848236084, "learning_rate": 7.732840511059346e-06, "loss": 1.1585, "step": 1997 }, { "epoch": 1.2314329738058551, "grad_norm": 1.8699235916137695, "learning_rate": 7.72235950863766e-06, "loss": 1.1454, "step": 1998 }, { "epoch": 1.2320493066255778, "grad_norm": 1.8451159000396729, "learning_rate": 7.711881144470481e-06, "loss": 1.1478, "step": 1999 }, { "epoch": 1.2326656394453004, "grad_norm": 1.8291648626327515, "learning_rate": 7.701405430695185e-06, "loss": 1.1683, "step": 2000 }, { "epoch": 1.233281972265023, "grad_norm": 1.864972472190857, "learning_rate": 7.690932379446087e-06, "loss": 1.1284, "step": 2001 }, { "epoch": 1.2338983050847459, "grad_norm": 1.8956785202026367, "learning_rate": 7.680462002854418e-06, "loss": 1.1385, "step": 2002 }, { "epoch": 1.2345146379044685, "grad_norm": 1.9941411018371582, "learning_rate": 7.669994313048303e-06, "loss": 1.0923, "step": 2003 }, { "epoch": 1.235130970724191, "grad_norm": 1.9455927610397339, "learning_rate": 7.65952932215276e-06, "loss": 1.1587, "step": 2004 }, { "epoch": 1.2357473035439137, "grad_norm": 1.906812310218811, "learning_rate": 7.649067042289681e-06, "loss": 1.1937, "step": 2005 }, { "epoch": 1.2363636363636363, "grad_norm": 1.8503549098968506, "learning_rate": 7.638607485577816e-06, "loss": 1.116, "step": 2006 }, { "epoch": 1.236979969183359, "grad_norm": 1.8868077993392944, "learning_rate": 7.628150664132755e-06, "loss": 1.1116, "step": 2007 }, { "epoch": 1.2375963020030816, "grad_norm": 1.8573330640792847, "learning_rate": 7.6176965900669356e-06, "loss": 1.1021, "step": 2008 }, { "epoch": 1.2382126348228044, "grad_norm": 1.858217716217041, "learning_rate": 7.6072452754896e-06, "loss": 1.1272, "step": 2009 }, { "epoch": 1.238828967642527, "grad_norm": 1.8204230070114136, "learning_rate": 7.596796732506797e-06, "loss": 1.0908, "step": 2010 }, { "epoch": 1.2394453004622497, "grad_norm": 1.8699613809585571, "learning_rate": 7.586350973221363e-06, "loss": 1.1187, "step": 2011 }, { "epoch": 1.2400616332819723, "grad_norm": 1.8638288974761963, "learning_rate": 7.575908009732918e-06, "loss": 1.1238, "step": 2012 }, { "epoch": 1.240677966101695, "grad_norm": 1.8669801950454712, "learning_rate": 7.565467854137829e-06, "loss": 1.1041, "step": 2013 }, { "epoch": 1.2412942989214175, "grad_norm": 1.8350132703781128, "learning_rate": 7.5550305185292275e-06, "loss": 1.0852, "step": 2014 }, { "epoch": 1.2419106317411401, "grad_norm": 1.9158581495285034, "learning_rate": 7.544596014996968e-06, "loss": 1.18, "step": 2015 }, { "epoch": 1.242526964560863, "grad_norm": 1.854806661605835, "learning_rate": 7.534164355627628e-06, "loss": 1.1573, "step": 2016 }, { "epoch": 1.2431432973805856, "grad_norm": 1.9001798629760742, "learning_rate": 7.523735552504485e-06, "loss": 1.1204, "step": 2017 }, { "epoch": 1.2437596302003082, "grad_norm": 1.8641166687011719, "learning_rate": 7.513309617707518e-06, "loss": 1.1261, "step": 2018 }, { "epoch": 1.2443759630200308, "grad_norm": 1.9213446378707886, "learning_rate": 7.502886563313376e-06, "loss": 1.1919, "step": 2019 }, { "epoch": 1.2449922958397535, "grad_norm": 1.8455933332443237, "learning_rate": 7.492466401395371e-06, "loss": 1.1358, "step": 2020 }, { "epoch": 1.245608628659476, "grad_norm": 1.8563966751098633, "learning_rate": 7.482049144023469e-06, "loss": 1.0932, "step": 2021 }, { "epoch": 1.2462249614791987, "grad_norm": 1.8951612710952759, "learning_rate": 7.4716348032642725e-06, "loss": 1.1129, "step": 2022 }, { "epoch": 1.2468412942989213, "grad_norm": 1.9441466331481934, "learning_rate": 7.461223391181e-06, "loss": 1.174, "step": 2023 }, { "epoch": 1.2474576271186442, "grad_norm": 1.9022141695022583, "learning_rate": 7.450814919833481e-06, "loss": 1.0765, "step": 2024 }, { "epoch": 1.2480739599383668, "grad_norm": 1.9474152326583862, "learning_rate": 7.440409401278139e-06, "loss": 1.0938, "step": 2025 }, { "epoch": 1.2486902927580894, "grad_norm": 1.8979072570800781, "learning_rate": 7.430006847567972e-06, "loss": 1.1241, "step": 2026 }, { "epoch": 1.249306625577812, "grad_norm": 1.9067083597183228, "learning_rate": 7.419607270752555e-06, "loss": 1.1813, "step": 2027 }, { "epoch": 1.2499229583975346, "grad_norm": 1.8857613801956177, "learning_rate": 7.409210682878003e-06, "loss": 1.1219, "step": 2028 }, { "epoch": 1.2505392912172573, "grad_norm": 1.9102001190185547, "learning_rate": 7.398817095986979e-06, "loss": 1.1843, "step": 2029 }, { "epoch": 1.25115562403698, "grad_norm": 1.8551355600357056, "learning_rate": 7.388426522118657e-06, "loss": 1.145, "step": 2030 }, { "epoch": 1.2517719568567025, "grad_norm": 1.8533369302749634, "learning_rate": 7.378038973308734e-06, "loss": 1.1616, "step": 2031 }, { "epoch": 1.2523882896764253, "grad_norm": 1.9075177907943726, "learning_rate": 7.367654461589392e-06, "loss": 1.1744, "step": 2032 }, { "epoch": 1.253004622496148, "grad_norm": 1.860812783241272, "learning_rate": 7.357272998989309e-06, "loss": 1.1656, "step": 2033 }, { "epoch": 1.2536209553158706, "grad_norm": 1.798598051071167, "learning_rate": 7.346894597533616e-06, "loss": 1.0937, "step": 2034 }, { "epoch": 1.2542372881355932, "grad_norm": 1.8755488395690918, "learning_rate": 7.336519269243908e-06, "loss": 1.1289, "step": 2035 }, { "epoch": 1.2548536209553158, "grad_norm": 1.8704454898834229, "learning_rate": 7.326147026138217e-06, "loss": 1.057, "step": 2036 }, { "epoch": 1.2554699537750384, "grad_norm": 1.8759182691574097, "learning_rate": 7.315777880230998e-06, "loss": 1.1287, "step": 2037 }, { "epoch": 1.256086286594761, "grad_norm": 1.8889687061309814, "learning_rate": 7.305411843533124e-06, "loss": 1.0871, "step": 2038 }, { "epoch": 1.256702619414484, "grad_norm": 1.9081194400787354, "learning_rate": 7.2950489280518664e-06, "loss": 1.1588, "step": 2039 }, { "epoch": 1.2573189522342065, "grad_norm": 1.9042205810546875, "learning_rate": 7.284689145790879e-06, "loss": 1.1745, "step": 2040 }, { "epoch": 1.2579352850539292, "grad_norm": 1.8646845817565918, "learning_rate": 7.274332508750183e-06, "loss": 1.1155, "step": 2041 }, { "epoch": 1.2585516178736518, "grad_norm": 1.9002572298049927, "learning_rate": 7.2639790289261644e-06, "loss": 1.1429, "step": 2042 }, { "epoch": 1.2591679506933744, "grad_norm": 1.955370545387268, "learning_rate": 7.253628718311545e-06, "loss": 1.1646, "step": 2043 }, { "epoch": 1.259784283513097, "grad_norm": 1.9329922199249268, "learning_rate": 7.243281588895374e-06, "loss": 1.1828, "step": 2044 }, { "epoch": 1.2604006163328196, "grad_norm": 1.8856618404388428, "learning_rate": 7.232937652663028e-06, "loss": 1.1684, "step": 2045 }, { "epoch": 1.2610169491525425, "grad_norm": 1.8926563262939453, "learning_rate": 7.222596921596174e-06, "loss": 1.1155, "step": 2046 }, { "epoch": 1.261633281972265, "grad_norm": 1.8362922668457031, "learning_rate": 7.2122594076727705e-06, "loss": 1.1285, "step": 2047 }, { "epoch": 1.2622496147919877, "grad_norm": 1.8675581216812134, "learning_rate": 7.201925122867044e-06, "loss": 1.1569, "step": 2048 }, { "epoch": 1.2628659476117103, "grad_norm": 1.870606541633606, "learning_rate": 7.191594079149489e-06, "loss": 1.1637, "step": 2049 }, { "epoch": 1.263482280431433, "grad_norm": 1.9547513723373413, "learning_rate": 7.181266288486835e-06, "loss": 1.159, "step": 2050 }, { "epoch": 1.2640986132511556, "grad_norm": 1.9471369981765747, "learning_rate": 7.1709417628420565e-06, "loss": 1.1493, "step": 2051 }, { "epoch": 1.2647149460708782, "grad_norm": 1.919755458831787, "learning_rate": 7.160620514174342e-06, "loss": 1.1107, "step": 2052 }, { "epoch": 1.265331278890601, "grad_norm": 1.8763797283172607, "learning_rate": 7.1503025544390745e-06, "loss": 1.1264, "step": 2053 }, { "epoch": 1.2659476117103234, "grad_norm": 1.8384065628051758, "learning_rate": 7.139987895587837e-06, "loss": 1.1061, "step": 2054 }, { "epoch": 1.2665639445300463, "grad_norm": 1.8735439777374268, "learning_rate": 7.12967654956839e-06, "loss": 1.1991, "step": 2055 }, { "epoch": 1.267180277349769, "grad_norm": 1.9802327156066895, "learning_rate": 7.119368528324648e-06, "loss": 1.1533, "step": 2056 }, { "epoch": 1.2677966101694915, "grad_norm": 1.8950251340866089, "learning_rate": 7.10906384379668e-06, "loss": 1.1374, "step": 2057 }, { "epoch": 1.2684129429892141, "grad_norm": 1.8133563995361328, "learning_rate": 7.098762507920691e-06, "loss": 1.0743, "step": 2058 }, { "epoch": 1.2690292758089368, "grad_norm": 1.89374577999115, "learning_rate": 7.088464532629009e-06, "loss": 1.1775, "step": 2059 }, { "epoch": 1.2696456086286596, "grad_norm": 1.890937089920044, "learning_rate": 7.078169929850063e-06, "loss": 1.2067, "step": 2060 }, { "epoch": 1.270261941448382, "grad_norm": 1.8246569633483887, "learning_rate": 7.067878711508376e-06, "loss": 1.0466, "step": 2061 }, { "epoch": 1.2708782742681048, "grad_norm": 1.933603286743164, "learning_rate": 7.057590889524557e-06, "loss": 1.0872, "step": 2062 }, { "epoch": 1.2714946070878275, "grad_norm": 1.9760533571243286, "learning_rate": 7.047306475815272e-06, "loss": 1.1409, "step": 2063 }, { "epoch": 1.27211093990755, "grad_norm": 1.888184905052185, "learning_rate": 7.037025482293253e-06, "loss": 1.1469, "step": 2064 }, { "epoch": 1.2727272727272727, "grad_norm": 1.9236257076263428, "learning_rate": 7.026747920867253e-06, "loss": 1.1327, "step": 2065 }, { "epoch": 1.2733436055469953, "grad_norm": 1.9354578256607056, "learning_rate": 7.016473803442064e-06, "loss": 1.0707, "step": 2066 }, { "epoch": 1.273959938366718, "grad_norm": 1.8858877420425415, "learning_rate": 7.0062031419184795e-06, "loss": 1.1201, "step": 2067 }, { "epoch": 1.2745762711864406, "grad_norm": 1.8765416145324707, "learning_rate": 6.995935948193294e-06, "loss": 1.1586, "step": 2068 }, { "epoch": 1.2751926040061634, "grad_norm": 1.9035807847976685, "learning_rate": 6.985672234159282e-06, "loss": 1.15, "step": 2069 }, { "epoch": 1.275808936825886, "grad_norm": 1.8456820249557495, "learning_rate": 6.975412011705196e-06, "loss": 1.1429, "step": 2070 }, { "epoch": 1.2764252696456087, "grad_norm": 1.8710267543792725, "learning_rate": 6.965155292715731e-06, "loss": 1.1797, "step": 2071 }, { "epoch": 1.2770416024653313, "grad_norm": 1.8806949853897095, "learning_rate": 6.954902089071536e-06, "loss": 1.1528, "step": 2072 }, { "epoch": 1.277657935285054, "grad_norm": 1.9494726657867432, "learning_rate": 6.94465241264918e-06, "loss": 1.1324, "step": 2073 }, { "epoch": 1.2782742681047765, "grad_norm": 1.868783712387085, "learning_rate": 6.9344062753211475e-06, "loss": 1.088, "step": 2074 }, { "epoch": 1.2788906009244991, "grad_norm": 1.9315228462219238, "learning_rate": 6.924163688955825e-06, "loss": 1.1075, "step": 2075 }, { "epoch": 1.279506933744222, "grad_norm": 1.8989384174346924, "learning_rate": 6.9139246654174915e-06, "loss": 1.1214, "step": 2076 }, { "epoch": 1.2801232665639446, "grad_norm": 1.8893548250198364, "learning_rate": 6.903689216566292e-06, "loss": 1.1009, "step": 2077 }, { "epoch": 1.2807395993836672, "grad_norm": 1.916996955871582, "learning_rate": 6.893457354258226e-06, "loss": 1.0819, "step": 2078 }, { "epoch": 1.2813559322033898, "grad_norm": 1.945035696029663, "learning_rate": 6.883229090345152e-06, "loss": 1.1738, "step": 2079 }, { "epoch": 1.2819722650231125, "grad_norm": 1.896695852279663, "learning_rate": 6.873004436674752e-06, "loss": 1.147, "step": 2080 }, { "epoch": 1.282588597842835, "grad_norm": 1.8675721883773804, "learning_rate": 6.86278340509052e-06, "loss": 1.1667, "step": 2081 }, { "epoch": 1.2832049306625577, "grad_norm": 1.8286998271942139, "learning_rate": 6.852566007431772e-06, "loss": 1.1052, "step": 2082 }, { "epoch": 1.2838212634822805, "grad_norm": 1.8694075345993042, "learning_rate": 6.842352255533603e-06, "loss": 1.1042, "step": 2083 }, { "epoch": 1.2844375963020032, "grad_norm": 1.825850248336792, "learning_rate": 6.832142161226885e-06, "loss": 1.0894, "step": 2084 }, { "epoch": 1.2850539291217258, "grad_norm": 1.8945086002349854, "learning_rate": 6.82193573633825e-06, "loss": 1.1759, "step": 2085 }, { "epoch": 1.2856702619414484, "grad_norm": 1.9032727479934692, "learning_rate": 6.811732992690094e-06, "loss": 1.1248, "step": 2086 }, { "epoch": 1.286286594761171, "grad_norm": 1.9234111309051514, "learning_rate": 6.801533942100529e-06, "loss": 1.1398, "step": 2087 }, { "epoch": 1.2869029275808936, "grad_norm": 1.9045689105987549, "learning_rate": 6.791338596383408e-06, "loss": 1.1735, "step": 2088 }, { "epoch": 1.2875192604006163, "grad_norm": 1.882921576499939, "learning_rate": 6.781146967348283e-06, "loss": 1.1367, "step": 2089 }, { "epoch": 1.288135593220339, "grad_norm": 1.8740825653076172, "learning_rate": 6.7709590668004025e-06, "loss": 1.0408, "step": 2090 }, { "epoch": 1.2887519260400615, "grad_norm": 1.900223970413208, "learning_rate": 6.76077490654069e-06, "loss": 1.1228, "step": 2091 }, { "epoch": 1.2893682588597843, "grad_norm": 1.959841012954712, "learning_rate": 6.75059449836575e-06, "loss": 1.1057, "step": 2092 }, { "epoch": 1.289984591679507, "grad_norm": 1.8852899074554443, "learning_rate": 6.740417854067831e-06, "loss": 1.1012, "step": 2093 }, { "epoch": 1.2906009244992296, "grad_norm": 1.8976807594299316, "learning_rate": 6.730244985434817e-06, "loss": 1.0982, "step": 2094 }, { "epoch": 1.2912172573189522, "grad_norm": 1.99767005443573, "learning_rate": 6.720075904250234e-06, "loss": 1.1384, "step": 2095 }, { "epoch": 1.2918335901386748, "grad_norm": 1.8880596160888672, "learning_rate": 6.709910622293212e-06, "loss": 1.0897, "step": 2096 }, { "epoch": 1.2924499229583977, "grad_norm": 1.927941918373108, "learning_rate": 6.699749151338477e-06, "loss": 1.162, "step": 2097 }, { "epoch": 1.29306625577812, "grad_norm": 1.960280418395996, "learning_rate": 6.689591503156344e-06, "loss": 1.0764, "step": 2098 }, { "epoch": 1.293682588597843, "grad_norm": 1.8347197771072388, "learning_rate": 6.6794376895127046e-06, "loss": 1.038, "step": 2099 }, { "epoch": 1.2942989214175655, "grad_norm": 1.9113661050796509, "learning_rate": 6.669287722168996e-06, "loss": 1.1014, "step": 2100 }, { "epoch": 1.2949152542372881, "grad_norm": 1.8190182447433472, "learning_rate": 6.659141612882219e-06, "loss": 1.1119, "step": 2101 }, { "epoch": 1.2955315870570108, "grad_norm": 1.8854924440383911, "learning_rate": 6.6489993734048855e-06, "loss": 1.136, "step": 2102 }, { "epoch": 1.2961479198767334, "grad_norm": 1.8342102766036987, "learning_rate": 6.638861015485043e-06, "loss": 1.022, "step": 2103 }, { "epoch": 1.296764252696456, "grad_norm": 1.8684513568878174, "learning_rate": 6.628726550866227e-06, "loss": 1.1322, "step": 2104 }, { "epoch": 1.2973805855161786, "grad_norm": 1.879026174545288, "learning_rate": 6.618595991287475e-06, "loss": 1.1626, "step": 2105 }, { "epoch": 1.2979969183359015, "grad_norm": 1.8854221105575562, "learning_rate": 6.6084693484832904e-06, "loss": 1.1297, "step": 2106 }, { "epoch": 1.298613251155624, "grad_norm": 1.8164162635803223, "learning_rate": 6.598346634183657e-06, "loss": 1.1081, "step": 2107 }, { "epoch": 1.2992295839753467, "grad_norm": 1.8698393106460571, "learning_rate": 6.5882278601139875e-06, "loss": 1.1, "step": 2108 }, { "epoch": 1.2998459167950693, "grad_norm": 1.893800139427185, "learning_rate": 6.5781130379951455e-06, "loss": 1.0815, "step": 2109 }, { "epoch": 1.300462249614792, "grad_norm": 1.9356763362884521, "learning_rate": 6.568002179543409e-06, "loss": 1.1578, "step": 2110 }, { "epoch": 1.3010785824345146, "grad_norm": 1.9007635116577148, "learning_rate": 6.557895296470467e-06, "loss": 1.1148, "step": 2111 }, { "epoch": 1.3016949152542372, "grad_norm": 1.9171055555343628, "learning_rate": 6.5477924004834015e-06, "loss": 1.1715, "step": 2112 }, { "epoch": 1.30231124807396, "grad_norm": 1.8486248254776, "learning_rate": 6.537693503284687e-06, "loss": 1.0418, "step": 2113 }, { "epoch": 1.3029275808936827, "grad_norm": 1.9156478643417358, "learning_rate": 6.527598616572153e-06, "loss": 1.0917, "step": 2114 }, { "epoch": 1.3035439137134053, "grad_norm": 1.844541072845459, "learning_rate": 6.517507752038985e-06, "loss": 1.0857, "step": 2115 }, { "epoch": 1.304160246533128, "grad_norm": 1.8431329727172852, "learning_rate": 6.507420921373719e-06, "loss": 1.0984, "step": 2116 }, { "epoch": 1.3047765793528505, "grad_norm": 1.9692331552505493, "learning_rate": 6.497338136260209e-06, "loss": 1.1541, "step": 2117 }, { "epoch": 1.3053929121725731, "grad_norm": 1.871606469154358, "learning_rate": 6.487259408377623e-06, "loss": 1.0545, "step": 2118 }, { "epoch": 1.3060092449922958, "grad_norm": 1.892255425453186, "learning_rate": 6.477184749400438e-06, "loss": 1.0952, "step": 2119 }, { "epoch": 1.3066255778120186, "grad_norm": 1.9948867559432983, "learning_rate": 6.467114170998412e-06, "loss": 1.0882, "step": 2120 }, { "epoch": 1.3072419106317412, "grad_norm": 1.9719386100769043, "learning_rate": 6.457047684836576e-06, "loss": 1.1628, "step": 2121 }, { "epoch": 1.3078582434514638, "grad_norm": 1.9667590856552124, "learning_rate": 6.44698530257522e-06, "loss": 1.0774, "step": 2122 }, { "epoch": 1.3084745762711865, "grad_norm": 1.9131965637207031, "learning_rate": 6.436927035869882e-06, "loss": 1.0848, "step": 2123 }, { "epoch": 1.309090909090909, "grad_norm": 1.9220930337905884, "learning_rate": 6.426872896371333e-06, "loss": 1.0525, "step": 2124 }, { "epoch": 1.3097072419106317, "grad_norm": 1.9416108131408691, "learning_rate": 6.4168228957255674e-06, "loss": 1.14, "step": 2125 }, { "epoch": 1.3103235747303543, "grad_norm": 1.930334448814392, "learning_rate": 6.406777045573776e-06, "loss": 1.1293, "step": 2126 }, { "epoch": 1.3109399075500772, "grad_norm": 1.9758033752441406, "learning_rate": 6.3967353575523505e-06, "loss": 1.1262, "step": 2127 }, { "epoch": 1.3115562403697996, "grad_norm": 1.97719407081604, "learning_rate": 6.386697843292855e-06, "loss": 1.1564, "step": 2128 }, { "epoch": 1.3121725731895224, "grad_norm": 1.934370994567871, "learning_rate": 6.3766645144220275e-06, "loss": 1.1651, "step": 2129 }, { "epoch": 1.312788906009245, "grad_norm": 1.8771915435791016, "learning_rate": 6.366635382561748e-06, "loss": 1.1116, "step": 2130 }, { "epoch": 1.3134052388289676, "grad_norm": 1.922093152999878, "learning_rate": 6.356610459329038e-06, "loss": 1.1559, "step": 2131 }, { "epoch": 1.3140215716486903, "grad_norm": 1.8336296081542969, "learning_rate": 6.34658975633605e-06, "loss": 1.1, "step": 2132 }, { "epoch": 1.3146379044684129, "grad_norm": 1.9261524677276611, "learning_rate": 6.336573285190044e-06, "loss": 1.1531, "step": 2133 }, { "epoch": 1.3152542372881357, "grad_norm": 1.9371651411056519, "learning_rate": 6.3265610574933766e-06, "loss": 1.1315, "step": 2134 }, { "epoch": 1.3158705701078581, "grad_norm": 1.8896856307983398, "learning_rate": 6.316553084843488e-06, "loss": 1.1366, "step": 2135 }, { "epoch": 1.316486902927581, "grad_norm": 1.8826168775558472, "learning_rate": 6.306549378832898e-06, "loss": 1.0979, "step": 2136 }, { "epoch": 1.3171032357473036, "grad_norm": 1.9258710145950317, "learning_rate": 6.296549951049169e-06, "loss": 1.1235, "step": 2137 }, { "epoch": 1.3177195685670262, "grad_norm": 1.9341789484024048, "learning_rate": 6.2865548130749255e-06, "loss": 1.1102, "step": 2138 }, { "epoch": 1.3183359013867488, "grad_norm": 1.9973599910736084, "learning_rate": 6.2765639764878105e-06, "loss": 1.109, "step": 2139 }, { "epoch": 1.3189522342064715, "grad_norm": 1.9003989696502686, "learning_rate": 6.266577452860492e-06, "loss": 1.0992, "step": 2140 }, { "epoch": 1.319568567026194, "grad_norm": 1.8930280208587646, "learning_rate": 6.256595253760635e-06, "loss": 1.0731, "step": 2141 }, { "epoch": 1.3201848998459167, "grad_norm": 1.8758518695831299, "learning_rate": 6.246617390750902e-06, "loss": 1.1165, "step": 2142 }, { "epoch": 1.3208012326656395, "grad_norm": 1.9386250972747803, "learning_rate": 6.236643875388925e-06, "loss": 1.1665, "step": 2143 }, { "epoch": 1.3214175654853622, "grad_norm": 1.838026762008667, "learning_rate": 6.226674719227313e-06, "loss": 1.0569, "step": 2144 }, { "epoch": 1.3220338983050848, "grad_norm": 1.9319950342178345, "learning_rate": 6.2167099338136095e-06, "loss": 1.143, "step": 2145 }, { "epoch": 1.3226502311248074, "grad_norm": 1.9024280309677124, "learning_rate": 6.206749530690312e-06, "loss": 1.1187, "step": 2146 }, { "epoch": 1.32326656394453, "grad_norm": 2.031986951828003, "learning_rate": 6.196793521394826e-06, "loss": 1.1294, "step": 2147 }, { "epoch": 1.3238828967642526, "grad_norm": 2.0211308002471924, "learning_rate": 6.1868419174594765e-06, "loss": 1.1484, "step": 2148 }, { "epoch": 1.3244992295839753, "grad_norm": 1.9099383354187012, "learning_rate": 6.176894730411482e-06, "loss": 1.0749, "step": 2149 }, { "epoch": 1.325115562403698, "grad_norm": 1.927715539932251, "learning_rate": 6.166951971772954e-06, "loss": 1.1056, "step": 2150 }, { "epoch": 1.3257318952234207, "grad_norm": 1.877619981765747, "learning_rate": 6.1570136530608635e-06, "loss": 1.0595, "step": 2151 }, { "epoch": 1.3263482280431433, "grad_norm": 1.9544296264648438, "learning_rate": 6.147079785787038e-06, "loss": 1.1368, "step": 2152 }, { "epoch": 1.326964560862866, "grad_norm": 1.977342128753662, "learning_rate": 6.137150381458159e-06, "loss": 1.1154, "step": 2153 }, { "epoch": 1.3275808936825886, "grad_norm": 1.914412021636963, "learning_rate": 6.127225451575729e-06, "loss": 1.1589, "step": 2154 }, { "epoch": 1.3281972265023112, "grad_norm": 1.8995112180709839, "learning_rate": 6.1173050076360674e-06, "loss": 1.1357, "step": 2155 }, { "epoch": 1.3288135593220338, "grad_norm": 1.8481757640838623, "learning_rate": 6.107389061130307e-06, "loss": 1.1105, "step": 2156 }, { "epoch": 1.3294298921417567, "grad_norm": 1.9290567636489868, "learning_rate": 6.097477623544366e-06, "loss": 1.1498, "step": 2157 }, { "epoch": 1.330046224961479, "grad_norm": 1.9054045677185059, "learning_rate": 6.087570706358938e-06, "loss": 1.0457, "step": 2158 }, { "epoch": 1.330662557781202, "grad_norm": 1.97115957736969, "learning_rate": 6.077668321049477e-06, "loss": 1.0961, "step": 2159 }, { "epoch": 1.3312788906009245, "grad_norm": 1.9310495853424072, "learning_rate": 6.067770479086198e-06, "loss": 1.1605, "step": 2160 }, { "epoch": 1.3318952234206471, "grad_norm": 1.962264895439148, "learning_rate": 6.057877191934041e-06, "loss": 1.1239, "step": 2161 }, { "epoch": 1.3325115562403698, "grad_norm": 1.9418169260025024, "learning_rate": 6.047988471052683e-06, "loss": 1.1512, "step": 2162 }, { "epoch": 1.3331278890600924, "grad_norm": 1.8888050317764282, "learning_rate": 6.0381043278965005e-06, "loss": 1.0532, "step": 2163 }, { "epoch": 1.3337442218798152, "grad_norm": 1.9572962522506714, "learning_rate": 6.028224773914575e-06, "loss": 1.1091, "step": 2164 }, { "epoch": 1.3343605546995376, "grad_norm": 1.9340481758117676, "learning_rate": 6.018349820550668e-06, "loss": 1.108, "step": 2165 }, { "epoch": 1.3349768875192605, "grad_norm": 1.9320956468582153, "learning_rate": 6.008479479243215e-06, "loss": 1.1321, "step": 2166 }, { "epoch": 1.335593220338983, "grad_norm": 1.9717285633087158, "learning_rate": 5.998613761425307e-06, "loss": 1.093, "step": 2167 }, { "epoch": 1.3362095531587057, "grad_norm": 1.8786693811416626, "learning_rate": 5.988752678524675e-06, "loss": 1.1289, "step": 2168 }, { "epoch": 1.3368258859784283, "grad_norm": 1.9123904705047607, "learning_rate": 5.978896241963693e-06, "loss": 1.0805, "step": 2169 }, { "epoch": 1.337442218798151, "grad_norm": 1.9245949983596802, "learning_rate": 5.9690444631593455e-06, "loss": 1.127, "step": 2170 }, { "epoch": 1.3380585516178736, "grad_norm": 1.9942740201950073, "learning_rate": 5.959197353523223e-06, "loss": 1.1354, "step": 2171 }, { "epoch": 1.3386748844375962, "grad_norm": 2.002119779586792, "learning_rate": 5.949354924461503e-06, "loss": 1.1496, "step": 2172 }, { "epoch": 1.339291217257319, "grad_norm": 1.9471687078475952, "learning_rate": 5.93951718737495e-06, "loss": 1.092, "step": 2173 }, { "epoch": 1.3399075500770417, "grad_norm": 2.0036675930023193, "learning_rate": 5.929684153658884e-06, "loss": 1.1417, "step": 2174 }, { "epoch": 1.3405238828967643, "grad_norm": 2.013810396194458, "learning_rate": 5.9198558347031885e-06, "loss": 1.1868, "step": 2175 }, { "epoch": 1.341140215716487, "grad_norm": 1.9210790395736694, "learning_rate": 5.910032241892276e-06, "loss": 1.1167, "step": 2176 }, { "epoch": 1.3417565485362095, "grad_norm": 1.9062467813491821, "learning_rate": 5.900213386605089e-06, "loss": 1.1246, "step": 2177 }, { "epoch": 1.3423728813559321, "grad_norm": 1.928695797920227, "learning_rate": 5.890399280215082e-06, "loss": 1.1271, "step": 2178 }, { "epoch": 1.3429892141756548, "grad_norm": 1.945951223373413, "learning_rate": 5.880589934090206e-06, "loss": 1.1079, "step": 2179 }, { "epoch": 1.3436055469953776, "grad_norm": 1.9896254539489746, "learning_rate": 5.870785359592899e-06, "loss": 1.1165, "step": 2180 }, { "epoch": 1.3442218798151002, "grad_norm": 1.9686380624771118, "learning_rate": 5.86098556808008e-06, "loss": 1.093, "step": 2181 }, { "epoch": 1.3448382126348228, "grad_norm": 1.9453051090240479, "learning_rate": 5.851190570903114e-06, "loss": 1.1187, "step": 2182 }, { "epoch": 1.3454545454545455, "grad_norm": 1.8942561149597168, "learning_rate": 5.841400379407822e-06, "loss": 1.1081, "step": 2183 }, { "epoch": 1.346070878274268, "grad_norm": 1.9189974069595337, "learning_rate": 5.831615004934455e-06, "loss": 1.0627, "step": 2184 }, { "epoch": 1.3466872110939907, "grad_norm": 1.9737305641174316, "learning_rate": 5.82183445881769e-06, "loss": 1.1252, "step": 2185 }, { "epoch": 1.3473035439137133, "grad_norm": 1.978537917137146, "learning_rate": 5.812058752386595e-06, "loss": 1.1271, "step": 2186 }, { "epoch": 1.3479198767334362, "grad_norm": 1.934865117073059, "learning_rate": 5.8022878969646575e-06, "loss": 1.0906, "step": 2187 }, { "epoch": 1.3485362095531588, "grad_norm": 1.9954735040664673, "learning_rate": 5.792521903869721e-06, "loss": 1.0655, "step": 2188 }, { "epoch": 1.3491525423728814, "grad_norm": 2.005958318710327, "learning_rate": 5.782760784414012e-06, "loss": 1.0776, "step": 2189 }, { "epoch": 1.349768875192604, "grad_norm": 1.991939902305603, "learning_rate": 5.773004549904107e-06, "loss": 1.161, "step": 2190 }, { "epoch": 1.3503852080123266, "grad_norm": 1.9078084230422974, "learning_rate": 5.763253211640929e-06, "loss": 1.1486, "step": 2191 }, { "epoch": 1.3510015408320493, "grad_norm": 1.996064305305481, "learning_rate": 5.753506780919711e-06, "loss": 1.2391, "step": 2192 }, { "epoch": 1.3516178736517719, "grad_norm": 1.9720475673675537, "learning_rate": 5.743765269030035e-06, "loss": 1.1297, "step": 2193 }, { "epoch": 1.3522342064714947, "grad_norm": 1.8880863189697266, "learning_rate": 5.7340286872557515e-06, "loss": 1.0831, "step": 2194 }, { "epoch": 1.3528505392912171, "grad_norm": 1.9727063179016113, "learning_rate": 5.724297046875019e-06, "loss": 1.1032, "step": 2195 }, { "epoch": 1.35346687211094, "grad_norm": 1.909714698791504, "learning_rate": 5.714570359160272e-06, "loss": 1.0717, "step": 2196 }, { "epoch": 1.3540832049306626, "grad_norm": 1.9569188356399536, "learning_rate": 5.704848635378197e-06, "loss": 1.1241, "step": 2197 }, { "epoch": 1.3546995377503852, "grad_norm": 2.0213723182678223, "learning_rate": 5.695131886789738e-06, "loss": 1.1463, "step": 2198 }, { "epoch": 1.3553158705701078, "grad_norm": 1.9204858541488647, "learning_rate": 5.685420124650086e-06, "loss": 1.0954, "step": 2199 }, { "epoch": 1.3559322033898304, "grad_norm": 1.9265260696411133, "learning_rate": 5.675713360208636e-06, "loss": 1.1031, "step": 2200 }, { "epoch": 1.3565485362095533, "grad_norm": 1.9578057527542114, "learning_rate": 5.666011604709005e-06, "loss": 1.1917, "step": 2201 }, { "epoch": 1.3571648690292757, "grad_norm": 1.9434736967086792, "learning_rate": 5.656314869389016e-06, "loss": 1.1235, "step": 2202 }, { "epoch": 1.3577812018489985, "grad_norm": 1.9246797561645508, "learning_rate": 5.646623165480656e-06, "loss": 1.0875, "step": 2203 }, { "epoch": 1.3583975346687212, "grad_norm": 1.9569138288497925, "learning_rate": 5.6369365042101e-06, "loss": 1.1874, "step": 2204 }, { "epoch": 1.3590138674884438, "grad_norm": 1.9025484323501587, "learning_rate": 5.627254896797679e-06, "loss": 1.0764, "step": 2205 }, { "epoch": 1.3596302003081664, "grad_norm": 1.9559506177902222, "learning_rate": 5.617578354457869e-06, "loss": 1.1001, "step": 2206 }, { "epoch": 1.360246533127889, "grad_norm": 1.9187514781951904, "learning_rate": 5.607906888399283e-06, "loss": 1.1115, "step": 2207 }, { "epoch": 1.3608628659476116, "grad_norm": 2.024916410446167, "learning_rate": 5.598240509824642e-06, "loss": 1.1428, "step": 2208 }, { "epoch": 1.3614791987673343, "grad_norm": 1.930270791053772, "learning_rate": 5.588579229930784e-06, "loss": 1.1229, "step": 2209 }, { "epoch": 1.362095531587057, "grad_norm": 1.9286863803863525, "learning_rate": 5.578923059908642e-06, "loss": 1.0815, "step": 2210 }, { "epoch": 1.3627118644067797, "grad_norm": 1.9832050800323486, "learning_rate": 5.5692720109432255e-06, "loss": 1.0517, "step": 2211 }, { "epoch": 1.3633281972265023, "grad_norm": 2.0018069744110107, "learning_rate": 5.5596260942136125e-06, "loss": 1.1169, "step": 2212 }, { "epoch": 1.363944530046225, "grad_norm": 1.9772279262542725, "learning_rate": 5.5499853208929425e-06, "loss": 1.1217, "step": 2213 }, { "epoch": 1.3645608628659476, "grad_norm": 1.9762356281280518, "learning_rate": 5.540349702148384e-06, "loss": 1.1221, "step": 2214 }, { "epoch": 1.3651771956856702, "grad_norm": 1.9476977586746216, "learning_rate": 5.530719249141148e-06, "loss": 1.136, "step": 2215 }, { "epoch": 1.3657935285053928, "grad_norm": 2.0378265380859375, "learning_rate": 5.521093973026459e-06, "loss": 1.1194, "step": 2216 }, { "epoch": 1.3664098613251157, "grad_norm": 1.9107972383499146, "learning_rate": 5.511473884953532e-06, "loss": 0.9934, "step": 2217 }, { "epoch": 1.3670261941448383, "grad_norm": 2.051091432571411, "learning_rate": 5.501858996065596e-06, "loss": 1.1287, "step": 2218 }, { "epoch": 1.367642526964561, "grad_norm": 1.97477388381958, "learning_rate": 5.492249317499842e-06, "loss": 1.1244, "step": 2219 }, { "epoch": 1.3682588597842835, "grad_norm": 2.009836196899414, "learning_rate": 5.482644860387424e-06, "loss": 1.1222, "step": 2220 }, { "epoch": 1.3688751926040061, "grad_norm": 1.9166696071624756, "learning_rate": 5.473045635853455e-06, "loss": 1.0805, "step": 2221 }, { "epoch": 1.3694915254237288, "grad_norm": 1.8979380130767822, "learning_rate": 5.463451655016988e-06, "loss": 1.0658, "step": 2222 }, { "epoch": 1.3701078582434514, "grad_norm": 1.9847412109375, "learning_rate": 5.453862928990988e-06, "loss": 1.1628, "step": 2223 }, { "epoch": 1.3707241910631742, "grad_norm": 1.9733080863952637, "learning_rate": 5.444279468882359e-06, "loss": 1.1025, "step": 2224 }, { "epoch": 1.3713405238828968, "grad_norm": 1.986677885055542, "learning_rate": 5.434701285791878e-06, "loss": 1.1137, "step": 2225 }, { "epoch": 1.3719568567026195, "grad_norm": 2.0321171283721924, "learning_rate": 5.425128390814225e-06, "loss": 1.1375, "step": 2226 }, { "epoch": 1.372573189522342, "grad_norm": 1.9283150434494019, "learning_rate": 5.415560795037953e-06, "loss": 1.1098, "step": 2227 }, { "epoch": 1.3731895223420647, "grad_norm": 1.9447542428970337, "learning_rate": 5.405998509545478e-06, "loss": 1.0647, "step": 2228 }, { "epoch": 1.3738058551617873, "grad_norm": 1.9330190420150757, "learning_rate": 5.3964415454130505e-06, "loss": 1.0941, "step": 2229 }, { "epoch": 1.37442218798151, "grad_norm": 1.9465807676315308, "learning_rate": 5.3868899137107845e-06, "loss": 1.1013, "step": 2230 }, { "epoch": 1.3750385208012328, "grad_norm": 1.940626621246338, "learning_rate": 5.3773436255025865e-06, "loss": 1.0924, "step": 2231 }, { "epoch": 1.3756548536209552, "grad_norm": 1.8969776630401611, "learning_rate": 5.367802691846198e-06, "loss": 1.0553, "step": 2232 }, { "epoch": 1.376271186440678, "grad_norm": 1.9547635316848755, "learning_rate": 5.358267123793148e-06, "loss": 1.1349, "step": 2233 }, { "epoch": 1.3768875192604006, "grad_norm": 1.971251368522644, "learning_rate": 5.348736932388745e-06, "loss": 1.0848, "step": 2234 }, { "epoch": 1.3775038520801233, "grad_norm": 1.9501301050186157, "learning_rate": 5.339212128672078e-06, "loss": 1.1348, "step": 2235 }, { "epoch": 1.378120184899846, "grad_norm": 1.9323294162750244, "learning_rate": 5.329692723675994e-06, "loss": 1.1228, "step": 2236 }, { "epoch": 1.3787365177195685, "grad_norm": 1.965030312538147, "learning_rate": 5.320178728427085e-06, "loss": 1.0711, "step": 2237 }, { "epoch": 1.3793528505392914, "grad_norm": 1.9186996221542358, "learning_rate": 5.310670153945679e-06, "loss": 1.0678, "step": 2238 }, { "epoch": 1.3799691833590138, "grad_norm": 2.0200178623199463, "learning_rate": 5.3011670112458226e-06, "loss": 1.1327, "step": 2239 }, { "epoch": 1.3805855161787366, "grad_norm": 1.957034945487976, "learning_rate": 5.2916693113352665e-06, "loss": 1.0522, "step": 2240 }, { "epoch": 1.3812018489984592, "grad_norm": 1.954397439956665, "learning_rate": 5.282177065215465e-06, "loss": 1.1061, "step": 2241 }, { "epoch": 1.3818181818181818, "grad_norm": 1.9905340671539307, "learning_rate": 5.2726902838815495e-06, "loss": 1.0359, "step": 2242 }, { "epoch": 1.3824345146379045, "grad_norm": 1.9293432235717773, "learning_rate": 5.2632089783223266e-06, "loss": 1.0873, "step": 2243 }, { "epoch": 1.383050847457627, "grad_norm": 1.9536933898925781, "learning_rate": 5.253733159520258e-06, "loss": 1.1139, "step": 2244 }, { "epoch": 1.3836671802773497, "grad_norm": 1.9559085369110107, "learning_rate": 5.244262838451442e-06, "loss": 1.0583, "step": 2245 }, { "epoch": 1.3842835130970723, "grad_norm": 1.952139139175415, "learning_rate": 5.234798026085621e-06, "loss": 1.1047, "step": 2246 }, { "epoch": 1.3848998459167952, "grad_norm": 1.970496654510498, "learning_rate": 5.225338733386149e-06, "loss": 1.0915, "step": 2247 }, { "epoch": 1.3855161787365178, "grad_norm": 1.9785736799240112, "learning_rate": 5.2158849713099905e-06, "loss": 1.1135, "step": 2248 }, { "epoch": 1.3861325115562404, "grad_norm": 1.9778302907943726, "learning_rate": 5.2064367508077e-06, "loss": 1.046, "step": 2249 }, { "epoch": 1.386748844375963, "grad_norm": 2.022606372833252, "learning_rate": 5.196994082823419e-06, "loss": 1.1126, "step": 2250 }, { "epoch": 1.3873651771956856, "grad_norm": 1.9199731349945068, "learning_rate": 5.187556978294847e-06, "loss": 1.0451, "step": 2251 }, { "epoch": 1.3879815100154083, "grad_norm": 2.032363176345825, "learning_rate": 5.178125448153247e-06, "loss": 1.107, "step": 2252 }, { "epoch": 1.3885978428351309, "grad_norm": 1.9399081468582153, "learning_rate": 5.168699503323429e-06, "loss": 1.0415, "step": 2253 }, { "epoch": 1.3892141756548537, "grad_norm": 2.032320499420166, "learning_rate": 5.159279154723715e-06, "loss": 1.1517, "step": 2254 }, { "epoch": 1.3898305084745763, "grad_norm": 1.8967019319534302, "learning_rate": 5.14986441326597e-06, "loss": 1.0455, "step": 2255 }, { "epoch": 1.390446841294299, "grad_norm": 1.9635406732559204, "learning_rate": 5.1404552898555525e-06, "loss": 1.091, "step": 2256 }, { "epoch": 1.3910631741140216, "grad_norm": 2.0191521644592285, "learning_rate": 5.131051795391302e-06, "loss": 1.07, "step": 2257 }, { "epoch": 1.3916795069337442, "grad_norm": 1.9674570560455322, "learning_rate": 5.121653940765555e-06, "loss": 1.0949, "step": 2258 }, { "epoch": 1.3922958397534668, "grad_norm": 1.9415597915649414, "learning_rate": 5.112261736864111e-06, "loss": 1.036, "step": 2259 }, { "epoch": 1.3929121725731894, "grad_norm": 1.9371141195297241, "learning_rate": 5.102875194566211e-06, "loss": 1.1123, "step": 2260 }, { "epoch": 1.3935285053929123, "grad_norm": 1.9655702114105225, "learning_rate": 5.093494324744564e-06, "loss": 1.1006, "step": 2261 }, { "epoch": 1.3941448382126347, "grad_norm": 1.9874012470245361, "learning_rate": 5.08411913826528e-06, "loss": 1.1537, "step": 2262 }, { "epoch": 1.3947611710323575, "grad_norm": 1.9527015686035156, "learning_rate": 5.074749645987907e-06, "loss": 1.1349, "step": 2263 }, { "epoch": 1.3953775038520801, "grad_norm": 1.9669395685195923, "learning_rate": 5.065385858765384e-06, "loss": 1.1548, "step": 2264 }, { "epoch": 1.3959938366718028, "grad_norm": 1.9820141792297363, "learning_rate": 5.056027787444053e-06, "loss": 1.1158, "step": 2265 }, { "epoch": 1.3966101694915254, "grad_norm": 1.9875644445419312, "learning_rate": 5.046675442863618e-06, "loss": 1.1484, "step": 2266 }, { "epoch": 1.397226502311248, "grad_norm": 1.9225784540176392, "learning_rate": 5.037328835857176e-06, "loss": 1.1419, "step": 2267 }, { "epoch": 1.3978428351309709, "grad_norm": 1.9722404479980469, "learning_rate": 5.027987977251149e-06, "loss": 1.0923, "step": 2268 }, { "epoch": 1.3984591679506932, "grad_norm": 1.9942500591278076, "learning_rate": 5.018652877865322e-06, "loss": 1.1314, "step": 2269 }, { "epoch": 1.399075500770416, "grad_norm": 1.9218981266021729, "learning_rate": 5.009323548512801e-06, "loss": 1.1549, "step": 2270 }, { "epoch": 1.3996918335901387, "grad_norm": 1.9097427129745483, "learning_rate": 5.000000000000003e-06, "loss": 1.0888, "step": 2271 }, { "epoch": 1.4003081664098613, "grad_norm": 1.9776077270507812, "learning_rate": 4.9906822431266556e-06, "loss": 1.1586, "step": 2272 }, { "epoch": 1.400924499229584, "grad_norm": 1.9064873456954956, "learning_rate": 4.981370288685778e-06, "loss": 1.0541, "step": 2273 }, { "epoch": 1.4015408320493066, "grad_norm": 1.9379702806472778, "learning_rate": 4.972064147463669e-06, "loss": 1.0869, "step": 2274 }, { "epoch": 1.4021571648690292, "grad_norm": 1.9099199771881104, "learning_rate": 4.962763830239887e-06, "loss": 1.067, "step": 2275 }, { "epoch": 1.4027734976887518, "grad_norm": 1.9758614301681519, "learning_rate": 4.953469347787256e-06, "loss": 1.104, "step": 2276 }, { "epoch": 1.4033898305084747, "grad_norm": 2.0334835052490234, "learning_rate": 4.944180710871825e-06, "loss": 1.1371, "step": 2277 }, { "epoch": 1.4040061633281973, "grad_norm": 2.070807933807373, "learning_rate": 4.934897930252887e-06, "loss": 1.1796, "step": 2278 }, { "epoch": 1.40462249614792, "grad_norm": 2.0391523838043213, "learning_rate": 4.925621016682942e-06, "loss": 1.2096, "step": 2279 }, { "epoch": 1.4052388289676425, "grad_norm": 2.0696229934692383, "learning_rate": 4.916349980907701e-06, "loss": 1.1665, "step": 2280 }, { "epoch": 1.4058551617873651, "grad_norm": 2.022423028945923, "learning_rate": 4.907084833666067e-06, "loss": 1.1472, "step": 2281 }, { "epoch": 1.4064714946070878, "grad_norm": 2.0195982456207275, "learning_rate": 4.897825585690109e-06, "loss": 1.0626, "step": 2282 }, { "epoch": 1.4070878274268104, "grad_norm": 1.9492371082305908, "learning_rate": 4.888572247705077e-06, "loss": 1.1265, "step": 2283 }, { "epoch": 1.4077041602465332, "grad_norm": 1.9625933170318604, "learning_rate": 4.879324830429371e-06, "loss": 1.0753, "step": 2284 }, { "epoch": 1.4083204930662558, "grad_norm": 1.939664363861084, "learning_rate": 4.870083344574531e-06, "loss": 1.0765, "step": 2285 }, { "epoch": 1.4089368258859785, "grad_norm": 1.9771808385849, "learning_rate": 4.860847800845229e-06, "loss": 1.0961, "step": 2286 }, { "epoch": 1.409553158705701, "grad_norm": 1.93572998046875, "learning_rate": 4.851618209939255e-06, "loss": 1.0768, "step": 2287 }, { "epoch": 1.4101694915254237, "grad_norm": 2.041668176651001, "learning_rate": 4.842394582547496e-06, "loss": 1.0864, "step": 2288 }, { "epoch": 1.4107858243451463, "grad_norm": 2.004164457321167, "learning_rate": 4.833176929353941e-06, "loss": 1.1344, "step": 2289 }, { "epoch": 1.411402157164869, "grad_norm": 1.9920216798782349, "learning_rate": 4.823965261035656e-06, "loss": 1.0988, "step": 2290 }, { "epoch": 1.4120184899845918, "grad_norm": 1.9728548526763916, "learning_rate": 4.8147595882627644e-06, "loss": 1.0926, "step": 2291 }, { "epoch": 1.4126348228043144, "grad_norm": 1.9731323719024658, "learning_rate": 4.805559921698464e-06, "loss": 1.0967, "step": 2292 }, { "epoch": 1.413251155624037, "grad_norm": 1.9998222589492798, "learning_rate": 4.796366271998984e-06, "loss": 1.1153, "step": 2293 }, { "epoch": 1.4138674884437596, "grad_norm": 1.9843158721923828, "learning_rate": 4.78717864981358e-06, "loss": 1.0948, "step": 2294 }, { "epoch": 1.4144838212634823, "grad_norm": 1.989874243736267, "learning_rate": 4.7779970657845334e-06, "loss": 1.1117, "step": 2295 }, { "epoch": 1.4151001540832049, "grad_norm": 2.004549503326416, "learning_rate": 4.768821530547133e-06, "loss": 1.0685, "step": 2296 }, { "epoch": 1.4157164869029275, "grad_norm": 1.9329215288162231, "learning_rate": 4.759652054729646e-06, "loss": 1.0733, "step": 2297 }, { "epoch": 1.4163328197226503, "grad_norm": 1.9636003971099854, "learning_rate": 4.750488648953348e-06, "loss": 1.1499, "step": 2298 }, { "epoch": 1.4169491525423727, "grad_norm": 1.9903494119644165, "learning_rate": 4.7413313238324556e-06, "loss": 1.0957, "step": 2299 }, { "epoch": 1.4175654853620956, "grad_norm": 1.9405877590179443, "learning_rate": 4.732180089974155e-06, "loss": 1.0654, "step": 2300 }, { "epoch": 1.4181818181818182, "grad_norm": 1.9986631870269775, "learning_rate": 4.723034957978584e-06, "loss": 1.0967, "step": 2301 }, { "epoch": 1.4187981510015408, "grad_norm": 2.0525617599487305, "learning_rate": 4.713895938438793e-06, "loss": 1.1517, "step": 2302 }, { "epoch": 1.4194144838212635, "grad_norm": 1.9770605564117432, "learning_rate": 4.704763041940765e-06, "loss": 1.0956, "step": 2303 }, { "epoch": 1.420030816640986, "grad_norm": 2.0559637546539307, "learning_rate": 4.695636279063399e-06, "loss": 1.1273, "step": 2304 }, { "epoch": 1.420647149460709, "grad_norm": 2.0033962726593018, "learning_rate": 4.686515660378469e-06, "loss": 1.1027, "step": 2305 }, { "epoch": 1.4212634822804313, "grad_norm": 1.9792624711990356, "learning_rate": 4.6774011964506435e-06, "loss": 1.0974, "step": 2306 }, { "epoch": 1.4218798151001542, "grad_norm": 1.9659953117370605, "learning_rate": 4.668292897837467e-06, "loss": 1.0936, "step": 2307 }, { "epoch": 1.4224961479198768, "grad_norm": 1.9983259439468384, "learning_rate": 4.659190775089326e-06, "loss": 1.1123, "step": 2308 }, { "epoch": 1.4231124807395994, "grad_norm": 2.0276520252227783, "learning_rate": 4.650094838749468e-06, "loss": 1.0455, "step": 2309 }, { "epoch": 1.423728813559322, "grad_norm": 1.9453918933868408, "learning_rate": 4.64100509935397e-06, "loss": 1.0417, "step": 2310 }, { "epoch": 1.4243451463790446, "grad_norm": 1.9814741611480713, "learning_rate": 4.631921567431729e-06, "loss": 1.1178, "step": 2311 }, { "epoch": 1.4249614791987673, "grad_norm": 1.9892122745513916, "learning_rate": 4.622844253504455e-06, "loss": 1.1144, "step": 2312 }, { "epoch": 1.4255778120184899, "grad_norm": 2.0533642768859863, "learning_rate": 4.613773168086657e-06, "loss": 1.0911, "step": 2313 }, { "epoch": 1.4261941448382127, "grad_norm": 2.0932095050811768, "learning_rate": 4.604708321685618e-06, "loss": 1.1211, "step": 2314 }, { "epoch": 1.4268104776579353, "grad_norm": 2.074625015258789, "learning_rate": 4.595649724801408e-06, "loss": 1.1017, "step": 2315 }, { "epoch": 1.427426810477658, "grad_norm": 2.015012502670288, "learning_rate": 4.586597387926851e-06, "loss": 1.0746, "step": 2316 }, { "epoch": 1.4280431432973806, "grad_norm": 1.9974778890609741, "learning_rate": 4.577551321547522e-06, "loss": 1.0522, "step": 2317 }, { "epoch": 1.4286594761171032, "grad_norm": 1.9459233283996582, "learning_rate": 4.568511536141736e-06, "loss": 1.022, "step": 2318 }, { "epoch": 1.4292758089368258, "grad_norm": 1.9822490215301514, "learning_rate": 4.5594780421805196e-06, "loss": 1.1083, "step": 2319 }, { "epoch": 1.4298921417565484, "grad_norm": 1.9983956813812256, "learning_rate": 4.550450850127626e-06, "loss": 1.1781, "step": 2320 }, { "epoch": 1.4305084745762713, "grad_norm": 1.9283967018127441, "learning_rate": 4.541429970439501e-06, "loss": 1.04, "step": 2321 }, { "epoch": 1.431124807395994, "grad_norm": 1.9519331455230713, "learning_rate": 4.532415413565285e-06, "loss": 1.1016, "step": 2322 }, { "epoch": 1.4317411402157165, "grad_norm": 2.0001869201660156, "learning_rate": 4.523407189946789e-06, "loss": 1.0898, "step": 2323 }, { "epoch": 1.4323574730354391, "grad_norm": 1.9758912324905396, "learning_rate": 4.514405310018493e-06, "loss": 1.0731, "step": 2324 }, { "epoch": 1.4329738058551618, "grad_norm": 1.995488166809082, "learning_rate": 4.505409784207517e-06, "loss": 1.0845, "step": 2325 }, { "epoch": 1.4335901386748844, "grad_norm": 1.987979769706726, "learning_rate": 4.496420622933635e-06, "loss": 1.0506, "step": 2326 }, { "epoch": 1.434206471494607, "grad_norm": 1.9867050647735596, "learning_rate": 4.487437836609247e-06, "loss": 1.1056, "step": 2327 }, { "epoch": 1.4348228043143298, "grad_norm": 2.1131980419158936, "learning_rate": 4.478461435639353e-06, "loss": 1.0898, "step": 2328 }, { "epoch": 1.4354391371340525, "grad_norm": 2.0385069847106934, "learning_rate": 4.46949143042158e-06, "loss": 1.1538, "step": 2329 }, { "epoch": 1.436055469953775, "grad_norm": 2.051150321960449, "learning_rate": 4.460527831346134e-06, "loss": 1.101, "step": 2330 }, { "epoch": 1.4366718027734977, "grad_norm": 2.022932529449463, "learning_rate": 4.451570648795797e-06, "loss": 1.0423, "step": 2331 }, { "epoch": 1.4372881355932203, "grad_norm": 2.00081205368042, "learning_rate": 4.442619893145924e-06, "loss": 1.089, "step": 2332 }, { "epoch": 1.437904468412943, "grad_norm": 2.0071496963500977, "learning_rate": 4.433675574764431e-06, "loss": 1.1115, "step": 2333 }, { "epoch": 1.4385208012326656, "grad_norm": 2.0162832736968994, "learning_rate": 4.42473770401176e-06, "loss": 1.0963, "step": 2334 }, { "epoch": 1.4391371340523884, "grad_norm": 1.9874433279037476, "learning_rate": 4.415806291240909e-06, "loss": 1.1052, "step": 2335 }, { "epoch": 1.4397534668721108, "grad_norm": 1.9707057476043701, "learning_rate": 4.406881346797375e-06, "loss": 1.0895, "step": 2336 }, { "epoch": 1.4403697996918337, "grad_norm": 1.9735043048858643, "learning_rate": 4.397962881019169e-06, "loss": 1.1431, "step": 2337 }, { "epoch": 1.4409861325115563, "grad_norm": 2.0012686252593994, "learning_rate": 4.389050904236806e-06, "loss": 1.0978, "step": 2338 }, { "epoch": 1.441602465331279, "grad_norm": 1.9406676292419434, "learning_rate": 4.380145426773269e-06, "loss": 1.0723, "step": 2339 }, { "epoch": 1.4422187981510015, "grad_norm": 2.010847330093384, "learning_rate": 4.371246458944019e-06, "loss": 1.145, "step": 2340 }, { "epoch": 1.4428351309707241, "grad_norm": 2.119178056716919, "learning_rate": 4.3623540110569935e-06, "loss": 1.0937, "step": 2341 }, { "epoch": 1.443451463790447, "grad_norm": 1.9569171667099, "learning_rate": 4.353468093412548e-06, "loss": 1.0435, "step": 2342 }, { "epoch": 1.4440677966101694, "grad_norm": 2.1147449016571045, "learning_rate": 4.344588716303495e-06, "loss": 1.1512, "step": 2343 }, { "epoch": 1.4446841294298922, "grad_norm": 1.9105144739151, "learning_rate": 4.3357158900150675e-06, "loss": 1.0252, "step": 2344 }, { "epoch": 1.4453004622496148, "grad_norm": 1.9496625661849976, "learning_rate": 4.3268496248249e-06, "loss": 1.0455, "step": 2345 }, { "epoch": 1.4459167950693375, "grad_norm": 1.9688549041748047, "learning_rate": 4.317989931003039e-06, "loss": 1.043, "step": 2346 }, { "epoch": 1.44653312788906, "grad_norm": 1.982770562171936, "learning_rate": 4.3091368188119144e-06, "loss": 1.0774, "step": 2347 }, { "epoch": 1.4471494607087827, "grad_norm": 1.9814218282699585, "learning_rate": 4.300290298506333e-06, "loss": 1.077, "step": 2348 }, { "epoch": 1.4477657935285053, "grad_norm": 1.97969388961792, "learning_rate": 4.291450380333466e-06, "loss": 1.0543, "step": 2349 }, { "epoch": 1.448382126348228, "grad_norm": 1.9842337369918823, "learning_rate": 4.282617074532841e-06, "loss": 1.0816, "step": 2350 }, { "epoch": 1.4489984591679508, "grad_norm": 2.033602714538574, "learning_rate": 4.273790391336315e-06, "loss": 1.1138, "step": 2351 }, { "epoch": 1.4496147919876734, "grad_norm": 2.0117831230163574, "learning_rate": 4.264970340968087e-06, "loss": 1.1013, "step": 2352 }, { "epoch": 1.450231124807396, "grad_norm": 2.0144495964050293, "learning_rate": 4.256156933644664e-06, "loss": 1.13, "step": 2353 }, { "epoch": 1.4508474576271186, "grad_norm": 2.077676296234131, "learning_rate": 4.247350179574863e-06, "loss": 1.1497, "step": 2354 }, { "epoch": 1.4514637904468413, "grad_norm": 1.979999303817749, "learning_rate": 4.2385500889597965e-06, "loss": 1.0701, "step": 2355 }, { "epoch": 1.4520801232665639, "grad_norm": 2.0381157398223877, "learning_rate": 4.229756671992848e-06, "loss": 1.1323, "step": 2356 }, { "epoch": 1.4526964560862865, "grad_norm": 1.950457215309143, "learning_rate": 4.22096993885968e-06, "loss": 1.0363, "step": 2357 }, { "epoch": 1.4533127889060093, "grad_norm": 2.0214462280273438, "learning_rate": 4.212189899738213e-06, "loss": 1.0999, "step": 2358 }, { "epoch": 1.453929121725732, "grad_norm": 2.0251963138580322, "learning_rate": 4.203416564798608e-06, "loss": 1.1234, "step": 2359 }, { "epoch": 1.4545454545454546, "grad_norm": 2.0867717266082764, "learning_rate": 4.194649944203266e-06, "loss": 1.1319, "step": 2360 }, { "epoch": 1.4551617873651772, "grad_norm": 1.9186941385269165, "learning_rate": 4.185890048106811e-06, "loss": 1.0551, "step": 2361 }, { "epoch": 1.4557781201848998, "grad_norm": 1.9851503372192383, "learning_rate": 4.177136886656067e-06, "loss": 1.0551, "step": 2362 }, { "epoch": 1.4563944530046224, "grad_norm": 2.051083564758301, "learning_rate": 4.16839046999007e-06, "loss": 1.0502, "step": 2363 }, { "epoch": 1.457010785824345, "grad_norm": 2.010798692703247, "learning_rate": 4.159650808240041e-06, "loss": 1.0508, "step": 2364 }, { "epoch": 1.457627118644068, "grad_norm": 2.0349762439727783, "learning_rate": 4.150917911529364e-06, "loss": 1.0644, "step": 2365 }, { "epoch": 1.4582434514637905, "grad_norm": 2.0556674003601074, "learning_rate": 4.1421917899736064e-06, "loss": 1.0775, "step": 2366 }, { "epoch": 1.4588597842835132, "grad_norm": 1.9564342498779297, "learning_rate": 4.133472453680479e-06, "loss": 1.0066, "step": 2367 }, { "epoch": 1.4594761171032358, "grad_norm": 2.0156657695770264, "learning_rate": 4.124759912749825e-06, "loss": 1.1222, "step": 2368 }, { "epoch": 1.4600924499229584, "grad_norm": 2.0361404418945312, "learning_rate": 4.116054177273628e-06, "loss": 1.0878, "step": 2369 }, { "epoch": 1.460708782742681, "grad_norm": 2.0296707153320312, "learning_rate": 4.107355257335985e-06, "loss": 1.0902, "step": 2370 }, { "epoch": 1.4613251155624036, "grad_norm": 1.9915072917938232, "learning_rate": 4.098663163013091e-06, "loss": 1.1113, "step": 2371 }, { "epoch": 1.4619414483821265, "grad_norm": 2.008096218109131, "learning_rate": 4.089977904373251e-06, "loss": 1.079, "step": 2372 }, { "epoch": 1.4625577812018489, "grad_norm": 2.0008907318115234, "learning_rate": 4.081299491476835e-06, "loss": 1.1123, "step": 2373 }, { "epoch": 1.4631741140215717, "grad_norm": 1.9906007051467896, "learning_rate": 4.072627934376292e-06, "loss": 1.0927, "step": 2374 }, { "epoch": 1.4637904468412943, "grad_norm": 1.9914522171020508, "learning_rate": 4.063963243116134e-06, "loss": 1.0495, "step": 2375 }, { "epoch": 1.464406779661017, "grad_norm": 2.0146689414978027, "learning_rate": 4.055305427732907e-06, "loss": 1.1062, "step": 2376 }, { "epoch": 1.4650231124807396, "grad_norm": 2.0080623626708984, "learning_rate": 4.046654498255199e-06, "loss": 1.054, "step": 2377 }, { "epoch": 1.4656394453004622, "grad_norm": 2.0446975231170654, "learning_rate": 4.0380104647036345e-06, "loss": 1.1305, "step": 2378 }, { "epoch": 1.466255778120185, "grad_norm": 2.0463545322418213, "learning_rate": 4.029373337090827e-06, "loss": 1.0555, "step": 2379 }, { "epoch": 1.4668721109399074, "grad_norm": 2.1065144538879395, "learning_rate": 4.0207431254214065e-06, "loss": 1.1401, "step": 2380 }, { "epoch": 1.4674884437596303, "grad_norm": 2.098684310913086, "learning_rate": 4.012119839691993e-06, "loss": 1.0773, "step": 2381 }, { "epoch": 1.468104776579353, "grad_norm": 2.1392428874969482, "learning_rate": 4.003503489891169e-06, "loss": 1.0628, "step": 2382 }, { "epoch": 1.4687211093990755, "grad_norm": 2.023073673248291, "learning_rate": 3.9948940859994964e-06, "loss": 1.067, "step": 2383 }, { "epoch": 1.4693374422187981, "grad_norm": 1.9942257404327393, "learning_rate": 3.98629163798949e-06, "loss": 1.0928, "step": 2384 }, { "epoch": 1.4699537750385208, "grad_norm": 2.004473924636841, "learning_rate": 3.9776961558256045e-06, "loss": 1.0936, "step": 2385 }, { "epoch": 1.4705701078582434, "grad_norm": 2.0050666332244873, "learning_rate": 3.969107649464226e-06, "loss": 1.0699, "step": 2386 }, { "epoch": 1.471186440677966, "grad_norm": 1.995571494102478, "learning_rate": 3.960526128853665e-06, "loss": 1.0629, "step": 2387 }, { "epoch": 1.4718027734976888, "grad_norm": 2.0142529010772705, "learning_rate": 3.951951603934129e-06, "loss": 1.0923, "step": 2388 }, { "epoch": 1.4724191063174115, "grad_norm": 2.0067546367645264, "learning_rate": 3.9433840846377325e-06, "loss": 1.0822, "step": 2389 }, { "epoch": 1.473035439137134, "grad_norm": 2.150789976119995, "learning_rate": 3.934823580888473e-06, "loss": 1.1278, "step": 2390 }, { "epoch": 1.4736517719568567, "grad_norm": 2.0594303607940674, "learning_rate": 3.926270102602219e-06, "loss": 1.0857, "step": 2391 }, { "epoch": 1.4742681047765793, "grad_norm": 1.9781386852264404, "learning_rate": 3.917723659686708e-06, "loss": 1.0203, "step": 2392 }, { "epoch": 1.474884437596302, "grad_norm": 2.027792453765869, "learning_rate": 3.909184262041517e-06, "loss": 1.0994, "step": 2393 }, { "epoch": 1.4755007704160246, "grad_norm": 1.9786205291748047, "learning_rate": 3.90065191955807e-06, "loss": 1.0582, "step": 2394 }, { "epoch": 1.4761171032357474, "grad_norm": 2.071662664413452, "learning_rate": 3.8921266421196166e-06, "loss": 1.114, "step": 2395 }, { "epoch": 1.47673343605547, "grad_norm": 2.0489635467529297, "learning_rate": 3.883608439601227e-06, "loss": 1.1111, "step": 2396 }, { "epoch": 1.4773497688751926, "grad_norm": 2.0163257122039795, "learning_rate": 3.8750973218697685e-06, "loss": 1.0864, "step": 2397 }, { "epoch": 1.4779661016949153, "grad_norm": 1.9999123811721802, "learning_rate": 3.866593298783914e-06, "loss": 1.1, "step": 2398 }, { "epoch": 1.4785824345146379, "grad_norm": 2.090106248855591, "learning_rate": 3.8580963801941e-06, "loss": 1.1381, "step": 2399 }, { "epoch": 1.4791987673343605, "grad_norm": 2.038511037826538, "learning_rate": 3.84960657594255e-06, "loss": 1.0475, "step": 2400 }, { "epoch": 1.4798151001540831, "grad_norm": 2.086063861846924, "learning_rate": 3.841123895863242e-06, "loss": 1.1042, "step": 2401 }, { "epoch": 1.480431432973806, "grad_norm": 2.0904266834259033, "learning_rate": 3.832648349781893e-06, "loss": 1.0759, "step": 2402 }, { "epoch": 1.4810477657935284, "grad_norm": 2.0258145332336426, "learning_rate": 3.824179947515974e-06, "loss": 1.022, "step": 2403 }, { "epoch": 1.4816640986132512, "grad_norm": 2.048868417739868, "learning_rate": 3.815718698874672e-06, "loss": 1.085, "step": 2404 }, { "epoch": 1.4822804314329738, "grad_norm": 1.9725048542022705, "learning_rate": 3.8072646136588796e-06, "loss": 0.9925, "step": 2405 }, { "epoch": 1.4828967642526965, "grad_norm": 2.0199265480041504, "learning_rate": 3.7988177016612038e-06, "loss": 1.0456, "step": 2406 }, { "epoch": 1.483513097072419, "grad_norm": 2.0674359798431396, "learning_rate": 3.7903779726659406e-06, "loss": 1.1364, "step": 2407 }, { "epoch": 1.4841294298921417, "grad_norm": 1.9948780536651611, "learning_rate": 3.781945436449054e-06, "loss": 1.0705, "step": 2408 }, { "epoch": 1.4847457627118645, "grad_norm": 1.965599536895752, "learning_rate": 3.7735201027782e-06, "loss": 1.1074, "step": 2409 }, { "epoch": 1.485362095531587, "grad_norm": 2.0250203609466553, "learning_rate": 3.7651019814126656e-06, "loss": 1.1243, "step": 2410 }, { "epoch": 1.4859784283513098, "grad_norm": 2.04533052444458, "learning_rate": 3.7566910821034007e-06, "loss": 1.034, "step": 2411 }, { "epoch": 1.4865947611710324, "grad_norm": 2.02551531791687, "learning_rate": 3.748287414592987e-06, "loss": 1.1094, "step": 2412 }, { "epoch": 1.487211093990755, "grad_norm": 2.0208184719085693, "learning_rate": 3.739890988615621e-06, "loss": 1.1417, "step": 2413 }, { "epoch": 1.4878274268104776, "grad_norm": 2.0637047290802, "learning_rate": 3.731501813897117e-06, "loss": 1.1381, "step": 2414 }, { "epoch": 1.4884437596302003, "grad_norm": 2.043926239013672, "learning_rate": 3.723119900154899e-06, "loss": 1.0641, "step": 2415 }, { "epoch": 1.4890600924499229, "grad_norm": 1.989729881286621, "learning_rate": 3.7147452570979627e-06, "loss": 1.0226, "step": 2416 }, { "epoch": 1.4896764252696455, "grad_norm": 2.012901782989502, "learning_rate": 3.7063778944268934e-06, "loss": 1.1066, "step": 2417 }, { "epoch": 1.4902927580893683, "grad_norm": 2.0135879516601562, "learning_rate": 3.6980178218338435e-06, "loss": 1.1223, "step": 2418 }, { "epoch": 1.490909090909091, "grad_norm": 1.9984265565872192, "learning_rate": 3.6896650490025133e-06, "loss": 1.0812, "step": 2419 }, { "epoch": 1.4915254237288136, "grad_norm": 2.0421345233917236, "learning_rate": 3.681319585608154e-06, "loss": 1.132, "step": 2420 }, { "epoch": 1.4921417565485362, "grad_norm": 2.091083288192749, "learning_rate": 3.672981441317549e-06, "loss": 1.0995, "step": 2421 }, { "epoch": 1.4927580893682588, "grad_norm": 2.0125412940979004, "learning_rate": 3.6646506257890046e-06, "loss": 1.0933, "step": 2422 }, { "epoch": 1.4933744221879814, "grad_norm": 2.03678297996521, "learning_rate": 3.656327148672336e-06, "loss": 1.0574, "step": 2423 }, { "epoch": 1.493990755007704, "grad_norm": 2.0228943824768066, "learning_rate": 3.6480110196088624e-06, "loss": 1.1305, "step": 2424 }, { "epoch": 1.494607087827427, "grad_norm": 2.0266366004943848, "learning_rate": 3.6397022482313804e-06, "loss": 1.0596, "step": 2425 }, { "epoch": 1.4952234206471495, "grad_norm": 2.0759503841400146, "learning_rate": 3.6314008441641768e-06, "loss": 1.1205, "step": 2426 }, { "epoch": 1.4958397534668721, "grad_norm": 2.0208513736724854, "learning_rate": 3.6231068170229967e-06, "loss": 1.0902, "step": 2427 }, { "epoch": 1.4964560862865948, "grad_norm": 2.000516176223755, "learning_rate": 3.6148201764150457e-06, "loss": 1.0819, "step": 2428 }, { "epoch": 1.4970724191063174, "grad_norm": 2.0682079792022705, "learning_rate": 3.6065409319389733e-06, "loss": 1.1177, "step": 2429 }, { "epoch": 1.49768875192604, "grad_norm": 2.043426752090454, "learning_rate": 3.5982690931848517e-06, "loss": 1.1282, "step": 2430 }, { "epoch": 1.4983050847457626, "grad_norm": 2.034386396408081, "learning_rate": 3.5900046697341872e-06, "loss": 1.035, "step": 2431 }, { "epoch": 1.4989214175654855, "grad_norm": 2.0119788646698, "learning_rate": 3.5817476711598908e-06, "loss": 1.0864, "step": 2432 }, { "epoch": 1.499537750385208, "grad_norm": 2.0708999633789062, "learning_rate": 3.573498107026275e-06, "loss": 1.143, "step": 2433 }, { "epoch": 1.5001540832049307, "grad_norm": 2.0691657066345215, "learning_rate": 3.56525598688904e-06, "loss": 1.1566, "step": 2434 }, { "epoch": 1.5007704160246533, "grad_norm": 2.032261371612549, "learning_rate": 3.557021320295266e-06, "loss": 1.1157, "step": 2435 }, { "epoch": 1.501386748844376, "grad_norm": 1.9695446491241455, "learning_rate": 3.5487941167833916e-06, "loss": 1.0407, "step": 2436 }, { "epoch": 1.5020030816640986, "grad_norm": 2.0456535816192627, "learning_rate": 3.5405743858832175e-06, "loss": 1.1133, "step": 2437 }, { "epoch": 1.5026194144838212, "grad_norm": 1.9807504415512085, "learning_rate": 3.5323621371158923e-06, "loss": 1.0403, "step": 2438 }, { "epoch": 1.503235747303544, "grad_norm": 2.02797269821167, "learning_rate": 3.5241573799938824e-06, "loss": 1.092, "step": 2439 }, { "epoch": 1.5038520801232664, "grad_norm": 2.0431435108184814, "learning_rate": 3.5159601240209963e-06, "loss": 1.1165, "step": 2440 }, { "epoch": 1.5044684129429893, "grad_norm": 2.0282094478607178, "learning_rate": 3.5077703786923444e-06, "loss": 1.1212, "step": 2441 }, { "epoch": 1.505084745762712, "grad_norm": 2.00825834274292, "learning_rate": 3.499588153494331e-06, "loss": 1.0681, "step": 2442 }, { "epoch": 1.5057010785824345, "grad_norm": 2.060857057571411, "learning_rate": 3.491413457904659e-06, "loss": 1.1248, "step": 2443 }, { "epoch": 1.5063174114021571, "grad_norm": 2.032208204269409, "learning_rate": 3.483246301392308e-06, "loss": 1.0811, "step": 2444 }, { "epoch": 1.5069337442218798, "grad_norm": 2.069103956222534, "learning_rate": 3.4750866934175156e-06, "loss": 1.0422, "step": 2445 }, { "epoch": 1.5075500770416026, "grad_norm": 2.0976555347442627, "learning_rate": 3.466934643431795e-06, "loss": 1.0688, "step": 2446 }, { "epoch": 1.508166409861325, "grad_norm": 2.0220611095428467, "learning_rate": 3.458790160877884e-06, "loss": 1.1163, "step": 2447 }, { "epoch": 1.5087827426810478, "grad_norm": 2.109142780303955, "learning_rate": 3.4506532551897665e-06, "loss": 1.0614, "step": 2448 }, { "epoch": 1.5093990755007705, "grad_norm": 2.019078016281128, "learning_rate": 3.442523935792651e-06, "loss": 1.0707, "step": 2449 }, { "epoch": 1.510015408320493, "grad_norm": 2.07504940032959, "learning_rate": 3.4344022121029476e-06, "loss": 1.0848, "step": 2450 }, { "epoch": 1.5106317411402157, "grad_norm": 2.0517802238464355, "learning_rate": 3.4262880935282763e-06, "loss": 1.0282, "step": 2451 }, { "epoch": 1.5112480739599383, "grad_norm": 2.0405373573303223, "learning_rate": 3.4181815894674573e-06, "loss": 1.0541, "step": 2452 }, { "epoch": 1.5118644067796612, "grad_norm": 2.1008715629577637, "learning_rate": 3.4100827093104694e-06, "loss": 1.0467, "step": 2453 }, { "epoch": 1.5124807395993836, "grad_norm": 2.045539379119873, "learning_rate": 3.401991462438474e-06, "loss": 1.0407, "step": 2454 }, { "epoch": 1.5130970724191064, "grad_norm": 2.0681421756744385, "learning_rate": 3.3939078582237926e-06, "loss": 1.1222, "step": 2455 }, { "epoch": 1.5137134052388288, "grad_norm": 2.0736196041107178, "learning_rate": 3.3858319060298826e-06, "loss": 1.0573, "step": 2456 }, { "epoch": 1.5143297380585516, "grad_norm": 2.0478904247283936, "learning_rate": 3.377763615211348e-06, "loss": 1.0782, "step": 2457 }, { "epoch": 1.5149460708782743, "grad_norm": 2.052022695541382, "learning_rate": 3.369702995113915e-06, "loss": 1.1259, "step": 2458 }, { "epoch": 1.5155624036979969, "grad_norm": 2.041614055633545, "learning_rate": 3.3616500550744247e-06, "loss": 1.1085, "step": 2459 }, { "epoch": 1.5161787365177197, "grad_norm": 2.125440835952759, "learning_rate": 3.3536048044208212e-06, "loss": 1.0784, "step": 2460 }, { "epoch": 1.5167950693374421, "grad_norm": 1.9957958459854126, "learning_rate": 3.3455672524721484e-06, "loss": 1.0534, "step": 2461 }, { "epoch": 1.517411402157165, "grad_norm": 2.0708723068237305, "learning_rate": 3.337537408538517e-06, "loss": 1.1045, "step": 2462 }, { "epoch": 1.5180277349768874, "grad_norm": 2.0071964263916016, "learning_rate": 3.3295152819211254e-06, "loss": 1.0781, "step": 2463 }, { "epoch": 1.5186440677966102, "grad_norm": 2.0319364070892334, "learning_rate": 3.3215008819122253e-06, "loss": 1.0811, "step": 2464 }, { "epoch": 1.5192604006163328, "grad_norm": 2.1418673992156982, "learning_rate": 3.313494217795119e-06, "loss": 1.13, "step": 2465 }, { "epoch": 1.5198767334360554, "grad_norm": 2.047104597091675, "learning_rate": 3.3054952988441548e-06, "loss": 1.0747, "step": 2466 }, { "epoch": 1.520493066255778, "grad_norm": 2.033287763595581, "learning_rate": 3.2975041343246937e-06, "loss": 1.0592, "step": 2467 }, { "epoch": 1.5211093990755007, "grad_norm": 2.039403200149536, "learning_rate": 3.28952073349313e-06, "loss": 1.0629, "step": 2468 }, { "epoch": 1.5217257318952235, "grad_norm": 2.0048320293426514, "learning_rate": 3.2815451055968616e-06, "loss": 1.0823, "step": 2469 }, { "epoch": 1.522342064714946, "grad_norm": 2.0329034328460693, "learning_rate": 3.273577259874271e-06, "loss": 1.0628, "step": 2470 }, { "epoch": 1.5229583975346688, "grad_norm": 1.9894753694534302, "learning_rate": 3.2656172055547475e-06, "loss": 1.0698, "step": 2471 }, { "epoch": 1.5235747303543914, "grad_norm": 2.0500733852386475, "learning_rate": 3.257664951858642e-06, "loss": 1.1109, "step": 2472 }, { "epoch": 1.524191063174114, "grad_norm": 2.060892343521118, "learning_rate": 3.2497205079972673e-06, "loss": 1.0889, "step": 2473 }, { "epoch": 1.5248073959938366, "grad_norm": 2.0931618213653564, "learning_rate": 3.2417838831728952e-06, "loss": 1.0866, "step": 2474 }, { "epoch": 1.5254237288135593, "grad_norm": 2.065467596054077, "learning_rate": 3.2338550865787446e-06, "loss": 1.1565, "step": 2475 }, { "epoch": 1.526040061633282, "grad_norm": 2.065355062484741, "learning_rate": 3.225934127398952e-06, "loss": 1.1072, "step": 2476 }, { "epoch": 1.5266563944530045, "grad_norm": 2.0444343090057373, "learning_rate": 3.2180210148085935e-06, "loss": 1.0762, "step": 2477 }, { "epoch": 1.5272727272727273, "grad_norm": 2.080658435821533, "learning_rate": 3.2101157579736486e-06, "loss": 1.148, "step": 2478 }, { "epoch": 1.52788906009245, "grad_norm": 2.0626893043518066, "learning_rate": 3.202218366050992e-06, "loss": 1.1127, "step": 2479 }, { "epoch": 1.5285053929121726, "grad_norm": 2.0607287883758545, "learning_rate": 3.194328848188395e-06, "loss": 1.0625, "step": 2480 }, { "epoch": 1.5291217257318952, "grad_norm": 2.038128614425659, "learning_rate": 3.1864472135245084e-06, "loss": 1.0852, "step": 2481 }, { "epoch": 1.5297380585516178, "grad_norm": 2.075026035308838, "learning_rate": 3.1785734711888417e-06, "loss": 1.1337, "step": 2482 }, { "epoch": 1.5303543913713407, "grad_norm": 2.0160279273986816, "learning_rate": 3.1707076303017825e-06, "loss": 1.0769, "step": 2483 }, { "epoch": 1.530970724191063, "grad_norm": 2.04659104347229, "learning_rate": 3.1628496999745427e-06, "loss": 1.0953, "step": 2484 }, { "epoch": 1.531587057010786, "grad_norm": 2.1809027194976807, "learning_rate": 3.1549996893091873e-06, "loss": 1.1065, "step": 2485 }, { "epoch": 1.5322033898305085, "grad_norm": 2.0479342937469482, "learning_rate": 3.1471576073986054e-06, "loss": 1.1197, "step": 2486 }, { "epoch": 1.5328197226502311, "grad_norm": 1.9800511598587036, "learning_rate": 3.1393234633264912e-06, "loss": 1.0059, "step": 2487 }, { "epoch": 1.5334360554699538, "grad_norm": 2.133227586746216, "learning_rate": 3.1314972661673572e-06, "loss": 1.0861, "step": 2488 }, { "epoch": 1.5340523882896764, "grad_norm": 2.0188205242156982, "learning_rate": 3.123679024986506e-06, "loss": 1.0496, "step": 2489 }, { "epoch": 1.5346687211093992, "grad_norm": 2.045571804046631, "learning_rate": 3.115868748840023e-06, "loss": 1.0771, "step": 2490 }, { "epoch": 1.5352850539291216, "grad_norm": 2.0303406715393066, "learning_rate": 3.108066446774769e-06, "loss": 1.0436, "step": 2491 }, { "epoch": 1.5359013867488445, "grad_norm": 2.0693485736846924, "learning_rate": 3.1002721278283732e-06, "loss": 1.0868, "step": 2492 }, { "epoch": 1.5365177195685669, "grad_norm": 2.0274434089660645, "learning_rate": 3.0924858010292037e-06, "loss": 1.0462, "step": 2493 }, { "epoch": 1.5371340523882897, "grad_norm": 2.047177314758301, "learning_rate": 3.0847074753963847e-06, "loss": 1.0488, "step": 2494 }, { "epoch": 1.5377503852080123, "grad_norm": 2.143435001373291, "learning_rate": 3.0769371599397677e-06, "loss": 1.058, "step": 2495 }, { "epoch": 1.538366718027735, "grad_norm": 2.060774803161621, "learning_rate": 3.069174863659926e-06, "loss": 1.0868, "step": 2496 }, { "epoch": 1.5389830508474578, "grad_norm": 2.0557701587677, "learning_rate": 3.0614205955481424e-06, "loss": 1.0746, "step": 2497 }, { "epoch": 1.5395993836671802, "grad_norm": 2.0952789783477783, "learning_rate": 3.0536743645864063e-06, "loss": 1.1119, "step": 2498 }, { "epoch": 1.540215716486903, "grad_norm": 1.9777603149414062, "learning_rate": 3.0459361797473875e-06, "loss": 1.0682, "step": 2499 }, { "epoch": 1.5408320493066254, "grad_norm": 2.048640489578247, "learning_rate": 3.0382060499944423e-06, "loss": 1.0517, "step": 2500 }, { "epoch": 1.5414483821263483, "grad_norm": 1.9787542819976807, "learning_rate": 3.030483984281598e-06, "loss": 1.0724, "step": 2501 }, { "epoch": 1.542064714946071, "grad_norm": 2.037224292755127, "learning_rate": 3.022769991553537e-06, "loss": 1.0798, "step": 2502 }, { "epoch": 1.5426810477657935, "grad_norm": 2.0846683979034424, "learning_rate": 3.0150640807455955e-06, "loss": 1.047, "step": 2503 }, { "epoch": 1.5432973805855161, "grad_norm": 2.1135592460632324, "learning_rate": 3.0073662607837395e-06, "loss": 1.1323, "step": 2504 }, { "epoch": 1.5439137134052388, "grad_norm": 2.065659761428833, "learning_rate": 2.9996765405845708e-06, "loss": 1.085, "step": 2505 }, { "epoch": 1.5445300462249616, "grad_norm": 2.0374462604522705, "learning_rate": 2.9919949290553096e-06, "loss": 1.0739, "step": 2506 }, { "epoch": 1.545146379044684, "grad_norm": 2.022367238998413, "learning_rate": 2.9843214350937732e-06, "loss": 1.0408, "step": 2507 }, { "epoch": 1.5457627118644068, "grad_norm": 2.065869092941284, "learning_rate": 2.9766560675883916e-06, "loss": 1.0498, "step": 2508 }, { "epoch": 1.5463790446841295, "grad_norm": 2.05846905708313, "learning_rate": 2.9689988354181742e-06, "loss": 1.0666, "step": 2509 }, { "epoch": 1.546995377503852, "grad_norm": 2.050328254699707, "learning_rate": 2.9613497474527e-06, "loss": 1.1037, "step": 2510 }, { "epoch": 1.5476117103235747, "grad_norm": 2.0447909832000732, "learning_rate": 2.9537088125521253e-06, "loss": 1.0924, "step": 2511 }, { "epoch": 1.5482280431432973, "grad_norm": 2.001426935195923, "learning_rate": 2.9460760395671585e-06, "loss": 1.0767, "step": 2512 }, { "epoch": 1.5488443759630202, "grad_norm": 2.0162224769592285, "learning_rate": 2.938451437339047e-06, "loss": 1.0898, "step": 2513 }, { "epoch": 1.5494607087827426, "grad_norm": 2.0276401042938232, "learning_rate": 2.930835014699588e-06, "loss": 1.0585, "step": 2514 }, { "epoch": 1.5500770416024654, "grad_norm": 2.0197055339813232, "learning_rate": 2.923226780471097e-06, "loss": 1.0498, "step": 2515 }, { "epoch": 1.550693374422188, "grad_norm": 2.0847554206848145, "learning_rate": 2.9156267434663965e-06, "loss": 1.0608, "step": 2516 }, { "epoch": 1.5513097072419106, "grad_norm": 2.0684635639190674, "learning_rate": 2.9080349124888265e-06, "loss": 1.0616, "step": 2517 }, { "epoch": 1.5519260400616333, "grad_norm": 2.0536725521087646, "learning_rate": 2.9004512963322173e-06, "loss": 1.0611, "step": 2518 }, { "epoch": 1.5525423728813559, "grad_norm": 2.1008455753326416, "learning_rate": 2.8928759037808752e-06, "loss": 1.1284, "step": 2519 }, { "epoch": 1.5531587057010787, "grad_norm": 2.029223680496216, "learning_rate": 2.8853087436096005e-06, "loss": 1.081, "step": 2520 }, { "epoch": 1.5537750385208011, "grad_norm": 2.063253879547119, "learning_rate": 2.8777498245836378e-06, "loss": 1.1068, "step": 2521 }, { "epoch": 1.554391371340524, "grad_norm": 2.055957317352295, "learning_rate": 2.870199155458695e-06, "loss": 1.0057, "step": 2522 }, { "epoch": 1.5550077041602466, "grad_norm": 2.069962501525879, "learning_rate": 2.8626567449809263e-06, "loss": 1.0786, "step": 2523 }, { "epoch": 1.5556240369799692, "grad_norm": 2.043140411376953, "learning_rate": 2.8551226018869105e-06, "loss": 1.1679, "step": 2524 }, { "epoch": 1.5562403697996918, "grad_norm": 2.003110885620117, "learning_rate": 2.847596734903659e-06, "loss": 1.0664, "step": 2525 }, { "epoch": 1.5568567026194144, "grad_norm": 2.0459177494049072, "learning_rate": 2.8400791527485936e-06, "loss": 1.0559, "step": 2526 }, { "epoch": 1.5574730354391373, "grad_norm": 2.0664639472961426, "learning_rate": 2.8325698641295386e-06, "loss": 1.074, "step": 2527 }, { "epoch": 1.5580893682588597, "grad_norm": 2.117327928543091, "learning_rate": 2.825068877744712e-06, "loss": 1.0991, "step": 2528 }, { "epoch": 1.5587057010785825, "grad_norm": 2.0350427627563477, "learning_rate": 2.8175762022827203e-06, "loss": 1.0853, "step": 2529 }, { "epoch": 1.559322033898305, "grad_norm": 2.055994749069214, "learning_rate": 2.8100918464225304e-06, "loss": 1.072, "step": 2530 }, { "epoch": 1.5599383667180278, "grad_norm": 2.0636258125305176, "learning_rate": 2.802615818833484e-06, "loss": 1.0691, "step": 2531 }, { "epoch": 1.5605546995377504, "grad_norm": 2.0166168212890625, "learning_rate": 2.7951481281752734e-06, "loss": 1.0065, "step": 2532 }, { "epoch": 1.561171032357473, "grad_norm": 2.07016658782959, "learning_rate": 2.7876887830979328e-06, "loss": 1.0629, "step": 2533 }, { "epoch": 1.5617873651771959, "grad_norm": 2.139784336090088, "learning_rate": 2.7802377922418287e-06, "loss": 1.0975, "step": 2534 }, { "epoch": 1.5624036979969183, "grad_norm": 2.0561773777008057, "learning_rate": 2.772795164237656e-06, "loss": 0.9927, "step": 2535 }, { "epoch": 1.563020030816641, "grad_norm": 2.033285617828369, "learning_rate": 2.7653609077064102e-06, "loss": 1.0223, "step": 2536 }, { "epoch": 1.5636363636363635, "grad_norm": 2.023505926132202, "learning_rate": 2.757935031259402e-06, "loss": 1.0524, "step": 2537 }, { "epoch": 1.5642526964560863, "grad_norm": 1.9613449573516846, "learning_rate": 2.7505175434982324e-06, "loss": 1.0793, "step": 2538 }, { "epoch": 1.564869029275809, "grad_norm": 2.0730087757110596, "learning_rate": 2.7431084530147834e-06, "loss": 1.0623, "step": 2539 }, { "epoch": 1.5654853620955316, "grad_norm": 1.993596076965332, "learning_rate": 2.7357077683912137e-06, "loss": 1.0267, "step": 2540 }, { "epoch": 1.5661016949152542, "grad_norm": 2.030557155609131, "learning_rate": 2.728315498199937e-06, "loss": 1.0973, "step": 2541 }, { "epoch": 1.5667180277349768, "grad_norm": 2.0284104347229004, "learning_rate": 2.720931651003629e-06, "loss": 0.9879, "step": 2542 }, { "epoch": 1.5673343605546997, "grad_norm": 2.0813801288604736, "learning_rate": 2.713556235355209e-06, "loss": 1.1125, "step": 2543 }, { "epoch": 1.567950693374422, "grad_norm": 2.0294036865234375, "learning_rate": 2.706189259797818e-06, "loss": 1.0105, "step": 2544 }, { "epoch": 1.568567026194145, "grad_norm": 2.045482635498047, "learning_rate": 2.6988307328648376e-06, "loss": 1.0376, "step": 2545 }, { "epoch": 1.5691833590138675, "grad_norm": 2.1052463054656982, "learning_rate": 2.691480663079855e-06, "loss": 1.1171, "step": 2546 }, { "epoch": 1.5697996918335901, "grad_norm": 2.024144411087036, "learning_rate": 2.684139058956655e-06, "loss": 1.0255, "step": 2547 }, { "epoch": 1.5704160246533128, "grad_norm": 2.093296527862549, "learning_rate": 2.6768059289992264e-06, "loss": 1.0812, "step": 2548 }, { "epoch": 1.5710323574730354, "grad_norm": 2.0114150047302246, "learning_rate": 2.669481281701739e-06, "loss": 1.1084, "step": 2549 }, { "epoch": 1.5716486902927582, "grad_norm": 2.0552802085876465, "learning_rate": 2.6621651255485293e-06, "loss": 1.069, "step": 2550 }, { "epoch": 1.5722650231124806, "grad_norm": 2.050388813018799, "learning_rate": 2.654857469014113e-06, "loss": 1.0159, "step": 2551 }, { "epoch": 1.5728813559322035, "grad_norm": 2.0489845275878906, "learning_rate": 2.647558320563152e-06, "loss": 1.0692, "step": 2552 }, { "epoch": 1.573497688751926, "grad_norm": 2.136768341064453, "learning_rate": 2.6402676886504485e-06, "loss": 1.1032, "step": 2553 }, { "epoch": 1.5741140215716487, "grad_norm": 2.085974931716919, "learning_rate": 2.632985581720947e-06, "loss": 1.0858, "step": 2554 }, { "epoch": 1.5747303543913713, "grad_norm": 2.050628662109375, "learning_rate": 2.6257120082097186e-06, "loss": 1.0532, "step": 2555 }, { "epoch": 1.575346687211094, "grad_norm": 2.018427610397339, "learning_rate": 2.6184469765419375e-06, "loss": 1.0422, "step": 2556 }, { "epoch": 1.5759630200308168, "grad_norm": 2.049408197402954, "learning_rate": 2.6111904951329025e-06, "loss": 0.9994, "step": 2557 }, { "epoch": 1.5765793528505392, "grad_norm": 2.002723217010498, "learning_rate": 2.6039425723879928e-06, "loss": 1.065, "step": 2558 }, { "epoch": 1.577195685670262, "grad_norm": 2.0850300788879395, "learning_rate": 2.5967032167026806e-06, "loss": 1.0801, "step": 2559 }, { "epoch": 1.5778120184899846, "grad_norm": 2.030367612838745, "learning_rate": 2.5894724364625155e-06, "loss": 1.078, "step": 2560 }, { "epoch": 1.5784283513097073, "grad_norm": 2.0487732887268066, "learning_rate": 2.582250240043108e-06, "loss": 1.0928, "step": 2561 }, { "epoch": 1.5790446841294299, "grad_norm": 2.031890630722046, "learning_rate": 2.575036635810131e-06, "loss": 1.0469, "step": 2562 }, { "epoch": 1.5796610169491525, "grad_norm": 2.109686851501465, "learning_rate": 2.567831632119305e-06, "loss": 1.1323, "step": 2563 }, { "epoch": 1.5802773497688754, "grad_norm": 2.1375865936279297, "learning_rate": 2.5606352373163858e-06, "loss": 1.1258, "step": 2564 }, { "epoch": 1.5808936825885977, "grad_norm": 2.0908050537109375, "learning_rate": 2.5534474597371574e-06, "loss": 1.0633, "step": 2565 }, { "epoch": 1.5815100154083206, "grad_norm": 2.014946460723877, "learning_rate": 2.546268307707428e-06, "loss": 1.0353, "step": 2566 }, { "epoch": 1.582126348228043, "grad_norm": 2.2133922576904297, "learning_rate": 2.5390977895430014e-06, "loss": 1.1492, "step": 2567 }, { "epoch": 1.5827426810477658, "grad_norm": 2.108938694000244, "learning_rate": 2.5319359135496935e-06, "loss": 1.0336, "step": 2568 }, { "epoch": 1.5833590138674885, "grad_norm": 2.094999313354492, "learning_rate": 2.524782688023305e-06, "loss": 1.1242, "step": 2569 }, { "epoch": 1.583975346687211, "grad_norm": 2.0294411182403564, "learning_rate": 2.5176381212496146e-06, "loss": 1.011, "step": 2570 }, { "epoch": 1.5845916795069337, "grad_norm": 2.103919744491577, "learning_rate": 2.510502221504376e-06, "loss": 1.0622, "step": 2571 }, { "epoch": 1.5852080123266563, "grad_norm": 2.0639235973358154, "learning_rate": 2.5033749970533015e-06, "loss": 1.0407, "step": 2572 }, { "epoch": 1.5858243451463792, "grad_norm": 2.0847933292388916, "learning_rate": 2.4962564561520498e-06, "loss": 1.0483, "step": 2573 }, { "epoch": 1.5864406779661016, "grad_norm": 2.0696210861206055, "learning_rate": 2.489146607046228e-06, "loss": 1.0473, "step": 2574 }, { "epoch": 1.5870570107858244, "grad_norm": 2.014371395111084, "learning_rate": 2.4820454579713716e-06, "loss": 0.9988, "step": 2575 }, { "epoch": 1.587673343605547, "grad_norm": 2.026151657104492, "learning_rate": 2.47495301715294e-06, "loss": 1.0526, "step": 2576 }, { "epoch": 1.5882896764252696, "grad_norm": 2.0853772163391113, "learning_rate": 2.4678692928063086e-06, "loss": 1.0377, "step": 2577 }, { "epoch": 1.5889060092449923, "grad_norm": 2.034478187561035, "learning_rate": 2.460794293136747e-06, "loss": 1.0702, "step": 2578 }, { "epoch": 1.5895223420647149, "grad_norm": 2.0734169483184814, "learning_rate": 2.4537280263394257e-06, "loss": 1.034, "step": 2579 }, { "epoch": 1.5901386748844377, "grad_norm": 2.1252076625823975, "learning_rate": 2.4466705005994053e-06, "loss": 1.1056, "step": 2580 }, { "epoch": 1.5907550077041601, "grad_norm": 2.1156845092773438, "learning_rate": 2.439621724091603e-06, "loss": 1.0714, "step": 2581 }, { "epoch": 1.591371340523883, "grad_norm": 2.04427170753479, "learning_rate": 2.432581704980822e-06, "loss": 1.0065, "step": 2582 }, { "epoch": 1.5919876733436056, "grad_norm": 2.161452531814575, "learning_rate": 2.4255504514217153e-06, "loss": 1.1163, "step": 2583 }, { "epoch": 1.5926040061633282, "grad_norm": 2.080475330352783, "learning_rate": 2.4185279715587704e-06, "loss": 1.0692, "step": 2584 }, { "epoch": 1.5932203389830508, "grad_norm": 2.0769569873809814, "learning_rate": 2.4115142735263286e-06, "loss": 1.0253, "step": 2585 }, { "epoch": 1.5938366718027734, "grad_norm": 2.107757091522217, "learning_rate": 2.404509365448552e-06, "loss": 1.1251, "step": 2586 }, { "epoch": 1.5944530046224963, "grad_norm": 2.035675287246704, "learning_rate": 2.397513255439413e-06, "loss": 1.044, "step": 2587 }, { "epoch": 1.5950693374422187, "grad_norm": 2.0649521350860596, "learning_rate": 2.390525951602709e-06, "loss": 1.0533, "step": 2588 }, { "epoch": 1.5956856702619415, "grad_norm": 2.140303373336792, "learning_rate": 2.383547462032031e-06, "loss": 1.0611, "step": 2589 }, { "epoch": 1.5963020030816641, "grad_norm": 2.1341047286987305, "learning_rate": 2.3765777948107507e-06, "loss": 1.0597, "step": 2590 }, { "epoch": 1.5969183359013868, "grad_norm": 2.078761339187622, "learning_rate": 2.36961695801203e-06, "loss": 1.0598, "step": 2591 }, { "epoch": 1.5975346687211094, "grad_norm": 2.0020689964294434, "learning_rate": 2.362664959698805e-06, "loss": 1.1086, "step": 2592 }, { "epoch": 1.598151001540832, "grad_norm": 2.054459571838379, "learning_rate": 2.3557218079237608e-06, "loss": 1.0708, "step": 2593 }, { "epoch": 1.5987673343605548, "grad_norm": 2.038203716278076, "learning_rate": 2.3487875107293535e-06, "loss": 1.0462, "step": 2594 }, { "epoch": 1.5993836671802772, "grad_norm": 2.06906795501709, "learning_rate": 2.341862076147765e-06, "loss": 1.1142, "step": 2595 }, { "epoch": 1.6, "grad_norm": 2.0887835025787354, "learning_rate": 2.3349455122009236e-06, "loss": 1.1003, "step": 2596 }, { "epoch": 1.6006163328197225, "grad_norm": 2.072643756866455, "learning_rate": 2.328037826900479e-06, "loss": 1.0719, "step": 2597 }, { "epoch": 1.6012326656394453, "grad_norm": 1.9990297555923462, "learning_rate": 2.321139028247792e-06, "loss": 1.0147, "step": 2598 }, { "epoch": 1.601848998459168, "grad_norm": 1.9973474740982056, "learning_rate": 2.3142491242339337e-06, "loss": 1.0, "step": 2599 }, { "epoch": 1.6024653312788906, "grad_norm": 2.0345118045806885, "learning_rate": 2.307368122839675e-06, "loss": 1.0424, "step": 2600 }, { "epoch": 1.6030816640986134, "grad_norm": 2.085649013519287, "learning_rate": 2.3004960320354698e-06, "loss": 1.0806, "step": 2601 }, { "epoch": 1.6036979969183358, "grad_norm": 2.1210012435913086, "learning_rate": 2.2936328597814538e-06, "loss": 1.1081, "step": 2602 }, { "epoch": 1.6043143297380587, "grad_norm": 2.0707218647003174, "learning_rate": 2.286778614027433e-06, "loss": 1.0355, "step": 2603 }, { "epoch": 1.604930662557781, "grad_norm": 2.0730538368225098, "learning_rate": 2.2799333027128645e-06, "loss": 1.0246, "step": 2604 }, { "epoch": 1.605546995377504, "grad_norm": 2.0679197311401367, "learning_rate": 2.2730969337668686e-06, "loss": 1.076, "step": 2605 }, { "epoch": 1.6061633281972265, "grad_norm": 2.0455403327941895, "learning_rate": 2.2662695151082003e-06, "loss": 1.0427, "step": 2606 }, { "epoch": 1.6067796610169491, "grad_norm": 2.0179731845855713, "learning_rate": 2.259451054645251e-06, "loss": 1.0853, "step": 2607 }, { "epoch": 1.6073959938366718, "grad_norm": 2.063236713409424, "learning_rate": 2.252641560276031e-06, "loss": 1.0911, "step": 2608 }, { "epoch": 1.6080123266563944, "grad_norm": 2.046931028366089, "learning_rate": 2.2458410398881726e-06, "loss": 1.0533, "step": 2609 }, { "epoch": 1.6086286594761172, "grad_norm": 1.9950497150421143, "learning_rate": 2.239049501358901e-06, "loss": 1.0714, "step": 2610 }, { "epoch": 1.6092449922958396, "grad_norm": 2.020690679550171, "learning_rate": 2.232266952555049e-06, "loss": 1.0398, "step": 2611 }, { "epoch": 1.6098613251155625, "grad_norm": 2.089660882949829, "learning_rate": 2.22549340133303e-06, "loss": 1.0342, "step": 2612 }, { "epoch": 1.610477657935285, "grad_norm": 2.089712619781494, "learning_rate": 2.21872885553884e-06, "loss": 1.1245, "step": 2613 }, { "epoch": 1.6110939907550077, "grad_norm": 2.05960750579834, "learning_rate": 2.211973323008041e-06, "loss": 1.0344, "step": 2614 }, { "epoch": 1.6117103235747303, "grad_norm": 2.059760570526123, "learning_rate": 2.2052268115657515e-06, "loss": 1.1052, "step": 2615 }, { "epoch": 1.612326656394453, "grad_norm": 2.0693416595458984, "learning_rate": 2.198489329026645e-06, "loss": 1.0998, "step": 2616 }, { "epoch": 1.6129429892141758, "grad_norm": 2.0683200359344482, "learning_rate": 2.19176088319494e-06, "loss": 1.0207, "step": 2617 }, { "epoch": 1.6135593220338982, "grad_norm": 2.096012592315674, "learning_rate": 2.185041481864373e-06, "loss": 1.0438, "step": 2618 }, { "epoch": 1.614175654853621, "grad_norm": 2.084972381591797, "learning_rate": 2.1783311328182235e-06, "loss": 1.0685, "step": 2619 }, { "epoch": 1.6147919876733436, "grad_norm": 2.181654930114746, "learning_rate": 2.1716298438292737e-06, "loss": 1.0883, "step": 2620 }, { "epoch": 1.6154083204930663, "grad_norm": 2.0826218128204346, "learning_rate": 2.164937622659811e-06, "loss": 1.1292, "step": 2621 }, { "epoch": 1.6160246533127889, "grad_norm": 2.010089635848999, "learning_rate": 2.158254477061623e-06, "loss": 1.1042, "step": 2622 }, { "epoch": 1.6166409861325115, "grad_norm": 2.0882441997528076, "learning_rate": 2.1515804147759865e-06, "loss": 1.063, "step": 2623 }, { "epoch": 1.6172573189522343, "grad_norm": 2.1047725677490234, "learning_rate": 2.1449154435336462e-06, "loss": 1.0989, "step": 2624 }, { "epoch": 1.6178736517719567, "grad_norm": 2.0736234188079834, "learning_rate": 2.1382595710548313e-06, "loss": 1.0731, "step": 2625 }, { "epoch": 1.6184899845916796, "grad_norm": 2.0380125045776367, "learning_rate": 2.1316128050492256e-06, "loss": 1.1196, "step": 2626 }, { "epoch": 1.6191063174114022, "grad_norm": 2.058689832687378, "learning_rate": 2.1249751532159578e-06, "loss": 1.091, "step": 2627 }, { "epoch": 1.6197226502311248, "grad_norm": 2.0337438583374023, "learning_rate": 2.118346623243609e-06, "loss": 1.0538, "step": 2628 }, { "epoch": 1.6203389830508474, "grad_norm": 2.0248005390167236, "learning_rate": 2.1117272228101903e-06, "loss": 1.0236, "step": 2629 }, { "epoch": 1.62095531587057, "grad_norm": 2.1067166328430176, "learning_rate": 2.1051169595831324e-06, "loss": 1.0586, "step": 2630 }, { "epoch": 1.621571648690293, "grad_norm": 2.1088666915893555, "learning_rate": 2.0985158412192965e-06, "loss": 1.0657, "step": 2631 }, { "epoch": 1.6221879815100153, "grad_norm": 2.0256736278533936, "learning_rate": 2.091923875364935e-06, "loss": 1.0855, "step": 2632 }, { "epoch": 1.6228043143297382, "grad_norm": 2.076997756958008, "learning_rate": 2.0853410696557085e-06, "loss": 1.0907, "step": 2633 }, { "epoch": 1.6234206471494605, "grad_norm": 2.076197624206543, "learning_rate": 2.0787674317166674e-06, "loss": 1.088, "step": 2634 }, { "epoch": 1.6240369799691834, "grad_norm": 2.0190000534057617, "learning_rate": 2.072202969162234e-06, "loss": 1.0483, "step": 2635 }, { "epoch": 1.624653312788906, "grad_norm": 2.053727388381958, "learning_rate": 2.06564768959621e-06, "loss": 1.0629, "step": 2636 }, { "epoch": 1.6252696456086286, "grad_norm": 2.1065638065338135, "learning_rate": 2.0591016006117604e-06, "loss": 1.1073, "step": 2637 }, { "epoch": 1.6258859784283515, "grad_norm": 2.0874826908111572, "learning_rate": 2.0525647097914013e-06, "loss": 1.119, "step": 2638 }, { "epoch": 1.6265023112480739, "grad_norm": 2.029690980911255, "learning_rate": 2.046037024706996e-06, "loss": 1.0528, "step": 2639 }, { "epoch": 1.6271186440677967, "grad_norm": 2.0255801677703857, "learning_rate": 2.0395185529197483e-06, "loss": 1.0895, "step": 2640 }, { "epoch": 1.6277349768875191, "grad_norm": 2.0642638206481934, "learning_rate": 2.033009301980179e-06, "loss": 1.0513, "step": 2641 }, { "epoch": 1.628351309707242, "grad_norm": 2.068793535232544, "learning_rate": 2.0265092794281373e-06, "loss": 1.0679, "step": 2642 }, { "epoch": 1.6289676425269646, "grad_norm": 2.129836082458496, "learning_rate": 2.0200184927927825e-06, "loss": 1.1092, "step": 2643 }, { "epoch": 1.6295839753466872, "grad_norm": 2.061929941177368, "learning_rate": 2.0135369495925715e-06, "loss": 1.0857, "step": 2644 }, { "epoch": 1.6302003081664098, "grad_norm": 2.117722511291504, "learning_rate": 2.007064657335258e-06, "loss": 1.0755, "step": 2645 }, { "epoch": 1.6308166409861324, "grad_norm": 2.083570957183838, "learning_rate": 2.0006016235178806e-06, "loss": 1.0385, "step": 2646 }, { "epoch": 1.6314329738058553, "grad_norm": 2.1030187606811523, "learning_rate": 1.994147855626747e-06, "loss": 1.0523, "step": 2647 }, { "epoch": 1.6320493066255777, "grad_norm": 2.1034135818481445, "learning_rate": 1.98770336113744e-06, "loss": 1.0047, "step": 2648 }, { "epoch": 1.6326656394453005, "grad_norm": 2.0599207878112793, "learning_rate": 1.981268147514794e-06, "loss": 1.0873, "step": 2649 }, { "epoch": 1.6332819722650231, "grad_norm": 2.2179017066955566, "learning_rate": 1.9748422222129007e-06, "loss": 1.1234, "step": 2650 }, { "epoch": 1.6338983050847458, "grad_norm": 2.0186102390289307, "learning_rate": 1.968425592675087e-06, "loss": 1.0258, "step": 2651 }, { "epoch": 1.6345146379044684, "grad_norm": 2.1317105293273926, "learning_rate": 1.9620182663339127e-06, "loss": 1.0902, "step": 2652 }, { "epoch": 1.635130970724191, "grad_norm": 2.0526463985443115, "learning_rate": 1.955620250611162e-06, "loss": 1.0736, "step": 2653 }, { "epoch": 1.6357473035439138, "grad_norm": 2.047393560409546, "learning_rate": 1.9492315529178394e-06, "loss": 1.0083, "step": 2654 }, { "epoch": 1.6363636363636362, "grad_norm": 1.981003761291504, "learning_rate": 1.9428521806541435e-06, "loss": 0.9944, "step": 2655 }, { "epoch": 1.636979969183359, "grad_norm": 2.055340528488159, "learning_rate": 1.936482141209486e-06, "loss": 1.0285, "step": 2656 }, { "epoch": 1.6375963020030817, "grad_norm": 2.073901414871216, "learning_rate": 1.9301214419624625e-06, "loss": 1.089, "step": 2657 }, { "epoch": 1.6382126348228043, "grad_norm": 2.0654079914093018, "learning_rate": 1.923770090280842e-06, "loss": 1.0959, "step": 2658 }, { "epoch": 1.638828967642527, "grad_norm": 2.000873565673828, "learning_rate": 1.917428093521576e-06, "loss": 1.0114, "step": 2659 }, { "epoch": 1.6394453004622496, "grad_norm": 2.0073211193084717, "learning_rate": 1.9110954590307773e-06, "loss": 1.0062, "step": 2660 }, { "epoch": 1.6400616332819724, "grad_norm": 2.091224431991577, "learning_rate": 1.9047721941437069e-06, "loss": 1.07, "step": 2661 }, { "epoch": 1.6406779661016948, "grad_norm": 2.079051971435547, "learning_rate": 1.898458306184786e-06, "loss": 1.0339, "step": 2662 }, { "epoch": 1.6412942989214176, "grad_norm": 2.1070806980133057, "learning_rate": 1.8921538024675679e-06, "loss": 1.0369, "step": 2663 }, { "epoch": 1.6419106317411403, "grad_norm": 2.0046908855438232, "learning_rate": 1.8858586902947284e-06, "loss": 1.0261, "step": 2664 }, { "epoch": 1.642526964560863, "grad_norm": 2.1234688758850098, "learning_rate": 1.8795729769580739e-06, "loss": 1.0672, "step": 2665 }, { "epoch": 1.6431432973805855, "grad_norm": 1.9971221685409546, "learning_rate": 1.8732966697385268e-06, "loss": 1.0169, "step": 2666 }, { "epoch": 1.6437596302003081, "grad_norm": 2.0762434005737305, "learning_rate": 1.8670297759060974e-06, "loss": 1.0631, "step": 2667 }, { "epoch": 1.644375963020031, "grad_norm": 2.0955910682678223, "learning_rate": 1.8607723027199176e-06, "loss": 1.1022, "step": 2668 }, { "epoch": 1.6449922958397534, "grad_norm": 2.064239501953125, "learning_rate": 1.854524257428182e-06, "loss": 1.0733, "step": 2669 }, { "epoch": 1.6456086286594762, "grad_norm": 2.078284978866577, "learning_rate": 1.8482856472681809e-06, "loss": 0.9842, "step": 2670 }, { "epoch": 1.6462249614791986, "grad_norm": 2.053962469100952, "learning_rate": 1.842056479466271e-06, "loss": 1.0224, "step": 2671 }, { "epoch": 1.6468412942989215, "grad_norm": 2.075232744216919, "learning_rate": 1.8358367612378669e-06, "loss": 1.0435, "step": 2672 }, { "epoch": 1.647457627118644, "grad_norm": 2.081386089324951, "learning_rate": 1.8296264997874447e-06, "loss": 1.0582, "step": 2673 }, { "epoch": 1.6480739599383667, "grad_norm": 2.058281898498535, "learning_rate": 1.8234257023085233e-06, "loss": 1.063, "step": 2674 }, { "epoch": 1.6486902927580893, "grad_norm": 2.1261003017425537, "learning_rate": 1.8172343759836585e-06, "loss": 1.0754, "step": 2675 }, { "epoch": 1.649306625577812, "grad_norm": 2.1156060695648193, "learning_rate": 1.8110525279844371e-06, "loss": 1.0474, "step": 2676 }, { "epoch": 1.6499229583975348, "grad_norm": 2.093165636062622, "learning_rate": 1.8048801654714687e-06, "loss": 1.0469, "step": 2677 }, { "epoch": 1.6505392912172572, "grad_norm": 2.0744221210479736, "learning_rate": 1.7987172955943677e-06, "loss": 1.0752, "step": 2678 }, { "epoch": 1.65115562403698, "grad_norm": 2.0282957553863525, "learning_rate": 1.7925639254917593e-06, "loss": 1.0659, "step": 2679 }, { "epoch": 1.6517719568567026, "grad_norm": 2.0455329418182373, "learning_rate": 1.7864200622912664e-06, "loss": 1.0791, "step": 2680 }, { "epoch": 1.6523882896764253, "grad_norm": 2.0407090187072754, "learning_rate": 1.7802857131094941e-06, "loss": 1.0739, "step": 2681 }, { "epoch": 1.6530046224961479, "grad_norm": 2.09871244430542, "learning_rate": 1.774160885052033e-06, "loss": 1.1109, "step": 2682 }, { "epoch": 1.6536209553158705, "grad_norm": 2.1057586669921875, "learning_rate": 1.7680455852134427e-06, "loss": 1.0605, "step": 2683 }, { "epoch": 1.6542372881355933, "grad_norm": 2.1416873931884766, "learning_rate": 1.7619398206772408e-06, "loss": 1.0051, "step": 2684 }, { "epoch": 1.6548536209553157, "grad_norm": 2.0432679653167725, "learning_rate": 1.7558435985159095e-06, "loss": 1.0725, "step": 2685 }, { "epoch": 1.6554699537750386, "grad_norm": 2.0269064903259277, "learning_rate": 1.7497569257908697e-06, "loss": 1.0352, "step": 2686 }, { "epoch": 1.6560862865947612, "grad_norm": 2.0915541648864746, "learning_rate": 1.7436798095524853e-06, "loss": 1.0497, "step": 2687 }, { "epoch": 1.6567026194144838, "grad_norm": 2.1467039585113525, "learning_rate": 1.7376122568400533e-06, "loss": 1.1039, "step": 2688 }, { "epoch": 1.6573189522342064, "grad_norm": 2.0717244148254395, "learning_rate": 1.7315542746817825e-06, "loss": 1.0522, "step": 2689 }, { "epoch": 1.657935285053929, "grad_norm": 2.0614404678344727, "learning_rate": 1.7255058700948058e-06, "loss": 1.0727, "step": 2690 }, { "epoch": 1.658551617873652, "grad_norm": 2.0715060234069824, "learning_rate": 1.7194670500851619e-06, "loss": 1.0556, "step": 2691 }, { "epoch": 1.6591679506933743, "grad_norm": 2.115962028503418, "learning_rate": 1.7134378216477766e-06, "loss": 1.0553, "step": 2692 }, { "epoch": 1.6597842835130971, "grad_norm": 2.100374937057495, "learning_rate": 1.70741819176648e-06, "loss": 1.0333, "step": 2693 }, { "epoch": 1.6604006163328198, "grad_norm": 2.0797359943389893, "learning_rate": 1.7014081674139783e-06, "loss": 1.0733, "step": 2694 }, { "epoch": 1.6610169491525424, "grad_norm": 2.0971317291259766, "learning_rate": 1.6954077555518455e-06, "loss": 1.0595, "step": 2695 }, { "epoch": 1.661633281972265, "grad_norm": 2.023136854171753, "learning_rate": 1.6894169631305279e-06, "loss": 0.9582, "step": 2696 }, { "epoch": 1.6622496147919876, "grad_norm": 2.090440034866333, "learning_rate": 1.68343579708933e-06, "loss": 1.1152, "step": 2697 }, { "epoch": 1.6628659476117105, "grad_norm": 2.0381994247436523, "learning_rate": 1.6774642643563955e-06, "loss": 1.0081, "step": 2698 }, { "epoch": 1.6634822804314329, "grad_norm": 2.050015926361084, "learning_rate": 1.671502371848729e-06, "loss": 1.0547, "step": 2699 }, { "epoch": 1.6640986132511557, "grad_norm": 2.087804079055786, "learning_rate": 1.6655501264721462e-06, "loss": 1.0396, "step": 2700 }, { "epoch": 1.664714946070878, "grad_norm": 2.0657382011413574, "learning_rate": 1.6596075351213047e-06, "loss": 1.0738, "step": 2701 }, { "epoch": 1.665331278890601, "grad_norm": 2.0703816413879395, "learning_rate": 1.6536746046796725e-06, "loss": 1.1127, "step": 2702 }, { "epoch": 1.6659476117103236, "grad_norm": 2.0800061225891113, "learning_rate": 1.64775134201953e-06, "loss": 1.0155, "step": 2703 }, { "epoch": 1.6665639445300462, "grad_norm": 2.1212174892425537, "learning_rate": 1.6418377540019537e-06, "loss": 1.1195, "step": 2704 }, { "epoch": 1.667180277349769, "grad_norm": 2.1189045906066895, "learning_rate": 1.6359338474768193e-06, "loss": 1.0868, "step": 2705 }, { "epoch": 1.6677966101694914, "grad_norm": 2.0949859619140625, "learning_rate": 1.630039629282787e-06, "loss": 1.0887, "step": 2706 }, { "epoch": 1.6684129429892143, "grad_norm": 2.1663527488708496, "learning_rate": 1.624155106247295e-06, "loss": 1.1317, "step": 2707 }, { "epoch": 1.6690292758089367, "grad_norm": 2.113461494445801, "learning_rate": 1.6182802851865499e-06, "loss": 1.0508, "step": 2708 }, { "epoch": 1.6696456086286595, "grad_norm": 2.0576534271240234, "learning_rate": 1.6124151729055193e-06, "loss": 1.0166, "step": 2709 }, { "epoch": 1.6702619414483821, "grad_norm": 2.0585548877716064, "learning_rate": 1.6065597761979258e-06, "loss": 1.0637, "step": 2710 }, { "epoch": 1.6708782742681048, "grad_norm": 2.071150302886963, "learning_rate": 1.6007141018462414e-06, "loss": 1.1271, "step": 2711 }, { "epoch": 1.6714946070878274, "grad_norm": 2.044684886932373, "learning_rate": 1.594878156621672e-06, "loss": 1.0285, "step": 2712 }, { "epoch": 1.67211093990755, "grad_norm": 2.0755202770233154, "learning_rate": 1.589051947284157e-06, "loss": 1.087, "step": 2713 }, { "epoch": 1.6727272727272728, "grad_norm": 2.1407110691070557, "learning_rate": 1.583235480582358e-06, "loss": 1.0771, "step": 2714 }, { "epoch": 1.6733436055469952, "grad_norm": 2.060163736343384, "learning_rate": 1.5774287632536467e-06, "loss": 1.092, "step": 2715 }, { "epoch": 1.673959938366718, "grad_norm": 2.1627395153045654, "learning_rate": 1.5716318020241083e-06, "loss": 1.0709, "step": 2716 }, { "epoch": 1.6745762711864407, "grad_norm": 2.0560460090637207, "learning_rate": 1.565844603608524e-06, "loss": 1.0522, "step": 2717 }, { "epoch": 1.6751926040061633, "grad_norm": 2.090230941772461, "learning_rate": 1.560067174710368e-06, "loss": 1.1012, "step": 2718 }, { "epoch": 1.675808936825886, "grad_norm": 2.0428271293640137, "learning_rate": 1.5542995220217961e-06, "loss": 1.0595, "step": 2719 }, { "epoch": 1.6764252696456086, "grad_norm": 2.0815696716308594, "learning_rate": 1.5485416522236453e-06, "loss": 1.113, "step": 2720 }, { "epoch": 1.6770416024653314, "grad_norm": 2.0874595642089844, "learning_rate": 1.5427935719854103e-06, "loss": 1.0722, "step": 2721 }, { "epoch": 1.6776579352850538, "grad_norm": 2.0856215953826904, "learning_rate": 1.5370552879652567e-06, "loss": 1.0792, "step": 2722 }, { "epoch": 1.6782742681047766, "grad_norm": 2.040480375289917, "learning_rate": 1.5313268068099984e-06, "loss": 1.1005, "step": 2723 }, { "epoch": 1.6788906009244993, "grad_norm": 2.1031923294067383, "learning_rate": 1.5256081351550945e-06, "loss": 1.0523, "step": 2724 }, { "epoch": 1.6795069337442219, "grad_norm": 2.0653512477874756, "learning_rate": 1.5198992796246447e-06, "loss": 0.9891, "step": 2725 }, { "epoch": 1.6801232665639445, "grad_norm": 2.02510666847229, "learning_rate": 1.51420024683137e-06, "loss": 1.0453, "step": 2726 }, { "epoch": 1.6807395993836671, "grad_norm": 2.1173653602600098, "learning_rate": 1.5085110433766215e-06, "loss": 1.0174, "step": 2727 }, { "epoch": 1.68135593220339, "grad_norm": 2.109287738800049, "learning_rate": 1.5028316758503659e-06, "loss": 1.0694, "step": 2728 }, { "epoch": 1.6819722650231124, "grad_norm": 2.2332851886749268, "learning_rate": 1.497162150831165e-06, "loss": 1.0917, "step": 2729 }, { "epoch": 1.6825885978428352, "grad_norm": 2.1506447792053223, "learning_rate": 1.491502474886195e-06, "loss": 1.0733, "step": 2730 }, { "epoch": 1.6832049306625578, "grad_norm": 2.0729403495788574, "learning_rate": 1.485852654571217e-06, "loss": 1.0179, "step": 2731 }, { "epoch": 1.6838212634822805, "grad_norm": 2.0613300800323486, "learning_rate": 1.4802126964305686e-06, "loss": 1.0535, "step": 2732 }, { "epoch": 1.684437596302003, "grad_norm": 2.1539289951324463, "learning_rate": 1.4745826069971757e-06, "loss": 1.0885, "step": 2733 }, { "epoch": 1.6850539291217257, "grad_norm": 2.063516855239868, "learning_rate": 1.468962392792529e-06, "loss": 1.0614, "step": 2734 }, { "epoch": 1.6856702619414485, "grad_norm": 2.09110426902771, "learning_rate": 1.4633520603266716e-06, "loss": 1.071, "step": 2735 }, { "epoch": 1.686286594761171, "grad_norm": 2.074352741241455, "learning_rate": 1.4577516160982186e-06, "loss": 1.0338, "step": 2736 }, { "epoch": 1.6869029275808938, "grad_norm": 2.0413167476654053, "learning_rate": 1.452161066594313e-06, "loss": 0.99, "step": 2737 }, { "epoch": 1.6875192604006162, "grad_norm": 2.1178739070892334, "learning_rate": 1.4465804182906472e-06, "loss": 1.0823, "step": 2738 }, { "epoch": 1.688135593220339, "grad_norm": 2.100261926651001, "learning_rate": 1.4410096776514404e-06, "loss": 1.0807, "step": 2739 }, { "epoch": 1.6887519260400616, "grad_norm": 2.0760152339935303, "learning_rate": 1.4354488511294418e-06, "loss": 1.0757, "step": 2740 }, { "epoch": 1.6893682588597843, "grad_norm": 2.127382755279541, "learning_rate": 1.4298979451659044e-06, "loss": 1.0608, "step": 2741 }, { "epoch": 1.689984591679507, "grad_norm": 2.1522481441497803, "learning_rate": 1.4243569661906021e-06, "loss": 1.1077, "step": 2742 }, { "epoch": 1.6906009244992295, "grad_norm": 2.048807382583618, "learning_rate": 1.4188259206218036e-06, "loss": 1.0214, "step": 2743 }, { "epoch": 1.6912172573189523, "grad_norm": 2.0883119106292725, "learning_rate": 1.413304814866273e-06, "loss": 1.0653, "step": 2744 }, { "epoch": 1.6918335901386747, "grad_norm": 2.1307601928710938, "learning_rate": 1.407793655319265e-06, "loss": 1.0793, "step": 2745 }, { "epoch": 1.6924499229583976, "grad_norm": 2.087623357772827, "learning_rate": 1.402292448364505e-06, "loss": 1.0803, "step": 2746 }, { "epoch": 1.6930662557781202, "grad_norm": 2.0996017456054688, "learning_rate": 1.3968012003741948e-06, "loss": 1.0292, "step": 2747 }, { "epoch": 1.6936825885978428, "grad_norm": 2.1138553619384766, "learning_rate": 1.391319917709001e-06, "loss": 1.0874, "step": 2748 }, { "epoch": 1.6942989214175654, "grad_norm": 2.050152063369751, "learning_rate": 1.3858486067180466e-06, "loss": 1.059, "step": 2749 }, { "epoch": 1.694915254237288, "grad_norm": 2.0773465633392334, "learning_rate": 1.3803872737389035e-06, "loss": 1.0782, "step": 2750 }, { "epoch": 1.695531587057011, "grad_norm": 2.092930555343628, "learning_rate": 1.374935925097588e-06, "loss": 1.0503, "step": 2751 }, { "epoch": 1.6961479198767333, "grad_norm": 2.1163747310638428, "learning_rate": 1.3694945671085446e-06, "loss": 1.0448, "step": 2752 }, { "epoch": 1.6967642526964561, "grad_norm": 2.090707540512085, "learning_rate": 1.364063206074654e-06, "loss": 1.0916, "step": 2753 }, { "epoch": 1.6973805855161788, "grad_norm": 2.123875856399536, "learning_rate": 1.35864184828721e-06, "loss": 1.0521, "step": 2754 }, { "epoch": 1.6979969183359014, "grad_norm": 2.0896854400634766, "learning_rate": 1.353230500025926e-06, "loss": 1.0729, "step": 2755 }, { "epoch": 1.698613251155624, "grad_norm": 2.086395502090454, "learning_rate": 1.347829167558915e-06, "loss": 1.0981, "step": 2756 }, { "epoch": 1.6992295839753466, "grad_norm": 2.1063523292541504, "learning_rate": 1.3424378571426944e-06, "loss": 1.096, "step": 2757 }, { "epoch": 1.6998459167950695, "grad_norm": 2.144566059112549, "learning_rate": 1.3370565750221643e-06, "loss": 1.0919, "step": 2758 }, { "epoch": 1.7004622496147919, "grad_norm": 2.081916093826294, "learning_rate": 1.3316853274306162e-06, "loss": 1.1155, "step": 2759 }, { "epoch": 1.7010785824345147, "grad_norm": 2.1451094150543213, "learning_rate": 1.3263241205897183e-06, "loss": 1.0403, "step": 2760 }, { "epoch": 1.7016949152542373, "grad_norm": 2.0598602294921875, "learning_rate": 1.3209729607095022e-06, "loss": 1.0478, "step": 2761 }, { "epoch": 1.70231124807396, "grad_norm": 2.1038122177124023, "learning_rate": 1.3156318539883705e-06, "loss": 1.0878, "step": 2762 }, { "epoch": 1.7029275808936826, "grad_norm": 2.088853359222412, "learning_rate": 1.3103008066130707e-06, "loss": 1.1042, "step": 2763 }, { "epoch": 1.7035439137134052, "grad_norm": 2.0508081912994385, "learning_rate": 1.3049798247587063e-06, "loss": 1.0475, "step": 2764 }, { "epoch": 1.704160246533128, "grad_norm": 2.032291889190674, "learning_rate": 1.2996689145887208e-06, "loss": 1.0247, "step": 2765 }, { "epoch": 1.7047765793528504, "grad_norm": 2.0679056644439697, "learning_rate": 1.294368082254882e-06, "loss": 1.0468, "step": 2766 }, { "epoch": 1.7053929121725733, "grad_norm": 2.0481960773468018, "learning_rate": 1.2890773338973007e-06, "loss": 1.0092, "step": 2767 }, { "epoch": 1.706009244992296, "grad_norm": 2.1075525283813477, "learning_rate": 1.2837966756443977e-06, "loss": 1.1082, "step": 2768 }, { "epoch": 1.7066255778120185, "grad_norm": 2.0636954307556152, "learning_rate": 1.2785261136129002e-06, "loss": 1.027, "step": 2769 }, { "epoch": 1.7072419106317411, "grad_norm": 2.0558688640594482, "learning_rate": 1.2732656539078524e-06, "loss": 1.0567, "step": 2770 }, { "epoch": 1.7078582434514638, "grad_norm": 2.081709146499634, "learning_rate": 1.2680153026225917e-06, "loss": 1.0333, "step": 2771 }, { "epoch": 1.7084745762711866, "grad_norm": 2.0737030506134033, "learning_rate": 1.2627750658387395e-06, "loss": 1.0066, "step": 2772 }, { "epoch": 1.709090909090909, "grad_norm": 2.1123170852661133, "learning_rate": 1.2575449496262192e-06, "loss": 1.0558, "step": 2773 }, { "epoch": 1.7097072419106318, "grad_norm": 2.113227605819702, "learning_rate": 1.252324960043212e-06, "loss": 0.9976, "step": 2774 }, { "epoch": 1.7103235747303542, "grad_norm": 2.1259281635284424, "learning_rate": 1.2471151031361795e-06, "loss": 1.0515, "step": 2775 }, { "epoch": 1.710939907550077, "grad_norm": 2.0200610160827637, "learning_rate": 1.2419153849398447e-06, "loss": 1.0181, "step": 2776 }, { "epoch": 1.7115562403697997, "grad_norm": 2.071638822555542, "learning_rate": 1.2367258114771897e-06, "loss": 1.0986, "step": 2777 }, { "epoch": 1.7121725731895223, "grad_norm": 2.1040408611297607, "learning_rate": 1.2315463887594348e-06, "loss": 1.0969, "step": 2778 }, { "epoch": 1.712788906009245, "grad_norm": 2.0506539344787598, "learning_rate": 1.2263771227860555e-06, "loss": 0.9804, "step": 2779 }, { "epoch": 1.7134052388289676, "grad_norm": 2.037851333618164, "learning_rate": 1.2212180195447532e-06, "loss": 1.1017, "step": 2780 }, { "epoch": 1.7140215716486904, "grad_norm": 2.0624821186065674, "learning_rate": 1.2160690850114642e-06, "loss": 1.02, "step": 2781 }, { "epoch": 1.7146379044684128, "grad_norm": 2.0894362926483154, "learning_rate": 1.2109303251503434e-06, "loss": 1.0565, "step": 2782 }, { "epoch": 1.7152542372881356, "grad_norm": 2.0170366764068604, "learning_rate": 1.205801745913756e-06, "loss": 1.0263, "step": 2783 }, { "epoch": 1.7158705701078583, "grad_norm": 2.043400287628174, "learning_rate": 1.20068335324228e-06, "loss": 1.0018, "step": 2784 }, { "epoch": 1.7164869029275809, "grad_norm": 2.1245791912078857, "learning_rate": 1.195575153064692e-06, "loss": 1.1009, "step": 2785 }, { "epoch": 1.7171032357473035, "grad_norm": 2.0878474712371826, "learning_rate": 1.190477151297964e-06, "loss": 1.0996, "step": 2786 }, { "epoch": 1.7177195685670261, "grad_norm": 2.068880319595337, "learning_rate": 1.1853893538472537e-06, "loss": 1.079, "step": 2787 }, { "epoch": 1.718335901386749, "grad_norm": 2.0690011978149414, "learning_rate": 1.1803117666058995e-06, "loss": 1.0151, "step": 2788 }, { "epoch": 1.7189522342064714, "grad_norm": 2.0702385902404785, "learning_rate": 1.1752443954554083e-06, "loss": 1.0509, "step": 2789 }, { "epoch": 1.7195685670261942, "grad_norm": 2.0423271656036377, "learning_rate": 1.1701872462654607e-06, "loss": 1.0192, "step": 2790 }, { "epoch": 1.7201848998459168, "grad_norm": 2.1170527935028076, "learning_rate": 1.165140324893893e-06, "loss": 1.0692, "step": 2791 }, { "epoch": 1.7208012326656394, "grad_norm": 2.0614938735961914, "learning_rate": 1.1601036371866946e-06, "loss": 1.0585, "step": 2792 }, { "epoch": 1.721417565485362, "grad_norm": 2.0696160793304443, "learning_rate": 1.1550771889780033e-06, "loss": 1.0276, "step": 2793 }, { "epoch": 1.7220338983050847, "grad_norm": 2.0907905101776123, "learning_rate": 1.1500609860900935e-06, "loss": 1.0699, "step": 2794 }, { "epoch": 1.7226502311248075, "grad_norm": 2.0855207443237305, "learning_rate": 1.14505503433337e-06, "loss": 1.0775, "step": 2795 }, { "epoch": 1.72326656394453, "grad_norm": 2.0611445903778076, "learning_rate": 1.1400593395063687e-06, "loss": 1.1052, "step": 2796 }, { "epoch": 1.7238828967642528, "grad_norm": 2.0724103450775146, "learning_rate": 1.1350739073957417e-06, "loss": 0.9985, "step": 2797 }, { "epoch": 1.7244992295839754, "grad_norm": 2.1183483600616455, "learning_rate": 1.1300987437762544e-06, "loss": 1.0548, "step": 2798 }, { "epoch": 1.725115562403698, "grad_norm": 2.1689651012420654, "learning_rate": 1.1251338544107792e-06, "loss": 1.0812, "step": 2799 }, { "epoch": 1.7257318952234206, "grad_norm": 2.053048849105835, "learning_rate": 1.1201792450502836e-06, "loss": 1.0287, "step": 2800 }, { "epoch": 1.7263482280431433, "grad_norm": 2.158965587615967, "learning_rate": 1.1152349214338288e-06, "loss": 1.1132, "step": 2801 }, { "epoch": 1.726964560862866, "grad_norm": 2.055330753326416, "learning_rate": 1.110300889288568e-06, "loss": 0.9894, "step": 2802 }, { "epoch": 1.7275808936825885, "grad_norm": 2.085465669631958, "learning_rate": 1.1053771543297198e-06, "loss": 1.0696, "step": 2803 }, { "epoch": 1.7281972265023113, "grad_norm": 2.0686750411987305, "learning_rate": 1.1004637222605907e-06, "loss": 1.0271, "step": 2804 }, { "epoch": 1.7288135593220337, "grad_norm": 2.099294900894165, "learning_rate": 1.0955605987725459e-06, "loss": 1.0523, "step": 2805 }, { "epoch": 1.7294298921417566, "grad_norm": 2.083559036254883, "learning_rate": 1.0906677895450057e-06, "loss": 1.0351, "step": 2806 }, { "epoch": 1.7300462249614792, "grad_norm": 2.119889736175537, "learning_rate": 1.085785300245451e-06, "loss": 1.0576, "step": 2807 }, { "epoch": 1.7306625577812018, "grad_norm": 2.0983338356018066, "learning_rate": 1.0809131365294057e-06, "loss": 0.9887, "step": 2808 }, { "epoch": 1.7312788906009247, "grad_norm": 2.065593719482422, "learning_rate": 1.0760513040404275e-06, "loss": 1.0771, "step": 2809 }, { "epoch": 1.731895223420647, "grad_norm": 2.105670928955078, "learning_rate": 1.0711998084101206e-06, "loss": 1.0938, "step": 2810 }, { "epoch": 1.73251155624037, "grad_norm": 2.113288640975952, "learning_rate": 1.0663586552581019e-06, "loss": 1.0351, "step": 2811 }, { "epoch": 1.7331278890600923, "grad_norm": 2.0836665630340576, "learning_rate": 1.0615278501920157e-06, "loss": 1.0994, "step": 2812 }, { "epoch": 1.7337442218798151, "grad_norm": 2.119351387023926, "learning_rate": 1.0567073988075194e-06, "loss": 1.0373, "step": 2813 }, { "epoch": 1.7343605546995378, "grad_norm": 2.041083574295044, "learning_rate": 1.0518973066882765e-06, "loss": 1.009, "step": 2814 }, { "epoch": 1.7349768875192604, "grad_norm": 2.0757973194122314, "learning_rate": 1.0470975794059468e-06, "loss": 1.0614, "step": 2815 }, { "epoch": 1.735593220338983, "grad_norm": 2.031484365463257, "learning_rate": 1.0423082225201931e-06, "loss": 1.0247, "step": 2816 }, { "epoch": 1.7362095531587056, "grad_norm": 2.1159274578094482, "learning_rate": 1.0375292415786574e-06, "loss": 1.081, "step": 2817 }, { "epoch": 1.7368258859784285, "grad_norm": 2.1639952659606934, "learning_rate": 1.032760642116969e-06, "loss": 1.052, "step": 2818 }, { "epoch": 1.7374422187981509, "grad_norm": 2.088078022003174, "learning_rate": 1.028002429658731e-06, "loss": 1.0646, "step": 2819 }, { "epoch": 1.7380585516178737, "grad_norm": 2.1091349124908447, "learning_rate": 1.0232546097155095e-06, "loss": 1.0751, "step": 2820 }, { "epoch": 1.7386748844375963, "grad_norm": 2.07338285446167, "learning_rate": 1.0185171877868395e-06, "loss": 1.0737, "step": 2821 }, { "epoch": 1.739291217257319, "grad_norm": 2.090885639190674, "learning_rate": 1.013790169360207e-06, "loss": 1.0936, "step": 2822 }, { "epoch": 1.7399075500770416, "grad_norm": 2.066072463989258, "learning_rate": 1.0090735599110512e-06, "loss": 1.0429, "step": 2823 }, { "epoch": 1.7405238828967642, "grad_norm": 2.1577398777008057, "learning_rate": 1.0043673649027519e-06, "loss": 1.0606, "step": 2824 }, { "epoch": 1.741140215716487, "grad_norm": 2.117487668991089, "learning_rate": 9.996715897866282e-07, "loss": 1.0424, "step": 2825 }, { "epoch": 1.7417565485362094, "grad_norm": 2.0518903732299805, "learning_rate": 9.949862400019228e-07, "loss": 1.0487, "step": 2826 }, { "epoch": 1.7423728813559323, "grad_norm": 2.1434221267700195, "learning_rate": 9.903113209758098e-07, "loss": 1.053, "step": 2827 }, { "epoch": 1.7429892141756549, "grad_norm": 2.1506640911102295, "learning_rate": 9.856468381233763e-07, "loss": 1.1, "step": 2828 }, { "epoch": 1.7436055469953775, "grad_norm": 2.0521011352539062, "learning_rate": 9.809927968476262e-07, "loss": 0.9832, "step": 2829 }, { "epoch": 1.7442218798151001, "grad_norm": 2.060861587524414, "learning_rate": 9.76349202539464e-07, "loss": 1.0695, "step": 2830 }, { "epoch": 1.7448382126348227, "grad_norm": 2.0808513164520264, "learning_rate": 9.717160605776932e-07, "loss": 1.0159, "step": 2831 }, { "epoch": 1.7454545454545456, "grad_norm": 2.081514835357666, "learning_rate": 9.670933763290114e-07, "loss": 1.0395, "step": 2832 }, { "epoch": 1.746070878274268, "grad_norm": 2.0972800254821777, "learning_rate": 9.62481155148003e-07, "loss": 1.0204, "step": 2833 }, { "epoch": 1.7466872110939908, "grad_norm": 2.0848379135131836, "learning_rate": 9.578794023771332e-07, "loss": 1.03, "step": 2834 }, { "epoch": 1.7473035439137135, "grad_norm": 2.0290963649749756, "learning_rate": 9.532881233467406e-07, "loss": 1.0229, "step": 2835 }, { "epoch": 1.747919876733436, "grad_norm": 2.054208517074585, "learning_rate": 9.487073233750333e-07, "loss": 1.0436, "step": 2836 }, { "epoch": 1.7485362095531587, "grad_norm": 2.1463398933410645, "learning_rate": 9.441370077680756e-07, "loss": 1.0846, "step": 2837 }, { "epoch": 1.7491525423728813, "grad_norm": 2.094835042953491, "learning_rate": 9.395771818197941e-07, "loss": 1.027, "step": 2838 }, { "epoch": 1.7497688751926042, "grad_norm": 2.095115900039673, "learning_rate": 9.350278508119637e-07, "loss": 1.1281, "step": 2839 }, { "epoch": 1.7503852080123266, "grad_norm": 2.0635569095611572, "learning_rate": 9.304890200141959e-07, "loss": 1.0681, "step": 2840 }, { "epoch": 1.7510015408320494, "grad_norm": 2.1283464431762695, "learning_rate": 9.259606946839506e-07, "loss": 0.9891, "step": 2841 }, { "epoch": 1.7516178736517718, "grad_norm": 2.118675947189331, "learning_rate": 9.214428800665131e-07, "loss": 1.0651, "step": 2842 }, { "epoch": 1.7522342064714946, "grad_norm": 2.0898642539978027, "learning_rate": 9.169355813949909e-07, "loss": 1.1255, "step": 2843 }, { "epoch": 1.7528505392912173, "grad_norm": 2.085451364517212, "learning_rate": 9.124388038903164e-07, "loss": 1.0124, "step": 2844 }, { "epoch": 1.7534668721109399, "grad_norm": 2.1298041343688965, "learning_rate": 9.079525527612321e-07, "loss": 1.0831, "step": 2845 }, { "epoch": 1.7540832049306627, "grad_norm": 2.074150323867798, "learning_rate": 9.034768332042831e-07, "loss": 1.0548, "step": 2846 }, { "epoch": 1.7546995377503851, "grad_norm": 2.140660047531128, "learning_rate": 8.990116504038283e-07, "loss": 1.0828, "step": 2847 }, { "epoch": 1.755315870570108, "grad_norm": 2.0366969108581543, "learning_rate": 8.945570095320067e-07, "loss": 1.0351, "step": 2848 }, { "epoch": 1.7559322033898304, "grad_norm": 2.0829992294311523, "learning_rate": 8.901129157487565e-07, "loss": 1.0233, "step": 2849 }, { "epoch": 1.7565485362095532, "grad_norm": 2.056164503097534, "learning_rate": 8.856793742017944e-07, "loss": 0.982, "step": 2850 }, { "epoch": 1.7571648690292758, "grad_norm": 2.043077230453491, "learning_rate": 8.812563900266169e-07, "loss": 1.0504, "step": 2851 }, { "epoch": 1.7577812018489984, "grad_norm": 2.029425621032715, "learning_rate": 8.768439683464869e-07, "loss": 1.0243, "step": 2852 }, { "epoch": 1.758397534668721, "grad_norm": 2.076650619506836, "learning_rate": 8.724421142724382e-07, "loss": 1.0328, "step": 2853 }, { "epoch": 1.7590138674884437, "grad_norm": 2.08042049407959, "learning_rate": 8.680508329032588e-07, "loss": 1.0221, "step": 2854 }, { "epoch": 1.7596302003081665, "grad_norm": 2.111846685409546, "learning_rate": 8.636701293254934e-07, "loss": 1.1038, "step": 2855 }, { "epoch": 1.760246533127889, "grad_norm": 2.119947910308838, "learning_rate": 8.593000086134362e-07, "loss": 1.0368, "step": 2856 }, { "epoch": 1.7608628659476118, "grad_norm": 2.0480871200561523, "learning_rate": 8.549404758291146e-07, "loss": 1.0237, "step": 2857 }, { "epoch": 1.7614791987673344, "grad_norm": 2.11029052734375, "learning_rate": 8.505915360222994e-07, "loss": 1.0653, "step": 2858 }, { "epoch": 1.762095531587057, "grad_norm": 2.068103551864624, "learning_rate": 8.462531942304897e-07, "loss": 1.073, "step": 2859 }, { "epoch": 1.7627118644067796, "grad_norm": 2.1379544734954834, "learning_rate": 8.419254554789058e-07, "loss": 1.0623, "step": 2860 }, { "epoch": 1.7633281972265022, "grad_norm": 2.0510141849517822, "learning_rate": 8.376083247804912e-07, "loss": 1.0322, "step": 2861 }, { "epoch": 1.763944530046225, "grad_norm": 2.131415367126465, "learning_rate": 8.333018071358978e-07, "loss": 1.0814, "step": 2862 }, { "epoch": 1.7645608628659475, "grad_norm": 2.095451593399048, "learning_rate": 8.290059075334822e-07, "loss": 1.0198, "step": 2863 }, { "epoch": 1.7651771956856703, "grad_norm": 2.1397531032562256, "learning_rate": 8.247206309493072e-07, "loss": 1.1263, "step": 2864 }, { "epoch": 1.765793528505393, "grad_norm": 2.095233678817749, "learning_rate": 8.204459823471278e-07, "loss": 1.0707, "step": 2865 }, { "epoch": 1.7664098613251156, "grad_norm": 2.112360715866089, "learning_rate": 8.161819666783888e-07, "loss": 1.0555, "step": 2866 }, { "epoch": 1.7670261941448382, "grad_norm": 2.081881523132324, "learning_rate": 8.119285888822203e-07, "loss": 0.9984, "step": 2867 }, { "epoch": 1.7676425269645608, "grad_norm": 2.0850625038146973, "learning_rate": 8.076858538854249e-07, "loss": 1.094, "step": 2868 }, { "epoch": 1.7682588597842837, "grad_norm": 2.110346794128418, "learning_rate": 8.034537666024822e-07, "loss": 1.0671, "step": 2869 }, { "epoch": 1.768875192604006, "grad_norm": 2.076744794845581, "learning_rate": 7.992323319355377e-07, "loss": 1.0743, "step": 2870 }, { "epoch": 1.769491525423729, "grad_norm": 2.135108470916748, "learning_rate": 7.950215547743956e-07, "loss": 1.0594, "step": 2871 }, { "epoch": 1.7701078582434515, "grad_norm": 2.0742945671081543, "learning_rate": 7.908214399965187e-07, "loss": 1.04, "step": 2872 }, { "epoch": 1.7707241910631741, "grad_norm": 2.075092077255249, "learning_rate": 7.866319924670163e-07, "loss": 1.0334, "step": 2873 }, { "epoch": 1.7713405238828968, "grad_norm": 2.1320059299468994, "learning_rate": 7.824532170386412e-07, "loss": 1.08, "step": 2874 }, { "epoch": 1.7719568567026194, "grad_norm": 2.051335096359253, "learning_rate": 7.782851185517848e-07, "loss": 1.0044, "step": 2875 }, { "epoch": 1.7725731895223422, "grad_norm": 2.140218496322632, "learning_rate": 7.741277018344761e-07, "loss": 1.1196, "step": 2876 }, { "epoch": 1.7731895223420646, "grad_norm": 2.1110212802886963, "learning_rate": 7.699809717023599e-07, "loss": 1.1105, "step": 2877 }, { "epoch": 1.7738058551617875, "grad_norm": 2.1338813304901123, "learning_rate": 7.658449329587147e-07, "loss": 1.1001, "step": 2878 }, { "epoch": 1.7744221879815099, "grad_norm": 2.101050615310669, "learning_rate": 7.617195903944308e-07, "loss": 1.0484, "step": 2879 }, { "epoch": 1.7750385208012327, "grad_norm": 2.0594234466552734, "learning_rate": 7.576049487880032e-07, "loss": 1.0178, "step": 2880 }, { "epoch": 1.7756548536209553, "grad_norm": 2.146439790725708, "learning_rate": 7.535010129055375e-07, "loss": 1.1035, "step": 2881 }, { "epoch": 1.776271186440678, "grad_norm": 2.1381568908691406, "learning_rate": 7.494077875007388e-07, "loss": 1.0342, "step": 2882 }, { "epoch": 1.7768875192604006, "grad_norm": 2.0954487323760986, "learning_rate": 7.453252773149e-07, "loss": 1.1266, "step": 2883 }, { "epoch": 1.7775038520801232, "grad_norm": 2.0855653285980225, "learning_rate": 7.412534870769117e-07, "loss": 1.027, "step": 2884 }, { "epoch": 1.778120184899846, "grad_norm": 2.0770161151885986, "learning_rate": 7.371924215032388e-07, "loss": 1.0814, "step": 2885 }, { "epoch": 1.7787365177195684, "grad_norm": 2.1221272945404053, "learning_rate": 7.331420852979276e-07, "loss": 1.0652, "step": 2886 }, { "epoch": 1.7793528505392913, "grad_norm": 2.1274423599243164, "learning_rate": 7.291024831525961e-07, "loss": 1.0694, "step": 2887 }, { "epoch": 1.7799691833590139, "grad_norm": 2.0679993629455566, "learning_rate": 7.250736197464292e-07, "loss": 1.033, "step": 2888 }, { "epoch": 1.7805855161787365, "grad_norm": 2.0794410705566406, "learning_rate": 7.210554997461683e-07, "loss": 1.0827, "step": 2889 }, { "epoch": 1.7812018489984591, "grad_norm": 2.0927202701568604, "learning_rate": 7.170481278061159e-07, "loss": 1.0698, "step": 2890 }, { "epoch": 1.7818181818181817, "grad_norm": 2.082484245300293, "learning_rate": 7.130515085681222e-07, "loss": 1.0591, "step": 2891 }, { "epoch": 1.7824345146379046, "grad_norm": 2.049777030944824, "learning_rate": 7.09065646661583e-07, "loss": 1.0593, "step": 2892 }, { "epoch": 1.783050847457627, "grad_norm": 2.107414484024048, "learning_rate": 7.050905467034353e-07, "loss": 1.1058, "step": 2893 }, { "epoch": 1.7836671802773498, "grad_norm": 2.051372528076172, "learning_rate": 7.011262132981456e-07, "loss": 1.0293, "step": 2894 }, { "epoch": 1.7842835130970724, "grad_norm": 2.0276668071746826, "learning_rate": 6.971726510377119e-07, "loss": 1.0375, "step": 2895 }, { "epoch": 1.784899845916795, "grad_norm": 2.0958454608917236, "learning_rate": 6.932298645016555e-07, "loss": 1.0771, "step": 2896 }, { "epoch": 1.7855161787365177, "grad_norm": 2.1039891242980957, "learning_rate": 6.892978582570187e-07, "loss": 1.0425, "step": 2897 }, { "epoch": 1.7861325115562403, "grad_norm": 2.0638132095336914, "learning_rate": 6.853766368583504e-07, "loss": 1.0125, "step": 2898 }, { "epoch": 1.7867488443759632, "grad_norm": 2.045748233795166, "learning_rate": 6.81466204847715e-07, "loss": 1.066, "step": 2899 }, { "epoch": 1.7873651771956856, "grad_norm": 2.136739492416382, "learning_rate": 6.775665667546705e-07, "loss": 1.0542, "step": 2900 }, { "epoch": 1.7879815100154084, "grad_norm": 2.1509625911712646, "learning_rate": 6.736777270962791e-07, "loss": 1.1152, "step": 2901 }, { "epoch": 1.788597842835131, "grad_norm": 2.083707809448242, "learning_rate": 6.697996903770909e-07, "loss": 1.0146, "step": 2902 }, { "epoch": 1.7892141756548536, "grad_norm": 2.079418897628784, "learning_rate": 6.659324610891449e-07, "loss": 1.0573, "step": 2903 }, { "epoch": 1.7898305084745763, "grad_norm": 2.134166955947876, "learning_rate": 6.62076043711961e-07, "loss": 1.1268, "step": 2904 }, { "epoch": 1.7904468412942989, "grad_norm": 2.1072630882263184, "learning_rate": 6.582304427125341e-07, "loss": 1.0676, "step": 2905 }, { "epoch": 1.7910631741140217, "grad_norm": 2.053607702255249, "learning_rate": 6.543956625453307e-07, "loss": 1.0301, "step": 2906 }, { "epoch": 1.7916795069337441, "grad_norm": 2.082515239715576, "learning_rate": 6.505717076522844e-07, "loss": 1.076, "step": 2907 }, { "epoch": 1.792295839753467, "grad_norm": 2.0779247283935547, "learning_rate": 6.467585824627886e-07, "loss": 1.017, "step": 2908 }, { "epoch": 1.7929121725731896, "grad_norm": 2.0974488258361816, "learning_rate": 6.429562913936926e-07, "loss": 1.0722, "step": 2909 }, { "epoch": 1.7935285053929122, "grad_norm": 2.0818679332733154, "learning_rate": 6.391648388492989e-07, "loss": 1.0873, "step": 2910 }, { "epoch": 1.7941448382126348, "grad_norm": 2.075866937637329, "learning_rate": 6.353842292213474e-07, "loss": 1.0444, "step": 2911 }, { "epoch": 1.7947611710323574, "grad_norm": 2.169825553894043, "learning_rate": 6.316144668890256e-07, "loss": 1.0826, "step": 2912 }, { "epoch": 1.7953775038520803, "grad_norm": 2.0500473976135254, "learning_rate": 6.278555562189581e-07, "loss": 1.0535, "step": 2913 }, { "epoch": 1.7959938366718027, "grad_norm": 2.0528759956359863, "learning_rate": 6.241075015651898e-07, "loss": 1.0643, "step": 2914 }, { "epoch": 1.7966101694915255, "grad_norm": 2.1030259132385254, "learning_rate": 6.203703072692014e-07, "loss": 1.0974, "step": 2915 }, { "epoch": 1.797226502311248, "grad_norm": 2.1467833518981934, "learning_rate": 6.166439776598887e-07, "loss": 1.0619, "step": 2916 }, { "epoch": 1.7978428351309708, "grad_norm": 2.066580057144165, "learning_rate": 6.129285170535615e-07, "loss": 1.0686, "step": 2917 }, { "epoch": 1.7984591679506934, "grad_norm": 2.2001168727874756, "learning_rate": 6.092239297539427e-07, "loss": 1.1734, "step": 2918 }, { "epoch": 1.799075500770416, "grad_norm": 2.0707781314849854, "learning_rate": 6.055302200521607e-07, "loss": 1.0651, "step": 2919 }, { "epoch": 1.7996918335901386, "grad_norm": 2.0653631687164307, "learning_rate": 6.018473922267376e-07, "loss": 1.0073, "step": 2920 }, { "epoch": 1.8003081664098612, "grad_norm": 2.0815882682800293, "learning_rate": 5.981754505436e-07, "loss": 1.0249, "step": 2921 }, { "epoch": 1.800924499229584, "grad_norm": 2.0794119834899902, "learning_rate": 5.945143992560587e-07, "loss": 1.0673, "step": 2922 }, { "epoch": 1.8015408320493065, "grad_norm": 2.1611437797546387, "learning_rate": 5.908642426048117e-07, "loss": 1.0975, "step": 2923 }, { "epoch": 1.8021571648690293, "grad_norm": 2.1359851360321045, "learning_rate": 5.872249848179368e-07, "loss": 1.052, "step": 2924 }, { "epoch": 1.802773497688752, "grad_norm": 2.047593116760254, "learning_rate": 5.835966301108909e-07, "loss": 1.0324, "step": 2925 }, { "epoch": 1.8033898305084746, "grad_norm": 2.113304853439331, "learning_rate": 5.799791826864931e-07, "loss": 1.033, "step": 2926 }, { "epoch": 1.8040061633281972, "grad_norm": 2.05393123626709, "learning_rate": 5.763726467349373e-07, "loss": 1.0745, "step": 2927 }, { "epoch": 1.8046224961479198, "grad_norm": 2.2034666538238525, "learning_rate": 5.72777026433774e-07, "loss": 1.1374, "step": 2928 }, { "epoch": 1.8052388289676427, "grad_norm": 2.163790225982666, "learning_rate": 5.691923259479093e-07, "loss": 1.0588, "step": 2929 }, { "epoch": 1.805855161787365, "grad_norm": 2.0500152111053467, "learning_rate": 5.656185494296051e-07, "loss": 0.9895, "step": 2930 }, { "epoch": 1.806471494607088, "grad_norm": 2.0368478298187256, "learning_rate": 5.620557010184624e-07, "loss": 1.0407, "step": 2931 }, { "epoch": 1.8070878274268105, "grad_norm": 2.103135585784912, "learning_rate": 5.585037848414288e-07, "loss": 1.0148, "step": 2932 }, { "epoch": 1.8077041602465331, "grad_norm": 2.077174425125122, "learning_rate": 5.549628050127887e-07, "loss": 1.0567, "step": 2933 }, { "epoch": 1.8083204930662558, "grad_norm": 2.066704034805298, "learning_rate": 5.514327656341589e-07, "loss": 1.0573, "step": 2934 }, { "epoch": 1.8089368258859784, "grad_norm": 2.033888578414917, "learning_rate": 5.479136707944799e-07, "loss": 1.0448, "step": 2935 }, { "epoch": 1.8095531587057012, "grad_norm": 2.0315964221954346, "learning_rate": 5.444055245700208e-07, "loss": 1.0298, "step": 2936 }, { "epoch": 1.8101694915254236, "grad_norm": 2.1389732360839844, "learning_rate": 5.409083310243624e-07, "loss": 1.071, "step": 2937 }, { "epoch": 1.8107858243451465, "grad_norm": 2.1019113063812256, "learning_rate": 5.374220942084007e-07, "loss": 1.0486, "step": 2938 }, { "epoch": 1.811402157164869, "grad_norm": 2.0766124725341797, "learning_rate": 5.339468181603435e-07, "loss": 1.0233, "step": 2939 }, { "epoch": 1.8120184899845917, "grad_norm": 2.1040728092193604, "learning_rate": 5.304825069056996e-07, "loss": 1.0241, "step": 2940 }, { "epoch": 1.8126348228043143, "grad_norm": 2.0781941413879395, "learning_rate": 5.270291644572778e-07, "loss": 1.0497, "step": 2941 }, { "epoch": 1.813251155624037, "grad_norm": 2.106842279434204, "learning_rate": 5.235867948151785e-07, "loss": 1.0871, "step": 2942 }, { "epoch": 1.8138674884437598, "grad_norm": 2.0832910537719727, "learning_rate": 5.201554019667964e-07, "loss": 1.0122, "step": 2943 }, { "epoch": 1.8144838212634822, "grad_norm": 2.1174983978271484, "learning_rate": 5.16734989886809e-07, "loss": 1.0313, "step": 2944 }, { "epoch": 1.815100154083205, "grad_norm": 2.087345838546753, "learning_rate": 5.133255625371747e-07, "loss": 1.0761, "step": 2945 }, { "epoch": 1.8157164869029274, "grad_norm": 2.089189052581787, "learning_rate": 5.0992712386713e-07, "loss": 1.07, "step": 2946 }, { "epoch": 1.8163328197226503, "grad_norm": 2.0173983573913574, "learning_rate": 5.065396778131804e-07, "loss": 1.0092, "step": 2947 }, { "epoch": 1.8169491525423729, "grad_norm": 2.118257999420166, "learning_rate": 5.031632282990972e-07, "loss": 1.0762, "step": 2948 }, { "epoch": 1.8175654853620955, "grad_norm": 2.1347830295562744, "learning_rate": 4.997977792359177e-07, "loss": 1.0551, "step": 2949 }, { "epoch": 1.8181818181818183, "grad_norm": 2.109677314758301, "learning_rate": 4.964433345219354e-07, "loss": 1.0141, "step": 2950 }, { "epoch": 1.8187981510015407, "grad_norm": 2.0900590419769287, "learning_rate": 4.93099898042696e-07, "loss": 1.0826, "step": 2951 }, { "epoch": 1.8194144838212636, "grad_norm": 2.0628292560577393, "learning_rate": 4.897674736709968e-07, "loss": 1.0865, "step": 2952 }, { "epoch": 1.820030816640986, "grad_norm": 2.150844097137451, "learning_rate": 4.864460652668789e-07, "loss": 1.0805, "step": 2953 }, { "epoch": 1.8206471494607088, "grad_norm": 2.066072940826416, "learning_rate": 4.831356766776196e-07, "loss": 1.0628, "step": 2954 }, { "epoch": 1.8212634822804314, "grad_norm": 2.1000566482543945, "learning_rate": 4.798363117377347e-07, "loss": 1.0159, "step": 2955 }, { "epoch": 1.821879815100154, "grad_norm": 2.119077444076538, "learning_rate": 4.765479742689727e-07, "loss": 1.0509, "step": 2956 }, { "epoch": 1.8224961479198767, "grad_norm": 2.089686155319214, "learning_rate": 4.7327066808030454e-07, "loss": 1.0546, "step": 2957 }, { "epoch": 1.8231124807395993, "grad_norm": 2.0845155715942383, "learning_rate": 4.700043969679258e-07, "loss": 1.0386, "step": 2958 }, { "epoch": 1.8237288135593221, "grad_norm": 2.1219706535339355, "learning_rate": 4.6674916471525e-07, "loss": 1.0566, "step": 2959 }, { "epoch": 1.8243451463790445, "grad_norm": 2.0993728637695312, "learning_rate": 4.6350497509290324e-07, "loss": 1.0752, "step": 2960 }, { "epoch": 1.8249614791987674, "grad_norm": 2.0600695610046387, "learning_rate": 4.6027183185872073e-07, "loss": 1.0116, "step": 2961 }, { "epoch": 1.82557781201849, "grad_norm": 2.1053409576416016, "learning_rate": 4.5704973875774327e-07, "loss": 1.0446, "step": 2962 }, { "epoch": 1.8261941448382126, "grad_norm": 2.0360031127929688, "learning_rate": 4.5383869952220993e-07, "loss": 1.0906, "step": 2963 }, { "epoch": 1.8268104776579352, "grad_norm": 2.0947060585021973, "learning_rate": 4.5063871787155653e-07, "loss": 1.0659, "step": 2964 }, { "epoch": 1.8274268104776579, "grad_norm": 2.0583627223968506, "learning_rate": 4.474497975124126e-07, "loss": 1.0463, "step": 2965 }, { "epoch": 1.8280431432973807, "grad_norm": 2.098816394805908, "learning_rate": 4.4427194213859216e-07, "loss": 1.0201, "step": 2966 }, { "epoch": 1.828659476117103, "grad_norm": 2.1085290908813477, "learning_rate": 4.411051554310963e-07, "loss": 1.103, "step": 2967 }, { "epoch": 1.829275808936826, "grad_norm": 2.08278751373291, "learning_rate": 4.379494410580998e-07, "loss": 1.079, "step": 2968 }, { "epoch": 1.8298921417565486, "grad_norm": 2.0496115684509277, "learning_rate": 4.348048026749552e-07, "loss": 1.0193, "step": 2969 }, { "epoch": 1.8305084745762712, "grad_norm": 2.096172332763672, "learning_rate": 4.3167124392418437e-07, "loss": 1.0686, "step": 2970 }, { "epoch": 1.8311248073959938, "grad_norm": 2.1069161891937256, "learning_rate": 4.285487684354772e-07, "loss": 1.0918, "step": 2971 }, { "epoch": 1.8317411402157164, "grad_norm": 2.088675022125244, "learning_rate": 4.254373798256839e-07, "loss": 1.0739, "step": 2972 }, { "epoch": 1.8323574730354393, "grad_norm": 2.0879387855529785, "learning_rate": 4.223370816988126e-07, "loss": 0.9982, "step": 2973 }, { "epoch": 1.8329738058551617, "grad_norm": 2.1240296363830566, "learning_rate": 4.1924787764602295e-07, "loss": 1.0547, "step": 2974 }, { "epoch": 1.8335901386748845, "grad_norm": 2.1005806922912598, "learning_rate": 4.161697712456292e-07, "loss": 1.0795, "step": 2975 }, { "epoch": 1.8342064714946071, "grad_norm": 2.061593770980835, "learning_rate": 4.1310276606308594e-07, "loss": 1.004, "step": 2976 }, { "epoch": 1.8348228043143298, "grad_norm": 2.178370475769043, "learning_rate": 4.100468656509915e-07, "loss": 1.1395, "step": 2977 }, { "epoch": 1.8354391371340524, "grad_norm": 2.0640926361083984, "learning_rate": 4.0700207354908094e-07, "loss": 1.0169, "step": 2978 }, { "epoch": 1.836055469953775, "grad_norm": 2.060967206954956, "learning_rate": 4.039683932842209e-07, "loss": 1.0293, "step": 2979 }, { "epoch": 1.8366718027734978, "grad_norm": 2.149895429611206, "learning_rate": 4.009458283704093e-07, "loss": 1.0236, "step": 2980 }, { "epoch": 1.8372881355932202, "grad_norm": 2.0975711345672607, "learning_rate": 3.9793438230876445e-07, "loss": 1.0637, "step": 2981 }, { "epoch": 1.837904468412943, "grad_norm": 2.1419622898101807, "learning_rate": 3.9493405858753165e-07, "loss": 1.0575, "step": 2982 }, { "epoch": 1.8385208012326655, "grad_norm": 2.112999439239502, "learning_rate": 3.919448606820686e-07, "loss": 1.0267, "step": 2983 }, { "epoch": 1.8391371340523883, "grad_norm": 2.1156396865844727, "learning_rate": 3.889667920548468e-07, "loss": 1.0318, "step": 2984 }, { "epoch": 1.839753466872111, "grad_norm": 2.112769365310669, "learning_rate": 3.8599985615544346e-07, "loss": 1.065, "step": 2985 }, { "epoch": 1.8403697996918336, "grad_norm": 2.0995681285858154, "learning_rate": 3.830440564205462e-07, "loss": 1.0301, "step": 2986 }, { "epoch": 1.8409861325115564, "grad_norm": 2.1615161895751953, "learning_rate": 3.800993962739408e-07, "loss": 1.1054, "step": 2987 }, { "epoch": 1.8416024653312788, "grad_norm": 2.119781017303467, "learning_rate": 3.771658791265054e-07, "loss": 1.0692, "step": 2988 }, { "epoch": 1.8422187981510016, "grad_norm": 2.093494176864624, "learning_rate": 3.742435083762186e-07, "loss": 1.0469, "step": 2989 }, { "epoch": 1.842835130970724, "grad_norm": 2.08669114112854, "learning_rate": 3.7133228740814263e-07, "loss": 1.0747, "step": 2990 }, { "epoch": 1.8434514637904469, "grad_norm": 2.070962905883789, "learning_rate": 3.6843221959442567e-07, "loss": 1.017, "step": 2991 }, { "epoch": 1.8440677966101695, "grad_norm": 2.0964596271514893, "learning_rate": 3.6554330829429716e-07, "loss": 1.1162, "step": 2992 }, { "epoch": 1.8446841294298921, "grad_norm": 2.0754380226135254, "learning_rate": 3.626655568540649e-07, "loss": 1.0531, "step": 2993 }, { "epoch": 1.8453004622496147, "grad_norm": 2.0844526290893555, "learning_rate": 3.597989686071057e-07, "loss": 1.0076, "step": 2994 }, { "epoch": 1.8459167950693374, "grad_norm": 2.0397045612335205, "learning_rate": 3.5694354687387136e-07, "loss": 1.008, "step": 2995 }, { "epoch": 1.8465331278890602, "grad_norm": 2.0624330043792725, "learning_rate": 3.540992949618749e-07, "loss": 1.0967, "step": 2996 }, { "epoch": 1.8471494607087826, "grad_norm": 2.101311445236206, "learning_rate": 3.5126621616569434e-07, "loss": 1.0204, "step": 2997 }, { "epoch": 1.8477657935285055, "grad_norm": 2.085203170776367, "learning_rate": 3.4844431376696355e-07, "loss": 1.0535, "step": 2998 }, { "epoch": 1.848382126348228, "grad_norm": 2.0333919525146484, "learning_rate": 3.456335910343689e-07, "loss": 0.9728, "step": 2999 }, { "epoch": 1.8489984591679507, "grad_norm": 2.059133529663086, "learning_rate": 3.4283405122365056e-07, "loss": 1.0168, "step": 3000 }, { "epoch": 1.8496147919876733, "grad_norm": 2.077089548110962, "learning_rate": 3.4004569757759233e-07, "loss": 1.0461, "step": 3001 }, { "epoch": 1.850231124807396, "grad_norm": 2.117769718170166, "learning_rate": 3.3726853332602396e-07, "loss": 1.0467, "step": 3002 }, { "epoch": 1.8508474576271188, "grad_norm": 2.0860180854797363, "learning_rate": 3.345025616858111e-07, "loss": 1.0922, "step": 3003 }, { "epoch": 1.8514637904468412, "grad_norm": 2.1653075218200684, "learning_rate": 3.317477858608564e-07, "loss": 1.0883, "step": 3004 }, { "epoch": 1.852080123266564, "grad_norm": 2.08742618560791, "learning_rate": 3.2900420904209194e-07, "loss": 1.0679, "step": 3005 }, { "epoch": 1.8526964560862866, "grad_norm": 2.0715746879577637, "learning_rate": 3.2627183440748114e-07, "loss": 1.0396, "step": 3006 }, { "epoch": 1.8533127889060093, "grad_norm": 2.0913636684417725, "learning_rate": 3.235506651220077e-07, "loss": 1.004, "step": 3007 }, { "epoch": 1.8539291217257319, "grad_norm": 2.0620977878570557, "learning_rate": 3.2084070433768045e-07, "loss": 1.0401, "step": 3008 }, { "epoch": 1.8545454545454545, "grad_norm": 2.113502025604248, "learning_rate": 3.181419551935205e-07, "loss": 1.1488, "step": 3009 }, { "epoch": 1.8551617873651773, "grad_norm": 2.083383321762085, "learning_rate": 3.1545442081556634e-07, "loss": 1.033, "step": 3010 }, { "epoch": 1.8557781201848997, "grad_norm": 2.0690321922302246, "learning_rate": 3.1277810431686097e-07, "loss": 1.0754, "step": 3011 }, { "epoch": 1.8563944530046226, "grad_norm": 2.069265604019165, "learning_rate": 3.101130087974591e-07, "loss": 1.0964, "step": 3012 }, { "epoch": 1.8570107858243452, "grad_norm": 2.134777784347534, "learning_rate": 3.0745913734441357e-07, "loss": 1.0816, "step": 3013 }, { "epoch": 1.8576271186440678, "grad_norm": 2.092782735824585, "learning_rate": 3.0481649303177875e-07, "loss": 1.0672, "step": 3014 }, { "epoch": 1.8582434514637904, "grad_norm": 2.077195405960083, "learning_rate": 3.0218507892060376e-07, "loss": 1.0964, "step": 3015 }, { "epoch": 1.858859784283513, "grad_norm": 2.0345282554626465, "learning_rate": 2.995648980589272e-07, "loss": 1.0362, "step": 3016 }, { "epoch": 1.859476117103236, "grad_norm": 2.1031267642974854, "learning_rate": 2.9695595348177896e-07, "loss": 1.0501, "step": 3017 }, { "epoch": 1.8600924499229583, "grad_norm": 2.075753688812256, "learning_rate": 2.9435824821117177e-07, "loss": 1.0474, "step": 3018 }, { "epoch": 1.8607087827426811, "grad_norm": 2.13966965675354, "learning_rate": 2.917717852560997e-07, "loss": 1.0743, "step": 3019 }, { "epoch": 1.8613251155624035, "grad_norm": 2.0799686908721924, "learning_rate": 2.891965676125352e-07, "loss": 1.0067, "step": 3020 }, { "epoch": 1.8619414483821264, "grad_norm": 2.1170997619628906, "learning_rate": 2.866325982634266e-07, "loss": 1.0424, "step": 3021 }, { "epoch": 1.862557781201849, "grad_norm": 2.0911810398101807, "learning_rate": 2.8407988017868814e-07, "loss": 1.0196, "step": 3022 }, { "epoch": 1.8631741140215716, "grad_norm": 2.0777387619018555, "learning_rate": 2.815384163152057e-07, "loss": 1.0255, "step": 3023 }, { "epoch": 1.8637904468412942, "grad_norm": 2.1072049140930176, "learning_rate": 2.7900820961682784e-07, "loss": 1.0645, "step": 3024 }, { "epoch": 1.8644067796610169, "grad_norm": 2.1233720779418945, "learning_rate": 2.764892630143623e-07, "loss": 1.0642, "step": 3025 }, { "epoch": 1.8650231124807397, "grad_norm": 2.09956431388855, "learning_rate": 2.7398157942557734e-07, "loss": 1.0688, "step": 3026 }, { "epoch": 1.865639445300462, "grad_norm": 2.1075704097747803, "learning_rate": 2.714851617551928e-07, "loss": 1.0266, "step": 3027 }, { "epoch": 1.866255778120185, "grad_norm": 2.0775206089019775, "learning_rate": 2.6900001289487663e-07, "loss": 1.065, "step": 3028 }, { "epoch": 1.8668721109399076, "grad_norm": 2.069898843765259, "learning_rate": 2.665261357232474e-07, "loss": 1.057, "step": 3029 }, { "epoch": 1.8674884437596302, "grad_norm": 2.108205795288086, "learning_rate": 2.640635331058661e-07, "loss": 1.0929, "step": 3030 }, { "epoch": 1.8681047765793528, "grad_norm": 2.1085076332092285, "learning_rate": 2.6161220789523435e-07, "loss": 1.068, "step": 3031 }, { "epoch": 1.8687211093990754, "grad_norm": 2.043652296066284, "learning_rate": 2.5917216293078864e-07, "loss": 0.9954, "step": 3032 }, { "epoch": 1.8693374422187983, "grad_norm": 2.1181650161743164, "learning_rate": 2.567434010389036e-07, "loss": 1.0426, "step": 3033 }, { "epoch": 1.8699537750385207, "grad_norm": 2.0890989303588867, "learning_rate": 2.5432592503288e-07, "loss": 1.033, "step": 3034 }, { "epoch": 1.8705701078582435, "grad_norm": 2.075713872909546, "learning_rate": 2.5191973771294895e-07, "loss": 1.0382, "step": 3035 }, { "epoch": 1.8711864406779661, "grad_norm": 2.120662212371826, "learning_rate": 2.4952484186626437e-07, "loss": 1.1058, "step": 3036 }, { "epoch": 1.8718027734976888, "grad_norm": 2.1247479915618896, "learning_rate": 2.4714124026689937e-07, "loss": 1.0396, "step": 3037 }, { "epoch": 1.8724191063174114, "grad_norm": 2.1243133544921875, "learning_rate": 2.447689356758476e-07, "loss": 1.0507, "step": 3038 }, { "epoch": 1.873035439137134, "grad_norm": 2.1038336753845215, "learning_rate": 2.424079308410143e-07, "loss": 1.0569, "step": 3039 }, { "epoch": 1.8736517719568568, "grad_norm": 2.062143564224243, "learning_rate": 2.400582284972197e-07, "loss": 0.958, "step": 3040 }, { "epoch": 1.8742681047765792, "grad_norm": 2.08815336227417, "learning_rate": 2.377198313661877e-07, "loss": 1.0644, "step": 3041 }, { "epoch": 1.874884437596302, "grad_norm": 2.12034010887146, "learning_rate": 2.3539274215654827e-07, "loss": 1.011, "step": 3042 }, { "epoch": 1.8755007704160247, "grad_norm": 2.0479044914245605, "learning_rate": 2.3307696356383525e-07, "loss": 0.9923, "step": 3043 }, { "epoch": 1.8761171032357473, "grad_norm": 2.1146936416625977, "learning_rate": 2.3077249827047954e-07, "loss": 1.071, "step": 3044 }, { "epoch": 1.87673343605547, "grad_norm": 2.1025168895721436, "learning_rate": 2.2847934894580592e-07, "loss": 1.0551, "step": 3045 }, { "epoch": 1.8773497688751926, "grad_norm": 2.0899600982666016, "learning_rate": 2.2619751824603408e-07, "loss": 1.1184, "step": 3046 }, { "epoch": 1.8779661016949154, "grad_norm": 2.1296637058258057, "learning_rate": 2.239270088142742e-07, "loss": 1.0476, "step": 3047 }, { "epoch": 1.8785824345146378, "grad_norm": 2.104119062423706, "learning_rate": 2.2166782328051806e-07, "loss": 0.9996, "step": 3048 }, { "epoch": 1.8791987673343606, "grad_norm": 2.1674883365631104, "learning_rate": 2.1941996426164347e-07, "loss": 1.0468, "step": 3049 }, { "epoch": 1.879815100154083, "grad_norm": 2.0450851917266846, "learning_rate": 2.1718343436140988e-07, "loss": 1.0543, "step": 3050 }, { "epoch": 1.8804314329738059, "grad_norm": 2.097395896911621, "learning_rate": 2.1495823617045053e-07, "loss": 1.0576, "step": 3051 }, { "epoch": 1.8810477657935285, "grad_norm": 2.0971126556396484, "learning_rate": 2.1274437226627586e-07, "loss": 1.0411, "step": 3052 }, { "epoch": 1.8816640986132511, "grad_norm": 2.0627105236053467, "learning_rate": 2.105418452132646e-07, "loss": 1.0333, "step": 3053 }, { "epoch": 1.882280431432974, "grad_norm": 2.088592290878296, "learning_rate": 2.0835065756266703e-07, "loss": 0.9716, "step": 3054 }, { "epoch": 1.8828967642526964, "grad_norm": 2.127971649169922, "learning_rate": 2.0617081185259512e-07, "loss": 1.0826, "step": 3055 }, { "epoch": 1.8835130970724192, "grad_norm": 2.035266160964966, "learning_rate": 2.040023106080269e-07, "loss": 1.0095, "step": 3056 }, { "epoch": 1.8841294298921416, "grad_norm": 2.1227118968963623, "learning_rate": 2.0184515634079638e-07, "loss": 1.0806, "step": 3057 }, { "epoch": 1.8847457627118644, "grad_norm": 2.046706199645996, "learning_rate": 1.9969935154959709e-07, "loss": 1.061, "step": 3058 }, { "epoch": 1.885362095531587, "grad_norm": 2.1209235191345215, "learning_rate": 1.9756489871997187e-07, "loss": 1.1024, "step": 3059 }, { "epoch": 1.8859784283513097, "grad_norm": 2.1150171756744385, "learning_rate": 1.954418003243197e-07, "loss": 1.0439, "step": 3060 }, { "epoch": 1.8865947611710323, "grad_norm": 2.0934438705444336, "learning_rate": 1.9333005882188337e-07, "loss": 1.03, "step": 3061 }, { "epoch": 1.887211093990755, "grad_norm": 2.1184937953948975, "learning_rate": 1.912296766587507e-07, "loss": 1.028, "step": 3062 }, { "epoch": 1.8878274268104778, "grad_norm": 2.0298590660095215, "learning_rate": 1.891406562678555e-07, "loss": 0.9886, "step": 3063 }, { "epoch": 1.8884437596302002, "grad_norm": 2.0989737510681152, "learning_rate": 1.8706300006896882e-07, "loss": 1.0151, "step": 3064 }, { "epoch": 1.889060092449923, "grad_norm": 2.1060686111450195, "learning_rate": 1.849967104686945e-07, "loss": 1.0389, "step": 3065 }, { "epoch": 1.8896764252696456, "grad_norm": 2.140235424041748, "learning_rate": 1.8294178986047462e-07, "loss": 1.04, "step": 3066 }, { "epoch": 1.8902927580893683, "grad_norm": 2.145745277404785, "learning_rate": 1.8089824062458294e-07, "loss": 1.0423, "step": 3067 }, { "epoch": 1.8909090909090909, "grad_norm": 2.097440719604492, "learning_rate": 1.788660651281171e-07, "loss": 1.0517, "step": 3068 }, { "epoch": 1.8915254237288135, "grad_norm": 2.0753231048583984, "learning_rate": 1.7684526572500417e-07, "loss": 1.0585, "step": 3069 }, { "epoch": 1.8921417565485363, "grad_norm": 2.1558947563171387, "learning_rate": 1.7483584475599168e-07, "loss": 1.0351, "step": 3070 }, { "epoch": 1.8927580893682587, "grad_norm": 2.1208322048187256, "learning_rate": 1.7283780454864784e-07, "loss": 1.103, "step": 3071 }, { "epoch": 1.8933744221879816, "grad_norm": 2.0651776790618896, "learning_rate": 1.7085114741735908e-07, "loss": 1.0915, "step": 3072 }, { "epoch": 1.8939907550077042, "grad_norm": 2.1323673725128174, "learning_rate": 1.6887587566332575e-07, "loss": 1.0553, "step": 3073 }, { "epoch": 1.8946070878274268, "grad_norm": 2.0623013973236084, "learning_rate": 1.6691199157455984e-07, "loss": 1.0303, "step": 3074 }, { "epoch": 1.8952234206471494, "grad_norm": 2.108752489089966, "learning_rate": 1.649594974258828e-07, "loss": 1.0544, "step": 3075 }, { "epoch": 1.895839753466872, "grad_norm": 2.1030497550964355, "learning_rate": 1.630183954789233e-07, "loss": 1.0922, "step": 3076 }, { "epoch": 1.896456086286595, "grad_norm": 2.0696065425872803, "learning_rate": 1.6108868798211387e-07, "loss": 1.0501, "step": 3077 }, { "epoch": 1.8970724191063173, "grad_norm": 2.0657424926757812, "learning_rate": 1.5917037717068985e-07, "loss": 1.0164, "step": 3078 }, { "epoch": 1.8976887519260401, "grad_norm": 2.1286303997039795, "learning_rate": 1.5726346526668156e-07, "loss": 1.0762, "step": 3079 }, { "epoch": 1.8983050847457628, "grad_norm": 2.1066014766693115, "learning_rate": 1.5536795447891996e-07, "loss": 1.0868, "step": 3080 }, { "epoch": 1.8989214175654854, "grad_norm": 2.080293893814087, "learning_rate": 1.5348384700302865e-07, "loss": 1.0847, "step": 3081 }, { "epoch": 1.899537750385208, "grad_norm": 2.1244759559631348, "learning_rate": 1.516111450214197e-07, "loss": 1.0372, "step": 3082 }, { "epoch": 1.9001540832049306, "grad_norm": 2.0655486583709717, "learning_rate": 1.4974985070329684e-07, "loss": 1.0733, "step": 3083 }, { "epoch": 1.9007704160246535, "grad_norm": 2.144289493560791, "learning_rate": 1.4789996620465208e-07, "loss": 1.0701, "step": 3084 }, { "epoch": 1.9013867488443759, "grad_norm": 2.0928845405578613, "learning_rate": 1.460614936682536e-07, "loss": 1.0248, "step": 3085 }, { "epoch": 1.9020030816640987, "grad_norm": 2.0548901557922363, "learning_rate": 1.4423443522365797e-07, "loss": 1.0529, "step": 3086 }, { "epoch": 1.902619414483821, "grad_norm": 2.1418538093566895, "learning_rate": 1.4241879298719785e-07, "loss": 1.0394, "step": 3087 }, { "epoch": 1.903235747303544, "grad_norm": 2.0840909481048584, "learning_rate": 1.406145690619809e-07, "loss": 1.0095, "step": 3088 }, { "epoch": 1.9038520801232666, "grad_norm": 2.05584454536438, "learning_rate": 1.3882176553789318e-07, "loss": 1.059, "step": 3089 }, { "epoch": 1.9044684129429892, "grad_norm": 2.0818300247192383, "learning_rate": 1.3704038449158574e-07, "loss": 1.0996, "step": 3090 }, { "epoch": 1.905084745762712, "grad_norm": 2.0956993103027344, "learning_rate": 1.3527042798648248e-07, "loss": 0.9887, "step": 3091 }, { "epoch": 1.9057010785824344, "grad_norm": 2.0821051597595215, "learning_rate": 1.3351189807277454e-07, "loss": 1.0727, "step": 3092 }, { "epoch": 1.9063174114021573, "grad_norm": 2.0664377212524414, "learning_rate": 1.3176479678741582e-07, "loss": 1.039, "step": 3093 }, { "epoch": 1.9069337442218797, "grad_norm": 2.164642333984375, "learning_rate": 1.30029126154122e-07, "loss": 1.0891, "step": 3094 }, { "epoch": 1.9075500770416025, "grad_norm": 2.061612844467163, "learning_rate": 1.2830488818337038e-07, "loss": 1.0077, "step": 3095 }, { "epoch": 1.9081664098613251, "grad_norm": 2.0881175994873047, "learning_rate": 1.2659208487239117e-07, "loss": 1.0662, "step": 3096 }, { "epoch": 1.9087827426810478, "grad_norm": 2.0770294666290283, "learning_rate": 1.2489071820517394e-07, "loss": 1.0275, "step": 3097 }, { "epoch": 1.9093990755007704, "grad_norm": 2.119786500930786, "learning_rate": 1.23200790152459e-07, "loss": 1.0342, "step": 3098 }, { "epoch": 1.910015408320493, "grad_norm": 2.1062846183776855, "learning_rate": 1.2152230267173715e-07, "loss": 1.069, "step": 3099 }, { "epoch": 1.9106317411402158, "grad_norm": 2.112689971923828, "learning_rate": 1.1985525770724648e-07, "loss": 1.1224, "step": 3100 }, { "epoch": 1.9112480739599382, "grad_norm": 2.0924489498138428, "learning_rate": 1.181996571899735e-07, "loss": 1.0037, "step": 3101 }, { "epoch": 1.911864406779661, "grad_norm": 2.020944356918335, "learning_rate": 1.1655550303764418e-07, "loss": 1.0097, "step": 3102 }, { "epoch": 1.9124807395993837, "grad_norm": 2.0818605422973633, "learning_rate": 1.1492279715472843e-07, "loss": 1.0341, "step": 3103 }, { "epoch": 1.9130970724191063, "grad_norm": 2.0637435913085938, "learning_rate": 1.1330154143243788e-07, "loss": 1.0888, "step": 3104 }, { "epoch": 1.913713405238829, "grad_norm": 2.024747610092163, "learning_rate": 1.1169173774871478e-07, "loss": 1.0087, "step": 3105 }, { "epoch": 1.9143297380585516, "grad_norm": 2.109853506088257, "learning_rate": 1.1009338796824087e-07, "loss": 1.0658, "step": 3106 }, { "epoch": 1.9149460708782744, "grad_norm": 2.0564465522766113, "learning_rate": 1.0850649394243074e-07, "loss": 1.0318, "step": 3107 }, { "epoch": 1.9155624036979968, "grad_norm": 2.06711483001709, "learning_rate": 1.0693105750942512e-07, "loss": 1.0364, "step": 3108 }, { "epoch": 1.9161787365177196, "grad_norm": 2.0901780128479004, "learning_rate": 1.0536708049409872e-07, "loss": 1.1021, "step": 3109 }, { "epoch": 1.9167950693374423, "grad_norm": 2.078535556793213, "learning_rate": 1.0381456470804796e-07, "loss": 1.0329, "step": 3110 }, { "epoch": 1.9174114021571649, "grad_norm": 2.1559512615203857, "learning_rate": 1.0227351194959545e-07, "loss": 1.0516, "step": 3111 }, { "epoch": 1.9180277349768875, "grad_norm": 2.0362589359283447, "learning_rate": 1.0074392400378663e-07, "loss": 1.0749, "step": 3112 }, { "epoch": 1.9186440677966101, "grad_norm": 2.0711281299591064, "learning_rate": 9.922580264238424e-08, "loss": 1.0532, "step": 3113 }, { "epoch": 1.919260400616333, "grad_norm": 2.120487689971924, "learning_rate": 9.771914962387164e-08, "loss": 1.0913, "step": 3114 }, { "epoch": 1.9198767334360554, "grad_norm": 2.145582914352417, "learning_rate": 9.622396669344836e-08, "loss": 1.1174, "step": 3115 }, { "epoch": 1.9204930662557782, "grad_norm": 2.1447513103485107, "learning_rate": 9.474025558302347e-08, "loss": 1.0836, "step": 3116 }, { "epoch": 1.9211093990755008, "grad_norm": 2.1312215328216553, "learning_rate": 9.326801801122332e-08, "loss": 1.0594, "step": 3117 }, { "epoch": 1.9217257318952234, "grad_norm": 2.1099977493286133, "learning_rate": 9.180725568338045e-08, "loss": 1.06, "step": 3118 }, { "epoch": 1.922342064714946, "grad_norm": 2.1006433963775635, "learning_rate": 9.035797029153693e-08, "loss": 1.07, "step": 3119 }, { "epoch": 1.9229583975346687, "grad_norm": 2.098017930984497, "learning_rate": 8.892016351443989e-08, "loss": 1.0434, "step": 3120 }, { "epoch": 1.9235747303543915, "grad_norm": 2.0727622509002686, "learning_rate": 8.749383701754377e-08, "loss": 1.0031, "step": 3121 }, { "epoch": 1.924191063174114, "grad_norm": 2.0315446853637695, "learning_rate": 8.607899245299923e-08, "loss": 0.973, "step": 3122 }, { "epoch": 1.9248073959938368, "grad_norm": 2.017815589904785, "learning_rate": 8.467563145966085e-08, "loss": 0.9748, "step": 3123 }, { "epoch": 1.9254237288135592, "grad_norm": 2.095153331756592, "learning_rate": 8.328375566308167e-08, "loss": 1.0669, "step": 3124 }, { "epoch": 1.926040061633282, "grad_norm": 2.067976474761963, "learning_rate": 8.190336667550869e-08, "loss": 1.0566, "step": 3125 }, { "epoch": 1.9266563944530046, "grad_norm": 2.0393502712249756, "learning_rate": 8.053446609588733e-08, "loss": 1.0061, "step": 3126 }, { "epoch": 1.9272727272727272, "grad_norm": 2.0963966846466064, "learning_rate": 7.917705550985255e-08, "loss": 0.9912, "step": 3127 }, { "epoch": 1.9278890600924499, "grad_norm": 2.1253952980041504, "learning_rate": 7.783113648972885e-08, "loss": 1.0538, "step": 3128 }, { "epoch": 1.9285053929121725, "grad_norm": 2.124704360961914, "learning_rate": 7.649671059453467e-08, "loss": 1.0365, "step": 3129 }, { "epoch": 1.9291217257318953, "grad_norm": 2.0717966556549072, "learning_rate": 7.517377936997028e-08, "loss": 1.0215, "step": 3130 }, { "epoch": 1.9297380585516177, "grad_norm": 2.1124215126037598, "learning_rate": 7.386234434842543e-08, "loss": 1.0582, "step": 3131 }, { "epoch": 1.9303543913713406, "grad_norm": 2.0875167846679688, "learning_rate": 7.256240704897167e-08, "loss": 1.0239, "step": 3132 }, { "epoch": 1.9309707241910632, "grad_norm": 2.113861083984375, "learning_rate": 7.127396897736117e-08, "loss": 1.0839, "step": 3133 }, { "epoch": 1.9315870570107858, "grad_norm": 2.0809085369110107, "learning_rate": 6.999703162603011e-08, "loss": 1.0124, "step": 3134 }, { "epoch": 1.9322033898305084, "grad_norm": 2.1039180755615234, "learning_rate": 6.873159647408867e-08, "loss": 1.0828, "step": 3135 }, { "epoch": 1.932819722650231, "grad_norm": 2.0878243446350098, "learning_rate": 6.747766498732655e-08, "loss": 1.0777, "step": 3136 }, { "epoch": 1.933436055469954, "grad_norm": 2.0593016147613525, "learning_rate": 6.623523861820635e-08, "loss": 1.0182, "step": 3137 }, { "epoch": 1.9340523882896763, "grad_norm": 2.0612845420837402, "learning_rate": 6.5004318805868e-08, "loss": 1.0058, "step": 3138 }, { "epoch": 1.9346687211093991, "grad_norm": 2.076216220855713, "learning_rate": 6.378490697611761e-08, "loss": 1.0576, "step": 3139 }, { "epoch": 1.9352850539291218, "grad_norm": 2.1313023567199707, "learning_rate": 6.257700454143423e-08, "loss": 1.064, "step": 3140 }, { "epoch": 1.9359013867488444, "grad_norm": 2.1063201427459717, "learning_rate": 6.138061290096753e-08, "loss": 1.0412, "step": 3141 }, { "epoch": 1.936517719568567, "grad_norm": 2.083827495574951, "learning_rate": 6.019573344052676e-08, "loss": 1.1517, "step": 3142 }, { "epoch": 1.9371340523882896, "grad_norm": 2.131040096282959, "learning_rate": 5.902236753259516e-08, "loss": 1.0746, "step": 3143 }, { "epoch": 1.9377503852080125, "grad_norm": 2.0473289489746094, "learning_rate": 5.7860516536312194e-08, "loss": 1.0531, "step": 3144 }, { "epoch": 1.9383667180277349, "grad_norm": 2.1149075031280518, "learning_rate": 5.6710181797483556e-08, "loss": 1.0559, "step": 3145 }, { "epoch": 1.9389830508474577, "grad_norm": 2.0744240283966064, "learning_rate": 5.55713646485756e-08, "loss": 1.0416, "step": 3146 }, { "epoch": 1.9395993836671803, "grad_norm": 2.1975409984588623, "learning_rate": 5.44440664087087e-08, "loss": 1.1529, "step": 3147 }, { "epoch": 1.940215716486903, "grad_norm": 2.11881422996521, "learning_rate": 5.3328288383666107e-08, "loss": 1.063, "step": 3148 }, { "epoch": 1.9408320493066256, "grad_norm": 2.0738093852996826, "learning_rate": 5.2224031865882876e-08, "loss": 1.039, "step": 3149 }, { "epoch": 1.9414483821263482, "grad_norm": 2.0933499336242676, "learning_rate": 5.11312981344525e-08, "loss": 0.9808, "step": 3150 }, { "epoch": 1.942064714946071, "grad_norm": 2.1024527549743652, "learning_rate": 5.005008845511694e-08, "loss": 1.0898, "step": 3151 }, { "epoch": 1.9426810477657934, "grad_norm": 2.127706527709961, "learning_rate": 4.898040408027327e-08, "loss": 1.0842, "step": 3152 }, { "epoch": 1.9432973805855163, "grad_norm": 2.090968608856201, "learning_rate": 4.79222462489648e-08, "loss": 0.9833, "step": 3153 }, { "epoch": 1.9439137134052387, "grad_norm": 2.127838134765625, "learning_rate": 4.687561618688663e-08, "loss": 1.0843, "step": 3154 }, { "epoch": 1.9445300462249615, "grad_norm": 2.0969154834747314, "learning_rate": 4.58405151063801e-08, "loss": 1.0891, "step": 3155 }, { "epoch": 1.9451463790446841, "grad_norm": 2.074000597000122, "learning_rate": 4.481694420642946e-08, "loss": 1.0026, "step": 3156 }, { "epoch": 1.9457627118644067, "grad_norm": 2.1791229248046875, "learning_rate": 4.3804904672666295e-08, "loss": 1.0447, "step": 3157 }, { "epoch": 1.9463790446841296, "grad_norm": 2.047727108001709, "learning_rate": 4.2804397677366214e-08, "loss": 1.0209, "step": 3158 }, { "epoch": 1.946995377503852, "grad_norm": 2.058377742767334, "learning_rate": 4.1815424379441085e-08, "loss": 1.0357, "step": 3159 }, { "epoch": 1.9476117103235748, "grad_norm": 2.1086928844451904, "learning_rate": 4.083798592444899e-08, "loss": 0.9879, "step": 3160 }, { "epoch": 1.9482280431432972, "grad_norm": 2.0458507537841797, "learning_rate": 3.9872083444583154e-08, "loss": 1.0767, "step": 3161 }, { "epoch": 1.94884437596302, "grad_norm": 2.0960803031921387, "learning_rate": 3.891771805867417e-08, "loss": 1.0513, "step": 3162 }, { "epoch": 1.9494607087827427, "grad_norm": 2.0675864219665527, "learning_rate": 3.7974890872193305e-08, "loss": 1.0279, "step": 3163 }, { "epoch": 1.9500770416024653, "grad_norm": 2.10414981842041, "learning_rate": 3.704360297723919e-08, "loss": 1.0127, "step": 3164 }, { "epoch": 1.950693374422188, "grad_norm": 2.118839979171753, "learning_rate": 3.6123855452552257e-08, "loss": 1.0113, "step": 3165 }, { "epoch": 1.9513097072419106, "grad_norm": 2.097756862640381, "learning_rate": 3.5215649363500305e-08, "loss": 1.0228, "step": 3166 }, { "epoch": 1.9519260400616334, "grad_norm": 2.1009750366210938, "learning_rate": 3.4318985762082924e-08, "loss": 1.0487, "step": 3167 }, { "epoch": 1.9525423728813558, "grad_norm": 2.1279332637786865, "learning_rate": 3.343386568693263e-08, "loss": 1.0606, "step": 3168 }, { "epoch": 1.9531587057010786, "grad_norm": 2.0460638999938965, "learning_rate": 3.256029016330709e-08, "loss": 0.9987, "step": 3169 }, { "epoch": 1.9537750385208013, "grad_norm": 2.050032377243042, "learning_rate": 3.169826020309352e-08, "loss": 1.0367, "step": 3170 }, { "epoch": 1.9543913713405239, "grad_norm": 2.1069376468658447, "learning_rate": 3.084777680480433e-08, "loss": 0.9937, "step": 3171 }, { "epoch": 1.9550077041602465, "grad_norm": 2.089057683944702, "learning_rate": 3.0008840953580364e-08, "loss": 1.0623, "step": 3172 }, { "epoch": 1.9556240369799691, "grad_norm": 2.1102845668792725, "learning_rate": 2.9181453621183186e-08, "loss": 1.1188, "step": 3173 }, { "epoch": 1.956240369799692, "grad_norm": 2.1782350540161133, "learning_rate": 2.8365615765998388e-08, "loss": 1.1168, "step": 3174 }, { "epoch": 1.9568567026194144, "grad_norm": 2.11480450630188, "learning_rate": 2.7561328333034488e-08, "loss": 1.0395, "step": 3175 }, { "epoch": 1.9574730354391372, "grad_norm": 2.0862765312194824, "learning_rate": 2.6768592253919588e-08, "loss": 1.0313, "step": 3176 }, { "epoch": 1.9580893682588598, "grad_norm": 2.085009813308716, "learning_rate": 2.5987408446901396e-08, "loss": 1.0426, "step": 3177 }, { "epoch": 1.9587057010785824, "grad_norm": 2.0287704467773438, "learning_rate": 2.521777781684831e-08, "loss": 1.049, "step": 3178 }, { "epoch": 1.959322033898305, "grad_norm": 2.1018753051757812, "learning_rate": 2.4459701255242774e-08, "loss": 1.0219, "step": 3179 }, { "epoch": 1.9599383667180277, "grad_norm": 2.1129989624023438, "learning_rate": 2.3713179640187934e-08, "loss": 1.1051, "step": 3180 }, { "epoch": 1.9605546995377505, "grad_norm": 2.1282215118408203, "learning_rate": 2.2978213836400974e-08, "loss": 1.0833, "step": 3181 }, { "epoch": 1.961171032357473, "grad_norm": 2.050248622894287, "learning_rate": 2.2254804695210908e-08, "loss": 1.0359, "step": 3182 }, { "epoch": 1.9617873651771958, "grad_norm": 2.0992536544799805, "learning_rate": 2.154295305456522e-08, "loss": 1.0205, "step": 3183 }, { "epoch": 1.9624036979969184, "grad_norm": 2.196012258529663, "learning_rate": 2.084265973901989e-08, "loss": 1.0138, "step": 3184 }, { "epoch": 1.963020030816641, "grad_norm": 2.1288442611694336, "learning_rate": 2.0153925559744937e-08, "loss": 1.1341, "step": 3185 }, { "epoch": 1.9636363636363636, "grad_norm": 2.1084108352661133, "learning_rate": 1.947675131452109e-08, "loss": 1.0416, "step": 3186 }, { "epoch": 1.9642526964560862, "grad_norm": 2.077240228652954, "learning_rate": 1.8811137787736465e-08, "loss": 1.0411, "step": 3187 }, { "epoch": 1.964869029275809, "grad_norm": 2.128040075302124, "learning_rate": 1.815708575038988e-08, "loss": 1.0681, "step": 3188 }, { "epoch": 1.9654853620955315, "grad_norm": 2.0569674968719482, "learning_rate": 1.7514595960089752e-08, "loss": 1.0697, "step": 3189 }, { "epoch": 1.9661016949152543, "grad_norm": 2.053607225418091, "learning_rate": 1.6883669161048556e-08, "loss": 1.0638, "step": 3190 }, { "epoch": 1.9667180277349767, "grad_norm": 2.108454465866089, "learning_rate": 1.6264306084087245e-08, "loss": 1.0614, "step": 3191 }, { "epoch": 1.9673343605546996, "grad_norm": 2.054828643798828, "learning_rate": 1.565650744662972e-08, "loss": 1.0093, "step": 3192 }, { "epoch": 1.9679506933744222, "grad_norm": 2.0928053855895996, "learning_rate": 1.5060273952708372e-08, "loss": 1.1028, "step": 3193 }, { "epoch": 1.9685670261941448, "grad_norm": 2.0599067211151123, "learning_rate": 1.4475606292955191e-08, "loss": 1.0116, "step": 3194 }, { "epoch": 1.9691833590138677, "grad_norm": 2.039684295654297, "learning_rate": 1.3902505144608447e-08, "loss": 1.0352, "step": 3195 }, { "epoch": 1.96979969183359, "grad_norm": 2.10646390914917, "learning_rate": 1.3340971171506013e-08, "loss": 1.0247, "step": 3196 }, { "epoch": 1.970416024653313, "grad_norm": 2.0715701580047607, "learning_rate": 1.2791005024089809e-08, "loss": 1.0735, "step": 3197 }, { "epoch": 1.9710323574730353, "grad_norm": 2.136016845703125, "learning_rate": 1.2252607339399148e-08, "loss": 1.1069, "step": 3198 }, { "epoch": 1.9716486902927581, "grad_norm": 2.0941197872161865, "learning_rate": 1.1725778741076276e-08, "loss": 1.0231, "step": 3199 }, { "epoch": 1.9722650231124808, "grad_norm": 2.1334826946258545, "learning_rate": 1.1210519839360835e-08, "loss": 1.043, "step": 3200 }, { "epoch": 1.9728813559322034, "grad_norm": 2.112459659576416, "learning_rate": 1.0706831231092064e-08, "loss": 1.0454, "step": 3201 }, { "epoch": 1.973497688751926, "grad_norm": 2.1495935916900635, "learning_rate": 1.0214713499706596e-08, "loss": 1.0733, "step": 3202 }, { "epoch": 1.9741140215716486, "grad_norm": 2.1209046840667725, "learning_rate": 9.734167215237345e-09, "loss": 1.0631, "step": 3203 }, { "epoch": 1.9747303543913715, "grad_norm": 2.0773231983184814, "learning_rate": 9.265192934315714e-09, "loss": 1.0934, "step": 3204 }, { "epoch": 1.9753466872110939, "grad_norm": 2.089447259902954, "learning_rate": 8.80779120016606e-09, "loss": 1.1072, "step": 3205 }, { "epoch": 1.9759630200308167, "grad_norm": 2.0175838470458984, "learning_rate": 8.361962542611235e-09, "loss": 1.0623, "step": 3206 }, { "epoch": 1.9765793528505393, "grad_norm": 2.1229896545410156, "learning_rate": 7.927707478065927e-09, "loss": 1.1004, "step": 3207 }, { "epoch": 1.977195685670262, "grad_norm": 2.047734498977661, "learning_rate": 7.505026509541102e-09, "loss": 0.998, "step": 3208 }, { "epoch": 1.9778120184899846, "grad_norm": 2.0932352542877197, "learning_rate": 7.093920126638454e-09, "loss": 1.0583, "step": 3209 }, { "epoch": 1.9784283513097072, "grad_norm": 2.0676937103271484, "learning_rate": 6.694388805553731e-09, "loss": 1.0604, "step": 3210 }, { "epoch": 1.97904468412943, "grad_norm": 2.074737071990967, "learning_rate": 6.30643300907674e-09, "loss": 1.0864, "step": 3211 }, { "epoch": 1.9796610169491524, "grad_norm": 2.1110336780548096, "learning_rate": 5.930053186586904e-09, "loss": 0.9825, "step": 3212 }, { "epoch": 1.9802773497688753, "grad_norm": 2.105992555618286, "learning_rate": 5.565249774055481e-09, "loss": 1.0175, "step": 3213 }, { "epoch": 1.9808936825885979, "grad_norm": 2.122161388397217, "learning_rate": 5.212023194043347e-09, "loss": 1.1095, "step": 3214 }, { "epoch": 1.9815100154083205, "grad_norm": 2.0866096019744873, "learning_rate": 4.8703738557054344e-09, "loss": 1.061, "step": 3215 }, { "epoch": 1.9821263482280431, "grad_norm": 2.0749428272247314, "learning_rate": 4.54030215478074e-09, "loss": 1.0137, "step": 3216 }, { "epoch": 1.9827426810477657, "grad_norm": 2.060509204864502, "learning_rate": 4.221808473601208e-09, "loss": 1.0596, "step": 3217 }, { "epoch": 1.9833590138674886, "grad_norm": 2.097128391265869, "learning_rate": 3.914893181088397e-09, "loss": 1.0704, "step": 3218 }, { "epoch": 1.983975346687211, "grad_norm": 2.076300859451294, "learning_rate": 3.619556632750154e-09, "loss": 0.9945, "step": 3219 }, { "epoch": 1.9845916795069338, "grad_norm": 2.0783615112304688, "learning_rate": 3.3357991706817195e-09, "loss": 1.0538, "step": 3220 }, { "epoch": 1.9852080123266564, "grad_norm": 2.095029830932617, "learning_rate": 3.0636211235690604e-09, "loss": 1.0774, "step": 3221 }, { "epoch": 1.985824345146379, "grad_norm": 2.0789778232574463, "learning_rate": 2.8030228066833197e-09, "loss": 1.0534, "step": 3222 }, { "epoch": 1.9864406779661017, "grad_norm": 2.1028459072113037, "learning_rate": 2.5540045218819256e-09, "loss": 1.027, "step": 3223 }, { "epoch": 1.9870570107858243, "grad_norm": 2.080382823944092, "learning_rate": 2.3165665576097007e-09, "loss": 1.0405, "step": 3224 }, { "epoch": 1.9876733436055471, "grad_norm": 2.081700563430786, "learning_rate": 2.090709188898865e-09, "loss": 1.0597, "step": 3225 }, { "epoch": 1.9882896764252695, "grad_norm": 2.1160099506378174, "learning_rate": 1.8764326773645925e-09, "loss": 1.1046, "step": 3226 }, { "epoch": 1.9889060092449924, "grad_norm": 2.0487403869628906, "learning_rate": 1.6737372712116729e-09, "loss": 1.0128, "step": 3227 }, { "epoch": 1.9895223420647148, "grad_norm": 2.1318671703338623, "learning_rate": 1.4826232052256306e-09, "loss": 1.0456, "step": 3228 }, { "epoch": 1.9901386748844376, "grad_norm": 2.0871245861053467, "learning_rate": 1.3030907007793858e-09, "loss": 1.1053, "step": 3229 }, { "epoch": 1.9907550077041603, "grad_norm": 2.0988638401031494, "learning_rate": 1.1351399658321437e-09, "loss": 1.0679, "step": 3230 }, { "epoch": 1.9913713405238829, "grad_norm": 2.116283655166626, "learning_rate": 9.787711949249546e-10, "loss": 1.0226, "step": 3231 }, { "epoch": 1.9919876733436055, "grad_norm": 2.088329792022705, "learning_rate": 8.339845691840431e-10, "loss": 1.0647, "step": 3232 }, { "epoch": 1.9926040061633281, "grad_norm": 2.0839295387268066, "learning_rate": 7.007802563185895e-10, "loss": 1.0768, "step": 3233 }, { "epoch": 1.993220338983051, "grad_norm": 2.121151924133301, "learning_rate": 5.791584106251691e-10, "loss": 1.0872, "step": 3234 }, { "epoch": 1.9938366718027734, "grad_norm": 2.1071760654449463, "learning_rate": 4.691191729810918e-10, "loss": 1.0933, "step": 3235 }, { "epoch": 1.9944530046224962, "grad_norm": 2.0906529426574707, "learning_rate": 3.706626708466221e-10, "loss": 1.0155, "step": 3236 }, { "epoch": 1.9950693374422188, "grad_norm": 2.072662353515625, "learning_rate": 2.837890182683101e-10, "loss": 1.0727, "step": 3237 }, { "epoch": 1.9956856702619414, "grad_norm": 2.130866527557373, "learning_rate": 2.0849831587343993e-10, "loss": 1.0884, "step": 3238 }, { "epoch": 1.996302003081664, "grad_norm": 2.055520534515381, "learning_rate": 1.4479065087336097e-10, "loss": 1.0412, "step": 3239 }, { "epoch": 1.9969183359013867, "grad_norm": 2.0852859020233154, "learning_rate": 9.266609706237716e-11, "loss": 1.093, "step": 3240 }, { "epoch": 1.9975346687211095, "grad_norm": 2.1555964946746826, "learning_rate": 5.212471481774728e-11, "loss": 1.1084, "step": 3241 }, { "epoch": 1.998151001540832, "grad_norm": 2.1718578338623047, "learning_rate": 2.3166551099684798e-11, "loss": 1.0483, "step": 3242 }, { "epoch": 1.9987673343605548, "grad_norm": 2.1439826488494873, "learning_rate": 5.79163945246819e-12, "loss": 1.0349, "step": 3243 }, { "epoch": 1.9993836671802774, "grad_norm": 2.073878288269043, "learning_rate": 0.0, "loss": 1.0748, "step": 3244 }, { "epoch": 1.9993836671802774, "step": 3244, "total_flos": 4.17170517143763e+19, "train_loss": 1.2978743012310103, "train_runtime": 38228.8954, "train_samples_per_second": 2.716, "train_steps_per_second": 0.085 } ], "logging_steps": 1.0, "max_steps": 3244, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.17170517143763e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }