{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999178779666584, "eval_steps": 500, "global_step": 6088, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 38.84074575883763, "learning_rate": 1.0928961748633881e-07, "loss": 1.7832, "step": 1 }, { "epoch": 0.0, "grad_norm": 20.840965769374012, "learning_rate": 2.1857923497267762e-07, "loss": 1.7955, "step": 2 }, { "epoch": 0.0, "grad_norm": 27.361668800267925, "learning_rate": 3.278688524590164e-07, "loss": 1.7813, "step": 3 }, { "epoch": 0.0, "grad_norm": 23.11456435578675, "learning_rate": 4.3715846994535524e-07, "loss": 1.7692, "step": 4 }, { "epoch": 0.0, "grad_norm": 27.26981739241391, "learning_rate": 5.46448087431694e-07, "loss": 1.633, "step": 5 }, { "epoch": 0.0, "grad_norm": 0.5819051073228698, "learning_rate": 6.557377049180328e-07, "loss": 0.2645, "step": 6 }, { "epoch": 0.0, "grad_norm": 22.545159511501367, "learning_rate": 7.650273224043716e-07, "loss": 1.7563, "step": 7 }, { "epoch": 0.0, "grad_norm": 27.387201817984806, "learning_rate": 8.743169398907105e-07, "loss": 1.863, "step": 8 }, { "epoch": 0.0, "grad_norm": 22.95681304055435, "learning_rate": 9.836065573770493e-07, "loss": 1.8171, "step": 9 }, { "epoch": 0.0, "grad_norm": 19.95440606105064, "learning_rate": 1.092896174863388e-06, "loss": 1.7146, "step": 10 }, { "epoch": 0.0, "grad_norm": 22.46411649067938, "learning_rate": 1.2021857923497268e-06, "loss": 1.6779, "step": 11 }, { "epoch": 0.0, "grad_norm": 29.633634735290794, "learning_rate": 1.3114754098360657e-06, "loss": 1.5058, "step": 12 }, { "epoch": 0.0, "grad_norm": 16.99806983665361, "learning_rate": 1.4207650273224043e-06, "loss": 1.7409, "step": 13 }, { "epoch": 0.0, "grad_norm": 13.187322558140579, "learning_rate": 1.5300546448087432e-06, "loss": 1.4756, "step": 14 }, { "epoch": 0.0, "grad_norm": 22.90140627104574, "learning_rate": 1.6393442622950819e-06, "loss": 1.5487, "step": 15 }, { "epoch": 0.0, "grad_norm": 9.910915850223, "learning_rate": 1.748633879781421e-06, "loss": 1.4734, "step": 16 }, { "epoch": 0.0, "grad_norm": 17.190075557884615, "learning_rate": 1.8579234972677599e-06, "loss": 1.5072, "step": 17 }, { "epoch": 0.0, "grad_norm": 10.68758632460099, "learning_rate": 1.9672131147540985e-06, "loss": 1.4086, "step": 18 }, { "epoch": 0.0, "grad_norm": 10.923816319822269, "learning_rate": 2.0765027322404376e-06, "loss": 1.4043, "step": 19 }, { "epoch": 0.0, "grad_norm": 6.7564175509861375, "learning_rate": 2.185792349726776e-06, "loss": 1.4519, "step": 20 }, { "epoch": 0.0, "grad_norm": 5.886796647477455, "learning_rate": 2.295081967213115e-06, "loss": 1.3906, "step": 21 }, { "epoch": 0.0, "grad_norm": 6.622367451965369, "learning_rate": 2.4043715846994536e-06, "loss": 1.3323, "step": 22 }, { "epoch": 0.0, "grad_norm": 5.697277605329102, "learning_rate": 2.5136612021857927e-06, "loss": 1.3405, "step": 23 }, { "epoch": 0.0, "grad_norm": 6.324319846141258, "learning_rate": 2.6229508196721314e-06, "loss": 1.3994, "step": 24 }, { "epoch": 0.0, "grad_norm": 6.200480820241418, "learning_rate": 2.7322404371584705e-06, "loss": 1.2953, "step": 25 }, { "epoch": 0.0, "grad_norm": 7.445272091965256, "learning_rate": 2.8415300546448087e-06, "loss": 1.1837, "step": 26 }, { "epoch": 0.0, "grad_norm": 10.964107004708145, "learning_rate": 2.9508196721311478e-06, "loss": 1.4235, "step": 27 }, { "epoch": 0.0, "grad_norm": 6.881232971113161, "learning_rate": 3.0601092896174864e-06, "loss": 1.3216, "step": 28 }, { "epoch": 0.0, "grad_norm": 0.8133682831968722, "learning_rate": 3.1693989071038255e-06, "loss": 0.2838, "step": 29 }, { "epoch": 0.0, "grad_norm": 0.8306806690597315, "learning_rate": 3.2786885245901638e-06, "loss": 0.2939, "step": 30 }, { "epoch": 0.01, "grad_norm": 5.181466020061959, "learning_rate": 3.387978142076503e-06, "loss": 1.2197, "step": 31 }, { "epoch": 0.01, "grad_norm": 4.348759356440266, "learning_rate": 3.497267759562842e-06, "loss": 1.1902, "step": 32 }, { "epoch": 0.01, "grad_norm": 4.084484409447551, "learning_rate": 3.6065573770491806e-06, "loss": 1.2463, "step": 33 }, { "epoch": 0.01, "grad_norm": 5.64822974665989, "learning_rate": 3.7158469945355197e-06, "loss": 1.2228, "step": 34 }, { "epoch": 0.01, "grad_norm": 3.6539075827765664, "learning_rate": 3.825136612021858e-06, "loss": 1.1928, "step": 35 }, { "epoch": 0.01, "grad_norm": 4.0332624050812464, "learning_rate": 3.934426229508197e-06, "loss": 1.2123, "step": 36 }, { "epoch": 0.01, "grad_norm": 4.1098311586042495, "learning_rate": 4.043715846994536e-06, "loss": 1.1048, "step": 37 }, { "epoch": 0.01, "grad_norm": 3.4794605332193513, "learning_rate": 4.153005464480875e-06, "loss": 1.1227, "step": 38 }, { "epoch": 0.01, "grad_norm": 3.6478429235325693, "learning_rate": 4.2622950819672135e-06, "loss": 1.2466, "step": 39 }, { "epoch": 0.01, "grad_norm": 3.2675825423796057, "learning_rate": 4.371584699453552e-06, "loss": 1.0787, "step": 40 }, { "epoch": 0.01, "grad_norm": 4.550317496648552, "learning_rate": 4.480874316939891e-06, "loss": 1.2034, "step": 41 }, { "epoch": 0.01, "grad_norm": 4.459632372410563, "learning_rate": 4.59016393442623e-06, "loss": 1.1935, "step": 42 }, { "epoch": 0.01, "grad_norm": 3.5699551386600263, "learning_rate": 4.699453551912569e-06, "loss": 1.2421, "step": 43 }, { "epoch": 0.01, "grad_norm": 3.435418788492063, "learning_rate": 4.808743169398907e-06, "loss": 1.1678, "step": 44 }, { "epoch": 0.01, "grad_norm": 3.1386710352782194, "learning_rate": 4.918032786885246e-06, "loss": 1.146, "step": 45 }, { "epoch": 0.01, "grad_norm": 3.633209132729536, "learning_rate": 5.027322404371585e-06, "loss": 1.1651, "step": 46 }, { "epoch": 0.01, "grad_norm": 3.2877889718586997, "learning_rate": 5.1366120218579245e-06, "loss": 1.1399, "step": 47 }, { "epoch": 0.01, "grad_norm": 4.492542819877019, "learning_rate": 5.245901639344263e-06, "loss": 1.07, "step": 48 }, { "epoch": 0.01, "grad_norm": 3.9718696966231795, "learning_rate": 5.355191256830602e-06, "loss": 1.2556, "step": 49 }, { "epoch": 0.01, "grad_norm": 4.255267085826149, "learning_rate": 5.464480874316941e-06, "loss": 1.1497, "step": 50 }, { "epoch": 0.01, "grad_norm": 5.720966468258994, "learning_rate": 5.573770491803278e-06, "loss": 1.2455, "step": 51 }, { "epoch": 0.01, "grad_norm": 3.694459692438322, "learning_rate": 5.683060109289617e-06, "loss": 1.1806, "step": 52 }, { "epoch": 0.01, "grad_norm": 3.034814711974371, "learning_rate": 5.7923497267759565e-06, "loss": 1.1885, "step": 53 }, { "epoch": 0.01, "grad_norm": 3.917275923447129, "learning_rate": 5.9016393442622956e-06, "loss": 1.1952, "step": 54 }, { "epoch": 0.01, "grad_norm": 2.8573670632282684, "learning_rate": 6.010928961748635e-06, "loss": 1.009, "step": 55 }, { "epoch": 0.01, "grad_norm": 3.3116499180919887, "learning_rate": 6.120218579234973e-06, "loss": 1.0457, "step": 56 }, { "epoch": 0.01, "grad_norm": 3.9089558477190502, "learning_rate": 6.229508196721312e-06, "loss": 1.0698, "step": 57 }, { "epoch": 0.01, "grad_norm": 4.736208719736402, "learning_rate": 6.338797814207651e-06, "loss": 1.1667, "step": 58 }, { "epoch": 0.01, "grad_norm": 2.7460078682270432, "learning_rate": 6.44808743169399e-06, "loss": 1.1084, "step": 59 }, { "epoch": 0.01, "grad_norm": 3.3187768209803945, "learning_rate": 6.5573770491803276e-06, "loss": 1.0883, "step": 60 }, { "epoch": 0.01, "grad_norm": 3.1280231922800863, "learning_rate": 6.666666666666667e-06, "loss": 1.0838, "step": 61 }, { "epoch": 0.01, "grad_norm": 4.315575985886534, "learning_rate": 6.775956284153006e-06, "loss": 1.1203, "step": 62 }, { "epoch": 0.01, "grad_norm": 5.704674840876163, "learning_rate": 6.885245901639345e-06, "loss": 1.1291, "step": 63 }, { "epoch": 0.01, "grad_norm": 3.7565854910583134, "learning_rate": 6.994535519125684e-06, "loss": 1.071, "step": 64 }, { "epoch": 0.01, "grad_norm": 3.8214217953699103, "learning_rate": 7.103825136612022e-06, "loss": 1.1919, "step": 65 }, { "epoch": 0.01, "grad_norm": 3.308219021947902, "learning_rate": 7.213114754098361e-06, "loss": 1.0285, "step": 66 }, { "epoch": 0.01, "grad_norm": 5.725191299888016, "learning_rate": 7.3224043715847e-06, "loss": 1.0517, "step": 67 }, { "epoch": 0.01, "grad_norm": 3.8323395536884215, "learning_rate": 7.4316939890710394e-06, "loss": 1.0449, "step": 68 }, { "epoch": 0.01, "grad_norm": 2.6060162951932937, "learning_rate": 7.540983606557377e-06, "loss": 1.0954, "step": 69 }, { "epoch": 0.01, "grad_norm": 2.9182663622079157, "learning_rate": 7.650273224043716e-06, "loss": 1.0494, "step": 70 }, { "epoch": 0.01, "grad_norm": 4.058454263480056, "learning_rate": 7.759562841530056e-06, "loss": 1.0554, "step": 71 }, { "epoch": 0.01, "grad_norm": 2.6039620846484324, "learning_rate": 7.868852459016394e-06, "loss": 1.0058, "step": 72 }, { "epoch": 0.01, "grad_norm": 3.3288658044530206, "learning_rate": 7.978142076502732e-06, "loss": 1.0461, "step": 73 }, { "epoch": 0.01, "grad_norm": 4.616908544762381, "learning_rate": 8.087431693989072e-06, "loss": 1.1037, "step": 74 }, { "epoch": 0.01, "grad_norm": 3.518628050078201, "learning_rate": 8.19672131147541e-06, "loss": 1.1602, "step": 75 }, { "epoch": 0.01, "grad_norm": 5.166508636219284, "learning_rate": 8.30601092896175e-06, "loss": 1.0254, "step": 76 }, { "epoch": 0.01, "grad_norm": 3.6912084876629274, "learning_rate": 8.415300546448089e-06, "loss": 1.0161, "step": 77 }, { "epoch": 0.01, "grad_norm": 3.1634864729404852, "learning_rate": 8.524590163934427e-06, "loss": 1.0518, "step": 78 }, { "epoch": 0.01, "grad_norm": 2.781418777786563, "learning_rate": 8.633879781420765e-06, "loss": 1.0785, "step": 79 }, { "epoch": 0.01, "grad_norm": 3.419562042306328, "learning_rate": 8.743169398907103e-06, "loss": 0.9852, "step": 80 }, { "epoch": 0.01, "grad_norm": 4.052405220108045, "learning_rate": 8.852459016393443e-06, "loss": 0.9652, "step": 81 }, { "epoch": 0.01, "grad_norm": 3.4236231523633704, "learning_rate": 8.961748633879782e-06, "loss": 1.0369, "step": 82 }, { "epoch": 0.01, "grad_norm": 2.7464242296906494, "learning_rate": 9.071038251366122e-06, "loss": 0.9789, "step": 83 }, { "epoch": 0.01, "grad_norm": 3.27178566553852, "learning_rate": 9.18032786885246e-06, "loss": 1.0857, "step": 84 }, { "epoch": 0.01, "grad_norm": 3.4313981260774704, "learning_rate": 9.2896174863388e-06, "loss": 0.9986, "step": 85 }, { "epoch": 0.01, "grad_norm": 5.05569668162306, "learning_rate": 9.398907103825138e-06, "loss": 1.1179, "step": 86 }, { "epoch": 0.01, "grad_norm": 4.45450440718213, "learning_rate": 9.508196721311476e-06, "loss": 0.9162, "step": 87 }, { "epoch": 0.01, "grad_norm": 4.1187143382325315, "learning_rate": 9.617486338797814e-06, "loss": 1.0219, "step": 88 }, { "epoch": 0.01, "grad_norm": 2.625480935080033, "learning_rate": 9.726775956284153e-06, "loss": 1.081, "step": 89 }, { "epoch": 0.01, "grad_norm": 4.036669414063297, "learning_rate": 9.836065573770493e-06, "loss": 1.1017, "step": 90 }, { "epoch": 0.01, "grad_norm": 3.4463370550471555, "learning_rate": 9.945355191256831e-06, "loss": 1.0329, "step": 91 }, { "epoch": 0.02, "grad_norm": 4.220066574408719, "learning_rate": 1.005464480874317e-05, "loss": 1.1173, "step": 92 }, { "epoch": 0.02, "grad_norm": 11.911797589489327, "learning_rate": 1.0163934426229509e-05, "loss": 1.0148, "step": 93 }, { "epoch": 0.02, "grad_norm": 6.017084902611932, "learning_rate": 1.0273224043715849e-05, "loss": 1.07, "step": 94 }, { "epoch": 0.02, "grad_norm": 3.836510506173722, "learning_rate": 1.0382513661202187e-05, "loss": 1.0216, "step": 95 }, { "epoch": 0.02, "grad_norm": 3.312695935104319, "learning_rate": 1.0491803278688525e-05, "loss": 1.0308, "step": 96 }, { "epoch": 0.02, "grad_norm": 3.6174064983865346, "learning_rate": 1.0601092896174865e-05, "loss": 1.0799, "step": 97 }, { "epoch": 0.02, "grad_norm": 3.2302377211976014, "learning_rate": 1.0710382513661204e-05, "loss": 1.0616, "step": 98 }, { "epoch": 0.02, "grad_norm": 2.6770544251441923, "learning_rate": 1.0819672131147544e-05, "loss": 0.942, "step": 99 }, { "epoch": 0.02, "grad_norm": 5.143432915699416, "learning_rate": 1.0928961748633882e-05, "loss": 0.9374, "step": 100 }, { "epoch": 0.02, "grad_norm": 3.7783307102634565, "learning_rate": 1.1038251366120218e-05, "loss": 1.0339, "step": 101 }, { "epoch": 0.02, "grad_norm": 4.9921161055839125, "learning_rate": 1.1147540983606557e-05, "loss": 1.0786, "step": 102 }, { "epoch": 0.02, "grad_norm": 3.453816180659913, "learning_rate": 1.1256830601092897e-05, "loss": 1.0324, "step": 103 }, { "epoch": 0.02, "grad_norm": 3.0391417062842647, "learning_rate": 1.1366120218579235e-05, "loss": 0.984, "step": 104 }, { "epoch": 0.02, "grad_norm": 4.075231379565818, "learning_rate": 1.1475409836065575e-05, "loss": 1.0338, "step": 105 }, { "epoch": 0.02, "grad_norm": 4.357940363659758, "learning_rate": 1.1584699453551913e-05, "loss": 0.999, "step": 106 }, { "epoch": 0.02, "grad_norm": 2.816894227584429, "learning_rate": 1.1693989071038251e-05, "loss": 0.9448, "step": 107 }, { "epoch": 0.02, "grad_norm": 2.8251659082722154, "learning_rate": 1.1803278688524591e-05, "loss": 0.9371, "step": 108 }, { "epoch": 0.02, "grad_norm": 3.0676739098691947, "learning_rate": 1.191256830601093e-05, "loss": 0.9938, "step": 109 }, { "epoch": 0.02, "grad_norm": 3.4440211940185166, "learning_rate": 1.202185792349727e-05, "loss": 0.9941, "step": 110 }, { "epoch": 0.02, "grad_norm": 3.7142285892916136, "learning_rate": 1.2131147540983608e-05, "loss": 1.0264, "step": 111 }, { "epoch": 0.02, "grad_norm": 2.4978175584676854, "learning_rate": 1.2240437158469946e-05, "loss": 1.0156, "step": 112 }, { "epoch": 0.02, "grad_norm": 3.790412780338605, "learning_rate": 1.2349726775956286e-05, "loss": 1.0514, "step": 113 }, { "epoch": 0.02, "grad_norm": 4.801190696042698, "learning_rate": 1.2459016393442624e-05, "loss": 0.9687, "step": 114 }, { "epoch": 0.02, "grad_norm": 3.369319129913228, "learning_rate": 1.2568306010928964e-05, "loss": 1.0038, "step": 115 }, { "epoch": 0.02, "grad_norm": 3.676221022885326, "learning_rate": 1.2677595628415302e-05, "loss": 0.9742, "step": 116 }, { "epoch": 0.02, "grad_norm": 1.2102595095313091, "learning_rate": 1.2786885245901642e-05, "loss": 0.2913, "step": 117 }, { "epoch": 0.02, "grad_norm": 3.0107694898828674, "learning_rate": 1.289617486338798e-05, "loss": 1.0291, "step": 118 }, { "epoch": 0.02, "grad_norm": 2.563243057183313, "learning_rate": 1.3005464480874317e-05, "loss": 1.0606, "step": 119 }, { "epoch": 0.02, "grad_norm": 4.211999964637774, "learning_rate": 1.3114754098360655e-05, "loss": 0.9659, "step": 120 }, { "epoch": 0.02, "grad_norm": 4.865186152309484, "learning_rate": 1.3224043715846995e-05, "loss": 0.9876, "step": 121 }, { "epoch": 0.02, "grad_norm": 3.43972330427909, "learning_rate": 1.3333333333333333e-05, "loss": 1.0138, "step": 122 }, { "epoch": 0.02, "grad_norm": 3.381540739685097, "learning_rate": 1.3442622950819673e-05, "loss": 1.0241, "step": 123 }, { "epoch": 0.02, "grad_norm": 2.960462751160101, "learning_rate": 1.3551912568306011e-05, "loss": 1.037, "step": 124 }, { "epoch": 0.02, "grad_norm": 2.811425762699898, "learning_rate": 1.366120218579235e-05, "loss": 1.0139, "step": 125 }, { "epoch": 0.02, "grad_norm": 3.1377005084947434, "learning_rate": 1.377049180327869e-05, "loss": 0.9945, "step": 126 }, { "epoch": 0.02, "grad_norm": 2.677546477587303, "learning_rate": 1.3879781420765028e-05, "loss": 0.9978, "step": 127 }, { "epoch": 0.02, "grad_norm": 2.457842352432103, "learning_rate": 1.3989071038251368e-05, "loss": 0.9476, "step": 128 }, { "epoch": 0.02, "grad_norm": 2.4159768894322307, "learning_rate": 1.4098360655737706e-05, "loss": 1.0536, "step": 129 }, { "epoch": 0.02, "grad_norm": 2.3039713070366177, "learning_rate": 1.4207650273224044e-05, "loss": 0.9868, "step": 130 }, { "epoch": 0.02, "grad_norm": 2.4773961652676766, "learning_rate": 1.4316939890710384e-05, "loss": 1.0143, "step": 131 }, { "epoch": 0.02, "grad_norm": 2.5268565014646494, "learning_rate": 1.4426229508196722e-05, "loss": 1.0482, "step": 132 }, { "epoch": 0.02, "grad_norm": 2.9994887713236844, "learning_rate": 1.4535519125683062e-05, "loss": 0.9954, "step": 133 }, { "epoch": 0.02, "grad_norm": 2.6207655125283242, "learning_rate": 1.46448087431694e-05, "loss": 0.9709, "step": 134 }, { "epoch": 0.02, "grad_norm": 2.484083892846453, "learning_rate": 1.4754098360655739e-05, "loss": 0.9511, "step": 135 }, { "epoch": 0.02, "grad_norm": 2.770752404187282, "learning_rate": 1.4863387978142079e-05, "loss": 1.0031, "step": 136 }, { "epoch": 0.02, "grad_norm": 2.4647222110658435, "learning_rate": 1.4972677595628417e-05, "loss": 1.032, "step": 137 }, { "epoch": 0.02, "grad_norm": 2.9983815778600387, "learning_rate": 1.5081967213114754e-05, "loss": 1.065, "step": 138 }, { "epoch": 0.02, "grad_norm": 4.070713594026917, "learning_rate": 1.5191256830601094e-05, "loss": 0.963, "step": 139 }, { "epoch": 0.02, "grad_norm": 2.506124916678639, "learning_rate": 1.5300546448087432e-05, "loss": 0.9523, "step": 140 }, { "epoch": 0.02, "grad_norm": 3.5805507804201526, "learning_rate": 1.5409836065573772e-05, "loss": 1.0074, "step": 141 }, { "epoch": 0.02, "grad_norm": 3.0113434885427934, "learning_rate": 1.551912568306011e-05, "loss": 0.9873, "step": 142 }, { "epoch": 0.02, "grad_norm": 17.060333094222216, "learning_rate": 1.5628415300546448e-05, "loss": 0.9762, "step": 143 }, { "epoch": 0.02, "grad_norm": 2.4712029656762495, "learning_rate": 1.5737704918032788e-05, "loss": 1.0201, "step": 144 }, { "epoch": 0.02, "grad_norm": 3.01443536849688, "learning_rate": 1.5846994535519128e-05, "loss": 1.0489, "step": 145 }, { "epoch": 0.02, "grad_norm": 1.417518914191741, "learning_rate": 1.5956284153005465e-05, "loss": 0.3392, "step": 146 }, { "epoch": 0.02, "grad_norm": 3.540863198238449, "learning_rate": 1.6065573770491805e-05, "loss": 1.024, "step": 147 }, { "epoch": 0.02, "grad_norm": 3.4073935159530575, "learning_rate": 1.6174863387978145e-05, "loss": 1.1068, "step": 148 }, { "epoch": 0.02, "grad_norm": 2.5600861856821786, "learning_rate": 1.628415300546448e-05, "loss": 1.05, "step": 149 }, { "epoch": 0.02, "grad_norm": 2.3515536648959725, "learning_rate": 1.639344262295082e-05, "loss": 1.0018, "step": 150 }, { "epoch": 0.02, "grad_norm": 5.499653409614938, "learning_rate": 1.650273224043716e-05, "loss": 1.0578, "step": 151 }, { "epoch": 0.02, "grad_norm": 2.497883008355538, "learning_rate": 1.66120218579235e-05, "loss": 1.0009, "step": 152 }, { "epoch": 0.03, "grad_norm": 2.4694244588064516, "learning_rate": 1.6721311475409837e-05, "loss": 1.0152, "step": 153 }, { "epoch": 0.03, "grad_norm": 2.909079250967417, "learning_rate": 1.6830601092896177e-05, "loss": 1.0089, "step": 154 }, { "epoch": 0.03, "grad_norm": 3.373093832275655, "learning_rate": 1.6939890710382517e-05, "loss": 1.0873, "step": 155 }, { "epoch": 0.03, "grad_norm": 2.751347132734379, "learning_rate": 1.7049180327868854e-05, "loss": 0.9756, "step": 156 }, { "epoch": 0.03, "grad_norm": 6.10082502826361, "learning_rate": 1.715846994535519e-05, "loss": 1.0879, "step": 157 }, { "epoch": 0.03, "grad_norm": 4.726546031531654, "learning_rate": 1.726775956284153e-05, "loss": 1.0295, "step": 158 }, { "epoch": 0.03, "grad_norm": 2.7218493665216252, "learning_rate": 1.737704918032787e-05, "loss": 1.1124, "step": 159 }, { "epoch": 0.03, "grad_norm": 3.0057925183196, "learning_rate": 1.7486338797814207e-05, "loss": 1.0027, "step": 160 }, { "epoch": 0.03, "grad_norm": 2.5064646365136256, "learning_rate": 1.7595628415300547e-05, "loss": 1.0656, "step": 161 }, { "epoch": 0.03, "grad_norm": 4.4344048222359564, "learning_rate": 1.7704918032786887e-05, "loss": 0.9411, "step": 162 }, { "epoch": 0.03, "grad_norm": 2.5524977132541733, "learning_rate": 1.7814207650273227e-05, "loss": 1.0545, "step": 163 }, { "epoch": 0.03, "grad_norm": 2.4527726417767246, "learning_rate": 1.7923497267759563e-05, "loss": 0.9313, "step": 164 }, { "epoch": 0.03, "grad_norm": 4.281482128109568, "learning_rate": 1.8032786885245903e-05, "loss": 0.9714, "step": 165 }, { "epoch": 0.03, "grad_norm": 2.8475256581824366, "learning_rate": 1.8142076502732243e-05, "loss": 1.0249, "step": 166 }, { "epoch": 0.03, "grad_norm": 3.3422702769589074, "learning_rate": 1.825136612021858e-05, "loss": 1.0037, "step": 167 }, { "epoch": 0.03, "grad_norm": 2.3692449382780962, "learning_rate": 1.836065573770492e-05, "loss": 0.9836, "step": 168 }, { "epoch": 0.03, "grad_norm": 3.549756136912152, "learning_rate": 1.846994535519126e-05, "loss": 0.9939, "step": 169 }, { "epoch": 0.03, "grad_norm": 2.5088425804255556, "learning_rate": 1.85792349726776e-05, "loss": 0.9821, "step": 170 }, { "epoch": 0.03, "grad_norm": 2.385868402934651, "learning_rate": 1.8688524590163936e-05, "loss": 0.9885, "step": 171 }, { "epoch": 0.03, "grad_norm": 0.7218489126465276, "learning_rate": 1.8797814207650276e-05, "loss": 0.3258, "step": 172 }, { "epoch": 0.03, "grad_norm": 3.1674629736410167, "learning_rate": 1.8907103825136616e-05, "loss": 1.0565, "step": 173 }, { "epoch": 0.03, "grad_norm": 2.9234007011414627, "learning_rate": 1.9016393442622952e-05, "loss": 1.0216, "step": 174 }, { "epoch": 0.03, "grad_norm": 3.1941425865341007, "learning_rate": 1.912568306010929e-05, "loss": 1.0294, "step": 175 }, { "epoch": 0.03, "grad_norm": 3.703572622192769, "learning_rate": 1.923497267759563e-05, "loss": 0.9003, "step": 176 }, { "epoch": 0.03, "grad_norm": 2.548963091307206, "learning_rate": 1.934426229508197e-05, "loss": 0.8968, "step": 177 }, { "epoch": 0.03, "grad_norm": 3.0714779295576284, "learning_rate": 1.9453551912568305e-05, "loss": 1.0731, "step": 178 }, { "epoch": 0.03, "grad_norm": 2.614194400641804, "learning_rate": 1.9562841530054645e-05, "loss": 1.0794, "step": 179 }, { "epoch": 0.03, "grad_norm": 3.793006494025213, "learning_rate": 1.9672131147540985e-05, "loss": 1.0823, "step": 180 }, { "epoch": 0.03, "grad_norm": 2.4897281437525693, "learning_rate": 1.9781420765027325e-05, "loss": 1.0755, "step": 181 }, { "epoch": 0.03, "grad_norm": 2.9325551138634625, "learning_rate": 1.9890710382513662e-05, "loss": 1.0658, "step": 182 }, { "epoch": 0.03, "grad_norm": 2.927121545775438, "learning_rate": 2e-05, "loss": 0.9869, "step": 183 }, { "epoch": 0.03, "grad_norm": 2.6225454720964274, "learning_rate": 1.999999858476052e-05, "loss": 1.0347, "step": 184 }, { "epoch": 0.03, "grad_norm": 2.534622204784922, "learning_rate": 1.999999433904248e-05, "loss": 1.0481, "step": 185 }, { "epoch": 0.03, "grad_norm": 4.523092343604785, "learning_rate": 1.999998726284708e-05, "loss": 0.9546, "step": 186 }, { "epoch": 0.03, "grad_norm": 0.911233825440132, "learning_rate": 1.999997735617632e-05, "loss": 0.3498, "step": 187 }, { "epoch": 0.03, "grad_norm": 2.8690566381779945, "learning_rate": 1.999996461903301e-05, "loss": 0.935, "step": 188 }, { "epoch": 0.03, "grad_norm": 0.6572236829351008, "learning_rate": 1.999994905142075e-05, "loss": 0.3275, "step": 189 }, { "epoch": 0.03, "grad_norm": 2.8114861437204803, "learning_rate": 1.999993065334395e-05, "loss": 1.0052, "step": 190 }, { "epoch": 0.03, "grad_norm": 3.544548917080019, "learning_rate": 1.999990942480782e-05, "loss": 0.9353, "step": 191 }, { "epoch": 0.03, "grad_norm": 2.3539401727214497, "learning_rate": 1.999988536581836e-05, "loss": 1.0455, "step": 192 }, { "epoch": 0.03, "grad_norm": 2.563997002912881, "learning_rate": 1.9999858476382388e-05, "loss": 1.0103, "step": 193 }, { "epoch": 0.03, "grad_norm": 2.7542929997717893, "learning_rate": 1.9999828756507512e-05, "loss": 0.9146, "step": 194 }, { "epoch": 0.03, "grad_norm": 2.536209634038412, "learning_rate": 1.999979620620214e-05, "loss": 0.9925, "step": 195 }, { "epoch": 0.03, "grad_norm": 2.246586735948212, "learning_rate": 1.9999760825475496e-05, "loss": 1.0301, "step": 196 }, { "epoch": 0.03, "grad_norm": 2.581612740998833, "learning_rate": 1.9999722614337585e-05, "loss": 1.0363, "step": 197 }, { "epoch": 0.03, "grad_norm": 1.2086200771361078, "learning_rate": 1.9999681572799226e-05, "loss": 0.336, "step": 198 }, { "epoch": 0.03, "grad_norm": 2.6540937211599736, "learning_rate": 1.9999637700872037e-05, "loss": 1.0381, "step": 199 }, { "epoch": 0.03, "grad_norm": 3.2960232453957063, "learning_rate": 1.9999590998568432e-05, "loss": 0.9762, "step": 200 }, { "epoch": 0.03, "grad_norm": 0.7597073137500212, "learning_rate": 1.9999541465901636e-05, "loss": 0.302, "step": 201 }, { "epoch": 0.03, "grad_norm": 10.237743986745652, "learning_rate": 1.9999489102885657e-05, "loss": 1.0247, "step": 202 }, { "epoch": 0.03, "grad_norm": 4.942311703192865, "learning_rate": 1.9999433909535333e-05, "loss": 1.0057, "step": 203 }, { "epoch": 0.03, "grad_norm": 3.1286680879668602, "learning_rate": 1.9999375885866272e-05, "loss": 1.014, "step": 204 }, { "epoch": 0.03, "grad_norm": 2.7022620145124217, "learning_rate": 1.9999315031894908e-05, "loss": 1.0321, "step": 205 }, { "epoch": 0.03, "grad_norm": 2.356325114648424, "learning_rate": 1.999925134763846e-05, "loss": 0.9818, "step": 206 }, { "epoch": 0.03, "grad_norm": 5.731334748372823, "learning_rate": 1.9999184833114952e-05, "loss": 1.0203, "step": 207 }, { "epoch": 0.03, "grad_norm": 3.259505796091457, "learning_rate": 1.9999115488343213e-05, "loss": 0.9781, "step": 208 }, { "epoch": 0.03, "grad_norm": 2.4262861116218333, "learning_rate": 1.9999043313342875e-05, "loss": 0.9899, "step": 209 }, { "epoch": 0.03, "grad_norm": 2.5243341134104225, "learning_rate": 1.9998968308134362e-05, "loss": 1.0135, "step": 210 }, { "epoch": 0.03, "grad_norm": 3.608770062399933, "learning_rate": 1.9998890472738902e-05, "loss": 1.079, "step": 211 }, { "epoch": 0.03, "grad_norm": 1.9720954785871398, "learning_rate": 1.9998809807178533e-05, "loss": 0.9281, "step": 212 }, { "epoch": 0.03, "grad_norm": 3.866365612140838, "learning_rate": 1.999872631147608e-05, "loss": 1.0369, "step": 213 }, { "epoch": 0.04, "grad_norm": 2.9337184487072894, "learning_rate": 1.9998639985655183e-05, "loss": 0.9803, "step": 214 }, { "epoch": 0.04, "grad_norm": 2.534896670057376, "learning_rate": 1.9998550829740277e-05, "loss": 1.0506, "step": 215 }, { "epoch": 0.04, "grad_norm": 1.3647465375395704, "learning_rate": 1.9998458843756587e-05, "loss": 0.3401, "step": 216 }, { "epoch": 0.04, "grad_norm": 2.678706400530421, "learning_rate": 1.999836402773016e-05, "loss": 1.0659, "step": 217 }, { "epoch": 0.04, "grad_norm": 2.2541520173568506, "learning_rate": 1.999826638168783e-05, "loss": 1.0261, "step": 218 }, { "epoch": 0.04, "grad_norm": 3.880434024972317, "learning_rate": 1.999816590565723e-05, "loss": 1.0332, "step": 219 }, { "epoch": 0.04, "grad_norm": 2.348605341490133, "learning_rate": 1.999806259966681e-05, "loss": 0.9659, "step": 220 }, { "epoch": 0.04, "grad_norm": 3.934943201874171, "learning_rate": 1.9997956463745806e-05, "loss": 1.0231, "step": 221 }, { "epoch": 0.04, "grad_norm": 0.9801241695984213, "learning_rate": 1.9997847497924258e-05, "loss": 0.3291, "step": 222 }, { "epoch": 0.04, "grad_norm": 2.5405631514247586, "learning_rate": 1.9997735702233006e-05, "loss": 1.0394, "step": 223 }, { "epoch": 0.04, "grad_norm": 3.163775387493945, "learning_rate": 1.99976210767037e-05, "loss": 1.0199, "step": 224 }, { "epoch": 0.04, "grad_norm": 2.4238624473626906, "learning_rate": 1.999750362136878e-05, "loss": 0.9457, "step": 225 }, { "epoch": 0.04, "grad_norm": 2.384071118800601, "learning_rate": 1.999738333626149e-05, "loss": 1.0134, "step": 226 }, { "epoch": 0.04, "grad_norm": 2.572090842391206, "learning_rate": 1.9997260221415883e-05, "loss": 0.9507, "step": 227 }, { "epoch": 0.04, "grad_norm": 2.6054745282135103, "learning_rate": 1.99971342768668e-05, "loss": 0.9835, "step": 228 }, { "epoch": 0.04, "grad_norm": 2.6021192178538746, "learning_rate": 1.99970055026499e-05, "loss": 0.9447, "step": 229 }, { "epoch": 0.04, "grad_norm": 3.024789413153466, "learning_rate": 1.9996873898801617e-05, "loss": 0.8999, "step": 230 }, { "epoch": 0.04, "grad_norm": 2.6786531209813877, "learning_rate": 1.9996739465359208e-05, "loss": 0.9598, "step": 231 }, { "epoch": 0.04, "grad_norm": 2.6961247477947334, "learning_rate": 1.9996602202360728e-05, "loss": 1.0336, "step": 232 }, { "epoch": 0.04, "grad_norm": 5.202048247485392, "learning_rate": 1.999646210984502e-05, "loss": 0.9702, "step": 233 }, { "epoch": 0.04, "grad_norm": 13.515143906990088, "learning_rate": 1.999631918785175e-05, "loss": 1.0695, "step": 234 }, { "epoch": 0.04, "grad_norm": 3.5328617122187134, "learning_rate": 1.999617343642136e-05, "loss": 1.0016, "step": 235 }, { "epoch": 0.04, "grad_norm": 2.5487678194445285, "learning_rate": 1.999602485559511e-05, "loss": 1.0239, "step": 236 }, { "epoch": 0.04, "grad_norm": 3.4866681105808626, "learning_rate": 1.9995873445415056e-05, "loss": 1.0353, "step": 237 }, { "epoch": 0.04, "grad_norm": 2.3179673945232273, "learning_rate": 1.999571920592405e-05, "loss": 0.9718, "step": 238 }, { "epoch": 0.04, "grad_norm": 3.0833718368517946, "learning_rate": 1.9995562137165752e-05, "loss": 0.9711, "step": 239 }, { "epoch": 0.04, "grad_norm": 2.371411783190079, "learning_rate": 1.9995402239184622e-05, "loss": 0.999, "step": 240 }, { "epoch": 0.04, "grad_norm": 1.7781080989350921, "learning_rate": 1.9995239512025916e-05, "loss": 0.3603, "step": 241 }, { "epoch": 0.04, "grad_norm": 5.20769876063206, "learning_rate": 1.9995073955735695e-05, "loss": 1.0144, "step": 242 }, { "epoch": 0.04, "grad_norm": 3.103826531794391, "learning_rate": 1.9994905570360817e-05, "loss": 1.0604, "step": 243 }, { "epoch": 0.04, "grad_norm": 2.0302279889001382, "learning_rate": 1.9994734355948943e-05, "loss": 0.9707, "step": 244 }, { "epoch": 0.04, "grad_norm": 3.2781306774070917, "learning_rate": 1.9994560312548538e-05, "loss": 0.9226, "step": 245 }, { "epoch": 0.04, "grad_norm": 2.3221448231902513, "learning_rate": 1.9994383440208867e-05, "loss": 1.0052, "step": 246 }, { "epoch": 0.04, "grad_norm": 0.9700540216215806, "learning_rate": 1.9994203738979984e-05, "loss": 0.3309, "step": 247 }, { "epoch": 0.04, "grad_norm": 2.7682766160157404, "learning_rate": 1.999402120891276e-05, "loss": 1.0395, "step": 248 }, { "epoch": 0.04, "grad_norm": 2.023252476234934, "learning_rate": 1.999383585005886e-05, "loss": 0.8517, "step": 249 }, { "epoch": 0.04, "grad_norm": 3.0870357374258037, "learning_rate": 1.9993647662470746e-05, "loss": 1.079, "step": 250 }, { "epoch": 0.04, "grad_norm": 2.5752024818429744, "learning_rate": 1.999345664620169e-05, "loss": 1.0098, "step": 251 }, { "epoch": 0.04, "grad_norm": 2.3642213207560516, "learning_rate": 1.999326280130575e-05, "loss": 0.8917, "step": 252 }, { "epoch": 0.04, "grad_norm": 2.251821698170165, "learning_rate": 1.99930661278378e-05, "loss": 0.8402, "step": 253 }, { "epoch": 0.04, "grad_norm": 2.624506237910221, "learning_rate": 1.999286662585351e-05, "loss": 1.0483, "step": 254 }, { "epoch": 0.04, "grad_norm": 3.203421853770946, "learning_rate": 1.9992664295409338e-05, "loss": 0.9812, "step": 255 }, { "epoch": 0.04, "grad_norm": 2.700395696998586, "learning_rate": 1.999245913656256e-05, "loss": 1.0033, "step": 256 }, { "epoch": 0.04, "grad_norm": 2.98156942862375, "learning_rate": 1.9992251149371253e-05, "loss": 1.011, "step": 257 }, { "epoch": 0.04, "grad_norm": 1.881402152407015, "learning_rate": 1.9992040333894273e-05, "loss": 0.9937, "step": 258 }, { "epoch": 0.04, "grad_norm": 0.8300837130666848, "learning_rate": 1.9991826690191304e-05, "loss": 0.365, "step": 259 }, { "epoch": 0.04, "grad_norm": 2.393530561694459, "learning_rate": 1.9991610218322804e-05, "loss": 0.9536, "step": 260 }, { "epoch": 0.04, "grad_norm": 2.7224881802704575, "learning_rate": 1.9991390918350054e-05, "loss": 1.0193, "step": 261 }, { "epoch": 0.04, "grad_norm": 3.1944741314509417, "learning_rate": 1.999116879033513e-05, "loss": 1.0215, "step": 262 }, { "epoch": 0.04, "grad_norm": 2.4566909150828455, "learning_rate": 1.9990943834340893e-05, "loss": 0.9636, "step": 263 }, { "epoch": 0.04, "grad_norm": 2.654936923320021, "learning_rate": 1.9990716050431026e-05, "loss": 0.9368, "step": 264 }, { "epoch": 0.04, "grad_norm": 2.393383495258629, "learning_rate": 1.9990485438669998e-05, "loss": 0.9465, "step": 265 }, { "epoch": 0.04, "grad_norm": 2.6030889957178496, "learning_rate": 1.9990251999123086e-05, "loss": 0.9967, "step": 266 }, { "epoch": 0.04, "grad_norm": 2.9712223658761805, "learning_rate": 1.9990015731856366e-05, "loss": 0.9807, "step": 267 }, { "epoch": 0.04, "grad_norm": 2.571428660927818, "learning_rate": 1.9989776636936705e-05, "loss": 1.002, "step": 268 }, { "epoch": 0.04, "grad_norm": 2.6675701945173436, "learning_rate": 1.9989534714431788e-05, "loss": 0.9957, "step": 269 }, { "epoch": 0.04, "grad_norm": 2.7088810000069827, "learning_rate": 1.9989289964410082e-05, "loss": 0.9117, "step": 270 }, { "epoch": 0.04, "grad_norm": 3.5352934285012174, "learning_rate": 1.9989042386940872e-05, "loss": 0.9512, "step": 271 }, { "epoch": 0.04, "grad_norm": 2.0891770723806844, "learning_rate": 1.9988791982094224e-05, "loss": 0.9682, "step": 272 }, { "epoch": 0.04, "grad_norm": 2.6275547157778796, "learning_rate": 1.9988538749941024e-05, "loss": 1.0189, "step": 273 }, { "epoch": 0.05, "grad_norm": 1.812883079998693, "learning_rate": 1.998828269055294e-05, "loss": 0.9702, "step": 274 }, { "epoch": 0.05, "grad_norm": 0.9661250797842796, "learning_rate": 1.9988023804002463e-05, "loss": 0.3478, "step": 275 }, { "epoch": 0.05, "grad_norm": 2.0639141322837733, "learning_rate": 1.9987762090362855e-05, "loss": 1.0166, "step": 276 }, { "epoch": 0.05, "grad_norm": 2.6698952831108427, "learning_rate": 1.9987497549708197e-05, "loss": 1.0014, "step": 277 }, { "epoch": 0.05, "grad_norm": 2.3680103721466934, "learning_rate": 1.9987230182113374e-05, "loss": 0.909, "step": 278 }, { "epoch": 0.05, "grad_norm": 3.1338342880100196, "learning_rate": 1.998695998765406e-05, "loss": 0.9019, "step": 279 }, { "epoch": 0.05, "grad_norm": 2.816622358286671, "learning_rate": 1.9986686966406728e-05, "loss": 0.9712, "step": 280 }, { "epoch": 0.05, "grad_norm": 2.623038771193023, "learning_rate": 1.9986411118448666e-05, "loss": 0.9934, "step": 281 }, { "epoch": 0.05, "grad_norm": 3.3572973667036727, "learning_rate": 1.9986132443857942e-05, "loss": 0.9818, "step": 282 }, { "epoch": 0.05, "grad_norm": 3.3000842648760176, "learning_rate": 1.998585094271344e-05, "loss": 0.9283, "step": 283 }, { "epoch": 0.05, "grad_norm": 4.670721345960199, "learning_rate": 1.9985566615094836e-05, "loss": 1.007, "step": 284 }, { "epoch": 0.05, "grad_norm": 0.9525169128237011, "learning_rate": 1.998527946108261e-05, "loss": 0.3224, "step": 285 }, { "epoch": 0.05, "grad_norm": 2.2390220392640203, "learning_rate": 1.9984989480758038e-05, "loss": 1.0218, "step": 286 }, { "epoch": 0.05, "grad_norm": 2.239391833331459, "learning_rate": 1.9984696674203204e-05, "loss": 0.968, "step": 287 }, { "epoch": 0.05, "grad_norm": 11.616682603944552, "learning_rate": 1.998440104150098e-05, "loss": 0.903, "step": 288 }, { "epoch": 0.05, "grad_norm": 3.127607735322672, "learning_rate": 1.998410258273505e-05, "loss": 1.016, "step": 289 }, { "epoch": 0.05, "grad_norm": 1.936636331457956, "learning_rate": 1.9983801297989883e-05, "loss": 1.0016, "step": 290 }, { "epoch": 0.05, "grad_norm": 2.0580269680560033, "learning_rate": 1.9983497187350767e-05, "loss": 0.8886, "step": 291 }, { "epoch": 0.05, "grad_norm": 2.563829561689164, "learning_rate": 1.998319025090377e-05, "loss": 0.9058, "step": 292 }, { "epoch": 0.05, "grad_norm": 3.5094041328990784, "learning_rate": 1.998288048873578e-05, "loss": 0.9963, "step": 293 }, { "epoch": 0.05, "grad_norm": 2.517706994997928, "learning_rate": 1.998256790093447e-05, "loss": 0.9151, "step": 294 }, { "epoch": 0.05, "grad_norm": 2.259491449517112, "learning_rate": 1.9982252487588315e-05, "loss": 0.9269, "step": 295 }, { "epoch": 0.05, "grad_norm": 2.504405761308801, "learning_rate": 1.9981934248786596e-05, "loss": 0.982, "step": 296 }, { "epoch": 0.05, "grad_norm": 0.9261537116572233, "learning_rate": 1.9981613184619387e-05, "loss": 0.3588, "step": 297 }, { "epoch": 0.05, "grad_norm": 3.0344793997247472, "learning_rate": 1.9981289295177566e-05, "loss": 0.9424, "step": 298 }, { "epoch": 0.05, "grad_norm": 2.288920727831465, "learning_rate": 1.9980962580552808e-05, "loss": 0.9867, "step": 299 }, { "epoch": 0.05, "grad_norm": 0.6743420268326172, "learning_rate": 1.998063304083759e-05, "loss": 0.3518, "step": 300 }, { "epoch": 0.05, "grad_norm": 2.620895129341444, "learning_rate": 1.9980300676125188e-05, "loss": 0.9736, "step": 301 }, { "epoch": 0.05, "grad_norm": 2.4726401914703025, "learning_rate": 1.9979965486509676e-05, "loss": 0.9484, "step": 302 }, { "epoch": 0.05, "grad_norm": 2.1133044917219417, "learning_rate": 1.9979627472085927e-05, "loss": 0.9264, "step": 303 }, { "epoch": 0.05, "grad_norm": 2.874281065701892, "learning_rate": 1.997928663294962e-05, "loss": 1.0567, "step": 304 }, { "epoch": 0.05, "grad_norm": 2.42344685591187, "learning_rate": 1.9978942969197224e-05, "loss": 0.964, "step": 305 }, { "epoch": 0.05, "grad_norm": 4.744187896684878, "learning_rate": 1.9978596480926017e-05, "loss": 0.8798, "step": 306 }, { "epoch": 0.05, "grad_norm": 2.538570171042028, "learning_rate": 1.9978247168234065e-05, "loss": 0.9533, "step": 307 }, { "epoch": 0.05, "grad_norm": 2.776000547798866, "learning_rate": 1.997789503122025e-05, "loss": 1.0314, "step": 308 }, { "epoch": 0.05, "grad_norm": 2.2049486520267316, "learning_rate": 1.9977540069984233e-05, "loss": 0.8667, "step": 309 }, { "epoch": 0.05, "grad_norm": 2.6906388169180775, "learning_rate": 1.9977182284626492e-05, "loss": 0.9138, "step": 310 }, { "epoch": 0.05, "grad_norm": 2.3668506662326667, "learning_rate": 1.9976821675248298e-05, "loss": 0.9874, "step": 311 }, { "epoch": 0.05, "grad_norm": 2.2717312951433235, "learning_rate": 1.9976458241951715e-05, "loss": 0.9154, "step": 312 }, { "epoch": 0.05, "grad_norm": 2.3405168948630024, "learning_rate": 1.9976091984839616e-05, "loss": 0.9112, "step": 313 }, { "epoch": 0.05, "grad_norm": 1.9679055414605273, "learning_rate": 1.997572290401567e-05, "loss": 1.0024, "step": 314 }, { "epoch": 0.05, "grad_norm": 2.587541997131777, "learning_rate": 1.9975350999584342e-05, "loss": 0.934, "step": 315 }, { "epoch": 0.05, "grad_norm": 2.5633806388144142, "learning_rate": 1.99749762716509e-05, "loss": 1.0557, "step": 316 }, { "epoch": 0.05, "grad_norm": 2.646351221550071, "learning_rate": 1.9974598720321407e-05, "loss": 0.9646, "step": 317 }, { "epoch": 0.05, "grad_norm": 2.4128391537347444, "learning_rate": 1.9974218345702733e-05, "loss": 0.8994, "step": 318 }, { "epoch": 0.05, "grad_norm": 2.2425650114217413, "learning_rate": 1.997383514790254e-05, "loss": 0.9417, "step": 319 }, { "epoch": 0.05, "grad_norm": 2.410243745362762, "learning_rate": 1.9973449127029296e-05, "loss": 0.958, "step": 320 }, { "epoch": 0.05, "grad_norm": 3.3019101296613287, "learning_rate": 1.9973060283192253e-05, "loss": 1.0252, "step": 321 }, { "epoch": 0.05, "grad_norm": 2.531584680423932, "learning_rate": 1.997266861650148e-05, "loss": 0.998, "step": 322 }, { "epoch": 0.05, "grad_norm": 1.424662093539455, "learning_rate": 1.9972274127067838e-05, "loss": 0.3643, "step": 323 }, { "epoch": 0.05, "grad_norm": 1.876647917762673, "learning_rate": 1.997187681500298e-05, "loss": 0.9693, "step": 324 }, { "epoch": 0.05, "grad_norm": 2.353135271651182, "learning_rate": 1.9971476680419372e-05, "loss": 0.9969, "step": 325 }, { "epoch": 0.05, "grad_norm": 2.4240931303399247, "learning_rate": 1.9971073723430266e-05, "loss": 0.8588, "step": 326 }, { "epoch": 0.05, "grad_norm": 3.613479741363531, "learning_rate": 1.997066794414972e-05, "loss": 0.917, "step": 327 }, { "epoch": 0.05, "grad_norm": 2.8142281331879775, "learning_rate": 1.997025934269259e-05, "loss": 0.9429, "step": 328 }, { "epoch": 0.05, "grad_norm": 1.8632227505099102, "learning_rate": 1.9969847919174525e-05, "loss": 0.892, "step": 329 }, { "epoch": 0.05, "grad_norm": 2.8423348511031667, "learning_rate": 1.9969433673711984e-05, "loss": 0.9253, "step": 330 }, { "epoch": 0.05, "grad_norm": 2.217200116790008, "learning_rate": 1.9969016606422215e-05, "loss": 0.9103, "step": 331 }, { "epoch": 0.05, "grad_norm": 2.4197113503157355, "learning_rate": 1.996859671742327e-05, "loss": 0.9739, "step": 332 }, { "epoch": 0.05, "grad_norm": 2.5395991709435415, "learning_rate": 1.9968174006833996e-05, "loss": 1.0175, "step": 333 }, { "epoch": 0.05, "grad_norm": 2.453409760160311, "learning_rate": 1.996774847477404e-05, "loss": 0.9148, "step": 334 }, { "epoch": 0.06, "grad_norm": 2.4305703606425544, "learning_rate": 1.9967320121363845e-05, "loss": 0.9194, "step": 335 }, { "epoch": 0.06, "grad_norm": 2.557720617541986, "learning_rate": 1.9966888946724663e-05, "loss": 0.9403, "step": 336 }, { "epoch": 0.06, "grad_norm": 1.8565534711977656, "learning_rate": 1.9966454950978534e-05, "loss": 0.9367, "step": 337 }, { "epoch": 0.06, "grad_norm": 2.1342740321879976, "learning_rate": 1.9966018134248296e-05, "loss": 0.9109, "step": 338 }, { "epoch": 0.06, "grad_norm": 2.8700294733675626, "learning_rate": 1.9965578496657593e-05, "loss": 0.9404, "step": 339 }, { "epoch": 0.06, "grad_norm": 2.3897654491313607, "learning_rate": 1.996513603833086e-05, "loss": 1.0065, "step": 340 }, { "epoch": 0.06, "grad_norm": 2.516554169705197, "learning_rate": 1.996469075939334e-05, "loss": 0.9174, "step": 341 }, { "epoch": 0.06, "grad_norm": 2.074994360250604, "learning_rate": 1.9964242659971063e-05, "loss": 0.9492, "step": 342 }, { "epoch": 0.06, "grad_norm": 2.930949844337704, "learning_rate": 1.9963791740190863e-05, "loss": 0.9571, "step": 343 }, { "epoch": 0.06, "grad_norm": 2.3956925077532056, "learning_rate": 1.9963338000180374e-05, "loss": 0.9841, "step": 344 }, { "epoch": 0.06, "grad_norm": 1.0710254869373075, "learning_rate": 1.9962881440068026e-05, "loss": 0.3955, "step": 345 }, { "epoch": 0.06, "grad_norm": 2.3757003801729337, "learning_rate": 1.9962422059983045e-05, "loss": 0.9826, "step": 346 }, { "epoch": 0.06, "grad_norm": 2.4879850584643752, "learning_rate": 1.9961959860055456e-05, "loss": 0.9306, "step": 347 }, { "epoch": 0.06, "grad_norm": 0.6994299865956333, "learning_rate": 1.996149484041609e-05, "loss": 0.3252, "step": 348 }, { "epoch": 0.06, "grad_norm": 2.290509990677671, "learning_rate": 1.9961027001196568e-05, "loss": 0.9466, "step": 349 }, { "epoch": 0.06, "grad_norm": 0.6931438964398902, "learning_rate": 1.996055634252931e-05, "loss": 0.336, "step": 350 }, { "epoch": 0.06, "grad_norm": 2.0477098616451954, "learning_rate": 1.9960082864547528e-05, "loss": 1.009, "step": 351 }, { "epoch": 0.06, "grad_norm": 2.271703626934976, "learning_rate": 1.995960656738525e-05, "loss": 0.9245, "step": 352 }, { "epoch": 0.06, "grad_norm": 2.3901315167380672, "learning_rate": 1.9959127451177287e-05, "loss": 0.9605, "step": 353 }, { "epoch": 0.06, "grad_norm": 2.8156163097899753, "learning_rate": 1.9958645516059247e-05, "loss": 0.906, "step": 354 }, { "epoch": 0.06, "grad_norm": 2.2033150668693877, "learning_rate": 1.995816076216755e-05, "loss": 0.9028, "step": 355 }, { "epoch": 0.06, "grad_norm": 2.6011957157181635, "learning_rate": 1.9957673189639398e-05, "loss": 0.8479, "step": 356 }, { "epoch": 0.06, "grad_norm": 2.2440703046320847, "learning_rate": 1.9957182798612797e-05, "loss": 1.0131, "step": 357 }, { "epoch": 0.06, "grad_norm": 2.1530221450444142, "learning_rate": 1.9956689589226555e-05, "loss": 0.9677, "step": 358 }, { "epoch": 0.06, "grad_norm": 3.850510066087014, "learning_rate": 1.9956193561620267e-05, "loss": 0.9093, "step": 359 }, { "epoch": 0.06, "grad_norm": 2.11921655346522, "learning_rate": 1.9955694715934344e-05, "loss": 0.9082, "step": 360 }, { "epoch": 0.06, "grad_norm": 2.6129120982816825, "learning_rate": 1.9955193052309972e-05, "loss": 0.9625, "step": 361 }, { "epoch": 0.06, "grad_norm": 2.160471423483292, "learning_rate": 1.9954688570889152e-05, "loss": 0.9813, "step": 362 }, { "epoch": 0.06, "grad_norm": 21.439278323174342, "learning_rate": 1.9954181271814673e-05, "loss": 0.8984, "step": 363 }, { "epoch": 0.06, "grad_norm": 5.149492349386487, "learning_rate": 1.995367115523013e-05, "loss": 0.9425, "step": 364 }, { "epoch": 0.06, "grad_norm": 2.449838659313054, "learning_rate": 1.9953158221279906e-05, "loss": 1.0034, "step": 365 }, { "epoch": 0.06, "grad_norm": 2.7551306200333854, "learning_rate": 1.9952642470109185e-05, "loss": 0.923, "step": 366 }, { "epoch": 0.06, "grad_norm": 1.703743255537708, "learning_rate": 1.9952123901863953e-05, "loss": 0.9031, "step": 367 }, { "epoch": 0.06, "grad_norm": 4.634592751910485, "learning_rate": 1.9951602516690988e-05, "loss": 0.9268, "step": 368 }, { "epoch": 0.06, "grad_norm": 2.0823949814446054, "learning_rate": 1.995107831473787e-05, "loss": 0.931, "step": 369 }, { "epoch": 0.06, "grad_norm": 2.2818332260820293, "learning_rate": 1.9950551296152965e-05, "loss": 0.8488, "step": 370 }, { "epoch": 0.06, "grad_norm": 2.0933897298859696, "learning_rate": 1.9950021461085452e-05, "loss": 0.9139, "step": 371 }, { "epoch": 0.06, "grad_norm": 1.8076265249122476, "learning_rate": 1.99494888096853e-05, "loss": 0.3888, "step": 372 }, { "epoch": 0.06, "grad_norm": 2.0961526805238044, "learning_rate": 1.9948953342103268e-05, "loss": 0.9625, "step": 373 }, { "epoch": 0.06, "grad_norm": 3.2812591285234642, "learning_rate": 1.9948415058490926e-05, "loss": 0.9804, "step": 374 }, { "epoch": 0.06, "grad_norm": 3.8153329686649364, "learning_rate": 1.994787395900063e-05, "loss": 0.9233, "step": 375 }, { "epoch": 0.06, "grad_norm": 1.990861901197015, "learning_rate": 1.994733004378554e-05, "loss": 0.9154, "step": 376 }, { "epoch": 0.06, "grad_norm": 2.3179491750202432, "learning_rate": 1.9946783312999606e-05, "loss": 0.9763, "step": 377 }, { "epoch": 0.06, "grad_norm": 2.5759462437738128, "learning_rate": 1.994623376679758e-05, "loss": 0.9744, "step": 378 }, { "epoch": 0.06, "grad_norm": 2.020620980969699, "learning_rate": 1.9945681405335016e-05, "loss": 0.9441, "step": 379 }, { "epoch": 0.06, "grad_norm": 0.9824789361238082, "learning_rate": 1.9945126228768252e-05, "loss": 0.3902, "step": 380 }, { "epoch": 0.06, "grad_norm": 2.250193367321044, "learning_rate": 1.994456823725443e-05, "loss": 0.9711, "step": 381 }, { "epoch": 0.06, "grad_norm": 2.0478825749159766, "learning_rate": 1.994400743095149e-05, "loss": 0.9662, "step": 382 }, { "epoch": 0.06, "grad_norm": 2.313844894328072, "learning_rate": 1.9943443810018174e-05, "loss": 0.9679, "step": 383 }, { "epoch": 0.06, "grad_norm": 1.9590117108106193, "learning_rate": 1.9942877374614e-05, "loss": 0.9279, "step": 384 }, { "epoch": 0.06, "grad_norm": 2.4987495406822666, "learning_rate": 1.9942308124899307e-05, "loss": 1.0187, "step": 385 }, { "epoch": 0.06, "grad_norm": 1.891992280657028, "learning_rate": 1.9941736061035213e-05, "loss": 1.012, "step": 386 }, { "epoch": 0.06, "grad_norm": 2.624147784251444, "learning_rate": 1.9941161183183643e-05, "loss": 0.9857, "step": 387 }, { "epoch": 0.06, "grad_norm": 2.6807907645586306, "learning_rate": 1.9940583491507314e-05, "loss": 0.8842, "step": 388 }, { "epoch": 0.06, "grad_norm": 2.440382080479501, "learning_rate": 1.994000298616974e-05, "loss": 0.8731, "step": 389 }, { "epoch": 0.06, "grad_norm": 2.025918654230008, "learning_rate": 1.9939419667335236e-05, "loss": 0.8806, "step": 390 }, { "epoch": 0.06, "grad_norm": 2.2763623286484096, "learning_rate": 1.99388335351689e-05, "loss": 0.9338, "step": 391 }, { "epoch": 0.06, "grad_norm": 2.522299225594755, "learning_rate": 1.9938244589836646e-05, "loss": 0.947, "step": 392 }, { "epoch": 0.06, "grad_norm": 2.291720326504215, "learning_rate": 1.993765283150517e-05, "loss": 0.8704, "step": 393 }, { "epoch": 0.06, "grad_norm": 1.8401831306048535, "learning_rate": 1.9937058260341967e-05, "loss": 0.9397, "step": 394 }, { "epoch": 0.06, "grad_norm": 1.9967891261091435, "learning_rate": 1.9936460876515323e-05, "loss": 0.9397, "step": 395 }, { "epoch": 0.07, "grad_norm": 3.0223891003226506, "learning_rate": 1.993586068019434e-05, "loss": 0.9806, "step": 396 }, { "epoch": 0.07, "grad_norm": 2.663398964748326, "learning_rate": 1.993525767154889e-05, "loss": 0.8183, "step": 397 }, { "epoch": 0.07, "grad_norm": 2.4859180838144397, "learning_rate": 1.9934651850749663e-05, "loss": 0.9238, "step": 398 }, { "epoch": 0.07, "grad_norm": 1.949360391085408, "learning_rate": 1.9934043217968124e-05, "loss": 0.9742, "step": 399 }, { "epoch": 0.07, "grad_norm": 3.5948064494788867, "learning_rate": 1.9933431773376554e-05, "loss": 1.037, "step": 400 }, { "epoch": 0.07, "grad_norm": 2.0053676757135745, "learning_rate": 1.993281751714802e-05, "loss": 0.9506, "step": 401 }, { "epoch": 0.07, "grad_norm": 2.7812653464512596, "learning_rate": 1.9932200449456385e-05, "loss": 0.9143, "step": 402 }, { "epoch": 0.07, "grad_norm": 2.3500804570409164, "learning_rate": 1.9931580570476306e-05, "loss": 0.9232, "step": 403 }, { "epoch": 0.07, "grad_norm": 2.5921034853078275, "learning_rate": 1.993095788038324e-05, "loss": 0.9831, "step": 404 }, { "epoch": 0.07, "grad_norm": 1.9241940479536161, "learning_rate": 1.993033237935344e-05, "loss": 0.9286, "step": 405 }, { "epoch": 0.07, "grad_norm": 1.9908608404587544, "learning_rate": 1.9929704067563957e-05, "loss": 0.9897, "step": 406 }, { "epoch": 0.07, "grad_norm": 2.0000951179735478, "learning_rate": 1.992907294519262e-05, "loss": 0.9674, "step": 407 }, { "epoch": 0.07, "grad_norm": 2.3303304125554667, "learning_rate": 1.9928439012418076e-05, "loss": 0.9105, "step": 408 }, { "epoch": 0.07, "grad_norm": 2.301935908526815, "learning_rate": 1.992780226941976e-05, "loss": 0.9777, "step": 409 }, { "epoch": 0.07, "grad_norm": 2.4192617388653033, "learning_rate": 1.9927162716377896e-05, "loss": 0.9253, "step": 410 }, { "epoch": 0.07, "grad_norm": 2.136943430635333, "learning_rate": 1.992652035347351e-05, "loss": 0.9551, "step": 411 }, { "epoch": 0.07, "grad_norm": 2.1049812719724392, "learning_rate": 1.9925875180888426e-05, "loss": 1.0022, "step": 412 }, { "epoch": 0.07, "grad_norm": 2.903668474269452, "learning_rate": 1.9925227198805247e-05, "loss": 0.955, "step": 413 }, { "epoch": 0.07, "grad_norm": 2.0624073145777353, "learning_rate": 1.9924576407407398e-05, "loss": 0.9634, "step": 414 }, { "epoch": 0.07, "grad_norm": 2.3951240688021627, "learning_rate": 1.992392280687907e-05, "loss": 0.8888, "step": 415 }, { "epoch": 0.07, "grad_norm": 1.7216035202119706, "learning_rate": 1.9923266397405273e-05, "loss": 0.9139, "step": 416 }, { "epoch": 0.07, "grad_norm": 1.8299876916267162, "learning_rate": 1.9922607179171796e-05, "loss": 0.8914, "step": 417 }, { "epoch": 0.07, "grad_norm": 2.0154396655189997, "learning_rate": 1.9921945152365235e-05, "loss": 0.9826, "step": 418 }, { "epoch": 0.07, "grad_norm": 1.5634692470886324, "learning_rate": 1.992128031717297e-05, "loss": 0.9811, "step": 419 }, { "epoch": 0.07, "grad_norm": 1.8372806136920297, "learning_rate": 1.9920612673783186e-05, "loss": 0.9234, "step": 420 }, { "epoch": 0.07, "grad_norm": 2.6900871392274928, "learning_rate": 1.9919942222384855e-05, "loss": 0.9675, "step": 421 }, { "epoch": 0.07, "grad_norm": 2.4774633992958806, "learning_rate": 1.9919268963167747e-05, "loss": 0.9382, "step": 422 }, { "epoch": 0.07, "grad_norm": 2.2456172632712086, "learning_rate": 1.9918592896322432e-05, "loss": 0.9966, "step": 423 }, { "epoch": 0.07, "grad_norm": 1.7055953876283685, "learning_rate": 1.9917914022040258e-05, "loss": 1.0356, "step": 424 }, { "epoch": 0.07, "grad_norm": 3.3953037403722215, "learning_rate": 1.9917232340513388e-05, "loss": 0.811, "step": 425 }, { "epoch": 0.07, "grad_norm": 2.306579723533434, "learning_rate": 1.9916547851934768e-05, "loss": 0.9114, "step": 426 }, { "epoch": 0.07, "grad_norm": 1.9900196466229794, "learning_rate": 1.991586055649814e-05, "loss": 0.9435, "step": 427 }, { "epoch": 0.07, "grad_norm": 2.0407205736870027, "learning_rate": 1.9915170454398045e-05, "loss": 0.8549, "step": 428 }, { "epoch": 0.07, "grad_norm": 2.2973426385699804, "learning_rate": 1.991447754582981e-05, "loss": 1.004, "step": 429 }, { "epoch": 0.07, "grad_norm": 1.7325632050195026, "learning_rate": 1.9913781830989568e-05, "loss": 0.8657, "step": 430 }, { "epoch": 0.07, "grad_norm": 3.324983499707621, "learning_rate": 1.991308331007423e-05, "loss": 0.9484, "step": 431 }, { "epoch": 0.07, "grad_norm": 1.8328692799589519, "learning_rate": 1.9912381983281518e-05, "loss": 1.0271, "step": 432 }, { "epoch": 0.07, "grad_norm": 1.9428265911838523, "learning_rate": 1.9911677850809943e-05, "loss": 0.9554, "step": 433 }, { "epoch": 0.07, "grad_norm": 2.0922690690172954, "learning_rate": 1.9910970912858802e-05, "loss": 0.9974, "step": 434 }, { "epoch": 0.07, "grad_norm": 2.6090170580387104, "learning_rate": 1.9910261169628195e-05, "loss": 0.8831, "step": 435 }, { "epoch": 0.07, "grad_norm": 1.6969331907389997, "learning_rate": 1.9909548621319014e-05, "loss": 0.8826, "step": 436 }, { "epoch": 0.07, "grad_norm": 0.8662281245374749, "learning_rate": 1.9908833268132943e-05, "loss": 0.3681, "step": 437 }, { "epoch": 0.07, "grad_norm": 3.6795329802505683, "learning_rate": 1.9908115110272463e-05, "loss": 0.9571, "step": 438 }, { "epoch": 0.07, "grad_norm": 2.5448135242354843, "learning_rate": 1.9907394147940845e-05, "loss": 0.9115, "step": 439 }, { "epoch": 0.07, "grad_norm": 1.6198232980370235, "learning_rate": 1.9906670381342156e-05, "loss": 0.8831, "step": 440 }, { "epoch": 0.07, "grad_norm": 2.145973248884581, "learning_rate": 1.9905943810681257e-05, "loss": 0.9373, "step": 441 }, { "epoch": 0.07, "grad_norm": 2.601444500220834, "learning_rate": 1.9905214436163806e-05, "loss": 0.8586, "step": 442 }, { "epoch": 0.07, "grad_norm": 2.156090077237774, "learning_rate": 1.9904482257996244e-05, "loss": 0.9114, "step": 443 }, { "epoch": 0.07, "grad_norm": 2.050425500640629, "learning_rate": 1.9903747276385816e-05, "loss": 0.9137, "step": 444 }, { "epoch": 0.07, "grad_norm": 2.47306088839831, "learning_rate": 1.9903009491540558e-05, "loss": 0.9358, "step": 445 }, { "epoch": 0.07, "grad_norm": 3.1122373833965304, "learning_rate": 1.99022689036693e-05, "loss": 0.8821, "step": 446 }, { "epoch": 0.07, "grad_norm": 2.070679589899214, "learning_rate": 1.9901525512981656e-05, "loss": 0.8692, "step": 447 }, { "epoch": 0.07, "grad_norm": 2.24057916012188, "learning_rate": 1.990077931968805e-05, "loss": 0.9148, "step": 448 }, { "epoch": 0.07, "grad_norm": 2.061744782282545, "learning_rate": 1.990003032399969e-05, "loss": 0.9548, "step": 449 }, { "epoch": 0.07, "grad_norm": 10.39965238693937, "learning_rate": 1.989927852612857e-05, "loss": 0.9281, "step": 450 }, { "epoch": 0.07, "grad_norm": 2.3905313779299386, "learning_rate": 1.9898523926287488e-05, "loss": 0.956, "step": 451 }, { "epoch": 0.07, "grad_norm": 2.1263619039433577, "learning_rate": 1.9897766524690038e-05, "loss": 0.9568, "step": 452 }, { "epoch": 0.07, "grad_norm": 2.299349245693766, "learning_rate": 1.9897006321550592e-05, "loss": 0.8967, "step": 453 }, { "epoch": 0.07, "grad_norm": 2.1421224578730977, "learning_rate": 1.9896243317084333e-05, "loss": 0.9593, "step": 454 }, { "epoch": 0.07, "grad_norm": 2.2176763943496263, "learning_rate": 1.989547751150722e-05, "loss": 0.9938, "step": 455 }, { "epoch": 0.07, "grad_norm": 2.1373219933915304, "learning_rate": 1.9894708905036015e-05, "loss": 0.8722, "step": 456 }, { "epoch": 0.08, "grad_norm": 2.1474140592073825, "learning_rate": 1.989393749788827e-05, "loss": 1.036, "step": 457 }, { "epoch": 0.08, "grad_norm": 2.45753586008728, "learning_rate": 1.9893163290282335e-05, "loss": 0.8735, "step": 458 }, { "epoch": 0.08, "grad_norm": 1.7917975960962658, "learning_rate": 1.9892386282437344e-05, "loss": 0.9331, "step": 459 }, { "epoch": 0.08, "grad_norm": 2.379809880600318, "learning_rate": 1.9891606474573225e-05, "loss": 0.8687, "step": 460 }, { "epoch": 0.08, "grad_norm": 2.7817381351229105, "learning_rate": 1.9890823866910702e-05, "loss": 0.967, "step": 461 }, { "epoch": 0.08, "grad_norm": 2.278703138901829, "learning_rate": 1.9890038459671297e-05, "loss": 0.8503, "step": 462 }, { "epoch": 0.08, "grad_norm": 2.166199205165296, "learning_rate": 1.9889250253077306e-05, "loss": 0.849, "step": 463 }, { "epoch": 0.08, "grad_norm": 1.862550590313638, "learning_rate": 1.9888459247351844e-05, "loss": 0.9084, "step": 464 }, { "epoch": 0.08, "grad_norm": 3.2563070047809184, "learning_rate": 1.988766544271879e-05, "loss": 0.9286, "step": 465 }, { "epoch": 0.08, "grad_norm": 1.955635752669839, "learning_rate": 1.9886868839402837e-05, "loss": 0.9098, "step": 466 }, { "epoch": 0.08, "grad_norm": 1.989339299938392, "learning_rate": 1.9886069437629456e-05, "loss": 0.8969, "step": 467 }, { "epoch": 0.08, "grad_norm": 2.2570910120363044, "learning_rate": 1.9885267237624923e-05, "loss": 0.9838, "step": 468 }, { "epoch": 0.08, "grad_norm": 1.938289785440406, "learning_rate": 1.9884462239616292e-05, "loss": 0.9376, "step": 469 }, { "epoch": 0.08, "grad_norm": 1.876375121858725, "learning_rate": 1.988365444383142e-05, "loss": 0.8553, "step": 470 }, { "epoch": 0.08, "grad_norm": 2.1331167600155845, "learning_rate": 1.988284385049895e-05, "loss": 0.9123, "step": 471 }, { "epoch": 0.08, "grad_norm": 2.1226403216796688, "learning_rate": 1.9882030459848325e-05, "loss": 0.8962, "step": 472 }, { "epoch": 0.08, "grad_norm": 3.114786054477219, "learning_rate": 1.988121427210976e-05, "loss": 0.8371, "step": 473 }, { "epoch": 0.08, "grad_norm": 1.79136192829497, "learning_rate": 1.9880395287514292e-05, "loss": 0.8568, "step": 474 }, { "epoch": 0.08, "grad_norm": 2.6858715267131967, "learning_rate": 1.987957350629372e-05, "loss": 0.9636, "step": 475 }, { "epoch": 0.08, "grad_norm": 2.0715276222516716, "learning_rate": 1.987874892868065e-05, "loss": 0.9725, "step": 476 }, { "epoch": 0.08, "grad_norm": 4.852684894445931, "learning_rate": 1.987792155490848e-05, "loss": 0.9662, "step": 477 }, { "epoch": 0.08, "grad_norm": 2.128423936687846, "learning_rate": 1.98770913852114e-05, "loss": 0.9752, "step": 478 }, { "epoch": 0.08, "grad_norm": 2.2562760496463756, "learning_rate": 1.9876258419824375e-05, "loss": 0.7923, "step": 479 }, { "epoch": 0.08, "grad_norm": 3.7443229009050136, "learning_rate": 1.987542265898319e-05, "loss": 0.8566, "step": 480 }, { "epoch": 0.08, "grad_norm": 2.4747300928997955, "learning_rate": 1.9874584102924394e-05, "loss": 0.9063, "step": 481 }, { "epoch": 0.08, "grad_norm": 2.269497707104427, "learning_rate": 1.987374275188534e-05, "loss": 0.8715, "step": 482 }, { "epoch": 0.08, "grad_norm": 6.433472762586367, "learning_rate": 1.9872898606104175e-05, "loss": 0.9416, "step": 483 }, { "epoch": 0.08, "grad_norm": 2.3896844925165985, "learning_rate": 1.9872051665819828e-05, "loss": 0.9321, "step": 484 }, { "epoch": 0.08, "grad_norm": 2.3585913422580815, "learning_rate": 1.9871201931272027e-05, "loss": 0.9675, "step": 485 }, { "epoch": 0.08, "grad_norm": 2.714412645256349, "learning_rate": 1.987034940270129e-05, "loss": 0.9992, "step": 486 }, { "epoch": 0.08, "grad_norm": 1.8377715097452423, "learning_rate": 1.9869494080348916e-05, "loss": 0.8812, "step": 487 }, { "epoch": 0.08, "grad_norm": 0.9636226359904293, "learning_rate": 1.9868635964457007e-05, "loss": 0.3807, "step": 488 }, { "epoch": 0.08, "grad_norm": 2.004004036908849, "learning_rate": 1.986777505526845e-05, "loss": 0.9249, "step": 489 }, { "epoch": 0.08, "grad_norm": 0.7370318165258586, "learning_rate": 1.986691135302692e-05, "loss": 0.3593, "step": 490 }, { "epoch": 0.08, "grad_norm": 2.465726589993977, "learning_rate": 1.9866044857976897e-05, "loss": 0.8585, "step": 491 }, { "epoch": 0.08, "grad_norm": 2.6919198109178897, "learning_rate": 1.986517557036363e-05, "loss": 0.9044, "step": 492 }, { "epoch": 0.08, "grad_norm": 2.713378038398805, "learning_rate": 1.986430349043317e-05, "loss": 0.9416, "step": 493 }, { "epoch": 0.08, "grad_norm": 2.324323149519284, "learning_rate": 1.9863428618432365e-05, "loss": 0.924, "step": 494 }, { "epoch": 0.08, "grad_norm": 2.0614209723225465, "learning_rate": 1.9862550954608837e-05, "loss": 0.9215, "step": 495 }, { "epoch": 0.08, "grad_norm": 2.0937015189460757, "learning_rate": 1.986167049921101e-05, "loss": 0.9218, "step": 496 }, { "epoch": 0.08, "grad_norm": 2.022183085169856, "learning_rate": 1.9860787252488096e-05, "loss": 0.9018, "step": 497 }, { "epoch": 0.08, "grad_norm": 2.9704411847954852, "learning_rate": 1.9859901214690094e-05, "loss": 0.9374, "step": 498 }, { "epoch": 0.08, "grad_norm": 2.268317131098601, "learning_rate": 1.98590123860678e-05, "loss": 0.9351, "step": 499 }, { "epoch": 0.08, "grad_norm": 2.332317654011405, "learning_rate": 1.9858120766872785e-05, "loss": 0.9334, "step": 500 }, { "epoch": 0.08, "grad_norm": 2.159700285069712, "learning_rate": 1.9857226357357432e-05, "loss": 0.9375, "step": 501 }, { "epoch": 0.08, "grad_norm": 2.4354622714192553, "learning_rate": 1.9856329157774897e-05, "loss": 0.893, "step": 502 }, { "epoch": 0.08, "grad_norm": 2.1673989234524433, "learning_rate": 1.9855429168379127e-05, "loss": 0.9542, "step": 503 }, { "epoch": 0.08, "grad_norm": 2.1141736380919736, "learning_rate": 1.9854526389424867e-05, "loss": 0.8892, "step": 504 }, { "epoch": 0.08, "grad_norm": 2.206684626700607, "learning_rate": 1.985362082116764e-05, "loss": 0.9417, "step": 505 }, { "epoch": 0.08, "grad_norm": 2.2770803312135692, "learning_rate": 1.9852712463863777e-05, "loss": 0.9695, "step": 506 }, { "epoch": 0.08, "grad_norm": 2.5992201913781323, "learning_rate": 1.9851801317770375e-05, "loss": 0.8836, "step": 507 }, { "epoch": 0.08, "grad_norm": 2.7416568649507056, "learning_rate": 1.985088738314534e-05, "loss": 0.9232, "step": 508 }, { "epoch": 0.08, "grad_norm": 2.103386495821501, "learning_rate": 1.9849970660247352e-05, "loss": 0.875, "step": 509 }, { "epoch": 0.08, "grad_norm": 2.7343113281945843, "learning_rate": 1.984905114933589e-05, "loss": 0.907, "step": 510 }, { "epoch": 0.08, "grad_norm": 16.06233680123029, "learning_rate": 1.9848128850671222e-05, "loss": 0.9075, "step": 511 }, { "epoch": 0.08, "grad_norm": 5.965802005920725, "learning_rate": 1.98472037645144e-05, "loss": 0.9031, "step": 512 }, { "epoch": 0.08, "grad_norm": 2.433694483947623, "learning_rate": 1.9846275891127275e-05, "loss": 0.9218, "step": 513 }, { "epoch": 0.08, "grad_norm": 2.478981682262646, "learning_rate": 1.9845345230772467e-05, "loss": 0.4661, "step": 514 }, { "epoch": 0.08, "grad_norm": 2.063143511384203, "learning_rate": 1.984441178371341e-05, "loss": 0.9026, "step": 515 }, { "epoch": 0.08, "grad_norm": 2.1142942226927737, "learning_rate": 1.9843475550214306e-05, "loss": 1.0068, "step": 516 }, { "epoch": 0.08, "grad_norm": 1.9141770887931957, "learning_rate": 1.9842536530540154e-05, "loss": 0.9112, "step": 517 }, { "epoch": 0.09, "grad_norm": 1.9671852297916643, "learning_rate": 1.9841594724956746e-05, "loss": 0.9713, "step": 518 }, { "epoch": 0.09, "grad_norm": 1.7843581868101637, "learning_rate": 1.9840650133730657e-05, "loss": 0.9026, "step": 519 }, { "epoch": 0.09, "grad_norm": 2.0027822699952766, "learning_rate": 1.9839702757129248e-05, "loss": 0.9038, "step": 520 }, { "epoch": 0.09, "grad_norm": 1.839541970144467, "learning_rate": 1.9838752595420674e-05, "loss": 0.8782, "step": 521 }, { "epoch": 0.09, "grad_norm": 1.7366450538170066, "learning_rate": 1.983779964887388e-05, "loss": 0.8545, "step": 522 }, { "epoch": 0.09, "grad_norm": 2.4973287737373133, "learning_rate": 1.9836843917758593e-05, "loss": 0.9099, "step": 523 }, { "epoch": 0.09, "grad_norm": 2.679854801617466, "learning_rate": 1.9835885402345325e-05, "loss": 0.9477, "step": 524 }, { "epoch": 0.09, "grad_norm": 3.505691879206287, "learning_rate": 1.983492410290539e-05, "loss": 0.9918, "step": 525 }, { "epoch": 0.09, "grad_norm": 1.7096518855674994, "learning_rate": 1.9833960019710878e-05, "loss": 0.9356, "step": 526 }, { "epoch": 0.09, "grad_norm": 1.9087251402338197, "learning_rate": 1.983299315303467e-05, "loss": 0.8808, "step": 527 }, { "epoch": 0.09, "grad_norm": 2.2640314638498906, "learning_rate": 1.983202350315044e-05, "loss": 0.9803, "step": 528 }, { "epoch": 0.09, "grad_norm": 4.556907403885395, "learning_rate": 1.9831051070332642e-05, "loss": 0.9541, "step": 529 }, { "epoch": 0.09, "grad_norm": 2.171508107081458, "learning_rate": 1.983007585485652e-05, "loss": 0.9048, "step": 530 }, { "epoch": 0.09, "grad_norm": 2.1827655428635326, "learning_rate": 1.9829097856998105e-05, "loss": 0.9515, "step": 531 }, { "epoch": 0.09, "grad_norm": 1.8182953249079523, "learning_rate": 1.9828117077034225e-05, "loss": 0.8442, "step": 532 }, { "epoch": 0.09, "grad_norm": 2.3760376950889253, "learning_rate": 1.982713351524248e-05, "loss": 0.9427, "step": 533 }, { "epoch": 0.09, "grad_norm": 3.0807255227278065, "learning_rate": 1.982614717190127e-05, "loss": 0.9188, "step": 534 }, { "epoch": 0.09, "grad_norm": 2.8437908200048447, "learning_rate": 1.982515804728977e-05, "loss": 0.9482, "step": 535 }, { "epoch": 0.09, "grad_norm": 13.861552737357353, "learning_rate": 1.982416614168796e-05, "loss": 0.9056, "step": 536 }, { "epoch": 0.09, "grad_norm": 2.2407140707990867, "learning_rate": 1.9823171455376592e-05, "loss": 0.9674, "step": 537 }, { "epoch": 0.09, "grad_norm": 2.6236114734219043, "learning_rate": 1.982217398863721e-05, "loss": 0.9407, "step": 538 }, { "epoch": 0.09, "grad_norm": 2.3668082143381204, "learning_rate": 1.982117374175214e-05, "loss": 0.8971, "step": 539 }, { "epoch": 0.09, "grad_norm": 2.3148233407863685, "learning_rate": 1.982017071500451e-05, "loss": 0.8602, "step": 540 }, { "epoch": 0.09, "grad_norm": 2.0462322341990062, "learning_rate": 1.9819164908678216e-05, "loss": 0.9154, "step": 541 }, { "epoch": 0.09, "grad_norm": 3.9617592723624218, "learning_rate": 1.9818156323057952e-05, "loss": 0.8465, "step": 542 }, { "epoch": 0.09, "grad_norm": 3.7648188148745643, "learning_rate": 1.98171449584292e-05, "loss": 0.8377, "step": 543 }, { "epoch": 0.09, "grad_norm": 3.2375788719192418, "learning_rate": 1.9816130815078216e-05, "loss": 0.9364, "step": 544 }, { "epoch": 0.09, "grad_norm": 2.2613988666164992, "learning_rate": 1.9815113893292058e-05, "loss": 1.0044, "step": 545 }, { "epoch": 0.09, "grad_norm": 2.9258897211585184, "learning_rate": 1.981409419335856e-05, "loss": 0.9555, "step": 546 }, { "epoch": 0.09, "grad_norm": 1.9702780699490952, "learning_rate": 1.981307171556635e-05, "loss": 0.9446, "step": 547 }, { "epoch": 0.09, "grad_norm": 2.079210946129372, "learning_rate": 1.9812046460204837e-05, "loss": 0.9279, "step": 548 }, { "epoch": 0.09, "grad_norm": 2.1780685360233454, "learning_rate": 1.9811018427564213e-05, "loss": 0.8909, "step": 549 }, { "epoch": 0.09, "grad_norm": 3.1581854755150434, "learning_rate": 1.9809987617935468e-05, "loss": 1.0128, "step": 550 }, { "epoch": 0.09, "grad_norm": 1.8005933377555292, "learning_rate": 1.9808954031610362e-05, "loss": 0.8844, "step": 551 }, { "epoch": 0.09, "grad_norm": 0.9100560825013212, "learning_rate": 1.9807917668881455e-05, "loss": 0.3729, "step": 552 }, { "epoch": 0.09, "grad_norm": 4.334654024369413, "learning_rate": 1.9806878530042083e-05, "loss": 0.9273, "step": 553 }, { "epoch": 0.09, "grad_norm": 2.5615957876514264, "learning_rate": 1.9805836615386378e-05, "loss": 0.926, "step": 554 }, { "epoch": 0.09, "grad_norm": 3.5590143181854237, "learning_rate": 1.980479192520925e-05, "loss": 0.861, "step": 555 }, { "epoch": 0.09, "grad_norm": 2.329246711311316, "learning_rate": 1.9803744459806393e-05, "loss": 1.0121, "step": 556 }, { "epoch": 0.09, "grad_norm": 2.197235310228738, "learning_rate": 1.980269421947429e-05, "loss": 0.971, "step": 557 }, { "epoch": 0.09, "grad_norm": 2.3895636577229444, "learning_rate": 1.9801641204510216e-05, "loss": 0.8795, "step": 558 }, { "epoch": 0.09, "grad_norm": 2.244681768951126, "learning_rate": 1.9800585415212214e-05, "loss": 0.9658, "step": 559 }, { "epoch": 0.09, "grad_norm": 1.7468165512318645, "learning_rate": 1.979952685187913e-05, "loss": 0.9211, "step": 560 }, { "epoch": 0.09, "grad_norm": 2.267967231577892, "learning_rate": 1.979846551481059e-05, "loss": 0.9555, "step": 561 }, { "epoch": 0.09, "grad_norm": 2.261652193850287, "learning_rate": 1.9797401404307e-05, "loss": 0.9014, "step": 562 }, { "epoch": 0.09, "grad_norm": 1.7151021983863315, "learning_rate": 1.9796334520669555e-05, "loss": 0.8737, "step": 563 }, { "epoch": 0.09, "grad_norm": 2.218045756102076, "learning_rate": 1.9795264864200233e-05, "loss": 0.8755, "step": 564 }, { "epoch": 0.09, "grad_norm": 2.4375912599605454, "learning_rate": 1.9794192435201797e-05, "loss": 0.9866, "step": 565 }, { "epoch": 0.09, "grad_norm": 2.3155090947105257, "learning_rate": 1.97931172339778e-05, "loss": 0.8523, "step": 566 }, { "epoch": 0.09, "grad_norm": 2.3996316169629237, "learning_rate": 1.979203926083257e-05, "loss": 0.8725, "step": 567 }, { "epoch": 0.09, "grad_norm": 3.1091322735868694, "learning_rate": 1.9790958516071228e-05, "loss": 0.8964, "step": 568 }, { "epoch": 0.09, "grad_norm": 2.1654682988271756, "learning_rate": 1.9789874999999678e-05, "loss": 0.9596, "step": 569 }, { "epoch": 0.09, "grad_norm": 2.2747095818973535, "learning_rate": 1.9788788712924606e-05, "loss": 0.8855, "step": 570 }, { "epoch": 0.09, "grad_norm": 2.154707819357547, "learning_rate": 1.978769965515348e-05, "loss": 0.894, "step": 571 }, { "epoch": 0.09, "grad_norm": 2.501848631525768, "learning_rate": 1.9786607826994557e-05, "loss": 0.9063, "step": 572 }, { "epoch": 0.09, "grad_norm": 2.739591549529581, "learning_rate": 1.978551322875688e-05, "loss": 0.8542, "step": 573 }, { "epoch": 0.09, "grad_norm": 2.220498104665963, "learning_rate": 1.978441586075027e-05, "loss": 0.9091, "step": 574 }, { "epoch": 0.09, "grad_norm": 2.0144762127177587, "learning_rate": 1.978331572328534e-05, "loss": 0.9523, "step": 575 }, { "epoch": 0.09, "grad_norm": 1.5547771237009116, "learning_rate": 1.9782212816673468e-05, "loss": 0.9126, "step": 576 }, { "epoch": 0.09, "grad_norm": 2.58682066141552, "learning_rate": 1.9781107141226845e-05, "loss": 0.8996, "step": 577 }, { "epoch": 0.09, "grad_norm": 0.7668102559768695, "learning_rate": 1.977999869725842e-05, "loss": 0.3553, "step": 578 }, { "epoch": 0.1, "grad_norm": 0.7507520191691047, "learning_rate": 1.977888748508194e-05, "loss": 0.3518, "step": 579 }, { "epoch": 0.1, "grad_norm": 2.082469834887363, "learning_rate": 1.977777350501193e-05, "loss": 0.9712, "step": 580 }, { "epoch": 0.1, "grad_norm": 3.080043390547281, "learning_rate": 1.97766567573637e-05, "loss": 0.8527, "step": 581 }, { "epoch": 0.1, "grad_norm": 0.6780038653364069, "learning_rate": 1.9775537242453347e-05, "loss": 0.344, "step": 582 }, { "epoch": 0.1, "grad_norm": 2.5422024115831734, "learning_rate": 1.977441496059774e-05, "loss": 0.9291, "step": 583 }, { "epoch": 0.1, "grad_norm": 2.38949138987306, "learning_rate": 1.9773289912114543e-05, "loss": 0.8904, "step": 584 }, { "epoch": 0.1, "grad_norm": 2.4172690021055026, "learning_rate": 1.9772162097322195e-05, "loss": 0.9366, "step": 585 }, { "epoch": 0.1, "grad_norm": 1.802903844188179, "learning_rate": 1.9771031516539928e-05, "loss": 0.9202, "step": 586 }, { "epoch": 0.1, "grad_norm": 2.654676987844412, "learning_rate": 1.9769898170087743e-05, "loss": 0.9656, "step": 587 }, { "epoch": 0.1, "grad_norm": 2.1755508938538086, "learning_rate": 1.9768762058286433e-05, "loss": 0.8583, "step": 588 }, { "epoch": 0.1, "grad_norm": 2.147050710212126, "learning_rate": 1.976762318145758e-05, "loss": 0.938, "step": 589 }, { "epoch": 0.1, "grad_norm": 2.186901851558186, "learning_rate": 1.9766481539923533e-05, "loss": 0.9505, "step": 590 }, { "epoch": 0.1, "grad_norm": 2.094617600226059, "learning_rate": 1.9765337134007432e-05, "loss": 0.9478, "step": 591 }, { "epoch": 0.1, "grad_norm": 2.5759821478735123, "learning_rate": 1.9764189964033198e-05, "loss": 0.91, "step": 592 }, { "epoch": 0.1, "grad_norm": 3.735313678299557, "learning_rate": 1.976304003032554e-05, "loss": 0.9334, "step": 593 }, { "epoch": 0.1, "grad_norm": 2.0900563877862486, "learning_rate": 1.976188733320994e-05, "loss": 0.8909, "step": 594 }, { "epoch": 0.1, "grad_norm": 4.025045319359035, "learning_rate": 1.9760731873012668e-05, "loss": 0.8745, "step": 595 }, { "epoch": 0.1, "grad_norm": 2.2809299025489156, "learning_rate": 1.9759573650060774e-05, "loss": 0.9053, "step": 596 }, { "epoch": 0.1, "grad_norm": 2.316763658728453, "learning_rate": 1.9758412664682088e-05, "loss": 0.8768, "step": 597 }, { "epoch": 0.1, "grad_norm": 2.1675681147812176, "learning_rate": 1.9757248917205228e-05, "loss": 0.9152, "step": 598 }, { "epoch": 0.1, "grad_norm": 1.9983504513881212, "learning_rate": 1.975608240795959e-05, "loss": 0.8547, "step": 599 }, { "epoch": 0.1, "grad_norm": 2.0250560481456756, "learning_rate": 1.9754913137275355e-05, "loss": 0.9234, "step": 600 }, { "epoch": 0.1, "grad_norm": 1.9864954096086396, "learning_rate": 1.9753741105483475e-05, "loss": 0.8658, "step": 601 }, { "epoch": 0.1, "grad_norm": 2.226606204979962, "learning_rate": 1.9752566312915697e-05, "loss": 0.9011, "step": 602 }, { "epoch": 0.1, "grad_norm": 2.600561301533232, "learning_rate": 1.975138875990454e-05, "loss": 0.9992, "step": 603 }, { "epoch": 0.1, "grad_norm": 2.256272640560826, "learning_rate": 1.975020844678331e-05, "loss": 0.9043, "step": 604 }, { "epoch": 0.1, "grad_norm": 3.619638017734431, "learning_rate": 1.974902537388609e-05, "loss": 0.8718, "step": 605 }, { "epoch": 0.1, "grad_norm": 1.9623238010294584, "learning_rate": 1.9747839541547754e-05, "loss": 0.8985, "step": 606 }, { "epoch": 0.1, "grad_norm": 3.2402357841343035, "learning_rate": 1.974665095010394e-05, "loss": 0.9417, "step": 607 }, { "epoch": 0.1, "grad_norm": 3.2429791411587643, "learning_rate": 1.974545959989108e-05, "loss": 0.8963, "step": 608 }, { "epoch": 0.1, "grad_norm": 2.092056711683265, "learning_rate": 1.974426549124638e-05, "loss": 0.8399, "step": 609 }, { "epoch": 0.1, "grad_norm": 1.9994587284677359, "learning_rate": 1.9743068624507837e-05, "loss": 0.9214, "step": 610 }, { "epoch": 0.1, "grad_norm": 1.9815387296705698, "learning_rate": 1.9741869000014217e-05, "loss": 0.9654, "step": 611 }, { "epoch": 0.1, "grad_norm": 2.110711738265122, "learning_rate": 1.9740666618105067e-05, "loss": 0.8851, "step": 612 }, { "epoch": 0.1, "grad_norm": 2.269098622689513, "learning_rate": 1.9739461479120727e-05, "loss": 0.8661, "step": 613 }, { "epoch": 0.1, "grad_norm": 2.114718071886836, "learning_rate": 1.9738253583402306e-05, "loss": 0.966, "step": 614 }, { "epoch": 0.1, "grad_norm": 2.1894880178859935, "learning_rate": 1.973704293129169e-05, "loss": 0.8586, "step": 615 }, { "epoch": 0.1, "grad_norm": 2.41299126880057, "learning_rate": 1.9735829523131564e-05, "loss": 0.8678, "step": 616 }, { "epoch": 0.1, "grad_norm": 2.325797832932958, "learning_rate": 1.9734613359265373e-05, "loss": 0.9549, "step": 617 }, { "epoch": 0.1, "grad_norm": 2.131517151528351, "learning_rate": 1.973339444003735e-05, "loss": 0.9202, "step": 618 }, { "epoch": 0.1, "grad_norm": 3.027258479159963, "learning_rate": 1.9732172765792507e-05, "loss": 0.8891, "step": 619 }, { "epoch": 0.1, "grad_norm": 2.845447028735344, "learning_rate": 1.9730948336876637e-05, "loss": 0.9824, "step": 620 }, { "epoch": 0.1, "grad_norm": 2.6478227171108757, "learning_rate": 1.9729721153636312e-05, "loss": 0.894, "step": 621 }, { "epoch": 0.1, "grad_norm": 2.892577625412371, "learning_rate": 1.9728491216418884e-05, "loss": 0.9694, "step": 622 }, { "epoch": 0.1, "grad_norm": 1.7422346611292776, "learning_rate": 1.9727258525572487e-05, "loss": 1.0037, "step": 623 }, { "epoch": 0.1, "grad_norm": 2.471205647691986, "learning_rate": 1.9726023081446026e-05, "loss": 0.8782, "step": 624 }, { "epoch": 0.1, "grad_norm": 3.2572664435929126, "learning_rate": 1.9724784884389195e-05, "loss": 0.8756, "step": 625 }, { "epoch": 0.1, "grad_norm": 0.9422648928443017, "learning_rate": 1.972354393475246e-05, "loss": 0.3921, "step": 626 }, { "epoch": 0.1, "grad_norm": 2.1519812535450025, "learning_rate": 1.9722300232887073e-05, "loss": 0.9745, "step": 627 }, { "epoch": 0.1, "grad_norm": 2.177992386822593, "learning_rate": 1.9721053779145057e-05, "loss": 0.8974, "step": 628 }, { "epoch": 0.1, "grad_norm": 2.5939675377281475, "learning_rate": 1.9719804573879223e-05, "loss": 0.8375, "step": 629 }, { "epoch": 0.1, "grad_norm": 1.809026516043162, "learning_rate": 1.971855261744315e-05, "loss": 0.9083, "step": 630 }, { "epoch": 0.1, "grad_norm": 2.007267288180993, "learning_rate": 1.9717297910191204e-05, "loss": 0.9553, "step": 631 }, { "epoch": 0.1, "grad_norm": 2.1899477958514657, "learning_rate": 1.9716040452478527e-05, "loss": 0.9126, "step": 632 }, { "epoch": 0.1, "grad_norm": 1.8936479098045322, "learning_rate": 1.9714780244661044e-05, "loss": 0.9226, "step": 633 }, { "epoch": 0.1, "grad_norm": 2.1182116302236973, "learning_rate": 1.971351728709545e-05, "loss": 0.881, "step": 634 }, { "epoch": 0.1, "grad_norm": 2.1468213857906866, "learning_rate": 1.9712251580139225e-05, "loss": 0.9134, "step": 635 }, { "epoch": 0.1, "grad_norm": 1.963981264086489, "learning_rate": 1.971098312415062e-05, "loss": 0.9399, "step": 636 }, { "epoch": 0.1, "grad_norm": 2.6672988506607784, "learning_rate": 1.9709711919488673e-05, "loss": 0.9054, "step": 637 }, { "epoch": 0.1, "grad_norm": 3.6762717839728163, "learning_rate": 1.9708437966513196e-05, "loss": 0.9014, "step": 638 }, { "epoch": 0.1, "grad_norm": 1.9722447436411168, "learning_rate": 1.9707161265584775e-05, "loss": 0.8657, "step": 639 }, { "epoch": 0.11, "grad_norm": 1.817077174047744, "learning_rate": 1.970588181706478e-05, "loss": 0.8728, "step": 640 }, { "epoch": 0.11, "grad_norm": 1.9542510630501138, "learning_rate": 1.970459962131536e-05, "loss": 0.9122, "step": 641 }, { "epoch": 0.11, "grad_norm": 2.0959832611417117, "learning_rate": 1.9703314678699426e-05, "loss": 0.9845, "step": 642 }, { "epoch": 0.11, "grad_norm": 2.13518203377058, "learning_rate": 1.9702026989580694e-05, "loss": 0.8515, "step": 643 }, { "epoch": 0.11, "grad_norm": 2.0812930329728405, "learning_rate": 1.970073655432363e-05, "loss": 0.8721, "step": 644 }, { "epoch": 0.11, "grad_norm": 1.8681933665623998, "learning_rate": 1.9699443373293496e-05, "loss": 0.9296, "step": 645 }, { "epoch": 0.11, "grad_norm": 1.7868260430820138, "learning_rate": 1.9698147446856316e-05, "loss": 0.8881, "step": 646 }, { "epoch": 0.11, "grad_norm": 4.469714269412271, "learning_rate": 1.969684877537891e-05, "loss": 0.9007, "step": 647 }, { "epoch": 0.11, "grad_norm": 1.9948030891340944, "learning_rate": 1.969554735922885e-05, "loss": 0.972, "step": 648 }, { "epoch": 0.11, "grad_norm": 1.9626827961818247, "learning_rate": 1.9694243198774516e-05, "loss": 0.9013, "step": 649 }, { "epoch": 0.11, "grad_norm": 2.5062146407051973, "learning_rate": 1.9692936294385038e-05, "loss": 0.9746, "step": 650 }, { "epoch": 0.11, "grad_norm": 2.306495146172734, "learning_rate": 1.969162664643033e-05, "loss": 0.8834, "step": 651 }, { "epoch": 0.11, "grad_norm": 2.0142080841729535, "learning_rate": 1.9690314255281092e-05, "loss": 0.8818, "step": 652 }, { "epoch": 0.11, "grad_norm": 1.7831689586438613, "learning_rate": 1.968899912130879e-05, "loss": 0.9243, "step": 653 }, { "epoch": 0.11, "grad_norm": 1.9269057231122926, "learning_rate": 1.968768124488567e-05, "loss": 0.9194, "step": 654 }, { "epoch": 0.11, "grad_norm": 3.189588264571461, "learning_rate": 1.9686360626384756e-05, "loss": 0.89, "step": 655 }, { "epoch": 0.11, "grad_norm": 1.8581884444825842, "learning_rate": 1.9685037266179846e-05, "loss": 0.9, "step": 656 }, { "epoch": 0.11, "grad_norm": 3.419826036586947, "learning_rate": 1.9683711164645508e-05, "loss": 1.0097, "step": 657 }, { "epoch": 0.11, "grad_norm": 2.200381675753864, "learning_rate": 1.9682382322157103e-05, "loss": 0.954, "step": 658 }, { "epoch": 0.11, "grad_norm": 2.058763183515874, "learning_rate": 1.968105073909075e-05, "loss": 0.9401, "step": 659 }, { "epoch": 0.11, "grad_norm": 2.7334581818041386, "learning_rate": 1.9679716415823352e-05, "loss": 0.9702, "step": 660 }, { "epoch": 0.11, "grad_norm": 2.0312162772875513, "learning_rate": 1.9678379352732587e-05, "loss": 0.9585, "step": 661 }, { "epoch": 0.11, "grad_norm": 2.124596592544637, "learning_rate": 1.967703955019691e-05, "loss": 0.9415, "step": 662 }, { "epoch": 0.11, "grad_norm": 2.124458777195437, "learning_rate": 1.9675697008595545e-05, "loss": 0.9403, "step": 663 }, { "epoch": 0.11, "grad_norm": 2.170347113829269, "learning_rate": 1.9674351728308498e-05, "loss": 0.8778, "step": 664 }, { "epoch": 0.11, "grad_norm": 0.8725290543299655, "learning_rate": 1.9673003709716548e-05, "loss": 0.3618, "step": 665 }, { "epoch": 0.11, "grad_norm": 2.0483350819875104, "learning_rate": 1.9671652953201245e-05, "loss": 0.8111, "step": 666 }, { "epoch": 0.11, "grad_norm": 0.6996840328629057, "learning_rate": 1.9670299459144923e-05, "loss": 0.3793, "step": 667 }, { "epoch": 0.11, "grad_norm": 2.0351501915788757, "learning_rate": 1.9668943227930686e-05, "loss": 0.9621, "step": 668 }, { "epoch": 0.11, "grad_norm": 3.618007529548743, "learning_rate": 1.9667584259942408e-05, "loss": 0.9375, "step": 669 }, { "epoch": 0.11, "grad_norm": 1.988692210889578, "learning_rate": 1.9666222555564744e-05, "loss": 0.8963, "step": 670 }, { "epoch": 0.11, "grad_norm": 2.097124492593114, "learning_rate": 1.9664858115183122e-05, "loss": 0.879, "step": 671 }, { "epoch": 0.11, "grad_norm": 2.1536916933847086, "learning_rate": 1.9663490939183744e-05, "loss": 0.8627, "step": 672 }, { "epoch": 0.11, "grad_norm": 1.9446547637755316, "learning_rate": 1.966212102795358e-05, "loss": 0.8443, "step": 673 }, { "epoch": 0.11, "grad_norm": 2.876973046954697, "learning_rate": 1.9660748381880394e-05, "loss": 0.9183, "step": 674 }, { "epoch": 0.11, "grad_norm": 0.8742884638452032, "learning_rate": 1.96593730013527e-05, "loss": 0.377, "step": 675 }, { "epoch": 0.11, "grad_norm": 1.9488757303836157, "learning_rate": 1.96579948867598e-05, "loss": 0.8948, "step": 676 }, { "epoch": 0.11, "grad_norm": 2.0104818738205, "learning_rate": 1.9656614038491765e-05, "loss": 0.9444, "step": 677 }, { "epoch": 0.11, "grad_norm": 0.7151682370202912, "learning_rate": 1.965523045693944e-05, "loss": 0.3948, "step": 678 }, { "epoch": 0.11, "grad_norm": 2.2289205543446515, "learning_rate": 1.965384414249445e-05, "loss": 0.9167, "step": 679 }, { "epoch": 0.11, "grad_norm": 1.7248304335501317, "learning_rate": 1.9652455095549188e-05, "loss": 0.9595, "step": 680 }, { "epoch": 0.11, "grad_norm": 2.7206627268080528, "learning_rate": 1.9651063316496813e-05, "loss": 0.8872, "step": 681 }, { "epoch": 0.11, "grad_norm": 2.003810605793115, "learning_rate": 1.9649668805731274e-05, "loss": 0.9261, "step": 682 }, { "epoch": 0.11, "grad_norm": 1.8735186383056535, "learning_rate": 1.964827156364728e-05, "loss": 0.9912, "step": 683 }, { "epoch": 0.11, "grad_norm": 1.8719561018249928, "learning_rate": 1.9646871590640317e-05, "loss": 0.9543, "step": 684 }, { "epoch": 0.11, "grad_norm": 1.996517229928255, "learning_rate": 1.9645468887106645e-05, "loss": 0.9137, "step": 685 }, { "epoch": 0.11, "grad_norm": 0.7160355666435414, "learning_rate": 1.9644063453443296e-05, "loss": 0.3622, "step": 686 }, { "epoch": 0.11, "grad_norm": 2.513411403573518, "learning_rate": 1.9642655290048077e-05, "loss": 0.7794, "step": 687 }, { "epoch": 0.11, "grad_norm": 1.9677789827628898, "learning_rate": 1.964124439731957e-05, "loss": 0.9032, "step": 688 }, { "epoch": 0.11, "grad_norm": 1.8503414387149546, "learning_rate": 1.9639830775657113e-05, "loss": 0.9094, "step": 689 }, { "epoch": 0.11, "grad_norm": 2.325285717209014, "learning_rate": 1.9638414425460834e-05, "loss": 0.9156, "step": 690 }, { "epoch": 0.11, "grad_norm": 1.834317863780255, "learning_rate": 1.9636995347131634e-05, "loss": 0.91, "step": 691 }, { "epoch": 0.11, "grad_norm": 2.161554414168365, "learning_rate": 1.9635573541071174e-05, "loss": 0.8774, "step": 692 }, { "epoch": 0.11, "grad_norm": 6.973655975763507, "learning_rate": 1.9634149007681894e-05, "loss": 0.9129, "step": 693 }, { "epoch": 0.11, "grad_norm": 2.0104947012923033, "learning_rate": 1.9632721747367005e-05, "loss": 0.8704, "step": 694 }, { "epoch": 0.11, "grad_norm": 3.1234597705637217, "learning_rate": 1.9631291760530492e-05, "loss": 0.9748, "step": 695 }, { "epoch": 0.11, "grad_norm": 2.2136188553388396, "learning_rate": 1.962985904757711e-05, "loss": 0.8822, "step": 696 }, { "epoch": 0.11, "grad_norm": 4.3756367054272465, "learning_rate": 1.962842360891238e-05, "loss": 0.9808, "step": 697 }, { "epoch": 0.11, "grad_norm": 2.098951792322454, "learning_rate": 1.962698544494261e-05, "loss": 0.9381, "step": 698 }, { "epoch": 0.11, "grad_norm": 1.8742674048736345, "learning_rate": 1.9625544556074857e-05, "loss": 0.9067, "step": 699 }, { "epoch": 0.11, "grad_norm": 1.7870440200906108, "learning_rate": 1.962410094271697e-05, "loss": 0.9272, "step": 700 }, { "epoch": 0.12, "grad_norm": 1.4639926657715723, "learning_rate": 1.962265460527756e-05, "loss": 0.9332, "step": 701 }, { "epoch": 0.12, "grad_norm": 1.785703119067691, "learning_rate": 1.962120554416601e-05, "loss": 0.9211, "step": 702 }, { "epoch": 0.12, "grad_norm": 1.749029195105529, "learning_rate": 1.9619753759792466e-05, "loss": 0.8817, "step": 703 }, { "epoch": 0.12, "grad_norm": 2.2231807827765926, "learning_rate": 1.9618299252567863e-05, "loss": 0.8859, "step": 704 }, { "epoch": 0.12, "grad_norm": 1.9831653532054745, "learning_rate": 1.961684202290389e-05, "loss": 0.9105, "step": 705 }, { "epoch": 0.12, "grad_norm": 2.346541688242642, "learning_rate": 1.9615382071213017e-05, "loss": 0.9261, "step": 706 }, { "epoch": 0.12, "grad_norm": 3.5223193020499277, "learning_rate": 1.9613919397908473e-05, "loss": 0.8237, "step": 707 }, { "epoch": 0.12, "grad_norm": 2.095695556219128, "learning_rate": 1.961245400340427e-05, "loss": 0.9304, "step": 708 }, { "epoch": 0.12, "grad_norm": 1.8428878844099028, "learning_rate": 1.961098588811519e-05, "loss": 0.8869, "step": 709 }, { "epoch": 0.12, "grad_norm": 2.2135108653540003, "learning_rate": 1.9609515052456768e-05, "loss": 0.859, "step": 710 }, { "epoch": 0.12, "grad_norm": 2.0959097276573453, "learning_rate": 1.960804149684533e-05, "loss": 0.9386, "step": 711 }, { "epoch": 0.12, "grad_norm": 10.212285998683617, "learning_rate": 1.960656522169796e-05, "loss": 0.9754, "step": 712 }, { "epoch": 0.12, "grad_norm": 2.0839066529316947, "learning_rate": 1.9605086227432512e-05, "loss": 0.9117, "step": 713 }, { "epoch": 0.12, "grad_norm": 4.293851474986266, "learning_rate": 1.9603604514467616e-05, "loss": 0.8069, "step": 714 }, { "epoch": 0.12, "grad_norm": 3.5589677817056815, "learning_rate": 1.9602120083222665e-05, "loss": 0.8959, "step": 715 }, { "epoch": 0.12, "grad_norm": 1.8879947948990934, "learning_rate": 1.960063293411783e-05, "loss": 0.9238, "step": 716 }, { "epoch": 0.12, "grad_norm": 2.1056179533422816, "learning_rate": 1.9599143067574037e-05, "loss": 0.908, "step": 717 }, { "epoch": 0.12, "grad_norm": 1.9845072605012397, "learning_rate": 1.9597650484012997e-05, "loss": 0.8539, "step": 718 }, { "epoch": 0.12, "grad_norm": 2.124662572596028, "learning_rate": 1.9596155183857176e-05, "loss": 0.8632, "step": 719 }, { "epoch": 0.12, "grad_norm": 2.07176691088613, "learning_rate": 1.9594657167529818e-05, "loss": 0.9465, "step": 720 }, { "epoch": 0.12, "grad_norm": 2.1023130756537016, "learning_rate": 1.9593156435454936e-05, "loss": 0.8481, "step": 721 }, { "epoch": 0.12, "grad_norm": 1.7284565814867467, "learning_rate": 1.9591652988057305e-05, "loss": 0.8889, "step": 722 }, { "epoch": 0.12, "grad_norm": 1.7179295290158891, "learning_rate": 1.9590146825762476e-05, "loss": 0.924, "step": 723 }, { "epoch": 0.12, "grad_norm": 2.010177601826913, "learning_rate": 1.9588637948996766e-05, "loss": 0.8735, "step": 724 }, { "epoch": 0.12, "grad_norm": 1.8736472474445849, "learning_rate": 1.9587126358187257e-05, "loss": 0.866, "step": 725 }, { "epoch": 0.12, "grad_norm": 1.9831970881255607, "learning_rate": 1.9585612053761798e-05, "loss": 0.8411, "step": 726 }, { "epoch": 0.12, "grad_norm": 2.128016791224626, "learning_rate": 1.9584095036149016e-05, "loss": 0.914, "step": 727 }, { "epoch": 0.12, "grad_norm": 3.5319932756674426, "learning_rate": 1.9582575305778297e-05, "loss": 0.9529, "step": 728 }, { "epoch": 0.12, "grad_norm": 2.0853345567635033, "learning_rate": 1.95810528630798e-05, "loss": 0.8577, "step": 729 }, { "epoch": 0.12, "grad_norm": 2.48092254333158, "learning_rate": 1.9579527708484444e-05, "loss": 0.9031, "step": 730 }, { "epoch": 0.12, "grad_norm": 2.7577533407558157, "learning_rate": 1.957799984242392e-05, "loss": 0.9174, "step": 731 }, { "epoch": 0.12, "grad_norm": 2.1384766850899504, "learning_rate": 1.95764692653307e-05, "loss": 0.8878, "step": 732 }, { "epoch": 0.12, "grad_norm": 1.7986085645882948, "learning_rate": 1.9574935977637994e-05, "loss": 0.9247, "step": 733 }, { "epoch": 0.12, "grad_norm": 1.8193614423938456, "learning_rate": 1.9573399979779807e-05, "loss": 0.9365, "step": 734 }, { "epoch": 0.12, "grad_norm": 2.0825089202600657, "learning_rate": 1.9571861272190898e-05, "loss": 0.9389, "step": 735 }, { "epoch": 0.12, "grad_norm": 1.8998827669915546, "learning_rate": 1.957031985530679e-05, "loss": 0.847, "step": 736 }, { "epoch": 0.12, "grad_norm": 2.2944631122214636, "learning_rate": 1.9568775729563782e-05, "loss": 0.8849, "step": 737 }, { "epoch": 0.12, "grad_norm": 0.7784663822082215, "learning_rate": 1.9567228895398936e-05, "loss": 0.4102, "step": 738 }, { "epoch": 0.12, "grad_norm": 2.108038436097284, "learning_rate": 1.9565679353250077e-05, "loss": 0.9042, "step": 739 }, { "epoch": 0.12, "grad_norm": 2.2463741061952764, "learning_rate": 1.95641271035558e-05, "loss": 0.8325, "step": 740 }, { "epoch": 0.12, "grad_norm": 1.87432624704806, "learning_rate": 1.9562572146755473e-05, "loss": 0.9414, "step": 741 }, { "epoch": 0.12, "grad_norm": 2.5765769014658013, "learning_rate": 1.956101448328921e-05, "loss": 0.9118, "step": 742 }, { "epoch": 0.12, "grad_norm": 2.2751898906031403, "learning_rate": 1.955945411359792e-05, "loss": 0.8449, "step": 743 }, { "epoch": 0.12, "grad_norm": 2.491359942486933, "learning_rate": 1.955789103812325e-05, "loss": 0.8624, "step": 744 }, { "epoch": 0.12, "grad_norm": 3.244216112449589, "learning_rate": 1.955632525730763e-05, "loss": 0.8979, "step": 745 }, { "epoch": 0.12, "grad_norm": 1.9700921816268484, "learning_rate": 1.955475677159425e-05, "loss": 0.8959, "step": 746 }, { "epoch": 0.12, "grad_norm": 2.1428358972789754, "learning_rate": 1.955318558142707e-05, "loss": 0.8904, "step": 747 }, { "epoch": 0.12, "grad_norm": 2.2776195737750156, "learning_rate": 1.9551611687250808e-05, "loss": 0.9925, "step": 748 }, { "epoch": 0.12, "grad_norm": 1.924224487328018, "learning_rate": 1.9550035089510952e-05, "loss": 0.9884, "step": 749 }, { "epoch": 0.12, "grad_norm": 2.0976805495196893, "learning_rate": 1.9548455788653754e-05, "loss": 0.9028, "step": 750 }, { "epoch": 0.12, "grad_norm": 2.385269596652196, "learning_rate": 1.9546873785126237e-05, "loss": 0.9885, "step": 751 }, { "epoch": 0.12, "grad_norm": 2.0359615982187083, "learning_rate": 1.954528907937618e-05, "loss": 0.887, "step": 752 }, { "epoch": 0.12, "grad_norm": 0.8215077954812855, "learning_rate": 1.9543701671852127e-05, "loss": 0.3678, "step": 753 }, { "epoch": 0.12, "grad_norm": 1.7826009484659584, "learning_rate": 1.9542111563003393e-05, "loss": 0.8288, "step": 754 }, { "epoch": 0.12, "grad_norm": 1.8030617387776484, "learning_rate": 1.954051875328006e-05, "loss": 0.9025, "step": 755 }, { "epoch": 0.12, "grad_norm": 2.8746011603081656, "learning_rate": 1.9538923243132967e-05, "loss": 0.9084, "step": 756 }, { "epoch": 0.12, "grad_norm": 2.186943332845059, "learning_rate": 1.9537325033013714e-05, "loss": 0.8891, "step": 757 }, { "epoch": 0.12, "grad_norm": 2.816351201143875, "learning_rate": 1.9535724123374674e-05, "loss": 0.9186, "step": 758 }, { "epoch": 0.12, "grad_norm": 0.7669086807450612, "learning_rate": 1.9534120514668987e-05, "loss": 0.3719, "step": 759 }, { "epoch": 0.12, "grad_norm": 2.5367048448792695, "learning_rate": 1.9532514207350543e-05, "loss": 0.9262, "step": 760 }, { "epoch": 0.12, "grad_norm": 2.3391386776789487, "learning_rate": 1.953090520187401e-05, "loss": 0.9151, "step": 761 }, { "epoch": 0.13, "grad_norm": 1.8695865577930504, "learning_rate": 1.952929349869481e-05, "loss": 0.8813, "step": 762 }, { "epoch": 0.13, "grad_norm": 1.9883569364242604, "learning_rate": 1.952767909826913e-05, "loss": 0.8955, "step": 763 }, { "epoch": 0.13, "grad_norm": 2.4202000188445396, "learning_rate": 1.9526062001053928e-05, "loss": 0.8582, "step": 764 }, { "epoch": 0.13, "grad_norm": 2.306454402860363, "learning_rate": 1.9524442207506915e-05, "loss": 0.8649, "step": 765 }, { "epoch": 0.13, "grad_norm": 2.0244253641864542, "learning_rate": 1.9522819718086578e-05, "loss": 0.9453, "step": 766 }, { "epoch": 0.13, "grad_norm": 1.7593879855845402, "learning_rate": 1.952119453325215e-05, "loss": 0.999, "step": 767 }, { "epoch": 0.13, "grad_norm": 2.0014908977448487, "learning_rate": 1.951956665346364e-05, "loss": 0.9432, "step": 768 }, { "epoch": 0.13, "grad_norm": 0.7782231935833783, "learning_rate": 1.951793607918182e-05, "loss": 0.3422, "step": 769 }, { "epoch": 0.13, "grad_norm": 0.6926122844357109, "learning_rate": 1.9516302810868212e-05, "loss": 0.3771, "step": 770 }, { "epoch": 0.13, "grad_norm": 2.4593768693596627, "learning_rate": 1.9514666848985116e-05, "loss": 0.913, "step": 771 }, { "epoch": 0.13, "grad_norm": 2.371002384025261, "learning_rate": 1.9513028193995588e-05, "loss": 0.9117, "step": 772 }, { "epoch": 0.13, "grad_norm": 2.791556832515129, "learning_rate": 1.951138684636344e-05, "loss": 0.9319, "step": 773 }, { "epoch": 0.13, "grad_norm": 2.27036256389709, "learning_rate": 1.950974280655326e-05, "loss": 0.7804, "step": 774 }, { "epoch": 0.13, "grad_norm": 2.0383322455439807, "learning_rate": 1.9508096075030378e-05, "loss": 0.8931, "step": 775 }, { "epoch": 0.13, "grad_norm": 2.5220343109250987, "learning_rate": 1.950644665226091e-05, "loss": 0.8975, "step": 776 }, { "epoch": 0.13, "grad_norm": 2.923563409176333, "learning_rate": 1.9504794538711715e-05, "loss": 0.9265, "step": 777 }, { "epoch": 0.13, "grad_norm": 2.289541687237284, "learning_rate": 1.9503139734850426e-05, "loss": 0.7807, "step": 778 }, { "epoch": 0.13, "grad_norm": 2.7984123502229408, "learning_rate": 1.9501482241145422e-05, "loss": 0.8845, "step": 779 }, { "epoch": 0.13, "grad_norm": 2.130114255159154, "learning_rate": 1.9499822058065863e-05, "loss": 0.839, "step": 780 }, { "epoch": 0.13, "grad_norm": 1.9333972158010722, "learning_rate": 1.9498159186081656e-05, "loss": 0.9102, "step": 781 }, { "epoch": 0.13, "grad_norm": 2.0496706468964834, "learning_rate": 1.949649362566347e-05, "loss": 0.9695, "step": 782 }, { "epoch": 0.13, "grad_norm": 1.9844144184688945, "learning_rate": 1.9494825377282746e-05, "loss": 0.9308, "step": 783 }, { "epoch": 0.13, "grad_norm": 2.099441384754326, "learning_rate": 1.9493154441411673e-05, "loss": 0.8738, "step": 784 }, { "epoch": 0.13, "grad_norm": 3.0859653649575653, "learning_rate": 1.949148081852321e-05, "loss": 0.9397, "step": 785 }, { "epoch": 0.13, "grad_norm": 2.0079354795755857, "learning_rate": 1.9489804509091066e-05, "loss": 0.8902, "step": 786 }, { "epoch": 0.13, "grad_norm": 2.4893212553341573, "learning_rate": 1.9488125513589722e-05, "loss": 0.8931, "step": 787 }, { "epoch": 0.13, "grad_norm": 1.8643422486402434, "learning_rate": 1.9486443832494414e-05, "loss": 0.9667, "step": 788 }, { "epoch": 0.13, "grad_norm": 1.968633436881877, "learning_rate": 1.9484759466281132e-05, "loss": 0.8766, "step": 789 }, { "epoch": 0.13, "grad_norm": 1.8154336031202623, "learning_rate": 1.948307241542664e-05, "loss": 0.9552, "step": 790 }, { "epoch": 0.13, "grad_norm": 1.7928208178877165, "learning_rate": 1.9481382680408455e-05, "loss": 0.879, "step": 791 }, { "epoch": 0.13, "grad_norm": 2.353981990725052, "learning_rate": 1.9479690261704848e-05, "loss": 0.8263, "step": 792 }, { "epoch": 0.13, "grad_norm": 2.2406139005630346, "learning_rate": 1.9477995159794854e-05, "loss": 0.8741, "step": 793 }, { "epoch": 0.13, "grad_norm": 1.8985464126756437, "learning_rate": 1.947629737515827e-05, "loss": 0.9169, "step": 794 }, { "epoch": 0.13, "grad_norm": 2.0437193711226747, "learning_rate": 1.947459690827565e-05, "loss": 0.9168, "step": 795 }, { "epoch": 0.13, "grad_norm": 1.784457225589206, "learning_rate": 1.9472893759628307e-05, "loss": 0.8241, "step": 796 }, { "epoch": 0.13, "grad_norm": 2.0606365447949933, "learning_rate": 1.9471187929698317e-05, "loss": 0.9383, "step": 797 }, { "epoch": 0.13, "grad_norm": 2.0620149508467134, "learning_rate": 1.9469479418968506e-05, "loss": 0.8079, "step": 798 }, { "epoch": 0.13, "grad_norm": 3.080678379594775, "learning_rate": 1.946776822792247e-05, "loss": 0.8946, "step": 799 }, { "epoch": 0.13, "grad_norm": 1.9914967325452804, "learning_rate": 1.9466054357044558e-05, "loss": 0.7988, "step": 800 }, { "epoch": 0.13, "grad_norm": 1.9877585276789362, "learning_rate": 1.9464337806819872e-05, "loss": 0.9453, "step": 801 }, { "epoch": 0.13, "grad_norm": 1.8409728918906183, "learning_rate": 1.946261857773428e-05, "loss": 0.955, "step": 802 }, { "epoch": 0.13, "grad_norm": 3.024558430749502, "learning_rate": 1.9460896670274408e-05, "loss": 0.9775, "step": 803 }, { "epoch": 0.13, "grad_norm": 1.885984056611599, "learning_rate": 1.9459172084927638e-05, "loss": 0.9797, "step": 804 }, { "epoch": 0.13, "grad_norm": 1.9561794643062147, "learning_rate": 1.945744482218211e-05, "loss": 0.8751, "step": 805 }, { "epoch": 0.13, "grad_norm": 2.252531720060081, "learning_rate": 1.945571488252672e-05, "loss": 0.8992, "step": 806 }, { "epoch": 0.13, "grad_norm": 1.20714589618587, "learning_rate": 1.945398226645113e-05, "loss": 0.4022, "step": 807 }, { "epoch": 0.13, "grad_norm": 2.808353086503005, "learning_rate": 1.9452246974445743e-05, "loss": 0.8529, "step": 808 }, { "epoch": 0.13, "grad_norm": 4.409329519039731, "learning_rate": 1.9450509007001738e-05, "loss": 0.8915, "step": 809 }, { "epoch": 0.13, "grad_norm": 2.246691205375467, "learning_rate": 1.9448768364611043e-05, "loss": 0.8705, "step": 810 }, { "epoch": 0.13, "grad_norm": 1.851002181468717, "learning_rate": 1.944702504776634e-05, "loss": 0.8393, "step": 811 }, { "epoch": 0.13, "grad_norm": 2.5129295025319647, "learning_rate": 1.944527905696107e-05, "loss": 0.914, "step": 812 }, { "epoch": 0.13, "grad_norm": 1.4554968761303677, "learning_rate": 1.9443530392689434e-05, "loss": 0.8652, "step": 813 }, { "epoch": 0.13, "grad_norm": 1.7546182571563387, "learning_rate": 1.9441779055446387e-05, "loss": 0.8776, "step": 814 }, { "epoch": 0.13, "grad_norm": 2.0747116869239064, "learning_rate": 1.9440025045727645e-05, "loss": 0.9032, "step": 815 }, { "epoch": 0.13, "grad_norm": 1.9592593487282657, "learning_rate": 1.9438268364029674e-05, "loss": 0.9136, "step": 816 }, { "epoch": 0.13, "grad_norm": 1.7764979591317263, "learning_rate": 1.9436509010849696e-05, "loss": 0.909, "step": 817 }, { "epoch": 0.13, "grad_norm": 2.403594212083483, "learning_rate": 1.94347469866857e-05, "loss": 0.9368, "step": 818 }, { "epoch": 0.13, "grad_norm": 4.991110215838935, "learning_rate": 1.9432982292036414e-05, "loss": 0.9404, "step": 819 }, { "epoch": 0.13, "grad_norm": 1.7035866750120383, "learning_rate": 1.9431214927401337e-05, "loss": 0.9876, "step": 820 }, { "epoch": 0.13, "grad_norm": 2.0915788470564407, "learning_rate": 1.9429444893280717e-05, "loss": 0.8894, "step": 821 }, { "epoch": 0.14, "grad_norm": 1.853002571488389, "learning_rate": 1.9427672190175557e-05, "loss": 0.9128, "step": 822 }, { "epoch": 0.14, "grad_norm": 1.9559860952762267, "learning_rate": 1.9425896818587615e-05, "loss": 0.8879, "step": 823 }, { "epoch": 0.14, "grad_norm": 2.3811972211144985, "learning_rate": 1.9424118779019415e-05, "loss": 0.8133, "step": 824 }, { "epoch": 0.14, "grad_norm": 2.585187446713117, "learning_rate": 1.9422338071974215e-05, "loss": 0.7693, "step": 825 }, { "epoch": 0.14, "grad_norm": 1.9366648500575083, "learning_rate": 1.9420554697956052e-05, "loss": 0.7988, "step": 826 }, { "epoch": 0.14, "grad_norm": 1.8392370824975066, "learning_rate": 1.9418768657469695e-05, "loss": 0.8165, "step": 827 }, { "epoch": 0.14, "grad_norm": 2.024864747447034, "learning_rate": 1.941697995102069e-05, "loss": 0.9042, "step": 828 }, { "epoch": 0.14, "grad_norm": 1.014556197473069, "learning_rate": 1.9415188579115315e-05, "loss": 0.3576, "step": 829 }, { "epoch": 0.14, "grad_norm": 2.2807326510099326, "learning_rate": 1.941339454226063e-05, "loss": 0.8701, "step": 830 }, { "epoch": 0.14, "grad_norm": 1.9583702201528428, "learning_rate": 1.9411597840964414e-05, "loss": 0.8727, "step": 831 }, { "epoch": 0.14, "grad_norm": 2.067334655998541, "learning_rate": 1.9409798475735234e-05, "loss": 0.8819, "step": 832 }, { "epoch": 0.14, "grad_norm": 1.7955776547677538, "learning_rate": 1.9407996447082394e-05, "loss": 0.9215, "step": 833 }, { "epoch": 0.14, "grad_norm": 1.6786481147679095, "learning_rate": 1.940619175551595e-05, "loss": 0.7997, "step": 834 }, { "epoch": 0.14, "grad_norm": 2.523672117796751, "learning_rate": 1.940438440154672e-05, "loss": 0.8874, "step": 835 }, { "epoch": 0.14, "grad_norm": 3.042193089675607, "learning_rate": 1.940257438568627e-05, "loss": 0.9601, "step": 836 }, { "epoch": 0.14, "grad_norm": 2.951993609781124, "learning_rate": 1.9400761708446918e-05, "loss": 0.9077, "step": 837 }, { "epoch": 0.14, "grad_norm": 2.431159546293204, "learning_rate": 1.939894637034174e-05, "loss": 0.969, "step": 838 }, { "epoch": 0.14, "grad_norm": 1.7154300160196638, "learning_rate": 1.9397128371884575e-05, "loss": 0.9005, "step": 839 }, { "epoch": 0.14, "grad_norm": 1.9613041762722885, "learning_rate": 1.9395307713589987e-05, "loss": 0.8522, "step": 840 }, { "epoch": 0.14, "grad_norm": 1.8163763317507673, "learning_rate": 1.939348439597332e-05, "loss": 0.8607, "step": 841 }, { "epoch": 0.14, "grad_norm": 6.398484876788083, "learning_rate": 1.9391658419550653e-05, "loss": 0.8363, "step": 842 }, { "epoch": 0.14, "grad_norm": 2.2134706369416173, "learning_rate": 1.9389829784838833e-05, "loss": 0.879, "step": 843 }, { "epoch": 0.14, "grad_norm": 1.9980768459799594, "learning_rate": 1.9387998492355444e-05, "loss": 0.9221, "step": 844 }, { "epoch": 0.14, "grad_norm": 1.9398690809871117, "learning_rate": 1.9386164542618836e-05, "loss": 0.8335, "step": 845 }, { "epoch": 0.14, "grad_norm": 2.097223462724194, "learning_rate": 1.9384327936148095e-05, "loss": 0.9002, "step": 846 }, { "epoch": 0.14, "grad_norm": 1.897412836314832, "learning_rate": 1.938248867346308e-05, "loss": 0.8569, "step": 847 }, { "epoch": 0.14, "grad_norm": 1.919976426358109, "learning_rate": 1.938064675508438e-05, "loss": 0.881, "step": 848 }, { "epoch": 0.14, "grad_norm": 2.197732059382964, "learning_rate": 1.9378802181533354e-05, "loss": 0.9985, "step": 849 }, { "epoch": 0.14, "grad_norm": 2.0425958420645958, "learning_rate": 1.9376954953332104e-05, "loss": 0.9065, "step": 850 }, { "epoch": 0.14, "grad_norm": 2.0435527287681636, "learning_rate": 1.9375105071003476e-05, "loss": 0.904, "step": 851 }, { "epoch": 0.14, "grad_norm": 2.017987524235337, "learning_rate": 1.9373252535071087e-05, "loss": 0.7804, "step": 852 }, { "epoch": 0.14, "grad_norm": 2.8306206707506236, "learning_rate": 1.9371397346059286e-05, "loss": 0.8762, "step": 853 }, { "epoch": 0.14, "grad_norm": 1.8396290014082255, "learning_rate": 1.936953950449318e-05, "loss": 0.9294, "step": 854 }, { "epoch": 0.14, "grad_norm": 1.9369015305154027, "learning_rate": 1.936767901089863e-05, "loss": 0.9184, "step": 855 }, { "epoch": 0.14, "grad_norm": 2.419477395243132, "learning_rate": 1.9365815865802243e-05, "loss": 0.8154, "step": 856 }, { "epoch": 0.14, "grad_norm": 1.6931632100281608, "learning_rate": 1.936395006973138e-05, "loss": 0.8845, "step": 857 }, { "epoch": 0.14, "grad_norm": 2.2700502985787807, "learning_rate": 1.936208162321415e-05, "loss": 0.9248, "step": 858 }, { "epoch": 0.14, "grad_norm": 2.454374448371036, "learning_rate": 1.9360210526779414e-05, "loss": 0.801, "step": 859 }, { "epoch": 0.14, "grad_norm": 1.8157367195905694, "learning_rate": 1.9358336780956777e-05, "loss": 0.8802, "step": 860 }, { "epoch": 0.14, "grad_norm": 2.583212570481326, "learning_rate": 1.9356460386276606e-05, "loss": 0.8858, "step": 861 }, { "epoch": 0.14, "grad_norm": 1.8658228823161875, "learning_rate": 1.9354581343270006e-05, "loss": 0.8762, "step": 862 }, { "epoch": 0.14, "grad_norm": 1.9879826703714638, "learning_rate": 1.9352699652468835e-05, "loss": 0.8899, "step": 863 }, { "epoch": 0.14, "grad_norm": 2.3671067817555316, "learning_rate": 1.9350815314405703e-05, "loss": 0.9856, "step": 864 }, { "epoch": 0.14, "grad_norm": 1.8889858059166953, "learning_rate": 1.934892832961397e-05, "loss": 0.923, "step": 865 }, { "epoch": 0.14, "grad_norm": 2.7105890949965334, "learning_rate": 1.9347038698627744e-05, "loss": 0.7951, "step": 866 }, { "epoch": 0.14, "grad_norm": 1.9841046289972593, "learning_rate": 1.9345146421981878e-05, "loss": 0.9181, "step": 867 }, { "epoch": 0.14, "grad_norm": 10.943380869001455, "learning_rate": 1.9343251500211977e-05, "loss": 0.8633, "step": 868 }, { "epoch": 0.14, "grad_norm": 2.3689700523922554, "learning_rate": 1.9341353933854396e-05, "loss": 0.8482, "step": 869 }, { "epoch": 0.14, "grad_norm": 2.5291402643337078, "learning_rate": 1.9339453723446234e-05, "loss": 0.8308, "step": 870 }, { "epoch": 0.14, "grad_norm": 2.6819160240783475, "learning_rate": 1.9337550869525344e-05, "loss": 0.8353, "step": 871 }, { "epoch": 0.14, "grad_norm": 1.8675735241351838, "learning_rate": 1.933564537263033e-05, "loss": 0.8838, "step": 872 }, { "epoch": 0.14, "grad_norm": 1.8229213772319368, "learning_rate": 1.933373723330053e-05, "loss": 0.9091, "step": 873 }, { "epoch": 0.14, "grad_norm": 1.8789534835442898, "learning_rate": 1.9331826452076044e-05, "loss": 0.9466, "step": 874 }, { "epoch": 0.14, "grad_norm": 2.2898252675273807, "learning_rate": 1.932991302949771e-05, "loss": 0.9055, "step": 875 }, { "epoch": 0.14, "grad_norm": 2.256273766906671, "learning_rate": 1.9327996966107122e-05, "loss": 0.879, "step": 876 }, { "epoch": 0.14, "grad_norm": 2.069022256397358, "learning_rate": 1.932607826244662e-05, "loss": 0.9341, "step": 877 }, { "epoch": 0.14, "grad_norm": 1.1380017056093472, "learning_rate": 1.9324156919059286e-05, "loss": 0.3916, "step": 878 }, { "epoch": 0.14, "grad_norm": 2.0248366874487513, "learning_rate": 1.932223293648895e-05, "loss": 0.9492, "step": 879 }, { "epoch": 0.14, "grad_norm": 2.1659061723784543, "learning_rate": 1.9320306315280196e-05, "loss": 0.8904, "step": 880 }, { "epoch": 0.14, "grad_norm": 2.2143959773342106, "learning_rate": 1.9318377055978342e-05, "loss": 0.7988, "step": 881 }, { "epoch": 0.14, "grad_norm": 2.1571080028374556, "learning_rate": 1.9316445159129474e-05, "loss": 0.9138, "step": 882 }, { "epoch": 0.15, "grad_norm": 2.497935767784172, "learning_rate": 1.93145106252804e-05, "loss": 0.8342, "step": 883 }, { "epoch": 0.15, "grad_norm": 1.9839142767465534, "learning_rate": 1.931257345497869e-05, "loss": 0.827, "step": 884 }, { "epoch": 0.15, "grad_norm": 1.671613864713911, "learning_rate": 1.9310633648772656e-05, "loss": 0.9538, "step": 885 }, { "epoch": 0.15, "grad_norm": 2.177757300828749, "learning_rate": 1.9308691207211353e-05, "loss": 0.9156, "step": 886 }, { "epoch": 0.15, "grad_norm": 1.8475341054791692, "learning_rate": 1.9306746130844593e-05, "loss": 0.902, "step": 887 }, { "epoch": 0.15, "grad_norm": 2.6525476201552403, "learning_rate": 1.9304798420222918e-05, "loss": 0.9297, "step": 888 }, { "epoch": 0.15, "grad_norm": 4.204629939641918, "learning_rate": 1.9302848075897624e-05, "loss": 0.8875, "step": 889 }, { "epoch": 0.15, "grad_norm": 1.7896183065556028, "learning_rate": 1.9300895098420753e-05, "loss": 0.8867, "step": 890 }, { "epoch": 0.15, "grad_norm": 1.8844074716562338, "learning_rate": 1.929893948834509e-05, "loss": 0.946, "step": 891 }, { "epoch": 0.15, "grad_norm": 2.123829321249946, "learning_rate": 1.9296981246224173e-05, "loss": 0.8405, "step": 892 }, { "epoch": 0.15, "grad_norm": 2.6032964836760533, "learning_rate": 1.9295020372612276e-05, "loss": 0.9483, "step": 893 }, { "epoch": 0.15, "grad_norm": 2.3278371492368963, "learning_rate": 1.929305686806441e-05, "loss": 0.8856, "step": 894 }, { "epoch": 0.15, "grad_norm": 1.5998581012578597, "learning_rate": 1.9291090733136352e-05, "loss": 0.9066, "step": 895 }, { "epoch": 0.15, "grad_norm": 1.8856524103435555, "learning_rate": 1.928912196838461e-05, "loss": 0.9384, "step": 896 }, { "epoch": 0.15, "grad_norm": 2.7172539062043755, "learning_rate": 1.9287150574366432e-05, "loss": 0.8089, "step": 897 }, { "epoch": 0.15, "grad_norm": 2.294428226122152, "learning_rate": 1.9285176551639826e-05, "loss": 0.9186, "step": 898 }, { "epoch": 0.15, "grad_norm": 2.3618703419066147, "learning_rate": 1.9283199900763533e-05, "loss": 0.8948, "step": 899 }, { "epoch": 0.15, "grad_norm": 1.6051964857573786, "learning_rate": 1.9281220622297033e-05, "loss": 0.9495, "step": 900 }, { "epoch": 0.15, "grad_norm": 2.3915854313916447, "learning_rate": 1.927923871680057e-05, "loss": 0.878, "step": 901 }, { "epoch": 0.15, "grad_norm": 2.375581356264163, "learning_rate": 1.9277254184835105e-05, "loss": 0.8208, "step": 902 }, { "epoch": 0.15, "grad_norm": 2.1633730189581732, "learning_rate": 1.9275267026962358e-05, "loss": 0.8414, "step": 903 }, { "epoch": 0.15, "grad_norm": 1.9828438195157023, "learning_rate": 1.9273277243744797e-05, "loss": 0.8665, "step": 904 }, { "epoch": 0.15, "grad_norm": 2.158328336236659, "learning_rate": 1.927128483574562e-05, "loss": 0.8749, "step": 905 }, { "epoch": 0.15, "grad_norm": 2.22443222797056, "learning_rate": 1.9269289803528775e-05, "loss": 0.9536, "step": 906 }, { "epoch": 0.15, "grad_norm": 1.9000990960974151, "learning_rate": 1.926729214765895e-05, "loss": 0.8154, "step": 907 }, { "epoch": 0.15, "grad_norm": 2.776581392897414, "learning_rate": 1.9265291868701584e-05, "loss": 0.9376, "step": 908 }, { "epoch": 0.15, "grad_norm": 3.697242421961515, "learning_rate": 1.9263288967222843e-05, "loss": 0.8889, "step": 909 }, { "epoch": 0.15, "grad_norm": 2.409292460342415, "learning_rate": 1.926128344378965e-05, "loss": 0.8734, "step": 910 }, { "epoch": 0.15, "grad_norm": 1.6965670325436537, "learning_rate": 1.9259275298969663e-05, "loss": 0.8973, "step": 911 }, { "epoch": 0.15, "grad_norm": 2.309843587049936, "learning_rate": 1.925726453333128e-05, "loss": 0.8801, "step": 912 }, { "epoch": 0.15, "grad_norm": 2.2706552680208048, "learning_rate": 1.9255251147443646e-05, "loss": 0.8783, "step": 913 }, { "epoch": 0.15, "grad_norm": 1.2031535397031454, "learning_rate": 1.9253235141876646e-05, "loss": 0.4128, "step": 914 }, { "epoch": 0.15, "grad_norm": 1.7611375580144872, "learning_rate": 1.925121651720091e-05, "loss": 0.8955, "step": 915 }, { "epoch": 0.15, "grad_norm": 2.3360586511514376, "learning_rate": 1.92491952739878e-05, "loss": 0.8782, "step": 916 }, { "epoch": 0.15, "grad_norm": 4.690873633796915, "learning_rate": 1.9247171412809423e-05, "loss": 0.9037, "step": 917 }, { "epoch": 0.15, "grad_norm": 2.081041705512327, "learning_rate": 1.924514493423864e-05, "loss": 0.9022, "step": 918 }, { "epoch": 0.15, "grad_norm": 1.7587202863320657, "learning_rate": 1.9243115838849023e-05, "loss": 0.9262, "step": 919 }, { "epoch": 0.15, "grad_norm": 1.7481882669925435, "learning_rate": 1.924108412721492e-05, "loss": 0.9335, "step": 920 }, { "epoch": 0.15, "grad_norm": 1.8530314318713175, "learning_rate": 1.9239049799911397e-05, "loss": 0.9105, "step": 921 }, { "epoch": 0.15, "grad_norm": 1.9672134314713143, "learning_rate": 1.923701285751426e-05, "loss": 0.9229, "step": 922 }, { "epoch": 0.15, "grad_norm": 0.6924044515506668, "learning_rate": 1.9234973300600074e-05, "loss": 0.3956, "step": 923 }, { "epoch": 0.15, "grad_norm": 1.704243465974315, "learning_rate": 1.9232931129746116e-05, "loss": 0.8826, "step": 924 }, { "epoch": 0.15, "grad_norm": 1.8458110196181354, "learning_rate": 1.9230886345530432e-05, "loss": 0.9589, "step": 925 }, { "epoch": 0.15, "grad_norm": 2.2138857146138413, "learning_rate": 1.9228838948531786e-05, "loss": 0.9284, "step": 926 }, { "epoch": 0.15, "grad_norm": 1.8900986000312447, "learning_rate": 1.9226788939329693e-05, "loss": 0.8661, "step": 927 }, { "epoch": 0.15, "grad_norm": 1.860690745047075, "learning_rate": 1.92247363185044e-05, "loss": 0.8099, "step": 928 }, { "epoch": 0.15, "grad_norm": 1.6530764198825834, "learning_rate": 1.92226810866369e-05, "loss": 0.7988, "step": 929 }, { "epoch": 0.15, "grad_norm": 0.702807026753439, "learning_rate": 1.922062324430892e-05, "loss": 0.3678, "step": 930 }, { "epoch": 0.15, "grad_norm": 1.6487983649066429, "learning_rate": 1.921856279210293e-05, "loss": 0.9257, "step": 931 }, { "epoch": 0.15, "grad_norm": 1.7396808833556598, "learning_rate": 1.9216499730602135e-05, "loss": 0.9248, "step": 932 }, { "epoch": 0.15, "grad_norm": 1.789261213804897, "learning_rate": 1.9214434060390484e-05, "loss": 0.9537, "step": 933 }, { "epoch": 0.15, "grad_norm": 1.946722914478503, "learning_rate": 1.9212365782052656e-05, "loss": 0.8867, "step": 934 }, { "epoch": 0.15, "grad_norm": 1.8374386574372588, "learning_rate": 1.9210294896174074e-05, "loss": 0.8584, "step": 935 }, { "epoch": 0.15, "grad_norm": 1.9156406531433596, "learning_rate": 1.9208221403340895e-05, "loss": 0.7935, "step": 936 }, { "epoch": 0.15, "grad_norm": 2.7266632405594358, "learning_rate": 1.9206145304140026e-05, "loss": 0.8981, "step": 937 }, { "epoch": 0.15, "grad_norm": 2.824732592641354, "learning_rate": 1.9204066599159094e-05, "loss": 0.8587, "step": 938 }, { "epoch": 0.15, "grad_norm": 1.8238999565290233, "learning_rate": 1.920198528898648e-05, "loss": 0.9459, "step": 939 }, { "epoch": 0.15, "grad_norm": 2.0661024325777078, "learning_rate": 1.919990137421128e-05, "loss": 0.8505, "step": 940 }, { "epoch": 0.15, "grad_norm": 1.5889329592864285, "learning_rate": 1.9197814855423357e-05, "loss": 0.8978, "step": 941 }, { "epoch": 0.15, "grad_norm": 0.7436571286957462, "learning_rate": 1.919572573321329e-05, "loss": 0.3589, "step": 942 }, { "epoch": 0.15, "grad_norm": 1.7663878661302295, "learning_rate": 1.9193634008172396e-05, "loss": 0.8663, "step": 943 }, { "epoch": 0.16, "grad_norm": 2.20931215479221, "learning_rate": 1.9191539680892738e-05, "loss": 0.8632, "step": 944 }, { "epoch": 0.16, "grad_norm": 0.7192005628265207, "learning_rate": 1.9189442751967117e-05, "loss": 0.3608, "step": 945 }, { "epoch": 0.16, "grad_norm": 1.7256619657399015, "learning_rate": 1.9187343221989052e-05, "loss": 0.8731, "step": 946 }, { "epoch": 0.16, "grad_norm": 2.771741440921574, "learning_rate": 1.918524109155282e-05, "loss": 0.8529, "step": 947 }, { "epoch": 0.16, "grad_norm": 0.6939880601123742, "learning_rate": 1.9183136361253417e-05, "loss": 0.3619, "step": 948 }, { "epoch": 0.16, "grad_norm": 3.187661250135478, "learning_rate": 1.918102903168659e-05, "loss": 0.9589, "step": 949 }, { "epoch": 0.16, "grad_norm": 1.884013353368715, "learning_rate": 1.9178919103448807e-05, "loss": 0.9103, "step": 950 }, { "epoch": 0.16, "grad_norm": 2.4142704676812783, "learning_rate": 1.9176806577137285e-05, "loss": 0.8516, "step": 951 }, { "epoch": 0.16, "grad_norm": 1.8002874155632687, "learning_rate": 1.9174691453349967e-05, "loss": 0.9643, "step": 952 }, { "epoch": 0.16, "grad_norm": 4.1642158110343885, "learning_rate": 1.917257373268554e-05, "loss": 0.8805, "step": 953 }, { "epoch": 0.16, "grad_norm": 1.9022069067561929, "learning_rate": 1.917045341574341e-05, "loss": 0.8918, "step": 954 }, { "epoch": 0.16, "grad_norm": 1.8188220193704252, "learning_rate": 1.916833050312373e-05, "loss": 0.9284, "step": 955 }, { "epoch": 0.16, "grad_norm": 1.9019830551074963, "learning_rate": 1.9166204995427398e-05, "loss": 0.8518, "step": 956 }, { "epoch": 0.16, "grad_norm": 0.7305015541057748, "learning_rate": 1.916407689325602e-05, "loss": 0.3552, "step": 957 }, { "epoch": 0.16, "grad_norm": 1.7081268391925977, "learning_rate": 1.916194619721196e-05, "loss": 0.8131, "step": 958 }, { "epoch": 0.16, "grad_norm": 3.1119840328950152, "learning_rate": 1.9159812907898304e-05, "loss": 0.946, "step": 959 }, { "epoch": 0.16, "grad_norm": 4.97100860133114, "learning_rate": 1.915767702591887e-05, "loss": 0.8576, "step": 960 }, { "epoch": 0.16, "grad_norm": 0.7300812963450235, "learning_rate": 1.9155538551878225e-05, "loss": 0.3672, "step": 961 }, { "epoch": 0.16, "grad_norm": 1.716324158498738, "learning_rate": 1.9153397486381657e-05, "loss": 0.9006, "step": 962 }, { "epoch": 0.16, "grad_norm": 2.1537609681625463, "learning_rate": 1.915125383003518e-05, "loss": 0.9193, "step": 963 }, { "epoch": 0.16, "grad_norm": 1.9501693087499625, "learning_rate": 1.9149107583445566e-05, "loss": 0.9094, "step": 964 }, { "epoch": 0.16, "grad_norm": 1.6558556228044932, "learning_rate": 1.9146958747220292e-05, "loss": 0.907, "step": 965 }, { "epoch": 0.16, "grad_norm": 2.849706457563362, "learning_rate": 1.9144807321967594e-05, "loss": 0.8477, "step": 966 }, { "epoch": 0.16, "grad_norm": 1.6911392331896895, "learning_rate": 1.914265330829642e-05, "loss": 0.8191, "step": 967 }, { "epoch": 0.16, "grad_norm": 10.057391282352857, "learning_rate": 1.914049670681646e-05, "loss": 0.8376, "step": 968 }, { "epoch": 0.16, "grad_norm": 2.6687980372291635, "learning_rate": 1.913833751813814e-05, "loss": 0.9369, "step": 969 }, { "epoch": 0.16, "grad_norm": 2.389992916630159, "learning_rate": 1.9136175742872608e-05, "loss": 0.9172, "step": 970 }, { "epoch": 0.16, "grad_norm": 1.8342823210599621, "learning_rate": 1.9134011381631755e-05, "loss": 0.8391, "step": 971 }, { "epoch": 0.16, "grad_norm": 1.9320891082389742, "learning_rate": 1.9131844435028196e-05, "loss": 0.8855, "step": 972 }, { "epoch": 0.16, "grad_norm": 1.9960295401493364, "learning_rate": 1.912967490367528e-05, "loss": 0.8613, "step": 973 }, { "epoch": 0.16, "grad_norm": 1.8199333830682882, "learning_rate": 1.912750278818709e-05, "loss": 0.8461, "step": 974 }, { "epoch": 0.16, "grad_norm": 1.858334930498051, "learning_rate": 1.9125328089178442e-05, "loss": 0.7858, "step": 975 }, { "epoch": 0.16, "grad_norm": 2.44974128901433, "learning_rate": 1.9123150807264872e-05, "loss": 0.9006, "step": 976 }, { "epoch": 0.16, "grad_norm": 0.7725108293453664, "learning_rate": 1.912097094306266e-05, "loss": 0.3542, "step": 977 }, { "epoch": 0.16, "grad_norm": 1.9055930889162225, "learning_rate": 1.9118788497188815e-05, "loss": 0.9132, "step": 978 }, { "epoch": 0.16, "grad_norm": 2.0094888571640976, "learning_rate": 1.9116603470261065e-05, "loss": 0.87, "step": 979 }, { "epoch": 0.16, "grad_norm": 2.0193234971829352, "learning_rate": 1.9114415862897883e-05, "loss": 0.9329, "step": 980 }, { "epoch": 0.16, "grad_norm": 1.690432813556107, "learning_rate": 1.911222567571847e-05, "loss": 0.9255, "step": 981 }, { "epoch": 0.16, "grad_norm": 2.1407394715563006, "learning_rate": 1.911003290934275e-05, "loss": 0.8953, "step": 982 }, { "epoch": 0.16, "grad_norm": 2.8685414341556523, "learning_rate": 1.9107837564391376e-05, "loss": 0.8823, "step": 983 }, { "epoch": 0.16, "grad_norm": 2.1137535564257344, "learning_rate": 1.910563964148574e-05, "loss": 0.8879, "step": 984 }, { "epoch": 0.16, "grad_norm": 2.5860411671558645, "learning_rate": 1.9103439141247966e-05, "loss": 0.8474, "step": 985 }, { "epoch": 0.16, "grad_norm": 1.84963681515135, "learning_rate": 1.9101236064300895e-05, "loss": 0.8205, "step": 986 }, { "epoch": 0.16, "grad_norm": 2.1012662172462786, "learning_rate": 1.90990304112681e-05, "loss": 0.8943, "step": 987 }, { "epoch": 0.16, "grad_norm": 2.830462565638336, "learning_rate": 1.9096822182773887e-05, "loss": 0.9178, "step": 988 }, { "epoch": 0.16, "grad_norm": 2.2274993403394485, "learning_rate": 1.9094611379443298e-05, "loss": 0.8373, "step": 989 }, { "epoch": 0.16, "grad_norm": 2.637843233681166, "learning_rate": 1.9092398001902092e-05, "loss": 0.8402, "step": 990 }, { "epoch": 0.16, "grad_norm": 1.7697575656959608, "learning_rate": 1.909018205077676e-05, "loss": 0.9199, "step": 991 }, { "epoch": 0.16, "grad_norm": 2.829186169301964, "learning_rate": 1.908796352669452e-05, "loss": 0.811, "step": 992 }, { "epoch": 0.16, "grad_norm": 1.5031925265018387, "learning_rate": 1.9085742430283322e-05, "loss": 0.9009, "step": 993 }, { "epoch": 0.16, "grad_norm": 1.7898722437584502, "learning_rate": 1.9083518762171847e-05, "loss": 0.8941, "step": 994 }, { "epoch": 0.16, "grad_norm": 1.7086410520579036, "learning_rate": 1.9081292522989493e-05, "loss": 0.8897, "step": 995 }, { "epoch": 0.16, "grad_norm": 4.800757190598681, "learning_rate": 1.90790637133664e-05, "loss": 0.7996, "step": 996 }, { "epoch": 0.16, "grad_norm": 2.0641149885151124, "learning_rate": 1.9076832333933423e-05, "loss": 0.8958, "step": 997 }, { "epoch": 0.16, "grad_norm": 4.0018422019879765, "learning_rate": 1.907459838532215e-05, "loss": 0.8821, "step": 998 }, { "epoch": 0.16, "grad_norm": 2.103365754593794, "learning_rate": 1.9072361868164892e-05, "loss": 0.7898, "step": 999 }, { "epoch": 0.16, "grad_norm": 2.276825417396754, "learning_rate": 1.9070122783094695e-05, "loss": 0.9172, "step": 1000 }, { "epoch": 0.16440831074977416, "grad_norm": 2.0907457506305054, "learning_rate": 1.9067881130745325e-05, "loss": 0.774, "step": 1001 }, { "epoch": 0.16457255481645724, "grad_norm": 2.533701406648404, "learning_rate": 1.906563691175128e-05, "loss": 0.932, "step": 1002 }, { "epoch": 0.16473679888314036, "grad_norm": 4.156417270145069, "learning_rate": 1.9063390126747778e-05, "loss": 0.8609, "step": 1003 }, { "epoch": 0.16490104294982344, "grad_norm": 0.8086379396946741, "learning_rate": 1.906114077637077e-05, "loss": 0.3206, "step": 1004 }, { "epoch": 0.16506528701650652, "grad_norm": 1.878695671620334, "learning_rate": 1.905888886125693e-05, "loss": 0.9631, "step": 1005 }, { "epoch": 0.1652295310831896, "grad_norm": 1.627532733454919, "learning_rate": 1.9056634382043653e-05, "loss": 0.8647, "step": 1006 }, { "epoch": 0.16539377514987272, "grad_norm": 0.6677123838818803, "learning_rate": 1.905437733936907e-05, "loss": 0.3381, "step": 1007 }, { "epoch": 0.1655580192165558, "grad_norm": 1.6483628399014232, "learning_rate": 1.9052117733872025e-05, "loss": 0.8635, "step": 1008 }, { "epoch": 0.16572226328323889, "grad_norm": 1.5466878367720402, "learning_rate": 1.9049855566192105e-05, "loss": 0.8171, "step": 1009 }, { "epoch": 0.165886507349922, "grad_norm": 2.5272975987622424, "learning_rate": 1.9047590836969603e-05, "loss": 0.8103, "step": 1010 }, { "epoch": 0.16605075141660508, "grad_norm": 1.8317504760476353, "learning_rate": 1.904532354684555e-05, "loss": 0.8357, "step": 1011 }, { "epoch": 0.16621499548328816, "grad_norm": 2.270705161318652, "learning_rate": 1.9043053696461696e-05, "loss": 0.866, "step": 1012 }, { "epoch": 0.16637923954997125, "grad_norm": 1.6741082630747026, "learning_rate": 1.904078128646052e-05, "loss": 0.7756, "step": 1013 }, { "epoch": 0.16654348361665436, "grad_norm": 1.7751144254280218, "learning_rate": 1.903850631748522e-05, "loss": 0.8279, "step": 1014 }, { "epoch": 0.16670772768333744, "grad_norm": 2.6014210543404466, "learning_rate": 1.9036228790179722e-05, "loss": 0.772, "step": 1015 }, { "epoch": 0.16687197175002053, "grad_norm": 2.6786546976578864, "learning_rate": 1.9033948705188673e-05, "loss": 0.8614, "step": 1016 }, { "epoch": 0.1670362158167036, "grad_norm": 2.21697024624804, "learning_rate": 1.9031666063157453e-05, "loss": 0.8056, "step": 1017 }, { "epoch": 0.16720045988338672, "grad_norm": 6.755999718736654, "learning_rate": 1.902938086473215e-05, "loss": 0.8451, "step": 1018 }, { "epoch": 0.1673647039500698, "grad_norm": 1.6371589592946414, "learning_rate": 1.9027093110559596e-05, "loss": 0.8546, "step": 1019 }, { "epoch": 0.1675289480167529, "grad_norm": 2.095761523209275, "learning_rate": 1.9024802801287327e-05, "loss": 0.8547, "step": 1020 }, { "epoch": 0.167693192083436, "grad_norm": 1.9235419509228973, "learning_rate": 1.9022509937563606e-05, "loss": 0.8337, "step": 1021 }, { "epoch": 0.16785743615011908, "grad_norm": 1.9866819555492476, "learning_rate": 1.9020214520037433e-05, "loss": 0.8226, "step": 1022 }, { "epoch": 0.16802168021680217, "grad_norm": 1.9417650339595613, "learning_rate": 1.901791654935852e-05, "loss": 0.8256, "step": 1023 }, { "epoch": 0.16818592428348525, "grad_norm": 2.038396599689906, "learning_rate": 1.9015616026177294e-05, "loss": 0.8611, "step": 1024 }, { "epoch": 0.16835016835016836, "grad_norm": 2.1887349091601473, "learning_rate": 1.901331295114492e-05, "loss": 0.7789, "step": 1025 }, { "epoch": 0.16851441241685144, "grad_norm": 2.560164803051263, "learning_rate": 1.9011007324913277e-05, "loss": 0.8869, "step": 1026 }, { "epoch": 0.16867865648353453, "grad_norm": 2.1258833507585493, "learning_rate": 1.9008699148134967e-05, "loss": 0.8323, "step": 1027 }, { "epoch": 0.1688429005502176, "grad_norm": 2.896718152344922, "learning_rate": 1.9006388421463322e-05, "loss": 0.8205, "step": 1028 }, { "epoch": 0.16900714461690072, "grad_norm": 1.6245224845616544, "learning_rate": 1.9004075145552378e-05, "loss": 0.8705, "step": 1029 }, { "epoch": 0.1691713886835838, "grad_norm": 1.9093078408565625, "learning_rate": 1.9001759321056905e-05, "loss": 0.8781, "step": 1030 }, { "epoch": 0.1693356327502669, "grad_norm": 1.9504723809545812, "learning_rate": 1.8999440948632397e-05, "loss": 0.8655, "step": 1031 }, { "epoch": 0.16949987681695, "grad_norm": 2.0189523204760986, "learning_rate": 1.899712002893506e-05, "loss": 0.8575, "step": 1032 }, { "epoch": 0.16966412088363308, "grad_norm": 1.8376092206554402, "learning_rate": 1.899479656262183e-05, "loss": 0.8596, "step": 1033 }, { "epoch": 0.16982836495031617, "grad_norm": 3.3568267885320227, "learning_rate": 1.8992470550350356e-05, "loss": 0.8292, "step": 1034 }, { "epoch": 0.16999260901699925, "grad_norm": 2.0746168228114885, "learning_rate": 1.8990141992779008e-05, "loss": 0.8362, "step": 1035 }, { "epoch": 0.17015685308368236, "grad_norm": 2.268106924180323, "learning_rate": 1.8987810890566885e-05, "loss": 0.8457, "step": 1036 }, { "epoch": 0.17032109715036545, "grad_norm": 1.7682461277834394, "learning_rate": 1.8985477244373796e-05, "loss": 0.848, "step": 1037 }, { "epoch": 0.17048534121704853, "grad_norm": 2.5332829978503426, "learning_rate": 1.898314105486028e-05, "loss": 0.8078, "step": 1038 }, { "epoch": 0.17064958528373161, "grad_norm": 1.8196059735595878, "learning_rate": 1.8980802322687584e-05, "loss": 0.7316, "step": 1039 }, { "epoch": 0.17081382935041473, "grad_norm": 1.5790336688876925, "learning_rate": 1.8978461048517686e-05, "loss": 0.7942, "step": 1040 }, { "epoch": 0.1709780734170978, "grad_norm": 1.9740858306569236, "learning_rate": 1.8976117233013278e-05, "loss": 0.8612, "step": 1041 }, { "epoch": 0.1711423174837809, "grad_norm": 1.8926699122066026, "learning_rate": 1.8973770876837772e-05, "loss": 0.8437, "step": 1042 }, { "epoch": 0.17130656155046398, "grad_norm": 1.6387232665383649, "learning_rate": 1.8971421980655295e-05, "loss": 0.8696, "step": 1043 }, { "epoch": 0.1714708056171471, "grad_norm": 2.3346077247269825, "learning_rate": 1.8969070545130702e-05, "loss": 0.9212, "step": 1044 }, { "epoch": 0.17163504968383017, "grad_norm": 5.728889294625457, "learning_rate": 1.896671657092956e-05, "loss": 0.8304, "step": 1045 }, { "epoch": 0.17179929375051325, "grad_norm": 2.5122928863779705, "learning_rate": 1.8964360058718162e-05, "loss": 0.9236, "step": 1046 }, { "epoch": 0.17196353781719637, "grad_norm": 2.4126000080599557, "learning_rate": 1.8962001009163506e-05, "loss": 0.6679, "step": 1047 }, { "epoch": 0.17212778188387945, "grad_norm": 4.543919482565323, "learning_rate": 1.8959639422933316e-05, "loss": 0.7917, "step": 1048 }, { "epoch": 0.17229202595056253, "grad_norm": 1.7374499094376519, "learning_rate": 1.8957275300696036e-05, "loss": 0.7982, "step": 1049 }, { "epoch": 0.17245627001724562, "grad_norm": 2.369868411065794, "learning_rate": 1.895490864312083e-05, "loss": 0.8881, "step": 1050 }, { "epoch": 0.17262051408392873, "grad_norm": 2.3088099842893928, "learning_rate": 1.895253945087757e-05, "loss": 0.835, "step": 1051 }, { "epoch": 0.1727847581506118, "grad_norm": 1.9934428113172995, "learning_rate": 1.8950167724636856e-05, "loss": 0.8957, "step": 1052 }, { "epoch": 0.1729490022172949, "grad_norm": 1.6884640116476086, "learning_rate": 1.894779346506999e-05, "loss": 0.8121, "step": 1053 }, { "epoch": 0.17311324628397798, "grad_norm": 1.6829506282318156, "learning_rate": 1.8945416672849014e-05, "loss": 0.8181, "step": 1054 }, { "epoch": 0.1732774903506611, "grad_norm": 1.6085416050367256, "learning_rate": 1.8943037348646668e-05, "loss": 0.8062, "step": 1055 }, { "epoch": 0.17344173441734417, "grad_norm": 2.9244991228676254, "learning_rate": 1.8940655493136415e-05, "loss": 0.8023, "step": 1056 }, { "epoch": 0.17360597848402726, "grad_norm": 1.711462395645189, "learning_rate": 1.8938271106992433e-05, "loss": 0.841, "step": 1057 }, { "epoch": 0.17377022255071037, "grad_norm": 1.8018964261291752, "learning_rate": 1.893588419088962e-05, "loss": 0.9328, "step": 1058 }, { "epoch": 0.17393446661739345, "grad_norm": 1.7393799484942076, "learning_rate": 1.8933494745503585e-05, "loss": 0.7877, "step": 1059 }, { "epoch": 0.17409871068407654, "grad_norm": 1.9688993574999356, "learning_rate": 1.8931102771510657e-05, "loss": 0.8435, "step": 1060 }, { "epoch": 0.17426295475075962, "grad_norm": 1.6687685865283675, "learning_rate": 1.8928708269587876e-05, "loss": 0.8382, "step": 1061 }, { "epoch": 0.17442719881744273, "grad_norm": 3.4080326174094, "learning_rate": 1.8926311240413008e-05, "loss": 0.8091, "step": 1062 }, { "epoch": 0.1745914428841258, "grad_norm": 1.9893894848722062, "learning_rate": 1.892391168466452e-05, "loss": 0.7694, "step": 1063 }, { "epoch": 0.1747556869508089, "grad_norm": 1.5572699851554657, "learning_rate": 1.8921509603021606e-05, "loss": 0.8837, "step": 1064 }, { "epoch": 0.17491993101749198, "grad_norm": 2.3464682138248505, "learning_rate": 1.8919104996164167e-05, "loss": 0.8417, "step": 1065 }, { "epoch": 0.1750841750841751, "grad_norm": 1.843122911919255, "learning_rate": 1.8916697864772822e-05, "loss": 0.8716, "step": 1066 }, { "epoch": 0.17524841915085818, "grad_norm": 1.7391453165886959, "learning_rate": 1.8914288209528907e-05, "loss": 0.8598, "step": 1067 }, { "epoch": 0.17541266321754126, "grad_norm": 1.607768094612111, "learning_rate": 1.891187603111447e-05, "loss": 0.8131, "step": 1068 }, { "epoch": 0.17557690728422437, "grad_norm": 1.5898831814550725, "learning_rate": 1.8909461330212267e-05, "loss": 0.789, "step": 1069 }, { "epoch": 0.17574115135090745, "grad_norm": 1.8231428317222904, "learning_rate": 1.890704410750578e-05, "loss": 0.7442, "step": 1070 }, { "epoch": 0.17590539541759054, "grad_norm": 2.236347663377677, "learning_rate": 1.89046243636792e-05, "loss": 0.7619, "step": 1071 }, { "epoch": 0.17606963948427362, "grad_norm": 1.725925503511682, "learning_rate": 1.8902202099417425e-05, "loss": 0.857, "step": 1072 }, { "epoch": 0.17623388355095673, "grad_norm": 1.8248453347066627, "learning_rate": 1.8899777315406073e-05, "loss": 0.8931, "step": 1073 }, { "epoch": 0.17639812761763982, "grad_norm": 2.248811690571544, "learning_rate": 1.8897350012331478e-05, "loss": 0.851, "step": 1074 }, { "epoch": 0.1765623716843229, "grad_norm": 1.679587789305827, "learning_rate": 1.889492019088068e-05, "loss": 0.8906, "step": 1075 }, { "epoch": 0.17672661575100598, "grad_norm": 2.276822422579005, "learning_rate": 1.889248785174143e-05, "loss": 0.8738, "step": 1076 }, { "epoch": 0.1768908598176891, "grad_norm": 1.972966532070285, "learning_rate": 1.8890052995602207e-05, "loss": 0.9206, "step": 1077 }, { "epoch": 0.17705510388437218, "grad_norm": 2.01920978163134, "learning_rate": 1.8887615623152188e-05, "loss": 0.8918, "step": 1078 }, { "epoch": 0.17721934795105526, "grad_norm": 2.155714544131627, "learning_rate": 1.888517573508126e-05, "loss": 0.8275, "step": 1079 }, { "epoch": 0.17738359201773837, "grad_norm": 2.081651089986359, "learning_rate": 1.8882733332080038e-05, "loss": 0.8694, "step": 1080 }, { "epoch": 0.17754783608442146, "grad_norm": 1.7313039489432462, "learning_rate": 1.888028841483983e-05, "loss": 0.8084, "step": 1081 }, { "epoch": 0.17771208015110454, "grad_norm": 1.8154374166697893, "learning_rate": 1.887784098405267e-05, "loss": 0.7588, "step": 1082 }, { "epoch": 0.17787632421778762, "grad_norm": 1.9617154809739192, "learning_rate": 1.88753910404113e-05, "loss": 0.9247, "step": 1083 }, { "epoch": 0.17804056828447073, "grad_norm": 1.9996039725077153, "learning_rate": 1.8872938584609164e-05, "loss": 0.8371, "step": 1084 }, { "epoch": 0.17820481235115382, "grad_norm": 1.7648413727905565, "learning_rate": 1.887048361734043e-05, "loss": 0.804, "step": 1085 }, { "epoch": 0.1783690564178369, "grad_norm": 1.818607908155781, "learning_rate": 1.886802613929997e-05, "loss": 0.7803, "step": 1086 }, { "epoch": 0.17853330048451999, "grad_norm": 1.5583339613425553, "learning_rate": 1.8865566151183365e-05, "loss": 0.7705, "step": 1087 }, { "epoch": 0.1786975445512031, "grad_norm": 0.9284119742035954, "learning_rate": 1.8863103653686917e-05, "loss": 0.386, "step": 1088 }, { "epoch": 0.17886178861788618, "grad_norm": 2.0460231565203952, "learning_rate": 1.8860638647507622e-05, "loss": 0.8962, "step": 1089 }, { "epoch": 0.17902603268456926, "grad_norm": 1.9280827009562027, "learning_rate": 1.8858171133343202e-05, "loss": 0.8774, "step": 1090 }, { "epoch": 0.17919027675125235, "grad_norm": 1.8100315014460457, "learning_rate": 1.885570111189208e-05, "loss": 0.8114, "step": 1091 }, { "epoch": 0.17935452081793546, "grad_norm": 1.8971654256109765, "learning_rate": 1.8853228583853384e-05, "loss": 0.8848, "step": 1092 }, { "epoch": 0.17951876488461854, "grad_norm": 1.8514450808376703, "learning_rate": 1.8850753549926967e-05, "loss": 0.8861, "step": 1093 }, { "epoch": 0.17968300895130163, "grad_norm": 2.0091422628675164, "learning_rate": 1.8848276010813377e-05, "loss": 0.9532, "step": 1094 }, { "epoch": 0.17984725301798474, "grad_norm": 2.073474087100087, "learning_rate": 1.8845795967213876e-05, "loss": 0.7827, "step": 1095 }, { "epoch": 0.18001149708466782, "grad_norm": 1.611718760085573, "learning_rate": 1.884331341983044e-05, "loss": 0.8056, "step": 1096 }, { "epoch": 0.1801757411513509, "grad_norm": 1.6212221587031124, "learning_rate": 1.8840828369365743e-05, "loss": 0.8525, "step": 1097 }, { "epoch": 0.180339985218034, "grad_norm": 1.6556887859914926, "learning_rate": 1.8838340816523175e-05, "loss": 0.9442, "step": 1098 }, { "epoch": 0.1805042292847171, "grad_norm": 1.7323144860221653, "learning_rate": 1.883585076200683e-05, "loss": 0.8261, "step": 1099 }, { "epoch": 0.18066847335140018, "grad_norm": 1.961665774965503, "learning_rate": 1.883335820652152e-05, "loss": 0.8715, "step": 1100 }, { "epoch": 0.18083271741808327, "grad_norm": 1.7604510784007628, "learning_rate": 1.8830863150772754e-05, "loss": 0.8993, "step": 1101 }, { "epoch": 0.18099696148476635, "grad_norm": 2.294220883787023, "learning_rate": 1.882836559546675e-05, "loss": 0.8027, "step": 1102 }, { "epoch": 0.18116120555144946, "grad_norm": 0.7619536922909695, "learning_rate": 1.8825865541310438e-05, "loss": 0.3442, "step": 1103 }, { "epoch": 0.18132544961813254, "grad_norm": 0.7808519447910388, "learning_rate": 1.882336298901145e-05, "loss": 0.3704, "step": 1104 }, { "epoch": 0.18148969368481563, "grad_norm": 2.0066197899653604, "learning_rate": 1.8820857939278136e-05, "loss": 0.7025, "step": 1105 }, { "epoch": 0.18165393775149874, "grad_norm": 1.69001145131898, "learning_rate": 1.8818350392819535e-05, "loss": 0.822, "step": 1106 }, { "epoch": 0.18181818181818182, "grad_norm": 1.6382494465285142, "learning_rate": 1.881584035034541e-05, "loss": 0.8572, "step": 1107 }, { "epoch": 0.1819824258848649, "grad_norm": 2.301467793390923, "learning_rate": 1.8813327812566217e-05, "loss": 0.7944, "step": 1108 }, { "epoch": 0.182146669951548, "grad_norm": 1.5692550576520372, "learning_rate": 1.881081278019313e-05, "loss": 0.8421, "step": 1109 }, { "epoch": 0.1823109140182311, "grad_norm": 1.5085077456033331, "learning_rate": 1.8808295253938025e-05, "loss": 0.8155, "step": 1110 }, { "epoch": 0.18247515808491419, "grad_norm": 2.219552386080805, "learning_rate": 1.8805775234513476e-05, "loss": 0.923, "step": 1111 }, { "epoch": 0.18263940215159727, "grad_norm": 0.7897020834273988, "learning_rate": 1.8803252722632775e-05, "loss": 0.3503, "step": 1112 }, { "epoch": 0.18280364621828035, "grad_norm": 1.7918983832642896, "learning_rate": 1.880072771900991e-05, "loss": 0.8452, "step": 1113 }, { "epoch": 0.18296789028496346, "grad_norm": 1.639700090630405, "learning_rate": 1.879820022435958e-05, "loss": 0.7858, "step": 1114 }, { "epoch": 0.18313213435164655, "grad_norm": 2.3874664125309866, "learning_rate": 1.8795670239397184e-05, "loss": 0.8242, "step": 1115 }, { "epoch": 0.18329637841832963, "grad_norm": 1.7231068891716699, "learning_rate": 1.8793137764838834e-05, "loss": 0.8908, "step": 1116 }, { "epoch": 0.18346062248501274, "grad_norm": 2.180388533462677, "learning_rate": 1.879060280140134e-05, "loss": 0.8039, "step": 1117 }, { "epoch": 0.18362486655169583, "grad_norm": 1.5034584809972915, "learning_rate": 1.878806534980221e-05, "loss": 0.876, "step": 1118 }, { "epoch": 0.1837891106183789, "grad_norm": 0.6666763134279081, "learning_rate": 1.8785525410759676e-05, "loss": 0.3026, "step": 1119 }, { "epoch": 0.183953354685062, "grad_norm": 2.097499915663113, "learning_rate": 1.878298298499266e-05, "loss": 0.8598, "step": 1120 }, { "epoch": 0.1841175987517451, "grad_norm": 1.8050090072880205, "learning_rate": 1.8780438073220785e-05, "loss": 0.9205, "step": 1121 }, { "epoch": 0.1842818428184282, "grad_norm": 2.702354067897842, "learning_rate": 1.8777890676164387e-05, "loss": 0.7834, "step": 1122 }, { "epoch": 0.18444608688511127, "grad_norm": 2.1172661978146285, "learning_rate": 1.8775340794544497e-05, "loss": 0.8035, "step": 1123 }, { "epoch": 0.18461033095179435, "grad_norm": 2.5934952637585074, "learning_rate": 1.877278842908286e-05, "loss": 0.9521, "step": 1124 }, { "epoch": 0.18477457501847747, "grad_norm": 2.1190052559287604, "learning_rate": 1.8770233580501913e-05, "loss": 0.8063, "step": 1125 }, { "epoch": 0.18493881908516055, "grad_norm": 1.8908021141530498, "learning_rate": 1.87676762495248e-05, "loss": 0.743, "step": 1126 }, { "epoch": 0.18510306315184363, "grad_norm": 1.8255051961736697, "learning_rate": 1.8765116436875374e-05, "loss": 0.7473, "step": 1127 }, { "epoch": 0.18526730721852674, "grad_norm": 2.6937470424092003, "learning_rate": 1.876255414327818e-05, "loss": 0.8468, "step": 1128 }, { "epoch": 0.18543155128520983, "grad_norm": 1.8575743368255966, "learning_rate": 1.8759989369458468e-05, "loss": 0.8622, "step": 1129 }, { "epoch": 0.1855957953518929, "grad_norm": 1.5204053851211308, "learning_rate": 1.8757422116142198e-05, "loss": 0.7741, "step": 1130 }, { "epoch": 0.185760039418576, "grad_norm": 1.8679762755453089, "learning_rate": 1.875485238405602e-05, "loss": 0.8277, "step": 1131 }, { "epoch": 0.1859242834852591, "grad_norm": 1.8990899400218808, "learning_rate": 1.875228017392729e-05, "loss": 0.746, "step": 1132 }, { "epoch": 0.1860885275519422, "grad_norm": 1.8544187682658844, "learning_rate": 1.8749705486484074e-05, "loss": 0.7926, "step": 1133 }, { "epoch": 0.18625277161862527, "grad_norm": 1.9961073384934014, "learning_rate": 1.8747128322455128e-05, "loss": 0.8588, "step": 1134 }, { "epoch": 0.18641701568530836, "grad_norm": 1.987484311617175, "learning_rate": 1.8744548682569914e-05, "loss": 0.8137, "step": 1135 }, { "epoch": 0.18658125975199147, "grad_norm": 1.8176004571028048, "learning_rate": 1.874196656755859e-05, "loss": 0.8238, "step": 1136 }, { "epoch": 0.18674550381867455, "grad_norm": 1.7239233526703492, "learning_rate": 1.873938197815202e-05, "loss": 0.8117, "step": 1137 }, { "epoch": 0.18690974788535764, "grad_norm": 1.3086602805000769, "learning_rate": 1.8736794915081765e-05, "loss": 0.8835, "step": 1138 }, { "epoch": 0.18707399195204072, "grad_norm": 6.431394445635227, "learning_rate": 1.8734205379080093e-05, "loss": 0.7806, "step": 1139 }, { "epoch": 0.18723823601872383, "grad_norm": 1.529622013266471, "learning_rate": 1.8731613370879963e-05, "loss": 0.839, "step": 1140 }, { "epoch": 0.18740248008540691, "grad_norm": 2.1172949318499907, "learning_rate": 1.8729018891215042e-05, "loss": 0.8799, "step": 1141 }, { "epoch": 0.18756672415209, "grad_norm": 2.8018317504715813, "learning_rate": 1.8726421940819683e-05, "loss": 0.8419, "step": 1142 }, { "epoch": 0.1877309682187731, "grad_norm": 1.5555763309491684, "learning_rate": 1.8723822520428954e-05, "loss": 0.7442, "step": 1143 }, { "epoch": 0.1878952122854562, "grad_norm": 1.7458883427903134, "learning_rate": 1.8721220630778613e-05, "loss": 0.8773, "step": 1144 }, { "epoch": 0.18805945635213928, "grad_norm": 2.1012775722645056, "learning_rate": 1.871861627260512e-05, "loss": 0.916, "step": 1145 }, { "epoch": 0.18822370041882236, "grad_norm": 2.045875454755917, "learning_rate": 1.8716009446645636e-05, "loss": 0.7344, "step": 1146 }, { "epoch": 0.18838794448550547, "grad_norm": 1.637681103054023, "learning_rate": 1.8713400153638013e-05, "loss": 0.9315, "step": 1147 }, { "epoch": 0.18855218855218855, "grad_norm": 1.4550412373981358, "learning_rate": 1.8710788394320807e-05, "loss": 0.7914, "step": 1148 }, { "epoch": 0.18871643261887164, "grad_norm": 1.9508269202764046, "learning_rate": 1.870817416943327e-05, "loss": 0.8912, "step": 1149 }, { "epoch": 0.18888067668555472, "grad_norm": 1.604486538757915, "learning_rate": 1.8705557479715363e-05, "loss": 0.8663, "step": 1150 }, { "epoch": 0.18904492075223783, "grad_norm": 7.442342646928074, "learning_rate": 1.870293832590772e-05, "loss": 0.8567, "step": 1151 }, { "epoch": 0.18920916481892092, "grad_norm": 2.0369233017608157, "learning_rate": 1.8700316708751693e-05, "loss": 0.8997, "step": 1152 }, { "epoch": 0.189373408885604, "grad_norm": 2.0303196984126224, "learning_rate": 1.8697692628989327e-05, "loss": 0.8218, "step": 1153 }, { "epoch": 0.1895376529522871, "grad_norm": 1.9946380038720792, "learning_rate": 1.869506608736336e-05, "loss": 0.8424, "step": 1154 }, { "epoch": 0.1897018970189702, "grad_norm": 2.537500243489021, "learning_rate": 1.869243708461723e-05, "loss": 0.8407, "step": 1155 }, { "epoch": 0.18986614108565328, "grad_norm": 2.1437860996969573, "learning_rate": 1.8689805621495072e-05, "loss": 0.8035, "step": 1156 }, { "epoch": 0.19003038515233636, "grad_norm": 2.179083730030705, "learning_rate": 1.8687171698741714e-05, "loss": 0.8023, "step": 1157 }, { "epoch": 0.19019462921901947, "grad_norm": 1.761285361500609, "learning_rate": 1.868453531710268e-05, "loss": 0.9511, "step": 1158 }, { "epoch": 0.19035887328570256, "grad_norm": 1.821943308649356, "learning_rate": 1.8681896477324198e-05, "loss": 0.902, "step": 1159 }, { "epoch": 0.19052311735238564, "grad_norm": 1.6818709338138116, "learning_rate": 1.8679255180153184e-05, "loss": 0.9405, "step": 1160 }, { "epoch": 0.19068736141906872, "grad_norm": 1.7555166346530386, "learning_rate": 1.867661142633725e-05, "loss": 0.8898, "step": 1161 }, { "epoch": 0.19085160548575184, "grad_norm": 1.7319379051807642, "learning_rate": 1.8673965216624704e-05, "loss": 0.8601, "step": 1162 }, { "epoch": 0.19101584955243492, "grad_norm": 1.663600728846235, "learning_rate": 1.8671316551764552e-05, "loss": 0.8282, "step": 1163 }, { "epoch": 0.191180093619118, "grad_norm": 1.883118389151872, "learning_rate": 1.8668665432506496e-05, "loss": 0.8728, "step": 1164 }, { "epoch": 0.1913443376858011, "grad_norm": 2.0767567895686576, "learning_rate": 1.8666011859600925e-05, "loss": 0.8931, "step": 1165 }, { "epoch": 0.1915085817524842, "grad_norm": 4.40606369485786, "learning_rate": 1.8663355833798927e-05, "loss": 0.8577, "step": 1166 }, { "epoch": 0.19167282581916728, "grad_norm": 2.3276832892982315, "learning_rate": 1.8660697355852288e-05, "loss": 0.8514, "step": 1167 }, { "epoch": 0.19183706988585036, "grad_norm": 2.411375321073403, "learning_rate": 1.865803642651348e-05, "loss": 0.7971, "step": 1168 }, { "epoch": 0.19200131395253348, "grad_norm": 1.6535908806299489, "learning_rate": 1.8655373046535682e-05, "loss": 0.922, "step": 1169 }, { "epoch": 0.19216555801921656, "grad_norm": 2.380104212503988, "learning_rate": 1.8652707216672747e-05, "loss": 0.7683, "step": 1170 }, { "epoch": 0.19232980208589964, "grad_norm": 4.105494702311962, "learning_rate": 1.865003893767924e-05, "loss": 0.7652, "step": 1171 }, { "epoch": 0.19249404615258273, "grad_norm": 2.304866796633562, "learning_rate": 1.864736821031041e-05, "loss": 0.9003, "step": 1172 }, { "epoch": 0.19265829021926584, "grad_norm": 1.7173353988078406, "learning_rate": 1.8644695035322203e-05, "loss": 0.7741, "step": 1173 }, { "epoch": 0.19282253428594892, "grad_norm": 1.876557533585563, "learning_rate": 1.864201941347125e-05, "loss": 0.785, "step": 1174 }, { "epoch": 0.192986778352632, "grad_norm": 1.8824446861653261, "learning_rate": 1.863934134551488e-05, "loss": 0.9054, "step": 1175 }, { "epoch": 0.19315102241931512, "grad_norm": 1.7034141654551767, "learning_rate": 1.8636660832211126e-05, "loss": 0.8459, "step": 1176 }, { "epoch": 0.1933152664859982, "grad_norm": 1.5732242344007759, "learning_rate": 1.8633977874318686e-05, "loss": 0.8695, "step": 1177 }, { "epoch": 0.19347951055268128, "grad_norm": 1.5596710638588533, "learning_rate": 1.8631292472596978e-05, "loss": 0.8372, "step": 1178 }, { "epoch": 0.19364375461936437, "grad_norm": 6.01263509625628, "learning_rate": 1.862860462780609e-05, "loss": 0.8547, "step": 1179 }, { "epoch": 0.19380799868604748, "grad_norm": 1.5142364827228612, "learning_rate": 1.8625914340706818e-05, "loss": 0.8686, "step": 1180 }, { "epoch": 0.19397224275273056, "grad_norm": 1.866018722902613, "learning_rate": 1.862322161206064e-05, "loss": 0.7677, "step": 1181 }, { "epoch": 0.19413648681941365, "grad_norm": 1.8230785194542451, "learning_rate": 1.862052644262972e-05, "loss": 0.8448, "step": 1182 }, { "epoch": 0.19430073088609673, "grad_norm": 2.6128930455743546, "learning_rate": 1.8617828833176935e-05, "loss": 0.7782, "step": 1183 }, { "epoch": 0.19446497495277984, "grad_norm": 1.7667382593628438, "learning_rate": 1.8615128784465826e-05, "loss": 0.8628, "step": 1184 }, { "epoch": 0.19462921901946292, "grad_norm": 4.091662756971155, "learning_rate": 1.861242629726064e-05, "loss": 0.8299, "step": 1185 }, { "epoch": 0.194793463086146, "grad_norm": 1.9064452807545198, "learning_rate": 1.8609721372326305e-05, "loss": 0.782, "step": 1186 }, { "epoch": 0.1949577071528291, "grad_norm": 1.5585551216732805, "learning_rate": 1.8607014010428454e-05, "loss": 0.8118, "step": 1187 }, { "epoch": 0.1951219512195122, "grad_norm": 3.048038736465385, "learning_rate": 1.860430421233339e-05, "loss": 0.9075, "step": 1188 }, { "epoch": 0.19528619528619529, "grad_norm": 1.7904813948360225, "learning_rate": 1.8601591978808126e-05, "loss": 0.7944, "step": 1189 }, { "epoch": 0.19545043935287837, "grad_norm": 2.0788548185010334, "learning_rate": 1.8598877310620347e-05, "loss": 0.7477, "step": 1190 }, { "epoch": 0.19561468341956148, "grad_norm": 4.7738543649733804, "learning_rate": 1.859616020853843e-05, "loss": 0.7704, "step": 1191 }, { "epoch": 0.19577892748624456, "grad_norm": 1.8800126047862482, "learning_rate": 1.859344067333146e-05, "loss": 0.8968, "step": 1192 }, { "epoch": 0.19594317155292765, "grad_norm": 1.4996415640229075, "learning_rate": 1.859071870576918e-05, "loss": 0.8713, "step": 1193 }, { "epoch": 0.19610741561961073, "grad_norm": 2.597570935190801, "learning_rate": 1.8587994306622047e-05, "loss": 0.8898, "step": 1194 }, { "epoch": 0.19627165968629384, "grad_norm": 4.09939178192434, "learning_rate": 1.8585267476661187e-05, "loss": 0.9014, "step": 1195 }, { "epoch": 0.19643590375297693, "grad_norm": 1.8989316055198002, "learning_rate": 1.8582538216658435e-05, "loss": 0.8594, "step": 1196 }, { "epoch": 0.19660014781966, "grad_norm": 8.315695952766212, "learning_rate": 1.85798065273863e-05, "loss": 0.8204, "step": 1197 }, { "epoch": 0.1967643918863431, "grad_norm": 1.8475729080988692, "learning_rate": 1.857707240961797e-05, "loss": 0.8645, "step": 1198 }, { "epoch": 0.1969286359530262, "grad_norm": 1.8275733555042544, "learning_rate": 1.8574335864127344e-05, "loss": 0.8859, "step": 1199 }, { "epoch": 0.1970928800197093, "grad_norm": 2.3610708820577373, "learning_rate": 1.8571596891688988e-05, "loss": 0.8542, "step": 1200 }, { "epoch": 0.19725712408639237, "grad_norm": 1.6707758144461768, "learning_rate": 1.8568855493078165e-05, "loss": 0.8745, "step": 1201 }, { "epoch": 0.19742136815307548, "grad_norm": 2.018033017882713, "learning_rate": 1.8566111669070822e-05, "loss": 0.8396, "step": 1202 }, { "epoch": 0.19758561221975857, "grad_norm": 1.7194330417396837, "learning_rate": 1.8563365420443594e-05, "loss": 0.8632, "step": 1203 }, { "epoch": 0.19774985628644165, "grad_norm": 2.614507228318818, "learning_rate": 1.85606167479738e-05, "loss": 0.8339, "step": 1204 }, { "epoch": 0.19791410035312473, "grad_norm": 2.3559723182472476, "learning_rate": 1.8557865652439445e-05, "loss": 0.8328, "step": 1205 }, { "epoch": 0.19807834441980784, "grad_norm": 1.9062739562201685, "learning_rate": 1.8555112134619218e-05, "loss": 0.8028, "step": 1206 }, { "epoch": 0.19824258848649093, "grad_norm": 1.9819105587319388, "learning_rate": 1.85523561952925e-05, "loss": 0.8598, "step": 1207 }, { "epoch": 0.198406832553174, "grad_norm": 1.8009763418454598, "learning_rate": 1.854959783523936e-05, "loss": 0.7891, "step": 1208 }, { "epoch": 0.1985710766198571, "grad_norm": 1.5040329477379972, "learning_rate": 1.8546837055240536e-05, "loss": 0.7878, "step": 1209 }, { "epoch": 0.1987353206865402, "grad_norm": 1.6961365748866157, "learning_rate": 1.854407385607746e-05, "loss": 0.8563, "step": 1210 }, { "epoch": 0.1988995647532233, "grad_norm": 2.4533604203369253, "learning_rate": 1.8541308238532257e-05, "loss": 0.8447, "step": 1211 }, { "epoch": 0.19906380881990637, "grad_norm": 1.7108464721324224, "learning_rate": 1.853854020338773e-05, "loss": 0.8767, "step": 1212 }, { "epoch": 0.19922805288658949, "grad_norm": 1.5484792671133374, "learning_rate": 1.853576975142736e-05, "loss": 0.826, "step": 1213 }, { "epoch": 0.19939229695327257, "grad_norm": 1.6758985356062976, "learning_rate": 1.853299688343532e-05, "loss": 0.7907, "step": 1214 }, { "epoch": 0.19955654101995565, "grad_norm": 2.8729786098292918, "learning_rate": 1.8530221600196462e-05, "loss": 0.7979, "step": 1215 }, { "epoch": 0.19972078508663874, "grad_norm": 2.1120881897244086, "learning_rate": 1.8527443902496325e-05, "loss": 0.9416, "step": 1216 }, { "epoch": 0.19988502915332185, "grad_norm": 3.2715208627036576, "learning_rate": 1.8524663791121134e-05, "loss": 0.8684, "step": 1217 }, { "epoch": 0.20004927322000493, "grad_norm": 2.0397997170897084, "learning_rate": 1.852188126685779e-05, "loss": 0.7951, "step": 1218 }, { "epoch": 0.20021351728668801, "grad_norm": 2.3279322377963987, "learning_rate": 1.851909633049388e-05, "loss": 0.7573, "step": 1219 }, { "epoch": 0.2003777613533711, "grad_norm": 2.1944540779142585, "learning_rate": 1.8516308982817685e-05, "loss": 0.7147, "step": 1220 }, { "epoch": 0.2005420054200542, "grad_norm": 1.901951012436895, "learning_rate": 1.851351922461814e-05, "loss": 0.8066, "step": 1221 }, { "epoch": 0.2007062494867373, "grad_norm": 1.5457868287924197, "learning_rate": 1.851072705668489e-05, "loss": 0.8164, "step": 1222 }, { "epoch": 0.20087049355342038, "grad_norm": 2.5415943951215954, "learning_rate": 1.8507932479808254e-05, "loss": 0.8083, "step": 1223 }, { "epoch": 0.2010347376201035, "grad_norm": 1.9939976861461184, "learning_rate": 1.8505135494779228e-05, "loss": 0.7491, "step": 1224 }, { "epoch": 0.20119898168678657, "grad_norm": 2.0011108191889675, "learning_rate": 1.8502336102389494e-05, "loss": 0.8904, "step": 1225 }, { "epoch": 0.20136322575346965, "grad_norm": 0.8608452927097782, "learning_rate": 1.8499534303431414e-05, "loss": 0.3775, "step": 1226 }, { "epoch": 0.20152746982015274, "grad_norm": 2.207645296432335, "learning_rate": 1.849673009869803e-05, "loss": 0.8229, "step": 1227 }, { "epoch": 0.20169171388683585, "grad_norm": 2.321544805555389, "learning_rate": 1.8493923488983066e-05, "loss": 0.8, "step": 1228 }, { "epoch": 0.20185595795351893, "grad_norm": 1.9932538671086861, "learning_rate": 1.849111447508093e-05, "loss": 0.8415, "step": 1229 }, { "epoch": 0.20202020202020202, "grad_norm": 2.331965767495245, "learning_rate": 1.8488303057786707e-05, "loss": 0.8621, "step": 1230 }, { "epoch": 0.2021844460868851, "grad_norm": 1.8143449998289656, "learning_rate": 1.848548923789616e-05, "loss": 0.7392, "step": 1231 }, { "epoch": 0.2023486901535682, "grad_norm": 2.12025836426402, "learning_rate": 1.8482673016205734e-05, "loss": 0.8071, "step": 1232 }, { "epoch": 0.2025129342202513, "grad_norm": 1.772817316764639, "learning_rate": 1.847985439351256e-05, "loss": 0.9128, "step": 1233 }, { "epoch": 0.20267717828693438, "grad_norm": 0.7883080848289032, "learning_rate": 1.8477033370614438e-05, "loss": 0.3486, "step": 1234 }, { "epoch": 0.20284142235361746, "grad_norm": 1.657217743349721, "learning_rate": 1.8474209948309852e-05, "loss": 0.8132, "step": 1235 }, { "epoch": 0.20300566642030057, "grad_norm": 3.2754999348363847, "learning_rate": 1.8471384127397974e-05, "loss": 0.7946, "step": 1236 }, { "epoch": 0.20316991048698366, "grad_norm": 1.92012815112875, "learning_rate": 1.8468555908678638e-05, "loss": 0.9329, "step": 1237 }, { "epoch": 0.20333415455366674, "grad_norm": 0.6735728394018957, "learning_rate": 1.846572529295237e-05, "loss": 0.3531, "step": 1238 }, { "epoch": 0.20349839862034985, "grad_norm": 2.029242604221141, "learning_rate": 1.8462892281020365e-05, "loss": 0.8139, "step": 1239 }, { "epoch": 0.20366264268703294, "grad_norm": 4.019961366748574, "learning_rate": 1.8460056873684503e-05, "loss": 0.7546, "step": 1240 }, { "epoch": 0.20382688675371602, "grad_norm": 2.336849954318392, "learning_rate": 1.8457219071747345e-05, "loss": 0.7812, "step": 1241 }, { "epoch": 0.2039911308203991, "grad_norm": 1.724895402731809, "learning_rate": 1.845437887601212e-05, "loss": 0.8335, "step": 1242 }, { "epoch": 0.2041553748870822, "grad_norm": 1.9580551330677458, "learning_rate": 1.845153628728274e-05, "loss": 0.8953, "step": 1243 }, { "epoch": 0.2043196189537653, "grad_norm": 1.4512541154405059, "learning_rate": 1.8448691306363798e-05, "loss": 0.9135, "step": 1244 }, { "epoch": 0.20448386302044838, "grad_norm": 2.109077267037824, "learning_rate": 1.844584393406055e-05, "loss": 0.7774, "step": 1245 }, { "epoch": 0.20464810708713146, "grad_norm": 1.8643682663130328, "learning_rate": 1.8442994171178948e-05, "loss": 0.8453, "step": 1246 }, { "epoch": 0.20481235115381458, "grad_norm": 1.9794050135491112, "learning_rate": 1.844014201852561e-05, "loss": 0.8585, "step": 1247 }, { "epoch": 0.20497659522049766, "grad_norm": 2.5188759635375986, "learning_rate": 1.8437287476907828e-05, "loss": 0.8361, "step": 1248 }, { "epoch": 0.20514083928718074, "grad_norm": 1.9062843452344977, "learning_rate": 1.8434430547133576e-05, "loss": 0.7733, "step": 1249 }, { "epoch": 0.20530508335386385, "grad_norm": 1.7342393609085802, "learning_rate": 1.8431571230011504e-05, "loss": 0.8804, "step": 1250 }, { "epoch": 0.20546932742054694, "grad_norm": 1.5312311184437684, "learning_rate": 1.8428709526350932e-05, "loss": 0.8513, "step": 1251 }, { "epoch": 0.20563357148723002, "grad_norm": 1.7322676649129554, "learning_rate": 1.8425845436961863e-05, "loss": 0.869, "step": 1252 }, { "epoch": 0.2057978155539131, "grad_norm": 1.7252584893666447, "learning_rate": 1.842297896265497e-05, "loss": 0.9381, "step": 1253 }, { "epoch": 0.20596205962059622, "grad_norm": 2.0061940835557235, "learning_rate": 1.8420110104241598e-05, "loss": 0.6822, "step": 1254 }, { "epoch": 0.2061263036872793, "grad_norm": 1.831104638801452, "learning_rate": 1.841723886253378e-05, "loss": 0.8024, "step": 1255 }, { "epoch": 0.20629054775396238, "grad_norm": 1.839457925526063, "learning_rate": 1.8414365238344208e-05, "loss": 0.8949, "step": 1256 }, { "epoch": 0.20645479182064547, "grad_norm": 2.340144284136591, "learning_rate": 1.8411489232486256e-05, "loss": 0.8205, "step": 1257 }, { "epoch": 0.20661903588732858, "grad_norm": 2.0485511812431847, "learning_rate": 1.8408610845773974e-05, "loss": 0.814, "step": 1258 }, { "epoch": 0.20678327995401166, "grad_norm": 2.3985931125138316, "learning_rate": 1.8405730079022083e-05, "loss": 0.8057, "step": 1259 }, { "epoch": 0.20694752402069475, "grad_norm": 1.6760881951973292, "learning_rate": 1.8402846933045974e-05, "loss": 0.8211, "step": 1260 }, { "epoch": 0.20711176808737786, "grad_norm": 2.1871998418192566, "learning_rate": 1.8399961408661725e-05, "loss": 0.8001, "step": 1261 }, { "epoch": 0.20727601215406094, "grad_norm": 2.1389273412827157, "learning_rate": 1.8397073506686066e-05, "loss": 0.7845, "step": 1262 }, { "epoch": 0.20744025622074402, "grad_norm": 1.9970175549473543, "learning_rate": 1.8394183227936418e-05, "loss": 0.7773, "step": 1263 }, { "epoch": 0.2076045002874271, "grad_norm": 1.720900989139485, "learning_rate": 1.839129057323087e-05, "loss": 0.8179, "step": 1264 }, { "epoch": 0.20776874435411022, "grad_norm": 1.935433630703798, "learning_rate": 1.8388395543388174e-05, "loss": 0.8716, "step": 1265 }, { "epoch": 0.2079329884207933, "grad_norm": 1.6706918545945628, "learning_rate": 1.838549813922777e-05, "loss": 0.9299, "step": 1266 }, { "epoch": 0.20809723248747639, "grad_norm": 2.0072588344579962, "learning_rate": 1.838259836156976e-05, "loss": 0.8803, "step": 1267 }, { "epoch": 0.20826147655415947, "grad_norm": 1.8666667385225546, "learning_rate": 1.8379696211234918e-05, "loss": 0.8691, "step": 1268 }, { "epoch": 0.20842572062084258, "grad_norm": 3.067066554975827, "learning_rate": 1.8376791689044693e-05, "loss": 0.7688, "step": 1269 }, { "epoch": 0.20858996468752566, "grad_norm": 24.43680480298238, "learning_rate": 1.8373884795821203e-05, "loss": 0.8465, "step": 1270 }, { "epoch": 0.20875420875420875, "grad_norm": 2.682757323548447, "learning_rate": 1.8370975532387237e-05, "loss": 0.8218, "step": 1271 }, { "epoch": 0.20891845282089186, "grad_norm": 1.7980280338471952, "learning_rate": 1.8368063899566263e-05, "loss": 0.7871, "step": 1272 }, { "epoch": 0.20908269688757494, "grad_norm": 2.0580494347547202, "learning_rate": 1.8365149898182403e-05, "loss": 0.8995, "step": 1273 }, { "epoch": 0.20924694095425803, "grad_norm": 1.7357982026797374, "learning_rate": 1.8362233529060464e-05, "loss": 0.835, "step": 1274 }, { "epoch": 0.2094111850209411, "grad_norm": 1.3905552456533172, "learning_rate": 1.8359314793025914e-05, "loss": 0.9007, "step": 1275 }, { "epoch": 0.20957542908762422, "grad_norm": 1.8807448389764505, "learning_rate": 1.8356393690904904e-05, "loss": 0.8645, "step": 1276 }, { "epoch": 0.2097396731543073, "grad_norm": 5.445388839517045, "learning_rate": 1.8353470223524237e-05, "loss": 0.8576, "step": 1277 }, { "epoch": 0.2099039172209904, "grad_norm": 1.91102228453434, "learning_rate": 1.8350544391711396e-05, "loss": 0.834, "step": 1278 }, { "epoch": 0.21006816128767347, "grad_norm": 2.0370394454370047, "learning_rate": 1.8347616196294536e-05, "loss": 0.8483, "step": 1279 }, { "epoch": 0.21023240535435658, "grad_norm": 1.9156132137621535, "learning_rate": 1.8344685638102472e-05, "loss": 0.8233, "step": 1280 }, { "epoch": 0.21039664942103967, "grad_norm": 2.853115472719562, "learning_rate": 1.8341752717964696e-05, "loss": 0.8138, "step": 1281 }, { "epoch": 0.21056089348772275, "grad_norm": 0.8657986097774432, "learning_rate": 1.8338817436711358e-05, "loss": 0.3532, "step": 1282 }, { "epoch": 0.21072513755440583, "grad_norm": 2.3887774184759887, "learning_rate": 1.833587979517329e-05, "loss": 0.9068, "step": 1283 }, { "epoch": 0.21088938162108894, "grad_norm": 10.413598226212777, "learning_rate": 1.8332939794181986e-05, "loss": 0.8448, "step": 1284 }, { "epoch": 0.21105362568777203, "grad_norm": 2.6679875456870916, "learning_rate": 1.83299974345696e-05, "loss": 0.8025, "step": 1285 }, { "epoch": 0.2112178697544551, "grad_norm": 1.865469186050698, "learning_rate": 1.832705271716897e-05, "loss": 0.8245, "step": 1286 }, { "epoch": 0.21138211382113822, "grad_norm": 1.7908815904315225, "learning_rate": 1.832410564281358e-05, "loss": 0.7202, "step": 1287 }, { "epoch": 0.2115463578878213, "grad_norm": 1.8688199309962974, "learning_rate": 1.8321156212337604e-05, "loss": 0.9024, "step": 1288 }, { "epoch": 0.2117106019545044, "grad_norm": 1.7555080874482094, "learning_rate": 1.8318204426575873e-05, "loss": 0.7988, "step": 1289 }, { "epoch": 0.21187484602118747, "grad_norm": 1.8928499496931426, "learning_rate": 1.831525028636387e-05, "loss": 0.8629, "step": 1290 }, { "epoch": 0.21203909008787059, "grad_norm": 2.6536717230616977, "learning_rate": 1.8312293792537773e-05, "loss": 0.8396, "step": 1291 }, { "epoch": 0.21220333415455367, "grad_norm": 1.6009430404399534, "learning_rate": 1.8309334945934402e-05, "loss": 0.8484, "step": 1292 }, { "epoch": 0.21236757822123675, "grad_norm": 2.2090133502722, "learning_rate": 1.830637374739126e-05, "loss": 0.7647, "step": 1293 }, { "epoch": 0.21253182228791984, "grad_norm": 3.2683958192817757, "learning_rate": 1.83034101977465e-05, "loss": 0.8869, "step": 1294 }, { "epoch": 0.21269606635460295, "grad_norm": 1.7667082945854522, "learning_rate": 1.8300444297838955e-05, "loss": 0.8311, "step": 1295 }, { "epoch": 0.21286031042128603, "grad_norm": 1.9773794053541263, "learning_rate": 1.8297476048508113e-05, "loss": 0.7649, "step": 1296 }, { "epoch": 0.21302455448796911, "grad_norm": 1.944893931714806, "learning_rate": 1.829450545059413e-05, "loss": 0.8611, "step": 1297 }, { "epoch": 0.21318879855465223, "grad_norm": 1.797889374770983, "learning_rate": 1.829153250493783e-05, "loss": 0.7939, "step": 1298 }, { "epoch": 0.2133530426213353, "grad_norm": 1.9372462363288334, "learning_rate": 1.8288557212380703e-05, "loss": 0.7697, "step": 1299 }, { "epoch": 0.2135172866880184, "grad_norm": 1.5072412076868091, "learning_rate": 1.828557957376489e-05, "loss": 0.8078, "step": 1300 }, { "epoch": 0.21368153075470148, "grad_norm": 2.5026117484614456, "learning_rate": 1.8282599589933214e-05, "loss": 0.8073, "step": 1301 }, { "epoch": 0.2138457748213846, "grad_norm": 3.569977480684114, "learning_rate": 1.8279617261729142e-05, "loss": 0.8282, "step": 1302 }, { "epoch": 0.21401001888806767, "grad_norm": 1.6734073168104546, "learning_rate": 1.827663258999683e-05, "loss": 0.8947, "step": 1303 }, { "epoch": 0.21417426295475075, "grad_norm": 2.308971608132171, "learning_rate": 1.827364557558107e-05, "loss": 0.857, "step": 1304 }, { "epoch": 0.21433850702143384, "grad_norm": 2.3766232746799276, "learning_rate": 1.827065621932734e-05, "loss": 0.8102, "step": 1305 }, { "epoch": 0.21450275108811695, "grad_norm": 1.7244670976926975, "learning_rate": 1.8267664522081767e-05, "loss": 0.8177, "step": 1306 }, { "epoch": 0.21466699515480003, "grad_norm": 2.5188155457623407, "learning_rate": 1.8264670484691144e-05, "loss": 0.8472, "step": 1307 }, { "epoch": 0.21483123922148312, "grad_norm": 1.6474535122194827, "learning_rate": 1.8261674108002925e-05, "loss": 0.8377, "step": 1308 }, { "epoch": 0.21499548328816623, "grad_norm": 1.8507850396824563, "learning_rate": 1.8258675392865235e-05, "loss": 0.7993, "step": 1309 }, { "epoch": 0.2151597273548493, "grad_norm": 0.8051682228024982, "learning_rate": 1.8255674340126847e-05, "loss": 0.3436, "step": 1310 }, { "epoch": 0.2153239714215324, "grad_norm": 1.754789644136224, "learning_rate": 1.8252670950637206e-05, "loss": 0.8726, "step": 1311 }, { "epoch": 0.21548821548821548, "grad_norm": 1.804078122754082, "learning_rate": 1.8249665225246417e-05, "loss": 0.8219, "step": 1312 }, { "epoch": 0.2156524595548986, "grad_norm": 2.510098306518109, "learning_rate": 1.824665716480524e-05, "loss": 0.9528, "step": 1313 }, { "epoch": 0.21581670362158167, "grad_norm": 1.8612818908658277, "learning_rate": 1.82436467701651e-05, "loss": 0.8062, "step": 1314 }, { "epoch": 0.21598094768826476, "grad_norm": 1.7603673527945864, "learning_rate": 1.8240634042178086e-05, "loss": 0.8753, "step": 1315 }, { "epoch": 0.21614519175494784, "grad_norm": 1.6111296733189402, "learning_rate": 1.8237618981696944e-05, "loss": 0.8494, "step": 1316 }, { "epoch": 0.21630943582163095, "grad_norm": 1.7179826887569458, "learning_rate": 1.823460158957508e-05, "loss": 0.8266, "step": 1317 }, { "epoch": 0.21647367988831404, "grad_norm": 1.539969875729237, "learning_rate": 1.823158186666656e-05, "loss": 0.8218, "step": 1318 }, { "epoch": 0.21663792395499712, "grad_norm": 7.502597633208521, "learning_rate": 1.8228559813826106e-05, "loss": 0.847, "step": 1319 }, { "epoch": 0.21680216802168023, "grad_norm": 8.005985812647, "learning_rate": 1.8225535431909113e-05, "loss": 0.8877, "step": 1320 }, { "epoch": 0.21696641208836331, "grad_norm": 1.4166387282693942, "learning_rate": 1.822250872177162e-05, "loss": 0.8629, "step": 1321 }, { "epoch": 0.2171306561550464, "grad_norm": 4.656555314362366, "learning_rate": 1.821947968427033e-05, "loss": 0.898, "step": 1322 }, { "epoch": 0.21729490022172948, "grad_norm": 1.8605206286156766, "learning_rate": 1.821644832026261e-05, "loss": 0.8643, "step": 1323 }, { "epoch": 0.2174591442884126, "grad_norm": 1.4607143288521547, "learning_rate": 1.821341463060648e-05, "loss": 0.8585, "step": 1324 }, { "epoch": 0.21762338835509568, "grad_norm": 1.8589834954448479, "learning_rate": 1.8210378616160617e-05, "loss": 0.7694, "step": 1325 }, { "epoch": 0.21778763242177876, "grad_norm": 1.8568756385774503, "learning_rate": 1.8207340277784357e-05, "loss": 0.8569, "step": 1326 }, { "epoch": 0.21795187648846184, "grad_norm": 2.178882158910963, "learning_rate": 1.82042996163377e-05, "loss": 0.842, "step": 1327 }, { "epoch": 0.21811612055514495, "grad_norm": 0.8263199175141653, "learning_rate": 1.82012566326813e-05, "loss": 0.3849, "step": 1328 }, { "epoch": 0.21828036462182804, "grad_norm": 2.206255380181002, "learning_rate": 1.819821132767646e-05, "loss": 0.8252, "step": 1329 }, { "epoch": 0.21844460868851112, "grad_norm": 12.115637964042032, "learning_rate": 1.8195163702185153e-05, "loss": 0.8679, "step": 1330 }, { "epoch": 0.2186088527551942, "grad_norm": 2.218326899349202, "learning_rate": 1.819211375707e-05, "loss": 0.8879, "step": 1331 }, { "epoch": 0.21877309682187732, "grad_norm": 1.6034289511656559, "learning_rate": 1.8189061493194283e-05, "loss": 0.8298, "step": 1332 }, { "epoch": 0.2189373408885604, "grad_norm": 1.9835066752065842, "learning_rate": 1.8186006911421937e-05, "loss": 0.8459, "step": 1333 }, { "epoch": 0.21910158495524348, "grad_norm": 1.7102569302494774, "learning_rate": 1.818295001261756e-05, "loss": 0.8411, "step": 1334 }, { "epoch": 0.2192658290219266, "grad_norm": 2.936636851979521, "learning_rate": 1.8179890797646398e-05, "loss": 0.8455, "step": 1335 }, { "epoch": 0.21943007308860968, "grad_norm": 2.1921157908330313, "learning_rate": 1.817682926737435e-05, "loss": 0.6944, "step": 1336 }, { "epoch": 0.21959431715529276, "grad_norm": 5.415707404464676, "learning_rate": 1.8173765422667987e-05, "loss": 0.7714, "step": 1337 }, { "epoch": 0.21975856122197585, "grad_norm": 1.9578389081674352, "learning_rate": 1.817069926439451e-05, "loss": 0.7725, "step": 1338 }, { "epoch": 0.21992280528865896, "grad_norm": 1.855851519830354, "learning_rate": 1.81676307934218e-05, "loss": 0.8304, "step": 1339 }, { "epoch": 0.22008704935534204, "grad_norm": 2.1003831006998372, "learning_rate": 1.8164560010618377e-05, "loss": 0.8118, "step": 1340 }, { "epoch": 0.22025129342202512, "grad_norm": 1.9241620447452226, "learning_rate": 1.816148691685342e-05, "loss": 0.9058, "step": 1341 }, { "epoch": 0.2204155374887082, "grad_norm": 1.9816455019399697, "learning_rate": 1.815841151299676e-05, "loss": 0.8408, "step": 1342 }, { "epoch": 0.22057978155539132, "grad_norm": 0.7821128645223024, "learning_rate": 1.8155333799918883e-05, "loss": 0.3663, "step": 1343 }, { "epoch": 0.2207440256220744, "grad_norm": 1.7666314622299413, "learning_rate": 1.8152253778490933e-05, "loss": 0.882, "step": 1344 }, { "epoch": 0.22090826968875749, "grad_norm": 1.9531649639126352, "learning_rate": 1.8149171449584705e-05, "loss": 0.8414, "step": 1345 }, { "epoch": 0.2210725137554406, "grad_norm": 2.0201213638804183, "learning_rate": 1.814608681407264e-05, "loss": 0.8914, "step": 1346 }, { "epoch": 0.22123675782212368, "grad_norm": 1.7615577327223526, "learning_rate": 1.814299987282784e-05, "loss": 0.8111, "step": 1347 }, { "epoch": 0.22140100188880676, "grad_norm": 1.9792562329424337, "learning_rate": 1.8139910626724058e-05, "loss": 0.8117, "step": 1348 }, { "epoch": 0.22156524595548985, "grad_norm": 2.2546867851960934, "learning_rate": 1.8136819076635696e-05, "loss": 0.8687, "step": 1349 }, { "epoch": 0.22172949002217296, "grad_norm": 1.6762954102129624, "learning_rate": 1.8133725223437815e-05, "loss": 0.7956, "step": 1350 }, { "epoch": 0.22189373408885604, "grad_norm": 1.5648605606204207, "learning_rate": 1.813062906800612e-05, "loss": 0.862, "step": 1351 }, { "epoch": 0.22205797815553913, "grad_norm": 2.738564581995891, "learning_rate": 1.8127530611216973e-05, "loss": 0.7441, "step": 1352 }, { "epoch": 0.2222222222222222, "grad_norm": 1.5120608709196597, "learning_rate": 1.8124429853947387e-05, "loss": 0.7867, "step": 1353 }, { "epoch": 0.22238646628890532, "grad_norm": 3.99429869746237, "learning_rate": 1.8121326797075022e-05, "loss": 0.8167, "step": 1354 }, { "epoch": 0.2225507103555884, "grad_norm": 1.9366128066822883, "learning_rate": 1.811822144147819e-05, "loss": 0.8638, "step": 1355 }, { "epoch": 0.2227149544222715, "grad_norm": 3.6884916965292804, "learning_rate": 1.8115113788035863e-05, "loss": 0.8863, "step": 1356 }, { "epoch": 0.2228791984889546, "grad_norm": 2.111167697845807, "learning_rate": 1.8112003837627646e-05, "loss": 0.7923, "step": 1357 }, { "epoch": 0.22304344255563768, "grad_norm": 1.9198618369844014, "learning_rate": 1.8108891591133812e-05, "loss": 0.8055, "step": 1358 }, { "epoch": 0.22320768662232077, "grad_norm": 2.765418521631962, "learning_rate": 1.810577704943527e-05, "loss": 0.8936, "step": 1359 }, { "epoch": 0.22337193068900385, "grad_norm": 2.727825235360716, "learning_rate": 1.8102660213413593e-05, "loss": 0.7972, "step": 1360 }, { "epoch": 0.22353617475568696, "grad_norm": 2.2799767283536796, "learning_rate": 1.8099541083950988e-05, "loss": 0.8934, "step": 1361 }, { "epoch": 0.22370041882237005, "grad_norm": 2.1832782740007874, "learning_rate": 1.8096419661930313e-05, "loss": 0.7927, "step": 1362 }, { "epoch": 0.22386466288905313, "grad_norm": 1.803568887250418, "learning_rate": 1.809329594823509e-05, "loss": 0.8658, "step": 1363 }, { "epoch": 0.2240289069557362, "grad_norm": 1.7246868176202523, "learning_rate": 1.8090169943749477e-05, "loss": 0.897, "step": 1364 }, { "epoch": 0.22419315102241932, "grad_norm": 2.267953258387611, "learning_rate": 1.808704164935828e-05, "loss": 0.9014, "step": 1365 }, { "epoch": 0.2243573950891024, "grad_norm": 2.543415685898095, "learning_rate": 1.8083911065946958e-05, "loss": 0.8915, "step": 1366 }, { "epoch": 0.2245216391557855, "grad_norm": 1.785208802381979, "learning_rate": 1.8080778194401615e-05, "loss": 0.9596, "step": 1367 }, { "epoch": 0.2246858832224686, "grad_norm": 2.600977831192266, "learning_rate": 1.8077643035609006e-05, "loss": 0.892, "step": 1368 }, { "epoch": 0.22485012728915169, "grad_norm": 1.8402790702167562, "learning_rate": 1.807450559045653e-05, "loss": 0.7784, "step": 1369 }, { "epoch": 0.22501437135583477, "grad_norm": 9.974648066965432, "learning_rate": 1.807136585983223e-05, "loss": 0.7898, "step": 1370 }, { "epoch": 0.22517861542251785, "grad_norm": 1.4518686374314607, "learning_rate": 1.8068223844624806e-05, "loss": 0.8372, "step": 1371 }, { "epoch": 0.22534285948920096, "grad_norm": 1.6733645737454927, "learning_rate": 1.80650795457236e-05, "loss": 0.8388, "step": 1372 }, { "epoch": 0.22550710355588405, "grad_norm": 2.176081197669796, "learning_rate": 1.806193296401859e-05, "loss": 0.8385, "step": 1373 }, { "epoch": 0.22567134762256713, "grad_norm": 2.3138123003809805, "learning_rate": 1.8058784100400418e-05, "loss": 0.8497, "step": 1374 }, { "epoch": 0.22583559168925021, "grad_norm": 0.969618384122468, "learning_rate": 1.8055632955760364e-05, "loss": 0.3528, "step": 1375 }, { "epoch": 0.22599983575593333, "grad_norm": 3.1140732927290355, "learning_rate": 1.8052479530990348e-05, "loss": 0.88, "step": 1376 }, { "epoch": 0.2261640798226164, "grad_norm": 2.068356919853371, "learning_rate": 1.8049323826982942e-05, "loss": 0.8614, "step": 1377 }, { "epoch": 0.2263283238892995, "grad_norm": 2.018561676865218, "learning_rate": 1.804616584463136e-05, "loss": 0.8563, "step": 1378 }, { "epoch": 0.22649256795598258, "grad_norm": 1.7574221990299785, "learning_rate": 1.804300558482946e-05, "loss": 0.7427, "step": 1379 }, { "epoch": 0.2266568120226657, "grad_norm": 1.8271174458539534, "learning_rate": 1.8039843048471756e-05, "loss": 0.8009, "step": 1380 }, { "epoch": 0.22682105608934877, "grad_norm": 1.9165676697764622, "learning_rate": 1.8036678236453387e-05, "loss": 0.8183, "step": 1381 }, { "epoch": 0.22698530015603186, "grad_norm": 2.2266202579875896, "learning_rate": 1.8033511149670152e-05, "loss": 0.8088, "step": 1382 }, { "epoch": 0.22714954422271497, "grad_norm": 2.2533630213350264, "learning_rate": 1.803034178901849e-05, "loss": 0.883, "step": 1383 }, { "epoch": 0.22731378828939805, "grad_norm": 2.2346598131111537, "learning_rate": 1.8027170155395476e-05, "loss": 0.8424, "step": 1384 }, { "epoch": 0.22747803235608113, "grad_norm": 2.3205498388534966, "learning_rate": 1.8023996249698836e-05, "loss": 0.8356, "step": 1385 }, { "epoch": 0.22764227642276422, "grad_norm": 1.7940442649057147, "learning_rate": 1.802082007282694e-05, "loss": 0.8817, "step": 1386 }, { "epoch": 0.22780652048944733, "grad_norm": 1.8778670481919042, "learning_rate": 1.8017641625678797e-05, "loss": 0.8777, "step": 1387 }, { "epoch": 0.2279707645561304, "grad_norm": 3.251034634276481, "learning_rate": 1.8014460909154058e-05, "loss": 0.8294, "step": 1388 }, { "epoch": 0.2281350086228135, "grad_norm": 2.4683258423091865, "learning_rate": 1.8011277924153022e-05, "loss": 0.8259, "step": 1389 }, { "epoch": 0.22829925268949658, "grad_norm": 1.7512120366229638, "learning_rate": 1.8008092671576624e-05, "loss": 0.8237, "step": 1390 }, { "epoch": 0.2284634967561797, "grad_norm": 2.3623448961744518, "learning_rate": 1.8004905152326443e-05, "loss": 0.8408, "step": 1391 }, { "epoch": 0.22862774082286277, "grad_norm": 3.2839131066487663, "learning_rate": 1.8001715367304696e-05, "loss": 0.7917, "step": 1392 }, { "epoch": 0.22879198488954586, "grad_norm": 2.0144774144692494, "learning_rate": 1.799852331741425e-05, "loss": 0.7845, "step": 1393 }, { "epoch": 0.22895622895622897, "grad_norm": 1.6926320716573375, "learning_rate": 1.799532900355861e-05, "loss": 0.7892, "step": 1394 }, { "epoch": 0.22912047302291205, "grad_norm": 1.8351009279473622, "learning_rate": 1.799213242664191e-05, "loss": 0.7971, "step": 1395 }, { "epoch": 0.22928471708959514, "grad_norm": 1.83152032899268, "learning_rate": 1.7988933587568948e-05, "loss": 0.7916, "step": 1396 }, { "epoch": 0.22944896115627822, "grad_norm": 2.1901973746719343, "learning_rate": 1.7985732487245132e-05, "loss": 0.8235, "step": 1397 }, { "epoch": 0.22961320522296133, "grad_norm": 2.891274017196177, "learning_rate": 1.7982529126576543e-05, "loss": 0.8111, "step": 1398 }, { "epoch": 0.22977744928964441, "grad_norm": 1.9966237298501652, "learning_rate": 1.7979323506469878e-05, "loss": 0.8393, "step": 1399 }, { "epoch": 0.2299416933563275, "grad_norm": 1.9823874986271464, "learning_rate": 1.797611562783248e-05, "loss": 0.7989, "step": 1400 }, { "epoch": 0.23010593742301058, "grad_norm": 2.215483377011996, "learning_rate": 1.7972905491572334e-05, "loss": 0.8432, "step": 1401 }, { "epoch": 0.2302701814896937, "grad_norm": 2.599023600744591, "learning_rate": 1.7969693098598063e-05, "loss": 0.8642, "step": 1402 }, { "epoch": 0.23043442555637678, "grad_norm": 1.0338254482681357, "learning_rate": 1.7966478449818925e-05, "loss": 0.3753, "step": 1403 }, { "epoch": 0.23059866962305986, "grad_norm": 1.8093575455091124, "learning_rate": 1.7963261546144823e-05, "loss": 0.8659, "step": 1404 }, { "epoch": 0.23076291368974297, "grad_norm": 2.3358206134382025, "learning_rate": 1.7960042388486293e-05, "loss": 0.8519, "step": 1405 }, { "epoch": 0.23092715775642605, "grad_norm": 2.866638207518902, "learning_rate": 1.795682097775451e-05, "loss": 0.7645, "step": 1406 }, { "epoch": 0.23109140182310914, "grad_norm": 1.4240445594639597, "learning_rate": 1.795359731486129e-05, "loss": 0.845, "step": 1407 }, { "epoch": 0.23125564588979222, "grad_norm": 1.7994707513700516, "learning_rate": 1.7950371400719087e-05, "loss": 0.7983, "step": 1408 }, { "epoch": 0.23141988995647533, "grad_norm": 1.7669531696642469, "learning_rate": 1.7947143236240983e-05, "loss": 0.8084, "step": 1409 }, { "epoch": 0.23158413402315842, "grad_norm": 3.127679671918903, "learning_rate": 1.7943912822340702e-05, "loss": 0.7815, "step": 1410 }, { "epoch": 0.2317483780898415, "grad_norm": 1.530569204017412, "learning_rate": 1.7940680159932612e-05, "loss": 0.8084, "step": 1411 }, { "epoch": 0.23191262215652458, "grad_norm": 1.5886901951153218, "learning_rate": 1.7937445249931706e-05, "loss": 0.8569, "step": 1412 }, { "epoch": 0.2320768662232077, "grad_norm": 4.4524020035713745, "learning_rate": 1.7934208093253625e-05, "loss": 0.8155, "step": 1413 }, { "epoch": 0.23224111028989078, "grad_norm": 1.6751246298861062, "learning_rate": 1.7930968690814634e-05, "loss": 0.7802, "step": 1414 }, { "epoch": 0.23240535435657386, "grad_norm": 1.80231554785924, "learning_rate": 1.7927727043531637e-05, "loss": 0.8976, "step": 1415 }, { "epoch": 0.23256959842325697, "grad_norm": 1.5249671157801423, "learning_rate": 1.7924483152322183e-05, "loss": 0.8332, "step": 1416 }, { "epoch": 0.23273384248994006, "grad_norm": 2.0068610928109365, "learning_rate": 1.7921237018104443e-05, "loss": 0.7274, "step": 1417 }, { "epoch": 0.23289808655662314, "grad_norm": 2.03792105805746, "learning_rate": 1.7917988641797227e-05, "loss": 0.8446, "step": 1418 }, { "epoch": 0.23306233062330622, "grad_norm": 2.6896359333459876, "learning_rate": 1.791473802431999e-05, "loss": 0.7646, "step": 1419 }, { "epoch": 0.23322657468998934, "grad_norm": 1.9997844057703817, "learning_rate": 1.7911485166592802e-05, "loss": 0.7463, "step": 1420 }, { "epoch": 0.23339081875667242, "grad_norm": 4.474189262672801, "learning_rate": 1.790823006953638e-05, "loss": 0.8564, "step": 1421 }, { "epoch": 0.2335550628233555, "grad_norm": 2.352371789523635, "learning_rate": 1.790497273407208e-05, "loss": 0.8611, "step": 1422 }, { "epoch": 0.2337193068900386, "grad_norm": 1.6897482087350053, "learning_rate": 1.7901713161121873e-05, "loss": 0.825, "step": 1423 }, { "epoch": 0.2338835509567217, "grad_norm": 1.7464945239949266, "learning_rate": 1.7898451351608385e-05, "loss": 0.9363, "step": 1424 }, { "epoch": 0.23404779502340478, "grad_norm": 1.9874617197028275, "learning_rate": 1.7895187306454852e-05, "loss": 0.8785, "step": 1425 }, { "epoch": 0.23421203909008786, "grad_norm": 2.4945951128924695, "learning_rate": 1.7891921026585167e-05, "loss": 0.8514, "step": 1426 }, { "epoch": 0.23437628315677095, "grad_norm": 1.7682933851054556, "learning_rate": 1.7888652512923836e-05, "loss": 0.8456, "step": 1427 }, { "epoch": 0.23454052722345406, "grad_norm": 3.0024902745497988, "learning_rate": 1.7885381766396008e-05, "loss": 0.7557, "step": 1428 }, { "epoch": 0.23470477129013714, "grad_norm": 2.133692834031926, "learning_rate": 1.788210878792746e-05, "loss": 0.7504, "step": 1429 }, { "epoch": 0.23486901535682023, "grad_norm": 1.884087097222835, "learning_rate": 1.7878833578444603e-05, "loss": 0.7465, "step": 1430 }, { "epoch": 0.23503325942350334, "grad_norm": 1.7677396790279123, "learning_rate": 1.787555613887448e-05, "loss": 0.8258, "step": 1431 }, { "epoch": 0.23519750349018642, "grad_norm": 2.121274185568481, "learning_rate": 1.7872276470144756e-05, "loss": 0.8208, "step": 1432 }, { "epoch": 0.2353617475568695, "grad_norm": 1.6987455488812568, "learning_rate": 1.786899457318374e-05, "loss": 0.8158, "step": 1433 }, { "epoch": 0.2355259916235526, "grad_norm": 3.5195443924021665, "learning_rate": 1.7865710448920365e-05, "loss": 0.8657, "step": 1434 }, { "epoch": 0.2356902356902357, "grad_norm": 1.9886904718735174, "learning_rate": 1.7862424098284197e-05, "loss": 0.8417, "step": 1435 }, { "epoch": 0.23585447975691878, "grad_norm": 2.036390213932355, "learning_rate": 1.7859135522205426e-05, "loss": 0.8633, "step": 1436 }, { "epoch": 0.23601872382360187, "grad_norm": 2.0357985306692354, "learning_rate": 1.7855844721614883e-05, "loss": 0.8556, "step": 1437 }, { "epoch": 0.23618296789028495, "grad_norm": 1.78279857080973, "learning_rate": 1.7852551697444017e-05, "loss": 0.7792, "step": 1438 }, { "epoch": 0.23634721195696806, "grad_norm": 2.9603696566188353, "learning_rate": 1.7849256450624914e-05, "loss": 0.8359, "step": 1439 }, { "epoch": 0.23651145602365115, "grad_norm": 0.9034742585563434, "learning_rate": 1.7845958982090287e-05, "loss": 0.3932, "step": 1440 }, { "epoch": 0.23667570009033423, "grad_norm": 1.6565311815059414, "learning_rate": 1.7842659292773474e-05, "loss": 0.8833, "step": 1441 }, { "epoch": 0.23683994415701734, "grad_norm": 2.7491271560565522, "learning_rate": 1.783935738360845e-05, "loss": 0.8119, "step": 1442 }, { "epoch": 0.23700418822370042, "grad_norm": 1.6658684305848024, "learning_rate": 1.783605325552981e-05, "loss": 0.8534, "step": 1443 }, { "epoch": 0.2371684322903835, "grad_norm": 2.097501122864875, "learning_rate": 1.7832746909472783e-05, "loss": 0.7476, "step": 1444 }, { "epoch": 0.2373326763570666, "grad_norm": 1.8001748616455322, "learning_rate": 1.782943834637322e-05, "loss": 0.8499, "step": 1445 }, { "epoch": 0.2374969204237497, "grad_norm": 2.141722661122658, "learning_rate": 1.7826127567167602e-05, "loss": 0.7738, "step": 1446 }, { "epoch": 0.23766116449043279, "grad_norm": 2.710887077490289, "learning_rate": 1.7822814572793047e-05, "loss": 0.8237, "step": 1447 }, { "epoch": 0.23782540855711587, "grad_norm": 2.067796743935287, "learning_rate": 1.7819499364187282e-05, "loss": 0.7858, "step": 1448 }, { "epoch": 0.23798965262379895, "grad_norm": 1.8610582763934436, "learning_rate": 1.7816181942288672e-05, "loss": 0.7823, "step": 1449 }, { "epoch": 0.23815389669048206, "grad_norm": 1.8335442969179303, "learning_rate": 1.781286230803621e-05, "loss": 0.8812, "step": 1450 }, { "epoch": 0.23831814075716515, "grad_norm": 1.5282925403887593, "learning_rate": 1.7809540462369505e-05, "loss": 0.7398, "step": 1451 }, { "epoch": 0.23848238482384823, "grad_norm": 2.0324464998669183, "learning_rate": 1.7806216406228805e-05, "loss": 0.8819, "step": 1452 }, { "epoch": 0.23864662889053134, "grad_norm": 1.5318258639748017, "learning_rate": 1.780289014055497e-05, "loss": 0.8772, "step": 1453 }, { "epoch": 0.23881087295721443, "grad_norm": 2.1496718161750614, "learning_rate": 1.77995616662895e-05, "loss": 0.9141, "step": 1454 }, { "epoch": 0.2389751170238975, "grad_norm": 2.469599688436256, "learning_rate": 1.779623098437451e-05, "loss": 0.8783, "step": 1455 }, { "epoch": 0.2391393610905806, "grad_norm": 1.8883906767904386, "learning_rate": 1.779289809575274e-05, "loss": 0.7563, "step": 1456 }, { "epoch": 0.2393036051572637, "grad_norm": 1.9485943271504742, "learning_rate": 1.7789563001367557e-05, "loss": 0.8545, "step": 1457 }, { "epoch": 0.2394678492239468, "grad_norm": 2.1153987104346528, "learning_rate": 1.7786225702162955e-05, "loss": 0.829, "step": 1458 }, { "epoch": 0.23963209329062987, "grad_norm": 1.7292968371122874, "learning_rate": 1.778288619908355e-05, "loss": 0.795, "step": 1459 }, { "epoch": 0.23979633735731296, "grad_norm": 3.3963242038671413, "learning_rate": 1.777954449307458e-05, "loss": 0.767, "step": 1460 }, { "epoch": 0.23996058142399607, "grad_norm": 2.0423601227443386, "learning_rate": 1.777620058508191e-05, "loss": 0.8554, "step": 1461 }, { "epoch": 0.24012482549067915, "grad_norm": 1.8632619926702145, "learning_rate": 1.7772854476052023e-05, "loss": 0.856, "step": 1462 }, { "epoch": 0.24028906955736223, "grad_norm": 1.7853466365344381, "learning_rate": 1.7769506166932026e-05, "loss": 0.8591, "step": 1463 }, { "epoch": 0.24045331362404535, "grad_norm": 1.8539953497885622, "learning_rate": 1.7766155658669655e-05, "loss": 0.904, "step": 1464 }, { "epoch": 0.24061755769072843, "grad_norm": 1.9298682544675676, "learning_rate": 1.7762802952213264e-05, "loss": 0.8067, "step": 1465 }, { "epoch": 0.2407818017574115, "grad_norm": 2.063780204678838, "learning_rate": 1.7759448048511833e-05, "loss": 0.8171, "step": 1466 }, { "epoch": 0.2409460458240946, "grad_norm": 1.8794451901681373, "learning_rate": 1.775609094851495e-05, "loss": 0.8544, "step": 1467 }, { "epoch": 0.2411102898907777, "grad_norm": 1.6499805469670068, "learning_rate": 1.7752731653172847e-05, "loss": 0.8468, "step": 1468 }, { "epoch": 0.2412745339574608, "grad_norm": 1.5224389692645393, "learning_rate": 1.7749370163436353e-05, "loss": 0.7983, "step": 1469 }, { "epoch": 0.24143877802414387, "grad_norm": 2.0793282782866136, "learning_rate": 1.7746006480256943e-05, "loss": 0.8947, "step": 1470 }, { "epoch": 0.24160302209082696, "grad_norm": 2.099349484593602, "learning_rate": 1.7742640604586694e-05, "loss": 0.7757, "step": 1471 }, { "epoch": 0.24176726615751007, "grad_norm": 1.7972746041369163, "learning_rate": 1.773927253737831e-05, "loss": 0.8717, "step": 1472 }, { "epoch": 0.24193151022419315, "grad_norm": 1.877826436663934, "learning_rate": 1.7735902279585118e-05, "loss": 0.8924, "step": 1473 }, { "epoch": 0.24209575429087624, "grad_norm": 0.8445263974348377, "learning_rate": 1.7732529832161057e-05, "loss": 0.3659, "step": 1474 }, { "epoch": 0.24225999835755932, "grad_norm": 1.7695388242623673, "learning_rate": 1.7729155196060697e-05, "loss": 0.8139, "step": 1475 }, { "epoch": 0.24242424242424243, "grad_norm": 2.332128200035578, "learning_rate": 1.772577837223922e-05, "loss": 0.8409, "step": 1476 }, { "epoch": 0.24258848649092551, "grad_norm": 2.6065018797068467, "learning_rate": 1.772239936165243e-05, "loss": 0.8195, "step": 1477 }, { "epoch": 0.2427527305576086, "grad_norm": 1.602683116927254, "learning_rate": 1.7719018165256745e-05, "loss": 0.8593, "step": 1478 }, { "epoch": 0.2429169746242917, "grad_norm": 1.9602132520787736, "learning_rate": 1.7715634784009207e-05, "loss": 0.8433, "step": 1479 }, { "epoch": 0.2430812186909748, "grad_norm": 6.035387844465864, "learning_rate": 1.7712249218867476e-05, "loss": 0.7891, "step": 1480 }, { "epoch": 0.24324546275765788, "grad_norm": 1.911573584907382, "learning_rate": 1.770886147078983e-05, "loss": 0.8217, "step": 1481 }, { "epoch": 0.24340970682434096, "grad_norm": 2.20803886477718, "learning_rate": 1.770547154073516e-05, "loss": 0.8441, "step": 1482 }, { "epoch": 0.24357395089102407, "grad_norm": 1.7329989347349282, "learning_rate": 1.7702079429662986e-05, "loss": 0.7907, "step": 1483 }, { "epoch": 0.24373819495770715, "grad_norm": 0.7221688345943101, "learning_rate": 1.769868513853343e-05, "loss": 0.3761, "step": 1484 }, { "epoch": 0.24390243902439024, "grad_norm": 1.556563444908011, "learning_rate": 1.769528866830724e-05, "loss": 0.8752, "step": 1485 }, { "epoch": 0.24406668309107332, "grad_norm": 1.952723023881889, "learning_rate": 1.7691890019945785e-05, "loss": 0.7422, "step": 1486 }, { "epoch": 0.24423092715775643, "grad_norm": 2.000465412895329, "learning_rate": 1.768848919441104e-05, "loss": 0.893, "step": 1487 }, { "epoch": 0.24439517122443952, "grad_norm": 1.8230044838013018, "learning_rate": 1.7685086192665605e-05, "loss": 0.7661, "step": 1488 }, { "epoch": 0.2445594152911226, "grad_norm": 1.6428317692510026, "learning_rate": 1.7681681015672693e-05, "loss": 0.6999, "step": 1489 }, { "epoch": 0.2447236593578057, "grad_norm": 2.2305470793331175, "learning_rate": 1.767827366439613e-05, "loss": 0.8492, "step": 1490 }, { "epoch": 0.2448879034244888, "grad_norm": 3.504712649183638, "learning_rate": 1.7674864139800356e-05, "loss": 0.8417, "step": 1491 }, { "epoch": 0.24505214749117188, "grad_norm": 0.6893095463911403, "learning_rate": 1.7671452442850438e-05, "loss": 0.3431, "step": 1492 }, { "epoch": 0.24521639155785496, "grad_norm": 1.7652770421627169, "learning_rate": 1.7668038574512045e-05, "loss": 0.8351, "step": 1493 }, { "epoch": 0.24538063562453807, "grad_norm": 2.976644639501337, "learning_rate": 1.7664622535751463e-05, "loss": 0.7664, "step": 1494 }, { "epoch": 0.24554487969122116, "grad_norm": 1.8073129182042318, "learning_rate": 1.76612043275356e-05, "loss": 0.7438, "step": 1495 }, { "epoch": 0.24570912375790424, "grad_norm": 1.4171469927824731, "learning_rate": 1.7657783950831965e-05, "loss": 0.8597, "step": 1496 }, { "epoch": 0.24587336782458732, "grad_norm": 2.0234338853526497, "learning_rate": 1.76543614066087e-05, "loss": 0.8453, "step": 1497 }, { "epoch": 0.24603761189127044, "grad_norm": 2.0741562201641495, "learning_rate": 1.7650936695834536e-05, "loss": 0.9394, "step": 1498 }, { "epoch": 0.24620185595795352, "grad_norm": 0.7413306144302982, "learning_rate": 1.764750981947884e-05, "loss": 0.3876, "step": 1499 }, { "epoch": 0.2463661000246366, "grad_norm": 4.529377133659213, "learning_rate": 1.764408077851158e-05, "loss": 0.8982, "step": 1500 }, { "epoch": 0.24653034409131971, "grad_norm": 1.5180422563053921, "learning_rate": 1.7640649573903334e-05, "loss": 0.8384, "step": 1501 }, { "epoch": 0.2466945881580028, "grad_norm": 1.8443317046647871, "learning_rate": 1.7637216206625303e-05, "loss": 0.808, "step": 1502 }, { "epoch": 0.24685883222468588, "grad_norm": 1.9365928862561945, "learning_rate": 1.763378067764929e-05, "loss": 0.9204, "step": 1503 }, { "epoch": 0.24702307629136896, "grad_norm": 1.9566968815508867, "learning_rate": 1.7630342987947718e-05, "loss": 0.8756, "step": 1504 }, { "epoch": 0.24718732035805208, "grad_norm": 1.6514314280260702, "learning_rate": 1.7626903138493614e-05, "loss": 0.8563, "step": 1505 }, { "epoch": 0.24735156442473516, "grad_norm": 2.2769667057182104, "learning_rate": 1.7623461130260625e-05, "loss": 0.8141, "step": 1506 }, { "epoch": 0.24751580849141824, "grad_norm": 1.8632614705223303, "learning_rate": 1.7620016964223e-05, "loss": 0.8522, "step": 1507 }, { "epoch": 0.24768005255810133, "grad_norm": 1.970551174551132, "learning_rate": 1.7616570641355602e-05, "loss": 0.8479, "step": 1508 }, { "epoch": 0.24784429662478444, "grad_norm": 2.0938262070776874, "learning_rate": 1.761312216263391e-05, "loss": 0.8209, "step": 1509 }, { "epoch": 0.24800854069146752, "grad_norm": 1.7664047446210622, "learning_rate": 1.7609671529034006e-05, "loss": 0.8315, "step": 1510 }, { "epoch": 0.2481727847581506, "grad_norm": 1.8796360850049698, "learning_rate": 1.7606218741532588e-05, "loss": 0.8713, "step": 1511 }, { "epoch": 0.24833702882483372, "grad_norm": 2.317169703615091, "learning_rate": 1.7602763801106952e-05, "loss": 0.8728, "step": 1512 }, { "epoch": 0.2485012728915168, "grad_norm": 1.8382568860213582, "learning_rate": 1.759930670873502e-05, "loss": 0.8351, "step": 1513 }, { "epoch": 0.24866551695819988, "grad_norm": 2.311029501047866, "learning_rate": 1.759584746539531e-05, "loss": 0.8413, "step": 1514 }, { "epoch": 0.24882976102488297, "grad_norm": 2.520930311585644, "learning_rate": 1.7592386072066962e-05, "loss": 0.8316, "step": 1515 }, { "epoch": 0.24899400509156608, "grad_norm": 1.9987647575608205, "learning_rate": 1.7588922529729703e-05, "loss": 0.8138, "step": 1516 }, { "epoch": 0.24915824915824916, "grad_norm": 1.8745833581053841, "learning_rate": 1.7585456839363886e-05, "loss": 0.821, "step": 1517 }, { "epoch": 0.24932249322493225, "grad_norm": 2.3779003028238224, "learning_rate": 1.758198900195047e-05, "loss": 0.7633, "step": 1518 }, { "epoch": 0.24948673729161533, "grad_norm": 2.667474567112814, "learning_rate": 1.757851901847102e-05, "loss": 0.7473, "step": 1519 }, { "epoch": 0.24965098135829844, "grad_norm": 2.5249678050979223, "learning_rate": 1.7575046889907708e-05, "loss": 0.8078, "step": 1520 }, { "epoch": 0.24981522542498152, "grad_norm": 3.0028208659166906, "learning_rate": 1.7571572617243307e-05, "loss": 0.818, "step": 1521 }, { "epoch": 0.2499794694916646, "grad_norm": 1.5853521715888828, "learning_rate": 1.7568096201461204e-05, "loss": 0.8481, "step": 1522 }, { "epoch": 0.2501437135583477, "grad_norm": 2.1662041503249463, "learning_rate": 1.7564617643545395e-05, "loss": 0.9196, "step": 1523 }, { "epoch": 0.2503079576250308, "grad_norm": 2.766675048508125, "learning_rate": 1.7561136944480478e-05, "loss": 0.8139, "step": 1524 }, { "epoch": 0.2504722016917139, "grad_norm": 2.070861966557271, "learning_rate": 1.7557654105251657e-05, "loss": 0.8338, "step": 1525 }, { "epoch": 0.250636445758397, "grad_norm": 2.329640866124689, "learning_rate": 1.7554169126844736e-05, "loss": 0.8704, "step": 1526 }, { "epoch": 0.2508006898250801, "grad_norm": 0.6967702411133787, "learning_rate": 1.755068201024614e-05, "loss": 0.3354, "step": 1527 }, { "epoch": 0.25096493389176316, "grad_norm": 1.7036339717485869, "learning_rate": 1.7547192756442887e-05, "loss": 0.8085, "step": 1528 }, { "epoch": 0.25112917795844625, "grad_norm": 1.9179362117765273, "learning_rate": 1.75437013664226e-05, "loss": 0.8006, "step": 1529 }, { "epoch": 0.25129342202512933, "grad_norm": 2.095770464494667, "learning_rate": 1.7540207841173513e-05, "loss": 0.7905, "step": 1530 }, { "epoch": 0.2514576660918124, "grad_norm": 2.357568646550473, "learning_rate": 1.753671218168446e-05, "loss": 0.8354, "step": 1531 }, { "epoch": 0.2516219101584955, "grad_norm": 0.6357931582784694, "learning_rate": 1.753321438894488e-05, "loss": 0.3379, "step": 1532 }, { "epoch": 0.25178615422517864, "grad_norm": 2.128990012196095, "learning_rate": 1.7529714463944815e-05, "loss": 0.7651, "step": 1533 }, { "epoch": 0.2519503982918617, "grad_norm": 2.7341866483121016, "learning_rate": 1.7526212407674916e-05, "loss": 0.7523, "step": 1534 }, { "epoch": 0.2521146423585448, "grad_norm": 3.804722608086298, "learning_rate": 1.7522708221126424e-05, "loss": 0.8002, "step": 1535 }, { "epoch": 0.2522788864252279, "grad_norm": 1.9557655829891014, "learning_rate": 1.7519201905291195e-05, "loss": 0.7883, "step": 1536 }, { "epoch": 0.25244313049191097, "grad_norm": 4.6324345946525565, "learning_rate": 1.7515693461161687e-05, "loss": 0.8518, "step": 1537 }, { "epoch": 0.25260737455859406, "grad_norm": 4.4591094181311215, "learning_rate": 1.751218288973096e-05, "loss": 0.8268, "step": 1538 }, { "epoch": 0.25277161862527714, "grad_norm": 2.078981495921315, "learning_rate": 1.7508670191992667e-05, "loss": 0.8377, "step": 1539 }, { "epoch": 0.2529358626919603, "grad_norm": 1.8561572974514315, "learning_rate": 1.7505155368941074e-05, "loss": 0.8655, "step": 1540 }, { "epoch": 0.25310010675864336, "grad_norm": 1.929694586779115, "learning_rate": 1.7501638421571045e-05, "loss": 0.8017, "step": 1541 }, { "epoch": 0.25326435082532645, "grad_norm": 2.5493109452649247, "learning_rate": 1.749811935087804e-05, "loss": 0.8249, "step": 1542 }, { "epoch": 0.25342859489200953, "grad_norm": 4.037555619810103, "learning_rate": 1.7494598157858127e-05, "loss": 0.8229, "step": 1543 }, { "epoch": 0.2535928389586926, "grad_norm": 1.7934617266433477, "learning_rate": 1.7491074843507974e-05, "loss": 0.8318, "step": 1544 }, { "epoch": 0.2537570830253757, "grad_norm": 1.788657153749602, "learning_rate": 1.7487549408824845e-05, "loss": 0.7649, "step": 1545 }, { "epoch": 0.2539213270920588, "grad_norm": 2.7590203696428515, "learning_rate": 1.748402185480661e-05, "loss": 0.8305, "step": 1546 }, { "epoch": 0.25408557115874186, "grad_norm": 2.1154663504663995, "learning_rate": 1.7480492182451735e-05, "loss": 0.8355, "step": 1547 }, { "epoch": 0.254249815225425, "grad_norm": 2.1326105814809315, "learning_rate": 1.7476960392759284e-05, "loss": 0.873, "step": 1548 }, { "epoch": 0.2544140592921081, "grad_norm": 2.304219879656297, "learning_rate": 1.7473426486728925e-05, "loss": 0.8, "step": 1549 }, { "epoch": 0.25457830335879117, "grad_norm": 2.41490983460287, "learning_rate": 1.746989046536092e-05, "loss": 0.7906, "step": 1550 }, { "epoch": 0.25474254742547425, "grad_norm": 3.9927364741499294, "learning_rate": 1.7466352329656134e-05, "loss": 0.8156, "step": 1551 }, { "epoch": 0.25490679149215734, "grad_norm": 1.7446507255885213, "learning_rate": 1.746281208061603e-05, "loss": 0.7383, "step": 1552 }, { "epoch": 0.2550710355588404, "grad_norm": 1.668801530418496, "learning_rate": 1.7459269719242665e-05, "loss": 0.8989, "step": 1553 }, { "epoch": 0.2552352796255235, "grad_norm": 2.233176916797145, "learning_rate": 1.74557252465387e-05, "loss": 0.7991, "step": 1554 }, { "epoch": 0.25539952369220664, "grad_norm": 12.220320346086542, "learning_rate": 1.745217866350739e-05, "loss": 0.8243, "step": 1555 }, { "epoch": 0.2555637677588897, "grad_norm": 2.4398688694484836, "learning_rate": 1.744862997115259e-05, "loss": 0.755, "step": 1556 }, { "epoch": 0.2557280118255728, "grad_norm": 1.6926559567495416, "learning_rate": 1.7445079170478743e-05, "loss": 0.8005, "step": 1557 }, { "epoch": 0.2558922558922559, "grad_norm": 1.9430283068763754, "learning_rate": 1.74415262624909e-05, "loss": 0.8356, "step": 1558 }, { "epoch": 0.256056499958939, "grad_norm": 2.3410118183078503, "learning_rate": 1.7437971248194706e-05, "loss": 0.7743, "step": 1559 }, { "epoch": 0.25622074402562206, "grad_norm": 1.6236623811162993, "learning_rate": 1.74344141285964e-05, "loss": 0.8211, "step": 1560 }, { "epoch": 0.25638498809230514, "grad_norm": 1.7826157412072852, "learning_rate": 1.7430854904702806e-05, "loss": 0.9053, "step": 1561 }, { "epoch": 0.2565492321589883, "grad_norm": 1.9136310608971878, "learning_rate": 1.7427293577521377e-05, "loss": 0.7882, "step": 1562 }, { "epoch": 0.25671347622567137, "grad_norm": 3.1105826738911193, "learning_rate": 1.742373014806012e-05, "loss": 0.7725, "step": 1563 }, { "epoch": 0.25687772029235445, "grad_norm": 1.6896685652969272, "learning_rate": 1.7420164617327662e-05, "loss": 0.7918, "step": 1564 }, { "epoch": 0.25704196435903753, "grad_norm": 2.2844636473471085, "learning_rate": 1.7416596986333222e-05, "loss": 0.8665, "step": 1565 }, { "epoch": 0.2572062084257206, "grad_norm": 1.9462875838065665, "learning_rate": 1.7413027256086606e-05, "loss": 0.8474, "step": 1566 }, { "epoch": 0.2573704524924037, "grad_norm": 1.6246514491813855, "learning_rate": 1.7409455427598224e-05, "loss": 0.8116, "step": 1567 }, { "epoch": 0.2575346965590868, "grad_norm": 2.2630334392857896, "learning_rate": 1.740588150187907e-05, "loss": 0.8298, "step": 1568 }, { "epoch": 0.25769894062576987, "grad_norm": 1.8298583858484962, "learning_rate": 1.7402305479940735e-05, "loss": 0.845, "step": 1569 }, { "epoch": 0.257863184692453, "grad_norm": 1.8991744996245485, "learning_rate": 1.739872736279541e-05, "loss": 0.8759, "step": 1570 }, { "epoch": 0.2580274287591361, "grad_norm": 1.7998457890929653, "learning_rate": 1.7395147151455868e-05, "loss": 0.8528, "step": 1571 }, { "epoch": 0.2581916728258192, "grad_norm": 2.1448159487042027, "learning_rate": 1.7391564846935484e-05, "loss": 0.833, "step": 1572 }, { "epoch": 0.25835591689250226, "grad_norm": 1.7293712067538536, "learning_rate": 1.7387980450248222e-05, "loss": 0.8562, "step": 1573 }, { "epoch": 0.25852016095918534, "grad_norm": 1.6342620606829736, "learning_rate": 1.7384393962408633e-05, "loss": 0.8604, "step": 1574 }, { "epoch": 0.2586844050258684, "grad_norm": 2.050775848688332, "learning_rate": 1.7380805384431875e-05, "loss": 0.7529, "step": 1575 }, { "epoch": 0.2588486490925515, "grad_norm": 2.271390809125704, "learning_rate": 1.7377214717333675e-05, "loss": 0.8025, "step": 1576 }, { "epoch": 0.25901289315923465, "grad_norm": 2.3318848279259528, "learning_rate": 1.7373621962130373e-05, "loss": 0.9536, "step": 1577 }, { "epoch": 0.25917713722591773, "grad_norm": 2.8895696301121, "learning_rate": 1.7370027119838884e-05, "loss": 0.8141, "step": 1578 }, { "epoch": 0.2593413812926008, "grad_norm": 2.1373440536484485, "learning_rate": 1.7366430191476723e-05, "loss": 0.7931, "step": 1579 }, { "epoch": 0.2595056253592839, "grad_norm": 1.7594810943471257, "learning_rate": 1.7362831178062e-05, "loss": 0.9415, "step": 1580 }, { "epoch": 0.259669869425967, "grad_norm": 1.9849309743963153, "learning_rate": 1.73592300806134e-05, "loss": 0.7805, "step": 1581 }, { "epoch": 0.25983411349265007, "grad_norm": 0.7261349698970159, "learning_rate": 1.7355626900150202e-05, "loss": 0.3784, "step": 1582 }, { "epoch": 0.25999835755933315, "grad_norm": 1.6742730237562706, "learning_rate": 1.735202163769229e-05, "loss": 0.8717, "step": 1583 }, { "epoch": 0.2601626016260163, "grad_norm": 1.495978155805795, "learning_rate": 1.734841429426012e-05, "loss": 0.8551, "step": 1584 }, { "epoch": 0.26032684569269937, "grad_norm": 1.7545992446041339, "learning_rate": 1.7344804870874744e-05, "loss": 0.8732, "step": 1585 }, { "epoch": 0.26049108975938245, "grad_norm": 1.7788532629324985, "learning_rate": 1.73411933685578e-05, "loss": 0.8164, "step": 1586 }, { "epoch": 0.26065533382606554, "grad_norm": 11.193147470773697, "learning_rate": 1.733757978833152e-05, "loss": 0.8628, "step": 1587 }, { "epoch": 0.2608195778927486, "grad_norm": 1.8658395966268908, "learning_rate": 1.7333964131218714e-05, "loss": 0.8659, "step": 1588 }, { "epoch": 0.2609838219594317, "grad_norm": 3.4043555340081073, "learning_rate": 1.7330346398242794e-05, "loss": 0.8272, "step": 1589 }, { "epoch": 0.2611480660261148, "grad_norm": 2.536165055427942, "learning_rate": 1.7326726590427747e-05, "loss": 0.8214, "step": 1590 }, { "epoch": 0.2613123100927979, "grad_norm": 2.9685879749459585, "learning_rate": 1.732310470879815e-05, "loss": 0.7728, "step": 1591 }, { "epoch": 0.261476554159481, "grad_norm": 1.7949190011910803, "learning_rate": 1.7319480754379175e-05, "loss": 0.9179, "step": 1592 }, { "epoch": 0.2616407982261641, "grad_norm": 1.9228499421377245, "learning_rate": 1.7315854728196568e-05, "loss": 0.7921, "step": 1593 }, { "epoch": 0.2618050422928472, "grad_norm": 1.959960204174951, "learning_rate": 1.7312226631276675e-05, "loss": 0.7988, "step": 1594 }, { "epoch": 0.26196928635953026, "grad_norm": 2.9984354358925773, "learning_rate": 1.7308596464646413e-05, "loss": 0.8507, "step": 1595 }, { "epoch": 0.26213353042621335, "grad_norm": 3.6629652105596215, "learning_rate": 1.7304964229333302e-05, "loss": 0.7128, "step": 1596 }, { "epoch": 0.26229777449289643, "grad_norm": 2.1729457482878893, "learning_rate": 1.7301329926365432e-05, "loss": 0.8056, "step": 1597 }, { "epoch": 0.2624620185595795, "grad_norm": 2.9292705997681607, "learning_rate": 1.729769355677149e-05, "loss": 0.7974, "step": 1598 }, { "epoch": 0.26262626262626265, "grad_norm": 2.1750460634252082, "learning_rate": 1.7294055121580735e-05, "loss": 0.8143, "step": 1599 }, { "epoch": 0.26279050669294574, "grad_norm": 1.829014443166834, "learning_rate": 1.7290414621823026e-05, "loss": 0.8758, "step": 1600 }, { "epoch": 0.2629547507596288, "grad_norm": 1.842646096331558, "learning_rate": 1.7286772058528796e-05, "loss": 0.8764, "step": 1601 }, { "epoch": 0.2631189948263119, "grad_norm": 1.9739903704364503, "learning_rate": 1.7283127432729063e-05, "loss": 0.8864, "step": 1602 }, { "epoch": 0.263283238892995, "grad_norm": 2.086508648416849, "learning_rate": 1.7279480745455433e-05, "loss": 0.8364, "step": 1603 }, { "epoch": 0.26344748295967807, "grad_norm": 2.050217197295815, "learning_rate": 1.7275831997740095e-05, "loss": 0.7224, "step": 1604 }, { "epoch": 0.26361172702636115, "grad_norm": 2.0092381416183955, "learning_rate": 1.7272181190615812e-05, "loss": 0.8096, "step": 1605 }, { "epoch": 0.26377597109304424, "grad_norm": 2.3283511575536253, "learning_rate": 1.7268528325115947e-05, "loss": 0.8753, "step": 1606 }, { "epoch": 0.2639402151597274, "grad_norm": 6.662533823509809, "learning_rate": 1.726487340227443e-05, "loss": 0.7161, "step": 1607 }, { "epoch": 0.26410445922641046, "grad_norm": 2.0105560564957368, "learning_rate": 1.7261216423125782e-05, "loss": 0.7704, "step": 1608 }, { "epoch": 0.26426870329309354, "grad_norm": 2.398494207142769, "learning_rate": 1.7257557388705098e-05, "loss": 0.8841, "step": 1609 }, { "epoch": 0.2644329473597766, "grad_norm": 1.8970975361458062, "learning_rate": 1.725389630004807e-05, "loss": 0.8472, "step": 1610 }, { "epoch": 0.2645971914264597, "grad_norm": 2.6925316812083313, "learning_rate": 1.7250233158190948e-05, "loss": 0.7784, "step": 1611 }, { "epoch": 0.2647614354931428, "grad_norm": 2.0831496728139753, "learning_rate": 1.7246567964170585e-05, "loss": 0.8329, "step": 1612 }, { "epoch": 0.2649256795598259, "grad_norm": 1.8293152911741362, "learning_rate": 1.724290071902441e-05, "loss": 0.8808, "step": 1613 }, { "epoch": 0.265089923626509, "grad_norm": 1.5404331384602405, "learning_rate": 1.723923142379042e-05, "loss": 0.833, "step": 1614 }, { "epoch": 0.2652541676931921, "grad_norm": 1.7150252802206896, "learning_rate": 1.7235560079507204e-05, "loss": 0.8738, "step": 1615 }, { "epoch": 0.2654184117598752, "grad_norm": 2.261316597003911, "learning_rate": 1.7231886687213936e-05, "loss": 0.7511, "step": 1616 }, { "epoch": 0.26558265582655827, "grad_norm": 1.8433554585394607, "learning_rate": 1.722821124795035e-05, "loss": 0.7791, "step": 1617 }, { "epoch": 0.26574689989324135, "grad_norm": 2.465891772775902, "learning_rate": 1.7224533762756775e-05, "loss": 0.8231, "step": 1618 }, { "epoch": 0.26591114395992443, "grad_norm": 1.8626038878582525, "learning_rate": 1.7220854232674127e-05, "loss": 0.8594, "step": 1619 }, { "epoch": 0.2660753880266075, "grad_norm": 2.9158833511016824, "learning_rate": 1.721717265874387e-05, "loss": 0.8259, "step": 1620 }, { "epoch": 0.26623963209329066, "grad_norm": 3.2098078653407653, "learning_rate": 1.721348904200808e-05, "loss": 0.8348, "step": 1621 }, { "epoch": 0.26640387615997374, "grad_norm": 1.8373851227461608, "learning_rate": 1.7209803383509394e-05, "loss": 0.8383, "step": 1622 }, { "epoch": 0.2665681202266568, "grad_norm": 2.9634389069172062, "learning_rate": 1.720611568429103e-05, "loss": 0.8683, "step": 1623 }, { "epoch": 0.2667323642933399, "grad_norm": 1.7005321691997881, "learning_rate": 1.7202425945396774e-05, "loss": 0.7915, "step": 1624 }, { "epoch": 0.266896608360023, "grad_norm": 1.9648146587003477, "learning_rate": 1.7198734167871015e-05, "loss": 0.7954, "step": 1625 }, { "epoch": 0.2670608524267061, "grad_norm": 1.5174085357568587, "learning_rate": 1.719504035275869e-05, "loss": 0.7872, "step": 1626 }, { "epoch": 0.26722509649338916, "grad_norm": 1.9309349766193011, "learning_rate": 1.7191344501105328e-05, "loss": 0.8777, "step": 1627 }, { "epoch": 0.26738934056007224, "grad_norm": 1.5580923302828749, "learning_rate": 1.718764661395704e-05, "loss": 0.8298, "step": 1628 }, { "epoch": 0.2675535846267554, "grad_norm": 1.892747497605761, "learning_rate": 1.7183946692360495e-05, "loss": 0.8544, "step": 1629 }, { "epoch": 0.26771782869343846, "grad_norm": 1.807179166780497, "learning_rate": 1.7180244737362956e-05, "loss": 0.8367, "step": 1630 }, { "epoch": 0.26788207276012155, "grad_norm": 2.178861546960086, "learning_rate": 1.717654075001225e-05, "loss": 0.7952, "step": 1631 }, { "epoch": 0.26804631682680463, "grad_norm": 1.9684414924996025, "learning_rate": 1.717283473135678e-05, "loss": 0.7908, "step": 1632 }, { "epoch": 0.2682105608934877, "grad_norm": 1.804394541461233, "learning_rate": 1.716912668244553e-05, "loss": 0.856, "step": 1633 }, { "epoch": 0.2683748049601708, "grad_norm": 0.8177193497068872, "learning_rate": 1.7165416604328054e-05, "loss": 0.3529, "step": 1634 }, { "epoch": 0.2685390490268539, "grad_norm": 1.5804853990333432, "learning_rate": 1.7161704498054485e-05, "loss": 0.7885, "step": 1635 }, { "epoch": 0.268703293093537, "grad_norm": 2.309145433883194, "learning_rate": 1.7157990364675524e-05, "loss": 0.8056, "step": 1636 }, { "epoch": 0.2688675371602201, "grad_norm": 2.0481423062113686, "learning_rate": 1.7154274205242448e-05, "loss": 0.8668, "step": 1637 }, { "epoch": 0.2690317812269032, "grad_norm": 1.9878448773530502, "learning_rate": 1.715055602080711e-05, "loss": 0.7959, "step": 1638 }, { "epoch": 0.26919602529358627, "grad_norm": 1.9540955569478045, "learning_rate": 1.7146835812421937e-05, "loss": 0.8636, "step": 1639 }, { "epoch": 0.26936026936026936, "grad_norm": 2.246622140615471, "learning_rate": 1.714311358113992e-05, "loss": 0.7322, "step": 1640 }, { "epoch": 0.26952451342695244, "grad_norm": 2.487776316238685, "learning_rate": 1.7139389328014634e-05, "loss": 0.8456, "step": 1641 }, { "epoch": 0.2696887574936355, "grad_norm": 2.030661174242529, "learning_rate": 1.7135663054100216e-05, "loss": 0.87, "step": 1642 }, { "epoch": 0.2698530015603186, "grad_norm": 1.8709500508400068, "learning_rate": 1.7131934760451385e-05, "loss": 0.8062, "step": 1643 }, { "epoch": 0.27001724562700175, "grad_norm": 1.8429330250410711, "learning_rate": 1.7128204448123422e-05, "loss": 0.8051, "step": 1644 }, { "epoch": 0.27018148969368483, "grad_norm": 2.067942874535034, "learning_rate": 1.7124472118172187e-05, "loss": 0.7805, "step": 1645 }, { "epoch": 0.2703457337603679, "grad_norm": 1.8006875084225606, "learning_rate": 1.712073777165411e-05, "loss": 0.8194, "step": 1646 }, { "epoch": 0.270509977827051, "grad_norm": 2.112002505818904, "learning_rate": 1.7117001409626185e-05, "loss": 0.8042, "step": 1647 }, { "epoch": 0.2706742218937341, "grad_norm": 1.594043917391037, "learning_rate": 1.7113263033145985e-05, "loss": 0.8195, "step": 1648 }, { "epoch": 0.27083846596041716, "grad_norm": 1.9523364995294175, "learning_rate": 1.7109522643271646e-05, "loss": 0.819, "step": 1649 }, { "epoch": 0.27100271002710025, "grad_norm": 1.983770722478716, "learning_rate": 1.7105780241061884e-05, "loss": 0.8518, "step": 1650 }, { "epoch": 0.2711669540937834, "grad_norm": 1.8095792385016831, "learning_rate": 1.710203582757597e-05, "loss": 0.8668, "step": 1651 }, { "epoch": 0.27133119816046647, "grad_norm": 1.6087425431079139, "learning_rate": 1.7098289403873754e-05, "loss": 0.9153, "step": 1652 }, { "epoch": 0.27149544222714955, "grad_norm": 1.7869319325583688, "learning_rate": 1.7094540971015663e-05, "loss": 0.8085, "step": 1653 }, { "epoch": 0.27165968629383264, "grad_norm": 1.9850069190519375, "learning_rate": 1.709079053006267e-05, "loss": 0.8771, "step": 1654 }, { "epoch": 0.2718239303605157, "grad_norm": 1.0417779812799102, "learning_rate": 1.708703808207634e-05, "loss": 0.3851, "step": 1655 }, { "epoch": 0.2719881744271988, "grad_norm": 0.7947460828376666, "learning_rate": 1.7083283628118786e-05, "loss": 0.3553, "step": 1656 }, { "epoch": 0.2721524184938819, "grad_norm": 2.15878837831279, "learning_rate": 1.7079527169252706e-05, "loss": 0.8137, "step": 1657 }, { "epoch": 0.272316662560565, "grad_norm": 2.0781818365957228, "learning_rate": 1.7075768706541355e-05, "loss": 0.8014, "step": 1658 }, { "epoch": 0.2724809066272481, "grad_norm": 3.967605597057034, "learning_rate": 1.7072008241048555e-05, "loss": 0.8285, "step": 1659 }, { "epoch": 0.2726451506939312, "grad_norm": 2.1262676724545533, "learning_rate": 1.7068245773838703e-05, "loss": 0.8789, "step": 1660 }, { "epoch": 0.2728093947606143, "grad_norm": 1.7603797944655148, "learning_rate": 1.7064481305976754e-05, "loss": 0.8517, "step": 1661 }, { "epoch": 0.27297363882729736, "grad_norm": 2.0391609961184263, "learning_rate": 1.7060714838528234e-05, "loss": 0.8594, "step": 1662 }, { "epoch": 0.27313788289398044, "grad_norm": 1.7503667933272904, "learning_rate": 1.7056946372559234e-05, "loss": 0.8669, "step": 1663 }, { "epoch": 0.2733021269606635, "grad_norm": 2.0840962650374277, "learning_rate": 1.7053175909136406e-05, "loss": 0.7748, "step": 1664 }, { "epoch": 0.2734663710273466, "grad_norm": 2.322540044001253, "learning_rate": 1.7049403449326982e-05, "loss": 0.8632, "step": 1665 }, { "epoch": 0.27363061509402975, "grad_norm": 1.9890280582193498, "learning_rate": 1.704562899419874e-05, "loss": 0.8417, "step": 1666 }, { "epoch": 0.27379485916071283, "grad_norm": 1.556503582483252, "learning_rate": 1.704185254482003e-05, "loss": 0.8487, "step": 1667 }, { "epoch": 0.2739591032273959, "grad_norm": 2.580209324464677, "learning_rate": 1.7038074102259775e-05, "loss": 0.8007, "step": 1668 }, { "epoch": 0.274123347294079, "grad_norm": 1.6557381930300588, "learning_rate": 1.703429366758745e-05, "loss": 0.8286, "step": 1669 }, { "epoch": 0.2742875913607621, "grad_norm": 2.0049549273650165, "learning_rate": 1.7030511241873107e-05, "loss": 0.8229, "step": 1670 }, { "epoch": 0.27445183542744517, "grad_norm": 2.0965082042396577, "learning_rate": 1.7026726826187343e-05, "loss": 0.8929, "step": 1671 }, { "epoch": 0.27461607949412825, "grad_norm": 2.0331577763750324, "learning_rate": 1.7022940421601334e-05, "loss": 0.8155, "step": 1672 }, { "epoch": 0.2747803235608114, "grad_norm": 2.0041785594578863, "learning_rate": 1.7019152029186817e-05, "loss": 0.7495, "step": 1673 }, { "epoch": 0.2749445676274945, "grad_norm": 1.4216572322881171, "learning_rate": 1.701536165001608e-05, "loss": 0.7817, "step": 1674 }, { "epoch": 0.27510881169417756, "grad_norm": 1.8895046520259875, "learning_rate": 1.701156928516199e-05, "loss": 0.7394, "step": 1675 }, { "epoch": 0.27527305576086064, "grad_norm": 1.8126411086965382, "learning_rate": 1.7007774935697966e-05, "loss": 0.8628, "step": 1676 }, { "epoch": 0.2754372998275437, "grad_norm": 3.3305007298860616, "learning_rate": 1.7003978602697988e-05, "loss": 0.7633, "step": 1677 }, { "epoch": 0.2756015438942268, "grad_norm": 1.43793183573693, "learning_rate": 1.70001802872366e-05, "loss": 0.8485, "step": 1678 }, { "epoch": 0.2757657879609099, "grad_norm": 1.6013859136812196, "learning_rate": 1.6996379990388908e-05, "loss": 0.791, "step": 1679 }, { "epoch": 0.27593003202759303, "grad_norm": 1.856929721748009, "learning_rate": 1.6992577713230582e-05, "loss": 0.8944, "step": 1680 }, { "epoch": 0.2760942760942761, "grad_norm": 1.8122824012918413, "learning_rate": 1.6988773456837847e-05, "loss": 0.8775, "step": 1681 }, { "epoch": 0.2762585201609592, "grad_norm": 2.2491627388266724, "learning_rate": 1.6984967222287484e-05, "loss": 0.8719, "step": 1682 }, { "epoch": 0.2764227642276423, "grad_norm": 1.463177334496228, "learning_rate": 1.6981159010656847e-05, "loss": 0.7801, "step": 1683 }, { "epoch": 0.27658700829432536, "grad_norm": 1.9084452252486828, "learning_rate": 1.6977348823023838e-05, "loss": 0.7322, "step": 1684 }, { "epoch": 0.27675125236100845, "grad_norm": 1.6817182732555307, "learning_rate": 1.697353666046692e-05, "loss": 0.8115, "step": 1685 }, { "epoch": 0.27691549642769153, "grad_norm": 1.7522846183883658, "learning_rate": 1.6969722524065124e-05, "loss": 0.7407, "step": 1686 }, { "epoch": 0.2770797404943746, "grad_norm": 1.9353938339169166, "learning_rate": 1.696590641489803e-05, "loss": 0.7909, "step": 1687 }, { "epoch": 0.27724398456105775, "grad_norm": 1.6686315753326277, "learning_rate": 1.6962088334045785e-05, "loss": 0.8049, "step": 1688 }, { "epoch": 0.27740822862774084, "grad_norm": 1.7194297090605963, "learning_rate": 1.695826828258908e-05, "loss": 0.7341, "step": 1689 }, { "epoch": 0.2775724726944239, "grad_norm": 2.226916247632244, "learning_rate": 1.6954446261609176e-05, "loss": 0.8063, "step": 1690 }, { "epoch": 0.277736716761107, "grad_norm": 2.0401641128650634, "learning_rate": 1.6950622272187888e-05, "loss": 0.8818, "step": 1691 }, { "epoch": 0.2779009608277901, "grad_norm": 2.462583749353122, "learning_rate": 1.6946796315407593e-05, "loss": 0.8034, "step": 1692 }, { "epoch": 0.2780652048944732, "grad_norm": 1.993430284702852, "learning_rate": 1.694296839235121e-05, "loss": 0.8321, "step": 1693 }, { "epoch": 0.27822944896115626, "grad_norm": 2.332756736374335, "learning_rate": 1.6939138504102235e-05, "loss": 0.8468, "step": 1694 }, { "epoch": 0.2783936930278394, "grad_norm": 2.0275582497883367, "learning_rate": 1.6935306651744704e-05, "loss": 0.7814, "step": 1695 }, { "epoch": 0.2785579370945225, "grad_norm": 1.7861630057297275, "learning_rate": 1.6931472836363214e-05, "loss": 0.7719, "step": 1696 }, { "epoch": 0.27872218116120556, "grad_norm": 1.8974306721897403, "learning_rate": 1.6927637059042918e-05, "loss": 0.8431, "step": 1697 }, { "epoch": 0.27888642522788865, "grad_norm": 1.5499004573897666, "learning_rate": 1.692379932086953e-05, "loss": 0.4281, "step": 1698 }, { "epoch": 0.27905066929457173, "grad_norm": 2.0986451348432555, "learning_rate": 1.6919959622929312e-05, "loss": 0.8601, "step": 1699 }, { "epoch": 0.2792149133612548, "grad_norm": 1.9991902331148421, "learning_rate": 1.691611796630908e-05, "loss": 0.802, "step": 1700 }, { "epoch": 0.2793791574279379, "grad_norm": 1.5906317555604867, "learning_rate": 1.6912274352096207e-05, "loss": 0.7279, "step": 1701 }, { "epoch": 0.279543401494621, "grad_norm": 2.3990865288048857, "learning_rate": 1.690842878137862e-05, "loss": 0.6609, "step": 1702 }, { "epoch": 0.2797076455613041, "grad_norm": 1.696385561193522, "learning_rate": 1.6904581255244802e-05, "loss": 0.8226, "step": 1703 }, { "epoch": 0.2798718896279872, "grad_norm": 1.94660413770437, "learning_rate": 1.690073177478379e-05, "loss": 0.7789, "step": 1704 }, { "epoch": 0.2800361336946703, "grad_norm": 2.5858135691787267, "learning_rate": 1.6896880341085158e-05, "loss": 0.8398, "step": 1705 }, { "epoch": 0.28020037776135337, "grad_norm": 0.7420945464359366, "learning_rate": 1.6893026955239062e-05, "loss": 0.3321, "step": 1706 }, { "epoch": 0.28036462182803645, "grad_norm": 2.073907304553136, "learning_rate": 1.688917161833618e-05, "loss": 0.8104, "step": 1707 }, { "epoch": 0.28052886589471954, "grad_norm": 1.899816573476461, "learning_rate": 1.688531433146777e-05, "loss": 0.8319, "step": 1708 }, { "epoch": 0.2806931099614026, "grad_norm": 2.267998422871806, "learning_rate": 1.6881455095725627e-05, "loss": 0.8589, "step": 1709 }, { "epoch": 0.28085735402808576, "grad_norm": 1.6394324758690553, "learning_rate": 1.6877593912202094e-05, "loss": 0.9015, "step": 1710 }, { "epoch": 0.28102159809476884, "grad_norm": 2.835973461342405, "learning_rate": 1.6873730781990073e-05, "loss": 0.8355, "step": 1711 }, { "epoch": 0.2811858421614519, "grad_norm": 2.512366022547189, "learning_rate": 1.6869865706183017e-05, "loss": 0.8023, "step": 1712 }, { "epoch": 0.281350086228135, "grad_norm": 1.9963896014712819, "learning_rate": 1.6865998685874923e-05, "loss": 0.7991, "step": 1713 }, { "epoch": 0.2815143302948181, "grad_norm": 2.0827784806280136, "learning_rate": 1.6862129722160347e-05, "loss": 0.8046, "step": 1714 }, { "epoch": 0.2816785743615012, "grad_norm": 1.3737731808528322, "learning_rate": 1.685825881613439e-05, "loss": 0.7809, "step": 1715 }, { "epoch": 0.28184281842818426, "grad_norm": 0.8397271532549275, "learning_rate": 1.6854385968892702e-05, "loss": 0.3771, "step": 1716 }, { "epoch": 0.2820070624948674, "grad_norm": 2.1657632286304764, "learning_rate": 1.6850511181531487e-05, "loss": 0.8024, "step": 1717 }, { "epoch": 0.2821713065615505, "grad_norm": 1.6381717137181149, "learning_rate": 1.6846634455147498e-05, "loss": 0.9003, "step": 1718 }, { "epoch": 0.28233555062823357, "grad_norm": 2.2935834355633418, "learning_rate": 1.6842755790838025e-05, "loss": 0.8485, "step": 1719 }, { "epoch": 0.28249979469491665, "grad_norm": 1.873574587376135, "learning_rate": 1.6838875189700924e-05, "loss": 0.7682, "step": 1720 }, { "epoch": 0.28266403876159973, "grad_norm": 1.702103339740926, "learning_rate": 1.6834992652834586e-05, "loss": 0.8206, "step": 1721 }, { "epoch": 0.2828282828282828, "grad_norm": 1.930772889070396, "learning_rate": 1.6831108181337957e-05, "loss": 0.8005, "step": 1722 }, { "epoch": 0.2829925268949659, "grad_norm": 2.0906326986740513, "learning_rate": 1.6827221776310532e-05, "loss": 0.7824, "step": 1723 }, { "epoch": 0.283156770961649, "grad_norm": 2.3833964356509827, "learning_rate": 1.6823333438852346e-05, "loss": 0.8698, "step": 1724 }, { "epoch": 0.2833210150283321, "grad_norm": 1.7491504442691999, "learning_rate": 1.6819443170063983e-05, "loss": 0.815, "step": 1725 }, { "epoch": 0.2834852590950152, "grad_norm": 2.2347047123173605, "learning_rate": 1.681555097104658e-05, "loss": 0.8297, "step": 1726 }, { "epoch": 0.2836495031616983, "grad_norm": 1.8365818637996116, "learning_rate": 1.681165684290181e-05, "loss": 0.779, "step": 1727 }, { "epoch": 0.2838137472283814, "grad_norm": 1.7927579482048481, "learning_rate": 1.6807760786731905e-05, "loss": 0.8771, "step": 1728 }, { "epoch": 0.28397799129506446, "grad_norm": 1.6846060734557091, "learning_rate": 1.680386280363963e-05, "loss": 0.8521, "step": 1729 }, { "epoch": 0.28414223536174754, "grad_norm": 1.6551922895694011, "learning_rate": 1.67999628947283e-05, "loss": 0.7635, "step": 1730 }, { "epoch": 0.2843064794284306, "grad_norm": 2.5613286297905216, "learning_rate": 1.6796061061101782e-05, "loss": 0.8269, "step": 1731 }, { "epoch": 0.28447072349511376, "grad_norm": 2.083979987087325, "learning_rate": 1.6792157303864475e-05, "loss": 0.9041, "step": 1732 }, { "epoch": 0.28463496756179685, "grad_norm": 3.5206605221138614, "learning_rate": 1.6788251624121335e-05, "loss": 0.8312, "step": 1733 }, { "epoch": 0.28479921162847993, "grad_norm": 2.8885067318825834, "learning_rate": 1.678434402297785e-05, "loss": 0.8884, "step": 1734 }, { "epoch": 0.284963455695163, "grad_norm": 0.7106520447945828, "learning_rate": 1.678043450154007e-05, "loss": 0.3452, "step": 1735 }, { "epoch": 0.2851276997618461, "grad_norm": 1.7519352423854162, "learning_rate": 1.6776523060914565e-05, "loss": 0.8959, "step": 1736 }, { "epoch": 0.2852919438285292, "grad_norm": 3.3838595672968923, "learning_rate": 1.677260970220846e-05, "loss": 0.71, "step": 1737 }, { "epoch": 0.28545618789521227, "grad_norm": 2.038064508661784, "learning_rate": 1.6768694426529432e-05, "loss": 0.7848, "step": 1738 }, { "epoch": 0.28562043196189535, "grad_norm": 2.111882400677896, "learning_rate": 1.676477723498569e-05, "loss": 0.7954, "step": 1739 }, { "epoch": 0.2857846760285785, "grad_norm": 2.2498840831171427, "learning_rate": 1.6760858128685974e-05, "loss": 0.8147, "step": 1740 }, { "epoch": 0.28594892009526157, "grad_norm": 2.0543014892323717, "learning_rate": 1.6756937108739596e-05, "loss": 0.7927, "step": 1741 }, { "epoch": 0.28611316416194466, "grad_norm": 2.244279661437098, "learning_rate": 1.675301417625638e-05, "loss": 0.7853, "step": 1742 }, { "epoch": 0.28627740822862774, "grad_norm": 1.5589904537010375, "learning_rate": 1.6749089332346714e-05, "loss": 0.7815, "step": 1743 }, { "epoch": 0.2864416522953108, "grad_norm": 1.9963291593965096, "learning_rate": 1.6745162578121504e-05, "loss": 0.8138, "step": 1744 }, { "epoch": 0.2866058963619939, "grad_norm": 1.6530354187212466, "learning_rate": 1.6741233914692223e-05, "loss": 0.8056, "step": 1745 }, { "epoch": 0.286770140428677, "grad_norm": 1.9064837370049286, "learning_rate": 1.6737303343170863e-05, "loss": 0.8468, "step": 1746 }, { "epoch": 0.28693438449536013, "grad_norm": 2.2649892244676737, "learning_rate": 1.6733370864669965e-05, "loss": 0.7915, "step": 1747 }, { "epoch": 0.2870986285620432, "grad_norm": 1.6892858245161826, "learning_rate": 1.672943648030261e-05, "loss": 0.797, "step": 1748 }, { "epoch": 0.2872628726287263, "grad_norm": 1.8264309403510057, "learning_rate": 1.6725500191182415e-05, "loss": 0.8371, "step": 1749 }, { "epoch": 0.2874271166954094, "grad_norm": 1.6636967154619322, "learning_rate": 1.672156199842354e-05, "loss": 0.8365, "step": 1750 }, { "epoch": 0.28759136076209246, "grad_norm": 1.747381742795321, "learning_rate": 1.6717621903140686e-05, "loss": 0.8154, "step": 1751 }, { "epoch": 0.28775560482877555, "grad_norm": 1.7050573954493256, "learning_rate": 1.6713679906449084e-05, "loss": 0.9197, "step": 1752 }, { "epoch": 0.28791984889545863, "grad_norm": 2.3803843368744544, "learning_rate": 1.6709736009464504e-05, "loss": 0.7918, "step": 1753 }, { "epoch": 0.28808409296214177, "grad_norm": 3.335739270645408, "learning_rate": 1.670579021330327e-05, "loss": 0.75, "step": 1754 }, { "epoch": 0.28824833702882485, "grad_norm": 1.7822888410491686, "learning_rate": 1.670184251908222e-05, "loss": 0.8623, "step": 1755 }, { "epoch": 0.28841258109550794, "grad_norm": 5.484353494873085, "learning_rate": 1.6697892927918742e-05, "loss": 0.8701, "step": 1756 }, { "epoch": 0.288576825162191, "grad_norm": 1.831382613388385, "learning_rate": 1.6693941440930768e-05, "loss": 0.8271, "step": 1757 }, { "epoch": 0.2887410692288741, "grad_norm": 1.8327402362878928, "learning_rate": 1.668998805923675e-05, "loss": 0.8078, "step": 1758 }, { "epoch": 0.2889053132955572, "grad_norm": 1.8999630714129998, "learning_rate": 1.668603278395568e-05, "loss": 0.8388, "step": 1759 }, { "epoch": 0.28906955736224027, "grad_norm": 2.073471117922414, "learning_rate": 1.6682075616207103e-05, "loss": 0.7016, "step": 1760 }, { "epoch": 0.28923380142892335, "grad_norm": 1.9658537104964622, "learning_rate": 1.667811655711108e-05, "loss": 0.753, "step": 1761 }, { "epoch": 0.2893980454956065, "grad_norm": 2.144681353313336, "learning_rate": 1.6674155607788214e-05, "loss": 0.8245, "step": 1762 }, { "epoch": 0.2895622895622896, "grad_norm": 2.2145101128947813, "learning_rate": 1.6670192769359643e-05, "loss": 0.8423, "step": 1763 }, { "epoch": 0.28972653362897266, "grad_norm": 1.3781765168119042, "learning_rate": 1.666622804294704e-05, "loss": 0.7651, "step": 1764 }, { "epoch": 0.28989077769565574, "grad_norm": 0.7084866438428383, "learning_rate": 1.666226142967262e-05, "loss": 0.3412, "step": 1765 }, { "epoch": 0.2900550217623388, "grad_norm": 1.5781032190740303, "learning_rate": 1.6658292930659115e-05, "loss": 0.8833, "step": 1766 }, { "epoch": 0.2902192658290219, "grad_norm": 2.082061889374477, "learning_rate": 1.6654322547029803e-05, "loss": 0.8225, "step": 1767 }, { "epoch": 0.290383509895705, "grad_norm": 2.4787281197146815, "learning_rate": 1.6650350279908497e-05, "loss": 0.852, "step": 1768 }, { "epoch": 0.29054775396238813, "grad_norm": 1.9578856464631313, "learning_rate": 1.664637613041953e-05, "loss": 0.8442, "step": 1769 }, { "epoch": 0.2907119980290712, "grad_norm": 1.6215915694839915, "learning_rate": 1.6642400099687787e-05, "loss": 0.7968, "step": 1770 }, { "epoch": 0.2908762420957543, "grad_norm": 2.133122666698249, "learning_rate": 1.6638422188838667e-05, "loss": 0.87, "step": 1771 }, { "epoch": 0.2910404861624374, "grad_norm": 1.6246094022104651, "learning_rate": 1.6634442398998115e-05, "loss": 0.8936, "step": 1772 }, { "epoch": 0.29120473022912047, "grad_norm": 1.6975254010314216, "learning_rate": 1.6630460731292597e-05, "loss": 0.9149, "step": 1773 }, { "epoch": 0.29136897429580355, "grad_norm": 0.6800657295010789, "learning_rate": 1.662647718684912e-05, "loss": 0.3484, "step": 1774 }, { "epoch": 0.29153321836248663, "grad_norm": 1.7782986260954374, "learning_rate": 1.6622491766795215e-05, "loss": 0.8517, "step": 1775 }, { "epoch": 0.2916974624291698, "grad_norm": 1.5922724069604801, "learning_rate": 1.661850447225895e-05, "loss": 0.8786, "step": 1776 }, { "epoch": 0.29186170649585286, "grad_norm": 1.4191458486167858, "learning_rate": 1.6614515304368915e-05, "loss": 0.8508, "step": 1777 }, { "epoch": 0.29202595056253594, "grad_norm": 1.8906199910617494, "learning_rate": 1.661052426425424e-05, "loss": 0.8098, "step": 1778 }, { "epoch": 0.292190194629219, "grad_norm": 0.68111765738755, "learning_rate": 1.6606531353044585e-05, "loss": 0.37, "step": 1779 }, { "epoch": 0.2923544386959021, "grad_norm": 0.6903679407890017, "learning_rate": 1.660253657187012e-05, "loss": 0.3451, "step": 1780 }, { "epoch": 0.2925186827625852, "grad_norm": 1.8004841852834388, "learning_rate": 1.6598539921861573e-05, "loss": 0.7704, "step": 1781 }, { "epoch": 0.2926829268292683, "grad_norm": 1.6651590069794806, "learning_rate": 1.6594541404150187e-05, "loss": 0.8049, "step": 1782 }, { "epoch": 0.29284717089595136, "grad_norm": 1.8622257073713264, "learning_rate": 1.6590541019867722e-05, "loss": 0.7001, "step": 1783 }, { "epoch": 0.2930114149626345, "grad_norm": 2.395052028708782, "learning_rate": 1.6586538770146495e-05, "loss": 0.7576, "step": 1784 }, { "epoch": 0.2931756590293176, "grad_norm": 1.764973975807419, "learning_rate": 1.658253465611932e-05, "loss": 0.8089, "step": 1785 }, { "epoch": 0.29333990309600066, "grad_norm": 1.6750188152168015, "learning_rate": 1.6578528678919564e-05, "loss": 0.8401, "step": 1786 }, { "epoch": 0.29350414716268375, "grad_norm": 0.7213066507881604, "learning_rate": 1.65745208396811e-05, "loss": 0.3204, "step": 1787 }, { "epoch": 0.29366839122936683, "grad_norm": 1.6642528464373831, "learning_rate": 1.6570511139538348e-05, "loss": 0.7782, "step": 1788 }, { "epoch": 0.2938326352960499, "grad_norm": 4.265217219772987, "learning_rate": 1.6566499579626237e-05, "loss": 0.8481, "step": 1789 }, { "epoch": 0.293996879362733, "grad_norm": 1.754796441390061, "learning_rate": 1.656248616108024e-05, "loss": 0.8037, "step": 1790 }, { "epoch": 0.29416112342941614, "grad_norm": 2.5706259923112476, "learning_rate": 1.655847088503634e-05, "loss": 0.9255, "step": 1791 }, { "epoch": 0.2943253674960992, "grad_norm": 1.7465423432819598, "learning_rate": 1.655445375263105e-05, "loss": 0.8609, "step": 1792 }, { "epoch": 0.2944896115627823, "grad_norm": 1.835795342527986, "learning_rate": 1.655043476500142e-05, "loss": 0.8128, "step": 1793 }, { "epoch": 0.2946538556294654, "grad_norm": 1.8264529560461529, "learning_rate": 1.6546413923285008e-05, "loss": 0.8468, "step": 1794 }, { "epoch": 0.2948180996961485, "grad_norm": 2.1455212694325025, "learning_rate": 1.6542391228619906e-05, "loss": 0.7573, "step": 1795 }, { "epoch": 0.29498234376283156, "grad_norm": 0.7603258277145798, "learning_rate": 1.6538366682144734e-05, "loss": 0.3576, "step": 1796 }, { "epoch": 0.29514658782951464, "grad_norm": 2.0146039088487018, "learning_rate": 1.6534340284998626e-05, "loss": 0.8908, "step": 1797 }, { "epoch": 0.2953108318961977, "grad_norm": 2.5128952111550973, "learning_rate": 1.6530312038321247e-05, "loss": 0.8466, "step": 1798 }, { "epoch": 0.29547507596288086, "grad_norm": 1.887147810106827, "learning_rate": 1.6526281943252782e-05, "loss": 0.8299, "step": 1799 }, { "epoch": 0.29563932002956395, "grad_norm": 1.7142390626806159, "learning_rate": 1.6522250000933948e-05, "loss": 0.8494, "step": 1800 }, { "epoch": 0.29580356409624703, "grad_norm": 1.6145482340943509, "learning_rate": 1.6518216212505968e-05, "loss": 0.7998, "step": 1801 }, { "epoch": 0.2959678081629301, "grad_norm": 2.1836378809667254, "learning_rate": 1.6514180579110606e-05, "loss": 0.7665, "step": 1802 }, { "epoch": 0.2961320522296132, "grad_norm": 1.5701311891682244, "learning_rate": 1.6510143101890136e-05, "loss": 0.7904, "step": 1803 }, { "epoch": 0.2962962962962963, "grad_norm": 1.634379728757685, "learning_rate": 1.6506103781987355e-05, "loss": 0.8614, "step": 1804 }, { "epoch": 0.29646054036297936, "grad_norm": 1.6179118127665528, "learning_rate": 1.650206262054559e-05, "loss": 0.7428, "step": 1805 }, { "epoch": 0.2966247844296625, "grad_norm": 1.9006890662751414, "learning_rate": 1.6498019618708673e-05, "loss": 0.8307, "step": 1806 }, { "epoch": 0.2967890284963456, "grad_norm": 1.7862524385244327, "learning_rate": 1.6493974777620976e-05, "loss": 0.8677, "step": 1807 }, { "epoch": 0.29695327256302867, "grad_norm": 1.6599423826928612, "learning_rate": 1.6489928098427383e-05, "loss": 0.8234, "step": 1808 }, { "epoch": 0.29711751662971175, "grad_norm": 1.8717174282360958, "learning_rate": 1.648587958227329e-05, "loss": 0.8361, "step": 1809 }, { "epoch": 0.29728176069639484, "grad_norm": 2.2714164847901364, "learning_rate": 1.648182923030463e-05, "loss": 0.8462, "step": 1810 }, { "epoch": 0.2974460047630779, "grad_norm": 1.7050065320171595, "learning_rate": 1.6477777043667846e-05, "loss": 0.8158, "step": 1811 }, { "epoch": 0.297610248829761, "grad_norm": 2.143131436299016, "learning_rate": 1.647372302350989e-05, "loss": 0.8117, "step": 1812 }, { "epoch": 0.29777449289644414, "grad_norm": 1.9075670264946136, "learning_rate": 1.6469667170978258e-05, "loss": 0.7894, "step": 1813 }, { "epoch": 0.2979387369631272, "grad_norm": 1.3973226194479678, "learning_rate": 1.6465609487220942e-05, "loss": 0.8658, "step": 1814 }, { "epoch": 0.2981029810298103, "grad_norm": 1.7617588638891897, "learning_rate": 1.6461549973386464e-05, "loss": 0.8332, "step": 1815 }, { "epoch": 0.2982672250964934, "grad_norm": 2.113701915511775, "learning_rate": 1.645748863062386e-05, "loss": 0.7587, "step": 1816 }, { "epoch": 0.2984314691631765, "grad_norm": 1.9544982060339435, "learning_rate": 1.6453425460082685e-05, "loss": 0.8605, "step": 1817 }, { "epoch": 0.29859571322985956, "grad_norm": 1.797677392119787, "learning_rate": 1.6449360462913005e-05, "loss": 0.7944, "step": 1818 }, { "epoch": 0.29875995729654264, "grad_norm": 1.820720607485835, "learning_rate": 1.644529364026542e-05, "loss": 0.8732, "step": 1819 }, { "epoch": 0.29892420136322573, "grad_norm": 1.8657681624692708, "learning_rate": 1.644122499329103e-05, "loss": 0.8594, "step": 1820 }, { "epoch": 0.29908844542990887, "grad_norm": 0.7775629567648491, "learning_rate": 1.6437154523141453e-05, "loss": 0.3585, "step": 1821 }, { "epoch": 0.29925268949659195, "grad_norm": 2.0864325683442426, "learning_rate": 1.6433082230968833e-05, "loss": 0.8899, "step": 1822 }, { "epoch": 0.29941693356327503, "grad_norm": 1.7282883250058918, "learning_rate": 1.642900811792582e-05, "loss": 0.8708, "step": 1823 }, { "epoch": 0.2995811776299581, "grad_norm": 1.8044012626445276, "learning_rate": 1.6424932185165587e-05, "loss": 0.8956, "step": 1824 }, { "epoch": 0.2997454216966412, "grad_norm": 1.890895084884183, "learning_rate": 1.6420854433841817e-05, "loss": 0.7611, "step": 1825 }, { "epoch": 0.2999096657633243, "grad_norm": 0.6565849034063015, "learning_rate": 1.6416774865108706e-05, "loss": 0.362, "step": 1826 }, { "epoch": 0.30007390983000737, "grad_norm": 1.948297910291902, "learning_rate": 1.641269348012097e-05, "loss": 0.8332, "step": 1827 }, { "epoch": 0.3002381538966905, "grad_norm": 1.798347061401738, "learning_rate": 1.640861028003383e-05, "loss": 0.8505, "step": 1828 }, { "epoch": 0.3004023979633736, "grad_norm": 2.2092524738792387, "learning_rate": 1.6404525266003037e-05, "loss": 0.7408, "step": 1829 }, { "epoch": 0.3005666420300567, "grad_norm": 1.5624785025295653, "learning_rate": 1.6400438439184842e-05, "loss": 0.7962, "step": 1830 }, { "epoch": 0.30073088609673976, "grad_norm": 1.4580807526893425, "learning_rate": 1.6396349800736012e-05, "loss": 0.8549, "step": 1831 }, { "epoch": 0.30089513016342284, "grad_norm": 2.1193930465055217, "learning_rate": 1.6392259351813827e-05, "loss": 0.7974, "step": 1832 }, { "epoch": 0.3010593742301059, "grad_norm": 1.7587807771698687, "learning_rate": 1.6388167093576083e-05, "loss": 0.8454, "step": 1833 }, { "epoch": 0.301223618296789, "grad_norm": 2.347275415925442, "learning_rate": 1.638407302718108e-05, "loss": 0.7793, "step": 1834 }, { "epoch": 0.3013878623634721, "grad_norm": 4.295282931335049, "learning_rate": 1.6379977153787637e-05, "loss": 0.7937, "step": 1835 }, { "epoch": 0.30155210643015523, "grad_norm": 1.8861771126791596, "learning_rate": 1.6375879474555084e-05, "loss": 0.8851, "step": 1836 }, { "epoch": 0.3017163504968383, "grad_norm": 0.7061475403103725, "learning_rate": 1.637177999064326e-05, "loss": 0.3377, "step": 1837 }, { "epoch": 0.3018805945635214, "grad_norm": 1.956104726219926, "learning_rate": 1.6367678703212515e-05, "loss": 0.8987, "step": 1838 }, { "epoch": 0.3020448386302045, "grad_norm": 2.2149728324516835, "learning_rate": 1.636357561342371e-05, "loss": 0.83, "step": 1839 }, { "epoch": 0.30220908269688757, "grad_norm": 2.15824964353182, "learning_rate": 1.6359470722438212e-05, "loss": 0.8751, "step": 1840 }, { "epoch": 0.30237332676357065, "grad_norm": 1.681104988603993, "learning_rate": 1.6355364031417903e-05, "loss": 0.8462, "step": 1841 }, { "epoch": 0.30253757083025373, "grad_norm": 1.6634906785713608, "learning_rate": 1.6351255541525182e-05, "loss": 0.8539, "step": 1842 }, { "epoch": 0.30270181489693687, "grad_norm": 1.376716945351399, "learning_rate": 1.6347145253922942e-05, "loss": 0.8886, "step": 1843 }, { "epoch": 0.30286605896361996, "grad_norm": 1.7188618095765702, "learning_rate": 1.6343033169774587e-05, "loss": 0.8394, "step": 1844 }, { "epoch": 0.30303030303030304, "grad_norm": 1.9610753967831724, "learning_rate": 1.633891929024404e-05, "loss": 0.8222, "step": 1845 }, { "epoch": 0.3031945470969861, "grad_norm": 1.8417273353397214, "learning_rate": 1.6334803616495722e-05, "loss": 0.8784, "step": 1846 }, { "epoch": 0.3033587911636692, "grad_norm": 1.4891984208937943, "learning_rate": 1.633068614969457e-05, "loss": 0.8106, "step": 1847 }, { "epoch": 0.3035230352303523, "grad_norm": 1.906621408217196, "learning_rate": 1.632656689100602e-05, "loss": 0.8387, "step": 1848 }, { "epoch": 0.3036872792970354, "grad_norm": 1.79999255931988, "learning_rate": 1.632244584159602e-05, "loss": 0.8756, "step": 1849 }, { "epoch": 0.3038515233637185, "grad_norm": 1.6593842000426928, "learning_rate": 1.631832300263103e-05, "loss": 0.8313, "step": 1850 }, { "epoch": 0.3040157674304016, "grad_norm": 1.5065945164912649, "learning_rate": 1.6314198375278003e-05, "loss": 0.7804, "step": 1851 }, { "epoch": 0.3041800114970847, "grad_norm": 2.378092592552877, "learning_rate": 1.6310071960704412e-05, "loss": 0.787, "step": 1852 }, { "epoch": 0.30434425556376776, "grad_norm": 3.797814038744849, "learning_rate": 1.6305943760078226e-05, "loss": 0.8007, "step": 1853 }, { "epoch": 0.30450849963045085, "grad_norm": 1.701547005076689, "learning_rate": 1.630181377456793e-05, "loss": 0.8698, "step": 1854 }, { "epoch": 0.30467274369713393, "grad_norm": 1.6527761487923467, "learning_rate": 1.6297682005342497e-05, "loss": 0.8506, "step": 1855 }, { "epoch": 0.304836987763817, "grad_norm": 1.719457220944166, "learning_rate": 1.6293548453571422e-05, "loss": 0.7652, "step": 1856 }, { "epoch": 0.3050012318305001, "grad_norm": 1.947689935091947, "learning_rate": 1.62894131204247e-05, "loss": 0.7623, "step": 1857 }, { "epoch": 0.30516547589718324, "grad_norm": 2.17736311660786, "learning_rate": 1.628527600707283e-05, "loss": 0.7817, "step": 1858 }, { "epoch": 0.3053297199638663, "grad_norm": 1.7489015006579554, "learning_rate": 1.62811371146868e-05, "loss": 0.8266, "step": 1859 }, { "epoch": 0.3054939640305494, "grad_norm": 2.6596924967835878, "learning_rate": 1.627699644443813e-05, "loss": 0.7482, "step": 1860 }, { "epoch": 0.3056582080972325, "grad_norm": 1.6829659113829556, "learning_rate": 1.6272853997498822e-05, "loss": 0.8387, "step": 1861 }, { "epoch": 0.30582245216391557, "grad_norm": 2.7171807605282043, "learning_rate": 1.6268709775041385e-05, "loss": 0.8474, "step": 1862 }, { "epoch": 0.30598669623059865, "grad_norm": 0.8153334602645896, "learning_rate": 1.6264563778238834e-05, "loss": 0.3524, "step": 1863 }, { "epoch": 0.30615094029728174, "grad_norm": 0.7342256731536813, "learning_rate": 1.6260416008264685e-05, "loss": 0.3167, "step": 1864 }, { "epoch": 0.3063151843639649, "grad_norm": 0.7204763426187445, "learning_rate": 1.625626646629296e-05, "loss": 0.3589, "step": 1865 }, { "epoch": 0.30647942843064796, "grad_norm": 1.889464785874698, "learning_rate": 1.625211515349817e-05, "loss": 0.7925, "step": 1866 }, { "epoch": 0.30664367249733104, "grad_norm": 2.0381115559877068, "learning_rate": 1.624796207105534e-05, "loss": 0.8509, "step": 1867 }, { "epoch": 0.3068079165640141, "grad_norm": 1.76032810535131, "learning_rate": 1.6243807220139988e-05, "loss": 0.8785, "step": 1868 }, { "epoch": 0.3069721606306972, "grad_norm": 1.7047642592656402, "learning_rate": 1.623965060192814e-05, "loss": 0.9106, "step": 1869 }, { "epoch": 0.3071364046973803, "grad_norm": 0.8895921738126732, "learning_rate": 1.623549221759632e-05, "loss": 0.3457, "step": 1870 }, { "epoch": 0.3073006487640634, "grad_norm": 1.836145266556743, "learning_rate": 1.6231332068321538e-05, "loss": 0.88, "step": 1871 }, { "epoch": 0.3074648928307465, "grad_norm": 1.9575690095997866, "learning_rate": 1.622717015528133e-05, "loss": 0.8264, "step": 1872 }, { "epoch": 0.3076291368974296, "grad_norm": 2.007898183179511, "learning_rate": 1.6223006479653708e-05, "loss": 0.8391, "step": 1873 }, { "epoch": 0.3077933809641127, "grad_norm": 5.678201417946265, "learning_rate": 1.6218841042617196e-05, "loss": 0.7562, "step": 1874 }, { "epoch": 0.30795762503079577, "grad_norm": 1.5280911630869711, "learning_rate": 1.621467384535081e-05, "loss": 0.7606, "step": 1875 }, { "epoch": 0.30812186909747885, "grad_norm": 1.8954378989397893, "learning_rate": 1.6210504889034063e-05, "loss": 0.7819, "step": 1876 }, { "epoch": 0.30828611316416193, "grad_norm": 1.459748027855536, "learning_rate": 1.6206334174846974e-05, "loss": 0.8634, "step": 1877 }, { "epoch": 0.308450357230845, "grad_norm": 1.7872718150749565, "learning_rate": 1.6202161703970057e-05, "loss": 0.8083, "step": 1878 }, { "epoch": 0.3086146012975281, "grad_norm": 2.0289520979129567, "learning_rate": 1.6197987477584315e-05, "loss": 0.8205, "step": 1879 }, { "epoch": 0.30877884536421124, "grad_norm": 1.5926293967947496, "learning_rate": 1.6193811496871256e-05, "loss": 0.857, "step": 1880 }, { "epoch": 0.3089430894308943, "grad_norm": 2.733398613500424, "learning_rate": 1.6189633763012885e-05, "loss": 0.7775, "step": 1881 }, { "epoch": 0.3091073334975774, "grad_norm": 2.084965187965568, "learning_rate": 1.61854542771917e-05, "loss": 0.8692, "step": 1882 }, { "epoch": 0.3092715775642605, "grad_norm": 1.7612851390587847, "learning_rate": 1.6181273040590696e-05, "loss": 0.7645, "step": 1883 }, { "epoch": 0.3094358216309436, "grad_norm": 2.0109011657487192, "learning_rate": 1.617709005439336e-05, "loss": 0.829, "step": 1884 }, { "epoch": 0.30960006569762666, "grad_norm": 2.128874120927319, "learning_rate": 1.617290531978368e-05, "loss": 0.8576, "step": 1885 }, { "epoch": 0.30976430976430974, "grad_norm": 1.599341375993444, "learning_rate": 1.6168718837946133e-05, "loss": 0.8064, "step": 1886 }, { "epoch": 0.3099285538309929, "grad_norm": 1.8103576188959296, "learning_rate": 1.61645306100657e-05, "loss": 0.8297, "step": 1887 }, { "epoch": 0.31009279789767596, "grad_norm": 2.2318103955773108, "learning_rate": 1.616034063732785e-05, "loss": 0.8141, "step": 1888 }, { "epoch": 0.31025704196435905, "grad_norm": 2.034708647155486, "learning_rate": 1.6156148920918538e-05, "loss": 0.8234, "step": 1889 }, { "epoch": 0.31042128603104213, "grad_norm": 1.8271615825608734, "learning_rate": 1.6151955462024225e-05, "loss": 0.7965, "step": 1890 }, { "epoch": 0.3105855300977252, "grad_norm": 2.027354384460302, "learning_rate": 1.6147760261831866e-05, "loss": 0.8284, "step": 1891 }, { "epoch": 0.3107497741644083, "grad_norm": 1.5220555447903699, "learning_rate": 1.6143563321528893e-05, "loss": 0.8828, "step": 1892 }, { "epoch": 0.3109140182310914, "grad_norm": 1.6846513942936743, "learning_rate": 1.613936464230325e-05, "loss": 0.85, "step": 1893 }, { "epoch": 0.31107826229777447, "grad_norm": 2.0110853990724817, "learning_rate": 1.6135164225343357e-05, "loss": 0.8351, "step": 1894 }, { "epoch": 0.3112425063644576, "grad_norm": 1.6061520692902311, "learning_rate": 1.613096207183814e-05, "loss": 0.8801, "step": 1895 }, { "epoch": 0.3114067504311407, "grad_norm": 2.348151414219099, "learning_rate": 1.6126758182977007e-05, "loss": 0.7554, "step": 1896 }, { "epoch": 0.31157099449782377, "grad_norm": 1.407503129696029, "learning_rate": 1.612255255994986e-05, "loss": 0.7991, "step": 1897 }, { "epoch": 0.31173523856450686, "grad_norm": 1.7982338021877655, "learning_rate": 1.6118345203947093e-05, "loss": 0.8701, "step": 1898 }, { "epoch": 0.31189948263118994, "grad_norm": 2.0247472543159035, "learning_rate": 1.6114136116159585e-05, "loss": 0.8449, "step": 1899 }, { "epoch": 0.312063726697873, "grad_norm": 2.8368005305368396, "learning_rate": 1.6109925297778717e-05, "loss": 0.8286, "step": 1900 }, { "epoch": 0.3122279707645561, "grad_norm": 1.863945364871219, "learning_rate": 1.6105712749996345e-05, "loss": 0.8106, "step": 1901 }, { "epoch": 0.31239221483123925, "grad_norm": 1.8727842816862894, "learning_rate": 1.610149847400482e-05, "loss": 0.8334, "step": 1902 }, { "epoch": 0.31255645889792233, "grad_norm": 1.4854456502986035, "learning_rate": 1.6097282470996997e-05, "loss": 0.8441, "step": 1903 }, { "epoch": 0.3127207029646054, "grad_norm": 0.729107502192457, "learning_rate": 1.609306474216619e-05, "loss": 0.3621, "step": 1904 }, { "epoch": 0.3128849470312885, "grad_norm": 0.6989751551263128, "learning_rate": 1.608884528870623e-05, "loss": 0.3352, "step": 1905 }, { "epoch": 0.3130491910979716, "grad_norm": 1.7536808512160524, "learning_rate": 1.608462411181142e-05, "loss": 0.8315, "step": 1906 }, { "epoch": 0.31321343516465466, "grad_norm": 2.2741256218334502, "learning_rate": 1.6080401212676558e-05, "loss": 0.7785, "step": 1907 }, { "epoch": 0.31337767923133775, "grad_norm": 1.9608356229222124, "learning_rate": 1.6076176592496926e-05, "loss": 0.7806, "step": 1908 }, { "epoch": 0.3135419232980209, "grad_norm": 1.8140458145890956, "learning_rate": 1.6071950252468288e-05, "loss": 0.7631, "step": 1909 }, { "epoch": 0.31370616736470397, "grad_norm": 1.971812616804373, "learning_rate": 1.6067722193786907e-05, "loss": 0.9225, "step": 1910 }, { "epoch": 0.31387041143138705, "grad_norm": 1.7787551340641174, "learning_rate": 1.6063492417649528e-05, "loss": 0.8355, "step": 1911 }, { "epoch": 0.31403465549807014, "grad_norm": 2.2931232777192787, "learning_rate": 1.605926092525337e-05, "loss": 0.7631, "step": 1912 }, { "epoch": 0.3141988995647532, "grad_norm": 1.9410283282407064, "learning_rate": 1.605502771779616e-05, "loss": 0.7733, "step": 1913 }, { "epoch": 0.3143631436314363, "grad_norm": 2.142768552213902, "learning_rate": 1.6050792796476092e-05, "loss": 0.8031, "step": 1914 }, { "epoch": 0.3145273876981194, "grad_norm": 1.950478576384539, "learning_rate": 1.6046556162491852e-05, "loss": 0.878, "step": 1915 }, { "epoch": 0.31469163176480247, "grad_norm": 1.6201694589017823, "learning_rate": 1.604231781704261e-05, "loss": 0.8173, "step": 1916 }, { "epoch": 0.3148558758314856, "grad_norm": 0.8334982541201804, "learning_rate": 1.6038077761328024e-05, "loss": 0.3461, "step": 1917 }, { "epoch": 0.3150201198981687, "grad_norm": 1.993527234537132, "learning_rate": 1.603383599654823e-05, "loss": 0.8478, "step": 1918 }, { "epoch": 0.3151843639648518, "grad_norm": 0.7112991221081528, "learning_rate": 1.602959252390385e-05, "loss": 0.3341, "step": 1919 }, { "epoch": 0.31534860803153486, "grad_norm": 2.01303775339127, "learning_rate": 1.602534734459599e-05, "loss": 0.8169, "step": 1920 }, { "epoch": 0.31551285209821794, "grad_norm": 1.7605964850904443, "learning_rate": 1.6021100459826243e-05, "loss": 0.8828, "step": 1921 }, { "epoch": 0.315677096164901, "grad_norm": 1.759755740300291, "learning_rate": 1.601685187079668e-05, "loss": 0.847, "step": 1922 }, { "epoch": 0.3158413402315841, "grad_norm": 1.8912424391069256, "learning_rate": 1.601260157870985e-05, "loss": 0.8314, "step": 1923 }, { "epoch": 0.31600558429826725, "grad_norm": 2.0298709854410752, "learning_rate": 1.6008349584768793e-05, "loss": 0.8536, "step": 1924 }, { "epoch": 0.31616982836495033, "grad_norm": 1.609879456145524, "learning_rate": 1.6004095890177026e-05, "loss": 0.7864, "step": 1925 }, { "epoch": 0.3163340724316334, "grad_norm": 1.6723913313637702, "learning_rate": 1.599984049613855e-05, "loss": 0.8248, "step": 1926 }, { "epoch": 0.3164983164983165, "grad_norm": 2.5548527318668017, "learning_rate": 1.5995583403857845e-05, "loss": 0.7469, "step": 1927 }, { "epoch": 0.3166625605649996, "grad_norm": 2.4253127237735486, "learning_rate": 1.599132461453987e-05, "loss": 0.8518, "step": 1928 }, { "epoch": 0.31682680463168267, "grad_norm": 1.769373296150159, "learning_rate": 1.5987064129390066e-05, "loss": 0.7468, "step": 1929 }, { "epoch": 0.31699104869836575, "grad_norm": 1.7851045364997342, "learning_rate": 1.5982801949614358e-05, "loss": 0.808, "step": 1930 }, { "epoch": 0.31715529276504884, "grad_norm": 1.7486355652682695, "learning_rate": 1.5978538076419143e-05, "loss": 0.7895, "step": 1931 }, { "epoch": 0.317319536831732, "grad_norm": 1.8210245010059374, "learning_rate": 1.5974272511011305e-05, "loss": 0.8207, "step": 1932 }, { "epoch": 0.31748378089841506, "grad_norm": 2.182888417269087, "learning_rate": 1.5970005254598204e-05, "loss": 0.7566, "step": 1933 }, { "epoch": 0.31764802496509814, "grad_norm": 1.758955018886119, "learning_rate": 1.5965736308387668e-05, "loss": 0.774, "step": 1934 }, { "epoch": 0.3178122690317812, "grad_norm": 1.4613961865360263, "learning_rate": 1.5961465673588027e-05, "loss": 0.8521, "step": 1935 }, { "epoch": 0.3179765130984643, "grad_norm": 1.6744458488516862, "learning_rate": 1.5957193351408065e-05, "loss": 0.7735, "step": 1936 }, { "epoch": 0.3181407571651474, "grad_norm": 1.8600822334156615, "learning_rate": 1.595291934305706e-05, "loss": 0.768, "step": 1937 }, { "epoch": 0.3183050012318305, "grad_norm": 1.50127926008538, "learning_rate": 1.594864364974476e-05, "loss": 0.8627, "step": 1938 }, { "epoch": 0.3184692452985136, "grad_norm": 1.9974721728912161, "learning_rate": 1.5944366272681386e-05, "loss": 0.7737, "step": 1939 }, { "epoch": 0.3186334893651967, "grad_norm": 2.423845750053685, "learning_rate": 1.5940087213077648e-05, "loss": 0.8301, "step": 1940 }, { "epoch": 0.3187977334318798, "grad_norm": 2.717203272739224, "learning_rate": 1.593580647214472e-05, "loss": 0.8227, "step": 1941 }, { "epoch": 0.31896197749856287, "grad_norm": 1.9748847285977904, "learning_rate": 1.5931524051094254e-05, "loss": 0.8607, "step": 1942 }, { "epoch": 0.31912622156524595, "grad_norm": 1.8411828342342034, "learning_rate": 1.592723995113839e-05, "loss": 0.783, "step": 1943 }, { "epoch": 0.31929046563192903, "grad_norm": 1.8048816586347352, "learning_rate": 1.5922954173489726e-05, "loss": 0.7723, "step": 1944 }, { "epoch": 0.3194547096986121, "grad_norm": 1.9109999918110256, "learning_rate": 1.5918666719361346e-05, "loss": 0.7356, "step": 1945 }, { "epoch": 0.31961895376529526, "grad_norm": 2.016035523763281, "learning_rate": 1.5914377589966798e-05, "loss": 0.8362, "step": 1946 }, { "epoch": 0.31978319783197834, "grad_norm": 2.148916155792717, "learning_rate": 1.5910086786520118e-05, "loss": 0.7731, "step": 1947 }, { "epoch": 0.3199474418986614, "grad_norm": 2.5353748284261424, "learning_rate": 1.5905794310235808e-05, "loss": 0.7772, "step": 1948 }, { "epoch": 0.3201116859653445, "grad_norm": 2.151613842587014, "learning_rate": 1.590150016232884e-05, "loss": 0.8387, "step": 1949 }, { "epoch": 0.3202759300320276, "grad_norm": 1.7005442852519415, "learning_rate": 1.589720434401467e-05, "loss": 0.7841, "step": 1950 }, { "epoch": 0.3204401740987107, "grad_norm": 1.644698719064845, "learning_rate": 1.5892906856509214e-05, "loss": 0.831, "step": 1951 }, { "epoch": 0.32060441816539376, "grad_norm": 1.6715560960551386, "learning_rate": 1.5888607701028877e-05, "loss": 0.8154, "step": 1952 }, { "epoch": 0.32076866223207684, "grad_norm": 1.7706704405994464, "learning_rate": 1.5884306878790512e-05, "loss": 0.8235, "step": 1953 }, { "epoch": 0.32093290629876, "grad_norm": 2.0379255699997105, "learning_rate": 1.5880004391011464e-05, "loss": 0.8969, "step": 1954 }, { "epoch": 0.32109715036544306, "grad_norm": 1.7989695683236433, "learning_rate": 1.5875700238909547e-05, "loss": 0.8906, "step": 1955 }, { "epoch": 0.32126139443212615, "grad_norm": 1.5798466076700584, "learning_rate": 1.5871394423703036e-05, "loss": 0.8484, "step": 1956 }, { "epoch": 0.32142563849880923, "grad_norm": 0.8494701442084591, "learning_rate": 1.5867086946610687e-05, "loss": 0.3875, "step": 1957 }, { "epoch": 0.3215898825654923, "grad_norm": 1.5378704636413927, "learning_rate": 1.586277780885172e-05, "loss": 0.8318, "step": 1958 }, { "epoch": 0.3217541266321754, "grad_norm": 1.9251455157588402, "learning_rate": 1.585846701164583e-05, "loss": 0.8133, "step": 1959 }, { "epoch": 0.3219183706988585, "grad_norm": 1.9056826276724734, "learning_rate": 1.585415455621318e-05, "loss": 0.8522, "step": 1960 }, { "epoch": 0.3220826147655416, "grad_norm": 1.97616560453577, "learning_rate": 1.5849840443774393e-05, "loss": 0.8096, "step": 1961 }, { "epoch": 0.3222468588322247, "grad_norm": 1.6247696380110779, "learning_rate": 1.584552467555058e-05, "loss": 0.8903, "step": 1962 }, { "epoch": 0.3224111028989078, "grad_norm": 19.374407770652837, "learning_rate": 1.58412072527633e-05, "loss": 0.8403, "step": 1963 }, { "epoch": 0.32257534696559087, "grad_norm": 1.525975655624924, "learning_rate": 1.58368881766346e-05, "loss": 0.7265, "step": 1964 }, { "epoch": 0.32273959103227395, "grad_norm": 1.732189657475418, "learning_rate": 1.5832567448386985e-05, "loss": 0.7664, "step": 1965 }, { "epoch": 0.32290383509895704, "grad_norm": 4.434842274285221, "learning_rate": 1.5828245069243417e-05, "loss": 0.818, "step": 1966 }, { "epoch": 0.3230680791656401, "grad_norm": 2.066450609590074, "learning_rate": 1.5823921040427348e-05, "loss": 0.6831, "step": 1967 }, { "epoch": 0.32323232323232326, "grad_norm": 1.716305422246661, "learning_rate": 1.5819595363162682e-05, "loss": 0.7078, "step": 1968 }, { "epoch": 0.32339656729900634, "grad_norm": 2.154694771745877, "learning_rate": 1.5815268038673786e-05, "loss": 0.7678, "step": 1969 }, { "epoch": 0.3235608113656894, "grad_norm": 1.7270753766670237, "learning_rate": 1.581093906818551e-05, "loss": 0.7802, "step": 1970 }, { "epoch": 0.3237250554323725, "grad_norm": 1.5420145032679995, "learning_rate": 1.5806608452923158e-05, "loss": 0.888, "step": 1971 }, { "epoch": 0.3238892994990556, "grad_norm": 2.521329931756971, "learning_rate": 1.5802276194112498e-05, "loss": 0.7602, "step": 1972 }, { "epoch": 0.3240535435657387, "grad_norm": 1.5709984021324783, "learning_rate": 1.5797942292979767e-05, "loss": 0.7446, "step": 1973 }, { "epoch": 0.32421778763242176, "grad_norm": 1.6102789045048882, "learning_rate": 1.5793606750751668e-05, "loss": 0.7405, "step": 1974 }, { "epoch": 0.32438203169910484, "grad_norm": 1.7518574067492336, "learning_rate": 1.578926956865537e-05, "loss": 0.7622, "step": 1975 }, { "epoch": 0.324546275765788, "grad_norm": 1.624023741680514, "learning_rate": 1.5784930747918492e-05, "loss": 0.816, "step": 1976 }, { "epoch": 0.32471051983247107, "grad_norm": 2.119092792123406, "learning_rate": 1.578059028976914e-05, "loss": 0.9259, "step": 1977 }, { "epoch": 0.32487476389915415, "grad_norm": 1.6266013687822016, "learning_rate": 1.577624819543587e-05, "loss": 0.879, "step": 1978 }, { "epoch": 0.32503900796583723, "grad_norm": 2.402755314011279, "learning_rate": 1.57719044661477e-05, "loss": 0.8003, "step": 1979 }, { "epoch": 0.3252032520325203, "grad_norm": 2.4696475765506225, "learning_rate": 1.5767559103134114e-05, "loss": 0.8026, "step": 1980 }, { "epoch": 0.3253674960992034, "grad_norm": 1.8110734041104148, "learning_rate": 1.5763212107625055e-05, "loss": 0.9279, "step": 1981 }, { "epoch": 0.3255317401658865, "grad_norm": 6.103558933676176, "learning_rate": 1.5758863480850936e-05, "loss": 0.7852, "step": 1982 }, { "epoch": 0.3256959842325696, "grad_norm": 2.8383537736716558, "learning_rate": 1.5754513224042625e-05, "loss": 0.7967, "step": 1983 }, { "epoch": 0.3258602282992527, "grad_norm": 4.693822034736458, "learning_rate": 1.5750161338431452e-05, "loss": 0.7537, "step": 1984 }, { "epoch": 0.3260244723659358, "grad_norm": 3.9768403320173054, "learning_rate": 1.5745807825249208e-05, "loss": 0.7245, "step": 1985 }, { "epoch": 0.3261887164326189, "grad_norm": 1.458841840354943, "learning_rate": 1.574145268572815e-05, "loss": 0.8179, "step": 1986 }, { "epoch": 0.32635296049930196, "grad_norm": 2.0498913216123573, "learning_rate": 1.5737095921100983e-05, "loss": 0.7471, "step": 1987 }, { "epoch": 0.32651720456598504, "grad_norm": 1.931260639267065, "learning_rate": 1.573273753260089e-05, "loss": 0.8243, "step": 1988 }, { "epoch": 0.3266814486326681, "grad_norm": 1.8384984495622874, "learning_rate": 1.5728377521461496e-05, "loss": 0.7647, "step": 1989 }, { "epoch": 0.3268456926993512, "grad_norm": 2.092214886473593, "learning_rate": 1.57240158889169e-05, "loss": 0.8065, "step": 1990 }, { "epoch": 0.32700993676603435, "grad_norm": 2.240594985864302, "learning_rate": 1.5719652636201646e-05, "loss": 0.8131, "step": 1991 }, { "epoch": 0.32717418083271743, "grad_norm": 1.9065360248880199, "learning_rate": 1.5715287764550745e-05, "loss": 0.8303, "step": 1992 }, { "epoch": 0.3273384248994005, "grad_norm": 1.65913281931474, "learning_rate": 1.571092127519967e-05, "loss": 0.7733, "step": 1993 }, { "epoch": 0.3275026689660836, "grad_norm": 1.433919319176288, "learning_rate": 1.570655316938434e-05, "loss": 0.7829, "step": 1994 }, { "epoch": 0.3276669130327667, "grad_norm": 1.9075661197137375, "learning_rate": 1.5702183448341143e-05, "loss": 0.827, "step": 1995 }, { "epoch": 0.32783115709944977, "grad_norm": 1.8594715168667146, "learning_rate": 1.5697812113306917e-05, "loss": 0.8419, "step": 1996 }, { "epoch": 0.32799540116613285, "grad_norm": 1.737900650521058, "learning_rate": 1.5693439165518957e-05, "loss": 0.8541, "step": 1997 }, { "epoch": 0.328159645232816, "grad_norm": 1.8610373884785556, "learning_rate": 1.568906460621502e-05, "loss": 0.7164, "step": 1998 }, { "epoch": 0.32832388929949907, "grad_norm": 1.6947036450453976, "learning_rate": 1.5684688436633314e-05, "loss": 0.8094, "step": 1999 }, { "epoch": 0.32848813336618216, "grad_norm": 1.9958821873191697, "learning_rate": 1.5680310658012507e-05, "loss": 0.7833, "step": 2000 }, { "epoch": 0.32865237743286524, "grad_norm": 1.439071805054242, "learning_rate": 1.5675931271591717e-05, "loss": 0.8202, "step": 2001 }, { "epoch": 0.3288166214995483, "grad_norm": 0.7713432375749639, "learning_rate": 1.5671550278610526e-05, "loss": 0.3533, "step": 2002 }, { "epoch": 0.3289808655662314, "grad_norm": 0.7028617804265109, "learning_rate": 1.566716768030896e-05, "loss": 0.348, "step": 2003 }, { "epoch": 0.3291451096329145, "grad_norm": 2.074705316178608, "learning_rate": 1.56627834779275e-05, "loss": 0.8076, "step": 2004 }, { "epoch": 0.32930935369959763, "grad_norm": 2.266863613717359, "learning_rate": 1.5658397672707093e-05, "loss": 0.7954, "step": 2005 }, { "epoch": 0.3294735977662807, "grad_norm": 2.182600466502276, "learning_rate": 1.565401026588913e-05, "loss": 0.735, "step": 2006 }, { "epoch": 0.3296378418329638, "grad_norm": 2.1409573032040514, "learning_rate": 1.5649621258715454e-05, "loss": 0.7661, "step": 2007 }, { "epoch": 0.3298020858996469, "grad_norm": 2.084331628716839, "learning_rate": 1.5645230652428367e-05, "loss": 0.9274, "step": 2008 }, { "epoch": 0.32996632996632996, "grad_norm": 2.8885193449846116, "learning_rate": 1.564083844827062e-05, "loss": 0.7311, "step": 2009 }, { "epoch": 0.33013057403301305, "grad_norm": 1.8980549408651557, "learning_rate": 1.563644464748542e-05, "loss": 0.8004, "step": 2010 }, { "epoch": 0.33029481809969613, "grad_norm": 1.7669892140165961, "learning_rate": 1.563204925131642e-05, "loss": 0.8541, "step": 2011 }, { "epoch": 0.3304590621663792, "grad_norm": 1.981156096172153, "learning_rate": 1.5627652261007726e-05, "loss": 0.9217, "step": 2012 }, { "epoch": 0.33062330623306235, "grad_norm": 1.761947115709835, "learning_rate": 1.5623253677803897e-05, "loss": 0.8098, "step": 2013 }, { "epoch": 0.33078755029974544, "grad_norm": 1.5620145073935534, "learning_rate": 1.5618853502949948e-05, "loss": 0.8614, "step": 2014 }, { "epoch": 0.3309517943664285, "grad_norm": 2.377232042970147, "learning_rate": 1.5614451737691335e-05, "loss": 0.7606, "step": 2015 }, { "epoch": 0.3311160384331116, "grad_norm": 3.687063941930215, "learning_rate": 1.561004838327397e-05, "loss": 0.7641, "step": 2016 }, { "epoch": 0.3312802824997947, "grad_norm": 1.7225545933032413, "learning_rate": 1.5605643440944213e-05, "loss": 0.8276, "step": 2017 }, { "epoch": 0.33144452656647777, "grad_norm": 1.7605738331476084, "learning_rate": 1.5601236911948876e-05, "loss": 0.8092, "step": 2018 }, { "epoch": 0.33160877063316085, "grad_norm": 1.7004486943069796, "learning_rate": 1.559682879753521e-05, "loss": 0.8594, "step": 2019 }, { "epoch": 0.331773014699844, "grad_norm": 3.68594949623202, "learning_rate": 1.559241909895093e-05, "loss": 0.7897, "step": 2020 }, { "epoch": 0.3319372587665271, "grad_norm": 1.6888192162286813, "learning_rate": 1.558800781744419e-05, "loss": 0.8986, "step": 2021 }, { "epoch": 0.33210150283321016, "grad_norm": 2.0147994927606097, "learning_rate": 1.5583594954263593e-05, "loss": 0.7401, "step": 2022 }, { "epoch": 0.33226574689989324, "grad_norm": 1.7950658598713973, "learning_rate": 1.5579180510658187e-05, "loss": 0.7967, "step": 2023 }, { "epoch": 0.3324299909665763, "grad_norm": 2.2044497391068285, "learning_rate": 1.557476448787748e-05, "loss": 0.7683, "step": 2024 }, { "epoch": 0.3325942350332594, "grad_norm": 1.6426097261140884, "learning_rate": 1.557034688717141e-05, "loss": 0.8429, "step": 2025 }, { "epoch": 0.3327584790999425, "grad_norm": 2.29277485493381, "learning_rate": 1.5565927709790377e-05, "loss": 0.7806, "step": 2026 }, { "epoch": 0.3329227231666256, "grad_norm": 2.153603732697247, "learning_rate": 1.5561506956985213e-05, "loss": 0.8789, "step": 2027 }, { "epoch": 0.3330869672333087, "grad_norm": 1.9694034416174662, "learning_rate": 1.5557084630007206e-05, "loss": 0.837, "step": 2028 }, { "epoch": 0.3332512112999918, "grad_norm": 1.601178479370574, "learning_rate": 1.5552660730108084e-05, "loss": 0.8687, "step": 2029 }, { "epoch": 0.3334154553666749, "grad_norm": 2.200003265377085, "learning_rate": 1.5548235258540023e-05, "loss": 0.8154, "step": 2030 }, { "epoch": 0.33357969943335797, "grad_norm": 1.7360039024718055, "learning_rate": 1.5543808216555645e-05, "loss": 0.7354, "step": 2031 }, { "epoch": 0.33374394350004105, "grad_norm": 1.8509508166472781, "learning_rate": 1.5539379605408015e-05, "loss": 0.7542, "step": 2032 }, { "epoch": 0.33390818756672414, "grad_norm": 2.248906246252484, "learning_rate": 1.5534949426350642e-05, "loss": 0.7872, "step": 2033 }, { "epoch": 0.3340724316334072, "grad_norm": 1.6882442619544726, "learning_rate": 1.5530517680637478e-05, "loss": 0.8126, "step": 2034 }, { "epoch": 0.33423667570009036, "grad_norm": 2.136024818011225, "learning_rate": 1.552608436952292e-05, "loss": 0.815, "step": 2035 }, { "epoch": 0.33440091976677344, "grad_norm": 2.0577097575554903, "learning_rate": 1.552164949426181e-05, "loss": 0.7981, "step": 2036 }, { "epoch": 0.3345651638334565, "grad_norm": 1.571163238878308, "learning_rate": 1.551721305610942e-05, "loss": 0.8166, "step": 2037 }, { "epoch": 0.3347294079001396, "grad_norm": 1.9222578367985343, "learning_rate": 1.551277505632149e-05, "loss": 0.7676, "step": 2038 }, { "epoch": 0.3348936519668227, "grad_norm": 1.757208057671307, "learning_rate": 1.550833549615417e-05, "loss": 0.745, "step": 2039 }, { "epoch": 0.3350578960335058, "grad_norm": 1.8919657671599492, "learning_rate": 1.550389437686408e-05, "loss": 0.7505, "step": 2040 }, { "epoch": 0.33522214010018886, "grad_norm": 1.799628630763172, "learning_rate": 1.549945169970827e-05, "loss": 0.8021, "step": 2041 }, { "epoch": 0.335386384166872, "grad_norm": 2.013648741632218, "learning_rate": 1.549500746594422e-05, "loss": 0.8774, "step": 2042 }, { "epoch": 0.3355506282335551, "grad_norm": 1.8794078416500883, "learning_rate": 1.549056167682987e-05, "loss": 0.7547, "step": 2043 }, { "epoch": 0.33571487230023817, "grad_norm": 1.570553835810196, "learning_rate": 1.5486114333623587e-05, "loss": 0.8333, "step": 2044 }, { "epoch": 0.33587911636692125, "grad_norm": 1.6218472709553786, "learning_rate": 1.5481665437584185e-05, "loss": 0.8633, "step": 2045 }, { "epoch": 0.33604336043360433, "grad_norm": 1.425438550222801, "learning_rate": 1.5477214989970916e-05, "loss": 0.7798, "step": 2046 }, { "epoch": 0.3362076045002874, "grad_norm": 0.9004566911251286, "learning_rate": 1.547276299204346e-05, "loss": 0.3697, "step": 2047 }, { "epoch": 0.3363718485669705, "grad_norm": 1.54931259667449, "learning_rate": 1.546830944506196e-05, "loss": 0.7728, "step": 2048 }, { "epoch": 0.3365360926336536, "grad_norm": 1.8799002033417245, "learning_rate": 1.5463854350286972e-05, "loss": 0.7699, "step": 2049 }, { "epoch": 0.3367003367003367, "grad_norm": 1.7751215656622428, "learning_rate": 1.5459397708979508e-05, "loss": 0.7436, "step": 2050 }, { "epoch": 0.3368645807670198, "grad_norm": 2.0400101159907313, "learning_rate": 1.5454939522401e-05, "loss": 0.8253, "step": 2051 }, { "epoch": 0.3370288248337029, "grad_norm": 1.8118294494207878, "learning_rate": 1.5450479791813348e-05, "loss": 0.7655, "step": 2052 }, { "epoch": 0.337193068900386, "grad_norm": 1.7024325139904881, "learning_rate": 1.544601851847885e-05, "loss": 0.7546, "step": 2053 }, { "epoch": 0.33735731296706906, "grad_norm": 2.08860643854194, "learning_rate": 1.544155570366027e-05, "loss": 0.8247, "step": 2054 }, { "epoch": 0.33752155703375214, "grad_norm": 1.99722586843416, "learning_rate": 1.5437091348620798e-05, "loss": 0.7494, "step": 2055 }, { "epoch": 0.3376858011004352, "grad_norm": 1.81029125145495, "learning_rate": 1.5432625454624054e-05, "loss": 0.8011, "step": 2056 }, { "epoch": 0.33785004516711836, "grad_norm": 2.0701533419189153, "learning_rate": 1.5428158022934106e-05, "loss": 0.8083, "step": 2057 }, { "epoch": 0.33801428923380145, "grad_norm": 1.7810298142779857, "learning_rate": 1.542368905481545e-05, "loss": 0.8034, "step": 2058 }, { "epoch": 0.33817853330048453, "grad_norm": 2.0222268125798974, "learning_rate": 1.5419218551533017e-05, "loss": 0.7702, "step": 2059 }, { "epoch": 0.3383427773671676, "grad_norm": 2.1410747223541224, "learning_rate": 1.5414746514352178e-05, "loss": 0.808, "step": 2060 }, { "epoch": 0.3385070214338507, "grad_norm": 2.15273946680317, "learning_rate": 1.5410272944538725e-05, "loss": 0.7899, "step": 2061 }, { "epoch": 0.3386712655005338, "grad_norm": 1.8180696531935656, "learning_rate": 1.5405797843358897e-05, "loss": 0.8048, "step": 2062 }, { "epoch": 0.33883550956721686, "grad_norm": 2.1484013802024364, "learning_rate": 1.5401321212079366e-05, "loss": 0.7659, "step": 2063 }, { "epoch": 0.3389997536339, "grad_norm": 1.5004802846053635, "learning_rate": 1.5396843051967225e-05, "loss": 0.8353, "step": 2064 }, { "epoch": 0.3391639977005831, "grad_norm": 2.1343861354911384, "learning_rate": 1.5392363364290016e-05, "loss": 0.843, "step": 2065 }, { "epoch": 0.33932824176726617, "grad_norm": 1.7684283842143436, "learning_rate": 1.53878821503157e-05, "loss": 0.8453, "step": 2066 }, { "epoch": 0.33949248583394925, "grad_norm": 2.0698085086872196, "learning_rate": 1.5383399411312673e-05, "loss": 0.8686, "step": 2067 }, { "epoch": 0.33965672990063234, "grad_norm": 1.577720577559077, "learning_rate": 1.5378915148549772e-05, "loss": 0.7271, "step": 2068 }, { "epoch": 0.3398209739673154, "grad_norm": 1.9348071968831742, "learning_rate": 1.5374429363296252e-05, "loss": 0.7894, "step": 2069 }, { "epoch": 0.3399852180339985, "grad_norm": 1.8148894136572127, "learning_rate": 1.536994205682181e-05, "loss": 0.8636, "step": 2070 }, { "epoch": 0.3401494621006816, "grad_norm": 1.806545101332243, "learning_rate": 1.536545323039657e-05, "loss": 0.7702, "step": 2071 }, { "epoch": 0.3403137061673647, "grad_norm": 1.5287753249989788, "learning_rate": 1.5360962885291074e-05, "loss": 0.7613, "step": 2072 }, { "epoch": 0.3404779502340478, "grad_norm": 1.9887922509373335, "learning_rate": 1.5356471022776315e-05, "loss": 0.8186, "step": 2073 }, { "epoch": 0.3406421943007309, "grad_norm": 1.5805709817124118, "learning_rate": 1.5351977644123703e-05, "loss": 0.8082, "step": 2074 }, { "epoch": 0.340806438367414, "grad_norm": 0.7222567179659651, "learning_rate": 1.5347482750605072e-05, "loss": 0.3448, "step": 2075 }, { "epoch": 0.34097068243409706, "grad_norm": 1.4872941848093435, "learning_rate": 1.5342986343492704e-05, "loss": 0.782, "step": 2076 }, { "epoch": 0.34113492650078014, "grad_norm": 1.6995634562031294, "learning_rate": 1.533848842405929e-05, "loss": 0.8525, "step": 2077 }, { "epoch": 0.34129917056746323, "grad_norm": 2.0456231173773527, "learning_rate": 1.5333988993577958e-05, "loss": 0.7616, "step": 2078 }, { "epoch": 0.34146341463414637, "grad_norm": 1.7245398481020815, "learning_rate": 1.5329488053322266e-05, "loss": 0.7777, "step": 2079 }, { "epoch": 0.34162765870082945, "grad_norm": 1.8481799710470888, "learning_rate": 1.5324985604566194e-05, "loss": 0.7983, "step": 2080 }, { "epoch": 0.34179190276751253, "grad_norm": 1.7632346328587203, "learning_rate": 1.5320481648584147e-05, "loss": 0.8004, "step": 2081 }, { "epoch": 0.3419561468341956, "grad_norm": 1.9272454515520363, "learning_rate": 1.5315976186650962e-05, "loss": 0.8132, "step": 2082 }, { "epoch": 0.3421203909008787, "grad_norm": 2.2907075428751313, "learning_rate": 1.5311469220041903e-05, "loss": 0.8688, "step": 2083 }, { "epoch": 0.3422846349675618, "grad_norm": 2.957243212785512, "learning_rate": 1.5306960750032657e-05, "loss": 0.7656, "step": 2084 }, { "epoch": 0.34244887903424487, "grad_norm": 3.070883324352414, "learning_rate": 1.5302450777899332e-05, "loss": 0.745, "step": 2085 }, { "epoch": 0.34261312310092795, "grad_norm": 1.5951288601597409, "learning_rate": 1.529793930491847e-05, "loss": 0.7023, "step": 2086 }, { "epoch": 0.3427773671676111, "grad_norm": 2.8992666476280786, "learning_rate": 1.5293426332367034e-05, "loss": 0.7438, "step": 2087 }, { "epoch": 0.3429416112342942, "grad_norm": 1.9728143826309155, "learning_rate": 1.5288911861522413e-05, "loss": 0.771, "step": 2088 }, { "epoch": 0.34310585530097726, "grad_norm": 0.708747291909293, "learning_rate": 1.5284395893662414e-05, "loss": 0.3432, "step": 2089 }, { "epoch": 0.34327009936766034, "grad_norm": 3.547399841784927, "learning_rate": 1.5279878430065277e-05, "loss": 0.7305, "step": 2090 }, { "epoch": 0.3434343434343434, "grad_norm": 0.6368118629190467, "learning_rate": 1.5275359472009656e-05, "loss": 0.4183, "step": 2091 }, { "epoch": 0.3435985875010265, "grad_norm": 2.102481918235715, "learning_rate": 1.5270839020774638e-05, "loss": 0.873, "step": 2092 }, { "epoch": 0.3437628315677096, "grad_norm": 1.9435662923376753, "learning_rate": 1.526631707763972e-05, "loss": 0.8692, "step": 2093 }, { "epoch": 0.34392707563439273, "grad_norm": 1.5720458858730844, "learning_rate": 1.5261793643884835e-05, "loss": 0.8319, "step": 2094 }, { "epoch": 0.3440913197010758, "grad_norm": 1.845035969613856, "learning_rate": 1.5257268720790328e-05, "loss": 0.7312, "step": 2095 }, { "epoch": 0.3442555637677589, "grad_norm": 1.831282211492257, "learning_rate": 1.5252742309636972e-05, "loss": 0.8778, "step": 2096 }, { "epoch": 0.344419807834442, "grad_norm": 1.963086906232942, "learning_rate": 1.5248214411705955e-05, "loss": 0.7785, "step": 2097 }, { "epoch": 0.34458405190112507, "grad_norm": 1.4646410973832884, "learning_rate": 1.5243685028278888e-05, "loss": 0.8167, "step": 2098 }, { "epoch": 0.34474829596780815, "grad_norm": 1.633648324922326, "learning_rate": 1.5239154160637805e-05, "loss": 0.6974, "step": 2099 }, { "epoch": 0.34491254003449123, "grad_norm": 1.5591249146927835, "learning_rate": 1.5234621810065164e-05, "loss": 0.7587, "step": 2100 }, { "epoch": 0.34507678410117437, "grad_norm": 1.6690270555182676, "learning_rate": 1.5230087977843826e-05, "loss": 0.8077, "step": 2101 }, { "epoch": 0.34524102816785746, "grad_norm": 1.877563569067742, "learning_rate": 1.5225552665257092e-05, "loss": 0.7693, "step": 2102 }, { "epoch": 0.34540527223454054, "grad_norm": 2.0750233378600527, "learning_rate": 1.5221015873588672e-05, "loss": 0.7692, "step": 2103 }, { "epoch": 0.3455695163012236, "grad_norm": 1.7710544978204248, "learning_rate": 1.521647760412269e-05, "loss": 0.7255, "step": 2104 }, { "epoch": 0.3457337603679067, "grad_norm": 1.7491240766262421, "learning_rate": 1.5211937858143699e-05, "loss": 0.7897, "step": 2105 }, { "epoch": 0.3458980044345898, "grad_norm": 2.110684821865486, "learning_rate": 1.5207396636936662e-05, "loss": 0.847, "step": 2106 }, { "epoch": 0.3460622485012729, "grad_norm": 1.7523039127097269, "learning_rate": 1.520285394178696e-05, "loss": 0.7298, "step": 2107 }, { "epoch": 0.34622649256795596, "grad_norm": 1.6395738121441645, "learning_rate": 1.5198309773980397e-05, "loss": 0.8351, "step": 2108 }, { "epoch": 0.3463907366346391, "grad_norm": 1.6133765980986292, "learning_rate": 1.5193764134803188e-05, "loss": 0.7882, "step": 2109 }, { "epoch": 0.3465549807013222, "grad_norm": 1.7150938830463687, "learning_rate": 1.5189217025541969e-05, "loss": 0.8731, "step": 2110 }, { "epoch": 0.34671922476800526, "grad_norm": 1.4791104583832888, "learning_rate": 1.5184668447483785e-05, "loss": 0.8116, "step": 2111 }, { "epoch": 0.34688346883468835, "grad_norm": 2.8128195314545734, "learning_rate": 1.5180118401916107e-05, "loss": 0.8782, "step": 2112 }, { "epoch": 0.34704771290137143, "grad_norm": 1.9194310679878466, "learning_rate": 1.5175566890126812e-05, "loss": 0.8292, "step": 2113 }, { "epoch": 0.3472119569680545, "grad_norm": 1.4575296108326439, "learning_rate": 1.51710139134042e-05, "loss": 0.8105, "step": 2114 }, { "epoch": 0.3473762010347376, "grad_norm": 3.965054971341187, "learning_rate": 1.5166459473036977e-05, "loss": 0.7725, "step": 2115 }, { "epoch": 0.34754044510142074, "grad_norm": 1.8422853578813159, "learning_rate": 1.5161903570314268e-05, "loss": 0.784, "step": 2116 }, { "epoch": 0.3477046891681038, "grad_norm": 1.904457267974215, "learning_rate": 1.5157346206525613e-05, "loss": 0.7761, "step": 2117 }, { "epoch": 0.3478689332347869, "grad_norm": 1.6714448929081633, "learning_rate": 1.5152787382960968e-05, "loss": 0.7978, "step": 2118 }, { "epoch": 0.34803317730147, "grad_norm": 1.928471555587888, "learning_rate": 1.5148227100910691e-05, "loss": 0.7531, "step": 2119 }, { "epoch": 0.34819742136815307, "grad_norm": 1.8577601283165228, "learning_rate": 1.5143665361665565e-05, "loss": 0.8172, "step": 2120 }, { "epoch": 0.34836166543483615, "grad_norm": 1.617136879630862, "learning_rate": 1.5139102166516782e-05, "loss": 0.8378, "step": 2121 }, { "epoch": 0.34852590950151924, "grad_norm": 2.632626534499025, "learning_rate": 1.5134537516755938e-05, "loss": 0.8079, "step": 2122 }, { "epoch": 0.3486901535682023, "grad_norm": 2.0966333116129237, "learning_rate": 1.5129971413675055e-05, "loss": 0.817, "step": 2123 }, { "epoch": 0.34885439763488546, "grad_norm": 1.7353975190687165, "learning_rate": 1.5125403858566552e-05, "loss": 0.8032, "step": 2124 }, { "epoch": 0.34901864170156854, "grad_norm": 2.034430932626032, "learning_rate": 1.5120834852723276e-05, "loss": 0.652, "step": 2125 }, { "epoch": 0.3491828857682516, "grad_norm": 2.138829018943066, "learning_rate": 1.5116264397438465e-05, "loss": 0.8437, "step": 2126 }, { "epoch": 0.3493471298349347, "grad_norm": 1.8695668967779628, "learning_rate": 1.511169249400578e-05, "loss": 0.8448, "step": 2127 }, { "epoch": 0.3495113739016178, "grad_norm": 1.8253091821103833, "learning_rate": 1.510711914371929e-05, "loss": 0.8067, "step": 2128 }, { "epoch": 0.3496756179683009, "grad_norm": 1.4316764017493981, "learning_rate": 1.5102544347873469e-05, "loss": 0.8788, "step": 2129 }, { "epoch": 0.34983986203498396, "grad_norm": 1.6977237568235917, "learning_rate": 1.5097968107763205e-05, "loss": 0.7876, "step": 2130 }, { "epoch": 0.3500041061016671, "grad_norm": 1.6860336688517226, "learning_rate": 1.5093390424683796e-05, "loss": 0.756, "step": 2131 }, { "epoch": 0.3501683501683502, "grad_norm": 1.8826203869527744, "learning_rate": 1.5088811299930942e-05, "loss": 0.8246, "step": 2132 }, { "epoch": 0.35033259423503327, "grad_norm": 1.7476414365221504, "learning_rate": 1.5084230734800754e-05, "loss": 0.8243, "step": 2133 }, { "epoch": 0.35049683830171635, "grad_norm": 1.942071202028528, "learning_rate": 1.5079648730589753e-05, "loss": 0.8292, "step": 2134 }, { "epoch": 0.35066108236839943, "grad_norm": 1.517597038164093, "learning_rate": 1.5075065288594864e-05, "loss": 0.7821, "step": 2135 }, { "epoch": 0.3508253264350825, "grad_norm": 0.7407375659562457, "learning_rate": 1.5070480410113427e-05, "loss": 0.3475, "step": 2136 }, { "epoch": 0.3509895705017656, "grad_norm": 1.9786314246435122, "learning_rate": 1.5065894096443173e-05, "loss": 0.8848, "step": 2137 }, { "epoch": 0.35115381456844874, "grad_norm": 1.9477949199597948, "learning_rate": 1.5061306348882252e-05, "loss": 0.7927, "step": 2138 }, { "epoch": 0.3513180586351318, "grad_norm": 2.181033573650758, "learning_rate": 1.505671716872922e-05, "loss": 0.8472, "step": 2139 }, { "epoch": 0.3514823027018149, "grad_norm": 1.6059547183215837, "learning_rate": 1.5052126557283031e-05, "loss": 0.867, "step": 2140 }, { "epoch": 0.351646546768498, "grad_norm": 1.9149853465712694, "learning_rate": 1.5047534515843047e-05, "loss": 0.8966, "step": 2141 }, { "epoch": 0.3518107908351811, "grad_norm": 1.4899990928656492, "learning_rate": 1.5042941045709039e-05, "loss": 0.8246, "step": 2142 }, { "epoch": 0.35197503490186416, "grad_norm": 1.9154818478684417, "learning_rate": 1.5038346148181178e-05, "loss": 0.7948, "step": 2143 }, { "epoch": 0.35213927896854724, "grad_norm": 1.9225191399687533, "learning_rate": 1.5033749824560037e-05, "loss": 0.8283, "step": 2144 }, { "epoch": 0.3523035230352303, "grad_norm": 1.9980012802695732, "learning_rate": 1.50291520761466e-05, "loss": 0.8255, "step": 2145 }, { "epoch": 0.35246776710191347, "grad_norm": 1.6840052296595815, "learning_rate": 1.5024552904242246e-05, "loss": 0.7395, "step": 2146 }, { "epoch": 0.35263201116859655, "grad_norm": 0.7649992261171568, "learning_rate": 1.5019952310148766e-05, "loss": 0.3563, "step": 2147 }, { "epoch": 0.35279625523527963, "grad_norm": 1.5567044845343287, "learning_rate": 1.5015350295168344e-05, "loss": 0.8376, "step": 2148 }, { "epoch": 0.3529604993019627, "grad_norm": 1.705071606685253, "learning_rate": 1.5010746860603575e-05, "loss": 0.8207, "step": 2149 }, { "epoch": 0.3531247433686458, "grad_norm": 2.099568199539102, "learning_rate": 1.5006142007757446e-05, "loss": 0.8886, "step": 2150 }, { "epoch": 0.3532889874353289, "grad_norm": 1.9851606065781815, "learning_rate": 1.5001535737933355e-05, "loss": 0.7628, "step": 2151 }, { "epoch": 0.35345323150201197, "grad_norm": 2.112212097339268, "learning_rate": 1.4996928052435095e-05, "loss": 0.764, "step": 2152 }, { "epoch": 0.3536174755686951, "grad_norm": 1.7058978739468682, "learning_rate": 1.4992318952566862e-05, "loss": 0.7437, "step": 2153 }, { "epoch": 0.3537817196353782, "grad_norm": 2.525711971512683, "learning_rate": 1.4987708439633255e-05, "loss": 0.9113, "step": 2154 }, { "epoch": 0.3539459637020613, "grad_norm": 1.4634487767938733, "learning_rate": 1.4983096514939263e-05, "loss": 0.8332, "step": 2155 }, { "epoch": 0.35411020776874436, "grad_norm": 1.6419913051904726, "learning_rate": 1.497848317979029e-05, "loss": 0.8077, "step": 2156 }, { "epoch": 0.35427445183542744, "grad_norm": 1.7307450027433822, "learning_rate": 1.4973868435492125e-05, "loss": 0.8536, "step": 2157 }, { "epoch": 0.3544386959021105, "grad_norm": 2.867269790946736, "learning_rate": 1.4969252283350964e-05, "loss": 0.8123, "step": 2158 }, { "epoch": 0.3546029399687936, "grad_norm": 1.7329722121615687, "learning_rate": 1.4964634724673397e-05, "loss": 0.7993, "step": 2159 }, { "epoch": 0.35476718403547675, "grad_norm": 1.8224571495088506, "learning_rate": 1.4960015760766418e-05, "loss": 0.7651, "step": 2160 }, { "epoch": 0.35493142810215983, "grad_norm": 0.709039987262886, "learning_rate": 1.495539539293741e-05, "loss": 0.3497, "step": 2161 }, { "epoch": 0.3550956721688429, "grad_norm": 1.6694850817422582, "learning_rate": 1.4950773622494166e-05, "loss": 0.844, "step": 2162 }, { "epoch": 0.355259916235526, "grad_norm": 1.6212077400322271, "learning_rate": 1.4946150450744859e-05, "loss": 0.8376, "step": 2163 }, { "epoch": 0.3554241603022091, "grad_norm": 0.6104224170549394, "learning_rate": 1.4941525878998073e-05, "loss": 0.3272, "step": 2164 }, { "epoch": 0.35558840436889216, "grad_norm": 1.9587508461398373, "learning_rate": 1.4936899908562788e-05, "loss": 0.8942, "step": 2165 }, { "epoch": 0.35575264843557525, "grad_norm": 2.1277326860731782, "learning_rate": 1.4932272540748366e-05, "loss": 0.8177, "step": 2166 }, { "epoch": 0.35591689250225833, "grad_norm": 2.6105136460243554, "learning_rate": 1.4927643776864577e-05, "loss": 0.8415, "step": 2167 }, { "epoch": 0.35608113656894147, "grad_norm": 2.3260375584518256, "learning_rate": 1.4923013618221584e-05, "loss": 0.8569, "step": 2168 }, { "epoch": 0.35624538063562455, "grad_norm": 1.8534304047709784, "learning_rate": 1.4918382066129946e-05, "loss": 0.8109, "step": 2169 }, { "epoch": 0.35640962470230764, "grad_norm": 3.1753520497236907, "learning_rate": 1.4913749121900611e-05, "loss": 0.7668, "step": 2170 }, { "epoch": 0.3565738687689907, "grad_norm": 1.6528415226324606, "learning_rate": 1.4909114786844925e-05, "loss": 0.8802, "step": 2171 }, { "epoch": 0.3567381128356738, "grad_norm": 1.6358276030254046, "learning_rate": 1.4904479062274627e-05, "loss": 0.7629, "step": 2172 }, { "epoch": 0.3569023569023569, "grad_norm": 1.6435842807897645, "learning_rate": 1.4899841949501845e-05, "loss": 0.8233, "step": 2173 }, { "epoch": 0.35706660096903997, "grad_norm": 1.5514605461732385, "learning_rate": 1.4895203449839111e-05, "loss": 0.9184, "step": 2174 }, { "epoch": 0.3572308450357231, "grad_norm": 1.6533207173632347, "learning_rate": 1.4890563564599337e-05, "loss": 0.7695, "step": 2175 }, { "epoch": 0.3573950891024062, "grad_norm": 0.8277333402590147, "learning_rate": 1.4885922295095836e-05, "loss": 0.3164, "step": 2176 }, { "epoch": 0.3575593331690893, "grad_norm": 2.8358253238416875, "learning_rate": 1.4881279642642308e-05, "loss": 0.8979, "step": 2177 }, { "epoch": 0.35772357723577236, "grad_norm": 1.6310688161027775, "learning_rate": 1.4876635608552845e-05, "loss": 0.856, "step": 2178 }, { "epoch": 0.35788782130245544, "grad_norm": 2.1420240411934293, "learning_rate": 1.4871990194141934e-05, "loss": 0.8115, "step": 2179 }, { "epoch": 0.35805206536913853, "grad_norm": 1.9182672410600323, "learning_rate": 1.486734340072445e-05, "loss": 0.8098, "step": 2180 }, { "epoch": 0.3582163094358216, "grad_norm": 1.8686873501457784, "learning_rate": 1.4862695229615654e-05, "loss": 0.8263, "step": 2181 }, { "epoch": 0.3583805535025047, "grad_norm": 1.6604694225991814, "learning_rate": 1.4858045682131203e-05, "loss": 0.8055, "step": 2182 }, { "epoch": 0.35854479756918783, "grad_norm": 1.334335711147853, "learning_rate": 1.4853394759587146e-05, "loss": 0.787, "step": 2183 }, { "epoch": 0.3587090416358709, "grad_norm": 1.614438112822883, "learning_rate": 1.4848742463299907e-05, "loss": 0.7722, "step": 2184 }, { "epoch": 0.358873285702554, "grad_norm": 2.6642562921060167, "learning_rate": 1.484408879458632e-05, "loss": 0.7345, "step": 2185 }, { "epoch": 0.3590375297692371, "grad_norm": 2.1296707156188885, "learning_rate": 1.4839433754763588e-05, "loss": 0.7885, "step": 2186 }, { "epoch": 0.35920177383592017, "grad_norm": 1.6517313763012638, "learning_rate": 1.4834777345149313e-05, "loss": 0.9144, "step": 2187 }, { "epoch": 0.35936601790260325, "grad_norm": 2.017126761947172, "learning_rate": 1.4830119567061484e-05, "loss": 0.7986, "step": 2188 }, { "epoch": 0.35953026196928634, "grad_norm": 1.9468935586671579, "learning_rate": 1.4825460421818472e-05, "loss": 0.7826, "step": 2189 }, { "epoch": 0.3596945060359695, "grad_norm": 1.3042807864353518, "learning_rate": 1.4820799910739042e-05, "loss": 0.8349, "step": 2190 }, { "epoch": 0.35985875010265256, "grad_norm": 2.7896433737420123, "learning_rate": 1.4816138035142334e-05, "loss": 0.6415, "step": 2191 }, { "epoch": 0.36002299416933564, "grad_norm": 2.0999838522343666, "learning_rate": 1.481147479634789e-05, "loss": 0.8308, "step": 2192 }, { "epoch": 0.3601872382360187, "grad_norm": 1.8974379744311907, "learning_rate": 1.4806810195675627e-05, "loss": 0.8774, "step": 2193 }, { "epoch": 0.3603514823027018, "grad_norm": 1.9336501897705292, "learning_rate": 1.480214423444585e-05, "loss": 0.8654, "step": 2194 }, { "epoch": 0.3605157263693849, "grad_norm": 1.8791108400583603, "learning_rate": 1.4797476913979251e-05, "loss": 0.8612, "step": 2195 }, { "epoch": 0.360679970436068, "grad_norm": 2.64155811381677, "learning_rate": 1.4792808235596907e-05, "loss": 0.7806, "step": 2196 }, { "epoch": 0.3608442145027511, "grad_norm": 1.556673709541587, "learning_rate": 1.4788138200620272e-05, "loss": 0.8595, "step": 2197 }, { "epoch": 0.3610084585694342, "grad_norm": 1.9683730067259768, "learning_rate": 1.4783466810371195e-05, "loss": 0.834, "step": 2198 }, { "epoch": 0.3611727026361173, "grad_norm": 1.6038401021057216, "learning_rate": 1.47787940661719e-05, "loss": 0.8252, "step": 2199 }, { "epoch": 0.36133694670280037, "grad_norm": 2.345245645346709, "learning_rate": 1.4774119969344996e-05, "loss": 0.7981, "step": 2200 }, { "epoch": 0.36150119076948345, "grad_norm": 1.674322386900407, "learning_rate": 1.4769444521213482e-05, "loss": 0.8503, "step": 2201 }, { "epoch": 0.36166543483616653, "grad_norm": 1.392994962624394, "learning_rate": 1.4764767723100729e-05, "loss": 0.7855, "step": 2202 }, { "epoch": 0.3618296789028496, "grad_norm": 1.8952241240616245, "learning_rate": 1.4760089576330493e-05, "loss": 0.7446, "step": 2203 }, { "epoch": 0.3619939229695327, "grad_norm": 2.6893063154080687, "learning_rate": 1.475541008222692e-05, "loss": 0.8275, "step": 2204 }, { "epoch": 0.36215816703621584, "grad_norm": 2.063107578881092, "learning_rate": 1.4750729242114527e-05, "loss": 0.7597, "step": 2205 }, { "epoch": 0.3623224111028989, "grad_norm": 1.5776442925861254, "learning_rate": 1.4746047057318217e-05, "loss": 0.8189, "step": 2206 }, { "epoch": 0.362486655169582, "grad_norm": 1.958253668202491, "learning_rate": 1.4741363529163273e-05, "loss": 0.7892, "step": 2207 }, { "epoch": 0.3626508992362651, "grad_norm": 6.613224850334649, "learning_rate": 1.4736678658975357e-05, "loss": 0.7742, "step": 2208 }, { "epoch": 0.3628151433029482, "grad_norm": 1.7810053748214678, "learning_rate": 1.4731992448080509e-05, "loss": 0.8005, "step": 2209 }, { "epoch": 0.36297938736963126, "grad_norm": 1.5458382601591525, "learning_rate": 1.4727304897805157e-05, "loss": 0.7939, "step": 2210 }, { "epoch": 0.36314363143631434, "grad_norm": 1.5593557565442, "learning_rate": 1.47226160094761e-05, "loss": 0.7669, "step": 2211 }, { "epoch": 0.3633078755029975, "grad_norm": 1.8038215035203304, "learning_rate": 1.4717925784420514e-05, "loss": 0.8013, "step": 2212 }, { "epoch": 0.36347211956968056, "grad_norm": 2.1940534762596617, "learning_rate": 1.471323422396596e-05, "loss": 0.8791, "step": 2213 }, { "epoch": 0.36363636363636365, "grad_norm": 1.945066207982811, "learning_rate": 1.4708541329440375e-05, "loss": 0.8457, "step": 2214 }, { "epoch": 0.36380060770304673, "grad_norm": 2.04853564139857, "learning_rate": 1.4703847102172074e-05, "loss": 0.7587, "step": 2215 }, { "epoch": 0.3639648517697298, "grad_norm": 2.0956119636097905, "learning_rate": 1.4699151543489745e-05, "loss": 0.7904, "step": 2216 }, { "epoch": 0.3641290958364129, "grad_norm": 1.9315891350085292, "learning_rate": 1.4694454654722459e-05, "loss": 0.7282, "step": 2217 }, { "epoch": 0.364293339903096, "grad_norm": 1.8926997100814917, "learning_rate": 1.4689756437199658e-05, "loss": 0.8008, "step": 2218 }, { "epoch": 0.36445758396977906, "grad_norm": 1.9676162664687291, "learning_rate": 1.4685056892251167e-05, "loss": 0.9206, "step": 2219 }, { "epoch": 0.3646218280364622, "grad_norm": 1.8847853148788876, "learning_rate": 1.4680356021207176e-05, "loss": 0.8276, "step": 2220 }, { "epoch": 0.3647860721031453, "grad_norm": 1.491915556142245, "learning_rate": 1.4675653825398261e-05, "loss": 0.8418, "step": 2221 }, { "epoch": 0.36495031616982837, "grad_norm": 2.2543810980661587, "learning_rate": 1.4670950306155368e-05, "loss": 0.76, "step": 2222 }, { "epoch": 0.36511456023651145, "grad_norm": 2.0492459763605844, "learning_rate": 1.4666245464809818e-05, "loss": 0.7783, "step": 2223 }, { "epoch": 0.36527880430319454, "grad_norm": 1.9241048113261043, "learning_rate": 1.4661539302693306e-05, "loss": 0.7747, "step": 2224 }, { "epoch": 0.3654430483698776, "grad_norm": 1.556590464985274, "learning_rate": 1.46568318211379e-05, "loss": 0.802, "step": 2225 }, { "epoch": 0.3656072924365607, "grad_norm": 1.6671854558660046, "learning_rate": 1.4652123021476044e-05, "loss": 0.8666, "step": 2226 }, { "epoch": 0.36577153650324384, "grad_norm": 1.8638699090216067, "learning_rate": 1.4647412905040553e-05, "loss": 0.7759, "step": 2227 }, { "epoch": 0.3659357805699269, "grad_norm": 1.776973317718489, "learning_rate": 1.4642701473164618e-05, "loss": 0.7003, "step": 2228 }, { "epoch": 0.36610002463661, "grad_norm": 1.6478686265985525, "learning_rate": 1.4637988727181798e-05, "loss": 0.8603, "step": 2229 }, { "epoch": 0.3662642687032931, "grad_norm": 1.3742262150816311, "learning_rate": 1.4633274668426028e-05, "loss": 0.8182, "step": 2230 }, { "epoch": 0.3664285127699762, "grad_norm": 1.483927764252259, "learning_rate": 1.462855929823161e-05, "loss": 0.8704, "step": 2231 }, { "epoch": 0.36659275683665926, "grad_norm": 1.5859086369418363, "learning_rate": 1.4623842617933219e-05, "loss": 0.8075, "step": 2232 }, { "epoch": 0.36675700090334235, "grad_norm": 2.4655237864723243, "learning_rate": 1.4619124628865904e-05, "loss": 0.8933, "step": 2233 }, { "epoch": 0.3669212449700255, "grad_norm": 2.000289703852493, "learning_rate": 1.461440533236508e-05, "loss": 0.7638, "step": 2234 }, { "epoch": 0.36708548903670857, "grad_norm": 1.7937007616394827, "learning_rate": 1.4609684729766536e-05, "loss": 0.7989, "step": 2235 }, { "epoch": 0.36724973310339165, "grad_norm": 2.0728438010839936, "learning_rate": 1.4604962822406426e-05, "loss": 0.8126, "step": 2236 }, { "epoch": 0.36741397717007473, "grad_norm": 1.6275721286823377, "learning_rate": 1.4600239611621274e-05, "loss": 0.8675, "step": 2237 }, { "epoch": 0.3675782212367578, "grad_norm": 1.7581874624791964, "learning_rate": 1.459551509874798e-05, "loss": 0.785, "step": 2238 }, { "epoch": 0.3677424653034409, "grad_norm": 1.759948293215646, "learning_rate": 1.4590789285123808e-05, "loss": 0.7693, "step": 2239 }, { "epoch": 0.367906709370124, "grad_norm": 0.7703598407209107, "learning_rate": 1.4586062172086383e-05, "loss": 0.3425, "step": 2240 }, { "epoch": 0.36807095343680707, "grad_norm": 3.3339348682742087, "learning_rate": 1.4581333760973713e-05, "loss": 0.8811, "step": 2241 }, { "epoch": 0.3682351975034902, "grad_norm": 3.5048573563093637, "learning_rate": 1.4576604053124154e-05, "loss": 0.7371, "step": 2242 }, { "epoch": 0.3683994415701733, "grad_norm": 1.985063425606704, "learning_rate": 1.4571873049876452e-05, "loss": 0.8264, "step": 2243 }, { "epoch": 0.3685636856368564, "grad_norm": 1.958156312387829, "learning_rate": 1.4567140752569701e-05, "loss": 0.8896, "step": 2244 }, { "epoch": 0.36872792970353946, "grad_norm": 1.9926599847653532, "learning_rate": 1.4562407162543367e-05, "loss": 0.7637, "step": 2245 }, { "epoch": 0.36889217377022254, "grad_norm": 2.0659966010667277, "learning_rate": 1.4557672281137286e-05, "loss": 0.7693, "step": 2246 }, { "epoch": 0.3690564178369056, "grad_norm": 1.2929831290587546, "learning_rate": 1.455293610969165e-05, "loss": 0.8387, "step": 2247 }, { "epoch": 0.3692206619035887, "grad_norm": 1.9947487831590949, "learning_rate": 1.454819864954703e-05, "loss": 0.717, "step": 2248 }, { "epoch": 0.36938490597027185, "grad_norm": 1.5710382632266646, "learning_rate": 1.4543459902044347e-05, "loss": 0.6812, "step": 2249 }, { "epoch": 0.36954915003695493, "grad_norm": 1.8729508265074601, "learning_rate": 1.45387198685249e-05, "loss": 0.8687, "step": 2250 }, { "epoch": 0.369713394103638, "grad_norm": 0.7133039651705189, "learning_rate": 1.4533978550330343e-05, "loss": 0.3377, "step": 2251 }, { "epoch": 0.3698776381703211, "grad_norm": 1.4058971996650615, "learning_rate": 1.4529235948802696e-05, "loss": 0.7702, "step": 2252 }, { "epoch": 0.3700418822370042, "grad_norm": 1.5111344258089923, "learning_rate": 1.4524492065284344e-05, "loss": 0.7881, "step": 2253 }, { "epoch": 0.37020612630368727, "grad_norm": 1.6607691262476938, "learning_rate": 1.4519746901118029e-05, "loss": 0.8761, "step": 2254 }, { "epoch": 0.37037037037037035, "grad_norm": 2.9732977504804756, "learning_rate": 1.4515000457646866e-05, "loss": 0.7581, "step": 2255 }, { "epoch": 0.3705346144370535, "grad_norm": 1.6651344059138267, "learning_rate": 1.4510252736214318e-05, "loss": 0.7702, "step": 2256 }, { "epoch": 0.3706988585037366, "grad_norm": 1.7390059115319512, "learning_rate": 1.4505503738164225e-05, "loss": 0.758, "step": 2257 }, { "epoch": 0.37086310257041966, "grad_norm": 1.5638045796059037, "learning_rate": 1.4500753464840775e-05, "loss": 0.7978, "step": 2258 }, { "epoch": 0.37102734663710274, "grad_norm": 1.6558330194445527, "learning_rate": 1.4496001917588528e-05, "loss": 0.7865, "step": 2259 }, { "epoch": 0.3711915907037858, "grad_norm": 3.9437404873798383, "learning_rate": 1.4491249097752393e-05, "loss": 0.8225, "step": 2260 }, { "epoch": 0.3713558347704689, "grad_norm": 1.8093493950452824, "learning_rate": 1.448649500667765e-05, "loss": 0.8365, "step": 2261 }, { "epoch": 0.371520078837152, "grad_norm": 1.7050344143963516, "learning_rate": 1.4481739645709935e-05, "loss": 0.7144, "step": 2262 }, { "epoch": 0.3716843229038351, "grad_norm": 1.8899508920093164, "learning_rate": 1.4476983016195245e-05, "loss": 0.8294, "step": 2263 }, { "epoch": 0.3718485669705182, "grad_norm": 1.6134938832850225, "learning_rate": 1.4472225119479928e-05, "loss": 0.8683, "step": 2264 }, { "epoch": 0.3720128110372013, "grad_norm": 1.3902789103850577, "learning_rate": 1.4467465956910704e-05, "loss": 0.8173, "step": 2265 }, { "epoch": 0.3721770551038844, "grad_norm": 0.736179060042763, "learning_rate": 1.4462705529834635e-05, "loss": 0.3359, "step": 2266 }, { "epoch": 0.37234129917056746, "grad_norm": 1.993908791988353, "learning_rate": 1.4457943839599158e-05, "loss": 0.7983, "step": 2267 }, { "epoch": 0.37250554323725055, "grad_norm": 2.219305028353858, "learning_rate": 1.4453180887552052e-05, "loss": 0.7292, "step": 2268 }, { "epoch": 0.37266978730393363, "grad_norm": 1.9722389736994093, "learning_rate": 1.4448416675041465e-05, "loss": 0.8345, "step": 2269 }, { "epoch": 0.3728340313706167, "grad_norm": 2.016436321505697, "learning_rate": 1.44436512034159e-05, "loss": 0.7978, "step": 2270 }, { "epoch": 0.37299827543729985, "grad_norm": 1.5536736839839638, "learning_rate": 1.443888447402421e-05, "loss": 0.7404, "step": 2271 }, { "epoch": 0.37316251950398294, "grad_norm": 1.937367193720623, "learning_rate": 1.4434116488215603e-05, "loss": 0.8638, "step": 2272 }, { "epoch": 0.373326763570666, "grad_norm": 1.9169136476139077, "learning_rate": 1.4429347247339656e-05, "loss": 0.7979, "step": 2273 }, { "epoch": 0.3734910076373491, "grad_norm": 2.9631516818771417, "learning_rate": 1.4424576752746288e-05, "loss": 0.8078, "step": 2274 }, { "epoch": 0.3736552517040322, "grad_norm": 1.6326737100011255, "learning_rate": 1.4419805005785783e-05, "loss": 0.8302, "step": 2275 }, { "epoch": 0.37381949577071527, "grad_norm": 2.0106727144541465, "learning_rate": 1.4415032007808767e-05, "loss": 0.7774, "step": 2276 }, { "epoch": 0.37398373983739835, "grad_norm": 0.7616340004566357, "learning_rate": 1.441025776016623e-05, "loss": 0.3827, "step": 2277 }, { "epoch": 0.37414798390408144, "grad_norm": 2.0187075316798238, "learning_rate": 1.4405482264209512e-05, "loss": 0.8531, "step": 2278 }, { "epoch": 0.3743122279707646, "grad_norm": 2.5676182894999426, "learning_rate": 1.4400705521290306e-05, "loss": 0.7873, "step": 2279 }, { "epoch": 0.37447647203744766, "grad_norm": 1.7706208012864058, "learning_rate": 1.4395927532760664e-05, "loss": 0.8279, "step": 2280 }, { "epoch": 0.37464071610413074, "grad_norm": 1.4720173714241258, "learning_rate": 1.4391148299972978e-05, "loss": 0.7307, "step": 2281 }, { "epoch": 0.37480496017081383, "grad_norm": 1.7258697870440345, "learning_rate": 1.4386367824280006e-05, "loss": 0.6944, "step": 2282 }, { "epoch": 0.3749692042374969, "grad_norm": 1.6985281856436085, "learning_rate": 1.4381586107034849e-05, "loss": 0.7783, "step": 2283 }, { "epoch": 0.37513344830418, "grad_norm": 1.5050492096102397, "learning_rate": 1.437680314959096e-05, "loss": 0.8105, "step": 2284 }, { "epoch": 0.3752976923708631, "grad_norm": 2.193972031436582, "learning_rate": 1.437201895330215e-05, "loss": 0.757, "step": 2285 }, { "epoch": 0.3754619364375462, "grad_norm": 1.4527396803309243, "learning_rate": 1.4367233519522571e-05, "loss": 0.7497, "step": 2286 }, { "epoch": 0.3756261805042293, "grad_norm": 2.2038594528333686, "learning_rate": 1.4362446849606737e-05, "loss": 0.8198, "step": 2287 }, { "epoch": 0.3757904245709124, "grad_norm": 1.7077401721884984, "learning_rate": 1.4357658944909496e-05, "loss": 0.8615, "step": 2288 }, { "epoch": 0.37595466863759547, "grad_norm": 1.5631489491519068, "learning_rate": 1.4352869806786061e-05, "loss": 0.7758, "step": 2289 }, { "epoch": 0.37611891270427855, "grad_norm": 0.6967639231553451, "learning_rate": 1.4348079436591982e-05, "loss": 0.3426, "step": 2290 }, { "epoch": 0.37628315677096164, "grad_norm": 5.281733556249654, "learning_rate": 1.4343287835683168e-05, "loss": 0.7193, "step": 2291 }, { "epoch": 0.3764474008376447, "grad_norm": 1.571198224497524, "learning_rate": 1.4338495005415869e-05, "loss": 0.7725, "step": 2292 }, { "epoch": 0.37661164490432786, "grad_norm": 1.6727108263134882, "learning_rate": 1.4333700947146686e-05, "loss": 0.7883, "step": 2293 }, { "epoch": 0.37677588897101094, "grad_norm": 2.0658107931408938, "learning_rate": 1.4328905662232567e-05, "loss": 0.8462, "step": 2294 }, { "epoch": 0.376940133037694, "grad_norm": 1.4661844144902374, "learning_rate": 1.4324109152030807e-05, "loss": 0.809, "step": 2295 }, { "epoch": 0.3771043771043771, "grad_norm": 1.8104979856606025, "learning_rate": 1.4319311417899048e-05, "loss": 0.8082, "step": 2296 }, { "epoch": 0.3772686211710602, "grad_norm": 1.8005787354805316, "learning_rate": 1.431451246119528e-05, "loss": 0.8557, "step": 2297 }, { "epoch": 0.3774328652377433, "grad_norm": 1.88195086268881, "learning_rate": 1.4309712283277839e-05, "loss": 0.674, "step": 2298 }, { "epoch": 0.37759710930442636, "grad_norm": 1.8049454576923545, "learning_rate": 1.4304910885505404e-05, "loss": 0.7787, "step": 2299 }, { "epoch": 0.37776135337110944, "grad_norm": 1.6511337335876706, "learning_rate": 1.4300108269236997e-05, "loss": 0.8723, "step": 2300 }, { "epoch": 0.3779255974377926, "grad_norm": 1.4543959310154766, "learning_rate": 1.429530443583199e-05, "loss": 0.7589, "step": 2301 }, { "epoch": 0.37808984150447567, "grad_norm": 1.654767977112134, "learning_rate": 1.4290499386650099e-05, "loss": 0.7601, "step": 2302 }, { "epoch": 0.37825408557115875, "grad_norm": 0.654876859955363, "learning_rate": 1.4285693123051385e-05, "loss": 0.3445, "step": 2303 }, { "epoch": 0.37841832963784183, "grad_norm": 1.641626385554174, "learning_rate": 1.4280885646396248e-05, "loss": 0.7719, "step": 2304 }, { "epoch": 0.3785825737045249, "grad_norm": 1.6387615905840902, "learning_rate": 1.4276076958045436e-05, "loss": 0.7272, "step": 2305 }, { "epoch": 0.378746817771208, "grad_norm": 2.293508713052914, "learning_rate": 1.4271267059360035e-05, "loss": 0.8344, "step": 2306 }, { "epoch": 0.3789110618378911, "grad_norm": 1.6951698294103028, "learning_rate": 1.4266455951701476e-05, "loss": 0.867, "step": 2307 }, { "epoch": 0.3790753059045742, "grad_norm": 1.6909323824451048, "learning_rate": 1.4261643636431539e-05, "loss": 0.7977, "step": 2308 }, { "epoch": 0.3792395499712573, "grad_norm": 0.6581746190969594, "learning_rate": 1.4256830114912341e-05, "loss": 0.3726, "step": 2309 }, { "epoch": 0.3794037940379404, "grad_norm": 1.9565302906723232, "learning_rate": 1.4252015388506328e-05, "loss": 0.8255, "step": 2310 }, { "epoch": 0.3795680381046235, "grad_norm": 2.0461377283557027, "learning_rate": 1.4247199458576308e-05, "loss": 0.8619, "step": 2311 }, { "epoch": 0.37973228217130656, "grad_norm": 1.9270303049351727, "learning_rate": 1.4242382326485416e-05, "loss": 0.782, "step": 2312 }, { "epoch": 0.37989652623798964, "grad_norm": 1.470332160026926, "learning_rate": 1.4237563993597133e-05, "loss": 0.801, "step": 2313 }, { "epoch": 0.3800607703046727, "grad_norm": 1.8315090137650905, "learning_rate": 1.4232744461275273e-05, "loss": 0.7844, "step": 2314 }, { "epoch": 0.3802250143713558, "grad_norm": 1.403912110464401, "learning_rate": 1.4227923730884001e-05, "loss": 0.8198, "step": 2315 }, { "epoch": 0.38038925843803895, "grad_norm": 1.7296070320659156, "learning_rate": 1.4223101803787811e-05, "loss": 0.8976, "step": 2316 }, { "epoch": 0.38055350250472203, "grad_norm": 1.5976933953922694, "learning_rate": 1.421827868135154e-05, "loss": 0.7846, "step": 2317 }, { "epoch": 0.3807177465714051, "grad_norm": 2.1506837712420723, "learning_rate": 1.4213454364940362e-05, "loss": 0.8422, "step": 2318 }, { "epoch": 0.3808819906380882, "grad_norm": 1.482910506861641, "learning_rate": 1.420862885591979e-05, "loss": 0.83, "step": 2319 }, { "epoch": 0.3810462347047713, "grad_norm": 1.7432156074815004, "learning_rate": 1.4203802155655677e-05, "loss": 0.741, "step": 2320 }, { "epoch": 0.38121047877145436, "grad_norm": 1.9548598962474149, "learning_rate": 1.4198974265514207e-05, "loss": 0.8278, "step": 2321 }, { "epoch": 0.38137472283813745, "grad_norm": 2.0515352122503425, "learning_rate": 1.4194145186861902e-05, "loss": 0.7307, "step": 2322 }, { "epoch": 0.3815389669048206, "grad_norm": 2.1688495272321404, "learning_rate": 1.4189314921065629e-05, "loss": 0.795, "step": 2323 }, { "epoch": 0.38170321097150367, "grad_norm": 1.5094303510731788, "learning_rate": 1.418448346949258e-05, "loss": 0.7863, "step": 2324 }, { "epoch": 0.38186745503818675, "grad_norm": 1.6938663482071312, "learning_rate": 1.417965083351029e-05, "loss": 0.8142, "step": 2325 }, { "epoch": 0.38203169910486984, "grad_norm": 1.8015090140224308, "learning_rate": 1.4174817014486622e-05, "loss": 0.743, "step": 2326 }, { "epoch": 0.3821959431715529, "grad_norm": 1.9480322347745263, "learning_rate": 1.4169982013789782e-05, "loss": 0.85, "step": 2327 }, { "epoch": 0.382360187238236, "grad_norm": 1.6504325203637775, "learning_rate": 1.4165145832788305e-05, "loss": 0.8129, "step": 2328 }, { "epoch": 0.3825244313049191, "grad_norm": 2.2498715355955485, "learning_rate": 1.4160308472851065e-05, "loss": 0.7802, "step": 2329 }, { "epoch": 0.3826886753716022, "grad_norm": 1.6953129383390442, "learning_rate": 1.4155469935347264e-05, "loss": 0.7608, "step": 2330 }, { "epoch": 0.3828529194382853, "grad_norm": 2.2765021767800246, "learning_rate": 1.415063022164644e-05, "loss": 0.7803, "step": 2331 }, { "epoch": 0.3830171635049684, "grad_norm": 1.7092202578939861, "learning_rate": 1.4145789333118462e-05, "loss": 0.7733, "step": 2332 }, { "epoch": 0.3831814075716515, "grad_norm": 2.0235280815002192, "learning_rate": 1.4140947271133536e-05, "loss": 0.7504, "step": 2333 }, { "epoch": 0.38334565163833456, "grad_norm": 3.239482749379883, "learning_rate": 1.4136104037062197e-05, "loss": 0.8401, "step": 2334 }, { "epoch": 0.38350989570501764, "grad_norm": 1.9770853084668012, "learning_rate": 1.4131259632275312e-05, "loss": 0.7061, "step": 2335 }, { "epoch": 0.38367413977170073, "grad_norm": 2.0253582631583096, "learning_rate": 1.412641405814408e-05, "loss": 0.8267, "step": 2336 }, { "epoch": 0.3838383838383838, "grad_norm": 1.8532829895994563, "learning_rate": 1.412156731604003e-05, "loss": 0.8136, "step": 2337 }, { "epoch": 0.38400262790506695, "grad_norm": 1.464269154801738, "learning_rate": 1.4116719407335022e-05, "loss": 0.8195, "step": 2338 }, { "epoch": 0.38416687197175003, "grad_norm": 1.9635318453174315, "learning_rate": 1.4111870333401246e-05, "loss": 0.8322, "step": 2339 }, { "epoch": 0.3843311160384331, "grad_norm": 1.8356509042046318, "learning_rate": 1.4107020095611223e-05, "loss": 0.8021, "step": 2340 }, { "epoch": 0.3844953601051162, "grad_norm": 1.4479269993734978, "learning_rate": 1.4102168695337804e-05, "loss": 0.8109, "step": 2341 }, { "epoch": 0.3846596041717993, "grad_norm": 1.4754002777547655, "learning_rate": 1.4097316133954163e-05, "loss": 0.8346, "step": 2342 }, { "epoch": 0.38482384823848237, "grad_norm": 1.97545951328215, "learning_rate": 1.4092462412833811e-05, "loss": 0.8931, "step": 2343 }, { "epoch": 0.38498809230516545, "grad_norm": 1.6706433850311129, "learning_rate": 1.4087607533350585e-05, "loss": 0.7893, "step": 2344 }, { "epoch": 0.3851523363718486, "grad_norm": 1.5978511636319663, "learning_rate": 1.4082751496878644e-05, "loss": 0.7199, "step": 2345 }, { "epoch": 0.3853165804385317, "grad_norm": 1.7590640637954262, "learning_rate": 1.4077894304792481e-05, "loss": 0.765, "step": 2346 }, { "epoch": 0.38548082450521476, "grad_norm": 1.96074234913281, "learning_rate": 1.4073035958466916e-05, "loss": 0.8113, "step": 2347 }, { "epoch": 0.38564506857189784, "grad_norm": 1.7355832741544581, "learning_rate": 1.406817645927709e-05, "loss": 0.8029, "step": 2348 }, { "epoch": 0.3858093126385809, "grad_norm": 1.5887272996916357, "learning_rate": 1.4063315808598477e-05, "loss": 0.7876, "step": 2349 }, { "epoch": 0.385973556705264, "grad_norm": 1.8155961072701083, "learning_rate": 1.4058454007806874e-05, "loss": 0.8408, "step": 2350 }, { "epoch": 0.3861378007719471, "grad_norm": 0.6298375783598125, "learning_rate": 1.4053591058278402e-05, "loss": 0.3648, "step": 2351 }, { "epoch": 0.38630204483863023, "grad_norm": 1.6502650094853832, "learning_rate": 1.4048726961389508e-05, "loss": 0.7987, "step": 2352 }, { "epoch": 0.3864662889053133, "grad_norm": 1.9042356329862522, "learning_rate": 1.4043861718516964e-05, "loss": 0.8872, "step": 2353 }, { "epoch": 0.3866305329719964, "grad_norm": 1.4982729379425361, "learning_rate": 1.403899533103787e-05, "loss": 0.7494, "step": 2354 }, { "epoch": 0.3867947770386795, "grad_norm": 1.9646158537405973, "learning_rate": 1.4034127800329645e-05, "loss": 0.7983, "step": 2355 }, { "epoch": 0.38695902110536257, "grad_norm": 1.6600688403634816, "learning_rate": 1.4029259127770032e-05, "loss": 0.7707, "step": 2356 }, { "epoch": 0.38712326517204565, "grad_norm": 1.7916725486309264, "learning_rate": 1.40243893147371e-05, "loss": 0.7152, "step": 2357 }, { "epoch": 0.38728750923872873, "grad_norm": 2.3265367461852873, "learning_rate": 1.4019518362609239e-05, "loss": 0.8139, "step": 2358 }, { "epoch": 0.3874517533054118, "grad_norm": 1.5937340295425584, "learning_rate": 1.4014646272765162e-05, "loss": 0.7883, "step": 2359 }, { "epoch": 0.38761599737209496, "grad_norm": 2.8642017850349166, "learning_rate": 1.4009773046583904e-05, "loss": 0.7704, "step": 2360 }, { "epoch": 0.38778024143877804, "grad_norm": 0.6234417679894575, "learning_rate": 1.4004898685444819e-05, "loss": 0.3651, "step": 2361 }, { "epoch": 0.3879444855054611, "grad_norm": 1.448151877750316, "learning_rate": 1.4000023190727587e-05, "loss": 0.7876, "step": 2362 }, { "epoch": 0.3881087295721442, "grad_norm": 2.6745443220750804, "learning_rate": 1.399514656381221e-05, "loss": 0.7957, "step": 2363 }, { "epoch": 0.3882729736388273, "grad_norm": 1.9212315736416412, "learning_rate": 1.3990268806078999e-05, "loss": 0.861, "step": 2364 }, { "epoch": 0.3884372177055104, "grad_norm": 1.705503060510947, "learning_rate": 1.39853899189086e-05, "loss": 0.8277, "step": 2365 }, { "epoch": 0.38860146177219346, "grad_norm": 1.6495812893459216, "learning_rate": 1.3980509903681968e-05, "loss": 0.7674, "step": 2366 }, { "epoch": 0.3887657058388766, "grad_norm": 2.185054988590452, "learning_rate": 1.397562876178038e-05, "loss": 0.8538, "step": 2367 }, { "epoch": 0.3889299499055597, "grad_norm": 4.191292084947805, "learning_rate": 1.3970746494585439e-05, "loss": 0.7498, "step": 2368 }, { "epoch": 0.38909419397224276, "grad_norm": 1.5705823694913805, "learning_rate": 1.3965863103479054e-05, "loss": 0.832, "step": 2369 }, { "epoch": 0.38925843803892585, "grad_norm": 3.993584064937232, "learning_rate": 1.3960978589843458e-05, "loss": 0.7498, "step": 2370 }, { "epoch": 0.38942268210560893, "grad_norm": 2.1526829535104266, "learning_rate": 1.3956092955061208e-05, "loss": 0.7924, "step": 2371 }, { "epoch": 0.389586926172292, "grad_norm": 2.289433053428552, "learning_rate": 1.3951206200515173e-05, "loss": 0.8218, "step": 2372 }, { "epoch": 0.3897511702389751, "grad_norm": 1.809232181030086, "learning_rate": 1.3946318327588534e-05, "loss": 0.7493, "step": 2373 }, { "epoch": 0.3899154143056582, "grad_norm": 0.6766898533954662, "learning_rate": 1.3941429337664791e-05, "loss": 0.319, "step": 2374 }, { "epoch": 0.3900796583723413, "grad_norm": 0.6576882046492004, "learning_rate": 1.3936539232127771e-05, "loss": 0.3826, "step": 2375 }, { "epoch": 0.3902439024390244, "grad_norm": 1.6016640576423329, "learning_rate": 1.3931648012361599e-05, "loss": 0.8193, "step": 2376 }, { "epoch": 0.3904081465057075, "grad_norm": 1.5583897802023072, "learning_rate": 1.392675567975073e-05, "loss": 0.6933, "step": 2377 }, { "epoch": 0.39057239057239057, "grad_norm": 2.3114178165276345, "learning_rate": 1.3921862235679929e-05, "loss": 0.7497, "step": 2378 }, { "epoch": 0.39073663463907365, "grad_norm": 1.4735106650395975, "learning_rate": 1.391696768153427e-05, "loss": 0.7409, "step": 2379 }, { "epoch": 0.39090087870575674, "grad_norm": 2.5553210391074943, "learning_rate": 1.3912072018699152e-05, "loss": 0.7221, "step": 2380 }, { "epoch": 0.3910651227724398, "grad_norm": 1.682678678858458, "learning_rate": 1.3907175248560276e-05, "loss": 0.8229, "step": 2381 }, { "epoch": 0.39122936683912296, "grad_norm": 0.738024885794659, "learning_rate": 1.390227737250367e-05, "loss": 0.3813, "step": 2382 }, { "epoch": 0.39139361090580604, "grad_norm": 1.7591488192088196, "learning_rate": 1.389737839191566e-05, "loss": 0.7653, "step": 2383 }, { "epoch": 0.3915578549724891, "grad_norm": 1.5382357833986176, "learning_rate": 1.38924783081829e-05, "loss": 0.8023, "step": 2384 }, { "epoch": 0.3917220990391722, "grad_norm": 1.9949274724206958, "learning_rate": 1.3887577122692337e-05, "loss": 0.8405, "step": 2385 }, { "epoch": 0.3918863431058553, "grad_norm": 1.7045813104325487, "learning_rate": 1.3882674836831251e-05, "loss": 0.8262, "step": 2386 }, { "epoch": 0.3920505871725384, "grad_norm": 2.0172770060203593, "learning_rate": 1.3877771451987223e-05, "loss": 0.8523, "step": 2387 }, { "epoch": 0.39221483123922146, "grad_norm": 1.7385411905057224, "learning_rate": 1.3872866969548143e-05, "loss": 0.8058, "step": 2388 }, { "epoch": 0.3923790753059046, "grad_norm": 1.9986381551444101, "learning_rate": 1.3867961390902214e-05, "loss": 0.8086, "step": 2389 }, { "epoch": 0.3925433193725877, "grad_norm": 2.1932525087800134, "learning_rate": 1.3863054717437952e-05, "loss": 0.676, "step": 2390 }, { "epoch": 0.39270756343927077, "grad_norm": 2.022067042875754, "learning_rate": 1.3858146950544178e-05, "loss": 0.8005, "step": 2391 }, { "epoch": 0.39287180750595385, "grad_norm": 1.9865640570339675, "learning_rate": 1.3853238091610029e-05, "loss": 0.791, "step": 2392 }, { "epoch": 0.39303605157263694, "grad_norm": 2.4933798345850873, "learning_rate": 1.384832814202494e-05, "loss": 0.7683, "step": 2393 }, { "epoch": 0.39320029563932, "grad_norm": 2.2774309012638327, "learning_rate": 1.3843417103178669e-05, "loss": 0.8177, "step": 2394 }, { "epoch": 0.3933645397060031, "grad_norm": 1.406079642198995, "learning_rate": 1.3838504976461278e-05, "loss": 0.8461, "step": 2395 }, { "epoch": 0.3935287837726862, "grad_norm": 1.978883965308588, "learning_rate": 1.3833591763263123e-05, "loss": 0.8791, "step": 2396 }, { "epoch": 0.3936930278393693, "grad_norm": 0.6098822438024226, "learning_rate": 1.3828677464974885e-05, "loss": 0.347, "step": 2397 }, { "epoch": 0.3938572719060524, "grad_norm": 3.266636023139941, "learning_rate": 1.3823762082987544e-05, "loss": 0.7535, "step": 2398 }, { "epoch": 0.3940215159727355, "grad_norm": 1.656382178314565, "learning_rate": 1.381884561869239e-05, "loss": 0.8553, "step": 2399 }, { "epoch": 0.3941857600394186, "grad_norm": 1.780943792391925, "learning_rate": 1.3813928073481023e-05, "loss": 0.8245, "step": 2400 }, { "epoch": 0.39435000410610166, "grad_norm": 1.8068027150802266, "learning_rate": 1.3809009448745334e-05, "loss": 0.8277, "step": 2401 }, { "epoch": 0.39451424817278474, "grad_norm": 0.6351729000382544, "learning_rate": 1.3804089745877536e-05, "loss": 0.3471, "step": 2402 }, { "epoch": 0.3946784922394678, "grad_norm": 0.6845964708167672, "learning_rate": 1.3799168966270139e-05, "loss": 0.3484, "step": 2403 }, { "epoch": 0.39484273630615097, "grad_norm": 0.6722945752534506, "learning_rate": 1.3794247111315955e-05, "loss": 0.3437, "step": 2404 }, { "epoch": 0.39500698037283405, "grad_norm": 1.4424301657782923, "learning_rate": 1.3789324182408112e-05, "loss": 0.8245, "step": 2405 }, { "epoch": 0.39517122443951713, "grad_norm": 1.6136033185545442, "learning_rate": 1.3784400180940032e-05, "loss": 0.7605, "step": 2406 }, { "epoch": 0.3953354685062002, "grad_norm": 1.654528784965412, "learning_rate": 1.377947510830544e-05, "loss": 0.759, "step": 2407 }, { "epoch": 0.3954997125728833, "grad_norm": 1.8272266856581623, "learning_rate": 1.3774548965898371e-05, "loss": 0.8014, "step": 2408 }, { "epoch": 0.3956639566395664, "grad_norm": 1.7340100631231006, "learning_rate": 1.3769621755113156e-05, "loss": 0.7873, "step": 2409 }, { "epoch": 0.39582820070624947, "grad_norm": 0.7374866527351378, "learning_rate": 1.3764693477344435e-05, "loss": 0.3109, "step": 2410 }, { "epoch": 0.39599244477293255, "grad_norm": 1.8414533369395487, "learning_rate": 1.3759764133987146e-05, "loss": 0.8271, "step": 2411 }, { "epoch": 0.3961566888396157, "grad_norm": 1.741414224694464, "learning_rate": 1.375483372643653e-05, "loss": 0.8114, "step": 2412 }, { "epoch": 0.3963209329062988, "grad_norm": 1.2783512610755214, "learning_rate": 1.3749902256088125e-05, "loss": 0.7697, "step": 2413 }, { "epoch": 0.39648517697298186, "grad_norm": 1.7119491009805583, "learning_rate": 1.3744969724337779e-05, "loss": 0.8904, "step": 2414 }, { "epoch": 0.39664942103966494, "grad_norm": 2.0881458739128, "learning_rate": 1.3740036132581626e-05, "loss": 0.8045, "step": 2415 }, { "epoch": 0.396813665106348, "grad_norm": 2.020123184475476, "learning_rate": 1.3735101482216117e-05, "loss": 0.8189, "step": 2416 }, { "epoch": 0.3969779091730311, "grad_norm": 1.592337562198539, "learning_rate": 1.3730165774637994e-05, "loss": 0.7296, "step": 2417 }, { "epoch": 0.3971421532397142, "grad_norm": 1.9414168453715932, "learning_rate": 1.3725229011244294e-05, "loss": 0.8105, "step": 2418 }, { "epoch": 0.39730639730639733, "grad_norm": 1.6889312134856571, "learning_rate": 1.3720291193432357e-05, "loss": 0.7654, "step": 2419 }, { "epoch": 0.3974706413730804, "grad_norm": 1.5788702675768127, "learning_rate": 1.3715352322599826e-05, "loss": 0.8024, "step": 2420 }, { "epoch": 0.3976348854397635, "grad_norm": 1.5043139434940178, "learning_rate": 1.3710412400144637e-05, "loss": 0.7941, "step": 2421 }, { "epoch": 0.3977991295064466, "grad_norm": 0.740763333766417, "learning_rate": 1.3705471427465025e-05, "loss": 0.3799, "step": 2422 }, { "epoch": 0.39796337357312966, "grad_norm": 1.8248843750468875, "learning_rate": 1.3700529405959517e-05, "loss": 0.7753, "step": 2423 }, { "epoch": 0.39812761763981275, "grad_norm": 1.53402570699225, "learning_rate": 1.3695586337026949e-05, "loss": 0.801, "step": 2424 }, { "epoch": 0.39829186170649583, "grad_norm": 1.829590966125987, "learning_rate": 1.3690642222066445e-05, "loss": 0.6638, "step": 2425 }, { "epoch": 0.39845610577317897, "grad_norm": 2.7976338223683945, "learning_rate": 1.3685697062477421e-05, "loss": 0.8324, "step": 2426 }, { "epoch": 0.39862034983986205, "grad_norm": 1.6504349856527731, "learning_rate": 1.3680750859659599e-05, "loss": 0.6629, "step": 2427 }, { "epoch": 0.39878459390654514, "grad_norm": 2.8434392706860816, "learning_rate": 1.3675803615012993e-05, "loss": 0.7774, "step": 2428 }, { "epoch": 0.3989488379732282, "grad_norm": 0.6530422422808483, "learning_rate": 1.3670855329937905e-05, "loss": 0.3473, "step": 2429 }, { "epoch": 0.3991130820399113, "grad_norm": 1.8460472393328224, "learning_rate": 1.3665906005834938e-05, "loss": 0.8128, "step": 2430 }, { "epoch": 0.3992773261065944, "grad_norm": 1.8424248910521128, "learning_rate": 1.3660955644104985e-05, "loss": 0.7744, "step": 2431 }, { "epoch": 0.39944157017327747, "grad_norm": 2.2598147293368758, "learning_rate": 1.3656004246149242e-05, "loss": 0.6537, "step": 2432 }, { "epoch": 0.39960581423996056, "grad_norm": 1.8660048713556445, "learning_rate": 1.3651051813369188e-05, "loss": 0.7423, "step": 2433 }, { "epoch": 0.3997700583066437, "grad_norm": 1.6315622301750778, "learning_rate": 1.3646098347166598e-05, "loss": 0.7732, "step": 2434 }, { "epoch": 0.3999343023733268, "grad_norm": 0.6449059691175149, "learning_rate": 1.3641143848943545e-05, "loss": 0.3681, "step": 2435 }, { "epoch": 0.40009854644000986, "grad_norm": 1.648678988039497, "learning_rate": 1.3636188320102384e-05, "loss": 0.8871, "step": 2436 }, { "epoch": 0.40026279050669294, "grad_norm": 1.627689395569138, "learning_rate": 1.3631231762045768e-05, "loss": 0.8205, "step": 2437 }, { "epoch": 0.40042703457337603, "grad_norm": 1.669467269068599, "learning_rate": 1.3626274176176645e-05, "loss": 0.7737, "step": 2438 }, { "epoch": 0.4005912786400591, "grad_norm": 2.090636761673577, "learning_rate": 1.3621315563898243e-05, "loss": 0.7696, "step": 2439 }, { "epoch": 0.4007555227067422, "grad_norm": 2.540529894173232, "learning_rate": 1.3616355926614089e-05, "loss": 0.8938, "step": 2440 }, { "epoch": 0.40091976677342533, "grad_norm": 1.7424957875742044, "learning_rate": 1.3611395265727998e-05, "loss": 0.8116, "step": 2441 }, { "epoch": 0.4010840108401084, "grad_norm": 1.662219559343329, "learning_rate": 1.3606433582644077e-05, "loss": 0.8065, "step": 2442 }, { "epoch": 0.4012482549067915, "grad_norm": 1.880314448794805, "learning_rate": 1.3601470878766714e-05, "loss": 0.8086, "step": 2443 }, { "epoch": 0.4014124989734746, "grad_norm": 1.5960377486043689, "learning_rate": 1.3596507155500596e-05, "loss": 0.7716, "step": 2444 }, { "epoch": 0.40157674304015767, "grad_norm": 0.6653544061819747, "learning_rate": 1.3591542414250694e-05, "loss": 0.3351, "step": 2445 }, { "epoch": 0.40174098710684075, "grad_norm": 1.9346732059321474, "learning_rate": 1.3586576656422268e-05, "loss": 0.7987, "step": 2446 }, { "epoch": 0.40190523117352384, "grad_norm": 1.5709438443162747, "learning_rate": 1.3581609883420866e-05, "loss": 0.7741, "step": 2447 }, { "epoch": 0.402069475240207, "grad_norm": 0.6249248711106216, "learning_rate": 1.3576642096652322e-05, "loss": 0.347, "step": 2448 }, { "epoch": 0.40223371930689006, "grad_norm": 3.4900775651000195, "learning_rate": 1.3571673297522759e-05, "loss": 0.8486, "step": 2449 }, { "epoch": 0.40239796337357314, "grad_norm": 1.9647386644503662, "learning_rate": 1.3566703487438579e-05, "loss": 0.6992, "step": 2450 }, { "epoch": 0.4025622074402562, "grad_norm": 1.9435872510674466, "learning_rate": 1.3561732667806481e-05, "loss": 0.7814, "step": 2451 }, { "epoch": 0.4027264515069393, "grad_norm": 1.923359647708493, "learning_rate": 1.3556760840033447e-05, "loss": 0.794, "step": 2452 }, { "epoch": 0.4028906955736224, "grad_norm": 1.6263299089741625, "learning_rate": 1.3551788005526738e-05, "loss": 0.8595, "step": 2453 }, { "epoch": 0.4030549396403055, "grad_norm": 1.6695619726579103, "learning_rate": 1.3546814165693909e-05, "loss": 0.7772, "step": 2454 }, { "epoch": 0.40321918370698856, "grad_norm": 1.7363617893309, "learning_rate": 1.3541839321942786e-05, "loss": 0.7873, "step": 2455 }, { "epoch": 0.4033834277736717, "grad_norm": 1.713378918892255, "learning_rate": 1.35368634756815e-05, "loss": 0.7744, "step": 2456 }, { "epoch": 0.4035476718403548, "grad_norm": 1.8332282065412888, "learning_rate": 1.3531886628318447e-05, "loss": 0.7765, "step": 2457 }, { "epoch": 0.40371191590703787, "grad_norm": 1.953452115637673, "learning_rate": 1.3526908781262314e-05, "loss": 0.8184, "step": 2458 }, { "epoch": 0.40387615997372095, "grad_norm": 1.5370013451141074, "learning_rate": 1.352192993592207e-05, "loss": 0.7836, "step": 2459 }, { "epoch": 0.40404040404040403, "grad_norm": 1.9045233871970475, "learning_rate": 1.3516950093706968e-05, "loss": 0.8125, "step": 2460 }, { "epoch": 0.4042046481070871, "grad_norm": 2.2240885556835672, "learning_rate": 1.3511969256026542e-05, "loss": 0.8065, "step": 2461 }, { "epoch": 0.4043688921737702, "grad_norm": 1.7153632510892665, "learning_rate": 1.3506987424290605e-05, "loss": 0.8404, "step": 2462 }, { "epoch": 0.40453313624045334, "grad_norm": 1.7865858124532916, "learning_rate": 1.3502004599909255e-05, "loss": 0.7744, "step": 2463 }, { "epoch": 0.4046973803071364, "grad_norm": 2.225479437740205, "learning_rate": 1.349702078429287e-05, "loss": 0.7628, "step": 2464 }, { "epoch": 0.4048616243738195, "grad_norm": 1.7075407948263688, "learning_rate": 1.3492035978852114e-05, "loss": 0.7676, "step": 2465 }, { "epoch": 0.4050258684405026, "grad_norm": 2.1038168376232655, "learning_rate": 1.3487050184997916e-05, "loss": 0.7893, "step": 2466 }, { "epoch": 0.4051901125071857, "grad_norm": 2.0525973595158717, "learning_rate": 1.3482063404141496e-05, "loss": 0.8076, "step": 2467 }, { "epoch": 0.40535435657386876, "grad_norm": 2.1721713190885565, "learning_rate": 1.3477075637694362e-05, "loss": 0.809, "step": 2468 }, { "epoch": 0.40551860064055184, "grad_norm": 1.4847269401230159, "learning_rate": 1.347208688706828e-05, "loss": 0.8346, "step": 2469 }, { "epoch": 0.4056828447072349, "grad_norm": 2.075125047325698, "learning_rate": 1.3467097153675313e-05, "loss": 0.7841, "step": 2470 }, { "epoch": 0.40584708877391806, "grad_norm": 1.586771912230141, "learning_rate": 1.3462106438927788e-05, "loss": 0.825, "step": 2471 }, { "epoch": 0.40601133284060115, "grad_norm": 2.128865464806009, "learning_rate": 1.345711474423832e-05, "loss": 0.8095, "step": 2472 }, { "epoch": 0.40617557690728423, "grad_norm": 1.9968695000420573, "learning_rate": 1.3452122071019797e-05, "loss": 0.7703, "step": 2473 }, { "epoch": 0.4063398209739673, "grad_norm": 0.9217363842185566, "learning_rate": 1.3447128420685385e-05, "loss": 0.3762, "step": 2474 }, { "epoch": 0.4065040650406504, "grad_norm": 1.7977033572380483, "learning_rate": 1.3442133794648521e-05, "loss": 0.8795, "step": 2475 }, { "epoch": 0.4066683091073335, "grad_norm": 1.7699122311616093, "learning_rate": 1.3437138194322934e-05, "loss": 0.9118, "step": 2476 }, { "epoch": 0.40683255317401656, "grad_norm": 2.0666863759072873, "learning_rate": 1.3432141621122608e-05, "loss": 0.7671, "step": 2477 }, { "epoch": 0.4069967972406997, "grad_norm": 1.710728210472888, "learning_rate": 1.3427144076461818e-05, "loss": 0.8216, "step": 2478 }, { "epoch": 0.4071610413073828, "grad_norm": 2.312960083134333, "learning_rate": 1.3422145561755106e-05, "loss": 0.8895, "step": 2479 }, { "epoch": 0.40732528537406587, "grad_norm": 1.3602652307601935, "learning_rate": 1.3417146078417294e-05, "loss": 0.7835, "step": 2480 }, { "epoch": 0.40748952944074895, "grad_norm": 1.9022584224757972, "learning_rate": 1.3412145627863473e-05, "loss": 0.7934, "step": 2481 }, { "epoch": 0.40765377350743204, "grad_norm": 1.5474956255004448, "learning_rate": 1.3407144211509014e-05, "loss": 0.7886, "step": 2482 }, { "epoch": 0.4078180175741151, "grad_norm": 1.616716060747654, "learning_rate": 1.3402141830769551e-05, "loss": 0.9248, "step": 2483 }, { "epoch": 0.4079822616407982, "grad_norm": 1.7240840069808834, "learning_rate": 1.3397138487060999e-05, "loss": 0.8227, "step": 2484 }, { "epoch": 0.40814650570748134, "grad_norm": 2.076909242188367, "learning_rate": 1.3392134181799547e-05, "loss": 0.8571, "step": 2485 }, { "epoch": 0.4083107497741644, "grad_norm": 1.9494166292075, "learning_rate": 1.338712891640165e-05, "loss": 0.7801, "step": 2486 }, { "epoch": 0.4084749938408475, "grad_norm": 2.1069607442574165, "learning_rate": 1.3382122692284041e-05, "loss": 0.7755, "step": 2487 }, { "epoch": 0.4086392379075306, "grad_norm": 2.4144231628439132, "learning_rate": 1.3377115510863716e-05, "loss": 0.791, "step": 2488 }, { "epoch": 0.4088034819742137, "grad_norm": 1.6134984266233603, "learning_rate": 1.3372107373557955e-05, "loss": 0.7433, "step": 2489 }, { "epoch": 0.40896772604089676, "grad_norm": 1.7763290217534737, "learning_rate": 1.336709828178429e-05, "loss": 0.8011, "step": 2490 }, { "epoch": 0.40913197010757985, "grad_norm": 1.6259821623685877, "learning_rate": 1.3362088236960544e-05, "loss": 0.7485, "step": 2491 }, { "epoch": 0.40929621417426293, "grad_norm": 1.6192302157701914, "learning_rate": 1.3357077240504795e-05, "loss": 0.7321, "step": 2492 }, { "epoch": 0.40946045824094607, "grad_norm": 1.72108564397868, "learning_rate": 1.3352065293835399e-05, "loss": 0.853, "step": 2493 }, { "epoch": 0.40962470230762915, "grad_norm": 1.449339224755182, "learning_rate": 1.3347052398370969e-05, "loss": 0.8513, "step": 2494 }, { "epoch": 0.40978894637431224, "grad_norm": 2.0596163123103084, "learning_rate": 1.3342038555530403e-05, "loss": 0.7448, "step": 2495 }, { "epoch": 0.4099531904409953, "grad_norm": 1.5919970101419434, "learning_rate": 1.3337023766732852e-05, "loss": 0.6834, "step": 2496 }, { "epoch": 0.4101174345076784, "grad_norm": 1.9714190548264252, "learning_rate": 1.3332008033397746e-05, "loss": 0.79, "step": 2497 }, { "epoch": 0.4102816785743615, "grad_norm": 1.8231414603806715, "learning_rate": 1.3326991356944776e-05, "loss": 0.8096, "step": 2498 }, { "epoch": 0.41044592264104457, "grad_norm": 1.4884775319343513, "learning_rate": 1.33219737387939e-05, "loss": 0.7974, "step": 2499 }, { "epoch": 0.4106101667077277, "grad_norm": 1.8445564964415497, "learning_rate": 1.3316955180365348e-05, "loss": 0.8158, "step": 2500 }, { "epoch": 0.4107744107744108, "grad_norm": 2.127298283602168, "learning_rate": 1.3311935683079611e-05, "loss": 0.7656, "step": 2501 }, { "epoch": 0.4109386548410939, "grad_norm": 1.8207364322730055, "learning_rate": 1.3306915248357442e-05, "loss": 0.8246, "step": 2502 }, { "epoch": 0.41110289890777696, "grad_norm": 1.7129330748035094, "learning_rate": 1.3301893877619874e-05, "loss": 0.7135, "step": 2503 }, { "epoch": 0.41126714297446004, "grad_norm": 1.7963831253888196, "learning_rate": 1.3296871572288187e-05, "loss": 0.8003, "step": 2504 }, { "epoch": 0.4114313870411431, "grad_norm": 1.627403568239389, "learning_rate": 1.3291848333783941e-05, "loss": 0.7551, "step": 2505 }, { "epoch": 0.4115956311078262, "grad_norm": 2.351446399540552, "learning_rate": 1.3286824163528948e-05, "loss": 0.6573, "step": 2506 }, { "epoch": 0.4117598751745093, "grad_norm": 1.7188330773120828, "learning_rate": 1.3281799062945291e-05, "loss": 0.7635, "step": 2507 }, { "epoch": 0.41192411924119243, "grad_norm": 1.532015861507549, "learning_rate": 1.3276773033455312e-05, "loss": 0.7648, "step": 2508 }, { "epoch": 0.4120883633078755, "grad_norm": 1.7222678508127003, "learning_rate": 1.3271746076481621e-05, "loss": 0.8008, "step": 2509 }, { "epoch": 0.4122526073745586, "grad_norm": 1.5579621492113973, "learning_rate": 1.3266718193447086e-05, "loss": 0.8521, "step": 2510 }, { "epoch": 0.4124168514412417, "grad_norm": 1.4293064338069887, "learning_rate": 1.3261689385774839e-05, "loss": 0.8004, "step": 2511 }, { "epoch": 0.41258109550792477, "grad_norm": 1.5468137543591005, "learning_rate": 1.3256659654888272e-05, "loss": 0.7506, "step": 2512 }, { "epoch": 0.41274533957460785, "grad_norm": 1.5464093649583772, "learning_rate": 1.3251629002211042e-05, "loss": 0.8425, "step": 2513 }, { "epoch": 0.41290958364129093, "grad_norm": 1.496261070198569, "learning_rate": 1.3246597429167066e-05, "loss": 0.8148, "step": 2514 }, { "epoch": 0.4130738277079741, "grad_norm": 1.6143648615645358, "learning_rate": 1.3241564937180513e-05, "loss": 0.8115, "step": 2515 }, { "epoch": 0.41323807177465716, "grad_norm": 1.5622630995642761, "learning_rate": 1.3236531527675828e-05, "loss": 0.8233, "step": 2516 }, { "epoch": 0.41340231584134024, "grad_norm": 1.9886342537025028, "learning_rate": 1.3231497202077701e-05, "loss": 0.788, "step": 2517 }, { "epoch": 0.4135665599080233, "grad_norm": 1.7875629930093833, "learning_rate": 1.322646196181109e-05, "loss": 0.8522, "step": 2518 }, { "epoch": 0.4137308039747064, "grad_norm": 2.402023915736297, "learning_rate": 1.3221425808301209e-05, "loss": 0.8985, "step": 2519 }, { "epoch": 0.4138950480413895, "grad_norm": 2.411361088477607, "learning_rate": 1.3216388742973532e-05, "loss": 0.8438, "step": 2520 }, { "epoch": 0.4140592921080726, "grad_norm": 1.7649309130240638, "learning_rate": 1.3211350767253786e-05, "loss": 0.844, "step": 2521 }, { "epoch": 0.4142235361747557, "grad_norm": 1.5664984642090578, "learning_rate": 1.320631188256796e-05, "loss": 0.7972, "step": 2522 }, { "epoch": 0.4143877802414388, "grad_norm": 1.757508470914565, "learning_rate": 1.3201272090342303e-05, "loss": 0.7875, "step": 2523 }, { "epoch": 0.4145520243081219, "grad_norm": 0.7681876383179567, "learning_rate": 1.3196231392003316e-05, "loss": 0.3797, "step": 2524 }, { "epoch": 0.41471626837480496, "grad_norm": 2.3503633715387924, "learning_rate": 1.3191189788977758e-05, "loss": 0.6995, "step": 2525 }, { "epoch": 0.41488051244148805, "grad_norm": 1.4013569577805634, "learning_rate": 1.3186147282692643e-05, "loss": 0.8286, "step": 2526 }, { "epoch": 0.41504475650817113, "grad_norm": 1.4704650618422812, "learning_rate": 1.3181103874575243e-05, "loss": 0.7127, "step": 2527 }, { "epoch": 0.4152090005748542, "grad_norm": 1.9859927795793484, "learning_rate": 1.3176059566053083e-05, "loss": 0.8091, "step": 2528 }, { "epoch": 0.4153732446415373, "grad_norm": 1.6115746424346684, "learning_rate": 1.3171014358553946e-05, "loss": 0.8199, "step": 2529 }, { "epoch": 0.41553748870822044, "grad_norm": 1.7459218471382605, "learning_rate": 1.3165968253505865e-05, "loss": 0.8948, "step": 2530 }, { "epoch": 0.4157017327749035, "grad_norm": 0.7003916582497868, "learning_rate": 1.3160921252337131e-05, "loss": 0.3531, "step": 2531 }, { "epoch": 0.4158659768415866, "grad_norm": 1.584760266893877, "learning_rate": 1.3155873356476287e-05, "loss": 0.7959, "step": 2532 }, { "epoch": 0.4160302209082697, "grad_norm": 1.5343789066740339, "learning_rate": 1.3150824567352128e-05, "loss": 0.7086, "step": 2533 }, { "epoch": 0.41619446497495277, "grad_norm": 1.814722003951146, "learning_rate": 1.3145774886393704e-05, "loss": 0.7707, "step": 2534 }, { "epoch": 0.41635870904163585, "grad_norm": 1.8624592258584924, "learning_rate": 1.3140724315030315e-05, "loss": 0.8051, "step": 2535 }, { "epoch": 0.41652295310831894, "grad_norm": 1.6724547123321465, "learning_rate": 1.313567285469152e-05, "loss": 0.849, "step": 2536 }, { "epoch": 0.4166871971750021, "grad_norm": 2.52324067329546, "learning_rate": 1.3130620506807116e-05, "loss": 0.8304, "step": 2537 }, { "epoch": 0.41685144124168516, "grad_norm": 2.404066290272961, "learning_rate": 1.3125567272807167e-05, "loss": 0.8597, "step": 2538 }, { "epoch": 0.41701568530836824, "grad_norm": 1.9960628257143174, "learning_rate": 1.3120513154121976e-05, "loss": 0.8918, "step": 2539 }, { "epoch": 0.41717992937505133, "grad_norm": 1.6344507572965088, "learning_rate": 1.3115458152182102e-05, "loss": 0.7755, "step": 2540 }, { "epoch": 0.4173441734417344, "grad_norm": 2.0378949302173037, "learning_rate": 1.3110402268418352e-05, "loss": 0.6901, "step": 2541 }, { "epoch": 0.4175084175084175, "grad_norm": 1.5588819310435744, "learning_rate": 1.3105345504261781e-05, "loss": 0.7887, "step": 2542 }, { "epoch": 0.4176726615751006, "grad_norm": 1.9242690133269857, "learning_rate": 1.3100287861143703e-05, "loss": 0.8409, "step": 2543 }, { "epoch": 0.4178369056417837, "grad_norm": 1.6213582275037104, "learning_rate": 1.3095229340495665e-05, "loss": 0.8406, "step": 2544 }, { "epoch": 0.4180011497084668, "grad_norm": 2.00660877429113, "learning_rate": 1.3090169943749475e-05, "loss": 0.7528, "step": 2545 }, { "epoch": 0.4181653937751499, "grad_norm": 2.4074992631610916, "learning_rate": 1.3085109672337183e-05, "loss": 0.887, "step": 2546 }, { "epoch": 0.41832963784183297, "grad_norm": 1.440839048534631, "learning_rate": 1.3080048527691092e-05, "loss": 0.7606, "step": 2547 }, { "epoch": 0.41849388190851605, "grad_norm": 2.0625308537219893, "learning_rate": 1.3074986511243741e-05, "loss": 0.832, "step": 2548 }, { "epoch": 0.41865812597519914, "grad_norm": 2.211977495327317, "learning_rate": 1.306992362442793e-05, "loss": 0.8239, "step": 2549 }, { "epoch": 0.4188223700418822, "grad_norm": 2.0471326021175966, "learning_rate": 1.3064859868676694e-05, "loss": 0.8129, "step": 2550 }, { "epoch": 0.4189866141085653, "grad_norm": 1.9718422434396943, "learning_rate": 1.3059795245423319e-05, "loss": 0.7873, "step": 2551 }, { "epoch": 0.41915085817524844, "grad_norm": 1.6527743737944138, "learning_rate": 1.3054729756101338e-05, "loss": 0.7237, "step": 2552 }, { "epoch": 0.4193151022419315, "grad_norm": 2.5397863243709176, "learning_rate": 1.3049663402144528e-05, "loss": 0.8126, "step": 2553 }, { "epoch": 0.4194793463086146, "grad_norm": 1.5414289918812876, "learning_rate": 1.3044596184986906e-05, "loss": 0.7633, "step": 2554 }, { "epoch": 0.4196435903752977, "grad_norm": 2.0639408639522006, "learning_rate": 1.303952810606274e-05, "loss": 0.8524, "step": 2555 }, { "epoch": 0.4198078344419808, "grad_norm": 0.7324898053782838, "learning_rate": 1.3034459166806537e-05, "loss": 0.3114, "step": 2556 }, { "epoch": 0.41997207850866386, "grad_norm": 1.59309393359768, "learning_rate": 1.3029389368653051e-05, "loss": 0.7473, "step": 2557 }, { "epoch": 0.42013632257534694, "grad_norm": 1.8582911881400046, "learning_rate": 1.302431871303728e-05, "loss": 0.8057, "step": 2558 }, { "epoch": 0.4203005666420301, "grad_norm": 2.314449268908037, "learning_rate": 1.3019247201394456e-05, "loss": 0.7576, "step": 2559 }, { "epoch": 0.42046481070871317, "grad_norm": 0.6469659325424681, "learning_rate": 1.3014174835160065e-05, "loss": 0.3772, "step": 2560 }, { "epoch": 0.42062905477539625, "grad_norm": 2.1487194959668856, "learning_rate": 1.300910161576983e-05, "loss": 0.8498, "step": 2561 }, { "epoch": 0.42079329884207933, "grad_norm": 1.5545647042431106, "learning_rate": 1.3004027544659712e-05, "loss": 0.7168, "step": 2562 }, { "epoch": 0.4209575429087624, "grad_norm": 2.129310487210473, "learning_rate": 1.2998952623265917e-05, "loss": 0.7892, "step": 2563 }, { "epoch": 0.4211217869754455, "grad_norm": 2.1192091661906245, "learning_rate": 1.2993876853024891e-05, "loss": 0.8499, "step": 2564 }, { "epoch": 0.4212860310421286, "grad_norm": 2.2546335478407884, "learning_rate": 1.298880023537332e-05, "loss": 0.7077, "step": 2565 }, { "epoch": 0.42145027510881167, "grad_norm": 1.7966636899230095, "learning_rate": 1.2983722771748131e-05, "loss": 0.7614, "step": 2566 }, { "epoch": 0.4216145191754948, "grad_norm": 2.215656497601288, "learning_rate": 1.2978644463586489e-05, "loss": 0.8226, "step": 2567 }, { "epoch": 0.4217787632421779, "grad_norm": 2.3860118575123925, "learning_rate": 1.2973565312325798e-05, "loss": 0.7395, "step": 2568 }, { "epoch": 0.421943007308861, "grad_norm": 2.357682286050106, "learning_rate": 1.29684853194037e-05, "loss": 0.8398, "step": 2569 }, { "epoch": 0.42210725137554406, "grad_norm": 1.6092915368078797, "learning_rate": 1.296340448625808e-05, "loss": 0.7616, "step": 2570 }, { "epoch": 0.42227149544222714, "grad_norm": 1.6083893398162246, "learning_rate": 1.2958322814327053e-05, "loss": 0.8185, "step": 2571 }, { "epoch": 0.4224357395089102, "grad_norm": 1.671392294579906, "learning_rate": 1.2953240305048978e-05, "loss": 0.7949, "step": 2572 }, { "epoch": 0.4225999835755933, "grad_norm": 2.250207068903031, "learning_rate": 1.2948156959862446e-05, "loss": 0.8407, "step": 2573 }, { "epoch": 0.42276422764227645, "grad_norm": 1.5190496504432998, "learning_rate": 1.294307278020629e-05, "loss": 0.8052, "step": 2574 }, { "epoch": 0.42292847170895953, "grad_norm": 1.9534297610220015, "learning_rate": 1.2937987767519576e-05, "loss": 0.7154, "step": 2575 }, { "epoch": 0.4230927157756426, "grad_norm": 2.2493140754176224, "learning_rate": 1.2932901923241603e-05, "loss": 0.8295, "step": 2576 }, { "epoch": 0.4232569598423257, "grad_norm": 2.309100568100057, "learning_rate": 1.2927815248811913e-05, "loss": 0.8029, "step": 2577 }, { "epoch": 0.4234212039090088, "grad_norm": 1.7039608165032392, "learning_rate": 1.2922727745670276e-05, "loss": 0.7964, "step": 2578 }, { "epoch": 0.42358544797569186, "grad_norm": 1.7191317316176506, "learning_rate": 1.29176394152567e-05, "loss": 0.8489, "step": 2579 }, { "epoch": 0.42374969204237495, "grad_norm": 1.7522076541305371, "learning_rate": 1.2912550259011422e-05, "loss": 0.8747, "step": 2580 }, { "epoch": 0.4239139361090581, "grad_norm": 1.9813741104326699, "learning_rate": 1.2907460278374925e-05, "loss": 0.7747, "step": 2581 }, { "epoch": 0.42407818017574117, "grad_norm": 2.102577910321952, "learning_rate": 1.2902369474787912e-05, "loss": 0.79, "step": 2582 }, { "epoch": 0.42424242424242425, "grad_norm": 3.0800608436347092, "learning_rate": 1.2897277849691326e-05, "loss": 0.7892, "step": 2583 }, { "epoch": 0.42440666830910734, "grad_norm": 1.5985419235935476, "learning_rate": 1.2892185404526338e-05, "loss": 0.7529, "step": 2584 }, { "epoch": 0.4245709123757904, "grad_norm": 2.337487178423964, "learning_rate": 1.2887092140734357e-05, "loss": 0.8071, "step": 2585 }, { "epoch": 0.4247351564424735, "grad_norm": 1.6760106361058835, "learning_rate": 1.288199805975702e-05, "loss": 0.7986, "step": 2586 }, { "epoch": 0.4248994005091566, "grad_norm": 1.768075002622865, "learning_rate": 1.2876903163036194e-05, "loss": 0.7264, "step": 2587 }, { "epoch": 0.42506364457583967, "grad_norm": 1.7422942534702766, "learning_rate": 1.2871807452013977e-05, "loss": 0.7305, "step": 2588 }, { "epoch": 0.4252278886425228, "grad_norm": 1.6945802433306802, "learning_rate": 1.2866710928132709e-05, "loss": 0.7867, "step": 2589 }, { "epoch": 0.4253921327092059, "grad_norm": 1.7585581392009204, "learning_rate": 1.2861613592834942e-05, "loss": 0.7668, "step": 2590 }, { "epoch": 0.425556376775889, "grad_norm": 1.7386183754885958, "learning_rate": 1.2856515447563467e-05, "loss": 0.7595, "step": 2591 }, { "epoch": 0.42572062084257206, "grad_norm": 0.7038767744873566, "learning_rate": 1.2851416493761301e-05, "loss": 0.3477, "step": 2592 }, { "epoch": 0.42588486490925515, "grad_norm": 1.8097410406779926, "learning_rate": 1.28463167328717e-05, "loss": 0.8391, "step": 2593 }, { "epoch": 0.42604910897593823, "grad_norm": 2.1244837300702875, "learning_rate": 1.2841216166338133e-05, "loss": 0.8202, "step": 2594 }, { "epoch": 0.4262133530426213, "grad_norm": 2.3824010572420997, "learning_rate": 1.2836114795604309e-05, "loss": 0.73, "step": 2595 }, { "epoch": 0.42637759710930445, "grad_norm": 1.3468640639646454, "learning_rate": 1.2831012622114159e-05, "loss": 0.7877, "step": 2596 }, { "epoch": 0.42654184117598754, "grad_norm": 1.7342552531741102, "learning_rate": 1.282590964731184e-05, "loss": 0.8061, "step": 2597 }, { "epoch": 0.4267060852426706, "grad_norm": 1.996050688974519, "learning_rate": 1.2820805872641745e-05, "loss": 0.8471, "step": 2598 }, { "epoch": 0.4268703293093537, "grad_norm": 2.2102485974617303, "learning_rate": 1.2815701299548478e-05, "loss": 0.8003, "step": 2599 }, { "epoch": 0.4270345733760368, "grad_norm": 0.6109671151938317, "learning_rate": 1.2810595929476884e-05, "loss": 0.3183, "step": 2600 }, { "epoch": 0.42719881744271987, "grad_norm": 1.5902105720835864, "learning_rate": 1.2805489763872026e-05, "loss": 0.7253, "step": 2601 }, { "epoch": 0.42736306150940295, "grad_norm": 2.0360027220044867, "learning_rate": 1.280038280417919e-05, "loss": 0.741, "step": 2602 }, { "epoch": 0.42752730557608604, "grad_norm": 2.5720321039671536, "learning_rate": 1.2795275051843893e-05, "loss": 0.821, "step": 2603 }, { "epoch": 0.4276915496427692, "grad_norm": 1.7577361067605444, "learning_rate": 1.2790166508311872e-05, "loss": 0.7275, "step": 2604 }, { "epoch": 0.42785579370945226, "grad_norm": 2.0535533363213205, "learning_rate": 1.2785057175029092e-05, "loss": 0.8179, "step": 2605 }, { "epoch": 0.42802003777613534, "grad_norm": 2.2670873464523353, "learning_rate": 1.277994705344174e-05, "loss": 0.858, "step": 2606 }, { "epoch": 0.4281842818428184, "grad_norm": 1.6278569824906683, "learning_rate": 1.2774836144996222e-05, "loss": 0.7879, "step": 2607 }, { "epoch": 0.4283485259095015, "grad_norm": 1.6725689614983164, "learning_rate": 1.276972445113917e-05, "loss": 0.7896, "step": 2608 }, { "epoch": 0.4285127699761846, "grad_norm": 1.8115070914662323, "learning_rate": 1.276461197331744e-05, "loss": 0.8251, "step": 2609 }, { "epoch": 0.4286770140428677, "grad_norm": 1.666169831456258, "learning_rate": 1.2759498712978106e-05, "loss": 0.8669, "step": 2610 }, { "epoch": 0.4288412581095508, "grad_norm": 0.6598613080394321, "learning_rate": 1.2754384671568469e-05, "loss": 0.3115, "step": 2611 }, { "epoch": 0.4290055021762339, "grad_norm": 1.530308055791298, "learning_rate": 1.2749269850536045e-05, "loss": 0.7511, "step": 2612 }, { "epoch": 0.429169746242917, "grad_norm": 1.6395543390679401, "learning_rate": 1.2744154251328573e-05, "loss": 0.7605, "step": 2613 }, { "epoch": 0.42933399030960007, "grad_norm": 1.7138599545201332, "learning_rate": 1.2739037875394013e-05, "loss": 0.8063, "step": 2614 }, { "epoch": 0.42949823437628315, "grad_norm": 1.7996045761719583, "learning_rate": 1.2733920724180542e-05, "loss": 0.7391, "step": 2615 }, { "epoch": 0.42966247844296623, "grad_norm": 1.876577522679765, "learning_rate": 1.2728802799136566e-05, "loss": 0.6661, "step": 2616 }, { "epoch": 0.4298267225096493, "grad_norm": 2.457665594076083, "learning_rate": 1.2723684101710696e-05, "loss": 0.8162, "step": 2617 }, { "epoch": 0.42999096657633246, "grad_norm": 2.2121782425057397, "learning_rate": 1.2718564633351773e-05, "loss": 0.8429, "step": 2618 }, { "epoch": 0.43015521064301554, "grad_norm": 1.8978519110304628, "learning_rate": 1.271344439550885e-05, "loss": 0.7697, "step": 2619 }, { "epoch": 0.4303194547096986, "grad_norm": 1.8979435760200736, "learning_rate": 1.2708323389631198e-05, "loss": 0.813, "step": 2620 }, { "epoch": 0.4304836987763817, "grad_norm": 1.5276186237372367, "learning_rate": 1.270320161716831e-05, "loss": 0.8164, "step": 2621 }, { "epoch": 0.4306479428430648, "grad_norm": 1.7054545600910966, "learning_rate": 1.2698079079569891e-05, "loss": 0.7782, "step": 2622 }, { "epoch": 0.4308121869097479, "grad_norm": 1.9517959542851526, "learning_rate": 1.2692955778285865e-05, "loss": 0.8302, "step": 2623 }, { "epoch": 0.43097643097643096, "grad_norm": 2.008055813396158, "learning_rate": 1.268783171476637e-05, "loss": 0.8416, "step": 2624 }, { "epoch": 0.43114067504311404, "grad_norm": 1.707812939278916, "learning_rate": 1.2682706890461764e-05, "loss": 0.9035, "step": 2625 }, { "epoch": 0.4313049191097972, "grad_norm": 1.574411661816596, "learning_rate": 1.2677581306822613e-05, "loss": 0.786, "step": 2626 }, { "epoch": 0.43146916317648026, "grad_norm": 1.9964572396296136, "learning_rate": 1.267245496529971e-05, "loss": 0.8589, "step": 2627 }, { "epoch": 0.43163340724316335, "grad_norm": 2.0527802655464398, "learning_rate": 1.266732786734405e-05, "loss": 0.7242, "step": 2628 }, { "epoch": 0.43179765130984643, "grad_norm": 1.8284557303033795, "learning_rate": 1.2662200014406848e-05, "loss": 0.7653, "step": 2629 }, { "epoch": 0.4319618953765295, "grad_norm": 1.734213546419165, "learning_rate": 1.2657071407939536e-05, "loss": 0.7559, "step": 2630 }, { "epoch": 0.4321261394432126, "grad_norm": 1.648434140881532, "learning_rate": 1.265194204939375e-05, "loss": 0.8356, "step": 2631 }, { "epoch": 0.4322903835098957, "grad_norm": 2.5625186614227, "learning_rate": 1.2646811940221346e-05, "loss": 0.7117, "step": 2632 }, { "epoch": 0.4324546275765788, "grad_norm": 2.200384742912208, "learning_rate": 1.2641681081874394e-05, "loss": 0.7739, "step": 2633 }, { "epoch": 0.4326188716432619, "grad_norm": 2.1403984386139343, "learning_rate": 1.2636549475805165e-05, "loss": 0.8111, "step": 2634 }, { "epoch": 0.432783115709945, "grad_norm": 1.615220526314349, "learning_rate": 1.2631417123466154e-05, "loss": 0.8246, "step": 2635 }, { "epoch": 0.43294735977662807, "grad_norm": 1.6454992485435243, "learning_rate": 1.2626284026310062e-05, "loss": 0.8056, "step": 2636 }, { "epoch": 0.43311160384331115, "grad_norm": 1.9240980739308298, "learning_rate": 1.2621150185789803e-05, "loss": 0.7839, "step": 2637 }, { "epoch": 0.43327584790999424, "grad_norm": 1.5804632689262352, "learning_rate": 1.2616015603358497e-05, "loss": 0.8772, "step": 2638 }, { "epoch": 0.4334400919766773, "grad_norm": 1.8137913864211819, "learning_rate": 1.261088028046948e-05, "loss": 0.7953, "step": 2639 }, { "epoch": 0.43360433604336046, "grad_norm": 2.2096186028778004, "learning_rate": 1.260574421857629e-05, "loss": 0.8155, "step": 2640 }, { "epoch": 0.43376858011004354, "grad_norm": 1.5108346880776107, "learning_rate": 1.2600607419132685e-05, "loss": 0.798, "step": 2641 }, { "epoch": 0.43393282417672663, "grad_norm": 2.5176818680369397, "learning_rate": 1.2595469883592617e-05, "loss": 0.8383, "step": 2642 }, { "epoch": 0.4340970682434097, "grad_norm": 1.4298204862943655, "learning_rate": 1.2590331613410261e-05, "loss": 0.8401, "step": 2643 }, { "epoch": 0.4342613123100928, "grad_norm": 1.875956746536616, "learning_rate": 1.258519261003999e-05, "loss": 0.7244, "step": 2644 }, { "epoch": 0.4344255563767759, "grad_norm": 23.80090047318258, "learning_rate": 1.2580052874936393e-05, "loss": 0.8036, "step": 2645 }, { "epoch": 0.43458980044345896, "grad_norm": 1.5323733143621217, "learning_rate": 1.2574912409554254e-05, "loss": 0.7437, "step": 2646 }, { "epoch": 0.43475404451014205, "grad_norm": 1.5286395248947962, "learning_rate": 1.2569771215348576e-05, "loss": 0.8202, "step": 2647 }, { "epoch": 0.4349182885768252, "grad_norm": 1.7087135104258058, "learning_rate": 1.2564629293774561e-05, "loss": 0.7826, "step": 2648 }, { "epoch": 0.43508253264350827, "grad_norm": 1.9032730644417242, "learning_rate": 1.2559486646287622e-05, "loss": 0.8192, "step": 2649 }, { "epoch": 0.43524677671019135, "grad_norm": 3.24580169182234, "learning_rate": 1.2554343274343367e-05, "loss": 0.8282, "step": 2650 }, { "epoch": 0.43541102077687444, "grad_norm": 1.7590215006474883, "learning_rate": 1.2549199179397627e-05, "loss": 0.7605, "step": 2651 }, { "epoch": 0.4355752648435575, "grad_norm": 2.2054439985791174, "learning_rate": 1.2544054362906421e-05, "loss": 0.7494, "step": 2652 }, { "epoch": 0.4357395089102406, "grad_norm": 1.7592120143178738, "learning_rate": 1.253890882632598e-05, "loss": 0.7279, "step": 2653 }, { "epoch": 0.4359037529769237, "grad_norm": 2.3811014107341673, "learning_rate": 1.2533762571112737e-05, "loss": 0.8259, "step": 2654 }, { "epoch": 0.4360679970436068, "grad_norm": 1.6530338944607577, "learning_rate": 1.2528615598723333e-05, "loss": 0.743, "step": 2655 }, { "epoch": 0.4362322411102899, "grad_norm": 1.8497881678790835, "learning_rate": 1.2523467910614597e-05, "loss": 0.8499, "step": 2656 }, { "epoch": 0.436396485176973, "grad_norm": 1.7982201132251219, "learning_rate": 1.2518319508243582e-05, "loss": 0.7875, "step": 2657 }, { "epoch": 0.4365607292436561, "grad_norm": 1.5948872190728418, "learning_rate": 1.2513170393067527e-05, "loss": 0.7903, "step": 2658 }, { "epoch": 0.43672497331033916, "grad_norm": 1.9563696191323858, "learning_rate": 1.2508020566543876e-05, "loss": 0.8285, "step": 2659 }, { "epoch": 0.43688921737702224, "grad_norm": 1.986384583026644, "learning_rate": 1.2502870030130285e-05, "loss": 0.8514, "step": 2660 }, { "epoch": 0.4370534614437053, "grad_norm": 1.4325806980376024, "learning_rate": 1.2497718785284594e-05, "loss": 0.8089, "step": 2661 }, { "epoch": 0.4372177055103884, "grad_norm": 3.69769240582976, "learning_rate": 1.2492566833464857e-05, "loss": 0.8201, "step": 2662 }, { "epoch": 0.43738194957707155, "grad_norm": 1.7293318763659546, "learning_rate": 1.2487414176129322e-05, "loss": 0.8219, "step": 2663 }, { "epoch": 0.43754619364375463, "grad_norm": 1.5882786582603305, "learning_rate": 1.2482260814736438e-05, "loss": 0.8253, "step": 2664 }, { "epoch": 0.4377104377104377, "grad_norm": 1.8225060497772076, "learning_rate": 1.2477106750744852e-05, "loss": 0.7708, "step": 2665 }, { "epoch": 0.4378746817771208, "grad_norm": 1.9525974202809677, "learning_rate": 1.2471951985613414e-05, "loss": 0.7601, "step": 2666 }, { "epoch": 0.4380389258438039, "grad_norm": 1.5557568287353338, "learning_rate": 1.2466796520801163e-05, "loss": 0.8462, "step": 2667 }, { "epoch": 0.43820316991048697, "grad_norm": 1.4023302653897731, "learning_rate": 1.246164035776735e-05, "loss": 0.8537, "step": 2668 }, { "epoch": 0.43836741397717005, "grad_norm": 0.6842474912214843, "learning_rate": 1.245648349797141e-05, "loss": 0.3726, "step": 2669 }, { "epoch": 0.4385316580438532, "grad_norm": 1.7808194992274102, "learning_rate": 1.2451325942872984e-05, "loss": 0.8222, "step": 2670 }, { "epoch": 0.4386959021105363, "grad_norm": 1.6023675361363128, "learning_rate": 1.2446167693931907e-05, "loss": 0.8588, "step": 2671 }, { "epoch": 0.43886014617721936, "grad_norm": 1.722356154131352, "learning_rate": 1.2441008752608212e-05, "loss": 0.7042, "step": 2672 }, { "epoch": 0.43902439024390244, "grad_norm": 2.8834832198619695, "learning_rate": 1.2435849120362123e-05, "loss": 0.7371, "step": 2673 }, { "epoch": 0.4391886343105855, "grad_norm": 1.6556075091194042, "learning_rate": 1.2430688798654064e-05, "loss": 0.6931, "step": 2674 }, { "epoch": 0.4393528783772686, "grad_norm": 1.9826348802200875, "learning_rate": 1.2425527788944656e-05, "loss": 0.8117, "step": 2675 }, { "epoch": 0.4395171224439517, "grad_norm": 1.268725492535557, "learning_rate": 1.2420366092694713e-05, "loss": 0.7527, "step": 2676 }, { "epoch": 0.43968136651063483, "grad_norm": 1.6850733962149607, "learning_rate": 1.2415203711365238e-05, "loss": 0.8957, "step": 2677 }, { "epoch": 0.4398456105773179, "grad_norm": 1.4639144072378325, "learning_rate": 1.2410040646417431e-05, "loss": 0.6922, "step": 2678 }, { "epoch": 0.440009854644001, "grad_norm": 1.9933963904082346, "learning_rate": 1.2404876899312693e-05, "loss": 0.7524, "step": 2679 }, { "epoch": 0.4401740987106841, "grad_norm": 1.891639040500139, "learning_rate": 1.2399712471512607e-05, "loss": 0.8585, "step": 2680 }, { "epoch": 0.44033834277736716, "grad_norm": 1.7848218913701432, "learning_rate": 1.239454736447895e-05, "loss": 0.8115, "step": 2681 }, { "epoch": 0.44050258684405025, "grad_norm": 1.6734026213833693, "learning_rate": 1.2389381579673704e-05, "loss": 0.832, "step": 2682 }, { "epoch": 0.44066683091073333, "grad_norm": 1.802181067387331, "learning_rate": 1.2384215118559027e-05, "loss": 0.7754, "step": 2683 }, { "epoch": 0.4408310749774164, "grad_norm": 1.294549689700015, "learning_rate": 1.2379047982597277e-05, "loss": 0.8182, "step": 2684 }, { "epoch": 0.44099531904409955, "grad_norm": 1.609821809890346, "learning_rate": 1.2373880173250998e-05, "loss": 0.7164, "step": 2685 }, { "epoch": 0.44115956311078264, "grad_norm": 1.8445943811746235, "learning_rate": 1.2368711691982933e-05, "loss": 0.8295, "step": 2686 }, { "epoch": 0.4413238071774657, "grad_norm": 1.716293992228234, "learning_rate": 1.236354254025601e-05, "loss": 0.893, "step": 2687 }, { "epoch": 0.4414880512441488, "grad_norm": 1.7488386171568346, "learning_rate": 1.235837271953334e-05, "loss": 0.8869, "step": 2688 }, { "epoch": 0.4416522953108319, "grad_norm": 1.7602620482194193, "learning_rate": 1.2353202231278232e-05, "loss": 0.7644, "step": 2689 }, { "epoch": 0.44181653937751497, "grad_norm": 1.507834379814725, "learning_rate": 1.2348031076954186e-05, "loss": 0.742, "step": 2690 }, { "epoch": 0.44198078344419806, "grad_norm": 1.765574572337032, "learning_rate": 1.2342859258024882e-05, "loss": 0.8053, "step": 2691 }, { "epoch": 0.4421450275108812, "grad_norm": 1.572425978535637, "learning_rate": 1.2337686775954193e-05, "loss": 0.7781, "step": 2692 }, { "epoch": 0.4423092715775643, "grad_norm": 1.99112123049374, "learning_rate": 1.2332513632206183e-05, "loss": 0.7555, "step": 2693 }, { "epoch": 0.44247351564424736, "grad_norm": 1.8926386447396557, "learning_rate": 1.2327339828245092e-05, "loss": 0.8168, "step": 2694 }, { "epoch": 0.44263775971093045, "grad_norm": 1.5964691075687079, "learning_rate": 1.2322165365535364e-05, "loss": 0.7827, "step": 2695 }, { "epoch": 0.44280200377761353, "grad_norm": 1.6442303782593284, "learning_rate": 1.2316990245541609e-05, "loss": 0.8199, "step": 2696 }, { "epoch": 0.4429662478442966, "grad_norm": 2.0397944310002014, "learning_rate": 1.2311814469728643e-05, "loss": 0.8238, "step": 2697 }, { "epoch": 0.4431304919109797, "grad_norm": 1.7486569254728597, "learning_rate": 1.2306638039561455e-05, "loss": 0.8063, "step": 2698 }, { "epoch": 0.4432947359776628, "grad_norm": 1.9824570294089827, "learning_rate": 1.2301460956505225e-05, "loss": 0.7742, "step": 2699 }, { "epoch": 0.4434589800443459, "grad_norm": 2.789004614705028, "learning_rate": 1.229628322202531e-05, "loss": 0.8199, "step": 2700 }, { "epoch": 0.443623224111029, "grad_norm": 1.8719266617387758, "learning_rate": 1.229110483758726e-05, "loss": 0.789, "step": 2701 }, { "epoch": 0.4437874681777121, "grad_norm": 0.7505770885909337, "learning_rate": 1.2285925804656806e-05, "loss": 0.3337, "step": 2702 }, { "epoch": 0.44395171224439517, "grad_norm": 2.0314624033840087, "learning_rate": 1.2280746124699864e-05, "loss": 0.7406, "step": 2703 }, { "epoch": 0.44411595631107825, "grad_norm": 2.00530065893799, "learning_rate": 1.2275565799182527e-05, "loss": 0.7985, "step": 2704 }, { "epoch": 0.44428020037776134, "grad_norm": 1.6391022882149011, "learning_rate": 1.227038482957108e-05, "loss": 0.7785, "step": 2705 }, { "epoch": 0.4444444444444444, "grad_norm": 2.4846126731286127, "learning_rate": 1.2265203217331982e-05, "loss": 0.7832, "step": 2706 }, { "epoch": 0.44460868851112756, "grad_norm": 1.7563101253047364, "learning_rate": 1.226002096393188e-05, "loss": 0.7451, "step": 2707 }, { "epoch": 0.44477293257781064, "grad_norm": 1.7058266850489678, "learning_rate": 1.2254838070837596e-05, "loss": 0.7962, "step": 2708 }, { "epoch": 0.4449371766444937, "grad_norm": 1.541383558891538, "learning_rate": 1.2249654539516143e-05, "loss": 0.8012, "step": 2709 }, { "epoch": 0.4451014207111768, "grad_norm": 1.9440356896704631, "learning_rate": 1.2244470371434705e-05, "loss": 0.7908, "step": 2710 }, { "epoch": 0.4452656647778599, "grad_norm": 1.396980105816837, "learning_rate": 1.2239285568060651e-05, "loss": 0.8982, "step": 2711 }, { "epoch": 0.445429908844543, "grad_norm": 1.774159452964297, "learning_rate": 1.2234100130861525e-05, "loss": 0.7941, "step": 2712 }, { "epoch": 0.44559415291122606, "grad_norm": 2.0240523966181283, "learning_rate": 1.2228914061305059e-05, "loss": 0.7691, "step": 2713 }, { "epoch": 0.4457583969779092, "grad_norm": 1.823314904309345, "learning_rate": 1.2223727360859156e-05, "loss": 0.7351, "step": 2714 }, { "epoch": 0.4459226410445923, "grad_norm": 2.5803191790495776, "learning_rate": 1.2218540030991903e-05, "loss": 0.8415, "step": 2715 }, { "epoch": 0.44608688511127537, "grad_norm": 2.005202209782677, "learning_rate": 1.2213352073171562e-05, "loss": 0.7208, "step": 2716 }, { "epoch": 0.44625112917795845, "grad_norm": 1.573532055375388, "learning_rate": 1.2208163488866573e-05, "loss": 0.7995, "step": 2717 }, { "epoch": 0.44641537324464153, "grad_norm": 2.2712865542053304, "learning_rate": 1.2202974279545554e-05, "loss": 0.8044, "step": 2718 }, { "epoch": 0.4465796173113246, "grad_norm": 1.4813679795461208, "learning_rate": 1.2197784446677299e-05, "loss": 0.785, "step": 2719 }, { "epoch": 0.4467438613780077, "grad_norm": 2.2026073426344706, "learning_rate": 1.2192593991730781e-05, "loss": 0.7807, "step": 2720 }, { "epoch": 0.4469081054446908, "grad_norm": 1.9092403146317853, "learning_rate": 1.2187402916175146e-05, "loss": 0.7078, "step": 2721 }, { "epoch": 0.4470723495113739, "grad_norm": 0.7223085735311459, "learning_rate": 1.2182211221479719e-05, "loss": 0.3232, "step": 2722 }, { "epoch": 0.447236593578057, "grad_norm": 1.59560443712604, "learning_rate": 1.2177018909113994e-05, "loss": 0.8486, "step": 2723 }, { "epoch": 0.4474008376447401, "grad_norm": 2.0021793143859146, "learning_rate": 1.2171825980547646e-05, "loss": 0.7858, "step": 2724 }, { "epoch": 0.4475650817114232, "grad_norm": 1.86951052988164, "learning_rate": 1.2166632437250527e-05, "loss": 0.7231, "step": 2725 }, { "epoch": 0.44772932577810626, "grad_norm": 3.1823412240705813, "learning_rate": 1.2161438280692655e-05, "loss": 0.6364, "step": 2726 }, { "epoch": 0.44789356984478934, "grad_norm": 2.3933082847028873, "learning_rate": 1.215624351234422e-05, "loss": 0.7548, "step": 2727 }, { "epoch": 0.4480578139114724, "grad_norm": 7.84119161144733, "learning_rate": 1.21510481336756e-05, "loss": 0.7136, "step": 2728 }, { "epoch": 0.44822205797815556, "grad_norm": 1.6437278986867976, "learning_rate": 1.214585214615733e-05, "loss": 0.78, "step": 2729 }, { "epoch": 0.44838630204483865, "grad_norm": 2.045764791370275, "learning_rate": 1.2140655551260124e-05, "loss": 0.6548, "step": 2730 }, { "epoch": 0.44855054611152173, "grad_norm": 2.1010757884921434, "learning_rate": 1.2135458350454867e-05, "loss": 0.7137, "step": 2731 }, { "epoch": 0.4487147901782048, "grad_norm": 1.6749012257713618, "learning_rate": 1.2130260545212618e-05, "loss": 0.8855, "step": 2732 }, { "epoch": 0.4488790342448879, "grad_norm": 1.7939461818183748, "learning_rate": 1.2125062137004602e-05, "loss": 0.6906, "step": 2733 }, { "epoch": 0.449043278311571, "grad_norm": 1.5732122788979532, "learning_rate": 1.2119863127302221e-05, "loss": 0.8365, "step": 2734 }, { "epoch": 0.44920752237825406, "grad_norm": 1.8418998361999395, "learning_rate": 1.211466351757704e-05, "loss": 0.7841, "step": 2735 }, { "epoch": 0.4493717664449372, "grad_norm": 1.6637403868966159, "learning_rate": 1.2109463309300798e-05, "loss": 0.8686, "step": 2736 }, { "epoch": 0.4495360105116203, "grad_norm": 1.7233026371032762, "learning_rate": 1.2104262503945406e-05, "loss": 0.7717, "step": 2737 }, { "epoch": 0.44970025457830337, "grad_norm": 1.4818827953143163, "learning_rate": 1.2099061102982939e-05, "loss": 0.7543, "step": 2738 }, { "epoch": 0.44986449864498645, "grad_norm": 1.8421311881293683, "learning_rate": 1.2093859107885642e-05, "loss": 0.7863, "step": 2739 }, { "epoch": 0.45002874271166954, "grad_norm": 1.7986235198420029, "learning_rate": 1.2088656520125929e-05, "loss": 0.7864, "step": 2740 }, { "epoch": 0.4501929867783526, "grad_norm": 1.964370718902037, "learning_rate": 1.2083453341176386e-05, "loss": 0.7295, "step": 2741 }, { "epoch": 0.4503572308450357, "grad_norm": 2.920263722776097, "learning_rate": 1.2078249572509755e-05, "loss": 0.8197, "step": 2742 }, { "epoch": 0.4505214749117188, "grad_norm": 1.877703086989278, "learning_rate": 1.2073045215598953e-05, "loss": 0.7923, "step": 2743 }, { "epoch": 0.45068571897840193, "grad_norm": 1.9519296067112601, "learning_rate": 1.2067840271917066e-05, "loss": 0.8415, "step": 2744 }, { "epoch": 0.450849963045085, "grad_norm": 1.5798755794469908, "learning_rate": 1.206263474293734e-05, "loss": 0.8299, "step": 2745 }, { "epoch": 0.4510142071117681, "grad_norm": 1.6785192735141286, "learning_rate": 1.205742863013319e-05, "loss": 0.7866, "step": 2746 }, { "epoch": 0.4511784511784512, "grad_norm": 2.748644904403262, "learning_rate": 1.2052221934978197e-05, "loss": 0.7379, "step": 2747 }, { "epoch": 0.45134269524513426, "grad_norm": 2.495698710266908, "learning_rate": 1.20470146589461e-05, "loss": 0.8304, "step": 2748 }, { "epoch": 0.45150693931181735, "grad_norm": 2.100353213134524, "learning_rate": 1.2041806803510809e-05, "loss": 0.8056, "step": 2749 }, { "epoch": 0.45167118337850043, "grad_norm": 1.502288525508338, "learning_rate": 1.20365983701464e-05, "loss": 0.8552, "step": 2750 }, { "epoch": 0.45183542744518357, "grad_norm": 17.60376901858516, "learning_rate": 1.2031389360327106e-05, "loss": 0.8143, "step": 2751 }, { "epoch": 0.45199967151186665, "grad_norm": 3.1196419038249563, "learning_rate": 1.202617977552733e-05, "loss": 0.7302, "step": 2752 }, { "epoch": 0.45216391557854974, "grad_norm": 1.8673334730163178, "learning_rate": 1.2020969617221627e-05, "loss": 0.7492, "step": 2753 }, { "epoch": 0.4523281596452328, "grad_norm": 1.7422264747340226, "learning_rate": 1.2015758886884727e-05, "loss": 0.7761, "step": 2754 }, { "epoch": 0.4524924037119159, "grad_norm": 1.6944384962050782, "learning_rate": 1.2010547585991516e-05, "loss": 0.8074, "step": 2755 }, { "epoch": 0.452656647778599, "grad_norm": 1.7496904098479142, "learning_rate": 1.200533571601704e-05, "loss": 0.662, "step": 2756 }, { "epoch": 0.45282089184528207, "grad_norm": 1.849548112473127, "learning_rate": 1.2000123278436508e-05, "loss": 0.835, "step": 2757 }, { "epoch": 0.45298513591196515, "grad_norm": 1.984020268208311, "learning_rate": 1.199491027472529e-05, "loss": 0.7585, "step": 2758 }, { "epoch": 0.4531493799786483, "grad_norm": 2.204992414363511, "learning_rate": 1.1989696706358917e-05, "loss": 0.7086, "step": 2759 }, { "epoch": 0.4533136240453314, "grad_norm": 1.5560066561439658, "learning_rate": 1.1984482574813076e-05, "loss": 0.7382, "step": 2760 }, { "epoch": 0.45347786811201446, "grad_norm": 3.209180753793417, "learning_rate": 1.1979267881563618e-05, "loss": 0.7632, "step": 2761 }, { "epoch": 0.45364211217869754, "grad_norm": 2.0898979655494574, "learning_rate": 1.197405262808655e-05, "loss": 0.826, "step": 2762 }, { "epoch": 0.4538063562453806, "grad_norm": 1.5803658531550113, "learning_rate": 1.1968836815858038e-05, "loss": 0.8271, "step": 2763 }, { "epoch": 0.4539706003120637, "grad_norm": 1.783703592595727, "learning_rate": 1.1963620446354406e-05, "loss": 0.8001, "step": 2764 }, { "epoch": 0.4541348443787468, "grad_norm": 1.366681218385594, "learning_rate": 1.195840352105214e-05, "loss": 0.8074, "step": 2765 }, { "epoch": 0.45429908844542993, "grad_norm": 1.6494848558977409, "learning_rate": 1.1953186041427878e-05, "loss": 0.8392, "step": 2766 }, { "epoch": 0.454463332512113, "grad_norm": 2.498164341173253, "learning_rate": 1.1947968008958414e-05, "loss": 0.6371, "step": 2767 }, { "epoch": 0.4546275765787961, "grad_norm": 1.6208966412893246, "learning_rate": 1.1942749425120704e-05, "loss": 0.7474, "step": 2768 }, { "epoch": 0.4547918206454792, "grad_norm": 1.483000867421248, "learning_rate": 1.1937530291391857e-05, "loss": 0.7542, "step": 2769 }, { "epoch": 0.45495606471216227, "grad_norm": 1.8550202241564726, "learning_rate": 1.1932310609249135e-05, "loss": 0.7223, "step": 2770 }, { "epoch": 0.45512030877884535, "grad_norm": 1.5354022613068308, "learning_rate": 1.1927090380169963e-05, "loss": 0.8481, "step": 2771 }, { "epoch": 0.45528455284552843, "grad_norm": 1.4968596851030023, "learning_rate": 1.1921869605631914e-05, "loss": 0.8382, "step": 2772 }, { "epoch": 0.4554487969122116, "grad_norm": 0.7088880452871457, "learning_rate": 1.1916648287112714e-05, "loss": 0.3281, "step": 2773 }, { "epoch": 0.45561304097889466, "grad_norm": 1.4145640596107, "learning_rate": 1.191142642609025e-05, "loss": 0.8136, "step": 2774 }, { "epoch": 0.45577728504557774, "grad_norm": 1.7013144948265218, "learning_rate": 1.1906204024042556e-05, "loss": 0.7986, "step": 2775 }, { "epoch": 0.4559415291122608, "grad_norm": 1.7470446950565972, "learning_rate": 1.1900981082447822e-05, "loss": 0.812, "step": 2776 }, { "epoch": 0.4561057731789439, "grad_norm": 1.8648198427624638, "learning_rate": 1.1895757602784395e-05, "loss": 0.8746, "step": 2777 }, { "epoch": 0.456270017245627, "grad_norm": 2.4390738136302947, "learning_rate": 1.1890533586530766e-05, "loss": 0.7328, "step": 2778 }, { "epoch": 0.4564342613123101, "grad_norm": 1.7261762261814515, "learning_rate": 1.1885309035165582e-05, "loss": 0.6847, "step": 2779 }, { "epoch": 0.45659850537899316, "grad_norm": 2.6345014970689484, "learning_rate": 1.1880083950167642e-05, "loss": 0.7227, "step": 2780 }, { "epoch": 0.4567627494456763, "grad_norm": 1.4793974696476369, "learning_rate": 1.1874858333015895e-05, "loss": 0.7527, "step": 2781 }, { "epoch": 0.4569269935123594, "grad_norm": 1.4569014367086013, "learning_rate": 1.186963218518944e-05, "loss": 0.8129, "step": 2782 }, { "epoch": 0.45709123757904246, "grad_norm": 1.862187913821822, "learning_rate": 1.1864405508167532e-05, "loss": 0.7517, "step": 2783 }, { "epoch": 0.45725548164572555, "grad_norm": 0.6315849534546268, "learning_rate": 1.1859178303429566e-05, "loss": 0.3489, "step": 2784 }, { "epoch": 0.45741972571240863, "grad_norm": 1.6929391965070075, "learning_rate": 1.1853950572455093e-05, "loss": 0.805, "step": 2785 }, { "epoch": 0.4575839697790917, "grad_norm": 1.5790858517064608, "learning_rate": 1.1848722316723809e-05, "loss": 0.8189, "step": 2786 }, { "epoch": 0.4577482138457748, "grad_norm": 1.8160415131397742, "learning_rate": 1.1843493537715563e-05, "loss": 0.6807, "step": 2787 }, { "epoch": 0.45791245791245794, "grad_norm": 1.4217254342139858, "learning_rate": 1.1838264236910348e-05, "loss": 0.8085, "step": 2788 }, { "epoch": 0.458076701979141, "grad_norm": 1.8165101028675914, "learning_rate": 1.183303441578831e-05, "loss": 0.7635, "step": 2789 }, { "epoch": 0.4582409460458241, "grad_norm": 2.048559485403732, "learning_rate": 1.1827804075829738e-05, "loss": 0.7606, "step": 2790 }, { "epoch": 0.4584051901125072, "grad_norm": 1.799134703791278, "learning_rate": 1.1822573218515068e-05, "loss": 0.7869, "step": 2791 }, { "epoch": 0.45856943417919027, "grad_norm": 1.7068649847513377, "learning_rate": 1.1817341845324882e-05, "loss": 0.7828, "step": 2792 }, { "epoch": 0.45873367824587336, "grad_norm": 2.1275700513329796, "learning_rate": 1.1812109957739907e-05, "loss": 0.773, "step": 2793 }, { "epoch": 0.45889792231255644, "grad_norm": 1.9930265268079337, "learning_rate": 1.1806877557241023e-05, "loss": 0.72, "step": 2794 }, { "epoch": 0.4590621663792395, "grad_norm": 1.8191108854600828, "learning_rate": 1.1801644645309252e-05, "loss": 0.8163, "step": 2795 }, { "epoch": 0.45922641044592266, "grad_norm": 1.7765529917994733, "learning_rate": 1.179641122342575e-05, "loss": 0.8034, "step": 2796 }, { "epoch": 0.45939065451260575, "grad_norm": 1.9660929091431067, "learning_rate": 1.1791177293071831e-05, "loss": 0.7609, "step": 2797 }, { "epoch": 0.45955489857928883, "grad_norm": 0.5945146337795791, "learning_rate": 1.1785942855728945e-05, "loss": 0.3509, "step": 2798 }, { "epoch": 0.4597191426459719, "grad_norm": 1.589746567317058, "learning_rate": 1.1780707912878693e-05, "loss": 0.8483, "step": 2799 }, { "epoch": 0.459883386712655, "grad_norm": 1.760704346292144, "learning_rate": 1.1775472466002812e-05, "loss": 0.7827, "step": 2800 }, { "epoch": 0.4600476307793381, "grad_norm": 2.8527660127514105, "learning_rate": 1.1770236516583187e-05, "loss": 0.8281, "step": 2801 }, { "epoch": 0.46021187484602116, "grad_norm": 1.4693354552823938, "learning_rate": 1.176500006610184e-05, "loss": 0.8165, "step": 2802 }, { "epoch": 0.4603761189127043, "grad_norm": 2.0915684223119597, "learning_rate": 1.1759763116040936e-05, "loss": 0.7089, "step": 2803 }, { "epoch": 0.4605403629793874, "grad_norm": 0.6737127433556142, "learning_rate": 1.1754525667882786e-05, "loss": 0.3367, "step": 2804 }, { "epoch": 0.46070460704607047, "grad_norm": 1.9403331559580566, "learning_rate": 1.1749287723109834e-05, "loss": 0.8621, "step": 2805 }, { "epoch": 0.46086885111275355, "grad_norm": 1.4163021628131534, "learning_rate": 1.1744049283204677e-05, "loss": 0.808, "step": 2806 }, { "epoch": 0.46103309517943664, "grad_norm": 1.6154824159471808, "learning_rate": 1.1738810349650036e-05, "loss": 0.8146, "step": 2807 }, { "epoch": 0.4611973392461197, "grad_norm": 1.561231848909158, "learning_rate": 1.1733570923928785e-05, "loss": 0.7529, "step": 2808 }, { "epoch": 0.4613615833128028, "grad_norm": 0.6655289660828597, "learning_rate": 1.1728331007523928e-05, "loss": 0.3142, "step": 2809 }, { "epoch": 0.46152582737948594, "grad_norm": 0.5912144127984758, "learning_rate": 1.1723090601918616e-05, "loss": 0.3229, "step": 2810 }, { "epoch": 0.461690071446169, "grad_norm": 1.7965423882498062, "learning_rate": 1.1717849708596136e-05, "loss": 0.7735, "step": 2811 }, { "epoch": 0.4618543155128521, "grad_norm": 1.6705000597548736, "learning_rate": 1.171260832903991e-05, "loss": 0.7836, "step": 2812 }, { "epoch": 0.4620185595795352, "grad_norm": 1.6654936246776286, "learning_rate": 1.1707366464733501e-05, "loss": 0.8229, "step": 2813 }, { "epoch": 0.4621828036462183, "grad_norm": 0.6379387586919204, "learning_rate": 1.1702124117160603e-05, "loss": 0.3615, "step": 2814 }, { "epoch": 0.46234704771290136, "grad_norm": 1.3862891195895168, "learning_rate": 1.1696881287805056e-05, "loss": 0.8308, "step": 2815 }, { "epoch": 0.46251129177958444, "grad_norm": 1.6104829923409953, "learning_rate": 1.1691637978150831e-05, "loss": 0.767, "step": 2816 }, { "epoch": 0.4626755358462675, "grad_norm": 1.6804136352660775, "learning_rate": 1.1686394189682035e-05, "loss": 0.819, "step": 2817 }, { "epoch": 0.46283977991295067, "grad_norm": 1.6798078519682562, "learning_rate": 1.1681149923882913e-05, "loss": 0.7954, "step": 2818 }, { "epoch": 0.46300402397963375, "grad_norm": 1.629034176061907, "learning_rate": 1.1675905182237839e-05, "loss": 0.7727, "step": 2819 }, { "epoch": 0.46316826804631683, "grad_norm": 0.673753447214259, "learning_rate": 1.167065996623133e-05, "loss": 0.3378, "step": 2820 }, { "epoch": 0.4633325121129999, "grad_norm": 1.733419644322018, "learning_rate": 1.166541427734803e-05, "loss": 0.7221, "step": 2821 }, { "epoch": 0.463496756179683, "grad_norm": 2.7029535420006967, "learning_rate": 1.1660168117072725e-05, "loss": 0.7753, "step": 2822 }, { "epoch": 0.4636610002463661, "grad_norm": 1.5866545599145578, "learning_rate": 1.1654921486890327e-05, "loss": 0.7697, "step": 2823 }, { "epoch": 0.46382524431304917, "grad_norm": 1.971332857146003, "learning_rate": 1.1649674388285883e-05, "loss": 0.7905, "step": 2824 }, { "epoch": 0.4639894883797323, "grad_norm": 0.5980744633239086, "learning_rate": 1.1644426822744575e-05, "loss": 0.3106, "step": 2825 }, { "epoch": 0.4641537324464154, "grad_norm": 1.809531065759343, "learning_rate": 1.1639178791751715e-05, "loss": 0.8295, "step": 2826 }, { "epoch": 0.4643179765130985, "grad_norm": 1.5823034271235366, "learning_rate": 1.1633930296792744e-05, "loss": 0.75, "step": 2827 }, { "epoch": 0.46448222057978156, "grad_norm": 2.3204244771403886, "learning_rate": 1.1628681339353244e-05, "loss": 0.757, "step": 2828 }, { "epoch": 0.46464646464646464, "grad_norm": 2.6354260339093973, "learning_rate": 1.1623431920918916e-05, "loss": 0.8439, "step": 2829 }, { "epoch": 0.4648107087131477, "grad_norm": 1.594802494767825, "learning_rate": 1.1618182042975596e-05, "loss": 0.8306, "step": 2830 }, { "epoch": 0.4649749527798308, "grad_norm": 1.892603381313633, "learning_rate": 1.1612931707009253e-05, "loss": 0.7484, "step": 2831 }, { "epoch": 0.46513919684651395, "grad_norm": 1.667718775368817, "learning_rate": 1.1607680914505985e-05, "loss": 0.8156, "step": 2832 }, { "epoch": 0.46530344091319703, "grad_norm": 0.6249235828296072, "learning_rate": 1.1602429666952015e-05, "loss": 0.323, "step": 2833 }, { "epoch": 0.4654676849798801, "grad_norm": 1.910404821710437, "learning_rate": 1.15971779658337e-05, "loss": 0.8475, "step": 2834 }, { "epoch": 0.4656319290465632, "grad_norm": 1.4483063210618283, "learning_rate": 1.1591925812637523e-05, "loss": 0.8391, "step": 2835 }, { "epoch": 0.4657961731132463, "grad_norm": 0.6757630051170556, "learning_rate": 1.1586673208850091e-05, "loss": 0.3412, "step": 2836 }, { "epoch": 0.46596041717992936, "grad_norm": 1.9697658030408651, "learning_rate": 1.158142015595815e-05, "loss": 0.7349, "step": 2837 }, { "epoch": 0.46612466124661245, "grad_norm": 1.7862513168887564, "learning_rate": 1.1576166655448558e-05, "loss": 0.8141, "step": 2838 }, { "epoch": 0.46628890531329553, "grad_norm": 2.0152382249191625, "learning_rate": 1.1570912708808311e-05, "loss": 0.7478, "step": 2839 }, { "epoch": 0.46645314937997867, "grad_norm": 1.7883766083980461, "learning_rate": 1.1565658317524526e-05, "loss": 0.8412, "step": 2840 }, { "epoch": 0.46661739344666175, "grad_norm": 2.539753017822357, "learning_rate": 1.1560403483084449e-05, "loss": 0.7614, "step": 2841 }, { "epoch": 0.46678163751334484, "grad_norm": 1.6058269993706986, "learning_rate": 1.1555148206975449e-05, "loss": 0.7746, "step": 2842 }, { "epoch": 0.4669458815800279, "grad_norm": 1.6194568769367041, "learning_rate": 1.1549892490685018e-05, "loss": 0.8708, "step": 2843 }, { "epoch": 0.467110125646711, "grad_norm": 2.121430655042268, "learning_rate": 1.1544636335700778e-05, "loss": 0.7278, "step": 2844 }, { "epoch": 0.4672743697133941, "grad_norm": 1.4727507457501132, "learning_rate": 1.1539379743510475e-05, "loss": 0.8106, "step": 2845 }, { "epoch": 0.4674386137800772, "grad_norm": 1.6681805818997872, "learning_rate": 1.1534122715601974e-05, "loss": 0.8324, "step": 2846 }, { "epoch": 0.4676028578467603, "grad_norm": 1.9715127174395086, "learning_rate": 1.1528865253463266e-05, "loss": 0.721, "step": 2847 }, { "epoch": 0.4677671019134434, "grad_norm": 2.4040371865380403, "learning_rate": 1.1523607358582462e-05, "loss": 0.7914, "step": 2848 }, { "epoch": 0.4679313459801265, "grad_norm": 1.7781323184823117, "learning_rate": 1.1518349032447806e-05, "loss": 0.7826, "step": 2849 }, { "epoch": 0.46809559004680956, "grad_norm": 1.6406935102789388, "learning_rate": 1.1513090276547647e-05, "loss": 0.7644, "step": 2850 }, { "epoch": 0.46825983411349265, "grad_norm": 1.4476716547002049, "learning_rate": 1.150783109237047e-05, "loss": 0.7849, "step": 2851 }, { "epoch": 0.46842407818017573, "grad_norm": 1.7471474067054822, "learning_rate": 1.1502571481404873e-05, "loss": 0.711, "step": 2852 }, { "epoch": 0.4685883222468588, "grad_norm": 1.9643930730009418, "learning_rate": 1.149731144513958e-05, "loss": 0.7214, "step": 2853 }, { "epoch": 0.4687525663135419, "grad_norm": 2.056454473433955, "learning_rate": 1.1492050985063432e-05, "loss": 0.8125, "step": 2854 }, { "epoch": 0.46891681038022504, "grad_norm": 1.634196765566033, "learning_rate": 1.1486790102665393e-05, "loss": 0.8414, "step": 2855 }, { "epoch": 0.4690810544469081, "grad_norm": 1.647807449011738, "learning_rate": 1.148152879943454e-05, "loss": 0.77, "step": 2856 }, { "epoch": 0.4692452985135912, "grad_norm": 1.6241597701955084, "learning_rate": 1.147626707686008e-05, "loss": 0.8019, "step": 2857 }, { "epoch": 0.4694095425802743, "grad_norm": 1.5470996435366815, "learning_rate": 1.1471004936431327e-05, "loss": 0.8208, "step": 2858 }, { "epoch": 0.46957378664695737, "grad_norm": 1.8636161986300395, "learning_rate": 1.1465742379637725e-05, "loss": 0.757, "step": 2859 }, { "epoch": 0.46973803071364045, "grad_norm": 15.583256369738459, "learning_rate": 1.1460479407968827e-05, "loss": 0.8522, "step": 2860 }, { "epoch": 0.46990227478032354, "grad_norm": 2.0346063917077, "learning_rate": 1.1455216022914302e-05, "loss": 0.7891, "step": 2861 }, { "epoch": 0.4700665188470067, "grad_norm": 1.8705129554482818, "learning_rate": 1.1449952225963946e-05, "loss": 0.7595, "step": 2862 }, { "epoch": 0.47023076291368976, "grad_norm": 1.9353619137781235, "learning_rate": 1.144468801860766e-05, "loss": 0.7398, "step": 2863 }, { "epoch": 0.47039500698037284, "grad_norm": 0.708880327919787, "learning_rate": 1.143942340233547e-05, "loss": 0.3479, "step": 2864 }, { "epoch": 0.4705592510470559, "grad_norm": 2.258755325065763, "learning_rate": 1.1434158378637514e-05, "loss": 0.7649, "step": 2865 }, { "epoch": 0.470723495113739, "grad_norm": 1.8576172688302162, "learning_rate": 1.1428892949004049e-05, "loss": 0.7489, "step": 2866 }, { "epoch": 0.4708877391804221, "grad_norm": 1.3192299427782106, "learning_rate": 1.1423627114925434e-05, "loss": 0.8325, "step": 2867 }, { "epoch": 0.4710519832471052, "grad_norm": 1.8865660680385612, "learning_rate": 1.1418360877892165e-05, "loss": 0.7676, "step": 2868 }, { "epoch": 0.4712162273137883, "grad_norm": 1.5517994447928225, "learning_rate": 1.1413094239394833e-05, "loss": 0.7438, "step": 2869 }, { "epoch": 0.4713804713804714, "grad_norm": 1.6833195383740487, "learning_rate": 1.140782720092415e-05, "loss": 0.8328, "step": 2870 }, { "epoch": 0.4715447154471545, "grad_norm": 1.9611035660703806, "learning_rate": 1.1402559763970943e-05, "loss": 0.7674, "step": 2871 }, { "epoch": 0.47170895951383757, "grad_norm": 1.9582246545890039, "learning_rate": 1.139729193002614e-05, "loss": 0.8408, "step": 2872 }, { "epoch": 0.47187320358052065, "grad_norm": 1.723857522913021, "learning_rate": 1.1392023700580796e-05, "loss": 0.7662, "step": 2873 }, { "epoch": 0.47203744764720373, "grad_norm": 1.4555674829871559, "learning_rate": 1.1386755077126073e-05, "loss": 0.7506, "step": 2874 }, { "epoch": 0.4722016917138868, "grad_norm": 1.7506895622290903, "learning_rate": 1.1381486061153244e-05, "loss": 0.7743, "step": 2875 }, { "epoch": 0.4723659357805699, "grad_norm": 1.6758789368174185, "learning_rate": 1.1376216654153689e-05, "loss": 0.8048, "step": 2876 }, { "epoch": 0.47253017984725304, "grad_norm": 1.4522152154079224, "learning_rate": 1.1370946857618908e-05, "loss": 0.8071, "step": 2877 }, { "epoch": 0.4726944239139361, "grad_norm": 1.7237281117058747, "learning_rate": 1.1365676673040502e-05, "loss": 0.823, "step": 2878 }, { "epoch": 0.4728586679806192, "grad_norm": 0.6498441448834055, "learning_rate": 1.1360406101910187e-05, "loss": 0.3525, "step": 2879 }, { "epoch": 0.4730229120473023, "grad_norm": 2.13473946013518, "learning_rate": 1.1355135145719784e-05, "loss": 0.8227, "step": 2880 }, { "epoch": 0.4731871561139854, "grad_norm": 1.4888375624186354, "learning_rate": 1.1349863805961233e-05, "loss": 0.7003, "step": 2881 }, { "epoch": 0.47335140018066846, "grad_norm": 1.8940658810245627, "learning_rate": 1.1344592084126573e-05, "loss": 0.7726, "step": 2882 }, { "epoch": 0.47351564424735154, "grad_norm": 1.501550549825837, "learning_rate": 1.133931998170795e-05, "loss": 0.7895, "step": 2883 }, { "epoch": 0.4736798883140347, "grad_norm": 1.3540571292030898, "learning_rate": 1.1334047500197625e-05, "loss": 0.8233, "step": 2884 }, { "epoch": 0.47384413238071776, "grad_norm": 1.3910960281882836, "learning_rate": 1.1328774641087958e-05, "loss": 0.7468, "step": 2885 }, { "epoch": 0.47400837644740085, "grad_norm": 1.9896820707674878, "learning_rate": 1.132350140587143e-05, "loss": 0.717, "step": 2886 }, { "epoch": 0.47417262051408393, "grad_norm": 4.564358377086722, "learning_rate": 1.1318227796040608e-05, "loss": 0.817, "step": 2887 }, { "epoch": 0.474336864580767, "grad_norm": 1.9448698311773438, "learning_rate": 1.1312953813088183e-05, "loss": 0.7055, "step": 2888 }, { "epoch": 0.4745011086474501, "grad_norm": 1.5675532148879705, "learning_rate": 1.1307679458506947e-05, "loss": 0.8645, "step": 2889 }, { "epoch": 0.4746653527141332, "grad_norm": 1.8645892948632499, "learning_rate": 1.1302404733789787e-05, "loss": 0.755, "step": 2890 }, { "epoch": 0.47482959678081627, "grad_norm": 2.4376477969179247, "learning_rate": 1.1297129640429707e-05, "loss": 0.7695, "step": 2891 }, { "epoch": 0.4749938408474994, "grad_norm": 1.5005467447027119, "learning_rate": 1.1291854179919812e-05, "loss": 0.7469, "step": 2892 }, { "epoch": 0.4751580849141825, "grad_norm": 1.5629715389084775, "learning_rate": 1.1286578353753313e-05, "loss": 0.7136, "step": 2893 }, { "epoch": 0.47532232898086557, "grad_norm": 2.4460316891227047, "learning_rate": 1.1281302163423515e-05, "loss": 0.8547, "step": 2894 }, { "epoch": 0.47548657304754866, "grad_norm": 1.756199507597582, "learning_rate": 1.1276025610423835e-05, "loss": 0.7618, "step": 2895 }, { "epoch": 0.47565081711423174, "grad_norm": 1.432543346982397, "learning_rate": 1.1270748696247791e-05, "loss": 0.8176, "step": 2896 }, { "epoch": 0.4758150611809148, "grad_norm": 1.8021860367825702, "learning_rate": 1.1265471422389003e-05, "loss": 0.8173, "step": 2897 }, { "epoch": 0.4759793052475979, "grad_norm": 1.6149165410289716, "learning_rate": 1.1260193790341186e-05, "loss": 0.7802, "step": 2898 }, { "epoch": 0.47614354931428104, "grad_norm": 1.935105077405733, "learning_rate": 1.1254915801598173e-05, "loss": 0.8575, "step": 2899 }, { "epoch": 0.47630779338096413, "grad_norm": 1.675284679469141, "learning_rate": 1.1249637457653881e-05, "loss": 0.8466, "step": 2900 }, { "epoch": 0.4764720374476472, "grad_norm": 1.3994677391968948, "learning_rate": 1.1244358760002337e-05, "loss": 0.7973, "step": 2901 }, { "epoch": 0.4766362815143303, "grad_norm": 1.6419967207284256, "learning_rate": 1.1239079710137659e-05, "loss": 0.7861, "step": 2902 }, { "epoch": 0.4768005255810134, "grad_norm": 0.6274312931582016, "learning_rate": 1.1233800309554083e-05, "loss": 0.3458, "step": 2903 }, { "epoch": 0.47696476964769646, "grad_norm": 1.6674229441604065, "learning_rate": 1.1228520559745922e-05, "loss": 0.7988, "step": 2904 }, { "epoch": 0.47712901371437955, "grad_norm": 1.7841581260030577, "learning_rate": 1.1223240462207601e-05, "loss": 0.7344, "step": 2905 }, { "epoch": 0.4772932577810627, "grad_norm": 1.7842946314141728, "learning_rate": 1.121796001843364e-05, "loss": 0.8423, "step": 2906 }, { "epoch": 0.47745750184774577, "grad_norm": 2.4162665571131865, "learning_rate": 1.1212679229918657e-05, "loss": 0.7974, "step": 2907 }, { "epoch": 0.47762174591442885, "grad_norm": 1.938170209863193, "learning_rate": 1.1207398098157371e-05, "loss": 0.6945, "step": 2908 }, { "epoch": 0.47778598998111194, "grad_norm": 1.6096799273305635, "learning_rate": 1.1202116624644594e-05, "loss": 0.8219, "step": 2909 }, { "epoch": 0.477950234047795, "grad_norm": 1.7634963486284976, "learning_rate": 1.1196834810875234e-05, "loss": 0.7549, "step": 2910 }, { "epoch": 0.4781144781144781, "grad_norm": 1.3698474602791029, "learning_rate": 1.11915526583443e-05, "loss": 0.747, "step": 2911 }, { "epoch": 0.4782787221811612, "grad_norm": 2.736295656053886, "learning_rate": 1.1186270168546891e-05, "loss": 0.8506, "step": 2912 }, { "epoch": 0.47844296624784427, "grad_norm": 1.402818224840677, "learning_rate": 1.1180987342978209e-05, "loss": 0.7918, "step": 2913 }, { "epoch": 0.4786072103145274, "grad_norm": 1.8147388021926167, "learning_rate": 1.1175704183133542e-05, "loss": 0.812, "step": 2914 }, { "epoch": 0.4787714543812105, "grad_norm": 1.9853670985172076, "learning_rate": 1.1170420690508281e-05, "loss": 0.75, "step": 2915 }, { "epoch": 0.4789356984478936, "grad_norm": 1.7836772855398655, "learning_rate": 1.1165136866597905e-05, "loss": 0.631, "step": 2916 }, { "epoch": 0.47909994251457666, "grad_norm": 0.6626698847701831, "learning_rate": 1.1159852712897989e-05, "loss": 0.3664, "step": 2917 }, { "epoch": 0.47926418658125974, "grad_norm": 1.7346874677070416, "learning_rate": 1.1154568230904204e-05, "loss": 0.6992, "step": 2918 }, { "epoch": 0.4794284306479428, "grad_norm": 1.7973851825959142, "learning_rate": 1.1149283422112312e-05, "loss": 0.778, "step": 2919 }, { "epoch": 0.4795926747146259, "grad_norm": 0.6408704838404428, "learning_rate": 1.1143998288018163e-05, "loss": 0.3448, "step": 2920 }, { "epoch": 0.47975691878130905, "grad_norm": 2.938841346776101, "learning_rate": 1.1138712830117706e-05, "loss": 0.805, "step": 2921 }, { "epoch": 0.47992116284799213, "grad_norm": 2.447505876994721, "learning_rate": 1.1133427049906978e-05, "loss": 0.808, "step": 2922 }, { "epoch": 0.4800854069146752, "grad_norm": 1.992262646426045, "learning_rate": 1.1128140948882107e-05, "loss": 0.8146, "step": 2923 }, { "epoch": 0.4802496509813583, "grad_norm": 1.94182433911372, "learning_rate": 1.1122854528539315e-05, "loss": 0.8718, "step": 2924 }, { "epoch": 0.4804138950480414, "grad_norm": 2.677642665403648, "learning_rate": 1.111756779037491e-05, "loss": 0.7428, "step": 2925 }, { "epoch": 0.48057813911472447, "grad_norm": 1.3360919198797618, "learning_rate": 1.1112280735885295e-05, "loss": 0.7723, "step": 2926 }, { "epoch": 0.48074238318140755, "grad_norm": 1.6569976310098036, "learning_rate": 1.1106993366566957e-05, "loss": 0.8129, "step": 2927 }, { "epoch": 0.4809066272480907, "grad_norm": 1.9219771391371783, "learning_rate": 1.1101705683916473e-05, "loss": 0.7306, "step": 2928 }, { "epoch": 0.4810708713147738, "grad_norm": 1.5726058732367358, "learning_rate": 1.1096417689430517e-05, "loss": 0.7885, "step": 2929 }, { "epoch": 0.48123511538145686, "grad_norm": 1.9894256446466991, "learning_rate": 1.1091129384605837e-05, "loss": 0.7667, "step": 2930 }, { "epoch": 0.48139935944813994, "grad_norm": 1.740980260194128, "learning_rate": 1.1085840770939283e-05, "loss": 0.7313, "step": 2931 }, { "epoch": 0.481563603514823, "grad_norm": 1.5194114206315978, "learning_rate": 1.108055184992778e-05, "loss": 0.7871, "step": 2932 }, { "epoch": 0.4817278475815061, "grad_norm": 1.3204827242607124, "learning_rate": 1.1075262623068352e-05, "loss": 0.7817, "step": 2933 }, { "epoch": 0.4818920916481892, "grad_norm": 1.46261293636058, "learning_rate": 1.10699730918581e-05, "loss": 0.8259, "step": 2934 }, { "epoch": 0.4820563357148723, "grad_norm": 2.4657736439442672, "learning_rate": 1.1064683257794216e-05, "loss": 0.8049, "step": 2935 }, { "epoch": 0.4822205797815554, "grad_norm": 1.6982834346708862, "learning_rate": 1.1059393122373976e-05, "loss": 0.7852, "step": 2936 }, { "epoch": 0.4823848238482385, "grad_norm": 2.530509211221007, "learning_rate": 1.1054102687094738e-05, "loss": 0.7754, "step": 2937 }, { "epoch": 0.4825490679149216, "grad_norm": 1.5091665714327174, "learning_rate": 1.1048811953453955e-05, "loss": 0.8, "step": 2938 }, { "epoch": 0.48271331198160466, "grad_norm": 0.7400188042471532, "learning_rate": 1.1043520922949156e-05, "loss": 0.3234, "step": 2939 }, { "epoch": 0.48287755604828775, "grad_norm": 2.01268035851558, "learning_rate": 1.1038229597077954e-05, "loss": 0.8063, "step": 2940 }, { "epoch": 0.48304180011497083, "grad_norm": 1.792768031477109, "learning_rate": 1.1032937977338048e-05, "loss": 0.8097, "step": 2941 }, { "epoch": 0.4832060441816539, "grad_norm": 1.5386602275558376, "learning_rate": 1.1027646065227222e-05, "loss": 0.8764, "step": 2942 }, { "epoch": 0.48337028824833705, "grad_norm": 1.7522518433809597, "learning_rate": 1.1022353862243338e-05, "loss": 0.7862, "step": 2943 }, { "epoch": 0.48353453231502014, "grad_norm": 1.9859807984547673, "learning_rate": 1.1017061369884345e-05, "loss": 0.8054, "step": 2944 }, { "epoch": 0.4836987763817032, "grad_norm": 1.7030071003538274, "learning_rate": 1.101176858964827e-05, "loss": 0.797, "step": 2945 }, { "epoch": 0.4838630204483863, "grad_norm": 1.3712306896216002, "learning_rate": 1.1006475523033225e-05, "loss": 0.7956, "step": 2946 }, { "epoch": 0.4840272645150694, "grad_norm": 0.6450202139005978, "learning_rate": 1.10011821715374e-05, "loss": 0.3397, "step": 2947 }, { "epoch": 0.48419150858175247, "grad_norm": 1.7045192984505926, "learning_rate": 1.0995888536659067e-05, "loss": 0.7593, "step": 2948 }, { "epoch": 0.48435575264843556, "grad_norm": 1.5619605648047385, "learning_rate": 1.0990594619896581e-05, "loss": 0.8161, "step": 2949 }, { "epoch": 0.48451999671511864, "grad_norm": 1.4093092278823638, "learning_rate": 1.098530042274837e-05, "loss": 0.7718, "step": 2950 }, { "epoch": 0.4846842407818018, "grad_norm": 1.5147123569987813, "learning_rate": 1.0980005946712949e-05, "loss": 0.7313, "step": 2951 }, { "epoch": 0.48484848484848486, "grad_norm": 1.6001487873550342, "learning_rate": 1.0974711193288906e-05, "loss": 0.823, "step": 2952 }, { "epoch": 0.48501272891516795, "grad_norm": 1.5879078304945249, "learning_rate": 1.096941616397491e-05, "loss": 0.8599, "step": 2953 }, { "epoch": 0.48517697298185103, "grad_norm": 1.6208307992286888, "learning_rate": 1.0964120860269708e-05, "loss": 0.8174, "step": 2954 }, { "epoch": 0.4853412170485341, "grad_norm": 1.5896648276007095, "learning_rate": 1.0958825283672126e-05, "loss": 0.816, "step": 2955 }, { "epoch": 0.4855054611152172, "grad_norm": 6.05970018410631, "learning_rate": 1.0953529435681063e-05, "loss": 0.747, "step": 2956 }, { "epoch": 0.4856697051819003, "grad_norm": 1.9865780174201684, "learning_rate": 1.09482333177955e-05, "loss": 0.7816, "step": 2957 }, { "epoch": 0.4858339492485834, "grad_norm": 1.959930974090589, "learning_rate": 1.0942936931514492e-05, "loss": 0.7622, "step": 2958 }, { "epoch": 0.4859981933152665, "grad_norm": 3.392974803069282, "learning_rate": 1.0937640278337167e-05, "loss": 0.6748, "step": 2959 }, { "epoch": 0.4861624373819496, "grad_norm": 1.3963635612507488, "learning_rate": 1.0932343359762736e-05, "loss": 0.7322, "step": 2960 }, { "epoch": 0.48632668144863267, "grad_norm": 7.760903173405188, "learning_rate": 1.0927046177290477e-05, "loss": 0.8072, "step": 2961 }, { "epoch": 0.48649092551531575, "grad_norm": 2.0241809226338696, "learning_rate": 1.092174873241975e-05, "loss": 0.7617, "step": 2962 }, { "epoch": 0.48665516958199884, "grad_norm": 1.478249991275486, "learning_rate": 1.0916451026649981e-05, "loss": 0.8027, "step": 2963 }, { "epoch": 0.4868194136486819, "grad_norm": 2.4581029655827726, "learning_rate": 1.091115306148068e-05, "loss": 0.8081, "step": 2964 }, { "epoch": 0.48698365771536506, "grad_norm": 1.6380778987355522, "learning_rate": 1.0905854838411418e-05, "loss": 0.7693, "step": 2965 }, { "epoch": 0.48714790178204814, "grad_norm": 2.2205923692363005, "learning_rate": 1.0900556358941855e-05, "loss": 0.776, "step": 2966 }, { "epoch": 0.4873121458487312, "grad_norm": 1.605001738891385, "learning_rate": 1.0895257624571705e-05, "loss": 0.7709, "step": 2967 }, { "epoch": 0.4874763899154143, "grad_norm": 1.5461558307671164, "learning_rate": 1.088995863680077e-05, "loss": 0.7859, "step": 2968 }, { "epoch": 0.4876406339820974, "grad_norm": 1.7293002806225168, "learning_rate": 1.0884659397128911e-05, "loss": 0.7726, "step": 2969 }, { "epoch": 0.4878048780487805, "grad_norm": 2.4554093783267303, "learning_rate": 1.0879359907056074e-05, "loss": 0.8253, "step": 2970 }, { "epoch": 0.48796912211546356, "grad_norm": 1.4072438841045025, "learning_rate": 1.0874060168082266e-05, "loss": 0.8272, "step": 2971 }, { "epoch": 0.48813336618214664, "grad_norm": 1.8701524512026073, "learning_rate": 1.0868760181707565e-05, "loss": 0.8371, "step": 2972 }, { "epoch": 0.4882976102488298, "grad_norm": 2.984665700092367, "learning_rate": 1.0863459949432122e-05, "loss": 0.7983, "step": 2973 }, { "epoch": 0.48846185431551287, "grad_norm": 1.634327876872802, "learning_rate": 1.0858159472756157e-05, "loss": 0.7668, "step": 2974 }, { "epoch": 0.48862609838219595, "grad_norm": 1.508099229603791, "learning_rate": 1.085285875317996e-05, "loss": 0.7899, "step": 2975 }, { "epoch": 0.48879034244887903, "grad_norm": 1.4002712065948355, "learning_rate": 1.0847557792203886e-05, "loss": 0.7169, "step": 2976 }, { "epoch": 0.4889545865155621, "grad_norm": 1.566640220733681, "learning_rate": 1.0842256591328362e-05, "loss": 0.7828, "step": 2977 }, { "epoch": 0.4891188305822452, "grad_norm": 1.567458387382535, "learning_rate": 1.0836955152053883e-05, "loss": 0.7384, "step": 2978 }, { "epoch": 0.4892830746489283, "grad_norm": 1.4624653358099744, "learning_rate": 1.083165347588101e-05, "loss": 0.7208, "step": 2979 }, { "epoch": 0.4894473187156114, "grad_norm": 1.8615586585865302, "learning_rate": 1.082635156431037e-05, "loss": 0.806, "step": 2980 }, { "epoch": 0.4896115627822945, "grad_norm": 1.9862738261524968, "learning_rate": 1.0821049418842654e-05, "loss": 0.7492, "step": 2981 }, { "epoch": 0.4897758068489776, "grad_norm": 1.4171645544414908, "learning_rate": 1.0815747040978628e-05, "loss": 0.7702, "step": 2982 }, { "epoch": 0.4899400509156607, "grad_norm": 1.9658639493614518, "learning_rate": 1.081044443221912e-05, "loss": 0.7059, "step": 2983 }, { "epoch": 0.49010429498234376, "grad_norm": 2.037758334852222, "learning_rate": 1.0805141594065022e-05, "loss": 0.724, "step": 2984 }, { "epoch": 0.49026853904902684, "grad_norm": 1.59234418391555, "learning_rate": 1.0799838528017288e-05, "loss": 0.8172, "step": 2985 }, { "epoch": 0.4904327831157099, "grad_norm": 1.3627527524284926, "learning_rate": 1.0794535235576941e-05, "loss": 0.7583, "step": 2986 }, { "epoch": 0.490597027182393, "grad_norm": 1.5995776847733085, "learning_rate": 1.0789231718245069e-05, "loss": 0.8401, "step": 2987 }, { "epoch": 0.49076127124907615, "grad_norm": 2.0803922023820225, "learning_rate": 1.0783927977522819e-05, "loss": 0.7272, "step": 2988 }, { "epoch": 0.49092551531575923, "grad_norm": 1.264327286018862, "learning_rate": 1.0778624014911403e-05, "loss": 0.7461, "step": 2989 }, { "epoch": 0.4910897593824423, "grad_norm": 1.5683543400135407, "learning_rate": 1.0773319831912099e-05, "loss": 0.8213, "step": 2990 }, { "epoch": 0.4912540034491254, "grad_norm": 1.8603682541291755, "learning_rate": 1.0768015430026244e-05, "loss": 0.8437, "step": 2991 }, { "epoch": 0.4914182475158085, "grad_norm": 1.4934441120256332, "learning_rate": 1.0762710810755234e-05, "loss": 0.756, "step": 2992 }, { "epoch": 0.49158249158249157, "grad_norm": 1.7065636745465071, "learning_rate": 1.0757405975600534e-05, "loss": 0.7174, "step": 2993 }, { "epoch": 0.49174673564917465, "grad_norm": 2.031133786345437, "learning_rate": 1.0752100926063669e-05, "loss": 0.8651, "step": 2994 }, { "epoch": 0.4919109797158578, "grad_norm": 1.768001252051337, "learning_rate": 1.074679566364622e-05, "loss": 0.8329, "step": 2995 }, { "epoch": 0.49207522378254087, "grad_norm": 1.7072161049686296, "learning_rate": 1.0741490189849826e-05, "loss": 0.7662, "step": 2996 }, { "epoch": 0.49223946784922396, "grad_norm": 1.4811372461891494, "learning_rate": 1.0736184506176195e-05, "loss": 0.7855, "step": 2997 }, { "epoch": 0.49240371191590704, "grad_norm": 1.665231448237911, "learning_rate": 1.0730878614127087e-05, "loss": 0.8582, "step": 2998 }, { "epoch": 0.4925679559825901, "grad_norm": 1.3876124064355433, "learning_rate": 1.0725572515204327e-05, "loss": 0.7692, "step": 2999 }, { "epoch": 0.4927322000492732, "grad_norm": 1.6383380571392547, "learning_rate": 1.0720266210909793e-05, "loss": 0.8297, "step": 3000 }, { "epoch": 0.4928964441159563, "grad_norm": 1.5436495802966257, "learning_rate": 1.0714959702745424e-05, "loss": 0.7588, "step": 3001 }, { "epoch": 0.49306068818263943, "grad_norm": 1.9907884372228548, "learning_rate": 1.0709652992213216e-05, "loss": 0.7594, "step": 3002 }, { "epoch": 0.4932249322493225, "grad_norm": 1.5270756925278355, "learning_rate": 1.0704346080815218e-05, "loss": 0.8226, "step": 3003 }, { "epoch": 0.4933891763160056, "grad_norm": 1.8106899432777397, "learning_rate": 1.0699038970053544e-05, "loss": 0.8016, "step": 3004 }, { "epoch": 0.4935534203826887, "grad_norm": 1.6955086953817247, "learning_rate": 1.069373166143036e-05, "loss": 0.7718, "step": 3005 }, { "epoch": 0.49371766444937176, "grad_norm": 1.5120369158223759, "learning_rate": 1.068842415644789e-05, "loss": 0.7319, "step": 3006 }, { "epoch": 0.49388190851605485, "grad_norm": 1.5388321468474213, "learning_rate": 1.0683116456608411e-05, "loss": 0.7385, "step": 3007 }, { "epoch": 0.49404615258273793, "grad_norm": 1.9264509007297228, "learning_rate": 1.0677808563414256e-05, "loss": 0.7382, "step": 3008 }, { "epoch": 0.494210396649421, "grad_norm": 2.0264535853186683, "learning_rate": 1.0672500478367813e-05, "loss": 0.7327, "step": 3009 }, { "epoch": 0.49437464071610415, "grad_norm": 1.5618571961909058, "learning_rate": 1.0667192202971525e-05, "loss": 0.8039, "step": 3010 }, { "epoch": 0.49453888478278724, "grad_norm": 1.6195137965987898, "learning_rate": 1.0661883738727888e-05, "loss": 0.7803, "step": 3011 }, { "epoch": 0.4947031288494703, "grad_norm": 1.467094978238244, "learning_rate": 1.0656575087139452e-05, "loss": 0.7479, "step": 3012 }, { "epoch": 0.4948673729161534, "grad_norm": 1.6491076668031788, "learning_rate": 1.0651266249708816e-05, "loss": 0.7992, "step": 3013 }, { "epoch": 0.4950316169828365, "grad_norm": 1.5381976643024264, "learning_rate": 1.064595722793864e-05, "loss": 0.7528, "step": 3014 }, { "epoch": 0.49519586104951957, "grad_norm": 1.4747955676206195, "learning_rate": 1.0640648023331625e-05, "loss": 0.7687, "step": 3015 }, { "epoch": 0.49536010511620265, "grad_norm": 1.6732130241886418, "learning_rate": 1.063533863739054e-05, "loss": 0.7509, "step": 3016 }, { "epoch": 0.4955243491828858, "grad_norm": 1.4979056753760986, "learning_rate": 1.0630029071618188e-05, "loss": 0.7314, "step": 3017 }, { "epoch": 0.4956885932495689, "grad_norm": 1.5506824432638877, "learning_rate": 1.0624719327517434e-05, "loss": 0.8254, "step": 3018 }, { "epoch": 0.49585283731625196, "grad_norm": 1.6267567953828879, "learning_rate": 1.061940940659119e-05, "loss": 0.7752, "step": 3019 }, { "epoch": 0.49601708138293504, "grad_norm": 1.4670475618747474, "learning_rate": 1.0614099310342414e-05, "loss": 0.7262, "step": 3020 }, { "epoch": 0.4961813254496181, "grad_norm": 0.6593238302496596, "learning_rate": 1.0608789040274122e-05, "loss": 0.3497, "step": 3021 }, { "epoch": 0.4963455695163012, "grad_norm": 1.654841671571881, "learning_rate": 1.0603478597889374e-05, "loss": 0.7193, "step": 3022 }, { "epoch": 0.4965098135829843, "grad_norm": 1.6638737217409445, "learning_rate": 1.0598167984691276e-05, "loss": 0.7968, "step": 3023 }, { "epoch": 0.49667405764966743, "grad_norm": 1.6616461508818265, "learning_rate": 1.059285720218299e-05, "loss": 0.708, "step": 3024 }, { "epoch": 0.4968383017163505, "grad_norm": 1.5833749397162, "learning_rate": 1.058754625186772e-05, "loss": 0.8115, "step": 3025 }, { "epoch": 0.4970025457830336, "grad_norm": 1.6798150022194867, "learning_rate": 1.0582235135248718e-05, "loss": 0.7077, "step": 3026 }, { "epoch": 0.4971667898497167, "grad_norm": 1.532928679053463, "learning_rate": 1.0576923853829284e-05, "loss": 0.7795, "step": 3027 }, { "epoch": 0.49733103391639977, "grad_norm": 1.6845380118912567, "learning_rate": 1.057161240911277e-05, "loss": 0.6936, "step": 3028 }, { "epoch": 0.49749527798308285, "grad_norm": 2.162535067510587, "learning_rate": 1.0566300802602565e-05, "loss": 0.7128, "step": 3029 }, { "epoch": 0.49765952204976593, "grad_norm": 2.3130691570807578, "learning_rate": 1.056098903580211e-05, "loss": 0.68, "step": 3030 }, { "epoch": 0.497823766116449, "grad_norm": 1.620051535699293, "learning_rate": 1.0555677110214889e-05, "loss": 0.7864, "step": 3031 }, { "epoch": 0.49798801018313216, "grad_norm": 1.5194833028582286, "learning_rate": 1.0550365027344432e-05, "loss": 0.7028, "step": 3032 }, { "epoch": 0.49815225424981524, "grad_norm": 1.939902615189187, "learning_rate": 1.0545052788694312e-05, "loss": 0.8576, "step": 3033 }, { "epoch": 0.4983164983164983, "grad_norm": 1.797355228614984, "learning_rate": 1.0539740395768143e-05, "loss": 0.7155, "step": 3034 }, { "epoch": 0.4984807423831814, "grad_norm": 1.7342081360777428, "learning_rate": 1.0534427850069595e-05, "loss": 0.7441, "step": 3035 }, { "epoch": 0.4986449864498645, "grad_norm": 1.6344279627447924, "learning_rate": 1.0529115153102366e-05, "loss": 0.7488, "step": 3036 }, { "epoch": 0.4988092305165476, "grad_norm": 3.0019745314479938, "learning_rate": 1.0523802306370206e-05, "loss": 0.8065, "step": 3037 }, { "epoch": 0.49897347458323066, "grad_norm": 1.574331353188551, "learning_rate": 1.0518489311376905e-05, "loss": 0.8437, "step": 3038 }, { "epoch": 0.4991377186499138, "grad_norm": 1.9394198489119026, "learning_rate": 1.0513176169626293e-05, "loss": 0.8096, "step": 3039 }, { "epoch": 0.4993019627165969, "grad_norm": 1.8607771454828645, "learning_rate": 1.0507862882622249e-05, "loss": 0.7952, "step": 3040 }, { "epoch": 0.49946620678327996, "grad_norm": 1.7986975558229348, "learning_rate": 1.0502549451868683e-05, "loss": 0.8123, "step": 3041 }, { "epoch": 0.49963045084996305, "grad_norm": 1.8263238528289658, "learning_rate": 1.0497235878869554e-05, "loss": 0.7086, "step": 3042 }, { "epoch": 0.49979469491664613, "grad_norm": 1.852782366116979, "learning_rate": 1.0491922165128853e-05, "loss": 0.7588, "step": 3043 }, { "epoch": 0.4999589389833292, "grad_norm": 1.7818706199780385, "learning_rate": 1.048660831215062e-05, "loss": 0.7966, "step": 3044 }, { "epoch": 0.5001231830500124, "grad_norm": 1.7957407142374702, "learning_rate": 1.0481294321438928e-05, "loss": 0.7566, "step": 3045 }, { "epoch": 0.5002874271166954, "grad_norm": 1.453163269646183, "learning_rate": 1.0475980194497892e-05, "loss": 0.7859, "step": 3046 }, { "epoch": 0.5004516711833785, "grad_norm": 1.5093910379821867, "learning_rate": 1.0470665932831661e-05, "loss": 0.8, "step": 3047 }, { "epoch": 0.5006159152500615, "grad_norm": 1.5723652085080109, "learning_rate": 1.0465351537944429e-05, "loss": 0.7809, "step": 3048 }, { "epoch": 0.5007801593167447, "grad_norm": 1.5296851229958874, "learning_rate": 1.0460037011340422e-05, "loss": 0.7567, "step": 3049 }, { "epoch": 0.5009444033834278, "grad_norm": 1.5855398279108333, "learning_rate": 1.0454722354523906e-05, "loss": 0.8302, "step": 3050 }, { "epoch": 0.5011086474501109, "grad_norm": 4.439306331079265, "learning_rate": 1.0449407568999186e-05, "loss": 0.7379, "step": 3051 }, { "epoch": 0.501272891516794, "grad_norm": 1.6038466973777707, "learning_rate": 1.04440926562706e-05, "loss": 0.8125, "step": 3052 }, { "epoch": 0.501437135583477, "grad_norm": 1.5371508066182764, "learning_rate": 1.043877761784252e-05, "loss": 0.7722, "step": 3053 }, { "epoch": 0.5016013796501602, "grad_norm": 1.7441782988912697, "learning_rate": 1.0433462455219359e-05, "loss": 0.787, "step": 3054 }, { "epoch": 0.5017656237168432, "grad_norm": 2.328735190159924, "learning_rate": 1.0428147169905563e-05, "loss": 0.7089, "step": 3055 }, { "epoch": 0.5019298677835263, "grad_norm": 2.4745737993391934, "learning_rate": 1.0422831763405612e-05, "loss": 0.7615, "step": 3056 }, { "epoch": 0.5020941118502094, "grad_norm": 1.9786860823875634, "learning_rate": 1.0417516237224017e-05, "loss": 0.7672, "step": 3057 }, { "epoch": 0.5022583559168925, "grad_norm": 2.069665027338814, "learning_rate": 1.0412200592865331e-05, "loss": 0.737, "step": 3058 }, { "epoch": 0.5024225999835756, "grad_norm": 1.7468161606918287, "learning_rate": 1.0406884831834133e-05, "loss": 0.7505, "step": 3059 }, { "epoch": 0.5025868440502587, "grad_norm": 1.3241919253069714, "learning_rate": 1.0401568955635042e-05, "loss": 0.7711, "step": 3060 }, { "epoch": 0.5027510881169418, "grad_norm": 2.073986994749109, "learning_rate": 1.0396252965772702e-05, "loss": 0.7763, "step": 3061 }, { "epoch": 0.5029153321836248, "grad_norm": 1.9351659939515757, "learning_rate": 1.0390936863751791e-05, "loss": 0.8362, "step": 3062 }, { "epoch": 0.503079576250308, "grad_norm": 1.6413902040654116, "learning_rate": 1.0385620651077024e-05, "loss": 0.8009, "step": 3063 }, { "epoch": 0.503243820316991, "grad_norm": 9.712604864143332, "learning_rate": 1.0380304329253144e-05, "loss": 0.7771, "step": 3064 }, { "epoch": 0.5034080643836741, "grad_norm": 1.722752012238772, "learning_rate": 1.0374987899784925e-05, "loss": 0.7875, "step": 3065 }, { "epoch": 0.5035723084503573, "grad_norm": 1.15722509053775, "learning_rate": 1.036967136417717e-05, "loss": 0.7576, "step": 3066 }, { "epoch": 0.5037365525170403, "grad_norm": 1.6412364095545926, "learning_rate": 1.036435472393471e-05, "loss": 0.8123, "step": 3067 }, { "epoch": 0.5039007965837234, "grad_norm": 2.0069178817291893, "learning_rate": 1.0359037980562416e-05, "loss": 0.7548, "step": 3068 }, { "epoch": 0.5040650406504065, "grad_norm": 1.5958217246691753, "learning_rate": 1.0353721135565173e-05, "loss": 0.7337, "step": 3069 }, { "epoch": 0.5042292847170896, "grad_norm": 1.9499888930459133, "learning_rate": 1.034840419044791e-05, "loss": 0.8547, "step": 3070 }, { "epoch": 0.5043935287837726, "grad_norm": 0.6562010562262413, "learning_rate": 1.0343087146715573e-05, "loss": 0.3187, "step": 3071 }, { "epoch": 0.5045577728504558, "grad_norm": 1.8028413738758844, "learning_rate": 1.033777000587314e-05, "loss": 0.8237, "step": 3072 }, { "epoch": 0.5047220169171389, "grad_norm": 1.6122498937185141, "learning_rate": 1.0332452769425619e-05, "loss": 0.883, "step": 3073 }, { "epoch": 0.5048862609838219, "grad_norm": 2.5844741096582347, "learning_rate": 1.0327135438878035e-05, "loss": 0.7391, "step": 3074 }, { "epoch": 0.5050505050505051, "grad_norm": 1.8069123168543597, "learning_rate": 1.0321818015735459e-05, "loss": 0.7515, "step": 3075 }, { "epoch": 0.5052147491171881, "grad_norm": 2.0930903633486495, "learning_rate": 1.031650050150297e-05, "loss": 0.7712, "step": 3076 }, { "epoch": 0.5053789931838713, "grad_norm": 1.6800819600874217, "learning_rate": 1.0311182897685681e-05, "loss": 0.7683, "step": 3077 }, { "epoch": 0.5055432372505543, "grad_norm": 1.8394520772748302, "learning_rate": 1.0305865205788728e-05, "loss": 0.7988, "step": 3078 }, { "epoch": 0.5057074813172374, "grad_norm": 2.0337873569672844, "learning_rate": 1.0300547427317269e-05, "loss": 0.8498, "step": 3079 }, { "epoch": 0.5058717253839206, "grad_norm": 1.4991734712188063, "learning_rate": 1.0295229563776494e-05, "loss": 0.807, "step": 3080 }, { "epoch": 0.5060359694506036, "grad_norm": 1.569686948882687, "learning_rate": 1.0289911616671613e-05, "loss": 0.7907, "step": 3081 }, { "epoch": 0.5062002135172867, "grad_norm": 1.8278767792309847, "learning_rate": 1.0284593587507857e-05, "loss": 0.7989, "step": 3082 }, { "epoch": 0.5063644575839698, "grad_norm": 1.5088602956692527, "learning_rate": 1.0279275477790487e-05, "loss": 0.7507, "step": 3083 }, { "epoch": 0.5065287016506529, "grad_norm": 2.1469802182100097, "learning_rate": 1.0273957289024778e-05, "loss": 0.7884, "step": 3084 }, { "epoch": 0.5066929457173359, "grad_norm": 1.5047044615304008, "learning_rate": 1.0268639022716033e-05, "loss": 0.7666, "step": 3085 }, { "epoch": 0.5068571897840191, "grad_norm": 2.6788777218436843, "learning_rate": 1.0263320680369581e-05, "loss": 0.7762, "step": 3086 }, { "epoch": 0.5070214338507022, "grad_norm": 1.3545070451846908, "learning_rate": 1.0258002263490767e-05, "loss": 0.6766, "step": 3087 }, { "epoch": 0.5071856779173852, "grad_norm": 1.6485707734893615, "learning_rate": 1.0252683773584953e-05, "loss": 0.8312, "step": 3088 }, { "epoch": 0.5073499219840684, "grad_norm": 1.5096042816082467, "learning_rate": 1.0247365212157527e-05, "loss": 0.781, "step": 3089 }, { "epoch": 0.5075141660507514, "grad_norm": 1.612879127441632, "learning_rate": 1.02420465807139e-05, "loss": 0.7982, "step": 3090 }, { "epoch": 0.5076784101174345, "grad_norm": 1.8031750968468208, "learning_rate": 1.0236727880759496e-05, "loss": 0.7849, "step": 3091 }, { "epoch": 0.5078426541841176, "grad_norm": 1.771751508406735, "learning_rate": 1.0231409113799764e-05, "loss": 0.7843, "step": 3092 }, { "epoch": 0.5080068982508007, "grad_norm": 1.651196713134183, "learning_rate": 1.0226090281340168e-05, "loss": 0.7985, "step": 3093 }, { "epoch": 0.5081711423174837, "grad_norm": 1.4762345755394317, "learning_rate": 1.0220771384886194e-05, "loss": 0.7336, "step": 3094 }, { "epoch": 0.5083353863841669, "grad_norm": 1.5245777442644395, "learning_rate": 1.0215452425943346e-05, "loss": 0.7917, "step": 3095 }, { "epoch": 0.50849963045085, "grad_norm": 1.5802471739618502, "learning_rate": 1.021013340601714e-05, "loss": 0.768, "step": 3096 }, { "epoch": 0.508663874517533, "grad_norm": 1.9205015309693492, "learning_rate": 1.0204814326613115e-05, "loss": 0.7849, "step": 3097 }, { "epoch": 0.5088281185842162, "grad_norm": 1.7388370739256658, "learning_rate": 1.0199495189236828e-05, "loss": 0.7739, "step": 3098 }, { "epoch": 0.5089923626508992, "grad_norm": 2.323583892962526, "learning_rate": 1.0194175995393847e-05, "loss": 0.8247, "step": 3099 }, { "epoch": 0.5091566067175823, "grad_norm": 1.5887960456491284, "learning_rate": 1.0188856746589757e-05, "loss": 0.7945, "step": 3100 }, { "epoch": 0.5093208507842654, "grad_norm": 1.853551043175056, "learning_rate": 1.0183537444330165e-05, "loss": 0.7166, "step": 3101 }, { "epoch": 0.5094850948509485, "grad_norm": 1.7427949535875071, "learning_rate": 1.0178218090120683e-05, "loss": 0.7798, "step": 3102 }, { "epoch": 0.5096493389176316, "grad_norm": 0.6938114713965811, "learning_rate": 1.0172898685466947e-05, "loss": 0.3095, "step": 3103 }, { "epoch": 0.5098135829843147, "grad_norm": 2.125308298464471, "learning_rate": 1.01675792318746e-05, "loss": 0.7868, "step": 3104 }, { "epoch": 0.5099778270509978, "grad_norm": 0.6387823853736255, "learning_rate": 1.0162259730849306e-05, "loss": 0.3257, "step": 3105 }, { "epoch": 0.5101420711176808, "grad_norm": 1.7696254138640135, "learning_rate": 1.0156940183896737e-05, "loss": 0.8238, "step": 3106 }, { "epoch": 0.510306315184364, "grad_norm": 1.3853121157228234, "learning_rate": 1.0151620592522577e-05, "loss": 0.7454, "step": 3107 }, { "epoch": 0.510470559251047, "grad_norm": 1.4033774425239587, "learning_rate": 1.0146300958232528e-05, "loss": 0.8123, "step": 3108 }, { "epoch": 0.5106348033177301, "grad_norm": 1.6269349258856869, "learning_rate": 1.0140981282532301e-05, "loss": 0.8326, "step": 3109 }, { "epoch": 0.5107990473844133, "grad_norm": 1.4713927124905433, "learning_rate": 1.0135661566927619e-05, "loss": 0.8292, "step": 3110 }, { "epoch": 0.5109632914510963, "grad_norm": 1.6599790595893134, "learning_rate": 1.0130341812924215e-05, "loss": 0.7492, "step": 3111 }, { "epoch": 0.5111275355177795, "grad_norm": 1.7613813991493414, "learning_rate": 1.0125022022027834e-05, "loss": 0.8316, "step": 3112 }, { "epoch": 0.5112917795844625, "grad_norm": 1.6900376746308592, "learning_rate": 1.0119702195744236e-05, "loss": 0.8717, "step": 3113 }, { "epoch": 0.5114560236511456, "grad_norm": 4.871623580296546, "learning_rate": 1.011438233557918e-05, "loss": 0.7865, "step": 3114 }, { "epoch": 0.5116202677178286, "grad_norm": 1.6749575508691399, "learning_rate": 1.0109062443038446e-05, "loss": 0.7196, "step": 3115 }, { "epoch": 0.5117845117845118, "grad_norm": 2.156518829833684, "learning_rate": 1.0103742519627818e-05, "loss": 0.7635, "step": 3116 }, { "epoch": 0.5119487558511949, "grad_norm": 1.7455732187240995, "learning_rate": 1.0098422566853086e-05, "loss": 0.7823, "step": 3117 }, { "epoch": 0.512112999917878, "grad_norm": 1.9155223012541684, "learning_rate": 1.0093102586220056e-05, "loss": 0.7889, "step": 3118 }, { "epoch": 0.5122772439845611, "grad_norm": 1.6376380567650435, "learning_rate": 1.0087782579234532e-05, "loss": 0.7905, "step": 3119 }, { "epoch": 0.5124414880512441, "grad_norm": 1.9072699500225152, "learning_rate": 1.0082462547402337e-05, "loss": 0.6922, "step": 3120 }, { "epoch": 0.5126057321179273, "grad_norm": 1.5839339704563284, "learning_rate": 1.0077142492229288e-05, "loss": 0.7276, "step": 3121 }, { "epoch": 0.5127699761846103, "grad_norm": 1.9576992558116584, "learning_rate": 1.007182241522122e-05, "loss": 0.7061, "step": 3122 }, { "epoch": 0.5129342202512934, "grad_norm": 1.7770817620506567, "learning_rate": 1.0066502317883969e-05, "loss": 0.8301, "step": 3123 }, { "epoch": 0.5130984643179766, "grad_norm": 1.8468331183819369, "learning_rate": 1.0061182201723377e-05, "loss": 0.7852, "step": 3124 }, { "epoch": 0.5132627083846596, "grad_norm": 1.7118088845101147, "learning_rate": 1.005586206824529e-05, "loss": 0.6657, "step": 3125 }, { "epoch": 0.5134269524513427, "grad_norm": 1.6807588278685557, "learning_rate": 1.0050541918955564e-05, "loss": 0.7809, "step": 3126 }, { "epoch": 0.5135911965180258, "grad_norm": 1.6547652224859948, "learning_rate": 1.0045221755360053e-05, "loss": 0.725, "step": 3127 }, { "epoch": 0.5137554405847089, "grad_norm": 1.474424239978108, "learning_rate": 1.0039901578964619e-05, "loss": 0.8434, "step": 3128 }, { "epoch": 0.5139196846513919, "grad_norm": 1.7146268325665783, "learning_rate": 1.0034581391275129e-05, "loss": 0.7919, "step": 3129 }, { "epoch": 0.5140839287180751, "grad_norm": 1.745833156865599, "learning_rate": 1.0029261193797446e-05, "loss": 0.833, "step": 3130 }, { "epoch": 0.5142481727847581, "grad_norm": 1.582072726803433, "learning_rate": 1.0023940988037446e-05, "loss": 0.7872, "step": 3131 }, { "epoch": 0.5144124168514412, "grad_norm": 1.4566794967709913, "learning_rate": 1.0018620775500999e-05, "loss": 0.7462, "step": 3132 }, { "epoch": 0.5145766609181244, "grad_norm": 1.6957077376769947, "learning_rate": 1.0013300557693981e-05, "loss": 0.8177, "step": 3133 }, { "epoch": 0.5147409049848074, "grad_norm": 1.8525080670179839, "learning_rate": 1.0007980336122267e-05, "loss": 0.7235, "step": 3134 }, { "epoch": 0.5149051490514905, "grad_norm": 1.7416463008032033, "learning_rate": 1.0002660112291736e-05, "loss": 0.6989, "step": 3135 }, { "epoch": 0.5150693931181736, "grad_norm": 1.5204147296416264, "learning_rate": 9.997339887708269e-06, "loss": 0.8481, "step": 3136 }, { "epoch": 0.5152336371848567, "grad_norm": 1.6061408809266506, "learning_rate": 9.992019663877738e-06, "loss": 0.7385, "step": 3137 }, { "epoch": 0.5153978812515397, "grad_norm": 1.9994797497407117, "learning_rate": 9.986699442306025e-06, "loss": 0.7871, "step": 3138 }, { "epoch": 0.5155621253182229, "grad_norm": 1.95300128013303, "learning_rate": 9.981379224499006e-06, "loss": 0.8548, "step": 3139 }, { "epoch": 0.515726369384906, "grad_norm": 1.8253670675984393, "learning_rate": 9.976059011962557e-06, "loss": 0.8168, "step": 3140 }, { "epoch": 0.515890613451589, "grad_norm": 1.4512205595911638, "learning_rate": 9.970738806202557e-06, "loss": 0.7533, "step": 3141 }, { "epoch": 0.5160548575182722, "grad_norm": 1.584482465562951, "learning_rate": 9.965418608724875e-06, "loss": 0.7924, "step": 3142 }, { "epoch": 0.5162191015849552, "grad_norm": 2.7420432781187496, "learning_rate": 9.960098421035383e-06, "loss": 0.7371, "step": 3143 }, { "epoch": 0.5163833456516383, "grad_norm": 4.051175356464356, "learning_rate": 9.95477824463995e-06, "loss": 0.7868, "step": 3144 }, { "epoch": 0.5165475897183214, "grad_norm": 2.0047022137241144, "learning_rate": 9.94945808104444e-06, "loss": 0.6995, "step": 3145 }, { "epoch": 0.5167118337850045, "grad_norm": 1.4787167941282047, "learning_rate": 9.944137931754712e-06, "loss": 0.7647, "step": 3146 }, { "epoch": 0.5168760778516877, "grad_norm": 9.738946131423267, "learning_rate": 9.938817798276627e-06, "loss": 0.7424, "step": 3147 }, { "epoch": 0.5170403219183707, "grad_norm": 2.575498232498258, "learning_rate": 9.933497682116035e-06, "loss": 0.7364, "step": 3148 }, { "epoch": 0.5172045659850538, "grad_norm": 2.1572366185603378, "learning_rate": 9.928177584778783e-06, "loss": 0.7803, "step": 3149 }, { "epoch": 0.5173688100517368, "grad_norm": 1.4277305443996617, "learning_rate": 9.922857507770716e-06, "loss": 0.7342, "step": 3150 }, { "epoch": 0.51753305411842, "grad_norm": 1.5116285803183358, "learning_rate": 9.917537452597667e-06, "loss": 0.8183, "step": 3151 }, { "epoch": 0.517697298185103, "grad_norm": 1.7406464975909157, "learning_rate": 9.912217420765471e-06, "loss": 0.7462, "step": 3152 }, { "epoch": 0.5178615422517862, "grad_norm": 2.822480605575285, "learning_rate": 9.906897413779949e-06, "loss": 0.8284, "step": 3153 }, { "epoch": 0.5180257863184693, "grad_norm": 1.6633932519668229, "learning_rate": 9.901577433146915e-06, "loss": 0.8015, "step": 3154 }, { "epoch": 0.5181900303851523, "grad_norm": 1.7494516515003198, "learning_rate": 9.896257480372184e-06, "loss": 0.7736, "step": 3155 }, { "epoch": 0.5183542744518355, "grad_norm": 3.9794326296940814, "learning_rate": 9.890937556961554e-06, "loss": 0.8528, "step": 3156 }, { "epoch": 0.5185185185185185, "grad_norm": 2.5367467490787727, "learning_rate": 9.88561766442082e-06, "loss": 0.7215, "step": 3157 }, { "epoch": 0.5186827625852016, "grad_norm": 1.7636843192838334, "learning_rate": 9.88029780425577e-06, "loss": 0.7547, "step": 3158 }, { "epoch": 0.5188470066518847, "grad_norm": 1.9470694047725672, "learning_rate": 9.87497797797217e-06, "loss": 0.7737, "step": 3159 }, { "epoch": 0.5190112507185678, "grad_norm": 1.6081354749539736, "learning_rate": 9.86965818707579e-06, "loss": 0.8275, "step": 3160 }, { "epoch": 0.5191754947852509, "grad_norm": 1.7449321258553045, "learning_rate": 9.864338433072386e-06, "loss": 0.6897, "step": 3161 }, { "epoch": 0.519339738851934, "grad_norm": 2.0389252340258346, "learning_rate": 9.859018717467704e-06, "loss": 0.7516, "step": 3162 }, { "epoch": 0.5195039829186171, "grad_norm": 2.0703254941165676, "learning_rate": 9.853699041767473e-06, "loss": 0.7492, "step": 3163 }, { "epoch": 0.5196682269853001, "grad_norm": 1.687162225411849, "learning_rate": 9.848379407477425e-06, "loss": 0.7894, "step": 3164 }, { "epoch": 0.5198324710519833, "grad_norm": 1.4889361498970353, "learning_rate": 9.843059816103267e-06, "loss": 0.8267, "step": 3165 }, { "epoch": 0.5199967151186663, "grad_norm": 1.4551665918953263, "learning_rate": 9.837740269150696e-06, "loss": 0.8189, "step": 3166 }, { "epoch": 0.5201609591853494, "grad_norm": 1.7721518445547124, "learning_rate": 9.832420768125402e-06, "loss": 0.7603, "step": 3167 }, { "epoch": 0.5203252032520326, "grad_norm": 2.6108217561896017, "learning_rate": 9.827101314533056e-06, "loss": 0.7554, "step": 3168 }, { "epoch": 0.5204894473187156, "grad_norm": 1.9555482518880698, "learning_rate": 9.82178190987932e-06, "loss": 0.6992, "step": 3169 }, { "epoch": 0.5206536913853987, "grad_norm": 1.7671726858737449, "learning_rate": 9.816462555669838e-06, "loss": 0.7994, "step": 3170 }, { "epoch": 0.5208179354520818, "grad_norm": 1.3908115170015098, "learning_rate": 9.811143253410244e-06, "loss": 0.7692, "step": 3171 }, { "epoch": 0.5209821795187649, "grad_norm": 1.6861349683244444, "learning_rate": 9.805824004606156e-06, "loss": 0.6928, "step": 3172 }, { "epoch": 0.5211464235854479, "grad_norm": 1.45549321402251, "learning_rate": 9.800504810763176e-06, "loss": 0.7632, "step": 3173 }, { "epoch": 0.5213106676521311, "grad_norm": 1.4753129996927745, "learning_rate": 9.795185673386886e-06, "loss": 0.7871, "step": 3174 }, { "epoch": 0.5214749117188141, "grad_norm": 1.908386236595279, "learning_rate": 9.789866593982863e-06, "loss": 0.7875, "step": 3175 }, { "epoch": 0.5216391557854972, "grad_norm": 1.558626737549681, "learning_rate": 9.784547574056657e-06, "loss": 0.8355, "step": 3176 }, { "epoch": 0.5218033998521804, "grad_norm": 1.5306131250592125, "learning_rate": 9.779228615113808e-06, "loss": 0.7695, "step": 3177 }, { "epoch": 0.5219676439188634, "grad_norm": 1.8106447084748698, "learning_rate": 9.773909718659831e-06, "loss": 0.7601, "step": 3178 }, { "epoch": 0.5221318879855466, "grad_norm": 1.8573848304572076, "learning_rate": 9.768590886200241e-06, "loss": 0.8054, "step": 3179 }, { "epoch": 0.5222961320522296, "grad_norm": 1.8944313425515305, "learning_rate": 9.76327211924051e-06, "loss": 0.8066, "step": 3180 }, { "epoch": 0.5224603761189127, "grad_norm": 2.5507912697207344, "learning_rate": 9.757953419286107e-06, "loss": 0.7476, "step": 3181 }, { "epoch": 0.5226246201855957, "grad_norm": 1.6445314577394636, "learning_rate": 9.752634787842478e-06, "loss": 0.7348, "step": 3182 }, { "epoch": 0.5227888642522789, "grad_norm": 1.5827164227422916, "learning_rate": 9.747316226415052e-06, "loss": 0.7865, "step": 3183 }, { "epoch": 0.522953108318962, "grad_norm": 1.991660590200216, "learning_rate": 9.741997736509238e-06, "loss": 0.6681, "step": 3184 }, { "epoch": 0.523117352385645, "grad_norm": 0.6629532642105995, "learning_rate": 9.73667931963042e-06, "loss": 0.3797, "step": 3185 }, { "epoch": 0.5232815964523282, "grad_norm": 2.099704713293579, "learning_rate": 9.731360977283969e-06, "loss": 0.7842, "step": 3186 }, { "epoch": 0.5234458405190112, "grad_norm": 1.5196028841969869, "learning_rate": 9.726042710975224e-06, "loss": 0.7306, "step": 3187 }, { "epoch": 0.5236100845856944, "grad_norm": 2.175132155516005, "learning_rate": 9.720724522209518e-06, "loss": 0.8371, "step": 3188 }, { "epoch": 0.5237743286523774, "grad_norm": 1.7907612593928923, "learning_rate": 9.715406412492145e-06, "loss": 0.7628, "step": 3189 }, { "epoch": 0.5239385727190605, "grad_norm": 1.917522365692466, "learning_rate": 9.710088383328392e-06, "loss": 0.694, "step": 3190 }, { "epoch": 0.5241028167857437, "grad_norm": 2.6034804856136375, "learning_rate": 9.704770436223508e-06, "loss": 0.7869, "step": 3191 }, { "epoch": 0.5242670608524267, "grad_norm": 1.678170147385643, "learning_rate": 9.699452572682734e-06, "loss": 0.7393, "step": 3192 }, { "epoch": 0.5244313049191098, "grad_norm": 1.7196666146006714, "learning_rate": 9.694134794211277e-06, "loss": 0.7697, "step": 3193 }, { "epoch": 0.5245955489857929, "grad_norm": 3.4134835313158676, "learning_rate": 9.68881710231432e-06, "loss": 0.8155, "step": 3194 }, { "epoch": 0.524759793052476, "grad_norm": 1.858174701476833, "learning_rate": 9.683499498497032e-06, "loss": 0.7922, "step": 3195 }, { "epoch": 0.524924037119159, "grad_norm": 1.6935762103371554, "learning_rate": 9.678181984264543e-06, "loss": 0.8155, "step": 3196 }, { "epoch": 0.5250882811858422, "grad_norm": 3.0244210956508626, "learning_rate": 9.672864561121963e-06, "loss": 0.7541, "step": 3197 }, { "epoch": 0.5252525252525253, "grad_norm": 18.135639164349012, "learning_rate": 9.667547230574386e-06, "loss": 0.7856, "step": 3198 }, { "epoch": 0.5254167693192083, "grad_norm": 1.5565396037586439, "learning_rate": 9.662229994126862e-06, "loss": 0.706, "step": 3199 }, { "epoch": 0.5255810133858915, "grad_norm": 1.5761074034785032, "learning_rate": 9.65691285328443e-06, "loss": 0.8341, "step": 3200 }, { "epoch": 0.5257452574525745, "grad_norm": 0.592453385769932, "learning_rate": 9.651595809552094e-06, "loss": 0.3405, "step": 3201 }, { "epoch": 0.5259095015192576, "grad_norm": 1.698499697445035, "learning_rate": 9.64627886443483e-06, "loss": 0.7306, "step": 3202 }, { "epoch": 0.5260737455859407, "grad_norm": 0.6276924462712145, "learning_rate": 9.64096201943759e-06, "loss": 0.3517, "step": 3203 }, { "epoch": 0.5262379896526238, "grad_norm": 1.6760467221846305, "learning_rate": 9.635645276065293e-06, "loss": 0.7766, "step": 3204 }, { "epoch": 0.526402233719307, "grad_norm": 1.9405881311828987, "learning_rate": 9.630328635822835e-06, "loss": 0.7453, "step": 3205 }, { "epoch": 0.52656647778599, "grad_norm": 1.8069453321584, "learning_rate": 9.625012100215078e-06, "loss": 0.8219, "step": 3206 }, { "epoch": 0.5267307218526731, "grad_norm": 1.5585851445124708, "learning_rate": 9.61969567074686e-06, "loss": 0.7982, "step": 3207 }, { "epoch": 0.5268949659193561, "grad_norm": 2.0565242460769437, "learning_rate": 9.61437934892298e-06, "loss": 0.8197, "step": 3208 }, { "epoch": 0.5270592099860393, "grad_norm": 1.6127559692307245, "learning_rate": 9.609063136248214e-06, "loss": 0.7857, "step": 3209 }, { "epoch": 0.5272234540527223, "grad_norm": 1.626886939367807, "learning_rate": 9.603747034227301e-06, "loss": 0.7328, "step": 3210 }, { "epoch": 0.5273876981194054, "grad_norm": 2.0645818241513116, "learning_rate": 9.598431044364963e-06, "loss": 0.7747, "step": 3211 }, { "epoch": 0.5275519421860885, "grad_norm": 1.825443739520823, "learning_rate": 9.593115168165868e-06, "loss": 0.7446, "step": 3212 }, { "epoch": 0.5277161862527716, "grad_norm": 1.7993957613803124, "learning_rate": 9.587799407134672e-06, "loss": 0.6651, "step": 3213 }, { "epoch": 0.5278804303194548, "grad_norm": 1.7684496012745479, "learning_rate": 9.582483762775987e-06, "loss": 0.8194, "step": 3214 }, { "epoch": 0.5280446743861378, "grad_norm": 2.0960136911820197, "learning_rate": 9.577168236594393e-06, "loss": 0.8873, "step": 3215 }, { "epoch": 0.5282089184528209, "grad_norm": 1.7129415955418115, "learning_rate": 9.571852830094439e-06, "loss": 0.792, "step": 3216 }, { "epoch": 0.528373162519504, "grad_norm": 1.7352922970835059, "learning_rate": 9.566537544780641e-06, "loss": 0.848, "step": 3217 }, { "epoch": 0.5285374065861871, "grad_norm": 2.352289470342295, "learning_rate": 9.56122238215748e-06, "loss": 0.7533, "step": 3218 }, { "epoch": 0.5287016506528701, "grad_norm": 1.5854799527277499, "learning_rate": 9.555907343729402e-06, "loss": 0.7361, "step": 3219 }, { "epoch": 0.5288658947195533, "grad_norm": 1.5535506209852037, "learning_rate": 9.550592431000814e-06, "loss": 0.8143, "step": 3220 }, { "epoch": 0.5290301387862364, "grad_norm": 1.56854191142987, "learning_rate": 9.545277645476094e-06, "loss": 0.8053, "step": 3221 }, { "epoch": 0.5291943828529194, "grad_norm": 1.708497142117159, "learning_rate": 9.53996298865958e-06, "loss": 0.8126, "step": 3222 }, { "epoch": 0.5293586269196026, "grad_norm": 3.2714992146868367, "learning_rate": 9.534648462055576e-06, "loss": 0.7508, "step": 3223 }, { "epoch": 0.5295228709862856, "grad_norm": 1.731854653944809, "learning_rate": 9.529334067168344e-06, "loss": 0.7962, "step": 3224 }, { "epoch": 0.5296871150529687, "grad_norm": 1.8732666218883531, "learning_rate": 9.524019805502113e-06, "loss": 0.8109, "step": 3225 }, { "epoch": 0.5298513591196518, "grad_norm": 2.16386652710444, "learning_rate": 9.518705678561075e-06, "loss": 0.8644, "step": 3226 }, { "epoch": 0.5300156031863349, "grad_norm": 2.8027994037600674, "learning_rate": 9.513391687849383e-06, "loss": 0.7725, "step": 3227 }, { "epoch": 0.530179847253018, "grad_norm": 1.4811955995079489, "learning_rate": 9.50807783487115e-06, "loss": 0.7602, "step": 3228 }, { "epoch": 0.5303440913197011, "grad_norm": 1.5416431097856216, "learning_rate": 9.50276412113045e-06, "loss": 0.8299, "step": 3229 }, { "epoch": 0.5305083353863842, "grad_norm": 2.1424849713530048, "learning_rate": 9.497450548131319e-06, "loss": 0.7463, "step": 3230 }, { "epoch": 0.5306725794530672, "grad_norm": 2.5926618171790357, "learning_rate": 9.492137117377755e-06, "loss": 0.7387, "step": 3231 }, { "epoch": 0.5308368235197504, "grad_norm": 1.7531750747558843, "learning_rate": 9.48682383037371e-06, "loss": 0.7515, "step": 3232 }, { "epoch": 0.5310010675864334, "grad_norm": 2.2177029366566545, "learning_rate": 9.481510688623098e-06, "loss": 0.7949, "step": 3233 }, { "epoch": 0.5311653116531165, "grad_norm": 1.4505859482295627, "learning_rate": 9.476197693629798e-06, "loss": 0.8093, "step": 3234 }, { "epoch": 0.5313295557197997, "grad_norm": 1.4388038574243118, "learning_rate": 9.470884846897638e-06, "loss": 0.7986, "step": 3235 }, { "epoch": 0.5314937997864827, "grad_norm": 2.0461243118799746, "learning_rate": 9.465572149930408e-06, "loss": 0.7398, "step": 3236 }, { "epoch": 0.5316580438531658, "grad_norm": 2.3519172054955, "learning_rate": 9.460259604231859e-06, "loss": 0.8552, "step": 3237 }, { "epoch": 0.5318222879198489, "grad_norm": 1.4523889711039832, "learning_rate": 9.454947211305691e-06, "loss": 0.7584, "step": 3238 }, { "epoch": 0.531986531986532, "grad_norm": 1.5879047368418844, "learning_rate": 9.44963497265557e-06, "loss": 0.7391, "step": 3239 }, { "epoch": 0.532150776053215, "grad_norm": 1.7453665471059177, "learning_rate": 9.444322889785111e-06, "loss": 0.776, "step": 3240 }, { "epoch": 0.5323150201198982, "grad_norm": 1.8134802376779362, "learning_rate": 9.43901096419789e-06, "loss": 0.777, "step": 3241 }, { "epoch": 0.5324792641865813, "grad_norm": 1.588757197534856, "learning_rate": 9.433699197397435e-06, "loss": 0.7496, "step": 3242 }, { "epoch": 0.5326435082532643, "grad_norm": 1.973796354201527, "learning_rate": 9.42838759088723e-06, "loss": 0.9094, "step": 3243 }, { "epoch": 0.5328077523199475, "grad_norm": 0.6375336844134148, "learning_rate": 9.423076146170718e-06, "loss": 0.3038, "step": 3244 }, { "epoch": 0.5329719963866305, "grad_norm": 3.5839659586418753, "learning_rate": 9.417764864751287e-06, "loss": 0.759, "step": 3245 }, { "epoch": 0.5331362404533136, "grad_norm": 1.7345757491788965, "learning_rate": 9.412453748132286e-06, "loss": 0.7926, "step": 3246 }, { "epoch": 0.5333004845199967, "grad_norm": 1.8299526870128493, "learning_rate": 9.407142797817014e-06, "loss": 0.7808, "step": 3247 }, { "epoch": 0.5334647285866798, "grad_norm": 1.588919355850177, "learning_rate": 9.401832015308728e-06, "loss": 0.8086, "step": 3248 }, { "epoch": 0.5336289726533628, "grad_norm": 1.6149935586503903, "learning_rate": 9.39652140211063e-06, "loss": 0.7143, "step": 3249 }, { "epoch": 0.533793216720046, "grad_norm": 1.8687507262255074, "learning_rate": 9.39121095972588e-06, "loss": 0.8215, "step": 3250 }, { "epoch": 0.5339574607867291, "grad_norm": 0.6597466148457728, "learning_rate": 9.385900689657588e-06, "loss": 0.3463, "step": 3251 }, { "epoch": 0.5341217048534121, "grad_norm": 1.719597879938788, "learning_rate": 9.380590593408813e-06, "loss": 0.8286, "step": 3252 }, { "epoch": 0.5342859489200953, "grad_norm": 2.1130861192942896, "learning_rate": 9.375280672482567e-06, "loss": 0.7438, "step": 3253 }, { "epoch": 0.5344501929867783, "grad_norm": 1.6394556392831907, "learning_rate": 9.369970928381813e-06, "loss": 0.7877, "step": 3254 }, { "epoch": 0.5346144370534615, "grad_norm": 1.3454908901598701, "learning_rate": 9.364661362609464e-06, "loss": 0.719, "step": 3255 }, { "epoch": 0.5347786811201445, "grad_norm": 4.98772107243857, "learning_rate": 9.359351976668377e-06, "loss": 0.7588, "step": 3256 }, { "epoch": 0.5349429251868276, "grad_norm": 2.364991032441409, "learning_rate": 9.354042772061362e-06, "loss": 0.7312, "step": 3257 }, { "epoch": 0.5351071692535108, "grad_norm": 2.305824492143595, "learning_rate": 9.348733750291186e-06, "loss": 0.7633, "step": 3258 }, { "epoch": 0.5352714133201938, "grad_norm": 1.5719438225040483, "learning_rate": 9.343424912860552e-06, "loss": 0.8547, "step": 3259 }, { "epoch": 0.5354356573868769, "grad_norm": 1.6131289685501222, "learning_rate": 9.338116261272114e-06, "loss": 0.7461, "step": 3260 }, { "epoch": 0.53559990145356, "grad_norm": 2.8326798211459097, "learning_rate": 9.332807797028476e-06, "loss": 0.779, "step": 3261 }, { "epoch": 0.5357641455202431, "grad_norm": 1.633549079707516, "learning_rate": 9.327499521632187e-06, "loss": 0.7458, "step": 3262 }, { "epoch": 0.5359283895869261, "grad_norm": 1.51493736506274, "learning_rate": 9.322191436585745e-06, "loss": 0.7587, "step": 3263 }, { "epoch": 0.5360926336536093, "grad_norm": 2.217508608892609, "learning_rate": 9.316883543391589e-06, "loss": 0.747, "step": 3264 }, { "epoch": 0.5362568777202924, "grad_norm": 1.5332416853156614, "learning_rate": 9.31157584355211e-06, "loss": 0.7896, "step": 3265 }, { "epoch": 0.5364211217869754, "grad_norm": 1.5928700441089596, "learning_rate": 9.306268338569643e-06, "loss": 0.7827, "step": 3266 }, { "epoch": 0.5365853658536586, "grad_norm": 1.511108346748925, "learning_rate": 9.30096102994646e-06, "loss": 0.7234, "step": 3267 }, { "epoch": 0.5367496099203416, "grad_norm": 1.4137918703746735, "learning_rate": 9.295653919184787e-06, "loss": 0.8322, "step": 3268 }, { "epoch": 0.5369138539870247, "grad_norm": 1.4188351370492387, "learning_rate": 9.290347007786791e-06, "loss": 0.6881, "step": 3269 }, { "epoch": 0.5370780980537078, "grad_norm": 1.7173102155663778, "learning_rate": 9.28504029725458e-06, "loss": 0.67, "step": 3270 }, { "epoch": 0.5372423421203909, "grad_norm": 1.6771156450773441, "learning_rate": 9.27973378909021e-06, "loss": 0.6913, "step": 3271 }, { "epoch": 0.537406586187074, "grad_norm": 1.6063865244991367, "learning_rate": 9.274427484795676e-06, "loss": 0.8143, "step": 3272 }, { "epoch": 0.5375708302537571, "grad_norm": 2.0315352082995712, "learning_rate": 9.269121385872915e-06, "loss": 0.7799, "step": 3273 }, { "epoch": 0.5377350743204402, "grad_norm": 2.085523668706819, "learning_rate": 9.263815493823808e-06, "loss": 0.7595, "step": 3274 }, { "epoch": 0.5378993183871232, "grad_norm": 3.1215318583625247, "learning_rate": 9.258509810150177e-06, "loss": 0.7608, "step": 3275 }, { "epoch": 0.5380635624538064, "grad_norm": 1.599000526390559, "learning_rate": 9.253204336353786e-06, "loss": 0.7487, "step": 3276 }, { "epoch": 0.5382278065204894, "grad_norm": 1.8367551217019822, "learning_rate": 9.247899073936334e-06, "loss": 0.8232, "step": 3277 }, { "epoch": 0.5383920505871725, "grad_norm": 1.6990663147323224, "learning_rate": 9.242594024399467e-06, "loss": 0.7514, "step": 3278 }, { "epoch": 0.5385562946538557, "grad_norm": 1.54476951995201, "learning_rate": 9.237289189244769e-06, "loss": 0.815, "step": 3279 }, { "epoch": 0.5387205387205387, "grad_norm": 1.6737295609683713, "learning_rate": 9.23198456997376e-06, "loss": 0.7522, "step": 3280 }, { "epoch": 0.5388847827872218, "grad_norm": 3.014510427640422, "learning_rate": 9.226680168087903e-06, "loss": 0.8071, "step": 3281 }, { "epoch": 0.5390490268539049, "grad_norm": 2.4096209906356054, "learning_rate": 9.221375985088597e-06, "loss": 0.7822, "step": 3282 }, { "epoch": 0.539213270920588, "grad_norm": 1.6992360268807574, "learning_rate": 9.216072022477183e-06, "loss": 0.7737, "step": 3283 }, { "epoch": 0.539377514987271, "grad_norm": 1.3507906804040584, "learning_rate": 9.210768281754931e-06, "loss": 0.7354, "step": 3284 }, { "epoch": 0.5395417590539542, "grad_norm": 2.8048770563046572, "learning_rate": 9.205464764423059e-06, "loss": 0.8227, "step": 3285 }, { "epoch": 0.5397060031206372, "grad_norm": 1.9616955517727865, "learning_rate": 9.200161471982713e-06, "loss": 0.7745, "step": 3286 }, { "epoch": 0.5398702471873204, "grad_norm": 1.5434293336849314, "learning_rate": 9.194858405934983e-06, "loss": 0.8077, "step": 3287 }, { "epoch": 0.5400344912540035, "grad_norm": 0.6455576341181775, "learning_rate": 9.189555567780882e-06, "loss": 0.3389, "step": 3288 }, { "epoch": 0.5401987353206865, "grad_norm": 1.4496939536343618, "learning_rate": 9.184252959021374e-06, "loss": 0.7132, "step": 3289 }, { "epoch": 0.5403629793873697, "grad_norm": 2.0583427827182663, "learning_rate": 9.17895058115735e-06, "loss": 0.8315, "step": 3290 }, { "epoch": 0.5405272234540527, "grad_norm": 2.176580008507282, "learning_rate": 9.173648435689637e-06, "loss": 0.8092, "step": 3291 }, { "epoch": 0.5406914675207358, "grad_norm": 1.4851767572287566, "learning_rate": 9.168346524118994e-06, "loss": 0.8367, "step": 3292 }, { "epoch": 0.5408557115874189, "grad_norm": 7.5917237950087255, "learning_rate": 9.16304484794612e-06, "loss": 0.7587, "step": 3293 }, { "epoch": 0.541019955654102, "grad_norm": 1.9566051501354949, "learning_rate": 9.15774340867164e-06, "loss": 0.862, "step": 3294 }, { "epoch": 0.5411841997207851, "grad_norm": 1.6782367380874952, "learning_rate": 9.152442207796115e-06, "loss": 0.7659, "step": 3295 }, { "epoch": 0.5413484437874682, "grad_norm": 2.21361183651928, "learning_rate": 9.147141246820042e-06, "loss": 0.7299, "step": 3296 }, { "epoch": 0.5415126878541513, "grad_norm": 2.4996546439713927, "learning_rate": 9.141840527243844e-06, "loss": 0.7538, "step": 3297 }, { "epoch": 0.5416769319208343, "grad_norm": 1.5130274749471213, "learning_rate": 9.13654005056788e-06, "loss": 0.7824, "step": 3298 }, { "epoch": 0.5418411759875175, "grad_norm": 3.365170559731577, "learning_rate": 9.131239818292438e-06, "loss": 0.7833, "step": 3299 }, { "epoch": 0.5420054200542005, "grad_norm": 1.6542317529200965, "learning_rate": 9.125939831917738e-06, "loss": 0.8316, "step": 3300 }, { "epoch": 0.5421696641208836, "grad_norm": 1.7518390748197936, "learning_rate": 9.120640092943929e-06, "loss": 0.7533, "step": 3301 }, { "epoch": 0.5423339081875668, "grad_norm": 1.3362227848293056, "learning_rate": 9.11534060287109e-06, "loss": 0.8068, "step": 3302 }, { "epoch": 0.5424981522542498, "grad_norm": 2.0268887655167984, "learning_rate": 9.110041363199233e-06, "loss": 0.7195, "step": 3303 }, { "epoch": 0.5426623963209329, "grad_norm": 1.8682993416639309, "learning_rate": 9.104742375428297e-06, "loss": 0.7359, "step": 3304 }, { "epoch": 0.542826640387616, "grad_norm": 1.7222816073632532, "learning_rate": 9.099443641058147e-06, "loss": 0.8668, "step": 3305 }, { "epoch": 0.5429908844542991, "grad_norm": 1.6955871116199428, "learning_rate": 9.094145161588582e-06, "loss": 0.8035, "step": 3306 }, { "epoch": 0.5431551285209821, "grad_norm": 1.80297697702705, "learning_rate": 9.088846938519322e-06, "loss": 0.8154, "step": 3307 }, { "epoch": 0.5433193725876653, "grad_norm": 1.8179893275212375, "learning_rate": 9.083548973350019e-06, "loss": 0.7377, "step": 3308 }, { "epoch": 0.5434836166543484, "grad_norm": 1.6650300708761934, "learning_rate": 9.078251267580256e-06, "loss": 0.7868, "step": 3309 }, { "epoch": 0.5436478607210314, "grad_norm": 2.0384996413439227, "learning_rate": 9.072953822709526e-06, "loss": 0.8271, "step": 3310 }, { "epoch": 0.5438121047877146, "grad_norm": 1.6084416346163812, "learning_rate": 9.067656640237267e-06, "loss": 0.8273, "step": 3311 }, { "epoch": 0.5439763488543976, "grad_norm": 1.8407677945276752, "learning_rate": 9.062359721662836e-06, "loss": 0.6796, "step": 3312 }, { "epoch": 0.5441405929210807, "grad_norm": 1.474579317153755, "learning_rate": 9.057063068485513e-06, "loss": 0.7471, "step": 3313 }, { "epoch": 0.5443048369877638, "grad_norm": 1.624558716049495, "learning_rate": 9.051766682204504e-06, "loss": 0.7078, "step": 3314 }, { "epoch": 0.5444690810544469, "grad_norm": 1.4633749811229433, "learning_rate": 9.04647056431894e-06, "loss": 0.8172, "step": 3315 }, { "epoch": 0.54463332512113, "grad_norm": 1.9032365368084239, "learning_rate": 9.041174716327879e-06, "loss": 0.7885, "step": 3316 }, { "epoch": 0.5447975691878131, "grad_norm": 1.8513863655110985, "learning_rate": 9.035879139730294e-06, "loss": 0.7607, "step": 3317 }, { "epoch": 0.5449618132544962, "grad_norm": 1.382949265011111, "learning_rate": 9.030583836025093e-06, "loss": 0.8274, "step": 3318 }, { "epoch": 0.5451260573211792, "grad_norm": 1.6528971907229144, "learning_rate": 9.025288806711096e-06, "loss": 0.7599, "step": 3319 }, { "epoch": 0.5452903013878624, "grad_norm": 1.6702865071603354, "learning_rate": 9.019994053287053e-06, "loss": 0.767, "step": 3320 }, { "epoch": 0.5454545454545454, "grad_norm": 1.5660446623666253, "learning_rate": 9.014699577251631e-06, "loss": 0.802, "step": 3321 }, { "epoch": 0.5456187895212286, "grad_norm": 2.5064144460906177, "learning_rate": 9.009405380103422e-06, "loss": 0.7867, "step": 3322 }, { "epoch": 0.5457830335879116, "grad_norm": 1.834498793187381, "learning_rate": 9.004111463340935e-06, "loss": 0.7013, "step": 3323 }, { "epoch": 0.5459472776545947, "grad_norm": 2.080164225944108, "learning_rate": 8.998817828462603e-06, "loss": 0.6521, "step": 3324 }, { "epoch": 0.5461115217212779, "grad_norm": 1.7725952419176276, "learning_rate": 8.993524476966779e-06, "loss": 0.8566, "step": 3325 }, { "epoch": 0.5462757657879609, "grad_norm": 1.4949518081773392, "learning_rate": 8.988231410351731e-06, "loss": 0.7623, "step": 3326 }, { "epoch": 0.546440009854644, "grad_norm": 1.3382310325976474, "learning_rate": 8.982938630115657e-06, "loss": 0.766, "step": 3327 }, { "epoch": 0.546604253921327, "grad_norm": 2.0952432794053406, "learning_rate": 8.977646137756662e-06, "loss": 0.833, "step": 3328 }, { "epoch": 0.5467684979880102, "grad_norm": 1.6873321282690694, "learning_rate": 8.97235393477278e-06, "loss": 0.7996, "step": 3329 }, { "epoch": 0.5469327420546932, "grad_norm": 1.829361435670844, "learning_rate": 8.967062022661952e-06, "loss": 0.8071, "step": 3330 }, { "epoch": 0.5470969861213764, "grad_norm": 2.4300763209532406, "learning_rate": 8.961770402922052e-06, "loss": 0.774, "step": 3331 }, { "epoch": 0.5472612301880595, "grad_norm": 1.4596174950370995, "learning_rate": 8.956479077050849e-06, "loss": 0.8758, "step": 3332 }, { "epoch": 0.5474254742547425, "grad_norm": 2.6334778898155347, "learning_rate": 8.951188046546048e-06, "loss": 0.7306, "step": 3333 }, { "epoch": 0.5475897183214257, "grad_norm": 1.6894712662127125, "learning_rate": 8.945897312905265e-06, "loss": 0.7789, "step": 3334 }, { "epoch": 0.5477539623881087, "grad_norm": 1.4442877869158142, "learning_rate": 8.940606877626028e-06, "loss": 0.7951, "step": 3335 }, { "epoch": 0.5479182064547918, "grad_norm": 1.439689534633324, "learning_rate": 8.935316742205787e-06, "loss": 0.795, "step": 3336 }, { "epoch": 0.5480824505214749, "grad_norm": 1.3530822240717655, "learning_rate": 8.930026908141902e-06, "loss": 0.7554, "step": 3337 }, { "epoch": 0.548246694588158, "grad_norm": 1.8005641233701444, "learning_rate": 8.924737376931651e-06, "loss": 0.7065, "step": 3338 }, { "epoch": 0.5484109386548411, "grad_norm": 1.7949994062089838, "learning_rate": 8.919448150072221e-06, "loss": 0.7891, "step": 3339 }, { "epoch": 0.5485751827215242, "grad_norm": 1.7902298705507202, "learning_rate": 8.91415922906072e-06, "loss": 0.7684, "step": 3340 }, { "epoch": 0.5487394267882073, "grad_norm": 2.01917117207601, "learning_rate": 8.908870615394164e-06, "loss": 0.7918, "step": 3341 }, { "epoch": 0.5489036708548903, "grad_norm": 1.465113887601621, "learning_rate": 8.903582310569487e-06, "loss": 0.7945, "step": 3342 }, { "epoch": 0.5490679149215735, "grad_norm": 0.6943632662305224, "learning_rate": 8.898294316083529e-06, "loss": 0.3141, "step": 3343 }, { "epoch": 0.5492321589882565, "grad_norm": 2.0324467434748565, "learning_rate": 8.893006633433048e-06, "loss": 0.7856, "step": 3344 }, { "epoch": 0.5493964030549396, "grad_norm": 1.6989960205310572, "learning_rate": 8.887719264114709e-06, "loss": 0.816, "step": 3345 }, { "epoch": 0.5495606471216228, "grad_norm": 3.2614329809466334, "learning_rate": 8.882432209625092e-06, "loss": 0.7784, "step": 3346 }, { "epoch": 0.5497248911883058, "grad_norm": 0.6383270684692961, "learning_rate": 8.877145471460688e-06, "loss": 0.3343, "step": 3347 }, { "epoch": 0.549889135254989, "grad_norm": 1.688982585874095, "learning_rate": 8.871859051117896e-06, "loss": 0.8238, "step": 3348 }, { "epoch": 0.550053379321672, "grad_norm": 2.431199977339897, "learning_rate": 8.866572950093026e-06, "loss": 0.7773, "step": 3349 }, { "epoch": 0.5502176233883551, "grad_norm": 1.8953163953043277, "learning_rate": 8.861287169882295e-06, "loss": 0.7728, "step": 3350 }, { "epoch": 0.5503818674550381, "grad_norm": 3.820702092173332, "learning_rate": 8.856001711981839e-06, "loss": 0.7825, "step": 3351 }, { "epoch": 0.5505461115217213, "grad_norm": 1.4582330473222762, "learning_rate": 8.850716577887695e-06, "loss": 0.8246, "step": 3352 }, { "epoch": 0.5507103555884044, "grad_norm": 2.0021669016762424, "learning_rate": 8.8454317690958e-06, "loss": 0.6984, "step": 3353 }, { "epoch": 0.5508745996550874, "grad_norm": 1.702567620657356, "learning_rate": 8.840147287102016e-06, "loss": 0.7307, "step": 3354 }, { "epoch": 0.5510388437217706, "grad_norm": 1.6905107807208948, "learning_rate": 8.8348631334021e-06, "loss": 0.7928, "step": 3355 }, { "epoch": 0.5512030877884536, "grad_norm": 1.5248411201667935, "learning_rate": 8.829579309491724e-06, "loss": 0.7077, "step": 3356 }, { "epoch": 0.5513673318551368, "grad_norm": 1.900890562638965, "learning_rate": 8.824295816866463e-06, "loss": 0.7133, "step": 3357 }, { "epoch": 0.5515315759218198, "grad_norm": 1.3969553687854213, "learning_rate": 8.819012657021794e-06, "loss": 0.7698, "step": 3358 }, { "epoch": 0.5516958199885029, "grad_norm": 1.5350630992205796, "learning_rate": 8.81372983145311e-06, "loss": 0.808, "step": 3359 }, { "epoch": 0.5518600640551861, "grad_norm": 2.1014696239335238, "learning_rate": 8.808447341655703e-06, "loss": 0.7944, "step": 3360 }, { "epoch": 0.5520243081218691, "grad_norm": 1.613098207648187, "learning_rate": 8.803165189124768e-06, "loss": 0.7681, "step": 3361 }, { "epoch": 0.5521885521885522, "grad_norm": 1.7411539827552434, "learning_rate": 8.79788337535541e-06, "loss": 0.7173, "step": 3362 }, { "epoch": 0.5523527962552353, "grad_norm": 1.4915984942045841, "learning_rate": 8.79260190184263e-06, "loss": 0.8171, "step": 3363 }, { "epoch": 0.5525170403219184, "grad_norm": 1.655092548829865, "learning_rate": 8.787320770081345e-06, "loss": 0.7132, "step": 3364 }, { "epoch": 0.5526812843886014, "grad_norm": 1.7836730454742784, "learning_rate": 8.782039981566364e-06, "loss": 0.795, "step": 3365 }, { "epoch": 0.5528455284552846, "grad_norm": 1.6310037494237979, "learning_rate": 8.776759537792402e-06, "loss": 0.7184, "step": 3366 }, { "epoch": 0.5530097725219676, "grad_norm": 1.7449981311505298, "learning_rate": 8.771479440254082e-06, "loss": 0.7256, "step": 3367 }, { "epoch": 0.5531740165886507, "grad_norm": 1.9463856737130534, "learning_rate": 8.76619969044592e-06, "loss": 0.8186, "step": 3368 }, { "epoch": 0.5533382606553339, "grad_norm": 1.4775432347815758, "learning_rate": 8.760920289862341e-06, "loss": 0.7494, "step": 3369 }, { "epoch": 0.5535025047220169, "grad_norm": 1.5238436983488515, "learning_rate": 8.755641239997667e-06, "loss": 0.7529, "step": 3370 }, { "epoch": 0.5536667487887, "grad_norm": 1.746885579142163, "learning_rate": 8.75036254234612e-06, "loss": 0.7426, "step": 3371 }, { "epoch": 0.5538309928553831, "grad_norm": 1.632871118252715, "learning_rate": 8.745084198401828e-06, "loss": 0.8207, "step": 3372 }, { "epoch": 0.5539952369220662, "grad_norm": 1.9396400402808387, "learning_rate": 8.739806209658812e-06, "loss": 0.7785, "step": 3373 }, { "epoch": 0.5541594809887492, "grad_norm": 2.3555171249784483, "learning_rate": 8.734528577611004e-06, "loss": 0.6816, "step": 3374 }, { "epoch": 0.5543237250554324, "grad_norm": 1.5978684193707426, "learning_rate": 8.729251303752214e-06, "loss": 0.8701, "step": 3375 }, { "epoch": 0.5544879691221155, "grad_norm": 1.4742824112236763, "learning_rate": 8.72397438957617e-06, "loss": 0.7376, "step": 3376 }, { "epoch": 0.5546522131887985, "grad_norm": 1.8149964698354808, "learning_rate": 8.71869783657649e-06, "loss": 0.8579, "step": 3377 }, { "epoch": 0.5548164572554817, "grad_norm": 1.7036878747094437, "learning_rate": 8.713421646246692e-06, "loss": 0.7683, "step": 3378 }, { "epoch": 0.5549807013221647, "grad_norm": 1.4624915781384529, "learning_rate": 8.70814582008019e-06, "loss": 0.8398, "step": 3379 }, { "epoch": 0.5551449453888478, "grad_norm": 1.4608247411316753, "learning_rate": 8.702870359570296e-06, "loss": 0.683, "step": 3380 }, { "epoch": 0.5553091894555309, "grad_norm": 1.5765859688420951, "learning_rate": 8.697595266210217e-06, "loss": 0.7523, "step": 3381 }, { "epoch": 0.555473433522214, "grad_norm": 1.6796364430451034, "learning_rate": 8.692320541493058e-06, "loss": 0.753, "step": 3382 }, { "epoch": 0.5556376775888971, "grad_norm": 2.613283362508903, "learning_rate": 8.687046186911819e-06, "loss": 0.7596, "step": 3383 }, { "epoch": 0.5558019216555802, "grad_norm": 1.5438975729817828, "learning_rate": 8.681772203959395e-06, "loss": 0.7495, "step": 3384 }, { "epoch": 0.5559661657222633, "grad_norm": 1.5276556362397467, "learning_rate": 8.676498594128576e-06, "loss": 0.6962, "step": 3385 }, { "epoch": 0.5561304097889463, "grad_norm": 1.721972278538643, "learning_rate": 8.671225358912044e-06, "loss": 0.7887, "step": 3386 }, { "epoch": 0.5562946538556295, "grad_norm": 2.1640283475298663, "learning_rate": 8.665952499802379e-06, "loss": 0.8112, "step": 3387 }, { "epoch": 0.5564588979223125, "grad_norm": 1.8314598336997692, "learning_rate": 8.660680018292053e-06, "loss": 0.8333, "step": 3388 }, { "epoch": 0.5566231419889957, "grad_norm": 1.6670623139209606, "learning_rate": 8.65540791587343e-06, "loss": 0.7483, "step": 3389 }, { "epoch": 0.5567873860556788, "grad_norm": 1.7902388002447276, "learning_rate": 8.650136194038767e-06, "loss": 0.7937, "step": 3390 }, { "epoch": 0.5569516301223618, "grad_norm": 1.3841750448111696, "learning_rate": 8.644864854280214e-06, "loss": 0.8139, "step": 3391 }, { "epoch": 0.557115874189045, "grad_norm": 1.965038627179026, "learning_rate": 8.639593898089815e-06, "loss": 0.8178, "step": 3392 }, { "epoch": 0.557280118255728, "grad_norm": 2.0592590243008573, "learning_rate": 8.634323326959501e-06, "loss": 0.7239, "step": 3393 }, { "epoch": 0.5574443623224111, "grad_norm": 1.812949859323635, "learning_rate": 8.629053142381093e-06, "loss": 0.8272, "step": 3394 }, { "epoch": 0.5576086063890942, "grad_norm": 1.5103402715679064, "learning_rate": 8.623783345846313e-06, "loss": 0.7918, "step": 3395 }, { "epoch": 0.5577728504557773, "grad_norm": 1.6504596396408813, "learning_rate": 8.618513938846763e-06, "loss": 0.7353, "step": 3396 }, { "epoch": 0.5579370945224604, "grad_norm": 1.4149280040411847, "learning_rate": 8.613244922873932e-06, "loss": 0.7978, "step": 3397 }, { "epoch": 0.5581013385891435, "grad_norm": 1.4978403785795869, "learning_rate": 8.60797629941921e-06, "loss": 0.6824, "step": 3398 }, { "epoch": 0.5582655826558266, "grad_norm": 8.232240214207133, "learning_rate": 8.602708069973866e-06, "loss": 0.7821, "step": 3399 }, { "epoch": 0.5584298267225096, "grad_norm": 1.3847835578260947, "learning_rate": 8.597440236029064e-06, "loss": 0.7389, "step": 3400 }, { "epoch": 0.5585940707891928, "grad_norm": 1.8855461271301144, "learning_rate": 8.592172799075853e-06, "loss": 0.6839, "step": 3401 }, { "epoch": 0.5587583148558758, "grad_norm": 1.573895026783421, "learning_rate": 8.586905760605169e-06, "loss": 0.7674, "step": 3402 }, { "epoch": 0.5589225589225589, "grad_norm": 1.9592894075844782, "learning_rate": 8.581639122107837e-06, "loss": 0.797, "step": 3403 }, { "epoch": 0.559086802989242, "grad_norm": 1.5500010826820296, "learning_rate": 8.576372885074567e-06, "loss": 0.7315, "step": 3404 }, { "epoch": 0.5592510470559251, "grad_norm": 0.6704909827189658, "learning_rate": 8.571107050995955e-06, "loss": 0.3355, "step": 3405 }, { "epoch": 0.5594152911226082, "grad_norm": 1.4578447631019549, "learning_rate": 8.565841621362488e-06, "loss": 0.7769, "step": 3406 }, { "epoch": 0.5595795351892913, "grad_norm": 1.5935783574929026, "learning_rate": 8.560576597664533e-06, "loss": 0.7836, "step": 3407 }, { "epoch": 0.5597437792559744, "grad_norm": 1.5411585030687263, "learning_rate": 8.555311981392342e-06, "loss": 0.7262, "step": 3408 }, { "epoch": 0.5599080233226574, "grad_norm": 1.468252885245683, "learning_rate": 8.550047774036058e-06, "loss": 0.8131, "step": 3409 }, { "epoch": 0.5600722673893406, "grad_norm": 2.2705096714361517, "learning_rate": 8.5447839770857e-06, "loss": 0.8318, "step": 3410 }, { "epoch": 0.5602365114560236, "grad_norm": 2.019126723710516, "learning_rate": 8.539520592031176e-06, "loss": 0.7167, "step": 3411 }, { "epoch": 0.5604007555227067, "grad_norm": 1.7597758577257092, "learning_rate": 8.534257620362277e-06, "loss": 0.8034, "step": 3412 }, { "epoch": 0.5605649995893899, "grad_norm": 1.83123928332205, "learning_rate": 8.528995063568673e-06, "loss": 0.6961, "step": 3413 }, { "epoch": 0.5607292436560729, "grad_norm": 1.7346688433106074, "learning_rate": 8.523732923139922e-06, "loss": 0.7405, "step": 3414 }, { "epoch": 0.560893487722756, "grad_norm": 1.5648988558101158, "learning_rate": 8.518471200565461e-06, "loss": 0.7384, "step": 3415 }, { "epoch": 0.5610577317894391, "grad_norm": 1.8359153344673933, "learning_rate": 8.513209897334612e-06, "loss": 0.8065, "step": 3416 }, { "epoch": 0.5612219758561222, "grad_norm": 0.6503920252158043, "learning_rate": 8.507949014936573e-06, "loss": 0.3651, "step": 3417 }, { "epoch": 0.5613862199228052, "grad_norm": 1.9019457386980987, "learning_rate": 8.502688554860426e-06, "loss": 0.835, "step": 3418 }, { "epoch": 0.5615504639894884, "grad_norm": 1.4807037250579735, "learning_rate": 8.497428518595132e-06, "loss": 0.7294, "step": 3419 }, { "epoch": 0.5617147080561715, "grad_norm": 1.8736227942591426, "learning_rate": 8.492168907629534e-06, "loss": 0.6381, "step": 3420 }, { "epoch": 0.5618789521228545, "grad_norm": 2.2412473697345425, "learning_rate": 8.486909723452356e-06, "loss": 0.8014, "step": 3421 }, { "epoch": 0.5620431961895377, "grad_norm": 1.5830163857402013, "learning_rate": 8.481650967552199e-06, "loss": 0.8157, "step": 3422 }, { "epoch": 0.5622074402562207, "grad_norm": 1.8641337241929692, "learning_rate": 8.47639264141754e-06, "loss": 0.8251, "step": 3423 }, { "epoch": 0.5623716843229039, "grad_norm": 1.4100246340871934, "learning_rate": 8.471134746536737e-06, "loss": 0.7389, "step": 3424 }, { "epoch": 0.5625359283895869, "grad_norm": 1.7313185553926287, "learning_rate": 8.465877284398029e-06, "loss": 0.7768, "step": 3425 }, { "epoch": 0.56270017245627, "grad_norm": 2.065076687887529, "learning_rate": 8.460620256489528e-06, "loss": 0.7618, "step": 3426 }, { "epoch": 0.5628644165229532, "grad_norm": 1.6075818361081708, "learning_rate": 8.455363664299225e-06, "loss": 0.8413, "step": 3427 }, { "epoch": 0.5630286605896362, "grad_norm": 1.6746101448623194, "learning_rate": 8.450107509314983e-06, "loss": 0.8226, "step": 3428 }, { "epoch": 0.5631929046563193, "grad_norm": 1.5648023503400812, "learning_rate": 8.444851793024555e-06, "loss": 0.7834, "step": 3429 }, { "epoch": 0.5633571487230024, "grad_norm": 1.4830694129529953, "learning_rate": 8.439596516915553e-06, "loss": 0.7988, "step": 3430 }, { "epoch": 0.5635213927896855, "grad_norm": 1.7885845666584606, "learning_rate": 8.434341682475476e-06, "loss": 0.7688, "step": 3431 }, { "epoch": 0.5636856368563685, "grad_norm": 1.4767038860722852, "learning_rate": 8.42908729119169e-06, "loss": 0.6848, "step": 3432 }, { "epoch": 0.5638498809230517, "grad_norm": 1.5321828063233582, "learning_rate": 8.423833344551443e-06, "loss": 0.752, "step": 3433 }, { "epoch": 0.5640141249897348, "grad_norm": 1.9654159055392517, "learning_rate": 8.418579844041852e-06, "loss": 0.8749, "step": 3434 }, { "epoch": 0.5641783690564178, "grad_norm": 1.7545947509314181, "learning_rate": 8.413326791149909e-06, "loss": 0.8011, "step": 3435 }, { "epoch": 0.564342613123101, "grad_norm": 1.621875429243046, "learning_rate": 8.408074187362479e-06, "loss": 0.7365, "step": 3436 }, { "epoch": 0.564506857189784, "grad_norm": 2.394986686070994, "learning_rate": 8.402822034166301e-06, "loss": 0.7763, "step": 3437 }, { "epoch": 0.5646711012564671, "grad_norm": 2.2155652314393, "learning_rate": 8.397570333047985e-06, "loss": 0.7331, "step": 3438 }, { "epoch": 0.5648353453231502, "grad_norm": 1.9649744820106112, "learning_rate": 8.392319085494018e-06, "loss": 0.7803, "step": 3439 }, { "epoch": 0.5649995893898333, "grad_norm": 1.9919878541268494, "learning_rate": 8.38706829299075e-06, "loss": 0.7719, "step": 3440 }, { "epoch": 0.5651638334565163, "grad_norm": 1.9302542525375173, "learning_rate": 8.381817957024409e-06, "loss": 0.8127, "step": 3441 }, { "epoch": 0.5653280775231995, "grad_norm": 1.5151750780551085, "learning_rate": 8.37656807908109e-06, "loss": 0.7812, "step": 3442 }, { "epoch": 0.5654923215898826, "grad_norm": 1.6950088020671161, "learning_rate": 8.37131866064676e-06, "loss": 0.7521, "step": 3443 }, { "epoch": 0.5656565656565656, "grad_norm": 5.168473836433209, "learning_rate": 8.366069703207257e-06, "loss": 0.6898, "step": 3444 }, { "epoch": 0.5658208097232488, "grad_norm": 1.791884257609125, "learning_rate": 8.360821208248289e-06, "loss": 0.7826, "step": 3445 }, { "epoch": 0.5659850537899318, "grad_norm": 0.6429435412089181, "learning_rate": 8.355573177255428e-06, "loss": 0.3597, "step": 3446 }, { "epoch": 0.5661492978566149, "grad_norm": 2.845535016521224, "learning_rate": 8.35032561171412e-06, "loss": 0.7639, "step": 3447 }, { "epoch": 0.566313541923298, "grad_norm": 1.425984357125665, "learning_rate": 8.345078513109677e-06, "loss": 0.7651, "step": 3448 }, { "epoch": 0.5664777859899811, "grad_norm": 1.5178128485007785, "learning_rate": 8.339831882927279e-06, "loss": 0.7309, "step": 3449 }, { "epoch": 0.5666420300566642, "grad_norm": 1.9090479075099112, "learning_rate": 8.334585722651973e-06, "loss": 0.7704, "step": 3450 }, { "epoch": 0.5668062741233473, "grad_norm": 0.6218493937895925, "learning_rate": 8.329340033768672e-06, "loss": 0.3329, "step": 3451 }, { "epoch": 0.5669705181900304, "grad_norm": 1.670334736384177, "learning_rate": 8.324094817762164e-06, "loss": 0.7555, "step": 3452 }, { "epoch": 0.5671347622567134, "grad_norm": 1.8187512717298249, "learning_rate": 8.31885007611709e-06, "loss": 0.7008, "step": 3453 }, { "epoch": 0.5672990063233966, "grad_norm": 1.6966862518654533, "learning_rate": 8.313605810317967e-06, "loss": 0.7659, "step": 3454 }, { "epoch": 0.5674632503900796, "grad_norm": 1.8584626954206473, "learning_rate": 8.30836202184917e-06, "loss": 0.6951, "step": 3455 }, { "epoch": 0.5676274944567627, "grad_norm": 2.1039274325906896, "learning_rate": 8.303118712194944e-06, "loss": 0.7402, "step": 3456 }, { "epoch": 0.5677917385234459, "grad_norm": 1.5167785665066316, "learning_rate": 8.297875882839397e-06, "loss": 0.8314, "step": 3457 }, { "epoch": 0.5679559825901289, "grad_norm": 2.331237632749007, "learning_rate": 8.2926335352665e-06, "loss": 0.7392, "step": 3458 }, { "epoch": 0.568120226656812, "grad_norm": 1.6833937451866126, "learning_rate": 8.28739167096009e-06, "loss": 0.77, "step": 3459 }, { "epoch": 0.5682844707234951, "grad_norm": 1.9672008319109875, "learning_rate": 8.282150291403867e-06, "loss": 0.7599, "step": 3460 }, { "epoch": 0.5684487147901782, "grad_norm": 1.7226349901644256, "learning_rate": 8.276909398081387e-06, "loss": 0.772, "step": 3461 }, { "epoch": 0.5686129588568613, "grad_norm": 0.6841951080056365, "learning_rate": 8.271668992476077e-06, "loss": 0.3166, "step": 3462 }, { "epoch": 0.5687772029235444, "grad_norm": 1.5080821807271747, "learning_rate": 8.266429076071221e-06, "loss": 0.7104, "step": 3463 }, { "epoch": 0.5689414469902275, "grad_norm": 1.8744830218479458, "learning_rate": 8.261189650349969e-06, "loss": 0.6832, "step": 3464 }, { "epoch": 0.5691056910569106, "grad_norm": 1.9718526047440834, "learning_rate": 8.255950716795328e-06, "loss": 0.8194, "step": 3465 }, { "epoch": 0.5692699351235937, "grad_norm": 1.9587780937838337, "learning_rate": 8.250712276890168e-06, "loss": 0.8063, "step": 3466 }, { "epoch": 0.5694341791902767, "grad_norm": 1.550540563954026, "learning_rate": 8.245474332117219e-06, "loss": 0.7283, "step": 3467 }, { "epoch": 0.5695984232569599, "grad_norm": 1.8514613039084347, "learning_rate": 8.240236883959067e-06, "loss": 0.7685, "step": 3468 }, { "epoch": 0.5697626673236429, "grad_norm": 1.754820787294976, "learning_rate": 8.234999933898164e-06, "loss": 0.8332, "step": 3469 }, { "epoch": 0.569926911390326, "grad_norm": 1.8238060821037316, "learning_rate": 8.229763483416815e-06, "loss": 0.7535, "step": 3470 }, { "epoch": 0.5700911554570092, "grad_norm": 1.5897390879443534, "learning_rate": 8.22452753399719e-06, "loss": 0.773, "step": 3471 }, { "epoch": 0.5702553995236922, "grad_norm": 2.28052801783084, "learning_rate": 8.219292087121309e-06, "loss": 0.7175, "step": 3472 }, { "epoch": 0.5704196435903753, "grad_norm": 1.8010593168496958, "learning_rate": 8.214057144271058e-06, "loss": 0.7336, "step": 3473 }, { "epoch": 0.5705838876570584, "grad_norm": 1.671759878653626, "learning_rate": 8.208822706928172e-06, "loss": 0.8084, "step": 3474 }, { "epoch": 0.5707481317237415, "grad_norm": 1.6487252510861254, "learning_rate": 8.203588776574254e-06, "loss": 0.7457, "step": 3475 }, { "epoch": 0.5709123757904245, "grad_norm": 1.6850757595112718, "learning_rate": 8.198355354690752e-06, "loss": 0.819, "step": 3476 }, { "epoch": 0.5710766198571077, "grad_norm": 1.95719607129057, "learning_rate": 8.193122442758977e-06, "loss": 0.8002, "step": 3477 }, { "epoch": 0.5712408639237907, "grad_norm": 1.6834901432853333, "learning_rate": 8.187890042260094e-06, "loss": 0.8149, "step": 3478 }, { "epoch": 0.5714051079904738, "grad_norm": 1.8936248474118205, "learning_rate": 8.18265815467512e-06, "loss": 0.8181, "step": 3479 }, { "epoch": 0.571569352057157, "grad_norm": 1.7219772055987461, "learning_rate": 8.177426781484933e-06, "loss": 0.7907, "step": 3480 }, { "epoch": 0.57173359612384, "grad_norm": 2.517902390494541, "learning_rate": 8.172195924170263e-06, "loss": 0.8139, "step": 3481 }, { "epoch": 0.5718978401905231, "grad_norm": 1.4910982923722622, "learning_rate": 8.166965584211694e-06, "loss": 0.8296, "step": 3482 }, { "epoch": 0.5720620842572062, "grad_norm": 1.4529411632713034, "learning_rate": 8.161735763089654e-06, "loss": 0.7964, "step": 3483 }, { "epoch": 0.5722263283238893, "grad_norm": 2.492450028552681, "learning_rate": 8.15650646228444e-06, "loss": 0.7198, "step": 3484 }, { "epoch": 0.5723905723905723, "grad_norm": 2.2054024991531387, "learning_rate": 8.151277683276196e-06, "loss": 0.6936, "step": 3485 }, { "epoch": 0.5725548164572555, "grad_norm": 1.7145438019500074, "learning_rate": 8.146049427544912e-06, "loss": 0.7265, "step": 3486 }, { "epoch": 0.5727190605239386, "grad_norm": 1.365505375572111, "learning_rate": 8.140821696570439e-06, "loss": 0.7227, "step": 3487 }, { "epoch": 0.5728833045906216, "grad_norm": 1.4683059461461851, "learning_rate": 8.13559449183247e-06, "loss": 0.7974, "step": 3488 }, { "epoch": 0.5730475486573048, "grad_norm": 2.745833365494509, "learning_rate": 8.130367814810561e-06, "loss": 0.7673, "step": 3489 }, { "epoch": 0.5732117927239878, "grad_norm": 1.6750055480723658, "learning_rate": 8.125141666984107e-06, "loss": 0.7708, "step": 3490 }, { "epoch": 0.573376036790671, "grad_norm": 0.6538355162469818, "learning_rate": 8.119916049832362e-06, "loss": 0.3357, "step": 3491 }, { "epoch": 0.573540280857354, "grad_norm": 1.5172371005658634, "learning_rate": 8.114690964834422e-06, "loss": 0.7117, "step": 3492 }, { "epoch": 0.5737045249240371, "grad_norm": 2.251328993073124, "learning_rate": 8.109466413469238e-06, "loss": 0.7894, "step": 3493 }, { "epoch": 0.5738687689907203, "grad_norm": 1.508932979617505, "learning_rate": 8.104242397215609e-06, "loss": 0.7674, "step": 3494 }, { "epoch": 0.5740330130574033, "grad_norm": 1.8408822287913764, "learning_rate": 8.09901891755218e-06, "loss": 0.7492, "step": 3495 }, { "epoch": 0.5741972571240864, "grad_norm": 1.4478383971963318, "learning_rate": 8.093795975957449e-06, "loss": 0.7199, "step": 3496 }, { "epoch": 0.5743615011907695, "grad_norm": 1.4803462551434048, "learning_rate": 8.088573573909755e-06, "loss": 0.7498, "step": 3497 }, { "epoch": 0.5745257452574526, "grad_norm": 1.4692226478182813, "learning_rate": 8.083351712887288e-06, "loss": 0.7595, "step": 3498 }, { "epoch": 0.5746899893241356, "grad_norm": 0.6287455281036454, "learning_rate": 8.078130394368088e-06, "loss": 0.3253, "step": 3499 }, { "epoch": 0.5748542333908188, "grad_norm": 1.6706830983496312, "learning_rate": 8.072909619830037e-06, "loss": 0.8617, "step": 3500 }, { "epoch": 0.5750184774575019, "grad_norm": 2.1981219096409057, "learning_rate": 8.067689390750863e-06, "loss": 0.745, "step": 3501 }, { "epoch": 0.5751827215241849, "grad_norm": 2.0502631818406254, "learning_rate": 8.062469708608144e-06, "loss": 0.8144, "step": 3502 }, { "epoch": 0.5753469655908681, "grad_norm": 1.6812640992123065, "learning_rate": 8.057250574879296e-06, "loss": 0.8551, "step": 3503 }, { "epoch": 0.5755112096575511, "grad_norm": 2.4252466043461838, "learning_rate": 8.052031991041591e-06, "loss": 0.8614, "step": 3504 }, { "epoch": 0.5756754537242342, "grad_norm": 1.6546878136252485, "learning_rate": 8.046813958572129e-06, "loss": 0.8096, "step": 3505 }, { "epoch": 0.5758396977909173, "grad_norm": 1.997560477817144, "learning_rate": 8.041596478947862e-06, "loss": 0.6865, "step": 3506 }, { "epoch": 0.5760039418576004, "grad_norm": 1.4849955429824873, "learning_rate": 8.036379553645595e-06, "loss": 0.8173, "step": 3507 }, { "epoch": 0.5761681859242835, "grad_norm": 2.4297857187116985, "learning_rate": 8.031163184141965e-06, "loss": 0.7798, "step": 3508 }, { "epoch": 0.5763324299909666, "grad_norm": 1.6633025028499118, "learning_rate": 8.025947371913454e-06, "loss": 0.6645, "step": 3509 }, { "epoch": 0.5764966740576497, "grad_norm": 1.8014124898037702, "learning_rate": 8.020732118436385e-06, "loss": 0.7771, "step": 3510 }, { "epoch": 0.5766609181243327, "grad_norm": 2.2662410168390688, "learning_rate": 8.015517425186926e-06, "loss": 0.7383, "step": 3511 }, { "epoch": 0.5768251621910159, "grad_norm": 1.4996613747670569, "learning_rate": 8.010303293641086e-06, "loss": 0.6788, "step": 3512 }, { "epoch": 0.5769894062576989, "grad_norm": 1.5820533316546335, "learning_rate": 8.005089725274711e-06, "loss": 0.7642, "step": 3513 }, { "epoch": 0.577153650324382, "grad_norm": 1.4223438491116238, "learning_rate": 7.999876721563494e-06, "loss": 0.7556, "step": 3514 }, { "epoch": 0.5773178943910651, "grad_norm": 2.939779369229469, "learning_rate": 7.994664283982962e-06, "loss": 0.744, "step": 3515 }, { "epoch": 0.5774821384577482, "grad_norm": 1.6227290667714314, "learning_rate": 7.989452414008485e-06, "loss": 0.7736, "step": 3516 }, { "epoch": 0.5776463825244313, "grad_norm": 1.3248570973590552, "learning_rate": 7.984241113115275e-06, "loss": 0.7507, "step": 3517 }, { "epoch": 0.5778106265911144, "grad_norm": 0.6997587894026525, "learning_rate": 7.979030382778376e-06, "loss": 0.3139, "step": 3518 }, { "epoch": 0.5779748706577975, "grad_norm": 1.6853716101853304, "learning_rate": 7.973820224472675e-06, "loss": 0.8502, "step": 3519 }, { "epoch": 0.5781391147244805, "grad_norm": 0.656218158369538, "learning_rate": 7.968610639672896e-06, "loss": 0.3584, "step": 3520 }, { "epoch": 0.5783033587911637, "grad_norm": 2.0458977167945043, "learning_rate": 7.9634016298536e-06, "loss": 0.8331, "step": 3521 }, { "epoch": 0.5784676028578467, "grad_norm": 2.3673749499483656, "learning_rate": 7.958193196489191e-06, "loss": 0.6921, "step": 3522 }, { "epoch": 0.5786318469245298, "grad_norm": 1.7523879480132905, "learning_rate": 7.952985341053902e-06, "loss": 0.8421, "step": 3523 }, { "epoch": 0.578796090991213, "grad_norm": 2.0197255866576227, "learning_rate": 7.947778065021805e-06, "loss": 0.7118, "step": 3524 }, { "epoch": 0.578960335057896, "grad_norm": 4.3183092551836175, "learning_rate": 7.942571369866814e-06, "loss": 0.7366, "step": 3525 }, { "epoch": 0.5791245791245792, "grad_norm": 1.7079415973636052, "learning_rate": 7.937365257062664e-06, "loss": 0.7458, "step": 3526 }, { "epoch": 0.5792888231912622, "grad_norm": 1.6615204686021716, "learning_rate": 7.932159728082938e-06, "loss": 0.8297, "step": 3527 }, { "epoch": 0.5794530672579453, "grad_norm": 2.9804350076433628, "learning_rate": 7.92695478440105e-06, "loss": 0.7047, "step": 3528 }, { "epoch": 0.5796173113246283, "grad_norm": 1.6308378526360439, "learning_rate": 7.921750427490248e-06, "loss": 0.8481, "step": 3529 }, { "epoch": 0.5797815553913115, "grad_norm": 1.7128151393745066, "learning_rate": 7.916546658823618e-06, "loss": 0.6995, "step": 3530 }, { "epoch": 0.5799457994579946, "grad_norm": 1.588992494392073, "learning_rate": 7.911343479874073e-06, "loss": 0.7853, "step": 3531 }, { "epoch": 0.5801100435246777, "grad_norm": 1.3021728590141195, "learning_rate": 7.906140892114361e-06, "loss": 0.7036, "step": 3532 }, { "epoch": 0.5802742875913608, "grad_norm": 2.1584490529117146, "learning_rate": 7.900938897017064e-06, "loss": 0.8414, "step": 3533 }, { "epoch": 0.5804385316580438, "grad_norm": 1.6176672155758385, "learning_rate": 7.895737496054597e-06, "loss": 0.804, "step": 3534 }, { "epoch": 0.580602775724727, "grad_norm": 2.0901234601154934, "learning_rate": 7.890536690699204e-06, "loss": 0.8272, "step": 3535 }, { "epoch": 0.58076701979141, "grad_norm": 2.338814832307023, "learning_rate": 7.885336482422964e-06, "loss": 0.794, "step": 3536 }, { "epoch": 0.5809312638580931, "grad_norm": 1.8486159957076176, "learning_rate": 7.880136872697784e-06, "loss": 0.6854, "step": 3537 }, { "epoch": 0.5810955079247763, "grad_norm": 1.611416408881241, "learning_rate": 7.874937862995401e-06, "loss": 0.7807, "step": 3538 }, { "epoch": 0.5812597519914593, "grad_norm": 1.658809587627324, "learning_rate": 7.869739454787385e-06, "loss": 0.7381, "step": 3539 }, { "epoch": 0.5814239960581424, "grad_norm": 1.9167306654612504, "learning_rate": 7.864541649545135e-06, "loss": 0.8213, "step": 3540 }, { "epoch": 0.5815882401248255, "grad_norm": 2.131586061682955, "learning_rate": 7.85934444873988e-06, "loss": 0.7498, "step": 3541 }, { "epoch": 0.5817524841915086, "grad_norm": 1.5843841676052517, "learning_rate": 7.854147853842672e-06, "loss": 0.8035, "step": 3542 }, { "epoch": 0.5819167282581916, "grad_norm": 0.690551490211987, "learning_rate": 7.848951866324402e-06, "loss": 0.2906, "step": 3543 }, { "epoch": 0.5820809723248748, "grad_norm": 1.638695166650634, "learning_rate": 7.84375648765578e-06, "loss": 0.834, "step": 3544 }, { "epoch": 0.5822452163915579, "grad_norm": 2.134893750576716, "learning_rate": 7.838561719307346e-06, "loss": 0.8111, "step": 3545 }, { "epoch": 0.5824094604582409, "grad_norm": 1.505773900937475, "learning_rate": 7.833367562749473e-06, "loss": 0.9188, "step": 3546 }, { "epoch": 0.5825737045249241, "grad_norm": 1.399088174992093, "learning_rate": 7.828174019452357e-06, "loss": 0.8138, "step": 3547 }, { "epoch": 0.5827379485916071, "grad_norm": 1.4693893490055758, "learning_rate": 7.822981090886011e-06, "loss": 0.7987, "step": 3548 }, { "epoch": 0.5829021926582902, "grad_norm": 1.5914354077398971, "learning_rate": 7.817788778520288e-06, "loss": 0.7827, "step": 3549 }, { "epoch": 0.5830664367249733, "grad_norm": 1.8801747290727193, "learning_rate": 7.81259708382486e-06, "loss": 0.7735, "step": 3550 }, { "epoch": 0.5832306807916564, "grad_norm": 1.942859449615414, "learning_rate": 7.807406008269224e-06, "loss": 0.8069, "step": 3551 }, { "epoch": 0.5833949248583395, "grad_norm": 1.7821694110257962, "learning_rate": 7.802215553322703e-06, "loss": 0.741, "step": 3552 }, { "epoch": 0.5835591689250226, "grad_norm": 2.1449966749409644, "learning_rate": 7.79702572045445e-06, "loss": 0.8162, "step": 3553 }, { "epoch": 0.5837234129917057, "grad_norm": 1.627412001569598, "learning_rate": 7.791836511133429e-06, "loss": 0.6942, "step": 3554 }, { "epoch": 0.5838876570583887, "grad_norm": 0.7037001693303386, "learning_rate": 7.78664792682844e-06, "loss": 0.3366, "step": 3555 }, { "epoch": 0.5840519011250719, "grad_norm": 2.144224940893413, "learning_rate": 7.781459969008098e-06, "loss": 0.7223, "step": 3556 }, { "epoch": 0.5842161451917549, "grad_norm": 1.6580269709205708, "learning_rate": 7.776272639140845e-06, "loss": 0.8664, "step": 3557 }, { "epoch": 0.584380389258438, "grad_norm": 1.6912947482211025, "learning_rate": 7.771085938694943e-06, "loss": 0.74, "step": 3558 }, { "epoch": 0.5845446333251211, "grad_norm": 1.6471544028815572, "learning_rate": 7.765899869138478e-06, "loss": 0.7453, "step": 3559 }, { "epoch": 0.5847088773918042, "grad_norm": 1.9024188650207277, "learning_rate": 7.760714431939354e-06, "loss": 0.7358, "step": 3560 }, { "epoch": 0.5848731214584874, "grad_norm": 1.9060366299167868, "learning_rate": 7.755529628565298e-06, "loss": 0.6872, "step": 3561 }, { "epoch": 0.5850373655251704, "grad_norm": 1.7890753773869608, "learning_rate": 7.750345460483859e-06, "loss": 0.6784, "step": 3562 }, { "epoch": 0.5852016095918535, "grad_norm": 1.6576830500035191, "learning_rate": 7.745161929162405e-06, "loss": 0.8248, "step": 3563 }, { "epoch": 0.5853658536585366, "grad_norm": 0.6089749936812776, "learning_rate": 7.739979036068125e-06, "loss": 0.3234, "step": 3564 }, { "epoch": 0.5855300977252197, "grad_norm": 1.7522970780523552, "learning_rate": 7.734796782668021e-06, "loss": 0.7086, "step": 3565 }, { "epoch": 0.5856943417919027, "grad_norm": 1.6420608625415594, "learning_rate": 7.729615170428923e-06, "loss": 0.7915, "step": 3566 }, { "epoch": 0.5858585858585859, "grad_norm": 2.2199659992935397, "learning_rate": 7.724434200817473e-06, "loss": 0.7361, "step": 3567 }, { "epoch": 0.586022829925269, "grad_norm": 1.386544631078071, "learning_rate": 7.719253875300138e-06, "loss": 0.8272, "step": 3568 }, { "epoch": 0.586187073991952, "grad_norm": 1.9899354319926283, "learning_rate": 7.7140741953432e-06, "loss": 0.8176, "step": 3569 }, { "epoch": 0.5863513180586352, "grad_norm": 0.627103910072485, "learning_rate": 7.708895162412745e-06, "loss": 0.3069, "step": 3570 }, { "epoch": 0.5865155621253182, "grad_norm": 1.9954436824351534, "learning_rate": 7.703716777974694e-06, "loss": 0.7924, "step": 3571 }, { "epoch": 0.5866798061920013, "grad_norm": 1.723175994880857, "learning_rate": 7.69853904349478e-06, "loss": 0.7517, "step": 3572 }, { "epoch": 0.5868440502586844, "grad_norm": 2.7639336982838927, "learning_rate": 7.693361960438548e-06, "loss": 0.778, "step": 3573 }, { "epoch": 0.5870082943253675, "grad_norm": 0.6438017449502532, "learning_rate": 7.688185530271359e-06, "loss": 0.3144, "step": 3574 }, { "epoch": 0.5871725383920506, "grad_norm": 0.6146758703936999, "learning_rate": 7.683009754458394e-06, "loss": 0.3515, "step": 3575 }, { "epoch": 0.5873367824587337, "grad_norm": 3.132146691307184, "learning_rate": 7.67783463446464e-06, "loss": 0.7262, "step": 3576 }, { "epoch": 0.5875010265254168, "grad_norm": 1.738622939217457, "learning_rate": 7.67266017175491e-06, "loss": 0.7622, "step": 3577 }, { "epoch": 0.5876652705920998, "grad_norm": 3.446871329120506, "learning_rate": 7.667486367793822e-06, "loss": 0.759, "step": 3578 }, { "epoch": 0.587829514658783, "grad_norm": 1.5652132703157435, "learning_rate": 7.66231322404581e-06, "loss": 0.7876, "step": 3579 }, { "epoch": 0.587993758725466, "grad_norm": 1.7769054516176128, "learning_rate": 7.657140741975121e-06, "loss": 0.7058, "step": 3580 }, { "epoch": 0.5881580027921491, "grad_norm": 2.127369635849584, "learning_rate": 7.651968923045817e-06, "loss": 0.739, "step": 3581 }, { "epoch": 0.5883222468588323, "grad_norm": 0.5897451840695359, "learning_rate": 7.64679776872177e-06, "loss": 0.3217, "step": 3582 }, { "epoch": 0.5884864909255153, "grad_norm": 2.9176819834125287, "learning_rate": 7.641627280466663e-06, "loss": 0.7672, "step": 3583 }, { "epoch": 0.5886507349921984, "grad_norm": 1.694992803942208, "learning_rate": 7.636457459743993e-06, "loss": 0.7461, "step": 3584 }, { "epoch": 0.5888149790588815, "grad_norm": 1.4648646468713227, "learning_rate": 7.631288308017068e-06, "loss": 0.7088, "step": 3585 }, { "epoch": 0.5889792231255646, "grad_norm": 1.6707069441219666, "learning_rate": 7.626119826749002e-06, "loss": 0.7027, "step": 3586 }, { "epoch": 0.5891434671922476, "grad_norm": 2.02296972403054, "learning_rate": 7.6209520174027255e-06, "loss": 0.8313, "step": 3587 }, { "epoch": 0.5893077112589308, "grad_norm": 2.0168794751267645, "learning_rate": 7.615784881440975e-06, "loss": 0.7654, "step": 3588 }, { "epoch": 0.5894719553256139, "grad_norm": 1.9110372368923896, "learning_rate": 7.610618420326299e-06, "loss": 0.7848, "step": 3589 }, { "epoch": 0.589636199392297, "grad_norm": 1.3426295943989306, "learning_rate": 7.605452635521054e-06, "loss": 0.7498, "step": 3590 }, { "epoch": 0.5898004434589801, "grad_norm": 2.058034080316875, "learning_rate": 7.6002875284874e-06, "loss": 0.793, "step": 3591 }, { "epoch": 0.5899646875256631, "grad_norm": 1.6929511090678844, "learning_rate": 7.595123100687313e-06, "loss": 0.7853, "step": 3592 }, { "epoch": 0.5901289315923463, "grad_norm": 1.884535237469642, "learning_rate": 7.589959353582574e-06, "loss": 0.759, "step": 3593 }, { "epoch": 0.5902931756590293, "grad_norm": 1.603799264648432, "learning_rate": 7.584796288634768e-06, "loss": 0.734, "step": 3594 }, { "epoch": 0.5904574197257124, "grad_norm": 1.5212938936116702, "learning_rate": 7.5796339073052915e-06, "loss": 0.8315, "step": 3595 }, { "epoch": 0.5906216637923954, "grad_norm": 1.7382599925087072, "learning_rate": 7.574472211055346e-06, "loss": 0.8297, "step": 3596 }, { "epoch": 0.5907859078590786, "grad_norm": 1.4352149147777205, "learning_rate": 7.569311201345939e-06, "loss": 0.8047, "step": 3597 }, { "epoch": 0.5909501519257617, "grad_norm": 1.8723763399225681, "learning_rate": 7.564150879637882e-06, "loss": 0.8407, "step": 3598 }, { "epoch": 0.5911143959924448, "grad_norm": 2.1571405865018405, "learning_rate": 7.558991247391792e-06, "loss": 0.7856, "step": 3599 }, { "epoch": 0.5912786400591279, "grad_norm": 1.9821907957263296, "learning_rate": 7.553832306068095e-06, "loss": 0.7817, "step": 3600 }, { "epoch": 0.5914428841258109, "grad_norm": 1.808792269678319, "learning_rate": 7.548674057127019e-06, "loss": 0.7611, "step": 3601 }, { "epoch": 0.5916071281924941, "grad_norm": 1.6715078649523851, "learning_rate": 7.543516502028594e-06, "loss": 0.8031, "step": 3602 }, { "epoch": 0.5917713722591771, "grad_norm": 1.5278422451920042, "learning_rate": 7.538359642232654e-06, "loss": 0.791, "step": 3603 }, { "epoch": 0.5919356163258602, "grad_norm": 1.5395309983945502, "learning_rate": 7.53320347919884e-06, "loss": 0.74, "step": 3604 }, { "epoch": 0.5920998603925434, "grad_norm": 1.5214916483098304, "learning_rate": 7.52804801438659e-06, "loss": 0.7945, "step": 3605 }, { "epoch": 0.5922641044592264, "grad_norm": 1.7333234115793266, "learning_rate": 7.52289324925515e-06, "loss": 0.75, "step": 3606 }, { "epoch": 0.5924283485259095, "grad_norm": 2.1765192048811226, "learning_rate": 7.517739185263564e-06, "loss": 0.6888, "step": 3607 }, { "epoch": 0.5925925925925926, "grad_norm": 1.9373000087382242, "learning_rate": 7.5125858238706785e-06, "loss": 0.765, "step": 3608 }, { "epoch": 0.5927568366592757, "grad_norm": 1.669169942216275, "learning_rate": 7.507433166535143e-06, "loss": 0.8337, "step": 3609 }, { "epoch": 0.5929210807259587, "grad_norm": 1.6162012966360921, "learning_rate": 7.5022812147154065e-06, "loss": 0.7388, "step": 3610 }, { "epoch": 0.5930853247926419, "grad_norm": 1.4836372382261074, "learning_rate": 7.497129969869718e-06, "loss": 0.7275, "step": 3611 }, { "epoch": 0.593249568859325, "grad_norm": 1.7275114303696646, "learning_rate": 7.491979433456127e-06, "loss": 0.724, "step": 3612 }, { "epoch": 0.593413812926008, "grad_norm": 1.6496404063265648, "learning_rate": 7.486829606932478e-06, "loss": 0.686, "step": 3613 }, { "epoch": 0.5935780569926912, "grad_norm": 1.7065575378285012, "learning_rate": 7.481680491756424e-06, "loss": 0.7668, "step": 3614 }, { "epoch": 0.5937423010593742, "grad_norm": 3.4179137611274095, "learning_rate": 7.476532089385407e-06, "loss": 0.7634, "step": 3615 }, { "epoch": 0.5939065451260573, "grad_norm": 0.6284830012813908, "learning_rate": 7.471384401276674e-06, "loss": 0.343, "step": 3616 }, { "epoch": 0.5940707891927404, "grad_norm": 4.401437369544412, "learning_rate": 7.466237428887265e-06, "loss": 0.7164, "step": 3617 }, { "epoch": 0.5942350332594235, "grad_norm": 1.234753995191413, "learning_rate": 7.461091173674022e-06, "loss": 0.784, "step": 3618 }, { "epoch": 0.5943992773261066, "grad_norm": 1.8581599478646835, "learning_rate": 7.455945637093581e-06, "loss": 0.7585, "step": 3619 }, { "epoch": 0.5945635213927897, "grad_norm": 1.883769730551608, "learning_rate": 7.450800820602375e-06, "loss": 0.8342, "step": 3620 }, { "epoch": 0.5947277654594728, "grad_norm": 1.7236530385652227, "learning_rate": 7.445656725656634e-06, "loss": 0.7655, "step": 3621 }, { "epoch": 0.5948920095261558, "grad_norm": 1.5960376448098288, "learning_rate": 7.440513353712381e-06, "loss": 0.8083, "step": 3622 }, { "epoch": 0.595056253592839, "grad_norm": 1.5249062638376445, "learning_rate": 7.43537070622544e-06, "loss": 0.7445, "step": 3623 }, { "epoch": 0.595220497659522, "grad_norm": 1.9548848842701148, "learning_rate": 7.430228784651426e-06, "loss": 0.802, "step": 3624 }, { "epoch": 0.5953847417262051, "grad_norm": 1.8802452300994232, "learning_rate": 7.425087590445747e-06, "loss": 0.8169, "step": 3625 }, { "epoch": 0.5955489857928883, "grad_norm": 1.7510908671982668, "learning_rate": 7.419947125063609e-06, "loss": 0.7904, "step": 3626 }, { "epoch": 0.5957132298595713, "grad_norm": 1.918476799555775, "learning_rate": 7.41480738996001e-06, "loss": 0.7471, "step": 3627 }, { "epoch": 0.5958774739262545, "grad_norm": 2.922211139180951, "learning_rate": 7.40966838658974e-06, "loss": 0.8204, "step": 3628 }, { "epoch": 0.5960417179929375, "grad_norm": 1.7289408973786935, "learning_rate": 7.4045301164073834e-06, "loss": 0.7714, "step": 3629 }, { "epoch": 0.5962059620596206, "grad_norm": 1.429722093005976, "learning_rate": 7.399392580867317e-06, "loss": 0.7602, "step": 3630 }, { "epoch": 0.5963702061263036, "grad_norm": 2.0849674404518765, "learning_rate": 7.394255781423709e-06, "loss": 0.6718, "step": 3631 }, { "epoch": 0.5965344501929868, "grad_norm": 1.7380142377456118, "learning_rate": 7.389119719530522e-06, "loss": 0.8185, "step": 3632 }, { "epoch": 0.5966986942596698, "grad_norm": 2.154295620154768, "learning_rate": 7.383984396641506e-06, "loss": 0.7755, "step": 3633 }, { "epoch": 0.596862938326353, "grad_norm": 1.4381645581178335, "learning_rate": 7.378849814210201e-06, "loss": 0.6836, "step": 3634 }, { "epoch": 0.5970271823930361, "grad_norm": 1.8449640537571692, "learning_rate": 7.373715973689941e-06, "loss": 0.7625, "step": 3635 }, { "epoch": 0.5971914264597191, "grad_norm": 1.6844965884206398, "learning_rate": 7.3685828765338495e-06, "loss": 0.6681, "step": 3636 }, { "epoch": 0.5973556705264023, "grad_norm": 0.6293679775097341, "learning_rate": 7.363450524194839e-06, "loss": 0.3325, "step": 3637 }, { "epoch": 0.5975199145930853, "grad_norm": 4.789866409754174, "learning_rate": 7.358318918125613e-06, "loss": 0.7812, "step": 3638 }, { "epoch": 0.5976841586597684, "grad_norm": 1.9997588950772456, "learning_rate": 7.353188059778657e-06, "loss": 0.7763, "step": 3639 }, { "epoch": 0.5978484027264515, "grad_norm": 0.6233420870234594, "learning_rate": 7.348057950606253e-06, "loss": 0.3407, "step": 3640 }, { "epoch": 0.5980126467931346, "grad_norm": 1.4218952073491578, "learning_rate": 7.342928592060468e-06, "loss": 0.79, "step": 3641 }, { "epoch": 0.5981768908598177, "grad_norm": 1.8311264016665234, "learning_rate": 7.337799985593152e-06, "loss": 0.8191, "step": 3642 }, { "epoch": 0.5983411349265008, "grad_norm": 1.4215454306573558, "learning_rate": 7.332672132655953e-06, "loss": 0.8074, "step": 3643 }, { "epoch": 0.5985053789931839, "grad_norm": 1.7350502234764245, "learning_rate": 7.327545034700294e-06, "loss": 0.7648, "step": 3644 }, { "epoch": 0.5986696230598669, "grad_norm": 1.5861564809752753, "learning_rate": 7.3224186931773885e-06, "loss": 0.7485, "step": 3645 }, { "epoch": 0.5988338671265501, "grad_norm": 1.575028445749271, "learning_rate": 7.317293109538239e-06, "loss": 0.6919, "step": 3646 }, { "epoch": 0.5989981111932331, "grad_norm": 1.7038826085136085, "learning_rate": 7.312168285233633e-06, "loss": 0.742, "step": 3647 }, { "epoch": 0.5991623552599162, "grad_norm": 1.955736872248312, "learning_rate": 7.307044221714139e-06, "loss": 0.8007, "step": 3648 }, { "epoch": 0.5993265993265994, "grad_norm": 2.2161418342321677, "learning_rate": 7.3019209204301115e-06, "loss": 0.7006, "step": 3649 }, { "epoch": 0.5994908433932824, "grad_norm": 9.720858086642469, "learning_rate": 7.296798382831691e-06, "loss": 0.7533, "step": 3650 }, { "epoch": 0.5996550874599655, "grad_norm": 2.8549720379589045, "learning_rate": 7.291676610368803e-06, "loss": 0.7721, "step": 3651 }, { "epoch": 0.5998193315266486, "grad_norm": 2.095636659133526, "learning_rate": 7.286555604491151e-06, "loss": 0.7336, "step": 3652 }, { "epoch": 0.5999835755933317, "grad_norm": 1.6963854915784455, "learning_rate": 7.2814353666482276e-06, "loss": 0.8199, "step": 3653 }, { "epoch": 0.6001478196600147, "grad_norm": 0.6290606025456569, "learning_rate": 7.276315898289303e-06, "loss": 0.3578, "step": 3654 }, { "epoch": 0.6003120637266979, "grad_norm": 5.370860936566808, "learning_rate": 7.271197200863438e-06, "loss": 0.7869, "step": 3655 }, { "epoch": 0.600476307793381, "grad_norm": 1.920791083626636, "learning_rate": 7.2660792758194596e-06, "loss": 0.6424, "step": 3656 }, { "epoch": 0.600640551860064, "grad_norm": 2.0670246941163533, "learning_rate": 7.260962124605993e-06, "loss": 0.7208, "step": 3657 }, { "epoch": 0.6008047959267472, "grad_norm": 1.5742962251720185, "learning_rate": 7.2558457486714316e-06, "loss": 0.7321, "step": 3658 }, { "epoch": 0.6009690399934302, "grad_norm": 1.9484262274972532, "learning_rate": 7.2507301494639605e-06, "loss": 0.6834, "step": 3659 }, { "epoch": 0.6011332840601133, "grad_norm": 5.443391677047776, "learning_rate": 7.245615328431535e-06, "loss": 0.7147, "step": 3660 }, { "epoch": 0.6012975281267964, "grad_norm": 1.6420035219249358, "learning_rate": 7.240501287021897e-06, "loss": 0.7795, "step": 3661 }, { "epoch": 0.6014617721934795, "grad_norm": 2.01992750414161, "learning_rate": 7.2353880266825635e-06, "loss": 0.8221, "step": 3662 }, { "epoch": 0.6016260162601627, "grad_norm": 1.4493123525500955, "learning_rate": 7.230275548860833e-06, "loss": 0.7348, "step": 3663 }, { "epoch": 0.6017902603268457, "grad_norm": 1.8309086778133743, "learning_rate": 7.225163855003781e-06, "loss": 0.6841, "step": 3664 }, { "epoch": 0.6019545043935288, "grad_norm": 1.9266553021165451, "learning_rate": 7.220052946558262e-06, "loss": 0.7806, "step": 3665 }, { "epoch": 0.6021187484602119, "grad_norm": 2.143563546968871, "learning_rate": 7.2149428249709095e-06, "loss": 0.7057, "step": 3666 }, { "epoch": 0.602282992526895, "grad_norm": 1.8866621665627374, "learning_rate": 7.209833491688131e-06, "loss": 0.7445, "step": 3667 }, { "epoch": 0.602447236593578, "grad_norm": 1.7138050773447475, "learning_rate": 7.2047249481561125e-06, "loss": 0.6867, "step": 3668 }, { "epoch": 0.6026114806602612, "grad_norm": 1.5973755034488526, "learning_rate": 7.1996171958208125e-06, "loss": 0.7514, "step": 3669 }, { "epoch": 0.6027757247269442, "grad_norm": 0.6083387050888563, "learning_rate": 7.194510236127978e-06, "loss": 0.3252, "step": 3670 }, { "epoch": 0.6029399687936273, "grad_norm": 2.1071648104245293, "learning_rate": 7.189404070523118e-06, "loss": 0.8553, "step": 3671 }, { "epoch": 0.6031042128603105, "grad_norm": 1.7740198455295129, "learning_rate": 7.184298700451524e-06, "loss": 0.8124, "step": 3672 }, { "epoch": 0.6032684569269935, "grad_norm": 1.3644819923853395, "learning_rate": 7.179194127358258e-06, "loss": 0.8267, "step": 3673 }, { "epoch": 0.6034327009936766, "grad_norm": 1.5884381591555703, "learning_rate": 7.17409035268816e-06, "loss": 0.7548, "step": 3674 }, { "epoch": 0.6035969450603597, "grad_norm": 1.524591068636676, "learning_rate": 7.168987377885843e-06, "loss": 0.8486, "step": 3675 }, { "epoch": 0.6037611891270428, "grad_norm": 1.894422083159994, "learning_rate": 7.163885204395692e-06, "loss": 0.7465, "step": 3676 }, { "epoch": 0.6039254331937258, "grad_norm": 1.7300440972731002, "learning_rate": 7.158783833661869e-06, "loss": 0.7134, "step": 3677 }, { "epoch": 0.604089677260409, "grad_norm": 1.8297153755843603, "learning_rate": 7.153683267128304e-06, "loss": 0.7334, "step": 3678 }, { "epoch": 0.6042539213270921, "grad_norm": 2.0072967306786547, "learning_rate": 7.148583506238701e-06, "loss": 0.783, "step": 3679 }, { "epoch": 0.6044181653937751, "grad_norm": 1.7400247548484735, "learning_rate": 7.143484552436537e-06, "loss": 0.7975, "step": 3680 }, { "epoch": 0.6045824094604583, "grad_norm": 1.7012400197357342, "learning_rate": 7.1383864071650635e-06, "loss": 0.7949, "step": 3681 }, { "epoch": 0.6047466535271413, "grad_norm": 1.5996386550214587, "learning_rate": 7.133289071867295e-06, "loss": 0.8343, "step": 3682 }, { "epoch": 0.6049108975938244, "grad_norm": 1.495237831777545, "learning_rate": 7.128192547986023e-06, "loss": 0.7903, "step": 3683 }, { "epoch": 0.6050751416605075, "grad_norm": 1.8622423393896583, "learning_rate": 7.1230968369638096e-06, "loss": 0.744, "step": 3684 }, { "epoch": 0.6052393857271906, "grad_norm": 1.5516708345953139, "learning_rate": 7.118001940242984e-06, "loss": 0.739, "step": 3685 }, { "epoch": 0.6054036297938737, "grad_norm": 1.7317971921073394, "learning_rate": 7.112907859265646e-06, "loss": 0.753, "step": 3686 }, { "epoch": 0.6055678738605568, "grad_norm": 0.5912801800897642, "learning_rate": 7.1078145954736655e-06, "loss": 0.3104, "step": 3687 }, { "epoch": 0.6057321179272399, "grad_norm": 1.8640725492111918, "learning_rate": 7.102722150308678e-06, "loss": 0.733, "step": 3688 }, { "epoch": 0.6058963619939229, "grad_norm": 1.53017904815472, "learning_rate": 7.097630525212091e-06, "loss": 0.7572, "step": 3689 }, { "epoch": 0.6060606060606061, "grad_norm": 2.10530512773924, "learning_rate": 7.092539721625078e-06, "loss": 0.8679, "step": 3690 }, { "epoch": 0.6062248501272891, "grad_norm": 1.9139833526566308, "learning_rate": 7.087449740988579e-06, "loss": 0.7895, "step": 3691 }, { "epoch": 0.6063890941939722, "grad_norm": 2.0179361949520933, "learning_rate": 7.082360584743302e-06, "loss": 0.811, "step": 3692 }, { "epoch": 0.6065533382606554, "grad_norm": 1.6712749077034348, "learning_rate": 7.077272254329726e-06, "loss": 0.7571, "step": 3693 }, { "epoch": 0.6067175823273384, "grad_norm": 1.6734045152119708, "learning_rate": 7.072184751188088e-06, "loss": 0.7195, "step": 3694 }, { "epoch": 0.6068818263940216, "grad_norm": 1.6461643485599493, "learning_rate": 7.067098076758398e-06, "loss": 0.7249, "step": 3695 }, { "epoch": 0.6070460704607046, "grad_norm": 1.456362178164319, "learning_rate": 7.062012232480427e-06, "loss": 0.7172, "step": 3696 }, { "epoch": 0.6072103145273877, "grad_norm": 1.6297095924268965, "learning_rate": 7.056927219793711e-06, "loss": 0.6613, "step": 3697 }, { "epoch": 0.6073745585940707, "grad_norm": 1.9712931981110569, "learning_rate": 7.051843040137558e-06, "loss": 0.8046, "step": 3698 }, { "epoch": 0.6075388026607539, "grad_norm": 1.9793638942146694, "learning_rate": 7.046759694951029e-06, "loss": 0.7243, "step": 3699 }, { "epoch": 0.607703046727437, "grad_norm": 1.5597823219883054, "learning_rate": 7.04167718567295e-06, "loss": 0.7668, "step": 3700 }, { "epoch": 0.60786729079412, "grad_norm": 1.9899418694280997, "learning_rate": 7.036595513741924e-06, "loss": 0.8088, "step": 3701 }, { "epoch": 0.6080315348608032, "grad_norm": 1.7460253606418679, "learning_rate": 7.0315146805963004e-06, "loss": 0.7544, "step": 3702 }, { "epoch": 0.6081957789274862, "grad_norm": 1.8009821494064997, "learning_rate": 7.026434687674204e-06, "loss": 0.8111, "step": 3703 }, { "epoch": 0.6083600229941694, "grad_norm": 0.6009481862200836, "learning_rate": 7.021355536413513e-06, "loss": 0.3235, "step": 3704 }, { "epoch": 0.6085242670608524, "grad_norm": 1.7816301894404152, "learning_rate": 7.016277228251871e-06, "loss": 0.7624, "step": 3705 }, { "epoch": 0.6086885111275355, "grad_norm": 3.343728205691778, "learning_rate": 7.011199764626682e-06, "loss": 0.8044, "step": 3706 }, { "epoch": 0.6088527551942186, "grad_norm": 1.8966013742893324, "learning_rate": 7.006123146975112e-06, "loss": 0.7709, "step": 3707 }, { "epoch": 0.6090169992609017, "grad_norm": 1.9945894903448156, "learning_rate": 7.001047376734087e-06, "loss": 0.8218, "step": 3708 }, { "epoch": 0.6091812433275848, "grad_norm": 1.6561435208748658, "learning_rate": 6.995972455340292e-06, "loss": 0.7824, "step": 3709 }, { "epoch": 0.6093454873942679, "grad_norm": 3.707722464171124, "learning_rate": 6.990898384230174e-06, "loss": 0.7498, "step": 3710 }, { "epoch": 0.609509731460951, "grad_norm": 0.6303376945083811, "learning_rate": 6.985825164839937e-06, "loss": 0.3258, "step": 3711 }, { "epoch": 0.609673975527634, "grad_norm": 4.018671988687553, "learning_rate": 6.980752798605547e-06, "loss": 0.7535, "step": 3712 }, { "epoch": 0.6098382195943172, "grad_norm": 1.6305601430620882, "learning_rate": 6.975681286962724e-06, "loss": 0.7551, "step": 3713 }, { "epoch": 0.6100024636610002, "grad_norm": 1.6092199105287872, "learning_rate": 6.970610631346951e-06, "loss": 0.8304, "step": 3714 }, { "epoch": 0.6101667077276833, "grad_norm": 1.7637381040713824, "learning_rate": 6.965540833193464e-06, "loss": 0.7682, "step": 3715 }, { "epoch": 0.6103309517943665, "grad_norm": 1.8560460049320882, "learning_rate": 6.9604718939372615e-06, "loss": 0.7301, "step": 3716 }, { "epoch": 0.6104951958610495, "grad_norm": 1.7663980436195013, "learning_rate": 6.9554038150130955e-06, "loss": 0.7561, "step": 3717 }, { "epoch": 0.6106594399277326, "grad_norm": 1.574399132453294, "learning_rate": 6.9503365978554735e-06, "loss": 0.7667, "step": 3718 }, { "epoch": 0.6108236839944157, "grad_norm": 2.010537335506341, "learning_rate": 6.945270243898662e-06, "loss": 0.7641, "step": 3719 }, { "epoch": 0.6109879280610988, "grad_norm": 1.9181792422419561, "learning_rate": 6.940204754576685e-06, "loss": 0.796, "step": 3720 }, { "epoch": 0.6111521721277818, "grad_norm": 5.528912597961342, "learning_rate": 6.935140131323312e-06, "loss": 0.7186, "step": 3721 }, { "epoch": 0.611316416194465, "grad_norm": 2.3289674167608467, "learning_rate": 6.930076375572077e-06, "loss": 0.7694, "step": 3722 }, { "epoch": 0.6114806602611481, "grad_norm": 1.6782233421797719, "learning_rate": 6.925013488756264e-06, "loss": 0.863, "step": 3723 }, { "epoch": 0.6116449043278311, "grad_norm": 2.447059762213744, "learning_rate": 6.919951472308912e-06, "loss": 0.8229, "step": 3724 }, { "epoch": 0.6118091483945143, "grad_norm": 2.2619359451338443, "learning_rate": 6.9148903276628175e-06, "loss": 0.774, "step": 3725 }, { "epoch": 0.6119733924611973, "grad_norm": 1.5487278278558398, "learning_rate": 6.909830056250527e-06, "loss": 0.7098, "step": 3726 }, { "epoch": 0.6121376365278804, "grad_norm": 1.9690155678821273, "learning_rate": 6.904770659504336e-06, "loss": 0.8446, "step": 3727 }, { "epoch": 0.6123018805945635, "grad_norm": 3.6897144621145594, "learning_rate": 6.8997121388563e-06, "loss": 0.7031, "step": 3728 }, { "epoch": 0.6124661246612466, "grad_norm": 1.5207367017929232, "learning_rate": 6.89465449573822e-06, "loss": 0.7773, "step": 3729 }, { "epoch": 0.6126303687279298, "grad_norm": 1.8253395622672375, "learning_rate": 6.889597731581652e-06, "loss": 0.7519, "step": 3730 }, { "epoch": 0.6127946127946128, "grad_norm": 1.7905694509578785, "learning_rate": 6.8845418478179016e-06, "loss": 0.6611, "step": 3731 }, { "epoch": 0.6129588568612959, "grad_norm": 1.8766961478890563, "learning_rate": 6.879486845878027e-06, "loss": 0.7416, "step": 3732 }, { "epoch": 0.613123100927979, "grad_norm": 1.6877314985102818, "learning_rate": 6.874432727192837e-06, "loss": 0.6935, "step": 3733 }, { "epoch": 0.6132873449946621, "grad_norm": 1.8879341910228844, "learning_rate": 6.869379493192886e-06, "loss": 0.7073, "step": 3734 }, { "epoch": 0.6134515890613451, "grad_norm": 1.6048013866045923, "learning_rate": 6.8643271453084845e-06, "loss": 0.7544, "step": 3735 }, { "epoch": 0.6136158331280283, "grad_norm": 1.7122832756761854, "learning_rate": 6.859275684969686e-06, "loss": 0.7656, "step": 3736 }, { "epoch": 0.6137800771947114, "grad_norm": 1.5993813149137792, "learning_rate": 6.854225113606299e-06, "loss": 0.7619, "step": 3737 }, { "epoch": 0.6139443212613944, "grad_norm": 2.3492823049297336, "learning_rate": 6.849175432647875e-06, "loss": 0.6989, "step": 3738 }, { "epoch": 0.6141085653280776, "grad_norm": 1.4097238607464675, "learning_rate": 6.844126643523714e-06, "loss": 0.8314, "step": 3739 }, { "epoch": 0.6142728093947606, "grad_norm": 0.6187364582395465, "learning_rate": 6.839078747662871e-06, "loss": 0.3258, "step": 3740 }, { "epoch": 0.6144370534614437, "grad_norm": 2.1435821850234813, "learning_rate": 6.834031746494136e-06, "loss": 0.7885, "step": 3741 }, { "epoch": 0.6146012975281268, "grad_norm": 1.394520459870188, "learning_rate": 6.8289856414460595e-06, "loss": 0.6827, "step": 3742 }, { "epoch": 0.6147655415948099, "grad_norm": 1.7173862979606327, "learning_rate": 6.823940433946921e-06, "loss": 0.7422, "step": 3743 }, { "epoch": 0.614929785661493, "grad_norm": 1.945896724961476, "learning_rate": 6.818896125424762e-06, "loss": 0.73, "step": 3744 }, { "epoch": 0.6150940297281761, "grad_norm": 1.7462220519763645, "learning_rate": 6.813852717307362e-06, "loss": 0.6982, "step": 3745 }, { "epoch": 0.6152582737948592, "grad_norm": 1.6918138223377481, "learning_rate": 6.808810211022248e-06, "loss": 0.7651, "step": 3746 }, { "epoch": 0.6154225178615422, "grad_norm": 1.6605471415989035, "learning_rate": 6.803768607996686e-06, "loss": 0.7976, "step": 3747 }, { "epoch": 0.6155867619282254, "grad_norm": 1.5085588134470176, "learning_rate": 6.798727909657698e-06, "loss": 0.7514, "step": 3748 }, { "epoch": 0.6157510059949084, "grad_norm": 1.6734753623090388, "learning_rate": 6.793688117432041e-06, "loss": 0.7818, "step": 3749 }, { "epoch": 0.6159152500615915, "grad_norm": 1.7302930272393682, "learning_rate": 6.788649232746217e-06, "loss": 0.7582, "step": 3750 }, { "epoch": 0.6160794941282746, "grad_norm": 1.9835509758521643, "learning_rate": 6.783611257026471e-06, "loss": 0.7119, "step": 3751 }, { "epoch": 0.6162437381949577, "grad_norm": 2.0347119528682325, "learning_rate": 6.778574191698793e-06, "loss": 0.7348, "step": 3752 }, { "epoch": 0.6164079822616408, "grad_norm": 1.748236204642184, "learning_rate": 6.773538038188912e-06, "loss": 0.7919, "step": 3753 }, { "epoch": 0.6165722263283239, "grad_norm": 1.7342684440283103, "learning_rate": 6.768502797922301e-06, "loss": 0.7871, "step": 3754 }, { "epoch": 0.616736470395007, "grad_norm": 1.667959467863352, "learning_rate": 6.763468472324175e-06, "loss": 0.8138, "step": 3755 }, { "epoch": 0.61690071446169, "grad_norm": 1.948899771505052, "learning_rate": 6.758435062819488e-06, "loss": 0.7185, "step": 3756 }, { "epoch": 0.6170649585283732, "grad_norm": 1.7180859609356518, "learning_rate": 6.7534025708329385e-06, "loss": 0.7228, "step": 3757 }, { "epoch": 0.6172292025950562, "grad_norm": 1.6201753030588557, "learning_rate": 6.74837099778896e-06, "loss": 0.7925, "step": 3758 }, { "epoch": 0.6173934466617393, "grad_norm": 1.6226587781580841, "learning_rate": 6.743340345111731e-06, "loss": 0.7948, "step": 3759 }, { "epoch": 0.6175576907284225, "grad_norm": 1.4003088078404722, "learning_rate": 6.738310614225164e-06, "loss": 0.7648, "step": 3760 }, { "epoch": 0.6177219347951055, "grad_norm": 2.2617426774911955, "learning_rate": 6.733281806552917e-06, "loss": 0.766, "step": 3761 }, { "epoch": 0.6178861788617886, "grad_norm": 2.033001556364395, "learning_rate": 6.728253923518379e-06, "loss": 0.7599, "step": 3762 }, { "epoch": 0.6180504229284717, "grad_norm": 2.1932110447109565, "learning_rate": 6.723226966544691e-06, "loss": 0.7321, "step": 3763 }, { "epoch": 0.6182146669951548, "grad_norm": 2.027356958577647, "learning_rate": 6.718200937054714e-06, "loss": 0.7462, "step": 3764 }, { "epoch": 0.6183789110618378, "grad_norm": 1.417314468128901, "learning_rate": 6.713175836471057e-06, "loss": 0.7937, "step": 3765 }, { "epoch": 0.618543155128521, "grad_norm": 1.6607156746963718, "learning_rate": 6.708151666216063e-06, "loss": 0.6726, "step": 3766 }, { "epoch": 0.6187073991952041, "grad_norm": 1.5323539392724204, "learning_rate": 6.703128427711816e-06, "loss": 0.8225, "step": 3767 }, { "epoch": 0.6188716432618871, "grad_norm": 1.605105077724122, "learning_rate": 6.69810612238013e-06, "loss": 0.8106, "step": 3768 }, { "epoch": 0.6190358873285703, "grad_norm": 1.7805379750769044, "learning_rate": 6.6930847516425615e-06, "loss": 0.7322, "step": 3769 }, { "epoch": 0.6192001313952533, "grad_norm": 2.703306777257313, "learning_rate": 6.688064316920393e-06, "loss": 0.7905, "step": 3770 }, { "epoch": 0.6193643754619365, "grad_norm": 1.756129257010095, "learning_rate": 6.683044819634654e-06, "loss": 0.7169, "step": 3771 }, { "epoch": 0.6195286195286195, "grad_norm": 1.5380931410912275, "learning_rate": 6.678026261206102e-06, "loss": 0.6994, "step": 3772 }, { "epoch": 0.6196928635953026, "grad_norm": 1.8050002597400245, "learning_rate": 6.673008643055228e-06, "loss": 0.7461, "step": 3773 }, { "epoch": 0.6198571076619858, "grad_norm": 1.620313038798409, "learning_rate": 6.667991966602257e-06, "loss": 0.7727, "step": 3774 }, { "epoch": 0.6200213517286688, "grad_norm": 1.40980914200872, "learning_rate": 6.66297623326715e-06, "loss": 0.6894, "step": 3775 }, { "epoch": 0.6201855957953519, "grad_norm": 2.1138398091956256, "learning_rate": 6.657961444469601e-06, "loss": 0.8267, "step": 3776 }, { "epoch": 0.620349839862035, "grad_norm": 1.5249804969494256, "learning_rate": 6.652947601629032e-06, "loss": 0.8198, "step": 3777 }, { "epoch": 0.6205140839287181, "grad_norm": 3.436731319614736, "learning_rate": 6.6479347061646046e-06, "loss": 0.8441, "step": 3778 }, { "epoch": 0.6206783279954011, "grad_norm": 1.6887554312997872, "learning_rate": 6.642922759495205e-06, "loss": 0.7897, "step": 3779 }, { "epoch": 0.6208425720620843, "grad_norm": 2.243108684313133, "learning_rate": 6.637911763039457e-06, "loss": 0.788, "step": 3780 }, { "epoch": 0.6210068161287674, "grad_norm": 4.450958079519282, "learning_rate": 6.632901718215711e-06, "loss": 0.8266, "step": 3781 }, { "epoch": 0.6211710601954504, "grad_norm": 1.3807194432538417, "learning_rate": 6.627892626442049e-06, "loss": 0.8174, "step": 3782 }, { "epoch": 0.6213353042621336, "grad_norm": 1.8396175300980806, "learning_rate": 6.622884489136286e-06, "loss": 0.7217, "step": 3783 }, { "epoch": 0.6214995483288166, "grad_norm": 1.705048196147358, "learning_rate": 6.617877307715963e-06, "loss": 0.8276, "step": 3784 }, { "epoch": 0.6216637923954997, "grad_norm": 0.6230776336248589, "learning_rate": 6.612871083598354e-06, "loss": 0.3413, "step": 3785 }, { "epoch": 0.6218280364621828, "grad_norm": 1.5207854650983654, "learning_rate": 6.607865818200458e-06, "loss": 0.8131, "step": 3786 }, { "epoch": 0.6219922805288659, "grad_norm": 2.1815729274709637, "learning_rate": 6.602861512939005e-06, "loss": 0.7089, "step": 3787 }, { "epoch": 0.6221565245955489, "grad_norm": 1.5420669014041164, "learning_rate": 6.597858169230454e-06, "loss": 0.7693, "step": 3788 }, { "epoch": 0.6223207686622321, "grad_norm": 1.696619251858423, "learning_rate": 6.592855788490991e-06, "loss": 0.7929, "step": 3789 }, { "epoch": 0.6224850127289152, "grad_norm": 1.9259428146572795, "learning_rate": 6.587854372136529e-06, "loss": 0.8215, "step": 3790 }, { "epoch": 0.6226492567955982, "grad_norm": 1.6755726099068695, "learning_rate": 6.582853921582708e-06, "loss": 0.7887, "step": 3791 }, { "epoch": 0.6228135008622814, "grad_norm": 1.8529810880095225, "learning_rate": 6.577854438244897e-06, "loss": 0.7844, "step": 3792 }, { "epoch": 0.6229777449289644, "grad_norm": 1.560612272798695, "learning_rate": 6.572855923538186e-06, "loss": 0.7523, "step": 3793 }, { "epoch": 0.6231419889956475, "grad_norm": 2.211335970791184, "learning_rate": 6.567858378877394e-06, "loss": 0.7175, "step": 3794 }, { "epoch": 0.6233062330623306, "grad_norm": 0.6338975486276296, "learning_rate": 6.5628618056770696e-06, "loss": 0.3055, "step": 3795 }, { "epoch": 0.6234704771290137, "grad_norm": 1.6016067210946865, "learning_rate": 6.557866205351479e-06, "loss": 0.7652, "step": 3796 }, { "epoch": 0.6236347211956969, "grad_norm": 1.6735108192563484, "learning_rate": 6.552871579314619e-06, "loss": 0.7783, "step": 3797 }, { "epoch": 0.6237989652623799, "grad_norm": 2.030778076510876, "learning_rate": 6.547877928980206e-06, "loss": 0.789, "step": 3798 }, { "epoch": 0.623963209329063, "grad_norm": 1.8296065874202234, "learning_rate": 6.542885255761682e-06, "loss": 0.7455, "step": 3799 }, { "epoch": 0.624127453395746, "grad_norm": 2.299796751162671, "learning_rate": 6.537893561072214e-06, "loss": 0.7947, "step": 3800 }, { "epoch": 0.6242916974624292, "grad_norm": 1.3238076715898541, "learning_rate": 6.532902846324689e-06, "loss": 0.7112, "step": 3801 }, { "epoch": 0.6244559415291122, "grad_norm": 1.7781734045520996, "learning_rate": 6.52791311293172e-06, "loss": 0.7107, "step": 3802 }, { "epoch": 0.6246201855957954, "grad_norm": 0.6175507007369399, "learning_rate": 6.522924362305639e-06, "loss": 0.3294, "step": 3803 }, { "epoch": 0.6247844296624785, "grad_norm": 1.6394512410390776, "learning_rate": 6.517936595858503e-06, "loss": 0.7813, "step": 3804 }, { "epoch": 0.6249486737291615, "grad_norm": 0.5745435534430914, "learning_rate": 6.512949815002088e-06, "loss": 0.3343, "step": 3805 }, { "epoch": 0.6251129177958447, "grad_norm": 1.5031263539172783, "learning_rate": 6.50796402114789e-06, "loss": 0.7694, "step": 3806 }, { "epoch": 0.6252771618625277, "grad_norm": 1.5016831857812036, "learning_rate": 6.502979215707133e-06, "loss": 0.7607, "step": 3807 }, { "epoch": 0.6254414059292108, "grad_norm": 1.4570058453967867, "learning_rate": 6.497995400090748e-06, "loss": 0.7952, "step": 3808 }, { "epoch": 0.6256056499958939, "grad_norm": 1.5786969519902352, "learning_rate": 6.4930125757094e-06, "loss": 0.6969, "step": 3809 }, { "epoch": 0.625769894062577, "grad_norm": 2.5188753235284778, "learning_rate": 6.488030743973463e-06, "loss": 0.7878, "step": 3810 }, { "epoch": 0.6259341381292601, "grad_norm": 1.4189953052614535, "learning_rate": 6.483049906293035e-06, "loss": 0.7686, "step": 3811 }, { "epoch": 0.6260983821959432, "grad_norm": 2.45690977404878, "learning_rate": 6.478070064077933e-06, "loss": 0.8436, "step": 3812 }, { "epoch": 0.6262626262626263, "grad_norm": 1.8406438066509525, "learning_rate": 6.4730912187376895e-06, "loss": 0.762, "step": 3813 }, { "epoch": 0.6264268703293093, "grad_norm": 1.3067721395785852, "learning_rate": 6.468113371681557e-06, "loss": 0.714, "step": 3814 }, { "epoch": 0.6265911143959925, "grad_norm": 1.5734890376023973, "learning_rate": 6.463136524318503e-06, "loss": 0.7669, "step": 3815 }, { "epoch": 0.6267553584626755, "grad_norm": 1.7052123198895832, "learning_rate": 6.4581606780572155e-06, "loss": 0.7606, "step": 3816 }, { "epoch": 0.6269196025293586, "grad_norm": 1.4454105726955642, "learning_rate": 6.453185834306095e-06, "loss": 0.6861, "step": 3817 }, { "epoch": 0.6270838465960418, "grad_norm": 1.552849476996544, "learning_rate": 6.448211994473263e-06, "loss": 0.701, "step": 3818 }, { "epoch": 0.6272480906627248, "grad_norm": 1.7405542271776688, "learning_rate": 6.443239159966556e-06, "loss": 0.7664, "step": 3819 }, { "epoch": 0.6274123347294079, "grad_norm": 1.4023315945345793, "learning_rate": 6.438267332193519e-06, "loss": 0.691, "step": 3820 }, { "epoch": 0.627576578796091, "grad_norm": 2.081661244178782, "learning_rate": 6.4332965125614235e-06, "loss": 0.8289, "step": 3821 }, { "epoch": 0.6277408228627741, "grad_norm": 5.239883221022357, "learning_rate": 6.428326702477246e-06, "loss": 0.7137, "step": 3822 }, { "epoch": 0.6279050669294571, "grad_norm": 2.2579003275804377, "learning_rate": 6.42335790334768e-06, "loss": 0.8092, "step": 3823 }, { "epoch": 0.6280693109961403, "grad_norm": 2.394592369652558, "learning_rate": 6.418390116579134e-06, "loss": 0.7712, "step": 3824 }, { "epoch": 0.6282335550628233, "grad_norm": 1.634917213262364, "learning_rate": 6.4134233435777315e-06, "loss": 0.8258, "step": 3825 }, { "epoch": 0.6283977991295064, "grad_norm": 2.0453811726096256, "learning_rate": 6.408457585749307e-06, "loss": 0.7833, "step": 3826 }, { "epoch": 0.6285620431961896, "grad_norm": 0.6455936252241974, "learning_rate": 6.403492844499406e-06, "loss": 0.3344, "step": 3827 }, { "epoch": 0.6287262872628726, "grad_norm": 1.5680476197889581, "learning_rate": 6.398529121233291e-06, "loss": 0.8203, "step": 3828 }, { "epoch": 0.6288905313295557, "grad_norm": 1.8941198223711755, "learning_rate": 6.39356641735593e-06, "loss": 0.7303, "step": 3829 }, { "epoch": 0.6290547753962388, "grad_norm": 1.8930435332338225, "learning_rate": 6.388604734272006e-06, "loss": 0.8176, "step": 3830 }, { "epoch": 0.6292190194629219, "grad_norm": 1.4612124856612854, "learning_rate": 6.383644073385915e-06, "loss": 0.7657, "step": 3831 }, { "epoch": 0.6293832635296049, "grad_norm": 2.5260953226303617, "learning_rate": 6.378684436101761e-06, "loss": 0.7956, "step": 3832 }, { "epoch": 0.6295475075962881, "grad_norm": 1.6435796448522966, "learning_rate": 6.373725823823359e-06, "loss": 0.7561, "step": 3833 }, { "epoch": 0.6297117516629712, "grad_norm": 2.2565727995997786, "learning_rate": 6.368768237954234e-06, "loss": 0.8128, "step": 3834 }, { "epoch": 0.6298759957296542, "grad_norm": 1.5294446942014266, "learning_rate": 6.363811679897618e-06, "loss": 0.7425, "step": 3835 }, { "epoch": 0.6300402397963374, "grad_norm": 1.7479667641988743, "learning_rate": 6.358856151056458e-06, "loss": 0.7488, "step": 3836 }, { "epoch": 0.6302044838630204, "grad_norm": 1.9504095377594706, "learning_rate": 6.353901652833403e-06, "loss": 0.7685, "step": 3837 }, { "epoch": 0.6303687279297036, "grad_norm": 1.8194076345820163, "learning_rate": 6.348948186630815e-06, "loss": 0.7244, "step": 3838 }, { "epoch": 0.6305329719963866, "grad_norm": 2.0121220393738164, "learning_rate": 6.343995753850762e-06, "loss": 0.7924, "step": 3839 }, { "epoch": 0.6306972160630697, "grad_norm": 1.9941822713832063, "learning_rate": 6.339044355895016e-06, "loss": 0.7816, "step": 3840 }, { "epoch": 0.6308614601297529, "grad_norm": 3.4017675603392323, "learning_rate": 6.334093994165067e-06, "loss": 0.8286, "step": 3841 }, { "epoch": 0.6310257041964359, "grad_norm": 1.716602421531439, "learning_rate": 6.3291446700621e-06, "loss": 0.7872, "step": 3842 }, { "epoch": 0.631189948263119, "grad_norm": 1.8019072139191836, "learning_rate": 6.324196384987009e-06, "loss": 0.7432, "step": 3843 }, { "epoch": 0.631354192329802, "grad_norm": 1.6236206636650536, "learning_rate": 6.3192491403404e-06, "loss": 0.771, "step": 3844 }, { "epoch": 0.6315184363964852, "grad_norm": 1.435440209868319, "learning_rate": 6.3143029375225785e-06, "loss": 0.7645, "step": 3845 }, { "epoch": 0.6316826804631682, "grad_norm": 1.7897732838379623, "learning_rate": 6.309357777933555e-06, "loss": 0.8097, "step": 3846 }, { "epoch": 0.6318469245298514, "grad_norm": 1.6049033106770152, "learning_rate": 6.30441366297305e-06, "loss": 0.7289, "step": 3847 }, { "epoch": 0.6320111685965345, "grad_norm": 0.5985869126651875, "learning_rate": 6.2994705940404825e-06, "loss": 0.3519, "step": 3848 }, { "epoch": 0.6321754126632175, "grad_norm": 1.4871817313907634, "learning_rate": 6.294528572534977e-06, "loss": 0.7665, "step": 3849 }, { "epoch": 0.6323396567299007, "grad_norm": 1.808606585819238, "learning_rate": 6.289587599855367e-06, "loss": 0.7309, "step": 3850 }, { "epoch": 0.6325039007965837, "grad_norm": 1.5614847205884754, "learning_rate": 6.284647677400177e-06, "loss": 0.7143, "step": 3851 }, { "epoch": 0.6326681448632668, "grad_norm": 1.8722407771768745, "learning_rate": 6.279708806567646e-06, "loss": 0.6281, "step": 3852 }, { "epoch": 0.6328323889299499, "grad_norm": 2.0576426337723768, "learning_rate": 6.274770988755712e-06, "loss": 0.7438, "step": 3853 }, { "epoch": 0.632996632996633, "grad_norm": 1.7460426110207976, "learning_rate": 6.2698342253620105e-06, "loss": 0.7562, "step": 3854 }, { "epoch": 0.6331608770633161, "grad_norm": 1.6910227768111523, "learning_rate": 6.264898517783885e-06, "loss": 0.7412, "step": 3855 }, { "epoch": 0.6333251211299992, "grad_norm": 2.234413318425567, "learning_rate": 6.259963867418375e-06, "loss": 0.6948, "step": 3856 }, { "epoch": 0.6334893651966823, "grad_norm": 1.509872271181271, "learning_rate": 6.255030275662226e-06, "loss": 0.7297, "step": 3857 }, { "epoch": 0.6336536092633653, "grad_norm": 1.9928476897466891, "learning_rate": 6.250097743911877e-06, "loss": 0.73, "step": 3858 }, { "epoch": 0.6338178533300485, "grad_norm": 1.5718292558120954, "learning_rate": 6.245166273563473e-06, "loss": 0.7567, "step": 3859 }, { "epoch": 0.6339820973967315, "grad_norm": 0.6178392493261509, "learning_rate": 6.240235866012856e-06, "loss": 0.349, "step": 3860 }, { "epoch": 0.6341463414634146, "grad_norm": 1.9014533418532245, "learning_rate": 6.235306522655566e-06, "loss": 0.7035, "step": 3861 }, { "epoch": 0.6343105855300977, "grad_norm": 1.6467683783849165, "learning_rate": 6.230378244886847e-06, "loss": 0.7514, "step": 3862 }, { "epoch": 0.6344748295967808, "grad_norm": 1.6857832975477336, "learning_rate": 6.225451034101631e-06, "loss": 0.6957, "step": 3863 }, { "epoch": 0.634639073663464, "grad_norm": 1.5144020723645157, "learning_rate": 6.220524891694562e-06, "loss": 0.7455, "step": 3864 }, { "epoch": 0.634803317730147, "grad_norm": 1.580514040844306, "learning_rate": 6.2155998190599705e-06, "loss": 0.7574, "step": 3865 }, { "epoch": 0.6349675617968301, "grad_norm": 2.2093455882456787, "learning_rate": 6.210675817591889e-06, "loss": 0.7786, "step": 3866 }, { "epoch": 0.6351318058635131, "grad_norm": 2.4284549569620233, "learning_rate": 6.2057528886840445e-06, "loss": 0.6884, "step": 3867 }, { "epoch": 0.6352960499301963, "grad_norm": 1.8304849518751596, "learning_rate": 6.200831033729864e-06, "loss": 0.7234, "step": 3868 }, { "epoch": 0.6354602939968793, "grad_norm": 1.791631049385766, "learning_rate": 6.195910254122466e-06, "loss": 0.7628, "step": 3869 }, { "epoch": 0.6356245380635624, "grad_norm": 1.7420475040287877, "learning_rate": 6.190990551254668e-06, "loss": 0.8215, "step": 3870 }, { "epoch": 0.6357887821302456, "grad_norm": 1.6415903319292704, "learning_rate": 6.186071926518984e-06, "loss": 0.7372, "step": 3871 }, { "epoch": 0.6359530261969286, "grad_norm": 1.6082316401192078, "learning_rate": 6.18115438130761e-06, "loss": 0.7234, "step": 3872 }, { "epoch": 0.6361172702636118, "grad_norm": 2.038240297704765, "learning_rate": 6.176237917012459e-06, "loss": 0.7809, "step": 3873 }, { "epoch": 0.6362815143302948, "grad_norm": 1.5896495135848245, "learning_rate": 6.171322535025119e-06, "loss": 0.7327, "step": 3874 }, { "epoch": 0.6364457583969779, "grad_norm": 1.6265651285904925, "learning_rate": 6.166408236736883e-06, "loss": 0.7721, "step": 3875 }, { "epoch": 0.636610002463661, "grad_norm": 1.5026015961452381, "learning_rate": 6.161495023538729e-06, "loss": 0.7927, "step": 3876 }, { "epoch": 0.6367742465303441, "grad_norm": 1.6905190100314778, "learning_rate": 6.1565828968213325e-06, "loss": 0.8089, "step": 3877 }, { "epoch": 0.6369384905970272, "grad_norm": 2.0215052886566207, "learning_rate": 6.151671857975061e-06, "loss": 0.7133, "step": 3878 }, { "epoch": 0.6371027346637103, "grad_norm": 1.6352138510406682, "learning_rate": 6.146761908389975e-06, "loss": 0.6905, "step": 3879 }, { "epoch": 0.6372669787303934, "grad_norm": 1.9646939902140015, "learning_rate": 6.141853049455824e-06, "loss": 0.7689, "step": 3880 }, { "epoch": 0.6374312227970764, "grad_norm": 1.7402061886412288, "learning_rate": 6.1369452825620515e-06, "loss": 0.8132, "step": 3881 }, { "epoch": 0.6375954668637596, "grad_norm": 1.7368144845720128, "learning_rate": 6.132038609097788e-06, "loss": 0.7771, "step": 3882 }, { "epoch": 0.6377597109304426, "grad_norm": 1.5268925803155735, "learning_rate": 6.12713303045186e-06, "loss": 0.7552, "step": 3883 }, { "epoch": 0.6379239549971257, "grad_norm": 1.7770201743186698, "learning_rate": 6.1222285480127786e-06, "loss": 0.7402, "step": 3884 }, { "epoch": 0.6380881990638089, "grad_norm": 1.6838016280651735, "learning_rate": 6.11732516316875e-06, "loss": 0.7839, "step": 3885 }, { "epoch": 0.6382524431304919, "grad_norm": 1.5429249744786588, "learning_rate": 6.112422877307664e-06, "loss": 0.8282, "step": 3886 }, { "epoch": 0.638416687197175, "grad_norm": 2.2142430709516447, "learning_rate": 6.107521691817104e-06, "loss": 0.7965, "step": 3887 }, { "epoch": 0.6385809312638581, "grad_norm": 1.5295472309229952, "learning_rate": 6.10262160808434e-06, "loss": 0.7378, "step": 3888 }, { "epoch": 0.6387451753305412, "grad_norm": 0.6460722144273259, "learning_rate": 6.097722627496332e-06, "loss": 0.2947, "step": 3889 }, { "epoch": 0.6389094193972242, "grad_norm": 1.3828693167911639, "learning_rate": 6.092824751439723e-06, "loss": 0.7377, "step": 3890 }, { "epoch": 0.6390736634639074, "grad_norm": 1.888586887963636, "learning_rate": 6.0879279813008495e-06, "loss": 0.7861, "step": 3891 }, { "epoch": 0.6392379075305905, "grad_norm": 1.7800210028989618, "learning_rate": 6.083032318465731e-06, "loss": 0.7585, "step": 3892 }, { "epoch": 0.6394021515972735, "grad_norm": 1.9698082735644027, "learning_rate": 6.0781377643200765e-06, "loss": 0.8371, "step": 3893 }, { "epoch": 0.6395663956639567, "grad_norm": 2.791012062794417, "learning_rate": 6.073244320249274e-06, "loss": 0.7491, "step": 3894 }, { "epoch": 0.6397306397306397, "grad_norm": 1.7453190706519741, "learning_rate": 6.0683519876384034e-06, "loss": 0.762, "step": 3895 }, { "epoch": 0.6398948837973228, "grad_norm": 2.0486644390991184, "learning_rate": 6.063460767872233e-06, "loss": 0.6996, "step": 3896 }, { "epoch": 0.6400591278640059, "grad_norm": 2.046815407551837, "learning_rate": 6.05857066233521e-06, "loss": 0.7197, "step": 3897 }, { "epoch": 0.640223371930689, "grad_norm": 1.6738192663847031, "learning_rate": 6.053681672411471e-06, "loss": 0.6622, "step": 3898 }, { "epoch": 0.640387615997372, "grad_norm": 1.8493952027997649, "learning_rate": 6.048793799484831e-06, "loss": 0.7105, "step": 3899 }, { "epoch": 0.6405518600640552, "grad_norm": 1.5922088591684471, "learning_rate": 6.0439070449387924e-06, "loss": 0.6925, "step": 3900 }, { "epoch": 0.6407161041307383, "grad_norm": 2.1434064440058442, "learning_rate": 6.039021410156542e-06, "loss": 0.767, "step": 3901 }, { "epoch": 0.6408803481974213, "grad_norm": 2.6514737315327035, "learning_rate": 6.03413689652095e-06, "loss": 0.7346, "step": 3902 }, { "epoch": 0.6410445922641045, "grad_norm": 1.3830524215019697, "learning_rate": 6.029253505414565e-06, "loss": 0.6931, "step": 3903 }, { "epoch": 0.6412088363307875, "grad_norm": 3.382880141581397, "learning_rate": 6.024371238219622e-06, "loss": 0.7778, "step": 3904 }, { "epoch": 0.6413730803974707, "grad_norm": 1.644074587406243, "learning_rate": 6.019490096318036e-06, "loss": 0.6859, "step": 3905 }, { "epoch": 0.6415373244641537, "grad_norm": 1.9207651221165707, "learning_rate": 6.014610081091403e-06, "loss": 0.7741, "step": 3906 }, { "epoch": 0.6417015685308368, "grad_norm": 4.009177054765378, "learning_rate": 6.009731193921002e-06, "loss": 0.7619, "step": 3907 }, { "epoch": 0.64186581259752, "grad_norm": 1.5091389116536345, "learning_rate": 6.004853436187794e-06, "loss": 0.761, "step": 3908 }, { "epoch": 0.642030056664203, "grad_norm": 1.9413619028130569, "learning_rate": 5.9999768092724145e-06, "loss": 0.663, "step": 3909 }, { "epoch": 0.6421943007308861, "grad_norm": 1.8353439869383965, "learning_rate": 5.995101314555181e-06, "loss": 0.6969, "step": 3910 }, { "epoch": 0.6423585447975692, "grad_norm": 1.716240394453561, "learning_rate": 5.990226953416099e-06, "loss": 0.7949, "step": 3911 }, { "epoch": 0.6425227888642523, "grad_norm": 1.71872523193612, "learning_rate": 5.98535372723484e-06, "loss": 0.7657, "step": 3912 }, { "epoch": 0.6426870329309353, "grad_norm": 1.4253981023934152, "learning_rate": 5.9804816373907625e-06, "loss": 0.7549, "step": 3913 }, { "epoch": 0.6428512769976185, "grad_norm": 2.2288944143803606, "learning_rate": 5.975610685262902e-06, "loss": 0.744, "step": 3914 }, { "epoch": 0.6430155210643016, "grad_norm": 1.5894353189659387, "learning_rate": 5.970740872229974e-06, "loss": 0.7777, "step": 3915 }, { "epoch": 0.6431797651309846, "grad_norm": 2.192860958131969, "learning_rate": 5.965872199670362e-06, "loss": 0.7468, "step": 3916 }, { "epoch": 0.6433440091976678, "grad_norm": 1.7302792546070147, "learning_rate": 5.961004668962136e-06, "loss": 0.8244, "step": 3917 }, { "epoch": 0.6435082532643508, "grad_norm": 1.5919629009093033, "learning_rate": 5.956138281483039e-06, "loss": 0.7771, "step": 3918 }, { "epoch": 0.6436724973310339, "grad_norm": 1.5968027801467293, "learning_rate": 5.951273038610496e-06, "loss": 0.744, "step": 3919 }, { "epoch": 0.643836741397717, "grad_norm": 0.6606638541209677, "learning_rate": 5.946408941721602e-06, "loss": 0.3553, "step": 3920 }, { "epoch": 0.6440009854644001, "grad_norm": 0.6232184966759605, "learning_rate": 5.941545992193129e-06, "loss": 0.3631, "step": 3921 }, { "epoch": 0.6441652295310832, "grad_norm": 2.2661844499340864, "learning_rate": 5.936684191401525e-06, "loss": 0.7643, "step": 3922 }, { "epoch": 0.6443294735977663, "grad_norm": 1.462533024160118, "learning_rate": 5.931823540722912e-06, "loss": 0.7505, "step": 3923 }, { "epoch": 0.6444937176644494, "grad_norm": 1.669455811836755, "learning_rate": 5.9269640415330875e-06, "loss": 0.7413, "step": 3924 }, { "epoch": 0.6446579617311324, "grad_norm": 2.5302448594559404, "learning_rate": 5.922105695207521e-06, "loss": 0.7877, "step": 3925 }, { "epoch": 0.6448222057978156, "grad_norm": 1.5101125188963342, "learning_rate": 5.917248503121359e-06, "loss": 0.7412, "step": 3926 }, { "epoch": 0.6449864498644986, "grad_norm": 1.7359216407502116, "learning_rate": 5.912392466649419e-06, "loss": 0.8252, "step": 3927 }, { "epoch": 0.6451506939311817, "grad_norm": 1.5290741270936046, "learning_rate": 5.907537587166191e-06, "loss": 0.8118, "step": 3928 }, { "epoch": 0.6453149379978649, "grad_norm": 1.6855922805872603, "learning_rate": 5.90268386604584e-06, "loss": 0.756, "step": 3929 }, { "epoch": 0.6454791820645479, "grad_norm": 1.7080272870413757, "learning_rate": 5.897831304662201e-06, "loss": 0.7135, "step": 3930 }, { "epoch": 0.645643426131231, "grad_norm": 1.6814311358212892, "learning_rate": 5.892979904388781e-06, "loss": 0.7513, "step": 3931 }, { "epoch": 0.6458076701979141, "grad_norm": 5.16353733707688, "learning_rate": 5.888129666598756e-06, "loss": 0.7421, "step": 3932 }, { "epoch": 0.6459719142645972, "grad_norm": 1.603182677201774, "learning_rate": 5.883280592664979e-06, "loss": 0.7587, "step": 3933 }, { "epoch": 0.6461361583312802, "grad_norm": 1.9099814156499044, "learning_rate": 5.878432683959972e-06, "loss": 0.8411, "step": 3934 }, { "epoch": 0.6463004023979634, "grad_norm": 3.434961132857352, "learning_rate": 5.8735859418559206e-06, "loss": 0.756, "step": 3935 }, { "epoch": 0.6464646464646465, "grad_norm": 1.62852999801844, "learning_rate": 5.868740367724692e-06, "loss": 0.7349, "step": 3936 }, { "epoch": 0.6466288905313295, "grad_norm": 1.8472587181087088, "learning_rate": 5.863895962937806e-06, "loss": 0.7511, "step": 3937 }, { "epoch": 0.6467931345980127, "grad_norm": 1.7856715085486972, "learning_rate": 5.859052728866468e-06, "loss": 0.7502, "step": 3938 }, { "epoch": 0.6469573786646957, "grad_norm": 1.791334714497397, "learning_rate": 5.854210666881544e-06, "loss": 0.7299, "step": 3939 }, { "epoch": 0.6471216227313789, "grad_norm": 1.9357787161473503, "learning_rate": 5.8493697783535665e-06, "loss": 0.7619, "step": 3940 }, { "epoch": 0.6472858667980619, "grad_norm": 1.7818105488371307, "learning_rate": 5.844530064652742e-06, "loss": 0.7766, "step": 3941 }, { "epoch": 0.647450110864745, "grad_norm": 2.4305855017741376, "learning_rate": 5.839691527148938e-06, "loss": 0.7816, "step": 3942 }, { "epoch": 0.647614354931428, "grad_norm": 1.463703183986096, "learning_rate": 5.834854167211699e-06, "loss": 0.7381, "step": 3943 }, { "epoch": 0.6477785989981112, "grad_norm": 2.218601223552984, "learning_rate": 5.8300179862102225e-06, "loss": 0.7031, "step": 3944 }, { "epoch": 0.6479428430647943, "grad_norm": 1.3866606479028358, "learning_rate": 5.825182985513383e-06, "loss": 0.762, "step": 3945 }, { "epoch": 0.6481070871314774, "grad_norm": 2.002315780360097, "learning_rate": 5.820349166489716e-06, "loss": 0.8068, "step": 3946 }, { "epoch": 0.6482713311981605, "grad_norm": 0.6403674057063214, "learning_rate": 5.8155165305074245e-06, "loss": 0.3243, "step": 3947 }, { "epoch": 0.6484355752648435, "grad_norm": 1.4160797197630919, "learning_rate": 5.810685078934375e-06, "loss": 0.7441, "step": 3948 }, { "epoch": 0.6485998193315267, "grad_norm": 1.5087202680207654, "learning_rate": 5.805854813138098e-06, "loss": 0.7714, "step": 3949 }, { "epoch": 0.6487640633982097, "grad_norm": 1.6537705678395362, "learning_rate": 5.801025734485794e-06, "loss": 0.7703, "step": 3950 }, { "epoch": 0.6489283074648928, "grad_norm": 0.5810848210072719, "learning_rate": 5.796197844344325e-06, "loss": 0.3157, "step": 3951 }, { "epoch": 0.649092551531576, "grad_norm": 1.6769074297200428, "learning_rate": 5.791371144080209e-06, "loss": 0.7726, "step": 3952 }, { "epoch": 0.649256795598259, "grad_norm": 1.8376265952981108, "learning_rate": 5.78654563505964e-06, "loss": 0.7247, "step": 3953 }, { "epoch": 0.6494210396649421, "grad_norm": 1.388243587639838, "learning_rate": 5.781721318648461e-06, "loss": 0.8002, "step": 3954 }, { "epoch": 0.6495852837316252, "grad_norm": 1.897165184656544, "learning_rate": 5.7768981962121906e-06, "loss": 0.7813, "step": 3955 }, { "epoch": 0.6497495277983083, "grad_norm": 1.7549299261675098, "learning_rate": 5.772076269116001e-06, "loss": 0.8086, "step": 3956 }, { "epoch": 0.6499137718649913, "grad_norm": 1.5771753654481855, "learning_rate": 5.7672555387247274e-06, "loss": 0.7854, "step": 3957 }, { "epoch": 0.6500780159316745, "grad_norm": 4.791232529971497, "learning_rate": 5.762436006402874e-06, "loss": 0.7379, "step": 3958 }, { "epoch": 0.6502422599983576, "grad_norm": 4.142654648317831, "learning_rate": 5.757617673514588e-06, "loss": 0.7573, "step": 3959 }, { "epoch": 0.6504065040650406, "grad_norm": 1.522275707381712, "learning_rate": 5.752800541423696e-06, "loss": 0.7137, "step": 3960 }, { "epoch": 0.6505707481317238, "grad_norm": 1.4843278818925048, "learning_rate": 5.747984611493675e-06, "loss": 0.7418, "step": 3961 }, { "epoch": 0.6507349921984068, "grad_norm": 2.704030556838571, "learning_rate": 5.743169885087665e-06, "loss": 0.8063, "step": 3962 }, { "epoch": 0.6508992362650899, "grad_norm": 1.9001604927285733, "learning_rate": 5.738356363568463e-06, "loss": 0.7894, "step": 3963 }, { "epoch": 0.651063480331773, "grad_norm": 2.1915304747801025, "learning_rate": 5.733544048298526e-06, "loss": 0.6711, "step": 3964 }, { "epoch": 0.6512277243984561, "grad_norm": 0.6469076939698002, "learning_rate": 5.728732940639972e-06, "loss": 0.3397, "step": 3965 }, { "epoch": 0.6513919684651392, "grad_norm": 1.5724579089604396, "learning_rate": 5.723923041954571e-06, "loss": 0.7699, "step": 3966 }, { "epoch": 0.6515562125318223, "grad_norm": 1.7609769611167838, "learning_rate": 5.719114353603757e-06, "loss": 0.7506, "step": 3967 }, { "epoch": 0.6517204565985054, "grad_norm": 1.520463040515715, "learning_rate": 5.714306876948621e-06, "loss": 0.7687, "step": 3968 }, { "epoch": 0.6518847006651884, "grad_norm": 1.842606373736128, "learning_rate": 5.709500613349906e-06, "loss": 0.7173, "step": 3969 }, { "epoch": 0.6520489447318716, "grad_norm": 1.8547992811781673, "learning_rate": 5.704695564168014e-06, "loss": 0.7431, "step": 3970 }, { "epoch": 0.6522131887985546, "grad_norm": 2.3422241312900214, "learning_rate": 5.6998917307630095e-06, "loss": 0.783, "step": 3971 }, { "epoch": 0.6523774328652377, "grad_norm": 1.7981385471788647, "learning_rate": 5.695089114494599e-06, "loss": 0.8099, "step": 3972 }, { "epoch": 0.6525416769319209, "grad_norm": 1.5990213109291682, "learning_rate": 5.69028771672216e-06, "loss": 0.8248, "step": 3973 }, { "epoch": 0.6527059209986039, "grad_norm": 1.9798717666782963, "learning_rate": 5.685487538804718e-06, "loss": 0.7027, "step": 3974 }, { "epoch": 0.6528701650652871, "grad_norm": 1.6671006978997363, "learning_rate": 5.68068858210095e-06, "loss": 0.8106, "step": 3975 }, { "epoch": 0.6530344091319701, "grad_norm": 2.1301766070897328, "learning_rate": 5.675890847969193e-06, "loss": 0.6932, "step": 3976 }, { "epoch": 0.6531986531986532, "grad_norm": 1.4940277857781792, "learning_rate": 5.671094337767433e-06, "loss": 0.7982, "step": 3977 }, { "epoch": 0.6533628972653363, "grad_norm": 1.3247803080280514, "learning_rate": 5.666299052853314e-06, "loss": 0.7602, "step": 3978 }, { "epoch": 0.6535271413320194, "grad_norm": 1.6126886600815558, "learning_rate": 5.661504994584133e-06, "loss": 0.7046, "step": 3979 }, { "epoch": 0.6536913853987024, "grad_norm": 2.052629953615901, "learning_rate": 5.656712164316838e-06, "loss": 0.699, "step": 3980 }, { "epoch": 0.6538556294653856, "grad_norm": 1.806946132838492, "learning_rate": 5.651920563408022e-06, "loss": 0.7492, "step": 3981 }, { "epoch": 0.6540198735320687, "grad_norm": 1.7546219579000295, "learning_rate": 5.647130193213945e-06, "loss": 0.7789, "step": 3982 }, { "epoch": 0.6541841175987517, "grad_norm": 2.3340114333699833, "learning_rate": 5.642341055090508e-06, "loss": 0.7889, "step": 3983 }, { "epoch": 0.6543483616654349, "grad_norm": 1.6534362085573693, "learning_rate": 5.637553150393268e-06, "loss": 0.7416, "step": 3984 }, { "epoch": 0.6545126057321179, "grad_norm": 1.8031089913863116, "learning_rate": 5.632766480477432e-06, "loss": 0.727, "step": 3985 }, { "epoch": 0.654676849798801, "grad_norm": 1.9139492820366046, "learning_rate": 5.6279810466978546e-06, "loss": 0.7874, "step": 3986 }, { "epoch": 0.6548410938654841, "grad_norm": 1.3906926388479337, "learning_rate": 5.623196850409044e-06, "loss": 0.823, "step": 3987 }, { "epoch": 0.6550053379321672, "grad_norm": 1.493874529992262, "learning_rate": 5.618413892965158e-06, "loss": 0.7769, "step": 3988 }, { "epoch": 0.6551695819988503, "grad_norm": 1.9123798388408444, "learning_rate": 5.613632175720001e-06, "loss": 0.8329, "step": 3989 }, { "epoch": 0.6553338260655334, "grad_norm": 1.889676504424047, "learning_rate": 5.6088517000270275e-06, "loss": 0.8155, "step": 3990 }, { "epoch": 0.6554980701322165, "grad_norm": 2.1373629193840955, "learning_rate": 5.604072467239343e-06, "loss": 0.7377, "step": 3991 }, { "epoch": 0.6556623141988995, "grad_norm": 1.9243087785134974, "learning_rate": 5.599294478709698e-06, "loss": 0.8129, "step": 3992 }, { "epoch": 0.6558265582655827, "grad_norm": 1.9120866089257529, "learning_rate": 5.5945177357904935e-06, "loss": 0.841, "step": 3993 }, { "epoch": 0.6559908023322657, "grad_norm": 1.4728345200394275, "learning_rate": 5.589742239833776e-06, "loss": 0.797, "step": 3994 }, { "epoch": 0.6561550463989488, "grad_norm": 1.974166551306139, "learning_rate": 5.584967992191234e-06, "loss": 0.7599, "step": 3995 }, { "epoch": 0.656319290465632, "grad_norm": 1.6945806346031567, "learning_rate": 5.580194994214216e-06, "loss": 0.6868, "step": 3996 }, { "epoch": 0.656483534532315, "grad_norm": 1.4983237161829932, "learning_rate": 5.5754232472537086e-06, "loss": 0.8041, "step": 3997 }, { "epoch": 0.6566477785989981, "grad_norm": 1.7861794447567172, "learning_rate": 5.570652752660343e-06, "loss": 0.7501, "step": 3998 }, { "epoch": 0.6568120226656812, "grad_norm": 1.544803913065485, "learning_rate": 5.565883511784396e-06, "loss": 0.7492, "step": 3999 }, { "epoch": 0.6569762667323643, "grad_norm": 1.5640251788013815, "learning_rate": 5.561115525975793e-06, "loss": 0.6852, "step": 4000 }, { "epoch": 0.6571405107990473, "grad_norm": 1.6363861617319915, "learning_rate": 5.5563487965841055e-06, "loss": 0.7636, "step": 4001 }, { "epoch": 0.6573047548657305, "grad_norm": 0.5918143307862721, "learning_rate": 5.5515833249585385e-06, "loss": 0.329, "step": 4002 }, { "epoch": 0.6574689989324136, "grad_norm": 1.717374802196961, "learning_rate": 5.546819112447952e-06, "loss": 0.7773, "step": 4003 }, { "epoch": 0.6576332429990966, "grad_norm": 2.0720343989312777, "learning_rate": 5.542056160400848e-06, "loss": 0.7567, "step": 4004 }, { "epoch": 0.6577974870657798, "grad_norm": 3.6002760672349328, "learning_rate": 5.537294470165369e-06, "loss": 0.7476, "step": 4005 }, { "epoch": 0.6579617311324628, "grad_norm": 1.716452719528783, "learning_rate": 5.532534043089302e-06, "loss": 0.7687, "step": 4006 }, { "epoch": 0.658125975199146, "grad_norm": 1.7471791341784748, "learning_rate": 5.527774880520073e-06, "loss": 0.7275, "step": 4007 }, { "epoch": 0.658290219265829, "grad_norm": 2.139571670067122, "learning_rate": 5.523016983804759e-06, "loss": 0.7846, "step": 4008 }, { "epoch": 0.6584544633325121, "grad_norm": 3.7675860947733653, "learning_rate": 5.518260354290066e-06, "loss": 0.7204, "step": 4009 }, { "epoch": 0.6586187073991953, "grad_norm": 1.7972929021648343, "learning_rate": 5.513504993322352e-06, "loss": 0.7589, "step": 4010 }, { "epoch": 0.6587829514658783, "grad_norm": 1.8127413218865476, "learning_rate": 5.508750902247612e-06, "loss": 0.7895, "step": 4011 }, { "epoch": 0.6589471955325614, "grad_norm": 1.6116170234693104, "learning_rate": 5.503998082411479e-06, "loss": 0.6711, "step": 4012 }, { "epoch": 0.6591114395992445, "grad_norm": 1.619285645320543, "learning_rate": 5.499246535159231e-06, "loss": 0.7713, "step": 4013 }, { "epoch": 0.6592756836659276, "grad_norm": 1.450846881414102, "learning_rate": 5.494496261835781e-06, "loss": 0.8497, "step": 4014 }, { "epoch": 0.6594399277326106, "grad_norm": 6.045501150524228, "learning_rate": 5.489747263785687e-06, "loss": 0.7604, "step": 4015 }, { "epoch": 0.6596041717992938, "grad_norm": 2.7608979911193936, "learning_rate": 5.48499954235314e-06, "loss": 0.7876, "step": 4016 }, { "epoch": 0.6597684158659768, "grad_norm": 1.6934185068898422, "learning_rate": 5.480253098881974e-06, "loss": 0.6659, "step": 4017 }, { "epoch": 0.6599326599326599, "grad_norm": 2.3464295723142206, "learning_rate": 5.47550793471566e-06, "loss": 0.7308, "step": 4018 }, { "epoch": 0.6600969039993431, "grad_norm": 1.6552378941574282, "learning_rate": 5.470764051197302e-06, "loss": 0.7848, "step": 4019 }, { "epoch": 0.6602611480660261, "grad_norm": 0.627265287909948, "learning_rate": 5.466021449669655e-06, "loss": 0.3442, "step": 4020 }, { "epoch": 0.6604253921327092, "grad_norm": 1.7789071939710608, "learning_rate": 5.461280131475099e-06, "loss": 0.8045, "step": 4021 }, { "epoch": 0.6605896361993923, "grad_norm": 1.9041207133326634, "learning_rate": 5.456540097955652e-06, "loss": 0.7751, "step": 4022 }, { "epoch": 0.6607538802660754, "grad_norm": 1.5234349166332426, "learning_rate": 5.451801350452975e-06, "loss": 0.7438, "step": 4023 }, { "epoch": 0.6609181243327584, "grad_norm": 2.7657441689295115, "learning_rate": 5.447063890308354e-06, "loss": 0.7602, "step": 4024 }, { "epoch": 0.6610823683994416, "grad_norm": 0.6056554489395887, "learning_rate": 5.442327718862721e-06, "loss": 0.3527, "step": 4025 }, { "epoch": 0.6612466124661247, "grad_norm": 0.6063744701023406, "learning_rate": 5.4375928374566376e-06, "loss": 0.3577, "step": 4026 }, { "epoch": 0.6614108565328077, "grad_norm": 1.5276723715071863, "learning_rate": 5.432859247430303e-06, "loss": 0.7881, "step": 4027 }, { "epoch": 0.6615751005994909, "grad_norm": 1.691441208942527, "learning_rate": 5.428126950123551e-06, "loss": 0.7863, "step": 4028 }, { "epoch": 0.6617393446661739, "grad_norm": 10.103100455467171, "learning_rate": 5.423395946875846e-06, "loss": 0.8592, "step": 4029 }, { "epoch": 0.661903588732857, "grad_norm": 1.8872711862782958, "learning_rate": 5.418666239026291e-06, "loss": 0.6838, "step": 4030 }, { "epoch": 0.6620678327995401, "grad_norm": 1.7231651512248376, "learning_rate": 5.413937827913619e-06, "loss": 0.8332, "step": 4031 }, { "epoch": 0.6622320768662232, "grad_norm": 2.2074617069263325, "learning_rate": 5.409210714876197e-06, "loss": 0.7633, "step": 4032 }, { "epoch": 0.6623963209329063, "grad_norm": 1.74449241790019, "learning_rate": 5.404484901252023e-06, "loss": 0.7422, "step": 4033 }, { "epoch": 0.6625605649995894, "grad_norm": 2.137132909690962, "learning_rate": 5.399760388378729e-06, "loss": 0.7531, "step": 4034 }, { "epoch": 0.6627248090662725, "grad_norm": 0.5731790079745397, "learning_rate": 5.395037177593579e-06, "loss": 0.324, "step": 4035 }, { "epoch": 0.6628890531329555, "grad_norm": 1.7192214284448362, "learning_rate": 5.390315270233469e-06, "loss": 0.7231, "step": 4036 }, { "epoch": 0.6630532971996387, "grad_norm": 1.9327800467689138, "learning_rate": 5.385594667634923e-06, "loss": 0.7734, "step": 4037 }, { "epoch": 0.6632175412663217, "grad_norm": 2.137284121913629, "learning_rate": 5.3808753711341e-06, "loss": 0.7368, "step": 4038 }, { "epoch": 0.6633817853330048, "grad_norm": 2.2876384559072114, "learning_rate": 5.376157382066784e-06, "loss": 0.802, "step": 4039 }, { "epoch": 0.663546029399688, "grad_norm": 1.7349073047107286, "learning_rate": 5.371440701768394e-06, "loss": 0.7588, "step": 4040 }, { "epoch": 0.663710273466371, "grad_norm": 1.6134263292586617, "learning_rate": 5.366725331573974e-06, "loss": 0.7024, "step": 4041 }, { "epoch": 0.6638745175330542, "grad_norm": 1.601536412895771, "learning_rate": 5.3620112728182e-06, "loss": 0.6455, "step": 4042 }, { "epoch": 0.6640387615997372, "grad_norm": 1.7278982739997726, "learning_rate": 5.357298526835381e-06, "loss": 0.7207, "step": 4043 }, { "epoch": 0.6642030056664203, "grad_norm": 0.5984016564661964, "learning_rate": 5.35258709495945e-06, "loss": 0.3217, "step": 4044 }, { "epoch": 0.6643672497331033, "grad_norm": 2.3131138045365938, "learning_rate": 5.34787697852396e-06, "loss": 0.7246, "step": 4045 }, { "epoch": 0.6645314937997865, "grad_norm": 1.6520060107803236, "learning_rate": 5.343168178862104e-06, "loss": 0.681, "step": 4046 }, { "epoch": 0.6646957378664696, "grad_norm": 1.816695747692389, "learning_rate": 5.338460697306699e-06, "loss": 0.7204, "step": 4047 }, { "epoch": 0.6648599819331527, "grad_norm": 2.0652812746222238, "learning_rate": 5.333754535190186e-06, "loss": 0.8254, "step": 4048 }, { "epoch": 0.6650242259998358, "grad_norm": 1.5328982595698273, "learning_rate": 5.329049693844635e-06, "loss": 0.772, "step": 4049 }, { "epoch": 0.6651884700665188, "grad_norm": 1.8166418999802854, "learning_rate": 5.324346174601741e-06, "loss": 0.7974, "step": 4050 }, { "epoch": 0.665352714133202, "grad_norm": 1.6852925040248945, "learning_rate": 5.319643978792825e-06, "loss": 0.8101, "step": 4051 }, { "epoch": 0.665516958199885, "grad_norm": 2.3957427270926974, "learning_rate": 5.314943107748836e-06, "loss": 0.6793, "step": 4052 }, { "epoch": 0.6656812022665681, "grad_norm": 1.463567206384996, "learning_rate": 5.3102435628003435e-06, "loss": 0.7215, "step": 4053 }, { "epoch": 0.6658454463332512, "grad_norm": 1.8596556499378365, "learning_rate": 5.305545345277543e-06, "loss": 0.7148, "step": 4054 }, { "epoch": 0.6660096903999343, "grad_norm": 1.7419015062170977, "learning_rate": 5.300848456510257e-06, "loss": 0.6692, "step": 4055 }, { "epoch": 0.6661739344666174, "grad_norm": 1.8779555022213168, "learning_rate": 5.296152897827929e-06, "loss": 0.7108, "step": 4056 }, { "epoch": 0.6663381785333005, "grad_norm": 1.549167776582029, "learning_rate": 5.291458670559628e-06, "loss": 0.6755, "step": 4057 }, { "epoch": 0.6665024225999836, "grad_norm": 2.1553682313390454, "learning_rate": 5.286765776034044e-06, "loss": 0.75, "step": 4058 }, { "epoch": 0.6666666666666666, "grad_norm": 1.5665960730095334, "learning_rate": 5.282074215579492e-06, "loss": 0.8031, "step": 4059 }, { "epoch": 0.6668309107333498, "grad_norm": 1.4693572122734553, "learning_rate": 5.277383990523905e-06, "loss": 0.7382, "step": 4060 }, { "epoch": 0.6669951548000328, "grad_norm": 1.96177804484554, "learning_rate": 5.272695102194846e-06, "loss": 0.7341, "step": 4061 }, { "epoch": 0.6671593988667159, "grad_norm": 1.3924838666605774, "learning_rate": 5.2680075519194926e-06, "loss": 0.7863, "step": 4062 }, { "epoch": 0.6673236429333991, "grad_norm": 1.6360453077850288, "learning_rate": 5.263321341024646e-06, "loss": 0.724, "step": 4063 }, { "epoch": 0.6674878870000821, "grad_norm": 1.8121597181215028, "learning_rate": 5.25863647083673e-06, "loss": 0.7726, "step": 4064 }, { "epoch": 0.6676521310667652, "grad_norm": 1.9023429621326782, "learning_rate": 5.253952942681782e-06, "loss": 0.7653, "step": 4065 }, { "epoch": 0.6678163751334483, "grad_norm": 1.5279003227383636, "learning_rate": 5.249270757885475e-06, "loss": 0.8057, "step": 4066 }, { "epoch": 0.6679806192001314, "grad_norm": 0.5804944604461842, "learning_rate": 5.244589917773082e-06, "loss": 0.3476, "step": 4067 }, { "epoch": 0.6681448632668144, "grad_norm": 1.4833404251944544, "learning_rate": 5.239910423669509e-06, "loss": 0.7808, "step": 4068 }, { "epoch": 0.6683091073334976, "grad_norm": 1.5801155684862098, "learning_rate": 5.2352322768992755e-06, "loss": 0.7814, "step": 4069 }, { "epoch": 0.6684733514001807, "grad_norm": 1.711399971517674, "learning_rate": 5.230555478786522e-06, "loss": 0.7518, "step": 4070 }, { "epoch": 0.6686375954668637, "grad_norm": 2.8484694973078635, "learning_rate": 5.225880030655006e-06, "loss": 0.7206, "step": 4071 }, { "epoch": 0.6688018395335469, "grad_norm": 2.6615307054730764, "learning_rate": 5.221205933828104e-06, "loss": 0.7745, "step": 4072 }, { "epoch": 0.6689660836002299, "grad_norm": 1.3478508964446585, "learning_rate": 5.216533189628808e-06, "loss": 0.7538, "step": 4073 }, { "epoch": 0.669130327666913, "grad_norm": 1.4963055181159703, "learning_rate": 5.211861799379731e-06, "loss": 0.7787, "step": 4074 }, { "epoch": 0.6692945717335961, "grad_norm": 1.5831561892606407, "learning_rate": 5.207191764403097e-06, "loss": 0.7606, "step": 4075 }, { "epoch": 0.6694588158002792, "grad_norm": 1.8513966942760338, "learning_rate": 5.20252308602075e-06, "loss": 0.7047, "step": 4076 }, { "epoch": 0.6696230598669624, "grad_norm": 1.9800120227621947, "learning_rate": 5.197855765554152e-06, "loss": 0.7083, "step": 4077 }, { "epoch": 0.6697873039336454, "grad_norm": 1.8432463020376435, "learning_rate": 5.193189804324376e-06, "loss": 0.7697, "step": 4078 }, { "epoch": 0.6699515480003285, "grad_norm": 1.727345196638392, "learning_rate": 5.1885252036521125e-06, "loss": 0.7553, "step": 4079 }, { "epoch": 0.6701157920670116, "grad_norm": 1.5675607579344644, "learning_rate": 5.183861964857669e-06, "loss": 0.7606, "step": 4080 }, { "epoch": 0.6702800361336947, "grad_norm": 1.9247039497631027, "learning_rate": 5.179200089260964e-06, "loss": 0.7887, "step": 4081 }, { "epoch": 0.6704442802003777, "grad_norm": 1.728668188241579, "learning_rate": 5.174539578181531e-06, "loss": 0.7175, "step": 4082 }, { "epoch": 0.6706085242670609, "grad_norm": 1.5694205050277912, "learning_rate": 5.169880432938519e-06, "loss": 0.74, "step": 4083 }, { "epoch": 0.670772768333744, "grad_norm": 1.7351300874795552, "learning_rate": 5.165222654850688e-06, "loss": 0.7135, "step": 4084 }, { "epoch": 0.670937012400427, "grad_norm": 2.370736583153193, "learning_rate": 5.160566245236413e-06, "loss": 0.7411, "step": 4085 }, { "epoch": 0.6711012564671102, "grad_norm": 1.6315313971434249, "learning_rate": 5.155911205413683e-06, "loss": 0.6699, "step": 4086 }, { "epoch": 0.6712655005337932, "grad_norm": 1.7253043982748608, "learning_rate": 5.151257536700094e-06, "loss": 0.7334, "step": 4087 }, { "epoch": 0.6714297446004763, "grad_norm": 1.6798365183238286, "learning_rate": 5.146605240412859e-06, "loss": 0.8297, "step": 4088 }, { "epoch": 0.6715939886671594, "grad_norm": 1.7710245653455476, "learning_rate": 5.141954317868798e-06, "loss": 0.7691, "step": 4089 }, { "epoch": 0.6717582327338425, "grad_norm": 1.926105976693498, "learning_rate": 5.137304770384348e-06, "loss": 0.712, "step": 4090 }, { "epoch": 0.6719224768005256, "grad_norm": 1.633807261937637, "learning_rate": 5.132656599275554e-06, "loss": 0.7701, "step": 4091 }, { "epoch": 0.6720867208672087, "grad_norm": 1.8965330201687398, "learning_rate": 5.128009805858067e-06, "loss": 0.7301, "step": 4092 }, { "epoch": 0.6722509649338918, "grad_norm": 1.7223170310295648, "learning_rate": 5.123364391447156e-06, "loss": 0.7704, "step": 4093 }, { "epoch": 0.6724152090005748, "grad_norm": 1.6264333727503781, "learning_rate": 5.118720357357696e-06, "loss": 0.7276, "step": 4094 }, { "epoch": 0.672579453067258, "grad_norm": 0.6331620560567269, "learning_rate": 5.114077704904168e-06, "loss": 0.329, "step": 4095 }, { "epoch": 0.672743697133941, "grad_norm": 1.2778650403118124, "learning_rate": 5.109436435400667e-06, "loss": 0.7635, "step": 4096 }, { "epoch": 0.6729079412006241, "grad_norm": 1.9257274009330967, "learning_rate": 5.104796550160893e-06, "loss": 0.7824, "step": 4097 }, { "epoch": 0.6730721852673072, "grad_norm": 1.9325136572370671, "learning_rate": 5.100158050498159e-06, "loss": 0.7007, "step": 4098 }, { "epoch": 0.6732364293339903, "grad_norm": 0.6358052858322073, "learning_rate": 5.095520937725378e-06, "loss": 0.348, "step": 4099 }, { "epoch": 0.6734006734006734, "grad_norm": 1.881599004939865, "learning_rate": 5.090885213155079e-06, "loss": 0.7345, "step": 4100 }, { "epoch": 0.6735649174673565, "grad_norm": 1.784521064902309, "learning_rate": 5.0862508780993915e-06, "loss": 0.7877, "step": 4101 }, { "epoch": 0.6737291615340396, "grad_norm": 1.7735435167006068, "learning_rate": 5.081617933870056e-06, "loss": 0.697, "step": 4102 }, { "epoch": 0.6738934056007226, "grad_norm": 2.251788959297409, "learning_rate": 5.076986381778417e-06, "loss": 0.7393, "step": 4103 }, { "epoch": 0.6740576496674058, "grad_norm": 1.2785138166455512, "learning_rate": 5.072356223135425e-06, "loss": 0.7466, "step": 4104 }, { "epoch": 0.6742218937340888, "grad_norm": 1.9686854987339313, "learning_rate": 5.067727459251638e-06, "loss": 0.7929, "step": 4105 }, { "epoch": 0.674386137800772, "grad_norm": 1.6807300838950772, "learning_rate": 5.063100091437217e-06, "loss": 0.6979, "step": 4106 }, { "epoch": 0.6745503818674551, "grad_norm": 1.4452869934380301, "learning_rate": 5.058474121001928e-06, "loss": 0.7382, "step": 4107 }, { "epoch": 0.6747146259341381, "grad_norm": 1.6822963266408177, "learning_rate": 5.053849549255143e-06, "loss": 0.7638, "step": 4108 }, { "epoch": 0.6748788700008213, "grad_norm": 1.7179742119173294, "learning_rate": 5.049226377505838e-06, "loss": 0.811, "step": 4109 }, { "epoch": 0.6750431140675043, "grad_norm": 1.4733234034980913, "learning_rate": 5.044604607062591e-06, "loss": 0.7723, "step": 4110 }, { "epoch": 0.6752073581341874, "grad_norm": 1.5403809447451349, "learning_rate": 5.0399842392335856e-06, "loss": 0.7726, "step": 4111 }, { "epoch": 0.6753716022008704, "grad_norm": 2.403151924615844, "learning_rate": 5.0353652753266045e-06, "loss": 0.7059, "step": 4112 }, { "epoch": 0.6755358462675536, "grad_norm": 1.546518225360613, "learning_rate": 5.03074771664904e-06, "loss": 0.7714, "step": 4113 }, { "epoch": 0.6757000903342367, "grad_norm": 2.23646267232459, "learning_rate": 5.026131564507878e-06, "loss": 0.7872, "step": 4114 }, { "epoch": 0.6758643344009198, "grad_norm": 1.6291608750685305, "learning_rate": 5.021516820209713e-06, "loss": 0.8024, "step": 4115 }, { "epoch": 0.6760285784676029, "grad_norm": 2.079409575968588, "learning_rate": 5.016903485060738e-06, "loss": 0.5973, "step": 4116 }, { "epoch": 0.6761928225342859, "grad_norm": 1.569080688727632, "learning_rate": 5.0122915603667485e-06, "loss": 0.7858, "step": 4117 }, { "epoch": 0.6763570666009691, "grad_norm": 3.1970920663799776, "learning_rate": 5.0076810474331395e-06, "loss": 0.7661, "step": 4118 }, { "epoch": 0.6765213106676521, "grad_norm": 1.601048478747501, "learning_rate": 5.003071947564908e-06, "loss": 0.7803, "step": 4119 }, { "epoch": 0.6766855547343352, "grad_norm": 1.4990984098760964, "learning_rate": 4.998464262066648e-06, "loss": 0.6989, "step": 4120 }, { "epoch": 0.6768497988010184, "grad_norm": 1.9131097712738854, "learning_rate": 4.993857992242557e-06, "loss": 0.6649, "step": 4121 }, { "epoch": 0.6770140428677014, "grad_norm": 1.6171465461985206, "learning_rate": 4.9892531393964285e-06, "loss": 0.7257, "step": 4122 }, { "epoch": 0.6771782869343845, "grad_norm": 2.0496005339824124, "learning_rate": 4.984649704831658e-06, "loss": 0.6587, "step": 4123 }, { "epoch": 0.6773425310010676, "grad_norm": 4.529559301250861, "learning_rate": 4.980047689851236e-06, "loss": 0.7327, "step": 4124 }, { "epoch": 0.6775067750677507, "grad_norm": 0.592552465700686, "learning_rate": 4.975447095757755e-06, "loss": 0.3354, "step": 4125 }, { "epoch": 0.6776710191344337, "grad_norm": 2.9811743722026725, "learning_rate": 4.970847923853404e-06, "loss": 0.7728, "step": 4126 }, { "epoch": 0.6778352632011169, "grad_norm": 1.8560018718465747, "learning_rate": 4.966250175439966e-06, "loss": 0.767, "step": 4127 }, { "epoch": 0.6779995072678, "grad_norm": 1.366181843330106, "learning_rate": 4.961653851818827e-06, "loss": 0.7673, "step": 4128 }, { "epoch": 0.678163751334483, "grad_norm": 1.6182371603970784, "learning_rate": 4.957058954290964e-06, "loss": 0.8152, "step": 4129 }, { "epoch": 0.6783279954011662, "grad_norm": 1.6492896954294751, "learning_rate": 4.952465484156956e-06, "loss": 0.8035, "step": 4130 }, { "epoch": 0.6784922394678492, "grad_norm": 2.0060435784356376, "learning_rate": 4.947873442716972e-06, "loss": 0.7779, "step": 4131 }, { "epoch": 0.6786564835345323, "grad_norm": 1.5581193136395506, "learning_rate": 4.9432828312707836e-06, "loss": 0.727, "step": 4132 }, { "epoch": 0.6788207276012154, "grad_norm": 1.5439348159687216, "learning_rate": 4.938693651117751e-06, "loss": 0.7966, "step": 4133 }, { "epoch": 0.6789849716678985, "grad_norm": 1.6014559389571452, "learning_rate": 4.934105903556831e-06, "loss": 0.765, "step": 4134 }, { "epoch": 0.6791492157345815, "grad_norm": 1.4443819329084104, "learning_rate": 4.929519589886578e-06, "loss": 0.7315, "step": 4135 }, { "epoch": 0.6793134598012647, "grad_norm": 1.4645927411204103, "learning_rate": 4.924934711405138e-06, "loss": 0.7693, "step": 4136 }, { "epoch": 0.6794777038679478, "grad_norm": 1.725707287446679, "learning_rate": 4.920351269410251e-06, "loss": 0.72, "step": 4137 }, { "epoch": 0.6796419479346308, "grad_norm": 1.421667857499466, "learning_rate": 4.9157692651992495e-06, "loss": 0.7794, "step": 4138 }, { "epoch": 0.679806192001314, "grad_norm": 1.6838321608858673, "learning_rate": 4.911188700069062e-06, "loss": 0.7079, "step": 4139 }, { "epoch": 0.679970436067997, "grad_norm": 1.892720049084006, "learning_rate": 4.906609575316207e-06, "loss": 0.7571, "step": 4140 }, { "epoch": 0.6801346801346801, "grad_norm": 3.0571322775861205, "learning_rate": 4.9020318922367956e-06, "loss": 0.7356, "step": 4141 }, { "epoch": 0.6802989242013632, "grad_norm": 1.490661056805924, "learning_rate": 4.897455652126533e-06, "loss": 0.7259, "step": 4142 }, { "epoch": 0.6804631682680463, "grad_norm": 1.3926133080744112, "learning_rate": 4.892880856280713e-06, "loss": 0.7026, "step": 4143 }, { "epoch": 0.6806274123347295, "grad_norm": 1.8346428521101026, "learning_rate": 4.888307505994222e-06, "loss": 0.7889, "step": 4144 }, { "epoch": 0.6807916564014125, "grad_norm": 1.80538554871554, "learning_rate": 4.883735602561537e-06, "loss": 0.8104, "step": 4145 }, { "epoch": 0.6809559004680956, "grad_norm": 3.2872181697595844, "learning_rate": 4.879165147276726e-06, "loss": 0.6575, "step": 4146 }, { "epoch": 0.6811201445347786, "grad_norm": 1.850947303085722, "learning_rate": 4.874596141433447e-06, "loss": 0.6841, "step": 4147 }, { "epoch": 0.6812843886014618, "grad_norm": 1.5005124794304714, "learning_rate": 4.870028586324947e-06, "loss": 0.7308, "step": 4148 }, { "epoch": 0.6814486326681448, "grad_norm": 1.3857427866643737, "learning_rate": 4.865462483244065e-06, "loss": 0.6746, "step": 4149 }, { "epoch": 0.681612876734828, "grad_norm": 0.6215389097549736, "learning_rate": 4.8608978334832225e-06, "loss": 0.3475, "step": 4150 }, { "epoch": 0.6817771208015111, "grad_norm": 1.436555282944228, "learning_rate": 4.8563346383344375e-06, "loss": 0.7488, "step": 4151 }, { "epoch": 0.6819413648681941, "grad_norm": 1.9753188812448657, "learning_rate": 4.851772899089312e-06, "loss": 0.7388, "step": 4152 }, { "epoch": 0.6821056089348773, "grad_norm": 0.6049588662403015, "learning_rate": 4.847212617039037e-06, "loss": 0.291, "step": 4153 }, { "epoch": 0.6822698530015603, "grad_norm": 4.144465113703816, "learning_rate": 4.842653793474389e-06, "loss": 0.6931, "step": 4154 }, { "epoch": 0.6824340970682434, "grad_norm": 2.1605479838072728, "learning_rate": 4.838096429685735e-06, "loss": 0.715, "step": 4155 }, { "epoch": 0.6825983411349265, "grad_norm": 1.7659107041551194, "learning_rate": 4.833540526963027e-06, "loss": 0.7711, "step": 4156 }, { "epoch": 0.6827625852016096, "grad_norm": 1.863063334754403, "learning_rate": 4.828986086595804e-06, "loss": 0.7852, "step": 4157 }, { "epoch": 0.6829268292682927, "grad_norm": 2.2616034901217104, "learning_rate": 4.82443310987319e-06, "loss": 0.7206, "step": 4158 }, { "epoch": 0.6830910733349758, "grad_norm": 2.0312990441802894, "learning_rate": 4.819881598083895e-06, "loss": 0.7676, "step": 4159 }, { "epoch": 0.6832553174016589, "grad_norm": 1.8578473492763297, "learning_rate": 4.815331552516217e-06, "loss": 0.7474, "step": 4160 }, { "epoch": 0.6834195614683419, "grad_norm": 1.5647086616527421, "learning_rate": 4.810782974458035e-06, "loss": 0.7673, "step": 4161 }, { "epoch": 0.6835838055350251, "grad_norm": 1.5658501431774499, "learning_rate": 4.806235865196815e-06, "loss": 0.7243, "step": 4162 }, { "epoch": 0.6837480496017081, "grad_norm": 2.189084732655328, "learning_rate": 4.801690226019606e-06, "loss": 0.7552, "step": 4163 }, { "epoch": 0.6839122936683912, "grad_norm": 2.1141706318578897, "learning_rate": 4.7971460582130425e-06, "loss": 0.7743, "step": 4164 }, { "epoch": 0.6840765377350744, "grad_norm": 1.9819913084700944, "learning_rate": 4.792603363063342e-06, "loss": 0.6209, "step": 4165 }, { "epoch": 0.6842407818017574, "grad_norm": 6.805384909398199, "learning_rate": 4.7880621418563035e-06, "loss": 0.6837, "step": 4166 }, { "epoch": 0.6844050258684405, "grad_norm": 1.834295584997863, "learning_rate": 4.783522395877311e-06, "loss": 0.6488, "step": 4167 }, { "epoch": 0.6845692699351236, "grad_norm": 1.912624766446398, "learning_rate": 4.77898412641133e-06, "loss": 0.7775, "step": 4168 }, { "epoch": 0.6847335140018067, "grad_norm": 1.5565827344669354, "learning_rate": 4.774447334742908e-06, "loss": 0.8339, "step": 4169 }, { "epoch": 0.6848977580684897, "grad_norm": 2.0529659788361188, "learning_rate": 4.769912022156175e-06, "loss": 0.7595, "step": 4170 }, { "epoch": 0.6850620021351729, "grad_norm": 1.5931609346346463, "learning_rate": 4.7653781899348395e-06, "loss": 0.8407, "step": 4171 }, { "epoch": 0.6852262462018559, "grad_norm": 0.6141096365649511, "learning_rate": 4.760845839362196e-06, "loss": 0.3473, "step": 4172 }, { "epoch": 0.685390490268539, "grad_norm": 0.6478176915775679, "learning_rate": 4.756314971721115e-06, "loss": 0.314, "step": 4173 }, { "epoch": 0.6855547343352222, "grad_norm": 2.588139184875229, "learning_rate": 4.751785588294053e-06, "loss": 0.7338, "step": 4174 }, { "epoch": 0.6857189784019052, "grad_norm": 0.6393874645844992, "learning_rate": 4.7472576903630314e-06, "loss": 0.3291, "step": 4175 }, { "epoch": 0.6858832224685883, "grad_norm": 11.821392038212382, "learning_rate": 4.742731279209674e-06, "loss": 0.7634, "step": 4176 }, { "epoch": 0.6860474665352714, "grad_norm": 2.6459825544094184, "learning_rate": 4.738206356115167e-06, "loss": 0.7092, "step": 4177 }, { "epoch": 0.6862117106019545, "grad_norm": 2.418260124115532, "learning_rate": 4.733682922360282e-06, "loss": 0.7996, "step": 4178 }, { "epoch": 0.6863759546686375, "grad_norm": 0.6597407177001968, "learning_rate": 4.729160979225365e-06, "loss": 0.292, "step": 4179 }, { "epoch": 0.6865401987353207, "grad_norm": 1.540839579875265, "learning_rate": 4.724640527990345e-06, "loss": 0.7568, "step": 4180 }, { "epoch": 0.6867044428020038, "grad_norm": 1.520542863259497, "learning_rate": 4.720121569934726e-06, "loss": 0.7266, "step": 4181 }, { "epoch": 0.6868686868686869, "grad_norm": 1.602713993503646, "learning_rate": 4.715604106337587e-06, "loss": 0.7788, "step": 4182 }, { "epoch": 0.68703293093537, "grad_norm": 1.391831494017251, "learning_rate": 4.71108813847759e-06, "loss": 0.6688, "step": 4183 }, { "epoch": 0.687197175002053, "grad_norm": 2.4639345395394865, "learning_rate": 4.706573667632967e-06, "loss": 0.7624, "step": 4184 }, { "epoch": 0.6873614190687362, "grad_norm": 1.5711362920016108, "learning_rate": 4.702060695081532e-06, "loss": 0.7956, "step": 4185 }, { "epoch": 0.6875256631354192, "grad_norm": 0.8140667355731993, "learning_rate": 4.69754922210067e-06, "loss": 0.3526, "step": 4186 }, { "epoch": 0.6876899072021023, "grad_norm": 1.613235531638512, "learning_rate": 4.693039249967347e-06, "loss": 0.7549, "step": 4187 }, { "epoch": 0.6878541512687855, "grad_norm": 1.7348447295463796, "learning_rate": 4.688530779958099e-06, "loss": 0.7478, "step": 4188 }, { "epoch": 0.6880183953354685, "grad_norm": 1.8252935785706554, "learning_rate": 4.684023813349039e-06, "loss": 0.691, "step": 4189 }, { "epoch": 0.6881826394021516, "grad_norm": 1.9054742534964377, "learning_rate": 4.679518351415855e-06, "loss": 0.7653, "step": 4190 }, { "epoch": 0.6883468834688347, "grad_norm": 0.6441510441310756, "learning_rate": 4.675014395433808e-06, "loss": 0.3278, "step": 4191 }, { "epoch": 0.6885111275355178, "grad_norm": 1.7143702141311712, "learning_rate": 4.6705119466777334e-06, "loss": 0.7523, "step": 4192 }, { "epoch": 0.6886753716022008, "grad_norm": 2.1015857787444006, "learning_rate": 4.666011006422041e-06, "loss": 0.7871, "step": 4193 }, { "epoch": 0.688839615668884, "grad_norm": 1.5293646233120795, "learning_rate": 4.661511575940712e-06, "loss": 0.7669, "step": 4194 }, { "epoch": 0.6890038597355671, "grad_norm": 2.1927700637288883, "learning_rate": 4.657013656507299e-06, "loss": 0.8357, "step": 4195 }, { "epoch": 0.6891681038022501, "grad_norm": 1.604379489583311, "learning_rate": 4.6525172493949335e-06, "loss": 0.8508, "step": 4196 }, { "epoch": 0.6893323478689333, "grad_norm": 1.5825456791060466, "learning_rate": 4.648022355876307e-06, "loss": 0.7618, "step": 4197 }, { "epoch": 0.6894965919356163, "grad_norm": 1.4364054707620553, "learning_rate": 4.643528977223689e-06, "loss": 0.7062, "step": 4198 }, { "epoch": 0.6896608360022994, "grad_norm": 1.649284638115554, "learning_rate": 4.63903711470893e-06, "loss": 0.7682, "step": 4199 }, { "epoch": 0.6898250800689825, "grad_norm": 1.9211574681656471, "learning_rate": 4.634546769603436e-06, "loss": 0.7724, "step": 4200 }, { "epoch": 0.6899893241356656, "grad_norm": 1.898395477295645, "learning_rate": 4.6300579431781915e-06, "loss": 0.8216, "step": 4201 }, { "epoch": 0.6901535682023487, "grad_norm": 0.6244132811670818, "learning_rate": 4.625570636703748e-06, "loss": 0.3121, "step": 4202 }, { "epoch": 0.6903178122690318, "grad_norm": 1.8780599529046955, "learning_rate": 4.621084851450229e-06, "loss": 0.8739, "step": 4203 }, { "epoch": 0.6904820563357149, "grad_norm": 1.8961791500183662, "learning_rate": 4.616600588687327e-06, "loss": 0.8076, "step": 4204 }, { "epoch": 0.6906463004023979, "grad_norm": 1.4765777401103908, "learning_rate": 4.6121178496843045e-06, "loss": 0.7611, "step": 4205 }, { "epoch": 0.6908105444690811, "grad_norm": 2.8835298872949395, "learning_rate": 4.607636635709988e-06, "loss": 0.7623, "step": 4206 }, { "epoch": 0.6909747885357641, "grad_norm": 2.4267313821601597, "learning_rate": 4.603156948032776e-06, "loss": 0.6389, "step": 4207 }, { "epoch": 0.6911390326024472, "grad_norm": 1.4004419777312318, "learning_rate": 4.5986787879206375e-06, "loss": 0.7905, "step": 4208 }, { "epoch": 0.6913032766691303, "grad_norm": 1.7867761672132518, "learning_rate": 4.594202156641105e-06, "loss": 0.6957, "step": 4209 }, { "epoch": 0.6914675207358134, "grad_norm": 1.7080196429260042, "learning_rate": 4.589727055461278e-06, "loss": 0.829, "step": 4210 }, { "epoch": 0.6916317648024966, "grad_norm": 2.6213757249524825, "learning_rate": 4.585253485647826e-06, "loss": 0.7262, "step": 4211 }, { "epoch": 0.6917960088691796, "grad_norm": 1.8864351049578385, "learning_rate": 4.5807814484669835e-06, "loss": 0.7912, "step": 4212 }, { "epoch": 0.6919602529358627, "grad_norm": 1.8518550933381783, "learning_rate": 4.5763109451845515e-06, "loss": 0.8008, "step": 4213 }, { "epoch": 0.6921244970025457, "grad_norm": 1.9351682669127273, "learning_rate": 4.571841977065895e-06, "loss": 0.7441, "step": 4214 }, { "epoch": 0.6922887410692289, "grad_norm": 1.7479048553226273, "learning_rate": 4.567374545375948e-06, "loss": 0.8165, "step": 4215 }, { "epoch": 0.6924529851359119, "grad_norm": 2.136141537308533, "learning_rate": 4.562908651379206e-06, "loss": 0.7047, "step": 4216 }, { "epoch": 0.692617229202595, "grad_norm": 0.598431840659357, "learning_rate": 4.558444296339731e-06, "loss": 0.2956, "step": 4217 }, { "epoch": 0.6927814732692782, "grad_norm": 1.3619732255100518, "learning_rate": 4.553981481521156e-06, "loss": 0.6951, "step": 4218 }, { "epoch": 0.6929457173359612, "grad_norm": 1.95313273454207, "learning_rate": 4.54952020818666e-06, "loss": 0.7002, "step": 4219 }, { "epoch": 0.6931099614026444, "grad_norm": 1.9973805877069362, "learning_rate": 4.545060477599002e-06, "loss": 0.7468, "step": 4220 }, { "epoch": 0.6932742054693274, "grad_norm": 2.272331793717865, "learning_rate": 4.540602291020499e-06, "loss": 0.7569, "step": 4221 }, { "epoch": 0.6934384495360105, "grad_norm": 1.5055615597839382, "learning_rate": 4.536145649713029e-06, "loss": 0.7443, "step": 4222 }, { "epoch": 0.6936026936026936, "grad_norm": 1.6933893633774952, "learning_rate": 4.531690554938043e-06, "loss": 0.7965, "step": 4223 }, { "epoch": 0.6937669376693767, "grad_norm": 1.7702506297102167, "learning_rate": 4.52723700795654e-06, "loss": 0.7771, "step": 4224 }, { "epoch": 0.6939311817360598, "grad_norm": 1.6547811235611238, "learning_rate": 4.522785010029087e-06, "loss": 0.643, "step": 4225 }, { "epoch": 0.6940954258027429, "grad_norm": 1.533998351844094, "learning_rate": 4.518334562415816e-06, "loss": 0.6668, "step": 4226 }, { "epoch": 0.694259669869426, "grad_norm": 0.6043339795333322, "learning_rate": 4.513885666376413e-06, "loss": 0.3146, "step": 4227 }, { "epoch": 0.694423913936109, "grad_norm": 1.8671884855633594, "learning_rate": 4.509438323170131e-06, "loss": 0.6657, "step": 4228 }, { "epoch": 0.6945881580027922, "grad_norm": 1.4473317905600132, "learning_rate": 4.504992534055781e-06, "loss": 0.7832, "step": 4229 }, { "epoch": 0.6947524020694752, "grad_norm": 1.6600836197835753, "learning_rate": 4.500548300291732e-06, "loss": 0.8051, "step": 4230 }, { "epoch": 0.6949166461361583, "grad_norm": 1.673114976307748, "learning_rate": 4.496105623135919e-06, "loss": 0.7303, "step": 4231 }, { "epoch": 0.6950808902028415, "grad_norm": 2.0487653529158862, "learning_rate": 4.4916645038458295e-06, "loss": 0.6681, "step": 4232 }, { "epoch": 0.6952451342695245, "grad_norm": 1.5561210703965036, "learning_rate": 4.487224943678513e-06, "loss": 0.7374, "step": 4233 }, { "epoch": 0.6954093783362076, "grad_norm": 1.8870798199867844, "learning_rate": 4.482786943890579e-06, "loss": 0.8051, "step": 4234 }, { "epoch": 0.6955736224028907, "grad_norm": 1.7345582721994128, "learning_rate": 4.478350505738194e-06, "loss": 0.7462, "step": 4235 }, { "epoch": 0.6957378664695738, "grad_norm": 1.6077536371873358, "learning_rate": 4.47391563047708e-06, "loss": 0.7407, "step": 4236 }, { "epoch": 0.6959021105362568, "grad_norm": 0.636588658124169, "learning_rate": 4.4694823193625225e-06, "loss": 0.3093, "step": 4237 }, { "epoch": 0.69606635460294, "grad_norm": 1.7093671689413104, "learning_rate": 4.465050573649359e-06, "loss": 0.8091, "step": 4238 }, { "epoch": 0.6962305986696231, "grad_norm": 0.5997260033800355, "learning_rate": 4.460620394591989e-06, "loss": 0.3372, "step": 4239 }, { "epoch": 0.6963948427363061, "grad_norm": 1.7333098043322177, "learning_rate": 4.45619178344436e-06, "loss": 0.7418, "step": 4240 }, { "epoch": 0.6965590868029893, "grad_norm": 2.254639804785435, "learning_rate": 4.451764741459983e-06, "loss": 0.6637, "step": 4241 }, { "epoch": 0.6967233308696723, "grad_norm": 1.6221059722857172, "learning_rate": 4.447339269891923e-06, "loss": 0.7295, "step": 4242 }, { "epoch": 0.6968875749363554, "grad_norm": 1.2915817238485863, "learning_rate": 4.442915369992802e-06, "loss": 0.8169, "step": 4243 }, { "epoch": 0.6970518190030385, "grad_norm": 2.7338250237926784, "learning_rate": 4.438493043014793e-06, "loss": 0.662, "step": 4244 }, { "epoch": 0.6972160630697216, "grad_norm": 1.9359473876800952, "learning_rate": 4.434072290209624e-06, "loss": 0.7929, "step": 4245 }, { "epoch": 0.6973803071364046, "grad_norm": 2.0301927918139855, "learning_rate": 4.429653112828589e-06, "loss": 0.7239, "step": 4246 }, { "epoch": 0.6975445512030878, "grad_norm": 2.343961658593991, "learning_rate": 4.4252355121225196e-06, "loss": 0.6963, "step": 4247 }, { "epoch": 0.6977087952697709, "grad_norm": 1.3902465128940213, "learning_rate": 4.4208194893418125e-06, "loss": 0.7618, "step": 4248 }, { "epoch": 0.697873039336454, "grad_norm": 2.2023282007508875, "learning_rate": 4.41640504573641e-06, "loss": 0.736, "step": 4249 }, { "epoch": 0.6980372834031371, "grad_norm": 2.10018296968998, "learning_rate": 4.411992182555812e-06, "loss": 0.7564, "step": 4250 }, { "epoch": 0.6982015274698201, "grad_norm": 2.630205104742473, "learning_rate": 4.407580901049071e-06, "loss": 0.7466, "step": 4251 }, { "epoch": 0.6983657715365033, "grad_norm": 1.5280775049563668, "learning_rate": 4.403171202464791e-06, "loss": 0.7837, "step": 4252 }, { "epoch": 0.6985300156031863, "grad_norm": 1.8370487956059816, "learning_rate": 4.398763088051127e-06, "loss": 0.7516, "step": 4253 }, { "epoch": 0.6986942596698694, "grad_norm": 0.6237846063922157, "learning_rate": 4.394356559055787e-06, "loss": 0.3173, "step": 4254 }, { "epoch": 0.6988585037365526, "grad_norm": 3.202887289099237, "learning_rate": 4.389951616726029e-06, "loss": 0.7408, "step": 4255 }, { "epoch": 0.6990227478032356, "grad_norm": 2.193647287207803, "learning_rate": 4.3855482623086645e-06, "loss": 0.7246, "step": 4256 }, { "epoch": 0.6991869918699187, "grad_norm": 1.6305017271308138, "learning_rate": 4.381146497050053e-06, "loss": 0.7356, "step": 4257 }, { "epoch": 0.6993512359366018, "grad_norm": 1.5827203652749293, "learning_rate": 4.3767463221961034e-06, "loss": 0.8048, "step": 4258 }, { "epoch": 0.6995154800032849, "grad_norm": 1.9601388515066736, "learning_rate": 4.372347738992278e-06, "loss": 0.7953, "step": 4259 }, { "epoch": 0.6996797240699679, "grad_norm": 2.1488124934297725, "learning_rate": 4.3679507486835835e-06, "loss": 0.7267, "step": 4260 }, { "epoch": 0.6998439681366511, "grad_norm": 1.672030066829806, "learning_rate": 4.363555352514587e-06, "loss": 0.7493, "step": 4261 }, { "epoch": 0.7000082122033342, "grad_norm": 1.7639009457271455, "learning_rate": 4.359161551729385e-06, "loss": 0.8073, "step": 4262 }, { "epoch": 0.7001724562700172, "grad_norm": 1.6695115634564868, "learning_rate": 4.354769347571638e-06, "loss": 0.719, "step": 4263 }, { "epoch": 0.7003367003367004, "grad_norm": 1.691869063786168, "learning_rate": 4.350378741284551e-06, "loss": 0.7866, "step": 4264 }, { "epoch": 0.7005009444033834, "grad_norm": 2.447805586012025, "learning_rate": 4.3459897341108756e-06, "loss": 0.6974, "step": 4265 }, { "epoch": 0.7006651884700665, "grad_norm": 1.952451789526993, "learning_rate": 4.341602327292912e-06, "loss": 0.7993, "step": 4266 }, { "epoch": 0.7008294325367496, "grad_norm": 2.2819707524639643, "learning_rate": 4.3372165220725045e-06, "loss": 0.7623, "step": 4267 }, { "epoch": 0.7009936766034327, "grad_norm": 1.9037776912773805, "learning_rate": 4.332832319691044e-06, "loss": 0.6874, "step": 4268 }, { "epoch": 0.7011579206701158, "grad_norm": 1.6477429983683958, "learning_rate": 4.328449721389475e-06, "loss": 0.7452, "step": 4269 }, { "epoch": 0.7013221647367989, "grad_norm": 1.5771285800526915, "learning_rate": 4.324068728408282e-06, "loss": 0.7894, "step": 4270 }, { "epoch": 0.701486408803482, "grad_norm": 2.259527323669414, "learning_rate": 4.319689341987493e-06, "loss": 0.817, "step": 4271 }, { "epoch": 0.701650652870165, "grad_norm": 2.0297209582663167, "learning_rate": 4.315311563366686e-06, "loss": 0.7643, "step": 4272 }, { "epoch": 0.7018148969368482, "grad_norm": 1.7669737161474004, "learning_rate": 4.3109353937849815e-06, "loss": 0.7871, "step": 4273 }, { "epoch": 0.7019791410035312, "grad_norm": 2.4108656315967965, "learning_rate": 4.306560834481045e-06, "loss": 0.7877, "step": 4274 }, { "epoch": 0.7021433850702143, "grad_norm": 1.569685165702422, "learning_rate": 4.302187886693087e-06, "loss": 0.7359, "step": 4275 }, { "epoch": 0.7023076291368975, "grad_norm": 2.6424139755130542, "learning_rate": 4.29781655165886e-06, "loss": 0.7127, "step": 4276 }, { "epoch": 0.7024718732035805, "grad_norm": 2.2069266070122713, "learning_rate": 4.293446830615662e-06, "loss": 0.702, "step": 4277 }, { "epoch": 0.7026361172702636, "grad_norm": 1.711470738704021, "learning_rate": 4.289078724800331e-06, "loss": 0.7663, "step": 4278 }, { "epoch": 0.7028003613369467, "grad_norm": 2.8041846159361947, "learning_rate": 4.2847122354492555e-06, "loss": 0.8391, "step": 4279 }, { "epoch": 0.7029646054036298, "grad_norm": 2.038088903444549, "learning_rate": 4.280347363798356e-06, "loss": 0.8298, "step": 4280 }, { "epoch": 0.7031288494703128, "grad_norm": 2.689424986266262, "learning_rate": 4.275984111083102e-06, "loss": 0.7534, "step": 4281 }, { "epoch": 0.703293093536996, "grad_norm": 2.0692078496466593, "learning_rate": 4.2716224785385075e-06, "loss": 0.7748, "step": 4282 }, { "epoch": 0.7034573376036791, "grad_norm": 1.836433625047028, "learning_rate": 4.267262467399114e-06, "loss": 0.7978, "step": 4283 }, { "epoch": 0.7036215816703622, "grad_norm": 1.6863482641058811, "learning_rate": 4.2629040788990205e-06, "loss": 0.7491, "step": 4284 }, { "epoch": 0.7037858257370453, "grad_norm": 0.6282007230363852, "learning_rate": 4.258547314271857e-06, "loss": 0.3037, "step": 4285 }, { "epoch": 0.7039500698037283, "grad_norm": 1.6407705362316163, "learning_rate": 4.254192174750796e-06, "loss": 0.7962, "step": 4286 }, { "epoch": 0.7041143138704115, "grad_norm": 2.000868314483962, "learning_rate": 4.249838661568554e-06, "loss": 0.8407, "step": 4287 }, { "epoch": 0.7042785579370945, "grad_norm": 1.664006590621862, "learning_rate": 4.24548677595738e-06, "loss": 0.7129, "step": 4288 }, { "epoch": 0.7044428020037776, "grad_norm": 1.8532722109051005, "learning_rate": 4.2411365191490684e-06, "loss": 0.8119, "step": 4289 }, { "epoch": 0.7046070460704607, "grad_norm": 0.5895469898474555, "learning_rate": 4.236787892374948e-06, "loss": 0.3209, "step": 4290 }, { "epoch": 0.7047712901371438, "grad_norm": 1.4616333393689396, "learning_rate": 4.232440896865888e-06, "loss": 0.7759, "step": 4291 }, { "epoch": 0.7049355342038269, "grad_norm": 1.545302524923281, "learning_rate": 4.2280955338523015e-06, "loss": 0.6585, "step": 4292 }, { "epoch": 0.70509977827051, "grad_norm": 1.4457393030978147, "learning_rate": 4.22375180456413e-06, "loss": 0.7265, "step": 4293 }, { "epoch": 0.7052640223371931, "grad_norm": 1.986475553140955, "learning_rate": 4.219409710230859e-06, "loss": 0.7959, "step": 4294 }, { "epoch": 0.7054282664038761, "grad_norm": 0.6518318090407716, "learning_rate": 4.215069252081509e-06, "loss": 0.317, "step": 4295 }, { "epoch": 0.7055925104705593, "grad_norm": 2.237537565328985, "learning_rate": 4.210730431344635e-06, "loss": 0.7803, "step": 4296 }, { "epoch": 0.7057567545372423, "grad_norm": 1.5764074113550153, "learning_rate": 4.206393249248334e-06, "loss": 0.7327, "step": 4297 }, { "epoch": 0.7059209986039254, "grad_norm": 2.0074087435332184, "learning_rate": 4.202057707020235e-06, "loss": 0.7747, "step": 4298 }, { "epoch": 0.7060852426706086, "grad_norm": 1.6622957092992803, "learning_rate": 4.1977238058875045e-06, "loss": 0.7611, "step": 4299 }, { "epoch": 0.7062494867372916, "grad_norm": 1.63572924389072, "learning_rate": 4.193391547076844e-06, "loss": 0.802, "step": 4300 }, { "epoch": 0.7064137308039747, "grad_norm": 2.498124519887322, "learning_rate": 4.189060931814489e-06, "loss": 0.7697, "step": 4301 }, { "epoch": 0.7065779748706578, "grad_norm": 1.617884241866548, "learning_rate": 4.184731961326213e-06, "loss": 0.7465, "step": 4302 }, { "epoch": 0.7067422189373409, "grad_norm": 2.096455097411565, "learning_rate": 4.180404636837321e-06, "loss": 0.8078, "step": 4303 }, { "epoch": 0.7069064630040239, "grad_norm": 1.859914174233471, "learning_rate": 4.176078959572656e-06, "loss": 0.7763, "step": 4304 }, { "epoch": 0.7070707070707071, "grad_norm": 2.1746750659851033, "learning_rate": 4.171754930756586e-06, "loss": 0.7692, "step": 4305 }, { "epoch": 0.7072349511373902, "grad_norm": 3.8517292585435627, "learning_rate": 4.167432551613021e-06, "loss": 0.731, "step": 4306 }, { "epoch": 0.7073991952040732, "grad_norm": 1.5598286747956336, "learning_rate": 4.163111823365403e-06, "loss": 0.7695, "step": 4307 }, { "epoch": 0.7075634392707564, "grad_norm": 2.1355102351569295, "learning_rate": 4.158792747236702e-06, "loss": 0.6999, "step": 4308 }, { "epoch": 0.7077276833374394, "grad_norm": 1.6342914639042052, "learning_rate": 4.154475324449425e-06, "loss": 0.7509, "step": 4309 }, { "epoch": 0.7078919274041225, "grad_norm": 1.9254591471433253, "learning_rate": 4.1501595562256105e-06, "loss": 0.7432, "step": 4310 }, { "epoch": 0.7080561714708056, "grad_norm": 1.7187107353867077, "learning_rate": 4.145845443786827e-06, "loss": 0.7694, "step": 4311 }, { "epoch": 0.7082204155374887, "grad_norm": 1.4645158865793135, "learning_rate": 4.141532988354173e-06, "loss": 0.7144, "step": 4312 }, { "epoch": 0.7083846596041719, "grad_norm": 3.086561279519324, "learning_rate": 4.137222191148282e-06, "loss": 0.7373, "step": 4313 }, { "epoch": 0.7085489036708549, "grad_norm": 1.6050654823776036, "learning_rate": 4.132913053389317e-06, "loss": 0.7118, "step": 4314 }, { "epoch": 0.708713147737538, "grad_norm": 1.4852967130111854, "learning_rate": 4.128605576296964e-06, "loss": 0.6993, "step": 4315 }, { "epoch": 0.708877391804221, "grad_norm": 1.7576806699658687, "learning_rate": 4.1242997610904546e-06, "loss": 0.7018, "step": 4316 }, { "epoch": 0.7090416358709042, "grad_norm": 1.7461735069115638, "learning_rate": 4.119995608988536e-06, "loss": 0.7702, "step": 4317 }, { "epoch": 0.7092058799375872, "grad_norm": 1.6924119278195955, "learning_rate": 4.11569312120949e-06, "loss": 0.788, "step": 4318 }, { "epoch": 0.7093701240042704, "grad_norm": 2.0028065504894625, "learning_rate": 4.111392298971127e-06, "loss": 0.7523, "step": 4319 }, { "epoch": 0.7095343680709535, "grad_norm": 1.7501107323151572, "learning_rate": 4.107093143490785e-06, "loss": 0.7503, "step": 4320 }, { "epoch": 0.7096986121376365, "grad_norm": 1.5236475311453819, "learning_rate": 4.102795655985331e-06, "loss": 0.7901, "step": 4321 }, { "epoch": 0.7098628562043197, "grad_norm": 1.7517062059035504, "learning_rate": 4.098499837671159e-06, "loss": 0.8366, "step": 4322 }, { "epoch": 0.7100271002710027, "grad_norm": 1.72366245409719, "learning_rate": 4.0942056897641934e-06, "loss": 0.7224, "step": 4323 }, { "epoch": 0.7101913443376858, "grad_norm": 1.7865987844383604, "learning_rate": 4.089913213479882e-06, "loss": 0.7716, "step": 4324 }, { "epoch": 0.7103555884043689, "grad_norm": 4.4364653333068285, "learning_rate": 4.085622410033203e-06, "loss": 0.7568, "step": 4325 }, { "epoch": 0.710519832471052, "grad_norm": 1.6227458369214776, "learning_rate": 4.081333280638661e-06, "loss": 0.8237, "step": 4326 }, { "epoch": 0.710684076537735, "grad_norm": 1.5164521719398492, "learning_rate": 4.077045826510277e-06, "loss": 0.7755, "step": 4327 }, { "epoch": 0.7108483206044182, "grad_norm": 1.3103309085254848, "learning_rate": 4.072760048861614e-06, "loss": 0.7281, "step": 4328 }, { "epoch": 0.7110125646711013, "grad_norm": 1.7742965640511075, "learning_rate": 4.068475948905746e-06, "loss": 0.8295, "step": 4329 }, { "epoch": 0.7111768087377843, "grad_norm": 3.7243341359320046, "learning_rate": 4.064193527855285e-06, "loss": 0.7921, "step": 4330 }, { "epoch": 0.7113410528044675, "grad_norm": 3.014263605191098, "learning_rate": 4.0599127869223565e-06, "loss": 0.7553, "step": 4331 }, { "epoch": 0.7115052968711505, "grad_norm": 1.6511339426081197, "learning_rate": 4.055633727318617e-06, "loss": 0.7718, "step": 4332 }, { "epoch": 0.7116695409378336, "grad_norm": 1.6906296756440902, "learning_rate": 4.051356350255246e-06, "loss": 0.7505, "step": 4333 }, { "epoch": 0.7118337850045167, "grad_norm": 1.702002494957694, "learning_rate": 4.047080656942943e-06, "loss": 0.7984, "step": 4334 }, { "epoch": 0.7119980290711998, "grad_norm": 1.4324810693626155, "learning_rate": 4.042806648591938e-06, "loss": 0.7571, "step": 4335 }, { "epoch": 0.7121622731378829, "grad_norm": 1.6809463655458, "learning_rate": 4.038534326411978e-06, "loss": 0.8125, "step": 4336 }, { "epoch": 0.712326517204566, "grad_norm": 1.4353790625143792, "learning_rate": 4.0342636916123355e-06, "loss": 0.772, "step": 4337 }, { "epoch": 0.7124907612712491, "grad_norm": 1.5496126342704502, "learning_rate": 4.0299947454018e-06, "loss": 0.8128, "step": 4338 }, { "epoch": 0.7126550053379321, "grad_norm": 2.132435065616249, "learning_rate": 4.025727488988696e-06, "loss": 0.6982, "step": 4339 }, { "epoch": 0.7128192494046153, "grad_norm": 1.8787442565538746, "learning_rate": 4.0214619235808575e-06, "loss": 0.7351, "step": 4340 }, { "epoch": 0.7129834934712983, "grad_norm": 0.6129046044908114, "learning_rate": 4.017198050385644e-06, "loss": 0.3343, "step": 4341 }, { "epoch": 0.7131477375379814, "grad_norm": 1.7091376054139245, "learning_rate": 4.012935870609934e-06, "loss": 0.7197, "step": 4342 }, { "epoch": 0.7133119816046646, "grad_norm": 1.675225804474349, "learning_rate": 4.008675385460131e-06, "loss": 0.7215, "step": 4343 }, { "epoch": 0.7134762256713476, "grad_norm": 2.0301480570276955, "learning_rate": 4.0044165961421565e-06, "loss": 0.7052, "step": 4344 }, { "epoch": 0.7136404697380307, "grad_norm": 2.690823584955875, "learning_rate": 4.000159503861451e-06, "loss": 0.8187, "step": 4345 }, { "epoch": 0.7138047138047138, "grad_norm": 1.875504163191729, "learning_rate": 3.9959041098229735e-06, "loss": 0.7568, "step": 4346 }, { "epoch": 0.7139689578713969, "grad_norm": 1.54553603278069, "learning_rate": 3.991650415231211e-06, "loss": 0.7796, "step": 4347 }, { "epoch": 0.7141332019380799, "grad_norm": 1.767067792793066, "learning_rate": 3.987398421290155e-06, "loss": 0.7372, "step": 4348 }, { "epoch": 0.7142974460047631, "grad_norm": 2.3849781532698717, "learning_rate": 3.983148129203326e-06, "loss": 0.8137, "step": 4349 }, { "epoch": 0.7144616900714462, "grad_norm": 1.8004255802635316, "learning_rate": 3.978899540173759e-06, "loss": 0.8282, "step": 4350 }, { "epoch": 0.7146259341381292, "grad_norm": 2.1816893370324273, "learning_rate": 3.974652655404012e-06, "loss": 0.7951, "step": 4351 }, { "epoch": 0.7147901782048124, "grad_norm": 1.8961242046651048, "learning_rate": 3.970407476096154e-06, "loss": 0.6713, "step": 4352 }, { "epoch": 0.7149544222714954, "grad_norm": 2.143131350252751, "learning_rate": 3.966164003451775e-06, "loss": 0.7151, "step": 4353 }, { "epoch": 0.7151186663381786, "grad_norm": 2.2501929935507126, "learning_rate": 3.961922238671981e-06, "loss": 0.7708, "step": 4354 }, { "epoch": 0.7152829104048616, "grad_norm": 2.1701310158996354, "learning_rate": 3.957682182957394e-06, "loss": 0.7526, "step": 4355 }, { "epoch": 0.7154471544715447, "grad_norm": 1.6819894158889694, "learning_rate": 3.953443837508153e-06, "loss": 0.7569, "step": 4356 }, { "epoch": 0.7156113985382279, "grad_norm": 1.7495168152576825, "learning_rate": 3.949207203523913e-06, "loss": 0.7331, "step": 4357 }, { "epoch": 0.7157756426049109, "grad_norm": 1.9061597591508077, "learning_rate": 3.944972282203844e-06, "loss": 0.7111, "step": 4358 }, { "epoch": 0.715939886671594, "grad_norm": 1.7401759023203143, "learning_rate": 3.940739074746632e-06, "loss": 0.6989, "step": 4359 }, { "epoch": 0.7161041307382771, "grad_norm": 1.6208511214466899, "learning_rate": 3.936507582350479e-06, "loss": 0.7394, "step": 4360 }, { "epoch": 0.7162683748049602, "grad_norm": 2.3431373141742338, "learning_rate": 3.932277806213093e-06, "loss": 0.7928, "step": 4361 }, { "epoch": 0.7164326188716432, "grad_norm": 0.6032935376993859, "learning_rate": 3.928049747531711e-06, "loss": 0.3254, "step": 4362 }, { "epoch": 0.7165968629383264, "grad_norm": 1.4670268978672183, "learning_rate": 3.923823407503076e-06, "loss": 0.7637, "step": 4363 }, { "epoch": 0.7167611070050094, "grad_norm": 1.9041711408744655, "learning_rate": 3.919598787323442e-06, "loss": 0.7182, "step": 4364 }, { "epoch": 0.7169253510716925, "grad_norm": 2.000742441590783, "learning_rate": 3.915375888188579e-06, "loss": 0.8683, "step": 4365 }, { "epoch": 0.7170895951383757, "grad_norm": 2.0193552389310647, "learning_rate": 3.9111547112937685e-06, "loss": 0.7651, "step": 4366 }, { "epoch": 0.7172538392050587, "grad_norm": 1.6619441256699632, "learning_rate": 3.906935257833809e-06, "loss": 0.8255, "step": 4367 }, { "epoch": 0.7174180832717418, "grad_norm": 1.7973898436643985, "learning_rate": 3.902717529003005e-06, "loss": 0.8441, "step": 4368 }, { "epoch": 0.7175823273384249, "grad_norm": 1.648198843119471, "learning_rate": 3.898501525995181e-06, "loss": 0.7913, "step": 4369 }, { "epoch": 0.717746571405108, "grad_norm": 1.8820321366076158, "learning_rate": 3.89428725000366e-06, "loss": 0.7188, "step": 4370 }, { "epoch": 0.717910815471791, "grad_norm": 1.6941318989865108, "learning_rate": 3.890074702221288e-06, "loss": 0.7024, "step": 4371 }, { "epoch": 0.7180750595384742, "grad_norm": 4.80151066943777, "learning_rate": 3.8858638838404175e-06, "loss": 0.719, "step": 4372 }, { "epoch": 0.7182393036051573, "grad_norm": 1.5897617184755626, "learning_rate": 3.88165479605291e-06, "loss": 0.7921, "step": 4373 }, { "epoch": 0.7184035476718403, "grad_norm": 1.9604353568386883, "learning_rate": 3.8774474400501415e-06, "loss": 0.6792, "step": 4374 }, { "epoch": 0.7185677917385235, "grad_norm": 2.1048783457321085, "learning_rate": 3.873241817022996e-06, "loss": 0.8258, "step": 4375 }, { "epoch": 0.7187320358052065, "grad_norm": 5.638391988056314, "learning_rate": 3.869037928161863e-06, "loss": 0.7589, "step": 4376 }, { "epoch": 0.7188962798718896, "grad_norm": 0.6220466802271273, "learning_rate": 3.8648357746566456e-06, "loss": 0.2772, "step": 4377 }, { "epoch": 0.7190605239385727, "grad_norm": 1.7882478513877318, "learning_rate": 3.860635357696756e-06, "loss": 0.7819, "step": 4378 }, { "epoch": 0.7192247680052558, "grad_norm": 1.8047183906336772, "learning_rate": 3.8564366784711116e-06, "loss": 0.7599, "step": 4379 }, { "epoch": 0.719389012071939, "grad_norm": 1.8931788879801663, "learning_rate": 3.852239738168141e-06, "loss": 0.8076, "step": 4380 }, { "epoch": 0.719553256138622, "grad_norm": 1.8473571752106652, "learning_rate": 3.848044537975778e-06, "loss": 0.7771, "step": 4381 }, { "epoch": 0.7197175002053051, "grad_norm": 1.8815188486452856, "learning_rate": 3.843851079081467e-06, "loss": 0.6854, "step": 4382 }, { "epoch": 0.7198817442719881, "grad_norm": 1.5253991359507943, "learning_rate": 3.839659362672156e-06, "loss": 0.7639, "step": 4383 }, { "epoch": 0.7200459883386713, "grad_norm": 1.5566721653178264, "learning_rate": 3.835469389934299e-06, "loss": 0.7946, "step": 4384 }, { "epoch": 0.7202102324053543, "grad_norm": 2.3011888347976543, "learning_rate": 3.8312811620538655e-06, "loss": 0.7383, "step": 4385 }, { "epoch": 0.7203744764720375, "grad_norm": 1.691327005876147, "learning_rate": 3.8270946802163216e-06, "loss": 0.7497, "step": 4386 }, { "epoch": 0.7205387205387206, "grad_norm": 1.4200159737664741, "learning_rate": 3.822909945606641e-06, "loss": 0.7665, "step": 4387 }, { "epoch": 0.7207029646054036, "grad_norm": 1.7032814217525898, "learning_rate": 3.818726959409305e-06, "loss": 0.7404, "step": 4388 }, { "epoch": 0.7208672086720868, "grad_norm": 1.671123575623072, "learning_rate": 3.8145457228082995e-06, "loss": 0.6951, "step": 4389 }, { "epoch": 0.7210314527387698, "grad_norm": 1.812401843897819, "learning_rate": 3.8103662369871143e-06, "loss": 0.702, "step": 4390 }, { "epoch": 0.7211956968054529, "grad_norm": 2.2967717716894414, "learning_rate": 3.806188503128746e-06, "loss": 0.7761, "step": 4391 }, { "epoch": 0.721359940872136, "grad_norm": 2.036594242206249, "learning_rate": 3.802012522415689e-06, "loss": 0.7923, "step": 4392 }, { "epoch": 0.7215241849388191, "grad_norm": 2.0191820067880166, "learning_rate": 3.7978382960299476e-06, "loss": 0.7313, "step": 4393 }, { "epoch": 0.7216884290055022, "grad_norm": 1.7737384767140503, "learning_rate": 3.793665825153029e-06, "loss": 0.7088, "step": 4394 }, { "epoch": 0.7218526730721853, "grad_norm": 1.6249304426582691, "learning_rate": 3.7894951109659404e-06, "loss": 0.7358, "step": 4395 }, { "epoch": 0.7220169171388684, "grad_norm": 1.703443546771971, "learning_rate": 3.785326154649196e-06, "loss": 0.7496, "step": 4396 }, { "epoch": 0.7221811612055514, "grad_norm": 2.1205618367412016, "learning_rate": 3.781158957382809e-06, "loss": 0.7655, "step": 4397 }, { "epoch": 0.7223454052722346, "grad_norm": 1.836316059311571, "learning_rate": 3.776993520346295e-06, "loss": 0.7402, "step": 4398 }, { "epoch": 0.7225096493389176, "grad_norm": 0.6787213417995603, "learning_rate": 3.772829844718674e-06, "loss": 0.3479, "step": 4399 }, { "epoch": 0.7226738934056007, "grad_norm": 1.5344388443929657, "learning_rate": 3.7686679316784635e-06, "loss": 0.7902, "step": 4400 }, { "epoch": 0.7228381374722838, "grad_norm": 1.62403403851692, "learning_rate": 3.764507782403686e-06, "loss": 0.7386, "step": 4401 }, { "epoch": 0.7230023815389669, "grad_norm": 10.816972618527988, "learning_rate": 3.760349398071862e-06, "loss": 0.7042, "step": 4402 }, { "epoch": 0.72316662560565, "grad_norm": 1.551811538003752, "learning_rate": 3.756192779860014e-06, "loss": 0.6973, "step": 4403 }, { "epoch": 0.7233308696723331, "grad_norm": 2.0718413278867374, "learning_rate": 3.752037928944664e-06, "loss": 0.7807, "step": 4404 }, { "epoch": 0.7234951137390162, "grad_norm": 2.0712599490535055, "learning_rate": 3.7478848465018336e-06, "loss": 0.7017, "step": 4405 }, { "epoch": 0.7236593578056992, "grad_norm": 2.0830561948971598, "learning_rate": 3.7437335337070445e-06, "loss": 0.6852, "step": 4406 }, { "epoch": 0.7238236018723824, "grad_norm": 2.4884198809216427, "learning_rate": 3.739583991735316e-06, "loss": 0.7412, "step": 4407 }, { "epoch": 0.7239878459390654, "grad_norm": 1.5966753823617281, "learning_rate": 3.7354362217611652e-06, "loss": 0.8208, "step": 4408 }, { "epoch": 0.7241520900057485, "grad_norm": 0.6366579274069425, "learning_rate": 3.7312902249586146e-06, "loss": 0.3418, "step": 4409 }, { "epoch": 0.7243163340724317, "grad_norm": 1.7918502142113875, "learning_rate": 3.7271460025011785e-06, "loss": 0.7788, "step": 4410 }, { "epoch": 0.7244805781391147, "grad_norm": 2.1856155560725132, "learning_rate": 3.723003555561869e-06, "loss": 0.6792, "step": 4411 }, { "epoch": 0.7246448222057978, "grad_norm": 1.4600171659153847, "learning_rate": 3.7188628853132023e-06, "loss": 0.7982, "step": 4412 }, { "epoch": 0.7248090662724809, "grad_norm": 2.338173364863663, "learning_rate": 3.714723992927177e-06, "loss": 0.8152, "step": 4413 }, { "epoch": 0.724973310339164, "grad_norm": 1.600750953566736, "learning_rate": 3.710586879575302e-06, "loss": 0.7188, "step": 4414 }, { "epoch": 0.725137554405847, "grad_norm": 1.5648591410106554, "learning_rate": 3.70645154642858e-06, "loss": 0.7525, "step": 4415 }, { "epoch": 0.7253017984725302, "grad_norm": 2.020863699959709, "learning_rate": 3.702317994657506e-06, "loss": 0.7866, "step": 4416 }, { "epoch": 0.7254660425392133, "grad_norm": 1.9506049727801473, "learning_rate": 3.6981862254320757e-06, "loss": 0.7337, "step": 4417 }, { "epoch": 0.7256302866058963, "grad_norm": 1.7671050288945405, "learning_rate": 3.694056239921776e-06, "loss": 0.7697, "step": 4418 }, { "epoch": 0.7257945306725795, "grad_norm": 2.1125872049778587, "learning_rate": 3.689928039295592e-06, "loss": 0.7569, "step": 4419 }, { "epoch": 0.7259587747392625, "grad_norm": 1.7360543770417707, "learning_rate": 3.6858016247219998e-06, "loss": 0.7366, "step": 4420 }, { "epoch": 0.7261230188059457, "grad_norm": 2.152145155445065, "learning_rate": 3.6816769973689736e-06, "loss": 0.722, "step": 4421 }, { "epoch": 0.7262872628726287, "grad_norm": 1.4656915102645596, "learning_rate": 3.677554158403982e-06, "loss": 0.7681, "step": 4422 }, { "epoch": 0.7264515069393118, "grad_norm": 3.9071068229880463, "learning_rate": 3.6734331089939835e-06, "loss": 0.7368, "step": 4423 }, { "epoch": 0.726615751005995, "grad_norm": 1.8093679536188316, "learning_rate": 3.669313850305435e-06, "loss": 0.8041, "step": 4424 }, { "epoch": 0.726779995072678, "grad_norm": 1.768774405014242, "learning_rate": 3.6651963835042813e-06, "loss": 0.7196, "step": 4425 }, { "epoch": 0.7269442391393611, "grad_norm": 0.5890932917364828, "learning_rate": 3.6610807097559644e-06, "loss": 0.3056, "step": 4426 }, { "epoch": 0.7271084832060442, "grad_norm": 1.7243401635153797, "learning_rate": 3.6569668302254167e-06, "loss": 0.7053, "step": 4427 }, { "epoch": 0.7272727272727273, "grad_norm": 1.5553846155194324, "learning_rate": 3.6528547460770636e-06, "loss": 0.7762, "step": 4428 }, { "epoch": 0.7274369713394103, "grad_norm": 1.7458854622068225, "learning_rate": 3.648744458474821e-06, "loss": 0.7269, "step": 4429 }, { "epoch": 0.7276012154060935, "grad_norm": 1.7373710144784587, "learning_rate": 3.6446359685820974e-06, "loss": 0.7455, "step": 4430 }, { "epoch": 0.7277654594727766, "grad_norm": 2.0081214414502, "learning_rate": 3.6405292775617886e-06, "loss": 0.7233, "step": 4431 }, { "epoch": 0.7279297035394596, "grad_norm": 1.6289245704565856, "learning_rate": 3.6364243865762926e-06, "loss": 0.7337, "step": 4432 }, { "epoch": 0.7280939476061428, "grad_norm": 1.9095343213095235, "learning_rate": 3.6323212967874866e-06, "loss": 0.7122, "step": 4433 }, { "epoch": 0.7282581916728258, "grad_norm": 2.0664500797933094, "learning_rate": 3.628220009356743e-06, "loss": 0.6869, "step": 4434 }, { "epoch": 0.7284224357395089, "grad_norm": 1.7732639738190494, "learning_rate": 3.6241205254449197e-06, "loss": 0.7063, "step": 4435 }, { "epoch": 0.728586679806192, "grad_norm": 2.140618622715774, "learning_rate": 3.6200228462123666e-06, "loss": 0.8103, "step": 4436 }, { "epoch": 0.7287509238728751, "grad_norm": 1.7975772599180366, "learning_rate": 3.6159269728189237e-06, "loss": 0.7377, "step": 4437 }, { "epoch": 0.7289151679395581, "grad_norm": 1.306599651088807, "learning_rate": 3.6118329064239222e-06, "loss": 0.7302, "step": 4438 }, { "epoch": 0.7290794120062413, "grad_norm": 2.5117816170746106, "learning_rate": 3.6077406481861756e-06, "loss": 0.819, "step": 4439 }, { "epoch": 0.7292436560729244, "grad_norm": 1.6464956684952867, "learning_rate": 3.6036501992639907e-06, "loss": 0.6854, "step": 4440 }, { "epoch": 0.7294079001396074, "grad_norm": 1.6066070679708107, "learning_rate": 3.59956156081516e-06, "loss": 0.7395, "step": 4441 }, { "epoch": 0.7295721442062906, "grad_norm": 1.7808974988994217, "learning_rate": 3.5954747339969653e-06, "loss": 0.6718, "step": 4442 }, { "epoch": 0.7297363882729736, "grad_norm": 1.7630109620117378, "learning_rate": 3.5913897199661716e-06, "loss": 0.7044, "step": 4443 }, { "epoch": 0.7299006323396567, "grad_norm": 4.416018795188322, "learning_rate": 3.587306519879037e-06, "loss": 0.744, "step": 4444 }, { "epoch": 0.7300648764063398, "grad_norm": 1.7239602674573054, "learning_rate": 3.5832251348912995e-06, "loss": 0.6954, "step": 4445 }, { "epoch": 0.7302291204730229, "grad_norm": 1.89786166901012, "learning_rate": 3.5791455661581877e-06, "loss": 0.7615, "step": 4446 }, { "epoch": 0.730393364539706, "grad_norm": 1.492295276005971, "learning_rate": 3.5750678148344153e-06, "loss": 0.742, "step": 4447 }, { "epoch": 0.7305576086063891, "grad_norm": 1.693889064310693, "learning_rate": 3.5709918820741816e-06, "loss": 0.7807, "step": 4448 }, { "epoch": 0.7307218526730722, "grad_norm": 1.3579962021631675, "learning_rate": 3.5669177690311696e-06, "loss": 0.7541, "step": 4449 }, { "epoch": 0.7308860967397552, "grad_norm": 1.9009966508640581, "learning_rate": 3.56284547685855e-06, "loss": 0.7357, "step": 4450 }, { "epoch": 0.7310503408064384, "grad_norm": 1.727183661343111, "learning_rate": 3.5587750067089745e-06, "loss": 0.7646, "step": 4451 }, { "epoch": 0.7312145848731214, "grad_norm": 1.8481846823700603, "learning_rate": 3.5547063597345833e-06, "loss": 0.7894, "step": 4452 }, { "epoch": 0.7313788289398045, "grad_norm": 1.6940919042206686, "learning_rate": 3.5506395370869963e-06, "loss": 0.7614, "step": 4453 }, { "epoch": 0.7315430730064877, "grad_norm": 1.3921694876673427, "learning_rate": 3.546574539917317e-06, "loss": 0.7298, "step": 4454 }, { "epoch": 0.7317073170731707, "grad_norm": 0.6173285154572269, "learning_rate": 3.5425113693761436e-06, "loss": 0.3245, "step": 4455 }, { "epoch": 0.7318715611398539, "grad_norm": 1.5556642559864402, "learning_rate": 3.5384500266135393e-06, "loss": 0.7531, "step": 4456 }, { "epoch": 0.7320358052065369, "grad_norm": 1.5850051338521938, "learning_rate": 3.5343905127790614e-06, "loss": 0.7496, "step": 4457 }, { "epoch": 0.73220004927322, "grad_norm": 2.073209580747585, "learning_rate": 3.5303328290217453e-06, "loss": 0.6845, "step": 4458 }, { "epoch": 0.732364293339903, "grad_norm": 2.957759602073293, "learning_rate": 3.526276976490112e-06, "loss": 0.7692, "step": 4459 }, { "epoch": 0.7325285374065862, "grad_norm": 2.164116438412018, "learning_rate": 3.52222295633216e-06, "loss": 0.6854, "step": 4460 }, { "epoch": 0.7326927814732693, "grad_norm": 1.5397985583274447, "learning_rate": 3.5181707696953728e-06, "loss": 0.7447, "step": 4461 }, { "epoch": 0.7328570255399524, "grad_norm": 1.7885300379589084, "learning_rate": 3.5141204177267117e-06, "loss": 0.6253, "step": 4462 }, { "epoch": 0.7330212696066355, "grad_norm": 1.7007812871561223, "learning_rate": 3.5100719015726228e-06, "loss": 0.7559, "step": 4463 }, { "epoch": 0.7331855136733185, "grad_norm": 1.7990258432002197, "learning_rate": 3.506025222379027e-06, "loss": 0.6925, "step": 4464 }, { "epoch": 0.7333497577400017, "grad_norm": 1.5924888616861608, "learning_rate": 3.501980381291331e-06, "loss": 0.72, "step": 4465 }, { "epoch": 0.7335140018066847, "grad_norm": 1.8591557655675561, "learning_rate": 3.497937379454417e-06, "loss": 0.6959, "step": 4466 }, { "epoch": 0.7336782458733678, "grad_norm": 2.087881809786579, "learning_rate": 3.493896218012649e-06, "loss": 0.7978, "step": 4467 }, { "epoch": 0.733842489940051, "grad_norm": 2.1369174517883236, "learning_rate": 3.4898568981098678e-06, "loss": 0.7063, "step": 4468 }, { "epoch": 0.734006734006734, "grad_norm": 1.9847289315484247, "learning_rate": 3.4858194208893967e-06, "loss": 0.7404, "step": 4469 }, { "epoch": 0.7341709780734171, "grad_norm": 0.6200470929239018, "learning_rate": 3.481783787494033e-06, "loss": 0.3001, "step": 4470 }, { "epoch": 0.7343352221401002, "grad_norm": 1.8113399021691938, "learning_rate": 3.477749999066056e-06, "loss": 0.7365, "step": 4471 }, { "epoch": 0.7344994662067833, "grad_norm": 1.428648344438699, "learning_rate": 3.4737180567472196e-06, "loss": 0.7489, "step": 4472 }, { "epoch": 0.7346637102734663, "grad_norm": 1.57738922843208, "learning_rate": 3.469687961678757e-06, "loss": 0.7391, "step": 4473 }, { "epoch": 0.7348279543401495, "grad_norm": 1.8466346331459889, "learning_rate": 3.465659715001379e-06, "loss": 0.7898, "step": 4474 }, { "epoch": 0.7349921984068326, "grad_norm": 1.9213742986122588, "learning_rate": 3.461633317855271e-06, "loss": 0.7405, "step": 4475 }, { "epoch": 0.7351564424735156, "grad_norm": 1.5866673830216962, "learning_rate": 3.4576087713800966e-06, "loss": 0.7216, "step": 4476 }, { "epoch": 0.7353206865401988, "grad_norm": 1.690676950871025, "learning_rate": 3.4535860767149963e-06, "loss": 0.6647, "step": 4477 }, { "epoch": 0.7354849306068818, "grad_norm": 2.0385329791360602, "learning_rate": 3.4495652349985844e-06, "loss": 0.7304, "step": 4478 }, { "epoch": 0.7356491746735649, "grad_norm": 1.7161707734195546, "learning_rate": 3.4455462473689515e-06, "loss": 0.7532, "step": 4479 }, { "epoch": 0.735813418740248, "grad_norm": 2.9399076623610125, "learning_rate": 3.4415291149636642e-06, "loss": 0.75, "step": 4480 }, { "epoch": 0.7359776628069311, "grad_norm": 1.6440422931604608, "learning_rate": 3.4375138389197627e-06, "loss": 0.744, "step": 4481 }, { "epoch": 0.7361419068736141, "grad_norm": 2.009041555910574, "learning_rate": 3.433500420373763e-06, "loss": 0.7349, "step": 4482 }, { "epoch": 0.7363061509402973, "grad_norm": 1.7321081888511638, "learning_rate": 3.429488860461655e-06, "loss": 0.7673, "step": 4483 }, { "epoch": 0.7364703950069804, "grad_norm": 1.5569402823411551, "learning_rate": 3.425479160318902e-06, "loss": 0.7967, "step": 4484 }, { "epoch": 0.7366346390736634, "grad_norm": 2.2420563053777713, "learning_rate": 3.421471321080441e-06, "loss": 0.7316, "step": 4485 }, { "epoch": 0.7367988831403466, "grad_norm": 1.8452766587861555, "learning_rate": 3.4174653438806814e-06, "loss": 0.7234, "step": 4486 }, { "epoch": 0.7369631272070296, "grad_norm": 2.347836281707389, "learning_rate": 3.4134612298535084e-06, "loss": 0.6916, "step": 4487 }, { "epoch": 0.7371273712737128, "grad_norm": 1.499037688035379, "learning_rate": 3.4094589801322773e-06, "loss": 0.7562, "step": 4488 }, { "epoch": 0.7372916153403958, "grad_norm": 1.861084530409473, "learning_rate": 3.4054585958498177e-06, "loss": 0.7042, "step": 4489 }, { "epoch": 0.7374558594070789, "grad_norm": 1.812546210283505, "learning_rate": 3.401460078138428e-06, "loss": 0.7328, "step": 4490 }, { "epoch": 0.7376201034737621, "grad_norm": 2.0754313733944643, "learning_rate": 3.3974634281298815e-06, "loss": 0.7286, "step": 4491 }, { "epoch": 0.7377843475404451, "grad_norm": 1.5520119538090542, "learning_rate": 3.3934686469554203e-06, "loss": 0.8153, "step": 4492 }, { "epoch": 0.7379485916071282, "grad_norm": 1.988635375256496, "learning_rate": 3.389475735745761e-06, "loss": 0.6887, "step": 4493 }, { "epoch": 0.7381128356738113, "grad_norm": 1.701778634983921, "learning_rate": 3.3854846956310862e-06, "loss": 0.6753, "step": 4494 }, { "epoch": 0.7382770797404944, "grad_norm": 1.6226672506171613, "learning_rate": 3.381495527741053e-06, "loss": 0.7354, "step": 4495 }, { "epoch": 0.7384413238071774, "grad_norm": 2.2676892596480327, "learning_rate": 3.377508233204787e-06, "loss": 0.7461, "step": 4496 }, { "epoch": 0.7386055678738606, "grad_norm": 1.6089134875300115, "learning_rate": 3.3735228131508824e-06, "loss": 0.7946, "step": 4497 }, { "epoch": 0.7387698119405437, "grad_norm": 1.69338951036869, "learning_rate": 3.3695392687074045e-06, "loss": 0.8133, "step": 4498 }, { "epoch": 0.7389340560072267, "grad_norm": 2.061699324001796, "learning_rate": 3.3655576010018875e-06, "loss": 0.6687, "step": 4499 }, { "epoch": 0.7390983000739099, "grad_norm": 1.7291476923731905, "learning_rate": 3.361577811161335e-06, "loss": 0.7112, "step": 4500 }, { "epoch": 0.7392625441405929, "grad_norm": 1.9619025560660586, "learning_rate": 3.3575999003122162e-06, "loss": 0.7471, "step": 4501 }, { "epoch": 0.739426788207276, "grad_norm": 1.3699849975416727, "learning_rate": 3.3536238695804713e-06, "loss": 0.7791, "step": 4502 }, { "epoch": 0.7395910322739591, "grad_norm": 1.662222711971544, "learning_rate": 3.3496497200915067e-06, "loss": 0.6245, "step": 4503 }, { "epoch": 0.7397552763406422, "grad_norm": 1.611521906014863, "learning_rate": 3.3456774529701987e-06, "loss": 0.8259, "step": 4504 }, { "epoch": 0.7399195204073253, "grad_norm": 2.0386258125320156, "learning_rate": 3.3417070693408882e-06, "loss": 0.7397, "step": 4505 }, { "epoch": 0.7400837644740084, "grad_norm": 0.614194889569704, "learning_rate": 3.3377385703273835e-06, "loss": 0.3013, "step": 4506 }, { "epoch": 0.7402480085406915, "grad_norm": 1.6788005210277663, "learning_rate": 3.3337719570529603e-06, "loss": 0.7858, "step": 4507 }, { "epoch": 0.7404122526073745, "grad_norm": 1.9002703736994988, "learning_rate": 3.3298072306403595e-06, "loss": 0.7637, "step": 4508 }, { "epoch": 0.7405764966740577, "grad_norm": 1.7551160548998335, "learning_rate": 3.32584439221179e-06, "loss": 0.7739, "step": 4509 }, { "epoch": 0.7407407407407407, "grad_norm": 1.5497876608684227, "learning_rate": 3.3218834428889244e-06, "loss": 0.7552, "step": 4510 }, { "epoch": 0.7409049848074238, "grad_norm": 1.8173473854875084, "learning_rate": 3.3179243837929e-06, "loss": 0.7301, "step": 4511 }, { "epoch": 0.741069228874107, "grad_norm": 1.90778394875643, "learning_rate": 3.3139672160443215e-06, "loss": 0.7947, "step": 4512 }, { "epoch": 0.74123347294079, "grad_norm": 1.7278012406484862, "learning_rate": 3.3100119407632556e-06, "loss": 0.779, "step": 4513 }, { "epoch": 0.7413977170074731, "grad_norm": 1.9692769942375117, "learning_rate": 3.306058559069236e-06, "loss": 0.7575, "step": 4514 }, { "epoch": 0.7415619610741562, "grad_norm": 1.773622679712363, "learning_rate": 3.3021070720812588e-06, "loss": 0.7093, "step": 4515 }, { "epoch": 0.7417262051408393, "grad_norm": 2.11206094371832, "learning_rate": 3.298157480917783e-06, "loss": 0.7384, "step": 4516 }, { "epoch": 0.7418904492075223, "grad_norm": 1.6476645930731963, "learning_rate": 3.2942097866967336e-06, "loss": 0.6839, "step": 4517 }, { "epoch": 0.7420546932742055, "grad_norm": 1.9431604178517667, "learning_rate": 3.2902639905354948e-06, "loss": 0.778, "step": 4518 }, { "epoch": 0.7422189373408885, "grad_norm": 1.662300174751346, "learning_rate": 3.286320093550919e-06, "loss": 0.7808, "step": 4519 }, { "epoch": 0.7423831814075716, "grad_norm": 1.868173791157996, "learning_rate": 3.2823780968593156e-06, "loss": 0.7091, "step": 4520 }, { "epoch": 0.7425474254742548, "grad_norm": 1.5045783916208784, "learning_rate": 3.2784380015764596e-06, "loss": 0.7331, "step": 4521 }, { "epoch": 0.7427116695409378, "grad_norm": 2.225368350941575, "learning_rate": 3.274499808817586e-06, "loss": 0.7973, "step": 4522 }, { "epoch": 0.742875913607621, "grad_norm": 1.7557894983301803, "learning_rate": 3.2705635196973927e-06, "loss": 0.7585, "step": 4523 }, { "epoch": 0.743040157674304, "grad_norm": 2.0004850753050136, "learning_rate": 3.266629135330037e-06, "loss": 0.7504, "step": 4524 }, { "epoch": 0.7432044017409871, "grad_norm": 1.7382221188189384, "learning_rate": 3.2626966568291396e-06, "loss": 0.7047, "step": 4525 }, { "epoch": 0.7433686458076701, "grad_norm": 2.027672595144798, "learning_rate": 3.2587660853077797e-06, "loss": 0.7267, "step": 4526 }, { "epoch": 0.7435328898743533, "grad_norm": 1.494593600935683, "learning_rate": 3.2548374218784963e-06, "loss": 0.7895, "step": 4527 }, { "epoch": 0.7436971339410364, "grad_norm": 1.6454649735939384, "learning_rate": 3.2509106676532897e-06, "loss": 0.7382, "step": 4528 }, { "epoch": 0.7438613780077195, "grad_norm": 3.53061138567379, "learning_rate": 3.2469858237436203e-06, "loss": 0.7326, "step": 4529 }, { "epoch": 0.7440256220744026, "grad_norm": 1.948739370955264, "learning_rate": 3.243062891260407e-06, "loss": 0.7969, "step": 4530 }, { "epoch": 0.7441898661410856, "grad_norm": 1.8350193017555565, "learning_rate": 3.2391418713140264e-06, "loss": 0.7243, "step": 4531 }, { "epoch": 0.7443541102077688, "grad_norm": 1.5280192051134358, "learning_rate": 3.235222765014315e-06, "loss": 0.7297, "step": 4532 }, { "epoch": 0.7445183542744518, "grad_norm": 1.9050534670829693, "learning_rate": 3.231305573470569e-06, "loss": 0.7999, "step": 4533 }, { "epoch": 0.7446825983411349, "grad_norm": 1.7634705040775303, "learning_rate": 3.2273902977915405e-06, "loss": 0.7495, "step": 4534 }, { "epoch": 0.7448468424078181, "grad_norm": 2.9184885235843456, "learning_rate": 3.2234769390854394e-06, "loss": 0.7606, "step": 4535 }, { "epoch": 0.7450110864745011, "grad_norm": 2.1719859199381646, "learning_rate": 3.2195654984599334e-06, "loss": 0.7685, "step": 4536 }, { "epoch": 0.7451753305411842, "grad_norm": 1.8660894827365313, "learning_rate": 3.2156559770221498e-06, "loss": 0.7332, "step": 4537 }, { "epoch": 0.7453395746078673, "grad_norm": 2.231444479977035, "learning_rate": 3.2117483758786683e-06, "loss": 0.7596, "step": 4538 }, { "epoch": 0.7455038186745504, "grad_norm": 1.763178812556316, "learning_rate": 3.207842696135527e-06, "loss": 0.691, "step": 4539 }, { "epoch": 0.7456680627412334, "grad_norm": 2.2653971067762426, "learning_rate": 3.2039389388982225e-06, "loss": 0.7092, "step": 4540 }, { "epoch": 0.7458323068079166, "grad_norm": 0.6137881710155052, "learning_rate": 3.200037105271703e-06, "loss": 0.3247, "step": 4541 }, { "epoch": 0.7459965508745997, "grad_norm": 1.7217409986837726, "learning_rate": 3.1961371963603736e-06, "loss": 0.6936, "step": 4542 }, { "epoch": 0.7461607949412827, "grad_norm": 1.651801616456928, "learning_rate": 3.192239213268099e-06, "loss": 0.7244, "step": 4543 }, { "epoch": 0.7463250390079659, "grad_norm": 2.1328947386881834, "learning_rate": 3.1883431570981917e-06, "loss": 0.7626, "step": 4544 }, { "epoch": 0.7464892830746489, "grad_norm": 1.773714239287069, "learning_rate": 3.1844490289534236e-06, "loss": 0.7403, "step": 4545 }, { "epoch": 0.746653527141332, "grad_norm": 1.6560212806715395, "learning_rate": 3.180556829936019e-06, "loss": 0.7212, "step": 4546 }, { "epoch": 0.7468177712080151, "grad_norm": 2.2284227554205525, "learning_rate": 3.1766665611476566e-06, "loss": 0.7388, "step": 4547 }, { "epoch": 0.7469820152746982, "grad_norm": 1.9000269421865936, "learning_rate": 3.17277822368947e-06, "loss": 0.7518, "step": 4548 }, { "epoch": 0.7471462593413813, "grad_norm": 1.997294003702247, "learning_rate": 3.168891818662043e-06, "loss": 0.7541, "step": 4549 }, { "epoch": 0.7473105034080644, "grad_norm": 1.44021544157376, "learning_rate": 3.1650073471654152e-06, "loss": 0.864, "step": 4550 }, { "epoch": 0.7474747474747475, "grad_norm": 2.5336952034362383, "learning_rate": 3.161124810299079e-06, "loss": 0.6736, "step": 4551 }, { "epoch": 0.7476389915414305, "grad_norm": 1.5858396494208158, "learning_rate": 3.157244209161977e-06, "loss": 0.7991, "step": 4552 }, { "epoch": 0.7478032356081137, "grad_norm": 1.7488474309457334, "learning_rate": 3.1533655448525057e-06, "loss": 0.6958, "step": 4553 }, { "epoch": 0.7479674796747967, "grad_norm": 1.8688281994066462, "learning_rate": 3.1494888184685134e-06, "loss": 0.7869, "step": 4554 }, { "epoch": 0.7481317237414798, "grad_norm": 1.8783710620415923, "learning_rate": 3.145614031107299e-06, "loss": 0.7662, "step": 4555 }, { "epoch": 0.7482959678081629, "grad_norm": 2.076257482474961, "learning_rate": 3.141741183865612e-06, "loss": 0.7669, "step": 4556 }, { "epoch": 0.748460211874846, "grad_norm": 1.6735991467265994, "learning_rate": 3.1378702778396554e-06, "loss": 0.7378, "step": 4557 }, { "epoch": 0.7486244559415292, "grad_norm": 1.8726395751195286, "learning_rate": 3.134001314125079e-06, "loss": 0.8058, "step": 4558 }, { "epoch": 0.7487887000082122, "grad_norm": 1.878825859956462, "learning_rate": 3.1301342938169854e-06, "loss": 0.7517, "step": 4559 }, { "epoch": 0.7489529440748953, "grad_norm": 2.168106356472436, "learning_rate": 3.1262692180099285e-06, "loss": 0.6624, "step": 4560 }, { "epoch": 0.7491171881415783, "grad_norm": 1.4526489088385204, "learning_rate": 3.1224060877979077e-06, "loss": 0.775, "step": 4561 }, { "epoch": 0.7492814322082615, "grad_norm": 1.706312232614212, "learning_rate": 3.1185449042743744e-06, "loss": 0.6998, "step": 4562 }, { "epoch": 0.7494456762749445, "grad_norm": 2.3155700792329266, "learning_rate": 3.114685668532229e-06, "loss": 0.7566, "step": 4563 }, { "epoch": 0.7496099203416277, "grad_norm": 1.5696098543816588, "learning_rate": 3.1108283816638196e-06, "loss": 0.6657, "step": 4564 }, { "epoch": 0.7497741644083108, "grad_norm": 1.5092351443196443, "learning_rate": 3.1069730447609423e-06, "loss": 0.6812, "step": 4565 }, { "epoch": 0.7499384084749938, "grad_norm": 1.518175847575158, "learning_rate": 3.103119658914844e-06, "loss": 0.7567, "step": 4566 }, { "epoch": 0.750102652541677, "grad_norm": 1.742871464841401, "learning_rate": 3.0992682252162165e-06, "loss": 0.7166, "step": 4567 }, { "epoch": 0.75026689660836, "grad_norm": 1.6497529898773637, "learning_rate": 3.0954187447551996e-06, "loss": 0.7152, "step": 4568 }, { "epoch": 0.7504311406750431, "grad_norm": 1.756543152165038, "learning_rate": 3.091571218621382e-06, "loss": 0.7064, "step": 4569 }, { "epoch": 0.7505953847417262, "grad_norm": 1.6015523958225732, "learning_rate": 3.0877256479037952e-06, "loss": 0.8521, "step": 4570 }, { "epoch": 0.7507596288084093, "grad_norm": 2.8034225256326595, "learning_rate": 3.0838820336909224e-06, "loss": 0.7266, "step": 4571 }, { "epoch": 0.7509238728750924, "grad_norm": 1.890265649152146, "learning_rate": 3.0800403770706912e-06, "loss": 0.8214, "step": 4572 }, { "epoch": 0.7510881169417755, "grad_norm": 1.9339118007061191, "learning_rate": 3.076200679130471e-06, "loss": 0.7653, "step": 4573 }, { "epoch": 0.7512523610084586, "grad_norm": 2.839398053311051, "learning_rate": 3.072362940957083e-06, "loss": 0.7767, "step": 4574 }, { "epoch": 0.7514166050751416, "grad_norm": 2.400624780251606, "learning_rate": 3.0685271636367895e-06, "loss": 0.7369, "step": 4575 }, { "epoch": 0.7515808491418248, "grad_norm": 1.618617084580264, "learning_rate": 3.064693348255301e-06, "loss": 0.711, "step": 4576 }, { "epoch": 0.7517450932085078, "grad_norm": 1.5820759371730548, "learning_rate": 3.060861495897769e-06, "loss": 0.7714, "step": 4577 }, { "epoch": 0.7519093372751909, "grad_norm": 0.6549481578650093, "learning_rate": 3.0570316076487918e-06, "loss": 0.3465, "step": 4578 }, { "epoch": 0.7520735813418741, "grad_norm": 2.250513065231014, "learning_rate": 3.0532036845924107e-06, "loss": 0.6936, "step": 4579 }, { "epoch": 0.7522378254085571, "grad_norm": 1.7749135507408575, "learning_rate": 3.049377727812113e-06, "loss": 0.7877, "step": 4580 }, { "epoch": 0.7524020694752402, "grad_norm": 1.5744689111126684, "learning_rate": 3.0455537383908263e-06, "loss": 0.757, "step": 4581 }, { "epoch": 0.7525663135419233, "grad_norm": 1.7565856273297817, "learning_rate": 3.041731717410923e-06, "loss": 0.7544, "step": 4582 }, { "epoch": 0.7527305576086064, "grad_norm": 0.6354459087356539, "learning_rate": 3.0379116659542186e-06, "loss": 0.3018, "step": 4583 }, { "epoch": 0.7528948016752894, "grad_norm": 1.7667651363040233, "learning_rate": 3.0340935851019694e-06, "loss": 0.7904, "step": 4584 }, { "epoch": 0.7530590457419726, "grad_norm": 1.8405068047409499, "learning_rate": 3.0302774759348797e-06, "loss": 0.767, "step": 4585 }, { "epoch": 0.7532232898086557, "grad_norm": 1.7537310153279349, "learning_rate": 3.0264633395330834e-06, "loss": 0.6905, "step": 4586 }, { "epoch": 0.7533875338753387, "grad_norm": 2.1361069460367723, "learning_rate": 3.022651176976166e-06, "loss": 0.7978, "step": 4587 }, { "epoch": 0.7535517779420219, "grad_norm": 1.6693901411422694, "learning_rate": 3.0188409893431556e-06, "loss": 0.7216, "step": 4588 }, { "epoch": 0.7537160220087049, "grad_norm": 1.5416113389560058, "learning_rate": 3.0150327777125175e-06, "loss": 0.7246, "step": 4589 }, { "epoch": 0.753880266075388, "grad_norm": 2.1081079437198333, "learning_rate": 3.011226543162156e-06, "loss": 0.7627, "step": 4590 }, { "epoch": 0.7540445101420711, "grad_norm": 1.8117002448794055, "learning_rate": 3.007422286769418e-06, "loss": 0.7477, "step": 4591 }, { "epoch": 0.7542087542087542, "grad_norm": 2.057010973956131, "learning_rate": 3.003620009611091e-06, "loss": 0.7046, "step": 4592 }, { "epoch": 0.7543729982754372, "grad_norm": 1.3281037431175564, "learning_rate": 2.999819712763402e-06, "loss": 0.7004, "step": 4593 }, { "epoch": 0.7545372423421204, "grad_norm": 0.609768910282511, "learning_rate": 2.996021397302015e-06, "loss": 0.3003, "step": 4594 }, { "epoch": 0.7547014864088035, "grad_norm": 1.5527368205093126, "learning_rate": 2.992225064302037e-06, "loss": 0.7444, "step": 4595 }, { "epoch": 0.7548657304754866, "grad_norm": 1.6531468782402983, "learning_rate": 2.988430714838011e-06, "loss": 0.7223, "step": 4596 }, { "epoch": 0.7550299745421697, "grad_norm": 2.318119874523909, "learning_rate": 2.9846383499839205e-06, "loss": 0.7455, "step": 4597 }, { "epoch": 0.7551942186088527, "grad_norm": 3.0421171779257894, "learning_rate": 2.9808479708131864e-06, "loss": 0.717, "step": 4598 }, { "epoch": 0.7553584626755359, "grad_norm": 1.8135899981164882, "learning_rate": 2.9770595783986666e-06, "loss": 0.6816, "step": 4599 }, { "epoch": 0.7555227067422189, "grad_norm": 1.8838925402001467, "learning_rate": 2.9732731738126586e-06, "loss": 0.7217, "step": 4600 }, { "epoch": 0.755686950808902, "grad_norm": 2.1023981543305155, "learning_rate": 2.969488758126896e-06, "loss": 0.721, "step": 4601 }, { "epoch": 0.7558511948755852, "grad_norm": 1.7407177594656835, "learning_rate": 2.965706332412549e-06, "loss": 0.6921, "step": 4602 }, { "epoch": 0.7560154389422682, "grad_norm": 0.6074123588232926, "learning_rate": 2.9619258977402253e-06, "loss": 0.3561, "step": 4603 }, { "epoch": 0.7561796830089513, "grad_norm": 1.5920897667573295, "learning_rate": 2.9581474551799703e-06, "loss": 0.73, "step": 4604 }, { "epoch": 0.7563439270756344, "grad_norm": 1.3624174537226614, "learning_rate": 2.9543710058012633e-06, "loss": 0.7792, "step": 4605 }, { "epoch": 0.7565081711423175, "grad_norm": 1.6777477970277859, "learning_rate": 2.9505965506730195e-06, "loss": 0.7852, "step": 4606 }, { "epoch": 0.7566724152090005, "grad_norm": 1.6696489495991222, "learning_rate": 2.946824090863596e-06, "loss": 0.735, "step": 4607 }, { "epoch": 0.7568366592756837, "grad_norm": 1.785903491724197, "learning_rate": 2.943053627440771e-06, "loss": 0.7028, "step": 4608 }, { "epoch": 0.7570009033423668, "grad_norm": 1.9898923966401623, "learning_rate": 2.93928516147177e-06, "loss": 0.7898, "step": 4609 }, { "epoch": 0.7571651474090498, "grad_norm": 0.5987115156305799, "learning_rate": 2.9355186940232493e-06, "loss": 0.2809, "step": 4610 }, { "epoch": 0.757329391475733, "grad_norm": 1.519428688404375, "learning_rate": 2.9317542261612986e-06, "loss": 0.7712, "step": 4611 }, { "epoch": 0.757493635542416, "grad_norm": 2.095336909305377, "learning_rate": 2.927991758951445e-06, "loss": 0.7958, "step": 4612 }, { "epoch": 0.7576578796090991, "grad_norm": 1.8013043551448862, "learning_rate": 2.924231293458647e-06, "loss": 0.8072, "step": 4613 }, { "epoch": 0.7578221236757822, "grad_norm": 2.284876449661414, "learning_rate": 2.920472830747295e-06, "loss": 0.7507, "step": 4614 }, { "epoch": 0.7579863677424653, "grad_norm": 3.380132286260215, "learning_rate": 2.9167163718812143e-06, "loss": 0.8224, "step": 4615 }, { "epoch": 0.7581506118091484, "grad_norm": 1.5987962285188115, "learning_rate": 2.9129619179236625e-06, "loss": 0.7309, "step": 4616 }, { "epoch": 0.7583148558758315, "grad_norm": 1.7534441906272038, "learning_rate": 2.9092094699373296e-06, "loss": 0.7128, "step": 4617 }, { "epoch": 0.7584790999425146, "grad_norm": 1.5333991646477205, "learning_rate": 2.90545902898434e-06, "loss": 0.6844, "step": 4618 }, { "epoch": 0.7586433440091976, "grad_norm": 2.0680348744090398, "learning_rate": 2.9017105961262448e-06, "loss": 0.7823, "step": 4619 }, { "epoch": 0.7588075880758808, "grad_norm": 1.9744337011758213, "learning_rate": 2.8979641724240324e-06, "loss": 0.7666, "step": 4620 }, { "epoch": 0.7589718321425638, "grad_norm": 1.6385870868495735, "learning_rate": 2.8942197589381204e-06, "loss": 0.7934, "step": 4621 }, { "epoch": 0.759136076209247, "grad_norm": 1.3858165457339662, "learning_rate": 2.890477356728356e-06, "loss": 0.686, "step": 4622 }, { "epoch": 0.7593003202759301, "grad_norm": 1.4371805863700704, "learning_rate": 2.886736966854019e-06, "loss": 0.7897, "step": 4623 }, { "epoch": 0.7594645643426131, "grad_norm": 2.6185146417992553, "learning_rate": 2.8829985903738176e-06, "loss": 0.7698, "step": 4624 }, { "epoch": 0.7596288084092963, "grad_norm": 4.446046394325405, "learning_rate": 2.8792622283458926e-06, "loss": 0.7474, "step": 4625 }, { "epoch": 0.7597930524759793, "grad_norm": 2.5653674678571528, "learning_rate": 2.8755278818278143e-06, "loss": 0.7854, "step": 4626 }, { "epoch": 0.7599572965426624, "grad_norm": 1.7699815925802314, "learning_rate": 2.8717955518765794e-06, "loss": 0.7184, "step": 4627 }, { "epoch": 0.7601215406093454, "grad_norm": 1.6392916518780982, "learning_rate": 2.8680652395486198e-06, "loss": 0.8381, "step": 4628 }, { "epoch": 0.7602857846760286, "grad_norm": 1.674410845579898, "learning_rate": 2.864336945899788e-06, "loss": 0.8111, "step": 4629 }, { "epoch": 0.7604500287427116, "grad_norm": 1.6581983289073432, "learning_rate": 2.860610671985371e-06, "loss": 0.7972, "step": 4630 }, { "epoch": 0.7606142728093948, "grad_norm": 1.7401049747797999, "learning_rate": 2.856886418860083e-06, "loss": 0.7031, "step": 4631 }, { "epoch": 0.7607785168760779, "grad_norm": 1.9502012446799428, "learning_rate": 2.853164187578067e-06, "loss": 0.7459, "step": 4632 }, { "epoch": 0.7609427609427609, "grad_norm": 1.748457152321588, "learning_rate": 2.849443979192892e-06, "loss": 0.7125, "step": 4633 }, { "epoch": 0.7611070050094441, "grad_norm": 1.728371282688831, "learning_rate": 2.845725794757551e-06, "loss": 0.6477, "step": 4634 }, { "epoch": 0.7612712490761271, "grad_norm": 2.1404377432435666, "learning_rate": 2.8420096353244763e-06, "loss": 0.6616, "step": 4635 }, { "epoch": 0.7614354931428102, "grad_norm": 1.7470952485010014, "learning_rate": 2.838295501945516e-06, "loss": 0.7113, "step": 4636 }, { "epoch": 0.7615997372094933, "grad_norm": 1.9288315852726114, "learning_rate": 2.834583395671947e-06, "loss": 0.7555, "step": 4637 }, { "epoch": 0.7617639812761764, "grad_norm": 2.115006165232687, "learning_rate": 2.8308733175544724e-06, "loss": 0.7714, "step": 4638 }, { "epoch": 0.7619282253428595, "grad_norm": 1.778125273414356, "learning_rate": 2.827165268643223e-06, "loss": 0.6948, "step": 4639 }, { "epoch": 0.7620924694095426, "grad_norm": 1.4999818654464363, "learning_rate": 2.8234592499877535e-06, "loss": 0.7837, "step": 4640 }, { "epoch": 0.7622567134762257, "grad_norm": 1.7984752072075048, "learning_rate": 2.819755262637046e-06, "loss": 0.6761, "step": 4641 }, { "epoch": 0.7624209575429087, "grad_norm": 1.9271941525896967, "learning_rate": 2.8160533076395045e-06, "loss": 0.7854, "step": 4642 }, { "epoch": 0.7625852016095919, "grad_norm": 2.174869769831172, "learning_rate": 2.812353386042962e-06, "loss": 0.7346, "step": 4643 }, { "epoch": 0.7627494456762749, "grad_norm": 1.7576752361480552, "learning_rate": 2.8086554988946714e-06, "loss": 0.7592, "step": 4644 }, { "epoch": 0.762913689742958, "grad_norm": 0.6265659210261424, "learning_rate": 2.804959647241312e-06, "loss": 0.3349, "step": 4645 }, { "epoch": 0.7630779338096412, "grad_norm": 2.0020917864042693, "learning_rate": 2.8012658321289878e-06, "loss": 0.7162, "step": 4646 }, { "epoch": 0.7632421778763242, "grad_norm": 1.7083750525841332, "learning_rate": 2.797574054603225e-06, "loss": 0.6826, "step": 4647 }, { "epoch": 0.7634064219430073, "grad_norm": 1.5750766150638207, "learning_rate": 2.7938843157089734e-06, "loss": 0.7764, "step": 4648 }, { "epoch": 0.7635706660096904, "grad_norm": 0.6517991812521192, "learning_rate": 2.790196616490607e-06, "loss": 0.3281, "step": 4649 }, { "epoch": 0.7637349100763735, "grad_norm": 2.1832011249108, "learning_rate": 2.7865109579919223e-06, "loss": 0.8153, "step": 4650 }, { "epoch": 0.7638991541430565, "grad_norm": 2.029533190585772, "learning_rate": 2.7828273412561324e-06, "loss": 0.7499, "step": 4651 }, { "epoch": 0.7640633982097397, "grad_norm": 1.5645860655827193, "learning_rate": 2.7791457673258793e-06, "loss": 0.7844, "step": 4652 }, { "epoch": 0.7642276422764228, "grad_norm": 2.0374595367623276, "learning_rate": 2.775466237243226e-06, "loss": 0.7322, "step": 4653 }, { "epoch": 0.7643918863431058, "grad_norm": 1.6014579597214933, "learning_rate": 2.7717887520496545e-06, "loss": 0.7235, "step": 4654 }, { "epoch": 0.764556130409789, "grad_norm": 1.9223214421669046, "learning_rate": 2.7681133127860705e-06, "loss": 0.6865, "step": 4655 }, { "epoch": 0.764720374476472, "grad_norm": 1.5645607768740504, "learning_rate": 2.7644399204927984e-06, "loss": 0.7405, "step": 4656 }, { "epoch": 0.7648846185431551, "grad_norm": 1.692244556610067, "learning_rate": 2.7607685762095825e-06, "loss": 0.7799, "step": 4657 }, { "epoch": 0.7650488626098382, "grad_norm": 1.6359994276144452, "learning_rate": 2.7570992809755937e-06, "loss": 0.6615, "step": 4658 }, { "epoch": 0.7652131066765213, "grad_norm": 2.4992039588910053, "learning_rate": 2.753432035829415e-06, "loss": 0.6845, "step": 4659 }, { "epoch": 0.7653773507432045, "grad_norm": 1.894136505085779, "learning_rate": 2.749766841809054e-06, "loss": 0.7221, "step": 4660 }, { "epoch": 0.7655415948098875, "grad_norm": 2.203973894876142, "learning_rate": 2.746103699951934e-06, "loss": 0.6898, "step": 4661 }, { "epoch": 0.7657058388765706, "grad_norm": 1.6205704784791752, "learning_rate": 2.742442611294902e-06, "loss": 0.7304, "step": 4662 }, { "epoch": 0.7658700829432536, "grad_norm": 2.637373258651767, "learning_rate": 2.73878357687422e-06, "loss": 0.7661, "step": 4663 }, { "epoch": 0.7660343270099368, "grad_norm": 0.6178587046225067, "learning_rate": 2.7351265977255702e-06, "loss": 0.3268, "step": 4664 }, { "epoch": 0.7661985710766198, "grad_norm": 1.7304524779002353, "learning_rate": 2.731471674884053e-06, "loss": 0.7405, "step": 4665 }, { "epoch": 0.766362815143303, "grad_norm": 1.9849253995746992, "learning_rate": 2.7278188093841874e-06, "loss": 0.7349, "step": 4666 }, { "epoch": 0.7665270592099861, "grad_norm": 1.8234822570253955, "learning_rate": 2.7241680022599073e-06, "loss": 0.6956, "step": 4667 }, { "epoch": 0.7666913032766691, "grad_norm": 1.8056859008282848, "learning_rate": 2.720519254544568e-06, "loss": 0.6683, "step": 4668 }, { "epoch": 0.7668555473433523, "grad_norm": 2.453219514468653, "learning_rate": 2.716872567270938e-06, "loss": 0.7454, "step": 4669 }, { "epoch": 0.7670197914100353, "grad_norm": 2.058799371221427, "learning_rate": 2.713227941471206e-06, "loss": 0.7474, "step": 4670 }, { "epoch": 0.7671840354767184, "grad_norm": 1.8281052019869888, "learning_rate": 2.7095853781769752e-06, "loss": 0.6876, "step": 4671 }, { "epoch": 0.7673482795434015, "grad_norm": 1.4771962291144052, "learning_rate": 2.7059448784192688e-06, "loss": 0.6716, "step": 4672 }, { "epoch": 0.7675125236100846, "grad_norm": 2.4477162369400016, "learning_rate": 2.702306443228516e-06, "loss": 0.7688, "step": 4673 }, { "epoch": 0.7676767676767676, "grad_norm": 3.567783589737038, "learning_rate": 2.6986700736345715e-06, "loss": 0.7264, "step": 4674 }, { "epoch": 0.7678410117434508, "grad_norm": 1.7119571964836504, "learning_rate": 2.6950357706667017e-06, "loss": 0.7484, "step": 4675 }, { "epoch": 0.7680052558101339, "grad_norm": 1.655963219942429, "learning_rate": 2.6914035353535897e-06, "loss": 0.7241, "step": 4676 }, { "epoch": 0.7681694998768169, "grad_norm": 2.062591373308456, "learning_rate": 2.68777336872333e-06, "loss": 0.7373, "step": 4677 }, { "epoch": 0.7683337439435001, "grad_norm": 1.6243814860021855, "learning_rate": 2.6841452718034343e-06, "loss": 0.7419, "step": 4678 }, { "epoch": 0.7684979880101831, "grad_norm": 1.8834616940253506, "learning_rate": 2.6805192456208297e-06, "loss": 0.775, "step": 4679 }, { "epoch": 0.7686622320768662, "grad_norm": 1.6402833009350484, "learning_rate": 2.6768952912018498e-06, "loss": 0.7656, "step": 4680 }, { "epoch": 0.7688264761435493, "grad_norm": 1.8957839836960468, "learning_rate": 2.6732734095722545e-06, "loss": 0.6814, "step": 4681 }, { "epoch": 0.7689907202102324, "grad_norm": 12.406376176531202, "learning_rate": 2.6696536017572074e-06, "loss": 0.729, "step": 4682 }, { "epoch": 0.7691549642769155, "grad_norm": 2.958131432500286, "learning_rate": 2.666035868781285e-06, "loss": 0.7605, "step": 4683 }, { "epoch": 0.7693192083435986, "grad_norm": 1.621518552748099, "learning_rate": 2.6624202116684816e-06, "loss": 0.7983, "step": 4684 }, { "epoch": 0.7694834524102817, "grad_norm": 1.9641532332122893, "learning_rate": 2.6588066314422e-06, "loss": 0.7024, "step": 4685 }, { "epoch": 0.7696476964769647, "grad_norm": 1.783669692625518, "learning_rate": 2.6551951291252576e-06, "loss": 0.7367, "step": 4686 }, { "epoch": 0.7698119405436479, "grad_norm": 2.226298233895963, "learning_rate": 2.651585705739881e-06, "loss": 0.7182, "step": 4687 }, { "epoch": 0.7699761846103309, "grad_norm": 1.8709777444808213, "learning_rate": 2.6479783623077105e-06, "loss": 0.7802, "step": 4688 }, { "epoch": 0.770140428677014, "grad_norm": 1.3726002759860527, "learning_rate": 2.6443730998497985e-06, "loss": 0.7739, "step": 4689 }, { "epoch": 0.7703046727436972, "grad_norm": 1.5785028811590014, "learning_rate": 2.6407699193866045e-06, "loss": 0.7071, "step": 4690 }, { "epoch": 0.7704689168103802, "grad_norm": 0.6066591623249008, "learning_rate": 2.6371688219380032e-06, "loss": 0.3204, "step": 4691 }, { "epoch": 0.7706331608770634, "grad_norm": 1.7605249720338179, "learning_rate": 2.6335698085232764e-06, "loss": 0.7724, "step": 4692 }, { "epoch": 0.7707974049437464, "grad_norm": 1.630350281032075, "learning_rate": 2.62997288016112e-06, "loss": 0.7273, "step": 4693 }, { "epoch": 0.7709616490104295, "grad_norm": 2.010838372120155, "learning_rate": 2.6263780378696324e-06, "loss": 0.7635, "step": 4694 }, { "epoch": 0.7711258930771125, "grad_norm": 2.0987479022443414, "learning_rate": 2.6227852826663294e-06, "loss": 0.8185, "step": 4695 }, { "epoch": 0.7712901371437957, "grad_norm": 1.7918601163059762, "learning_rate": 2.6191946155681303e-06, "loss": 0.719, "step": 4696 }, { "epoch": 0.7714543812104788, "grad_norm": 1.7341032091062134, "learning_rate": 2.6156060375913685e-06, "loss": 0.756, "step": 4697 }, { "epoch": 0.7716186252771619, "grad_norm": 2.004149977604749, "learning_rate": 2.6120195497517818e-06, "loss": 0.7798, "step": 4698 }, { "epoch": 0.771782869343845, "grad_norm": 2.0986089117549267, "learning_rate": 2.608435153064519e-06, "loss": 0.6878, "step": 4699 }, { "epoch": 0.771947113410528, "grad_norm": 1.7407966989796846, "learning_rate": 2.6048528485441347e-06, "loss": 0.6643, "step": 4700 }, { "epoch": 0.7721113574772112, "grad_norm": 1.7995669102693488, "learning_rate": 2.601272637204595e-06, "loss": 0.7732, "step": 4701 }, { "epoch": 0.7722756015438942, "grad_norm": 1.457093064732527, "learning_rate": 2.5976945200592683e-06, "loss": 0.7448, "step": 4702 }, { "epoch": 0.7724398456105773, "grad_norm": 2.8176560475306083, "learning_rate": 2.5941184981209354e-06, "loss": 0.7981, "step": 4703 }, { "epoch": 0.7726040896772605, "grad_norm": 1.4304669349146357, "learning_rate": 2.5905445724017786e-06, "loss": 0.778, "step": 4704 }, { "epoch": 0.7727683337439435, "grad_norm": 1.4990270691841774, "learning_rate": 2.586972743913394e-06, "loss": 0.7822, "step": 4705 }, { "epoch": 0.7729325778106266, "grad_norm": 0.5764104739969357, "learning_rate": 2.5834030136667796e-06, "loss": 0.326, "step": 4706 }, { "epoch": 0.7730968218773097, "grad_norm": 1.8360966910178473, "learning_rate": 2.579835382672339e-06, "loss": 0.7956, "step": 4707 }, { "epoch": 0.7732610659439928, "grad_norm": 1.5997617622655158, "learning_rate": 2.5762698519398832e-06, "loss": 0.7387, "step": 4708 }, { "epoch": 0.7734253100106758, "grad_norm": 2.5037277110474703, "learning_rate": 2.5727064224786267e-06, "loss": 0.7563, "step": 4709 }, { "epoch": 0.773589554077359, "grad_norm": 1.954569594880406, "learning_rate": 2.569145095297192e-06, "loss": 0.7444, "step": 4710 }, { "epoch": 0.773753798144042, "grad_norm": 1.8330777336915334, "learning_rate": 2.5655858714036054e-06, "loss": 0.7909, "step": 4711 }, { "epoch": 0.7739180422107251, "grad_norm": 3.245286824271806, "learning_rate": 2.5620287518052967e-06, "loss": 0.739, "step": 4712 }, { "epoch": 0.7740822862774083, "grad_norm": 3.6622940901966516, "learning_rate": 2.5584737375091016e-06, "loss": 0.7136, "step": 4713 }, { "epoch": 0.7742465303440913, "grad_norm": 1.8174465634538244, "learning_rate": 2.554920829521259e-06, "loss": 0.698, "step": 4714 }, { "epoch": 0.7744107744107744, "grad_norm": 2.223475804488537, "learning_rate": 2.551370028847416e-06, "loss": 0.7736, "step": 4715 }, { "epoch": 0.7745750184774575, "grad_norm": 1.539133144966381, "learning_rate": 2.547821336492614e-06, "loss": 0.7648, "step": 4716 }, { "epoch": 0.7747392625441406, "grad_norm": 1.4860242016186294, "learning_rate": 2.544274753461303e-06, "loss": 0.762, "step": 4717 }, { "epoch": 0.7749035066108236, "grad_norm": 1.7097374321519496, "learning_rate": 2.5407302807573387e-06, "loss": 0.7638, "step": 4718 }, { "epoch": 0.7750677506775068, "grad_norm": 1.8294399609950898, "learning_rate": 2.5371879193839756e-06, "loss": 0.7507, "step": 4719 }, { "epoch": 0.7752319947441899, "grad_norm": 1.92194498187338, "learning_rate": 2.5336476703438705e-06, "loss": 0.7759, "step": 4720 }, { "epoch": 0.7753962388108729, "grad_norm": 1.7411288371323235, "learning_rate": 2.530109534639085e-06, "loss": 0.6789, "step": 4721 }, { "epoch": 0.7755604828775561, "grad_norm": 1.9190214590799055, "learning_rate": 2.5265735132710802e-06, "loss": 0.7322, "step": 4722 }, { "epoch": 0.7757247269442391, "grad_norm": 1.663714834768973, "learning_rate": 2.5230396072407204e-06, "loss": 0.7447, "step": 4723 }, { "epoch": 0.7758889710109222, "grad_norm": 1.8475164388141647, "learning_rate": 2.519507817548269e-06, "loss": 0.684, "step": 4724 }, { "epoch": 0.7760532150776053, "grad_norm": 1.9784886395853685, "learning_rate": 2.515978145193393e-06, "loss": 0.6925, "step": 4725 }, { "epoch": 0.7762174591442884, "grad_norm": 2.3941093298709353, "learning_rate": 2.512450591175157e-06, "loss": 0.7131, "step": 4726 }, { "epoch": 0.7763817032109716, "grad_norm": 1.591952601995357, "learning_rate": 2.508925156492027e-06, "loss": 0.683, "step": 4727 }, { "epoch": 0.7765459472776546, "grad_norm": 1.623429685874823, "learning_rate": 2.5054018421418737e-06, "loss": 0.8018, "step": 4728 }, { "epoch": 0.7767101913443377, "grad_norm": 1.8470371074028855, "learning_rate": 2.5018806491219627e-06, "loss": 0.764, "step": 4729 }, { "epoch": 0.7768744354110207, "grad_norm": 1.8750623968971303, "learning_rate": 2.4983615784289585e-06, "loss": 0.7623, "step": 4730 }, { "epoch": 0.7770386794777039, "grad_norm": 1.6617825123108614, "learning_rate": 2.494844631058927e-06, "loss": 0.757, "step": 4731 }, { "epoch": 0.7772029235443869, "grad_norm": 2.080485811948162, "learning_rate": 2.4913298080073344e-06, "loss": 0.7736, "step": 4732 }, { "epoch": 0.77736716761107, "grad_norm": 1.8087464486955394, "learning_rate": 2.487817110269042e-06, "loss": 0.7996, "step": 4733 }, { "epoch": 0.7775314116777532, "grad_norm": 1.7644484684993078, "learning_rate": 2.4843065388383126e-06, "loss": 0.7658, "step": 4734 }, { "epoch": 0.7776956557444362, "grad_norm": 0.5638478552484686, "learning_rate": 2.480798094708805e-06, "loss": 0.2991, "step": 4735 }, { "epoch": 0.7778598998111194, "grad_norm": 2.3951821538077804, "learning_rate": 2.4772917788735786e-06, "loss": 0.7308, "step": 4736 }, { "epoch": 0.7780241438778024, "grad_norm": 1.8095132072575069, "learning_rate": 2.473787592325091e-06, "loss": 0.7666, "step": 4737 }, { "epoch": 0.7781883879444855, "grad_norm": 1.547216200062367, "learning_rate": 2.470285536055188e-06, "loss": 0.7471, "step": 4738 }, { "epoch": 0.7783526320111686, "grad_norm": 1.5550787271100543, "learning_rate": 2.4667856110551235e-06, "loss": 0.8002, "step": 4739 }, { "epoch": 0.7785168760778517, "grad_norm": 1.7790539831139505, "learning_rate": 2.463287818315543e-06, "loss": 0.7541, "step": 4740 }, { "epoch": 0.7786811201445348, "grad_norm": 2.3769075236316097, "learning_rate": 2.4597921588264893e-06, "loss": 0.7161, "step": 4741 }, { "epoch": 0.7788453642112179, "grad_norm": 1.85431677974566, "learning_rate": 2.456298633577402e-06, "loss": 0.742, "step": 4742 }, { "epoch": 0.779009608277901, "grad_norm": 1.9169482122414208, "learning_rate": 2.4528072435571158e-06, "loss": 0.6652, "step": 4743 }, { "epoch": 0.779173852344584, "grad_norm": 1.69460564136633, "learning_rate": 2.449317989753862e-06, "loss": 0.7276, "step": 4744 }, { "epoch": 0.7793380964112672, "grad_norm": 2.0176719617709233, "learning_rate": 2.445830873155266e-06, "loss": 0.7912, "step": 4745 }, { "epoch": 0.7795023404779502, "grad_norm": 1.9152922363500122, "learning_rate": 2.4423458947483482e-06, "loss": 0.7565, "step": 4746 }, { "epoch": 0.7796665845446333, "grad_norm": 1.9649473356580234, "learning_rate": 2.4388630555195247e-06, "loss": 0.6794, "step": 4747 }, { "epoch": 0.7798308286113164, "grad_norm": 1.849522706779605, "learning_rate": 2.4353823564546064e-06, "loss": 0.7389, "step": 4748 }, { "epoch": 0.7799950726779995, "grad_norm": 1.8042959566582404, "learning_rate": 2.4319037985387985e-06, "loss": 0.7536, "step": 4749 }, { "epoch": 0.7801593167446826, "grad_norm": 1.9474425946307574, "learning_rate": 2.428427382756695e-06, "loss": 0.7488, "step": 4750 }, { "epoch": 0.7803235608113657, "grad_norm": 1.668713723708166, "learning_rate": 2.424953110092294e-06, "loss": 0.7787, "step": 4751 }, { "epoch": 0.7804878048780488, "grad_norm": 1.8899063741080753, "learning_rate": 2.4214809815289797e-06, "loss": 0.7882, "step": 4752 }, { "epoch": 0.7806520489447318, "grad_norm": 1.8670844532051305, "learning_rate": 2.4180109980495293e-06, "loss": 0.7049, "step": 4753 }, { "epoch": 0.780816293011415, "grad_norm": 0.5718571563034333, "learning_rate": 2.4145431606361148e-06, "loss": 0.3089, "step": 4754 }, { "epoch": 0.780980537078098, "grad_norm": 1.7150441347489958, "learning_rate": 2.4110774702703e-06, "loss": 0.7329, "step": 4755 }, { "epoch": 0.7811447811447811, "grad_norm": 2.1169088236086213, "learning_rate": 2.4076139279330414e-06, "loss": 0.7843, "step": 4756 }, { "epoch": 0.7813090252114643, "grad_norm": 0.6139115704804174, "learning_rate": 2.4041525346046877e-06, "loss": 0.3458, "step": 4757 }, { "epoch": 0.7814732692781473, "grad_norm": 1.6686503463159996, "learning_rate": 2.4006932912649816e-06, "loss": 0.6951, "step": 4758 }, { "epoch": 0.7816375133448304, "grad_norm": 1.6454476376467695, "learning_rate": 2.3972361988930505e-06, "loss": 0.7859, "step": 4759 }, { "epoch": 0.7818017574115135, "grad_norm": 1.8366623574618564, "learning_rate": 2.3937812584674168e-06, "loss": 0.7573, "step": 4760 }, { "epoch": 0.7819660014781966, "grad_norm": 1.99650202006131, "learning_rate": 2.3903284709659957e-06, "loss": 0.7699, "step": 4761 }, { "epoch": 0.7821302455448796, "grad_norm": 1.9907287290949687, "learning_rate": 2.3868778373660927e-06, "loss": 0.6526, "step": 4762 }, { "epoch": 0.7822944896115628, "grad_norm": 1.9669949332342402, "learning_rate": 2.3834293586444e-06, "loss": 0.7098, "step": 4763 }, { "epoch": 0.7824587336782459, "grad_norm": 0.6058293683913037, "learning_rate": 2.379983035777005e-06, "loss": 0.2994, "step": 4764 }, { "epoch": 0.782622977744929, "grad_norm": 2.3129067208848677, "learning_rate": 2.37653886973938e-06, "loss": 0.7562, "step": 4765 }, { "epoch": 0.7827872218116121, "grad_norm": 1.6728309958508687, "learning_rate": 2.3730968615063886e-06, "loss": 0.7433, "step": 4766 }, { "epoch": 0.7829514658782951, "grad_norm": 2.796470461141458, "learning_rate": 2.3696570120522868e-06, "loss": 0.7775, "step": 4767 }, { "epoch": 0.7831157099449783, "grad_norm": 1.9923070935281662, "learning_rate": 2.3662193223507135e-06, "loss": 0.5838, "step": 4768 }, { "epoch": 0.7832799540116613, "grad_norm": 1.5423415454165075, "learning_rate": 2.362783793374701e-06, "loss": 0.7377, "step": 4769 }, { "epoch": 0.7834441980783444, "grad_norm": 3.1110336664057012, "learning_rate": 2.3593504260966695e-06, "loss": 0.7199, "step": 4770 }, { "epoch": 0.7836084421450276, "grad_norm": 1.5668007818625036, "learning_rate": 2.355919221488424e-06, "loss": 0.7447, "step": 4771 }, { "epoch": 0.7837726862117106, "grad_norm": 1.697984446970038, "learning_rate": 2.352490180521162e-06, "loss": 0.7828, "step": 4772 }, { "epoch": 0.7839369302783937, "grad_norm": 1.5473850188807263, "learning_rate": 2.349063304165462e-06, "loss": 0.7911, "step": 4773 }, { "epoch": 0.7841011743450768, "grad_norm": 1.8669545554073623, "learning_rate": 2.345638593391302e-06, "loss": 0.7154, "step": 4774 }, { "epoch": 0.7842654184117599, "grad_norm": 1.7093219531053432, "learning_rate": 2.3422160491680334e-06, "loss": 0.7363, "step": 4775 }, { "epoch": 0.7844296624784429, "grad_norm": 0.5828418421946204, "learning_rate": 2.3387956724644014e-06, "loss": 0.3071, "step": 4776 }, { "epoch": 0.7845939065451261, "grad_norm": 1.9151297099955378, "learning_rate": 2.3353774642485374e-06, "loss": 0.7307, "step": 4777 }, { "epoch": 0.7847581506118092, "grad_norm": 0.5384315838623532, "learning_rate": 2.331961425487956e-06, "loss": 0.3365, "step": 4778 }, { "epoch": 0.7849223946784922, "grad_norm": 1.7137343037645685, "learning_rate": 2.3285475571495617e-06, "loss": 0.6329, "step": 4779 }, { "epoch": 0.7850866387451754, "grad_norm": 2.2016131743607255, "learning_rate": 2.3251358601996453e-06, "loss": 0.7476, "step": 4780 }, { "epoch": 0.7852508828118584, "grad_norm": 1.5812871611763983, "learning_rate": 2.3217263356038744e-06, "loss": 0.6985, "step": 4781 }, { "epoch": 0.7854151268785415, "grad_norm": 1.643081044788768, "learning_rate": 2.31831898432731e-06, "loss": 0.7683, "step": 4782 }, { "epoch": 0.7855793709452246, "grad_norm": 1.5823256229564733, "learning_rate": 2.3149138073343958e-06, "loss": 0.7406, "step": 4783 }, { "epoch": 0.7857436150119077, "grad_norm": 1.6681734504188073, "learning_rate": 2.3115108055889614e-06, "loss": 0.7549, "step": 4784 }, { "epoch": 0.7859078590785907, "grad_norm": 1.9610378755016626, "learning_rate": 2.3081099800542183e-06, "loss": 0.751, "step": 4785 }, { "epoch": 0.7860721031452739, "grad_norm": 2.00371666744168, "learning_rate": 2.3047113316927627e-06, "loss": 0.6842, "step": 4786 }, { "epoch": 0.786236347211957, "grad_norm": 1.9279979563966179, "learning_rate": 2.301314861466575e-06, "loss": 0.7567, "step": 4787 }, { "epoch": 0.78640059127864, "grad_norm": 1.8793449318284634, "learning_rate": 2.297920570337019e-06, "loss": 0.7422, "step": 4788 }, { "epoch": 0.7865648353453232, "grad_norm": 2.076686456652735, "learning_rate": 2.294528459264842e-06, "loss": 0.7631, "step": 4789 }, { "epoch": 0.7867290794120062, "grad_norm": 1.4794923383660519, "learning_rate": 2.291138529210174e-06, "loss": 0.81, "step": 4790 }, { "epoch": 0.7868933234786893, "grad_norm": 1.7588631618451958, "learning_rate": 2.287750781132527e-06, "loss": 0.7159, "step": 4791 }, { "epoch": 0.7870575675453724, "grad_norm": 1.4972971977703458, "learning_rate": 2.284365215990797e-06, "loss": 0.7554, "step": 4792 }, { "epoch": 0.7872218116120555, "grad_norm": 1.99635390343358, "learning_rate": 2.2809818347432598e-06, "loss": 0.8107, "step": 4793 }, { "epoch": 0.7873860556787386, "grad_norm": 2.295386482086677, "learning_rate": 2.2776006383475745e-06, "loss": 0.7422, "step": 4794 }, { "epoch": 0.7875502997454217, "grad_norm": 1.6831968378387228, "learning_rate": 2.274221627760782e-06, "loss": 0.7646, "step": 4795 }, { "epoch": 0.7877145438121048, "grad_norm": 3.1685850410118577, "learning_rate": 2.270844803939305e-06, "loss": 0.7449, "step": 4796 }, { "epoch": 0.7878787878787878, "grad_norm": 1.7646841794484518, "learning_rate": 2.2674701678389423e-06, "loss": 0.6766, "step": 4797 }, { "epoch": 0.788043031945471, "grad_norm": 0.6079029525811129, "learning_rate": 2.2640977204148838e-06, "loss": 0.2937, "step": 4798 }, { "epoch": 0.788207276012154, "grad_norm": 1.917394475024718, "learning_rate": 2.26072746262169e-06, "loss": 0.7033, "step": 4799 }, { "epoch": 0.7883715200788372, "grad_norm": 1.9190999484613354, "learning_rate": 2.2573593954133067e-06, "loss": 0.8065, "step": 4800 }, { "epoch": 0.7885357641455203, "grad_norm": 1.9215298781885037, "learning_rate": 2.2539935197430574e-06, "loss": 0.7512, "step": 4801 }, { "epoch": 0.7887000082122033, "grad_norm": 1.551789586560586, "learning_rate": 2.2506298365636482e-06, "loss": 0.7578, "step": 4802 }, { "epoch": 0.7888642522788865, "grad_norm": 1.6752621354740291, "learning_rate": 2.2472683468271584e-06, "loss": 0.7928, "step": 4803 }, { "epoch": 0.7890284963455695, "grad_norm": 0.5912197478967467, "learning_rate": 2.2439090514850527e-06, "loss": 0.3085, "step": 4804 }, { "epoch": 0.7891927404122526, "grad_norm": 1.5498155473057318, "learning_rate": 2.2405519514881723e-06, "loss": 0.7399, "step": 4805 }, { "epoch": 0.7893569844789357, "grad_norm": 2.2034634497405863, "learning_rate": 2.2371970477867377e-06, "loss": 0.7772, "step": 4806 }, { "epoch": 0.7895212285456188, "grad_norm": 2.380427123394962, "learning_rate": 2.2338443413303466e-06, "loss": 0.6664, "step": 4807 }, { "epoch": 0.7896854726123019, "grad_norm": 1.9295825349625673, "learning_rate": 2.230493833067977e-06, "loss": 0.7702, "step": 4808 }, { "epoch": 0.789849716678985, "grad_norm": 1.6012218246160885, "learning_rate": 2.2271455239479822e-06, "loss": 0.6913, "step": 4809 }, { "epoch": 0.7900139607456681, "grad_norm": 1.804093867973828, "learning_rate": 2.2237994149180943e-06, "loss": 0.7673, "step": 4810 }, { "epoch": 0.7901782048123511, "grad_norm": 2.135087327207808, "learning_rate": 2.220455506925422e-06, "loss": 0.8207, "step": 4811 }, { "epoch": 0.7903424488790343, "grad_norm": 1.8175236912266872, "learning_rate": 2.2171138009164515e-06, "loss": 0.7266, "step": 4812 }, { "epoch": 0.7905066929457173, "grad_norm": 1.7078840421340822, "learning_rate": 2.213774297837047e-06, "loss": 0.745, "step": 4813 }, { "epoch": 0.7906709370124004, "grad_norm": 1.7424558545568125, "learning_rate": 2.210436998632446e-06, "loss": 0.7915, "step": 4814 }, { "epoch": 0.7908351810790836, "grad_norm": 1.6146636134981924, "learning_rate": 2.2071019042472643e-06, "loss": 0.7898, "step": 4815 }, { "epoch": 0.7909994251457666, "grad_norm": 3.2969161086458856, "learning_rate": 2.2037690156254944e-06, "loss": 0.685, "step": 4816 }, { "epoch": 0.7911636692124497, "grad_norm": 1.835672186440009, "learning_rate": 2.2004383337105016e-06, "loss": 0.7132, "step": 4817 }, { "epoch": 0.7913279132791328, "grad_norm": 2.237587318559841, "learning_rate": 2.1971098594450315e-06, "loss": 0.8027, "step": 4818 }, { "epoch": 0.7914921573458159, "grad_norm": 0.6000588799298746, "learning_rate": 2.1937835937711995e-06, "loss": 0.2778, "step": 4819 }, { "epoch": 0.7916564014124989, "grad_norm": 0.6002393908308179, "learning_rate": 2.190459537630495e-06, "loss": 0.3182, "step": 4820 }, { "epoch": 0.7918206454791821, "grad_norm": 1.4521665250863331, "learning_rate": 2.187137691963791e-06, "loss": 0.7267, "step": 4821 }, { "epoch": 0.7919848895458651, "grad_norm": 1.9286468715704148, "learning_rate": 2.1838180577113268e-06, "loss": 0.7688, "step": 4822 }, { "epoch": 0.7921491336125482, "grad_norm": 1.8229073492674686, "learning_rate": 2.1805006358127213e-06, "loss": 0.6773, "step": 4823 }, { "epoch": 0.7923133776792314, "grad_norm": 1.7699889929891408, "learning_rate": 2.177185427206956e-06, "loss": 0.7627, "step": 4824 }, { "epoch": 0.7924776217459144, "grad_norm": 1.8432036625509212, "learning_rate": 2.173872432832398e-06, "loss": 0.7178, "step": 4825 }, { "epoch": 0.7926418658125975, "grad_norm": 1.7676591761222393, "learning_rate": 2.1705616536267838e-06, "loss": 0.722, "step": 4826 }, { "epoch": 0.7928061098792806, "grad_norm": 1.7512065337077385, "learning_rate": 2.1672530905272215e-06, "loss": 0.7364, "step": 4827 }, { "epoch": 0.7929703539459637, "grad_norm": 2.0237610010991407, "learning_rate": 2.1639467444701934e-06, "loss": 0.7694, "step": 4828 }, { "epoch": 0.7931345980126467, "grad_norm": 2.0825394774215513, "learning_rate": 2.160642616391553e-06, "loss": 0.7398, "step": 4829 }, { "epoch": 0.7932988420793299, "grad_norm": 1.6236889486588721, "learning_rate": 2.1573407072265284e-06, "loss": 0.7556, "step": 4830 }, { "epoch": 0.793463086146013, "grad_norm": 1.931095587386969, "learning_rate": 2.1540410179097173e-06, "loss": 0.7476, "step": 4831 }, { "epoch": 0.793627330212696, "grad_norm": 2.1404853098572394, "learning_rate": 2.1507435493750885e-06, "loss": 0.7451, "step": 4832 }, { "epoch": 0.7937915742793792, "grad_norm": 2.057884749253406, "learning_rate": 2.1474483025559857e-06, "loss": 0.7275, "step": 4833 }, { "epoch": 0.7939558183460622, "grad_norm": 1.930903346977565, "learning_rate": 2.1441552783851195e-06, "loss": 0.6506, "step": 4834 }, { "epoch": 0.7941200624127454, "grad_norm": 1.7967144758360356, "learning_rate": 2.1408644777945753e-06, "loss": 0.7658, "step": 4835 }, { "epoch": 0.7942843064794284, "grad_norm": 4.037089354661651, "learning_rate": 2.137575901715806e-06, "loss": 0.753, "step": 4836 }, { "epoch": 0.7944485505461115, "grad_norm": 1.7424295359509072, "learning_rate": 2.1342895510796367e-06, "loss": 0.8085, "step": 4837 }, { "epoch": 0.7946127946127947, "grad_norm": 1.761444255050121, "learning_rate": 2.1310054268162628e-06, "loss": 0.7737, "step": 4838 }, { "epoch": 0.7947770386794777, "grad_norm": 1.7832710650748194, "learning_rate": 2.127723529855248e-06, "loss": 0.6957, "step": 4839 }, { "epoch": 0.7949412827461608, "grad_norm": 1.81733977870462, "learning_rate": 2.124443861125525e-06, "loss": 0.7053, "step": 4840 }, { "epoch": 0.7951055268128439, "grad_norm": 1.8883577019007507, "learning_rate": 2.1211664215553997e-06, "loss": 0.8041, "step": 4841 }, { "epoch": 0.795269770879527, "grad_norm": 1.838129176516185, "learning_rate": 2.1178912120725416e-06, "loss": 0.6853, "step": 4842 }, { "epoch": 0.79543401494621, "grad_norm": 1.74717041046329, "learning_rate": 2.114618233603992e-06, "loss": 0.7295, "step": 4843 }, { "epoch": 0.7955982590128932, "grad_norm": 2.1662509108908816, "learning_rate": 2.111347487076164e-06, "loss": 0.6973, "step": 4844 }, { "epoch": 0.7957625030795763, "grad_norm": 1.900279843749827, "learning_rate": 2.1080789734148366e-06, "loss": 0.6815, "step": 4845 }, { "epoch": 0.7959267471462593, "grad_norm": 1.6629391799291646, "learning_rate": 2.1048126935451495e-06, "loss": 0.7628, "step": 4846 }, { "epoch": 0.7960909912129425, "grad_norm": 1.9416151746185089, "learning_rate": 2.10154864839162e-06, "loss": 0.7817, "step": 4847 }, { "epoch": 0.7962552352796255, "grad_norm": 1.446036036179575, "learning_rate": 2.0982868388781286e-06, "loss": 0.7639, "step": 4848 }, { "epoch": 0.7964194793463086, "grad_norm": 1.6130152639935325, "learning_rate": 2.0950272659279246e-06, "loss": 0.7482, "step": 4849 }, { "epoch": 0.7965837234129917, "grad_norm": 1.4164187669503032, "learning_rate": 2.091769930463621e-06, "loss": 0.8189, "step": 4850 }, { "epoch": 0.7967479674796748, "grad_norm": 1.8104641509610067, "learning_rate": 2.0885148334072013e-06, "loss": 0.7439, "step": 4851 }, { "epoch": 0.7969122115463579, "grad_norm": 1.52719377232373, "learning_rate": 2.085261975680014e-06, "loss": 0.7437, "step": 4852 }, { "epoch": 0.797076455613041, "grad_norm": 1.9222807070880692, "learning_rate": 2.0820113582027734e-06, "loss": 0.8008, "step": 4853 }, { "epoch": 0.7972406996797241, "grad_norm": 2.1867316972105098, "learning_rate": 2.07876298189556e-06, "loss": 0.7379, "step": 4854 }, { "epoch": 0.7974049437464071, "grad_norm": 1.6242534352938554, "learning_rate": 2.07551684767782e-06, "loss": 0.8385, "step": 4855 }, { "epoch": 0.7975691878130903, "grad_norm": 1.5767101307096865, "learning_rate": 2.072272956468364e-06, "loss": 0.7038, "step": 4856 }, { "epoch": 0.7977334318797733, "grad_norm": 1.8053633785465342, "learning_rate": 2.0690313091853697e-06, "loss": 0.7564, "step": 4857 }, { "epoch": 0.7978976759464564, "grad_norm": 1.9014382103337846, "learning_rate": 2.0657919067463773e-06, "loss": 0.7325, "step": 4858 }, { "epoch": 0.7980619200131396, "grad_norm": 1.5315480751801966, "learning_rate": 2.062554750068294e-06, "loss": 0.7546, "step": 4859 }, { "epoch": 0.7982261640798226, "grad_norm": 2.239506754676248, "learning_rate": 2.05931984006739e-06, "loss": 0.7995, "step": 4860 }, { "epoch": 0.7983904081465057, "grad_norm": 1.7252162635677386, "learning_rate": 2.0560871776592996e-06, "loss": 0.7348, "step": 4861 }, { "epoch": 0.7985546522131888, "grad_norm": 2.3203614012862968, "learning_rate": 2.0528567637590214e-06, "loss": 0.759, "step": 4862 }, { "epoch": 0.7987188962798719, "grad_norm": 1.543501659315871, "learning_rate": 2.0496285992809163e-06, "loss": 0.8202, "step": 4863 }, { "epoch": 0.7988831403465549, "grad_norm": 3.615681537351475, "learning_rate": 2.0464026851387096e-06, "loss": 0.7561, "step": 4864 }, { "epoch": 0.7990473844132381, "grad_norm": 1.764691731231526, "learning_rate": 2.0431790222454906e-06, "loss": 0.7726, "step": 4865 }, { "epoch": 0.7992116284799211, "grad_norm": 2.4639905035215457, "learning_rate": 2.03995761151371e-06, "loss": 0.7167, "step": 4866 }, { "epoch": 0.7993758725466042, "grad_norm": 1.7854430435289732, "learning_rate": 2.0367384538551805e-06, "loss": 0.7649, "step": 4867 }, { "epoch": 0.7995401166132874, "grad_norm": 2.0410155213833514, "learning_rate": 2.033521550181078e-06, "loss": 0.7883, "step": 4868 }, { "epoch": 0.7997043606799704, "grad_norm": 1.7680650794585169, "learning_rate": 2.0303069014019415e-06, "loss": 0.7788, "step": 4869 }, { "epoch": 0.7998686047466536, "grad_norm": 2.0668635176373686, "learning_rate": 2.0270945084276695e-06, "loss": 0.6579, "step": 4870 }, { "epoch": 0.8000328488133366, "grad_norm": 3.1613111936526472, "learning_rate": 2.0238843721675226e-06, "loss": 0.7037, "step": 4871 }, { "epoch": 0.8001970928800197, "grad_norm": 1.730843093812246, "learning_rate": 2.020676493530126e-06, "loss": 0.7632, "step": 4872 }, { "epoch": 0.8003613369467028, "grad_norm": 1.4955054241853345, "learning_rate": 2.0174708734234596e-06, "loss": 0.7862, "step": 4873 }, { "epoch": 0.8005255810133859, "grad_norm": 1.6189831828567778, "learning_rate": 2.0142675127548684e-06, "loss": 0.732, "step": 4874 }, { "epoch": 0.800689825080069, "grad_norm": 2.000216914125089, "learning_rate": 2.0110664124310574e-06, "loss": 0.7357, "step": 4875 }, { "epoch": 0.8008540691467521, "grad_norm": 1.5672650016338052, "learning_rate": 2.007867573358091e-06, "loss": 0.8008, "step": 4876 }, { "epoch": 0.8010183132134352, "grad_norm": 1.7436913037694153, "learning_rate": 2.0046709964413947e-06, "loss": 0.6365, "step": 4877 }, { "epoch": 0.8011825572801182, "grad_norm": 0.6352285379499442, "learning_rate": 2.0014766825857514e-06, "loss": 0.28, "step": 4878 }, { "epoch": 0.8013468013468014, "grad_norm": 1.8368294158338736, "learning_rate": 1.9982846326953066e-06, "loss": 0.7456, "step": 4879 }, { "epoch": 0.8015110454134844, "grad_norm": 2.340173122173122, "learning_rate": 1.995094847673561e-06, "loss": 0.7391, "step": 4880 }, { "epoch": 0.8016752894801675, "grad_norm": 1.8907421575151355, "learning_rate": 1.991907328423379e-06, "loss": 0.6942, "step": 4881 }, { "epoch": 0.8018395335468507, "grad_norm": 1.864917824220496, "learning_rate": 1.9887220758469794e-06, "loss": 0.7627, "step": 4882 }, { "epoch": 0.8020037776135337, "grad_norm": 1.7192180273533937, "learning_rate": 1.985539090845943e-06, "loss": 0.7267, "step": 4883 }, { "epoch": 0.8021680216802168, "grad_norm": 1.6917035701497067, "learning_rate": 1.982358374321205e-06, "loss": 0.8212, "step": 4884 }, { "epoch": 0.8023322657468999, "grad_norm": 2.357903487900586, "learning_rate": 1.9791799271730626e-06, "loss": 0.7841, "step": 4885 }, { "epoch": 0.802496509813583, "grad_norm": 1.8495803028975992, "learning_rate": 1.9760037503011664e-06, "loss": 0.7466, "step": 4886 }, { "epoch": 0.802660753880266, "grad_norm": 1.721388527441018, "learning_rate": 1.972829844604528e-06, "loss": 0.72, "step": 4887 }, { "epoch": 0.8028249979469492, "grad_norm": 1.6445751549583405, "learning_rate": 1.9696582109815145e-06, "loss": 0.685, "step": 4888 }, { "epoch": 0.8029892420136323, "grad_norm": 1.8754301006745926, "learning_rate": 1.966488850329851e-06, "loss": 0.7324, "step": 4889 }, { "epoch": 0.8031534860803153, "grad_norm": 1.6634101717002312, "learning_rate": 1.9633217635466164e-06, "loss": 0.7934, "step": 4890 }, { "epoch": 0.8033177301469985, "grad_norm": 1.7789178291115613, "learning_rate": 1.960156951528248e-06, "loss": 0.7795, "step": 4891 }, { "epoch": 0.8034819742136815, "grad_norm": 1.780051691815692, "learning_rate": 1.9569944151705423e-06, "loss": 0.7556, "step": 4892 }, { "epoch": 0.8036462182803646, "grad_norm": 2.0039689978719304, "learning_rate": 1.9538341553686446e-06, "loss": 0.7804, "step": 4893 }, { "epoch": 0.8038104623470477, "grad_norm": 2.216979346774783, "learning_rate": 1.950676173017062e-06, "loss": 0.7621, "step": 4894 }, { "epoch": 0.8039747064137308, "grad_norm": 1.7264033792397133, "learning_rate": 1.947520469009655e-06, "loss": 0.6668, "step": 4895 }, { "epoch": 0.804138950480414, "grad_norm": 1.9294476178585729, "learning_rate": 1.9443670442396378e-06, "loss": 0.7192, "step": 4896 }, { "epoch": 0.804303194547097, "grad_norm": 1.9346403229935052, "learning_rate": 1.941215899599581e-06, "loss": 0.7341, "step": 4897 }, { "epoch": 0.8044674386137801, "grad_norm": 2.142209221692991, "learning_rate": 1.93806703598141e-06, "loss": 0.7645, "step": 4898 }, { "epoch": 0.8046316826804631, "grad_norm": 1.8740329530211324, "learning_rate": 1.9349204542764044e-06, "loss": 0.7225, "step": 4899 }, { "epoch": 0.8047959267471463, "grad_norm": 1.6140108369261141, "learning_rate": 1.9317761553751957e-06, "loss": 0.7442, "step": 4900 }, { "epoch": 0.8049601708138293, "grad_norm": 1.9107434618182957, "learning_rate": 1.928634140167772e-06, "loss": 0.6988, "step": 4901 }, { "epoch": 0.8051244148805125, "grad_norm": 1.9151672098943815, "learning_rate": 1.9254944095434745e-06, "loss": 0.7739, "step": 4902 }, { "epoch": 0.8052886589471955, "grad_norm": 1.651244472868367, "learning_rate": 1.9223569643909978e-06, "loss": 0.7335, "step": 4903 }, { "epoch": 0.8054529030138786, "grad_norm": 2.0685334732966685, "learning_rate": 1.919221805598388e-06, "loss": 0.6762, "step": 4904 }, { "epoch": 0.8056171470805618, "grad_norm": 2.612653831368047, "learning_rate": 1.9160889340530455e-06, "loss": 0.7632, "step": 4905 }, { "epoch": 0.8057813911472448, "grad_norm": 1.6904753607224665, "learning_rate": 1.9129583506417236e-06, "loss": 0.6961, "step": 4906 }, { "epoch": 0.8059456352139279, "grad_norm": 1.875754766163042, "learning_rate": 1.9098300562505266e-06, "loss": 0.7773, "step": 4907 }, { "epoch": 0.806109879280611, "grad_norm": 1.7068074918716023, "learning_rate": 1.9067040517649115e-06, "loss": 0.7387, "step": 4908 }, { "epoch": 0.8062741233472941, "grad_norm": 1.6531725733813536, "learning_rate": 1.9035803380696883e-06, "loss": 0.7532, "step": 4909 }, { "epoch": 0.8064383674139771, "grad_norm": 2.654900567289795, "learning_rate": 1.9004589160490173e-06, "loss": 0.6989, "step": 4910 }, { "epoch": 0.8066026114806603, "grad_norm": 1.7602688121285273, "learning_rate": 1.8973397865864095e-06, "loss": 0.7684, "step": 4911 }, { "epoch": 0.8067668555473434, "grad_norm": 2.1803357168044144, "learning_rate": 1.8942229505647292e-06, "loss": 0.7134, "step": 4912 }, { "epoch": 0.8069310996140264, "grad_norm": 1.528124676732638, "learning_rate": 1.8911084088661903e-06, "loss": 0.718, "step": 4913 }, { "epoch": 0.8070953436807096, "grad_norm": 1.6744698452363744, "learning_rate": 1.8879961623723553e-06, "loss": 0.6845, "step": 4914 }, { "epoch": 0.8072595877473926, "grad_norm": 4.591898221120859, "learning_rate": 1.884886211964141e-06, "loss": 0.7206, "step": 4915 }, { "epoch": 0.8074238318140757, "grad_norm": 2.099557392238401, "learning_rate": 1.8817785585218118e-06, "loss": 0.7123, "step": 4916 }, { "epoch": 0.8075880758807588, "grad_norm": 0.6235617040295451, "learning_rate": 1.878673202924982e-06, "loss": 0.3057, "step": 4917 }, { "epoch": 0.8077523199474419, "grad_norm": 1.6489473091808937, "learning_rate": 1.8755701460526166e-06, "loss": 0.751, "step": 4918 }, { "epoch": 0.807916564014125, "grad_norm": 1.5650697376920315, "learning_rate": 1.8724693887830292e-06, "loss": 0.7747, "step": 4919 }, { "epoch": 0.8080808080808081, "grad_norm": 1.8180404559346581, "learning_rate": 1.8693709319938824e-06, "loss": 0.7096, "step": 4920 }, { "epoch": 0.8082450521474912, "grad_norm": 1.9427371354986316, "learning_rate": 1.866274776562188e-06, "loss": 0.767, "step": 4921 }, { "epoch": 0.8084092962141742, "grad_norm": 1.7529707168980606, "learning_rate": 1.863180923364306e-06, "loss": 0.7985, "step": 4922 }, { "epoch": 0.8085735402808574, "grad_norm": 2.1772055696100705, "learning_rate": 1.860089373275945e-06, "loss": 0.6895, "step": 4923 }, { "epoch": 0.8087377843475404, "grad_norm": 1.7398393078479735, "learning_rate": 1.8570001271721627e-06, "loss": 0.7192, "step": 4924 }, { "epoch": 0.8089020284142235, "grad_norm": 1.7540305314225828, "learning_rate": 1.8539131859273628e-06, "loss": 0.7158, "step": 4925 }, { "epoch": 0.8090662724809067, "grad_norm": 1.7354294891826494, "learning_rate": 1.8508285504152979e-06, "loss": 0.6839, "step": 4926 }, { "epoch": 0.8092305165475897, "grad_norm": 2.5308859898848355, "learning_rate": 1.847746221509067e-06, "loss": 0.766, "step": 4927 }, { "epoch": 0.8093947606142728, "grad_norm": 1.6704001174834773, "learning_rate": 1.8446662000811177e-06, "loss": 0.7551, "step": 4928 }, { "epoch": 0.8095590046809559, "grad_norm": 1.8234788674171694, "learning_rate": 1.841588487003243e-06, "loss": 0.7252, "step": 4929 }, { "epoch": 0.809723248747639, "grad_norm": 1.538572158698826, "learning_rate": 1.8385130831465837e-06, "loss": 0.7776, "step": 4930 }, { "epoch": 0.809887492814322, "grad_norm": 2.1571375960035373, "learning_rate": 1.8354399893816255e-06, "loss": 0.7192, "step": 4931 }, { "epoch": 0.8100517368810052, "grad_norm": 1.661128579970526, "learning_rate": 1.8323692065782018e-06, "loss": 0.7989, "step": 4932 }, { "epoch": 0.8102159809476883, "grad_norm": 1.7935884484374256, "learning_rate": 1.8293007356054903e-06, "loss": 0.7217, "step": 4933 }, { "epoch": 0.8103802250143713, "grad_norm": 1.7813930145193622, "learning_rate": 1.8262345773320167e-06, "loss": 0.6865, "step": 4934 }, { "epoch": 0.8105444690810545, "grad_norm": 5.974166414140188, "learning_rate": 1.8231707326256498e-06, "loss": 0.7896, "step": 4935 }, { "epoch": 0.8107087131477375, "grad_norm": 1.719446535665447, "learning_rate": 1.8201092023536048e-06, "loss": 0.7742, "step": 4936 }, { "epoch": 0.8108729572144207, "grad_norm": 1.3389442444867055, "learning_rate": 1.817049987382441e-06, "loss": 0.7265, "step": 4937 }, { "epoch": 0.8110372012811037, "grad_norm": 1.7047737416452462, "learning_rate": 1.8139930885780621e-06, "loss": 0.6316, "step": 4938 }, { "epoch": 0.8112014453477868, "grad_norm": 2.3202422421222098, "learning_rate": 1.8109385068057183e-06, "loss": 0.7192, "step": 4939 }, { "epoch": 0.8113656894144698, "grad_norm": 1.9408929733206006, "learning_rate": 1.8078862429300015e-06, "loss": 0.6936, "step": 4940 }, { "epoch": 0.811529933481153, "grad_norm": 2.1132048951129914, "learning_rate": 1.8048362978148492e-06, "loss": 0.7239, "step": 4941 }, { "epoch": 0.8116941775478361, "grad_norm": 1.5424913857845284, "learning_rate": 1.8017886723235423e-06, "loss": 0.67, "step": 4942 }, { "epoch": 0.8118584216145192, "grad_norm": 1.8748607358961757, "learning_rate": 1.7987433673187026e-06, "loss": 0.8048, "step": 4943 }, { "epoch": 0.8120226656812023, "grad_norm": 1.5220463095557095, "learning_rate": 1.7957003836623e-06, "loss": 0.725, "step": 4944 }, { "epoch": 0.8121869097478853, "grad_norm": 1.7835445546713782, "learning_rate": 1.7926597222156438e-06, "loss": 0.7546, "step": 4945 }, { "epoch": 0.8123511538145685, "grad_norm": 1.7614303756685576, "learning_rate": 1.789621383839386e-06, "loss": 0.7674, "step": 4946 }, { "epoch": 0.8125153978812515, "grad_norm": 1.8059682905797074, "learning_rate": 1.786585369393522e-06, "loss": 0.7581, "step": 4947 }, { "epoch": 0.8126796419479346, "grad_norm": 2.0189161232744715, "learning_rate": 1.7835516797373908e-06, "loss": 0.7258, "step": 4948 }, { "epoch": 0.8128438860146178, "grad_norm": 1.9065425065063952, "learning_rate": 1.7805203157296692e-06, "loss": 0.6822, "step": 4949 }, { "epoch": 0.8130081300813008, "grad_norm": 2.5580290144410682, "learning_rate": 1.7774912782283815e-06, "loss": 0.7748, "step": 4950 }, { "epoch": 0.8131723741479839, "grad_norm": 1.8600671957390766, "learning_rate": 1.7744645680908878e-06, "loss": 0.7515, "step": 4951 }, { "epoch": 0.813336618214667, "grad_norm": 0.6409575460502052, "learning_rate": 1.771440186173894e-06, "loss": 0.3049, "step": 4952 }, { "epoch": 0.8135008622813501, "grad_norm": 1.8930635794440425, "learning_rate": 1.7684181333334437e-06, "loss": 0.8051, "step": 4953 }, { "epoch": 0.8136651063480331, "grad_norm": 1.4977113630341405, "learning_rate": 1.7653984104249221e-06, "loss": 0.6909, "step": 4954 }, { "epoch": 0.8138293504147163, "grad_norm": 2.5999931201324706, "learning_rate": 1.7623810183030576e-06, "loss": 0.7672, "step": 4955 }, { "epoch": 0.8139935944813994, "grad_norm": 2.023734718256377, "learning_rate": 1.7593659578219147e-06, "loss": 0.6949, "step": 4956 }, { "epoch": 0.8141578385480824, "grad_norm": 1.6844258531542329, "learning_rate": 1.7563532298349018e-06, "loss": 0.75, "step": 4957 }, { "epoch": 0.8143220826147656, "grad_norm": 1.8835395013417784, "learning_rate": 1.7533428351947634e-06, "loss": 0.7301, "step": 4958 }, { "epoch": 0.8144863266814486, "grad_norm": 1.5372444733066761, "learning_rate": 1.7503347747535859e-06, "loss": 0.7786, "step": 4959 }, { "epoch": 0.8146505707481317, "grad_norm": 1.7831741093905427, "learning_rate": 1.7473290493627948e-06, "loss": 0.7675, "step": 4960 }, { "epoch": 0.8148148148148148, "grad_norm": 4.354487167443821, "learning_rate": 1.744325659873154e-06, "loss": 0.6765, "step": 4961 }, { "epoch": 0.8149790588814979, "grad_norm": 2.3655967907954514, "learning_rate": 1.7413246071347667e-06, "loss": 0.6896, "step": 4962 }, { "epoch": 0.815143302948181, "grad_norm": 2.0063637354222705, "learning_rate": 1.7383258919970746e-06, "loss": 0.7097, "step": 4963 }, { "epoch": 0.8153075470148641, "grad_norm": 2.0588034042761003, "learning_rate": 1.7353295153088578e-06, "loss": 0.6834, "step": 4964 }, { "epoch": 0.8154717910815472, "grad_norm": 1.5586370326443468, "learning_rate": 1.7323354779182345e-06, "loss": 0.7637, "step": 4965 }, { "epoch": 0.8156360351482302, "grad_norm": 2.1253221737183994, "learning_rate": 1.729343780672661e-06, "loss": 0.715, "step": 4966 }, { "epoch": 0.8158002792149134, "grad_norm": 2.0818591281375536, "learning_rate": 1.7263544244189302e-06, "loss": 0.7518, "step": 4967 }, { "epoch": 0.8159645232815964, "grad_norm": 2.0080466832958215, "learning_rate": 1.7233674100031728e-06, "loss": 0.7351, "step": 4968 }, { "epoch": 0.8161287673482795, "grad_norm": 2.2688511810591474, "learning_rate": 1.7203827382708582e-06, "loss": 0.7289, "step": 4969 }, { "epoch": 0.8162930114149627, "grad_norm": 1.549462463426492, "learning_rate": 1.7174004100667907e-06, "loss": 0.7843, "step": 4970 }, { "epoch": 0.8164572554816457, "grad_norm": 1.6100175194384192, "learning_rate": 1.7144204262351116e-06, "loss": 0.7347, "step": 4971 }, { "epoch": 0.8166214995483289, "grad_norm": 1.633731187333372, "learning_rate": 1.7114427876192996e-06, "loss": 0.7759, "step": 4972 }, { "epoch": 0.8167857436150119, "grad_norm": 1.8747301748232557, "learning_rate": 1.7084674950621694e-06, "loss": 0.6987, "step": 4973 }, { "epoch": 0.816949987681695, "grad_norm": 0.6114575693241511, "learning_rate": 1.7054945494058705e-06, "loss": 0.3188, "step": 4974 }, { "epoch": 0.817114231748378, "grad_norm": 1.5227288646488826, "learning_rate": 1.7025239514918913e-06, "loss": 0.6726, "step": 4975 }, { "epoch": 0.8172784758150612, "grad_norm": 1.7344748440132476, "learning_rate": 1.6995557021610477e-06, "loss": 0.7205, "step": 4976 }, { "epoch": 0.8174427198817442, "grad_norm": 9.56517918641393, "learning_rate": 1.696589802253501e-06, "loss": 0.7702, "step": 4977 }, { "epoch": 0.8176069639484274, "grad_norm": 1.5834528654063043, "learning_rate": 1.6936262526087432e-06, "loss": 0.7185, "step": 4978 }, { "epoch": 0.8177712080151105, "grad_norm": 1.5567138381623995, "learning_rate": 1.690665054065599e-06, "loss": 0.7889, "step": 4979 }, { "epoch": 0.8179354520817935, "grad_norm": 1.6564467658663058, "learning_rate": 1.6877062074622296e-06, "loss": 0.7169, "step": 4980 }, { "epoch": 0.8180996961484767, "grad_norm": 1.706054421911076, "learning_rate": 1.6847497136361312e-06, "loss": 0.7814, "step": 4981 }, { "epoch": 0.8182639402151597, "grad_norm": 2.431466008124783, "learning_rate": 1.6817955734241321e-06, "loss": 0.6641, "step": 4982 }, { "epoch": 0.8184281842818428, "grad_norm": 1.7719459955867445, "learning_rate": 1.6788437876623963e-06, "loss": 0.6284, "step": 4983 }, { "epoch": 0.8185924283485259, "grad_norm": 1.8063011238698101, "learning_rate": 1.6758943571864206e-06, "loss": 0.687, "step": 4984 }, { "epoch": 0.818756672415209, "grad_norm": 1.6792600788631817, "learning_rate": 1.6729472828310334e-06, "loss": 0.8047, "step": 4985 }, { "epoch": 0.8189209164818921, "grad_norm": 1.8854051249001142, "learning_rate": 1.670002565430401e-06, "loss": 0.7186, "step": 4986 }, { "epoch": 0.8190851605485752, "grad_norm": 0.6150846614398294, "learning_rate": 1.6670602058180164e-06, "loss": 0.3221, "step": 4987 }, { "epoch": 0.8192494046152583, "grad_norm": 3.4746717673868903, "learning_rate": 1.6641202048267102e-06, "loss": 0.7071, "step": 4988 }, { "epoch": 0.8194136486819413, "grad_norm": 2.1791372881052027, "learning_rate": 1.6611825632886424e-06, "loss": 0.7568, "step": 4989 }, { "epoch": 0.8195778927486245, "grad_norm": 0.6216914282090794, "learning_rate": 1.658247282035307e-06, "loss": 0.3212, "step": 4990 }, { "epoch": 0.8197421368153075, "grad_norm": 2.296103150355904, "learning_rate": 1.6553143618975288e-06, "loss": 0.76, "step": 4991 }, { "epoch": 0.8199063808819906, "grad_norm": 1.8814375718090721, "learning_rate": 1.6523838037054652e-06, "loss": 0.6974, "step": 4992 }, { "epoch": 0.8200706249486738, "grad_norm": 1.8685483709026414, "learning_rate": 1.6494556082886038e-06, "loss": 0.7175, "step": 4993 }, { "epoch": 0.8202348690153568, "grad_norm": 1.4663265427947227, "learning_rate": 1.646529776475765e-06, "loss": 0.8177, "step": 4994 }, { "epoch": 0.8203991130820399, "grad_norm": 1.9284231553945679, "learning_rate": 1.6436063090950982e-06, "loss": 0.7009, "step": 4995 }, { "epoch": 0.820563357148723, "grad_norm": 1.9333023905386726, "learning_rate": 1.6406852069740876e-06, "loss": 0.8585, "step": 4996 }, { "epoch": 0.8207276012154061, "grad_norm": 1.8786673452939182, "learning_rate": 1.6377664709395403e-06, "loss": 0.7457, "step": 4997 }, { "epoch": 0.8208918452820891, "grad_norm": 2.095403083032877, "learning_rate": 1.634850101817601e-06, "loss": 0.6971, "step": 4998 }, { "epoch": 0.8210560893487723, "grad_norm": 2.3399238009337275, "learning_rate": 1.631936100433742e-06, "loss": 0.7513, "step": 4999 }, { "epoch": 0.8212203334154554, "grad_norm": 1.8514341933191372, "learning_rate": 1.629024467612762e-06, "loss": 0.8303, "step": 5000 }, { "epoch": 0.8213845774821384, "grad_norm": 2.0949189337327994, "learning_rate": 1.6261152041787986e-06, "loss": 0.7009, "step": 5001 }, { "epoch": 0.8215488215488216, "grad_norm": 1.602831538547282, "learning_rate": 1.6232083109553088e-06, "loss": 0.7621, "step": 5002 }, { "epoch": 0.8217130656155046, "grad_norm": 1.809731852167696, "learning_rate": 1.6203037887650842e-06, "loss": 0.7567, "step": 5003 }, { "epoch": 0.8218773096821878, "grad_norm": 1.6908072629242386, "learning_rate": 1.6174016384302415e-06, "loss": 0.7393, "step": 5004 }, { "epoch": 0.8220415537488708, "grad_norm": 2.248323858000244, "learning_rate": 1.6145018607722308e-06, "loss": 0.7148, "step": 5005 }, { "epoch": 0.8222057978155539, "grad_norm": 2.0801771981099333, "learning_rate": 1.6116044566118261e-06, "loss": 0.7199, "step": 5006 }, { "epoch": 0.8223700418822371, "grad_norm": 2.012603499764473, "learning_rate": 1.6087094267691329e-06, "loss": 0.6342, "step": 5007 }, { "epoch": 0.8225342859489201, "grad_norm": 2.338087251034915, "learning_rate": 1.6058167720635832e-06, "loss": 0.7236, "step": 5008 }, { "epoch": 0.8226985300156032, "grad_norm": 1.792808508968351, "learning_rate": 1.6029264933139354e-06, "loss": 0.7519, "step": 5009 }, { "epoch": 0.8228627740822863, "grad_norm": 1.8375181273870806, "learning_rate": 1.6000385913382777e-06, "loss": 0.7492, "step": 5010 }, { "epoch": 0.8230270181489694, "grad_norm": 1.8866493255088699, "learning_rate": 1.5971530669540258e-06, "loss": 0.7487, "step": 5011 }, { "epoch": 0.8231912622156524, "grad_norm": 1.6288024340667162, "learning_rate": 1.5942699209779189e-06, "loss": 0.7042, "step": 5012 }, { "epoch": 0.8233555062823356, "grad_norm": 2.013357783044878, "learning_rate": 1.5913891542260284e-06, "loss": 0.6958, "step": 5013 }, { "epoch": 0.8235197503490186, "grad_norm": 8.572088565692397, "learning_rate": 1.5885107675137468e-06, "loss": 0.7311, "step": 5014 }, { "epoch": 0.8236839944157017, "grad_norm": 1.9710998621449007, "learning_rate": 1.5856347616557955e-06, "loss": 0.7834, "step": 5015 }, { "epoch": 0.8238482384823849, "grad_norm": 1.614140617091305, "learning_rate": 1.5827611374662233e-06, "loss": 0.7047, "step": 5016 }, { "epoch": 0.8240124825490679, "grad_norm": 0.6286524170735063, "learning_rate": 1.579889895758403e-06, "loss": 0.2912, "step": 5017 }, { "epoch": 0.824176726615751, "grad_norm": 1.661241380302949, "learning_rate": 1.5770210373450356e-06, "loss": 0.815, "step": 5018 }, { "epoch": 0.8243409706824341, "grad_norm": 1.9691585115244006, "learning_rate": 1.5741545630381405e-06, "loss": 0.7237, "step": 5019 }, { "epoch": 0.8245052147491172, "grad_norm": 1.8211309180293174, "learning_rate": 1.571290473649071e-06, "loss": 0.6963, "step": 5020 }, { "epoch": 0.8246694588158002, "grad_norm": 2.2161658685203327, "learning_rate": 1.5684287699884993e-06, "loss": 0.6622, "step": 5021 }, { "epoch": 0.8248337028824834, "grad_norm": 1.5168127110476084, "learning_rate": 1.5655694528664266e-06, "loss": 0.838, "step": 5022 }, { "epoch": 0.8249979469491665, "grad_norm": 2.0126540843499123, "learning_rate": 1.5627125230921725e-06, "loss": 0.7677, "step": 5023 }, { "epoch": 0.8251621910158495, "grad_norm": 1.6446032948827665, "learning_rate": 1.5598579814743918e-06, "loss": 0.7391, "step": 5024 }, { "epoch": 0.8253264350825327, "grad_norm": 2.6465809940867295, "learning_rate": 1.5570058288210522e-06, "loss": 0.7763, "step": 5025 }, { "epoch": 0.8254906791492157, "grad_norm": 1.8338530828004946, "learning_rate": 1.5541560659394507e-06, "loss": 0.7692, "step": 5026 }, { "epoch": 0.8256549232158988, "grad_norm": 1.7701635517230343, "learning_rate": 1.5513086936362053e-06, "loss": 0.7941, "step": 5027 }, { "epoch": 0.8258191672825819, "grad_norm": 1.700570513809913, "learning_rate": 1.5484637127172609e-06, "loss": 0.7739, "step": 5028 }, { "epoch": 0.825983411349265, "grad_norm": 2.512584391905139, "learning_rate": 1.5456211239878815e-06, "loss": 0.6872, "step": 5029 }, { "epoch": 0.8261476554159481, "grad_norm": 1.6455658629854435, "learning_rate": 1.5427809282526563e-06, "loss": 0.6848, "step": 5030 }, { "epoch": 0.8263118994826312, "grad_norm": 2.0467870175762775, "learning_rate": 1.5399431263154973e-06, "loss": 0.709, "step": 5031 }, { "epoch": 0.8264761435493143, "grad_norm": 1.687205924123307, "learning_rate": 1.537107718979638e-06, "loss": 0.6991, "step": 5032 }, { "epoch": 0.8266403876159973, "grad_norm": 1.9224391136600862, "learning_rate": 1.5342747070476339e-06, "loss": 0.8058, "step": 5033 }, { "epoch": 0.8268046316826805, "grad_norm": 1.4713321947610567, "learning_rate": 1.5314440913213645e-06, "loss": 0.6191, "step": 5034 }, { "epoch": 0.8269688757493635, "grad_norm": 1.9695349066716457, "learning_rate": 1.5286158726020283e-06, "loss": 0.7065, "step": 5035 }, { "epoch": 0.8271331198160466, "grad_norm": 1.617104988309295, "learning_rate": 1.5257900516901469e-06, "loss": 0.726, "step": 5036 }, { "epoch": 0.8272973638827298, "grad_norm": 1.6240398318505118, "learning_rate": 1.522966629385564e-06, "loss": 0.6839, "step": 5037 }, { "epoch": 0.8274616079494128, "grad_norm": 1.5229706709926214, "learning_rate": 1.520145606487442e-06, "loss": 0.7653, "step": 5038 }, { "epoch": 0.827625852016096, "grad_norm": 1.8861362028618969, "learning_rate": 1.5173269837942683e-06, "loss": 0.7084, "step": 5039 }, { "epoch": 0.827790096082779, "grad_norm": 1.5721191660504124, "learning_rate": 1.5145107621038447e-06, "loss": 0.8133, "step": 5040 }, { "epoch": 0.8279543401494621, "grad_norm": 1.8979644523433878, "learning_rate": 1.5116969422132966e-06, "loss": 0.6517, "step": 5041 }, { "epoch": 0.8281185842161451, "grad_norm": 1.6978306505344498, "learning_rate": 1.5088855249190715e-06, "loss": 0.795, "step": 5042 }, { "epoch": 0.8282828282828283, "grad_norm": 1.4176625069489, "learning_rate": 1.506076511016935e-06, "loss": 0.7213, "step": 5043 }, { "epoch": 0.8284470723495114, "grad_norm": 1.4236785531472118, "learning_rate": 1.5032699013019724e-06, "loss": 0.7395, "step": 5044 }, { "epoch": 0.8286113164161945, "grad_norm": 3.092190565842293, "learning_rate": 1.5004656965685892e-06, "loss": 0.7711, "step": 5045 }, { "epoch": 0.8287755604828776, "grad_norm": 1.883551816384392, "learning_rate": 1.4976638976105063e-06, "loss": 0.6813, "step": 5046 }, { "epoch": 0.8289398045495606, "grad_norm": 1.8456196588183043, "learning_rate": 1.4948645052207721e-06, "loss": 0.7237, "step": 5047 }, { "epoch": 0.8291040486162438, "grad_norm": 2.707063297796928, "learning_rate": 1.4920675201917467e-06, "loss": 0.7579, "step": 5048 }, { "epoch": 0.8292682926829268, "grad_norm": 1.6131873045592606, "learning_rate": 1.4892729433151099e-06, "loss": 0.755, "step": 5049 }, { "epoch": 0.8294325367496099, "grad_norm": 1.9232636088789832, "learning_rate": 1.4864807753818623e-06, "loss": 0.7234, "step": 5050 }, { "epoch": 0.8295967808162931, "grad_norm": 2.424148495871534, "learning_rate": 1.4836910171823205e-06, "loss": 0.6947, "step": 5051 }, { "epoch": 0.8297610248829761, "grad_norm": 1.5194334379832004, "learning_rate": 1.4809036695061184e-06, "loss": 0.7288, "step": 5052 }, { "epoch": 0.8299252689496592, "grad_norm": 1.6749323724210896, "learning_rate": 1.4781187331422109e-06, "loss": 0.7403, "step": 5053 }, { "epoch": 0.8300895130163423, "grad_norm": 1.8258227004964787, "learning_rate": 1.475336208878868e-06, "loss": 0.7619, "step": 5054 }, { "epoch": 0.8302537570830254, "grad_norm": 2.407727550091275, "learning_rate": 1.4725560975036756e-06, "loss": 0.7459, "step": 5055 }, { "epoch": 0.8304180011497084, "grad_norm": 2.146995018570104, "learning_rate": 1.469778399803541e-06, "loss": 0.6066, "step": 5056 }, { "epoch": 0.8305822452163916, "grad_norm": 1.4854350269852763, "learning_rate": 1.4670031165646836e-06, "loss": 0.7102, "step": 5057 }, { "epoch": 0.8307464892830746, "grad_norm": 1.7009137692184924, "learning_rate": 1.4642302485726423e-06, "loss": 0.7241, "step": 5058 }, { "epoch": 0.8309107333497577, "grad_norm": 2.417113484503086, "learning_rate": 1.4614597966122712e-06, "loss": 0.6884, "step": 5059 }, { "epoch": 0.8310749774164409, "grad_norm": 1.8826046598059523, "learning_rate": 1.4586917614677409e-06, "loss": 0.7016, "step": 5060 }, { "epoch": 0.8312392214831239, "grad_norm": 1.704375395456636, "learning_rate": 1.4559261439225415e-06, "loss": 0.6889, "step": 5061 }, { "epoch": 0.831403465549807, "grad_norm": 2.145339956596673, "learning_rate": 1.4531629447594685e-06, "loss": 0.6771, "step": 5062 }, { "epoch": 0.8315677096164901, "grad_norm": 0.6250520985168388, "learning_rate": 1.4504021647606448e-06, "loss": 0.3192, "step": 5063 }, { "epoch": 0.8317319536831732, "grad_norm": 1.5501733502932964, "learning_rate": 1.4476438047075004e-06, "loss": 0.7052, "step": 5064 }, { "epoch": 0.8318961977498562, "grad_norm": 1.6890739522313316, "learning_rate": 1.4448878653807841e-06, "loss": 0.6975, "step": 5065 }, { "epoch": 0.8320604418165394, "grad_norm": 1.6176612042644178, "learning_rate": 1.4421343475605597e-06, "loss": 0.6847, "step": 5066 }, { "epoch": 0.8322246858832225, "grad_norm": 1.5956772514403388, "learning_rate": 1.4393832520262042e-06, "loss": 0.7852, "step": 5067 }, { "epoch": 0.8323889299499055, "grad_norm": 2.181473629431418, "learning_rate": 1.4366345795564084e-06, "loss": 0.6716, "step": 5068 }, { "epoch": 0.8325531740165887, "grad_norm": 1.664168477609314, "learning_rate": 1.4338883309291773e-06, "loss": 0.7127, "step": 5069 }, { "epoch": 0.8327174180832717, "grad_norm": 1.959056977090934, "learning_rate": 1.4311445069218355e-06, "loss": 0.7356, "step": 5070 }, { "epoch": 0.8328816621499548, "grad_norm": 1.9883545858403393, "learning_rate": 1.4284031083110127e-06, "loss": 0.7313, "step": 5071 }, { "epoch": 0.8330459062166379, "grad_norm": 1.5571643057680842, "learning_rate": 1.4256641358726586e-06, "loss": 0.6942, "step": 5072 }, { "epoch": 0.833210150283321, "grad_norm": 2.2910291180497144, "learning_rate": 1.4229275903820306e-06, "loss": 0.7245, "step": 5073 }, { "epoch": 0.8333743943500042, "grad_norm": 2.3753230225145217, "learning_rate": 1.4201934726137046e-06, "loss": 0.7554, "step": 5074 }, { "epoch": 0.8335386384166872, "grad_norm": 1.9617185769531178, "learning_rate": 1.417461783341565e-06, "loss": 0.6917, "step": 5075 }, { "epoch": 0.8337028824833703, "grad_norm": 4.246997786210959, "learning_rate": 1.414732523338812e-06, "loss": 0.7235, "step": 5076 }, { "epoch": 0.8338671265500534, "grad_norm": 1.5813274857654753, "learning_rate": 1.4120056933779558e-06, "loss": 0.7495, "step": 5077 }, { "epoch": 0.8340313706167365, "grad_norm": 1.9448752836088778, "learning_rate": 1.409281294230821e-06, "loss": 0.7819, "step": 5078 }, { "epoch": 0.8341956146834195, "grad_norm": 1.5273506166222222, "learning_rate": 1.4065593266685428e-06, "loss": 0.7312, "step": 5079 }, { "epoch": 0.8343598587501027, "grad_norm": 1.7320910618461194, "learning_rate": 1.4038397914615686e-06, "loss": 0.7778, "step": 5080 }, { "epoch": 0.8345241028167858, "grad_norm": 1.4227335600377835, "learning_rate": 1.4011226893796559e-06, "loss": 0.7246, "step": 5081 }, { "epoch": 0.8346883468834688, "grad_norm": 2.53181180194319, "learning_rate": 1.398408021191876e-06, "loss": 0.7613, "step": 5082 }, { "epoch": 0.834852590950152, "grad_norm": 1.6410423149649873, "learning_rate": 1.395695787666611e-06, "loss": 0.8195, "step": 5083 }, { "epoch": 0.835016835016835, "grad_norm": 1.7749570409519626, "learning_rate": 1.3929859895715502e-06, "loss": 0.7091, "step": 5084 }, { "epoch": 0.8351810790835181, "grad_norm": 1.6608024142428215, "learning_rate": 1.3902786276736969e-06, "loss": 0.807, "step": 5085 }, { "epoch": 0.8353453231502012, "grad_norm": 1.7831412941638212, "learning_rate": 1.3875737027393655e-06, "loss": 0.7813, "step": 5086 }, { "epoch": 0.8355095672168843, "grad_norm": 1.5752194252141434, "learning_rate": 1.384871215534177e-06, "loss": 0.6815, "step": 5087 }, { "epoch": 0.8356738112835674, "grad_norm": 1.5577478625537924, "learning_rate": 1.3821711668230675e-06, "loss": 0.814, "step": 5088 }, { "epoch": 0.8358380553502505, "grad_norm": 2.004219141854859, "learning_rate": 1.379473557370279e-06, "loss": 0.8083, "step": 5089 }, { "epoch": 0.8360022994169336, "grad_norm": 2.180488966257001, "learning_rate": 1.3767783879393637e-06, "loss": 0.7486, "step": 5090 }, { "epoch": 0.8361665434836166, "grad_norm": 1.8122908171960912, "learning_rate": 1.374085659293184e-06, "loss": 0.7526, "step": 5091 }, { "epoch": 0.8363307875502998, "grad_norm": 0.5819166005947805, "learning_rate": 1.3713953721939122e-06, "loss": 0.2697, "step": 5092 }, { "epoch": 0.8364950316169828, "grad_norm": 1.9466979337069188, "learning_rate": 1.3687075274030238e-06, "loss": 0.7205, "step": 5093 }, { "epoch": 0.8366592756836659, "grad_norm": 1.8763514903505243, "learning_rate": 1.3660221256813133e-06, "loss": 0.7163, "step": 5094 }, { "epoch": 0.836823519750349, "grad_norm": 1.91604182740193, "learning_rate": 1.3633391677888763e-06, "loss": 0.8091, "step": 5095 }, { "epoch": 0.8369877638170321, "grad_norm": 0.6189867545201743, "learning_rate": 1.3606586544851185e-06, "loss": 0.3194, "step": 5096 }, { "epoch": 0.8371520078837152, "grad_norm": 1.6678882847618273, "learning_rate": 1.3579805865287521e-06, "loss": 0.7577, "step": 5097 }, { "epoch": 0.8373162519503983, "grad_norm": 1.6132544734084433, "learning_rate": 1.3553049646777993e-06, "loss": 0.6291, "step": 5098 }, { "epoch": 0.8374804960170814, "grad_norm": 1.4158472529361068, "learning_rate": 1.3526317896895903e-06, "loss": 0.7044, "step": 5099 }, { "epoch": 0.8376447400837644, "grad_norm": 2.33977418074492, "learning_rate": 1.3499610623207604e-06, "loss": 0.6823, "step": 5100 }, { "epoch": 0.8378089841504476, "grad_norm": 1.5713187429744009, "learning_rate": 1.3472927833272541e-06, "loss": 0.6961, "step": 5101 }, { "epoch": 0.8379732282171306, "grad_norm": 3.3328644969720798, "learning_rate": 1.3446269534643209e-06, "loss": 0.7796, "step": 5102 }, { "epoch": 0.8381374722838137, "grad_norm": 2.093369754713053, "learning_rate": 1.34196357348652e-06, "loss": 0.7654, "step": 5103 }, { "epoch": 0.8383017163504969, "grad_norm": 2.2322782270045645, "learning_rate": 1.3393026441477163e-06, "loss": 0.7279, "step": 5104 }, { "epoch": 0.8384659604171799, "grad_norm": 1.6750432951108787, "learning_rate": 1.3366441662010765e-06, "loss": 0.7512, "step": 5105 }, { "epoch": 0.838630204483863, "grad_norm": 1.395419925766098, "learning_rate": 1.3339881403990796e-06, "loss": 0.729, "step": 5106 }, { "epoch": 0.8387944485505461, "grad_norm": 1.8286129549818586, "learning_rate": 1.3313345674935085e-06, "loss": 0.6924, "step": 5107 }, { "epoch": 0.8389586926172292, "grad_norm": 1.67836700521491, "learning_rate": 1.3286834482354506e-06, "loss": 0.7469, "step": 5108 }, { "epoch": 0.8391229366839122, "grad_norm": 1.9401596242225503, "learning_rate": 1.3260347833752995e-06, "loss": 0.8533, "step": 5109 }, { "epoch": 0.8392871807505954, "grad_norm": 2.0074545376384085, "learning_rate": 1.3233885736627538e-06, "loss": 0.8133, "step": 5110 }, { "epoch": 0.8394514248172785, "grad_norm": 1.7363144852710373, "learning_rate": 1.3207448198468199e-06, "loss": 0.7367, "step": 5111 }, { "epoch": 0.8396156688839616, "grad_norm": 1.8550842867168995, "learning_rate": 1.3181035226758044e-06, "loss": 0.8103, "step": 5112 }, { "epoch": 0.8397799129506447, "grad_norm": 1.459816546612852, "learning_rate": 1.3154646828973217e-06, "loss": 0.7803, "step": 5113 }, { "epoch": 0.8399441570173277, "grad_norm": 1.8168071246065975, "learning_rate": 1.3128283012582887e-06, "loss": 0.7871, "step": 5114 }, { "epoch": 0.8401084010840109, "grad_norm": 1.82319780717159, "learning_rate": 1.3101943785049309e-06, "loss": 0.8089, "step": 5115 }, { "epoch": 0.8402726451506939, "grad_norm": 1.810042515094005, "learning_rate": 1.3075629153827685e-06, "loss": 0.7353, "step": 5116 }, { "epoch": 0.840436889217377, "grad_norm": 0.6259987089824272, "learning_rate": 1.3049339126366388e-06, "loss": 0.3428, "step": 5117 }, { "epoch": 0.8406011332840602, "grad_norm": 1.617177827295305, "learning_rate": 1.3023073710106726e-06, "loss": 0.7583, "step": 5118 }, { "epoch": 0.8407653773507432, "grad_norm": 2.1521058314222072, "learning_rate": 1.2996832912483059e-06, "loss": 0.6825, "step": 5119 }, { "epoch": 0.8409296214174263, "grad_norm": 1.878283569449726, "learning_rate": 1.2970616740922804e-06, "loss": 0.6971, "step": 5120 }, { "epoch": 0.8410938654841094, "grad_norm": 1.9945620706274554, "learning_rate": 1.294442520284639e-06, "loss": 0.7573, "step": 5121 }, { "epoch": 0.8412581095507925, "grad_norm": 3.370065051351687, "learning_rate": 1.2918258305667264e-06, "loss": 0.6968, "step": 5122 }, { "epoch": 0.8414223536174755, "grad_norm": 2.0316556393109475, "learning_rate": 1.2892116056791927e-06, "loss": 0.7632, "step": 5123 }, { "epoch": 0.8415865976841587, "grad_norm": 2.003956297349745, "learning_rate": 1.2865998463619878e-06, "loss": 0.7106, "step": 5124 }, { "epoch": 0.8417508417508418, "grad_norm": 1.6864791457935446, "learning_rate": 1.283990553354365e-06, "loss": 0.6522, "step": 5125 }, { "epoch": 0.8419150858175248, "grad_norm": 1.832096993484862, "learning_rate": 1.2813837273948815e-06, "loss": 0.7975, "step": 5126 }, { "epoch": 0.842079329884208, "grad_norm": 1.7123068540385478, "learning_rate": 1.2787793692213902e-06, "loss": 0.7216, "step": 5127 }, { "epoch": 0.842243573950891, "grad_norm": 1.8091005594278324, "learning_rate": 1.2761774795710502e-06, "loss": 0.6726, "step": 5128 }, { "epoch": 0.8424078180175741, "grad_norm": 1.589711358716679, "learning_rate": 1.2735780591803205e-06, "loss": 0.7512, "step": 5129 }, { "epoch": 0.8425720620842572, "grad_norm": 5.396420742508218, "learning_rate": 1.2709811087849633e-06, "loss": 0.7003, "step": 5130 }, { "epoch": 0.8427363061509403, "grad_norm": 1.661325660509269, "learning_rate": 1.2683866291200386e-06, "loss": 0.7953, "step": 5131 }, { "epoch": 0.8429005502176233, "grad_norm": 1.805240657248031, "learning_rate": 1.2657946209199078e-06, "loss": 0.7522, "step": 5132 }, { "epoch": 0.8430647942843065, "grad_norm": 1.8570903761269586, "learning_rate": 1.2632050849182365e-06, "loss": 0.745, "step": 5133 }, { "epoch": 0.8432290383509896, "grad_norm": 1.7749690639641846, "learning_rate": 1.2606180218479836e-06, "loss": 0.747, "step": 5134 }, { "epoch": 0.8433932824176726, "grad_norm": 2.0528177328830317, "learning_rate": 1.2580334324414145e-06, "loss": 0.7562, "step": 5135 }, { "epoch": 0.8435575264843558, "grad_norm": 1.7069499013081229, "learning_rate": 1.255451317430091e-06, "loss": 0.7889, "step": 5136 }, { "epoch": 0.8437217705510388, "grad_norm": 1.966455618462793, "learning_rate": 1.2528716775448747e-06, "loss": 0.7062, "step": 5137 }, { "epoch": 0.843886014617722, "grad_norm": 1.7689377719374042, "learning_rate": 1.2502945135159272e-06, "loss": 0.8099, "step": 5138 }, { "epoch": 0.844050258684405, "grad_norm": 1.5791209465938854, "learning_rate": 1.2477198260727085e-06, "loss": 0.7686, "step": 5139 }, { "epoch": 0.8442145027510881, "grad_norm": 1.9223025848069606, "learning_rate": 1.2451476159439824e-06, "loss": 0.6989, "step": 5140 }, { "epoch": 0.8443787468177713, "grad_norm": 1.5271604595269654, "learning_rate": 1.242577883857804e-06, "loss": 0.7381, "step": 5141 }, { "epoch": 0.8445429908844543, "grad_norm": 1.9138892184029412, "learning_rate": 1.2400106305415317e-06, "loss": 0.7186, "step": 5142 }, { "epoch": 0.8447072349511374, "grad_norm": 2.421818339248674, "learning_rate": 1.2374458567218217e-06, "loss": 0.7644, "step": 5143 }, { "epoch": 0.8448714790178204, "grad_norm": 2.4751168790050095, "learning_rate": 1.234883563124627e-06, "loss": 0.7163, "step": 5144 }, { "epoch": 0.8450357230845036, "grad_norm": 2.544246910953795, "learning_rate": 1.2323237504751995e-06, "loss": 0.8059, "step": 5145 }, { "epoch": 0.8451999671511866, "grad_norm": 1.6026384689462974, "learning_rate": 1.2297664194980885e-06, "loss": 0.7173, "step": 5146 }, { "epoch": 0.8453642112178698, "grad_norm": 1.5523791225365262, "learning_rate": 1.2272115709171418e-06, "loss": 0.761, "step": 5147 }, { "epoch": 0.8455284552845529, "grad_norm": 1.683182697206517, "learning_rate": 1.2246592054555062e-06, "loss": 0.7713, "step": 5148 }, { "epoch": 0.8456926993512359, "grad_norm": 1.6317117297669823, "learning_rate": 1.2221093238356185e-06, "loss": 0.7016, "step": 5149 }, { "epoch": 0.8458569434179191, "grad_norm": 0.5961361480420788, "learning_rate": 1.2195619267792192e-06, "loss": 0.3152, "step": 5150 }, { "epoch": 0.8460211874846021, "grad_norm": 2.2754926832917355, "learning_rate": 1.2170170150073446e-06, "loss": 0.7721, "step": 5151 }, { "epoch": 0.8461854315512852, "grad_norm": 1.7979695652825567, "learning_rate": 1.2144745892403253e-06, "loss": 0.7362, "step": 5152 }, { "epoch": 0.8463496756179683, "grad_norm": 1.6493009075876852, "learning_rate": 1.2119346501977914e-06, "loss": 0.738, "step": 5153 }, { "epoch": 0.8465139196846514, "grad_norm": 2.0442789123509, "learning_rate": 1.2093971985986653e-06, "loss": 0.8012, "step": 5154 }, { "epoch": 0.8466781637513345, "grad_norm": 2.48453168709846, "learning_rate": 1.2068622351611691e-06, "loss": 0.7548, "step": 5155 }, { "epoch": 0.8468424078180176, "grad_norm": 2.0697205463130373, "learning_rate": 1.2043297606028181e-06, "loss": 0.7756, "step": 5156 }, { "epoch": 0.8470066518847007, "grad_norm": 2.8653172079768425, "learning_rate": 1.2017997756404232e-06, "loss": 0.7515, "step": 5157 }, { "epoch": 0.8471708959513837, "grad_norm": 1.9745524023441867, "learning_rate": 1.1992722809900925e-06, "loss": 0.7969, "step": 5158 }, { "epoch": 0.8473351400180669, "grad_norm": 1.978829697215563, "learning_rate": 1.1967472773672284e-06, "loss": 0.7941, "step": 5159 }, { "epoch": 0.8474993840847499, "grad_norm": 2.180816888426352, "learning_rate": 1.1942247654865259e-06, "loss": 0.7128, "step": 5160 }, { "epoch": 0.847663628151433, "grad_norm": 1.9101002023076932, "learning_rate": 1.1917047460619778e-06, "loss": 0.7472, "step": 5161 }, { "epoch": 0.8478278722181162, "grad_norm": 1.9291832938717732, "learning_rate": 1.1891872198068689e-06, "loss": 0.7496, "step": 5162 }, { "epoch": 0.8479921162847992, "grad_norm": 1.6275087601770504, "learning_rate": 1.1866721874337827e-06, "loss": 0.743, "step": 5163 }, { "epoch": 0.8481563603514823, "grad_norm": 2.160041236871068, "learning_rate": 1.1841596496545927e-06, "loss": 0.735, "step": 5164 }, { "epoch": 0.8483206044181654, "grad_norm": 1.8452049145348017, "learning_rate": 1.1816496071804662e-06, "loss": 0.7692, "step": 5165 }, { "epoch": 0.8484848484848485, "grad_norm": 1.4464120732329604, "learning_rate": 1.1791420607218673e-06, "loss": 0.7129, "step": 5166 }, { "epoch": 0.8486490925515315, "grad_norm": 2.437399862232661, "learning_rate": 1.1766370109885506e-06, "loss": 0.7764, "step": 5167 }, { "epoch": 0.8488133366182147, "grad_norm": 1.8573518730527747, "learning_rate": 1.1741344586895642e-06, "loss": 0.7567, "step": 5168 }, { "epoch": 0.8489775806848977, "grad_norm": 2.127068125039997, "learning_rate": 1.171634404533254e-06, "loss": 0.7448, "step": 5169 }, { "epoch": 0.8491418247515808, "grad_norm": 0.6648998069163917, "learning_rate": 1.1691368492272504e-06, "loss": 0.3043, "step": 5170 }, { "epoch": 0.849306068818264, "grad_norm": 1.8291247628535368, "learning_rate": 1.1666417934784824e-06, "loss": 0.8044, "step": 5171 }, { "epoch": 0.849470312884947, "grad_norm": 1.7862200906834997, "learning_rate": 1.1641492379931717e-06, "loss": 0.7211, "step": 5172 }, { "epoch": 0.8496345569516301, "grad_norm": 1.4549873312001924, "learning_rate": 1.1616591834768299e-06, "loss": 0.736, "step": 5173 }, { "epoch": 0.8497988010183132, "grad_norm": 1.4677650446446908, "learning_rate": 1.1591716306342615e-06, "loss": 0.7662, "step": 5174 }, { "epoch": 0.8499630450849963, "grad_norm": 1.7207424909064168, "learning_rate": 1.1566865801695649e-06, "loss": 0.7664, "step": 5175 }, { "epoch": 0.8501272891516793, "grad_norm": 2.4023950301979227, "learning_rate": 1.1542040327861259e-06, "loss": 0.7771, "step": 5176 }, { "epoch": 0.8502915332183625, "grad_norm": 3.174728574253007, "learning_rate": 1.1517239891866261e-06, "loss": 0.7797, "step": 5177 }, { "epoch": 0.8504557772850456, "grad_norm": 1.8817536110350492, "learning_rate": 1.149246450073036e-06, "loss": 0.6395, "step": 5178 }, { "epoch": 0.8506200213517287, "grad_norm": 5.9545249857679785, "learning_rate": 1.146771416146617e-06, "loss": 0.7738, "step": 5179 }, { "epoch": 0.8507842654184118, "grad_norm": 1.7795272033007838, "learning_rate": 1.1442988881079243e-06, "loss": 0.7871, "step": 5180 }, { "epoch": 0.8509485094850948, "grad_norm": 1.9364661271256762, "learning_rate": 1.1418288666568e-06, "loss": 0.7699, "step": 5181 }, { "epoch": 0.851112753551778, "grad_norm": 1.9308179953316469, "learning_rate": 1.1393613524923797e-06, "loss": 0.7373, "step": 5182 }, { "epoch": 0.851276997618461, "grad_norm": 2.0914979752022815, "learning_rate": 1.1368963463130866e-06, "loss": 0.7708, "step": 5183 }, { "epoch": 0.8514412416851441, "grad_norm": 3.6573882425465314, "learning_rate": 1.1344338488166362e-06, "loss": 0.7451, "step": 5184 }, { "epoch": 0.8516054857518273, "grad_norm": 2.455238966181254, "learning_rate": 1.1319738607000342e-06, "loss": 0.7271, "step": 5185 }, { "epoch": 0.8517697298185103, "grad_norm": 2.009743876799384, "learning_rate": 1.1295163826595724e-06, "loss": 0.7394, "step": 5186 }, { "epoch": 0.8519339738851934, "grad_norm": 1.770742641854683, "learning_rate": 1.1270614153908376e-06, "loss": 0.7552, "step": 5187 }, { "epoch": 0.8520982179518765, "grad_norm": 1.9613943032924541, "learning_rate": 1.1246089595887023e-06, "loss": 0.6977, "step": 5188 }, { "epoch": 0.8522624620185596, "grad_norm": 1.4281927095098421, "learning_rate": 1.1221590159473294e-06, "loss": 0.6845, "step": 5189 }, { "epoch": 0.8524267060852426, "grad_norm": 1.8043293388148818, "learning_rate": 1.119711585160169e-06, "loss": 0.7467, "step": 5190 }, { "epoch": 0.8525909501519258, "grad_norm": 1.7823295861589956, "learning_rate": 1.117266667919965e-06, "loss": 0.7541, "step": 5191 }, { "epoch": 0.8527551942186089, "grad_norm": 2.671402915586454, "learning_rate": 1.1148242649187403e-06, "loss": 0.7779, "step": 5192 }, { "epoch": 0.8529194382852919, "grad_norm": 1.7787167325714517, "learning_rate": 1.1123843768478148e-06, "loss": 0.7628, "step": 5193 }, { "epoch": 0.8530836823519751, "grad_norm": 1.9032097022031327, "learning_rate": 1.1099470043977933e-06, "loss": 0.7832, "step": 5194 }, { "epoch": 0.8532479264186581, "grad_norm": 2.0147640255149573, "learning_rate": 1.1075121482585704e-06, "loss": 0.7591, "step": 5195 }, { "epoch": 0.8534121704853412, "grad_norm": 1.475124167320132, "learning_rate": 1.1050798091193249e-06, "loss": 0.7761, "step": 5196 }, { "epoch": 0.8535764145520243, "grad_norm": 1.7360874547573826, "learning_rate": 1.1026499876685258e-06, "loss": 0.6357, "step": 5197 }, { "epoch": 0.8537406586187074, "grad_norm": 1.664290906332416, "learning_rate": 1.100222684593929e-06, "loss": 0.7537, "step": 5198 }, { "epoch": 0.8539049026853905, "grad_norm": 2.6291688453964386, "learning_rate": 1.097797900582579e-06, "loss": 0.697, "step": 5199 }, { "epoch": 0.8540691467520736, "grad_norm": 4.613796543643513, "learning_rate": 1.0953756363208034e-06, "loss": 0.6876, "step": 5200 }, { "epoch": 0.8542333908187567, "grad_norm": 1.6569531573875769, "learning_rate": 1.0929558924942217e-06, "loss": 0.8104, "step": 5201 }, { "epoch": 0.8543976348854397, "grad_norm": 1.559003813842868, "learning_rate": 1.090538669787735e-06, "loss": 0.7512, "step": 5202 }, { "epoch": 0.8545618789521229, "grad_norm": 1.754984065505898, "learning_rate": 1.088123968885534e-06, "loss": 0.7615, "step": 5203 }, { "epoch": 0.8547261230188059, "grad_norm": 1.879512400339803, "learning_rate": 1.0857117904710946e-06, "loss": 0.7796, "step": 5204 }, { "epoch": 0.854890367085489, "grad_norm": 1.7960316455701755, "learning_rate": 1.0833021352271788e-06, "loss": 0.6672, "step": 5205 }, { "epoch": 0.8550546111521721, "grad_norm": 1.3922507290583674, "learning_rate": 1.0808950038358357e-06, "loss": 0.6977, "step": 5206 }, { "epoch": 0.8552188552188552, "grad_norm": 2.1398059443053032, "learning_rate": 1.0784903969783966e-06, "loss": 0.6989, "step": 5207 }, { "epoch": 0.8553830992855384, "grad_norm": 1.8571577695469026, "learning_rate": 1.0760883153354818e-06, "loss": 0.7741, "step": 5208 }, { "epoch": 0.8555473433522214, "grad_norm": 2.459654473004281, "learning_rate": 1.0736887595869927e-06, "loss": 0.8246, "step": 5209 }, { "epoch": 0.8557115874189045, "grad_norm": 2.1092767300616613, "learning_rate": 1.071291730412123e-06, "loss": 0.7787, "step": 5210 }, { "epoch": 0.8558758314855875, "grad_norm": 2.1612122476282227, "learning_rate": 1.068897228489344e-06, "loss": 0.7211, "step": 5211 }, { "epoch": 0.8560400755522707, "grad_norm": 2.1895352881205516, "learning_rate": 1.0665052544964184e-06, "loss": 0.8029, "step": 5212 }, { "epoch": 0.8562043196189537, "grad_norm": 2.1652337419698715, "learning_rate": 1.0641158091103832e-06, "loss": 0.7792, "step": 5213 }, { "epoch": 0.8563685636856369, "grad_norm": 1.6584285072372897, "learning_rate": 1.0617288930075697e-06, "loss": 0.7782, "step": 5214 }, { "epoch": 0.85653280775232, "grad_norm": 2.3054883213508397, "learning_rate": 1.0593445068635876e-06, "loss": 0.8255, "step": 5215 }, { "epoch": 0.856697051819003, "grad_norm": 0.6458451316632803, "learning_rate": 1.0569626513533338e-06, "loss": 0.3392, "step": 5216 }, { "epoch": 0.8568612958856862, "grad_norm": 1.505423715222712, "learning_rate": 1.0545833271509864e-06, "loss": 0.6912, "step": 5217 }, { "epoch": 0.8570255399523692, "grad_norm": 0.558592904640358, "learning_rate": 1.0522065349300103e-06, "loss": 0.3422, "step": 5218 }, { "epoch": 0.8571897840190523, "grad_norm": 2.707667918332537, "learning_rate": 1.0498322753631484e-06, "loss": 0.8158, "step": 5219 }, { "epoch": 0.8573540280857354, "grad_norm": 1.7806093712633495, "learning_rate": 1.0474605491224321e-06, "loss": 0.7152, "step": 5220 }, { "epoch": 0.8575182721524185, "grad_norm": 1.4624872051512128, "learning_rate": 1.0450913568791731e-06, "loss": 0.8366, "step": 5221 }, { "epoch": 0.8576825162191016, "grad_norm": 2.2272188299622013, "learning_rate": 1.0427246993039653e-06, "loss": 0.7343, "step": 5222 }, { "epoch": 0.8578467602857847, "grad_norm": 1.504380003493601, "learning_rate": 1.040360577066688e-06, "loss": 0.7263, "step": 5223 }, { "epoch": 0.8580110043524678, "grad_norm": 1.6666363775096988, "learning_rate": 1.0379989908364997e-06, "loss": 0.7202, "step": 5224 }, { "epoch": 0.8581752484191508, "grad_norm": 1.3453479078842725, "learning_rate": 1.0356399412818418e-06, "loss": 0.686, "step": 5225 }, { "epoch": 0.858339492485834, "grad_norm": 1.9069991764856156, "learning_rate": 1.0332834290704397e-06, "loss": 0.7485, "step": 5226 }, { "epoch": 0.858503736552517, "grad_norm": 2.380619806100984, "learning_rate": 1.0309294548692994e-06, "loss": 0.8054, "step": 5227 }, { "epoch": 0.8586679806192001, "grad_norm": 0.6171074484401748, "learning_rate": 1.028578019344706e-06, "loss": 0.3071, "step": 5228 }, { "epoch": 0.8588322246858833, "grad_norm": 1.5588370267359242, "learning_rate": 1.0262291231622313e-06, "loss": 0.7012, "step": 5229 }, { "epoch": 0.8589964687525663, "grad_norm": 1.8432128779240369, "learning_rate": 1.0238827669867235e-06, "loss": 0.6885, "step": 5230 }, { "epoch": 0.8591607128192494, "grad_norm": 2.091074722729243, "learning_rate": 1.0215389514823148e-06, "loss": 0.7077, "step": 5231 }, { "epoch": 0.8593249568859325, "grad_norm": 1.9635164293022025, "learning_rate": 1.0191976773124157e-06, "loss": 0.7042, "step": 5232 }, { "epoch": 0.8594892009526156, "grad_norm": 0.5988372292043603, "learning_rate": 1.0168589451397204e-06, "loss": 0.322, "step": 5233 }, { "epoch": 0.8596534450192986, "grad_norm": 1.8951847700512674, "learning_rate": 1.014522755626205e-06, "loss": 0.8002, "step": 5234 }, { "epoch": 0.8598176890859818, "grad_norm": 1.6578378025890075, "learning_rate": 1.0121891094331172e-06, "loss": 0.7979, "step": 5235 }, { "epoch": 0.8599819331526649, "grad_norm": 1.949983972147512, "learning_rate": 1.009858007220994e-06, "loss": 0.7444, "step": 5236 }, { "epoch": 0.8601461772193479, "grad_norm": 2.0373824374381386, "learning_rate": 1.0075294496496479e-06, "loss": 0.8798, "step": 5237 }, { "epoch": 0.8603104212860311, "grad_norm": 2.5987276820185072, "learning_rate": 1.0052034373781716e-06, "loss": 0.804, "step": 5238 }, { "epoch": 0.8604746653527141, "grad_norm": 1.869742898847103, "learning_rate": 1.0028799710649406e-06, "loss": 0.7334, "step": 5239 }, { "epoch": 0.8606389094193972, "grad_norm": 1.9921993321457647, "learning_rate": 1.0005590513676045e-06, "loss": 0.7397, "step": 5240 }, { "epoch": 0.8608031534860803, "grad_norm": 0.6050323408030773, "learning_rate": 9.982406789430964e-07, "loss": 0.2993, "step": 5241 }, { "epoch": 0.8609673975527634, "grad_norm": 2.4660295285248153, "learning_rate": 9.959248544476252e-07, "loss": 0.7019, "step": 5242 }, { "epoch": 0.8611316416194466, "grad_norm": 2.0363312643599336, "learning_rate": 9.936115785366817e-07, "loss": 0.6938, "step": 5243 }, { "epoch": 0.8612958856861296, "grad_norm": 2.4546427540481965, "learning_rate": 9.913008518650336e-07, "loss": 0.7454, "step": 5244 }, { "epoch": 0.8614601297528127, "grad_norm": 1.769901266067752, "learning_rate": 9.889926750867251e-07, "loss": 0.7958, "step": 5245 }, { "epoch": 0.8616243738194957, "grad_norm": 1.5681190587730462, "learning_rate": 9.866870488550838e-07, "loss": 0.7161, "step": 5246 }, { "epoch": 0.8617886178861789, "grad_norm": 1.916074499540957, "learning_rate": 9.843839738227113e-07, "loss": 0.7167, "step": 5247 }, { "epoch": 0.8619528619528619, "grad_norm": 1.8220299083044498, "learning_rate": 9.820834506414866e-07, "loss": 0.654, "step": 5248 }, { "epoch": 0.862117106019545, "grad_norm": 1.9500233151359159, "learning_rate": 9.797854799625706e-07, "loss": 0.8259, "step": 5249 }, { "epoch": 0.8622813500862281, "grad_norm": 1.7802814672760634, "learning_rate": 9.77490062436396e-07, "loss": 0.7339, "step": 5250 }, { "epoch": 0.8624455941529112, "grad_norm": 0.551808351140802, "learning_rate": 9.751971987126785e-07, "loss": 0.3032, "step": 5251 }, { "epoch": 0.8626098382195944, "grad_norm": 1.8520121959879725, "learning_rate": 9.72906889440407e-07, "loss": 0.7444, "step": 5252 }, { "epoch": 0.8627740822862774, "grad_norm": 1.3240979703134936, "learning_rate": 9.706191352678495e-07, "loss": 0.7721, "step": 5253 }, { "epoch": 0.8629383263529605, "grad_norm": 1.9358962611857269, "learning_rate": 9.683339368425494e-07, "loss": 0.7007, "step": 5254 }, { "epoch": 0.8631025704196436, "grad_norm": 2.088407299105056, "learning_rate": 9.660512948113276e-07, "loss": 0.7118, "step": 5255 }, { "epoch": 0.8632668144863267, "grad_norm": 1.580762630608179, "learning_rate": 9.637712098202812e-07, "loss": 0.7995, "step": 5256 }, { "epoch": 0.8634310585530097, "grad_norm": 4.7842340294322, "learning_rate": 9.614936825147836e-07, "loss": 0.6647, "step": 5257 }, { "epoch": 0.8635953026196929, "grad_norm": 1.6187622221760065, "learning_rate": 9.592187135394826e-07, "loss": 0.7886, "step": 5258 }, { "epoch": 0.863759546686376, "grad_norm": 1.6759388210240271, "learning_rate": 9.569463035383054e-07, "loss": 0.7035, "step": 5259 }, { "epoch": 0.863923790753059, "grad_norm": 1.762774157129548, "learning_rate": 9.546764531544528e-07, "loss": 0.7037, "step": 5260 }, { "epoch": 0.8640880348197422, "grad_norm": 2.109001037897281, "learning_rate": 9.524091630304e-07, "loss": 0.7043, "step": 5261 }, { "epoch": 0.8642522788864252, "grad_norm": 1.6406933421261343, "learning_rate": 9.50144433807898e-07, "loss": 0.7589, "step": 5262 }, { "epoch": 0.8644165229531083, "grad_norm": 1.8185890960730844, "learning_rate": 9.478822661279763e-07, "loss": 0.8251, "step": 5263 }, { "epoch": 0.8645807670197914, "grad_norm": 1.5990841571385261, "learning_rate": 9.456226606309338e-07, "loss": 0.7751, "step": 5264 }, { "epoch": 0.8647450110864745, "grad_norm": 1.7946861950618302, "learning_rate": 9.4336561795635e-07, "loss": 0.7316, "step": 5265 }, { "epoch": 0.8649092551531576, "grad_norm": 2.362351177655277, "learning_rate": 9.411111387430738e-07, "loss": 0.7142, "step": 5266 }, { "epoch": 0.8650734992198407, "grad_norm": 2.123926341021256, "learning_rate": 9.388592236292316e-07, "loss": 0.7136, "step": 5267 }, { "epoch": 0.8652377432865238, "grad_norm": 4.315077342615549, "learning_rate": 9.366098732522233e-07, "loss": 0.7404, "step": 5268 }, { "epoch": 0.8654019873532068, "grad_norm": 2.085155323623697, "learning_rate": 9.343630882487221e-07, "loss": 0.7718, "step": 5269 }, { "epoch": 0.86556623141989, "grad_norm": 1.8987288865091585, "learning_rate": 9.321188692546767e-07, "loss": 0.7111, "step": 5270 }, { "epoch": 0.865730475486573, "grad_norm": 1.804213249333409, "learning_rate": 9.298772169053083e-07, "loss": 0.7245, "step": 5271 }, { "epoch": 0.8658947195532561, "grad_norm": 2.1883573762431916, "learning_rate": 9.276381318351124e-07, "loss": 0.7878, "step": 5272 }, { "epoch": 0.8660589636199393, "grad_norm": 2.039138827495066, "learning_rate": 9.254016146778555e-07, "loss": 0.7314, "step": 5273 }, { "epoch": 0.8662232076866223, "grad_norm": 1.7731092059726181, "learning_rate": 9.231676660665812e-07, "loss": 0.74, "step": 5274 }, { "epoch": 0.8663874517533054, "grad_norm": 1.4623782970367116, "learning_rate": 9.209362866336024e-07, "loss": 0.7487, "step": 5275 }, { "epoch": 0.8665516958199885, "grad_norm": 2.171025286462251, "learning_rate": 9.187074770105076e-07, "loss": 0.7792, "step": 5276 }, { "epoch": 0.8667159398866716, "grad_norm": 1.6760025778459149, "learning_rate": 9.164812378281562e-07, "loss": 0.7147, "step": 5277 }, { "epoch": 0.8668801839533546, "grad_norm": 6.831200686455394, "learning_rate": 9.1425756971668e-07, "loss": 0.706, "step": 5278 }, { "epoch": 0.8670444280200378, "grad_norm": 0.6440600591179, "learning_rate": 9.120364733054843e-07, "loss": 0.3205, "step": 5279 }, { "epoch": 0.8672086720867209, "grad_norm": 0.6266167124386139, "learning_rate": 9.098179492232451e-07, "loss": 0.3065, "step": 5280 }, { "epoch": 0.867372916153404, "grad_norm": 1.573263506353137, "learning_rate": 9.076019980979111e-07, "loss": 0.7103, "step": 5281 }, { "epoch": 0.8675371602200871, "grad_norm": 2.3140313102336894, "learning_rate": 9.05388620556703e-07, "loss": 0.6474, "step": 5282 }, { "epoch": 0.8677014042867701, "grad_norm": 4.37369588301392, "learning_rate": 9.03177817226113e-07, "loss": 0.7354, "step": 5283 }, { "epoch": 0.8678656483534533, "grad_norm": 1.9641219497293383, "learning_rate": 9.009695887319026e-07, "loss": 0.7561, "step": 5284 }, { "epoch": 0.8680298924201363, "grad_norm": 1.6226384427922649, "learning_rate": 8.987639356991085e-07, "loss": 0.743, "step": 5285 }, { "epoch": 0.8681941364868194, "grad_norm": 2.0907390647649464, "learning_rate": 8.965608587520347e-07, "loss": 0.6856, "step": 5286 }, { "epoch": 0.8683583805535025, "grad_norm": 1.910619208654613, "learning_rate": 8.94360358514258e-07, "loss": 0.7006, "step": 5287 }, { "epoch": 0.8685226246201856, "grad_norm": 1.6040594160623267, "learning_rate": 8.921624356086256e-07, "loss": 0.6803, "step": 5288 }, { "epoch": 0.8686868686868687, "grad_norm": 1.568500131940471, "learning_rate": 8.899670906572544e-07, "loss": 0.691, "step": 5289 }, { "epoch": 0.8688511127535518, "grad_norm": 1.9455358822499336, "learning_rate": 8.877743242815318e-07, "loss": 0.7317, "step": 5290 }, { "epoch": 0.8690153568202349, "grad_norm": 3.068488838481364, "learning_rate": 8.855841371021168e-07, "loss": 0.6601, "step": 5291 }, { "epoch": 0.8691796008869179, "grad_norm": 1.7057552640132796, "learning_rate": 8.833965297389368e-07, "loss": 0.7719, "step": 5292 }, { "epoch": 0.8693438449536011, "grad_norm": 1.815991693887317, "learning_rate": 8.81211502811189e-07, "loss": 0.7414, "step": 5293 }, { "epoch": 0.8695080890202841, "grad_norm": 2.2149242216711844, "learning_rate": 8.790290569373416e-07, "loss": 0.6977, "step": 5294 }, { "epoch": 0.8696723330869672, "grad_norm": 1.5284107057916954, "learning_rate": 8.768491927351308e-07, "loss": 0.7962, "step": 5295 }, { "epoch": 0.8698365771536504, "grad_norm": 2.8042776557335185, "learning_rate": 8.746719108215617e-07, "loss": 0.7138, "step": 5296 }, { "epoch": 0.8700008212203334, "grad_norm": 1.7813723284919896, "learning_rate": 8.724972118129116e-07, "loss": 0.6197, "step": 5297 }, { "epoch": 0.8701650652870165, "grad_norm": 2.0723239318912854, "learning_rate": 8.703250963247223e-07, "loss": 0.7216, "step": 5298 }, { "epoch": 0.8703293093536996, "grad_norm": 1.6849130834444206, "learning_rate": 8.681555649718076e-07, "loss": 0.7303, "step": 5299 }, { "epoch": 0.8704935534203827, "grad_norm": 1.6274016427871691, "learning_rate": 8.659886183682475e-07, "loss": 0.7635, "step": 5300 }, { "epoch": 0.8706577974870657, "grad_norm": 2.1411501076725328, "learning_rate": 8.638242571273935e-07, "loss": 0.7057, "step": 5301 }, { "epoch": 0.8708220415537489, "grad_norm": 2.752551257382407, "learning_rate": 8.616624818618635e-07, "loss": 0.7072, "step": 5302 }, { "epoch": 0.870986285620432, "grad_norm": 1.9434235645049098, "learning_rate": 8.595032931835423e-07, "loss": 0.7566, "step": 5303 }, { "epoch": 0.871150529687115, "grad_norm": 2.0255501904786533, "learning_rate": 8.573466917035834e-07, "loss": 0.7085, "step": 5304 }, { "epoch": 0.8713147737537982, "grad_norm": 1.6666693700513375, "learning_rate": 8.551926780324094e-07, "loss": 0.7801, "step": 5305 }, { "epoch": 0.8714790178204812, "grad_norm": 3.317591593515169, "learning_rate": 8.530412527797083e-07, "loss": 0.722, "step": 5306 }, { "epoch": 0.8716432618871643, "grad_norm": 1.7750343106519326, "learning_rate": 8.508924165544375e-07, "loss": 0.751, "step": 5307 }, { "epoch": 0.8718075059538474, "grad_norm": 1.8387151425955683, "learning_rate": 8.487461699648203e-07, "loss": 0.7083, "step": 5308 }, { "epoch": 0.8719717500205305, "grad_norm": 1.985450923603415, "learning_rate": 8.466025136183476e-07, "loss": 0.6663, "step": 5309 }, { "epoch": 0.8721359940872137, "grad_norm": 1.503671601968062, "learning_rate": 8.44461448121775e-07, "loss": 0.7851, "step": 5310 }, { "epoch": 0.8723002381538967, "grad_norm": 2.4136965139560447, "learning_rate": 8.42322974081129e-07, "loss": 0.7212, "step": 5311 }, { "epoch": 0.8724644822205798, "grad_norm": 1.7981175041776987, "learning_rate": 8.401870921016996e-07, "loss": 0.7673, "step": 5312 }, { "epoch": 0.8726287262872628, "grad_norm": 2.9564851297070636, "learning_rate": 8.380538027880425e-07, "loss": 0.7189, "step": 5313 }, { "epoch": 0.872792970353946, "grad_norm": 1.6227521697827143, "learning_rate": 8.35923106743981e-07, "loss": 0.6982, "step": 5314 }, { "epoch": 0.872957214420629, "grad_norm": 2.2072511391667096, "learning_rate": 8.337950045726051e-07, "loss": 0.6969, "step": 5315 }, { "epoch": 0.8731214584873122, "grad_norm": 2.0207525973105747, "learning_rate": 8.316694968762696e-07, "loss": 0.6659, "step": 5316 }, { "epoch": 0.8732857025539953, "grad_norm": 1.605529557054922, "learning_rate": 8.295465842565942e-07, "loss": 0.757, "step": 5317 }, { "epoch": 0.8734499466206783, "grad_norm": 2.3088375232223206, "learning_rate": 8.274262673144651e-07, "loss": 0.6154, "step": 5318 }, { "epoch": 0.8736141906873615, "grad_norm": 2.06370389920942, "learning_rate": 8.253085466500332e-07, "loss": 0.6951, "step": 5319 }, { "epoch": 0.8737784347540445, "grad_norm": 3.6221786613002935, "learning_rate": 8.231934228627158e-07, "loss": 0.7458, "step": 5320 }, { "epoch": 0.8739426788207276, "grad_norm": 1.782576727584207, "learning_rate": 8.210808965511941e-07, "loss": 0.7688, "step": 5321 }, { "epoch": 0.8741069228874107, "grad_norm": 1.8805421127046553, "learning_rate": 8.189709683134139e-07, "loss": 0.778, "step": 5322 }, { "epoch": 0.8742711669540938, "grad_norm": 1.8887319204774775, "learning_rate": 8.168636387465856e-07, "loss": 0.7413, "step": 5323 }, { "epoch": 0.8744354110207768, "grad_norm": 1.6194889019102068, "learning_rate": 8.147589084471851e-07, "loss": 0.6226, "step": 5324 }, { "epoch": 0.87459965508746, "grad_norm": 1.7599415172674926, "learning_rate": 8.126567780109506e-07, "loss": 0.7036, "step": 5325 }, { "epoch": 0.8747638991541431, "grad_norm": 1.9212822487454662, "learning_rate": 8.105572480328872e-07, "loss": 0.7904, "step": 5326 }, { "epoch": 0.8749281432208261, "grad_norm": 1.4529092084176425, "learning_rate": 8.084603191072615e-07, "loss": 0.8167, "step": 5327 }, { "epoch": 0.8750923872875093, "grad_norm": 1.4991529388555835, "learning_rate": 8.063659918276056e-07, "loss": 0.6468, "step": 5328 }, { "epoch": 0.8752566313541923, "grad_norm": 1.772004875439879, "learning_rate": 8.042742667867143e-07, "loss": 0.737, "step": 5329 }, { "epoch": 0.8754208754208754, "grad_norm": 2.553881551245324, "learning_rate": 8.02185144576646e-07, "loss": 0.6607, "step": 5330 }, { "epoch": 0.8755851194875585, "grad_norm": 2.3430808507636516, "learning_rate": 8.000986257887211e-07, "loss": 0.7205, "step": 5331 }, { "epoch": 0.8757493635542416, "grad_norm": 1.732869018444681, "learning_rate": 7.98014711013525e-07, "loss": 0.7864, "step": 5332 }, { "epoch": 0.8759136076209247, "grad_norm": 0.6209522041970075, "learning_rate": 7.95933400840907e-07, "loss": 0.3422, "step": 5333 }, { "epoch": 0.8760778516876078, "grad_norm": 2.1920914788063905, "learning_rate": 7.938546958599747e-07, "loss": 0.8009, "step": 5334 }, { "epoch": 0.8762420957542909, "grad_norm": 3.6238203019970054, "learning_rate": 7.91778596659104e-07, "loss": 0.7044, "step": 5335 }, { "epoch": 0.8764063398209739, "grad_norm": 1.4481317691306457, "learning_rate": 7.897051038259285e-07, "loss": 0.7629, "step": 5336 }, { "epoch": 0.8765705838876571, "grad_norm": 1.2414169440471803, "learning_rate": 7.876342179473473e-07, "loss": 0.7559, "step": 5337 }, { "epoch": 0.8767348279543401, "grad_norm": 1.7311893408882693, "learning_rate": 7.855659396095183e-07, "loss": 0.7426, "step": 5338 }, { "epoch": 0.8768990720210232, "grad_norm": 2.1248690860877026, "learning_rate": 7.835002693978655e-07, "loss": 0.6735, "step": 5339 }, { "epoch": 0.8770633160877064, "grad_norm": 2.1700925557807444, "learning_rate": 7.814372078970711e-07, "loss": 0.7028, "step": 5340 }, { "epoch": 0.8772275601543894, "grad_norm": 1.8362959272389676, "learning_rate": 7.793767556910814e-07, "loss": 0.6795, "step": 5341 }, { "epoch": 0.8773918042210725, "grad_norm": 1.8883075410179946, "learning_rate": 7.773189133631032e-07, "loss": 0.7233, "step": 5342 }, { "epoch": 0.8775560482877556, "grad_norm": 1.7249999397239564, "learning_rate": 7.752636814956027e-07, "loss": 0.6903, "step": 5343 }, { "epoch": 0.8777202923544387, "grad_norm": 1.6394319554695809, "learning_rate": 7.732110606703103e-07, "loss": 0.7584, "step": 5344 }, { "epoch": 0.8778845364211217, "grad_norm": 1.5459441330433696, "learning_rate": 7.711610514682155e-07, "loss": 0.812, "step": 5345 }, { "epoch": 0.8780487804878049, "grad_norm": 2.445711992711607, "learning_rate": 7.691136544695699e-07, "loss": 0.7471, "step": 5346 }, { "epoch": 0.878213024554488, "grad_norm": 2.61474921113742, "learning_rate": 7.670688702538831e-07, "loss": 0.7559, "step": 5347 }, { "epoch": 0.878377268621171, "grad_norm": 1.9230192778896642, "learning_rate": 7.6502669939993e-07, "loss": 0.7709, "step": 5348 }, { "epoch": 0.8785415126878542, "grad_norm": 2.2921563953231603, "learning_rate": 7.629871424857394e-07, "loss": 0.7246, "step": 5349 }, { "epoch": 0.8787057567545372, "grad_norm": 1.9297455696122177, "learning_rate": 7.60950200088606e-07, "loss": 0.7104, "step": 5350 }, { "epoch": 0.8788700008212204, "grad_norm": 2.5965577422759085, "learning_rate": 7.58915872785081e-07, "loss": 0.6926, "step": 5351 }, { "epoch": 0.8790342448879034, "grad_norm": 1.6671040491898494, "learning_rate": 7.56884161150977e-07, "loss": 0.7767, "step": 5352 }, { "epoch": 0.8791984889545865, "grad_norm": 2.3690756048816772, "learning_rate": 7.548550657613651e-07, "loss": 0.7574, "step": 5353 }, { "epoch": 0.8793627330212697, "grad_norm": 1.7210426229358466, "learning_rate": 7.52828587190576e-07, "loss": 0.7181, "step": 5354 }, { "epoch": 0.8795269770879527, "grad_norm": 1.3165649526842296, "learning_rate": 7.50804726012202e-07, "loss": 0.77, "step": 5355 }, { "epoch": 0.8796912211546358, "grad_norm": 1.8935186603968979, "learning_rate": 7.487834827990915e-07, "loss": 0.6352, "step": 5356 }, { "epoch": 0.8798554652213189, "grad_norm": 1.5424408150513702, "learning_rate": 7.467648581233533e-07, "loss": 0.6867, "step": 5357 }, { "epoch": 0.880019709288002, "grad_norm": 1.574682553599923, "learning_rate": 7.447488525563551e-07, "loss": 0.7165, "step": 5358 }, { "epoch": 0.880183953354685, "grad_norm": 4.191784920057722, "learning_rate": 7.427354666687226e-07, "loss": 0.7894, "step": 5359 }, { "epoch": 0.8803481974213682, "grad_norm": 1.7321865994592702, "learning_rate": 7.407247010303409e-07, "loss": 0.7291, "step": 5360 }, { "epoch": 0.8805124414880512, "grad_norm": 2.213640105687813, "learning_rate": 7.387165562103526e-07, "loss": 0.7723, "step": 5361 }, { "epoch": 0.8806766855547343, "grad_norm": 1.5646661764277467, "learning_rate": 7.367110327771587e-07, "loss": 0.7908, "step": 5362 }, { "epoch": 0.8808409296214175, "grad_norm": 0.6114956868044883, "learning_rate": 7.347081312984194e-07, "loss": 0.3647, "step": 5363 }, { "epoch": 0.8810051736881005, "grad_norm": 1.5596820814765993, "learning_rate": 7.327078523410525e-07, "loss": 0.8426, "step": 5364 }, { "epoch": 0.8811694177547836, "grad_norm": 1.5492391874218763, "learning_rate": 7.307101964712271e-07, "loss": 0.7445, "step": 5365 }, { "epoch": 0.8813336618214667, "grad_norm": 1.6633267663762419, "learning_rate": 7.287151642543822e-07, "loss": 0.7745, "step": 5366 }, { "epoch": 0.8814979058881498, "grad_norm": 1.7296697648014663, "learning_rate": 7.267227562552048e-07, "loss": 0.7682, "step": 5367 }, { "epoch": 0.8816621499548328, "grad_norm": 1.8287608673371682, "learning_rate": 7.247329730376429e-07, "loss": 0.7447, "step": 5368 }, { "epoch": 0.881826394021516, "grad_norm": 1.5503882475828425, "learning_rate": 7.227458151648992e-07, "loss": 0.7105, "step": 5369 }, { "epoch": 0.8819906380881991, "grad_norm": 1.8152312368861012, "learning_rate": 7.207612831994337e-07, "loss": 0.7916, "step": 5370 }, { "epoch": 0.8821548821548821, "grad_norm": 1.920050410082669, "learning_rate": 7.187793777029661e-07, "loss": 0.7106, "step": 5371 }, { "epoch": 0.8823191262215653, "grad_norm": 1.6068975203146778, "learning_rate": 7.168000992364699e-07, "loss": 0.7645, "step": 5372 }, { "epoch": 0.8824833702882483, "grad_norm": 1.705934650308344, "learning_rate": 7.148234483601746e-07, "loss": 0.7695, "step": 5373 }, { "epoch": 0.8826476143549314, "grad_norm": 1.367360477924355, "learning_rate": 7.128494256335694e-07, "loss": 0.7155, "step": 5374 }, { "epoch": 0.8828118584216145, "grad_norm": 1.8607903245105255, "learning_rate": 7.108780316153951e-07, "loss": 0.8014, "step": 5375 }, { "epoch": 0.8829761024882976, "grad_norm": 1.8803941360224548, "learning_rate": 7.08909266863651e-07, "loss": 0.7178, "step": 5376 }, { "epoch": 0.8831403465549807, "grad_norm": 2.9639904947686646, "learning_rate": 7.069431319355924e-07, "loss": 0.6826, "step": 5377 }, { "epoch": 0.8833045906216638, "grad_norm": 1.7884610719247505, "learning_rate": 7.049796273877297e-07, "loss": 0.7053, "step": 5378 }, { "epoch": 0.8834688346883469, "grad_norm": 1.8069963928202857, "learning_rate": 7.030187537758282e-07, "loss": 0.8189, "step": 5379 }, { "epoch": 0.8836330787550299, "grad_norm": 1.897802079188762, "learning_rate": 7.010605116549085e-07, "loss": 0.7179, "step": 5380 }, { "epoch": 0.8837973228217131, "grad_norm": 1.76043554144987, "learning_rate": 6.991049015792483e-07, "loss": 0.7453, "step": 5381 }, { "epoch": 0.8839615668883961, "grad_norm": 2.0173735316384276, "learning_rate": 6.971519241023795e-07, "loss": 0.7986, "step": 5382 }, { "epoch": 0.8841258109550792, "grad_norm": 2.7575693130217926, "learning_rate": 6.952015797770862e-07, "loss": 0.768, "step": 5383 }, { "epoch": 0.8842900550217624, "grad_norm": 1.4449290396334236, "learning_rate": 6.932538691554103e-07, "loss": 0.7331, "step": 5384 }, { "epoch": 0.8844542990884454, "grad_norm": 1.5163946154437162, "learning_rate": 6.913087927886464e-07, "loss": 0.7537, "step": 5385 }, { "epoch": 0.8846185431551286, "grad_norm": 1.6144358194075574, "learning_rate": 6.893663512273474e-07, "loss": 0.739, "step": 5386 }, { "epoch": 0.8847827872218116, "grad_norm": 1.841591821295999, "learning_rate": 6.874265450213124e-07, "loss": 0.7014, "step": 5387 }, { "epoch": 0.8849470312884947, "grad_norm": 2.055743618399857, "learning_rate": 6.854893747196034e-07, "loss": 0.7742, "step": 5388 }, { "epoch": 0.8851112753551778, "grad_norm": 2.174039369450636, "learning_rate": 6.835548408705284e-07, "loss": 0.7179, "step": 5389 }, { "epoch": 0.8852755194218609, "grad_norm": 2.0342845687908606, "learning_rate": 6.816229440216571e-07, "loss": 0.6854, "step": 5390 }, { "epoch": 0.885439763488544, "grad_norm": 1.3656333859918777, "learning_rate": 6.796936847198066e-07, "loss": 0.7403, "step": 5391 }, { "epoch": 0.8856040075552271, "grad_norm": 2.532254251169939, "learning_rate": 6.777670635110523e-07, "loss": 0.7643, "step": 5392 }, { "epoch": 0.8857682516219102, "grad_norm": 2.585810648212127, "learning_rate": 6.758430809407169e-07, "loss": 0.71, "step": 5393 }, { "epoch": 0.8859324956885932, "grad_norm": 3.07806257874251, "learning_rate": 6.739217375533813e-07, "loss": 0.7483, "step": 5394 }, { "epoch": 0.8860967397552764, "grad_norm": 1.5176706949006211, "learning_rate": 6.720030338928785e-07, "loss": 0.7474, "step": 5395 }, { "epoch": 0.8862609838219594, "grad_norm": 1.7271667159509934, "learning_rate": 6.700869705022916e-07, "loss": 0.7399, "step": 5396 }, { "epoch": 0.8864252278886425, "grad_norm": 1.7230420802334299, "learning_rate": 6.681735479239593e-07, "loss": 0.7743, "step": 5397 }, { "epoch": 0.8865894719553256, "grad_norm": 1.70884240466498, "learning_rate": 6.662627666994725e-07, "loss": 0.7811, "step": 5398 }, { "epoch": 0.8867537160220087, "grad_norm": 1.9752711827856233, "learning_rate": 6.643546273696732e-07, "loss": 0.7405, "step": 5399 }, { "epoch": 0.8869179600886918, "grad_norm": 1.5257772404756045, "learning_rate": 6.624491304746561e-07, "loss": 0.7449, "step": 5400 }, { "epoch": 0.8870822041553749, "grad_norm": 1.7949969432127757, "learning_rate": 6.605462765537674e-07, "loss": 0.7585, "step": 5401 }, { "epoch": 0.887246448222058, "grad_norm": 1.7655910913438049, "learning_rate": 6.586460661456074e-07, "loss": 0.6861, "step": 5402 }, { "epoch": 0.887410692288741, "grad_norm": 1.6240706002002498, "learning_rate": 6.567484997880247e-07, "loss": 0.7205, "step": 5403 }, { "epoch": 0.8875749363554242, "grad_norm": 2.0987165510615813, "learning_rate": 6.548535780181242e-07, "loss": 0.6974, "step": 5404 }, { "epoch": 0.8877391804221072, "grad_norm": 1.5935588036340032, "learning_rate": 6.529613013722568e-07, "loss": 0.6953, "step": 5405 }, { "epoch": 0.8879034244887903, "grad_norm": 1.9208611851755266, "learning_rate": 6.510716703860298e-07, "loss": 0.7659, "step": 5406 }, { "epoch": 0.8880676685554735, "grad_norm": 2.1037656510460137, "learning_rate": 6.491846855942984e-07, "loss": 0.7083, "step": 5407 }, { "epoch": 0.8882319126221565, "grad_norm": 3.097306039210947, "learning_rate": 6.47300347531169e-07, "loss": 0.7143, "step": 5408 }, { "epoch": 0.8883961566888396, "grad_norm": 1.6128305821090987, "learning_rate": 6.454186567299992e-07, "loss": 0.7409, "step": 5409 }, { "epoch": 0.8885604007555227, "grad_norm": 1.5641614449828636, "learning_rate": 6.435396137233985e-07, "loss": 0.8167, "step": 5410 }, { "epoch": 0.8887246448222058, "grad_norm": 2.7119339558782825, "learning_rate": 6.416632190432259e-07, "loss": 0.6666, "step": 5411 }, { "epoch": 0.8888888888888888, "grad_norm": 2.3319128043609463, "learning_rate": 6.397894732205889e-07, "loss": 0.7386, "step": 5412 }, { "epoch": 0.889053132955572, "grad_norm": 1.826078863169232, "learning_rate": 6.37918376785851e-07, "loss": 0.7269, "step": 5413 }, { "epoch": 0.8892173770222551, "grad_norm": 1.4723280357626174, "learning_rate": 6.360499302686207e-07, "loss": 0.7257, "step": 5414 }, { "epoch": 0.8893816210889381, "grad_norm": 1.8990914904145095, "learning_rate": 6.341841341977584e-07, "loss": 0.7549, "step": 5415 }, { "epoch": 0.8895458651556213, "grad_norm": 1.700740114344413, "learning_rate": 6.323209891013715e-07, "loss": 0.772, "step": 5416 }, { "epoch": 0.8897101092223043, "grad_norm": 0.6377051262420534, "learning_rate": 6.304604955068216e-07, "loss": 0.3074, "step": 5417 }, { "epoch": 0.8898743532889875, "grad_norm": 1.8614863064701395, "learning_rate": 6.286026539407164e-07, "loss": 0.6934, "step": 5418 }, { "epoch": 0.8900385973556705, "grad_norm": 1.6376164518104812, "learning_rate": 6.267474649289152e-07, "loss": 0.7456, "step": 5419 }, { "epoch": 0.8902028414223536, "grad_norm": 1.689428038162055, "learning_rate": 6.248949289965234e-07, "loss": 0.7484, "step": 5420 }, { "epoch": 0.8903670854890368, "grad_norm": 1.6970160150756757, "learning_rate": 6.230450466678995e-07, "loss": 0.7674, "step": 5421 }, { "epoch": 0.8905313295557198, "grad_norm": 1.8362985148186592, "learning_rate": 6.211978184666468e-07, "loss": 0.7508, "step": 5422 }, { "epoch": 0.8906955736224029, "grad_norm": 1.6045109575329428, "learning_rate": 6.193532449156203e-07, "loss": 0.7085, "step": 5423 }, { "epoch": 0.890859817689086, "grad_norm": 2.9310882364066067, "learning_rate": 6.175113265369237e-07, "loss": 0.7305, "step": 5424 }, { "epoch": 0.8910240617557691, "grad_norm": 1.6823567147352765, "learning_rate": 6.156720638519054e-07, "loss": 0.7367, "step": 5425 }, { "epoch": 0.8911883058224521, "grad_norm": 1.813812055821615, "learning_rate": 6.138354573811678e-07, "loss": 0.7524, "step": 5426 }, { "epoch": 0.8913525498891353, "grad_norm": 1.6452985187329976, "learning_rate": 6.120015076445573e-07, "loss": 0.7774, "step": 5427 }, { "epoch": 0.8915167939558184, "grad_norm": 1.6010630512175186, "learning_rate": 6.101702151611688e-07, "loss": 0.692, "step": 5428 }, { "epoch": 0.8916810380225014, "grad_norm": 2.9122180808055202, "learning_rate": 6.083415804493487e-07, "loss": 0.7999, "step": 5429 }, { "epoch": 0.8918452820891846, "grad_norm": 1.8363016453259207, "learning_rate": 6.06515604026684e-07, "loss": 0.7786, "step": 5430 }, { "epoch": 0.8920095261558676, "grad_norm": 2.0760125852855227, "learning_rate": 6.046922864100158e-07, "loss": 0.7392, "step": 5431 }, { "epoch": 0.8921737702225507, "grad_norm": 2.44489317373274, "learning_rate": 6.02871628115429e-07, "loss": 0.7473, "step": 5432 }, { "epoch": 0.8923380142892338, "grad_norm": 2.3043637292841708, "learning_rate": 6.010536296582592e-07, "loss": 0.7315, "step": 5433 }, { "epoch": 0.8925022583559169, "grad_norm": 1.5263610530742089, "learning_rate": 5.992382915530848e-07, "loss": 0.7744, "step": 5434 }, { "epoch": 0.8926665024226, "grad_norm": 1.4805985035838822, "learning_rate": 5.974256143137335e-07, "loss": 0.698, "step": 5435 }, { "epoch": 0.8928307464892831, "grad_norm": 6.306712363691309, "learning_rate": 5.95615598453283e-07, "loss": 0.6922, "step": 5436 }, { "epoch": 0.8929949905559662, "grad_norm": 1.7302125334191576, "learning_rate": 5.938082444840521e-07, "loss": 0.6602, "step": 5437 }, { "epoch": 0.8931592346226492, "grad_norm": 4.656399901500599, "learning_rate": 5.920035529176082e-07, "loss": 0.666, "step": 5438 }, { "epoch": 0.8933234786893324, "grad_norm": 2.312092738759966, "learning_rate": 5.902015242647651e-07, "loss": 0.6538, "step": 5439 }, { "epoch": 0.8934877227560154, "grad_norm": 1.690513470279837, "learning_rate": 5.884021590355859e-07, "loss": 0.7357, "step": 5440 }, { "epoch": 0.8936519668226985, "grad_norm": 1.769885946184022, "learning_rate": 5.866054577393742e-07, "loss": 0.736, "step": 5441 }, { "epoch": 0.8938162108893816, "grad_norm": 2.141980659622685, "learning_rate": 5.848114208846834e-07, "loss": 0.7725, "step": 5442 }, { "epoch": 0.8939804549560647, "grad_norm": 1.6922680425793843, "learning_rate": 5.830200489793136e-07, "loss": 0.8327, "step": 5443 }, { "epoch": 0.8941446990227478, "grad_norm": 1.5206880649182026, "learning_rate": 5.812313425303062e-07, "loss": 0.7793, "step": 5444 }, { "epoch": 0.8943089430894309, "grad_norm": 3.2182738810805183, "learning_rate": 5.794453020439517e-07, "loss": 0.8478, "step": 5445 }, { "epoch": 0.894473187156114, "grad_norm": 2.6734629903510374, "learning_rate": 5.776619280257855e-07, "loss": 0.716, "step": 5446 }, { "epoch": 0.894637431222797, "grad_norm": 1.478851267590568, "learning_rate": 5.758812209805887e-07, "loss": 0.7794, "step": 5447 }, { "epoch": 0.8948016752894802, "grad_norm": 1.4872767755891352, "learning_rate": 5.741031814123843e-07, "loss": 0.6575, "step": 5448 }, { "epoch": 0.8949659193561632, "grad_norm": 1.7582972476195968, "learning_rate": 5.723278098244455e-07, "loss": 0.7426, "step": 5449 }, { "epoch": 0.8951301634228463, "grad_norm": 1.6239444118433701, "learning_rate": 5.705551067192871e-07, "loss": 0.7574, "step": 5450 }, { "epoch": 0.8952944074895295, "grad_norm": 2.1665870841612898, "learning_rate": 5.687850725986654e-07, "loss": 0.7406, "step": 5451 }, { "epoch": 0.8954586515562125, "grad_norm": 1.5584274113641536, "learning_rate": 5.670177079635886e-07, "loss": 0.7729, "step": 5452 }, { "epoch": 0.8956228956228957, "grad_norm": 1.748294744319357, "learning_rate": 5.652530133143042e-07, "loss": 0.7376, "step": 5453 }, { "epoch": 0.8957871396895787, "grad_norm": 1.481631489775615, "learning_rate": 5.634909891503048e-07, "loss": 0.7286, "step": 5454 }, { "epoch": 0.8959513837562618, "grad_norm": 1.617778937814924, "learning_rate": 5.617316359703284e-07, "loss": 0.7484, "step": 5455 }, { "epoch": 0.8961156278229448, "grad_norm": 1.6991552595166703, "learning_rate": 5.599749542723565e-07, "loss": 0.7345, "step": 5456 }, { "epoch": 0.896279871889628, "grad_norm": 1.637834217382041, "learning_rate": 5.582209445536135e-07, "loss": 0.7391, "step": 5457 }, { "epoch": 0.8964441159563111, "grad_norm": 1.848370313843068, "learning_rate": 5.564696073105669e-07, "loss": 0.7628, "step": 5458 }, { "epoch": 0.8966083600229942, "grad_norm": 1.780259748651159, "learning_rate": 5.547209430389322e-07, "loss": 0.7494, "step": 5459 }, { "epoch": 0.8967726040896773, "grad_norm": 3.7304073413296477, "learning_rate": 5.529749522336625e-07, "loss": 0.7377, "step": 5460 }, { "epoch": 0.8969368481563603, "grad_norm": 1.91913725635434, "learning_rate": 5.512316353889591e-07, "loss": 0.6616, "step": 5461 }, { "epoch": 0.8971010922230435, "grad_norm": 0.6039270208172627, "learning_rate": 5.494909929982617e-07, "loss": 0.293, "step": 5462 }, { "epoch": 0.8972653362897265, "grad_norm": 1.6699269571052209, "learning_rate": 5.477530255542573e-07, "loss": 0.7381, "step": 5463 }, { "epoch": 0.8974295803564096, "grad_norm": 1.6984663797478132, "learning_rate": 5.460177335488736e-07, "loss": 0.7159, "step": 5464 }, { "epoch": 0.8975938244230928, "grad_norm": 1.6853370212185017, "learning_rate": 5.442851174732799e-07, "loss": 0.6918, "step": 5465 }, { "epoch": 0.8977580684897758, "grad_norm": 1.783829718326774, "learning_rate": 5.425551778178917e-07, "loss": 0.7127, "step": 5466 }, { "epoch": 0.8979223125564589, "grad_norm": 1.362022921410679, "learning_rate": 5.408279150723628e-07, "loss": 0.7989, "step": 5467 }, { "epoch": 0.898086556623142, "grad_norm": 2.115088717182433, "learning_rate": 5.391033297255932e-07, "loss": 0.7902, "step": 5468 }, { "epoch": 0.8982508006898251, "grad_norm": 3.445519787875428, "learning_rate": 5.373814222657214e-07, "loss": 0.7233, "step": 5469 }, { "epoch": 0.8984150447565081, "grad_norm": 1.552263449172889, "learning_rate": 5.356621931801309e-07, "loss": 0.7805, "step": 5470 }, { "epoch": 0.8985792888231913, "grad_norm": 1.5448055305667632, "learning_rate": 5.339456429554446e-07, "loss": 0.7721, "step": 5471 }, { "epoch": 0.8987435328898744, "grad_norm": 2.323836529378334, "learning_rate": 5.322317720775316e-07, "loss": 0.6942, "step": 5472 }, { "epoch": 0.8989077769565574, "grad_norm": 2.110302048402503, "learning_rate": 5.305205810314951e-07, "loss": 0.6767, "step": 5473 }, { "epoch": 0.8990720210232406, "grad_norm": 1.9718733748208497, "learning_rate": 5.288120703016863e-07, "loss": 0.7265, "step": 5474 }, { "epoch": 0.8992362650899236, "grad_norm": 2.114744430707877, "learning_rate": 5.271062403716953e-07, "loss": 0.7158, "step": 5475 }, { "epoch": 0.8994005091566067, "grad_norm": 2.0895792127936526, "learning_rate": 5.254030917243535e-07, "loss": 0.809, "step": 5476 }, { "epoch": 0.8995647532232898, "grad_norm": 1.7618248922656696, "learning_rate": 5.237026248417343e-07, "loss": 0.7786, "step": 5477 }, { "epoch": 0.8997289972899729, "grad_norm": 1.8814156676356901, "learning_rate": 5.220048402051503e-07, "loss": 0.6046, "step": 5478 }, { "epoch": 0.8998932413566559, "grad_norm": 1.6675605341744328, "learning_rate": 5.203097382951572e-07, "loss": 0.654, "step": 5479 }, { "epoch": 0.9000574854233391, "grad_norm": 2.8571695374460226, "learning_rate": 5.186173195915478e-07, "loss": 0.7966, "step": 5480 }, { "epoch": 0.9002217294900222, "grad_norm": 1.5136459157743716, "learning_rate": 5.169275845733601e-07, "loss": 0.7784, "step": 5481 }, { "epoch": 0.9003859735567052, "grad_norm": 1.762753866996377, "learning_rate": 5.15240533718867e-07, "loss": 0.702, "step": 5482 }, { "epoch": 0.9005502176233884, "grad_norm": 2.136191171480407, "learning_rate": 5.135561675055889e-07, "loss": 0.806, "step": 5483 }, { "epoch": 0.9007144616900714, "grad_norm": 2.282810461379112, "learning_rate": 5.118744864102787e-07, "loss": 0.7099, "step": 5484 }, { "epoch": 0.9008787057567545, "grad_norm": 2.042179613579721, "learning_rate": 5.101954909089346e-07, "loss": 0.7322, "step": 5485 }, { "epoch": 0.9010429498234376, "grad_norm": 1.7628264758868115, "learning_rate": 5.08519181476792e-07, "loss": 0.857, "step": 5486 }, { "epoch": 0.9012071938901207, "grad_norm": 1.8648527587100943, "learning_rate": 5.06845558588327e-07, "loss": 0.7765, "step": 5487 }, { "epoch": 0.9013714379568039, "grad_norm": 2.3095424485199505, "learning_rate": 5.051746227172538e-07, "loss": 0.7064, "step": 5488 }, { "epoch": 0.9015356820234869, "grad_norm": 1.6479992889342312, "learning_rate": 5.035063743365299e-07, "loss": 0.6936, "step": 5489 }, { "epoch": 0.90169992609017, "grad_norm": 1.826490264065697, "learning_rate": 5.018408139183462e-07, "loss": 0.7574, "step": 5490 }, { "epoch": 0.901864170156853, "grad_norm": 0.6271234720882374, "learning_rate": 5.001779419341391e-07, "loss": 0.3005, "step": 5491 }, { "epoch": 0.9020284142235362, "grad_norm": 2.7725617129056994, "learning_rate": 4.985177588545786e-07, "loss": 0.7826, "step": 5492 }, { "epoch": 0.9021926582902192, "grad_norm": 1.6878511712203732, "learning_rate": 4.96860265149578e-07, "loss": 0.7336, "step": 5493 }, { "epoch": 0.9023569023569024, "grad_norm": 1.8770557754832125, "learning_rate": 4.952054612882873e-07, "loss": 0.7514, "step": 5494 }, { "epoch": 0.9025211464235855, "grad_norm": 1.7210854269872795, "learning_rate": 4.93553347739093e-07, "loss": 0.727, "step": 5495 }, { "epoch": 0.9026853904902685, "grad_norm": 2.0873788283960852, "learning_rate": 4.919039249696233e-07, "loss": 0.7632, "step": 5496 }, { "epoch": 0.9028496345569517, "grad_norm": 2.1667255617794288, "learning_rate": 4.90257193446746e-07, "loss": 0.8717, "step": 5497 }, { "epoch": 0.9030138786236347, "grad_norm": 1.5925583938687151, "learning_rate": 4.886131536365623e-07, "loss": 0.6717, "step": 5498 }, { "epoch": 0.9031781226903178, "grad_norm": 2.4019627131020185, "learning_rate": 4.869718060044148e-07, "loss": 0.7512, "step": 5499 }, { "epoch": 0.9033423667570009, "grad_norm": 2.2638850880303534, "learning_rate": 4.853331510148851e-07, "loss": 0.7043, "step": 5500 }, { "epoch": 0.903506610823684, "grad_norm": 2.2872748078476355, "learning_rate": 4.836971891317898e-07, "loss": 0.6778, "step": 5501 }, { "epoch": 0.9036708548903671, "grad_norm": 1.5630817125249938, "learning_rate": 4.820639208181832e-07, "loss": 0.7155, "step": 5502 }, { "epoch": 0.9038350989570502, "grad_norm": 1.6755868251495865, "learning_rate": 4.804333465363609e-07, "loss": 0.7183, "step": 5503 }, { "epoch": 0.9039993430237333, "grad_norm": 1.7478271197072917, "learning_rate": 4.788054667478526e-07, "loss": 0.6521, "step": 5504 }, { "epoch": 0.9041635870904163, "grad_norm": 0.6070955835324557, "learning_rate": 4.771802819134253e-07, "loss": 0.3178, "step": 5505 }, { "epoch": 0.9043278311570995, "grad_norm": 2.5786909881767714, "learning_rate": 4.7555779249308545e-07, "loss": 0.7954, "step": 5506 }, { "epoch": 0.9044920752237825, "grad_norm": 0.6327118625635659, "learning_rate": 4.739379989460746e-07, "loss": 0.3322, "step": 5507 }, { "epoch": 0.9046563192904656, "grad_norm": 1.936700118553923, "learning_rate": 4.723209017308727e-07, "loss": 0.7375, "step": 5508 }, { "epoch": 0.9048205633571488, "grad_norm": 1.344134372227343, "learning_rate": 4.707065013051948e-07, "loss": 0.7444, "step": 5509 }, { "epoch": 0.9049848074238318, "grad_norm": 1.5972994129222657, "learning_rate": 4.69094798125993e-07, "loss": 0.7426, "step": 5510 }, { "epoch": 0.9051490514905149, "grad_norm": 2.228904470984248, "learning_rate": 4.6748579264945806e-07, "loss": 0.744, "step": 5511 }, { "epoch": 0.905313295557198, "grad_norm": 1.6796933631977362, "learning_rate": 4.658794853310156e-07, "loss": 0.7109, "step": 5512 }, { "epoch": 0.9054775396238811, "grad_norm": 1.8283649396551496, "learning_rate": 4.6427587662532636e-07, "loss": 0.5989, "step": 5513 }, { "epoch": 0.9056417836905641, "grad_norm": 2.1298226072009765, "learning_rate": 4.6267496698628846e-07, "loss": 0.696, "step": 5514 }, { "epoch": 0.9058060277572473, "grad_norm": 2.0079602008463366, "learning_rate": 4.6107675686703715e-07, "loss": 0.7318, "step": 5515 }, { "epoch": 0.9059702718239303, "grad_norm": 2.7130621713346246, "learning_rate": 4.5948124671994164e-07, "loss": 0.7474, "step": 5516 }, { "epoch": 0.9061345158906134, "grad_norm": 1.9152655771730043, "learning_rate": 4.5788843699660745e-07, "loss": 0.8863, "step": 5517 }, { "epoch": 0.9062987599572966, "grad_norm": 2.09411308528274, "learning_rate": 4.562983281478761e-07, "loss": 0.77, "step": 5518 }, { "epoch": 0.9064630040239796, "grad_norm": 2.4134351491320265, "learning_rate": 4.547109206238243e-07, "loss": 0.6399, "step": 5519 }, { "epoch": 0.9066272480906628, "grad_norm": 1.5499232813785346, "learning_rate": 4.5312621487376585e-07, "loss": 0.7775, "step": 5520 }, { "epoch": 0.9067914921573458, "grad_norm": 1.572853420245934, "learning_rate": 4.515442113462465e-07, "loss": 0.6871, "step": 5521 }, { "epoch": 0.9069557362240289, "grad_norm": 1.9594090203862242, "learning_rate": 4.499649104890502e-07, "loss": 0.6807, "step": 5522 }, { "epoch": 0.907119980290712, "grad_norm": 1.943084501235484, "learning_rate": 4.4838831274919505e-07, "loss": 0.7232, "step": 5523 }, { "epoch": 0.9072842243573951, "grad_norm": 1.8917682487010017, "learning_rate": 4.468144185729328e-07, "loss": 0.7758, "step": 5524 }, { "epoch": 0.9074484684240782, "grad_norm": 1.9802578968969962, "learning_rate": 4.4524322840575175e-07, "loss": 0.6995, "step": 5525 }, { "epoch": 0.9076127124907613, "grad_norm": 1.8899184130948297, "learning_rate": 4.4367474269237267e-07, "loss": 0.7529, "step": 5526 }, { "epoch": 0.9077769565574444, "grad_norm": 1.6429294231701963, "learning_rate": 4.4210896187675266e-07, "loss": 0.6494, "step": 5527 }, { "epoch": 0.9079412006241274, "grad_norm": 2.087590368579709, "learning_rate": 4.4054588640208285e-07, "loss": 0.7318, "step": 5528 }, { "epoch": 0.9081054446908106, "grad_norm": 1.7870364562204668, "learning_rate": 4.3898551671078926e-07, "loss": 0.7504, "step": 5529 }, { "epoch": 0.9082696887574936, "grad_norm": 2.056929224620405, "learning_rate": 4.3742785324453086e-07, "loss": 0.7691, "step": 5530 }, { "epoch": 0.9084339328241767, "grad_norm": 1.9665342668121586, "learning_rate": 4.358728964442005e-07, "loss": 0.7665, "step": 5531 }, { "epoch": 0.9085981768908599, "grad_norm": 1.4010875606707596, "learning_rate": 4.34320646749925e-07, "loss": 0.7524, "step": 5532 }, { "epoch": 0.9087624209575429, "grad_norm": 1.8541163657870054, "learning_rate": 4.327711046010663e-07, "loss": 0.7466, "step": 5533 }, { "epoch": 0.908926665024226, "grad_norm": 1.8923551266724152, "learning_rate": 4.3122427043621905e-07, "loss": 0.7284, "step": 5534 }, { "epoch": 0.9090909090909091, "grad_norm": 1.552068652961279, "learning_rate": 4.2968014469321194e-07, "loss": 0.6723, "step": 5535 }, { "epoch": 0.9092551531575922, "grad_norm": 1.6885711745267802, "learning_rate": 4.2813872780910425e-07, "loss": 0.795, "step": 5536 }, { "epoch": 0.9094193972242752, "grad_norm": 1.6668840051794682, "learning_rate": 4.266000202201948e-07, "loss": 0.7555, "step": 5537 }, { "epoch": 0.9095836412909584, "grad_norm": 1.7957226452262658, "learning_rate": 4.2506402236200616e-07, "loss": 0.7611, "step": 5538 }, { "epoch": 0.9097478853576415, "grad_norm": 0.6138616769906942, "learning_rate": 4.2353073466930404e-07, "loss": 0.3423, "step": 5539 }, { "epoch": 0.9099121294243245, "grad_norm": 1.8867964739132221, "learning_rate": 4.2200015757607905e-07, "loss": 0.676, "step": 5540 }, { "epoch": 0.9100763734910077, "grad_norm": 1.5229143687604876, "learning_rate": 4.20472291515559e-07, "loss": 0.6625, "step": 5541 }, { "epoch": 0.9102406175576907, "grad_norm": 1.7153213503849203, "learning_rate": 4.189471369202036e-07, "loss": 0.7582, "step": 5542 }, { "epoch": 0.9104048616243738, "grad_norm": 1.7093907409459206, "learning_rate": 4.1742469422170417e-07, "loss": 0.7565, "step": 5543 }, { "epoch": 0.9105691056910569, "grad_norm": 1.8679636808228974, "learning_rate": 4.15904963850986e-07, "loss": 0.7313, "step": 5544 }, { "epoch": 0.91073334975774, "grad_norm": 1.6896660357411506, "learning_rate": 4.143879462382039e-07, "loss": 0.7373, "step": 5545 }, { "epoch": 0.9108975938244231, "grad_norm": 2.009947824437482, "learning_rate": 4.128736418127477e-07, "loss": 0.7037, "step": 5546 }, { "epoch": 0.9110618378911062, "grad_norm": 2.2730259368535783, "learning_rate": 4.113620510032368e-07, "loss": 0.7365, "step": 5547 }, { "epoch": 0.9112260819577893, "grad_norm": 1.6954852504275904, "learning_rate": 4.0985317423752557e-07, "loss": 0.7693, "step": 5548 }, { "epoch": 0.9113903260244723, "grad_norm": 1.7009186751492542, "learning_rate": 4.083470119426969e-07, "loss": 0.7742, "step": 5549 }, { "epoch": 0.9115545700911555, "grad_norm": 1.9193530694234642, "learning_rate": 4.0684356454506747e-07, "loss": 0.6766, "step": 5550 }, { "epoch": 0.9117188141578385, "grad_norm": 1.637323592815818, "learning_rate": 4.053428324701836e-07, "loss": 0.7498, "step": 5551 }, { "epoch": 0.9118830582245216, "grad_norm": 2.1960268933636358, "learning_rate": 4.0384481614282764e-07, "loss": 0.7623, "step": 5552 }, { "epoch": 0.9120473022912047, "grad_norm": 1.4923413611008114, "learning_rate": 4.0234951598700725e-07, "loss": 0.7178, "step": 5553 }, { "epoch": 0.9122115463578878, "grad_norm": 1.8115792940040354, "learning_rate": 4.008569324259648e-07, "loss": 0.7711, "step": 5554 }, { "epoch": 0.912375790424571, "grad_norm": 0.5859122130091591, "learning_rate": 3.9936706588217243e-07, "loss": 0.3125, "step": 5555 }, { "epoch": 0.912540034491254, "grad_norm": 1.5931041959630656, "learning_rate": 3.9787991677733397e-07, "loss": 0.7465, "step": 5556 }, { "epoch": 0.9127042785579371, "grad_norm": 1.8794376045952719, "learning_rate": 3.9639548553238483e-07, "loss": 0.7284, "step": 5557 }, { "epoch": 0.9128685226246201, "grad_norm": 1.6332531586250996, "learning_rate": 3.94913772567489e-07, "loss": 0.8093, "step": 5558 }, { "epoch": 0.9130327666913033, "grad_norm": 2.0047451277149695, "learning_rate": 3.9343477830204424e-07, "loss": 0.7692, "step": 5559 }, { "epoch": 0.9131970107579863, "grad_norm": 2.6447058694496874, "learning_rate": 3.9195850315467244e-07, "loss": 0.7155, "step": 5560 }, { "epoch": 0.9133612548246695, "grad_norm": 1.684257630237752, "learning_rate": 3.904849475432337e-07, "loss": 0.7406, "step": 5561 }, { "epoch": 0.9135254988913526, "grad_norm": 1.9481277829763992, "learning_rate": 3.8901411188481453e-07, "loss": 0.8182, "step": 5562 }, { "epoch": 0.9136897429580356, "grad_norm": 1.7436909828433682, "learning_rate": 3.875459965957307e-07, "loss": 0.6647, "step": 5563 }, { "epoch": 0.9138539870247188, "grad_norm": 2.8022680990871653, "learning_rate": 3.860806020915286e-07, "loss": 0.7318, "step": 5564 }, { "epoch": 0.9140182310914018, "grad_norm": 1.7002100126179698, "learning_rate": 3.846179287869878e-07, "loss": 0.7298, "step": 5565 }, { "epoch": 0.9141824751580849, "grad_norm": 0.5630576654380177, "learning_rate": 3.831579770961125e-07, "loss": 0.297, "step": 5566 }, { "epoch": 0.914346719224768, "grad_norm": 16.533719649416778, "learning_rate": 3.817007474321399e-07, "loss": 0.7094, "step": 5567 }, { "epoch": 0.9145109632914511, "grad_norm": 1.7885922072006208, "learning_rate": 3.802462402075358e-07, "loss": 0.735, "step": 5568 }, { "epoch": 0.9146752073581342, "grad_norm": 2.9791884698347775, "learning_rate": 3.787944558339951e-07, "loss": 0.7029, "step": 5569 }, { "epoch": 0.9148394514248173, "grad_norm": 1.965365109782243, "learning_rate": 3.773453947224426e-07, "loss": 0.7447, "step": 5570 }, { "epoch": 0.9150036954915004, "grad_norm": 1.8401731167373574, "learning_rate": 3.7589905728303123e-07, "loss": 0.7673, "step": 5571 }, { "epoch": 0.9151679395581834, "grad_norm": 1.6494035364305453, "learning_rate": 3.7445544392514465e-07, "loss": 0.6929, "step": 5572 }, { "epoch": 0.9153321836248666, "grad_norm": 1.6133784212841669, "learning_rate": 3.7301455505739494e-07, "loss": 0.7167, "step": 5573 }, { "epoch": 0.9154964276915496, "grad_norm": 3.774992421491105, "learning_rate": 3.7157639108762136e-07, "loss": 0.7649, "step": 5574 }, { "epoch": 0.9156606717582327, "grad_norm": 1.9631705188542738, "learning_rate": 3.701409524228927e-07, "loss": 0.736, "step": 5575 }, { "epoch": 0.9158249158249159, "grad_norm": 2.016449925911159, "learning_rate": 3.687082394695096e-07, "loss": 0.7076, "step": 5576 }, { "epoch": 0.9159891598915989, "grad_norm": 1.686670300759552, "learning_rate": 3.6727825263299656e-07, "loss": 0.7422, "step": 5577 }, { "epoch": 0.916153403958282, "grad_norm": 1.6687488998140698, "learning_rate": 3.6585099231810863e-07, "loss": 0.6785, "step": 5578 }, { "epoch": 0.9163176480249651, "grad_norm": 1.7578863682804557, "learning_rate": 3.644264589288282e-07, "loss": 0.7636, "step": 5579 }, { "epoch": 0.9164818920916482, "grad_norm": 1.6865918124236603, "learning_rate": 3.630046528683695e-07, "loss": 0.6916, "step": 5580 }, { "epoch": 0.9166461361583312, "grad_norm": 1.4042905598150945, "learning_rate": 3.615855745391683e-07, "loss": 0.7123, "step": 5581 }, { "epoch": 0.9168103802250144, "grad_norm": 1.721055977972464, "learning_rate": 3.6016922434289113e-07, "loss": 0.6437, "step": 5582 }, { "epoch": 0.9169746242916975, "grad_norm": 2.3400486416562107, "learning_rate": 3.587556026804362e-07, "loss": 0.722, "step": 5583 }, { "epoch": 0.9171388683583805, "grad_norm": 2.187224310727989, "learning_rate": 3.573447099519245e-07, "loss": 0.7085, "step": 5584 }, { "epoch": 0.9173031124250637, "grad_norm": 1.8334346954237903, "learning_rate": 3.559365465567055e-07, "loss": 0.6931, "step": 5585 }, { "epoch": 0.9174673564917467, "grad_norm": 1.8586251874255537, "learning_rate": 3.545311128933582e-07, "loss": 0.7309, "step": 5586 }, { "epoch": 0.9176316005584298, "grad_norm": 1.994357479860277, "learning_rate": 3.531284093596865e-07, "loss": 0.7511, "step": 5587 }, { "epoch": 0.9177958446251129, "grad_norm": 1.8665953855334023, "learning_rate": 3.5172843635272403e-07, "loss": 0.6885, "step": 5588 }, { "epoch": 0.917960088691796, "grad_norm": 1.8729385480131984, "learning_rate": 3.503311942687293e-07, "loss": 0.7199, "step": 5589 }, { "epoch": 0.918124332758479, "grad_norm": 1.8033641674227765, "learning_rate": 3.489366835031882e-07, "loss": 0.6998, "step": 5590 }, { "epoch": 0.9182885768251622, "grad_norm": 1.6988848633087732, "learning_rate": 3.47544904450815e-07, "loss": 0.7719, "step": 5591 }, { "epoch": 0.9184528208918453, "grad_norm": 2.335248022581615, "learning_rate": 3.4615585750555016e-07, "loss": 0.7171, "step": 5592 }, { "epoch": 0.9186170649585284, "grad_norm": 1.3493047466795167, "learning_rate": 3.4476954306056023e-07, "loss": 0.7287, "step": 5593 }, { "epoch": 0.9187813090252115, "grad_norm": 1.7269482841679837, "learning_rate": 3.43385961508238e-07, "loss": 0.8378, "step": 5594 }, { "epoch": 0.9189455530918945, "grad_norm": 1.515993164864277, "learning_rate": 3.420051132402036e-07, "loss": 0.7388, "step": 5595 }, { "epoch": 0.9191097971585777, "grad_norm": 2.057040153782669, "learning_rate": 3.406269986473032e-07, "loss": 0.7372, "step": 5596 }, { "epoch": 0.9192740412252607, "grad_norm": 1.8358168363436527, "learning_rate": 3.392516181196093e-07, "loss": 0.7545, "step": 5597 }, { "epoch": 0.9194382852919438, "grad_norm": 3.2922041796367614, "learning_rate": 3.378789720464193e-07, "loss": 0.6981, "step": 5598 }, { "epoch": 0.919602529358627, "grad_norm": 1.7877368985573654, "learning_rate": 3.365090608162591e-07, "loss": 0.732, "step": 5599 }, { "epoch": 0.91976677342531, "grad_norm": 1.9971059362487447, "learning_rate": 3.351418848168808e-07, "loss": 0.7048, "step": 5600 }, { "epoch": 0.9199310174919931, "grad_norm": 1.7571471233017895, "learning_rate": 3.3377744443525816e-07, "loss": 0.7448, "step": 5601 }, { "epoch": 0.9200952615586762, "grad_norm": 1.8920241705176115, "learning_rate": 3.324157400575945e-07, "loss": 0.6955, "step": 5602 }, { "epoch": 0.9202595056253593, "grad_norm": 2.3750262343059125, "learning_rate": 3.31056772069317e-07, "loss": 0.6594, "step": 5603 }, { "epoch": 0.9204237496920423, "grad_norm": 1.9009838136329864, "learning_rate": 3.2970054085507795e-07, "loss": 0.6464, "step": 5604 }, { "epoch": 0.9205879937587255, "grad_norm": 1.9713542470713508, "learning_rate": 3.2834704679875596e-07, "loss": 0.7294, "step": 5605 }, { "epoch": 0.9207522378254086, "grad_norm": 2.002421349757353, "learning_rate": 3.269962902834545e-07, "loss": 0.7335, "step": 5606 }, { "epoch": 0.9209164818920916, "grad_norm": 1.45937398426119, "learning_rate": 3.256482716915044e-07, "loss": 0.7114, "step": 5607 }, { "epoch": 0.9210807259587748, "grad_norm": 1.9194144553259582, "learning_rate": 3.2430299140445597e-07, "loss": 0.7512, "step": 5608 }, { "epoch": 0.9212449700254578, "grad_norm": 1.5692313442583155, "learning_rate": 3.229604498030914e-07, "loss": 0.8114, "step": 5609 }, { "epoch": 0.9214092140921409, "grad_norm": 2.223941736606372, "learning_rate": 3.216206472674122e-07, "loss": 0.6851, "step": 5610 }, { "epoch": 0.921573458158824, "grad_norm": 2.5601977195843317, "learning_rate": 3.202835841766483e-07, "loss": 0.7773, "step": 5611 }, { "epoch": 0.9217377022255071, "grad_norm": 6.714295695623977, "learning_rate": 3.1894926090925037e-07, "loss": 0.7053, "step": 5612 }, { "epoch": 0.9219019462921902, "grad_norm": 1.9355573582572012, "learning_rate": 3.176176778428974e-07, "loss": 0.6475, "step": 5613 }, { "epoch": 0.9220661903588733, "grad_norm": 1.5868606256689055, "learning_rate": 3.1628883535449127e-07, "loss": 0.709, "step": 5614 }, { "epoch": 0.9222304344255564, "grad_norm": 1.9728386401758022, "learning_rate": 3.149627338201566e-07, "loss": 0.7442, "step": 5615 }, { "epoch": 0.9223946784922394, "grad_norm": 1.4750977395131804, "learning_rate": 3.1363937361524545e-07, "loss": 0.6757, "step": 5616 }, { "epoch": 0.9225589225589226, "grad_norm": 1.5662261333491678, "learning_rate": 3.123187551143314e-07, "loss": 0.7651, "step": 5617 }, { "epoch": 0.9227231666256056, "grad_norm": 2.0225315477105967, "learning_rate": 3.110008786912122e-07, "loss": 0.7244, "step": 5618 }, { "epoch": 0.9228874106922887, "grad_norm": 2.2123399254934726, "learning_rate": 3.0968574471891057e-07, "loss": 0.77, "step": 5619 }, { "epoch": 0.9230516547589719, "grad_norm": 1.5910636166863534, "learning_rate": 3.08373353569672e-07, "loss": 0.702, "step": 5620 }, { "epoch": 0.9232158988256549, "grad_norm": 2.243300694254895, "learning_rate": 3.07063705614965e-07, "loss": 0.733, "step": 5621 }, { "epoch": 0.923380142892338, "grad_norm": 1.681900344604224, "learning_rate": 3.0575680122548525e-07, "loss": 0.7713, "step": 5622 }, { "epoch": 0.9235443869590211, "grad_norm": 1.8773444680781162, "learning_rate": 3.044526407711501e-07, "loss": 0.6916, "step": 5623 }, { "epoch": 0.9237086310257042, "grad_norm": 1.8816237168047927, "learning_rate": 3.031512246210955e-07, "loss": 0.751, "step": 5624 }, { "epoch": 0.9238728750923872, "grad_norm": 2.8777668951707045, "learning_rate": 3.0185255314368555e-07, "loss": 0.6714, "step": 5625 }, { "epoch": 0.9240371191590704, "grad_norm": 1.945333751579908, "learning_rate": 3.005566267065085e-07, "loss": 0.7648, "step": 5626 }, { "epoch": 0.9242013632257535, "grad_norm": 1.4077192182918628, "learning_rate": 2.992634456763721e-07, "loss": 0.8223, "step": 5627 }, { "epoch": 0.9243656072924366, "grad_norm": 2.1501412843892536, "learning_rate": 2.97973010419309e-07, "loss": 0.7515, "step": 5628 }, { "epoch": 0.9245298513591197, "grad_norm": 1.513837337515496, "learning_rate": 2.9668532130057384e-07, "loss": 0.7611, "step": 5629 }, { "epoch": 0.9246940954258027, "grad_norm": 1.7452100794875927, "learning_rate": 2.954003786846449e-07, "loss": 0.7265, "step": 5630 }, { "epoch": 0.9248583394924859, "grad_norm": 1.7318040730050952, "learning_rate": 2.941181829352213e-07, "loss": 0.7235, "step": 5631 }, { "epoch": 0.9250225835591689, "grad_norm": 1.6641623214574128, "learning_rate": 2.9283873441522723e-07, "loss": 0.7563, "step": 5632 }, { "epoch": 0.925186827625852, "grad_norm": 1.7377715049804647, "learning_rate": 2.915620334868074e-07, "loss": 0.7582, "step": 5633 }, { "epoch": 0.925351071692535, "grad_norm": 3.182515671913392, "learning_rate": 2.902880805113284e-07, "loss": 0.7367, "step": 5634 }, { "epoch": 0.9255153157592182, "grad_norm": 1.662683086191026, "learning_rate": 2.8901687584938163e-07, "loss": 0.7821, "step": 5635 }, { "epoch": 0.9256795598259013, "grad_norm": 1.9620201486860576, "learning_rate": 2.877484198607783e-07, "loss": 0.7627, "step": 5636 }, { "epoch": 0.9258438038925844, "grad_norm": 1.8899929436452925, "learning_rate": 2.864827129045511e-07, "loss": 0.7419, "step": 5637 }, { "epoch": 0.9260080479592675, "grad_norm": 1.6098054274085556, "learning_rate": 2.852197553389568e-07, "loss": 0.7291, "step": 5638 }, { "epoch": 0.9261722920259505, "grad_norm": 3.440081949329503, "learning_rate": 2.8395954752147293e-07, "loss": 0.6754, "step": 5639 }, { "epoch": 0.9263365360926337, "grad_norm": 1.6331254337570573, "learning_rate": 2.8270208980879843e-07, "loss": 0.7574, "step": 5640 }, { "epoch": 0.9265007801593167, "grad_norm": 1.7416603508387638, "learning_rate": 2.8144738255685423e-07, "loss": 0.7983, "step": 5641 }, { "epoch": 0.9266650242259998, "grad_norm": 1.6440176346442514, "learning_rate": 2.801954261207818e-07, "loss": 0.7225, "step": 5642 }, { "epoch": 0.926829268292683, "grad_norm": 1.8516990338170511, "learning_rate": 2.789462208549454e-07, "loss": 0.7665, "step": 5643 }, { "epoch": 0.926993512359366, "grad_norm": 1.816431793655557, "learning_rate": 2.7769976711293e-07, "loss": 0.7019, "step": 5644 }, { "epoch": 0.9271577564260491, "grad_norm": 2.0851772873017254, "learning_rate": 2.764560652475412e-07, "loss": 0.7481, "step": 5645 }, { "epoch": 0.9273220004927322, "grad_norm": 0.5553726071133336, "learning_rate": 2.752151156108074e-07, "loss": 0.279, "step": 5646 }, { "epoch": 0.9274862445594153, "grad_norm": 1.7422900343685501, "learning_rate": 2.7397691855397534e-07, "loss": 0.6664, "step": 5647 }, { "epoch": 0.9276504886260983, "grad_norm": 0.6362676968050665, "learning_rate": 2.727414744275147e-07, "loss": 0.306, "step": 5648 }, { "epoch": 0.9278147326927815, "grad_norm": 1.7044460696425627, "learning_rate": 2.7150878358111585e-07, "loss": 0.7674, "step": 5649 }, { "epoch": 0.9279789767594646, "grad_norm": 7.104054348045914, "learning_rate": 2.7027884636368853e-07, "loss": 0.8044, "step": 5650 }, { "epoch": 0.9281432208261476, "grad_norm": 1.4829343040363367, "learning_rate": 2.6905166312336525e-07, "loss": 0.7355, "step": 5651 }, { "epoch": 0.9283074648928308, "grad_norm": 1.7691129045101865, "learning_rate": 2.67827234207495e-07, "loss": 0.7563, "step": 5652 }, { "epoch": 0.9284717089595138, "grad_norm": 1.4880737234683463, "learning_rate": 2.666055599626527e-07, "loss": 0.7988, "step": 5653 }, { "epoch": 0.928635953026197, "grad_norm": 1.712851595097735, "learning_rate": 2.653866407346284e-07, "loss": 0.6986, "step": 5654 }, { "epoch": 0.92880019709288, "grad_norm": 2.62359640599399, "learning_rate": 2.641704768684361e-07, "loss": 0.7831, "step": 5655 }, { "epoch": 0.9289644411595631, "grad_norm": 1.582460620724821, "learning_rate": 2.629570687083083e-07, "loss": 0.7501, "step": 5656 }, { "epoch": 0.9291286852262463, "grad_norm": 1.6755607671670283, "learning_rate": 2.6174641659769683e-07, "loss": 0.8307, "step": 5657 }, { "epoch": 0.9292929292929293, "grad_norm": 1.9799895418719171, "learning_rate": 2.6053852087927436e-07, "loss": 0.7458, "step": 5658 }, { "epoch": 0.9294571733596124, "grad_norm": 2.0674473921883267, "learning_rate": 2.5933338189493395e-07, "loss": 0.7392, "step": 5659 }, { "epoch": 0.9296214174262954, "grad_norm": 1.5674836373748073, "learning_rate": 2.581309999857873e-07, "loss": 0.6981, "step": 5660 }, { "epoch": 0.9297856614929786, "grad_norm": 1.7676747400896806, "learning_rate": 2.569313754921665e-07, "loss": 0.7065, "step": 5661 }, { "epoch": 0.9299499055596616, "grad_norm": 2.046177135405766, "learning_rate": 2.5573450875362117e-07, "loss": 0.7832, "step": 5662 }, { "epoch": 0.9301141496263448, "grad_norm": 1.9792556067089808, "learning_rate": 2.5454040010892354e-07, "loss": 0.7309, "step": 5663 }, { "epoch": 0.9302783936930279, "grad_norm": 1.7106729056269228, "learning_rate": 2.5334904989606336e-07, "loss": 0.8201, "step": 5664 }, { "epoch": 0.9304426377597109, "grad_norm": 1.8569175066386754, "learning_rate": 2.5216045845224854e-07, "loss": 0.7608, "step": 5665 }, { "epoch": 0.9306068818263941, "grad_norm": 1.5560985218482144, "learning_rate": 2.5097462611390897e-07, "loss": 0.7094, "step": 5666 }, { "epoch": 0.9307711258930771, "grad_norm": 1.7844515427377956, "learning_rate": 2.4979155321669166e-07, "loss": 0.69, "step": 5667 }, { "epoch": 0.9309353699597602, "grad_norm": 2.7425784176374663, "learning_rate": 2.486112400954621e-07, "loss": 0.6706, "step": 5668 }, { "epoch": 0.9310996140264433, "grad_norm": 0.6424308608643894, "learning_rate": 2.474336870843064e-07, "loss": 0.3278, "step": 5669 }, { "epoch": 0.9312638580931264, "grad_norm": 1.7562049679344733, "learning_rate": 2.462588945165267e-07, "loss": 0.8007, "step": 5670 }, { "epoch": 0.9314281021598094, "grad_norm": 1.5037396482788266, "learning_rate": 2.450868627246483e-07, "loss": 0.7649, "step": 5671 }, { "epoch": 0.9315923462264926, "grad_norm": 1.8752259428329703, "learning_rate": 2.439175920404102e-07, "loss": 0.7768, "step": 5672 }, { "epoch": 0.9317565902931757, "grad_norm": 1.7275858911670348, "learning_rate": 2.427510827947721e-07, "loss": 0.778, "step": 5673 }, { "epoch": 0.9319208343598587, "grad_norm": 1.5461709688492784, "learning_rate": 2.415873353179132e-07, "loss": 0.725, "step": 5674 }, { "epoch": 0.9320850784265419, "grad_norm": 1.6382809892907142, "learning_rate": 2.404263499392301e-07, "loss": 0.7576, "step": 5675 }, { "epoch": 0.9322493224932249, "grad_norm": 1.5060090047049106, "learning_rate": 2.3926812698733535e-07, "loss": 0.7086, "step": 5676 }, { "epoch": 0.932413566559908, "grad_norm": 1.785673218060708, "learning_rate": 2.381126667900624e-07, "loss": 0.7287, "step": 5677 }, { "epoch": 0.9325778106265911, "grad_norm": 1.7362945585932568, "learning_rate": 2.3695996967446178e-07, "loss": 0.7461, "step": 5678 }, { "epoch": 0.9327420546932742, "grad_norm": 1.739516500262754, "learning_rate": 2.3581003596680252e-07, "loss": 0.7916, "step": 5679 }, { "epoch": 0.9329062987599573, "grad_norm": 2.1345535990254922, "learning_rate": 2.3466286599257094e-07, "loss": 0.8358, "step": 5680 }, { "epoch": 0.9330705428266404, "grad_norm": 1.7553883678612165, "learning_rate": 2.335184600764695e-07, "loss": 0.7403, "step": 5681 }, { "epoch": 0.9332347868933235, "grad_norm": 1.9067560794795415, "learning_rate": 2.3237681854242245e-07, "loss": 0.6966, "step": 5682 }, { "epoch": 0.9333990309600065, "grad_norm": 1.7138475479143087, "learning_rate": 2.3123794171356683e-07, "loss": 0.714, "step": 5683 }, { "epoch": 0.9335632750266897, "grad_norm": 2.0174504829481954, "learning_rate": 2.3010182991226038e-07, "loss": 0.826, "step": 5684 }, { "epoch": 0.9337275190933727, "grad_norm": 0.5941218179345331, "learning_rate": 2.2896848346007584e-07, "loss": 0.2963, "step": 5685 }, { "epoch": 0.9338917631600558, "grad_norm": 1.6837184339442453, "learning_rate": 2.2783790267780658e-07, "loss": 0.7725, "step": 5686 }, { "epoch": 0.934056007226739, "grad_norm": 1.7318825707113363, "learning_rate": 2.2671008788546e-07, "loss": 0.6983, "step": 5687 }, { "epoch": 0.934220251293422, "grad_norm": 1.539606046786149, "learning_rate": 2.2558503940226296e-07, "loss": 0.7375, "step": 5688 }, { "epoch": 0.9343844953601051, "grad_norm": 1.6180367166720069, "learning_rate": 2.2446275754665514e-07, "loss": 0.7946, "step": 5689 }, { "epoch": 0.9345487394267882, "grad_norm": 1.81911492363737, "learning_rate": 2.2334324263629914e-07, "loss": 0.7088, "step": 5690 }, { "epoch": 0.9347129834934713, "grad_norm": 1.769142074837238, "learning_rate": 2.2222649498806924e-07, "loss": 0.7046, "step": 5691 }, { "epoch": 0.9348772275601543, "grad_norm": 2.949118411164376, "learning_rate": 2.211125149180604e-07, "loss": 0.7368, "step": 5692 }, { "epoch": 0.9350414716268375, "grad_norm": 2.095619043911114, "learning_rate": 2.2000130274158039e-07, "loss": 0.7394, "step": 5693 }, { "epoch": 0.9352057156935206, "grad_norm": 0.6063440669919136, "learning_rate": 2.1889285877315647e-07, "loss": 0.3355, "step": 5694 }, { "epoch": 0.9353699597602037, "grad_norm": 2.358951923882724, "learning_rate": 2.1778718332653103e-07, "loss": 0.6724, "step": 5695 }, { "epoch": 0.9355342038268868, "grad_norm": 1.9883529376487274, "learning_rate": 2.1668427671466375e-07, "loss": 0.789, "step": 5696 }, { "epoch": 0.9356984478935698, "grad_norm": 1.8057174022113176, "learning_rate": 2.1558413924972933e-07, "loss": 0.7348, "step": 5697 }, { "epoch": 0.935862691960253, "grad_norm": 2.1366154668924104, "learning_rate": 2.144867712431198e-07, "loss": 0.7189, "step": 5698 }, { "epoch": 0.936026936026936, "grad_norm": 1.5158785689746799, "learning_rate": 2.1339217300544335e-07, "loss": 0.8116, "step": 5699 }, { "epoch": 0.9361911800936191, "grad_norm": 1.59959109832981, "learning_rate": 2.1230034484652218e-07, "loss": 0.7604, "step": 5700 }, { "epoch": 0.9363554241603023, "grad_norm": 3.40501448046188, "learning_rate": 2.1121128707539796e-07, "loss": 0.7648, "step": 5701 }, { "epoch": 0.9365196682269853, "grad_norm": 1.5265048464300952, "learning_rate": 2.101250000003241e-07, "loss": 0.7318, "step": 5702 }, { "epoch": 0.9366839122936684, "grad_norm": 2.7515485866632896, "learning_rate": 2.0904148392877354e-07, "loss": 0.7771, "step": 5703 }, { "epoch": 0.9368481563603515, "grad_norm": 1.655987816289372, "learning_rate": 2.079607391674321e-07, "loss": 0.7783, "step": 5704 }, { "epoch": 0.9370124004270346, "grad_norm": 1.682362084659567, "learning_rate": 2.0688276602220392e-07, "loss": 0.7016, "step": 5705 }, { "epoch": 0.9371766444937176, "grad_norm": 2.7566105148894575, "learning_rate": 2.0580756479820496e-07, "loss": 0.7502, "step": 5706 }, { "epoch": 0.9373408885604008, "grad_norm": 1.382694136027324, "learning_rate": 2.0473513579976957e-07, "loss": 0.7203, "step": 5707 }, { "epoch": 0.9375051326270838, "grad_norm": 1.7231296011042732, "learning_rate": 2.0366547933044712e-07, "loss": 0.7295, "step": 5708 }, { "epoch": 0.9376693766937669, "grad_norm": 1.706006215527797, "learning_rate": 2.0259859569299989e-07, "loss": 0.6729, "step": 5709 }, { "epoch": 0.9378336207604501, "grad_norm": 1.8014823060792247, "learning_rate": 2.0153448518940967e-07, "loss": 0.7052, "step": 5710 }, { "epoch": 0.9379978648271331, "grad_norm": 1.9134194367996973, "learning_rate": 2.0047314812086883e-07, "loss": 0.676, "step": 5711 }, { "epoch": 0.9381621088938162, "grad_norm": 2.050321138808539, "learning_rate": 1.9941458478778597e-07, "loss": 0.8222, "step": 5712 }, { "epoch": 0.9383263529604993, "grad_norm": 2.056136363812956, "learning_rate": 1.983587954897881e-07, "loss": 0.7281, "step": 5713 }, { "epoch": 0.9384905970271824, "grad_norm": 1.730089640824202, "learning_rate": 1.9730578052571058e-07, "loss": 0.7424, "step": 5714 }, { "epoch": 0.9386548410938654, "grad_norm": 3.184120048126082, "learning_rate": 1.962555401936106e-07, "loss": 0.6691, "step": 5715 }, { "epoch": 0.9388190851605486, "grad_norm": 3.8372581827757277, "learning_rate": 1.9520807479075366e-07, "loss": 0.7826, "step": 5716 }, { "epoch": 0.9389833292272317, "grad_norm": 1.7170108188975173, "learning_rate": 1.941633846136237e-07, "loss": 0.7299, "step": 5717 }, { "epoch": 0.9391475732939147, "grad_norm": 1.815913994571612, "learning_rate": 1.931214699579176e-07, "loss": 0.6219, "step": 5718 }, { "epoch": 0.9393118173605979, "grad_norm": 1.8618167024395484, "learning_rate": 1.9208233111854824e-07, "loss": 0.7397, "step": 5719 }, { "epoch": 0.9394760614272809, "grad_norm": 1.4551266122266395, "learning_rate": 1.910459683896415e-07, "loss": 0.7297, "step": 5720 }, { "epoch": 0.939640305493964, "grad_norm": 0.6394513512310975, "learning_rate": 1.900123820645361e-07, "loss": 0.3473, "step": 5721 }, { "epoch": 0.9398045495606471, "grad_norm": 1.9473805497601735, "learning_rate": 1.8898157243578797e-07, "loss": 0.7095, "step": 5722 }, { "epoch": 0.9399687936273302, "grad_norm": 1.8930106297779368, "learning_rate": 1.8795353979516596e-07, "loss": 0.7217, "step": 5723 }, { "epoch": 0.9401330376940134, "grad_norm": 1.8339296896799946, "learning_rate": 1.8692828443365063e-07, "loss": 0.694, "step": 5724 }, { "epoch": 0.9402972817606964, "grad_norm": 1.5983394646153266, "learning_rate": 1.8590580664143987e-07, "loss": 0.756, "step": 5725 }, { "epoch": 0.9404615258273795, "grad_norm": 1.7907115029747436, "learning_rate": 1.8488610670794327e-07, "loss": 0.7114, "step": 5726 }, { "epoch": 0.9406257698940625, "grad_norm": 1.9712994537342572, "learning_rate": 1.838691849217855e-07, "loss": 0.7355, "step": 5727 }, { "epoch": 0.9407900139607457, "grad_norm": 3.123020487084774, "learning_rate": 1.8285504157080414e-07, "loss": 0.7093, "step": 5728 }, { "epoch": 0.9409542580274287, "grad_norm": 1.9179171884948163, "learning_rate": 1.818436769420484e-07, "loss": 0.7471, "step": 5729 }, { "epoch": 0.9411185020941119, "grad_norm": 1.606284116036228, "learning_rate": 1.80835091321786e-07, "loss": 0.7733, "step": 5730 }, { "epoch": 0.941282746160795, "grad_norm": 2.470097977921833, "learning_rate": 1.798292849954919e-07, "loss": 0.7983, "step": 5731 }, { "epoch": 0.941446990227478, "grad_norm": 1.9550462925994065, "learning_rate": 1.788262582478595e-07, "loss": 0.7205, "step": 5732 }, { "epoch": 0.9416112342941612, "grad_norm": 1.8660170806139145, "learning_rate": 1.7782601136279277e-07, "loss": 0.7732, "step": 5733 }, { "epoch": 0.9417754783608442, "grad_norm": 1.8737468135703443, "learning_rate": 1.768285446234086e-07, "loss": 0.7568, "step": 5734 }, { "epoch": 0.9419397224275273, "grad_norm": 1.8056645706417913, "learning_rate": 1.7583385831204003e-07, "loss": 0.7429, "step": 5735 }, { "epoch": 0.9421039664942104, "grad_norm": 1.6789430300583272, "learning_rate": 1.7484195271022853e-07, "loss": 0.7396, "step": 5736 }, { "epoch": 0.9422682105608935, "grad_norm": 1.9225169803533058, "learning_rate": 1.7385282809873283e-07, "loss": 0.8245, "step": 5737 }, { "epoch": 0.9424324546275766, "grad_norm": 2.3376109883129765, "learning_rate": 1.7286648475752122e-07, "loss": 0.6708, "step": 5738 }, { "epoch": 0.9425966986942597, "grad_norm": 1.8662380649356884, "learning_rate": 1.7188292296577703e-07, "loss": 0.6871, "step": 5739 }, { "epoch": 0.9427609427609428, "grad_norm": 1.635403184483016, "learning_rate": 1.7090214300189534e-07, "loss": 0.7566, "step": 5740 }, { "epoch": 0.9429251868276258, "grad_norm": 1.6408528962141606, "learning_rate": 1.69924145143483e-07, "loss": 0.7572, "step": 5741 }, { "epoch": 0.943089430894309, "grad_norm": 1.7382372283281748, "learning_rate": 1.6894892966736076e-07, "loss": 0.7269, "step": 5742 }, { "epoch": 0.943253674960992, "grad_norm": 1.97051957865035, "learning_rate": 1.6797649684956118e-07, "loss": 0.7088, "step": 5743 }, { "epoch": 0.9434179190276751, "grad_norm": 1.7730457771712558, "learning_rate": 1.6700684696532854e-07, "loss": 0.744, "step": 5744 }, { "epoch": 0.9435821630943582, "grad_norm": 1.799213254670409, "learning_rate": 1.6603998028912215e-07, "loss": 0.7325, "step": 5745 }, { "epoch": 0.9437464071610413, "grad_norm": 1.9866984174447377, "learning_rate": 1.6507589709461092e-07, "loss": 0.7299, "step": 5746 }, { "epoch": 0.9439106512277244, "grad_norm": 1.6466607153883421, "learning_rate": 1.6411459765467542e-07, "loss": 0.6946, "step": 5747 }, { "epoch": 0.9440748952944075, "grad_norm": 1.6456325172930193, "learning_rate": 1.6315608224141023e-07, "loss": 0.7192, "step": 5748 }, { "epoch": 0.9442391393610906, "grad_norm": 1.6453349117153573, "learning_rate": 1.6220035112612166e-07, "loss": 0.7736, "step": 5749 }, { "epoch": 0.9444033834277736, "grad_norm": 2.2566728981841115, "learning_rate": 1.6124740457932553e-07, "loss": 0.7546, "step": 5750 }, { "epoch": 0.9445676274944568, "grad_norm": 2.2288038914790542, "learning_rate": 1.6029724287075387e-07, "loss": 0.708, "step": 5751 }, { "epoch": 0.9447318715611398, "grad_norm": 1.5713251855922783, "learning_rate": 1.5934986626934602e-07, "loss": 0.7612, "step": 5752 }, { "epoch": 0.9448961156278229, "grad_norm": 2.349206548947523, "learning_rate": 1.5840527504325632e-07, "loss": 0.6806, "step": 5753 }, { "epoch": 0.9450603596945061, "grad_norm": 1.574217492520876, "learning_rate": 1.574634694598476e-07, "loss": 0.7846, "step": 5754 }, { "epoch": 0.9452246037611891, "grad_norm": 1.9070974565567127, "learning_rate": 1.565244497856977e-07, "loss": 0.7199, "step": 5755 }, { "epoch": 0.9453888478278722, "grad_norm": 1.7139268290621639, "learning_rate": 1.5558821628659293e-07, "loss": 0.7726, "step": 5756 }, { "epoch": 0.9455530918945553, "grad_norm": 2.835861409285592, "learning_rate": 1.5465476922753352e-07, "loss": 0.7754, "step": 5757 }, { "epoch": 0.9457173359612384, "grad_norm": 1.971323396916508, "learning_rate": 1.5372410887272814e-07, "loss": 0.7348, "step": 5758 }, { "epoch": 0.9458815800279214, "grad_norm": 1.8336197505724896, "learning_rate": 1.527962354855994e-07, "loss": 0.6549, "step": 5759 }, { "epoch": 0.9460458240946046, "grad_norm": 1.984645348097417, "learning_rate": 1.5187114932877945e-07, "loss": 0.6839, "step": 5760 }, { "epoch": 0.9462100681612877, "grad_norm": 2.006498703959593, "learning_rate": 1.5094885066411213e-07, "loss": 0.774, "step": 5761 }, { "epoch": 0.9463743122279707, "grad_norm": 1.7883309337081668, "learning_rate": 1.50029339752652e-07, "loss": 0.7295, "step": 5762 }, { "epoch": 0.9465385562946539, "grad_norm": 1.7896761718000775, "learning_rate": 1.4911261685466416e-07, "loss": 0.7494, "step": 5763 }, { "epoch": 0.9467028003613369, "grad_norm": 1.6691387307601124, "learning_rate": 1.4819868222962664e-07, "loss": 0.7124, "step": 5764 }, { "epoch": 0.94686704442802, "grad_norm": 1.64623362363272, "learning_rate": 1.472875361362247e-07, "loss": 0.7676, "step": 5765 }, { "epoch": 0.9470312884947031, "grad_norm": 1.6231914855059852, "learning_rate": 1.4637917883235874e-07, "loss": 0.6885, "step": 5766 }, { "epoch": 0.9471955325613862, "grad_norm": 1.6424759774344773, "learning_rate": 1.454736105751353e-07, "loss": 0.7248, "step": 5767 }, { "epoch": 0.9473597766280694, "grad_norm": 0.6252724494867604, "learning_rate": 1.4457083162087383e-07, "loss": 0.3131, "step": 5768 }, { "epoch": 0.9475240206947524, "grad_norm": 1.9008171751773164, "learning_rate": 1.436708422251054e-07, "loss": 0.7591, "step": 5769 }, { "epoch": 0.9476882647614355, "grad_norm": 1.7545101711905038, "learning_rate": 1.4277364264256854e-07, "loss": 0.7321, "step": 5770 }, { "epoch": 0.9478525088281186, "grad_norm": 4.819063710353785, "learning_rate": 1.4187923312721452e-07, "loss": 0.6961, "step": 5771 }, { "epoch": 0.9480167528948017, "grad_norm": 2.7391246045902182, "learning_rate": 1.4098761393220305e-07, "loss": 0.7439, "step": 5772 }, { "epoch": 0.9481809969614847, "grad_norm": 1.7368239871974374, "learning_rate": 1.4009878530990784e-07, "loss": 0.6567, "step": 5773 }, { "epoch": 0.9483452410281679, "grad_norm": 1.69203473713879, "learning_rate": 1.392127475119065e-07, "loss": 0.7033, "step": 5774 }, { "epoch": 0.948509485094851, "grad_norm": 2.098260105068749, "learning_rate": 1.3832950078899288e-07, "loss": 0.7916, "step": 5775 }, { "epoch": 0.948673729161534, "grad_norm": 2.419569796894306, "learning_rate": 1.3744904539116588e-07, "loss": 0.7575, "step": 5776 }, { "epoch": 0.9488379732282172, "grad_norm": 1.7706237510276135, "learning_rate": 1.3657138156763838e-07, "loss": 0.6962, "step": 5777 }, { "epoch": 0.9490022172949002, "grad_norm": 2.5355121667398604, "learning_rate": 1.3569650956682944e-07, "loss": 0.7047, "step": 5778 }, { "epoch": 0.9491664613615833, "grad_norm": 0.5407819984718727, "learning_rate": 1.34824429636371e-07, "loss": 0.2835, "step": 5779 }, { "epoch": 0.9493307054282664, "grad_norm": 2.2452636858215578, "learning_rate": 1.3395514202310443e-07, "loss": 0.6882, "step": 5780 }, { "epoch": 0.9494949494949495, "grad_norm": 1.5047888967947334, "learning_rate": 1.3308864697307745e-07, "loss": 0.6824, "step": 5781 }, { "epoch": 0.9496591935616325, "grad_norm": 1.495456844826937, "learning_rate": 1.3222494473155156e-07, "loss": 0.6938, "step": 5782 }, { "epoch": 0.9498234376283157, "grad_norm": 2.1219108535263516, "learning_rate": 1.313640355429946e-07, "loss": 0.7449, "step": 5783 }, { "epoch": 0.9499876816949988, "grad_norm": 0.5910208803475827, "learning_rate": 1.3050591965108605e-07, "loss": 0.3111, "step": 5784 }, { "epoch": 0.9501519257616818, "grad_norm": 2.373872247513203, "learning_rate": 1.2965059729871275e-07, "loss": 0.6837, "step": 5785 }, { "epoch": 0.950316169828365, "grad_norm": 2.1152389111814576, "learning_rate": 1.287980687279722e-07, "loss": 0.7371, "step": 5786 }, { "epoch": 0.950480413895048, "grad_norm": 2.4420796539057745, "learning_rate": 1.2794833418017237e-07, "loss": 0.7274, "step": 5787 }, { "epoch": 0.9506446579617311, "grad_norm": 1.6039932247996658, "learning_rate": 1.2710139389582654e-07, "loss": 0.722, "step": 5788 }, { "epoch": 0.9508089020284142, "grad_norm": 1.7905553187069425, "learning_rate": 1.2625724811466067e-07, "loss": 0.8127, "step": 5789 }, { "epoch": 0.9509731460950973, "grad_norm": 2.1928409957440254, "learning_rate": 1.254158970756092e-07, "loss": 0.7779, "step": 5790 }, { "epoch": 0.9511373901617804, "grad_norm": 1.694198495737204, "learning_rate": 1.2457734101681274e-07, "loss": 0.7639, "step": 5791 }, { "epoch": 0.9513016342284635, "grad_norm": 2.2207937641620377, "learning_rate": 1.2374158017562366e-07, "loss": 0.7581, "step": 5792 }, { "epoch": 0.9514658782951466, "grad_norm": 2.6665264315851935, "learning_rate": 1.2290861478860272e-07, "loss": 0.7874, "step": 5793 }, { "epoch": 0.9516301223618296, "grad_norm": 2.576743232901239, "learning_rate": 1.2207844509151912e-07, "loss": 0.8292, "step": 5794 }, { "epoch": 0.9517943664285128, "grad_norm": 1.818791708077242, "learning_rate": 1.2125107131935042e-07, "loss": 0.6906, "step": 5795 }, { "epoch": 0.9519586104951958, "grad_norm": 1.5033060893850763, "learning_rate": 1.2042649370628268e-07, "loss": 0.7278, "step": 5796 }, { "epoch": 0.952122854561879, "grad_norm": 1.6280245193546858, "learning_rate": 1.1960471248571138e-07, "loss": 0.7709, "step": 5797 }, { "epoch": 0.9522870986285621, "grad_norm": 1.758337576268224, "learning_rate": 1.1878572789023935e-07, "loss": 0.7286, "step": 5798 }, { "epoch": 0.9524513426952451, "grad_norm": 2.160558419211039, "learning_rate": 1.1796954015167895e-07, "loss": 0.7477, "step": 5799 }, { "epoch": 0.9526155867619283, "grad_norm": 1.577635608477728, "learning_rate": 1.1715614950104981e-07, "loss": 0.7798, "step": 5800 }, { "epoch": 0.9527798308286113, "grad_norm": 1.7660429558112993, "learning_rate": 1.1634555616858112e-07, "loss": 0.768, "step": 5801 }, { "epoch": 0.9529440748952944, "grad_norm": 0.6337040156445067, "learning_rate": 1.1553776038370934e-07, "loss": 0.33, "step": 5802 }, { "epoch": 0.9531083189619775, "grad_norm": 1.7741553548829379, "learning_rate": 1.1473276237507935e-07, "loss": 0.8237, "step": 5803 }, { "epoch": 0.9532725630286606, "grad_norm": 1.8632293799612223, "learning_rate": 1.1393056237054445e-07, "loss": 0.7479, "step": 5804 }, { "epoch": 0.9534368070953437, "grad_norm": 14.517562193906485, "learning_rate": 1.1313116059716522e-07, "loss": 0.7594, "step": 5805 }, { "epoch": 0.9536010511620268, "grad_norm": 1.996372930941686, "learning_rate": 1.1233455728121179e-07, "loss": 0.8467, "step": 5806 }, { "epoch": 0.9537652952287099, "grad_norm": 2.038217480713406, "learning_rate": 1.1154075264815822e-07, "loss": 0.6809, "step": 5807 }, { "epoch": 0.9539295392953929, "grad_norm": 1.858835186255343, "learning_rate": 1.1074974692269258e-07, "loss": 0.7535, "step": 5808 }, { "epoch": 0.9540937833620761, "grad_norm": 0.5659238425894028, "learning_rate": 1.0996154032870576e-07, "loss": 0.3357, "step": 5809 }, { "epoch": 0.9542580274287591, "grad_norm": 2.584554648672543, "learning_rate": 1.091761330892982e-07, "loss": 0.7511, "step": 5810 }, { "epoch": 0.9544222714954422, "grad_norm": 1.8426420905482168, "learning_rate": 1.0839352542677761e-07, "loss": 0.8032, "step": 5811 }, { "epoch": 0.9545865155621254, "grad_norm": 0.6019899940140203, "learning_rate": 1.0761371756265903e-07, "loss": 0.3145, "step": 5812 }, { "epoch": 0.9547507596288084, "grad_norm": 1.5640076267299614, "learning_rate": 1.068367097176659e-07, "loss": 0.8116, "step": 5813 }, { "epoch": 0.9549150036954915, "grad_norm": 1.707557891341736, "learning_rate": 1.0606250211172897e-07, "loss": 0.7789, "step": 5814 }, { "epoch": 0.9550792477621746, "grad_norm": 1.8252009198801806, "learning_rate": 1.0529109496398626e-07, "loss": 0.7286, "step": 5815 }, { "epoch": 0.9552434918288577, "grad_norm": 1.7932829778959158, "learning_rate": 1.0452248849278201e-07, "loss": 0.7353, "step": 5816 }, { "epoch": 0.9554077358955407, "grad_norm": 1.908270174765108, "learning_rate": 1.0375668291566998e-07, "loss": 0.7477, "step": 5817 }, { "epoch": 0.9555719799622239, "grad_norm": 2.6638345877359355, "learning_rate": 1.02993678449409e-07, "loss": 0.6609, "step": 5818 }, { "epoch": 0.955736224028907, "grad_norm": 1.7672037466081587, "learning_rate": 1.022334753099652e-07, "loss": 0.8423, "step": 5819 }, { "epoch": 0.95590046809559, "grad_norm": 1.6933769665678253, "learning_rate": 1.0147607371251312e-07, "loss": 0.8001, "step": 5820 }, { "epoch": 0.9560647121622732, "grad_norm": 2.096215481130546, "learning_rate": 1.0072147387143349e-07, "loss": 0.6522, "step": 5821 }, { "epoch": 0.9562289562289562, "grad_norm": 1.9976897400929698, "learning_rate": 9.996967600031437e-08, "loss": 0.7629, "step": 5822 }, { "epoch": 0.9563932002956393, "grad_norm": 2.1256770214890275, "learning_rate": 9.92206803119511e-08, "loss": 0.7347, "step": 5823 }, { "epoch": 0.9565574443623224, "grad_norm": 1.9534729030938447, "learning_rate": 9.847448701834405e-08, "loss": 0.7321, "step": 5824 }, { "epoch": 0.9567216884290055, "grad_norm": 1.6051844983020216, "learning_rate": 9.773109633070322e-08, "loss": 0.6346, "step": 5825 }, { "epoch": 0.9568859324956885, "grad_norm": 1.6598704034507288, "learning_rate": 9.699050845944357e-08, "loss": 0.7618, "step": 5826 }, { "epoch": 0.9570501765623717, "grad_norm": 0.5702928227706168, "learning_rate": 9.625272361418525e-08, "loss": 0.2792, "step": 5827 }, { "epoch": 0.9572144206290548, "grad_norm": 1.9125565308151282, "learning_rate": 9.551774200375896e-08, "loss": 0.7346, "step": 5828 }, { "epoch": 0.9573786646957378, "grad_norm": 1.436610713934142, "learning_rate": 9.478556383619719e-08, "loss": 0.8264, "step": 5829 }, { "epoch": 0.957542908762421, "grad_norm": 1.9360502937220672, "learning_rate": 9.405618931874417e-08, "loss": 0.762, "step": 5830 }, { "epoch": 0.957707152829104, "grad_norm": 1.8718344592173217, "learning_rate": 9.332961865784584e-08, "loss": 0.7059, "step": 5831 }, { "epoch": 0.9578713968957872, "grad_norm": 1.8052831066402413, "learning_rate": 9.260585205915773e-08, "loss": 0.8193, "step": 5832 }, { "epoch": 0.9580356409624702, "grad_norm": 1.498315399503796, "learning_rate": 9.188488972753928e-08, "loss": 0.7123, "step": 5833 }, { "epoch": 0.9581998850291533, "grad_norm": 1.9627443791807344, "learning_rate": 9.116673186705838e-08, "loss": 0.7294, "step": 5834 }, { "epoch": 0.9583641290958365, "grad_norm": 2.0161678025879586, "learning_rate": 9.045137868098686e-08, "loss": 0.7505, "step": 5835 }, { "epoch": 0.9585283731625195, "grad_norm": 2.4272430588238283, "learning_rate": 8.973883037180609e-08, "loss": 0.72, "step": 5836 }, { "epoch": 0.9586926172292026, "grad_norm": 0.6249175873765059, "learning_rate": 8.902908714119918e-08, "loss": 0.2688, "step": 5837 }, { "epoch": 0.9588568612958857, "grad_norm": 3.1975080412735317, "learning_rate": 8.832214919005877e-08, "loss": 0.8005, "step": 5838 }, { "epoch": 0.9590211053625688, "grad_norm": 2.2043970238839146, "learning_rate": 8.761801671848036e-08, "loss": 0.78, "step": 5839 }, { "epoch": 0.9591853494292518, "grad_norm": 2.0101814059687926, "learning_rate": 8.691668992577007e-08, "loss": 0.6953, "step": 5840 }, { "epoch": 0.959349593495935, "grad_norm": 1.7914477159855149, "learning_rate": 8.621816901043578e-08, "loss": 0.7686, "step": 5841 }, { "epoch": 0.9595138375626181, "grad_norm": 1.3141807480286103, "learning_rate": 8.552245417019045e-08, "loss": 0.774, "step": 5842 }, { "epoch": 0.9596780816293011, "grad_norm": 1.6246638537818063, "learning_rate": 8.482954560195655e-08, "loss": 0.6903, "step": 5843 }, { "epoch": 0.9598423256959843, "grad_norm": 2.1759904585121497, "learning_rate": 8.413944350186054e-08, "loss": 0.7812, "step": 5844 }, { "epoch": 0.9600065697626673, "grad_norm": 1.4544794527055536, "learning_rate": 8.345214806523394e-08, "loss": 0.7641, "step": 5845 }, { "epoch": 0.9601708138293504, "grad_norm": 1.6068966840194336, "learning_rate": 8.276765948661447e-08, "loss": 0.7128, "step": 5846 }, { "epoch": 0.9603350578960335, "grad_norm": 1.7186680595444375, "learning_rate": 8.208597795974382e-08, "loss": 0.7135, "step": 5847 }, { "epoch": 0.9604993019627166, "grad_norm": 1.655738576825443, "learning_rate": 8.14071036775721e-08, "loss": 0.6875, "step": 5848 }, { "epoch": 0.9606635460293997, "grad_norm": 1.6841650134826145, "learning_rate": 8.073103683225337e-08, "loss": 0.6479, "step": 5849 }, { "epoch": 0.9608277900960828, "grad_norm": 2.628759817240681, "learning_rate": 8.005777761514677e-08, "loss": 0.7562, "step": 5850 }, { "epoch": 0.9609920341627659, "grad_norm": 1.5447103992509097, "learning_rate": 7.938732621681545e-08, "loss": 0.7207, "step": 5851 }, { "epoch": 0.9611562782294489, "grad_norm": 1.8463736051430495, "learning_rate": 7.87196828270309e-08, "loss": 0.7786, "step": 5852 }, { "epoch": 0.9613205222961321, "grad_norm": 2.159651206847283, "learning_rate": 7.805484763476756e-08, "loss": 0.703, "step": 5853 }, { "epoch": 0.9614847663628151, "grad_norm": 1.6126088136443437, "learning_rate": 7.739282082820488e-08, "loss": 0.7737, "step": 5854 }, { "epoch": 0.9616490104294982, "grad_norm": 0.6062716114821882, "learning_rate": 7.673360259472962e-08, "loss": 0.3443, "step": 5855 }, { "epoch": 0.9618132544961814, "grad_norm": 4.247654846553943, "learning_rate": 7.607719312093142e-08, "loss": 0.7306, "step": 5856 }, { "epoch": 0.9619774985628644, "grad_norm": 1.6555509223926668, "learning_rate": 7.542359259260612e-08, "loss": 0.7972, "step": 5857 }, { "epoch": 0.9621417426295475, "grad_norm": 1.9770025549802077, "learning_rate": 7.477280119475239e-08, "loss": 0.7564, "step": 5858 }, { "epoch": 0.9623059866962306, "grad_norm": 2.324377288985533, "learning_rate": 7.412481911157732e-08, "loss": 0.7884, "step": 5859 }, { "epoch": 0.9624702307629137, "grad_norm": 1.689833229075792, "learning_rate": 7.347964652648976e-08, "loss": 0.7518, "step": 5860 }, { "epoch": 0.9626344748295967, "grad_norm": 6.703307155682424, "learning_rate": 7.283728362210474e-08, "loss": 0.6182, "step": 5861 }, { "epoch": 0.9627987188962799, "grad_norm": 1.9435016615040874, "learning_rate": 7.219773058024127e-08, "loss": 0.7828, "step": 5862 }, { "epoch": 0.9629629629629629, "grad_norm": 1.5643960625689846, "learning_rate": 7.156098758192453e-08, "loss": 0.7698, "step": 5863 }, { "epoch": 0.963127207029646, "grad_norm": 0.6018431156566193, "learning_rate": 7.09270548073826e-08, "loss": 0.3173, "step": 5864 }, { "epoch": 0.9632914510963292, "grad_norm": 2.2076034890649767, "learning_rate": 7.029593243604859e-08, "loss": 0.7451, "step": 5865 }, { "epoch": 0.9634556951630122, "grad_norm": 1.7969586429304918, "learning_rate": 6.966762064656075e-08, "loss": 0.7517, "step": 5866 }, { "epoch": 0.9636199392296954, "grad_norm": 1.571906250749098, "learning_rate": 6.904211961676122e-08, "loss": 0.7538, "step": 5867 }, { "epoch": 0.9637841832963784, "grad_norm": 1.6312633114647805, "learning_rate": 6.841942952369618e-08, "loss": 0.7521, "step": 5868 }, { "epoch": 0.9639484273630615, "grad_norm": 2.4297851405961417, "learning_rate": 6.779955054361797e-08, "loss": 0.759, "step": 5869 }, { "epoch": 0.9641126714297445, "grad_norm": 1.5220345224325804, "learning_rate": 6.718248285198182e-08, "loss": 0.6611, "step": 5870 }, { "epoch": 0.9642769154964277, "grad_norm": 1.6409862257961354, "learning_rate": 6.656822662344686e-08, "loss": 0.7765, "step": 5871 }, { "epoch": 0.9644411595631108, "grad_norm": 1.531870436242808, "learning_rate": 6.595678203187739e-08, "loss": 0.7851, "step": 5872 }, { "epoch": 0.9646054036297939, "grad_norm": 1.9976173093105203, "learning_rate": 6.53481492503405e-08, "loss": 0.7476, "step": 5873 }, { "epoch": 0.964769647696477, "grad_norm": 1.7032059853784902, "learning_rate": 6.474232845111062e-08, "loss": 0.715, "step": 5874 }, { "epoch": 0.96493389176316, "grad_norm": 1.8432921773298425, "learning_rate": 6.413931980566168e-08, "loss": 0.6975, "step": 5875 }, { "epoch": 0.9650981358298432, "grad_norm": 1.9910358422406678, "learning_rate": 6.353912348467606e-08, "loss": 0.7314, "step": 5876 }, { "epoch": 0.9652623798965262, "grad_norm": 1.9126153529240983, "learning_rate": 6.294173965803563e-08, "loss": 0.7138, "step": 5877 }, { "epoch": 0.9654266239632093, "grad_norm": 1.6393511726556544, "learning_rate": 6.23471684948318e-08, "loss": 0.7291, "step": 5878 }, { "epoch": 0.9655908680298925, "grad_norm": 2.491680204232961, "learning_rate": 6.175541016335329e-08, "loss": 0.733, "step": 5879 }, { "epoch": 0.9657551120965755, "grad_norm": 1.9059675992797274, "learning_rate": 6.116646483109945e-08, "loss": 0.7314, "step": 5880 }, { "epoch": 0.9659193561632586, "grad_norm": 1.6970106820544393, "learning_rate": 6.058033266476693e-08, "loss": 0.6991, "step": 5881 }, { "epoch": 0.9660836002299417, "grad_norm": 1.6213475170355052, "learning_rate": 5.99970138302608e-08, "loss": 0.6893, "step": 5882 }, { "epoch": 0.9662478442966248, "grad_norm": 2.1027329654641145, "learning_rate": 5.9416508492688986e-08, "loss": 0.6998, "step": 5883 }, { "epoch": 0.9664120883633078, "grad_norm": 2.297047901732733, "learning_rate": 5.8838816816360054e-08, "loss": 0.6934, "step": 5884 }, { "epoch": 0.966576332429991, "grad_norm": 1.6460181802696514, "learning_rate": 5.826393896478988e-08, "loss": 0.7678, "step": 5885 }, { "epoch": 0.9667405764966741, "grad_norm": 3.12025100883052, "learning_rate": 5.7691875100697184e-08, "loss": 0.6517, "step": 5886 }, { "epoch": 0.9669048205633571, "grad_norm": 1.7462894530007251, "learning_rate": 5.712262538600133e-08, "loss": 0.7286, "step": 5887 }, { "epoch": 0.9670690646300403, "grad_norm": 1.9730902127239494, "learning_rate": 5.655618998182899e-08, "loss": 0.7624, "step": 5888 }, { "epoch": 0.9672333086967233, "grad_norm": 1.4282779064414526, "learning_rate": 5.5992569048507475e-08, "loss": 0.7546, "step": 5889 }, { "epoch": 0.9673975527634064, "grad_norm": 2.739219176317298, "learning_rate": 5.543176274556916e-08, "loss": 0.6756, "step": 5890 }, { "epoch": 0.9675617968300895, "grad_norm": 1.614668581167636, "learning_rate": 5.487377123174931e-08, "loss": 0.6749, "step": 5891 }, { "epoch": 0.9677260408967726, "grad_norm": 1.8166154645563297, "learning_rate": 5.43185946649849e-08, "loss": 0.7543, "step": 5892 }, { "epoch": 0.9678902849634557, "grad_norm": 1.9737686176123825, "learning_rate": 5.376623320241914e-08, "loss": 0.6898, "step": 5893 }, { "epoch": 0.9680545290301388, "grad_norm": 2.30303144281502, "learning_rate": 5.321668700039584e-08, "loss": 0.7269, "step": 5894 }, { "epoch": 0.9682187730968219, "grad_norm": 2.1739786003838453, "learning_rate": 5.26699562144628e-08, "loss": 0.7208, "step": 5895 }, { "epoch": 0.9683830171635049, "grad_norm": 2.7732431199794387, "learning_rate": 5.2126040999371796e-08, "loss": 0.7957, "step": 5896 }, { "epoch": 0.9685472612301881, "grad_norm": 2.2891139570642296, "learning_rate": 5.1584941509075225e-08, "loss": 0.7299, "step": 5897 }, { "epoch": 0.9687115052968711, "grad_norm": 1.9049713549787235, "learning_rate": 5.10466578967328e-08, "loss": 0.7001, "step": 5898 }, { "epoch": 0.9688757493635543, "grad_norm": 1.6009512935103198, "learning_rate": 5.051119031470264e-08, "loss": 0.7131, "step": 5899 }, { "epoch": 0.9690399934302373, "grad_norm": 1.916434329983574, "learning_rate": 4.997853891454796e-08, "loss": 0.7252, "step": 5900 }, { "epoch": 0.9692042374969204, "grad_norm": 1.6595829028484148, "learning_rate": 4.9448703847034816e-08, "loss": 0.7054, "step": 5901 }, { "epoch": 0.9693684815636036, "grad_norm": 1.5095428006512082, "learning_rate": 4.892168526213215e-08, "loss": 0.6869, "step": 5902 }, { "epoch": 0.9695327256302866, "grad_norm": 4.753296388213047, "learning_rate": 4.8397483309011726e-08, "loss": 0.7453, "step": 5903 }, { "epoch": 0.9696969696969697, "grad_norm": 1.7803525682509445, "learning_rate": 4.78760981360471e-08, "loss": 0.8222, "step": 5904 }, { "epoch": 0.9698612137636528, "grad_norm": 3.296419621033837, "learning_rate": 4.735752989081577e-08, "loss": 0.7574, "step": 5905 }, { "epoch": 0.9700254578303359, "grad_norm": 2.712783827103167, "learning_rate": 4.6841778720095874e-08, "loss": 0.6869, "step": 5906 }, { "epoch": 0.9701897018970189, "grad_norm": 1.5991094946525437, "learning_rate": 4.632884476987176e-08, "loss": 0.7172, "step": 5907 }, { "epoch": 0.9703539459637021, "grad_norm": 2.527018918727077, "learning_rate": 4.581872818532729e-08, "loss": 0.6476, "step": 5908 }, { "epoch": 0.9705181900303852, "grad_norm": 1.798459366054547, "learning_rate": 4.531142911085029e-08, "loss": 0.7509, "step": 5909 }, { "epoch": 0.9706824340970682, "grad_norm": 2.134017333544006, "learning_rate": 4.480694769003035e-08, "loss": 0.755, "step": 5910 }, { "epoch": 0.9708466781637514, "grad_norm": 1.6291695694228414, "learning_rate": 4.4305284065659926e-08, "loss": 0.6834, "step": 5911 }, { "epoch": 0.9710109222304344, "grad_norm": 1.4203393394963142, "learning_rate": 4.38064383797332e-08, "loss": 0.8237, "step": 5912 }, { "epoch": 0.9711751662971175, "grad_norm": 1.5960312476674994, "learning_rate": 4.331041077344944e-08, "loss": 0.7807, "step": 5913 }, { "epoch": 0.9713394103638006, "grad_norm": 1.5995280333964972, "learning_rate": 4.281720138720524e-08, "loss": 0.6692, "step": 5914 }, { "epoch": 0.9715036544304837, "grad_norm": 1.7713757583744543, "learning_rate": 4.2326810360605594e-08, "loss": 0.6553, "step": 5915 }, { "epoch": 0.9716678984971668, "grad_norm": 1.7968344785982144, "learning_rate": 4.1839237832452805e-08, "loss": 0.7447, "step": 5916 }, { "epoch": 0.9718321425638499, "grad_norm": 0.5980026596732259, "learning_rate": 4.1354483940753145e-08, "loss": 0.3155, "step": 5917 }, { "epoch": 0.971996386630533, "grad_norm": 0.6077835447925313, "learning_rate": 4.087254882271574e-08, "loss": 0.3181, "step": 5918 }, { "epoch": 0.972160630697216, "grad_norm": 0.5843690831388844, "learning_rate": 4.039343261475148e-08, "loss": 0.3251, "step": 5919 }, { "epoch": 0.9723248747638992, "grad_norm": 1.9866114291440553, "learning_rate": 3.9917135452473e-08, "loss": 0.7131, "step": 5920 }, { "epoch": 0.9724891188305822, "grad_norm": 2.344349099478787, "learning_rate": 3.944365747069467e-08, "loss": 0.7469, "step": 5921 }, { "epoch": 0.9726533628972653, "grad_norm": 1.9384697883205624, "learning_rate": 3.897299880343486e-08, "loss": 0.7212, "step": 5922 }, { "epoch": 0.9728176069639485, "grad_norm": 2.1648918521867366, "learning_rate": 3.8505159583911434e-08, "loss": 0.7552, "step": 5923 }, { "epoch": 0.9729818510306315, "grad_norm": 1.9753726169730543, "learning_rate": 3.8040139944545143e-08, "loss": 0.6546, "step": 5924 }, { "epoch": 0.9731460950973146, "grad_norm": 1.6920522620047078, "learning_rate": 3.757794001695847e-08, "loss": 0.7719, "step": 5925 }, { "epoch": 0.9733103391639977, "grad_norm": 4.754103436830869, "learning_rate": 3.711855993197788e-08, "loss": 0.6873, "step": 5926 }, { "epoch": 0.9734745832306808, "grad_norm": 2.8600837170744415, "learning_rate": 3.666199981962826e-08, "loss": 0.7149, "step": 5927 }, { "epoch": 0.9736388272973638, "grad_norm": 1.641143035324519, "learning_rate": 3.6208259809139554e-08, "loss": 0.6608, "step": 5928 }, { "epoch": 0.973803071364047, "grad_norm": 1.597357811612621, "learning_rate": 3.575734002893905e-08, "loss": 0.8241, "step": 5929 }, { "epoch": 0.9739673154307301, "grad_norm": 1.718788880568816, "learning_rate": 3.5309240606662406e-08, "loss": 0.7606, "step": 5930 }, { "epoch": 0.9741315594974131, "grad_norm": 2.265857143341277, "learning_rate": 3.486396166914041e-08, "loss": 0.7713, "step": 5931 }, { "epoch": 0.9742958035640963, "grad_norm": 1.9072785207313159, "learning_rate": 3.44215033424089e-08, "loss": 0.759, "step": 5932 }, { "epoch": 0.9744600476307793, "grad_norm": 1.6397139774413652, "learning_rate": 3.3981865751705477e-08, "loss": 0.7242, "step": 5933 }, { "epoch": 0.9746242916974625, "grad_norm": 1.8927723838202755, "learning_rate": 3.354504902146838e-08, "loss": 0.7316, "step": 5934 }, { "epoch": 0.9747885357641455, "grad_norm": 4.383752393521027, "learning_rate": 3.31110532753387e-08, "loss": 0.682, "step": 5935 }, { "epoch": 0.9749527798308286, "grad_norm": 1.4802890988899136, "learning_rate": 3.267987863615485e-08, "loss": 0.7894, "step": 5936 }, { "epoch": 0.9751170238975116, "grad_norm": 2.1269574046155926, "learning_rate": 3.225152522596364e-08, "loss": 0.6813, "step": 5937 }, { "epoch": 0.9752812679641948, "grad_norm": 1.7209435323274371, "learning_rate": 3.182599316600699e-08, "loss": 0.8149, "step": 5938 }, { "epoch": 0.9754455120308779, "grad_norm": 0.6092128484184884, "learning_rate": 3.140328257673187e-08, "loss": 0.3413, "step": 5939 }, { "epoch": 0.975609756097561, "grad_norm": 1.8620599103988904, "learning_rate": 3.0983393577785906e-08, "loss": 0.7554, "step": 5940 }, { "epoch": 0.9757740001642441, "grad_norm": 1.9352685811410872, "learning_rate": 3.056632628801737e-08, "loss": 0.6866, "step": 5941 }, { "epoch": 0.9759382442309271, "grad_norm": 1.8524248001496268, "learning_rate": 3.015208082547627e-08, "loss": 0.7762, "step": 5942 }, { "epoch": 0.9761024882976103, "grad_norm": 1.8865278644153123, "learning_rate": 2.974065730741327e-08, "loss": 0.7124, "step": 5943 }, { "epoch": 0.9762667323642933, "grad_norm": 2.131152155425733, "learning_rate": 2.933205585028298e-08, "loss": 0.7771, "step": 5944 }, { "epoch": 0.9764309764309764, "grad_norm": 1.5693952627580467, "learning_rate": 2.8926276569736233e-08, "loss": 0.6884, "step": 5945 }, { "epoch": 0.9765952204976596, "grad_norm": 2.1528512336184016, "learning_rate": 2.852331958063115e-08, "loss": 0.784, "step": 5946 }, { "epoch": 0.9767594645643426, "grad_norm": 2.420681682080814, "learning_rate": 2.8123184997022046e-08, "loss": 0.7309, "step": 5947 }, { "epoch": 0.9769237086310257, "grad_norm": 0.6370308931438026, "learning_rate": 2.772587293216611e-08, "loss": 0.3178, "step": 5948 }, { "epoch": 0.9770879526977088, "grad_norm": 1.8419048076390396, "learning_rate": 2.7331383498522268e-08, "loss": 0.7546, "step": 5949 }, { "epoch": 0.9772521967643919, "grad_norm": 0.611459070655832, "learning_rate": 2.6939716807748982e-08, "loss": 0.288, "step": 5950 }, { "epoch": 0.9774164408310749, "grad_norm": 3.6841957726878847, "learning_rate": 2.6550872970707576e-08, "loss": 0.6957, "step": 5951 }, { "epoch": 0.9775806848977581, "grad_norm": 1.998282260086381, "learning_rate": 2.6164852097460003e-08, "loss": 0.8342, "step": 5952 }, { "epoch": 0.9777449289644412, "grad_norm": 2.2100105613377066, "learning_rate": 2.5781654297267756e-08, "loss": 0.8355, "step": 5953 }, { "epoch": 0.9779091730311242, "grad_norm": 1.58250226193762, "learning_rate": 2.540127967859407e-08, "loss": 0.6838, "step": 5954 }, { "epoch": 0.9780734170978074, "grad_norm": 2.6329483648238745, "learning_rate": 2.5023728349102828e-08, "loss": 0.7366, "step": 5955 }, { "epoch": 0.9782376611644904, "grad_norm": 0.5510856074412529, "learning_rate": 2.464900041566076e-08, "loss": 0.3237, "step": 5956 }, { "epoch": 0.9784019052311735, "grad_norm": 1.8028564193115277, "learning_rate": 2.427709598433303e-08, "loss": 0.6752, "step": 5957 }, { "epoch": 0.9785661492978566, "grad_norm": 1.7242181739146218, "learning_rate": 2.3908015160385433e-08, "loss": 0.7533, "step": 5958 }, { "epoch": 0.9787303933645397, "grad_norm": 1.4291361257165491, "learning_rate": 2.3541758048286624e-08, "loss": 0.7802, "step": 5959 }, { "epoch": 0.9788946374312228, "grad_norm": 1.7564518267949492, "learning_rate": 2.3178324751704784e-08, "loss": 0.6946, "step": 5960 }, { "epoch": 0.9790588814979059, "grad_norm": 2.693277489543369, "learning_rate": 2.2817715373507633e-08, "loss": 0.6621, "step": 5961 }, { "epoch": 0.979223125564589, "grad_norm": 1.6458966333068978, "learning_rate": 2.245993001576685e-08, "loss": 0.7632, "step": 5962 }, { "epoch": 0.979387369631272, "grad_norm": 1.7165198315085828, "learning_rate": 2.2104968779752546e-08, "loss": 0.7375, "step": 5963 }, { "epoch": 0.9795516136979552, "grad_norm": 1.6952024410101436, "learning_rate": 2.1752831765934346e-08, "loss": 0.7028, "step": 5964 }, { "epoch": 0.9797158577646382, "grad_norm": 2.0558371449724233, "learning_rate": 2.1403519073984748e-08, "loss": 0.6856, "step": 5965 }, { "epoch": 0.9798801018313213, "grad_norm": 3.174478250046696, "learning_rate": 2.105703080277688e-08, "loss": 0.7425, "step": 5966 }, { "epoch": 0.9800443458980045, "grad_norm": 1.7831772222288895, "learning_rate": 2.0713367050381185e-08, "loss": 0.6942, "step": 5967 }, { "epoch": 0.9802085899646875, "grad_norm": 1.6896599299604218, "learning_rate": 2.037252791407318e-08, "loss": 0.7319, "step": 5968 }, { "epoch": 0.9803728340313707, "grad_norm": 1.6853212609165653, "learning_rate": 2.0034513490325702e-08, "loss": 0.7269, "step": 5969 }, { "epoch": 0.9805370780980537, "grad_norm": 1.4922317848931024, "learning_rate": 1.9699323874814437e-08, "loss": 0.7554, "step": 5970 }, { "epoch": 0.9807013221647368, "grad_norm": 2.029439349722147, "learning_rate": 1.936695916241127e-08, "loss": 0.6757, "step": 5971 }, { "epoch": 0.9808655662314198, "grad_norm": 2.2587639955181906, "learning_rate": 1.9037419447193174e-08, "loss": 0.77, "step": 5972 }, { "epoch": 0.981029810298103, "grad_norm": 1.9581251663708619, "learning_rate": 1.8710704822435534e-08, "loss": 0.7626, "step": 5973 }, { "epoch": 0.981194054364786, "grad_norm": 1.678127350608195, "learning_rate": 1.8386815380614376e-08, "loss": 0.7045, "step": 5974 }, { "epoch": 0.9813582984314692, "grad_norm": 1.7347269029380228, "learning_rate": 1.8065751213405257e-08, "loss": 0.7629, "step": 5975 }, { "epoch": 0.9815225424981523, "grad_norm": 0.6223171763926851, "learning_rate": 1.7747512411685486e-08, "loss": 0.3113, "step": 5976 }, { "epoch": 0.9816867865648353, "grad_norm": 1.7762903601496571, "learning_rate": 1.7432099065531894e-08, "loss": 0.7227, "step": 5977 }, { "epoch": 0.9818510306315185, "grad_norm": 1.659973625609309, "learning_rate": 1.7119511264220844e-08, "loss": 0.7154, "step": 5978 }, { "epoch": 0.9820152746982015, "grad_norm": 1.7271466189614513, "learning_rate": 1.6809749096229344e-08, "loss": 0.7906, "step": 5979 }, { "epoch": 0.9821795187648846, "grad_norm": 1.8092706147339324, "learning_rate": 1.6502812649236145e-08, "loss": 0.7684, "step": 5980 }, { "epoch": 0.9823437628315677, "grad_norm": 12.093808075682073, "learning_rate": 1.6198702010118417e-08, "loss": 0.7356, "step": 5981 }, { "epoch": 0.9825080068982508, "grad_norm": 1.761578288768156, "learning_rate": 1.5897417264953974e-08, "loss": 0.7616, "step": 5982 }, { "epoch": 0.9826722509649339, "grad_norm": 1.6715826961921216, "learning_rate": 1.5598958499021265e-08, "loss": 0.8228, "step": 5983 }, { "epoch": 0.982836495031617, "grad_norm": 1.9965352057596626, "learning_rate": 1.5303325796797163e-08, "loss": 0.7437, "step": 5984 }, { "epoch": 0.9830007390983001, "grad_norm": 1.6958948892436938, "learning_rate": 1.5010519241961397e-08, "loss": 0.7689, "step": 5985 }, { "epoch": 0.9831649831649831, "grad_norm": 1.5715122499013288, "learning_rate": 1.4720538917391003e-08, "loss": 0.7206, "step": 5986 }, { "epoch": 0.9833292272316663, "grad_norm": 1.5634440320712286, "learning_rate": 1.4433384905164771e-08, "loss": 0.8018, "step": 5987 }, { "epoch": 0.9834934712983493, "grad_norm": 1.7755695061674797, "learning_rate": 1.4149057286562128e-08, "loss": 0.757, "step": 5988 }, { "epoch": 0.9836577153650324, "grad_norm": 1.7616389387113298, "learning_rate": 1.3867556142059813e-08, "loss": 0.715, "step": 5989 }, { "epoch": 0.9838219594317156, "grad_norm": 1.7736728925760747, "learning_rate": 1.3588881551337418e-08, "loss": 0.7334, "step": 5990 }, { "epoch": 0.9839862034983986, "grad_norm": 1.714033916299993, "learning_rate": 1.331303359327185e-08, "loss": 0.6973, "step": 5991 }, { "epoch": 0.9841504475650817, "grad_norm": 1.7444296658001535, "learning_rate": 1.3040012345941765e-08, "loss": 0.7631, "step": 5992 }, { "epoch": 0.9843146916317648, "grad_norm": 1.6345450952333076, "learning_rate": 1.2769817886626456e-08, "loss": 0.7103, "step": 5993 }, { "epoch": 0.9844789356984479, "grad_norm": 1.6127821697375688, "learning_rate": 1.2502450291802526e-08, "loss": 0.6535, "step": 5994 }, { "epoch": 0.9846431797651309, "grad_norm": 1.605349160525358, "learning_rate": 1.2237909637147217e-08, "loss": 0.7455, "step": 5995 }, { "epoch": 0.9848074238318141, "grad_norm": 1.7907931699565796, "learning_rate": 1.1976195997540629e-08, "loss": 0.6827, "step": 5996 }, { "epoch": 0.9849716678984972, "grad_norm": 1.9794905623601249, "learning_rate": 1.1717309447057957e-08, "loss": 0.76, "step": 5997 }, { "epoch": 0.9851359119651802, "grad_norm": 2.481555389266571, "learning_rate": 1.1461250058977248e-08, "loss": 0.7781, "step": 5998 }, { "epoch": 0.9853001560318634, "grad_norm": 0.5972738657451857, "learning_rate": 1.120801790577608e-08, "loss": 0.3225, "step": 5999 }, { "epoch": 0.9854644000985464, "grad_norm": 2.3759449603937357, "learning_rate": 1.0957613059131567e-08, "loss": 0.7121, "step": 6000 }, { "epoch": 0.9856286441652296, "grad_norm": 2.577726819434461, "learning_rate": 1.0710035589918122e-08, "loss": 0.8114, "step": 6001 }, { "epoch": 0.9857928882319126, "grad_norm": 1.9141777949630798, "learning_rate": 1.0465285568214134e-08, "loss": 0.7881, "step": 6002 }, { "epoch": 0.9859571322985957, "grad_norm": 1.5827542681300673, "learning_rate": 1.02233630632953e-08, "loss": 0.7486, "step": 6003 }, { "epoch": 0.9861213763652789, "grad_norm": 1.8272426632499996, "learning_rate": 9.984268143636844e-09, "loss": 0.7135, "step": 6004 }, { "epoch": 0.9862856204319619, "grad_norm": 1.6964341201217814, "learning_rate": 9.74800087691352e-09, "loss": 0.7562, "step": 6005 }, { "epoch": 0.986449864498645, "grad_norm": 2.7017572812558437, "learning_rate": 9.514561330001836e-09, "loss": 0.7631, "step": 6006 }, { "epoch": 0.986614108565328, "grad_norm": 2.3433623134240373, "learning_rate": 9.283949568974493e-09, "loss": 0.6768, "step": 6007 }, { "epoch": 0.9867783526320112, "grad_norm": 1.8384297916226926, "learning_rate": 9.056165659107053e-09, "loss": 0.6833, "step": 6008 }, { "epoch": 0.9869425966986942, "grad_norm": 3.9215745190056035, "learning_rate": 8.831209664872386e-09, "loss": 0.6705, "step": 6009 }, { "epoch": 0.9871068407653774, "grad_norm": 1.3474360795666873, "learning_rate": 8.609081649945116e-09, "loss": 0.6837, "step": 6010 }, { "epoch": 0.9872710848320605, "grad_norm": 2.0892093363412862, "learning_rate": 8.389781677196062e-09, "loss": 0.7816, "step": 6011 }, { "epoch": 0.9874353288987435, "grad_norm": 1.5432320440481986, "learning_rate": 8.173309808700014e-09, "loss": 0.7343, "step": 6012 }, { "epoch": 0.9875995729654267, "grad_norm": 0.5849836338544309, "learning_rate": 7.959666105727959e-09, "loss": 0.3003, "step": 6013 }, { "epoch": 0.9877638170321097, "grad_norm": 1.9023050100909664, "learning_rate": 7.748850628749305e-09, "loss": 0.8106, "step": 6014 }, { "epoch": 0.9879280610987928, "grad_norm": 4.149400019534418, "learning_rate": 7.54086343743854e-09, "loss": 0.7583, "step": 6015 }, { "epoch": 0.9880923051654759, "grad_norm": 1.9001657930862068, "learning_rate": 7.335704590663017e-09, "loss": 0.7826, "step": 6016 }, { "epoch": 0.988256549232159, "grad_norm": 2.2123316298989053, "learning_rate": 7.1333741464940655e-09, "loss": 0.7802, "step": 6017 }, { "epoch": 0.988420793298842, "grad_norm": 3.5405786466991325, "learning_rate": 6.933872162199207e-09, "loss": 0.7477, "step": 6018 }, { "epoch": 0.9885850373655252, "grad_norm": 1.71069337424589, "learning_rate": 6.737198694249936e-09, "loss": 0.6787, "step": 6019 }, { "epoch": 0.9887492814322083, "grad_norm": 3.562293100283718, "learning_rate": 6.543353798311725e-09, "loss": 0.7469, "step": 6020 }, { "epoch": 0.9889135254988913, "grad_norm": 1.7709893655114375, "learning_rate": 6.352337529252905e-09, "loss": 0.7757, "step": 6021 }, { "epoch": 0.9890777695655745, "grad_norm": 8.410246475147122, "learning_rate": 6.164149941140229e-09, "loss": 0.7141, "step": 6022 }, { "epoch": 0.9892420136322575, "grad_norm": 1.5990277460613975, "learning_rate": 5.978791087239977e-09, "loss": 0.665, "step": 6023 }, { "epoch": 0.9894062576989406, "grad_norm": 2.2417410723751976, "learning_rate": 5.796261020016847e-09, "loss": 0.6849, "step": 6024 }, { "epoch": 0.9895705017656237, "grad_norm": 1.9158906957295134, "learning_rate": 5.616559791136178e-09, "loss": 0.7472, "step": 6025 }, { "epoch": 0.9897347458323068, "grad_norm": 2.5252442028284783, "learning_rate": 5.439687451461728e-09, "loss": 0.773, "step": 6026 }, { "epoch": 0.98989898989899, "grad_norm": 2.970167354480783, "learning_rate": 5.265644051057894e-09, "loss": 0.743, "step": 6027 }, { "epoch": 0.990063233965673, "grad_norm": 1.7617413840802336, "learning_rate": 5.0944296391863825e-09, "loss": 0.734, "step": 6028 }, { "epoch": 0.9902274780323561, "grad_norm": 3.2575753316874585, "learning_rate": 4.926044264308427e-09, "loss": 0.774, "step": 6029 }, { "epoch": 0.9903917220990391, "grad_norm": 2.0611114007345743, "learning_rate": 4.760487974085903e-09, "loss": 0.5832, "step": 6030 }, { "epoch": 0.9905559661657223, "grad_norm": 1.81757866344955, "learning_rate": 4.597760815379104e-09, "loss": 0.7486, "step": 6031 }, { "epoch": 0.9907202102324053, "grad_norm": 1.6936918497874824, "learning_rate": 4.437862834247852e-09, "loss": 0.6899, "step": 6032 }, { "epoch": 0.9908844542990884, "grad_norm": 1.7581070639029062, "learning_rate": 4.2807940759515e-09, "loss": 0.7713, "step": 6033 }, { "epoch": 0.9910486983657716, "grad_norm": 1.516989304027012, "learning_rate": 4.126554584946707e-09, "loss": 0.7616, "step": 6034 }, { "epoch": 0.9912129424324546, "grad_norm": 1.9739760681091674, "learning_rate": 3.975144404890774e-09, "loss": 0.6704, "step": 6035 }, { "epoch": 0.9913771864991378, "grad_norm": 1.9122611819276611, "learning_rate": 3.82656357864053e-09, "loss": 0.7326, "step": 6036 }, { "epoch": 0.9915414305658208, "grad_norm": 2.2486083277421174, "learning_rate": 3.680812148251223e-09, "loss": 0.7537, "step": 6037 }, { "epoch": 0.9917056746325039, "grad_norm": 1.4442108992682667, "learning_rate": 3.53789015497763e-09, "loss": 0.7329, "step": 6038 }, { "epoch": 0.991869918699187, "grad_norm": 1.9145356479482347, "learning_rate": 3.3977976392740587e-09, "loss": 0.7495, "step": 6039 }, { "epoch": 0.9920341627658701, "grad_norm": 1.7843019507555493, "learning_rate": 3.2605346407932336e-09, "loss": 0.7139, "step": 6040 }, { "epoch": 0.9921984068325532, "grad_norm": 1.7886001416192256, "learning_rate": 3.1261011983851898e-09, "loss": 0.6684, "step": 6041 }, { "epoch": 0.9923626508992363, "grad_norm": 1.7652457941453965, "learning_rate": 2.994497350103931e-09, "loss": 0.7146, "step": 6042 }, { "epoch": 0.9925268949659194, "grad_norm": 1.4469391317756994, "learning_rate": 2.8657231331985503e-09, "loss": 0.778, "step": 6043 }, { "epoch": 0.9926911390326024, "grad_norm": 3.132199194470952, "learning_rate": 2.7397785841176692e-09, "loss": 0.7798, "step": 6044 }, { "epoch": 0.9928553830992856, "grad_norm": 1.5759946563921021, "learning_rate": 2.6166637385094396e-09, "loss": 0.7076, "step": 6045 }, { "epoch": 0.9930196271659686, "grad_norm": 0.5999685108538079, "learning_rate": 2.496378631222651e-09, "loss": 0.3278, "step": 6046 }, { "epoch": 0.9931838712326517, "grad_norm": 1.5346278909089557, "learning_rate": 2.3789232963034036e-09, "loss": 0.7382, "step": 6047 }, { "epoch": 0.9933481152993349, "grad_norm": 1.655520686827253, "learning_rate": 2.264297766995105e-09, "loss": 0.7258, "step": 6048 }, { "epoch": 0.9935123593660179, "grad_norm": 0.5894074981932529, "learning_rate": 2.152502075745133e-09, "loss": 0.3266, "step": 6049 }, { "epoch": 0.993676603432701, "grad_norm": 1.9445077550548027, "learning_rate": 2.0435362541959545e-09, "loss": 0.6691, "step": 6050 }, { "epoch": 0.9938408474993841, "grad_norm": 5.3338090888817264, "learning_rate": 1.9374003331895654e-09, "loss": 0.7128, "step": 6051 }, { "epoch": 0.9940050915660672, "grad_norm": 2.0890097442380964, "learning_rate": 1.8340943427685997e-09, "loss": 0.7601, "step": 6052 }, { "epoch": 0.9941693356327502, "grad_norm": 1.622865346949036, "learning_rate": 1.7336183121730022e-09, "loss": 0.7175, "step": 6053 }, { "epoch": 0.9943335796994334, "grad_norm": 0.5463714975996664, "learning_rate": 1.635972269841135e-09, "loss": 0.3035, "step": 6054 }, { "epoch": 0.9944978237661164, "grad_norm": 1.9237069985034692, "learning_rate": 1.541156243413111e-09, "loss": 0.7147, "step": 6055 }, { "epoch": 0.9946620678327995, "grad_norm": 1.7751914009512313, "learning_rate": 1.4491702597263513e-09, "loss": 0.6956, "step": 6056 }, { "epoch": 0.9948263118994827, "grad_norm": 3.224784367667747, "learning_rate": 1.360014344816696e-09, "loss": 0.6706, "step": 6057 }, { "epoch": 0.9949905559661657, "grad_norm": 1.768644620580071, "learning_rate": 1.273688523919514e-09, "loss": 0.6999, "step": 6058 }, { "epoch": 0.9951548000328488, "grad_norm": 1.5196678305859956, "learning_rate": 1.1901928214685942e-09, "loss": 0.7495, "step": 6059 }, { "epoch": 0.9953190440995319, "grad_norm": 2.0902528503590334, "learning_rate": 1.1095272610994746e-09, "loss": 0.7882, "step": 6060 }, { "epoch": 0.995483288166215, "grad_norm": 1.8774308881843345, "learning_rate": 1.0316918656416708e-09, "loss": 0.727, "step": 6061 }, { "epoch": 0.995647532232898, "grad_norm": 2.1341197645192853, "learning_rate": 9.56686657127559e-10, "loss": 0.6557, "step": 6062 }, { "epoch": 0.9958117762995812, "grad_norm": 1.618118962702908, "learning_rate": 8.845116567879342e-10, "loss": 0.7553, "step": 6063 }, { "epoch": 0.9959760203662643, "grad_norm": 1.6549912413130514, "learning_rate": 8.151668850508998e-10, "loss": 0.8024, "step": 6064 }, { "epoch": 0.9961402644329473, "grad_norm": 1.6340305728090674, "learning_rate": 7.486523615440888e-10, "loss": 0.6614, "step": 6065 }, { "epoch": 0.9963045084996305, "grad_norm": 1.5021459635923387, "learning_rate": 6.849681050946633e-10, "loss": 0.7563, "step": 6066 }, { "epoch": 0.9964687525663135, "grad_norm": 1.6045889780245775, "learning_rate": 6.241141337282042e-10, "loss": 0.7315, "step": 6067 }, { "epoch": 0.9966329966329966, "grad_norm": 2.048602391555501, "learning_rate": 5.660904646698217e-10, "loss": 0.724, "step": 6068 }, { "epoch": 0.9967972406996797, "grad_norm": 2.0100874202627037, "learning_rate": 5.10897114341935e-10, "loss": 0.7169, "step": 6069 }, { "epoch": 0.9969614847663628, "grad_norm": 1.8148946462035949, "learning_rate": 4.585340983676023e-10, "loss": 0.796, "step": 6070 }, { "epoch": 0.997125728833046, "grad_norm": 1.9985158609205695, "learning_rate": 4.090014315694113e-10, "loss": 0.6869, "step": 6071 }, { "epoch": 0.997289972899729, "grad_norm": 3.047429665705384, "learning_rate": 3.6229912796392763e-10, "loss": 0.6594, "step": 6072 }, { "epoch": 0.9974542169664121, "grad_norm": 1.6872239955781516, "learning_rate": 3.1842720077390754e-10, "loss": 0.7272, "step": 6073 }, { "epoch": 0.9976184610330951, "grad_norm": 2.1937748424180614, "learning_rate": 2.773856624149751e-10, "loss": 0.7395, "step": 6074 }, { "epoch": 0.9977827050997783, "grad_norm": 1.9256052705205118, "learning_rate": 2.391745245045041e-10, "loss": 0.8423, "step": 6075 }, { "epoch": 0.9979469491664613, "grad_norm": 1.7773951561304318, "learning_rate": 2.0379379785828712e-10, "loss": 0.7417, "step": 6076 }, { "epoch": 0.9981111932331445, "grad_norm": 1.8564077219640194, "learning_rate": 1.7124349249053596e-10, "loss": 0.7218, "step": 6077 }, { "epoch": 0.9982754372998276, "grad_norm": 2.106950055312346, "learning_rate": 1.4152361761388122e-10, "loss": 0.7449, "step": 6078 }, { "epoch": 0.9984396813665106, "grad_norm": 1.5086809299409414, "learning_rate": 1.1463418164159301e-10, "loss": 0.7254, "step": 6079 }, { "epoch": 0.9986039254331938, "grad_norm": 0.5932065052512773, "learning_rate": 9.057519218425015e-11, "loss": 0.3145, "step": 6080 }, { "epoch": 0.9987681694998768, "grad_norm": 1.6070556191081196, "learning_rate": 6.934665605196068e-11, "loss": 0.6914, "step": 6081 }, { "epoch": 0.9989324135665599, "grad_norm": 2.038899399571835, "learning_rate": 5.094857925214136e-11, "loss": 0.7985, "step": 6082 }, { "epoch": 0.999096657633243, "grad_norm": 1.9041929861249016, "learning_rate": 3.5380966993958655e-11, "loss": 0.7393, "step": 6083 }, { "epoch": 0.9992609016999261, "grad_norm": 1.8371112620144567, "learning_rate": 2.264382368277751e-11, "loss": 0.7155, "step": 6084 }, { "epoch": 0.9994251457666092, "grad_norm": 2.251121738231183, "learning_rate": 1.2737152924602313e-11, "loss": 0.7358, "step": 6085 }, { "epoch": 0.9995893898332923, "grad_norm": 0.6083770381462116, "learning_rate": 5.660957522746202e-12, "loss": 0.3164, "step": 6086 }, { "epoch": 0.9997536338999754, "grad_norm": 2.2173862082230924, "learning_rate": 1.4152394811617343e-12, "loss": 0.6633, "step": 6087 }, { "epoch": 0.9999178779666584, "grad_norm": 2.602695673611385, "learning_rate": 0.0, "loss": 0.3926, "step": 6088 }, { "epoch": 0.9999178779666584, "step": 6088, "total_flos": 3458409160409088.0, "train_loss": 0.494501494957959, "train_runtime": 70728.974, "train_samples_per_second": 11.018, "train_steps_per_second": 0.086 } ], "logging_steps": 1.0, "max_steps": 6088, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 3458409160409088.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }