{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996048367975974, "eval_steps": 500, "global_step": 1581, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 5.2656612225200705, "learning_rate": 1.0416666666666667e-07, "loss": 0.0917, "step": 1 }, { "epoch": 0.0, "grad_norm": 37.71678549668734, "learning_rate": 2.0833333333333333e-07, "loss": 1.137, "step": 2 }, { "epoch": 0.0, "grad_norm": 21.895397158947514, "learning_rate": 3.125e-07, "loss": 0.7479, "step": 3 }, { "epoch": 0.0, "grad_norm": 17.183793101679505, "learning_rate": 4.1666666666666667e-07, "loss": 0.4486, "step": 4 }, { "epoch": 0.0, "grad_norm": 24.891246608292718, "learning_rate": 5.208333333333334e-07, "loss": 0.761, "step": 5 }, { "epoch": 0.0, "grad_norm": 18.994012792789633, "learning_rate": 6.25e-07, "loss": 0.4221, "step": 6 }, { "epoch": 0.0, "grad_norm": 20.526634963381646, "learning_rate": 7.291666666666667e-07, "loss": 0.6891, "step": 7 }, { "epoch": 0.01, "grad_norm": 11.301318633524378, "learning_rate": 8.333333333333333e-07, "loss": 0.2271, "step": 8 }, { "epoch": 0.01, "grad_norm": 27.30560285482862, "learning_rate": 9.375000000000001e-07, "loss": 0.8797, "step": 9 }, { "epoch": 0.01, "grad_norm": 32.6060244744932, "learning_rate": 1.0416666666666667e-06, "loss": 1.0926, "step": 10 }, { "epoch": 0.01, "grad_norm": 22.285069634472343, "learning_rate": 1.1458333333333333e-06, "loss": 0.6605, "step": 11 }, { "epoch": 0.01, "grad_norm": 15.910361966967372, "learning_rate": 1.25e-06, "loss": 0.4612, "step": 12 }, { "epoch": 0.01, "grad_norm": 30.847411644670956, "learning_rate": 1.3541666666666667e-06, "loss": 0.827, "step": 13 }, { "epoch": 0.01, "grad_norm": 16.670988107560447, "learning_rate": 1.4583333333333335e-06, "loss": 0.4954, "step": 14 }, { "epoch": 0.01, "grad_norm": 47.28509413074193, "learning_rate": 1.5625e-06, "loss": 1.0251, "step": 15 }, { "epoch": 0.01, "grad_norm": 12.371926282375087, "learning_rate": 1.6666666666666667e-06, "loss": 0.2646, "step": 16 }, { "epoch": 0.01, "grad_norm": 17.58598956011841, "learning_rate": 1.7708333333333337e-06, "loss": 0.4352, "step": 17 }, { "epoch": 0.01, "grad_norm": 12.707206567726946, "learning_rate": 1.8750000000000003e-06, "loss": 0.6815, "step": 18 }, { "epoch": 0.01, "grad_norm": 23.703607804492943, "learning_rate": 1.9791666666666666e-06, "loss": 0.5448, "step": 19 }, { "epoch": 0.01, "grad_norm": 14.897272953634102, "learning_rate": 2.0833333333333334e-06, "loss": 0.4308, "step": 20 }, { "epoch": 0.01, "grad_norm": 21.118015556739405, "learning_rate": 2.1875000000000002e-06, "loss": 1.1068, "step": 21 }, { "epoch": 0.01, "grad_norm": 13.298309356391181, "learning_rate": 2.2916666666666666e-06, "loss": 0.2084, "step": 22 }, { "epoch": 0.01, "grad_norm": 21.37628919494901, "learning_rate": 2.395833333333334e-06, "loss": 1.1782, "step": 23 }, { "epoch": 0.02, "grad_norm": 24.479434977474586, "learning_rate": 2.5e-06, "loss": 0.4243, "step": 24 }, { "epoch": 0.02, "grad_norm": 15.259739501649532, "learning_rate": 2.604166666666667e-06, "loss": 0.4217, "step": 25 }, { "epoch": 0.02, "grad_norm": 8.960559893553361, "learning_rate": 2.7083333333333334e-06, "loss": 0.4158, "step": 26 }, { "epoch": 0.02, "grad_norm": 16.429086598913813, "learning_rate": 2.8125e-06, "loss": 0.6773, "step": 27 }, { "epoch": 0.02, "grad_norm": 9.209207852868984, "learning_rate": 2.916666666666667e-06, "loss": 0.3294, "step": 28 }, { "epoch": 0.02, "grad_norm": 8.940346431292438, "learning_rate": 3.0208333333333334e-06, "loss": 0.2523, "step": 29 }, { "epoch": 0.02, "grad_norm": 13.523395752029726, "learning_rate": 3.125e-06, "loss": 0.3717, "step": 30 }, { "epoch": 0.02, "grad_norm": 10.435033084102253, "learning_rate": 3.229166666666667e-06, "loss": 0.5776, "step": 31 }, { "epoch": 0.02, "grad_norm": 12.350376484804269, "learning_rate": 3.3333333333333333e-06, "loss": 0.5697, "step": 32 }, { "epoch": 0.02, "grad_norm": 7.46023112043865, "learning_rate": 3.4375e-06, "loss": 0.3445, "step": 33 }, { "epoch": 0.02, "grad_norm": 13.91670954958463, "learning_rate": 3.5416666666666673e-06, "loss": 0.2192, "step": 34 }, { "epoch": 0.02, "grad_norm": 15.227868702792659, "learning_rate": 3.6458333333333333e-06, "loss": 0.4932, "step": 35 }, { "epoch": 0.02, "grad_norm": 11.284980136108494, "learning_rate": 3.7500000000000005e-06, "loss": 0.7055, "step": 36 }, { "epoch": 0.02, "grad_norm": 10.405195303041342, "learning_rate": 3.854166666666667e-06, "loss": 0.7916, "step": 37 }, { "epoch": 0.02, "grad_norm": 3.7766714829237134, "learning_rate": 3.958333333333333e-06, "loss": 0.0268, "step": 38 }, { "epoch": 0.02, "grad_norm": 7.013506801089294, "learning_rate": 4.0625000000000005e-06, "loss": 0.2811, "step": 39 }, { "epoch": 0.03, "grad_norm": 8.745139379595656, "learning_rate": 4.166666666666667e-06, "loss": 0.2401, "step": 40 }, { "epoch": 0.03, "grad_norm": 8.713988395103588, "learning_rate": 4.270833333333333e-06, "loss": 0.5153, "step": 41 }, { "epoch": 0.03, "grad_norm": 7.301871898454228, "learning_rate": 4.3750000000000005e-06, "loss": 0.2485, "step": 42 }, { "epoch": 0.03, "grad_norm": 6.848396848766385, "learning_rate": 4.479166666666667e-06, "loss": 0.2515, "step": 43 }, { "epoch": 0.03, "grad_norm": 6.7180885166743, "learning_rate": 4.583333333333333e-06, "loss": 0.1705, "step": 44 }, { "epoch": 0.03, "grad_norm": 7.707560595021471, "learning_rate": 4.6875000000000004e-06, "loss": 0.2393, "step": 45 }, { "epoch": 0.03, "grad_norm": 10.348481263119, "learning_rate": 4.791666666666668e-06, "loss": 0.401, "step": 46 }, { "epoch": 0.03, "grad_norm": 8.277477398342935, "learning_rate": 4.895833333333333e-06, "loss": 0.1771, "step": 47 }, { "epoch": 0.03, "grad_norm": 5.225147278092627, "learning_rate": 5e-06, "loss": 0.1883, "step": 48 }, { "epoch": 0.03, "grad_norm": 7.714093120128615, "learning_rate": 4.999994750411077e-06, "loss": 0.3014, "step": 49 }, { "epoch": 0.03, "grad_norm": 12.29428007285362, "learning_rate": 4.999979001666353e-06, "loss": 0.6187, "step": 50 }, { "epoch": 0.03, "grad_norm": 11.868248877610313, "learning_rate": 4.999952753831968e-06, "loss": 0.4989, "step": 51 }, { "epoch": 0.03, "grad_norm": 9.321802600524457, "learning_rate": 4.999916007018154e-06, "loss": 0.2043, "step": 52 }, { "epoch": 0.03, "grad_norm": 8.505912071091073, "learning_rate": 4.999868761379236e-06, "loss": 0.3569, "step": 53 }, { "epoch": 0.03, "grad_norm": 5.737221159631332, "learning_rate": 4.99981101711363e-06, "loss": 0.1102, "step": 54 }, { "epoch": 0.03, "grad_norm": 7.9970130112468105, "learning_rate": 4.999742774463843e-06, "loss": 0.2492, "step": 55 }, { "epoch": 0.04, "grad_norm": 9.322204039774054, "learning_rate": 4.999664033716471e-06, "loss": 0.48, "step": 56 }, { "epoch": 0.04, "grad_norm": 7.5526073475709055, "learning_rate": 4.9995747952022e-06, "loss": 0.2816, "step": 57 }, { "epoch": 0.04, "grad_norm": 3.6641124730067647, "learning_rate": 4.999475059295803e-06, "loss": 0.0647, "step": 58 }, { "epoch": 0.04, "grad_norm": 13.362074836839993, "learning_rate": 4.999364826416136e-06, "loss": 0.9832, "step": 59 }, { "epoch": 0.04, "grad_norm": 11.495240926785474, "learning_rate": 4.999244097026143e-06, "loss": 0.6477, "step": 60 }, { "epoch": 0.04, "grad_norm": 6.202760008252028, "learning_rate": 4.999112871632847e-06, "loss": 0.3342, "step": 61 }, { "epoch": 0.04, "grad_norm": 8.140751521029765, "learning_rate": 4.998971150787351e-06, "loss": 0.2422, "step": 62 }, { "epoch": 0.04, "grad_norm": 6.531221088546207, "learning_rate": 4.998818935084836e-06, "loss": 0.2217, "step": 63 }, { "epoch": 0.04, "grad_norm": 10.304101101586628, "learning_rate": 4.998656225164557e-06, "loss": 0.3276, "step": 64 }, { "epoch": 0.04, "grad_norm": 8.351477084601795, "learning_rate": 4.998483021709846e-06, "loss": 0.2839, "step": 65 }, { "epoch": 0.04, "grad_norm": 8.319785689748254, "learning_rate": 4.998299325448096e-06, "loss": 0.2311, "step": 66 }, { "epoch": 0.04, "grad_norm": 8.552400967967682, "learning_rate": 4.9981051371507735e-06, "loss": 0.1399, "step": 67 }, { "epoch": 0.04, "grad_norm": 11.4528212357398, "learning_rate": 4.997900457633405e-06, "loss": 0.33, "step": 68 }, { "epoch": 0.04, "grad_norm": 4.549916931064769, "learning_rate": 4.9976852877555755e-06, "loss": 0.249, "step": 69 }, { "epoch": 0.04, "grad_norm": 8.289074465413135, "learning_rate": 4.99745962842093e-06, "loss": 0.2407, "step": 70 }, { "epoch": 0.04, "grad_norm": 9.255157682372891, "learning_rate": 4.997223480577162e-06, "loss": 0.3071, "step": 71 }, { "epoch": 0.05, "grad_norm": 9.21536451342183, "learning_rate": 4.996976845216016e-06, "loss": 0.3498, "step": 72 }, { "epoch": 0.05, "grad_norm": 8.694160378243387, "learning_rate": 4.9967197233732786e-06, "loss": 0.5605, "step": 73 }, { "epoch": 0.05, "grad_norm": 6.000976165200353, "learning_rate": 4.996452116128778e-06, "loss": 0.0838, "step": 74 }, { "epoch": 0.05, "grad_norm": 11.964975743028065, "learning_rate": 4.996174024606375e-06, "loss": 0.6742, "step": 75 }, { "epoch": 0.05, "grad_norm": 6.168516070752894, "learning_rate": 4.9958854499739625e-06, "loss": 0.1488, "step": 76 }, { "epoch": 0.05, "grad_norm": 3.6416967558036006, "learning_rate": 4.995586393443461e-06, "loss": 0.0552, "step": 77 }, { "epoch": 0.05, "grad_norm": 10.733989414358419, "learning_rate": 4.9952768562708085e-06, "loss": 0.6977, "step": 78 }, { "epoch": 0.05, "grad_norm": 10.299548344755562, "learning_rate": 4.994956839755959e-06, "loss": 0.5162, "step": 79 }, { "epoch": 0.05, "grad_norm": 10.600930630119668, "learning_rate": 4.994626345242878e-06, "loss": 0.5782, "step": 80 }, { "epoch": 0.05, "grad_norm": 8.323689437212801, "learning_rate": 4.994285374119531e-06, "loss": 0.4152, "step": 81 }, { "epoch": 0.05, "grad_norm": 8.109158140013431, "learning_rate": 4.993933927817888e-06, "loss": 0.414, "step": 82 }, { "epoch": 0.05, "grad_norm": 4.800126058194779, "learning_rate": 4.993572007813905e-06, "loss": 0.1281, "step": 83 }, { "epoch": 0.05, "grad_norm": 8.125007512015758, "learning_rate": 4.9931996156275285e-06, "loss": 0.3613, "step": 84 }, { "epoch": 0.05, "grad_norm": 7.019597005987227, "learning_rate": 4.992816752822682e-06, "loss": 0.1949, "step": 85 }, { "epoch": 0.05, "grad_norm": 8.425014271709907, "learning_rate": 4.992423421007265e-06, "loss": 0.5576, "step": 86 }, { "epoch": 0.06, "grad_norm": 11.815926514021896, "learning_rate": 4.992019621833142e-06, "loss": 0.7708, "step": 87 }, { "epoch": 0.06, "grad_norm": 0.011332290033770304, "learning_rate": 4.991605356996135e-06, "loss": 0.0, "step": 88 }, { "epoch": 0.06, "grad_norm": 1.4658112852872014, "learning_rate": 4.99118062823602e-06, "loss": 0.0134, "step": 89 }, { "epoch": 0.06, "grad_norm": 9.64181278769754, "learning_rate": 4.99074543733652e-06, "loss": 0.485, "step": 90 }, { "epoch": 0.06, "grad_norm": 7.460222683362177, "learning_rate": 4.9902997861252924e-06, "loss": 0.4733, "step": 91 }, { "epoch": 0.06, "grad_norm": 7.417865384844604, "learning_rate": 4.989843676473926e-06, "loss": 0.3127, "step": 92 }, { "epoch": 0.06, "grad_norm": 4.679053058190546, "learning_rate": 4.9893771102979305e-06, "loss": 0.043, "step": 93 }, { "epoch": 0.06, "grad_norm": 11.30453362455745, "learning_rate": 4.9889000895567315e-06, "loss": 0.5676, "step": 94 }, { "epoch": 0.06, "grad_norm": 8.12439385133809, "learning_rate": 4.98841261625366e-06, "loss": 0.3245, "step": 95 }, { "epoch": 0.06, "grad_norm": 7.59403708586379, "learning_rate": 4.987914692435942e-06, "loss": 0.2386, "step": 96 }, { "epoch": 0.06, "grad_norm": 4.993527037187059, "learning_rate": 4.987406320194695e-06, "loss": 0.1594, "step": 97 }, { "epoch": 0.06, "grad_norm": 7.598896307863153, "learning_rate": 4.986887501664912e-06, "loss": 0.4927, "step": 98 }, { "epoch": 0.06, "grad_norm": 6.572072341140025, "learning_rate": 4.986358239025465e-06, "loss": 0.1074, "step": 99 }, { "epoch": 0.06, "grad_norm": 8.158141169039942, "learning_rate": 4.98581853449908e-06, "loss": 0.2743, "step": 100 }, { "epoch": 0.06, "grad_norm": 8.177560611734636, "learning_rate": 4.985268390352339e-06, "loss": 0.4104, "step": 101 }, { "epoch": 0.06, "grad_norm": 8.624361318630898, "learning_rate": 4.984707808895668e-06, "loss": 0.4447, "step": 102 }, { "epoch": 0.07, "grad_norm": 9.489156859299493, "learning_rate": 4.984136792483322e-06, "loss": 0.6547, "step": 103 }, { "epoch": 0.07, "grad_norm": 3.5095593832872147, "learning_rate": 4.9835553435133845e-06, "loss": 0.0884, "step": 104 }, { "epoch": 0.07, "grad_norm": 9.265172937696828, "learning_rate": 4.982963464427749e-06, "loss": 0.1859, "step": 105 }, { "epoch": 0.07, "grad_norm": 9.604090979244088, "learning_rate": 4.982361157712114e-06, "loss": 0.2245, "step": 106 }, { "epoch": 0.07, "grad_norm": 7.861771386959831, "learning_rate": 4.981748425895968e-06, "loss": 0.2213, "step": 107 }, { "epoch": 0.07, "grad_norm": 11.006355876944076, "learning_rate": 4.9811252715525835e-06, "loss": 0.8064, "step": 108 }, { "epoch": 0.07, "grad_norm": 7.294557387145929, "learning_rate": 4.980491697299005e-06, "loss": 0.249, "step": 109 }, { "epoch": 0.07, "grad_norm": 6.018720984463257, "learning_rate": 4.979847705796035e-06, "loss": 0.1311, "step": 110 }, { "epoch": 0.07, "grad_norm": 12.046383542180305, "learning_rate": 4.979193299748225e-06, "loss": 0.7158, "step": 111 }, { "epoch": 0.07, "grad_norm": 6.139919871450443, "learning_rate": 4.978528481903868e-06, "loss": 0.2277, "step": 112 }, { "epoch": 0.07, "grad_norm": 6.719032459088086, "learning_rate": 4.977853255054978e-06, "loss": 0.1229, "step": 113 }, { "epoch": 0.07, "grad_norm": 5.035410233181668, "learning_rate": 4.977167622037287e-06, "loss": 0.2215, "step": 114 }, { "epoch": 0.07, "grad_norm": 9.55615821929029, "learning_rate": 4.976471585730227e-06, "loss": 0.3229, "step": 115 }, { "epoch": 0.07, "grad_norm": 6.252747198966202, "learning_rate": 4.9757651490569235e-06, "loss": 0.1689, "step": 116 }, { "epoch": 0.07, "grad_norm": 10.26860256356231, "learning_rate": 4.975048314984176e-06, "loss": 0.3926, "step": 117 }, { "epoch": 0.07, "grad_norm": 6.439627879036101, "learning_rate": 4.974321086522453e-06, "loss": 0.5022, "step": 118 }, { "epoch": 0.08, "grad_norm": 9.200371477671347, "learning_rate": 4.973583466725875e-06, "loss": 0.453, "step": 119 }, { "epoch": 0.08, "grad_norm": 6.137229388267069, "learning_rate": 4.972835458692202e-06, "loss": 0.1897, "step": 120 }, { "epoch": 0.08, "grad_norm": 7.7150017289995985, "learning_rate": 4.9720770655628216e-06, "loss": 0.2803, "step": 121 }, { "epoch": 0.08, "grad_norm": 7.141197932858308, "learning_rate": 4.971308290522737e-06, "loss": 0.1684, "step": 122 }, { "epoch": 0.08, "grad_norm": 4.373554535908345, "learning_rate": 4.9705291368005485e-06, "loss": 0.2033, "step": 123 }, { "epoch": 0.08, "grad_norm": 9.776634419185687, "learning_rate": 4.9697396076684465e-06, "loss": 0.6946, "step": 124 }, { "epoch": 0.08, "grad_norm": 10.34632163911218, "learning_rate": 4.968939706442195e-06, "loss": 0.4237, "step": 125 }, { "epoch": 0.08, "grad_norm": 9.199015141722331, "learning_rate": 4.968129436481114e-06, "loss": 0.5535, "step": 126 }, { "epoch": 0.08, "grad_norm": 5.115336542030487, "learning_rate": 4.967308801188072e-06, "loss": 0.1513, "step": 127 }, { "epoch": 0.08, "grad_norm": 9.83142325605739, "learning_rate": 4.966477804009467e-06, "loss": 0.6237, "step": 128 }, { "epoch": 0.08, "grad_norm": 5.880253167962748, "learning_rate": 4.965636448435214e-06, "loss": 0.1413, "step": 129 }, { "epoch": 0.08, "grad_norm": 10.647769719523428, "learning_rate": 4.964784737998728e-06, "loss": 0.6148, "step": 130 }, { "epoch": 0.08, "grad_norm": 7.417627793803241, "learning_rate": 4.963922676276916e-06, "loss": 0.1604, "step": 131 }, { "epoch": 0.08, "grad_norm": 10.489083928200404, "learning_rate": 4.963050266890152e-06, "loss": 0.5219, "step": 132 }, { "epoch": 0.08, "grad_norm": 10.857201755335243, "learning_rate": 4.962167513502268e-06, "loss": 0.256, "step": 133 }, { "epoch": 0.08, "grad_norm": 5.730581582797649, "learning_rate": 4.96127441982054e-06, "loss": 0.1392, "step": 134 }, { "epoch": 0.09, "grad_norm": 1.8117295304767658, "learning_rate": 4.960370989595665e-06, "loss": 0.0117, "step": 135 }, { "epoch": 0.09, "grad_norm": 6.822915610587546, "learning_rate": 4.9594572266217545e-06, "loss": 0.3114, "step": 136 }, { "epoch": 0.09, "grad_norm": 4.887064750003579, "learning_rate": 4.9585331347363115e-06, "loss": 0.1163, "step": 137 }, { "epoch": 0.09, "grad_norm": 5.813427922823778, "learning_rate": 4.95759871782022e-06, "loss": 0.1726, "step": 138 }, { "epoch": 0.09, "grad_norm": 11.198693498974976, "learning_rate": 4.956653979797722e-06, "loss": 0.3887, "step": 139 }, { "epoch": 0.09, "grad_norm": 7.186200895249503, "learning_rate": 4.955698924636406e-06, "loss": 0.2827, "step": 140 }, { "epoch": 0.09, "grad_norm": 8.502346836809142, "learning_rate": 4.954733556347192e-06, "loss": 0.2718, "step": 141 }, { "epoch": 0.09, "grad_norm": 3.759273760439989, "learning_rate": 4.953757878984306e-06, "loss": 0.0605, "step": 142 }, { "epoch": 0.09, "grad_norm": 8.083775563940655, "learning_rate": 4.9527718966452756e-06, "loss": 0.328, "step": 143 }, { "epoch": 0.09, "grad_norm": 6.973868778063047, "learning_rate": 4.9517756134709005e-06, "loss": 0.3812, "step": 144 }, { "epoch": 0.09, "grad_norm": 3.8341193153548887, "learning_rate": 4.9507690336452425e-06, "loss": 0.0354, "step": 145 }, { "epoch": 0.09, "grad_norm": 5.416234317055171, "learning_rate": 4.949752161395606e-06, "loss": 0.0572, "step": 146 }, { "epoch": 0.09, "grad_norm": 7.324931736130027, "learning_rate": 4.948725000992519e-06, "loss": 0.258, "step": 147 }, { "epoch": 0.09, "grad_norm": 4.833407872820073, "learning_rate": 4.947687556749719e-06, "loss": 0.0653, "step": 148 }, { "epoch": 0.09, "grad_norm": 6.186974397163941, "learning_rate": 4.9466398330241305e-06, "loss": 0.2876, "step": 149 }, { "epoch": 0.09, "grad_norm": 10.006865624117305, "learning_rate": 4.945581834215848e-06, "loss": 0.8994, "step": 150 }, { "epoch": 0.1, "grad_norm": 10.082199999798531, "learning_rate": 4.944513564768119e-06, "loss": 0.7066, "step": 151 }, { "epoch": 0.1, "grad_norm": 7.091613985696003, "learning_rate": 4.943435029167322e-06, "loss": 0.3657, "step": 152 }, { "epoch": 0.1, "grad_norm": 9.091645616692901, "learning_rate": 4.942346231942955e-06, "loss": 0.4636, "step": 153 }, { "epoch": 0.1, "grad_norm": 9.60548066413454, "learning_rate": 4.941247177667606e-06, "loss": 0.4686, "step": 154 }, { "epoch": 0.1, "grad_norm": 5.308989498160375, "learning_rate": 4.940137870956942e-06, "loss": 0.1011, "step": 155 }, { "epoch": 0.1, "grad_norm": 6.307909212086813, "learning_rate": 4.939018316469687e-06, "loss": 0.1337, "step": 156 }, { "epoch": 0.1, "grad_norm": 9.32445644400113, "learning_rate": 4.937888518907601e-06, "loss": 0.6179, "step": 157 }, { "epoch": 0.1, "grad_norm": 11.119911315893658, "learning_rate": 4.936748483015462e-06, "loss": 0.7428, "step": 158 }, { "epoch": 0.1, "grad_norm": 4.807390547452164, "learning_rate": 4.935598213581047e-06, "loss": 0.1299, "step": 159 }, { "epoch": 0.1, "grad_norm": 8.948990271526885, "learning_rate": 4.934437715435108e-06, "loss": 0.8448, "step": 160 }, { "epoch": 0.1, "grad_norm": 5.513856425965154, "learning_rate": 4.933266993451357e-06, "loss": 0.2622, "step": 161 }, { "epoch": 0.1, "grad_norm": 7.631467484008082, "learning_rate": 4.93208605254644e-06, "loss": 0.3311, "step": 162 }, { "epoch": 0.1, "grad_norm": 9.988981661744699, "learning_rate": 4.930894897679921e-06, "loss": 0.3608, "step": 163 }, { "epoch": 0.1, "grad_norm": 10.093000921556436, "learning_rate": 4.929693533854259e-06, "loss": 0.7365, "step": 164 }, { "epoch": 0.1, "grad_norm": 7.056080018556442, "learning_rate": 4.928481966114785e-06, "loss": 0.4492, "step": 165 }, { "epoch": 0.1, "grad_norm": 6.683090324772103, "learning_rate": 4.927260199549689e-06, "loss": 0.3204, "step": 166 }, { "epoch": 0.11, "grad_norm": 5.425554154039748, "learning_rate": 4.926028239289985e-06, "loss": 0.2467, "step": 167 }, { "epoch": 0.11, "grad_norm": 6.33885327098014, "learning_rate": 4.924786090509504e-06, "loss": 0.1732, "step": 168 }, { "epoch": 0.11, "grad_norm": 6.58341017408635, "learning_rate": 4.923533758424858e-06, "loss": 0.1798, "step": 169 }, { "epoch": 0.11, "grad_norm": 7.90838859493241, "learning_rate": 4.922271248295436e-06, "loss": 0.4632, "step": 170 }, { "epoch": 0.11, "grad_norm": 8.761841962324697, "learning_rate": 4.92099856542336e-06, "loss": 0.2391, "step": 171 }, { "epoch": 0.11, "grad_norm": 5.079810792898726, "learning_rate": 4.919715715153482e-06, "loss": 0.1513, "step": 172 }, { "epoch": 0.11, "grad_norm": 11.926120307168409, "learning_rate": 4.9184227028733526e-06, "loss": 0.5402, "step": 173 }, { "epoch": 0.11, "grad_norm": 11.185515669973498, "learning_rate": 4.917119534013194e-06, "loss": 0.6567, "step": 174 }, { "epoch": 0.11, "grad_norm": 9.49548192252073, "learning_rate": 4.915806214045891e-06, "loss": 0.443, "step": 175 }, { "epoch": 0.11, "grad_norm": 11.866687314590607, "learning_rate": 4.914482748486953e-06, "loss": 0.4633, "step": 176 }, { "epoch": 0.11, "grad_norm": 6.839924897344879, "learning_rate": 4.913149142894501e-06, "loss": 0.2298, "step": 177 }, { "epoch": 0.11, "grad_norm": 14.581544137825711, "learning_rate": 4.91180540286924e-06, "loss": 0.4352, "step": 178 }, { "epoch": 0.11, "grad_norm": 8.974583659171536, "learning_rate": 4.910451534054436e-06, "loss": 0.3001, "step": 179 }, { "epoch": 0.11, "grad_norm": 11.895168804752402, "learning_rate": 4.909087542135893e-06, "loss": 0.4718, "step": 180 }, { "epoch": 0.11, "grad_norm": 8.10700239632405, "learning_rate": 4.907713432841929e-06, "loss": 0.2801, "step": 181 }, { "epoch": 0.12, "grad_norm": 10.166896838308176, "learning_rate": 4.906329211943349e-06, "loss": 0.4782, "step": 182 }, { "epoch": 0.12, "grad_norm": 4.69390248788744, "learning_rate": 4.904934885253428e-06, "loss": 0.1194, "step": 183 }, { "epoch": 0.12, "grad_norm": 8.701907718254184, "learning_rate": 4.903530458627878e-06, "loss": 0.3517, "step": 184 }, { "epoch": 0.12, "grad_norm": 7.913263760748024, "learning_rate": 4.90211593796483e-06, "loss": 0.2148, "step": 185 }, { "epoch": 0.12, "grad_norm": 8.457947770775371, "learning_rate": 4.9006913292048055e-06, "loss": 0.3311, "step": 186 }, { "epoch": 0.12, "grad_norm": 9.808477075172256, "learning_rate": 4.899256638330693e-06, "loss": 0.6076, "step": 187 }, { "epoch": 0.12, "grad_norm": 8.952034609239373, "learning_rate": 4.89781187136772e-06, "loss": 0.6339, "step": 188 }, { "epoch": 0.12, "grad_norm": 8.38088281610856, "learning_rate": 4.896357034383436e-06, "loss": 0.2371, "step": 189 }, { "epoch": 0.12, "grad_norm": 8.31560991666473, "learning_rate": 4.8948921334876755e-06, "loss": 0.526, "step": 190 }, { "epoch": 0.12, "grad_norm": 7.023614289262058, "learning_rate": 4.893417174832542e-06, "loss": 0.1653, "step": 191 }, { "epoch": 0.12, "grad_norm": 2.6420920545956146, "learning_rate": 4.891932164612376e-06, "loss": 0.0117, "step": 192 }, { "epoch": 0.12, "grad_norm": 5.266238213667865, "learning_rate": 4.890437109063733e-06, "loss": 0.1514, "step": 193 }, { "epoch": 0.12, "grad_norm": 4.652913786511836, "learning_rate": 4.8889320144653525e-06, "loss": 0.1414, "step": 194 }, { "epoch": 0.12, "grad_norm": 6.875838904783274, "learning_rate": 4.887416887138139e-06, "loss": 0.2616, "step": 195 }, { "epoch": 0.12, "grad_norm": 6.250095519289617, "learning_rate": 4.885891733445127e-06, "loss": 0.1971, "step": 196 }, { "epoch": 0.12, "grad_norm": 9.129772035859265, "learning_rate": 4.884356559791463e-06, "loss": 0.5796, "step": 197 }, { "epoch": 0.13, "grad_norm": 7.590763211081543, "learning_rate": 4.882811372624369e-06, "loss": 0.2874, "step": 198 }, { "epoch": 0.13, "grad_norm": 2.0113034310365054, "learning_rate": 4.8812561784331255e-06, "loss": 0.0168, "step": 199 }, { "epoch": 0.13, "grad_norm": 7.595816123163086, "learning_rate": 4.879690983749035e-06, "loss": 0.433, "step": 200 }, { "epoch": 0.13, "grad_norm": 7.134903299468987, "learning_rate": 4.8781157951454e-06, "loss": 0.3736, "step": 201 }, { "epoch": 0.13, "grad_norm": 6.270510474054294, "learning_rate": 4.876530619237495e-06, "loss": 0.3223, "step": 202 }, { "epoch": 0.13, "grad_norm": 6.494284831575592, "learning_rate": 4.874935462682539e-06, "loss": 0.2917, "step": 203 }, { "epoch": 0.13, "grad_norm": 7.339106223756011, "learning_rate": 4.873330332179663e-06, "loss": 0.4017, "step": 204 }, { "epoch": 0.13, "grad_norm": 7.793226714871994, "learning_rate": 4.8717152344698884e-06, "loss": 0.5368, "step": 205 }, { "epoch": 0.13, "grad_norm": 0.007605496887672106, "learning_rate": 4.870090176336094e-06, "loss": 0.0, "step": 206 }, { "epoch": 0.13, "grad_norm": 3.934248171845968, "learning_rate": 4.86845516460299e-06, "loss": 0.1657, "step": 207 }, { "epoch": 0.13, "grad_norm": 8.719543202185832, "learning_rate": 4.866810206137086e-06, "loss": 0.2729, "step": 208 }, { "epoch": 0.13, "grad_norm": 8.018804860094635, "learning_rate": 4.86515530784667e-06, "loss": 0.3355, "step": 209 }, { "epoch": 0.13, "grad_norm": 3.521138255140858, "learning_rate": 4.863490476681768e-06, "loss": 0.042, "step": 210 }, { "epoch": 0.13, "grad_norm": 9.854944651393597, "learning_rate": 4.861815719634124e-06, "loss": 0.3011, "step": 211 }, { "epoch": 0.13, "grad_norm": 3.5541484717514167, "learning_rate": 4.860131043737167e-06, "loss": 0.209, "step": 212 }, { "epoch": 0.13, "grad_norm": 7.6168549606899765, "learning_rate": 4.858436456065982e-06, "loss": 0.3621, "step": 213 }, { "epoch": 0.14, "grad_norm": 8.850190206951416, "learning_rate": 4.856731963737279e-06, "loss": 0.3514, "step": 214 }, { "epoch": 0.14, "grad_norm": 5.616166023256547, "learning_rate": 4.855017573909367e-06, "loss": 0.0938, "step": 215 }, { "epoch": 0.14, "grad_norm": 9.490503885540791, "learning_rate": 4.853293293782118e-06, "loss": 0.3176, "step": 216 }, { "epoch": 0.14, "grad_norm": 6.815167945813124, "learning_rate": 4.851559130596942e-06, "loss": 0.2464, "step": 217 }, { "epoch": 0.14, "grad_norm": 8.773913593911098, "learning_rate": 4.849815091636754e-06, "loss": 0.3162, "step": 218 }, { "epoch": 0.14, "grad_norm": 10.619651615154147, "learning_rate": 4.8480611842259435e-06, "loss": 0.6477, "step": 219 }, { "epoch": 0.14, "grad_norm": 9.136921658980324, "learning_rate": 4.846297415730346e-06, "loss": 0.4822, "step": 220 }, { "epoch": 0.14, "grad_norm": 7.580845798620424, "learning_rate": 4.844523793557207e-06, "loss": 0.4707, "step": 221 }, { "epoch": 0.14, "grad_norm": 12.854711454069392, "learning_rate": 4.842740325155159e-06, "loss": 0.66, "step": 222 }, { "epoch": 0.14, "grad_norm": 7.887155990232335, "learning_rate": 4.8409470180141825e-06, "loss": 0.3607, "step": 223 }, { "epoch": 0.14, "grad_norm": 10.030220145246533, "learning_rate": 4.839143879665577e-06, "loss": 0.4508, "step": 224 }, { "epoch": 0.14, "grad_norm": 5.049187003686174, "learning_rate": 4.83733091768193e-06, "loss": 0.1506, "step": 225 }, { "epoch": 0.14, "grad_norm": 8.041797169102463, "learning_rate": 4.835508139677086e-06, "loss": 0.2972, "step": 226 }, { "epoch": 0.14, "grad_norm": 9.465887562386847, "learning_rate": 4.833675553306112e-06, "loss": 0.378, "step": 227 }, { "epoch": 0.14, "grad_norm": 6.1022007958787015, "learning_rate": 4.831833166265271e-06, "loss": 0.3064, "step": 228 }, { "epoch": 0.14, "grad_norm": 8.811905211809423, "learning_rate": 4.82998098629198e-06, "loss": 0.3165, "step": 229 }, { "epoch": 0.15, "grad_norm": 4.243049098521701, "learning_rate": 4.8281190211647876e-06, "loss": 0.0759, "step": 230 }, { "epoch": 0.15, "grad_norm": 5.741882565245166, "learning_rate": 4.826247278703333e-06, "loss": 0.329, "step": 231 }, { "epoch": 0.15, "grad_norm": 6.282475138507657, "learning_rate": 4.824365766768322e-06, "loss": 0.2201, "step": 232 }, { "epoch": 0.15, "grad_norm": 7.906386302632517, "learning_rate": 4.822474493261483e-06, "loss": 0.3049, "step": 233 }, { "epoch": 0.15, "grad_norm": 1.5989551440609946, "learning_rate": 4.820573466125544e-06, "loss": 0.0179, "step": 234 }, { "epoch": 0.15, "grad_norm": 10.25735767805942, "learning_rate": 4.818662693344195e-06, "loss": 0.6544, "step": 235 }, { "epoch": 0.15, "grad_norm": 9.251767788609671, "learning_rate": 4.8167421829420505e-06, "loss": 0.324, "step": 236 }, { "epoch": 0.15, "grad_norm": 6.816084732299098, "learning_rate": 4.814811942984625e-06, "loss": 0.3753, "step": 237 }, { "epoch": 0.15, "grad_norm": 8.732009703913429, "learning_rate": 4.812871981578291e-06, "loss": 0.2821, "step": 238 }, { "epoch": 0.15, "grad_norm": 8.68447713609987, "learning_rate": 4.810922306870247e-06, "loss": 0.3349, "step": 239 }, { "epoch": 0.15, "grad_norm": 4.716199981642604, "learning_rate": 4.8089629270484875e-06, "loss": 0.0707, "step": 240 }, { "epoch": 0.15, "grad_norm": 4.484711425070972, "learning_rate": 4.806993850341763e-06, "loss": 0.1182, "step": 241 }, { "epoch": 0.15, "grad_norm": 7.396622851142999, "learning_rate": 4.805015085019547e-06, "loss": 0.4418, "step": 242 }, { "epoch": 0.15, "grad_norm": 7.017667817058326, "learning_rate": 4.8030266393920045e-06, "loss": 0.1671, "step": 243 }, { "epoch": 0.15, "grad_norm": 9.91983323603094, "learning_rate": 4.801028521809951e-06, "loss": 0.7713, "step": 244 }, { "epoch": 0.15, "grad_norm": 2.789966853365322, "learning_rate": 4.799020740664827e-06, "loss": 0.0282, "step": 245 }, { "epoch": 0.16, "grad_norm": 8.612661630915982, "learning_rate": 4.79700330438865e-06, "loss": 0.4338, "step": 246 }, { "epoch": 0.16, "grad_norm": 8.569635467076427, "learning_rate": 4.79497622145399e-06, "loss": 0.4263, "step": 247 }, { "epoch": 0.16, "grad_norm": 4.023934283650843, "learning_rate": 4.792939500373928e-06, "loss": 0.0597, "step": 248 }, { "epoch": 0.16, "grad_norm": 6.647239352966388, "learning_rate": 4.790893149702023e-06, "loss": 0.2972, "step": 249 }, { "epoch": 0.16, "grad_norm": 11.792282969046761, "learning_rate": 4.788837178032275e-06, "loss": 0.5494, "step": 250 }, { "epoch": 0.16, "grad_norm": 3.746679107152121, "learning_rate": 4.78677159399909e-06, "loss": 0.1171, "step": 251 }, { "epoch": 0.16, "grad_norm": 5.5558840379031444, "learning_rate": 4.78469640627724e-06, "loss": 0.0948, "step": 252 }, { "epoch": 0.16, "grad_norm": 8.412106427399646, "learning_rate": 4.782611623581831e-06, "loss": 0.3801, "step": 253 }, { "epoch": 0.16, "grad_norm": 5.157126797197075, "learning_rate": 4.780517254668265e-06, "loss": 0.0801, "step": 254 }, { "epoch": 0.16, "grad_norm": 9.015971106062372, "learning_rate": 4.778413308332204e-06, "loss": 0.3776, "step": 255 }, { "epoch": 0.16, "grad_norm": 10.289155558496217, "learning_rate": 4.776299793409529e-06, "loss": 0.7922, "step": 256 }, { "epoch": 0.16, "grad_norm": 8.287668871417681, "learning_rate": 4.774176718776309e-06, "loss": 0.467, "step": 257 }, { "epoch": 0.16, "grad_norm": 9.8340233598408, "learning_rate": 4.772044093348757e-06, "loss": 0.802, "step": 258 }, { "epoch": 0.16, "grad_norm": 10.366158636305297, "learning_rate": 4.769901926083202e-06, "loss": 0.6071, "step": 259 }, { "epoch": 0.16, "grad_norm": 9.4178176671571, "learning_rate": 4.767750225976039e-06, "loss": 0.2315, "step": 260 }, { "epoch": 0.17, "grad_norm": 6.3592580273717, "learning_rate": 4.765589002063702e-06, "loss": 0.1362, "step": 261 }, { "epoch": 0.17, "grad_norm": 8.813344495634473, "learning_rate": 4.76341826342262e-06, "loss": 0.2995, "step": 262 }, { "epoch": 0.17, "grad_norm": 6.8247550836571635, "learning_rate": 4.761238019169183e-06, "loss": 0.2216, "step": 263 }, { "epoch": 0.17, "grad_norm": 7.396924292106436, "learning_rate": 4.759048278459698e-06, "loss": 0.3645, "step": 264 }, { "epoch": 0.17, "grad_norm": 5.310670066266062, "learning_rate": 4.756849050490357e-06, "loss": 0.1536, "step": 265 }, { "epoch": 0.17, "grad_norm": 7.613794316344615, "learning_rate": 4.754640344497195e-06, "loss": 0.4279, "step": 266 }, { "epoch": 0.17, "grad_norm": 5.738743253823156, "learning_rate": 4.752422169756048e-06, "loss": 0.2885, "step": 267 }, { "epoch": 0.17, "grad_norm": 6.8228132943823105, "learning_rate": 4.750194535582523e-06, "loss": 0.3673, "step": 268 }, { "epoch": 0.17, "grad_norm": 11.195239581278578, "learning_rate": 4.7479574513319505e-06, "loss": 0.9428, "step": 269 }, { "epoch": 0.17, "grad_norm": 4.547670694579577, "learning_rate": 4.745710926399347e-06, "loss": 0.1266, "step": 270 }, { "epoch": 0.17, "grad_norm": 8.473438274334972, "learning_rate": 4.743454970219382e-06, "loss": 0.4765, "step": 271 }, { "epoch": 0.17, "grad_norm": 7.042287073136041, "learning_rate": 4.741189592266326e-06, "loss": 0.3018, "step": 272 }, { "epoch": 0.17, "grad_norm": 8.701635483230024, "learning_rate": 4.738914802054022e-06, "loss": 0.3544, "step": 273 }, { "epoch": 0.17, "grad_norm": 7.6243596042552975, "learning_rate": 4.736630609135843e-06, "loss": 0.3046, "step": 274 }, { "epoch": 0.17, "grad_norm": 8.318410965955888, "learning_rate": 4.734337023104645e-06, "loss": 0.2677, "step": 275 }, { "epoch": 0.17, "grad_norm": 10.16942579840848, "learning_rate": 4.732034053592738e-06, "loss": 0.2028, "step": 276 }, { "epoch": 0.18, "grad_norm": 7.019452450691545, "learning_rate": 4.729721710271834e-06, "loss": 0.2126, "step": 277 }, { "epoch": 0.18, "grad_norm": 8.840398685906719, "learning_rate": 4.727400002853017e-06, "loss": 0.3519, "step": 278 }, { "epoch": 0.18, "grad_norm": 7.6210832385563085, "learning_rate": 4.725068941086693e-06, "loss": 0.2438, "step": 279 }, { "epoch": 0.18, "grad_norm": 4.334328732728344, "learning_rate": 4.722728534762554e-06, "loss": 0.0618, "step": 280 }, { "epoch": 0.18, "grad_norm": 12.570676657248557, "learning_rate": 4.720378793709539e-06, "loss": 0.5638, "step": 281 }, { "epoch": 0.18, "grad_norm": 6.47696006300138, "learning_rate": 4.718019727795787e-06, "loss": 0.2949, "step": 282 }, { "epoch": 0.18, "grad_norm": 3.3630084592841567, "learning_rate": 4.715651346928598e-06, "loss": 0.0478, "step": 283 }, { "epoch": 0.18, "grad_norm": 6.2526694128489595, "learning_rate": 4.713273661054394e-06, "loss": 0.2678, "step": 284 }, { "epoch": 0.18, "grad_norm": 6.9107314539428915, "learning_rate": 4.710886680158674e-06, "loss": 0.3437, "step": 285 }, { "epoch": 0.18, "grad_norm": 6.190842467430662, "learning_rate": 4.708490414265972e-06, "loss": 0.2772, "step": 286 }, { "epoch": 0.18, "grad_norm": 13.424816302245013, "learning_rate": 4.706084873439815e-06, "loss": 0.7814, "step": 287 }, { "epoch": 0.18, "grad_norm": 5.92682724473768, "learning_rate": 4.7036700677826876e-06, "loss": 0.3288, "step": 288 }, { "epoch": 0.18, "grad_norm": 6.884500216515442, "learning_rate": 4.701246007435975e-06, "loss": 0.0483, "step": 289 }, { "epoch": 0.18, "grad_norm": 4.8542197992694085, "learning_rate": 4.6988127025799376e-06, "loss": 0.1828, "step": 290 }, { "epoch": 0.18, "grad_norm": 9.886115276920187, "learning_rate": 4.696370163433652e-06, "loss": 0.6249, "step": 291 }, { "epoch": 0.18, "grad_norm": 6.612995427687164, "learning_rate": 4.6939184002549805e-06, "loss": 0.2071, "step": 292 }, { "epoch": 0.19, "grad_norm": 7.302681354698541, "learning_rate": 4.691457423340524e-06, "loss": 0.4143, "step": 293 }, { "epoch": 0.19, "grad_norm": 6.13586769540285, "learning_rate": 4.688987243025574e-06, "loss": 0.2545, "step": 294 }, { "epoch": 0.19, "grad_norm": 8.086471525239894, "learning_rate": 4.686507869684076e-06, "loss": 0.3258, "step": 295 }, { "epoch": 0.19, "grad_norm": 4.333696545760031, "learning_rate": 4.684019313728583e-06, "loss": 0.0972, "step": 296 }, { "epoch": 0.19, "grad_norm": 6.896131188297605, "learning_rate": 4.681521585610212e-06, "loss": 0.4128, "step": 297 }, { "epoch": 0.19, "grad_norm": 5.408882161856064, "learning_rate": 4.679014695818598e-06, "loss": 0.252, "step": 298 }, { "epoch": 0.19, "grad_norm": 5.728542594442993, "learning_rate": 4.676498654881855e-06, "loss": 0.175, "step": 299 }, { "epoch": 0.19, "grad_norm": 1.906921080986179, "learning_rate": 4.6739734733665275e-06, "loss": 0.0217, "step": 300 }, { "epoch": 0.19, "grad_norm": 4.540570584710826, "learning_rate": 4.671439161877548e-06, "loss": 0.0623, "step": 301 }, { "epoch": 0.19, "grad_norm": 8.92013933765738, "learning_rate": 4.6688957310581905e-06, "loss": 0.5718, "step": 302 }, { "epoch": 0.19, "grad_norm": 6.779258330072647, "learning_rate": 4.666343191590027e-06, "loss": 0.2939, "step": 303 }, { "epoch": 0.19, "grad_norm": 8.220533518311544, "learning_rate": 4.663781554192886e-06, "loss": 0.6022, "step": 304 }, { "epoch": 0.19, "grad_norm": 8.011876826816918, "learning_rate": 4.661210829624802e-06, "loss": 0.5797, "step": 305 }, { "epoch": 0.19, "grad_norm": 4.733390076167311, "learning_rate": 4.65863102868197e-06, "loss": 0.1799, "step": 306 }, { "epoch": 0.19, "grad_norm": 7.6604052420012145, "learning_rate": 4.656042162198708e-06, "loss": 0.3506, "step": 307 }, { "epoch": 0.19, "grad_norm": 2.4477626238031727, "learning_rate": 4.653444241047403e-06, "loss": 0.0399, "step": 308 }, { "epoch": 0.2, "grad_norm": 9.488359648065368, "learning_rate": 4.650837276138471e-06, "loss": 0.4692, "step": 309 }, { "epoch": 0.2, "grad_norm": 2.9915621031726336, "learning_rate": 4.6482212784203055e-06, "loss": 0.041, "step": 310 }, { "epoch": 0.2, "grad_norm": 9.602561259333772, "learning_rate": 4.645596258879237e-06, "loss": 0.5356, "step": 311 }, { "epoch": 0.2, "grad_norm": 4.138638923373422, "learning_rate": 4.642962228539485e-06, "loss": 0.0997, "step": 312 }, { "epoch": 0.2, "grad_norm": 4.098187320358129, "learning_rate": 4.640319198463109e-06, "loss": 0.1054, "step": 313 }, { "epoch": 0.2, "grad_norm": 6.054088302003616, "learning_rate": 4.637667179749968e-06, "loss": 0.3498, "step": 314 }, { "epoch": 0.2, "grad_norm": 8.099179129744174, "learning_rate": 4.635006183537668e-06, "loss": 0.6047, "step": 315 }, { "epoch": 0.2, "grad_norm": 6.776208928766074, "learning_rate": 4.6323362210015176e-06, "loss": 0.3915, "step": 316 }, { "epoch": 0.2, "grad_norm": 4.500286940857086, "learning_rate": 4.629657303354482e-06, "loss": 0.1304, "step": 317 }, { "epoch": 0.2, "grad_norm": 4.528960182141249, "learning_rate": 4.626969441847133e-06, "loss": 0.1004, "step": 318 }, { "epoch": 0.2, "grad_norm": 8.16164129393346, "learning_rate": 4.624272647767607e-06, "loss": 0.5955, "step": 319 }, { "epoch": 0.2, "grad_norm": 5.581160032224291, "learning_rate": 4.6215669324415505e-06, "loss": 0.1906, "step": 320 }, { "epoch": 0.2, "grad_norm": 9.681609694889568, "learning_rate": 4.618852307232078e-06, "loss": 0.6354, "step": 321 }, { "epoch": 0.2, "grad_norm": 8.765863479933287, "learning_rate": 4.616128783539725e-06, "loss": 0.5332, "step": 322 }, { "epoch": 0.2, "grad_norm": 6.042645535150689, "learning_rate": 4.613396372802392e-06, "loss": 0.2459, "step": 323 }, { "epoch": 0.2, "grad_norm": 13.304337125634598, "learning_rate": 4.610655086495308e-06, "loss": 1.121, "step": 324 }, { "epoch": 0.21, "grad_norm": 6.138452198881182, "learning_rate": 4.607904936130974e-06, "loss": 0.1494, "step": 325 }, { "epoch": 0.21, "grad_norm": 11.324245025193733, "learning_rate": 4.605145933259115e-06, "loss": 0.6364, "step": 326 }, { "epoch": 0.21, "grad_norm": 8.174242307681336, "learning_rate": 4.602378089466637e-06, "loss": 0.4634, "step": 327 }, { "epoch": 0.21, "grad_norm": 6.03943801640145, "learning_rate": 4.599601416377575e-06, "loss": 0.4802, "step": 328 }, { "epoch": 0.21, "grad_norm": 7.0185706532664165, "learning_rate": 4.59681592565304e-06, "loss": 0.4142, "step": 329 }, { "epoch": 0.21, "grad_norm": 2.175639231787874, "learning_rate": 4.59402162899118e-06, "loss": 0.0195, "step": 330 }, { "epoch": 0.21, "grad_norm": 8.313862394962, "learning_rate": 4.5912185381271205e-06, "loss": 0.294, "step": 331 }, { "epoch": 0.21, "grad_norm": 8.336083479714063, "learning_rate": 4.588406664832921e-06, "loss": 0.374, "step": 332 }, { "epoch": 0.21, "grad_norm": 9.981125375904826, "learning_rate": 4.585586020917524e-06, "loss": 0.4972, "step": 333 }, { "epoch": 0.21, "grad_norm": 3.240178158889226, "learning_rate": 4.582756618226709e-06, "loss": 0.0465, "step": 334 }, { "epoch": 0.21, "grad_norm": 8.700701227100131, "learning_rate": 4.579918468643035e-06, "loss": 0.5969, "step": 335 }, { "epoch": 0.21, "grad_norm": 9.094599838135833, "learning_rate": 4.577071584085797e-06, "loss": 0.4755, "step": 336 }, { "epoch": 0.21, "grad_norm": 6.0528229235818785, "learning_rate": 4.574215976510973e-06, "loss": 0.249, "step": 337 }, { "epoch": 0.21, "grad_norm": 3.9219379799941887, "learning_rate": 4.571351657911178e-06, "loss": 0.0711, "step": 338 }, { "epoch": 0.21, "grad_norm": 6.685544307350515, "learning_rate": 4.568478640315606e-06, "loss": 0.3457, "step": 339 }, { "epoch": 0.21, "grad_norm": 8.159724288471223, "learning_rate": 4.565596935789987e-06, "loss": 0.1901, "step": 340 }, { "epoch": 0.22, "grad_norm": 3.037164325768962, "learning_rate": 4.562706556436534e-06, "loss": 0.0451, "step": 341 }, { "epoch": 0.22, "grad_norm": 8.629381918758996, "learning_rate": 4.5598075143938855e-06, "loss": 0.5185, "step": 342 }, { "epoch": 0.22, "grad_norm": 7.513771637546047, "learning_rate": 4.556899821837069e-06, "loss": 0.1977, "step": 343 }, { "epoch": 0.22, "grad_norm": 6.112397287828078, "learning_rate": 4.553983490977435e-06, "loss": 0.2676, "step": 344 }, { "epoch": 0.22, "grad_norm": 7.473957879645747, "learning_rate": 4.551058534062614e-06, "loss": 0.2595, "step": 345 }, { "epoch": 0.22, "grad_norm": 7.076851365992341, "learning_rate": 4.5481249633764635e-06, "loss": 0.1709, "step": 346 }, { "epoch": 0.22, "grad_norm": 4.886193555972856, "learning_rate": 4.545182791239015e-06, "loss": 0.1557, "step": 347 }, { "epoch": 0.22, "grad_norm": 6.42694233431067, "learning_rate": 4.5422320300064246e-06, "loss": 0.4042, "step": 348 }, { "epoch": 0.22, "grad_norm": 2.74570424341322, "learning_rate": 4.53927269207092e-06, "loss": 0.0405, "step": 349 }, { "epoch": 0.22, "grad_norm": 10.293017670661733, "learning_rate": 4.536304789860746e-06, "loss": 0.584, "step": 350 }, { "epoch": 0.22, "grad_norm": 7.763428651850486, "learning_rate": 4.5333283358401155e-06, "loss": 0.2972, "step": 351 }, { "epoch": 0.22, "grad_norm": 8.708866766208018, "learning_rate": 4.530343342509158e-06, "loss": 0.6355, "step": 352 }, { "epoch": 0.22, "grad_norm": 4.057864787308724, "learning_rate": 4.527349822403862e-06, "loss": 0.1121, "step": 353 }, { "epoch": 0.22, "grad_norm": 7.983738106783992, "learning_rate": 4.524347788096029e-06, "loss": 0.3805, "step": 354 }, { "epoch": 0.22, "grad_norm": 4.688560264204164, "learning_rate": 4.521337252193215e-06, "loss": 0.2027, "step": 355 }, { "epoch": 0.23, "grad_norm": 6.511093942636213, "learning_rate": 4.518318227338682e-06, "loss": 0.3418, "step": 356 }, { "epoch": 0.23, "grad_norm": 6.280807859127777, "learning_rate": 4.51529072621134e-06, "loss": 0.1645, "step": 357 }, { "epoch": 0.23, "grad_norm": 4.3348787681936995, "learning_rate": 4.5122547615257e-06, "loss": 0.104, "step": 358 }, { "epoch": 0.23, "grad_norm": 9.53142930159218, "learning_rate": 4.509210346031812e-06, "loss": 0.5503, "step": 359 }, { "epoch": 0.23, "grad_norm": 9.714571628049478, "learning_rate": 4.506157492515223e-06, "loss": 0.6116, "step": 360 }, { "epoch": 0.23, "grad_norm": 8.243656378294672, "learning_rate": 4.503096213796912e-06, "loss": 0.2708, "step": 361 }, { "epoch": 0.23, "grad_norm": 3.7711328770370227, "learning_rate": 4.5000265227332455e-06, "loss": 0.2498, "step": 362 }, { "epoch": 0.23, "grad_norm": 5.1023789784995754, "learning_rate": 4.4969484322159125e-06, "loss": 0.1407, "step": 363 }, { "epoch": 0.23, "grad_norm": 9.345248609310735, "learning_rate": 4.493861955171885e-06, "loss": 0.5385, "step": 364 }, { "epoch": 0.23, "grad_norm": 3.3255143649706245, "learning_rate": 4.490767104563348e-06, "loss": 0.0422, "step": 365 }, { "epoch": 0.23, "grad_norm": 8.319241880209145, "learning_rate": 4.487663893387658e-06, "loss": 0.4725, "step": 366 }, { "epoch": 0.23, "grad_norm": 8.979939462164577, "learning_rate": 4.484552334677281e-06, "loss": 0.3704, "step": 367 }, { "epoch": 0.23, "grad_norm": 11.078710371082373, "learning_rate": 4.481432441499741e-06, "loss": 0.8272, "step": 368 }, { "epoch": 0.23, "grad_norm": 7.6832591198738704, "learning_rate": 4.478304226957563e-06, "loss": 0.3487, "step": 369 }, { "epoch": 0.23, "grad_norm": 8.208336328289693, "learning_rate": 4.475167704188219e-06, "loss": 0.3827, "step": 370 }, { "epoch": 0.23, "grad_norm": 4.437385879983391, "learning_rate": 4.472022886364073e-06, "loss": 0.2293, "step": 371 }, { "epoch": 0.24, "grad_norm": 5.102030383585948, "learning_rate": 4.468869786692327e-06, "loss": 0.1278, "step": 372 }, { "epoch": 0.24, "grad_norm": 5.879508054226713, "learning_rate": 4.46570841841496e-06, "loss": 0.3139, "step": 373 }, { "epoch": 0.24, "grad_norm": 5.2286608123286085, "learning_rate": 4.462538794808683e-06, "loss": 0.2995, "step": 374 }, { "epoch": 0.24, "grad_norm": 3.431765263681682, "learning_rate": 4.459360929184869e-06, "loss": 0.0296, "step": 375 }, { "epoch": 0.24, "grad_norm": 7.00714482387539, "learning_rate": 4.4561748348895106e-06, "loss": 0.211, "step": 376 }, { "epoch": 0.24, "grad_norm": 6.65933170528292, "learning_rate": 4.452980525303156e-06, "loss": 0.2855, "step": 377 }, { "epoch": 0.24, "grad_norm": 3.519982650887477, "learning_rate": 4.449778013840854e-06, "loss": 0.0479, "step": 378 }, { "epoch": 0.24, "grad_norm": 5.751698782046443, "learning_rate": 4.4465673139521e-06, "loss": 0.2207, "step": 379 }, { "epoch": 0.24, "grad_norm": 8.255366573963133, "learning_rate": 4.443348439120778e-06, "loss": 0.4336, "step": 380 }, { "epoch": 0.24, "grad_norm": 5.216785906484164, "learning_rate": 4.440121402865104e-06, "loss": 0.1264, "step": 381 }, { "epoch": 0.24, "grad_norm": 7.038760908061135, "learning_rate": 4.436886218737568e-06, "loss": 0.3318, "step": 382 }, { "epoch": 0.24, "grad_norm": 8.532180455927987, "learning_rate": 4.433642900324881e-06, "loss": 0.4184, "step": 383 }, { "epoch": 0.24, "grad_norm": 4.688890378107093, "learning_rate": 4.430391461247911e-06, "loss": 0.2135, "step": 384 }, { "epoch": 0.24, "grad_norm": 4.346098779390515, "learning_rate": 4.427131915161635e-06, "loss": 0.0838, "step": 385 }, { "epoch": 0.24, "grad_norm": 4.3411319444873175, "learning_rate": 4.423864275755075e-06, "loss": 0.1006, "step": 386 }, { "epoch": 0.24, "grad_norm": 4.296116543501431, "learning_rate": 4.42058855675124e-06, "loss": 0.1309, "step": 387 }, { "epoch": 0.25, "grad_norm": 5.566110874838561, "learning_rate": 4.4173047719070745e-06, "loss": 0.3441, "step": 388 }, { "epoch": 0.25, "grad_norm": 8.973664216756589, "learning_rate": 4.414012935013395e-06, "loss": 0.4388, "step": 389 }, { "epoch": 0.25, "grad_norm": 4.992032378507416, "learning_rate": 4.410713059894831e-06, "loss": 0.1431, "step": 390 }, { "epoch": 0.25, "grad_norm": 4.035479552028947, "learning_rate": 4.4074051604097755e-06, "loss": 0.2138, "step": 391 }, { "epoch": 0.25, "grad_norm": 6.297789703647399, "learning_rate": 4.404089250450317e-06, "loss": 0.3008, "step": 392 }, { "epoch": 0.25, "grad_norm": 8.871278224221298, "learning_rate": 4.400765343942189e-06, "loss": 0.5762, "step": 393 }, { "epoch": 0.25, "grad_norm": 8.901936661106918, "learning_rate": 4.397433454844704e-06, "loss": 0.4035, "step": 394 }, { "epoch": 0.25, "grad_norm": 8.066499413051595, "learning_rate": 4.3940935971507e-06, "loss": 0.2631, "step": 395 }, { "epoch": 0.25, "grad_norm": 7.632133773878889, "learning_rate": 4.390745784886483e-06, "loss": 0.4767, "step": 396 }, { "epoch": 0.25, "grad_norm": 9.890438710849303, "learning_rate": 4.387390032111762e-06, "loss": 0.4167, "step": 397 }, { "epoch": 0.25, "grad_norm": 5.070736624872489, "learning_rate": 4.384026352919595e-06, "loss": 0.2567, "step": 398 }, { "epoch": 0.25, "grad_norm": 8.073674937086958, "learning_rate": 4.380654761436329e-06, "loss": 0.4412, "step": 399 }, { "epoch": 0.25, "grad_norm": 5.441670539391129, "learning_rate": 4.377275271821539e-06, "loss": 0.1941, "step": 400 }, { "epoch": 0.25, "grad_norm": 5.839444292880805, "learning_rate": 4.37388789826797e-06, "loss": 0.1258, "step": 401 }, { "epoch": 0.25, "grad_norm": 6.215905233733528, "learning_rate": 4.370492655001478e-06, "loss": 0.1758, "step": 402 }, { "epoch": 0.25, "grad_norm": 5.980578141859587, "learning_rate": 4.3670895562809675e-06, "loss": 0.2898, "step": 403 }, { "epoch": 0.26, "grad_norm": 8.282653919911857, "learning_rate": 4.363678616398334e-06, "loss": 0.3226, "step": 404 }, { "epoch": 0.26, "grad_norm": 6.063399415301817, "learning_rate": 4.360259849678402e-06, "loss": 0.2041, "step": 405 }, { "epoch": 0.26, "grad_norm": 7.3143282178086615, "learning_rate": 4.356833270478869e-06, "loss": 0.3638, "step": 406 }, { "epoch": 0.26, "grad_norm": 9.231448979903139, "learning_rate": 4.353398893190241e-06, "loss": 0.4857, "step": 407 }, { "epoch": 0.26, "grad_norm": 7.831446021230515, "learning_rate": 4.349956732235772e-06, "loss": 0.4654, "step": 408 }, { "epoch": 0.26, "grad_norm": 8.278094457020618, "learning_rate": 4.346506802071406e-06, "loss": 0.4976, "step": 409 }, { "epoch": 0.26, "grad_norm": 3.479196026767647, "learning_rate": 4.343049117185717e-06, "loss": 0.0527, "step": 410 }, { "epoch": 0.26, "grad_norm": 4.678965211892643, "learning_rate": 4.339583692099843e-06, "loss": 0.1032, "step": 411 }, { "epoch": 0.26, "grad_norm": 8.200423662359617, "learning_rate": 4.336110541367428e-06, "loss": 0.364, "step": 412 }, { "epoch": 0.26, "grad_norm": 5.721084535125891, "learning_rate": 4.332629679574566e-06, "loss": 0.2819, "step": 413 }, { "epoch": 0.26, "grad_norm": 3.3321076682876747, "learning_rate": 4.329141121339731e-06, "loss": 0.0294, "step": 414 }, { "epoch": 0.26, "grad_norm": 3.2734787239629024, "learning_rate": 4.325644881313718e-06, "loss": 0.0577, "step": 415 }, { "epoch": 0.26, "grad_norm": 6.010488878826746, "learning_rate": 4.322140974179589e-06, "loss": 0.1327, "step": 416 }, { "epoch": 0.26, "grad_norm": 4.561795114909279, "learning_rate": 4.3186294146526e-06, "loss": 0.1491, "step": 417 }, { "epoch": 0.26, "grad_norm": 8.32636865127843, "learning_rate": 4.315110217480145e-06, "loss": 0.4162, "step": 418 }, { "epoch": 0.26, "grad_norm": 7.775930033804638, "learning_rate": 4.3115833974416965e-06, "loss": 0.375, "step": 419 }, { "epoch": 0.27, "grad_norm": 8.33849063232613, "learning_rate": 4.308048969348738e-06, "loss": 0.6308, "step": 420 }, { "epoch": 0.27, "grad_norm": 6.957389569572968, "learning_rate": 4.304506948044704e-06, "loss": 0.4595, "step": 421 }, { "epoch": 0.27, "grad_norm": 8.645789701665759, "learning_rate": 4.300957348404923e-06, "loss": 0.4273, "step": 422 }, { "epoch": 0.27, "grad_norm": 7.466706447268282, "learning_rate": 4.297400185336541e-06, "loss": 0.2687, "step": 423 }, { "epoch": 0.27, "grad_norm": 6.85523642991325, "learning_rate": 4.293835473778477e-06, "loss": 0.1872, "step": 424 }, { "epoch": 0.27, "grad_norm": 4.908384405538566, "learning_rate": 4.290263228701346e-06, "loss": 0.2586, "step": 425 }, { "epoch": 0.27, "grad_norm": 7.656316764696141, "learning_rate": 4.286683465107403e-06, "loss": 0.4652, "step": 426 }, { "epoch": 0.27, "grad_norm": 3.2745372590901196, "learning_rate": 4.283096198030477e-06, "loss": 0.0445, "step": 427 }, { "epoch": 0.27, "grad_norm": 4.993416839295216, "learning_rate": 4.2795014425359095e-06, "loss": 0.132, "step": 428 }, { "epoch": 0.27, "grad_norm": 6.572081918402075, "learning_rate": 4.275899213720493e-06, "loss": 0.1834, "step": 429 }, { "epoch": 0.27, "grad_norm": 6.758615999468492, "learning_rate": 4.272289526712403e-06, "loss": 0.3461, "step": 430 }, { "epoch": 0.27, "grad_norm": 4.646954570773951, "learning_rate": 4.268672396671139e-06, "loss": 0.1242, "step": 431 }, { "epoch": 0.27, "grad_norm": 8.325819773592492, "learning_rate": 4.265047838787455e-06, "loss": 0.4949, "step": 432 }, { "epoch": 0.27, "grad_norm": 5.032490642480162, "learning_rate": 4.261415868283304e-06, "loss": 0.1983, "step": 433 }, { "epoch": 0.27, "grad_norm": 3.6534399988330506, "learning_rate": 4.257776500411768e-06, "loss": 0.0399, "step": 434 }, { "epoch": 0.28, "grad_norm": 7.7451464313728895, "learning_rate": 4.254129750456995e-06, "loss": 0.5699, "step": 435 }, { "epoch": 0.28, "grad_norm": 7.833969279778525, "learning_rate": 4.250475633734135e-06, "loss": 0.1851, "step": 436 }, { "epoch": 0.28, "grad_norm": 5.076512664709991, "learning_rate": 4.246814165589277e-06, "loss": 0.1268, "step": 437 }, { "epoch": 0.28, "grad_norm": 8.261292560668046, "learning_rate": 4.243145361399383e-06, "loss": 0.2889, "step": 438 }, { "epoch": 0.28, "grad_norm": 6.157137579278704, "learning_rate": 4.239469236572222e-06, "loss": 0.3119, "step": 439 }, { "epoch": 0.28, "grad_norm": 8.319839335137427, "learning_rate": 4.235785806546313e-06, "loss": 0.4011, "step": 440 }, { "epoch": 0.28, "grad_norm": 7.92563902387483, "learning_rate": 4.232095086790849e-06, "loss": 0.347, "step": 441 }, { "epoch": 0.28, "grad_norm": 8.313336093701249, "learning_rate": 4.228397092805639e-06, "loss": 0.4254, "step": 442 }, { "epoch": 0.28, "grad_norm": 5.785311864888827, "learning_rate": 4.224691840121042e-06, "loss": 0.182, "step": 443 }, { "epoch": 0.28, "grad_norm": 6.099194095093149, "learning_rate": 4.220979344297901e-06, "loss": 0.2243, "step": 444 }, { "epoch": 0.28, "grad_norm": 4.742114297468367, "learning_rate": 4.217259620927476e-06, "loss": 0.0591, "step": 445 }, { "epoch": 0.28, "grad_norm": 9.065907225872378, "learning_rate": 4.213532685631384e-06, "loss": 0.0399, "step": 446 }, { "epoch": 0.28, "grad_norm": 2.837702766601384, "learning_rate": 4.209798554061527e-06, "loss": 0.0264, "step": 447 }, { "epoch": 0.28, "grad_norm": 8.624665627010382, "learning_rate": 4.206057241900029e-06, "loss": 0.4901, "step": 448 }, { "epoch": 0.28, "grad_norm": 8.567189512612613, "learning_rate": 4.202308764859171e-06, "loss": 0.7284, "step": 449 }, { "epoch": 0.28, "grad_norm": 1.410828152936395, "learning_rate": 4.198553138681324e-06, "loss": 0.0097, "step": 450 }, { "epoch": 0.29, "grad_norm": 5.488429296268352, "learning_rate": 4.194790379138882e-06, "loss": 0.0699, "step": 451 }, { "epoch": 0.29, "grad_norm": 10.055795272114857, "learning_rate": 4.191020502034198e-06, "loss": 0.6005, "step": 452 }, { "epoch": 0.29, "grad_norm": 9.385892110136627, "learning_rate": 4.187243523199518e-06, "loss": 0.3304, "step": 453 }, { "epoch": 0.29, "grad_norm": 6.411432914363283, "learning_rate": 4.1834594584969084e-06, "loss": 0.297, "step": 454 }, { "epoch": 0.29, "grad_norm": 7.0139030263357505, "learning_rate": 4.179668323818198e-06, "loss": 0.39, "step": 455 }, { "epoch": 0.29, "grad_norm": 5.27903800552657, "learning_rate": 4.175870135084905e-06, "loss": 0.0633, "step": 456 }, { "epoch": 0.29, "grad_norm": 12.097437412252207, "learning_rate": 4.1720649082481735e-06, "loss": 0.7544, "step": 457 }, { "epoch": 0.29, "grad_norm": 6.4988279019467114, "learning_rate": 4.168252659288704e-06, "loss": 0.2817, "step": 458 }, { "epoch": 0.29, "grad_norm": 3.6865387002995305, "learning_rate": 4.164433404216689e-06, "loss": 0.1691, "step": 459 }, { "epoch": 0.29, "grad_norm": 4.785666426884982, "learning_rate": 4.160607159071744e-06, "loss": 0.1772, "step": 460 }, { "epoch": 0.29, "grad_norm": 7.605003777677964, "learning_rate": 4.156773939922841e-06, "loss": 0.6325, "step": 461 }, { "epoch": 0.29, "grad_norm": 5.118944638083486, "learning_rate": 4.1529337628682375e-06, "loss": 0.0584, "step": 462 }, { "epoch": 0.29, "grad_norm": 9.633787429121124, "learning_rate": 4.149086644035417e-06, "loss": 0.6426, "step": 463 }, { "epoch": 0.29, "grad_norm": 7.196798926948138, "learning_rate": 4.145232599581011e-06, "loss": 0.4067, "step": 464 }, { "epoch": 0.29, "grad_norm": 7.0593519368329, "learning_rate": 4.14137164569074e-06, "loss": 0.106, "step": 465 }, { "epoch": 0.29, "grad_norm": 9.188050960706681, "learning_rate": 4.137503798579341e-06, "loss": 0.5153, "step": 466 }, { "epoch": 0.3, "grad_norm": 5.90584308494099, "learning_rate": 4.1336290744905e-06, "loss": 0.2388, "step": 467 }, { "epoch": 0.3, "grad_norm": 6.919577602658904, "learning_rate": 4.129747489696781e-06, "loss": 0.1712, "step": 468 }, { "epoch": 0.3, "grad_norm": 0.7047643412375464, "learning_rate": 4.125859060499569e-06, "loss": 0.0032, "step": 469 }, { "epoch": 0.3, "grad_norm": 8.85964324921822, "learning_rate": 4.121963803228982e-06, "loss": 0.5518, "step": 470 }, { "epoch": 0.3, "grad_norm": 7.757176675700209, "learning_rate": 4.118061734243824e-06, "loss": 0.3864, "step": 471 }, { "epoch": 0.3, "grad_norm": 7.3281177122165255, "learning_rate": 4.1141528699314995e-06, "loss": 0.2804, "step": 472 }, { "epoch": 0.3, "grad_norm": 8.039482913873313, "learning_rate": 4.110237226707953e-06, "loss": 0.31, "step": 473 }, { "epoch": 0.3, "grad_norm": 8.70171702311622, "learning_rate": 4.1063148210176e-06, "loss": 0.4733, "step": 474 }, { "epoch": 0.3, "grad_norm": 5.856575828778487, "learning_rate": 4.102385669333252e-06, "loss": 0.3028, "step": 475 }, { "epoch": 0.3, "grad_norm": 5.473986924735284, "learning_rate": 4.098449788156056e-06, "loss": 0.2117, "step": 476 }, { "epoch": 0.3, "grad_norm": 5.875672322214464, "learning_rate": 4.094507194015417e-06, "loss": 0.2796, "step": 477 }, { "epoch": 0.3, "grad_norm": 7.169668352658258, "learning_rate": 4.090557903468935e-06, "loss": 0.4738, "step": 478 }, { "epoch": 0.3, "grad_norm": 4.306268363658799, "learning_rate": 4.086601933102331e-06, "loss": 0.0848, "step": 479 }, { "epoch": 0.3, "grad_norm": 7.081158592125722, "learning_rate": 4.08263929952938e-06, "loss": 0.205, "step": 480 }, { "epoch": 0.3, "grad_norm": 3.955295735718712, "learning_rate": 4.07867001939184e-06, "loss": 0.0559, "step": 481 }, { "epoch": 0.3, "grad_norm": 10.033866374746554, "learning_rate": 4.0746941093593815e-06, "loss": 0.5994, "step": 482 }, { "epoch": 0.31, "grad_norm": 6.338877041882072, "learning_rate": 4.070711586129519e-06, "loss": 0.1351, "step": 483 }, { "epoch": 0.31, "grad_norm": 3.948280351271775, "learning_rate": 4.066722466427541e-06, "loss": 0.123, "step": 484 }, { "epoch": 0.31, "grad_norm": 3.938709860318418, "learning_rate": 4.062726767006439e-06, "loss": 0.0669, "step": 485 }, { "epoch": 0.31, "grad_norm": 9.28614179863707, "learning_rate": 4.058724504646834e-06, "loss": 0.4046, "step": 486 }, { "epoch": 0.31, "grad_norm": 5.566973312673025, "learning_rate": 4.054715696156914e-06, "loss": 0.1975, "step": 487 }, { "epoch": 0.31, "grad_norm": 6.301531941486158, "learning_rate": 4.050700358372357e-06, "loss": 0.2406, "step": 488 }, { "epoch": 0.31, "grad_norm": 5.607999415137768, "learning_rate": 4.046678508156259e-06, "loss": 0.3169, "step": 489 }, { "epoch": 0.31, "grad_norm": 8.548200487083982, "learning_rate": 4.042650162399069e-06, "loss": 0.3722, "step": 490 }, { "epoch": 0.31, "grad_norm": 6.466660903980532, "learning_rate": 4.038615338018515e-06, "loss": 0.2581, "step": 491 }, { "epoch": 0.31, "grad_norm": 3.7044290427508364, "learning_rate": 4.034574051959532e-06, "loss": 0.0706, "step": 492 }, { "epoch": 0.31, "grad_norm": 6.211011816575825, "learning_rate": 4.030526321194194e-06, "loss": 0.1761, "step": 493 }, { "epoch": 0.31, "grad_norm": 6.092352291127704, "learning_rate": 4.026472162721636e-06, "loss": 0.3531, "step": 494 }, { "epoch": 0.31, "grad_norm": 5.452091851505345, "learning_rate": 4.022411593567992e-06, "loss": 0.1989, "step": 495 }, { "epoch": 0.31, "grad_norm": 5.931724550258274, "learning_rate": 4.018344630786318e-06, "loss": 0.1829, "step": 496 }, { "epoch": 0.31, "grad_norm": 7.408700014643758, "learning_rate": 4.0142712914565186e-06, "loss": 0.4181, "step": 497 }, { "epoch": 0.31, "grad_norm": 6.460201198409054, "learning_rate": 4.0101915926852795e-06, "loss": 0.2097, "step": 498 }, { "epoch": 0.32, "grad_norm": 6.572793789141728, "learning_rate": 4.006105551605995e-06, "loss": 0.1717, "step": 499 }, { "epoch": 0.32, "grad_norm": 3.778938028021749, "learning_rate": 4.002013185378694e-06, "loss": 0.1805, "step": 500 }, { "epoch": 0.32, "grad_norm": 6.484465596129168, "learning_rate": 3.997914511189968e-06, "loss": 0.5012, "step": 501 }, { "epoch": 0.32, "grad_norm": 9.424697782087838, "learning_rate": 3.993809546252901e-06, "loss": 0.408, "step": 502 }, { "epoch": 0.32, "grad_norm": 8.074791343992947, "learning_rate": 3.989698307806995e-06, "loss": 0.498, "step": 503 }, { "epoch": 0.32, "grad_norm": 10.072288538307756, "learning_rate": 3.9855808131181e-06, "loss": 0.4073, "step": 504 }, { "epoch": 0.32, "grad_norm": 5.087831955935733, "learning_rate": 3.981457079478341e-06, "loss": 0.1355, "step": 505 }, { "epoch": 0.32, "grad_norm": 7.2255848156211115, "learning_rate": 3.9773271242060405e-06, "loss": 0.3494, "step": 506 }, { "epoch": 0.32, "grad_norm": 4.962668481844557, "learning_rate": 3.973190964645655e-06, "loss": 0.1183, "step": 507 }, { "epoch": 0.32, "grad_norm": 4.452887706122831, "learning_rate": 3.969048618167693e-06, "loss": 0.0991, "step": 508 }, { "epoch": 0.32, "grad_norm": 8.159313342804909, "learning_rate": 3.964900102168647e-06, "loss": 0.4468, "step": 509 }, { "epoch": 0.32, "grad_norm": 6.423591477189401, "learning_rate": 3.9607454340709215e-06, "loss": 0.3173, "step": 510 }, { "epoch": 0.32, "grad_norm": 4.243627145868191, "learning_rate": 3.956584631322755e-06, "loss": 0.2281, "step": 511 }, { "epoch": 0.32, "grad_norm": 6.854213014888493, "learning_rate": 3.952417711398151e-06, "loss": 0.3971, "step": 512 }, { "epoch": 0.32, "grad_norm": 9.380350035946153, "learning_rate": 3.948244691796803e-06, "loss": 0.5623, "step": 513 }, { "epoch": 0.32, "grad_norm": 6.3256107563167125, "learning_rate": 3.944065590044021e-06, "loss": 0.2549, "step": 514 }, { "epoch": 0.33, "grad_norm": 5.693745296740134, "learning_rate": 3.939880423690657e-06, "loss": 0.1227, "step": 515 }, { "epoch": 0.33, "grad_norm": 6.590973328184677, "learning_rate": 3.935689210313036e-06, "loss": 0.3157, "step": 516 }, { "epoch": 0.33, "grad_norm": 8.52645929549164, "learning_rate": 3.931491967512872e-06, "loss": 0.5785, "step": 517 }, { "epoch": 0.33, "grad_norm": 7.141805538695314, "learning_rate": 3.927288712917209e-06, "loss": 0.185, "step": 518 }, { "epoch": 0.33, "grad_norm": 8.716130126233855, "learning_rate": 3.92307946417833e-06, "loss": 0.3371, "step": 519 }, { "epoch": 0.33, "grad_norm": 8.426152942224686, "learning_rate": 3.918864238973697e-06, "loss": 0.5731, "step": 520 }, { "epoch": 0.33, "grad_norm": 7.926517127155523, "learning_rate": 3.91464305500587e-06, "loss": 0.2902, "step": 521 }, { "epoch": 0.33, "grad_norm": 8.19851060692581, "learning_rate": 3.910415930002433e-06, "loss": 0.4182, "step": 522 }, { "epoch": 0.33, "grad_norm": 5.025790646729712, "learning_rate": 3.906182881715922e-06, "loss": 0.1609, "step": 523 }, { "epoch": 0.33, "grad_norm": 5.634598912990908, "learning_rate": 3.901943927923745e-06, "loss": 0.2222, "step": 524 }, { "epoch": 0.33, "grad_norm": 3.7465537448124984, "learning_rate": 3.897699086428117e-06, "loss": 0.2473, "step": 525 }, { "epoch": 0.33, "grad_norm": 6.65937294926426, "learning_rate": 3.893448375055973e-06, "loss": 0.1741, "step": 526 }, { "epoch": 0.33, "grad_norm": 6.670711371192274, "learning_rate": 3.889191811658907e-06, "loss": 0.1265, "step": 527 }, { "epoch": 0.33, "grad_norm": 8.077946496758454, "learning_rate": 3.884929414113082e-06, "loss": 0.3901, "step": 528 }, { "epoch": 0.33, "grad_norm": 3.7458163766332184, "learning_rate": 3.880661200319168e-06, "loss": 0.197, "step": 529 }, { "epoch": 0.34, "grad_norm": 6.553998108394142, "learning_rate": 3.876387188202258e-06, "loss": 0.3539, "step": 530 }, { "epoch": 0.34, "grad_norm": 9.285249920646573, "learning_rate": 3.872107395711799e-06, "loss": 0.5698, "step": 531 }, { "epoch": 0.34, "grad_norm": 5.359354002441556, "learning_rate": 3.867821840821509e-06, "loss": 0.221, "step": 532 }, { "epoch": 0.34, "grad_norm": 8.315099782889805, "learning_rate": 3.863530541529313e-06, "loss": 0.2987, "step": 533 }, { "epoch": 0.34, "grad_norm": 10.926422263998903, "learning_rate": 3.859233515857253e-06, "loss": 0.7797, "step": 534 }, { "epoch": 0.34, "grad_norm": 3.8648743772632983, "learning_rate": 3.854930781851426e-06, "loss": 0.0488, "step": 535 }, { "epoch": 0.34, "grad_norm": 7.375334134048924, "learning_rate": 3.8506223575819e-06, "loss": 0.4402, "step": 536 }, { "epoch": 0.34, "grad_norm": 6.648127081288748, "learning_rate": 3.846308261142639e-06, "loss": 0.4045, "step": 537 }, { "epoch": 0.34, "grad_norm": 12.550294327039207, "learning_rate": 3.84198851065143e-06, "loss": 0.2589, "step": 538 }, { "epoch": 0.34, "grad_norm": 4.057065409798057, "learning_rate": 3.837663124249803e-06, "loss": 0.0294, "step": 539 }, { "epoch": 0.34, "grad_norm": 6.565796233190705, "learning_rate": 3.833332120102961e-06, "loss": 0.1514, "step": 540 }, { "epoch": 0.34, "grad_norm": 4.971535339837574, "learning_rate": 3.828995516399695e-06, "loss": 0.1534, "step": 541 }, { "epoch": 0.34, "grad_norm": 7.567438704082053, "learning_rate": 3.824653331352316e-06, "loss": 0.2992, "step": 542 }, { "epoch": 0.34, "grad_norm": 7.496221480640524, "learning_rate": 3.820305583196571e-06, "loss": 0.1934, "step": 543 }, { "epoch": 0.34, "grad_norm": 6.530072411782659, "learning_rate": 3.815952290191575e-06, "loss": 0.2904, "step": 544 }, { "epoch": 0.34, "grad_norm": 4.382579450480106, "learning_rate": 3.8115934706197248e-06, "loss": 0.2244, "step": 545 }, { "epoch": 0.35, "grad_norm": 5.891447702931509, "learning_rate": 3.807229142786631e-06, "loss": 0.3195, "step": 546 }, { "epoch": 0.35, "grad_norm": 7.021732382181241, "learning_rate": 3.8028593250210337e-06, "loss": 0.2305, "step": 547 }, { "epoch": 0.35, "grad_norm": 7.649333461072385, "learning_rate": 3.798484035674732e-06, "loss": 0.3756, "step": 548 }, { "epoch": 0.35, "grad_norm": 7.815812286121932, "learning_rate": 3.7941032931225007e-06, "loss": 0.3032, "step": 549 }, { "epoch": 0.35, "grad_norm": 3.376647970732145, "learning_rate": 3.789717115762019e-06, "loss": 0.0415, "step": 550 }, { "epoch": 0.35, "grad_norm": 7.282498886993564, "learning_rate": 3.7853255220137885e-06, "loss": 0.1783, "step": 551 }, { "epoch": 0.35, "grad_norm": 6.842565982766671, "learning_rate": 3.78092853032106e-06, "loss": 0.4399, "step": 552 }, { "epoch": 0.35, "grad_norm": 6.064656925923377, "learning_rate": 3.7765261591497502e-06, "loss": 0.3379, "step": 553 }, { "epoch": 0.35, "grad_norm": 4.837069722441042, "learning_rate": 3.7721184269883735e-06, "loss": 0.3289, "step": 554 }, { "epoch": 0.35, "grad_norm": 6.809046998862238, "learning_rate": 3.7677053523479534e-06, "loss": 0.1118, "step": 555 }, { "epoch": 0.35, "grad_norm": 7.81892192111834, "learning_rate": 3.763286953761952e-06, "loss": 0.4513, "step": 556 }, { "epoch": 0.35, "grad_norm": 9.667934027715425, "learning_rate": 3.758863249786191e-06, "loss": 0.7158, "step": 557 }, { "epoch": 0.35, "grad_norm": 9.750197579753873, "learning_rate": 3.754434258998772e-06, "loss": 0.5956, "step": 558 }, { "epoch": 0.35, "grad_norm": 9.725275627094032, "learning_rate": 3.7500000000000005e-06, "loss": 0.7041, "step": 559 }, { "epoch": 0.35, "grad_norm": 4.155266207947589, "learning_rate": 3.745560491412305e-06, "loss": 0.1983, "step": 560 }, { "epoch": 0.35, "grad_norm": 6.481014106894056, "learning_rate": 3.7411157518801622e-06, "loss": 0.2349, "step": 561 }, { "epoch": 0.36, "grad_norm": 7.42416083220858, "learning_rate": 3.7366658000700164e-06, "loss": 0.1869, "step": 562 }, { "epoch": 0.36, "grad_norm": 5.608690310373774, "learning_rate": 3.732210654670201e-06, "loss": 0.1505, "step": 563 }, { "epoch": 0.36, "grad_norm": 5.146990606032226, "learning_rate": 3.7277503343908627e-06, "loss": 0.1055, "step": 564 }, { "epoch": 0.36, "grad_norm": 8.1348605843047, "learning_rate": 3.72328485796388e-06, "loss": 0.4122, "step": 565 }, { "epoch": 0.36, "grad_norm": 9.165249616737636, "learning_rate": 3.718814244142784e-06, "loss": 0.4394, "step": 566 }, { "epoch": 0.36, "grad_norm": 5.344305355370828, "learning_rate": 3.714338511702683e-06, "loss": 0.0903, "step": 567 }, { "epoch": 0.36, "grad_norm": 7.450327094149253, "learning_rate": 3.709857679440182e-06, "loss": 0.3096, "step": 568 }, { "epoch": 0.36, "grad_norm": 8.598028498160248, "learning_rate": 3.705371766173303e-06, "loss": 0.9059, "step": 569 }, { "epoch": 0.36, "grad_norm": 9.144789465998734, "learning_rate": 3.700880790741405e-06, "loss": 0.5633, "step": 570 }, { "epoch": 0.36, "grad_norm": 9.592664427333508, "learning_rate": 3.69638477200511e-06, "loss": 0.6696, "step": 571 }, { "epoch": 0.36, "grad_norm": 7.551634999264025, "learning_rate": 3.691883728846216e-06, "loss": 0.3798, "step": 572 }, { "epoch": 0.36, "grad_norm": 9.114513904125447, "learning_rate": 3.6873776801676265e-06, "loss": 0.5475, "step": 573 }, { "epoch": 0.36, "grad_norm": 7.077178015949125, "learning_rate": 3.6828666448932615e-06, "loss": 0.4892, "step": 574 }, { "epoch": 0.36, "grad_norm": 2.7550479335042, "learning_rate": 3.6783506419679878e-06, "loss": 0.0305, "step": 575 }, { "epoch": 0.36, "grad_norm": 7.619133865912793, "learning_rate": 3.6738296903575303e-06, "loss": 0.5245, "step": 576 }, { "epoch": 0.36, "grad_norm": 9.816562856266707, "learning_rate": 3.669303809048401e-06, "loss": 0.5877, "step": 577 }, { "epoch": 0.37, "grad_norm": 6.058456488209786, "learning_rate": 3.6647730170478124e-06, "loss": 0.3524, "step": 578 }, { "epoch": 0.37, "grad_norm": 8.488515837473997, "learning_rate": 3.6602373333836004e-06, "loss": 0.5515, "step": 579 }, { "epoch": 0.37, "grad_norm": 6.008600428866071, "learning_rate": 3.655696777104146e-06, "loss": 0.3129, "step": 580 }, { "epoch": 0.37, "grad_norm": 7.694389915349598, "learning_rate": 3.651151367278291e-06, "loss": 0.4325, "step": 581 }, { "epoch": 0.37, "grad_norm": 3.3934275212234715, "learning_rate": 3.646601122995263e-06, "loss": 0.0488, "step": 582 }, { "epoch": 0.37, "grad_norm": 4.194649585369792, "learning_rate": 3.6420460633645904e-06, "loss": 0.131, "step": 583 }, { "epoch": 0.37, "grad_norm": 4.970086451801307, "learning_rate": 3.637486207516027e-06, "loss": 0.1761, "step": 584 }, { "epoch": 0.37, "grad_norm": 6.756777151540958, "learning_rate": 3.6329215745994656e-06, "loss": 0.2108, "step": 585 }, { "epoch": 0.37, "grad_norm": 3.328596914440564, "learning_rate": 3.6283521837848655e-06, "loss": 0.0486, "step": 586 }, { "epoch": 0.37, "grad_norm": 6.35619647848426, "learning_rate": 3.623778054262165e-06, "loss": 0.2675, "step": 587 }, { "epoch": 0.37, "grad_norm": 7.5291643855885395, "learning_rate": 3.619199205241204e-06, "loss": 0.3329, "step": 588 }, { "epoch": 0.37, "grad_norm": 8.11179299587398, "learning_rate": 3.614615655951641e-06, "loss": 0.1971, "step": 589 }, { "epoch": 0.37, "grad_norm": 5.822432696361728, "learning_rate": 3.6100274256428773e-06, "loss": 0.2352, "step": 590 }, { "epoch": 0.37, "grad_norm": 5.646554973507976, "learning_rate": 3.605434533583971e-06, "loss": 0.254, "step": 591 }, { "epoch": 0.37, "grad_norm": 6.059289770525368, "learning_rate": 3.6008369990635583e-06, "loss": 0.2312, "step": 592 }, { "epoch": 0.37, "grad_norm": 6.808202665546631, "learning_rate": 3.596234841389771e-06, "loss": 0.2267, "step": 593 }, { "epoch": 0.38, "grad_norm": 9.383365408355296, "learning_rate": 3.5916280798901604e-06, "loss": 0.4627, "step": 594 }, { "epoch": 0.38, "grad_norm": 6.287491311888257, "learning_rate": 3.5870167339116075e-06, "loss": 0.3406, "step": 595 }, { "epoch": 0.38, "grad_norm": 5.794404112465292, "learning_rate": 3.58240082282025e-06, "loss": 0.2973, "step": 596 }, { "epoch": 0.38, "grad_norm": 8.69207379810026, "learning_rate": 3.577780366001396e-06, "loss": 0.463, "step": 597 }, { "epoch": 0.38, "grad_norm": 9.941540550418985, "learning_rate": 3.5731553828594446e-06, "loss": 0.3988, "step": 598 }, { "epoch": 0.38, "grad_norm": 7.707521495510718, "learning_rate": 3.5685258928178054e-06, "loss": 0.4192, "step": 599 }, { "epoch": 0.38, "grad_norm": 10.236522839387826, "learning_rate": 3.5638919153188125e-06, "loss": 0.5069, "step": 600 }, { "epoch": 0.38, "grad_norm": 5.299558426797743, "learning_rate": 3.5592534698236477e-06, "loss": 0.1596, "step": 601 }, { "epoch": 0.38, "grad_norm": 3.0906154281421703, "learning_rate": 3.554610575812257e-06, "loss": 0.0426, "step": 602 }, { "epoch": 0.38, "grad_norm": 0.0012822369296657781, "learning_rate": 3.5499632527832683e-06, "loss": 0.0, "step": 603 }, { "epoch": 0.38, "grad_norm": 4.57191123713371, "learning_rate": 3.54531152025391e-06, "loss": 0.2251, "step": 604 }, { "epoch": 0.38, "grad_norm": 7.31522755412989, "learning_rate": 3.54065539775993e-06, "loss": 0.3553, "step": 605 }, { "epoch": 0.38, "grad_norm": 6.726450211388549, "learning_rate": 3.535994904855509e-06, "loss": 0.3344, "step": 606 }, { "epoch": 0.38, "grad_norm": 9.912298528059726, "learning_rate": 3.5313300611131874e-06, "loss": 0.4693, "step": 607 }, { "epoch": 0.38, "grad_norm": 7.197976877169555, "learning_rate": 3.526660886123773e-06, "loss": 0.3403, "step": 608 }, { "epoch": 0.39, "grad_norm": 4.43994159419505, "learning_rate": 3.521987399496266e-06, "loss": 0.0843, "step": 609 }, { "epoch": 0.39, "grad_norm": 4.22676489269629, "learning_rate": 3.517309620857773e-06, "loss": 0.0739, "step": 610 }, { "epoch": 0.39, "grad_norm": 6.4999483546626164, "learning_rate": 3.5126275698534255e-06, "loss": 0.413, "step": 611 }, { "epoch": 0.39, "grad_norm": 5.044424306425061, "learning_rate": 3.507941266146299e-06, "loss": 0.2406, "step": 612 }, { "epoch": 0.39, "grad_norm": 9.330248413703988, "learning_rate": 3.5032507294173275e-06, "loss": 0.5586, "step": 613 }, { "epoch": 0.39, "grad_norm": 4.962067530930888, "learning_rate": 3.4985559793652223e-06, "loss": 0.152, "step": 614 }, { "epoch": 0.39, "grad_norm": 5.772229224355687, "learning_rate": 3.4938570357063906e-06, "loss": 0.1259, "step": 615 }, { "epoch": 0.39, "grad_norm": 2.8995872335347888, "learning_rate": 3.489153918174849e-06, "loss": 0.0429, "step": 616 }, { "epoch": 0.39, "grad_norm": 6.154732299238771, "learning_rate": 3.484446646522146e-06, "loss": 0.2469, "step": 617 }, { "epoch": 0.39, "grad_norm": 5.221432299199805, "learning_rate": 3.4797352405172735e-06, "loss": 0.1814, "step": 618 }, { "epoch": 0.39, "grad_norm": 8.765988374681921, "learning_rate": 3.475019719946588e-06, "loss": 0.5377, "step": 619 }, { "epoch": 0.39, "grad_norm": 7.284518969277898, "learning_rate": 3.4703001046137253e-06, "loss": 0.4524, "step": 620 }, { "epoch": 0.39, "grad_norm": 4.491568826283332, "learning_rate": 3.4655764143395163e-06, "loss": 0.125, "step": 621 }, { "epoch": 0.39, "grad_norm": 9.229204050050088, "learning_rate": 3.4608486689619087e-06, "loss": 0.6578, "step": 622 }, { "epoch": 0.39, "grad_norm": 8.420714459405207, "learning_rate": 3.4561168883358766e-06, "loss": 0.4392, "step": 623 }, { "epoch": 0.39, "grad_norm": 5.791198748158608, "learning_rate": 3.4513810923333436e-06, "loss": 0.2087, "step": 624 }, { "epoch": 0.4, "grad_norm": 4.0539738315514935, "learning_rate": 3.4466413008430943e-06, "loss": 0.204, "step": 625 }, { "epoch": 0.4, "grad_norm": 8.419946566523178, "learning_rate": 3.4418975337706957e-06, "loss": 0.431, "step": 626 }, { "epoch": 0.4, "grad_norm": 8.696142933306922, "learning_rate": 3.4371498110384083e-06, "loss": 0.6483, "step": 627 }, { "epoch": 0.4, "grad_norm": 5.394725180088116, "learning_rate": 3.4323981525851075e-06, "loss": 0.2136, "step": 628 }, { "epoch": 0.4, "grad_norm": 7.8987016794037, "learning_rate": 3.4276425783661945e-06, "loss": 0.3772, "step": 629 }, { "epoch": 0.4, "grad_norm": 5.807960952942797, "learning_rate": 3.4228831083535185e-06, "loss": 0.267, "step": 630 }, { "epoch": 0.4, "grad_norm": 4.267958900346725, "learning_rate": 3.4181197625352874e-06, "loss": 0.131, "step": 631 }, { "epoch": 0.4, "grad_norm": 4.30303447021465, "learning_rate": 3.4133525609159883e-06, "loss": 0.1255, "step": 632 }, { "epoch": 0.4, "grad_norm": 7.253820530760357, "learning_rate": 3.408581523516299e-06, "loss": 0.3077, "step": 633 }, { "epoch": 0.4, "grad_norm": 3.6422984979801796, "learning_rate": 3.403806670373008e-06, "loss": 0.0678, "step": 634 }, { "epoch": 0.4, "grad_norm": 7.2746809112730055, "learning_rate": 3.399028021538929e-06, "loss": 0.3641, "step": 635 }, { "epoch": 0.4, "grad_norm": 6.7660628038378805, "learning_rate": 3.3942455970828146e-06, "loss": 0.3673, "step": 636 }, { "epoch": 0.4, "grad_norm": 6.259821689491524, "learning_rate": 3.3894594170892747e-06, "loss": 0.216, "step": 637 }, { "epoch": 0.4, "grad_norm": 4.251441711033382, "learning_rate": 3.3846695016586915e-06, "loss": 0.1541, "step": 638 }, { "epoch": 0.4, "grad_norm": 8.74081112796774, "learning_rate": 3.3798758709071354e-06, "loss": 0.5547, "step": 639 }, { "epoch": 0.4, "grad_norm": 4.785835410986747, "learning_rate": 3.375078544966278e-06, "loss": 0.1474, "step": 640 }, { "epoch": 0.41, "grad_norm": 3.74835028918615, "learning_rate": 3.3702775439833126e-06, "loss": 0.0708, "step": 641 }, { "epoch": 0.41, "grad_norm": 8.46243647631831, "learning_rate": 3.3654728881208607e-06, "loss": 0.458, "step": 642 }, { "epoch": 0.41, "grad_norm": 9.001441840071807, "learning_rate": 3.360664597556901e-06, "loss": 0.4055, "step": 643 }, { "epoch": 0.41, "grad_norm": 10.33990609928784, "learning_rate": 3.3558526924846695e-06, "loss": 0.4857, "step": 644 }, { "epoch": 0.41, "grad_norm": 8.910268419120786, "learning_rate": 3.3510371931125875e-06, "loss": 0.4183, "step": 645 }, { "epoch": 0.41, "grad_norm": 8.582500627241547, "learning_rate": 3.346218119664166e-06, "loss": 0.4943, "step": 646 }, { "epoch": 0.41, "grad_norm": 5.771240807572005, "learning_rate": 3.3413954923779313e-06, "loss": 0.1844, "step": 647 }, { "epoch": 0.41, "grad_norm": 3.9605775327217363, "learning_rate": 3.33656933150733e-06, "loss": 0.2458, "step": 648 }, { "epoch": 0.41, "grad_norm": 4.461514250287404, "learning_rate": 3.3317396573206528e-06, "loss": 0.1039, "step": 649 }, { "epoch": 0.41, "grad_norm": 7.6908110644522765, "learning_rate": 3.326906490100941e-06, "loss": 0.5968, "step": 650 }, { "epoch": 0.41, "grad_norm": 4.353282102445988, "learning_rate": 3.3220698501459082e-06, "loss": 0.0937, "step": 651 }, { "epoch": 0.41, "grad_norm": 5.8408826025195895, "learning_rate": 3.3172297577678515e-06, "loss": 0.2298, "step": 652 }, { "epoch": 0.41, "grad_norm": 8.534472852360349, "learning_rate": 3.3123862332935674e-06, "loss": 0.2475, "step": 653 }, { "epoch": 0.41, "grad_norm": 4.577137498820138, "learning_rate": 3.3075392970642655e-06, "loss": 0.0879, "step": 654 }, { "epoch": 0.41, "grad_norm": 5.33067748382168, "learning_rate": 3.3026889694354845e-06, "loss": 0.1781, "step": 655 }, { "epoch": 0.41, "grad_norm": 9.428135552177555, "learning_rate": 3.297835270777005e-06, "loss": 0.6271, "step": 656 }, { "epoch": 0.42, "grad_norm": 4.577155000695369, "learning_rate": 3.2929782214727657e-06, "loss": 0.0708, "step": 657 }, { "epoch": 0.42, "grad_norm": 10.735077192280396, "learning_rate": 3.2881178419207754e-06, "loss": 0.7341, "step": 658 }, { "epoch": 0.42, "grad_norm": 9.902137263626237, "learning_rate": 3.2832541525330307e-06, "loss": 0.3174, "step": 659 }, { "epoch": 0.42, "grad_norm": 8.704656332682939, "learning_rate": 3.2783871737354272e-06, "loss": 0.384, "step": 660 }, { "epoch": 0.42, "grad_norm": 7.20714266724514, "learning_rate": 3.2735169259676754e-06, "loss": 0.4284, "step": 661 }, { "epoch": 0.42, "grad_norm": 5.117987324182064, "learning_rate": 3.268643429683214e-06, "loss": 0.1292, "step": 662 }, { "epoch": 0.42, "grad_norm": 4.8741767383788135, "learning_rate": 3.263766705349125e-06, "loss": 0.2135, "step": 663 }, { "epoch": 0.42, "grad_norm": 7.504234644362891, "learning_rate": 3.2588867734460467e-06, "loss": 0.3344, "step": 664 }, { "epoch": 0.42, "grad_norm": 4.94017168989941, "learning_rate": 3.254003654468088e-06, "loss": 0.2153, "step": 665 }, { "epoch": 0.42, "grad_norm": 8.415277901569533, "learning_rate": 3.249117368922744e-06, "loss": 0.5916, "step": 666 }, { "epoch": 0.42, "grad_norm": 3.828794210678742, "learning_rate": 3.244227937330805e-06, "loss": 0.284, "step": 667 }, { "epoch": 0.42, "grad_norm": 6.107075212019553, "learning_rate": 3.2393353802262777e-06, "loss": 0.2186, "step": 668 }, { "epoch": 0.42, "grad_norm": 7.80001101370792, "learning_rate": 3.234439718156292e-06, "loss": 0.5572, "step": 669 }, { "epoch": 0.42, "grad_norm": 8.40616740927132, "learning_rate": 3.229540971681019e-06, "loss": 0.3547, "step": 670 }, { "epoch": 0.42, "grad_norm": 5.9635428038572424, "learning_rate": 3.224639161373582e-06, "loss": 0.1532, "step": 671 }, { "epoch": 0.42, "grad_norm": 7.384910519522065, "learning_rate": 3.2197343078199735e-06, "loss": 0.2449, "step": 672 }, { "epoch": 0.43, "grad_norm": 8.827283214844652, "learning_rate": 3.2148264316189638e-06, "loss": 0.5185, "step": 673 }, { "epoch": 0.43, "grad_norm": 7.861391449667313, "learning_rate": 3.2099155533820207e-06, "loss": 0.2988, "step": 674 }, { "epoch": 0.43, "grad_norm": 4.31015680758051, "learning_rate": 3.2050016937332166e-06, "loss": 0.1083, "step": 675 }, { "epoch": 0.43, "grad_norm": 8.312843258302744, "learning_rate": 3.2000848733091473e-06, "loss": 0.3997, "step": 676 }, { "epoch": 0.43, "grad_norm": 4.8860401441663655, "learning_rate": 3.1951651127588403e-06, "loss": 0.108, "step": 677 }, { "epoch": 0.43, "grad_norm": 5.732600215496707, "learning_rate": 3.190242432743673e-06, "loss": 0.2456, "step": 678 }, { "epoch": 0.43, "grad_norm": 7.165008116436, "learning_rate": 3.185316853937281e-06, "loss": 0.3988, "step": 679 }, { "epoch": 0.43, "grad_norm": 7.344600806765619, "learning_rate": 3.1803883970254773e-06, "loss": 0.2684, "step": 680 }, { "epoch": 0.43, "grad_norm": 8.47365976724781, "learning_rate": 3.17545708270616e-06, "loss": 0.5079, "step": 681 }, { "epoch": 0.43, "grad_norm": 4.518229330834231, "learning_rate": 3.1705229316892263e-06, "loss": 0.1876, "step": 682 }, { "epoch": 0.43, "grad_norm": 6.8916402401892185, "learning_rate": 3.1655859646964882e-06, "loss": 0.1962, "step": 683 }, { "epoch": 0.43, "grad_norm": 5.847496422957769, "learning_rate": 3.1606462024615838e-06, "loss": 0.1164, "step": 684 }, { "epoch": 0.43, "grad_norm": 6.399327552754383, "learning_rate": 3.1557036657298906e-06, "loss": 0.1209, "step": 685 }, { "epoch": 0.43, "grad_norm": 11.45055607274162, "learning_rate": 3.150758375258436e-06, "loss": 0.5516, "step": 686 }, { "epoch": 0.43, "grad_norm": 6.256604176803525, "learning_rate": 3.145810351815815e-06, "loss": 0.2048, "step": 687 }, { "epoch": 0.43, "grad_norm": 1.4313041843335819, "learning_rate": 3.140859616182098e-06, "loss": 0.0273, "step": 688 }, { "epoch": 0.44, "grad_norm": 3.83878147958253, "learning_rate": 3.1359061891487473e-06, "loss": 0.0585, "step": 689 }, { "epoch": 0.44, "grad_norm": 4.938524357249688, "learning_rate": 3.1309500915185265e-06, "loss": 0.0961, "step": 690 }, { "epoch": 0.44, "grad_norm": 7.987872468147123, "learning_rate": 3.125991344105417e-06, "loss": 0.41, "step": 691 }, { "epoch": 0.44, "grad_norm": 6.726756165094635, "learning_rate": 3.1210299677345256e-06, "loss": 0.2588, "step": 692 }, { "epoch": 0.44, "grad_norm": 7.351286182622627, "learning_rate": 3.1160659832420033e-06, "loss": 0.3137, "step": 693 }, { "epoch": 0.44, "grad_norm": 6.102458971034878, "learning_rate": 3.111099411474951e-06, "loss": 0.1375, "step": 694 }, { "epoch": 0.44, "grad_norm": 3.9289523398289603, "learning_rate": 3.106130273291338e-06, "loss": 0.2306, "step": 695 }, { "epoch": 0.44, "grad_norm": 5.286081124614515, "learning_rate": 3.1011585895599083e-06, "loss": 0.2769, "step": 696 }, { "epoch": 0.44, "grad_norm": 0.7685540670661409, "learning_rate": 3.0961843811601016e-06, "loss": 0.006, "step": 697 }, { "epoch": 0.44, "grad_norm": 5.358175160084128, "learning_rate": 3.091207668981955e-06, "loss": 0.1439, "step": 698 }, { "epoch": 0.44, "grad_norm": 6.539215633007578, "learning_rate": 3.0862284739260247e-06, "loss": 0.3104, "step": 699 }, { "epoch": 0.44, "grad_norm": 10.370554511640458, "learning_rate": 3.081246816903292e-06, "loss": 0.726, "step": 700 }, { "epoch": 0.44, "grad_norm": 6.460448905063149, "learning_rate": 3.076262718835077e-06, "loss": 0.0923, "step": 701 }, { "epoch": 0.44, "grad_norm": 3.6729677562799097, "learning_rate": 3.0712762006529533e-06, "loss": 0.1067, "step": 702 }, { "epoch": 0.44, "grad_norm": 5.322218678118633, "learning_rate": 3.0662872832986574e-06, "loss": 0.109, "step": 703 }, { "epoch": 0.45, "grad_norm": 2.078091269771491, "learning_rate": 3.0612959877240024e-06, "loss": 0.0223, "step": 704 }, { "epoch": 0.45, "grad_norm": 10.485777168486294, "learning_rate": 3.056302334890786e-06, "loss": 0.427, "step": 705 }, { "epoch": 0.45, "grad_norm": 6.579777815757636, "learning_rate": 3.051306345770711e-06, "loss": 0.2521, "step": 706 }, { "epoch": 0.45, "grad_norm": 6.687302345848805, "learning_rate": 3.0463080413452856e-06, "loss": 0.4441, "step": 707 }, { "epoch": 0.45, "grad_norm": 9.01544559505207, "learning_rate": 3.0413074426057472e-06, "loss": 0.4639, "step": 708 }, { "epoch": 0.45, "grad_norm": 5.927990817661355, "learning_rate": 3.0363045705529638e-06, "loss": 0.2244, "step": 709 }, { "epoch": 0.45, "grad_norm": 8.664134707205871, "learning_rate": 3.0312994461973544e-06, "loss": 0.6734, "step": 710 }, { "epoch": 0.45, "grad_norm": 7.311725542839963, "learning_rate": 3.0262920905587945e-06, "loss": 0.2679, "step": 711 }, { "epoch": 0.45, "grad_norm": 1.4334897203665629, "learning_rate": 3.021282524666531e-06, "loss": 0.0143, "step": 712 }, { "epoch": 0.45, "grad_norm": 7.786103845052255, "learning_rate": 3.016270769559094e-06, "loss": 0.3318, "step": 713 }, { "epoch": 0.45, "grad_norm": 4.136857532049963, "learning_rate": 3.011256846284206e-06, "loss": 0.0677, "step": 714 }, { "epoch": 0.45, "grad_norm": 4.6911212412125805, "learning_rate": 3.006240775898696e-06, "loss": 0.3064, "step": 715 }, { "epoch": 0.45, "grad_norm": 7.80582893891379, "learning_rate": 3.001222579468411e-06, "loss": 0.3059, "step": 716 }, { "epoch": 0.45, "grad_norm": 2.5434229572582017, "learning_rate": 2.9962022780681237e-06, "loss": 0.0193, "step": 717 }, { "epoch": 0.45, "grad_norm": 8.66356231649076, "learning_rate": 2.991179892781451e-06, "loss": 0.4298, "step": 718 }, { "epoch": 0.45, "grad_norm": 8.296739365210787, "learning_rate": 2.986155444700758e-06, "loss": 0.6059, "step": 719 }, { "epoch": 0.46, "grad_norm": 9.80721612064858, "learning_rate": 2.981128954927075e-06, "loss": 0.6935, "step": 720 }, { "epoch": 0.46, "grad_norm": 5.20388174424918, "learning_rate": 2.9761004445700063e-06, "loss": 0.1102, "step": 721 }, { "epoch": 0.46, "grad_norm": 8.274031807768026, "learning_rate": 2.9710699347476415e-06, "loss": 0.4478, "step": 722 }, { "epoch": 0.46, "grad_norm": 5.294650620703173, "learning_rate": 2.966037446586467e-06, "loss": 0.2077, "step": 723 }, { "epoch": 0.46, "grad_norm": 6.823860984468512, "learning_rate": 2.9610030012212783e-06, "loss": 0.1845, "step": 724 }, { "epoch": 0.46, "grad_norm": 6.146084055123757, "learning_rate": 2.955966619795091e-06, "loss": 0.3648, "step": 725 }, { "epoch": 0.46, "grad_norm": 8.51193924679981, "learning_rate": 2.95092832345905e-06, "loss": 0.4524, "step": 726 }, { "epoch": 0.46, "grad_norm": 3.0467729600301205, "learning_rate": 2.945888133372343e-06, "loss": 0.0294, "step": 727 }, { "epoch": 0.46, "grad_norm": 3.921526768030892, "learning_rate": 2.9408460707021114e-06, "loss": 0.1625, "step": 728 }, { "epoch": 0.46, "grad_norm": 6.797477748058581, "learning_rate": 2.93580215662336e-06, "loss": 0.2835, "step": 729 }, { "epoch": 0.46, "grad_norm": 3.7187894931266885, "learning_rate": 2.930756412318869e-06, "loss": 0.0789, "step": 730 }, { "epoch": 0.46, "grad_norm": 3.4511765726658745, "learning_rate": 2.925708858979106e-06, "loss": 0.052, "step": 731 }, { "epoch": 0.46, "grad_norm": 8.717261840262642, "learning_rate": 2.9206595178021337e-06, "loss": 0.5213, "step": 732 }, { "epoch": 0.46, "grad_norm": 7.412682681220795, "learning_rate": 2.9156084099935256e-06, "loss": 0.2256, "step": 733 }, { "epoch": 0.46, "grad_norm": 5.429462148257445, "learning_rate": 2.9105555567662724e-06, "loss": 0.2526, "step": 734 }, { "epoch": 0.46, "grad_norm": 6.758202267029308, "learning_rate": 2.9055009793406972e-06, "loss": 0.2356, "step": 735 }, { "epoch": 0.47, "grad_norm": 6.453784973534456, "learning_rate": 2.900444698944362e-06, "loss": 0.2218, "step": 736 }, { "epoch": 0.47, "grad_norm": 8.2798803960298, "learning_rate": 2.895386736811982e-06, "loss": 0.411, "step": 737 }, { "epoch": 0.47, "grad_norm": 7.179331242436376, "learning_rate": 2.8903271141853346e-06, "loss": 0.351, "step": 738 }, { "epoch": 0.47, "grad_norm": 6.694703687587645, "learning_rate": 2.885265852313171e-06, "loss": 0.4684, "step": 739 }, { "epoch": 0.47, "grad_norm": 2.552161224629805, "learning_rate": 2.8802029724511265e-06, "loss": 0.0449, "step": 740 }, { "epoch": 0.47, "grad_norm": 4.193204040919566, "learning_rate": 2.8751384958616318e-06, "loss": 0.1399, "step": 741 }, { "epoch": 0.47, "grad_norm": 2.1645583263001535, "learning_rate": 2.8700724438138237e-06, "loss": 0.0175, "step": 742 }, { "epoch": 0.47, "grad_norm": 4.745994284258654, "learning_rate": 2.8650048375834534e-06, "loss": 0.1841, "step": 743 }, { "epoch": 0.47, "grad_norm": 8.503711114693864, "learning_rate": 2.8599356984528014e-06, "loss": 0.433, "step": 744 }, { "epoch": 0.47, "grad_norm": 6.274015445144218, "learning_rate": 2.8548650477105843e-06, "loss": 0.2573, "step": 745 }, { "epoch": 0.47, "grad_norm": 4.404104589760424, "learning_rate": 2.8497929066518686e-06, "loss": 0.1154, "step": 746 }, { "epoch": 0.47, "grad_norm": 6.134366095017097, "learning_rate": 2.8447192965779778e-06, "loss": 0.1819, "step": 747 }, { "epoch": 0.47, "grad_norm": 7.7412682843096325, "learning_rate": 2.8396442387964075e-06, "loss": 0.2815, "step": 748 }, { "epoch": 0.47, "grad_norm": 6.955670728667691, "learning_rate": 2.8345677546207283e-06, "loss": 0.2243, "step": 749 }, { "epoch": 0.47, "grad_norm": 6.948314863134504, "learning_rate": 2.829489865370507e-06, "loss": 0.3371, "step": 750 }, { "epoch": 0.47, "grad_norm": 8.02287740711825, "learning_rate": 2.8244105923712074e-06, "loss": 0.4982, "step": 751 }, { "epoch": 0.48, "grad_norm": 1.6725793941111144, "learning_rate": 2.819329956954106e-06, "loss": 0.0248, "step": 752 }, { "epoch": 0.48, "grad_norm": 6.534539813275994, "learning_rate": 2.8142479804562e-06, "loss": 0.279, "step": 753 }, { "epoch": 0.48, "grad_norm": 5.853079625904447, "learning_rate": 2.809164684220121e-06, "loss": 0.2999, "step": 754 }, { "epoch": 0.48, "grad_norm": 8.246319961517067, "learning_rate": 2.8040800895940395e-06, "loss": 0.485, "step": 755 }, { "epoch": 0.48, "grad_norm": 0.0004816702790924369, "learning_rate": 2.7989942179315832e-06, "loss": 0.0, "step": 756 }, { "epoch": 0.48, "grad_norm": 4.5227088143646, "learning_rate": 2.7939070905917378e-06, "loss": 0.0986, "step": 757 }, { "epoch": 0.48, "grad_norm": 5.858583930974236, "learning_rate": 2.7888187289387675e-06, "loss": 0.3007, "step": 758 }, { "epoch": 0.48, "grad_norm": 7.512850116308536, "learning_rate": 2.7837291543421164e-06, "loss": 0.4299, "step": 759 }, { "epoch": 0.48, "grad_norm": 4.078627982855776, "learning_rate": 2.778638388176324e-06, "loss": 0.0617, "step": 760 }, { "epoch": 0.48, "grad_norm": 9.690378856516007, "learning_rate": 2.773546451820936e-06, "loss": 0.5397, "step": 761 }, { "epoch": 0.48, "grad_norm": 5.9189220302761445, "learning_rate": 2.768453366660408e-06, "loss": 0.139, "step": 762 }, { "epoch": 0.48, "grad_norm": 4.124384978713398, "learning_rate": 2.7633591540840243e-06, "loss": 0.0565, "step": 763 }, { "epoch": 0.48, "grad_norm": 8.884950030030152, "learning_rate": 2.7582638354858023e-06, "loss": 0.5086, "step": 764 }, { "epoch": 0.48, "grad_norm": 5.9719791964189675, "learning_rate": 2.753167432264404e-06, "loss": 0.153, "step": 765 }, { "epoch": 0.48, "grad_norm": 8.941511201055096, "learning_rate": 2.748069965823047e-06, "loss": 0.5118, "step": 766 }, { "epoch": 0.48, "grad_norm": 5.274503111374512, "learning_rate": 2.7429714575694145e-06, "loss": 0.1259, "step": 767 }, { "epoch": 0.49, "grad_norm": 2.626365578898379, "learning_rate": 2.7378719289155637e-06, "loss": 0.0231, "step": 768 }, { "epoch": 0.49, "grad_norm": 6.24206038669074, "learning_rate": 2.7327714012778385e-06, "loss": 0.2318, "step": 769 }, { "epoch": 0.49, "grad_norm": 3.074652402506702, "learning_rate": 2.7276698960767774e-06, "loss": 0.0255, "step": 770 }, { "epoch": 0.49, "grad_norm": 3.2154239395202375, "learning_rate": 2.7225674347370248e-06, "loss": 0.0465, "step": 771 }, { "epoch": 0.49, "grad_norm": 4.499591384985523, "learning_rate": 2.7174640386872394e-06, "loss": 0.0556, "step": 772 }, { "epoch": 0.49, "grad_norm": 5.238796086510255, "learning_rate": 2.712359729360007e-06, "loss": 0.3998, "step": 773 }, { "epoch": 0.49, "grad_norm": 4.01275841186851, "learning_rate": 2.707254528191749e-06, "loss": 0.1265, "step": 774 }, { "epoch": 0.49, "grad_norm": 7.4573307606659505, "learning_rate": 2.70214845662263e-06, "loss": 0.439, "step": 775 }, { "epoch": 0.49, "grad_norm": 5.171731537733631, "learning_rate": 2.697041536096472e-06, "loss": 0.2573, "step": 776 }, { "epoch": 0.49, "grad_norm": 6.5856258730395885, "learning_rate": 2.691933788060662e-06, "loss": 0.2674, "step": 777 }, { "epoch": 0.49, "grad_norm": 9.56001199506063, "learning_rate": 2.686825233966061e-06, "loss": 0.4671, "step": 778 }, { "epoch": 0.49, "grad_norm": 6.760523751639689, "learning_rate": 2.6817158952669177e-06, "loss": 0.3438, "step": 779 }, { "epoch": 0.49, "grad_norm": 5.095872121035828, "learning_rate": 2.6766057934207734e-06, "loss": 0.1716, "step": 780 }, { "epoch": 0.49, "grad_norm": 8.796274818557782, "learning_rate": 2.6714949498883745e-06, "loss": 0.6519, "step": 781 }, { "epoch": 0.49, "grad_norm": 7.858535994838844, "learning_rate": 2.6663833861335857e-06, "loss": 0.2606, "step": 782 }, { "epoch": 0.5, "grad_norm": 6.965167711461031, "learning_rate": 2.6612711236232915e-06, "loss": 0.1159, "step": 783 }, { "epoch": 0.5, "grad_norm": 6.785247889961181, "learning_rate": 2.6561581838273137e-06, "loss": 0.297, "step": 784 }, { "epoch": 0.5, "grad_norm": 7.4581581251424875, "learning_rate": 2.651044588218318e-06, "loss": 0.2434, "step": 785 }, { "epoch": 0.5, "grad_norm": 3.397183452172526, "learning_rate": 2.645930358271725e-06, "loss": 0.0306, "step": 786 }, { "epoch": 0.5, "grad_norm": 8.3505465939643, "learning_rate": 2.640815515465618e-06, "loss": 0.7295, "step": 787 }, { "epoch": 0.5, "grad_norm": 2.947761630417348, "learning_rate": 2.635700081280655e-06, "loss": 0.1265, "step": 788 }, { "epoch": 0.5, "grad_norm": 6.494343570607112, "learning_rate": 2.6305840771999764e-06, "loss": 0.2842, "step": 789 }, { "epoch": 0.5, "grad_norm": 8.981692874105951, "learning_rate": 2.6254675247091182e-06, "loss": 0.2263, "step": 790 }, { "epoch": 0.5, "grad_norm": 3.334599604098417, "learning_rate": 2.6203504452959174e-06, "loss": 0.0601, "step": 791 }, { "epoch": 0.5, "grad_norm": 7.1997284732058215, "learning_rate": 2.615232860450425e-06, "loss": 0.3632, "step": 792 }, { "epoch": 0.5, "grad_norm": 6.6590461409884, "learning_rate": 2.610114791664814e-06, "loss": 0.278, "step": 793 }, { "epoch": 0.5, "grad_norm": 0.9268985235479245, "learning_rate": 2.6049962604332913e-06, "loss": 0.0039, "step": 794 }, { "epoch": 0.5, "grad_norm": 5.657219060849721, "learning_rate": 2.599877288252003e-06, "loss": 0.1487, "step": 795 }, { "epoch": 0.5, "grad_norm": 4.163294966566469, "learning_rate": 2.59475789661895e-06, "loss": 0.0782, "step": 796 }, { "epoch": 0.5, "grad_norm": 5.695923990878327, "learning_rate": 2.589638107033894e-06, "loss": 0.1973, "step": 797 }, { "epoch": 0.5, "grad_norm": 7.506726364024847, "learning_rate": 2.5845179409982667e-06, "loss": 0.2817, "step": 798 }, { "epoch": 0.51, "grad_norm": 8.582582410043921, "learning_rate": 2.579397420015081e-06, "loss": 0.5584, "step": 799 }, { "epoch": 0.51, "grad_norm": 3.1432966569019563, "learning_rate": 2.574276565588843e-06, "loss": 0.0564, "step": 800 }, { "epoch": 0.51, "grad_norm": 4.198366328815605, "learning_rate": 2.569155399225456e-06, "loss": 0.1426, "step": 801 }, { "epoch": 0.51, "grad_norm": 4.369378538308984, "learning_rate": 2.5640339424321346e-06, "loss": 0.1068, "step": 802 }, { "epoch": 0.51, "grad_norm": 5.453453950333066, "learning_rate": 2.5589122167173137e-06, "loss": 0.2165, "step": 803 }, { "epoch": 0.51, "grad_norm": 7.850692533814966, "learning_rate": 2.553790243590556e-06, "loss": 0.535, "step": 804 }, { "epoch": 0.51, "grad_norm": 2.6188975654486732, "learning_rate": 2.548668044562465e-06, "loss": 0.0219, "step": 805 }, { "epoch": 0.51, "grad_norm": 5.189834620904021, "learning_rate": 2.5435456411445912e-06, "loss": 0.2881, "step": 806 }, { "epoch": 0.51, "grad_norm": 3.7281350070918187, "learning_rate": 2.5384230548493467e-06, "loss": 0.1371, "step": 807 }, { "epoch": 0.51, "grad_norm": 4.953050679031332, "learning_rate": 2.533300307189906e-06, "loss": 0.3427, "step": 808 }, { "epoch": 0.51, "grad_norm": 8.44563726176564, "learning_rate": 2.5281774196801274e-06, "loss": 0.2655, "step": 809 }, { "epoch": 0.51, "grad_norm": 5.097336518648792, "learning_rate": 2.5230544138344513e-06, "loss": 0.2631, "step": 810 }, { "epoch": 0.51, "grad_norm": 5.291798186983118, "learning_rate": 2.5179313111678203e-06, "loss": 0.1324, "step": 811 }, { "epoch": 0.51, "grad_norm": 4.519882570952501, "learning_rate": 2.5128081331955783e-06, "loss": 0.11, "step": 812 }, { "epoch": 0.51, "grad_norm": 9.156809089959282, "learning_rate": 2.50768490143339e-06, "loss": 0.5701, "step": 813 }, { "epoch": 0.51, "grad_norm": 5.1893533360395345, "learning_rate": 2.5025616373971416e-06, "loss": 0.1588, "step": 814 }, { "epoch": 0.52, "grad_norm": 2.055942862625886, "learning_rate": 2.497438362602859e-06, "loss": 0.0073, "step": 815 }, { "epoch": 0.52, "grad_norm": 3.216517887397222, "learning_rate": 2.492315098566612e-06, "loss": 0.0435, "step": 816 }, { "epoch": 0.52, "grad_norm": 4.571209889528808, "learning_rate": 2.487191866804422e-06, "loss": 0.104, "step": 817 }, { "epoch": 0.52, "grad_norm": 6.88362440674566, "learning_rate": 2.482068688832181e-06, "loss": 0.2601, "step": 818 }, { "epoch": 0.52, "grad_norm": 4.985565904274485, "learning_rate": 2.476945586165549e-06, "loss": 0.1496, "step": 819 }, { "epoch": 0.52, "grad_norm": 7.572992281599657, "learning_rate": 2.471822580319874e-06, "loss": 0.1586, "step": 820 }, { "epoch": 0.52, "grad_norm": 5.37683127501101, "learning_rate": 2.466699692810095e-06, "loss": 0.1533, "step": 821 }, { "epoch": 0.52, "grad_norm": 6.5080739274471355, "learning_rate": 2.461576945150655e-06, "loss": 0.1901, "step": 822 }, { "epoch": 0.52, "grad_norm": 7.258736181047285, "learning_rate": 2.456454358855409e-06, "loss": 0.2367, "step": 823 }, { "epoch": 0.52, "grad_norm": 7.601914275166003, "learning_rate": 2.4513319554375363e-06, "loss": 0.4312, "step": 824 }, { "epoch": 0.52, "grad_norm": 6.299126385864762, "learning_rate": 2.446209756409445e-06, "loss": 0.151, "step": 825 }, { "epoch": 0.52, "grad_norm": 7.878069082785739, "learning_rate": 2.4410877832826876e-06, "loss": 0.5147, "step": 826 }, { "epoch": 0.52, "grad_norm": 2.740606129145274, "learning_rate": 2.435966057567866e-06, "loss": 0.0242, "step": 827 }, { "epoch": 0.52, "grad_norm": 6.954831580420314, "learning_rate": 2.4308446007745452e-06, "loss": 0.1391, "step": 828 }, { "epoch": 0.52, "grad_norm": 0.0008022396197248448, "learning_rate": 2.425723434411158e-06, "loss": 0.0, "step": 829 }, { "epoch": 0.52, "grad_norm": 4.70913423911258, "learning_rate": 2.4206025799849196e-06, "loss": 0.18, "step": 830 }, { "epoch": 0.53, "grad_norm": 7.48847202621159, "learning_rate": 2.415482059001734e-06, "loss": 0.4761, "step": 831 }, { "epoch": 0.53, "grad_norm": 0.17612393672201163, "learning_rate": 2.4103618929661072e-06, "loss": 0.0007, "step": 832 }, { "epoch": 0.53, "grad_norm": 2.466116937604331, "learning_rate": 2.4052421033810503e-06, "loss": 0.0284, "step": 833 }, { "epoch": 0.53, "grad_norm": 6.58109403101183, "learning_rate": 2.4001227117479983e-06, "loss": 0.2474, "step": 834 }, { "epoch": 0.53, "grad_norm": 4.597120022029906, "learning_rate": 2.3950037395667096e-06, "loss": 0.1241, "step": 835 }, { "epoch": 0.53, "grad_norm": 10.072654765424927, "learning_rate": 2.3898852083351874e-06, "loss": 0.4918, "step": 836 }, { "epoch": 0.53, "grad_norm": 5.72258925675377, "learning_rate": 2.3847671395495757e-06, "loss": 0.2196, "step": 837 }, { "epoch": 0.53, "grad_norm": 8.889633793878003, "learning_rate": 2.379649554704084e-06, "loss": 0.2769, "step": 838 }, { "epoch": 0.53, "grad_norm": 5.786338422639018, "learning_rate": 2.3745324752908826e-06, "loss": 0.1242, "step": 839 }, { "epoch": 0.53, "grad_norm": 8.816598791234943, "learning_rate": 2.369415922800025e-06, "loss": 0.4444, "step": 840 }, { "epoch": 0.53, "grad_norm": 6.793592620683167, "learning_rate": 2.364299918719346e-06, "loss": 0.2588, "step": 841 }, { "epoch": 0.53, "grad_norm": 5.715642366127424, "learning_rate": 2.3591844845343828e-06, "loss": 0.2249, "step": 842 }, { "epoch": 0.53, "grad_norm": 7.19279245322333, "learning_rate": 2.3540696417282755e-06, "loss": 0.3364, "step": 843 }, { "epoch": 0.53, "grad_norm": 0.0007639544783230961, "learning_rate": 2.3489554117816827e-06, "loss": 0.0, "step": 844 }, { "epoch": 0.53, "grad_norm": 7.660356689196195, "learning_rate": 2.343841816172687e-06, "loss": 0.3094, "step": 845 }, { "epoch": 0.53, "grad_norm": 11.056742231194947, "learning_rate": 2.3387288763767097e-06, "loss": 0.7192, "step": 846 }, { "epoch": 0.54, "grad_norm": 8.180834670631299, "learning_rate": 2.333616613866415e-06, "loss": 0.2911, "step": 847 }, { "epoch": 0.54, "grad_norm": 6.732843707320632, "learning_rate": 2.328505050111626e-06, "loss": 0.1936, "step": 848 }, { "epoch": 0.54, "grad_norm": 9.767254551541562, "learning_rate": 2.3233942065792274e-06, "loss": 0.5851, "step": 849 }, { "epoch": 0.54, "grad_norm": 2.880672994500611, "learning_rate": 2.3182841047330836e-06, "loss": 0.046, "step": 850 }, { "epoch": 0.54, "grad_norm": 8.668434354045177, "learning_rate": 2.3131747660339396e-06, "loss": 0.4546, "step": 851 }, { "epoch": 0.54, "grad_norm": 7.395906460539295, "learning_rate": 2.3080662119393395e-06, "loss": 0.4166, "step": 852 }, { "epoch": 0.54, "grad_norm": 5.432345874353768, "learning_rate": 2.3029584639035288e-06, "loss": 0.1, "step": 853 }, { "epoch": 0.54, "grad_norm": 4.045690886127331, "learning_rate": 2.297851543377371e-06, "loss": 0.1253, "step": 854 }, { "epoch": 0.54, "grad_norm": 8.64455022521076, "learning_rate": 2.292745471808252e-06, "loss": 0.2786, "step": 855 }, { "epoch": 0.54, "grad_norm": 2.774363274597841, "learning_rate": 2.287640270639994e-06, "loss": 0.0335, "step": 856 }, { "epoch": 0.54, "grad_norm": 8.037660170601043, "learning_rate": 2.2825359613127615e-06, "loss": 0.568, "step": 857 }, { "epoch": 0.54, "grad_norm": 6.294832489960329, "learning_rate": 2.2774325652629765e-06, "loss": 0.1654, "step": 858 }, { "epoch": 0.54, "grad_norm": 6.81186666081957, "learning_rate": 2.272330103923224e-06, "loss": 0.2634, "step": 859 }, { "epoch": 0.54, "grad_norm": 5.888207706105006, "learning_rate": 2.2672285987221628e-06, "loss": 0.2884, "step": 860 }, { "epoch": 0.54, "grad_norm": 8.885442044280387, "learning_rate": 2.262128071084437e-06, "loss": 0.4424, "step": 861 }, { "epoch": 0.55, "grad_norm": 7.511127609345163, "learning_rate": 2.257028542430587e-06, "loss": 0.2785, "step": 862 }, { "epoch": 0.55, "grad_norm": 7.313982951761123, "learning_rate": 2.2519300341769535e-06, "loss": 0.28, "step": 863 }, { "epoch": 0.55, "grad_norm": 7.148900685130503, "learning_rate": 2.246832567735597e-06, "loss": 0.3623, "step": 864 }, { "epoch": 0.55, "grad_norm": 10.176549725061074, "learning_rate": 2.2417361645141985e-06, "loss": 0.5615, "step": 865 }, { "epoch": 0.55, "grad_norm": 5.953518111097834, "learning_rate": 2.2366408459159765e-06, "loss": 0.1704, "step": 866 }, { "epoch": 0.55, "grad_norm": 7.298203798642748, "learning_rate": 2.2315466333395927e-06, "loss": 0.332, "step": 867 }, { "epoch": 0.55, "grad_norm": 2.5129001621401583, "learning_rate": 2.2264535481790656e-06, "loss": 0.0281, "step": 868 }, { "epoch": 0.55, "grad_norm": 4.387135269350836, "learning_rate": 2.2213616118236762e-06, "loss": 0.1011, "step": 869 }, { "epoch": 0.55, "grad_norm": 6.360321199475546, "learning_rate": 2.2162708456578853e-06, "loss": 0.2468, "step": 870 }, { "epoch": 0.55, "grad_norm": 5.904042956829158, "learning_rate": 2.2111812710612333e-06, "loss": 0.2183, "step": 871 }, { "epoch": 0.55, "grad_norm": 8.389410119218008, "learning_rate": 2.2060929094082635e-06, "loss": 0.3332, "step": 872 }, { "epoch": 0.55, "grad_norm": 5.497967951531669, "learning_rate": 2.2010057820684176e-06, "loss": 0.1991, "step": 873 }, { "epoch": 0.55, "grad_norm": 7.38982233765663, "learning_rate": 2.1959199104059613e-06, "loss": 0.3669, "step": 874 }, { "epoch": 0.55, "grad_norm": 4.546885487128155, "learning_rate": 2.19083531577988e-06, "loss": 0.1043, "step": 875 }, { "epoch": 0.55, "grad_norm": 5.0072057299867465, "learning_rate": 2.185752019543801e-06, "loss": 0.1262, "step": 876 }, { "epoch": 0.55, "grad_norm": 8.264415746255334, "learning_rate": 2.1806700430458948e-06, "loss": 0.4108, "step": 877 }, { "epoch": 0.56, "grad_norm": 6.680273126283689, "learning_rate": 2.175589407628794e-06, "loss": 0.3585, "step": 878 }, { "epoch": 0.56, "grad_norm": 6.24682536561072, "learning_rate": 2.1705101346294936e-06, "loss": 0.182, "step": 879 }, { "epoch": 0.56, "grad_norm": 7.567777951456993, "learning_rate": 2.165432245379273e-06, "loss": 0.4158, "step": 880 }, { "epoch": 0.56, "grad_norm": 7.282528482598164, "learning_rate": 2.1603557612035937e-06, "loss": 0.3149, "step": 881 }, { "epoch": 0.56, "grad_norm": 10.062430245293939, "learning_rate": 2.1552807034220226e-06, "loss": 0.4366, "step": 882 }, { "epoch": 0.56, "grad_norm": 9.655764076820983, "learning_rate": 2.1502070933481318e-06, "loss": 0.4819, "step": 883 }, { "epoch": 0.56, "grad_norm": 12.216308349794778, "learning_rate": 2.1451349522894165e-06, "loss": 0.9684, "step": 884 }, { "epoch": 0.56, "grad_norm": 6.114028446249881, "learning_rate": 2.1400643015471994e-06, "loss": 0.2168, "step": 885 }, { "epoch": 0.56, "grad_norm": 5.703920909788037, "learning_rate": 2.1349951624165474e-06, "loss": 0.1802, "step": 886 }, { "epoch": 0.56, "grad_norm": 6.930618860044399, "learning_rate": 2.129927556186177e-06, "loss": 0.2779, "step": 887 }, { "epoch": 0.56, "grad_norm": 7.278586493962798, "learning_rate": 2.1248615041383686e-06, "loss": 0.371, "step": 888 }, { "epoch": 0.56, "grad_norm": 2.9533216622884786, "learning_rate": 2.119797027548874e-06, "loss": 0.044, "step": 889 }, { "epoch": 0.56, "grad_norm": 6.96798633445337, "learning_rate": 2.11473414768683e-06, "loss": 0.5775, "step": 890 }, { "epoch": 0.56, "grad_norm": 4.909277109667161, "learning_rate": 2.1096728858146667e-06, "loss": 0.257, "step": 891 }, { "epoch": 0.56, "grad_norm": 7.980777056728199, "learning_rate": 2.1046132631880194e-06, "loss": 0.517, "step": 892 }, { "epoch": 0.56, "grad_norm": 4.918104101716663, "learning_rate": 2.099555301055639e-06, "loss": 0.1367, "step": 893 }, { "epoch": 0.57, "grad_norm": 4.490316356227466, "learning_rate": 2.094499020659304e-06, "loss": 0.2168, "step": 894 }, { "epoch": 0.57, "grad_norm": 4.4285994981500245, "learning_rate": 2.0894444432337284e-06, "loss": 0.0505, "step": 895 }, { "epoch": 0.57, "grad_norm": 5.161394177245647, "learning_rate": 2.084391590006476e-06, "loss": 0.1563, "step": 896 }, { "epoch": 0.57, "grad_norm": 7.159789963097649, "learning_rate": 2.079340482197867e-06, "loss": 0.3148, "step": 897 }, { "epoch": 0.57, "grad_norm": 3.475321550380632, "learning_rate": 2.0742911410208956e-06, "loss": 0.0655, "step": 898 }, { "epoch": 0.57, "grad_norm": 8.764324996208217, "learning_rate": 2.0692435876811317e-06, "loss": 0.2755, "step": 899 }, { "epoch": 0.57, "grad_norm": 6.186111448520437, "learning_rate": 2.0641978433766412e-06, "loss": 0.2011, "step": 900 }, { "epoch": 0.57, "grad_norm": 8.118794580444385, "learning_rate": 2.0591539292978894e-06, "loss": 0.4267, "step": 901 }, { "epoch": 0.57, "grad_norm": 7.822696471046605, "learning_rate": 2.054111866627658e-06, "loss": 0.4114, "step": 902 }, { "epoch": 0.57, "grad_norm": 3.864310872771928, "learning_rate": 2.049071676540951e-06, "loss": 0.076, "step": 903 }, { "epoch": 0.57, "grad_norm": 7.003151320510557, "learning_rate": 2.04403338020491e-06, "loss": 0.1848, "step": 904 }, { "epoch": 0.57, "grad_norm": 5.574367764465379, "learning_rate": 2.038996998778722e-06, "loss": 0.262, "step": 905 }, { "epoch": 0.57, "grad_norm": 0.9392739043718448, "learning_rate": 2.033962553413534e-06, "loss": 0.0052, "step": 906 }, { "epoch": 0.57, "grad_norm": 7.344892697001689, "learning_rate": 2.0289300652523594e-06, "loss": 0.4998, "step": 907 }, { "epoch": 0.57, "grad_norm": 7.67647400658845, "learning_rate": 2.0238995554299946e-06, "loss": 0.4042, "step": 908 }, { "epoch": 0.57, "grad_norm": 8.499106865128995, "learning_rate": 2.0188710450729255e-06, "loss": 0.3047, "step": 909 }, { "epoch": 0.58, "grad_norm": 5.004581641090867, "learning_rate": 2.0138445552992432e-06, "loss": 0.0607, "step": 910 }, { "epoch": 0.58, "grad_norm": 5.094376039941312, "learning_rate": 2.0088201072185497e-06, "loss": 0.1673, "step": 911 }, { "epoch": 0.58, "grad_norm": 7.014596434640352, "learning_rate": 2.0037977219318776e-06, "loss": 0.5188, "step": 912 }, { "epoch": 0.58, "grad_norm": 7.863010301389402, "learning_rate": 1.99877742053159e-06, "loss": 0.3261, "step": 913 }, { "epoch": 0.58, "grad_norm": 9.981489215198552, "learning_rate": 1.993759224101305e-06, "loss": 0.4534, "step": 914 }, { "epoch": 0.58, "grad_norm": 4.07740233584443, "learning_rate": 1.9887431537157944e-06, "loss": 0.1136, "step": 915 }, { "epoch": 0.58, "grad_norm": 7.884234146148189, "learning_rate": 1.9837292304409074e-06, "loss": 0.408, "step": 916 }, { "epoch": 0.58, "grad_norm": 6.787966447683212, "learning_rate": 1.9787174753334694e-06, "loss": 0.2243, "step": 917 }, { "epoch": 0.58, "grad_norm": 8.283351185380925, "learning_rate": 1.9737079094412072e-06, "loss": 0.5424, "step": 918 }, { "epoch": 0.58, "grad_norm": 5.1475586669305855, "learning_rate": 1.9687005538026464e-06, "loss": 0.3028, "step": 919 }, { "epoch": 0.58, "grad_norm": 4.465485347328667, "learning_rate": 1.9636954294470375e-06, "loss": 0.0826, "step": 920 }, { "epoch": 0.58, "grad_norm": 3.5577608676894794, "learning_rate": 1.958692557394254e-06, "loss": 0.1947, "step": 921 }, { "epoch": 0.58, "grad_norm": 5.472700512289778, "learning_rate": 1.9536919586547153e-06, "loss": 0.2325, "step": 922 }, { "epoch": 0.58, "grad_norm": 6.082228503645407, "learning_rate": 1.94869365422929e-06, "loss": 0.3296, "step": 923 }, { "epoch": 0.58, "grad_norm": 7.010116351545602, "learning_rate": 1.9436976651092143e-06, "loss": 0.3629, "step": 924 }, { "epoch": 0.58, "grad_norm": 3.9070867023831153, "learning_rate": 1.9387040122759984e-06, "loss": 0.1464, "step": 925 }, { "epoch": 0.59, "grad_norm": 6.330271265084205, "learning_rate": 1.933712716701343e-06, "loss": 0.231, "step": 926 }, { "epoch": 0.59, "grad_norm": 5.8114288327506, "learning_rate": 1.9287237993470475e-06, "loss": 0.1546, "step": 927 }, { "epoch": 0.59, "grad_norm": 8.41754775266865, "learning_rate": 1.923737281164924e-06, "loss": 0.452, "step": 928 }, { "epoch": 0.59, "grad_norm": 8.241115988554974, "learning_rate": 1.918753183096709e-06, "loss": 0.5664, "step": 929 }, { "epoch": 0.59, "grad_norm": 9.345251874881729, "learning_rate": 1.913771526073976e-06, "loss": 0.4855, "step": 930 }, { "epoch": 0.59, "grad_norm": 9.155355885045427, "learning_rate": 1.9087923310180453e-06, "loss": 0.8178, "step": 931 }, { "epoch": 0.59, "grad_norm": 4.797336220701351, "learning_rate": 1.9038156188398992e-06, "loss": 0.0949, "step": 932 }, { "epoch": 0.59, "grad_norm": 5.868256940025731, "learning_rate": 1.898841410440092e-06, "loss": 0.1998, "step": 933 }, { "epoch": 0.59, "grad_norm": 7.704476568722761, "learning_rate": 1.8938697267086636e-06, "loss": 0.4457, "step": 934 }, { "epoch": 0.59, "grad_norm": 4.702762627819754, "learning_rate": 1.88890058852505e-06, "loss": 0.1097, "step": 935 }, { "epoch": 0.59, "grad_norm": 6.170947504972868, "learning_rate": 1.8839340167579977e-06, "loss": 0.2688, "step": 936 }, { "epoch": 0.59, "grad_norm": 5.126022050873371, "learning_rate": 1.878970032265475e-06, "loss": 0.1048, "step": 937 }, { "epoch": 0.59, "grad_norm": 3.95862194146853, "learning_rate": 1.8740086558945842e-06, "loss": 0.0675, "step": 938 }, { "epoch": 0.59, "grad_norm": 10.157331297607431, "learning_rate": 1.8690499084814739e-06, "loss": 0.7983, "step": 939 }, { "epoch": 0.59, "grad_norm": 7.726465731825782, "learning_rate": 1.8640938108512538e-06, "loss": 0.435, "step": 940 }, { "epoch": 0.59, "grad_norm": 7.614194874872148, "learning_rate": 1.8591403838179026e-06, "loss": 0.4425, "step": 941 }, { "epoch": 0.6, "grad_norm": 5.407712529097981, "learning_rate": 1.854189648184186e-06, "loss": 0.1933, "step": 942 }, { "epoch": 0.6, "grad_norm": 9.040178261408725, "learning_rate": 1.8492416247415645e-06, "loss": 0.3822, "step": 943 }, { "epoch": 0.6, "grad_norm": 0.0005694511845410909, "learning_rate": 1.8442963342701107e-06, "loss": 0.0, "step": 944 }, { "epoch": 0.6, "grad_norm": 4.39729231837916, "learning_rate": 1.8393537975384168e-06, "loss": 0.1104, "step": 945 }, { "epoch": 0.6, "grad_norm": 9.878587674652728, "learning_rate": 1.8344140353035128e-06, "loss": 0.43, "step": 946 }, { "epoch": 0.6, "grad_norm": 5.129325901826996, "learning_rate": 1.829477068310775e-06, "loss": 0.198, "step": 947 }, { "epoch": 0.6, "grad_norm": 9.821412527888683, "learning_rate": 1.8245429172938414e-06, "loss": 0.3571, "step": 948 }, { "epoch": 0.6, "grad_norm": 4.974929134305073, "learning_rate": 1.8196116029745231e-06, "loss": 0.1794, "step": 949 }, { "epoch": 0.6, "grad_norm": 4.072969063944583, "learning_rate": 1.8146831460627198e-06, "loss": 0.048, "step": 950 }, { "epoch": 0.6, "grad_norm": 3.224159657607188, "learning_rate": 1.8097575672563278e-06, "loss": 0.1226, "step": 951 }, { "epoch": 0.6, "grad_norm": 1.4196871179088537, "learning_rate": 1.8048348872411608e-06, "loss": 0.0074, "step": 952 }, { "epoch": 0.6, "grad_norm": 6.3432469661818, "learning_rate": 1.7999151266908531e-06, "loss": 0.3635, "step": 953 }, { "epoch": 0.6, "grad_norm": 4.432677106164474, "learning_rate": 1.7949983062667842e-06, "loss": 0.1209, "step": 954 }, { "epoch": 0.6, "grad_norm": 8.328754336856798, "learning_rate": 1.7900844466179797e-06, "loss": 0.3935, "step": 955 }, { "epoch": 0.6, "grad_norm": 9.881259913864808, "learning_rate": 1.7851735683810373e-06, "loss": 0.3537, "step": 956 }, { "epoch": 0.61, "grad_norm": 8.165188039190037, "learning_rate": 1.7802656921800277e-06, "loss": 0.3275, "step": 957 }, { "epoch": 0.61, "grad_norm": 6.835526773375352, "learning_rate": 1.7753608386264196e-06, "loss": 0.2289, "step": 958 }, { "epoch": 0.61, "grad_norm": 6.3085698649896145, "learning_rate": 1.770459028318982e-06, "loss": 0.1853, "step": 959 }, { "epoch": 0.61, "grad_norm": 5.779465188491945, "learning_rate": 1.7655602818437095e-06, "loss": 0.2324, "step": 960 }, { "epoch": 0.61, "grad_norm": 7.877593082809007, "learning_rate": 1.7606646197737227e-06, "loss": 0.6645, "step": 961 }, { "epoch": 0.61, "grad_norm": 7.89453414924489, "learning_rate": 1.755772062669196e-06, "loss": 0.309, "step": 962 }, { "epoch": 0.61, "grad_norm": 5.595836569907511, "learning_rate": 1.7508826310772566e-06, "loss": 0.2078, "step": 963 }, { "epoch": 0.61, "grad_norm": 3.8634597227955005, "learning_rate": 1.7459963455319124e-06, "loss": 0.1937, "step": 964 }, { "epoch": 0.61, "grad_norm": 9.245447224042685, "learning_rate": 1.7411132265539537e-06, "loss": 0.6336, "step": 965 }, { "epoch": 0.61, "grad_norm": 7.032942504630441, "learning_rate": 1.7362332946508759e-06, "loss": 0.2036, "step": 966 }, { "epoch": 0.61, "grad_norm": 1.8539538815018344, "learning_rate": 1.7313565703167868e-06, "loss": 0.0123, "step": 967 }, { "epoch": 0.61, "grad_norm": 6.1530133564179605, "learning_rate": 1.7264830740323257e-06, "loss": 0.2706, "step": 968 }, { "epoch": 0.61, "grad_norm": 6.977211506591645, "learning_rate": 1.7216128262645734e-06, "loss": 0.4502, "step": 969 }, { "epoch": 0.61, "grad_norm": 6.862819632224306, "learning_rate": 1.71674584746697e-06, "loss": 0.1013, "step": 970 }, { "epoch": 0.61, "grad_norm": 6.729901662394336, "learning_rate": 1.711882158079225e-06, "loss": 0.4949, "step": 971 }, { "epoch": 0.61, "grad_norm": 5.062131326696847, "learning_rate": 1.7070217785272354e-06, "loss": 0.1355, "step": 972 }, { "epoch": 0.62, "grad_norm": 7.303964181266942, "learning_rate": 1.7021647292229954e-06, "loss": 0.2781, "step": 973 }, { "epoch": 0.62, "grad_norm": 10.80831740080856, "learning_rate": 1.6973110305645164e-06, "loss": 0.2575, "step": 974 }, { "epoch": 0.62, "grad_norm": 7.2783490734077185, "learning_rate": 1.6924607029357354e-06, "loss": 0.2815, "step": 975 }, { "epoch": 0.62, "grad_norm": 4.9908680492974415, "learning_rate": 1.6876137667064339e-06, "loss": 0.1786, "step": 976 }, { "epoch": 0.62, "grad_norm": 5.487313985604553, "learning_rate": 1.6827702422321496e-06, "loss": 0.1503, "step": 977 }, { "epoch": 0.62, "grad_norm": 4.370661082599542, "learning_rate": 1.6779301498540934e-06, "loss": 0.2291, "step": 978 }, { "epoch": 0.62, "grad_norm": 7.715340173901825, "learning_rate": 1.6730935098990602e-06, "loss": 0.409, "step": 979 }, { "epoch": 0.62, "grad_norm": 4.987489498576722, "learning_rate": 1.6682603426793487e-06, "loss": 0.1897, "step": 980 }, { "epoch": 0.62, "grad_norm": 7.626683784056619, "learning_rate": 1.6634306684926705e-06, "loss": 0.4759, "step": 981 }, { "epoch": 0.62, "grad_norm": 4.641636799157488, "learning_rate": 1.65860450762207e-06, "loss": 0.128, "step": 982 }, { "epoch": 0.62, "grad_norm": 10.557916312549223, "learning_rate": 1.6537818803358347e-06, "loss": 0.5851, "step": 983 }, { "epoch": 0.62, "grad_norm": 8.318635669498663, "learning_rate": 1.6489628068874142e-06, "loss": 0.3414, "step": 984 }, { "epoch": 0.62, "grad_norm": 8.349500410473203, "learning_rate": 1.644147307515331e-06, "loss": 0.4838, "step": 985 }, { "epoch": 0.62, "grad_norm": 9.401575695026681, "learning_rate": 1.6393354024431e-06, "loss": 0.2328, "step": 986 }, { "epoch": 0.62, "grad_norm": 9.015267879083828, "learning_rate": 1.6345271118791395e-06, "loss": 0.5914, "step": 987 }, { "epoch": 0.62, "grad_norm": 7.086201016237776, "learning_rate": 1.629722456016689e-06, "loss": 0.2998, "step": 988 }, { "epoch": 0.63, "grad_norm": 7.831802813844422, "learning_rate": 1.6249214550337222e-06, "loss": 0.5583, "step": 989 }, { "epoch": 0.63, "grad_norm": 7.586155488548567, "learning_rate": 1.6201241290928654e-06, "loss": 0.3286, "step": 990 }, { "epoch": 0.63, "grad_norm": 3.2517639655007406, "learning_rate": 1.615330498341309e-06, "loss": 0.0454, "step": 991 }, { "epoch": 0.63, "grad_norm": 7.182363781326082, "learning_rate": 1.6105405829107268e-06, "loss": 0.2463, "step": 992 }, { "epoch": 0.63, "grad_norm": 5.643108634794028, "learning_rate": 1.6057544029171863e-06, "loss": 0.1361, "step": 993 }, { "epoch": 0.63, "grad_norm": 5.297687659474982, "learning_rate": 1.6009719784610724e-06, "loss": 0.1918, "step": 994 }, { "epoch": 0.63, "grad_norm": 5.361747728058531, "learning_rate": 1.5961933296269923e-06, "loss": 0.1998, "step": 995 }, { "epoch": 0.63, "grad_norm": 8.086942306227849, "learning_rate": 1.5914184764837024e-06, "loss": 0.3141, "step": 996 }, { "epoch": 0.63, "grad_norm": 8.080349820561993, "learning_rate": 1.5866474390840126e-06, "loss": 0.4342, "step": 997 }, { "epoch": 0.63, "grad_norm": 7.043675544297867, "learning_rate": 1.5818802374647134e-06, "loss": 0.3817, "step": 998 }, { "epoch": 0.63, "grad_norm": 5.109757995249827, "learning_rate": 1.577116891646482e-06, "loss": 0.268, "step": 999 }, { "epoch": 0.63, "grad_norm": 7.481416440050833, "learning_rate": 1.5723574216338066e-06, "loss": 0.38, "step": 1000 }, { "epoch": 0.63, "grad_norm": 6.905021454641723, "learning_rate": 1.5676018474148935e-06, "loss": 0.1244, "step": 1001 }, { "epoch": 0.63, "grad_norm": 3.724723826161714, "learning_rate": 1.562850188961593e-06, "loss": 0.1855, "step": 1002 }, { "epoch": 0.63, "grad_norm": 9.249846586037759, "learning_rate": 1.5581024662293047e-06, "loss": 0.3089, "step": 1003 }, { "epoch": 0.63, "grad_norm": 7.341712961328778, "learning_rate": 1.5533586991569065e-06, "loss": 0.421, "step": 1004 }, { "epoch": 0.64, "grad_norm": 8.275293590489044, "learning_rate": 1.5486189076666575e-06, "loss": 0.3399, "step": 1005 }, { "epoch": 0.64, "grad_norm": 4.811066252920472, "learning_rate": 1.5438831116641244e-06, "loss": 0.1119, "step": 1006 }, { "epoch": 0.64, "grad_norm": 1.7495788339894816, "learning_rate": 1.5391513310380923e-06, "loss": 0.0152, "step": 1007 }, { "epoch": 0.64, "grad_norm": 6.7866886672666995, "learning_rate": 1.5344235856604845e-06, "loss": 0.4424, "step": 1008 }, { "epoch": 0.64, "grad_norm": 7.584903037571111, "learning_rate": 1.5296998953862755e-06, "loss": 0.3718, "step": 1009 }, { "epoch": 0.64, "grad_norm": 6.780438211958827, "learning_rate": 1.5249802800534125e-06, "loss": 0.2445, "step": 1010 }, { "epoch": 0.64, "grad_norm": 6.219854055575279, "learning_rate": 1.5202647594827269e-06, "loss": 0.4316, "step": 1011 }, { "epoch": 0.64, "grad_norm": 1.696767437583493, "learning_rate": 1.515553353477855e-06, "loss": 0.0192, "step": 1012 }, { "epoch": 0.64, "grad_norm": 5.291649685729808, "learning_rate": 1.5108460818251513e-06, "loss": 0.2354, "step": 1013 }, { "epoch": 0.64, "grad_norm": 8.03699664828297, "learning_rate": 1.5061429642936107e-06, "loss": 0.3573, "step": 1014 }, { "epoch": 0.64, "grad_norm": 9.268248008257986, "learning_rate": 1.5014440206347785e-06, "loss": 0.3276, "step": 1015 }, { "epoch": 0.64, "grad_norm": 6.071068126333296, "learning_rate": 1.4967492705826735e-06, "loss": 0.3007, "step": 1016 }, { "epoch": 0.64, "grad_norm": 7.901553240320484, "learning_rate": 1.4920587338537018e-06, "loss": 0.3938, "step": 1017 }, { "epoch": 0.64, "grad_norm": 6.879127234614124, "learning_rate": 1.4873724301465753e-06, "loss": 0.2296, "step": 1018 }, { "epoch": 0.64, "grad_norm": 5.506181191264519, "learning_rate": 1.4826903791422282e-06, "loss": 0.3104, "step": 1019 }, { "epoch": 0.64, "grad_norm": 6.972314581527378, "learning_rate": 1.4780126005037354e-06, "loss": 0.3961, "step": 1020 }, { "epoch": 0.65, "grad_norm": 5.687599265102159, "learning_rate": 1.4733391138762277e-06, "loss": 0.1626, "step": 1021 }, { "epoch": 0.65, "grad_norm": 6.415780175474357, "learning_rate": 1.4686699388868137e-06, "loss": 0.2941, "step": 1022 }, { "epoch": 0.65, "grad_norm": 5.9160262643309744, "learning_rate": 1.4640050951444912e-06, "loss": 0.184, "step": 1023 }, { "epoch": 0.65, "grad_norm": 8.384766762389278, "learning_rate": 1.4593446022400715e-06, "loss": 0.4718, "step": 1024 }, { "epoch": 0.65, "grad_norm": 6.417072068656092, "learning_rate": 1.4546884797460904e-06, "loss": 0.2465, "step": 1025 }, { "epoch": 0.65, "grad_norm": 3.750760446374365, "learning_rate": 1.450036747216732e-06, "loss": 0.1225, "step": 1026 }, { "epoch": 0.65, "grad_norm": 8.143795399162357, "learning_rate": 1.4453894241877433e-06, "loss": 0.2569, "step": 1027 }, { "epoch": 0.65, "grad_norm": 0.46836918620952056, "learning_rate": 1.4407465301763534e-06, "loss": 0.0021, "step": 1028 }, { "epoch": 0.65, "grad_norm": 7.7521595560981025, "learning_rate": 1.4361080846811888e-06, "loss": 0.4665, "step": 1029 }, { "epoch": 0.65, "grad_norm": 3.3502201691201714, "learning_rate": 1.4314741071821955e-06, "loss": 0.0774, "step": 1030 }, { "epoch": 0.65, "grad_norm": 7.693049466974612, "learning_rate": 1.4268446171405554e-06, "loss": 0.3174, "step": 1031 }, { "epoch": 0.65, "grad_norm": 3.8976653716194125, "learning_rate": 1.4222196339986053e-06, "loss": 0.0902, "step": 1032 }, { "epoch": 0.65, "grad_norm": 6.141925703130366, "learning_rate": 1.4175991771797508e-06, "loss": 0.2603, "step": 1033 }, { "epoch": 0.65, "grad_norm": 5.170304074227231, "learning_rate": 1.4129832660883933e-06, "loss": 0.1593, "step": 1034 }, { "epoch": 0.65, "grad_norm": 9.355323181931178, "learning_rate": 1.4083719201098404e-06, "loss": 0.5368, "step": 1035 }, { "epoch": 0.66, "grad_norm": 8.152163596326409, "learning_rate": 1.4037651586102297e-06, "loss": 0.5907, "step": 1036 }, { "epoch": 0.66, "grad_norm": 7.3126463264105706, "learning_rate": 1.399163000936443e-06, "loss": 0.4369, "step": 1037 }, { "epoch": 0.66, "grad_norm": 5.408593700628791, "learning_rate": 1.3945654664160296e-06, "loss": 0.2632, "step": 1038 }, { "epoch": 0.66, "grad_norm": 6.204173732093424, "learning_rate": 1.389972574357123e-06, "loss": 0.2789, "step": 1039 }, { "epoch": 0.66, "grad_norm": 9.896821068441229, "learning_rate": 1.38538434404836e-06, "loss": 0.5113, "step": 1040 }, { "epoch": 0.66, "grad_norm": 7.707353959285309, "learning_rate": 1.3808007947587976e-06, "loss": 0.479, "step": 1041 }, { "epoch": 0.66, "grad_norm": 3.706670038052496, "learning_rate": 1.3762219457378357e-06, "loss": 0.112, "step": 1042 }, { "epoch": 0.66, "grad_norm": 7.500034332196811, "learning_rate": 1.3716478162151347e-06, "loss": 0.5281, "step": 1043 }, { "epoch": 0.66, "grad_norm": 5.725800717258658, "learning_rate": 1.3670784254005354e-06, "loss": 0.2327, "step": 1044 }, { "epoch": 0.66, "grad_norm": 9.471210424454569, "learning_rate": 1.3625137924839749e-06, "loss": 0.3952, "step": 1045 }, { "epoch": 0.66, "grad_norm": 5.302821779321099, "learning_rate": 1.3579539366354106e-06, "loss": 0.1637, "step": 1046 }, { "epoch": 0.66, "grad_norm": 5.267206605363414, "learning_rate": 1.353398877004738e-06, "loss": 0.1717, "step": 1047 }, { "epoch": 0.66, "grad_norm": 3.7598349983453683, "learning_rate": 1.34884863272171e-06, "loss": 0.081, "step": 1048 }, { "epoch": 0.66, "grad_norm": 6.177234804334449, "learning_rate": 1.3443032228958547e-06, "loss": 0.2947, "step": 1049 }, { "epoch": 0.66, "grad_norm": 6.667959310461245, "learning_rate": 1.3397626666163998e-06, "loss": 0.2487, "step": 1050 }, { "epoch": 0.66, "grad_norm": 7.705428392307878, "learning_rate": 1.335226982952188e-06, "loss": 0.2478, "step": 1051 }, { "epoch": 0.67, "grad_norm": 9.654495820641348, "learning_rate": 1.3306961909516e-06, "loss": 0.4952, "step": 1052 }, { "epoch": 0.67, "grad_norm": 7.797367179793667, "learning_rate": 1.3261703096424695e-06, "loss": 0.2861, "step": 1053 }, { "epoch": 0.67, "grad_norm": 9.183331640373762, "learning_rate": 1.3216493580320137e-06, "loss": 0.422, "step": 1054 }, { "epoch": 0.67, "grad_norm": 0.15635336794597313, "learning_rate": 1.3171333551067389e-06, "loss": 0.0006, "step": 1055 }, { "epoch": 0.67, "grad_norm": 4.11514306747898, "learning_rate": 1.3126223198323752e-06, "loss": 0.1062, "step": 1056 }, { "epoch": 0.67, "grad_norm": 6.600095470778333, "learning_rate": 1.3081162711537838e-06, "loss": 0.2, "step": 1057 }, { "epoch": 0.67, "grad_norm": 7.592780659662964, "learning_rate": 1.303615227994891e-06, "loss": 0.3045, "step": 1058 }, { "epoch": 0.67, "grad_norm": 4.706069987427142, "learning_rate": 1.2991192092585952e-06, "loss": 0.0946, "step": 1059 }, { "epoch": 0.67, "grad_norm": 5.087550409990484, "learning_rate": 1.2946282338266985e-06, "loss": 0.1687, "step": 1060 }, { "epoch": 0.67, "grad_norm": 5.968857050230302, "learning_rate": 1.290142320559818e-06, "loss": 0.1613, "step": 1061 }, { "epoch": 0.67, "grad_norm": 7.776313165488766, "learning_rate": 1.2856614882973178e-06, "loss": 0.2875, "step": 1062 }, { "epoch": 0.67, "grad_norm": 7.584041967573621, "learning_rate": 1.2811857558572168e-06, "loss": 0.3282, "step": 1063 }, { "epoch": 0.67, "grad_norm": 3.6335259506077886, "learning_rate": 1.2767151420361218e-06, "loss": 0.1407, "step": 1064 }, { "epoch": 0.67, "grad_norm": 3.5209474413436777, "learning_rate": 1.272249665609137e-06, "loss": 0.038, "step": 1065 }, { "epoch": 0.67, "grad_norm": 7.837792554441897, "learning_rate": 1.2677893453297996e-06, "loss": 0.402, "step": 1066 }, { "epoch": 0.67, "grad_norm": 5.327216196721906, "learning_rate": 1.2633341999299845e-06, "loss": 0.2392, "step": 1067 }, { "epoch": 0.68, "grad_norm": 3.619910646674139, "learning_rate": 1.2588842481198382e-06, "loss": 0.0695, "step": 1068 }, { "epoch": 0.68, "grad_norm": 8.905789547444904, "learning_rate": 1.2544395085876954e-06, "loss": 0.3835, "step": 1069 }, { "epoch": 0.68, "grad_norm": 4.42389883359606, "learning_rate": 1.2500000000000007e-06, "loss": 0.1295, "step": 1070 }, { "epoch": 0.68, "grad_norm": 9.812823830867673, "learning_rate": 1.2455657410012283e-06, "loss": 0.2931, "step": 1071 }, { "epoch": 0.68, "grad_norm": 4.961542239838846, "learning_rate": 1.2411367502138095e-06, "loss": 0.1907, "step": 1072 }, { "epoch": 0.68, "grad_norm": 5.89326203511695, "learning_rate": 1.2367130462380483e-06, "loss": 0.3032, "step": 1073 }, { "epoch": 0.68, "grad_norm": 8.777073854745678, "learning_rate": 1.232294647652048e-06, "loss": 0.406, "step": 1074 }, { "epoch": 0.68, "grad_norm": 7.2234632289856515, "learning_rate": 1.2278815730116275e-06, "loss": 0.2681, "step": 1075 }, { "epoch": 0.68, "grad_norm": 7.129733470841275, "learning_rate": 1.22347384085025e-06, "loss": 0.3404, "step": 1076 }, { "epoch": 0.68, "grad_norm": 8.599542614506364, "learning_rate": 1.219071469678941e-06, "loss": 0.5208, "step": 1077 }, { "epoch": 0.68, "grad_norm": 4.4150575309625335, "learning_rate": 1.2146744779862126e-06, "loss": 0.0678, "step": 1078 }, { "epoch": 0.68, "grad_norm": 9.487797177853558, "learning_rate": 1.2102828842379824e-06, "loss": 0.5726, "step": 1079 }, { "epoch": 0.68, "grad_norm": 6.296861066400527, "learning_rate": 1.2058967068775002e-06, "loss": 0.1804, "step": 1080 }, { "epoch": 0.68, "grad_norm": 2.9092885202242647, "learning_rate": 1.2015159643252689e-06, "loss": 0.0314, "step": 1081 }, { "epoch": 0.68, "grad_norm": 5.357779129026744, "learning_rate": 1.1971406749789673e-06, "loss": 0.3348, "step": 1082 }, { "epoch": 0.68, "grad_norm": 6.648957892199807, "learning_rate": 1.19277085721337e-06, "loss": 0.2089, "step": 1083 }, { "epoch": 0.69, "grad_norm": 6.937832111279206, "learning_rate": 1.1884065293802756e-06, "loss": 0.2238, "step": 1084 }, { "epoch": 0.69, "grad_norm": 5.877264357515748, "learning_rate": 1.1840477098084258e-06, "loss": 0.19, "step": 1085 }, { "epoch": 0.69, "grad_norm": 10.515347844137308, "learning_rate": 1.17969441680343e-06, "loss": 0.8935, "step": 1086 }, { "epoch": 0.69, "grad_norm": 5.696434632574428, "learning_rate": 1.1753466686476855e-06, "loss": 0.2842, "step": 1087 }, { "epoch": 0.69, "grad_norm": 5.987193427469957, "learning_rate": 1.1710044836003057e-06, "loss": 0.1777, "step": 1088 }, { "epoch": 0.69, "grad_norm": 4.658585954525592, "learning_rate": 1.1666678798970397e-06, "loss": 0.1025, "step": 1089 }, { "epoch": 0.69, "grad_norm": 9.038527358294637, "learning_rate": 1.1623368757501977e-06, "loss": 0.3914, "step": 1090 }, { "epoch": 0.69, "grad_norm": 3.6766430354346022, "learning_rate": 1.1580114893485715e-06, "loss": 0.0806, "step": 1091 }, { "epoch": 0.69, "grad_norm": 6.800865948460224, "learning_rate": 1.1536917388573618e-06, "loss": 0.2416, "step": 1092 }, { "epoch": 0.69, "grad_norm": 5.253488562335028, "learning_rate": 1.1493776424181005e-06, "loss": 0.1593, "step": 1093 }, { "epoch": 0.69, "grad_norm": 6.327816350497619, "learning_rate": 1.1450692181485749e-06, "loss": 0.2859, "step": 1094 }, { "epoch": 0.69, "grad_norm": 5.577937617213323, "learning_rate": 1.1407664841427472e-06, "loss": 0.3745, "step": 1095 }, { "epoch": 0.69, "grad_norm": 4.353078800566941, "learning_rate": 1.1364694584706884e-06, "loss": 0.0767, "step": 1096 }, { "epoch": 0.69, "grad_norm": 5.082330177423406, "learning_rate": 1.132178159178491e-06, "loss": 0.1001, "step": 1097 }, { "epoch": 0.69, "grad_norm": 5.730500037193049, "learning_rate": 1.1278926042882026e-06, "loss": 0.1931, "step": 1098 }, { "epoch": 0.69, "grad_norm": 7.870542930090928, "learning_rate": 1.123612811797742e-06, "loss": 0.3888, "step": 1099 }, { "epoch": 0.7, "grad_norm": 9.670458170659172, "learning_rate": 1.1193387996808329e-06, "loss": 0.5273, "step": 1100 }, { "epoch": 0.7, "grad_norm": 5.121059484534487, "learning_rate": 1.115070585886918e-06, "loss": 0.3433, "step": 1101 }, { "epoch": 0.7, "grad_norm": 6.3361939779315914, "learning_rate": 1.1108081883410943e-06, "loss": 0.2923, "step": 1102 }, { "epoch": 0.7, "grad_norm": 4.036367317732274, "learning_rate": 1.1065516249440265e-06, "loss": 0.1008, "step": 1103 }, { "epoch": 0.7, "grad_norm": 6.021726372650959, "learning_rate": 1.1023009135718845e-06, "loss": 0.2031, "step": 1104 }, { "epoch": 0.7, "grad_norm": 3.6551065857841114, "learning_rate": 1.0980560720762556e-06, "loss": 0.0375, "step": 1105 }, { "epoch": 0.7, "grad_norm": 7.2919719813776895, "learning_rate": 1.0938171182840792e-06, "loss": 0.0966, "step": 1106 }, { "epoch": 0.7, "grad_norm": 2.1069531135936748, "learning_rate": 1.089584069997567e-06, "loss": 0.0144, "step": 1107 }, { "epoch": 0.7, "grad_norm": 4.236674792754401, "learning_rate": 1.085356944994131e-06, "loss": 0.2038, "step": 1108 }, { "epoch": 0.7, "grad_norm": 10.046522261862831, "learning_rate": 1.0811357610263037e-06, "loss": 0.668, "step": 1109 }, { "epoch": 0.7, "grad_norm": 7.421638051316306, "learning_rate": 1.0769205358216707e-06, "loss": 0.2997, "step": 1110 }, { "epoch": 0.7, "grad_norm": 7.741120204247029, "learning_rate": 1.0727112870827918e-06, "loss": 0.3144, "step": 1111 }, { "epoch": 0.7, "grad_norm": 8.016605785664773, "learning_rate": 1.068508032487128e-06, "loss": 0.401, "step": 1112 }, { "epoch": 0.7, "grad_norm": 6.595881546946371, "learning_rate": 1.064310789686965e-06, "loss": 0.3715, "step": 1113 }, { "epoch": 0.7, "grad_norm": 7.424384341419906, "learning_rate": 1.0601195763093427e-06, "loss": 0.3372, "step": 1114 }, { "epoch": 0.7, "grad_norm": 2.0735853129567308, "learning_rate": 1.0559344099559793e-06, "loss": 0.0263, "step": 1115 }, { "epoch": 0.71, "grad_norm": 5.952186810915065, "learning_rate": 1.051755308203198e-06, "loss": 0.1453, "step": 1116 }, { "epoch": 0.71, "grad_norm": 7.662348844397025, "learning_rate": 1.0475822886018498e-06, "loss": 0.463, "step": 1117 }, { "epoch": 0.71, "grad_norm": 5.960529518414667, "learning_rate": 1.0434153686772455e-06, "loss": 0.1973, "step": 1118 }, { "epoch": 0.71, "grad_norm": 9.361360520820744, "learning_rate": 1.0392545659290789e-06, "loss": 0.636, "step": 1119 }, { "epoch": 0.71, "grad_norm": 6.177106971974251, "learning_rate": 1.0350998978313537e-06, "loss": 0.279, "step": 1120 }, { "epoch": 0.71, "grad_norm": 8.457132739720128, "learning_rate": 1.030951381832308e-06, "loss": 0.2955, "step": 1121 }, { "epoch": 0.71, "grad_norm": 7.235605617186508, "learning_rate": 1.0268090353543456e-06, "loss": 0.2572, "step": 1122 }, { "epoch": 0.71, "grad_norm": 6.151599967441302, "learning_rate": 1.0226728757939595e-06, "loss": 0.0566, "step": 1123 }, { "epoch": 0.71, "grad_norm": 6.394817590565157, "learning_rate": 1.0185429205216602e-06, "loss": 0.1488, "step": 1124 }, { "epoch": 0.71, "grad_norm": 6.705416058881438, "learning_rate": 1.0144191868819003e-06, "loss": 0.2971, "step": 1125 }, { "epoch": 0.71, "grad_norm": 2.1250939067959833, "learning_rate": 1.0103016921930057e-06, "loss": 0.0226, "step": 1126 }, { "epoch": 0.71, "grad_norm": 6.0763281983266255, "learning_rate": 1.0061904537470998e-06, "loss": 0.1321, "step": 1127 }, { "epoch": 0.71, "grad_norm": 7.034939353263933, "learning_rate": 1.002085488810033e-06, "loss": 0.249, "step": 1128 }, { "epoch": 0.71, "grad_norm": 7.826931982444494, "learning_rate": 9.979868146213068e-07, "loss": 0.2878, "step": 1129 }, { "epoch": 0.71, "grad_norm": 5.233577656155282, "learning_rate": 9.938944483940053e-07, "loss": 0.1311, "step": 1130 }, { "epoch": 0.72, "grad_norm": 4.94281067924806, "learning_rate": 9.898084073147207e-07, "loss": 0.2794, "step": 1131 }, { "epoch": 0.72, "grad_norm": 5.594481521596385, "learning_rate": 9.85728708543483e-07, "loss": 0.0774, "step": 1132 }, { "epoch": 0.72, "grad_norm": 4.939104036942747, "learning_rate": 9.816553692136834e-07, "loss": 0.1812, "step": 1133 }, { "epoch": 0.72, "grad_norm": 9.760392348110935, "learning_rate": 9.775884064320087e-07, "loss": 0.6073, "step": 1134 }, { "epoch": 0.72, "grad_norm": 6.599541168447486, "learning_rate": 9.735278372783648e-07, "loss": 0.3746, "step": 1135 }, { "epoch": 0.72, "grad_norm": 7.925234952987741, "learning_rate": 9.694736788058079e-07, "loss": 0.5354, "step": 1136 }, { "epoch": 0.72, "grad_norm": 8.800757410226256, "learning_rate": 9.654259480404676e-07, "loss": 0.4464, "step": 1137 }, { "epoch": 0.72, "grad_norm": 8.06714963281593, "learning_rate": 9.613846619814854e-07, "loss": 0.4664, "step": 1138 }, { "epoch": 0.72, "grad_norm": 9.352241255651302, "learning_rate": 9.573498376009313e-07, "loss": 0.3984, "step": 1139 }, { "epoch": 0.72, "grad_norm": 1.9775548562255114, "learning_rate": 9.533214918437422e-07, "loss": 0.0233, "step": 1140 }, { "epoch": 0.72, "grad_norm": 4.0371188240697355, "learning_rate": 9.492996416276434e-07, "loss": 0.1628, "step": 1141 }, { "epoch": 0.72, "grad_norm": 6.430928255416815, "learning_rate": 9.452843038430862e-07, "loss": 0.2736, "step": 1142 }, { "epoch": 0.72, "grad_norm": 6.058750841673109, "learning_rate": 9.412754953531664e-07, "loss": 0.185, "step": 1143 }, { "epoch": 0.72, "grad_norm": 7.536225634086424, "learning_rate": 9.37273232993563e-07, "loss": 0.4172, "step": 1144 }, { "epoch": 0.72, "grad_norm": 3.8043015928598556, "learning_rate": 9.332775335724592e-07, "loss": 0.068, "step": 1145 }, { "epoch": 0.72, "grad_norm": 10.47882306020341, "learning_rate": 9.292884138704819e-07, "loss": 0.6243, "step": 1146 }, { "epoch": 0.73, "grad_norm": 10.199609217918214, "learning_rate": 9.253058906406195e-07, "loss": 0.2345, "step": 1147 }, { "epoch": 0.73, "grad_norm": 10.124477043304045, "learning_rate": 9.213299806081607e-07, "loss": 0.4456, "step": 1148 }, { "epoch": 0.73, "grad_norm": 4.799650402053632, "learning_rate": 9.1736070047062e-07, "loss": 0.0988, "step": 1149 }, { "epoch": 0.73, "grad_norm": 9.82518089846187, "learning_rate": 9.133980668976697e-07, "loss": 0.784, "step": 1150 }, { "epoch": 0.73, "grad_norm": 8.680766398264065, "learning_rate": 9.094420965310658e-07, "loss": 0.5107, "step": 1151 }, { "epoch": 0.73, "grad_norm": 7.337185397056643, "learning_rate": 9.054928059845836e-07, "loss": 0.4432, "step": 1152 }, { "epoch": 0.73, "grad_norm": 7.6635458302526525, "learning_rate": 9.015502118439451e-07, "loss": 0.2868, "step": 1153 }, { "epoch": 0.73, "grad_norm": 6.673698372677808, "learning_rate": 8.976143306667493e-07, "loss": 0.2073, "step": 1154 }, { "epoch": 0.73, "grad_norm": 7.00671527825118, "learning_rate": 8.936851789824017e-07, "loss": 0.2681, "step": 1155 }, { "epoch": 0.73, "grad_norm": 6.609538912713343, "learning_rate": 8.897627732920474e-07, "loss": 0.3573, "step": 1156 }, { "epoch": 0.73, "grad_norm": 5.5452974896995, "learning_rate": 8.858471300685012e-07, "loss": 0.1631, "step": 1157 }, { "epoch": 0.73, "grad_norm": 7.728287337495833, "learning_rate": 8.81938265756177e-07, "loss": 0.3244, "step": 1158 }, { "epoch": 0.73, "grad_norm": 4.948301837050428, "learning_rate": 8.780361967710185e-07, "loss": 0.2076, "step": 1159 }, { "epoch": 0.73, "grad_norm": 7.697940838422666, "learning_rate": 8.741409395004324e-07, "loss": 0.4055, "step": 1160 }, { "epoch": 0.73, "grad_norm": 5.494988672663058, "learning_rate": 8.702525103032186e-07, "loss": 0.1384, "step": 1161 }, { "epoch": 0.73, "grad_norm": 5.4240950291053736, "learning_rate": 8.663709255095018e-07, "loss": 0.2086, "step": 1162 }, { "epoch": 0.74, "grad_norm": 7.601571532946843, "learning_rate": 8.624962014206597e-07, "loss": 0.2748, "step": 1163 }, { "epoch": 0.74, "grad_norm": 7.106685456662032, "learning_rate": 8.586283543092603e-07, "loss": 0.4134, "step": 1164 }, { "epoch": 0.74, "grad_norm": 4.1297395238872205, "learning_rate": 8.547674004189894e-07, "loss": 0.0915, "step": 1165 }, { "epoch": 0.74, "grad_norm": 9.037661273197978, "learning_rate": 8.509133559645846e-07, "loss": 0.4811, "step": 1166 }, { "epoch": 0.74, "grad_norm": 1.0356577108462715, "learning_rate": 8.470662371317631e-07, "loss": 0.0052, "step": 1167 }, { "epoch": 0.74, "grad_norm": 6.276048622813399, "learning_rate": 8.4322606007716e-07, "loss": 0.1627, "step": 1168 }, { "epoch": 0.74, "grad_norm": 4.769225514553384, "learning_rate": 8.393928409282561e-07, "loss": 0.1517, "step": 1169 }, { "epoch": 0.74, "grad_norm": 5.967266437640687, "learning_rate": 8.35566595783312e-07, "loss": 0.2066, "step": 1170 }, { "epoch": 0.74, "grad_norm": 0.0004566797260357606, "learning_rate": 8.317473407112969e-07, "loss": 0.0, "step": 1171 }, { "epoch": 0.74, "grad_norm": 9.675421685386072, "learning_rate": 8.279350917518278e-07, "loss": 0.4631, "step": 1172 }, { "epoch": 0.74, "grad_norm": 0.45604089883548277, "learning_rate": 8.241298649150958e-07, "loss": 0.0021, "step": 1173 }, { "epoch": 0.74, "grad_norm": 9.14980425182308, "learning_rate": 8.203316761818034e-07, "loss": 0.4811, "step": 1174 }, { "epoch": 0.74, "grad_norm": 8.678894611062049, "learning_rate": 8.165405415030917e-07, "loss": 0.4395, "step": 1175 }, { "epoch": 0.74, "grad_norm": 6.299243869516143, "learning_rate": 8.12756476800483e-07, "loss": 0.3727, "step": 1176 }, { "epoch": 0.74, "grad_norm": 9.287321525902303, "learning_rate": 8.08979497965802e-07, "loss": 0.6829, "step": 1177 }, { "epoch": 0.74, "grad_norm": 6.9881196704349815, "learning_rate": 8.052096208611193e-07, "loss": 0.2384, "step": 1178 }, { "epoch": 0.75, "grad_norm": 5.170632388841534, "learning_rate": 8.014468613186768e-07, "loss": 0.1246, "step": 1179 }, { "epoch": 0.75, "grad_norm": 4.046362177932173, "learning_rate": 7.976912351408303e-07, "loss": 0.0351, "step": 1180 }, { "epoch": 0.75, "grad_norm": 6.544281876161677, "learning_rate": 7.939427580999717e-07, "loss": 0.1978, "step": 1181 }, { "epoch": 0.75, "grad_norm": 7.758983942348827, "learning_rate": 7.902014459384744e-07, "loss": 0.5287, "step": 1182 }, { "epoch": 0.75, "grad_norm": 5.402694326680793, "learning_rate": 7.864673143686161e-07, "loss": 0.2269, "step": 1183 }, { "epoch": 0.75, "grad_norm": 9.817453094051748, "learning_rate": 7.827403790725246e-07, "loss": 0.442, "step": 1184 }, { "epoch": 0.75, "grad_norm": 6.753924888818989, "learning_rate": 7.790206557021001e-07, "loss": 0.3795, "step": 1185 }, { "epoch": 0.75, "grad_norm": 9.122092842096281, "learning_rate": 7.753081598789594e-07, "loss": 0.5175, "step": 1186 }, { "epoch": 0.75, "grad_norm": 3.9394371474948677, "learning_rate": 7.716029071943612e-07, "loss": 0.074, "step": 1187 }, { "epoch": 0.75, "grad_norm": 6.743111097388839, "learning_rate": 7.67904913209152e-07, "loss": 0.098, "step": 1188 }, { "epoch": 0.75, "grad_norm": 11.447947915439228, "learning_rate": 7.642141934536876e-07, "loss": 0.6088, "step": 1189 }, { "epoch": 0.75, "grad_norm": 7.857754913865471, "learning_rate": 7.605307634277781e-07, "loss": 0.2789, "step": 1190 }, { "epoch": 0.75, "grad_norm": 7.365997250666482, "learning_rate": 7.568546386006185e-07, "loss": 0.416, "step": 1191 }, { "epoch": 0.75, "grad_norm": 4.114306837360714, "learning_rate": 7.531858344107246e-07, "loss": 0.1902, "step": 1192 }, { "epoch": 0.75, "grad_norm": 7.05534797356418, "learning_rate": 7.49524366265866e-07, "loss": 0.2576, "step": 1193 }, { "epoch": 0.75, "grad_norm": 9.82133989566553, "learning_rate": 7.458702495430056e-07, "loss": 0.5049, "step": 1194 }, { "epoch": 0.76, "grad_norm": 8.067195500885777, "learning_rate": 7.422234995882322e-07, "loss": 0.4976, "step": 1195 }, { "epoch": 0.76, "grad_norm": 4.689589378096578, "learning_rate": 7.385841317166966e-07, "loss": 0.0803, "step": 1196 }, { "epoch": 0.76, "grad_norm": 8.724191150500358, "learning_rate": 7.349521612125459e-07, "loss": 0.463, "step": 1197 }, { "epoch": 0.76, "grad_norm": 7.423637742971244, "learning_rate": 7.313276033288624e-07, "loss": 0.1105, "step": 1198 }, { "epoch": 0.76, "grad_norm": 10.449954705391146, "learning_rate": 7.277104732875972e-07, "loss": 0.6319, "step": 1199 }, { "epoch": 0.76, "grad_norm": 8.051548343337302, "learning_rate": 7.241007862795079e-07, "loss": 0.3577, "step": 1200 }, { "epoch": 0.76, "grad_norm": 7.683512576226657, "learning_rate": 7.20498557464091e-07, "loss": 0.2195, "step": 1201 }, { "epoch": 0.76, "grad_norm": 9.019963904602504, "learning_rate": 7.169038019695241e-07, "loss": 0.4715, "step": 1202 }, { "epoch": 0.76, "grad_norm": 4.181379257507884, "learning_rate": 7.133165348925978e-07, "loss": 0.0928, "step": 1203 }, { "epoch": 0.76, "grad_norm": 7.0344719180629065, "learning_rate": 7.097367712986548e-07, "loss": 0.3156, "step": 1204 }, { "epoch": 0.76, "grad_norm": 5.661127000627731, "learning_rate": 7.061645262215233e-07, "loss": 0.2014, "step": 1205 }, { "epoch": 0.76, "grad_norm": 8.183429667209468, "learning_rate": 7.025998146634591e-07, "loss": 0.5082, "step": 1206 }, { "epoch": 0.76, "grad_norm": 6.344470354343886, "learning_rate": 6.990426515950783e-07, "loss": 0.2123, "step": 1207 }, { "epoch": 0.76, "grad_norm": 5.637031065390516, "learning_rate": 6.954930519552963e-07, "loss": 0.1589, "step": 1208 }, { "epoch": 0.76, "grad_norm": 7.474302645914507, "learning_rate": 6.919510306512633e-07, "loss": 0.4279, "step": 1209 }, { "epoch": 0.77, "grad_norm": 3.846024747295774, "learning_rate": 6.884166025583044e-07, "loss": 0.0696, "step": 1210 }, { "epoch": 0.77, "grad_norm": 7.980983065903735, "learning_rate": 6.848897825198556e-07, "loss": 0.4559, "step": 1211 }, { "epoch": 0.77, "grad_norm": 6.823689641676835, "learning_rate": 6.813705853474014e-07, "loss": 0.2294, "step": 1212 }, { "epoch": 0.77, "grad_norm": 6.549338958230435, "learning_rate": 6.778590258204116e-07, "loss": 0.2122, "step": 1213 }, { "epoch": 0.77, "grad_norm": 7.953447316651441, "learning_rate": 6.743551186862817e-07, "loss": 0.3647, "step": 1214 }, { "epoch": 0.77, "grad_norm": 4.248103223405018, "learning_rate": 6.708588786602699e-07, "loss": 0.0772, "step": 1215 }, { "epoch": 0.77, "grad_norm": 4.828220798566685, "learning_rate": 6.673703204254348e-07, "loss": 0.0694, "step": 1216 }, { "epoch": 0.77, "grad_norm": 3.406936996195398, "learning_rate": 6.63889458632572e-07, "loss": 0.0508, "step": 1217 }, { "epoch": 0.77, "grad_norm": 2.291232593926028, "learning_rate": 6.604163079001586e-07, "loss": 0.025, "step": 1218 }, { "epoch": 0.77, "grad_norm": 6.0449736215474985, "learning_rate": 6.569508828142837e-07, "loss": 0.3029, "step": 1219 }, { "epoch": 0.77, "grad_norm": 4.924293142026417, "learning_rate": 6.534931979285947e-07, "loss": 0.098, "step": 1220 }, { "epoch": 0.77, "grad_norm": 4.687484130832513, "learning_rate": 6.500432677642282e-07, "loss": 0.179, "step": 1221 }, { "epoch": 0.77, "grad_norm": 7.767871309214235, "learning_rate": 6.4660110680976e-07, "loss": 0.2453, "step": 1222 }, { "epoch": 0.77, "grad_norm": 6.2186521589547255, "learning_rate": 6.431667295211314e-07, "loss": 0.2563, "step": 1223 }, { "epoch": 0.77, "grad_norm": 9.201440316942234, "learning_rate": 6.397401503215992e-07, "loss": 0.4153, "step": 1224 }, { "epoch": 0.77, "grad_norm": 5.83563811773922, "learning_rate": 6.363213836016669e-07, "loss": 0.1926, "step": 1225 }, { "epoch": 0.78, "grad_norm": 9.833525273233217, "learning_rate": 6.329104437190333e-07, "loss": 0.6962, "step": 1226 }, { "epoch": 0.78, "grad_norm": 5.720222658745461, "learning_rate": 6.295073449985223e-07, "loss": 0.1187, "step": 1227 }, { "epoch": 0.78, "grad_norm": 3.635566701042897, "learning_rate": 6.261121017320299e-07, "loss": 0.0345, "step": 1228 }, { "epoch": 0.78, "grad_norm": 9.417512253686738, "learning_rate": 6.227247281784613e-07, "loss": 0.6206, "step": 1229 }, { "epoch": 0.78, "grad_norm": 7.748078384893074, "learning_rate": 6.193452385636722e-07, "loss": 0.3458, "step": 1230 }, { "epoch": 0.78, "grad_norm": 8.378567064084603, "learning_rate": 6.159736470804059e-07, "loss": 0.3271, "step": 1231 }, { "epoch": 0.78, "grad_norm": 4.69959905009879, "learning_rate": 6.126099678882389e-07, "loss": 0.1606, "step": 1232 }, { "epoch": 0.78, "grad_norm": 3.7809707916834636, "learning_rate": 6.092542151135175e-07, "loss": 0.0598, "step": 1233 }, { "epoch": 0.78, "grad_norm": 8.208261040951747, "learning_rate": 6.059064028493003e-07, "loss": 0.4187, "step": 1234 }, { "epoch": 0.78, "grad_norm": 5.573923104458301, "learning_rate": 6.025665451552968e-07, "loss": 0.2647, "step": 1235 }, { "epoch": 0.78, "grad_norm": 8.77186642419758, "learning_rate": 5.992346560578113e-07, "loss": 0.5419, "step": 1236 }, { "epoch": 0.78, "grad_norm": 5.235602118287411, "learning_rate": 5.959107495496827e-07, "loss": 0.167, "step": 1237 }, { "epoch": 0.78, "grad_norm": 9.377726654266931, "learning_rate": 5.925948395902253e-07, "loss": 0.6169, "step": 1238 }, { "epoch": 0.78, "grad_norm": 6.714411412601687, "learning_rate": 5.892869401051698e-07, "loss": 0.2701, "step": 1239 }, { "epoch": 0.78, "grad_norm": 6.317109492874942, "learning_rate": 5.859870649866065e-07, "loss": 0.2369, "step": 1240 }, { "epoch": 0.78, "grad_norm": 0.32657318782199934, "learning_rate": 5.826952280929258e-07, "loss": 0.0016, "step": 1241 }, { "epoch": 0.79, "grad_norm": 5.860547408747393, "learning_rate": 5.794114432487605e-07, "loss": 0.3061, "step": 1242 }, { "epoch": 0.79, "grad_norm": 8.334092626948106, "learning_rate": 5.761357242449259e-07, "loss": 0.4243, "step": 1243 }, { "epoch": 0.79, "grad_norm": 4.1639392827395785, "learning_rate": 5.728680848383653e-07, "loss": 0.0566, "step": 1244 }, { "epoch": 0.79, "grad_norm": 7.430033036950494, "learning_rate": 5.696085387520894e-07, "loss": 0.4149, "step": 1245 }, { "epoch": 0.79, "grad_norm": 8.595394906496898, "learning_rate": 5.663570996751208e-07, "loss": 0.5662, "step": 1246 }, { "epoch": 0.79, "grad_norm": 7.530556294636503, "learning_rate": 5.631137812624329e-07, "loss": 0.4349, "step": 1247 }, { "epoch": 0.79, "grad_norm": 5.6481565455714176, "learning_rate": 5.598785971348969e-07, "loss": 0.2056, "step": 1248 }, { "epoch": 0.79, "grad_norm": 3.342931344183399, "learning_rate": 5.566515608792223e-07, "loss": 0.1623, "step": 1249 }, { "epoch": 0.79, "grad_norm": 2.9183746423272177, "learning_rate": 5.534326860479008e-07, "loss": 0.0167, "step": 1250 }, { "epoch": 0.79, "grad_norm": 6.021548676371724, "learning_rate": 5.502219861591471e-07, "loss": 0.2745, "step": 1251 }, { "epoch": 0.79, "grad_norm": 7.3786650780300524, "learning_rate": 5.470194746968451e-07, "loss": 0.2425, "step": 1252 }, { "epoch": 0.79, "grad_norm": 4.863226342561528, "learning_rate": 5.4382516511049e-07, "loss": 0.1711, "step": 1253 }, { "epoch": 0.79, "grad_norm": 6.759145686774258, "learning_rate": 5.406390708151321e-07, "loss": 0.2674, "step": 1254 }, { "epoch": 0.79, "grad_norm": 6.065181493095607, "learning_rate": 5.374612051913186e-07, "loss": 0.2833, "step": 1255 }, { "epoch": 0.79, "grad_norm": 8.418198046471073, "learning_rate": 5.342915815850402e-07, "loss": 0.5178, "step": 1256 }, { "epoch": 0.79, "grad_norm": 9.266771113769888, "learning_rate": 5.311302133076742e-07, "loss": 0.4868, "step": 1257 }, { "epoch": 0.8, "grad_norm": 3.99996042232007, "learning_rate": 5.279771136359279e-07, "loss": 0.1837, "step": 1258 }, { "epoch": 0.8, "grad_norm": 8.302677632642498, "learning_rate": 5.248322958117815e-07, "loss": 0.4099, "step": 1259 }, { "epoch": 0.8, "grad_norm": 4.450149987939859, "learning_rate": 5.216957730424382e-07, "loss": 0.097, "step": 1260 }, { "epoch": 0.8, "grad_norm": 9.49409622655994, "learning_rate": 5.185675585002597e-07, "loss": 0.6324, "step": 1261 }, { "epoch": 0.8, "grad_norm": 8.4665371301709, "learning_rate": 5.154476653227203e-07, "loss": 0.4794, "step": 1262 }, { "epoch": 0.8, "grad_norm": 2.9993792527445464, "learning_rate": 5.123361066123428e-07, "loss": 0.0241, "step": 1263 }, { "epoch": 0.8, "grad_norm": 8.411799871323337, "learning_rate": 5.092328954366535e-07, "loss": 0.2649, "step": 1264 }, { "epoch": 0.8, "grad_norm": 6.083115909122426, "learning_rate": 5.061380448281166e-07, "loss": 0.2333, "step": 1265 }, { "epoch": 0.8, "grad_norm": 8.19014179037132, "learning_rate": 5.030515677840883e-07, "loss": 0.3651, "step": 1266 }, { "epoch": 0.8, "grad_norm": 5.638652424899343, "learning_rate": 4.999734772667552e-07, "loss": 0.2034, "step": 1267 }, { "epoch": 0.8, "grad_norm": 8.191679142748107, "learning_rate": 4.969037862030881e-07, "loss": 0.237, "step": 1268 }, { "epoch": 0.8, "grad_norm": 6.194300449828385, "learning_rate": 4.938425074847778e-07, "loss": 0.1435, "step": 1269 }, { "epoch": 0.8, "grad_norm": 2.5156697334434965, "learning_rate": 4.907896539681884e-07, "loss": 0.0223, "step": 1270 }, { "epoch": 0.8, "grad_norm": 6.079149802142939, "learning_rate": 4.877452384743012e-07, "loss": 0.1507, "step": 1271 }, { "epoch": 0.8, "grad_norm": 4.633573884459923, "learning_rate": 4.847092737886608e-07, "loss": 0.0838, "step": 1272 }, { "epoch": 0.8, "grad_norm": 7.387581648270514, "learning_rate": 4.816817726613188e-07, "loss": 0.6913, "step": 1273 }, { "epoch": 0.81, "grad_norm": 4.890780607922828, "learning_rate": 4.786627478067852e-07, "loss": 0.1232, "step": 1274 }, { "epoch": 0.81, "grad_norm": 4.01637350149555, "learning_rate": 4.756522119039714e-07, "loss": 0.0432, "step": 1275 }, { "epoch": 0.81, "grad_norm": 6.7069533293012995, "learning_rate": 4.72650177596139e-07, "loss": 0.4121, "step": 1276 }, { "epoch": 0.81, "grad_norm": 5.7918383163520435, "learning_rate": 4.6965665749084345e-07, "loss": 0.2983, "step": 1277 }, { "epoch": 0.81, "grad_norm": 5.976018663982369, "learning_rate": 4.666716641598851e-07, "loss": 0.2853, "step": 1278 }, { "epoch": 0.81, "grad_norm": 4.863794212405065, "learning_rate": 4.6369521013925475e-07, "loss": 0.1324, "step": 1279 }, { "epoch": 0.81, "grad_norm": 4.5537468542506465, "learning_rate": 4.607273079290811e-07, "loss": 0.1497, "step": 1280 }, { "epoch": 0.81, "grad_norm": 4.5258704512695465, "learning_rate": 4.577679699935755e-07, "loss": 0.1564, "step": 1281 }, { "epoch": 0.81, "grad_norm": 0.0004484928355647459, "learning_rate": 4.548172087609856e-07, "loss": 0.0, "step": 1282 }, { "epoch": 0.81, "grad_norm": 11.357004540107548, "learning_rate": 4.5187503662353747e-07, "loss": 0.762, "step": 1283 }, { "epoch": 0.81, "grad_norm": 4.270852264114231, "learning_rate": 4.4894146593738724e-07, "loss": 0.1886, "step": 1284 }, { "epoch": 0.81, "grad_norm": 4.921765523026006, "learning_rate": 4.4601650902256597e-07, "loss": 0.2278, "step": 1285 }, { "epoch": 0.81, "grad_norm": 8.427307754627865, "learning_rate": 4.4310017816293155e-07, "loss": 0.3, "step": 1286 }, { "epoch": 0.81, "grad_norm": 5.4904721702882675, "learning_rate": 4.401924856061146e-07, "loss": 0.236, "step": 1287 }, { "epoch": 0.81, "grad_norm": 7.381259395583696, "learning_rate": 4.3729344356346767e-07, "loss": 0.4531, "step": 1288 }, { "epoch": 0.81, "grad_norm": 6.15602227578895, "learning_rate": 4.344030642100133e-07, "loss": 0.2468, "step": 1289 }, { "epoch": 0.82, "grad_norm": 5.471011146663745, "learning_rate": 4.315213596843945e-07, "loss": 0.1772, "step": 1290 }, { "epoch": 0.82, "grad_norm": 5.316799443579021, "learning_rate": 4.2864834208882267e-07, "loss": 0.2099, "step": 1291 }, { "epoch": 0.82, "grad_norm": 6.078974411948371, "learning_rate": 4.2578402348902756e-07, "loss": 0.1977, "step": 1292 }, { "epoch": 0.82, "grad_norm": 7.23347062128574, "learning_rate": 4.229284159142044e-07, "loss": 0.5377, "step": 1293 }, { "epoch": 0.82, "grad_norm": 0.3464634610544234, "learning_rate": 4.2008153135696587e-07, "loss": 0.0015, "step": 1294 }, { "epoch": 0.82, "grad_norm": 7.8637455037130986, "learning_rate": 4.1724338177329156e-07, "loss": 0.2514, "step": 1295 }, { "epoch": 0.82, "grad_norm": 7.861907247690541, "learning_rate": 4.144139790824764e-07, "loss": 0.3169, "step": 1296 }, { "epoch": 0.82, "grad_norm": 7.6535236097512325, "learning_rate": 4.1159333516707976e-07, "loss": 0.4802, "step": 1297 }, { "epoch": 0.82, "grad_norm": 5.452392528931751, "learning_rate": 4.0878146187288084e-07, "loss": 0.2476, "step": 1298 }, { "epoch": 0.82, "grad_norm": 10.366285225902557, "learning_rate": 4.0597837100882053e-07, "loss": 0.6126, "step": 1299 }, { "epoch": 0.82, "grad_norm": 8.993415967346264, "learning_rate": 4.031840743469606e-07, "loss": 0.5295, "step": 1300 }, { "epoch": 0.82, "grad_norm": 6.177072080073364, "learning_rate": 4.0039858362242555e-07, "loss": 0.1871, "step": 1301 }, { "epoch": 0.82, "grad_norm": 6.172281990635541, "learning_rate": 3.976219105333634e-07, "loss": 0.2052, "step": 1302 }, { "epoch": 0.82, "grad_norm": 6.308371222855513, "learning_rate": 3.9485406674088595e-07, "loss": 0.439, "step": 1303 }, { "epoch": 0.82, "grad_norm": 4.711694558520636, "learning_rate": 3.920950638690277e-07, "loss": 0.1565, "step": 1304 }, { "epoch": 0.83, "grad_norm": 4.772361122057014, "learning_rate": 3.893449135046923e-07, "loss": 0.1727, "step": 1305 }, { "epoch": 0.83, "grad_norm": 5.311852449332666, "learning_rate": 3.8660362719760867e-07, "loss": 0.0844, "step": 1306 }, { "epoch": 0.83, "grad_norm": 8.42096406614009, "learning_rate": 3.838712164602762e-07, "loss": 0.4988, "step": 1307 }, { "epoch": 0.83, "grad_norm": 6.279107938448386, "learning_rate": 3.811476927679228e-07, "loss": 0.2964, "step": 1308 }, { "epoch": 0.83, "grad_norm": 3.5713009457264016, "learning_rate": 3.7843306755845024e-07, "loss": 0.045, "step": 1309 }, { "epoch": 0.83, "grad_norm": 7.283329875372001, "learning_rate": 3.7572735223239426e-07, "loss": 0.4163, "step": 1310 }, { "epoch": 0.83, "grad_norm": 4.128545682207385, "learning_rate": 3.7303055815286733e-07, "loss": 0.0763, "step": 1311 }, { "epoch": 0.83, "grad_norm": 6.274153767095931, "learning_rate": 3.703426966455187e-07, "loss": 0.2464, "step": 1312 }, { "epoch": 0.83, "grad_norm": 4.929415107371386, "learning_rate": 3.6766377899848243e-07, "loss": 0.1365, "step": 1313 }, { "epoch": 0.83, "grad_norm": 8.654715832431283, "learning_rate": 3.649938164623326e-07, "loss": 0.4255, "step": 1314 }, { "epoch": 0.83, "grad_norm": 4.689330493828162, "learning_rate": 3.623328202500323e-07, "loss": 0.1858, "step": 1315 }, { "epoch": 0.83, "grad_norm": 6.308800548081129, "learning_rate": 3.5968080153689104e-07, "loss": 0.3296, "step": 1316 }, { "epoch": 0.83, "grad_norm": 10.070902662836929, "learning_rate": 3.570377714605158e-07, "loss": 0.6856, "step": 1317 }, { "epoch": 0.83, "grad_norm": 5.784315791308768, "learning_rate": 3.544037411207638e-07, "loss": 0.1901, "step": 1318 }, { "epoch": 0.83, "grad_norm": 7.281675661930889, "learning_rate": 3.517787215796953e-07, "loss": 0.4394, "step": 1319 }, { "epoch": 0.83, "grad_norm": 6.117864071979914, "learning_rate": 3.4916272386152946e-07, "loss": 0.135, "step": 1320 }, { "epoch": 0.84, "grad_norm": 5.175714917027768, "learning_rate": 3.4655575895259667e-07, "loss": 0.1428, "step": 1321 }, { "epoch": 0.84, "grad_norm": 6.587351220783437, "learning_rate": 3.4395783780129254e-07, "loss": 0.2707, "step": 1322 }, { "epoch": 0.84, "grad_norm": 6.910050257101179, "learning_rate": 3.413689713180307e-07, "loss": 0.2505, "step": 1323 }, { "epoch": 0.84, "grad_norm": 6.394795817201763, "learning_rate": 3.387891703751994e-07, "loss": 0.078, "step": 1324 }, { "epoch": 0.84, "grad_norm": 0.0004353534679147159, "learning_rate": 3.362184458071141e-07, "loss": 0.0, "step": 1325 }, { "epoch": 0.84, "grad_norm": 4.491810446132425, "learning_rate": 3.3365680840997337e-07, "loss": 0.1125, "step": 1326 }, { "epoch": 0.84, "grad_norm": 5.685452270769086, "learning_rate": 3.3110426894181066e-07, "loss": 0.1605, "step": 1327 }, { "epoch": 0.84, "grad_norm": 5.849547096392178, "learning_rate": 3.2856083812245253e-07, "loss": 0.1821, "step": 1328 }, { "epoch": 0.84, "grad_norm": 7.796367621110237, "learning_rate": 3.260265266334725e-07, "loss": 0.4107, "step": 1329 }, { "epoch": 0.84, "grad_norm": 6.590596534229808, "learning_rate": 3.235013451181457e-07, "loss": 0.2966, "step": 1330 }, { "epoch": 0.84, "grad_norm": 8.824323365637058, "learning_rate": 3.2098530418140315e-07, "loss": 0.5485, "step": 1331 }, { "epoch": 0.84, "grad_norm": 8.16455123387599, "learning_rate": 3.184784143897895e-07, "loss": 0.2093, "step": 1332 }, { "epoch": 0.84, "grad_norm": 9.05220657340829, "learning_rate": 3.1598068627141763e-07, "loss": 0.4038, "step": 1333 }, { "epoch": 0.84, "grad_norm": 8.689539388690154, "learning_rate": 3.1349213031592495e-07, "loss": 0.6675, "step": 1334 }, { "epoch": 0.84, "grad_norm": 7.255588350843405, "learning_rate": 3.11012756974427e-07, "loss": 0.3487, "step": 1335 }, { "epoch": 0.84, "grad_norm": 5.194256163169376, "learning_rate": 3.0854257665947683e-07, "loss": 0.1817, "step": 1336 }, { "epoch": 0.85, "grad_norm": 8.814087204001677, "learning_rate": 3.0608159974501924e-07, "loss": 0.4353, "step": 1337 }, { "epoch": 0.85, "grad_norm": 5.552531707961687, "learning_rate": 3.036298365663487e-07, "loss": 0.1674, "step": 1338 }, { "epoch": 0.85, "grad_norm": 3.6705039203821888, "learning_rate": 3.01187297420063e-07, "loss": 0.0878, "step": 1339 }, { "epoch": 0.85, "grad_norm": 3.8658706426632623, "learning_rate": 2.987539925640248e-07, "loss": 0.1105, "step": 1340 }, { "epoch": 0.85, "grad_norm": 6.168866083198873, "learning_rate": 2.96329932217313e-07, "loss": 0.1791, "step": 1341 }, { "epoch": 0.85, "grad_norm": 6.387063374685278, "learning_rate": 2.93915126560185e-07, "loss": 0.3495, "step": 1342 }, { "epoch": 0.85, "grad_norm": 6.837495637156543, "learning_rate": 2.915095857340289e-07, "loss": 0.2139, "step": 1343 }, { "epoch": 0.85, "grad_norm": 4.393526360801382, "learning_rate": 2.891133198413268e-07, "loss": 0.1317, "step": 1344 }, { "epoch": 0.85, "grad_norm": 8.024247616112522, "learning_rate": 2.8672633894560596e-07, "loss": 0.3229, "step": 1345 }, { "epoch": 0.85, "grad_norm": 6.983989116070319, "learning_rate": 2.843486530714026e-07, "loss": 0.2376, "step": 1346 }, { "epoch": 0.85, "grad_norm": 7.596305260268161, "learning_rate": 2.8198027220421363e-07, "loss": 0.1078, "step": 1347 }, { "epoch": 0.85, "grad_norm": 4.16937889834316, "learning_rate": 2.7962120629046183e-07, "loss": 0.2107, "step": 1348 }, { "epoch": 0.85, "grad_norm": 4.141138584746491, "learning_rate": 2.772714652374467e-07, "loss": 0.0773, "step": 1349 }, { "epoch": 0.85, "grad_norm": 6.2918029500925226, "learning_rate": 2.7493105891330837e-07, "loss": 0.1872, "step": 1350 }, { "epoch": 0.85, "grad_norm": 3.1827283560670367, "learning_rate": 2.725999971469839e-07, "loss": 0.0751, "step": 1351 }, { "epoch": 0.85, "grad_norm": 7.441762520178824, "learning_rate": 2.7027828972816664e-07, "loss": 0.2989, "step": 1352 }, { "epoch": 0.86, "grad_norm": 7.823169472203725, "learning_rate": 2.679659464072631e-07, "loss": 0.3266, "step": 1353 }, { "epoch": 0.86, "grad_norm": 5.302283482499232, "learning_rate": 2.656629768953553e-07, "loss": 0.0994, "step": 1354 }, { "epoch": 0.86, "grad_norm": 3.46057154526848, "learning_rate": 2.6336939086415796e-07, "loss": 0.0911, "step": 1355 }, { "epoch": 0.86, "grad_norm": 5.783404304589278, "learning_rate": 2.610851979459786e-07, "loss": 0.1883, "step": 1356 }, { "epoch": 0.86, "grad_norm": 5.0405895669428435, "learning_rate": 2.588104077336751e-07, "loss": 0.1434, "step": 1357 }, { "epoch": 0.86, "grad_norm": 5.417202477974949, "learning_rate": 2.565450297806191e-07, "loss": 0.1391, "step": 1358 }, { "epoch": 0.86, "grad_norm": 3.845751729631906, "learning_rate": 2.5428907360065256e-07, "loss": 0.1827, "step": 1359 }, { "epoch": 0.86, "grad_norm": 6.130814264428655, "learning_rate": 2.520425486680503e-07, "loss": 0.1775, "step": 1360 }, { "epoch": 0.86, "grad_norm": 7.1435033560399415, "learning_rate": 2.4980546441747744e-07, "loss": 0.2421, "step": 1361 }, { "epoch": 0.86, "grad_norm": 6.788641070421076, "learning_rate": 2.4757783024395244e-07, "loss": 0.2225, "step": 1362 }, { "epoch": 0.86, "grad_norm": 9.5390928925601, "learning_rate": 2.4535965550280594e-07, "loss": 0.6407, "step": 1363 }, { "epoch": 0.86, "grad_norm": 8.059805486333964, "learning_rate": 2.431509495096435e-07, "loss": 0.4669, "step": 1364 }, { "epoch": 0.86, "grad_norm": 0.6004940303788455, "learning_rate": 2.4095172154030235e-07, "loss": 0.003, "step": 1365 }, { "epoch": 0.86, "grad_norm": 9.662855339672104, "learning_rate": 2.3876198083081754e-07, "loss": 0.6122, "step": 1366 }, { "epoch": 0.86, "grad_norm": 4.7828968677475965, "learning_rate": 2.365817365773801e-07, "loss": 0.0665, "step": 1367 }, { "epoch": 0.86, "grad_norm": 4.590139202591782, "learning_rate": 2.3441099793629906e-07, "loss": 0.2537, "step": 1368 }, { "epoch": 0.87, "grad_norm": 9.344419525552738, "learning_rate": 2.3224977402396183e-07, "loss": 0.5967, "step": 1369 }, { "epoch": 0.87, "grad_norm": 7.027213468080827, "learning_rate": 2.300980739167988e-07, "loss": 0.208, "step": 1370 }, { "epoch": 0.87, "grad_norm": 9.5308681724236, "learning_rate": 2.2795590665124267e-07, "loss": 0.5625, "step": 1371 }, { "epoch": 0.87, "grad_norm": 5.268451056817928, "learning_rate": 2.2582328122369212e-07, "loss": 0.0789, "step": 1372 }, { "epoch": 0.87, "grad_norm": 6.426635759577424, "learning_rate": 2.2370020659047144e-07, "loss": 0.2975, "step": 1373 }, { "epoch": 0.87, "grad_norm": 7.1845259110910495, "learning_rate": 2.2158669166779667e-07, "loss": 0.4297, "step": 1374 }, { "epoch": 0.87, "grad_norm": 4.98179038071042, "learning_rate": 2.1948274533173525e-07, "loss": 0.0769, "step": 1375 }, { "epoch": 0.87, "grad_norm": 6.546048715383937, "learning_rate": 2.1738837641817e-07, "loss": 0.3055, "step": 1376 }, { "epoch": 0.87, "grad_norm": 4.827645781107163, "learning_rate": 2.1530359372276126e-07, "loss": 0.1171, "step": 1377 }, { "epoch": 0.87, "grad_norm": 5.874737023494315, "learning_rate": 2.1322840600091098e-07, "loss": 0.3757, "step": 1378 }, { "epoch": 0.87, "grad_norm": 5.133585553764287, "learning_rate": 2.111628219677253e-07, "loss": 0.2789, "step": 1379 }, { "epoch": 0.87, "grad_norm": 11.140339423715327, "learning_rate": 2.0910685029797789e-07, "loss": 0.5185, "step": 1380 }, { "epoch": 0.87, "grad_norm": 5.114861671762458, "learning_rate": 2.070604996260725e-07, "loss": 0.3173, "step": 1381 }, { "epoch": 0.87, "grad_norm": 4.469462517975171, "learning_rate": 2.0502377854601103e-07, "loss": 0.0963, "step": 1382 }, { "epoch": 0.87, "grad_norm": 4.584093389202246, "learning_rate": 2.0299669561135048e-07, "loss": 0.2258, "step": 1383 }, { "epoch": 0.88, "grad_norm": 7.381083678617386, "learning_rate": 2.0097925933517393e-07, "loss": 0.272, "step": 1384 }, { "epoch": 0.88, "grad_norm": 6.530355729933298, "learning_rate": 1.9897147819004841e-07, "loss": 0.5881, "step": 1385 }, { "epoch": 0.88, "grad_norm": 5.201646932808746, "learning_rate": 1.9697336060799626e-07, "loss": 0.1984, "step": 1386 }, { "epoch": 0.88, "grad_norm": 9.683744231636336, "learning_rate": 1.9498491498045334e-07, "loss": 0.6364, "step": 1387 }, { "epoch": 0.88, "grad_norm": 5.771182640695547, "learning_rate": 1.9300614965823805e-07, "loss": 0.2412, "step": 1388 }, { "epoch": 0.88, "grad_norm": 8.323466703328956, "learning_rate": 1.9103707295151262e-07, "loss": 0.3189, "step": 1389 }, { "epoch": 0.88, "grad_norm": 6.568747026640621, "learning_rate": 1.8907769312975332e-07, "loss": 0.2135, "step": 1390 }, { "epoch": 0.88, "grad_norm": 6.360316701237373, "learning_rate": 1.8712801842171004e-07, "loss": 0.2523, "step": 1391 }, { "epoch": 0.88, "grad_norm": 7.498390533688088, "learning_rate": 1.851880570153755e-07, "loss": 0.3119, "step": 1392 }, { "epoch": 0.88, "grad_norm": 7.627047154510003, "learning_rate": 1.8325781705794964e-07, "loss": 0.4537, "step": 1393 }, { "epoch": 0.88, "grad_norm": 8.374041146573841, "learning_rate": 1.8133730665580608e-07, "loss": 0.5308, "step": 1394 }, { "epoch": 0.88, "grad_norm": 4.223755755977993, "learning_rate": 1.7942653387445607e-07, "loss": 0.0912, "step": 1395 }, { "epoch": 0.88, "grad_norm": 6.3063778035395694, "learning_rate": 1.775255067385173e-07, "loss": 0.3447, "step": 1396 }, { "epoch": 0.88, "grad_norm": 6.159462956884823, "learning_rate": 1.7563423323167878e-07, "loss": 0.1981, "step": 1397 }, { "epoch": 0.88, "grad_norm": 8.2887937338614, "learning_rate": 1.737527212966672e-07, "loss": 0.3705, "step": 1398 }, { "epoch": 0.88, "grad_norm": 6.325701214063891, "learning_rate": 1.7188097883521353e-07, "loss": 0.2369, "step": 1399 }, { "epoch": 0.89, "grad_norm": 9.599735145094215, "learning_rate": 1.700190137080207e-07, "loss": 0.5197, "step": 1400 }, { "epoch": 0.89, "grad_norm": 5.436528436964425, "learning_rate": 1.6816683373473004e-07, "loss": 0.1849, "step": 1401 }, { "epoch": 0.89, "grad_norm": 8.051643573420177, "learning_rate": 1.6632444669388876e-07, "loss": 0.4636, "step": 1402 }, { "epoch": 0.89, "grad_norm": 5.92276421063471, "learning_rate": 1.6449186032291565e-07, "loss": 0.1783, "step": 1403 }, { "epoch": 0.89, "grad_norm": 4.982641032089617, "learning_rate": 1.6266908231807127e-07, "loss": 0.2434, "step": 1404 }, { "epoch": 0.89, "grad_norm": 8.621919828314278, "learning_rate": 1.608561203344239e-07, "loss": 0.6195, "step": 1405 }, { "epoch": 0.89, "grad_norm": 6.4191879999621, "learning_rate": 1.5905298198581776e-07, "loss": 0.175, "step": 1406 }, { "epoch": 0.89, "grad_norm": 5.64018099825841, "learning_rate": 1.5725967484484046e-07, "loss": 0.1374, "step": 1407 }, { "epoch": 0.89, "grad_norm": 4.074503375724342, "learning_rate": 1.5547620644279255e-07, "loss": 0.0964, "step": 1408 }, { "epoch": 0.89, "grad_norm": 3.2040563206383803, "learning_rate": 1.537025842696549e-07, "loss": 0.182, "step": 1409 }, { "epoch": 0.89, "grad_norm": 8.622362701128079, "learning_rate": 1.5193881577405729e-07, "loss": 0.4963, "step": 1410 }, { "epoch": 0.89, "grad_norm": 9.350747438634668, "learning_rate": 1.50184908363247e-07, "loss": 0.4012, "step": 1411 }, { "epoch": 0.89, "grad_norm": 8.583122423660912, "learning_rate": 1.484408694030587e-07, "loss": 0.3722, "step": 1412 }, { "epoch": 0.89, "grad_norm": 7.387553764444533, "learning_rate": 1.467067062178823e-07, "loss": 0.509, "step": 1413 }, { "epoch": 0.89, "grad_norm": 4.379003382480915, "learning_rate": 1.449824260906335e-07, "loss": 0.0959, "step": 1414 }, { "epoch": 0.89, "grad_norm": 8.58051937852108, "learning_rate": 1.4326803626272117e-07, "loss": 0.6395, "step": 1415 }, { "epoch": 0.9, "grad_norm": 4.478441945670635, "learning_rate": 1.415635439340185e-07, "loss": 0.0697, "step": 1416 }, { "epoch": 0.9, "grad_norm": 5.289528582503555, "learning_rate": 1.398689562628333e-07, "loss": 0.2054, "step": 1417 }, { "epoch": 0.9, "grad_norm": 6.4945406361767555, "learning_rate": 1.38184280365877e-07, "loss": 0.1162, "step": 1418 }, { "epoch": 0.9, "grad_norm": 7.735258541311191, "learning_rate": 1.365095233182326e-07, "loss": 0.4807, "step": 1419 }, { "epoch": 0.9, "grad_norm": 9.960743334504654, "learning_rate": 1.3484469215333084e-07, "loss": 0.7189, "step": 1420 }, { "epoch": 0.9, "grad_norm": 6.132947164624221, "learning_rate": 1.3318979386291397e-07, "loss": 0.1765, "step": 1421 }, { "epoch": 0.9, "grad_norm": 9.45488103524444, "learning_rate": 1.315448353970114e-07, "loss": 0.4651, "step": 1422 }, { "epoch": 0.9, "grad_norm": 8.637253170654683, "learning_rate": 1.2990982366390654e-07, "loss": 0.381, "step": 1423 }, { "epoch": 0.9, "grad_norm": 7.641859083620049, "learning_rate": 1.2828476553011227e-07, "loss": 0.3657, "step": 1424 }, { "epoch": 0.9, "grad_norm": 7.08632240806161, "learning_rate": 1.2666966782033745e-07, "loss": 0.2428, "step": 1425 }, { "epoch": 0.9, "grad_norm": 7.529403523482693, "learning_rate": 1.2506453731746194e-07, "loss": 0.3485, "step": 1426 }, { "epoch": 0.9, "grad_norm": 3.941044135798286, "learning_rate": 1.234693807625048e-07, "loss": 0.1049, "step": 1427 }, { "epoch": 0.9, "grad_norm": 4.211934553733774, "learning_rate": 1.2188420485460066e-07, "loss": 0.1333, "step": 1428 }, { "epoch": 0.9, "grad_norm": 7.444159083171612, "learning_rate": 1.2030901625096596e-07, "loss": 0.2546, "step": 1429 }, { "epoch": 0.9, "grad_norm": 3.5884628545617323, "learning_rate": 1.1874382156687514e-07, "loss": 0.1518, "step": 1430 }, { "epoch": 0.9, "grad_norm": 2.819229890357049, "learning_rate": 1.1718862737563063e-07, "loss": 0.023, "step": 1431 }, { "epoch": 0.91, "grad_norm": 4.415873738252984, "learning_rate": 1.1564344020853768e-07, "loss": 0.0783, "step": 1432 }, { "epoch": 0.91, "grad_norm": 5.482058478727181, "learning_rate": 1.1410826655487317e-07, "loss": 0.2457, "step": 1433 }, { "epoch": 0.91, "grad_norm": 5.2669632566614215, "learning_rate": 1.1258311286186208e-07, "loss": 0.1778, "step": 1434 }, { "epoch": 0.91, "grad_norm": 5.789782357086632, "learning_rate": 1.1106798553464804e-07, "loss": 0.3006, "step": 1435 }, { "epoch": 0.91, "grad_norm": 2.8420653331100634, "learning_rate": 1.0956289093626837e-07, "loss": 0.0192, "step": 1436 }, { "epoch": 0.91, "grad_norm": 7.920763287609952, "learning_rate": 1.0806783538762439e-07, "loss": 0.5136, "step": 1437 }, { "epoch": 0.91, "grad_norm": 10.284644404278689, "learning_rate": 1.0658282516745833e-07, "loss": 0.4975, "step": 1438 }, { "epoch": 0.91, "grad_norm": 5.4656066580148925, "learning_rate": 1.0510786651232452e-07, "loss": 0.1322, "step": 1439 }, { "epoch": 0.91, "grad_norm": 10.819073569639567, "learning_rate": 1.0364296561656467e-07, "loss": 0.3233, "step": 1440 }, { "epoch": 0.91, "grad_norm": 8.849850979859642, "learning_rate": 1.0218812863228012e-07, "loss": 0.4625, "step": 1441 }, { "epoch": 0.91, "grad_norm": 8.049652983015644, "learning_rate": 1.007433616693082e-07, "loss": 0.4446, "step": 1442 }, { "epoch": 0.91, "grad_norm": 5.74265450375623, "learning_rate": 9.930867079519485e-08, "loss": 0.1227, "step": 1443 }, { "epoch": 0.91, "grad_norm": 6.054313874560784, "learning_rate": 9.788406203517037e-08, "loss": 0.1747, "step": 1444 }, { "epoch": 0.91, "grad_norm": 5.792405371817055, "learning_rate": 9.646954137212228e-08, "loss": 0.2176, "step": 1445 }, { "epoch": 0.91, "grad_norm": 5.561878641120704, "learning_rate": 9.506511474657282e-08, "loss": 0.316, "step": 1446 }, { "epoch": 0.91, "grad_norm": 1.1851026529201485, "learning_rate": 9.367078805665147e-08, "loss": 0.0072, "step": 1447 }, { "epoch": 0.92, "grad_norm": 3.1354173594798254, "learning_rate": 9.22865671580725e-08, "loss": 0.0725, "step": 1448 }, { "epoch": 0.92, "grad_norm": 5.275467093519736, "learning_rate": 9.091245786410768e-08, "loss": 0.0572, "step": 1449 }, { "epoch": 0.92, "grad_norm": 6.923831706187367, "learning_rate": 8.954846594556449e-08, "loss": 0.3651, "step": 1450 }, { "epoch": 0.92, "grad_norm": 5.066185069201036, "learning_rate": 8.819459713076073e-08, "loss": 0.154, "step": 1451 }, { "epoch": 0.92, "grad_norm": 8.499823624520053, "learning_rate": 8.68508571054999e-08, "loss": 0.3934, "step": 1452 }, { "epoch": 0.92, "grad_norm": 5.946462512481981, "learning_rate": 8.551725151304785e-08, "loss": 0.2607, "step": 1453 }, { "epoch": 0.92, "grad_norm": 1.833471712034125, "learning_rate": 8.419378595411004e-08, "loss": 0.0076, "step": 1454 }, { "epoch": 0.92, "grad_norm": 5.882422769917423, "learning_rate": 8.288046598680627e-08, "loss": 0.3481, "step": 1455 }, { "epoch": 0.92, "grad_norm": 1.331843030054485, "learning_rate": 8.157729712664875e-08, "loss": 0.0064, "step": 1456 }, { "epoch": 0.92, "grad_norm": 5.300152060288706, "learning_rate": 8.028428484651796e-08, "loss": 0.2812, "step": 1457 }, { "epoch": 0.92, "grad_norm": 5.189792540121143, "learning_rate": 7.900143457664017e-08, "loss": 0.2562, "step": 1458 }, { "epoch": 0.92, "grad_norm": 5.755755322269527, "learning_rate": 7.772875170456495e-08, "loss": 0.3462, "step": 1459 }, { "epoch": 0.92, "grad_norm": 1.1676605736626782, "learning_rate": 7.64662415751416e-08, "loss": 0.0096, "step": 1460 }, { "epoch": 0.92, "grad_norm": 7.141204075945238, "learning_rate": 7.521390949049717e-08, "loss": 0.3239, "step": 1461 }, { "epoch": 0.92, "grad_norm": 5.666948685453614, "learning_rate": 7.397176071001544e-08, "loss": 0.2252, "step": 1462 }, { "epoch": 0.92, "grad_norm": 4.45920295196315, "learning_rate": 7.273980045031188e-08, "loss": 0.154, "step": 1463 }, { "epoch": 0.93, "grad_norm": 7.196136062989676, "learning_rate": 7.151803388521505e-08, "loss": 0.234, "step": 1464 }, { "epoch": 0.93, "grad_norm": 6.441134121468979, "learning_rate": 7.030646614574199e-08, "loss": 0.1867, "step": 1465 }, { "epoch": 0.93, "grad_norm": 2.762645172171149, "learning_rate": 6.910510232007977e-08, "loss": 0.0168, "step": 1466 }, { "epoch": 0.93, "grad_norm": 5.86077782686222, "learning_rate": 6.791394745356089e-08, "loss": 0.1645, "step": 1467 }, { "epoch": 0.93, "grad_norm": 4.621629466253201, "learning_rate": 6.673300654864406e-08, "loss": 0.0686, "step": 1468 }, { "epoch": 0.93, "grad_norm": 0.5898943494099643, "learning_rate": 6.556228456489233e-08, "loss": 0.0032, "step": 1469 }, { "epoch": 0.93, "grad_norm": 0.963063655882533, "learning_rate": 6.440178641895389e-08, "loss": 0.0059, "step": 1470 }, { "epoch": 0.93, "grad_norm": 5.663932851425676, "learning_rate": 6.325151698453852e-08, "loss": 0.2666, "step": 1471 }, { "epoch": 0.93, "grad_norm": 7.834146038128758, "learning_rate": 6.211148109239978e-08, "loss": 0.3064, "step": 1472 }, { "epoch": 0.93, "grad_norm": 7.1838422673162645, "learning_rate": 6.09816835303137e-08, "loss": 0.4441, "step": 1473 }, { "epoch": 0.93, "grad_norm": 4.167225050068681, "learning_rate": 5.986212904305871e-08, "loss": 0.1062, "step": 1474 }, { "epoch": 0.93, "grad_norm": 4.862076673737142, "learning_rate": 5.875282233239493e-08, "loss": 0.1184, "step": 1475 }, { "epoch": 0.93, "grad_norm": 8.9312355121471, "learning_rate": 5.7653768057045757e-08, "loss": 0.4186, "step": 1476 }, { "epoch": 0.93, "grad_norm": 6.775789233193873, "learning_rate": 5.65649708326782e-08, "loss": 0.378, "step": 1477 }, { "epoch": 0.93, "grad_norm": 5.458891085405919, "learning_rate": 5.548643523188235e-08, "loss": 0.1608, "step": 1478 }, { "epoch": 0.94, "grad_norm": 4.640605271422711, "learning_rate": 5.441816578415276e-08, "loss": 0.1665, "step": 1479 }, { "epoch": 0.94, "grad_norm": 3.092384884911356, "learning_rate": 5.336016697586988e-08, "loss": 0.031, "step": 1480 }, { "epoch": 0.94, "grad_norm": 5.562558463142998, "learning_rate": 5.231244325028084e-08, "loss": 0.2326, "step": 1481 }, { "epoch": 0.94, "grad_norm": 9.635782122855447, "learning_rate": 5.127499900748123e-08, "loss": 0.4579, "step": 1482 }, { "epoch": 0.94, "grad_norm": 6.747869084579012, "learning_rate": 5.024783860439475e-08, "loss": 0.2569, "step": 1483 }, { "epoch": 0.94, "grad_norm": 5.104292742151748, "learning_rate": 4.9230966354757993e-08, "loss": 0.2344, "step": 1484 }, { "epoch": 0.94, "grad_norm": 10.30196364860068, "learning_rate": 4.8224386529099884e-08, "loss": 0.6728, "step": 1485 }, { "epoch": 0.94, "grad_norm": 4.122846330168349, "learning_rate": 4.7228103354724495e-08, "loss": 0.0706, "step": 1486 }, { "epoch": 0.94, "grad_norm": 2.8690891040515463, "learning_rate": 4.624212101569353e-08, "loss": 0.0244, "step": 1487 }, { "epoch": 0.94, "grad_norm": 8.090607139712084, "learning_rate": 4.526644365280858e-08, "loss": 0.4098, "step": 1488 }, { "epoch": 0.94, "grad_norm": 1.405627515560764, "learning_rate": 4.4301075363593904e-08, "loss": 0.0104, "step": 1489 }, { "epoch": 0.94, "grad_norm": 7.910626432785334, "learning_rate": 4.3346020202278684e-08, "loss": 0.5296, "step": 1490 }, { "epoch": 0.94, "grad_norm": 6.903497069569218, "learning_rate": 4.240128217978062e-08, "loss": 0.2886, "step": 1491 }, { "epoch": 0.94, "grad_norm": 7.4777925410258, "learning_rate": 4.14668652636882e-08, "loss": 0.4299, "step": 1492 }, { "epoch": 0.94, "grad_norm": 7.492501643081704, "learning_rate": 4.054277337824597e-08, "loss": 0.3325, "step": 1493 }, { "epoch": 0.94, "grad_norm": 10.85073144090915, "learning_rate": 3.9629010404335644e-08, "loss": 0.8091, "step": 1494 }, { "epoch": 0.95, "grad_norm": 7.058855314744182, "learning_rate": 3.8725580179460874e-08, "loss": 0.2288, "step": 1495 }, { "epoch": 0.95, "grad_norm": 12.458801124661521, "learning_rate": 3.7832486497731966e-08, "loss": 0.8825, "step": 1496 }, { "epoch": 0.95, "grad_norm": 9.825964370262293, "learning_rate": 3.6949733109848395e-08, "loss": 0.503, "step": 1497 }, { "epoch": 0.95, "grad_norm": 8.12181850647075, "learning_rate": 3.6077323723084366e-08, "loss": 0.4041, "step": 1498 }, { "epoch": 0.95, "grad_norm": 0.2882274474099714, "learning_rate": 3.5215262001272176e-08, "loss": 0.0011, "step": 1499 }, { "epoch": 0.95, "grad_norm": 2.4677384634071937, "learning_rate": 3.436355156478749e-08, "loss": 0.0287, "step": 1500 }, { "epoch": 0.95, "grad_norm": 9.411555520706232, "learning_rate": 3.352219599053408e-08, "loss": 0.4804, "step": 1501 }, { "epoch": 0.95, "grad_norm": 6.000344902297967, "learning_rate": 3.269119881192912e-08, "loss": 0.1797, "step": 1502 }, { "epoch": 0.95, "grad_norm": 7.25677804791622, "learning_rate": 3.187056351888651e-08, "loss": 0.4743, "step": 1503 }, { "epoch": 0.95, "grad_norm": 6.984594286179355, "learning_rate": 3.1060293557805824e-08, "loss": 0.2057, "step": 1504 }, { "epoch": 0.95, "grad_norm": 8.64017732367631, "learning_rate": 3.0260392331553366e-08, "loss": 0.6345, "step": 1505 }, { "epoch": 0.95, "grad_norm": 5.0176756282493145, "learning_rate": 2.9470863199451972e-08, "loss": 0.1084, "step": 1506 }, { "epoch": 0.95, "grad_norm": 6.171223511896896, "learning_rate": 2.869170947726374e-08, "loss": 0.1464, "step": 1507 }, { "epoch": 0.95, "grad_norm": 0.22352443384374662, "learning_rate": 2.7922934437178695e-08, "loss": 0.0011, "step": 1508 }, { "epoch": 0.95, "grad_norm": 5.335905865286441, "learning_rate": 2.7164541307798665e-08, "loss": 0.2648, "step": 1509 }, { "epoch": 0.95, "grad_norm": 8.508021440558775, "learning_rate": 2.6416533274125912e-08, "loss": 0.2862, "step": 1510 }, { "epoch": 0.96, "grad_norm": 7.653577688527514, "learning_rate": 2.5678913477547306e-08, "loss": 0.2058, "step": 1511 }, { "epoch": 0.96, "grad_norm": 10.143165399483983, "learning_rate": 2.4951685015824623e-08, "loss": 0.5874, "step": 1512 }, { "epoch": 0.96, "grad_norm": 4.632141991094312, "learning_rate": 2.4234850943077315e-08, "loss": 0.1479, "step": 1513 }, { "epoch": 0.96, "grad_norm": 5.087911243360334, "learning_rate": 2.3528414269772814e-08, "loss": 0.1874, "step": 1514 }, { "epoch": 0.96, "grad_norm": 3.65698482595607, "learning_rate": 2.2832377962713203e-08, "loss": 0.1166, "step": 1515 }, { "epoch": 0.96, "grad_norm": 6.331473964333673, "learning_rate": 2.214674494502217e-08, "loss": 0.2949, "step": 1516 }, { "epoch": 0.96, "grad_norm": 7.426507600259947, "learning_rate": 2.1471518096132516e-08, "loss": 0.356, "step": 1517 }, { "epoch": 0.96, "grad_norm": 2.82471331939393, "learning_rate": 2.0806700251775057e-08, "loss": 0.0352, "step": 1518 }, { "epoch": 0.96, "grad_norm": 4.090778699395952, "learning_rate": 2.0152294203966138e-08, "loss": 0.1704, "step": 1519 }, { "epoch": 0.96, "grad_norm": 6.500984410789352, "learning_rate": 1.9508302700995962e-08, "loss": 0.4922, "step": 1520 }, { "epoch": 0.96, "grad_norm": 7.001297694218992, "learning_rate": 1.8874728447417223e-08, "loss": 0.1259, "step": 1521 }, { "epoch": 0.96, "grad_norm": 8.76889352497473, "learning_rate": 1.8251574104032888e-08, "loss": 0.515, "step": 1522 }, { "epoch": 0.96, "grad_norm": 4.53008155227246, "learning_rate": 1.7638842287887036e-08, "loss": 0.27, "step": 1523 }, { "epoch": 0.96, "grad_norm": 4.476618315397019, "learning_rate": 1.7036535572251267e-08, "loss": 0.1063, "step": 1524 }, { "epoch": 0.96, "grad_norm": 8.55209631847243, "learning_rate": 1.6444656486615808e-08, "loss": 0.2422, "step": 1525 }, { "epoch": 0.96, "grad_norm": 4.867984124840309, "learning_rate": 1.5863207516678138e-08, "loss": 0.1624, "step": 1526 }, { "epoch": 0.97, "grad_norm": 4.1059014902612425, "learning_rate": 1.529219110433272e-08, "loss": 0.3255, "step": 1527 }, { "epoch": 0.97, "grad_norm": 4.414717310045153, "learning_rate": 1.4731609647661016e-08, "loss": 0.1094, "step": 1528 }, { "epoch": 0.97, "grad_norm": 8.043296477508314, "learning_rate": 1.4181465500920367e-08, "loss": 0.4796, "step": 1529 }, { "epoch": 0.97, "grad_norm": 5.695897536717045, "learning_rate": 1.3641760974535401e-08, "loss": 0.2226, "step": 1530 }, { "epoch": 0.97, "grad_norm": 4.754830163275214, "learning_rate": 1.3112498335087764e-08, "loss": 0.2284, "step": 1531 }, { "epoch": 0.97, "grad_norm": 6.52310592813784, "learning_rate": 1.2593679805306402e-08, "loss": 0.1476, "step": 1532 }, { "epoch": 0.97, "grad_norm": 5.055509285255124, "learning_rate": 1.2085307564058679e-08, "loss": 0.2849, "step": 1533 }, { "epoch": 0.97, "grad_norm": 6.4433155642266255, "learning_rate": 1.1587383746340386e-08, "loss": 0.3427, "step": 1534 }, { "epoch": 0.97, "grad_norm": 4.3836416728327485, "learning_rate": 1.1099910443268248e-08, "loss": 0.1598, "step": 1535 }, { "epoch": 0.97, "grad_norm": 8.742945415788494, "learning_rate": 1.062288970206965e-08, "loss": 0.5362, "step": 1536 }, { "epoch": 0.97, "grad_norm": 5.7552970039360805, "learning_rate": 1.0156323526074874e-08, "loss": 0.2596, "step": 1537 }, { "epoch": 0.97, "grad_norm": 13.226891285414418, "learning_rate": 9.700213874708208e-09, "loss": 0.2647, "step": 1538 }, { "epoch": 0.97, "grad_norm": 8.866419983302873, "learning_rate": 9.25456266348046e-09, "loss": 0.5231, "step": 1539 }, { "epoch": 0.97, "grad_norm": 7.8747296968119205, "learning_rate": 8.819371763980345e-09, "loss": 0.317, "step": 1540 }, { "epoch": 0.97, "grad_norm": 6.722838679159709, "learning_rate": 8.394643003865887e-09, "loss": 0.2588, "step": 1541 }, { "epoch": 0.97, "grad_norm": 7.71384759193691, "learning_rate": 7.98037816685887e-09, "loss": 0.4926, "step": 1542 }, { "epoch": 0.98, "grad_norm": 6.012437327607516, "learning_rate": 7.576578992735118e-09, "loss": 0.271, "step": 1543 }, { "epoch": 0.98, "grad_norm": 8.027609864810419, "learning_rate": 7.183247177318109e-09, "loss": 0.4248, "step": 1544 }, { "epoch": 0.98, "grad_norm": 6.0435917735961375, "learning_rate": 6.800384372472324e-09, "loss": 0.1569, "step": 1545 }, { "epoch": 0.98, "grad_norm": 8.182820387831553, "learning_rate": 6.427992186095744e-09, "loss": 0.4146, "step": 1546 }, { "epoch": 0.98, "grad_norm": 8.89663938016722, "learning_rate": 6.066072182112914e-09, "loss": 0.2914, "step": 1547 }, { "epoch": 0.98, "grad_norm": 6.133212442174154, "learning_rate": 5.714625880469116e-09, "loss": 0.2168, "step": 1548 }, { "epoch": 0.98, "grad_norm": 5.547254608489343, "learning_rate": 5.37365475712287e-09, "loss": 0.1037, "step": 1549 }, { "epoch": 0.98, "grad_norm": 6.261510396954706, "learning_rate": 5.043160244040945e-09, "loss": 0.2477, "step": 1550 }, { "epoch": 0.98, "grad_norm": 6.857257490108165, "learning_rate": 4.7231437291916925e-09, "loss": 0.4704, "step": 1551 }, { "epoch": 0.98, "grad_norm": 3.4681728673351517, "learning_rate": 4.413606556538941e-09, "loss": 0.0619, "step": 1552 }, { "epoch": 0.98, "grad_norm": 7.85391270018635, "learning_rate": 4.114550026037279e-09, "loss": 0.4576, "step": 1553 }, { "epoch": 0.98, "grad_norm": 6.272525022661298, "learning_rate": 3.8259753936256695e-09, "loss": 0.2374, "step": 1554 }, { "epoch": 0.98, "grad_norm": 2.5000498766692116, "learning_rate": 3.5478838712227347e-09, "loss": 0.1446, "step": 1555 }, { "epoch": 0.98, "grad_norm": 9.323387387005008, "learning_rate": 3.280276626721479e-09, "loss": 0.3869, "step": 1556 }, { "epoch": 0.98, "grad_norm": 5.132127968766267, "learning_rate": 3.0231547839842945e-09, "loss": 0.1073, "step": 1557 }, { "epoch": 0.99, "grad_norm": 5.999095530684547, "learning_rate": 2.776519422838242e-09, "loss": 0.2723, "step": 1558 }, { "epoch": 0.99, "grad_norm": 2.005699025015711, "learning_rate": 2.540371579070611e-09, "loss": 0.0171, "step": 1559 }, { "epoch": 0.99, "grad_norm": 1.0890057804875521, "learning_rate": 2.3147122444250327e-09, "loss": 0.0127, "step": 1560 }, { "epoch": 0.99, "grad_norm": 6.036589003677193, "learning_rate": 2.099542366596208e-09, "loss": 0.1716, "step": 1561 }, { "epoch": 0.99, "grad_norm": 10.120532145022686, "learning_rate": 1.8948628492271303e-09, "loss": 0.5481, "step": 1562 }, { "epoch": 0.99, "grad_norm": 7.691719695873141, "learning_rate": 1.7006745519043688e-09, "loss": 0.2047, "step": 1563 }, { "epoch": 0.99, "grad_norm": 8.280277986881785, "learning_rate": 1.5169782901547358e-09, "loss": 0.4618, "step": 1564 }, { "epoch": 0.99, "grad_norm": 6.3709332827737795, "learning_rate": 1.3437748354425128e-09, "loss": 0.225, "step": 1565 }, { "epoch": 0.99, "grad_norm": 4.936250939097428, "learning_rate": 1.1810649151647313e-09, "loss": 0.311, "step": 1566 }, { "epoch": 0.99, "grad_norm": 6.997272777622928, "learning_rate": 1.028849212649785e-09, "loss": 0.155, "step": 1567 }, { "epoch": 0.99, "grad_norm": 8.109573979224292, "learning_rate": 8.871283671535446e-10, "loss": 0.4534, "step": 1568 }, { "epoch": 0.99, "grad_norm": 5.087829519186801, "learning_rate": 7.559029738571366e-10, "loss": 0.0842, "step": 1569 }, { "epoch": 0.99, "grad_norm": 5.349230359770883, "learning_rate": 6.351735838638906e-10, "loss": 0.1848, "step": 1570 }, { "epoch": 0.99, "grad_norm": 7.150807272414549, "learning_rate": 5.24940704197674e-10, "loss": 0.0204, "step": 1571 }, { "epoch": 0.99, "grad_norm": 7.855499829003662, "learning_rate": 4.2520479780011613e-10, "loss": 0.4247, "step": 1572 }, { "epoch": 0.99, "grad_norm": 7.755197566434629, "learning_rate": 3.35966283529221e-10, "loss": 0.3989, "step": 1573 }, { "epoch": 1.0, "grad_norm": 8.300312027179169, "learning_rate": 2.572255361577014e-10, "loss": 0.3569, "step": 1574 }, { "epoch": 1.0, "grad_norm": 6.64896707185098, "learning_rate": 1.8898288637048122e-10, "loss": 0.3549, "step": 1575 }, { "epoch": 1.0, "grad_norm": 6.788016744652564, "learning_rate": 1.3123862076441785e-10, "loss": 0.47, "step": 1576 }, { "epoch": 1.0, "grad_norm": 6.3169950589427435, "learning_rate": 8.399298184663674e-11, "loss": 0.3157, "step": 1577 }, { "epoch": 1.0, "grad_norm": 5.822387161701134, "learning_rate": 4.724616803286619e-11, "loss": 0.2321, "step": 1578 }, { "epoch": 1.0, "grad_norm": 1.1010913788331604, "learning_rate": 2.0998333647714863e-11, "loss": 0.0112, "step": 1579 }, { "epoch": 1.0, "grad_norm": 8.70557614229169, "learning_rate": 5.249588923561533e-12, "loss": 0.3462, "step": 1580 }, { "epoch": 1.0, "grad_norm": 5.500391512720971, "learning_rate": 0.0, "loss": 0.1648, "step": 1581 }, { "epoch": 1.0, "step": 1581, "total_flos": 331147936948224.0, "train_loss": 0.295211889783912, "train_runtime": 24740.4507, "train_samples_per_second": 0.511, "train_steps_per_second": 0.064 } ], "logging_steps": 1.0, "max_steps": 1581, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 331147936948224.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }