{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 296, "global_step": 2954, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003386171721233413, "grad_norm": 29.75, "learning_rate": 0.0, "loss": 0.5027, "step": 1 }, { "epoch": 0.0006772343442466826, "grad_norm": 35.25, "learning_rate": 1.3513513513513515e-07, "loss": 0.5494, "step": 2 }, { "epoch": 0.001015851516370024, "grad_norm": 28.5, "learning_rate": 2.702702702702703e-07, "loss": 0.4482, "step": 3 }, { "epoch": 0.0013544686884933651, "grad_norm": 33.5, "learning_rate": 4.0540540540540546e-07, "loss": 0.5661, "step": 4 }, { "epoch": 0.0016930858606167066, "grad_norm": 30.25, "learning_rate": 5.405405405405406e-07, "loss": 0.4796, "step": 5 }, { "epoch": 0.002031703032740048, "grad_norm": 34.25, "learning_rate": 6.756756756756758e-07, "loss": 0.515, "step": 6 }, { "epoch": 0.0023703202048633893, "grad_norm": 31.875, "learning_rate": 8.108108108108109e-07, "loss": 0.4466, "step": 7 }, { "epoch": 0.0027089373769867303, "grad_norm": 30.0, "learning_rate": 9.459459459459461e-07, "loss": 0.4354, "step": 8 }, { "epoch": 0.0030475545491100717, "grad_norm": 28.0, "learning_rate": 1.0810810810810812e-06, "loss": 0.4231, "step": 9 }, { "epoch": 0.003386171721233413, "grad_norm": 19.375, "learning_rate": 1.2162162162162164e-06, "loss": 0.3424, "step": 10 }, { "epoch": 0.003724788893356754, "grad_norm": 21.875, "learning_rate": 1.3513513513513515e-06, "loss": 0.37, "step": 11 }, { "epoch": 0.004063406065480096, "grad_norm": 19.0, "learning_rate": 1.4864864864864868e-06, "loss": 0.3439, "step": 12 }, { "epoch": 0.004402023237603437, "grad_norm": 15.0625, "learning_rate": 1.6216216216216219e-06, "loss": 0.2812, "step": 13 }, { "epoch": 0.0047406404097267785, "grad_norm": 16.0, "learning_rate": 1.756756756756757e-06, "loss": 0.335, "step": 14 }, { "epoch": 0.0050792575818501195, "grad_norm": 12.5, "learning_rate": 1.8918918918918922e-06, "loss": 0.2898, "step": 15 }, { "epoch": 0.0054178747539734605, "grad_norm": 11.6875, "learning_rate": 2.0270270270270273e-06, "loss": 0.2427, "step": 16 }, { "epoch": 0.005756491926096802, "grad_norm": 9.625, "learning_rate": 2.1621621621621623e-06, "loss": 0.2346, "step": 17 }, { "epoch": 0.006095109098220143, "grad_norm": 8.0625, "learning_rate": 2.297297297297298e-06, "loss": 0.2305, "step": 18 }, { "epoch": 0.006433726270343484, "grad_norm": 6.46875, "learning_rate": 2.432432432432433e-06, "loss": 0.1882, "step": 19 }, { "epoch": 0.006772343442466826, "grad_norm": 5.125, "learning_rate": 2.5675675675675675e-06, "loss": 0.1815, "step": 20 }, { "epoch": 0.007110960614590167, "grad_norm": 4.15625, "learning_rate": 2.702702702702703e-06, "loss": 0.1453, "step": 21 }, { "epoch": 0.007449577786713508, "grad_norm": 4.3125, "learning_rate": 2.837837837837838e-06, "loss": 0.1595, "step": 22 }, { "epoch": 0.00778819495883685, "grad_norm": 4.4375, "learning_rate": 2.9729729729729736e-06, "loss": 0.1914, "step": 23 }, { "epoch": 0.008126812130960191, "grad_norm": 3.703125, "learning_rate": 3.1081081081081082e-06, "loss": 0.1724, "step": 24 }, { "epoch": 0.008465429303083532, "grad_norm": 2.65625, "learning_rate": 3.2432432432432437e-06, "loss": 0.1183, "step": 25 }, { "epoch": 0.008804046475206873, "grad_norm": 3.09375, "learning_rate": 3.3783783783783788e-06, "loss": 0.165, "step": 26 }, { "epoch": 0.009142663647330216, "grad_norm": 2.546875, "learning_rate": 3.513513513513514e-06, "loss": 0.1775, "step": 27 }, { "epoch": 0.009481280819453557, "grad_norm": 1.71875, "learning_rate": 3.648648648648649e-06, "loss": 0.1276, "step": 28 }, { "epoch": 0.009819897991576898, "grad_norm": 1.953125, "learning_rate": 3.7837837837837844e-06, "loss": 0.1605, "step": 29 }, { "epoch": 0.010158515163700239, "grad_norm": 1.703125, "learning_rate": 3.918918918918919e-06, "loss": 0.1486, "step": 30 }, { "epoch": 0.01049713233582358, "grad_norm": 1.90625, "learning_rate": 4.0540540540540545e-06, "loss": 0.1535, "step": 31 }, { "epoch": 0.010835749507946921, "grad_norm": 1.5625, "learning_rate": 4.189189189189189e-06, "loss": 0.1189, "step": 32 }, { "epoch": 0.011174366680070264, "grad_norm": 1.828125, "learning_rate": 4.324324324324325e-06, "loss": 0.1359, "step": 33 }, { "epoch": 0.011512983852193605, "grad_norm": 1.328125, "learning_rate": 4.45945945945946e-06, "loss": 0.1289, "step": 34 }, { "epoch": 0.011851601024316946, "grad_norm": 1.09375, "learning_rate": 4.594594594594596e-06, "loss": 0.1033, "step": 35 }, { "epoch": 0.012190218196440287, "grad_norm": 1.3671875, "learning_rate": 4.72972972972973e-06, "loss": 0.149, "step": 36 }, { "epoch": 0.012528835368563628, "grad_norm": 1.03125, "learning_rate": 4.864864864864866e-06, "loss": 0.0975, "step": 37 }, { "epoch": 0.012867452540686969, "grad_norm": 1.0859375, "learning_rate": 5e-06, "loss": 0.0938, "step": 38 }, { "epoch": 0.013206069712810312, "grad_norm": 1.0625, "learning_rate": 5.135135135135135e-06, "loss": 0.1031, "step": 39 }, { "epoch": 0.013544686884933653, "grad_norm": 1.1796875, "learning_rate": 5.2702702702702705e-06, "loss": 0.1088, "step": 40 }, { "epoch": 0.013883304057056994, "grad_norm": 1.2421875, "learning_rate": 5.405405405405406e-06, "loss": 0.1207, "step": 41 }, { "epoch": 0.014221921229180335, "grad_norm": 1.703125, "learning_rate": 5.540540540540541e-06, "loss": 0.2261, "step": 42 }, { "epoch": 0.014560538401303676, "grad_norm": 1.4765625, "learning_rate": 5.675675675675676e-06, "loss": 0.12, "step": 43 }, { "epoch": 0.014899155573427017, "grad_norm": 1.21875, "learning_rate": 5.810810810810811e-06, "loss": 0.1243, "step": 44 }, { "epoch": 0.01523777274555036, "grad_norm": 1.140625, "learning_rate": 5.945945945945947e-06, "loss": 0.1028, "step": 45 }, { "epoch": 0.0155763899176737, "grad_norm": 1.1171875, "learning_rate": 6.081081081081082e-06, "loss": 0.1044, "step": 46 }, { "epoch": 0.01591500708979704, "grad_norm": 1.25, "learning_rate": 6.2162162162162164e-06, "loss": 0.1267, "step": 47 }, { "epoch": 0.016253624261920382, "grad_norm": 0.90625, "learning_rate": 6.351351351351351e-06, "loss": 0.0914, "step": 48 }, { "epoch": 0.016592241434043725, "grad_norm": 1.0546875, "learning_rate": 6.486486486486487e-06, "loss": 0.1085, "step": 49 }, { "epoch": 0.016930858606167064, "grad_norm": 0.87109375, "learning_rate": 6.621621621621622e-06, "loss": 0.0949, "step": 50 }, { "epoch": 0.017269475778290407, "grad_norm": 0.8671875, "learning_rate": 6.7567567567567575e-06, "loss": 0.0934, "step": 51 }, { "epoch": 0.017608092950413747, "grad_norm": 1.2734375, "learning_rate": 6.891891891891892e-06, "loss": 0.1231, "step": 52 }, { "epoch": 0.01794671012253709, "grad_norm": 0.84375, "learning_rate": 7.027027027027028e-06, "loss": 0.0946, "step": 53 }, { "epoch": 0.018285327294660432, "grad_norm": 1.3671875, "learning_rate": 7.162162162162163e-06, "loss": 0.1202, "step": 54 }, { "epoch": 0.01862394446678377, "grad_norm": 0.79296875, "learning_rate": 7.297297297297298e-06, "loss": 0.0802, "step": 55 }, { "epoch": 0.018962561638907114, "grad_norm": 1.2109375, "learning_rate": 7.4324324324324324e-06, "loss": 0.1052, "step": 56 }, { "epoch": 0.019301178811030453, "grad_norm": 0.91796875, "learning_rate": 7.567567567567569e-06, "loss": 0.0898, "step": 57 }, { "epoch": 0.019639795983153796, "grad_norm": 0.9921875, "learning_rate": 7.702702702702704e-06, "loss": 0.1046, "step": 58 }, { "epoch": 0.019978413155277135, "grad_norm": 0.87890625, "learning_rate": 7.837837837837838e-06, "loss": 0.0983, "step": 59 }, { "epoch": 0.020317030327400478, "grad_norm": 0.97265625, "learning_rate": 7.972972972972974e-06, "loss": 0.0832, "step": 60 }, { "epoch": 0.02065564749952382, "grad_norm": 1.0, "learning_rate": 8.108108108108109e-06, "loss": 0.1012, "step": 61 }, { "epoch": 0.02099426467164716, "grad_norm": 1.0, "learning_rate": 8.243243243243245e-06, "loss": 0.1124, "step": 62 }, { "epoch": 0.021332881843770503, "grad_norm": 0.73828125, "learning_rate": 8.378378378378378e-06, "loss": 0.1053, "step": 63 }, { "epoch": 0.021671499015893842, "grad_norm": 0.828125, "learning_rate": 8.513513513513514e-06, "loss": 0.0782, "step": 64 }, { "epoch": 0.022010116188017185, "grad_norm": 0.74609375, "learning_rate": 8.64864864864865e-06, "loss": 0.0739, "step": 65 }, { "epoch": 0.022348733360140528, "grad_norm": 0.89453125, "learning_rate": 8.783783783783785e-06, "loss": 0.0715, "step": 66 }, { "epoch": 0.022687350532263867, "grad_norm": 0.9765625, "learning_rate": 8.91891891891892e-06, "loss": 0.0913, "step": 67 }, { "epoch": 0.02302596770438721, "grad_norm": 0.8359375, "learning_rate": 9.054054054054054e-06, "loss": 0.0844, "step": 68 }, { "epoch": 0.02336458487651055, "grad_norm": 0.82421875, "learning_rate": 9.189189189189191e-06, "loss": 0.0825, "step": 69 }, { "epoch": 0.02370320204863389, "grad_norm": 1.0234375, "learning_rate": 9.324324324324325e-06, "loss": 0.084, "step": 70 }, { "epoch": 0.02404181922075723, "grad_norm": 0.9375, "learning_rate": 9.45945945945946e-06, "loss": 0.0933, "step": 71 }, { "epoch": 0.024380436392880574, "grad_norm": 0.6796875, "learning_rate": 9.594594594594594e-06, "loss": 0.0835, "step": 72 }, { "epoch": 0.024719053565003916, "grad_norm": 0.8203125, "learning_rate": 9.729729729729732e-06, "loss": 0.0814, "step": 73 }, { "epoch": 0.025057670737127256, "grad_norm": 0.796875, "learning_rate": 9.864864864864865e-06, "loss": 0.079, "step": 74 }, { "epoch": 0.0253962879092506, "grad_norm": 1.046875, "learning_rate": 1e-05, "loss": 0.0984, "step": 75 }, { "epoch": 0.025734905081373938, "grad_norm": 0.73828125, "learning_rate": 1.0135135135135136e-05, "loss": 0.079, "step": 76 }, { "epoch": 0.02607352225349728, "grad_norm": 0.765625, "learning_rate": 1.027027027027027e-05, "loss": 0.0811, "step": 77 }, { "epoch": 0.026412139425620623, "grad_norm": 0.796875, "learning_rate": 1.0405405405405407e-05, "loss": 0.0819, "step": 78 }, { "epoch": 0.026750756597743963, "grad_norm": 0.84765625, "learning_rate": 1.0540540540540541e-05, "loss": 0.0828, "step": 79 }, { "epoch": 0.027089373769867305, "grad_norm": 0.86328125, "learning_rate": 1.0675675675675677e-05, "loss": 0.1061, "step": 80 }, { "epoch": 0.027427990941990645, "grad_norm": 0.74609375, "learning_rate": 1.0810810810810812e-05, "loss": 0.0912, "step": 81 }, { "epoch": 0.027766608114113987, "grad_norm": 0.78515625, "learning_rate": 1.0945945945945946e-05, "loss": 0.0702, "step": 82 }, { "epoch": 0.02810522528623733, "grad_norm": 0.80078125, "learning_rate": 1.1081081081081081e-05, "loss": 0.0769, "step": 83 }, { "epoch": 0.02844384245836067, "grad_norm": 0.73046875, "learning_rate": 1.1216216216216219e-05, "loss": 0.0786, "step": 84 }, { "epoch": 0.028782459630484012, "grad_norm": 0.75, "learning_rate": 1.1351351351351352e-05, "loss": 0.0837, "step": 85 }, { "epoch": 0.02912107680260735, "grad_norm": 0.75390625, "learning_rate": 1.1486486486486488e-05, "loss": 0.0993, "step": 86 }, { "epoch": 0.029459693974730694, "grad_norm": 0.72265625, "learning_rate": 1.1621621621621622e-05, "loss": 0.0806, "step": 87 }, { "epoch": 0.029798311146854033, "grad_norm": 0.8046875, "learning_rate": 1.1756756756756757e-05, "loss": 0.0988, "step": 88 }, { "epoch": 0.030136928318977376, "grad_norm": 0.78515625, "learning_rate": 1.1891891891891894e-05, "loss": 0.0947, "step": 89 }, { "epoch": 0.03047554549110072, "grad_norm": 0.859375, "learning_rate": 1.2027027027027028e-05, "loss": 0.0726, "step": 90 }, { "epoch": 0.030814162663224058, "grad_norm": 0.90625, "learning_rate": 1.2162162162162164e-05, "loss": 0.0958, "step": 91 }, { "epoch": 0.0311527798353474, "grad_norm": 0.65234375, "learning_rate": 1.2297297297297299e-05, "loss": 0.0637, "step": 92 }, { "epoch": 0.031491397007470744, "grad_norm": 0.765625, "learning_rate": 1.2432432432432433e-05, "loss": 0.0935, "step": 93 }, { "epoch": 0.03183001417959408, "grad_norm": 0.67578125, "learning_rate": 1.2567567567567568e-05, "loss": 0.0661, "step": 94 }, { "epoch": 0.03216863135171742, "grad_norm": 0.8671875, "learning_rate": 1.2702702702702702e-05, "loss": 0.0871, "step": 95 }, { "epoch": 0.032507248523840765, "grad_norm": 1.5234375, "learning_rate": 1.283783783783784e-05, "loss": 0.1231, "step": 96 }, { "epoch": 0.03284586569596411, "grad_norm": 0.91015625, "learning_rate": 1.2972972972972975e-05, "loss": 0.0777, "step": 97 }, { "epoch": 0.03318448286808745, "grad_norm": 1.09375, "learning_rate": 1.3108108108108109e-05, "loss": 0.1009, "step": 98 }, { "epoch": 0.033523100040210786, "grad_norm": 0.78515625, "learning_rate": 1.3243243243243244e-05, "loss": 0.0944, "step": 99 }, { "epoch": 0.03386171721233413, "grad_norm": 0.734375, "learning_rate": 1.3378378378378381e-05, "loss": 0.0649, "step": 100 }, { "epoch": 0.03420033438445747, "grad_norm": 0.71875, "learning_rate": 1.3513513513513515e-05, "loss": 0.0813, "step": 101 }, { "epoch": 0.034538951556580814, "grad_norm": 0.76171875, "learning_rate": 1.364864864864865e-05, "loss": 0.0726, "step": 102 }, { "epoch": 0.03487756872870416, "grad_norm": 0.75, "learning_rate": 1.3783783783783784e-05, "loss": 0.0783, "step": 103 }, { "epoch": 0.03521618590082749, "grad_norm": 0.7109375, "learning_rate": 1.391891891891892e-05, "loss": 0.0656, "step": 104 }, { "epoch": 0.035554803072950836, "grad_norm": 0.73046875, "learning_rate": 1.4054054054054055e-05, "loss": 0.0719, "step": 105 }, { "epoch": 0.03589342024507418, "grad_norm": 0.79296875, "learning_rate": 1.4189189189189189e-05, "loss": 0.0821, "step": 106 }, { "epoch": 0.03623203741719752, "grad_norm": 0.609375, "learning_rate": 1.4324324324324326e-05, "loss": 0.075, "step": 107 }, { "epoch": 0.036570654589320864, "grad_norm": 0.7109375, "learning_rate": 1.4459459459459462e-05, "loss": 0.0753, "step": 108 }, { "epoch": 0.0369092717614442, "grad_norm": 0.796875, "learning_rate": 1.4594594594594596e-05, "loss": 0.0976, "step": 109 }, { "epoch": 0.03724788893356754, "grad_norm": 0.6953125, "learning_rate": 1.4729729729729731e-05, "loss": 0.0781, "step": 110 }, { "epoch": 0.037586506105690885, "grad_norm": 0.74609375, "learning_rate": 1.4864864864864865e-05, "loss": 0.0808, "step": 111 }, { "epoch": 0.03792512327781423, "grad_norm": 0.90625, "learning_rate": 1.5000000000000002e-05, "loss": 0.0816, "step": 112 }, { "epoch": 0.03826374044993757, "grad_norm": 0.828125, "learning_rate": 1.5135135135135138e-05, "loss": 0.082, "step": 113 }, { "epoch": 0.03860235762206091, "grad_norm": 0.703125, "learning_rate": 1.527027027027027e-05, "loss": 0.0811, "step": 114 }, { "epoch": 0.03894097479418425, "grad_norm": 0.73046875, "learning_rate": 1.540540540540541e-05, "loss": 0.0754, "step": 115 }, { "epoch": 0.03927959196630759, "grad_norm": 0.76171875, "learning_rate": 1.554054054054054e-05, "loss": 0.0891, "step": 116 }, { "epoch": 0.039618209138430935, "grad_norm": 0.77734375, "learning_rate": 1.5675675675675676e-05, "loss": 0.1031, "step": 117 }, { "epoch": 0.03995682631055427, "grad_norm": 0.71875, "learning_rate": 1.581081081081081e-05, "loss": 0.0718, "step": 118 }, { "epoch": 0.04029544348267761, "grad_norm": 0.64453125, "learning_rate": 1.5945945945945947e-05, "loss": 0.0749, "step": 119 }, { "epoch": 0.040634060654800956, "grad_norm": 0.73046875, "learning_rate": 1.6081081081081083e-05, "loss": 0.0815, "step": 120 }, { "epoch": 0.0409726778269243, "grad_norm": 0.74609375, "learning_rate": 1.6216216216216218e-05, "loss": 0.0832, "step": 121 }, { "epoch": 0.04131129499904764, "grad_norm": 0.6875, "learning_rate": 1.6351351351351354e-05, "loss": 0.0703, "step": 122 }, { "epoch": 0.04164991217117098, "grad_norm": 0.81640625, "learning_rate": 1.648648648648649e-05, "loss": 0.0965, "step": 123 }, { "epoch": 0.04198852934329432, "grad_norm": 0.8671875, "learning_rate": 1.662162162162162e-05, "loss": 0.0811, "step": 124 }, { "epoch": 0.04232714651541766, "grad_norm": 0.578125, "learning_rate": 1.6756756756756757e-05, "loss": 0.0647, "step": 125 }, { "epoch": 0.042665763687541006, "grad_norm": 0.64453125, "learning_rate": 1.6891891891891896e-05, "loss": 0.0717, "step": 126 }, { "epoch": 0.04300438085966435, "grad_norm": 1.125, "learning_rate": 1.7027027027027028e-05, "loss": 0.1835, "step": 127 }, { "epoch": 0.043342998031787684, "grad_norm": 0.66015625, "learning_rate": 1.7162162162162163e-05, "loss": 0.0627, "step": 128 }, { "epoch": 0.04368161520391103, "grad_norm": 0.59375, "learning_rate": 1.72972972972973e-05, "loss": 0.0673, "step": 129 }, { "epoch": 0.04402023237603437, "grad_norm": 0.7421875, "learning_rate": 1.7432432432432434e-05, "loss": 0.0917, "step": 130 }, { "epoch": 0.04435884954815771, "grad_norm": 0.828125, "learning_rate": 1.756756756756757e-05, "loss": 0.094, "step": 131 }, { "epoch": 0.044697466720281055, "grad_norm": 0.86328125, "learning_rate": 1.7702702702702702e-05, "loss": 0.0912, "step": 132 }, { "epoch": 0.04503608389240439, "grad_norm": 0.9140625, "learning_rate": 1.783783783783784e-05, "loss": 0.1102, "step": 133 }, { "epoch": 0.045374701064527734, "grad_norm": 0.6953125, "learning_rate": 1.7972972972972976e-05, "loss": 0.0581, "step": 134 }, { "epoch": 0.045713318236651077, "grad_norm": 0.66015625, "learning_rate": 1.8108108108108108e-05, "loss": 0.0591, "step": 135 }, { "epoch": 0.04605193540877442, "grad_norm": 0.765625, "learning_rate": 1.8243243243243244e-05, "loss": 0.0896, "step": 136 }, { "epoch": 0.04639055258089776, "grad_norm": 0.83203125, "learning_rate": 1.8378378378378383e-05, "loss": 0.0733, "step": 137 }, { "epoch": 0.0467291697530211, "grad_norm": 1.234375, "learning_rate": 1.8513513513513515e-05, "loss": 0.0883, "step": 138 }, { "epoch": 0.04706778692514444, "grad_norm": 0.84765625, "learning_rate": 1.864864864864865e-05, "loss": 0.0886, "step": 139 }, { "epoch": 0.04740640409726778, "grad_norm": 0.85546875, "learning_rate": 1.8783783783783786e-05, "loss": 0.1031, "step": 140 }, { "epoch": 0.047745021269391126, "grad_norm": 0.86328125, "learning_rate": 1.891891891891892e-05, "loss": 0.0936, "step": 141 }, { "epoch": 0.04808363844151446, "grad_norm": 0.640625, "learning_rate": 1.9054054054054057e-05, "loss": 0.0706, "step": 142 }, { "epoch": 0.048422255613637805, "grad_norm": 0.84375, "learning_rate": 1.918918918918919e-05, "loss": 0.1006, "step": 143 }, { "epoch": 0.04876087278576115, "grad_norm": 0.62109375, "learning_rate": 1.9324324324324328e-05, "loss": 0.0715, "step": 144 }, { "epoch": 0.04909948995788449, "grad_norm": 0.7421875, "learning_rate": 1.9459459459459463e-05, "loss": 0.0856, "step": 145 }, { "epoch": 0.04943810713000783, "grad_norm": 0.83984375, "learning_rate": 1.9594594594594595e-05, "loss": 0.0923, "step": 146 }, { "epoch": 0.04977672430213117, "grad_norm": 0.75, "learning_rate": 1.972972972972973e-05, "loss": 0.0825, "step": 147 }, { "epoch": 0.05011534147425451, "grad_norm": 0.62890625, "learning_rate": 1.9864864864864866e-05, "loss": 0.0588, "step": 148 }, { "epoch": 0.050453958646377854, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 0.1127, "step": 149 }, { "epoch": 0.0507925758185012, "grad_norm": 0.5703125, "learning_rate": 1.9999993732499594e-05, "loss": 0.0628, "step": 150 }, { "epoch": 0.05113119299062454, "grad_norm": 0.64453125, "learning_rate": 1.9999974930006222e-05, "loss": 0.0637, "step": 151 }, { "epoch": 0.051469810162747875, "grad_norm": 0.6875, "learning_rate": 1.999994359254346e-05, "loss": 0.0706, "step": 152 }, { "epoch": 0.05180842733487122, "grad_norm": 0.71875, "learning_rate": 1.999989972015058e-05, "loss": 0.0876, "step": 153 }, { "epoch": 0.05214704450699456, "grad_norm": 0.59765625, "learning_rate": 1.9999843312882592e-05, "loss": 0.0708, "step": 154 }, { "epoch": 0.052485661679117904, "grad_norm": 0.59765625, "learning_rate": 1.9999774370810187e-05, "loss": 0.0571, "step": 155 }, { "epoch": 0.052824278851241246, "grad_norm": 0.8203125, "learning_rate": 1.9999692894019792e-05, "loss": 0.0943, "step": 156 }, { "epoch": 0.05316289602336458, "grad_norm": 0.5546875, "learning_rate": 1.9999598882613537e-05, "loss": 0.0712, "step": 157 }, { "epoch": 0.053501513195487925, "grad_norm": 0.69921875, "learning_rate": 1.9999492336709263e-05, "loss": 0.0836, "step": 158 }, { "epoch": 0.05384013036761127, "grad_norm": 0.6328125, "learning_rate": 1.999937325644053e-05, "loss": 0.0626, "step": 159 }, { "epoch": 0.05417874753973461, "grad_norm": 1.890625, "learning_rate": 1.99992416419566e-05, "loss": 0.0855, "step": 160 }, { "epoch": 0.05451736471185795, "grad_norm": 0.75, "learning_rate": 1.9999097493422453e-05, "loss": 0.082, "step": 161 }, { "epoch": 0.05485598188398129, "grad_norm": 0.6640625, "learning_rate": 1.9998940811018782e-05, "loss": 0.0769, "step": 162 }, { "epoch": 0.05519459905610463, "grad_norm": 0.55078125, "learning_rate": 1.9998771594941983e-05, "loss": 0.0691, "step": 163 }, { "epoch": 0.055533216228227975, "grad_norm": 0.68359375, "learning_rate": 1.9998589845404176e-05, "loss": 0.0687, "step": 164 }, { "epoch": 0.05587183340035132, "grad_norm": 0.703125, "learning_rate": 1.9998395562633176e-05, "loss": 0.0738, "step": 165 }, { "epoch": 0.05621045057247466, "grad_norm": 1.5234375, "learning_rate": 1.9998188746872523e-05, "loss": 0.0891, "step": 166 }, { "epoch": 0.056549067744597996, "grad_norm": 0.62890625, "learning_rate": 1.9997969398381454e-05, "loss": 0.0911, "step": 167 }, { "epoch": 0.05688768491672134, "grad_norm": 0.76171875, "learning_rate": 1.9997737517434932e-05, "loss": 0.0826, "step": 168 }, { "epoch": 0.05722630208884468, "grad_norm": 0.81640625, "learning_rate": 1.9997493104323607e-05, "loss": 0.0955, "step": 169 }, { "epoch": 0.057564919260968024, "grad_norm": 0.640625, "learning_rate": 1.9997236159353864e-05, "loss": 0.0804, "step": 170 }, { "epoch": 0.05790353643309136, "grad_norm": 0.67578125, "learning_rate": 1.9996966682847776e-05, "loss": 0.0777, "step": 171 }, { "epoch": 0.0582421536052147, "grad_norm": 0.703125, "learning_rate": 1.9996684675143132e-05, "loss": 0.0823, "step": 172 }, { "epoch": 0.058580770777338045, "grad_norm": 0.640625, "learning_rate": 1.999639013659343e-05, "loss": 0.09, "step": 173 }, { "epoch": 0.05891938794946139, "grad_norm": 0.984375, "learning_rate": 1.9996083067567876e-05, "loss": 0.108, "step": 174 }, { "epoch": 0.05925800512158473, "grad_norm": 0.78515625, "learning_rate": 1.9995763468451376e-05, "loss": 0.0663, "step": 175 }, { "epoch": 0.05959662229370807, "grad_norm": 0.64453125, "learning_rate": 1.9995431339644552e-05, "loss": 0.0856, "step": 176 }, { "epoch": 0.05993523946583141, "grad_norm": 0.76171875, "learning_rate": 1.9995086681563725e-05, "loss": 0.0879, "step": 177 }, { "epoch": 0.06027385663795475, "grad_norm": 0.671875, "learning_rate": 1.999472949464093e-05, "loss": 0.0747, "step": 178 }, { "epoch": 0.060612473810078095, "grad_norm": 0.8359375, "learning_rate": 1.9994359779323892e-05, "loss": 0.1313, "step": 179 }, { "epoch": 0.06095109098220144, "grad_norm": 0.66796875, "learning_rate": 1.9993977536076052e-05, "loss": 0.0781, "step": 180 }, { "epoch": 0.061289708154324773, "grad_norm": 0.68359375, "learning_rate": 1.999358276537655e-05, "loss": 0.0731, "step": 181 }, { "epoch": 0.061628325326448116, "grad_norm": 0.67578125, "learning_rate": 1.9993175467720242e-05, "loss": 0.0756, "step": 182 }, { "epoch": 0.06196694249857146, "grad_norm": 0.61328125, "learning_rate": 1.9992755643617663e-05, "loss": 0.0743, "step": 183 }, { "epoch": 0.0623055596706948, "grad_norm": 0.62109375, "learning_rate": 1.9992323293595065e-05, "loss": 0.0743, "step": 184 }, { "epoch": 0.06264417684281814, "grad_norm": 0.69921875, "learning_rate": 1.9991878418194407e-05, "loss": 0.0809, "step": 185 }, { "epoch": 0.06298279401494149, "grad_norm": 0.66015625, "learning_rate": 1.9991421017973328e-05, "loss": 0.0751, "step": 186 }, { "epoch": 0.06332141118706483, "grad_norm": 0.765625, "learning_rate": 1.999095109350519e-05, "loss": 0.0868, "step": 187 }, { "epoch": 0.06366002835918816, "grad_norm": 0.58203125, "learning_rate": 1.9990468645379038e-05, "loss": 0.0625, "step": 188 }, { "epoch": 0.0639986455313115, "grad_norm": 0.7421875, "learning_rate": 1.998997367419962e-05, "loss": 0.0829, "step": 189 }, { "epoch": 0.06433726270343484, "grad_norm": 0.65234375, "learning_rate": 1.9989466180587386e-05, "loss": 0.0729, "step": 190 }, { "epoch": 0.06467587987555819, "grad_norm": 0.84375, "learning_rate": 1.998894616517848e-05, "loss": 0.1061, "step": 191 }, { "epoch": 0.06501449704768153, "grad_norm": 0.62109375, "learning_rate": 1.998841362862473e-05, "loss": 0.0675, "step": 192 }, { "epoch": 0.06535311421980487, "grad_norm": 0.54296875, "learning_rate": 1.998786857159369e-05, "loss": 0.0581, "step": 193 }, { "epoch": 0.06569173139192822, "grad_norm": 0.640625, "learning_rate": 1.9987310994768573e-05, "loss": 0.0743, "step": 194 }, { "epoch": 0.06603034856405156, "grad_norm": 0.57421875, "learning_rate": 1.9986740898848306e-05, "loss": 0.0655, "step": 195 }, { "epoch": 0.0663689657361749, "grad_norm": 0.73046875, "learning_rate": 1.998615828454751e-05, "loss": 0.0885, "step": 196 }, { "epoch": 0.06670758290829824, "grad_norm": 0.62890625, "learning_rate": 1.998556315259648e-05, "loss": 0.0657, "step": 197 }, { "epoch": 0.06704620008042157, "grad_norm": 0.65234375, "learning_rate": 1.9984955503741227e-05, "loss": 0.0742, "step": 198 }, { "epoch": 0.06738481725254492, "grad_norm": 0.73046875, "learning_rate": 1.998433533874343e-05, "loss": 0.0862, "step": 199 }, { "epoch": 0.06772343442466826, "grad_norm": 0.77734375, "learning_rate": 1.9983702658380474e-05, "loss": 0.1001, "step": 200 }, { "epoch": 0.0680620515967916, "grad_norm": 0.58984375, "learning_rate": 1.9983057463445415e-05, "loss": 0.0688, "step": 201 }, { "epoch": 0.06840066876891494, "grad_norm": 0.6953125, "learning_rate": 1.998239975474701e-05, "loss": 0.0882, "step": 202 }, { "epoch": 0.06873928594103829, "grad_norm": 0.64453125, "learning_rate": 1.9981729533109694e-05, "loss": 0.064, "step": 203 }, { "epoch": 0.06907790311316163, "grad_norm": 0.5703125, "learning_rate": 1.9981046799373595e-05, "loss": 0.0665, "step": 204 }, { "epoch": 0.06941652028528497, "grad_norm": 0.66796875, "learning_rate": 1.9980351554394514e-05, "loss": 0.0861, "step": 205 }, { "epoch": 0.06975513745740831, "grad_norm": 0.57421875, "learning_rate": 1.9979643799043945e-05, "loss": 0.0691, "step": 206 }, { "epoch": 0.07009375462953164, "grad_norm": 0.85546875, "learning_rate": 1.9978923534209052e-05, "loss": 0.1439, "step": 207 }, { "epoch": 0.07043237180165499, "grad_norm": 0.59765625, "learning_rate": 1.9978190760792698e-05, "loss": 0.0725, "step": 208 }, { "epoch": 0.07077098897377833, "grad_norm": 0.55859375, "learning_rate": 1.997744547971341e-05, "loss": 0.0732, "step": 209 }, { "epoch": 0.07110960614590167, "grad_norm": 0.60546875, "learning_rate": 1.9976687691905394e-05, "loss": 0.0718, "step": 210 }, { "epoch": 0.07144822331802501, "grad_norm": 0.78125, "learning_rate": 1.997591739831854e-05, "loss": 0.0774, "step": 211 }, { "epoch": 0.07178684049014836, "grad_norm": 0.57421875, "learning_rate": 1.9975134599918414e-05, "loss": 0.0717, "step": 212 }, { "epoch": 0.0721254576622717, "grad_norm": 0.478515625, "learning_rate": 1.9974339297686246e-05, "loss": 0.0601, "step": 213 }, { "epoch": 0.07246407483439504, "grad_norm": 0.6484375, "learning_rate": 1.9973531492618956e-05, "loss": 0.0813, "step": 214 }, { "epoch": 0.07280269200651839, "grad_norm": 0.578125, "learning_rate": 1.9972711185729124e-05, "loss": 0.0679, "step": 215 }, { "epoch": 0.07314130917864173, "grad_norm": 0.6484375, "learning_rate": 1.9971878378045005e-05, "loss": 0.0735, "step": 216 }, { "epoch": 0.07347992635076506, "grad_norm": 0.640625, "learning_rate": 1.997103307061052e-05, "loss": 0.0756, "step": 217 }, { "epoch": 0.0738185435228884, "grad_norm": 0.55078125, "learning_rate": 1.9970175264485268e-05, "loss": 0.0679, "step": 218 }, { "epoch": 0.07415716069501174, "grad_norm": 0.5390625, "learning_rate": 1.9969304960744508e-05, "loss": 0.0684, "step": 219 }, { "epoch": 0.07449577786713509, "grad_norm": 0.73828125, "learning_rate": 1.996842216047916e-05, "loss": 0.0952, "step": 220 }, { "epoch": 0.07483439503925843, "grad_norm": 0.6328125, "learning_rate": 1.996752686479582e-05, "loss": 0.0888, "step": 221 }, { "epoch": 0.07517301221138177, "grad_norm": 0.49609375, "learning_rate": 1.996661907481674e-05, "loss": 0.0633, "step": 222 }, { "epoch": 0.07551162938350511, "grad_norm": 0.82421875, "learning_rate": 1.9965698791679834e-05, "loss": 0.0985, "step": 223 }, { "epoch": 0.07585024655562846, "grad_norm": 0.69921875, "learning_rate": 1.996476601653868e-05, "loss": 0.0888, "step": 224 }, { "epoch": 0.0761888637277518, "grad_norm": 0.76171875, "learning_rate": 1.9963820750562506e-05, "loss": 0.0905, "step": 225 }, { "epoch": 0.07652748089987514, "grad_norm": 0.67578125, "learning_rate": 1.9962862994936207e-05, "loss": 0.0802, "step": 226 }, { "epoch": 0.07686609807199847, "grad_norm": 0.625, "learning_rate": 1.996189275086033e-05, "loss": 0.0809, "step": 227 }, { "epoch": 0.07720471524412181, "grad_norm": 0.60546875, "learning_rate": 1.9960910019551073e-05, "loss": 0.0823, "step": 228 }, { "epoch": 0.07754333241624516, "grad_norm": 0.6171875, "learning_rate": 1.9959914802240293e-05, "loss": 0.0713, "step": 229 }, { "epoch": 0.0778819495883685, "grad_norm": 0.59375, "learning_rate": 1.9958907100175492e-05, "loss": 0.0754, "step": 230 }, { "epoch": 0.07822056676049184, "grad_norm": 0.796875, "learning_rate": 1.9957886914619826e-05, "loss": 0.0887, "step": 231 }, { "epoch": 0.07855918393261518, "grad_norm": 0.5546875, "learning_rate": 1.99568542468521e-05, "loss": 0.0667, "step": 232 }, { "epoch": 0.07889780110473853, "grad_norm": 0.5234375, "learning_rate": 1.995580909816676e-05, "loss": 0.0622, "step": 233 }, { "epoch": 0.07923641827686187, "grad_norm": 0.62109375, "learning_rate": 1.99547514698739e-05, "loss": 0.0765, "step": 234 }, { "epoch": 0.07957503544898521, "grad_norm": 0.68359375, "learning_rate": 1.9953681363299258e-05, "loss": 0.0936, "step": 235 }, { "epoch": 0.07991365262110854, "grad_norm": 0.48046875, "learning_rate": 1.9952598779784214e-05, "loss": 0.0574, "step": 236 }, { "epoch": 0.08025226979323188, "grad_norm": 0.59765625, "learning_rate": 1.9951503720685784e-05, "loss": 0.0716, "step": 237 }, { "epoch": 0.08059088696535523, "grad_norm": 0.5859375, "learning_rate": 1.9950396187376628e-05, "loss": 0.0781, "step": 238 }, { "epoch": 0.08092950413747857, "grad_norm": 0.58203125, "learning_rate": 1.9949276181245037e-05, "loss": 0.0779, "step": 239 }, { "epoch": 0.08126812130960191, "grad_norm": 0.703125, "learning_rate": 1.994814370369494e-05, "loss": 0.1049, "step": 240 }, { "epoch": 0.08160673848172526, "grad_norm": 0.734375, "learning_rate": 1.9946998756145894e-05, "loss": 0.0857, "step": 241 }, { "epoch": 0.0819453556538486, "grad_norm": 0.59765625, "learning_rate": 1.9945841340033093e-05, "loss": 0.0722, "step": 242 }, { "epoch": 0.08228397282597194, "grad_norm": 0.6640625, "learning_rate": 1.994467145680736e-05, "loss": 0.083, "step": 243 }, { "epoch": 0.08262258999809528, "grad_norm": 0.8203125, "learning_rate": 1.994348910793514e-05, "loss": 0.0852, "step": 244 }, { "epoch": 0.08296120717021863, "grad_norm": 0.55078125, "learning_rate": 1.9942294294898513e-05, "loss": 0.063, "step": 245 }, { "epoch": 0.08329982434234195, "grad_norm": 0.54296875, "learning_rate": 1.994108701919517e-05, "loss": 0.0701, "step": 246 }, { "epoch": 0.0836384415144653, "grad_norm": 0.6953125, "learning_rate": 1.993986728233844e-05, "loss": 0.0869, "step": 247 }, { "epoch": 0.08397705868658864, "grad_norm": 0.52734375, "learning_rate": 1.9938635085857257e-05, "loss": 0.0619, "step": 248 }, { "epoch": 0.08431567585871198, "grad_norm": 0.578125, "learning_rate": 1.993739043129618e-05, "loss": 0.0643, "step": 249 }, { "epoch": 0.08465429303083533, "grad_norm": 0.6484375, "learning_rate": 1.9936133320215385e-05, "loss": 0.0753, "step": 250 }, { "epoch": 0.08499291020295867, "grad_norm": 0.6484375, "learning_rate": 1.9934863754190662e-05, "loss": 0.0535, "step": 251 }, { "epoch": 0.08533152737508201, "grad_norm": 0.671875, "learning_rate": 1.9933581734813404e-05, "loss": 0.0759, "step": 252 }, { "epoch": 0.08567014454720535, "grad_norm": 0.67578125, "learning_rate": 1.9932287263690637e-05, "loss": 0.0852, "step": 253 }, { "epoch": 0.0860087617193287, "grad_norm": 0.64453125, "learning_rate": 1.9930980342444966e-05, "loss": 0.07, "step": 254 }, { "epoch": 0.08634737889145204, "grad_norm": 0.81640625, "learning_rate": 1.9929660972714626e-05, "loss": 0.1072, "step": 255 }, { "epoch": 0.08668599606357537, "grad_norm": 2.0, "learning_rate": 1.9928329156153444e-05, "loss": 0.0896, "step": 256 }, { "epoch": 0.08702461323569871, "grad_norm": 0.63671875, "learning_rate": 1.992698489443085e-05, "loss": 0.0809, "step": 257 }, { "epoch": 0.08736323040782205, "grad_norm": 0.91015625, "learning_rate": 1.9925628189231885e-05, "loss": 0.1163, "step": 258 }, { "epoch": 0.0877018475799454, "grad_norm": 0.64453125, "learning_rate": 1.992425904225717e-05, "loss": 0.0763, "step": 259 }, { "epoch": 0.08804046475206874, "grad_norm": 0.8046875, "learning_rate": 1.9922877455222932e-05, "loss": 0.069, "step": 260 }, { "epoch": 0.08837908192419208, "grad_norm": 0.55078125, "learning_rate": 1.992148342986099e-05, "loss": 0.0689, "step": 261 }, { "epoch": 0.08871769909631542, "grad_norm": 0.625, "learning_rate": 1.9920076967918762e-05, "loss": 0.0634, "step": 262 }, { "epoch": 0.08905631626843877, "grad_norm": 0.72265625, "learning_rate": 1.9918658071159243e-05, "loss": 0.0745, "step": 263 }, { "epoch": 0.08939493344056211, "grad_norm": 0.68359375, "learning_rate": 1.9917226741361014e-05, "loss": 0.0769, "step": 264 }, { "epoch": 0.08973355061268544, "grad_norm": 0.671875, "learning_rate": 1.991578298031826e-05, "loss": 0.0794, "step": 265 }, { "epoch": 0.09007216778480878, "grad_norm": 0.6640625, "learning_rate": 1.9914326789840728e-05, "loss": 0.0771, "step": 266 }, { "epoch": 0.09041078495693212, "grad_norm": 0.90234375, "learning_rate": 1.991285817175375e-05, "loss": 0.0841, "step": 267 }, { "epoch": 0.09074940212905547, "grad_norm": 0.64453125, "learning_rate": 1.991137712789825e-05, "loss": 0.0692, "step": 268 }, { "epoch": 0.09108801930117881, "grad_norm": 0.81640625, "learning_rate": 1.9909883660130703e-05, "loss": 0.0986, "step": 269 }, { "epoch": 0.09142663647330215, "grad_norm": 0.8203125, "learning_rate": 1.9908377770323178e-05, "loss": 0.0774, "step": 270 }, { "epoch": 0.0917652536454255, "grad_norm": 0.5859375, "learning_rate": 1.9906859460363307e-05, "loss": 0.0663, "step": 271 }, { "epoch": 0.09210387081754884, "grad_norm": 0.69921875, "learning_rate": 1.9905328732154294e-05, "loss": 0.0812, "step": 272 }, { "epoch": 0.09244248798967218, "grad_norm": 0.69140625, "learning_rate": 1.9903785587614907e-05, "loss": 0.0833, "step": 273 }, { "epoch": 0.09278110516179552, "grad_norm": 0.63671875, "learning_rate": 1.990223002867947e-05, "loss": 0.0854, "step": 274 }, { "epoch": 0.09311972233391885, "grad_norm": 1.375, "learning_rate": 1.9900662057297886e-05, "loss": 0.0741, "step": 275 }, { "epoch": 0.0934583395060422, "grad_norm": 0.69921875, "learning_rate": 1.9899081675435604e-05, "loss": 0.0868, "step": 276 }, { "epoch": 0.09379695667816554, "grad_norm": 0.56640625, "learning_rate": 1.989748888507363e-05, "loss": 0.0632, "step": 277 }, { "epoch": 0.09413557385028888, "grad_norm": 0.6015625, "learning_rate": 1.9895883688208527e-05, "loss": 0.0696, "step": 278 }, { "epoch": 0.09447419102241222, "grad_norm": 0.5546875, "learning_rate": 1.9894266086852414e-05, "loss": 0.0657, "step": 279 }, { "epoch": 0.09481280819453557, "grad_norm": 0.6484375, "learning_rate": 1.989263608303295e-05, "loss": 0.0742, "step": 280 }, { "epoch": 0.09515142536665891, "grad_norm": 0.734375, "learning_rate": 1.989099367879335e-05, "loss": 0.0787, "step": 281 }, { "epoch": 0.09549004253878225, "grad_norm": 0.5625, "learning_rate": 1.9889338876192365e-05, "loss": 0.0675, "step": 282 }, { "epoch": 0.0958286597109056, "grad_norm": 0.5859375, "learning_rate": 1.9887671677304285e-05, "loss": 0.073, "step": 283 }, { "epoch": 0.09616727688302892, "grad_norm": 0.6171875, "learning_rate": 1.9885992084218948e-05, "loss": 0.0574, "step": 284 }, { "epoch": 0.09650589405515227, "grad_norm": 0.8671875, "learning_rate": 1.9884300099041728e-05, "loss": 0.1312, "step": 285 }, { "epoch": 0.09684451122727561, "grad_norm": 0.69140625, "learning_rate": 1.9882595723893525e-05, "loss": 0.0594, "step": 286 }, { "epoch": 0.09718312839939895, "grad_norm": 0.51171875, "learning_rate": 1.9880878960910772e-05, "loss": 0.0683, "step": 287 }, { "epoch": 0.0975217455715223, "grad_norm": 0.6484375, "learning_rate": 1.9879149812245434e-05, "loss": 0.0887, "step": 288 }, { "epoch": 0.09786036274364564, "grad_norm": 0.57421875, "learning_rate": 1.9877408280065e-05, "loss": 0.0788, "step": 289 }, { "epoch": 0.09819897991576898, "grad_norm": 0.578125, "learning_rate": 1.9875654366552476e-05, "loss": 0.0716, "step": 290 }, { "epoch": 0.09853759708789232, "grad_norm": 0.67578125, "learning_rate": 1.9873888073906396e-05, "loss": 0.0902, "step": 291 }, { "epoch": 0.09887621426001567, "grad_norm": 3.21875, "learning_rate": 1.987210940434081e-05, "loss": 0.0776, "step": 292 }, { "epoch": 0.09921483143213901, "grad_norm": 0.6171875, "learning_rate": 1.9870318360085277e-05, "loss": 0.0828, "step": 293 }, { "epoch": 0.09955344860426234, "grad_norm": 0.5703125, "learning_rate": 1.9868514943384872e-05, "loss": 0.0727, "step": 294 }, { "epoch": 0.09989206577638568, "grad_norm": 0.6484375, "learning_rate": 1.9866699156500177e-05, "loss": 0.0831, "step": 295 }, { "epoch": 0.10023068294850902, "grad_norm": 0.73828125, "learning_rate": 1.986487100170728e-05, "loss": 0.0895, "step": 296 }, { "epoch": 0.10023068294850902, "eval_loss": 0.0773262232542038, "eval_runtime": 833.2157, "eval_samples_per_second": 11.939, "eval_steps_per_second": 2.985, "step": 296 }, { "epoch": 0.10056930012063237, "grad_norm": 0.60546875, "learning_rate": 1.986303048129778e-05, "loss": 0.0779, "step": 297 }, { "epoch": 0.10090791729275571, "grad_norm": 0.8515625, "learning_rate": 1.9861177597578765e-05, "loss": 0.0699, "step": 298 }, { "epoch": 0.10124653446487905, "grad_norm": 0.60546875, "learning_rate": 1.9859312352872822e-05, "loss": 0.0706, "step": 299 }, { "epoch": 0.1015851516370024, "grad_norm": 0.76953125, "learning_rate": 1.985743474951804e-05, "loss": 0.0926, "step": 300 }, { "epoch": 0.10192376880912574, "grad_norm": 0.58984375, "learning_rate": 1.985554478986799e-05, "loss": 0.0685, "step": 301 }, { "epoch": 0.10226238598124908, "grad_norm": 0.58984375, "learning_rate": 1.9853642476291743e-05, "loss": 0.0623, "step": 302 }, { "epoch": 0.10260100315337242, "grad_norm": 0.5625, "learning_rate": 1.9851727811173844e-05, "loss": 0.0708, "step": 303 }, { "epoch": 0.10293962032549575, "grad_norm": 0.71484375, "learning_rate": 1.984980079691433e-05, "loss": 0.0816, "step": 304 }, { "epoch": 0.1032782374976191, "grad_norm": 0.609375, "learning_rate": 1.9847861435928708e-05, "loss": 0.0685, "step": 305 }, { "epoch": 0.10361685466974244, "grad_norm": 0.66015625, "learning_rate": 1.984590973064797e-05, "loss": 0.0951, "step": 306 }, { "epoch": 0.10395547184186578, "grad_norm": 0.62890625, "learning_rate": 1.984394568351858e-05, "loss": 0.0931, "step": 307 }, { "epoch": 0.10429408901398912, "grad_norm": 0.6015625, "learning_rate": 1.9841969297002473e-05, "loss": 0.0701, "step": 308 }, { "epoch": 0.10463270618611246, "grad_norm": 0.6484375, "learning_rate": 1.9839980573577046e-05, "loss": 0.0865, "step": 309 }, { "epoch": 0.10497132335823581, "grad_norm": 0.55078125, "learning_rate": 1.9837979515735168e-05, "loss": 0.0716, "step": 310 }, { "epoch": 0.10530994053035915, "grad_norm": 0.80859375, "learning_rate": 1.9835966125985155e-05, "loss": 0.0832, "step": 311 }, { "epoch": 0.10564855770248249, "grad_norm": 0.62890625, "learning_rate": 1.9833940406850805e-05, "loss": 0.0777, "step": 312 }, { "epoch": 0.10598717487460582, "grad_norm": 0.6015625, "learning_rate": 1.9831902360871344e-05, "loss": 0.0747, "step": 313 }, { "epoch": 0.10632579204672916, "grad_norm": 0.59375, "learning_rate": 1.9829851990601475e-05, "loss": 0.0761, "step": 314 }, { "epoch": 0.10666440921885251, "grad_norm": 0.484375, "learning_rate": 1.982778929861133e-05, "loss": 0.0632, "step": 315 }, { "epoch": 0.10700302639097585, "grad_norm": 0.6796875, "learning_rate": 1.9825714287486493e-05, "loss": 0.0886, "step": 316 }, { "epoch": 0.10734164356309919, "grad_norm": 0.4921875, "learning_rate": 1.9823626959827997e-05, "loss": 0.0639, "step": 317 }, { "epoch": 0.10768026073522254, "grad_norm": 0.671875, "learning_rate": 1.98215273182523e-05, "loss": 0.0787, "step": 318 }, { "epoch": 0.10801887790734588, "grad_norm": 0.73046875, "learning_rate": 1.9819415365391307e-05, "loss": 0.1027, "step": 319 }, { "epoch": 0.10835749507946922, "grad_norm": 0.6328125, "learning_rate": 1.9817291103892348e-05, "loss": 0.0793, "step": 320 }, { "epoch": 0.10869611225159256, "grad_norm": 0.6796875, "learning_rate": 1.981515453641819e-05, "loss": 0.0799, "step": 321 }, { "epoch": 0.1090347294237159, "grad_norm": 0.81640625, "learning_rate": 1.9813005665647017e-05, "loss": 0.1096, "step": 322 }, { "epoch": 0.10937334659583924, "grad_norm": 0.48828125, "learning_rate": 1.981084449427244e-05, "loss": 0.0687, "step": 323 }, { "epoch": 0.10971196376796258, "grad_norm": 0.65234375, "learning_rate": 1.9808671025003487e-05, "loss": 0.0751, "step": 324 }, { "epoch": 0.11005058094008592, "grad_norm": 0.64453125, "learning_rate": 1.9806485260564597e-05, "loss": 0.0686, "step": 325 }, { "epoch": 0.11038919811220926, "grad_norm": 0.57421875, "learning_rate": 1.9804287203695636e-05, "loss": 0.0608, "step": 326 }, { "epoch": 0.1107278152843326, "grad_norm": 0.67578125, "learning_rate": 1.9802076857151863e-05, "loss": 0.1027, "step": 327 }, { "epoch": 0.11106643245645595, "grad_norm": 0.640625, "learning_rate": 1.9799854223703943e-05, "loss": 0.0796, "step": 328 }, { "epoch": 0.11140504962857929, "grad_norm": 0.71484375, "learning_rate": 1.9797619306137958e-05, "loss": 0.0817, "step": 329 }, { "epoch": 0.11174366680070263, "grad_norm": 0.56640625, "learning_rate": 1.9795372107255368e-05, "loss": 0.0582, "step": 330 }, { "epoch": 0.11208228397282598, "grad_norm": 0.703125, "learning_rate": 1.979311262987304e-05, "loss": 0.0996, "step": 331 }, { "epoch": 0.11242090114494932, "grad_norm": 0.494140625, "learning_rate": 1.979084087682323e-05, "loss": 0.0598, "step": 332 }, { "epoch": 0.11275951831707265, "grad_norm": 0.5078125, "learning_rate": 1.978855685095358e-05, "loss": 0.0623, "step": 333 }, { "epoch": 0.11309813548919599, "grad_norm": 0.490234375, "learning_rate": 1.9786260555127116e-05, "loss": 0.0582, "step": 334 }, { "epoch": 0.11343675266131933, "grad_norm": 0.71875, "learning_rate": 1.9783951992222246e-05, "loss": 0.091, "step": 335 }, { "epoch": 0.11377536983344268, "grad_norm": 0.5859375, "learning_rate": 1.9781631165132755e-05, "loss": 0.0793, "step": 336 }, { "epoch": 0.11411398700556602, "grad_norm": 0.51953125, "learning_rate": 1.9779298076767795e-05, "loss": 0.0565, "step": 337 }, { "epoch": 0.11445260417768936, "grad_norm": 0.59765625, "learning_rate": 1.9776952730051896e-05, "loss": 0.0736, "step": 338 }, { "epoch": 0.1147912213498127, "grad_norm": 0.67578125, "learning_rate": 1.9774595127924955e-05, "loss": 0.0834, "step": 339 }, { "epoch": 0.11512983852193605, "grad_norm": 0.5390625, "learning_rate": 1.9772225273342216e-05, "loss": 0.0604, "step": 340 }, { "epoch": 0.11546845569405939, "grad_norm": 0.65234375, "learning_rate": 1.97698431692743e-05, "loss": 0.0674, "step": 341 }, { "epoch": 0.11580707286618272, "grad_norm": 0.68359375, "learning_rate": 1.976744881870717e-05, "loss": 0.0729, "step": 342 }, { "epoch": 0.11614569003830606, "grad_norm": 0.64453125, "learning_rate": 1.9765042224642146e-05, "loss": 0.0758, "step": 343 }, { "epoch": 0.1164843072104294, "grad_norm": 0.5859375, "learning_rate": 1.9762623390095897e-05, "loss": 0.0778, "step": 344 }, { "epoch": 0.11682292438255275, "grad_norm": 0.6875, "learning_rate": 1.976019231810043e-05, "loss": 0.079, "step": 345 }, { "epoch": 0.11716154155467609, "grad_norm": 0.5703125, "learning_rate": 1.9757749011703095e-05, "loss": 0.0729, "step": 346 }, { "epoch": 0.11750015872679943, "grad_norm": 0.625, "learning_rate": 1.9755293473966574e-05, "loss": 0.069, "step": 347 }, { "epoch": 0.11783877589892278, "grad_norm": 0.60546875, "learning_rate": 1.9752825707968884e-05, "loss": 0.0707, "step": 348 }, { "epoch": 0.11817739307104612, "grad_norm": 0.546875, "learning_rate": 1.975034571680337e-05, "loss": 0.0582, "step": 349 }, { "epoch": 0.11851601024316946, "grad_norm": 0.65625, "learning_rate": 1.9747853503578708e-05, "loss": 0.073, "step": 350 }, { "epoch": 0.1188546274152928, "grad_norm": 0.515625, "learning_rate": 1.9745349071418877e-05, "loss": 0.0576, "step": 351 }, { "epoch": 0.11919324458741613, "grad_norm": 0.68359375, "learning_rate": 1.974283242346319e-05, "loss": 0.0855, "step": 352 }, { "epoch": 0.11953186175953948, "grad_norm": 0.7890625, "learning_rate": 1.974030356286626e-05, "loss": 0.0982, "step": 353 }, { "epoch": 0.11987047893166282, "grad_norm": 0.546875, "learning_rate": 1.9737762492798018e-05, "loss": 0.0632, "step": 354 }, { "epoch": 0.12020909610378616, "grad_norm": 0.53125, "learning_rate": 1.97352092164437e-05, "loss": 0.0637, "step": 355 }, { "epoch": 0.1205477132759095, "grad_norm": 0.6328125, "learning_rate": 1.9732643737003827e-05, "loss": 0.0851, "step": 356 }, { "epoch": 0.12088633044803285, "grad_norm": 0.62109375, "learning_rate": 1.9730066057694236e-05, "loss": 0.0726, "step": 357 }, { "epoch": 0.12122494762015619, "grad_norm": 0.64453125, "learning_rate": 1.9727476181746045e-05, "loss": 0.0977, "step": 358 }, { "epoch": 0.12156356479227953, "grad_norm": 0.70703125, "learning_rate": 1.9724874112405663e-05, "loss": 0.0807, "step": 359 }, { "epoch": 0.12190218196440288, "grad_norm": 0.51171875, "learning_rate": 1.9722259852934785e-05, "loss": 0.0616, "step": 360 }, { "epoch": 0.1222407991365262, "grad_norm": 0.5546875, "learning_rate": 1.971963340661039e-05, "loss": 0.0783, "step": 361 }, { "epoch": 0.12257941630864955, "grad_norm": 0.58984375, "learning_rate": 1.971699477672472e-05, "loss": 0.0684, "step": 362 }, { "epoch": 0.12291803348077289, "grad_norm": 0.5390625, "learning_rate": 1.9714343966585308e-05, "loss": 0.0636, "step": 363 }, { "epoch": 0.12325665065289623, "grad_norm": 0.51171875, "learning_rate": 1.9711680979514936e-05, "loss": 0.0655, "step": 364 }, { "epoch": 0.12359526782501958, "grad_norm": 0.60546875, "learning_rate": 1.970900581885166e-05, "loss": 0.0897, "step": 365 }, { "epoch": 0.12393388499714292, "grad_norm": 0.5234375, "learning_rate": 1.97063184879488e-05, "loss": 0.0594, "step": 366 }, { "epoch": 0.12427250216926626, "grad_norm": 0.54296875, "learning_rate": 1.9703618990174917e-05, "loss": 0.0733, "step": 367 }, { "epoch": 0.1246111193413896, "grad_norm": 0.671875, "learning_rate": 1.970090732891384e-05, "loss": 0.1027, "step": 368 }, { "epoch": 0.12494973651351295, "grad_norm": 0.56640625, "learning_rate": 1.9698183507564626e-05, "loss": 0.0769, "step": 369 }, { "epoch": 0.1252883536856363, "grad_norm": 0.515625, "learning_rate": 1.96954475295416e-05, "loss": 0.0639, "step": 370 }, { "epoch": 0.12562697085775962, "grad_norm": 0.498046875, "learning_rate": 1.9692699398274298e-05, "loss": 0.0608, "step": 371 }, { "epoch": 0.12596558802988297, "grad_norm": 0.53515625, "learning_rate": 1.968993911720751e-05, "loss": 0.0676, "step": 372 }, { "epoch": 0.1263042052020063, "grad_norm": 0.5078125, "learning_rate": 1.9687166689801244e-05, "loss": 0.065, "step": 373 }, { "epoch": 0.12664282237412966, "grad_norm": 0.76171875, "learning_rate": 1.968438211953074e-05, "loss": 0.1108, "step": 374 }, { "epoch": 0.126981439546253, "grad_norm": 0.6171875, "learning_rate": 1.9681585409886454e-05, "loss": 0.0755, "step": 375 }, { "epoch": 0.12732005671837632, "grad_norm": 0.5, "learning_rate": 1.9678776564374068e-05, "loss": 0.0649, "step": 376 }, { "epoch": 0.12765867389049967, "grad_norm": 0.68359375, "learning_rate": 1.967595558651447e-05, "loss": 0.081, "step": 377 }, { "epoch": 0.127997291062623, "grad_norm": 0.5703125, "learning_rate": 1.9673122479843748e-05, "loss": 0.0675, "step": 378 }, { "epoch": 0.12833590823474636, "grad_norm": 0.640625, "learning_rate": 1.9670277247913205e-05, "loss": 0.0803, "step": 379 }, { "epoch": 0.1286745254068697, "grad_norm": 0.58203125, "learning_rate": 1.9667419894289345e-05, "loss": 0.0778, "step": 380 }, { "epoch": 0.12901314257899305, "grad_norm": 0.48828125, "learning_rate": 1.9664550422553852e-05, "loss": 0.0565, "step": 381 }, { "epoch": 0.12935175975111637, "grad_norm": 0.66015625, "learning_rate": 1.966166883630362e-05, "loss": 0.0802, "step": 382 }, { "epoch": 0.12969037692323973, "grad_norm": 0.486328125, "learning_rate": 1.9658775139150705e-05, "loss": 0.0626, "step": 383 }, { "epoch": 0.13002899409536306, "grad_norm": 0.546875, "learning_rate": 1.9655869334722363e-05, "loss": 0.0667, "step": 384 }, { "epoch": 0.13036761126748642, "grad_norm": 0.53515625, "learning_rate": 1.9652951426661025e-05, "loss": 0.0552, "step": 385 }, { "epoch": 0.13070622843960975, "grad_norm": 0.66015625, "learning_rate": 1.965002141862428e-05, "loss": 0.06, "step": 386 }, { "epoch": 0.13104484561173307, "grad_norm": 0.5625, "learning_rate": 1.9647079314284897e-05, "loss": 0.0681, "step": 387 }, { "epoch": 0.13138346278385643, "grad_norm": 0.625, "learning_rate": 1.9644125117330806e-05, "loss": 0.0949, "step": 388 }, { "epoch": 0.13172207995597976, "grad_norm": 0.578125, "learning_rate": 1.964115883146509e-05, "loss": 0.0591, "step": 389 }, { "epoch": 0.13206069712810312, "grad_norm": 0.63671875, "learning_rate": 1.9638180460405995e-05, "loss": 0.0798, "step": 390 }, { "epoch": 0.13239931430022644, "grad_norm": 0.6484375, "learning_rate": 1.96351900078869e-05, "loss": 0.0617, "step": 391 }, { "epoch": 0.1327379314723498, "grad_norm": 0.53515625, "learning_rate": 1.9632187477656342e-05, "loss": 0.0765, "step": 392 }, { "epoch": 0.13307654864447313, "grad_norm": 0.58984375, "learning_rate": 1.9629172873477995e-05, "loss": 0.0888, "step": 393 }, { "epoch": 0.1334151658165965, "grad_norm": 0.56640625, "learning_rate": 1.9626146199130664e-05, "loss": 0.0678, "step": 394 }, { "epoch": 0.13375378298871982, "grad_norm": 0.640625, "learning_rate": 1.962310745840828e-05, "loss": 0.0658, "step": 395 }, { "epoch": 0.13409240016084314, "grad_norm": 0.4921875, "learning_rate": 1.962005665511991e-05, "loss": 0.0577, "step": 396 }, { "epoch": 0.1344310173329665, "grad_norm": 0.59765625, "learning_rate": 1.961699379308974e-05, "loss": 0.0693, "step": 397 }, { "epoch": 0.13476963450508983, "grad_norm": 0.58203125, "learning_rate": 1.9613918876157062e-05, "loss": 0.0795, "step": 398 }, { "epoch": 0.1351082516772132, "grad_norm": 0.65234375, "learning_rate": 1.9610831908176285e-05, "loss": 0.0647, "step": 399 }, { "epoch": 0.13544686884933652, "grad_norm": 0.546875, "learning_rate": 1.9607732893016926e-05, "loss": 0.0737, "step": 400 }, { "epoch": 0.13578548602145987, "grad_norm": 0.59765625, "learning_rate": 1.9604621834563602e-05, "loss": 0.0687, "step": 401 }, { "epoch": 0.1361241031935832, "grad_norm": 0.486328125, "learning_rate": 1.960149873671602e-05, "loss": 0.0583, "step": 402 }, { "epoch": 0.13646272036570656, "grad_norm": 0.5234375, "learning_rate": 1.9598363603388986e-05, "loss": 0.0702, "step": 403 }, { "epoch": 0.1368013375378299, "grad_norm": 0.64453125, "learning_rate": 1.959521643851239e-05, "loss": 0.0803, "step": 404 }, { "epoch": 0.13713995470995322, "grad_norm": 0.8515625, "learning_rate": 1.9592057246031203e-05, "loss": 0.0896, "step": 405 }, { "epoch": 0.13747857188207657, "grad_norm": 0.8046875, "learning_rate": 1.9588886029905474e-05, "loss": 0.0889, "step": 406 }, { "epoch": 0.1378171890541999, "grad_norm": 0.53515625, "learning_rate": 1.9585702794110322e-05, "loss": 0.0684, "step": 407 }, { "epoch": 0.13815580622632326, "grad_norm": 0.65625, "learning_rate": 1.9582507542635933e-05, "loss": 0.0822, "step": 408 }, { "epoch": 0.1384944233984466, "grad_norm": 0.4765625, "learning_rate": 1.9579300279487558e-05, "loss": 0.0572, "step": 409 }, { "epoch": 0.13883304057056994, "grad_norm": 0.609375, "learning_rate": 1.9576081008685495e-05, "loss": 0.0897, "step": 410 }, { "epoch": 0.13917165774269327, "grad_norm": 0.52734375, "learning_rate": 1.9572849734265107e-05, "loss": 0.0655, "step": 411 }, { "epoch": 0.13951027491481663, "grad_norm": 0.63671875, "learning_rate": 1.956960646027679e-05, "loss": 0.0831, "step": 412 }, { "epoch": 0.13984889208693996, "grad_norm": 0.765625, "learning_rate": 1.9566351190785998e-05, "loss": 0.0684, "step": 413 }, { "epoch": 0.1401875092590633, "grad_norm": 0.52734375, "learning_rate": 1.9563083929873202e-05, "loss": 0.0739, "step": 414 }, { "epoch": 0.14052612643118664, "grad_norm": 0.46875, "learning_rate": 1.9559804681633918e-05, "loss": 0.0624, "step": 415 }, { "epoch": 0.14086474360330997, "grad_norm": 0.58984375, "learning_rate": 1.9556513450178683e-05, "loss": 0.0775, "step": 416 }, { "epoch": 0.14120336077543333, "grad_norm": 0.671875, "learning_rate": 1.955321023963306e-05, "loss": 0.1071, "step": 417 }, { "epoch": 0.14154197794755666, "grad_norm": 0.61328125, "learning_rate": 1.9549895054137616e-05, "loss": 0.0705, "step": 418 }, { "epoch": 0.14188059511968001, "grad_norm": 0.70703125, "learning_rate": 1.954656789784794e-05, "loss": 0.0901, "step": 419 }, { "epoch": 0.14221921229180334, "grad_norm": 0.68359375, "learning_rate": 1.9543228774934627e-05, "loss": 0.0946, "step": 420 }, { "epoch": 0.1425578294639267, "grad_norm": 0.484375, "learning_rate": 1.953987768958326e-05, "loss": 0.0663, "step": 421 }, { "epoch": 0.14289644663605003, "grad_norm": 0.51953125, "learning_rate": 1.953651464599443e-05, "loss": 0.0736, "step": 422 }, { "epoch": 0.14323506380817339, "grad_norm": 0.71484375, "learning_rate": 1.9533139648383712e-05, "loss": 0.0952, "step": 423 }, { "epoch": 0.14357368098029671, "grad_norm": 0.5390625, "learning_rate": 1.9529752700981664e-05, "loss": 0.0701, "step": 424 }, { "epoch": 0.14391229815242004, "grad_norm": 0.60546875, "learning_rate": 1.9526353808033827e-05, "loss": 0.0776, "step": 425 }, { "epoch": 0.1442509153245434, "grad_norm": 0.46484375, "learning_rate": 1.9522942973800712e-05, "loss": 0.0644, "step": 426 }, { "epoch": 0.14458953249666673, "grad_norm": 0.69140625, "learning_rate": 1.95195202025578e-05, "loss": 0.0784, "step": 427 }, { "epoch": 0.14492814966879008, "grad_norm": 0.490234375, "learning_rate": 1.9516085498595533e-05, "loss": 0.0623, "step": 428 }, { "epoch": 0.1452667668409134, "grad_norm": 0.447265625, "learning_rate": 1.951263886621932e-05, "loss": 0.0529, "step": 429 }, { "epoch": 0.14560538401303677, "grad_norm": 0.61328125, "learning_rate": 1.9509180309749505e-05, "loss": 0.0828, "step": 430 }, { "epoch": 0.1459440011851601, "grad_norm": 0.4921875, "learning_rate": 1.9505709833521396e-05, "loss": 0.0572, "step": 431 }, { "epoch": 0.14628261835728346, "grad_norm": 0.6328125, "learning_rate": 1.9502227441885232e-05, "loss": 0.0668, "step": 432 }, { "epoch": 0.14662123552940678, "grad_norm": 0.60546875, "learning_rate": 1.9498733139206193e-05, "loss": 0.0878, "step": 433 }, { "epoch": 0.1469598527015301, "grad_norm": 0.5625, "learning_rate": 1.9495226929864384e-05, "loss": 0.0672, "step": 434 }, { "epoch": 0.14729846987365347, "grad_norm": 0.62109375, "learning_rate": 1.9491708818254847e-05, "loss": 0.078, "step": 435 }, { "epoch": 0.1476370870457768, "grad_norm": 0.66796875, "learning_rate": 1.9488178808787527e-05, "loss": 0.0633, "step": 436 }, { "epoch": 0.14797570421790016, "grad_norm": 0.76953125, "learning_rate": 1.94846369058873e-05, "loss": 0.1021, "step": 437 }, { "epoch": 0.14831432139002348, "grad_norm": 0.55859375, "learning_rate": 1.9481083113993927e-05, "loss": 0.0724, "step": 438 }, { "epoch": 0.14865293856214684, "grad_norm": 0.6484375, "learning_rate": 1.9477517437562097e-05, "loss": 0.0714, "step": 439 }, { "epoch": 0.14899155573427017, "grad_norm": 0.6484375, "learning_rate": 1.9473939881061385e-05, "loss": 0.0857, "step": 440 }, { "epoch": 0.14933017290639353, "grad_norm": 0.6171875, "learning_rate": 1.9470350448976257e-05, "loss": 0.0953, "step": 441 }, { "epoch": 0.14966879007851686, "grad_norm": 0.859375, "learning_rate": 1.9466749145806065e-05, "loss": 0.0685, "step": 442 }, { "epoch": 0.15000740725064018, "grad_norm": 0.671875, "learning_rate": 1.9463135976065043e-05, "loss": 0.0905, "step": 443 }, { "epoch": 0.15034602442276354, "grad_norm": 0.6015625, "learning_rate": 1.9459510944282307e-05, "loss": 0.0771, "step": 444 }, { "epoch": 0.15068464159488687, "grad_norm": 0.57421875, "learning_rate": 1.9455874055001824e-05, "loss": 0.0682, "step": 445 }, { "epoch": 0.15102325876701023, "grad_norm": 0.451171875, "learning_rate": 1.945222531278244e-05, "loss": 0.0599, "step": 446 }, { "epoch": 0.15136187593913356, "grad_norm": 0.546875, "learning_rate": 1.9448564722197855e-05, "loss": 0.0668, "step": 447 }, { "epoch": 0.1517004931112569, "grad_norm": 0.53515625, "learning_rate": 1.9444892287836614e-05, "loss": 0.0712, "step": 448 }, { "epoch": 0.15203911028338024, "grad_norm": 0.5390625, "learning_rate": 1.944120801430212e-05, "loss": 0.0727, "step": 449 }, { "epoch": 0.1523777274555036, "grad_norm": 0.546875, "learning_rate": 1.9437511906212607e-05, "loss": 0.0698, "step": 450 }, { "epoch": 0.15271634462762693, "grad_norm": 0.69140625, "learning_rate": 1.9433803968201148e-05, "loss": 0.0945, "step": 451 }, { "epoch": 0.15305496179975028, "grad_norm": 0.57421875, "learning_rate": 1.9430084204915642e-05, "loss": 0.069, "step": 452 }, { "epoch": 0.1533935789718736, "grad_norm": 0.58984375, "learning_rate": 1.9426352621018817e-05, "loss": 0.071, "step": 453 }, { "epoch": 0.15373219614399694, "grad_norm": 0.609375, "learning_rate": 1.9422609221188208e-05, "loss": 0.0809, "step": 454 }, { "epoch": 0.1540708133161203, "grad_norm": 0.6484375, "learning_rate": 1.9418854010116168e-05, "loss": 0.085, "step": 455 }, { "epoch": 0.15440943048824363, "grad_norm": 0.609375, "learning_rate": 1.9415086992509858e-05, "loss": 0.0916, "step": 456 }, { "epoch": 0.15474804766036698, "grad_norm": 0.59765625, "learning_rate": 1.941130817309123e-05, "loss": 0.0544, "step": 457 }, { "epoch": 0.1550866648324903, "grad_norm": 0.58984375, "learning_rate": 1.940751755659704e-05, "loss": 0.073, "step": 458 }, { "epoch": 0.15542528200461367, "grad_norm": 0.61328125, "learning_rate": 1.9403715147778822e-05, "loss": 0.0854, "step": 459 }, { "epoch": 0.155763899176737, "grad_norm": 0.765625, "learning_rate": 1.9399900951402897e-05, "loss": 0.0612, "step": 460 }, { "epoch": 0.15610251634886035, "grad_norm": 0.58984375, "learning_rate": 1.939607497225036e-05, "loss": 0.079, "step": 461 }, { "epoch": 0.15644113352098368, "grad_norm": 0.46484375, "learning_rate": 1.9392237215117076e-05, "loss": 0.0544, "step": 462 }, { "epoch": 0.156779750693107, "grad_norm": 0.466796875, "learning_rate": 1.9388387684813676e-05, "loss": 0.0535, "step": 463 }, { "epoch": 0.15711836786523037, "grad_norm": 0.546875, "learning_rate": 1.9384526386165548e-05, "loss": 0.081, "step": 464 }, { "epoch": 0.1574569850373537, "grad_norm": 0.66015625, "learning_rate": 1.938065332401282e-05, "loss": 0.0916, "step": 465 }, { "epoch": 0.15779560220947705, "grad_norm": 0.453125, "learning_rate": 1.9376768503210388e-05, "loss": 0.0584, "step": 466 }, { "epoch": 0.15813421938160038, "grad_norm": 0.65234375, "learning_rate": 1.937287192862787e-05, "loss": 0.0942, "step": 467 }, { "epoch": 0.15847283655372374, "grad_norm": 0.6015625, "learning_rate": 1.9368963605149624e-05, "loss": 0.0744, "step": 468 }, { "epoch": 0.15881145372584707, "grad_norm": 0.416015625, "learning_rate": 1.936504353767473e-05, "loss": 0.0531, "step": 469 }, { "epoch": 0.15915007089797042, "grad_norm": 0.5546875, "learning_rate": 1.9361111731116993e-05, "loss": 0.0792, "step": 470 }, { "epoch": 0.15948868807009375, "grad_norm": 0.67578125, "learning_rate": 1.9357168190404937e-05, "loss": 0.0809, "step": 471 }, { "epoch": 0.15982730524221708, "grad_norm": 0.51953125, "learning_rate": 1.9353212920481792e-05, "loss": 0.0707, "step": 472 }, { "epoch": 0.16016592241434044, "grad_norm": 0.58984375, "learning_rate": 1.934924592630548e-05, "loss": 0.0847, "step": 473 }, { "epoch": 0.16050453958646377, "grad_norm": 0.515625, "learning_rate": 1.9345267212848638e-05, "loss": 0.0683, "step": 474 }, { "epoch": 0.16084315675858712, "grad_norm": 0.59765625, "learning_rate": 1.9341276785098584e-05, "loss": 0.081, "step": 475 }, { "epoch": 0.16118177393071045, "grad_norm": 0.6640625, "learning_rate": 1.9337274648057313e-05, "loss": 0.0843, "step": 476 }, { "epoch": 0.1615203911028338, "grad_norm": 0.50390625, "learning_rate": 1.93332608067415e-05, "loss": 0.0621, "step": 477 }, { "epoch": 0.16185900827495714, "grad_norm": 0.5078125, "learning_rate": 1.932923526618251e-05, "loss": 0.0625, "step": 478 }, { "epoch": 0.1621976254470805, "grad_norm": 0.63671875, "learning_rate": 1.932519803142635e-05, "loss": 0.0812, "step": 479 }, { "epoch": 0.16253624261920382, "grad_norm": 0.470703125, "learning_rate": 1.9321149107533693e-05, "loss": 0.0565, "step": 480 }, { "epoch": 0.16287485979132718, "grad_norm": 0.6875, "learning_rate": 1.931708849957987e-05, "loss": 0.0996, "step": 481 }, { "epoch": 0.1632134769634505, "grad_norm": 0.455078125, "learning_rate": 1.9313016212654845e-05, "loss": 0.0621, "step": 482 }, { "epoch": 0.16355209413557384, "grad_norm": 0.6015625, "learning_rate": 1.9308932251863243e-05, "loss": 0.0792, "step": 483 }, { "epoch": 0.1638907113076972, "grad_norm": 0.57421875, "learning_rate": 1.9304836622324295e-05, "loss": 0.0705, "step": 484 }, { "epoch": 0.16422932847982052, "grad_norm": 0.65625, "learning_rate": 1.930072932917188e-05, "loss": 0.0944, "step": 485 }, { "epoch": 0.16456794565194388, "grad_norm": 0.609375, "learning_rate": 1.9296610377554496e-05, "loss": 0.0713, "step": 486 }, { "epoch": 0.1649065628240672, "grad_norm": 0.51953125, "learning_rate": 1.9292479772635236e-05, "loss": 0.0654, "step": 487 }, { "epoch": 0.16524517999619057, "grad_norm": 0.458984375, "learning_rate": 1.9288337519591827e-05, "loss": 0.0673, "step": 488 }, { "epoch": 0.1655837971683139, "grad_norm": 0.71484375, "learning_rate": 1.9284183623616573e-05, "loss": 0.0963, "step": 489 }, { "epoch": 0.16592241434043725, "grad_norm": 0.8125, "learning_rate": 1.9280018089916387e-05, "loss": 0.083, "step": 490 }, { "epoch": 0.16626103151256058, "grad_norm": 0.5703125, "learning_rate": 1.927584092371277e-05, "loss": 0.0769, "step": 491 }, { "epoch": 0.1665996486846839, "grad_norm": 0.62109375, "learning_rate": 1.9271652130241794e-05, "loss": 0.0801, "step": 492 }, { "epoch": 0.16693826585680727, "grad_norm": 0.5703125, "learning_rate": 1.9267451714754113e-05, "loss": 0.0599, "step": 493 }, { "epoch": 0.1672768830289306, "grad_norm": 0.54296875, "learning_rate": 1.9263239682514953e-05, "loss": 0.0793, "step": 494 }, { "epoch": 0.16761550020105395, "grad_norm": 0.578125, "learning_rate": 1.925901603880409e-05, "loss": 0.0641, "step": 495 }, { "epoch": 0.16795411737317728, "grad_norm": 0.71484375, "learning_rate": 1.9254780788915865e-05, "loss": 0.0641, "step": 496 }, { "epoch": 0.16829273454530064, "grad_norm": 0.490234375, "learning_rate": 1.9250533938159166e-05, "loss": 0.0575, "step": 497 }, { "epoch": 0.16863135171742397, "grad_norm": 0.5546875, "learning_rate": 1.9246275491857417e-05, "loss": 0.0695, "step": 498 }, { "epoch": 0.16896996888954732, "grad_norm": 0.58203125, "learning_rate": 1.9242005455348582e-05, "loss": 0.0702, "step": 499 }, { "epoch": 0.16930858606167065, "grad_norm": 0.57421875, "learning_rate": 1.9237723833985154e-05, "loss": 0.0819, "step": 500 }, { "epoch": 0.16964720323379398, "grad_norm": 0.58984375, "learning_rate": 1.9233430633134146e-05, "loss": 0.0699, "step": 501 }, { "epoch": 0.16998582040591734, "grad_norm": 0.51953125, "learning_rate": 1.922912585817708e-05, "loss": 0.0708, "step": 502 }, { "epoch": 0.17032443757804067, "grad_norm": 0.5546875, "learning_rate": 1.9224809514509998e-05, "loss": 0.0752, "step": 503 }, { "epoch": 0.17066305475016402, "grad_norm": 0.58203125, "learning_rate": 1.9220481607543436e-05, "loss": 0.0789, "step": 504 }, { "epoch": 0.17100167192228735, "grad_norm": 0.55859375, "learning_rate": 1.9216142142702424e-05, "loss": 0.0735, "step": 505 }, { "epoch": 0.1713402890944107, "grad_norm": 0.6015625, "learning_rate": 1.921179112542648e-05, "loss": 0.0773, "step": 506 }, { "epoch": 0.17167890626653404, "grad_norm": 0.5234375, "learning_rate": 1.920742856116961e-05, "loss": 0.0579, "step": 507 }, { "epoch": 0.1720175234386574, "grad_norm": 0.50390625, "learning_rate": 1.920305445540028e-05, "loss": 0.0578, "step": 508 }, { "epoch": 0.17235614061078072, "grad_norm": 0.53515625, "learning_rate": 1.9198668813601443e-05, "loss": 0.0664, "step": 509 }, { "epoch": 0.17269475778290408, "grad_norm": 0.57421875, "learning_rate": 1.919427164127049e-05, "loss": 0.0685, "step": 510 }, { "epoch": 0.1730333749550274, "grad_norm": 0.66015625, "learning_rate": 1.918986294391929e-05, "loss": 0.0815, "step": 511 }, { "epoch": 0.17337199212715074, "grad_norm": 0.76171875, "learning_rate": 1.918544272707413e-05, "loss": 0.0878, "step": 512 }, { "epoch": 0.1737106092992741, "grad_norm": 0.578125, "learning_rate": 1.9181010996275767e-05, "loss": 0.0727, "step": 513 }, { "epoch": 0.17404922647139742, "grad_norm": 0.47265625, "learning_rate": 1.9176567757079368e-05, "loss": 0.0583, "step": 514 }, { "epoch": 0.17438784364352078, "grad_norm": 0.54296875, "learning_rate": 1.917211301505453e-05, "loss": 0.073, "step": 515 }, { "epoch": 0.1747264608156441, "grad_norm": 0.65234375, "learning_rate": 1.916764677578528e-05, "loss": 0.0841, "step": 516 }, { "epoch": 0.17506507798776746, "grad_norm": 0.412109375, "learning_rate": 1.916316904487005e-05, "loss": 0.0486, "step": 517 }, { "epoch": 0.1754036951598908, "grad_norm": 0.64453125, "learning_rate": 1.9158679827921667e-05, "loss": 0.088, "step": 518 }, { "epoch": 0.17574231233201415, "grad_norm": 0.484375, "learning_rate": 1.9154179130567374e-05, "loss": 0.0673, "step": 519 }, { "epoch": 0.17608092950413748, "grad_norm": 0.54296875, "learning_rate": 1.9149666958448792e-05, "loss": 0.0723, "step": 520 }, { "epoch": 0.1764195466762608, "grad_norm": 0.86328125, "learning_rate": 1.9145143317221925e-05, "loss": 0.0824, "step": 521 }, { "epoch": 0.17675816384838416, "grad_norm": 0.53125, "learning_rate": 1.9140608212557165e-05, "loss": 0.0802, "step": 522 }, { "epoch": 0.1770967810205075, "grad_norm": 1.9375, "learning_rate": 1.9136061650139262e-05, "loss": 0.0781, "step": 523 }, { "epoch": 0.17743539819263085, "grad_norm": 0.57421875, "learning_rate": 1.9131503635667337e-05, "loss": 0.0737, "step": 524 }, { "epoch": 0.17777401536475418, "grad_norm": 0.5390625, "learning_rate": 1.9126934174854856e-05, "loss": 0.0691, "step": 525 }, { "epoch": 0.17811263253687754, "grad_norm": 0.66796875, "learning_rate": 1.9122353273429635e-05, "loss": 0.0804, "step": 526 }, { "epoch": 0.17845124970900086, "grad_norm": 0.6484375, "learning_rate": 1.9117760937133843e-05, "loss": 0.0839, "step": 527 }, { "epoch": 0.17878986688112422, "grad_norm": 0.515625, "learning_rate": 1.911315717172397e-05, "loss": 0.0671, "step": 528 }, { "epoch": 0.17912848405324755, "grad_norm": 0.458984375, "learning_rate": 1.910854198297084e-05, "loss": 0.061, "step": 529 }, { "epoch": 0.17946710122537088, "grad_norm": 0.515625, "learning_rate": 1.9103915376659583e-05, "loss": 0.0598, "step": 530 }, { "epoch": 0.17980571839749424, "grad_norm": 0.48046875, "learning_rate": 1.909927735858966e-05, "loss": 0.0592, "step": 531 }, { "epoch": 0.18014433556961756, "grad_norm": 0.5859375, "learning_rate": 1.9094627934574825e-05, "loss": 0.0601, "step": 532 }, { "epoch": 0.18048295274174092, "grad_norm": 0.828125, "learning_rate": 1.9089967110443127e-05, "loss": 0.0756, "step": 533 }, { "epoch": 0.18082156991386425, "grad_norm": 0.53515625, "learning_rate": 1.9085294892036914e-05, "loss": 0.0741, "step": 534 }, { "epoch": 0.1811601870859876, "grad_norm": 0.5625, "learning_rate": 1.908061128521281e-05, "loss": 0.0654, "step": 535 }, { "epoch": 0.18149880425811094, "grad_norm": 0.5390625, "learning_rate": 1.907591629584172e-05, "loss": 0.0712, "step": 536 }, { "epoch": 0.1818374214302343, "grad_norm": 0.51953125, "learning_rate": 1.9071209929808808e-05, "loss": 0.0643, "step": 537 }, { "epoch": 0.18217603860235762, "grad_norm": 0.6171875, "learning_rate": 1.9066492193013505e-05, "loss": 0.0861, "step": 538 }, { "epoch": 0.18251465577448098, "grad_norm": 0.52734375, "learning_rate": 1.9061763091369498e-05, "loss": 0.0656, "step": 539 }, { "epoch": 0.1828532729466043, "grad_norm": 0.48828125, "learning_rate": 1.9057022630804715e-05, "loss": 0.0592, "step": 540 }, { "epoch": 0.18319189011872763, "grad_norm": 0.6640625, "learning_rate": 1.9052270817261323e-05, "loss": 0.0877, "step": 541 }, { "epoch": 0.183530507290851, "grad_norm": 0.51171875, "learning_rate": 1.9047507656695722e-05, "loss": 0.0686, "step": 542 }, { "epoch": 0.18386912446297432, "grad_norm": 0.53515625, "learning_rate": 1.9042733155078536e-05, "loss": 0.0651, "step": 543 }, { "epoch": 0.18420774163509768, "grad_norm": 0.46484375, "learning_rate": 1.9037947318394594e-05, "loss": 0.0624, "step": 544 }, { "epoch": 0.184546358807221, "grad_norm": 0.609375, "learning_rate": 1.9033150152642953e-05, "loss": 0.073, "step": 545 }, { "epoch": 0.18488497597934436, "grad_norm": 0.419921875, "learning_rate": 1.9028341663836855e-05, "loss": 0.0587, "step": 546 }, { "epoch": 0.1852235931514677, "grad_norm": 0.53125, "learning_rate": 1.9023521858003744e-05, "loss": 0.0741, "step": 547 }, { "epoch": 0.18556221032359105, "grad_norm": 0.6328125, "learning_rate": 1.9018690741185244e-05, "loss": 0.0801, "step": 548 }, { "epoch": 0.18590082749571438, "grad_norm": 0.55859375, "learning_rate": 1.9013848319437163e-05, "loss": 0.0627, "step": 549 }, { "epoch": 0.1862394446678377, "grad_norm": 0.92578125, "learning_rate": 1.900899459882948e-05, "loss": 0.0828, "step": 550 }, { "epoch": 0.18657806183996106, "grad_norm": 0.5390625, "learning_rate": 1.9004129585446326e-05, "loss": 0.0765, "step": 551 }, { "epoch": 0.1869166790120844, "grad_norm": 0.50390625, "learning_rate": 1.8999253285386e-05, "loss": 0.0803, "step": 552 }, { "epoch": 0.18725529618420775, "grad_norm": 0.60546875, "learning_rate": 1.8994365704760946e-05, "loss": 0.0785, "step": 553 }, { "epoch": 0.18759391335633108, "grad_norm": 0.66015625, "learning_rate": 1.8989466849697745e-05, "loss": 0.0949, "step": 554 }, { "epoch": 0.18793253052845443, "grad_norm": 0.486328125, "learning_rate": 1.8984556726337113e-05, "loss": 0.062, "step": 555 }, { "epoch": 0.18827114770057776, "grad_norm": 0.56640625, "learning_rate": 1.8979635340833887e-05, "loss": 0.0739, "step": 556 }, { "epoch": 0.18860976487270112, "grad_norm": 0.455078125, "learning_rate": 1.897470269935703e-05, "loss": 0.0574, "step": 557 }, { "epoch": 0.18894838204482445, "grad_norm": 0.5625, "learning_rate": 1.8969758808089602e-05, "loss": 0.0689, "step": 558 }, { "epoch": 0.18928699921694778, "grad_norm": 0.59375, "learning_rate": 1.8964803673228776e-05, "loss": 0.0719, "step": 559 }, { "epoch": 0.18962561638907113, "grad_norm": 0.58203125, "learning_rate": 1.895983730098581e-05, "loss": 0.0746, "step": 560 }, { "epoch": 0.18996423356119446, "grad_norm": 0.5234375, "learning_rate": 1.8954859697586057e-05, "loss": 0.07, "step": 561 }, { "epoch": 0.19030285073331782, "grad_norm": 0.55078125, "learning_rate": 1.8949870869268942e-05, "loss": 0.076, "step": 562 }, { "epoch": 0.19064146790544115, "grad_norm": 0.462890625, "learning_rate": 1.8944870822287957e-05, "loss": 0.0698, "step": 563 }, { "epoch": 0.1909800850775645, "grad_norm": 0.466796875, "learning_rate": 1.893985956291067e-05, "loss": 0.0552, "step": 564 }, { "epoch": 0.19131870224968783, "grad_norm": 0.5390625, "learning_rate": 1.893483709741868e-05, "loss": 0.0708, "step": 565 }, { "epoch": 0.1916573194218112, "grad_norm": 0.62109375, "learning_rate": 1.8929803432107662e-05, "loss": 0.0855, "step": 566 }, { "epoch": 0.19199593659393452, "grad_norm": 0.5390625, "learning_rate": 1.8924758573287315e-05, "loss": 0.0745, "step": 567 }, { "epoch": 0.19233455376605785, "grad_norm": 0.51171875, "learning_rate": 1.891970252728136e-05, "loss": 0.07, "step": 568 }, { "epoch": 0.1926731709381812, "grad_norm": 0.56640625, "learning_rate": 1.8914635300427563e-05, "loss": 0.0778, "step": 569 }, { "epoch": 0.19301178811030453, "grad_norm": 0.46484375, "learning_rate": 1.8909556899077683e-05, "loss": 0.0545, "step": 570 }, { "epoch": 0.1933504052824279, "grad_norm": 0.435546875, "learning_rate": 1.8904467329597503e-05, "loss": 0.0503, "step": 571 }, { "epoch": 0.19368902245455122, "grad_norm": 0.7734375, "learning_rate": 1.8899366598366796e-05, "loss": 0.0593, "step": 572 }, { "epoch": 0.19402763962667458, "grad_norm": 0.6640625, "learning_rate": 1.8894254711779333e-05, "loss": 0.1005, "step": 573 }, { "epoch": 0.1943662567987979, "grad_norm": 0.51953125, "learning_rate": 1.8889131676242858e-05, "loss": 0.0604, "step": 574 }, { "epoch": 0.19470487397092126, "grad_norm": 1.5859375, "learning_rate": 1.8883997498179103e-05, "loss": 0.0908, "step": 575 }, { "epoch": 0.1950434911430446, "grad_norm": 0.58984375, "learning_rate": 1.8878852184023754e-05, "loss": 0.0736, "step": 576 }, { "epoch": 0.19538210831516795, "grad_norm": 0.52734375, "learning_rate": 1.8873695740226468e-05, "loss": 0.0734, "step": 577 }, { "epoch": 0.19572072548729127, "grad_norm": 0.53125, "learning_rate": 1.8868528173250846e-05, "loss": 0.0574, "step": 578 }, { "epoch": 0.1960593426594146, "grad_norm": 0.63671875, "learning_rate": 1.886334948957443e-05, "loss": 0.0719, "step": 579 }, { "epoch": 0.19639795983153796, "grad_norm": 0.462890625, "learning_rate": 1.8858159695688708e-05, "loss": 0.0642, "step": 580 }, { "epoch": 0.1967365770036613, "grad_norm": 0.62109375, "learning_rate": 1.885295879809908e-05, "loss": 0.0721, "step": 581 }, { "epoch": 0.19707519417578465, "grad_norm": 0.5625, "learning_rate": 1.884774680332487e-05, "loss": 0.0739, "step": 582 }, { "epoch": 0.19741381134790797, "grad_norm": 0.5859375, "learning_rate": 1.8842523717899326e-05, "loss": 0.0791, "step": 583 }, { "epoch": 0.19775242852003133, "grad_norm": 0.5, "learning_rate": 1.8837289548369574e-05, "loss": 0.0719, "step": 584 }, { "epoch": 0.19809104569215466, "grad_norm": 0.515625, "learning_rate": 1.8832044301296652e-05, "loss": 0.0706, "step": 585 }, { "epoch": 0.19842966286427802, "grad_norm": 0.60546875, "learning_rate": 1.8826787983255474e-05, "loss": 0.0736, "step": 586 }, { "epoch": 0.19876828003640135, "grad_norm": 0.63671875, "learning_rate": 1.882152060083484e-05, "loss": 0.0699, "step": 587 }, { "epoch": 0.19910689720852467, "grad_norm": 0.388671875, "learning_rate": 1.881624216063741e-05, "loss": 0.0479, "step": 588 }, { "epoch": 0.19944551438064803, "grad_norm": 0.49609375, "learning_rate": 1.8810952669279707e-05, "loss": 0.0669, "step": 589 }, { "epoch": 0.19978413155277136, "grad_norm": 0.71484375, "learning_rate": 1.8805652133392115e-05, "loss": 0.0875, "step": 590 }, { "epoch": 0.20012274872489472, "grad_norm": 0.494140625, "learning_rate": 1.8800340559618855e-05, "loss": 0.0666, "step": 591 }, { "epoch": 0.20046136589701805, "grad_norm": 0.62109375, "learning_rate": 1.8795017954617982e-05, "loss": 0.0774, "step": 592 }, { "epoch": 0.20046136589701805, "eval_loss": 0.07367200404405594, "eval_runtime": 815.492, "eval_samples_per_second": 12.199, "eval_steps_per_second": 3.05, "step": 592 }, { "epoch": 0.2007999830691414, "grad_norm": 0.58984375, "learning_rate": 1.8789684325061382e-05, "loss": 0.0784, "step": 593 }, { "epoch": 0.20113860024126473, "grad_norm": 0.6796875, "learning_rate": 1.8784339677634763e-05, "loss": 0.0774, "step": 594 }, { "epoch": 0.2014772174133881, "grad_norm": 0.56640625, "learning_rate": 1.8778984019037642e-05, "loss": 0.0737, "step": 595 }, { "epoch": 0.20181583458551142, "grad_norm": 0.62109375, "learning_rate": 1.8773617355983332e-05, "loss": 0.0823, "step": 596 }, { "epoch": 0.20215445175763475, "grad_norm": 0.5078125, "learning_rate": 1.8768239695198945e-05, "loss": 0.0601, "step": 597 }, { "epoch": 0.2024930689297581, "grad_norm": 0.48828125, "learning_rate": 1.876285104342539e-05, "loss": 0.0671, "step": 598 }, { "epoch": 0.20283168610188143, "grad_norm": 0.5703125, "learning_rate": 1.8757451407417332e-05, "loss": 0.0679, "step": 599 }, { "epoch": 0.2031703032740048, "grad_norm": 0.6796875, "learning_rate": 1.8752040793943215e-05, "loss": 0.0969, "step": 600 }, { "epoch": 0.20350892044612812, "grad_norm": 0.5546875, "learning_rate": 1.8746619209785253e-05, "loss": 0.0729, "step": 601 }, { "epoch": 0.20384753761825147, "grad_norm": 0.73828125, "learning_rate": 1.874118666173939e-05, "loss": 0.1034, "step": 602 }, { "epoch": 0.2041861547903748, "grad_norm": 0.52734375, "learning_rate": 1.8735743156615337e-05, "loss": 0.0666, "step": 603 }, { "epoch": 0.20452477196249816, "grad_norm": 0.474609375, "learning_rate": 1.873028870123652e-05, "loss": 0.0677, "step": 604 }, { "epoch": 0.2048633891346215, "grad_norm": 0.66015625, "learning_rate": 1.87248233024401e-05, "loss": 0.0931, "step": 605 }, { "epoch": 0.20520200630674484, "grad_norm": 0.54296875, "learning_rate": 1.871934696707696e-05, "loss": 0.0632, "step": 606 }, { "epoch": 0.20554062347886817, "grad_norm": 0.56640625, "learning_rate": 1.871385970201168e-05, "loss": 0.0587, "step": 607 }, { "epoch": 0.2058792406509915, "grad_norm": 0.60546875, "learning_rate": 1.870836151412255e-05, "loss": 0.0776, "step": 608 }, { "epoch": 0.20621785782311486, "grad_norm": 0.52734375, "learning_rate": 1.8702852410301556e-05, "loss": 0.0557, "step": 609 }, { "epoch": 0.2065564749952382, "grad_norm": 0.6328125, "learning_rate": 1.869733239745435e-05, "loss": 0.0801, "step": 610 }, { "epoch": 0.20689509216736154, "grad_norm": 0.498046875, "learning_rate": 1.869180148250027e-05, "loss": 0.0632, "step": 611 }, { "epoch": 0.20723370933948487, "grad_norm": 0.474609375, "learning_rate": 1.8686259672372323e-05, "loss": 0.0592, "step": 612 }, { "epoch": 0.20757232651160823, "grad_norm": 0.50390625, "learning_rate": 1.8680706974017164e-05, "loss": 0.0714, "step": 613 }, { "epoch": 0.20791094368373156, "grad_norm": 0.5625, "learning_rate": 1.8675143394395106e-05, "loss": 0.066, "step": 614 }, { "epoch": 0.20824956085585491, "grad_norm": 1.15625, "learning_rate": 1.8669568940480093e-05, "loss": 0.0525, "step": 615 }, { "epoch": 0.20858817802797824, "grad_norm": 0.478515625, "learning_rate": 1.86639836192597e-05, "loss": 0.0637, "step": 616 }, { "epoch": 0.20892679520010157, "grad_norm": 0.494140625, "learning_rate": 1.8658387437735137e-05, "loss": 0.0581, "step": 617 }, { "epoch": 0.20926541237222493, "grad_norm": 0.4296875, "learning_rate": 1.865278040292121e-05, "loss": 0.0503, "step": 618 }, { "epoch": 0.20960402954434826, "grad_norm": 0.455078125, "learning_rate": 1.864716252184634e-05, "loss": 0.0602, "step": 619 }, { "epoch": 0.20994264671647161, "grad_norm": 0.62109375, "learning_rate": 1.864153380155254e-05, "loss": 0.0762, "step": 620 }, { "epoch": 0.21028126388859494, "grad_norm": 0.458984375, "learning_rate": 1.863589424909541e-05, "loss": 0.0517, "step": 621 }, { "epoch": 0.2106198810607183, "grad_norm": 0.4921875, "learning_rate": 1.863024387154414e-05, "loss": 0.0551, "step": 622 }, { "epoch": 0.21095849823284163, "grad_norm": 0.71875, "learning_rate": 1.8624582675981466e-05, "loss": 0.0596, "step": 623 }, { "epoch": 0.21129711540496499, "grad_norm": 0.48828125, "learning_rate": 1.8618910669503704e-05, "loss": 0.0525, "step": 624 }, { "epoch": 0.21163573257708831, "grad_norm": 0.515625, "learning_rate": 1.861322785922071e-05, "loss": 0.0619, "step": 625 }, { "epoch": 0.21197434974921164, "grad_norm": 0.578125, "learning_rate": 1.8607534252255896e-05, "loss": 0.0728, "step": 626 }, { "epoch": 0.212312966921335, "grad_norm": 0.435546875, "learning_rate": 1.8601829855746187e-05, "loss": 0.0583, "step": 627 }, { "epoch": 0.21265158409345833, "grad_norm": 0.478515625, "learning_rate": 1.8596114676842054e-05, "loss": 0.0659, "step": 628 }, { "epoch": 0.21299020126558169, "grad_norm": 0.56640625, "learning_rate": 1.8590388722707465e-05, "loss": 0.08, "step": 629 }, { "epoch": 0.21332881843770501, "grad_norm": 0.60546875, "learning_rate": 1.8584652000519913e-05, "loss": 0.0701, "step": 630 }, { "epoch": 0.21366743560982837, "grad_norm": 0.65625, "learning_rate": 1.8578904517470375e-05, "loss": 0.0718, "step": 631 }, { "epoch": 0.2140060527819517, "grad_norm": 0.69921875, "learning_rate": 1.8573146280763327e-05, "loss": 0.1271, "step": 632 }, { "epoch": 0.21434466995407506, "grad_norm": 0.59375, "learning_rate": 1.856737729761671e-05, "loss": 0.0677, "step": 633 }, { "epoch": 0.21468328712619839, "grad_norm": 0.6953125, "learning_rate": 1.856159757526195e-05, "loss": 0.0763, "step": 634 }, { "epoch": 0.21502190429832174, "grad_norm": 0.478515625, "learning_rate": 1.8555807120943927e-05, "loss": 0.0588, "step": 635 }, { "epoch": 0.21536052147044507, "grad_norm": 0.78125, "learning_rate": 1.8550005941920984e-05, "loss": 0.1482, "step": 636 }, { "epoch": 0.2156991386425684, "grad_norm": 0.6640625, "learning_rate": 1.8544194045464888e-05, "loss": 0.0953, "step": 637 }, { "epoch": 0.21603775581469176, "grad_norm": 0.60546875, "learning_rate": 1.8538371438860858e-05, "loss": 0.0685, "step": 638 }, { "epoch": 0.21637637298681509, "grad_norm": 0.51171875, "learning_rate": 1.8532538129407532e-05, "loss": 0.0665, "step": 639 }, { "epoch": 0.21671499015893844, "grad_norm": 0.45703125, "learning_rate": 1.8526694124416963e-05, "loss": 0.049, "step": 640 }, { "epoch": 0.21705360733106177, "grad_norm": 0.78125, "learning_rate": 1.852083943121461e-05, "loss": 0.086, "step": 641 }, { "epoch": 0.21739222450318513, "grad_norm": 0.5390625, "learning_rate": 1.8514974057139335e-05, "loss": 0.0779, "step": 642 }, { "epoch": 0.21773084167530846, "grad_norm": 0.50390625, "learning_rate": 1.8509098009543378e-05, "loss": 0.0581, "step": 643 }, { "epoch": 0.2180694588474318, "grad_norm": 0.5625, "learning_rate": 1.8503211295792375e-05, "loss": 0.0687, "step": 644 }, { "epoch": 0.21840807601955514, "grad_norm": 0.8125, "learning_rate": 1.8497313923265315e-05, "loss": 0.1094, "step": 645 }, { "epoch": 0.21874669319167847, "grad_norm": 0.63671875, "learning_rate": 1.8491405899354556e-05, "loss": 0.0887, "step": 646 }, { "epoch": 0.21908531036380183, "grad_norm": 0.54296875, "learning_rate": 1.848548723146581e-05, "loss": 0.0721, "step": 647 }, { "epoch": 0.21942392753592516, "grad_norm": 0.51171875, "learning_rate": 1.8479557927018127e-05, "loss": 0.0684, "step": 648 }, { "epoch": 0.2197625447080485, "grad_norm": 0.5078125, "learning_rate": 1.8473617993443885e-05, "loss": 0.0704, "step": 649 }, { "epoch": 0.22010116188017184, "grad_norm": 0.52734375, "learning_rate": 1.8467667438188794e-05, "loss": 0.0695, "step": 650 }, { "epoch": 0.2204397790522952, "grad_norm": 0.54296875, "learning_rate": 1.8461706268711878e-05, "loss": 0.0717, "step": 651 }, { "epoch": 0.22077839622441853, "grad_norm": 0.4375, "learning_rate": 1.8455734492485464e-05, "loss": 0.0598, "step": 652 }, { "epoch": 0.22111701339654188, "grad_norm": 0.486328125, "learning_rate": 1.844975211699517e-05, "loss": 0.0602, "step": 653 }, { "epoch": 0.2214556305686652, "grad_norm": 0.55078125, "learning_rate": 1.8443759149739906e-05, "loss": 0.0675, "step": 654 }, { "epoch": 0.22179424774078854, "grad_norm": 0.48828125, "learning_rate": 1.8437755598231857e-05, "loss": 0.0677, "step": 655 }, { "epoch": 0.2221328649129119, "grad_norm": 0.57421875, "learning_rate": 1.8431741469996475e-05, "loss": 0.0745, "step": 656 }, { "epoch": 0.22247148208503523, "grad_norm": 0.515625, "learning_rate": 1.8425716772572472e-05, "loss": 0.0688, "step": 657 }, { "epoch": 0.22281009925715858, "grad_norm": 0.85546875, "learning_rate": 1.8419681513511807e-05, "loss": 0.0683, "step": 658 }, { "epoch": 0.2231487164292819, "grad_norm": 1.0, "learning_rate": 1.8413635700379674e-05, "loss": 0.0793, "step": 659 }, { "epoch": 0.22348733360140527, "grad_norm": 0.515625, "learning_rate": 1.84075793407545e-05, "loss": 0.0661, "step": 660 }, { "epoch": 0.2238259507735286, "grad_norm": 0.66015625, "learning_rate": 1.840151244222794e-05, "loss": 0.087, "step": 661 }, { "epoch": 0.22416456794565195, "grad_norm": 0.43359375, "learning_rate": 1.8395435012404837e-05, "loss": 0.0571, "step": 662 }, { "epoch": 0.22450318511777528, "grad_norm": 0.49609375, "learning_rate": 1.838934705890327e-05, "loss": 0.0709, "step": 663 }, { "epoch": 0.22484180228989864, "grad_norm": 0.48828125, "learning_rate": 1.838324858935447e-05, "loss": 0.0707, "step": 664 }, { "epoch": 0.22518041946202197, "grad_norm": 0.51171875, "learning_rate": 1.8377139611402883e-05, "loss": 0.0706, "step": 665 }, { "epoch": 0.2255190366341453, "grad_norm": 0.43359375, "learning_rate": 1.8371020132706104e-05, "loss": 0.0537, "step": 666 }, { "epoch": 0.22585765380626865, "grad_norm": 0.7265625, "learning_rate": 1.8364890160934905e-05, "loss": 0.0909, "step": 667 }, { "epoch": 0.22619627097839198, "grad_norm": 0.45703125, "learning_rate": 1.8358749703773206e-05, "loss": 0.0635, "step": 668 }, { "epoch": 0.22653488815051534, "grad_norm": 0.671875, "learning_rate": 1.835259876891807e-05, "loss": 0.0939, "step": 669 }, { "epoch": 0.22687350532263867, "grad_norm": 0.56640625, "learning_rate": 1.8346437364079693e-05, "loss": 0.0852, "step": 670 }, { "epoch": 0.22721212249476203, "grad_norm": 0.70703125, "learning_rate": 1.8340265496981395e-05, "loss": 0.0639, "step": 671 }, { "epoch": 0.22755073966688535, "grad_norm": 0.48828125, "learning_rate": 1.8334083175359616e-05, "loss": 0.0598, "step": 672 }, { "epoch": 0.2278893568390087, "grad_norm": 0.6484375, "learning_rate": 1.8327890406963895e-05, "loss": 0.0872, "step": 673 }, { "epoch": 0.22822797401113204, "grad_norm": 0.55859375, "learning_rate": 1.8321687199556872e-05, "loss": 0.0835, "step": 674 }, { "epoch": 0.22856659118325537, "grad_norm": 0.478515625, "learning_rate": 1.8315473560914258e-05, "loss": 0.0586, "step": 675 }, { "epoch": 0.22890520835537873, "grad_norm": 0.40234375, "learning_rate": 1.8309249498824853e-05, "loss": 0.0586, "step": 676 }, { "epoch": 0.22924382552750205, "grad_norm": 0.474609375, "learning_rate": 1.8303015021090526e-05, "loss": 0.0627, "step": 677 }, { "epoch": 0.2295824426996254, "grad_norm": 0.640625, "learning_rate": 1.829677013552619e-05, "loss": 0.0771, "step": 678 }, { "epoch": 0.22992105987174874, "grad_norm": 0.55859375, "learning_rate": 1.829051484995981e-05, "loss": 0.0759, "step": 679 }, { "epoch": 0.2302596770438721, "grad_norm": 0.546875, "learning_rate": 1.828424917223239e-05, "loss": 0.0696, "step": 680 }, { "epoch": 0.23059829421599543, "grad_norm": 0.5859375, "learning_rate": 1.827797311019795e-05, "loss": 0.078, "step": 681 }, { "epoch": 0.23093691138811878, "grad_norm": 0.5234375, "learning_rate": 1.8271686671723543e-05, "loss": 0.0612, "step": 682 }, { "epoch": 0.2312755285602421, "grad_norm": 0.62109375, "learning_rate": 1.8265389864689213e-05, "loss": 0.0886, "step": 683 }, { "epoch": 0.23161414573236544, "grad_norm": 0.58203125, "learning_rate": 1.8259082696988013e-05, "loss": 0.0824, "step": 684 }, { "epoch": 0.2319527629044888, "grad_norm": 0.5859375, "learning_rate": 1.8252765176525976e-05, "loss": 0.0776, "step": 685 }, { "epoch": 0.23229138007661213, "grad_norm": 0.64453125, "learning_rate": 1.8246437311222117e-05, "loss": 0.0831, "step": 686 }, { "epoch": 0.23262999724873548, "grad_norm": 2.171875, "learning_rate": 1.8240099109008413e-05, "loss": 0.0765, "step": 687 }, { "epoch": 0.2329686144208588, "grad_norm": 0.5390625, "learning_rate": 1.82337505778298e-05, "loss": 0.0721, "step": 688 }, { "epoch": 0.23330723159298217, "grad_norm": 0.578125, "learning_rate": 1.8227391725644167e-05, "loss": 0.0893, "step": 689 }, { "epoch": 0.2336458487651055, "grad_norm": 0.451171875, "learning_rate": 1.822102256042233e-05, "loss": 0.0597, "step": 690 }, { "epoch": 0.23398446593722885, "grad_norm": 0.73046875, "learning_rate": 1.8214643090148044e-05, "loss": 0.0805, "step": 691 }, { "epoch": 0.23432308310935218, "grad_norm": 0.55859375, "learning_rate": 1.820825332281797e-05, "loss": 0.0646, "step": 692 }, { "epoch": 0.23466170028147554, "grad_norm": 0.5703125, "learning_rate": 1.820185326644169e-05, "loss": 0.0747, "step": 693 }, { "epoch": 0.23500031745359887, "grad_norm": 0.5234375, "learning_rate": 1.819544292904166e-05, "loss": 0.0687, "step": 694 }, { "epoch": 0.2353389346257222, "grad_norm": 0.486328125, "learning_rate": 1.8189022318653254e-05, "loss": 0.0573, "step": 695 }, { "epoch": 0.23567755179784555, "grad_norm": 0.427734375, "learning_rate": 1.81825914433247e-05, "loss": 0.0576, "step": 696 }, { "epoch": 0.23601616896996888, "grad_norm": 0.671875, "learning_rate": 1.8176150311117103e-05, "loss": 0.0783, "step": 697 }, { "epoch": 0.23635478614209224, "grad_norm": 0.55078125, "learning_rate": 1.816969893010442e-05, "loss": 0.063, "step": 698 }, { "epoch": 0.23669340331421557, "grad_norm": 0.59375, "learning_rate": 1.8163237308373465e-05, "loss": 0.084, "step": 699 }, { "epoch": 0.23703202048633892, "grad_norm": 0.44921875, "learning_rate": 1.8156765454023873e-05, "loss": 0.0549, "step": 700 }, { "epoch": 0.23737063765846225, "grad_norm": 0.61328125, "learning_rate": 1.8150283375168112e-05, "loss": 0.0821, "step": 701 }, { "epoch": 0.2377092548305856, "grad_norm": 0.4921875, "learning_rate": 1.814379107993148e-05, "loss": 0.0675, "step": 702 }, { "epoch": 0.23804787200270894, "grad_norm": 0.578125, "learning_rate": 1.8137288576452064e-05, "loss": 0.0852, "step": 703 }, { "epoch": 0.23838648917483227, "grad_norm": 0.478515625, "learning_rate": 1.8130775872880748e-05, "loss": 0.0743, "step": 704 }, { "epoch": 0.23872510634695562, "grad_norm": 0.56640625, "learning_rate": 1.812425297738121e-05, "loss": 0.0767, "step": 705 }, { "epoch": 0.23906372351907895, "grad_norm": 0.671875, "learning_rate": 1.81177198981299e-05, "loss": 0.0984, "step": 706 }, { "epoch": 0.2394023406912023, "grad_norm": 0.435546875, "learning_rate": 1.811117664331604e-05, "loss": 0.0517, "step": 707 }, { "epoch": 0.23974095786332564, "grad_norm": 0.451171875, "learning_rate": 1.810462322114159e-05, "loss": 0.0606, "step": 708 }, { "epoch": 0.240079575035449, "grad_norm": 0.462890625, "learning_rate": 1.8098059639821265e-05, "loss": 0.0588, "step": 709 }, { "epoch": 0.24041819220757232, "grad_norm": 0.546875, "learning_rate": 1.809148590758252e-05, "loss": 0.0773, "step": 710 }, { "epoch": 0.24075680937969568, "grad_norm": 0.439453125, "learning_rate": 1.8084902032665533e-05, "loss": 0.0607, "step": 711 }, { "epoch": 0.241095426551819, "grad_norm": 0.5625, "learning_rate": 1.8078308023323186e-05, "loss": 0.0862, "step": 712 }, { "epoch": 0.24143404372394234, "grad_norm": 0.62109375, "learning_rate": 1.8071703887821067e-05, "loss": 0.0735, "step": 713 }, { "epoch": 0.2417726608960657, "grad_norm": 0.50390625, "learning_rate": 1.8065089634437467e-05, "loss": 0.0684, "step": 714 }, { "epoch": 0.24211127806818902, "grad_norm": 0.53515625, "learning_rate": 1.805846527146335e-05, "loss": 0.0843, "step": 715 }, { "epoch": 0.24244989524031238, "grad_norm": 0.53125, "learning_rate": 1.8051830807202355e-05, "loss": 0.0703, "step": 716 }, { "epoch": 0.2427885124124357, "grad_norm": 0.52734375, "learning_rate": 1.8045186249970786e-05, "loss": 0.0828, "step": 717 }, { "epoch": 0.24312712958455907, "grad_norm": 0.80859375, "learning_rate": 1.8038531608097592e-05, "loss": 0.1078, "step": 718 }, { "epoch": 0.2434657467566824, "grad_norm": 0.55078125, "learning_rate": 1.803186688992437e-05, "loss": 0.0585, "step": 719 }, { "epoch": 0.24380436392880575, "grad_norm": 0.5546875, "learning_rate": 1.8025192103805348e-05, "loss": 0.0646, "step": 720 }, { "epoch": 0.24414298110092908, "grad_norm": 0.7109375, "learning_rate": 1.8018507258107364e-05, "loss": 0.0928, "step": 721 }, { "epoch": 0.2444815982730524, "grad_norm": 0.546875, "learning_rate": 1.801181236120988e-05, "loss": 0.0747, "step": 722 }, { "epoch": 0.24482021544517577, "grad_norm": 0.47265625, "learning_rate": 1.800510742150494e-05, "loss": 0.057, "step": 723 }, { "epoch": 0.2451588326172991, "grad_norm": 0.5078125, "learning_rate": 1.7998392447397197e-05, "loss": 0.0711, "step": 724 }, { "epoch": 0.24549744978942245, "grad_norm": 0.56640625, "learning_rate": 1.7991667447303865e-05, "loss": 0.0806, "step": 725 }, { "epoch": 0.24583606696154578, "grad_norm": 0.498046875, "learning_rate": 1.7984932429654734e-05, "loss": 0.0787, "step": 726 }, { "epoch": 0.24617468413366914, "grad_norm": 0.53125, "learning_rate": 1.7978187402892148e-05, "loss": 0.0801, "step": 727 }, { "epoch": 0.24651330130579246, "grad_norm": 0.5, "learning_rate": 1.7971432375471e-05, "loss": 0.0636, "step": 728 }, { "epoch": 0.24685191847791582, "grad_norm": 0.51171875, "learning_rate": 1.7964667355858718e-05, "loss": 0.0683, "step": 729 }, { "epoch": 0.24719053565003915, "grad_norm": 0.55078125, "learning_rate": 1.7957892352535253e-05, "loss": 0.0845, "step": 730 }, { "epoch": 0.2475291528221625, "grad_norm": 0.89453125, "learning_rate": 1.7951107373993074e-05, "loss": 0.1793, "step": 731 }, { "epoch": 0.24786776999428584, "grad_norm": 0.515625, "learning_rate": 1.7944312428737154e-05, "loss": 0.0657, "step": 732 }, { "epoch": 0.24820638716640916, "grad_norm": 0.53515625, "learning_rate": 1.793750752528495e-05, "loss": 0.0679, "step": 733 }, { "epoch": 0.24854500433853252, "grad_norm": 0.56640625, "learning_rate": 1.7930692672166416e-05, "loss": 0.0831, "step": 734 }, { "epoch": 0.24888362151065585, "grad_norm": 0.59375, "learning_rate": 1.7923867877923967e-05, "loss": 0.0764, "step": 735 }, { "epoch": 0.2492222386827792, "grad_norm": 0.515625, "learning_rate": 1.791703315111249e-05, "loss": 0.0583, "step": 736 }, { "epoch": 0.24956085585490254, "grad_norm": 0.49609375, "learning_rate": 1.7910188500299303e-05, "loss": 0.0661, "step": 737 }, { "epoch": 0.2498994730270259, "grad_norm": 2.953125, "learning_rate": 1.7903333934064185e-05, "loss": 0.0654, "step": 738 }, { "epoch": 0.2502380901991492, "grad_norm": 0.482421875, "learning_rate": 1.789646946099934e-05, "loss": 0.0655, "step": 739 }, { "epoch": 0.2505767073712726, "grad_norm": 0.55078125, "learning_rate": 1.7889595089709377e-05, "loss": 0.074, "step": 740 }, { "epoch": 0.25091532454339593, "grad_norm": 0.51171875, "learning_rate": 1.7882710828811322e-05, "loss": 0.0676, "step": 741 }, { "epoch": 0.25125394171551924, "grad_norm": 0.44921875, "learning_rate": 1.7875816686934596e-05, "loss": 0.053, "step": 742 }, { "epoch": 0.2515925588876426, "grad_norm": 0.490234375, "learning_rate": 1.7868912672721014e-05, "loss": 0.0705, "step": 743 }, { "epoch": 0.25193117605976595, "grad_norm": 0.431640625, "learning_rate": 1.7861998794824747e-05, "loss": 0.0544, "step": 744 }, { "epoch": 0.25226979323188925, "grad_norm": 0.458984375, "learning_rate": 1.785507506191235e-05, "loss": 0.063, "step": 745 }, { "epoch": 0.2526084104040126, "grad_norm": 0.921875, "learning_rate": 1.7848141482662726e-05, "loss": 0.065, "step": 746 }, { "epoch": 0.25294702757613596, "grad_norm": 0.515625, "learning_rate": 1.7841198065767107e-05, "loss": 0.0687, "step": 747 }, { "epoch": 0.2532856447482593, "grad_norm": 0.5, "learning_rate": 1.783424481992907e-05, "loss": 0.0679, "step": 748 }, { "epoch": 0.2536242619203826, "grad_norm": 0.4921875, "learning_rate": 1.782728175386451e-05, "loss": 0.0764, "step": 749 }, { "epoch": 0.253962879092506, "grad_norm": 0.494140625, "learning_rate": 1.7820308876301633e-05, "loss": 0.0632, "step": 750 }, { "epoch": 0.25430149626462933, "grad_norm": 0.52734375, "learning_rate": 1.781332619598094e-05, "loss": 0.0694, "step": 751 }, { "epoch": 0.25464011343675264, "grad_norm": 0.546875, "learning_rate": 1.780633372165522e-05, "loss": 0.0661, "step": 752 }, { "epoch": 0.254978730608876, "grad_norm": 0.41796875, "learning_rate": 1.7799331462089543e-05, "loss": 0.0546, "step": 753 }, { "epoch": 0.25531734778099935, "grad_norm": 0.44140625, "learning_rate": 1.7792319426061236e-05, "loss": 0.0567, "step": 754 }, { "epoch": 0.2556559649531227, "grad_norm": 0.58984375, "learning_rate": 1.7785297622359893e-05, "loss": 0.0569, "step": 755 }, { "epoch": 0.255994582125246, "grad_norm": 0.5859375, "learning_rate": 1.7778266059787345e-05, "loss": 0.0831, "step": 756 }, { "epoch": 0.25633319929736936, "grad_norm": 0.7734375, "learning_rate": 1.7771224747157655e-05, "loss": 0.0997, "step": 757 }, { "epoch": 0.2566718164694927, "grad_norm": 6.3125, "learning_rate": 1.7764173693297106e-05, "loss": 0.0747, "step": 758 }, { "epoch": 0.2570104336416161, "grad_norm": 0.5625, "learning_rate": 1.77571129070442e-05, "loss": 0.0697, "step": 759 }, { "epoch": 0.2573490508137394, "grad_norm": 0.57421875, "learning_rate": 1.775004239724963e-05, "loss": 0.0685, "step": 760 }, { "epoch": 0.25768766798586273, "grad_norm": 0.46875, "learning_rate": 1.774296217277628e-05, "loss": 0.0603, "step": 761 }, { "epoch": 0.2580262851579861, "grad_norm": 0.474609375, "learning_rate": 1.773587224249921e-05, "loss": 0.0621, "step": 762 }, { "epoch": 0.2583649023301094, "grad_norm": 0.5625, "learning_rate": 1.7728772615305657e-05, "loss": 0.061, "step": 763 }, { "epoch": 0.25870351950223275, "grad_norm": 0.56640625, "learning_rate": 1.7721663300094997e-05, "loss": 0.0644, "step": 764 }, { "epoch": 0.2590421366743561, "grad_norm": 0.447265625, "learning_rate": 1.7714544305778757e-05, "loss": 0.0581, "step": 765 }, { "epoch": 0.25938075384647946, "grad_norm": 0.5546875, "learning_rate": 1.7707415641280598e-05, "loss": 0.0812, "step": 766 }, { "epoch": 0.25971937101860276, "grad_norm": 0.5, "learning_rate": 1.7700277315536305e-05, "loss": 0.0754, "step": 767 }, { "epoch": 0.2600579881907261, "grad_norm": 0.494140625, "learning_rate": 1.7693129337493764e-05, "loss": 0.059, "step": 768 }, { "epoch": 0.2603966053628495, "grad_norm": 0.6953125, "learning_rate": 1.768597171611297e-05, "loss": 0.0896, "step": 769 }, { "epoch": 0.26073522253497283, "grad_norm": 0.65234375, "learning_rate": 1.7678804460366e-05, "loss": 0.0651, "step": 770 }, { "epoch": 0.26107383970709613, "grad_norm": 0.953125, "learning_rate": 1.7671627579237016e-05, "loss": 0.0634, "step": 771 }, { "epoch": 0.2614124568792195, "grad_norm": 0.5234375, "learning_rate": 1.766444108172223e-05, "loss": 0.079, "step": 772 }, { "epoch": 0.26175107405134285, "grad_norm": 0.55859375, "learning_rate": 1.765724497682992e-05, "loss": 0.0723, "step": 773 }, { "epoch": 0.26208969122346615, "grad_norm": 0.75, "learning_rate": 1.7650039273580406e-05, "loss": 0.0871, "step": 774 }, { "epoch": 0.2624283083955895, "grad_norm": 0.45703125, "learning_rate": 1.7642823981006037e-05, "loss": 0.065, "step": 775 }, { "epoch": 0.26276692556771286, "grad_norm": 0.5078125, "learning_rate": 1.763559910815118e-05, "loss": 0.072, "step": 776 }, { "epoch": 0.2631055427398362, "grad_norm": 0.7109375, "learning_rate": 1.7628364664072218e-05, "loss": 0.0845, "step": 777 }, { "epoch": 0.2634441599119595, "grad_norm": 0.5625, "learning_rate": 1.7621120657837528e-05, "loss": 0.0778, "step": 778 }, { "epoch": 0.2637827770840829, "grad_norm": 0.48046875, "learning_rate": 1.761386709852747e-05, "loss": 0.0744, "step": 779 }, { "epoch": 0.26412139425620623, "grad_norm": 0.6640625, "learning_rate": 1.760660399523438e-05, "loss": 0.0994, "step": 780 }, { "epoch": 0.26446001142832953, "grad_norm": 0.54296875, "learning_rate": 1.759933135706256e-05, "loss": 0.0787, "step": 781 }, { "epoch": 0.2647986286004529, "grad_norm": 0.451171875, "learning_rate": 1.759204919312826e-05, "loss": 0.0614, "step": 782 }, { "epoch": 0.26513724577257625, "grad_norm": 0.58203125, "learning_rate": 1.7584757512559674e-05, "loss": 0.0776, "step": 783 }, { "epoch": 0.2654758629446996, "grad_norm": 0.50390625, "learning_rate": 1.757745632449693e-05, "loss": 0.0646, "step": 784 }, { "epoch": 0.2658144801168229, "grad_norm": 0.6796875, "learning_rate": 1.757014563809206e-05, "loss": 0.0648, "step": 785 }, { "epoch": 0.26615309728894626, "grad_norm": 0.447265625, "learning_rate": 1.7562825462509018e-05, "loss": 0.0566, "step": 786 }, { "epoch": 0.2664917144610696, "grad_norm": 0.62109375, "learning_rate": 1.7555495806923635e-05, "loss": 0.0736, "step": 787 }, { "epoch": 0.266830331633193, "grad_norm": 0.56640625, "learning_rate": 1.754815668052364e-05, "loss": 0.0691, "step": 788 }, { "epoch": 0.2671689488053163, "grad_norm": 0.546875, "learning_rate": 1.754080809250863e-05, "loss": 0.069, "step": 789 }, { "epoch": 0.26750756597743963, "grad_norm": 0.55859375, "learning_rate": 1.753345005209006e-05, "loss": 0.0768, "step": 790 }, { "epoch": 0.267846183149563, "grad_norm": 0.64453125, "learning_rate": 1.7526082568491233e-05, "loss": 0.0748, "step": 791 }, { "epoch": 0.2681848003216863, "grad_norm": 0.5625, "learning_rate": 1.7518705650947292e-05, "loss": 0.07, "step": 792 }, { "epoch": 0.26852341749380965, "grad_norm": 0.59765625, "learning_rate": 1.7511319308705198e-05, "loss": 0.075, "step": 793 }, { "epoch": 0.268862034665933, "grad_norm": 0.54296875, "learning_rate": 1.750392355102374e-05, "loss": 0.0648, "step": 794 }, { "epoch": 0.26920065183805636, "grad_norm": 0.55859375, "learning_rate": 1.74965183871735e-05, "loss": 0.0848, "step": 795 }, { "epoch": 0.26953926901017966, "grad_norm": 0.48828125, "learning_rate": 1.7489103826436843e-05, "loss": 0.067, "step": 796 }, { "epoch": 0.269877886182303, "grad_norm": 0.431640625, "learning_rate": 1.7481679878107928e-05, "loss": 0.0608, "step": 797 }, { "epoch": 0.2702165033544264, "grad_norm": 0.55078125, "learning_rate": 1.7474246551492674e-05, "loss": 0.0584, "step": 798 }, { "epoch": 0.2705551205265497, "grad_norm": 0.54296875, "learning_rate": 1.7466803855908753e-05, "loss": 0.0558, "step": 799 }, { "epoch": 0.27089373769867303, "grad_norm": 0.423828125, "learning_rate": 1.745935180068559e-05, "loss": 0.057, "step": 800 }, { "epoch": 0.2712323548707964, "grad_norm": 0.69140625, "learning_rate": 1.745189039516434e-05, "loss": 0.0831, "step": 801 }, { "epoch": 0.27157097204291974, "grad_norm": 0.671875, "learning_rate": 1.7444419648697866e-05, "loss": 0.0964, "step": 802 }, { "epoch": 0.27190958921504305, "grad_norm": 0.53125, "learning_rate": 1.7436939570650754e-05, "loss": 0.0753, "step": 803 }, { "epoch": 0.2722482063871664, "grad_norm": 0.5546875, "learning_rate": 1.7429450170399278e-05, "loss": 0.0524, "step": 804 }, { "epoch": 0.27258682355928976, "grad_norm": 0.4921875, "learning_rate": 1.742195145733141e-05, "loss": 0.0638, "step": 805 }, { "epoch": 0.2729254407314131, "grad_norm": 0.62109375, "learning_rate": 1.741444344084678e-05, "loss": 0.0813, "step": 806 }, { "epoch": 0.2732640579035364, "grad_norm": 0.94921875, "learning_rate": 1.7406926130356692e-05, "loss": 0.0662, "step": 807 }, { "epoch": 0.2736026750756598, "grad_norm": 0.53515625, "learning_rate": 1.7399399535284093e-05, "loss": 0.0566, "step": 808 }, { "epoch": 0.27394129224778313, "grad_norm": 0.5546875, "learning_rate": 1.7391863665063572e-05, "loss": 0.0858, "step": 809 }, { "epoch": 0.27427990941990643, "grad_norm": 0.734375, "learning_rate": 1.738431852914134e-05, "loss": 0.0827, "step": 810 }, { "epoch": 0.2746185265920298, "grad_norm": 0.478515625, "learning_rate": 1.737676413697523e-05, "loss": 0.0644, "step": 811 }, { "epoch": 0.27495714376415314, "grad_norm": 0.640625, "learning_rate": 1.736920049803467e-05, "loss": 0.0772, "step": 812 }, { "epoch": 0.2752957609362765, "grad_norm": 0.44921875, "learning_rate": 1.7361627621800683e-05, "loss": 0.0561, "step": 813 }, { "epoch": 0.2756343781083998, "grad_norm": 0.734375, "learning_rate": 1.735404551776587e-05, "loss": 0.0853, "step": 814 }, { "epoch": 0.27597299528052316, "grad_norm": 0.486328125, "learning_rate": 1.73464541954344e-05, "loss": 0.0618, "step": 815 }, { "epoch": 0.2763116124526465, "grad_norm": 0.69921875, "learning_rate": 1.7338853664321993e-05, "loss": 0.079, "step": 816 }, { "epoch": 0.27665022962476987, "grad_norm": 0.62890625, "learning_rate": 1.7331243933955918e-05, "loss": 0.0579, "step": 817 }, { "epoch": 0.2769888467968932, "grad_norm": 0.53125, "learning_rate": 1.7323625013874972e-05, "loss": 0.0667, "step": 818 }, { "epoch": 0.27732746396901653, "grad_norm": 0.5390625, "learning_rate": 1.731599691362947e-05, "loss": 0.0661, "step": 819 }, { "epoch": 0.2776660811411399, "grad_norm": 0.68359375, "learning_rate": 1.730835964278124e-05, "loss": 0.117, "step": 820 }, { "epoch": 0.2780046983132632, "grad_norm": 0.5234375, "learning_rate": 1.7300713210903605e-05, "loss": 0.0619, "step": 821 }, { "epoch": 0.27834331548538654, "grad_norm": 0.466796875, "learning_rate": 1.7293057627581355e-05, "loss": 0.0645, "step": 822 }, { "epoch": 0.2786819326575099, "grad_norm": 0.48828125, "learning_rate": 1.7285392902410776e-05, "loss": 0.0636, "step": 823 }, { "epoch": 0.27902054982963326, "grad_norm": 0.47265625, "learning_rate": 1.7277719044999595e-05, "loss": 0.0543, "step": 824 }, { "epoch": 0.27935916700175656, "grad_norm": 0.423828125, "learning_rate": 1.7270036064967e-05, "loss": 0.06, "step": 825 }, { "epoch": 0.2796977841738799, "grad_norm": 0.455078125, "learning_rate": 1.7262343971943602e-05, "loss": 0.0598, "step": 826 }, { "epoch": 0.28003640134600327, "grad_norm": 0.486328125, "learning_rate": 1.725464277557144e-05, "loss": 0.0667, "step": 827 }, { "epoch": 0.2803750185181266, "grad_norm": 0.439453125, "learning_rate": 1.7246932485503964e-05, "loss": 0.0554, "step": 828 }, { "epoch": 0.28071363569024993, "grad_norm": 0.4609375, "learning_rate": 1.7239213111406027e-05, "loss": 0.0648, "step": 829 }, { "epoch": 0.2810522528623733, "grad_norm": 0.55078125, "learning_rate": 1.7231484662953862e-05, "loss": 0.063, "step": 830 }, { "epoch": 0.28139087003449664, "grad_norm": 0.63671875, "learning_rate": 1.7223747149835078e-05, "loss": 0.0752, "step": 831 }, { "epoch": 0.28172948720661994, "grad_norm": 0.52734375, "learning_rate": 1.7216000581748655e-05, "loss": 0.0745, "step": 832 }, { "epoch": 0.2820681043787433, "grad_norm": 0.59765625, "learning_rate": 1.7208244968404904e-05, "loss": 0.0526, "step": 833 }, { "epoch": 0.28240672155086666, "grad_norm": 0.47265625, "learning_rate": 1.7200480319525505e-05, "loss": 0.0644, "step": 834 }, { "epoch": 0.28274533872299, "grad_norm": 0.55859375, "learning_rate": 1.719270664484343e-05, "loss": 0.0788, "step": 835 }, { "epoch": 0.2830839558951133, "grad_norm": 0.66796875, "learning_rate": 1.7184923954102992e-05, "loss": 0.0718, "step": 836 }, { "epoch": 0.28342257306723667, "grad_norm": 0.490234375, "learning_rate": 1.7177132257059788e-05, "loss": 0.0729, "step": 837 }, { "epoch": 0.28376119023936003, "grad_norm": 0.546875, "learning_rate": 1.7169331563480713e-05, "loss": 0.0568, "step": 838 }, { "epoch": 0.28409980741148333, "grad_norm": 0.59375, "learning_rate": 1.7161521883143936e-05, "loss": 0.0644, "step": 839 }, { "epoch": 0.2844384245836067, "grad_norm": 0.6484375, "learning_rate": 1.7153703225838892e-05, "loss": 0.0567, "step": 840 }, { "epoch": 0.28477704175573004, "grad_norm": 0.62109375, "learning_rate": 1.714587560136627e-05, "loss": 0.0855, "step": 841 }, { "epoch": 0.2851156589278534, "grad_norm": 0.515625, "learning_rate": 1.7138039019538e-05, "loss": 0.0765, "step": 842 }, { "epoch": 0.2854542760999767, "grad_norm": 0.6484375, "learning_rate": 1.713019349017723e-05, "loss": 0.0771, "step": 843 }, { "epoch": 0.28579289327210006, "grad_norm": 0.52734375, "learning_rate": 1.7122339023118338e-05, "loss": 0.0754, "step": 844 }, { "epoch": 0.2861315104442234, "grad_norm": 0.40234375, "learning_rate": 1.7114475628206897e-05, "loss": 0.0429, "step": 845 }, { "epoch": 0.28647012761634677, "grad_norm": 0.5390625, "learning_rate": 1.7106603315299674e-05, "loss": 0.0673, "step": 846 }, { "epoch": 0.28680874478847007, "grad_norm": 0.640625, "learning_rate": 1.7098722094264616e-05, "loss": 0.0862, "step": 847 }, { "epoch": 0.28714736196059343, "grad_norm": 0.51953125, "learning_rate": 1.7090831974980832e-05, "loss": 0.0655, "step": 848 }, { "epoch": 0.2874859791327168, "grad_norm": 0.53515625, "learning_rate": 1.7082932967338588e-05, "loss": 0.0658, "step": 849 }, { "epoch": 0.2878245963048401, "grad_norm": 0.69921875, "learning_rate": 1.7075025081239286e-05, "loss": 0.0895, "step": 850 }, { "epoch": 0.28816321347696344, "grad_norm": 0.53515625, "learning_rate": 1.706710832659547e-05, "loss": 0.0735, "step": 851 }, { "epoch": 0.2885018306490868, "grad_norm": 0.50390625, "learning_rate": 1.7059182713330787e-05, "loss": 0.0594, "step": 852 }, { "epoch": 0.28884044782121016, "grad_norm": 0.4375, "learning_rate": 1.7051248251379997e-05, "loss": 0.0557, "step": 853 }, { "epoch": 0.28917906499333346, "grad_norm": 0.50390625, "learning_rate": 1.7043304950688947e-05, "loss": 0.065, "step": 854 }, { "epoch": 0.2895176821654568, "grad_norm": 0.60546875, "learning_rate": 1.703535282121456e-05, "loss": 0.0784, "step": 855 }, { "epoch": 0.28985629933758017, "grad_norm": 0.4453125, "learning_rate": 1.702739187292484e-05, "loss": 0.0567, "step": 856 }, { "epoch": 0.29019491650970347, "grad_norm": 0.9765625, "learning_rate": 1.7019422115798835e-05, "loss": 0.073, "step": 857 }, { "epoch": 0.2905335336818268, "grad_norm": 0.55078125, "learning_rate": 1.7011443559826632e-05, "loss": 0.079, "step": 858 }, { "epoch": 0.2908721508539502, "grad_norm": 0.46484375, "learning_rate": 1.700345621500935e-05, "loss": 0.0667, "step": 859 }, { "epoch": 0.29121076802607354, "grad_norm": 0.640625, "learning_rate": 1.699546009135913e-05, "loss": 0.1011, "step": 860 }, { "epoch": 0.29154938519819684, "grad_norm": 0.46875, "learning_rate": 1.6987455198899118e-05, "loss": 0.0662, "step": 861 }, { "epoch": 0.2918880023703202, "grad_norm": 0.53515625, "learning_rate": 1.6979441547663434e-05, "loss": 0.0675, "step": 862 }, { "epoch": 0.29222661954244356, "grad_norm": 0.5078125, "learning_rate": 1.6971419147697206e-05, "loss": 0.0607, "step": 863 }, { "epoch": 0.2925652367145669, "grad_norm": 0.578125, "learning_rate": 1.6963388009056505e-05, "loss": 0.075, "step": 864 }, { "epoch": 0.2929038538866902, "grad_norm": 0.431640625, "learning_rate": 1.6955348141808367e-05, "loss": 0.0512, "step": 865 }, { "epoch": 0.29324247105881357, "grad_norm": 0.53125, "learning_rate": 1.694729955603076e-05, "loss": 0.0719, "step": 866 }, { "epoch": 0.2935810882309369, "grad_norm": 3.921875, "learning_rate": 1.6939242261812592e-05, "loss": 0.0996, "step": 867 }, { "epoch": 0.2939197054030602, "grad_norm": 0.515625, "learning_rate": 1.693117626925368e-05, "loss": 0.066, "step": 868 }, { "epoch": 0.2942583225751836, "grad_norm": 1.0703125, "learning_rate": 1.6923101588464753e-05, "loss": 0.0758, "step": 869 }, { "epoch": 0.29459693974730694, "grad_norm": 0.578125, "learning_rate": 1.6915018229567412e-05, "loss": 0.0675, "step": 870 }, { "epoch": 0.2949355569194303, "grad_norm": 1.0546875, "learning_rate": 1.6906926202694158e-05, "loss": 0.2418, "step": 871 }, { "epoch": 0.2952741740915536, "grad_norm": 0.71875, "learning_rate": 1.6898825517988342e-05, "loss": 0.1013, "step": 872 }, { "epoch": 0.29561279126367696, "grad_norm": 0.40625, "learning_rate": 1.6890716185604178e-05, "loss": 0.053, "step": 873 }, { "epoch": 0.2959514084358003, "grad_norm": 0.365234375, "learning_rate": 1.688259821570671e-05, "loss": 0.0407, "step": 874 }, { "epoch": 0.29629002560792367, "grad_norm": 0.74609375, "learning_rate": 1.6874471618471813e-05, "loss": 0.1096, "step": 875 }, { "epoch": 0.29662864278004697, "grad_norm": 0.42578125, "learning_rate": 1.6866336404086185e-05, "loss": 0.0607, "step": 876 }, { "epoch": 0.2969672599521703, "grad_norm": 0.5078125, "learning_rate": 1.6858192582747306e-05, "loss": 0.0732, "step": 877 }, { "epoch": 0.2973058771242937, "grad_norm": 0.54296875, "learning_rate": 1.685004016466347e-05, "loss": 0.0591, "step": 878 }, { "epoch": 0.297644494296417, "grad_norm": 0.455078125, "learning_rate": 1.6841879160053724e-05, "loss": 0.0513, "step": 879 }, { "epoch": 0.29798311146854034, "grad_norm": 0.427734375, "learning_rate": 1.683370957914789e-05, "loss": 0.056, "step": 880 }, { "epoch": 0.2983217286406637, "grad_norm": 0.494140625, "learning_rate": 1.6825531432186545e-05, "loss": 0.0679, "step": 881 }, { "epoch": 0.29866034581278705, "grad_norm": 0.90625, "learning_rate": 1.6817344729420985e-05, "loss": 0.0666, "step": 882 }, { "epoch": 0.29899896298491035, "grad_norm": 0.52734375, "learning_rate": 1.6809149481113252e-05, "loss": 0.0664, "step": 883 }, { "epoch": 0.2993375801570337, "grad_norm": 0.625, "learning_rate": 1.6800945697536088e-05, "loss": 0.081, "step": 884 }, { "epoch": 0.29967619732915707, "grad_norm": 0.482421875, "learning_rate": 1.679273338897293e-05, "loss": 0.0667, "step": 885 }, { "epoch": 0.30001481450128037, "grad_norm": 0.5078125, "learning_rate": 1.678451256571792e-05, "loss": 0.0587, "step": 886 }, { "epoch": 0.3003534316734037, "grad_norm": 0.55078125, "learning_rate": 1.6776283238075853e-05, "loss": 0.0736, "step": 887 }, { "epoch": 0.3006920488455271, "grad_norm": 0.66015625, "learning_rate": 1.6768045416362192e-05, "loss": 0.0947, "step": 888 }, { "epoch": 0.3006920488455271, "eval_loss": 0.07113409042358398, "eval_runtime": 815.5638, "eval_samples_per_second": 12.198, "eval_steps_per_second": 3.049, "step": 888 }, { "epoch": 0.30103066601765044, "grad_norm": 0.46875, "learning_rate": 1.6759799110903046e-05, "loss": 0.0615, "step": 889 }, { "epoch": 0.30136928318977374, "grad_norm": 0.52734375, "learning_rate": 1.6751544332035164e-05, "loss": 0.0656, "step": 890 }, { "epoch": 0.3017079003618971, "grad_norm": 0.57421875, "learning_rate": 1.674328109010591e-05, "loss": 0.0781, "step": 891 }, { "epoch": 0.30204651753402045, "grad_norm": 0.54296875, "learning_rate": 1.6735009395473252e-05, "loss": 0.0776, "step": 892 }, { "epoch": 0.3023851347061438, "grad_norm": 0.5625, "learning_rate": 1.672672925850577e-05, "loss": 0.079, "step": 893 }, { "epoch": 0.3027237518782671, "grad_norm": 0.416015625, "learning_rate": 1.6718440689582613e-05, "loss": 0.0536, "step": 894 }, { "epoch": 0.30306236905039047, "grad_norm": 0.58203125, "learning_rate": 1.67101436990935e-05, "loss": 0.0796, "step": 895 }, { "epoch": 0.3034009862225138, "grad_norm": 0.474609375, "learning_rate": 1.6701838297438713e-05, "loss": 0.0662, "step": 896 }, { "epoch": 0.3037396033946371, "grad_norm": 0.486328125, "learning_rate": 1.669352449502907e-05, "loss": 0.0663, "step": 897 }, { "epoch": 0.3040782205667605, "grad_norm": 0.474609375, "learning_rate": 1.6685202302285926e-05, "loss": 0.0546, "step": 898 }, { "epoch": 0.30441683773888384, "grad_norm": 0.486328125, "learning_rate": 1.667687172964115e-05, "loss": 0.0613, "step": 899 }, { "epoch": 0.3047554549110072, "grad_norm": 0.55859375, "learning_rate": 1.6668532787537115e-05, "loss": 0.077, "step": 900 }, { "epoch": 0.3050940720831305, "grad_norm": 0.478515625, "learning_rate": 1.6660185486426684e-05, "loss": 0.0601, "step": 901 }, { "epoch": 0.30543268925525385, "grad_norm": 0.46875, "learning_rate": 1.66518298367732e-05, "loss": 0.0648, "step": 902 }, { "epoch": 0.3057713064273772, "grad_norm": 0.42578125, "learning_rate": 1.6643465849050473e-05, "loss": 0.0603, "step": 903 }, { "epoch": 0.30610992359950057, "grad_norm": 0.5859375, "learning_rate": 1.6635093533742762e-05, "loss": 0.0758, "step": 904 }, { "epoch": 0.30644854077162387, "grad_norm": 0.5, "learning_rate": 1.662671290134476e-05, "loss": 0.0569, "step": 905 }, { "epoch": 0.3067871579437472, "grad_norm": 0.54296875, "learning_rate": 1.6618323962361595e-05, "loss": 0.0667, "step": 906 }, { "epoch": 0.3071257751158706, "grad_norm": 0.5078125, "learning_rate": 1.6609926727308804e-05, "loss": 0.0652, "step": 907 }, { "epoch": 0.3074643922879939, "grad_norm": 0.59765625, "learning_rate": 1.660152120671232e-05, "loss": 0.0823, "step": 908 }, { "epoch": 0.30780300946011724, "grad_norm": 0.72265625, "learning_rate": 1.6593107411108462e-05, "loss": 0.0695, "step": 909 }, { "epoch": 0.3081416266322406, "grad_norm": 0.61328125, "learning_rate": 1.6584685351043924e-05, "loss": 0.0791, "step": 910 }, { "epoch": 0.30848024380436395, "grad_norm": 0.54296875, "learning_rate": 1.657625503707576e-05, "loss": 0.0716, "step": 911 }, { "epoch": 0.30881886097648725, "grad_norm": 0.55859375, "learning_rate": 1.6567816479771372e-05, "loss": 0.0772, "step": 912 }, { "epoch": 0.3091574781486106, "grad_norm": 0.55859375, "learning_rate": 1.655936968970848e-05, "loss": 0.0743, "step": 913 }, { "epoch": 0.30949609532073397, "grad_norm": 0.61328125, "learning_rate": 1.6550914677475155e-05, "loss": 0.0842, "step": 914 }, { "epoch": 0.30983471249285727, "grad_norm": 0.57421875, "learning_rate": 1.654245145366974e-05, "loss": 0.0685, "step": 915 }, { "epoch": 0.3101733296649806, "grad_norm": 0.63671875, "learning_rate": 1.6533980028900896e-05, "loss": 0.0904, "step": 916 }, { "epoch": 0.310511946837104, "grad_norm": 0.671875, "learning_rate": 1.6525500413787554e-05, "loss": 0.0768, "step": 917 }, { "epoch": 0.31085056400922734, "grad_norm": 0.58203125, "learning_rate": 1.6517012618958905e-05, "loss": 0.0794, "step": 918 }, { "epoch": 0.31118918118135064, "grad_norm": 0.58203125, "learning_rate": 1.6508516655054404e-05, "loss": 0.0797, "step": 919 }, { "epoch": 0.311527798353474, "grad_norm": 0.55859375, "learning_rate": 1.6500012532723748e-05, "loss": 0.0678, "step": 920 }, { "epoch": 0.31186641552559735, "grad_norm": 0.51953125, "learning_rate": 1.6491500262626847e-05, "loss": 0.0588, "step": 921 }, { "epoch": 0.3122050326977207, "grad_norm": 0.45703125, "learning_rate": 1.6482979855433837e-05, "loss": 0.0647, "step": 922 }, { "epoch": 0.312543649869844, "grad_norm": 0.609375, "learning_rate": 1.6474451321825048e-05, "loss": 0.0774, "step": 923 }, { "epoch": 0.31288226704196737, "grad_norm": 0.478515625, "learning_rate": 1.6465914672491e-05, "loss": 0.062, "step": 924 }, { "epoch": 0.3132208842140907, "grad_norm": 0.75, "learning_rate": 1.6457369918132376e-05, "loss": 0.0995, "step": 925 }, { "epoch": 0.313559501386214, "grad_norm": 0.56640625, "learning_rate": 1.6448817069460033e-05, "loss": 0.0756, "step": 926 }, { "epoch": 0.3138981185583374, "grad_norm": 0.5234375, "learning_rate": 1.6440256137194965e-05, "loss": 0.0769, "step": 927 }, { "epoch": 0.31423673573046074, "grad_norm": 0.56640625, "learning_rate": 1.6431687132068305e-05, "loss": 0.0754, "step": 928 }, { "epoch": 0.3145753529025841, "grad_norm": 0.5625, "learning_rate": 1.6423110064821296e-05, "loss": 0.0838, "step": 929 }, { "epoch": 0.3149139700747074, "grad_norm": 0.5703125, "learning_rate": 1.64145249462053e-05, "loss": 0.0665, "step": 930 }, { "epoch": 0.31525258724683075, "grad_norm": 0.640625, "learning_rate": 1.6405931786981753e-05, "loss": 0.0765, "step": 931 }, { "epoch": 0.3155912044189541, "grad_norm": 0.5234375, "learning_rate": 1.639733059792219e-05, "loss": 0.0623, "step": 932 }, { "epoch": 0.31592982159107746, "grad_norm": 0.6953125, "learning_rate": 1.63887213898082e-05, "loss": 0.0968, "step": 933 }, { "epoch": 0.31626843876320077, "grad_norm": 0.439453125, "learning_rate": 1.6380104173431423e-05, "loss": 0.0638, "step": 934 }, { "epoch": 0.3166070559353241, "grad_norm": 0.51953125, "learning_rate": 1.6371478959593543e-05, "loss": 0.0689, "step": 935 }, { "epoch": 0.3169456731074475, "grad_norm": 0.63671875, "learning_rate": 1.6362845759106267e-05, "loss": 0.0748, "step": 936 }, { "epoch": 0.3172842902795708, "grad_norm": 0.58984375, "learning_rate": 1.635420458279131e-05, "loss": 0.0638, "step": 937 }, { "epoch": 0.31762290745169414, "grad_norm": 0.5078125, "learning_rate": 1.634555544148039e-05, "loss": 0.0695, "step": 938 }, { "epoch": 0.3179615246238175, "grad_norm": 0.4609375, "learning_rate": 1.6336898346015202e-05, "loss": 0.0657, "step": 939 }, { "epoch": 0.31830014179594085, "grad_norm": 0.54296875, "learning_rate": 1.6328233307247426e-05, "loss": 0.0808, "step": 940 }, { "epoch": 0.31863875896806415, "grad_norm": 0.51953125, "learning_rate": 1.6319560336038678e-05, "loss": 0.0642, "step": 941 }, { "epoch": 0.3189773761401875, "grad_norm": 0.4609375, "learning_rate": 1.631087944326053e-05, "loss": 0.0618, "step": 942 }, { "epoch": 0.31931599331231086, "grad_norm": 0.46875, "learning_rate": 1.6302190639794486e-05, "loss": 0.0638, "step": 943 }, { "epoch": 0.31965461048443417, "grad_norm": 0.49609375, "learning_rate": 1.6293493936531956e-05, "loss": 0.0731, "step": 944 }, { "epoch": 0.3199932276565575, "grad_norm": 0.55078125, "learning_rate": 1.6284789344374266e-05, "loss": 0.0728, "step": 945 }, { "epoch": 0.3203318448286809, "grad_norm": 0.52734375, "learning_rate": 1.6276076874232614e-05, "loss": 0.0491, "step": 946 }, { "epoch": 0.32067046200080424, "grad_norm": 0.453125, "learning_rate": 1.626735653702809e-05, "loss": 0.059, "step": 947 }, { "epoch": 0.32100907917292754, "grad_norm": 0.458984375, "learning_rate": 1.6258628343691635e-05, "loss": 0.0598, "step": 948 }, { "epoch": 0.3213476963450509, "grad_norm": 0.47265625, "learning_rate": 1.6249892305164036e-05, "loss": 0.0661, "step": 949 }, { "epoch": 0.32168631351717425, "grad_norm": 0.578125, "learning_rate": 1.624114843239592e-05, "loss": 0.0594, "step": 950 }, { "epoch": 0.3220249306892976, "grad_norm": 0.46875, "learning_rate": 1.6232396736347736e-05, "loss": 0.0663, "step": 951 }, { "epoch": 0.3223635478614209, "grad_norm": 0.53125, "learning_rate": 1.6223637227989736e-05, "loss": 0.059, "step": 952 }, { "epoch": 0.32270216503354426, "grad_norm": 0.44921875, "learning_rate": 1.621486991830196e-05, "loss": 0.0622, "step": 953 }, { "epoch": 0.3230407822056676, "grad_norm": 0.58984375, "learning_rate": 1.6206094818274228e-05, "loss": 0.0461, "step": 954 }, { "epoch": 0.3233793993777909, "grad_norm": 0.4375, "learning_rate": 1.619731193890614e-05, "loss": 0.0483, "step": 955 }, { "epoch": 0.3237180165499143, "grad_norm": 0.5390625, "learning_rate": 1.6188521291207027e-05, "loss": 0.0747, "step": 956 }, { "epoch": 0.32405663372203763, "grad_norm": 0.57421875, "learning_rate": 1.6179722886195967e-05, "loss": 0.0733, "step": 957 }, { "epoch": 0.324395250894161, "grad_norm": 0.5234375, "learning_rate": 1.6170916734901765e-05, "loss": 0.0702, "step": 958 }, { "epoch": 0.3247338680662843, "grad_norm": 0.48828125, "learning_rate": 1.6162102848362932e-05, "loss": 0.0655, "step": 959 }, { "epoch": 0.32507248523840765, "grad_norm": 0.49609375, "learning_rate": 1.6153281237627675e-05, "loss": 0.0657, "step": 960 }, { "epoch": 0.325411102410531, "grad_norm": 0.546875, "learning_rate": 1.6144451913753882e-05, "loss": 0.0588, "step": 961 }, { "epoch": 0.32574971958265436, "grad_norm": 0.51171875, "learning_rate": 1.6135614887809113e-05, "loss": 0.0687, "step": 962 }, { "epoch": 0.32608833675477766, "grad_norm": 0.828125, "learning_rate": 1.612677017087058e-05, "loss": 0.0764, "step": 963 }, { "epoch": 0.326426953926901, "grad_norm": 0.443359375, "learning_rate": 1.6117917774025138e-05, "loss": 0.0539, "step": 964 }, { "epoch": 0.3267655710990244, "grad_norm": 0.6171875, "learning_rate": 1.6109057708369263e-05, "loss": 0.0853, "step": 965 }, { "epoch": 0.3271041882711477, "grad_norm": 0.72265625, "learning_rate": 1.6100189985009053e-05, "loss": 0.0523, "step": 966 }, { "epoch": 0.32744280544327103, "grad_norm": 0.5625, "learning_rate": 1.6091314615060196e-05, "loss": 0.0599, "step": 967 }, { "epoch": 0.3277814226153944, "grad_norm": 0.4609375, "learning_rate": 1.608243160964797e-05, "loss": 0.0625, "step": 968 }, { "epoch": 0.32812003978751775, "grad_norm": 0.42578125, "learning_rate": 1.6073540979907227e-05, "loss": 0.0561, "step": 969 }, { "epoch": 0.32845865695964105, "grad_norm": 0.65625, "learning_rate": 1.6064642736982368e-05, "loss": 0.08, "step": 970 }, { "epoch": 0.3287972741317644, "grad_norm": 0.51171875, "learning_rate": 1.6055736892027342e-05, "loss": 0.0691, "step": 971 }, { "epoch": 0.32913589130388776, "grad_norm": 0.53515625, "learning_rate": 1.6046823456205623e-05, "loss": 0.0728, "step": 972 }, { "epoch": 0.32947450847601106, "grad_norm": 0.482421875, "learning_rate": 1.6037902440690212e-05, "loss": 0.0604, "step": 973 }, { "epoch": 0.3298131256481344, "grad_norm": 0.51171875, "learning_rate": 1.6028973856663595e-05, "loss": 0.0723, "step": 974 }, { "epoch": 0.3301517428202578, "grad_norm": 0.486328125, "learning_rate": 1.6020037715317756e-05, "loss": 0.0836, "step": 975 }, { "epoch": 0.33049035999238113, "grad_norm": 0.54296875, "learning_rate": 1.6011094027854147e-05, "loss": 0.0603, "step": 976 }, { "epoch": 0.33082897716450443, "grad_norm": 0.4921875, "learning_rate": 1.6002142805483686e-05, "loss": 0.0717, "step": 977 }, { "epoch": 0.3311675943366278, "grad_norm": 0.7421875, "learning_rate": 1.5993184059426725e-05, "loss": 0.068, "step": 978 }, { "epoch": 0.33150621150875115, "grad_norm": 0.52734375, "learning_rate": 1.5984217800913052e-05, "loss": 0.0738, "step": 979 }, { "epoch": 0.3318448286808745, "grad_norm": 0.4765625, "learning_rate": 1.5975244041181877e-05, "loss": 0.0627, "step": 980 }, { "epoch": 0.3321834458529978, "grad_norm": 0.4765625, "learning_rate": 1.5966262791481812e-05, "loss": 0.0633, "step": 981 }, { "epoch": 0.33252206302512116, "grad_norm": 0.453125, "learning_rate": 1.5957274063070845e-05, "loss": 0.0576, "step": 982 }, { "epoch": 0.3328606801972445, "grad_norm": 0.58203125, "learning_rate": 1.5948277867216355e-05, "loss": 0.0802, "step": 983 }, { "epoch": 0.3331992973693678, "grad_norm": 0.4765625, "learning_rate": 1.5939274215195074e-05, "loss": 0.0643, "step": 984 }, { "epoch": 0.3335379145414912, "grad_norm": 1.6796875, "learning_rate": 1.5930263118293075e-05, "loss": 0.0746, "step": 985 }, { "epoch": 0.33387653171361453, "grad_norm": 0.61328125, "learning_rate": 1.5921244587805774e-05, "loss": 0.0764, "step": 986 }, { "epoch": 0.3342151488857379, "grad_norm": 0.48046875, "learning_rate": 1.5912218635037896e-05, "loss": 0.0583, "step": 987 }, { "epoch": 0.3345537660578612, "grad_norm": 0.6171875, "learning_rate": 1.5903185271303477e-05, "loss": 0.0803, "step": 988 }, { "epoch": 0.33489238322998455, "grad_norm": 0.54296875, "learning_rate": 1.5894144507925836e-05, "loss": 0.077, "step": 989 }, { "epoch": 0.3352310004021079, "grad_norm": 0.462890625, "learning_rate": 1.5885096356237572e-05, "loss": 0.0668, "step": 990 }, { "epoch": 0.33556961757423126, "grad_norm": 0.55078125, "learning_rate": 1.5876040827580545e-05, "loss": 0.0635, "step": 991 }, { "epoch": 0.33590823474635456, "grad_norm": 0.69921875, "learning_rate": 1.586697793330586e-05, "loss": 0.058, "step": 992 }, { "epoch": 0.3362468519184779, "grad_norm": 0.515625, "learning_rate": 1.5857907684773858e-05, "loss": 0.0653, "step": 993 }, { "epoch": 0.3365854690906013, "grad_norm": 0.5234375, "learning_rate": 1.584883009335409e-05, "loss": 0.0474, "step": 994 }, { "epoch": 0.3369240862627246, "grad_norm": 0.490234375, "learning_rate": 1.5839745170425326e-05, "loss": 0.0607, "step": 995 }, { "epoch": 0.33726270343484793, "grad_norm": 0.55078125, "learning_rate": 1.5830652927375506e-05, "loss": 0.0584, "step": 996 }, { "epoch": 0.3376013206069713, "grad_norm": 0.53125, "learning_rate": 1.582155337560177e-05, "loss": 0.0642, "step": 997 }, { "epoch": 0.33793993777909465, "grad_norm": 0.51953125, "learning_rate": 1.58124465265104e-05, "loss": 0.0747, "step": 998 }, { "epoch": 0.33827855495121795, "grad_norm": 0.68359375, "learning_rate": 1.5803332391516832e-05, "loss": 0.0634, "step": 999 }, { "epoch": 0.3386171721233413, "grad_norm": 0.51953125, "learning_rate": 1.5794210982045638e-05, "loss": 0.0548, "step": 1000 }, { "epoch": 0.33895578929546466, "grad_norm": 0.50390625, "learning_rate": 1.5785082309530504e-05, "loss": 0.0659, "step": 1001 }, { "epoch": 0.33929440646758796, "grad_norm": 0.6171875, "learning_rate": 1.577594638541422e-05, "loss": 0.0745, "step": 1002 }, { "epoch": 0.3396330236397113, "grad_norm": 0.5390625, "learning_rate": 1.5766803221148676e-05, "loss": 0.0689, "step": 1003 }, { "epoch": 0.3399716408118347, "grad_norm": 0.55859375, "learning_rate": 1.5757652828194815e-05, "loss": 0.0874, "step": 1004 }, { "epoch": 0.34031025798395803, "grad_norm": 0.439453125, "learning_rate": 1.5748495218022665e-05, "loss": 0.0583, "step": 1005 }, { "epoch": 0.34064887515608133, "grad_norm": 0.443359375, "learning_rate": 1.573933040211129e-05, "loss": 0.0639, "step": 1006 }, { "epoch": 0.3409874923282047, "grad_norm": 0.435546875, "learning_rate": 1.5730158391948785e-05, "loss": 0.0566, "step": 1007 }, { "epoch": 0.34132610950032805, "grad_norm": 0.66796875, "learning_rate": 1.5720979199032268e-05, "loss": 0.0807, "step": 1008 }, { "epoch": 0.3416647266724514, "grad_norm": 0.419921875, "learning_rate": 1.5711792834867856e-05, "loss": 0.0633, "step": 1009 }, { "epoch": 0.3420033438445747, "grad_norm": 0.5078125, "learning_rate": 1.570259931097066e-05, "loss": 0.0722, "step": 1010 }, { "epoch": 0.34234196101669806, "grad_norm": 0.421875, "learning_rate": 1.569339863886476e-05, "loss": 0.0596, "step": 1011 }, { "epoch": 0.3426805781888214, "grad_norm": 0.423828125, "learning_rate": 1.56841908300832e-05, "loss": 0.0618, "step": 1012 }, { "epoch": 0.3430191953609447, "grad_norm": 0.458984375, "learning_rate": 1.567497589616797e-05, "loss": 0.0626, "step": 1013 }, { "epoch": 0.3433578125330681, "grad_norm": 0.55078125, "learning_rate": 1.5665753848669987e-05, "loss": 0.0731, "step": 1014 }, { "epoch": 0.34369642970519143, "grad_norm": 0.5390625, "learning_rate": 1.5656524699149096e-05, "loss": 0.0723, "step": 1015 }, { "epoch": 0.3440350468773148, "grad_norm": 0.62109375, "learning_rate": 1.5647288459174032e-05, "loss": 0.0949, "step": 1016 }, { "epoch": 0.3443736640494381, "grad_norm": 0.431640625, "learning_rate": 1.563804514032242e-05, "loss": 0.0491, "step": 1017 }, { "epoch": 0.34471228122156145, "grad_norm": 0.68359375, "learning_rate": 1.5628794754180764e-05, "loss": 0.0886, "step": 1018 }, { "epoch": 0.3450508983936848, "grad_norm": 0.6015625, "learning_rate": 1.5619537312344422e-05, "loss": 0.0659, "step": 1019 }, { "epoch": 0.34538951556580816, "grad_norm": 0.455078125, "learning_rate": 1.56102728264176e-05, "loss": 0.0632, "step": 1020 }, { "epoch": 0.34572813273793146, "grad_norm": 1.1953125, "learning_rate": 1.560100130801333e-05, "loss": 0.0618, "step": 1021 }, { "epoch": 0.3460667499100548, "grad_norm": 0.6015625, "learning_rate": 1.5591722768753464e-05, "loss": 0.0721, "step": 1022 }, { "epoch": 0.3464053670821782, "grad_norm": 0.408203125, "learning_rate": 1.5582437220268648e-05, "loss": 0.0518, "step": 1023 }, { "epoch": 0.3467439842543015, "grad_norm": 0.48828125, "learning_rate": 1.5573144674198323e-05, "loss": 0.0599, "step": 1024 }, { "epoch": 0.34708260142642483, "grad_norm": 0.453125, "learning_rate": 1.5563845142190687e-05, "loss": 0.0601, "step": 1025 }, { "epoch": 0.3474212185985482, "grad_norm": 0.486328125, "learning_rate": 1.555453863590272e-05, "loss": 0.0596, "step": 1026 }, { "epoch": 0.34775983577067154, "grad_norm": 0.41015625, "learning_rate": 1.554522516700011e-05, "loss": 0.0496, "step": 1027 }, { "epoch": 0.34809845294279484, "grad_norm": 0.5703125, "learning_rate": 1.5535904747157303e-05, "loss": 0.0744, "step": 1028 }, { "epoch": 0.3484370701149182, "grad_norm": 0.43359375, "learning_rate": 1.5526577388057444e-05, "loss": 0.0532, "step": 1029 }, { "epoch": 0.34877568728704156, "grad_norm": 0.63671875, "learning_rate": 1.5517243101392373e-05, "loss": 0.08, "step": 1030 }, { "epoch": 0.34911430445916486, "grad_norm": 0.4921875, "learning_rate": 1.5507901898862623e-05, "loss": 0.0623, "step": 1031 }, { "epoch": 0.3494529216312882, "grad_norm": 0.484375, "learning_rate": 1.5498553792177395e-05, "loss": 0.0582, "step": 1032 }, { "epoch": 0.3497915388034116, "grad_norm": 0.59765625, "learning_rate": 1.5489198793054535e-05, "loss": 0.0806, "step": 1033 }, { "epoch": 0.35013015597553493, "grad_norm": 0.4765625, "learning_rate": 1.5479836913220544e-05, "loss": 0.0691, "step": 1034 }, { "epoch": 0.35046877314765823, "grad_norm": 0.47265625, "learning_rate": 1.547046816441053e-05, "loss": 0.0724, "step": 1035 }, { "epoch": 0.3508073903197816, "grad_norm": 0.470703125, "learning_rate": 1.5461092558368223e-05, "loss": 0.0604, "step": 1036 }, { "epoch": 0.35114600749190494, "grad_norm": 0.625, "learning_rate": 1.5451710106845953e-05, "loss": 0.0606, "step": 1037 }, { "epoch": 0.3514846246640283, "grad_norm": 0.52734375, "learning_rate": 1.5442320821604616e-05, "loss": 0.0774, "step": 1038 }, { "epoch": 0.3518232418361516, "grad_norm": 0.486328125, "learning_rate": 1.5432924714413685e-05, "loss": 0.0642, "step": 1039 }, { "epoch": 0.35216185900827496, "grad_norm": 0.74609375, "learning_rate": 1.5423521797051176e-05, "loss": 0.1061, "step": 1040 }, { "epoch": 0.3525004761803983, "grad_norm": 0.447265625, "learning_rate": 1.541411208130365e-05, "loss": 0.0601, "step": 1041 }, { "epoch": 0.3528390933525216, "grad_norm": 0.5859375, "learning_rate": 1.540469557896619e-05, "loss": 0.0713, "step": 1042 }, { "epoch": 0.35317771052464497, "grad_norm": 0.51953125, "learning_rate": 1.539527230184238e-05, "loss": 0.0591, "step": 1043 }, { "epoch": 0.35351632769676833, "grad_norm": 0.52734375, "learning_rate": 1.5385842261744296e-05, "loss": 0.0738, "step": 1044 }, { "epoch": 0.3538549448688917, "grad_norm": 0.56640625, "learning_rate": 1.5376405470492502e-05, "loss": 0.0825, "step": 1045 }, { "epoch": 0.354193562041015, "grad_norm": 0.39453125, "learning_rate": 1.536696193991601e-05, "loss": 0.0502, "step": 1046 }, { "epoch": 0.35453217921313834, "grad_norm": 0.56640625, "learning_rate": 1.535751168185228e-05, "loss": 0.0713, "step": 1047 }, { "epoch": 0.3548707963852617, "grad_norm": 0.53515625, "learning_rate": 1.5348054708147225e-05, "loss": 0.0786, "step": 1048 }, { "epoch": 0.35520941355738506, "grad_norm": 0.5078125, "learning_rate": 1.5338591030655154e-05, "loss": 0.0689, "step": 1049 }, { "epoch": 0.35554803072950836, "grad_norm": 0.4921875, "learning_rate": 1.5329120661238788e-05, "loss": 0.0645, "step": 1050 }, { "epoch": 0.3558866479016317, "grad_norm": 0.5234375, "learning_rate": 1.5319643611769237e-05, "loss": 0.0825, "step": 1051 }, { "epoch": 0.35622526507375507, "grad_norm": 0.458984375, "learning_rate": 1.5310159894125986e-05, "loss": 0.0608, "step": 1052 }, { "epoch": 0.35656388224587837, "grad_norm": 0.515625, "learning_rate": 1.530066952019687e-05, "loss": 0.0677, "step": 1053 }, { "epoch": 0.35690249941800173, "grad_norm": 0.5234375, "learning_rate": 1.529117250187808e-05, "loss": 0.078, "step": 1054 }, { "epoch": 0.3572411165901251, "grad_norm": 0.4921875, "learning_rate": 1.5281668851074123e-05, "loss": 0.0656, "step": 1055 }, { "epoch": 0.35757973376224844, "grad_norm": 0.4609375, "learning_rate": 1.527215857969783e-05, "loss": 0.0576, "step": 1056 }, { "epoch": 0.35791835093437174, "grad_norm": 0.484375, "learning_rate": 1.526264169967033e-05, "loss": 0.0565, "step": 1057 }, { "epoch": 0.3582569681064951, "grad_norm": 0.609375, "learning_rate": 1.5253118222921024e-05, "loss": 0.0859, "step": 1058 }, { "epoch": 0.35859558527861846, "grad_norm": 0.458984375, "learning_rate": 1.5243588161387596e-05, "loss": 0.0602, "step": 1059 }, { "epoch": 0.35893420245074176, "grad_norm": 0.51953125, "learning_rate": 1.5234051527015983e-05, "loss": 0.0625, "step": 1060 }, { "epoch": 0.3592728196228651, "grad_norm": 0.443359375, "learning_rate": 1.522450833176035e-05, "loss": 0.0579, "step": 1061 }, { "epoch": 0.35961143679498847, "grad_norm": 0.87890625, "learning_rate": 1.5214958587583092e-05, "loss": 0.0713, "step": 1062 }, { "epoch": 0.3599500539671118, "grad_norm": 0.58984375, "learning_rate": 1.5205402306454823e-05, "loss": 0.0696, "step": 1063 }, { "epoch": 0.36028867113923513, "grad_norm": 0.515625, "learning_rate": 1.5195839500354337e-05, "loss": 0.0755, "step": 1064 }, { "epoch": 0.3606272883113585, "grad_norm": 0.439453125, "learning_rate": 1.5186270181268612e-05, "loss": 0.0575, "step": 1065 }, { "epoch": 0.36096590548348184, "grad_norm": 0.59765625, "learning_rate": 1.5176694361192787e-05, "loss": 0.0814, "step": 1066 }, { "epoch": 0.3613045226556052, "grad_norm": 0.56640625, "learning_rate": 1.516711205213016e-05, "loss": 0.0819, "step": 1067 }, { "epoch": 0.3616431398277285, "grad_norm": 0.56640625, "learning_rate": 1.5157523266092153e-05, "loss": 0.0777, "step": 1068 }, { "epoch": 0.36198175699985186, "grad_norm": 0.419921875, "learning_rate": 1.5147928015098309e-05, "loss": 0.0584, "step": 1069 }, { "epoch": 0.3623203741719752, "grad_norm": 0.52734375, "learning_rate": 1.5138326311176278e-05, "loss": 0.0745, "step": 1070 }, { "epoch": 0.3626589913440985, "grad_norm": 0.490234375, "learning_rate": 1.5128718166361793e-05, "loss": 0.0671, "step": 1071 }, { "epoch": 0.36299760851622187, "grad_norm": 0.423828125, "learning_rate": 1.511910359269867e-05, "loss": 0.0531, "step": 1072 }, { "epoch": 0.3633362256883452, "grad_norm": 0.51171875, "learning_rate": 1.5109482602238773e-05, "loss": 0.0624, "step": 1073 }, { "epoch": 0.3636748428604686, "grad_norm": 0.66015625, "learning_rate": 1.5099855207042016e-05, "loss": 0.0907, "step": 1074 }, { "epoch": 0.3640134600325919, "grad_norm": 0.53125, "learning_rate": 1.509022141917634e-05, "loss": 0.0673, "step": 1075 }, { "epoch": 0.36435207720471524, "grad_norm": 0.54296875, "learning_rate": 1.5080581250717699e-05, "loss": 0.0671, "step": 1076 }, { "epoch": 0.3646906943768386, "grad_norm": 0.6796875, "learning_rate": 1.5070934713750043e-05, "loss": 0.0854, "step": 1077 }, { "epoch": 0.36502931154896195, "grad_norm": 0.6171875, "learning_rate": 1.5061281820365308e-05, "loss": 0.0592, "step": 1078 }, { "epoch": 0.36536792872108526, "grad_norm": 0.50390625, "learning_rate": 1.50516225826634e-05, "loss": 0.0659, "step": 1079 }, { "epoch": 0.3657065458932086, "grad_norm": 0.4921875, "learning_rate": 1.5041957012752173e-05, "loss": 0.0522, "step": 1080 }, { "epoch": 0.36604516306533197, "grad_norm": 0.60546875, "learning_rate": 1.5032285122747414e-05, "loss": 0.0703, "step": 1081 }, { "epoch": 0.36638378023745527, "grad_norm": 0.4375, "learning_rate": 1.5022606924772842e-05, "loss": 0.0595, "step": 1082 }, { "epoch": 0.3667223974095786, "grad_norm": 0.54296875, "learning_rate": 1.5012922430960082e-05, "loss": 0.059, "step": 1083 }, { "epoch": 0.367061014581702, "grad_norm": 0.453125, "learning_rate": 1.5003231653448645e-05, "loss": 0.0567, "step": 1084 }, { "epoch": 0.36739963175382534, "grad_norm": 0.462890625, "learning_rate": 1.4993534604385917e-05, "loss": 0.0622, "step": 1085 }, { "epoch": 0.36773824892594864, "grad_norm": 0.625, "learning_rate": 1.4983831295927154e-05, "loss": 0.0874, "step": 1086 }, { "epoch": 0.368076866098072, "grad_norm": 0.51171875, "learning_rate": 1.4974121740235457e-05, "loss": 0.0616, "step": 1087 }, { "epoch": 0.36841548327019535, "grad_norm": 0.5234375, "learning_rate": 1.496440594948175e-05, "loss": 0.0776, "step": 1088 }, { "epoch": 0.36875410044231866, "grad_norm": 0.5234375, "learning_rate": 1.495468393584478e-05, "loss": 0.0603, "step": 1089 }, { "epoch": 0.369092717614442, "grad_norm": 0.412109375, "learning_rate": 1.4944955711511091e-05, "loss": 0.0549, "step": 1090 }, { "epoch": 0.36943133478656537, "grad_norm": 0.60546875, "learning_rate": 1.4935221288675013e-05, "loss": 0.0634, "step": 1091 }, { "epoch": 0.3697699519586887, "grad_norm": 0.49609375, "learning_rate": 1.4925480679538646e-05, "loss": 0.0627, "step": 1092 }, { "epoch": 0.370108569130812, "grad_norm": 0.408203125, "learning_rate": 1.4915733896311844e-05, "loss": 0.052, "step": 1093 }, { "epoch": 0.3704471863029354, "grad_norm": 0.53125, "learning_rate": 1.49059809512122e-05, "loss": 0.0806, "step": 1094 }, { "epoch": 0.37078580347505874, "grad_norm": 0.58203125, "learning_rate": 1.4896221856465034e-05, "loss": 0.0737, "step": 1095 }, { "epoch": 0.3711244206471821, "grad_norm": 0.63671875, "learning_rate": 1.4886456624303369e-05, "loss": 0.0788, "step": 1096 }, { "epoch": 0.3714630378193054, "grad_norm": 0.5390625, "learning_rate": 1.4876685266967926e-05, "loss": 0.0535, "step": 1097 }, { "epoch": 0.37180165499142875, "grad_norm": 0.44140625, "learning_rate": 1.4866907796707102e-05, "loss": 0.0557, "step": 1098 }, { "epoch": 0.3721402721635521, "grad_norm": 0.65625, "learning_rate": 1.4857124225776955e-05, "loss": 0.0696, "step": 1099 }, { "epoch": 0.3724788893356754, "grad_norm": 0.54296875, "learning_rate": 1.4847334566441199e-05, "loss": 0.0639, "step": 1100 }, { "epoch": 0.37281750650779877, "grad_norm": 0.71875, "learning_rate": 1.4837538830971162e-05, "loss": 0.0792, "step": 1101 }, { "epoch": 0.3731561236799221, "grad_norm": 0.453125, "learning_rate": 1.4827737031645808e-05, "loss": 0.0613, "step": 1102 }, { "epoch": 0.3734947408520455, "grad_norm": 0.490234375, "learning_rate": 1.481792918075169e-05, "loss": 0.0628, "step": 1103 }, { "epoch": 0.3738333580241688, "grad_norm": 0.58203125, "learning_rate": 1.4808115290582947e-05, "loss": 0.0682, "step": 1104 }, { "epoch": 0.37417197519629214, "grad_norm": 0.54296875, "learning_rate": 1.4798295373441293e-05, "loss": 0.0762, "step": 1105 }, { "epoch": 0.3745105923684155, "grad_norm": 0.451171875, "learning_rate": 1.4788469441635997e-05, "loss": 0.0504, "step": 1106 }, { "epoch": 0.3748492095405388, "grad_norm": 0.42578125, "learning_rate": 1.4778637507483867e-05, "loss": 0.0547, "step": 1107 }, { "epoch": 0.37518782671266215, "grad_norm": 0.44140625, "learning_rate": 1.4768799583309228e-05, "loss": 0.0602, "step": 1108 }, { "epoch": 0.3755264438847855, "grad_norm": 0.466796875, "learning_rate": 1.475895568144392e-05, "loss": 0.0622, "step": 1109 }, { "epoch": 0.37586506105690887, "grad_norm": 0.44140625, "learning_rate": 1.4749105814227278e-05, "loss": 0.0564, "step": 1110 }, { "epoch": 0.37620367822903217, "grad_norm": 0.5546875, "learning_rate": 1.4739249994006111e-05, "loss": 0.0762, "step": 1111 }, { "epoch": 0.3765422954011555, "grad_norm": 0.44140625, "learning_rate": 1.4729388233134684e-05, "loss": 0.059, "step": 1112 }, { "epoch": 0.3768809125732789, "grad_norm": 0.48828125, "learning_rate": 1.4719520543974723e-05, "loss": 0.0712, "step": 1113 }, { "epoch": 0.37721952974540224, "grad_norm": 0.41796875, "learning_rate": 1.4709646938895374e-05, "loss": 0.0532, "step": 1114 }, { "epoch": 0.37755814691752554, "grad_norm": 0.515625, "learning_rate": 1.4699767430273202e-05, "loss": 0.0628, "step": 1115 }, { "epoch": 0.3778967640896489, "grad_norm": 0.55078125, "learning_rate": 1.468988203049217e-05, "loss": 0.0845, "step": 1116 }, { "epoch": 0.37823538126177225, "grad_norm": 0.54296875, "learning_rate": 1.4679990751943632e-05, "loss": 0.0704, "step": 1117 }, { "epoch": 0.37857399843389555, "grad_norm": 0.4453125, "learning_rate": 1.4670093607026302e-05, "loss": 0.05, "step": 1118 }, { "epoch": 0.3789126156060189, "grad_norm": 0.65234375, "learning_rate": 1.4660190608146253e-05, "loss": 0.0856, "step": 1119 }, { "epoch": 0.37925123277814227, "grad_norm": 0.484375, "learning_rate": 1.4650281767716895e-05, "loss": 0.0504, "step": 1120 }, { "epoch": 0.3795898499502656, "grad_norm": 0.4765625, "learning_rate": 1.4640367098158961e-05, "loss": 0.0704, "step": 1121 }, { "epoch": 0.3799284671223889, "grad_norm": 0.5234375, "learning_rate": 1.4630446611900493e-05, "loss": 0.0601, "step": 1122 }, { "epoch": 0.3802670842945123, "grad_norm": 0.451171875, "learning_rate": 1.4620520321376814e-05, "loss": 0.0665, "step": 1123 }, { "epoch": 0.38060570146663564, "grad_norm": 0.59375, "learning_rate": 1.4610588239030537e-05, "loss": 0.0776, "step": 1124 }, { "epoch": 0.380944318638759, "grad_norm": 0.474609375, "learning_rate": 1.4600650377311523e-05, "loss": 0.0622, "step": 1125 }, { "epoch": 0.3812829358108823, "grad_norm": 0.515625, "learning_rate": 1.4590706748676886e-05, "loss": 0.0618, "step": 1126 }, { "epoch": 0.38162155298300565, "grad_norm": 0.484375, "learning_rate": 1.4580757365590965e-05, "loss": 0.0694, "step": 1127 }, { "epoch": 0.381960170155129, "grad_norm": 0.486328125, "learning_rate": 1.4570802240525309e-05, "loss": 0.0619, "step": 1128 }, { "epoch": 0.3822987873272523, "grad_norm": 0.609375, "learning_rate": 1.456084138595867e-05, "loss": 0.0651, "step": 1129 }, { "epoch": 0.38263740449937567, "grad_norm": 0.53125, "learning_rate": 1.4550874814376983e-05, "loss": 0.0668, "step": 1130 }, { "epoch": 0.382976021671499, "grad_norm": 0.51171875, "learning_rate": 1.4540902538273343e-05, "loss": 0.0639, "step": 1131 }, { "epoch": 0.3833146388436224, "grad_norm": 0.51171875, "learning_rate": 1.4530924570147998e-05, "loss": 0.0626, "step": 1132 }, { "epoch": 0.3836532560157457, "grad_norm": 0.50390625, "learning_rate": 1.452094092250834e-05, "loss": 0.0724, "step": 1133 }, { "epoch": 0.38399187318786904, "grad_norm": 0.64453125, "learning_rate": 1.451095160786886e-05, "loss": 0.1049, "step": 1134 }, { "epoch": 0.3843304903599924, "grad_norm": 0.921875, "learning_rate": 1.450095663875117e-05, "loss": 0.0733, "step": 1135 }, { "epoch": 0.3846691075321157, "grad_norm": 0.7734375, "learning_rate": 1.449095602768397e-05, "loss": 0.077, "step": 1136 }, { "epoch": 0.38500772470423905, "grad_norm": 0.6640625, "learning_rate": 1.4480949787203015e-05, "loss": 0.0811, "step": 1137 }, { "epoch": 0.3853463418763624, "grad_norm": 0.82421875, "learning_rate": 1.4470937929851142e-05, "loss": 0.0932, "step": 1138 }, { "epoch": 0.38568495904848576, "grad_norm": 0.515625, "learning_rate": 1.4460920468178204e-05, "loss": 0.0565, "step": 1139 }, { "epoch": 0.38602357622060907, "grad_norm": 0.6796875, "learning_rate": 1.4450897414741095e-05, "loss": 0.0705, "step": 1140 }, { "epoch": 0.3863621933927324, "grad_norm": 0.5, "learning_rate": 1.4440868782103711e-05, "loss": 0.064, "step": 1141 }, { "epoch": 0.3867008105648558, "grad_norm": 0.5078125, "learning_rate": 1.443083458283695e-05, "loss": 0.055, "step": 1142 }, { "epoch": 0.38703942773697914, "grad_norm": 0.40234375, "learning_rate": 1.4420794829518674e-05, "loss": 0.0502, "step": 1143 }, { "epoch": 0.38737804490910244, "grad_norm": 0.466796875, "learning_rate": 1.4410749534733719e-05, "loss": 0.0673, "step": 1144 }, { "epoch": 0.3877166620812258, "grad_norm": 0.75, "learning_rate": 1.440069871107386e-05, "loss": 0.0594, "step": 1145 }, { "epoch": 0.38805527925334915, "grad_norm": 0.48046875, "learning_rate": 1.4390642371137807e-05, "loss": 0.059, "step": 1146 }, { "epoch": 0.38839389642547245, "grad_norm": 2.171875, "learning_rate": 1.438058052753118e-05, "loss": 0.099, "step": 1147 }, { "epoch": 0.3887325135975958, "grad_norm": 0.439453125, "learning_rate": 1.4370513192866507e-05, "loss": 0.0603, "step": 1148 }, { "epoch": 0.38907113076971916, "grad_norm": 0.46875, "learning_rate": 1.4360440379763187e-05, "loss": 0.059, "step": 1149 }, { "epoch": 0.3894097479418425, "grad_norm": 0.482421875, "learning_rate": 1.4350362100847495e-05, "loss": 0.0636, "step": 1150 }, { "epoch": 0.3897483651139658, "grad_norm": 0.6171875, "learning_rate": 1.4340278368752553e-05, "loss": 0.0746, "step": 1151 }, { "epoch": 0.3900869822860892, "grad_norm": 0.50390625, "learning_rate": 1.4330189196118323e-05, "loss": 0.0642, "step": 1152 }, { "epoch": 0.39042559945821254, "grad_norm": 0.68359375, "learning_rate": 1.4320094595591578e-05, "loss": 0.0929, "step": 1153 }, { "epoch": 0.3907642166303359, "grad_norm": 0.462890625, "learning_rate": 1.4309994579825908e-05, "loss": 0.0529, "step": 1154 }, { "epoch": 0.3911028338024592, "grad_norm": 0.54296875, "learning_rate": 1.4299889161481676e-05, "loss": 0.0702, "step": 1155 }, { "epoch": 0.39144145097458255, "grad_norm": 0.62890625, "learning_rate": 1.4289778353226032e-05, "loss": 0.0706, "step": 1156 }, { "epoch": 0.3917800681467059, "grad_norm": 0.515625, "learning_rate": 1.4279662167732869e-05, "loss": 0.0684, "step": 1157 }, { "epoch": 0.3921186853188292, "grad_norm": 0.4765625, "learning_rate": 1.4269540617682826e-05, "loss": 0.0706, "step": 1158 }, { "epoch": 0.39245730249095256, "grad_norm": 0.66015625, "learning_rate": 1.4259413715763276e-05, "loss": 0.0789, "step": 1159 }, { "epoch": 0.3927959196630759, "grad_norm": 0.55859375, "learning_rate": 1.4249281474668279e-05, "loss": 0.07, "step": 1160 }, { "epoch": 0.3931345368351993, "grad_norm": 0.671875, "learning_rate": 1.423914390709861e-05, "loss": 0.0906, "step": 1161 }, { "epoch": 0.3934731540073226, "grad_norm": 0.44921875, "learning_rate": 1.4229001025761704e-05, "loss": 0.0536, "step": 1162 }, { "epoch": 0.39381177117944594, "grad_norm": 0.466796875, "learning_rate": 1.4218852843371665e-05, "loss": 0.0557, "step": 1163 }, { "epoch": 0.3941503883515693, "grad_norm": 0.49609375, "learning_rate": 1.4208699372649244e-05, "loss": 0.0668, "step": 1164 }, { "epoch": 0.3944890055236926, "grad_norm": 0.47265625, "learning_rate": 1.4198540626321817e-05, "loss": 0.0609, "step": 1165 }, { "epoch": 0.39482762269581595, "grad_norm": 0.453125, "learning_rate": 1.4188376617123368e-05, "loss": 0.0655, "step": 1166 }, { "epoch": 0.3951662398679393, "grad_norm": 0.484375, "learning_rate": 1.4178207357794486e-05, "loss": 0.0662, "step": 1167 }, { "epoch": 0.39550485704006266, "grad_norm": 0.49609375, "learning_rate": 1.4168032861082344e-05, "loss": 0.0721, "step": 1168 }, { "epoch": 0.39584347421218596, "grad_norm": 0.458984375, "learning_rate": 1.4157853139740665e-05, "loss": 0.0676, "step": 1169 }, { "epoch": 0.3961820913843093, "grad_norm": 0.6015625, "learning_rate": 1.4147668206529737e-05, "loss": 0.0768, "step": 1170 }, { "epoch": 0.3965207085564327, "grad_norm": 0.486328125, "learning_rate": 1.413747807421637e-05, "loss": 0.0727, "step": 1171 }, { "epoch": 0.39685932572855603, "grad_norm": 0.435546875, "learning_rate": 1.4127282755573903e-05, "loss": 0.062, "step": 1172 }, { "epoch": 0.39719794290067933, "grad_norm": 0.5234375, "learning_rate": 1.4117082263382162e-05, "loss": 0.0696, "step": 1173 }, { "epoch": 0.3975365600728027, "grad_norm": 0.45703125, "learning_rate": 1.4106876610427466e-05, "loss": 0.0592, "step": 1174 }, { "epoch": 0.39787517724492605, "grad_norm": 0.484375, "learning_rate": 1.4096665809502607e-05, "loss": 0.0607, "step": 1175 }, { "epoch": 0.39821379441704935, "grad_norm": 0.49609375, "learning_rate": 1.408644987340682e-05, "loss": 0.0619, "step": 1176 }, { "epoch": 0.3985524115891727, "grad_norm": 0.50390625, "learning_rate": 1.4076228814945778e-05, "loss": 0.0537, "step": 1177 }, { "epoch": 0.39889102876129606, "grad_norm": 0.443359375, "learning_rate": 1.4066002646931587e-05, "loss": 0.0623, "step": 1178 }, { "epoch": 0.3992296459334194, "grad_norm": 0.49609375, "learning_rate": 1.4055771382182744e-05, "loss": 0.0812, "step": 1179 }, { "epoch": 0.3995682631055427, "grad_norm": 0.5078125, "learning_rate": 1.404553503352414e-05, "loss": 0.0687, "step": 1180 }, { "epoch": 0.3999068802776661, "grad_norm": 0.45703125, "learning_rate": 1.4035293613787042e-05, "loss": 0.0612, "step": 1181 }, { "epoch": 0.40024549744978943, "grad_norm": 0.55078125, "learning_rate": 1.4025047135809069e-05, "loss": 0.0767, "step": 1182 }, { "epoch": 0.4005841146219128, "grad_norm": 0.57421875, "learning_rate": 1.4014795612434182e-05, "loss": 0.085, "step": 1183 }, { "epoch": 0.4009227317940361, "grad_norm": 0.50390625, "learning_rate": 1.4004539056512667e-05, "loss": 0.0683, "step": 1184 }, { "epoch": 0.4009227317940361, "eval_loss": 0.06940508633852005, "eval_runtime": 816.0342, "eval_samples_per_second": 12.191, "eval_steps_per_second": 3.048, "step": 1184 }, { "epoch": 0.40126134896615945, "grad_norm": 0.494140625, "learning_rate": 1.3994277480901116e-05, "loss": 0.0512, "step": 1185 }, { "epoch": 0.4015999661382828, "grad_norm": 0.56640625, "learning_rate": 1.3984010898462417e-05, "loss": 0.0746, "step": 1186 }, { "epoch": 0.4019385833104061, "grad_norm": 0.515625, "learning_rate": 1.397373932206573e-05, "loss": 0.0604, "step": 1187 }, { "epoch": 0.40227720048252946, "grad_norm": 0.53515625, "learning_rate": 1.3963462764586479e-05, "loss": 0.0658, "step": 1188 }, { "epoch": 0.4026158176546528, "grad_norm": 0.49609375, "learning_rate": 1.3953181238906326e-05, "loss": 0.0674, "step": 1189 }, { "epoch": 0.4029544348267762, "grad_norm": 0.47265625, "learning_rate": 1.3942894757913169e-05, "loss": 0.0628, "step": 1190 }, { "epoch": 0.4032930519988995, "grad_norm": 0.494140625, "learning_rate": 1.3932603334501106e-05, "loss": 0.0657, "step": 1191 }, { "epoch": 0.40363166917102283, "grad_norm": 0.44921875, "learning_rate": 1.3922306981570447e-05, "loss": 0.0588, "step": 1192 }, { "epoch": 0.4039702863431462, "grad_norm": 0.404296875, "learning_rate": 1.3912005712027661e-05, "loss": 0.0558, "step": 1193 }, { "epoch": 0.4043089035152695, "grad_norm": 0.375, "learning_rate": 1.3901699538785398e-05, "loss": 0.0519, "step": 1194 }, { "epoch": 0.40464752068739285, "grad_norm": 0.671875, "learning_rate": 1.3891388474762444e-05, "loss": 0.0755, "step": 1195 }, { "epoch": 0.4049861378595162, "grad_norm": 0.41796875, "learning_rate": 1.388107253288372e-05, "loss": 0.0614, "step": 1196 }, { "epoch": 0.40532475503163956, "grad_norm": 0.7109375, "learning_rate": 1.3870751726080256e-05, "loss": 0.1036, "step": 1197 }, { "epoch": 0.40566337220376286, "grad_norm": 0.349609375, "learning_rate": 1.3860426067289185e-05, "loss": 0.0498, "step": 1198 }, { "epoch": 0.4060019893758862, "grad_norm": 0.43359375, "learning_rate": 1.3850095569453728e-05, "loss": 0.0602, "step": 1199 }, { "epoch": 0.4063406065480096, "grad_norm": 0.59765625, "learning_rate": 1.3839760245523155e-05, "loss": 0.0664, "step": 1200 }, { "epoch": 0.40667922372013293, "grad_norm": 0.6953125, "learning_rate": 1.38294201084528e-05, "loss": 0.089, "step": 1201 }, { "epoch": 0.40701784089225623, "grad_norm": 0.7265625, "learning_rate": 1.3819075171204028e-05, "loss": 0.0595, "step": 1202 }, { "epoch": 0.4073564580643796, "grad_norm": 0.765625, "learning_rate": 1.3808725446744218e-05, "loss": 0.0571, "step": 1203 }, { "epoch": 0.40769507523650295, "grad_norm": 0.431640625, "learning_rate": 1.3798370948046747e-05, "loss": 0.0537, "step": 1204 }, { "epoch": 0.40803369240862625, "grad_norm": 0.50390625, "learning_rate": 1.3788011688090978e-05, "loss": 0.0609, "step": 1205 }, { "epoch": 0.4083723095807496, "grad_norm": 0.48828125, "learning_rate": 1.3777647679862254e-05, "loss": 0.0642, "step": 1206 }, { "epoch": 0.40871092675287296, "grad_norm": 0.625, "learning_rate": 1.3767278936351853e-05, "loss": 0.0814, "step": 1207 }, { "epoch": 0.4090495439249963, "grad_norm": 0.57421875, "learning_rate": 1.3756905470556996e-05, "loss": 0.099, "step": 1208 }, { "epoch": 0.4093881610971196, "grad_norm": 0.4375, "learning_rate": 1.3746527295480825e-05, "loss": 0.0597, "step": 1209 }, { "epoch": 0.409726778269243, "grad_norm": 0.50390625, "learning_rate": 1.3736144424132383e-05, "loss": 0.0622, "step": 1210 }, { "epoch": 0.41006539544136633, "grad_norm": 0.6328125, "learning_rate": 1.3725756869526598e-05, "loss": 0.0794, "step": 1211 }, { "epoch": 0.4104040126134897, "grad_norm": 0.48828125, "learning_rate": 1.3715364644684273e-05, "loss": 0.0525, "step": 1212 }, { "epoch": 0.410742629785613, "grad_norm": 0.51171875, "learning_rate": 1.370496776263206e-05, "loss": 0.064, "step": 1213 }, { "epoch": 0.41108124695773635, "grad_norm": 0.828125, "learning_rate": 1.3694566236402458e-05, "loss": 0.0815, "step": 1214 }, { "epoch": 0.4114198641298597, "grad_norm": 0.48046875, "learning_rate": 1.3684160079033772e-05, "loss": 0.0638, "step": 1215 }, { "epoch": 0.411758481301983, "grad_norm": 0.56640625, "learning_rate": 1.3673749303570127e-05, "loss": 0.0801, "step": 1216 }, { "epoch": 0.41209709847410636, "grad_norm": 0.5703125, "learning_rate": 1.366333392306143e-05, "loss": 0.0628, "step": 1217 }, { "epoch": 0.4124357156462297, "grad_norm": 0.46875, "learning_rate": 1.3652913950563362e-05, "loss": 0.062, "step": 1218 }, { "epoch": 0.4127743328183531, "grad_norm": 0.7265625, "learning_rate": 1.3642489399137358e-05, "loss": 0.079, "step": 1219 }, { "epoch": 0.4131129499904764, "grad_norm": 0.50390625, "learning_rate": 1.3632060281850593e-05, "loss": 0.0634, "step": 1220 }, { "epoch": 0.41345156716259973, "grad_norm": 0.3984375, "learning_rate": 1.3621626611775966e-05, "loss": 0.0488, "step": 1221 }, { "epoch": 0.4137901843347231, "grad_norm": 0.5625, "learning_rate": 1.3611188401992087e-05, "loss": 0.0813, "step": 1222 }, { "epoch": 0.4141288015068464, "grad_norm": 0.91015625, "learning_rate": 1.360074566558325e-05, "loss": 0.0672, "step": 1223 }, { "epoch": 0.41446741867896975, "grad_norm": 0.57421875, "learning_rate": 1.3590298415639427e-05, "loss": 0.0753, "step": 1224 }, { "epoch": 0.4148060358510931, "grad_norm": 0.462890625, "learning_rate": 1.3579846665256244e-05, "loss": 0.0703, "step": 1225 }, { "epoch": 0.41514465302321646, "grad_norm": 0.72265625, "learning_rate": 1.3569390427534976e-05, "loss": 0.0963, "step": 1226 }, { "epoch": 0.41548327019533976, "grad_norm": 0.5859375, "learning_rate": 1.3558929715582517e-05, "loss": 0.07, "step": 1227 }, { "epoch": 0.4158218873674631, "grad_norm": 0.47265625, "learning_rate": 1.3548464542511364e-05, "loss": 0.0673, "step": 1228 }, { "epoch": 0.4161605045395865, "grad_norm": 0.57421875, "learning_rate": 1.353799492143962e-05, "loss": 0.0684, "step": 1229 }, { "epoch": 0.41649912171170983, "grad_norm": 0.609375, "learning_rate": 1.352752086549095e-05, "loss": 0.0755, "step": 1230 }, { "epoch": 0.41683773888383313, "grad_norm": 0.53125, "learning_rate": 1.3517042387794585e-05, "loss": 0.0698, "step": 1231 }, { "epoch": 0.4171763560559565, "grad_norm": 0.5, "learning_rate": 1.3506559501485304e-05, "loss": 0.058, "step": 1232 }, { "epoch": 0.41751497322807984, "grad_norm": 0.640625, "learning_rate": 1.3496072219703399e-05, "loss": 0.0792, "step": 1233 }, { "epoch": 0.41785359040020315, "grad_norm": 0.4296875, "learning_rate": 1.3485580555594679e-05, "loss": 0.0639, "step": 1234 }, { "epoch": 0.4181922075723265, "grad_norm": 0.66796875, "learning_rate": 1.3475084522310451e-05, "loss": 0.0783, "step": 1235 }, { "epoch": 0.41853082474444986, "grad_norm": 0.64453125, "learning_rate": 1.3464584133007486e-05, "loss": 0.0711, "step": 1236 }, { "epoch": 0.4188694419165732, "grad_norm": 0.58203125, "learning_rate": 1.3454079400848029e-05, "loss": 0.0688, "step": 1237 }, { "epoch": 0.4192080590886965, "grad_norm": 0.486328125, "learning_rate": 1.3443570338999759e-05, "loss": 0.0748, "step": 1238 }, { "epoch": 0.4195466762608199, "grad_norm": 0.56640625, "learning_rate": 1.3433056960635788e-05, "loss": 0.0767, "step": 1239 }, { "epoch": 0.41988529343294323, "grad_norm": 0.5546875, "learning_rate": 1.3422539278934637e-05, "loss": 0.0543, "step": 1240 }, { "epoch": 0.4202239106050666, "grad_norm": 0.81640625, "learning_rate": 1.341201730708022e-05, "loss": 0.0847, "step": 1241 }, { "epoch": 0.4205625277771899, "grad_norm": 0.58984375, "learning_rate": 1.3401491058261829e-05, "loss": 0.0803, "step": 1242 }, { "epoch": 0.42090114494931324, "grad_norm": 0.435546875, "learning_rate": 1.3390960545674117e-05, "loss": 0.058, "step": 1243 }, { "epoch": 0.4212397621214366, "grad_norm": 0.53515625, "learning_rate": 1.3380425782517084e-05, "loss": 0.0666, "step": 1244 }, { "epoch": 0.4215783792935599, "grad_norm": 0.63671875, "learning_rate": 1.3369886781996056e-05, "loss": 0.0741, "step": 1245 }, { "epoch": 0.42191699646568326, "grad_norm": 0.453125, "learning_rate": 1.335934355732167e-05, "loss": 0.052, "step": 1246 }, { "epoch": 0.4222556136378066, "grad_norm": 0.52734375, "learning_rate": 1.3348796121709862e-05, "loss": 0.0629, "step": 1247 }, { "epoch": 0.42259423080992997, "grad_norm": 0.43359375, "learning_rate": 1.3338244488381843e-05, "loss": 0.0573, "step": 1248 }, { "epoch": 0.4229328479820533, "grad_norm": 0.40625, "learning_rate": 1.332768867056408e-05, "loss": 0.0526, "step": 1249 }, { "epoch": 0.42327146515417663, "grad_norm": 0.470703125, "learning_rate": 1.3317128681488301e-05, "loss": 0.0641, "step": 1250 }, { "epoch": 0.4236100823263, "grad_norm": 0.5078125, "learning_rate": 1.3306564534391447e-05, "loss": 0.0617, "step": 1251 }, { "epoch": 0.4239486994984233, "grad_norm": 0.91796875, "learning_rate": 1.3295996242515679e-05, "loss": 0.0626, "step": 1252 }, { "epoch": 0.42428731667054664, "grad_norm": 0.46875, "learning_rate": 1.3285423819108349e-05, "loss": 0.0596, "step": 1253 }, { "epoch": 0.42462593384267, "grad_norm": 0.375, "learning_rate": 1.3274847277421997e-05, "loss": 0.0488, "step": 1254 }, { "epoch": 0.42496455101479336, "grad_norm": 0.42578125, "learning_rate": 1.3264266630714308e-05, "loss": 0.0614, "step": 1255 }, { "epoch": 0.42530316818691666, "grad_norm": 0.455078125, "learning_rate": 1.3253681892248136e-05, "loss": 0.0623, "step": 1256 }, { "epoch": 0.42564178535904, "grad_norm": 1.234375, "learning_rate": 1.3243093075291444e-05, "loss": 0.0729, "step": 1257 }, { "epoch": 0.42598040253116337, "grad_norm": 0.4609375, "learning_rate": 1.3232500193117318e-05, "loss": 0.0576, "step": 1258 }, { "epoch": 0.42631901970328673, "grad_norm": 0.58984375, "learning_rate": 1.3221903259003935e-05, "loss": 0.0782, "step": 1259 }, { "epoch": 0.42665763687541003, "grad_norm": 0.462890625, "learning_rate": 1.3211302286234553e-05, "loss": 0.0638, "step": 1260 }, { "epoch": 0.4269962540475334, "grad_norm": 0.431640625, "learning_rate": 1.3200697288097492e-05, "loss": 0.0536, "step": 1261 }, { "epoch": 0.42733487121965674, "grad_norm": 0.55078125, "learning_rate": 1.3190088277886119e-05, "loss": 0.073, "step": 1262 }, { "epoch": 0.42767348839178004, "grad_norm": 0.416015625, "learning_rate": 1.3179475268898828e-05, "loss": 0.0467, "step": 1263 }, { "epoch": 0.4280121055639034, "grad_norm": 0.361328125, "learning_rate": 1.316885827443903e-05, "loss": 0.0436, "step": 1264 }, { "epoch": 0.42835072273602676, "grad_norm": 0.55078125, "learning_rate": 1.3158237307815122e-05, "loss": 0.0731, "step": 1265 }, { "epoch": 0.4286893399081501, "grad_norm": 0.54296875, "learning_rate": 1.3147612382340493e-05, "loss": 0.0736, "step": 1266 }, { "epoch": 0.4290279570802734, "grad_norm": 0.59375, "learning_rate": 1.3136983511333483e-05, "loss": 0.0812, "step": 1267 }, { "epoch": 0.42936657425239677, "grad_norm": 0.51171875, "learning_rate": 1.3126350708117387e-05, "loss": 0.0731, "step": 1268 }, { "epoch": 0.4297051914245201, "grad_norm": 0.5078125, "learning_rate": 1.3115713986020421e-05, "loss": 0.0622, "step": 1269 }, { "epoch": 0.4300438085966435, "grad_norm": 0.388671875, "learning_rate": 1.3105073358375719e-05, "loss": 0.0519, "step": 1270 }, { "epoch": 0.4303824257687668, "grad_norm": 0.75390625, "learning_rate": 1.309442883852131e-05, "loss": 0.0535, "step": 1271 }, { "epoch": 0.43072104294089014, "grad_norm": 0.53515625, "learning_rate": 1.30837804398001e-05, "loss": 0.0714, "step": 1272 }, { "epoch": 0.4310596601130135, "grad_norm": 0.51953125, "learning_rate": 1.3073128175559852e-05, "loss": 0.0685, "step": 1273 }, { "epoch": 0.4313982772851368, "grad_norm": 0.5390625, "learning_rate": 1.3062472059153185e-05, "loss": 0.0976, "step": 1274 }, { "epoch": 0.43173689445726016, "grad_norm": 0.6171875, "learning_rate": 1.3051812103937545e-05, "loss": 0.0825, "step": 1275 }, { "epoch": 0.4320755116293835, "grad_norm": 0.44921875, "learning_rate": 1.3041148323275182e-05, "loss": 0.0525, "step": 1276 }, { "epoch": 0.43241412880150687, "grad_norm": 0.4609375, "learning_rate": 1.3030480730533146e-05, "loss": 0.0686, "step": 1277 }, { "epoch": 0.43275274597363017, "grad_norm": 0.59375, "learning_rate": 1.3019809339083262e-05, "loss": 0.0872, "step": 1278 }, { "epoch": 0.4330913631457535, "grad_norm": 0.62109375, "learning_rate": 1.3009134162302131e-05, "loss": 0.0991, "step": 1279 }, { "epoch": 0.4334299803178769, "grad_norm": 0.4921875, "learning_rate": 1.299845521357108e-05, "loss": 0.0583, "step": 1280 }, { "epoch": 0.4337685974900002, "grad_norm": 0.5703125, "learning_rate": 1.2987772506276173e-05, "loss": 0.0625, "step": 1281 }, { "epoch": 0.43410721466212354, "grad_norm": 0.4765625, "learning_rate": 1.2977086053808183e-05, "loss": 0.0614, "step": 1282 }, { "epoch": 0.4344458318342469, "grad_norm": 0.55078125, "learning_rate": 1.2966395869562582e-05, "loss": 0.0513, "step": 1283 }, { "epoch": 0.43478444900637025, "grad_norm": 0.486328125, "learning_rate": 1.2955701966939517e-05, "loss": 0.0637, "step": 1284 }, { "epoch": 0.43512306617849356, "grad_norm": 0.453125, "learning_rate": 1.2945004359343794e-05, "loss": 0.0661, "step": 1285 }, { "epoch": 0.4354616833506169, "grad_norm": 0.58984375, "learning_rate": 1.2934303060184865e-05, "loss": 0.0694, "step": 1286 }, { "epoch": 0.43580030052274027, "grad_norm": 0.40625, "learning_rate": 1.2923598082876811e-05, "loss": 0.0542, "step": 1287 }, { "epoch": 0.4361389176948636, "grad_norm": 0.419921875, "learning_rate": 1.291288944083832e-05, "loss": 0.0567, "step": 1288 }, { "epoch": 0.4364775348669869, "grad_norm": 0.494140625, "learning_rate": 1.2902177147492677e-05, "loss": 0.0662, "step": 1289 }, { "epoch": 0.4368161520391103, "grad_norm": 0.51953125, "learning_rate": 1.2891461216267742e-05, "loss": 0.0785, "step": 1290 }, { "epoch": 0.43715476921123364, "grad_norm": 0.37890625, "learning_rate": 1.2880741660595936e-05, "loss": 0.0521, "step": 1291 }, { "epoch": 0.43749338638335694, "grad_norm": 0.412109375, "learning_rate": 1.2870018493914227e-05, "loss": 0.0479, "step": 1292 }, { "epoch": 0.4378320035554803, "grad_norm": 0.4765625, "learning_rate": 1.2859291729664094e-05, "loss": 0.0694, "step": 1293 }, { "epoch": 0.43817062072760365, "grad_norm": 0.76953125, "learning_rate": 1.2848561381291547e-05, "loss": 0.0655, "step": 1294 }, { "epoch": 0.438509237899727, "grad_norm": 0.609375, "learning_rate": 1.2837827462247077e-05, "loss": 0.0711, "step": 1295 }, { "epoch": 0.4388478550718503, "grad_norm": 0.72265625, "learning_rate": 1.2827089985985647e-05, "loss": 0.1055, "step": 1296 }, { "epoch": 0.43918647224397367, "grad_norm": 0.55859375, "learning_rate": 1.2816348965966693e-05, "loss": 0.053, "step": 1297 }, { "epoch": 0.439525089416097, "grad_norm": 0.46875, "learning_rate": 1.2805604415654076e-05, "loss": 0.0567, "step": 1298 }, { "epoch": 0.4398637065882204, "grad_norm": 0.57421875, "learning_rate": 1.2794856348516095e-05, "loss": 0.0856, "step": 1299 }, { "epoch": 0.4402023237603437, "grad_norm": 0.69140625, "learning_rate": 1.2784104778025455e-05, "loss": 0.0913, "step": 1300 }, { "epoch": 0.44054094093246704, "grad_norm": 0.46875, "learning_rate": 1.2773349717659245e-05, "loss": 0.0607, "step": 1301 }, { "epoch": 0.4408795581045904, "grad_norm": 0.76953125, "learning_rate": 1.2762591180898938e-05, "loss": 0.1013, "step": 1302 }, { "epoch": 0.4412181752767137, "grad_norm": 0.5390625, "learning_rate": 1.2751829181230364e-05, "loss": 0.0565, "step": 1303 }, { "epoch": 0.44155679244883705, "grad_norm": 0.46875, "learning_rate": 1.274106373214368e-05, "loss": 0.0702, "step": 1304 }, { "epoch": 0.4418954096209604, "grad_norm": 0.490234375, "learning_rate": 1.2730294847133386e-05, "loss": 0.0666, "step": 1305 }, { "epoch": 0.44223402679308377, "grad_norm": 1.0546875, "learning_rate": 1.2719522539698277e-05, "loss": 0.0646, "step": 1306 }, { "epoch": 0.44257264396520707, "grad_norm": 0.58203125, "learning_rate": 1.2708746823341444e-05, "loss": 0.0871, "step": 1307 }, { "epoch": 0.4429112611373304, "grad_norm": 0.5390625, "learning_rate": 1.2697967711570243e-05, "loss": 0.0495, "step": 1308 }, { "epoch": 0.4432498783094538, "grad_norm": 0.5234375, "learning_rate": 1.2687185217896297e-05, "loss": 0.0733, "step": 1309 }, { "epoch": 0.4435884954815771, "grad_norm": 0.5625, "learning_rate": 1.267639935583546e-05, "loss": 0.072, "step": 1310 }, { "epoch": 0.44392711265370044, "grad_norm": 0.55078125, "learning_rate": 1.2665610138907813e-05, "loss": 0.0742, "step": 1311 }, { "epoch": 0.4442657298258238, "grad_norm": 0.73046875, "learning_rate": 1.2654817580637637e-05, "loss": 0.1116, "step": 1312 }, { "epoch": 0.44460434699794715, "grad_norm": 0.43359375, "learning_rate": 1.264402169455341e-05, "loss": 0.0562, "step": 1313 }, { "epoch": 0.44494296417007045, "grad_norm": 0.408203125, "learning_rate": 1.263322249418777e-05, "loss": 0.052, "step": 1314 }, { "epoch": 0.4452815813421938, "grad_norm": 0.56640625, "learning_rate": 1.2622419993077518e-05, "loss": 0.0801, "step": 1315 }, { "epoch": 0.44562019851431717, "grad_norm": 0.46484375, "learning_rate": 1.2611614204763587e-05, "loss": 0.0588, "step": 1316 }, { "epoch": 0.4459588156864405, "grad_norm": 0.515625, "learning_rate": 1.2600805142791042e-05, "loss": 0.0619, "step": 1317 }, { "epoch": 0.4462974328585638, "grad_norm": 0.54296875, "learning_rate": 1.2589992820709033e-05, "loss": 0.0616, "step": 1318 }, { "epoch": 0.4466360500306872, "grad_norm": 0.78515625, "learning_rate": 1.2579177252070815e-05, "loss": 0.0718, "step": 1319 }, { "epoch": 0.44697466720281054, "grad_norm": 0.439453125, "learning_rate": 1.2568358450433698e-05, "loss": 0.0587, "step": 1320 }, { "epoch": 0.44731328437493384, "grad_norm": 0.408203125, "learning_rate": 1.2557536429359054e-05, "loss": 0.0561, "step": 1321 }, { "epoch": 0.4476519015470572, "grad_norm": 0.474609375, "learning_rate": 1.2546711202412287e-05, "loss": 0.0559, "step": 1322 }, { "epoch": 0.44799051871918055, "grad_norm": 0.41015625, "learning_rate": 1.2535882783162823e-05, "loss": 0.0528, "step": 1323 }, { "epoch": 0.4483291358913039, "grad_norm": 0.38671875, "learning_rate": 1.2525051185184078e-05, "loss": 0.0451, "step": 1324 }, { "epoch": 0.4486677530634272, "grad_norm": 0.421875, "learning_rate": 1.2514216422053468e-05, "loss": 0.0545, "step": 1325 }, { "epoch": 0.44900637023555057, "grad_norm": 0.48828125, "learning_rate": 1.2503378507352365e-05, "loss": 0.071, "step": 1326 }, { "epoch": 0.4493449874076739, "grad_norm": 0.87890625, "learning_rate": 1.24925374546661e-05, "loss": 0.0632, "step": 1327 }, { "epoch": 0.4496836045797973, "grad_norm": 0.58203125, "learning_rate": 1.2481693277583932e-05, "loss": 0.0858, "step": 1328 }, { "epoch": 0.4500222217519206, "grad_norm": 0.5546875, "learning_rate": 1.2470845989699036e-05, "loss": 0.0668, "step": 1329 }, { "epoch": 0.45036083892404394, "grad_norm": 0.47265625, "learning_rate": 1.2459995604608493e-05, "loss": 0.066, "step": 1330 }, { "epoch": 0.4506994560961673, "grad_norm": 0.56640625, "learning_rate": 1.2449142135913254e-05, "loss": 0.0731, "step": 1331 }, { "epoch": 0.4510380732682906, "grad_norm": 0.46484375, "learning_rate": 1.243828559721815e-05, "loss": 0.0638, "step": 1332 }, { "epoch": 0.45137669044041395, "grad_norm": 0.6171875, "learning_rate": 1.2427426002131848e-05, "loss": 0.0645, "step": 1333 }, { "epoch": 0.4517153076125373, "grad_norm": 0.640625, "learning_rate": 1.2416563364266859e-05, "loss": 0.0873, "step": 1334 }, { "epoch": 0.45205392478466067, "grad_norm": 0.53515625, "learning_rate": 1.240569769723949e-05, "loss": 0.0606, "step": 1335 }, { "epoch": 0.45239254195678397, "grad_norm": 0.58984375, "learning_rate": 1.2394829014669863e-05, "loss": 0.0785, "step": 1336 }, { "epoch": 0.4527311591289073, "grad_norm": 0.5, "learning_rate": 1.238395733018187e-05, "loss": 0.0489, "step": 1337 }, { "epoch": 0.4530697763010307, "grad_norm": 0.44921875, "learning_rate": 1.2373082657403168e-05, "loss": 0.0622, "step": 1338 }, { "epoch": 0.453408393473154, "grad_norm": 0.5390625, "learning_rate": 1.236220500996516e-05, "loss": 0.0688, "step": 1339 }, { "epoch": 0.45374701064527734, "grad_norm": 0.4765625, "learning_rate": 1.235132440150298e-05, "loss": 0.0579, "step": 1340 }, { "epoch": 0.4540856278174007, "grad_norm": 0.494140625, "learning_rate": 1.234044084565547e-05, "loss": 0.0592, "step": 1341 }, { "epoch": 0.45442424498952405, "grad_norm": 0.46875, "learning_rate": 1.232955435606517e-05, "loss": 0.0614, "step": 1342 }, { "epoch": 0.45476286216164735, "grad_norm": 0.671875, "learning_rate": 1.2318664946378292e-05, "loss": 0.0752, "step": 1343 }, { "epoch": 0.4551014793337707, "grad_norm": 0.53515625, "learning_rate": 1.2307772630244715e-05, "loss": 0.0526, "step": 1344 }, { "epoch": 0.45544009650589407, "grad_norm": 0.55859375, "learning_rate": 1.2296877421317958e-05, "loss": 0.0691, "step": 1345 }, { "epoch": 0.4557787136780174, "grad_norm": 0.59765625, "learning_rate": 1.2285979333255165e-05, "loss": 0.0796, "step": 1346 }, { "epoch": 0.4561173308501407, "grad_norm": 0.447265625, "learning_rate": 1.227507837971709e-05, "loss": 0.0538, "step": 1347 }, { "epoch": 0.4564559480222641, "grad_norm": 0.515625, "learning_rate": 1.2264174574368079e-05, "loss": 0.0668, "step": 1348 }, { "epoch": 0.45679456519438744, "grad_norm": 0.46484375, "learning_rate": 1.2253267930876056e-05, "loss": 0.0635, "step": 1349 }, { "epoch": 0.45713318236651074, "grad_norm": 0.63671875, "learning_rate": 1.2242358462912496e-05, "loss": 0.0826, "step": 1350 }, { "epoch": 0.4574717995386341, "grad_norm": 0.458984375, "learning_rate": 1.2231446184152419e-05, "loss": 0.0538, "step": 1351 }, { "epoch": 0.45781041671075745, "grad_norm": 0.42578125, "learning_rate": 1.2220531108274367e-05, "loss": 0.0596, "step": 1352 }, { "epoch": 0.4581490338828808, "grad_norm": 0.53125, "learning_rate": 1.220961324896039e-05, "loss": 0.0715, "step": 1353 }, { "epoch": 0.4584876510550041, "grad_norm": 0.58203125, "learning_rate": 1.2198692619896026e-05, "loss": 0.0625, "step": 1354 }, { "epoch": 0.45882626822712747, "grad_norm": 0.453125, "learning_rate": 1.218776923477028e-05, "loss": 0.0606, "step": 1355 }, { "epoch": 0.4591648853992508, "grad_norm": 0.388671875, "learning_rate": 1.2176843107275624e-05, "loss": 0.0471, "step": 1356 }, { "epoch": 0.4595035025713742, "grad_norm": 0.83984375, "learning_rate": 1.2165914251107953e-05, "loss": 0.0775, "step": 1357 }, { "epoch": 0.4598421197434975, "grad_norm": 0.54296875, "learning_rate": 1.215498267996659e-05, "loss": 0.0786, "step": 1358 }, { "epoch": 0.46018073691562084, "grad_norm": 0.51171875, "learning_rate": 1.214404840755426e-05, "loss": 0.0682, "step": 1359 }, { "epoch": 0.4605193540877442, "grad_norm": 0.494140625, "learning_rate": 1.2133111447577077e-05, "loss": 0.0661, "step": 1360 }, { "epoch": 0.4608579712598675, "grad_norm": 0.62109375, "learning_rate": 1.2122171813744522e-05, "loss": 0.0905, "step": 1361 }, { "epoch": 0.46119658843199085, "grad_norm": 0.4375, "learning_rate": 1.2111229519769421e-05, "loss": 0.0615, "step": 1362 }, { "epoch": 0.4615352056041142, "grad_norm": 0.5234375, "learning_rate": 1.2100284579367947e-05, "loss": 0.0636, "step": 1363 }, { "epoch": 0.46187382277623756, "grad_norm": 0.44140625, "learning_rate": 1.2089337006259581e-05, "loss": 0.0617, "step": 1364 }, { "epoch": 0.46221243994836086, "grad_norm": 0.51953125, "learning_rate": 1.2078386814167106e-05, "loss": 0.0522, "step": 1365 }, { "epoch": 0.4625510571204842, "grad_norm": 0.59765625, "learning_rate": 1.2067434016816591e-05, "loss": 0.0824, "step": 1366 }, { "epoch": 0.4628896742926076, "grad_norm": 0.578125, "learning_rate": 1.2056478627937364e-05, "loss": 0.0736, "step": 1367 }, { "epoch": 0.4632282914647309, "grad_norm": 0.47265625, "learning_rate": 1.2045520661262011e-05, "loss": 0.0579, "step": 1368 }, { "epoch": 0.46356690863685424, "grad_norm": 0.765625, "learning_rate": 1.2034560130526341e-05, "loss": 0.0649, "step": 1369 }, { "epoch": 0.4639055258089776, "grad_norm": 0.5234375, "learning_rate": 1.2023597049469378e-05, "loss": 0.0666, "step": 1370 }, { "epoch": 0.46424414298110095, "grad_norm": 0.47265625, "learning_rate": 1.201263143183335e-05, "loss": 0.0632, "step": 1371 }, { "epoch": 0.46458276015322425, "grad_norm": 0.5078125, "learning_rate": 1.2001663291363661e-05, "loss": 0.0655, "step": 1372 }, { "epoch": 0.4649213773253476, "grad_norm": 0.451171875, "learning_rate": 1.199069264180887e-05, "loss": 0.0583, "step": 1373 }, { "epoch": 0.46525999449747096, "grad_norm": 0.5625, "learning_rate": 1.1979719496920686e-05, "loss": 0.0851, "step": 1374 }, { "epoch": 0.4655986116695943, "grad_norm": 0.67578125, "learning_rate": 1.1968743870453956e-05, "loss": 0.0895, "step": 1375 }, { "epoch": 0.4659372288417176, "grad_norm": 0.392578125, "learning_rate": 1.195776577616662e-05, "loss": 0.0533, "step": 1376 }, { "epoch": 0.466275846013841, "grad_norm": 0.52734375, "learning_rate": 1.1946785227819726e-05, "loss": 0.0661, "step": 1377 }, { "epoch": 0.46661446318596433, "grad_norm": 0.5, "learning_rate": 1.1935802239177387e-05, "loss": 0.0636, "step": 1378 }, { "epoch": 0.46695308035808764, "grad_norm": 0.435546875, "learning_rate": 1.1924816824006787e-05, "loss": 0.0596, "step": 1379 }, { "epoch": 0.467291697530211, "grad_norm": 0.443359375, "learning_rate": 1.1913828996078136e-05, "loss": 0.054, "step": 1380 }, { "epoch": 0.46763031470233435, "grad_norm": 0.494140625, "learning_rate": 1.1902838769164685e-05, "loss": 0.0634, "step": 1381 }, { "epoch": 0.4679689318744577, "grad_norm": 0.63671875, "learning_rate": 1.1891846157042678e-05, "loss": 0.0675, "step": 1382 }, { "epoch": 0.468307549046581, "grad_norm": 0.59765625, "learning_rate": 1.1880851173491361e-05, "loss": 0.0691, "step": 1383 }, { "epoch": 0.46864616621870436, "grad_norm": 0.71484375, "learning_rate": 1.1869853832292944e-05, "loss": 0.1164, "step": 1384 }, { "epoch": 0.4689847833908277, "grad_norm": 0.74609375, "learning_rate": 1.1858854147232595e-05, "loss": 0.0892, "step": 1385 }, { "epoch": 0.4693234005629511, "grad_norm": 0.5703125, "learning_rate": 1.184785213209842e-05, "loss": 0.0762, "step": 1386 }, { "epoch": 0.4696620177350744, "grad_norm": 0.515625, "learning_rate": 1.1836847800681443e-05, "loss": 0.0613, "step": 1387 }, { "epoch": 0.47000063490719773, "grad_norm": 1.1796875, "learning_rate": 1.1825841166775605e-05, "loss": 0.0655, "step": 1388 }, { "epoch": 0.4703392520793211, "grad_norm": 0.42578125, "learning_rate": 1.181483224417771e-05, "loss": 0.052, "step": 1389 }, { "epoch": 0.4706778692514444, "grad_norm": 0.515625, "learning_rate": 1.180382104668745e-05, "loss": 0.045, "step": 1390 }, { "epoch": 0.47101648642356775, "grad_norm": 0.41796875, "learning_rate": 1.1792807588107358e-05, "loss": 0.0532, "step": 1391 }, { "epoch": 0.4713551035956911, "grad_norm": 0.890625, "learning_rate": 1.1781791882242811e-05, "loss": 0.0719, "step": 1392 }, { "epoch": 0.47169372076781446, "grad_norm": 0.55078125, "learning_rate": 1.177077394290199e-05, "loss": 0.0679, "step": 1393 }, { "epoch": 0.47203233793993776, "grad_norm": 0.56640625, "learning_rate": 1.175975378389589e-05, "loss": 0.0782, "step": 1394 }, { "epoch": 0.4723709551120611, "grad_norm": 0.419921875, "learning_rate": 1.1748731419038278e-05, "loss": 0.0547, "step": 1395 }, { "epoch": 0.4727095722841845, "grad_norm": 0.609375, "learning_rate": 1.1737706862145688e-05, "loss": 0.0719, "step": 1396 }, { "epoch": 0.4730481894563078, "grad_norm": 0.431640625, "learning_rate": 1.1726680127037403e-05, "loss": 0.063, "step": 1397 }, { "epoch": 0.47338680662843113, "grad_norm": 0.490234375, "learning_rate": 1.1715651227535441e-05, "loss": 0.0681, "step": 1398 }, { "epoch": 0.4737254238005545, "grad_norm": 0.494140625, "learning_rate": 1.170462017746452e-05, "loss": 0.0679, "step": 1399 }, { "epoch": 0.47406404097267785, "grad_norm": 0.56640625, "learning_rate": 1.169358699065207e-05, "loss": 0.0749, "step": 1400 }, { "epoch": 0.47440265814480115, "grad_norm": 0.51171875, "learning_rate": 1.1682551680928189e-05, "loss": 0.0639, "step": 1401 }, { "epoch": 0.4747412753169245, "grad_norm": 0.52734375, "learning_rate": 1.1671514262125638e-05, "loss": 0.07, "step": 1402 }, { "epoch": 0.47507989248904786, "grad_norm": 0.458984375, "learning_rate": 1.1660474748079823e-05, "loss": 0.0539, "step": 1403 }, { "epoch": 0.4754185096611712, "grad_norm": 0.53515625, "learning_rate": 1.1649433152628775e-05, "loss": 0.0699, "step": 1404 }, { "epoch": 0.4757571268332945, "grad_norm": 0.60546875, "learning_rate": 1.1638389489613133e-05, "loss": 0.0785, "step": 1405 }, { "epoch": 0.4760957440054179, "grad_norm": 0.5234375, "learning_rate": 1.1627343772876133e-05, "loss": 0.0577, "step": 1406 }, { "epoch": 0.47643436117754123, "grad_norm": 0.48828125, "learning_rate": 1.1616296016263581e-05, "loss": 0.0617, "step": 1407 }, { "epoch": 0.47677297834966453, "grad_norm": 0.498046875, "learning_rate": 1.1605246233623843e-05, "loss": 0.0687, "step": 1408 }, { "epoch": 0.4771115955217879, "grad_norm": 0.546875, "learning_rate": 1.1594194438807817e-05, "loss": 0.0702, "step": 1409 }, { "epoch": 0.47745021269391125, "grad_norm": 0.515625, "learning_rate": 1.1583140645668933e-05, "loss": 0.0706, "step": 1410 }, { "epoch": 0.4777888298660346, "grad_norm": 0.546875, "learning_rate": 1.157208486806312e-05, "loss": 0.0633, "step": 1411 }, { "epoch": 0.4781274470381579, "grad_norm": 0.419921875, "learning_rate": 1.1561027119848793e-05, "loss": 0.0517, "step": 1412 }, { "epoch": 0.47846606421028126, "grad_norm": 0.5078125, "learning_rate": 1.1549967414886847e-05, "loss": 0.073, "step": 1413 }, { "epoch": 0.4788046813824046, "grad_norm": 0.578125, "learning_rate": 1.153890576704062e-05, "loss": 0.0749, "step": 1414 }, { "epoch": 0.4791432985545279, "grad_norm": 0.498046875, "learning_rate": 1.1527842190175886e-05, "loss": 0.0569, "step": 1415 }, { "epoch": 0.4794819157266513, "grad_norm": 0.5703125, "learning_rate": 1.1516776698160841e-05, "loss": 0.0752, "step": 1416 }, { "epoch": 0.47982053289877463, "grad_norm": 0.515625, "learning_rate": 1.1505709304866084e-05, "loss": 0.0677, "step": 1417 }, { "epoch": 0.480159150070898, "grad_norm": 0.43359375, "learning_rate": 1.1494640024164587e-05, "loss": 0.0518, "step": 1418 }, { "epoch": 0.4804977672430213, "grad_norm": 0.4296875, "learning_rate": 1.14835688699317e-05, "loss": 0.055, "step": 1419 }, { "epoch": 0.48083638441514465, "grad_norm": 0.5625, "learning_rate": 1.1472495856045112e-05, "loss": 0.073, "step": 1420 }, { "epoch": 0.481175001587268, "grad_norm": 0.51953125, "learning_rate": 1.1461420996384849e-05, "loss": 0.0762, "step": 1421 }, { "epoch": 0.48151361875939136, "grad_norm": 0.470703125, "learning_rate": 1.1450344304833248e-05, "loss": 0.0513, "step": 1422 }, { "epoch": 0.48185223593151466, "grad_norm": 0.6015625, "learning_rate": 1.1439265795274941e-05, "loss": 0.0863, "step": 1423 }, { "epoch": 0.482190853103638, "grad_norm": 0.45703125, "learning_rate": 1.142818548159684e-05, "loss": 0.0618, "step": 1424 }, { "epoch": 0.4825294702757614, "grad_norm": 0.59765625, "learning_rate": 1.1417103377688121e-05, "loss": 0.0715, "step": 1425 }, { "epoch": 0.4828680874478847, "grad_norm": 0.47265625, "learning_rate": 1.1406019497440206e-05, "loss": 0.0583, "step": 1426 }, { "epoch": 0.48320670462000803, "grad_norm": 0.578125, "learning_rate": 1.1394933854746733e-05, "loss": 0.078, "step": 1427 }, { "epoch": 0.4835453217921314, "grad_norm": 0.5, "learning_rate": 1.1383846463503558e-05, "loss": 0.0681, "step": 1428 }, { "epoch": 0.48388393896425475, "grad_norm": 0.58984375, "learning_rate": 1.1372757337608732e-05, "loss": 0.0879, "step": 1429 }, { "epoch": 0.48422255613637805, "grad_norm": 0.53125, "learning_rate": 1.1361666490962468e-05, "loss": 0.0716, "step": 1430 }, { "epoch": 0.4845611733085014, "grad_norm": 0.5546875, "learning_rate": 1.1350573937467147e-05, "loss": 0.0754, "step": 1431 }, { "epoch": 0.48489979048062476, "grad_norm": 0.439453125, "learning_rate": 1.1339479691027284e-05, "loss": 0.0527, "step": 1432 }, { "epoch": 0.4852384076527481, "grad_norm": 0.40234375, "learning_rate": 1.132838376554952e-05, "loss": 0.0522, "step": 1433 }, { "epoch": 0.4855770248248714, "grad_norm": 0.46484375, "learning_rate": 1.1317286174942596e-05, "loss": 0.0715, "step": 1434 }, { "epoch": 0.4859156419969948, "grad_norm": 0.48828125, "learning_rate": 1.1306186933117343e-05, "loss": 0.0668, "step": 1435 }, { "epoch": 0.48625425916911813, "grad_norm": 0.44921875, "learning_rate": 1.1295086053986664e-05, "loss": 0.0657, "step": 1436 }, { "epoch": 0.48659287634124143, "grad_norm": 0.47265625, "learning_rate": 1.1283983551465512e-05, "loss": 0.058, "step": 1437 }, { "epoch": 0.4869314935133648, "grad_norm": 0.490234375, "learning_rate": 1.127287943947087e-05, "loss": 0.0614, "step": 1438 }, { "epoch": 0.48727011068548814, "grad_norm": 0.62109375, "learning_rate": 1.1261773731921746e-05, "loss": 0.0736, "step": 1439 }, { "epoch": 0.4876087278576115, "grad_norm": 0.46484375, "learning_rate": 1.1250666442739149e-05, "loss": 0.0513, "step": 1440 }, { "epoch": 0.4879473450297348, "grad_norm": 0.640625, "learning_rate": 1.1239557585846066e-05, "loss": 0.0689, "step": 1441 }, { "epoch": 0.48828596220185816, "grad_norm": 0.6171875, "learning_rate": 1.1228447175167443e-05, "loss": 0.065, "step": 1442 }, { "epoch": 0.4886245793739815, "grad_norm": 0.470703125, "learning_rate": 1.1217335224630186e-05, "loss": 0.054, "step": 1443 }, { "epoch": 0.4889631965461048, "grad_norm": 0.53515625, "learning_rate": 1.1206221748163127e-05, "loss": 0.0709, "step": 1444 }, { "epoch": 0.4893018137182282, "grad_norm": 2.390625, "learning_rate": 1.1195106759697005e-05, "loss": 0.0699, "step": 1445 }, { "epoch": 0.48964043089035153, "grad_norm": 0.53125, "learning_rate": 1.1183990273164464e-05, "loss": 0.0593, "step": 1446 }, { "epoch": 0.4899790480624749, "grad_norm": 0.4609375, "learning_rate": 1.1172872302500017e-05, "loss": 0.0554, "step": 1447 }, { "epoch": 0.4903176652345982, "grad_norm": 0.50390625, "learning_rate": 1.1161752861640046e-05, "loss": 0.0639, "step": 1448 }, { "epoch": 0.49065628240672154, "grad_norm": 0.5703125, "learning_rate": 1.1150631964522767e-05, "loss": 0.0659, "step": 1449 }, { "epoch": 0.4909948995788449, "grad_norm": 0.5234375, "learning_rate": 1.1139509625088225e-05, "loss": 0.0659, "step": 1450 }, { "epoch": 0.49133351675096826, "grad_norm": 0.494140625, "learning_rate": 1.1128385857278274e-05, "loss": 0.0584, "step": 1451 }, { "epoch": 0.49167213392309156, "grad_norm": 0.54296875, "learning_rate": 1.1117260675036563e-05, "loss": 0.0791, "step": 1452 }, { "epoch": 0.4920107510952149, "grad_norm": 0.482421875, "learning_rate": 1.1106134092308502e-05, "loss": 0.0626, "step": 1453 }, { "epoch": 0.49234936826733827, "grad_norm": 0.5390625, "learning_rate": 1.1095006123041262e-05, "loss": 0.0627, "step": 1454 }, { "epoch": 0.4926879854394616, "grad_norm": 0.73046875, "learning_rate": 1.1083876781183762e-05, "loss": 0.047, "step": 1455 }, { "epoch": 0.49302660261158493, "grad_norm": 1.1328125, "learning_rate": 1.1072746080686628e-05, "loss": 0.0471, "step": 1456 }, { "epoch": 0.4933652197837083, "grad_norm": 0.625, "learning_rate": 1.1061614035502193e-05, "loss": 0.0858, "step": 1457 }, { "epoch": 0.49370383695583164, "grad_norm": 0.462890625, "learning_rate": 1.1050480659584475e-05, "loss": 0.0583, "step": 1458 }, { "epoch": 0.49404245412795494, "grad_norm": 0.41796875, "learning_rate": 1.1039345966889167e-05, "loss": 0.0484, "step": 1459 }, { "epoch": 0.4943810713000783, "grad_norm": 0.59765625, "learning_rate": 1.1028209971373605e-05, "loss": 0.0672, "step": 1460 }, { "epoch": 0.49471968847220166, "grad_norm": 0.4921875, "learning_rate": 1.101707268699676e-05, "loss": 0.063, "step": 1461 }, { "epoch": 0.495058305644325, "grad_norm": 0.4609375, "learning_rate": 1.1005934127719218e-05, "loss": 0.0549, "step": 1462 }, { "epoch": 0.4953969228164483, "grad_norm": 0.62109375, "learning_rate": 1.0994794307503162e-05, "loss": 0.0881, "step": 1463 }, { "epoch": 0.49573553998857167, "grad_norm": 0.5078125, "learning_rate": 1.0983653240312364e-05, "loss": 0.0701, "step": 1464 }, { "epoch": 0.49607415716069503, "grad_norm": 0.5078125, "learning_rate": 1.0972510940112149e-05, "loss": 0.0641, "step": 1465 }, { "epoch": 0.49641277433281833, "grad_norm": 0.466796875, "learning_rate": 1.0961367420869387e-05, "loss": 0.0599, "step": 1466 }, { "epoch": 0.4967513915049417, "grad_norm": 0.5546875, "learning_rate": 1.0950222696552487e-05, "loss": 0.0651, "step": 1467 }, { "epoch": 0.49709000867706504, "grad_norm": 0.51953125, "learning_rate": 1.0939076781131357e-05, "loss": 0.0631, "step": 1468 }, { "epoch": 0.4974286258491884, "grad_norm": 0.5546875, "learning_rate": 1.0927929688577408e-05, "loss": 0.0606, "step": 1469 }, { "epoch": 0.4977672430213117, "grad_norm": 0.5859375, "learning_rate": 1.0916781432863514e-05, "loss": 0.064, "step": 1470 }, { "epoch": 0.49810586019343506, "grad_norm": 0.40625, "learning_rate": 1.0905632027964024e-05, "loss": 0.0527, "step": 1471 }, { "epoch": 0.4984444773655584, "grad_norm": 0.359375, "learning_rate": 1.0894481487854711e-05, "loss": 0.0429, "step": 1472 }, { "epoch": 0.4987830945376817, "grad_norm": 0.60546875, "learning_rate": 1.0883329826512779e-05, "loss": 0.0731, "step": 1473 }, { "epoch": 0.49912171170980507, "grad_norm": 0.65625, "learning_rate": 1.087217705791684e-05, "loss": 0.088, "step": 1474 }, { "epoch": 0.49946032888192843, "grad_norm": 0.390625, "learning_rate": 1.0861023196046885e-05, "loss": 0.0539, "step": 1475 }, { "epoch": 0.4997989460540518, "grad_norm": 0.431640625, "learning_rate": 1.0849868254884284e-05, "loss": 0.0572, "step": 1476 }, { "epoch": 0.5001375632261751, "grad_norm": 0.427734375, "learning_rate": 1.0838712248411754e-05, "loss": 0.0495, "step": 1477 }, { "epoch": 0.5004761803982984, "grad_norm": 0.455078125, "learning_rate": 1.0827555190613353e-05, "loss": 0.0592, "step": 1478 }, { "epoch": 0.5008147975704218, "grad_norm": 0.57421875, "learning_rate": 1.0816397095474454e-05, "loss": 0.0719, "step": 1479 }, { "epoch": 0.5011534147425452, "grad_norm": 0.408203125, "learning_rate": 1.0805237976981729e-05, "loss": 0.0547, "step": 1480 }, { "epoch": 0.5011534147425452, "eval_loss": 0.06768392771482468, "eval_runtime": 815.5247, "eval_samples_per_second": 12.198, "eval_steps_per_second": 3.05, "step": 1480 }, { "epoch": 0.5014920319146685, "grad_norm": 0.482421875, "learning_rate": 1.0794077849123134e-05, "loss": 0.0581, "step": 1481 }, { "epoch": 0.5018306490867919, "grad_norm": 0.5234375, "learning_rate": 1.0782916725887888e-05, "loss": 0.0647, "step": 1482 }, { "epoch": 0.5021692662589151, "grad_norm": 0.4765625, "learning_rate": 1.0771754621266466e-05, "loss": 0.0725, "step": 1483 }, { "epoch": 0.5025078834310385, "grad_norm": 0.494140625, "learning_rate": 1.0760591549250561e-05, "loss": 0.0648, "step": 1484 }, { "epoch": 0.5028465006031618, "grad_norm": 0.5859375, "learning_rate": 1.0749427523833084e-05, "loss": 0.0707, "step": 1485 }, { "epoch": 0.5031851177752852, "grad_norm": 0.5546875, "learning_rate": 1.0738262559008148e-05, "loss": 0.0649, "step": 1486 }, { "epoch": 0.5035237349474085, "grad_norm": 0.39453125, "learning_rate": 1.0727096668771035e-05, "loss": 0.0522, "step": 1487 }, { "epoch": 0.5038623521195319, "grad_norm": 0.92578125, "learning_rate": 1.0715929867118187e-05, "loss": 0.0691, "step": 1488 }, { "epoch": 0.5042009692916553, "grad_norm": 0.447265625, "learning_rate": 1.0704762168047189e-05, "loss": 0.0571, "step": 1489 }, { "epoch": 0.5045395864637785, "grad_norm": 0.5, "learning_rate": 1.069359358555676e-05, "loss": 0.0701, "step": 1490 }, { "epoch": 0.5048782036359019, "grad_norm": 0.5390625, "learning_rate": 1.0682424133646712e-05, "loss": 0.0739, "step": 1491 }, { "epoch": 0.5052168208080252, "grad_norm": 0.443359375, "learning_rate": 1.0671253826317957e-05, "loss": 0.0613, "step": 1492 }, { "epoch": 0.5055554379801486, "grad_norm": 0.56640625, "learning_rate": 1.0660082677572474e-05, "loss": 0.0781, "step": 1493 }, { "epoch": 0.5058940551522719, "grad_norm": 0.5625, "learning_rate": 1.0648910701413306e-05, "loss": 0.0718, "step": 1494 }, { "epoch": 0.5062326723243953, "grad_norm": 0.5546875, "learning_rate": 1.0637737911844516e-05, "loss": 0.0781, "step": 1495 }, { "epoch": 0.5065712894965186, "grad_norm": 0.57421875, "learning_rate": 1.0626564322871205e-05, "loss": 0.09, "step": 1496 }, { "epoch": 0.5069099066686419, "grad_norm": 0.39453125, "learning_rate": 1.061538994849946e-05, "loss": 0.0554, "step": 1497 }, { "epoch": 0.5072485238407652, "grad_norm": 0.52734375, "learning_rate": 1.0604214802736366e-05, "loss": 0.0735, "step": 1498 }, { "epoch": 0.5075871410128886, "grad_norm": 0.462890625, "learning_rate": 1.0593038899589968e-05, "loss": 0.0592, "step": 1499 }, { "epoch": 0.507925758185012, "grad_norm": 0.38671875, "learning_rate": 1.0581862253069262e-05, "loss": 0.0484, "step": 1500 }, { "epoch": 0.5082643753571353, "grad_norm": 0.62109375, "learning_rate": 1.0570684877184169e-05, "loss": 0.0938, "step": 1501 }, { "epoch": 0.5086029925292587, "grad_norm": 0.640625, "learning_rate": 1.0559506785945538e-05, "loss": 0.0768, "step": 1502 }, { "epoch": 0.508941609701382, "grad_norm": 0.48828125, "learning_rate": 1.0548327993365108e-05, "loss": 0.0552, "step": 1503 }, { "epoch": 0.5092802268735053, "grad_norm": 0.412109375, "learning_rate": 1.0537148513455493e-05, "loss": 0.0519, "step": 1504 }, { "epoch": 0.5096188440456286, "grad_norm": 0.51171875, "learning_rate": 1.0525968360230173e-05, "loss": 0.0869, "step": 1505 }, { "epoch": 0.509957461217752, "grad_norm": 0.59375, "learning_rate": 1.0514787547703466e-05, "loss": 0.0748, "step": 1506 }, { "epoch": 0.5102960783898753, "grad_norm": 0.5546875, "learning_rate": 1.050360608989053e-05, "loss": 0.0689, "step": 1507 }, { "epoch": 0.5106346955619987, "grad_norm": 0.490234375, "learning_rate": 1.0492424000807316e-05, "loss": 0.0596, "step": 1508 }, { "epoch": 0.510973312734122, "grad_norm": 0.361328125, "learning_rate": 1.0481241294470578e-05, "loss": 0.0427, "step": 1509 }, { "epoch": 0.5113119299062454, "grad_norm": 0.5390625, "learning_rate": 1.047005798489784e-05, "loss": 0.0608, "step": 1510 }, { "epoch": 0.5116505470783688, "grad_norm": 0.474609375, "learning_rate": 1.0458874086107379e-05, "loss": 0.0565, "step": 1511 }, { "epoch": 0.511989164250492, "grad_norm": 0.44140625, "learning_rate": 1.0447689612118208e-05, "loss": 0.0595, "step": 1512 }, { "epoch": 0.5123277814226154, "grad_norm": 1.046875, "learning_rate": 1.0436504576950077e-05, "loss": 0.05, "step": 1513 }, { "epoch": 0.5126663985947387, "grad_norm": 0.421875, "learning_rate": 1.0425318994623423e-05, "loss": 0.0583, "step": 1514 }, { "epoch": 0.5130050157668621, "grad_norm": 0.56640625, "learning_rate": 1.0414132879159375e-05, "loss": 0.0612, "step": 1515 }, { "epoch": 0.5133436329389854, "grad_norm": 0.7421875, "learning_rate": 1.0402946244579726e-05, "loss": 0.1383, "step": 1516 }, { "epoch": 0.5136822501111088, "grad_norm": 0.4140625, "learning_rate": 1.0391759104906928e-05, "loss": 0.0571, "step": 1517 }, { "epoch": 0.5140208672832322, "grad_norm": 0.52734375, "learning_rate": 1.038057147416406e-05, "loss": 0.0564, "step": 1518 }, { "epoch": 0.5143594844553554, "grad_norm": 0.43359375, "learning_rate": 1.0369383366374819e-05, "loss": 0.0551, "step": 1519 }, { "epoch": 0.5146981016274788, "grad_norm": 0.466796875, "learning_rate": 1.0358194795563497e-05, "loss": 0.0617, "step": 1520 }, { "epoch": 0.5150367187996021, "grad_norm": 0.53515625, "learning_rate": 1.0347005775754969e-05, "loss": 0.0756, "step": 1521 }, { "epoch": 0.5153753359717255, "grad_norm": 0.46875, "learning_rate": 1.0335816320974672e-05, "loss": 0.0606, "step": 1522 }, { "epoch": 0.5157139531438488, "grad_norm": 0.41796875, "learning_rate": 1.0324626445248592e-05, "loss": 0.0454, "step": 1523 }, { "epoch": 0.5160525703159722, "grad_norm": 0.53125, "learning_rate": 1.0313436162603231e-05, "loss": 0.0752, "step": 1524 }, { "epoch": 0.5163911874880955, "grad_norm": 0.546875, "learning_rate": 1.0302245487065621e-05, "loss": 0.0705, "step": 1525 }, { "epoch": 0.5167298046602188, "grad_norm": 0.498046875, "learning_rate": 1.0291054432663267e-05, "loss": 0.0666, "step": 1526 }, { "epoch": 0.5170684218323421, "grad_norm": 0.58984375, "learning_rate": 1.0279863013424154e-05, "loss": 0.0596, "step": 1527 }, { "epoch": 0.5174070390044655, "grad_norm": 0.578125, "learning_rate": 1.0268671243376733e-05, "loss": 0.0686, "step": 1528 }, { "epoch": 0.5177456561765889, "grad_norm": 0.53125, "learning_rate": 1.0257479136549889e-05, "loss": 0.0569, "step": 1529 }, { "epoch": 0.5180842733487122, "grad_norm": 0.458984375, "learning_rate": 1.0246286706972923e-05, "loss": 0.0582, "step": 1530 }, { "epoch": 0.5184228905208356, "grad_norm": 0.5078125, "learning_rate": 1.023509396867555e-05, "loss": 0.072, "step": 1531 }, { "epoch": 0.5187615076929589, "grad_norm": 1.078125, "learning_rate": 1.0223900935687866e-05, "loss": 0.076, "step": 1532 }, { "epoch": 0.5191001248650822, "grad_norm": 0.45703125, "learning_rate": 1.0212707622040345e-05, "loss": 0.0651, "step": 1533 }, { "epoch": 0.5194387420372055, "grad_norm": 0.51953125, "learning_rate": 1.02015140417638e-05, "loss": 0.0795, "step": 1534 }, { "epoch": 0.5197773592093289, "grad_norm": 0.3984375, "learning_rate": 1.0190320208889388e-05, "loss": 0.0507, "step": 1535 }, { "epoch": 0.5201159763814522, "grad_norm": 0.5, "learning_rate": 1.0179126137448577e-05, "loss": 0.0691, "step": 1536 }, { "epoch": 0.5204545935535756, "grad_norm": 0.462890625, "learning_rate": 1.0167931841473143e-05, "loss": 0.0529, "step": 1537 }, { "epoch": 0.520793210725699, "grad_norm": 0.51953125, "learning_rate": 1.0156737334995129e-05, "loss": 0.0722, "step": 1538 }, { "epoch": 0.5211318278978223, "grad_norm": 0.4140625, "learning_rate": 1.014554263204685e-05, "loss": 0.0621, "step": 1539 }, { "epoch": 0.5214704450699457, "grad_norm": 0.474609375, "learning_rate": 1.013434774666087e-05, "loss": 0.0425, "step": 1540 }, { "epoch": 0.5218090622420689, "grad_norm": 0.47265625, "learning_rate": 1.0123152692869981e-05, "loss": 0.056, "step": 1541 }, { "epoch": 0.5221476794141923, "grad_norm": 0.451171875, "learning_rate": 1.0111957484707182e-05, "loss": 0.0616, "step": 1542 }, { "epoch": 0.5224862965863156, "grad_norm": 0.482421875, "learning_rate": 1.0100762136205664e-05, "loss": 0.0521, "step": 1543 }, { "epoch": 0.522824913758439, "grad_norm": 0.61328125, "learning_rate": 1.0089566661398802e-05, "loss": 0.0845, "step": 1544 }, { "epoch": 0.5231635309305623, "grad_norm": 0.53125, "learning_rate": 1.0078371074320123e-05, "loss": 0.0735, "step": 1545 }, { "epoch": 0.5235021481026857, "grad_norm": 0.53125, "learning_rate": 1.0067175389003297e-05, "loss": 0.0699, "step": 1546 }, { "epoch": 0.523840765274809, "grad_norm": 0.63671875, "learning_rate": 1.0055979619482112e-05, "loss": 0.0785, "step": 1547 }, { "epoch": 0.5241793824469323, "grad_norm": 0.55078125, "learning_rate": 1.0044783779790472e-05, "loss": 0.0614, "step": 1548 }, { "epoch": 0.5245179996190557, "grad_norm": 0.60546875, "learning_rate": 1.0033587883962362e-05, "loss": 0.0635, "step": 1549 }, { "epoch": 0.524856616791179, "grad_norm": 0.478515625, "learning_rate": 1.0022391946031832e-05, "loss": 0.0542, "step": 1550 }, { "epoch": 0.5251952339633024, "grad_norm": 0.53125, "learning_rate": 1.0011195980032996e-05, "loss": 0.067, "step": 1551 }, { "epoch": 0.5255338511354257, "grad_norm": 0.53125, "learning_rate": 1e-05, "loss": 0.0592, "step": 1552 }, { "epoch": 0.5258724683075491, "grad_norm": 0.578125, "learning_rate": 9.988804019967005e-06, "loss": 0.0721, "step": 1553 }, { "epoch": 0.5262110854796724, "grad_norm": 0.48046875, "learning_rate": 9.977608053968172e-06, "loss": 0.064, "step": 1554 }, { "epoch": 0.5265497026517957, "grad_norm": 0.6640625, "learning_rate": 9.966412116037643e-06, "loss": 0.0672, "step": 1555 }, { "epoch": 0.526888319823919, "grad_norm": 0.423828125, "learning_rate": 9.95521622020953e-06, "loss": 0.0526, "step": 1556 }, { "epoch": 0.5272269369960424, "grad_norm": 0.63671875, "learning_rate": 9.94402038051789e-06, "loss": 0.0733, "step": 1557 }, { "epoch": 0.5275655541681658, "grad_norm": 0.4609375, "learning_rate": 9.932824610996706e-06, "loss": 0.0561, "step": 1558 }, { "epoch": 0.5279041713402891, "grad_norm": 0.5546875, "learning_rate": 9.921628925679877e-06, "loss": 0.072, "step": 1559 }, { "epoch": 0.5282427885124125, "grad_norm": 0.44921875, "learning_rate": 9.910433338601198e-06, "loss": 0.0561, "step": 1560 }, { "epoch": 0.5285814056845358, "grad_norm": 0.62890625, "learning_rate": 9.899237863794336e-06, "loss": 0.0679, "step": 1561 }, { "epoch": 0.5289200228566591, "grad_norm": 0.427734375, "learning_rate": 9.888042515292821e-06, "loss": 0.0552, "step": 1562 }, { "epoch": 0.5292586400287824, "grad_norm": 0.5859375, "learning_rate": 9.876847307130024e-06, "loss": 0.0788, "step": 1563 }, { "epoch": 0.5295972572009058, "grad_norm": 0.6171875, "learning_rate": 9.865652253339133e-06, "loss": 0.0774, "step": 1564 }, { "epoch": 0.5299358743730291, "grad_norm": 0.427734375, "learning_rate": 9.854457367953155e-06, "loss": 0.0599, "step": 1565 }, { "epoch": 0.5302744915451525, "grad_norm": 0.498046875, "learning_rate": 9.843262665004876e-06, "loss": 0.062, "step": 1566 }, { "epoch": 0.5306131087172758, "grad_norm": 0.57421875, "learning_rate": 9.832068158526862e-06, "loss": 0.0831, "step": 1567 }, { "epoch": 0.5309517258893992, "grad_norm": 0.404296875, "learning_rate": 9.820873862551425e-06, "loss": 0.053, "step": 1568 }, { "epoch": 0.5312903430615225, "grad_norm": 1.28125, "learning_rate": 9.809679791110615e-06, "loss": 0.0688, "step": 1569 }, { "epoch": 0.5316289602336458, "grad_norm": 0.58203125, "learning_rate": 9.798485958236203e-06, "loss": 0.0557, "step": 1570 }, { "epoch": 0.5319675774057692, "grad_norm": 0.51953125, "learning_rate": 9.787292377959659e-06, "loss": 0.0671, "step": 1571 }, { "epoch": 0.5323061945778925, "grad_norm": 0.546875, "learning_rate": 9.776099064312135e-06, "loss": 0.0679, "step": 1572 }, { "epoch": 0.5326448117500159, "grad_norm": 0.7265625, "learning_rate": 9.764906031324454e-06, "loss": 0.0996, "step": 1573 }, { "epoch": 0.5329834289221392, "grad_norm": 0.56640625, "learning_rate": 9.75371329302708e-06, "loss": 0.0634, "step": 1574 }, { "epoch": 0.5333220460942626, "grad_norm": 0.41796875, "learning_rate": 9.742520863450116e-06, "loss": 0.054, "step": 1575 }, { "epoch": 0.533660663266386, "grad_norm": 0.451171875, "learning_rate": 9.731328756623269e-06, "loss": 0.059, "step": 1576 }, { "epoch": 0.5339992804385092, "grad_norm": 0.5, "learning_rate": 9.720136986575849e-06, "loss": 0.0614, "step": 1577 }, { "epoch": 0.5343378976106326, "grad_norm": 0.4140625, "learning_rate": 9.708945567336736e-06, "loss": 0.0475, "step": 1578 }, { "epoch": 0.5346765147827559, "grad_norm": 0.5234375, "learning_rate": 9.69775451293438e-06, "loss": 0.0628, "step": 1579 }, { "epoch": 0.5350151319548793, "grad_norm": 0.455078125, "learning_rate": 9.686563837396769e-06, "loss": 0.0635, "step": 1580 }, { "epoch": 0.5353537491270026, "grad_norm": 0.7890625, "learning_rate": 9.675373554751412e-06, "loss": 0.0987, "step": 1581 }, { "epoch": 0.535692366299126, "grad_norm": 0.453125, "learning_rate": 9.664183679025327e-06, "loss": 0.061, "step": 1582 }, { "epoch": 0.5360309834712493, "grad_norm": 0.5546875, "learning_rate": 9.652994224245033e-06, "loss": 0.0729, "step": 1583 }, { "epoch": 0.5363696006433726, "grad_norm": 0.462890625, "learning_rate": 9.641805204436508e-06, "loss": 0.0598, "step": 1584 }, { "epoch": 0.5367082178154959, "grad_norm": 0.5703125, "learning_rate": 9.630616633625186e-06, "loss": 0.0672, "step": 1585 }, { "epoch": 0.5370468349876193, "grad_norm": 0.466796875, "learning_rate": 9.619428525835944e-06, "loss": 0.0625, "step": 1586 }, { "epoch": 0.5373854521597426, "grad_norm": 0.64453125, "learning_rate": 9.608240895093077e-06, "loss": 0.0487, "step": 1587 }, { "epoch": 0.537724069331866, "grad_norm": 0.498046875, "learning_rate": 9.597053755420277e-06, "loss": 0.0708, "step": 1588 }, { "epoch": 0.5380626865039894, "grad_norm": 0.52734375, "learning_rate": 9.58586712084063e-06, "loss": 0.0683, "step": 1589 }, { "epoch": 0.5384013036761127, "grad_norm": 0.427734375, "learning_rate": 9.57468100537658e-06, "loss": 0.0504, "step": 1590 }, { "epoch": 0.538739920848236, "grad_norm": 0.453125, "learning_rate": 9.563495423049925e-06, "loss": 0.0582, "step": 1591 }, { "epoch": 0.5390785380203593, "grad_norm": 0.50390625, "learning_rate": 9.552310387881793e-06, "loss": 0.0629, "step": 1592 }, { "epoch": 0.5394171551924827, "grad_norm": 0.6796875, "learning_rate": 9.541125913892625e-06, "loss": 0.0937, "step": 1593 }, { "epoch": 0.539755772364606, "grad_norm": 0.5625, "learning_rate": 9.529942015102164e-06, "loss": 0.079, "step": 1594 }, { "epoch": 0.5400943895367294, "grad_norm": 0.5078125, "learning_rate": 9.518758705529423e-06, "loss": 0.0697, "step": 1595 }, { "epoch": 0.5404330067088527, "grad_norm": 0.4375, "learning_rate": 9.507575999192686e-06, "loss": 0.0548, "step": 1596 }, { "epoch": 0.5407716238809761, "grad_norm": 0.384765625, "learning_rate": 9.496393910109473e-06, "loss": 0.0503, "step": 1597 }, { "epoch": 0.5411102410530994, "grad_norm": 0.5859375, "learning_rate": 9.485212452296535e-06, "loss": 0.0829, "step": 1598 }, { "epoch": 0.5414488582252227, "grad_norm": 0.498046875, "learning_rate": 9.474031639769832e-06, "loss": 0.058, "step": 1599 }, { "epoch": 0.5417874753973461, "grad_norm": 0.61328125, "learning_rate": 9.46285148654451e-06, "loss": 0.0779, "step": 1600 }, { "epoch": 0.5421260925694694, "grad_norm": 0.498046875, "learning_rate": 9.451672006634892e-06, "loss": 0.0568, "step": 1601 }, { "epoch": 0.5424647097415928, "grad_norm": 0.478515625, "learning_rate": 9.44049321405446e-06, "loss": 0.0697, "step": 1602 }, { "epoch": 0.5428033269137161, "grad_norm": 0.6015625, "learning_rate": 9.429315122815831e-06, "loss": 0.0661, "step": 1603 }, { "epoch": 0.5431419440858395, "grad_norm": 0.396484375, "learning_rate": 9.418137746930743e-06, "loss": 0.0526, "step": 1604 }, { "epoch": 0.5434805612579628, "grad_norm": 0.546875, "learning_rate": 9.406961100410033e-06, "loss": 0.0715, "step": 1605 }, { "epoch": 0.5438191784300861, "grad_norm": 0.64453125, "learning_rate": 9.395785197263638e-06, "loss": 0.0763, "step": 1606 }, { "epoch": 0.5441577956022094, "grad_norm": 0.56640625, "learning_rate": 9.384610051500546e-06, "loss": 0.0883, "step": 1607 }, { "epoch": 0.5444964127743328, "grad_norm": 0.470703125, "learning_rate": 9.3734356771288e-06, "loss": 0.0611, "step": 1608 }, { "epoch": 0.5448350299464562, "grad_norm": 0.4140625, "learning_rate": 9.362262088155487e-06, "loss": 0.0593, "step": 1609 }, { "epoch": 0.5451736471185795, "grad_norm": 0.408203125, "learning_rate": 9.351089298586699e-06, "loss": 0.0573, "step": 1610 }, { "epoch": 0.5455122642907029, "grad_norm": 0.59375, "learning_rate": 9.339917322427528e-06, "loss": 0.0757, "step": 1611 }, { "epoch": 0.5458508814628262, "grad_norm": 0.51171875, "learning_rate": 9.328746173682046e-06, "loss": 0.0641, "step": 1612 }, { "epoch": 0.5461894986349495, "grad_norm": 0.50390625, "learning_rate": 9.317575866353293e-06, "loss": 0.0635, "step": 1613 }, { "epoch": 0.5465281158070728, "grad_norm": 0.51953125, "learning_rate": 9.306406414443246e-06, "loss": 0.073, "step": 1614 }, { "epoch": 0.5468667329791962, "grad_norm": 0.578125, "learning_rate": 9.295237831952815e-06, "loss": 0.0737, "step": 1615 }, { "epoch": 0.5472053501513195, "grad_norm": 0.546875, "learning_rate": 9.284070132881817e-06, "loss": 0.0773, "step": 1616 }, { "epoch": 0.5475439673234429, "grad_norm": 0.54296875, "learning_rate": 9.272903331228968e-06, "loss": 0.0576, "step": 1617 }, { "epoch": 0.5478825844955663, "grad_norm": 0.498046875, "learning_rate": 9.261737440991854e-06, "loss": 0.0701, "step": 1618 }, { "epoch": 0.5482212016676896, "grad_norm": 0.484375, "learning_rate": 9.250572476166918e-06, "loss": 0.0601, "step": 1619 }, { "epoch": 0.5485598188398129, "grad_norm": 1.1640625, "learning_rate": 9.239408450749442e-06, "loss": 0.0674, "step": 1620 }, { "epoch": 0.5488984360119362, "grad_norm": 0.43359375, "learning_rate": 9.228245378733537e-06, "loss": 0.0615, "step": 1621 }, { "epoch": 0.5492370531840596, "grad_norm": 0.458984375, "learning_rate": 9.217083274112114e-06, "loss": 0.061, "step": 1622 }, { "epoch": 0.5495756703561829, "grad_norm": 0.486328125, "learning_rate": 9.20592215087687e-06, "loss": 0.0649, "step": 1623 }, { "epoch": 0.5499142875283063, "grad_norm": 0.60546875, "learning_rate": 9.194762023018271e-06, "loss": 0.0715, "step": 1624 }, { "epoch": 0.5502529047004296, "grad_norm": 0.427734375, "learning_rate": 9.183602904525546e-06, "loss": 0.0529, "step": 1625 }, { "epoch": 0.550591521872553, "grad_norm": 0.55078125, "learning_rate": 9.172444809386647e-06, "loss": 0.0841, "step": 1626 }, { "epoch": 0.5509301390446762, "grad_norm": 0.578125, "learning_rate": 9.161287751588249e-06, "loss": 0.0757, "step": 1627 }, { "epoch": 0.5512687562167996, "grad_norm": 0.447265625, "learning_rate": 9.150131745115721e-06, "loss": 0.0556, "step": 1628 }, { "epoch": 0.551607373388923, "grad_norm": 0.435546875, "learning_rate": 9.138976803953122e-06, "loss": 0.0578, "step": 1629 }, { "epoch": 0.5519459905610463, "grad_norm": 0.49609375, "learning_rate": 9.127822942083167e-06, "loss": 0.064, "step": 1630 }, { "epoch": 0.5522846077331697, "grad_norm": 0.47265625, "learning_rate": 9.116670173487223e-06, "loss": 0.059, "step": 1631 }, { "epoch": 0.552623224905293, "grad_norm": 0.9921875, "learning_rate": 9.105518512145292e-06, "loss": 0.1812, "step": 1632 }, { "epoch": 0.5529618420774164, "grad_norm": 0.67578125, "learning_rate": 9.09436797203598e-06, "loss": 0.0754, "step": 1633 }, { "epoch": 0.5533004592495397, "grad_norm": 0.59765625, "learning_rate": 9.083218567136487e-06, "loss": 0.0926, "step": 1634 }, { "epoch": 0.553639076421663, "grad_norm": 0.42578125, "learning_rate": 9.072070311422595e-06, "loss": 0.0527, "step": 1635 }, { "epoch": 0.5539776935937863, "grad_norm": 0.51171875, "learning_rate": 9.060923218868644e-06, "loss": 0.0603, "step": 1636 }, { "epoch": 0.5543163107659097, "grad_norm": 0.5546875, "learning_rate": 9.049777303447517e-06, "loss": 0.0782, "step": 1637 }, { "epoch": 0.5546549279380331, "grad_norm": 0.5859375, "learning_rate": 9.038632579130617e-06, "loss": 0.0807, "step": 1638 }, { "epoch": 0.5549935451101564, "grad_norm": 0.625, "learning_rate": 9.027489059887855e-06, "loss": 0.071, "step": 1639 }, { "epoch": 0.5553321622822798, "grad_norm": 0.466796875, "learning_rate": 9.01634675968764e-06, "loss": 0.0492, "step": 1640 }, { "epoch": 0.5556707794544031, "grad_norm": 0.5078125, "learning_rate": 9.00520569249684e-06, "loss": 0.0619, "step": 1641 }, { "epoch": 0.5560093966265264, "grad_norm": 0.412109375, "learning_rate": 8.994065872280785e-06, "loss": 0.0572, "step": 1642 }, { "epoch": 0.5563480137986497, "grad_norm": 0.46484375, "learning_rate": 8.982927313003242e-06, "loss": 0.069, "step": 1643 }, { "epoch": 0.5566866309707731, "grad_norm": 0.52734375, "learning_rate": 8.971790028626395e-06, "loss": 0.0644, "step": 1644 }, { "epoch": 0.5570252481428964, "grad_norm": 0.46484375, "learning_rate": 8.960654033110834e-06, "loss": 0.0668, "step": 1645 }, { "epoch": 0.5573638653150198, "grad_norm": 0.427734375, "learning_rate": 8.949519340415526e-06, "loss": 0.0586, "step": 1646 }, { "epoch": 0.5577024824871432, "grad_norm": 0.375, "learning_rate": 8.938385964497807e-06, "loss": 0.0506, "step": 1647 }, { "epoch": 0.5580410996592665, "grad_norm": 0.65625, "learning_rate": 8.927253919313377e-06, "loss": 0.0757, "step": 1648 }, { "epoch": 0.5583797168313898, "grad_norm": 0.5390625, "learning_rate": 8.916123218816243e-06, "loss": 0.0689, "step": 1649 }, { "epoch": 0.5587183340035131, "grad_norm": 0.53125, "learning_rate": 8.90499387695874e-06, "loss": 0.078, "step": 1650 }, { "epoch": 0.5590569511756365, "grad_norm": 0.40625, "learning_rate": 8.893865907691503e-06, "loss": 0.0516, "step": 1651 }, { "epoch": 0.5593955683477598, "grad_norm": 0.61328125, "learning_rate": 8.882739324963442e-06, "loss": 0.0698, "step": 1652 }, { "epoch": 0.5597341855198832, "grad_norm": 0.443359375, "learning_rate": 8.871614142721728e-06, "loss": 0.0616, "step": 1653 }, { "epoch": 0.5600728026920065, "grad_norm": 0.66015625, "learning_rate": 8.860490374911777e-06, "loss": 0.0799, "step": 1654 }, { "epoch": 0.5604114198641299, "grad_norm": 0.83203125, "learning_rate": 8.849368035477236e-06, "loss": 0.0669, "step": 1655 }, { "epoch": 0.5607500370362531, "grad_norm": 0.6953125, "learning_rate": 8.838247138359957e-06, "loss": 0.1207, "step": 1656 }, { "epoch": 0.5610886542083765, "grad_norm": 0.494140625, "learning_rate": 8.827127697499985e-06, "loss": 0.0637, "step": 1657 }, { "epoch": 0.5614272713804999, "grad_norm": 0.4296875, "learning_rate": 8.816009726835538e-06, "loss": 0.0543, "step": 1658 }, { "epoch": 0.5617658885526232, "grad_norm": 0.46484375, "learning_rate": 8.804893240302997e-06, "loss": 0.0566, "step": 1659 }, { "epoch": 0.5621045057247466, "grad_norm": 0.431640625, "learning_rate": 8.793778251836878e-06, "loss": 0.0618, "step": 1660 }, { "epoch": 0.5624431228968699, "grad_norm": 0.59765625, "learning_rate": 8.782664775369818e-06, "loss": 0.0639, "step": 1661 }, { "epoch": 0.5627817400689933, "grad_norm": 0.443359375, "learning_rate": 8.771552824832559e-06, "loss": 0.0619, "step": 1662 }, { "epoch": 0.5631203572411166, "grad_norm": 0.609375, "learning_rate": 8.760442414153937e-06, "loss": 0.0627, "step": 1663 }, { "epoch": 0.5634589744132399, "grad_norm": 0.62109375, "learning_rate": 8.749333557260851e-06, "loss": 0.0621, "step": 1664 }, { "epoch": 0.5637975915853632, "grad_norm": 0.52734375, "learning_rate": 8.738226268078254e-06, "loss": 0.0725, "step": 1665 }, { "epoch": 0.5641362087574866, "grad_norm": 0.67578125, "learning_rate": 8.72712056052913e-06, "loss": 0.0559, "step": 1666 }, { "epoch": 0.56447482592961, "grad_norm": 0.515625, "learning_rate": 8.71601644853449e-06, "loss": 0.0639, "step": 1667 }, { "epoch": 0.5648134431017333, "grad_norm": 0.53125, "learning_rate": 8.704913946013337e-06, "loss": 0.0652, "step": 1668 }, { "epoch": 0.5651520602738567, "grad_norm": 0.55078125, "learning_rate": 8.69381306688266e-06, "loss": 0.0588, "step": 1669 }, { "epoch": 0.56549067744598, "grad_norm": 0.703125, "learning_rate": 8.682713825057409e-06, "loss": 0.0987, "step": 1670 }, { "epoch": 0.5658292946181033, "grad_norm": 0.55859375, "learning_rate": 8.671616234450486e-06, "loss": 0.0794, "step": 1671 }, { "epoch": 0.5661679117902266, "grad_norm": 0.390625, "learning_rate": 8.660520308972722e-06, "loss": 0.0537, "step": 1672 }, { "epoch": 0.56650652896235, "grad_norm": 0.416015625, "learning_rate": 8.649426062532858e-06, "loss": 0.0569, "step": 1673 }, { "epoch": 0.5668451461344733, "grad_norm": 0.45703125, "learning_rate": 8.638333509037537e-06, "loss": 0.0588, "step": 1674 }, { "epoch": 0.5671837633065967, "grad_norm": 0.42578125, "learning_rate": 8.627242662391273e-06, "loss": 0.0688, "step": 1675 }, { "epoch": 0.5675223804787201, "grad_norm": 0.482421875, "learning_rate": 8.616153536496444e-06, "loss": 0.0627, "step": 1676 }, { "epoch": 0.5678609976508434, "grad_norm": 0.45703125, "learning_rate": 8.605066145253269e-06, "loss": 0.0622, "step": 1677 }, { "epoch": 0.5681996148229667, "grad_norm": 1.1484375, "learning_rate": 8.593980502559797e-06, "loss": 0.1008, "step": 1678 }, { "epoch": 0.56853823199509, "grad_norm": 0.5078125, "learning_rate": 8.58289662231188e-06, "loss": 0.0611, "step": 1679 }, { "epoch": 0.5688768491672134, "grad_norm": 0.43359375, "learning_rate": 8.571814518403162e-06, "loss": 0.0609, "step": 1680 }, { "epoch": 0.5692154663393367, "grad_norm": 0.466796875, "learning_rate": 8.560734204725064e-06, "loss": 0.0711, "step": 1681 }, { "epoch": 0.5695540835114601, "grad_norm": 0.46484375, "learning_rate": 8.549655695166756e-06, "loss": 0.0548, "step": 1682 }, { "epoch": 0.5698927006835834, "grad_norm": 0.43359375, "learning_rate": 8.538579003615154e-06, "loss": 0.0634, "step": 1683 }, { "epoch": 0.5702313178557068, "grad_norm": 0.50390625, "learning_rate": 8.52750414395489e-06, "loss": 0.059, "step": 1684 }, { "epoch": 0.57056993502783, "grad_norm": 0.37890625, "learning_rate": 8.516431130068303e-06, "loss": 0.0496, "step": 1685 }, { "epoch": 0.5709085521999534, "grad_norm": 0.5078125, "learning_rate": 8.505359975835413e-06, "loss": 0.0686, "step": 1686 }, { "epoch": 0.5712471693720768, "grad_norm": 0.53515625, "learning_rate": 8.494290695133918e-06, "loss": 0.0561, "step": 1687 }, { "epoch": 0.5715857865442001, "grad_norm": 0.4296875, "learning_rate": 8.483223301839159e-06, "loss": 0.0549, "step": 1688 }, { "epoch": 0.5719244037163235, "grad_norm": 0.447265625, "learning_rate": 8.472157809824115e-06, "loss": 0.0581, "step": 1689 }, { "epoch": 0.5722630208884468, "grad_norm": 0.5, "learning_rate": 8.461094232959381e-06, "loss": 0.0655, "step": 1690 }, { "epoch": 0.5726016380605702, "grad_norm": 0.44921875, "learning_rate": 8.450032585113156e-06, "loss": 0.0554, "step": 1691 }, { "epoch": 0.5729402552326935, "grad_norm": 0.45703125, "learning_rate": 8.438972880151209e-06, "loss": 0.0606, "step": 1692 }, { "epoch": 0.5732788724048168, "grad_norm": 0.5390625, "learning_rate": 8.427915131936885e-06, "loss": 0.0702, "step": 1693 }, { "epoch": 0.5736174895769401, "grad_norm": 0.52734375, "learning_rate": 8.416859354331072e-06, "loss": 0.0659, "step": 1694 }, { "epoch": 0.5739561067490635, "grad_norm": 0.435546875, "learning_rate": 8.405805561192188e-06, "loss": 0.0478, "step": 1695 }, { "epoch": 0.5742947239211869, "grad_norm": 0.48046875, "learning_rate": 8.39475376637616e-06, "loss": 0.0667, "step": 1696 }, { "epoch": 0.5746333410933102, "grad_norm": 0.486328125, "learning_rate": 8.38370398373642e-06, "loss": 0.0618, "step": 1697 }, { "epoch": 0.5749719582654336, "grad_norm": 0.5078125, "learning_rate": 8.372656227123868e-06, "loss": 0.0609, "step": 1698 }, { "epoch": 0.5753105754375569, "grad_norm": 0.494140625, "learning_rate": 8.36161051038687e-06, "loss": 0.0585, "step": 1699 }, { "epoch": 0.5756491926096802, "grad_norm": 0.515625, "learning_rate": 8.350566847371228e-06, "loss": 0.0544, "step": 1700 }, { "epoch": 0.5759878097818035, "grad_norm": 0.5390625, "learning_rate": 8.33952525192018e-06, "loss": 0.0759, "step": 1701 }, { "epoch": 0.5763264269539269, "grad_norm": 0.796875, "learning_rate": 8.328485737874365e-06, "loss": 0.0673, "step": 1702 }, { "epoch": 0.5766650441260502, "grad_norm": 0.51953125, "learning_rate": 8.317448319071815e-06, "loss": 0.0662, "step": 1703 }, { "epoch": 0.5770036612981736, "grad_norm": 0.60546875, "learning_rate": 8.306413009347933e-06, "loss": 0.0805, "step": 1704 }, { "epoch": 0.577342278470297, "grad_norm": 0.45703125, "learning_rate": 8.295379822535482e-06, "loss": 0.0578, "step": 1705 }, { "epoch": 0.5776808956424203, "grad_norm": 0.458984375, "learning_rate": 8.284348772464564e-06, "loss": 0.0575, "step": 1706 }, { "epoch": 0.5780195128145436, "grad_norm": 0.50390625, "learning_rate": 8.273319872962599e-06, "loss": 0.0549, "step": 1707 }, { "epoch": 0.5783581299866669, "grad_norm": 0.6015625, "learning_rate": 8.262293137854315e-06, "loss": 0.0598, "step": 1708 }, { "epoch": 0.5786967471587903, "grad_norm": 0.5703125, "learning_rate": 8.251268580961724e-06, "loss": 0.0611, "step": 1709 }, { "epoch": 0.5790353643309136, "grad_norm": 0.44921875, "learning_rate": 8.24024621610411e-06, "loss": 0.0528, "step": 1710 }, { "epoch": 0.579373981503037, "grad_norm": 0.52734375, "learning_rate": 8.229226057098012e-06, "loss": 0.0724, "step": 1711 }, { "epoch": 0.5797125986751603, "grad_norm": 0.4609375, "learning_rate": 8.218208117757194e-06, "loss": 0.0598, "step": 1712 }, { "epoch": 0.5800512158472837, "grad_norm": 0.58984375, "learning_rate": 8.207192411892645e-06, "loss": 0.0767, "step": 1713 }, { "epoch": 0.5803898330194069, "grad_norm": 0.369140625, "learning_rate": 8.196178953312557e-06, "loss": 0.0515, "step": 1714 }, { "epoch": 0.5807284501915303, "grad_norm": 0.484375, "learning_rate": 8.185167755822294e-06, "loss": 0.0664, "step": 1715 }, { "epoch": 0.5810670673636537, "grad_norm": 0.474609375, "learning_rate": 8.1741588332244e-06, "loss": 0.06, "step": 1716 }, { "epoch": 0.581405684535777, "grad_norm": 0.5546875, "learning_rate": 8.163152199318559e-06, "loss": 0.0656, "step": 1717 }, { "epoch": 0.5817443017079004, "grad_norm": 0.49609375, "learning_rate": 8.152147867901586e-06, "loss": 0.059, "step": 1718 }, { "epoch": 0.5820829188800237, "grad_norm": 0.470703125, "learning_rate": 8.141145852767408e-06, "loss": 0.0609, "step": 1719 }, { "epoch": 0.5824215360521471, "grad_norm": 0.46484375, "learning_rate": 8.13014616770706e-06, "loss": 0.0649, "step": 1720 }, { "epoch": 0.5827601532242704, "grad_norm": 0.41796875, "learning_rate": 8.119148826508642e-06, "loss": 0.055, "step": 1721 }, { "epoch": 0.5830987703963937, "grad_norm": 0.58984375, "learning_rate": 8.108153842957324e-06, "loss": 0.0735, "step": 1722 }, { "epoch": 0.583437387568517, "grad_norm": 0.51171875, "learning_rate": 8.09716123083532e-06, "loss": 0.06, "step": 1723 }, { "epoch": 0.5837760047406404, "grad_norm": 0.8046875, "learning_rate": 8.086171003921865e-06, "loss": 0.0472, "step": 1724 }, { "epoch": 0.5841146219127638, "grad_norm": 0.515625, "learning_rate": 8.075183175993218e-06, "loss": 0.0706, "step": 1725 }, { "epoch": 0.5844532390848871, "grad_norm": 0.578125, "learning_rate": 8.064197760822615e-06, "loss": 0.0567, "step": 1726 }, { "epoch": 0.5847918562570105, "grad_norm": 0.609375, "learning_rate": 8.053214772180277e-06, "loss": 0.0868, "step": 1727 }, { "epoch": 0.5851304734291338, "grad_norm": 0.59765625, "learning_rate": 8.042234223833381e-06, "loss": 0.0503, "step": 1728 }, { "epoch": 0.5854690906012571, "grad_norm": 0.439453125, "learning_rate": 8.031256129546046e-06, "loss": 0.0617, "step": 1729 }, { "epoch": 0.5858077077733804, "grad_norm": 0.3671875, "learning_rate": 8.020280503079314e-06, "loss": 0.0443, "step": 1730 }, { "epoch": 0.5861463249455038, "grad_norm": 0.48046875, "learning_rate": 8.009307358191133e-06, "loss": 0.0642, "step": 1731 }, { "epoch": 0.5864849421176271, "grad_norm": 0.50390625, "learning_rate": 7.99833670863634e-06, "loss": 0.0677, "step": 1732 }, { "epoch": 0.5868235592897505, "grad_norm": 0.52734375, "learning_rate": 7.987368568166653e-06, "loss": 0.0724, "step": 1733 }, { "epoch": 0.5871621764618739, "grad_norm": 0.421875, "learning_rate": 7.976402950530623e-06, "loss": 0.0529, "step": 1734 }, { "epoch": 0.5875007936339972, "grad_norm": 0.5, "learning_rate": 7.965439869473664e-06, "loss": 0.067, "step": 1735 }, { "epoch": 0.5878394108061205, "grad_norm": 0.421875, "learning_rate": 7.954479338737995e-06, "loss": 0.0582, "step": 1736 }, { "epoch": 0.5881780279782438, "grad_norm": 0.609375, "learning_rate": 7.943521372062641e-06, "loss": 0.0765, "step": 1737 }, { "epoch": 0.5885166451503672, "grad_norm": 0.59375, "learning_rate": 7.932565983183416e-06, "loss": 0.0745, "step": 1738 }, { "epoch": 0.5888552623224905, "grad_norm": 0.45703125, "learning_rate": 7.921613185832897e-06, "loss": 0.0624, "step": 1739 }, { "epoch": 0.5891938794946139, "grad_norm": 0.455078125, "learning_rate": 7.910662993740422e-06, "loss": 0.0571, "step": 1740 }, { "epoch": 0.5895324966667372, "grad_norm": 0.5078125, "learning_rate": 7.899715420632056e-06, "loss": 0.0677, "step": 1741 }, { "epoch": 0.5898711138388606, "grad_norm": 0.44140625, "learning_rate": 7.888770480230582e-06, "loss": 0.0539, "step": 1742 }, { "epoch": 0.5902097310109838, "grad_norm": 0.5703125, "learning_rate": 7.87782818625548e-06, "loss": 0.0742, "step": 1743 }, { "epoch": 0.5905483481831072, "grad_norm": 0.51953125, "learning_rate": 7.866888552422924e-06, "loss": 0.0653, "step": 1744 }, { "epoch": 0.5908869653552306, "grad_norm": 0.4609375, "learning_rate": 7.855951592445743e-06, "loss": 0.0559, "step": 1745 }, { "epoch": 0.5912255825273539, "grad_norm": 0.470703125, "learning_rate": 7.845017320033415e-06, "loss": 0.0502, "step": 1746 }, { "epoch": 0.5915641996994773, "grad_norm": 0.53125, "learning_rate": 7.834085748892052e-06, "loss": 0.064, "step": 1747 }, { "epoch": 0.5919028168716006, "grad_norm": 0.4921875, "learning_rate": 7.823156892724379e-06, "loss": 0.0646, "step": 1748 }, { "epoch": 0.592241434043724, "grad_norm": 0.7109375, "learning_rate": 7.81223076522972e-06, "loss": 0.0848, "step": 1749 }, { "epoch": 0.5925800512158473, "grad_norm": 0.546875, "learning_rate": 7.801307380103977e-06, "loss": 0.0657, "step": 1750 }, { "epoch": 0.5929186683879706, "grad_norm": 0.458984375, "learning_rate": 7.790386751039609e-06, "loss": 0.0562, "step": 1751 }, { "epoch": 0.5932572855600939, "grad_norm": 0.59375, "learning_rate": 7.779468891725633e-06, "loss": 0.0803, "step": 1752 }, { "epoch": 0.5935959027322173, "grad_norm": 0.515625, "learning_rate": 7.768553815847583e-06, "loss": 0.0589, "step": 1753 }, { "epoch": 0.5939345199043407, "grad_norm": 0.5859375, "learning_rate": 7.757641537087509e-06, "loss": 0.0716, "step": 1754 }, { "epoch": 0.594273137076464, "grad_norm": 0.52734375, "learning_rate": 7.74673206912395e-06, "loss": 0.0715, "step": 1755 }, { "epoch": 0.5946117542485874, "grad_norm": 0.50390625, "learning_rate": 7.735825425631926e-06, "loss": 0.0671, "step": 1756 }, { "epoch": 0.5949503714207107, "grad_norm": 0.396484375, "learning_rate": 7.724921620282917e-06, "loss": 0.0529, "step": 1757 }, { "epoch": 0.595288988592834, "grad_norm": 0.453125, "learning_rate": 7.71402066674484e-06, "loss": 0.0607, "step": 1758 }, { "epoch": 0.5956276057649573, "grad_norm": 0.5390625, "learning_rate": 7.703122578682047e-06, "loss": 0.0687, "step": 1759 }, { "epoch": 0.5959662229370807, "grad_norm": 0.390625, "learning_rate": 7.69222736975529e-06, "loss": 0.0491, "step": 1760 }, { "epoch": 0.596304840109204, "grad_norm": 0.64453125, "learning_rate": 7.681335053621712e-06, "loss": 0.0563, "step": 1761 }, { "epoch": 0.5966434572813274, "grad_norm": 0.43359375, "learning_rate": 7.670445643934833e-06, "loss": 0.0574, "step": 1762 }, { "epoch": 0.5969820744534508, "grad_norm": 0.5390625, "learning_rate": 7.659559154344533e-06, "loss": 0.0558, "step": 1763 }, { "epoch": 0.5973206916255741, "grad_norm": 0.59765625, "learning_rate": 7.648675598497023e-06, "loss": 0.0637, "step": 1764 }, { "epoch": 0.5976593087976974, "grad_norm": 0.69140625, "learning_rate": 7.637794990034843e-06, "loss": 0.0891, "step": 1765 }, { "epoch": 0.5979979259698207, "grad_norm": 0.51953125, "learning_rate": 7.626917342596833e-06, "loss": 0.0642, "step": 1766 }, { "epoch": 0.5983365431419441, "grad_norm": 0.546875, "learning_rate": 7.616042669818133e-06, "loss": 0.0673, "step": 1767 }, { "epoch": 0.5986751603140674, "grad_norm": 0.6328125, "learning_rate": 7.605170985330139e-06, "loss": 0.0731, "step": 1768 }, { "epoch": 0.5990137774861908, "grad_norm": 0.462890625, "learning_rate": 7.594302302760512e-06, "loss": 0.0545, "step": 1769 }, { "epoch": 0.5993523946583141, "grad_norm": 0.53515625, "learning_rate": 7.5834366357331436e-06, "loss": 0.0648, "step": 1770 }, { "epoch": 0.5996910118304375, "grad_norm": 0.466796875, "learning_rate": 7.572573997868151e-06, "loss": 0.0583, "step": 1771 }, { "epoch": 0.6000296290025607, "grad_norm": 0.66015625, "learning_rate": 7.5617144027818515e-06, "loss": 0.1069, "step": 1772 }, { "epoch": 0.6003682461746841, "grad_norm": 0.474609375, "learning_rate": 7.550857864086747e-06, "loss": 0.0693, "step": 1773 }, { "epoch": 0.6007068633468075, "grad_norm": 0.427734375, "learning_rate": 7.540004395391509e-06, "loss": 0.0567, "step": 1774 }, { "epoch": 0.6010454805189308, "grad_norm": 0.4140625, "learning_rate": 7.529154010300963e-06, "loss": 0.0502, "step": 1775 }, { "epoch": 0.6013840976910542, "grad_norm": 0.470703125, "learning_rate": 7.518306722416074e-06, "loss": 0.0619, "step": 1776 }, { "epoch": 0.6013840976910542, "eval_loss": 0.06656693667173386, "eval_runtime": 815.1123, "eval_samples_per_second": 12.204, "eval_steps_per_second": 3.051, "step": 1776 }, { "epoch": 0.6017227148631775, "grad_norm": 0.486328125, "learning_rate": 7.5074625453339034e-06, "loss": 0.0615, "step": 1777 }, { "epoch": 0.6020613320353009, "grad_norm": 0.51953125, "learning_rate": 7.496621492647638e-06, "loss": 0.0651, "step": 1778 }, { "epoch": 0.6023999492074242, "grad_norm": 0.62109375, "learning_rate": 7.485783577946537e-06, "loss": 0.0694, "step": 1779 }, { "epoch": 0.6027385663795475, "grad_norm": 0.46484375, "learning_rate": 7.474948814815927e-06, "loss": 0.0644, "step": 1780 }, { "epoch": 0.6030771835516708, "grad_norm": 0.408203125, "learning_rate": 7.464117216837181e-06, "loss": 0.055, "step": 1781 }, { "epoch": 0.6034158007237942, "grad_norm": 0.5078125, "learning_rate": 7.453288797587714e-06, "loss": 0.0585, "step": 1782 }, { "epoch": 0.6037544178959176, "grad_norm": 0.45703125, "learning_rate": 7.442463570640947e-06, "loss": 0.0593, "step": 1783 }, { "epoch": 0.6040930350680409, "grad_norm": 0.412109375, "learning_rate": 7.431641549566304e-06, "loss": 0.0542, "step": 1784 }, { "epoch": 0.6044316522401643, "grad_norm": 0.54296875, "learning_rate": 7.420822747929187e-06, "loss": 0.0711, "step": 1785 }, { "epoch": 0.6047702694122876, "grad_norm": 0.98046875, "learning_rate": 7.410007179290968e-06, "loss": 0.0832, "step": 1786 }, { "epoch": 0.6051088865844109, "grad_norm": 0.49609375, "learning_rate": 7.399194857208962e-06, "loss": 0.0673, "step": 1787 }, { "epoch": 0.6054475037565342, "grad_norm": 0.51171875, "learning_rate": 7.388385795236415e-06, "loss": 0.0717, "step": 1788 }, { "epoch": 0.6057861209286576, "grad_norm": 0.53515625, "learning_rate": 7.377580006922486e-06, "loss": 0.0606, "step": 1789 }, { "epoch": 0.6061247381007809, "grad_norm": 0.50390625, "learning_rate": 7.366777505812234e-06, "loss": 0.0667, "step": 1790 }, { "epoch": 0.6064633552729043, "grad_norm": 0.421875, "learning_rate": 7.355978305446594e-06, "loss": 0.0498, "step": 1791 }, { "epoch": 0.6068019724450276, "grad_norm": 0.7265625, "learning_rate": 7.345182419362364e-06, "loss": 0.1045, "step": 1792 }, { "epoch": 0.607140589617151, "grad_norm": 2.0, "learning_rate": 7.334389861092187e-06, "loss": 0.0706, "step": 1793 }, { "epoch": 0.6074792067892743, "grad_norm": 0.44140625, "learning_rate": 7.323600644164539e-06, "loss": 0.0613, "step": 1794 }, { "epoch": 0.6078178239613976, "grad_norm": 0.66015625, "learning_rate": 7.312814782103703e-06, "loss": 0.0837, "step": 1795 }, { "epoch": 0.608156441133521, "grad_norm": 0.48828125, "learning_rate": 7.3020322884297565e-06, "loss": 0.073, "step": 1796 }, { "epoch": 0.6084950583056443, "grad_norm": 0.353515625, "learning_rate": 7.291253176658562e-06, "loss": 0.046, "step": 1797 }, { "epoch": 0.6088336754777677, "grad_norm": 0.5546875, "learning_rate": 7.280477460301727e-06, "loss": 0.0621, "step": 1798 }, { "epoch": 0.609172292649891, "grad_norm": 0.546875, "learning_rate": 7.26970515286662e-06, "loss": 0.0646, "step": 1799 }, { "epoch": 0.6095109098220144, "grad_norm": 0.427734375, "learning_rate": 7.258936267856323e-06, "loss": 0.0622, "step": 1800 }, { "epoch": 0.6098495269941376, "grad_norm": 0.435546875, "learning_rate": 7.248170818769642e-06, "loss": 0.0426, "step": 1801 }, { "epoch": 0.610188144166261, "grad_norm": 0.51171875, "learning_rate": 7.237408819101064e-06, "loss": 0.0586, "step": 1802 }, { "epoch": 0.6105267613383843, "grad_norm": 0.64453125, "learning_rate": 7.2266502823407584e-06, "loss": 0.0624, "step": 1803 }, { "epoch": 0.6108653785105077, "grad_norm": 0.474609375, "learning_rate": 7.215895221974548e-06, "loss": 0.062, "step": 1804 }, { "epoch": 0.6112039956826311, "grad_norm": 0.51953125, "learning_rate": 7.2051436514839064e-06, "loss": 0.0654, "step": 1805 }, { "epoch": 0.6115426128547544, "grad_norm": 0.47265625, "learning_rate": 7.194395584345927e-06, "loss": 0.0551, "step": 1806 }, { "epoch": 0.6118812300268778, "grad_norm": 0.55078125, "learning_rate": 7.1836510340333125e-06, "loss": 0.0641, "step": 1807 }, { "epoch": 0.6122198471990011, "grad_norm": 0.39453125, "learning_rate": 7.1729100140143535e-06, "loss": 0.0479, "step": 1808 }, { "epoch": 0.6125584643711244, "grad_norm": 0.6328125, "learning_rate": 7.162172537752927e-06, "loss": 0.0776, "step": 1809 }, { "epoch": 0.6128970815432477, "grad_norm": 0.486328125, "learning_rate": 7.151438618708455e-06, "loss": 0.065, "step": 1810 }, { "epoch": 0.6132356987153711, "grad_norm": 0.58984375, "learning_rate": 7.1407082703359085e-06, "loss": 0.0835, "step": 1811 }, { "epoch": 0.6135743158874944, "grad_norm": 0.54296875, "learning_rate": 7.129981506085777e-06, "loss": 0.0665, "step": 1812 }, { "epoch": 0.6139129330596178, "grad_norm": 0.48046875, "learning_rate": 7.119258339404065e-06, "loss": 0.0605, "step": 1813 }, { "epoch": 0.6142515502317412, "grad_norm": 0.39453125, "learning_rate": 7.1085387837322595e-06, "loss": 0.0528, "step": 1814 }, { "epoch": 0.6145901674038645, "grad_norm": 0.65625, "learning_rate": 7.097822852507325e-06, "loss": 0.0892, "step": 1815 }, { "epoch": 0.6149287845759878, "grad_norm": 0.5625, "learning_rate": 7.087110559161681e-06, "loss": 0.079, "step": 1816 }, { "epoch": 0.6152674017481111, "grad_norm": 0.671875, "learning_rate": 7.0764019171231906e-06, "loss": 0.0519, "step": 1817 }, { "epoch": 0.6156060189202345, "grad_norm": 0.53125, "learning_rate": 7.06569693981514e-06, "loss": 0.0608, "step": 1818 }, { "epoch": 0.6159446360923578, "grad_norm": 0.5546875, "learning_rate": 7.0549956406562105e-06, "loss": 0.0784, "step": 1819 }, { "epoch": 0.6162832532644812, "grad_norm": 0.47265625, "learning_rate": 7.044298033060487e-06, "loss": 0.0667, "step": 1820 }, { "epoch": 0.6166218704366045, "grad_norm": 0.447265625, "learning_rate": 7.033604130437422e-06, "loss": 0.0612, "step": 1821 }, { "epoch": 0.6169604876087279, "grad_norm": 0.61328125, "learning_rate": 7.022913946191821e-06, "loss": 0.0698, "step": 1822 }, { "epoch": 0.6172991047808511, "grad_norm": 0.5625, "learning_rate": 7.012227493723831e-06, "loss": 0.0673, "step": 1823 }, { "epoch": 0.6176377219529745, "grad_norm": 0.5546875, "learning_rate": 7.001544786428924e-06, "loss": 0.0601, "step": 1824 }, { "epoch": 0.6179763391250979, "grad_norm": 0.5390625, "learning_rate": 6.990865837697872e-06, "loss": 0.0562, "step": 1825 }, { "epoch": 0.6183149562972212, "grad_norm": 0.46875, "learning_rate": 6.980190660916739e-06, "loss": 0.0658, "step": 1826 }, { "epoch": 0.6186535734693446, "grad_norm": 0.42578125, "learning_rate": 6.969519269466858e-06, "loss": 0.055, "step": 1827 }, { "epoch": 0.6189921906414679, "grad_norm": 0.490234375, "learning_rate": 6.958851676724823e-06, "loss": 0.0652, "step": 1828 }, { "epoch": 0.6193308078135913, "grad_norm": 0.578125, "learning_rate": 6.9481878960624585e-06, "loss": 0.0715, "step": 1829 }, { "epoch": 0.6196694249857145, "grad_norm": 0.74609375, "learning_rate": 6.937527940846816e-06, "loss": 0.1297, "step": 1830 }, { "epoch": 0.6200080421578379, "grad_norm": 0.43359375, "learning_rate": 6.926871824440149e-06, "loss": 0.0607, "step": 1831 }, { "epoch": 0.6203466593299612, "grad_norm": 0.470703125, "learning_rate": 6.916219560199904e-06, "loss": 0.0621, "step": 1832 }, { "epoch": 0.6206852765020846, "grad_norm": 0.46484375, "learning_rate": 6.905571161478692e-06, "loss": 0.0516, "step": 1833 }, { "epoch": 0.621023893674208, "grad_norm": 0.5546875, "learning_rate": 6.894926641624282e-06, "loss": 0.0806, "step": 1834 }, { "epoch": 0.6213625108463313, "grad_norm": 0.453125, "learning_rate": 6.8842860139795795e-06, "loss": 0.0625, "step": 1835 }, { "epoch": 0.6217011280184547, "grad_norm": 0.490234375, "learning_rate": 6.873649291882613e-06, "loss": 0.0609, "step": 1836 }, { "epoch": 0.622039745190578, "grad_norm": 0.478515625, "learning_rate": 6.8630164886665165e-06, "loss": 0.0683, "step": 1837 }, { "epoch": 0.6223783623627013, "grad_norm": 0.43359375, "learning_rate": 6.8523876176595084e-06, "loss": 0.0567, "step": 1838 }, { "epoch": 0.6227169795348246, "grad_norm": 0.41796875, "learning_rate": 6.841762692184881e-06, "loss": 0.0535, "step": 1839 }, { "epoch": 0.623055596706948, "grad_norm": 0.57421875, "learning_rate": 6.831141725560975e-06, "loss": 0.0775, "step": 1840 }, { "epoch": 0.6233942138790713, "grad_norm": 0.5, "learning_rate": 6.820524731101176e-06, "loss": 0.0621, "step": 1841 }, { "epoch": 0.6237328310511947, "grad_norm": 0.490234375, "learning_rate": 6.809911722113884e-06, "loss": 0.0549, "step": 1842 }, { "epoch": 0.6240714482233181, "grad_norm": 0.54296875, "learning_rate": 6.7993027119025115e-06, "loss": 0.0683, "step": 1843 }, { "epoch": 0.6244100653954414, "grad_norm": 0.4453125, "learning_rate": 6.7886977137654505e-06, "loss": 0.0644, "step": 1844 }, { "epoch": 0.6247486825675647, "grad_norm": 0.52734375, "learning_rate": 6.778096740996069e-06, "loss": 0.0677, "step": 1845 }, { "epoch": 0.625087299739688, "grad_norm": 1.734375, "learning_rate": 6.767499806882685e-06, "loss": 0.0645, "step": 1846 }, { "epoch": 0.6254259169118114, "grad_norm": 0.486328125, "learning_rate": 6.756906924708558e-06, "loss": 0.069, "step": 1847 }, { "epoch": 0.6257645340839347, "grad_norm": 0.4765625, "learning_rate": 6.746318107751867e-06, "loss": 0.064, "step": 1848 }, { "epoch": 0.6261031512560581, "grad_norm": 0.48828125, "learning_rate": 6.735733369285694e-06, "loss": 0.0662, "step": 1849 }, { "epoch": 0.6264417684281814, "grad_norm": 0.55078125, "learning_rate": 6.7251527225780075e-06, "loss": 0.0766, "step": 1850 }, { "epoch": 0.6267803856003048, "grad_norm": 0.51171875, "learning_rate": 6.714576180891653e-06, "loss": 0.0681, "step": 1851 }, { "epoch": 0.627119002772428, "grad_norm": 0.52734375, "learning_rate": 6.7040037574843255e-06, "loss": 0.0711, "step": 1852 }, { "epoch": 0.6274576199445514, "grad_norm": 0.58203125, "learning_rate": 6.693435465608556e-06, "loss": 0.0742, "step": 1853 }, { "epoch": 0.6277962371166748, "grad_norm": 0.36328125, "learning_rate": 6.682871318511702e-06, "loss": 0.0477, "step": 1854 }, { "epoch": 0.6281348542887981, "grad_norm": 0.498046875, "learning_rate": 6.672311329435919e-06, "loss": 0.0624, "step": 1855 }, { "epoch": 0.6284734714609215, "grad_norm": 0.5078125, "learning_rate": 6.66175551161816e-06, "loss": 0.0648, "step": 1856 }, { "epoch": 0.6288120886330448, "grad_norm": 0.53125, "learning_rate": 6.651203878290139e-06, "loss": 0.0629, "step": 1857 }, { "epoch": 0.6291507058051682, "grad_norm": 0.4375, "learning_rate": 6.64065644267833e-06, "loss": 0.059, "step": 1858 }, { "epoch": 0.6294893229772914, "grad_norm": 0.5390625, "learning_rate": 6.630113218003944e-06, "loss": 0.0675, "step": 1859 }, { "epoch": 0.6298279401494148, "grad_norm": 0.59375, "learning_rate": 6.619574217482918e-06, "loss": 0.0798, "step": 1860 }, { "epoch": 0.6301665573215381, "grad_norm": 0.51171875, "learning_rate": 6.609039454325887e-06, "loss": 0.0715, "step": 1861 }, { "epoch": 0.6305051744936615, "grad_norm": 0.59375, "learning_rate": 6.598508941738176e-06, "loss": 0.0674, "step": 1862 }, { "epoch": 0.6308437916657849, "grad_norm": 0.427734375, "learning_rate": 6.587982692919785e-06, "loss": 0.045, "step": 1863 }, { "epoch": 0.6311824088379082, "grad_norm": 0.53515625, "learning_rate": 6.5774607210653675e-06, "loss": 0.0666, "step": 1864 }, { "epoch": 0.6315210260100316, "grad_norm": 0.5859375, "learning_rate": 6.566943039364215e-06, "loss": 0.0581, "step": 1865 }, { "epoch": 0.6318596431821549, "grad_norm": 0.5546875, "learning_rate": 6.556429661000244e-06, "loss": 0.0726, "step": 1866 }, { "epoch": 0.6321982603542782, "grad_norm": 1.3203125, "learning_rate": 6.545920599151976e-06, "loss": 0.0555, "step": 1867 }, { "epoch": 0.6325368775264015, "grad_norm": 0.51171875, "learning_rate": 6.535415866992518e-06, "loss": 0.0702, "step": 1868 }, { "epoch": 0.6328754946985249, "grad_norm": 0.51171875, "learning_rate": 6.524915477689553e-06, "loss": 0.0601, "step": 1869 }, { "epoch": 0.6332141118706482, "grad_norm": 0.384765625, "learning_rate": 6.5144194444053235e-06, "loss": 0.0561, "step": 1870 }, { "epoch": 0.6335527290427716, "grad_norm": 0.59765625, "learning_rate": 6.503927780296605e-06, "loss": 0.0833, "step": 1871 }, { "epoch": 0.633891346214895, "grad_norm": 0.48828125, "learning_rate": 6.4934404985147e-06, "loss": 0.0658, "step": 1872 }, { "epoch": 0.6342299633870183, "grad_norm": 0.44140625, "learning_rate": 6.482957612205416e-06, "loss": 0.0476, "step": 1873 }, { "epoch": 0.6345685805591416, "grad_norm": 1.03125, "learning_rate": 6.472479134509052e-06, "loss": 0.094, "step": 1874 }, { "epoch": 0.6349071977312649, "grad_norm": 0.6484375, "learning_rate": 6.4620050785603836e-06, "loss": 0.0744, "step": 1875 }, { "epoch": 0.6352458149033883, "grad_norm": 0.462890625, "learning_rate": 6.451535457488638e-06, "loss": 0.0597, "step": 1876 }, { "epoch": 0.6355844320755116, "grad_norm": 0.42578125, "learning_rate": 6.4410702844174875e-06, "loss": 0.0638, "step": 1877 }, { "epoch": 0.635923049247635, "grad_norm": 0.4765625, "learning_rate": 6.430609572465024e-06, "loss": 0.0623, "step": 1878 }, { "epoch": 0.6362616664197583, "grad_norm": 0.4375, "learning_rate": 6.420153334743755e-06, "loss": 0.0562, "step": 1879 }, { "epoch": 0.6366002835918817, "grad_norm": 1.125, "learning_rate": 6.409701584360575e-06, "loss": 0.0465, "step": 1880 }, { "epoch": 0.636938900764005, "grad_norm": 0.416015625, "learning_rate": 6.399254334416752e-06, "loss": 0.0492, "step": 1881 }, { "epoch": 0.6372775179361283, "grad_norm": 0.45703125, "learning_rate": 6.388811598007918e-06, "loss": 0.0583, "step": 1882 }, { "epoch": 0.6376161351082517, "grad_norm": 0.53125, "learning_rate": 6.378373388224039e-06, "loss": 0.0709, "step": 1883 }, { "epoch": 0.637954752280375, "grad_norm": 0.46484375, "learning_rate": 6.3679397181494115e-06, "loss": 0.0598, "step": 1884 }, { "epoch": 0.6382933694524984, "grad_norm": 0.41796875, "learning_rate": 6.357510600862646e-06, "loss": 0.0561, "step": 1885 }, { "epoch": 0.6386319866246217, "grad_norm": 0.5703125, "learning_rate": 6.3470860494366415e-06, "loss": 0.0637, "step": 1886 }, { "epoch": 0.6389706037967451, "grad_norm": 0.5390625, "learning_rate": 6.336666076938573e-06, "loss": 0.0627, "step": 1887 }, { "epoch": 0.6393092209688683, "grad_norm": 0.5703125, "learning_rate": 6.326250696429877e-06, "loss": 0.0742, "step": 1888 }, { "epoch": 0.6396478381409917, "grad_norm": 0.474609375, "learning_rate": 6.315839920966229e-06, "loss": 0.0568, "step": 1889 }, { "epoch": 0.639986455313115, "grad_norm": 0.51953125, "learning_rate": 6.305433763597546e-06, "loss": 0.0758, "step": 1890 }, { "epoch": 0.6403250724852384, "grad_norm": 0.462890625, "learning_rate": 6.295032237367942e-06, "loss": 0.0552, "step": 1891 }, { "epoch": 0.6406636896573618, "grad_norm": 0.470703125, "learning_rate": 6.284635355315731e-06, "loss": 0.0675, "step": 1892 }, { "epoch": 0.6410023068294851, "grad_norm": 0.52734375, "learning_rate": 6.274243130473405e-06, "loss": 0.063, "step": 1893 }, { "epoch": 0.6413409240016085, "grad_norm": 0.40625, "learning_rate": 6.2638555758676215e-06, "loss": 0.0549, "step": 1894 }, { "epoch": 0.6416795411737318, "grad_norm": 0.478515625, "learning_rate": 6.253472704519179e-06, "loss": 0.0589, "step": 1895 }, { "epoch": 0.6420181583458551, "grad_norm": 0.62109375, "learning_rate": 6.243094529443008e-06, "loss": 0.0856, "step": 1896 }, { "epoch": 0.6423567755179784, "grad_norm": 0.380859375, "learning_rate": 6.232721063648148e-06, "loss": 0.0506, "step": 1897 }, { "epoch": 0.6426953926901018, "grad_norm": 0.55859375, "learning_rate": 6.222352320137748e-06, "loss": 0.0758, "step": 1898 }, { "epoch": 0.6430340098622251, "grad_norm": 0.427734375, "learning_rate": 6.211988311909021e-06, "loss": 0.0528, "step": 1899 }, { "epoch": 0.6433726270343485, "grad_norm": 0.4296875, "learning_rate": 6.201629051953257e-06, "loss": 0.0562, "step": 1900 }, { "epoch": 0.6437112442064719, "grad_norm": 0.44140625, "learning_rate": 6.1912745532557834e-06, "loss": 0.0583, "step": 1901 }, { "epoch": 0.6440498613785952, "grad_norm": 0.56640625, "learning_rate": 6.180924828795972e-06, "loss": 0.0687, "step": 1902 }, { "epoch": 0.6443884785507185, "grad_norm": 0.53125, "learning_rate": 6.170579891547202e-06, "loss": 0.0623, "step": 1903 }, { "epoch": 0.6447270957228418, "grad_norm": 0.4921875, "learning_rate": 6.160239754476849e-06, "loss": 0.0695, "step": 1904 }, { "epoch": 0.6450657128949652, "grad_norm": 0.46484375, "learning_rate": 6.149904430546278e-06, "loss": 0.0585, "step": 1905 }, { "epoch": 0.6454043300670885, "grad_norm": 0.44921875, "learning_rate": 6.1395739327108185e-06, "loss": 0.0655, "step": 1906 }, { "epoch": 0.6457429472392119, "grad_norm": 0.56640625, "learning_rate": 6.12924827391975e-06, "loss": 0.0727, "step": 1907 }, { "epoch": 0.6460815644113352, "grad_norm": 0.46484375, "learning_rate": 6.118927467116285e-06, "loss": 0.0523, "step": 1908 }, { "epoch": 0.6464201815834586, "grad_norm": 0.625, "learning_rate": 6.1086115252375585e-06, "loss": 0.0855, "step": 1909 }, { "epoch": 0.6467587987555818, "grad_norm": 0.6328125, "learning_rate": 6.098300461214605e-06, "loss": 0.0866, "step": 1910 }, { "epoch": 0.6470974159277052, "grad_norm": 0.4765625, "learning_rate": 6.087994287972341e-06, "loss": 0.0627, "step": 1911 }, { "epoch": 0.6474360330998286, "grad_norm": 0.7109375, "learning_rate": 6.077693018429556e-06, "loss": 0.0702, "step": 1912 }, { "epoch": 0.6477746502719519, "grad_norm": 0.578125, "learning_rate": 6.0673966654988946e-06, "loss": 0.0679, "step": 1913 }, { "epoch": 0.6481132674440753, "grad_norm": 0.51953125, "learning_rate": 6.057105242086836e-06, "loss": 0.0615, "step": 1914 }, { "epoch": 0.6484518846161986, "grad_norm": 0.462890625, "learning_rate": 6.046818761093678e-06, "loss": 0.0506, "step": 1915 }, { "epoch": 0.648790501788322, "grad_norm": 0.76171875, "learning_rate": 6.036537235413524e-06, "loss": 0.1215, "step": 1916 }, { "epoch": 0.6491291189604452, "grad_norm": 0.4921875, "learning_rate": 6.026260677934273e-06, "loss": 0.0574, "step": 1917 }, { "epoch": 0.6494677361325686, "grad_norm": 0.43359375, "learning_rate": 6.015989101537586e-06, "loss": 0.0573, "step": 1918 }, { "epoch": 0.6498063533046919, "grad_norm": 0.384765625, "learning_rate": 6.005722519098887e-06, "loss": 0.0501, "step": 1919 }, { "epoch": 0.6501449704768153, "grad_norm": 0.46875, "learning_rate": 5.995460943487334e-06, "loss": 0.0666, "step": 1920 }, { "epoch": 0.6504835876489387, "grad_norm": 0.50390625, "learning_rate": 5.9852043875658195e-06, "loss": 0.0678, "step": 1921 }, { "epoch": 0.650822204821062, "grad_norm": 0.58203125, "learning_rate": 5.974952864190933e-06, "loss": 0.0809, "step": 1922 }, { "epoch": 0.6511608219931854, "grad_norm": 0.453125, "learning_rate": 5.964706386212959e-06, "loss": 0.0657, "step": 1923 }, { "epoch": 0.6514994391653087, "grad_norm": 0.443359375, "learning_rate": 5.95446496647586e-06, "loss": 0.0508, "step": 1924 }, { "epoch": 0.651838056337432, "grad_norm": 0.427734375, "learning_rate": 5.944228617817263e-06, "loss": 0.0598, "step": 1925 }, { "epoch": 0.6521766735095553, "grad_norm": 0.470703125, "learning_rate": 5.933997353068419e-06, "loss": 0.0699, "step": 1926 }, { "epoch": 0.6525152906816787, "grad_norm": 0.609375, "learning_rate": 5.923771185054224e-06, "loss": 0.0726, "step": 1927 }, { "epoch": 0.652853907853802, "grad_norm": 0.5078125, "learning_rate": 5.913550126593186e-06, "loss": 0.0721, "step": 1928 }, { "epoch": 0.6531925250259254, "grad_norm": 0.37890625, "learning_rate": 5.903334190497396e-06, "loss": 0.0483, "step": 1929 }, { "epoch": 0.6535311421980488, "grad_norm": 0.462890625, "learning_rate": 5.8931233895725345e-06, "loss": 0.0528, "step": 1930 }, { "epoch": 0.6538697593701721, "grad_norm": 0.482421875, "learning_rate": 5.882917736617839e-06, "loss": 0.0751, "step": 1931 }, { "epoch": 0.6542083765422954, "grad_norm": 0.4296875, "learning_rate": 5.872717244426099e-06, "loss": 0.0562, "step": 1932 }, { "epoch": 0.6545469937144187, "grad_norm": 0.498046875, "learning_rate": 5.862521925783631e-06, "loss": 0.0628, "step": 1933 }, { "epoch": 0.6548856108865421, "grad_norm": 0.44140625, "learning_rate": 5.852331793470267e-06, "loss": 0.0523, "step": 1934 }, { "epoch": 0.6552242280586654, "grad_norm": 0.5390625, "learning_rate": 5.842146860259337e-06, "loss": 0.0563, "step": 1935 }, { "epoch": 0.6555628452307888, "grad_norm": 0.41796875, "learning_rate": 5.8319671389176605e-06, "loss": 0.0523, "step": 1936 }, { "epoch": 0.6559014624029121, "grad_norm": 0.439453125, "learning_rate": 5.821792642205512e-06, "loss": 0.0534, "step": 1937 }, { "epoch": 0.6562400795750355, "grad_norm": 0.8984375, "learning_rate": 5.811623382876636e-06, "loss": 0.0865, "step": 1938 }, { "epoch": 0.6565786967471587, "grad_norm": 0.7734375, "learning_rate": 5.8014593736781864e-06, "loss": 0.0701, "step": 1939 }, { "epoch": 0.6569173139192821, "grad_norm": 0.458984375, "learning_rate": 5.791300627350759e-06, "loss": 0.052, "step": 1940 }, { "epoch": 0.6572559310914055, "grad_norm": 0.51953125, "learning_rate": 5.781147156628336e-06, "loss": 0.0633, "step": 1941 }, { "epoch": 0.6575945482635288, "grad_norm": 0.54296875, "learning_rate": 5.770998974238298e-06, "loss": 0.0629, "step": 1942 }, { "epoch": 0.6579331654356522, "grad_norm": 0.48828125, "learning_rate": 5.760856092901394e-06, "loss": 0.0605, "step": 1943 }, { "epoch": 0.6582717826077755, "grad_norm": 0.51953125, "learning_rate": 5.750718525331722e-06, "loss": 0.0576, "step": 1944 }, { "epoch": 0.6586103997798989, "grad_norm": 0.373046875, "learning_rate": 5.740586284236724e-06, "loss": 0.0499, "step": 1945 }, { "epoch": 0.6589490169520221, "grad_norm": 0.5078125, "learning_rate": 5.730459382317177e-06, "loss": 0.0711, "step": 1946 }, { "epoch": 0.6592876341241455, "grad_norm": 0.412109375, "learning_rate": 5.720337832267136e-06, "loss": 0.06, "step": 1947 }, { "epoch": 0.6596262512962688, "grad_norm": 0.52734375, "learning_rate": 5.710221646773971e-06, "loss": 0.0605, "step": 1948 }, { "epoch": 0.6599648684683922, "grad_norm": 0.48046875, "learning_rate": 5.700110838518327e-06, "loss": 0.0567, "step": 1949 }, { "epoch": 0.6603034856405156, "grad_norm": 0.376953125, "learning_rate": 5.690005420174095e-06, "loss": 0.0477, "step": 1950 }, { "epoch": 0.6606421028126389, "grad_norm": 0.486328125, "learning_rate": 5.679905404408426e-06, "loss": 0.0622, "step": 1951 }, { "epoch": 0.6609807199847623, "grad_norm": 0.6015625, "learning_rate": 5.6698108038816815e-06, "loss": 0.0638, "step": 1952 }, { "epoch": 0.6613193371568856, "grad_norm": 0.400390625, "learning_rate": 5.6597216312474476e-06, "loss": 0.054, "step": 1953 }, { "epoch": 0.6616579543290089, "grad_norm": 0.453125, "learning_rate": 5.649637899152509e-06, "loss": 0.0533, "step": 1954 }, { "epoch": 0.6619965715011322, "grad_norm": 0.455078125, "learning_rate": 5.639559620236815e-06, "loss": 0.0573, "step": 1955 }, { "epoch": 0.6623351886732556, "grad_norm": 0.5, "learning_rate": 5.629486807133495e-06, "loss": 0.0699, "step": 1956 }, { "epoch": 0.6626738058453789, "grad_norm": 0.44921875, "learning_rate": 5.619419472468824e-06, "loss": 0.0509, "step": 1957 }, { "epoch": 0.6630124230175023, "grad_norm": 0.625, "learning_rate": 5.609357628862197e-06, "loss": 0.0755, "step": 1958 }, { "epoch": 0.6633510401896257, "grad_norm": 0.451171875, "learning_rate": 5.599301288926145e-06, "loss": 0.0509, "step": 1959 }, { "epoch": 0.663689657361749, "grad_norm": 0.478515625, "learning_rate": 5.5892504652662845e-06, "loss": 0.0623, "step": 1960 }, { "epoch": 0.6640282745338723, "grad_norm": 0.478515625, "learning_rate": 5.579205170481328e-06, "loss": 0.0578, "step": 1961 }, { "epoch": 0.6643668917059956, "grad_norm": 0.498046875, "learning_rate": 5.569165417163054e-06, "loss": 0.0685, "step": 1962 }, { "epoch": 0.664705508878119, "grad_norm": 0.515625, "learning_rate": 5.559131217896288e-06, "loss": 0.0699, "step": 1963 }, { "epoch": 0.6650441260502423, "grad_norm": 0.470703125, "learning_rate": 5.549102585258904e-06, "loss": 0.0572, "step": 1964 }, { "epoch": 0.6653827432223657, "grad_norm": 0.48046875, "learning_rate": 5.539079531821799e-06, "loss": 0.0532, "step": 1965 }, { "epoch": 0.665721360394489, "grad_norm": 0.47265625, "learning_rate": 5.529062070148859e-06, "loss": 0.0597, "step": 1966 }, { "epoch": 0.6660599775666124, "grad_norm": 0.470703125, "learning_rate": 5.519050212796986e-06, "loss": 0.0668, "step": 1967 }, { "epoch": 0.6663985947387356, "grad_norm": 0.498046875, "learning_rate": 5.509043972316037e-06, "loss": 0.0614, "step": 1968 }, { "epoch": 0.666737211910859, "grad_norm": 0.56640625, "learning_rate": 5.499043361248832e-06, "loss": 0.0561, "step": 1969 }, { "epoch": 0.6670758290829824, "grad_norm": 0.51953125, "learning_rate": 5.489048392131147e-06, "loss": 0.0859, "step": 1970 }, { "epoch": 0.6674144462551057, "grad_norm": 0.46875, "learning_rate": 5.4790590774916665e-06, "loss": 0.0537, "step": 1971 }, { "epoch": 0.6677530634272291, "grad_norm": 0.435546875, "learning_rate": 5.469075429852002e-06, "loss": 0.0555, "step": 1972 }, { "epoch": 0.6680916805993524, "grad_norm": 0.51953125, "learning_rate": 5.459097461726661e-06, "loss": 0.0719, "step": 1973 }, { "epoch": 0.6684302977714758, "grad_norm": 0.55078125, "learning_rate": 5.44912518562302e-06, "loss": 0.0772, "step": 1974 }, { "epoch": 0.668768914943599, "grad_norm": 0.60546875, "learning_rate": 5.439158614041331e-06, "loss": 0.06, "step": 1975 }, { "epoch": 0.6691075321157224, "grad_norm": 0.3984375, "learning_rate": 5.4291977594746955e-06, "loss": 0.0536, "step": 1976 }, { "epoch": 0.6694461492878457, "grad_norm": 0.5859375, "learning_rate": 5.419242634409039e-06, "loss": 0.113, "step": 1977 }, { "epoch": 0.6697847664599691, "grad_norm": 0.6484375, "learning_rate": 5.409293251323119e-06, "loss": 0.0825, "step": 1978 }, { "epoch": 0.6701233836320925, "grad_norm": 0.5390625, "learning_rate": 5.399349622688479e-06, "loss": 0.0676, "step": 1979 }, { "epoch": 0.6704620008042158, "grad_norm": 0.609375, "learning_rate": 5.3894117609694655e-06, "loss": 0.0731, "step": 1980 }, { "epoch": 0.6708006179763392, "grad_norm": 0.4765625, "learning_rate": 5.379479678623189e-06, "loss": 0.0647, "step": 1981 }, { "epoch": 0.6711392351484625, "grad_norm": 0.5078125, "learning_rate": 5.3695533880995096e-06, "loss": 0.0873, "step": 1982 }, { "epoch": 0.6714778523205858, "grad_norm": 0.478515625, "learning_rate": 5.359632901841038e-06, "loss": 0.0594, "step": 1983 }, { "epoch": 0.6718164694927091, "grad_norm": 0.404296875, "learning_rate": 5.349718232283106e-06, "loss": 0.0601, "step": 1984 }, { "epoch": 0.6721550866648325, "grad_norm": 0.63671875, "learning_rate": 5.339809391853747e-06, "loss": 0.0798, "step": 1985 }, { "epoch": 0.6724937038369558, "grad_norm": 0.494140625, "learning_rate": 5.3299063929737015e-06, "loss": 0.0687, "step": 1986 }, { "epoch": 0.6728323210090792, "grad_norm": 0.4140625, "learning_rate": 5.3200092480563704e-06, "loss": 0.0536, "step": 1987 }, { "epoch": 0.6731709381812025, "grad_norm": 0.609375, "learning_rate": 5.310117969507833e-06, "loss": 0.0457, "step": 1988 }, { "epoch": 0.6735095553533259, "grad_norm": 0.51953125, "learning_rate": 5.300232569726805e-06, "loss": 0.0617, "step": 1989 }, { "epoch": 0.6738481725254492, "grad_norm": 0.490234375, "learning_rate": 5.29035306110463e-06, "loss": 0.0634, "step": 1990 }, { "epoch": 0.6741867896975725, "grad_norm": 0.53515625, "learning_rate": 5.2804794560252785e-06, "loss": 0.0601, "step": 1991 }, { "epoch": 0.6745254068696959, "grad_norm": 0.61328125, "learning_rate": 5.270611766865319e-06, "loss": 0.0957, "step": 1992 }, { "epoch": 0.6748640240418192, "grad_norm": 0.6328125, "learning_rate": 5.2607500059938935e-06, "loss": 0.1005, "step": 1993 }, { "epoch": 0.6752026412139426, "grad_norm": 0.458984375, "learning_rate": 5.250894185772724e-06, "loss": 0.0555, "step": 1994 }, { "epoch": 0.6755412583860659, "grad_norm": 0.44921875, "learning_rate": 5.241044318556083e-06, "loss": 0.0605, "step": 1995 }, { "epoch": 0.6758798755581893, "grad_norm": 0.490234375, "learning_rate": 5.231200416690775e-06, "loss": 0.0753, "step": 1996 }, { "epoch": 0.6762184927303125, "grad_norm": 0.515625, "learning_rate": 5.221362492516139e-06, "loss": 0.0718, "step": 1997 }, { "epoch": 0.6765571099024359, "grad_norm": 0.50390625, "learning_rate": 5.211530558364005e-06, "loss": 0.0645, "step": 1998 }, { "epoch": 0.6768957270745593, "grad_norm": 0.5, "learning_rate": 5.201704626558708e-06, "loss": 0.0597, "step": 1999 }, { "epoch": 0.6772343442466826, "grad_norm": 1.109375, "learning_rate": 5.191884709417058e-06, "loss": 0.0725, "step": 2000 }, { "epoch": 0.677572961418806, "grad_norm": 0.45703125, "learning_rate": 5.1820708192483145e-06, "loss": 0.0579, "step": 2001 }, { "epoch": 0.6779115785909293, "grad_norm": 0.71484375, "learning_rate": 5.172262968354198e-06, "loss": 0.087, "step": 2002 }, { "epoch": 0.6782501957630527, "grad_norm": 0.404296875, "learning_rate": 5.162461169028841e-06, "loss": 0.0513, "step": 2003 }, { "epoch": 0.6785888129351759, "grad_norm": 0.58984375, "learning_rate": 5.152665433558803e-06, "loss": 0.0824, "step": 2004 }, { "epoch": 0.6789274301072993, "grad_norm": 0.5, "learning_rate": 5.1428757742230466e-06, "loss": 0.0706, "step": 2005 }, { "epoch": 0.6792660472794226, "grad_norm": 0.5078125, "learning_rate": 5.1330922032928996e-06, "loss": 0.0718, "step": 2006 }, { "epoch": 0.679604664451546, "grad_norm": 0.59765625, "learning_rate": 5.123314733032074e-06, "loss": 0.0998, "step": 2007 }, { "epoch": 0.6799432816236693, "grad_norm": 0.37890625, "learning_rate": 5.113543375696633e-06, "loss": 0.052, "step": 2008 }, { "epoch": 0.6802818987957927, "grad_norm": 0.46484375, "learning_rate": 5.1037781435349676e-06, "loss": 0.065, "step": 2009 }, { "epoch": 0.6806205159679161, "grad_norm": 0.3984375, "learning_rate": 5.094019048787802e-06, "loss": 0.051, "step": 2010 }, { "epoch": 0.6809591331400394, "grad_norm": 0.60546875, "learning_rate": 5.084266103688161e-06, "loss": 0.0822, "step": 2011 }, { "epoch": 0.6812977503121627, "grad_norm": 0.482421875, "learning_rate": 5.074519320461358e-06, "loss": 0.0605, "step": 2012 }, { "epoch": 0.681636367484286, "grad_norm": 0.484375, "learning_rate": 5.064778711324989e-06, "loss": 0.0494, "step": 2013 }, { "epoch": 0.6819749846564094, "grad_norm": 0.88671875, "learning_rate": 5.055044288488913e-06, "loss": 0.0791, "step": 2014 }, { "epoch": 0.6823136018285327, "grad_norm": 0.41015625, "learning_rate": 5.045316064155221e-06, "loss": 0.054, "step": 2015 }, { "epoch": 0.6826522190006561, "grad_norm": 0.4453125, "learning_rate": 5.035594050518254e-06, "loss": 0.0535, "step": 2016 }, { "epoch": 0.6829908361727794, "grad_norm": 0.5078125, "learning_rate": 5.025878259764545e-06, "loss": 0.0676, "step": 2017 }, { "epoch": 0.6833294533449028, "grad_norm": 0.427734375, "learning_rate": 5.016168704072846e-06, "loss": 0.0536, "step": 2018 }, { "epoch": 0.683668070517026, "grad_norm": 0.4453125, "learning_rate": 5.006465395614086e-06, "loss": 0.0603, "step": 2019 }, { "epoch": 0.6840066876891494, "grad_norm": 0.498046875, "learning_rate": 4.9967683465513595e-06, "loss": 0.0726, "step": 2020 }, { "epoch": 0.6843453048612728, "grad_norm": 0.453125, "learning_rate": 4.987077569039922e-06, "loss": 0.0645, "step": 2021 }, { "epoch": 0.6846839220333961, "grad_norm": 0.453125, "learning_rate": 4.977393075227159e-06, "loss": 0.0542, "step": 2022 }, { "epoch": 0.6850225392055195, "grad_norm": 0.4453125, "learning_rate": 4.967714877252587e-06, "loss": 0.0515, "step": 2023 }, { "epoch": 0.6853611563776428, "grad_norm": 0.458984375, "learning_rate": 4.958042987247832e-06, "loss": 0.0684, "step": 2024 }, { "epoch": 0.6856997735497662, "grad_norm": 0.490234375, "learning_rate": 4.9483774173366e-06, "loss": 0.0718, "step": 2025 }, { "epoch": 0.6860383907218894, "grad_norm": 0.41796875, "learning_rate": 4.938718179634689e-06, "loss": 0.0558, "step": 2026 }, { "epoch": 0.6863770078940128, "grad_norm": 0.52734375, "learning_rate": 4.929065286249959e-06, "loss": 0.0724, "step": 2027 }, { "epoch": 0.6867156250661361, "grad_norm": 0.859375, "learning_rate": 4.919418749282302e-06, "loss": 0.1876, "step": 2028 }, { "epoch": 0.6870542422382595, "grad_norm": 0.38671875, "learning_rate": 4.909778580823663e-06, "loss": 0.0502, "step": 2029 }, { "epoch": 0.6873928594103829, "grad_norm": 0.65234375, "learning_rate": 4.9001447929579855e-06, "loss": 0.0814, "step": 2030 }, { "epoch": 0.6877314765825062, "grad_norm": 0.5703125, "learning_rate": 4.890517397761232e-06, "loss": 0.0727, "step": 2031 }, { "epoch": 0.6880700937546296, "grad_norm": 0.482421875, "learning_rate": 4.880896407301333e-06, "loss": 0.064, "step": 2032 }, { "epoch": 0.6884087109267528, "grad_norm": 0.75, "learning_rate": 4.8712818336382104e-06, "loss": 0.0589, "step": 2033 }, { "epoch": 0.6887473280988762, "grad_norm": 0.5703125, "learning_rate": 4.861673688823726e-06, "loss": 0.0676, "step": 2034 }, { "epoch": 0.6890859452709995, "grad_norm": 0.46484375, "learning_rate": 4.852071984901696e-06, "loss": 0.0677, "step": 2035 }, { "epoch": 0.6894245624431229, "grad_norm": 0.67578125, "learning_rate": 4.842476733907851e-06, "loss": 0.0656, "step": 2036 }, { "epoch": 0.6897631796152462, "grad_norm": 0.3984375, "learning_rate": 4.832887947869841e-06, "loss": 0.0561, "step": 2037 }, { "epoch": 0.6901017967873696, "grad_norm": 0.46875, "learning_rate": 4.823305638807215e-06, "loss": 0.0559, "step": 2038 }, { "epoch": 0.690440413959493, "grad_norm": 0.70703125, "learning_rate": 4.813729818731391e-06, "loss": 0.0806, "step": 2039 }, { "epoch": 0.6907790311316163, "grad_norm": 0.55859375, "learning_rate": 4.804160499645667e-06, "loss": 0.0692, "step": 2040 }, { "epoch": 0.6911176483037396, "grad_norm": 0.546875, "learning_rate": 4.794597693545179e-06, "loss": 0.0532, "step": 2041 }, { "epoch": 0.6914562654758629, "grad_norm": 0.435546875, "learning_rate": 4.785041412416906e-06, "loss": 0.0625, "step": 2042 }, { "epoch": 0.6917948826479863, "grad_norm": 0.49609375, "learning_rate": 4.7754916682396545e-06, "loss": 0.0646, "step": 2043 }, { "epoch": 0.6921334998201096, "grad_norm": 0.38671875, "learning_rate": 4.76594847298402e-06, "loss": 0.0437, "step": 2044 }, { "epoch": 0.692472116992233, "grad_norm": 0.494140625, "learning_rate": 4.756411838612402e-06, "loss": 0.0618, "step": 2045 }, { "epoch": 0.6928107341643563, "grad_norm": 0.515625, "learning_rate": 4.746881777078979e-06, "loss": 0.0675, "step": 2046 }, { "epoch": 0.6931493513364797, "grad_norm": 0.63671875, "learning_rate": 4.737358300329673e-06, "loss": 0.0711, "step": 2047 }, { "epoch": 0.693487968508603, "grad_norm": 0.443359375, "learning_rate": 4.727841420302172e-06, "loss": 0.0549, "step": 2048 }, { "epoch": 0.6938265856807263, "grad_norm": 0.412109375, "learning_rate": 4.7183311489258774e-06, "loss": 0.0567, "step": 2049 }, { "epoch": 0.6941652028528497, "grad_norm": 0.482421875, "learning_rate": 4.70882749812192e-06, "loss": 0.058, "step": 2050 }, { "epoch": 0.694503820024973, "grad_norm": 0.48046875, "learning_rate": 4.699330479803131e-06, "loss": 0.0677, "step": 2051 }, { "epoch": 0.6948424371970964, "grad_norm": 0.388671875, "learning_rate": 4.68984010587402e-06, "loss": 0.0471, "step": 2052 }, { "epoch": 0.6951810543692197, "grad_norm": 0.47265625, "learning_rate": 4.6803563882307655e-06, "loss": 0.06, "step": 2053 }, { "epoch": 0.6955196715413431, "grad_norm": 0.482421875, "learning_rate": 4.670879338761218e-06, "loss": 0.0604, "step": 2054 }, { "epoch": 0.6958582887134663, "grad_norm": 0.51171875, "learning_rate": 4.6614089693448515e-06, "loss": 0.0571, "step": 2055 }, { "epoch": 0.6961969058855897, "grad_norm": 0.50390625, "learning_rate": 4.651945291852779e-06, "loss": 0.0746, "step": 2056 }, { "epoch": 0.696535523057713, "grad_norm": 0.60546875, "learning_rate": 4.642488318147723e-06, "loss": 0.0734, "step": 2057 }, { "epoch": 0.6968741402298364, "grad_norm": 0.384765625, "learning_rate": 4.633038060083996e-06, "loss": 0.0513, "step": 2058 }, { "epoch": 0.6972127574019598, "grad_norm": 0.5859375, "learning_rate": 4.623594529507503e-06, "loss": 0.0631, "step": 2059 }, { "epoch": 0.6975513745740831, "grad_norm": 0.5859375, "learning_rate": 4.6141577382557044e-06, "loss": 0.0805, "step": 2060 }, { "epoch": 0.6978899917462065, "grad_norm": 0.55859375, "learning_rate": 4.604727698157621e-06, "loss": 0.0771, "step": 2061 }, { "epoch": 0.6982286089183297, "grad_norm": 0.46484375, "learning_rate": 4.5953044210338116e-06, "loss": 0.0578, "step": 2062 }, { "epoch": 0.6985672260904531, "grad_norm": 0.474609375, "learning_rate": 4.58588791869635e-06, "loss": 0.0691, "step": 2063 }, { "epoch": 0.6989058432625764, "grad_norm": 0.439453125, "learning_rate": 4.576478202948826e-06, "loss": 0.0568, "step": 2064 }, { "epoch": 0.6992444604346998, "grad_norm": 0.427734375, "learning_rate": 4.567075285586321e-06, "loss": 0.0491, "step": 2065 }, { "epoch": 0.6995830776068231, "grad_norm": 0.88671875, "learning_rate": 4.557679178395387e-06, "loss": 0.0596, "step": 2066 }, { "epoch": 0.6999216947789465, "grad_norm": 0.458984375, "learning_rate": 4.5482898931540505e-06, "loss": 0.0626, "step": 2067 }, { "epoch": 0.7002603119510699, "grad_norm": 0.52734375, "learning_rate": 4.538907441631776e-06, "loss": 0.0592, "step": 2068 }, { "epoch": 0.7005989291231932, "grad_norm": 0.43359375, "learning_rate": 4.5295318355894705e-06, "loss": 0.0555, "step": 2069 }, { "epoch": 0.7009375462953165, "grad_norm": 0.4296875, "learning_rate": 4.52016308677946e-06, "loss": 0.0569, "step": 2070 }, { "epoch": 0.7012761634674398, "grad_norm": 0.55859375, "learning_rate": 4.5108012069454645e-06, "loss": 0.066, "step": 2071 }, { "epoch": 0.7016147806395632, "grad_norm": 0.46484375, "learning_rate": 4.5014462078226064e-06, "loss": 0.0519, "step": 2072 }, { "epoch": 0.7016147806395632, "eval_loss": 0.06592338532209396, "eval_runtime": 815.5749, "eval_samples_per_second": 12.198, "eval_steps_per_second": 3.049, "step": 2072 }, { "epoch": 0.7019533978116865, "grad_norm": 0.41796875, "learning_rate": 4.492098101137382e-06, "loss": 0.0626, "step": 2073 }, { "epoch": 0.7022920149838099, "grad_norm": 0.52734375, "learning_rate": 4.482756898607633e-06, "loss": 0.072, "step": 2074 }, { "epoch": 0.7026306321559332, "grad_norm": 0.44140625, "learning_rate": 4.4734226119425615e-06, "loss": 0.0637, "step": 2075 }, { "epoch": 0.7029692493280566, "grad_norm": 0.42578125, "learning_rate": 4.464095252842703e-06, "loss": 0.0534, "step": 2076 }, { "epoch": 0.7033078665001798, "grad_norm": 0.37890625, "learning_rate": 4.454774832999893e-06, "loss": 0.0472, "step": 2077 }, { "epoch": 0.7036464836723032, "grad_norm": 0.640625, "learning_rate": 4.445461364097288e-06, "loss": 0.0701, "step": 2078 }, { "epoch": 0.7039851008444266, "grad_norm": 0.78125, "learning_rate": 4.436154857809314e-06, "loss": 0.0544, "step": 2079 }, { "epoch": 0.7043237180165499, "grad_norm": 0.6171875, "learning_rate": 4.42685532580168e-06, "loss": 0.0479, "step": 2080 }, { "epoch": 0.7046623351886733, "grad_norm": 0.60546875, "learning_rate": 4.417562779731355e-06, "loss": 0.0743, "step": 2081 }, { "epoch": 0.7050009523607966, "grad_norm": 0.41796875, "learning_rate": 4.408277231246539e-06, "loss": 0.0463, "step": 2082 }, { "epoch": 0.70533956953292, "grad_norm": 0.439453125, "learning_rate": 4.3989986919866716e-06, "loss": 0.0552, "step": 2083 }, { "epoch": 0.7056781867050432, "grad_norm": 0.478515625, "learning_rate": 4.3897271735824045e-06, "loss": 0.0654, "step": 2084 }, { "epoch": 0.7060168038771666, "grad_norm": 0.404296875, "learning_rate": 4.380462687655581e-06, "loss": 0.053, "step": 2085 }, { "epoch": 0.7063554210492899, "grad_norm": 0.474609375, "learning_rate": 4.371205245819241e-06, "loss": 0.0636, "step": 2086 }, { "epoch": 0.7066940382214133, "grad_norm": 0.46875, "learning_rate": 4.361954859677584e-06, "loss": 0.0645, "step": 2087 }, { "epoch": 0.7070326553935367, "grad_norm": 0.609375, "learning_rate": 4.35271154082597e-06, "loss": 0.0726, "step": 2088 }, { "epoch": 0.70737127256566, "grad_norm": 0.578125, "learning_rate": 4.343475300850907e-06, "loss": 0.0656, "step": 2089 }, { "epoch": 0.7077098897377834, "grad_norm": 0.55859375, "learning_rate": 4.334246151330012e-06, "loss": 0.0644, "step": 2090 }, { "epoch": 0.7080485069099066, "grad_norm": 0.515625, "learning_rate": 4.32502410383203e-06, "loss": 0.0722, "step": 2091 }, { "epoch": 0.70838712408203, "grad_norm": 0.4375, "learning_rate": 4.315809169916802e-06, "loss": 0.0505, "step": 2092 }, { "epoch": 0.7087257412541533, "grad_norm": 0.4140625, "learning_rate": 4.306601361135241e-06, "loss": 0.0484, "step": 2093 }, { "epoch": 0.7090643584262767, "grad_norm": 0.42578125, "learning_rate": 4.297400689029344e-06, "loss": 0.0606, "step": 2094 }, { "epoch": 0.7094029755984, "grad_norm": 0.6484375, "learning_rate": 4.2882071651321485e-06, "loss": 0.0702, "step": 2095 }, { "epoch": 0.7097415927705234, "grad_norm": 0.43359375, "learning_rate": 4.279020800967736e-06, "loss": 0.0488, "step": 2096 }, { "epoch": 0.7100802099426468, "grad_norm": 0.4140625, "learning_rate": 4.2698416080512204e-06, "loss": 0.0486, "step": 2097 }, { "epoch": 0.7104188271147701, "grad_norm": 0.462890625, "learning_rate": 4.260669597888715e-06, "loss": 0.0501, "step": 2098 }, { "epoch": 0.7107574442868934, "grad_norm": 0.62890625, "learning_rate": 4.251504781977337e-06, "loss": 0.0779, "step": 2099 }, { "epoch": 0.7110960614590167, "grad_norm": 0.4453125, "learning_rate": 4.24234717180519e-06, "loss": 0.0576, "step": 2100 }, { "epoch": 0.7114346786311401, "grad_norm": 0.49609375, "learning_rate": 4.2331967788513295e-06, "loss": 0.0669, "step": 2101 }, { "epoch": 0.7117732958032634, "grad_norm": 0.451171875, "learning_rate": 4.224053614585779e-06, "loss": 0.0635, "step": 2102 }, { "epoch": 0.7121119129753868, "grad_norm": 0.482421875, "learning_rate": 4.214917690469499e-06, "loss": 0.0612, "step": 2103 }, { "epoch": 0.7124505301475101, "grad_norm": 0.55078125, "learning_rate": 4.205789017954364e-06, "loss": 0.0592, "step": 2104 }, { "epoch": 0.7127891473196335, "grad_norm": 0.52734375, "learning_rate": 4.1966676084831715e-06, "loss": 0.065, "step": 2105 }, { "epoch": 0.7131277644917567, "grad_norm": 0.380859375, "learning_rate": 4.187553473489604e-06, "loss": 0.0527, "step": 2106 }, { "epoch": 0.7134663816638801, "grad_norm": 0.4296875, "learning_rate": 4.178446624398233e-06, "loss": 0.0521, "step": 2107 }, { "epoch": 0.7138049988360035, "grad_norm": 0.486328125, "learning_rate": 4.169347072624497e-06, "loss": 0.071, "step": 2108 }, { "epoch": 0.7141436160081268, "grad_norm": 0.392578125, "learning_rate": 4.160254829574679e-06, "loss": 0.0548, "step": 2109 }, { "epoch": 0.7144822331802502, "grad_norm": 0.423828125, "learning_rate": 4.15116990664591e-06, "loss": 0.0579, "step": 2110 }, { "epoch": 0.7148208503523735, "grad_norm": 0.400390625, "learning_rate": 4.142092315226146e-06, "loss": 0.0456, "step": 2111 }, { "epoch": 0.7151594675244969, "grad_norm": 0.62890625, "learning_rate": 4.13302206669414e-06, "loss": 0.0613, "step": 2112 }, { "epoch": 0.7154980846966201, "grad_norm": 0.46875, "learning_rate": 4.123959172419456e-06, "loss": 0.0577, "step": 2113 }, { "epoch": 0.7158367018687435, "grad_norm": 0.421875, "learning_rate": 4.114903643762428e-06, "loss": 0.0641, "step": 2114 }, { "epoch": 0.7161753190408668, "grad_norm": 0.64453125, "learning_rate": 4.1058554920741635e-06, "loss": 0.079, "step": 2115 }, { "epoch": 0.7165139362129902, "grad_norm": 0.42578125, "learning_rate": 4.096814728696529e-06, "loss": 0.0563, "step": 2116 }, { "epoch": 0.7168525533851136, "grad_norm": 0.4921875, "learning_rate": 4.087781364962108e-06, "loss": 0.0606, "step": 2117 }, { "epoch": 0.7171911705572369, "grad_norm": 0.5078125, "learning_rate": 4.078755412194228e-06, "loss": 0.0593, "step": 2118 }, { "epoch": 0.7175297877293603, "grad_norm": 0.51171875, "learning_rate": 4.069736881706929e-06, "loss": 0.0645, "step": 2119 }, { "epoch": 0.7178684049014835, "grad_norm": 0.455078125, "learning_rate": 4.06072578480493e-06, "loss": 0.048, "step": 2120 }, { "epoch": 0.7182070220736069, "grad_norm": 0.46875, "learning_rate": 4.051722132783644e-06, "loss": 0.0683, "step": 2121 }, { "epoch": 0.7185456392457302, "grad_norm": 0.39453125, "learning_rate": 4.042725936929157e-06, "loss": 0.0465, "step": 2122 }, { "epoch": 0.7188842564178536, "grad_norm": 0.50390625, "learning_rate": 4.0337372085181905e-06, "loss": 0.0717, "step": 2123 }, { "epoch": 0.7192228735899769, "grad_norm": 0.55859375, "learning_rate": 4.024755958818125e-06, "loss": 0.0725, "step": 2124 }, { "epoch": 0.7195614907621003, "grad_norm": 0.478515625, "learning_rate": 4.0157821990869505e-06, "loss": 0.0528, "step": 2125 }, { "epoch": 0.7199001079342237, "grad_norm": 0.498046875, "learning_rate": 4.006815940573279e-06, "loss": 0.0793, "step": 2126 }, { "epoch": 0.720238725106347, "grad_norm": 0.625, "learning_rate": 3.997857194516319e-06, "loss": 0.0728, "step": 2127 }, { "epoch": 0.7205773422784703, "grad_norm": 0.5546875, "learning_rate": 3.988905972145854e-06, "loss": 0.0728, "step": 2128 }, { "epoch": 0.7209159594505936, "grad_norm": 0.5234375, "learning_rate": 3.979962284682245e-06, "loss": 0.0724, "step": 2129 }, { "epoch": 0.721254576622717, "grad_norm": 0.51953125, "learning_rate": 3.971026143336409e-06, "loss": 0.0748, "step": 2130 }, { "epoch": 0.7215931937948403, "grad_norm": 0.3984375, "learning_rate": 3.96209755930979e-06, "loss": 0.0569, "step": 2131 }, { "epoch": 0.7219318109669637, "grad_norm": 0.51953125, "learning_rate": 3.953176543794378e-06, "loss": 0.0673, "step": 2132 }, { "epoch": 0.722270428139087, "grad_norm": 0.458984375, "learning_rate": 3.94426310797266e-06, "loss": 0.0556, "step": 2133 }, { "epoch": 0.7226090453112104, "grad_norm": 0.447265625, "learning_rate": 3.935357263017633e-06, "loss": 0.0616, "step": 2134 }, { "epoch": 0.7229476624833336, "grad_norm": 0.51171875, "learning_rate": 3.926459020092774e-06, "loss": 0.066, "step": 2135 }, { "epoch": 0.723286279655457, "grad_norm": 1.4296875, "learning_rate": 3.917568390352029e-06, "loss": 0.0712, "step": 2136 }, { "epoch": 0.7236248968275804, "grad_norm": 0.55859375, "learning_rate": 3.908685384939807e-06, "loss": 0.0741, "step": 2137 }, { "epoch": 0.7239635139997037, "grad_norm": 0.51953125, "learning_rate": 3.899810014990953e-06, "loss": 0.0728, "step": 2138 }, { "epoch": 0.7243021311718271, "grad_norm": 0.52734375, "learning_rate": 3.890942291630739e-06, "loss": 0.0746, "step": 2139 }, { "epoch": 0.7246407483439504, "grad_norm": 0.484375, "learning_rate": 3.8820822259748645e-06, "loss": 0.0595, "step": 2140 }, { "epoch": 0.7249793655160738, "grad_norm": 0.455078125, "learning_rate": 3.873229829129423e-06, "loss": 0.053, "step": 2141 }, { "epoch": 0.725317982688197, "grad_norm": 0.41015625, "learning_rate": 3.864385112190889e-06, "loss": 0.0526, "step": 2142 }, { "epoch": 0.7256565998603204, "grad_norm": 0.6171875, "learning_rate": 3.8555480862461214e-06, "loss": 0.0773, "step": 2143 }, { "epoch": 0.7259952170324437, "grad_norm": 0.5078125, "learning_rate": 3.846718762372328e-06, "loss": 0.0595, "step": 2144 }, { "epoch": 0.7263338342045671, "grad_norm": 0.58203125, "learning_rate": 3.837897151637069e-06, "loss": 0.073, "step": 2145 }, { "epoch": 0.7266724513766905, "grad_norm": 0.4140625, "learning_rate": 3.829083265098236e-06, "loss": 0.0546, "step": 2146 }, { "epoch": 0.7270110685488138, "grad_norm": 0.4921875, "learning_rate": 3.820277113804034e-06, "loss": 0.0585, "step": 2147 }, { "epoch": 0.7273496857209372, "grad_norm": 0.67578125, "learning_rate": 3.811478708792975e-06, "loss": 0.0918, "step": 2148 }, { "epoch": 0.7276883028930604, "grad_norm": 0.43359375, "learning_rate": 3.802688061093864e-06, "loss": 0.0533, "step": 2149 }, { "epoch": 0.7280269200651838, "grad_norm": 0.466796875, "learning_rate": 3.793905181725772e-06, "loss": 0.0574, "step": 2150 }, { "epoch": 0.7283655372373071, "grad_norm": 0.640625, "learning_rate": 3.785130081698045e-06, "loss": 0.0713, "step": 2151 }, { "epoch": 0.7287041544094305, "grad_norm": 0.47265625, "learning_rate": 3.776362772010267e-06, "loss": 0.0664, "step": 2152 }, { "epoch": 0.7290427715815538, "grad_norm": 0.515625, "learning_rate": 3.767603263652263e-06, "loss": 0.0622, "step": 2153 }, { "epoch": 0.7293813887536772, "grad_norm": 0.50390625, "learning_rate": 3.7588515676040805e-06, "loss": 0.0673, "step": 2154 }, { "epoch": 0.7297200059258006, "grad_norm": 0.4765625, "learning_rate": 3.750107694835966e-06, "loss": 0.0663, "step": 2155 }, { "epoch": 0.7300586230979239, "grad_norm": 0.54296875, "learning_rate": 3.7413716563083704e-06, "loss": 0.0625, "step": 2156 }, { "epoch": 0.7303972402700472, "grad_norm": 0.42578125, "learning_rate": 3.7326434629719122e-06, "loss": 0.0558, "step": 2157 }, { "epoch": 0.7307358574421705, "grad_norm": 0.546875, "learning_rate": 3.723923125767389e-06, "loss": 0.0678, "step": 2158 }, { "epoch": 0.7310744746142939, "grad_norm": 0.447265625, "learning_rate": 3.715210655625738e-06, "loss": 0.0477, "step": 2159 }, { "epoch": 0.7314130917864172, "grad_norm": 0.474609375, "learning_rate": 3.7065060634680485e-06, "loss": 0.0604, "step": 2160 }, { "epoch": 0.7317517089585406, "grad_norm": 0.67578125, "learning_rate": 3.6978093602055186e-06, "loss": 0.0876, "step": 2161 }, { "epoch": 0.7320903261306639, "grad_norm": 0.54296875, "learning_rate": 3.689120556739475e-06, "loss": 0.073, "step": 2162 }, { "epoch": 0.7324289433027873, "grad_norm": 0.455078125, "learning_rate": 3.6804396639613273e-06, "loss": 0.0456, "step": 2163 }, { "epoch": 0.7327675604749105, "grad_norm": 0.734375, "learning_rate": 3.6717666927525765e-06, "loss": 0.1512, "step": 2164 }, { "epoch": 0.7331061776470339, "grad_norm": 0.51953125, "learning_rate": 3.6631016539847987e-06, "loss": 0.0597, "step": 2165 }, { "epoch": 0.7334447948191573, "grad_norm": 0.5, "learning_rate": 3.654444558519612e-06, "loss": 0.059, "step": 2166 }, { "epoch": 0.7337834119912806, "grad_norm": 0.546875, "learning_rate": 3.6457954172086895e-06, "loss": 0.0734, "step": 2167 }, { "epoch": 0.734122029163404, "grad_norm": 0.53125, "learning_rate": 3.6371542408937355e-06, "loss": 0.0575, "step": 2168 }, { "epoch": 0.7344606463355273, "grad_norm": 0.4140625, "learning_rate": 3.6285210404064587e-06, "loss": 0.0573, "step": 2169 }, { "epoch": 0.7347992635076507, "grad_norm": 0.431640625, "learning_rate": 3.619895826568581e-06, "loss": 0.0489, "step": 2170 }, { "epoch": 0.7351378806797739, "grad_norm": 0.48828125, "learning_rate": 3.611278610191804e-06, "loss": 0.0538, "step": 2171 }, { "epoch": 0.7354764978518973, "grad_norm": 0.55859375, "learning_rate": 3.602669402077811e-06, "loss": 0.0678, "step": 2172 }, { "epoch": 0.7358151150240206, "grad_norm": 0.54296875, "learning_rate": 3.594068213018249e-06, "loss": 0.052, "step": 2173 }, { "epoch": 0.736153732196144, "grad_norm": 0.43359375, "learning_rate": 3.5854750537947035e-06, "loss": 0.0622, "step": 2174 }, { "epoch": 0.7364923493682674, "grad_norm": 0.6953125, "learning_rate": 3.5768899351787066e-06, "loss": 0.0634, "step": 2175 }, { "epoch": 0.7368309665403907, "grad_norm": 0.59765625, "learning_rate": 3.568312867931697e-06, "loss": 0.1019, "step": 2176 }, { "epoch": 0.7371695837125141, "grad_norm": 0.5078125, "learning_rate": 3.559743862805034e-06, "loss": 0.0662, "step": 2177 }, { "epoch": 0.7375082008846373, "grad_norm": 0.51953125, "learning_rate": 3.551182930539969e-06, "loss": 0.0743, "step": 2178 }, { "epoch": 0.7378468180567607, "grad_norm": 0.5703125, "learning_rate": 3.5426300818676264e-06, "loss": 0.072, "step": 2179 }, { "epoch": 0.738185435228884, "grad_norm": 0.466796875, "learning_rate": 3.534085327509006e-06, "loss": 0.0677, "step": 2180 }, { "epoch": 0.7385240524010074, "grad_norm": 0.4609375, "learning_rate": 3.525548678174957e-06, "loss": 0.0604, "step": 2181 }, { "epoch": 0.7388626695731307, "grad_norm": 0.5078125, "learning_rate": 3.5170201445661655e-06, "loss": 0.0628, "step": 2182 }, { "epoch": 0.7392012867452541, "grad_norm": 0.38671875, "learning_rate": 3.5084997373731546e-06, "loss": 0.0482, "step": 2183 }, { "epoch": 0.7395399039173775, "grad_norm": 0.498046875, "learning_rate": 3.4999874672762567e-06, "loss": 0.0587, "step": 2184 }, { "epoch": 0.7398785210895007, "grad_norm": 2.1875, "learning_rate": 3.4914833449455963e-06, "loss": 0.0638, "step": 2185 }, { "epoch": 0.740217138261624, "grad_norm": 0.5546875, "learning_rate": 3.482987381041096e-06, "loss": 0.0692, "step": 2186 }, { "epoch": 0.7405557554337474, "grad_norm": 0.4140625, "learning_rate": 3.4744995862124498e-06, "loss": 0.0501, "step": 2187 }, { "epoch": 0.7408943726058708, "grad_norm": 0.48046875, "learning_rate": 3.4660199710991038e-06, "loss": 0.0731, "step": 2188 }, { "epoch": 0.7412329897779941, "grad_norm": 0.62890625, "learning_rate": 3.4575485463302603e-06, "loss": 0.1051, "step": 2189 }, { "epoch": 0.7415716069501175, "grad_norm": 0.44140625, "learning_rate": 3.449085322524848e-06, "loss": 0.0553, "step": 2190 }, { "epoch": 0.7419102241222408, "grad_norm": 0.55078125, "learning_rate": 3.440630310291517e-06, "loss": 0.0543, "step": 2191 }, { "epoch": 0.7422488412943642, "grad_norm": 0.51953125, "learning_rate": 3.432183520228635e-06, "loss": 0.0701, "step": 2192 }, { "epoch": 0.7425874584664874, "grad_norm": 0.53125, "learning_rate": 3.4237449629242427e-06, "loss": 0.0757, "step": 2193 }, { "epoch": 0.7429260756386108, "grad_norm": 0.42578125, "learning_rate": 3.4153146489560807e-06, "loss": 0.0497, "step": 2194 }, { "epoch": 0.7432646928107342, "grad_norm": 0.5625, "learning_rate": 3.4068925888915417e-06, "loss": 0.0708, "step": 2195 }, { "epoch": 0.7436033099828575, "grad_norm": 0.4765625, "learning_rate": 3.398478793287682e-06, "loss": 0.0616, "step": 2196 }, { "epoch": 0.7439419271549809, "grad_norm": 0.44140625, "learning_rate": 3.390073272691198e-06, "loss": 0.0545, "step": 2197 }, { "epoch": 0.7442805443271042, "grad_norm": 0.396484375, "learning_rate": 3.381676037638404e-06, "loss": 0.0548, "step": 2198 }, { "epoch": 0.7446191614992276, "grad_norm": 0.50390625, "learning_rate": 3.3732870986552392e-06, "loss": 0.0593, "step": 2199 }, { "epoch": 0.7449577786713508, "grad_norm": 0.6953125, "learning_rate": 3.3649064662572406e-06, "loss": 0.0843, "step": 2200 }, { "epoch": 0.7452963958434742, "grad_norm": 0.578125, "learning_rate": 3.35653415094953e-06, "loss": 0.0733, "step": 2201 }, { "epoch": 0.7456350130155975, "grad_norm": 0.56640625, "learning_rate": 3.3481701632268014e-06, "loss": 0.0623, "step": 2202 }, { "epoch": 0.7459736301877209, "grad_norm": 0.439453125, "learning_rate": 3.339814513573321e-06, "loss": 0.059, "step": 2203 }, { "epoch": 0.7463122473598442, "grad_norm": 0.37890625, "learning_rate": 3.3314672124628877e-06, "loss": 0.0464, "step": 2204 }, { "epoch": 0.7466508645319676, "grad_norm": 0.55078125, "learning_rate": 3.323128270358851e-06, "loss": 0.0573, "step": 2205 }, { "epoch": 0.746989481704091, "grad_norm": 0.51171875, "learning_rate": 3.3147976977140763e-06, "loss": 0.0692, "step": 2206 }, { "epoch": 0.7473280988762142, "grad_norm": 0.423828125, "learning_rate": 3.3064755049709307e-06, "loss": 0.0537, "step": 2207 }, { "epoch": 0.7476667160483376, "grad_norm": 0.4375, "learning_rate": 3.2981617025612913e-06, "loss": 0.0586, "step": 2208 }, { "epoch": 0.7480053332204609, "grad_norm": 0.58203125, "learning_rate": 3.289856300906502e-06, "loss": 0.0716, "step": 2209 }, { "epoch": 0.7483439503925843, "grad_norm": 0.515625, "learning_rate": 3.2815593104173882e-06, "loss": 0.0656, "step": 2210 }, { "epoch": 0.7486825675647076, "grad_norm": 0.41796875, "learning_rate": 3.273270741494232e-06, "loss": 0.055, "step": 2211 }, { "epoch": 0.749021184736831, "grad_norm": 0.462890625, "learning_rate": 3.264990604526749e-06, "loss": 0.0665, "step": 2212 }, { "epoch": 0.7493598019089543, "grad_norm": 0.48046875, "learning_rate": 3.2567189098940966e-06, "loss": 0.0582, "step": 2213 }, { "epoch": 0.7496984190810776, "grad_norm": 0.5703125, "learning_rate": 3.2484556679648393e-06, "loss": 0.084, "step": 2214 }, { "epoch": 0.750037036253201, "grad_norm": 0.51171875, "learning_rate": 3.240200889096955e-06, "loss": 0.0749, "step": 2215 }, { "epoch": 0.7503756534253243, "grad_norm": 0.4921875, "learning_rate": 3.231954583637812e-06, "loss": 0.0605, "step": 2216 }, { "epoch": 0.7507142705974477, "grad_norm": 0.4609375, "learning_rate": 3.2237167619241492e-06, "loss": 0.0609, "step": 2217 }, { "epoch": 0.751052887769571, "grad_norm": 0.44921875, "learning_rate": 3.2154874342820797e-06, "loss": 0.0622, "step": 2218 }, { "epoch": 0.7513915049416944, "grad_norm": 0.3828125, "learning_rate": 3.207266611027069e-06, "loss": 0.0463, "step": 2219 }, { "epoch": 0.7517301221138177, "grad_norm": 0.5078125, "learning_rate": 3.199054302463914e-06, "loss": 0.0745, "step": 2220 }, { "epoch": 0.7520687392859411, "grad_norm": 0.453125, "learning_rate": 3.1908505188867513e-06, "loss": 0.0582, "step": 2221 }, { "epoch": 0.7524073564580643, "grad_norm": 0.5, "learning_rate": 3.1826552705790192e-06, "loss": 0.0577, "step": 2222 }, { "epoch": 0.7527459736301877, "grad_norm": 0.486328125, "learning_rate": 3.174468567813461e-06, "loss": 0.0602, "step": 2223 }, { "epoch": 0.753084590802311, "grad_norm": 0.51171875, "learning_rate": 3.166290420852114e-06, "loss": 0.0631, "step": 2224 }, { "epoch": 0.7534232079744344, "grad_norm": 1.5078125, "learning_rate": 3.1581208399462804e-06, "loss": 0.0609, "step": 2225 }, { "epoch": 0.7537618251465578, "grad_norm": 0.369140625, "learning_rate": 3.1499598353365334e-06, "loss": 0.0447, "step": 2226 }, { "epoch": 0.7541004423186811, "grad_norm": 0.47265625, "learning_rate": 3.141807417252697e-06, "loss": 0.0606, "step": 2227 }, { "epoch": 0.7544390594908045, "grad_norm": 0.396484375, "learning_rate": 3.1336635959138197e-06, "loss": 0.054, "step": 2228 }, { "epoch": 0.7547776766629277, "grad_norm": 0.453125, "learning_rate": 3.1255283815281876e-06, "loss": 0.0674, "step": 2229 }, { "epoch": 0.7551162938350511, "grad_norm": 0.466796875, "learning_rate": 3.1174017842932946e-06, "loss": 0.0645, "step": 2230 }, { "epoch": 0.7554549110071744, "grad_norm": 0.51953125, "learning_rate": 3.109283814395825e-06, "loss": 0.0789, "step": 2231 }, { "epoch": 0.7557935281792978, "grad_norm": 0.71875, "learning_rate": 3.1011744820116607e-06, "loss": 0.1046, "step": 2232 }, { "epoch": 0.7561321453514211, "grad_norm": 0.56640625, "learning_rate": 3.0930737973058443e-06, "loss": 0.076, "step": 2233 }, { "epoch": 0.7564707625235445, "grad_norm": 0.48828125, "learning_rate": 3.084981770432588e-06, "loss": 0.0705, "step": 2234 }, { "epoch": 0.7568093796956679, "grad_norm": 0.62109375, "learning_rate": 3.076898411535252e-06, "loss": 0.0654, "step": 2235 }, { "epoch": 0.7571479968677911, "grad_norm": 0.4609375, "learning_rate": 3.06882373074632e-06, "loss": 0.0583, "step": 2236 }, { "epoch": 0.7574866140399145, "grad_norm": 0.484375, "learning_rate": 3.0607577381874088e-06, "loss": 0.0562, "step": 2237 }, { "epoch": 0.7578252312120378, "grad_norm": 0.53125, "learning_rate": 3.0527004439692433e-06, "loss": 0.0584, "step": 2238 }, { "epoch": 0.7581638483841612, "grad_norm": 0.5, "learning_rate": 3.044651858191636e-06, "loss": 0.0621, "step": 2239 }, { "epoch": 0.7585024655562845, "grad_norm": 0.80078125, "learning_rate": 3.0366119909434977e-06, "loss": 0.0545, "step": 2240 }, { "epoch": 0.7588410827284079, "grad_norm": 0.470703125, "learning_rate": 3.0285808523027936e-06, "loss": 0.0627, "step": 2241 }, { "epoch": 0.7591796999005312, "grad_norm": 0.482421875, "learning_rate": 3.0205584523365626e-06, "loss": 0.0729, "step": 2242 }, { "epoch": 0.7595183170726545, "grad_norm": 0.453125, "learning_rate": 3.0125448011008894e-06, "loss": 0.0605, "step": 2243 }, { "epoch": 0.7598569342447778, "grad_norm": 0.59765625, "learning_rate": 3.004539908640872e-06, "loss": 0.0855, "step": 2244 }, { "epoch": 0.7601955514169012, "grad_norm": 0.466796875, "learning_rate": 2.996543784990653e-06, "loss": 0.0587, "step": 2245 }, { "epoch": 0.7605341685890246, "grad_norm": 0.412109375, "learning_rate": 2.9885564401733745e-06, "loss": 0.0519, "step": 2246 }, { "epoch": 0.7608727857611479, "grad_norm": 0.5, "learning_rate": 2.980577884201169e-06, "loss": 0.0668, "step": 2247 }, { "epoch": 0.7612114029332713, "grad_norm": 0.46484375, "learning_rate": 2.9726081270751594e-06, "loss": 0.0552, "step": 2248 }, { "epoch": 0.7615500201053946, "grad_norm": 0.443359375, "learning_rate": 2.9646471787854416e-06, "loss": 0.0611, "step": 2249 }, { "epoch": 0.761888637277518, "grad_norm": 0.50390625, "learning_rate": 2.956695049311057e-06, "loss": 0.0522, "step": 2250 }, { "epoch": 0.7622272544496412, "grad_norm": 0.462890625, "learning_rate": 2.948751748620007e-06, "loss": 0.0615, "step": 2251 }, { "epoch": 0.7625658716217646, "grad_norm": 0.322265625, "learning_rate": 2.940817286669214e-06, "loss": 0.0447, "step": 2252 }, { "epoch": 0.762904488793888, "grad_norm": 0.5, "learning_rate": 2.93289167340453e-06, "loss": 0.0689, "step": 2253 }, { "epoch": 0.7632431059660113, "grad_norm": 0.51171875, "learning_rate": 2.9249749187607146e-06, "loss": 0.0608, "step": 2254 }, { "epoch": 0.7635817231381347, "grad_norm": 0.62109375, "learning_rate": 2.917067032661415e-06, "loss": 0.0734, "step": 2255 }, { "epoch": 0.763920340310258, "grad_norm": 0.50390625, "learning_rate": 2.909168025019168e-06, "loss": 0.071, "step": 2256 }, { "epoch": 0.7642589574823814, "grad_norm": 0.51953125, "learning_rate": 2.901277905735386e-06, "loss": 0.0604, "step": 2257 }, { "epoch": 0.7645975746545046, "grad_norm": 0.6484375, "learning_rate": 2.893396684700326e-06, "loss": 0.0887, "step": 2258 }, { "epoch": 0.764936191826628, "grad_norm": 0.50390625, "learning_rate": 2.885524371793106e-06, "loss": 0.0768, "step": 2259 }, { "epoch": 0.7652748089987513, "grad_norm": 0.353515625, "learning_rate": 2.8776609768816655e-06, "loss": 0.0521, "step": 2260 }, { "epoch": 0.7656134261708747, "grad_norm": 0.474609375, "learning_rate": 2.8698065098227725e-06, "loss": 0.0669, "step": 2261 }, { "epoch": 0.765952043342998, "grad_norm": 0.4765625, "learning_rate": 2.8619609804620063e-06, "loss": 0.0602, "step": 2262 }, { "epoch": 0.7662906605151214, "grad_norm": 0.44140625, "learning_rate": 2.854124398633732e-06, "loss": 0.0546, "step": 2263 }, { "epoch": 0.7666292776872448, "grad_norm": 0.451171875, "learning_rate": 2.846296774161108e-06, "loss": 0.0598, "step": 2264 }, { "epoch": 0.766967894859368, "grad_norm": 0.421875, "learning_rate": 2.8384781168560693e-06, "loss": 0.06, "step": 2265 }, { "epoch": 0.7673065120314914, "grad_norm": 0.4921875, "learning_rate": 2.8306684365192915e-06, "loss": 0.0638, "step": 2266 }, { "epoch": 0.7676451292036147, "grad_norm": 0.5, "learning_rate": 2.822867742940214e-06, "loss": 0.0603, "step": 2267 }, { "epoch": 0.7679837463757381, "grad_norm": 0.431640625, "learning_rate": 2.8150760458970115e-06, "loss": 0.0627, "step": 2268 }, { "epoch": 0.7683223635478614, "grad_norm": 0.5546875, "learning_rate": 2.8072933551565706e-06, "loss": 0.0726, "step": 2269 }, { "epoch": 0.7686609807199848, "grad_norm": 0.56640625, "learning_rate": 2.7995196804745005e-06, "loss": 0.0813, "step": 2270 }, { "epoch": 0.7689995978921081, "grad_norm": 0.5703125, "learning_rate": 2.791755031595096e-06, "loss": 0.072, "step": 2271 }, { "epoch": 0.7693382150642314, "grad_norm": 0.66796875, "learning_rate": 2.7839994182513496e-06, "loss": 0.0566, "step": 2272 }, { "epoch": 0.7696768322363547, "grad_norm": 0.58203125, "learning_rate": 2.7762528501649256e-06, "loss": 0.0811, "step": 2273 }, { "epoch": 0.7700154494084781, "grad_norm": 0.46484375, "learning_rate": 2.7685153370461424e-06, "loss": 0.0523, "step": 2274 }, { "epoch": 0.7703540665806015, "grad_norm": 0.40234375, "learning_rate": 2.760786888593975e-06, "loss": 0.0529, "step": 2275 }, { "epoch": 0.7706926837527248, "grad_norm": 0.359375, "learning_rate": 2.7530675144960382e-06, "loss": 0.0425, "step": 2276 }, { "epoch": 0.7710313009248482, "grad_norm": 0.7578125, "learning_rate": 2.745357224428563e-06, "loss": 0.0685, "step": 2277 }, { "epoch": 0.7713699180969715, "grad_norm": 1.9296875, "learning_rate": 2.7376560280564025e-06, "loss": 0.0716, "step": 2278 }, { "epoch": 0.7717085352690949, "grad_norm": 0.6015625, "learning_rate": 2.729963935033002e-06, "loss": 0.0661, "step": 2279 }, { "epoch": 0.7720471524412181, "grad_norm": 0.57421875, "learning_rate": 2.722280955000404e-06, "loss": 0.0642, "step": 2280 }, { "epoch": 0.7723857696133415, "grad_norm": 0.625, "learning_rate": 2.714607097589226e-06, "loss": 0.1111, "step": 2281 }, { "epoch": 0.7727243867854648, "grad_norm": 0.59375, "learning_rate": 2.706942372418645e-06, "loss": 0.0496, "step": 2282 }, { "epoch": 0.7730630039575882, "grad_norm": 0.55859375, "learning_rate": 2.699286789096397e-06, "loss": 0.0549, "step": 2283 }, { "epoch": 0.7734016211297116, "grad_norm": 0.5, "learning_rate": 2.691640357218759e-06, "loss": 0.0606, "step": 2284 }, { "epoch": 0.7737402383018349, "grad_norm": 0.5234375, "learning_rate": 2.684003086370528e-06, "loss": 0.0644, "step": 2285 }, { "epoch": 0.7740788554739583, "grad_norm": 0.52734375, "learning_rate": 2.6763749861250297e-06, "loss": 0.0697, "step": 2286 }, { "epoch": 0.7744174726460815, "grad_norm": 0.52734375, "learning_rate": 2.6687560660440858e-06, "loss": 0.0635, "step": 2287 }, { "epoch": 0.7747560898182049, "grad_norm": 0.41796875, "learning_rate": 2.66114633567801e-06, "loss": 0.0387, "step": 2288 }, { "epoch": 0.7750947069903282, "grad_norm": 0.478515625, "learning_rate": 2.653545804565606e-06, "loss": 0.071, "step": 2289 }, { "epoch": 0.7754333241624516, "grad_norm": 0.55078125, "learning_rate": 2.645954482234133e-06, "loss": 0.0768, "step": 2290 }, { "epoch": 0.7757719413345749, "grad_norm": 0.53515625, "learning_rate": 2.6383723781993187e-06, "loss": 0.0642, "step": 2291 }, { "epoch": 0.7761105585066983, "grad_norm": 0.484375, "learning_rate": 2.630799501965333e-06, "loss": 0.0548, "step": 2292 }, { "epoch": 0.7764491756788217, "grad_norm": 0.451171875, "learning_rate": 2.6232358630247722e-06, "loss": 0.0597, "step": 2293 }, { "epoch": 0.7767877928509449, "grad_norm": 0.5078125, "learning_rate": 2.61568147085866e-06, "loss": 0.0518, "step": 2294 }, { "epoch": 0.7771264100230683, "grad_norm": 0.53515625, "learning_rate": 2.6081363349364317e-06, "loss": 0.0703, "step": 2295 }, { "epoch": 0.7774650271951916, "grad_norm": 0.443359375, "learning_rate": 2.600600464715909e-06, "loss": 0.0616, "step": 2296 }, { "epoch": 0.777803644367315, "grad_norm": 0.474609375, "learning_rate": 2.5930738696433124e-06, "loss": 0.0609, "step": 2297 }, { "epoch": 0.7781422615394383, "grad_norm": 0.423828125, "learning_rate": 2.5855565591532227e-06, "loss": 0.0559, "step": 2298 }, { "epoch": 0.7784808787115617, "grad_norm": 0.474609375, "learning_rate": 2.578048542668593e-06, "loss": 0.058, "step": 2299 }, { "epoch": 0.778819495883685, "grad_norm": 0.6015625, "learning_rate": 2.5705498296007247e-06, "loss": 0.0668, "step": 2300 }, { "epoch": 0.7791581130558083, "grad_norm": 0.48046875, "learning_rate": 2.56306042934925e-06, "loss": 0.0635, "step": 2301 }, { "epoch": 0.7794967302279316, "grad_norm": 0.6171875, "learning_rate": 2.5555803513021393e-06, "loss": 0.0563, "step": 2302 }, { "epoch": 0.779835347400055, "grad_norm": 0.58203125, "learning_rate": 2.5481096048356636e-06, "loss": 0.0667, "step": 2303 }, { "epoch": 0.7801739645721784, "grad_norm": 0.451171875, "learning_rate": 2.5406481993144084e-06, "loss": 0.0589, "step": 2304 }, { "epoch": 0.7805125817443017, "grad_norm": 0.515625, "learning_rate": 2.5331961440912476e-06, "loss": 0.0657, "step": 2305 }, { "epoch": 0.7808511989164251, "grad_norm": 0.5, "learning_rate": 2.525753448507329e-06, "loss": 0.0664, "step": 2306 }, { "epoch": 0.7811898160885484, "grad_norm": 0.462890625, "learning_rate": 2.518320121892076e-06, "loss": 0.0628, "step": 2307 }, { "epoch": 0.7815284332606718, "grad_norm": 0.5546875, "learning_rate": 2.5108961735631634e-06, "loss": 0.0737, "step": 2308 }, { "epoch": 0.781867050432795, "grad_norm": 0.67578125, "learning_rate": 2.503481612826506e-06, "loss": 0.0841, "step": 2309 }, { "epoch": 0.7822056676049184, "grad_norm": 0.5234375, "learning_rate": 2.496076448976261e-06, "loss": 0.0647, "step": 2310 }, { "epoch": 0.7825442847770417, "grad_norm": 0.50390625, "learning_rate": 2.4886806912948034e-06, "loss": 0.0586, "step": 2311 }, { "epoch": 0.7828829019491651, "grad_norm": 0.447265625, "learning_rate": 2.481294349052711e-06, "loss": 0.0621, "step": 2312 }, { "epoch": 0.7832215191212885, "grad_norm": 0.455078125, "learning_rate": 2.4739174315087678e-06, "loss": 0.0668, "step": 2313 }, { "epoch": 0.7835601362934118, "grad_norm": 0.5703125, "learning_rate": 2.466549947909942e-06, "loss": 0.0792, "step": 2314 }, { "epoch": 0.7838987534655352, "grad_norm": 0.439453125, "learning_rate": 2.4591919074913707e-06, "loss": 0.06, "step": 2315 }, { "epoch": 0.7842373706376584, "grad_norm": 0.4765625, "learning_rate": 2.4518433194763625e-06, "loss": 0.0653, "step": 2316 }, { "epoch": 0.7845759878097818, "grad_norm": 0.482421875, "learning_rate": 2.444504193076368e-06, "loss": 0.0655, "step": 2317 }, { "epoch": 0.7849146049819051, "grad_norm": 0.5, "learning_rate": 2.437174537490985e-06, "loss": 0.0754, "step": 2318 }, { "epoch": 0.7852532221540285, "grad_norm": 0.453125, "learning_rate": 2.429854361907942e-06, "loss": 0.0545, "step": 2319 }, { "epoch": 0.7855918393261518, "grad_norm": 0.68359375, "learning_rate": 2.4225436755030717e-06, "loss": 0.0695, "step": 2320 }, { "epoch": 0.7859304564982752, "grad_norm": 0.390625, "learning_rate": 2.415242487440328e-06, "loss": 0.0421, "step": 2321 }, { "epoch": 0.7862690736703986, "grad_norm": 0.53515625, "learning_rate": 2.4079508068717427e-06, "loss": 0.0666, "step": 2322 }, { "epoch": 0.7866076908425218, "grad_norm": 0.365234375, "learning_rate": 2.4006686429374437e-06, "loss": 0.0405, "step": 2323 }, { "epoch": 0.7869463080146452, "grad_norm": 0.462890625, "learning_rate": 2.3933960047656235e-06, "loss": 0.0497, "step": 2324 }, { "epoch": 0.7872849251867685, "grad_norm": 0.419921875, "learning_rate": 2.386132901472532e-06, "loss": 0.0493, "step": 2325 }, { "epoch": 0.7876235423588919, "grad_norm": 0.5390625, "learning_rate": 2.378879342162471e-06, "loss": 0.0699, "step": 2326 }, { "epoch": 0.7879621595310152, "grad_norm": 0.451171875, "learning_rate": 2.371635335927781e-06, "loss": 0.0606, "step": 2327 }, { "epoch": 0.7883007767031386, "grad_norm": 0.61328125, "learning_rate": 2.3644008918488216e-06, "loss": 0.0653, "step": 2328 }, { "epoch": 0.7886393938752619, "grad_norm": 0.53125, "learning_rate": 2.357176018993966e-06, "loss": 0.0768, "step": 2329 }, { "epoch": 0.7889780110473852, "grad_norm": 0.44140625, "learning_rate": 2.349960726419599e-06, "loss": 0.0589, "step": 2330 }, { "epoch": 0.7893166282195085, "grad_norm": 0.52734375, "learning_rate": 2.3427550231700836e-06, "loss": 0.0645, "step": 2331 }, { "epoch": 0.7896552453916319, "grad_norm": 0.443359375, "learning_rate": 2.335558918277774e-06, "loss": 0.0513, "step": 2332 }, { "epoch": 0.7899938625637553, "grad_norm": 0.490234375, "learning_rate": 2.3283724207629886e-06, "loss": 0.0674, "step": 2333 }, { "epoch": 0.7903324797358786, "grad_norm": 0.44140625, "learning_rate": 2.3211955396340003e-06, "loss": 0.0536, "step": 2334 }, { "epoch": 0.790671096908002, "grad_norm": 0.63671875, "learning_rate": 2.3140282838870332e-06, "loss": 0.074, "step": 2335 }, { "epoch": 0.7910097140801253, "grad_norm": 0.625, "learning_rate": 2.3068706625062385e-06, "loss": 0.0729, "step": 2336 }, { "epoch": 0.7913483312522487, "grad_norm": 0.439453125, "learning_rate": 2.299722684463698e-06, "loss": 0.0594, "step": 2337 }, { "epoch": 0.7916869484243719, "grad_norm": 0.40234375, "learning_rate": 2.2925843587194042e-06, "loss": 0.0524, "step": 2338 }, { "epoch": 0.7920255655964953, "grad_norm": 0.44921875, "learning_rate": 2.285455694221246e-06, "loss": 0.0622, "step": 2339 }, { "epoch": 0.7923641827686186, "grad_norm": 0.41015625, "learning_rate": 2.2783366999050074e-06, "loss": 0.0543, "step": 2340 }, { "epoch": 0.792702799940742, "grad_norm": 0.5859375, "learning_rate": 2.2712273846943457e-06, "loss": 0.0729, "step": 2341 }, { "epoch": 0.7930414171128654, "grad_norm": 0.515625, "learning_rate": 2.264127757500789e-06, "loss": 0.0587, "step": 2342 }, { "epoch": 0.7933800342849887, "grad_norm": 0.37109375, "learning_rate": 2.2570378272237237e-06, "loss": 0.046, "step": 2343 }, { "epoch": 0.7937186514571121, "grad_norm": 0.49609375, "learning_rate": 2.2499576027503723e-06, "loss": 0.0689, "step": 2344 }, { "epoch": 0.7940572686292353, "grad_norm": 0.443359375, "learning_rate": 2.2428870929558012e-06, "loss": 0.0569, "step": 2345 }, { "epoch": 0.7943958858013587, "grad_norm": 0.484375, "learning_rate": 2.2358263067028952e-06, "loss": 0.0631, "step": 2346 }, { "epoch": 0.794734502973482, "grad_norm": 0.4609375, "learning_rate": 2.228775252842347e-06, "loss": 0.064, "step": 2347 }, { "epoch": 0.7950731201456054, "grad_norm": 0.470703125, "learning_rate": 2.221733940212657e-06, "loss": 0.0628, "step": 2348 }, { "epoch": 0.7954117373177287, "grad_norm": 0.49609375, "learning_rate": 2.2147023776401077e-06, "loss": 0.0629, "step": 2349 }, { "epoch": 0.7957503544898521, "grad_norm": 0.49609375, "learning_rate": 2.2076805739387664e-06, "loss": 0.0649, "step": 2350 }, { "epoch": 0.7960889716619755, "grad_norm": 0.40625, "learning_rate": 2.200668537910461e-06, "loss": 0.054, "step": 2351 }, { "epoch": 0.7964275888340987, "grad_norm": 0.4140625, "learning_rate": 2.1936662783447836e-06, "loss": 0.0542, "step": 2352 }, { "epoch": 0.796766206006222, "grad_norm": 0.44140625, "learning_rate": 2.1866738040190638e-06, "loss": 0.0587, "step": 2353 }, { "epoch": 0.7971048231783454, "grad_norm": 1.265625, "learning_rate": 2.1796911236983708e-06, "loss": 0.0656, "step": 2354 }, { "epoch": 0.7974434403504688, "grad_norm": 0.498046875, "learning_rate": 2.172718246135492e-06, "loss": 0.0612, "step": 2355 }, { "epoch": 0.7977820575225921, "grad_norm": 0.431640625, "learning_rate": 2.165755180070932e-06, "loss": 0.0623, "step": 2356 }, { "epoch": 0.7981206746947155, "grad_norm": 0.59375, "learning_rate": 2.158801934232897e-06, "loss": 0.0772, "step": 2357 }, { "epoch": 0.7984592918668388, "grad_norm": 0.5078125, "learning_rate": 2.1518585173372774e-06, "loss": 0.0662, "step": 2358 }, { "epoch": 0.7987979090389621, "grad_norm": 0.53125, "learning_rate": 2.14492493808765e-06, "loss": 0.0785, "step": 2359 }, { "epoch": 0.7991365262110854, "grad_norm": 0.55859375, "learning_rate": 2.138001205175253e-06, "loss": 0.0744, "step": 2360 }, { "epoch": 0.7994751433832088, "grad_norm": 0.5390625, "learning_rate": 2.1310873272789878e-06, "loss": 0.0661, "step": 2361 }, { "epoch": 0.7998137605553322, "grad_norm": 0.52734375, "learning_rate": 2.1241833130654056e-06, "loss": 0.0665, "step": 2362 }, { "epoch": 0.8001523777274555, "grad_norm": 0.5625, "learning_rate": 2.117289171188681e-06, "loss": 0.0626, "step": 2363 }, { "epoch": 0.8004909948995789, "grad_norm": 0.51171875, "learning_rate": 2.1104049102906254e-06, "loss": 0.0716, "step": 2364 }, { "epoch": 0.8008296120717022, "grad_norm": 0.40234375, "learning_rate": 2.103530539000662e-06, "loss": 0.0494, "step": 2365 }, { "epoch": 0.8011682292438256, "grad_norm": 0.56640625, "learning_rate": 2.096666065935813e-06, "loss": 0.0471, "step": 2366 }, { "epoch": 0.8015068464159488, "grad_norm": 0.51953125, "learning_rate": 2.089811499700699e-06, "loss": 0.0715, "step": 2367 }, { "epoch": 0.8018454635880722, "grad_norm": 0.458984375, "learning_rate": 2.082966848887514e-06, "loss": 0.0612, "step": 2368 }, { "epoch": 0.8018454635880722, "eval_loss": 0.06570233404636383, "eval_runtime": 815.4749, "eval_samples_per_second": 12.199, "eval_steps_per_second": 3.05, "step": 2368 }, { "epoch": 0.8021840807601955, "grad_norm": 0.435546875, "learning_rate": 2.0761321220760324e-06, "loss": 0.0557, "step": 2369 }, { "epoch": 0.8025226979323189, "grad_norm": 0.40625, "learning_rate": 2.069307327833586e-06, "loss": 0.0535, "step": 2370 }, { "epoch": 0.8028613151044423, "grad_norm": 0.431640625, "learning_rate": 2.062492474715053e-06, "loss": 0.0586, "step": 2371 }, { "epoch": 0.8031999322765656, "grad_norm": 0.451171875, "learning_rate": 2.05568757126285e-06, "loss": 0.058, "step": 2372 }, { "epoch": 0.803538549448689, "grad_norm": 0.423828125, "learning_rate": 2.0488926260069284e-06, "loss": 0.0479, "step": 2373 }, { "epoch": 0.8038771666208122, "grad_norm": 0.546875, "learning_rate": 2.042107647464748e-06, "loss": 0.0671, "step": 2374 }, { "epoch": 0.8042157837929356, "grad_norm": 0.57421875, "learning_rate": 2.0353326441412835e-06, "loss": 0.0799, "step": 2375 }, { "epoch": 0.8045544009650589, "grad_norm": 0.6640625, "learning_rate": 2.0285676245290032e-06, "loss": 0.0755, "step": 2376 }, { "epoch": 0.8048930181371823, "grad_norm": 0.48046875, "learning_rate": 2.021812597107855e-06, "loss": 0.0677, "step": 2377 }, { "epoch": 0.8052316353093056, "grad_norm": 0.51953125, "learning_rate": 2.0150675703452717e-06, "loss": 0.0649, "step": 2378 }, { "epoch": 0.805570252481429, "grad_norm": 0.466796875, "learning_rate": 2.0083325526961394e-06, "loss": 0.0591, "step": 2379 }, { "epoch": 0.8059088696535524, "grad_norm": 0.474609375, "learning_rate": 2.0016075526028066e-06, "loss": 0.0642, "step": 2380 }, { "epoch": 0.8062474868256756, "grad_norm": 0.3984375, "learning_rate": 1.9948925784950625e-06, "loss": 0.0564, "step": 2381 }, { "epoch": 0.806586103997799, "grad_norm": 0.43359375, "learning_rate": 1.9881876387901243e-06, "loss": 0.0556, "step": 2382 }, { "epoch": 0.8069247211699223, "grad_norm": 0.5, "learning_rate": 1.9814927418926366e-06, "loss": 0.0643, "step": 2383 }, { "epoch": 0.8072633383420457, "grad_norm": 0.546875, "learning_rate": 1.974807896194655e-06, "loss": 0.0687, "step": 2384 }, { "epoch": 0.807601955514169, "grad_norm": 0.58984375, "learning_rate": 1.9681331100756298e-06, "loss": 0.073, "step": 2385 }, { "epoch": 0.8079405726862924, "grad_norm": 0.55859375, "learning_rate": 1.9614683919024103e-06, "loss": 0.0685, "step": 2386 }, { "epoch": 0.8082791898584157, "grad_norm": 0.4296875, "learning_rate": 1.9548137500292163e-06, "loss": 0.059, "step": 2387 }, { "epoch": 0.808617807030539, "grad_norm": 0.48046875, "learning_rate": 1.9481691927976453e-06, "loss": 0.0592, "step": 2388 }, { "epoch": 0.8089564242026623, "grad_norm": 0.546875, "learning_rate": 1.9415347285366527e-06, "loss": 0.0778, "step": 2389 }, { "epoch": 0.8092950413747857, "grad_norm": 0.671875, "learning_rate": 1.9349103655625346e-06, "loss": 0.0807, "step": 2390 }, { "epoch": 0.809633658546909, "grad_norm": 0.65625, "learning_rate": 1.9282961121789324e-06, "loss": 0.0714, "step": 2391 }, { "epoch": 0.8099722757190324, "grad_norm": 0.46484375, "learning_rate": 1.9216919766768194e-06, "loss": 0.0507, "step": 2392 }, { "epoch": 0.8103108928911558, "grad_norm": 0.59375, "learning_rate": 1.915097967334469e-06, "loss": 0.0808, "step": 2393 }, { "epoch": 0.8106495100632791, "grad_norm": 0.60546875, "learning_rate": 1.9085140924174783e-06, "loss": 0.0563, "step": 2394 }, { "epoch": 0.8109881272354025, "grad_norm": 0.48046875, "learning_rate": 1.9019403601787377e-06, "loss": 0.0615, "step": 2395 }, { "epoch": 0.8113267444075257, "grad_norm": 0.51953125, "learning_rate": 1.8953767788584155e-06, "loss": 0.0671, "step": 2396 }, { "epoch": 0.8116653615796491, "grad_norm": 0.466796875, "learning_rate": 1.8888233566839654e-06, "loss": 0.0672, "step": 2397 }, { "epoch": 0.8120039787517724, "grad_norm": 0.4765625, "learning_rate": 1.8822801018700999e-06, "loss": 0.0612, "step": 2398 }, { "epoch": 0.8123425959238958, "grad_norm": 0.6796875, "learning_rate": 1.8757470226187902e-06, "loss": 0.0745, "step": 2399 }, { "epoch": 0.8126812130960192, "grad_norm": 0.40625, "learning_rate": 1.8692241271192557e-06, "loss": 0.0505, "step": 2400 }, { "epoch": 0.8130198302681425, "grad_norm": 0.4609375, "learning_rate": 1.8627114235479393e-06, "loss": 0.0616, "step": 2401 }, { "epoch": 0.8133584474402659, "grad_norm": 0.431640625, "learning_rate": 1.8562089200685195e-06, "loss": 0.0558, "step": 2402 }, { "epoch": 0.8136970646123891, "grad_norm": 0.474609375, "learning_rate": 1.8497166248318876e-06, "loss": 0.0664, "step": 2403 }, { "epoch": 0.8140356817845125, "grad_norm": 0.57421875, "learning_rate": 1.8432345459761303e-06, "loss": 0.0728, "step": 2404 }, { "epoch": 0.8143742989566358, "grad_norm": 0.5546875, "learning_rate": 1.8367626916265401e-06, "loss": 0.0737, "step": 2405 }, { "epoch": 0.8147129161287592, "grad_norm": 0.63671875, "learning_rate": 1.8303010698955803e-06, "loss": 0.0819, "step": 2406 }, { "epoch": 0.8150515333008825, "grad_norm": 0.3828125, "learning_rate": 1.8238496888828983e-06, "loss": 0.0435, "step": 2407 }, { "epoch": 0.8153901504730059, "grad_norm": 0.609375, "learning_rate": 1.817408556675302e-06, "loss": 0.0885, "step": 2408 }, { "epoch": 0.8157287676451292, "grad_norm": 0.6015625, "learning_rate": 1.8109776813467473e-06, "loss": 0.0799, "step": 2409 }, { "epoch": 0.8160673848172525, "grad_norm": 0.4296875, "learning_rate": 1.8045570709583394e-06, "loss": 0.0674, "step": 2410 }, { "epoch": 0.8164060019893759, "grad_norm": 0.46875, "learning_rate": 1.7981467335583158e-06, "loss": 0.0574, "step": 2411 }, { "epoch": 0.8167446191614992, "grad_norm": 0.46875, "learning_rate": 1.7917466771820303e-06, "loss": 0.064, "step": 2412 }, { "epoch": 0.8170832363336226, "grad_norm": 0.55078125, "learning_rate": 1.7853569098519586e-06, "loss": 0.0681, "step": 2413 }, { "epoch": 0.8174218535057459, "grad_norm": 0.412109375, "learning_rate": 1.7789774395776716e-06, "loss": 0.053, "step": 2414 }, { "epoch": 0.8177604706778693, "grad_norm": 0.56640625, "learning_rate": 1.7726082743558349e-06, "loss": 0.0633, "step": 2415 }, { "epoch": 0.8180990878499926, "grad_norm": 0.482421875, "learning_rate": 1.766249422170202e-06, "loss": 0.0689, "step": 2416 }, { "epoch": 0.8184377050221159, "grad_norm": 0.6015625, "learning_rate": 1.7599008909915894e-06, "loss": 0.0711, "step": 2417 }, { "epoch": 0.8187763221942392, "grad_norm": 0.70703125, "learning_rate": 1.7535626887778846e-06, "loss": 0.0525, "step": 2418 }, { "epoch": 0.8191149393663626, "grad_norm": 0.4140625, "learning_rate": 1.7472348234740255e-06, "loss": 0.0505, "step": 2419 }, { "epoch": 0.819453556538486, "grad_norm": 0.53125, "learning_rate": 1.7409173030119886e-06, "loss": 0.0696, "step": 2420 }, { "epoch": 0.8197921737106093, "grad_norm": 0.5703125, "learning_rate": 1.734610135310788e-06, "loss": 0.0605, "step": 2421 }, { "epoch": 0.8201307908827327, "grad_norm": 0.5234375, "learning_rate": 1.7283133282764609e-06, "loss": 0.0601, "step": 2422 }, { "epoch": 0.820469408054856, "grad_norm": 0.76953125, "learning_rate": 1.722026889802052e-06, "loss": 0.0807, "step": 2423 }, { "epoch": 0.8208080252269794, "grad_norm": 0.458984375, "learning_rate": 1.715750827767615e-06, "loss": 0.0635, "step": 2424 }, { "epoch": 0.8211466423991026, "grad_norm": 0.49609375, "learning_rate": 1.7094851500401922e-06, "loss": 0.0611, "step": 2425 }, { "epoch": 0.821485259571226, "grad_norm": 0.5859375, "learning_rate": 1.703229864473811e-06, "loss": 0.0772, "step": 2426 }, { "epoch": 0.8218238767433493, "grad_norm": 0.408203125, "learning_rate": 1.6969849789094762e-06, "loss": 0.0508, "step": 2427 }, { "epoch": 0.8221624939154727, "grad_norm": 0.51171875, "learning_rate": 1.6907505011751468e-06, "loss": 0.0776, "step": 2428 }, { "epoch": 0.822501111087596, "grad_norm": 0.5859375, "learning_rate": 1.684526439085744e-06, "loss": 0.073, "step": 2429 }, { "epoch": 0.8228397282597194, "grad_norm": 0.51171875, "learning_rate": 1.6783128004431326e-06, "loss": 0.0656, "step": 2430 }, { "epoch": 0.8231783454318428, "grad_norm": 0.66015625, "learning_rate": 1.6721095930361042e-06, "loss": 0.0916, "step": 2431 }, { "epoch": 0.823516962603966, "grad_norm": 0.50390625, "learning_rate": 1.6659168246403855e-06, "loss": 0.0689, "step": 2432 }, { "epoch": 0.8238555797760894, "grad_norm": 0.455078125, "learning_rate": 1.6597345030186052e-06, "loss": 0.0594, "step": 2433 }, { "epoch": 0.8241941969482127, "grad_norm": 0.546875, "learning_rate": 1.6535626359203083e-06, "loss": 0.0555, "step": 2434 }, { "epoch": 0.8245328141203361, "grad_norm": 0.875, "learning_rate": 1.6474012310819354e-06, "loss": 0.0823, "step": 2435 }, { "epoch": 0.8248714312924594, "grad_norm": 0.40234375, "learning_rate": 1.6412502962267973e-06, "loss": 0.0519, "step": 2436 }, { "epoch": 0.8252100484645828, "grad_norm": 0.447265625, "learning_rate": 1.6351098390650966e-06, "loss": 0.0576, "step": 2437 }, { "epoch": 0.8255486656367061, "grad_norm": 0.54296875, "learning_rate": 1.6289798672938994e-06, "loss": 0.0627, "step": 2438 }, { "epoch": 0.8258872828088294, "grad_norm": 0.515625, "learning_rate": 1.6228603885971206e-06, "loss": 0.0665, "step": 2439 }, { "epoch": 0.8262258999809527, "grad_norm": 3.921875, "learning_rate": 1.6167514106455306e-06, "loss": 0.0593, "step": 2440 }, { "epoch": 0.8265645171530761, "grad_norm": 0.466796875, "learning_rate": 1.6106529410967354e-06, "loss": 0.0556, "step": 2441 }, { "epoch": 0.8269031343251995, "grad_norm": 0.6015625, "learning_rate": 1.604564987595162e-06, "loss": 0.059, "step": 2442 }, { "epoch": 0.8272417514973228, "grad_norm": 0.41796875, "learning_rate": 1.598487557772066e-06, "loss": 0.0548, "step": 2443 }, { "epoch": 0.8275803686694462, "grad_norm": 0.373046875, "learning_rate": 1.5924206592455016e-06, "loss": 0.0506, "step": 2444 }, { "epoch": 0.8279189858415695, "grad_norm": 0.578125, "learning_rate": 1.5863642996203288e-06, "loss": 0.0797, "step": 2445 }, { "epoch": 0.8282576030136928, "grad_norm": 0.4453125, "learning_rate": 1.580318486488197e-06, "loss": 0.0668, "step": 2446 }, { "epoch": 0.8285962201858161, "grad_norm": 0.44921875, "learning_rate": 1.5742832274275288e-06, "loss": 0.0522, "step": 2447 }, { "epoch": 0.8289348373579395, "grad_norm": 0.359375, "learning_rate": 1.5682585300035237e-06, "loss": 0.0387, "step": 2448 }, { "epoch": 0.8292734545300628, "grad_norm": 0.57421875, "learning_rate": 1.5622444017681438e-06, "loss": 0.0626, "step": 2449 }, { "epoch": 0.8296120717021862, "grad_norm": 0.44140625, "learning_rate": 1.5562408502600946e-06, "loss": 0.0529, "step": 2450 }, { "epoch": 0.8299506888743096, "grad_norm": 0.4296875, "learning_rate": 1.550247883004833e-06, "loss": 0.0554, "step": 2451 }, { "epoch": 0.8302893060464329, "grad_norm": 0.50390625, "learning_rate": 1.5442655075145375e-06, "loss": 0.059, "step": 2452 }, { "epoch": 0.8306279232185563, "grad_norm": 0.486328125, "learning_rate": 1.5382937312881208e-06, "loss": 0.0573, "step": 2453 }, { "epoch": 0.8309665403906795, "grad_norm": 0.515625, "learning_rate": 1.5323325618112072e-06, "loss": 0.0677, "step": 2454 }, { "epoch": 0.8313051575628029, "grad_norm": 0.408203125, "learning_rate": 1.5263820065561174e-06, "loss": 0.0532, "step": 2455 }, { "epoch": 0.8316437747349262, "grad_norm": 0.5546875, "learning_rate": 1.520442072981877e-06, "loss": 0.0751, "step": 2456 }, { "epoch": 0.8319823919070496, "grad_norm": 0.51953125, "learning_rate": 1.5145127685341932e-06, "loss": 0.0475, "step": 2457 }, { "epoch": 0.832321009079173, "grad_norm": 0.609375, "learning_rate": 1.5085941006454453e-06, "loss": 0.0856, "step": 2458 }, { "epoch": 0.8326596262512963, "grad_norm": 0.4140625, "learning_rate": 1.5026860767346862e-06, "loss": 0.0559, "step": 2459 }, { "epoch": 0.8329982434234197, "grad_norm": 0.4609375, "learning_rate": 1.4967887042076278e-06, "loss": 0.0658, "step": 2460 }, { "epoch": 0.8333368605955429, "grad_norm": 0.54296875, "learning_rate": 1.4909019904566223e-06, "loss": 0.0674, "step": 2461 }, { "epoch": 0.8336754777676663, "grad_norm": 0.515625, "learning_rate": 1.4850259428606707e-06, "loss": 0.0607, "step": 2462 }, { "epoch": 0.8340140949397896, "grad_norm": 0.40625, "learning_rate": 1.4791605687853927e-06, "loss": 0.0569, "step": 2463 }, { "epoch": 0.834352712111913, "grad_norm": 0.50390625, "learning_rate": 1.4733058755830399e-06, "loss": 0.0584, "step": 2464 }, { "epoch": 0.8346913292840363, "grad_norm": 0.46484375, "learning_rate": 1.4674618705924715e-06, "loss": 0.0637, "step": 2465 }, { "epoch": 0.8350299464561597, "grad_norm": 0.41015625, "learning_rate": 1.4616285611391445e-06, "loss": 0.0515, "step": 2466 }, { "epoch": 0.835368563628283, "grad_norm": 0.58203125, "learning_rate": 1.4558059545351144e-06, "loss": 0.0595, "step": 2467 }, { "epoch": 0.8357071808004063, "grad_norm": 1.4453125, "learning_rate": 1.4499940580790207e-06, "loss": 0.0883, "step": 2468 }, { "epoch": 0.8360457979725296, "grad_norm": 0.419921875, "learning_rate": 1.4441928790560733e-06, "loss": 0.059, "step": 2469 }, { "epoch": 0.836384415144653, "grad_norm": 0.69921875, "learning_rate": 1.4384024247380534e-06, "loss": 0.0896, "step": 2470 }, { "epoch": 0.8367230323167764, "grad_norm": 0.61328125, "learning_rate": 1.4326227023832928e-06, "loss": 0.0609, "step": 2471 }, { "epoch": 0.8370616494888997, "grad_norm": 0.486328125, "learning_rate": 1.426853719236676e-06, "loss": 0.0617, "step": 2472 }, { "epoch": 0.8374002666610231, "grad_norm": 0.6875, "learning_rate": 1.4210954825296253e-06, "loss": 0.0609, "step": 2473 }, { "epoch": 0.8377388838331464, "grad_norm": 0.58203125, "learning_rate": 1.4153479994800868e-06, "loss": 0.0683, "step": 2474 }, { "epoch": 0.8380775010052697, "grad_norm": 0.41796875, "learning_rate": 1.4096112772925353e-06, "loss": 0.0581, "step": 2475 }, { "epoch": 0.838416118177393, "grad_norm": 0.478515625, "learning_rate": 1.4038853231579486e-06, "loss": 0.0599, "step": 2476 }, { "epoch": 0.8387547353495164, "grad_norm": 0.6953125, "learning_rate": 1.3981701442538155e-06, "loss": 0.1391, "step": 2477 }, { "epoch": 0.8390933525216397, "grad_norm": 0.5234375, "learning_rate": 1.3924657477441072e-06, "loss": 0.0586, "step": 2478 }, { "epoch": 0.8394319696937631, "grad_norm": 0.45703125, "learning_rate": 1.38677214077929e-06, "loss": 0.0528, "step": 2479 }, { "epoch": 0.8397705868658865, "grad_norm": 0.83984375, "learning_rate": 1.381089330496297e-06, "loss": 0.0666, "step": 2480 }, { "epoch": 0.8401092040380098, "grad_norm": 0.66015625, "learning_rate": 1.3754173240185364e-06, "loss": 0.0938, "step": 2481 }, { "epoch": 0.8404478212101332, "grad_norm": 0.3984375, "learning_rate": 1.3697561284558624e-06, "loss": 0.0529, "step": 2482 }, { "epoch": 0.8407864383822564, "grad_norm": 0.451171875, "learning_rate": 1.3641057509045885e-06, "loss": 0.0661, "step": 2483 }, { "epoch": 0.8411250555543798, "grad_norm": 0.421875, "learning_rate": 1.3584661984474634e-06, "loss": 0.0582, "step": 2484 }, { "epoch": 0.8414636727265031, "grad_norm": 0.515625, "learning_rate": 1.3528374781536634e-06, "loss": 0.0646, "step": 2485 }, { "epoch": 0.8418022898986265, "grad_norm": 0.6640625, "learning_rate": 1.3472195970787927e-06, "loss": 0.0813, "step": 2486 }, { "epoch": 0.8421409070707498, "grad_norm": 0.423828125, "learning_rate": 1.3416125622648668e-06, "loss": 0.0589, "step": 2487 }, { "epoch": 0.8424795242428732, "grad_norm": 0.39453125, "learning_rate": 1.3360163807403004e-06, "loss": 0.0494, "step": 2488 }, { "epoch": 0.8428181414149966, "grad_norm": 0.6640625, "learning_rate": 1.3304310595199121e-06, "loss": 0.0604, "step": 2489 }, { "epoch": 0.8431567585871198, "grad_norm": 0.54296875, "learning_rate": 1.3248566056048972e-06, "loss": 0.0608, "step": 2490 }, { "epoch": 0.8434953757592432, "grad_norm": 0.47265625, "learning_rate": 1.3192930259828363e-06, "loss": 0.0544, "step": 2491 }, { "epoch": 0.8438339929313665, "grad_norm": 0.55859375, "learning_rate": 1.3137403276276805e-06, "loss": 0.0719, "step": 2492 }, { "epoch": 0.8441726101034899, "grad_norm": 0.41796875, "learning_rate": 1.3081985174997325e-06, "loss": 0.0548, "step": 2493 }, { "epoch": 0.8445112272756132, "grad_norm": 0.57421875, "learning_rate": 1.3026676025456553e-06, "loss": 0.0585, "step": 2494 }, { "epoch": 0.8448498444477366, "grad_norm": 0.4765625, "learning_rate": 1.2971475896984475e-06, "loss": 0.0612, "step": 2495 }, { "epoch": 0.8451884616198599, "grad_norm": 0.392578125, "learning_rate": 1.2916384858774488e-06, "loss": 0.0516, "step": 2496 }, { "epoch": 0.8455270787919832, "grad_norm": 0.48046875, "learning_rate": 1.2861402979883231e-06, "loss": 0.0614, "step": 2497 }, { "epoch": 0.8458656959641065, "grad_norm": 0.474609375, "learning_rate": 1.280653032923046e-06, "loss": 0.0543, "step": 2498 }, { "epoch": 0.8462043131362299, "grad_norm": 0.455078125, "learning_rate": 1.2751766975599033e-06, "loss": 0.0527, "step": 2499 }, { "epoch": 0.8465429303083533, "grad_norm": 0.56640625, "learning_rate": 1.2697112987634852e-06, "loss": 0.0711, "step": 2500 }, { "epoch": 0.8468815474804766, "grad_norm": 0.470703125, "learning_rate": 1.264256843384668e-06, "loss": 0.0619, "step": 2501 }, { "epoch": 0.8472201646526, "grad_norm": 0.48828125, "learning_rate": 1.2588133382606105e-06, "loss": 0.0684, "step": 2502 }, { "epoch": 0.8475587818247233, "grad_norm": 0.466796875, "learning_rate": 1.2533807902147522e-06, "loss": 0.062, "step": 2503 }, { "epoch": 0.8478973989968466, "grad_norm": 0.447265625, "learning_rate": 1.2479592060567857e-06, "loss": 0.0526, "step": 2504 }, { "epoch": 0.8482360161689699, "grad_norm": 0.486328125, "learning_rate": 1.2425485925826708e-06, "loss": 0.0605, "step": 2505 }, { "epoch": 0.8485746333410933, "grad_norm": 0.48828125, "learning_rate": 1.2371489565746141e-06, "loss": 0.0743, "step": 2506 }, { "epoch": 0.8489132505132166, "grad_norm": 0.494140625, "learning_rate": 1.231760304801054e-06, "loss": 0.0679, "step": 2507 }, { "epoch": 0.84925186768534, "grad_norm": 0.466796875, "learning_rate": 1.2263826440166725e-06, "loss": 0.0572, "step": 2508 }, { "epoch": 0.8495904848574634, "grad_norm": 0.53515625, "learning_rate": 1.2210159809623622e-06, "loss": 0.0659, "step": 2509 }, { "epoch": 0.8499291020295867, "grad_norm": 0.47265625, "learning_rate": 1.2156603223652376e-06, "loss": 0.0493, "step": 2510 }, { "epoch": 0.8502677192017101, "grad_norm": 0.458984375, "learning_rate": 1.2103156749386192e-06, "loss": 0.0545, "step": 2511 }, { "epoch": 0.8506063363738333, "grad_norm": 0.62109375, "learning_rate": 1.2049820453820194e-06, "loss": 0.0697, "step": 2512 }, { "epoch": 0.8509449535459567, "grad_norm": 0.578125, "learning_rate": 1.1996594403811478e-06, "loss": 0.0584, "step": 2513 }, { "epoch": 0.85128357071808, "grad_norm": 0.55859375, "learning_rate": 1.1943478666078856e-06, "loss": 0.0706, "step": 2514 }, { "epoch": 0.8516221878902034, "grad_norm": 0.498046875, "learning_rate": 1.1890473307202922e-06, "loss": 0.0624, "step": 2515 }, { "epoch": 0.8519608050623267, "grad_norm": 0.5234375, "learning_rate": 1.1837578393625937e-06, "loss": 0.0638, "step": 2516 }, { "epoch": 0.8522994222344501, "grad_norm": 0.5546875, "learning_rate": 1.1784793991651623e-06, "loss": 0.0716, "step": 2517 }, { "epoch": 0.8526380394065735, "grad_norm": 0.384765625, "learning_rate": 1.1732120167445248e-06, "loss": 0.0467, "step": 2518 }, { "epoch": 0.8529766565786967, "grad_norm": 0.48046875, "learning_rate": 1.1679556987033492e-06, "loss": 0.0635, "step": 2519 }, { "epoch": 0.8533152737508201, "grad_norm": 0.396484375, "learning_rate": 1.1627104516304278e-06, "loss": 0.0492, "step": 2520 }, { "epoch": 0.8536538909229434, "grad_norm": 0.443359375, "learning_rate": 1.157476282100677e-06, "loss": 0.0493, "step": 2521 }, { "epoch": 0.8539925080950668, "grad_norm": 0.515625, "learning_rate": 1.1522531966751304e-06, "loss": 0.0641, "step": 2522 }, { "epoch": 0.8543311252671901, "grad_norm": 0.62890625, "learning_rate": 1.1470412019009246e-06, "loss": 0.0554, "step": 2523 }, { "epoch": 0.8546697424393135, "grad_norm": 0.58984375, "learning_rate": 1.141840304311298e-06, "loss": 0.0642, "step": 2524 }, { "epoch": 0.8550083596114368, "grad_norm": 0.7109375, "learning_rate": 1.1366505104255732e-06, "loss": 0.0578, "step": 2525 }, { "epoch": 0.8553469767835601, "grad_norm": 0.435546875, "learning_rate": 1.1314718267491587e-06, "loss": 0.0582, "step": 2526 }, { "epoch": 0.8556855939556834, "grad_norm": 0.546875, "learning_rate": 1.1263042597735363e-06, "loss": 0.0638, "step": 2527 }, { "epoch": 0.8560242111278068, "grad_norm": 0.451171875, "learning_rate": 1.121147815976248e-06, "loss": 0.0628, "step": 2528 }, { "epoch": 0.8563628282999302, "grad_norm": 0.52734375, "learning_rate": 1.1160025018208997e-06, "loss": 0.056, "step": 2529 }, { "epoch": 0.8567014454720535, "grad_norm": 0.478515625, "learning_rate": 1.110868323757144e-06, "loss": 0.0598, "step": 2530 }, { "epoch": 0.8570400626441769, "grad_norm": 0.41015625, "learning_rate": 1.1057452882206688e-06, "loss": 0.0579, "step": 2531 }, { "epoch": 0.8573786798163002, "grad_norm": 0.52734375, "learning_rate": 1.1006334016332054e-06, "loss": 0.072, "step": 2532 }, { "epoch": 0.8577172969884235, "grad_norm": 0.4453125, "learning_rate": 1.0955326704024983e-06, "loss": 0.0652, "step": 2533 }, { "epoch": 0.8580559141605468, "grad_norm": 0.423828125, "learning_rate": 1.090443100922317e-06, "loss": 0.059, "step": 2534 }, { "epoch": 0.8583945313326702, "grad_norm": 0.453125, "learning_rate": 1.085364699572441e-06, "loss": 0.0618, "step": 2535 }, { "epoch": 0.8587331485047935, "grad_norm": 0.52734375, "learning_rate": 1.08029747271864e-06, "loss": 0.0659, "step": 2536 }, { "epoch": 0.8590717656769169, "grad_norm": 0.54296875, "learning_rate": 1.0752414267126876e-06, "loss": 0.074, "step": 2537 }, { "epoch": 0.8594103828490403, "grad_norm": 0.490234375, "learning_rate": 1.0701965678923387e-06, "loss": 0.0665, "step": 2538 }, { "epoch": 0.8597490000211636, "grad_norm": 0.482421875, "learning_rate": 1.0651629025813203e-06, "loss": 0.0722, "step": 2539 }, { "epoch": 0.860087617193287, "grad_norm": 0.412109375, "learning_rate": 1.0601404370893364e-06, "loss": 0.0595, "step": 2540 }, { "epoch": 0.8604262343654102, "grad_norm": 0.53515625, "learning_rate": 1.0551291777120465e-06, "loss": 0.0733, "step": 2541 }, { "epoch": 0.8607648515375336, "grad_norm": 0.474609375, "learning_rate": 1.0501291307310613e-06, "loss": 0.0652, "step": 2542 }, { "epoch": 0.8611034687096569, "grad_norm": 0.53515625, "learning_rate": 1.045140302413945e-06, "loss": 0.0781, "step": 2543 }, { "epoch": 0.8614420858817803, "grad_norm": 0.48046875, "learning_rate": 1.040162699014191e-06, "loss": 0.0637, "step": 2544 }, { "epoch": 0.8617807030539036, "grad_norm": 0.5390625, "learning_rate": 1.0351963267712261e-06, "loss": 0.0651, "step": 2545 }, { "epoch": 0.862119320226027, "grad_norm": 0.5078125, "learning_rate": 1.0302411919104005e-06, "loss": 0.0532, "step": 2546 }, { "epoch": 0.8624579373981504, "grad_norm": 0.58984375, "learning_rate": 1.0252973006429733e-06, "loss": 0.0686, "step": 2547 }, { "epoch": 0.8627965545702736, "grad_norm": 0.46484375, "learning_rate": 1.0203646591661142e-06, "loss": 0.059, "step": 2548 }, { "epoch": 0.863135171742397, "grad_norm": 0.5859375, "learning_rate": 1.0154432736628916e-06, "loss": 0.09, "step": 2549 }, { "epoch": 0.8634737889145203, "grad_norm": 0.447265625, "learning_rate": 1.0105331503022574e-06, "loss": 0.0472, "step": 2550 }, { "epoch": 0.8638124060866437, "grad_norm": 0.427734375, "learning_rate": 1.0056342952390574e-06, "loss": 0.0524, "step": 2551 }, { "epoch": 0.864151023258767, "grad_norm": 0.482421875, "learning_rate": 1.0007467146140026e-06, "loss": 0.061, "step": 2552 }, { "epoch": 0.8644896404308904, "grad_norm": 0.58203125, "learning_rate": 9.958704145536767e-07, "loss": 0.0603, "step": 2553 }, { "epoch": 0.8648282576030137, "grad_norm": 0.498046875, "learning_rate": 9.91005401170524e-07, "loss": 0.0645, "step": 2554 }, { "epoch": 0.865166874775137, "grad_norm": 0.71875, "learning_rate": 9.86151680562837e-07, "loss": 0.0801, "step": 2555 }, { "epoch": 0.8655054919472603, "grad_norm": 0.55078125, "learning_rate": 9.813092588147554e-07, "loss": 0.0743, "step": 2556 }, { "epoch": 0.8658441091193837, "grad_norm": 0.45703125, "learning_rate": 9.764781419962576e-07, "loss": 0.066, "step": 2557 }, { "epoch": 0.866182726291507, "grad_norm": 0.390625, "learning_rate": 9.71658336163146e-07, "loss": 0.051, "step": 2558 }, { "epoch": 0.8665213434636304, "grad_norm": 0.54296875, "learning_rate": 9.668498473570499e-07, "loss": 0.0704, "step": 2559 }, { "epoch": 0.8668599606357538, "grad_norm": 0.55859375, "learning_rate": 9.620526816054065e-07, "loss": 0.0629, "step": 2560 }, { "epoch": 0.8671985778078771, "grad_norm": 0.482421875, "learning_rate": 9.572668449214672e-07, "loss": 0.0703, "step": 2561 }, { "epoch": 0.8675371949800004, "grad_norm": 0.56640625, "learning_rate": 9.52492343304281e-07, "loss": 0.0715, "step": 2562 }, { "epoch": 0.8678758121521237, "grad_norm": 0.53515625, "learning_rate": 9.477291827386781e-07, "loss": 0.0736, "step": 2563 }, { "epoch": 0.8682144293242471, "grad_norm": 0.51171875, "learning_rate": 9.42977369195286e-07, "loss": 0.0627, "step": 2564 }, { "epoch": 0.8685530464963704, "grad_norm": 0.53515625, "learning_rate": 9.382369086305043e-07, "loss": 0.0759, "step": 2565 }, { "epoch": 0.8688916636684938, "grad_norm": 0.44140625, "learning_rate": 9.335078069864967e-07, "loss": 0.0654, "step": 2566 }, { "epoch": 0.8692302808406172, "grad_norm": 0.4375, "learning_rate": 9.287900701911945e-07, "loss": 0.0488, "step": 2567 }, { "epoch": 0.8695688980127405, "grad_norm": 0.44140625, "learning_rate": 9.240837041582839e-07, "loss": 0.0575, "step": 2568 }, { "epoch": 0.8699075151848639, "grad_norm": 0.51171875, "learning_rate": 9.193887147871905e-07, "loss": 0.0628, "step": 2569 }, { "epoch": 0.8702461323569871, "grad_norm": 0.474609375, "learning_rate": 9.147051079630886e-07, "loss": 0.0584, "step": 2570 }, { "epoch": 0.8705847495291105, "grad_norm": 0.453125, "learning_rate": 9.100328895568745e-07, "loss": 0.0583, "step": 2571 }, { "epoch": 0.8709233667012338, "grad_norm": 0.6015625, "learning_rate": 9.053720654251774e-07, "loss": 0.0696, "step": 2572 }, { "epoch": 0.8712619838733572, "grad_norm": 0.65234375, "learning_rate": 9.00722641410342e-07, "loss": 0.0825, "step": 2573 }, { "epoch": 0.8716006010454805, "grad_norm": 0.447265625, "learning_rate": 8.960846233404175e-07, "loss": 0.0647, "step": 2574 }, { "epoch": 0.8719392182176039, "grad_norm": 0.83984375, "learning_rate": 8.914580170291632e-07, "loss": 0.1008, "step": 2575 }, { "epoch": 0.8722778353897273, "grad_norm": 0.5, "learning_rate": 8.86842828276031e-07, "loss": 0.0718, "step": 2576 }, { "epoch": 0.8726164525618505, "grad_norm": 0.462890625, "learning_rate": 8.822390628661581e-07, "loss": 0.0611, "step": 2577 }, { "epoch": 0.8729550697339739, "grad_norm": 0.53515625, "learning_rate": 8.77646726570367e-07, "loss": 0.0616, "step": 2578 }, { "epoch": 0.8732936869060972, "grad_norm": 0.421875, "learning_rate": 8.730658251451485e-07, "loss": 0.0514, "step": 2579 }, { "epoch": 0.8736323040782206, "grad_norm": 0.3828125, "learning_rate": 8.68496364332665e-07, "loss": 0.0523, "step": 2580 }, { "epoch": 0.8739709212503439, "grad_norm": 0.59375, "learning_rate": 8.639383498607379e-07, "loss": 0.0746, "step": 2581 }, { "epoch": 0.8743095384224673, "grad_norm": 0.41796875, "learning_rate": 8.593917874428348e-07, "loss": 0.0556, "step": 2582 }, { "epoch": 0.8746481555945906, "grad_norm": 0.41015625, "learning_rate": 8.548566827780747e-07, "loss": 0.0481, "step": 2583 }, { "epoch": 0.8749867727667139, "grad_norm": 0.63671875, "learning_rate": 8.503330415512123e-07, "loss": 0.072, "step": 2584 }, { "epoch": 0.8753253899388372, "grad_norm": 0.431640625, "learning_rate": 8.458208694326287e-07, "loss": 0.054, "step": 2585 }, { "epoch": 0.8756640071109606, "grad_norm": 0.734375, "learning_rate": 8.413201720783337e-07, "loss": 0.1025, "step": 2586 }, { "epoch": 0.876002624283084, "grad_norm": 0.58984375, "learning_rate": 8.368309551299536e-07, "loss": 0.0694, "step": 2587 }, { "epoch": 0.8763412414552073, "grad_norm": 0.515625, "learning_rate": 8.323532242147203e-07, "loss": 0.067, "step": 2588 }, { "epoch": 0.8766798586273307, "grad_norm": 0.48828125, "learning_rate": 8.278869849454718e-07, "loss": 0.0664, "step": 2589 }, { "epoch": 0.877018475799454, "grad_norm": 0.578125, "learning_rate": 8.234322429206354e-07, "loss": 0.0697, "step": 2590 }, { "epoch": 0.8773570929715773, "grad_norm": 0.376953125, "learning_rate": 8.189890037242343e-07, "loss": 0.0443, "step": 2591 }, { "epoch": 0.8776957101437006, "grad_norm": 0.6015625, "learning_rate": 8.145572729258689e-07, "loss": 0.0524, "step": 2592 }, { "epoch": 0.878034327315824, "grad_norm": 0.380859375, "learning_rate": 8.101370560807132e-07, "loss": 0.0537, "step": 2593 }, { "epoch": 0.8783729444879473, "grad_norm": 0.439453125, "learning_rate": 8.057283587295084e-07, "loss": 0.0652, "step": 2594 }, { "epoch": 0.8787115616600707, "grad_norm": 0.447265625, "learning_rate": 8.013311863985596e-07, "loss": 0.0605, "step": 2595 }, { "epoch": 0.879050178832194, "grad_norm": 0.59375, "learning_rate": 7.969455445997198e-07, "loss": 0.0819, "step": 2596 }, { "epoch": 0.8793887960043174, "grad_norm": 0.47265625, "learning_rate": 7.92571438830394e-07, "loss": 0.0681, "step": 2597 }, { "epoch": 0.8797274131764408, "grad_norm": 0.478515625, "learning_rate": 7.882088745735217e-07, "loss": 0.0554, "step": 2598 }, { "epoch": 0.880066030348564, "grad_norm": 0.69140625, "learning_rate": 7.838578572975786e-07, "loss": 0.0827, "step": 2599 }, { "epoch": 0.8804046475206874, "grad_norm": 0.48046875, "learning_rate": 7.795183924565675e-07, "loss": 0.0565, "step": 2600 }, { "epoch": 0.8807432646928107, "grad_norm": 0.48828125, "learning_rate": 7.751904854900027e-07, "loss": 0.0599, "step": 2601 }, { "epoch": 0.8810818818649341, "grad_norm": 0.498046875, "learning_rate": 7.708741418229215e-07, "loss": 0.0602, "step": 2602 }, { "epoch": 0.8814204990370574, "grad_norm": 0.4609375, "learning_rate": 7.665693668658569e-07, "loss": 0.0624, "step": 2603 }, { "epoch": 0.8817591162091808, "grad_norm": 0.6015625, "learning_rate": 7.62276166014847e-07, "loss": 0.066, "step": 2604 }, { "epoch": 0.8820977333813041, "grad_norm": 0.412109375, "learning_rate": 7.579945446514192e-07, "loss": 0.0527, "step": 2605 }, { "epoch": 0.8824363505534274, "grad_norm": 0.474609375, "learning_rate": 7.53724508142587e-07, "loss": 0.0625, "step": 2606 }, { "epoch": 0.8827749677255508, "grad_norm": 0.46875, "learning_rate": 7.494660618408379e-07, "loss": 0.0601, "step": 2607 }, { "epoch": 0.8831135848976741, "grad_norm": 0.51171875, "learning_rate": 7.452192110841383e-07, "loss": 0.0722, "step": 2608 }, { "epoch": 0.8834522020697975, "grad_norm": 0.447265625, "learning_rate": 7.409839611959136e-07, "loss": 0.0582, "step": 2609 }, { "epoch": 0.8837908192419208, "grad_norm": 0.515625, "learning_rate": 7.367603174850502e-07, "loss": 0.0681, "step": 2610 }, { "epoch": 0.8841294364140442, "grad_norm": 0.43359375, "learning_rate": 7.325482852458887e-07, "loss": 0.0551, "step": 2611 }, { "epoch": 0.8844680535861675, "grad_norm": 0.6796875, "learning_rate": 7.283478697582091e-07, "loss": 0.0829, "step": 2612 }, { "epoch": 0.8848066707582908, "grad_norm": 0.48828125, "learning_rate": 7.241590762872319e-07, "loss": 0.062, "step": 2613 }, { "epoch": 0.8851452879304141, "grad_norm": 0.5078125, "learning_rate": 7.199819100836136e-07, "loss": 0.0635, "step": 2614 }, { "epoch": 0.8854839051025375, "grad_norm": 0.6171875, "learning_rate": 7.158163763834292e-07, "loss": 0.0689, "step": 2615 }, { "epoch": 0.8858225222746609, "grad_norm": 0.408203125, "learning_rate": 7.116624804081773e-07, "loss": 0.052, "step": 2616 }, { "epoch": 0.8861611394467842, "grad_norm": 0.4296875, "learning_rate": 7.075202273647652e-07, "loss": 0.0523, "step": 2617 }, { "epoch": 0.8864997566189076, "grad_norm": 0.5234375, "learning_rate": 7.033896224455072e-07, "loss": 0.0745, "step": 2618 }, { "epoch": 0.8868383737910309, "grad_norm": 0.416015625, "learning_rate": 6.992706708281205e-07, "loss": 0.0497, "step": 2619 }, { "epoch": 0.8871769909631542, "grad_norm": 0.427734375, "learning_rate": 6.951633776757071e-07, "loss": 0.0559, "step": 2620 }, { "epoch": 0.8875156081352775, "grad_norm": 0.470703125, "learning_rate": 6.910677481367623e-07, "loss": 0.0584, "step": 2621 }, { "epoch": 0.8878542253074009, "grad_norm": 0.392578125, "learning_rate": 6.869837873451557e-07, "loss": 0.0469, "step": 2622 }, { "epoch": 0.8881928424795242, "grad_norm": 0.5703125, "learning_rate": 6.829115004201325e-07, "loss": 0.0613, "step": 2623 }, { "epoch": 0.8885314596516476, "grad_norm": 0.484375, "learning_rate": 6.788508924663084e-07, "loss": 0.0785, "step": 2624 }, { "epoch": 0.888870076823771, "grad_norm": 0.71875, "learning_rate": 6.748019685736507e-07, "loss": 0.0639, "step": 2625 }, { "epoch": 0.8892086939958943, "grad_norm": 0.48046875, "learning_rate": 6.707647338174905e-07, "loss": 0.0623, "step": 2626 }, { "epoch": 0.8895473111680177, "grad_norm": 0.63671875, "learning_rate": 6.667391932584999e-07, "loss": 0.047, "step": 2627 }, { "epoch": 0.8898859283401409, "grad_norm": 0.609375, "learning_rate": 6.627253519426913e-07, "loss": 0.0521, "step": 2628 }, { "epoch": 0.8902245455122643, "grad_norm": 0.455078125, "learning_rate": 6.587232149014189e-07, "loss": 0.0527, "step": 2629 }, { "epoch": 0.8905631626843876, "grad_norm": 0.4296875, "learning_rate": 6.54732787151362e-07, "loss": 0.0537, "step": 2630 }, { "epoch": 0.890901779856511, "grad_norm": 0.76953125, "learning_rate": 6.507540736945195e-07, "loss": 0.0994, "step": 2631 }, { "epoch": 0.8912403970286343, "grad_norm": 0.58203125, "learning_rate": 6.467870795182108e-07, "loss": 0.0807, "step": 2632 }, { "epoch": 0.8915790142007577, "grad_norm": 0.447265625, "learning_rate": 6.428318095950648e-07, "loss": 0.0555, "step": 2633 }, { "epoch": 0.891917631372881, "grad_norm": 0.53515625, "learning_rate": 6.388882688830089e-07, "loss": 0.0607, "step": 2634 }, { "epoch": 0.8922562485450043, "grad_norm": 0.5859375, "learning_rate": 6.349564623252746e-07, "loss": 0.0594, "step": 2635 }, { "epoch": 0.8925948657171277, "grad_norm": 0.4140625, "learning_rate": 6.310363948503806e-07, "loss": 0.0569, "step": 2636 }, { "epoch": 0.892933482889251, "grad_norm": 0.484375, "learning_rate": 6.271280713721317e-07, "loss": 0.0663, "step": 2637 }, { "epoch": 0.8932721000613744, "grad_norm": 0.45703125, "learning_rate": 6.232314967896136e-07, "loss": 0.0572, "step": 2638 }, { "epoch": 0.8936107172334977, "grad_norm": 0.443359375, "learning_rate": 6.193466759871792e-07, "loss": 0.0492, "step": 2639 }, { "epoch": 0.8939493344056211, "grad_norm": 0.5546875, "learning_rate": 6.154736138344564e-07, "loss": 0.0611, "step": 2640 }, { "epoch": 0.8942879515777444, "grad_norm": 0.53125, "learning_rate": 6.11612315186324e-07, "loss": 0.0669, "step": 2641 }, { "epoch": 0.8946265687498677, "grad_norm": 0.45703125, "learning_rate": 6.077627848829238e-07, "loss": 0.0627, "step": 2642 }, { "epoch": 0.894965185921991, "grad_norm": 0.443359375, "learning_rate": 6.039250277496411e-07, "loss": 0.0535, "step": 2643 }, { "epoch": 0.8953038030941144, "grad_norm": 0.55078125, "learning_rate": 6.000990485971048e-07, "loss": 0.0703, "step": 2644 }, { "epoch": 0.8956424202662377, "grad_norm": 0.47265625, "learning_rate": 5.962848522211784e-07, "loss": 0.0602, "step": 2645 }, { "epoch": 0.8959810374383611, "grad_norm": 0.6640625, "learning_rate": 5.924824434029619e-07, "loss": 0.0655, "step": 2646 }, { "epoch": 0.8963196546104845, "grad_norm": 0.466796875, "learning_rate": 5.886918269087716e-07, "loss": 0.0624, "step": 2647 }, { "epoch": 0.8966582717826078, "grad_norm": 0.48046875, "learning_rate": 5.849130074901444e-07, "loss": 0.0661, "step": 2648 }, { "epoch": 0.8969968889547311, "grad_norm": 0.67578125, "learning_rate": 5.811459898838345e-07, "loss": 0.0686, "step": 2649 }, { "epoch": 0.8973355061268544, "grad_norm": 0.38671875, "learning_rate": 5.77390778811796e-07, "loss": 0.0509, "step": 2650 }, { "epoch": 0.8976741232989778, "grad_norm": 0.5703125, "learning_rate": 5.736473789811858e-07, "loss": 0.0724, "step": 2651 }, { "epoch": 0.8980127404711011, "grad_norm": 0.62109375, "learning_rate": 5.699157950843592e-07, "loss": 0.0741, "step": 2652 }, { "epoch": 0.8983513576432245, "grad_norm": 0.38671875, "learning_rate": 5.661960317988535e-07, "loss": 0.05, "step": 2653 }, { "epoch": 0.8986899748153478, "grad_norm": 0.54296875, "learning_rate": 5.624880937873956e-07, "loss": 0.063, "step": 2654 }, { "epoch": 0.8990285919874712, "grad_norm": 0.46875, "learning_rate": 5.587919856978819e-07, "loss": 0.053, "step": 2655 }, { "epoch": 0.8993672091595946, "grad_norm": 0.474609375, "learning_rate": 5.551077121633875e-07, "loss": 0.0588, "step": 2656 }, { "epoch": 0.8997058263317178, "grad_norm": 0.6953125, "learning_rate": 5.514352778021492e-07, "loss": 0.0755, "step": 2657 }, { "epoch": 0.9000444435038412, "grad_norm": 0.57421875, "learning_rate": 5.477746872175615e-07, "loss": 0.0687, "step": 2658 }, { "epoch": 0.9003830606759645, "grad_norm": 0.470703125, "learning_rate": 5.441259449981795e-07, "loss": 0.0619, "step": 2659 }, { "epoch": 0.9007216778480879, "grad_norm": 0.45703125, "learning_rate": 5.404890557176967e-07, "loss": 0.0589, "step": 2660 }, { "epoch": 0.9010602950202112, "grad_norm": 0.53515625, "learning_rate": 5.368640239349554e-07, "loss": 0.0579, "step": 2661 }, { "epoch": 0.9013989121923346, "grad_norm": 0.515625, "learning_rate": 5.332508541939374e-07, "loss": 0.0655, "step": 2662 }, { "epoch": 0.901737529364458, "grad_norm": 0.515625, "learning_rate": 5.296495510237453e-07, "loss": 0.0809, "step": 2663 }, { "epoch": 0.9020761465365812, "grad_norm": 0.65625, "learning_rate": 5.26060118938616e-07, "loss": 0.0621, "step": 2664 }, { "epoch": 0.9020761465365812, "eval_loss": 0.06565158814191818, "eval_runtime": 818.5534, "eval_samples_per_second": 12.153, "eval_steps_per_second": 3.038, "step": 2664 }, { "epoch": 0.9024147637087045, "grad_norm": 0.640625, "learning_rate": 5.224825624379048e-07, "loss": 0.0672, "step": 2665 }, { "epoch": 0.9027533808808279, "grad_norm": 0.51953125, "learning_rate": 5.189168860060756e-07, "loss": 0.0757, "step": 2666 }, { "epoch": 0.9030919980529513, "grad_norm": 0.52734375, "learning_rate": 5.153630941127063e-07, "loss": 0.0584, "step": 2667 }, { "epoch": 0.9034306152250746, "grad_norm": 0.84765625, "learning_rate": 5.118211912124726e-07, "loss": 0.0992, "step": 2668 }, { "epoch": 0.903769232397198, "grad_norm": 0.5390625, "learning_rate": 5.082911817451541e-07, "loss": 0.0662, "step": 2669 }, { "epoch": 0.9041078495693213, "grad_norm": 0.47265625, "learning_rate": 5.047730701356146e-07, "loss": 0.0505, "step": 2670 }, { "epoch": 0.9044464667414446, "grad_norm": 0.6875, "learning_rate": 5.012668607938087e-07, "loss": 0.0694, "step": 2671 }, { "epoch": 0.9047850839135679, "grad_norm": 0.578125, "learning_rate": 4.977725581147697e-07, "loss": 0.0784, "step": 2672 }, { "epoch": 0.9051237010856913, "grad_norm": 0.453125, "learning_rate": 4.942901664786071e-07, "loss": 0.0589, "step": 2673 }, { "epoch": 0.9054623182578146, "grad_norm": 0.482421875, "learning_rate": 4.90819690250497e-07, "loss": 0.0609, "step": 2674 }, { "epoch": 0.905800935429938, "grad_norm": 0.455078125, "learning_rate": 4.873611337806838e-07, "loss": 0.0631, "step": 2675 }, { "epoch": 0.9061395526020614, "grad_norm": 0.53125, "learning_rate": 4.839145014044688e-07, "loss": 0.0775, "step": 2676 }, { "epoch": 0.9064781697741847, "grad_norm": 0.515625, "learning_rate": 4.804797974422026e-07, "loss": 0.0666, "step": 2677 }, { "epoch": 0.906816786946308, "grad_norm": 0.486328125, "learning_rate": 4.770570261992913e-07, "loss": 0.0547, "step": 2678 }, { "epoch": 0.9071554041184313, "grad_norm": 0.408203125, "learning_rate": 4.73646191966175e-07, "loss": 0.0487, "step": 2679 }, { "epoch": 0.9074940212905547, "grad_norm": 0.48828125, "learning_rate": 4.70247299018336e-07, "loss": 0.0698, "step": 2680 }, { "epoch": 0.907832638462678, "grad_norm": 0.427734375, "learning_rate": 4.668603516162895e-07, "loss": 0.0562, "step": 2681 }, { "epoch": 0.9081712556348014, "grad_norm": 0.5078125, "learning_rate": 4.634853540055706e-07, "loss": 0.0682, "step": 2682 }, { "epoch": 0.9085098728069247, "grad_norm": 0.65234375, "learning_rate": 4.601223104167407e-07, "loss": 0.0755, "step": 2683 }, { "epoch": 0.9088484899790481, "grad_norm": 0.490234375, "learning_rate": 4.567712250653755e-07, "loss": 0.0657, "step": 2684 }, { "epoch": 0.9091871071511715, "grad_norm": 0.9375, "learning_rate": 4.5343210215206047e-07, "loss": 0.0488, "step": 2685 }, { "epoch": 0.9095257243232947, "grad_norm": 0.57421875, "learning_rate": 4.501049458623863e-07, "loss": 0.0672, "step": 2686 }, { "epoch": 0.9098643414954181, "grad_norm": 0.423828125, "learning_rate": 4.4678976036694354e-07, "loss": 0.0471, "step": 2687 }, { "epoch": 0.9102029586675414, "grad_norm": 0.447265625, "learning_rate": 4.43486549821317e-07, "loss": 0.0542, "step": 2688 }, { "epoch": 0.9105415758396648, "grad_norm": 0.57421875, "learning_rate": 4.401953183660834e-07, "loss": 0.0772, "step": 2689 }, { "epoch": 0.9108801930117881, "grad_norm": 0.52734375, "learning_rate": 4.369160701268016e-07, "loss": 0.0738, "step": 2690 }, { "epoch": 0.9112188101839115, "grad_norm": 0.86328125, "learning_rate": 4.3364880921400567e-07, "loss": 0.1494, "step": 2691 }, { "epoch": 0.9115574273560348, "grad_norm": 0.578125, "learning_rate": 4.303935397232117e-07, "loss": 0.081, "step": 2692 }, { "epoch": 0.9118960445281581, "grad_norm": 0.59765625, "learning_rate": 4.271502657348969e-07, "loss": 0.0663, "step": 2693 }, { "epoch": 0.9122346617002814, "grad_norm": 0.435546875, "learning_rate": 4.23918991314507e-07, "loss": 0.0502, "step": 2694 }, { "epoch": 0.9125732788724048, "grad_norm": 0.431640625, "learning_rate": 4.2069972051244635e-07, "loss": 0.055, "step": 2695 }, { "epoch": 0.9129118960445282, "grad_norm": 0.671875, "learning_rate": 4.174924573640682e-07, "loss": 0.0958, "step": 2696 }, { "epoch": 0.9132505132166515, "grad_norm": 0.52734375, "learning_rate": 4.14297205889681e-07, "loss": 0.0754, "step": 2697 }, { "epoch": 0.9135891303887749, "grad_norm": 0.37890625, "learning_rate": 4.111139700945277e-07, "loss": 0.052, "step": 2698 }, { "epoch": 0.9139277475608982, "grad_norm": 0.5, "learning_rate": 4.0794275396879856e-07, "loss": 0.0686, "step": 2699 }, { "epoch": 0.9142663647330215, "grad_norm": 0.53515625, "learning_rate": 4.047835614876128e-07, "loss": 0.0685, "step": 2700 }, { "epoch": 0.9146049819051448, "grad_norm": 0.59765625, "learning_rate": 4.0163639661101594e-07, "loss": 0.0812, "step": 2701 }, { "epoch": 0.9149435990772682, "grad_norm": 0.5234375, "learning_rate": 3.985012632839824e-07, "loss": 0.0745, "step": 2702 }, { "epoch": 0.9152822162493915, "grad_norm": 0.47265625, "learning_rate": 3.9537816543640085e-07, "loss": 0.0658, "step": 2703 }, { "epoch": 0.9156208334215149, "grad_norm": 0.427734375, "learning_rate": 3.9226710698307416e-07, "loss": 0.0452, "step": 2704 }, { "epoch": 0.9159594505936383, "grad_norm": 0.609375, "learning_rate": 3.891680918237151e-07, "loss": 0.0815, "step": 2705 }, { "epoch": 0.9162980677657616, "grad_norm": 0.384765625, "learning_rate": 3.8608112384293963e-07, "loss": 0.0468, "step": 2706 }, { "epoch": 0.9166366849378849, "grad_norm": 0.45703125, "learning_rate": 3.8300620691026024e-07, "loss": 0.0508, "step": 2707 }, { "epoch": 0.9169753021100082, "grad_norm": 0.54296875, "learning_rate": 3.799433448800893e-07, "loss": 0.0618, "step": 2708 }, { "epoch": 0.9173139192821316, "grad_norm": 0.5390625, "learning_rate": 3.7689254159172127e-07, "loss": 0.0641, "step": 2709 }, { "epoch": 0.9176525364542549, "grad_norm": 0.7578125, "learning_rate": 3.738538008693393e-07, "loss": 0.0743, "step": 2710 }, { "epoch": 0.9179911536263783, "grad_norm": 0.4140625, "learning_rate": 3.708271265220087e-07, "loss": 0.0496, "step": 2711 }, { "epoch": 0.9183297707985016, "grad_norm": 0.4375, "learning_rate": 3.6781252234365905e-07, "loss": 0.058, "step": 2712 }, { "epoch": 0.918668387970625, "grad_norm": 0.50390625, "learning_rate": 3.64809992113101e-07, "loss": 0.0674, "step": 2713 }, { "epoch": 0.9190070051427484, "grad_norm": 0.53125, "learning_rate": 3.618195395940083e-07, "loss": 0.0584, "step": 2714 }, { "epoch": 0.9193456223148716, "grad_norm": 0.59375, "learning_rate": 3.5884116853490915e-07, "loss": 0.0713, "step": 2715 }, { "epoch": 0.919684239486995, "grad_norm": 0.416015625, "learning_rate": 3.558748826691949e-07, "loss": 0.0544, "step": 2716 }, { "epoch": 0.9200228566591183, "grad_norm": 0.51171875, "learning_rate": 3.529206857151035e-07, "loss": 0.0735, "step": 2717 }, { "epoch": 0.9203614738312417, "grad_norm": 0.51171875, "learning_rate": 3.4997858137572174e-07, "loss": 0.0596, "step": 2718 }, { "epoch": 0.920700091003365, "grad_norm": 0.453125, "learning_rate": 3.4704857333897834e-07, "loss": 0.0601, "step": 2719 }, { "epoch": 0.9210387081754884, "grad_norm": 0.6015625, "learning_rate": 3.4413066527763774e-07, "loss": 0.0785, "step": 2720 }, { "epoch": 0.9213773253476117, "grad_norm": 0.42578125, "learning_rate": 3.412248608492974e-07, "loss": 0.0552, "step": 2721 }, { "epoch": 0.921715942519735, "grad_norm": 0.462890625, "learning_rate": 3.38331163696386e-07, "loss": 0.0571, "step": 2722 }, { "epoch": 0.9220545596918583, "grad_norm": 0.48046875, "learning_rate": 3.354495774461497e-07, "loss": 0.063, "step": 2723 }, { "epoch": 0.9223931768639817, "grad_norm": 0.5625, "learning_rate": 3.3258010571065925e-07, "loss": 0.0796, "step": 2724 }, { "epoch": 0.9227317940361051, "grad_norm": 0.61328125, "learning_rate": 3.2972275208679625e-07, "loss": 0.0554, "step": 2725 }, { "epoch": 0.9230704112082284, "grad_norm": 0.46484375, "learning_rate": 3.2687752015625574e-07, "loss": 0.0585, "step": 2726 }, { "epoch": 0.9234090283803518, "grad_norm": 0.53125, "learning_rate": 3.2404441348553475e-07, "loss": 0.0628, "step": 2727 }, { "epoch": 0.9237476455524751, "grad_norm": 0.65625, "learning_rate": 3.212234356259325e-07, "loss": 0.0557, "step": 2728 }, { "epoch": 0.9240862627245984, "grad_norm": 0.5078125, "learning_rate": 3.18414590113546e-07, "loss": 0.0617, "step": 2729 }, { "epoch": 0.9244248798967217, "grad_norm": 0.44921875, "learning_rate": 3.1561788046926335e-07, "loss": 0.0522, "step": 2730 }, { "epoch": 0.9247634970688451, "grad_norm": 0.5625, "learning_rate": 3.1283331019875905e-07, "loss": 0.0849, "step": 2731 }, { "epoch": 0.9251021142409684, "grad_norm": 0.486328125, "learning_rate": 3.100608827924934e-07, "loss": 0.063, "step": 2732 }, { "epoch": 0.9254407314130918, "grad_norm": 0.494140625, "learning_rate": 3.0730060172570407e-07, "loss": 0.0636, "step": 2733 }, { "epoch": 0.9257793485852152, "grad_norm": 0.5703125, "learning_rate": 3.045524704584024e-07, "loss": 0.0786, "step": 2734 }, { "epoch": 0.9261179657573385, "grad_norm": 0.4375, "learning_rate": 3.018164924353739e-07, "loss": 0.0595, "step": 2735 }, { "epoch": 0.9264565829294618, "grad_norm": 0.515625, "learning_rate": 2.990926710861641e-07, "loss": 0.0659, "step": 2736 }, { "epoch": 0.9267952001015851, "grad_norm": 0.6484375, "learning_rate": 2.963810098250841e-07, "loss": 0.0725, "step": 2737 }, { "epoch": 0.9271338172737085, "grad_norm": 0.5390625, "learning_rate": 2.936815120512038e-07, "loss": 0.0688, "step": 2738 }, { "epoch": 0.9274724344458318, "grad_norm": 0.546875, "learning_rate": 2.909941811483408e-07, "loss": 0.0729, "step": 2739 }, { "epoch": 0.9278110516179552, "grad_norm": 0.44921875, "learning_rate": 2.883190204850661e-07, "loss": 0.0586, "step": 2740 }, { "epoch": 0.9281496687900785, "grad_norm": 0.62109375, "learning_rate": 2.8565603341469514e-07, "loss": 0.0993, "step": 2741 }, { "epoch": 0.9284882859622019, "grad_norm": 0.5, "learning_rate": 2.8300522327528e-07, "loss": 0.0586, "step": 2742 }, { "epoch": 0.9288269031343253, "grad_norm": 0.58984375, "learning_rate": 2.803665933896127e-07, "loss": 0.0638, "step": 2743 }, { "epoch": 0.9291655203064485, "grad_norm": 0.41796875, "learning_rate": 2.7774014706521524e-07, "loss": 0.0539, "step": 2744 }, { "epoch": 0.9295041374785719, "grad_norm": 0.40625, "learning_rate": 2.7512588759433857e-07, "loss": 0.0481, "step": 2745 }, { "epoch": 0.9298427546506952, "grad_norm": 0.5625, "learning_rate": 2.7252381825395804e-07, "loss": 0.0726, "step": 2746 }, { "epoch": 0.9301813718228186, "grad_norm": 0.423828125, "learning_rate": 2.6993394230576676e-07, "loss": 0.0488, "step": 2747 }, { "epoch": 0.9305199889949419, "grad_norm": 0.3828125, "learning_rate": 2.6735626299617456e-07, "loss": 0.0516, "step": 2748 }, { "epoch": 0.9308586061670653, "grad_norm": 0.56640625, "learning_rate": 2.647907835563035e-07, "loss": 0.077, "step": 2749 }, { "epoch": 0.9311972233391886, "grad_norm": 5.59375, "learning_rate": 2.6223750720198115e-07, "loss": 0.079, "step": 2750 }, { "epoch": 0.9315358405113119, "grad_norm": 0.59765625, "learning_rate": 2.596964371337418e-07, "loss": 0.0828, "step": 2751 }, { "epoch": 0.9318744576834352, "grad_norm": 0.47265625, "learning_rate": 2.5716757653681313e-07, "loss": 0.0627, "step": 2752 }, { "epoch": 0.9322130748555586, "grad_norm": 0.478515625, "learning_rate": 2.5465092858112495e-07, "loss": 0.0546, "step": 2753 }, { "epoch": 0.932551692027682, "grad_norm": 0.453125, "learning_rate": 2.521464964212972e-07, "loss": 0.0481, "step": 2754 }, { "epoch": 0.9328903091998053, "grad_norm": 0.55078125, "learning_rate": 2.4965428319663085e-07, "loss": 0.0664, "step": 2755 }, { "epoch": 0.9332289263719287, "grad_norm": 0.58984375, "learning_rate": 2.471742920311193e-07, "loss": 0.0811, "step": 2756 }, { "epoch": 0.933567543544052, "grad_norm": 0.48828125, "learning_rate": 2.4470652603343024e-07, "loss": 0.0636, "step": 2757 }, { "epoch": 0.9339061607161753, "grad_norm": 0.5234375, "learning_rate": 2.422509882969093e-07, "loss": 0.0657, "step": 2758 }, { "epoch": 0.9342447778882986, "grad_norm": 0.439453125, "learning_rate": 2.3980768189957205e-07, "loss": 0.0632, "step": 2759 }, { "epoch": 0.934583395060422, "grad_norm": 1.0546875, "learning_rate": 2.3737660990410415e-07, "loss": 0.0615, "step": 2760 }, { "epoch": 0.9349220122325453, "grad_norm": 0.40234375, "learning_rate": 2.349577753578547e-07, "loss": 0.043, "step": 2761 }, { "epoch": 0.9352606294046687, "grad_norm": 0.55078125, "learning_rate": 2.325511812928327e-07, "loss": 0.0684, "step": 2762 }, { "epoch": 0.935599246576792, "grad_norm": 0.4296875, "learning_rate": 2.3015683072570406e-07, "loss": 0.0581, "step": 2763 }, { "epoch": 0.9359378637489154, "grad_norm": 0.5078125, "learning_rate": 2.2777472665778678e-07, "loss": 0.0654, "step": 2764 }, { "epoch": 0.9362764809210387, "grad_norm": 0.51171875, "learning_rate": 2.2540487207505012e-07, "loss": 0.0574, "step": 2765 }, { "epoch": 0.936615098093162, "grad_norm": 0.4375, "learning_rate": 2.2304726994810454e-07, "loss": 0.0502, "step": 2766 }, { "epoch": 0.9369537152652854, "grad_norm": 0.765625, "learning_rate": 2.2070192323220606e-07, "loss": 0.0884, "step": 2767 }, { "epoch": 0.9372923324374087, "grad_norm": 0.453125, "learning_rate": 2.1836883486724857e-07, "loss": 0.0496, "step": 2768 }, { "epoch": 0.9376309496095321, "grad_norm": 0.46875, "learning_rate": 2.1604800777775492e-07, "loss": 0.0553, "step": 2769 }, { "epoch": 0.9379695667816554, "grad_norm": 0.400390625, "learning_rate": 2.1373944487288577e-07, "loss": 0.0578, "step": 2770 }, { "epoch": 0.9383081839537788, "grad_norm": 0.62109375, "learning_rate": 2.1144314904642194e-07, "loss": 0.0569, "step": 2771 }, { "epoch": 0.9386468011259022, "grad_norm": 0.423828125, "learning_rate": 2.091591231767709e-07, "loss": 0.0565, "step": 2772 }, { "epoch": 0.9389854182980254, "grad_norm": 0.49609375, "learning_rate": 2.0688737012696136e-07, "loss": 0.0561, "step": 2773 }, { "epoch": 0.9393240354701488, "grad_norm": 0.51171875, "learning_rate": 2.0462789274463323e-07, "loss": 0.069, "step": 2774 }, { "epoch": 0.9396626526422721, "grad_norm": 0.55859375, "learning_rate": 2.023806938620443e-07, "loss": 0.0771, "step": 2775 }, { "epoch": 0.9400012698143955, "grad_norm": 0.44921875, "learning_rate": 2.0014577629605681e-07, "loss": 0.0607, "step": 2776 }, { "epoch": 0.9403398869865188, "grad_norm": 0.5234375, "learning_rate": 1.9792314284813984e-07, "loss": 0.0706, "step": 2777 }, { "epoch": 0.9406785041586422, "grad_norm": 0.734375, "learning_rate": 1.957127963043648e-07, "loss": 0.0555, "step": 2778 }, { "epoch": 0.9410171213307655, "grad_norm": 0.408203125, "learning_rate": 1.93514739435402e-07, "loss": 0.0445, "step": 2779 }, { "epoch": 0.9413557385028888, "grad_norm": 0.53515625, "learning_rate": 1.9132897499651636e-07, "loss": 0.0706, "step": 2780 }, { "epoch": 0.9416943556750121, "grad_norm": 0.4765625, "learning_rate": 1.8915550572756293e-07, "loss": 0.0628, "step": 2781 }, { "epoch": 0.9420329728471355, "grad_norm": 0.78125, "learning_rate": 1.8699433435298452e-07, "loss": 0.0882, "step": 2782 }, { "epoch": 0.9423715900192589, "grad_norm": 0.48828125, "learning_rate": 1.848454635818109e-07, "loss": 0.0704, "step": 2783 }, { "epoch": 0.9427102071913822, "grad_norm": 0.6171875, "learning_rate": 1.8270889610765285e-07, "loss": 0.0689, "step": 2784 }, { "epoch": 0.9430488243635056, "grad_norm": 0.671875, "learning_rate": 1.8058463460869478e-07, "loss": 0.0878, "step": 2785 }, { "epoch": 0.9433874415356289, "grad_norm": 0.5234375, "learning_rate": 1.7847268174770226e-07, "loss": 0.0701, "step": 2786 }, { "epoch": 0.9437260587077522, "grad_norm": 0.6640625, "learning_rate": 1.763730401720065e-07, "loss": 0.0942, "step": 2787 }, { "epoch": 0.9440646758798755, "grad_norm": 0.54296875, "learning_rate": 1.7428571251350779e-07, "loss": 0.0697, "step": 2788 }, { "epoch": 0.9444032930519989, "grad_norm": 0.51171875, "learning_rate": 1.7221070138867312e-07, "loss": 0.0664, "step": 2789 }, { "epoch": 0.9447419102241222, "grad_norm": 0.5, "learning_rate": 1.701480093985275e-07, "loss": 0.0477, "step": 2790 }, { "epoch": 0.9450805273962456, "grad_norm": 0.390625, "learning_rate": 1.6809763912865596e-07, "loss": 0.0492, "step": 2791 }, { "epoch": 0.945419144568369, "grad_norm": 0.515625, "learning_rate": 1.660595931491993e-07, "loss": 0.063, "step": 2792 }, { "epoch": 0.9457577617404923, "grad_norm": 0.5625, "learning_rate": 1.6403387401484506e-07, "loss": 0.076, "step": 2793 }, { "epoch": 0.9460963789126156, "grad_norm": 0.54296875, "learning_rate": 1.6202048426483652e-07, "loss": 0.067, "step": 2794 }, { "epoch": 0.9464349960847389, "grad_norm": 0.55078125, "learning_rate": 1.6001942642295487e-07, "loss": 0.0734, "step": 2795 }, { "epoch": 0.9467736132568623, "grad_norm": 0.419921875, "learning_rate": 1.580307029975281e-07, "loss": 0.0465, "step": 2796 }, { "epoch": 0.9471122304289856, "grad_norm": 0.435546875, "learning_rate": 1.5605431648141878e-07, "loss": 0.0428, "step": 2797 }, { "epoch": 0.947450847601109, "grad_norm": 0.58203125, "learning_rate": 1.5409026935203075e-07, "loss": 0.0825, "step": 2798 }, { "epoch": 0.9477894647732323, "grad_norm": 0.49609375, "learning_rate": 1.5213856407129467e-07, "loss": 0.0632, "step": 2799 }, { "epoch": 0.9481280819453557, "grad_norm": 0.33203125, "learning_rate": 1.501992030856736e-07, "loss": 0.0405, "step": 2800 }, { "epoch": 0.9484666991174789, "grad_norm": 0.64453125, "learning_rate": 1.4827218882615847e-07, "loss": 0.0767, "step": 2801 }, { "epoch": 0.9488053162896023, "grad_norm": 0.61328125, "learning_rate": 1.463575237082593e-07, "loss": 0.1085, "step": 2802 }, { "epoch": 0.9491439334617257, "grad_norm": 0.427734375, "learning_rate": 1.444552101320107e-07, "loss": 0.0514, "step": 2803 }, { "epoch": 0.949482550633849, "grad_norm": 0.5234375, "learning_rate": 1.42565250481963e-07, "loss": 0.068, "step": 2804 }, { "epoch": 0.9498211678059724, "grad_norm": 0.458984375, "learning_rate": 1.4068764712717897e-07, "loss": 0.0566, "step": 2805 }, { "epoch": 0.9501597849780957, "grad_norm": 0.4609375, "learning_rate": 1.3882240242123811e-07, "loss": 0.0567, "step": 2806 }, { "epoch": 0.9504984021502191, "grad_norm": 0.5234375, "learning_rate": 1.3696951870222018e-07, "loss": 0.0671, "step": 2807 }, { "epoch": 0.9508370193223424, "grad_norm": 0.51953125, "learning_rate": 1.3512899829271954e-07, "loss": 0.0617, "step": 2808 }, { "epoch": 0.9511756364944657, "grad_norm": 0.46875, "learning_rate": 1.3330084349982509e-07, "loss": 0.0632, "step": 2809 }, { "epoch": 0.951514253666589, "grad_norm": 1.1171875, "learning_rate": 1.3148505661513045e-07, "loss": 0.0709, "step": 2810 }, { "epoch": 0.9518528708387124, "grad_norm": 0.50390625, "learning_rate": 1.2968163991472493e-07, "loss": 0.0774, "step": 2811 }, { "epoch": 0.9521914880108358, "grad_norm": 0.392578125, "learning_rate": 1.2789059565919138e-07, "loss": 0.0542, "step": 2812 }, { "epoch": 0.9525301051829591, "grad_norm": 0.484375, "learning_rate": 1.261119260936039e-07, "loss": 0.0662, "step": 2813 }, { "epoch": 0.9528687223550825, "grad_norm": 0.455078125, "learning_rate": 1.243456334475246e-07, "loss": 0.0544, "step": 2814 }, { "epoch": 0.9532073395272058, "grad_norm": 0.47265625, "learning_rate": 1.225917199350013e-07, "loss": 0.0614, "step": 2815 }, { "epoch": 0.9535459566993291, "grad_norm": 0.50390625, "learning_rate": 1.2085018775456648e-07, "loss": 0.0535, "step": 2816 }, { "epoch": 0.9538845738714524, "grad_norm": 0.375, "learning_rate": 1.1912103908922945e-07, "loss": 0.0468, "step": 2817 }, { "epoch": 0.9542231910435758, "grad_norm": 0.6328125, "learning_rate": 1.1740427610647643e-07, "loss": 0.0739, "step": 2818 }, { "epoch": 0.9545618082156991, "grad_norm": 0.423828125, "learning_rate": 1.1569990095827378e-07, "loss": 0.0539, "step": 2819 }, { "epoch": 0.9549004253878225, "grad_norm": 0.55078125, "learning_rate": 1.1400791578105253e-07, "loss": 0.0921, "step": 2820 }, { "epoch": 0.9552390425599459, "grad_norm": 0.94140625, "learning_rate": 1.1232832269571725e-07, "loss": 0.1924, "step": 2821 }, { "epoch": 0.9555776597320692, "grad_norm": 0.703125, "learning_rate": 1.1066112380763939e-07, "loss": 0.0674, "step": 2822 }, { "epoch": 0.9559162769041925, "grad_norm": 0.5, "learning_rate": 1.0900632120665166e-07, "loss": 0.0646, "step": 2823 }, { "epoch": 0.9562548940763158, "grad_norm": 0.5625, "learning_rate": 1.073639169670504e-07, "loss": 0.0756, "step": 2824 }, { "epoch": 0.9565935112484392, "grad_norm": 0.58203125, "learning_rate": 1.0573391314758652e-07, "loss": 0.0681, "step": 2825 }, { "epoch": 0.9569321284205625, "grad_norm": 0.55859375, "learning_rate": 1.0411631179147342e-07, "loss": 0.0694, "step": 2826 }, { "epoch": 0.9572707455926859, "grad_norm": 0.55078125, "learning_rate": 1.0251111492637245e-07, "loss": 0.0779, "step": 2827 }, { "epoch": 0.9576093627648092, "grad_norm": 0.451171875, "learning_rate": 1.0091832456439854e-07, "loss": 0.0551, "step": 2828 }, { "epoch": 0.9579479799369326, "grad_norm": 0.494140625, "learning_rate": 9.933794270211461e-08, "loss": 0.0679, "step": 2829 }, { "epoch": 0.9582865971090558, "grad_norm": 0.51171875, "learning_rate": 9.776997132052935e-08, "loss": 0.0604, "step": 2830 }, { "epoch": 0.9586252142811792, "grad_norm": 0.44140625, "learning_rate": 9.621441238509611e-08, "loss": 0.0617, "step": 2831 }, { "epoch": 0.9589638314533026, "grad_norm": 0.53515625, "learning_rate": 9.467126784570623e-08, "loss": 0.0703, "step": 2832 }, { "epoch": 0.9593024486254259, "grad_norm": 0.6171875, "learning_rate": 9.314053963669245e-08, "loss": 0.0632, "step": 2833 }, { "epoch": 0.9596410657975493, "grad_norm": 0.431640625, "learning_rate": 9.162222967682322e-08, "loss": 0.0564, "step": 2834 }, { "epoch": 0.9599796829696726, "grad_norm": 0.50390625, "learning_rate": 9.011633986929947e-08, "loss": 0.0722, "step": 2835 }, { "epoch": 0.960318300141796, "grad_norm": 0.490234375, "learning_rate": 8.862287210175347e-08, "loss": 0.0665, "step": 2836 }, { "epoch": 0.9606569173139193, "grad_norm": 0.494140625, "learning_rate": 8.714182824624883e-08, "loss": 0.0717, "step": 2837 }, { "epoch": 0.9609955344860426, "grad_norm": 0.46875, "learning_rate": 8.567321015927387e-08, "loss": 0.0603, "step": 2838 }, { "epoch": 0.9613341516581659, "grad_norm": 0.5703125, "learning_rate": 8.421701968174156e-08, "loss": 0.0772, "step": 2839 }, { "epoch": 0.9616727688302893, "grad_norm": 0.5234375, "learning_rate": 8.27732586389851e-08, "loss": 0.0668, "step": 2840 }, { "epoch": 0.9620113860024126, "grad_norm": 0.52734375, "learning_rate": 8.134192884076131e-08, "loss": 0.0734, "step": 2841 }, { "epoch": 0.962350003174536, "grad_norm": 0.423828125, "learning_rate": 7.992303208123941e-08, "loss": 0.0504, "step": 2842 }, { "epoch": 0.9626886203466594, "grad_norm": 0.51171875, "learning_rate": 7.851657013901003e-08, "loss": 0.0743, "step": 2843 }, { "epoch": 0.9630272375187827, "grad_norm": 0.478515625, "learning_rate": 7.712254477707071e-08, "loss": 0.0614, "step": 2844 }, { "epoch": 0.963365854690906, "grad_norm": 0.470703125, "learning_rate": 7.574095774283363e-08, "loss": 0.0666, "step": 2845 }, { "epoch": 0.9637044718630293, "grad_norm": 0.48046875, "learning_rate": 7.437181076811794e-08, "loss": 0.06, "step": 2846 }, { "epoch": 0.9640430890351527, "grad_norm": 0.427734375, "learning_rate": 7.301510556914859e-08, "loss": 0.0535, "step": 2847 }, { "epoch": 0.964381706207276, "grad_norm": 0.70703125, "learning_rate": 7.167084384655742e-08, "loss": 0.0815, "step": 2848 }, { "epoch": 0.9647203233793994, "grad_norm": 0.439453125, "learning_rate": 7.033902728537546e-08, "loss": 0.0635, "step": 2849 }, { "epoch": 0.9650589405515227, "grad_norm": 0.443359375, "learning_rate": 6.901965755503503e-08, "loss": 0.0566, "step": 2850 }, { "epoch": 0.9653975577236461, "grad_norm": 0.51171875, "learning_rate": 6.77127363093666e-08, "loss": 0.0646, "step": 2851 }, { "epoch": 0.9657361748957694, "grad_norm": 0.41796875, "learning_rate": 6.641826518659633e-08, "loss": 0.0575, "step": 2852 }, { "epoch": 0.9660747920678927, "grad_norm": 0.43359375, "learning_rate": 6.513624580934186e-08, "loss": 0.0448, "step": 2853 }, { "epoch": 0.9664134092400161, "grad_norm": 0.462890625, "learning_rate": 6.386667978461658e-08, "loss": 0.0509, "step": 2854 }, { "epoch": 0.9667520264121394, "grad_norm": 0.55078125, "learning_rate": 6.260956870382196e-08, "loss": 0.0709, "step": 2855 }, { "epoch": 0.9670906435842628, "grad_norm": 0.55078125, "learning_rate": 6.136491414274415e-08, "loss": 0.0599, "step": 2856 }, { "epoch": 0.9674292607563861, "grad_norm": 0.478515625, "learning_rate": 6.01327176615607e-08, "loss": 0.0645, "step": 2857 }, { "epoch": 0.9677678779285095, "grad_norm": 0.494140625, "learning_rate": 5.891298080482943e-08, "loss": 0.0707, "step": 2858 }, { "epoch": 0.9681064951006327, "grad_norm": 0.71875, "learning_rate": 5.770570510148954e-08, "loss": 0.0616, "step": 2859 }, { "epoch": 0.9684451122727561, "grad_norm": 0.59375, "learning_rate": 5.65108920648616e-08, "loss": 0.087, "step": 2860 }, { "epoch": 0.9687837294448794, "grad_norm": 0.625, "learning_rate": 5.5328543192643134e-08, "loss": 0.0885, "step": 2861 }, { "epoch": 0.9691223466170028, "grad_norm": 0.50390625, "learning_rate": 5.4158659966909724e-08, "loss": 0.0677, "step": 2862 }, { "epoch": 0.9694609637891262, "grad_norm": 0.498046875, "learning_rate": 5.300124385410943e-08, "loss": 0.0629, "step": 2863 }, { "epoch": 0.9697995809612495, "grad_norm": 0.5390625, "learning_rate": 5.1856296305063945e-08, "loss": 0.0759, "step": 2864 }, { "epoch": 0.9701381981333729, "grad_norm": 0.451171875, "learning_rate": 5.072381875496524e-08, "loss": 0.065, "step": 2865 }, { "epoch": 0.9704768153054962, "grad_norm": 0.447265625, "learning_rate": 4.960381262337333e-08, "loss": 0.0499, "step": 2866 }, { "epoch": 0.9708154324776195, "grad_norm": 0.431640625, "learning_rate": 4.84962793142163e-08, "loss": 0.0571, "step": 2867 }, { "epoch": 0.9711540496497428, "grad_norm": 0.5859375, "learning_rate": 4.740122021578808e-08, "loss": 0.0695, "step": 2868 }, { "epoch": 0.9714926668218662, "grad_norm": 0.435546875, "learning_rate": 4.6318636700743994e-08, "loss": 0.0598, "step": 2869 }, { "epoch": 0.9718312839939895, "grad_norm": 0.412109375, "learning_rate": 4.5248530126102976e-08, "loss": 0.0446, "step": 2870 }, { "epoch": 0.9721699011661129, "grad_norm": 0.515625, "learning_rate": 4.419090183324315e-08, "loss": 0.0603, "step": 2871 }, { "epoch": 0.9725085183382363, "grad_norm": 0.5546875, "learning_rate": 4.314575314790292e-08, "loss": 0.074, "step": 2872 }, { "epoch": 0.9728471355103596, "grad_norm": 0.53515625, "learning_rate": 4.2113085380176556e-08, "loss": 0.0664, "step": 2873 }, { "epoch": 0.9731857526824829, "grad_norm": 0.515625, "learning_rate": 4.109289982451081e-08, "loss": 0.0668, "step": 2874 }, { "epoch": 0.9735243698546062, "grad_norm": 0.52734375, "learning_rate": 4.008519775971054e-08, "loss": 0.0689, "step": 2875 }, { "epoch": 0.9738629870267296, "grad_norm": 0.376953125, "learning_rate": 3.908998044892975e-08, "loss": 0.0483, "step": 2876 }, { "epoch": 0.9742016041988529, "grad_norm": 0.53125, "learning_rate": 3.810724913967278e-08, "loss": 0.0719, "step": 2877 }, { "epoch": 0.9745402213709763, "grad_norm": 0.4609375, "learning_rate": 3.713700506379536e-08, "loss": 0.0687, "step": 2878 }, { "epoch": 0.9748788385430996, "grad_norm": 0.58984375, "learning_rate": 3.617924943749573e-08, "loss": 0.0788, "step": 2879 }, { "epoch": 0.975217455715223, "grad_norm": 0.609375, "learning_rate": 3.5233983461322453e-08, "loss": 0.0752, "step": 2880 }, { "epoch": 0.9755560728873462, "grad_norm": 0.5234375, "learning_rate": 3.430120832016659e-08, "loss": 0.0585, "step": 2881 }, { "epoch": 0.9758946900594696, "grad_norm": 0.58203125, "learning_rate": 3.338092518326064e-08, "loss": 0.0634, "step": 2882 }, { "epoch": 0.976233307231593, "grad_norm": 0.474609375, "learning_rate": 3.2473135204180715e-08, "loss": 0.0662, "step": 2883 }, { "epoch": 0.9765719244037163, "grad_norm": 0.58203125, "learning_rate": 3.1577839520841034e-08, "loss": 0.0625, "step": 2884 }, { "epoch": 0.9769105415758397, "grad_norm": 0.5, "learning_rate": 3.0695039255494995e-08, "loss": 0.0629, "step": 2885 }, { "epoch": 0.977249158747963, "grad_norm": 0.578125, "learning_rate": 2.982473551473297e-08, "loss": 0.0783, "step": 2886 }, { "epoch": 0.9775877759200864, "grad_norm": 0.435546875, "learning_rate": 2.8966929389481202e-08, "loss": 0.0542, "step": 2887 }, { "epoch": 0.9779263930922096, "grad_norm": 0.515625, "learning_rate": 2.8121621954998457e-08, "loss": 0.0662, "step": 2888 }, { "epoch": 0.978265010264333, "grad_norm": 0.5390625, "learning_rate": 2.7288814270878262e-08, "loss": 0.0559, "step": 2889 }, { "epoch": 0.9786036274364563, "grad_norm": 0.80078125, "learning_rate": 2.6468507381045562e-08, "loss": 0.0583, "step": 2890 }, { "epoch": 0.9789422446085797, "grad_norm": 0.7109375, "learning_rate": 2.5660702313754505e-08, "loss": 0.0789, "step": 2891 }, { "epoch": 0.9792808617807031, "grad_norm": 1.1015625, "learning_rate": 2.4865400081589552e-08, "loss": 0.0491, "step": 2892 }, { "epoch": 0.9796194789528264, "grad_norm": 0.375, "learning_rate": 2.4082601681461038e-08, "loss": 0.0448, "step": 2893 }, { "epoch": 0.9799580961249498, "grad_norm": 0.5546875, "learning_rate": 2.3312308094607382e-08, "loss": 0.063, "step": 2894 }, { "epoch": 0.9802967132970731, "grad_norm": 0.67578125, "learning_rate": 2.2554520286592885e-08, "loss": 0.0595, "step": 2895 }, { "epoch": 0.9806353304691964, "grad_norm": 0.455078125, "learning_rate": 2.180923920730216e-08, "loss": 0.0575, "step": 2896 }, { "epoch": 0.9809739476413197, "grad_norm": 0.546875, "learning_rate": 2.10764657909468e-08, "loss": 0.0802, "step": 2897 }, { "epoch": 0.9813125648134431, "grad_norm": 0.451171875, "learning_rate": 2.0356200956058725e-08, "loss": 0.0639, "step": 2898 }, { "epoch": 0.9816511819855664, "grad_norm": 0.373046875, "learning_rate": 1.9648445605487954e-08, "loss": 0.044, "step": 2899 }, { "epoch": 0.9819897991576898, "grad_norm": 0.6484375, "learning_rate": 1.8953200626408153e-08, "loss": 0.0908, "step": 2900 }, { "epoch": 0.9823284163298132, "grad_norm": 0.5703125, "learning_rate": 1.827046689030665e-08, "loss": 0.0594, "step": 2901 }, { "epoch": 0.9826670335019365, "grad_norm": 0.54296875, "learning_rate": 1.76002452529922e-08, "loss": 0.051, "step": 2902 }, { "epoch": 0.9830056506740598, "grad_norm": 0.59765625, "learning_rate": 1.6942536554587218e-08, "loss": 0.0653, "step": 2903 }, { "epoch": 0.9833442678461831, "grad_norm": 0.41796875, "learning_rate": 1.6297341619528894e-08, "loss": 0.0472, "step": 2904 }, { "epoch": 0.9836828850183065, "grad_norm": 0.53515625, "learning_rate": 1.566466125656918e-08, "loss": 0.0653, "step": 2905 }, { "epoch": 0.9840215021904298, "grad_norm": 0.5546875, "learning_rate": 1.50444962587748e-08, "loss": 0.079, "step": 2906 }, { "epoch": 0.9843601193625532, "grad_norm": 0.578125, "learning_rate": 1.4436847403519471e-08, "loss": 0.0816, "step": 2907 }, { "epoch": 0.9846987365346765, "grad_norm": 0.57421875, "learning_rate": 1.3841715452493908e-08, "loss": 0.0587, "step": 2908 }, { "epoch": 0.9850373537067999, "grad_norm": 1.1875, "learning_rate": 1.325910115169471e-08, "loss": 0.0831, "step": 2909 }, { "epoch": 0.9853759708789231, "grad_norm": 0.455078125, "learning_rate": 1.2689005231429907e-08, "loss": 0.0584, "step": 2910 }, { "epoch": 0.9857145880510465, "grad_norm": 0.44140625, "learning_rate": 1.2131428406313428e-08, "loss": 0.0553, "step": 2911 }, { "epoch": 0.9860532052231699, "grad_norm": 0.58984375, "learning_rate": 1.1586371375268413e-08, "loss": 0.0721, "step": 2912 }, { "epoch": 0.9863918223952932, "grad_norm": 0.4140625, "learning_rate": 1.105383482152389e-08, "loss": 0.0446, "step": 2913 }, { "epoch": 0.9867304395674166, "grad_norm": 0.51953125, "learning_rate": 1.0533819412614776e-08, "loss": 0.0634, "step": 2914 }, { "epoch": 0.9870690567395399, "grad_norm": 0.66015625, "learning_rate": 1.0026325800380766e-08, "loss": 0.0712, "step": 2915 }, { "epoch": 0.9874076739116633, "grad_norm": 0.443359375, "learning_rate": 9.531354620964107e-09, "loss": 0.0529, "step": 2916 }, { "epoch": 0.9877462910837865, "grad_norm": 0.609375, "learning_rate": 9.048906494811826e-09, "loss": 0.0603, "step": 2917 }, { "epoch": 0.9880849082559099, "grad_norm": 0.486328125, "learning_rate": 8.5789820266724e-09, "loss": 0.0652, "step": 2918 }, { "epoch": 0.9884235254280332, "grad_norm": 0.515625, "learning_rate": 8.121581805596857e-09, "loss": 0.0653, "step": 2919 }, { "epoch": 0.9887621426001566, "grad_norm": 0.423828125, "learning_rate": 7.676706404935453e-09, "loss": 0.0558, "step": 2920 }, { "epoch": 0.98910075977228, "grad_norm": 0.490234375, "learning_rate": 7.24435638233989e-09, "loss": 0.066, "step": 2921 }, { "epoch": 0.9894393769444033, "grad_norm": 0.4140625, "learning_rate": 6.824532279761098e-09, "loss": 0.0527, "step": 2922 }, { "epoch": 0.9897779941165267, "grad_norm": 0.42578125, "learning_rate": 6.417234623449231e-09, "loss": 0.0532, "step": 2923 }, { "epoch": 0.99011661128865, "grad_norm": 0.4375, "learning_rate": 6.02246392395145e-09, "loss": 0.0528, "step": 2924 }, { "epoch": 0.9904552284607733, "grad_norm": 0.421875, "learning_rate": 5.6402206761119185e-09, "loss": 0.0547, "step": 2925 }, { "epoch": 0.9907938456328966, "grad_norm": 0.671875, "learning_rate": 5.27050535907403e-09, "loss": 0.0707, "step": 2926 }, { "epoch": 0.99113246280502, "grad_norm": 0.49609375, "learning_rate": 4.91331843627485e-09, "loss": 0.0661, "step": 2927 }, { "epoch": 0.9914710799771433, "grad_norm": 0.4765625, "learning_rate": 4.568660355448451e-09, "loss": 0.0526, "step": 2928 }, { "epoch": 0.9918096971492667, "grad_norm": 0.490234375, "learning_rate": 4.2365315486248e-09, "loss": 0.0686, "step": 2929 }, { "epoch": 0.9921483143213901, "grad_norm": 0.41015625, "learning_rate": 3.91693243212643e-09, "loss": 0.0554, "step": 2930 }, { "epoch": 0.9924869314935134, "grad_norm": 0.419921875, "learning_rate": 3.609863406570657e-09, "loss": 0.0495, "step": 2931 }, { "epoch": 0.9928255486656367, "grad_norm": 0.5078125, "learning_rate": 3.315324856869584e-09, "loss": 0.0686, "step": 2932 }, { "epoch": 0.99316416583776, "grad_norm": 0.423828125, "learning_rate": 3.0333171522256568e-09, "loss": 0.0586, "step": 2933 }, { "epoch": 0.9935027830098834, "grad_norm": 0.462890625, "learning_rate": 2.7638406461372167e-09, "loss": 0.065, "step": 2934 }, { "epoch": 0.9938414001820067, "grad_norm": 0.62109375, "learning_rate": 2.5068956763918405e-09, "loss": 0.0688, "step": 2935 }, { "epoch": 0.9941800173541301, "grad_norm": 0.66796875, "learning_rate": 2.262482565070778e-09, "loss": 0.0933, "step": 2936 }, { "epoch": 0.9945186345262534, "grad_norm": 0.83984375, "learning_rate": 2.0306016185456243e-09, "loss": 0.0699, "step": 2937 }, { "epoch": 0.9948572516983768, "grad_norm": 0.4375, "learning_rate": 1.8112531274794287e-09, "loss": 0.0583, "step": 2938 }, { "epoch": 0.9951958688705, "grad_norm": 0.5, "learning_rate": 1.6044373668255841e-09, "loss": 0.0712, "step": 2939 }, { "epoch": 0.9955344860426234, "grad_norm": 0.640625, "learning_rate": 1.4101545958267183e-09, "loss": 0.0991, "step": 2940 }, { "epoch": 0.9958731032147468, "grad_norm": 0.53125, "learning_rate": 1.228405058018023e-09, "loss": 0.0565, "step": 2941 }, { "epoch": 0.9962117203868701, "grad_norm": 0.50390625, "learning_rate": 1.0591889812205934e-09, "loss": 0.067, "step": 2942 }, { "epoch": 0.9965503375589935, "grad_norm": 0.380859375, "learning_rate": 9.025065775492003e-10, "loss": 0.0483, "step": 2943 }, { "epoch": 0.9968889547311168, "grad_norm": 0.431640625, "learning_rate": 7.583580434022963e-10, "loss": 0.064, "step": 2944 }, { "epoch": 0.9972275719032402, "grad_norm": 0.494140625, "learning_rate": 6.267435594720095e-10, "loss": 0.0585, "step": 2945 }, { "epoch": 0.9975661890753634, "grad_norm": 0.451171875, "learning_rate": 5.076632907374812e-10, "loss": 0.0568, "step": 2946 }, { "epoch": 0.9979048062474868, "grad_norm": 0.486328125, "learning_rate": 4.011173864637563e-10, "loss": 0.0697, "step": 2947 }, { "epoch": 0.9982434234196101, "grad_norm": 0.5234375, "learning_rate": 3.0710598020844416e-10, "loss": 0.0631, "step": 2948 }, { "epoch": 0.9985820405917335, "grad_norm": 0.44921875, "learning_rate": 2.2562918981394732e-10, "loss": 0.0585, "step": 2949 }, { "epoch": 0.9989206577638569, "grad_norm": 0.52734375, "learning_rate": 1.5668711741079202e-10, "loss": 0.0752, "step": 2950 }, { "epoch": 0.9992592749359802, "grad_norm": 0.48046875, "learning_rate": 1.0027984941873847e-10, "loss": 0.0674, "step": 2951 }, { "epoch": 0.9995978921081036, "grad_norm": 0.609375, "learning_rate": 5.640745654345026e-11, "loss": 0.0774, "step": 2952 }, { "epoch": 0.9999365092802269, "grad_norm": 0.515625, "learning_rate": 2.5069993779824887e-11, "loss": 0.0685, "step": 2953 }, { "epoch": 1.0, "grad_norm": 0.98046875, "learning_rate": 6.267500408663196e-12, "loss": 0.0576, "step": 2954 } ], "logging_steps": 1.0, "max_steps": 2954, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 296, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.726211900679385e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }