{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 4074, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007363770250368188, "grad_norm": 5.638559341430664, "learning_rate": 2.450980392156863e-08, "loss": 0.8206, "step": 1 }, { "epoch": 0.0014727540500736377, "grad_norm": 5.574512004852295, "learning_rate": 4.901960784313726e-08, "loss": 0.8179, "step": 2 }, { "epoch": 0.0022091310751104565, "grad_norm": 5.833616733551025, "learning_rate": 7.352941176470589e-08, "loss": 0.8298, "step": 3 }, { "epoch": 0.0029455081001472753, "grad_norm": 5.760316848754883, "learning_rate": 9.803921568627452e-08, "loss": 0.8612, "step": 4 }, { "epoch": 0.003681885125184094, "grad_norm": 5.582140922546387, "learning_rate": 1.2254901960784314e-07, "loss": 0.8597, "step": 5 }, { "epoch": 0.004418262150220913, "grad_norm": 5.870736122131348, "learning_rate": 1.4705882352941178e-07, "loss": 0.8647, "step": 6 }, { "epoch": 0.005154639175257732, "grad_norm": 5.794524192810059, "learning_rate": 1.7156862745098042e-07, "loss": 0.8709, "step": 7 }, { "epoch": 0.005891016200294551, "grad_norm": 5.833931922912598, "learning_rate": 1.9607843137254904e-07, "loss": 0.8438, "step": 8 }, { "epoch": 0.0066273932253313695, "grad_norm": 5.879095554351807, "learning_rate": 2.2058823529411768e-07, "loss": 0.8583, "step": 9 }, { "epoch": 0.007363770250368188, "grad_norm": 5.551036357879639, "learning_rate": 2.4509803921568627e-07, "loss": 0.8273, "step": 10 }, { "epoch": 0.008100147275405008, "grad_norm": 5.67064094543457, "learning_rate": 2.696078431372549e-07, "loss": 0.8328, "step": 11 }, { "epoch": 0.008836524300441826, "grad_norm": 5.897095203399658, "learning_rate": 2.9411764705882356e-07, "loss": 0.8742, "step": 12 }, { "epoch": 0.009572901325478646, "grad_norm": 5.685695648193359, "learning_rate": 3.1862745098039215e-07, "loss": 0.8389, "step": 13 }, { "epoch": 0.010309278350515464, "grad_norm": 5.742497444152832, "learning_rate": 3.4313725490196084e-07, "loss": 0.8695, "step": 14 }, { "epoch": 0.011045655375552283, "grad_norm": 5.5533037185668945, "learning_rate": 3.6764705882352943e-07, "loss": 0.8515, "step": 15 }, { "epoch": 0.011782032400589101, "grad_norm": 5.345178604125977, "learning_rate": 3.921568627450981e-07, "loss": 0.816, "step": 16 }, { "epoch": 0.012518409425625921, "grad_norm": 5.715511322021484, "learning_rate": 4.1666666666666667e-07, "loss": 0.9075, "step": 17 }, { "epoch": 0.013254786450662739, "grad_norm": 5.173557758331299, "learning_rate": 4.4117647058823536e-07, "loss": 0.8311, "step": 18 }, { "epoch": 0.013991163475699559, "grad_norm": 5.542694568634033, "learning_rate": 4.6568627450980395e-07, "loss": 0.8385, "step": 19 }, { "epoch": 0.014727540500736377, "grad_norm": 4.2676191329956055, "learning_rate": 4.901960784313725e-07, "loss": 0.8088, "step": 20 }, { "epoch": 0.015463917525773196, "grad_norm": 4.362669944763184, "learning_rate": 5.147058823529412e-07, "loss": 0.8291, "step": 21 }, { "epoch": 0.016200294550810016, "grad_norm": 4.3511643409729, "learning_rate": 5.392156862745098e-07, "loss": 0.8043, "step": 22 }, { "epoch": 0.016936671575846832, "grad_norm": 4.051642894744873, "learning_rate": 5.637254901960785e-07, "loss": 0.744, "step": 23 }, { "epoch": 0.017673048600883652, "grad_norm": 4.032516002655029, "learning_rate": 5.882352941176471e-07, "loss": 0.7975, "step": 24 }, { "epoch": 0.018409425625920472, "grad_norm": 3.7897393703460693, "learning_rate": 6.127450980392157e-07, "loss": 0.7686, "step": 25 }, { "epoch": 0.01914580265095729, "grad_norm": 3.99767804145813, "learning_rate": 6.372549019607843e-07, "loss": 0.777, "step": 26 }, { "epoch": 0.019882179675994108, "grad_norm": 2.312422037124634, "learning_rate": 6.61764705882353e-07, "loss": 0.7323, "step": 27 }, { "epoch": 0.020618556701030927, "grad_norm": 2.436676025390625, "learning_rate": 6.862745098039217e-07, "loss": 0.7402, "step": 28 }, { "epoch": 0.021354933726067747, "grad_norm": 2.346937417984009, "learning_rate": 7.107843137254903e-07, "loss": 0.7634, "step": 29 }, { "epoch": 0.022091310751104567, "grad_norm": 2.329108476638794, "learning_rate": 7.352941176470589e-07, "loss": 0.8077, "step": 30 }, { "epoch": 0.022827687776141383, "grad_norm": 2.1621270179748535, "learning_rate": 7.598039215686275e-07, "loss": 0.7406, "step": 31 }, { "epoch": 0.023564064801178203, "grad_norm": 2.004992723464966, "learning_rate": 7.843137254901962e-07, "loss": 0.7373, "step": 32 }, { "epoch": 0.024300441826215022, "grad_norm": 1.9387824535369873, "learning_rate": 8.088235294117648e-07, "loss": 0.6983, "step": 33 }, { "epoch": 0.025036818851251842, "grad_norm": 1.8947597742080688, "learning_rate": 8.333333333333333e-07, "loss": 0.7385, "step": 34 }, { "epoch": 0.02577319587628866, "grad_norm": 1.8148812055587769, "learning_rate": 8.57843137254902e-07, "loss": 0.7547, "step": 35 }, { "epoch": 0.026509572901325478, "grad_norm": 1.3978095054626465, "learning_rate": 8.823529411764707e-07, "loss": 0.721, "step": 36 }, { "epoch": 0.027245949926362298, "grad_norm": 1.5461397171020508, "learning_rate": 9.068627450980393e-07, "loss": 0.703, "step": 37 }, { "epoch": 0.027982326951399118, "grad_norm": 1.797391414642334, "learning_rate": 9.313725490196079e-07, "loss": 0.6854, "step": 38 }, { "epoch": 0.028718703976435934, "grad_norm": 2.147284746170044, "learning_rate": 9.558823529411764e-07, "loss": 0.736, "step": 39 }, { "epoch": 0.029455081001472753, "grad_norm": 2.267225980758667, "learning_rate": 9.80392156862745e-07, "loss": 0.7235, "step": 40 }, { "epoch": 0.030191458026509573, "grad_norm": 2.0823991298675537, "learning_rate": 1.0049019607843138e-06, "loss": 0.7262, "step": 41 }, { "epoch": 0.030927835051546393, "grad_norm": 2.126589775085449, "learning_rate": 1.0294117647058825e-06, "loss": 0.7298, "step": 42 }, { "epoch": 0.03166421207658321, "grad_norm": 1.8871612548828125, "learning_rate": 1.0539215686274512e-06, "loss": 0.7012, "step": 43 }, { "epoch": 0.03240058910162003, "grad_norm": 1.6312057971954346, "learning_rate": 1.0784313725490197e-06, "loss": 0.6583, "step": 44 }, { "epoch": 0.03313696612665685, "grad_norm": 1.5015575885772705, "learning_rate": 1.1029411764705884e-06, "loss": 0.6593, "step": 45 }, { "epoch": 0.033873343151693665, "grad_norm": 1.2276663780212402, "learning_rate": 1.127450980392157e-06, "loss": 0.6918, "step": 46 }, { "epoch": 0.03460972017673049, "grad_norm": 1.0622628927230835, "learning_rate": 1.1519607843137255e-06, "loss": 0.6918, "step": 47 }, { "epoch": 0.035346097201767304, "grad_norm": 0.9096408486366272, "learning_rate": 1.1764705882352942e-06, "loss": 0.6847, "step": 48 }, { "epoch": 0.03608247422680412, "grad_norm": 0.8563209772109985, "learning_rate": 1.200980392156863e-06, "loss": 0.6646, "step": 49 }, { "epoch": 0.036818851251840944, "grad_norm": 0.9862013459205627, "learning_rate": 1.2254901960784314e-06, "loss": 0.6732, "step": 50 }, { "epoch": 0.03755522827687776, "grad_norm": 1.099507212638855, "learning_rate": 1.25e-06, "loss": 0.6882, "step": 51 }, { "epoch": 0.03829160530191458, "grad_norm": 1.1145635843276978, "learning_rate": 1.2745098039215686e-06, "loss": 0.6658, "step": 52 }, { "epoch": 0.0390279823269514, "grad_norm": 1.0199302434921265, "learning_rate": 1.2990196078431375e-06, "loss": 0.6439, "step": 53 }, { "epoch": 0.039764359351988215, "grad_norm": 0.8398745059967041, "learning_rate": 1.323529411764706e-06, "loss": 0.641, "step": 54 }, { "epoch": 0.04050073637702504, "grad_norm": 0.7679091095924377, "learning_rate": 1.3480392156862745e-06, "loss": 0.6067, "step": 55 }, { "epoch": 0.041237113402061855, "grad_norm": 0.7991195321083069, "learning_rate": 1.3725490196078434e-06, "loss": 0.677, "step": 56 }, { "epoch": 0.04197349042709867, "grad_norm": 0.7389347553253174, "learning_rate": 1.3970588235294119e-06, "loss": 0.6504, "step": 57 }, { "epoch": 0.042709867452135494, "grad_norm": 0.7039929628372192, "learning_rate": 1.4215686274509805e-06, "loss": 0.6002, "step": 58 }, { "epoch": 0.04344624447717231, "grad_norm": 0.7065069675445557, "learning_rate": 1.4460784313725492e-06, "loss": 0.581, "step": 59 }, { "epoch": 0.044182621502209134, "grad_norm": 0.6952792406082153, "learning_rate": 1.4705882352941177e-06, "loss": 0.6523, "step": 60 }, { "epoch": 0.04491899852724595, "grad_norm": 0.6492939591407776, "learning_rate": 1.4950980392156864e-06, "loss": 0.6587, "step": 61 }, { "epoch": 0.045655375552282766, "grad_norm": 0.6436308026313782, "learning_rate": 1.519607843137255e-06, "loss": 0.6048, "step": 62 }, { "epoch": 0.04639175257731959, "grad_norm": 0.6987003087997437, "learning_rate": 1.5441176470588238e-06, "loss": 0.6353, "step": 63 }, { "epoch": 0.047128129602356406, "grad_norm": 0.698427677154541, "learning_rate": 1.5686274509803923e-06, "loss": 0.63, "step": 64 }, { "epoch": 0.04786450662739323, "grad_norm": 0.7047044634819031, "learning_rate": 1.5931372549019608e-06, "loss": 0.6048, "step": 65 }, { "epoch": 0.048600883652430045, "grad_norm": 0.6246572732925415, "learning_rate": 1.6176470588235297e-06, "loss": 0.6264, "step": 66 }, { "epoch": 0.04933726067746686, "grad_norm": 0.6060186624526978, "learning_rate": 1.6421568627450982e-06, "loss": 0.6277, "step": 67 }, { "epoch": 0.050073637702503684, "grad_norm": 0.5113458037376404, "learning_rate": 1.6666666666666667e-06, "loss": 0.5985, "step": 68 }, { "epoch": 0.0508100147275405, "grad_norm": 0.6172787547111511, "learning_rate": 1.6911764705882356e-06, "loss": 0.622, "step": 69 }, { "epoch": 0.05154639175257732, "grad_norm": 0.4991741180419922, "learning_rate": 1.715686274509804e-06, "loss": 0.6048, "step": 70 }, { "epoch": 0.05228276877761414, "grad_norm": 0.5184860825538635, "learning_rate": 1.7401960784313725e-06, "loss": 0.6033, "step": 71 }, { "epoch": 0.053019145802650956, "grad_norm": 0.4985435903072357, "learning_rate": 1.7647058823529414e-06, "loss": 0.5792, "step": 72 }, { "epoch": 0.05375552282768778, "grad_norm": 0.5244366526603699, "learning_rate": 1.78921568627451e-06, "loss": 0.6168, "step": 73 }, { "epoch": 0.054491899852724596, "grad_norm": 0.6064889430999756, "learning_rate": 1.8137254901960786e-06, "loss": 0.6034, "step": 74 }, { "epoch": 0.05522827687776141, "grad_norm": 0.5819352269172668, "learning_rate": 1.8382352941176473e-06, "loss": 0.5942, "step": 75 }, { "epoch": 0.055964653902798235, "grad_norm": 0.47975030541419983, "learning_rate": 1.8627450980392158e-06, "loss": 0.5817, "step": 76 }, { "epoch": 0.05670103092783505, "grad_norm": 0.47679486870765686, "learning_rate": 1.8872549019607845e-06, "loss": 0.5789, "step": 77 }, { "epoch": 0.05743740795287187, "grad_norm": 0.44839027523994446, "learning_rate": 1.9117647058823528e-06, "loss": 0.595, "step": 78 }, { "epoch": 0.05817378497790869, "grad_norm": 0.43815600872039795, "learning_rate": 1.9362745098039217e-06, "loss": 0.5648, "step": 79 }, { "epoch": 0.05891016200294551, "grad_norm": 0.457387775182724, "learning_rate": 1.96078431372549e-06, "loss": 0.6067, "step": 80 }, { "epoch": 0.05964653902798233, "grad_norm": 0.4818171560764313, "learning_rate": 1.985294117647059e-06, "loss": 0.61, "step": 81 }, { "epoch": 0.060382916053019146, "grad_norm": 0.4079191982746124, "learning_rate": 2.0098039215686276e-06, "loss": 0.5663, "step": 82 }, { "epoch": 0.06111929307805596, "grad_norm": 0.4442485272884369, "learning_rate": 2.034313725490196e-06, "loss": 0.5761, "step": 83 }, { "epoch": 0.061855670103092786, "grad_norm": 0.4076555073261261, "learning_rate": 2.058823529411765e-06, "loss": 0.5667, "step": 84 }, { "epoch": 0.0625920471281296, "grad_norm": 0.4406167268753052, "learning_rate": 2.0833333333333334e-06, "loss": 0.586, "step": 85 }, { "epoch": 0.06332842415316642, "grad_norm": 0.4781548082828522, "learning_rate": 2.1078431372549023e-06, "loss": 0.5387, "step": 86 }, { "epoch": 0.06406480117820323, "grad_norm": 0.4489077925682068, "learning_rate": 2.132352941176471e-06, "loss": 0.5346, "step": 87 }, { "epoch": 0.06480117820324006, "grad_norm": 0.47296687960624695, "learning_rate": 2.1568627450980393e-06, "loss": 0.611, "step": 88 }, { "epoch": 0.06553755522827688, "grad_norm": 0.4460233151912689, "learning_rate": 2.1813725490196082e-06, "loss": 0.5806, "step": 89 }, { "epoch": 0.0662739322533137, "grad_norm": 0.45212772488594055, "learning_rate": 2.2058823529411767e-06, "loss": 0.5761, "step": 90 }, { "epoch": 0.06701030927835051, "grad_norm": 0.4867306053638458, "learning_rate": 2.2303921568627456e-06, "loss": 0.555, "step": 91 }, { "epoch": 0.06774668630338733, "grad_norm": 0.44260451197624207, "learning_rate": 2.254901960784314e-06, "loss": 0.5333, "step": 92 }, { "epoch": 0.06848306332842416, "grad_norm": 0.4509826898574829, "learning_rate": 2.2794117647058826e-06, "loss": 0.5817, "step": 93 }, { "epoch": 0.06921944035346098, "grad_norm": 0.422654926776886, "learning_rate": 2.303921568627451e-06, "loss": 0.5797, "step": 94 }, { "epoch": 0.06995581737849779, "grad_norm": 0.40386420488357544, "learning_rate": 2.32843137254902e-06, "loss": 0.5972, "step": 95 }, { "epoch": 0.07069219440353461, "grad_norm": 0.4382508099079132, "learning_rate": 2.3529411764705885e-06, "loss": 0.588, "step": 96 }, { "epoch": 0.07142857142857142, "grad_norm": 0.4391435980796814, "learning_rate": 2.377450980392157e-06, "loss": 0.5434, "step": 97 }, { "epoch": 0.07216494845360824, "grad_norm": 0.39785510301589966, "learning_rate": 2.401960784313726e-06, "loss": 0.55, "step": 98 }, { "epoch": 0.07290132547864507, "grad_norm": 0.4807189404964447, "learning_rate": 2.4264705882352943e-06, "loss": 0.5642, "step": 99 }, { "epoch": 0.07363770250368189, "grad_norm": 0.40981948375701904, "learning_rate": 2.450980392156863e-06, "loss": 0.5257, "step": 100 }, { "epoch": 0.0743740795287187, "grad_norm": 0.4581270217895508, "learning_rate": 2.4754901960784317e-06, "loss": 0.5676, "step": 101 }, { "epoch": 0.07511045655375552, "grad_norm": 0.4791598320007324, "learning_rate": 2.5e-06, "loss": 0.5589, "step": 102 }, { "epoch": 0.07584683357879234, "grad_norm": 0.4747304916381836, "learning_rate": 2.5245098039215687e-06, "loss": 0.573, "step": 103 }, { "epoch": 0.07658321060382917, "grad_norm": 0.501209557056427, "learning_rate": 2.549019607843137e-06, "loss": 0.5354, "step": 104 }, { "epoch": 0.07731958762886598, "grad_norm": 0.42233791947364807, "learning_rate": 2.5735294117647057e-06, "loss": 0.5822, "step": 105 }, { "epoch": 0.0780559646539028, "grad_norm": 0.44912800192832947, "learning_rate": 2.598039215686275e-06, "loss": 0.5279, "step": 106 }, { "epoch": 0.07879234167893961, "grad_norm": 0.47332027554512024, "learning_rate": 2.6225490196078435e-06, "loss": 0.5641, "step": 107 }, { "epoch": 0.07952871870397643, "grad_norm": 0.44078969955444336, "learning_rate": 2.647058823529412e-06, "loss": 0.555, "step": 108 }, { "epoch": 0.08026509572901326, "grad_norm": 0.4458158016204834, "learning_rate": 2.6715686274509804e-06, "loss": 0.5706, "step": 109 }, { "epoch": 0.08100147275405008, "grad_norm": 0.4494592249393463, "learning_rate": 2.696078431372549e-06, "loss": 0.5742, "step": 110 }, { "epoch": 0.0817378497790869, "grad_norm": 0.47519755363464355, "learning_rate": 2.720588235294118e-06, "loss": 0.5553, "step": 111 }, { "epoch": 0.08247422680412371, "grad_norm": 0.4567236006259918, "learning_rate": 2.7450980392156867e-06, "loss": 0.5677, "step": 112 }, { "epoch": 0.08321060382916053, "grad_norm": 0.45444926619529724, "learning_rate": 2.7696078431372552e-06, "loss": 0.537, "step": 113 }, { "epoch": 0.08394698085419734, "grad_norm": 0.5289820432662964, "learning_rate": 2.7941176470588237e-06, "loss": 0.5701, "step": 114 }, { "epoch": 0.08468335787923417, "grad_norm": 0.5139638781547546, "learning_rate": 2.818627450980392e-06, "loss": 0.5588, "step": 115 }, { "epoch": 0.08541973490427099, "grad_norm": 0.42048802971839905, "learning_rate": 2.843137254901961e-06, "loss": 0.5342, "step": 116 }, { "epoch": 0.0861561119293078, "grad_norm": 0.43646562099456787, "learning_rate": 2.8676470588235296e-06, "loss": 0.5506, "step": 117 }, { "epoch": 0.08689248895434462, "grad_norm": 0.4417460560798645, "learning_rate": 2.8921568627450985e-06, "loss": 0.5493, "step": 118 }, { "epoch": 0.08762886597938144, "grad_norm": 0.46676769852638245, "learning_rate": 2.916666666666667e-06, "loss": 0.5831, "step": 119 }, { "epoch": 0.08836524300441827, "grad_norm": 0.45995548367500305, "learning_rate": 2.9411764705882355e-06, "loss": 0.5807, "step": 120 }, { "epoch": 0.08910162002945508, "grad_norm": 0.42616021633148193, "learning_rate": 2.9656862745098044e-06, "loss": 0.5562, "step": 121 }, { "epoch": 0.0898379970544919, "grad_norm": 0.419491708278656, "learning_rate": 2.990196078431373e-06, "loss": 0.5285, "step": 122 }, { "epoch": 0.09057437407952872, "grad_norm": 0.4764864146709442, "learning_rate": 3.0147058823529413e-06, "loss": 0.5761, "step": 123 }, { "epoch": 0.09131075110456553, "grad_norm": 0.4339980185031891, "learning_rate": 3.03921568627451e-06, "loss": 0.5707, "step": 124 }, { "epoch": 0.09204712812960236, "grad_norm": 0.41494664549827576, "learning_rate": 3.0637254901960787e-06, "loss": 0.5294, "step": 125 }, { "epoch": 0.09278350515463918, "grad_norm": 0.43163448572158813, "learning_rate": 3.0882352941176476e-06, "loss": 0.5539, "step": 126 }, { "epoch": 0.093519882179676, "grad_norm": 0.42778855562210083, "learning_rate": 3.112745098039216e-06, "loss": 0.5625, "step": 127 }, { "epoch": 0.09425625920471281, "grad_norm": 0.4978928565979004, "learning_rate": 3.1372549019607846e-06, "loss": 0.5107, "step": 128 }, { "epoch": 0.09499263622974963, "grad_norm": 0.4967725872993469, "learning_rate": 3.161764705882353e-06, "loss": 0.5434, "step": 129 }, { "epoch": 0.09572901325478646, "grad_norm": 0.4463275372982025, "learning_rate": 3.1862745098039216e-06, "loss": 0.553, "step": 130 }, { "epoch": 0.09646539027982327, "grad_norm": 0.5262526869773865, "learning_rate": 3.210784313725491e-06, "loss": 0.5464, "step": 131 }, { "epoch": 0.09720176730486009, "grad_norm": 0.4471772015094757, "learning_rate": 3.2352941176470594e-06, "loss": 0.5089, "step": 132 }, { "epoch": 0.0979381443298969, "grad_norm": 0.41978949308395386, "learning_rate": 3.259803921568628e-06, "loss": 0.4965, "step": 133 }, { "epoch": 0.09867452135493372, "grad_norm": 0.4757583439350128, "learning_rate": 3.2843137254901964e-06, "loss": 0.5509, "step": 134 }, { "epoch": 0.09941089837997054, "grad_norm": 0.4464501738548279, "learning_rate": 3.308823529411765e-06, "loss": 0.5032, "step": 135 }, { "epoch": 0.10014727540500737, "grad_norm": 0.46477508544921875, "learning_rate": 3.3333333333333333e-06, "loss": 0.5429, "step": 136 }, { "epoch": 0.10088365243004419, "grad_norm": 0.426889032125473, "learning_rate": 3.357843137254902e-06, "loss": 0.5572, "step": 137 }, { "epoch": 0.101620029455081, "grad_norm": 0.42896464467048645, "learning_rate": 3.382352941176471e-06, "loss": 0.541, "step": 138 }, { "epoch": 0.10235640648011782, "grad_norm": 0.4436034560203552, "learning_rate": 3.4068627450980396e-06, "loss": 0.5648, "step": 139 }, { "epoch": 0.10309278350515463, "grad_norm": 0.4158652722835541, "learning_rate": 3.431372549019608e-06, "loss": 0.5607, "step": 140 }, { "epoch": 0.10382916053019146, "grad_norm": 0.4582897424697876, "learning_rate": 3.4558823529411766e-06, "loss": 0.5745, "step": 141 }, { "epoch": 0.10456553755522828, "grad_norm": 0.4275548756122589, "learning_rate": 3.480392156862745e-06, "loss": 0.5147, "step": 142 }, { "epoch": 0.1053019145802651, "grad_norm": 0.4628273546695709, "learning_rate": 3.504901960784314e-06, "loss": 0.5305, "step": 143 }, { "epoch": 0.10603829160530191, "grad_norm": 0.4495219588279724, "learning_rate": 3.529411764705883e-06, "loss": 0.563, "step": 144 }, { "epoch": 0.10677466863033873, "grad_norm": 0.48380184173583984, "learning_rate": 3.5539215686274514e-06, "loss": 0.562, "step": 145 }, { "epoch": 0.10751104565537556, "grad_norm": 0.4028421938419342, "learning_rate": 3.57843137254902e-06, "loss": 0.5137, "step": 146 }, { "epoch": 0.10824742268041238, "grad_norm": 0.4970363974571228, "learning_rate": 3.6029411764705883e-06, "loss": 0.5268, "step": 147 }, { "epoch": 0.10898379970544919, "grad_norm": 0.46763232350349426, "learning_rate": 3.6274509803921573e-06, "loss": 0.542, "step": 148 }, { "epoch": 0.10972017673048601, "grad_norm": 0.457398921251297, "learning_rate": 3.6519607843137257e-06, "loss": 0.5289, "step": 149 }, { "epoch": 0.11045655375552282, "grad_norm": 0.44986864924430847, "learning_rate": 3.6764705882352946e-06, "loss": 0.549, "step": 150 }, { "epoch": 0.11119293078055964, "grad_norm": 0.46221205592155457, "learning_rate": 3.700980392156863e-06, "loss": 0.5005, "step": 151 }, { "epoch": 0.11192930780559647, "grad_norm": 0.5250900983810425, "learning_rate": 3.7254901960784316e-06, "loss": 0.5468, "step": 152 }, { "epoch": 0.11266568483063329, "grad_norm": 0.434471994638443, "learning_rate": 3.7500000000000005e-06, "loss": 0.5307, "step": 153 }, { "epoch": 0.1134020618556701, "grad_norm": 0.5031168460845947, "learning_rate": 3.774509803921569e-06, "loss": 0.5383, "step": 154 }, { "epoch": 0.11413843888070692, "grad_norm": 0.4508562684059143, "learning_rate": 3.7990196078431375e-06, "loss": 0.5359, "step": 155 }, { "epoch": 0.11487481590574374, "grad_norm": 0.4205038845539093, "learning_rate": 3.8235294117647055e-06, "loss": 0.5315, "step": 156 }, { "epoch": 0.11561119293078057, "grad_norm": 0.40232402086257935, "learning_rate": 3.848039215686275e-06, "loss": 0.5075, "step": 157 }, { "epoch": 0.11634756995581738, "grad_norm": 0.45643943548202515, "learning_rate": 3.872549019607843e-06, "loss": 0.5481, "step": 158 }, { "epoch": 0.1170839469808542, "grad_norm": 0.4849976599216461, "learning_rate": 3.897058823529412e-06, "loss": 0.5068, "step": 159 }, { "epoch": 0.11782032400589101, "grad_norm": 0.4546898901462555, "learning_rate": 3.92156862745098e-06, "loss": 0.5489, "step": 160 }, { "epoch": 0.11855670103092783, "grad_norm": 0.4682084023952484, "learning_rate": 3.946078431372549e-06, "loss": 0.5448, "step": 161 }, { "epoch": 0.11929307805596466, "grad_norm": 0.4213881492614746, "learning_rate": 3.970588235294118e-06, "loss": 0.4813, "step": 162 }, { "epoch": 0.12002945508100148, "grad_norm": 0.4240805208683014, "learning_rate": 3.995098039215687e-06, "loss": 0.5164, "step": 163 }, { "epoch": 0.12076583210603829, "grad_norm": 0.5331303477287292, "learning_rate": 4.019607843137255e-06, "loss": 0.5201, "step": 164 }, { "epoch": 0.12150220913107511, "grad_norm": 0.4699327349662781, "learning_rate": 4.044117647058824e-06, "loss": 0.527, "step": 165 }, { "epoch": 0.12223858615611193, "grad_norm": 0.44539308547973633, "learning_rate": 4.068627450980392e-06, "loss": 0.4908, "step": 166 }, { "epoch": 0.12297496318114874, "grad_norm": 0.4506961703300476, "learning_rate": 4.093137254901961e-06, "loss": 0.5314, "step": 167 }, { "epoch": 0.12371134020618557, "grad_norm": 0.44389674067497253, "learning_rate": 4.11764705882353e-06, "loss": 0.506, "step": 168 }, { "epoch": 0.12444771723122239, "grad_norm": 0.445320725440979, "learning_rate": 4.142156862745099e-06, "loss": 0.5236, "step": 169 }, { "epoch": 0.1251840942562592, "grad_norm": 0.44341355562210083, "learning_rate": 4.166666666666667e-06, "loss": 0.571, "step": 170 }, { "epoch": 0.12592047128129602, "grad_norm": 0.42870596051216125, "learning_rate": 4.191176470588236e-06, "loss": 0.5204, "step": 171 }, { "epoch": 0.12665684830633284, "grad_norm": 0.4404330849647522, "learning_rate": 4.215686274509805e-06, "loss": 0.5623, "step": 172 }, { "epoch": 0.12739322533136965, "grad_norm": 0.542121946811676, "learning_rate": 4.240196078431373e-06, "loss": 0.5301, "step": 173 }, { "epoch": 0.12812960235640647, "grad_norm": 0.5006332397460938, "learning_rate": 4.264705882352942e-06, "loss": 0.5237, "step": 174 }, { "epoch": 0.12886597938144329, "grad_norm": 0.4504947066307068, "learning_rate": 4.28921568627451e-06, "loss": 0.5052, "step": 175 }, { "epoch": 0.12960235640648013, "grad_norm": 0.4694218635559082, "learning_rate": 4.313725490196079e-06, "loss": 0.5012, "step": 176 }, { "epoch": 0.13033873343151695, "grad_norm": 0.47194620966911316, "learning_rate": 4.3382352941176475e-06, "loss": 0.5049, "step": 177 }, { "epoch": 0.13107511045655376, "grad_norm": 0.5346331596374512, "learning_rate": 4.3627450980392164e-06, "loss": 0.5118, "step": 178 }, { "epoch": 0.13181148748159058, "grad_norm": 0.5172557234764099, "learning_rate": 4.3872549019607845e-06, "loss": 0.4994, "step": 179 }, { "epoch": 0.1325478645066274, "grad_norm": 0.4675532579421997, "learning_rate": 4.411764705882353e-06, "loss": 0.5519, "step": 180 }, { "epoch": 0.1332842415316642, "grad_norm": 0.4985281527042389, "learning_rate": 4.4362745098039215e-06, "loss": 0.5058, "step": 181 }, { "epoch": 0.13402061855670103, "grad_norm": 0.46521127223968506, "learning_rate": 4.460784313725491e-06, "loss": 0.5559, "step": 182 }, { "epoch": 0.13475699558173784, "grad_norm": 0.48757219314575195, "learning_rate": 4.485294117647059e-06, "loss": 0.5249, "step": 183 }, { "epoch": 0.13549337260677466, "grad_norm": 0.5091550946235657, "learning_rate": 4.509803921568628e-06, "loss": 0.5223, "step": 184 }, { "epoch": 0.13622974963181148, "grad_norm": 0.4934118688106537, "learning_rate": 4.534313725490196e-06, "loss": 0.5419, "step": 185 }, { "epoch": 0.13696612665684832, "grad_norm": 0.5085634589195251, "learning_rate": 4.558823529411765e-06, "loss": 0.516, "step": 186 }, { "epoch": 0.13770250368188514, "grad_norm": 0.46844273805618286, "learning_rate": 4.583333333333333e-06, "loss": 0.5093, "step": 187 }, { "epoch": 0.13843888070692195, "grad_norm": 0.45088204741477966, "learning_rate": 4.607843137254902e-06, "loss": 0.5236, "step": 188 }, { "epoch": 0.13917525773195877, "grad_norm": 0.4446581304073334, "learning_rate": 4.632352941176471e-06, "loss": 0.5117, "step": 189 }, { "epoch": 0.13991163475699558, "grad_norm": 0.47169244289398193, "learning_rate": 4.65686274509804e-06, "loss": 0.5264, "step": 190 }, { "epoch": 0.1406480117820324, "grad_norm": 0.4336180090904236, "learning_rate": 4.681372549019608e-06, "loss": 0.527, "step": 191 }, { "epoch": 0.14138438880706922, "grad_norm": 0.4745936989784241, "learning_rate": 4.705882352941177e-06, "loss": 0.5235, "step": 192 }, { "epoch": 0.14212076583210603, "grad_norm": 0.49792900681495667, "learning_rate": 4.730392156862745e-06, "loss": 0.5006, "step": 193 }, { "epoch": 0.14285714285714285, "grad_norm": 0.503589391708374, "learning_rate": 4.754901960784314e-06, "loss": 0.5314, "step": 194 }, { "epoch": 0.14359351988217967, "grad_norm": 0.4816850423812866, "learning_rate": 4.779411764705883e-06, "loss": 0.5176, "step": 195 }, { "epoch": 0.14432989690721648, "grad_norm": 0.5058243870735168, "learning_rate": 4.803921568627452e-06, "loss": 0.4917, "step": 196 }, { "epoch": 0.14506627393225333, "grad_norm": 0.4948558509349823, "learning_rate": 4.82843137254902e-06, "loss": 0.5227, "step": 197 }, { "epoch": 0.14580265095729014, "grad_norm": 0.43871647119522095, "learning_rate": 4.852941176470589e-06, "loss": 0.5413, "step": 198 }, { "epoch": 0.14653902798232696, "grad_norm": 0.45166996121406555, "learning_rate": 4.8774509803921576e-06, "loss": 0.5645, "step": 199 }, { "epoch": 0.14727540500736377, "grad_norm": 0.5076332688331604, "learning_rate": 4.901960784313726e-06, "loss": 0.5222, "step": 200 }, { "epoch": 0.1480117820324006, "grad_norm": 0.46355342864990234, "learning_rate": 4.9264705882352945e-06, "loss": 0.4985, "step": 201 }, { "epoch": 0.1487481590574374, "grad_norm": 0.473640114068985, "learning_rate": 4.9509803921568634e-06, "loss": 0.5339, "step": 202 }, { "epoch": 0.14948453608247422, "grad_norm": 0.43335700035095215, "learning_rate": 4.9754901960784315e-06, "loss": 0.5236, "step": 203 }, { "epoch": 0.15022091310751104, "grad_norm": 0.5285197496414185, "learning_rate": 5e-06, "loss": 0.5102, "step": 204 }, { "epoch": 0.15095729013254786, "grad_norm": 0.4338807463645935, "learning_rate": 5.024509803921569e-06, "loss": 0.5243, "step": 205 }, { "epoch": 0.15169366715758467, "grad_norm": 0.5603867769241333, "learning_rate": 5.049019607843137e-06, "loss": 0.5269, "step": 206 }, { "epoch": 0.15243004418262152, "grad_norm": 0.547127366065979, "learning_rate": 5.073529411764706e-06, "loss": 0.5376, "step": 207 }, { "epoch": 0.15316642120765833, "grad_norm": 0.44914114475250244, "learning_rate": 5.098039215686274e-06, "loss": 0.4983, "step": 208 }, { "epoch": 0.15390279823269515, "grad_norm": 0.48467424511909485, "learning_rate": 5.122549019607843e-06, "loss": 0.4931, "step": 209 }, { "epoch": 0.15463917525773196, "grad_norm": 0.445891797542572, "learning_rate": 5.147058823529411e-06, "loss": 0.5086, "step": 210 }, { "epoch": 0.15537555228276878, "grad_norm": 0.44430914521217346, "learning_rate": 5.171568627450981e-06, "loss": 0.5023, "step": 211 }, { "epoch": 0.1561119293078056, "grad_norm": 0.5020100474357605, "learning_rate": 5.19607843137255e-06, "loss": 0.492, "step": 212 }, { "epoch": 0.1568483063328424, "grad_norm": 0.4869686961174011, "learning_rate": 5.220588235294118e-06, "loss": 0.4876, "step": 213 }, { "epoch": 0.15758468335787923, "grad_norm": 0.48111408948898315, "learning_rate": 5.245098039215687e-06, "loss": 0.5117, "step": 214 }, { "epoch": 0.15832106038291605, "grad_norm": 0.5187819004058838, "learning_rate": 5.269607843137256e-06, "loss": 0.5188, "step": 215 }, { "epoch": 0.15905743740795286, "grad_norm": 0.510890007019043, "learning_rate": 5.294117647058824e-06, "loss": 0.545, "step": 216 }, { "epoch": 0.15979381443298968, "grad_norm": 0.5489839315414429, "learning_rate": 5.318627450980393e-06, "loss": 0.508, "step": 217 }, { "epoch": 0.16053019145802652, "grad_norm": 0.4811173677444458, "learning_rate": 5.343137254901961e-06, "loss": 0.4955, "step": 218 }, { "epoch": 0.16126656848306334, "grad_norm": 0.49445685744285583, "learning_rate": 5.36764705882353e-06, "loss": 0.4678, "step": 219 }, { "epoch": 0.16200294550810015, "grad_norm": 0.44502392411231995, "learning_rate": 5.392156862745098e-06, "loss": 0.5166, "step": 220 }, { "epoch": 0.16273932253313697, "grad_norm": 0.46069005131721497, "learning_rate": 5.416666666666667e-06, "loss": 0.5279, "step": 221 }, { "epoch": 0.1634756995581738, "grad_norm": 0.4491000175476074, "learning_rate": 5.441176470588236e-06, "loss": 0.5093, "step": 222 }, { "epoch": 0.1642120765832106, "grad_norm": 0.5266180634498596, "learning_rate": 5.465686274509804e-06, "loss": 0.5221, "step": 223 }, { "epoch": 0.16494845360824742, "grad_norm": 0.48982885479927063, "learning_rate": 5.4901960784313735e-06, "loss": 0.4804, "step": 224 }, { "epoch": 0.16568483063328424, "grad_norm": 0.46362483501434326, "learning_rate": 5.514705882352942e-06, "loss": 0.524, "step": 225 }, { "epoch": 0.16642120765832105, "grad_norm": 0.480934202671051, "learning_rate": 5.5392156862745104e-06, "loss": 0.5063, "step": 226 }, { "epoch": 0.16715758468335787, "grad_norm": 0.4418233633041382, "learning_rate": 5.563725490196079e-06, "loss": 0.485, "step": 227 }, { "epoch": 0.16789396170839468, "grad_norm": 0.4766086935997009, "learning_rate": 5.588235294117647e-06, "loss": 0.4859, "step": 228 }, { "epoch": 0.16863033873343153, "grad_norm": 0.5201296806335449, "learning_rate": 5.612745098039216e-06, "loss": 0.5128, "step": 229 }, { "epoch": 0.16936671575846834, "grad_norm": 0.4526103734970093, "learning_rate": 5.637254901960784e-06, "loss": 0.5115, "step": 230 }, { "epoch": 0.17010309278350516, "grad_norm": 0.4606441557407379, "learning_rate": 5.661764705882353e-06, "loss": 0.5074, "step": 231 }, { "epoch": 0.17083946980854198, "grad_norm": 0.4658742845058441, "learning_rate": 5.686274509803922e-06, "loss": 0.5212, "step": 232 }, { "epoch": 0.1715758468335788, "grad_norm": 0.45116207003593445, "learning_rate": 5.71078431372549e-06, "loss": 0.5178, "step": 233 }, { "epoch": 0.1723122238586156, "grad_norm": 0.40379467606544495, "learning_rate": 5.735294117647059e-06, "loss": 0.4907, "step": 234 }, { "epoch": 0.17304860088365243, "grad_norm": 0.461535781621933, "learning_rate": 5.759803921568627e-06, "loss": 0.4812, "step": 235 }, { "epoch": 0.17378497790868924, "grad_norm": 0.40763628482818604, "learning_rate": 5.784313725490197e-06, "loss": 0.4861, "step": 236 }, { "epoch": 0.17452135493372606, "grad_norm": 0.4869963228702545, "learning_rate": 5.808823529411766e-06, "loss": 0.5126, "step": 237 }, { "epoch": 0.17525773195876287, "grad_norm": 0.43191322684288025, "learning_rate": 5.833333333333334e-06, "loss": 0.5194, "step": 238 }, { "epoch": 0.17599410898379972, "grad_norm": 0.4565574824810028, "learning_rate": 5.857843137254903e-06, "loss": 0.5108, "step": 239 }, { "epoch": 0.17673048600883653, "grad_norm": 0.4109247326850891, "learning_rate": 5.882352941176471e-06, "loss": 0.4885, "step": 240 }, { "epoch": 0.17746686303387335, "grad_norm": 0.5282112956047058, "learning_rate": 5.90686274509804e-06, "loss": 0.538, "step": 241 }, { "epoch": 0.17820324005891017, "grad_norm": 0.4757918417453766, "learning_rate": 5.931372549019609e-06, "loss": 0.5143, "step": 242 }, { "epoch": 0.17893961708394698, "grad_norm": 0.4788837134838104, "learning_rate": 5.955882352941177e-06, "loss": 0.4996, "step": 243 }, { "epoch": 0.1796759941089838, "grad_norm": 0.4603605568408966, "learning_rate": 5.980392156862746e-06, "loss": 0.4856, "step": 244 }, { "epoch": 0.18041237113402062, "grad_norm": 0.48925524950027466, "learning_rate": 6.004901960784314e-06, "loss": 0.5062, "step": 245 }, { "epoch": 0.18114874815905743, "grad_norm": 0.4555787742137909, "learning_rate": 6.029411764705883e-06, "loss": 0.4869, "step": 246 }, { "epoch": 0.18188512518409425, "grad_norm": 0.4337728023529053, "learning_rate": 6.053921568627451e-06, "loss": 0.5095, "step": 247 }, { "epoch": 0.18262150220913106, "grad_norm": 0.4596711993217468, "learning_rate": 6.07843137254902e-06, "loss": 0.5334, "step": 248 }, { "epoch": 0.18335787923416788, "grad_norm": 0.55954509973526, "learning_rate": 6.102941176470589e-06, "loss": 0.542, "step": 249 }, { "epoch": 0.18409425625920472, "grad_norm": 0.4391202926635742, "learning_rate": 6.1274509803921575e-06, "loss": 0.5085, "step": 250 }, { "epoch": 0.18483063328424154, "grad_norm": 0.5165490508079529, "learning_rate": 6.151960784313726e-06, "loss": 0.5267, "step": 251 }, { "epoch": 0.18556701030927836, "grad_norm": 0.5765572786331177, "learning_rate": 6.176470588235295e-06, "loss": 0.5302, "step": 252 }, { "epoch": 0.18630338733431517, "grad_norm": 0.4425363540649414, "learning_rate": 6.200980392156863e-06, "loss": 0.5294, "step": 253 }, { "epoch": 0.187039764359352, "grad_norm": 0.5258336663246155, "learning_rate": 6.225490196078432e-06, "loss": 0.4794, "step": 254 }, { "epoch": 0.1877761413843888, "grad_norm": 0.4908381998538971, "learning_rate": 6.25e-06, "loss": 0.4942, "step": 255 }, { "epoch": 0.18851251840942562, "grad_norm": 0.480133980512619, "learning_rate": 6.274509803921569e-06, "loss": 0.4991, "step": 256 }, { "epoch": 0.18924889543446244, "grad_norm": 0.48387065529823303, "learning_rate": 6.299019607843137e-06, "loss": 0.513, "step": 257 }, { "epoch": 0.18998527245949925, "grad_norm": 0.5518890619277954, "learning_rate": 6.323529411764706e-06, "loss": 0.522, "step": 258 }, { "epoch": 0.19072164948453607, "grad_norm": 0.5575649738311768, "learning_rate": 6.348039215686275e-06, "loss": 0.5046, "step": 259 }, { "epoch": 0.19145802650957292, "grad_norm": 0.5497487187385559, "learning_rate": 6.372549019607843e-06, "loss": 0.517, "step": 260 }, { "epoch": 0.19219440353460973, "grad_norm": 0.613700807094574, "learning_rate": 6.397058823529412e-06, "loss": 0.4874, "step": 261 }, { "epoch": 0.19293078055964655, "grad_norm": 0.5236343741416931, "learning_rate": 6.421568627450982e-06, "loss": 0.4959, "step": 262 }, { "epoch": 0.19366715758468336, "grad_norm": 0.5656614303588867, "learning_rate": 6.44607843137255e-06, "loss": 0.5192, "step": 263 }, { "epoch": 0.19440353460972018, "grad_norm": 0.4702565371990204, "learning_rate": 6.470588235294119e-06, "loss": 0.4974, "step": 264 }, { "epoch": 0.195139911634757, "grad_norm": 0.5624386668205261, "learning_rate": 6.495098039215687e-06, "loss": 0.5131, "step": 265 }, { "epoch": 0.1958762886597938, "grad_norm": 0.5100853443145752, "learning_rate": 6.519607843137256e-06, "loss": 0.4945, "step": 266 }, { "epoch": 0.19661266568483063, "grad_norm": 0.48825833201408386, "learning_rate": 6.544117647058824e-06, "loss": 0.4909, "step": 267 }, { "epoch": 0.19734904270986744, "grad_norm": 0.4945085942745209, "learning_rate": 6.568627450980393e-06, "loss": 0.5235, "step": 268 }, { "epoch": 0.19808541973490426, "grad_norm": 0.46311086416244507, "learning_rate": 6.593137254901962e-06, "loss": 0.4975, "step": 269 }, { "epoch": 0.19882179675994108, "grad_norm": 0.4879513680934906, "learning_rate": 6.61764705882353e-06, "loss": 0.494, "step": 270 }, { "epoch": 0.19955817378497792, "grad_norm": 0.43951448798179626, "learning_rate": 6.642156862745099e-06, "loss": 0.5052, "step": 271 }, { "epoch": 0.20029455081001474, "grad_norm": 0.4860183596611023, "learning_rate": 6.666666666666667e-06, "loss": 0.506, "step": 272 }, { "epoch": 0.20103092783505155, "grad_norm": 0.5143676400184631, "learning_rate": 6.6911764705882356e-06, "loss": 0.5197, "step": 273 }, { "epoch": 0.20176730486008837, "grad_norm": 0.4551714062690735, "learning_rate": 6.715686274509804e-06, "loss": 0.522, "step": 274 }, { "epoch": 0.2025036818851252, "grad_norm": 0.49370667338371277, "learning_rate": 6.740196078431373e-06, "loss": 0.4651, "step": 275 }, { "epoch": 0.203240058910162, "grad_norm": 0.42197269201278687, "learning_rate": 6.764705882352942e-06, "loss": 0.4675, "step": 276 }, { "epoch": 0.20397643593519882, "grad_norm": 0.48403897881507874, "learning_rate": 6.78921568627451e-06, "loss": 0.4831, "step": 277 }, { "epoch": 0.20471281296023564, "grad_norm": 0.5002971291542053, "learning_rate": 6.813725490196079e-06, "loss": 0.502, "step": 278 }, { "epoch": 0.20544918998527245, "grad_norm": 0.49652308225631714, "learning_rate": 6.838235294117648e-06, "loss": 0.5341, "step": 279 }, { "epoch": 0.20618556701030927, "grad_norm": 0.4934190809726715, "learning_rate": 6.862745098039216e-06, "loss": 0.5091, "step": 280 }, { "epoch": 0.20692194403534608, "grad_norm": 0.5390608310699463, "learning_rate": 6.887254901960785e-06, "loss": 0.4987, "step": 281 }, { "epoch": 0.20765832106038293, "grad_norm": 0.5566352605819702, "learning_rate": 6.911764705882353e-06, "loss": 0.5024, "step": 282 }, { "epoch": 0.20839469808541974, "grad_norm": 0.5296184420585632, "learning_rate": 6.936274509803922e-06, "loss": 0.5055, "step": 283 }, { "epoch": 0.20913107511045656, "grad_norm": 0.4719228744506836, "learning_rate": 6.96078431372549e-06, "loss": 0.506, "step": 284 }, { "epoch": 0.20986745213549338, "grad_norm": 0.49798110127449036, "learning_rate": 6.985294117647059e-06, "loss": 0.5311, "step": 285 }, { "epoch": 0.2106038291605302, "grad_norm": 0.5432643890380859, "learning_rate": 7.009803921568628e-06, "loss": 0.4817, "step": 286 }, { "epoch": 0.211340206185567, "grad_norm": 0.5015427470207214, "learning_rate": 7.034313725490197e-06, "loss": 0.4787, "step": 287 }, { "epoch": 0.21207658321060383, "grad_norm": 0.4968355596065521, "learning_rate": 7.058823529411766e-06, "loss": 0.5051, "step": 288 }, { "epoch": 0.21281296023564064, "grad_norm": 0.6130073666572571, "learning_rate": 7.083333333333335e-06, "loss": 0.4917, "step": 289 }, { "epoch": 0.21354933726067746, "grad_norm": 0.4766775369644165, "learning_rate": 7.107843137254903e-06, "loss": 0.4849, "step": 290 }, { "epoch": 0.21428571428571427, "grad_norm": 0.5826199650764465, "learning_rate": 7.132352941176472e-06, "loss": 0.4868, "step": 291 }, { "epoch": 0.21502209131075112, "grad_norm": 0.43983742594718933, "learning_rate": 7.15686274509804e-06, "loss": 0.4949, "step": 292 }, { "epoch": 0.21575846833578793, "grad_norm": 0.561597466468811, "learning_rate": 7.181372549019609e-06, "loss": 0.4796, "step": 293 }, { "epoch": 0.21649484536082475, "grad_norm": 0.5399706363677979, "learning_rate": 7.205882352941177e-06, "loss": 0.4824, "step": 294 }, { "epoch": 0.21723122238586157, "grad_norm": 0.45450007915496826, "learning_rate": 7.230392156862746e-06, "loss": 0.5083, "step": 295 }, { "epoch": 0.21796759941089838, "grad_norm": 0.5827359557151794, "learning_rate": 7.2549019607843145e-06, "loss": 0.5013, "step": 296 }, { "epoch": 0.2187039764359352, "grad_norm": 0.5125693678855896, "learning_rate": 7.2794117647058826e-06, "loss": 0.505, "step": 297 }, { "epoch": 0.21944035346097202, "grad_norm": 0.5245058536529541, "learning_rate": 7.3039215686274515e-06, "loss": 0.4918, "step": 298 }, { "epoch": 0.22017673048600883, "grad_norm": 0.5342023372650146, "learning_rate": 7.3284313725490195e-06, "loss": 0.4898, "step": 299 }, { "epoch": 0.22091310751104565, "grad_norm": 0.4772011339664459, "learning_rate": 7.352941176470589e-06, "loss": 0.4958, "step": 300 }, { "epoch": 0.22164948453608246, "grad_norm": 0.5259323716163635, "learning_rate": 7.377450980392158e-06, "loss": 0.4956, "step": 301 }, { "epoch": 0.22238586156111928, "grad_norm": 0.5168774724006653, "learning_rate": 7.401960784313726e-06, "loss": 0.4735, "step": 302 }, { "epoch": 0.22312223858615612, "grad_norm": 0.4500476121902466, "learning_rate": 7.426470588235295e-06, "loss": 0.5039, "step": 303 }, { "epoch": 0.22385861561119294, "grad_norm": 0.49780377745628357, "learning_rate": 7.450980392156863e-06, "loss": 0.4629, "step": 304 }, { "epoch": 0.22459499263622976, "grad_norm": 0.5209547281265259, "learning_rate": 7.475490196078432e-06, "loss": 0.4927, "step": 305 }, { "epoch": 0.22533136966126657, "grad_norm": 0.6013898849487305, "learning_rate": 7.500000000000001e-06, "loss": 0.5053, "step": 306 }, { "epoch": 0.2260677466863034, "grad_norm": 0.6150967478752136, "learning_rate": 7.524509803921569e-06, "loss": 0.4872, "step": 307 }, { "epoch": 0.2268041237113402, "grad_norm": 0.48117902874946594, "learning_rate": 7.549019607843138e-06, "loss": 0.5077, "step": 308 }, { "epoch": 0.22754050073637702, "grad_norm": 0.6058593392372131, "learning_rate": 7.573529411764706e-06, "loss": 0.4854, "step": 309 }, { "epoch": 0.22827687776141384, "grad_norm": 0.5079674124717712, "learning_rate": 7.598039215686275e-06, "loss": 0.4977, "step": 310 }, { "epoch": 0.22901325478645065, "grad_norm": 0.5175543427467346, "learning_rate": 7.622549019607843e-06, "loss": 0.5181, "step": 311 }, { "epoch": 0.22974963181148747, "grad_norm": 0.5423833131790161, "learning_rate": 7.647058823529411e-06, "loss": 0.5364, "step": 312 }, { "epoch": 0.23048600883652431, "grad_norm": 0.49087536334991455, "learning_rate": 7.671568627450981e-06, "loss": 0.4943, "step": 313 }, { "epoch": 0.23122238586156113, "grad_norm": 0.5285847783088684, "learning_rate": 7.69607843137255e-06, "loss": 0.5098, "step": 314 }, { "epoch": 0.23195876288659795, "grad_norm": 0.4606481194496155, "learning_rate": 7.720588235294119e-06, "loss": 0.477, "step": 315 }, { "epoch": 0.23269513991163476, "grad_norm": 0.5595932006835938, "learning_rate": 7.745098039215687e-06, "loss": 0.5141, "step": 316 }, { "epoch": 0.23343151693667158, "grad_norm": 0.4855089783668518, "learning_rate": 7.769607843137256e-06, "loss": 0.5224, "step": 317 }, { "epoch": 0.2341678939617084, "grad_norm": 0.49227041006088257, "learning_rate": 7.794117647058825e-06, "loss": 0.4859, "step": 318 }, { "epoch": 0.2349042709867452, "grad_norm": 0.46864691376686096, "learning_rate": 7.818627450980393e-06, "loss": 0.4907, "step": 319 }, { "epoch": 0.23564064801178203, "grad_norm": 0.5469391942024231, "learning_rate": 7.84313725490196e-06, "loss": 0.4839, "step": 320 }, { "epoch": 0.23637702503681884, "grad_norm": 0.5292018055915833, "learning_rate": 7.86764705882353e-06, "loss": 0.4977, "step": 321 }, { "epoch": 0.23711340206185566, "grad_norm": 0.5337371230125427, "learning_rate": 7.892156862745098e-06, "loss": 0.4871, "step": 322 }, { "epoch": 0.23784977908689248, "grad_norm": 0.5429027080535889, "learning_rate": 7.916666666666667e-06, "loss": 0.4995, "step": 323 }, { "epoch": 0.23858615611192932, "grad_norm": 0.5223814249038696, "learning_rate": 7.941176470588236e-06, "loss": 0.5147, "step": 324 }, { "epoch": 0.23932253313696614, "grad_norm": 0.4621415436267853, "learning_rate": 7.965686274509804e-06, "loss": 0.4943, "step": 325 }, { "epoch": 0.24005891016200295, "grad_norm": 0.6054308414459229, "learning_rate": 7.990196078431374e-06, "loss": 0.535, "step": 326 }, { "epoch": 0.24079528718703977, "grad_norm": 0.5081130266189575, "learning_rate": 8.014705882352942e-06, "loss": 0.4994, "step": 327 }, { "epoch": 0.24153166421207659, "grad_norm": 0.5109507441520691, "learning_rate": 8.03921568627451e-06, "loss": 0.4765, "step": 328 }, { "epoch": 0.2422680412371134, "grad_norm": 0.5108455419540405, "learning_rate": 8.06372549019608e-06, "loss": 0.4965, "step": 329 }, { "epoch": 0.24300441826215022, "grad_norm": 0.4857231378555298, "learning_rate": 8.088235294117648e-06, "loss": 0.4663, "step": 330 }, { "epoch": 0.24374079528718703, "grad_norm": 0.5149396061897278, "learning_rate": 8.112745098039216e-06, "loss": 0.517, "step": 331 }, { "epoch": 0.24447717231222385, "grad_norm": 0.5667039752006531, "learning_rate": 8.137254901960784e-06, "loss": 0.5037, "step": 332 }, { "epoch": 0.24521354933726067, "grad_norm": 0.5189935564994812, "learning_rate": 8.161764705882354e-06, "loss": 0.4786, "step": 333 }, { "epoch": 0.24594992636229748, "grad_norm": 0.5272833108901978, "learning_rate": 8.186274509803922e-06, "loss": 0.4986, "step": 334 }, { "epoch": 0.24668630338733433, "grad_norm": 0.4853070080280304, "learning_rate": 8.21078431372549e-06, "loss": 0.4876, "step": 335 }, { "epoch": 0.24742268041237114, "grad_norm": 0.6110916137695312, "learning_rate": 8.23529411764706e-06, "loss": 0.5359, "step": 336 }, { "epoch": 0.24815905743740796, "grad_norm": 0.6302662491798401, "learning_rate": 8.259803921568628e-06, "loss": 0.5306, "step": 337 }, { "epoch": 0.24889543446244478, "grad_norm": 0.5661644339561462, "learning_rate": 8.284313725490198e-06, "loss": 0.4894, "step": 338 }, { "epoch": 0.2496318114874816, "grad_norm": 0.5667375326156616, "learning_rate": 8.308823529411766e-06, "loss": 0.4911, "step": 339 }, { "epoch": 0.2503681885125184, "grad_norm": 0.5805349349975586, "learning_rate": 8.333333333333334e-06, "loss": 0.499, "step": 340 }, { "epoch": 0.2511045655375552, "grad_norm": 0.5153322219848633, "learning_rate": 8.357843137254903e-06, "loss": 0.4693, "step": 341 }, { "epoch": 0.25184094256259204, "grad_norm": 0.5583307147026062, "learning_rate": 8.382352941176472e-06, "loss": 0.4913, "step": 342 }, { "epoch": 0.25257731958762886, "grad_norm": 0.5932314395904541, "learning_rate": 8.40686274509804e-06, "loss": 0.5091, "step": 343 }, { "epoch": 0.2533136966126657, "grad_norm": 0.4746854603290558, "learning_rate": 8.43137254901961e-06, "loss": 0.4552, "step": 344 }, { "epoch": 0.2540500736377025, "grad_norm": 0.6550336480140686, "learning_rate": 8.455882352941177e-06, "loss": 0.4583, "step": 345 }, { "epoch": 0.2547864506627393, "grad_norm": 0.44437041878700256, "learning_rate": 8.480392156862745e-06, "loss": 0.4817, "step": 346 }, { "epoch": 0.2555228276877761, "grad_norm": 0.6586700677871704, "learning_rate": 8.504901960784314e-06, "loss": 0.4843, "step": 347 }, { "epoch": 0.25625920471281294, "grad_norm": 0.5380249619483948, "learning_rate": 8.529411764705883e-06, "loss": 0.4783, "step": 348 }, { "epoch": 0.25699558173784975, "grad_norm": 0.5162436962127686, "learning_rate": 8.553921568627451e-06, "loss": 0.4796, "step": 349 }, { "epoch": 0.25773195876288657, "grad_norm": 0.5282143354415894, "learning_rate": 8.57843137254902e-06, "loss": 0.4583, "step": 350 }, { "epoch": 0.25846833578792344, "grad_norm": 0.5736419558525085, "learning_rate": 8.60294117647059e-06, "loss": 0.4989, "step": 351 }, { "epoch": 0.25920471281296026, "grad_norm": 0.47756871581077576, "learning_rate": 8.627450980392157e-06, "loss": 0.4766, "step": 352 }, { "epoch": 0.2599410898379971, "grad_norm": 0.5326557159423828, "learning_rate": 8.651960784313727e-06, "loss": 0.4931, "step": 353 }, { "epoch": 0.2606774668630339, "grad_norm": 0.5259348750114441, "learning_rate": 8.676470588235295e-06, "loss": 0.4863, "step": 354 }, { "epoch": 0.2614138438880707, "grad_norm": 0.5482640862464905, "learning_rate": 8.700980392156863e-06, "loss": 0.5174, "step": 355 }, { "epoch": 0.2621502209131075, "grad_norm": 0.5132611989974976, "learning_rate": 8.725490196078433e-06, "loss": 0.4894, "step": 356 }, { "epoch": 0.26288659793814434, "grad_norm": 0.5965814590454102, "learning_rate": 8.750000000000001e-06, "loss": 0.4922, "step": 357 }, { "epoch": 0.26362297496318116, "grad_norm": 0.5167778730392456, "learning_rate": 8.774509803921569e-06, "loss": 0.4737, "step": 358 }, { "epoch": 0.26435935198821797, "grad_norm": 0.510610818862915, "learning_rate": 8.799019607843137e-06, "loss": 0.4877, "step": 359 }, { "epoch": 0.2650957290132548, "grad_norm": 0.5722100734710693, "learning_rate": 8.823529411764707e-06, "loss": 0.4597, "step": 360 }, { "epoch": 0.2658321060382916, "grad_norm": 0.662177562713623, "learning_rate": 8.848039215686275e-06, "loss": 0.5086, "step": 361 }, { "epoch": 0.2665684830633284, "grad_norm": 0.499646931886673, "learning_rate": 8.872549019607843e-06, "loss": 0.4513, "step": 362 }, { "epoch": 0.26730486008836524, "grad_norm": 0.5374613404273987, "learning_rate": 8.897058823529413e-06, "loss": 0.4636, "step": 363 }, { "epoch": 0.26804123711340205, "grad_norm": 0.6908944845199585, "learning_rate": 8.921568627450982e-06, "loss": 0.4969, "step": 364 }, { "epoch": 0.26877761413843887, "grad_norm": 0.4661368131637573, "learning_rate": 8.94607843137255e-06, "loss": 0.4593, "step": 365 }, { "epoch": 0.2695139911634757, "grad_norm": 0.5653083920478821, "learning_rate": 8.970588235294119e-06, "loss": 0.4714, "step": 366 }, { "epoch": 0.2702503681885125, "grad_norm": 0.46112319827079773, "learning_rate": 8.995098039215687e-06, "loss": 0.4775, "step": 367 }, { "epoch": 0.2709867452135493, "grad_norm": 0.5458109378814697, "learning_rate": 9.019607843137256e-06, "loss": 0.472, "step": 368 }, { "epoch": 0.27172312223858613, "grad_norm": 0.5330044031143188, "learning_rate": 9.044117647058824e-06, "loss": 0.4798, "step": 369 }, { "epoch": 0.27245949926362295, "grad_norm": 0.5421646237373352, "learning_rate": 9.068627450980392e-06, "loss": 0.5013, "step": 370 }, { "epoch": 0.27319587628865977, "grad_norm": 0.575491726398468, "learning_rate": 9.093137254901962e-06, "loss": 0.4939, "step": 371 }, { "epoch": 0.27393225331369664, "grad_norm": 0.5331813097000122, "learning_rate": 9.11764705882353e-06, "loss": 0.514, "step": 372 }, { "epoch": 0.27466863033873345, "grad_norm": 0.5526221990585327, "learning_rate": 9.142156862745098e-06, "loss": 0.5139, "step": 373 }, { "epoch": 0.27540500736377027, "grad_norm": 0.5274295806884766, "learning_rate": 9.166666666666666e-06, "loss": 0.4895, "step": 374 }, { "epoch": 0.2761413843888071, "grad_norm": 0.5230691432952881, "learning_rate": 9.191176470588236e-06, "loss": 0.4956, "step": 375 }, { "epoch": 0.2768777614138439, "grad_norm": 0.6479937434196472, "learning_rate": 9.215686274509804e-06, "loss": 0.4768, "step": 376 }, { "epoch": 0.2776141384388807, "grad_norm": 0.5493687987327576, "learning_rate": 9.240196078431374e-06, "loss": 0.4733, "step": 377 }, { "epoch": 0.27835051546391754, "grad_norm": 0.5068442225456238, "learning_rate": 9.264705882352942e-06, "loss": 0.4696, "step": 378 }, { "epoch": 0.27908689248895435, "grad_norm": 0.49230822920799255, "learning_rate": 9.28921568627451e-06, "loss": 0.4747, "step": 379 }, { "epoch": 0.27982326951399117, "grad_norm": 0.5067638158798218, "learning_rate": 9.31372549019608e-06, "loss": 0.4598, "step": 380 }, { "epoch": 0.280559646539028, "grad_norm": 0.5510022640228271, "learning_rate": 9.338235294117648e-06, "loss": 0.4844, "step": 381 }, { "epoch": 0.2812960235640648, "grad_norm": 0.5649453401565552, "learning_rate": 9.362745098039216e-06, "loss": 0.497, "step": 382 }, { "epoch": 0.2820324005891016, "grad_norm": 0.549223780632019, "learning_rate": 9.387254901960786e-06, "loss": 0.4801, "step": 383 }, { "epoch": 0.28276877761413843, "grad_norm": 0.46957284212112427, "learning_rate": 9.411764705882354e-06, "loss": 0.4752, "step": 384 }, { "epoch": 0.28350515463917525, "grad_norm": 0.4900481104850769, "learning_rate": 9.436274509803922e-06, "loss": 0.4712, "step": 385 }, { "epoch": 0.28424153166421207, "grad_norm": 0.49515727162361145, "learning_rate": 9.46078431372549e-06, "loss": 0.4827, "step": 386 }, { "epoch": 0.2849779086892489, "grad_norm": 0.44085896015167236, "learning_rate": 9.48529411764706e-06, "loss": 0.4651, "step": 387 }, { "epoch": 0.2857142857142857, "grad_norm": 0.5362505912780762, "learning_rate": 9.509803921568628e-06, "loss": 0.4951, "step": 388 }, { "epoch": 0.2864506627393225, "grad_norm": 0.593925952911377, "learning_rate": 9.534313725490198e-06, "loss": 0.471, "step": 389 }, { "epoch": 0.28718703976435933, "grad_norm": 0.6149821281433105, "learning_rate": 9.558823529411766e-06, "loss": 0.5252, "step": 390 }, { "epoch": 0.28792341678939615, "grad_norm": 0.484279066324234, "learning_rate": 9.583333333333335e-06, "loss": 0.4968, "step": 391 }, { "epoch": 0.28865979381443296, "grad_norm": 0.59568190574646, "learning_rate": 9.607843137254903e-06, "loss": 0.4778, "step": 392 }, { "epoch": 0.28939617083946984, "grad_norm": 0.587205708026886, "learning_rate": 9.632352941176471e-06, "loss": 0.5125, "step": 393 }, { "epoch": 0.29013254786450665, "grad_norm": 0.5671679377555847, "learning_rate": 9.65686274509804e-06, "loss": 0.5075, "step": 394 }, { "epoch": 0.29086892488954347, "grad_norm": 0.5437478423118591, "learning_rate": 9.68137254901961e-06, "loss": 0.4632, "step": 395 }, { "epoch": 0.2916053019145803, "grad_norm": 0.517405092716217, "learning_rate": 9.705882352941177e-06, "loss": 0.4851, "step": 396 }, { "epoch": 0.2923416789396171, "grad_norm": 0.4883304238319397, "learning_rate": 9.730392156862745e-06, "loss": 0.4827, "step": 397 }, { "epoch": 0.2930780559646539, "grad_norm": 0.5211227536201477, "learning_rate": 9.754901960784315e-06, "loss": 0.5063, "step": 398 }, { "epoch": 0.29381443298969073, "grad_norm": 0.4950783848762512, "learning_rate": 9.779411764705883e-06, "loss": 0.4588, "step": 399 }, { "epoch": 0.29455081001472755, "grad_norm": 0.5107828378677368, "learning_rate": 9.803921568627451e-06, "loss": 0.4775, "step": 400 }, { "epoch": 0.29528718703976436, "grad_norm": 0.5515358448028564, "learning_rate": 9.82843137254902e-06, "loss": 0.4581, "step": 401 }, { "epoch": 0.2960235640648012, "grad_norm": 0.6434028744697571, "learning_rate": 9.852941176470589e-06, "loss": 0.5061, "step": 402 }, { "epoch": 0.296759941089838, "grad_norm": 0.6046220064163208, "learning_rate": 9.877450980392159e-06, "loss": 0.5053, "step": 403 }, { "epoch": 0.2974963181148748, "grad_norm": 0.5827841758728027, "learning_rate": 9.901960784313727e-06, "loss": 0.4594, "step": 404 }, { "epoch": 0.29823269513991163, "grad_norm": 0.5246471762657166, "learning_rate": 9.926470588235295e-06, "loss": 0.4909, "step": 405 }, { "epoch": 0.29896907216494845, "grad_norm": 0.5674916505813599, "learning_rate": 9.950980392156863e-06, "loss": 0.4717, "step": 406 }, { "epoch": 0.29970544918998526, "grad_norm": 0.5033355951309204, "learning_rate": 9.975490196078433e-06, "loss": 0.5008, "step": 407 }, { "epoch": 0.3004418262150221, "grad_norm": 0.5390115976333618, "learning_rate": 1e-05, "loss": 0.5002, "step": 408 }, { "epoch": 0.3011782032400589, "grad_norm": 0.5051551461219788, "learning_rate": 9.999998164075549e-06, "loss": 0.4609, "step": 409 }, { "epoch": 0.3019145802650957, "grad_norm": 0.5789138078689575, "learning_rate": 9.999992656303539e-06, "loss": 0.5152, "step": 410 }, { "epoch": 0.3026509572901325, "grad_norm": 0.4873914420604706, "learning_rate": 9.999983476688016e-06, "loss": 0.4836, "step": 411 }, { "epoch": 0.30338733431516934, "grad_norm": 0.5575588345527649, "learning_rate": 9.999970625235724e-06, "loss": 0.472, "step": 412 }, { "epoch": 0.30412371134020616, "grad_norm": 0.5004022121429443, "learning_rate": 9.999954101956097e-06, "loss": 0.5051, "step": 413 }, { "epoch": 0.30486008836524303, "grad_norm": 0.5742695927619934, "learning_rate": 9.999933906861272e-06, "loss": 0.4613, "step": 414 }, { "epoch": 0.30559646539027985, "grad_norm": 0.5978338122367859, "learning_rate": 9.999910039966079e-06, "loss": 0.4931, "step": 415 }, { "epoch": 0.30633284241531666, "grad_norm": 0.5001004338264465, "learning_rate": 9.999882501288043e-06, "loss": 0.475, "step": 416 }, { "epoch": 0.3070692194403535, "grad_norm": 0.608790397644043, "learning_rate": 9.99985129084739e-06, "loss": 0.4743, "step": 417 }, { "epoch": 0.3078055964653903, "grad_norm": 0.5592244863510132, "learning_rate": 9.99981640866704e-06, "loss": 0.4767, "step": 418 }, { "epoch": 0.3085419734904271, "grad_norm": 0.6118118166923523, "learning_rate": 9.999777854772608e-06, "loss": 0.5155, "step": 419 }, { "epoch": 0.30927835051546393, "grad_norm": 0.691983699798584, "learning_rate": 9.999735629192408e-06, "loss": 0.4758, "step": 420 }, { "epoch": 0.31001472754050075, "grad_norm": 0.5400111675262451, "learning_rate": 9.99968973195745e-06, "loss": 0.4831, "step": 421 }, { "epoch": 0.31075110456553756, "grad_norm": 0.6896957159042358, "learning_rate": 9.999640163101436e-06, "loss": 0.4969, "step": 422 }, { "epoch": 0.3114874815905744, "grad_norm": 0.5258437991142273, "learning_rate": 9.99958692266077e-06, "loss": 0.4701, "step": 423 }, { "epoch": 0.3122238586156112, "grad_norm": 0.6300147175788879, "learning_rate": 9.999530010674552e-06, "loss": 0.5038, "step": 424 }, { "epoch": 0.312960235640648, "grad_norm": 0.5327275991439819, "learning_rate": 9.999469427184573e-06, "loss": 0.4895, "step": 425 }, { "epoch": 0.3136966126656848, "grad_norm": 0.5516902804374695, "learning_rate": 9.999405172235325e-06, "loss": 0.4983, "step": 426 }, { "epoch": 0.31443298969072164, "grad_norm": 0.5630642771720886, "learning_rate": 9.999337245873999e-06, "loss": 0.4625, "step": 427 }, { "epoch": 0.31516936671575846, "grad_norm": 0.5986700654029846, "learning_rate": 9.999265648150472e-06, "loss": 0.524, "step": 428 }, { "epoch": 0.3159057437407953, "grad_norm": 0.5675637125968933, "learning_rate": 9.999190379117324e-06, "loss": 0.484, "step": 429 }, { "epoch": 0.3166421207658321, "grad_norm": 0.6256847977638245, "learning_rate": 9.999111438829834e-06, "loss": 0.5181, "step": 430 }, { "epoch": 0.3173784977908689, "grad_norm": 0.6200090050697327, "learning_rate": 9.999028827345969e-06, "loss": 0.4901, "step": 431 }, { "epoch": 0.3181148748159057, "grad_norm": 0.5858426690101624, "learning_rate": 9.9989425447264e-06, "loss": 0.4901, "step": 432 }, { "epoch": 0.31885125184094254, "grad_norm": 0.5074707269668579, "learning_rate": 9.998852591034488e-06, "loss": 0.4749, "step": 433 }, { "epoch": 0.31958762886597936, "grad_norm": 0.5885580778121948, "learning_rate": 9.998758966336296e-06, "loss": 0.4836, "step": 434 }, { "epoch": 0.32032400589101623, "grad_norm": 0.5307214856147766, "learning_rate": 9.998661670700576e-06, "loss": 0.475, "step": 435 }, { "epoch": 0.32106038291605304, "grad_norm": 0.6851618885993958, "learning_rate": 9.998560704198776e-06, "loss": 0.4521, "step": 436 }, { "epoch": 0.32179675994108986, "grad_norm": 0.6349673271179199, "learning_rate": 9.99845606690505e-06, "loss": 0.4875, "step": 437 }, { "epoch": 0.3225331369661267, "grad_norm": 0.5425810217857361, "learning_rate": 9.998347758896234e-06, "loss": 0.5036, "step": 438 }, { "epoch": 0.3232695139911635, "grad_norm": 0.6161746382713318, "learning_rate": 9.99823578025187e-06, "loss": 0.4791, "step": 439 }, { "epoch": 0.3240058910162003, "grad_norm": 0.51716548204422, "learning_rate": 9.99812013105419e-06, "loss": 0.4881, "step": 440 }, { "epoch": 0.3247422680412371, "grad_norm": 0.6496948003768921, "learning_rate": 9.998000811388122e-06, "loss": 0.4804, "step": 441 }, { "epoch": 0.32547864506627394, "grad_norm": 0.6212929487228394, "learning_rate": 9.997877821341294e-06, "loss": 0.4871, "step": 442 }, { "epoch": 0.32621502209131076, "grad_norm": 0.6089105010032654, "learning_rate": 9.997751161004026e-06, "loss": 0.4518, "step": 443 }, { "epoch": 0.3269513991163476, "grad_norm": 0.6576497554779053, "learning_rate": 9.99762083046933e-06, "loss": 0.4673, "step": 444 }, { "epoch": 0.3276877761413844, "grad_norm": 0.6152659058570862, "learning_rate": 9.99748682983292e-06, "loss": 0.4811, "step": 445 }, { "epoch": 0.3284241531664212, "grad_norm": 0.6389955878257751, "learning_rate": 9.9973491591932e-06, "loss": 0.4737, "step": 446 }, { "epoch": 0.329160530191458, "grad_norm": 0.645021915435791, "learning_rate": 9.997207818651273e-06, "loss": 0.491, "step": 447 }, { "epoch": 0.32989690721649484, "grad_norm": 0.5410696864128113, "learning_rate": 9.997062808310935e-06, "loss": 0.4895, "step": 448 }, { "epoch": 0.33063328424153166, "grad_norm": 0.693676769733429, "learning_rate": 9.996914128278677e-06, "loss": 0.5014, "step": 449 }, { "epoch": 0.33136966126656847, "grad_norm": 0.5651475191116333, "learning_rate": 9.996761778663682e-06, "loss": 0.477, "step": 450 }, { "epoch": 0.3321060382916053, "grad_norm": 0.6107382774353027, "learning_rate": 9.996605759577836e-06, "loss": 0.4553, "step": 451 }, { "epoch": 0.3328424153166421, "grad_norm": 0.540756344795227, "learning_rate": 9.996446071135711e-06, "loss": 0.4961, "step": 452 }, { "epoch": 0.3335787923416789, "grad_norm": 0.5446950793266296, "learning_rate": 9.99628271345458e-06, "loss": 0.4891, "step": 453 }, { "epoch": 0.33431516936671574, "grad_norm": 0.4631234407424927, "learning_rate": 9.996115686654406e-06, "loss": 0.4734, "step": 454 }, { "epoch": 0.33505154639175255, "grad_norm": 0.4855410158634186, "learning_rate": 9.995944990857848e-06, "loss": 0.4658, "step": 455 }, { "epoch": 0.33578792341678937, "grad_norm": 0.5123491287231445, "learning_rate": 9.995770626190263e-06, "loss": 0.4627, "step": 456 }, { "epoch": 0.33652430044182624, "grad_norm": 0.512749969959259, "learning_rate": 9.995592592779695e-06, "loss": 0.4798, "step": 457 }, { "epoch": 0.33726067746686306, "grad_norm": 0.5249326229095459, "learning_rate": 9.995410890756891e-06, "loss": 0.5012, "step": 458 }, { "epoch": 0.3379970544918999, "grad_norm": 0.47508466243743896, "learning_rate": 9.995225520255282e-06, "loss": 0.5039, "step": 459 }, { "epoch": 0.3387334315169367, "grad_norm": 0.4680541455745697, "learning_rate": 9.995036481411005e-06, "loss": 0.4666, "step": 460 }, { "epoch": 0.3394698085419735, "grad_norm": 0.4403444528579712, "learning_rate": 9.994843774362878e-06, "loss": 0.4613, "step": 461 }, { "epoch": 0.3402061855670103, "grad_norm": 0.48063525557518005, "learning_rate": 9.994647399252423e-06, "loss": 0.4709, "step": 462 }, { "epoch": 0.34094256259204714, "grad_norm": 0.4822491705417633, "learning_rate": 9.99444735622385e-06, "loss": 0.4947, "step": 463 }, { "epoch": 0.34167893961708395, "grad_norm": 0.533244788646698, "learning_rate": 9.994243645424067e-06, "loss": 0.4927, "step": 464 }, { "epoch": 0.34241531664212077, "grad_norm": 0.49757879972457886, "learning_rate": 9.99403626700267e-06, "loss": 0.4851, "step": 465 }, { "epoch": 0.3431516936671576, "grad_norm": 0.583299994468689, "learning_rate": 9.993825221111955e-06, "loss": 0.5015, "step": 466 }, { "epoch": 0.3438880706921944, "grad_norm": 0.46042028069496155, "learning_rate": 9.993610507906904e-06, "loss": 0.5083, "step": 467 }, { "epoch": 0.3446244477172312, "grad_norm": 0.5615798830986023, "learning_rate": 9.993392127545198e-06, "loss": 0.4744, "step": 468 }, { "epoch": 0.34536082474226804, "grad_norm": 0.4136447310447693, "learning_rate": 9.99317008018721e-06, "loss": 0.4552, "step": 469 }, { "epoch": 0.34609720176730485, "grad_norm": 0.5388152599334717, "learning_rate": 9.992944365996002e-06, "loss": 0.4907, "step": 470 }, { "epoch": 0.34683357879234167, "grad_norm": 0.5055995583534241, "learning_rate": 9.992714985137336e-06, "loss": 0.4649, "step": 471 }, { "epoch": 0.3475699558173785, "grad_norm": 0.599743664264679, "learning_rate": 9.992481937779655e-06, "loss": 0.4747, "step": 472 }, { "epoch": 0.3483063328424153, "grad_norm": 0.5145841836929321, "learning_rate": 9.99224522409411e-06, "loss": 0.5214, "step": 473 }, { "epoch": 0.3490427098674521, "grad_norm": 0.5827757716178894, "learning_rate": 9.99200484425453e-06, "loss": 0.5151, "step": 474 }, { "epoch": 0.34977908689248893, "grad_norm": 0.502983808517456, "learning_rate": 9.991760798437448e-06, "loss": 0.4808, "step": 475 }, { "epoch": 0.35051546391752575, "grad_norm": 0.5331257581710815, "learning_rate": 9.99151308682208e-06, "loss": 0.4544, "step": 476 }, { "epoch": 0.35125184094256257, "grad_norm": 0.5600625276565552, "learning_rate": 9.99126170959034e-06, "loss": 0.4814, "step": 477 }, { "epoch": 0.35198821796759944, "grad_norm": 0.5749978423118591, "learning_rate": 9.991006666926832e-06, "loss": 0.4766, "step": 478 }, { "epoch": 0.35272459499263625, "grad_norm": 0.5478549599647522, "learning_rate": 9.990747959018849e-06, "loss": 0.4763, "step": 479 }, { "epoch": 0.35346097201767307, "grad_norm": 0.467970609664917, "learning_rate": 9.990485586056381e-06, "loss": 0.4753, "step": 480 }, { "epoch": 0.3541973490427099, "grad_norm": 0.4842618405818939, "learning_rate": 9.990219548232106e-06, "loss": 0.4721, "step": 481 }, { "epoch": 0.3549337260677467, "grad_norm": 0.4819396138191223, "learning_rate": 9.989949845741393e-06, "loss": 0.449, "step": 482 }, { "epoch": 0.3556701030927835, "grad_norm": 0.5315747857093811, "learning_rate": 9.989676478782305e-06, "loss": 0.4866, "step": 483 }, { "epoch": 0.35640648011782033, "grad_norm": 0.5199954509735107, "learning_rate": 9.989399447555594e-06, "loss": 0.4858, "step": 484 }, { "epoch": 0.35714285714285715, "grad_norm": 0.6086426377296448, "learning_rate": 9.989118752264704e-06, "loss": 0.5055, "step": 485 }, { "epoch": 0.35787923416789397, "grad_norm": 0.5935198664665222, "learning_rate": 9.988834393115768e-06, "loss": 0.4878, "step": 486 }, { "epoch": 0.3586156111929308, "grad_norm": 0.5054832696914673, "learning_rate": 9.988546370317609e-06, "loss": 0.4357, "step": 487 }, { "epoch": 0.3593519882179676, "grad_norm": 0.47322750091552734, "learning_rate": 9.988254684081746e-06, "loss": 0.4607, "step": 488 }, { "epoch": 0.3600883652430044, "grad_norm": 0.4437432289123535, "learning_rate": 9.987959334622381e-06, "loss": 0.4772, "step": 489 }, { "epoch": 0.36082474226804123, "grad_norm": 0.4980893135070801, "learning_rate": 9.987660322156413e-06, "loss": 0.4725, "step": 490 }, { "epoch": 0.36156111929307805, "grad_norm": 0.5222845077514648, "learning_rate": 9.987357646903427e-06, "loss": 0.4915, "step": 491 }, { "epoch": 0.36229749631811486, "grad_norm": 0.45025306940078735, "learning_rate": 9.987051309085698e-06, "loss": 0.4995, "step": 492 }, { "epoch": 0.3630338733431517, "grad_norm": 0.4636288583278656, "learning_rate": 9.986741308928189e-06, "loss": 0.4748, "step": 493 }, { "epoch": 0.3637702503681885, "grad_norm": 0.49730953574180603, "learning_rate": 9.986427646658559e-06, "loss": 0.4876, "step": 494 }, { "epoch": 0.3645066273932253, "grad_norm": 0.5664610862731934, "learning_rate": 9.986110322507149e-06, "loss": 0.4727, "step": 495 }, { "epoch": 0.36524300441826213, "grad_norm": 0.48833712935447693, "learning_rate": 9.985789336706993e-06, "loss": 0.5076, "step": 496 }, { "epoch": 0.36597938144329895, "grad_norm": 0.5941821336746216, "learning_rate": 9.985464689493814e-06, "loss": 0.5057, "step": 497 }, { "epoch": 0.36671575846833576, "grad_norm": 0.4826650023460388, "learning_rate": 9.985136381106022e-06, "loss": 0.5201, "step": 498 }, { "epoch": 0.36745213549337263, "grad_norm": 0.49603334069252014, "learning_rate": 9.984804411784717e-06, "loss": 0.4921, "step": 499 }, { "epoch": 0.36818851251840945, "grad_norm": 0.4792011082172394, "learning_rate": 9.984468781773688e-06, "loss": 0.459, "step": 500 }, { "epoch": 0.36892488954344627, "grad_norm": 0.4913840591907501, "learning_rate": 9.98412949131941e-06, "loss": 0.4734, "step": 501 }, { "epoch": 0.3696612665684831, "grad_norm": 0.5027479529380798, "learning_rate": 9.983786540671052e-06, "loss": 0.4432, "step": 502 }, { "epoch": 0.3703976435935199, "grad_norm": 0.4886789321899414, "learning_rate": 9.98343993008046e-06, "loss": 0.4696, "step": 503 }, { "epoch": 0.3711340206185567, "grad_norm": 0.4682316482067108, "learning_rate": 9.983089659802178e-06, "loss": 0.4516, "step": 504 }, { "epoch": 0.37187039764359353, "grad_norm": 0.465826153755188, "learning_rate": 9.982735730093436e-06, "loss": 0.4827, "step": 505 }, { "epoch": 0.37260677466863035, "grad_norm": 0.5015543699264526, "learning_rate": 9.982378141214144e-06, "loss": 0.4415, "step": 506 }, { "epoch": 0.37334315169366716, "grad_norm": 0.5372326970100403, "learning_rate": 9.98201689342691e-06, "loss": 0.4782, "step": 507 }, { "epoch": 0.374079528718704, "grad_norm": 0.4749949276447296, "learning_rate": 9.98165198699702e-06, "loss": 0.4636, "step": 508 }, { "epoch": 0.3748159057437408, "grad_norm": 0.459778755903244, "learning_rate": 9.98128342219245e-06, "loss": 0.4608, "step": 509 }, { "epoch": 0.3755522827687776, "grad_norm": 0.5350815653800964, "learning_rate": 9.980911199283864e-06, "loss": 0.4877, "step": 510 }, { "epoch": 0.37628865979381443, "grad_norm": 0.47353699803352356, "learning_rate": 9.98053531854461e-06, "loss": 0.4867, "step": 511 }, { "epoch": 0.37702503681885124, "grad_norm": 0.4845249652862549, "learning_rate": 9.980155780250728e-06, "loss": 0.4473, "step": 512 }, { "epoch": 0.37776141384388806, "grad_norm": 0.47789323329925537, "learning_rate": 9.979772584680933e-06, "loss": 0.4566, "step": 513 }, { "epoch": 0.3784977908689249, "grad_norm": 0.5107650756835938, "learning_rate": 9.979385732116638e-06, "loss": 0.4341, "step": 514 }, { "epoch": 0.3792341678939617, "grad_norm": 0.5246970057487488, "learning_rate": 9.978995222841932e-06, "loss": 0.4741, "step": 515 }, { "epoch": 0.3799705449189985, "grad_norm": 0.5087926387786865, "learning_rate": 9.978601057143593e-06, "loss": 0.4582, "step": 516 }, { "epoch": 0.3807069219440353, "grad_norm": 0.5556607246398926, "learning_rate": 9.978203235311088e-06, "loss": 0.4683, "step": 517 }, { "epoch": 0.38144329896907214, "grad_norm": 0.5219265818595886, "learning_rate": 9.97780175763656e-06, "loss": 0.4449, "step": 518 }, { "epoch": 0.38217967599410896, "grad_norm": 0.5306047201156616, "learning_rate": 9.977396624414848e-06, "loss": 0.439, "step": 519 }, { "epoch": 0.38291605301914583, "grad_norm": 0.638886570930481, "learning_rate": 9.976987835943465e-06, "loss": 0.4803, "step": 520 }, { "epoch": 0.38365243004418265, "grad_norm": 0.513490617275238, "learning_rate": 9.976575392522617e-06, "loss": 0.4631, "step": 521 }, { "epoch": 0.38438880706921946, "grad_norm": 0.5516048073768616, "learning_rate": 9.976159294455186e-06, "loss": 0.4958, "step": 522 }, { "epoch": 0.3851251840942563, "grad_norm": 0.48548823595046997, "learning_rate": 9.975739542046742e-06, "loss": 0.5089, "step": 523 }, { "epoch": 0.3858615611192931, "grad_norm": 0.5853198170661926, "learning_rate": 9.975316135605543e-06, "loss": 0.4525, "step": 524 }, { "epoch": 0.3865979381443299, "grad_norm": 0.513939619064331, "learning_rate": 9.97488907544252e-06, "loss": 0.466, "step": 525 }, { "epoch": 0.3873343151693667, "grad_norm": 0.5338413119316101, "learning_rate": 9.974458361871299e-06, "loss": 0.478, "step": 526 }, { "epoch": 0.38807069219440354, "grad_norm": 0.4851873815059662, "learning_rate": 9.974023995208177e-06, "loss": 0.4578, "step": 527 }, { "epoch": 0.38880706921944036, "grad_norm": 0.4788038730621338, "learning_rate": 9.973585975772144e-06, "loss": 0.4351, "step": 528 }, { "epoch": 0.3895434462444772, "grad_norm": 0.5574104189872742, "learning_rate": 9.973144303884867e-06, "loss": 0.4512, "step": 529 }, { "epoch": 0.390279823269514, "grad_norm": 0.5594552755355835, "learning_rate": 9.972698979870698e-06, "loss": 0.4996, "step": 530 }, { "epoch": 0.3910162002945508, "grad_norm": 0.5972340703010559, "learning_rate": 9.972250004056665e-06, "loss": 0.4908, "step": 531 }, { "epoch": 0.3917525773195876, "grad_norm": 0.5098032355308533, "learning_rate": 9.971797376772488e-06, "loss": 0.4873, "step": 532 }, { "epoch": 0.39248895434462444, "grad_norm": 0.5212668776512146, "learning_rate": 9.971341098350557e-06, "loss": 0.4603, "step": 533 }, { "epoch": 0.39322533136966126, "grad_norm": 0.49205219745635986, "learning_rate": 9.970881169125955e-06, "loss": 0.4892, "step": 534 }, { "epoch": 0.3939617083946981, "grad_norm": 0.4715671241283417, "learning_rate": 9.970417589436435e-06, "loss": 0.4701, "step": 535 }, { "epoch": 0.3946980854197349, "grad_norm": 0.49158528447151184, "learning_rate": 9.969950359622438e-06, "loss": 0.449, "step": 536 }, { "epoch": 0.3954344624447717, "grad_norm": 0.5873647928237915, "learning_rate": 9.969479480027086e-06, "loss": 0.4757, "step": 537 }, { "epoch": 0.3961708394698085, "grad_norm": 0.4452936053276062, "learning_rate": 9.969004950996175e-06, "loss": 0.4569, "step": 538 }, { "epoch": 0.39690721649484534, "grad_norm": 0.5542396306991577, "learning_rate": 9.968526772878185e-06, "loss": 0.4877, "step": 539 }, { "epoch": 0.39764359351988215, "grad_norm": 0.5141607522964478, "learning_rate": 9.968044946024277e-06, "loss": 0.4906, "step": 540 }, { "epoch": 0.39837997054491897, "grad_norm": 0.47367358207702637, "learning_rate": 9.967559470788292e-06, "loss": 0.4683, "step": 541 }, { "epoch": 0.39911634756995584, "grad_norm": 0.5380948781967163, "learning_rate": 9.967070347526743e-06, "loss": 0.4878, "step": 542 }, { "epoch": 0.39985272459499266, "grad_norm": 0.46736347675323486, "learning_rate": 9.966577576598833e-06, "loss": 0.4865, "step": 543 }, { "epoch": 0.4005891016200295, "grad_norm": 0.46713143587112427, "learning_rate": 9.966081158366434e-06, "loss": 0.4388, "step": 544 }, { "epoch": 0.4013254786450663, "grad_norm": 0.5499324798583984, "learning_rate": 9.965581093194103e-06, "loss": 0.4916, "step": 545 }, { "epoch": 0.4020618556701031, "grad_norm": 0.45428466796875, "learning_rate": 9.965077381449073e-06, "loss": 0.496, "step": 546 }, { "epoch": 0.4027982326951399, "grad_norm": 0.58051598072052, "learning_rate": 9.96457002350125e-06, "loss": 0.4618, "step": 547 }, { "epoch": 0.40353460972017674, "grad_norm": 0.4578842520713806, "learning_rate": 9.96405901972323e-06, "loss": 0.4808, "step": 548 }, { "epoch": 0.40427098674521356, "grad_norm": 0.5422972440719604, "learning_rate": 9.96354437049027e-06, "loss": 0.4721, "step": 549 }, { "epoch": 0.4050073637702504, "grad_norm": 0.5694018602371216, "learning_rate": 9.96302607618032e-06, "loss": 0.4825, "step": 550 }, { "epoch": 0.4057437407952872, "grad_norm": 0.5665662884712219, "learning_rate": 9.962504137173997e-06, "loss": 0.4877, "step": 551 }, { "epoch": 0.406480117820324, "grad_norm": 0.5937358140945435, "learning_rate": 9.961978553854597e-06, "loss": 0.454, "step": 552 }, { "epoch": 0.4072164948453608, "grad_norm": 0.5692036747932434, "learning_rate": 9.961449326608093e-06, "loss": 0.45, "step": 553 }, { "epoch": 0.40795287187039764, "grad_norm": 0.5986254811286926, "learning_rate": 9.960916455823134e-06, "loss": 0.4764, "step": 554 }, { "epoch": 0.40868924889543445, "grad_norm": 0.540023922920227, "learning_rate": 9.960379941891043e-06, "loss": 0.4402, "step": 555 }, { "epoch": 0.40942562592047127, "grad_norm": 0.620135486125946, "learning_rate": 9.959839785205821e-06, "loss": 0.4795, "step": 556 }, { "epoch": 0.4101620029455081, "grad_norm": 0.5036307573318481, "learning_rate": 9.959295986164139e-06, "loss": 0.4725, "step": 557 }, { "epoch": 0.4108983799705449, "grad_norm": 0.576543390750885, "learning_rate": 9.958748545165353e-06, "loss": 0.4593, "step": 558 }, { "epoch": 0.4116347569955817, "grad_norm": 0.45533323287963867, "learning_rate": 9.95819746261148e-06, "loss": 0.4724, "step": 559 }, { "epoch": 0.41237113402061853, "grad_norm": 0.5245439410209656, "learning_rate": 9.957642738907226e-06, "loss": 0.4688, "step": 560 }, { "epoch": 0.41310751104565535, "grad_norm": 0.4937553107738495, "learning_rate": 9.957084374459957e-06, "loss": 0.48, "step": 561 }, { "epoch": 0.41384388807069217, "grad_norm": 0.5153213739395142, "learning_rate": 9.956522369679722e-06, "loss": 0.4745, "step": 562 }, { "epoch": 0.41458026509572904, "grad_norm": 0.5216047763824463, "learning_rate": 9.955956724979239e-06, "loss": 0.4751, "step": 563 }, { "epoch": 0.41531664212076586, "grad_norm": 0.6182007789611816, "learning_rate": 9.955387440773902e-06, "loss": 0.4892, "step": 564 }, { "epoch": 0.41605301914580267, "grad_norm": 0.5679191946983337, "learning_rate": 9.954814517481774e-06, "loss": 0.4579, "step": 565 }, { "epoch": 0.4167893961708395, "grad_norm": 0.6047842502593994, "learning_rate": 9.954237955523593e-06, "loss": 0.5028, "step": 566 }, { "epoch": 0.4175257731958763, "grad_norm": 0.6306226253509521, "learning_rate": 9.953657755322772e-06, "loss": 0.4831, "step": 567 }, { "epoch": 0.4182621502209131, "grad_norm": 0.47558221220970154, "learning_rate": 9.953073917305386e-06, "loss": 0.4821, "step": 568 }, { "epoch": 0.41899852724594994, "grad_norm": 0.725439190864563, "learning_rate": 9.952486441900196e-06, "loss": 0.4881, "step": 569 }, { "epoch": 0.41973490427098675, "grad_norm": 0.5623874664306641, "learning_rate": 9.95189532953862e-06, "loss": 0.4738, "step": 570 }, { "epoch": 0.42047128129602357, "grad_norm": 0.6907193064689636, "learning_rate": 9.951300580654756e-06, "loss": 0.4706, "step": 571 }, { "epoch": 0.4212076583210604, "grad_norm": 0.6095152497291565, "learning_rate": 9.950702195685366e-06, "loss": 0.4737, "step": 572 }, { "epoch": 0.4219440353460972, "grad_norm": 0.6079631447792053, "learning_rate": 9.95010017506989e-06, "loss": 0.4676, "step": 573 }, { "epoch": 0.422680412371134, "grad_norm": 0.825157880783081, "learning_rate": 9.949494519250433e-06, "loss": 0.4791, "step": 574 }, { "epoch": 0.42341678939617083, "grad_norm": 0.6472348570823669, "learning_rate": 9.94888522867177e-06, "loss": 0.4969, "step": 575 }, { "epoch": 0.42415316642120765, "grad_norm": 0.6898149847984314, "learning_rate": 9.948272303781346e-06, "loss": 0.4648, "step": 576 }, { "epoch": 0.42488954344624447, "grad_norm": 0.5716066360473633, "learning_rate": 9.94765574502927e-06, "loss": 0.4745, "step": 577 }, { "epoch": 0.4256259204712813, "grad_norm": 0.6825960874557495, "learning_rate": 9.94703555286833e-06, "loss": 0.4888, "step": 578 }, { "epoch": 0.4263622974963181, "grad_norm": 0.5997359156608582, "learning_rate": 9.946411727753975e-06, "loss": 0.4807, "step": 579 }, { "epoch": 0.4270986745213549, "grad_norm": 0.5727107524871826, "learning_rate": 9.945784270144321e-06, "loss": 0.4724, "step": 580 }, { "epoch": 0.42783505154639173, "grad_norm": 0.5531638264656067, "learning_rate": 9.945153180500157e-06, "loss": 0.4379, "step": 581 }, { "epoch": 0.42857142857142855, "grad_norm": 0.5092889070510864, "learning_rate": 9.944518459284934e-06, "loss": 0.4745, "step": 582 }, { "epoch": 0.42930780559646536, "grad_norm": 0.5107025504112244, "learning_rate": 9.943880106964772e-06, "loss": 0.494, "step": 583 }, { "epoch": 0.43004418262150224, "grad_norm": 0.5222765803337097, "learning_rate": 9.943238124008458e-06, "loss": 0.4561, "step": 584 }, { "epoch": 0.43078055964653905, "grad_norm": 0.550485372543335, "learning_rate": 9.942592510887448e-06, "loss": 0.4548, "step": 585 }, { "epoch": 0.43151693667157587, "grad_norm": 0.5831854939460754, "learning_rate": 9.941943268075855e-06, "loss": 0.4955, "step": 586 }, { "epoch": 0.4322533136966127, "grad_norm": 0.4990431070327759, "learning_rate": 9.941290396050467e-06, "loss": 0.49, "step": 587 }, { "epoch": 0.4329896907216495, "grad_norm": 0.6468625664710999, "learning_rate": 9.940633895290733e-06, "loss": 0.4799, "step": 588 }, { "epoch": 0.4337260677466863, "grad_norm": 0.5194735527038574, "learning_rate": 9.939973766278768e-06, "loss": 0.4453, "step": 589 }, { "epoch": 0.43446244477172313, "grad_norm": 0.5722482204437256, "learning_rate": 9.939310009499348e-06, "loss": 0.4798, "step": 590 }, { "epoch": 0.43519882179675995, "grad_norm": 0.5742457509040833, "learning_rate": 9.938642625439918e-06, "loss": 0.4668, "step": 591 }, { "epoch": 0.43593519882179677, "grad_norm": 0.5003108382225037, "learning_rate": 9.937971614590587e-06, "loss": 0.4648, "step": 592 }, { "epoch": 0.4366715758468336, "grad_norm": 0.5724197030067444, "learning_rate": 9.93729697744412e-06, "loss": 0.4841, "step": 593 }, { "epoch": 0.4374079528718704, "grad_norm": 0.5018848180770874, "learning_rate": 9.936618714495954e-06, "loss": 0.4589, "step": 594 }, { "epoch": 0.4381443298969072, "grad_norm": 0.5396544337272644, "learning_rate": 9.935936826244183e-06, "loss": 0.4753, "step": 595 }, { "epoch": 0.43888070692194403, "grad_norm": 0.5358166694641113, "learning_rate": 9.935251313189564e-06, "loss": 0.4915, "step": 596 }, { "epoch": 0.43961708394698085, "grad_norm": 0.4927530884742737, "learning_rate": 9.934562175835521e-06, "loss": 0.4586, "step": 597 }, { "epoch": 0.44035346097201766, "grad_norm": 0.47724655270576477, "learning_rate": 9.933869414688134e-06, "loss": 0.4734, "step": 598 }, { "epoch": 0.4410898379970545, "grad_norm": 0.46607303619384766, "learning_rate": 9.933173030256142e-06, "loss": 0.4718, "step": 599 }, { "epoch": 0.4418262150220913, "grad_norm": 0.5244195461273193, "learning_rate": 9.932473023050954e-06, "loss": 0.4441, "step": 600 }, { "epoch": 0.4425625920471281, "grad_norm": 0.5027062892913818, "learning_rate": 9.931769393586633e-06, "loss": 0.4456, "step": 601 }, { "epoch": 0.44329896907216493, "grad_norm": 0.4999453127384186, "learning_rate": 9.9310621423799e-06, "loss": 0.4858, "step": 602 }, { "epoch": 0.44403534609720174, "grad_norm": 0.5346736311912537, "learning_rate": 9.930351269950144e-06, "loss": 0.4985, "step": 603 }, { "epoch": 0.44477172312223856, "grad_norm": 0.5561801791191101, "learning_rate": 9.929636776819404e-06, "loss": 0.4554, "step": 604 }, { "epoch": 0.44550810014727543, "grad_norm": 0.5683428049087524, "learning_rate": 9.928918663512382e-06, "loss": 0.4764, "step": 605 }, { "epoch": 0.44624447717231225, "grad_norm": 0.5989724397659302, "learning_rate": 9.928196930556442e-06, "loss": 0.473, "step": 606 }, { "epoch": 0.44698085419734906, "grad_norm": 0.5784966349601746, "learning_rate": 9.9274715784816e-06, "loss": 0.4653, "step": 607 }, { "epoch": 0.4477172312223859, "grad_norm": 0.5616557598114014, "learning_rate": 9.926742607820535e-06, "loss": 0.4722, "step": 608 }, { "epoch": 0.4484536082474227, "grad_norm": 0.6208142638206482, "learning_rate": 9.926010019108579e-06, "loss": 0.4639, "step": 609 }, { "epoch": 0.4491899852724595, "grad_norm": 0.5640535354614258, "learning_rate": 9.925273812883724e-06, "loss": 0.4703, "step": 610 }, { "epoch": 0.44992636229749633, "grad_norm": 0.7158654928207397, "learning_rate": 9.924533989686618e-06, "loss": 0.4697, "step": 611 }, { "epoch": 0.45066273932253315, "grad_norm": 0.6342428922653198, "learning_rate": 9.923790550060564e-06, "loss": 0.4662, "step": 612 }, { "epoch": 0.45139911634756996, "grad_norm": 0.4764558970928192, "learning_rate": 9.923043494551522e-06, "loss": 0.4519, "step": 613 }, { "epoch": 0.4521354933726068, "grad_norm": 0.6603193879127502, "learning_rate": 9.922292823708106e-06, "loss": 0.5068, "step": 614 }, { "epoch": 0.4528718703976436, "grad_norm": 0.522085428237915, "learning_rate": 9.921538538081588e-06, "loss": 0.4602, "step": 615 }, { "epoch": 0.4536082474226804, "grad_norm": 0.53127121925354, "learning_rate": 9.92078063822589e-06, "loss": 0.461, "step": 616 }, { "epoch": 0.4543446244477172, "grad_norm": 0.6258383393287659, "learning_rate": 9.920019124697594e-06, "loss": 0.489, "step": 617 }, { "epoch": 0.45508100147275404, "grad_norm": 0.49485811591148376, "learning_rate": 9.919253998055928e-06, "loss": 0.447, "step": 618 }, { "epoch": 0.45581737849779086, "grad_norm": 0.5272852182388306, "learning_rate": 9.918485258862781e-06, "loss": 0.478, "step": 619 }, { "epoch": 0.4565537555228277, "grad_norm": 0.4806794822216034, "learning_rate": 9.917712907682694e-06, "loss": 0.4353, "step": 620 }, { "epoch": 0.4572901325478645, "grad_norm": 0.501160204410553, "learning_rate": 9.916936945082854e-06, "loss": 0.4682, "step": 621 }, { "epoch": 0.4580265095729013, "grad_norm": 0.5172646641731262, "learning_rate": 9.916157371633106e-06, "loss": 0.4505, "step": 622 }, { "epoch": 0.4587628865979381, "grad_norm": 0.5966675281524658, "learning_rate": 9.915374187905945e-06, "loss": 0.5016, "step": 623 }, { "epoch": 0.45949926362297494, "grad_norm": 0.4914330840110779, "learning_rate": 9.91458739447652e-06, "loss": 0.4455, "step": 624 }, { "epoch": 0.46023564064801176, "grad_norm": 0.6271175146102905, "learning_rate": 9.913796991922624e-06, "loss": 0.4604, "step": 625 }, { "epoch": 0.46097201767304863, "grad_norm": 0.5928018689155579, "learning_rate": 9.913002980824709e-06, "loss": 0.4519, "step": 626 }, { "epoch": 0.46170839469808544, "grad_norm": 0.4849533140659332, "learning_rate": 9.912205361765868e-06, "loss": 0.5011, "step": 627 }, { "epoch": 0.46244477172312226, "grad_norm": 0.5498746633529663, "learning_rate": 9.911404135331852e-06, "loss": 0.4567, "step": 628 }, { "epoch": 0.4631811487481591, "grad_norm": 0.4846838712692261, "learning_rate": 9.910599302111057e-06, "loss": 0.4607, "step": 629 }, { "epoch": 0.4639175257731959, "grad_norm": 0.5435061454772949, "learning_rate": 9.909790862694528e-06, "loss": 0.4507, "step": 630 }, { "epoch": 0.4646539027982327, "grad_norm": 0.5106296539306641, "learning_rate": 9.908978817675959e-06, "loss": 0.4366, "step": 631 }, { "epoch": 0.4653902798232695, "grad_norm": 0.4310411512851715, "learning_rate": 9.908163167651688e-06, "loss": 0.4729, "step": 632 }, { "epoch": 0.46612665684830634, "grad_norm": 0.5732407569885254, "learning_rate": 9.907343913220707e-06, "loss": 0.4729, "step": 633 }, { "epoch": 0.46686303387334316, "grad_norm": 0.6053887009620667, "learning_rate": 9.90652105498465e-06, "loss": 0.4671, "step": 634 }, { "epoch": 0.46759941089838, "grad_norm": 0.4543951153755188, "learning_rate": 9.905694593547803e-06, "loss": 0.4631, "step": 635 }, { "epoch": 0.4683357879234168, "grad_norm": 0.5218227505683899, "learning_rate": 9.90486452951709e-06, "loss": 0.4638, "step": 636 }, { "epoch": 0.4690721649484536, "grad_norm": 0.5472959876060486, "learning_rate": 9.904030863502086e-06, "loss": 0.4677, "step": 637 }, { "epoch": 0.4698085419734904, "grad_norm": 0.5236535668373108, "learning_rate": 9.903193596115011e-06, "loss": 0.466, "step": 638 }, { "epoch": 0.47054491899852724, "grad_norm": 0.5994406342506409, "learning_rate": 9.902352727970729e-06, "loss": 0.4811, "step": 639 }, { "epoch": 0.47128129602356406, "grad_norm": 0.5658840537071228, "learning_rate": 9.901508259686746e-06, "loss": 0.4728, "step": 640 }, { "epoch": 0.47201767304860087, "grad_norm": 0.5107308030128479, "learning_rate": 9.900660191883217e-06, "loss": 0.4834, "step": 641 }, { "epoch": 0.4727540500736377, "grad_norm": 0.5555645227432251, "learning_rate": 9.899808525182935e-06, "loss": 0.471, "step": 642 }, { "epoch": 0.4734904270986745, "grad_norm": 0.6290595531463623, "learning_rate": 9.89895326021134e-06, "loss": 0.4718, "step": 643 }, { "epoch": 0.4742268041237113, "grad_norm": 0.5025970935821533, "learning_rate": 9.898094397596511e-06, "loss": 0.4752, "step": 644 }, { "epoch": 0.47496318114874814, "grad_norm": 0.60362309217453, "learning_rate": 9.897231937969172e-06, "loss": 0.4636, "step": 645 }, { "epoch": 0.47569955817378495, "grad_norm": 0.6537862420082092, "learning_rate": 9.896365881962687e-06, "loss": 0.4574, "step": 646 }, { "epoch": 0.47643593519882177, "grad_norm": 0.4715963304042816, "learning_rate": 9.895496230213061e-06, "loss": 0.4596, "step": 647 }, { "epoch": 0.47717231222385864, "grad_norm": 0.5870654582977295, "learning_rate": 9.894622983358941e-06, "loss": 0.4472, "step": 648 }, { "epoch": 0.47790868924889546, "grad_norm": 0.6155905723571777, "learning_rate": 9.893746142041612e-06, "loss": 0.4841, "step": 649 }, { "epoch": 0.4786450662739323, "grad_norm": 0.5059301853179932, "learning_rate": 9.892865706905e-06, "loss": 0.454, "step": 650 }, { "epoch": 0.4793814432989691, "grad_norm": 0.6735835671424866, "learning_rate": 9.891981678595671e-06, "loss": 0.4788, "step": 651 }, { "epoch": 0.4801178203240059, "grad_norm": 0.6185165643692017, "learning_rate": 9.891094057762827e-06, "loss": 0.4509, "step": 652 }, { "epoch": 0.4808541973490427, "grad_norm": 0.5275318026542664, "learning_rate": 9.89020284505831e-06, "loss": 0.4064, "step": 653 }, { "epoch": 0.48159057437407954, "grad_norm": 0.5368750095367432, "learning_rate": 9.889308041136601e-06, "loss": 0.4633, "step": 654 }, { "epoch": 0.48232695139911635, "grad_norm": 0.6606622934341431, "learning_rate": 9.888409646654818e-06, "loss": 0.4619, "step": 655 }, { "epoch": 0.48306332842415317, "grad_norm": 0.6033420562744141, "learning_rate": 9.88750766227271e-06, "loss": 0.4549, "step": 656 }, { "epoch": 0.48379970544919, "grad_norm": 0.5704071521759033, "learning_rate": 9.886602088652672e-06, "loss": 0.4584, "step": 657 }, { "epoch": 0.4845360824742268, "grad_norm": 0.6391714811325073, "learning_rate": 9.885692926459729e-06, "loss": 0.4612, "step": 658 }, { "epoch": 0.4852724594992636, "grad_norm": 0.49151089787483215, "learning_rate": 9.88478017636154e-06, "loss": 0.4524, "step": 659 }, { "epoch": 0.48600883652430044, "grad_norm": 0.5738299489021301, "learning_rate": 9.883863839028402e-06, "loss": 0.4854, "step": 660 }, { "epoch": 0.48674521354933725, "grad_norm": 0.5694648027420044, "learning_rate": 9.882943915133247e-06, "loss": 0.4583, "step": 661 }, { "epoch": 0.48748159057437407, "grad_norm": 0.5113805532455444, "learning_rate": 9.88202040535164e-06, "loss": 0.4714, "step": 662 }, { "epoch": 0.4882179675994109, "grad_norm": 0.5874478220939636, "learning_rate": 9.881093310361773e-06, "loss": 0.4851, "step": 663 }, { "epoch": 0.4889543446244477, "grad_norm": 0.5066226124763489, "learning_rate": 9.880162630844483e-06, "loss": 0.4644, "step": 664 }, { "epoch": 0.4896907216494845, "grad_norm": 0.5173943042755127, "learning_rate": 9.879228367483228e-06, "loss": 0.4526, "step": 665 }, { "epoch": 0.49042709867452133, "grad_norm": 0.531655490398407, "learning_rate": 9.878290520964107e-06, "loss": 0.4503, "step": 666 }, { "epoch": 0.49116347569955815, "grad_norm": 0.5501636266708374, "learning_rate": 9.877349091975844e-06, "loss": 0.4694, "step": 667 }, { "epoch": 0.49189985272459497, "grad_norm": 0.532542884349823, "learning_rate": 9.876404081209796e-06, "loss": 0.4588, "step": 668 }, { "epoch": 0.49263622974963184, "grad_norm": 0.4820326864719391, "learning_rate": 9.87545548935995e-06, "loss": 0.4698, "step": 669 }, { "epoch": 0.49337260677466865, "grad_norm": 0.4267001152038574, "learning_rate": 9.874503317122925e-06, "loss": 0.4621, "step": 670 }, { "epoch": 0.49410898379970547, "grad_norm": 0.4171275198459625, "learning_rate": 9.873547565197965e-06, "loss": 0.4448, "step": 671 }, { "epoch": 0.4948453608247423, "grad_norm": 0.4105445146560669, "learning_rate": 9.872588234286946e-06, "loss": 0.4507, "step": 672 }, { "epoch": 0.4955817378497791, "grad_norm": 0.46572741866111755, "learning_rate": 9.871625325094375e-06, "loss": 0.4485, "step": 673 }, { "epoch": 0.4963181148748159, "grad_norm": 0.44582173228263855, "learning_rate": 9.870658838327378e-06, "loss": 0.4712, "step": 674 }, { "epoch": 0.49705449189985274, "grad_norm": 0.5541704893112183, "learning_rate": 9.869688774695719e-06, "loss": 0.4527, "step": 675 }, { "epoch": 0.49779086892488955, "grad_norm": 0.45377492904663086, "learning_rate": 9.86871513491178e-06, "loss": 0.4764, "step": 676 }, { "epoch": 0.49852724594992637, "grad_norm": 0.5734421610832214, "learning_rate": 9.867737919690573e-06, "loss": 0.4822, "step": 677 }, { "epoch": 0.4992636229749632, "grad_norm": 0.49860337376594543, "learning_rate": 9.866757129749733e-06, "loss": 0.4502, "step": 678 }, { "epoch": 0.5, "grad_norm": 0.4667682945728302, "learning_rate": 9.865772765809528e-06, "loss": 0.4951, "step": 679 }, { "epoch": 0.5007363770250368, "grad_norm": 0.5397236347198486, "learning_rate": 9.864784828592842e-06, "loss": 0.4939, "step": 680 }, { "epoch": 0.5014727540500736, "grad_norm": 0.5122597813606262, "learning_rate": 9.863793318825186e-06, "loss": 0.4452, "step": 681 }, { "epoch": 0.5022091310751104, "grad_norm": 0.4826493263244629, "learning_rate": 9.862798237234697e-06, "loss": 0.4731, "step": 682 }, { "epoch": 0.5029455081001473, "grad_norm": 0.4937388300895691, "learning_rate": 9.86179958455213e-06, "loss": 0.4707, "step": 683 }, { "epoch": 0.5036818851251841, "grad_norm": 0.49966996908187866, "learning_rate": 9.860797361510867e-06, "loss": 0.4764, "step": 684 }, { "epoch": 0.5044182621502209, "grad_norm": 0.41720449924468994, "learning_rate": 9.859791568846908e-06, "loss": 0.458, "step": 685 }, { "epoch": 0.5051546391752577, "grad_norm": 0.5433468222618103, "learning_rate": 9.858782207298881e-06, "loss": 0.4631, "step": 686 }, { "epoch": 0.5058910162002945, "grad_norm": 0.5088586807250977, "learning_rate": 9.857769277608027e-06, "loss": 0.4453, "step": 687 }, { "epoch": 0.5066273932253313, "grad_norm": 0.5024508237838745, "learning_rate": 9.856752780518214e-06, "loss": 0.4519, "step": 688 }, { "epoch": 0.5073637702503682, "grad_norm": 0.5998568534851074, "learning_rate": 9.855732716775923e-06, "loss": 0.463, "step": 689 }, { "epoch": 0.508100147275405, "grad_norm": 0.5431100130081177, "learning_rate": 9.854709087130261e-06, "loss": 0.4632, "step": 690 }, { "epoch": 0.5088365243004418, "grad_norm": 0.5772528648376465, "learning_rate": 9.853681892332948e-06, "loss": 0.4629, "step": 691 }, { "epoch": 0.5095729013254786, "grad_norm": 0.4999123215675354, "learning_rate": 9.852651133138328e-06, "loss": 0.4269, "step": 692 }, { "epoch": 0.5103092783505154, "grad_norm": 0.6174575686454773, "learning_rate": 9.851616810303359e-06, "loss": 0.4618, "step": 693 }, { "epoch": 0.5110456553755522, "grad_norm": 0.5877047777175903, "learning_rate": 9.850578924587614e-06, "loss": 0.4857, "step": 694 }, { "epoch": 0.5117820324005891, "grad_norm": 0.636028528213501, "learning_rate": 9.849537476753286e-06, "loss": 0.4832, "step": 695 }, { "epoch": 0.5125184094256259, "grad_norm": 0.5791111588478088, "learning_rate": 9.848492467565182e-06, "loss": 0.493, "step": 696 }, { "epoch": 0.5132547864506627, "grad_norm": 0.557636559009552, "learning_rate": 9.847443897790728e-06, "loss": 0.4739, "step": 697 }, { "epoch": 0.5139911634756995, "grad_norm": 0.47123220562934875, "learning_rate": 9.84639176819996e-06, "loss": 0.463, "step": 698 }, { "epoch": 0.5147275405007363, "grad_norm": 0.6015990376472473, "learning_rate": 9.845336079565529e-06, "loss": 0.4914, "step": 699 }, { "epoch": 0.5154639175257731, "grad_norm": 0.4684229791164398, "learning_rate": 9.844276832662704e-06, "loss": 0.4573, "step": 700 }, { "epoch": 0.5162002945508101, "grad_norm": 0.4829377233982086, "learning_rate": 9.843214028269361e-06, "loss": 0.4414, "step": 701 }, { "epoch": 0.5169366715758469, "grad_norm": 0.5336502194404602, "learning_rate": 9.842147667165993e-06, "loss": 0.4746, "step": 702 }, { "epoch": 0.5176730486008837, "grad_norm": 0.4682151675224304, "learning_rate": 9.841077750135702e-06, "loss": 0.4873, "step": 703 }, { "epoch": 0.5184094256259205, "grad_norm": 0.5173882842063904, "learning_rate": 9.840004277964204e-06, "loss": 0.4497, "step": 704 }, { "epoch": 0.5191458026509573, "grad_norm": 0.4670022130012512, "learning_rate": 9.838927251439823e-06, "loss": 0.4857, "step": 705 }, { "epoch": 0.5198821796759941, "grad_norm": 0.5181158185005188, "learning_rate": 9.837846671353498e-06, "loss": 0.4822, "step": 706 }, { "epoch": 0.520618556701031, "grad_norm": 0.4926871955394745, "learning_rate": 9.83676253849877e-06, "loss": 0.4677, "step": 707 }, { "epoch": 0.5213549337260678, "grad_norm": 0.5156863331794739, "learning_rate": 9.835674853671797e-06, "loss": 0.4865, "step": 708 }, { "epoch": 0.5220913107511046, "grad_norm": 0.5189216136932373, "learning_rate": 9.83458361767134e-06, "loss": 0.4697, "step": 709 }, { "epoch": 0.5228276877761414, "grad_norm": 0.537659764289856, "learning_rate": 9.83348883129877e-06, "loss": 0.4682, "step": 710 }, { "epoch": 0.5235640648011782, "grad_norm": 0.5328056812286377, "learning_rate": 9.832390495358066e-06, "loss": 0.464, "step": 711 }, { "epoch": 0.524300441826215, "grad_norm": 0.5192391872406006, "learning_rate": 9.831288610655812e-06, "loss": 0.451, "step": 712 }, { "epoch": 0.5250368188512519, "grad_norm": 0.48723873496055603, "learning_rate": 9.830183178001199e-06, "loss": 0.4747, "step": 713 }, { "epoch": 0.5257731958762887, "grad_norm": 0.594840407371521, "learning_rate": 9.829074198206024e-06, "loss": 0.4407, "step": 714 }, { "epoch": 0.5265095729013255, "grad_norm": 0.5472460389137268, "learning_rate": 9.827961672084685e-06, "loss": 0.4487, "step": 715 }, { "epoch": 0.5272459499263623, "grad_norm": 0.5027500987052917, "learning_rate": 9.82684560045419e-06, "loss": 0.4904, "step": 716 }, { "epoch": 0.5279823269513991, "grad_norm": 0.5732831358909607, "learning_rate": 9.82572598413415e-06, "loss": 0.4505, "step": 717 }, { "epoch": 0.5287187039764359, "grad_norm": 0.5125916600227356, "learning_rate": 9.824602823946776e-06, "loss": 0.4428, "step": 718 }, { "epoch": 0.5294550810014728, "grad_norm": 0.5110583901405334, "learning_rate": 9.823476120716882e-06, "loss": 0.4561, "step": 719 }, { "epoch": 0.5301914580265096, "grad_norm": 0.5457249879837036, "learning_rate": 9.822345875271884e-06, "loss": 0.4834, "step": 720 }, { "epoch": 0.5309278350515464, "grad_norm": 0.4954557418823242, "learning_rate": 9.821212088441803e-06, "loss": 0.4562, "step": 721 }, { "epoch": 0.5316642120765832, "grad_norm": 0.6068676710128784, "learning_rate": 9.820074761059255e-06, "loss": 0.5063, "step": 722 }, { "epoch": 0.53240058910162, "grad_norm": 0.49514177441596985, "learning_rate": 9.81893389395946e-06, "loss": 0.4364, "step": 723 }, { "epoch": 0.5331369661266568, "grad_norm": 0.5643453598022461, "learning_rate": 9.817789487980237e-06, "loss": 0.4487, "step": 724 }, { "epoch": 0.5338733431516937, "grad_norm": 0.4869299530982971, "learning_rate": 9.816641543962001e-06, "loss": 0.4697, "step": 725 }, { "epoch": 0.5346097201767305, "grad_norm": 0.647627055644989, "learning_rate": 9.815490062747773e-06, "loss": 0.5138, "step": 726 }, { "epoch": 0.5353460972017673, "grad_norm": 0.5390787124633789, "learning_rate": 9.81433504518316e-06, "loss": 0.4643, "step": 727 }, { "epoch": 0.5360824742268041, "grad_norm": 0.5298830270767212, "learning_rate": 9.813176492116372e-06, "loss": 0.4745, "step": 728 }, { "epoch": 0.5368188512518409, "grad_norm": 0.5326485633850098, "learning_rate": 9.812014404398219e-06, "loss": 0.4452, "step": 729 }, { "epoch": 0.5375552282768777, "grad_norm": 0.47868359088897705, "learning_rate": 9.810848782882101e-06, "loss": 0.467, "step": 730 }, { "epoch": 0.5382916053019146, "grad_norm": 0.5476110577583313, "learning_rate": 9.809679628424016e-06, "loss": 0.4368, "step": 731 }, { "epoch": 0.5390279823269514, "grad_norm": 0.5308613181114197, "learning_rate": 9.808506941882556e-06, "loss": 0.4712, "step": 732 }, { "epoch": 0.5397643593519882, "grad_norm": 0.4515834152698517, "learning_rate": 9.807330724118906e-06, "loss": 0.4381, "step": 733 }, { "epoch": 0.540500736377025, "grad_norm": 0.5537406206130981, "learning_rate": 9.806150975996843e-06, "loss": 0.4615, "step": 734 }, { "epoch": 0.5412371134020618, "grad_norm": 0.47350478172302246, "learning_rate": 9.80496769838274e-06, "loss": 0.4433, "step": 735 }, { "epoch": 0.5419734904270986, "grad_norm": 0.4860948920249939, "learning_rate": 9.803780892145562e-06, "loss": 0.4558, "step": 736 }, { "epoch": 0.5427098674521355, "grad_norm": 0.6019650101661682, "learning_rate": 9.802590558156863e-06, "loss": 0.4454, "step": 737 }, { "epoch": 0.5434462444771723, "grad_norm": 0.467506468296051, "learning_rate": 9.801396697290786e-06, "loss": 0.4628, "step": 738 }, { "epoch": 0.5441826215022091, "grad_norm": 0.4791230857372284, "learning_rate": 9.800199310424067e-06, "loss": 0.4563, "step": 739 }, { "epoch": 0.5449189985272459, "grad_norm": 0.5122166275978088, "learning_rate": 9.798998398436031e-06, "loss": 0.4831, "step": 740 }, { "epoch": 0.5456553755522827, "grad_norm": 0.47877249121665955, "learning_rate": 9.797793962208593e-06, "loss": 0.465, "step": 741 }, { "epoch": 0.5463917525773195, "grad_norm": 0.49181488156318665, "learning_rate": 9.796586002626253e-06, "loss": 0.4793, "step": 742 }, { "epoch": 0.5471281296023565, "grad_norm": 0.48321738839149475, "learning_rate": 9.795374520576102e-06, "loss": 0.4757, "step": 743 }, { "epoch": 0.5478645066273933, "grad_norm": 0.5315781831741333, "learning_rate": 9.794159516947812e-06, "loss": 0.468, "step": 744 }, { "epoch": 0.5486008836524301, "grad_norm": 0.4678581655025482, "learning_rate": 9.792940992633649e-06, "loss": 0.4663, "step": 745 }, { "epoch": 0.5493372606774669, "grad_norm": 0.4767686724662781, "learning_rate": 9.791718948528457e-06, "loss": 0.4687, "step": 746 }, { "epoch": 0.5500736377025037, "grad_norm": 0.5320640206336975, "learning_rate": 9.790493385529671e-06, "loss": 0.4672, "step": 747 }, { "epoch": 0.5508100147275405, "grad_norm": 0.45434436202049255, "learning_rate": 9.789264304537307e-06, "loss": 0.442, "step": 748 }, { "epoch": 0.5515463917525774, "grad_norm": 0.5533967614173889, "learning_rate": 9.788031706453964e-06, "loss": 0.4539, "step": 749 }, { "epoch": 0.5522827687776142, "grad_norm": 0.5042517781257629, "learning_rate": 9.786795592184824e-06, "loss": 0.4619, "step": 750 }, { "epoch": 0.553019145802651, "grad_norm": 0.4778529107570648, "learning_rate": 9.785555962637654e-06, "loss": 0.4744, "step": 751 }, { "epoch": 0.5537555228276878, "grad_norm": 0.5131283402442932, "learning_rate": 9.784312818722799e-06, "loss": 0.4449, "step": 752 }, { "epoch": 0.5544918998527246, "grad_norm": 0.48384329676628113, "learning_rate": 9.783066161353188e-06, "loss": 0.474, "step": 753 }, { "epoch": 0.5552282768777614, "grad_norm": 0.49050474166870117, "learning_rate": 9.781815991444326e-06, "loss": 0.4445, "step": 754 }, { "epoch": 0.5559646539027983, "grad_norm": 0.5421154499053955, "learning_rate": 9.7805623099143e-06, "loss": 0.4668, "step": 755 }, { "epoch": 0.5567010309278351, "grad_norm": 0.5321326851844788, "learning_rate": 9.779305117683781e-06, "loss": 0.4762, "step": 756 }, { "epoch": 0.5574374079528719, "grad_norm": 0.5349854230880737, "learning_rate": 9.778044415676007e-06, "loss": 0.4774, "step": 757 }, { "epoch": 0.5581737849779087, "grad_norm": 0.5067935585975647, "learning_rate": 9.776780204816801e-06, "loss": 0.4862, "step": 758 }, { "epoch": 0.5589101620029455, "grad_norm": 0.4486694037914276, "learning_rate": 9.775512486034564e-06, "loss": 0.4481, "step": 759 }, { "epoch": 0.5596465390279823, "grad_norm": 0.5573259592056274, "learning_rate": 9.774241260260266e-06, "loss": 0.4695, "step": 760 }, { "epoch": 0.5603829160530192, "grad_norm": 0.45694780349731445, "learning_rate": 9.77296652842746e-06, "loss": 0.4728, "step": 761 }, { "epoch": 0.561119293078056, "grad_norm": 0.46451568603515625, "learning_rate": 9.771688291472269e-06, "loss": 0.458, "step": 762 }, { "epoch": 0.5618556701030928, "grad_norm": 0.5605217218399048, "learning_rate": 9.770406550333393e-06, "loss": 0.4492, "step": 763 }, { "epoch": 0.5625920471281296, "grad_norm": 0.4403326213359833, "learning_rate": 9.769121305952102e-06, "loss": 0.4721, "step": 764 }, { "epoch": 0.5633284241531664, "grad_norm": 0.5193141102790833, "learning_rate": 9.767832559272244e-06, "loss": 0.461, "step": 765 }, { "epoch": 0.5640648011782032, "grad_norm": 0.49891531467437744, "learning_rate": 9.766540311240232e-06, "loss": 0.451, "step": 766 }, { "epoch": 0.56480117820324, "grad_norm": 0.5359540581703186, "learning_rate": 9.765244562805055e-06, "loss": 0.4595, "step": 767 }, { "epoch": 0.5655375552282769, "grad_norm": 0.49058595299720764, "learning_rate": 9.76394531491827e-06, "loss": 0.4896, "step": 768 }, { "epoch": 0.5662739322533137, "grad_norm": 0.4365098476409912, "learning_rate": 9.762642568534012e-06, "loss": 0.4575, "step": 769 }, { "epoch": 0.5670103092783505, "grad_norm": 0.4465166926383972, "learning_rate": 9.76133632460897e-06, "loss": 0.4771, "step": 770 }, { "epoch": 0.5677466863033873, "grad_norm": 0.4014427363872528, "learning_rate": 9.760026584102414e-06, "loss": 0.4658, "step": 771 }, { "epoch": 0.5684830633284241, "grad_norm": 0.42262935638427734, "learning_rate": 9.758713347976179e-06, "loss": 0.4551, "step": 772 }, { "epoch": 0.569219440353461, "grad_norm": 0.43725284934043884, "learning_rate": 9.757396617194663e-06, "loss": 0.4512, "step": 773 }, { "epoch": 0.5699558173784978, "grad_norm": 0.4782891273498535, "learning_rate": 9.756076392724836e-06, "loss": 0.4364, "step": 774 }, { "epoch": 0.5706921944035346, "grad_norm": 0.4645746350288391, "learning_rate": 9.75475267553623e-06, "loss": 0.4487, "step": 775 }, { "epoch": 0.5714285714285714, "grad_norm": 0.4626657962799072, "learning_rate": 9.75342546660094e-06, "loss": 0.4305, "step": 776 }, { "epoch": 0.5721649484536082, "grad_norm": 0.519809365272522, "learning_rate": 9.752094766893635e-06, "loss": 0.4818, "step": 777 }, { "epoch": 0.572901325478645, "grad_norm": 0.48324060440063477, "learning_rate": 9.750760577391535e-06, "loss": 0.4035, "step": 778 }, { "epoch": 0.5736377025036818, "grad_norm": 0.5473081469535828, "learning_rate": 9.74942289907443e-06, "loss": 0.459, "step": 779 }, { "epoch": 0.5743740795287187, "grad_norm": 0.4750016927719116, "learning_rate": 9.74808173292467e-06, "loss": 0.4627, "step": 780 }, { "epoch": 0.5751104565537555, "grad_norm": 0.5140059590339661, "learning_rate": 9.746737079927166e-06, "loss": 0.475, "step": 781 }, { "epoch": 0.5758468335787923, "grad_norm": 0.6078734397888184, "learning_rate": 9.745388941069395e-06, "loss": 0.4972, "step": 782 }, { "epoch": 0.5765832106038291, "grad_norm": 0.5263283848762512, "learning_rate": 9.744037317341383e-06, "loss": 0.4598, "step": 783 }, { "epoch": 0.5773195876288659, "grad_norm": 0.47057968378067017, "learning_rate": 9.742682209735727e-06, "loss": 0.4601, "step": 784 }, { "epoch": 0.5780559646539027, "grad_norm": 0.5394490361213684, "learning_rate": 9.741323619247575e-06, "loss": 0.4812, "step": 785 }, { "epoch": 0.5787923416789397, "grad_norm": 0.43855324387550354, "learning_rate": 9.739961546874637e-06, "loss": 0.4499, "step": 786 }, { "epoch": 0.5795287187039765, "grad_norm": 0.5713585019111633, "learning_rate": 9.738595993617172e-06, "loss": 0.471, "step": 787 }, { "epoch": 0.5802650957290133, "grad_norm": 0.48629793524742126, "learning_rate": 9.737226960478006e-06, "loss": 0.456, "step": 788 }, { "epoch": 0.5810014727540501, "grad_norm": 0.48818135261535645, "learning_rate": 9.735854448462516e-06, "loss": 0.4399, "step": 789 }, { "epoch": 0.5817378497790869, "grad_norm": 0.5030027031898499, "learning_rate": 9.73447845857863e-06, "loss": 0.4885, "step": 790 }, { "epoch": 0.5824742268041238, "grad_norm": 0.5217078328132629, "learning_rate": 9.733098991836834e-06, "loss": 0.4426, "step": 791 }, { "epoch": 0.5832106038291606, "grad_norm": 0.5095168948173523, "learning_rate": 9.731716049250169e-06, "loss": 0.4469, "step": 792 }, { "epoch": 0.5839469808541974, "grad_norm": 0.4390084743499756, "learning_rate": 9.730329631834225e-06, "loss": 0.4538, "step": 793 }, { "epoch": 0.5846833578792342, "grad_norm": 0.5591133236885071, "learning_rate": 9.728939740607145e-06, "loss": 0.4437, "step": 794 }, { "epoch": 0.585419734904271, "grad_norm": 0.48923259973526, "learning_rate": 9.727546376589622e-06, "loss": 0.4809, "step": 795 }, { "epoch": 0.5861561119293078, "grad_norm": 0.5047422647476196, "learning_rate": 9.726149540804901e-06, "loss": 0.4616, "step": 796 }, { "epoch": 0.5868924889543446, "grad_norm": 0.47945308685302734, "learning_rate": 9.724749234278779e-06, "loss": 0.4662, "step": 797 }, { "epoch": 0.5876288659793815, "grad_norm": 0.45942962169647217, "learning_rate": 9.723345458039595e-06, "loss": 0.4572, "step": 798 }, { "epoch": 0.5883652430044183, "grad_norm": 0.4904220998287201, "learning_rate": 9.721938213118241e-06, "loss": 0.4517, "step": 799 }, { "epoch": 0.5891016200294551, "grad_norm": 0.4758506715297699, "learning_rate": 9.720527500548155e-06, "loss": 0.4392, "step": 800 }, { "epoch": 0.5898379970544919, "grad_norm": 0.5180002450942993, "learning_rate": 9.719113321365324e-06, "loss": 0.4538, "step": 801 }, { "epoch": 0.5905743740795287, "grad_norm": 0.49426352977752686, "learning_rate": 9.717695676608275e-06, "loss": 0.4865, "step": 802 }, { "epoch": 0.5913107511045655, "grad_norm": 0.5237860083580017, "learning_rate": 9.716274567318085e-06, "loss": 0.4542, "step": 803 }, { "epoch": 0.5920471281296024, "grad_norm": 0.48790404200553894, "learning_rate": 9.714849994538373e-06, "loss": 0.4931, "step": 804 }, { "epoch": 0.5927835051546392, "grad_norm": 0.49453994631767273, "learning_rate": 9.713421959315303e-06, "loss": 0.4633, "step": 805 }, { "epoch": 0.593519882179676, "grad_norm": 0.5625678896903992, "learning_rate": 9.71199046269758e-06, "loss": 0.4769, "step": 806 }, { "epoch": 0.5942562592047128, "grad_norm": 0.5181511044502258, "learning_rate": 9.710555505736456e-06, "loss": 0.4895, "step": 807 }, { "epoch": 0.5949926362297496, "grad_norm": 0.4666210114955902, "learning_rate": 9.709117089485714e-06, "loss": 0.4602, "step": 808 }, { "epoch": 0.5957290132547864, "grad_norm": 0.42587047815322876, "learning_rate": 9.707675215001685e-06, "loss": 0.4632, "step": 809 }, { "epoch": 0.5964653902798233, "grad_norm": 0.5286933183670044, "learning_rate": 9.706229883343242e-06, "loss": 0.4657, "step": 810 }, { "epoch": 0.5972017673048601, "grad_norm": 0.4617254436016083, "learning_rate": 9.704781095571788e-06, "loss": 0.4559, "step": 811 }, { "epoch": 0.5979381443298969, "grad_norm": 0.5746539235115051, "learning_rate": 9.70332885275127e-06, "loss": 0.4548, "step": 812 }, { "epoch": 0.5986745213549337, "grad_norm": 0.4450950026512146, "learning_rate": 9.701873155948177e-06, "loss": 0.4597, "step": 813 }, { "epoch": 0.5994108983799705, "grad_norm": 0.48545292019844055, "learning_rate": 9.70041400623152e-06, "loss": 0.4684, "step": 814 }, { "epoch": 0.6001472754050073, "grad_norm": 0.45839008688926697, "learning_rate": 9.698951404672858e-06, "loss": 0.436, "step": 815 }, { "epoch": 0.6008836524300442, "grad_norm": 0.5452147722244263, "learning_rate": 9.697485352346282e-06, "loss": 0.43, "step": 816 }, { "epoch": 0.601620029455081, "grad_norm": 0.4968319237232208, "learning_rate": 9.696015850328418e-06, "loss": 0.4438, "step": 817 }, { "epoch": 0.6023564064801178, "grad_norm": 0.5042880773544312, "learning_rate": 9.694542899698422e-06, "loss": 0.4463, "step": 818 }, { "epoch": 0.6030927835051546, "grad_norm": 0.5175356864929199, "learning_rate": 9.693066501537984e-06, "loss": 0.4614, "step": 819 }, { "epoch": 0.6038291605301914, "grad_norm": 0.5082677006721497, "learning_rate": 9.691586656931326e-06, "loss": 0.4635, "step": 820 }, { "epoch": 0.6045655375552282, "grad_norm": 0.5881240367889404, "learning_rate": 9.690103366965204e-06, "loss": 0.4691, "step": 821 }, { "epoch": 0.605301914580265, "grad_norm": 0.4930703341960907, "learning_rate": 9.688616632728898e-06, "loss": 0.439, "step": 822 }, { "epoch": 0.6060382916053019, "grad_norm": 0.4951449930667877, "learning_rate": 9.687126455314221e-06, "loss": 0.4493, "step": 823 }, { "epoch": 0.6067746686303387, "grad_norm": 0.4590352475643158, "learning_rate": 9.685632835815519e-06, "loss": 0.4778, "step": 824 }, { "epoch": 0.6075110456553755, "grad_norm": 0.547599196434021, "learning_rate": 9.684135775329653e-06, "loss": 0.4312, "step": 825 }, { "epoch": 0.6082474226804123, "grad_norm": 0.4426873028278351, "learning_rate": 9.682635274956026e-06, "loss": 0.4735, "step": 826 }, { "epoch": 0.6089837997054491, "grad_norm": 0.41880539059638977, "learning_rate": 9.681131335796557e-06, "loss": 0.4593, "step": 827 }, { "epoch": 0.6097201767304861, "grad_norm": 0.5596433877944946, "learning_rate": 9.679623958955692e-06, "loss": 0.4922, "step": 828 }, { "epoch": 0.6104565537555229, "grad_norm": 0.45280906558036804, "learning_rate": 9.678113145540406e-06, "loss": 0.4737, "step": 829 }, { "epoch": 0.6111929307805597, "grad_norm": 0.6396340727806091, "learning_rate": 9.676598896660194e-06, "loss": 0.4929, "step": 830 }, { "epoch": 0.6119293078055965, "grad_norm": 0.4250088036060333, "learning_rate": 9.675081213427076e-06, "loss": 0.421, "step": 831 }, { "epoch": 0.6126656848306333, "grad_norm": 0.5473159551620483, "learning_rate": 9.673560096955588e-06, "loss": 0.4563, "step": 832 }, { "epoch": 0.6134020618556701, "grad_norm": 0.46349871158599854, "learning_rate": 9.672035548362797e-06, "loss": 0.4308, "step": 833 }, { "epoch": 0.614138438880707, "grad_norm": 0.48488184809684753, "learning_rate": 9.670507568768281e-06, "loss": 0.4767, "step": 834 }, { "epoch": 0.6148748159057438, "grad_norm": 0.5228757262229919, "learning_rate": 9.668976159294145e-06, "loss": 0.48, "step": 835 }, { "epoch": 0.6156111929307806, "grad_norm": 0.5242437720298767, "learning_rate": 9.66744132106501e-06, "loss": 0.483, "step": 836 }, { "epoch": 0.6163475699558174, "grad_norm": 0.47697046399116516, "learning_rate": 9.665903055208013e-06, "loss": 0.4704, "step": 837 }, { "epoch": 0.6170839469808542, "grad_norm": 0.5204547047615051, "learning_rate": 9.664361362852813e-06, "loss": 0.4873, "step": 838 }, { "epoch": 0.617820324005891, "grad_norm": 0.4981822967529297, "learning_rate": 9.66281624513158e-06, "loss": 0.4354, "step": 839 }, { "epoch": 0.6185567010309279, "grad_norm": 0.4810592830181122, "learning_rate": 9.661267703178999e-06, "loss": 0.4763, "step": 840 }, { "epoch": 0.6192930780559647, "grad_norm": 0.4789879024028778, "learning_rate": 9.659715738132279e-06, "loss": 0.4358, "step": 841 }, { "epoch": 0.6200294550810015, "grad_norm": 0.48347872495651245, "learning_rate": 9.658160351131129e-06, "loss": 0.5049, "step": 842 }, { "epoch": 0.6207658321060383, "grad_norm": 0.48118627071380615, "learning_rate": 9.656601543317784e-06, "loss": 0.49, "step": 843 }, { "epoch": 0.6215022091310751, "grad_norm": 0.4531796872615814, "learning_rate": 9.655039315836983e-06, "loss": 0.4872, "step": 844 }, { "epoch": 0.6222385861561119, "grad_norm": 0.4885636270046234, "learning_rate": 9.653473669835978e-06, "loss": 0.4479, "step": 845 }, { "epoch": 0.6229749631811488, "grad_norm": 0.49448058009147644, "learning_rate": 9.651904606464536e-06, "loss": 0.4701, "step": 846 }, { "epoch": 0.6237113402061856, "grad_norm": 0.47281432151794434, "learning_rate": 9.650332126874924e-06, "loss": 0.4437, "step": 847 }, { "epoch": 0.6244477172312224, "grad_norm": 0.5343514084815979, "learning_rate": 9.648756232221925e-06, "loss": 0.4899, "step": 848 }, { "epoch": 0.6251840942562592, "grad_norm": 0.42154327034950256, "learning_rate": 9.647176923662833e-06, "loss": 0.4486, "step": 849 }, { "epoch": 0.625920471281296, "grad_norm": 0.4753609597682953, "learning_rate": 9.645594202357438e-06, "loss": 0.4371, "step": 850 }, { "epoch": 0.6266568483063328, "grad_norm": 0.6016598343849182, "learning_rate": 9.644008069468047e-06, "loss": 0.4638, "step": 851 }, { "epoch": 0.6273932253313697, "grad_norm": 0.46637803316116333, "learning_rate": 9.642418526159467e-06, "loss": 0.4541, "step": 852 }, { "epoch": 0.6281296023564065, "grad_norm": 0.5309515595436096, "learning_rate": 9.64082557359901e-06, "loss": 0.4409, "step": 853 }, { "epoch": 0.6288659793814433, "grad_norm": 0.6249402761459351, "learning_rate": 9.639229212956494e-06, "loss": 0.5012, "step": 854 }, { "epoch": 0.6296023564064801, "grad_norm": 0.43967223167419434, "learning_rate": 9.637629445404237e-06, "loss": 0.4438, "step": 855 }, { "epoch": 0.6303387334315169, "grad_norm": 0.45795974135398865, "learning_rate": 9.636026272117058e-06, "loss": 0.445, "step": 856 }, { "epoch": 0.6310751104565537, "grad_norm": 0.5375895500183105, "learning_rate": 9.63441969427228e-06, "loss": 0.4441, "step": 857 }, { "epoch": 0.6318114874815906, "grad_norm": 0.4524497985839844, "learning_rate": 9.632809713049726e-06, "loss": 0.4581, "step": 858 }, { "epoch": 0.6325478645066274, "grad_norm": 0.4217367172241211, "learning_rate": 9.631196329631719e-06, "loss": 0.4681, "step": 859 }, { "epoch": 0.6332842415316642, "grad_norm": 0.5063615441322327, "learning_rate": 9.629579545203076e-06, "loss": 0.4632, "step": 860 }, { "epoch": 0.634020618556701, "grad_norm": 0.46327000856399536, "learning_rate": 9.627959360951118e-06, "loss": 0.437, "step": 861 }, { "epoch": 0.6347569955817378, "grad_norm": 0.4578990042209625, "learning_rate": 9.626335778065655e-06, "loss": 0.4618, "step": 862 }, { "epoch": 0.6354933726067746, "grad_norm": 0.47172173857688904, "learning_rate": 9.624708797739002e-06, "loss": 0.4533, "step": 863 }, { "epoch": 0.6362297496318114, "grad_norm": 0.5421725511550903, "learning_rate": 9.623078421165958e-06, "loss": 0.4663, "step": 864 }, { "epoch": 0.6369661266568483, "grad_norm": 0.4460313320159912, "learning_rate": 9.62144464954383e-06, "loss": 0.4486, "step": 865 }, { "epoch": 0.6377025036818851, "grad_norm": 0.5286182165145874, "learning_rate": 9.619807484072405e-06, "loss": 0.4611, "step": 866 }, { "epoch": 0.6384388807069219, "grad_norm": 0.5084875226020813, "learning_rate": 9.618166925953969e-06, "loss": 0.478, "step": 867 }, { "epoch": 0.6391752577319587, "grad_norm": 0.5308467745780945, "learning_rate": 9.6165229763933e-06, "loss": 0.4512, "step": 868 }, { "epoch": 0.6399116347569955, "grad_norm": 0.5583586096763611, "learning_rate": 9.614875636597662e-06, "loss": 0.4665, "step": 869 }, { "epoch": 0.6406480117820325, "grad_norm": 0.4598456919193268, "learning_rate": 9.613224907776814e-06, "loss": 0.4138, "step": 870 }, { "epoch": 0.6413843888070693, "grad_norm": 0.43438005447387695, "learning_rate": 9.611570791143e-06, "loss": 0.4584, "step": 871 }, { "epoch": 0.6421207658321061, "grad_norm": 0.5358659029006958, "learning_rate": 9.609913287910957e-06, "loss": 0.4371, "step": 872 }, { "epoch": 0.6428571428571429, "grad_norm": 0.4852744936943054, "learning_rate": 9.608252399297899e-06, "loss": 0.4621, "step": 873 }, { "epoch": 0.6435935198821797, "grad_norm": 0.4329935610294342, "learning_rate": 9.606588126523537e-06, "loss": 0.4368, "step": 874 }, { "epoch": 0.6443298969072165, "grad_norm": 0.5553391575813293, "learning_rate": 9.60492047081006e-06, "loss": 0.4636, "step": 875 }, { "epoch": 0.6450662739322534, "grad_norm": 0.42962443828582764, "learning_rate": 9.603249433382145e-06, "loss": 0.4829, "step": 876 }, { "epoch": 0.6458026509572902, "grad_norm": 0.47330227494239807, "learning_rate": 9.60157501546695e-06, "loss": 0.4412, "step": 877 }, { "epoch": 0.646539027982327, "grad_norm": 0.444964200258255, "learning_rate": 9.599897218294122e-06, "loss": 0.4608, "step": 878 }, { "epoch": 0.6472754050073638, "grad_norm": 0.5015583038330078, "learning_rate": 9.598216043095779e-06, "loss": 0.4652, "step": 879 }, { "epoch": 0.6480117820324006, "grad_norm": 0.5097722411155701, "learning_rate": 9.596531491106528e-06, "loss": 0.4401, "step": 880 }, { "epoch": 0.6487481590574374, "grad_norm": 0.5142177939414978, "learning_rate": 9.594843563563452e-06, "loss": 0.4416, "step": 881 }, { "epoch": 0.6494845360824743, "grad_norm": 0.45564496517181396, "learning_rate": 9.593152261706113e-06, "loss": 0.4457, "step": 882 }, { "epoch": 0.6502209131075111, "grad_norm": 0.4867049753665924, "learning_rate": 9.591457586776555e-06, "loss": 0.4653, "step": 883 }, { "epoch": 0.6509572901325479, "grad_norm": 0.4787944257259369, "learning_rate": 9.589759540019293e-06, "loss": 0.4849, "step": 884 }, { "epoch": 0.6516936671575847, "grad_norm": 0.4713253080844879, "learning_rate": 9.588058122681324e-06, "loss": 0.432, "step": 885 }, { "epoch": 0.6524300441826215, "grad_norm": 0.48565471172332764, "learning_rate": 9.586353336012115e-06, "loss": 0.4606, "step": 886 }, { "epoch": 0.6531664212076583, "grad_norm": 0.48033156991004944, "learning_rate": 9.58464518126361e-06, "loss": 0.4695, "step": 887 }, { "epoch": 0.6539027982326951, "grad_norm": 0.5356956720352173, "learning_rate": 9.582933659690228e-06, "loss": 0.4774, "step": 888 }, { "epoch": 0.654639175257732, "grad_norm": 0.5078469514846802, "learning_rate": 9.58121877254886e-06, "loss": 0.4458, "step": 889 }, { "epoch": 0.6553755522827688, "grad_norm": 0.6260564923286438, "learning_rate": 9.57950052109886e-06, "loss": 0.446, "step": 890 }, { "epoch": 0.6561119293078056, "grad_norm": 0.4698830246925354, "learning_rate": 9.577778906602069e-06, "loss": 0.4637, "step": 891 }, { "epoch": 0.6568483063328424, "grad_norm": 0.5720396041870117, "learning_rate": 9.576053930322784e-06, "loss": 0.4609, "step": 892 }, { "epoch": 0.6575846833578792, "grad_norm": 0.5099953413009644, "learning_rate": 9.574325593527776e-06, "loss": 0.4763, "step": 893 }, { "epoch": 0.658321060382916, "grad_norm": 0.582254946231842, "learning_rate": 9.572593897486283e-06, "loss": 0.4653, "step": 894 }, { "epoch": 0.6590574374079529, "grad_norm": 0.5127898454666138, "learning_rate": 9.57085884347001e-06, "loss": 0.47, "step": 895 }, { "epoch": 0.6597938144329897, "grad_norm": 0.5360256433486938, "learning_rate": 9.56912043275313e-06, "loss": 0.4483, "step": 896 }, { "epoch": 0.6605301914580265, "grad_norm": 0.5025308728218079, "learning_rate": 9.567378666612279e-06, "loss": 0.4378, "step": 897 }, { "epoch": 0.6612665684830633, "grad_norm": 0.5077435374259949, "learning_rate": 9.565633546326555e-06, "loss": 0.4492, "step": 898 }, { "epoch": 0.6620029455081001, "grad_norm": 0.580901026725769, "learning_rate": 9.563885073177523e-06, "loss": 0.4499, "step": 899 }, { "epoch": 0.6627393225331369, "grad_norm": 0.4233223795890808, "learning_rate": 9.56213324844921e-06, "loss": 0.4618, "step": 900 }, { "epoch": 0.6634756995581738, "grad_norm": 0.5685048699378967, "learning_rate": 9.560378073428103e-06, "loss": 0.4363, "step": 901 }, { "epoch": 0.6642120765832106, "grad_norm": 0.5028063654899597, "learning_rate": 9.558619549403148e-06, "loss": 0.4684, "step": 902 }, { "epoch": 0.6649484536082474, "grad_norm": 0.5596533417701721, "learning_rate": 9.556857677665752e-06, "loss": 0.4379, "step": 903 }, { "epoch": 0.6656848306332842, "grad_norm": 0.391494482755661, "learning_rate": 9.555092459509783e-06, "loss": 0.4425, "step": 904 }, { "epoch": 0.666421207658321, "grad_norm": 0.5344425439834595, "learning_rate": 9.553323896231558e-06, "loss": 0.4473, "step": 905 }, { "epoch": 0.6671575846833578, "grad_norm": 0.4520135819911957, "learning_rate": 9.551551989129864e-06, "loss": 0.4698, "step": 906 }, { "epoch": 0.6678939617083947, "grad_norm": 0.5516674518585205, "learning_rate": 9.549776739505932e-06, "loss": 0.4926, "step": 907 }, { "epoch": 0.6686303387334315, "grad_norm": 0.4469415545463562, "learning_rate": 9.547998148663449e-06, "loss": 0.4509, "step": 908 }, { "epoch": 0.6693667157584683, "grad_norm": 0.4923398494720459, "learning_rate": 9.546216217908564e-06, "loss": 0.4591, "step": 909 }, { "epoch": 0.6701030927835051, "grad_norm": 0.5482118129730225, "learning_rate": 9.54443094854987e-06, "loss": 0.4566, "step": 910 }, { "epoch": 0.6708394698085419, "grad_norm": 0.5054488778114319, "learning_rate": 9.542642341898416e-06, "loss": 0.4513, "step": 911 }, { "epoch": 0.6715758468335787, "grad_norm": 0.6202924847602844, "learning_rate": 9.540850399267698e-06, "loss": 0.4827, "step": 912 }, { "epoch": 0.6723122238586157, "grad_norm": 0.45753154158592224, "learning_rate": 9.539055121973668e-06, "loss": 0.4849, "step": 913 }, { "epoch": 0.6730486008836525, "grad_norm": 0.7275657653808594, "learning_rate": 9.537256511334722e-06, "loss": 0.4659, "step": 914 }, { "epoch": 0.6737849779086893, "grad_norm": 0.43268248438835144, "learning_rate": 9.535454568671705e-06, "loss": 0.4408, "step": 915 }, { "epoch": 0.6745213549337261, "grad_norm": 0.6889506578445435, "learning_rate": 9.53364929530791e-06, "loss": 0.4649, "step": 916 }, { "epoch": 0.6752577319587629, "grad_norm": 0.4373452365398407, "learning_rate": 9.531840692569073e-06, "loss": 0.4474, "step": 917 }, { "epoch": 0.6759941089837997, "grad_norm": 0.6262958645820618, "learning_rate": 9.530028761783379e-06, "loss": 0.4589, "step": 918 }, { "epoch": 0.6767304860088366, "grad_norm": 0.4766991436481476, "learning_rate": 9.528213504281457e-06, "loss": 0.4724, "step": 919 }, { "epoch": 0.6774668630338734, "grad_norm": 0.43806901574134827, "learning_rate": 9.526394921396373e-06, "loss": 0.4478, "step": 920 }, { "epoch": 0.6782032400589102, "grad_norm": 0.509490966796875, "learning_rate": 9.524573014463643e-06, "loss": 0.4739, "step": 921 }, { "epoch": 0.678939617083947, "grad_norm": 0.4827909469604492, "learning_rate": 9.52274778482122e-06, "loss": 0.472, "step": 922 }, { "epoch": 0.6796759941089838, "grad_norm": 0.46269240975379944, "learning_rate": 9.520919233809494e-06, "loss": 0.4498, "step": 923 }, { "epoch": 0.6804123711340206, "grad_norm": 0.5521724224090576, "learning_rate": 9.519087362771302e-06, "loss": 0.4634, "step": 924 }, { "epoch": 0.6811487481590575, "grad_norm": 0.5625545978546143, "learning_rate": 9.517252173051912e-06, "loss": 0.4552, "step": 925 }, { "epoch": 0.6818851251840943, "grad_norm": 0.4687793552875519, "learning_rate": 9.515413665999034e-06, "loss": 0.4439, "step": 926 }, { "epoch": 0.6826215022091311, "grad_norm": 0.6292194724082947, "learning_rate": 9.51357184296281e-06, "loss": 0.47, "step": 927 }, { "epoch": 0.6833578792341679, "grad_norm": 0.5816941857337952, "learning_rate": 9.51172670529582e-06, "loss": 0.4765, "step": 928 }, { "epoch": 0.6840942562592047, "grad_norm": 0.5308511853218079, "learning_rate": 9.509878254353076e-06, "loss": 0.4756, "step": 929 }, { "epoch": 0.6848306332842415, "grad_norm": 0.6591737866401672, "learning_rate": 9.508026491492027e-06, "loss": 0.5005, "step": 930 }, { "epoch": 0.6855670103092784, "grad_norm": 0.477727472782135, "learning_rate": 9.50617141807255e-06, "loss": 0.4663, "step": 931 }, { "epoch": 0.6863033873343152, "grad_norm": 0.6698769927024841, "learning_rate": 9.504313035456955e-06, "loss": 0.4723, "step": 932 }, { "epoch": 0.687039764359352, "grad_norm": 0.5323542952537537, "learning_rate": 9.502451345009984e-06, "loss": 0.4566, "step": 933 }, { "epoch": 0.6877761413843888, "grad_norm": 0.5885183215141296, "learning_rate": 9.500586348098803e-06, "loss": 0.457, "step": 934 }, { "epoch": 0.6885125184094256, "grad_norm": 0.6716952919960022, "learning_rate": 9.498718046093013e-06, "loss": 0.4514, "step": 935 }, { "epoch": 0.6892488954344624, "grad_norm": 0.5240690112113953, "learning_rate": 9.496846440364634e-06, "loss": 0.4739, "step": 936 }, { "epoch": 0.6899852724594993, "grad_norm": 0.7379181981086731, "learning_rate": 9.49497153228812e-06, "loss": 0.4672, "step": 937 }, { "epoch": 0.6907216494845361, "grad_norm": 0.4711780250072479, "learning_rate": 9.493093323240348e-06, "loss": 0.4378, "step": 938 }, { "epoch": 0.6914580265095729, "grad_norm": 0.661437451839447, "learning_rate": 9.491211814600613e-06, "loss": 0.4741, "step": 939 }, { "epoch": 0.6921944035346097, "grad_norm": 0.5604520440101624, "learning_rate": 9.489327007750644e-06, "loss": 0.4706, "step": 940 }, { "epoch": 0.6929307805596465, "grad_norm": 0.5789750814437866, "learning_rate": 9.487438904074581e-06, "loss": 0.4202, "step": 941 }, { "epoch": 0.6936671575846833, "grad_norm": 0.6015188694000244, "learning_rate": 9.485547504958993e-06, "loss": 0.4321, "step": 942 }, { "epoch": 0.6944035346097202, "grad_norm": 0.5518826246261597, "learning_rate": 9.483652811792866e-06, "loss": 0.4777, "step": 943 }, { "epoch": 0.695139911634757, "grad_norm": 0.5786319375038147, "learning_rate": 9.481754825967606e-06, "loss": 0.4405, "step": 944 }, { "epoch": 0.6958762886597938, "grad_norm": 0.5292356610298157, "learning_rate": 9.479853548877033e-06, "loss": 0.4367, "step": 945 }, { "epoch": 0.6966126656848306, "grad_norm": 0.6690455675125122, "learning_rate": 9.477948981917393e-06, "loss": 0.4381, "step": 946 }, { "epoch": 0.6973490427098674, "grad_norm": 0.4962507486343384, "learning_rate": 9.476041126487341e-06, "loss": 0.4595, "step": 947 }, { "epoch": 0.6980854197349042, "grad_norm": 0.5820249915122986, "learning_rate": 9.474129983987943e-06, "loss": 0.4904, "step": 948 }, { "epoch": 0.698821796759941, "grad_norm": 0.5604087710380554, "learning_rate": 9.472215555822691e-06, "loss": 0.4706, "step": 949 }, { "epoch": 0.6995581737849779, "grad_norm": 0.4952147603034973, "learning_rate": 9.47029784339748e-06, "loss": 0.4787, "step": 950 }, { "epoch": 0.7002945508100147, "grad_norm": 0.5814775228500366, "learning_rate": 9.468376848120619e-06, "loss": 0.4813, "step": 951 }, { "epoch": 0.7010309278350515, "grad_norm": 0.4648406207561493, "learning_rate": 9.466452571402833e-06, "loss": 0.4607, "step": 952 }, { "epoch": 0.7017673048600883, "grad_norm": 0.5979225039482117, "learning_rate": 9.464525014657249e-06, "loss": 0.4521, "step": 953 }, { "epoch": 0.7025036818851251, "grad_norm": 0.4992789328098297, "learning_rate": 9.462594179299408e-06, "loss": 0.4686, "step": 954 }, { "epoch": 0.7032400589101621, "grad_norm": 0.5263684391975403, "learning_rate": 9.460660066747255e-06, "loss": 0.4618, "step": 955 }, { "epoch": 0.7039764359351989, "grad_norm": 0.5737230777740479, "learning_rate": 9.458722678421146e-06, "loss": 0.4294, "step": 956 }, { "epoch": 0.7047128129602357, "grad_norm": 0.48960253596305847, "learning_rate": 9.45678201574384e-06, "loss": 0.4361, "step": 957 }, { "epoch": 0.7054491899852725, "grad_norm": 0.6119993925094604, "learning_rate": 9.454838080140501e-06, "loss": 0.4593, "step": 958 }, { "epoch": 0.7061855670103093, "grad_norm": 0.5781173706054688, "learning_rate": 9.452890873038697e-06, "loss": 0.4643, "step": 959 }, { "epoch": 0.7069219440353461, "grad_norm": 0.7216101884841919, "learning_rate": 9.450940395868397e-06, "loss": 0.468, "step": 960 }, { "epoch": 0.707658321060383, "grad_norm": 0.47883689403533936, "learning_rate": 9.448986650061973e-06, "loss": 0.4386, "step": 961 }, { "epoch": 0.7083946980854198, "grad_norm": 0.5716425180435181, "learning_rate": 9.447029637054198e-06, "loss": 0.5083, "step": 962 }, { "epoch": 0.7091310751104566, "grad_norm": 0.5576707720756531, "learning_rate": 9.445069358282242e-06, "loss": 0.4811, "step": 963 }, { "epoch": 0.7098674521354934, "grad_norm": 0.4840952455997467, "learning_rate": 9.443105815185674e-06, "loss": 0.4485, "step": 964 }, { "epoch": 0.7106038291605302, "grad_norm": 0.6055101156234741, "learning_rate": 9.44113900920646e-06, "loss": 0.4484, "step": 965 }, { "epoch": 0.711340206185567, "grad_norm": 0.4516480267047882, "learning_rate": 9.439168941788965e-06, "loss": 0.4344, "step": 966 }, { "epoch": 0.7120765832106039, "grad_norm": 0.5439156889915466, "learning_rate": 9.437195614379947e-06, "loss": 0.4537, "step": 967 }, { "epoch": 0.7128129602356407, "grad_norm": 0.559701681137085, "learning_rate": 9.435219028428558e-06, "loss": 0.4527, "step": 968 }, { "epoch": 0.7135493372606775, "grad_norm": 0.4636538326740265, "learning_rate": 9.43323918538634e-06, "loss": 0.47, "step": 969 }, { "epoch": 0.7142857142857143, "grad_norm": 0.4914974272251129, "learning_rate": 9.431256086707233e-06, "loss": 0.4688, "step": 970 }, { "epoch": 0.7150220913107511, "grad_norm": 0.5754413604736328, "learning_rate": 9.429269733847563e-06, "loss": 0.4754, "step": 971 }, { "epoch": 0.7157584683357879, "grad_norm": 0.4941750168800354, "learning_rate": 9.427280128266049e-06, "loss": 0.4714, "step": 972 }, { "epoch": 0.7164948453608248, "grad_norm": 0.5356074571609497, "learning_rate": 9.425287271423797e-06, "loss": 0.433, "step": 973 }, { "epoch": 0.7172312223858616, "grad_norm": 0.5081741809844971, "learning_rate": 9.4232911647843e-06, "loss": 0.4652, "step": 974 }, { "epoch": 0.7179675994108984, "grad_norm": 0.4957806169986725, "learning_rate": 9.42129180981344e-06, "loss": 0.4731, "step": 975 }, { "epoch": 0.7187039764359352, "grad_norm": 0.5311683416366577, "learning_rate": 9.41928920797948e-06, "loss": 0.4396, "step": 976 }, { "epoch": 0.719440353460972, "grad_norm": 0.44548124074935913, "learning_rate": 9.417283360753073e-06, "loss": 0.4488, "step": 977 }, { "epoch": 0.7201767304860088, "grad_norm": 0.556957483291626, "learning_rate": 9.415274269607253e-06, "loss": 0.4302, "step": 978 }, { "epoch": 0.7209131075110456, "grad_norm": 0.478667289018631, "learning_rate": 9.413261936017433e-06, "loss": 0.4154, "step": 979 }, { "epoch": 0.7216494845360825, "grad_norm": 0.535645604133606, "learning_rate": 9.41124636146141e-06, "loss": 0.4511, "step": 980 }, { "epoch": 0.7223858615611193, "grad_norm": 0.5650351643562317, "learning_rate": 9.409227547419364e-06, "loss": 0.4624, "step": 981 }, { "epoch": 0.7231222385861561, "grad_norm": 0.4671155512332916, "learning_rate": 9.407205495373849e-06, "loss": 0.462, "step": 982 }, { "epoch": 0.7238586156111929, "grad_norm": 0.4867047965526581, "learning_rate": 9.405180206809799e-06, "loss": 0.4013, "step": 983 }, { "epoch": 0.7245949926362297, "grad_norm": 0.7005500197410583, "learning_rate": 9.403151683214525e-06, "loss": 0.4596, "step": 984 }, { "epoch": 0.7253313696612665, "grad_norm": 0.47493258118629456, "learning_rate": 9.401119926077714e-06, "loss": 0.4439, "step": 985 }, { "epoch": 0.7260677466863034, "grad_norm": 0.6544631123542786, "learning_rate": 9.399084936891424e-06, "loss": 0.4709, "step": 986 }, { "epoch": 0.7268041237113402, "grad_norm": 0.4934577941894531, "learning_rate": 9.397046717150095e-06, "loss": 0.4537, "step": 987 }, { "epoch": 0.727540500736377, "grad_norm": 0.5570245981216431, "learning_rate": 9.39500526835053e-06, "loss": 0.4462, "step": 988 }, { "epoch": 0.7282768777614138, "grad_norm": 0.6459545493125916, "learning_rate": 9.392960591991908e-06, "loss": 0.4522, "step": 989 }, { "epoch": 0.7290132547864506, "grad_norm": 0.5408301949501038, "learning_rate": 9.39091268957578e-06, "loss": 0.4684, "step": 990 }, { "epoch": 0.7297496318114874, "grad_norm": 0.6333877444267273, "learning_rate": 9.388861562606059e-06, "loss": 0.4797, "step": 991 }, { "epoch": 0.7304860088365243, "grad_norm": 0.4390884041786194, "learning_rate": 9.386807212589036e-06, "loss": 0.4377, "step": 992 }, { "epoch": 0.7312223858615611, "grad_norm": 0.4864395260810852, "learning_rate": 9.384749641033358e-06, "loss": 0.478, "step": 993 }, { "epoch": 0.7319587628865979, "grad_norm": 0.4967077076435089, "learning_rate": 9.382688849450049e-06, "loss": 0.4222, "step": 994 }, { "epoch": 0.7326951399116347, "grad_norm": 0.45611369609832764, "learning_rate": 9.380624839352486e-06, "loss": 0.4561, "step": 995 }, { "epoch": 0.7334315169366715, "grad_norm": 0.5901041626930237, "learning_rate": 9.37855761225642e-06, "loss": 0.4795, "step": 996 }, { "epoch": 0.7341678939617083, "grad_norm": 0.4063359498977661, "learning_rate": 9.37648716967996e-06, "loss": 0.4684, "step": 997 }, { "epoch": 0.7349042709867453, "grad_norm": 0.5191907286643982, "learning_rate": 9.374413513143574e-06, "loss": 0.438, "step": 998 }, { "epoch": 0.7356406480117821, "grad_norm": 0.4642137885093689, "learning_rate": 9.372336644170096e-06, "loss": 0.432, "step": 999 }, { "epoch": 0.7363770250368189, "grad_norm": 0.5185231566429138, "learning_rate": 9.370256564284713e-06, "loss": 0.4519, "step": 1000 }, { "epoch": 0.7371134020618557, "grad_norm": 0.4697466790676117, "learning_rate": 9.368173275014973e-06, "loss": 0.4226, "step": 1001 }, { "epoch": 0.7378497790868925, "grad_norm": 0.49580705165863037, "learning_rate": 9.366086777890785e-06, "loss": 0.475, "step": 1002 }, { "epoch": 0.7385861561119293, "grad_norm": 0.5398573875427246, "learning_rate": 9.363997074444402e-06, "loss": 0.4605, "step": 1003 }, { "epoch": 0.7393225331369662, "grad_norm": 0.49332237243652344, "learning_rate": 9.361904166210443e-06, "loss": 0.4483, "step": 1004 }, { "epoch": 0.740058910162003, "grad_norm": 0.6060042977333069, "learning_rate": 9.359808054725877e-06, "loss": 0.4546, "step": 1005 }, { "epoch": 0.7407952871870398, "grad_norm": 0.5035642385482788, "learning_rate": 9.357708741530025e-06, "loss": 0.4692, "step": 1006 }, { "epoch": 0.7415316642120766, "grad_norm": 0.5561889410018921, "learning_rate": 9.355606228164559e-06, "loss": 0.4416, "step": 1007 }, { "epoch": 0.7422680412371134, "grad_norm": 0.5201199650764465, "learning_rate": 9.3535005161735e-06, "loss": 0.4581, "step": 1008 }, { "epoch": 0.7430044182621502, "grad_norm": 0.5415538549423218, "learning_rate": 9.351391607103222e-06, "loss": 0.4417, "step": 1009 }, { "epoch": 0.7437407952871871, "grad_norm": 0.5373315811157227, "learning_rate": 9.34927950250244e-06, "loss": 0.4449, "step": 1010 }, { "epoch": 0.7444771723122239, "grad_norm": 0.6178976893424988, "learning_rate": 9.347164203922224e-06, "loss": 0.4936, "step": 1011 }, { "epoch": 0.7452135493372607, "grad_norm": 0.4898339807987213, "learning_rate": 9.34504571291598e-06, "loss": 0.4354, "step": 1012 }, { "epoch": 0.7459499263622975, "grad_norm": 0.6177398562431335, "learning_rate": 9.34292403103947e-06, "loss": 0.4385, "step": 1013 }, { "epoch": 0.7466863033873343, "grad_norm": 0.4545745551586151, "learning_rate": 9.34079915985079e-06, "loss": 0.4313, "step": 1014 }, { "epoch": 0.7474226804123711, "grad_norm": 0.5530493259429932, "learning_rate": 9.33867110091038e-06, "loss": 0.4351, "step": 1015 }, { "epoch": 0.748159057437408, "grad_norm": 0.562359094619751, "learning_rate": 9.336539855781027e-06, "loss": 0.4467, "step": 1016 }, { "epoch": 0.7488954344624448, "grad_norm": 0.4444253742694855, "learning_rate": 9.334405426027845e-06, "loss": 0.4276, "step": 1017 }, { "epoch": 0.7496318114874816, "grad_norm": 0.6157680749893188, "learning_rate": 9.332267813218303e-06, "loss": 0.4383, "step": 1018 }, { "epoch": 0.7503681885125184, "grad_norm": 0.5156718492507935, "learning_rate": 9.330127018922195e-06, "loss": 0.4623, "step": 1019 }, { "epoch": 0.7511045655375552, "grad_norm": 0.5296850204467773, "learning_rate": 9.327983044711655e-06, "loss": 0.4561, "step": 1020 }, { "epoch": 0.751840942562592, "grad_norm": 0.5704274773597717, "learning_rate": 9.325835892161156e-06, "loss": 0.4747, "step": 1021 }, { "epoch": 0.7525773195876289, "grad_norm": 0.5490828156471252, "learning_rate": 9.323685562847497e-06, "loss": 0.4119, "step": 1022 }, { "epoch": 0.7533136966126657, "grad_norm": 0.4620678722858429, "learning_rate": 9.321532058349817e-06, "loss": 0.4218, "step": 1023 }, { "epoch": 0.7540500736377025, "grad_norm": 0.507858157157898, "learning_rate": 9.31937538024959e-06, "loss": 0.4328, "step": 1024 }, { "epoch": 0.7547864506627393, "grad_norm": 0.5123826861381531, "learning_rate": 9.317215530130607e-06, "loss": 0.4529, "step": 1025 }, { "epoch": 0.7555228276877761, "grad_norm": 0.4884844124317169, "learning_rate": 9.315052509579004e-06, "loss": 0.4718, "step": 1026 }, { "epoch": 0.7562592047128129, "grad_norm": 0.5704168677330017, "learning_rate": 9.312886320183232e-06, "loss": 0.4674, "step": 1027 }, { "epoch": 0.7569955817378498, "grad_norm": 0.4477781057357788, "learning_rate": 9.310716963534077e-06, "loss": 0.443, "step": 1028 }, { "epoch": 0.7577319587628866, "grad_norm": 0.5215590000152588, "learning_rate": 9.30854444122465e-06, "loss": 0.4639, "step": 1029 }, { "epoch": 0.7584683357879234, "grad_norm": 0.5299932360649109, "learning_rate": 9.306368754850386e-06, "loss": 0.4503, "step": 1030 }, { "epoch": 0.7592047128129602, "grad_norm": 0.47621920704841614, "learning_rate": 9.30418990600904e-06, "loss": 0.4418, "step": 1031 }, { "epoch": 0.759941089837997, "grad_norm": 0.5789692401885986, "learning_rate": 9.302007896300697e-06, "loss": 0.4675, "step": 1032 }, { "epoch": 0.7606774668630338, "grad_norm": 0.4460338354110718, "learning_rate": 9.299822727327758e-06, "loss": 0.4253, "step": 1033 }, { "epoch": 0.7614138438880707, "grad_norm": 0.5220574140548706, "learning_rate": 9.297634400694943e-06, "loss": 0.4548, "step": 1034 }, { "epoch": 0.7621502209131075, "grad_norm": 0.5527031421661377, "learning_rate": 9.295442918009295e-06, "loss": 0.4268, "step": 1035 }, { "epoch": 0.7628865979381443, "grad_norm": 0.4812883734703064, "learning_rate": 9.29324828088017e-06, "loss": 0.4721, "step": 1036 }, { "epoch": 0.7636229749631811, "grad_norm": 0.5152131915092468, "learning_rate": 9.291050490919244e-06, "loss": 0.4758, "step": 1037 }, { "epoch": 0.7643593519882179, "grad_norm": 0.5110999941825867, "learning_rate": 9.288849549740513e-06, "loss": 0.4721, "step": 1038 }, { "epoch": 0.7650957290132547, "grad_norm": 0.4943920373916626, "learning_rate": 9.286645458960272e-06, "loss": 0.4636, "step": 1039 }, { "epoch": 0.7658321060382917, "grad_norm": 0.4862247109413147, "learning_rate": 9.28443822019715e-06, "loss": 0.4717, "step": 1040 }, { "epoch": 0.7665684830633285, "grad_norm": 0.4800577163696289, "learning_rate": 9.282227835072064e-06, "loss": 0.4624, "step": 1041 }, { "epoch": 0.7673048600883653, "grad_norm": 0.5938796401023865, "learning_rate": 9.280014305208264e-06, "loss": 0.4698, "step": 1042 }, { "epoch": 0.7680412371134021, "grad_norm": 0.45310524106025696, "learning_rate": 9.277797632231295e-06, "loss": 0.4681, "step": 1043 }, { "epoch": 0.7687776141384389, "grad_norm": 0.6024627089500427, "learning_rate": 9.275577817769015e-06, "loss": 0.4819, "step": 1044 }, { "epoch": 0.7695139911634757, "grad_norm": 0.4621509611606598, "learning_rate": 9.273354863451589e-06, "loss": 0.4683, "step": 1045 }, { "epoch": 0.7702503681885126, "grad_norm": 0.5616011619567871, "learning_rate": 9.271128770911489e-06, "loss": 0.4924, "step": 1046 }, { "epoch": 0.7709867452135494, "grad_norm": 0.5046648979187012, "learning_rate": 9.268899541783487e-06, "loss": 0.4244, "step": 1047 }, { "epoch": 0.7717231222385862, "grad_norm": 0.47864627838134766, "learning_rate": 9.266667177704665e-06, "loss": 0.4358, "step": 1048 }, { "epoch": 0.772459499263623, "grad_norm": 0.45622187852859497, "learning_rate": 9.2644316803144e-06, "loss": 0.4492, "step": 1049 }, { "epoch": 0.7731958762886598, "grad_norm": 0.5317423343658447, "learning_rate": 9.262193051254377e-06, "loss": 0.4541, "step": 1050 }, { "epoch": 0.7739322533136966, "grad_norm": 0.5093345046043396, "learning_rate": 9.259951292168576e-06, "loss": 0.4874, "step": 1051 }, { "epoch": 0.7746686303387335, "grad_norm": 0.4720238745212555, "learning_rate": 9.257706404703276e-06, "loss": 0.4477, "step": 1052 }, { "epoch": 0.7754050073637703, "grad_norm": 0.548051118850708, "learning_rate": 9.255458390507059e-06, "loss": 0.4761, "step": 1053 }, { "epoch": 0.7761413843888071, "grad_norm": 0.4737755358219147, "learning_rate": 9.253207251230793e-06, "loss": 0.4517, "step": 1054 }, { "epoch": 0.7768777614138439, "grad_norm": 0.40416955947875977, "learning_rate": 9.250952988527648e-06, "loss": 0.4095, "step": 1055 }, { "epoch": 0.7776141384388807, "grad_norm": 0.4669051468372345, "learning_rate": 9.248695604053091e-06, "loss": 0.4492, "step": 1056 }, { "epoch": 0.7783505154639175, "grad_norm": 0.5601681470870972, "learning_rate": 9.246435099464869e-06, "loss": 0.4564, "step": 1057 }, { "epoch": 0.7790868924889544, "grad_norm": 0.44851595163345337, "learning_rate": 9.244171476423037e-06, "loss": 0.4675, "step": 1058 }, { "epoch": 0.7798232695139912, "grad_norm": 0.5892722606658936, "learning_rate": 9.241904736589927e-06, "loss": 0.4731, "step": 1059 }, { "epoch": 0.780559646539028, "grad_norm": 0.47606587409973145, "learning_rate": 9.239634881630162e-06, "loss": 0.4443, "step": 1060 }, { "epoch": 0.7812960235640648, "grad_norm": 0.5107343196868896, "learning_rate": 9.237361913210658e-06, "loss": 0.4415, "step": 1061 }, { "epoch": 0.7820324005891016, "grad_norm": 0.593949556350708, "learning_rate": 9.235085833000613e-06, "loss": 0.4537, "step": 1062 }, { "epoch": 0.7827687776141384, "grad_norm": 0.4850008189678192, "learning_rate": 9.232806642671513e-06, "loss": 0.465, "step": 1063 }, { "epoch": 0.7835051546391752, "grad_norm": 0.4913601577281952, "learning_rate": 9.230524343897125e-06, "loss": 0.4483, "step": 1064 }, { "epoch": 0.7842415316642121, "grad_norm": 0.4335813820362091, "learning_rate": 9.228238938353502e-06, "loss": 0.4083, "step": 1065 }, { "epoch": 0.7849779086892489, "grad_norm": 0.5363730788230896, "learning_rate": 9.225950427718974e-06, "loss": 0.4521, "step": 1066 }, { "epoch": 0.7857142857142857, "grad_norm": 0.5813281536102295, "learning_rate": 9.223658813674157e-06, "loss": 0.4493, "step": 1067 }, { "epoch": 0.7864506627393225, "grad_norm": 0.466314435005188, "learning_rate": 9.221364097901941e-06, "loss": 0.446, "step": 1068 }, { "epoch": 0.7871870397643593, "grad_norm": 0.5087846517562866, "learning_rate": 9.219066282087497e-06, "loss": 0.4284, "step": 1069 }, { "epoch": 0.7879234167893961, "grad_norm": 0.5554249882698059, "learning_rate": 9.216765367918272e-06, "loss": 0.4804, "step": 1070 }, { "epoch": 0.788659793814433, "grad_norm": 0.48049336671829224, "learning_rate": 9.214461357083986e-06, "loss": 0.4659, "step": 1071 }, { "epoch": 0.7893961708394698, "grad_norm": 0.5227876901626587, "learning_rate": 9.212154251276637e-06, "loss": 0.4487, "step": 1072 }, { "epoch": 0.7901325478645066, "grad_norm": 0.5827174186706543, "learning_rate": 9.20984405219049e-06, "loss": 0.4681, "step": 1073 }, { "epoch": 0.7908689248895434, "grad_norm": 0.5332369804382324, "learning_rate": 9.207530761522093e-06, "loss": 0.4361, "step": 1074 }, { "epoch": 0.7916053019145802, "grad_norm": 0.525380551815033, "learning_rate": 9.205214380970247e-06, "loss": 0.4415, "step": 1075 }, { "epoch": 0.792341678939617, "grad_norm": 0.4778136610984802, "learning_rate": 9.20289491223604e-06, "loss": 0.4416, "step": 1076 }, { "epoch": 0.7930780559646539, "grad_norm": 0.5096407532691956, "learning_rate": 9.200572357022815e-06, "loss": 0.4379, "step": 1077 }, { "epoch": 0.7938144329896907, "grad_norm": 0.48358261585235596, "learning_rate": 9.198246717036187e-06, "loss": 0.4547, "step": 1078 }, { "epoch": 0.7945508100147275, "grad_norm": 0.5392265319824219, "learning_rate": 9.195917993984039e-06, "loss": 0.4585, "step": 1079 }, { "epoch": 0.7952871870397643, "grad_norm": 0.4468466341495514, "learning_rate": 9.19358618957651e-06, "loss": 0.4572, "step": 1080 }, { "epoch": 0.7960235640648011, "grad_norm": 0.48624467849731445, "learning_rate": 9.191251305526013e-06, "loss": 0.4327, "step": 1081 }, { "epoch": 0.7967599410898379, "grad_norm": 0.49556195735931396, "learning_rate": 9.18891334354721e-06, "loss": 0.4285, "step": 1082 }, { "epoch": 0.7974963181148749, "grad_norm": 0.427839994430542, "learning_rate": 9.18657230535703e-06, "loss": 0.4326, "step": 1083 }, { "epoch": 0.7982326951399117, "grad_norm": 0.5406298041343689, "learning_rate": 9.184228192674667e-06, "loss": 0.455, "step": 1084 }, { "epoch": 0.7989690721649485, "grad_norm": 0.4637623131275177, "learning_rate": 9.18188100722156e-06, "loss": 0.4596, "step": 1085 }, { "epoch": 0.7997054491899853, "grad_norm": 0.4427599608898163, "learning_rate": 9.179530750721413e-06, "loss": 0.4371, "step": 1086 }, { "epoch": 0.8004418262150221, "grad_norm": 0.5007920861244202, "learning_rate": 9.177177424900183e-06, "loss": 0.4656, "step": 1087 }, { "epoch": 0.801178203240059, "grad_norm": 0.5360900163650513, "learning_rate": 9.174821031486083e-06, "loss": 0.4214, "step": 1088 }, { "epoch": 0.8019145802650958, "grad_norm": 0.5234165191650391, "learning_rate": 9.172461572209578e-06, "loss": 0.4553, "step": 1089 }, { "epoch": 0.8026509572901326, "grad_norm": 0.5163874626159668, "learning_rate": 9.17009904880338e-06, "loss": 0.447, "step": 1090 }, { "epoch": 0.8033873343151694, "grad_norm": 0.6014302372932434, "learning_rate": 9.167733463002457e-06, "loss": 0.4715, "step": 1091 }, { "epoch": 0.8041237113402062, "grad_norm": 0.5282832384109497, "learning_rate": 9.165364816544022e-06, "loss": 0.4558, "step": 1092 }, { "epoch": 0.804860088365243, "grad_norm": 0.547073483467102, "learning_rate": 9.162993111167541e-06, "loss": 0.4541, "step": 1093 }, { "epoch": 0.8055964653902798, "grad_norm": 0.5955381393432617, "learning_rate": 9.160618348614718e-06, "loss": 0.4368, "step": 1094 }, { "epoch": 0.8063328424153167, "grad_norm": 0.4820695221424103, "learning_rate": 9.158240530629512e-06, "loss": 0.4562, "step": 1095 }, { "epoch": 0.8070692194403535, "grad_norm": 0.4931407868862152, "learning_rate": 9.155859658958117e-06, "loss": 0.4747, "step": 1096 }, { "epoch": 0.8078055964653903, "grad_norm": 0.5072289705276489, "learning_rate": 9.153475735348973e-06, "loss": 0.4338, "step": 1097 }, { "epoch": 0.8085419734904271, "grad_norm": 0.4261424243450165, "learning_rate": 9.151088761552764e-06, "loss": 0.4415, "step": 1098 }, { "epoch": 0.8092783505154639, "grad_norm": 0.5440129637718201, "learning_rate": 9.148698739322409e-06, "loss": 0.4326, "step": 1099 }, { "epoch": 0.8100147275405007, "grad_norm": 0.4529050886631012, "learning_rate": 9.146305670413069e-06, "loss": 0.4579, "step": 1100 }, { "epoch": 0.8107511045655376, "grad_norm": 0.5012593865394592, "learning_rate": 9.143909556582143e-06, "loss": 0.4643, "step": 1101 }, { "epoch": 0.8114874815905744, "grad_norm": 0.5710225701332092, "learning_rate": 9.141510399589261e-06, "loss": 0.4484, "step": 1102 }, { "epoch": 0.8122238586156112, "grad_norm": 0.4354053735733032, "learning_rate": 9.139108201196296e-06, "loss": 0.4333, "step": 1103 }, { "epoch": 0.812960235640648, "grad_norm": 0.49365589022636414, "learning_rate": 9.136702963167348e-06, "loss": 0.457, "step": 1104 }, { "epoch": 0.8136966126656848, "grad_norm": 0.46688565611839294, "learning_rate": 9.134294687268749e-06, "loss": 0.4454, "step": 1105 }, { "epoch": 0.8144329896907216, "grad_norm": 0.47631922364234924, "learning_rate": 9.131883375269067e-06, "loss": 0.4471, "step": 1106 }, { "epoch": 0.8151693667157585, "grad_norm": 0.54261714220047, "learning_rate": 9.129469028939094e-06, "loss": 0.4839, "step": 1107 }, { "epoch": 0.8159057437407953, "grad_norm": 0.5108979940414429, "learning_rate": 9.127051650051854e-06, "loss": 0.4306, "step": 1108 }, { "epoch": 0.8166421207658321, "grad_norm": 0.46782127022743225, "learning_rate": 9.1246312403826e-06, "loss": 0.4578, "step": 1109 }, { "epoch": 0.8173784977908689, "grad_norm": 0.5475788116455078, "learning_rate": 9.122207801708802e-06, "loss": 0.4498, "step": 1110 }, { "epoch": 0.8181148748159057, "grad_norm": 0.5272371172904968, "learning_rate": 9.119781335810164e-06, "loss": 0.4482, "step": 1111 }, { "epoch": 0.8188512518409425, "grad_norm": 0.49965953826904297, "learning_rate": 9.117351844468609e-06, "loss": 0.4164, "step": 1112 }, { "epoch": 0.8195876288659794, "grad_norm": 0.5126081109046936, "learning_rate": 9.114919329468283e-06, "loss": 0.4744, "step": 1113 }, { "epoch": 0.8203240058910162, "grad_norm": 0.5301181674003601, "learning_rate": 9.112483792595547e-06, "loss": 0.4508, "step": 1114 }, { "epoch": 0.821060382916053, "grad_norm": 0.49831271171569824, "learning_rate": 9.110045235638991e-06, "loss": 0.4435, "step": 1115 }, { "epoch": 0.8217967599410898, "grad_norm": 0.48843833804130554, "learning_rate": 9.107603660389414e-06, "loss": 0.4381, "step": 1116 }, { "epoch": 0.8225331369661266, "grad_norm": 0.5341619253158569, "learning_rate": 9.105159068639837e-06, "loss": 0.4548, "step": 1117 }, { "epoch": 0.8232695139911634, "grad_norm": 0.5220252275466919, "learning_rate": 9.102711462185495e-06, "loss": 0.4459, "step": 1118 }, { "epoch": 0.8240058910162003, "grad_norm": 0.4931434392929077, "learning_rate": 9.100260842823831e-06, "loss": 0.4393, "step": 1119 }, { "epoch": 0.8247422680412371, "grad_norm": 0.5014846920967102, "learning_rate": 9.097807212354513e-06, "loss": 0.4551, "step": 1120 }, { "epoch": 0.8254786450662739, "grad_norm": 0.5581266283988953, "learning_rate": 9.09535057257941e-06, "loss": 0.4658, "step": 1121 }, { "epoch": 0.8262150220913107, "grad_norm": 0.46569985151290894, "learning_rate": 9.092890925302601e-06, "loss": 0.4414, "step": 1122 }, { "epoch": 0.8269513991163475, "grad_norm": 0.45610496401786804, "learning_rate": 9.090428272330381e-06, "loss": 0.4561, "step": 1123 }, { "epoch": 0.8276877761413843, "grad_norm": 0.4498854875564575, "learning_rate": 9.087962615471246e-06, "loss": 0.4501, "step": 1124 }, { "epoch": 0.8284241531664213, "grad_norm": 0.4839475452899933, "learning_rate": 9.085493956535898e-06, "loss": 0.4394, "step": 1125 }, { "epoch": 0.8291605301914581, "grad_norm": 0.44261297583580017, "learning_rate": 9.083022297337251e-06, "loss": 0.4284, "step": 1126 }, { "epoch": 0.8298969072164949, "grad_norm": 0.49953368306159973, "learning_rate": 9.080547639690411e-06, "loss": 0.4698, "step": 1127 }, { "epoch": 0.8306332842415317, "grad_norm": 0.5280026197433472, "learning_rate": 9.078069985412697e-06, "loss": 0.435, "step": 1128 }, { "epoch": 0.8313696612665685, "grad_norm": 0.46925580501556396, "learning_rate": 9.075589336323619e-06, "loss": 0.4702, "step": 1129 }, { "epoch": 0.8321060382916053, "grad_norm": 0.49038681387901306, "learning_rate": 9.073105694244892e-06, "loss": 0.4367, "step": 1130 }, { "epoch": 0.8328424153166422, "grad_norm": 0.564074695110321, "learning_rate": 9.070619061000429e-06, "loss": 0.486, "step": 1131 }, { "epoch": 0.833578792341679, "grad_norm": 0.45910879969596863, "learning_rate": 9.068129438416337e-06, "loss": 0.4243, "step": 1132 }, { "epoch": 0.8343151693667158, "grad_norm": 0.5710626840591431, "learning_rate": 9.065636828320919e-06, "loss": 0.434, "step": 1133 }, { "epoch": 0.8350515463917526, "grad_norm": 0.504901647567749, "learning_rate": 9.063141232544676e-06, "loss": 0.4441, "step": 1134 }, { "epoch": 0.8357879234167894, "grad_norm": 0.5790115594863892, "learning_rate": 9.060642652920295e-06, "loss": 0.457, "step": 1135 }, { "epoch": 0.8365243004418262, "grad_norm": 0.5210295915603638, "learning_rate": 9.058141091282656e-06, "loss": 0.4539, "step": 1136 }, { "epoch": 0.8372606774668631, "grad_norm": 0.48215651512145996, "learning_rate": 9.055636549468833e-06, "loss": 0.4558, "step": 1137 }, { "epoch": 0.8379970544918999, "grad_norm": 0.42006632685661316, "learning_rate": 9.053129029318086e-06, "loss": 0.424, "step": 1138 }, { "epoch": 0.8387334315169367, "grad_norm": 0.45918166637420654, "learning_rate": 9.050618532671862e-06, "loss": 0.4302, "step": 1139 }, { "epoch": 0.8394698085419735, "grad_norm": 0.5937727093696594, "learning_rate": 9.048105061373793e-06, "loss": 0.4467, "step": 1140 }, { "epoch": 0.8402061855670103, "grad_norm": 0.4482143521308899, "learning_rate": 9.045588617269694e-06, "loss": 0.4423, "step": 1141 }, { "epoch": 0.8409425625920471, "grad_norm": 0.49220171570777893, "learning_rate": 9.043069202207571e-06, "loss": 0.4455, "step": 1142 }, { "epoch": 0.841678939617084, "grad_norm": 0.4809843599796295, "learning_rate": 9.040546818037602e-06, "loss": 0.4628, "step": 1143 }, { "epoch": 0.8424153166421208, "grad_norm": 0.49604564905166626, "learning_rate": 9.038021466612151e-06, "loss": 0.4283, "step": 1144 }, { "epoch": 0.8431516936671576, "grad_norm": 0.4608924388885498, "learning_rate": 9.035493149785758e-06, "loss": 0.4359, "step": 1145 }, { "epoch": 0.8438880706921944, "grad_norm": 0.5311315059661865, "learning_rate": 9.032961869415147e-06, "loss": 0.4655, "step": 1146 }, { "epoch": 0.8446244477172312, "grad_norm": 0.491970032453537, "learning_rate": 9.03042762735921e-06, "loss": 0.4477, "step": 1147 }, { "epoch": 0.845360824742268, "grad_norm": 0.5328875184059143, "learning_rate": 9.027890425479016e-06, "loss": 0.4565, "step": 1148 }, { "epoch": 0.8460972017673049, "grad_norm": 0.46462082862854004, "learning_rate": 9.025350265637816e-06, "loss": 0.4307, "step": 1149 }, { "epoch": 0.8468335787923417, "grad_norm": 0.568123996257782, "learning_rate": 9.02280714970102e-06, "loss": 0.4527, "step": 1150 }, { "epoch": 0.8475699558173785, "grad_norm": 0.39277321100234985, "learning_rate": 9.02026107953622e-06, "loss": 0.4382, "step": 1151 }, { "epoch": 0.8483063328424153, "grad_norm": 0.46049660444259644, "learning_rate": 9.01771205701317e-06, "loss": 0.4496, "step": 1152 }, { "epoch": 0.8490427098674521, "grad_norm": 0.5467735528945923, "learning_rate": 9.015160084003798e-06, "loss": 0.462, "step": 1153 }, { "epoch": 0.8497790868924889, "grad_norm": 0.46097782254219055, "learning_rate": 9.012605162382194e-06, "loss": 0.4612, "step": 1154 }, { "epoch": 0.8505154639175257, "grad_norm": 0.5207246541976929, "learning_rate": 9.010047294024615e-06, "loss": 0.4397, "step": 1155 }, { "epoch": 0.8512518409425626, "grad_norm": 0.5254480838775635, "learning_rate": 9.007486480809482e-06, "loss": 0.4787, "step": 1156 }, { "epoch": 0.8519882179675994, "grad_norm": 0.45604032278060913, "learning_rate": 9.00492272461738e-06, "loss": 0.4607, "step": 1157 }, { "epoch": 0.8527245949926362, "grad_norm": 0.4617128372192383, "learning_rate": 9.002356027331055e-06, "loss": 0.4236, "step": 1158 }, { "epoch": 0.853460972017673, "grad_norm": 0.5179611444473267, "learning_rate": 8.999786390835408e-06, "loss": 0.4647, "step": 1159 }, { "epoch": 0.8541973490427098, "grad_norm": 0.4486055374145508, "learning_rate": 8.997213817017508e-06, "loss": 0.4423, "step": 1160 }, { "epoch": 0.8549337260677466, "grad_norm": 0.5427169799804688, "learning_rate": 8.99463830776657e-06, "loss": 0.4648, "step": 1161 }, { "epoch": 0.8556701030927835, "grad_norm": 0.5229467749595642, "learning_rate": 8.992059864973972e-06, "loss": 0.4559, "step": 1162 }, { "epoch": 0.8564064801178203, "grad_norm": 0.5158513784408569, "learning_rate": 8.989478490533247e-06, "loss": 0.4168, "step": 1163 }, { "epoch": 0.8571428571428571, "grad_norm": 0.47246885299682617, "learning_rate": 8.986894186340075e-06, "loss": 0.4156, "step": 1164 }, { "epoch": 0.8578792341678939, "grad_norm": 0.5099225044250488, "learning_rate": 8.984306954292293e-06, "loss": 0.4577, "step": 1165 }, { "epoch": 0.8586156111929307, "grad_norm": 0.5285944938659668, "learning_rate": 8.981716796289886e-06, "loss": 0.4423, "step": 1166 }, { "epoch": 0.8593519882179675, "grad_norm": 0.43009987473487854, "learning_rate": 8.979123714234986e-06, "loss": 0.4311, "step": 1167 }, { "epoch": 0.8600883652430045, "grad_norm": 0.5090437531471252, "learning_rate": 8.976527710031875e-06, "loss": 0.4636, "step": 1168 }, { "epoch": 0.8608247422680413, "grad_norm": 0.44846466183662415, "learning_rate": 8.97392878558698e-06, "loss": 0.4243, "step": 1169 }, { "epoch": 0.8615611192930781, "grad_norm": 0.5008236765861511, "learning_rate": 8.971326942808876e-06, "loss": 0.4539, "step": 1170 }, { "epoch": 0.8622974963181149, "grad_norm": 0.5144447088241577, "learning_rate": 8.968722183608272e-06, "loss": 0.4791, "step": 1171 }, { "epoch": 0.8630338733431517, "grad_norm": 0.4842917025089264, "learning_rate": 8.966114509898026e-06, "loss": 0.4568, "step": 1172 }, { "epoch": 0.8637702503681886, "grad_norm": 0.4209856688976288, "learning_rate": 8.963503923593138e-06, "loss": 0.4375, "step": 1173 }, { "epoch": 0.8645066273932254, "grad_norm": 0.49867910146713257, "learning_rate": 8.96089042661074e-06, "loss": 0.4882, "step": 1174 }, { "epoch": 0.8652430044182622, "grad_norm": 0.43283653259277344, "learning_rate": 8.958274020870107e-06, "loss": 0.437, "step": 1175 }, { "epoch": 0.865979381443299, "grad_norm": 0.4610222578048706, "learning_rate": 8.955654708292647e-06, "loss": 0.4323, "step": 1176 }, { "epoch": 0.8667157584683358, "grad_norm": 0.5031896829605103, "learning_rate": 8.953032490801908e-06, "loss": 0.4147, "step": 1177 }, { "epoch": 0.8674521354933726, "grad_norm": 0.45876145362854004, "learning_rate": 8.950407370323563e-06, "loss": 0.4496, "step": 1178 }, { "epoch": 0.8681885125184094, "grad_norm": 0.4437890648841858, "learning_rate": 8.94777934878542e-06, "loss": 0.4182, "step": 1179 }, { "epoch": 0.8689248895434463, "grad_norm": 0.49486371874809265, "learning_rate": 8.945148428117423e-06, "loss": 0.4497, "step": 1180 }, { "epoch": 0.8696612665684831, "grad_norm": 0.554208517074585, "learning_rate": 8.942514610251639e-06, "loss": 0.4753, "step": 1181 }, { "epoch": 0.8703976435935199, "grad_norm": 0.4523577094078064, "learning_rate": 8.939877897122262e-06, "loss": 0.4476, "step": 1182 }, { "epoch": 0.8711340206185567, "grad_norm": 0.43716442584991455, "learning_rate": 8.937238290665617e-06, "loss": 0.43, "step": 1183 }, { "epoch": 0.8718703976435935, "grad_norm": 0.4811909794807434, "learning_rate": 8.934595792820152e-06, "loss": 0.4329, "step": 1184 }, { "epoch": 0.8726067746686303, "grad_norm": 0.5379281044006348, "learning_rate": 8.931950405526436e-06, "loss": 0.4529, "step": 1185 }, { "epoch": 0.8733431516936672, "grad_norm": 0.4634367525577545, "learning_rate": 8.92930213072716e-06, "loss": 0.4269, "step": 1186 }, { "epoch": 0.874079528718704, "grad_norm": 0.46587860584259033, "learning_rate": 8.926650970367138e-06, "loss": 0.4399, "step": 1187 }, { "epoch": 0.8748159057437408, "grad_norm": 0.47557711601257324, "learning_rate": 8.923996926393306e-06, "loss": 0.4365, "step": 1188 }, { "epoch": 0.8755522827687776, "grad_norm": 0.48660027980804443, "learning_rate": 8.921340000754708e-06, "loss": 0.4772, "step": 1189 }, { "epoch": 0.8762886597938144, "grad_norm": 0.4610178470611572, "learning_rate": 8.918680195402512e-06, "loss": 0.4276, "step": 1190 }, { "epoch": 0.8770250368188512, "grad_norm": 0.41225728392601013, "learning_rate": 8.916017512290001e-06, "loss": 0.4269, "step": 1191 }, { "epoch": 0.8777614138438881, "grad_norm": 0.4658215641975403, "learning_rate": 8.913351953372565e-06, "loss": 0.4116, "step": 1192 }, { "epoch": 0.8784977908689249, "grad_norm": 0.4793921709060669, "learning_rate": 8.910683520607713e-06, "loss": 0.4811, "step": 1193 }, { "epoch": 0.8792341678939617, "grad_norm": 0.4011600613594055, "learning_rate": 8.90801221595506e-06, "loss": 0.4319, "step": 1194 }, { "epoch": 0.8799705449189985, "grad_norm": 0.4331344962120056, "learning_rate": 8.90533804137633e-06, "loss": 0.4457, "step": 1195 }, { "epoch": 0.8807069219440353, "grad_norm": 0.5520123839378357, "learning_rate": 8.902660998835359e-06, "loss": 0.4549, "step": 1196 }, { "epoch": 0.8814432989690721, "grad_norm": 0.4395567774772644, "learning_rate": 8.899981090298084e-06, "loss": 0.4334, "step": 1197 }, { "epoch": 0.882179675994109, "grad_norm": 0.4339301288127899, "learning_rate": 8.89729831773255e-06, "loss": 0.4518, "step": 1198 }, { "epoch": 0.8829160530191458, "grad_norm": 0.4404997229576111, "learning_rate": 8.894612683108905e-06, "loss": 0.4428, "step": 1199 }, { "epoch": 0.8836524300441826, "grad_norm": 0.47511598467826843, "learning_rate": 8.891924188399395e-06, "loss": 0.4455, "step": 1200 }, { "epoch": 0.8843888070692194, "grad_norm": 0.5081692934036255, "learning_rate": 8.889232835578372e-06, "loss": 0.5261, "step": 1201 }, { "epoch": 0.8851251840942562, "grad_norm": 0.5229424834251404, "learning_rate": 8.886538626622282e-06, "loss": 0.4371, "step": 1202 }, { "epoch": 0.885861561119293, "grad_norm": 0.45727550983428955, "learning_rate": 8.883841563509671e-06, "loss": 0.414, "step": 1203 }, { "epoch": 0.8865979381443299, "grad_norm": 0.4745890498161316, "learning_rate": 8.881141648221185e-06, "loss": 0.472, "step": 1204 }, { "epoch": 0.8873343151693667, "grad_norm": 0.48544570803642273, "learning_rate": 8.878438882739554e-06, "loss": 0.4719, "step": 1205 }, { "epoch": 0.8880706921944035, "grad_norm": 0.45275935530662537, "learning_rate": 8.87573326904961e-06, "loss": 0.4395, "step": 1206 }, { "epoch": 0.8888070692194403, "grad_norm": 0.527344822883606, "learning_rate": 8.873024809138272e-06, "loss": 0.4796, "step": 1207 }, { "epoch": 0.8895434462444771, "grad_norm": 0.44065412878990173, "learning_rate": 8.870313504994556e-06, "loss": 0.4587, "step": 1208 }, { "epoch": 0.8902798232695139, "grad_norm": 0.5130428671836853, "learning_rate": 8.867599358609557e-06, "loss": 0.4672, "step": 1209 }, { "epoch": 0.8910162002945509, "grad_norm": 0.4739178419113159, "learning_rate": 8.864882371976466e-06, "loss": 0.4384, "step": 1210 }, { "epoch": 0.8917525773195877, "grad_norm": 0.4686276614665985, "learning_rate": 8.862162547090551e-06, "loss": 0.4473, "step": 1211 }, { "epoch": 0.8924889543446245, "grad_norm": 0.43980199098587036, "learning_rate": 8.859439885949175e-06, "loss": 0.4568, "step": 1212 }, { "epoch": 0.8932253313696613, "grad_norm": 0.43443796038627625, "learning_rate": 8.856714390551774e-06, "loss": 0.4529, "step": 1213 }, { "epoch": 0.8939617083946981, "grad_norm": 0.514228343963623, "learning_rate": 8.853986062899869e-06, "loss": 0.4468, "step": 1214 }, { "epoch": 0.894698085419735, "grad_norm": 0.4369834363460541, "learning_rate": 8.851254904997062e-06, "loss": 0.4721, "step": 1215 }, { "epoch": 0.8954344624447718, "grad_norm": 0.5049661993980408, "learning_rate": 8.848520918849035e-06, "loss": 0.4094, "step": 1216 }, { "epoch": 0.8961708394698086, "grad_norm": 0.4989210069179535, "learning_rate": 8.845784106463545e-06, "loss": 0.4519, "step": 1217 }, { "epoch": 0.8969072164948454, "grad_norm": 0.4148315489292145, "learning_rate": 8.84304446985042e-06, "loss": 0.4464, "step": 1218 }, { "epoch": 0.8976435935198822, "grad_norm": 0.43474555015563965, "learning_rate": 8.84030201102157e-06, "loss": 0.4372, "step": 1219 }, { "epoch": 0.898379970544919, "grad_norm": 0.49237704277038574, "learning_rate": 8.837556731990973e-06, "loss": 0.4582, "step": 1220 }, { "epoch": 0.8991163475699558, "grad_norm": 0.46191561222076416, "learning_rate": 8.83480863477468e-06, "loss": 0.4557, "step": 1221 }, { "epoch": 0.8998527245949927, "grad_norm": 0.41119834780693054, "learning_rate": 8.832057721390809e-06, "loss": 0.4343, "step": 1222 }, { "epoch": 0.9005891016200295, "grad_norm": 0.4598017632961273, "learning_rate": 8.829303993859548e-06, "loss": 0.4636, "step": 1223 }, { "epoch": 0.9013254786450663, "grad_norm": 0.4588959515094757, "learning_rate": 8.826547454203152e-06, "loss": 0.4331, "step": 1224 }, { "epoch": 0.9020618556701031, "grad_norm": 0.44713202118873596, "learning_rate": 8.823788104445941e-06, "loss": 0.4329, "step": 1225 }, { "epoch": 0.9027982326951399, "grad_norm": 0.5540334582328796, "learning_rate": 8.821025946614295e-06, "loss": 0.4508, "step": 1226 }, { "epoch": 0.9035346097201767, "grad_norm": 0.42183777689933777, "learning_rate": 8.818260982736662e-06, "loss": 0.4633, "step": 1227 }, { "epoch": 0.9042709867452136, "grad_norm": 0.5915690064430237, "learning_rate": 8.815493214843546e-06, "loss": 0.4515, "step": 1228 }, { "epoch": 0.9050073637702504, "grad_norm": 0.5356271862983704, "learning_rate": 8.812722644967515e-06, "loss": 0.4205, "step": 1229 }, { "epoch": 0.9057437407952872, "grad_norm": 0.4718015789985657, "learning_rate": 8.809949275143189e-06, "loss": 0.4572, "step": 1230 }, { "epoch": 0.906480117820324, "grad_norm": 0.5468553900718689, "learning_rate": 8.807173107407248e-06, "loss": 0.4489, "step": 1231 }, { "epoch": 0.9072164948453608, "grad_norm": 0.559502899646759, "learning_rate": 8.804394143798426e-06, "loss": 0.4629, "step": 1232 }, { "epoch": 0.9079528718703976, "grad_norm": 0.47452765703201294, "learning_rate": 8.801612386357508e-06, "loss": 0.4457, "step": 1233 }, { "epoch": 0.9086892488954345, "grad_norm": 0.5304099917411804, "learning_rate": 8.798827837127336e-06, "loss": 0.4492, "step": 1234 }, { "epoch": 0.9094256259204713, "grad_norm": 0.5169060826301575, "learning_rate": 8.796040498152797e-06, "loss": 0.4748, "step": 1235 }, { "epoch": 0.9101620029455081, "grad_norm": 0.49267488718032837, "learning_rate": 8.793250371480827e-06, "loss": 0.4579, "step": 1236 }, { "epoch": 0.9108983799705449, "grad_norm": 0.4913206100463867, "learning_rate": 8.790457459160414e-06, "loss": 0.4255, "step": 1237 }, { "epoch": 0.9116347569955817, "grad_norm": 0.531548798084259, "learning_rate": 8.787661763242585e-06, "loss": 0.4728, "step": 1238 }, { "epoch": 0.9123711340206185, "grad_norm": 0.52623450756073, "learning_rate": 8.784863285780419e-06, "loss": 0.4661, "step": 1239 }, { "epoch": 0.9131075110456554, "grad_norm": 0.5517471432685852, "learning_rate": 8.782062028829028e-06, "loss": 0.4553, "step": 1240 }, { "epoch": 0.9138438880706922, "grad_norm": 0.472700297832489, "learning_rate": 8.779257994445574e-06, "loss": 0.4422, "step": 1241 }, { "epoch": 0.914580265095729, "grad_norm": 0.44898468255996704, "learning_rate": 8.776451184689253e-06, "loss": 0.4124, "step": 1242 }, { "epoch": 0.9153166421207658, "grad_norm": 0.48845183849334717, "learning_rate": 8.773641601621303e-06, "loss": 0.4469, "step": 1243 }, { "epoch": 0.9160530191458026, "grad_norm": 0.5202846527099609, "learning_rate": 8.770829247304998e-06, "loss": 0.4723, "step": 1244 }, { "epoch": 0.9167893961708394, "grad_norm": 0.44977980852127075, "learning_rate": 8.768014123805642e-06, "loss": 0.4413, "step": 1245 }, { "epoch": 0.9175257731958762, "grad_norm": 0.46154212951660156, "learning_rate": 8.765196233190579e-06, "loss": 0.4411, "step": 1246 }, { "epoch": 0.9182621502209131, "grad_norm": 0.5938853621482849, "learning_rate": 8.762375577529184e-06, "loss": 0.4399, "step": 1247 }, { "epoch": 0.9189985272459499, "grad_norm": 0.43907907605171204, "learning_rate": 8.75955215889286e-06, "loss": 0.4402, "step": 1248 }, { "epoch": 0.9197349042709867, "grad_norm": 0.4637261629104614, "learning_rate": 8.756725979355039e-06, "loss": 0.4587, "step": 1249 }, { "epoch": 0.9204712812960235, "grad_norm": 0.45832645893096924, "learning_rate": 8.753897040991183e-06, "loss": 0.4579, "step": 1250 }, { "epoch": 0.9212076583210603, "grad_norm": 0.44324028491973877, "learning_rate": 8.751065345878778e-06, "loss": 0.453, "step": 1251 }, { "epoch": 0.9219440353460973, "grad_norm": 0.5125420689582825, "learning_rate": 8.748230896097338e-06, "loss": 0.4422, "step": 1252 }, { "epoch": 0.9226804123711341, "grad_norm": 0.5083498954772949, "learning_rate": 8.745393693728395e-06, "loss": 0.4661, "step": 1253 }, { "epoch": 0.9234167893961709, "grad_norm": 0.4523887038230896, "learning_rate": 8.742553740855507e-06, "loss": 0.4469, "step": 1254 }, { "epoch": 0.9241531664212077, "grad_norm": 0.5064306259155273, "learning_rate": 8.739711039564245e-06, "loss": 0.4462, "step": 1255 }, { "epoch": 0.9248895434462445, "grad_norm": 0.48103612661361694, "learning_rate": 8.736865591942208e-06, "loss": 0.4334, "step": 1256 }, { "epoch": 0.9256259204712813, "grad_norm": 0.43229183554649353, "learning_rate": 8.734017400079002e-06, "loss": 0.4749, "step": 1257 }, { "epoch": 0.9263622974963182, "grad_norm": 0.47184067964553833, "learning_rate": 8.731166466066258e-06, "loss": 0.4692, "step": 1258 }, { "epoch": 0.927098674521355, "grad_norm": 0.49023813009262085, "learning_rate": 8.728312791997612e-06, "loss": 0.4593, "step": 1259 }, { "epoch": 0.9278350515463918, "grad_norm": 0.5471561551094055, "learning_rate": 8.725456379968717e-06, "loss": 0.465, "step": 1260 }, { "epoch": 0.9285714285714286, "grad_norm": 0.41933873295783997, "learning_rate": 8.722597232077236e-06, "loss": 0.4507, "step": 1261 }, { "epoch": 0.9293078055964654, "grad_norm": 0.4452081322669983, "learning_rate": 8.71973535042284e-06, "loss": 0.4604, "step": 1262 }, { "epoch": 0.9300441826215022, "grad_norm": 0.40033042430877686, "learning_rate": 8.716870737107211e-06, "loss": 0.4578, "step": 1263 }, { "epoch": 0.930780559646539, "grad_norm": 0.4695224463939667, "learning_rate": 8.714003394234031e-06, "loss": 0.4582, "step": 1264 }, { "epoch": 0.9315169366715759, "grad_norm": 0.40169644355773926, "learning_rate": 8.711133323908993e-06, "loss": 0.4523, "step": 1265 }, { "epoch": 0.9322533136966127, "grad_norm": 0.43690577149391174, "learning_rate": 8.708260528239788e-06, "loss": 0.4569, "step": 1266 }, { "epoch": 0.9329896907216495, "grad_norm": 0.44741305708885193, "learning_rate": 8.705385009336111e-06, "loss": 0.4363, "step": 1267 }, { "epoch": 0.9337260677466863, "grad_norm": 0.6049298644065857, "learning_rate": 8.702506769309656e-06, "loss": 0.4569, "step": 1268 }, { "epoch": 0.9344624447717231, "grad_norm": 0.43075326085090637, "learning_rate": 8.699625810274115e-06, "loss": 0.4377, "step": 1269 }, { "epoch": 0.93519882179676, "grad_norm": 0.5428841710090637, "learning_rate": 8.696742134345178e-06, "loss": 0.437, "step": 1270 }, { "epoch": 0.9359351988217968, "grad_norm": 0.5976313948631287, "learning_rate": 8.69385574364053e-06, "loss": 0.4581, "step": 1271 }, { "epoch": 0.9366715758468336, "grad_norm": 0.48488497734069824, "learning_rate": 8.690966640279846e-06, "loss": 0.448, "step": 1272 }, { "epoch": 0.9374079528718704, "grad_norm": 0.5577574372291565, "learning_rate": 8.688074826384801e-06, "loss": 0.4572, "step": 1273 }, { "epoch": 0.9381443298969072, "grad_norm": 0.459681898355484, "learning_rate": 8.685180304079051e-06, "loss": 0.4488, "step": 1274 }, { "epoch": 0.938880706921944, "grad_norm": 0.5070257782936096, "learning_rate": 8.682283075488249e-06, "loss": 0.4495, "step": 1275 }, { "epoch": 0.9396170839469808, "grad_norm": 0.486531525850296, "learning_rate": 8.679383142740033e-06, "loss": 0.4402, "step": 1276 }, { "epoch": 0.9403534609720177, "grad_norm": 0.5276297330856323, "learning_rate": 8.676480507964021e-06, "loss": 0.4487, "step": 1277 }, { "epoch": 0.9410898379970545, "grad_norm": 0.47423624992370605, "learning_rate": 8.673575173291826e-06, "loss": 0.449, "step": 1278 }, { "epoch": 0.9418262150220913, "grad_norm": 0.5563042163848877, "learning_rate": 8.670667140857034e-06, "loss": 0.4449, "step": 1279 }, { "epoch": 0.9425625920471281, "grad_norm": 0.47434091567993164, "learning_rate": 8.667756412795217e-06, "loss": 0.4455, "step": 1280 }, { "epoch": 0.9432989690721649, "grad_norm": 0.6123890280723572, "learning_rate": 8.664842991243927e-06, "loss": 0.4782, "step": 1281 }, { "epoch": 0.9440353460972017, "grad_norm": 0.42824527621269226, "learning_rate": 8.661926878342692e-06, "loss": 0.4512, "step": 1282 }, { "epoch": 0.9447717231222386, "grad_norm": 0.5722915530204773, "learning_rate": 8.659008076233016e-06, "loss": 0.4485, "step": 1283 }, { "epoch": 0.9455081001472754, "grad_norm": 0.5747035145759583, "learning_rate": 8.656086587058381e-06, "loss": 0.4503, "step": 1284 }, { "epoch": 0.9462444771723122, "grad_norm": 0.4897293150424957, "learning_rate": 8.65316241296424e-06, "loss": 0.4605, "step": 1285 }, { "epoch": 0.946980854197349, "grad_norm": 0.5299695134162903, "learning_rate": 8.650235556098017e-06, "loss": 0.4241, "step": 1286 }, { "epoch": 0.9477172312223858, "grad_norm": 0.5436539053916931, "learning_rate": 8.647306018609108e-06, "loss": 0.4211, "step": 1287 }, { "epoch": 0.9484536082474226, "grad_norm": 0.49147042632102966, "learning_rate": 8.644373802648877e-06, "loss": 0.4528, "step": 1288 }, { "epoch": 0.9491899852724595, "grad_norm": 0.5229616165161133, "learning_rate": 8.641438910370655e-06, "loss": 0.4492, "step": 1289 }, { "epoch": 0.9499263622974963, "grad_norm": 0.49656403064727783, "learning_rate": 8.638501343929735e-06, "loss": 0.4382, "step": 1290 }, { "epoch": 0.9506627393225331, "grad_norm": 0.47837382555007935, "learning_rate": 8.635561105483384e-06, "loss": 0.469, "step": 1291 }, { "epoch": 0.9513991163475699, "grad_norm": 0.48511695861816406, "learning_rate": 8.632618197190817e-06, "loss": 0.4372, "step": 1292 }, { "epoch": 0.9521354933726067, "grad_norm": 0.5691758394241333, "learning_rate": 8.629672621213221e-06, "loss": 0.4331, "step": 1293 }, { "epoch": 0.9528718703976435, "grad_norm": 0.46881407499313354, "learning_rate": 8.626724379713736e-06, "loss": 0.4669, "step": 1294 }, { "epoch": 0.9536082474226805, "grad_norm": 0.5768502354621887, "learning_rate": 8.623773474857461e-06, "loss": 0.4613, "step": 1295 }, { "epoch": 0.9543446244477173, "grad_norm": 0.49087420105934143, "learning_rate": 8.620819908811455e-06, "loss": 0.4296, "step": 1296 }, { "epoch": 0.9550810014727541, "grad_norm": 0.45804309844970703, "learning_rate": 8.617863683744726e-06, "loss": 0.4376, "step": 1297 }, { "epoch": 0.9558173784977909, "grad_norm": 0.47427794337272644, "learning_rate": 8.614904801828234e-06, "loss": 0.4583, "step": 1298 }, { "epoch": 0.9565537555228277, "grad_norm": 0.44913750886917114, "learning_rate": 8.611943265234895e-06, "loss": 0.4409, "step": 1299 }, { "epoch": 0.9572901325478645, "grad_norm": 0.491207480430603, "learning_rate": 8.608979076139572e-06, "loss": 0.449, "step": 1300 }, { "epoch": 0.9580265095729014, "grad_norm": 0.43761712312698364, "learning_rate": 8.606012236719073e-06, "loss": 0.442, "step": 1301 }, { "epoch": 0.9587628865979382, "grad_norm": 0.39326536655426025, "learning_rate": 8.60304274915216e-06, "loss": 0.4258, "step": 1302 }, { "epoch": 0.959499263622975, "grad_norm": 0.4258727431297302, "learning_rate": 8.600070615619528e-06, "loss": 0.4392, "step": 1303 }, { "epoch": 0.9602356406480118, "grad_norm": 0.4401472210884094, "learning_rate": 8.597095838303831e-06, "loss": 0.4483, "step": 1304 }, { "epoch": 0.9609720176730486, "grad_norm": 0.4828743040561676, "learning_rate": 8.594118419389648e-06, "loss": 0.4307, "step": 1305 }, { "epoch": 0.9617083946980854, "grad_norm": 0.4741956889629364, "learning_rate": 8.591138361063508e-06, "loss": 0.4331, "step": 1306 }, { "epoch": 0.9624447717231223, "grad_norm": 0.4706476926803589, "learning_rate": 8.588155665513877e-06, "loss": 0.4425, "step": 1307 }, { "epoch": 0.9631811487481591, "grad_norm": 0.4412619471549988, "learning_rate": 8.585170334931156e-06, "loss": 0.4312, "step": 1308 }, { "epoch": 0.9639175257731959, "grad_norm": 0.4449722170829773, "learning_rate": 8.58218237150768e-06, "loss": 0.4158, "step": 1309 }, { "epoch": 0.9646539027982327, "grad_norm": 0.45096832513809204, "learning_rate": 8.579191777437721e-06, "loss": 0.4532, "step": 1310 }, { "epoch": 0.9653902798232695, "grad_norm": 0.4758065938949585, "learning_rate": 8.57619855491748e-06, "loss": 0.4518, "step": 1311 }, { "epoch": 0.9661266568483063, "grad_norm": 0.4714369773864746, "learning_rate": 8.57320270614509e-06, "loss": 0.4579, "step": 1312 }, { "epoch": 0.9668630338733432, "grad_norm": 0.44195324182510376, "learning_rate": 8.57020423332061e-06, "loss": 0.4256, "step": 1313 }, { "epoch": 0.96759941089838, "grad_norm": 0.49739697575569153, "learning_rate": 8.567203138646027e-06, "loss": 0.4411, "step": 1314 }, { "epoch": 0.9683357879234168, "grad_norm": 0.48561814427375793, "learning_rate": 8.564199424325259e-06, "loss": 0.439, "step": 1315 }, { "epoch": 0.9690721649484536, "grad_norm": 0.44757330417633057, "learning_rate": 8.56119309256414e-06, "loss": 0.468, "step": 1316 }, { "epoch": 0.9698085419734904, "grad_norm": 0.5125359296798706, "learning_rate": 8.558184145570427e-06, "loss": 0.4643, "step": 1317 }, { "epoch": 0.9705449189985272, "grad_norm": 0.6268301606178284, "learning_rate": 8.555172585553804e-06, "loss": 0.4455, "step": 1318 }, { "epoch": 0.9712812960235641, "grad_norm": 0.4266813099384308, "learning_rate": 8.552158414725868e-06, "loss": 0.4357, "step": 1319 }, { "epoch": 0.9720176730486009, "grad_norm": 0.49251487851142883, "learning_rate": 8.549141635300135e-06, "loss": 0.4315, "step": 1320 }, { "epoch": 0.9727540500736377, "grad_norm": 0.4862709939479828, "learning_rate": 8.546122249492035e-06, "loss": 0.4352, "step": 1321 }, { "epoch": 0.9734904270986745, "grad_norm": 0.4898791015148163, "learning_rate": 8.543100259518916e-06, "loss": 0.4408, "step": 1322 }, { "epoch": 0.9742268041237113, "grad_norm": 0.45198628306388855, "learning_rate": 8.540075667600034e-06, "loss": 0.4316, "step": 1323 }, { "epoch": 0.9749631811487481, "grad_norm": 0.5984836220741272, "learning_rate": 8.53704847595656e-06, "loss": 0.4535, "step": 1324 }, { "epoch": 0.975699558173785, "grad_norm": 0.5049937963485718, "learning_rate": 8.534018686811572e-06, "loss": 0.4115, "step": 1325 }, { "epoch": 0.9764359351988218, "grad_norm": 0.41473087668418884, "learning_rate": 8.530986302390053e-06, "loss": 0.463, "step": 1326 }, { "epoch": 0.9771723122238586, "grad_norm": 0.4933287501335144, "learning_rate": 8.527951324918897e-06, "loss": 0.4182, "step": 1327 }, { "epoch": 0.9779086892488954, "grad_norm": 0.6397290229797363, "learning_rate": 8.5249137566269e-06, "loss": 0.4784, "step": 1328 }, { "epoch": 0.9786450662739322, "grad_norm": 0.45069581270217896, "learning_rate": 8.521873599744758e-06, "loss": 0.4373, "step": 1329 }, { "epoch": 0.979381443298969, "grad_norm": 0.5195262432098389, "learning_rate": 8.518830856505072e-06, "loss": 0.4582, "step": 1330 }, { "epoch": 0.9801178203240059, "grad_norm": 0.6199840307235718, "learning_rate": 8.515785529142339e-06, "loss": 0.4265, "step": 1331 }, { "epoch": 0.9808541973490427, "grad_norm": 0.5363901257514954, "learning_rate": 8.512737619892958e-06, "loss": 0.4388, "step": 1332 }, { "epoch": 0.9815905743740795, "grad_norm": 0.576992392539978, "learning_rate": 8.509687130995223e-06, "loss": 0.4475, "step": 1333 }, { "epoch": 0.9823269513991163, "grad_norm": 0.4745303690433502, "learning_rate": 8.506634064689314e-06, "loss": 0.432, "step": 1334 }, { "epoch": 0.9830633284241531, "grad_norm": 0.4878941476345062, "learning_rate": 8.503578423217316e-06, "loss": 0.433, "step": 1335 }, { "epoch": 0.9837997054491899, "grad_norm": 0.5367373824119568, "learning_rate": 8.500520208823199e-06, "loss": 0.4369, "step": 1336 }, { "epoch": 0.9845360824742269, "grad_norm": 0.5170448422431946, "learning_rate": 8.497459423752824e-06, "loss": 0.4418, "step": 1337 }, { "epoch": 0.9852724594992637, "grad_norm": 0.45060834288597107, "learning_rate": 8.494396070253934e-06, "loss": 0.4293, "step": 1338 }, { "epoch": 0.9860088365243005, "grad_norm": 0.43550148606300354, "learning_rate": 8.49133015057617e-06, "loss": 0.437, "step": 1339 }, { "epoch": 0.9867452135493373, "grad_norm": 0.4647277891635895, "learning_rate": 8.488261666971047e-06, "loss": 0.4149, "step": 1340 }, { "epoch": 0.9874815905743741, "grad_norm": 0.5019136667251587, "learning_rate": 8.485190621691967e-06, "loss": 0.4437, "step": 1341 }, { "epoch": 0.9882179675994109, "grad_norm": 0.41769254207611084, "learning_rate": 8.482117016994213e-06, "loss": 0.4457, "step": 1342 }, { "epoch": 0.9889543446244478, "grad_norm": 0.4599890112876892, "learning_rate": 8.479040855134949e-06, "loss": 0.4314, "step": 1343 }, { "epoch": 0.9896907216494846, "grad_norm": 0.495074063539505, "learning_rate": 8.475962138373212e-06, "loss": 0.4334, "step": 1344 }, { "epoch": 0.9904270986745214, "grad_norm": 0.41426074504852295, "learning_rate": 8.472880868969922e-06, "loss": 0.4461, "step": 1345 }, { "epoch": 0.9911634756995582, "grad_norm": 0.4830911457538605, "learning_rate": 8.469797049187867e-06, "loss": 0.4623, "step": 1346 }, { "epoch": 0.991899852724595, "grad_norm": 0.5529137849807739, "learning_rate": 8.466710681291714e-06, "loss": 0.4372, "step": 1347 }, { "epoch": 0.9926362297496318, "grad_norm": 0.46961885690689087, "learning_rate": 8.463621767547998e-06, "loss": 0.4219, "step": 1348 }, { "epoch": 0.9933726067746687, "grad_norm": 0.4638763666152954, "learning_rate": 8.46053031022512e-06, "loss": 0.4388, "step": 1349 }, { "epoch": 0.9941089837997055, "grad_norm": 0.44377005100250244, "learning_rate": 8.457436311593358e-06, "loss": 0.4169, "step": 1350 }, { "epoch": 0.9948453608247423, "grad_norm": 0.5292149782180786, "learning_rate": 8.454339773924849e-06, "loss": 0.4521, "step": 1351 }, { "epoch": 0.9955817378497791, "grad_norm": 0.5035756230354309, "learning_rate": 8.451240699493597e-06, "loss": 0.4451, "step": 1352 }, { "epoch": 0.9963181148748159, "grad_norm": 0.42530128359794617, "learning_rate": 8.448139090575467e-06, "loss": 0.434, "step": 1353 }, { "epoch": 0.9970544918998527, "grad_norm": 0.5264377593994141, "learning_rate": 8.445034949448188e-06, "loss": 0.4654, "step": 1354 }, { "epoch": 0.9977908689248896, "grad_norm": 0.5930060148239136, "learning_rate": 8.441928278391349e-06, "loss": 0.4539, "step": 1355 }, { "epoch": 0.9985272459499264, "grad_norm": 0.43737247586250305, "learning_rate": 8.438819079686391e-06, "loss": 0.4495, "step": 1356 }, { "epoch": 0.9992636229749632, "grad_norm": 0.46241891384124756, "learning_rate": 8.43570735561662e-06, "loss": 0.4729, "step": 1357 }, { "epoch": 1.0, "grad_norm": 0.5586955547332764, "learning_rate": 8.43259310846719e-06, "loss": 0.4411, "step": 1358 }, { "epoch": 1.0007363770250368, "grad_norm": 0.5139601230621338, "learning_rate": 8.429476340525111e-06, "loss": 0.4186, "step": 1359 }, { "epoch": 1.0014727540500736, "grad_norm": 0.42976438999176025, "learning_rate": 8.426357054079244e-06, "loss": 0.4069, "step": 1360 }, { "epoch": 1.0022091310751104, "grad_norm": 0.537762463092804, "learning_rate": 8.423235251420297e-06, "loss": 0.3928, "step": 1361 }, { "epoch": 1.0029455081001473, "grad_norm": 0.4901559054851532, "learning_rate": 8.420110934840826e-06, "loss": 0.4392, "step": 1362 }, { "epoch": 1.003681885125184, "grad_norm": 0.44034528732299805, "learning_rate": 8.416984106635238e-06, "loss": 0.4228, "step": 1363 }, { "epoch": 1.004418262150221, "grad_norm": 0.4630548059940338, "learning_rate": 8.413854769099779e-06, "loss": 0.4053, "step": 1364 }, { "epoch": 1.0051546391752577, "grad_norm": 0.5146474242210388, "learning_rate": 8.410722924532541e-06, "loss": 0.3992, "step": 1365 }, { "epoch": 1.0058910162002945, "grad_norm": 0.46642136573791504, "learning_rate": 8.407588575233457e-06, "loss": 0.4437, "step": 1366 }, { "epoch": 1.0066273932253313, "grad_norm": 0.4860120415687561, "learning_rate": 8.404451723504295e-06, "loss": 0.421, "step": 1367 }, { "epoch": 1.0073637702503682, "grad_norm": 0.5072521567344666, "learning_rate": 8.401312371648667e-06, "loss": 0.4082, "step": 1368 }, { "epoch": 1.008100147275405, "grad_norm": 0.4237157702445984, "learning_rate": 8.398170521972017e-06, "loss": 0.4189, "step": 1369 }, { "epoch": 1.0088365243004418, "grad_norm": 0.4586620628833771, "learning_rate": 8.395026176781627e-06, "loss": 0.3847, "step": 1370 }, { "epoch": 1.0095729013254786, "grad_norm": 0.4629966616630554, "learning_rate": 8.391879338386604e-06, "loss": 0.4173, "step": 1371 }, { "epoch": 1.0103092783505154, "grad_norm": 0.4874536097049713, "learning_rate": 8.388730009097895e-06, "loss": 0.4035, "step": 1372 }, { "epoch": 1.0110456553755522, "grad_norm": 0.4574902355670929, "learning_rate": 8.385578191228272e-06, "loss": 0.425, "step": 1373 }, { "epoch": 1.011782032400589, "grad_norm": 0.3942805230617523, "learning_rate": 8.382423887092333e-06, "loss": 0.4186, "step": 1374 }, { "epoch": 1.0125184094256259, "grad_norm": 0.4842493236064911, "learning_rate": 8.379267099006506e-06, "loss": 0.4149, "step": 1375 }, { "epoch": 1.0132547864506627, "grad_norm": 0.46591871976852417, "learning_rate": 8.376107829289037e-06, "loss": 0.4112, "step": 1376 }, { "epoch": 1.0139911634756995, "grad_norm": 0.4761437177658081, "learning_rate": 8.372946080260002e-06, "loss": 0.4109, "step": 1377 }, { "epoch": 1.0147275405007363, "grad_norm": 0.4643495976924896, "learning_rate": 8.369781854241293e-06, "loss": 0.4222, "step": 1378 }, { "epoch": 1.0154639175257731, "grad_norm": 0.4508487284183502, "learning_rate": 8.36661515355662e-06, "loss": 0.4215, "step": 1379 }, { "epoch": 1.01620029455081, "grad_norm": 0.5168802738189697, "learning_rate": 8.363445980531515e-06, "loss": 0.4213, "step": 1380 }, { "epoch": 1.0169366715758468, "grad_norm": 0.46994489431381226, "learning_rate": 8.360274337493321e-06, "loss": 0.4292, "step": 1381 }, { "epoch": 1.0176730486008836, "grad_norm": 0.44330844283103943, "learning_rate": 8.3571002267712e-06, "loss": 0.4107, "step": 1382 }, { "epoch": 1.0184094256259204, "grad_norm": 0.4858071506023407, "learning_rate": 8.353923650696119e-06, "loss": 0.3989, "step": 1383 }, { "epoch": 1.0191458026509572, "grad_norm": 0.5103859305381775, "learning_rate": 8.35074461160086e-06, "loss": 0.4168, "step": 1384 }, { "epoch": 1.019882179675994, "grad_norm": 0.4278160631656647, "learning_rate": 8.347563111820014e-06, "loss": 0.4058, "step": 1385 }, { "epoch": 1.0206185567010309, "grad_norm": 0.436535507440567, "learning_rate": 8.34437915368998e-06, "loss": 0.3877, "step": 1386 }, { "epoch": 1.0213549337260677, "grad_norm": 0.47809407114982605, "learning_rate": 8.341192739548958e-06, "loss": 0.3958, "step": 1387 }, { "epoch": 1.0220913107511045, "grad_norm": 0.4272827208042145, "learning_rate": 8.338003871736957e-06, "loss": 0.4139, "step": 1388 }, { "epoch": 1.0228276877761413, "grad_norm": 0.4393133819103241, "learning_rate": 8.334812552595782e-06, "loss": 0.4266, "step": 1389 }, { "epoch": 1.0235640648011781, "grad_norm": 0.45444509387016296, "learning_rate": 8.331618784469043e-06, "loss": 0.4055, "step": 1390 }, { "epoch": 1.024300441826215, "grad_norm": 0.46266138553619385, "learning_rate": 8.328422569702148e-06, "loss": 0.4283, "step": 1391 }, { "epoch": 1.0250368188512518, "grad_norm": 0.43913209438323975, "learning_rate": 8.325223910642297e-06, "loss": 0.4317, "step": 1392 }, { "epoch": 1.0257731958762886, "grad_norm": 0.4571021795272827, "learning_rate": 8.322022809638492e-06, "loss": 0.4255, "step": 1393 }, { "epoch": 1.0265095729013254, "grad_norm": 0.4817020893096924, "learning_rate": 8.318819269041524e-06, "loss": 0.4324, "step": 1394 }, { "epoch": 1.0272459499263622, "grad_norm": 0.4041571021080017, "learning_rate": 8.315613291203977e-06, "loss": 0.417, "step": 1395 }, { "epoch": 1.027982326951399, "grad_norm": 0.4440697729587555, "learning_rate": 8.312404878480222e-06, "loss": 0.4043, "step": 1396 }, { "epoch": 1.0287187039764358, "grad_norm": 0.47406235337257385, "learning_rate": 8.309194033226423e-06, "loss": 0.4089, "step": 1397 }, { "epoch": 1.0294550810014726, "grad_norm": 0.4187905192375183, "learning_rate": 8.305980757800525e-06, "loss": 0.4108, "step": 1398 }, { "epoch": 1.0301914580265095, "grad_norm": 0.45452675223350525, "learning_rate": 8.302765054562261e-06, "loss": 0.4159, "step": 1399 }, { "epoch": 1.0309278350515463, "grad_norm": 0.4963740110397339, "learning_rate": 8.299546925873148e-06, "loss": 0.411, "step": 1400 }, { "epoch": 1.031664212076583, "grad_norm": 0.48701077699661255, "learning_rate": 8.296326374096482e-06, "loss": 0.409, "step": 1401 }, { "epoch": 1.0324005891016201, "grad_norm": 0.4532162547111511, "learning_rate": 8.293103401597338e-06, "loss": 0.3964, "step": 1402 }, { "epoch": 1.033136966126657, "grad_norm": 0.4221610128879547, "learning_rate": 8.28987801074257e-06, "loss": 0.4084, "step": 1403 }, { "epoch": 1.0338733431516938, "grad_norm": 0.5044291615486145, "learning_rate": 8.286650203900808e-06, "loss": 0.4088, "step": 1404 }, { "epoch": 1.0346097201767306, "grad_norm": 0.4653428792953491, "learning_rate": 8.283419983442453e-06, "loss": 0.3964, "step": 1405 }, { "epoch": 1.0353460972017674, "grad_norm": 0.45748192071914673, "learning_rate": 8.280187351739686e-06, "loss": 0.4184, "step": 1406 }, { "epoch": 1.0360824742268042, "grad_norm": 0.4691620171070099, "learning_rate": 8.276952311166451e-06, "loss": 0.3849, "step": 1407 }, { "epoch": 1.036818851251841, "grad_norm": 0.46832898259162903, "learning_rate": 8.273714864098466e-06, "loss": 0.4052, "step": 1408 }, { "epoch": 1.0375552282768779, "grad_norm": 0.3961854875087738, "learning_rate": 8.270475012913212e-06, "loss": 0.3872, "step": 1409 }, { "epoch": 1.0382916053019147, "grad_norm": 0.4253145158290863, "learning_rate": 8.267232759989938e-06, "loss": 0.3978, "step": 1410 }, { "epoch": 1.0390279823269515, "grad_norm": 0.43363869190216064, "learning_rate": 8.26398810770966e-06, "loss": 0.4387, "step": 1411 }, { "epoch": 1.0397643593519883, "grad_norm": 0.43698909878730774, "learning_rate": 8.260741058455147e-06, "loss": 0.4094, "step": 1412 }, { "epoch": 1.0405007363770251, "grad_norm": 0.3974965214729309, "learning_rate": 8.257491614610939e-06, "loss": 0.407, "step": 1413 }, { "epoch": 1.041237113402062, "grad_norm": 0.4418043792247772, "learning_rate": 8.254239778563325e-06, "loss": 0.4174, "step": 1414 }, { "epoch": 1.0419734904270987, "grad_norm": 0.43214893341064453, "learning_rate": 8.250985552700359e-06, "loss": 0.4399, "step": 1415 }, { "epoch": 1.0427098674521356, "grad_norm": 0.41666722297668457, "learning_rate": 8.247728939411845e-06, "loss": 0.4377, "step": 1416 }, { "epoch": 1.0434462444771724, "grad_norm": 0.45747169852256775, "learning_rate": 8.24446994108934e-06, "loss": 0.4204, "step": 1417 }, { "epoch": 1.0441826215022092, "grad_norm": 0.4472166895866394, "learning_rate": 8.241208560126154e-06, "loss": 0.4299, "step": 1418 }, { "epoch": 1.044918998527246, "grad_norm": 0.42849603295326233, "learning_rate": 8.237944798917347e-06, "loss": 0.419, "step": 1419 }, { "epoch": 1.0456553755522828, "grad_norm": 0.46297961473464966, "learning_rate": 8.234678659859729e-06, "loss": 0.4288, "step": 1420 }, { "epoch": 1.0463917525773196, "grad_norm": 0.5132995247840881, "learning_rate": 8.231410145351853e-06, "loss": 0.4088, "step": 1421 }, { "epoch": 1.0471281296023565, "grad_norm": 0.45529741048812866, "learning_rate": 8.228139257794012e-06, "loss": 0.4097, "step": 1422 }, { "epoch": 1.0478645066273933, "grad_norm": 0.4541131854057312, "learning_rate": 8.224865999588254e-06, "loss": 0.3927, "step": 1423 }, { "epoch": 1.04860088365243, "grad_norm": 0.46962690353393555, "learning_rate": 8.221590373138358e-06, "loss": 0.4196, "step": 1424 }, { "epoch": 1.049337260677467, "grad_norm": 0.39268723130226135, "learning_rate": 8.218312380849844e-06, "loss": 0.4267, "step": 1425 }, { "epoch": 1.0500736377025037, "grad_norm": 0.38890156149864197, "learning_rate": 8.21503202512997e-06, "loss": 0.4095, "step": 1426 }, { "epoch": 1.0508100147275405, "grad_norm": 0.4562273621559143, "learning_rate": 8.211749308387734e-06, "loss": 0.4437, "step": 1427 }, { "epoch": 1.0515463917525774, "grad_norm": 0.43432724475860596, "learning_rate": 8.208464233033862e-06, "loss": 0.4181, "step": 1428 }, { "epoch": 1.0522827687776142, "grad_norm": 0.4545213580131531, "learning_rate": 8.205176801480811e-06, "loss": 0.4148, "step": 1429 }, { "epoch": 1.053019145802651, "grad_norm": 0.5053450465202332, "learning_rate": 8.201887016142776e-06, "loss": 0.4027, "step": 1430 }, { "epoch": 1.0537555228276878, "grad_norm": 0.4846493601799011, "learning_rate": 8.198594879435673e-06, "loss": 0.4324, "step": 1431 }, { "epoch": 1.0544918998527246, "grad_norm": 0.4266805648803711, "learning_rate": 8.19530039377715e-06, "loss": 0.4072, "step": 1432 }, { "epoch": 1.0552282768777614, "grad_norm": 0.5038962364196777, "learning_rate": 8.192003561586576e-06, "loss": 0.4021, "step": 1433 }, { "epoch": 1.0559646539027983, "grad_norm": 0.5069649815559387, "learning_rate": 8.188704385285046e-06, "loss": 0.4373, "step": 1434 }, { "epoch": 1.056701030927835, "grad_norm": 0.40627750754356384, "learning_rate": 8.185402867295373e-06, "loss": 0.4307, "step": 1435 }, { "epoch": 1.0574374079528719, "grad_norm": 0.5366407632827759, "learning_rate": 8.182099010042095e-06, "loss": 0.4244, "step": 1436 }, { "epoch": 1.0581737849779087, "grad_norm": 0.4338124096393585, "learning_rate": 8.178792815951465e-06, "loss": 0.4091, "step": 1437 }, { "epoch": 1.0589101620029455, "grad_norm": 0.4336191415786743, "learning_rate": 8.175484287451448e-06, "loss": 0.4111, "step": 1438 }, { "epoch": 1.0596465390279823, "grad_norm": 0.4909955561161041, "learning_rate": 8.172173426971732e-06, "loss": 0.4237, "step": 1439 }, { "epoch": 1.0603829160530192, "grad_norm": 0.6215251088142395, "learning_rate": 8.168860236943709e-06, "loss": 0.4154, "step": 1440 }, { "epoch": 1.061119293078056, "grad_norm": 0.4355964958667755, "learning_rate": 8.16554471980049e-06, "loss": 0.3951, "step": 1441 }, { "epoch": 1.0618556701030928, "grad_norm": 0.5015247464179993, "learning_rate": 8.162226877976886e-06, "loss": 0.4373, "step": 1442 }, { "epoch": 1.0625920471281296, "grad_norm": 0.49945569038391113, "learning_rate": 8.158906713909425e-06, "loss": 0.3987, "step": 1443 }, { "epoch": 1.0633284241531664, "grad_norm": 0.492780864238739, "learning_rate": 8.155584230036328e-06, "loss": 0.424, "step": 1444 }, { "epoch": 1.0640648011782032, "grad_norm": 0.5154081583023071, "learning_rate": 8.152259428797535e-06, "loss": 0.4135, "step": 1445 }, { "epoch": 1.06480117820324, "grad_norm": 0.4992247521877289, "learning_rate": 8.148932312634674e-06, "loss": 0.4126, "step": 1446 }, { "epoch": 1.0655375552282769, "grad_norm": 0.579197883605957, "learning_rate": 8.14560288399108e-06, "loss": 0.4069, "step": 1447 }, { "epoch": 1.0662739322533137, "grad_norm": 0.5475592613220215, "learning_rate": 8.142271145311784e-06, "loss": 0.4106, "step": 1448 }, { "epoch": 1.0670103092783505, "grad_norm": 0.5077992081642151, "learning_rate": 8.138937099043516e-06, "loss": 0.4135, "step": 1449 }, { "epoch": 1.0677466863033873, "grad_norm": 0.5219954252243042, "learning_rate": 8.135600747634697e-06, "loss": 0.4654, "step": 1450 }, { "epoch": 1.0684830633284241, "grad_norm": 0.5722024440765381, "learning_rate": 8.132262093535444e-06, "loss": 0.4448, "step": 1451 }, { "epoch": 1.069219440353461, "grad_norm": 0.4120500683784485, "learning_rate": 8.128921139197563e-06, "loss": 0.3952, "step": 1452 }, { "epoch": 1.0699558173784978, "grad_norm": 0.46952569484710693, "learning_rate": 8.125577887074552e-06, "loss": 0.4353, "step": 1453 }, { "epoch": 1.0706921944035346, "grad_norm": 0.5226830244064331, "learning_rate": 8.12223233962159e-06, "loss": 0.409, "step": 1454 }, { "epoch": 1.0714285714285714, "grad_norm": 0.5324341058731079, "learning_rate": 8.118884499295549e-06, "loss": 0.4086, "step": 1455 }, { "epoch": 1.0721649484536082, "grad_norm": 0.4469471573829651, "learning_rate": 8.115534368554981e-06, "loss": 0.4064, "step": 1456 }, { "epoch": 1.072901325478645, "grad_norm": 0.5327169895172119, "learning_rate": 8.112181949860121e-06, "loss": 0.4437, "step": 1457 }, { "epoch": 1.0736377025036818, "grad_norm": 0.5192805528640747, "learning_rate": 8.108827245672884e-06, "loss": 0.4017, "step": 1458 }, { "epoch": 1.0743740795287187, "grad_norm": 0.3944444954395294, "learning_rate": 8.105470258456863e-06, "loss": 0.3981, "step": 1459 }, { "epoch": 1.0751104565537555, "grad_norm": 0.4764431416988373, "learning_rate": 8.102110990677328e-06, "loss": 0.4081, "step": 1460 }, { "epoch": 1.0758468335787923, "grad_norm": 0.49513620138168335, "learning_rate": 8.098749444801226e-06, "loss": 0.4406, "step": 1461 }, { "epoch": 1.076583210603829, "grad_norm": 0.4354887306690216, "learning_rate": 8.095385623297171e-06, "loss": 0.4379, "step": 1462 }, { "epoch": 1.077319587628866, "grad_norm": 0.541980504989624, "learning_rate": 8.092019528635453e-06, "loss": 0.43, "step": 1463 }, { "epoch": 1.0780559646539027, "grad_norm": 0.47262078523635864, "learning_rate": 8.088651163288032e-06, "loss": 0.4201, "step": 1464 }, { "epoch": 1.0787923416789396, "grad_norm": 0.4839096665382385, "learning_rate": 8.085280529728533e-06, "loss": 0.4137, "step": 1465 }, { "epoch": 1.0795287187039764, "grad_norm": 0.4608899652957916, "learning_rate": 8.081907630432246e-06, "loss": 0.4211, "step": 1466 }, { "epoch": 1.0802650957290132, "grad_norm": 0.4837184250354767, "learning_rate": 8.078532467876126e-06, "loss": 0.4036, "step": 1467 }, { "epoch": 1.08100147275405, "grad_norm": 0.518398642539978, "learning_rate": 8.075155044538792e-06, "loss": 0.4052, "step": 1468 }, { "epoch": 1.0817378497790868, "grad_norm": 0.5079612731933594, "learning_rate": 8.071775362900522e-06, "loss": 0.4255, "step": 1469 }, { "epoch": 1.0824742268041236, "grad_norm": 0.4972943365573883, "learning_rate": 8.068393425443253e-06, "loss": 0.3831, "step": 1470 }, { "epoch": 1.0832106038291605, "grad_norm": 0.4453538656234741, "learning_rate": 8.065009234650574e-06, "loss": 0.4174, "step": 1471 }, { "epoch": 1.0839469808541973, "grad_norm": 0.46142661571502686, "learning_rate": 8.061622793007735e-06, "loss": 0.4331, "step": 1472 }, { "epoch": 1.084683357879234, "grad_norm": 0.4330517053604126, "learning_rate": 8.058234103001634e-06, "loss": 0.4285, "step": 1473 }, { "epoch": 1.085419734904271, "grad_norm": 0.4767371118068695, "learning_rate": 8.054843167120827e-06, "loss": 0.4198, "step": 1474 }, { "epoch": 1.0861561119293077, "grad_norm": 0.45682060718536377, "learning_rate": 8.051449987855512e-06, "loss": 0.4344, "step": 1475 }, { "epoch": 1.0868924889543445, "grad_norm": 0.44379401206970215, "learning_rate": 8.048054567697537e-06, "loss": 0.4296, "step": 1476 }, { "epoch": 1.0876288659793814, "grad_norm": 0.4649368226528168, "learning_rate": 8.044656909140395e-06, "loss": 0.4213, "step": 1477 }, { "epoch": 1.0883652430044182, "grad_norm": 0.4579814672470093, "learning_rate": 8.041257014679228e-06, "loss": 0.4117, "step": 1478 }, { "epoch": 1.089101620029455, "grad_norm": 0.4541153013706207, "learning_rate": 8.037854886810813e-06, "loss": 0.4197, "step": 1479 }, { "epoch": 1.0898379970544918, "grad_norm": 0.39233219623565674, "learning_rate": 8.034450528033565e-06, "loss": 0.3982, "step": 1480 }, { "epoch": 1.0905743740795286, "grad_norm": 0.40796148777008057, "learning_rate": 8.031043940847551e-06, "loss": 0.4377, "step": 1481 }, { "epoch": 1.0913107511045654, "grad_norm": 0.47203153371810913, "learning_rate": 8.027635127754462e-06, "loss": 0.4251, "step": 1482 }, { "epoch": 1.0920471281296025, "grad_norm": 0.4739782512187958, "learning_rate": 8.024224091257628e-06, "loss": 0.4257, "step": 1483 }, { "epoch": 1.0927835051546393, "grad_norm": 0.3920868933200836, "learning_rate": 8.020810833862009e-06, "loss": 0.406, "step": 1484 }, { "epoch": 1.093519882179676, "grad_norm": 0.5111119151115417, "learning_rate": 8.017395358074198e-06, "loss": 0.4216, "step": 1485 }, { "epoch": 1.094256259204713, "grad_norm": 0.46156689524650574, "learning_rate": 8.013977666402421e-06, "loss": 0.4185, "step": 1486 }, { "epoch": 1.0949926362297497, "grad_norm": 0.42204272747039795, "learning_rate": 8.010557761356523e-06, "loss": 0.417, "step": 1487 }, { "epoch": 1.0957290132547866, "grad_norm": 0.5379983186721802, "learning_rate": 8.007135645447982e-06, "loss": 0.4235, "step": 1488 }, { "epoch": 1.0964653902798234, "grad_norm": 0.4320377707481384, "learning_rate": 8.003711321189895e-06, "loss": 0.4081, "step": 1489 }, { "epoch": 1.0972017673048602, "grad_norm": 0.4671606421470642, "learning_rate": 8.000284791096983e-06, "loss": 0.4235, "step": 1490 }, { "epoch": 1.097938144329897, "grad_norm": 0.47538307309150696, "learning_rate": 7.996856057685587e-06, "loss": 0.45, "step": 1491 }, { "epoch": 1.0986745213549338, "grad_norm": 0.45516252517700195, "learning_rate": 7.993425123473662e-06, "loss": 0.4002, "step": 1492 }, { "epoch": 1.0994108983799706, "grad_norm": 0.4619099199771881, "learning_rate": 7.989991990980786e-06, "loss": 0.4214, "step": 1493 }, { "epoch": 1.1001472754050075, "grad_norm": 0.36979731917381287, "learning_rate": 7.986556662728145e-06, "loss": 0.4225, "step": 1494 }, { "epoch": 1.1008836524300443, "grad_norm": 0.43840059638023376, "learning_rate": 7.983119141238543e-06, "loss": 0.4197, "step": 1495 }, { "epoch": 1.101620029455081, "grad_norm": 0.4606439769268036, "learning_rate": 7.97967942903639e-06, "loss": 0.4014, "step": 1496 }, { "epoch": 1.102356406480118, "grad_norm": 0.38191360235214233, "learning_rate": 7.976237528647705e-06, "loss": 0.4149, "step": 1497 }, { "epoch": 1.1030927835051547, "grad_norm": 0.4278278648853302, "learning_rate": 7.97279344260012e-06, "loss": 0.4283, "step": 1498 }, { "epoch": 1.1038291605301915, "grad_norm": 0.4436081051826477, "learning_rate": 7.969347173422866e-06, "loss": 0.4274, "step": 1499 }, { "epoch": 1.1045655375552283, "grad_norm": 0.40234655141830444, "learning_rate": 7.965898723646777e-06, "loss": 0.4044, "step": 1500 }, { "epoch": 1.1053019145802652, "grad_norm": 0.38506656885147095, "learning_rate": 7.962448095804292e-06, "loss": 0.4066, "step": 1501 }, { "epoch": 1.106038291605302, "grad_norm": 0.4169124364852905, "learning_rate": 7.958995292429447e-06, "loss": 0.423, "step": 1502 }, { "epoch": 1.1067746686303388, "grad_norm": 0.4359695613384247, "learning_rate": 7.955540316057877e-06, "loss": 0.4202, "step": 1503 }, { "epoch": 1.1075110456553756, "grad_norm": 0.4167039394378662, "learning_rate": 7.952083169226813e-06, "loss": 0.3861, "step": 1504 }, { "epoch": 1.1082474226804124, "grad_norm": 0.43994608521461487, "learning_rate": 7.948623854475079e-06, "loss": 0.4144, "step": 1505 }, { "epoch": 1.1089837997054492, "grad_norm": 0.4638996422290802, "learning_rate": 7.94516237434309e-06, "loss": 0.4085, "step": 1506 }, { "epoch": 1.109720176730486, "grad_norm": 0.4726799726486206, "learning_rate": 7.941698731372851e-06, "loss": 0.3952, "step": 1507 }, { "epoch": 1.1104565537555229, "grad_norm": 0.5001874566078186, "learning_rate": 7.938232928107963e-06, "loss": 0.4176, "step": 1508 }, { "epoch": 1.1111929307805597, "grad_norm": 0.39368122816085815, "learning_rate": 7.9347649670936e-06, "loss": 0.4093, "step": 1509 }, { "epoch": 1.1119293078055965, "grad_norm": 0.4525279998779297, "learning_rate": 7.93129485087653e-06, "loss": 0.4261, "step": 1510 }, { "epoch": 1.1126656848306333, "grad_norm": 0.43985629081726074, "learning_rate": 7.927822582005104e-06, "loss": 0.4309, "step": 1511 }, { "epoch": 1.1134020618556701, "grad_norm": 0.4164014160633087, "learning_rate": 7.924348163029249e-06, "loss": 0.4273, "step": 1512 }, { "epoch": 1.114138438880707, "grad_norm": 0.5301308035850525, "learning_rate": 7.920871596500473e-06, "loss": 0.4277, "step": 1513 }, { "epoch": 1.1148748159057438, "grad_norm": 0.4031001925468445, "learning_rate": 7.917392884971863e-06, "loss": 0.4058, "step": 1514 }, { "epoch": 1.1156111929307806, "grad_norm": 0.564858078956604, "learning_rate": 7.913912030998079e-06, "loss": 0.4363, "step": 1515 }, { "epoch": 1.1163475699558174, "grad_norm": 0.45828789472579956, "learning_rate": 7.910429037135355e-06, "loss": 0.4088, "step": 1516 }, { "epoch": 1.1170839469808542, "grad_norm": 0.5494905114173889, "learning_rate": 7.906943905941495e-06, "loss": 0.4241, "step": 1517 }, { "epoch": 1.117820324005891, "grad_norm": 0.5447287559509277, "learning_rate": 7.903456639975875e-06, "loss": 0.4177, "step": 1518 }, { "epoch": 1.1185567010309279, "grad_norm": 0.4168299734592438, "learning_rate": 7.89996724179944e-06, "loss": 0.4134, "step": 1519 }, { "epoch": 1.1192930780559647, "grad_norm": 0.5956445336341858, "learning_rate": 7.896475713974696e-06, "loss": 0.4133, "step": 1520 }, { "epoch": 1.1200294550810015, "grad_norm": 0.5098801255226135, "learning_rate": 7.892982059065714e-06, "loss": 0.4245, "step": 1521 }, { "epoch": 1.1207658321060383, "grad_norm": 0.4896615743637085, "learning_rate": 7.889486279638134e-06, "loss": 0.4074, "step": 1522 }, { "epoch": 1.1215022091310751, "grad_norm": 0.5458217263221741, "learning_rate": 7.885988378259145e-06, "loss": 0.4018, "step": 1523 }, { "epoch": 1.122238586156112, "grad_norm": 0.49367034435272217, "learning_rate": 7.882488357497504e-06, "loss": 0.4176, "step": 1524 }, { "epoch": 1.1229749631811488, "grad_norm": 0.5893204212188721, "learning_rate": 7.87898621992352e-06, "loss": 0.4162, "step": 1525 }, { "epoch": 1.1237113402061856, "grad_norm": 0.5065829753875732, "learning_rate": 7.875481968109052e-06, "loss": 0.4279, "step": 1526 }, { "epoch": 1.1244477172312224, "grad_norm": 0.5080648064613342, "learning_rate": 7.871975604627524e-06, "loss": 0.3956, "step": 1527 }, { "epoch": 1.1251840942562592, "grad_norm": 0.6169611811637878, "learning_rate": 7.8684671320539e-06, "loss": 0.3985, "step": 1528 }, { "epoch": 1.125920471281296, "grad_norm": 0.4453141987323761, "learning_rate": 7.864956552964695e-06, "loss": 0.4053, "step": 1529 }, { "epoch": 1.1266568483063328, "grad_norm": 0.5291115641593933, "learning_rate": 7.861443869937973e-06, "loss": 0.414, "step": 1530 }, { "epoch": 1.1273932253313697, "grad_norm": 0.5258786678314209, "learning_rate": 7.857929085553344e-06, "loss": 0.4303, "step": 1531 }, { "epoch": 1.1281296023564065, "grad_norm": 0.4741217792034149, "learning_rate": 7.854412202391958e-06, "loss": 0.4174, "step": 1532 }, { "epoch": 1.1288659793814433, "grad_norm": 0.5719811320304871, "learning_rate": 7.850893223036508e-06, "loss": 0.4043, "step": 1533 }, { "epoch": 1.12960235640648, "grad_norm": 0.46365654468536377, "learning_rate": 7.847372150071227e-06, "loss": 0.4338, "step": 1534 }, { "epoch": 1.130338733431517, "grad_norm": 0.4973761737346649, "learning_rate": 7.843848986081882e-06, "loss": 0.3943, "step": 1535 }, { "epoch": 1.1310751104565537, "grad_norm": 0.5263963937759399, "learning_rate": 7.84032373365578e-06, "loss": 0.3909, "step": 1536 }, { "epoch": 1.1318114874815906, "grad_norm": 0.452614426612854, "learning_rate": 7.836796395381761e-06, "loss": 0.415, "step": 1537 }, { "epoch": 1.1325478645066274, "grad_norm": 0.49693626165390015, "learning_rate": 7.833266973850192e-06, "loss": 0.4189, "step": 1538 }, { "epoch": 1.1332842415316642, "grad_norm": 0.5099273920059204, "learning_rate": 7.829735471652978e-06, "loss": 0.4125, "step": 1539 }, { "epoch": 1.134020618556701, "grad_norm": 0.40911126136779785, "learning_rate": 7.826201891383542e-06, "loss": 0.4016, "step": 1540 }, { "epoch": 1.1347569955817378, "grad_norm": 0.525810182094574, "learning_rate": 7.822666235636844e-06, "loss": 0.4202, "step": 1541 }, { "epoch": 1.1354933726067746, "grad_norm": 0.40537041425704956, "learning_rate": 7.819128507009361e-06, "loss": 0.4088, "step": 1542 }, { "epoch": 1.1362297496318114, "grad_norm": 0.42212677001953125, "learning_rate": 7.815588708099094e-06, "loss": 0.4222, "step": 1543 }, { "epoch": 1.1369661266568483, "grad_norm": 0.5106800198554993, "learning_rate": 7.812046841505563e-06, "loss": 0.4592, "step": 1544 }, { "epoch": 1.137702503681885, "grad_norm": 0.419968843460083, "learning_rate": 7.808502909829807e-06, "loss": 0.4089, "step": 1545 }, { "epoch": 1.138438880706922, "grad_norm": 0.4281230568885803, "learning_rate": 7.804956915674387e-06, "loss": 0.3874, "step": 1546 }, { "epoch": 1.1391752577319587, "grad_norm": 0.502241313457489, "learning_rate": 7.80140886164337e-06, "loss": 0.4284, "step": 1547 }, { "epoch": 1.1399116347569955, "grad_norm": 0.39548102021217346, "learning_rate": 7.79785875034234e-06, "loss": 0.408, "step": 1548 }, { "epoch": 1.1406480117820323, "grad_norm": 0.4249773919582367, "learning_rate": 7.794306584378392e-06, "loss": 0.4015, "step": 1549 }, { "epoch": 1.1413843888070692, "grad_norm": 0.4784921109676361, "learning_rate": 7.79075236636013e-06, "loss": 0.4199, "step": 1550 }, { "epoch": 1.142120765832106, "grad_norm": 0.42127084732055664, "learning_rate": 7.787196098897664e-06, "loss": 0.3893, "step": 1551 }, { "epoch": 1.1428571428571428, "grad_norm": 0.4333012104034424, "learning_rate": 7.783637784602608e-06, "loss": 0.3984, "step": 1552 }, { "epoch": 1.1435935198821796, "grad_norm": 0.47243404388427734, "learning_rate": 7.780077426088083e-06, "loss": 0.4212, "step": 1553 }, { "epoch": 1.1443298969072164, "grad_norm": 0.47604048252105713, "learning_rate": 7.776515025968707e-06, "loss": 0.3994, "step": 1554 }, { "epoch": 1.1450662739322532, "grad_norm": 0.5063879489898682, "learning_rate": 7.772950586860599e-06, "loss": 0.4086, "step": 1555 }, { "epoch": 1.14580265095729, "grad_norm": 0.43437930941581726, "learning_rate": 7.769384111381375e-06, "loss": 0.4358, "step": 1556 }, { "epoch": 1.1465390279823269, "grad_norm": 0.46195188164711, "learning_rate": 7.76581560215015e-06, "loss": 0.4083, "step": 1557 }, { "epoch": 1.1472754050073637, "grad_norm": 0.4431914985179901, "learning_rate": 7.762245061787525e-06, "loss": 0.4024, "step": 1558 }, { "epoch": 1.1480117820324005, "grad_norm": 0.42202213406562805, "learning_rate": 7.758672492915598e-06, "loss": 0.3988, "step": 1559 }, { "epoch": 1.1487481590574373, "grad_norm": 0.4758620858192444, "learning_rate": 7.755097898157957e-06, "loss": 0.4307, "step": 1560 }, { "epoch": 1.1494845360824741, "grad_norm": 0.4043138027191162, "learning_rate": 7.751521280139675e-06, "loss": 0.3967, "step": 1561 }, { "epoch": 1.150220913107511, "grad_norm": 0.41827574372291565, "learning_rate": 7.747942641487313e-06, "loss": 0.4107, "step": 1562 }, { "epoch": 1.1509572901325478, "grad_norm": 0.45348092913627625, "learning_rate": 7.74436198482892e-06, "loss": 0.4278, "step": 1563 }, { "epoch": 1.1516936671575846, "grad_norm": 0.43559592962265015, "learning_rate": 7.74077931279401e-06, "loss": 0.4146, "step": 1564 }, { "epoch": 1.1524300441826214, "grad_norm": 0.3940446078777313, "learning_rate": 7.7371946280136e-06, "loss": 0.404, "step": 1565 }, { "epoch": 1.1531664212076582, "grad_norm": 0.4286969006061554, "learning_rate": 7.73360793312017e-06, "loss": 0.4078, "step": 1566 }, { "epoch": 1.153902798232695, "grad_norm": 0.4312724471092224, "learning_rate": 7.730019230747681e-06, "loss": 0.4088, "step": 1567 }, { "epoch": 1.1546391752577319, "grad_norm": 0.41734251379966736, "learning_rate": 7.726428523531565e-06, "loss": 0.4228, "step": 1568 }, { "epoch": 1.1553755522827687, "grad_norm": 0.435044527053833, "learning_rate": 7.722835814108733e-06, "loss": 0.4059, "step": 1569 }, { "epoch": 1.1561119293078055, "grad_norm": 0.443087637424469, "learning_rate": 7.719241105117559e-06, "loss": 0.3977, "step": 1570 }, { "epoch": 1.1568483063328423, "grad_norm": 0.3917474150657654, "learning_rate": 7.715644399197893e-06, "loss": 0.3999, "step": 1571 }, { "epoch": 1.1575846833578791, "grad_norm": 0.4369930922985077, "learning_rate": 7.712045698991043e-06, "loss": 0.4112, "step": 1572 }, { "epoch": 1.158321060382916, "grad_norm": 0.42742857336997986, "learning_rate": 7.708445007139785e-06, "loss": 0.4092, "step": 1573 }, { "epoch": 1.1590574374079528, "grad_norm": 0.41142895817756653, "learning_rate": 7.70484232628836e-06, "loss": 0.4413, "step": 1574 }, { "epoch": 1.1597938144329896, "grad_norm": 0.4239861071109772, "learning_rate": 7.70123765908247e-06, "loss": 0.4047, "step": 1575 }, { "epoch": 1.1605301914580266, "grad_norm": 0.41946011781692505, "learning_rate": 7.69763100816927e-06, "loss": 0.3995, "step": 1576 }, { "epoch": 1.1612665684830634, "grad_norm": 0.41380631923675537, "learning_rate": 7.69402237619738e-06, "loss": 0.4251, "step": 1577 }, { "epoch": 1.1620029455081002, "grad_norm": 0.4237883985042572, "learning_rate": 7.690411765816864e-06, "loss": 0.4196, "step": 1578 }, { "epoch": 1.162739322533137, "grad_norm": 0.4093558192253113, "learning_rate": 7.68679917967925e-06, "loss": 0.412, "step": 1579 }, { "epoch": 1.1634756995581739, "grad_norm": 0.41598761081695557, "learning_rate": 7.683184620437511e-06, "loss": 0.4031, "step": 1580 }, { "epoch": 1.1642120765832107, "grad_norm": 0.43564292788505554, "learning_rate": 7.67956809074607e-06, "loss": 0.397, "step": 1581 }, { "epoch": 1.1649484536082475, "grad_norm": 0.40577253699302673, "learning_rate": 7.675949593260797e-06, "loss": 0.3964, "step": 1582 }, { "epoch": 1.1656848306332843, "grad_norm": 0.43315377831459045, "learning_rate": 7.672329130639007e-06, "loss": 0.4423, "step": 1583 }, { "epoch": 1.1664212076583211, "grad_norm": 0.5157422423362732, "learning_rate": 7.668706705539458e-06, "loss": 0.4398, "step": 1584 }, { "epoch": 1.167157584683358, "grad_norm": 0.4576352834701538, "learning_rate": 7.66508232062235e-06, "loss": 0.4075, "step": 1585 }, { "epoch": 1.1678939617083948, "grad_norm": 0.3952282965183258, "learning_rate": 7.661455978549322e-06, "loss": 0.4418, "step": 1586 }, { "epoch": 1.1686303387334316, "grad_norm": 0.40662047266960144, "learning_rate": 7.657827681983448e-06, "loss": 0.3875, "step": 1587 }, { "epoch": 1.1693667157584684, "grad_norm": 0.46831056475639343, "learning_rate": 7.654197433589243e-06, "loss": 0.4206, "step": 1588 }, { "epoch": 1.1701030927835052, "grad_norm": 0.44352883100509644, "learning_rate": 7.65056523603265e-06, "loss": 0.3882, "step": 1589 }, { "epoch": 1.170839469808542, "grad_norm": 0.4131197929382324, "learning_rate": 7.646931091981045e-06, "loss": 0.4203, "step": 1590 }, { "epoch": 1.1715758468335788, "grad_norm": 0.543073296546936, "learning_rate": 7.643295004103232e-06, "loss": 0.4035, "step": 1591 }, { "epoch": 1.1723122238586157, "grad_norm": 0.42148828506469727, "learning_rate": 7.639656975069447e-06, "loss": 0.4257, "step": 1592 }, { "epoch": 1.1730486008836525, "grad_norm": 0.45353934168815613, "learning_rate": 7.636017007551349e-06, "loss": 0.4306, "step": 1593 }, { "epoch": 1.1737849779086893, "grad_norm": 0.42976027727127075, "learning_rate": 7.632375104222014e-06, "loss": 0.3962, "step": 1594 }, { "epoch": 1.1745213549337261, "grad_norm": 0.4995291829109192, "learning_rate": 7.628731267755952e-06, "loss": 0.3974, "step": 1595 }, { "epoch": 1.175257731958763, "grad_norm": 0.4624010920524597, "learning_rate": 7.6250855008290856e-06, "loss": 0.4175, "step": 1596 }, { "epoch": 1.1759941089837997, "grad_norm": 0.46264103055000305, "learning_rate": 7.6214378061187546e-06, "loss": 0.3981, "step": 1597 }, { "epoch": 1.1767304860088366, "grad_norm": 0.48678502440452576, "learning_rate": 7.617788186303714e-06, "loss": 0.4185, "step": 1598 }, { "epoch": 1.1774668630338734, "grad_norm": 0.4638168513774872, "learning_rate": 7.6141366440641365e-06, "loss": 0.4137, "step": 1599 }, { "epoch": 1.1782032400589102, "grad_norm": 0.4336228668689728, "learning_rate": 7.610483182081607e-06, "loss": 0.4418, "step": 1600 }, { "epoch": 1.178939617083947, "grad_norm": 0.47200411558151245, "learning_rate": 7.606827803039112e-06, "loss": 0.4105, "step": 1601 }, { "epoch": 1.1796759941089838, "grad_norm": 0.5027320981025696, "learning_rate": 7.603170509621054e-06, "loss": 0.4476, "step": 1602 }, { "epoch": 1.1804123711340206, "grad_norm": 0.4112722873687744, "learning_rate": 7.5995113045132395e-06, "loss": 0.4147, "step": 1603 }, { "epoch": 1.1811487481590575, "grad_norm": 0.4506515562534332, "learning_rate": 7.595850190402877e-06, "loss": 0.4221, "step": 1604 }, { "epoch": 1.1818851251840943, "grad_norm": 0.4421546757221222, "learning_rate": 7.59218716997858e-06, "loss": 0.4313, "step": 1605 }, { "epoch": 1.182621502209131, "grad_norm": 0.4537235200405121, "learning_rate": 7.588522245930357e-06, "loss": 0.4175, "step": 1606 }, { "epoch": 1.183357879234168, "grad_norm": 0.4480210244655609, "learning_rate": 7.584855420949619e-06, "loss": 0.4213, "step": 1607 }, { "epoch": 1.1840942562592047, "grad_norm": 0.408025324344635, "learning_rate": 7.581186697729172e-06, "loss": 0.4099, "step": 1608 }, { "epoch": 1.1848306332842415, "grad_norm": 0.45869341492652893, "learning_rate": 7.577516078963215e-06, "loss": 0.4143, "step": 1609 }, { "epoch": 1.1855670103092784, "grad_norm": 0.48638081550598145, "learning_rate": 7.573843567347339e-06, "loss": 0.4361, "step": 1610 }, { "epoch": 1.1863033873343152, "grad_norm": 0.39438486099243164, "learning_rate": 7.570169165578527e-06, "loss": 0.4394, "step": 1611 }, { "epoch": 1.187039764359352, "grad_norm": 0.49038755893707275, "learning_rate": 7.566492876355147e-06, "loss": 0.416, "step": 1612 }, { "epoch": 1.1877761413843888, "grad_norm": 0.4872884452342987, "learning_rate": 7.562814702376955e-06, "loss": 0.4071, "step": 1613 }, { "epoch": 1.1885125184094256, "grad_norm": 0.4326101541519165, "learning_rate": 7.559134646345092e-06, "loss": 0.4061, "step": 1614 }, { "epoch": 1.1892488954344624, "grad_norm": 0.48473793268203735, "learning_rate": 7.5554527109620775e-06, "loss": 0.4206, "step": 1615 }, { "epoch": 1.1899852724594993, "grad_norm": 0.43738964200019836, "learning_rate": 7.551768898931816e-06, "loss": 0.4113, "step": 1616 }, { "epoch": 1.190721649484536, "grad_norm": 0.41719716787338257, "learning_rate": 7.548083212959588e-06, "loss": 0.4126, "step": 1617 }, { "epoch": 1.1914580265095729, "grad_norm": 0.4838084280490875, "learning_rate": 7.5443956557520485e-06, "loss": 0.4494, "step": 1618 }, { "epoch": 1.1921944035346097, "grad_norm": 0.4023309051990509, "learning_rate": 7.540706230017227e-06, "loss": 0.4212, "step": 1619 }, { "epoch": 1.1929307805596465, "grad_norm": 0.42563098669052124, "learning_rate": 7.537014938464529e-06, "loss": 0.378, "step": 1620 }, { "epoch": 1.1936671575846833, "grad_norm": 0.4844202697277069, "learning_rate": 7.533321783804726e-06, "loss": 0.4024, "step": 1621 }, { "epoch": 1.1944035346097202, "grad_norm": 0.39639124274253845, "learning_rate": 7.529626768749958e-06, "loss": 0.4082, "step": 1622 }, { "epoch": 1.195139911634757, "grad_norm": 0.45201244950294495, "learning_rate": 7.525929896013735e-06, "loss": 0.4252, "step": 1623 }, { "epoch": 1.1958762886597938, "grad_norm": 0.463316947221756, "learning_rate": 7.5222311683109265e-06, "loss": 0.417, "step": 1624 }, { "epoch": 1.1966126656848306, "grad_norm": 0.42931288480758667, "learning_rate": 7.518530588357769e-06, "loss": 0.4369, "step": 1625 }, { "epoch": 1.1973490427098674, "grad_norm": 0.44762808084487915, "learning_rate": 7.514828158871852e-06, "loss": 0.3949, "step": 1626 }, { "epoch": 1.1980854197349042, "grad_norm": 0.43206796050071716, "learning_rate": 7.511123882572133e-06, "loss": 0.4018, "step": 1627 }, { "epoch": 1.198821796759941, "grad_norm": 0.41934624314308167, "learning_rate": 7.5074177621789155e-06, "loss": 0.4152, "step": 1628 }, { "epoch": 1.1995581737849779, "grad_norm": 0.41448119282722473, "learning_rate": 7.503709800413868e-06, "loss": 0.4424, "step": 1629 }, { "epoch": 1.2002945508100147, "grad_norm": 0.4471980333328247, "learning_rate": 7.500000000000001e-06, "loss": 0.417, "step": 1630 }, { "epoch": 1.2010309278350515, "grad_norm": 0.4638178050518036, "learning_rate": 7.496288363661681e-06, "loss": 0.431, "step": 1631 }, { "epoch": 1.2017673048600883, "grad_norm": 0.4889911115169525, "learning_rate": 7.492574894124624e-06, "loss": 0.4159, "step": 1632 }, { "epoch": 1.2025036818851251, "grad_norm": 0.4356183707714081, "learning_rate": 7.4888595941158844e-06, "loss": 0.4258, "step": 1633 }, { "epoch": 1.203240058910162, "grad_norm": 0.4645734429359436, "learning_rate": 7.485142466363873e-06, "loss": 0.3947, "step": 1634 }, { "epoch": 1.2039764359351988, "grad_norm": 0.44492483139038086, "learning_rate": 7.481423513598331e-06, "loss": 0.4154, "step": 1635 }, { "epoch": 1.2047128129602356, "grad_norm": 0.42611250281333923, "learning_rate": 7.477702738550346e-06, "loss": 0.4105, "step": 1636 }, { "epoch": 1.2054491899852724, "grad_norm": 0.4502268135547638, "learning_rate": 7.473980143952344e-06, "loss": 0.3943, "step": 1637 }, { "epoch": 1.2061855670103092, "grad_norm": 0.4995657801628113, "learning_rate": 7.470255732538086e-06, "loss": 0.401, "step": 1638 }, { "epoch": 1.206921944035346, "grad_norm": 0.399395227432251, "learning_rate": 7.466529507042666e-06, "loss": 0.3953, "step": 1639 }, { "epoch": 1.2076583210603828, "grad_norm": 0.472028523683548, "learning_rate": 7.462801470202513e-06, "loss": 0.412, "step": 1640 }, { "epoch": 1.2083946980854197, "grad_norm": 0.44773492217063904, "learning_rate": 7.459071624755382e-06, "loss": 0.4121, "step": 1641 }, { "epoch": 1.2091310751104565, "grad_norm": 0.4189164340496063, "learning_rate": 7.455339973440361e-06, "loss": 0.398, "step": 1642 }, { "epoch": 1.2098674521354933, "grad_norm": 0.40162959694862366, "learning_rate": 7.4516065189978625e-06, "loss": 0.4068, "step": 1643 }, { "epoch": 1.21060382916053, "grad_norm": 0.4244077503681183, "learning_rate": 7.4478712641696194e-06, "loss": 0.4342, "step": 1644 }, { "epoch": 1.211340206185567, "grad_norm": 0.4497550427913666, "learning_rate": 7.444134211698692e-06, "loss": 0.4067, "step": 1645 }, { "epoch": 1.2120765832106037, "grad_norm": 0.4071500301361084, "learning_rate": 7.44039536432946e-06, "loss": 0.4084, "step": 1646 }, { "epoch": 1.2128129602356406, "grad_norm": 0.413670152425766, "learning_rate": 7.43665472480762e-06, "loss": 0.439, "step": 1647 }, { "epoch": 1.2135493372606774, "grad_norm": 0.4371226727962494, "learning_rate": 7.4329122958801806e-06, "loss": 0.3991, "step": 1648 }, { "epoch": 1.2142857142857142, "grad_norm": 0.4222114384174347, "learning_rate": 7.4291680802954716e-06, "loss": 0.4208, "step": 1649 }, { "epoch": 1.2150220913107512, "grad_norm": 0.4362742304801941, "learning_rate": 7.425422080803132e-06, "loss": 0.3812, "step": 1650 }, { "epoch": 1.215758468335788, "grad_norm": 0.4180275499820709, "learning_rate": 7.42167430015411e-06, "loss": 0.4073, "step": 1651 }, { "epoch": 1.2164948453608249, "grad_norm": 0.42266976833343506, "learning_rate": 7.417924741100662e-06, "loss": 0.4104, "step": 1652 }, { "epoch": 1.2172312223858617, "grad_norm": 0.44959259033203125, "learning_rate": 7.414173406396351e-06, "loss": 0.4324, "step": 1653 }, { "epoch": 1.2179675994108985, "grad_norm": 0.42745327949523926, "learning_rate": 7.410420298796045e-06, "loss": 0.413, "step": 1654 }, { "epoch": 1.2187039764359353, "grad_norm": 0.3998710811138153, "learning_rate": 7.406665421055912e-06, "loss": 0.4059, "step": 1655 }, { "epoch": 1.2194403534609721, "grad_norm": 0.4011445641517639, "learning_rate": 7.402908775933419e-06, "loss": 0.4191, "step": 1656 }, { "epoch": 1.220176730486009, "grad_norm": 0.44070056080818176, "learning_rate": 7.399150366187336e-06, "loss": 0.4215, "step": 1657 }, { "epoch": 1.2209131075110458, "grad_norm": 0.4228590726852417, "learning_rate": 7.395390194577724e-06, "loss": 0.4107, "step": 1658 }, { "epoch": 1.2216494845360826, "grad_norm": 0.3656286299228668, "learning_rate": 7.391628263865939e-06, "loss": 0.3822, "step": 1659 }, { "epoch": 1.2223858615611194, "grad_norm": 0.4292581081390381, "learning_rate": 7.387864576814628e-06, "loss": 0.4188, "step": 1660 }, { "epoch": 1.2231222385861562, "grad_norm": 0.39621493220329285, "learning_rate": 7.3840991361877326e-06, "loss": 0.3921, "step": 1661 }, { "epoch": 1.223858615611193, "grad_norm": 0.4454437792301178, "learning_rate": 7.380331944750476e-06, "loss": 0.4097, "step": 1662 }, { "epoch": 1.2245949926362298, "grad_norm": 0.39751824736595154, "learning_rate": 7.37656300526937e-06, "loss": 0.4232, "step": 1663 }, { "epoch": 1.2253313696612667, "grad_norm": 0.4180833399295807, "learning_rate": 7.37279232051221e-06, "loss": 0.4162, "step": 1664 }, { "epoch": 1.2260677466863035, "grad_norm": 0.41712233424186707, "learning_rate": 7.369019893248074e-06, "loss": 0.4304, "step": 1665 }, { "epoch": 1.2268041237113403, "grad_norm": 0.38904157280921936, "learning_rate": 7.365245726247316e-06, "loss": 0.427, "step": 1666 }, { "epoch": 1.227540500736377, "grad_norm": 0.41479575634002686, "learning_rate": 7.361469822281573e-06, "loss": 0.4401, "step": 1667 }, { "epoch": 1.228276877761414, "grad_norm": 0.3810448944568634, "learning_rate": 7.3576921841237535e-06, "loss": 0.4167, "step": 1668 }, { "epoch": 1.2290132547864507, "grad_norm": 0.42002904415130615, "learning_rate": 7.353912814548042e-06, "loss": 0.4009, "step": 1669 }, { "epoch": 1.2297496318114876, "grad_norm": 0.3993929326534271, "learning_rate": 7.350131716329891e-06, "loss": 0.4106, "step": 1670 }, { "epoch": 1.2304860088365244, "grad_norm": 0.4273282587528229, "learning_rate": 7.346348892246026e-06, "loss": 0.4382, "step": 1671 }, { "epoch": 1.2312223858615612, "grad_norm": 0.43270593881607056, "learning_rate": 7.342564345074441e-06, "loss": 0.4107, "step": 1672 }, { "epoch": 1.231958762886598, "grad_norm": 0.40336766839027405, "learning_rate": 7.338778077594388e-06, "loss": 0.4238, "step": 1673 }, { "epoch": 1.2326951399116348, "grad_norm": 0.39280226826667786, "learning_rate": 7.33499009258639e-06, "loss": 0.3768, "step": 1674 }, { "epoch": 1.2334315169366716, "grad_norm": 0.4784250557422638, "learning_rate": 7.331200392832232e-06, "loss": 0.4272, "step": 1675 }, { "epoch": 1.2341678939617085, "grad_norm": 0.3877667188644409, "learning_rate": 7.32740898111495e-06, "loss": 0.3862, "step": 1676 }, { "epoch": 1.2349042709867453, "grad_norm": 0.4045974016189575, "learning_rate": 7.323615860218844e-06, "loss": 0.4099, "step": 1677 }, { "epoch": 1.235640648011782, "grad_norm": 0.4475846588611603, "learning_rate": 7.319821032929467e-06, "loss": 0.4142, "step": 1678 }, { "epoch": 1.236377025036819, "grad_norm": 0.4150718152523041, "learning_rate": 7.316024502033627e-06, "loss": 0.428, "step": 1679 }, { "epoch": 1.2371134020618557, "grad_norm": 0.4547623097896576, "learning_rate": 7.31222627031938e-06, "loss": 0.4304, "step": 1680 }, { "epoch": 1.2378497790868925, "grad_norm": 0.4688529968261719, "learning_rate": 7.308426340576034e-06, "loss": 0.3903, "step": 1681 }, { "epoch": 1.2385861561119293, "grad_norm": 0.5233607888221741, "learning_rate": 7.30462471559414e-06, "loss": 0.3849, "step": 1682 }, { "epoch": 1.2393225331369662, "grad_norm": 0.43747377395629883, "learning_rate": 7.3008213981655005e-06, "loss": 0.3894, "step": 1683 }, { "epoch": 1.240058910162003, "grad_norm": 0.40247631072998047, "learning_rate": 7.297016391083154e-06, "loss": 0.376, "step": 1684 }, { "epoch": 1.2407952871870398, "grad_norm": 0.49333998560905457, "learning_rate": 7.2932096971413815e-06, "loss": 0.4105, "step": 1685 }, { "epoch": 1.2415316642120766, "grad_norm": 0.4000438153743744, "learning_rate": 7.289401319135707e-06, "loss": 0.3952, "step": 1686 }, { "epoch": 1.2422680412371134, "grad_norm": 0.44068193435668945, "learning_rate": 7.285591259862888e-06, "loss": 0.379, "step": 1687 }, { "epoch": 1.2430044182621502, "grad_norm": 0.4409189820289612, "learning_rate": 7.281779522120914e-06, "loss": 0.4173, "step": 1688 }, { "epoch": 1.243740795287187, "grad_norm": 0.4082186222076416, "learning_rate": 7.277966108709014e-06, "loss": 0.407, "step": 1689 }, { "epoch": 1.2444771723122239, "grad_norm": 0.40812796354293823, "learning_rate": 7.27415102242764e-06, "loss": 0.4243, "step": 1690 }, { "epoch": 1.2452135493372607, "grad_norm": 0.41487666964530945, "learning_rate": 7.2703342660784785e-06, "loss": 0.4032, "step": 1691 }, { "epoch": 1.2459499263622975, "grad_norm": 0.44201499223709106, "learning_rate": 7.266515842464438e-06, "loss": 0.448, "step": 1692 }, { "epoch": 1.2466863033873343, "grad_norm": 0.4068033695220947, "learning_rate": 7.262695754389655e-06, "loss": 0.4022, "step": 1693 }, { "epoch": 1.2474226804123711, "grad_norm": 0.3832601010799408, "learning_rate": 7.258874004659487e-06, "loss": 0.4315, "step": 1694 }, { "epoch": 1.248159057437408, "grad_norm": 0.4322513937950134, "learning_rate": 7.25505059608051e-06, "loss": 0.4123, "step": 1695 }, { "epoch": 1.2488954344624448, "grad_norm": 0.3983667194843292, "learning_rate": 7.25122553146052e-06, "loss": 0.3817, "step": 1696 }, { "epoch": 1.2496318114874816, "grad_norm": 0.44032666087150574, "learning_rate": 7.247398813608531e-06, "loss": 0.3921, "step": 1697 }, { "epoch": 1.2503681885125184, "grad_norm": 0.451885461807251, "learning_rate": 7.243570445334767e-06, "loss": 0.4332, "step": 1698 }, { "epoch": 1.2511045655375552, "grad_norm": 0.46099600195884705, "learning_rate": 7.239740429450664e-06, "loss": 0.4037, "step": 1699 }, { "epoch": 1.251840942562592, "grad_norm": 0.4341306984424591, "learning_rate": 7.235908768768875e-06, "loss": 0.3931, "step": 1700 }, { "epoch": 1.2525773195876289, "grad_norm": 0.4982093274593353, "learning_rate": 7.232075466103254e-06, "loss": 0.4291, "step": 1701 }, { "epoch": 1.2533136966126657, "grad_norm": 0.4553844928741455, "learning_rate": 7.228240524268858e-06, "loss": 0.4011, "step": 1702 }, { "epoch": 1.2540500736377025, "grad_norm": 0.43320947885513306, "learning_rate": 7.224403946081957e-06, "loss": 0.4023, "step": 1703 }, { "epoch": 1.2547864506627393, "grad_norm": 0.46215543150901794, "learning_rate": 7.220565734360019e-06, "loss": 0.4213, "step": 1704 }, { "epoch": 1.2555228276877761, "grad_norm": 0.4323497712612152, "learning_rate": 7.216725891921707e-06, "loss": 0.4226, "step": 1705 }, { "epoch": 1.256259204712813, "grad_norm": 0.47635823488235474, "learning_rate": 7.212884421586889e-06, "loss": 0.399, "step": 1706 }, { "epoch": 1.2569955817378498, "grad_norm": 0.40797707438468933, "learning_rate": 7.20904132617662e-06, "loss": 0.3783, "step": 1707 }, { "epoch": 1.2577319587628866, "grad_norm": 0.5090800523757935, "learning_rate": 7.2051966085131584e-06, "loss": 0.4148, "step": 1708 }, { "epoch": 1.2584683357879234, "grad_norm": 0.5189807415008545, "learning_rate": 7.201350271419945e-06, "loss": 0.4367, "step": 1709 }, { "epoch": 1.2592047128129602, "grad_norm": 0.40211692452430725, "learning_rate": 7.197502317721616e-06, "loss": 0.3966, "step": 1710 }, { "epoch": 1.259941089837997, "grad_norm": 0.45923155546188354, "learning_rate": 7.19365275024399e-06, "loss": 0.4394, "step": 1711 }, { "epoch": 1.2606774668630338, "grad_norm": 0.48686525225639343, "learning_rate": 7.189801571814075e-06, "loss": 0.4252, "step": 1712 }, { "epoch": 1.2614138438880707, "grad_norm": 0.43244504928588867, "learning_rate": 7.185948785260058e-06, "loss": 0.3866, "step": 1713 }, { "epoch": 1.2621502209131075, "grad_norm": 0.4492274820804596, "learning_rate": 7.182094393411312e-06, "loss": 0.423, "step": 1714 }, { "epoch": 1.2628865979381443, "grad_norm": 0.404514878988266, "learning_rate": 7.178238399098381e-06, "loss": 0.3901, "step": 1715 }, { "epoch": 1.263622974963181, "grad_norm": 0.41889336705207825, "learning_rate": 7.174380805152997e-06, "loss": 0.402, "step": 1716 }, { "epoch": 1.264359351988218, "grad_norm": 0.4595990777015686, "learning_rate": 7.170521614408056e-06, "loss": 0.3852, "step": 1717 }, { "epoch": 1.2650957290132547, "grad_norm": 0.422296941280365, "learning_rate": 7.166660829697633e-06, "loss": 0.4092, "step": 1718 }, { "epoch": 1.2658321060382915, "grad_norm": 0.49134594202041626, "learning_rate": 7.162798453856969e-06, "loss": 0.4246, "step": 1719 }, { "epoch": 1.2665684830633284, "grad_norm": 0.4637044072151184, "learning_rate": 7.1589344897224795e-06, "loss": 0.3807, "step": 1720 }, { "epoch": 1.2673048600883652, "grad_norm": 0.49215713143348694, "learning_rate": 7.155068940131741e-06, "loss": 0.4121, "step": 1721 }, { "epoch": 1.268041237113402, "grad_norm": 0.46819546818733215, "learning_rate": 7.151201807923497e-06, "loss": 0.4045, "step": 1722 }, { "epoch": 1.2687776141384388, "grad_norm": 0.4482595920562744, "learning_rate": 7.1473330959376515e-06, "loss": 0.3915, "step": 1723 }, { "epoch": 1.2695139911634756, "grad_norm": 0.5111578702926636, "learning_rate": 7.143462807015271e-06, "loss": 0.4096, "step": 1724 }, { "epoch": 1.2702503681885124, "grad_norm": 0.3854685425758362, "learning_rate": 7.139590943998579e-06, "loss": 0.4059, "step": 1725 }, { "epoch": 1.2709867452135493, "grad_norm": 0.542492151260376, "learning_rate": 7.135717509730953e-06, "loss": 0.4306, "step": 1726 }, { "epoch": 1.271723122238586, "grad_norm": 0.4321967661380768, "learning_rate": 7.1318425070569284e-06, "loss": 0.3901, "step": 1727 }, { "epoch": 1.272459499263623, "grad_norm": 0.41840946674346924, "learning_rate": 7.127965938822187e-06, "loss": 0.4079, "step": 1728 }, { "epoch": 1.2731958762886597, "grad_norm": 0.3965054750442505, "learning_rate": 7.124087807873565e-06, "loss": 0.4081, "step": 1729 }, { "epoch": 1.2739322533136965, "grad_norm": 0.4440041184425354, "learning_rate": 7.1202081170590455e-06, "loss": 0.4106, "step": 1730 }, { "epoch": 1.2746686303387333, "grad_norm": 0.4273045063018799, "learning_rate": 7.116326869227755e-06, "loss": 0.3929, "step": 1731 }, { "epoch": 1.2754050073637702, "grad_norm": 0.3961130976676941, "learning_rate": 7.112444067229966e-06, "loss": 0.4022, "step": 1732 }, { "epoch": 1.276141384388807, "grad_norm": 0.3908890187740326, "learning_rate": 7.108559713917089e-06, "loss": 0.3974, "step": 1733 }, { "epoch": 1.2768777614138438, "grad_norm": 0.4090268909931183, "learning_rate": 7.104673812141676e-06, "loss": 0.4551, "step": 1734 }, { "epoch": 1.2776141384388806, "grad_norm": 0.38256412744522095, "learning_rate": 7.100786364757417e-06, "loss": 0.4439, "step": 1735 }, { "epoch": 1.2783505154639174, "grad_norm": 0.37507322430610657, "learning_rate": 7.096897374619134e-06, "loss": 0.4087, "step": 1736 }, { "epoch": 1.2790868924889542, "grad_norm": 0.39156273007392883, "learning_rate": 7.093006844582787e-06, "loss": 0.3903, "step": 1737 }, { "epoch": 1.279823269513991, "grad_norm": 0.37378448247909546, "learning_rate": 7.08911477750546e-06, "loss": 0.4178, "step": 1738 }, { "epoch": 1.2805596465390279, "grad_norm": 0.4035623371601105, "learning_rate": 7.085221176245374e-06, "loss": 0.4079, "step": 1739 }, { "epoch": 1.2812960235640647, "grad_norm": 0.4087090492248535, "learning_rate": 7.081326043661867e-06, "loss": 0.39, "step": 1740 }, { "epoch": 1.2820324005891015, "grad_norm": 0.43084970116615295, "learning_rate": 7.0774293826154095e-06, "loss": 0.4189, "step": 1741 }, { "epoch": 1.2827687776141383, "grad_norm": 0.40026846528053284, "learning_rate": 7.073531195967591e-06, "loss": 0.4454, "step": 1742 }, { "epoch": 1.2835051546391751, "grad_norm": 0.39230623841285706, "learning_rate": 7.069631486581123e-06, "loss": 0.4106, "step": 1743 }, { "epoch": 1.284241531664212, "grad_norm": 0.39439305663108826, "learning_rate": 7.065730257319832e-06, "loss": 0.4, "step": 1744 }, { "epoch": 1.2849779086892488, "grad_norm": 0.37384194135665894, "learning_rate": 7.061827511048666e-06, "loss": 0.3976, "step": 1745 }, { "epoch": 1.2857142857142856, "grad_norm": 0.4040299355983734, "learning_rate": 7.057923250633681e-06, "loss": 0.4322, "step": 1746 }, { "epoch": 1.2864506627393224, "grad_norm": 0.42770475149154663, "learning_rate": 7.054017478942048e-06, "loss": 0.4132, "step": 1747 }, { "epoch": 1.2871870397643592, "grad_norm": 0.43052366375923157, "learning_rate": 7.050110198842052e-06, "loss": 0.4313, "step": 1748 }, { "epoch": 1.287923416789396, "grad_norm": 0.3632519841194153, "learning_rate": 7.046201413203076e-06, "loss": 0.4088, "step": 1749 }, { "epoch": 1.2886597938144329, "grad_norm": 0.3969815671443939, "learning_rate": 7.042291124895615e-06, "loss": 0.4149, "step": 1750 }, { "epoch": 1.28939617083947, "grad_norm": 0.45275336503982544, "learning_rate": 7.038379336791269e-06, "loss": 0.4252, "step": 1751 }, { "epoch": 1.2901325478645067, "grad_norm": 0.4112440049648285, "learning_rate": 7.034466051762736e-06, "loss": 0.4302, "step": 1752 }, { "epoch": 1.2908689248895435, "grad_norm": 0.4475271999835968, "learning_rate": 7.030551272683814e-06, "loss": 0.4077, "step": 1753 }, { "epoch": 1.2916053019145803, "grad_norm": 0.4204852283000946, "learning_rate": 7.026635002429399e-06, "loss": 0.4264, "step": 1754 }, { "epoch": 1.2923416789396172, "grad_norm": 0.44256094098091125, "learning_rate": 7.02271724387548e-06, "loss": 0.4211, "step": 1755 }, { "epoch": 1.293078055964654, "grad_norm": 0.4253244698047638, "learning_rate": 7.018797999899142e-06, "loss": 0.4134, "step": 1756 }, { "epoch": 1.2938144329896908, "grad_norm": 0.42249786853790283, "learning_rate": 7.014877273378557e-06, "loss": 0.4062, "step": 1757 }, { "epoch": 1.2945508100147276, "grad_norm": 0.42850828170776367, "learning_rate": 7.010955067192991e-06, "loss": 0.4063, "step": 1758 }, { "epoch": 1.2952871870397644, "grad_norm": 0.4714277386665344, "learning_rate": 7.0070313842227946e-06, "loss": 0.409, "step": 1759 }, { "epoch": 1.2960235640648012, "grad_norm": 0.3643467426300049, "learning_rate": 7.003106227349399e-06, "loss": 0.4297, "step": 1760 }, { "epoch": 1.296759941089838, "grad_norm": 0.4631706178188324, "learning_rate": 6.999179599455321e-06, "loss": 0.4398, "step": 1761 }, { "epoch": 1.2974963181148749, "grad_norm": 0.37905317544937134, "learning_rate": 6.995251503424158e-06, "loss": 0.3931, "step": 1762 }, { "epoch": 1.2982326951399117, "grad_norm": 0.4009278118610382, "learning_rate": 6.991321942140587e-06, "loss": 0.4023, "step": 1763 }, { "epoch": 1.2989690721649485, "grad_norm": 0.4024566113948822, "learning_rate": 6.987390918490356e-06, "loss": 0.4188, "step": 1764 }, { "epoch": 1.2997054491899853, "grad_norm": 0.4234688878059387, "learning_rate": 6.983458435360291e-06, "loss": 0.4308, "step": 1765 }, { "epoch": 1.3004418262150221, "grad_norm": 0.4323263466358185, "learning_rate": 6.9795244956382904e-06, "loss": 0.3954, "step": 1766 }, { "epoch": 1.301178203240059, "grad_norm": 0.4084095358848572, "learning_rate": 6.975589102213318e-06, "loss": 0.3908, "step": 1767 }, { "epoch": 1.3019145802650958, "grad_norm": 0.3725346624851227, "learning_rate": 6.97165225797541e-06, "loss": 0.4085, "step": 1768 }, { "epoch": 1.3026509572901326, "grad_norm": 0.4671725928783417, "learning_rate": 6.9677139658156656e-06, "loss": 0.4093, "step": 1769 }, { "epoch": 1.3033873343151694, "grad_norm": 0.42323094606399536, "learning_rate": 6.963774228626246e-06, "loss": 0.371, "step": 1770 }, { "epoch": 1.3041237113402062, "grad_norm": 0.37016546726226807, "learning_rate": 6.959833049300376e-06, "loss": 0.4009, "step": 1771 }, { "epoch": 1.304860088365243, "grad_norm": 0.4504646360874176, "learning_rate": 6.955890430732338e-06, "loss": 0.3951, "step": 1772 }, { "epoch": 1.3055964653902798, "grad_norm": 0.41494229435920715, "learning_rate": 6.9519463758174745e-06, "loss": 0.4112, "step": 1773 }, { "epoch": 1.3063328424153167, "grad_norm": 0.3828340768814087, "learning_rate": 6.948000887452177e-06, "loss": 0.4081, "step": 1774 }, { "epoch": 1.3070692194403535, "grad_norm": 0.4123505651950836, "learning_rate": 6.944053968533895e-06, "loss": 0.4267, "step": 1775 }, { "epoch": 1.3078055964653903, "grad_norm": 0.42603743076324463, "learning_rate": 6.9401056219611255e-06, "loss": 0.3866, "step": 1776 }, { "epoch": 1.3085419734904271, "grad_norm": 0.48614487051963806, "learning_rate": 6.936155850633417e-06, "loss": 0.4409, "step": 1777 }, { "epoch": 1.309278350515464, "grad_norm": 0.37237557768821716, "learning_rate": 6.932204657451358e-06, "loss": 0.4096, "step": 1778 }, { "epoch": 1.3100147275405007, "grad_norm": 0.45322996377944946, "learning_rate": 6.928252045316588e-06, "loss": 0.4214, "step": 1779 }, { "epoch": 1.3107511045655376, "grad_norm": 0.3440614640712738, "learning_rate": 6.924298017131786e-06, "loss": 0.3917, "step": 1780 }, { "epoch": 1.3114874815905744, "grad_norm": 0.38658320903778076, "learning_rate": 6.920342575800672e-06, "loss": 0.419, "step": 1781 }, { "epoch": 1.3122238586156112, "grad_norm": 0.4042324721813202, "learning_rate": 6.916385724227998e-06, "loss": 0.4234, "step": 1782 }, { "epoch": 1.312960235640648, "grad_norm": 0.41090258955955505, "learning_rate": 6.912427465319561e-06, "loss": 0.4129, "step": 1783 }, { "epoch": 1.3136966126656848, "grad_norm": 0.378579705953598, "learning_rate": 6.908467801982186e-06, "loss": 0.3884, "step": 1784 }, { "epoch": 1.3144329896907216, "grad_norm": 0.43072155117988586, "learning_rate": 6.9045067371237285e-06, "loss": 0.4319, "step": 1785 }, { "epoch": 1.3151693667157585, "grad_norm": 0.42852944135665894, "learning_rate": 6.9005442736530745e-06, "loss": 0.4116, "step": 1786 }, { "epoch": 1.3159057437407953, "grad_norm": 0.40988779067993164, "learning_rate": 6.8965804144801386e-06, "loss": 0.425, "step": 1787 }, { "epoch": 1.316642120765832, "grad_norm": 0.4157440960407257, "learning_rate": 6.89261516251586e-06, "loss": 0.4266, "step": 1788 }, { "epoch": 1.317378497790869, "grad_norm": 0.46512264013290405, "learning_rate": 6.888648520672198e-06, "loss": 0.4201, "step": 1789 }, { "epoch": 1.3181148748159057, "grad_norm": 0.496389776468277, "learning_rate": 6.8846804918621355e-06, "loss": 0.4487, "step": 1790 }, { "epoch": 1.3188512518409425, "grad_norm": 0.3850422501564026, "learning_rate": 6.880711078999673e-06, "loss": 0.4016, "step": 1791 }, { "epoch": 1.3195876288659794, "grad_norm": 0.44369614124298096, "learning_rate": 6.876740284999828e-06, "loss": 0.4188, "step": 1792 }, { "epoch": 1.3203240058910162, "grad_norm": 0.43615850806236267, "learning_rate": 6.872768112778629e-06, "loss": 0.4164, "step": 1793 }, { "epoch": 1.321060382916053, "grad_norm": 0.46753209829330444, "learning_rate": 6.868794565253123e-06, "loss": 0.4041, "step": 1794 }, { "epoch": 1.3217967599410898, "grad_norm": 0.4193538725376129, "learning_rate": 6.864819645341361e-06, "loss": 0.3932, "step": 1795 }, { "epoch": 1.3225331369661266, "grad_norm": 0.437418669462204, "learning_rate": 6.860843355962403e-06, "loss": 0.3834, "step": 1796 }, { "epoch": 1.3232695139911634, "grad_norm": 0.4375527799129486, "learning_rate": 6.856865700036317e-06, "loss": 0.4254, "step": 1797 }, { "epoch": 1.3240058910162003, "grad_norm": 0.4403863251209259, "learning_rate": 6.852886680484175e-06, "loss": 0.4204, "step": 1798 }, { "epoch": 1.324742268041237, "grad_norm": 0.39266374707221985, "learning_rate": 6.848906300228047e-06, "loss": 0.3803, "step": 1799 }, { "epoch": 1.3254786450662739, "grad_norm": 0.41610684990882874, "learning_rate": 6.844924562191003e-06, "loss": 0.3783, "step": 1800 }, { "epoch": 1.3262150220913107, "grad_norm": 0.39313656091690063, "learning_rate": 6.8409414692971125e-06, "loss": 0.3979, "step": 1801 }, { "epoch": 1.3269513991163475, "grad_norm": 0.3985959589481354, "learning_rate": 6.836957024471439e-06, "loss": 0.4142, "step": 1802 }, { "epoch": 1.3276877761413843, "grad_norm": 0.38051387667655945, "learning_rate": 6.832971230640037e-06, "loss": 0.4066, "step": 1803 }, { "epoch": 1.3284241531664212, "grad_norm": 0.4025249481201172, "learning_rate": 6.828984090729954e-06, "loss": 0.3993, "step": 1804 }, { "epoch": 1.329160530191458, "grad_norm": 0.39893102645874023, "learning_rate": 6.8249956076692235e-06, "loss": 0.4082, "step": 1805 }, { "epoch": 1.3298969072164948, "grad_norm": 0.3757968246936798, "learning_rate": 6.8210057843868715e-06, "loss": 0.4043, "step": 1806 }, { "epoch": 1.3306332842415316, "grad_norm": 0.377785861492157, "learning_rate": 6.817014623812898e-06, "loss": 0.3817, "step": 1807 }, { "epoch": 1.3313696612665684, "grad_norm": 0.39483869075775146, "learning_rate": 6.813022128878292e-06, "loss": 0.4089, "step": 1808 }, { "epoch": 1.3321060382916052, "grad_norm": 0.4306437373161316, "learning_rate": 6.809028302515024e-06, "loss": 0.3828, "step": 1809 }, { "epoch": 1.332842415316642, "grad_norm": 0.38789859414100647, "learning_rate": 6.805033147656037e-06, "loss": 0.4043, "step": 1810 }, { "epoch": 1.3335787923416789, "grad_norm": 0.3801842927932739, "learning_rate": 6.801036667235252e-06, "loss": 0.4094, "step": 1811 }, { "epoch": 1.3343151693667157, "grad_norm": 0.3741806447505951, "learning_rate": 6.797038864187564e-06, "loss": 0.4316, "step": 1812 }, { "epoch": 1.3350515463917525, "grad_norm": 0.39872273802757263, "learning_rate": 6.79303974144884e-06, "loss": 0.4268, "step": 1813 }, { "epoch": 1.3357879234167893, "grad_norm": 0.4184839427471161, "learning_rate": 6.789039301955913e-06, "loss": 0.4332, "step": 1814 }, { "epoch": 1.3365243004418264, "grad_norm": 0.3965596854686737, "learning_rate": 6.785037548646586e-06, "loss": 0.4486, "step": 1815 }, { "epoch": 1.3372606774668632, "grad_norm": 0.4066341817378998, "learning_rate": 6.781034484459624e-06, "loss": 0.3996, "step": 1816 }, { "epoch": 1.3379970544919, "grad_norm": 0.47760289907455444, "learning_rate": 6.777030112334759e-06, "loss": 0.4175, "step": 1817 }, { "epoch": 1.3387334315169368, "grad_norm": 0.39844220876693726, "learning_rate": 6.773024435212678e-06, "loss": 0.4186, "step": 1818 }, { "epoch": 1.3394698085419736, "grad_norm": 0.419331431388855, "learning_rate": 6.769017456035033e-06, "loss": 0.4229, "step": 1819 }, { "epoch": 1.3402061855670104, "grad_norm": 0.41925719380378723, "learning_rate": 6.765009177744425e-06, "loss": 0.4078, "step": 1820 }, { "epoch": 1.3409425625920472, "grad_norm": 0.39159253239631653, "learning_rate": 6.760999603284413e-06, "loss": 0.394, "step": 1821 }, { "epoch": 1.341678939617084, "grad_norm": 0.38817527890205383, "learning_rate": 6.756988735599508e-06, "loss": 0.3846, "step": 1822 }, { "epoch": 1.3424153166421209, "grad_norm": 0.4380631148815155, "learning_rate": 6.752976577635169e-06, "loss": 0.4231, "step": 1823 }, { "epoch": 1.3431516936671577, "grad_norm": 0.4459487795829773, "learning_rate": 6.748963132337807e-06, "loss": 0.4201, "step": 1824 }, { "epoch": 1.3438880706921945, "grad_norm": 0.3807379901409149, "learning_rate": 6.7449484026547705e-06, "loss": 0.4175, "step": 1825 }, { "epoch": 1.3446244477172313, "grad_norm": 0.3830302655696869, "learning_rate": 6.740932391534358e-06, "loss": 0.4129, "step": 1826 }, { "epoch": 1.3453608247422681, "grad_norm": 0.4267512559890747, "learning_rate": 6.736915101925807e-06, "loss": 0.4108, "step": 1827 }, { "epoch": 1.346097201767305, "grad_norm": 0.3901468515396118, "learning_rate": 6.732896536779293e-06, "loss": 0.3991, "step": 1828 }, { "epoch": 1.3468335787923418, "grad_norm": 0.41451314091682434, "learning_rate": 6.728876699045927e-06, "loss": 0.4133, "step": 1829 }, { "epoch": 1.3475699558173786, "grad_norm": 0.4063207507133484, "learning_rate": 6.7248555916777595e-06, "loss": 0.4133, "step": 1830 }, { "epoch": 1.3483063328424154, "grad_norm": 0.4275752902030945, "learning_rate": 6.720833217627769e-06, "loss": 0.4337, "step": 1831 }, { "epoch": 1.3490427098674522, "grad_norm": 0.451399028301239, "learning_rate": 6.716809579849865e-06, "loss": 0.3908, "step": 1832 }, { "epoch": 1.349779086892489, "grad_norm": 0.43301716446876526, "learning_rate": 6.712784681298885e-06, "loss": 0.4326, "step": 1833 }, { "epoch": 1.3505154639175259, "grad_norm": 0.4233209192752838, "learning_rate": 6.708758524930594e-06, "loss": 0.4022, "step": 1834 }, { "epoch": 1.3512518409425627, "grad_norm": 0.43682563304901123, "learning_rate": 6.704731113701679e-06, "loss": 0.4212, "step": 1835 }, { "epoch": 1.3519882179675995, "grad_norm": 0.4808812439441681, "learning_rate": 6.70070245056975e-06, "loss": 0.4156, "step": 1836 }, { "epoch": 1.3527245949926363, "grad_norm": 0.40924352407455444, "learning_rate": 6.696672538493334e-06, "loss": 0.3947, "step": 1837 }, { "epoch": 1.3534609720176731, "grad_norm": 0.3757144808769226, "learning_rate": 6.692641380431879e-06, "loss": 0.4008, "step": 1838 }, { "epoch": 1.35419734904271, "grad_norm": 0.40984469652175903, "learning_rate": 6.688608979345742e-06, "loss": 0.4306, "step": 1839 }, { "epoch": 1.3549337260677468, "grad_norm": 0.39980268478393555, "learning_rate": 6.6845753381961995e-06, "loss": 0.4105, "step": 1840 }, { "epoch": 1.3556701030927836, "grad_norm": 0.38676726818084717, "learning_rate": 6.680540459945435e-06, "loss": 0.3773, "step": 1841 }, { "epoch": 1.3564064801178204, "grad_norm": 0.4054774045944214, "learning_rate": 6.676504347556541e-06, "loss": 0.4035, "step": 1842 }, { "epoch": 1.3571428571428572, "grad_norm": 0.42290937900543213, "learning_rate": 6.6724670039935145e-06, "loss": 0.4226, "step": 1843 }, { "epoch": 1.357879234167894, "grad_norm": 0.41523557901382446, "learning_rate": 6.668428432221262e-06, "loss": 0.4143, "step": 1844 }, { "epoch": 1.3586156111929308, "grad_norm": 0.40057843923568726, "learning_rate": 6.664388635205587e-06, "loss": 0.4092, "step": 1845 }, { "epoch": 1.3593519882179677, "grad_norm": 0.43508538603782654, "learning_rate": 6.660347615913194e-06, "loss": 0.3645, "step": 1846 }, { "epoch": 1.3600883652430045, "grad_norm": 0.4160832464694977, "learning_rate": 6.656305377311686e-06, "loss": 0.3927, "step": 1847 }, { "epoch": 1.3608247422680413, "grad_norm": 0.40695255994796753, "learning_rate": 6.652261922369562e-06, "loss": 0.4079, "step": 1848 }, { "epoch": 1.361561119293078, "grad_norm": 0.4502415657043457, "learning_rate": 6.648217254056211e-06, "loss": 0.41, "step": 1849 }, { "epoch": 1.362297496318115, "grad_norm": 0.4875112473964691, "learning_rate": 6.644171375341915e-06, "loss": 0.4101, "step": 1850 }, { "epoch": 1.3630338733431517, "grad_norm": 0.4880272150039673, "learning_rate": 6.640124289197845e-06, "loss": 0.4239, "step": 1851 }, { "epoch": 1.3637702503681886, "grad_norm": 0.44013795256614685, "learning_rate": 6.636075998596063e-06, "loss": 0.3872, "step": 1852 }, { "epoch": 1.3645066273932254, "grad_norm": 0.43611839413642883, "learning_rate": 6.632026506509507e-06, "loss": 0.4029, "step": 1853 }, { "epoch": 1.3652430044182622, "grad_norm": 0.4688226878643036, "learning_rate": 6.627975815912002e-06, "loss": 0.4, "step": 1854 }, { "epoch": 1.365979381443299, "grad_norm": 0.4303816258907318, "learning_rate": 6.623923929778253e-06, "loss": 0.4301, "step": 1855 }, { "epoch": 1.3667157584683358, "grad_norm": 0.516590416431427, "learning_rate": 6.6198708510838446e-06, "loss": 0.4398, "step": 1856 }, { "epoch": 1.3674521354933726, "grad_norm": 0.4422401785850525, "learning_rate": 6.615816582805235e-06, "loss": 0.3925, "step": 1857 }, { "epoch": 1.3681885125184094, "grad_norm": 0.39282315969467163, "learning_rate": 6.611761127919753e-06, "loss": 0.3969, "step": 1858 }, { "epoch": 1.3689248895434463, "grad_norm": 0.4454099237918854, "learning_rate": 6.607704489405605e-06, "loss": 0.4306, "step": 1859 }, { "epoch": 1.369661266568483, "grad_norm": 0.45609748363494873, "learning_rate": 6.603646670241863e-06, "loss": 0.3761, "step": 1860 }, { "epoch": 1.37039764359352, "grad_norm": 0.40115681290626526, "learning_rate": 6.599587673408469e-06, "loss": 0.4091, "step": 1861 }, { "epoch": 1.3711340206185567, "grad_norm": 0.4406599998474121, "learning_rate": 6.595527501886223e-06, "loss": 0.4157, "step": 1862 }, { "epoch": 1.3718703976435935, "grad_norm": 0.3757615089416504, "learning_rate": 6.591466158656795e-06, "loss": 0.3968, "step": 1863 }, { "epoch": 1.3726067746686303, "grad_norm": 0.4482031464576721, "learning_rate": 6.5874036467027135e-06, "loss": 0.4094, "step": 1864 }, { "epoch": 1.3733431516936672, "grad_norm": 0.4158189594745636, "learning_rate": 6.583339969007364e-06, "loss": 0.4047, "step": 1865 }, { "epoch": 1.374079528718704, "grad_norm": 0.45515987277030945, "learning_rate": 6.579275128554986e-06, "loss": 0.4326, "step": 1866 }, { "epoch": 1.3748159057437408, "grad_norm": 0.49017441272735596, "learning_rate": 6.575209128330679e-06, "loss": 0.4162, "step": 1867 }, { "epoch": 1.3755522827687776, "grad_norm": 0.5305281281471252, "learning_rate": 6.57114197132039e-06, "loss": 0.402, "step": 1868 }, { "epoch": 1.3762886597938144, "grad_norm": 0.42159077525138855, "learning_rate": 6.567073660510914e-06, "loss": 0.4187, "step": 1869 }, { "epoch": 1.3770250368188512, "grad_norm": 0.4378039538860321, "learning_rate": 6.563004198889899e-06, "loss": 0.4001, "step": 1870 }, { "epoch": 1.377761413843888, "grad_norm": 0.45202401280403137, "learning_rate": 6.5589335894458305e-06, "loss": 0.4204, "step": 1871 }, { "epoch": 1.3784977908689249, "grad_norm": 0.4391757547855377, "learning_rate": 6.554861835168045e-06, "loss": 0.4141, "step": 1872 }, { "epoch": 1.3792341678939617, "grad_norm": 0.4077272415161133, "learning_rate": 6.550788939046713e-06, "loss": 0.3808, "step": 1873 }, { "epoch": 1.3799705449189985, "grad_norm": 0.488126665353775, "learning_rate": 6.546714904072848e-06, "loss": 0.4128, "step": 1874 }, { "epoch": 1.3807069219440353, "grad_norm": 0.43175673484802246, "learning_rate": 6.542639733238297e-06, "loss": 0.416, "step": 1875 }, { "epoch": 1.3814432989690721, "grad_norm": 0.39616307616233826, "learning_rate": 6.538563429535742e-06, "loss": 0.4148, "step": 1876 }, { "epoch": 1.382179675994109, "grad_norm": 0.436383455991745, "learning_rate": 6.534485995958699e-06, "loss": 0.4205, "step": 1877 }, { "epoch": 1.3829160530191458, "grad_norm": 0.5000892281532288, "learning_rate": 6.530407435501513e-06, "loss": 0.3923, "step": 1878 }, { "epoch": 1.3836524300441826, "grad_norm": 0.4896223545074463, "learning_rate": 6.5263277511593515e-06, "loss": 0.4397, "step": 1879 }, { "epoch": 1.3843888070692194, "grad_norm": 0.3927074372768402, "learning_rate": 6.522246945928214e-06, "loss": 0.422, "step": 1880 }, { "epoch": 1.3851251840942562, "grad_norm": 0.4764426350593567, "learning_rate": 6.518165022804921e-06, "loss": 0.4127, "step": 1881 }, { "epoch": 1.385861561119293, "grad_norm": 0.4523772895336151, "learning_rate": 6.514081984787112e-06, "loss": 0.4211, "step": 1882 }, { "epoch": 1.3865979381443299, "grad_norm": 0.3522360324859619, "learning_rate": 6.509997834873246e-06, "loss": 0.4101, "step": 1883 }, { "epoch": 1.3873343151693667, "grad_norm": 0.4114728271961212, "learning_rate": 6.505912576062602e-06, "loss": 0.4017, "step": 1884 }, { "epoch": 1.3880706921944035, "grad_norm": 0.4329925775527954, "learning_rate": 6.501826211355269e-06, "loss": 0.4095, "step": 1885 }, { "epoch": 1.3888070692194403, "grad_norm": 0.4298880398273468, "learning_rate": 6.497738743752151e-06, "loss": 0.4246, "step": 1886 }, { "epoch": 1.3895434462444771, "grad_norm": 0.4396612048149109, "learning_rate": 6.493650176254958e-06, "loss": 0.4231, "step": 1887 }, { "epoch": 1.390279823269514, "grad_norm": 0.3601161241531372, "learning_rate": 6.4895605118662116e-06, "loss": 0.3881, "step": 1888 }, { "epoch": 1.3910162002945508, "grad_norm": 0.42812642455101013, "learning_rate": 6.485469753589241e-06, "loss": 0.4094, "step": 1889 }, { "epoch": 1.3917525773195876, "grad_norm": 0.41203832626342773, "learning_rate": 6.481377904428171e-06, "loss": 0.4305, "step": 1890 }, { "epoch": 1.3924889543446244, "grad_norm": 0.4372691214084625, "learning_rate": 6.4772849673879335e-06, "loss": 0.4255, "step": 1891 }, { "epoch": 1.3932253313696612, "grad_norm": 0.47426924109458923, "learning_rate": 6.473190945474258e-06, "loss": 0.4377, "step": 1892 }, { "epoch": 1.393961708394698, "grad_norm": 0.4277667999267578, "learning_rate": 6.469095841693671e-06, "loss": 0.3954, "step": 1893 }, { "epoch": 1.3946980854197348, "grad_norm": 0.3962320387363434, "learning_rate": 6.4649996590534915e-06, "loss": 0.3877, "step": 1894 }, { "epoch": 1.3954344624447717, "grad_norm": 0.4663618206977844, "learning_rate": 6.460902400561835e-06, "loss": 0.4046, "step": 1895 }, { "epoch": 1.3961708394698085, "grad_norm": 0.4559689164161682, "learning_rate": 6.456804069227601e-06, "loss": 0.4166, "step": 1896 }, { "epoch": 1.3969072164948453, "grad_norm": 0.3791521489620209, "learning_rate": 6.452704668060481e-06, "loss": 0.4049, "step": 1897 }, { "epoch": 1.397643593519882, "grad_norm": 0.49199846386909485, "learning_rate": 6.448604200070953e-06, "loss": 0.4207, "step": 1898 }, { "epoch": 1.398379970544919, "grad_norm": 0.4463452994823456, "learning_rate": 6.444502668270276e-06, "loss": 0.4088, "step": 1899 }, { "epoch": 1.3991163475699557, "grad_norm": 0.47948887944221497, "learning_rate": 6.440400075670491e-06, "loss": 0.3906, "step": 1900 }, { "epoch": 1.3998527245949925, "grad_norm": 0.42631468176841736, "learning_rate": 6.4362964252844165e-06, "loss": 0.4024, "step": 1901 }, { "epoch": 1.4005891016200294, "grad_norm": 0.39506101608276367, "learning_rate": 6.432191720125651e-06, "loss": 0.4013, "step": 1902 }, { "epoch": 1.4013254786450662, "grad_norm": 0.4285804331302643, "learning_rate": 6.428085963208567e-06, "loss": 0.4148, "step": 1903 }, { "epoch": 1.402061855670103, "grad_norm": 0.4068101942539215, "learning_rate": 6.423979157548306e-06, "loss": 0.4113, "step": 1904 }, { "epoch": 1.4027982326951398, "grad_norm": 0.39695197343826294, "learning_rate": 6.419871306160782e-06, "loss": 0.3925, "step": 1905 }, { "epoch": 1.4035346097201766, "grad_norm": 0.38716959953308105, "learning_rate": 6.415762412062678e-06, "loss": 0.38, "step": 1906 }, { "epoch": 1.4042709867452134, "grad_norm": 0.41700947284698486, "learning_rate": 6.411652478271444e-06, "loss": 0.3994, "step": 1907 }, { "epoch": 1.4050073637702503, "grad_norm": 0.4346196949481964, "learning_rate": 6.407541507805286e-06, "loss": 0.4043, "step": 1908 }, { "epoch": 1.405743740795287, "grad_norm": 0.433353990316391, "learning_rate": 6.403429503683178e-06, "loss": 0.4124, "step": 1909 }, { "epoch": 1.406480117820324, "grad_norm": 0.4605219066143036, "learning_rate": 6.399316468924856e-06, "loss": 0.4196, "step": 1910 }, { "epoch": 1.4072164948453607, "grad_norm": 0.4418054223060608, "learning_rate": 6.395202406550803e-06, "loss": 0.4054, "step": 1911 }, { "epoch": 1.4079528718703975, "grad_norm": 0.4400234818458557, "learning_rate": 6.391087319582264e-06, "loss": 0.4206, "step": 1912 }, { "epoch": 1.4086892488954343, "grad_norm": 0.40220165252685547, "learning_rate": 6.386971211041235e-06, "loss": 0.4301, "step": 1913 }, { "epoch": 1.4094256259204712, "grad_norm": 0.41866376996040344, "learning_rate": 6.382854083950462e-06, "loss": 0.3882, "step": 1914 }, { "epoch": 1.410162002945508, "grad_norm": 0.4194769561290741, "learning_rate": 6.378735941333437e-06, "loss": 0.4037, "step": 1915 }, { "epoch": 1.4108983799705448, "grad_norm": 0.41453391313552856, "learning_rate": 6.374616786214402e-06, "loss": 0.4195, "step": 1916 }, { "epoch": 1.4116347569955816, "grad_norm": 0.4037197232246399, "learning_rate": 6.370496621618338e-06, "loss": 0.3969, "step": 1917 }, { "epoch": 1.4123711340206184, "grad_norm": 0.4220995008945465, "learning_rate": 6.366375450570971e-06, "loss": 0.4141, "step": 1918 }, { "epoch": 1.4131075110456552, "grad_norm": 0.45782655477523804, "learning_rate": 6.362253276098762e-06, "loss": 0.3984, "step": 1919 }, { "epoch": 1.413843888070692, "grad_norm": 0.40615370869636536, "learning_rate": 6.358130101228914e-06, "loss": 0.4279, "step": 1920 }, { "epoch": 1.414580265095729, "grad_norm": 0.42293697595596313, "learning_rate": 6.35400592898936e-06, "loss": 0.4001, "step": 1921 }, { "epoch": 1.415316642120766, "grad_norm": 0.4643053710460663, "learning_rate": 6.34988076240877e-06, "loss": 0.4087, "step": 1922 }, { "epoch": 1.4160530191458027, "grad_norm": 0.42279914021492004, "learning_rate": 6.345754604516539e-06, "loss": 0.3953, "step": 1923 }, { "epoch": 1.4167893961708395, "grad_norm": 0.4467233121395111, "learning_rate": 6.341627458342794e-06, "loss": 0.4182, "step": 1924 }, { "epoch": 1.4175257731958764, "grad_norm": 0.4454042911529541, "learning_rate": 6.337499326918386e-06, "loss": 0.4092, "step": 1925 }, { "epoch": 1.4182621502209132, "grad_norm": 0.414069265127182, "learning_rate": 6.33337021327489e-06, "loss": 0.4254, "step": 1926 }, { "epoch": 1.41899852724595, "grad_norm": 0.4927258789539337, "learning_rate": 6.329240120444602e-06, "loss": 0.4215, "step": 1927 }, { "epoch": 1.4197349042709868, "grad_norm": 0.4353216588497162, "learning_rate": 6.325109051460538e-06, "loss": 0.4077, "step": 1928 }, { "epoch": 1.4204712812960236, "grad_norm": 0.4361936151981354, "learning_rate": 6.3209770093564315e-06, "loss": 0.439, "step": 1929 }, { "epoch": 1.4212076583210604, "grad_norm": 0.4463266432285309, "learning_rate": 6.316843997166726e-06, "loss": 0.4217, "step": 1930 }, { "epoch": 1.4219440353460973, "grad_norm": 0.4338516294956207, "learning_rate": 6.312710017926582e-06, "loss": 0.4278, "step": 1931 }, { "epoch": 1.422680412371134, "grad_norm": 0.48313939571380615, "learning_rate": 6.3085750746718725e-06, "loss": 0.3838, "step": 1932 }, { "epoch": 1.423416789396171, "grad_norm": 0.5277740955352783, "learning_rate": 6.30443917043917e-06, "loss": 0.4097, "step": 1933 }, { "epoch": 1.4241531664212077, "grad_norm": 0.47533461451530457, "learning_rate": 6.30030230826576e-06, "loss": 0.3995, "step": 1934 }, { "epoch": 1.4248895434462445, "grad_norm": 0.4214441776275635, "learning_rate": 6.296164491189628e-06, "loss": 0.4226, "step": 1935 }, { "epoch": 1.4256259204712813, "grad_norm": 0.4258248507976532, "learning_rate": 6.292025722249463e-06, "loss": 0.3888, "step": 1936 }, { "epoch": 1.4263622974963182, "grad_norm": 0.4134536683559418, "learning_rate": 6.287886004484651e-06, "loss": 0.3852, "step": 1937 }, { "epoch": 1.427098674521355, "grad_norm": 0.47609132528305054, "learning_rate": 6.283745340935277e-06, "loss": 0.3968, "step": 1938 }, { "epoch": 1.4278350515463918, "grad_norm": 0.44796431064605713, "learning_rate": 6.279603734642117e-06, "loss": 0.4421, "step": 1939 }, { "epoch": 1.4285714285714286, "grad_norm": 0.404325395822525, "learning_rate": 6.275461188646641e-06, "loss": 0.4123, "step": 1940 }, { "epoch": 1.4293078055964654, "grad_norm": 0.41975611448287964, "learning_rate": 6.271317705991014e-06, "loss": 0.432, "step": 1941 }, { "epoch": 1.4300441826215022, "grad_norm": 0.46170738339424133, "learning_rate": 6.267173289718079e-06, "loss": 0.4272, "step": 1942 }, { "epoch": 1.430780559646539, "grad_norm": 0.388175904750824, "learning_rate": 6.263027942871375e-06, "loss": 0.3877, "step": 1943 }, { "epoch": 1.4315169366715759, "grad_norm": 0.459796667098999, "learning_rate": 6.258881668495116e-06, "loss": 0.3874, "step": 1944 }, { "epoch": 1.4322533136966127, "grad_norm": 0.4751020073890686, "learning_rate": 6.2547344696342015e-06, "loss": 0.393, "step": 1945 }, { "epoch": 1.4329896907216495, "grad_norm": 0.3899977207183838, "learning_rate": 6.250586349334209e-06, "loss": 0.3918, "step": 1946 }, { "epoch": 1.4337260677466863, "grad_norm": 0.43887919187545776, "learning_rate": 6.246437310641395e-06, "loss": 0.43, "step": 1947 }, { "epoch": 1.4344624447717231, "grad_norm": 0.4462708830833435, "learning_rate": 6.242287356602684e-06, "loss": 0.3989, "step": 1948 }, { "epoch": 1.43519882179676, "grad_norm": 0.4143645167350769, "learning_rate": 6.238136490265681e-06, "loss": 0.4257, "step": 1949 }, { "epoch": 1.4359351988217968, "grad_norm": 0.4817536175251007, "learning_rate": 6.2339847146786515e-06, "loss": 0.3969, "step": 1950 }, { "epoch": 1.4366715758468336, "grad_norm": 0.40567225217819214, "learning_rate": 6.22983203289054e-06, "loss": 0.4277, "step": 1951 }, { "epoch": 1.4374079528718704, "grad_norm": 0.4788074791431427, "learning_rate": 6.225678447950947e-06, "loss": 0.4243, "step": 1952 }, { "epoch": 1.4381443298969072, "grad_norm": 0.38317757844924927, "learning_rate": 6.2215239629101385e-06, "loss": 0.4197, "step": 1953 }, { "epoch": 1.438880706921944, "grad_norm": 0.40520721673965454, "learning_rate": 6.217368580819049e-06, "loss": 0.4112, "step": 1954 }, { "epoch": 1.4396170839469808, "grad_norm": 0.4457045793533325, "learning_rate": 6.213212304729259e-06, "loss": 0.4104, "step": 1955 }, { "epoch": 1.4403534609720177, "grad_norm": 0.42782163619995117, "learning_rate": 6.209055137693014e-06, "loss": 0.4247, "step": 1956 }, { "epoch": 1.4410898379970545, "grad_norm": 0.4254503548145294, "learning_rate": 6.204897082763213e-06, "loss": 0.3672, "step": 1957 }, { "epoch": 1.4418262150220913, "grad_norm": 0.3664149343967438, "learning_rate": 6.200738142993406e-06, "loss": 0.3824, "step": 1958 }, { "epoch": 1.4425625920471281, "grad_norm": 0.41897517442703247, "learning_rate": 6.1965783214377895e-06, "loss": 0.3947, "step": 1959 }, { "epoch": 1.443298969072165, "grad_norm": 0.3825157582759857, "learning_rate": 6.1924176211512145e-06, "loss": 0.3827, "step": 1960 }, { "epoch": 1.4440353460972017, "grad_norm": 0.42175137996673584, "learning_rate": 6.1882560451891715e-06, "loss": 0.4241, "step": 1961 }, { "epoch": 1.4447717231222386, "grad_norm": 0.40888282656669617, "learning_rate": 6.1840935966077985e-06, "loss": 0.4041, "step": 1962 }, { "epoch": 1.4455081001472754, "grad_norm": 0.42432424426078796, "learning_rate": 6.179930278463868e-06, "loss": 0.4155, "step": 1963 }, { "epoch": 1.4462444771723122, "grad_norm": 0.4558820128440857, "learning_rate": 6.175766093814798e-06, "loss": 0.4044, "step": 1964 }, { "epoch": 1.446980854197349, "grad_norm": 0.506184458732605, "learning_rate": 6.1716010457186395e-06, "loss": 0.4198, "step": 1965 }, { "epoch": 1.4477172312223858, "grad_norm": 0.38299667835235596, "learning_rate": 6.167435137234078e-06, "loss": 0.4086, "step": 1966 }, { "epoch": 1.4484536082474226, "grad_norm": 0.43511974811553955, "learning_rate": 6.16326837142043e-06, "loss": 0.4063, "step": 1967 }, { "epoch": 1.4491899852724595, "grad_norm": 0.48710158467292786, "learning_rate": 6.1591007513376425e-06, "loss": 0.4157, "step": 1968 }, { "epoch": 1.4499263622974963, "grad_norm": 0.41424429416656494, "learning_rate": 6.15493228004629e-06, "loss": 0.397, "step": 1969 }, { "epoch": 1.450662739322533, "grad_norm": 0.4269460439682007, "learning_rate": 6.1507629606075724e-06, "loss": 0.4171, "step": 1970 }, { "epoch": 1.45139911634757, "grad_norm": 0.4956137239933014, "learning_rate": 6.14659279608331e-06, "loss": 0.4108, "step": 1971 }, { "epoch": 1.4521354933726067, "grad_norm": 0.4469659626483917, "learning_rate": 6.142421789535948e-06, "loss": 0.4267, "step": 1972 }, { "epoch": 1.4528718703976435, "grad_norm": 0.3990239202976227, "learning_rate": 6.138249944028547e-06, "loss": 0.4151, "step": 1973 }, { "epoch": 1.4536082474226804, "grad_norm": 0.4268157184123993, "learning_rate": 6.134077262624783e-06, "loss": 0.42, "step": 1974 }, { "epoch": 1.4543446244477172, "grad_norm": 0.45455217361450195, "learning_rate": 6.129903748388948e-06, "loss": 0.3977, "step": 1975 }, { "epoch": 1.455081001472754, "grad_norm": 0.3931489884853363, "learning_rate": 6.125729404385946e-06, "loss": 0.4218, "step": 1976 }, { "epoch": 1.4558173784977908, "grad_norm": 0.4898168444633484, "learning_rate": 6.121554233681286e-06, "loss": 0.3984, "step": 1977 }, { "epoch": 1.4565537555228276, "grad_norm": 0.44921234250068665, "learning_rate": 6.11737823934109e-06, "loss": 0.4049, "step": 1978 }, { "epoch": 1.4572901325478644, "grad_norm": 0.4154300093650818, "learning_rate": 6.11320142443208e-06, "loss": 0.4296, "step": 1979 }, { "epoch": 1.4580265095729013, "grad_norm": 0.46535080671310425, "learning_rate": 6.109023792021586e-06, "loss": 0.4384, "step": 1980 }, { "epoch": 1.458762886597938, "grad_norm": 0.42891719937324524, "learning_rate": 6.1048453451775305e-06, "loss": 0.4186, "step": 1981 }, { "epoch": 1.4594992636229749, "grad_norm": 0.4581204056739807, "learning_rate": 6.100666086968441e-06, "loss": 0.3773, "step": 1982 }, { "epoch": 1.4602356406480117, "grad_norm": 0.3952482342720032, "learning_rate": 6.09648602046344e-06, "loss": 0.4141, "step": 1983 }, { "epoch": 1.4609720176730487, "grad_norm": 0.4057900905609131, "learning_rate": 6.0923051487322385e-06, "loss": 0.4384, "step": 1984 }, { "epoch": 1.4617083946980856, "grad_norm": 0.4145631194114685, "learning_rate": 6.088123474845144e-06, "loss": 0.3874, "step": 1985 }, { "epoch": 1.4624447717231224, "grad_norm": 0.3992542624473572, "learning_rate": 6.0839410018730515e-06, "loss": 0.414, "step": 1986 }, { "epoch": 1.4631811487481592, "grad_norm": 0.43685290217399597, "learning_rate": 6.079757732887444e-06, "loss": 0.3838, "step": 1987 }, { "epoch": 1.463917525773196, "grad_norm": 0.3933916389942169, "learning_rate": 6.075573670960385e-06, "loss": 0.3845, "step": 1988 }, { "epoch": 1.4646539027982328, "grad_norm": 0.4441211223602295, "learning_rate": 6.071388819164525e-06, "loss": 0.4222, "step": 1989 }, { "epoch": 1.4653902798232696, "grad_norm": 0.4381551444530487, "learning_rate": 6.067203180573094e-06, "loss": 0.3978, "step": 1990 }, { "epoch": 1.4661266568483065, "grad_norm": 0.40585675835609436, "learning_rate": 6.063016758259896e-06, "loss": 0.4072, "step": 1991 }, { "epoch": 1.4668630338733433, "grad_norm": 0.358671098947525, "learning_rate": 6.058829555299314e-06, "loss": 0.4078, "step": 1992 }, { "epoch": 1.46759941089838, "grad_norm": 0.39564085006713867, "learning_rate": 6.054641574766304e-06, "loss": 0.4084, "step": 1993 }, { "epoch": 1.468335787923417, "grad_norm": 0.40318918228149414, "learning_rate": 6.05045281973639e-06, "loss": 0.3892, "step": 1994 }, { "epoch": 1.4690721649484537, "grad_norm": 0.4056055247783661, "learning_rate": 6.04626329328567e-06, "loss": 0.4119, "step": 1995 }, { "epoch": 1.4698085419734905, "grad_norm": 0.40646982192993164, "learning_rate": 6.042072998490805e-06, "loss": 0.4069, "step": 1996 }, { "epoch": 1.4705449189985274, "grad_norm": 0.396869033575058, "learning_rate": 6.0378819384290185e-06, "loss": 0.4212, "step": 1997 }, { "epoch": 1.4712812960235642, "grad_norm": 0.46342048048973083, "learning_rate": 6.033690116178101e-06, "loss": 0.401, "step": 1998 }, { "epoch": 1.472017673048601, "grad_norm": 0.34812402725219727, "learning_rate": 6.0294975348163985e-06, "loss": 0.3684, "step": 1999 }, { "epoch": 1.4727540500736378, "grad_norm": 0.4260614812374115, "learning_rate": 6.025304197422819e-06, "loss": 0.3867, "step": 2000 }, { "epoch": 1.4734904270986746, "grad_norm": 0.4525951147079468, "learning_rate": 6.0211101070768184e-06, "loss": 0.3997, "step": 2001 }, { "epoch": 1.4742268041237114, "grad_norm": 0.4634397625923157, "learning_rate": 6.016915266858413e-06, "loss": 0.4346, "step": 2002 }, { "epoch": 1.4749631811487482, "grad_norm": 0.40612179040908813, "learning_rate": 6.0127196798481645e-06, "loss": 0.3975, "step": 2003 }, { "epoch": 1.475699558173785, "grad_norm": 0.4513106048107147, "learning_rate": 6.008523349127188e-06, "loss": 0.3806, "step": 2004 }, { "epoch": 1.4764359351988219, "grad_norm": 0.5352683067321777, "learning_rate": 6.004326277777141e-06, "loss": 0.4328, "step": 2005 }, { "epoch": 1.4771723122238587, "grad_norm": 0.4118804931640625, "learning_rate": 6.000128468880223e-06, "loss": 0.4485, "step": 2006 }, { "epoch": 1.4779086892488955, "grad_norm": 0.4650607109069824, "learning_rate": 5.995929925519181e-06, "loss": 0.4315, "step": 2007 }, { "epoch": 1.4786450662739323, "grad_norm": 0.5012993216514587, "learning_rate": 5.991730650777297e-06, "loss": 0.3924, "step": 2008 }, { "epoch": 1.4793814432989691, "grad_norm": 0.4178714156150818, "learning_rate": 5.987530647738394e-06, "loss": 0.42, "step": 2009 }, { "epoch": 1.480117820324006, "grad_norm": 0.4390731453895569, "learning_rate": 5.983329919486824e-06, "loss": 0.4017, "step": 2010 }, { "epoch": 1.4808541973490428, "grad_norm": 0.4657975137233734, "learning_rate": 5.9791284691074765e-06, "loss": 0.3949, "step": 2011 }, { "epoch": 1.4815905743740796, "grad_norm": 0.4687153100967407, "learning_rate": 5.974926299685772e-06, "loss": 0.3857, "step": 2012 }, { "epoch": 1.4823269513991164, "grad_norm": 0.41643956303596497, "learning_rate": 5.970723414307652e-06, "loss": 0.4026, "step": 2013 }, { "epoch": 1.4830633284241532, "grad_norm": 0.4140578806400299, "learning_rate": 5.966519816059591e-06, "loss": 0.3904, "step": 2014 }, { "epoch": 1.48379970544919, "grad_norm": 0.42964425683021545, "learning_rate": 5.962315508028584e-06, "loss": 0.3904, "step": 2015 }, { "epoch": 1.4845360824742269, "grad_norm": 0.4009730815887451, "learning_rate": 5.958110493302148e-06, "loss": 0.4047, "step": 2016 }, { "epoch": 1.4852724594992637, "grad_norm": 0.41644203662872314, "learning_rate": 5.95390477496832e-06, "loss": 0.4019, "step": 2017 }, { "epoch": 1.4860088365243005, "grad_norm": 0.43529626727104187, "learning_rate": 5.94969835611565e-06, "loss": 0.4447, "step": 2018 }, { "epoch": 1.4867452135493373, "grad_norm": 0.4235115051269531, "learning_rate": 5.945491239833206e-06, "loss": 0.4281, "step": 2019 }, { "epoch": 1.4874815905743741, "grad_norm": 0.441995769739151, "learning_rate": 5.941283429210568e-06, "loss": 0.3974, "step": 2020 }, { "epoch": 1.488217967599411, "grad_norm": 0.4208744764328003, "learning_rate": 5.937074927337824e-06, "loss": 0.3994, "step": 2021 }, { "epoch": 1.4889543446244478, "grad_norm": 0.3799182176589966, "learning_rate": 5.932865737305571e-06, "loss": 0.4188, "step": 2022 }, { "epoch": 1.4896907216494846, "grad_norm": 0.4479272961616516, "learning_rate": 5.928655862204911e-06, "loss": 0.4072, "step": 2023 }, { "epoch": 1.4904270986745214, "grad_norm": 0.4461783766746521, "learning_rate": 5.924445305127448e-06, "loss": 0.4119, "step": 2024 }, { "epoch": 1.4911634756995582, "grad_norm": 0.40956565737724304, "learning_rate": 5.9202340691652895e-06, "loss": 0.4073, "step": 2025 }, { "epoch": 1.491899852724595, "grad_norm": 0.3731072247028351, "learning_rate": 5.916022157411038e-06, "loss": 0.4237, "step": 2026 }, { "epoch": 1.4926362297496318, "grad_norm": 0.42282289266586304, "learning_rate": 5.911809572957796e-06, "loss": 0.415, "step": 2027 }, { "epoch": 1.4933726067746687, "grad_norm": 0.41434556245803833, "learning_rate": 5.907596318899157e-06, "loss": 0.4238, "step": 2028 }, { "epoch": 1.4941089837997055, "grad_norm": 0.4469594657421112, "learning_rate": 5.9033823983292095e-06, "loss": 0.4217, "step": 2029 }, { "epoch": 1.4948453608247423, "grad_norm": 0.4168426990509033, "learning_rate": 5.899167814342527e-06, "loss": 0.4015, "step": 2030 }, { "epoch": 1.495581737849779, "grad_norm": 0.3973442018032074, "learning_rate": 5.8949525700341735e-06, "loss": 0.3986, "step": 2031 }, { "epoch": 1.496318114874816, "grad_norm": 0.5031341314315796, "learning_rate": 5.890736668499696e-06, "loss": 0.4217, "step": 2032 }, { "epoch": 1.4970544918998527, "grad_norm": 0.42431867122650146, "learning_rate": 5.886520112835128e-06, "loss": 0.3919, "step": 2033 }, { "epoch": 1.4977908689248896, "grad_norm": 0.4433683454990387, "learning_rate": 5.8823029061369785e-06, "loss": 0.393, "step": 2034 }, { "epoch": 1.4985272459499264, "grad_norm": 0.3885188400745392, "learning_rate": 5.878085051502236e-06, "loss": 0.3966, "step": 2035 }, { "epoch": 1.4992636229749632, "grad_norm": 0.4179115295410156, "learning_rate": 5.873866552028367e-06, "loss": 0.39, "step": 2036 }, { "epoch": 1.5, "grad_norm": 0.409787118434906, "learning_rate": 5.86964741081331e-06, "loss": 0.4102, "step": 2037 }, { "epoch": 1.5007363770250368, "grad_norm": 0.4352511465549469, "learning_rate": 5.865427630955475e-06, "loss": 0.4002, "step": 2038 }, { "epoch": 1.5014727540500736, "grad_norm": 0.47778865694999695, "learning_rate": 5.861207215553739e-06, "loss": 0.3996, "step": 2039 }, { "epoch": 1.5022091310751104, "grad_norm": 0.3906075954437256, "learning_rate": 5.856986167707448e-06, "loss": 0.3844, "step": 2040 }, { "epoch": 1.5029455081001473, "grad_norm": 0.4253922402858734, "learning_rate": 5.852764490516414e-06, "loss": 0.4071, "step": 2041 }, { "epoch": 1.503681885125184, "grad_norm": 0.39142486453056335, "learning_rate": 5.8485421870809076e-06, "loss": 0.3863, "step": 2042 }, { "epoch": 1.504418262150221, "grad_norm": 0.4321049153804779, "learning_rate": 5.8443192605016604e-06, "loss": 0.4135, "step": 2043 }, { "epoch": 1.5051546391752577, "grad_norm": 0.4090045988559723, "learning_rate": 5.840095713879864e-06, "loss": 0.4048, "step": 2044 }, { "epoch": 1.5058910162002945, "grad_norm": 0.4378100037574768, "learning_rate": 5.83587155031716e-06, "loss": 0.4017, "step": 2045 }, { "epoch": 1.5066273932253313, "grad_norm": 0.42427340149879456, "learning_rate": 5.831646772915651e-06, "loss": 0.4201, "step": 2046 }, { "epoch": 1.5073637702503682, "grad_norm": 0.4345465898513794, "learning_rate": 5.827421384777883e-06, "loss": 0.3987, "step": 2047 }, { "epoch": 1.508100147275405, "grad_norm": 0.4228680729866028, "learning_rate": 5.823195389006853e-06, "loss": 0.4231, "step": 2048 }, { "epoch": 1.5088365243004418, "grad_norm": 0.3961130976676941, "learning_rate": 5.818968788706006e-06, "loss": 0.4138, "step": 2049 }, { "epoch": 1.5095729013254786, "grad_norm": 0.3891131579875946, "learning_rate": 5.814741586979228e-06, "loss": 0.3979, "step": 2050 }, { "epoch": 1.5103092783505154, "grad_norm": 0.44312554597854614, "learning_rate": 5.810513786930849e-06, "loss": 0.4171, "step": 2051 }, { "epoch": 1.5110456553755522, "grad_norm": 0.36081069707870483, "learning_rate": 5.806285391665639e-06, "loss": 0.3745, "step": 2052 }, { "epoch": 1.511782032400589, "grad_norm": 0.43281090259552, "learning_rate": 5.8020564042888015e-06, "loss": 0.408, "step": 2053 }, { "epoch": 1.5125184094256259, "grad_norm": 0.4164915680885315, "learning_rate": 5.7978268279059795e-06, "loss": 0.3897, "step": 2054 }, { "epoch": 1.5132547864506627, "grad_norm": 0.41813069581985474, "learning_rate": 5.7935966656232434e-06, "loss": 0.4268, "step": 2055 }, { "epoch": 1.5139911634756995, "grad_norm": 0.4125601053237915, "learning_rate": 5.789365920547098e-06, "loss": 0.3895, "step": 2056 }, { "epoch": 1.5147275405007363, "grad_norm": 0.4070327579975128, "learning_rate": 5.785134595784473e-06, "loss": 0.3862, "step": 2057 }, { "epoch": 1.5154639175257731, "grad_norm": 0.40182122588157654, "learning_rate": 5.780902694442727e-06, "loss": 0.4227, "step": 2058 }, { "epoch": 1.51620029455081, "grad_norm": 0.40783995389938354, "learning_rate": 5.776670219629643e-06, "loss": 0.3972, "step": 2059 }, { "epoch": 1.5169366715758468, "grad_norm": 0.4468885064125061, "learning_rate": 5.772437174453418e-06, "loss": 0.384, "step": 2060 }, { "epoch": 1.5176730486008836, "grad_norm": 0.4095504581928253, "learning_rate": 5.768203562022674e-06, "loss": 0.4271, "step": 2061 }, { "epoch": 1.5184094256259204, "grad_norm": 0.4125244915485382, "learning_rate": 5.7639693854464495e-06, "loss": 0.429, "step": 2062 }, { "epoch": 1.5191458026509572, "grad_norm": 0.3764050602912903, "learning_rate": 5.7597346478341946e-06, "loss": 0.4135, "step": 2063 }, { "epoch": 1.519882179675994, "grad_norm": 0.3860231637954712, "learning_rate": 5.755499352295772e-06, "loss": 0.387, "step": 2064 }, { "epoch": 1.5206185567010309, "grad_norm": 0.42220139503479004, "learning_rate": 5.751263501941454e-06, "loss": 0.385, "step": 2065 }, { "epoch": 1.5213549337260677, "grad_norm": 0.4375130236148834, "learning_rate": 5.747027099881925e-06, "loss": 0.4225, "step": 2066 }, { "epoch": 1.5220913107511045, "grad_norm": 0.3937590420246124, "learning_rate": 5.742790149228268e-06, "loss": 0.4172, "step": 2067 }, { "epoch": 1.5228276877761413, "grad_norm": 0.49965283274650574, "learning_rate": 5.738552653091971e-06, "loss": 0.3976, "step": 2068 }, { "epoch": 1.5235640648011781, "grad_norm": 0.4149169623851776, "learning_rate": 5.734314614584924e-06, "loss": 0.4299, "step": 2069 }, { "epoch": 1.524300441826215, "grad_norm": 0.41810843348503113, "learning_rate": 5.730076036819414e-06, "loss": 0.3996, "step": 2070 }, { "epoch": 1.5250368188512518, "grad_norm": 0.42622384428977966, "learning_rate": 5.725836922908125e-06, "loss": 0.387, "step": 2071 }, { "epoch": 1.5257731958762886, "grad_norm": 0.40818580985069275, "learning_rate": 5.7215972759641335e-06, "loss": 0.4264, "step": 2072 }, { "epoch": 1.5265095729013254, "grad_norm": 0.4584078788757324, "learning_rate": 5.71735709910091e-06, "loss": 0.4179, "step": 2073 }, { "epoch": 1.5272459499263622, "grad_norm": 0.44075214862823486, "learning_rate": 5.7131163954323085e-06, "loss": 0.4033, "step": 2074 }, { "epoch": 1.527982326951399, "grad_norm": 0.4163413643836975, "learning_rate": 5.708875168072577e-06, "loss": 0.4279, "step": 2075 }, { "epoch": 1.5287187039764358, "grad_norm": 0.40220654010772705, "learning_rate": 5.704633420136343e-06, "loss": 0.3998, "step": 2076 }, { "epoch": 1.5294550810014726, "grad_norm": 0.45772379636764526, "learning_rate": 5.700391154738619e-06, "loss": 0.3933, "step": 2077 }, { "epoch": 1.5301914580265095, "grad_norm": 0.38176655769348145, "learning_rate": 5.696148374994795e-06, "loss": 0.4204, "step": 2078 }, { "epoch": 1.5309278350515463, "grad_norm": 0.4314991533756256, "learning_rate": 5.691905084020642e-06, "loss": 0.3963, "step": 2079 }, { "epoch": 1.531664212076583, "grad_norm": 0.42802858352661133, "learning_rate": 5.687661284932306e-06, "loss": 0.4353, "step": 2080 }, { "epoch": 1.53240058910162, "grad_norm": 0.4672757089138031, "learning_rate": 5.6834169808463e-06, "loss": 0.439, "step": 2081 }, { "epoch": 1.5331369661266567, "grad_norm": 0.42437198758125305, "learning_rate": 5.679172174879516e-06, "loss": 0.3863, "step": 2082 }, { "epoch": 1.5338733431516935, "grad_norm": 0.404613196849823, "learning_rate": 5.67492687014921e-06, "loss": 0.4005, "step": 2083 }, { "epoch": 1.5346097201767304, "grad_norm": 0.41044384241104126, "learning_rate": 5.6706810697730095e-06, "loss": 0.4065, "step": 2084 }, { "epoch": 1.5353460972017672, "grad_norm": 0.4328327178955078, "learning_rate": 5.666434776868895e-06, "loss": 0.4264, "step": 2085 }, { "epoch": 1.536082474226804, "grad_norm": 0.44456425309181213, "learning_rate": 5.662187994555221e-06, "loss": 0.3825, "step": 2086 }, { "epoch": 1.5368188512518408, "grad_norm": 0.4055061638355255, "learning_rate": 5.657940725950693e-06, "loss": 0.3757, "step": 2087 }, { "epoch": 1.5375552282768776, "grad_norm": 0.4078313112258911, "learning_rate": 5.65369297417438e-06, "loss": 0.414, "step": 2088 }, { "epoch": 1.5382916053019144, "grad_norm": 0.4202408194541931, "learning_rate": 5.6494447423457e-06, "loss": 0.406, "step": 2089 }, { "epoch": 1.5390279823269513, "grad_norm": 0.4479289650917053, "learning_rate": 5.645196033584426e-06, "loss": 0.384, "step": 2090 }, { "epoch": 1.539764359351988, "grad_norm": 0.40371423959732056, "learning_rate": 5.640946851010682e-06, "loss": 0.3914, "step": 2091 }, { "epoch": 1.540500736377025, "grad_norm": 0.3889717161655426, "learning_rate": 5.636697197744941e-06, "loss": 0.4069, "step": 2092 }, { "epoch": 1.5412371134020617, "grad_norm": 0.4180935323238373, "learning_rate": 5.6324470769080165e-06, "loss": 0.4073, "step": 2093 }, { "epoch": 1.5419734904270985, "grad_norm": 0.46457764506340027, "learning_rate": 5.6281964916210715e-06, "loss": 0.3946, "step": 2094 }, { "epoch": 1.5427098674521353, "grad_norm": 0.41641589999198914, "learning_rate": 5.6239454450056066e-06, "loss": 0.4287, "step": 2095 }, { "epoch": 1.5434462444771722, "grad_norm": 0.40457090735435486, "learning_rate": 5.6196939401834625e-06, "loss": 0.4061, "step": 2096 }, { "epoch": 1.544182621502209, "grad_norm": 0.42012038826942444, "learning_rate": 5.615441980276814e-06, "loss": 0.4113, "step": 2097 }, { "epoch": 1.5449189985272458, "grad_norm": 0.4231841564178467, "learning_rate": 5.611189568408173e-06, "loss": 0.4352, "step": 2098 }, { "epoch": 1.5456553755522826, "grad_norm": 0.3875311017036438, "learning_rate": 5.6069367077003835e-06, "loss": 0.4069, "step": 2099 }, { "epoch": 1.5463917525773194, "grad_norm": 0.38472917675971985, "learning_rate": 5.6026834012766155e-06, "loss": 0.3963, "step": 2100 }, { "epoch": 1.5471281296023565, "grad_norm": 0.48356834053993225, "learning_rate": 5.598429652260371e-06, "loss": 0.4407, "step": 2101 }, { "epoch": 1.5478645066273933, "grad_norm": 0.3837643563747406, "learning_rate": 5.594175463775475e-06, "loss": 0.3983, "step": 2102 }, { "epoch": 1.54860088365243, "grad_norm": 0.4311898648738861, "learning_rate": 5.5899208389460715e-06, "loss": 0.4, "step": 2103 }, { "epoch": 1.549337260677467, "grad_norm": 0.38184744119644165, "learning_rate": 5.5856657808966315e-06, "loss": 0.3998, "step": 2104 }, { "epoch": 1.5500736377025037, "grad_norm": 0.4166722297668457, "learning_rate": 5.581410292751941e-06, "loss": 0.3881, "step": 2105 }, { "epoch": 1.5508100147275405, "grad_norm": 0.44956570863723755, "learning_rate": 5.577154377637101e-06, "loss": 0.4095, "step": 2106 }, { "epoch": 1.5515463917525774, "grad_norm": 0.3997005522251129, "learning_rate": 5.572898038677526e-06, "loss": 0.4035, "step": 2107 }, { "epoch": 1.5522827687776142, "grad_norm": 0.4489591121673584, "learning_rate": 5.5686412789989444e-06, "loss": 0.4001, "step": 2108 }, { "epoch": 1.553019145802651, "grad_norm": 0.40616336464881897, "learning_rate": 5.5643841017273915e-06, "loss": 0.3858, "step": 2109 }, { "epoch": 1.5537555228276878, "grad_norm": 0.46725019812583923, "learning_rate": 5.560126509989209e-06, "loss": 0.3954, "step": 2110 }, { "epoch": 1.5544918998527246, "grad_norm": 0.42120271921157837, "learning_rate": 5.5558685069110444e-06, "loss": 0.4105, "step": 2111 }, { "epoch": 1.5552282768777614, "grad_norm": 0.361070454120636, "learning_rate": 5.5516100956198445e-06, "loss": 0.3762, "step": 2112 }, { "epoch": 1.5559646539027983, "grad_norm": 0.4115205705165863, "learning_rate": 5.547351279242861e-06, "loss": 0.4182, "step": 2113 }, { "epoch": 1.556701030927835, "grad_norm": 0.40333011746406555, "learning_rate": 5.543092060907639e-06, "loss": 0.3663, "step": 2114 }, { "epoch": 1.5574374079528719, "grad_norm": 0.4221840500831604, "learning_rate": 5.538832443742018e-06, "loss": 0.4132, "step": 2115 }, { "epoch": 1.5581737849779087, "grad_norm": 0.40356510877609253, "learning_rate": 5.5345724308741326e-06, "loss": 0.3685, "step": 2116 }, { "epoch": 1.5589101620029455, "grad_norm": 0.47017496824264526, "learning_rate": 5.5303120254324104e-06, "loss": 0.4117, "step": 2117 }, { "epoch": 1.5596465390279823, "grad_norm": 0.3889496624469757, "learning_rate": 5.52605123054556e-06, "loss": 0.4128, "step": 2118 }, { "epoch": 1.5603829160530192, "grad_norm": 0.44326820969581604, "learning_rate": 5.521790049342583e-06, "loss": 0.4213, "step": 2119 }, { "epoch": 1.561119293078056, "grad_norm": 0.4094356298446655, "learning_rate": 5.5175284849527635e-06, "loss": 0.4006, "step": 2120 }, { "epoch": 1.5618556701030928, "grad_norm": 0.46180570125579834, "learning_rate": 5.513266540505662e-06, "loss": 0.4114, "step": 2121 }, { "epoch": 1.5625920471281296, "grad_norm": 0.38660722970962524, "learning_rate": 5.509004219131124e-06, "loss": 0.3904, "step": 2122 }, { "epoch": 1.5633284241531664, "grad_norm": 0.42590051889419556, "learning_rate": 5.504741523959269e-06, "loss": 0.4155, "step": 2123 }, { "epoch": 1.5640648011782032, "grad_norm": 0.4073706269264221, "learning_rate": 5.500478458120493e-06, "loss": 0.4318, "step": 2124 }, { "epoch": 1.56480117820324, "grad_norm": 0.39798110723495483, "learning_rate": 5.49621502474546e-06, "loss": 0.416, "step": 2125 }, { "epoch": 1.5655375552282769, "grad_norm": 0.4096386432647705, "learning_rate": 5.491951226965108e-06, "loss": 0.4398, "step": 2126 }, { "epoch": 1.5662739322533137, "grad_norm": 0.4136483669281006, "learning_rate": 5.48768706791064e-06, "loss": 0.4249, "step": 2127 }, { "epoch": 1.5670103092783505, "grad_norm": 0.435358464717865, "learning_rate": 5.4834225507135284e-06, "loss": 0.3931, "step": 2128 }, { "epoch": 1.5677466863033873, "grad_norm": 0.39365580677986145, "learning_rate": 5.479157678505503e-06, "loss": 0.419, "step": 2129 }, { "epoch": 1.5684830633284241, "grad_norm": 0.4026069939136505, "learning_rate": 5.474892454418559e-06, "loss": 0.393, "step": 2130 }, { "epoch": 1.569219440353461, "grad_norm": 0.44560348987579346, "learning_rate": 5.470626881584948e-06, "loss": 0.4176, "step": 2131 }, { "epoch": 1.5699558173784978, "grad_norm": 0.4150668978691101, "learning_rate": 5.466360963137175e-06, "loss": 0.4245, "step": 2132 }, { "epoch": 1.5706921944035346, "grad_norm": 0.3756403923034668, "learning_rate": 5.462094702208004e-06, "loss": 0.4057, "step": 2133 }, { "epoch": 1.5714285714285714, "grad_norm": 0.45393243432044983, "learning_rate": 5.4578281019304494e-06, "loss": 0.411, "step": 2134 }, { "epoch": 1.5721649484536082, "grad_norm": 0.4269341826438904, "learning_rate": 5.453561165437771e-06, "loss": 0.3946, "step": 2135 }, { "epoch": 1.572901325478645, "grad_norm": 0.39662691950798035, "learning_rate": 5.449293895863478e-06, "loss": 0.3958, "step": 2136 }, { "epoch": 1.5736377025036818, "grad_norm": 0.5086665153503418, "learning_rate": 5.445026296341325e-06, "loss": 0.4376, "step": 2137 }, { "epoch": 1.5743740795287187, "grad_norm": 0.43581289052963257, "learning_rate": 5.440758370005309e-06, "loss": 0.4384, "step": 2138 }, { "epoch": 1.5751104565537555, "grad_norm": 0.39093270897865295, "learning_rate": 5.4364901199896655e-06, "loss": 0.4093, "step": 2139 }, { "epoch": 1.5758468335787923, "grad_norm": 0.4165237545967102, "learning_rate": 5.432221549428867e-06, "loss": 0.3921, "step": 2140 }, { "epoch": 1.576583210603829, "grad_norm": 0.4502768814563751, "learning_rate": 5.427952661457624e-06, "loss": 0.4246, "step": 2141 }, { "epoch": 1.577319587628866, "grad_norm": 0.42256587743759155, "learning_rate": 5.42368345921088e-06, "loss": 0.3885, "step": 2142 }, { "epoch": 1.5780559646539027, "grad_norm": 0.3865189552307129, "learning_rate": 5.419413945823806e-06, "loss": 0.4251, "step": 2143 }, { "epoch": 1.5787923416789398, "grad_norm": 0.405563086271286, "learning_rate": 5.415144124431805e-06, "loss": 0.4107, "step": 2144 }, { "epoch": 1.5795287187039766, "grad_norm": 0.4749312102794647, "learning_rate": 5.410873998170503e-06, "loss": 0.4022, "step": 2145 }, { "epoch": 1.5802650957290134, "grad_norm": 0.41955462098121643, "learning_rate": 5.4066035701757535e-06, "loss": 0.4113, "step": 2146 }, { "epoch": 1.5810014727540502, "grad_norm": 0.34526562690734863, "learning_rate": 5.402332843583631e-06, "loss": 0.4176, "step": 2147 }, { "epoch": 1.581737849779087, "grad_norm": 0.4131619334220886, "learning_rate": 5.398061821530423e-06, "loss": 0.398, "step": 2148 }, { "epoch": 1.5824742268041239, "grad_norm": 0.38839566707611084, "learning_rate": 5.393790507152645e-06, "loss": 0.4031, "step": 2149 }, { "epoch": 1.5832106038291607, "grad_norm": 0.35662680864334106, "learning_rate": 5.389518903587016e-06, "loss": 0.3957, "step": 2150 }, { "epoch": 1.5839469808541975, "grad_norm": 0.417187362909317, "learning_rate": 5.3852470139704786e-06, "loss": 0.4268, "step": 2151 }, { "epoch": 1.5846833578792343, "grad_norm": 0.4425226151943207, "learning_rate": 5.380974841440173e-06, "loss": 0.4115, "step": 2152 }, { "epoch": 1.5854197349042711, "grad_norm": 0.41201913356781006, "learning_rate": 5.376702389133458e-06, "loss": 0.4056, "step": 2153 }, { "epoch": 1.586156111929308, "grad_norm": 0.39344313740730286, "learning_rate": 5.37242966018789e-06, "loss": 0.4178, "step": 2154 }, { "epoch": 1.5868924889543448, "grad_norm": 0.41077542304992676, "learning_rate": 5.3681566577412355e-06, "loss": 0.4046, "step": 2155 }, { "epoch": 1.5876288659793816, "grad_norm": 0.40460658073425293, "learning_rate": 5.363883384931456e-06, "loss": 0.4302, "step": 2156 }, { "epoch": 1.5883652430044184, "grad_norm": 0.43949198722839355, "learning_rate": 5.359609844896717e-06, "loss": 0.4148, "step": 2157 }, { "epoch": 1.5891016200294552, "grad_norm": 0.43986040353775024, "learning_rate": 5.355336040775373e-06, "loss": 0.4092, "step": 2158 }, { "epoch": 1.589837997054492, "grad_norm": 0.41094833612442017, "learning_rate": 5.3510619757059775e-06, "loss": 0.3936, "step": 2159 }, { "epoch": 1.5905743740795288, "grad_norm": 0.456833153963089, "learning_rate": 5.346787652827279e-06, "loss": 0.416, "step": 2160 }, { "epoch": 1.5913107511045657, "grad_norm": 0.43459081649780273, "learning_rate": 5.3425130752782065e-06, "loss": 0.4134, "step": 2161 }, { "epoch": 1.5920471281296025, "grad_norm": 0.41692793369293213, "learning_rate": 5.33823824619788e-06, "loss": 0.3983, "step": 2162 }, { "epoch": 1.5927835051546393, "grad_norm": 0.37748250365257263, "learning_rate": 5.3339631687256085e-06, "loss": 0.399, "step": 2163 }, { "epoch": 1.593519882179676, "grad_norm": 0.40077048540115356, "learning_rate": 5.3296878460008785e-06, "loss": 0.3902, "step": 2164 }, { "epoch": 1.594256259204713, "grad_norm": 0.4162245988845825, "learning_rate": 5.325412281163356e-06, "loss": 0.3972, "step": 2165 }, { "epoch": 1.5949926362297497, "grad_norm": 0.41424018144607544, "learning_rate": 5.321136477352887e-06, "loss": 0.396, "step": 2166 }, { "epoch": 1.5957290132547866, "grad_norm": 0.4058535099029541, "learning_rate": 5.3168604377094945e-06, "loss": 0.3772, "step": 2167 }, { "epoch": 1.5964653902798234, "grad_norm": 0.44171908497810364, "learning_rate": 5.312584165373372e-06, "loss": 0.4135, "step": 2168 }, { "epoch": 1.5972017673048602, "grad_norm": 0.46828219294548035, "learning_rate": 5.308307663484884e-06, "loss": 0.4229, "step": 2169 }, { "epoch": 1.597938144329897, "grad_norm": 0.39885610342025757, "learning_rate": 5.304030935184564e-06, "loss": 0.4146, "step": 2170 }, { "epoch": 1.5986745213549338, "grad_norm": 0.4212936460971832, "learning_rate": 5.299753983613114e-06, "loss": 0.4033, "step": 2171 }, { "epoch": 1.5994108983799706, "grad_norm": 0.4296552836894989, "learning_rate": 5.2954768119113975e-06, "loss": 0.4278, "step": 2172 }, { "epoch": 1.6001472754050075, "grad_norm": 0.4296092987060547, "learning_rate": 5.291199423220438e-06, "loss": 0.4141, "step": 2173 }, { "epoch": 1.6008836524300443, "grad_norm": 0.3816363215446472, "learning_rate": 5.286921820681421e-06, "loss": 0.4128, "step": 2174 }, { "epoch": 1.601620029455081, "grad_norm": 0.415924608707428, "learning_rate": 5.28264400743569e-06, "loss": 0.4133, "step": 2175 }, { "epoch": 1.602356406480118, "grad_norm": 0.47628721594810486, "learning_rate": 5.278365986624743e-06, "loss": 0.4195, "step": 2176 }, { "epoch": 1.6030927835051547, "grad_norm": 0.477327823638916, "learning_rate": 5.274087761390224e-06, "loss": 0.4116, "step": 2177 }, { "epoch": 1.6038291605301915, "grad_norm": 0.42795825004577637, "learning_rate": 5.269809334873939e-06, "loss": 0.4033, "step": 2178 }, { "epoch": 1.6045655375552283, "grad_norm": 0.4993734657764435, "learning_rate": 5.2655307102178285e-06, "loss": 0.4451, "step": 2179 }, { "epoch": 1.6053019145802652, "grad_norm": 0.49656498432159424, "learning_rate": 5.26125189056399e-06, "loss": 0.4144, "step": 2180 }, { "epoch": 1.606038291605302, "grad_norm": 0.4029616713523865, "learning_rate": 5.256972879054659e-06, "loss": 0.4113, "step": 2181 }, { "epoch": 1.6067746686303388, "grad_norm": 0.46401447057724, "learning_rate": 5.2526936788322106e-06, "loss": 0.4219, "step": 2182 }, { "epoch": 1.6075110456553756, "grad_norm": 0.49594005942344666, "learning_rate": 5.248414293039159e-06, "loss": 0.4095, "step": 2183 }, { "epoch": 1.6082474226804124, "grad_norm": 0.5078505873680115, "learning_rate": 5.244134724818158e-06, "loss": 0.4391, "step": 2184 }, { "epoch": 1.6089837997054492, "grad_norm": 0.3824053108692169, "learning_rate": 5.2398549773119945e-06, "loss": 0.391, "step": 2185 }, { "epoch": 1.609720176730486, "grad_norm": 0.44562169909477234, "learning_rate": 5.235575053663582e-06, "loss": 0.4052, "step": 2186 }, { "epoch": 1.6104565537555229, "grad_norm": 0.44519391655921936, "learning_rate": 5.231294957015969e-06, "loss": 0.4049, "step": 2187 }, { "epoch": 1.6111929307805597, "grad_norm": 0.38943272829055786, "learning_rate": 5.2270146905123285e-06, "loss": 0.3984, "step": 2188 }, { "epoch": 1.6119293078055965, "grad_norm": 0.4190487265586853, "learning_rate": 5.222734257295963e-06, "loss": 0.3971, "step": 2189 }, { "epoch": 1.6126656848306333, "grad_norm": 0.5010129809379578, "learning_rate": 5.218453660510287e-06, "loss": 0.3993, "step": 2190 }, { "epoch": 1.6134020618556701, "grad_norm": 0.4155033826828003, "learning_rate": 5.214172903298843e-06, "loss": 0.4258, "step": 2191 }, { "epoch": 1.614138438880707, "grad_norm": 0.441825270652771, "learning_rate": 5.209891988805292e-06, "loss": 0.4368, "step": 2192 }, { "epoch": 1.6148748159057438, "grad_norm": 0.49732720851898193, "learning_rate": 5.205610920173408e-06, "loss": 0.3995, "step": 2193 }, { "epoch": 1.6156111929307806, "grad_norm": 0.39198291301727295, "learning_rate": 5.201329700547077e-06, "loss": 0.412, "step": 2194 }, { "epoch": 1.6163475699558174, "grad_norm": 0.37358447909355164, "learning_rate": 5.197048333070297e-06, "loss": 0.4017, "step": 2195 }, { "epoch": 1.6170839469808542, "grad_norm": 0.49914005398750305, "learning_rate": 5.192766820887177e-06, "loss": 0.4346, "step": 2196 }, { "epoch": 1.617820324005891, "grad_norm": 0.45152711868286133, "learning_rate": 5.188485167141929e-06, "loss": 0.3845, "step": 2197 }, { "epoch": 1.6185567010309279, "grad_norm": 0.3699303865432739, "learning_rate": 5.1842033749788686e-06, "loss": 0.3937, "step": 2198 }, { "epoch": 1.6192930780559647, "grad_norm": 0.578906238079071, "learning_rate": 5.179921447542417e-06, "loss": 0.4365, "step": 2199 }, { "epoch": 1.6200294550810015, "grad_norm": 0.3776850998401642, "learning_rate": 5.175639387977091e-06, "loss": 0.4229, "step": 2200 }, { "epoch": 1.6207658321060383, "grad_norm": 0.5020015239715576, "learning_rate": 5.171357199427507e-06, "loss": 0.4087, "step": 2201 }, { "epoch": 1.6215022091310751, "grad_norm": 0.447489857673645, "learning_rate": 5.1670748850383734e-06, "loss": 0.4046, "step": 2202 }, { "epoch": 1.622238586156112, "grad_norm": 0.4035995602607727, "learning_rate": 5.162792447954494e-06, "loss": 0.3878, "step": 2203 }, { "epoch": 1.6229749631811488, "grad_norm": 0.43608716130256653, "learning_rate": 5.158509891320759e-06, "loss": 0.4031, "step": 2204 }, { "epoch": 1.6237113402061856, "grad_norm": 0.5111272931098938, "learning_rate": 5.154227218282149e-06, "loss": 0.3981, "step": 2205 }, { "epoch": 1.6244477172312224, "grad_norm": 0.40405896306037903, "learning_rate": 5.1499444319837326e-06, "loss": 0.3946, "step": 2206 }, { "epoch": 1.6251840942562592, "grad_norm": 0.4375455975532532, "learning_rate": 5.145661535570656e-06, "loss": 0.4045, "step": 2207 }, { "epoch": 1.625920471281296, "grad_norm": 0.4746772348880768, "learning_rate": 5.141378532188148e-06, "loss": 0.4113, "step": 2208 }, { "epoch": 1.6266568483063328, "grad_norm": 0.4946158826351166, "learning_rate": 5.137095424981519e-06, "loss": 0.4086, "step": 2209 }, { "epoch": 1.6273932253313697, "grad_norm": 0.4054524600505829, "learning_rate": 5.1328122170961534e-06, "loss": 0.4218, "step": 2210 }, { "epoch": 1.6281296023564065, "grad_norm": 0.5024380683898926, "learning_rate": 5.128528911677509e-06, "loss": 0.3965, "step": 2211 }, { "epoch": 1.6288659793814433, "grad_norm": 0.409605473279953, "learning_rate": 5.124245511871115e-06, "loss": 0.4167, "step": 2212 }, { "epoch": 1.62960235640648, "grad_norm": 0.44012802839279175, "learning_rate": 5.119962020822572e-06, "loss": 0.431, "step": 2213 }, { "epoch": 1.630338733431517, "grad_norm": 0.41772133111953735, "learning_rate": 5.115678441677546e-06, "loss": 0.3957, "step": 2214 }, { "epoch": 1.6310751104565537, "grad_norm": 0.3946409523487091, "learning_rate": 5.111394777581769e-06, "loss": 0.422, "step": 2215 }, { "epoch": 1.6318114874815906, "grad_norm": 0.4510660767555237, "learning_rate": 5.107111031681034e-06, "loss": 0.4063, "step": 2216 }, { "epoch": 1.6325478645066274, "grad_norm": 0.4569968581199646, "learning_rate": 5.1028272071211916e-06, "loss": 0.4193, "step": 2217 }, { "epoch": 1.6332842415316642, "grad_norm": 0.45792651176452637, "learning_rate": 5.098543307048158e-06, "loss": 0.4073, "step": 2218 }, { "epoch": 1.634020618556701, "grad_norm": 0.42194387316703796, "learning_rate": 5.094259334607896e-06, "loss": 0.3922, "step": 2219 }, { "epoch": 1.6347569955817378, "grad_norm": 0.48982447385787964, "learning_rate": 5.089975292946427e-06, "loss": 0.4192, "step": 2220 }, { "epoch": 1.6354933726067746, "grad_norm": 0.4341419041156769, "learning_rate": 5.085691185209824e-06, "loss": 0.428, "step": 2221 }, { "epoch": 1.6362297496318114, "grad_norm": 0.4825488030910492, "learning_rate": 5.081407014544202e-06, "loss": 0.4034, "step": 2222 }, { "epoch": 1.6369661266568483, "grad_norm": 0.4492826759815216, "learning_rate": 5.07712278409573e-06, "loss": 0.4292, "step": 2223 }, { "epoch": 1.637702503681885, "grad_norm": 0.44333913922309875, "learning_rate": 5.0728384970106135e-06, "loss": 0.3939, "step": 2224 }, { "epoch": 1.638438880706922, "grad_norm": 0.3804095387458801, "learning_rate": 5.068554156435108e-06, "loss": 0.3795, "step": 2225 }, { "epoch": 1.6391752577319587, "grad_norm": 0.3924141228199005, "learning_rate": 5.0642697655155e-06, "loss": 0.3771, "step": 2226 }, { "epoch": 1.6399116347569955, "grad_norm": 0.4596264660358429, "learning_rate": 5.059985327398121e-06, "loss": 0.4098, "step": 2227 }, { "epoch": 1.6406480117820323, "grad_norm": 0.4128621220588684, "learning_rate": 5.0557008452293275e-06, "loss": 0.4235, "step": 2228 }, { "epoch": 1.6413843888070692, "grad_norm": 0.44195055961608887, "learning_rate": 5.051416322155519e-06, "loss": 0.397, "step": 2229 }, { "epoch": 1.642120765832106, "grad_norm": 0.4374493956565857, "learning_rate": 5.047131761323115e-06, "loss": 0.3965, "step": 2230 }, { "epoch": 1.6428571428571428, "grad_norm": 0.39338651299476624, "learning_rate": 5.0428471658785715e-06, "loss": 0.3714, "step": 2231 }, { "epoch": 1.6435935198821796, "grad_norm": 0.4788668155670166, "learning_rate": 5.038562538968363e-06, "loss": 0.4252, "step": 2232 }, { "epoch": 1.6443298969072164, "grad_norm": 0.4350014328956604, "learning_rate": 5.034277883738992e-06, "loss": 0.3947, "step": 2233 }, { "epoch": 1.6450662739322532, "grad_norm": 0.3968852460384369, "learning_rate": 5.029993203336978e-06, "loss": 0.4066, "step": 2234 }, { "epoch": 1.64580265095729, "grad_norm": 0.3912464678287506, "learning_rate": 5.025708500908864e-06, "loss": 0.4011, "step": 2235 }, { "epoch": 1.6465390279823269, "grad_norm": 0.4370138943195343, "learning_rate": 5.021423779601202e-06, "loss": 0.3789, "step": 2236 }, { "epoch": 1.6472754050073637, "grad_norm": 0.39066141843795776, "learning_rate": 5.017139042560564e-06, "loss": 0.3597, "step": 2237 }, { "epoch": 1.6480117820324005, "grad_norm": 0.45047131180763245, "learning_rate": 5.01285429293353e-06, "loss": 0.4255, "step": 2238 }, { "epoch": 1.6487481590574373, "grad_norm": 0.4357801377773285, "learning_rate": 5.008569533866693e-06, "loss": 0.4201, "step": 2239 }, { "epoch": 1.6494845360824741, "grad_norm": 0.3798350393772125, "learning_rate": 5.00428476850665e-06, "loss": 0.3872, "step": 2240 }, { "epoch": 1.650220913107511, "grad_norm": 0.4238215386867523, "learning_rate": 5e-06, "loss": 0.4003, "step": 2241 }, { "epoch": 1.6509572901325478, "grad_norm": 0.4555947482585907, "learning_rate": 4.995715231493352e-06, "loss": 0.4197, "step": 2242 }, { "epoch": 1.6516936671575846, "grad_norm": 0.3676789700984955, "learning_rate": 4.991430466133308e-06, "loss": 0.3704, "step": 2243 }, { "epoch": 1.6524300441826214, "grad_norm": 0.43429771065711975, "learning_rate": 4.98714570706647e-06, "loss": 0.4075, "step": 2244 }, { "epoch": 1.6531664212076582, "grad_norm": 0.4155738055706024, "learning_rate": 4.982860957439437e-06, "loss": 0.4313, "step": 2245 }, { "epoch": 1.653902798232695, "grad_norm": 0.38362860679626465, "learning_rate": 4.978576220398801e-06, "loss": 0.4021, "step": 2246 }, { "epoch": 1.6546391752577319, "grad_norm": 0.39530959725379944, "learning_rate": 4.97429149909114e-06, "loss": 0.3926, "step": 2247 }, { "epoch": 1.6553755522827687, "grad_norm": 0.36841920018196106, "learning_rate": 4.970006796663023e-06, "loss": 0.4198, "step": 2248 }, { "epoch": 1.6561119293078055, "grad_norm": 0.34673774242401123, "learning_rate": 4.965722116261009e-06, "loss": 0.413, "step": 2249 }, { "epoch": 1.6568483063328423, "grad_norm": 0.35781288146972656, "learning_rate": 4.961437461031638e-06, "loss": 0.4232, "step": 2250 }, { "epoch": 1.6575846833578791, "grad_norm": 0.4127185046672821, "learning_rate": 4.95715283412143e-06, "loss": 0.4162, "step": 2251 }, { "epoch": 1.658321060382916, "grad_norm": 0.3733428120613098, "learning_rate": 4.952868238676885e-06, "loss": 0.3892, "step": 2252 }, { "epoch": 1.6590574374079528, "grad_norm": 0.46769607067108154, "learning_rate": 4.948583677844482e-06, "loss": 0.3946, "step": 2253 }, { "epoch": 1.6597938144329896, "grad_norm": 0.39663955569267273, "learning_rate": 4.944299154770673e-06, "loss": 0.3957, "step": 2254 }, { "epoch": 1.6605301914580264, "grad_norm": 0.3862704932689667, "learning_rate": 4.940014672601881e-06, "loss": 0.4042, "step": 2255 }, { "epoch": 1.6612665684830632, "grad_norm": 0.36831381916999817, "learning_rate": 4.9357302344845005e-06, "loss": 0.388, "step": 2256 }, { "epoch": 1.6620029455081, "grad_norm": 0.4497531056404114, "learning_rate": 4.931445843564893e-06, "loss": 0.4393, "step": 2257 }, { "epoch": 1.6627393225331368, "grad_norm": 0.4758419692516327, "learning_rate": 4.927161502989387e-06, "loss": 0.3868, "step": 2258 }, { "epoch": 1.6634756995581736, "grad_norm": 0.3863910734653473, "learning_rate": 4.922877215904272e-06, "loss": 0.4029, "step": 2259 }, { "epoch": 1.6642120765832105, "grad_norm": 0.4318389892578125, "learning_rate": 4.918592985455799e-06, "loss": 0.4101, "step": 2260 }, { "epoch": 1.6649484536082473, "grad_norm": 0.44303011894226074, "learning_rate": 4.914308814790178e-06, "loss": 0.3769, "step": 2261 }, { "epoch": 1.665684830633284, "grad_norm": 0.4603565037250519, "learning_rate": 4.910024707053573e-06, "loss": 0.417, "step": 2262 }, { "epoch": 1.666421207658321, "grad_norm": 0.3724450170993805, "learning_rate": 4.905740665392106e-06, "loss": 0.4129, "step": 2263 }, { "epoch": 1.6671575846833577, "grad_norm": 0.4588872790336609, "learning_rate": 4.901456692951844e-06, "loss": 0.4192, "step": 2264 }, { "epoch": 1.6678939617083945, "grad_norm": 0.36367759108543396, "learning_rate": 4.89717279287881e-06, "loss": 0.4206, "step": 2265 }, { "epoch": 1.6686303387334314, "grad_norm": 0.3877205550670624, "learning_rate": 4.892888968318968e-06, "loss": 0.4204, "step": 2266 }, { "epoch": 1.6693667157584682, "grad_norm": 0.4392288625240326, "learning_rate": 4.888605222418232e-06, "loss": 0.4133, "step": 2267 }, { "epoch": 1.670103092783505, "grad_norm": 0.35620570182800293, "learning_rate": 4.884321558322455e-06, "loss": 0.3808, "step": 2268 }, { "epoch": 1.6708394698085418, "grad_norm": 0.39953136444091797, "learning_rate": 4.8800379791774285e-06, "loss": 0.4151, "step": 2269 }, { "epoch": 1.6715758468335786, "grad_norm": 0.46267053484916687, "learning_rate": 4.875754488128885e-06, "loss": 0.3956, "step": 2270 }, { "epoch": 1.6723122238586157, "grad_norm": 0.401598185300827, "learning_rate": 4.871471088322493e-06, "loss": 0.4193, "step": 2271 }, { "epoch": 1.6730486008836525, "grad_norm": 0.41850388050079346, "learning_rate": 4.867187782903847e-06, "loss": 0.3883, "step": 2272 }, { "epoch": 1.6737849779086893, "grad_norm": 0.47216498851776123, "learning_rate": 4.862904575018482e-06, "loss": 0.4087, "step": 2273 }, { "epoch": 1.6745213549337261, "grad_norm": 0.416972279548645, "learning_rate": 4.8586214678118536e-06, "loss": 0.3824, "step": 2274 }, { "epoch": 1.675257731958763, "grad_norm": 0.41057777404785156, "learning_rate": 4.854338464429346e-06, "loss": 0.4095, "step": 2275 }, { "epoch": 1.6759941089837997, "grad_norm": 0.41428229212760925, "learning_rate": 4.850055568016268e-06, "loss": 0.4096, "step": 2276 }, { "epoch": 1.6767304860088366, "grad_norm": 0.5290152430534363, "learning_rate": 4.845772781717852e-06, "loss": 0.3914, "step": 2277 }, { "epoch": 1.6774668630338734, "grad_norm": 0.41980114579200745, "learning_rate": 4.841490108679242e-06, "loss": 0.3946, "step": 2278 }, { "epoch": 1.6782032400589102, "grad_norm": 0.3944283127784729, "learning_rate": 4.837207552045509e-06, "loss": 0.3936, "step": 2279 }, { "epoch": 1.678939617083947, "grad_norm": 0.46467381715774536, "learning_rate": 4.832925114961629e-06, "loss": 0.4159, "step": 2280 }, { "epoch": 1.6796759941089838, "grad_norm": 0.4479691982269287, "learning_rate": 4.828642800572495e-06, "loss": 0.3904, "step": 2281 }, { "epoch": 1.6804123711340206, "grad_norm": 0.39899787306785583, "learning_rate": 4.8243606120229095e-06, "loss": 0.4172, "step": 2282 }, { "epoch": 1.6811487481590575, "grad_norm": 0.4519674479961395, "learning_rate": 4.820078552457584e-06, "loss": 0.401, "step": 2283 }, { "epoch": 1.6818851251840943, "grad_norm": 0.405024915933609, "learning_rate": 4.815796625021132e-06, "loss": 0.4126, "step": 2284 }, { "epoch": 1.682621502209131, "grad_norm": 0.42666709423065186, "learning_rate": 4.811514832858072e-06, "loss": 0.4016, "step": 2285 }, { "epoch": 1.683357879234168, "grad_norm": 0.49751371145248413, "learning_rate": 4.8072331791128244e-06, "loss": 0.4145, "step": 2286 }, { "epoch": 1.6840942562592047, "grad_norm": 0.48239147663116455, "learning_rate": 4.802951666929704e-06, "loss": 0.4144, "step": 2287 }, { "epoch": 1.6848306332842415, "grad_norm": 0.41547465324401855, "learning_rate": 4.798670299452926e-06, "loss": 0.4038, "step": 2288 }, { "epoch": 1.6855670103092784, "grad_norm": 0.4863029420375824, "learning_rate": 4.794389079826594e-06, "loss": 0.3982, "step": 2289 }, { "epoch": 1.6863033873343152, "grad_norm": 0.48596158623695374, "learning_rate": 4.790108011194709e-06, "loss": 0.4637, "step": 2290 }, { "epoch": 1.687039764359352, "grad_norm": 0.3679700791835785, "learning_rate": 4.785827096701159e-06, "loss": 0.3796, "step": 2291 }, { "epoch": 1.6877761413843888, "grad_norm": 0.39074385166168213, "learning_rate": 4.781546339489716e-06, "loss": 0.4275, "step": 2292 }, { "epoch": 1.6885125184094256, "grad_norm": 0.47816234827041626, "learning_rate": 4.777265742704039e-06, "loss": 0.4048, "step": 2293 }, { "epoch": 1.6892488954344624, "grad_norm": 0.4297253489494324, "learning_rate": 4.7729853094876714e-06, "loss": 0.3735, "step": 2294 }, { "epoch": 1.6899852724594993, "grad_norm": 0.3920181095600128, "learning_rate": 4.768705042984031e-06, "loss": 0.4444, "step": 2295 }, { "epoch": 1.690721649484536, "grad_norm": 0.35152071714401245, "learning_rate": 4.7644249463364205e-06, "loss": 0.4234, "step": 2296 }, { "epoch": 1.6914580265095729, "grad_norm": 0.4087672233581543, "learning_rate": 4.760145022688007e-06, "loss": 0.4192, "step": 2297 }, { "epoch": 1.6921944035346097, "grad_norm": 0.40535783767700195, "learning_rate": 4.755865275181843e-06, "loss": 0.4036, "step": 2298 }, { "epoch": 1.6929307805596465, "grad_norm": 0.4112594425678253, "learning_rate": 4.751585706960842e-06, "loss": 0.4382, "step": 2299 }, { "epoch": 1.6936671575846833, "grad_norm": 0.40678101778030396, "learning_rate": 4.747306321167791e-06, "loss": 0.4198, "step": 2300 }, { "epoch": 1.6944035346097202, "grad_norm": 0.38269445300102234, "learning_rate": 4.743027120945342e-06, "loss": 0.4061, "step": 2301 }, { "epoch": 1.695139911634757, "grad_norm": 0.47511664032936096, "learning_rate": 4.73874810943601e-06, "loss": 0.4329, "step": 2302 }, { "epoch": 1.6958762886597938, "grad_norm": 0.3984917998313904, "learning_rate": 4.7344692897821714e-06, "loss": 0.4191, "step": 2303 }, { "epoch": 1.6966126656848306, "grad_norm": 0.37280645966529846, "learning_rate": 4.7301906651260634e-06, "loss": 0.4067, "step": 2304 }, { "epoch": 1.6973490427098674, "grad_norm": 0.3684954047203064, "learning_rate": 4.725912238609779e-06, "loss": 0.4016, "step": 2305 }, { "epoch": 1.6980854197349042, "grad_norm": 0.3745935559272766, "learning_rate": 4.7216340133752604e-06, "loss": 0.3998, "step": 2306 }, { "epoch": 1.698821796759941, "grad_norm": 0.360403448343277, "learning_rate": 4.717355992564311e-06, "loss": 0.3958, "step": 2307 }, { "epoch": 1.6995581737849779, "grad_norm": 0.3953647017478943, "learning_rate": 4.7130781793185805e-06, "loss": 0.404, "step": 2308 }, { "epoch": 1.7002945508100147, "grad_norm": 0.40129730105400085, "learning_rate": 4.708800576779564e-06, "loss": 0.4205, "step": 2309 }, { "epoch": 1.7010309278350515, "grad_norm": 0.3984296917915344, "learning_rate": 4.704523188088604e-06, "loss": 0.3888, "step": 2310 }, { "epoch": 1.7017673048600883, "grad_norm": 0.37107083201408386, "learning_rate": 4.700246016386887e-06, "loss": 0.3904, "step": 2311 }, { "epoch": 1.7025036818851251, "grad_norm": 0.38005733489990234, "learning_rate": 4.695969064815436e-06, "loss": 0.4087, "step": 2312 }, { "epoch": 1.7032400589101622, "grad_norm": 0.37871333956718445, "learning_rate": 4.6916923365151185e-06, "loss": 0.4028, "step": 2313 }, { "epoch": 1.703976435935199, "grad_norm": 0.3813686668872833, "learning_rate": 4.68741583462663e-06, "loss": 0.3869, "step": 2314 }, { "epoch": 1.7047128129602358, "grad_norm": 0.37767162919044495, "learning_rate": 4.683139562290506e-06, "loss": 0.4025, "step": 2315 }, { "epoch": 1.7054491899852726, "grad_norm": 0.38888660073280334, "learning_rate": 4.678863522647114e-06, "loss": 0.3863, "step": 2316 }, { "epoch": 1.7061855670103094, "grad_norm": 0.4239829182624817, "learning_rate": 4.6745877188366464e-06, "loss": 0.3987, "step": 2317 }, { "epoch": 1.7069219440353463, "grad_norm": 0.38922736048698425, "learning_rate": 4.670312153999123e-06, "loss": 0.3987, "step": 2318 }, { "epoch": 1.707658321060383, "grad_norm": 0.42677411437034607, "learning_rate": 4.666036831274392e-06, "loss": 0.3937, "step": 2319 }, { "epoch": 1.7083946980854199, "grad_norm": 0.43811002373695374, "learning_rate": 4.66176175380212e-06, "loss": 0.4189, "step": 2320 }, { "epoch": 1.7091310751104567, "grad_norm": 0.41332265734672546, "learning_rate": 4.657486924721797e-06, "loss": 0.4231, "step": 2321 }, { "epoch": 1.7098674521354935, "grad_norm": 0.41495949029922485, "learning_rate": 4.653212347172723e-06, "loss": 0.3893, "step": 2322 }, { "epoch": 1.7106038291605303, "grad_norm": 0.42686596512794495, "learning_rate": 4.648938024294023e-06, "loss": 0.4398, "step": 2323 }, { "epoch": 1.7113402061855671, "grad_norm": 0.38408711552619934, "learning_rate": 4.644663959224629e-06, "loss": 0.4055, "step": 2324 }, { "epoch": 1.712076583210604, "grad_norm": 0.4538039565086365, "learning_rate": 4.640390155103285e-06, "loss": 0.4302, "step": 2325 }, { "epoch": 1.7128129602356408, "grad_norm": 0.40232333540916443, "learning_rate": 4.636116615068545e-06, "loss": 0.4076, "step": 2326 }, { "epoch": 1.7135493372606776, "grad_norm": 0.45923569798469543, "learning_rate": 4.631843342258765e-06, "loss": 0.4284, "step": 2327 }, { "epoch": 1.7142857142857144, "grad_norm": 0.390129953622818, "learning_rate": 4.627570339812109e-06, "loss": 0.4171, "step": 2328 }, { "epoch": 1.7150220913107512, "grad_norm": 0.44369444251060486, "learning_rate": 4.623297610866544e-06, "loss": 0.4287, "step": 2329 }, { "epoch": 1.715758468335788, "grad_norm": 0.42557501792907715, "learning_rate": 4.619025158559829e-06, "loss": 0.4052, "step": 2330 }, { "epoch": 1.7164948453608249, "grad_norm": 0.3788171708583832, "learning_rate": 4.614752986029524e-06, "loss": 0.3884, "step": 2331 }, { "epoch": 1.7172312223858617, "grad_norm": 0.3878194987773895, "learning_rate": 4.610481096412985e-06, "loss": 0.4091, "step": 2332 }, { "epoch": 1.7179675994108985, "grad_norm": 0.40998604893684387, "learning_rate": 4.606209492847356e-06, "loss": 0.379, "step": 2333 }, { "epoch": 1.7187039764359353, "grad_norm": 0.3821749985218048, "learning_rate": 4.6019381784695774e-06, "loss": 0.4005, "step": 2334 }, { "epoch": 1.7194403534609721, "grad_norm": 0.35601040720939636, "learning_rate": 4.597667156416371e-06, "loss": 0.3966, "step": 2335 }, { "epoch": 1.720176730486009, "grad_norm": 0.40410375595092773, "learning_rate": 4.5933964298242465e-06, "loss": 0.3946, "step": 2336 }, { "epoch": 1.7209131075110458, "grad_norm": 0.37720221281051636, "learning_rate": 4.589126001829497e-06, "loss": 0.3987, "step": 2337 }, { "epoch": 1.7216494845360826, "grad_norm": 0.3496411144733429, "learning_rate": 4.584855875568198e-06, "loss": 0.4229, "step": 2338 }, { "epoch": 1.7223858615611194, "grad_norm": 0.3647906482219696, "learning_rate": 4.580586054176196e-06, "loss": 0.4183, "step": 2339 }, { "epoch": 1.7231222385861562, "grad_norm": 0.36781540513038635, "learning_rate": 4.576316540789122e-06, "loss": 0.3892, "step": 2340 }, { "epoch": 1.723858615611193, "grad_norm": 0.36236733198165894, "learning_rate": 4.572047338542377e-06, "loss": 0.4132, "step": 2341 }, { "epoch": 1.7245949926362298, "grad_norm": 0.3683478832244873, "learning_rate": 4.567778450571135e-06, "loss": 0.4394, "step": 2342 }, { "epoch": 1.7253313696612667, "grad_norm": 0.378169447183609, "learning_rate": 4.563509880010336e-06, "loss": 0.3853, "step": 2343 }, { "epoch": 1.7260677466863035, "grad_norm": 0.3431897759437561, "learning_rate": 4.559241629994693e-06, "loss": 0.4018, "step": 2344 }, { "epoch": 1.7268041237113403, "grad_norm": 0.3948942720890045, "learning_rate": 4.554973703658676e-06, "loss": 0.3987, "step": 2345 }, { "epoch": 1.727540500736377, "grad_norm": 0.36369240283966064, "learning_rate": 4.550706104136523e-06, "loss": 0.3903, "step": 2346 }, { "epoch": 1.728276877761414, "grad_norm": 0.40486857295036316, "learning_rate": 4.546438834562232e-06, "loss": 0.4185, "step": 2347 }, { "epoch": 1.7290132547864507, "grad_norm": 0.3831326365470886, "learning_rate": 4.542171898069553e-06, "loss": 0.3756, "step": 2348 }, { "epoch": 1.7297496318114876, "grad_norm": 0.40144047141075134, "learning_rate": 4.537905297791997e-06, "loss": 0.3718, "step": 2349 }, { "epoch": 1.7304860088365244, "grad_norm": 0.38407352566719055, "learning_rate": 4.5336390368628265e-06, "loss": 0.4093, "step": 2350 }, { "epoch": 1.7312223858615612, "grad_norm": 0.37621355056762695, "learning_rate": 4.529373118415053e-06, "loss": 0.4144, "step": 2351 }, { "epoch": 1.731958762886598, "grad_norm": 0.34284254908561707, "learning_rate": 4.525107545581442e-06, "loss": 0.4234, "step": 2352 }, { "epoch": 1.7326951399116348, "grad_norm": 0.42767757177352905, "learning_rate": 4.5208423214944975e-06, "loss": 0.4072, "step": 2353 }, { "epoch": 1.7334315169366716, "grad_norm": 0.3716091215610504, "learning_rate": 4.5165774492864715e-06, "loss": 0.3886, "step": 2354 }, { "epoch": 1.7341678939617085, "grad_norm": 0.35501131415367126, "learning_rate": 4.512312932089361e-06, "loss": 0.405, "step": 2355 }, { "epoch": 1.7349042709867453, "grad_norm": 0.4200528562068939, "learning_rate": 4.508048773034895e-06, "loss": 0.4098, "step": 2356 }, { "epoch": 1.735640648011782, "grad_norm": 0.3673626780509949, "learning_rate": 4.503784975254543e-06, "loss": 0.3913, "step": 2357 }, { "epoch": 1.736377025036819, "grad_norm": 0.39019590616226196, "learning_rate": 4.499521541879508e-06, "loss": 0.4088, "step": 2358 }, { "epoch": 1.7371134020618557, "grad_norm": 0.3743307888507843, "learning_rate": 4.495258476040732e-06, "loss": 0.4009, "step": 2359 }, { "epoch": 1.7378497790868925, "grad_norm": 0.39364027976989746, "learning_rate": 4.4909957808688765e-06, "loss": 0.4161, "step": 2360 }, { "epoch": 1.7385861561119293, "grad_norm": 0.3424508571624756, "learning_rate": 4.486733459494338e-06, "loss": 0.4315, "step": 2361 }, { "epoch": 1.7393225331369662, "grad_norm": 0.3977791368961334, "learning_rate": 4.482471515047237e-06, "loss": 0.3828, "step": 2362 }, { "epoch": 1.740058910162003, "grad_norm": 0.36530202627182007, "learning_rate": 4.478209950657418e-06, "loss": 0.399, "step": 2363 }, { "epoch": 1.7407952871870398, "grad_norm": 0.40208595991134644, "learning_rate": 4.4739487694544415e-06, "loss": 0.4193, "step": 2364 }, { "epoch": 1.7415316642120766, "grad_norm": 0.36320358514785767, "learning_rate": 4.469687974567591e-06, "loss": 0.3844, "step": 2365 }, { "epoch": 1.7422680412371134, "grad_norm": 0.3930068016052246, "learning_rate": 4.465427569125868e-06, "loss": 0.4007, "step": 2366 }, { "epoch": 1.7430044182621502, "grad_norm": 0.3515762984752655, "learning_rate": 4.461167556257984e-06, "loss": 0.422, "step": 2367 }, { "epoch": 1.743740795287187, "grad_norm": 0.39239639043807983, "learning_rate": 4.456907939092363e-06, "loss": 0.4105, "step": 2368 }, { "epoch": 1.7444771723122239, "grad_norm": 0.3900611102581024, "learning_rate": 4.45264872075714e-06, "loss": 0.3909, "step": 2369 }, { "epoch": 1.7452135493372607, "grad_norm": 0.38840755820274353, "learning_rate": 4.448389904380156e-06, "loss": 0.4077, "step": 2370 }, { "epoch": 1.7459499263622975, "grad_norm": 0.3633229732513428, "learning_rate": 4.444131493088956e-06, "loss": 0.4119, "step": 2371 }, { "epoch": 1.7466863033873343, "grad_norm": 0.3752785325050354, "learning_rate": 4.4398734900107935e-06, "loss": 0.3841, "step": 2372 }, { "epoch": 1.7474226804123711, "grad_norm": 0.42519140243530273, "learning_rate": 4.43561589827261e-06, "loss": 0.4221, "step": 2373 }, { "epoch": 1.748159057437408, "grad_norm": 0.40567177534103394, "learning_rate": 4.431358721001058e-06, "loss": 0.401, "step": 2374 }, { "epoch": 1.7488954344624448, "grad_norm": 0.335718035697937, "learning_rate": 4.427101961322475e-06, "loss": 0.3703, "step": 2375 }, { "epoch": 1.7496318114874816, "grad_norm": 0.39199185371398926, "learning_rate": 4.422845622362901e-06, "loss": 0.3967, "step": 2376 }, { "epoch": 1.7503681885125184, "grad_norm": 0.3723236918449402, "learning_rate": 4.418589707248061e-06, "loss": 0.3873, "step": 2377 }, { "epoch": 1.7511045655375552, "grad_norm": 0.41921404004096985, "learning_rate": 4.414334219103369e-06, "loss": 0.3941, "step": 2378 }, { "epoch": 1.751840942562592, "grad_norm": 0.37618371844291687, "learning_rate": 4.4100791610539285e-06, "loss": 0.3763, "step": 2379 }, { "epoch": 1.7525773195876289, "grad_norm": 0.4034104645252228, "learning_rate": 4.4058245362245276e-06, "loss": 0.3917, "step": 2380 }, { "epoch": 1.7533136966126657, "grad_norm": 0.43745455145835876, "learning_rate": 4.401570347739631e-06, "loss": 0.4249, "step": 2381 }, { "epoch": 1.7540500736377025, "grad_norm": 0.3758779466152191, "learning_rate": 4.397316598723385e-06, "loss": 0.413, "step": 2382 }, { "epoch": 1.7547864506627393, "grad_norm": 0.3742757737636566, "learning_rate": 4.393063292299618e-06, "loss": 0.4179, "step": 2383 }, { "epoch": 1.7555228276877761, "grad_norm": 0.4073251783847809, "learning_rate": 4.388810431591829e-06, "loss": 0.4235, "step": 2384 }, { "epoch": 1.756259204712813, "grad_norm": 0.4208831787109375, "learning_rate": 4.384558019723188e-06, "loss": 0.4119, "step": 2385 }, { "epoch": 1.7569955817378498, "grad_norm": 0.41460785269737244, "learning_rate": 4.380306059816539e-06, "loss": 0.3966, "step": 2386 }, { "epoch": 1.7577319587628866, "grad_norm": 0.4289226830005646, "learning_rate": 4.376054554994394e-06, "loss": 0.4164, "step": 2387 }, { "epoch": 1.7584683357879234, "grad_norm": 0.41257333755493164, "learning_rate": 4.371803508378929e-06, "loss": 0.3822, "step": 2388 }, { "epoch": 1.7592047128129602, "grad_norm": 0.44865962862968445, "learning_rate": 4.367552923091985e-06, "loss": 0.395, "step": 2389 }, { "epoch": 1.759941089837997, "grad_norm": 0.37914708256721497, "learning_rate": 4.363302802255062e-06, "loss": 0.4037, "step": 2390 }, { "epoch": 1.7606774668630338, "grad_norm": 0.3959915339946747, "learning_rate": 4.359053148989319e-06, "loss": 0.3997, "step": 2391 }, { "epoch": 1.7614138438880707, "grad_norm": 0.5142814517021179, "learning_rate": 4.354803966415576e-06, "loss": 0.4012, "step": 2392 }, { "epoch": 1.7621502209131075, "grad_norm": 0.4417358338832855, "learning_rate": 4.350555257654302e-06, "loss": 0.3962, "step": 2393 }, { "epoch": 1.7628865979381443, "grad_norm": 0.41511955857276917, "learning_rate": 4.346307025825621e-06, "loss": 0.407, "step": 2394 }, { "epoch": 1.763622974963181, "grad_norm": 0.3971770405769348, "learning_rate": 4.342059274049308e-06, "loss": 0.3975, "step": 2395 }, { "epoch": 1.764359351988218, "grad_norm": 0.43736016750335693, "learning_rate": 4.33781200544478e-06, "loss": 0.3925, "step": 2396 }, { "epoch": 1.7650957290132547, "grad_norm": 0.3964605927467346, "learning_rate": 4.333565223131107e-06, "loss": 0.3938, "step": 2397 }, { "epoch": 1.7658321060382915, "grad_norm": 0.4375699758529663, "learning_rate": 4.329318930226993e-06, "loss": 0.4229, "step": 2398 }, { "epoch": 1.7665684830633284, "grad_norm": 0.34781819581985474, "learning_rate": 4.325073129850791e-06, "loss": 0.3878, "step": 2399 }, { "epoch": 1.7673048600883652, "grad_norm": 0.39587196707725525, "learning_rate": 4.320827825120485e-06, "loss": 0.4248, "step": 2400 }, { "epoch": 1.768041237113402, "grad_norm": 0.47028040885925293, "learning_rate": 4.3165830191537016e-06, "loss": 0.3747, "step": 2401 }, { "epoch": 1.7687776141384388, "grad_norm": 0.3784812390804291, "learning_rate": 4.312338715067697e-06, "loss": 0.3972, "step": 2402 }, { "epoch": 1.7695139911634756, "grad_norm": 0.45428574085235596, "learning_rate": 4.308094915979359e-06, "loss": 0.4051, "step": 2403 }, { "epoch": 1.7702503681885124, "grad_norm": 0.4495220184326172, "learning_rate": 4.303851625005205e-06, "loss": 0.4132, "step": 2404 }, { "epoch": 1.7709867452135493, "grad_norm": 0.39823389053344727, "learning_rate": 4.2996088452613835e-06, "loss": 0.4036, "step": 2405 }, { "epoch": 1.771723122238586, "grad_norm": 0.4087408483028412, "learning_rate": 4.295366579863658e-06, "loss": 0.3894, "step": 2406 }, { "epoch": 1.772459499263623, "grad_norm": 0.5459381937980652, "learning_rate": 4.291124831927425e-06, "loss": 0.4391, "step": 2407 }, { "epoch": 1.7731958762886597, "grad_norm": 0.4444357752799988, "learning_rate": 4.286883604567693e-06, "loss": 0.3851, "step": 2408 }, { "epoch": 1.7739322533136965, "grad_norm": 0.4461228549480438, "learning_rate": 4.282642900899092e-06, "loss": 0.4251, "step": 2409 }, { "epoch": 1.7746686303387333, "grad_norm": 0.41404518485069275, "learning_rate": 4.278402724035868e-06, "loss": 0.4339, "step": 2410 }, { "epoch": 1.7754050073637702, "grad_norm": 0.4226773977279663, "learning_rate": 4.274163077091876e-06, "loss": 0.42, "step": 2411 }, { "epoch": 1.776141384388807, "grad_norm": 0.44457805156707764, "learning_rate": 4.269923963180587e-06, "loss": 0.4236, "step": 2412 }, { "epoch": 1.7768777614138438, "grad_norm": 0.4070783853530884, "learning_rate": 4.265685385415077e-06, "loss": 0.3761, "step": 2413 }, { "epoch": 1.7776141384388806, "grad_norm": 0.3834463953971863, "learning_rate": 4.261447346908032e-06, "loss": 0.4151, "step": 2414 }, { "epoch": 1.7783505154639174, "grad_norm": 0.41372165083885193, "learning_rate": 4.257209850771734e-06, "loss": 0.3919, "step": 2415 }, { "epoch": 1.7790868924889542, "grad_norm": 0.4155184328556061, "learning_rate": 4.2529729001180765e-06, "loss": 0.4034, "step": 2416 }, { "epoch": 1.779823269513991, "grad_norm": 0.3758399784564972, "learning_rate": 4.248736498058547e-06, "loss": 0.4108, "step": 2417 }, { "epoch": 1.7805596465390279, "grad_norm": 0.38244372606277466, "learning_rate": 4.24450064770423e-06, "loss": 0.3716, "step": 2418 }, { "epoch": 1.7812960235640647, "grad_norm": 0.4086960256099701, "learning_rate": 4.240265352165806e-06, "loss": 0.3858, "step": 2419 }, { "epoch": 1.7820324005891015, "grad_norm": 0.37736374139785767, "learning_rate": 4.236030614553552e-06, "loss": 0.4, "step": 2420 }, { "epoch": 1.7827687776141383, "grad_norm": 0.4141034185886383, "learning_rate": 4.2317964379773265e-06, "loss": 0.4311, "step": 2421 }, { "epoch": 1.7835051546391751, "grad_norm": 0.37181559205055237, "learning_rate": 4.2275628255465846e-06, "loss": 0.3799, "step": 2422 }, { "epoch": 1.784241531664212, "grad_norm": 0.3771205246448517, "learning_rate": 4.223329780370359e-06, "loss": 0.4282, "step": 2423 }, { "epoch": 1.7849779086892488, "grad_norm": 0.4032301604747772, "learning_rate": 4.219097305557274e-06, "loss": 0.4305, "step": 2424 }, { "epoch": 1.7857142857142856, "grad_norm": 0.39853084087371826, "learning_rate": 4.214865404215528e-06, "loss": 0.3973, "step": 2425 }, { "epoch": 1.7864506627393224, "grad_norm": 0.35678479075431824, "learning_rate": 4.210634079452904e-06, "loss": 0.368, "step": 2426 }, { "epoch": 1.7871870397643592, "grad_norm": 0.3927466869354248, "learning_rate": 4.206403334376757e-06, "loss": 0.4107, "step": 2427 }, { "epoch": 1.787923416789396, "grad_norm": 0.3804020285606384, "learning_rate": 4.202173172094022e-06, "loss": 0.3749, "step": 2428 }, { "epoch": 1.7886597938144329, "grad_norm": 0.3756907284259796, "learning_rate": 4.1979435957111984e-06, "loss": 0.4019, "step": 2429 }, { "epoch": 1.7893961708394697, "grad_norm": 0.3835069239139557, "learning_rate": 4.193714608334361e-06, "loss": 0.383, "step": 2430 }, { "epoch": 1.7901325478645065, "grad_norm": 0.41579073667526245, "learning_rate": 4.189486213069152e-06, "loss": 0.4195, "step": 2431 }, { "epoch": 1.7908689248895433, "grad_norm": 0.39560478925704956, "learning_rate": 4.1852584130207745e-06, "loss": 0.4095, "step": 2432 }, { "epoch": 1.7916053019145801, "grad_norm": 0.4340824782848358, "learning_rate": 4.181031211293997e-06, "loss": 0.4334, "step": 2433 }, { "epoch": 1.792341678939617, "grad_norm": 0.41419485211372375, "learning_rate": 4.176804610993149e-06, "loss": 0.4022, "step": 2434 }, { "epoch": 1.7930780559646537, "grad_norm": 0.40266767144203186, "learning_rate": 4.17257861522212e-06, "loss": 0.4214, "step": 2435 }, { "epoch": 1.7938144329896906, "grad_norm": 0.35882508754730225, "learning_rate": 4.1683532270843505e-06, "loss": 0.3855, "step": 2436 }, { "epoch": 1.7945508100147274, "grad_norm": 0.37283796072006226, "learning_rate": 4.16412844968284e-06, "loss": 0.3659, "step": 2437 }, { "epoch": 1.7952871870397642, "grad_norm": 0.3643818199634552, "learning_rate": 4.1599042861201375e-06, "loss": 0.4015, "step": 2438 }, { "epoch": 1.796023564064801, "grad_norm": 0.38770896196365356, "learning_rate": 4.155680739498342e-06, "loss": 0.379, "step": 2439 }, { "epoch": 1.7967599410898378, "grad_norm": 0.3598364293575287, "learning_rate": 4.151457812919094e-06, "loss": 0.4056, "step": 2440 }, { "epoch": 1.7974963181148749, "grad_norm": 0.39613017439842224, "learning_rate": 4.147235509483587e-06, "loss": 0.3952, "step": 2441 }, { "epoch": 1.7982326951399117, "grad_norm": 0.4370924234390259, "learning_rate": 4.1430138322925535e-06, "loss": 0.4109, "step": 2442 }, { "epoch": 1.7989690721649485, "grad_norm": 0.3646906912326813, "learning_rate": 4.138792784446263e-06, "loss": 0.3945, "step": 2443 }, { "epoch": 1.7997054491899853, "grad_norm": 0.34723278880119324, "learning_rate": 4.134572369044526e-06, "loss": 0.3952, "step": 2444 }, { "epoch": 1.8004418262150221, "grad_norm": 0.40729156136512756, "learning_rate": 4.1303525891866905e-06, "loss": 0.4143, "step": 2445 }, { "epoch": 1.801178203240059, "grad_norm": 0.4514944851398468, "learning_rate": 4.126133447971633e-06, "loss": 0.392, "step": 2446 }, { "epoch": 1.8019145802650958, "grad_norm": 0.3769078254699707, "learning_rate": 4.121914948497764e-06, "loss": 0.3999, "step": 2447 }, { "epoch": 1.8026509572901326, "grad_norm": 0.39932695031166077, "learning_rate": 4.117697093863023e-06, "loss": 0.4274, "step": 2448 }, { "epoch": 1.8033873343151694, "grad_norm": 0.3611906170845032, "learning_rate": 4.113479887164873e-06, "loss": 0.3788, "step": 2449 }, { "epoch": 1.8041237113402062, "grad_norm": 0.38393762707710266, "learning_rate": 4.109263331500305e-06, "loss": 0.4218, "step": 2450 }, { "epoch": 1.804860088365243, "grad_norm": 0.42800167202949524, "learning_rate": 4.105047429965828e-06, "loss": 0.3984, "step": 2451 }, { "epoch": 1.8055964653902798, "grad_norm": 0.38685426115989685, "learning_rate": 4.1008321856574745e-06, "loss": 0.4097, "step": 2452 }, { "epoch": 1.8063328424153167, "grad_norm": 0.3722650408744812, "learning_rate": 4.096617601670793e-06, "loss": 0.4073, "step": 2453 }, { "epoch": 1.8070692194403535, "grad_norm": 0.3909144997596741, "learning_rate": 4.092403681100844e-06, "loss": 0.403, "step": 2454 }, { "epoch": 1.8078055964653903, "grad_norm": 0.42803841829299927, "learning_rate": 4.0881904270422045e-06, "loss": 0.3873, "step": 2455 }, { "epoch": 1.8085419734904271, "grad_norm": 0.42633092403411865, "learning_rate": 4.083977842588963e-06, "loss": 0.3925, "step": 2456 }, { "epoch": 1.809278350515464, "grad_norm": 0.3902944028377533, "learning_rate": 4.079765930834714e-06, "loss": 0.4063, "step": 2457 }, { "epoch": 1.8100147275405007, "grad_norm": 0.3900681436061859, "learning_rate": 4.075554694872554e-06, "loss": 0.406, "step": 2458 }, { "epoch": 1.8107511045655376, "grad_norm": 0.412332147359848, "learning_rate": 4.071344137795091e-06, "loss": 0.4169, "step": 2459 }, { "epoch": 1.8114874815905744, "grad_norm": 0.39725714921951294, "learning_rate": 4.067134262694431e-06, "loss": 0.4163, "step": 2460 }, { "epoch": 1.8122238586156112, "grad_norm": 0.36094245314598083, "learning_rate": 4.062925072662177e-06, "loss": 0.3998, "step": 2461 }, { "epoch": 1.812960235640648, "grad_norm": 0.35459452867507935, "learning_rate": 4.0587165707894326e-06, "loss": 0.4008, "step": 2462 }, { "epoch": 1.8136966126656848, "grad_norm": 0.3944514989852905, "learning_rate": 4.054508760166795e-06, "loss": 0.4151, "step": 2463 }, { "epoch": 1.8144329896907216, "grad_norm": 0.41395866870880127, "learning_rate": 4.050301643884352e-06, "loss": 0.4049, "step": 2464 }, { "epoch": 1.8151693667157585, "grad_norm": 0.3743531405925751, "learning_rate": 4.046095225031683e-06, "loss": 0.4226, "step": 2465 }, { "epoch": 1.8159057437407953, "grad_norm": 0.4096841812133789, "learning_rate": 4.0418895066978536e-06, "loss": 0.3879, "step": 2466 }, { "epoch": 1.816642120765832, "grad_norm": 0.41024044156074524, "learning_rate": 4.037684491971417e-06, "loss": 0.4129, "step": 2467 }, { "epoch": 1.817378497790869, "grad_norm": 0.36897313594818115, "learning_rate": 4.033480183940412e-06, "loss": 0.391, "step": 2468 }, { "epoch": 1.8181148748159057, "grad_norm": 0.3991081416606903, "learning_rate": 4.029276585692349e-06, "loss": 0.4263, "step": 2469 }, { "epoch": 1.8188512518409425, "grad_norm": 0.43920043110847473, "learning_rate": 4.0250737003142294e-06, "loss": 0.433, "step": 2470 }, { "epoch": 1.8195876288659794, "grad_norm": 0.47898420691490173, "learning_rate": 4.0208715308925235e-06, "loss": 0.4085, "step": 2471 }, { "epoch": 1.8203240058910162, "grad_norm": 0.37598884105682373, "learning_rate": 4.016670080513176e-06, "loss": 0.4126, "step": 2472 }, { "epoch": 1.821060382916053, "grad_norm": 0.36710143089294434, "learning_rate": 4.012469352261608e-06, "loss": 0.3943, "step": 2473 }, { "epoch": 1.8217967599410898, "grad_norm": 0.42525747418403625, "learning_rate": 4.0082693492227035e-06, "loss": 0.3982, "step": 2474 }, { "epoch": 1.8225331369661266, "grad_norm": 0.39572814106941223, "learning_rate": 4.004070074480821e-06, "loss": 0.3859, "step": 2475 }, { "epoch": 1.8232695139911634, "grad_norm": 0.38209137320518494, "learning_rate": 3.999871531119779e-06, "loss": 0.3998, "step": 2476 }, { "epoch": 1.8240058910162003, "grad_norm": 0.36567169427871704, "learning_rate": 3.995673722222861e-06, "loss": 0.397, "step": 2477 }, { "epoch": 1.824742268041237, "grad_norm": 0.4170894920825958, "learning_rate": 3.991476650872813e-06, "loss": 0.4181, "step": 2478 }, { "epoch": 1.8254786450662739, "grad_norm": 0.43204647302627563, "learning_rate": 3.987280320151835e-06, "loss": 0.4415, "step": 2479 }, { "epoch": 1.8262150220913107, "grad_norm": 0.39173027873039246, "learning_rate": 3.983084733141588e-06, "loss": 0.3853, "step": 2480 }, { "epoch": 1.8269513991163475, "grad_norm": 0.4281696081161499, "learning_rate": 3.978889892923183e-06, "loss": 0.4305, "step": 2481 }, { "epoch": 1.8276877761413843, "grad_norm": 0.44608938694000244, "learning_rate": 3.974695802577184e-06, "loss": 0.4322, "step": 2482 }, { "epoch": 1.8284241531664214, "grad_norm": 0.4017941951751709, "learning_rate": 3.970502465183602e-06, "loss": 0.4158, "step": 2483 }, { "epoch": 1.8291605301914582, "grad_norm": 0.4041925072669983, "learning_rate": 3.966309883821901e-06, "loss": 0.4031, "step": 2484 }, { "epoch": 1.829896907216495, "grad_norm": 0.4443046748638153, "learning_rate": 3.962118061570982e-06, "loss": 0.3905, "step": 2485 }, { "epoch": 1.8306332842415318, "grad_norm": 0.45764559507369995, "learning_rate": 3.957927001509197e-06, "loss": 0.399, "step": 2486 }, { "epoch": 1.8313696612665686, "grad_norm": 0.4235036373138428, "learning_rate": 3.953736706714331e-06, "loss": 0.396, "step": 2487 }, { "epoch": 1.8321060382916055, "grad_norm": 0.46383094787597656, "learning_rate": 3.94954718026361e-06, "loss": 0.4079, "step": 2488 }, { "epoch": 1.8328424153166423, "grad_norm": 0.38650867342948914, "learning_rate": 3.945358425233697e-06, "loss": 0.4062, "step": 2489 }, { "epoch": 1.833578792341679, "grad_norm": 0.40243032574653625, "learning_rate": 3.941170444700688e-06, "loss": 0.3963, "step": 2490 }, { "epoch": 1.834315169366716, "grad_norm": 0.4477875530719757, "learning_rate": 3.9369832417401055e-06, "loss": 0.3711, "step": 2491 }, { "epoch": 1.8350515463917527, "grad_norm": 0.4357861578464508, "learning_rate": 3.9327968194269074e-06, "loss": 0.4004, "step": 2492 }, { "epoch": 1.8357879234167895, "grad_norm": 0.34190165996551514, "learning_rate": 3.928611180835476e-06, "loss": 0.4045, "step": 2493 }, { "epoch": 1.8365243004418264, "grad_norm": 0.40084031224250793, "learning_rate": 3.924426329039616e-06, "loss": 0.3943, "step": 2494 }, { "epoch": 1.8372606774668632, "grad_norm": 0.43317151069641113, "learning_rate": 3.920242267112557e-06, "loss": 0.4437, "step": 2495 }, { "epoch": 1.8379970544919, "grad_norm": 0.42097267508506775, "learning_rate": 3.916058998126949e-06, "loss": 0.4135, "step": 2496 }, { "epoch": 1.8387334315169368, "grad_norm": 0.43369096517562866, "learning_rate": 3.911876525154857e-06, "loss": 0.4216, "step": 2497 }, { "epoch": 1.8394698085419736, "grad_norm": 0.4318563640117645, "learning_rate": 3.907694851267764e-06, "loss": 0.396, "step": 2498 }, { "epoch": 1.8402061855670104, "grad_norm": 0.39135801792144775, "learning_rate": 3.903513979536563e-06, "loss": 0.4167, "step": 2499 }, { "epoch": 1.8409425625920472, "grad_norm": 0.4389367997646332, "learning_rate": 3.899333913031561e-06, "loss": 0.412, "step": 2500 }, { "epoch": 1.841678939617084, "grad_norm": 0.4623863995075226, "learning_rate": 3.895154654822471e-06, "loss": 0.3903, "step": 2501 }, { "epoch": 1.8424153166421209, "grad_norm": 0.40219375491142273, "learning_rate": 3.890976207978416e-06, "loss": 0.4075, "step": 2502 }, { "epoch": 1.8431516936671577, "grad_norm": 0.36119547486305237, "learning_rate": 3.8867985755679206e-06, "loss": 0.4064, "step": 2503 }, { "epoch": 1.8438880706921945, "grad_norm": 0.4446427524089813, "learning_rate": 3.882621760658911e-06, "loss": 0.4002, "step": 2504 }, { "epoch": 1.8446244477172313, "grad_norm": 0.38301101326942444, "learning_rate": 3.878445766318714e-06, "loss": 0.3946, "step": 2505 }, { "epoch": 1.8453608247422681, "grad_norm": 0.39597806334495544, "learning_rate": 3.874270595614057e-06, "loss": 0.3955, "step": 2506 }, { "epoch": 1.846097201767305, "grad_norm": 0.36354929208755493, "learning_rate": 3.870096251611053e-06, "loss": 0.415, "step": 2507 }, { "epoch": 1.8468335787923418, "grad_norm": 0.4049472510814667, "learning_rate": 3.865922737375219e-06, "loss": 0.4134, "step": 2508 }, { "epoch": 1.8475699558173786, "grad_norm": 0.42379382252693176, "learning_rate": 3.861750055971455e-06, "loss": 0.436, "step": 2509 }, { "epoch": 1.8483063328424154, "grad_norm": 0.3723948001861572, "learning_rate": 3.857578210464053e-06, "loss": 0.3914, "step": 2510 }, { "epoch": 1.8490427098674522, "grad_norm": 0.4040578007698059, "learning_rate": 3.8534072039166915e-06, "loss": 0.3967, "step": 2511 }, { "epoch": 1.849779086892489, "grad_norm": 0.3998672664165497, "learning_rate": 3.849237039392429e-06, "loss": 0.3973, "step": 2512 }, { "epoch": 1.8505154639175259, "grad_norm": 0.3948405086994171, "learning_rate": 3.845067719953711e-06, "loss": 0.4459, "step": 2513 }, { "epoch": 1.8512518409425627, "grad_norm": 0.38502955436706543, "learning_rate": 3.840899248662358e-06, "loss": 0.4169, "step": 2514 }, { "epoch": 1.8519882179675995, "grad_norm": 0.34823498129844666, "learning_rate": 3.836731628579573e-06, "loss": 0.4141, "step": 2515 }, { "epoch": 1.8527245949926363, "grad_norm": 0.4114006459712982, "learning_rate": 3.832564862765924e-06, "loss": 0.4317, "step": 2516 }, { "epoch": 1.8534609720176731, "grad_norm": 0.34312447905540466, "learning_rate": 3.828398954281361e-06, "loss": 0.3961, "step": 2517 }, { "epoch": 1.85419734904271, "grad_norm": 0.3569032549858093, "learning_rate": 3.8242339061852035e-06, "loss": 0.4025, "step": 2518 }, { "epoch": 1.8549337260677468, "grad_norm": 0.37142735719680786, "learning_rate": 3.8200697215361336e-06, "loss": 0.3778, "step": 2519 }, { "epoch": 1.8556701030927836, "grad_norm": 0.39146125316619873, "learning_rate": 3.815906403392203e-06, "loss": 0.4004, "step": 2520 }, { "epoch": 1.8564064801178204, "grad_norm": 0.3406389057636261, "learning_rate": 3.8117439548108293e-06, "loss": 0.4247, "step": 2521 }, { "epoch": 1.8571428571428572, "grad_norm": 0.3523464500904083, "learning_rate": 3.8075823788487863e-06, "loss": 0.4202, "step": 2522 }, { "epoch": 1.857879234167894, "grad_norm": 0.38543546199798584, "learning_rate": 3.803421678562213e-06, "loss": 0.4254, "step": 2523 }, { "epoch": 1.8586156111929308, "grad_norm": 0.3914850056171417, "learning_rate": 3.799261857006597e-06, "loss": 0.402, "step": 2524 }, { "epoch": 1.8593519882179677, "grad_norm": 0.3607421815395355, "learning_rate": 3.7951029172367883e-06, "loss": 0.396, "step": 2525 }, { "epoch": 1.8600883652430045, "grad_norm": 0.3871685862541199, "learning_rate": 3.790944862306988e-06, "loss": 0.4304, "step": 2526 }, { "epoch": 1.8608247422680413, "grad_norm": 0.4246087074279785, "learning_rate": 3.786787695270743e-06, "loss": 0.4147, "step": 2527 }, { "epoch": 1.861561119293078, "grad_norm": 0.35566386580467224, "learning_rate": 3.7826314191809522e-06, "loss": 0.4074, "step": 2528 }, { "epoch": 1.862297496318115, "grad_norm": 0.3580811321735382, "learning_rate": 3.778476037089861e-06, "loss": 0.4046, "step": 2529 }, { "epoch": 1.8630338733431517, "grad_norm": 0.35247814655303955, "learning_rate": 3.774321552049054e-06, "loss": 0.3959, "step": 2530 }, { "epoch": 1.8637702503681886, "grad_norm": 0.37611591815948486, "learning_rate": 3.7701679671094602e-06, "loss": 0.4283, "step": 2531 }, { "epoch": 1.8645066273932254, "grad_norm": 0.3860566020011902, "learning_rate": 3.7660152853213494e-06, "loss": 0.4123, "step": 2532 }, { "epoch": 1.8652430044182622, "grad_norm": 0.37459200620651245, "learning_rate": 3.7618635097343225e-06, "loss": 0.4377, "step": 2533 }, { "epoch": 1.865979381443299, "grad_norm": 0.37669306993484497, "learning_rate": 3.7577126433973176e-06, "loss": 0.4092, "step": 2534 }, { "epoch": 1.8667157584683358, "grad_norm": 0.3490535020828247, "learning_rate": 3.7535626893586062e-06, "loss": 0.4069, "step": 2535 }, { "epoch": 1.8674521354933726, "grad_norm": 0.3674035370349884, "learning_rate": 3.749413650665792e-06, "loss": 0.4336, "step": 2536 }, { "epoch": 1.8681885125184094, "grad_norm": 0.3696489930152893, "learning_rate": 3.7452655303657993e-06, "loss": 0.4025, "step": 2537 }, { "epoch": 1.8689248895434463, "grad_norm": 0.3966639041900635, "learning_rate": 3.7411183315048847e-06, "loss": 0.4173, "step": 2538 }, { "epoch": 1.869661266568483, "grad_norm": 0.3363037705421448, "learning_rate": 3.736972057128626e-06, "loss": 0.4027, "step": 2539 }, { "epoch": 1.87039764359352, "grad_norm": 0.35173100233078003, "learning_rate": 3.732826710281923e-06, "loss": 0.3908, "step": 2540 }, { "epoch": 1.8711340206185567, "grad_norm": 0.3510563373565674, "learning_rate": 3.728682294008988e-06, "loss": 0.3961, "step": 2541 }, { "epoch": 1.8718703976435935, "grad_norm": 0.3618358373641968, "learning_rate": 3.7245388113533596e-06, "loss": 0.3936, "step": 2542 }, { "epoch": 1.8726067746686303, "grad_norm": 0.33672353625297546, "learning_rate": 3.7203962653578853e-06, "loss": 0.4081, "step": 2543 }, { "epoch": 1.8733431516936672, "grad_norm": 0.36822593212127686, "learning_rate": 3.7162546590647254e-06, "loss": 0.4108, "step": 2544 }, { "epoch": 1.874079528718704, "grad_norm": 0.3691384196281433, "learning_rate": 3.7121139955153497e-06, "loss": 0.379, "step": 2545 }, { "epoch": 1.8748159057437408, "grad_norm": 0.4163464307785034, "learning_rate": 3.7079742777505373e-06, "loss": 0.4063, "step": 2546 }, { "epoch": 1.8755522827687776, "grad_norm": 0.3840535879135132, "learning_rate": 3.7038355088103726e-06, "loss": 0.403, "step": 2547 }, { "epoch": 1.8762886597938144, "grad_norm": 0.3876000642776489, "learning_rate": 3.699697691734243e-06, "loss": 0.4056, "step": 2548 }, { "epoch": 1.8770250368188512, "grad_norm": 0.3980104625225067, "learning_rate": 3.695560829560832e-06, "loss": 0.3986, "step": 2549 }, { "epoch": 1.877761413843888, "grad_norm": 0.3940967917442322, "learning_rate": 3.691424925328129e-06, "loss": 0.3895, "step": 2550 }, { "epoch": 1.8784977908689249, "grad_norm": 0.37271252274513245, "learning_rate": 3.687289982073419e-06, "loss": 0.4393, "step": 2551 }, { "epoch": 1.8792341678939617, "grad_norm": 0.37549808621406555, "learning_rate": 3.683156002833276e-06, "loss": 0.4248, "step": 2552 }, { "epoch": 1.8799705449189985, "grad_norm": 0.3900524079799652, "learning_rate": 3.6790229906435706e-06, "loss": 0.4255, "step": 2553 }, { "epoch": 1.8807069219440353, "grad_norm": 0.36957311630249023, "learning_rate": 3.674890948539463e-06, "loss": 0.404, "step": 2554 }, { "epoch": 1.8814432989690721, "grad_norm": 0.37756621837615967, "learning_rate": 3.670759879555399e-06, "loss": 0.4144, "step": 2555 }, { "epoch": 1.882179675994109, "grad_norm": 0.4113319516181946, "learning_rate": 3.666629786725111e-06, "loss": 0.3993, "step": 2556 }, { "epoch": 1.8829160530191458, "grad_norm": 0.4135149419307709, "learning_rate": 3.6625006730816157e-06, "loss": 0.4202, "step": 2557 }, { "epoch": 1.8836524300441826, "grad_norm": 0.3912547826766968, "learning_rate": 3.6583725416572093e-06, "loss": 0.4029, "step": 2558 }, { "epoch": 1.8843888070692194, "grad_norm": 0.36167773604393005, "learning_rate": 3.6542453954834632e-06, "loss": 0.4039, "step": 2559 }, { "epoch": 1.8851251840942562, "grad_norm": 0.3650279641151428, "learning_rate": 3.650119237591232e-06, "loss": 0.4051, "step": 2560 }, { "epoch": 1.885861561119293, "grad_norm": 0.358697772026062, "learning_rate": 3.6459940710106414e-06, "loss": 0.3967, "step": 2561 }, { "epoch": 1.8865979381443299, "grad_norm": 0.41603848338127136, "learning_rate": 3.6418698987710872e-06, "loss": 0.3998, "step": 2562 }, { "epoch": 1.8873343151693667, "grad_norm": 0.3741896450519562, "learning_rate": 3.637746723901238e-06, "loss": 0.4212, "step": 2563 }, { "epoch": 1.8880706921944035, "grad_norm": 0.39252546429634094, "learning_rate": 3.6336245494290305e-06, "loss": 0.387, "step": 2564 }, { "epoch": 1.8888070692194403, "grad_norm": 0.35629981756210327, "learning_rate": 3.6295033783816636e-06, "loss": 0.3794, "step": 2565 }, { "epoch": 1.8895434462444771, "grad_norm": 0.3638868033885956, "learning_rate": 3.6253832137856e-06, "loss": 0.4164, "step": 2566 }, { "epoch": 1.890279823269514, "grad_norm": 0.380978524684906, "learning_rate": 3.621264058666564e-06, "loss": 0.3887, "step": 2567 }, { "epoch": 1.8910162002945508, "grad_norm": 0.3995063304901123, "learning_rate": 3.6171459160495393e-06, "loss": 0.408, "step": 2568 }, { "epoch": 1.8917525773195876, "grad_norm": 0.4066588878631592, "learning_rate": 3.6130287889587665e-06, "loss": 0.4023, "step": 2569 }, { "epoch": 1.8924889543446244, "grad_norm": 0.4209347069263458, "learning_rate": 3.6089126804177373e-06, "loss": 0.3898, "step": 2570 }, { "epoch": 1.8932253313696612, "grad_norm": 0.41782402992248535, "learning_rate": 3.6047975934491983e-06, "loss": 0.4186, "step": 2571 }, { "epoch": 1.893961708394698, "grad_norm": 0.41970953345298767, "learning_rate": 3.6006835310751464e-06, "loss": 0.4078, "step": 2572 }, { "epoch": 1.8946980854197348, "grad_norm": 0.3769712746143341, "learning_rate": 3.596570496316822e-06, "loss": 0.4206, "step": 2573 }, { "epoch": 1.8954344624447717, "grad_norm": 0.38179120421409607, "learning_rate": 3.592458492194717e-06, "loss": 0.3897, "step": 2574 }, { "epoch": 1.8961708394698085, "grad_norm": 0.42509886622428894, "learning_rate": 3.5883475217285592e-06, "loss": 0.4142, "step": 2575 }, { "epoch": 1.8969072164948453, "grad_norm": 0.38956883549690247, "learning_rate": 3.5842375879373237e-06, "loss": 0.3984, "step": 2576 }, { "epoch": 1.897643593519882, "grad_norm": 0.34472212195396423, "learning_rate": 3.5801286938392195e-06, "loss": 0.4137, "step": 2577 }, { "epoch": 1.898379970544919, "grad_norm": 0.39913445711135864, "learning_rate": 3.5760208424516957e-06, "loss": 0.3989, "step": 2578 }, { "epoch": 1.8991163475699557, "grad_norm": 0.38616591691970825, "learning_rate": 3.571914036791435e-06, "loss": 0.384, "step": 2579 }, { "epoch": 1.8998527245949925, "grad_norm": 0.35932883620262146, "learning_rate": 3.5678082798743498e-06, "loss": 0.3966, "step": 2580 }, { "epoch": 1.9005891016200294, "grad_norm": 0.40238866209983826, "learning_rate": 3.5637035747155835e-06, "loss": 0.4105, "step": 2581 }, { "epoch": 1.9013254786450662, "grad_norm": 0.39923685789108276, "learning_rate": 3.5595999243295114e-06, "loss": 0.4016, "step": 2582 }, { "epoch": 1.902061855670103, "grad_norm": 0.428325891494751, "learning_rate": 3.5554973317297255e-06, "loss": 0.3744, "step": 2583 }, { "epoch": 1.9027982326951398, "grad_norm": 0.3603360652923584, "learning_rate": 3.5513957999290483e-06, "loss": 0.3955, "step": 2584 }, { "epoch": 1.9035346097201766, "grad_norm": 0.42727532982826233, "learning_rate": 3.5472953319395196e-06, "loss": 0.4234, "step": 2585 }, { "epoch": 1.9042709867452134, "grad_norm": 0.39020195603370667, "learning_rate": 3.5431959307724e-06, "loss": 0.4095, "step": 2586 }, { "epoch": 1.9050073637702503, "grad_norm": 0.3618599474430084, "learning_rate": 3.539097599438167e-06, "loss": 0.3776, "step": 2587 }, { "epoch": 1.905743740795287, "grad_norm": 0.37120410799980164, "learning_rate": 3.5350003409465085e-06, "loss": 0.4031, "step": 2588 }, { "epoch": 1.906480117820324, "grad_norm": 0.4274289309978485, "learning_rate": 3.530904158306329e-06, "loss": 0.3971, "step": 2589 }, { "epoch": 1.9072164948453607, "grad_norm": 0.36874106526374817, "learning_rate": 3.526809054525744e-06, "loss": 0.3911, "step": 2590 }, { "epoch": 1.9079528718703975, "grad_norm": 0.3638487756252289, "learning_rate": 3.522715032612069e-06, "loss": 0.405, "step": 2591 }, { "epoch": 1.9086892488954343, "grad_norm": 0.3634064495563507, "learning_rate": 3.518622095571831e-06, "loss": 0.4145, "step": 2592 }, { "epoch": 1.9094256259204712, "grad_norm": 0.3739914894104004, "learning_rate": 3.5145302464107612e-06, "loss": 0.3862, "step": 2593 }, { "epoch": 1.910162002945508, "grad_norm": 0.39826780557632446, "learning_rate": 3.510439488133789e-06, "loss": 0.4123, "step": 2594 }, { "epoch": 1.9108983799705448, "grad_norm": 0.37097451090812683, "learning_rate": 3.506349823745043e-06, "loss": 0.3913, "step": 2595 }, { "epoch": 1.9116347569955816, "grad_norm": 0.3629193603992462, "learning_rate": 3.5022612562478507e-06, "loss": 0.4055, "step": 2596 }, { "epoch": 1.9123711340206184, "grad_norm": 0.368109792470932, "learning_rate": 3.498173788644732e-06, "loss": 0.3793, "step": 2597 }, { "epoch": 1.9131075110456552, "grad_norm": 0.418338418006897, "learning_rate": 3.494087423937399e-06, "loss": 0.3781, "step": 2598 }, { "epoch": 1.913843888070692, "grad_norm": 0.35174235701560974, "learning_rate": 3.4900021651267557e-06, "loss": 0.405, "step": 2599 }, { "epoch": 1.9145802650957289, "grad_norm": 0.3879396915435791, "learning_rate": 3.485918015212891e-06, "loss": 0.383, "step": 2600 }, { "epoch": 1.9153166421207657, "grad_norm": 0.34615907073020935, "learning_rate": 3.481834977195081e-06, "loss": 0.3978, "step": 2601 }, { "epoch": 1.9160530191458025, "grad_norm": 0.36179783940315247, "learning_rate": 3.4777530540717875e-06, "loss": 0.4361, "step": 2602 }, { "epoch": 1.9167893961708393, "grad_norm": 0.37738144397735596, "learning_rate": 3.4736722488406493e-06, "loss": 0.409, "step": 2603 }, { "epoch": 1.9175257731958761, "grad_norm": 0.37794360518455505, "learning_rate": 3.4695925644984885e-06, "loss": 0.4211, "step": 2604 }, { "epoch": 1.918262150220913, "grad_norm": 0.40854412317276, "learning_rate": 3.465514004041301e-06, "loss": 0.4055, "step": 2605 }, { "epoch": 1.9189985272459498, "grad_norm": 0.3971635103225708, "learning_rate": 3.461436570464258e-06, "loss": 0.3971, "step": 2606 }, { "epoch": 1.9197349042709866, "grad_norm": 0.3623662292957306, "learning_rate": 3.4573602667617056e-06, "loss": 0.4185, "step": 2607 }, { "epoch": 1.9204712812960234, "grad_norm": 0.4009900391101837, "learning_rate": 3.453285095927154e-06, "loss": 0.4067, "step": 2608 }, { "epoch": 1.9212076583210602, "grad_norm": 0.34463950991630554, "learning_rate": 3.4492110609532892e-06, "loss": 0.4056, "step": 2609 }, { "epoch": 1.9219440353460973, "grad_norm": 0.3612362742424011, "learning_rate": 3.4451381648319573e-06, "loss": 0.3852, "step": 2610 }, { "epoch": 1.922680412371134, "grad_norm": 0.3724992275238037, "learning_rate": 3.4410664105541703e-06, "loss": 0.4193, "step": 2611 }, { "epoch": 1.923416789396171, "grad_norm": 0.3696814775466919, "learning_rate": 3.4369958011101035e-06, "loss": 0.4328, "step": 2612 }, { "epoch": 1.9241531664212077, "grad_norm": 0.3654807507991791, "learning_rate": 3.4329263394890867e-06, "loss": 0.3699, "step": 2613 }, { "epoch": 1.9248895434462445, "grad_norm": 0.3380682170391083, "learning_rate": 3.4288580286796106e-06, "loss": 0.3954, "step": 2614 }, { "epoch": 1.9256259204712813, "grad_norm": 0.3763212263584137, "learning_rate": 3.424790871669321e-06, "loss": 0.395, "step": 2615 }, { "epoch": 1.9263622974963182, "grad_norm": 0.37345150113105774, "learning_rate": 3.4207248714450157e-06, "loss": 0.4032, "step": 2616 }, { "epoch": 1.927098674521355, "grad_norm": 0.3797363340854645, "learning_rate": 3.416660030992639e-06, "loss": 0.4183, "step": 2617 }, { "epoch": 1.9278350515463918, "grad_norm": 0.39499834179878235, "learning_rate": 3.4125963532972878e-06, "loss": 0.4318, "step": 2618 }, { "epoch": 1.9285714285714286, "grad_norm": 0.41415366530418396, "learning_rate": 3.4085338413432066e-06, "loss": 0.3798, "step": 2619 }, { "epoch": 1.9293078055964654, "grad_norm": 0.38226163387298584, "learning_rate": 3.4044724981137787e-06, "loss": 0.4323, "step": 2620 }, { "epoch": 1.9300441826215022, "grad_norm": 0.41554006934165955, "learning_rate": 3.4004123265915328e-06, "loss": 0.3985, "step": 2621 }, { "epoch": 1.930780559646539, "grad_norm": 0.40494853258132935, "learning_rate": 3.3963533297581375e-06, "loss": 0.3894, "step": 2622 }, { "epoch": 1.9315169366715759, "grad_norm": 0.42371952533721924, "learning_rate": 3.3922955105943953e-06, "loss": 0.4125, "step": 2623 }, { "epoch": 1.9322533136966127, "grad_norm": 0.4033835530281067, "learning_rate": 3.3882388720802496e-06, "loss": 0.4221, "step": 2624 }, { "epoch": 1.9329896907216495, "grad_norm": 0.42939725518226624, "learning_rate": 3.384183417194767e-06, "loss": 0.4095, "step": 2625 }, { "epoch": 1.9337260677466863, "grad_norm": 0.41357123851776123, "learning_rate": 3.380129148916156e-06, "loss": 0.3923, "step": 2626 }, { "epoch": 1.9344624447717231, "grad_norm": 0.41408517956733704, "learning_rate": 3.3760760702217477e-06, "loss": 0.3849, "step": 2627 }, { "epoch": 1.93519882179676, "grad_norm": 0.3655821681022644, "learning_rate": 3.3720241840879992e-06, "loss": 0.4033, "step": 2628 }, { "epoch": 1.9359351988217968, "grad_norm": 0.40222102403640747, "learning_rate": 3.367973493490494e-06, "loss": 0.4061, "step": 2629 }, { "epoch": 1.9366715758468336, "grad_norm": 0.37809598445892334, "learning_rate": 3.363924001403939e-06, "loss": 0.3973, "step": 2630 }, { "epoch": 1.9374079528718704, "grad_norm": 0.3718864321708679, "learning_rate": 3.3598757108021546e-06, "loss": 0.4259, "step": 2631 }, { "epoch": 1.9381443298969072, "grad_norm": 0.409039169549942, "learning_rate": 3.355828624658087e-06, "loss": 0.4057, "step": 2632 }, { "epoch": 1.938880706921944, "grad_norm": 0.34290969371795654, "learning_rate": 3.351782745943792e-06, "loss": 0.3939, "step": 2633 }, { "epoch": 1.9396170839469808, "grad_norm": 0.378534734249115, "learning_rate": 3.3477380776304412e-06, "loss": 0.3941, "step": 2634 }, { "epoch": 1.9403534609720177, "grad_norm": 0.3930615782737732, "learning_rate": 3.343694622688315e-06, "loss": 0.4266, "step": 2635 }, { "epoch": 1.9410898379970545, "grad_norm": 0.3948494791984558, "learning_rate": 3.3396523840868065e-06, "loss": 0.4136, "step": 2636 }, { "epoch": 1.9418262150220913, "grad_norm": 0.3820838928222656, "learning_rate": 3.3356113647944144e-06, "loss": 0.403, "step": 2637 }, { "epoch": 1.9425625920471281, "grad_norm": 0.3924919664859772, "learning_rate": 3.3315715677787387e-06, "loss": 0.4077, "step": 2638 }, { "epoch": 1.943298969072165, "grad_norm": 0.3638979196548462, "learning_rate": 3.3275329960064855e-06, "loss": 0.4086, "step": 2639 }, { "epoch": 1.9440353460972017, "grad_norm": 0.3670336902141571, "learning_rate": 3.3234956524434615e-06, "loss": 0.4018, "step": 2640 }, { "epoch": 1.9447717231222386, "grad_norm": 0.38897061347961426, "learning_rate": 3.319459540054567e-06, "loss": 0.4143, "step": 2641 }, { "epoch": 1.9455081001472754, "grad_norm": 0.403804749250412, "learning_rate": 3.315424661803802e-06, "loss": 0.3792, "step": 2642 }, { "epoch": 1.9462444771723122, "grad_norm": 0.3691318929195404, "learning_rate": 3.3113910206542595e-06, "loss": 0.4074, "step": 2643 }, { "epoch": 1.946980854197349, "grad_norm": 0.41701748967170715, "learning_rate": 3.307358619568123e-06, "loss": 0.4037, "step": 2644 }, { "epoch": 1.9477172312223858, "grad_norm": 0.3514380156993866, "learning_rate": 3.303327461506667e-06, "loss": 0.4278, "step": 2645 }, { "epoch": 1.9484536082474226, "grad_norm": 0.3677442669868469, "learning_rate": 3.29929754943025e-06, "loss": 0.4029, "step": 2646 }, { "epoch": 1.9491899852724595, "grad_norm": 0.38052093982696533, "learning_rate": 3.295268886298321e-06, "loss": 0.3938, "step": 2647 }, { "epoch": 1.9499263622974963, "grad_norm": 0.4505428671836853, "learning_rate": 3.2912414750694064e-06, "loss": 0.3936, "step": 2648 }, { "epoch": 1.950662739322533, "grad_norm": 0.35540086030960083, "learning_rate": 3.2872153187011175e-06, "loss": 0.4038, "step": 2649 }, { "epoch": 1.95139911634757, "grad_norm": 0.38356146216392517, "learning_rate": 3.2831904201501376e-06, "loss": 0.4276, "step": 2650 }, { "epoch": 1.9521354933726067, "grad_norm": 0.38261398673057556, "learning_rate": 3.2791667823722327e-06, "loss": 0.3967, "step": 2651 }, { "epoch": 1.9528718703976435, "grad_norm": 0.3730379343032837, "learning_rate": 3.2751444083222418e-06, "loss": 0.3942, "step": 2652 }, { "epoch": 1.9536082474226806, "grad_norm": 0.41036689281463623, "learning_rate": 3.271123300954074e-06, "loss": 0.4031, "step": 2653 }, { "epoch": 1.9543446244477174, "grad_norm": 0.37355658411979675, "learning_rate": 3.2671034632207084e-06, "loss": 0.4094, "step": 2654 }, { "epoch": 1.9550810014727542, "grad_norm": 0.3889113962650299, "learning_rate": 3.263084898074194e-06, "loss": 0.4123, "step": 2655 }, { "epoch": 1.955817378497791, "grad_norm": 0.3863618075847626, "learning_rate": 3.2590676084656425e-06, "loss": 0.3765, "step": 2656 }, { "epoch": 1.9565537555228278, "grad_norm": 0.3829159438610077, "learning_rate": 3.2550515973452295e-06, "loss": 0.3963, "step": 2657 }, { "epoch": 1.9572901325478647, "grad_norm": 0.3452022671699524, "learning_rate": 3.251036867662195e-06, "loss": 0.4252, "step": 2658 }, { "epoch": 1.9580265095729015, "grad_norm": 0.3539165258407593, "learning_rate": 3.247023422364831e-06, "loss": 0.3867, "step": 2659 }, { "epoch": 1.9587628865979383, "grad_norm": 0.3868674039840698, "learning_rate": 3.243011264400494e-06, "loss": 0.421, "step": 2660 }, { "epoch": 1.959499263622975, "grad_norm": 0.39568665623664856, "learning_rate": 3.2390003967155887e-06, "loss": 0.4064, "step": 2661 }, { "epoch": 1.960235640648012, "grad_norm": 0.3922138214111328, "learning_rate": 3.2349908222555764e-06, "loss": 0.3997, "step": 2662 }, { "epoch": 1.9609720176730487, "grad_norm": 0.3632813096046448, "learning_rate": 3.230982543964969e-06, "loss": 0.3785, "step": 2663 }, { "epoch": 1.9617083946980856, "grad_norm": 0.38700589537620544, "learning_rate": 3.226975564787322e-06, "loss": 0.4231, "step": 2664 }, { "epoch": 1.9624447717231224, "grad_norm": 0.37696290016174316, "learning_rate": 3.2229698876652415e-06, "loss": 0.3819, "step": 2665 }, { "epoch": 1.9631811487481592, "grad_norm": 0.3456031084060669, "learning_rate": 3.218965515540377e-06, "loss": 0.4044, "step": 2666 }, { "epoch": 1.963917525773196, "grad_norm": 0.39847907423973083, "learning_rate": 3.214962451353416e-06, "loss": 0.4108, "step": 2667 }, { "epoch": 1.9646539027982328, "grad_norm": 0.3617440462112427, "learning_rate": 3.2109606980440887e-06, "loss": 0.4052, "step": 2668 }, { "epoch": 1.9653902798232696, "grad_norm": 0.37616562843322754, "learning_rate": 3.2069602585511605e-06, "loss": 0.4069, "step": 2669 }, { "epoch": 1.9661266568483065, "grad_norm": 0.39469170570373535, "learning_rate": 3.202961135812437e-06, "loss": 0.3785, "step": 2670 }, { "epoch": 1.9668630338733433, "grad_norm": 0.4196336269378662, "learning_rate": 3.1989633327647485e-06, "loss": 0.4297, "step": 2671 }, { "epoch": 1.96759941089838, "grad_norm": 0.380405068397522, "learning_rate": 3.1949668523439635e-06, "loss": 0.3883, "step": 2672 }, { "epoch": 1.968335787923417, "grad_norm": 0.37013182044029236, "learning_rate": 3.190971697484977e-06, "loss": 0.3756, "step": 2673 }, { "epoch": 1.9690721649484537, "grad_norm": 0.34765076637268066, "learning_rate": 3.186977871121708e-06, "loss": 0.3944, "step": 2674 }, { "epoch": 1.9698085419734905, "grad_norm": 0.3879580497741699, "learning_rate": 3.182985376187105e-06, "loss": 0.4227, "step": 2675 }, { "epoch": 1.9705449189985274, "grad_norm": 0.3524974584579468, "learning_rate": 3.178994215613131e-06, "loss": 0.3805, "step": 2676 }, { "epoch": 1.9712812960235642, "grad_norm": 0.3391323983669281, "learning_rate": 3.1750043923307773e-06, "loss": 0.4046, "step": 2677 }, { "epoch": 1.972017673048601, "grad_norm": 0.3604549169540405, "learning_rate": 3.1710159092700475e-06, "loss": 0.3858, "step": 2678 }, { "epoch": 1.9727540500736378, "grad_norm": 0.36499884724617004, "learning_rate": 3.167028769359964e-06, "loss": 0.4425, "step": 2679 }, { "epoch": 1.9734904270986746, "grad_norm": 0.31244251132011414, "learning_rate": 3.1630429755285623e-06, "loss": 0.4021, "step": 2680 }, { "epoch": 1.9742268041237114, "grad_norm": 0.3878641128540039, "learning_rate": 3.1590585307028884e-06, "loss": 0.4057, "step": 2681 }, { "epoch": 1.9749631811487482, "grad_norm": 0.3576429784297943, "learning_rate": 3.1550754378089976e-06, "loss": 0.3675, "step": 2682 }, { "epoch": 1.975699558173785, "grad_norm": 0.40466341376304626, "learning_rate": 3.1510936997719557e-06, "loss": 0.3959, "step": 2683 }, { "epoch": 1.9764359351988219, "grad_norm": 0.3640960156917572, "learning_rate": 3.1471133195158266e-06, "loss": 0.3905, "step": 2684 }, { "epoch": 1.9771723122238587, "grad_norm": 0.3552342653274536, "learning_rate": 3.143134299963684e-06, "loss": 0.4215, "step": 2685 }, { "epoch": 1.9779086892488955, "grad_norm": 0.3452814817428589, "learning_rate": 3.1391566440375987e-06, "loss": 0.3995, "step": 2686 }, { "epoch": 1.9786450662739323, "grad_norm": 0.3464387357234955, "learning_rate": 3.1351803546586407e-06, "loss": 0.4106, "step": 2687 }, { "epoch": 1.9793814432989691, "grad_norm": 0.3663497567176819, "learning_rate": 3.131205434746879e-06, "loss": 0.3728, "step": 2688 }, { "epoch": 1.980117820324006, "grad_norm": 0.3594132661819458, "learning_rate": 3.1272318872213713e-06, "loss": 0.4075, "step": 2689 }, { "epoch": 1.9808541973490428, "grad_norm": 0.35411345958709717, "learning_rate": 3.123259715000173e-06, "loss": 0.3754, "step": 2690 }, { "epoch": 1.9815905743740796, "grad_norm": 0.33865928649902344, "learning_rate": 3.1192889210003285e-06, "loss": 0.3723, "step": 2691 }, { "epoch": 1.9823269513991164, "grad_norm": 0.3422465920448303, "learning_rate": 3.115319508137866e-06, "loss": 0.387, "step": 2692 }, { "epoch": 1.9830633284241532, "grad_norm": 0.3784734010696411, "learning_rate": 3.1113514793278037e-06, "loss": 0.397, "step": 2693 }, { "epoch": 1.98379970544919, "grad_norm": 0.41923490166664124, "learning_rate": 3.1073848374841416e-06, "loss": 0.4199, "step": 2694 }, { "epoch": 1.9845360824742269, "grad_norm": 0.37350550293922424, "learning_rate": 3.1034195855198622e-06, "loss": 0.4023, "step": 2695 }, { "epoch": 1.9852724594992637, "grad_norm": 0.3785882294178009, "learning_rate": 3.0994557263469267e-06, "loss": 0.4117, "step": 2696 }, { "epoch": 1.9860088365243005, "grad_norm": 0.36809927225112915, "learning_rate": 3.0954932628762723e-06, "loss": 0.3928, "step": 2697 }, { "epoch": 1.9867452135493373, "grad_norm": 0.39854827523231506, "learning_rate": 3.0915321980178153e-06, "loss": 0.3881, "step": 2698 }, { "epoch": 1.9874815905743741, "grad_norm": 0.37375113368034363, "learning_rate": 3.0875725346804385e-06, "loss": 0.4052, "step": 2699 }, { "epoch": 1.988217967599411, "grad_norm": 0.4068243205547333, "learning_rate": 3.0836142757720034e-06, "loss": 0.3912, "step": 2700 }, { "epoch": 1.9889543446244478, "grad_norm": 0.3465851843357086, "learning_rate": 3.0796574241993306e-06, "loss": 0.4136, "step": 2701 }, { "epoch": 1.9896907216494846, "grad_norm": 0.39400115609169006, "learning_rate": 3.0757019828682145e-06, "loss": 0.3915, "step": 2702 }, { "epoch": 1.9904270986745214, "grad_norm": 0.33876892924308777, "learning_rate": 3.0717479546834136e-06, "loss": 0.3967, "step": 2703 }, { "epoch": 1.9911634756995582, "grad_norm": 0.3738095462322235, "learning_rate": 3.0677953425486435e-06, "loss": 0.3866, "step": 2704 }, { "epoch": 1.991899852724595, "grad_norm": 0.350265771150589, "learning_rate": 3.063844149366585e-06, "loss": 0.3789, "step": 2705 }, { "epoch": 1.9926362297496318, "grad_norm": 0.3564341068267822, "learning_rate": 3.0598943780388744e-06, "loss": 0.3948, "step": 2706 }, { "epoch": 1.9933726067746687, "grad_norm": 0.3786764442920685, "learning_rate": 3.055946031466105e-06, "loss": 0.4204, "step": 2707 }, { "epoch": 1.9941089837997055, "grad_norm": 0.3561117649078369, "learning_rate": 3.0519991125478244e-06, "loss": 0.3828, "step": 2708 }, { "epoch": 1.9948453608247423, "grad_norm": 0.40353891253471375, "learning_rate": 3.0480536241825263e-06, "loss": 0.3999, "step": 2709 }, { "epoch": 1.995581737849779, "grad_norm": 0.3841717839241028, "learning_rate": 3.0441095692676625e-06, "loss": 0.4079, "step": 2710 }, { "epoch": 1.996318114874816, "grad_norm": 0.4203062057495117, "learning_rate": 3.040166950699626e-06, "loss": 0.395, "step": 2711 }, { "epoch": 1.9970544918998527, "grad_norm": 0.4298144280910492, "learning_rate": 3.0362257713737552e-06, "loss": 0.466, "step": 2712 }, { "epoch": 1.9977908689248896, "grad_norm": 0.41384029388427734, "learning_rate": 3.0322860341843365e-06, "loss": 0.3908, "step": 2713 }, { "epoch": 1.9985272459499264, "grad_norm": 0.35945290327072144, "learning_rate": 3.028347742024591e-06, "loss": 0.3889, "step": 2714 }, { "epoch": 1.9992636229749632, "grad_norm": 0.3887770473957062, "learning_rate": 3.024410897786682e-06, "loss": 0.4116, "step": 2715 }, { "epoch": 2.0, "grad_norm": 0.4592723250389099, "learning_rate": 3.020475504361711e-06, "loss": 0.4304, "step": 2716 }, { "epoch": 2.000736377025037, "grad_norm": 0.4854089915752411, "learning_rate": 3.01654156463971e-06, "loss": 0.3914, "step": 2717 }, { "epoch": 2.0014727540500736, "grad_norm": 0.40188172459602356, "learning_rate": 3.0126090815096466e-06, "loss": 0.3823, "step": 2718 }, { "epoch": 2.0022091310751104, "grad_norm": 0.3685808777809143, "learning_rate": 3.008678057859415e-06, "loss": 0.3927, "step": 2719 }, { "epoch": 2.0029455081001473, "grad_norm": 0.41367441415786743, "learning_rate": 3.004748496575842e-06, "loss": 0.3738, "step": 2720 }, { "epoch": 2.003681885125184, "grad_norm": 0.3970171809196472, "learning_rate": 3.0008204005446807e-06, "loss": 0.3864, "step": 2721 }, { "epoch": 2.004418262150221, "grad_norm": 0.38890549540519714, "learning_rate": 2.996893772650602e-06, "loss": 0.3532, "step": 2722 }, { "epoch": 2.0051546391752577, "grad_norm": 0.39022096991539, "learning_rate": 2.992968615777206e-06, "loss": 0.3809, "step": 2723 }, { "epoch": 2.0058910162002945, "grad_norm": 0.3781045973300934, "learning_rate": 2.989044932807008e-06, "loss": 0.3753, "step": 2724 }, { "epoch": 2.0066273932253313, "grad_norm": 0.40674319863319397, "learning_rate": 2.9851227266214444e-06, "loss": 0.3748, "step": 2725 }, { "epoch": 2.007363770250368, "grad_norm": 0.42673784494400024, "learning_rate": 2.981202000100861e-06, "loss": 0.371, "step": 2726 }, { "epoch": 2.008100147275405, "grad_norm": 0.3645564615726471, "learning_rate": 2.9772827561245223e-06, "loss": 0.3759, "step": 2727 }, { "epoch": 2.008836524300442, "grad_norm": 0.3563685715198517, "learning_rate": 2.9733649975706035e-06, "loss": 0.3681, "step": 2728 }, { "epoch": 2.0095729013254786, "grad_norm": 0.35634204745292664, "learning_rate": 2.969448727316188e-06, "loss": 0.3784, "step": 2729 }, { "epoch": 2.0103092783505154, "grad_norm": 0.3639596998691559, "learning_rate": 2.9655339482372647e-06, "loss": 0.3652, "step": 2730 }, { "epoch": 2.0110456553755522, "grad_norm": 0.3867950141429901, "learning_rate": 2.961620663208732e-06, "loss": 0.3561, "step": 2731 }, { "epoch": 2.011782032400589, "grad_norm": 0.3902873396873474, "learning_rate": 2.957708875104386e-06, "loss": 0.3777, "step": 2732 }, { "epoch": 2.012518409425626, "grad_norm": 0.39293259382247925, "learning_rate": 2.9537985867969277e-06, "loss": 0.358, "step": 2733 }, { "epoch": 2.0132547864506627, "grad_norm": 0.37629497051239014, "learning_rate": 2.9498898011579514e-06, "loss": 0.3884, "step": 2734 }, { "epoch": 2.0139911634756995, "grad_norm": 0.3420172929763794, "learning_rate": 2.9459825210579534e-06, "loss": 0.3858, "step": 2735 }, { "epoch": 2.0147275405007363, "grad_norm": 0.3435642123222351, "learning_rate": 2.942076749366321e-06, "loss": 0.3672, "step": 2736 }, { "epoch": 2.015463917525773, "grad_norm": 0.36072492599487305, "learning_rate": 2.938172488951336e-06, "loss": 0.3591, "step": 2737 }, { "epoch": 2.01620029455081, "grad_norm": 0.34583789110183716, "learning_rate": 2.9342697426801693e-06, "loss": 0.3348, "step": 2738 }, { "epoch": 2.0169366715758468, "grad_norm": 0.3557235598564148, "learning_rate": 2.9303685134188785e-06, "loss": 0.3624, "step": 2739 }, { "epoch": 2.0176730486008836, "grad_norm": 0.3426262140274048, "learning_rate": 2.9264688040324098e-06, "loss": 0.3786, "step": 2740 }, { "epoch": 2.0184094256259204, "grad_norm": 0.33903318643569946, "learning_rate": 2.922570617384591e-06, "loss": 0.3707, "step": 2741 }, { "epoch": 2.0191458026509572, "grad_norm": 0.32387295365333557, "learning_rate": 2.918673956338136e-06, "loss": 0.3661, "step": 2742 }, { "epoch": 2.019882179675994, "grad_norm": 0.34796783328056335, "learning_rate": 2.914778823754628e-06, "loss": 0.3941, "step": 2743 }, { "epoch": 2.020618556701031, "grad_norm": 0.3746124804019928, "learning_rate": 2.9108852224945405e-06, "loss": 0.3809, "step": 2744 }, { "epoch": 2.0213549337260677, "grad_norm": 0.32552647590637207, "learning_rate": 2.9069931554172155e-06, "loss": 0.3749, "step": 2745 }, { "epoch": 2.0220913107511045, "grad_norm": 0.36219868063926697, "learning_rate": 2.9031026253808657e-06, "loss": 0.3647, "step": 2746 }, { "epoch": 2.0228276877761413, "grad_norm": 0.311967134475708, "learning_rate": 2.899213635242585e-06, "loss": 0.3714, "step": 2747 }, { "epoch": 2.023564064801178, "grad_norm": 0.382050484418869, "learning_rate": 2.8953261878583263e-06, "loss": 0.3629, "step": 2748 }, { "epoch": 2.024300441826215, "grad_norm": 0.3814060688018799, "learning_rate": 2.8914402860829116e-06, "loss": 0.3756, "step": 2749 }, { "epoch": 2.0250368188512518, "grad_norm": 0.3468574285507202, "learning_rate": 2.8875559327700376e-06, "loss": 0.3673, "step": 2750 }, { "epoch": 2.0257731958762886, "grad_norm": 0.3503856658935547, "learning_rate": 2.8836731307722456e-06, "loss": 0.3999, "step": 2751 }, { "epoch": 2.0265095729013254, "grad_norm": 0.34949731826782227, "learning_rate": 2.8797918829409553e-06, "loss": 0.371, "step": 2752 }, { "epoch": 2.027245949926362, "grad_norm": 0.4045935273170471, "learning_rate": 2.8759121921264366e-06, "loss": 0.3934, "step": 2753 }, { "epoch": 2.027982326951399, "grad_norm": 0.3662734031677246, "learning_rate": 2.8720340611778134e-06, "loss": 0.3674, "step": 2754 }, { "epoch": 2.028718703976436, "grad_norm": 0.3531613051891327, "learning_rate": 2.8681574929430732e-06, "loss": 0.3619, "step": 2755 }, { "epoch": 2.0294550810014726, "grad_norm": 0.35535216331481934, "learning_rate": 2.8642824902690482e-06, "loss": 0.3672, "step": 2756 }, { "epoch": 2.0301914580265095, "grad_norm": 0.39943090081214905, "learning_rate": 2.860409056001421e-06, "loss": 0.3925, "step": 2757 }, { "epoch": 2.0309278350515463, "grad_norm": 0.39442721009254456, "learning_rate": 2.8565371929847286e-06, "loss": 0.3568, "step": 2758 }, { "epoch": 2.031664212076583, "grad_norm": 0.35805344581604004, "learning_rate": 2.852666904062351e-06, "loss": 0.3424, "step": 2759 }, { "epoch": 2.03240058910162, "grad_norm": 0.3514004051685333, "learning_rate": 2.8487981920765044e-06, "loss": 0.3695, "step": 2760 }, { "epoch": 2.0331369661266567, "grad_norm": 0.3559187054634094, "learning_rate": 2.844931059868261e-06, "loss": 0.4159, "step": 2761 }, { "epoch": 2.0338733431516935, "grad_norm": 0.3435509502887726, "learning_rate": 2.841065510277523e-06, "loss": 0.3477, "step": 2762 }, { "epoch": 2.0346097201767304, "grad_norm": 0.3598922789096832, "learning_rate": 2.8372015461430313e-06, "loss": 0.3543, "step": 2763 }, { "epoch": 2.035346097201767, "grad_norm": 0.4100363552570343, "learning_rate": 2.833339170302369e-06, "loss": 0.3485, "step": 2764 }, { "epoch": 2.036082474226804, "grad_norm": 0.3745668828487396, "learning_rate": 2.829478385591946e-06, "loss": 0.3597, "step": 2765 }, { "epoch": 2.036818851251841, "grad_norm": 0.32710880041122437, "learning_rate": 2.8256191948470034e-06, "loss": 0.3525, "step": 2766 }, { "epoch": 2.0375552282768776, "grad_norm": 0.4090001583099365, "learning_rate": 2.8217616009016203e-06, "loss": 0.3994, "step": 2767 }, { "epoch": 2.0382916053019144, "grad_norm": 0.35250866413116455, "learning_rate": 2.81790560658869e-06, "loss": 0.3575, "step": 2768 }, { "epoch": 2.0390279823269513, "grad_norm": 0.3470795452594757, "learning_rate": 2.8140512147399436e-06, "loss": 0.3801, "step": 2769 }, { "epoch": 2.039764359351988, "grad_norm": 0.39775145053863525, "learning_rate": 2.8101984281859276e-06, "loss": 0.3803, "step": 2770 }, { "epoch": 2.040500736377025, "grad_norm": 0.38484060764312744, "learning_rate": 2.8063472497560107e-06, "loss": 0.3948, "step": 2771 }, { "epoch": 2.0412371134020617, "grad_norm": 0.35866910219192505, "learning_rate": 2.802497682278385e-06, "loss": 0.3644, "step": 2772 }, { "epoch": 2.0419734904270985, "grad_norm": 0.3320389688014984, "learning_rate": 2.7986497285800564e-06, "loss": 0.3525, "step": 2773 }, { "epoch": 2.0427098674521353, "grad_norm": 0.3605724573135376, "learning_rate": 2.7948033914868415e-06, "loss": 0.3861, "step": 2774 }, { "epoch": 2.043446244477172, "grad_norm": 0.35822010040283203, "learning_rate": 2.7909586738233816e-06, "loss": 0.3836, "step": 2775 }, { "epoch": 2.044182621502209, "grad_norm": 0.38547834753990173, "learning_rate": 2.787115578413113e-06, "loss": 0.3731, "step": 2776 }, { "epoch": 2.044918998527246, "grad_norm": 0.36046162247657776, "learning_rate": 2.7832741080782944e-06, "loss": 0.3705, "step": 2777 }, { "epoch": 2.0456553755522826, "grad_norm": 0.36100590229034424, "learning_rate": 2.7794342656399835e-06, "loss": 0.3573, "step": 2778 }, { "epoch": 2.0463917525773194, "grad_norm": 0.37001779675483704, "learning_rate": 2.775596053918043e-06, "loss": 0.3919, "step": 2779 }, { "epoch": 2.0471281296023562, "grad_norm": 0.37465184926986694, "learning_rate": 2.7717594757311435e-06, "loss": 0.4072, "step": 2780 }, { "epoch": 2.047864506627393, "grad_norm": 0.35224416851997375, "learning_rate": 2.7679245338967497e-06, "loss": 0.393, "step": 2781 }, { "epoch": 2.04860088365243, "grad_norm": 0.38229209184646606, "learning_rate": 2.764091231231125e-06, "loss": 0.391, "step": 2782 }, { "epoch": 2.0493372606774667, "grad_norm": 0.3451941907405853, "learning_rate": 2.7602595705493353e-06, "loss": 0.367, "step": 2783 }, { "epoch": 2.0500736377025035, "grad_norm": 0.33509618043899536, "learning_rate": 2.7564295546652366e-06, "loss": 0.3684, "step": 2784 }, { "epoch": 2.0508100147275403, "grad_norm": 0.38282108306884766, "learning_rate": 2.7526011863914702e-06, "loss": 0.3657, "step": 2785 }, { "epoch": 2.051546391752577, "grad_norm": 0.38125553727149963, "learning_rate": 2.748774468539481e-06, "loss": 0.3667, "step": 2786 }, { "epoch": 2.052282768777614, "grad_norm": 0.34986355900764465, "learning_rate": 2.74494940391949e-06, "loss": 0.4248, "step": 2787 }, { "epoch": 2.0530191458026508, "grad_norm": 0.38790905475616455, "learning_rate": 2.7411259953405143e-06, "loss": 0.3802, "step": 2788 }, { "epoch": 2.0537555228276876, "grad_norm": 0.3548150956630707, "learning_rate": 2.737304245610346e-06, "loss": 0.3833, "step": 2789 }, { "epoch": 2.0544918998527244, "grad_norm": 0.3673829436302185, "learning_rate": 2.7334841575355618e-06, "loss": 0.3856, "step": 2790 }, { "epoch": 2.055228276877761, "grad_norm": 0.363076388835907, "learning_rate": 2.7296657339215227e-06, "loss": 0.3783, "step": 2791 }, { "epoch": 2.055964653902798, "grad_norm": 0.3587580621242523, "learning_rate": 2.725848977572363e-06, "loss": 0.386, "step": 2792 }, { "epoch": 2.056701030927835, "grad_norm": 0.3971223831176758, "learning_rate": 2.722033891290988e-06, "loss": 0.3445, "step": 2793 }, { "epoch": 2.0574374079528717, "grad_norm": 0.38008826971054077, "learning_rate": 2.7182204778790878e-06, "loss": 0.3613, "step": 2794 }, { "epoch": 2.0581737849779085, "grad_norm": 0.3624361753463745, "learning_rate": 2.714408740137115e-06, "loss": 0.3675, "step": 2795 }, { "epoch": 2.0589101620029453, "grad_norm": 0.33494141697883606, "learning_rate": 2.7105986808642936e-06, "loss": 0.3914, "step": 2796 }, { "epoch": 2.059646539027982, "grad_norm": 0.36965957283973694, "learning_rate": 2.7067903028586193e-06, "loss": 0.3601, "step": 2797 }, { "epoch": 2.060382916053019, "grad_norm": 0.3501898944377899, "learning_rate": 2.702983608916849e-06, "loss": 0.3996, "step": 2798 }, { "epoch": 2.0611192930780557, "grad_norm": 0.41025105118751526, "learning_rate": 2.6991786018345e-06, "loss": 0.3741, "step": 2799 }, { "epoch": 2.0618556701030926, "grad_norm": 0.3536095917224884, "learning_rate": 2.69537528440586e-06, "loss": 0.3831, "step": 2800 }, { "epoch": 2.0625920471281294, "grad_norm": 0.369924396276474, "learning_rate": 2.6915736594239676e-06, "loss": 0.3879, "step": 2801 }, { "epoch": 2.063328424153166, "grad_norm": 0.3758985996246338, "learning_rate": 2.6877737296806217e-06, "loss": 0.3922, "step": 2802 }, { "epoch": 2.064064801178203, "grad_norm": 0.4005085527896881, "learning_rate": 2.6839754979663752e-06, "loss": 0.3852, "step": 2803 }, { "epoch": 2.0648011782032403, "grad_norm": 0.3522018790245056, "learning_rate": 2.6801789670705335e-06, "loss": 0.3979, "step": 2804 }, { "epoch": 2.065537555228277, "grad_norm": 0.36220279335975647, "learning_rate": 2.6763841397811576e-06, "loss": 0.3662, "step": 2805 }, { "epoch": 2.066273932253314, "grad_norm": 0.37038183212280273, "learning_rate": 2.6725910188850523e-06, "loss": 0.364, "step": 2806 }, { "epoch": 2.0670103092783507, "grad_norm": 0.3805944323539734, "learning_rate": 2.668799607167769e-06, "loss": 0.389, "step": 2807 }, { "epoch": 2.0677466863033875, "grad_norm": 0.33866146206855774, "learning_rate": 2.6650099074136095e-06, "loss": 0.348, "step": 2808 }, { "epoch": 2.0684830633284244, "grad_norm": 0.37582066655158997, "learning_rate": 2.6612219224056133e-06, "loss": 0.3865, "step": 2809 }, { "epoch": 2.069219440353461, "grad_norm": 0.39235755801200867, "learning_rate": 2.657435654925562e-06, "loss": 0.3762, "step": 2810 }, { "epoch": 2.069955817378498, "grad_norm": 0.3813096284866333, "learning_rate": 2.6536511077539757e-06, "loss": 0.3941, "step": 2811 }, { "epoch": 2.070692194403535, "grad_norm": 0.35484588146209717, "learning_rate": 2.6498682836701094e-06, "loss": 0.388, "step": 2812 }, { "epoch": 2.0714285714285716, "grad_norm": 0.37006160616874695, "learning_rate": 2.6460871854519594e-06, "loss": 0.4055, "step": 2813 }, { "epoch": 2.0721649484536084, "grad_norm": 0.35195285081863403, "learning_rate": 2.6423078158762473e-06, "loss": 0.3766, "step": 2814 }, { "epoch": 2.0729013254786453, "grad_norm": 0.3882370591163635, "learning_rate": 2.638530177718427e-06, "loss": 0.388, "step": 2815 }, { "epoch": 2.073637702503682, "grad_norm": 0.33952176570892334, "learning_rate": 2.6347542737526843e-06, "loss": 0.3726, "step": 2816 }, { "epoch": 2.074374079528719, "grad_norm": 0.37573882937431335, "learning_rate": 2.6309801067519293e-06, "loss": 0.3818, "step": 2817 }, { "epoch": 2.0751104565537557, "grad_norm": 0.3656890094280243, "learning_rate": 2.6272076794877915e-06, "loss": 0.397, "step": 2818 }, { "epoch": 2.0758468335787925, "grad_norm": 0.39007052779197693, "learning_rate": 2.623436994730632e-06, "loss": 0.3641, "step": 2819 }, { "epoch": 2.0765832106038293, "grad_norm": 0.36530953645706177, "learning_rate": 2.619668055249527e-06, "loss": 0.3543, "step": 2820 }, { "epoch": 2.077319587628866, "grad_norm": 0.3379687964916229, "learning_rate": 2.6159008638122687e-06, "loss": 0.3812, "step": 2821 }, { "epoch": 2.078055964653903, "grad_norm": 0.3618925213813782, "learning_rate": 2.6121354231853725e-06, "loss": 0.4019, "step": 2822 }, { "epoch": 2.07879234167894, "grad_norm": 0.39091405272483826, "learning_rate": 2.608371736134063e-06, "loss": 0.3696, "step": 2823 }, { "epoch": 2.0795287187039766, "grad_norm": 0.3801884055137634, "learning_rate": 2.6046098054222767e-06, "loss": 0.3822, "step": 2824 }, { "epoch": 2.0802650957290134, "grad_norm": 0.34748730063438416, "learning_rate": 2.6008496338126643e-06, "loss": 0.3625, "step": 2825 }, { "epoch": 2.0810014727540502, "grad_norm": 0.3482074737548828, "learning_rate": 2.5970912240665815e-06, "loss": 0.3685, "step": 2826 }, { "epoch": 2.081737849779087, "grad_norm": 0.39526116847991943, "learning_rate": 2.59333457894409e-06, "loss": 0.3852, "step": 2827 }, { "epoch": 2.082474226804124, "grad_norm": 0.39386582374572754, "learning_rate": 2.5895797012039576e-06, "loss": 0.3473, "step": 2828 }, { "epoch": 2.0832106038291607, "grad_norm": 0.40795037150382996, "learning_rate": 2.5858265936036496e-06, "loss": 0.394, "step": 2829 }, { "epoch": 2.0839469808541975, "grad_norm": 0.40214234590530396, "learning_rate": 2.582075258899339e-06, "loss": 0.3657, "step": 2830 }, { "epoch": 2.0846833578792343, "grad_norm": 0.35180026292800903, "learning_rate": 2.578325699845892e-06, "loss": 0.3508, "step": 2831 }, { "epoch": 2.085419734904271, "grad_norm": 0.3568512201309204, "learning_rate": 2.5745779191968686e-06, "loss": 0.3434, "step": 2832 }, { "epoch": 2.086156111929308, "grad_norm": 0.3792460262775421, "learning_rate": 2.5708319197045297e-06, "loss": 0.3691, "step": 2833 }, { "epoch": 2.0868924889543448, "grad_norm": 0.36163848638534546, "learning_rate": 2.567087704119821e-06, "loss": 0.3726, "step": 2834 }, { "epoch": 2.0876288659793816, "grad_norm": 0.3781639039516449, "learning_rate": 2.5633452751923825e-06, "loss": 0.3749, "step": 2835 }, { "epoch": 2.0883652430044184, "grad_norm": 0.3682844340801239, "learning_rate": 2.5596046356705418e-06, "loss": 0.3525, "step": 2836 }, { "epoch": 2.089101620029455, "grad_norm": 0.3800516128540039, "learning_rate": 2.5558657883013078e-06, "loss": 0.3867, "step": 2837 }, { "epoch": 2.089837997054492, "grad_norm": 0.34438201785087585, "learning_rate": 2.5521287358303814e-06, "loss": 0.4029, "step": 2838 }, { "epoch": 2.090574374079529, "grad_norm": 0.3582353889942169, "learning_rate": 2.54839348100214e-06, "loss": 0.35, "step": 2839 }, { "epoch": 2.0913107511045657, "grad_norm": 0.35245490074157715, "learning_rate": 2.544660026559639e-06, "loss": 0.3818, "step": 2840 }, { "epoch": 2.0920471281296025, "grad_norm": 0.3359908163547516, "learning_rate": 2.5409283752446183e-06, "loss": 0.3835, "step": 2841 }, { "epoch": 2.0927835051546393, "grad_norm": 0.3465385437011719, "learning_rate": 2.537198529797489e-06, "loss": 0.3917, "step": 2842 }, { "epoch": 2.093519882179676, "grad_norm": 0.3186095952987671, "learning_rate": 2.533470492957335e-06, "loss": 0.3766, "step": 2843 }, { "epoch": 2.094256259204713, "grad_norm": 0.34522688388824463, "learning_rate": 2.5297442674619153e-06, "loss": 0.3691, "step": 2844 }, { "epoch": 2.0949926362297497, "grad_norm": 0.3759070932865143, "learning_rate": 2.526019856047656e-06, "loss": 0.3514, "step": 2845 }, { "epoch": 2.0957290132547866, "grad_norm": 0.37958410382270813, "learning_rate": 2.5222972614496543e-06, "loss": 0.3999, "step": 2846 }, { "epoch": 2.0964653902798234, "grad_norm": 0.33293333649635315, "learning_rate": 2.518576486401671e-06, "loss": 0.3622, "step": 2847 }, { "epoch": 2.09720176730486, "grad_norm": 0.39282292127609253, "learning_rate": 2.514857533636128e-06, "loss": 0.3799, "step": 2848 }, { "epoch": 2.097938144329897, "grad_norm": 0.3422982394695282, "learning_rate": 2.5111404058841155e-06, "loss": 0.3917, "step": 2849 }, { "epoch": 2.098674521354934, "grad_norm": 0.3647412359714508, "learning_rate": 2.5074251058753783e-06, "loss": 0.3606, "step": 2850 }, { "epoch": 2.0994108983799706, "grad_norm": 0.38597914576530457, "learning_rate": 2.5037116363383203e-06, "loss": 0.3649, "step": 2851 }, { "epoch": 2.1001472754050075, "grad_norm": 0.33769500255584717, "learning_rate": 2.5000000000000015e-06, "loss": 0.37, "step": 2852 }, { "epoch": 2.1008836524300443, "grad_norm": 0.3958123028278351, "learning_rate": 2.4962901995861348e-06, "loss": 0.3439, "step": 2853 }, { "epoch": 2.101620029455081, "grad_norm": 0.34865254163742065, "learning_rate": 2.4925822378210844e-06, "loss": 0.3636, "step": 2854 }, { "epoch": 2.102356406480118, "grad_norm": 0.3949892222881317, "learning_rate": 2.488876117427869e-06, "loss": 0.3538, "step": 2855 }, { "epoch": 2.1030927835051547, "grad_norm": 0.39913707971572876, "learning_rate": 2.4851718411281495e-06, "loss": 0.368, "step": 2856 }, { "epoch": 2.1038291605301915, "grad_norm": 0.3734777569770813, "learning_rate": 2.4814694116422326e-06, "loss": 0.3516, "step": 2857 }, { "epoch": 2.1045655375552283, "grad_norm": 0.414492130279541, "learning_rate": 2.477768831689074e-06, "loss": 0.3833, "step": 2858 }, { "epoch": 2.105301914580265, "grad_norm": 0.3294821083545685, "learning_rate": 2.4740701039862663e-06, "loss": 0.3678, "step": 2859 }, { "epoch": 2.106038291605302, "grad_norm": 0.38018667697906494, "learning_rate": 2.4703732312500438e-06, "loss": 0.3991, "step": 2860 }, { "epoch": 2.106774668630339, "grad_norm": 0.35948431491851807, "learning_rate": 2.466678216195277e-06, "loss": 0.361, "step": 2861 }, { "epoch": 2.1075110456553756, "grad_norm": 0.3856082856655121, "learning_rate": 2.462985061535472e-06, "loss": 0.369, "step": 2862 }, { "epoch": 2.1082474226804124, "grad_norm": 0.39372214674949646, "learning_rate": 2.459293769982774e-06, "loss": 0.3786, "step": 2863 }, { "epoch": 2.1089837997054492, "grad_norm": 0.3526458144187927, "learning_rate": 2.455604344247954e-06, "loss": 0.3703, "step": 2864 }, { "epoch": 2.109720176730486, "grad_norm": 0.3489939570426941, "learning_rate": 2.4519167870404126e-06, "loss": 0.3932, "step": 2865 }, { "epoch": 2.110456553755523, "grad_norm": 0.37256160378456116, "learning_rate": 2.4482311010681842e-06, "loss": 0.3982, "step": 2866 }, { "epoch": 2.1111929307805597, "grad_norm": 0.3323703110218048, "learning_rate": 2.4445472890379233e-06, "loss": 0.3791, "step": 2867 }, { "epoch": 2.1119293078055965, "grad_norm": 0.36218979954719543, "learning_rate": 2.4408653536549104e-06, "loss": 0.4038, "step": 2868 }, { "epoch": 2.1126656848306333, "grad_norm": 0.34477296471595764, "learning_rate": 2.437185297623047e-06, "loss": 0.391, "step": 2869 }, { "epoch": 2.11340206185567, "grad_norm": 0.35292166471481323, "learning_rate": 2.4335071236448536e-06, "loss": 0.3793, "step": 2870 }, { "epoch": 2.114138438880707, "grad_norm": 0.35486897826194763, "learning_rate": 2.4298308344214745e-06, "loss": 0.3849, "step": 2871 }, { "epoch": 2.1148748159057438, "grad_norm": 0.38103657960891724, "learning_rate": 2.4261564326526623e-06, "loss": 0.3523, "step": 2872 }, { "epoch": 2.1156111929307806, "grad_norm": 0.37378567457199097, "learning_rate": 2.422483921036785e-06, "loss": 0.3847, "step": 2873 }, { "epoch": 2.1163475699558174, "grad_norm": 0.38318321108818054, "learning_rate": 2.418813302270829e-06, "loss": 0.3825, "step": 2874 }, { "epoch": 2.1170839469808542, "grad_norm": 0.34228307008743286, "learning_rate": 2.415144579050382e-06, "loss": 0.3661, "step": 2875 }, { "epoch": 2.117820324005891, "grad_norm": 0.3472125828266144, "learning_rate": 2.411477754069645e-06, "loss": 0.3491, "step": 2876 }, { "epoch": 2.118556701030928, "grad_norm": 0.355693519115448, "learning_rate": 2.4078128300214225e-06, "loss": 0.3481, "step": 2877 }, { "epoch": 2.1192930780559647, "grad_norm": 0.40808922052383423, "learning_rate": 2.4041498095971253e-06, "loss": 0.4022, "step": 2878 }, { "epoch": 2.1200294550810015, "grad_norm": 0.3449529707431793, "learning_rate": 2.4004886954867618e-06, "loss": 0.361, "step": 2879 }, { "epoch": 2.1207658321060383, "grad_norm": 0.35305774211883545, "learning_rate": 2.3968294903789474e-06, "loss": 0.3913, "step": 2880 }, { "epoch": 2.121502209131075, "grad_norm": 0.3608260750770569, "learning_rate": 2.393172196960891e-06, "loss": 0.3521, "step": 2881 }, { "epoch": 2.122238586156112, "grad_norm": 0.3713111877441406, "learning_rate": 2.3895168179183947e-06, "loss": 0.389, "step": 2882 }, { "epoch": 2.1229749631811488, "grad_norm": 0.3446366488933563, "learning_rate": 2.3858633559358635e-06, "loss": 0.3765, "step": 2883 }, { "epoch": 2.1237113402061856, "grad_norm": 0.36745747923851013, "learning_rate": 2.3822118136962876e-06, "loss": 0.38, "step": 2884 }, { "epoch": 2.1244477172312224, "grad_norm": 0.3752921223640442, "learning_rate": 2.378562193881248e-06, "loss": 0.3714, "step": 2885 }, { "epoch": 2.125184094256259, "grad_norm": 0.33874091506004333, "learning_rate": 2.3749144991709174e-06, "loss": 0.3857, "step": 2886 }, { "epoch": 2.125920471281296, "grad_norm": 0.3628979027271271, "learning_rate": 2.371268732244048e-06, "loss": 0.3781, "step": 2887 }, { "epoch": 2.126656848306333, "grad_norm": 0.36751553416252136, "learning_rate": 2.367624895777987e-06, "loss": 0.3642, "step": 2888 }, { "epoch": 2.1273932253313697, "grad_norm": 0.3799552023410797, "learning_rate": 2.3639829924486546e-06, "loss": 0.3399, "step": 2889 }, { "epoch": 2.1281296023564065, "grad_norm": 0.34371480345726013, "learning_rate": 2.3603430249305532e-06, "loss": 0.3956, "step": 2890 }, { "epoch": 2.1288659793814433, "grad_norm": 0.3942258954048157, "learning_rate": 2.356704995896768e-06, "loss": 0.3714, "step": 2891 }, { "epoch": 2.12960235640648, "grad_norm": 0.34906941652297974, "learning_rate": 2.353068908018957e-06, "loss": 0.3743, "step": 2892 }, { "epoch": 2.130338733431517, "grad_norm": 0.3614822328090668, "learning_rate": 2.3494347639673513e-06, "loss": 0.3632, "step": 2893 }, { "epoch": 2.1310751104565537, "grad_norm": 0.3540303111076355, "learning_rate": 2.3458025664107587e-06, "loss": 0.3442, "step": 2894 }, { "epoch": 2.1318114874815906, "grad_norm": 0.3677811622619629, "learning_rate": 2.342172318016552e-06, "loss": 0.3421, "step": 2895 }, { "epoch": 2.1325478645066274, "grad_norm": 0.3830873668193817, "learning_rate": 2.33854402145068e-06, "loss": 0.378, "step": 2896 }, { "epoch": 2.133284241531664, "grad_norm": 0.405880868434906, "learning_rate": 2.3349176793776523e-06, "loss": 0.3765, "step": 2897 }, { "epoch": 2.134020618556701, "grad_norm": 0.37011435627937317, "learning_rate": 2.3312932944605433e-06, "loss": 0.3852, "step": 2898 }, { "epoch": 2.134756995581738, "grad_norm": 0.3674939274787903, "learning_rate": 2.3276708693609947e-06, "loss": 0.3663, "step": 2899 }, { "epoch": 2.1354933726067746, "grad_norm": 0.3470740020275116, "learning_rate": 2.324050406739205e-06, "loss": 0.3652, "step": 2900 }, { "epoch": 2.1362297496318114, "grad_norm": 0.3683933913707733, "learning_rate": 2.32043190925393e-06, "loss": 0.3959, "step": 2901 }, { "epoch": 2.1369661266568483, "grad_norm": 0.3439938426017761, "learning_rate": 2.316815379562491e-06, "loss": 0.3843, "step": 2902 }, { "epoch": 2.137702503681885, "grad_norm": 0.3798215389251709, "learning_rate": 2.3132008203207508e-06, "loss": 0.3494, "step": 2903 }, { "epoch": 2.138438880706922, "grad_norm": 0.3750232458114624, "learning_rate": 2.309588234183137e-06, "loss": 0.388, "step": 2904 }, { "epoch": 2.1391752577319587, "grad_norm": 0.34800633788108826, "learning_rate": 2.3059776238026233e-06, "loss": 0.3883, "step": 2905 }, { "epoch": 2.1399116347569955, "grad_norm": 0.3639180660247803, "learning_rate": 2.30236899183073e-06, "loss": 0.3853, "step": 2906 }, { "epoch": 2.1406480117820323, "grad_norm": 0.3714924156665802, "learning_rate": 2.298762340917531e-06, "loss": 0.415, "step": 2907 }, { "epoch": 2.141384388807069, "grad_norm": 0.4161515235900879, "learning_rate": 2.295157673711641e-06, "loss": 0.3851, "step": 2908 }, { "epoch": 2.142120765832106, "grad_norm": 0.35217177867889404, "learning_rate": 2.2915549928602153e-06, "loss": 0.3988, "step": 2909 }, { "epoch": 2.142857142857143, "grad_norm": 0.3613126575946808, "learning_rate": 2.2879543010089613e-06, "loss": 0.3599, "step": 2910 }, { "epoch": 2.1435935198821796, "grad_norm": 0.34729936718940735, "learning_rate": 2.2843556008021105e-06, "loss": 0.3698, "step": 2911 }, { "epoch": 2.1443298969072164, "grad_norm": 0.37736669182777405, "learning_rate": 2.280758894882441e-06, "loss": 0.3627, "step": 2912 }, { "epoch": 2.1450662739322532, "grad_norm": 0.36949247121810913, "learning_rate": 2.2771641858912684e-06, "loss": 0.3889, "step": 2913 }, { "epoch": 2.14580265095729, "grad_norm": 0.3440004587173462, "learning_rate": 2.2735714764684368e-06, "loss": 0.3574, "step": 2914 }, { "epoch": 2.146539027982327, "grad_norm": 0.3486247658729553, "learning_rate": 2.269980769252321e-06, "loss": 0.3554, "step": 2915 }, { "epoch": 2.1472754050073637, "grad_norm": 0.35198965668678284, "learning_rate": 2.2663920668798316e-06, "loss": 0.3627, "step": 2916 }, { "epoch": 2.1480117820324005, "grad_norm": 0.37342336773872375, "learning_rate": 2.262805371986402e-06, "loss": 0.375, "step": 2917 }, { "epoch": 2.1487481590574373, "grad_norm": 0.36178645491600037, "learning_rate": 2.2592206872059913e-06, "loss": 0.4036, "step": 2918 }, { "epoch": 2.149484536082474, "grad_norm": 0.37667861580848694, "learning_rate": 2.255638015171085e-06, "loss": 0.3922, "step": 2919 }, { "epoch": 2.150220913107511, "grad_norm": 0.3384239077568054, "learning_rate": 2.2520573585126863e-06, "loss": 0.3869, "step": 2920 }, { "epoch": 2.1509572901325478, "grad_norm": 0.3617817759513855, "learning_rate": 2.248478719860326e-06, "loss": 0.3777, "step": 2921 }, { "epoch": 2.1516936671575846, "grad_norm": 0.34681081771850586, "learning_rate": 2.2449021018420454e-06, "loss": 0.355, "step": 2922 }, { "epoch": 2.1524300441826214, "grad_norm": 0.3603213131427765, "learning_rate": 2.2413275070844026e-06, "loss": 0.4205, "step": 2923 }, { "epoch": 2.153166421207658, "grad_norm": 0.33105942606925964, "learning_rate": 2.2377549382124767e-06, "loss": 0.3259, "step": 2924 }, { "epoch": 2.153902798232695, "grad_norm": 0.36263999342918396, "learning_rate": 2.2341843978498525e-06, "loss": 0.383, "step": 2925 }, { "epoch": 2.154639175257732, "grad_norm": 0.34897348284721375, "learning_rate": 2.230615888618624e-06, "loss": 0.3873, "step": 2926 }, { "epoch": 2.1553755522827687, "grad_norm": 0.3384336829185486, "learning_rate": 2.2270494131394034e-06, "loss": 0.3643, "step": 2927 }, { "epoch": 2.1561119293078055, "grad_norm": 0.3632971942424774, "learning_rate": 2.223484974031294e-06, "loss": 0.3668, "step": 2928 }, { "epoch": 2.1568483063328423, "grad_norm": 0.3339279592037201, "learning_rate": 2.2199225739119184e-06, "loss": 0.3488, "step": 2929 }, { "epoch": 2.157584683357879, "grad_norm": 0.35420891642570496, "learning_rate": 2.216362215397393e-06, "loss": 0.3732, "step": 2930 }, { "epoch": 2.158321060382916, "grad_norm": 0.35036933422088623, "learning_rate": 2.2128039011023367e-06, "loss": 0.3618, "step": 2931 }, { "epoch": 2.1590574374079528, "grad_norm": 0.40435558557510376, "learning_rate": 2.2092476336398706e-06, "loss": 0.4052, "step": 2932 }, { "epoch": 2.1597938144329896, "grad_norm": 0.32747867703437805, "learning_rate": 2.2056934156216094e-06, "loss": 0.3612, "step": 2933 }, { "epoch": 2.1605301914580264, "grad_norm": 0.3270984888076782, "learning_rate": 2.2021412496576598e-06, "loss": 0.3827, "step": 2934 }, { "epoch": 2.161266568483063, "grad_norm": 0.3557640314102173, "learning_rate": 2.198591138356633e-06, "loss": 0.3625, "step": 2935 }, { "epoch": 2.1620029455081, "grad_norm": 0.3336459994316101, "learning_rate": 2.195043084325616e-06, "loss": 0.35, "step": 2936 }, { "epoch": 2.162739322533137, "grad_norm": 0.3505702316761017, "learning_rate": 2.191497090170193e-06, "loss": 0.3816, "step": 2937 }, { "epoch": 2.1634756995581736, "grad_norm": 0.37018442153930664, "learning_rate": 2.1879531584944396e-06, "loss": 0.3812, "step": 2938 }, { "epoch": 2.1642120765832105, "grad_norm": 0.33809441328048706, "learning_rate": 2.1844112919009087e-06, "loss": 0.39, "step": 2939 }, { "epoch": 2.1649484536082473, "grad_norm": 0.32393452525138855, "learning_rate": 2.1808714929906394e-06, "loss": 0.3462, "step": 2940 }, { "epoch": 2.165684830633284, "grad_norm": 0.33284613490104675, "learning_rate": 2.1773337643631565e-06, "loss": 0.3845, "step": 2941 }, { "epoch": 2.166421207658321, "grad_norm": 0.36973825097084045, "learning_rate": 2.173798108616459e-06, "loss": 0.3597, "step": 2942 }, { "epoch": 2.1671575846833577, "grad_norm": 0.3738713264465332, "learning_rate": 2.1702645283470238e-06, "loss": 0.381, "step": 2943 }, { "epoch": 2.1678939617083945, "grad_norm": 0.3623276352882385, "learning_rate": 2.166733026149811e-06, "loss": 0.3546, "step": 2944 }, { "epoch": 2.1686303387334314, "grad_norm": 0.4014292061328888, "learning_rate": 2.1632036046182416e-06, "loss": 0.3645, "step": 2945 }, { "epoch": 2.169366715758468, "grad_norm": 0.37799927592277527, "learning_rate": 2.159676266344222e-06, "loss": 0.3611, "step": 2946 }, { "epoch": 2.170103092783505, "grad_norm": 0.379955917596817, "learning_rate": 2.15615101391812e-06, "loss": 0.3564, "step": 2947 }, { "epoch": 2.170839469808542, "grad_norm": 0.36554086208343506, "learning_rate": 2.1526278499287746e-06, "loss": 0.3748, "step": 2948 }, { "epoch": 2.1715758468335786, "grad_norm": 0.3485044538974762, "learning_rate": 2.1491067769634927e-06, "loss": 0.3738, "step": 2949 }, { "epoch": 2.1723122238586154, "grad_norm": 0.4192858934402466, "learning_rate": 2.145587797608043e-06, "loss": 0.3907, "step": 2950 }, { "epoch": 2.1730486008836523, "grad_norm": 0.3680093288421631, "learning_rate": 2.1420709144466557e-06, "loss": 0.3673, "step": 2951 }, { "epoch": 2.173784977908689, "grad_norm": 0.3767206370830536, "learning_rate": 2.1385561300620287e-06, "loss": 0.3542, "step": 2952 }, { "epoch": 2.174521354933726, "grad_norm": 0.3542313575744629, "learning_rate": 2.1350434470353065e-06, "loss": 0.3815, "step": 2953 }, { "epoch": 2.1752577319587627, "grad_norm": 0.3587740957736969, "learning_rate": 2.131532867946102e-06, "loss": 0.3831, "step": 2954 }, { "epoch": 2.1759941089837995, "grad_norm": 0.35472214221954346, "learning_rate": 2.1280243953724784e-06, "loss": 0.3649, "step": 2955 }, { "epoch": 2.1767304860088363, "grad_norm": 0.37398767471313477, "learning_rate": 2.1245180318909482e-06, "loss": 0.3801, "step": 2956 }, { "epoch": 2.177466863033873, "grad_norm": 0.3110312521457672, "learning_rate": 2.121013780076483e-06, "loss": 0.3539, "step": 2957 }, { "epoch": 2.17820324005891, "grad_norm": 0.34271514415740967, "learning_rate": 2.1175116425024978e-06, "loss": 0.3796, "step": 2958 }, { "epoch": 2.178939617083947, "grad_norm": 0.34827741980552673, "learning_rate": 2.1140116217408554e-06, "loss": 0.3739, "step": 2959 }, { "epoch": 2.1796759941089836, "grad_norm": 0.327886700630188, "learning_rate": 2.110513720361869e-06, "loss": 0.3734, "step": 2960 }, { "epoch": 2.1804123711340204, "grad_norm": 0.3339656889438629, "learning_rate": 2.107017940934286e-06, "loss": 0.3918, "step": 2961 }, { "epoch": 2.1811487481590572, "grad_norm": 0.380188912153244, "learning_rate": 2.1035242860253064e-06, "loss": 0.3954, "step": 2962 }, { "epoch": 2.181885125184094, "grad_norm": 0.33889806270599365, "learning_rate": 2.100032758200562e-06, "loss": 0.3591, "step": 2963 }, { "epoch": 2.182621502209131, "grad_norm": 0.3376297056674957, "learning_rate": 2.0965433600241247e-06, "loss": 0.3968, "step": 2964 }, { "epoch": 2.1833578792341677, "grad_norm": 0.3318977355957031, "learning_rate": 2.093056094058506e-06, "loss": 0.3602, "step": 2965 }, { "epoch": 2.184094256259205, "grad_norm": 0.3732307255268097, "learning_rate": 2.089570962864647e-06, "loss": 0.3863, "step": 2966 }, { "epoch": 2.1848306332842418, "grad_norm": 0.33816441893577576, "learning_rate": 2.0860879690019216e-06, "loss": 0.3826, "step": 2967 }, { "epoch": 2.1855670103092786, "grad_norm": 0.38040101528167725, "learning_rate": 2.0826071150281374e-06, "loss": 0.3581, "step": 2968 }, { "epoch": 2.1863033873343154, "grad_norm": 0.3592909276485443, "learning_rate": 2.0791284034995296e-06, "loss": 0.4081, "step": 2969 }, { "epoch": 2.187039764359352, "grad_norm": 0.330105185508728, "learning_rate": 2.0756518369707528e-06, "loss": 0.3798, "step": 2970 }, { "epoch": 2.187776141384389, "grad_norm": 0.34812307357788086, "learning_rate": 2.0721774179948978e-06, "loss": 0.4039, "step": 2971 }, { "epoch": 2.188512518409426, "grad_norm": 0.39521628618240356, "learning_rate": 2.0687051491234717e-06, "loss": 0.3804, "step": 2972 }, { "epoch": 2.1892488954344627, "grad_norm": 0.3390514552593231, "learning_rate": 2.0652350329064012e-06, "loss": 0.3691, "step": 2973 }, { "epoch": 2.1899852724594995, "grad_norm": 0.38476595282554626, "learning_rate": 2.061767071892039e-06, "loss": 0.3884, "step": 2974 }, { "epoch": 2.1907216494845363, "grad_norm": 0.35555726289749146, "learning_rate": 2.0583012686271493e-06, "loss": 0.3852, "step": 2975 }, { "epoch": 2.191458026509573, "grad_norm": 0.3225259482860565, "learning_rate": 2.0548376256569107e-06, "loss": 0.3712, "step": 2976 }, { "epoch": 2.19219440353461, "grad_norm": 0.3521542251110077, "learning_rate": 2.051376145524924e-06, "loss": 0.389, "step": 2977 }, { "epoch": 2.1929307805596467, "grad_norm": 0.3622699975967407, "learning_rate": 2.047916830773187e-06, "loss": 0.3854, "step": 2978 }, { "epoch": 2.1936671575846836, "grad_norm": 0.36842185258865356, "learning_rate": 2.044459683942124e-06, "loss": 0.3563, "step": 2979 }, { "epoch": 2.1944035346097204, "grad_norm": 0.4047960042953491, "learning_rate": 2.041004707570555e-06, "loss": 0.3704, "step": 2980 }, { "epoch": 2.195139911634757, "grad_norm": 0.3790772259235382, "learning_rate": 2.037551904195709e-06, "loss": 0.3918, "step": 2981 }, { "epoch": 2.195876288659794, "grad_norm": 0.31942737102508545, "learning_rate": 2.0341012763532243e-06, "loss": 0.3562, "step": 2982 }, { "epoch": 2.196612665684831, "grad_norm": 0.36221376061439514, "learning_rate": 2.0306528265771357e-06, "loss": 0.3856, "step": 2983 }, { "epoch": 2.1973490427098676, "grad_norm": 0.37802380323410034, "learning_rate": 2.0272065573998794e-06, "loss": 0.3645, "step": 2984 }, { "epoch": 2.1980854197349045, "grad_norm": 0.34147152304649353, "learning_rate": 2.0237624713522945e-06, "loss": 0.3898, "step": 2985 }, { "epoch": 2.1988217967599413, "grad_norm": 0.3583637475967407, "learning_rate": 2.020320570963612e-06, "loss": 0.3768, "step": 2986 }, { "epoch": 2.199558173784978, "grad_norm": 0.3441274166107178, "learning_rate": 2.0168808587614584e-06, "loss": 0.3712, "step": 2987 }, { "epoch": 2.200294550810015, "grad_norm": 0.3442659080028534, "learning_rate": 2.0134433372718565e-06, "loss": 0.3423, "step": 2988 }, { "epoch": 2.2010309278350517, "grad_norm": 0.34774288535118103, "learning_rate": 2.010008009019215e-06, "loss": 0.3854, "step": 2989 }, { "epoch": 2.2017673048600885, "grad_norm": 0.3395615518093109, "learning_rate": 2.0065748765263386e-06, "loss": 0.3523, "step": 2990 }, { "epoch": 2.2025036818851254, "grad_norm": 0.3748759627342224, "learning_rate": 2.003143942314415e-06, "loss": 0.3714, "step": 2991 }, { "epoch": 2.203240058910162, "grad_norm": 0.34823179244995117, "learning_rate": 1.999715208903017e-06, "loss": 0.3958, "step": 2992 }, { "epoch": 2.203976435935199, "grad_norm": 0.32306548953056335, "learning_rate": 1.996288678810105e-06, "loss": 0.3813, "step": 2993 }, { "epoch": 2.204712812960236, "grad_norm": 0.3721083700656891, "learning_rate": 1.9928643545520204e-06, "loss": 0.3786, "step": 2994 }, { "epoch": 2.2054491899852726, "grad_norm": 0.34985750913619995, "learning_rate": 1.989442238643478e-06, "loss": 0.3816, "step": 2995 }, { "epoch": 2.2061855670103094, "grad_norm": 0.3577035367488861, "learning_rate": 1.9860223335975815e-06, "loss": 0.355, "step": 2996 }, { "epoch": 2.2069219440353463, "grad_norm": 0.3292270302772522, "learning_rate": 1.9826046419258037e-06, "loss": 0.3322, "step": 2997 }, { "epoch": 2.207658321060383, "grad_norm": 0.3422560393810272, "learning_rate": 1.9791891661379926e-06, "loss": 0.3514, "step": 2998 }, { "epoch": 2.20839469808542, "grad_norm": 0.3499544560909271, "learning_rate": 1.975775908742374e-06, "loss": 0.3692, "step": 2999 }, { "epoch": 2.2091310751104567, "grad_norm": 0.35650497674942017, "learning_rate": 1.972364872245539e-06, "loss": 0.382, "step": 3000 }, { "epoch": 2.2098674521354935, "grad_norm": 0.3358071446418762, "learning_rate": 1.9689560591524482e-06, "loss": 0.3655, "step": 3001 }, { "epoch": 2.2106038291605303, "grad_norm": 0.3230755925178528, "learning_rate": 1.965549471966436e-06, "loss": 0.3629, "step": 3002 }, { "epoch": 2.211340206185567, "grad_norm": 0.33048614859580994, "learning_rate": 1.96214511318919e-06, "loss": 0.3856, "step": 3003 }, { "epoch": 2.212076583210604, "grad_norm": 0.36040183901786804, "learning_rate": 1.958742985320774e-06, "loss": 0.3601, "step": 3004 }, { "epoch": 2.212812960235641, "grad_norm": 0.3354277014732361, "learning_rate": 1.955343090859606e-06, "loss": 0.3607, "step": 3005 }, { "epoch": 2.2135493372606776, "grad_norm": 0.3312050700187683, "learning_rate": 1.9519454323024644e-06, "loss": 0.3716, "step": 3006 }, { "epoch": 2.2142857142857144, "grad_norm": 0.34683001041412354, "learning_rate": 1.9485500121444896e-06, "loss": 0.3692, "step": 3007 }, { "epoch": 2.2150220913107512, "grad_norm": 0.3769376575946808, "learning_rate": 1.945156832879174e-06, "loss": 0.3664, "step": 3008 }, { "epoch": 2.215758468335788, "grad_norm": 0.33873477578163147, "learning_rate": 1.941765896998365e-06, "loss": 0.3909, "step": 3009 }, { "epoch": 2.216494845360825, "grad_norm": 0.3695991635322571, "learning_rate": 1.938377206992266e-06, "loss": 0.3717, "step": 3010 }, { "epoch": 2.2172312223858617, "grad_norm": 0.3528919816017151, "learning_rate": 1.934990765349427e-06, "loss": 0.3708, "step": 3011 }, { "epoch": 2.2179675994108985, "grad_norm": 0.3522799015045166, "learning_rate": 1.931606574556749e-06, "loss": 0.373, "step": 3012 }, { "epoch": 2.2187039764359353, "grad_norm": 0.3132975101470947, "learning_rate": 1.928224637099479e-06, "loss": 0.35, "step": 3013 }, { "epoch": 2.219440353460972, "grad_norm": 0.35459184646606445, "learning_rate": 1.9248449554612076e-06, "loss": 0.3724, "step": 3014 }, { "epoch": 2.220176730486009, "grad_norm": 0.367097944021225, "learning_rate": 1.9214675321238753e-06, "loss": 0.3754, "step": 3015 }, { "epoch": 2.2209131075110458, "grad_norm": 0.38313212990760803, "learning_rate": 1.9180923695677565e-06, "loss": 0.3632, "step": 3016 }, { "epoch": 2.2216494845360826, "grad_norm": 0.3429620563983917, "learning_rate": 1.9147194702714683e-06, "loss": 0.3543, "step": 3017 }, { "epoch": 2.2223858615611194, "grad_norm": 0.31693652272224426, "learning_rate": 1.911348836711969e-06, "loss": 0.3526, "step": 3018 }, { "epoch": 2.223122238586156, "grad_norm": 0.33426809310913086, "learning_rate": 1.907980471364548e-06, "loss": 0.3795, "step": 3019 }, { "epoch": 2.223858615611193, "grad_norm": 0.3561391532421112, "learning_rate": 1.9046143767028309e-06, "loss": 0.3824, "step": 3020 }, { "epoch": 2.22459499263623, "grad_norm": 0.32854175567626953, "learning_rate": 1.9012505551987764e-06, "loss": 0.3357, "step": 3021 }, { "epoch": 2.2253313696612667, "grad_norm": 0.34215104579925537, "learning_rate": 1.897889009322672e-06, "loss": 0.3826, "step": 3022 }, { "epoch": 2.2260677466863035, "grad_norm": 0.3710014522075653, "learning_rate": 1.8945297415431379e-06, "loss": 0.3799, "step": 3023 }, { "epoch": 2.2268041237113403, "grad_norm": 0.35444051027297974, "learning_rate": 1.8911727543271174e-06, "loss": 0.396, "step": 3024 }, { "epoch": 2.227540500736377, "grad_norm": 0.3570059835910797, "learning_rate": 1.8878180501398796e-06, "loss": 0.3755, "step": 3025 }, { "epoch": 2.228276877761414, "grad_norm": 0.34448522329330444, "learning_rate": 1.88446563144502e-06, "loss": 0.4058, "step": 3026 }, { "epoch": 2.2290132547864507, "grad_norm": 0.3360097408294678, "learning_rate": 1.8811155007044523e-06, "loss": 0.3788, "step": 3027 }, { "epoch": 2.2297496318114876, "grad_norm": 0.35772281885147095, "learning_rate": 1.8777676603784122e-06, "loss": 0.4028, "step": 3028 }, { "epoch": 2.2304860088365244, "grad_norm": 0.31866273283958435, "learning_rate": 1.8744221129254514e-06, "loss": 0.3906, "step": 3029 }, { "epoch": 2.231222385861561, "grad_norm": 0.3577582538127899, "learning_rate": 1.871078860802439e-06, "loss": 0.3542, "step": 3030 }, { "epoch": 2.231958762886598, "grad_norm": 0.34226885437965393, "learning_rate": 1.8677379064645567e-06, "loss": 0.3399, "step": 3031 }, { "epoch": 2.232695139911635, "grad_norm": 0.35378527641296387, "learning_rate": 1.8643992523653043e-06, "loss": 0.3938, "step": 3032 }, { "epoch": 2.2334315169366716, "grad_norm": 0.35914602875709534, "learning_rate": 1.8610629009564863e-06, "loss": 0.3751, "step": 3033 }, { "epoch": 2.2341678939617085, "grad_norm": 0.35784912109375, "learning_rate": 1.8577288546882167e-06, "loss": 0.3771, "step": 3034 }, { "epoch": 2.2349042709867453, "grad_norm": 0.3862041234970093, "learning_rate": 1.8543971160089213e-06, "loss": 0.3834, "step": 3035 }, { "epoch": 2.235640648011782, "grad_norm": 0.38658207654953003, "learning_rate": 1.8510676873653278e-06, "loss": 0.3619, "step": 3036 }, { "epoch": 2.236377025036819, "grad_norm": 0.3452564775943756, "learning_rate": 1.8477405712024671e-06, "loss": 0.3496, "step": 3037 }, { "epoch": 2.2371134020618557, "grad_norm": 0.37417715787887573, "learning_rate": 1.8444157699636728e-06, "loss": 0.3811, "step": 3038 }, { "epoch": 2.2378497790868925, "grad_norm": 0.3347567319869995, "learning_rate": 1.8410932860905767e-06, "loss": 0.3752, "step": 3039 }, { "epoch": 2.2385861561119293, "grad_norm": 0.3563539981842041, "learning_rate": 1.8377731220231144e-06, "loss": 0.3723, "step": 3040 }, { "epoch": 2.239322533136966, "grad_norm": 0.33468982577323914, "learning_rate": 1.834455280199512e-06, "loss": 0.3565, "step": 3041 }, { "epoch": 2.240058910162003, "grad_norm": 0.37427690625190735, "learning_rate": 1.8311397630562905e-06, "loss": 0.3703, "step": 3042 }, { "epoch": 2.24079528718704, "grad_norm": 0.3921281099319458, "learning_rate": 1.8278265730282696e-06, "loss": 0.3775, "step": 3043 }, { "epoch": 2.2415316642120766, "grad_norm": 0.3752121925354004, "learning_rate": 1.824515712548553e-06, "loss": 0.3651, "step": 3044 }, { "epoch": 2.2422680412371134, "grad_norm": 0.34277763962745667, "learning_rate": 1.821207184048538e-06, "loss": 0.3717, "step": 3045 }, { "epoch": 2.2430044182621502, "grad_norm": 0.36394500732421875, "learning_rate": 1.8179009899579069e-06, "loss": 0.3698, "step": 3046 }, { "epoch": 2.243740795287187, "grad_norm": 0.3427956700325012, "learning_rate": 1.8145971327046274e-06, "loss": 0.377, "step": 3047 }, { "epoch": 2.244477172312224, "grad_norm": 0.3212227523326874, "learning_rate": 1.8112956147149558e-06, "loss": 0.3747, "step": 3048 }, { "epoch": 2.2452135493372607, "grad_norm": 0.3556148409843445, "learning_rate": 1.8079964384134252e-06, "loss": 0.3795, "step": 3049 }, { "epoch": 2.2459499263622975, "grad_norm": 0.3533985912799835, "learning_rate": 1.80469960622285e-06, "loss": 0.3946, "step": 3050 }, { "epoch": 2.2466863033873343, "grad_norm": 0.32733723521232605, "learning_rate": 1.8014051205643268e-06, "loss": 0.3854, "step": 3051 }, { "epoch": 2.247422680412371, "grad_norm": 0.35768672823905945, "learning_rate": 1.7981129838572248e-06, "loss": 0.3638, "step": 3052 }, { "epoch": 2.248159057437408, "grad_norm": 0.36508840322494507, "learning_rate": 1.79482319851919e-06, "loss": 0.3859, "step": 3053 }, { "epoch": 2.2488954344624448, "grad_norm": 0.33305230736732483, "learning_rate": 1.7915357669661409e-06, "loss": 0.3635, "step": 3054 }, { "epoch": 2.2496318114874816, "grad_norm": 0.324288934469223, "learning_rate": 1.7882506916122683e-06, "loss": 0.3887, "step": 3055 }, { "epoch": 2.2503681885125184, "grad_norm": 0.3586910665035248, "learning_rate": 1.7849679748700305e-06, "loss": 0.3818, "step": 3056 }, { "epoch": 2.2511045655375552, "grad_norm": 0.3524543344974518, "learning_rate": 1.7816876191501587e-06, "loss": 0.4012, "step": 3057 }, { "epoch": 2.251840942562592, "grad_norm": 0.3519776463508606, "learning_rate": 1.7784096268616453e-06, "loss": 0.3727, "step": 3058 }, { "epoch": 2.252577319587629, "grad_norm": 0.36303767561912537, "learning_rate": 1.7751340004117468e-06, "loss": 0.349, "step": 3059 }, { "epoch": 2.2533136966126657, "grad_norm": 0.3571464717388153, "learning_rate": 1.771860742205988e-06, "loss": 0.3939, "step": 3060 }, { "epoch": 2.2540500736377025, "grad_norm": 0.33723917603492737, "learning_rate": 1.7685898546481495e-06, "loss": 0.3779, "step": 3061 }, { "epoch": 2.2547864506627393, "grad_norm": 0.3464319109916687, "learning_rate": 1.7653213401402718e-06, "loss": 0.36, "step": 3062 }, { "epoch": 2.255522827687776, "grad_norm": 0.3434007167816162, "learning_rate": 1.7620552010826535e-06, "loss": 0.3766, "step": 3063 }, { "epoch": 2.256259204712813, "grad_norm": 0.3645339608192444, "learning_rate": 1.7587914398738466e-06, "loss": 0.3689, "step": 3064 }, { "epoch": 2.2569955817378498, "grad_norm": 0.33665645122528076, "learning_rate": 1.7555300589106616e-06, "loss": 0.397, "step": 3065 }, { "epoch": 2.2577319587628866, "grad_norm": 0.312653124332428, "learning_rate": 1.752271060588157e-06, "loss": 0.3567, "step": 3066 }, { "epoch": 2.2584683357879234, "grad_norm": 0.34965354204177856, "learning_rate": 1.7490144472996412e-06, "loss": 0.3739, "step": 3067 }, { "epoch": 2.25920471281296, "grad_norm": 0.30937907099723816, "learning_rate": 1.7457602214366754e-06, "loss": 0.3815, "step": 3068 }, { "epoch": 2.259941089837997, "grad_norm": 0.3516156077384949, "learning_rate": 1.7425083853890628e-06, "loss": 0.3779, "step": 3069 }, { "epoch": 2.260677466863034, "grad_norm": 0.36380839347839355, "learning_rate": 1.7392589415448546e-06, "loss": 0.3853, "step": 3070 }, { "epoch": 2.2614138438880707, "grad_norm": 0.3635159134864807, "learning_rate": 1.736011892290343e-06, "loss": 0.3666, "step": 3071 }, { "epoch": 2.2621502209131075, "grad_norm": 0.3385698199272156, "learning_rate": 1.732767240010062e-06, "loss": 0.3674, "step": 3072 }, { "epoch": 2.2628865979381443, "grad_norm": 0.3308032155036926, "learning_rate": 1.7295249870867898e-06, "loss": 0.3906, "step": 3073 }, { "epoch": 2.263622974963181, "grad_norm": 0.3098700940608978, "learning_rate": 1.726285135901536e-06, "loss": 0.3532, "step": 3074 }, { "epoch": 2.264359351988218, "grad_norm": 0.33167368173599243, "learning_rate": 1.7230476888335484e-06, "loss": 0.3623, "step": 3075 }, { "epoch": 2.2650957290132547, "grad_norm": 0.3469424247741699, "learning_rate": 1.7198126482603144e-06, "loss": 0.3977, "step": 3076 }, { "epoch": 2.2658321060382915, "grad_norm": 0.3413713574409485, "learning_rate": 1.7165800165575475e-06, "loss": 0.3831, "step": 3077 }, { "epoch": 2.2665684830633284, "grad_norm": 0.3475739061832428, "learning_rate": 1.7133497960991945e-06, "loss": 0.3655, "step": 3078 }, { "epoch": 2.267304860088365, "grad_norm": 0.3709220886230469, "learning_rate": 1.7101219892574321e-06, "loss": 0.3622, "step": 3079 }, { "epoch": 2.268041237113402, "grad_norm": 0.3361673951148987, "learning_rate": 1.706896598402663e-06, "loss": 0.3978, "step": 3080 }, { "epoch": 2.268777614138439, "grad_norm": 0.3467946946620941, "learning_rate": 1.7036736259035197e-06, "loss": 0.3752, "step": 3081 }, { "epoch": 2.2695139911634756, "grad_norm": 0.35126227140426636, "learning_rate": 1.7004530741268532e-06, "loss": 0.394, "step": 3082 }, { "epoch": 2.2702503681885124, "grad_norm": 0.36080968379974365, "learning_rate": 1.697234945437739e-06, "loss": 0.3753, "step": 3083 }, { "epoch": 2.2709867452135493, "grad_norm": 0.35190704464912415, "learning_rate": 1.6940192421994766e-06, "loss": 0.3586, "step": 3084 }, { "epoch": 2.271723122238586, "grad_norm": 0.31936824321746826, "learning_rate": 1.6908059667735793e-06, "loss": 0.3687, "step": 3085 }, { "epoch": 2.272459499263623, "grad_norm": 0.33945798873901367, "learning_rate": 1.6875951215197779e-06, "loss": 0.3583, "step": 3086 }, { "epoch": 2.2731958762886597, "grad_norm": 0.3654190003871918, "learning_rate": 1.6843867087960252e-06, "loss": 0.3991, "step": 3087 }, { "epoch": 2.2739322533136965, "grad_norm": 0.33993858098983765, "learning_rate": 1.6811807309584776e-06, "loss": 0.3679, "step": 3088 }, { "epoch": 2.2746686303387333, "grad_norm": 0.35775047540664673, "learning_rate": 1.6779771903615083e-06, "loss": 0.3821, "step": 3089 }, { "epoch": 2.27540500736377, "grad_norm": 0.33738553524017334, "learning_rate": 1.6747760893577037e-06, "loss": 0.3891, "step": 3090 }, { "epoch": 2.276141384388807, "grad_norm": 0.39476099610328674, "learning_rate": 1.6715774302978544e-06, "loss": 0.3794, "step": 3091 }, { "epoch": 2.276877761413844, "grad_norm": 0.3633752167224884, "learning_rate": 1.6683812155309577e-06, "loss": 0.3904, "step": 3092 }, { "epoch": 2.2776141384388806, "grad_norm": 0.31952276825904846, "learning_rate": 1.665187447404219e-06, "loss": 0.3837, "step": 3093 }, { "epoch": 2.2783505154639174, "grad_norm": 0.33152905106544495, "learning_rate": 1.6619961282630453e-06, "loss": 0.3555, "step": 3094 }, { "epoch": 2.2790868924889542, "grad_norm": 0.3404163420200348, "learning_rate": 1.6588072604510435e-06, "loss": 0.3725, "step": 3095 }, { "epoch": 2.279823269513991, "grad_norm": 0.36314231157302856, "learning_rate": 1.6556208463100226e-06, "loss": 0.3619, "step": 3096 }, { "epoch": 2.280559646539028, "grad_norm": 0.38192078471183777, "learning_rate": 1.6524368881799863e-06, "loss": 0.3697, "step": 3097 }, { "epoch": 2.2812960235640647, "grad_norm": 0.3242843747138977, "learning_rate": 1.6492553883991418e-06, "loss": 0.3806, "step": 3098 }, { "epoch": 2.2820324005891015, "grad_norm": 0.33095407485961914, "learning_rate": 1.646076349303884e-06, "loss": 0.382, "step": 3099 }, { "epoch": 2.2827687776141383, "grad_norm": 0.371073454618454, "learning_rate": 1.642899773228801e-06, "loss": 0.4286, "step": 3100 }, { "epoch": 2.283505154639175, "grad_norm": 0.35626521706581116, "learning_rate": 1.6397256625066787e-06, "loss": 0.3677, "step": 3101 }, { "epoch": 2.284241531664212, "grad_norm": 0.3136880695819855, "learning_rate": 1.6365540194684853e-06, "loss": 0.3528, "step": 3102 }, { "epoch": 2.2849779086892488, "grad_norm": 0.35781604051589966, "learning_rate": 1.633384846443381e-06, "loss": 0.3813, "step": 3103 }, { "epoch": 2.2857142857142856, "grad_norm": 0.3532848358154297, "learning_rate": 1.6302181457587092e-06, "loss": 0.368, "step": 3104 }, { "epoch": 2.2864506627393224, "grad_norm": 0.3363364040851593, "learning_rate": 1.6270539197399988e-06, "loss": 0.3679, "step": 3105 }, { "epoch": 2.287187039764359, "grad_norm": 0.347859263420105, "learning_rate": 1.6238921707109639e-06, "loss": 0.371, "step": 3106 }, { "epoch": 2.287923416789396, "grad_norm": 0.3195333480834961, "learning_rate": 1.620732900993497e-06, "loss": 0.3878, "step": 3107 }, { "epoch": 2.288659793814433, "grad_norm": 0.35446897149086, "learning_rate": 1.6175761129076673e-06, "loss": 0.4139, "step": 3108 }, { "epoch": 2.2893961708394697, "grad_norm": 0.35336872935295105, "learning_rate": 1.614421808771729e-06, "loss": 0.3528, "step": 3109 }, { "epoch": 2.2901325478645065, "grad_norm": 0.3762224018573761, "learning_rate": 1.6112699909021057e-06, "loss": 0.3475, "step": 3110 }, { "epoch": 2.2908689248895433, "grad_norm": 0.39090195298194885, "learning_rate": 1.608120661613396e-06, "loss": 0.3518, "step": 3111 }, { "epoch": 2.29160530191458, "grad_norm": 0.33557429909706116, "learning_rate": 1.604973823218376e-06, "loss": 0.3848, "step": 3112 }, { "epoch": 2.292341678939617, "grad_norm": 0.3590867519378662, "learning_rate": 1.6018294780279848e-06, "loss": 0.3553, "step": 3113 }, { "epoch": 2.2930780559646537, "grad_norm": 0.3268950581550598, "learning_rate": 1.598687628351334e-06, "loss": 0.3818, "step": 3114 }, { "epoch": 2.2938144329896906, "grad_norm": 0.3529122769832611, "learning_rate": 1.5955482764957063e-06, "loss": 0.3755, "step": 3115 }, { "epoch": 2.2945508100147274, "grad_norm": 0.32063764333724976, "learning_rate": 1.5924114247665457e-06, "loss": 0.3589, "step": 3116 }, { "epoch": 2.295287187039764, "grad_norm": 0.3151834309101105, "learning_rate": 1.5892770754674596e-06, "loss": 0.3749, "step": 3117 }, { "epoch": 2.296023564064801, "grad_norm": 0.3227955996990204, "learning_rate": 1.5861452309002219e-06, "loss": 0.3449, "step": 3118 }, { "epoch": 2.296759941089838, "grad_norm": 0.3374328315258026, "learning_rate": 1.5830158933647638e-06, "loss": 0.3702, "step": 3119 }, { "epoch": 2.2974963181148746, "grad_norm": 0.34967485070228577, "learning_rate": 1.5798890651591759e-06, "loss": 0.3723, "step": 3120 }, { "epoch": 2.2982326951399115, "grad_norm": 0.32275229692459106, "learning_rate": 1.576764748579706e-06, "loss": 0.3772, "step": 3121 }, { "epoch": 2.2989690721649483, "grad_norm": 0.3539294898509979, "learning_rate": 1.5736429459207569e-06, "loss": 0.3473, "step": 3122 }, { "epoch": 2.299705449189985, "grad_norm": 0.35446813702583313, "learning_rate": 1.570523659474889e-06, "loss": 0.3826, "step": 3123 }, { "epoch": 2.300441826215022, "grad_norm": 0.3261672258377075, "learning_rate": 1.5674068915328105e-06, "loss": 0.3832, "step": 3124 }, { "epoch": 2.3011782032400587, "grad_norm": 0.32315734028816223, "learning_rate": 1.56429264438338e-06, "loss": 0.3884, "step": 3125 }, { "epoch": 2.3019145802650955, "grad_norm": 0.32102611660957336, "learning_rate": 1.561180920313609e-06, "loss": 0.401, "step": 3126 }, { "epoch": 2.3026509572901324, "grad_norm": 0.3752593398094177, "learning_rate": 1.5580717216086533e-06, "loss": 0.3684, "step": 3127 }, { "epoch": 2.303387334315169, "grad_norm": 0.331100732088089, "learning_rate": 1.5549650505518115e-06, "loss": 0.3856, "step": 3128 }, { "epoch": 2.304123711340206, "grad_norm": 0.3341582417488098, "learning_rate": 1.5518609094245351e-06, "loss": 0.3876, "step": 3129 }, { "epoch": 2.304860088365243, "grad_norm": 0.3470800817012787, "learning_rate": 1.5487593005064038e-06, "loss": 0.3762, "step": 3130 }, { "epoch": 2.3055964653902796, "grad_norm": 0.3255835175514221, "learning_rate": 1.5456602260751513e-06, "loss": 0.3874, "step": 3131 }, { "epoch": 2.3063328424153164, "grad_norm": 0.317374587059021, "learning_rate": 1.5425636884066426e-06, "loss": 0.4035, "step": 3132 }, { "epoch": 2.3070692194403533, "grad_norm": 0.3365512788295746, "learning_rate": 1.539469689774879e-06, "loss": 0.3671, "step": 3133 }, { "epoch": 2.30780559646539, "grad_norm": 0.34714415669441223, "learning_rate": 1.5363782324520033e-06, "loss": 0.3717, "step": 3134 }, { "epoch": 2.308541973490427, "grad_norm": 0.3375949263572693, "learning_rate": 1.5332893187082864e-06, "loss": 0.3973, "step": 3135 }, { "epoch": 2.3092783505154637, "grad_norm": 0.305399626493454, "learning_rate": 1.5302029508121325e-06, "loss": 0.38, "step": 3136 }, { "epoch": 2.3100147275405005, "grad_norm": 0.3382294774055481, "learning_rate": 1.5271191310300803e-06, "loss": 0.3695, "step": 3137 }, { "epoch": 2.3107511045655373, "grad_norm": 0.35040998458862305, "learning_rate": 1.5240378616267887e-06, "loss": 0.3784, "step": 3138 }, { "epoch": 2.311487481590574, "grad_norm": 0.3383118510246277, "learning_rate": 1.5209591448650535e-06, "loss": 0.3911, "step": 3139 }, { "epoch": 2.312223858615611, "grad_norm": 0.32253560423851013, "learning_rate": 1.5178829830057883e-06, "loss": 0.3547, "step": 3140 }, { "epoch": 2.312960235640648, "grad_norm": 0.37971949577331543, "learning_rate": 1.5148093783080337e-06, "loss": 0.3687, "step": 3141 }, { "epoch": 2.3136966126656846, "grad_norm": 0.34527388215065, "learning_rate": 1.5117383330289542e-06, "loss": 0.4067, "step": 3142 }, { "epoch": 2.3144329896907214, "grad_norm": 0.3223811686038971, "learning_rate": 1.5086698494238316e-06, "loss": 0.3777, "step": 3143 }, { "epoch": 2.3151693667157582, "grad_norm": 0.3294554054737091, "learning_rate": 1.5056039297460656e-06, "loss": 0.3806, "step": 3144 }, { "epoch": 2.315905743740795, "grad_norm": 0.4138847291469574, "learning_rate": 1.5025405762471795e-06, "loss": 0.366, "step": 3145 }, { "epoch": 2.316642120765832, "grad_norm": 0.3485645353794098, "learning_rate": 1.4994797911768034e-06, "loss": 0.4099, "step": 3146 }, { "epoch": 2.3173784977908687, "grad_norm": 0.3449089527130127, "learning_rate": 1.4964215767826846e-06, "loss": 0.3593, "step": 3147 }, { "epoch": 2.3181148748159055, "grad_norm": 0.3338415324687958, "learning_rate": 1.4933659353106872e-06, "loss": 0.363, "step": 3148 }, { "epoch": 2.3188512518409423, "grad_norm": 0.33928152918815613, "learning_rate": 1.4903128690047802e-06, "loss": 0.3571, "step": 3149 }, { "epoch": 2.319587628865979, "grad_norm": 0.3429686725139618, "learning_rate": 1.4872623801070413e-06, "loss": 0.3747, "step": 3150 }, { "epoch": 2.3203240058910164, "grad_norm": 0.32575222849845886, "learning_rate": 1.4842144708576606e-06, "loss": 0.3772, "step": 3151 }, { "epoch": 2.321060382916053, "grad_norm": 0.34492257237434387, "learning_rate": 1.4811691434949293e-06, "loss": 0.3755, "step": 3152 }, { "epoch": 2.32179675994109, "grad_norm": 0.3562105596065521, "learning_rate": 1.4781264002552425e-06, "loss": 0.3717, "step": 3153 }, { "epoch": 2.322533136966127, "grad_norm": 0.3609287440776825, "learning_rate": 1.4750862433731028e-06, "loss": 0.357, "step": 3154 }, { "epoch": 2.3232695139911637, "grad_norm": 0.33014407753944397, "learning_rate": 1.4720486750811035e-06, "loss": 0.3536, "step": 3155 }, { "epoch": 2.3240058910162005, "grad_norm": 0.3423437476158142, "learning_rate": 1.4690136976099479e-06, "loss": 0.3642, "step": 3156 }, { "epoch": 2.3247422680412373, "grad_norm": 0.35252103209495544, "learning_rate": 1.4659813131884304e-06, "loss": 0.3835, "step": 3157 }, { "epoch": 2.325478645066274, "grad_norm": 0.3483889400959015, "learning_rate": 1.46295152404344e-06, "loss": 0.3714, "step": 3158 }, { "epoch": 2.326215022091311, "grad_norm": 0.3345074951648712, "learning_rate": 1.4599243323999668e-06, "loss": 0.365, "step": 3159 }, { "epoch": 2.3269513991163477, "grad_norm": 0.334255188703537, "learning_rate": 1.4568997404810858e-06, "loss": 0.3851, "step": 3160 }, { "epoch": 2.3276877761413846, "grad_norm": 0.33251067996025085, "learning_rate": 1.4538777505079654e-06, "loss": 0.3928, "step": 3161 }, { "epoch": 2.3284241531664214, "grad_norm": 0.3211064040660858, "learning_rate": 1.4508583646998674e-06, "loss": 0.4159, "step": 3162 }, { "epoch": 2.329160530191458, "grad_norm": 0.33041635155677795, "learning_rate": 1.4478415852741328e-06, "loss": 0.349, "step": 3163 }, { "epoch": 2.329896907216495, "grad_norm": 0.31442853808403015, "learning_rate": 1.4448274144461965e-06, "loss": 0.3824, "step": 3164 }, { "epoch": 2.330633284241532, "grad_norm": 0.3113677501678467, "learning_rate": 1.4418158544295734e-06, "loss": 0.3695, "step": 3165 }, { "epoch": 2.3313696612665686, "grad_norm": 0.33691975474357605, "learning_rate": 1.4388069074358612e-06, "loss": 0.3742, "step": 3166 }, { "epoch": 2.3321060382916055, "grad_norm": 0.33256834745407104, "learning_rate": 1.4358005756747417e-06, "loss": 0.341, "step": 3167 }, { "epoch": 2.3328424153166423, "grad_norm": 0.33882421255111694, "learning_rate": 1.4327968613539734e-06, "loss": 0.3785, "step": 3168 }, { "epoch": 2.333578792341679, "grad_norm": 0.33136385679244995, "learning_rate": 1.429795766679391e-06, "loss": 0.3777, "step": 3169 }, { "epoch": 2.334315169366716, "grad_norm": 0.31647056341171265, "learning_rate": 1.426797293854912e-06, "loss": 0.3867, "step": 3170 }, { "epoch": 2.3350515463917527, "grad_norm": 0.3487012982368469, "learning_rate": 1.4238014450825227e-06, "loss": 0.3682, "step": 3171 }, { "epoch": 2.3357879234167895, "grad_norm": 0.3635872006416321, "learning_rate": 1.4208082225622804e-06, "loss": 0.385, "step": 3172 }, { "epoch": 2.3365243004418264, "grad_norm": 0.35022327303886414, "learning_rate": 1.4178176284923212e-06, "loss": 0.4066, "step": 3173 }, { "epoch": 2.337260677466863, "grad_norm": 0.35386213660240173, "learning_rate": 1.4148296650688465e-06, "loss": 0.3982, "step": 3174 }, { "epoch": 2.3379970544919, "grad_norm": 0.3819175660610199, "learning_rate": 1.4118443344861237e-06, "loss": 0.3898, "step": 3175 }, { "epoch": 2.338733431516937, "grad_norm": 0.36631712317466736, "learning_rate": 1.408861638936493e-06, "loss": 0.3976, "step": 3176 }, { "epoch": 2.3394698085419736, "grad_norm": 0.3479391634464264, "learning_rate": 1.4058815806103542e-06, "loss": 0.3825, "step": 3177 }, { "epoch": 2.3402061855670104, "grad_norm": 0.35590627789497375, "learning_rate": 1.4029041616961703e-06, "loss": 0.3747, "step": 3178 }, { "epoch": 2.3409425625920472, "grad_norm": 0.3458181321620941, "learning_rate": 1.3999293843804728e-06, "loss": 0.3736, "step": 3179 }, { "epoch": 2.341678939617084, "grad_norm": 0.33873867988586426, "learning_rate": 1.3969572508478424e-06, "loss": 0.373, "step": 3180 }, { "epoch": 2.342415316642121, "grad_norm": 0.33975568413734436, "learning_rate": 1.3939877632809279e-06, "loss": 0.3431, "step": 3181 }, { "epoch": 2.3431516936671577, "grad_norm": 0.3956592082977295, "learning_rate": 1.3910209238604306e-06, "loss": 0.3857, "step": 3182 }, { "epoch": 2.3438880706921945, "grad_norm": 0.35657671093940735, "learning_rate": 1.3880567347651052e-06, "loss": 0.3717, "step": 3183 }, { "epoch": 2.3446244477172313, "grad_norm": 0.34503045678138733, "learning_rate": 1.3850951981717665e-06, "loss": 0.3607, "step": 3184 }, { "epoch": 2.345360824742268, "grad_norm": 0.31892770528793335, "learning_rate": 1.3821363162552753e-06, "loss": 0.3666, "step": 3185 }, { "epoch": 2.346097201767305, "grad_norm": 0.3312123715877533, "learning_rate": 1.3791800911885444e-06, "loss": 0.3279, "step": 3186 }, { "epoch": 2.346833578792342, "grad_norm": 0.37443795800209045, "learning_rate": 1.3762265251425394e-06, "loss": 0.3779, "step": 3187 }, { "epoch": 2.3475699558173786, "grad_norm": 0.3352743685245514, "learning_rate": 1.373275620286265e-06, "loss": 0.3839, "step": 3188 }, { "epoch": 2.3483063328424154, "grad_norm": 0.3558715879917145, "learning_rate": 1.370327378786781e-06, "loss": 0.3788, "step": 3189 }, { "epoch": 2.3490427098674522, "grad_norm": 0.38027918338775635, "learning_rate": 1.367381802809185e-06, "loss": 0.4096, "step": 3190 }, { "epoch": 2.349779086892489, "grad_norm": 0.31835508346557617, "learning_rate": 1.3644388945166175e-06, "loss": 0.4012, "step": 3191 }, { "epoch": 2.350515463917526, "grad_norm": 0.31716689467430115, "learning_rate": 1.3614986560702648e-06, "loss": 0.3676, "step": 3192 }, { "epoch": 2.3512518409425627, "grad_norm": 0.3417465090751648, "learning_rate": 1.3585610896293472e-06, "loss": 0.369, "step": 3193 }, { "epoch": 2.3519882179675995, "grad_norm": 0.3349650800228119, "learning_rate": 1.3556261973511236e-06, "loss": 0.3702, "step": 3194 }, { "epoch": 2.3527245949926363, "grad_norm": 0.38491392135620117, "learning_rate": 1.3526939813908929e-06, "loss": 0.3594, "step": 3195 }, { "epoch": 2.353460972017673, "grad_norm": 0.3387065529823303, "learning_rate": 1.349764443901984e-06, "loss": 0.3557, "step": 3196 }, { "epoch": 2.35419734904271, "grad_norm": 0.33061766624450684, "learning_rate": 1.346837587035762e-06, "loss": 0.3777, "step": 3197 }, { "epoch": 2.3549337260677468, "grad_norm": 0.34983956813812256, "learning_rate": 1.343913412941621e-06, "loss": 0.392, "step": 3198 }, { "epoch": 2.3556701030927836, "grad_norm": 0.3425159454345703, "learning_rate": 1.3409919237669843e-06, "loss": 0.3779, "step": 3199 }, { "epoch": 2.3564064801178204, "grad_norm": 0.3214556574821472, "learning_rate": 1.33807312165731e-06, "loss": 0.3965, "step": 3200 }, { "epoch": 2.357142857142857, "grad_norm": 0.32517537474632263, "learning_rate": 1.335157008756075e-06, "loss": 0.3657, "step": 3201 }, { "epoch": 2.357879234167894, "grad_norm": 0.3494378626346588, "learning_rate": 1.3322435872047835e-06, "loss": 0.3905, "step": 3202 }, { "epoch": 2.358615611192931, "grad_norm": 0.39801037311553955, "learning_rate": 1.329332859142967e-06, "loss": 0.3801, "step": 3203 }, { "epoch": 2.3593519882179677, "grad_norm": 0.34806138277053833, "learning_rate": 1.326424826708177e-06, "loss": 0.3711, "step": 3204 }, { "epoch": 2.3600883652430045, "grad_norm": 0.37914684414863586, "learning_rate": 1.3235194920359795e-06, "loss": 0.3568, "step": 3205 }, { "epoch": 2.3608247422680413, "grad_norm": 0.3150370419025421, "learning_rate": 1.3206168572599692e-06, "loss": 0.3577, "step": 3206 }, { "epoch": 2.361561119293078, "grad_norm": 0.3247841000556946, "learning_rate": 1.3177169245117522e-06, "loss": 0.3998, "step": 3207 }, { "epoch": 2.362297496318115, "grad_norm": 0.3235202729701996, "learning_rate": 1.3148196959209491e-06, "loss": 0.3604, "step": 3208 }, { "epoch": 2.3630338733431517, "grad_norm": 0.3443073034286499, "learning_rate": 1.3119251736152005e-06, "loss": 0.3561, "step": 3209 }, { "epoch": 2.3637702503681886, "grad_norm": 0.3384738266468048, "learning_rate": 1.309033359720155e-06, "loss": 0.3632, "step": 3210 }, { "epoch": 2.3645066273932254, "grad_norm": 0.3304847478866577, "learning_rate": 1.3061442563594718e-06, "loss": 0.3745, "step": 3211 }, { "epoch": 2.365243004418262, "grad_norm": 0.3340161144733429, "learning_rate": 1.3032578656548228e-06, "loss": 0.3665, "step": 3212 }, { "epoch": 2.365979381443299, "grad_norm": 0.3465518355369568, "learning_rate": 1.3003741897258864e-06, "loss": 0.3678, "step": 3213 }, { "epoch": 2.366715758468336, "grad_norm": 0.31364724040031433, "learning_rate": 1.297493230690346e-06, "loss": 0.3602, "step": 3214 }, { "epoch": 2.3674521354933726, "grad_norm": 0.3540259599685669, "learning_rate": 1.2946149906638905e-06, "loss": 0.3733, "step": 3215 }, { "epoch": 2.3681885125184094, "grad_norm": 0.34416845440864563, "learning_rate": 1.2917394717602123e-06, "loss": 0.3512, "step": 3216 }, { "epoch": 2.3689248895434463, "grad_norm": 0.37289807200431824, "learning_rate": 1.2888666760910074e-06, "loss": 0.3755, "step": 3217 }, { "epoch": 2.369661266568483, "grad_norm": 0.3461972773075104, "learning_rate": 1.285996605765969e-06, "loss": 0.4092, "step": 3218 }, { "epoch": 2.37039764359352, "grad_norm": 0.36387526988983154, "learning_rate": 1.283129262892789e-06, "loss": 0.3788, "step": 3219 }, { "epoch": 2.3711340206185567, "grad_norm": 0.32595691084861755, "learning_rate": 1.2802646495771592e-06, "loss": 0.3588, "step": 3220 }, { "epoch": 2.3718703976435935, "grad_norm": 0.332273006439209, "learning_rate": 1.2774027679227647e-06, "loss": 0.3772, "step": 3221 }, { "epoch": 2.3726067746686303, "grad_norm": 0.3550896942615509, "learning_rate": 1.2745436200312844e-06, "loss": 0.375, "step": 3222 }, { "epoch": 2.373343151693667, "grad_norm": 0.3492611348628998, "learning_rate": 1.2716872080023901e-06, "loss": 0.3792, "step": 3223 }, { "epoch": 2.374079528718704, "grad_norm": 0.33285483717918396, "learning_rate": 1.2688335339337433e-06, "loss": 0.3837, "step": 3224 }, { "epoch": 2.374815905743741, "grad_norm": 0.3760431110858917, "learning_rate": 1.2659825999209985e-06, "loss": 0.3811, "step": 3225 }, { "epoch": 2.3755522827687776, "grad_norm": 0.33756300806999207, "learning_rate": 1.263134408057794e-06, "loss": 0.3708, "step": 3226 }, { "epoch": 2.3762886597938144, "grad_norm": 0.36119866371154785, "learning_rate": 1.2602889604357548e-06, "loss": 0.3681, "step": 3227 }, { "epoch": 2.3770250368188512, "grad_norm": 0.33655065298080444, "learning_rate": 1.257446259144494e-06, "loss": 0.385, "step": 3228 }, { "epoch": 2.377761413843888, "grad_norm": 0.3267086446285248, "learning_rate": 1.2546063062716069e-06, "loss": 0.3656, "step": 3229 }, { "epoch": 2.378497790868925, "grad_norm": 0.3475969731807709, "learning_rate": 1.2517691039026625e-06, "loss": 0.3735, "step": 3230 }, { "epoch": 2.3792341678939617, "grad_norm": 0.3716298043727875, "learning_rate": 1.2489346541212226e-06, "loss": 0.4026, "step": 3231 }, { "epoch": 2.3799705449189985, "grad_norm": 0.39196276664733887, "learning_rate": 1.2461029590088198e-06, "loss": 0.3645, "step": 3232 }, { "epoch": 2.3807069219440353, "grad_norm": 0.34049636125564575, "learning_rate": 1.2432740206449629e-06, "loss": 0.3597, "step": 3233 }, { "epoch": 2.381443298969072, "grad_norm": 0.3242121636867523, "learning_rate": 1.240447841107143e-06, "loss": 0.397, "step": 3234 }, { "epoch": 2.382179675994109, "grad_norm": 0.3249550759792328, "learning_rate": 1.2376244224708183e-06, "loss": 0.3732, "step": 3235 }, { "epoch": 2.3829160530191458, "grad_norm": 0.3562273681163788, "learning_rate": 1.2348037668094214e-06, "loss": 0.3755, "step": 3236 }, { "epoch": 2.3836524300441826, "grad_norm": 0.3847813904285431, "learning_rate": 1.2319858761943598e-06, "loss": 0.3614, "step": 3237 }, { "epoch": 2.3843888070692194, "grad_norm": 0.32016924023628235, "learning_rate": 1.2291707526950047e-06, "loss": 0.3499, "step": 3238 }, { "epoch": 2.3851251840942562, "grad_norm": 0.38904672861099243, "learning_rate": 1.2263583983786986e-06, "loss": 0.3994, "step": 3239 }, { "epoch": 2.385861561119293, "grad_norm": 0.34415403008461, "learning_rate": 1.2235488153107488e-06, "loss": 0.3723, "step": 3240 }, { "epoch": 2.38659793814433, "grad_norm": 0.3804604709148407, "learning_rate": 1.2207420055544278e-06, "loss": 0.3707, "step": 3241 }, { "epoch": 2.3873343151693667, "grad_norm": 0.34501025080680847, "learning_rate": 1.2179379711709738e-06, "loss": 0.4066, "step": 3242 }, { "epoch": 2.3880706921944035, "grad_norm": 0.35405653715133667, "learning_rate": 1.2151367142195842e-06, "loss": 0.376, "step": 3243 }, { "epoch": 2.3888070692194403, "grad_norm": 0.3774116039276123, "learning_rate": 1.212338236757415e-06, "loss": 0.3763, "step": 3244 }, { "epoch": 2.389543446244477, "grad_norm": 0.35209256410598755, "learning_rate": 1.2095425408395873e-06, "loss": 0.3543, "step": 3245 }, { "epoch": 2.390279823269514, "grad_norm": 0.36879295110702515, "learning_rate": 1.2067496285191743e-06, "loss": 0.3765, "step": 3246 }, { "epoch": 2.3910162002945508, "grad_norm": 0.3600074052810669, "learning_rate": 1.2039595018472055e-06, "loss": 0.377, "step": 3247 }, { "epoch": 2.3917525773195876, "grad_norm": 0.3357848525047302, "learning_rate": 1.2011721628726663e-06, "loss": 0.3659, "step": 3248 }, { "epoch": 2.3924889543446244, "grad_norm": 0.3500244617462158, "learning_rate": 1.1983876136424926e-06, "loss": 0.3847, "step": 3249 }, { "epoch": 2.393225331369661, "grad_norm": 0.3420659303665161, "learning_rate": 1.1956058562015766e-06, "loss": 0.392, "step": 3250 }, { "epoch": 2.393961708394698, "grad_norm": 0.33460474014282227, "learning_rate": 1.192826892592755e-06, "loss": 0.3708, "step": 3251 }, { "epoch": 2.394698085419735, "grad_norm": 0.34583914279937744, "learning_rate": 1.1900507248568128e-06, "loss": 0.349, "step": 3252 }, { "epoch": 2.3954344624447717, "grad_norm": 0.36606159806251526, "learning_rate": 1.1872773550324873e-06, "loss": 0.3739, "step": 3253 }, { "epoch": 2.3961708394698085, "grad_norm": 0.34434834122657776, "learning_rate": 1.1845067851564557e-06, "loss": 0.3459, "step": 3254 }, { "epoch": 2.3969072164948453, "grad_norm": 0.3151196539402008, "learning_rate": 1.1817390172633402e-06, "loss": 0.3635, "step": 3255 }, { "epoch": 2.397643593519882, "grad_norm": 0.32296085357666016, "learning_rate": 1.1789740533857075e-06, "loss": 0.3863, "step": 3256 }, { "epoch": 2.398379970544919, "grad_norm": 0.3849099576473236, "learning_rate": 1.1762118955540609e-06, "loss": 0.3943, "step": 3257 }, { "epoch": 2.3991163475699557, "grad_norm": 0.3584287762641907, "learning_rate": 1.1734525457968488e-06, "loss": 0.3823, "step": 3258 }, { "epoch": 2.3998527245949925, "grad_norm": 0.33477911353111267, "learning_rate": 1.1706960061404527e-06, "loss": 0.37, "step": 3259 }, { "epoch": 2.4005891016200294, "grad_norm": 0.3298894762992859, "learning_rate": 1.1679422786091909e-06, "loss": 0.3887, "step": 3260 }, { "epoch": 2.401325478645066, "grad_norm": 0.3459241986274719, "learning_rate": 1.1651913652253199e-06, "loss": 0.3546, "step": 3261 }, { "epoch": 2.402061855670103, "grad_norm": 0.3287370502948761, "learning_rate": 1.162443268009027e-06, "loss": 0.3604, "step": 3262 }, { "epoch": 2.40279823269514, "grad_norm": 0.33065786957740784, "learning_rate": 1.1596979889784304e-06, "loss": 0.384, "step": 3263 }, { "epoch": 2.4035346097201766, "grad_norm": 0.3277767598628998, "learning_rate": 1.1569555301495817e-06, "loss": 0.3492, "step": 3264 }, { "epoch": 2.4042709867452134, "grad_norm": 0.3225124478340149, "learning_rate": 1.1542158935364584e-06, "loss": 0.3909, "step": 3265 }, { "epoch": 2.4050073637702503, "grad_norm": 0.30637234449386597, "learning_rate": 1.1514790811509658e-06, "loss": 0.4044, "step": 3266 }, { "epoch": 2.405743740795287, "grad_norm": 0.34995532035827637, "learning_rate": 1.148745095002939e-06, "loss": 0.3824, "step": 3267 }, { "epoch": 2.406480117820324, "grad_norm": 0.33643367886543274, "learning_rate": 1.1460139371001339e-06, "loss": 0.3694, "step": 3268 }, { "epoch": 2.4072164948453607, "grad_norm": 0.3490047752857208, "learning_rate": 1.1432856094482282e-06, "loss": 0.384, "step": 3269 }, { "epoch": 2.4079528718703975, "grad_norm": 0.3088352382183075, "learning_rate": 1.1405601140508265e-06, "loss": 0.3662, "step": 3270 }, { "epoch": 2.4086892488954343, "grad_norm": 0.3435070514678955, "learning_rate": 1.1378374529094494e-06, "loss": 0.3965, "step": 3271 }, { "epoch": 2.409425625920471, "grad_norm": 0.33832401037216187, "learning_rate": 1.135117628023536e-06, "loss": 0.3821, "step": 3272 }, { "epoch": 2.410162002945508, "grad_norm": 0.335674524307251, "learning_rate": 1.1324006413904437e-06, "loss": 0.4041, "step": 3273 }, { "epoch": 2.410898379970545, "grad_norm": 0.37129271030426025, "learning_rate": 1.1296864950054443e-06, "loss": 0.364, "step": 3274 }, { "epoch": 2.4116347569955816, "grad_norm": 0.35193946957588196, "learning_rate": 1.1269751908617277e-06, "loss": 0.404, "step": 3275 }, { "epoch": 2.4123711340206184, "grad_norm": 0.34441494941711426, "learning_rate": 1.124266730950392e-06, "loss": 0.3571, "step": 3276 }, { "epoch": 2.4131075110456552, "grad_norm": 0.3434298634529114, "learning_rate": 1.1215611172604468e-06, "loss": 0.3831, "step": 3277 }, { "epoch": 2.413843888070692, "grad_norm": 0.3040885925292969, "learning_rate": 1.1188583517788165e-06, "loss": 0.3555, "step": 3278 }, { "epoch": 2.414580265095729, "grad_norm": 0.3662799596786499, "learning_rate": 1.1161584364903287e-06, "loss": 0.3932, "step": 3279 }, { "epoch": 2.4153166421207657, "grad_norm": 0.3392142653465271, "learning_rate": 1.1134613733777195e-06, "loss": 0.3553, "step": 3280 }, { "epoch": 2.4160530191458025, "grad_norm": 0.35965248942375183, "learning_rate": 1.1107671644216305e-06, "loss": 0.3529, "step": 3281 }, { "epoch": 2.4167893961708393, "grad_norm": 0.37585723400115967, "learning_rate": 1.1080758116006057e-06, "loss": 0.3913, "step": 3282 }, { "epoch": 2.417525773195876, "grad_norm": 0.3380414545536041, "learning_rate": 1.1053873168910966e-06, "loss": 0.3933, "step": 3283 }, { "epoch": 2.418262150220913, "grad_norm": 0.33005619049072266, "learning_rate": 1.1027016822674509e-06, "loss": 0.3837, "step": 3284 }, { "epoch": 2.4189985272459498, "grad_norm": 0.3498542904853821, "learning_rate": 1.1000189097019164e-06, "loss": 0.3881, "step": 3285 }, { "epoch": 2.4197349042709866, "grad_norm": 0.37784454226493835, "learning_rate": 1.0973390011646422e-06, "loss": 0.3898, "step": 3286 }, { "epoch": 2.4204712812960234, "grad_norm": 0.3332594335079193, "learning_rate": 1.0946619586236711e-06, "loss": 0.3571, "step": 3287 }, { "epoch": 2.42120765832106, "grad_norm": 0.32110390067100525, "learning_rate": 1.0919877840449428e-06, "loss": 0.3904, "step": 3288 }, { "epoch": 2.421944035346097, "grad_norm": 0.3465670645236969, "learning_rate": 1.0893164793922894e-06, "loss": 0.3595, "step": 3289 }, { "epoch": 2.422680412371134, "grad_norm": 0.33734622597694397, "learning_rate": 1.0866480466274377e-06, "loss": 0.3817, "step": 3290 }, { "epoch": 2.4234167893961707, "grad_norm": 0.3218252956867218, "learning_rate": 1.0839824877100008e-06, "loss": 0.3472, "step": 3291 }, { "epoch": 2.4241531664212075, "grad_norm": 0.3475230634212494, "learning_rate": 1.0813198045974888e-06, "loss": 0.3629, "step": 3292 }, { "epoch": 2.4248895434462443, "grad_norm": 0.32285812497138977, "learning_rate": 1.0786599992452933e-06, "loss": 0.3819, "step": 3293 }, { "epoch": 2.425625920471281, "grad_norm": 0.34469369053840637, "learning_rate": 1.0760030736066952e-06, "loss": 0.3668, "step": 3294 }, { "epoch": 2.426362297496318, "grad_norm": 0.3158765435218811, "learning_rate": 1.0733490296328613e-06, "loss": 0.3542, "step": 3295 }, { "epoch": 2.4270986745213547, "grad_norm": 0.3300797641277313, "learning_rate": 1.0706978692728416e-06, "loss": 0.4024, "step": 3296 }, { "epoch": 2.4278350515463916, "grad_norm": 0.31223219633102417, "learning_rate": 1.0680495944735665e-06, "loss": 0.3649, "step": 3297 }, { "epoch": 2.4285714285714284, "grad_norm": 0.35305702686309814, "learning_rate": 1.0654042071798498e-06, "loss": 0.3598, "step": 3298 }, { "epoch": 2.429307805596465, "grad_norm": 0.354264497756958, "learning_rate": 1.0627617093343833e-06, "loss": 0.3895, "step": 3299 }, { "epoch": 2.4300441826215025, "grad_norm": 0.34128403663635254, "learning_rate": 1.060122102877739e-06, "loss": 0.3702, "step": 3300 }, { "epoch": 2.4307805596465393, "grad_norm": 0.36130642890930176, "learning_rate": 1.0574853897483634e-06, "loss": 0.4276, "step": 3301 }, { "epoch": 2.431516936671576, "grad_norm": 0.3270364999771118, "learning_rate": 1.054851571882578e-06, "loss": 0.3915, "step": 3302 }, { "epoch": 2.432253313696613, "grad_norm": 0.33622244000434875, "learning_rate": 1.052220651214581e-06, "loss": 0.4009, "step": 3303 }, { "epoch": 2.4329896907216497, "grad_norm": 0.3255276381969452, "learning_rate": 1.0495926296764398e-06, "loss": 0.3953, "step": 3304 }, { "epoch": 2.4337260677466865, "grad_norm": 0.32380256056785583, "learning_rate": 1.0469675091980946e-06, "loss": 0.3766, "step": 3305 }, { "epoch": 2.4344624447717234, "grad_norm": 0.32424572110176086, "learning_rate": 1.0443452917073538e-06, "loss": 0.3914, "step": 3306 }, { "epoch": 2.43519882179676, "grad_norm": 0.3544393479824066, "learning_rate": 1.041725979129894e-06, "loss": 0.3624, "step": 3307 }, { "epoch": 2.435935198821797, "grad_norm": 0.3281401991844177, "learning_rate": 1.0391095733892614e-06, "loss": 0.3437, "step": 3308 }, { "epoch": 2.436671575846834, "grad_norm": 0.32801955938339233, "learning_rate": 1.0364960764068643e-06, "loss": 0.3675, "step": 3309 }, { "epoch": 2.4374079528718706, "grad_norm": 0.3724784553050995, "learning_rate": 1.033885490101974e-06, "loss": 0.3646, "step": 3310 }, { "epoch": 2.4381443298969074, "grad_norm": 0.35734328627586365, "learning_rate": 1.0312778163917298e-06, "loss": 0.3746, "step": 3311 }, { "epoch": 2.4388807069219443, "grad_norm": 0.33092063665390015, "learning_rate": 1.0286730571911264e-06, "loss": 0.3573, "step": 3312 }, { "epoch": 2.439617083946981, "grad_norm": 0.32385748624801636, "learning_rate": 1.0260712144130192e-06, "loss": 0.3724, "step": 3313 }, { "epoch": 2.440353460972018, "grad_norm": 0.3588411808013916, "learning_rate": 1.0234722899681265e-06, "loss": 0.3727, "step": 3314 }, { "epoch": 2.4410898379970547, "grad_norm": 0.32793402671813965, "learning_rate": 1.020876285765015e-06, "loss": 0.3588, "step": 3315 }, { "epoch": 2.4418262150220915, "grad_norm": 0.37099236249923706, "learning_rate": 1.018283203710116e-06, "loss": 0.3759, "step": 3316 }, { "epoch": 2.4425625920471283, "grad_norm": 0.35512179136276245, "learning_rate": 1.0156930457077085e-06, "loss": 0.3909, "step": 3317 }, { "epoch": 2.443298969072165, "grad_norm": 0.32418885827064514, "learning_rate": 1.0131058136599254e-06, "loss": 0.3577, "step": 3318 }, { "epoch": 2.444035346097202, "grad_norm": 0.33136194944381714, "learning_rate": 1.0105215094667542e-06, "loss": 0.3603, "step": 3319 }, { "epoch": 2.444771723122239, "grad_norm": 0.35086795687675476, "learning_rate": 1.0079401350260288e-06, "loss": 0.3666, "step": 3320 }, { "epoch": 2.4455081001472756, "grad_norm": 0.3403591215610504, "learning_rate": 1.0053616922334307e-06, "loss": 0.3901, "step": 3321 }, { "epoch": 2.4462444771723124, "grad_norm": 0.3403468132019043, "learning_rate": 1.0027861829824953e-06, "loss": 0.3854, "step": 3322 }, { "epoch": 2.4469808541973492, "grad_norm": 0.3459208011627197, "learning_rate": 1.0002136091645936e-06, "loss": 0.3169, "step": 3323 }, { "epoch": 2.447717231222386, "grad_norm": 0.3217117190361023, "learning_rate": 9.976439726689469e-07, "loss": 0.3866, "step": 3324 }, { "epoch": 2.448453608247423, "grad_norm": 0.37422022223472595, "learning_rate": 9.95077275382621e-07, "loss": 0.3865, "step": 3325 }, { "epoch": 2.4491899852724597, "grad_norm": 0.37057486176490784, "learning_rate": 9.925135191905194e-07, "loss": 0.3944, "step": 3326 }, { "epoch": 2.4499263622974965, "grad_norm": 0.33397674560546875, "learning_rate": 9.89952705975386e-07, "loss": 0.3958, "step": 3327 }, { "epoch": 2.4506627393225333, "grad_norm": 0.3693501353263855, "learning_rate": 9.873948376178073e-07, "loss": 0.3158, "step": 3328 }, { "epoch": 2.45139911634757, "grad_norm": 0.3479422330856323, "learning_rate": 9.84839915996203e-07, "loss": 0.3486, "step": 3329 }, { "epoch": 2.452135493372607, "grad_norm": 0.3450046479701996, "learning_rate": 9.822879429868304e-07, "loss": 0.3718, "step": 3330 }, { "epoch": 2.4528718703976438, "grad_norm": 0.33801180124282837, "learning_rate": 9.79738920463782e-07, "loss": 0.3553, "step": 3331 }, { "epoch": 2.4536082474226806, "grad_norm": 0.3484099507331848, "learning_rate": 9.771928502989802e-07, "loss": 0.3978, "step": 3332 }, { "epoch": 2.4543446244477174, "grad_norm": 0.34799516201019287, "learning_rate": 9.746497343621857e-07, "loss": 0.393, "step": 3333 }, { "epoch": 2.455081001472754, "grad_norm": 0.3594471216201782, "learning_rate": 9.721095745209847e-07, "loss": 0.3844, "step": 3334 }, { "epoch": 2.455817378497791, "grad_norm": 0.35299617052078247, "learning_rate": 9.695723726407918e-07, "loss": 0.3746, "step": 3335 }, { "epoch": 2.456553755522828, "grad_norm": 0.309285432100296, "learning_rate": 9.670381305848547e-07, "loss": 0.3796, "step": 3336 }, { "epoch": 2.4572901325478647, "grad_norm": 0.3416523039340973, "learning_rate": 9.64506850214243e-07, "loss": 0.3789, "step": 3337 }, { "epoch": 2.4580265095729015, "grad_norm": 0.3610781729221344, "learning_rate": 9.6197853338785e-07, "loss": 0.3822, "step": 3338 }, { "epoch": 2.4587628865979383, "grad_norm": 0.36001071333885193, "learning_rate": 9.594531819624003e-07, "loss": 0.4023, "step": 3339 }, { "epoch": 2.459499263622975, "grad_norm": 0.3570241630077362, "learning_rate": 9.569307977924304e-07, "loss": 0.3697, "step": 3340 }, { "epoch": 2.460235640648012, "grad_norm": 0.34070777893066406, "learning_rate": 9.544113827303064e-07, "loss": 0.3789, "step": 3341 }, { "epoch": 2.4609720176730487, "grad_norm": 0.31999388337135315, "learning_rate": 9.518949386262088e-07, "loss": 0.3746, "step": 3342 }, { "epoch": 2.4617083946980856, "grad_norm": 0.3107585608959198, "learning_rate": 9.493814673281382e-07, "loss": 0.3745, "step": 3343 }, { "epoch": 2.4624447717231224, "grad_norm": 0.33291155099868774, "learning_rate": 9.468709706819141e-07, "loss": 0.3693, "step": 3344 }, { "epoch": 2.463181148748159, "grad_norm": 0.35321640968322754, "learning_rate": 9.443634505311671e-07, "loss": 0.3677, "step": 3345 }, { "epoch": 2.463917525773196, "grad_norm": 0.31461623311042786, "learning_rate": 9.418589087173441e-07, "loss": 0.4084, "step": 3346 }, { "epoch": 2.464653902798233, "grad_norm": 0.311948299407959, "learning_rate": 9.393573470797079e-07, "loss": 0.3756, "step": 3347 }, { "epoch": 2.4653902798232696, "grad_norm": 0.34136369824409485, "learning_rate": 9.368587674553265e-07, "loss": 0.3904, "step": 3348 }, { "epoch": 2.4661266568483065, "grad_norm": 0.3355327546596527, "learning_rate": 9.343631716790813e-07, "loss": 0.3842, "step": 3349 }, { "epoch": 2.4668630338733433, "grad_norm": 0.32075121998786926, "learning_rate": 9.318705615836648e-07, "loss": 0.3478, "step": 3350 }, { "epoch": 2.46759941089838, "grad_norm": 0.31247884035110474, "learning_rate": 9.293809389995734e-07, "loss": 0.3843, "step": 3351 }, { "epoch": 2.468335787923417, "grad_norm": 0.33083632588386536, "learning_rate": 9.268943057551089e-07, "loss": 0.3705, "step": 3352 }, { "epoch": 2.4690721649484537, "grad_norm": 0.34602615237236023, "learning_rate": 9.244106636763827e-07, "loss": 0.3816, "step": 3353 }, { "epoch": 2.4698085419734905, "grad_norm": 0.3348204493522644, "learning_rate": 9.219300145873051e-07, "loss": 0.4182, "step": 3354 }, { "epoch": 2.4705449189985274, "grad_norm": 0.30488619208335876, "learning_rate": 9.19452360309589e-07, "loss": 0.3337, "step": 3355 }, { "epoch": 2.471281296023564, "grad_norm": 0.34817051887512207, "learning_rate": 9.169777026627514e-07, "loss": 0.3839, "step": 3356 }, { "epoch": 2.472017673048601, "grad_norm": 0.328900545835495, "learning_rate": 9.145060434641017e-07, "loss": 0.3649, "step": 3357 }, { "epoch": 2.472754050073638, "grad_norm": 0.36652466654777527, "learning_rate": 9.120373845287561e-07, "loss": 0.3494, "step": 3358 }, { "epoch": 2.4734904270986746, "grad_norm": 0.3472610116004944, "learning_rate": 9.095717276696214e-07, "loss": 0.3894, "step": 3359 }, { "epoch": 2.4742268041237114, "grad_norm": 0.3122188448905945, "learning_rate": 9.071090746973999e-07, "loss": 0.3546, "step": 3360 }, { "epoch": 2.4749631811487482, "grad_norm": 0.338001549243927, "learning_rate": 9.046494274205924e-07, "loss": 0.363, "step": 3361 }, { "epoch": 2.475699558173785, "grad_norm": 0.32908204197883606, "learning_rate": 9.021927876454883e-07, "loss": 0.3786, "step": 3362 }, { "epoch": 2.476435935198822, "grad_norm": 0.3352196216583252, "learning_rate": 8.997391571761682e-07, "loss": 0.3681, "step": 3363 }, { "epoch": 2.4771723122238587, "grad_norm": 0.3196353316307068, "learning_rate": 8.972885378145079e-07, "loss": 0.3735, "step": 3364 }, { "epoch": 2.4779086892488955, "grad_norm": 0.3419160544872284, "learning_rate": 8.94840931360163e-07, "loss": 0.3501, "step": 3365 }, { "epoch": 2.4786450662739323, "grad_norm": 0.3332716226577759, "learning_rate": 8.923963396105861e-07, "loss": 0.3606, "step": 3366 }, { "epoch": 2.479381443298969, "grad_norm": 0.3240850269794464, "learning_rate": 8.899547643610102e-07, "loss": 0.3774, "step": 3367 }, { "epoch": 2.480117820324006, "grad_norm": 0.3325689136981964, "learning_rate": 8.875162074044524e-07, "loss": 0.3788, "step": 3368 }, { "epoch": 2.4808541973490428, "grad_norm": 0.3567977547645569, "learning_rate": 8.850806705317183e-07, "loss": 0.3516, "step": 3369 }, { "epoch": 2.4815905743740796, "grad_norm": 0.33295658230781555, "learning_rate": 8.826481555313909e-07, "loss": 0.3668, "step": 3370 }, { "epoch": 2.4823269513991164, "grad_norm": 0.3270358443260193, "learning_rate": 8.802186641898352e-07, "loss": 0.3957, "step": 3371 }, { "epoch": 2.4830633284241532, "grad_norm": 0.3151583671569824, "learning_rate": 8.777921982911996e-07, "loss": 0.4113, "step": 3372 }, { "epoch": 2.48379970544919, "grad_norm": 0.3360562026500702, "learning_rate": 8.753687596174021e-07, "loss": 0.4078, "step": 3373 }, { "epoch": 2.484536082474227, "grad_norm": 0.3369217813014984, "learning_rate": 8.729483499481467e-07, "loss": 0.383, "step": 3374 }, { "epoch": 2.4852724594992637, "grad_norm": 0.3392618000507355, "learning_rate": 8.705309710609078e-07, "loss": 0.3923, "step": 3375 }, { "epoch": 2.4860088365243005, "grad_norm": 0.3362715244293213, "learning_rate": 8.681166247309348e-07, "loss": 0.3598, "step": 3376 }, { "epoch": 2.4867452135493373, "grad_norm": 0.34049803018569946, "learning_rate": 8.65705312731252e-07, "loss": 0.3656, "step": 3377 }, { "epoch": 2.487481590574374, "grad_norm": 0.328102171421051, "learning_rate": 8.632970368326537e-07, "loss": 0.3924, "step": 3378 }, { "epoch": 2.488217967599411, "grad_norm": 0.3466600775718689, "learning_rate": 8.608917988037036e-07, "loss": 0.3835, "step": 3379 }, { "epoch": 2.4889543446244478, "grad_norm": 0.3212660551071167, "learning_rate": 8.584896004107379e-07, "loss": 0.382, "step": 3380 }, { "epoch": 2.4896907216494846, "grad_norm": 0.3672233819961548, "learning_rate": 8.56090443417859e-07, "loss": 0.3818, "step": 3381 }, { "epoch": 2.4904270986745214, "grad_norm": 0.3334003984928131, "learning_rate": 8.536943295869315e-07, "loss": 0.3788, "step": 3382 }, { "epoch": 2.491163475699558, "grad_norm": 0.35568714141845703, "learning_rate": 8.513012606775928e-07, "loss": 0.3687, "step": 3383 }, { "epoch": 2.491899852724595, "grad_norm": 0.34635692834854126, "learning_rate": 8.489112384472386e-07, "loss": 0.3541, "step": 3384 }, { "epoch": 2.492636229749632, "grad_norm": 0.36689338088035583, "learning_rate": 8.46524264651028e-07, "loss": 0.3686, "step": 3385 }, { "epoch": 2.4933726067746687, "grad_norm": 0.36320188641548157, "learning_rate": 8.441403410418853e-07, "loss": 0.3795, "step": 3386 }, { "epoch": 2.4941089837997055, "grad_norm": 0.3351554274559021, "learning_rate": 8.417594693704901e-07, "loss": 0.365, "step": 3387 }, { "epoch": 2.4948453608247423, "grad_norm": 0.3230125606060028, "learning_rate": 8.393816513852815e-07, "loss": 0.3867, "step": 3388 }, { "epoch": 2.495581737849779, "grad_norm": 0.3341889977455139, "learning_rate": 8.370068888324612e-07, "loss": 0.3554, "step": 3389 }, { "epoch": 2.496318114874816, "grad_norm": 0.3187844753265381, "learning_rate": 8.346351834559784e-07, "loss": 0.388, "step": 3390 }, { "epoch": 2.4970544918998527, "grad_norm": 0.3333573639392853, "learning_rate": 8.322665369975447e-07, "loss": 0.3775, "step": 3391 }, { "epoch": 2.4977908689248896, "grad_norm": 0.32489851117134094, "learning_rate": 8.299009511966221e-07, "loss": 0.3604, "step": 3392 }, { "epoch": 2.4985272459499264, "grad_norm": 0.3213289976119995, "learning_rate": 8.275384277904231e-07, "loss": 0.3635, "step": 3393 }, { "epoch": 2.499263622974963, "grad_norm": 0.341377854347229, "learning_rate": 8.251789685139172e-07, "loss": 0.3761, "step": 3394 }, { "epoch": 2.5, "grad_norm": 0.34892818331718445, "learning_rate": 8.228225750998176e-07, "loss": 0.3652, "step": 3395 }, { "epoch": 2.500736377025037, "grad_norm": 0.31448036432266235, "learning_rate": 8.204692492785876e-07, "loss": 0.3864, "step": 3396 }, { "epoch": 2.5014727540500736, "grad_norm": 0.31953367590904236, "learning_rate": 8.181189927784416e-07, "loss": 0.364, "step": 3397 }, { "epoch": 2.5022091310751104, "grad_norm": 0.33577442169189453, "learning_rate": 8.157718073253351e-07, "loss": 0.3808, "step": 3398 }, { "epoch": 2.5029455081001473, "grad_norm": 0.3457181453704834, "learning_rate": 8.134276946429703e-07, "loss": 0.3665, "step": 3399 }, { "epoch": 2.503681885125184, "grad_norm": 0.34147438406944275, "learning_rate": 8.110866564527925e-07, "loss": 0.3755, "step": 3400 }, { "epoch": 2.504418262150221, "grad_norm": 0.3595358729362488, "learning_rate": 8.087486944739886e-07, "loss": 0.4067, "step": 3401 }, { "epoch": 2.5051546391752577, "grad_norm": 0.32780689001083374, "learning_rate": 8.064138104234897e-07, "loss": 0.3573, "step": 3402 }, { "epoch": 2.5058910162002945, "grad_norm": 0.3280833065509796, "learning_rate": 8.040820060159621e-07, "loss": 0.3645, "step": 3403 }, { "epoch": 2.5066273932253313, "grad_norm": 0.3275584280490875, "learning_rate": 8.017532829638119e-07, "loss": 0.3812, "step": 3404 }, { "epoch": 2.507363770250368, "grad_norm": 0.363805890083313, "learning_rate": 7.994276429771857e-07, "loss": 0.3802, "step": 3405 }, { "epoch": 2.508100147275405, "grad_norm": 0.317568302154541, "learning_rate": 7.971050877639624e-07, "loss": 0.348, "step": 3406 }, { "epoch": 2.508836524300442, "grad_norm": 0.3445456922054291, "learning_rate": 7.947856190297538e-07, "loss": 0.3498, "step": 3407 }, { "epoch": 2.5095729013254786, "grad_norm": 0.3341994285583496, "learning_rate": 7.924692384779098e-07, "loss": 0.3574, "step": 3408 }, { "epoch": 2.5103092783505154, "grad_norm": 0.368876188993454, "learning_rate": 7.901559478095106e-07, "loss": 0.4066, "step": 3409 }, { "epoch": 2.5110456553755522, "grad_norm": 0.333023339509964, "learning_rate": 7.878457487233643e-07, "loss": 0.3624, "step": 3410 }, { "epoch": 2.511782032400589, "grad_norm": 0.33965203166007996, "learning_rate": 7.85538642916015e-07, "loss": 0.379, "step": 3411 }, { "epoch": 2.512518409425626, "grad_norm": 0.3300594091415405, "learning_rate": 7.832346320817297e-07, "loss": 0.3837, "step": 3412 }, { "epoch": 2.5132547864506627, "grad_norm": 0.3793734312057495, "learning_rate": 7.809337179125031e-07, "loss": 0.3478, "step": 3413 }, { "epoch": 2.5139911634756995, "grad_norm": 0.3749076724052429, "learning_rate": 7.786359020980605e-07, "loss": 0.3635, "step": 3414 }, { "epoch": 2.5147275405007363, "grad_norm": 0.3277282118797302, "learning_rate": 7.763411863258441e-07, "loss": 0.3736, "step": 3415 }, { "epoch": 2.515463917525773, "grad_norm": 0.3639467656612396, "learning_rate": 7.740495722810271e-07, "loss": 0.3579, "step": 3416 }, { "epoch": 2.51620029455081, "grad_norm": 0.36798664927482605, "learning_rate": 7.717610616464999e-07, "loss": 0.3939, "step": 3417 }, { "epoch": 2.5169366715758468, "grad_norm": 0.3840385675430298, "learning_rate": 7.694756561028754e-07, "loss": 0.3643, "step": 3418 }, { "epoch": 2.5176730486008836, "grad_norm": 0.3123991787433624, "learning_rate": 7.671933573284878e-07, "loss": 0.3923, "step": 3419 }, { "epoch": 2.5184094256259204, "grad_norm": 0.37913936376571655, "learning_rate": 7.649141669993881e-07, "loss": 0.3921, "step": 3420 }, { "epoch": 2.5191458026509572, "grad_norm": 0.30917784571647644, "learning_rate": 7.626380867893429e-07, "loss": 0.3538, "step": 3421 }, { "epoch": 2.519882179675994, "grad_norm": 0.3407180607318878, "learning_rate": 7.603651183698396e-07, "loss": 0.4078, "step": 3422 }, { "epoch": 2.520618556701031, "grad_norm": 0.34745657444000244, "learning_rate": 7.580952634100758e-07, "loss": 0.387, "step": 3423 }, { "epoch": 2.5213549337260677, "grad_norm": 0.33488914370536804, "learning_rate": 7.558285235769647e-07, "loss": 0.3858, "step": 3424 }, { "epoch": 2.5220913107511045, "grad_norm": 0.3565431237220764, "learning_rate": 7.535649005351309e-07, "loss": 0.3824, "step": 3425 }, { "epoch": 2.5228276877761413, "grad_norm": 0.3106916546821594, "learning_rate": 7.513043959469107e-07, "loss": 0.3919, "step": 3426 }, { "epoch": 2.523564064801178, "grad_norm": 0.34295180439949036, "learning_rate": 7.49047011472352e-07, "loss": 0.398, "step": 3427 }, { "epoch": 2.524300441826215, "grad_norm": 0.316560834646225, "learning_rate": 7.467927487692089e-07, "loss": 0.3448, "step": 3428 }, { "epoch": 2.5250368188512518, "grad_norm": 0.33750441670417786, "learning_rate": 7.445416094929426e-07, "loss": 0.3661, "step": 3429 }, { "epoch": 2.5257731958762886, "grad_norm": 0.34076401591300964, "learning_rate": 7.422935952967236e-07, "loss": 0.3744, "step": 3430 }, { "epoch": 2.5265095729013254, "grad_norm": 0.3119591772556305, "learning_rate": 7.40048707831425e-07, "loss": 0.3562, "step": 3431 }, { "epoch": 2.527245949926362, "grad_norm": 0.33838844299316406, "learning_rate": 7.378069487456241e-07, "loss": 0.3647, "step": 3432 }, { "epoch": 2.527982326951399, "grad_norm": 0.3333272337913513, "learning_rate": 7.355683196856006e-07, "loss": 0.3804, "step": 3433 }, { "epoch": 2.528718703976436, "grad_norm": 0.31554344296455383, "learning_rate": 7.333328222953356e-07, "loss": 0.3709, "step": 3434 }, { "epoch": 2.5294550810014726, "grad_norm": 0.3351151645183563, "learning_rate": 7.311004582165132e-07, "loss": 0.3745, "step": 3435 }, { "epoch": 2.5301914580265095, "grad_norm": 0.3532677888870239, "learning_rate": 7.288712290885119e-07, "loss": 0.3772, "step": 3436 }, { "epoch": 2.5309278350515463, "grad_norm": 0.33610615134239197, "learning_rate": 7.266451365484106e-07, "loss": 0.37, "step": 3437 }, { "epoch": 2.531664212076583, "grad_norm": 0.33613523840904236, "learning_rate": 7.244221822309855e-07, "loss": 0.3705, "step": 3438 }, { "epoch": 2.53240058910162, "grad_norm": 0.3339458107948303, "learning_rate": 7.222023677687062e-07, "loss": 0.382, "step": 3439 }, { "epoch": 2.5331369661266567, "grad_norm": 0.3765323758125305, "learning_rate": 7.199856947917372e-07, "loss": 0.3996, "step": 3440 }, { "epoch": 2.5338733431516935, "grad_norm": 0.3355490565299988, "learning_rate": 7.177721649279367e-07, "loss": 0.3722, "step": 3441 }, { "epoch": 2.5346097201767304, "grad_norm": 0.33720365166664124, "learning_rate": 7.155617798028542e-07, "loss": 0.3867, "step": 3442 }, { "epoch": 2.535346097201767, "grad_norm": 0.3521978259086609, "learning_rate": 7.133545410397274e-07, "loss": 0.3626, "step": 3443 }, { "epoch": 2.536082474226804, "grad_norm": 0.38076502084732056, "learning_rate": 7.111504502594896e-07, "loss": 0.3998, "step": 3444 }, { "epoch": 2.536818851251841, "grad_norm": 0.3390524983406067, "learning_rate": 7.089495090807564e-07, "loss": 0.4084, "step": 3445 }, { "epoch": 2.5375552282768776, "grad_norm": 0.3228042423725128, "learning_rate": 7.067517191198314e-07, "loss": 0.3477, "step": 3446 }, { "epoch": 2.5382916053019144, "grad_norm": 0.373737633228302, "learning_rate": 7.045570819907072e-07, "loss": 0.3804, "step": 3447 }, { "epoch": 2.5390279823269513, "grad_norm": 0.33321553468704224, "learning_rate": 7.023655993050588e-07, "loss": 0.3792, "step": 3448 }, { "epoch": 2.539764359351988, "grad_norm": 0.35181403160095215, "learning_rate": 7.001772726722439e-07, "loss": 0.3664, "step": 3449 }, { "epoch": 2.540500736377025, "grad_norm": 0.36001092195510864, "learning_rate": 6.979921036993042e-07, "loss": 0.4203, "step": 3450 }, { "epoch": 2.5412371134020617, "grad_norm": 0.32719293236732483, "learning_rate": 6.958100939909601e-07, "loss": 0.3662, "step": 3451 }, { "epoch": 2.5419734904270985, "grad_norm": 0.33018410205841064, "learning_rate": 6.936312451496157e-07, "loss": 0.3636, "step": 3452 }, { "epoch": 2.5427098674521353, "grad_norm": 0.32865098118782043, "learning_rate": 6.914555587753508e-07, "loss": 0.3847, "step": 3453 }, { "epoch": 2.543446244477172, "grad_norm": 0.324142187833786, "learning_rate": 6.892830364659231e-07, "loss": 0.3712, "step": 3454 }, { "epoch": 2.544182621502209, "grad_norm": 0.3413415849208832, "learning_rate": 6.871136798167693e-07, "loss": 0.3993, "step": 3455 }, { "epoch": 2.544918998527246, "grad_norm": 0.32384759187698364, "learning_rate": 6.849474904209979e-07, "loss": 0.3785, "step": 3456 }, { "epoch": 2.5456553755522826, "grad_norm": 0.32682374119758606, "learning_rate": 6.827844698693931e-07, "loss": 0.3639, "step": 3457 }, { "epoch": 2.5463917525773194, "grad_norm": 0.3174643814563751, "learning_rate": 6.806246197504118e-07, "loss": 0.3797, "step": 3458 }, { "epoch": 2.5471281296023562, "grad_norm": 0.3432275056838989, "learning_rate": 6.784679416501822e-07, "loss": 0.3791, "step": 3459 }, { "epoch": 2.547864506627393, "grad_norm": 0.35215064883232117, "learning_rate": 6.763144371525048e-07, "loss": 0.3873, "step": 3460 }, { "epoch": 2.54860088365243, "grad_norm": 0.36379846930503845, "learning_rate": 6.741641078388472e-07, "loss": 0.3611, "step": 3461 }, { "epoch": 2.5493372606774667, "grad_norm": 0.3444019854068756, "learning_rate": 6.72016955288346e-07, "loss": 0.3479, "step": 3462 }, { "epoch": 2.5500736377025035, "grad_norm": 0.3353855311870575, "learning_rate": 6.698729810778065e-07, "loss": 0.3835, "step": 3463 }, { "epoch": 2.5508100147275403, "grad_norm": 0.3298718333244324, "learning_rate": 6.677321867816983e-07, "loss": 0.4032, "step": 3464 }, { "epoch": 2.551546391752577, "grad_norm": 0.3183014392852783, "learning_rate": 6.655945739721548e-07, "loss": 0.3547, "step": 3465 }, { "epoch": 2.552282768777614, "grad_norm": 0.33290722966194153, "learning_rate": 6.634601442189753e-07, "loss": 0.3874, "step": 3466 }, { "epoch": 2.5530191458026508, "grad_norm": 0.3307499289512634, "learning_rate": 6.613288990896205e-07, "loss": 0.3761, "step": 3467 }, { "epoch": 2.5537555228276876, "grad_norm": 0.3292166590690613, "learning_rate": 6.592008401492106e-07, "loss": 0.3838, "step": 3468 }, { "epoch": 2.5544918998527244, "grad_norm": 0.32803595066070557, "learning_rate": 6.570759689605305e-07, "loss": 0.3759, "step": 3469 }, { "epoch": 2.555228276877761, "grad_norm": 0.36094194650650024, "learning_rate": 6.549542870840203e-07, "loss": 0.346, "step": 3470 }, { "epoch": 2.555964653902798, "grad_norm": 0.35194647312164307, "learning_rate": 6.528357960777776e-07, "loss": 0.3795, "step": 3471 }, { "epoch": 2.556701030927835, "grad_norm": 0.3414342701435089, "learning_rate": 6.507204974975611e-07, "loss": 0.3704, "step": 3472 }, { "epoch": 2.5574374079528717, "grad_norm": 0.3302089273929596, "learning_rate": 6.486083928967801e-07, "loss": 0.3581, "step": 3473 }, { "epoch": 2.5581737849779085, "grad_norm": 0.3138299286365509, "learning_rate": 6.46499483826501e-07, "loss": 0.3649, "step": 3474 }, { "epoch": 2.5589101620029453, "grad_norm": 0.34113311767578125, "learning_rate": 6.443937718354426e-07, "loss": 0.3937, "step": 3475 }, { "epoch": 2.559646539027982, "grad_norm": 0.33165475726127625, "learning_rate": 6.422912584699753e-07, "loss": 0.3952, "step": 3476 }, { "epoch": 2.560382916053019, "grad_norm": 0.35434436798095703, "learning_rate": 6.401919452741234e-07, "loss": 0.3813, "step": 3477 }, { "epoch": 2.5611192930780557, "grad_norm": 0.3147828280925751, "learning_rate": 6.380958337895582e-07, "loss": 0.378, "step": 3478 }, { "epoch": 2.5618556701030926, "grad_norm": 0.34505346417427063, "learning_rate": 6.360029255555994e-07, "loss": 0.3648, "step": 3479 }, { "epoch": 2.5625920471281294, "grad_norm": 0.3115389049053192, "learning_rate": 6.339132221092181e-07, "loss": 0.3581, "step": 3480 }, { "epoch": 2.563328424153166, "grad_norm": 0.3120824694633484, "learning_rate": 6.318267249850274e-07, "loss": 0.3574, "step": 3481 }, { "epoch": 2.564064801178203, "grad_norm": 0.3659776747226715, "learning_rate": 6.297434357152882e-07, "loss": 0.3897, "step": 3482 }, { "epoch": 2.56480117820324, "grad_norm": 0.33508267998695374, "learning_rate": 6.276633558299056e-07, "loss": 0.3856, "step": 3483 }, { "epoch": 2.5655375552282766, "grad_norm": 0.3201551139354706, "learning_rate": 6.25586486856426e-07, "loss": 0.3766, "step": 3484 }, { "epoch": 2.5662739322533135, "grad_norm": 0.332343727350235, "learning_rate": 6.23512830320041e-07, "loss": 0.3716, "step": 3485 }, { "epoch": 2.5670103092783503, "grad_norm": 0.346399188041687, "learning_rate": 6.214423877435805e-07, "loss": 0.3845, "step": 3486 }, { "epoch": 2.567746686303387, "grad_norm": 0.3369497060775757, "learning_rate": 6.193751606475141e-07, "loss": 0.3658, "step": 3487 }, { "epoch": 2.568483063328424, "grad_norm": 0.3280666768550873, "learning_rate": 6.17311150549953e-07, "loss": 0.3755, "step": 3488 }, { "epoch": 2.5692194403534607, "grad_norm": 0.30463147163391113, "learning_rate": 6.152503589666426e-07, "loss": 0.3658, "step": 3489 }, { "epoch": 2.5699558173784975, "grad_norm": 0.3326549828052521, "learning_rate": 6.131927874109661e-07, "loss": 0.3723, "step": 3490 }, { "epoch": 2.5706921944035344, "grad_norm": 0.338368684053421, "learning_rate": 6.111384373939416e-07, "loss": 0.366, "step": 3491 }, { "epoch": 2.571428571428571, "grad_norm": 0.3462516963481903, "learning_rate": 6.090873104242213e-07, "loss": 0.3906, "step": 3492 }, { "epoch": 2.572164948453608, "grad_norm": 0.33473119139671326, "learning_rate": 6.070394080080921e-07, "loss": 0.3634, "step": 3493 }, { "epoch": 2.572901325478645, "grad_norm": 0.3650546073913574, "learning_rate": 6.049947316494709e-07, "loss": 0.3895, "step": 3494 }, { "epoch": 2.5736377025036816, "grad_norm": 0.3254354000091553, "learning_rate": 6.029532828499052e-07, "loss": 0.3694, "step": 3495 }, { "epoch": 2.5743740795287184, "grad_norm": 0.3232567012310028, "learning_rate": 6.009150631085758e-07, "loss": 0.3675, "step": 3496 }, { "epoch": 2.5751104565537553, "grad_norm": 0.3334692120552063, "learning_rate": 5.988800739222884e-07, "loss": 0.3539, "step": 3497 }, { "epoch": 2.575846833578792, "grad_norm": 0.32919347286224365, "learning_rate": 5.968483167854761e-07, "loss": 0.3744, "step": 3498 }, { "epoch": 2.576583210603829, "grad_norm": 0.3350676894187927, "learning_rate": 5.948197931902034e-07, "loss": 0.3817, "step": 3499 }, { "epoch": 2.5773195876288657, "grad_norm": 0.32519227266311646, "learning_rate": 5.927945046261541e-07, "loss": 0.3669, "step": 3500 }, { "epoch": 2.5780559646539025, "grad_norm": 0.31917646527290344, "learning_rate": 5.90772452580638e-07, "loss": 0.3524, "step": 3501 }, { "epoch": 2.57879234167894, "grad_norm": 0.3143576979637146, "learning_rate": 5.887536385385917e-07, "loss": 0.3696, "step": 3502 }, { "epoch": 2.5795287187039766, "grad_norm": 0.32816874980926514, "learning_rate": 5.867380639825698e-07, "loss": 0.3613, "step": 3503 }, { "epoch": 2.5802650957290134, "grad_norm": 0.3384314179420471, "learning_rate": 5.847257303927484e-07, "loss": 0.3812, "step": 3504 }, { "epoch": 2.5810014727540502, "grad_norm": 0.34819257259368896, "learning_rate": 5.827166392469269e-07, "loss": 0.3749, "step": 3505 }, { "epoch": 2.581737849779087, "grad_norm": 0.31272298097610474, "learning_rate": 5.807107920205202e-07, "loss": 0.3579, "step": 3506 }, { "epoch": 2.582474226804124, "grad_norm": 0.3496764004230499, "learning_rate": 5.78708190186561e-07, "loss": 0.3594, "step": 3507 }, { "epoch": 2.5832106038291607, "grad_norm": 0.3062533438205719, "learning_rate": 5.767088352157002e-07, "loss": 0.3901, "step": 3508 }, { "epoch": 2.5839469808541975, "grad_norm": 0.4096584618091583, "learning_rate": 5.747127285762027e-07, "loss": 0.372, "step": 3509 }, { "epoch": 2.5846833578792343, "grad_norm": 0.32007288932800293, "learning_rate": 5.727198717339511e-07, "loss": 0.366, "step": 3510 }, { "epoch": 2.585419734904271, "grad_norm": 0.32174500823020935, "learning_rate": 5.707302661524372e-07, "loss": 0.3938, "step": 3511 }, { "epoch": 2.586156111929308, "grad_norm": 0.3417740762233734, "learning_rate": 5.687439132927674e-07, "loss": 0.3665, "step": 3512 }, { "epoch": 2.5868924889543448, "grad_norm": 0.32081058621406555, "learning_rate": 5.66760814613661e-07, "loss": 0.3919, "step": 3513 }, { "epoch": 2.5876288659793816, "grad_norm": 0.32988831400871277, "learning_rate": 5.647809715714442e-07, "loss": 0.3802, "step": 3514 }, { "epoch": 2.5883652430044184, "grad_norm": 0.3118469715118408, "learning_rate": 5.628043856200543e-07, "loss": 0.3748, "step": 3515 }, { "epoch": 2.589101620029455, "grad_norm": 0.33467116951942444, "learning_rate": 5.60831058211036e-07, "loss": 0.4001, "step": 3516 }, { "epoch": 2.589837997054492, "grad_norm": 0.3081044852733612, "learning_rate": 5.588609907935405e-07, "loss": 0.3654, "step": 3517 }, { "epoch": 2.590574374079529, "grad_norm": 0.3295198976993561, "learning_rate": 5.568941848143284e-07, "loss": 0.3907, "step": 3518 }, { "epoch": 2.5913107511045657, "grad_norm": 0.32055968046188354, "learning_rate": 5.549306417177602e-07, "loss": 0.3872, "step": 3519 }, { "epoch": 2.5920471281296025, "grad_norm": 0.29698818922042847, "learning_rate": 5.529703629458027e-07, "loss": 0.3768, "step": 3520 }, { "epoch": 2.5927835051546393, "grad_norm": 0.3199003040790558, "learning_rate": 5.510133499380271e-07, "loss": 0.359, "step": 3521 }, { "epoch": 2.593519882179676, "grad_norm": 0.3205602169036865, "learning_rate": 5.490596041316038e-07, "loss": 0.3838, "step": 3522 }, { "epoch": 2.594256259204713, "grad_norm": 0.32970884442329407, "learning_rate": 5.471091269613033e-07, "loss": 0.3974, "step": 3523 }, { "epoch": 2.5949926362297497, "grad_norm": 0.3231904208660126, "learning_rate": 5.451619198594998e-07, "loss": 0.4017, "step": 3524 }, { "epoch": 2.5957290132547866, "grad_norm": 0.3256881535053253, "learning_rate": 5.432179842561614e-07, "loss": 0.3776, "step": 3525 }, { "epoch": 2.5964653902798234, "grad_norm": 0.3105742633342743, "learning_rate": 5.412773215788547e-07, "loss": 0.4146, "step": 3526 }, { "epoch": 2.59720176730486, "grad_norm": 0.36772292852401733, "learning_rate": 5.393399332527466e-07, "loss": 0.3675, "step": 3527 }, { "epoch": 2.597938144329897, "grad_norm": 0.34043705463409424, "learning_rate": 5.374058207005945e-07, "loss": 0.3676, "step": 3528 }, { "epoch": 2.598674521354934, "grad_norm": 0.31147414445877075, "learning_rate": 5.354749853427521e-07, "loss": 0.3456, "step": 3529 }, { "epoch": 2.5994108983799706, "grad_norm": 0.3361349105834961, "learning_rate": 5.335474285971681e-07, "loss": 0.3442, "step": 3530 }, { "epoch": 2.6001472754050075, "grad_norm": 0.3142765164375305, "learning_rate": 5.316231518793802e-07, "loss": 0.3597, "step": 3531 }, { "epoch": 2.6008836524300443, "grad_norm": 0.3216022253036499, "learning_rate": 5.297021566025212e-07, "loss": 0.3665, "step": 3532 }, { "epoch": 2.601620029455081, "grad_norm": 0.32801806926727295, "learning_rate": 5.277844441773105e-07, "loss": 0.392, "step": 3533 }, { "epoch": 2.602356406480118, "grad_norm": 0.3432222902774811, "learning_rate": 5.258700160120567e-07, "loss": 0.3727, "step": 3534 }, { "epoch": 2.6030927835051547, "grad_norm": 0.3140788972377777, "learning_rate": 5.239588735126611e-07, "loss": 0.3747, "step": 3535 }, { "epoch": 2.6038291605301915, "grad_norm": 0.323508083820343, "learning_rate": 5.220510180826071e-07, "loss": 0.416, "step": 3536 }, { "epoch": 2.6045655375552283, "grad_norm": 0.3127088248729706, "learning_rate": 5.201464511229659e-07, "loss": 0.3921, "step": 3537 }, { "epoch": 2.605301914580265, "grad_norm": 0.3234788179397583, "learning_rate": 5.182451740323957e-07, "loss": 0.3813, "step": 3538 }, { "epoch": 2.606038291605302, "grad_norm": 0.30984804034233093, "learning_rate": 5.163471882071352e-07, "loss": 0.3887, "step": 3539 }, { "epoch": 2.606774668630339, "grad_norm": 0.32947883009910583, "learning_rate": 5.144524950410074e-07, "loss": 0.3717, "step": 3540 }, { "epoch": 2.6075110456553756, "grad_norm": 0.3143480718135834, "learning_rate": 5.125610959254213e-07, "loss": 0.3805, "step": 3541 }, { "epoch": 2.6082474226804124, "grad_norm": 0.32195669412612915, "learning_rate": 5.10672992249358e-07, "loss": 0.3862, "step": 3542 }, { "epoch": 2.6089837997054492, "grad_norm": 0.3320513367652893, "learning_rate": 5.087881853993876e-07, "loss": 0.3418, "step": 3543 }, { "epoch": 2.609720176730486, "grad_norm": 0.3392578959465027, "learning_rate": 5.069066767596542e-07, "loss": 0.3469, "step": 3544 }, { "epoch": 2.610456553755523, "grad_norm": 0.3689731955528259, "learning_rate": 5.0502846771188e-07, "loss": 0.3818, "step": 3545 }, { "epoch": 2.6111929307805597, "grad_norm": 0.33059579133987427, "learning_rate": 5.031535596353665e-07, "loss": 0.3901, "step": 3546 }, { "epoch": 2.6119293078055965, "grad_norm": 0.3137247860431671, "learning_rate": 5.012819539069885e-07, "loss": 0.3799, "step": 3547 }, { "epoch": 2.6126656848306333, "grad_norm": 0.3432486057281494, "learning_rate": 4.994136519011966e-07, "loss": 0.3573, "step": 3548 }, { "epoch": 2.61340206185567, "grad_norm": 0.33022403717041016, "learning_rate": 4.975486549900177e-07, "loss": 0.369, "step": 3549 }, { "epoch": 2.614138438880707, "grad_norm": 0.33566609025001526, "learning_rate": 4.956869645430451e-07, "loss": 0.3732, "step": 3550 }, { "epoch": 2.6148748159057438, "grad_norm": 0.3244359791278839, "learning_rate": 4.938285819274507e-07, "loss": 0.3276, "step": 3551 }, { "epoch": 2.6156111929307806, "grad_norm": 0.31504350900650024, "learning_rate": 4.919735085079746e-07, "loss": 0.3572, "step": 3552 }, { "epoch": 2.6163475699558174, "grad_norm": 0.32959744334220886, "learning_rate": 4.901217456469248e-07, "loss": 0.4114, "step": 3553 }, { "epoch": 2.6170839469808542, "grad_norm": 0.30462411046028137, "learning_rate": 4.882732947041818e-07, "loss": 0.3974, "step": 3554 }, { "epoch": 2.617820324005891, "grad_norm": 0.33037570118904114, "learning_rate": 4.86428157037192e-07, "loss": 0.3725, "step": 3555 }, { "epoch": 2.618556701030928, "grad_norm": 0.3179605007171631, "learning_rate": 4.845863340009671e-07, "loss": 0.4015, "step": 3556 }, { "epoch": 2.6192930780559647, "grad_norm": 0.332682341337204, "learning_rate": 4.827478269480895e-07, "loss": 0.3676, "step": 3557 }, { "epoch": 2.6200294550810015, "grad_norm": 0.33066973090171814, "learning_rate": 4.809126372286999e-07, "loss": 0.3644, "step": 3558 }, { "epoch": 2.6207658321060383, "grad_norm": 0.34845325350761414, "learning_rate": 4.790807661905067e-07, "loss": 0.3876, "step": 3559 }, { "epoch": 2.621502209131075, "grad_norm": 0.3088925778865814, "learning_rate": 4.772522151787822e-07, "loss": 0.3701, "step": 3560 }, { "epoch": 2.622238586156112, "grad_norm": 0.3307948708534241, "learning_rate": 4.7542698553635856e-07, "loss": 0.3932, "step": 3561 }, { "epoch": 2.6229749631811488, "grad_norm": 0.33199846744537354, "learning_rate": 4.7360507860362723e-07, "loss": 0.362, "step": 3562 }, { "epoch": 2.6237113402061856, "grad_norm": 0.3131435215473175, "learning_rate": 4.7178649571854473e-07, "loss": 0.3647, "step": 3563 }, { "epoch": 2.6244477172312224, "grad_norm": 0.3339531719684601, "learning_rate": 4.699712382166216e-07, "loss": 0.3888, "step": 3564 }, { "epoch": 2.625184094256259, "grad_norm": 0.3675123155117035, "learning_rate": 4.6815930743092765e-07, "loss": 0.3772, "step": 3565 }, { "epoch": 2.625920471281296, "grad_norm": 0.34725135564804077, "learning_rate": 4.663507046920929e-07, "loss": 0.349, "step": 3566 }, { "epoch": 2.626656848306333, "grad_norm": 0.31065019965171814, "learning_rate": 4.6454543132829653e-07, "loss": 0.3587, "step": 3567 }, { "epoch": 2.6273932253313697, "grad_norm": 0.32565373182296753, "learning_rate": 4.627434886652793e-07, "loss": 0.3668, "step": 3568 }, { "epoch": 2.6281296023564065, "grad_norm": 0.33230826258659363, "learning_rate": 4.6094487802633315e-07, "loss": 0.3763, "step": 3569 }, { "epoch": 2.6288659793814433, "grad_norm": 0.31987035274505615, "learning_rate": 4.591496007323021e-07, "loss": 0.38, "step": 3570 }, { "epoch": 2.62960235640648, "grad_norm": 0.31742241978645325, "learning_rate": 4.573576581015854e-07, "loss": 0.3474, "step": 3571 }, { "epoch": 2.630338733431517, "grad_norm": 0.31509506702423096, "learning_rate": 4.55569051450131e-07, "loss": 0.3646, "step": 3572 }, { "epoch": 2.6310751104565537, "grad_norm": 0.3257356584072113, "learning_rate": 4.537837820914359e-07, "loss": 0.3668, "step": 3573 }, { "epoch": 2.6318114874815906, "grad_norm": 0.32111799716949463, "learning_rate": 4.520018513365515e-07, "loss": 0.3933, "step": 3574 }, { "epoch": 2.6325478645066274, "grad_norm": 0.3151605725288391, "learning_rate": 4.5022326049406986e-07, "loss": 0.3676, "step": 3575 }, { "epoch": 2.633284241531664, "grad_norm": 0.3598038852214813, "learning_rate": 4.484480108701372e-07, "loss": 0.4136, "step": 3576 }, { "epoch": 2.634020618556701, "grad_norm": 0.2969145178794861, "learning_rate": 4.4667610376844197e-07, "loss": 0.3821, "step": 3577 }, { "epoch": 2.634756995581738, "grad_norm": 0.33178281784057617, "learning_rate": 4.449075404902187e-07, "loss": 0.3559, "step": 3578 }, { "epoch": 2.6354933726067746, "grad_norm": 0.3466475307941437, "learning_rate": 4.4314232233424845e-07, "loss": 0.3827, "step": 3579 }, { "epoch": 2.6362297496318114, "grad_norm": 0.3220832645893097, "learning_rate": 4.413804505968533e-07, "loss": 0.367, "step": 3580 }, { "epoch": 2.6369661266568483, "grad_norm": 0.32306790351867676, "learning_rate": 4.3962192657189707e-07, "loss": 0.3961, "step": 3581 }, { "epoch": 2.637702503681885, "grad_norm": 0.32913732528686523, "learning_rate": 4.378667515507895e-07, "loss": 0.3701, "step": 3582 }, { "epoch": 2.638438880706922, "grad_norm": 0.3247510492801666, "learning_rate": 4.361149268224779e-07, "loss": 0.3603, "step": 3583 }, { "epoch": 2.6391752577319587, "grad_norm": 0.36141979694366455, "learning_rate": 4.34366453673446e-07, "loss": 0.3905, "step": 3584 }, { "epoch": 2.6399116347569955, "grad_norm": 0.31596043705940247, "learning_rate": 4.326213333877227e-07, "loss": 0.3527, "step": 3585 }, { "epoch": 2.6406480117820323, "grad_norm": 0.3106141984462738, "learning_rate": 4.308795672468713e-07, "loss": 0.3539, "step": 3586 }, { "epoch": 2.641384388807069, "grad_norm": 0.30769434571266174, "learning_rate": 4.291411565299902e-07, "loss": 0.3495, "step": 3587 }, { "epoch": 2.642120765832106, "grad_norm": 0.30311083793640137, "learning_rate": 4.2740610251371826e-07, "loss": 0.3988, "step": 3588 }, { "epoch": 2.642857142857143, "grad_norm": 0.3294108510017395, "learning_rate": 4.256744064722246e-07, "loss": 0.3609, "step": 3589 }, { "epoch": 2.6435935198821796, "grad_norm": 0.3281387686729431, "learning_rate": 4.2394606967721683e-07, "loss": 0.3651, "step": 3590 }, { "epoch": 2.6443298969072164, "grad_norm": 0.33501261472702026, "learning_rate": 4.222210933979326e-07, "loss": 0.3573, "step": 3591 }, { "epoch": 2.6450662739322532, "grad_norm": 0.35645949840545654, "learning_rate": 4.204994789011396e-07, "loss": 0.4035, "step": 3592 }, { "epoch": 2.64580265095729, "grad_norm": 0.3307779133319855, "learning_rate": 4.187812274511427e-07, "loss": 0.3763, "step": 3593 }, { "epoch": 2.646539027982327, "grad_norm": 0.3367134630680084, "learning_rate": 4.17066340309773e-07, "loss": 0.3359, "step": 3594 }, { "epoch": 2.6472754050073637, "grad_norm": 0.33751773834228516, "learning_rate": 4.153548187363904e-07, "loss": 0.3989, "step": 3595 }, { "epoch": 2.6480117820324005, "grad_norm": 0.29782766103744507, "learning_rate": 4.1364666398788613e-07, "loss": 0.3574, "step": 3596 }, { "epoch": 2.6487481590574373, "grad_norm": 0.3283192217350006, "learning_rate": 4.1194187731867783e-07, "loss": 0.3527, "step": 3597 }, { "epoch": 2.649484536082474, "grad_norm": 0.32778191566467285, "learning_rate": 4.102404599807075e-07, "loss": 0.4059, "step": 3598 }, { "epoch": 2.650220913107511, "grad_norm": 0.35166746377944946, "learning_rate": 4.0854241322344665e-07, "loss": 0.3882, "step": 3599 }, { "epoch": 2.6509572901325478, "grad_norm": 0.34499865770339966, "learning_rate": 4.0684773829388737e-07, "loss": 0.3763, "step": 3600 }, { "epoch": 2.6516936671575846, "grad_norm": 0.3195962607860565, "learning_rate": 4.0515643643655014e-07, "loss": 0.3616, "step": 3601 }, { "epoch": 2.6524300441826214, "grad_norm": 0.34196287393569946, "learning_rate": 4.034685088934737e-07, "loss": 0.3798, "step": 3602 }, { "epoch": 2.653166421207658, "grad_norm": 0.32536935806274414, "learning_rate": 4.0178395690422143e-07, "loss": 0.3623, "step": 3603 }, { "epoch": 2.653902798232695, "grad_norm": 0.3281365633010864, "learning_rate": 4.001027817058789e-07, "loss": 0.3523, "step": 3604 }, { "epoch": 2.654639175257732, "grad_norm": 0.32271289825439453, "learning_rate": 3.9842498453304955e-07, "loss": 0.3578, "step": 3605 }, { "epoch": 2.6553755522827687, "grad_norm": 0.3313661515712738, "learning_rate": 3.9675056661785563e-07, "loss": 0.3415, "step": 3606 }, { "epoch": 2.6561119293078055, "grad_norm": 0.31950780749320984, "learning_rate": 3.950795291899412e-07, "loss": 0.3595, "step": 3607 }, { "epoch": 2.6568483063328423, "grad_norm": 0.34532180428504944, "learning_rate": 3.934118734764647e-07, "loss": 0.3879, "step": 3608 }, { "epoch": 2.657584683357879, "grad_norm": 0.3165544271469116, "learning_rate": 3.9174760070210204e-07, "loss": 0.3682, "step": 3609 }, { "epoch": 2.658321060382916, "grad_norm": 0.33208510279655457, "learning_rate": 3.9008671208904503e-07, "loss": 0.3667, "step": 3610 }, { "epoch": 2.6590574374079528, "grad_norm": 0.31021976470947266, "learning_rate": 3.8842920885699906e-07, "loss": 0.3645, "step": 3611 }, { "epoch": 2.6597938144329896, "grad_norm": 0.32414910197257996, "learning_rate": 3.8677509222318557e-07, "loss": 0.3607, "step": 3612 }, { "epoch": 2.6605301914580264, "grad_norm": 0.33595502376556396, "learning_rate": 3.8512436340233826e-07, "loss": 0.3728, "step": 3613 }, { "epoch": 2.661266568483063, "grad_norm": 0.31958842277526855, "learning_rate": 3.8347702360670036e-07, "loss": 0.3722, "step": 3614 }, { "epoch": 2.6620029455081, "grad_norm": 0.3532390594482422, "learning_rate": 3.8183307404603074e-07, "loss": 0.3657, "step": 3615 }, { "epoch": 2.662739322533137, "grad_norm": 0.3612724840641022, "learning_rate": 3.8019251592759656e-07, "loss": 0.388, "step": 3616 }, { "epoch": 2.6634756995581736, "grad_norm": 0.3217339515686035, "learning_rate": 3.785553504561712e-07, "loss": 0.3428, "step": 3617 }, { "epoch": 2.6642120765832105, "grad_norm": 0.2959875762462616, "learning_rate": 3.769215788340419e-07, "loss": 0.3665, "step": 3618 }, { "epoch": 2.6649484536082473, "grad_norm": 0.3248257637023926, "learning_rate": 3.752912022610006e-07, "loss": 0.3725, "step": 3619 }, { "epoch": 2.665684830633284, "grad_norm": 0.3298156261444092, "learning_rate": 3.736642219343456e-07, "loss": 0.3673, "step": 3620 }, { "epoch": 2.666421207658321, "grad_norm": 0.32262980937957764, "learning_rate": 3.720406390488834e-07, "loss": 0.3928, "step": 3621 }, { "epoch": 2.6671575846833577, "grad_norm": 0.30602526664733887, "learning_rate": 3.7042045479692424e-07, "loss": 0.3647, "step": 3622 }, { "epoch": 2.6678939617083945, "grad_norm": 0.3311411142349243, "learning_rate": 3.6880367036828124e-07, "loss": 0.3779, "step": 3623 }, { "epoch": 2.6686303387334314, "grad_norm": 0.32879799604415894, "learning_rate": 3.671902869502736e-07, "loss": 0.3925, "step": 3624 }, { "epoch": 2.669366715758468, "grad_norm": 0.3125240206718445, "learning_rate": 3.6558030572772075e-07, "loss": 0.3984, "step": 3625 }, { "epoch": 2.670103092783505, "grad_norm": 0.3394649624824524, "learning_rate": 3.639737278829436e-07, "loss": 0.3657, "step": 3626 }, { "epoch": 2.670839469808542, "grad_norm": 0.34995949268341064, "learning_rate": 3.623705545957651e-07, "loss": 0.4049, "step": 3627 }, { "epoch": 2.6715758468335786, "grad_norm": 0.33862966299057007, "learning_rate": 3.607707870435062e-07, "loss": 0.3941, "step": 3628 }, { "epoch": 2.672312223858616, "grad_norm": 0.3330628573894501, "learning_rate": 3.5917442640098997e-07, "loss": 0.3516, "step": 3629 }, { "epoch": 2.6730486008836527, "grad_norm": 0.30992215871810913, "learning_rate": 3.575814738405331e-07, "loss": 0.3642, "step": 3630 }, { "epoch": 2.6737849779086895, "grad_norm": 0.3173280954360962, "learning_rate": 3.559919305319526e-07, "loss": 0.3858, "step": 3631 }, { "epoch": 2.6745213549337263, "grad_norm": 0.32444649934768677, "learning_rate": 3.544057976425619e-07, "loss": 0.3647, "step": 3632 }, { "epoch": 2.675257731958763, "grad_norm": 0.2939029932022095, "learning_rate": 3.528230763371687e-07, "loss": 0.359, "step": 3633 }, { "epoch": 2.6759941089838, "grad_norm": 0.3513110876083374, "learning_rate": 3.51243767778075e-07, "loss": 0.3474, "step": 3634 }, { "epoch": 2.676730486008837, "grad_norm": 0.3469097912311554, "learning_rate": 3.49667873125078e-07, "loss": 0.3416, "step": 3635 }, { "epoch": 2.6774668630338736, "grad_norm": 0.3251095116138458, "learning_rate": 3.480953935354658e-07, "loss": 0.3585, "step": 3636 }, { "epoch": 2.6782032400589104, "grad_norm": 0.31582942605018616, "learning_rate": 3.4652633016402205e-07, "loss": 0.3648, "step": 3637 }, { "epoch": 2.6789396170839472, "grad_norm": 0.31770941615104675, "learning_rate": 3.449606841630182e-07, "loss": 0.3846, "step": 3638 }, { "epoch": 2.679675994108984, "grad_norm": 0.3245222866535187, "learning_rate": 3.433984566822163e-07, "loss": 0.3573, "step": 3639 }, { "epoch": 2.680412371134021, "grad_norm": 0.33965256810188293, "learning_rate": 3.4183964886887135e-07, "loss": 0.3547, "step": 3640 }, { "epoch": 2.6811487481590577, "grad_norm": 0.34220483899116516, "learning_rate": 3.4028426186772435e-07, "loss": 0.3624, "step": 3641 }, { "epoch": 2.6818851251840945, "grad_norm": 0.32621511816978455, "learning_rate": 3.387322968210022e-07, "loss": 0.3795, "step": 3642 }, { "epoch": 2.6826215022091313, "grad_norm": 0.3251863121986389, "learning_rate": 3.3718375486842314e-07, "loss": 0.3535, "step": 3643 }, { "epoch": 2.683357879234168, "grad_norm": 0.32561933994293213, "learning_rate": 3.3563863714718927e-07, "loss": 0.3637, "step": 3644 }, { "epoch": 2.684094256259205, "grad_norm": 0.3464414179325104, "learning_rate": 3.340969447919873e-07, "loss": 0.3708, "step": 3645 }, { "epoch": 2.6848306332842418, "grad_norm": 0.3460819721221924, "learning_rate": 3.3255867893499105e-07, "loss": 0.366, "step": 3646 }, { "epoch": 2.6855670103092786, "grad_norm": 0.36896535754203796, "learning_rate": 3.3102384070585523e-07, "loss": 0.3482, "step": 3647 }, { "epoch": 2.6863033873343154, "grad_norm": 0.3020630478858948, "learning_rate": 3.2949243123171994e-07, "loss": 0.3855, "step": 3648 }, { "epoch": 2.687039764359352, "grad_norm": 0.3055335283279419, "learning_rate": 3.279644516372049e-07, "loss": 0.3841, "step": 3649 }, { "epoch": 2.687776141384389, "grad_norm": 0.3210819363594055, "learning_rate": 3.264399030444132e-07, "loss": 0.3661, "step": 3650 }, { "epoch": 2.688512518409426, "grad_norm": 0.3282409608364105, "learning_rate": 3.2491878657292643e-07, "loss": 0.3595, "step": 3651 }, { "epoch": 2.6892488954344627, "grad_norm": 0.3092752695083618, "learning_rate": 3.2340110333980656e-07, "loss": 0.3499, "step": 3652 }, { "epoch": 2.6899852724594995, "grad_norm": 0.31985411047935486, "learning_rate": 3.218868544595938e-07, "loss": 0.3618, "step": 3653 }, { "epoch": 2.6907216494845363, "grad_norm": 0.33014747500419617, "learning_rate": 3.20376041044308e-07, "loss": 0.3551, "step": 3654 }, { "epoch": 2.691458026509573, "grad_norm": 0.307456910610199, "learning_rate": 3.18868664203445e-07, "loss": 0.3722, "step": 3655 }, { "epoch": 2.69219440353461, "grad_norm": 0.35932809114456177, "learning_rate": 3.1736472504397485e-07, "loss": 0.3935, "step": 3656 }, { "epoch": 2.6929307805596467, "grad_norm": 0.3306938409805298, "learning_rate": 3.1586422467034695e-07, "loss": 0.379, "step": 3657 }, { "epoch": 2.6936671575846836, "grad_norm": 0.3393386900424957, "learning_rate": 3.143671641844831e-07, "loss": 0.4004, "step": 3658 }, { "epoch": 2.6944035346097204, "grad_norm": 0.3137757480144501, "learning_rate": 3.128735446857784e-07, "loss": 0.3594, "step": 3659 }, { "epoch": 2.695139911634757, "grad_norm": 0.33764535188674927, "learning_rate": 3.1138336727110307e-07, "loss": 0.3994, "step": 3660 }, { "epoch": 2.695876288659794, "grad_norm": 0.32632213830947876, "learning_rate": 3.098966330347969e-07, "loss": 0.3772, "step": 3661 }, { "epoch": 2.696612665684831, "grad_norm": 0.3276118040084839, "learning_rate": 3.0841334306867367e-07, "loss": 0.3561, "step": 3662 }, { "epoch": 2.6973490427098676, "grad_norm": 0.3348173201084137, "learning_rate": 3.06933498462017e-07, "loss": 0.357, "step": 3663 }, { "epoch": 2.6980854197349045, "grad_norm": 0.372728168964386, "learning_rate": 3.0545710030157824e-07, "loss": 0.3482, "step": 3664 }, { "epoch": 2.6988217967599413, "grad_norm": 0.34344926476478577, "learning_rate": 3.039841496715823e-07, "loss": 0.39, "step": 3665 }, { "epoch": 2.699558173784978, "grad_norm": 0.3021678924560547, "learning_rate": 3.0251464765371774e-07, "loss": 0.3888, "step": 3666 }, { "epoch": 2.700294550810015, "grad_norm": 0.3373548984527588, "learning_rate": 3.010485953271425e-07, "loss": 0.3803, "step": 3667 }, { "epoch": 2.7010309278350517, "grad_norm": 0.31935036182403564, "learning_rate": 2.9958599376848194e-07, "loss": 0.3878, "step": 3668 }, { "epoch": 2.7017673048600885, "grad_norm": 0.3091143071651459, "learning_rate": 2.9812684405182536e-07, "loss": 0.336, "step": 3669 }, { "epoch": 2.7025036818851254, "grad_norm": 0.325328528881073, "learning_rate": 2.9667114724872937e-07, "loss": 0.4175, "step": 3670 }, { "epoch": 2.703240058910162, "grad_norm": 0.31764811277389526, "learning_rate": 2.9521890442821276e-07, "loss": 0.3726, "step": 3671 }, { "epoch": 2.703976435935199, "grad_norm": 0.3514196276664734, "learning_rate": 2.9377011665675913e-07, "loss": 0.3768, "step": 3672 }, { "epoch": 2.704712812960236, "grad_norm": 0.3268696069717407, "learning_rate": 2.923247849983146e-07, "loss": 0.3878, "step": 3673 }, { "epoch": 2.7054491899852726, "grad_norm": 0.3018622100353241, "learning_rate": 2.908829105142874e-07, "loss": 0.3717, "step": 3674 }, { "epoch": 2.7061855670103094, "grad_norm": 0.28456223011016846, "learning_rate": 2.89444494263546e-07, "loss": 0.3918, "step": 3675 }, { "epoch": 2.7069219440353463, "grad_norm": 0.35831522941589355, "learning_rate": 2.8800953730242e-07, "loss": 0.3505, "step": 3676 }, { "epoch": 2.707658321060383, "grad_norm": 0.33213359117507935, "learning_rate": 2.865780406846985e-07, "loss": 0.3614, "step": 3677 }, { "epoch": 2.70839469808542, "grad_norm": 0.3209593594074249, "learning_rate": 2.85150005461628e-07, "loss": 0.3597, "step": 3678 }, { "epoch": 2.7091310751104567, "grad_norm": 0.3153224587440491, "learning_rate": 2.8372543268191723e-07, "loss": 0.3796, "step": 3679 }, { "epoch": 2.7098674521354935, "grad_norm": 0.3322901129722595, "learning_rate": 2.823043233917272e-07, "loss": 0.3776, "step": 3680 }, { "epoch": 2.7106038291605303, "grad_norm": 0.35406193137168884, "learning_rate": 2.8088667863467754e-07, "loss": 0.3886, "step": 3681 }, { "epoch": 2.711340206185567, "grad_norm": 0.3339800536632538, "learning_rate": 2.794724994518455e-07, "loss": 0.3846, "step": 3682 }, { "epoch": 2.712076583210604, "grad_norm": 0.33495986461639404, "learning_rate": 2.7806178688175977e-07, "loss": 0.3788, "step": 3683 }, { "epoch": 2.712812960235641, "grad_norm": 0.3515438437461853, "learning_rate": 2.7665454196040665e-07, "loss": 0.3865, "step": 3684 }, { "epoch": 2.7135493372606776, "grad_norm": 0.320209801197052, "learning_rate": 2.752507657212228e-07, "loss": 0.3725, "step": 3685 }, { "epoch": 2.7142857142857144, "grad_norm": 0.3267118036746979, "learning_rate": 2.738504591950991e-07, "loss": 0.3785, "step": 3686 }, { "epoch": 2.7150220913107512, "grad_norm": 0.34039753675460815, "learning_rate": 2.724536234103792e-07, "loss": 0.3622, "step": 3687 }, { "epoch": 2.715758468335788, "grad_norm": 0.33974501490592957, "learning_rate": 2.710602593928574e-07, "loss": 0.3561, "step": 3688 }, { "epoch": 2.716494845360825, "grad_norm": 0.3247467279434204, "learning_rate": 2.6967036816577643e-07, "loss": 0.3628, "step": 3689 }, { "epoch": 2.7172312223858617, "grad_norm": 0.3339967131614685, "learning_rate": 2.6828395074983195e-07, "loss": 0.3635, "step": 3690 }, { "epoch": 2.7179675994108985, "grad_norm": 0.31294745206832886, "learning_rate": 2.6690100816316675e-07, "loss": 0.3706, "step": 3691 }, { "epoch": 2.7187039764359353, "grad_norm": 0.3254539370536804, "learning_rate": 2.655215414213719e-07, "loss": 0.3796, "step": 3692 }, { "epoch": 2.719440353460972, "grad_norm": 0.32704704999923706, "learning_rate": 2.6414555153748635e-07, "loss": 0.3642, "step": 3693 }, { "epoch": 2.720176730486009, "grad_norm": 0.31729063391685486, "learning_rate": 2.627730395219941e-07, "loss": 0.3979, "step": 3694 }, { "epoch": 2.7209131075110458, "grad_norm": 0.3445037603378296, "learning_rate": 2.6140400638282826e-07, "loss": 0.3648, "step": 3695 }, { "epoch": 2.7216494845360826, "grad_norm": 0.3214196562767029, "learning_rate": 2.6003845312536526e-07, "loss": 0.397, "step": 3696 }, { "epoch": 2.7223858615611194, "grad_norm": 0.3424557149410248, "learning_rate": 2.5867638075242454e-07, "loss": 0.3801, "step": 3697 }, { "epoch": 2.723122238586156, "grad_norm": 0.36900395154953003, "learning_rate": 2.573177902642726e-07, "loss": 0.3874, "step": 3698 }, { "epoch": 2.723858615611193, "grad_norm": 0.33150961995124817, "learning_rate": 2.5596268265861646e-07, "loss": 0.3587, "step": 3699 }, { "epoch": 2.72459499263623, "grad_norm": 0.35658320784568787, "learning_rate": 2.5461105893060667e-07, "loss": 0.3807, "step": 3700 }, { "epoch": 2.7253313696612667, "grad_norm": 0.3134189248085022, "learning_rate": 2.532629200728343e-07, "loss": 0.3832, "step": 3701 }, { "epoch": 2.7260677466863035, "grad_norm": 0.3331991136074066, "learning_rate": 2.5191826707533173e-07, "loss": 0.3653, "step": 3702 }, { "epoch": 2.7268041237113403, "grad_norm": 0.3472428619861603, "learning_rate": 2.505771009255714e-07, "loss": 0.4057, "step": 3703 }, { "epoch": 2.727540500736377, "grad_norm": 0.36920398473739624, "learning_rate": 2.492394226084666e-07, "loss": 0.396, "step": 3704 }, { "epoch": 2.728276877761414, "grad_norm": 0.3479515314102173, "learning_rate": 2.479052331063658e-07, "loss": 0.3834, "step": 3705 }, { "epoch": 2.7290132547864507, "grad_norm": 0.3214065730571747, "learning_rate": 2.465745333990588e-07, "loss": 0.3581, "step": 3706 }, { "epoch": 2.7297496318114876, "grad_norm": 0.29844558238983154, "learning_rate": 2.4524732446377154e-07, "loss": 0.3715, "step": 3707 }, { "epoch": 2.7304860088365244, "grad_norm": 0.2953944504261017, "learning_rate": 2.439236072751644e-07, "loss": 0.3972, "step": 3708 }, { "epoch": 2.731222385861561, "grad_norm": 0.322376012802124, "learning_rate": 2.426033828053381e-07, "loss": 0.3785, "step": 3709 }, { "epoch": 2.731958762886598, "grad_norm": 0.32461515069007874, "learning_rate": 2.4128665202382327e-07, "loss": 0.3594, "step": 3710 }, { "epoch": 2.732695139911635, "grad_norm": 0.3040623664855957, "learning_rate": 2.3997341589758694e-07, "loss": 0.3436, "step": 3711 }, { "epoch": 2.7334315169366716, "grad_norm": 0.32611605525016785, "learning_rate": 2.3866367539103206e-07, "loss": 0.3653, "step": 3712 }, { "epoch": 2.7341678939617085, "grad_norm": 0.32935771346092224, "learning_rate": 2.37357431465991e-07, "loss": 0.3611, "step": 3713 }, { "epoch": 2.7349042709867453, "grad_norm": 0.3235970139503479, "learning_rate": 2.3605468508172968e-07, "loss": 0.3786, "step": 3714 }, { "epoch": 2.735640648011782, "grad_norm": 0.33503854274749756, "learning_rate": 2.3475543719494676e-07, "loss": 0.3587, "step": 3715 }, { "epoch": 2.736377025036819, "grad_norm": 0.324275404214859, "learning_rate": 2.3345968875977008e-07, "loss": 0.3482, "step": 3716 }, { "epoch": 2.7371134020618557, "grad_norm": 0.33510270714759827, "learning_rate": 2.3216744072775797e-07, "loss": 0.3949, "step": 3717 }, { "epoch": 2.7378497790868925, "grad_norm": 0.35356804728507996, "learning_rate": 2.3087869404789854e-07, "loss": 0.3781, "step": 3718 }, { "epoch": 2.7385861561119293, "grad_norm": 0.3195352554321289, "learning_rate": 2.2959344966660802e-07, "loss": 0.3679, "step": 3719 }, { "epoch": 2.739322533136966, "grad_norm": 0.3420256972312927, "learning_rate": 2.2831170852773198e-07, "loss": 0.3873, "step": 3720 }, { "epoch": 2.740058910162003, "grad_norm": 0.3301747441291809, "learning_rate": 2.2703347157254142e-07, "loss": 0.3652, "step": 3721 }, { "epoch": 2.74079528718704, "grad_norm": 0.3048621714115143, "learning_rate": 2.2575873973973485e-07, "loss": 0.3808, "step": 3722 }, { "epoch": 2.7415316642120766, "grad_norm": 0.31739065051078796, "learning_rate": 2.2448751396543788e-07, "loss": 0.3668, "step": 3723 }, { "epoch": 2.7422680412371134, "grad_norm": 0.3357614278793335, "learning_rate": 2.2321979518319992e-07, "loss": 0.4031, "step": 3724 }, { "epoch": 2.7430044182621502, "grad_norm": 0.3367345929145813, "learning_rate": 2.21955584323994e-07, "loss": 0.3808, "step": 3725 }, { "epoch": 2.743740795287187, "grad_norm": 0.3369562029838562, "learning_rate": 2.2069488231622083e-07, "loss": 0.3616, "step": 3726 }, { "epoch": 2.744477172312224, "grad_norm": 0.30571091175079346, "learning_rate": 2.1943769008569927e-07, "loss": 0.3605, "step": 3727 }, { "epoch": 2.7452135493372607, "grad_norm": 0.32707011699676514, "learning_rate": 2.1818400855567523e-07, "loss": 0.3632, "step": 3728 }, { "epoch": 2.7459499263622975, "grad_norm": 0.3019167482852936, "learning_rate": 2.1693383864681394e-07, "loss": 0.3751, "step": 3729 }, { "epoch": 2.7466863033873343, "grad_norm": 0.32666751742362976, "learning_rate": 2.1568718127720155e-07, "loss": 0.3796, "step": 3730 }, { "epoch": 2.747422680412371, "grad_norm": 0.3546668291091919, "learning_rate": 2.1444403736234686e-07, "loss": 0.3746, "step": 3731 }, { "epoch": 2.748159057437408, "grad_norm": 0.31046637892723083, "learning_rate": 2.132044078151768e-07, "loss": 0.3346, "step": 3732 }, { "epoch": 2.7488954344624448, "grad_norm": 0.3377501666545868, "learning_rate": 2.119682935460371e-07, "loss": 0.3892, "step": 3733 }, { "epoch": 2.7496318114874816, "grad_norm": 0.3156897723674774, "learning_rate": 2.1073569546269434e-07, "loss": 0.3731, "step": 3734 }, { "epoch": 2.7503681885125184, "grad_norm": 0.33260810375213623, "learning_rate": 2.095066144703295e-07, "loss": 0.3553, "step": 3735 }, { "epoch": 2.7511045655375552, "grad_norm": 0.351136177778244, "learning_rate": 2.0828105147154275e-07, "loss": 0.3744, "step": 3736 }, { "epoch": 2.751840942562592, "grad_norm": 0.3310754597187042, "learning_rate": 2.07059007366352e-07, "loss": 0.3813, "step": 3737 }, { "epoch": 2.752577319587629, "grad_norm": 0.3328613340854645, "learning_rate": 2.0584048305218874e-07, "loss": 0.3609, "step": 3738 }, { "epoch": 2.7533136966126657, "grad_norm": 0.31846439838409424, "learning_rate": 2.0462547942389942e-07, "loss": 0.3687, "step": 3739 }, { "epoch": 2.7540500736377025, "grad_norm": 0.29258909821510315, "learning_rate": 2.03413997373747e-07, "loss": 0.3837, "step": 3740 }, { "epoch": 2.7547864506627393, "grad_norm": 0.31229260563850403, "learning_rate": 2.0220603779140759e-07, "loss": 0.3641, "step": 3741 }, { "epoch": 2.755522827687776, "grad_norm": 0.31942838430404663, "learning_rate": 2.0100160156396986e-07, "loss": 0.3746, "step": 3742 }, { "epoch": 2.756259204712813, "grad_norm": 0.3302987515926361, "learning_rate": 1.998006895759347e-07, "loss": 0.4101, "step": 3743 }, { "epoch": 2.7569955817378498, "grad_norm": 0.32852962613105774, "learning_rate": 1.98603302709216e-07, "loss": 0.3463, "step": 3744 }, { "epoch": 2.7577319587628866, "grad_norm": 0.34027835726737976, "learning_rate": 1.9740944184313882e-07, "loss": 0.3922, "step": 3745 }, { "epoch": 2.7584683357879234, "grad_norm": 0.33216574788093567, "learning_rate": 1.9621910785443843e-07, "loss": 0.3951, "step": 3746 }, { "epoch": 2.75920471281296, "grad_norm": 0.3349026143550873, "learning_rate": 1.950323016172595e-07, "loss": 0.3729, "step": 3747 }, { "epoch": 2.759941089837997, "grad_norm": 0.31592321395874023, "learning_rate": 1.9384902400315764e-07, "loss": 0.3497, "step": 3748 }, { "epoch": 2.760677466863034, "grad_norm": 0.33262524008750916, "learning_rate": 1.926692758810955e-07, "loss": 0.3656, "step": 3749 }, { "epoch": 2.7614138438880707, "grad_norm": 0.318095326423645, "learning_rate": 1.9149305811744456e-07, "loss": 0.3944, "step": 3750 }, { "epoch": 2.7621502209131075, "grad_norm": 0.32870233058929443, "learning_rate": 1.9032037157598494e-07, "loss": 0.3801, "step": 3751 }, { "epoch": 2.7628865979381443, "grad_norm": 0.30096235871315, "learning_rate": 1.891512171178994e-07, "loss": 0.3364, "step": 3752 }, { "epoch": 2.763622974963181, "grad_norm": 0.33072760701179504, "learning_rate": 1.8798559560178174e-07, "loss": 0.3867, "step": 3753 }, { "epoch": 2.764359351988218, "grad_norm": 0.34106144309043884, "learning_rate": 1.8682350788362892e-07, "loss": 0.3824, "step": 3754 }, { "epoch": 2.7650957290132547, "grad_norm": 0.3276694416999817, "learning_rate": 1.856649548168421e-07, "loss": 0.3463, "step": 3755 }, { "epoch": 2.7658321060382915, "grad_norm": 0.3228393793106079, "learning_rate": 1.8450993725222856e-07, "loss": 0.3746, "step": 3756 }, { "epoch": 2.7665684830633284, "grad_norm": 0.3612479567527771, "learning_rate": 1.8335845603799806e-07, "loss": 0.3635, "step": 3757 }, { "epoch": 2.767304860088365, "grad_norm": 0.31587934494018555, "learning_rate": 1.8221051201976315e-07, "loss": 0.3564, "step": 3758 }, { "epoch": 2.768041237113402, "grad_norm": 0.3175513446331024, "learning_rate": 1.810661060405411e-07, "loss": 0.3706, "step": 3759 }, { "epoch": 2.768777614138439, "grad_norm": 0.3018471896648407, "learning_rate": 1.7992523894074688e-07, "loss": 0.3492, "step": 3760 }, { "epoch": 2.7695139911634756, "grad_norm": 0.3039242625236511, "learning_rate": 1.7878791155819918e-07, "loss": 0.3739, "step": 3761 }, { "epoch": 2.7702503681885124, "grad_norm": 0.306921124458313, "learning_rate": 1.776541247281177e-07, "loss": 0.3801, "step": 3762 }, { "epoch": 2.7709867452135493, "grad_norm": 0.3310092091560364, "learning_rate": 1.7652387928311977e-07, "loss": 0.381, "step": 3763 }, { "epoch": 2.771723122238586, "grad_norm": 0.3500930368900299, "learning_rate": 1.7539717605322527e-07, "loss": 0.3488, "step": 3764 }, { "epoch": 2.772459499263623, "grad_norm": 0.3258472681045532, "learning_rate": 1.7427401586585068e-07, "loss": 0.3674, "step": 3765 }, { "epoch": 2.7731958762886597, "grad_norm": 0.30289342999458313, "learning_rate": 1.731543995458096e-07, "loss": 0.3477, "step": 3766 }, { "epoch": 2.7739322533136965, "grad_norm": 0.29461464285850525, "learning_rate": 1.7203832791531594e-07, "loss": 0.3893, "step": 3767 }, { "epoch": 2.7746686303387333, "grad_norm": 0.3397156298160553, "learning_rate": 1.7092580179397856e-07, "loss": 0.3684, "step": 3768 }, { "epoch": 2.77540500736377, "grad_norm": 0.3199734687805176, "learning_rate": 1.6981682199880167e-07, "loss": 0.3924, "step": 3769 }, { "epoch": 2.776141384388807, "grad_norm": 0.3441019654273987, "learning_rate": 1.6871138934418884e-07, "loss": 0.3684, "step": 3770 }, { "epoch": 2.776877761413844, "grad_norm": 0.30693432688713074, "learning_rate": 1.676095046419346e-07, "loss": 0.3689, "step": 3771 }, { "epoch": 2.7776141384388806, "grad_norm": 0.3343988358974457, "learning_rate": 1.6651116870122997e-07, "loss": 0.3952, "step": 3772 }, { "epoch": 2.7783505154639174, "grad_norm": 0.3290632665157318, "learning_rate": 1.654163823286603e-07, "loss": 0.3819, "step": 3773 }, { "epoch": 2.7790868924889542, "grad_norm": 0.3326128423213959, "learning_rate": 1.6432514632820363e-07, "loss": 0.3808, "step": 3774 }, { "epoch": 2.779823269513991, "grad_norm": 0.3110780417919159, "learning_rate": 1.6323746150123e-07, "loss": 0.3911, "step": 3775 }, { "epoch": 2.780559646539028, "grad_norm": 0.3054400682449341, "learning_rate": 1.6215332864650434e-07, "loss": 0.3856, "step": 3776 }, { "epoch": 2.7812960235640647, "grad_norm": 0.3271074891090393, "learning_rate": 1.6107274856017763e-07, "loss": 0.3531, "step": 3777 }, { "epoch": 2.7820324005891015, "grad_norm": 0.3214995563030243, "learning_rate": 1.5999572203579783e-07, "loss": 0.3729, "step": 3778 }, { "epoch": 2.7827687776141383, "grad_norm": 0.3266492486000061, "learning_rate": 1.5892224986430006e-07, "loss": 0.384, "step": 3779 }, { "epoch": 2.783505154639175, "grad_norm": 0.2639772295951843, "learning_rate": 1.578523328340087e-07, "loss": 0.3665, "step": 3780 }, { "epoch": 2.784241531664212, "grad_norm": 0.36343199014663696, "learning_rate": 1.5678597173064026e-07, "loss": 0.3718, "step": 3781 }, { "epoch": 2.7849779086892488, "grad_norm": 0.36233747005462646, "learning_rate": 1.5572316733729775e-07, "loss": 0.3556, "step": 3782 }, { "epoch": 2.7857142857142856, "grad_norm": 0.33394989371299744, "learning_rate": 1.5466392043447132e-07, "loss": 0.3592, "step": 3783 }, { "epoch": 2.7864506627393224, "grad_norm": 0.3197415769100189, "learning_rate": 1.5360823180004146e-07, "loss": 0.3903, "step": 3784 }, { "epoch": 2.787187039764359, "grad_norm": 0.3251337707042694, "learning_rate": 1.5255610220927252e-07, "loss": 0.3953, "step": 3785 }, { "epoch": 2.787923416789396, "grad_norm": 0.33090558648109436, "learning_rate": 1.515075324348181e-07, "loss": 0.3659, "step": 3786 }, { "epoch": 2.788659793814433, "grad_norm": 0.31915581226348877, "learning_rate": 1.504625232467155e-07, "loss": 0.3764, "step": 3787 }, { "epoch": 2.7893961708394697, "grad_norm": 0.34577444195747375, "learning_rate": 1.4942107541238705e-07, "loss": 0.3593, "step": 3788 }, { "epoch": 2.7901325478645065, "grad_norm": 0.3278156816959381, "learning_rate": 1.48383189696642e-07, "loss": 0.3688, "step": 3789 }, { "epoch": 2.7908689248895433, "grad_norm": 0.31039220094680786, "learning_rate": 1.4734886686167182e-07, "loss": 0.3732, "step": 3790 }, { "epoch": 2.79160530191458, "grad_norm": 0.31084826588630676, "learning_rate": 1.4631810766705112e-07, "loss": 0.35, "step": 3791 }, { "epoch": 2.792341678939617, "grad_norm": 0.29743996262550354, "learning_rate": 1.4529091286973994e-07, "loss": 0.3907, "step": 3792 }, { "epoch": 2.7930780559646537, "grad_norm": 0.30235204100608826, "learning_rate": 1.4426728322407822e-07, "loss": 0.4054, "step": 3793 }, { "epoch": 2.7938144329896906, "grad_norm": 0.31857481598854065, "learning_rate": 1.4324721948178743e-07, "loss": 0.377, "step": 3794 }, { "epoch": 2.7945508100147274, "grad_norm": 0.3209660053253174, "learning_rate": 1.4223072239197333e-07, "loss": 0.3558, "step": 3795 }, { "epoch": 2.795287187039764, "grad_norm": 0.3144548833370209, "learning_rate": 1.412177927011199e-07, "loss": 0.3617, "step": 3796 }, { "epoch": 2.796023564064801, "grad_norm": 0.32282963395118713, "learning_rate": 1.4020843115309213e-07, "loss": 0.4057, "step": 3797 }, { "epoch": 2.796759941089838, "grad_norm": 0.3219519555568695, "learning_rate": 1.3920263848913484e-07, "loss": 0.3917, "step": 3798 }, { "epoch": 2.7974963181148746, "grad_norm": 0.32549378275871277, "learning_rate": 1.3820041544787167e-07, "loss": 0.3979, "step": 3799 }, { "epoch": 2.7982326951399115, "grad_norm": 0.36029815673828125, "learning_rate": 1.372017627653044e-07, "loss": 0.36, "step": 3800 }, { "epoch": 2.7989690721649483, "grad_norm": 0.29594072699546814, "learning_rate": 1.3620668117481471e-07, "loss": 0.3597, "step": 3801 }, { "epoch": 2.799705449189985, "grad_norm": 0.30883079767227173, "learning_rate": 1.3521517140715867e-07, "loss": 0.3621, "step": 3802 }, { "epoch": 2.800441826215022, "grad_norm": 0.29919102787971497, "learning_rate": 1.3422723419047267e-07, "loss": 0.3504, "step": 3803 }, { "epoch": 2.8011782032400587, "grad_norm": 0.3039191663265228, "learning_rate": 1.332428702502675e-07, "loss": 0.37, "step": 3804 }, { "epoch": 2.8019145802650955, "grad_norm": 0.31316202878952026, "learning_rate": 1.3226208030942934e-07, "loss": 0.3754, "step": 3805 }, { "epoch": 2.8026509572901324, "grad_norm": 0.2829436957836151, "learning_rate": 1.3128486508822202e-07, "loss": 0.3824, "step": 3806 }, { "epoch": 2.803387334315169, "grad_norm": 0.3167963922023773, "learning_rate": 1.3031122530428264e-07, "loss": 0.3699, "step": 3807 }, { "epoch": 2.804123711340206, "grad_norm": 0.3057669997215271, "learning_rate": 1.2934116167262145e-07, "loss": 0.3758, "step": 3808 }, { "epoch": 2.804860088365243, "grad_norm": 0.3250826895236969, "learning_rate": 1.2837467490562583e-07, "loss": 0.3437, "step": 3809 }, { "epoch": 2.8055964653902796, "grad_norm": 0.3518430292606354, "learning_rate": 1.274117657130536e-07, "loss": 0.3829, "step": 3810 }, { "epoch": 2.8063328424153164, "grad_norm": 0.32420122623443604, "learning_rate": 1.2645243480203574e-07, "loss": 0.3743, "step": 3811 }, { "epoch": 2.8070692194403533, "grad_norm": 0.3204830586910248, "learning_rate": 1.254966828770765e-07, "loss": 0.3848, "step": 3812 }, { "epoch": 2.80780559646539, "grad_norm": 0.3180425763130188, "learning_rate": 1.2454451064005058e-07, "loss": 0.39, "step": 3813 }, { "epoch": 2.808541973490427, "grad_norm": 0.32775330543518066, "learning_rate": 1.2359591879020528e-07, "loss": 0.376, "step": 3814 }, { "epoch": 2.8092783505154637, "grad_norm": 0.34739986062049866, "learning_rate": 1.2265090802415724e-07, "loss": 0.3723, "step": 3815 }, { "epoch": 2.8100147275405005, "grad_norm": 0.3330659866333008, "learning_rate": 1.217094790358936e-07, "loss": 0.3697, "step": 3816 }, { "epoch": 2.8107511045655373, "grad_norm": 0.30431681871414185, "learning_rate": 1.2077163251677182e-07, "loss": 0.3748, "step": 3817 }, { "epoch": 2.811487481590574, "grad_norm": 0.33776891231536865, "learning_rate": 1.1983736915551824e-07, "loss": 0.3889, "step": 3818 }, { "epoch": 2.812223858615611, "grad_norm": 0.3244916498661041, "learning_rate": 1.1890668963822793e-07, "loss": 0.379, "step": 3819 }, { "epoch": 2.812960235640648, "grad_norm": 0.3201790750026703, "learning_rate": 1.179795946483625e-07, "loss": 0.3578, "step": 3820 }, { "epoch": 2.8136966126656846, "grad_norm": 0.32924962043762207, "learning_rate": 1.170560848667529e-07, "loss": 0.3561, "step": 3821 }, { "epoch": 2.8144329896907214, "grad_norm": 0.318311870098114, "learning_rate": 1.1613616097159774e-07, "loss": 0.3588, "step": 3822 }, { "epoch": 2.8151693667157582, "grad_norm": 0.30944523215293884, "learning_rate": 1.1521982363846051e-07, "loss": 0.3534, "step": 3823 }, { "epoch": 2.815905743740795, "grad_norm": 0.3113093078136444, "learning_rate": 1.1430707354027182e-07, "loss": 0.3749, "step": 3824 }, { "epoch": 2.816642120765832, "grad_norm": 0.298728346824646, "learning_rate": 1.1339791134732769e-07, "loss": 0.3704, "step": 3825 }, { "epoch": 2.8173784977908687, "grad_norm": 0.30721381306648254, "learning_rate": 1.1249233772729018e-07, "loss": 0.403, "step": 3826 }, { "epoch": 2.8181148748159055, "grad_norm": 0.3090912997722626, "learning_rate": 1.1159035334518343e-07, "loss": 0.3721, "step": 3827 }, { "epoch": 2.8188512518409423, "grad_norm": 0.30627208948135376, "learning_rate": 1.1069195886339923e-07, "loss": 0.3677, "step": 3828 }, { "epoch": 2.819587628865979, "grad_norm": 0.32268771529197693, "learning_rate": 1.0979715494169096e-07, "loss": 0.3591, "step": 3829 }, { "epoch": 2.820324005891016, "grad_norm": 0.33059263229370117, "learning_rate": 1.089059422371741e-07, "loss": 0.3579, "step": 3830 }, { "epoch": 2.8210603829160528, "grad_norm": 0.3322978913784027, "learning_rate": 1.0801832140433066e-07, "loss": 0.3856, "step": 3831 }, { "epoch": 2.8217967599410896, "grad_norm": 0.3347814977169037, "learning_rate": 1.071342930950009e-07, "loss": 0.3722, "step": 3832 }, { "epoch": 2.8225331369661264, "grad_norm": 0.34562426805496216, "learning_rate": 1.0625385795838883e-07, "loss": 0.3687, "step": 3833 }, { "epoch": 2.823269513991163, "grad_norm": 0.3513891100883484, "learning_rate": 1.0537701664106003e-07, "loss": 0.3737, "step": 3834 }, { "epoch": 2.8240058910162, "grad_norm": 0.31909000873565674, "learning_rate": 1.0450376978693999e-07, "loss": 0.3695, "step": 3835 }, { "epoch": 2.824742268041237, "grad_norm": 0.3165963292121887, "learning_rate": 1.0363411803731404e-07, "loss": 0.3876, "step": 3836 }, { "epoch": 2.8254786450662737, "grad_norm": 0.32449856400489807, "learning_rate": 1.0276806203082967e-07, "loss": 0.3678, "step": 3837 }, { "epoch": 2.8262150220913105, "grad_norm": 0.3418518006801605, "learning_rate": 1.0190560240349035e-07, "loss": 0.3594, "step": 3838 }, { "epoch": 2.8269513991163473, "grad_norm": 0.32669028639793396, "learning_rate": 1.0104673978866164e-07, "loss": 0.3804, "step": 3839 }, { "epoch": 2.827687776141384, "grad_norm": 0.2968961298465729, "learning_rate": 1.0019147481706626e-07, "loss": 0.3726, "step": 3840 }, { "epoch": 2.8284241531664214, "grad_norm": 0.3117745816707611, "learning_rate": 9.933980811678401e-08, "loss": 0.3972, "step": 3841 }, { "epoch": 2.829160530191458, "grad_norm": 0.3052343428134918, "learning_rate": 9.84917403132546e-08, "loss": 0.3743, "step": 3842 }, { "epoch": 2.829896907216495, "grad_norm": 0.3451398015022278, "learning_rate": 9.764727202927259e-08, "loss": 0.3921, "step": 3843 }, { "epoch": 2.830633284241532, "grad_norm": 0.3272383213043213, "learning_rate": 9.680640388498974e-08, "loss": 0.3752, "step": 3844 }, { "epoch": 2.8313696612665686, "grad_norm": 0.3403890132904053, "learning_rate": 9.596913649791484e-08, "loss": 0.3564, "step": 3845 }, { "epoch": 2.8321060382916055, "grad_norm": 0.3162234425544739, "learning_rate": 9.51354704829105e-08, "loss": 0.3515, "step": 3846 }, { "epoch": 2.8328424153166423, "grad_norm": 0.31846678256988525, "learning_rate": 9.430540645219755e-08, "loss": 0.374, "step": 3847 }, { "epoch": 2.833578792341679, "grad_norm": 0.32757052779197693, "learning_rate": 9.347894501534949e-08, "loss": 0.3537, "step": 3848 }, { "epoch": 2.834315169366716, "grad_norm": 0.32147282361984253, "learning_rate": 9.26560867792936e-08, "loss": 0.3824, "step": 3849 }, { "epoch": 2.8350515463917527, "grad_norm": 0.34874793887138367, "learning_rate": 9.18368323483132e-08, "loss": 0.3895, "step": 3850 }, { "epoch": 2.8357879234167895, "grad_norm": 0.29230934381484985, "learning_rate": 9.102118232404311e-08, "loss": 0.391, "step": 3851 }, { "epoch": 2.8365243004418264, "grad_norm": 0.33275964856147766, "learning_rate": 9.020913730547309e-08, "loss": 0.3728, "step": 3852 }, { "epoch": 2.837260677466863, "grad_norm": 0.36540740728378296, "learning_rate": 8.940069788894389e-08, "loss": 0.3738, "step": 3853 }, { "epoch": 2.8379970544919, "grad_norm": 0.29792320728302, "learning_rate": 8.859586466814895e-08, "loss": 0.3478, "step": 3854 }, { "epoch": 2.838733431516937, "grad_norm": 0.35100606083869934, "learning_rate": 8.77946382341327e-08, "loss": 0.3724, "step": 3855 }, { "epoch": 2.8394698085419736, "grad_norm": 0.31920912861824036, "learning_rate": 8.699701917529335e-08, "loss": 0.3789, "step": 3856 }, { "epoch": 2.8402061855670104, "grad_norm": 0.3164528012275696, "learning_rate": 8.62030080773768e-08, "loss": 0.3715, "step": 3857 }, { "epoch": 2.8409425625920472, "grad_norm": 0.3255048990249634, "learning_rate": 8.541260552348107e-08, "loss": 0.3844, "step": 3858 }, { "epoch": 2.841678939617084, "grad_norm": 0.3301268517971039, "learning_rate": 8.462581209405519e-08, "loss": 0.4026, "step": 3859 }, { "epoch": 2.842415316642121, "grad_norm": 0.3406490087509155, "learning_rate": 8.384262836689472e-08, "loss": 0.3798, "step": 3860 }, { "epoch": 2.8431516936671577, "grad_norm": 0.30102699995040894, "learning_rate": 8.306305491714683e-08, "loss": 0.3826, "step": 3861 }, { "epoch": 2.8438880706921945, "grad_norm": 0.30409371852874756, "learning_rate": 8.228709231730747e-08, "loss": 0.3754, "step": 3862 }, { "epoch": 2.8446244477172313, "grad_norm": 0.30450791120529175, "learning_rate": 8.151474113721803e-08, "loss": 0.381, "step": 3863 }, { "epoch": 2.845360824742268, "grad_norm": 0.3098084628582001, "learning_rate": 8.074600194407257e-08, "loss": 0.3558, "step": 3864 }, { "epoch": 2.846097201767305, "grad_norm": 0.315789133310318, "learning_rate": 7.998087530240784e-08, "loss": 0.3601, "step": 3865 }, { "epoch": 2.846833578792342, "grad_norm": 0.3192322552204132, "learning_rate": 7.921936177411049e-08, "loss": 0.4126, "step": 3866 }, { "epoch": 2.8475699558173786, "grad_norm": 0.3339972198009491, "learning_rate": 7.846146191841319e-08, "loss": 0.4005, "step": 3867 }, { "epoch": 2.8483063328424154, "grad_norm": 0.31077709794044495, "learning_rate": 7.770717629189462e-08, "loss": 0.3869, "step": 3868 }, { "epoch": 2.8490427098674522, "grad_norm": 0.327124685049057, "learning_rate": 7.695650544847888e-08, "loss": 0.3628, "step": 3869 }, { "epoch": 2.849779086892489, "grad_norm": 0.32845091819763184, "learning_rate": 7.620944993943669e-08, "loss": 0.37, "step": 3870 }, { "epoch": 2.850515463917526, "grad_norm": 0.32456862926483154, "learning_rate": 7.546601031338252e-08, "loss": 0.3459, "step": 3871 }, { "epoch": 2.8512518409425627, "grad_norm": 0.31279054284095764, "learning_rate": 7.472618711627577e-08, "loss": 0.3533, "step": 3872 }, { "epoch": 2.8519882179675995, "grad_norm": 0.31263667345046997, "learning_rate": 7.398998089142128e-08, "loss": 0.3837, "step": 3873 }, { "epoch": 2.8527245949926363, "grad_norm": 0.31096217036247253, "learning_rate": 7.325739217946547e-08, "loss": 0.4039, "step": 3874 }, { "epoch": 2.853460972017673, "grad_norm": 0.31137025356292725, "learning_rate": 7.252842151839967e-08, "loss": 0.3707, "step": 3875 }, { "epoch": 2.85419734904271, "grad_norm": 0.3208453953266144, "learning_rate": 7.180306944355896e-08, "loss": 0.3662, "step": 3876 }, { "epoch": 2.8549337260677468, "grad_norm": 0.3515404164791107, "learning_rate": 7.108133648761839e-08, "loss": 0.3336, "step": 3877 }, { "epoch": 2.8556701030927836, "grad_norm": 0.32757940888404846, "learning_rate": 7.036322318059785e-08, "loss": 0.3916, "step": 3878 }, { "epoch": 2.8564064801178204, "grad_norm": 0.3318811357021332, "learning_rate": 6.964873004985717e-08, "loss": 0.339, "step": 3879 }, { "epoch": 2.857142857142857, "grad_norm": 0.28970828652381897, "learning_rate": 6.893785762009942e-08, "loss": 0.376, "step": 3880 }, { "epoch": 2.857879234167894, "grad_norm": 0.30351996421813965, "learning_rate": 6.823060641336809e-08, "loss": 0.3666, "step": 3881 }, { "epoch": 2.858615611192931, "grad_norm": 0.2939937710762024, "learning_rate": 6.752697694904553e-08, "loss": 0.3697, "step": 3882 }, { "epoch": 2.8593519882179677, "grad_norm": 0.31340742111206055, "learning_rate": 6.682696974385727e-08, "loss": 0.3581, "step": 3883 }, { "epoch": 2.8600883652430045, "grad_norm": 0.3075888156890869, "learning_rate": 6.613058531186767e-08, "loss": 0.4048, "step": 3884 }, { "epoch": 2.8608247422680413, "grad_norm": 0.3178377151489258, "learning_rate": 6.54378241644793e-08, "loss": 0.3927, "step": 3885 }, { "epoch": 2.861561119293078, "grad_norm": 0.31362468004226685, "learning_rate": 6.474868681043578e-08, "loss": 0.3953, "step": 3886 }, { "epoch": 2.862297496318115, "grad_norm": 0.3346673250198364, "learning_rate": 6.406317375581839e-08, "loss": 0.3578, "step": 3887 }, { "epoch": 2.8630338733431517, "grad_norm": 0.31339147686958313, "learning_rate": 6.338128550404721e-08, "loss": 0.377, "step": 3888 }, { "epoch": 2.8637702503681886, "grad_norm": 0.31569039821624756, "learning_rate": 6.270302255588112e-08, "loss": 0.372, "step": 3889 }, { "epoch": 2.8645066273932254, "grad_norm": 0.33136993646621704, "learning_rate": 6.202838540941503e-08, "loss": 0.4067, "step": 3890 }, { "epoch": 2.865243004418262, "grad_norm": 0.3213663101196289, "learning_rate": 6.135737456008207e-08, "loss": 0.3936, "step": 3891 }, { "epoch": 2.865979381443299, "grad_norm": 0.29569748044013977, "learning_rate": 6.06899905006525e-08, "loss": 0.3815, "step": 3892 }, { "epoch": 2.866715758468336, "grad_norm": 0.31531018018722534, "learning_rate": 6.002623372123373e-08, "loss": 0.377, "step": 3893 }, { "epoch": 2.8674521354933726, "grad_norm": 0.31117159128189087, "learning_rate": 5.9366104709267515e-08, "loss": 0.3674, "step": 3894 }, { "epoch": 2.8681885125184094, "grad_norm": 0.3713977634906769, "learning_rate": 5.8709603949533844e-08, "loss": 0.3724, "step": 3895 }, { "epoch": 2.8689248895434463, "grad_norm": 0.30198606848716736, "learning_rate": 5.805673192414596e-08, "loss": 0.4026, "step": 3896 }, { "epoch": 2.869661266568483, "grad_norm": 0.30127283930778503, "learning_rate": 5.740748911255367e-08, "loss": 0.3914, "step": 3897 }, { "epoch": 2.87039764359352, "grad_norm": 0.3306448757648468, "learning_rate": 5.6761875991541704e-08, "loss": 0.3497, "step": 3898 }, { "epoch": 2.8711340206185567, "grad_norm": 0.3268984258174896, "learning_rate": 5.611989303522858e-08, "loss": 0.3916, "step": 3899 }, { "epoch": 2.8718703976435935, "grad_norm": 0.3211970627307892, "learning_rate": 5.5481540715066616e-08, "loss": 0.365, "step": 3900 }, { "epoch": 2.8726067746686303, "grad_norm": 0.336363822221756, "learning_rate": 5.4846819499843605e-08, "loss": 0.3842, "step": 3901 }, { "epoch": 2.873343151693667, "grad_norm": 0.3143905997276306, "learning_rate": 5.4215729855678914e-08, "loss": 0.3844, "step": 3902 }, { "epoch": 2.874079528718704, "grad_norm": 0.35480746626853943, "learning_rate": 5.35882722460257e-08, "loss": 0.3772, "step": 3903 }, { "epoch": 2.874815905743741, "grad_norm": 0.3486427366733551, "learning_rate": 5.296444713166981e-08, "loss": 0.3757, "step": 3904 }, { "epoch": 2.8755522827687776, "grad_norm": 0.3225955367088318, "learning_rate": 5.234425497072981e-08, "loss": 0.3417, "step": 3905 }, { "epoch": 2.8762886597938144, "grad_norm": 0.31462594866752625, "learning_rate": 5.172769621865637e-08, "loss": 0.4036, "step": 3906 }, { "epoch": 2.8770250368188512, "grad_norm": 0.3221474885940552, "learning_rate": 5.1114771328230615e-08, "loss": 0.3568, "step": 3907 }, { "epoch": 2.877761413843888, "grad_norm": 0.319514662027359, "learning_rate": 5.050548074956696e-08, "loss": 0.359, "step": 3908 }, { "epoch": 2.878497790868925, "grad_norm": 0.32837367057800293, "learning_rate": 4.9899824930109694e-08, "loss": 0.3702, "step": 3909 }, { "epoch": 2.8792341678939617, "grad_norm": 0.30610325932502747, "learning_rate": 4.9297804314633604e-08, "loss": 0.3641, "step": 3910 }, { "epoch": 2.8799705449189985, "grad_norm": 0.32444047927856445, "learning_rate": 4.869941934524613e-08, "loss": 0.3677, "step": 3911 }, { "epoch": 2.8807069219440353, "grad_norm": 0.3461378216743469, "learning_rate": 4.810467046138134e-08, "loss": 0.3876, "step": 3912 }, { "epoch": 2.881443298969072, "grad_norm": 0.32242852449417114, "learning_rate": 4.75135580998054e-08, "loss": 0.349, "step": 3913 }, { "epoch": 2.882179675994109, "grad_norm": 0.31807756423950195, "learning_rate": 4.69260826946133e-08, "loss": 0.3642, "step": 3914 }, { "epoch": 2.8829160530191458, "grad_norm": 0.31694477796554565, "learning_rate": 4.634224467722992e-08, "loss": 0.3758, "step": 3915 }, { "epoch": 2.8836524300441826, "grad_norm": 0.343872994184494, "learning_rate": 4.576204447640675e-08, "loss": 0.3576, "step": 3916 }, { "epoch": 2.8843888070692194, "grad_norm": 0.3494185209274292, "learning_rate": 4.518548251822685e-08, "loss": 0.3636, "step": 3917 }, { "epoch": 2.8851251840942562, "grad_norm": 0.32793089747428894, "learning_rate": 4.461255922609986e-08, "loss": 0.3672, "step": 3918 }, { "epoch": 2.885861561119293, "grad_norm": 0.36390119791030884, "learning_rate": 4.4043275020762e-08, "loss": 0.3504, "step": 3919 }, { "epoch": 2.88659793814433, "grad_norm": 0.3462429344654083, "learning_rate": 4.3477630320279405e-08, "loss": 0.3632, "step": 3920 }, { "epoch": 2.8873343151693667, "grad_norm": 0.3237389326095581, "learning_rate": 4.291562554004369e-08, "loss": 0.3802, "step": 3921 }, { "epoch": 2.8880706921944035, "grad_norm": 0.3111920654773712, "learning_rate": 4.235726109277527e-08, "loss": 0.3409, "step": 3922 }, { "epoch": 2.8888070692194403, "grad_norm": 0.2972390949726105, "learning_rate": 4.180253738851947e-08, "loss": 0.3955, "step": 3923 }, { "epoch": 2.889543446244477, "grad_norm": 0.32099413871765137, "learning_rate": 4.125145483464821e-08, "loss": 0.3673, "step": 3924 }, { "epoch": 2.890279823269514, "grad_norm": 0.31523367762565613, "learning_rate": 4.070401383586109e-08, "loss": 0.354, "step": 3925 }, { "epoch": 2.8910162002945508, "grad_norm": 0.3093765079975128, "learning_rate": 4.0160214794180976e-08, "loss": 0.38, "step": 3926 }, { "epoch": 2.8917525773195876, "grad_norm": 0.3105464577674866, "learning_rate": 3.962005810895786e-08, "loss": 0.3578, "step": 3927 }, { "epoch": 2.8924889543446244, "grad_norm": 0.3610590994358063, "learning_rate": 3.908354417686722e-08, "loss": 0.3708, "step": 3928 }, { "epoch": 2.893225331369661, "grad_norm": 0.328524112701416, "learning_rate": 3.855067339190721e-08, "loss": 0.3849, "step": 3929 }, { "epoch": 2.893961708394698, "grad_norm": 0.3390013873577118, "learning_rate": 3.802144614540315e-08, "loss": 0.3637, "step": 3930 }, { "epoch": 2.894698085419735, "grad_norm": 0.3347131907939911, "learning_rate": 3.749586282600359e-08, "loss": 0.3575, "step": 3931 }, { "epoch": 2.8954344624447717, "grad_norm": 0.3055760860443115, "learning_rate": 3.6973923819680344e-08, "loss": 0.3632, "step": 3932 }, { "epoch": 2.8961708394698085, "grad_norm": 0.31726714968681335, "learning_rate": 3.645562950973014e-08, "loss": 0.3791, "step": 3933 }, { "epoch": 2.8969072164948453, "grad_norm": 0.34960147738456726, "learning_rate": 3.5940980276772394e-08, "loss": 0.3802, "step": 3934 }, { "epoch": 2.897643593519882, "grad_norm": 0.3189311921596527, "learning_rate": 3.5429976498749794e-08, "loss": 0.3893, "step": 3935 }, { "epoch": 2.898379970544919, "grad_norm": 0.3356260359287262, "learning_rate": 3.492261855092938e-08, "loss": 0.3665, "step": 3936 }, { "epoch": 2.8991163475699557, "grad_norm": 0.31776583194732666, "learning_rate": 3.441890680589754e-08, "loss": 0.3619, "step": 3937 }, { "epoch": 2.8998527245949925, "grad_norm": 0.3313503563404083, "learning_rate": 3.391884163356618e-08, "loss": 0.3877, "step": 3938 }, { "epoch": 2.9005891016200294, "grad_norm": 0.38583609461784363, "learning_rate": 3.3422423401167634e-08, "loss": 0.3696, "step": 3939 }, { "epoch": 2.901325478645066, "grad_norm": 0.33413827419281006, "learning_rate": 3.292965247325641e-08, "loss": 0.373, "step": 3940 }, { "epoch": 2.902061855670103, "grad_norm": 0.331826388835907, "learning_rate": 3.2440529211709146e-08, "loss": 0.3597, "step": 3941 }, { "epoch": 2.90279823269514, "grad_norm": 0.33604195713996887, "learning_rate": 3.19550539757224e-08, "loss": 0.3729, "step": 3942 }, { "epoch": 2.9035346097201766, "grad_norm": 0.3193381130695343, "learning_rate": 3.147322712181489e-08, "loss": 0.351, "step": 3943 }, { "epoch": 2.9042709867452134, "grad_norm": 0.30179837346076965, "learning_rate": 3.0995049003826325e-08, "loss": 0.3728, "step": 3944 }, { "epoch": 2.9050073637702503, "grad_norm": 0.32953178882598877, "learning_rate": 3.052051997291527e-08, "loss": 0.3682, "step": 3945 }, { "epoch": 2.905743740795287, "grad_norm": 0.3493231236934662, "learning_rate": 3.0049640377561865e-08, "loss": 0.36, "step": 3946 }, { "epoch": 2.906480117820324, "grad_norm": 0.332707941532135, "learning_rate": 2.9582410563565587e-08, "loss": 0.3774, "step": 3947 }, { "epoch": 2.9072164948453607, "grad_norm": 0.326961874961853, "learning_rate": 2.9118830874046988e-08, "loss": 0.3633, "step": 3948 }, { "epoch": 2.9079528718703975, "grad_norm": 0.34812161326408386, "learning_rate": 2.8658901649443183e-08, "loss": 0.3605, "step": 3949 }, { "epoch": 2.9086892488954343, "grad_norm": 0.32744961977005005, "learning_rate": 2.8202623227513993e-08, "loss": 0.3688, "step": 3950 }, { "epoch": 2.909425625920471, "grad_norm": 0.3743055462837219, "learning_rate": 2.7749995943335272e-08, "loss": 0.3543, "step": 3951 }, { "epoch": 2.910162002945508, "grad_norm": 0.3317681550979614, "learning_rate": 2.730102012930336e-08, "loss": 0.3662, "step": 3952 }, { "epoch": 2.910898379970545, "grad_norm": 0.36964574456214905, "learning_rate": 2.6855696115133388e-08, "loss": 0.3509, "step": 3953 }, { "epoch": 2.9116347569955816, "grad_norm": 0.3326326608657837, "learning_rate": 2.6414024227855994e-08, "loss": 0.3931, "step": 3954 }, { "epoch": 2.9123711340206184, "grad_norm": 0.3078947961330414, "learning_rate": 2.597600479182283e-08, "loss": 0.3566, "step": 3955 }, { "epoch": 2.9131075110456552, "grad_norm": 0.3276365399360657, "learning_rate": 2.5541638128702694e-08, "loss": 0.3809, "step": 3956 }, { "epoch": 2.913843888070692, "grad_norm": 0.31601178646087646, "learning_rate": 2.511092455747932e-08, "loss": 0.4119, "step": 3957 }, { "epoch": 2.914580265095729, "grad_norm": 0.34153667092323303, "learning_rate": 2.4683864394458023e-08, "loss": 0.3436, "step": 3958 }, { "epoch": 2.9153166421207657, "grad_norm": 0.3269186019897461, "learning_rate": 2.4260457953257377e-08, "loss": 0.3722, "step": 3959 }, { "epoch": 2.9160530191458025, "grad_norm": 0.3348276913166046, "learning_rate": 2.3840705544815324e-08, "loss": 0.3758, "step": 3960 }, { "epoch": 2.9167893961708393, "grad_norm": 0.3003559708595276, "learning_rate": 2.3424607477384176e-08, "loss": 0.3738, "step": 3961 }, { "epoch": 2.917525773195876, "grad_norm": 0.316893607378006, "learning_rate": 2.3012164056534503e-08, "loss": 0.3652, "step": 3962 }, { "epoch": 2.918262150220913, "grad_norm": 0.3250206410884857, "learning_rate": 2.260337558515291e-08, "loss": 0.3752, "step": 3963 }, { "epoch": 2.9189985272459498, "grad_norm": 0.34431540966033936, "learning_rate": 2.2198242363439814e-08, "loss": 0.3506, "step": 3964 }, { "epoch": 2.9197349042709866, "grad_norm": 0.3138362765312195, "learning_rate": 2.179676468891334e-08, "loss": 0.3998, "step": 3965 }, { "epoch": 2.9204712812960234, "grad_norm": 0.33245036005973816, "learning_rate": 2.1398942856407646e-08, "loss": 0.3787, "step": 3966 }, { "epoch": 2.92120765832106, "grad_norm": 0.34872448444366455, "learning_rate": 2.100477715806959e-08, "loss": 0.3701, "step": 3967 }, { "epoch": 2.9219440353460975, "grad_norm": 0.3229193091392517, "learning_rate": 2.061426788336318e-08, "loss": 0.3846, "step": 3968 }, { "epoch": 2.9226804123711343, "grad_norm": 0.3165433406829834, "learning_rate": 2.0227415319067355e-08, "loss": 0.3617, "step": 3969 }, { "epoch": 2.923416789396171, "grad_norm": 0.34778398275375366, "learning_rate": 1.984421974927375e-08, "loss": 0.3858, "step": 3970 }, { "epoch": 2.924153166421208, "grad_norm": 0.31038519740104675, "learning_rate": 1.946468145538949e-08, "loss": 0.3647, "step": 3971 }, { "epoch": 2.9248895434462447, "grad_norm": 0.2943412959575653, "learning_rate": 1.908880071613717e-08, "loss": 0.3628, "step": 3972 }, { "epoch": 2.9256259204712816, "grad_norm": 0.29908487200737, "learning_rate": 1.871657780755154e-08, "loss": 0.3674, "step": 3973 }, { "epoch": 2.9263622974963184, "grad_norm": 0.3493654131889343, "learning_rate": 1.8348013002982278e-08, "loss": 0.3899, "step": 3974 }, { "epoch": 2.927098674521355, "grad_norm": 0.33574777841567993, "learning_rate": 1.798310657309177e-08, "loss": 0.3621, "step": 3975 }, { "epoch": 2.927835051546392, "grad_norm": 0.3295884430408478, "learning_rate": 1.7621858785856206e-08, "loss": 0.3744, "step": 3976 }, { "epoch": 2.928571428571429, "grad_norm": 0.31094932556152344, "learning_rate": 1.72642699065656e-08, "loss": 0.3566, "step": 3977 }, { "epoch": 2.9293078055964656, "grad_norm": 0.34367263317108154, "learning_rate": 1.6910340197822116e-08, "loss": 0.3691, "step": 3978 }, { "epoch": 2.9300441826215025, "grad_norm": 0.32079145312309265, "learning_rate": 1.6560069919541177e-08, "loss": 0.3693, "step": 3979 }, { "epoch": 2.9307805596465393, "grad_norm": 0.3374221920967102, "learning_rate": 1.6213459328950355e-08, "loss": 0.3858, "step": 3980 }, { "epoch": 2.931516936671576, "grad_norm": 0.34414032101631165, "learning_rate": 1.5870508680589923e-08, "loss": 0.349, "step": 3981 }, { "epoch": 2.932253313696613, "grad_norm": 0.3187008202075958, "learning_rate": 1.5531218226312872e-08, "loss": 0.3836, "step": 3982 }, { "epoch": 2.9329896907216497, "grad_norm": 0.3022221624851227, "learning_rate": 1.5195588215283773e-08, "loss": 0.3954, "step": 3983 }, { "epoch": 2.9337260677466865, "grad_norm": 0.29533877968788147, "learning_rate": 1.4863618893979359e-08, "loss": 0.3874, "step": 3984 }, { "epoch": 2.9344624447717234, "grad_norm": 0.3319675624370575, "learning_rate": 1.4535310506187394e-08, "loss": 0.3379, "step": 3985 }, { "epoch": 2.93519882179676, "grad_norm": 0.3372650444507599, "learning_rate": 1.4210663293008353e-08, "loss": 0.3825, "step": 3986 }, { "epoch": 2.935935198821797, "grad_norm": 0.3170906901359558, "learning_rate": 1.3889677492852083e-08, "loss": 0.3742, "step": 3987 }, { "epoch": 2.936671575846834, "grad_norm": 0.2827453315258026, "learning_rate": 1.3572353341442246e-08, "loss": 0.3538, "step": 3988 }, { "epoch": 2.9374079528718706, "grad_norm": 0.31764787435531616, "learning_rate": 1.3258691071811325e-08, "loss": 0.357, "step": 3989 }, { "epoch": 2.9381443298969074, "grad_norm": 0.31900376081466675, "learning_rate": 1.2948690914303397e-08, "loss": 0.3661, "step": 3990 }, { "epoch": 2.9388807069219443, "grad_norm": 0.32145196199417114, "learning_rate": 1.2642353096573578e-08, "loss": 0.3865, "step": 3991 }, { "epoch": 2.939617083946981, "grad_norm": 0.3384080231189728, "learning_rate": 1.2339677843586917e-08, "loss": 0.3477, "step": 3992 }, { "epoch": 2.940353460972018, "grad_norm": 0.30081725120544434, "learning_rate": 1.2040665377618944e-08, "loss": 0.38, "step": 3993 }, { "epoch": 2.9410898379970547, "grad_norm": 0.3142537772655487, "learning_rate": 1.1745315918255118e-08, "loss": 0.3647, "step": 3994 }, { "epoch": 2.9418262150220915, "grad_norm": 0.3099748194217682, "learning_rate": 1.1453629682391943e-08, "loss": 0.3372, "step": 3995 }, { "epoch": 2.9425625920471283, "grad_norm": 0.32146158814430237, "learning_rate": 1.1165606884234182e-08, "loss": 0.3711, "step": 3996 }, { "epoch": 2.943298969072165, "grad_norm": 0.29835745692253113, "learning_rate": 1.088124773529764e-08, "loss": 0.3736, "step": 3997 }, { "epoch": 2.944035346097202, "grad_norm": 0.32554247975349426, "learning_rate": 1.0600552444406387e-08, "loss": 0.3506, "step": 3998 }, { "epoch": 2.944771723122239, "grad_norm": 0.3305593430995941, "learning_rate": 1.032352121769553e-08, "loss": 0.3846, "step": 3999 }, { "epoch": 2.9455081001472756, "grad_norm": 0.3391202688217163, "learning_rate": 1.0050154258607336e-08, "loss": 0.3827, "step": 4000 }, { "epoch": 2.9462444771723124, "grad_norm": 0.3027651309967041, "learning_rate": 9.780451767895104e-09, "loss": 0.358, "step": 4001 }, { "epoch": 2.9469808541973492, "grad_norm": 0.3510773181915283, "learning_rate": 9.514413943619849e-09, "loss": 0.3837, "step": 4002 }, { "epoch": 2.947717231222386, "grad_norm": 0.30630064010620117, "learning_rate": 9.252040981151956e-09, "loss": 0.3691, "step": 4003 }, { "epoch": 2.948453608247423, "grad_norm": 0.31158211827278137, "learning_rate": 8.993333073169519e-09, "loss": 0.3981, "step": 4004 }, { "epoch": 2.9491899852724597, "grad_norm": 0.31192636489868164, "learning_rate": 8.738290409660566e-09, "loss": 0.3855, "step": 4005 }, { "epoch": 2.9499263622974965, "grad_norm": 0.33482906222343445, "learning_rate": 8.486913177920275e-09, "loss": 0.3726, "step": 4006 }, { "epoch": 2.9506627393225333, "grad_norm": 0.32285335659980774, "learning_rate": 8.239201562553201e-09, "loss": 0.3276, "step": 4007 }, { "epoch": 2.95139911634757, "grad_norm": 0.31974491477012634, "learning_rate": 7.99515574546994e-09, "loss": 0.4194, "step": 4008 }, { "epoch": 2.952135493372607, "grad_norm": 0.33110713958740234, "learning_rate": 7.754775905891576e-09, "loss": 0.3572, "step": 4009 }, { "epoch": 2.9528718703976438, "grad_norm": 0.3111792802810669, "learning_rate": 7.518062220345235e-09, "loss": 0.3705, "step": 4010 }, { "epoch": 2.9536082474226806, "grad_norm": 0.29877275228500366, "learning_rate": 7.285014862666862e-09, "loss": 0.3592, "step": 4011 }, { "epoch": 2.9543446244477174, "grad_norm": 0.3104023039340973, "learning_rate": 7.055634003998446e-09, "loss": 0.3524, "step": 4012 }, { "epoch": 2.955081001472754, "grad_norm": 0.3165355920791626, "learning_rate": 6.829919812790797e-09, "loss": 0.3818, "step": 4013 }, { "epoch": 2.955817378497791, "grad_norm": 0.32550013065338135, "learning_rate": 6.607872454801878e-09, "loss": 0.3589, "step": 4014 }, { "epoch": 2.956553755522828, "grad_norm": 0.3074639141559601, "learning_rate": 6.38949209309625e-09, "loss": 0.3638, "step": 4015 }, { "epoch": 2.9572901325478647, "grad_norm": 0.316501259803772, "learning_rate": 6.174778888046184e-09, "loss": 0.3775, "step": 4016 }, { "epoch": 2.9580265095729015, "grad_norm": 0.33413442969322205, "learning_rate": 5.963732997329996e-09, "loss": 0.3577, "step": 4017 }, { "epoch": 2.9587628865979383, "grad_norm": 0.3284623622894287, "learning_rate": 5.756354575934265e-09, "loss": 0.3684, "step": 4018 }, { "epoch": 2.959499263622975, "grad_norm": 0.32929477095603943, "learning_rate": 5.552643776150501e-09, "loss": 0.3902, "step": 4019 }, { "epoch": 2.960235640648012, "grad_norm": 0.3296683728694916, "learning_rate": 5.352600747577929e-09, "loss": 0.3649, "step": 4020 }, { "epoch": 2.9609720176730487, "grad_norm": 0.32667848467826843, "learning_rate": 5.1562256371229245e-09, "loss": 0.3842, "step": 4021 }, { "epoch": 2.9617083946980856, "grad_norm": 0.33496248722076416, "learning_rate": 4.9635185889967966e-09, "loss": 0.3862, "step": 4022 }, { "epoch": 2.9624447717231224, "grad_norm": 0.2943912744522095, "learning_rate": 4.774479744717453e-09, "loss": 0.3649, "step": 4023 }, { "epoch": 2.963181148748159, "grad_norm": 0.3493749499320984, "learning_rate": 4.589109243109957e-09, "loss": 0.3731, "step": 4024 }, { "epoch": 2.963917525773196, "grad_norm": 0.35220539569854736, "learning_rate": 4.4074072203048605e-09, "loss": 0.387, "step": 4025 }, { "epoch": 2.964653902798233, "grad_norm": 0.3216170370578766, "learning_rate": 4.2293738097376465e-09, "loss": 0.403, "step": 4026 }, { "epoch": 2.9653902798232696, "grad_norm": 0.3366626799106598, "learning_rate": 4.055009142152066e-09, "loss": 0.3539, "step": 4027 }, { "epoch": 2.9661266568483065, "grad_norm": 0.3413197994232178, "learning_rate": 3.884313345595137e-09, "loss": 0.3747, "step": 4028 }, { "epoch": 2.9668630338733433, "grad_norm": 0.3060031831264496, "learning_rate": 3.7172865454210282e-09, "loss": 0.3525, "step": 4029 }, { "epoch": 2.96759941089838, "grad_norm": 0.3200681507587433, "learning_rate": 3.553928864289402e-09, "loss": 0.3714, "step": 4030 }, { "epoch": 2.968335787923417, "grad_norm": 0.3404831886291504, "learning_rate": 3.394240422164852e-09, "loss": 0.3466, "step": 4031 }, { "epoch": 2.9690721649484537, "grad_norm": 0.3348984122276306, "learning_rate": 3.238221336318015e-09, "loss": 0.3704, "step": 4032 }, { "epoch": 2.9698085419734905, "grad_norm": 0.3363956809043884, "learning_rate": 3.0858717213250176e-09, "loss": 0.3904, "step": 4033 }, { "epoch": 2.9705449189985274, "grad_norm": 0.3241819441318512, "learning_rate": 2.9371916890658105e-09, "loss": 0.3679, "step": 4034 }, { "epoch": 2.971281296023564, "grad_norm": 0.34104812145233154, "learning_rate": 2.792181348726941e-09, "loss": 0.3592, "step": 4035 }, { "epoch": 2.972017673048601, "grad_norm": 0.3131769597530365, "learning_rate": 2.6508408067998926e-09, "loss": 0.3541, "step": 4036 }, { "epoch": 2.972754050073638, "grad_norm": 0.3056182265281677, "learning_rate": 2.5131701670805252e-09, "loss": 0.3641, "step": 4037 }, { "epoch": 2.9734904270986746, "grad_norm": 0.3001095652580261, "learning_rate": 2.379169530670744e-09, "loss": 0.3596, "step": 4038 }, { "epoch": 2.9742268041237114, "grad_norm": 0.33647263050079346, "learning_rate": 2.2488389959751666e-09, "loss": 0.3457, "step": 4039 }, { "epoch": 2.9749631811487482, "grad_norm": 0.31432968378067017, "learning_rate": 2.12217865870612e-09, "loss": 0.3862, "step": 4040 }, { "epoch": 2.975699558173785, "grad_norm": 0.3569965362548828, "learning_rate": 1.999188611878089e-09, "loss": 0.3556, "step": 4041 }, { "epoch": 2.976435935198822, "grad_norm": 0.31356170773506165, "learning_rate": 1.8798689458116025e-09, "loss": 0.3892, "step": 4042 }, { "epoch": 2.9771723122238587, "grad_norm": 0.31526947021484375, "learning_rate": 1.7642197481315682e-09, "loss": 0.3534, "step": 4043 }, { "epoch": 2.9779086892488955, "grad_norm": 0.29720044136047363, "learning_rate": 1.6522411037667162e-09, "loss": 0.3851, "step": 4044 }, { "epoch": 2.9786450662739323, "grad_norm": 0.2992621064186096, "learning_rate": 1.5439330949518216e-09, "loss": 0.3677, "step": 4045 }, { "epoch": 2.979381443298969, "grad_norm": 0.3420858681201935, "learning_rate": 1.4392958012238167e-09, "loss": 0.3943, "step": 4046 }, { "epoch": 2.980117820324006, "grad_norm": 0.3347738981246948, "learning_rate": 1.338329299425678e-09, "loss": 0.3472, "step": 4047 }, { "epoch": 2.9808541973490428, "grad_norm": 0.3168458640575409, "learning_rate": 1.2410336637047604e-09, "loss": 0.3886, "step": 4048 }, { "epoch": 2.9815905743740796, "grad_norm": 0.3121738135814667, "learning_rate": 1.147408965511132e-09, "loss": 0.3921, "step": 4049 }, { "epoch": 2.9823269513991164, "grad_norm": 0.30733251571655273, "learning_rate": 1.0574552735997945e-09, "loss": 0.3648, "step": 4050 }, { "epoch": 2.9830633284241532, "grad_norm": 0.32336708903312683, "learning_rate": 9.711726540312383e-10, "loss": 0.3551, "step": 4051 }, { "epoch": 2.98379970544919, "grad_norm": 0.3032035529613495, "learning_rate": 8.885611701675567e-10, "loss": 0.35, "step": 4052 }, { "epoch": 2.984536082474227, "grad_norm": 0.33436474204063416, "learning_rate": 8.09620882676887e-10, "loss": 0.3755, "step": 4053 }, { "epoch": 2.9852724594992637, "grad_norm": 0.3163788318634033, "learning_rate": 7.343518495300794e-10, "loss": 0.372, "step": 4054 }, { "epoch": 2.9860088365243005, "grad_norm": 0.31104227900505066, "learning_rate": 6.62754126002918e-10, "loss": 0.3509, "step": 4055 }, { "epoch": 2.9867452135493373, "grad_norm": 0.3354119062423706, "learning_rate": 5.948277646744549e-10, "loss": 0.3447, "step": 4056 }, { "epoch": 2.987481590574374, "grad_norm": 0.32740065455436707, "learning_rate": 5.305728154275658e-10, "loss": 0.3559, "step": 4057 }, { "epoch": 2.988217967599411, "grad_norm": 0.32162371277809143, "learning_rate": 4.699893254495047e-10, "loss": 0.3998, "step": 4058 }, { "epoch": 2.9889543446244478, "grad_norm": 0.3500117063522339, "learning_rate": 4.1307733923079407e-10, "loss": 0.3904, "step": 4059 }, { "epoch": 2.9896907216494846, "grad_norm": 0.34684303402900696, "learning_rate": 3.5983689856522453e-10, "loss": 0.3776, "step": 4060 }, { "epoch": 2.9904270986745214, "grad_norm": 0.31505316495895386, "learning_rate": 3.1026804255207544e-10, "loss": 0.3636, "step": 4061 }, { "epoch": 2.991163475699558, "grad_norm": 0.3074565529823303, "learning_rate": 2.643708075922291e-10, "loss": 0.3477, "step": 4062 }, { "epoch": 2.991899852724595, "grad_norm": 0.3139492869377136, "learning_rate": 2.2214522739205657e-10, "loss": 0.3694, "step": 4063 }, { "epoch": 2.992636229749632, "grad_norm": 0.33408764004707336, "learning_rate": 1.835913329600869e-10, "loss": 0.4098, "step": 4064 }, { "epoch": 2.9933726067746687, "grad_norm": 0.33919352293014526, "learning_rate": 1.487091526097828e-10, "loss": 0.3638, "step": 4065 }, { "epoch": 2.9941089837997055, "grad_norm": 0.30452626943588257, "learning_rate": 1.174987119573201e-10, "loss": 0.356, "step": 4066 }, { "epoch": 2.9948453608247423, "grad_norm": 0.32732394337654114, "learning_rate": 8.996003392214292e-11, "loss": 0.374, "step": 4067 }, { "epoch": 2.995581737849779, "grad_norm": 0.2973012626171112, "learning_rate": 6.609313872862899e-11, "loss": 0.3513, "step": 4068 }, { "epoch": 2.996318114874816, "grad_norm": 0.30825671553611755, "learning_rate": 4.5898043903314096e-11, "loss": 0.3687, "step": 4069 }, { "epoch": 2.9970544918998527, "grad_norm": 0.3054065406322479, "learning_rate": 2.93747642771125e-11, "loss": 0.3703, "step": 4070 }, { "epoch": 2.9977908689248896, "grad_norm": 0.3413209617137909, "learning_rate": 1.6523311984206757e-11, "loss": 0.3665, "step": 4071 }, { "epoch": 2.9985272459499264, "grad_norm": 0.3274330496788025, "learning_rate": 7.343696462047689e-12, "loss": 0.3562, "step": 4072 }, { "epoch": 2.999263622974963, "grad_norm": 0.31068921089172363, "learning_rate": 1.8359244524646103e-12, "loss": 0.3785, "step": 4073 }, { "epoch": 3.0, "grad_norm": 0.31857830286026, "learning_rate": 0.0, "loss": 0.344, "step": 4074 }, { "epoch": 3.0, "step": 4074, "total_flos": 4823744474644480.0, "train_loss": 0.42331435592460115, "train_runtime": 136035.2407, "train_samples_per_second": 2.875, "train_steps_per_second": 0.03 } ], "logging_steps": 1.0, "max_steps": 4074, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4823744474644480.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }