diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,28560 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 4074, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007363770250368188, + "grad_norm": 5.638559341430664, + "learning_rate": 2.450980392156863e-08, + "loss": 0.8206, + "step": 1 + }, + { + "epoch": 0.0014727540500736377, + "grad_norm": 5.574512004852295, + "learning_rate": 4.901960784313726e-08, + "loss": 0.8179, + "step": 2 + }, + { + "epoch": 0.0022091310751104565, + "grad_norm": 5.833616733551025, + "learning_rate": 7.352941176470589e-08, + "loss": 0.8298, + "step": 3 + }, + { + "epoch": 0.0029455081001472753, + "grad_norm": 5.760316848754883, + "learning_rate": 9.803921568627452e-08, + "loss": 0.8612, + "step": 4 + }, + { + "epoch": 0.003681885125184094, + "grad_norm": 5.582140922546387, + "learning_rate": 1.2254901960784314e-07, + "loss": 0.8597, + "step": 5 + }, + { + "epoch": 0.004418262150220913, + "grad_norm": 5.870736122131348, + "learning_rate": 1.4705882352941178e-07, + "loss": 0.8647, + "step": 6 + }, + { + "epoch": 0.005154639175257732, + "grad_norm": 5.794524192810059, + "learning_rate": 1.7156862745098042e-07, + "loss": 0.8709, + "step": 7 + }, + { + "epoch": 0.005891016200294551, + "grad_norm": 5.833931922912598, + "learning_rate": 1.9607843137254904e-07, + "loss": 0.8438, + "step": 8 + }, + { + "epoch": 0.0066273932253313695, + "grad_norm": 5.879095554351807, + "learning_rate": 2.2058823529411768e-07, + "loss": 0.8583, + "step": 9 + }, + { + "epoch": 0.007363770250368188, + "grad_norm": 5.551036357879639, + "learning_rate": 2.4509803921568627e-07, + "loss": 0.8273, + "step": 10 + }, + { + "epoch": 0.008100147275405008, + "grad_norm": 5.67064094543457, + "learning_rate": 2.696078431372549e-07, + "loss": 0.8328, + "step": 11 + }, + { + "epoch": 0.008836524300441826, + "grad_norm": 5.897095203399658, + "learning_rate": 2.9411764705882356e-07, + "loss": 0.8742, + "step": 12 + }, + { + "epoch": 0.009572901325478646, + "grad_norm": 5.685695648193359, + "learning_rate": 3.1862745098039215e-07, + "loss": 0.8389, + "step": 13 + }, + { + "epoch": 0.010309278350515464, + "grad_norm": 5.742497444152832, + "learning_rate": 3.4313725490196084e-07, + "loss": 0.8695, + "step": 14 + }, + { + "epoch": 0.011045655375552283, + "grad_norm": 5.5533037185668945, + "learning_rate": 3.6764705882352943e-07, + "loss": 0.8515, + "step": 15 + }, + { + "epoch": 0.011782032400589101, + "grad_norm": 5.345178604125977, + "learning_rate": 3.921568627450981e-07, + "loss": 0.816, + "step": 16 + }, + { + "epoch": 0.012518409425625921, + "grad_norm": 5.715511322021484, + "learning_rate": 4.1666666666666667e-07, + "loss": 0.9075, + "step": 17 + }, + { + "epoch": 0.013254786450662739, + "grad_norm": 5.173557758331299, + "learning_rate": 4.4117647058823536e-07, + "loss": 0.8311, + "step": 18 + }, + { + "epoch": 0.013991163475699559, + "grad_norm": 5.542694568634033, + "learning_rate": 4.6568627450980395e-07, + "loss": 0.8385, + "step": 19 + }, + { + "epoch": 0.014727540500736377, + "grad_norm": 4.2676191329956055, + "learning_rate": 4.901960784313725e-07, + "loss": 0.8088, + "step": 20 + }, + { + "epoch": 0.015463917525773196, + "grad_norm": 4.362669944763184, + "learning_rate": 5.147058823529412e-07, + "loss": 0.8291, + "step": 21 + }, + { + "epoch": 0.016200294550810016, + "grad_norm": 4.3511643409729, + "learning_rate": 5.392156862745098e-07, + "loss": 0.8043, + "step": 22 + }, + { + "epoch": 0.016936671575846832, + "grad_norm": 4.051642894744873, + "learning_rate": 5.637254901960785e-07, + "loss": 0.744, + "step": 23 + }, + { + "epoch": 0.017673048600883652, + "grad_norm": 4.032516002655029, + "learning_rate": 5.882352941176471e-07, + "loss": 0.7975, + "step": 24 + }, + { + "epoch": 0.018409425625920472, + "grad_norm": 3.7897393703460693, + "learning_rate": 6.127450980392157e-07, + "loss": 0.7686, + "step": 25 + }, + { + "epoch": 0.01914580265095729, + "grad_norm": 3.99767804145813, + "learning_rate": 6.372549019607843e-07, + "loss": 0.777, + "step": 26 + }, + { + "epoch": 0.019882179675994108, + "grad_norm": 2.312422037124634, + "learning_rate": 6.61764705882353e-07, + "loss": 0.7323, + "step": 27 + }, + { + "epoch": 0.020618556701030927, + "grad_norm": 2.436676025390625, + "learning_rate": 6.862745098039217e-07, + "loss": 0.7402, + "step": 28 + }, + { + "epoch": 0.021354933726067747, + "grad_norm": 2.346937417984009, + "learning_rate": 7.107843137254903e-07, + "loss": 0.7634, + "step": 29 + }, + { + "epoch": 0.022091310751104567, + "grad_norm": 2.329108476638794, + "learning_rate": 7.352941176470589e-07, + "loss": 0.8077, + "step": 30 + }, + { + "epoch": 0.022827687776141383, + "grad_norm": 2.1621270179748535, + "learning_rate": 7.598039215686275e-07, + "loss": 0.7406, + "step": 31 + }, + { + "epoch": 0.023564064801178203, + "grad_norm": 2.004992723464966, + "learning_rate": 7.843137254901962e-07, + "loss": 0.7373, + "step": 32 + }, + { + "epoch": 0.024300441826215022, + "grad_norm": 1.9387824535369873, + "learning_rate": 8.088235294117648e-07, + "loss": 0.6983, + "step": 33 + }, + { + "epoch": 0.025036818851251842, + "grad_norm": 1.8947597742080688, + "learning_rate": 8.333333333333333e-07, + "loss": 0.7385, + "step": 34 + }, + { + "epoch": 0.02577319587628866, + "grad_norm": 1.8148812055587769, + "learning_rate": 8.57843137254902e-07, + "loss": 0.7547, + "step": 35 + }, + { + "epoch": 0.026509572901325478, + "grad_norm": 1.3978095054626465, + "learning_rate": 8.823529411764707e-07, + "loss": 0.721, + "step": 36 + }, + { + "epoch": 0.027245949926362298, + "grad_norm": 1.5461397171020508, + "learning_rate": 9.068627450980393e-07, + "loss": 0.703, + "step": 37 + }, + { + "epoch": 0.027982326951399118, + "grad_norm": 1.797391414642334, + "learning_rate": 9.313725490196079e-07, + "loss": 0.6854, + "step": 38 + }, + { + "epoch": 0.028718703976435934, + "grad_norm": 2.147284746170044, + "learning_rate": 9.558823529411764e-07, + "loss": 0.736, + "step": 39 + }, + { + "epoch": 0.029455081001472753, + "grad_norm": 2.267225980758667, + "learning_rate": 9.80392156862745e-07, + "loss": 0.7235, + "step": 40 + }, + { + "epoch": 0.030191458026509573, + "grad_norm": 2.0823991298675537, + "learning_rate": 1.0049019607843138e-06, + "loss": 0.7262, + "step": 41 + }, + { + "epoch": 0.030927835051546393, + "grad_norm": 2.126589775085449, + "learning_rate": 1.0294117647058825e-06, + "loss": 0.7298, + "step": 42 + }, + { + "epoch": 0.03166421207658321, + "grad_norm": 1.8871612548828125, + "learning_rate": 1.0539215686274512e-06, + "loss": 0.7012, + "step": 43 + }, + { + "epoch": 0.03240058910162003, + "grad_norm": 1.6312057971954346, + "learning_rate": 1.0784313725490197e-06, + "loss": 0.6583, + "step": 44 + }, + { + "epoch": 0.03313696612665685, + "grad_norm": 1.5015575885772705, + "learning_rate": 1.1029411764705884e-06, + "loss": 0.6593, + "step": 45 + }, + { + "epoch": 0.033873343151693665, + "grad_norm": 1.2276663780212402, + "learning_rate": 1.127450980392157e-06, + "loss": 0.6918, + "step": 46 + }, + { + "epoch": 0.03460972017673049, + "grad_norm": 1.0622628927230835, + "learning_rate": 1.1519607843137255e-06, + "loss": 0.6918, + "step": 47 + }, + { + "epoch": 0.035346097201767304, + "grad_norm": 0.9096408486366272, + "learning_rate": 1.1764705882352942e-06, + "loss": 0.6847, + "step": 48 + }, + { + "epoch": 0.03608247422680412, + "grad_norm": 0.8563209772109985, + "learning_rate": 1.200980392156863e-06, + "loss": 0.6646, + "step": 49 + }, + { + "epoch": 0.036818851251840944, + "grad_norm": 0.9862013459205627, + "learning_rate": 1.2254901960784314e-06, + "loss": 0.6732, + "step": 50 + }, + { + "epoch": 0.03755522827687776, + "grad_norm": 1.099507212638855, + "learning_rate": 1.25e-06, + "loss": 0.6882, + "step": 51 + }, + { + "epoch": 0.03829160530191458, + "grad_norm": 1.1145635843276978, + "learning_rate": 1.2745098039215686e-06, + "loss": 0.6658, + "step": 52 + }, + { + "epoch": 0.0390279823269514, + "grad_norm": 1.0199302434921265, + "learning_rate": 1.2990196078431375e-06, + "loss": 0.6439, + "step": 53 + }, + { + "epoch": 0.039764359351988215, + "grad_norm": 0.8398745059967041, + "learning_rate": 1.323529411764706e-06, + "loss": 0.641, + "step": 54 + }, + { + "epoch": 0.04050073637702504, + "grad_norm": 0.7679091095924377, + "learning_rate": 1.3480392156862745e-06, + "loss": 0.6067, + "step": 55 + }, + { + "epoch": 0.041237113402061855, + "grad_norm": 0.7991195321083069, + "learning_rate": 1.3725490196078434e-06, + "loss": 0.677, + "step": 56 + }, + { + "epoch": 0.04197349042709867, + "grad_norm": 0.7389347553253174, + "learning_rate": 1.3970588235294119e-06, + "loss": 0.6504, + "step": 57 + }, + { + "epoch": 0.042709867452135494, + "grad_norm": 0.7039929628372192, + "learning_rate": 1.4215686274509805e-06, + "loss": 0.6002, + "step": 58 + }, + { + "epoch": 0.04344624447717231, + "grad_norm": 0.7065069675445557, + "learning_rate": 1.4460784313725492e-06, + "loss": 0.581, + "step": 59 + }, + { + "epoch": 0.044182621502209134, + "grad_norm": 0.6952792406082153, + "learning_rate": 1.4705882352941177e-06, + "loss": 0.6523, + "step": 60 + }, + { + "epoch": 0.04491899852724595, + "grad_norm": 0.6492939591407776, + "learning_rate": 1.4950980392156864e-06, + "loss": 0.6587, + "step": 61 + }, + { + "epoch": 0.045655375552282766, + "grad_norm": 0.6436308026313782, + "learning_rate": 1.519607843137255e-06, + "loss": 0.6048, + "step": 62 + }, + { + "epoch": 0.04639175257731959, + "grad_norm": 0.6987003087997437, + "learning_rate": 1.5441176470588238e-06, + "loss": 0.6353, + "step": 63 + }, + { + "epoch": 0.047128129602356406, + "grad_norm": 0.698427677154541, + "learning_rate": 1.5686274509803923e-06, + "loss": 0.63, + "step": 64 + }, + { + "epoch": 0.04786450662739323, + "grad_norm": 0.7047044634819031, + "learning_rate": 1.5931372549019608e-06, + "loss": 0.6048, + "step": 65 + }, + { + "epoch": 0.048600883652430045, + "grad_norm": 0.6246572732925415, + "learning_rate": 1.6176470588235297e-06, + "loss": 0.6264, + "step": 66 + }, + { + "epoch": 0.04933726067746686, + "grad_norm": 0.6060186624526978, + "learning_rate": 1.6421568627450982e-06, + "loss": 0.6277, + "step": 67 + }, + { + "epoch": 0.050073637702503684, + "grad_norm": 0.5113458037376404, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.5985, + "step": 68 + }, + { + "epoch": 0.0508100147275405, + "grad_norm": 0.6172787547111511, + "learning_rate": 1.6911764705882356e-06, + "loss": 0.622, + "step": 69 + }, + { + "epoch": 0.05154639175257732, + "grad_norm": 0.4991741180419922, + "learning_rate": 1.715686274509804e-06, + "loss": 0.6048, + "step": 70 + }, + { + "epoch": 0.05228276877761414, + "grad_norm": 0.5184860825538635, + "learning_rate": 1.7401960784313725e-06, + "loss": 0.6033, + "step": 71 + }, + { + "epoch": 0.053019145802650956, + "grad_norm": 0.4985435903072357, + "learning_rate": 1.7647058823529414e-06, + "loss": 0.5792, + "step": 72 + }, + { + "epoch": 0.05375552282768778, + "grad_norm": 0.5244366526603699, + "learning_rate": 1.78921568627451e-06, + "loss": 0.6168, + "step": 73 + }, + { + "epoch": 0.054491899852724596, + "grad_norm": 0.6064889430999756, + "learning_rate": 1.8137254901960786e-06, + "loss": 0.6034, + "step": 74 + }, + { + "epoch": 0.05522827687776141, + "grad_norm": 0.5819352269172668, + "learning_rate": 1.8382352941176473e-06, + "loss": 0.5942, + "step": 75 + }, + { + "epoch": 0.055964653902798235, + "grad_norm": 0.47975030541419983, + "learning_rate": 1.8627450980392158e-06, + "loss": 0.5817, + "step": 76 + }, + { + "epoch": 0.05670103092783505, + "grad_norm": 0.47679486870765686, + "learning_rate": 1.8872549019607845e-06, + "loss": 0.5789, + "step": 77 + }, + { + "epoch": 0.05743740795287187, + "grad_norm": 0.44839027523994446, + "learning_rate": 1.9117647058823528e-06, + "loss": 0.595, + "step": 78 + }, + { + "epoch": 0.05817378497790869, + "grad_norm": 0.43815600872039795, + "learning_rate": 1.9362745098039217e-06, + "loss": 0.5648, + "step": 79 + }, + { + "epoch": 0.05891016200294551, + "grad_norm": 0.457387775182724, + "learning_rate": 1.96078431372549e-06, + "loss": 0.6067, + "step": 80 + }, + { + "epoch": 0.05964653902798233, + "grad_norm": 0.4818171560764313, + "learning_rate": 1.985294117647059e-06, + "loss": 0.61, + "step": 81 + }, + { + "epoch": 0.060382916053019146, + "grad_norm": 0.4079191982746124, + "learning_rate": 2.0098039215686276e-06, + "loss": 0.5663, + "step": 82 + }, + { + "epoch": 0.06111929307805596, + "grad_norm": 0.4442485272884369, + "learning_rate": 2.034313725490196e-06, + "loss": 0.5761, + "step": 83 + }, + { + "epoch": 0.061855670103092786, + "grad_norm": 0.4076555073261261, + "learning_rate": 2.058823529411765e-06, + "loss": 0.5667, + "step": 84 + }, + { + "epoch": 0.0625920471281296, + "grad_norm": 0.4406167268753052, + "learning_rate": 2.0833333333333334e-06, + "loss": 0.586, + "step": 85 + }, + { + "epoch": 0.06332842415316642, + "grad_norm": 0.4781548082828522, + "learning_rate": 2.1078431372549023e-06, + "loss": 0.5387, + "step": 86 + }, + { + "epoch": 0.06406480117820323, + "grad_norm": 0.4489077925682068, + "learning_rate": 2.132352941176471e-06, + "loss": 0.5346, + "step": 87 + }, + { + "epoch": 0.06480117820324006, + "grad_norm": 0.47296687960624695, + "learning_rate": 2.1568627450980393e-06, + "loss": 0.611, + "step": 88 + }, + { + "epoch": 0.06553755522827688, + "grad_norm": 0.4460233151912689, + "learning_rate": 2.1813725490196082e-06, + "loss": 0.5806, + "step": 89 + }, + { + "epoch": 0.0662739322533137, + "grad_norm": 0.45212772488594055, + "learning_rate": 2.2058823529411767e-06, + "loss": 0.5761, + "step": 90 + }, + { + "epoch": 0.06701030927835051, + "grad_norm": 0.4867306053638458, + "learning_rate": 2.2303921568627456e-06, + "loss": 0.555, + "step": 91 + }, + { + "epoch": 0.06774668630338733, + "grad_norm": 0.44260451197624207, + "learning_rate": 2.254901960784314e-06, + "loss": 0.5333, + "step": 92 + }, + { + "epoch": 0.06848306332842416, + "grad_norm": 0.4509826898574829, + "learning_rate": 2.2794117647058826e-06, + "loss": 0.5817, + "step": 93 + }, + { + "epoch": 0.06921944035346098, + "grad_norm": 0.422654926776886, + "learning_rate": 2.303921568627451e-06, + "loss": 0.5797, + "step": 94 + }, + { + "epoch": 0.06995581737849779, + "grad_norm": 0.40386420488357544, + "learning_rate": 2.32843137254902e-06, + "loss": 0.5972, + "step": 95 + }, + { + "epoch": 0.07069219440353461, + "grad_norm": 0.4382508099079132, + "learning_rate": 2.3529411764705885e-06, + "loss": 0.588, + "step": 96 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 0.4391435980796814, + "learning_rate": 2.377450980392157e-06, + "loss": 0.5434, + "step": 97 + }, + { + "epoch": 0.07216494845360824, + "grad_norm": 0.39785510301589966, + "learning_rate": 2.401960784313726e-06, + "loss": 0.55, + "step": 98 + }, + { + "epoch": 0.07290132547864507, + "grad_norm": 0.4807189404964447, + "learning_rate": 2.4264705882352943e-06, + "loss": 0.5642, + "step": 99 + }, + { + "epoch": 0.07363770250368189, + "grad_norm": 0.40981948375701904, + "learning_rate": 2.450980392156863e-06, + "loss": 0.5257, + "step": 100 + }, + { + "epoch": 0.0743740795287187, + "grad_norm": 0.4581270217895508, + "learning_rate": 2.4754901960784317e-06, + "loss": 0.5676, + "step": 101 + }, + { + "epoch": 0.07511045655375552, + "grad_norm": 0.4791598320007324, + "learning_rate": 2.5e-06, + "loss": 0.5589, + "step": 102 + }, + { + "epoch": 0.07584683357879234, + "grad_norm": 0.4747304916381836, + "learning_rate": 2.5245098039215687e-06, + "loss": 0.573, + "step": 103 + }, + { + "epoch": 0.07658321060382917, + "grad_norm": 0.501209557056427, + "learning_rate": 2.549019607843137e-06, + "loss": 0.5354, + "step": 104 + }, + { + "epoch": 0.07731958762886598, + "grad_norm": 0.42233791947364807, + "learning_rate": 2.5735294117647057e-06, + "loss": 0.5822, + "step": 105 + }, + { + "epoch": 0.0780559646539028, + "grad_norm": 0.44912800192832947, + "learning_rate": 2.598039215686275e-06, + "loss": 0.5279, + "step": 106 + }, + { + "epoch": 0.07879234167893961, + "grad_norm": 0.47332027554512024, + "learning_rate": 2.6225490196078435e-06, + "loss": 0.5641, + "step": 107 + }, + { + "epoch": 0.07952871870397643, + "grad_norm": 0.44078969955444336, + "learning_rate": 2.647058823529412e-06, + "loss": 0.555, + "step": 108 + }, + { + "epoch": 0.08026509572901326, + "grad_norm": 0.4458158016204834, + "learning_rate": 2.6715686274509804e-06, + "loss": 0.5706, + "step": 109 + }, + { + "epoch": 0.08100147275405008, + "grad_norm": 0.4494592249393463, + "learning_rate": 2.696078431372549e-06, + "loss": 0.5742, + "step": 110 + }, + { + "epoch": 0.0817378497790869, + "grad_norm": 0.47519755363464355, + "learning_rate": 2.720588235294118e-06, + "loss": 0.5553, + "step": 111 + }, + { + "epoch": 0.08247422680412371, + "grad_norm": 0.4567236006259918, + "learning_rate": 2.7450980392156867e-06, + "loss": 0.5677, + "step": 112 + }, + { + "epoch": 0.08321060382916053, + "grad_norm": 0.45444926619529724, + "learning_rate": 2.7696078431372552e-06, + "loss": 0.537, + "step": 113 + }, + { + "epoch": 0.08394698085419734, + "grad_norm": 0.5289820432662964, + "learning_rate": 2.7941176470588237e-06, + "loss": 0.5701, + "step": 114 + }, + { + "epoch": 0.08468335787923417, + "grad_norm": 0.5139638781547546, + "learning_rate": 2.818627450980392e-06, + "loss": 0.5588, + "step": 115 + }, + { + "epoch": 0.08541973490427099, + "grad_norm": 0.42048802971839905, + "learning_rate": 2.843137254901961e-06, + "loss": 0.5342, + "step": 116 + }, + { + "epoch": 0.0861561119293078, + "grad_norm": 0.43646562099456787, + "learning_rate": 2.8676470588235296e-06, + "loss": 0.5506, + "step": 117 + }, + { + "epoch": 0.08689248895434462, + "grad_norm": 0.4417460560798645, + "learning_rate": 2.8921568627450985e-06, + "loss": 0.5493, + "step": 118 + }, + { + "epoch": 0.08762886597938144, + "grad_norm": 0.46676769852638245, + "learning_rate": 2.916666666666667e-06, + "loss": 0.5831, + "step": 119 + }, + { + "epoch": 0.08836524300441827, + "grad_norm": 0.45995548367500305, + "learning_rate": 2.9411764705882355e-06, + "loss": 0.5807, + "step": 120 + }, + { + "epoch": 0.08910162002945508, + "grad_norm": 0.42616021633148193, + "learning_rate": 2.9656862745098044e-06, + "loss": 0.5562, + "step": 121 + }, + { + "epoch": 0.0898379970544919, + "grad_norm": 0.419491708278656, + "learning_rate": 2.990196078431373e-06, + "loss": 0.5285, + "step": 122 + }, + { + "epoch": 0.09057437407952872, + "grad_norm": 0.4764864146709442, + "learning_rate": 3.0147058823529413e-06, + "loss": 0.5761, + "step": 123 + }, + { + "epoch": 0.09131075110456553, + "grad_norm": 0.4339980185031891, + "learning_rate": 3.03921568627451e-06, + "loss": 0.5707, + "step": 124 + }, + { + "epoch": 0.09204712812960236, + "grad_norm": 0.41494664549827576, + "learning_rate": 3.0637254901960787e-06, + "loss": 0.5294, + "step": 125 + }, + { + "epoch": 0.09278350515463918, + "grad_norm": 0.43163448572158813, + "learning_rate": 3.0882352941176476e-06, + "loss": 0.5539, + "step": 126 + }, + { + "epoch": 0.093519882179676, + "grad_norm": 0.42778855562210083, + "learning_rate": 3.112745098039216e-06, + "loss": 0.5625, + "step": 127 + }, + { + "epoch": 0.09425625920471281, + "grad_norm": 0.4978928565979004, + "learning_rate": 3.1372549019607846e-06, + "loss": 0.5107, + "step": 128 + }, + { + "epoch": 0.09499263622974963, + "grad_norm": 0.4967725872993469, + "learning_rate": 3.161764705882353e-06, + "loss": 0.5434, + "step": 129 + }, + { + "epoch": 0.09572901325478646, + "grad_norm": 0.4463275372982025, + "learning_rate": 3.1862745098039216e-06, + "loss": 0.553, + "step": 130 + }, + { + "epoch": 0.09646539027982327, + "grad_norm": 0.5262526869773865, + "learning_rate": 3.210784313725491e-06, + "loss": 0.5464, + "step": 131 + }, + { + "epoch": 0.09720176730486009, + "grad_norm": 0.4471772015094757, + "learning_rate": 3.2352941176470594e-06, + "loss": 0.5089, + "step": 132 + }, + { + "epoch": 0.0979381443298969, + "grad_norm": 0.41978949308395386, + "learning_rate": 3.259803921568628e-06, + "loss": 0.4965, + "step": 133 + }, + { + "epoch": 0.09867452135493372, + "grad_norm": 0.4757583439350128, + "learning_rate": 3.2843137254901964e-06, + "loss": 0.5509, + "step": 134 + }, + { + "epoch": 0.09941089837997054, + "grad_norm": 0.4464501738548279, + "learning_rate": 3.308823529411765e-06, + "loss": 0.5032, + "step": 135 + }, + { + "epoch": 0.10014727540500737, + "grad_norm": 0.46477508544921875, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.5429, + "step": 136 + }, + { + "epoch": 0.10088365243004419, + "grad_norm": 0.426889032125473, + "learning_rate": 3.357843137254902e-06, + "loss": 0.5572, + "step": 137 + }, + { + "epoch": 0.101620029455081, + "grad_norm": 0.42896464467048645, + "learning_rate": 3.382352941176471e-06, + "loss": 0.541, + "step": 138 + }, + { + "epoch": 0.10235640648011782, + "grad_norm": 0.4436034560203552, + "learning_rate": 3.4068627450980396e-06, + "loss": 0.5648, + "step": 139 + }, + { + "epoch": 0.10309278350515463, + "grad_norm": 0.4158652722835541, + "learning_rate": 3.431372549019608e-06, + "loss": 0.5607, + "step": 140 + }, + { + "epoch": 0.10382916053019146, + "grad_norm": 0.4582897424697876, + "learning_rate": 3.4558823529411766e-06, + "loss": 0.5745, + "step": 141 + }, + { + "epoch": 0.10456553755522828, + "grad_norm": 0.4275548756122589, + "learning_rate": 3.480392156862745e-06, + "loss": 0.5147, + "step": 142 + }, + { + "epoch": 0.1053019145802651, + "grad_norm": 0.4628273546695709, + "learning_rate": 3.504901960784314e-06, + "loss": 0.5305, + "step": 143 + }, + { + "epoch": 0.10603829160530191, + "grad_norm": 0.4495219588279724, + "learning_rate": 3.529411764705883e-06, + "loss": 0.563, + "step": 144 + }, + { + "epoch": 0.10677466863033873, + "grad_norm": 0.48380184173583984, + "learning_rate": 3.5539215686274514e-06, + "loss": 0.562, + "step": 145 + }, + { + "epoch": 0.10751104565537556, + "grad_norm": 0.4028421938419342, + "learning_rate": 3.57843137254902e-06, + "loss": 0.5137, + "step": 146 + }, + { + "epoch": 0.10824742268041238, + "grad_norm": 0.4970363974571228, + "learning_rate": 3.6029411764705883e-06, + "loss": 0.5268, + "step": 147 + }, + { + "epoch": 0.10898379970544919, + "grad_norm": 0.46763232350349426, + "learning_rate": 3.6274509803921573e-06, + "loss": 0.542, + "step": 148 + }, + { + "epoch": 0.10972017673048601, + "grad_norm": 0.457398921251297, + "learning_rate": 3.6519607843137257e-06, + "loss": 0.5289, + "step": 149 + }, + { + "epoch": 0.11045655375552282, + "grad_norm": 0.44986864924430847, + "learning_rate": 3.6764705882352946e-06, + "loss": 0.549, + "step": 150 + }, + { + "epoch": 0.11119293078055964, + "grad_norm": 0.46221205592155457, + "learning_rate": 3.700980392156863e-06, + "loss": 0.5005, + "step": 151 + }, + { + "epoch": 0.11192930780559647, + "grad_norm": 0.5250900983810425, + "learning_rate": 3.7254901960784316e-06, + "loss": 0.5468, + "step": 152 + }, + { + "epoch": 0.11266568483063329, + "grad_norm": 0.434471994638443, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.5307, + "step": 153 + }, + { + "epoch": 0.1134020618556701, + "grad_norm": 0.5031168460845947, + "learning_rate": 3.774509803921569e-06, + "loss": 0.5383, + "step": 154 + }, + { + "epoch": 0.11413843888070692, + "grad_norm": 0.4508562684059143, + "learning_rate": 3.7990196078431375e-06, + "loss": 0.5359, + "step": 155 + }, + { + "epoch": 0.11487481590574374, + "grad_norm": 0.4205038845539093, + "learning_rate": 3.8235294117647055e-06, + "loss": 0.5315, + "step": 156 + }, + { + "epoch": 0.11561119293078057, + "grad_norm": 0.40232402086257935, + "learning_rate": 3.848039215686275e-06, + "loss": 0.5075, + "step": 157 + }, + { + "epoch": 0.11634756995581738, + "grad_norm": 0.45643943548202515, + "learning_rate": 3.872549019607843e-06, + "loss": 0.5481, + "step": 158 + }, + { + "epoch": 0.1170839469808542, + "grad_norm": 0.4849976599216461, + "learning_rate": 3.897058823529412e-06, + "loss": 0.5068, + "step": 159 + }, + { + "epoch": 0.11782032400589101, + "grad_norm": 0.4546898901462555, + "learning_rate": 3.92156862745098e-06, + "loss": 0.5489, + "step": 160 + }, + { + "epoch": 0.11855670103092783, + "grad_norm": 0.4682084023952484, + "learning_rate": 3.946078431372549e-06, + "loss": 0.5448, + "step": 161 + }, + { + "epoch": 0.11929307805596466, + "grad_norm": 0.4213881492614746, + "learning_rate": 3.970588235294118e-06, + "loss": 0.4813, + "step": 162 + }, + { + "epoch": 0.12002945508100148, + "grad_norm": 0.4240805208683014, + "learning_rate": 3.995098039215687e-06, + "loss": 0.5164, + "step": 163 + }, + { + "epoch": 0.12076583210603829, + "grad_norm": 0.5331303477287292, + "learning_rate": 4.019607843137255e-06, + "loss": 0.5201, + "step": 164 + }, + { + "epoch": 0.12150220913107511, + "grad_norm": 0.4699327349662781, + "learning_rate": 4.044117647058824e-06, + "loss": 0.527, + "step": 165 + }, + { + "epoch": 0.12223858615611193, + "grad_norm": 0.44539308547973633, + "learning_rate": 4.068627450980392e-06, + "loss": 0.4908, + "step": 166 + }, + { + "epoch": 0.12297496318114874, + "grad_norm": 0.4506961703300476, + "learning_rate": 4.093137254901961e-06, + "loss": 0.5314, + "step": 167 + }, + { + "epoch": 0.12371134020618557, + "grad_norm": 0.44389674067497253, + "learning_rate": 4.11764705882353e-06, + "loss": 0.506, + "step": 168 + }, + { + "epoch": 0.12444771723122239, + "grad_norm": 0.445320725440979, + "learning_rate": 4.142156862745099e-06, + "loss": 0.5236, + "step": 169 + }, + { + "epoch": 0.1251840942562592, + "grad_norm": 0.44341355562210083, + "learning_rate": 4.166666666666667e-06, + "loss": 0.571, + "step": 170 + }, + { + "epoch": 0.12592047128129602, + "grad_norm": 0.42870596051216125, + "learning_rate": 4.191176470588236e-06, + "loss": 0.5204, + "step": 171 + }, + { + "epoch": 0.12665684830633284, + "grad_norm": 0.4404330849647522, + "learning_rate": 4.215686274509805e-06, + "loss": 0.5623, + "step": 172 + }, + { + "epoch": 0.12739322533136965, + "grad_norm": 0.542121946811676, + "learning_rate": 4.240196078431373e-06, + "loss": 0.5301, + "step": 173 + }, + { + "epoch": 0.12812960235640647, + "grad_norm": 0.5006332397460938, + "learning_rate": 4.264705882352942e-06, + "loss": 0.5237, + "step": 174 + }, + { + "epoch": 0.12886597938144329, + "grad_norm": 0.4504947066307068, + "learning_rate": 4.28921568627451e-06, + "loss": 0.5052, + "step": 175 + }, + { + "epoch": 0.12960235640648013, + "grad_norm": 0.4694218635559082, + "learning_rate": 4.313725490196079e-06, + "loss": 0.5012, + "step": 176 + }, + { + "epoch": 0.13033873343151695, + "grad_norm": 0.47194620966911316, + "learning_rate": 4.3382352941176475e-06, + "loss": 0.5049, + "step": 177 + }, + { + "epoch": 0.13107511045655376, + "grad_norm": 0.5346331596374512, + "learning_rate": 4.3627450980392164e-06, + "loss": 0.5118, + "step": 178 + }, + { + "epoch": 0.13181148748159058, + "grad_norm": 0.5172557234764099, + "learning_rate": 4.3872549019607845e-06, + "loss": 0.4994, + "step": 179 + }, + { + "epoch": 0.1325478645066274, + "grad_norm": 0.4675532579421997, + "learning_rate": 4.411764705882353e-06, + "loss": 0.5519, + "step": 180 + }, + { + "epoch": 0.1332842415316642, + "grad_norm": 0.4985281527042389, + "learning_rate": 4.4362745098039215e-06, + "loss": 0.5058, + "step": 181 + }, + { + "epoch": 0.13402061855670103, + "grad_norm": 0.46521127223968506, + "learning_rate": 4.460784313725491e-06, + "loss": 0.5559, + "step": 182 + }, + { + "epoch": 0.13475699558173784, + "grad_norm": 0.48757219314575195, + "learning_rate": 4.485294117647059e-06, + "loss": 0.5249, + "step": 183 + }, + { + "epoch": 0.13549337260677466, + "grad_norm": 0.5091550946235657, + "learning_rate": 4.509803921568628e-06, + "loss": 0.5223, + "step": 184 + }, + { + "epoch": 0.13622974963181148, + "grad_norm": 0.4934118688106537, + "learning_rate": 4.534313725490196e-06, + "loss": 0.5419, + "step": 185 + }, + { + "epoch": 0.13696612665684832, + "grad_norm": 0.5085634589195251, + "learning_rate": 4.558823529411765e-06, + "loss": 0.516, + "step": 186 + }, + { + "epoch": 0.13770250368188514, + "grad_norm": 0.46844273805618286, + "learning_rate": 4.583333333333333e-06, + "loss": 0.5093, + "step": 187 + }, + { + "epoch": 0.13843888070692195, + "grad_norm": 0.45088204741477966, + "learning_rate": 4.607843137254902e-06, + "loss": 0.5236, + "step": 188 + }, + { + "epoch": 0.13917525773195877, + "grad_norm": 0.4446581304073334, + "learning_rate": 4.632352941176471e-06, + "loss": 0.5117, + "step": 189 + }, + { + "epoch": 0.13991163475699558, + "grad_norm": 0.47169244289398193, + "learning_rate": 4.65686274509804e-06, + "loss": 0.5264, + "step": 190 + }, + { + "epoch": 0.1406480117820324, + "grad_norm": 0.4336180090904236, + "learning_rate": 4.681372549019608e-06, + "loss": 0.527, + "step": 191 + }, + { + "epoch": 0.14138438880706922, + "grad_norm": 0.4745936989784241, + "learning_rate": 4.705882352941177e-06, + "loss": 0.5235, + "step": 192 + }, + { + "epoch": 0.14212076583210603, + "grad_norm": 0.49792900681495667, + "learning_rate": 4.730392156862745e-06, + "loss": 0.5006, + "step": 193 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.503589391708374, + "learning_rate": 4.754901960784314e-06, + "loss": 0.5314, + "step": 194 + }, + { + "epoch": 0.14359351988217967, + "grad_norm": 0.4816850423812866, + "learning_rate": 4.779411764705883e-06, + "loss": 0.5176, + "step": 195 + }, + { + "epoch": 0.14432989690721648, + "grad_norm": 0.5058243870735168, + "learning_rate": 4.803921568627452e-06, + "loss": 0.4917, + "step": 196 + }, + { + "epoch": 0.14506627393225333, + "grad_norm": 0.4948558509349823, + "learning_rate": 4.82843137254902e-06, + "loss": 0.5227, + "step": 197 + }, + { + "epoch": 0.14580265095729014, + "grad_norm": 0.43871647119522095, + "learning_rate": 4.852941176470589e-06, + "loss": 0.5413, + "step": 198 + }, + { + "epoch": 0.14653902798232696, + "grad_norm": 0.45166996121406555, + "learning_rate": 4.8774509803921576e-06, + "loss": 0.5645, + "step": 199 + }, + { + "epoch": 0.14727540500736377, + "grad_norm": 0.5076332688331604, + "learning_rate": 4.901960784313726e-06, + "loss": 0.5222, + "step": 200 + }, + { + "epoch": 0.1480117820324006, + "grad_norm": 0.46355342864990234, + "learning_rate": 4.9264705882352945e-06, + "loss": 0.4985, + "step": 201 + }, + { + "epoch": 0.1487481590574374, + "grad_norm": 0.473640114068985, + "learning_rate": 4.9509803921568634e-06, + "loss": 0.5339, + "step": 202 + }, + { + "epoch": 0.14948453608247422, + "grad_norm": 0.43335700035095215, + "learning_rate": 4.9754901960784315e-06, + "loss": 0.5236, + "step": 203 + }, + { + "epoch": 0.15022091310751104, + "grad_norm": 0.5285197496414185, + "learning_rate": 5e-06, + "loss": 0.5102, + "step": 204 + }, + { + "epoch": 0.15095729013254786, + "grad_norm": 0.4338807463645935, + "learning_rate": 5.024509803921569e-06, + "loss": 0.5243, + "step": 205 + }, + { + "epoch": 0.15169366715758467, + "grad_norm": 0.5603867769241333, + "learning_rate": 5.049019607843137e-06, + "loss": 0.5269, + "step": 206 + }, + { + "epoch": 0.15243004418262152, + "grad_norm": 0.547127366065979, + "learning_rate": 5.073529411764706e-06, + "loss": 0.5376, + "step": 207 + }, + { + "epoch": 0.15316642120765833, + "grad_norm": 0.44914114475250244, + "learning_rate": 5.098039215686274e-06, + "loss": 0.4983, + "step": 208 + }, + { + "epoch": 0.15390279823269515, + "grad_norm": 0.48467424511909485, + "learning_rate": 5.122549019607843e-06, + "loss": 0.4931, + "step": 209 + }, + { + "epoch": 0.15463917525773196, + "grad_norm": 0.445891797542572, + "learning_rate": 5.147058823529411e-06, + "loss": 0.5086, + "step": 210 + }, + { + "epoch": 0.15537555228276878, + "grad_norm": 0.44430914521217346, + "learning_rate": 5.171568627450981e-06, + "loss": 0.5023, + "step": 211 + }, + { + "epoch": 0.1561119293078056, + "grad_norm": 0.5020100474357605, + "learning_rate": 5.19607843137255e-06, + "loss": 0.492, + "step": 212 + }, + { + "epoch": 0.1568483063328424, + "grad_norm": 0.4869686961174011, + "learning_rate": 5.220588235294118e-06, + "loss": 0.4876, + "step": 213 + }, + { + "epoch": 0.15758468335787923, + "grad_norm": 0.48111408948898315, + "learning_rate": 5.245098039215687e-06, + "loss": 0.5117, + "step": 214 + }, + { + "epoch": 0.15832106038291605, + "grad_norm": 0.5187819004058838, + "learning_rate": 5.269607843137256e-06, + "loss": 0.5188, + "step": 215 + }, + { + "epoch": 0.15905743740795286, + "grad_norm": 0.510890007019043, + "learning_rate": 5.294117647058824e-06, + "loss": 0.545, + "step": 216 + }, + { + "epoch": 0.15979381443298968, + "grad_norm": 0.5489839315414429, + "learning_rate": 5.318627450980393e-06, + "loss": 0.508, + "step": 217 + }, + { + "epoch": 0.16053019145802652, + "grad_norm": 0.4811173677444458, + "learning_rate": 5.343137254901961e-06, + "loss": 0.4955, + "step": 218 + }, + { + "epoch": 0.16126656848306334, + "grad_norm": 0.49445685744285583, + "learning_rate": 5.36764705882353e-06, + "loss": 0.4678, + "step": 219 + }, + { + "epoch": 0.16200294550810015, + "grad_norm": 0.44502392411231995, + "learning_rate": 5.392156862745098e-06, + "loss": 0.5166, + "step": 220 + }, + { + "epoch": 0.16273932253313697, + "grad_norm": 0.46069005131721497, + "learning_rate": 5.416666666666667e-06, + "loss": 0.5279, + "step": 221 + }, + { + "epoch": 0.1634756995581738, + "grad_norm": 0.4491000175476074, + "learning_rate": 5.441176470588236e-06, + "loss": 0.5093, + "step": 222 + }, + { + "epoch": 0.1642120765832106, + "grad_norm": 0.5266180634498596, + "learning_rate": 5.465686274509804e-06, + "loss": 0.5221, + "step": 223 + }, + { + "epoch": 0.16494845360824742, + "grad_norm": 0.48982885479927063, + "learning_rate": 5.4901960784313735e-06, + "loss": 0.4804, + "step": 224 + }, + { + "epoch": 0.16568483063328424, + "grad_norm": 0.46362483501434326, + "learning_rate": 5.514705882352942e-06, + "loss": 0.524, + "step": 225 + }, + { + "epoch": 0.16642120765832105, + "grad_norm": 0.480934202671051, + "learning_rate": 5.5392156862745104e-06, + "loss": 0.5063, + "step": 226 + }, + { + "epoch": 0.16715758468335787, + "grad_norm": 0.4418233633041382, + "learning_rate": 5.563725490196079e-06, + "loss": 0.485, + "step": 227 + }, + { + "epoch": 0.16789396170839468, + "grad_norm": 0.4766086935997009, + "learning_rate": 5.588235294117647e-06, + "loss": 0.4859, + "step": 228 + }, + { + "epoch": 0.16863033873343153, + "grad_norm": 0.5201296806335449, + "learning_rate": 5.612745098039216e-06, + "loss": 0.5128, + "step": 229 + }, + { + "epoch": 0.16936671575846834, + "grad_norm": 0.4526103734970093, + "learning_rate": 5.637254901960784e-06, + "loss": 0.5115, + "step": 230 + }, + { + "epoch": 0.17010309278350516, + "grad_norm": 0.4606441557407379, + "learning_rate": 5.661764705882353e-06, + "loss": 0.5074, + "step": 231 + }, + { + "epoch": 0.17083946980854198, + "grad_norm": 0.4658742845058441, + "learning_rate": 5.686274509803922e-06, + "loss": 0.5212, + "step": 232 + }, + { + "epoch": 0.1715758468335788, + "grad_norm": 0.45116207003593445, + "learning_rate": 5.71078431372549e-06, + "loss": 0.5178, + "step": 233 + }, + { + "epoch": 0.1723122238586156, + "grad_norm": 0.40379467606544495, + "learning_rate": 5.735294117647059e-06, + "loss": 0.4907, + "step": 234 + }, + { + "epoch": 0.17304860088365243, + "grad_norm": 0.461535781621933, + "learning_rate": 5.759803921568627e-06, + "loss": 0.4812, + "step": 235 + }, + { + "epoch": 0.17378497790868924, + "grad_norm": 0.40763628482818604, + "learning_rate": 5.784313725490197e-06, + "loss": 0.4861, + "step": 236 + }, + { + "epoch": 0.17452135493372606, + "grad_norm": 0.4869963228702545, + "learning_rate": 5.808823529411766e-06, + "loss": 0.5126, + "step": 237 + }, + { + "epoch": 0.17525773195876287, + "grad_norm": 0.43191322684288025, + "learning_rate": 5.833333333333334e-06, + "loss": 0.5194, + "step": 238 + }, + { + "epoch": 0.17599410898379972, + "grad_norm": 0.4565574824810028, + "learning_rate": 5.857843137254903e-06, + "loss": 0.5108, + "step": 239 + }, + { + "epoch": 0.17673048600883653, + "grad_norm": 0.4109247326850891, + "learning_rate": 5.882352941176471e-06, + "loss": 0.4885, + "step": 240 + }, + { + "epoch": 0.17746686303387335, + "grad_norm": 0.5282112956047058, + "learning_rate": 5.90686274509804e-06, + "loss": 0.538, + "step": 241 + }, + { + "epoch": 0.17820324005891017, + "grad_norm": 0.4757918417453766, + "learning_rate": 5.931372549019609e-06, + "loss": 0.5143, + "step": 242 + }, + { + "epoch": 0.17893961708394698, + "grad_norm": 0.4788837134838104, + "learning_rate": 5.955882352941177e-06, + "loss": 0.4996, + "step": 243 + }, + { + "epoch": 0.1796759941089838, + "grad_norm": 0.4603605568408966, + "learning_rate": 5.980392156862746e-06, + "loss": 0.4856, + "step": 244 + }, + { + "epoch": 0.18041237113402062, + "grad_norm": 0.48925524950027466, + "learning_rate": 6.004901960784314e-06, + "loss": 0.5062, + "step": 245 + }, + { + "epoch": 0.18114874815905743, + "grad_norm": 0.4555787742137909, + "learning_rate": 6.029411764705883e-06, + "loss": 0.4869, + "step": 246 + }, + { + "epoch": 0.18188512518409425, + "grad_norm": 0.4337728023529053, + "learning_rate": 6.053921568627451e-06, + "loss": 0.5095, + "step": 247 + }, + { + "epoch": 0.18262150220913106, + "grad_norm": 0.4596711993217468, + "learning_rate": 6.07843137254902e-06, + "loss": 0.5334, + "step": 248 + }, + { + "epoch": 0.18335787923416788, + "grad_norm": 0.55954509973526, + "learning_rate": 6.102941176470589e-06, + "loss": 0.542, + "step": 249 + }, + { + "epoch": 0.18409425625920472, + "grad_norm": 0.4391202926635742, + "learning_rate": 6.1274509803921575e-06, + "loss": 0.5085, + "step": 250 + }, + { + "epoch": 0.18483063328424154, + "grad_norm": 0.5165490508079529, + "learning_rate": 6.151960784313726e-06, + "loss": 0.5267, + "step": 251 + }, + { + "epoch": 0.18556701030927836, + "grad_norm": 0.5765572786331177, + "learning_rate": 6.176470588235295e-06, + "loss": 0.5302, + "step": 252 + }, + { + "epoch": 0.18630338733431517, + "grad_norm": 0.4425363540649414, + "learning_rate": 6.200980392156863e-06, + "loss": 0.5294, + "step": 253 + }, + { + "epoch": 0.187039764359352, + "grad_norm": 0.5258336663246155, + "learning_rate": 6.225490196078432e-06, + "loss": 0.4794, + "step": 254 + }, + { + "epoch": 0.1877761413843888, + "grad_norm": 0.4908381998538971, + "learning_rate": 6.25e-06, + "loss": 0.4942, + "step": 255 + }, + { + "epoch": 0.18851251840942562, + "grad_norm": 0.480133980512619, + "learning_rate": 6.274509803921569e-06, + "loss": 0.4991, + "step": 256 + }, + { + "epoch": 0.18924889543446244, + "grad_norm": 0.48387065529823303, + "learning_rate": 6.299019607843137e-06, + "loss": 0.513, + "step": 257 + }, + { + "epoch": 0.18998527245949925, + "grad_norm": 0.5518890619277954, + "learning_rate": 6.323529411764706e-06, + "loss": 0.522, + "step": 258 + }, + { + "epoch": 0.19072164948453607, + "grad_norm": 0.5575649738311768, + "learning_rate": 6.348039215686275e-06, + "loss": 0.5046, + "step": 259 + }, + { + "epoch": 0.19145802650957292, + "grad_norm": 0.5497487187385559, + "learning_rate": 6.372549019607843e-06, + "loss": 0.517, + "step": 260 + }, + { + "epoch": 0.19219440353460973, + "grad_norm": 0.613700807094574, + "learning_rate": 6.397058823529412e-06, + "loss": 0.4874, + "step": 261 + }, + { + "epoch": 0.19293078055964655, + "grad_norm": 0.5236343741416931, + "learning_rate": 6.421568627450982e-06, + "loss": 0.4959, + "step": 262 + }, + { + "epoch": 0.19366715758468336, + "grad_norm": 0.5656614303588867, + "learning_rate": 6.44607843137255e-06, + "loss": 0.5192, + "step": 263 + }, + { + "epoch": 0.19440353460972018, + "grad_norm": 0.4702565371990204, + "learning_rate": 6.470588235294119e-06, + "loss": 0.4974, + "step": 264 + }, + { + "epoch": 0.195139911634757, + "grad_norm": 0.5624386668205261, + "learning_rate": 6.495098039215687e-06, + "loss": 0.5131, + "step": 265 + }, + { + "epoch": 0.1958762886597938, + "grad_norm": 0.5100853443145752, + "learning_rate": 6.519607843137256e-06, + "loss": 0.4945, + "step": 266 + }, + { + "epoch": 0.19661266568483063, + "grad_norm": 0.48825833201408386, + "learning_rate": 6.544117647058824e-06, + "loss": 0.4909, + "step": 267 + }, + { + "epoch": 0.19734904270986744, + "grad_norm": 0.4945085942745209, + "learning_rate": 6.568627450980393e-06, + "loss": 0.5235, + "step": 268 + }, + { + "epoch": 0.19808541973490426, + "grad_norm": 0.46311086416244507, + "learning_rate": 6.593137254901962e-06, + "loss": 0.4975, + "step": 269 + }, + { + "epoch": 0.19882179675994108, + "grad_norm": 0.4879513680934906, + "learning_rate": 6.61764705882353e-06, + "loss": 0.494, + "step": 270 + }, + { + "epoch": 0.19955817378497792, + "grad_norm": 0.43951448798179626, + "learning_rate": 6.642156862745099e-06, + "loss": 0.5052, + "step": 271 + }, + { + "epoch": 0.20029455081001474, + "grad_norm": 0.4860183596611023, + "learning_rate": 6.666666666666667e-06, + "loss": 0.506, + "step": 272 + }, + { + "epoch": 0.20103092783505155, + "grad_norm": 0.5143676400184631, + "learning_rate": 6.6911764705882356e-06, + "loss": 0.5197, + "step": 273 + }, + { + "epoch": 0.20176730486008837, + "grad_norm": 0.4551714062690735, + "learning_rate": 6.715686274509804e-06, + "loss": 0.522, + "step": 274 + }, + { + "epoch": 0.2025036818851252, + "grad_norm": 0.49370667338371277, + "learning_rate": 6.740196078431373e-06, + "loss": 0.4651, + "step": 275 + }, + { + "epoch": 0.203240058910162, + "grad_norm": 0.42197269201278687, + "learning_rate": 6.764705882352942e-06, + "loss": 0.4675, + "step": 276 + }, + { + "epoch": 0.20397643593519882, + "grad_norm": 0.48403897881507874, + "learning_rate": 6.78921568627451e-06, + "loss": 0.4831, + "step": 277 + }, + { + "epoch": 0.20471281296023564, + "grad_norm": 0.5002971291542053, + "learning_rate": 6.813725490196079e-06, + "loss": 0.502, + "step": 278 + }, + { + "epoch": 0.20544918998527245, + "grad_norm": 0.49652308225631714, + "learning_rate": 6.838235294117648e-06, + "loss": 0.5341, + "step": 279 + }, + { + "epoch": 0.20618556701030927, + "grad_norm": 0.4934190809726715, + "learning_rate": 6.862745098039216e-06, + "loss": 0.5091, + "step": 280 + }, + { + "epoch": 0.20692194403534608, + "grad_norm": 0.5390608310699463, + "learning_rate": 6.887254901960785e-06, + "loss": 0.4987, + "step": 281 + }, + { + "epoch": 0.20765832106038293, + "grad_norm": 0.5566352605819702, + "learning_rate": 6.911764705882353e-06, + "loss": 0.5024, + "step": 282 + }, + { + "epoch": 0.20839469808541974, + "grad_norm": 0.5296184420585632, + "learning_rate": 6.936274509803922e-06, + "loss": 0.5055, + "step": 283 + }, + { + "epoch": 0.20913107511045656, + "grad_norm": 0.4719228744506836, + "learning_rate": 6.96078431372549e-06, + "loss": 0.506, + "step": 284 + }, + { + "epoch": 0.20986745213549338, + "grad_norm": 0.49798110127449036, + "learning_rate": 6.985294117647059e-06, + "loss": 0.5311, + "step": 285 + }, + { + "epoch": 0.2106038291605302, + "grad_norm": 0.5432643890380859, + "learning_rate": 7.009803921568628e-06, + "loss": 0.4817, + "step": 286 + }, + { + "epoch": 0.211340206185567, + "grad_norm": 0.5015427470207214, + "learning_rate": 7.034313725490197e-06, + "loss": 0.4787, + "step": 287 + }, + { + "epoch": 0.21207658321060383, + "grad_norm": 0.4968355596065521, + "learning_rate": 7.058823529411766e-06, + "loss": 0.5051, + "step": 288 + }, + { + "epoch": 0.21281296023564064, + "grad_norm": 0.6130073666572571, + "learning_rate": 7.083333333333335e-06, + "loss": 0.4917, + "step": 289 + }, + { + "epoch": 0.21354933726067746, + "grad_norm": 0.4766775369644165, + "learning_rate": 7.107843137254903e-06, + "loss": 0.4849, + "step": 290 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 0.5826199650764465, + "learning_rate": 7.132352941176472e-06, + "loss": 0.4868, + "step": 291 + }, + { + "epoch": 0.21502209131075112, + "grad_norm": 0.43983742594718933, + "learning_rate": 7.15686274509804e-06, + "loss": 0.4949, + "step": 292 + }, + { + "epoch": 0.21575846833578793, + "grad_norm": 0.561597466468811, + "learning_rate": 7.181372549019609e-06, + "loss": 0.4796, + "step": 293 + }, + { + "epoch": 0.21649484536082475, + "grad_norm": 0.5399706363677979, + "learning_rate": 7.205882352941177e-06, + "loss": 0.4824, + "step": 294 + }, + { + "epoch": 0.21723122238586157, + "grad_norm": 0.45450007915496826, + "learning_rate": 7.230392156862746e-06, + "loss": 0.5083, + "step": 295 + }, + { + "epoch": 0.21796759941089838, + "grad_norm": 0.5827359557151794, + "learning_rate": 7.2549019607843145e-06, + "loss": 0.5013, + "step": 296 + }, + { + "epoch": 0.2187039764359352, + "grad_norm": 0.5125693678855896, + "learning_rate": 7.2794117647058826e-06, + "loss": 0.505, + "step": 297 + }, + { + "epoch": 0.21944035346097202, + "grad_norm": 0.5245058536529541, + "learning_rate": 7.3039215686274515e-06, + "loss": 0.4918, + "step": 298 + }, + { + "epoch": 0.22017673048600883, + "grad_norm": 0.5342023372650146, + "learning_rate": 7.3284313725490195e-06, + "loss": 0.4898, + "step": 299 + }, + { + "epoch": 0.22091310751104565, + "grad_norm": 0.4772011339664459, + "learning_rate": 7.352941176470589e-06, + "loss": 0.4958, + "step": 300 + }, + { + "epoch": 0.22164948453608246, + "grad_norm": 0.5259323716163635, + "learning_rate": 7.377450980392158e-06, + "loss": 0.4956, + "step": 301 + }, + { + "epoch": 0.22238586156111928, + "grad_norm": 0.5168774724006653, + "learning_rate": 7.401960784313726e-06, + "loss": 0.4735, + "step": 302 + }, + { + "epoch": 0.22312223858615612, + "grad_norm": 0.4500476121902466, + "learning_rate": 7.426470588235295e-06, + "loss": 0.5039, + "step": 303 + }, + { + "epoch": 0.22385861561119294, + "grad_norm": 0.49780377745628357, + "learning_rate": 7.450980392156863e-06, + "loss": 0.4629, + "step": 304 + }, + { + "epoch": 0.22459499263622976, + "grad_norm": 0.5209547281265259, + "learning_rate": 7.475490196078432e-06, + "loss": 0.4927, + "step": 305 + }, + { + "epoch": 0.22533136966126657, + "grad_norm": 0.6013898849487305, + "learning_rate": 7.500000000000001e-06, + "loss": 0.5053, + "step": 306 + }, + { + "epoch": 0.2260677466863034, + "grad_norm": 0.6150967478752136, + "learning_rate": 7.524509803921569e-06, + "loss": 0.4872, + "step": 307 + }, + { + "epoch": 0.2268041237113402, + "grad_norm": 0.48117902874946594, + "learning_rate": 7.549019607843138e-06, + "loss": 0.5077, + "step": 308 + }, + { + "epoch": 0.22754050073637702, + "grad_norm": 0.6058593392372131, + "learning_rate": 7.573529411764706e-06, + "loss": 0.4854, + "step": 309 + }, + { + "epoch": 0.22827687776141384, + "grad_norm": 0.5079674124717712, + "learning_rate": 7.598039215686275e-06, + "loss": 0.4977, + "step": 310 + }, + { + "epoch": 0.22901325478645065, + "grad_norm": 0.5175543427467346, + "learning_rate": 7.622549019607843e-06, + "loss": 0.5181, + "step": 311 + }, + { + "epoch": 0.22974963181148747, + "grad_norm": 0.5423833131790161, + "learning_rate": 7.647058823529411e-06, + "loss": 0.5364, + "step": 312 + }, + { + "epoch": 0.23048600883652431, + "grad_norm": 0.49087536334991455, + "learning_rate": 7.671568627450981e-06, + "loss": 0.4943, + "step": 313 + }, + { + "epoch": 0.23122238586156113, + "grad_norm": 0.5285847783088684, + "learning_rate": 7.69607843137255e-06, + "loss": 0.5098, + "step": 314 + }, + { + "epoch": 0.23195876288659795, + "grad_norm": 0.4606481194496155, + "learning_rate": 7.720588235294119e-06, + "loss": 0.477, + "step": 315 + }, + { + "epoch": 0.23269513991163476, + "grad_norm": 0.5595932006835938, + "learning_rate": 7.745098039215687e-06, + "loss": 0.5141, + "step": 316 + }, + { + "epoch": 0.23343151693667158, + "grad_norm": 0.4855089783668518, + "learning_rate": 7.769607843137256e-06, + "loss": 0.5224, + "step": 317 + }, + { + "epoch": 0.2341678939617084, + "grad_norm": 0.49227041006088257, + "learning_rate": 7.794117647058825e-06, + "loss": 0.4859, + "step": 318 + }, + { + "epoch": 0.2349042709867452, + "grad_norm": 0.46864691376686096, + "learning_rate": 7.818627450980393e-06, + "loss": 0.4907, + "step": 319 + }, + { + "epoch": 0.23564064801178203, + "grad_norm": 0.5469391942024231, + "learning_rate": 7.84313725490196e-06, + "loss": 0.4839, + "step": 320 + }, + { + "epoch": 0.23637702503681884, + "grad_norm": 0.5292018055915833, + "learning_rate": 7.86764705882353e-06, + "loss": 0.4977, + "step": 321 + }, + { + "epoch": 0.23711340206185566, + "grad_norm": 0.5337371230125427, + "learning_rate": 7.892156862745098e-06, + "loss": 0.4871, + "step": 322 + }, + { + "epoch": 0.23784977908689248, + "grad_norm": 0.5429027080535889, + "learning_rate": 7.916666666666667e-06, + "loss": 0.4995, + "step": 323 + }, + { + "epoch": 0.23858615611192932, + "grad_norm": 0.5223814249038696, + "learning_rate": 7.941176470588236e-06, + "loss": 0.5147, + "step": 324 + }, + { + "epoch": 0.23932253313696614, + "grad_norm": 0.4621415436267853, + "learning_rate": 7.965686274509804e-06, + "loss": 0.4943, + "step": 325 + }, + { + "epoch": 0.24005891016200295, + "grad_norm": 0.6054308414459229, + "learning_rate": 7.990196078431374e-06, + "loss": 0.535, + "step": 326 + }, + { + "epoch": 0.24079528718703977, + "grad_norm": 0.5081130266189575, + "learning_rate": 8.014705882352942e-06, + "loss": 0.4994, + "step": 327 + }, + { + "epoch": 0.24153166421207659, + "grad_norm": 0.5109507441520691, + "learning_rate": 8.03921568627451e-06, + "loss": 0.4765, + "step": 328 + }, + { + "epoch": 0.2422680412371134, + "grad_norm": 0.5108455419540405, + "learning_rate": 8.06372549019608e-06, + "loss": 0.4965, + "step": 329 + }, + { + "epoch": 0.24300441826215022, + "grad_norm": 0.4857231378555298, + "learning_rate": 8.088235294117648e-06, + "loss": 0.4663, + "step": 330 + }, + { + "epoch": 0.24374079528718703, + "grad_norm": 0.5149396061897278, + "learning_rate": 8.112745098039216e-06, + "loss": 0.517, + "step": 331 + }, + { + "epoch": 0.24447717231222385, + "grad_norm": 0.5667039752006531, + "learning_rate": 8.137254901960784e-06, + "loss": 0.5037, + "step": 332 + }, + { + "epoch": 0.24521354933726067, + "grad_norm": 0.5189935564994812, + "learning_rate": 8.161764705882354e-06, + "loss": 0.4786, + "step": 333 + }, + { + "epoch": 0.24594992636229748, + "grad_norm": 0.5272833108901978, + "learning_rate": 8.186274509803922e-06, + "loss": 0.4986, + "step": 334 + }, + { + "epoch": 0.24668630338733433, + "grad_norm": 0.4853070080280304, + "learning_rate": 8.21078431372549e-06, + "loss": 0.4876, + "step": 335 + }, + { + "epoch": 0.24742268041237114, + "grad_norm": 0.6110916137695312, + "learning_rate": 8.23529411764706e-06, + "loss": 0.5359, + "step": 336 + }, + { + "epoch": 0.24815905743740796, + "grad_norm": 0.6302662491798401, + "learning_rate": 8.259803921568628e-06, + "loss": 0.5306, + "step": 337 + }, + { + "epoch": 0.24889543446244478, + "grad_norm": 0.5661644339561462, + "learning_rate": 8.284313725490198e-06, + "loss": 0.4894, + "step": 338 + }, + { + "epoch": 0.2496318114874816, + "grad_norm": 0.5667375326156616, + "learning_rate": 8.308823529411766e-06, + "loss": 0.4911, + "step": 339 + }, + { + "epoch": 0.2503681885125184, + "grad_norm": 0.5805349349975586, + "learning_rate": 8.333333333333334e-06, + "loss": 0.499, + "step": 340 + }, + { + "epoch": 0.2511045655375552, + "grad_norm": 0.5153322219848633, + "learning_rate": 8.357843137254903e-06, + "loss": 0.4693, + "step": 341 + }, + { + "epoch": 0.25184094256259204, + "grad_norm": 0.5583307147026062, + "learning_rate": 8.382352941176472e-06, + "loss": 0.4913, + "step": 342 + }, + { + "epoch": 0.25257731958762886, + "grad_norm": 0.5932314395904541, + "learning_rate": 8.40686274509804e-06, + "loss": 0.5091, + "step": 343 + }, + { + "epoch": 0.2533136966126657, + "grad_norm": 0.4746854603290558, + "learning_rate": 8.43137254901961e-06, + "loss": 0.4552, + "step": 344 + }, + { + "epoch": 0.2540500736377025, + "grad_norm": 0.6550336480140686, + "learning_rate": 8.455882352941177e-06, + "loss": 0.4583, + "step": 345 + }, + { + "epoch": 0.2547864506627393, + "grad_norm": 0.44437041878700256, + "learning_rate": 8.480392156862745e-06, + "loss": 0.4817, + "step": 346 + }, + { + "epoch": 0.2555228276877761, + "grad_norm": 0.6586700677871704, + "learning_rate": 8.504901960784314e-06, + "loss": 0.4843, + "step": 347 + }, + { + "epoch": 0.25625920471281294, + "grad_norm": 0.5380249619483948, + "learning_rate": 8.529411764705883e-06, + "loss": 0.4783, + "step": 348 + }, + { + "epoch": 0.25699558173784975, + "grad_norm": 0.5162436962127686, + "learning_rate": 8.553921568627451e-06, + "loss": 0.4796, + "step": 349 + }, + { + "epoch": 0.25773195876288657, + "grad_norm": 0.5282143354415894, + "learning_rate": 8.57843137254902e-06, + "loss": 0.4583, + "step": 350 + }, + { + "epoch": 0.25846833578792344, + "grad_norm": 0.5736419558525085, + "learning_rate": 8.60294117647059e-06, + "loss": 0.4989, + "step": 351 + }, + { + "epoch": 0.25920471281296026, + "grad_norm": 0.47756871581077576, + "learning_rate": 8.627450980392157e-06, + "loss": 0.4766, + "step": 352 + }, + { + "epoch": 0.2599410898379971, + "grad_norm": 0.5326557159423828, + "learning_rate": 8.651960784313727e-06, + "loss": 0.4931, + "step": 353 + }, + { + "epoch": 0.2606774668630339, + "grad_norm": 0.5259348750114441, + "learning_rate": 8.676470588235295e-06, + "loss": 0.4863, + "step": 354 + }, + { + "epoch": 0.2614138438880707, + "grad_norm": 0.5482640862464905, + "learning_rate": 8.700980392156863e-06, + "loss": 0.5174, + "step": 355 + }, + { + "epoch": 0.2621502209131075, + "grad_norm": 0.5132611989974976, + "learning_rate": 8.725490196078433e-06, + "loss": 0.4894, + "step": 356 + }, + { + "epoch": 0.26288659793814434, + "grad_norm": 0.5965814590454102, + "learning_rate": 8.750000000000001e-06, + "loss": 0.4922, + "step": 357 + }, + { + "epoch": 0.26362297496318116, + "grad_norm": 0.5167778730392456, + "learning_rate": 8.774509803921569e-06, + "loss": 0.4737, + "step": 358 + }, + { + "epoch": 0.26435935198821797, + "grad_norm": 0.510610818862915, + "learning_rate": 8.799019607843137e-06, + "loss": 0.4877, + "step": 359 + }, + { + "epoch": 0.2650957290132548, + "grad_norm": 0.5722100734710693, + "learning_rate": 8.823529411764707e-06, + "loss": 0.4597, + "step": 360 + }, + { + "epoch": 0.2658321060382916, + "grad_norm": 0.662177562713623, + "learning_rate": 8.848039215686275e-06, + "loss": 0.5086, + "step": 361 + }, + { + "epoch": 0.2665684830633284, + "grad_norm": 0.499646931886673, + "learning_rate": 8.872549019607843e-06, + "loss": 0.4513, + "step": 362 + }, + { + "epoch": 0.26730486008836524, + "grad_norm": 0.5374613404273987, + "learning_rate": 8.897058823529413e-06, + "loss": 0.4636, + "step": 363 + }, + { + "epoch": 0.26804123711340205, + "grad_norm": 0.6908944845199585, + "learning_rate": 8.921568627450982e-06, + "loss": 0.4969, + "step": 364 + }, + { + "epoch": 0.26877761413843887, + "grad_norm": 0.4661368131637573, + "learning_rate": 8.94607843137255e-06, + "loss": 0.4593, + "step": 365 + }, + { + "epoch": 0.2695139911634757, + "grad_norm": 0.5653083920478821, + "learning_rate": 8.970588235294119e-06, + "loss": 0.4714, + "step": 366 + }, + { + "epoch": 0.2702503681885125, + "grad_norm": 0.46112319827079773, + "learning_rate": 8.995098039215687e-06, + "loss": 0.4775, + "step": 367 + }, + { + "epoch": 0.2709867452135493, + "grad_norm": 0.5458109378814697, + "learning_rate": 9.019607843137256e-06, + "loss": 0.472, + "step": 368 + }, + { + "epoch": 0.27172312223858613, + "grad_norm": 0.5330044031143188, + "learning_rate": 9.044117647058824e-06, + "loss": 0.4798, + "step": 369 + }, + { + "epoch": 0.27245949926362295, + "grad_norm": 0.5421646237373352, + "learning_rate": 9.068627450980392e-06, + "loss": 0.5013, + "step": 370 + }, + { + "epoch": 0.27319587628865977, + "grad_norm": 0.575491726398468, + "learning_rate": 9.093137254901962e-06, + "loss": 0.4939, + "step": 371 + }, + { + "epoch": 0.27393225331369664, + "grad_norm": 0.5331813097000122, + "learning_rate": 9.11764705882353e-06, + "loss": 0.514, + "step": 372 + }, + { + "epoch": 0.27466863033873345, + "grad_norm": 0.5526221990585327, + "learning_rate": 9.142156862745098e-06, + "loss": 0.5139, + "step": 373 + }, + { + "epoch": 0.27540500736377027, + "grad_norm": 0.5274295806884766, + "learning_rate": 9.166666666666666e-06, + "loss": 0.4895, + "step": 374 + }, + { + "epoch": 0.2761413843888071, + "grad_norm": 0.5230691432952881, + "learning_rate": 9.191176470588236e-06, + "loss": 0.4956, + "step": 375 + }, + { + "epoch": 0.2768777614138439, + "grad_norm": 0.6479937434196472, + "learning_rate": 9.215686274509804e-06, + "loss": 0.4768, + "step": 376 + }, + { + "epoch": 0.2776141384388807, + "grad_norm": 0.5493687987327576, + "learning_rate": 9.240196078431374e-06, + "loss": 0.4733, + "step": 377 + }, + { + "epoch": 0.27835051546391754, + "grad_norm": 0.5068442225456238, + "learning_rate": 9.264705882352942e-06, + "loss": 0.4696, + "step": 378 + }, + { + "epoch": 0.27908689248895435, + "grad_norm": 0.49230822920799255, + "learning_rate": 9.28921568627451e-06, + "loss": 0.4747, + "step": 379 + }, + { + "epoch": 0.27982326951399117, + "grad_norm": 0.5067638158798218, + "learning_rate": 9.31372549019608e-06, + "loss": 0.4598, + "step": 380 + }, + { + "epoch": 0.280559646539028, + "grad_norm": 0.5510022640228271, + "learning_rate": 9.338235294117648e-06, + "loss": 0.4844, + "step": 381 + }, + { + "epoch": 0.2812960235640648, + "grad_norm": 0.5649453401565552, + "learning_rate": 9.362745098039216e-06, + "loss": 0.497, + "step": 382 + }, + { + "epoch": 0.2820324005891016, + "grad_norm": 0.549223780632019, + "learning_rate": 9.387254901960786e-06, + "loss": 0.4801, + "step": 383 + }, + { + "epoch": 0.28276877761413843, + "grad_norm": 0.46957284212112427, + "learning_rate": 9.411764705882354e-06, + "loss": 0.4752, + "step": 384 + }, + { + "epoch": 0.28350515463917525, + "grad_norm": 0.4900481104850769, + "learning_rate": 9.436274509803922e-06, + "loss": 0.4712, + "step": 385 + }, + { + "epoch": 0.28424153166421207, + "grad_norm": 0.49515727162361145, + "learning_rate": 9.46078431372549e-06, + "loss": 0.4827, + "step": 386 + }, + { + "epoch": 0.2849779086892489, + "grad_norm": 0.44085896015167236, + "learning_rate": 9.48529411764706e-06, + "loss": 0.4651, + "step": 387 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.5362505912780762, + "learning_rate": 9.509803921568628e-06, + "loss": 0.4951, + "step": 388 + }, + { + "epoch": 0.2864506627393225, + "grad_norm": 0.593925952911377, + "learning_rate": 9.534313725490198e-06, + "loss": 0.471, + "step": 389 + }, + { + "epoch": 0.28718703976435933, + "grad_norm": 0.6149821281433105, + "learning_rate": 9.558823529411766e-06, + "loss": 0.5252, + "step": 390 + }, + { + "epoch": 0.28792341678939615, + "grad_norm": 0.484279066324234, + "learning_rate": 9.583333333333335e-06, + "loss": 0.4968, + "step": 391 + }, + { + "epoch": 0.28865979381443296, + "grad_norm": 0.59568190574646, + "learning_rate": 9.607843137254903e-06, + "loss": 0.4778, + "step": 392 + }, + { + "epoch": 0.28939617083946984, + "grad_norm": 0.587205708026886, + "learning_rate": 9.632352941176471e-06, + "loss": 0.5125, + "step": 393 + }, + { + "epoch": 0.29013254786450665, + "grad_norm": 0.5671679377555847, + "learning_rate": 9.65686274509804e-06, + "loss": 0.5075, + "step": 394 + }, + { + "epoch": 0.29086892488954347, + "grad_norm": 0.5437478423118591, + "learning_rate": 9.68137254901961e-06, + "loss": 0.4632, + "step": 395 + }, + { + "epoch": 0.2916053019145803, + "grad_norm": 0.517405092716217, + "learning_rate": 9.705882352941177e-06, + "loss": 0.4851, + "step": 396 + }, + { + "epoch": 0.2923416789396171, + "grad_norm": 0.4883304238319397, + "learning_rate": 9.730392156862745e-06, + "loss": 0.4827, + "step": 397 + }, + { + "epoch": 0.2930780559646539, + "grad_norm": 0.5211227536201477, + "learning_rate": 9.754901960784315e-06, + "loss": 0.5063, + "step": 398 + }, + { + "epoch": 0.29381443298969073, + "grad_norm": 0.4950783848762512, + "learning_rate": 9.779411764705883e-06, + "loss": 0.4588, + "step": 399 + }, + { + "epoch": 0.29455081001472755, + "grad_norm": 0.5107828378677368, + "learning_rate": 9.803921568627451e-06, + "loss": 0.4775, + "step": 400 + }, + { + "epoch": 0.29528718703976436, + "grad_norm": 0.5515358448028564, + "learning_rate": 9.82843137254902e-06, + "loss": 0.4581, + "step": 401 + }, + { + "epoch": 0.2960235640648012, + "grad_norm": 0.6434028744697571, + "learning_rate": 9.852941176470589e-06, + "loss": 0.5061, + "step": 402 + }, + { + "epoch": 0.296759941089838, + "grad_norm": 0.6046220064163208, + "learning_rate": 9.877450980392159e-06, + "loss": 0.5053, + "step": 403 + }, + { + "epoch": 0.2974963181148748, + "grad_norm": 0.5827841758728027, + "learning_rate": 9.901960784313727e-06, + "loss": 0.4594, + "step": 404 + }, + { + "epoch": 0.29823269513991163, + "grad_norm": 0.5246471762657166, + "learning_rate": 9.926470588235295e-06, + "loss": 0.4909, + "step": 405 + }, + { + "epoch": 0.29896907216494845, + "grad_norm": 0.5674916505813599, + "learning_rate": 9.950980392156863e-06, + "loss": 0.4717, + "step": 406 + }, + { + "epoch": 0.29970544918998526, + "grad_norm": 0.5033355951309204, + "learning_rate": 9.975490196078433e-06, + "loss": 0.5008, + "step": 407 + }, + { + "epoch": 0.3004418262150221, + "grad_norm": 0.5390115976333618, + "learning_rate": 1e-05, + "loss": 0.5002, + "step": 408 + }, + { + "epoch": 0.3011782032400589, + "grad_norm": 0.5051551461219788, + "learning_rate": 9.999998164075549e-06, + "loss": 0.4609, + "step": 409 + }, + { + "epoch": 0.3019145802650957, + "grad_norm": 0.5789138078689575, + "learning_rate": 9.999992656303539e-06, + "loss": 0.5152, + "step": 410 + }, + { + "epoch": 0.3026509572901325, + "grad_norm": 0.4873914420604706, + "learning_rate": 9.999983476688016e-06, + "loss": 0.4836, + "step": 411 + }, + { + "epoch": 0.30338733431516934, + "grad_norm": 0.5575588345527649, + "learning_rate": 9.999970625235724e-06, + "loss": 0.472, + "step": 412 + }, + { + "epoch": 0.30412371134020616, + "grad_norm": 0.5004022121429443, + "learning_rate": 9.999954101956097e-06, + "loss": 0.5051, + "step": 413 + }, + { + "epoch": 0.30486008836524303, + "grad_norm": 0.5742695927619934, + "learning_rate": 9.999933906861272e-06, + "loss": 0.4613, + "step": 414 + }, + { + "epoch": 0.30559646539027985, + "grad_norm": 0.5978338122367859, + "learning_rate": 9.999910039966079e-06, + "loss": 0.4931, + "step": 415 + }, + { + "epoch": 0.30633284241531666, + "grad_norm": 0.5001004338264465, + "learning_rate": 9.999882501288043e-06, + "loss": 0.475, + "step": 416 + }, + { + "epoch": 0.3070692194403535, + "grad_norm": 0.608790397644043, + "learning_rate": 9.99985129084739e-06, + "loss": 0.4743, + "step": 417 + }, + { + "epoch": 0.3078055964653903, + "grad_norm": 0.5592244863510132, + "learning_rate": 9.99981640866704e-06, + "loss": 0.4767, + "step": 418 + }, + { + "epoch": 0.3085419734904271, + "grad_norm": 0.6118118166923523, + "learning_rate": 9.999777854772608e-06, + "loss": 0.5155, + "step": 419 + }, + { + "epoch": 0.30927835051546393, + "grad_norm": 0.691983699798584, + "learning_rate": 9.999735629192408e-06, + "loss": 0.4758, + "step": 420 + }, + { + "epoch": 0.31001472754050075, + "grad_norm": 0.5400111675262451, + "learning_rate": 9.99968973195745e-06, + "loss": 0.4831, + "step": 421 + }, + { + "epoch": 0.31075110456553756, + "grad_norm": 0.6896957159042358, + "learning_rate": 9.999640163101436e-06, + "loss": 0.4969, + "step": 422 + }, + { + "epoch": 0.3114874815905744, + "grad_norm": 0.5258437991142273, + "learning_rate": 9.99958692266077e-06, + "loss": 0.4701, + "step": 423 + }, + { + "epoch": 0.3122238586156112, + "grad_norm": 0.6300147175788879, + "learning_rate": 9.999530010674552e-06, + "loss": 0.5038, + "step": 424 + }, + { + "epoch": 0.312960235640648, + "grad_norm": 0.5327275991439819, + "learning_rate": 9.999469427184573e-06, + "loss": 0.4895, + "step": 425 + }, + { + "epoch": 0.3136966126656848, + "grad_norm": 0.5516902804374695, + "learning_rate": 9.999405172235325e-06, + "loss": 0.4983, + "step": 426 + }, + { + "epoch": 0.31443298969072164, + "grad_norm": 0.5630642771720886, + "learning_rate": 9.999337245873999e-06, + "loss": 0.4625, + "step": 427 + }, + { + "epoch": 0.31516936671575846, + "grad_norm": 0.5986700654029846, + "learning_rate": 9.999265648150472e-06, + "loss": 0.524, + "step": 428 + }, + { + "epoch": 0.3159057437407953, + "grad_norm": 0.5675637125968933, + "learning_rate": 9.999190379117324e-06, + "loss": 0.484, + "step": 429 + }, + { + "epoch": 0.3166421207658321, + "grad_norm": 0.6256847977638245, + "learning_rate": 9.999111438829834e-06, + "loss": 0.5181, + "step": 430 + }, + { + "epoch": 0.3173784977908689, + "grad_norm": 0.6200090050697327, + "learning_rate": 9.999028827345969e-06, + "loss": 0.4901, + "step": 431 + }, + { + "epoch": 0.3181148748159057, + "grad_norm": 0.5858426690101624, + "learning_rate": 9.9989425447264e-06, + "loss": 0.4901, + "step": 432 + }, + { + "epoch": 0.31885125184094254, + "grad_norm": 0.5074707269668579, + "learning_rate": 9.998852591034488e-06, + "loss": 0.4749, + "step": 433 + }, + { + "epoch": 0.31958762886597936, + "grad_norm": 0.5885580778121948, + "learning_rate": 9.998758966336296e-06, + "loss": 0.4836, + "step": 434 + }, + { + "epoch": 0.32032400589101623, + "grad_norm": 0.5307214856147766, + "learning_rate": 9.998661670700576e-06, + "loss": 0.475, + "step": 435 + }, + { + "epoch": 0.32106038291605304, + "grad_norm": 0.6851618885993958, + "learning_rate": 9.998560704198776e-06, + "loss": 0.4521, + "step": 436 + }, + { + "epoch": 0.32179675994108986, + "grad_norm": 0.6349673271179199, + "learning_rate": 9.99845606690505e-06, + "loss": 0.4875, + "step": 437 + }, + { + "epoch": 0.3225331369661267, + "grad_norm": 0.5425810217857361, + "learning_rate": 9.998347758896234e-06, + "loss": 0.5036, + "step": 438 + }, + { + "epoch": 0.3232695139911635, + "grad_norm": 0.6161746382713318, + "learning_rate": 9.99823578025187e-06, + "loss": 0.4791, + "step": 439 + }, + { + "epoch": 0.3240058910162003, + "grad_norm": 0.51716548204422, + "learning_rate": 9.99812013105419e-06, + "loss": 0.4881, + "step": 440 + }, + { + "epoch": 0.3247422680412371, + "grad_norm": 0.6496948003768921, + "learning_rate": 9.998000811388122e-06, + "loss": 0.4804, + "step": 441 + }, + { + "epoch": 0.32547864506627394, + "grad_norm": 0.6212929487228394, + "learning_rate": 9.997877821341294e-06, + "loss": 0.4871, + "step": 442 + }, + { + "epoch": 0.32621502209131076, + "grad_norm": 0.6089105010032654, + "learning_rate": 9.997751161004026e-06, + "loss": 0.4518, + "step": 443 + }, + { + "epoch": 0.3269513991163476, + "grad_norm": 0.6576497554779053, + "learning_rate": 9.99762083046933e-06, + "loss": 0.4673, + "step": 444 + }, + { + "epoch": 0.3276877761413844, + "grad_norm": 0.6152659058570862, + "learning_rate": 9.99748682983292e-06, + "loss": 0.4811, + "step": 445 + }, + { + "epoch": 0.3284241531664212, + "grad_norm": 0.6389955878257751, + "learning_rate": 9.9973491591932e-06, + "loss": 0.4737, + "step": 446 + }, + { + "epoch": 0.329160530191458, + "grad_norm": 0.645021915435791, + "learning_rate": 9.997207818651273e-06, + "loss": 0.491, + "step": 447 + }, + { + "epoch": 0.32989690721649484, + "grad_norm": 0.5410696864128113, + "learning_rate": 9.997062808310935e-06, + "loss": 0.4895, + "step": 448 + }, + { + "epoch": 0.33063328424153166, + "grad_norm": 0.693676769733429, + "learning_rate": 9.996914128278677e-06, + "loss": 0.5014, + "step": 449 + }, + { + "epoch": 0.33136966126656847, + "grad_norm": 0.5651475191116333, + "learning_rate": 9.996761778663682e-06, + "loss": 0.477, + "step": 450 + }, + { + "epoch": 0.3321060382916053, + "grad_norm": 0.6107382774353027, + "learning_rate": 9.996605759577836e-06, + "loss": 0.4553, + "step": 451 + }, + { + "epoch": 0.3328424153166421, + "grad_norm": 0.540756344795227, + "learning_rate": 9.996446071135711e-06, + "loss": 0.4961, + "step": 452 + }, + { + "epoch": 0.3335787923416789, + "grad_norm": 0.5446950793266296, + "learning_rate": 9.99628271345458e-06, + "loss": 0.4891, + "step": 453 + }, + { + "epoch": 0.33431516936671574, + "grad_norm": 0.4631234407424927, + "learning_rate": 9.996115686654406e-06, + "loss": 0.4734, + "step": 454 + }, + { + "epoch": 0.33505154639175255, + "grad_norm": 0.4855410158634186, + "learning_rate": 9.995944990857848e-06, + "loss": 0.4658, + "step": 455 + }, + { + "epoch": 0.33578792341678937, + "grad_norm": 0.5123491287231445, + "learning_rate": 9.995770626190263e-06, + "loss": 0.4627, + "step": 456 + }, + { + "epoch": 0.33652430044182624, + "grad_norm": 0.512749969959259, + "learning_rate": 9.995592592779695e-06, + "loss": 0.4798, + "step": 457 + }, + { + "epoch": 0.33726067746686306, + "grad_norm": 0.5249326229095459, + "learning_rate": 9.995410890756891e-06, + "loss": 0.5012, + "step": 458 + }, + { + "epoch": 0.3379970544918999, + "grad_norm": 0.47508466243743896, + "learning_rate": 9.995225520255282e-06, + "loss": 0.5039, + "step": 459 + }, + { + "epoch": 0.3387334315169367, + "grad_norm": 0.4680541455745697, + "learning_rate": 9.995036481411005e-06, + "loss": 0.4666, + "step": 460 + }, + { + "epoch": 0.3394698085419735, + "grad_norm": 0.4403444528579712, + "learning_rate": 9.994843774362878e-06, + "loss": 0.4613, + "step": 461 + }, + { + "epoch": 0.3402061855670103, + "grad_norm": 0.48063525557518005, + "learning_rate": 9.994647399252423e-06, + "loss": 0.4709, + "step": 462 + }, + { + "epoch": 0.34094256259204714, + "grad_norm": 0.4822491705417633, + "learning_rate": 9.99444735622385e-06, + "loss": 0.4947, + "step": 463 + }, + { + "epoch": 0.34167893961708395, + "grad_norm": 0.533244788646698, + "learning_rate": 9.994243645424067e-06, + "loss": 0.4927, + "step": 464 + }, + { + "epoch": 0.34241531664212077, + "grad_norm": 0.49757879972457886, + "learning_rate": 9.99403626700267e-06, + "loss": 0.4851, + "step": 465 + }, + { + "epoch": 0.3431516936671576, + "grad_norm": 0.583299994468689, + "learning_rate": 9.993825221111955e-06, + "loss": 0.5015, + "step": 466 + }, + { + "epoch": 0.3438880706921944, + "grad_norm": 0.46042028069496155, + "learning_rate": 9.993610507906904e-06, + "loss": 0.5083, + "step": 467 + }, + { + "epoch": 0.3446244477172312, + "grad_norm": 0.5615798830986023, + "learning_rate": 9.993392127545198e-06, + "loss": 0.4744, + "step": 468 + }, + { + "epoch": 0.34536082474226804, + "grad_norm": 0.4136447310447693, + "learning_rate": 9.99317008018721e-06, + "loss": 0.4552, + "step": 469 + }, + { + "epoch": 0.34609720176730485, + "grad_norm": 0.5388152599334717, + "learning_rate": 9.992944365996002e-06, + "loss": 0.4907, + "step": 470 + }, + { + "epoch": 0.34683357879234167, + "grad_norm": 0.5055995583534241, + "learning_rate": 9.992714985137336e-06, + "loss": 0.4649, + "step": 471 + }, + { + "epoch": 0.3475699558173785, + "grad_norm": 0.599743664264679, + "learning_rate": 9.992481937779655e-06, + "loss": 0.4747, + "step": 472 + }, + { + "epoch": 0.3483063328424153, + "grad_norm": 0.5145841836929321, + "learning_rate": 9.99224522409411e-06, + "loss": 0.5214, + "step": 473 + }, + { + "epoch": 0.3490427098674521, + "grad_norm": 0.5827757716178894, + "learning_rate": 9.99200484425453e-06, + "loss": 0.5151, + "step": 474 + }, + { + "epoch": 0.34977908689248893, + "grad_norm": 0.502983808517456, + "learning_rate": 9.991760798437448e-06, + "loss": 0.4808, + "step": 475 + }, + { + "epoch": 0.35051546391752575, + "grad_norm": 0.5331257581710815, + "learning_rate": 9.99151308682208e-06, + "loss": 0.4544, + "step": 476 + }, + { + "epoch": 0.35125184094256257, + "grad_norm": 0.5600625276565552, + "learning_rate": 9.99126170959034e-06, + "loss": 0.4814, + "step": 477 + }, + { + "epoch": 0.35198821796759944, + "grad_norm": 0.5749978423118591, + "learning_rate": 9.991006666926832e-06, + "loss": 0.4766, + "step": 478 + }, + { + "epoch": 0.35272459499263625, + "grad_norm": 0.5478549599647522, + "learning_rate": 9.990747959018849e-06, + "loss": 0.4763, + "step": 479 + }, + { + "epoch": 0.35346097201767307, + "grad_norm": 0.467970609664917, + "learning_rate": 9.990485586056381e-06, + "loss": 0.4753, + "step": 480 + }, + { + "epoch": 0.3541973490427099, + "grad_norm": 0.4842618405818939, + "learning_rate": 9.990219548232106e-06, + "loss": 0.4721, + "step": 481 + }, + { + "epoch": 0.3549337260677467, + "grad_norm": 0.4819396138191223, + "learning_rate": 9.989949845741393e-06, + "loss": 0.449, + "step": 482 + }, + { + "epoch": 0.3556701030927835, + "grad_norm": 0.5315747857093811, + "learning_rate": 9.989676478782305e-06, + "loss": 0.4866, + "step": 483 + }, + { + "epoch": 0.35640648011782033, + "grad_norm": 0.5199954509735107, + "learning_rate": 9.989399447555594e-06, + "loss": 0.4858, + "step": 484 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 0.6086426377296448, + "learning_rate": 9.989118752264704e-06, + "loss": 0.5055, + "step": 485 + }, + { + "epoch": 0.35787923416789397, + "grad_norm": 0.5935198664665222, + "learning_rate": 9.988834393115768e-06, + "loss": 0.4878, + "step": 486 + }, + { + "epoch": 0.3586156111929308, + "grad_norm": 0.5054832696914673, + "learning_rate": 9.988546370317609e-06, + "loss": 0.4357, + "step": 487 + }, + { + "epoch": 0.3593519882179676, + "grad_norm": 0.47322750091552734, + "learning_rate": 9.988254684081746e-06, + "loss": 0.4607, + "step": 488 + }, + { + "epoch": 0.3600883652430044, + "grad_norm": 0.4437432289123535, + "learning_rate": 9.987959334622381e-06, + "loss": 0.4772, + "step": 489 + }, + { + "epoch": 0.36082474226804123, + "grad_norm": 0.4980893135070801, + "learning_rate": 9.987660322156413e-06, + "loss": 0.4725, + "step": 490 + }, + { + "epoch": 0.36156111929307805, + "grad_norm": 0.5222845077514648, + "learning_rate": 9.987357646903427e-06, + "loss": 0.4915, + "step": 491 + }, + { + "epoch": 0.36229749631811486, + "grad_norm": 0.45025306940078735, + "learning_rate": 9.987051309085698e-06, + "loss": 0.4995, + "step": 492 + }, + { + "epoch": 0.3630338733431517, + "grad_norm": 0.4636288583278656, + "learning_rate": 9.986741308928189e-06, + "loss": 0.4748, + "step": 493 + }, + { + "epoch": 0.3637702503681885, + "grad_norm": 0.49730953574180603, + "learning_rate": 9.986427646658559e-06, + "loss": 0.4876, + "step": 494 + }, + { + "epoch": 0.3645066273932253, + "grad_norm": 0.5664610862731934, + "learning_rate": 9.986110322507149e-06, + "loss": 0.4727, + "step": 495 + }, + { + "epoch": 0.36524300441826213, + "grad_norm": 0.48833712935447693, + "learning_rate": 9.985789336706993e-06, + "loss": 0.5076, + "step": 496 + }, + { + "epoch": 0.36597938144329895, + "grad_norm": 0.5941821336746216, + "learning_rate": 9.985464689493814e-06, + "loss": 0.5057, + "step": 497 + }, + { + "epoch": 0.36671575846833576, + "grad_norm": 0.4826650023460388, + "learning_rate": 9.985136381106022e-06, + "loss": 0.5201, + "step": 498 + }, + { + "epoch": 0.36745213549337263, + "grad_norm": 0.49603334069252014, + "learning_rate": 9.984804411784717e-06, + "loss": 0.4921, + "step": 499 + }, + { + "epoch": 0.36818851251840945, + "grad_norm": 0.4792011082172394, + "learning_rate": 9.984468781773688e-06, + "loss": 0.459, + "step": 500 + }, + { + "epoch": 0.36892488954344627, + "grad_norm": 0.4913840591907501, + "learning_rate": 9.98412949131941e-06, + "loss": 0.4734, + "step": 501 + }, + { + "epoch": 0.3696612665684831, + "grad_norm": 0.5027479529380798, + "learning_rate": 9.983786540671052e-06, + "loss": 0.4432, + "step": 502 + }, + { + "epoch": 0.3703976435935199, + "grad_norm": 0.4886789321899414, + "learning_rate": 9.98343993008046e-06, + "loss": 0.4696, + "step": 503 + }, + { + "epoch": 0.3711340206185567, + "grad_norm": 0.4682316482067108, + "learning_rate": 9.983089659802178e-06, + "loss": 0.4516, + "step": 504 + }, + { + "epoch": 0.37187039764359353, + "grad_norm": 0.465826153755188, + "learning_rate": 9.982735730093436e-06, + "loss": 0.4827, + "step": 505 + }, + { + "epoch": 0.37260677466863035, + "grad_norm": 0.5015543699264526, + "learning_rate": 9.982378141214144e-06, + "loss": 0.4415, + "step": 506 + }, + { + "epoch": 0.37334315169366716, + "grad_norm": 0.5372326970100403, + "learning_rate": 9.98201689342691e-06, + "loss": 0.4782, + "step": 507 + }, + { + "epoch": 0.374079528718704, + "grad_norm": 0.4749949276447296, + "learning_rate": 9.98165198699702e-06, + "loss": 0.4636, + "step": 508 + }, + { + "epoch": 0.3748159057437408, + "grad_norm": 0.459778755903244, + "learning_rate": 9.98128342219245e-06, + "loss": 0.4608, + "step": 509 + }, + { + "epoch": 0.3755522827687776, + "grad_norm": 0.5350815653800964, + "learning_rate": 9.980911199283864e-06, + "loss": 0.4877, + "step": 510 + }, + { + "epoch": 0.37628865979381443, + "grad_norm": 0.47353699803352356, + "learning_rate": 9.98053531854461e-06, + "loss": 0.4867, + "step": 511 + }, + { + "epoch": 0.37702503681885124, + "grad_norm": 0.4845249652862549, + "learning_rate": 9.980155780250728e-06, + "loss": 0.4473, + "step": 512 + }, + { + "epoch": 0.37776141384388806, + "grad_norm": 0.47789323329925537, + "learning_rate": 9.979772584680933e-06, + "loss": 0.4566, + "step": 513 + }, + { + "epoch": 0.3784977908689249, + "grad_norm": 0.5107650756835938, + "learning_rate": 9.979385732116638e-06, + "loss": 0.4341, + "step": 514 + }, + { + "epoch": 0.3792341678939617, + "grad_norm": 0.5246970057487488, + "learning_rate": 9.978995222841932e-06, + "loss": 0.4741, + "step": 515 + }, + { + "epoch": 0.3799705449189985, + "grad_norm": 0.5087926387786865, + "learning_rate": 9.978601057143593e-06, + "loss": 0.4582, + "step": 516 + }, + { + "epoch": 0.3807069219440353, + "grad_norm": 0.5556607246398926, + "learning_rate": 9.978203235311088e-06, + "loss": 0.4683, + "step": 517 + }, + { + "epoch": 0.38144329896907214, + "grad_norm": 0.5219265818595886, + "learning_rate": 9.97780175763656e-06, + "loss": 0.4449, + "step": 518 + }, + { + "epoch": 0.38217967599410896, + "grad_norm": 0.5306047201156616, + "learning_rate": 9.977396624414848e-06, + "loss": 0.439, + "step": 519 + }, + { + "epoch": 0.38291605301914583, + "grad_norm": 0.638886570930481, + "learning_rate": 9.976987835943465e-06, + "loss": 0.4803, + "step": 520 + }, + { + "epoch": 0.38365243004418265, + "grad_norm": 0.513490617275238, + "learning_rate": 9.976575392522617e-06, + "loss": 0.4631, + "step": 521 + }, + { + "epoch": 0.38438880706921946, + "grad_norm": 0.5516048073768616, + "learning_rate": 9.976159294455186e-06, + "loss": 0.4958, + "step": 522 + }, + { + "epoch": 0.3851251840942563, + "grad_norm": 0.48548823595046997, + "learning_rate": 9.975739542046742e-06, + "loss": 0.5089, + "step": 523 + }, + { + "epoch": 0.3858615611192931, + "grad_norm": 0.5853198170661926, + "learning_rate": 9.975316135605543e-06, + "loss": 0.4525, + "step": 524 + }, + { + "epoch": 0.3865979381443299, + "grad_norm": 0.513939619064331, + "learning_rate": 9.97488907544252e-06, + "loss": 0.466, + "step": 525 + }, + { + "epoch": 0.3873343151693667, + "grad_norm": 0.5338413119316101, + "learning_rate": 9.974458361871299e-06, + "loss": 0.478, + "step": 526 + }, + { + "epoch": 0.38807069219440354, + "grad_norm": 0.4851873815059662, + "learning_rate": 9.974023995208177e-06, + "loss": 0.4578, + "step": 527 + }, + { + "epoch": 0.38880706921944036, + "grad_norm": 0.4788038730621338, + "learning_rate": 9.973585975772144e-06, + "loss": 0.4351, + "step": 528 + }, + { + "epoch": 0.3895434462444772, + "grad_norm": 0.5574104189872742, + "learning_rate": 9.973144303884867e-06, + "loss": 0.4512, + "step": 529 + }, + { + "epoch": 0.390279823269514, + "grad_norm": 0.5594552755355835, + "learning_rate": 9.972698979870698e-06, + "loss": 0.4996, + "step": 530 + }, + { + "epoch": 0.3910162002945508, + "grad_norm": 0.5972340703010559, + "learning_rate": 9.972250004056665e-06, + "loss": 0.4908, + "step": 531 + }, + { + "epoch": 0.3917525773195876, + "grad_norm": 0.5098032355308533, + "learning_rate": 9.971797376772488e-06, + "loss": 0.4873, + "step": 532 + }, + { + "epoch": 0.39248895434462444, + "grad_norm": 0.5212668776512146, + "learning_rate": 9.971341098350557e-06, + "loss": 0.4603, + "step": 533 + }, + { + "epoch": 0.39322533136966126, + "grad_norm": 0.49205219745635986, + "learning_rate": 9.970881169125955e-06, + "loss": 0.4892, + "step": 534 + }, + { + "epoch": 0.3939617083946981, + "grad_norm": 0.4715671241283417, + "learning_rate": 9.970417589436435e-06, + "loss": 0.4701, + "step": 535 + }, + { + "epoch": 0.3946980854197349, + "grad_norm": 0.49158528447151184, + "learning_rate": 9.969950359622438e-06, + "loss": 0.449, + "step": 536 + }, + { + "epoch": 0.3954344624447717, + "grad_norm": 0.5873647928237915, + "learning_rate": 9.969479480027086e-06, + "loss": 0.4757, + "step": 537 + }, + { + "epoch": 0.3961708394698085, + "grad_norm": 0.4452936053276062, + "learning_rate": 9.969004950996175e-06, + "loss": 0.4569, + "step": 538 + }, + { + "epoch": 0.39690721649484534, + "grad_norm": 0.5542396306991577, + "learning_rate": 9.968526772878185e-06, + "loss": 0.4877, + "step": 539 + }, + { + "epoch": 0.39764359351988215, + "grad_norm": 0.5141607522964478, + "learning_rate": 9.968044946024277e-06, + "loss": 0.4906, + "step": 540 + }, + { + "epoch": 0.39837997054491897, + "grad_norm": 0.47367358207702637, + "learning_rate": 9.967559470788292e-06, + "loss": 0.4683, + "step": 541 + }, + { + "epoch": 0.39911634756995584, + "grad_norm": 0.5380948781967163, + "learning_rate": 9.967070347526743e-06, + "loss": 0.4878, + "step": 542 + }, + { + "epoch": 0.39985272459499266, + "grad_norm": 0.46736347675323486, + "learning_rate": 9.966577576598833e-06, + "loss": 0.4865, + "step": 543 + }, + { + "epoch": 0.4005891016200295, + "grad_norm": 0.46713143587112427, + "learning_rate": 9.966081158366434e-06, + "loss": 0.4388, + "step": 544 + }, + { + "epoch": 0.4013254786450663, + "grad_norm": 0.5499324798583984, + "learning_rate": 9.965581093194103e-06, + "loss": 0.4916, + "step": 545 + }, + { + "epoch": 0.4020618556701031, + "grad_norm": 0.45428466796875, + "learning_rate": 9.965077381449073e-06, + "loss": 0.496, + "step": 546 + }, + { + "epoch": 0.4027982326951399, + "grad_norm": 0.58051598072052, + "learning_rate": 9.96457002350125e-06, + "loss": 0.4618, + "step": 547 + }, + { + "epoch": 0.40353460972017674, + "grad_norm": 0.4578842520713806, + "learning_rate": 9.96405901972323e-06, + "loss": 0.4808, + "step": 548 + }, + { + "epoch": 0.40427098674521356, + "grad_norm": 0.5422972440719604, + "learning_rate": 9.96354437049027e-06, + "loss": 0.4721, + "step": 549 + }, + { + "epoch": 0.4050073637702504, + "grad_norm": 0.5694018602371216, + "learning_rate": 9.96302607618032e-06, + "loss": 0.4825, + "step": 550 + }, + { + "epoch": 0.4057437407952872, + "grad_norm": 0.5665662884712219, + "learning_rate": 9.962504137173997e-06, + "loss": 0.4877, + "step": 551 + }, + { + "epoch": 0.406480117820324, + "grad_norm": 0.5937358140945435, + "learning_rate": 9.961978553854597e-06, + "loss": 0.454, + "step": 552 + }, + { + "epoch": 0.4072164948453608, + "grad_norm": 0.5692036747932434, + "learning_rate": 9.961449326608093e-06, + "loss": 0.45, + "step": 553 + }, + { + "epoch": 0.40795287187039764, + "grad_norm": 0.5986254811286926, + "learning_rate": 9.960916455823134e-06, + "loss": 0.4764, + "step": 554 + }, + { + "epoch": 0.40868924889543445, + "grad_norm": 0.540023922920227, + "learning_rate": 9.960379941891043e-06, + "loss": 0.4402, + "step": 555 + }, + { + "epoch": 0.40942562592047127, + "grad_norm": 0.620135486125946, + "learning_rate": 9.959839785205821e-06, + "loss": 0.4795, + "step": 556 + }, + { + "epoch": 0.4101620029455081, + "grad_norm": 0.5036307573318481, + "learning_rate": 9.959295986164139e-06, + "loss": 0.4725, + "step": 557 + }, + { + "epoch": 0.4108983799705449, + "grad_norm": 0.576543390750885, + "learning_rate": 9.958748545165353e-06, + "loss": 0.4593, + "step": 558 + }, + { + "epoch": 0.4116347569955817, + "grad_norm": 0.45533323287963867, + "learning_rate": 9.95819746261148e-06, + "loss": 0.4724, + "step": 559 + }, + { + "epoch": 0.41237113402061853, + "grad_norm": 0.5245439410209656, + "learning_rate": 9.957642738907226e-06, + "loss": 0.4688, + "step": 560 + }, + { + "epoch": 0.41310751104565535, + "grad_norm": 0.4937553107738495, + "learning_rate": 9.957084374459957e-06, + "loss": 0.48, + "step": 561 + }, + { + "epoch": 0.41384388807069217, + "grad_norm": 0.5153213739395142, + "learning_rate": 9.956522369679722e-06, + "loss": 0.4745, + "step": 562 + }, + { + "epoch": 0.41458026509572904, + "grad_norm": 0.5216047763824463, + "learning_rate": 9.955956724979239e-06, + "loss": 0.4751, + "step": 563 + }, + { + "epoch": 0.41531664212076586, + "grad_norm": 0.6182007789611816, + "learning_rate": 9.955387440773902e-06, + "loss": 0.4892, + "step": 564 + }, + { + "epoch": 0.41605301914580267, + "grad_norm": 0.5679191946983337, + "learning_rate": 9.954814517481774e-06, + "loss": 0.4579, + "step": 565 + }, + { + "epoch": 0.4167893961708395, + "grad_norm": 0.6047842502593994, + "learning_rate": 9.954237955523593e-06, + "loss": 0.5028, + "step": 566 + }, + { + "epoch": 0.4175257731958763, + "grad_norm": 0.6306226253509521, + "learning_rate": 9.953657755322772e-06, + "loss": 0.4831, + "step": 567 + }, + { + "epoch": 0.4182621502209131, + "grad_norm": 0.47558221220970154, + "learning_rate": 9.953073917305386e-06, + "loss": 0.4821, + "step": 568 + }, + { + "epoch": 0.41899852724594994, + "grad_norm": 0.725439190864563, + "learning_rate": 9.952486441900196e-06, + "loss": 0.4881, + "step": 569 + }, + { + "epoch": 0.41973490427098675, + "grad_norm": 0.5623874664306641, + "learning_rate": 9.95189532953862e-06, + "loss": 0.4738, + "step": 570 + }, + { + "epoch": 0.42047128129602357, + "grad_norm": 0.6907193064689636, + "learning_rate": 9.951300580654756e-06, + "loss": 0.4706, + "step": 571 + }, + { + "epoch": 0.4212076583210604, + "grad_norm": 0.6095152497291565, + "learning_rate": 9.950702195685366e-06, + "loss": 0.4737, + "step": 572 + }, + { + "epoch": 0.4219440353460972, + "grad_norm": 0.6079631447792053, + "learning_rate": 9.95010017506989e-06, + "loss": 0.4676, + "step": 573 + }, + { + "epoch": 0.422680412371134, + "grad_norm": 0.825157880783081, + "learning_rate": 9.949494519250433e-06, + "loss": 0.4791, + "step": 574 + }, + { + "epoch": 0.42341678939617083, + "grad_norm": 0.6472348570823669, + "learning_rate": 9.94888522867177e-06, + "loss": 0.4969, + "step": 575 + }, + { + "epoch": 0.42415316642120765, + "grad_norm": 0.6898149847984314, + "learning_rate": 9.948272303781346e-06, + "loss": 0.4648, + "step": 576 + }, + { + "epoch": 0.42488954344624447, + "grad_norm": 0.5716066360473633, + "learning_rate": 9.94765574502927e-06, + "loss": 0.4745, + "step": 577 + }, + { + "epoch": 0.4256259204712813, + "grad_norm": 0.6825960874557495, + "learning_rate": 9.94703555286833e-06, + "loss": 0.4888, + "step": 578 + }, + { + "epoch": 0.4263622974963181, + "grad_norm": 0.5997359156608582, + "learning_rate": 9.946411727753975e-06, + "loss": 0.4807, + "step": 579 + }, + { + "epoch": 0.4270986745213549, + "grad_norm": 0.5727107524871826, + "learning_rate": 9.945784270144321e-06, + "loss": 0.4724, + "step": 580 + }, + { + "epoch": 0.42783505154639173, + "grad_norm": 0.5531638264656067, + "learning_rate": 9.945153180500157e-06, + "loss": 0.4379, + "step": 581 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.5092889070510864, + "learning_rate": 9.944518459284934e-06, + "loss": 0.4745, + "step": 582 + }, + { + "epoch": 0.42930780559646536, + "grad_norm": 0.5107025504112244, + "learning_rate": 9.943880106964772e-06, + "loss": 0.494, + "step": 583 + }, + { + "epoch": 0.43004418262150224, + "grad_norm": 0.5222765803337097, + "learning_rate": 9.943238124008458e-06, + "loss": 0.4561, + "step": 584 + }, + { + "epoch": 0.43078055964653905, + "grad_norm": 0.550485372543335, + "learning_rate": 9.942592510887448e-06, + "loss": 0.4548, + "step": 585 + }, + { + "epoch": 0.43151693667157587, + "grad_norm": 0.5831854939460754, + "learning_rate": 9.941943268075855e-06, + "loss": 0.4955, + "step": 586 + }, + { + "epoch": 0.4322533136966127, + "grad_norm": 0.4990431070327759, + "learning_rate": 9.941290396050467e-06, + "loss": 0.49, + "step": 587 + }, + { + "epoch": 0.4329896907216495, + "grad_norm": 0.6468625664710999, + "learning_rate": 9.940633895290733e-06, + "loss": 0.4799, + "step": 588 + }, + { + "epoch": 0.4337260677466863, + "grad_norm": 0.5194735527038574, + "learning_rate": 9.939973766278768e-06, + "loss": 0.4453, + "step": 589 + }, + { + "epoch": 0.43446244477172313, + "grad_norm": 0.5722482204437256, + "learning_rate": 9.939310009499348e-06, + "loss": 0.4798, + "step": 590 + }, + { + "epoch": 0.43519882179675995, + "grad_norm": 0.5742457509040833, + "learning_rate": 9.938642625439918e-06, + "loss": 0.4668, + "step": 591 + }, + { + "epoch": 0.43593519882179677, + "grad_norm": 0.5003108382225037, + "learning_rate": 9.937971614590587e-06, + "loss": 0.4648, + "step": 592 + }, + { + "epoch": 0.4366715758468336, + "grad_norm": 0.5724197030067444, + "learning_rate": 9.93729697744412e-06, + "loss": 0.4841, + "step": 593 + }, + { + "epoch": 0.4374079528718704, + "grad_norm": 0.5018848180770874, + "learning_rate": 9.936618714495954e-06, + "loss": 0.4589, + "step": 594 + }, + { + "epoch": 0.4381443298969072, + "grad_norm": 0.5396544337272644, + "learning_rate": 9.935936826244183e-06, + "loss": 0.4753, + "step": 595 + }, + { + "epoch": 0.43888070692194403, + "grad_norm": 0.5358166694641113, + "learning_rate": 9.935251313189564e-06, + "loss": 0.4915, + "step": 596 + }, + { + "epoch": 0.43961708394698085, + "grad_norm": 0.4927530884742737, + "learning_rate": 9.934562175835521e-06, + "loss": 0.4586, + "step": 597 + }, + { + "epoch": 0.44035346097201766, + "grad_norm": 0.47724655270576477, + "learning_rate": 9.933869414688134e-06, + "loss": 0.4734, + "step": 598 + }, + { + "epoch": 0.4410898379970545, + "grad_norm": 0.46607303619384766, + "learning_rate": 9.933173030256142e-06, + "loss": 0.4718, + "step": 599 + }, + { + "epoch": 0.4418262150220913, + "grad_norm": 0.5244195461273193, + "learning_rate": 9.932473023050954e-06, + "loss": 0.4441, + "step": 600 + }, + { + "epoch": 0.4425625920471281, + "grad_norm": 0.5027062892913818, + "learning_rate": 9.931769393586633e-06, + "loss": 0.4456, + "step": 601 + }, + { + "epoch": 0.44329896907216493, + "grad_norm": 0.4999453127384186, + "learning_rate": 9.9310621423799e-06, + "loss": 0.4858, + "step": 602 + }, + { + "epoch": 0.44403534609720174, + "grad_norm": 0.5346736311912537, + "learning_rate": 9.930351269950144e-06, + "loss": 0.4985, + "step": 603 + }, + { + "epoch": 0.44477172312223856, + "grad_norm": 0.5561801791191101, + "learning_rate": 9.929636776819404e-06, + "loss": 0.4554, + "step": 604 + }, + { + "epoch": 0.44550810014727543, + "grad_norm": 0.5683428049087524, + "learning_rate": 9.928918663512382e-06, + "loss": 0.4764, + "step": 605 + }, + { + "epoch": 0.44624447717231225, + "grad_norm": 0.5989724397659302, + "learning_rate": 9.928196930556442e-06, + "loss": 0.473, + "step": 606 + }, + { + "epoch": 0.44698085419734906, + "grad_norm": 0.5784966349601746, + "learning_rate": 9.9274715784816e-06, + "loss": 0.4653, + "step": 607 + }, + { + "epoch": 0.4477172312223859, + "grad_norm": 0.5616557598114014, + "learning_rate": 9.926742607820535e-06, + "loss": 0.4722, + "step": 608 + }, + { + "epoch": 0.4484536082474227, + "grad_norm": 0.6208142638206482, + "learning_rate": 9.926010019108579e-06, + "loss": 0.4639, + "step": 609 + }, + { + "epoch": 0.4491899852724595, + "grad_norm": 0.5640535354614258, + "learning_rate": 9.925273812883724e-06, + "loss": 0.4703, + "step": 610 + }, + { + "epoch": 0.44992636229749633, + "grad_norm": 0.7158654928207397, + "learning_rate": 9.924533989686618e-06, + "loss": 0.4697, + "step": 611 + }, + { + "epoch": 0.45066273932253315, + "grad_norm": 0.6342428922653198, + "learning_rate": 9.923790550060564e-06, + "loss": 0.4662, + "step": 612 + }, + { + "epoch": 0.45139911634756996, + "grad_norm": 0.4764558970928192, + "learning_rate": 9.923043494551522e-06, + "loss": 0.4519, + "step": 613 + }, + { + "epoch": 0.4521354933726068, + "grad_norm": 0.6603193879127502, + "learning_rate": 9.922292823708106e-06, + "loss": 0.5068, + "step": 614 + }, + { + "epoch": 0.4528718703976436, + "grad_norm": 0.522085428237915, + "learning_rate": 9.921538538081588e-06, + "loss": 0.4602, + "step": 615 + }, + { + "epoch": 0.4536082474226804, + "grad_norm": 0.53127121925354, + "learning_rate": 9.92078063822589e-06, + "loss": 0.461, + "step": 616 + }, + { + "epoch": 0.4543446244477172, + "grad_norm": 0.6258383393287659, + "learning_rate": 9.920019124697594e-06, + "loss": 0.489, + "step": 617 + }, + { + "epoch": 0.45508100147275404, + "grad_norm": 0.49485811591148376, + "learning_rate": 9.919253998055928e-06, + "loss": 0.447, + "step": 618 + }, + { + "epoch": 0.45581737849779086, + "grad_norm": 0.5272852182388306, + "learning_rate": 9.918485258862781e-06, + "loss": 0.478, + "step": 619 + }, + { + "epoch": 0.4565537555228277, + "grad_norm": 0.4806794822216034, + "learning_rate": 9.917712907682694e-06, + "loss": 0.4353, + "step": 620 + }, + { + "epoch": 0.4572901325478645, + "grad_norm": 0.501160204410553, + "learning_rate": 9.916936945082854e-06, + "loss": 0.4682, + "step": 621 + }, + { + "epoch": 0.4580265095729013, + "grad_norm": 0.5172646641731262, + "learning_rate": 9.916157371633106e-06, + "loss": 0.4505, + "step": 622 + }, + { + "epoch": 0.4587628865979381, + "grad_norm": 0.5966675281524658, + "learning_rate": 9.915374187905945e-06, + "loss": 0.5016, + "step": 623 + }, + { + "epoch": 0.45949926362297494, + "grad_norm": 0.4914330840110779, + "learning_rate": 9.91458739447652e-06, + "loss": 0.4455, + "step": 624 + }, + { + "epoch": 0.46023564064801176, + "grad_norm": 0.6271175146102905, + "learning_rate": 9.913796991922624e-06, + "loss": 0.4604, + "step": 625 + }, + { + "epoch": 0.46097201767304863, + "grad_norm": 0.5928018689155579, + "learning_rate": 9.913002980824709e-06, + "loss": 0.4519, + "step": 626 + }, + { + "epoch": 0.46170839469808544, + "grad_norm": 0.4849533140659332, + "learning_rate": 9.912205361765868e-06, + "loss": 0.5011, + "step": 627 + }, + { + "epoch": 0.46244477172312226, + "grad_norm": 0.5498746633529663, + "learning_rate": 9.911404135331852e-06, + "loss": 0.4567, + "step": 628 + }, + { + "epoch": 0.4631811487481591, + "grad_norm": 0.4846838712692261, + "learning_rate": 9.910599302111057e-06, + "loss": 0.4607, + "step": 629 + }, + { + "epoch": 0.4639175257731959, + "grad_norm": 0.5435061454772949, + "learning_rate": 9.909790862694528e-06, + "loss": 0.4507, + "step": 630 + }, + { + "epoch": 0.4646539027982327, + "grad_norm": 0.5106296539306641, + "learning_rate": 9.908978817675959e-06, + "loss": 0.4366, + "step": 631 + }, + { + "epoch": 0.4653902798232695, + "grad_norm": 0.4310411512851715, + "learning_rate": 9.908163167651688e-06, + "loss": 0.4729, + "step": 632 + }, + { + "epoch": 0.46612665684830634, + "grad_norm": 0.5732407569885254, + "learning_rate": 9.907343913220707e-06, + "loss": 0.4729, + "step": 633 + }, + { + "epoch": 0.46686303387334316, + "grad_norm": 0.6053887009620667, + "learning_rate": 9.90652105498465e-06, + "loss": 0.4671, + "step": 634 + }, + { + "epoch": 0.46759941089838, + "grad_norm": 0.4543951153755188, + "learning_rate": 9.905694593547803e-06, + "loss": 0.4631, + "step": 635 + }, + { + "epoch": 0.4683357879234168, + "grad_norm": 0.5218227505683899, + "learning_rate": 9.90486452951709e-06, + "loss": 0.4638, + "step": 636 + }, + { + "epoch": 0.4690721649484536, + "grad_norm": 0.5472959876060486, + "learning_rate": 9.904030863502086e-06, + "loss": 0.4677, + "step": 637 + }, + { + "epoch": 0.4698085419734904, + "grad_norm": 0.5236535668373108, + "learning_rate": 9.903193596115011e-06, + "loss": 0.466, + "step": 638 + }, + { + "epoch": 0.47054491899852724, + "grad_norm": 0.5994406342506409, + "learning_rate": 9.902352727970729e-06, + "loss": 0.4811, + "step": 639 + }, + { + "epoch": 0.47128129602356406, + "grad_norm": 0.5658840537071228, + "learning_rate": 9.901508259686746e-06, + "loss": 0.4728, + "step": 640 + }, + { + "epoch": 0.47201767304860087, + "grad_norm": 0.5107308030128479, + "learning_rate": 9.900660191883217e-06, + "loss": 0.4834, + "step": 641 + }, + { + "epoch": 0.4727540500736377, + "grad_norm": 0.5555645227432251, + "learning_rate": 9.899808525182935e-06, + "loss": 0.471, + "step": 642 + }, + { + "epoch": 0.4734904270986745, + "grad_norm": 0.6290595531463623, + "learning_rate": 9.89895326021134e-06, + "loss": 0.4718, + "step": 643 + }, + { + "epoch": 0.4742268041237113, + "grad_norm": 0.5025970935821533, + "learning_rate": 9.898094397596511e-06, + "loss": 0.4752, + "step": 644 + }, + { + "epoch": 0.47496318114874814, + "grad_norm": 0.60362309217453, + "learning_rate": 9.897231937969172e-06, + "loss": 0.4636, + "step": 645 + }, + { + "epoch": 0.47569955817378495, + "grad_norm": 0.6537862420082092, + "learning_rate": 9.896365881962687e-06, + "loss": 0.4574, + "step": 646 + }, + { + "epoch": 0.47643593519882177, + "grad_norm": 0.4715963304042816, + "learning_rate": 9.895496230213061e-06, + "loss": 0.4596, + "step": 647 + }, + { + "epoch": 0.47717231222385864, + "grad_norm": 0.5870654582977295, + "learning_rate": 9.894622983358941e-06, + "loss": 0.4472, + "step": 648 + }, + { + "epoch": 0.47790868924889546, + "grad_norm": 0.6155905723571777, + "learning_rate": 9.893746142041612e-06, + "loss": 0.4841, + "step": 649 + }, + { + "epoch": 0.4786450662739323, + "grad_norm": 0.5059301853179932, + "learning_rate": 9.892865706905e-06, + "loss": 0.454, + "step": 650 + }, + { + "epoch": 0.4793814432989691, + "grad_norm": 0.6735835671424866, + "learning_rate": 9.891981678595671e-06, + "loss": 0.4788, + "step": 651 + }, + { + "epoch": 0.4801178203240059, + "grad_norm": 0.6185165643692017, + "learning_rate": 9.891094057762827e-06, + "loss": 0.4509, + "step": 652 + }, + { + "epoch": 0.4808541973490427, + "grad_norm": 0.5275318026542664, + "learning_rate": 9.89020284505831e-06, + "loss": 0.4064, + "step": 653 + }, + { + "epoch": 0.48159057437407954, + "grad_norm": 0.5368750095367432, + "learning_rate": 9.889308041136601e-06, + "loss": 0.4633, + "step": 654 + }, + { + "epoch": 0.48232695139911635, + "grad_norm": 0.6606622934341431, + "learning_rate": 9.888409646654818e-06, + "loss": 0.4619, + "step": 655 + }, + { + "epoch": 0.48306332842415317, + "grad_norm": 0.6033420562744141, + "learning_rate": 9.88750766227271e-06, + "loss": 0.4549, + "step": 656 + }, + { + "epoch": 0.48379970544919, + "grad_norm": 0.5704071521759033, + "learning_rate": 9.886602088652672e-06, + "loss": 0.4584, + "step": 657 + }, + { + "epoch": 0.4845360824742268, + "grad_norm": 0.6391714811325073, + "learning_rate": 9.885692926459729e-06, + "loss": 0.4612, + "step": 658 + }, + { + "epoch": 0.4852724594992636, + "grad_norm": 0.49151089787483215, + "learning_rate": 9.88478017636154e-06, + "loss": 0.4524, + "step": 659 + }, + { + "epoch": 0.48600883652430044, + "grad_norm": 0.5738299489021301, + "learning_rate": 9.883863839028402e-06, + "loss": 0.4854, + "step": 660 + }, + { + "epoch": 0.48674521354933725, + "grad_norm": 0.5694648027420044, + "learning_rate": 9.882943915133247e-06, + "loss": 0.4583, + "step": 661 + }, + { + "epoch": 0.48748159057437407, + "grad_norm": 0.5113805532455444, + "learning_rate": 9.88202040535164e-06, + "loss": 0.4714, + "step": 662 + }, + { + "epoch": 0.4882179675994109, + "grad_norm": 0.5874478220939636, + "learning_rate": 9.881093310361773e-06, + "loss": 0.4851, + "step": 663 + }, + { + "epoch": 0.4889543446244477, + "grad_norm": 0.5066226124763489, + "learning_rate": 9.880162630844483e-06, + "loss": 0.4644, + "step": 664 + }, + { + "epoch": 0.4896907216494845, + "grad_norm": 0.5173943042755127, + "learning_rate": 9.879228367483228e-06, + "loss": 0.4526, + "step": 665 + }, + { + "epoch": 0.49042709867452133, + "grad_norm": 0.531655490398407, + "learning_rate": 9.878290520964107e-06, + "loss": 0.4503, + "step": 666 + }, + { + "epoch": 0.49116347569955815, + "grad_norm": 0.5501636266708374, + "learning_rate": 9.877349091975844e-06, + "loss": 0.4694, + "step": 667 + }, + { + "epoch": 0.49189985272459497, + "grad_norm": 0.532542884349823, + "learning_rate": 9.876404081209796e-06, + "loss": 0.4588, + "step": 668 + }, + { + "epoch": 0.49263622974963184, + "grad_norm": 0.4820326864719391, + "learning_rate": 9.87545548935995e-06, + "loss": 0.4698, + "step": 669 + }, + { + "epoch": 0.49337260677466865, + "grad_norm": 0.4267001152038574, + "learning_rate": 9.874503317122925e-06, + "loss": 0.4621, + "step": 670 + }, + { + "epoch": 0.49410898379970547, + "grad_norm": 0.4171275198459625, + "learning_rate": 9.873547565197965e-06, + "loss": 0.4448, + "step": 671 + }, + { + "epoch": 0.4948453608247423, + "grad_norm": 0.4105445146560669, + "learning_rate": 9.872588234286946e-06, + "loss": 0.4507, + "step": 672 + }, + { + "epoch": 0.4955817378497791, + "grad_norm": 0.46572741866111755, + "learning_rate": 9.871625325094375e-06, + "loss": 0.4485, + "step": 673 + }, + { + "epoch": 0.4963181148748159, + "grad_norm": 0.44582173228263855, + "learning_rate": 9.870658838327378e-06, + "loss": 0.4712, + "step": 674 + }, + { + "epoch": 0.49705449189985274, + "grad_norm": 0.5541704893112183, + "learning_rate": 9.869688774695719e-06, + "loss": 0.4527, + "step": 675 + }, + { + "epoch": 0.49779086892488955, + "grad_norm": 0.45377492904663086, + "learning_rate": 9.86871513491178e-06, + "loss": 0.4764, + "step": 676 + }, + { + "epoch": 0.49852724594992637, + "grad_norm": 0.5734421610832214, + "learning_rate": 9.867737919690573e-06, + "loss": 0.4822, + "step": 677 + }, + { + "epoch": 0.4992636229749632, + "grad_norm": 0.49860337376594543, + "learning_rate": 9.866757129749733e-06, + "loss": 0.4502, + "step": 678 + }, + { + "epoch": 0.5, + "grad_norm": 0.4667682945728302, + "learning_rate": 9.865772765809528e-06, + "loss": 0.4951, + "step": 679 + }, + { + "epoch": 0.5007363770250368, + "grad_norm": 0.5397236347198486, + "learning_rate": 9.864784828592842e-06, + "loss": 0.4939, + "step": 680 + }, + { + "epoch": 0.5014727540500736, + "grad_norm": 0.5122597813606262, + "learning_rate": 9.863793318825186e-06, + "loss": 0.4452, + "step": 681 + }, + { + "epoch": 0.5022091310751104, + "grad_norm": 0.4826493263244629, + "learning_rate": 9.862798237234697e-06, + "loss": 0.4731, + "step": 682 + }, + { + "epoch": 0.5029455081001473, + "grad_norm": 0.4937388300895691, + "learning_rate": 9.86179958455213e-06, + "loss": 0.4707, + "step": 683 + }, + { + "epoch": 0.5036818851251841, + "grad_norm": 0.49966996908187866, + "learning_rate": 9.860797361510867e-06, + "loss": 0.4764, + "step": 684 + }, + { + "epoch": 0.5044182621502209, + "grad_norm": 0.41720449924468994, + "learning_rate": 9.859791568846908e-06, + "loss": 0.458, + "step": 685 + }, + { + "epoch": 0.5051546391752577, + "grad_norm": 0.5433468222618103, + "learning_rate": 9.858782207298881e-06, + "loss": 0.4631, + "step": 686 + }, + { + "epoch": 0.5058910162002945, + "grad_norm": 0.5088586807250977, + "learning_rate": 9.857769277608027e-06, + "loss": 0.4453, + "step": 687 + }, + { + "epoch": 0.5066273932253313, + "grad_norm": 0.5024508237838745, + "learning_rate": 9.856752780518214e-06, + "loss": 0.4519, + "step": 688 + }, + { + "epoch": 0.5073637702503682, + "grad_norm": 0.5998568534851074, + "learning_rate": 9.855732716775923e-06, + "loss": 0.463, + "step": 689 + }, + { + "epoch": 0.508100147275405, + "grad_norm": 0.5431100130081177, + "learning_rate": 9.854709087130261e-06, + "loss": 0.4632, + "step": 690 + }, + { + "epoch": 0.5088365243004418, + "grad_norm": 0.5772528648376465, + "learning_rate": 9.853681892332948e-06, + "loss": 0.4629, + "step": 691 + }, + { + "epoch": 0.5095729013254786, + "grad_norm": 0.4999123215675354, + "learning_rate": 9.852651133138328e-06, + "loss": 0.4269, + "step": 692 + }, + { + "epoch": 0.5103092783505154, + "grad_norm": 0.6174575686454773, + "learning_rate": 9.851616810303359e-06, + "loss": 0.4618, + "step": 693 + }, + { + "epoch": 0.5110456553755522, + "grad_norm": 0.5877047777175903, + "learning_rate": 9.850578924587614e-06, + "loss": 0.4857, + "step": 694 + }, + { + "epoch": 0.5117820324005891, + "grad_norm": 0.636028528213501, + "learning_rate": 9.849537476753286e-06, + "loss": 0.4832, + "step": 695 + }, + { + "epoch": 0.5125184094256259, + "grad_norm": 0.5791111588478088, + "learning_rate": 9.848492467565182e-06, + "loss": 0.493, + "step": 696 + }, + { + "epoch": 0.5132547864506627, + "grad_norm": 0.557636559009552, + "learning_rate": 9.847443897790728e-06, + "loss": 0.4739, + "step": 697 + }, + { + "epoch": 0.5139911634756995, + "grad_norm": 0.47123220562934875, + "learning_rate": 9.84639176819996e-06, + "loss": 0.463, + "step": 698 + }, + { + "epoch": 0.5147275405007363, + "grad_norm": 0.6015990376472473, + "learning_rate": 9.845336079565529e-06, + "loss": 0.4914, + "step": 699 + }, + { + "epoch": 0.5154639175257731, + "grad_norm": 0.4684229791164398, + "learning_rate": 9.844276832662704e-06, + "loss": 0.4573, + "step": 700 + }, + { + "epoch": 0.5162002945508101, + "grad_norm": 0.4829377233982086, + "learning_rate": 9.843214028269361e-06, + "loss": 0.4414, + "step": 701 + }, + { + "epoch": 0.5169366715758469, + "grad_norm": 0.5336502194404602, + "learning_rate": 9.842147667165993e-06, + "loss": 0.4746, + "step": 702 + }, + { + "epoch": 0.5176730486008837, + "grad_norm": 0.4682151675224304, + "learning_rate": 9.841077750135702e-06, + "loss": 0.4873, + "step": 703 + }, + { + "epoch": 0.5184094256259205, + "grad_norm": 0.5173882842063904, + "learning_rate": 9.840004277964204e-06, + "loss": 0.4497, + "step": 704 + }, + { + "epoch": 0.5191458026509573, + "grad_norm": 0.4670022130012512, + "learning_rate": 9.838927251439823e-06, + "loss": 0.4857, + "step": 705 + }, + { + "epoch": 0.5198821796759941, + "grad_norm": 0.5181158185005188, + "learning_rate": 9.837846671353498e-06, + "loss": 0.4822, + "step": 706 + }, + { + "epoch": 0.520618556701031, + "grad_norm": 0.4926871955394745, + "learning_rate": 9.83676253849877e-06, + "loss": 0.4677, + "step": 707 + }, + { + "epoch": 0.5213549337260678, + "grad_norm": 0.5156863331794739, + "learning_rate": 9.835674853671797e-06, + "loss": 0.4865, + "step": 708 + }, + { + "epoch": 0.5220913107511046, + "grad_norm": 0.5189216136932373, + "learning_rate": 9.83458361767134e-06, + "loss": 0.4697, + "step": 709 + }, + { + "epoch": 0.5228276877761414, + "grad_norm": 0.537659764289856, + "learning_rate": 9.83348883129877e-06, + "loss": 0.4682, + "step": 710 + }, + { + "epoch": 0.5235640648011782, + "grad_norm": 0.5328056812286377, + "learning_rate": 9.832390495358066e-06, + "loss": 0.464, + "step": 711 + }, + { + "epoch": 0.524300441826215, + "grad_norm": 0.5192391872406006, + "learning_rate": 9.831288610655812e-06, + "loss": 0.451, + "step": 712 + }, + { + "epoch": 0.5250368188512519, + "grad_norm": 0.48723873496055603, + "learning_rate": 9.830183178001199e-06, + "loss": 0.4747, + "step": 713 + }, + { + "epoch": 0.5257731958762887, + "grad_norm": 0.594840407371521, + "learning_rate": 9.829074198206024e-06, + "loss": 0.4407, + "step": 714 + }, + { + "epoch": 0.5265095729013255, + "grad_norm": 0.5472460389137268, + "learning_rate": 9.827961672084685e-06, + "loss": 0.4487, + "step": 715 + }, + { + "epoch": 0.5272459499263623, + "grad_norm": 0.5027500987052917, + "learning_rate": 9.82684560045419e-06, + "loss": 0.4904, + "step": 716 + }, + { + "epoch": 0.5279823269513991, + "grad_norm": 0.5732831358909607, + "learning_rate": 9.82572598413415e-06, + "loss": 0.4505, + "step": 717 + }, + { + "epoch": 0.5287187039764359, + "grad_norm": 0.5125916600227356, + "learning_rate": 9.824602823946776e-06, + "loss": 0.4428, + "step": 718 + }, + { + "epoch": 0.5294550810014728, + "grad_norm": 0.5110583901405334, + "learning_rate": 9.823476120716882e-06, + "loss": 0.4561, + "step": 719 + }, + { + "epoch": 0.5301914580265096, + "grad_norm": 0.5457249879837036, + "learning_rate": 9.822345875271884e-06, + "loss": 0.4834, + "step": 720 + }, + { + "epoch": 0.5309278350515464, + "grad_norm": 0.4954557418823242, + "learning_rate": 9.821212088441803e-06, + "loss": 0.4562, + "step": 721 + }, + { + "epoch": 0.5316642120765832, + "grad_norm": 0.6068676710128784, + "learning_rate": 9.820074761059255e-06, + "loss": 0.5063, + "step": 722 + }, + { + "epoch": 0.53240058910162, + "grad_norm": 0.49514177441596985, + "learning_rate": 9.81893389395946e-06, + "loss": 0.4364, + "step": 723 + }, + { + "epoch": 0.5331369661266568, + "grad_norm": 0.5643453598022461, + "learning_rate": 9.817789487980237e-06, + "loss": 0.4487, + "step": 724 + }, + { + "epoch": 0.5338733431516937, + "grad_norm": 0.4869299530982971, + "learning_rate": 9.816641543962001e-06, + "loss": 0.4697, + "step": 725 + }, + { + "epoch": 0.5346097201767305, + "grad_norm": 0.647627055644989, + "learning_rate": 9.815490062747773e-06, + "loss": 0.5138, + "step": 726 + }, + { + "epoch": 0.5353460972017673, + "grad_norm": 0.5390787124633789, + "learning_rate": 9.81433504518316e-06, + "loss": 0.4643, + "step": 727 + }, + { + "epoch": 0.5360824742268041, + "grad_norm": 0.5298830270767212, + "learning_rate": 9.813176492116372e-06, + "loss": 0.4745, + "step": 728 + }, + { + "epoch": 0.5368188512518409, + "grad_norm": 0.5326485633850098, + "learning_rate": 9.812014404398219e-06, + "loss": 0.4452, + "step": 729 + }, + { + "epoch": 0.5375552282768777, + "grad_norm": 0.47868359088897705, + "learning_rate": 9.810848782882101e-06, + "loss": 0.467, + "step": 730 + }, + { + "epoch": 0.5382916053019146, + "grad_norm": 0.5476110577583313, + "learning_rate": 9.809679628424016e-06, + "loss": 0.4368, + "step": 731 + }, + { + "epoch": 0.5390279823269514, + "grad_norm": 0.5308613181114197, + "learning_rate": 9.808506941882556e-06, + "loss": 0.4712, + "step": 732 + }, + { + "epoch": 0.5397643593519882, + "grad_norm": 0.4515834152698517, + "learning_rate": 9.807330724118906e-06, + "loss": 0.4381, + "step": 733 + }, + { + "epoch": 0.540500736377025, + "grad_norm": 0.5537406206130981, + "learning_rate": 9.806150975996843e-06, + "loss": 0.4615, + "step": 734 + }, + { + "epoch": 0.5412371134020618, + "grad_norm": 0.47350478172302246, + "learning_rate": 9.80496769838274e-06, + "loss": 0.4433, + "step": 735 + }, + { + "epoch": 0.5419734904270986, + "grad_norm": 0.4860948920249939, + "learning_rate": 9.803780892145562e-06, + "loss": 0.4558, + "step": 736 + }, + { + "epoch": 0.5427098674521355, + "grad_norm": 0.6019650101661682, + "learning_rate": 9.802590558156863e-06, + "loss": 0.4454, + "step": 737 + }, + { + "epoch": 0.5434462444771723, + "grad_norm": 0.467506468296051, + "learning_rate": 9.801396697290786e-06, + "loss": 0.4628, + "step": 738 + }, + { + "epoch": 0.5441826215022091, + "grad_norm": 0.4791230857372284, + "learning_rate": 9.800199310424067e-06, + "loss": 0.4563, + "step": 739 + }, + { + "epoch": 0.5449189985272459, + "grad_norm": 0.5122166275978088, + "learning_rate": 9.798998398436031e-06, + "loss": 0.4831, + "step": 740 + }, + { + "epoch": 0.5456553755522827, + "grad_norm": 0.47877249121665955, + "learning_rate": 9.797793962208593e-06, + "loss": 0.465, + "step": 741 + }, + { + "epoch": 0.5463917525773195, + "grad_norm": 0.49181488156318665, + "learning_rate": 9.796586002626253e-06, + "loss": 0.4793, + "step": 742 + }, + { + "epoch": 0.5471281296023565, + "grad_norm": 0.48321738839149475, + "learning_rate": 9.795374520576102e-06, + "loss": 0.4757, + "step": 743 + }, + { + "epoch": 0.5478645066273933, + "grad_norm": 0.5315781831741333, + "learning_rate": 9.794159516947812e-06, + "loss": 0.468, + "step": 744 + }, + { + "epoch": 0.5486008836524301, + "grad_norm": 0.4678581655025482, + "learning_rate": 9.792940992633649e-06, + "loss": 0.4663, + "step": 745 + }, + { + "epoch": 0.5493372606774669, + "grad_norm": 0.4767686724662781, + "learning_rate": 9.791718948528457e-06, + "loss": 0.4687, + "step": 746 + }, + { + "epoch": 0.5500736377025037, + "grad_norm": 0.5320640206336975, + "learning_rate": 9.790493385529671e-06, + "loss": 0.4672, + "step": 747 + }, + { + "epoch": 0.5508100147275405, + "grad_norm": 0.45434436202049255, + "learning_rate": 9.789264304537307e-06, + "loss": 0.442, + "step": 748 + }, + { + "epoch": 0.5515463917525774, + "grad_norm": 0.5533967614173889, + "learning_rate": 9.788031706453964e-06, + "loss": 0.4539, + "step": 749 + }, + { + "epoch": 0.5522827687776142, + "grad_norm": 0.5042517781257629, + "learning_rate": 9.786795592184824e-06, + "loss": 0.4619, + "step": 750 + }, + { + "epoch": 0.553019145802651, + "grad_norm": 0.4778529107570648, + "learning_rate": 9.785555962637654e-06, + "loss": 0.4744, + "step": 751 + }, + { + "epoch": 0.5537555228276878, + "grad_norm": 0.5131283402442932, + "learning_rate": 9.784312818722799e-06, + "loss": 0.4449, + "step": 752 + }, + { + "epoch": 0.5544918998527246, + "grad_norm": 0.48384329676628113, + "learning_rate": 9.783066161353188e-06, + "loss": 0.474, + "step": 753 + }, + { + "epoch": 0.5552282768777614, + "grad_norm": 0.49050474166870117, + "learning_rate": 9.781815991444326e-06, + "loss": 0.4445, + "step": 754 + }, + { + "epoch": 0.5559646539027983, + "grad_norm": 0.5421154499053955, + "learning_rate": 9.7805623099143e-06, + "loss": 0.4668, + "step": 755 + }, + { + "epoch": 0.5567010309278351, + "grad_norm": 0.5321326851844788, + "learning_rate": 9.779305117683781e-06, + "loss": 0.4762, + "step": 756 + }, + { + "epoch": 0.5574374079528719, + "grad_norm": 0.5349854230880737, + "learning_rate": 9.778044415676007e-06, + "loss": 0.4774, + "step": 757 + }, + { + "epoch": 0.5581737849779087, + "grad_norm": 0.5067935585975647, + "learning_rate": 9.776780204816801e-06, + "loss": 0.4862, + "step": 758 + }, + { + "epoch": 0.5589101620029455, + "grad_norm": 0.4486694037914276, + "learning_rate": 9.775512486034564e-06, + "loss": 0.4481, + "step": 759 + }, + { + "epoch": 0.5596465390279823, + "grad_norm": 0.5573259592056274, + "learning_rate": 9.774241260260266e-06, + "loss": 0.4695, + "step": 760 + }, + { + "epoch": 0.5603829160530192, + "grad_norm": 0.45694780349731445, + "learning_rate": 9.77296652842746e-06, + "loss": 0.4728, + "step": 761 + }, + { + "epoch": 0.561119293078056, + "grad_norm": 0.46451568603515625, + "learning_rate": 9.771688291472269e-06, + "loss": 0.458, + "step": 762 + }, + { + "epoch": 0.5618556701030928, + "grad_norm": 0.5605217218399048, + "learning_rate": 9.770406550333393e-06, + "loss": 0.4492, + "step": 763 + }, + { + "epoch": 0.5625920471281296, + "grad_norm": 0.4403326213359833, + "learning_rate": 9.769121305952102e-06, + "loss": 0.4721, + "step": 764 + }, + { + "epoch": 0.5633284241531664, + "grad_norm": 0.5193141102790833, + "learning_rate": 9.767832559272244e-06, + "loss": 0.461, + "step": 765 + }, + { + "epoch": 0.5640648011782032, + "grad_norm": 0.49891531467437744, + "learning_rate": 9.766540311240232e-06, + "loss": 0.451, + "step": 766 + }, + { + "epoch": 0.56480117820324, + "grad_norm": 0.5359540581703186, + "learning_rate": 9.765244562805055e-06, + "loss": 0.4595, + "step": 767 + }, + { + "epoch": 0.5655375552282769, + "grad_norm": 0.49058595299720764, + "learning_rate": 9.76394531491827e-06, + "loss": 0.4896, + "step": 768 + }, + { + "epoch": 0.5662739322533137, + "grad_norm": 0.4365098476409912, + "learning_rate": 9.762642568534012e-06, + "loss": 0.4575, + "step": 769 + }, + { + "epoch": 0.5670103092783505, + "grad_norm": 0.4465166926383972, + "learning_rate": 9.76133632460897e-06, + "loss": 0.4771, + "step": 770 + }, + { + "epoch": 0.5677466863033873, + "grad_norm": 0.4014427363872528, + "learning_rate": 9.760026584102414e-06, + "loss": 0.4658, + "step": 771 + }, + { + "epoch": 0.5684830633284241, + "grad_norm": 0.42262935638427734, + "learning_rate": 9.758713347976179e-06, + "loss": 0.4551, + "step": 772 + }, + { + "epoch": 0.569219440353461, + "grad_norm": 0.43725284934043884, + "learning_rate": 9.757396617194663e-06, + "loss": 0.4512, + "step": 773 + }, + { + "epoch": 0.5699558173784978, + "grad_norm": 0.4782891273498535, + "learning_rate": 9.756076392724836e-06, + "loss": 0.4364, + "step": 774 + }, + { + "epoch": 0.5706921944035346, + "grad_norm": 0.4645746350288391, + "learning_rate": 9.75475267553623e-06, + "loss": 0.4487, + "step": 775 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.4626657962799072, + "learning_rate": 9.75342546660094e-06, + "loss": 0.4305, + "step": 776 + }, + { + "epoch": 0.5721649484536082, + "grad_norm": 0.519809365272522, + "learning_rate": 9.752094766893635e-06, + "loss": 0.4818, + "step": 777 + }, + { + "epoch": 0.572901325478645, + "grad_norm": 0.48324060440063477, + "learning_rate": 9.750760577391535e-06, + "loss": 0.4035, + "step": 778 + }, + { + "epoch": 0.5736377025036818, + "grad_norm": 0.5473081469535828, + "learning_rate": 9.74942289907443e-06, + "loss": 0.459, + "step": 779 + }, + { + "epoch": 0.5743740795287187, + "grad_norm": 0.4750016927719116, + "learning_rate": 9.74808173292467e-06, + "loss": 0.4627, + "step": 780 + }, + { + "epoch": 0.5751104565537555, + "grad_norm": 0.5140059590339661, + "learning_rate": 9.746737079927166e-06, + "loss": 0.475, + "step": 781 + }, + { + "epoch": 0.5758468335787923, + "grad_norm": 0.6078734397888184, + "learning_rate": 9.745388941069395e-06, + "loss": 0.4972, + "step": 782 + }, + { + "epoch": 0.5765832106038291, + "grad_norm": 0.5263283848762512, + "learning_rate": 9.744037317341383e-06, + "loss": 0.4598, + "step": 783 + }, + { + "epoch": 0.5773195876288659, + "grad_norm": 0.47057968378067017, + "learning_rate": 9.742682209735727e-06, + "loss": 0.4601, + "step": 784 + }, + { + "epoch": 0.5780559646539027, + "grad_norm": 0.5394490361213684, + "learning_rate": 9.741323619247575e-06, + "loss": 0.4812, + "step": 785 + }, + { + "epoch": 0.5787923416789397, + "grad_norm": 0.43855324387550354, + "learning_rate": 9.739961546874637e-06, + "loss": 0.4499, + "step": 786 + }, + { + "epoch": 0.5795287187039765, + "grad_norm": 0.5713585019111633, + "learning_rate": 9.738595993617172e-06, + "loss": 0.471, + "step": 787 + }, + { + "epoch": 0.5802650957290133, + "grad_norm": 0.48629793524742126, + "learning_rate": 9.737226960478006e-06, + "loss": 0.456, + "step": 788 + }, + { + "epoch": 0.5810014727540501, + "grad_norm": 0.48818135261535645, + "learning_rate": 9.735854448462516e-06, + "loss": 0.4399, + "step": 789 + }, + { + "epoch": 0.5817378497790869, + "grad_norm": 0.5030027031898499, + "learning_rate": 9.73447845857863e-06, + "loss": 0.4885, + "step": 790 + }, + { + "epoch": 0.5824742268041238, + "grad_norm": 0.5217078328132629, + "learning_rate": 9.733098991836834e-06, + "loss": 0.4426, + "step": 791 + }, + { + "epoch": 0.5832106038291606, + "grad_norm": 0.5095168948173523, + "learning_rate": 9.731716049250169e-06, + "loss": 0.4469, + "step": 792 + }, + { + "epoch": 0.5839469808541974, + "grad_norm": 0.4390084743499756, + "learning_rate": 9.730329631834225e-06, + "loss": 0.4538, + "step": 793 + }, + { + "epoch": 0.5846833578792342, + "grad_norm": 0.5591133236885071, + "learning_rate": 9.728939740607145e-06, + "loss": 0.4437, + "step": 794 + }, + { + "epoch": 0.585419734904271, + "grad_norm": 0.48923259973526, + "learning_rate": 9.727546376589622e-06, + "loss": 0.4809, + "step": 795 + }, + { + "epoch": 0.5861561119293078, + "grad_norm": 0.5047422647476196, + "learning_rate": 9.726149540804901e-06, + "loss": 0.4616, + "step": 796 + }, + { + "epoch": 0.5868924889543446, + "grad_norm": 0.47945308685302734, + "learning_rate": 9.724749234278779e-06, + "loss": 0.4662, + "step": 797 + }, + { + "epoch": 0.5876288659793815, + "grad_norm": 0.45942962169647217, + "learning_rate": 9.723345458039595e-06, + "loss": 0.4572, + "step": 798 + }, + { + "epoch": 0.5883652430044183, + "grad_norm": 0.4904220998287201, + "learning_rate": 9.721938213118241e-06, + "loss": 0.4517, + "step": 799 + }, + { + "epoch": 0.5891016200294551, + "grad_norm": 0.4758506715297699, + "learning_rate": 9.720527500548155e-06, + "loss": 0.4392, + "step": 800 + }, + { + "epoch": 0.5898379970544919, + "grad_norm": 0.5180002450942993, + "learning_rate": 9.719113321365324e-06, + "loss": 0.4538, + "step": 801 + }, + { + "epoch": 0.5905743740795287, + "grad_norm": 0.49426352977752686, + "learning_rate": 9.717695676608275e-06, + "loss": 0.4865, + "step": 802 + }, + { + "epoch": 0.5913107511045655, + "grad_norm": 0.5237860083580017, + "learning_rate": 9.716274567318085e-06, + "loss": 0.4542, + "step": 803 + }, + { + "epoch": 0.5920471281296024, + "grad_norm": 0.48790404200553894, + "learning_rate": 9.714849994538373e-06, + "loss": 0.4931, + "step": 804 + }, + { + "epoch": 0.5927835051546392, + "grad_norm": 0.49453994631767273, + "learning_rate": 9.713421959315303e-06, + "loss": 0.4633, + "step": 805 + }, + { + "epoch": 0.593519882179676, + "grad_norm": 0.5625678896903992, + "learning_rate": 9.71199046269758e-06, + "loss": 0.4769, + "step": 806 + }, + { + "epoch": 0.5942562592047128, + "grad_norm": 0.5181511044502258, + "learning_rate": 9.710555505736456e-06, + "loss": 0.4895, + "step": 807 + }, + { + "epoch": 0.5949926362297496, + "grad_norm": 0.4666210114955902, + "learning_rate": 9.709117089485714e-06, + "loss": 0.4602, + "step": 808 + }, + { + "epoch": 0.5957290132547864, + "grad_norm": 0.42587047815322876, + "learning_rate": 9.707675215001685e-06, + "loss": 0.4632, + "step": 809 + }, + { + "epoch": 0.5964653902798233, + "grad_norm": 0.5286933183670044, + "learning_rate": 9.706229883343242e-06, + "loss": 0.4657, + "step": 810 + }, + { + "epoch": 0.5972017673048601, + "grad_norm": 0.4617254436016083, + "learning_rate": 9.704781095571788e-06, + "loss": 0.4559, + "step": 811 + }, + { + "epoch": 0.5979381443298969, + "grad_norm": 0.5746539235115051, + "learning_rate": 9.70332885275127e-06, + "loss": 0.4548, + "step": 812 + }, + { + "epoch": 0.5986745213549337, + "grad_norm": 0.4450950026512146, + "learning_rate": 9.701873155948177e-06, + "loss": 0.4597, + "step": 813 + }, + { + "epoch": 0.5994108983799705, + "grad_norm": 0.48545292019844055, + "learning_rate": 9.70041400623152e-06, + "loss": 0.4684, + "step": 814 + }, + { + "epoch": 0.6001472754050073, + "grad_norm": 0.45839008688926697, + "learning_rate": 9.698951404672858e-06, + "loss": 0.436, + "step": 815 + }, + { + "epoch": 0.6008836524300442, + "grad_norm": 0.5452147722244263, + "learning_rate": 9.697485352346282e-06, + "loss": 0.43, + "step": 816 + }, + { + "epoch": 0.601620029455081, + "grad_norm": 0.4968319237232208, + "learning_rate": 9.696015850328418e-06, + "loss": 0.4438, + "step": 817 + }, + { + "epoch": 0.6023564064801178, + "grad_norm": 0.5042880773544312, + "learning_rate": 9.694542899698422e-06, + "loss": 0.4463, + "step": 818 + }, + { + "epoch": 0.6030927835051546, + "grad_norm": 0.5175356864929199, + "learning_rate": 9.693066501537984e-06, + "loss": 0.4614, + "step": 819 + }, + { + "epoch": 0.6038291605301914, + "grad_norm": 0.5082677006721497, + "learning_rate": 9.691586656931326e-06, + "loss": 0.4635, + "step": 820 + }, + { + "epoch": 0.6045655375552282, + "grad_norm": 0.5881240367889404, + "learning_rate": 9.690103366965204e-06, + "loss": 0.4691, + "step": 821 + }, + { + "epoch": 0.605301914580265, + "grad_norm": 0.4930703341960907, + "learning_rate": 9.688616632728898e-06, + "loss": 0.439, + "step": 822 + }, + { + "epoch": 0.6060382916053019, + "grad_norm": 0.4951449930667877, + "learning_rate": 9.687126455314221e-06, + "loss": 0.4493, + "step": 823 + }, + { + "epoch": 0.6067746686303387, + "grad_norm": 0.4590352475643158, + "learning_rate": 9.685632835815519e-06, + "loss": 0.4778, + "step": 824 + }, + { + "epoch": 0.6075110456553755, + "grad_norm": 0.547599196434021, + "learning_rate": 9.684135775329653e-06, + "loss": 0.4312, + "step": 825 + }, + { + "epoch": 0.6082474226804123, + "grad_norm": 0.4426873028278351, + "learning_rate": 9.682635274956026e-06, + "loss": 0.4735, + "step": 826 + }, + { + "epoch": 0.6089837997054491, + "grad_norm": 0.41880539059638977, + "learning_rate": 9.681131335796557e-06, + "loss": 0.4593, + "step": 827 + }, + { + "epoch": 0.6097201767304861, + "grad_norm": 0.5596433877944946, + "learning_rate": 9.679623958955692e-06, + "loss": 0.4922, + "step": 828 + }, + { + "epoch": 0.6104565537555229, + "grad_norm": 0.45280906558036804, + "learning_rate": 9.678113145540406e-06, + "loss": 0.4737, + "step": 829 + }, + { + "epoch": 0.6111929307805597, + "grad_norm": 0.6396340727806091, + "learning_rate": 9.676598896660194e-06, + "loss": 0.4929, + "step": 830 + }, + { + "epoch": 0.6119293078055965, + "grad_norm": 0.4250088036060333, + "learning_rate": 9.675081213427076e-06, + "loss": 0.421, + "step": 831 + }, + { + "epoch": 0.6126656848306333, + "grad_norm": 0.5473159551620483, + "learning_rate": 9.673560096955588e-06, + "loss": 0.4563, + "step": 832 + }, + { + "epoch": 0.6134020618556701, + "grad_norm": 0.46349871158599854, + "learning_rate": 9.672035548362797e-06, + "loss": 0.4308, + "step": 833 + }, + { + "epoch": 0.614138438880707, + "grad_norm": 0.48488184809684753, + "learning_rate": 9.670507568768281e-06, + "loss": 0.4767, + "step": 834 + }, + { + "epoch": 0.6148748159057438, + "grad_norm": 0.5228757262229919, + "learning_rate": 9.668976159294145e-06, + "loss": 0.48, + "step": 835 + }, + { + "epoch": 0.6156111929307806, + "grad_norm": 0.5242437720298767, + "learning_rate": 9.66744132106501e-06, + "loss": 0.483, + "step": 836 + }, + { + "epoch": 0.6163475699558174, + "grad_norm": 0.47697046399116516, + "learning_rate": 9.665903055208013e-06, + "loss": 0.4704, + "step": 837 + }, + { + "epoch": 0.6170839469808542, + "grad_norm": 0.5204547047615051, + "learning_rate": 9.664361362852813e-06, + "loss": 0.4873, + "step": 838 + }, + { + "epoch": 0.617820324005891, + "grad_norm": 0.4981822967529297, + "learning_rate": 9.66281624513158e-06, + "loss": 0.4354, + "step": 839 + }, + { + "epoch": 0.6185567010309279, + "grad_norm": 0.4810592830181122, + "learning_rate": 9.661267703178999e-06, + "loss": 0.4763, + "step": 840 + }, + { + "epoch": 0.6192930780559647, + "grad_norm": 0.4789879024028778, + "learning_rate": 9.659715738132279e-06, + "loss": 0.4358, + "step": 841 + }, + { + "epoch": 0.6200294550810015, + "grad_norm": 0.48347872495651245, + "learning_rate": 9.658160351131129e-06, + "loss": 0.5049, + "step": 842 + }, + { + "epoch": 0.6207658321060383, + "grad_norm": 0.48118627071380615, + "learning_rate": 9.656601543317784e-06, + "loss": 0.49, + "step": 843 + }, + { + "epoch": 0.6215022091310751, + "grad_norm": 0.4531796872615814, + "learning_rate": 9.655039315836983e-06, + "loss": 0.4872, + "step": 844 + }, + { + "epoch": 0.6222385861561119, + "grad_norm": 0.4885636270046234, + "learning_rate": 9.653473669835978e-06, + "loss": 0.4479, + "step": 845 + }, + { + "epoch": 0.6229749631811488, + "grad_norm": 0.49448058009147644, + "learning_rate": 9.651904606464536e-06, + "loss": 0.4701, + "step": 846 + }, + { + "epoch": 0.6237113402061856, + "grad_norm": 0.47281432151794434, + "learning_rate": 9.650332126874924e-06, + "loss": 0.4437, + "step": 847 + }, + { + "epoch": 0.6244477172312224, + "grad_norm": 0.5343514084815979, + "learning_rate": 9.648756232221925e-06, + "loss": 0.4899, + "step": 848 + }, + { + "epoch": 0.6251840942562592, + "grad_norm": 0.42154327034950256, + "learning_rate": 9.647176923662833e-06, + "loss": 0.4486, + "step": 849 + }, + { + "epoch": 0.625920471281296, + "grad_norm": 0.4753609597682953, + "learning_rate": 9.645594202357438e-06, + "loss": 0.4371, + "step": 850 + }, + { + "epoch": 0.6266568483063328, + "grad_norm": 0.6016598343849182, + "learning_rate": 9.644008069468047e-06, + "loss": 0.4638, + "step": 851 + }, + { + "epoch": 0.6273932253313697, + "grad_norm": 0.46637803316116333, + "learning_rate": 9.642418526159467e-06, + "loss": 0.4541, + "step": 852 + }, + { + "epoch": 0.6281296023564065, + "grad_norm": 0.5309515595436096, + "learning_rate": 9.64082557359901e-06, + "loss": 0.4409, + "step": 853 + }, + { + "epoch": 0.6288659793814433, + "grad_norm": 0.6249402761459351, + "learning_rate": 9.639229212956494e-06, + "loss": 0.5012, + "step": 854 + }, + { + "epoch": 0.6296023564064801, + "grad_norm": 0.43967223167419434, + "learning_rate": 9.637629445404237e-06, + "loss": 0.4438, + "step": 855 + }, + { + "epoch": 0.6303387334315169, + "grad_norm": 0.45795974135398865, + "learning_rate": 9.636026272117058e-06, + "loss": 0.445, + "step": 856 + }, + { + "epoch": 0.6310751104565537, + "grad_norm": 0.5375895500183105, + "learning_rate": 9.63441969427228e-06, + "loss": 0.4441, + "step": 857 + }, + { + "epoch": 0.6318114874815906, + "grad_norm": 0.4524497985839844, + "learning_rate": 9.632809713049726e-06, + "loss": 0.4581, + "step": 858 + }, + { + "epoch": 0.6325478645066274, + "grad_norm": 0.4217367172241211, + "learning_rate": 9.631196329631719e-06, + "loss": 0.4681, + "step": 859 + }, + { + "epoch": 0.6332842415316642, + "grad_norm": 0.5063615441322327, + "learning_rate": 9.629579545203076e-06, + "loss": 0.4632, + "step": 860 + }, + { + "epoch": 0.634020618556701, + "grad_norm": 0.46327000856399536, + "learning_rate": 9.627959360951118e-06, + "loss": 0.437, + "step": 861 + }, + { + "epoch": 0.6347569955817378, + "grad_norm": 0.4578990042209625, + "learning_rate": 9.626335778065655e-06, + "loss": 0.4618, + "step": 862 + }, + { + "epoch": 0.6354933726067746, + "grad_norm": 0.47172173857688904, + "learning_rate": 9.624708797739002e-06, + "loss": 0.4533, + "step": 863 + }, + { + "epoch": 0.6362297496318114, + "grad_norm": 0.5421725511550903, + "learning_rate": 9.623078421165958e-06, + "loss": 0.4663, + "step": 864 + }, + { + "epoch": 0.6369661266568483, + "grad_norm": 0.4460313320159912, + "learning_rate": 9.62144464954383e-06, + "loss": 0.4486, + "step": 865 + }, + { + "epoch": 0.6377025036818851, + "grad_norm": 0.5286182165145874, + "learning_rate": 9.619807484072405e-06, + "loss": 0.4611, + "step": 866 + }, + { + "epoch": 0.6384388807069219, + "grad_norm": 0.5084875226020813, + "learning_rate": 9.618166925953969e-06, + "loss": 0.478, + "step": 867 + }, + { + "epoch": 0.6391752577319587, + "grad_norm": 0.5308467745780945, + "learning_rate": 9.6165229763933e-06, + "loss": 0.4512, + "step": 868 + }, + { + "epoch": 0.6399116347569955, + "grad_norm": 0.5583586096763611, + "learning_rate": 9.614875636597662e-06, + "loss": 0.4665, + "step": 869 + }, + { + "epoch": 0.6406480117820325, + "grad_norm": 0.4598456919193268, + "learning_rate": 9.613224907776814e-06, + "loss": 0.4138, + "step": 870 + }, + { + "epoch": 0.6413843888070693, + "grad_norm": 0.43438005447387695, + "learning_rate": 9.611570791143e-06, + "loss": 0.4584, + "step": 871 + }, + { + "epoch": 0.6421207658321061, + "grad_norm": 0.5358659029006958, + "learning_rate": 9.609913287910957e-06, + "loss": 0.4371, + "step": 872 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 0.4852744936943054, + "learning_rate": 9.608252399297899e-06, + "loss": 0.4621, + "step": 873 + }, + { + "epoch": 0.6435935198821797, + "grad_norm": 0.4329935610294342, + "learning_rate": 9.606588126523537e-06, + "loss": 0.4368, + "step": 874 + }, + { + "epoch": 0.6443298969072165, + "grad_norm": 0.5553391575813293, + "learning_rate": 9.60492047081006e-06, + "loss": 0.4636, + "step": 875 + }, + { + "epoch": 0.6450662739322534, + "grad_norm": 0.42962443828582764, + "learning_rate": 9.603249433382145e-06, + "loss": 0.4829, + "step": 876 + }, + { + "epoch": 0.6458026509572902, + "grad_norm": 0.47330227494239807, + "learning_rate": 9.60157501546695e-06, + "loss": 0.4412, + "step": 877 + }, + { + "epoch": 0.646539027982327, + "grad_norm": 0.444964200258255, + "learning_rate": 9.599897218294122e-06, + "loss": 0.4608, + "step": 878 + }, + { + "epoch": 0.6472754050073638, + "grad_norm": 0.5015583038330078, + "learning_rate": 9.598216043095779e-06, + "loss": 0.4652, + "step": 879 + }, + { + "epoch": 0.6480117820324006, + "grad_norm": 0.5097722411155701, + "learning_rate": 9.596531491106528e-06, + "loss": 0.4401, + "step": 880 + }, + { + "epoch": 0.6487481590574374, + "grad_norm": 0.5142177939414978, + "learning_rate": 9.594843563563452e-06, + "loss": 0.4416, + "step": 881 + }, + { + "epoch": 0.6494845360824743, + "grad_norm": 0.45564496517181396, + "learning_rate": 9.593152261706113e-06, + "loss": 0.4457, + "step": 882 + }, + { + "epoch": 0.6502209131075111, + "grad_norm": 0.4867049753665924, + "learning_rate": 9.591457586776555e-06, + "loss": 0.4653, + "step": 883 + }, + { + "epoch": 0.6509572901325479, + "grad_norm": 0.4787944257259369, + "learning_rate": 9.589759540019293e-06, + "loss": 0.4849, + "step": 884 + }, + { + "epoch": 0.6516936671575847, + "grad_norm": 0.4713253080844879, + "learning_rate": 9.588058122681324e-06, + "loss": 0.432, + "step": 885 + }, + { + "epoch": 0.6524300441826215, + "grad_norm": 0.48565471172332764, + "learning_rate": 9.586353336012115e-06, + "loss": 0.4606, + "step": 886 + }, + { + "epoch": 0.6531664212076583, + "grad_norm": 0.48033156991004944, + "learning_rate": 9.58464518126361e-06, + "loss": 0.4695, + "step": 887 + }, + { + "epoch": 0.6539027982326951, + "grad_norm": 0.5356956720352173, + "learning_rate": 9.582933659690228e-06, + "loss": 0.4774, + "step": 888 + }, + { + "epoch": 0.654639175257732, + "grad_norm": 0.5078469514846802, + "learning_rate": 9.58121877254886e-06, + "loss": 0.4458, + "step": 889 + }, + { + "epoch": 0.6553755522827688, + "grad_norm": 0.6260564923286438, + "learning_rate": 9.57950052109886e-06, + "loss": 0.446, + "step": 890 + }, + { + "epoch": 0.6561119293078056, + "grad_norm": 0.4698830246925354, + "learning_rate": 9.577778906602069e-06, + "loss": 0.4637, + "step": 891 + }, + { + "epoch": 0.6568483063328424, + "grad_norm": 0.5720396041870117, + "learning_rate": 9.576053930322784e-06, + "loss": 0.4609, + "step": 892 + }, + { + "epoch": 0.6575846833578792, + "grad_norm": 0.5099953413009644, + "learning_rate": 9.574325593527776e-06, + "loss": 0.4763, + "step": 893 + }, + { + "epoch": 0.658321060382916, + "grad_norm": 0.582254946231842, + "learning_rate": 9.572593897486283e-06, + "loss": 0.4653, + "step": 894 + }, + { + "epoch": 0.6590574374079529, + "grad_norm": 0.5127898454666138, + "learning_rate": 9.57085884347001e-06, + "loss": 0.47, + "step": 895 + }, + { + "epoch": 0.6597938144329897, + "grad_norm": 0.5360256433486938, + "learning_rate": 9.56912043275313e-06, + "loss": 0.4483, + "step": 896 + }, + { + "epoch": 0.6605301914580265, + "grad_norm": 0.5025308728218079, + "learning_rate": 9.567378666612279e-06, + "loss": 0.4378, + "step": 897 + }, + { + "epoch": 0.6612665684830633, + "grad_norm": 0.5077435374259949, + "learning_rate": 9.565633546326555e-06, + "loss": 0.4492, + "step": 898 + }, + { + "epoch": 0.6620029455081001, + "grad_norm": 0.580901026725769, + "learning_rate": 9.563885073177523e-06, + "loss": 0.4499, + "step": 899 + }, + { + "epoch": 0.6627393225331369, + "grad_norm": 0.4233223795890808, + "learning_rate": 9.56213324844921e-06, + "loss": 0.4618, + "step": 900 + }, + { + "epoch": 0.6634756995581738, + "grad_norm": 0.5685048699378967, + "learning_rate": 9.560378073428103e-06, + "loss": 0.4363, + "step": 901 + }, + { + "epoch": 0.6642120765832106, + "grad_norm": 0.5028063654899597, + "learning_rate": 9.558619549403148e-06, + "loss": 0.4684, + "step": 902 + }, + { + "epoch": 0.6649484536082474, + "grad_norm": 0.5596533417701721, + "learning_rate": 9.556857677665752e-06, + "loss": 0.4379, + "step": 903 + }, + { + "epoch": 0.6656848306332842, + "grad_norm": 0.391494482755661, + "learning_rate": 9.555092459509783e-06, + "loss": 0.4425, + "step": 904 + }, + { + "epoch": 0.666421207658321, + "grad_norm": 0.5344425439834595, + "learning_rate": 9.553323896231558e-06, + "loss": 0.4473, + "step": 905 + }, + { + "epoch": 0.6671575846833578, + "grad_norm": 0.4520135819911957, + "learning_rate": 9.551551989129864e-06, + "loss": 0.4698, + "step": 906 + }, + { + "epoch": 0.6678939617083947, + "grad_norm": 0.5516674518585205, + "learning_rate": 9.549776739505932e-06, + "loss": 0.4926, + "step": 907 + }, + { + "epoch": 0.6686303387334315, + "grad_norm": 0.4469415545463562, + "learning_rate": 9.547998148663449e-06, + "loss": 0.4509, + "step": 908 + }, + { + "epoch": 0.6693667157584683, + "grad_norm": 0.4923398494720459, + "learning_rate": 9.546216217908564e-06, + "loss": 0.4591, + "step": 909 + }, + { + "epoch": 0.6701030927835051, + "grad_norm": 0.5482118129730225, + "learning_rate": 9.54443094854987e-06, + "loss": 0.4566, + "step": 910 + }, + { + "epoch": 0.6708394698085419, + "grad_norm": 0.5054488778114319, + "learning_rate": 9.542642341898416e-06, + "loss": 0.4513, + "step": 911 + }, + { + "epoch": 0.6715758468335787, + "grad_norm": 0.6202924847602844, + "learning_rate": 9.540850399267698e-06, + "loss": 0.4827, + "step": 912 + }, + { + "epoch": 0.6723122238586157, + "grad_norm": 0.45753154158592224, + "learning_rate": 9.539055121973668e-06, + "loss": 0.4849, + "step": 913 + }, + { + "epoch": 0.6730486008836525, + "grad_norm": 0.7275657653808594, + "learning_rate": 9.537256511334722e-06, + "loss": 0.4659, + "step": 914 + }, + { + "epoch": 0.6737849779086893, + "grad_norm": 0.43268248438835144, + "learning_rate": 9.535454568671705e-06, + "loss": 0.4408, + "step": 915 + }, + { + "epoch": 0.6745213549337261, + "grad_norm": 0.6889506578445435, + "learning_rate": 9.53364929530791e-06, + "loss": 0.4649, + "step": 916 + }, + { + "epoch": 0.6752577319587629, + "grad_norm": 0.4373452365398407, + "learning_rate": 9.531840692569073e-06, + "loss": 0.4474, + "step": 917 + }, + { + "epoch": 0.6759941089837997, + "grad_norm": 0.6262958645820618, + "learning_rate": 9.530028761783379e-06, + "loss": 0.4589, + "step": 918 + }, + { + "epoch": 0.6767304860088366, + "grad_norm": 0.4766991436481476, + "learning_rate": 9.528213504281457e-06, + "loss": 0.4724, + "step": 919 + }, + { + "epoch": 0.6774668630338734, + "grad_norm": 0.43806901574134827, + "learning_rate": 9.526394921396373e-06, + "loss": 0.4478, + "step": 920 + }, + { + "epoch": 0.6782032400589102, + "grad_norm": 0.509490966796875, + "learning_rate": 9.524573014463643e-06, + "loss": 0.4739, + "step": 921 + }, + { + "epoch": 0.678939617083947, + "grad_norm": 0.4827909469604492, + "learning_rate": 9.52274778482122e-06, + "loss": 0.472, + "step": 922 + }, + { + "epoch": 0.6796759941089838, + "grad_norm": 0.46269240975379944, + "learning_rate": 9.520919233809494e-06, + "loss": 0.4498, + "step": 923 + }, + { + "epoch": 0.6804123711340206, + "grad_norm": 0.5521724224090576, + "learning_rate": 9.519087362771302e-06, + "loss": 0.4634, + "step": 924 + }, + { + "epoch": 0.6811487481590575, + "grad_norm": 0.5625545978546143, + "learning_rate": 9.517252173051912e-06, + "loss": 0.4552, + "step": 925 + }, + { + "epoch": 0.6818851251840943, + "grad_norm": 0.4687793552875519, + "learning_rate": 9.515413665999034e-06, + "loss": 0.4439, + "step": 926 + }, + { + "epoch": 0.6826215022091311, + "grad_norm": 0.6292194724082947, + "learning_rate": 9.51357184296281e-06, + "loss": 0.47, + "step": 927 + }, + { + "epoch": 0.6833578792341679, + "grad_norm": 0.5816941857337952, + "learning_rate": 9.51172670529582e-06, + "loss": 0.4765, + "step": 928 + }, + { + "epoch": 0.6840942562592047, + "grad_norm": 0.5308511853218079, + "learning_rate": 9.509878254353076e-06, + "loss": 0.4756, + "step": 929 + }, + { + "epoch": 0.6848306332842415, + "grad_norm": 0.6591737866401672, + "learning_rate": 9.508026491492027e-06, + "loss": 0.5005, + "step": 930 + }, + { + "epoch": 0.6855670103092784, + "grad_norm": 0.477727472782135, + "learning_rate": 9.50617141807255e-06, + "loss": 0.4663, + "step": 931 + }, + { + "epoch": 0.6863033873343152, + "grad_norm": 0.6698769927024841, + "learning_rate": 9.504313035456955e-06, + "loss": 0.4723, + "step": 932 + }, + { + "epoch": 0.687039764359352, + "grad_norm": 0.5323542952537537, + "learning_rate": 9.502451345009984e-06, + "loss": 0.4566, + "step": 933 + }, + { + "epoch": 0.6877761413843888, + "grad_norm": 0.5885183215141296, + "learning_rate": 9.500586348098803e-06, + "loss": 0.457, + "step": 934 + }, + { + "epoch": 0.6885125184094256, + "grad_norm": 0.6716952919960022, + "learning_rate": 9.498718046093013e-06, + "loss": 0.4514, + "step": 935 + }, + { + "epoch": 0.6892488954344624, + "grad_norm": 0.5240690112113953, + "learning_rate": 9.496846440364634e-06, + "loss": 0.4739, + "step": 936 + }, + { + "epoch": 0.6899852724594993, + "grad_norm": 0.7379181981086731, + "learning_rate": 9.49497153228812e-06, + "loss": 0.4672, + "step": 937 + }, + { + "epoch": 0.6907216494845361, + "grad_norm": 0.4711780250072479, + "learning_rate": 9.493093323240348e-06, + "loss": 0.4378, + "step": 938 + }, + { + "epoch": 0.6914580265095729, + "grad_norm": 0.661437451839447, + "learning_rate": 9.491211814600613e-06, + "loss": 0.4741, + "step": 939 + }, + { + "epoch": 0.6921944035346097, + "grad_norm": 0.5604520440101624, + "learning_rate": 9.489327007750644e-06, + "loss": 0.4706, + "step": 940 + }, + { + "epoch": 0.6929307805596465, + "grad_norm": 0.5789750814437866, + "learning_rate": 9.487438904074581e-06, + "loss": 0.4202, + "step": 941 + }, + { + "epoch": 0.6936671575846833, + "grad_norm": 0.6015188694000244, + "learning_rate": 9.485547504958993e-06, + "loss": 0.4321, + "step": 942 + }, + { + "epoch": 0.6944035346097202, + "grad_norm": 0.5518826246261597, + "learning_rate": 9.483652811792866e-06, + "loss": 0.4777, + "step": 943 + }, + { + "epoch": 0.695139911634757, + "grad_norm": 0.5786319375038147, + "learning_rate": 9.481754825967606e-06, + "loss": 0.4405, + "step": 944 + }, + { + "epoch": 0.6958762886597938, + "grad_norm": 0.5292356610298157, + "learning_rate": 9.479853548877033e-06, + "loss": 0.4367, + "step": 945 + }, + { + "epoch": 0.6966126656848306, + "grad_norm": 0.6690455675125122, + "learning_rate": 9.477948981917393e-06, + "loss": 0.4381, + "step": 946 + }, + { + "epoch": 0.6973490427098674, + "grad_norm": 0.4962507486343384, + "learning_rate": 9.476041126487341e-06, + "loss": 0.4595, + "step": 947 + }, + { + "epoch": 0.6980854197349042, + "grad_norm": 0.5820249915122986, + "learning_rate": 9.474129983987943e-06, + "loss": 0.4904, + "step": 948 + }, + { + "epoch": 0.698821796759941, + "grad_norm": 0.5604087710380554, + "learning_rate": 9.472215555822691e-06, + "loss": 0.4706, + "step": 949 + }, + { + "epoch": 0.6995581737849779, + "grad_norm": 0.4952147603034973, + "learning_rate": 9.47029784339748e-06, + "loss": 0.4787, + "step": 950 + }, + { + "epoch": 0.7002945508100147, + "grad_norm": 0.5814775228500366, + "learning_rate": 9.468376848120619e-06, + "loss": 0.4813, + "step": 951 + }, + { + "epoch": 0.7010309278350515, + "grad_norm": 0.4648406207561493, + "learning_rate": 9.466452571402833e-06, + "loss": 0.4607, + "step": 952 + }, + { + "epoch": 0.7017673048600883, + "grad_norm": 0.5979225039482117, + "learning_rate": 9.464525014657249e-06, + "loss": 0.4521, + "step": 953 + }, + { + "epoch": 0.7025036818851251, + "grad_norm": 0.4992789328098297, + "learning_rate": 9.462594179299408e-06, + "loss": 0.4686, + "step": 954 + }, + { + "epoch": 0.7032400589101621, + "grad_norm": 0.5263684391975403, + "learning_rate": 9.460660066747255e-06, + "loss": 0.4618, + "step": 955 + }, + { + "epoch": 0.7039764359351989, + "grad_norm": 0.5737230777740479, + "learning_rate": 9.458722678421146e-06, + "loss": 0.4294, + "step": 956 + }, + { + "epoch": 0.7047128129602357, + "grad_norm": 0.48960253596305847, + "learning_rate": 9.45678201574384e-06, + "loss": 0.4361, + "step": 957 + }, + { + "epoch": 0.7054491899852725, + "grad_norm": 0.6119993925094604, + "learning_rate": 9.454838080140501e-06, + "loss": 0.4593, + "step": 958 + }, + { + "epoch": 0.7061855670103093, + "grad_norm": 0.5781173706054688, + "learning_rate": 9.452890873038697e-06, + "loss": 0.4643, + "step": 959 + }, + { + "epoch": 0.7069219440353461, + "grad_norm": 0.7216101884841919, + "learning_rate": 9.450940395868397e-06, + "loss": 0.468, + "step": 960 + }, + { + "epoch": 0.707658321060383, + "grad_norm": 0.47883689403533936, + "learning_rate": 9.448986650061973e-06, + "loss": 0.4386, + "step": 961 + }, + { + "epoch": 0.7083946980854198, + "grad_norm": 0.5716425180435181, + "learning_rate": 9.447029637054198e-06, + "loss": 0.5083, + "step": 962 + }, + { + "epoch": 0.7091310751104566, + "grad_norm": 0.5576707720756531, + "learning_rate": 9.445069358282242e-06, + "loss": 0.4811, + "step": 963 + }, + { + "epoch": 0.7098674521354934, + "grad_norm": 0.4840952455997467, + "learning_rate": 9.443105815185674e-06, + "loss": 0.4485, + "step": 964 + }, + { + "epoch": 0.7106038291605302, + "grad_norm": 0.6055101156234741, + "learning_rate": 9.44113900920646e-06, + "loss": 0.4484, + "step": 965 + }, + { + "epoch": 0.711340206185567, + "grad_norm": 0.4516480267047882, + "learning_rate": 9.439168941788965e-06, + "loss": 0.4344, + "step": 966 + }, + { + "epoch": 0.7120765832106039, + "grad_norm": 0.5439156889915466, + "learning_rate": 9.437195614379947e-06, + "loss": 0.4537, + "step": 967 + }, + { + "epoch": 0.7128129602356407, + "grad_norm": 0.559701681137085, + "learning_rate": 9.435219028428558e-06, + "loss": 0.4527, + "step": 968 + }, + { + "epoch": 0.7135493372606775, + "grad_norm": 0.4636538326740265, + "learning_rate": 9.43323918538634e-06, + "loss": 0.47, + "step": 969 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.4914974272251129, + "learning_rate": 9.431256086707233e-06, + "loss": 0.4688, + "step": 970 + }, + { + "epoch": 0.7150220913107511, + "grad_norm": 0.5754413604736328, + "learning_rate": 9.429269733847563e-06, + "loss": 0.4754, + "step": 971 + }, + { + "epoch": 0.7157584683357879, + "grad_norm": 0.4941750168800354, + "learning_rate": 9.427280128266049e-06, + "loss": 0.4714, + "step": 972 + }, + { + "epoch": 0.7164948453608248, + "grad_norm": 0.5356074571609497, + "learning_rate": 9.425287271423797e-06, + "loss": 0.433, + "step": 973 + }, + { + "epoch": 0.7172312223858616, + "grad_norm": 0.5081741809844971, + "learning_rate": 9.4232911647843e-06, + "loss": 0.4652, + "step": 974 + }, + { + "epoch": 0.7179675994108984, + "grad_norm": 0.4957806169986725, + "learning_rate": 9.42129180981344e-06, + "loss": 0.4731, + "step": 975 + }, + { + "epoch": 0.7187039764359352, + "grad_norm": 0.5311683416366577, + "learning_rate": 9.41928920797948e-06, + "loss": 0.4396, + "step": 976 + }, + { + "epoch": 0.719440353460972, + "grad_norm": 0.44548124074935913, + "learning_rate": 9.417283360753073e-06, + "loss": 0.4488, + "step": 977 + }, + { + "epoch": 0.7201767304860088, + "grad_norm": 0.556957483291626, + "learning_rate": 9.415274269607253e-06, + "loss": 0.4302, + "step": 978 + }, + { + "epoch": 0.7209131075110456, + "grad_norm": 0.478667289018631, + "learning_rate": 9.413261936017433e-06, + "loss": 0.4154, + "step": 979 + }, + { + "epoch": 0.7216494845360825, + "grad_norm": 0.535645604133606, + "learning_rate": 9.41124636146141e-06, + "loss": 0.4511, + "step": 980 + }, + { + "epoch": 0.7223858615611193, + "grad_norm": 0.5650351643562317, + "learning_rate": 9.409227547419364e-06, + "loss": 0.4624, + "step": 981 + }, + { + "epoch": 0.7231222385861561, + "grad_norm": 0.4671155512332916, + "learning_rate": 9.407205495373849e-06, + "loss": 0.462, + "step": 982 + }, + { + "epoch": 0.7238586156111929, + "grad_norm": 0.4867047965526581, + "learning_rate": 9.405180206809799e-06, + "loss": 0.4013, + "step": 983 + }, + { + "epoch": 0.7245949926362297, + "grad_norm": 0.7005500197410583, + "learning_rate": 9.403151683214525e-06, + "loss": 0.4596, + "step": 984 + }, + { + "epoch": 0.7253313696612665, + "grad_norm": 0.47493258118629456, + "learning_rate": 9.401119926077714e-06, + "loss": 0.4439, + "step": 985 + }, + { + "epoch": 0.7260677466863034, + "grad_norm": 0.6544631123542786, + "learning_rate": 9.399084936891424e-06, + "loss": 0.4709, + "step": 986 + }, + { + "epoch": 0.7268041237113402, + "grad_norm": 0.4934577941894531, + "learning_rate": 9.397046717150095e-06, + "loss": 0.4537, + "step": 987 + }, + { + "epoch": 0.727540500736377, + "grad_norm": 0.5570245981216431, + "learning_rate": 9.39500526835053e-06, + "loss": 0.4462, + "step": 988 + }, + { + "epoch": 0.7282768777614138, + "grad_norm": 0.6459545493125916, + "learning_rate": 9.392960591991908e-06, + "loss": 0.4522, + "step": 989 + }, + { + "epoch": 0.7290132547864506, + "grad_norm": 0.5408301949501038, + "learning_rate": 9.39091268957578e-06, + "loss": 0.4684, + "step": 990 + }, + { + "epoch": 0.7297496318114874, + "grad_norm": 0.6333877444267273, + "learning_rate": 9.388861562606059e-06, + "loss": 0.4797, + "step": 991 + }, + { + "epoch": 0.7304860088365243, + "grad_norm": 0.4390884041786194, + "learning_rate": 9.386807212589036e-06, + "loss": 0.4377, + "step": 992 + }, + { + "epoch": 0.7312223858615611, + "grad_norm": 0.4864395260810852, + "learning_rate": 9.384749641033358e-06, + "loss": 0.478, + "step": 993 + }, + { + "epoch": 0.7319587628865979, + "grad_norm": 0.4967077076435089, + "learning_rate": 9.382688849450049e-06, + "loss": 0.4222, + "step": 994 + }, + { + "epoch": 0.7326951399116347, + "grad_norm": 0.45611369609832764, + "learning_rate": 9.380624839352486e-06, + "loss": 0.4561, + "step": 995 + }, + { + "epoch": 0.7334315169366715, + "grad_norm": 0.5901041626930237, + "learning_rate": 9.37855761225642e-06, + "loss": 0.4795, + "step": 996 + }, + { + "epoch": 0.7341678939617083, + "grad_norm": 0.4063359498977661, + "learning_rate": 9.37648716967996e-06, + "loss": 0.4684, + "step": 997 + }, + { + "epoch": 0.7349042709867453, + "grad_norm": 0.5191907286643982, + "learning_rate": 9.374413513143574e-06, + "loss": 0.438, + "step": 998 + }, + { + "epoch": 0.7356406480117821, + "grad_norm": 0.4642137885093689, + "learning_rate": 9.372336644170096e-06, + "loss": 0.432, + "step": 999 + }, + { + "epoch": 0.7363770250368189, + "grad_norm": 0.5185231566429138, + "learning_rate": 9.370256564284713e-06, + "loss": 0.4519, + "step": 1000 + }, + { + "epoch": 0.7371134020618557, + "grad_norm": 0.4697466790676117, + "learning_rate": 9.368173275014973e-06, + "loss": 0.4226, + "step": 1001 + }, + { + "epoch": 0.7378497790868925, + "grad_norm": 0.49580705165863037, + "learning_rate": 9.366086777890785e-06, + "loss": 0.475, + "step": 1002 + }, + { + "epoch": 0.7385861561119293, + "grad_norm": 0.5398573875427246, + "learning_rate": 9.363997074444402e-06, + "loss": 0.4605, + "step": 1003 + }, + { + "epoch": 0.7393225331369662, + "grad_norm": 0.49332237243652344, + "learning_rate": 9.361904166210443e-06, + "loss": 0.4483, + "step": 1004 + }, + { + "epoch": 0.740058910162003, + "grad_norm": 0.6060042977333069, + "learning_rate": 9.359808054725877e-06, + "loss": 0.4546, + "step": 1005 + }, + { + "epoch": 0.7407952871870398, + "grad_norm": 0.5035642385482788, + "learning_rate": 9.357708741530025e-06, + "loss": 0.4692, + "step": 1006 + }, + { + "epoch": 0.7415316642120766, + "grad_norm": 0.5561889410018921, + "learning_rate": 9.355606228164559e-06, + "loss": 0.4416, + "step": 1007 + }, + { + "epoch": 0.7422680412371134, + "grad_norm": 0.5201199650764465, + "learning_rate": 9.3535005161735e-06, + "loss": 0.4581, + "step": 1008 + }, + { + "epoch": 0.7430044182621502, + "grad_norm": 0.5415538549423218, + "learning_rate": 9.351391607103222e-06, + "loss": 0.4417, + "step": 1009 + }, + { + "epoch": 0.7437407952871871, + "grad_norm": 0.5373315811157227, + "learning_rate": 9.34927950250244e-06, + "loss": 0.4449, + "step": 1010 + }, + { + "epoch": 0.7444771723122239, + "grad_norm": 0.6178976893424988, + "learning_rate": 9.347164203922224e-06, + "loss": 0.4936, + "step": 1011 + }, + { + "epoch": 0.7452135493372607, + "grad_norm": 0.4898339807987213, + "learning_rate": 9.34504571291598e-06, + "loss": 0.4354, + "step": 1012 + }, + { + "epoch": 0.7459499263622975, + "grad_norm": 0.6177398562431335, + "learning_rate": 9.34292403103947e-06, + "loss": 0.4385, + "step": 1013 + }, + { + "epoch": 0.7466863033873343, + "grad_norm": 0.4545745551586151, + "learning_rate": 9.34079915985079e-06, + "loss": 0.4313, + "step": 1014 + }, + { + "epoch": 0.7474226804123711, + "grad_norm": 0.5530493259429932, + "learning_rate": 9.33867110091038e-06, + "loss": 0.4351, + "step": 1015 + }, + { + "epoch": 0.748159057437408, + "grad_norm": 0.562359094619751, + "learning_rate": 9.336539855781027e-06, + "loss": 0.4467, + "step": 1016 + }, + { + "epoch": 0.7488954344624448, + "grad_norm": 0.4444253742694855, + "learning_rate": 9.334405426027845e-06, + "loss": 0.4276, + "step": 1017 + }, + { + "epoch": 0.7496318114874816, + "grad_norm": 0.6157680749893188, + "learning_rate": 9.332267813218303e-06, + "loss": 0.4383, + "step": 1018 + }, + { + "epoch": 0.7503681885125184, + "grad_norm": 0.5156718492507935, + "learning_rate": 9.330127018922195e-06, + "loss": 0.4623, + "step": 1019 + }, + { + "epoch": 0.7511045655375552, + "grad_norm": 0.5296850204467773, + "learning_rate": 9.327983044711655e-06, + "loss": 0.4561, + "step": 1020 + }, + { + "epoch": 0.751840942562592, + "grad_norm": 0.5704274773597717, + "learning_rate": 9.325835892161156e-06, + "loss": 0.4747, + "step": 1021 + }, + { + "epoch": 0.7525773195876289, + "grad_norm": 0.5490828156471252, + "learning_rate": 9.323685562847497e-06, + "loss": 0.4119, + "step": 1022 + }, + { + "epoch": 0.7533136966126657, + "grad_norm": 0.4620678722858429, + "learning_rate": 9.321532058349817e-06, + "loss": 0.4218, + "step": 1023 + }, + { + "epoch": 0.7540500736377025, + "grad_norm": 0.507858157157898, + "learning_rate": 9.31937538024959e-06, + "loss": 0.4328, + "step": 1024 + }, + { + "epoch": 0.7547864506627393, + "grad_norm": 0.5123826861381531, + "learning_rate": 9.317215530130607e-06, + "loss": 0.4529, + "step": 1025 + }, + { + "epoch": 0.7555228276877761, + "grad_norm": 0.4884844124317169, + "learning_rate": 9.315052509579004e-06, + "loss": 0.4718, + "step": 1026 + }, + { + "epoch": 0.7562592047128129, + "grad_norm": 0.5704168677330017, + "learning_rate": 9.312886320183232e-06, + "loss": 0.4674, + "step": 1027 + }, + { + "epoch": 0.7569955817378498, + "grad_norm": 0.4477781057357788, + "learning_rate": 9.310716963534077e-06, + "loss": 0.443, + "step": 1028 + }, + { + "epoch": 0.7577319587628866, + "grad_norm": 0.5215590000152588, + "learning_rate": 9.30854444122465e-06, + "loss": 0.4639, + "step": 1029 + }, + { + "epoch": 0.7584683357879234, + "grad_norm": 0.5299932360649109, + "learning_rate": 9.306368754850386e-06, + "loss": 0.4503, + "step": 1030 + }, + { + "epoch": 0.7592047128129602, + "grad_norm": 0.47621920704841614, + "learning_rate": 9.30418990600904e-06, + "loss": 0.4418, + "step": 1031 + }, + { + "epoch": 0.759941089837997, + "grad_norm": 0.5789692401885986, + "learning_rate": 9.302007896300697e-06, + "loss": 0.4675, + "step": 1032 + }, + { + "epoch": 0.7606774668630338, + "grad_norm": 0.4460338354110718, + "learning_rate": 9.299822727327758e-06, + "loss": 0.4253, + "step": 1033 + }, + { + "epoch": 0.7614138438880707, + "grad_norm": 0.5220574140548706, + "learning_rate": 9.297634400694943e-06, + "loss": 0.4548, + "step": 1034 + }, + { + "epoch": 0.7621502209131075, + "grad_norm": 0.5527031421661377, + "learning_rate": 9.295442918009295e-06, + "loss": 0.4268, + "step": 1035 + }, + { + "epoch": 0.7628865979381443, + "grad_norm": 0.4812883734703064, + "learning_rate": 9.29324828088017e-06, + "loss": 0.4721, + "step": 1036 + }, + { + "epoch": 0.7636229749631811, + "grad_norm": 0.5152131915092468, + "learning_rate": 9.291050490919244e-06, + "loss": 0.4758, + "step": 1037 + }, + { + "epoch": 0.7643593519882179, + "grad_norm": 0.5110999941825867, + "learning_rate": 9.288849549740513e-06, + "loss": 0.4721, + "step": 1038 + }, + { + "epoch": 0.7650957290132547, + "grad_norm": 0.4943920373916626, + "learning_rate": 9.286645458960272e-06, + "loss": 0.4636, + "step": 1039 + }, + { + "epoch": 0.7658321060382917, + "grad_norm": 0.4862247109413147, + "learning_rate": 9.28443822019715e-06, + "loss": 0.4717, + "step": 1040 + }, + { + "epoch": 0.7665684830633285, + "grad_norm": 0.4800577163696289, + "learning_rate": 9.282227835072064e-06, + "loss": 0.4624, + "step": 1041 + }, + { + "epoch": 0.7673048600883653, + "grad_norm": 0.5938796401023865, + "learning_rate": 9.280014305208264e-06, + "loss": 0.4698, + "step": 1042 + }, + { + "epoch": 0.7680412371134021, + "grad_norm": 0.45310524106025696, + "learning_rate": 9.277797632231295e-06, + "loss": 0.4681, + "step": 1043 + }, + { + "epoch": 0.7687776141384389, + "grad_norm": 0.6024627089500427, + "learning_rate": 9.275577817769015e-06, + "loss": 0.4819, + "step": 1044 + }, + { + "epoch": 0.7695139911634757, + "grad_norm": 0.4621509611606598, + "learning_rate": 9.273354863451589e-06, + "loss": 0.4683, + "step": 1045 + }, + { + "epoch": 0.7702503681885126, + "grad_norm": 0.5616011619567871, + "learning_rate": 9.271128770911489e-06, + "loss": 0.4924, + "step": 1046 + }, + { + "epoch": 0.7709867452135494, + "grad_norm": 0.5046648979187012, + "learning_rate": 9.268899541783487e-06, + "loss": 0.4244, + "step": 1047 + }, + { + "epoch": 0.7717231222385862, + "grad_norm": 0.47864627838134766, + "learning_rate": 9.266667177704665e-06, + "loss": 0.4358, + "step": 1048 + }, + { + "epoch": 0.772459499263623, + "grad_norm": 0.45622187852859497, + "learning_rate": 9.2644316803144e-06, + "loss": 0.4492, + "step": 1049 + }, + { + "epoch": 0.7731958762886598, + "grad_norm": 0.5317423343658447, + "learning_rate": 9.262193051254377e-06, + "loss": 0.4541, + "step": 1050 + }, + { + "epoch": 0.7739322533136966, + "grad_norm": 0.5093345046043396, + "learning_rate": 9.259951292168576e-06, + "loss": 0.4874, + "step": 1051 + }, + { + "epoch": 0.7746686303387335, + "grad_norm": 0.4720238745212555, + "learning_rate": 9.257706404703276e-06, + "loss": 0.4477, + "step": 1052 + }, + { + "epoch": 0.7754050073637703, + "grad_norm": 0.548051118850708, + "learning_rate": 9.255458390507059e-06, + "loss": 0.4761, + "step": 1053 + }, + { + "epoch": 0.7761413843888071, + "grad_norm": 0.4737755358219147, + "learning_rate": 9.253207251230793e-06, + "loss": 0.4517, + "step": 1054 + }, + { + "epoch": 0.7768777614138439, + "grad_norm": 0.40416955947875977, + "learning_rate": 9.250952988527648e-06, + "loss": 0.4095, + "step": 1055 + }, + { + "epoch": 0.7776141384388807, + "grad_norm": 0.4669051468372345, + "learning_rate": 9.248695604053091e-06, + "loss": 0.4492, + "step": 1056 + }, + { + "epoch": 0.7783505154639175, + "grad_norm": 0.5601681470870972, + "learning_rate": 9.246435099464869e-06, + "loss": 0.4564, + "step": 1057 + }, + { + "epoch": 0.7790868924889544, + "grad_norm": 0.44851595163345337, + "learning_rate": 9.244171476423037e-06, + "loss": 0.4675, + "step": 1058 + }, + { + "epoch": 0.7798232695139912, + "grad_norm": 0.5892722606658936, + "learning_rate": 9.241904736589927e-06, + "loss": 0.4731, + "step": 1059 + }, + { + "epoch": 0.780559646539028, + "grad_norm": 0.47606587409973145, + "learning_rate": 9.239634881630162e-06, + "loss": 0.4443, + "step": 1060 + }, + { + "epoch": 0.7812960235640648, + "grad_norm": 0.5107343196868896, + "learning_rate": 9.237361913210658e-06, + "loss": 0.4415, + "step": 1061 + }, + { + "epoch": 0.7820324005891016, + "grad_norm": 0.593949556350708, + "learning_rate": 9.235085833000613e-06, + "loss": 0.4537, + "step": 1062 + }, + { + "epoch": 0.7827687776141384, + "grad_norm": 0.4850008189678192, + "learning_rate": 9.232806642671513e-06, + "loss": 0.465, + "step": 1063 + }, + { + "epoch": 0.7835051546391752, + "grad_norm": 0.4913601577281952, + "learning_rate": 9.230524343897125e-06, + "loss": 0.4483, + "step": 1064 + }, + { + "epoch": 0.7842415316642121, + "grad_norm": 0.4335813820362091, + "learning_rate": 9.228238938353502e-06, + "loss": 0.4083, + "step": 1065 + }, + { + "epoch": 0.7849779086892489, + "grad_norm": 0.5363730788230896, + "learning_rate": 9.225950427718974e-06, + "loss": 0.4521, + "step": 1066 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 0.5813281536102295, + "learning_rate": 9.223658813674157e-06, + "loss": 0.4493, + "step": 1067 + }, + { + "epoch": 0.7864506627393225, + "grad_norm": 0.466314435005188, + "learning_rate": 9.221364097901941e-06, + "loss": 0.446, + "step": 1068 + }, + { + "epoch": 0.7871870397643593, + "grad_norm": 0.5087846517562866, + "learning_rate": 9.219066282087497e-06, + "loss": 0.4284, + "step": 1069 + }, + { + "epoch": 0.7879234167893961, + "grad_norm": 0.5554249882698059, + "learning_rate": 9.216765367918272e-06, + "loss": 0.4804, + "step": 1070 + }, + { + "epoch": 0.788659793814433, + "grad_norm": 0.48049336671829224, + "learning_rate": 9.214461357083986e-06, + "loss": 0.4659, + "step": 1071 + }, + { + "epoch": 0.7893961708394698, + "grad_norm": 0.5227876901626587, + "learning_rate": 9.212154251276637e-06, + "loss": 0.4487, + "step": 1072 + }, + { + "epoch": 0.7901325478645066, + "grad_norm": 0.5827174186706543, + "learning_rate": 9.20984405219049e-06, + "loss": 0.4681, + "step": 1073 + }, + { + "epoch": 0.7908689248895434, + "grad_norm": 0.5332369804382324, + "learning_rate": 9.207530761522093e-06, + "loss": 0.4361, + "step": 1074 + }, + { + "epoch": 0.7916053019145802, + "grad_norm": 0.525380551815033, + "learning_rate": 9.205214380970247e-06, + "loss": 0.4415, + "step": 1075 + }, + { + "epoch": 0.792341678939617, + "grad_norm": 0.4778136610984802, + "learning_rate": 9.20289491223604e-06, + "loss": 0.4416, + "step": 1076 + }, + { + "epoch": 0.7930780559646539, + "grad_norm": 0.5096407532691956, + "learning_rate": 9.200572357022815e-06, + "loss": 0.4379, + "step": 1077 + }, + { + "epoch": 0.7938144329896907, + "grad_norm": 0.48358261585235596, + "learning_rate": 9.198246717036187e-06, + "loss": 0.4547, + "step": 1078 + }, + { + "epoch": 0.7945508100147275, + "grad_norm": 0.5392265319824219, + "learning_rate": 9.195917993984039e-06, + "loss": 0.4585, + "step": 1079 + }, + { + "epoch": 0.7952871870397643, + "grad_norm": 0.4468466341495514, + "learning_rate": 9.19358618957651e-06, + "loss": 0.4572, + "step": 1080 + }, + { + "epoch": 0.7960235640648011, + "grad_norm": 0.48624467849731445, + "learning_rate": 9.191251305526013e-06, + "loss": 0.4327, + "step": 1081 + }, + { + "epoch": 0.7967599410898379, + "grad_norm": 0.49556195735931396, + "learning_rate": 9.18891334354721e-06, + "loss": 0.4285, + "step": 1082 + }, + { + "epoch": 0.7974963181148749, + "grad_norm": 0.427839994430542, + "learning_rate": 9.18657230535703e-06, + "loss": 0.4326, + "step": 1083 + }, + { + "epoch": 0.7982326951399117, + "grad_norm": 0.5406298041343689, + "learning_rate": 9.184228192674667e-06, + "loss": 0.455, + "step": 1084 + }, + { + "epoch": 0.7989690721649485, + "grad_norm": 0.4637623131275177, + "learning_rate": 9.18188100722156e-06, + "loss": 0.4596, + "step": 1085 + }, + { + "epoch": 0.7997054491899853, + "grad_norm": 0.4427599608898163, + "learning_rate": 9.179530750721413e-06, + "loss": 0.4371, + "step": 1086 + }, + { + "epoch": 0.8004418262150221, + "grad_norm": 0.5007920861244202, + "learning_rate": 9.177177424900183e-06, + "loss": 0.4656, + "step": 1087 + }, + { + "epoch": 0.801178203240059, + "grad_norm": 0.5360900163650513, + "learning_rate": 9.174821031486083e-06, + "loss": 0.4214, + "step": 1088 + }, + { + "epoch": 0.8019145802650958, + "grad_norm": 0.5234165191650391, + "learning_rate": 9.172461572209578e-06, + "loss": 0.4553, + "step": 1089 + }, + { + "epoch": 0.8026509572901326, + "grad_norm": 0.5163874626159668, + "learning_rate": 9.17009904880338e-06, + "loss": 0.447, + "step": 1090 + }, + { + "epoch": 0.8033873343151694, + "grad_norm": 0.6014302372932434, + "learning_rate": 9.167733463002457e-06, + "loss": 0.4715, + "step": 1091 + }, + { + "epoch": 0.8041237113402062, + "grad_norm": 0.5282832384109497, + "learning_rate": 9.165364816544022e-06, + "loss": 0.4558, + "step": 1092 + }, + { + "epoch": 0.804860088365243, + "grad_norm": 0.547073483467102, + "learning_rate": 9.162993111167541e-06, + "loss": 0.4541, + "step": 1093 + }, + { + "epoch": 0.8055964653902798, + "grad_norm": 0.5955381393432617, + "learning_rate": 9.160618348614718e-06, + "loss": 0.4368, + "step": 1094 + }, + { + "epoch": 0.8063328424153167, + "grad_norm": 0.4820695221424103, + "learning_rate": 9.158240530629512e-06, + "loss": 0.4562, + "step": 1095 + }, + { + "epoch": 0.8070692194403535, + "grad_norm": 0.4931407868862152, + "learning_rate": 9.155859658958117e-06, + "loss": 0.4747, + "step": 1096 + }, + { + "epoch": 0.8078055964653903, + "grad_norm": 0.5072289705276489, + "learning_rate": 9.153475735348973e-06, + "loss": 0.4338, + "step": 1097 + }, + { + "epoch": 0.8085419734904271, + "grad_norm": 0.4261424243450165, + "learning_rate": 9.151088761552764e-06, + "loss": 0.4415, + "step": 1098 + }, + { + "epoch": 0.8092783505154639, + "grad_norm": 0.5440129637718201, + "learning_rate": 9.148698739322409e-06, + "loss": 0.4326, + "step": 1099 + }, + { + "epoch": 0.8100147275405007, + "grad_norm": 0.4529050886631012, + "learning_rate": 9.146305670413069e-06, + "loss": 0.4579, + "step": 1100 + }, + { + "epoch": 0.8107511045655376, + "grad_norm": 0.5012593865394592, + "learning_rate": 9.143909556582143e-06, + "loss": 0.4643, + "step": 1101 + }, + { + "epoch": 0.8114874815905744, + "grad_norm": 0.5710225701332092, + "learning_rate": 9.141510399589261e-06, + "loss": 0.4484, + "step": 1102 + }, + { + "epoch": 0.8122238586156112, + "grad_norm": 0.4354053735733032, + "learning_rate": 9.139108201196296e-06, + "loss": 0.4333, + "step": 1103 + }, + { + "epoch": 0.812960235640648, + "grad_norm": 0.49365589022636414, + "learning_rate": 9.136702963167348e-06, + "loss": 0.457, + "step": 1104 + }, + { + "epoch": 0.8136966126656848, + "grad_norm": 0.46688565611839294, + "learning_rate": 9.134294687268749e-06, + "loss": 0.4454, + "step": 1105 + }, + { + "epoch": 0.8144329896907216, + "grad_norm": 0.47631922364234924, + "learning_rate": 9.131883375269067e-06, + "loss": 0.4471, + "step": 1106 + }, + { + "epoch": 0.8151693667157585, + "grad_norm": 0.54261714220047, + "learning_rate": 9.129469028939094e-06, + "loss": 0.4839, + "step": 1107 + }, + { + "epoch": 0.8159057437407953, + "grad_norm": 0.5108979940414429, + "learning_rate": 9.127051650051854e-06, + "loss": 0.4306, + "step": 1108 + }, + { + "epoch": 0.8166421207658321, + "grad_norm": 0.46782127022743225, + "learning_rate": 9.1246312403826e-06, + "loss": 0.4578, + "step": 1109 + }, + { + "epoch": 0.8173784977908689, + "grad_norm": 0.5475788116455078, + "learning_rate": 9.122207801708802e-06, + "loss": 0.4498, + "step": 1110 + }, + { + "epoch": 0.8181148748159057, + "grad_norm": 0.5272371172904968, + "learning_rate": 9.119781335810164e-06, + "loss": 0.4482, + "step": 1111 + }, + { + "epoch": 0.8188512518409425, + "grad_norm": 0.49965953826904297, + "learning_rate": 9.117351844468609e-06, + "loss": 0.4164, + "step": 1112 + }, + { + "epoch": 0.8195876288659794, + "grad_norm": 0.5126081109046936, + "learning_rate": 9.114919329468283e-06, + "loss": 0.4744, + "step": 1113 + }, + { + "epoch": 0.8203240058910162, + "grad_norm": 0.5301181674003601, + "learning_rate": 9.112483792595547e-06, + "loss": 0.4508, + "step": 1114 + }, + { + "epoch": 0.821060382916053, + "grad_norm": 0.49831271171569824, + "learning_rate": 9.110045235638991e-06, + "loss": 0.4435, + "step": 1115 + }, + { + "epoch": 0.8217967599410898, + "grad_norm": 0.48843833804130554, + "learning_rate": 9.107603660389414e-06, + "loss": 0.4381, + "step": 1116 + }, + { + "epoch": 0.8225331369661266, + "grad_norm": 0.5341619253158569, + "learning_rate": 9.105159068639837e-06, + "loss": 0.4548, + "step": 1117 + }, + { + "epoch": 0.8232695139911634, + "grad_norm": 0.5220252275466919, + "learning_rate": 9.102711462185495e-06, + "loss": 0.4459, + "step": 1118 + }, + { + "epoch": 0.8240058910162003, + "grad_norm": 0.4931434392929077, + "learning_rate": 9.100260842823831e-06, + "loss": 0.4393, + "step": 1119 + }, + { + "epoch": 0.8247422680412371, + "grad_norm": 0.5014846920967102, + "learning_rate": 9.097807212354513e-06, + "loss": 0.4551, + "step": 1120 + }, + { + "epoch": 0.8254786450662739, + "grad_norm": 0.5581266283988953, + "learning_rate": 9.09535057257941e-06, + "loss": 0.4658, + "step": 1121 + }, + { + "epoch": 0.8262150220913107, + "grad_norm": 0.46569985151290894, + "learning_rate": 9.092890925302601e-06, + "loss": 0.4414, + "step": 1122 + }, + { + "epoch": 0.8269513991163475, + "grad_norm": 0.45610496401786804, + "learning_rate": 9.090428272330381e-06, + "loss": 0.4561, + "step": 1123 + }, + { + "epoch": 0.8276877761413843, + "grad_norm": 0.4498854875564575, + "learning_rate": 9.087962615471246e-06, + "loss": 0.4501, + "step": 1124 + }, + { + "epoch": 0.8284241531664213, + "grad_norm": 0.4839475452899933, + "learning_rate": 9.085493956535898e-06, + "loss": 0.4394, + "step": 1125 + }, + { + "epoch": 0.8291605301914581, + "grad_norm": 0.44261297583580017, + "learning_rate": 9.083022297337251e-06, + "loss": 0.4284, + "step": 1126 + }, + { + "epoch": 0.8298969072164949, + "grad_norm": 0.49953368306159973, + "learning_rate": 9.080547639690411e-06, + "loss": 0.4698, + "step": 1127 + }, + { + "epoch": 0.8306332842415317, + "grad_norm": 0.5280026197433472, + "learning_rate": 9.078069985412697e-06, + "loss": 0.435, + "step": 1128 + }, + { + "epoch": 0.8313696612665685, + "grad_norm": 0.46925580501556396, + "learning_rate": 9.075589336323619e-06, + "loss": 0.4702, + "step": 1129 + }, + { + "epoch": 0.8321060382916053, + "grad_norm": 0.49038681387901306, + "learning_rate": 9.073105694244892e-06, + "loss": 0.4367, + "step": 1130 + }, + { + "epoch": 0.8328424153166422, + "grad_norm": 0.564074695110321, + "learning_rate": 9.070619061000429e-06, + "loss": 0.486, + "step": 1131 + }, + { + "epoch": 0.833578792341679, + "grad_norm": 0.45910879969596863, + "learning_rate": 9.068129438416337e-06, + "loss": 0.4243, + "step": 1132 + }, + { + "epoch": 0.8343151693667158, + "grad_norm": 0.5710626840591431, + "learning_rate": 9.065636828320919e-06, + "loss": 0.434, + "step": 1133 + }, + { + "epoch": 0.8350515463917526, + "grad_norm": 0.504901647567749, + "learning_rate": 9.063141232544676e-06, + "loss": 0.4441, + "step": 1134 + }, + { + "epoch": 0.8357879234167894, + "grad_norm": 0.5790115594863892, + "learning_rate": 9.060642652920295e-06, + "loss": 0.457, + "step": 1135 + }, + { + "epoch": 0.8365243004418262, + "grad_norm": 0.5210295915603638, + "learning_rate": 9.058141091282656e-06, + "loss": 0.4539, + "step": 1136 + }, + { + "epoch": 0.8372606774668631, + "grad_norm": 0.48215651512145996, + "learning_rate": 9.055636549468833e-06, + "loss": 0.4558, + "step": 1137 + }, + { + "epoch": 0.8379970544918999, + "grad_norm": 0.42006632685661316, + "learning_rate": 9.053129029318086e-06, + "loss": 0.424, + "step": 1138 + }, + { + "epoch": 0.8387334315169367, + "grad_norm": 0.45918166637420654, + "learning_rate": 9.050618532671862e-06, + "loss": 0.4302, + "step": 1139 + }, + { + "epoch": 0.8394698085419735, + "grad_norm": 0.5937727093696594, + "learning_rate": 9.048105061373793e-06, + "loss": 0.4467, + "step": 1140 + }, + { + "epoch": 0.8402061855670103, + "grad_norm": 0.4482143521308899, + "learning_rate": 9.045588617269694e-06, + "loss": 0.4423, + "step": 1141 + }, + { + "epoch": 0.8409425625920471, + "grad_norm": 0.49220171570777893, + "learning_rate": 9.043069202207571e-06, + "loss": 0.4455, + "step": 1142 + }, + { + "epoch": 0.841678939617084, + "grad_norm": 0.4809843599796295, + "learning_rate": 9.040546818037602e-06, + "loss": 0.4628, + "step": 1143 + }, + { + "epoch": 0.8424153166421208, + "grad_norm": 0.49604564905166626, + "learning_rate": 9.038021466612151e-06, + "loss": 0.4283, + "step": 1144 + }, + { + "epoch": 0.8431516936671576, + "grad_norm": 0.4608924388885498, + "learning_rate": 9.035493149785758e-06, + "loss": 0.4359, + "step": 1145 + }, + { + "epoch": 0.8438880706921944, + "grad_norm": 0.5311315059661865, + "learning_rate": 9.032961869415147e-06, + "loss": 0.4655, + "step": 1146 + }, + { + "epoch": 0.8446244477172312, + "grad_norm": 0.491970032453537, + "learning_rate": 9.03042762735921e-06, + "loss": 0.4477, + "step": 1147 + }, + { + "epoch": 0.845360824742268, + "grad_norm": 0.5328875184059143, + "learning_rate": 9.027890425479016e-06, + "loss": 0.4565, + "step": 1148 + }, + { + "epoch": 0.8460972017673049, + "grad_norm": 0.46462082862854004, + "learning_rate": 9.025350265637816e-06, + "loss": 0.4307, + "step": 1149 + }, + { + "epoch": 0.8468335787923417, + "grad_norm": 0.568123996257782, + "learning_rate": 9.02280714970102e-06, + "loss": 0.4527, + "step": 1150 + }, + { + "epoch": 0.8475699558173785, + "grad_norm": 0.39277321100234985, + "learning_rate": 9.02026107953622e-06, + "loss": 0.4382, + "step": 1151 + }, + { + "epoch": 0.8483063328424153, + "grad_norm": 0.46049660444259644, + "learning_rate": 9.01771205701317e-06, + "loss": 0.4496, + "step": 1152 + }, + { + "epoch": 0.8490427098674521, + "grad_norm": 0.5467735528945923, + "learning_rate": 9.015160084003798e-06, + "loss": 0.462, + "step": 1153 + }, + { + "epoch": 0.8497790868924889, + "grad_norm": 0.46097782254219055, + "learning_rate": 9.012605162382194e-06, + "loss": 0.4612, + "step": 1154 + }, + { + "epoch": 0.8505154639175257, + "grad_norm": 0.5207246541976929, + "learning_rate": 9.010047294024615e-06, + "loss": 0.4397, + "step": 1155 + }, + { + "epoch": 0.8512518409425626, + "grad_norm": 0.5254480838775635, + "learning_rate": 9.007486480809482e-06, + "loss": 0.4787, + "step": 1156 + }, + { + "epoch": 0.8519882179675994, + "grad_norm": 0.45604032278060913, + "learning_rate": 9.00492272461738e-06, + "loss": 0.4607, + "step": 1157 + }, + { + "epoch": 0.8527245949926362, + "grad_norm": 0.4617128372192383, + "learning_rate": 9.002356027331055e-06, + "loss": 0.4236, + "step": 1158 + }, + { + "epoch": 0.853460972017673, + "grad_norm": 0.5179611444473267, + "learning_rate": 8.999786390835408e-06, + "loss": 0.4647, + "step": 1159 + }, + { + "epoch": 0.8541973490427098, + "grad_norm": 0.4486055374145508, + "learning_rate": 8.997213817017508e-06, + "loss": 0.4423, + "step": 1160 + }, + { + "epoch": 0.8549337260677466, + "grad_norm": 0.5427169799804688, + "learning_rate": 8.99463830776657e-06, + "loss": 0.4648, + "step": 1161 + }, + { + "epoch": 0.8556701030927835, + "grad_norm": 0.5229467749595642, + "learning_rate": 8.992059864973972e-06, + "loss": 0.4559, + "step": 1162 + }, + { + "epoch": 0.8564064801178203, + "grad_norm": 0.5158513784408569, + "learning_rate": 8.989478490533247e-06, + "loss": 0.4168, + "step": 1163 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.47246885299682617, + "learning_rate": 8.986894186340075e-06, + "loss": 0.4156, + "step": 1164 + }, + { + "epoch": 0.8578792341678939, + "grad_norm": 0.5099225044250488, + "learning_rate": 8.984306954292293e-06, + "loss": 0.4577, + "step": 1165 + }, + { + "epoch": 0.8586156111929307, + "grad_norm": 0.5285944938659668, + "learning_rate": 8.981716796289886e-06, + "loss": 0.4423, + "step": 1166 + }, + { + "epoch": 0.8593519882179675, + "grad_norm": 0.43009987473487854, + "learning_rate": 8.979123714234986e-06, + "loss": 0.4311, + "step": 1167 + }, + { + "epoch": 0.8600883652430045, + "grad_norm": 0.5090437531471252, + "learning_rate": 8.976527710031875e-06, + "loss": 0.4636, + "step": 1168 + }, + { + "epoch": 0.8608247422680413, + "grad_norm": 0.44846466183662415, + "learning_rate": 8.97392878558698e-06, + "loss": 0.4243, + "step": 1169 + }, + { + "epoch": 0.8615611192930781, + "grad_norm": 0.5008236765861511, + "learning_rate": 8.971326942808876e-06, + "loss": 0.4539, + "step": 1170 + }, + { + "epoch": 0.8622974963181149, + "grad_norm": 0.5144447088241577, + "learning_rate": 8.968722183608272e-06, + "loss": 0.4791, + "step": 1171 + }, + { + "epoch": 0.8630338733431517, + "grad_norm": 0.4842917025089264, + "learning_rate": 8.966114509898026e-06, + "loss": 0.4568, + "step": 1172 + }, + { + "epoch": 0.8637702503681886, + "grad_norm": 0.4209856688976288, + "learning_rate": 8.963503923593138e-06, + "loss": 0.4375, + "step": 1173 + }, + { + "epoch": 0.8645066273932254, + "grad_norm": 0.49867910146713257, + "learning_rate": 8.96089042661074e-06, + "loss": 0.4882, + "step": 1174 + }, + { + "epoch": 0.8652430044182622, + "grad_norm": 0.43283653259277344, + "learning_rate": 8.958274020870107e-06, + "loss": 0.437, + "step": 1175 + }, + { + "epoch": 0.865979381443299, + "grad_norm": 0.4610222578048706, + "learning_rate": 8.955654708292647e-06, + "loss": 0.4323, + "step": 1176 + }, + { + "epoch": 0.8667157584683358, + "grad_norm": 0.5031896829605103, + "learning_rate": 8.953032490801908e-06, + "loss": 0.4147, + "step": 1177 + }, + { + "epoch": 0.8674521354933726, + "grad_norm": 0.45876145362854004, + "learning_rate": 8.950407370323563e-06, + "loss": 0.4496, + "step": 1178 + }, + { + "epoch": 0.8681885125184094, + "grad_norm": 0.4437890648841858, + "learning_rate": 8.94777934878542e-06, + "loss": 0.4182, + "step": 1179 + }, + { + "epoch": 0.8689248895434463, + "grad_norm": 0.49486371874809265, + "learning_rate": 8.945148428117423e-06, + "loss": 0.4497, + "step": 1180 + }, + { + "epoch": 0.8696612665684831, + "grad_norm": 0.554208517074585, + "learning_rate": 8.942514610251639e-06, + "loss": 0.4753, + "step": 1181 + }, + { + "epoch": 0.8703976435935199, + "grad_norm": 0.4523577094078064, + "learning_rate": 8.939877897122262e-06, + "loss": 0.4476, + "step": 1182 + }, + { + "epoch": 0.8711340206185567, + "grad_norm": 0.43716442584991455, + "learning_rate": 8.937238290665617e-06, + "loss": 0.43, + "step": 1183 + }, + { + "epoch": 0.8718703976435935, + "grad_norm": 0.4811909794807434, + "learning_rate": 8.934595792820152e-06, + "loss": 0.4329, + "step": 1184 + }, + { + "epoch": 0.8726067746686303, + "grad_norm": 0.5379281044006348, + "learning_rate": 8.931950405526436e-06, + "loss": 0.4529, + "step": 1185 + }, + { + "epoch": 0.8733431516936672, + "grad_norm": 0.4634367525577545, + "learning_rate": 8.92930213072716e-06, + "loss": 0.4269, + "step": 1186 + }, + { + "epoch": 0.874079528718704, + "grad_norm": 0.46587860584259033, + "learning_rate": 8.926650970367138e-06, + "loss": 0.4399, + "step": 1187 + }, + { + "epoch": 0.8748159057437408, + "grad_norm": 0.47557711601257324, + "learning_rate": 8.923996926393306e-06, + "loss": 0.4365, + "step": 1188 + }, + { + "epoch": 0.8755522827687776, + "grad_norm": 0.48660027980804443, + "learning_rate": 8.921340000754708e-06, + "loss": 0.4772, + "step": 1189 + }, + { + "epoch": 0.8762886597938144, + "grad_norm": 0.4610178470611572, + "learning_rate": 8.918680195402512e-06, + "loss": 0.4276, + "step": 1190 + }, + { + "epoch": 0.8770250368188512, + "grad_norm": 0.41225728392601013, + "learning_rate": 8.916017512290001e-06, + "loss": 0.4269, + "step": 1191 + }, + { + "epoch": 0.8777614138438881, + "grad_norm": 0.4658215641975403, + "learning_rate": 8.913351953372565e-06, + "loss": 0.4116, + "step": 1192 + }, + { + "epoch": 0.8784977908689249, + "grad_norm": 0.4793921709060669, + "learning_rate": 8.910683520607713e-06, + "loss": 0.4811, + "step": 1193 + }, + { + "epoch": 0.8792341678939617, + "grad_norm": 0.4011600613594055, + "learning_rate": 8.90801221595506e-06, + "loss": 0.4319, + "step": 1194 + }, + { + "epoch": 0.8799705449189985, + "grad_norm": 0.4331344962120056, + "learning_rate": 8.90533804137633e-06, + "loss": 0.4457, + "step": 1195 + }, + { + "epoch": 0.8807069219440353, + "grad_norm": 0.5520123839378357, + "learning_rate": 8.902660998835359e-06, + "loss": 0.4549, + "step": 1196 + }, + { + "epoch": 0.8814432989690721, + "grad_norm": 0.4395567774772644, + "learning_rate": 8.899981090298084e-06, + "loss": 0.4334, + "step": 1197 + }, + { + "epoch": 0.882179675994109, + "grad_norm": 0.4339301288127899, + "learning_rate": 8.89729831773255e-06, + "loss": 0.4518, + "step": 1198 + }, + { + "epoch": 0.8829160530191458, + "grad_norm": 0.4404997229576111, + "learning_rate": 8.894612683108905e-06, + "loss": 0.4428, + "step": 1199 + }, + { + "epoch": 0.8836524300441826, + "grad_norm": 0.47511598467826843, + "learning_rate": 8.891924188399395e-06, + "loss": 0.4455, + "step": 1200 + }, + { + "epoch": 0.8843888070692194, + "grad_norm": 0.5081692934036255, + "learning_rate": 8.889232835578372e-06, + "loss": 0.5261, + "step": 1201 + }, + { + "epoch": 0.8851251840942562, + "grad_norm": 0.5229424834251404, + "learning_rate": 8.886538626622282e-06, + "loss": 0.4371, + "step": 1202 + }, + { + "epoch": 0.885861561119293, + "grad_norm": 0.45727550983428955, + "learning_rate": 8.883841563509671e-06, + "loss": 0.414, + "step": 1203 + }, + { + "epoch": 0.8865979381443299, + "grad_norm": 0.4745890498161316, + "learning_rate": 8.881141648221185e-06, + "loss": 0.472, + "step": 1204 + }, + { + "epoch": 0.8873343151693667, + "grad_norm": 0.48544570803642273, + "learning_rate": 8.878438882739554e-06, + "loss": 0.4719, + "step": 1205 + }, + { + "epoch": 0.8880706921944035, + "grad_norm": 0.45275935530662537, + "learning_rate": 8.87573326904961e-06, + "loss": 0.4395, + "step": 1206 + }, + { + "epoch": 0.8888070692194403, + "grad_norm": 0.527344822883606, + "learning_rate": 8.873024809138272e-06, + "loss": 0.4796, + "step": 1207 + }, + { + "epoch": 0.8895434462444771, + "grad_norm": 0.44065412878990173, + "learning_rate": 8.870313504994556e-06, + "loss": 0.4587, + "step": 1208 + }, + { + "epoch": 0.8902798232695139, + "grad_norm": 0.5130428671836853, + "learning_rate": 8.867599358609557e-06, + "loss": 0.4672, + "step": 1209 + }, + { + "epoch": 0.8910162002945509, + "grad_norm": 0.4739178419113159, + "learning_rate": 8.864882371976466e-06, + "loss": 0.4384, + "step": 1210 + }, + { + "epoch": 0.8917525773195877, + "grad_norm": 0.4686276614665985, + "learning_rate": 8.862162547090551e-06, + "loss": 0.4473, + "step": 1211 + }, + { + "epoch": 0.8924889543446245, + "grad_norm": 0.43980199098587036, + "learning_rate": 8.859439885949175e-06, + "loss": 0.4568, + "step": 1212 + }, + { + "epoch": 0.8932253313696613, + "grad_norm": 0.43443796038627625, + "learning_rate": 8.856714390551774e-06, + "loss": 0.4529, + "step": 1213 + }, + { + "epoch": 0.8939617083946981, + "grad_norm": 0.514228343963623, + "learning_rate": 8.853986062899869e-06, + "loss": 0.4468, + "step": 1214 + }, + { + "epoch": 0.894698085419735, + "grad_norm": 0.4369834363460541, + "learning_rate": 8.851254904997062e-06, + "loss": 0.4721, + "step": 1215 + }, + { + "epoch": 0.8954344624447718, + "grad_norm": 0.5049661993980408, + "learning_rate": 8.848520918849035e-06, + "loss": 0.4094, + "step": 1216 + }, + { + "epoch": 0.8961708394698086, + "grad_norm": 0.4989210069179535, + "learning_rate": 8.845784106463545e-06, + "loss": 0.4519, + "step": 1217 + }, + { + "epoch": 0.8969072164948454, + "grad_norm": 0.4148315489292145, + "learning_rate": 8.84304446985042e-06, + "loss": 0.4464, + "step": 1218 + }, + { + "epoch": 0.8976435935198822, + "grad_norm": 0.43474555015563965, + "learning_rate": 8.84030201102157e-06, + "loss": 0.4372, + "step": 1219 + }, + { + "epoch": 0.898379970544919, + "grad_norm": 0.49237704277038574, + "learning_rate": 8.837556731990973e-06, + "loss": 0.4582, + "step": 1220 + }, + { + "epoch": 0.8991163475699558, + "grad_norm": 0.46191561222076416, + "learning_rate": 8.83480863477468e-06, + "loss": 0.4557, + "step": 1221 + }, + { + "epoch": 0.8998527245949927, + "grad_norm": 0.41119834780693054, + "learning_rate": 8.832057721390809e-06, + "loss": 0.4343, + "step": 1222 + }, + { + "epoch": 0.9005891016200295, + "grad_norm": 0.4598017632961273, + "learning_rate": 8.829303993859548e-06, + "loss": 0.4636, + "step": 1223 + }, + { + "epoch": 0.9013254786450663, + "grad_norm": 0.4588959515094757, + "learning_rate": 8.826547454203152e-06, + "loss": 0.4331, + "step": 1224 + }, + { + "epoch": 0.9020618556701031, + "grad_norm": 0.44713202118873596, + "learning_rate": 8.823788104445941e-06, + "loss": 0.4329, + "step": 1225 + }, + { + "epoch": 0.9027982326951399, + "grad_norm": 0.5540334582328796, + "learning_rate": 8.821025946614295e-06, + "loss": 0.4508, + "step": 1226 + }, + { + "epoch": 0.9035346097201767, + "grad_norm": 0.42183777689933777, + "learning_rate": 8.818260982736662e-06, + "loss": 0.4633, + "step": 1227 + }, + { + "epoch": 0.9042709867452136, + "grad_norm": 0.5915690064430237, + "learning_rate": 8.815493214843546e-06, + "loss": 0.4515, + "step": 1228 + }, + { + "epoch": 0.9050073637702504, + "grad_norm": 0.5356271862983704, + "learning_rate": 8.812722644967515e-06, + "loss": 0.4205, + "step": 1229 + }, + { + "epoch": 0.9057437407952872, + "grad_norm": 0.4718015789985657, + "learning_rate": 8.809949275143189e-06, + "loss": 0.4572, + "step": 1230 + }, + { + "epoch": 0.906480117820324, + "grad_norm": 0.5468553900718689, + "learning_rate": 8.807173107407248e-06, + "loss": 0.4489, + "step": 1231 + }, + { + "epoch": 0.9072164948453608, + "grad_norm": 0.559502899646759, + "learning_rate": 8.804394143798426e-06, + "loss": 0.4629, + "step": 1232 + }, + { + "epoch": 0.9079528718703976, + "grad_norm": 0.47452765703201294, + "learning_rate": 8.801612386357508e-06, + "loss": 0.4457, + "step": 1233 + }, + { + "epoch": 0.9086892488954345, + "grad_norm": 0.5304099917411804, + "learning_rate": 8.798827837127336e-06, + "loss": 0.4492, + "step": 1234 + }, + { + "epoch": 0.9094256259204713, + "grad_norm": 0.5169060826301575, + "learning_rate": 8.796040498152797e-06, + "loss": 0.4748, + "step": 1235 + }, + { + "epoch": 0.9101620029455081, + "grad_norm": 0.49267488718032837, + "learning_rate": 8.793250371480827e-06, + "loss": 0.4579, + "step": 1236 + }, + { + "epoch": 0.9108983799705449, + "grad_norm": 0.4913206100463867, + "learning_rate": 8.790457459160414e-06, + "loss": 0.4255, + "step": 1237 + }, + { + "epoch": 0.9116347569955817, + "grad_norm": 0.531548798084259, + "learning_rate": 8.787661763242585e-06, + "loss": 0.4728, + "step": 1238 + }, + { + "epoch": 0.9123711340206185, + "grad_norm": 0.52623450756073, + "learning_rate": 8.784863285780419e-06, + "loss": 0.4661, + "step": 1239 + }, + { + "epoch": 0.9131075110456554, + "grad_norm": 0.5517471432685852, + "learning_rate": 8.782062028829028e-06, + "loss": 0.4553, + "step": 1240 + }, + { + "epoch": 0.9138438880706922, + "grad_norm": 0.472700297832489, + "learning_rate": 8.779257994445574e-06, + "loss": 0.4422, + "step": 1241 + }, + { + "epoch": 0.914580265095729, + "grad_norm": 0.44898468255996704, + "learning_rate": 8.776451184689253e-06, + "loss": 0.4124, + "step": 1242 + }, + { + "epoch": 0.9153166421207658, + "grad_norm": 0.48845183849334717, + "learning_rate": 8.773641601621303e-06, + "loss": 0.4469, + "step": 1243 + }, + { + "epoch": 0.9160530191458026, + "grad_norm": 0.5202846527099609, + "learning_rate": 8.770829247304998e-06, + "loss": 0.4723, + "step": 1244 + }, + { + "epoch": 0.9167893961708394, + "grad_norm": 0.44977980852127075, + "learning_rate": 8.768014123805642e-06, + "loss": 0.4413, + "step": 1245 + }, + { + "epoch": 0.9175257731958762, + "grad_norm": 0.46154212951660156, + "learning_rate": 8.765196233190579e-06, + "loss": 0.4411, + "step": 1246 + }, + { + "epoch": 0.9182621502209131, + "grad_norm": 0.5938853621482849, + "learning_rate": 8.762375577529184e-06, + "loss": 0.4399, + "step": 1247 + }, + { + "epoch": 0.9189985272459499, + "grad_norm": 0.43907907605171204, + "learning_rate": 8.75955215889286e-06, + "loss": 0.4402, + "step": 1248 + }, + { + "epoch": 0.9197349042709867, + "grad_norm": 0.4637261629104614, + "learning_rate": 8.756725979355039e-06, + "loss": 0.4587, + "step": 1249 + }, + { + "epoch": 0.9204712812960235, + "grad_norm": 0.45832645893096924, + "learning_rate": 8.753897040991183e-06, + "loss": 0.4579, + "step": 1250 + }, + { + "epoch": 0.9212076583210603, + "grad_norm": 0.44324028491973877, + "learning_rate": 8.751065345878778e-06, + "loss": 0.453, + "step": 1251 + }, + { + "epoch": 0.9219440353460973, + "grad_norm": 0.5125420689582825, + "learning_rate": 8.748230896097338e-06, + "loss": 0.4422, + "step": 1252 + }, + { + "epoch": 0.9226804123711341, + "grad_norm": 0.5083498954772949, + "learning_rate": 8.745393693728395e-06, + "loss": 0.4661, + "step": 1253 + }, + { + "epoch": 0.9234167893961709, + "grad_norm": 0.4523887038230896, + "learning_rate": 8.742553740855507e-06, + "loss": 0.4469, + "step": 1254 + }, + { + "epoch": 0.9241531664212077, + "grad_norm": 0.5064306259155273, + "learning_rate": 8.739711039564245e-06, + "loss": 0.4462, + "step": 1255 + }, + { + "epoch": 0.9248895434462445, + "grad_norm": 0.48103612661361694, + "learning_rate": 8.736865591942208e-06, + "loss": 0.4334, + "step": 1256 + }, + { + "epoch": 0.9256259204712813, + "grad_norm": 0.43229183554649353, + "learning_rate": 8.734017400079002e-06, + "loss": 0.4749, + "step": 1257 + }, + { + "epoch": 0.9263622974963182, + "grad_norm": 0.47184067964553833, + "learning_rate": 8.731166466066258e-06, + "loss": 0.4692, + "step": 1258 + }, + { + "epoch": 0.927098674521355, + "grad_norm": 0.49023813009262085, + "learning_rate": 8.728312791997612e-06, + "loss": 0.4593, + "step": 1259 + }, + { + "epoch": 0.9278350515463918, + "grad_norm": 0.5471561551094055, + "learning_rate": 8.725456379968717e-06, + "loss": 0.465, + "step": 1260 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 0.41933873295783997, + "learning_rate": 8.722597232077236e-06, + "loss": 0.4507, + "step": 1261 + }, + { + "epoch": 0.9293078055964654, + "grad_norm": 0.4452081322669983, + "learning_rate": 8.71973535042284e-06, + "loss": 0.4604, + "step": 1262 + }, + { + "epoch": 0.9300441826215022, + "grad_norm": 0.40033042430877686, + "learning_rate": 8.716870737107211e-06, + "loss": 0.4578, + "step": 1263 + }, + { + "epoch": 0.930780559646539, + "grad_norm": 0.4695224463939667, + "learning_rate": 8.714003394234031e-06, + "loss": 0.4582, + "step": 1264 + }, + { + "epoch": 0.9315169366715759, + "grad_norm": 0.40169644355773926, + "learning_rate": 8.711133323908993e-06, + "loss": 0.4523, + "step": 1265 + }, + { + "epoch": 0.9322533136966127, + "grad_norm": 0.43690577149391174, + "learning_rate": 8.708260528239788e-06, + "loss": 0.4569, + "step": 1266 + }, + { + "epoch": 0.9329896907216495, + "grad_norm": 0.44741305708885193, + "learning_rate": 8.705385009336111e-06, + "loss": 0.4363, + "step": 1267 + }, + { + "epoch": 0.9337260677466863, + "grad_norm": 0.6049298644065857, + "learning_rate": 8.702506769309656e-06, + "loss": 0.4569, + "step": 1268 + }, + { + "epoch": 0.9344624447717231, + "grad_norm": 0.43075326085090637, + "learning_rate": 8.699625810274115e-06, + "loss": 0.4377, + "step": 1269 + }, + { + "epoch": 0.93519882179676, + "grad_norm": 0.5428841710090637, + "learning_rate": 8.696742134345178e-06, + "loss": 0.437, + "step": 1270 + }, + { + "epoch": 0.9359351988217968, + "grad_norm": 0.5976313948631287, + "learning_rate": 8.69385574364053e-06, + "loss": 0.4581, + "step": 1271 + }, + { + "epoch": 0.9366715758468336, + "grad_norm": 0.48488497734069824, + "learning_rate": 8.690966640279846e-06, + "loss": 0.448, + "step": 1272 + }, + { + "epoch": 0.9374079528718704, + "grad_norm": 0.5577574372291565, + "learning_rate": 8.688074826384801e-06, + "loss": 0.4572, + "step": 1273 + }, + { + "epoch": 0.9381443298969072, + "grad_norm": 0.459681898355484, + "learning_rate": 8.685180304079051e-06, + "loss": 0.4488, + "step": 1274 + }, + { + "epoch": 0.938880706921944, + "grad_norm": 0.5070257782936096, + "learning_rate": 8.682283075488249e-06, + "loss": 0.4495, + "step": 1275 + }, + { + "epoch": 0.9396170839469808, + "grad_norm": 0.486531525850296, + "learning_rate": 8.679383142740033e-06, + "loss": 0.4402, + "step": 1276 + }, + { + "epoch": 0.9403534609720177, + "grad_norm": 0.5276297330856323, + "learning_rate": 8.676480507964021e-06, + "loss": 0.4487, + "step": 1277 + }, + { + "epoch": 0.9410898379970545, + "grad_norm": 0.47423624992370605, + "learning_rate": 8.673575173291826e-06, + "loss": 0.449, + "step": 1278 + }, + { + "epoch": 0.9418262150220913, + "grad_norm": 0.5563042163848877, + "learning_rate": 8.670667140857034e-06, + "loss": 0.4449, + "step": 1279 + }, + { + "epoch": 0.9425625920471281, + "grad_norm": 0.47434091567993164, + "learning_rate": 8.667756412795217e-06, + "loss": 0.4455, + "step": 1280 + }, + { + "epoch": 0.9432989690721649, + "grad_norm": 0.6123890280723572, + "learning_rate": 8.664842991243927e-06, + "loss": 0.4782, + "step": 1281 + }, + { + "epoch": 0.9440353460972017, + "grad_norm": 0.42824527621269226, + "learning_rate": 8.661926878342692e-06, + "loss": 0.4512, + "step": 1282 + }, + { + "epoch": 0.9447717231222386, + "grad_norm": 0.5722915530204773, + "learning_rate": 8.659008076233016e-06, + "loss": 0.4485, + "step": 1283 + }, + { + "epoch": 0.9455081001472754, + "grad_norm": 0.5747035145759583, + "learning_rate": 8.656086587058381e-06, + "loss": 0.4503, + "step": 1284 + }, + { + "epoch": 0.9462444771723122, + "grad_norm": 0.4897293150424957, + "learning_rate": 8.65316241296424e-06, + "loss": 0.4605, + "step": 1285 + }, + { + "epoch": 0.946980854197349, + "grad_norm": 0.5299695134162903, + "learning_rate": 8.650235556098017e-06, + "loss": 0.4241, + "step": 1286 + }, + { + "epoch": 0.9477172312223858, + "grad_norm": 0.5436539053916931, + "learning_rate": 8.647306018609108e-06, + "loss": 0.4211, + "step": 1287 + }, + { + "epoch": 0.9484536082474226, + "grad_norm": 0.49147042632102966, + "learning_rate": 8.644373802648877e-06, + "loss": 0.4528, + "step": 1288 + }, + { + "epoch": 0.9491899852724595, + "grad_norm": 0.5229616165161133, + "learning_rate": 8.641438910370655e-06, + "loss": 0.4492, + "step": 1289 + }, + { + "epoch": 0.9499263622974963, + "grad_norm": 0.49656403064727783, + "learning_rate": 8.638501343929735e-06, + "loss": 0.4382, + "step": 1290 + }, + { + "epoch": 0.9506627393225331, + "grad_norm": 0.47837382555007935, + "learning_rate": 8.635561105483384e-06, + "loss": 0.469, + "step": 1291 + }, + { + "epoch": 0.9513991163475699, + "grad_norm": 0.48511695861816406, + "learning_rate": 8.632618197190817e-06, + "loss": 0.4372, + "step": 1292 + }, + { + "epoch": 0.9521354933726067, + "grad_norm": 0.5691758394241333, + "learning_rate": 8.629672621213221e-06, + "loss": 0.4331, + "step": 1293 + }, + { + "epoch": 0.9528718703976435, + "grad_norm": 0.46881407499313354, + "learning_rate": 8.626724379713736e-06, + "loss": 0.4669, + "step": 1294 + }, + { + "epoch": 0.9536082474226805, + "grad_norm": 0.5768502354621887, + "learning_rate": 8.623773474857461e-06, + "loss": 0.4613, + "step": 1295 + }, + { + "epoch": 0.9543446244477173, + "grad_norm": 0.49087420105934143, + "learning_rate": 8.620819908811455e-06, + "loss": 0.4296, + "step": 1296 + }, + { + "epoch": 0.9550810014727541, + "grad_norm": 0.45804309844970703, + "learning_rate": 8.617863683744726e-06, + "loss": 0.4376, + "step": 1297 + }, + { + "epoch": 0.9558173784977909, + "grad_norm": 0.47427794337272644, + "learning_rate": 8.614904801828234e-06, + "loss": 0.4583, + "step": 1298 + }, + { + "epoch": 0.9565537555228277, + "grad_norm": 0.44913750886917114, + "learning_rate": 8.611943265234895e-06, + "loss": 0.4409, + "step": 1299 + }, + { + "epoch": 0.9572901325478645, + "grad_norm": 0.491207480430603, + "learning_rate": 8.608979076139572e-06, + "loss": 0.449, + "step": 1300 + }, + { + "epoch": 0.9580265095729014, + "grad_norm": 0.43761712312698364, + "learning_rate": 8.606012236719073e-06, + "loss": 0.442, + "step": 1301 + }, + { + "epoch": 0.9587628865979382, + "grad_norm": 0.39326536655426025, + "learning_rate": 8.60304274915216e-06, + "loss": 0.4258, + "step": 1302 + }, + { + "epoch": 0.959499263622975, + "grad_norm": 0.4258727431297302, + "learning_rate": 8.600070615619528e-06, + "loss": 0.4392, + "step": 1303 + }, + { + "epoch": 0.9602356406480118, + "grad_norm": 0.4401472210884094, + "learning_rate": 8.597095838303831e-06, + "loss": 0.4483, + "step": 1304 + }, + { + "epoch": 0.9609720176730486, + "grad_norm": 0.4828743040561676, + "learning_rate": 8.594118419389648e-06, + "loss": 0.4307, + "step": 1305 + }, + { + "epoch": 0.9617083946980854, + "grad_norm": 0.4741956889629364, + "learning_rate": 8.591138361063508e-06, + "loss": 0.4331, + "step": 1306 + }, + { + "epoch": 0.9624447717231223, + "grad_norm": 0.4706476926803589, + "learning_rate": 8.588155665513877e-06, + "loss": 0.4425, + "step": 1307 + }, + { + "epoch": 0.9631811487481591, + "grad_norm": 0.4412619471549988, + "learning_rate": 8.585170334931156e-06, + "loss": 0.4312, + "step": 1308 + }, + { + "epoch": 0.9639175257731959, + "grad_norm": 0.4449722170829773, + "learning_rate": 8.58218237150768e-06, + "loss": 0.4158, + "step": 1309 + }, + { + "epoch": 0.9646539027982327, + "grad_norm": 0.45096832513809204, + "learning_rate": 8.579191777437721e-06, + "loss": 0.4532, + "step": 1310 + }, + { + "epoch": 0.9653902798232695, + "grad_norm": 0.4758065938949585, + "learning_rate": 8.57619855491748e-06, + "loss": 0.4518, + "step": 1311 + }, + { + "epoch": 0.9661266568483063, + "grad_norm": 0.4714369773864746, + "learning_rate": 8.57320270614509e-06, + "loss": 0.4579, + "step": 1312 + }, + { + "epoch": 0.9668630338733432, + "grad_norm": 0.44195324182510376, + "learning_rate": 8.57020423332061e-06, + "loss": 0.4256, + "step": 1313 + }, + { + "epoch": 0.96759941089838, + "grad_norm": 0.49739697575569153, + "learning_rate": 8.567203138646027e-06, + "loss": 0.4411, + "step": 1314 + }, + { + "epoch": 0.9683357879234168, + "grad_norm": 0.48561814427375793, + "learning_rate": 8.564199424325259e-06, + "loss": 0.439, + "step": 1315 + }, + { + "epoch": 0.9690721649484536, + "grad_norm": 0.44757330417633057, + "learning_rate": 8.56119309256414e-06, + "loss": 0.468, + "step": 1316 + }, + { + "epoch": 0.9698085419734904, + "grad_norm": 0.5125359296798706, + "learning_rate": 8.558184145570427e-06, + "loss": 0.4643, + "step": 1317 + }, + { + "epoch": 0.9705449189985272, + "grad_norm": 0.6268301606178284, + "learning_rate": 8.555172585553804e-06, + "loss": 0.4455, + "step": 1318 + }, + { + "epoch": 0.9712812960235641, + "grad_norm": 0.4266813099384308, + "learning_rate": 8.552158414725868e-06, + "loss": 0.4357, + "step": 1319 + }, + { + "epoch": 0.9720176730486009, + "grad_norm": 0.49251487851142883, + "learning_rate": 8.549141635300135e-06, + "loss": 0.4315, + "step": 1320 + }, + { + "epoch": 0.9727540500736377, + "grad_norm": 0.4862709939479828, + "learning_rate": 8.546122249492035e-06, + "loss": 0.4352, + "step": 1321 + }, + { + "epoch": 0.9734904270986745, + "grad_norm": 0.4898791015148163, + "learning_rate": 8.543100259518916e-06, + "loss": 0.4408, + "step": 1322 + }, + { + "epoch": 0.9742268041237113, + "grad_norm": 0.45198628306388855, + "learning_rate": 8.540075667600034e-06, + "loss": 0.4316, + "step": 1323 + }, + { + "epoch": 0.9749631811487481, + "grad_norm": 0.5984836220741272, + "learning_rate": 8.53704847595656e-06, + "loss": 0.4535, + "step": 1324 + }, + { + "epoch": 0.975699558173785, + "grad_norm": 0.5049937963485718, + "learning_rate": 8.534018686811572e-06, + "loss": 0.4115, + "step": 1325 + }, + { + "epoch": 0.9764359351988218, + "grad_norm": 0.41473087668418884, + "learning_rate": 8.530986302390053e-06, + "loss": 0.463, + "step": 1326 + }, + { + "epoch": 0.9771723122238586, + "grad_norm": 0.4933287501335144, + "learning_rate": 8.527951324918897e-06, + "loss": 0.4182, + "step": 1327 + }, + { + "epoch": 0.9779086892488954, + "grad_norm": 0.6397290229797363, + "learning_rate": 8.5249137566269e-06, + "loss": 0.4784, + "step": 1328 + }, + { + "epoch": 0.9786450662739322, + "grad_norm": 0.45069581270217896, + "learning_rate": 8.521873599744758e-06, + "loss": 0.4373, + "step": 1329 + }, + { + "epoch": 0.979381443298969, + "grad_norm": 0.5195262432098389, + "learning_rate": 8.518830856505072e-06, + "loss": 0.4582, + "step": 1330 + }, + { + "epoch": 0.9801178203240059, + "grad_norm": 0.6199840307235718, + "learning_rate": 8.515785529142339e-06, + "loss": 0.4265, + "step": 1331 + }, + { + "epoch": 0.9808541973490427, + "grad_norm": 0.5363901257514954, + "learning_rate": 8.512737619892958e-06, + "loss": 0.4388, + "step": 1332 + }, + { + "epoch": 0.9815905743740795, + "grad_norm": 0.576992392539978, + "learning_rate": 8.509687130995223e-06, + "loss": 0.4475, + "step": 1333 + }, + { + "epoch": 0.9823269513991163, + "grad_norm": 0.4745303690433502, + "learning_rate": 8.506634064689314e-06, + "loss": 0.432, + "step": 1334 + }, + { + "epoch": 0.9830633284241531, + "grad_norm": 0.4878941476345062, + "learning_rate": 8.503578423217316e-06, + "loss": 0.433, + "step": 1335 + }, + { + "epoch": 0.9837997054491899, + "grad_norm": 0.5367373824119568, + "learning_rate": 8.500520208823199e-06, + "loss": 0.4369, + "step": 1336 + }, + { + "epoch": 0.9845360824742269, + "grad_norm": 0.5170448422431946, + "learning_rate": 8.497459423752824e-06, + "loss": 0.4418, + "step": 1337 + }, + { + "epoch": 0.9852724594992637, + "grad_norm": 0.45060834288597107, + "learning_rate": 8.494396070253934e-06, + "loss": 0.4293, + "step": 1338 + }, + { + "epoch": 0.9860088365243005, + "grad_norm": 0.43550148606300354, + "learning_rate": 8.49133015057617e-06, + "loss": 0.437, + "step": 1339 + }, + { + "epoch": 0.9867452135493373, + "grad_norm": 0.4647277891635895, + "learning_rate": 8.488261666971047e-06, + "loss": 0.4149, + "step": 1340 + }, + { + "epoch": 0.9874815905743741, + "grad_norm": 0.5019136667251587, + "learning_rate": 8.485190621691967e-06, + "loss": 0.4437, + "step": 1341 + }, + { + "epoch": 0.9882179675994109, + "grad_norm": 0.41769254207611084, + "learning_rate": 8.482117016994213e-06, + "loss": 0.4457, + "step": 1342 + }, + { + "epoch": 0.9889543446244478, + "grad_norm": 0.4599890112876892, + "learning_rate": 8.479040855134949e-06, + "loss": 0.4314, + "step": 1343 + }, + { + "epoch": 0.9896907216494846, + "grad_norm": 0.495074063539505, + "learning_rate": 8.475962138373212e-06, + "loss": 0.4334, + "step": 1344 + }, + { + "epoch": 0.9904270986745214, + "grad_norm": 0.41426074504852295, + "learning_rate": 8.472880868969922e-06, + "loss": 0.4461, + "step": 1345 + }, + { + "epoch": 0.9911634756995582, + "grad_norm": 0.4830911457538605, + "learning_rate": 8.469797049187867e-06, + "loss": 0.4623, + "step": 1346 + }, + { + "epoch": 0.991899852724595, + "grad_norm": 0.5529137849807739, + "learning_rate": 8.466710681291714e-06, + "loss": 0.4372, + "step": 1347 + }, + { + "epoch": 0.9926362297496318, + "grad_norm": 0.46961885690689087, + "learning_rate": 8.463621767547998e-06, + "loss": 0.4219, + "step": 1348 + }, + { + "epoch": 0.9933726067746687, + "grad_norm": 0.4638763666152954, + "learning_rate": 8.46053031022512e-06, + "loss": 0.4388, + "step": 1349 + }, + { + "epoch": 0.9941089837997055, + "grad_norm": 0.44377005100250244, + "learning_rate": 8.457436311593358e-06, + "loss": 0.4169, + "step": 1350 + }, + { + "epoch": 0.9948453608247423, + "grad_norm": 0.5292149782180786, + "learning_rate": 8.454339773924849e-06, + "loss": 0.4521, + "step": 1351 + }, + { + "epoch": 0.9955817378497791, + "grad_norm": 0.5035756230354309, + "learning_rate": 8.451240699493597e-06, + "loss": 0.4451, + "step": 1352 + }, + { + "epoch": 0.9963181148748159, + "grad_norm": 0.42530128359794617, + "learning_rate": 8.448139090575467e-06, + "loss": 0.434, + "step": 1353 + }, + { + "epoch": 0.9970544918998527, + "grad_norm": 0.5264377593994141, + "learning_rate": 8.445034949448188e-06, + "loss": 0.4654, + "step": 1354 + }, + { + "epoch": 0.9977908689248896, + "grad_norm": 0.5930060148239136, + "learning_rate": 8.441928278391349e-06, + "loss": 0.4539, + "step": 1355 + }, + { + "epoch": 0.9985272459499264, + "grad_norm": 0.43737247586250305, + "learning_rate": 8.438819079686391e-06, + "loss": 0.4495, + "step": 1356 + }, + { + "epoch": 0.9992636229749632, + "grad_norm": 0.46241891384124756, + "learning_rate": 8.43570735561662e-06, + "loss": 0.4729, + "step": 1357 + }, + { + "epoch": 1.0, + "grad_norm": 0.5586955547332764, + "learning_rate": 8.43259310846719e-06, + "loss": 0.4411, + "step": 1358 + }, + { + "epoch": 1.0007363770250368, + "grad_norm": 0.5139601230621338, + "learning_rate": 8.429476340525111e-06, + "loss": 0.4186, + "step": 1359 + }, + { + "epoch": 1.0014727540500736, + "grad_norm": 0.42976438999176025, + "learning_rate": 8.426357054079244e-06, + "loss": 0.4069, + "step": 1360 + }, + { + "epoch": 1.0022091310751104, + "grad_norm": 0.537762463092804, + "learning_rate": 8.423235251420297e-06, + "loss": 0.3928, + "step": 1361 + }, + { + "epoch": 1.0029455081001473, + "grad_norm": 0.4901559054851532, + "learning_rate": 8.420110934840826e-06, + "loss": 0.4392, + "step": 1362 + }, + { + "epoch": 1.003681885125184, + "grad_norm": 0.44034528732299805, + "learning_rate": 8.416984106635238e-06, + "loss": 0.4228, + "step": 1363 + }, + { + "epoch": 1.004418262150221, + "grad_norm": 0.4630548059940338, + "learning_rate": 8.413854769099779e-06, + "loss": 0.4053, + "step": 1364 + }, + { + "epoch": 1.0051546391752577, + "grad_norm": 0.5146474242210388, + "learning_rate": 8.410722924532541e-06, + "loss": 0.3992, + "step": 1365 + }, + { + "epoch": 1.0058910162002945, + "grad_norm": 0.46642136573791504, + "learning_rate": 8.407588575233457e-06, + "loss": 0.4437, + "step": 1366 + }, + { + "epoch": 1.0066273932253313, + "grad_norm": 0.4860120415687561, + "learning_rate": 8.404451723504295e-06, + "loss": 0.421, + "step": 1367 + }, + { + "epoch": 1.0073637702503682, + "grad_norm": 0.5072521567344666, + "learning_rate": 8.401312371648667e-06, + "loss": 0.4082, + "step": 1368 + }, + { + "epoch": 1.008100147275405, + "grad_norm": 0.4237157702445984, + "learning_rate": 8.398170521972017e-06, + "loss": 0.4189, + "step": 1369 + }, + { + "epoch": 1.0088365243004418, + "grad_norm": 0.4586620628833771, + "learning_rate": 8.395026176781627e-06, + "loss": 0.3847, + "step": 1370 + }, + { + "epoch": 1.0095729013254786, + "grad_norm": 0.4629966616630554, + "learning_rate": 8.391879338386604e-06, + "loss": 0.4173, + "step": 1371 + }, + { + "epoch": 1.0103092783505154, + "grad_norm": 0.4874536097049713, + "learning_rate": 8.388730009097895e-06, + "loss": 0.4035, + "step": 1372 + }, + { + "epoch": 1.0110456553755522, + "grad_norm": 0.4574902355670929, + "learning_rate": 8.385578191228272e-06, + "loss": 0.425, + "step": 1373 + }, + { + "epoch": 1.011782032400589, + "grad_norm": 0.3942805230617523, + "learning_rate": 8.382423887092333e-06, + "loss": 0.4186, + "step": 1374 + }, + { + "epoch": 1.0125184094256259, + "grad_norm": 0.4842493236064911, + "learning_rate": 8.379267099006506e-06, + "loss": 0.4149, + "step": 1375 + }, + { + "epoch": 1.0132547864506627, + "grad_norm": 0.46591871976852417, + "learning_rate": 8.376107829289037e-06, + "loss": 0.4112, + "step": 1376 + }, + { + "epoch": 1.0139911634756995, + "grad_norm": 0.4761437177658081, + "learning_rate": 8.372946080260002e-06, + "loss": 0.4109, + "step": 1377 + }, + { + "epoch": 1.0147275405007363, + "grad_norm": 0.4643495976924896, + "learning_rate": 8.369781854241293e-06, + "loss": 0.4222, + "step": 1378 + }, + { + "epoch": 1.0154639175257731, + "grad_norm": 0.4508487284183502, + "learning_rate": 8.36661515355662e-06, + "loss": 0.4215, + "step": 1379 + }, + { + "epoch": 1.01620029455081, + "grad_norm": 0.5168802738189697, + "learning_rate": 8.363445980531515e-06, + "loss": 0.4213, + "step": 1380 + }, + { + "epoch": 1.0169366715758468, + "grad_norm": 0.46994489431381226, + "learning_rate": 8.360274337493321e-06, + "loss": 0.4292, + "step": 1381 + }, + { + "epoch": 1.0176730486008836, + "grad_norm": 0.44330844283103943, + "learning_rate": 8.3571002267712e-06, + "loss": 0.4107, + "step": 1382 + }, + { + "epoch": 1.0184094256259204, + "grad_norm": 0.4858071506023407, + "learning_rate": 8.353923650696119e-06, + "loss": 0.3989, + "step": 1383 + }, + { + "epoch": 1.0191458026509572, + "grad_norm": 0.5103859305381775, + "learning_rate": 8.35074461160086e-06, + "loss": 0.4168, + "step": 1384 + }, + { + "epoch": 1.019882179675994, + "grad_norm": 0.4278160631656647, + "learning_rate": 8.347563111820014e-06, + "loss": 0.4058, + "step": 1385 + }, + { + "epoch": 1.0206185567010309, + "grad_norm": 0.436535507440567, + "learning_rate": 8.34437915368998e-06, + "loss": 0.3877, + "step": 1386 + }, + { + "epoch": 1.0213549337260677, + "grad_norm": 0.47809407114982605, + "learning_rate": 8.341192739548958e-06, + "loss": 0.3958, + "step": 1387 + }, + { + "epoch": 1.0220913107511045, + "grad_norm": 0.4272827208042145, + "learning_rate": 8.338003871736957e-06, + "loss": 0.4139, + "step": 1388 + }, + { + "epoch": 1.0228276877761413, + "grad_norm": 0.4393133819103241, + "learning_rate": 8.334812552595782e-06, + "loss": 0.4266, + "step": 1389 + }, + { + "epoch": 1.0235640648011781, + "grad_norm": 0.45444509387016296, + "learning_rate": 8.331618784469043e-06, + "loss": 0.4055, + "step": 1390 + }, + { + "epoch": 1.024300441826215, + "grad_norm": 0.46266138553619385, + "learning_rate": 8.328422569702148e-06, + "loss": 0.4283, + "step": 1391 + }, + { + "epoch": 1.0250368188512518, + "grad_norm": 0.43913209438323975, + "learning_rate": 8.325223910642297e-06, + "loss": 0.4317, + "step": 1392 + }, + { + "epoch": 1.0257731958762886, + "grad_norm": 0.4571021795272827, + "learning_rate": 8.322022809638492e-06, + "loss": 0.4255, + "step": 1393 + }, + { + "epoch": 1.0265095729013254, + "grad_norm": 0.4817020893096924, + "learning_rate": 8.318819269041524e-06, + "loss": 0.4324, + "step": 1394 + }, + { + "epoch": 1.0272459499263622, + "grad_norm": 0.4041571021080017, + "learning_rate": 8.315613291203977e-06, + "loss": 0.417, + "step": 1395 + }, + { + "epoch": 1.027982326951399, + "grad_norm": 0.4440697729587555, + "learning_rate": 8.312404878480222e-06, + "loss": 0.4043, + "step": 1396 + }, + { + "epoch": 1.0287187039764358, + "grad_norm": 0.47406235337257385, + "learning_rate": 8.309194033226423e-06, + "loss": 0.4089, + "step": 1397 + }, + { + "epoch": 1.0294550810014726, + "grad_norm": 0.4187905192375183, + "learning_rate": 8.305980757800525e-06, + "loss": 0.4108, + "step": 1398 + }, + { + "epoch": 1.0301914580265095, + "grad_norm": 0.45452675223350525, + "learning_rate": 8.302765054562261e-06, + "loss": 0.4159, + "step": 1399 + }, + { + "epoch": 1.0309278350515463, + "grad_norm": 0.4963740110397339, + "learning_rate": 8.299546925873148e-06, + "loss": 0.411, + "step": 1400 + }, + { + "epoch": 1.031664212076583, + "grad_norm": 0.48701077699661255, + "learning_rate": 8.296326374096482e-06, + "loss": 0.409, + "step": 1401 + }, + { + "epoch": 1.0324005891016201, + "grad_norm": 0.4532162547111511, + "learning_rate": 8.293103401597338e-06, + "loss": 0.3964, + "step": 1402 + }, + { + "epoch": 1.033136966126657, + "grad_norm": 0.4221610128879547, + "learning_rate": 8.28987801074257e-06, + "loss": 0.4084, + "step": 1403 + }, + { + "epoch": 1.0338733431516938, + "grad_norm": 0.5044291615486145, + "learning_rate": 8.286650203900808e-06, + "loss": 0.4088, + "step": 1404 + }, + { + "epoch": 1.0346097201767306, + "grad_norm": 0.4653428792953491, + "learning_rate": 8.283419983442453e-06, + "loss": 0.3964, + "step": 1405 + }, + { + "epoch": 1.0353460972017674, + "grad_norm": 0.45748192071914673, + "learning_rate": 8.280187351739686e-06, + "loss": 0.4184, + "step": 1406 + }, + { + "epoch": 1.0360824742268042, + "grad_norm": 0.4691620171070099, + "learning_rate": 8.276952311166451e-06, + "loss": 0.3849, + "step": 1407 + }, + { + "epoch": 1.036818851251841, + "grad_norm": 0.46832898259162903, + "learning_rate": 8.273714864098466e-06, + "loss": 0.4052, + "step": 1408 + }, + { + "epoch": 1.0375552282768779, + "grad_norm": 0.3961854875087738, + "learning_rate": 8.270475012913212e-06, + "loss": 0.3872, + "step": 1409 + }, + { + "epoch": 1.0382916053019147, + "grad_norm": 0.4253145158290863, + "learning_rate": 8.267232759989938e-06, + "loss": 0.3978, + "step": 1410 + }, + { + "epoch": 1.0390279823269515, + "grad_norm": 0.43363869190216064, + "learning_rate": 8.26398810770966e-06, + "loss": 0.4387, + "step": 1411 + }, + { + "epoch": 1.0397643593519883, + "grad_norm": 0.43698909878730774, + "learning_rate": 8.260741058455147e-06, + "loss": 0.4094, + "step": 1412 + }, + { + "epoch": 1.0405007363770251, + "grad_norm": 0.3974965214729309, + "learning_rate": 8.257491614610939e-06, + "loss": 0.407, + "step": 1413 + }, + { + "epoch": 1.041237113402062, + "grad_norm": 0.4418043792247772, + "learning_rate": 8.254239778563325e-06, + "loss": 0.4174, + "step": 1414 + }, + { + "epoch": 1.0419734904270987, + "grad_norm": 0.43214893341064453, + "learning_rate": 8.250985552700359e-06, + "loss": 0.4399, + "step": 1415 + }, + { + "epoch": 1.0427098674521356, + "grad_norm": 0.41666722297668457, + "learning_rate": 8.247728939411845e-06, + "loss": 0.4377, + "step": 1416 + }, + { + "epoch": 1.0434462444771724, + "grad_norm": 0.45747169852256775, + "learning_rate": 8.24446994108934e-06, + "loss": 0.4204, + "step": 1417 + }, + { + "epoch": 1.0441826215022092, + "grad_norm": 0.4472166895866394, + "learning_rate": 8.241208560126154e-06, + "loss": 0.4299, + "step": 1418 + }, + { + "epoch": 1.044918998527246, + "grad_norm": 0.42849603295326233, + "learning_rate": 8.237944798917347e-06, + "loss": 0.419, + "step": 1419 + }, + { + "epoch": 1.0456553755522828, + "grad_norm": 0.46297961473464966, + "learning_rate": 8.234678659859729e-06, + "loss": 0.4288, + "step": 1420 + }, + { + "epoch": 1.0463917525773196, + "grad_norm": 0.5132995247840881, + "learning_rate": 8.231410145351853e-06, + "loss": 0.4088, + "step": 1421 + }, + { + "epoch": 1.0471281296023565, + "grad_norm": 0.45529741048812866, + "learning_rate": 8.228139257794012e-06, + "loss": 0.4097, + "step": 1422 + }, + { + "epoch": 1.0478645066273933, + "grad_norm": 0.4541131854057312, + "learning_rate": 8.224865999588254e-06, + "loss": 0.3927, + "step": 1423 + }, + { + "epoch": 1.04860088365243, + "grad_norm": 0.46962690353393555, + "learning_rate": 8.221590373138358e-06, + "loss": 0.4196, + "step": 1424 + }, + { + "epoch": 1.049337260677467, + "grad_norm": 0.39268723130226135, + "learning_rate": 8.218312380849844e-06, + "loss": 0.4267, + "step": 1425 + }, + { + "epoch": 1.0500736377025037, + "grad_norm": 0.38890156149864197, + "learning_rate": 8.21503202512997e-06, + "loss": 0.4095, + "step": 1426 + }, + { + "epoch": 1.0508100147275405, + "grad_norm": 0.4562273621559143, + "learning_rate": 8.211749308387734e-06, + "loss": 0.4437, + "step": 1427 + }, + { + "epoch": 1.0515463917525774, + "grad_norm": 0.43432724475860596, + "learning_rate": 8.208464233033862e-06, + "loss": 0.4181, + "step": 1428 + }, + { + "epoch": 1.0522827687776142, + "grad_norm": 0.4545213580131531, + "learning_rate": 8.205176801480811e-06, + "loss": 0.4148, + "step": 1429 + }, + { + "epoch": 1.053019145802651, + "grad_norm": 0.5053450465202332, + "learning_rate": 8.201887016142776e-06, + "loss": 0.4027, + "step": 1430 + }, + { + "epoch": 1.0537555228276878, + "grad_norm": 0.4846493601799011, + "learning_rate": 8.198594879435673e-06, + "loss": 0.4324, + "step": 1431 + }, + { + "epoch": 1.0544918998527246, + "grad_norm": 0.4266805648803711, + "learning_rate": 8.19530039377715e-06, + "loss": 0.4072, + "step": 1432 + }, + { + "epoch": 1.0552282768777614, + "grad_norm": 0.5038962364196777, + "learning_rate": 8.192003561586576e-06, + "loss": 0.4021, + "step": 1433 + }, + { + "epoch": 1.0559646539027983, + "grad_norm": 0.5069649815559387, + "learning_rate": 8.188704385285046e-06, + "loss": 0.4373, + "step": 1434 + }, + { + "epoch": 1.056701030927835, + "grad_norm": 0.40627750754356384, + "learning_rate": 8.185402867295373e-06, + "loss": 0.4307, + "step": 1435 + }, + { + "epoch": 1.0574374079528719, + "grad_norm": 0.5366407632827759, + "learning_rate": 8.182099010042095e-06, + "loss": 0.4244, + "step": 1436 + }, + { + "epoch": 1.0581737849779087, + "grad_norm": 0.4338124096393585, + "learning_rate": 8.178792815951465e-06, + "loss": 0.4091, + "step": 1437 + }, + { + "epoch": 1.0589101620029455, + "grad_norm": 0.4336191415786743, + "learning_rate": 8.175484287451448e-06, + "loss": 0.4111, + "step": 1438 + }, + { + "epoch": 1.0596465390279823, + "grad_norm": 0.4909955561161041, + "learning_rate": 8.172173426971732e-06, + "loss": 0.4237, + "step": 1439 + }, + { + "epoch": 1.0603829160530192, + "grad_norm": 0.6215251088142395, + "learning_rate": 8.168860236943709e-06, + "loss": 0.4154, + "step": 1440 + }, + { + "epoch": 1.061119293078056, + "grad_norm": 0.4355964958667755, + "learning_rate": 8.16554471980049e-06, + "loss": 0.3951, + "step": 1441 + }, + { + "epoch": 1.0618556701030928, + "grad_norm": 0.5015247464179993, + "learning_rate": 8.162226877976886e-06, + "loss": 0.4373, + "step": 1442 + }, + { + "epoch": 1.0625920471281296, + "grad_norm": 0.49945569038391113, + "learning_rate": 8.158906713909425e-06, + "loss": 0.3987, + "step": 1443 + }, + { + "epoch": 1.0633284241531664, + "grad_norm": 0.492780864238739, + "learning_rate": 8.155584230036328e-06, + "loss": 0.424, + "step": 1444 + }, + { + "epoch": 1.0640648011782032, + "grad_norm": 0.5154081583023071, + "learning_rate": 8.152259428797535e-06, + "loss": 0.4135, + "step": 1445 + }, + { + "epoch": 1.06480117820324, + "grad_norm": 0.4992247521877289, + "learning_rate": 8.148932312634674e-06, + "loss": 0.4126, + "step": 1446 + }, + { + "epoch": 1.0655375552282769, + "grad_norm": 0.579197883605957, + "learning_rate": 8.14560288399108e-06, + "loss": 0.4069, + "step": 1447 + }, + { + "epoch": 1.0662739322533137, + "grad_norm": 0.5475592613220215, + "learning_rate": 8.142271145311784e-06, + "loss": 0.4106, + "step": 1448 + }, + { + "epoch": 1.0670103092783505, + "grad_norm": 0.5077992081642151, + "learning_rate": 8.138937099043516e-06, + "loss": 0.4135, + "step": 1449 + }, + { + "epoch": 1.0677466863033873, + "grad_norm": 0.5219954252243042, + "learning_rate": 8.135600747634697e-06, + "loss": 0.4654, + "step": 1450 + }, + { + "epoch": 1.0684830633284241, + "grad_norm": 0.5722024440765381, + "learning_rate": 8.132262093535444e-06, + "loss": 0.4448, + "step": 1451 + }, + { + "epoch": 1.069219440353461, + "grad_norm": 0.4120500683784485, + "learning_rate": 8.128921139197563e-06, + "loss": 0.3952, + "step": 1452 + }, + { + "epoch": 1.0699558173784978, + "grad_norm": 0.46952569484710693, + "learning_rate": 8.125577887074552e-06, + "loss": 0.4353, + "step": 1453 + }, + { + "epoch": 1.0706921944035346, + "grad_norm": 0.5226830244064331, + "learning_rate": 8.12223233962159e-06, + "loss": 0.409, + "step": 1454 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 0.5324341058731079, + "learning_rate": 8.118884499295549e-06, + "loss": 0.4086, + "step": 1455 + }, + { + "epoch": 1.0721649484536082, + "grad_norm": 0.4469471573829651, + "learning_rate": 8.115534368554981e-06, + "loss": 0.4064, + "step": 1456 + }, + { + "epoch": 1.072901325478645, + "grad_norm": 0.5327169895172119, + "learning_rate": 8.112181949860121e-06, + "loss": 0.4437, + "step": 1457 + }, + { + "epoch": 1.0736377025036818, + "grad_norm": 0.5192805528640747, + "learning_rate": 8.108827245672884e-06, + "loss": 0.4017, + "step": 1458 + }, + { + "epoch": 1.0743740795287187, + "grad_norm": 0.3944444954395294, + "learning_rate": 8.105470258456863e-06, + "loss": 0.3981, + "step": 1459 + }, + { + "epoch": 1.0751104565537555, + "grad_norm": 0.4764431416988373, + "learning_rate": 8.102110990677328e-06, + "loss": 0.4081, + "step": 1460 + }, + { + "epoch": 1.0758468335787923, + "grad_norm": 0.49513620138168335, + "learning_rate": 8.098749444801226e-06, + "loss": 0.4406, + "step": 1461 + }, + { + "epoch": 1.076583210603829, + "grad_norm": 0.4354887306690216, + "learning_rate": 8.095385623297171e-06, + "loss": 0.4379, + "step": 1462 + }, + { + "epoch": 1.077319587628866, + "grad_norm": 0.541980504989624, + "learning_rate": 8.092019528635453e-06, + "loss": 0.43, + "step": 1463 + }, + { + "epoch": 1.0780559646539027, + "grad_norm": 0.47262078523635864, + "learning_rate": 8.088651163288032e-06, + "loss": 0.4201, + "step": 1464 + }, + { + "epoch": 1.0787923416789396, + "grad_norm": 0.4839096665382385, + "learning_rate": 8.085280529728533e-06, + "loss": 0.4137, + "step": 1465 + }, + { + "epoch": 1.0795287187039764, + "grad_norm": 0.4608899652957916, + "learning_rate": 8.081907630432246e-06, + "loss": 0.4211, + "step": 1466 + }, + { + "epoch": 1.0802650957290132, + "grad_norm": 0.4837184250354767, + "learning_rate": 8.078532467876126e-06, + "loss": 0.4036, + "step": 1467 + }, + { + "epoch": 1.08100147275405, + "grad_norm": 0.518398642539978, + "learning_rate": 8.075155044538792e-06, + "loss": 0.4052, + "step": 1468 + }, + { + "epoch": 1.0817378497790868, + "grad_norm": 0.5079612731933594, + "learning_rate": 8.071775362900522e-06, + "loss": 0.4255, + "step": 1469 + }, + { + "epoch": 1.0824742268041236, + "grad_norm": 0.4972943365573883, + "learning_rate": 8.068393425443253e-06, + "loss": 0.3831, + "step": 1470 + }, + { + "epoch": 1.0832106038291605, + "grad_norm": 0.4453538656234741, + "learning_rate": 8.065009234650574e-06, + "loss": 0.4174, + "step": 1471 + }, + { + "epoch": 1.0839469808541973, + "grad_norm": 0.46142661571502686, + "learning_rate": 8.061622793007735e-06, + "loss": 0.4331, + "step": 1472 + }, + { + "epoch": 1.084683357879234, + "grad_norm": 0.4330517053604126, + "learning_rate": 8.058234103001634e-06, + "loss": 0.4285, + "step": 1473 + }, + { + "epoch": 1.085419734904271, + "grad_norm": 0.4767371118068695, + "learning_rate": 8.054843167120827e-06, + "loss": 0.4198, + "step": 1474 + }, + { + "epoch": 1.0861561119293077, + "grad_norm": 0.45682060718536377, + "learning_rate": 8.051449987855512e-06, + "loss": 0.4344, + "step": 1475 + }, + { + "epoch": 1.0868924889543445, + "grad_norm": 0.44379401206970215, + "learning_rate": 8.048054567697537e-06, + "loss": 0.4296, + "step": 1476 + }, + { + "epoch": 1.0876288659793814, + "grad_norm": 0.4649368226528168, + "learning_rate": 8.044656909140395e-06, + "loss": 0.4213, + "step": 1477 + }, + { + "epoch": 1.0883652430044182, + "grad_norm": 0.4579814672470093, + "learning_rate": 8.041257014679228e-06, + "loss": 0.4117, + "step": 1478 + }, + { + "epoch": 1.089101620029455, + "grad_norm": 0.4541153013706207, + "learning_rate": 8.037854886810813e-06, + "loss": 0.4197, + "step": 1479 + }, + { + "epoch": 1.0898379970544918, + "grad_norm": 0.39233219623565674, + "learning_rate": 8.034450528033565e-06, + "loss": 0.3982, + "step": 1480 + }, + { + "epoch": 1.0905743740795286, + "grad_norm": 0.40796148777008057, + "learning_rate": 8.031043940847551e-06, + "loss": 0.4377, + "step": 1481 + }, + { + "epoch": 1.0913107511045654, + "grad_norm": 0.47203153371810913, + "learning_rate": 8.027635127754462e-06, + "loss": 0.4251, + "step": 1482 + }, + { + "epoch": 1.0920471281296025, + "grad_norm": 0.4739782512187958, + "learning_rate": 8.024224091257628e-06, + "loss": 0.4257, + "step": 1483 + }, + { + "epoch": 1.0927835051546393, + "grad_norm": 0.3920868933200836, + "learning_rate": 8.020810833862009e-06, + "loss": 0.406, + "step": 1484 + }, + { + "epoch": 1.093519882179676, + "grad_norm": 0.5111119151115417, + "learning_rate": 8.017395358074198e-06, + "loss": 0.4216, + "step": 1485 + }, + { + "epoch": 1.094256259204713, + "grad_norm": 0.46156689524650574, + "learning_rate": 8.013977666402421e-06, + "loss": 0.4185, + "step": 1486 + }, + { + "epoch": 1.0949926362297497, + "grad_norm": 0.42204272747039795, + "learning_rate": 8.010557761356523e-06, + "loss": 0.417, + "step": 1487 + }, + { + "epoch": 1.0957290132547866, + "grad_norm": 0.5379983186721802, + "learning_rate": 8.007135645447982e-06, + "loss": 0.4235, + "step": 1488 + }, + { + "epoch": 1.0964653902798234, + "grad_norm": 0.4320377707481384, + "learning_rate": 8.003711321189895e-06, + "loss": 0.4081, + "step": 1489 + }, + { + "epoch": 1.0972017673048602, + "grad_norm": 0.4671606421470642, + "learning_rate": 8.000284791096983e-06, + "loss": 0.4235, + "step": 1490 + }, + { + "epoch": 1.097938144329897, + "grad_norm": 0.47538307309150696, + "learning_rate": 7.996856057685587e-06, + "loss": 0.45, + "step": 1491 + }, + { + "epoch": 1.0986745213549338, + "grad_norm": 0.45516252517700195, + "learning_rate": 7.993425123473662e-06, + "loss": 0.4002, + "step": 1492 + }, + { + "epoch": 1.0994108983799706, + "grad_norm": 0.4619099199771881, + "learning_rate": 7.989991990980786e-06, + "loss": 0.4214, + "step": 1493 + }, + { + "epoch": 1.1001472754050075, + "grad_norm": 0.36979731917381287, + "learning_rate": 7.986556662728145e-06, + "loss": 0.4225, + "step": 1494 + }, + { + "epoch": 1.1008836524300443, + "grad_norm": 0.43840059638023376, + "learning_rate": 7.983119141238543e-06, + "loss": 0.4197, + "step": 1495 + }, + { + "epoch": 1.101620029455081, + "grad_norm": 0.4606439769268036, + "learning_rate": 7.97967942903639e-06, + "loss": 0.4014, + "step": 1496 + }, + { + "epoch": 1.102356406480118, + "grad_norm": 0.38191360235214233, + "learning_rate": 7.976237528647705e-06, + "loss": 0.4149, + "step": 1497 + }, + { + "epoch": 1.1030927835051547, + "grad_norm": 0.4278278648853302, + "learning_rate": 7.97279344260012e-06, + "loss": 0.4283, + "step": 1498 + }, + { + "epoch": 1.1038291605301915, + "grad_norm": 0.4436081051826477, + "learning_rate": 7.969347173422866e-06, + "loss": 0.4274, + "step": 1499 + }, + { + "epoch": 1.1045655375552283, + "grad_norm": 0.40234655141830444, + "learning_rate": 7.965898723646777e-06, + "loss": 0.4044, + "step": 1500 + }, + { + "epoch": 1.1053019145802652, + "grad_norm": 0.38506656885147095, + "learning_rate": 7.962448095804292e-06, + "loss": 0.4066, + "step": 1501 + }, + { + "epoch": 1.106038291605302, + "grad_norm": 0.4169124364852905, + "learning_rate": 7.958995292429447e-06, + "loss": 0.423, + "step": 1502 + }, + { + "epoch": 1.1067746686303388, + "grad_norm": 0.4359695613384247, + "learning_rate": 7.955540316057877e-06, + "loss": 0.4202, + "step": 1503 + }, + { + "epoch": 1.1075110456553756, + "grad_norm": 0.4167039394378662, + "learning_rate": 7.952083169226813e-06, + "loss": 0.3861, + "step": 1504 + }, + { + "epoch": 1.1082474226804124, + "grad_norm": 0.43994608521461487, + "learning_rate": 7.948623854475079e-06, + "loss": 0.4144, + "step": 1505 + }, + { + "epoch": 1.1089837997054492, + "grad_norm": 0.4638996422290802, + "learning_rate": 7.94516237434309e-06, + "loss": 0.4085, + "step": 1506 + }, + { + "epoch": 1.109720176730486, + "grad_norm": 0.4726799726486206, + "learning_rate": 7.941698731372851e-06, + "loss": 0.3952, + "step": 1507 + }, + { + "epoch": 1.1104565537555229, + "grad_norm": 0.5001874566078186, + "learning_rate": 7.938232928107963e-06, + "loss": 0.4176, + "step": 1508 + }, + { + "epoch": 1.1111929307805597, + "grad_norm": 0.39368122816085815, + "learning_rate": 7.9347649670936e-06, + "loss": 0.4093, + "step": 1509 + }, + { + "epoch": 1.1119293078055965, + "grad_norm": 0.4525279998779297, + "learning_rate": 7.93129485087653e-06, + "loss": 0.4261, + "step": 1510 + }, + { + "epoch": 1.1126656848306333, + "grad_norm": 0.43985629081726074, + "learning_rate": 7.927822582005104e-06, + "loss": 0.4309, + "step": 1511 + }, + { + "epoch": 1.1134020618556701, + "grad_norm": 0.4164014160633087, + "learning_rate": 7.924348163029249e-06, + "loss": 0.4273, + "step": 1512 + }, + { + "epoch": 1.114138438880707, + "grad_norm": 0.5301308035850525, + "learning_rate": 7.920871596500473e-06, + "loss": 0.4277, + "step": 1513 + }, + { + "epoch": 1.1148748159057438, + "grad_norm": 0.4031001925468445, + "learning_rate": 7.917392884971863e-06, + "loss": 0.4058, + "step": 1514 + }, + { + "epoch": 1.1156111929307806, + "grad_norm": 0.564858078956604, + "learning_rate": 7.913912030998079e-06, + "loss": 0.4363, + "step": 1515 + }, + { + "epoch": 1.1163475699558174, + "grad_norm": 0.45828789472579956, + "learning_rate": 7.910429037135355e-06, + "loss": 0.4088, + "step": 1516 + }, + { + "epoch": 1.1170839469808542, + "grad_norm": 0.5494905114173889, + "learning_rate": 7.906943905941495e-06, + "loss": 0.4241, + "step": 1517 + }, + { + "epoch": 1.117820324005891, + "grad_norm": 0.5447287559509277, + "learning_rate": 7.903456639975875e-06, + "loss": 0.4177, + "step": 1518 + }, + { + "epoch": 1.1185567010309279, + "grad_norm": 0.4168299734592438, + "learning_rate": 7.89996724179944e-06, + "loss": 0.4134, + "step": 1519 + }, + { + "epoch": 1.1192930780559647, + "grad_norm": 0.5956445336341858, + "learning_rate": 7.896475713974696e-06, + "loss": 0.4133, + "step": 1520 + }, + { + "epoch": 1.1200294550810015, + "grad_norm": 0.5098801255226135, + "learning_rate": 7.892982059065714e-06, + "loss": 0.4245, + "step": 1521 + }, + { + "epoch": 1.1207658321060383, + "grad_norm": 0.4896615743637085, + "learning_rate": 7.889486279638134e-06, + "loss": 0.4074, + "step": 1522 + }, + { + "epoch": 1.1215022091310751, + "grad_norm": 0.5458217263221741, + "learning_rate": 7.885988378259145e-06, + "loss": 0.4018, + "step": 1523 + }, + { + "epoch": 1.122238586156112, + "grad_norm": 0.49367034435272217, + "learning_rate": 7.882488357497504e-06, + "loss": 0.4176, + "step": 1524 + }, + { + "epoch": 1.1229749631811488, + "grad_norm": 0.5893204212188721, + "learning_rate": 7.87898621992352e-06, + "loss": 0.4162, + "step": 1525 + }, + { + "epoch": 1.1237113402061856, + "grad_norm": 0.5065829753875732, + "learning_rate": 7.875481968109052e-06, + "loss": 0.4279, + "step": 1526 + }, + { + "epoch": 1.1244477172312224, + "grad_norm": 0.5080648064613342, + "learning_rate": 7.871975604627524e-06, + "loss": 0.3956, + "step": 1527 + }, + { + "epoch": 1.1251840942562592, + "grad_norm": 0.6169611811637878, + "learning_rate": 7.8684671320539e-06, + "loss": 0.3985, + "step": 1528 + }, + { + "epoch": 1.125920471281296, + "grad_norm": 0.4453141987323761, + "learning_rate": 7.864956552964695e-06, + "loss": 0.4053, + "step": 1529 + }, + { + "epoch": 1.1266568483063328, + "grad_norm": 0.5291115641593933, + "learning_rate": 7.861443869937973e-06, + "loss": 0.414, + "step": 1530 + }, + { + "epoch": 1.1273932253313697, + "grad_norm": 0.5258786678314209, + "learning_rate": 7.857929085553344e-06, + "loss": 0.4303, + "step": 1531 + }, + { + "epoch": 1.1281296023564065, + "grad_norm": 0.4741217792034149, + "learning_rate": 7.854412202391958e-06, + "loss": 0.4174, + "step": 1532 + }, + { + "epoch": 1.1288659793814433, + "grad_norm": 0.5719811320304871, + "learning_rate": 7.850893223036508e-06, + "loss": 0.4043, + "step": 1533 + }, + { + "epoch": 1.12960235640648, + "grad_norm": 0.46365654468536377, + "learning_rate": 7.847372150071227e-06, + "loss": 0.4338, + "step": 1534 + }, + { + "epoch": 1.130338733431517, + "grad_norm": 0.4973761737346649, + "learning_rate": 7.843848986081882e-06, + "loss": 0.3943, + "step": 1535 + }, + { + "epoch": 1.1310751104565537, + "grad_norm": 0.5263963937759399, + "learning_rate": 7.84032373365578e-06, + "loss": 0.3909, + "step": 1536 + }, + { + "epoch": 1.1318114874815906, + "grad_norm": 0.452614426612854, + "learning_rate": 7.836796395381761e-06, + "loss": 0.415, + "step": 1537 + }, + { + "epoch": 1.1325478645066274, + "grad_norm": 0.49693626165390015, + "learning_rate": 7.833266973850192e-06, + "loss": 0.4189, + "step": 1538 + }, + { + "epoch": 1.1332842415316642, + "grad_norm": 0.5099273920059204, + "learning_rate": 7.829735471652978e-06, + "loss": 0.4125, + "step": 1539 + }, + { + "epoch": 1.134020618556701, + "grad_norm": 0.40911126136779785, + "learning_rate": 7.826201891383542e-06, + "loss": 0.4016, + "step": 1540 + }, + { + "epoch": 1.1347569955817378, + "grad_norm": 0.525810182094574, + "learning_rate": 7.822666235636844e-06, + "loss": 0.4202, + "step": 1541 + }, + { + "epoch": 1.1354933726067746, + "grad_norm": 0.40537041425704956, + "learning_rate": 7.819128507009361e-06, + "loss": 0.4088, + "step": 1542 + }, + { + "epoch": 1.1362297496318114, + "grad_norm": 0.42212677001953125, + "learning_rate": 7.815588708099094e-06, + "loss": 0.4222, + "step": 1543 + }, + { + "epoch": 1.1369661266568483, + "grad_norm": 0.5106800198554993, + "learning_rate": 7.812046841505563e-06, + "loss": 0.4592, + "step": 1544 + }, + { + "epoch": 1.137702503681885, + "grad_norm": 0.419968843460083, + "learning_rate": 7.808502909829807e-06, + "loss": 0.4089, + "step": 1545 + }, + { + "epoch": 1.138438880706922, + "grad_norm": 0.4281230568885803, + "learning_rate": 7.804956915674387e-06, + "loss": 0.3874, + "step": 1546 + }, + { + "epoch": 1.1391752577319587, + "grad_norm": 0.502241313457489, + "learning_rate": 7.80140886164337e-06, + "loss": 0.4284, + "step": 1547 + }, + { + "epoch": 1.1399116347569955, + "grad_norm": 0.39548102021217346, + "learning_rate": 7.79785875034234e-06, + "loss": 0.408, + "step": 1548 + }, + { + "epoch": 1.1406480117820323, + "grad_norm": 0.4249773919582367, + "learning_rate": 7.794306584378392e-06, + "loss": 0.4015, + "step": 1549 + }, + { + "epoch": 1.1413843888070692, + "grad_norm": 0.4784921109676361, + "learning_rate": 7.79075236636013e-06, + "loss": 0.4199, + "step": 1550 + }, + { + "epoch": 1.142120765832106, + "grad_norm": 0.42127084732055664, + "learning_rate": 7.787196098897664e-06, + "loss": 0.3893, + "step": 1551 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.4333012104034424, + "learning_rate": 7.783637784602608e-06, + "loss": 0.3984, + "step": 1552 + }, + { + "epoch": 1.1435935198821796, + "grad_norm": 0.47243404388427734, + "learning_rate": 7.780077426088083e-06, + "loss": 0.4212, + "step": 1553 + }, + { + "epoch": 1.1443298969072164, + "grad_norm": 0.47604048252105713, + "learning_rate": 7.776515025968707e-06, + "loss": 0.3994, + "step": 1554 + }, + { + "epoch": 1.1450662739322532, + "grad_norm": 0.5063879489898682, + "learning_rate": 7.772950586860599e-06, + "loss": 0.4086, + "step": 1555 + }, + { + "epoch": 1.14580265095729, + "grad_norm": 0.43437930941581726, + "learning_rate": 7.769384111381375e-06, + "loss": 0.4358, + "step": 1556 + }, + { + "epoch": 1.1465390279823269, + "grad_norm": 0.46195188164711, + "learning_rate": 7.76581560215015e-06, + "loss": 0.4083, + "step": 1557 + }, + { + "epoch": 1.1472754050073637, + "grad_norm": 0.4431914985179901, + "learning_rate": 7.762245061787525e-06, + "loss": 0.4024, + "step": 1558 + }, + { + "epoch": 1.1480117820324005, + "grad_norm": 0.42202213406562805, + "learning_rate": 7.758672492915598e-06, + "loss": 0.3988, + "step": 1559 + }, + { + "epoch": 1.1487481590574373, + "grad_norm": 0.4758620858192444, + "learning_rate": 7.755097898157957e-06, + "loss": 0.4307, + "step": 1560 + }, + { + "epoch": 1.1494845360824741, + "grad_norm": 0.4043138027191162, + "learning_rate": 7.751521280139675e-06, + "loss": 0.3967, + "step": 1561 + }, + { + "epoch": 1.150220913107511, + "grad_norm": 0.41827574372291565, + "learning_rate": 7.747942641487313e-06, + "loss": 0.4107, + "step": 1562 + }, + { + "epoch": 1.1509572901325478, + "grad_norm": 0.45348092913627625, + "learning_rate": 7.74436198482892e-06, + "loss": 0.4278, + "step": 1563 + }, + { + "epoch": 1.1516936671575846, + "grad_norm": 0.43559592962265015, + "learning_rate": 7.74077931279401e-06, + "loss": 0.4146, + "step": 1564 + }, + { + "epoch": 1.1524300441826214, + "grad_norm": 0.3940446078777313, + "learning_rate": 7.7371946280136e-06, + "loss": 0.404, + "step": 1565 + }, + { + "epoch": 1.1531664212076582, + "grad_norm": 0.4286969006061554, + "learning_rate": 7.73360793312017e-06, + "loss": 0.4078, + "step": 1566 + }, + { + "epoch": 1.153902798232695, + "grad_norm": 0.4312724471092224, + "learning_rate": 7.730019230747681e-06, + "loss": 0.4088, + "step": 1567 + }, + { + "epoch": 1.1546391752577319, + "grad_norm": 0.41734251379966736, + "learning_rate": 7.726428523531565e-06, + "loss": 0.4228, + "step": 1568 + }, + { + "epoch": 1.1553755522827687, + "grad_norm": 0.435044527053833, + "learning_rate": 7.722835814108733e-06, + "loss": 0.4059, + "step": 1569 + }, + { + "epoch": 1.1561119293078055, + "grad_norm": 0.443087637424469, + "learning_rate": 7.719241105117559e-06, + "loss": 0.3977, + "step": 1570 + }, + { + "epoch": 1.1568483063328423, + "grad_norm": 0.3917474150657654, + "learning_rate": 7.715644399197893e-06, + "loss": 0.3999, + "step": 1571 + }, + { + "epoch": 1.1575846833578791, + "grad_norm": 0.4369930922985077, + "learning_rate": 7.712045698991043e-06, + "loss": 0.4112, + "step": 1572 + }, + { + "epoch": 1.158321060382916, + "grad_norm": 0.42742857336997986, + "learning_rate": 7.708445007139785e-06, + "loss": 0.4092, + "step": 1573 + }, + { + "epoch": 1.1590574374079528, + "grad_norm": 0.41142895817756653, + "learning_rate": 7.70484232628836e-06, + "loss": 0.4413, + "step": 1574 + }, + { + "epoch": 1.1597938144329896, + "grad_norm": 0.4239861071109772, + "learning_rate": 7.70123765908247e-06, + "loss": 0.4047, + "step": 1575 + }, + { + "epoch": 1.1605301914580266, + "grad_norm": 0.41946011781692505, + "learning_rate": 7.69763100816927e-06, + "loss": 0.3995, + "step": 1576 + }, + { + "epoch": 1.1612665684830634, + "grad_norm": 0.41380631923675537, + "learning_rate": 7.69402237619738e-06, + "loss": 0.4251, + "step": 1577 + }, + { + "epoch": 1.1620029455081002, + "grad_norm": 0.4237883985042572, + "learning_rate": 7.690411765816864e-06, + "loss": 0.4196, + "step": 1578 + }, + { + "epoch": 1.162739322533137, + "grad_norm": 0.4093558192253113, + "learning_rate": 7.68679917967925e-06, + "loss": 0.412, + "step": 1579 + }, + { + "epoch": 1.1634756995581739, + "grad_norm": 0.41598761081695557, + "learning_rate": 7.683184620437511e-06, + "loss": 0.4031, + "step": 1580 + }, + { + "epoch": 1.1642120765832107, + "grad_norm": 0.43564292788505554, + "learning_rate": 7.67956809074607e-06, + "loss": 0.397, + "step": 1581 + }, + { + "epoch": 1.1649484536082475, + "grad_norm": 0.40577253699302673, + "learning_rate": 7.675949593260797e-06, + "loss": 0.3964, + "step": 1582 + }, + { + "epoch": 1.1656848306332843, + "grad_norm": 0.43315377831459045, + "learning_rate": 7.672329130639007e-06, + "loss": 0.4423, + "step": 1583 + }, + { + "epoch": 1.1664212076583211, + "grad_norm": 0.5157422423362732, + "learning_rate": 7.668706705539458e-06, + "loss": 0.4398, + "step": 1584 + }, + { + "epoch": 1.167157584683358, + "grad_norm": 0.4576352834701538, + "learning_rate": 7.66508232062235e-06, + "loss": 0.4075, + "step": 1585 + }, + { + "epoch": 1.1678939617083948, + "grad_norm": 0.3952282965183258, + "learning_rate": 7.661455978549322e-06, + "loss": 0.4418, + "step": 1586 + }, + { + "epoch": 1.1686303387334316, + "grad_norm": 0.40662047266960144, + "learning_rate": 7.657827681983448e-06, + "loss": 0.3875, + "step": 1587 + }, + { + "epoch": 1.1693667157584684, + "grad_norm": 0.46831056475639343, + "learning_rate": 7.654197433589243e-06, + "loss": 0.4206, + "step": 1588 + }, + { + "epoch": 1.1701030927835052, + "grad_norm": 0.44352883100509644, + "learning_rate": 7.65056523603265e-06, + "loss": 0.3882, + "step": 1589 + }, + { + "epoch": 1.170839469808542, + "grad_norm": 0.4131197929382324, + "learning_rate": 7.646931091981045e-06, + "loss": 0.4203, + "step": 1590 + }, + { + "epoch": 1.1715758468335788, + "grad_norm": 0.543073296546936, + "learning_rate": 7.643295004103232e-06, + "loss": 0.4035, + "step": 1591 + }, + { + "epoch": 1.1723122238586157, + "grad_norm": 0.42148828506469727, + "learning_rate": 7.639656975069447e-06, + "loss": 0.4257, + "step": 1592 + }, + { + "epoch": 1.1730486008836525, + "grad_norm": 0.45353934168815613, + "learning_rate": 7.636017007551349e-06, + "loss": 0.4306, + "step": 1593 + }, + { + "epoch": 1.1737849779086893, + "grad_norm": 0.42976027727127075, + "learning_rate": 7.632375104222014e-06, + "loss": 0.3962, + "step": 1594 + }, + { + "epoch": 1.1745213549337261, + "grad_norm": 0.4995291829109192, + "learning_rate": 7.628731267755952e-06, + "loss": 0.3974, + "step": 1595 + }, + { + "epoch": 1.175257731958763, + "grad_norm": 0.4624010920524597, + "learning_rate": 7.6250855008290856e-06, + "loss": 0.4175, + "step": 1596 + }, + { + "epoch": 1.1759941089837997, + "grad_norm": 0.46264103055000305, + "learning_rate": 7.6214378061187546e-06, + "loss": 0.3981, + "step": 1597 + }, + { + "epoch": 1.1767304860088366, + "grad_norm": 0.48678502440452576, + "learning_rate": 7.617788186303714e-06, + "loss": 0.4185, + "step": 1598 + }, + { + "epoch": 1.1774668630338734, + "grad_norm": 0.4638168513774872, + "learning_rate": 7.6141366440641365e-06, + "loss": 0.4137, + "step": 1599 + }, + { + "epoch": 1.1782032400589102, + "grad_norm": 0.4336228668689728, + "learning_rate": 7.610483182081607e-06, + "loss": 0.4418, + "step": 1600 + }, + { + "epoch": 1.178939617083947, + "grad_norm": 0.47200411558151245, + "learning_rate": 7.606827803039112e-06, + "loss": 0.4105, + "step": 1601 + }, + { + "epoch": 1.1796759941089838, + "grad_norm": 0.5027320981025696, + "learning_rate": 7.603170509621054e-06, + "loss": 0.4476, + "step": 1602 + }, + { + "epoch": 1.1804123711340206, + "grad_norm": 0.4112722873687744, + "learning_rate": 7.5995113045132395e-06, + "loss": 0.4147, + "step": 1603 + }, + { + "epoch": 1.1811487481590575, + "grad_norm": 0.4506515562534332, + "learning_rate": 7.595850190402877e-06, + "loss": 0.4221, + "step": 1604 + }, + { + "epoch": 1.1818851251840943, + "grad_norm": 0.4421546757221222, + "learning_rate": 7.59218716997858e-06, + "loss": 0.4313, + "step": 1605 + }, + { + "epoch": 1.182621502209131, + "grad_norm": 0.4537235200405121, + "learning_rate": 7.588522245930357e-06, + "loss": 0.4175, + "step": 1606 + }, + { + "epoch": 1.183357879234168, + "grad_norm": 0.4480210244655609, + "learning_rate": 7.584855420949619e-06, + "loss": 0.4213, + "step": 1607 + }, + { + "epoch": 1.1840942562592047, + "grad_norm": 0.408025324344635, + "learning_rate": 7.581186697729172e-06, + "loss": 0.4099, + "step": 1608 + }, + { + "epoch": 1.1848306332842415, + "grad_norm": 0.45869341492652893, + "learning_rate": 7.577516078963215e-06, + "loss": 0.4143, + "step": 1609 + }, + { + "epoch": 1.1855670103092784, + "grad_norm": 0.48638081550598145, + "learning_rate": 7.573843567347339e-06, + "loss": 0.4361, + "step": 1610 + }, + { + "epoch": 1.1863033873343152, + "grad_norm": 0.39438486099243164, + "learning_rate": 7.570169165578527e-06, + "loss": 0.4394, + "step": 1611 + }, + { + "epoch": 1.187039764359352, + "grad_norm": 0.49038755893707275, + "learning_rate": 7.566492876355147e-06, + "loss": 0.416, + "step": 1612 + }, + { + "epoch": 1.1877761413843888, + "grad_norm": 0.4872884452342987, + "learning_rate": 7.562814702376955e-06, + "loss": 0.4071, + "step": 1613 + }, + { + "epoch": 1.1885125184094256, + "grad_norm": 0.4326101541519165, + "learning_rate": 7.559134646345092e-06, + "loss": 0.4061, + "step": 1614 + }, + { + "epoch": 1.1892488954344624, + "grad_norm": 0.48473793268203735, + "learning_rate": 7.5554527109620775e-06, + "loss": 0.4206, + "step": 1615 + }, + { + "epoch": 1.1899852724594993, + "grad_norm": 0.43738964200019836, + "learning_rate": 7.551768898931816e-06, + "loss": 0.4113, + "step": 1616 + }, + { + "epoch": 1.190721649484536, + "grad_norm": 0.41719716787338257, + "learning_rate": 7.548083212959588e-06, + "loss": 0.4126, + "step": 1617 + }, + { + "epoch": 1.1914580265095729, + "grad_norm": 0.4838084280490875, + "learning_rate": 7.5443956557520485e-06, + "loss": 0.4494, + "step": 1618 + }, + { + "epoch": 1.1921944035346097, + "grad_norm": 0.4023309051990509, + "learning_rate": 7.540706230017227e-06, + "loss": 0.4212, + "step": 1619 + }, + { + "epoch": 1.1929307805596465, + "grad_norm": 0.42563098669052124, + "learning_rate": 7.537014938464529e-06, + "loss": 0.378, + "step": 1620 + }, + { + "epoch": 1.1936671575846833, + "grad_norm": 0.4844202697277069, + "learning_rate": 7.533321783804726e-06, + "loss": 0.4024, + "step": 1621 + }, + { + "epoch": 1.1944035346097202, + "grad_norm": 0.39639124274253845, + "learning_rate": 7.529626768749958e-06, + "loss": 0.4082, + "step": 1622 + }, + { + "epoch": 1.195139911634757, + "grad_norm": 0.45201244950294495, + "learning_rate": 7.525929896013735e-06, + "loss": 0.4252, + "step": 1623 + }, + { + "epoch": 1.1958762886597938, + "grad_norm": 0.463316947221756, + "learning_rate": 7.5222311683109265e-06, + "loss": 0.417, + "step": 1624 + }, + { + "epoch": 1.1966126656848306, + "grad_norm": 0.42931288480758667, + "learning_rate": 7.518530588357769e-06, + "loss": 0.4369, + "step": 1625 + }, + { + "epoch": 1.1973490427098674, + "grad_norm": 0.44762808084487915, + "learning_rate": 7.514828158871852e-06, + "loss": 0.3949, + "step": 1626 + }, + { + "epoch": 1.1980854197349042, + "grad_norm": 0.43206796050071716, + "learning_rate": 7.511123882572133e-06, + "loss": 0.4018, + "step": 1627 + }, + { + "epoch": 1.198821796759941, + "grad_norm": 0.41934624314308167, + "learning_rate": 7.5074177621789155e-06, + "loss": 0.4152, + "step": 1628 + }, + { + "epoch": 1.1995581737849779, + "grad_norm": 0.41448119282722473, + "learning_rate": 7.503709800413868e-06, + "loss": 0.4424, + "step": 1629 + }, + { + "epoch": 1.2002945508100147, + "grad_norm": 0.4471980333328247, + "learning_rate": 7.500000000000001e-06, + "loss": 0.417, + "step": 1630 + }, + { + "epoch": 1.2010309278350515, + "grad_norm": 0.4638178050518036, + "learning_rate": 7.496288363661681e-06, + "loss": 0.431, + "step": 1631 + }, + { + "epoch": 1.2017673048600883, + "grad_norm": 0.4889911115169525, + "learning_rate": 7.492574894124624e-06, + "loss": 0.4159, + "step": 1632 + }, + { + "epoch": 1.2025036818851251, + "grad_norm": 0.4356183707714081, + "learning_rate": 7.4888595941158844e-06, + "loss": 0.4258, + "step": 1633 + }, + { + "epoch": 1.203240058910162, + "grad_norm": 0.4645734429359436, + "learning_rate": 7.485142466363873e-06, + "loss": 0.3947, + "step": 1634 + }, + { + "epoch": 1.2039764359351988, + "grad_norm": 0.44492483139038086, + "learning_rate": 7.481423513598331e-06, + "loss": 0.4154, + "step": 1635 + }, + { + "epoch": 1.2047128129602356, + "grad_norm": 0.42611250281333923, + "learning_rate": 7.477702738550346e-06, + "loss": 0.4105, + "step": 1636 + }, + { + "epoch": 1.2054491899852724, + "grad_norm": 0.4502268135547638, + "learning_rate": 7.473980143952344e-06, + "loss": 0.3943, + "step": 1637 + }, + { + "epoch": 1.2061855670103092, + "grad_norm": 0.4995657801628113, + "learning_rate": 7.470255732538086e-06, + "loss": 0.401, + "step": 1638 + }, + { + "epoch": 1.206921944035346, + "grad_norm": 0.399395227432251, + "learning_rate": 7.466529507042666e-06, + "loss": 0.3953, + "step": 1639 + }, + { + "epoch": 1.2076583210603828, + "grad_norm": 0.472028523683548, + "learning_rate": 7.462801470202513e-06, + "loss": 0.412, + "step": 1640 + }, + { + "epoch": 1.2083946980854197, + "grad_norm": 0.44773492217063904, + "learning_rate": 7.459071624755382e-06, + "loss": 0.4121, + "step": 1641 + }, + { + "epoch": 1.2091310751104565, + "grad_norm": 0.4189164340496063, + "learning_rate": 7.455339973440361e-06, + "loss": 0.398, + "step": 1642 + }, + { + "epoch": 1.2098674521354933, + "grad_norm": 0.40162959694862366, + "learning_rate": 7.4516065189978625e-06, + "loss": 0.4068, + "step": 1643 + }, + { + "epoch": 1.21060382916053, + "grad_norm": 0.4244077503681183, + "learning_rate": 7.4478712641696194e-06, + "loss": 0.4342, + "step": 1644 + }, + { + "epoch": 1.211340206185567, + "grad_norm": 0.4497550427913666, + "learning_rate": 7.444134211698692e-06, + "loss": 0.4067, + "step": 1645 + }, + { + "epoch": 1.2120765832106037, + "grad_norm": 0.4071500301361084, + "learning_rate": 7.44039536432946e-06, + "loss": 0.4084, + "step": 1646 + }, + { + "epoch": 1.2128129602356406, + "grad_norm": 0.413670152425766, + "learning_rate": 7.43665472480762e-06, + "loss": 0.439, + "step": 1647 + }, + { + "epoch": 1.2135493372606774, + "grad_norm": 0.4371226727962494, + "learning_rate": 7.4329122958801806e-06, + "loss": 0.3991, + "step": 1648 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 0.4222114384174347, + "learning_rate": 7.4291680802954716e-06, + "loss": 0.4208, + "step": 1649 + }, + { + "epoch": 1.2150220913107512, + "grad_norm": 0.4362742304801941, + "learning_rate": 7.425422080803132e-06, + "loss": 0.3812, + "step": 1650 + }, + { + "epoch": 1.215758468335788, + "grad_norm": 0.4180275499820709, + "learning_rate": 7.42167430015411e-06, + "loss": 0.4073, + "step": 1651 + }, + { + "epoch": 1.2164948453608249, + "grad_norm": 0.42266976833343506, + "learning_rate": 7.417924741100662e-06, + "loss": 0.4104, + "step": 1652 + }, + { + "epoch": 1.2172312223858617, + "grad_norm": 0.44959259033203125, + "learning_rate": 7.414173406396351e-06, + "loss": 0.4324, + "step": 1653 + }, + { + "epoch": 1.2179675994108985, + "grad_norm": 0.42745327949523926, + "learning_rate": 7.410420298796045e-06, + "loss": 0.413, + "step": 1654 + }, + { + "epoch": 1.2187039764359353, + "grad_norm": 0.3998710811138153, + "learning_rate": 7.406665421055912e-06, + "loss": 0.4059, + "step": 1655 + }, + { + "epoch": 1.2194403534609721, + "grad_norm": 0.4011445641517639, + "learning_rate": 7.402908775933419e-06, + "loss": 0.4191, + "step": 1656 + }, + { + "epoch": 1.220176730486009, + "grad_norm": 0.44070056080818176, + "learning_rate": 7.399150366187336e-06, + "loss": 0.4215, + "step": 1657 + }, + { + "epoch": 1.2209131075110458, + "grad_norm": 0.4228590726852417, + "learning_rate": 7.395390194577724e-06, + "loss": 0.4107, + "step": 1658 + }, + { + "epoch": 1.2216494845360826, + "grad_norm": 0.3656286299228668, + "learning_rate": 7.391628263865939e-06, + "loss": 0.3822, + "step": 1659 + }, + { + "epoch": 1.2223858615611194, + "grad_norm": 0.4292581081390381, + "learning_rate": 7.387864576814628e-06, + "loss": 0.4188, + "step": 1660 + }, + { + "epoch": 1.2231222385861562, + "grad_norm": 0.39621493220329285, + "learning_rate": 7.3840991361877326e-06, + "loss": 0.3921, + "step": 1661 + }, + { + "epoch": 1.223858615611193, + "grad_norm": 0.4454437792301178, + "learning_rate": 7.380331944750476e-06, + "loss": 0.4097, + "step": 1662 + }, + { + "epoch": 1.2245949926362298, + "grad_norm": 0.39751824736595154, + "learning_rate": 7.37656300526937e-06, + "loss": 0.4232, + "step": 1663 + }, + { + "epoch": 1.2253313696612667, + "grad_norm": 0.4180833399295807, + "learning_rate": 7.37279232051221e-06, + "loss": 0.4162, + "step": 1664 + }, + { + "epoch": 1.2260677466863035, + "grad_norm": 0.41712233424186707, + "learning_rate": 7.369019893248074e-06, + "loss": 0.4304, + "step": 1665 + }, + { + "epoch": 1.2268041237113403, + "grad_norm": 0.38904157280921936, + "learning_rate": 7.365245726247316e-06, + "loss": 0.427, + "step": 1666 + }, + { + "epoch": 1.227540500736377, + "grad_norm": 0.41479575634002686, + "learning_rate": 7.361469822281573e-06, + "loss": 0.4401, + "step": 1667 + }, + { + "epoch": 1.228276877761414, + "grad_norm": 0.3810448944568634, + "learning_rate": 7.3576921841237535e-06, + "loss": 0.4167, + "step": 1668 + }, + { + "epoch": 1.2290132547864507, + "grad_norm": 0.42002904415130615, + "learning_rate": 7.353912814548042e-06, + "loss": 0.4009, + "step": 1669 + }, + { + "epoch": 1.2297496318114876, + "grad_norm": 0.3993929326534271, + "learning_rate": 7.350131716329891e-06, + "loss": 0.4106, + "step": 1670 + }, + { + "epoch": 1.2304860088365244, + "grad_norm": 0.4273282587528229, + "learning_rate": 7.346348892246026e-06, + "loss": 0.4382, + "step": 1671 + }, + { + "epoch": 1.2312223858615612, + "grad_norm": 0.43270593881607056, + "learning_rate": 7.342564345074441e-06, + "loss": 0.4107, + "step": 1672 + }, + { + "epoch": 1.231958762886598, + "grad_norm": 0.40336766839027405, + "learning_rate": 7.338778077594388e-06, + "loss": 0.4238, + "step": 1673 + }, + { + "epoch": 1.2326951399116348, + "grad_norm": 0.39280226826667786, + "learning_rate": 7.33499009258639e-06, + "loss": 0.3768, + "step": 1674 + }, + { + "epoch": 1.2334315169366716, + "grad_norm": 0.4784250557422638, + "learning_rate": 7.331200392832232e-06, + "loss": 0.4272, + "step": 1675 + }, + { + "epoch": 1.2341678939617085, + "grad_norm": 0.3877667188644409, + "learning_rate": 7.32740898111495e-06, + "loss": 0.3862, + "step": 1676 + }, + { + "epoch": 1.2349042709867453, + "grad_norm": 0.4045974016189575, + "learning_rate": 7.323615860218844e-06, + "loss": 0.4099, + "step": 1677 + }, + { + "epoch": 1.235640648011782, + "grad_norm": 0.4475846588611603, + "learning_rate": 7.319821032929467e-06, + "loss": 0.4142, + "step": 1678 + }, + { + "epoch": 1.236377025036819, + "grad_norm": 0.4150718152523041, + "learning_rate": 7.316024502033627e-06, + "loss": 0.428, + "step": 1679 + }, + { + "epoch": 1.2371134020618557, + "grad_norm": 0.4547623097896576, + "learning_rate": 7.31222627031938e-06, + "loss": 0.4304, + "step": 1680 + }, + { + "epoch": 1.2378497790868925, + "grad_norm": 0.4688529968261719, + "learning_rate": 7.308426340576034e-06, + "loss": 0.3903, + "step": 1681 + }, + { + "epoch": 1.2385861561119293, + "grad_norm": 0.5233607888221741, + "learning_rate": 7.30462471559414e-06, + "loss": 0.3849, + "step": 1682 + }, + { + "epoch": 1.2393225331369662, + "grad_norm": 0.43747377395629883, + "learning_rate": 7.3008213981655005e-06, + "loss": 0.3894, + "step": 1683 + }, + { + "epoch": 1.240058910162003, + "grad_norm": 0.40247631072998047, + "learning_rate": 7.297016391083154e-06, + "loss": 0.376, + "step": 1684 + }, + { + "epoch": 1.2407952871870398, + "grad_norm": 0.49333998560905457, + "learning_rate": 7.2932096971413815e-06, + "loss": 0.4105, + "step": 1685 + }, + { + "epoch": 1.2415316642120766, + "grad_norm": 0.4000438153743744, + "learning_rate": 7.289401319135707e-06, + "loss": 0.3952, + "step": 1686 + }, + { + "epoch": 1.2422680412371134, + "grad_norm": 0.44068193435668945, + "learning_rate": 7.285591259862888e-06, + "loss": 0.379, + "step": 1687 + }, + { + "epoch": 1.2430044182621502, + "grad_norm": 0.4409189820289612, + "learning_rate": 7.281779522120914e-06, + "loss": 0.4173, + "step": 1688 + }, + { + "epoch": 1.243740795287187, + "grad_norm": 0.4082186222076416, + "learning_rate": 7.277966108709014e-06, + "loss": 0.407, + "step": 1689 + }, + { + "epoch": 1.2444771723122239, + "grad_norm": 0.40812796354293823, + "learning_rate": 7.27415102242764e-06, + "loss": 0.4243, + "step": 1690 + }, + { + "epoch": 1.2452135493372607, + "grad_norm": 0.41487666964530945, + "learning_rate": 7.2703342660784785e-06, + "loss": 0.4032, + "step": 1691 + }, + { + "epoch": 1.2459499263622975, + "grad_norm": 0.44201499223709106, + "learning_rate": 7.266515842464438e-06, + "loss": 0.448, + "step": 1692 + }, + { + "epoch": 1.2466863033873343, + "grad_norm": 0.4068033695220947, + "learning_rate": 7.262695754389655e-06, + "loss": 0.4022, + "step": 1693 + }, + { + "epoch": 1.2474226804123711, + "grad_norm": 0.3832601010799408, + "learning_rate": 7.258874004659487e-06, + "loss": 0.4315, + "step": 1694 + }, + { + "epoch": 1.248159057437408, + "grad_norm": 0.4322513937950134, + "learning_rate": 7.25505059608051e-06, + "loss": 0.4123, + "step": 1695 + }, + { + "epoch": 1.2488954344624448, + "grad_norm": 0.3983667194843292, + "learning_rate": 7.25122553146052e-06, + "loss": 0.3817, + "step": 1696 + }, + { + "epoch": 1.2496318114874816, + "grad_norm": 0.44032666087150574, + "learning_rate": 7.247398813608531e-06, + "loss": 0.3921, + "step": 1697 + }, + { + "epoch": 1.2503681885125184, + "grad_norm": 0.451885461807251, + "learning_rate": 7.243570445334767e-06, + "loss": 0.4332, + "step": 1698 + }, + { + "epoch": 1.2511045655375552, + "grad_norm": 0.46099600195884705, + "learning_rate": 7.239740429450664e-06, + "loss": 0.4037, + "step": 1699 + }, + { + "epoch": 1.251840942562592, + "grad_norm": 0.4341306984424591, + "learning_rate": 7.235908768768875e-06, + "loss": 0.3931, + "step": 1700 + }, + { + "epoch": 1.2525773195876289, + "grad_norm": 0.4982093274593353, + "learning_rate": 7.232075466103254e-06, + "loss": 0.4291, + "step": 1701 + }, + { + "epoch": 1.2533136966126657, + "grad_norm": 0.4553844928741455, + "learning_rate": 7.228240524268858e-06, + "loss": 0.4011, + "step": 1702 + }, + { + "epoch": 1.2540500736377025, + "grad_norm": 0.43320947885513306, + "learning_rate": 7.224403946081957e-06, + "loss": 0.4023, + "step": 1703 + }, + { + "epoch": 1.2547864506627393, + "grad_norm": 0.46215543150901794, + "learning_rate": 7.220565734360019e-06, + "loss": 0.4213, + "step": 1704 + }, + { + "epoch": 1.2555228276877761, + "grad_norm": 0.4323497712612152, + "learning_rate": 7.216725891921707e-06, + "loss": 0.4226, + "step": 1705 + }, + { + "epoch": 1.256259204712813, + "grad_norm": 0.47635823488235474, + "learning_rate": 7.212884421586889e-06, + "loss": 0.399, + "step": 1706 + }, + { + "epoch": 1.2569955817378498, + "grad_norm": 0.40797707438468933, + "learning_rate": 7.20904132617662e-06, + "loss": 0.3783, + "step": 1707 + }, + { + "epoch": 1.2577319587628866, + "grad_norm": 0.5090800523757935, + "learning_rate": 7.2051966085131584e-06, + "loss": 0.4148, + "step": 1708 + }, + { + "epoch": 1.2584683357879234, + "grad_norm": 0.5189807415008545, + "learning_rate": 7.201350271419945e-06, + "loss": 0.4367, + "step": 1709 + }, + { + "epoch": 1.2592047128129602, + "grad_norm": 0.40211692452430725, + "learning_rate": 7.197502317721616e-06, + "loss": 0.3966, + "step": 1710 + }, + { + "epoch": 1.259941089837997, + "grad_norm": 0.45923155546188354, + "learning_rate": 7.19365275024399e-06, + "loss": 0.4394, + "step": 1711 + }, + { + "epoch": 1.2606774668630338, + "grad_norm": 0.48686525225639343, + "learning_rate": 7.189801571814075e-06, + "loss": 0.4252, + "step": 1712 + }, + { + "epoch": 1.2614138438880707, + "grad_norm": 0.43244504928588867, + "learning_rate": 7.185948785260058e-06, + "loss": 0.3866, + "step": 1713 + }, + { + "epoch": 1.2621502209131075, + "grad_norm": 0.4492274820804596, + "learning_rate": 7.182094393411312e-06, + "loss": 0.423, + "step": 1714 + }, + { + "epoch": 1.2628865979381443, + "grad_norm": 0.404514878988266, + "learning_rate": 7.178238399098381e-06, + "loss": 0.3901, + "step": 1715 + }, + { + "epoch": 1.263622974963181, + "grad_norm": 0.41889336705207825, + "learning_rate": 7.174380805152997e-06, + "loss": 0.402, + "step": 1716 + }, + { + "epoch": 1.264359351988218, + "grad_norm": 0.4595990777015686, + "learning_rate": 7.170521614408056e-06, + "loss": 0.3852, + "step": 1717 + }, + { + "epoch": 1.2650957290132547, + "grad_norm": 0.422296941280365, + "learning_rate": 7.166660829697633e-06, + "loss": 0.4092, + "step": 1718 + }, + { + "epoch": 1.2658321060382915, + "grad_norm": 0.49134594202041626, + "learning_rate": 7.162798453856969e-06, + "loss": 0.4246, + "step": 1719 + }, + { + "epoch": 1.2665684830633284, + "grad_norm": 0.4637044072151184, + "learning_rate": 7.1589344897224795e-06, + "loss": 0.3807, + "step": 1720 + }, + { + "epoch": 1.2673048600883652, + "grad_norm": 0.49215713143348694, + "learning_rate": 7.155068940131741e-06, + "loss": 0.4121, + "step": 1721 + }, + { + "epoch": 1.268041237113402, + "grad_norm": 0.46819546818733215, + "learning_rate": 7.151201807923497e-06, + "loss": 0.4045, + "step": 1722 + }, + { + "epoch": 1.2687776141384388, + "grad_norm": 0.4482595920562744, + "learning_rate": 7.1473330959376515e-06, + "loss": 0.3915, + "step": 1723 + }, + { + "epoch": 1.2695139911634756, + "grad_norm": 0.5111578702926636, + "learning_rate": 7.143462807015271e-06, + "loss": 0.4096, + "step": 1724 + }, + { + "epoch": 1.2702503681885124, + "grad_norm": 0.3854685425758362, + "learning_rate": 7.139590943998579e-06, + "loss": 0.4059, + "step": 1725 + }, + { + "epoch": 1.2709867452135493, + "grad_norm": 0.542492151260376, + "learning_rate": 7.135717509730953e-06, + "loss": 0.4306, + "step": 1726 + }, + { + "epoch": 1.271723122238586, + "grad_norm": 0.4321967661380768, + "learning_rate": 7.1318425070569284e-06, + "loss": 0.3901, + "step": 1727 + }, + { + "epoch": 1.272459499263623, + "grad_norm": 0.41840946674346924, + "learning_rate": 7.127965938822187e-06, + "loss": 0.4079, + "step": 1728 + }, + { + "epoch": 1.2731958762886597, + "grad_norm": 0.3965054750442505, + "learning_rate": 7.124087807873565e-06, + "loss": 0.4081, + "step": 1729 + }, + { + "epoch": 1.2739322533136965, + "grad_norm": 0.4440041184425354, + "learning_rate": 7.1202081170590455e-06, + "loss": 0.4106, + "step": 1730 + }, + { + "epoch": 1.2746686303387333, + "grad_norm": 0.4273045063018799, + "learning_rate": 7.116326869227755e-06, + "loss": 0.3929, + "step": 1731 + }, + { + "epoch": 1.2754050073637702, + "grad_norm": 0.3961130976676941, + "learning_rate": 7.112444067229966e-06, + "loss": 0.4022, + "step": 1732 + }, + { + "epoch": 1.276141384388807, + "grad_norm": 0.3908890187740326, + "learning_rate": 7.108559713917089e-06, + "loss": 0.3974, + "step": 1733 + }, + { + "epoch": 1.2768777614138438, + "grad_norm": 0.4090268909931183, + "learning_rate": 7.104673812141676e-06, + "loss": 0.4551, + "step": 1734 + }, + { + "epoch": 1.2776141384388806, + "grad_norm": 0.38256412744522095, + "learning_rate": 7.100786364757417e-06, + "loss": 0.4439, + "step": 1735 + }, + { + "epoch": 1.2783505154639174, + "grad_norm": 0.37507322430610657, + "learning_rate": 7.096897374619134e-06, + "loss": 0.4087, + "step": 1736 + }, + { + "epoch": 1.2790868924889542, + "grad_norm": 0.39156273007392883, + "learning_rate": 7.093006844582787e-06, + "loss": 0.3903, + "step": 1737 + }, + { + "epoch": 1.279823269513991, + "grad_norm": 0.37378448247909546, + "learning_rate": 7.08911477750546e-06, + "loss": 0.4178, + "step": 1738 + }, + { + "epoch": 1.2805596465390279, + "grad_norm": 0.4035623371601105, + "learning_rate": 7.085221176245374e-06, + "loss": 0.4079, + "step": 1739 + }, + { + "epoch": 1.2812960235640647, + "grad_norm": 0.4087090492248535, + "learning_rate": 7.081326043661867e-06, + "loss": 0.39, + "step": 1740 + }, + { + "epoch": 1.2820324005891015, + "grad_norm": 0.43084970116615295, + "learning_rate": 7.0774293826154095e-06, + "loss": 0.4189, + "step": 1741 + }, + { + "epoch": 1.2827687776141383, + "grad_norm": 0.40026846528053284, + "learning_rate": 7.073531195967591e-06, + "loss": 0.4454, + "step": 1742 + }, + { + "epoch": 1.2835051546391751, + "grad_norm": 0.39230623841285706, + "learning_rate": 7.069631486581123e-06, + "loss": 0.4106, + "step": 1743 + }, + { + "epoch": 1.284241531664212, + "grad_norm": 0.39439305663108826, + "learning_rate": 7.065730257319832e-06, + "loss": 0.4, + "step": 1744 + }, + { + "epoch": 1.2849779086892488, + "grad_norm": 0.37384194135665894, + "learning_rate": 7.061827511048666e-06, + "loss": 0.3976, + "step": 1745 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.4040299355983734, + "learning_rate": 7.057923250633681e-06, + "loss": 0.4322, + "step": 1746 + }, + { + "epoch": 1.2864506627393224, + "grad_norm": 0.42770475149154663, + "learning_rate": 7.054017478942048e-06, + "loss": 0.4132, + "step": 1747 + }, + { + "epoch": 1.2871870397643592, + "grad_norm": 0.43052366375923157, + "learning_rate": 7.050110198842052e-06, + "loss": 0.4313, + "step": 1748 + }, + { + "epoch": 1.287923416789396, + "grad_norm": 0.3632519841194153, + "learning_rate": 7.046201413203076e-06, + "loss": 0.4088, + "step": 1749 + }, + { + "epoch": 1.2886597938144329, + "grad_norm": 0.3969815671443939, + "learning_rate": 7.042291124895615e-06, + "loss": 0.4149, + "step": 1750 + }, + { + "epoch": 1.28939617083947, + "grad_norm": 0.45275336503982544, + "learning_rate": 7.038379336791269e-06, + "loss": 0.4252, + "step": 1751 + }, + { + "epoch": 1.2901325478645067, + "grad_norm": 0.4112440049648285, + "learning_rate": 7.034466051762736e-06, + "loss": 0.4302, + "step": 1752 + }, + { + "epoch": 1.2908689248895435, + "grad_norm": 0.4475271999835968, + "learning_rate": 7.030551272683814e-06, + "loss": 0.4077, + "step": 1753 + }, + { + "epoch": 1.2916053019145803, + "grad_norm": 0.4204852283000946, + "learning_rate": 7.026635002429399e-06, + "loss": 0.4264, + "step": 1754 + }, + { + "epoch": 1.2923416789396172, + "grad_norm": 0.44256094098091125, + "learning_rate": 7.02271724387548e-06, + "loss": 0.4211, + "step": 1755 + }, + { + "epoch": 1.293078055964654, + "grad_norm": 0.4253244698047638, + "learning_rate": 7.018797999899142e-06, + "loss": 0.4134, + "step": 1756 + }, + { + "epoch": 1.2938144329896908, + "grad_norm": 0.42249786853790283, + "learning_rate": 7.014877273378557e-06, + "loss": 0.4062, + "step": 1757 + }, + { + "epoch": 1.2945508100147276, + "grad_norm": 0.42850828170776367, + "learning_rate": 7.010955067192991e-06, + "loss": 0.4063, + "step": 1758 + }, + { + "epoch": 1.2952871870397644, + "grad_norm": 0.4714277386665344, + "learning_rate": 7.0070313842227946e-06, + "loss": 0.409, + "step": 1759 + }, + { + "epoch": 1.2960235640648012, + "grad_norm": 0.3643467426300049, + "learning_rate": 7.003106227349399e-06, + "loss": 0.4297, + "step": 1760 + }, + { + "epoch": 1.296759941089838, + "grad_norm": 0.4631706178188324, + "learning_rate": 6.999179599455321e-06, + "loss": 0.4398, + "step": 1761 + }, + { + "epoch": 1.2974963181148749, + "grad_norm": 0.37905317544937134, + "learning_rate": 6.995251503424158e-06, + "loss": 0.3931, + "step": 1762 + }, + { + "epoch": 1.2982326951399117, + "grad_norm": 0.4009278118610382, + "learning_rate": 6.991321942140587e-06, + "loss": 0.4023, + "step": 1763 + }, + { + "epoch": 1.2989690721649485, + "grad_norm": 0.4024566113948822, + "learning_rate": 6.987390918490356e-06, + "loss": 0.4188, + "step": 1764 + }, + { + "epoch": 1.2997054491899853, + "grad_norm": 0.4234688878059387, + "learning_rate": 6.983458435360291e-06, + "loss": 0.4308, + "step": 1765 + }, + { + "epoch": 1.3004418262150221, + "grad_norm": 0.4323263466358185, + "learning_rate": 6.9795244956382904e-06, + "loss": 0.3954, + "step": 1766 + }, + { + "epoch": 1.301178203240059, + "grad_norm": 0.4084095358848572, + "learning_rate": 6.975589102213318e-06, + "loss": 0.3908, + "step": 1767 + }, + { + "epoch": 1.3019145802650958, + "grad_norm": 0.3725346624851227, + "learning_rate": 6.97165225797541e-06, + "loss": 0.4085, + "step": 1768 + }, + { + "epoch": 1.3026509572901326, + "grad_norm": 0.4671725928783417, + "learning_rate": 6.9677139658156656e-06, + "loss": 0.4093, + "step": 1769 + }, + { + "epoch": 1.3033873343151694, + "grad_norm": 0.42323094606399536, + "learning_rate": 6.963774228626246e-06, + "loss": 0.371, + "step": 1770 + }, + { + "epoch": 1.3041237113402062, + "grad_norm": 0.37016546726226807, + "learning_rate": 6.959833049300376e-06, + "loss": 0.4009, + "step": 1771 + }, + { + "epoch": 1.304860088365243, + "grad_norm": 0.4504646360874176, + "learning_rate": 6.955890430732338e-06, + "loss": 0.3951, + "step": 1772 + }, + { + "epoch": 1.3055964653902798, + "grad_norm": 0.41494229435920715, + "learning_rate": 6.9519463758174745e-06, + "loss": 0.4112, + "step": 1773 + }, + { + "epoch": 1.3063328424153167, + "grad_norm": 0.3828340768814087, + "learning_rate": 6.948000887452177e-06, + "loss": 0.4081, + "step": 1774 + }, + { + "epoch": 1.3070692194403535, + "grad_norm": 0.4123505651950836, + "learning_rate": 6.944053968533895e-06, + "loss": 0.4267, + "step": 1775 + }, + { + "epoch": 1.3078055964653903, + "grad_norm": 0.42603743076324463, + "learning_rate": 6.9401056219611255e-06, + "loss": 0.3866, + "step": 1776 + }, + { + "epoch": 1.3085419734904271, + "grad_norm": 0.48614487051963806, + "learning_rate": 6.936155850633417e-06, + "loss": 0.4409, + "step": 1777 + }, + { + "epoch": 1.309278350515464, + "grad_norm": 0.37237557768821716, + "learning_rate": 6.932204657451358e-06, + "loss": 0.4096, + "step": 1778 + }, + { + "epoch": 1.3100147275405007, + "grad_norm": 0.45322996377944946, + "learning_rate": 6.928252045316588e-06, + "loss": 0.4214, + "step": 1779 + }, + { + "epoch": 1.3107511045655376, + "grad_norm": 0.3440614640712738, + "learning_rate": 6.924298017131786e-06, + "loss": 0.3917, + "step": 1780 + }, + { + "epoch": 1.3114874815905744, + "grad_norm": 0.38658320903778076, + "learning_rate": 6.920342575800672e-06, + "loss": 0.419, + "step": 1781 + }, + { + "epoch": 1.3122238586156112, + "grad_norm": 0.4042324721813202, + "learning_rate": 6.916385724227998e-06, + "loss": 0.4234, + "step": 1782 + }, + { + "epoch": 1.312960235640648, + "grad_norm": 0.41090258955955505, + "learning_rate": 6.912427465319561e-06, + "loss": 0.4129, + "step": 1783 + }, + { + "epoch": 1.3136966126656848, + "grad_norm": 0.378579705953598, + "learning_rate": 6.908467801982186e-06, + "loss": 0.3884, + "step": 1784 + }, + { + "epoch": 1.3144329896907216, + "grad_norm": 0.43072155117988586, + "learning_rate": 6.9045067371237285e-06, + "loss": 0.4319, + "step": 1785 + }, + { + "epoch": 1.3151693667157585, + "grad_norm": 0.42852944135665894, + "learning_rate": 6.9005442736530745e-06, + "loss": 0.4116, + "step": 1786 + }, + { + "epoch": 1.3159057437407953, + "grad_norm": 0.40988779067993164, + "learning_rate": 6.8965804144801386e-06, + "loss": 0.425, + "step": 1787 + }, + { + "epoch": 1.316642120765832, + "grad_norm": 0.4157440960407257, + "learning_rate": 6.89261516251586e-06, + "loss": 0.4266, + "step": 1788 + }, + { + "epoch": 1.317378497790869, + "grad_norm": 0.46512264013290405, + "learning_rate": 6.888648520672198e-06, + "loss": 0.4201, + "step": 1789 + }, + { + "epoch": 1.3181148748159057, + "grad_norm": 0.496389776468277, + "learning_rate": 6.8846804918621355e-06, + "loss": 0.4487, + "step": 1790 + }, + { + "epoch": 1.3188512518409425, + "grad_norm": 0.3850422501564026, + "learning_rate": 6.880711078999673e-06, + "loss": 0.4016, + "step": 1791 + }, + { + "epoch": 1.3195876288659794, + "grad_norm": 0.44369614124298096, + "learning_rate": 6.876740284999828e-06, + "loss": 0.4188, + "step": 1792 + }, + { + "epoch": 1.3203240058910162, + "grad_norm": 0.43615850806236267, + "learning_rate": 6.872768112778629e-06, + "loss": 0.4164, + "step": 1793 + }, + { + "epoch": 1.321060382916053, + "grad_norm": 0.46753209829330444, + "learning_rate": 6.868794565253123e-06, + "loss": 0.4041, + "step": 1794 + }, + { + "epoch": 1.3217967599410898, + "grad_norm": 0.4193538725376129, + "learning_rate": 6.864819645341361e-06, + "loss": 0.3932, + "step": 1795 + }, + { + "epoch": 1.3225331369661266, + "grad_norm": 0.437418669462204, + "learning_rate": 6.860843355962403e-06, + "loss": 0.3834, + "step": 1796 + }, + { + "epoch": 1.3232695139911634, + "grad_norm": 0.4375527799129486, + "learning_rate": 6.856865700036317e-06, + "loss": 0.4254, + "step": 1797 + }, + { + "epoch": 1.3240058910162003, + "grad_norm": 0.4403863251209259, + "learning_rate": 6.852886680484175e-06, + "loss": 0.4204, + "step": 1798 + }, + { + "epoch": 1.324742268041237, + "grad_norm": 0.39266374707221985, + "learning_rate": 6.848906300228047e-06, + "loss": 0.3803, + "step": 1799 + }, + { + "epoch": 1.3254786450662739, + "grad_norm": 0.41610684990882874, + "learning_rate": 6.844924562191003e-06, + "loss": 0.3783, + "step": 1800 + }, + { + "epoch": 1.3262150220913107, + "grad_norm": 0.39313656091690063, + "learning_rate": 6.8409414692971125e-06, + "loss": 0.3979, + "step": 1801 + }, + { + "epoch": 1.3269513991163475, + "grad_norm": 0.3985959589481354, + "learning_rate": 6.836957024471439e-06, + "loss": 0.4142, + "step": 1802 + }, + { + "epoch": 1.3276877761413843, + "grad_norm": 0.38051387667655945, + "learning_rate": 6.832971230640037e-06, + "loss": 0.4066, + "step": 1803 + }, + { + "epoch": 1.3284241531664212, + "grad_norm": 0.4025249481201172, + "learning_rate": 6.828984090729954e-06, + "loss": 0.3993, + "step": 1804 + }, + { + "epoch": 1.329160530191458, + "grad_norm": 0.39893102645874023, + "learning_rate": 6.8249956076692235e-06, + "loss": 0.4082, + "step": 1805 + }, + { + "epoch": 1.3298969072164948, + "grad_norm": 0.3757968246936798, + "learning_rate": 6.8210057843868715e-06, + "loss": 0.4043, + "step": 1806 + }, + { + "epoch": 1.3306332842415316, + "grad_norm": 0.377785861492157, + "learning_rate": 6.817014623812898e-06, + "loss": 0.3817, + "step": 1807 + }, + { + "epoch": 1.3313696612665684, + "grad_norm": 0.39483869075775146, + "learning_rate": 6.813022128878292e-06, + "loss": 0.4089, + "step": 1808 + }, + { + "epoch": 1.3321060382916052, + "grad_norm": 0.4306437373161316, + "learning_rate": 6.809028302515024e-06, + "loss": 0.3828, + "step": 1809 + }, + { + "epoch": 1.332842415316642, + "grad_norm": 0.38789859414100647, + "learning_rate": 6.805033147656037e-06, + "loss": 0.4043, + "step": 1810 + }, + { + "epoch": 1.3335787923416789, + "grad_norm": 0.3801842927932739, + "learning_rate": 6.801036667235252e-06, + "loss": 0.4094, + "step": 1811 + }, + { + "epoch": 1.3343151693667157, + "grad_norm": 0.3741806447505951, + "learning_rate": 6.797038864187564e-06, + "loss": 0.4316, + "step": 1812 + }, + { + "epoch": 1.3350515463917525, + "grad_norm": 0.39872273802757263, + "learning_rate": 6.79303974144884e-06, + "loss": 0.4268, + "step": 1813 + }, + { + "epoch": 1.3357879234167893, + "grad_norm": 0.4184839427471161, + "learning_rate": 6.789039301955913e-06, + "loss": 0.4332, + "step": 1814 + }, + { + "epoch": 1.3365243004418264, + "grad_norm": 0.3965596854686737, + "learning_rate": 6.785037548646586e-06, + "loss": 0.4486, + "step": 1815 + }, + { + "epoch": 1.3372606774668632, + "grad_norm": 0.4066341817378998, + "learning_rate": 6.781034484459624e-06, + "loss": 0.3996, + "step": 1816 + }, + { + "epoch": 1.3379970544919, + "grad_norm": 0.47760289907455444, + "learning_rate": 6.777030112334759e-06, + "loss": 0.4175, + "step": 1817 + }, + { + "epoch": 1.3387334315169368, + "grad_norm": 0.39844220876693726, + "learning_rate": 6.773024435212678e-06, + "loss": 0.4186, + "step": 1818 + }, + { + "epoch": 1.3394698085419736, + "grad_norm": 0.419331431388855, + "learning_rate": 6.769017456035033e-06, + "loss": 0.4229, + "step": 1819 + }, + { + "epoch": 1.3402061855670104, + "grad_norm": 0.41925719380378723, + "learning_rate": 6.765009177744425e-06, + "loss": 0.4078, + "step": 1820 + }, + { + "epoch": 1.3409425625920472, + "grad_norm": 0.39159253239631653, + "learning_rate": 6.760999603284413e-06, + "loss": 0.394, + "step": 1821 + }, + { + "epoch": 1.341678939617084, + "grad_norm": 0.38817527890205383, + "learning_rate": 6.756988735599508e-06, + "loss": 0.3846, + "step": 1822 + }, + { + "epoch": 1.3424153166421209, + "grad_norm": 0.4380631148815155, + "learning_rate": 6.752976577635169e-06, + "loss": 0.4231, + "step": 1823 + }, + { + "epoch": 1.3431516936671577, + "grad_norm": 0.4459487795829773, + "learning_rate": 6.748963132337807e-06, + "loss": 0.4201, + "step": 1824 + }, + { + "epoch": 1.3438880706921945, + "grad_norm": 0.3807379901409149, + "learning_rate": 6.7449484026547705e-06, + "loss": 0.4175, + "step": 1825 + }, + { + "epoch": 1.3446244477172313, + "grad_norm": 0.3830302655696869, + "learning_rate": 6.740932391534358e-06, + "loss": 0.4129, + "step": 1826 + }, + { + "epoch": 1.3453608247422681, + "grad_norm": 0.4267512559890747, + "learning_rate": 6.736915101925807e-06, + "loss": 0.4108, + "step": 1827 + }, + { + "epoch": 1.346097201767305, + "grad_norm": 0.3901468515396118, + "learning_rate": 6.732896536779293e-06, + "loss": 0.3991, + "step": 1828 + }, + { + "epoch": 1.3468335787923418, + "grad_norm": 0.41451314091682434, + "learning_rate": 6.728876699045927e-06, + "loss": 0.4133, + "step": 1829 + }, + { + "epoch": 1.3475699558173786, + "grad_norm": 0.4063207507133484, + "learning_rate": 6.7248555916777595e-06, + "loss": 0.4133, + "step": 1830 + }, + { + "epoch": 1.3483063328424154, + "grad_norm": 0.4275752902030945, + "learning_rate": 6.720833217627769e-06, + "loss": 0.4337, + "step": 1831 + }, + { + "epoch": 1.3490427098674522, + "grad_norm": 0.451399028301239, + "learning_rate": 6.716809579849865e-06, + "loss": 0.3908, + "step": 1832 + }, + { + "epoch": 1.349779086892489, + "grad_norm": 0.43301716446876526, + "learning_rate": 6.712784681298885e-06, + "loss": 0.4326, + "step": 1833 + }, + { + "epoch": 1.3505154639175259, + "grad_norm": 0.4233209192752838, + "learning_rate": 6.708758524930594e-06, + "loss": 0.4022, + "step": 1834 + }, + { + "epoch": 1.3512518409425627, + "grad_norm": 0.43682563304901123, + "learning_rate": 6.704731113701679e-06, + "loss": 0.4212, + "step": 1835 + }, + { + "epoch": 1.3519882179675995, + "grad_norm": 0.4808812439441681, + "learning_rate": 6.70070245056975e-06, + "loss": 0.4156, + "step": 1836 + }, + { + "epoch": 1.3527245949926363, + "grad_norm": 0.40924352407455444, + "learning_rate": 6.696672538493334e-06, + "loss": 0.3947, + "step": 1837 + }, + { + "epoch": 1.3534609720176731, + "grad_norm": 0.3757144808769226, + "learning_rate": 6.692641380431879e-06, + "loss": 0.4008, + "step": 1838 + }, + { + "epoch": 1.35419734904271, + "grad_norm": 0.40984469652175903, + "learning_rate": 6.688608979345742e-06, + "loss": 0.4306, + "step": 1839 + }, + { + "epoch": 1.3549337260677468, + "grad_norm": 0.39980268478393555, + "learning_rate": 6.6845753381961995e-06, + "loss": 0.4105, + "step": 1840 + }, + { + "epoch": 1.3556701030927836, + "grad_norm": 0.38676726818084717, + "learning_rate": 6.680540459945435e-06, + "loss": 0.3773, + "step": 1841 + }, + { + "epoch": 1.3564064801178204, + "grad_norm": 0.4054774045944214, + "learning_rate": 6.676504347556541e-06, + "loss": 0.4035, + "step": 1842 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 0.42290937900543213, + "learning_rate": 6.6724670039935145e-06, + "loss": 0.4226, + "step": 1843 + }, + { + "epoch": 1.357879234167894, + "grad_norm": 0.41523557901382446, + "learning_rate": 6.668428432221262e-06, + "loss": 0.4143, + "step": 1844 + }, + { + "epoch": 1.3586156111929308, + "grad_norm": 0.40057843923568726, + "learning_rate": 6.664388635205587e-06, + "loss": 0.4092, + "step": 1845 + }, + { + "epoch": 1.3593519882179677, + "grad_norm": 0.43508538603782654, + "learning_rate": 6.660347615913194e-06, + "loss": 0.3645, + "step": 1846 + }, + { + "epoch": 1.3600883652430045, + "grad_norm": 0.4160832464694977, + "learning_rate": 6.656305377311686e-06, + "loss": 0.3927, + "step": 1847 + }, + { + "epoch": 1.3608247422680413, + "grad_norm": 0.40695255994796753, + "learning_rate": 6.652261922369562e-06, + "loss": 0.4079, + "step": 1848 + }, + { + "epoch": 1.361561119293078, + "grad_norm": 0.4502415657043457, + "learning_rate": 6.648217254056211e-06, + "loss": 0.41, + "step": 1849 + }, + { + "epoch": 1.362297496318115, + "grad_norm": 0.4875112473964691, + "learning_rate": 6.644171375341915e-06, + "loss": 0.4101, + "step": 1850 + }, + { + "epoch": 1.3630338733431517, + "grad_norm": 0.4880272150039673, + "learning_rate": 6.640124289197845e-06, + "loss": 0.4239, + "step": 1851 + }, + { + "epoch": 1.3637702503681886, + "grad_norm": 0.44013795256614685, + "learning_rate": 6.636075998596063e-06, + "loss": 0.3872, + "step": 1852 + }, + { + "epoch": 1.3645066273932254, + "grad_norm": 0.43611839413642883, + "learning_rate": 6.632026506509507e-06, + "loss": 0.4029, + "step": 1853 + }, + { + "epoch": 1.3652430044182622, + "grad_norm": 0.4688226878643036, + "learning_rate": 6.627975815912002e-06, + "loss": 0.4, + "step": 1854 + }, + { + "epoch": 1.365979381443299, + "grad_norm": 0.4303816258907318, + "learning_rate": 6.623923929778253e-06, + "loss": 0.4301, + "step": 1855 + }, + { + "epoch": 1.3667157584683358, + "grad_norm": 0.516590416431427, + "learning_rate": 6.6198708510838446e-06, + "loss": 0.4398, + "step": 1856 + }, + { + "epoch": 1.3674521354933726, + "grad_norm": 0.4422401785850525, + "learning_rate": 6.615816582805235e-06, + "loss": 0.3925, + "step": 1857 + }, + { + "epoch": 1.3681885125184094, + "grad_norm": 0.39282315969467163, + "learning_rate": 6.611761127919753e-06, + "loss": 0.3969, + "step": 1858 + }, + { + "epoch": 1.3689248895434463, + "grad_norm": 0.4454099237918854, + "learning_rate": 6.607704489405605e-06, + "loss": 0.4306, + "step": 1859 + }, + { + "epoch": 1.369661266568483, + "grad_norm": 0.45609748363494873, + "learning_rate": 6.603646670241863e-06, + "loss": 0.3761, + "step": 1860 + }, + { + "epoch": 1.37039764359352, + "grad_norm": 0.40115681290626526, + "learning_rate": 6.599587673408469e-06, + "loss": 0.4091, + "step": 1861 + }, + { + "epoch": 1.3711340206185567, + "grad_norm": 0.4406599998474121, + "learning_rate": 6.595527501886223e-06, + "loss": 0.4157, + "step": 1862 + }, + { + "epoch": 1.3718703976435935, + "grad_norm": 0.3757615089416504, + "learning_rate": 6.591466158656795e-06, + "loss": 0.3968, + "step": 1863 + }, + { + "epoch": 1.3726067746686303, + "grad_norm": 0.4482031464576721, + "learning_rate": 6.5874036467027135e-06, + "loss": 0.4094, + "step": 1864 + }, + { + "epoch": 1.3733431516936672, + "grad_norm": 0.4158189594745636, + "learning_rate": 6.583339969007364e-06, + "loss": 0.4047, + "step": 1865 + }, + { + "epoch": 1.374079528718704, + "grad_norm": 0.45515987277030945, + "learning_rate": 6.579275128554986e-06, + "loss": 0.4326, + "step": 1866 + }, + { + "epoch": 1.3748159057437408, + "grad_norm": 0.49017441272735596, + "learning_rate": 6.575209128330679e-06, + "loss": 0.4162, + "step": 1867 + }, + { + "epoch": 1.3755522827687776, + "grad_norm": 0.5305281281471252, + "learning_rate": 6.57114197132039e-06, + "loss": 0.402, + "step": 1868 + }, + { + "epoch": 1.3762886597938144, + "grad_norm": 0.42159077525138855, + "learning_rate": 6.567073660510914e-06, + "loss": 0.4187, + "step": 1869 + }, + { + "epoch": 1.3770250368188512, + "grad_norm": 0.4378039538860321, + "learning_rate": 6.563004198889899e-06, + "loss": 0.4001, + "step": 1870 + }, + { + "epoch": 1.377761413843888, + "grad_norm": 0.45202401280403137, + "learning_rate": 6.5589335894458305e-06, + "loss": 0.4204, + "step": 1871 + }, + { + "epoch": 1.3784977908689249, + "grad_norm": 0.4391757547855377, + "learning_rate": 6.554861835168045e-06, + "loss": 0.4141, + "step": 1872 + }, + { + "epoch": 1.3792341678939617, + "grad_norm": 0.4077272415161133, + "learning_rate": 6.550788939046713e-06, + "loss": 0.3808, + "step": 1873 + }, + { + "epoch": 1.3799705449189985, + "grad_norm": 0.488126665353775, + "learning_rate": 6.546714904072848e-06, + "loss": 0.4128, + "step": 1874 + }, + { + "epoch": 1.3807069219440353, + "grad_norm": 0.43175673484802246, + "learning_rate": 6.542639733238297e-06, + "loss": 0.416, + "step": 1875 + }, + { + "epoch": 1.3814432989690721, + "grad_norm": 0.39616307616233826, + "learning_rate": 6.538563429535742e-06, + "loss": 0.4148, + "step": 1876 + }, + { + "epoch": 1.382179675994109, + "grad_norm": 0.436383455991745, + "learning_rate": 6.534485995958699e-06, + "loss": 0.4205, + "step": 1877 + }, + { + "epoch": 1.3829160530191458, + "grad_norm": 0.5000892281532288, + "learning_rate": 6.530407435501513e-06, + "loss": 0.3923, + "step": 1878 + }, + { + "epoch": 1.3836524300441826, + "grad_norm": 0.4896223545074463, + "learning_rate": 6.5263277511593515e-06, + "loss": 0.4397, + "step": 1879 + }, + { + "epoch": 1.3843888070692194, + "grad_norm": 0.3927074372768402, + "learning_rate": 6.522246945928214e-06, + "loss": 0.422, + "step": 1880 + }, + { + "epoch": 1.3851251840942562, + "grad_norm": 0.4764426350593567, + "learning_rate": 6.518165022804921e-06, + "loss": 0.4127, + "step": 1881 + }, + { + "epoch": 1.385861561119293, + "grad_norm": 0.4523772895336151, + "learning_rate": 6.514081984787112e-06, + "loss": 0.4211, + "step": 1882 + }, + { + "epoch": 1.3865979381443299, + "grad_norm": 0.3522360324859619, + "learning_rate": 6.509997834873246e-06, + "loss": 0.4101, + "step": 1883 + }, + { + "epoch": 1.3873343151693667, + "grad_norm": 0.4114728271961212, + "learning_rate": 6.505912576062602e-06, + "loss": 0.4017, + "step": 1884 + }, + { + "epoch": 1.3880706921944035, + "grad_norm": 0.4329925775527954, + "learning_rate": 6.501826211355269e-06, + "loss": 0.4095, + "step": 1885 + }, + { + "epoch": 1.3888070692194403, + "grad_norm": 0.4298880398273468, + "learning_rate": 6.497738743752151e-06, + "loss": 0.4246, + "step": 1886 + }, + { + "epoch": 1.3895434462444771, + "grad_norm": 0.4396612048149109, + "learning_rate": 6.493650176254958e-06, + "loss": 0.4231, + "step": 1887 + }, + { + "epoch": 1.390279823269514, + "grad_norm": 0.3601161241531372, + "learning_rate": 6.4895605118662116e-06, + "loss": 0.3881, + "step": 1888 + }, + { + "epoch": 1.3910162002945508, + "grad_norm": 0.42812642455101013, + "learning_rate": 6.485469753589241e-06, + "loss": 0.4094, + "step": 1889 + }, + { + "epoch": 1.3917525773195876, + "grad_norm": 0.41203832626342773, + "learning_rate": 6.481377904428171e-06, + "loss": 0.4305, + "step": 1890 + }, + { + "epoch": 1.3924889543446244, + "grad_norm": 0.4372691214084625, + "learning_rate": 6.4772849673879335e-06, + "loss": 0.4255, + "step": 1891 + }, + { + "epoch": 1.3932253313696612, + "grad_norm": 0.47426924109458923, + "learning_rate": 6.473190945474258e-06, + "loss": 0.4377, + "step": 1892 + }, + { + "epoch": 1.393961708394698, + "grad_norm": 0.4277667999267578, + "learning_rate": 6.469095841693671e-06, + "loss": 0.3954, + "step": 1893 + }, + { + "epoch": 1.3946980854197348, + "grad_norm": 0.3962320387363434, + "learning_rate": 6.4649996590534915e-06, + "loss": 0.3877, + "step": 1894 + }, + { + "epoch": 1.3954344624447717, + "grad_norm": 0.4663618206977844, + "learning_rate": 6.460902400561835e-06, + "loss": 0.4046, + "step": 1895 + }, + { + "epoch": 1.3961708394698085, + "grad_norm": 0.4559689164161682, + "learning_rate": 6.456804069227601e-06, + "loss": 0.4166, + "step": 1896 + }, + { + "epoch": 1.3969072164948453, + "grad_norm": 0.3791521489620209, + "learning_rate": 6.452704668060481e-06, + "loss": 0.4049, + "step": 1897 + }, + { + "epoch": 1.397643593519882, + "grad_norm": 0.49199846386909485, + "learning_rate": 6.448604200070953e-06, + "loss": 0.4207, + "step": 1898 + }, + { + "epoch": 1.398379970544919, + "grad_norm": 0.4463452994823456, + "learning_rate": 6.444502668270276e-06, + "loss": 0.4088, + "step": 1899 + }, + { + "epoch": 1.3991163475699557, + "grad_norm": 0.47948887944221497, + "learning_rate": 6.440400075670491e-06, + "loss": 0.3906, + "step": 1900 + }, + { + "epoch": 1.3998527245949925, + "grad_norm": 0.42631468176841736, + "learning_rate": 6.4362964252844165e-06, + "loss": 0.4024, + "step": 1901 + }, + { + "epoch": 1.4005891016200294, + "grad_norm": 0.39506101608276367, + "learning_rate": 6.432191720125651e-06, + "loss": 0.4013, + "step": 1902 + }, + { + "epoch": 1.4013254786450662, + "grad_norm": 0.4285804331302643, + "learning_rate": 6.428085963208567e-06, + "loss": 0.4148, + "step": 1903 + }, + { + "epoch": 1.402061855670103, + "grad_norm": 0.4068101942539215, + "learning_rate": 6.423979157548306e-06, + "loss": 0.4113, + "step": 1904 + }, + { + "epoch": 1.4027982326951398, + "grad_norm": 0.39695197343826294, + "learning_rate": 6.419871306160782e-06, + "loss": 0.3925, + "step": 1905 + }, + { + "epoch": 1.4035346097201766, + "grad_norm": 0.38716959953308105, + "learning_rate": 6.415762412062678e-06, + "loss": 0.38, + "step": 1906 + }, + { + "epoch": 1.4042709867452134, + "grad_norm": 0.41700947284698486, + "learning_rate": 6.411652478271444e-06, + "loss": 0.3994, + "step": 1907 + }, + { + "epoch": 1.4050073637702503, + "grad_norm": 0.4346196949481964, + "learning_rate": 6.407541507805286e-06, + "loss": 0.4043, + "step": 1908 + }, + { + "epoch": 1.405743740795287, + "grad_norm": 0.433353990316391, + "learning_rate": 6.403429503683178e-06, + "loss": 0.4124, + "step": 1909 + }, + { + "epoch": 1.406480117820324, + "grad_norm": 0.4605219066143036, + "learning_rate": 6.399316468924856e-06, + "loss": 0.4196, + "step": 1910 + }, + { + "epoch": 1.4072164948453607, + "grad_norm": 0.4418054223060608, + "learning_rate": 6.395202406550803e-06, + "loss": 0.4054, + "step": 1911 + }, + { + "epoch": 1.4079528718703975, + "grad_norm": 0.4400234818458557, + "learning_rate": 6.391087319582264e-06, + "loss": 0.4206, + "step": 1912 + }, + { + "epoch": 1.4086892488954343, + "grad_norm": 0.40220165252685547, + "learning_rate": 6.386971211041235e-06, + "loss": 0.4301, + "step": 1913 + }, + { + "epoch": 1.4094256259204712, + "grad_norm": 0.41866376996040344, + "learning_rate": 6.382854083950462e-06, + "loss": 0.3882, + "step": 1914 + }, + { + "epoch": 1.410162002945508, + "grad_norm": 0.4194769561290741, + "learning_rate": 6.378735941333437e-06, + "loss": 0.4037, + "step": 1915 + }, + { + "epoch": 1.4108983799705448, + "grad_norm": 0.41453391313552856, + "learning_rate": 6.374616786214402e-06, + "loss": 0.4195, + "step": 1916 + }, + { + "epoch": 1.4116347569955816, + "grad_norm": 0.4037197232246399, + "learning_rate": 6.370496621618338e-06, + "loss": 0.3969, + "step": 1917 + }, + { + "epoch": 1.4123711340206184, + "grad_norm": 0.4220995008945465, + "learning_rate": 6.366375450570971e-06, + "loss": 0.4141, + "step": 1918 + }, + { + "epoch": 1.4131075110456552, + "grad_norm": 0.45782655477523804, + "learning_rate": 6.362253276098762e-06, + "loss": 0.3984, + "step": 1919 + }, + { + "epoch": 1.413843888070692, + "grad_norm": 0.40615370869636536, + "learning_rate": 6.358130101228914e-06, + "loss": 0.4279, + "step": 1920 + }, + { + "epoch": 1.414580265095729, + "grad_norm": 0.42293697595596313, + "learning_rate": 6.35400592898936e-06, + "loss": 0.4001, + "step": 1921 + }, + { + "epoch": 1.415316642120766, + "grad_norm": 0.4643053710460663, + "learning_rate": 6.34988076240877e-06, + "loss": 0.4087, + "step": 1922 + }, + { + "epoch": 1.4160530191458027, + "grad_norm": 0.42279914021492004, + "learning_rate": 6.345754604516539e-06, + "loss": 0.3953, + "step": 1923 + }, + { + "epoch": 1.4167893961708395, + "grad_norm": 0.4467233121395111, + "learning_rate": 6.341627458342794e-06, + "loss": 0.4182, + "step": 1924 + }, + { + "epoch": 1.4175257731958764, + "grad_norm": 0.4454042911529541, + "learning_rate": 6.337499326918386e-06, + "loss": 0.4092, + "step": 1925 + }, + { + "epoch": 1.4182621502209132, + "grad_norm": 0.414069265127182, + "learning_rate": 6.33337021327489e-06, + "loss": 0.4254, + "step": 1926 + }, + { + "epoch": 1.41899852724595, + "grad_norm": 0.4927258789539337, + "learning_rate": 6.329240120444602e-06, + "loss": 0.4215, + "step": 1927 + }, + { + "epoch": 1.4197349042709868, + "grad_norm": 0.4353216588497162, + "learning_rate": 6.325109051460538e-06, + "loss": 0.4077, + "step": 1928 + }, + { + "epoch": 1.4204712812960236, + "grad_norm": 0.4361936151981354, + "learning_rate": 6.3209770093564315e-06, + "loss": 0.439, + "step": 1929 + }, + { + "epoch": 1.4212076583210604, + "grad_norm": 0.4463266432285309, + "learning_rate": 6.316843997166726e-06, + "loss": 0.4217, + "step": 1930 + }, + { + "epoch": 1.4219440353460973, + "grad_norm": 0.4338516294956207, + "learning_rate": 6.312710017926582e-06, + "loss": 0.4278, + "step": 1931 + }, + { + "epoch": 1.422680412371134, + "grad_norm": 0.48313939571380615, + "learning_rate": 6.3085750746718725e-06, + "loss": 0.3838, + "step": 1932 + }, + { + "epoch": 1.423416789396171, + "grad_norm": 0.5277740955352783, + "learning_rate": 6.30443917043917e-06, + "loss": 0.4097, + "step": 1933 + }, + { + "epoch": 1.4241531664212077, + "grad_norm": 0.47533461451530457, + "learning_rate": 6.30030230826576e-06, + "loss": 0.3995, + "step": 1934 + }, + { + "epoch": 1.4248895434462445, + "grad_norm": 0.4214441776275635, + "learning_rate": 6.296164491189628e-06, + "loss": 0.4226, + "step": 1935 + }, + { + "epoch": 1.4256259204712813, + "grad_norm": 0.4258248507976532, + "learning_rate": 6.292025722249463e-06, + "loss": 0.3888, + "step": 1936 + }, + { + "epoch": 1.4263622974963182, + "grad_norm": 0.4134536683559418, + "learning_rate": 6.287886004484651e-06, + "loss": 0.3852, + "step": 1937 + }, + { + "epoch": 1.427098674521355, + "grad_norm": 0.47609132528305054, + "learning_rate": 6.283745340935277e-06, + "loss": 0.3968, + "step": 1938 + }, + { + "epoch": 1.4278350515463918, + "grad_norm": 0.44796431064605713, + "learning_rate": 6.279603734642117e-06, + "loss": 0.4421, + "step": 1939 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.404325395822525, + "learning_rate": 6.275461188646641e-06, + "loss": 0.4123, + "step": 1940 + }, + { + "epoch": 1.4293078055964654, + "grad_norm": 0.41975611448287964, + "learning_rate": 6.271317705991014e-06, + "loss": 0.432, + "step": 1941 + }, + { + "epoch": 1.4300441826215022, + "grad_norm": 0.46170738339424133, + "learning_rate": 6.267173289718079e-06, + "loss": 0.4272, + "step": 1942 + }, + { + "epoch": 1.430780559646539, + "grad_norm": 0.388175904750824, + "learning_rate": 6.263027942871375e-06, + "loss": 0.3877, + "step": 1943 + }, + { + "epoch": 1.4315169366715759, + "grad_norm": 0.459796667098999, + "learning_rate": 6.258881668495116e-06, + "loss": 0.3874, + "step": 1944 + }, + { + "epoch": 1.4322533136966127, + "grad_norm": 0.4751020073890686, + "learning_rate": 6.2547344696342015e-06, + "loss": 0.393, + "step": 1945 + }, + { + "epoch": 1.4329896907216495, + "grad_norm": 0.3899977207183838, + "learning_rate": 6.250586349334209e-06, + "loss": 0.3918, + "step": 1946 + }, + { + "epoch": 1.4337260677466863, + "grad_norm": 0.43887919187545776, + "learning_rate": 6.246437310641395e-06, + "loss": 0.43, + "step": 1947 + }, + { + "epoch": 1.4344624447717231, + "grad_norm": 0.4462708830833435, + "learning_rate": 6.242287356602684e-06, + "loss": 0.3989, + "step": 1948 + }, + { + "epoch": 1.43519882179676, + "grad_norm": 0.4143645167350769, + "learning_rate": 6.238136490265681e-06, + "loss": 0.4257, + "step": 1949 + }, + { + "epoch": 1.4359351988217968, + "grad_norm": 0.4817536175251007, + "learning_rate": 6.2339847146786515e-06, + "loss": 0.3969, + "step": 1950 + }, + { + "epoch": 1.4366715758468336, + "grad_norm": 0.40567225217819214, + "learning_rate": 6.22983203289054e-06, + "loss": 0.4277, + "step": 1951 + }, + { + "epoch": 1.4374079528718704, + "grad_norm": 0.4788074791431427, + "learning_rate": 6.225678447950947e-06, + "loss": 0.4243, + "step": 1952 + }, + { + "epoch": 1.4381443298969072, + "grad_norm": 0.38317757844924927, + "learning_rate": 6.2215239629101385e-06, + "loss": 0.4197, + "step": 1953 + }, + { + "epoch": 1.438880706921944, + "grad_norm": 0.40520721673965454, + "learning_rate": 6.217368580819049e-06, + "loss": 0.4112, + "step": 1954 + }, + { + "epoch": 1.4396170839469808, + "grad_norm": 0.4457045793533325, + "learning_rate": 6.213212304729259e-06, + "loss": 0.4104, + "step": 1955 + }, + { + "epoch": 1.4403534609720177, + "grad_norm": 0.42782163619995117, + "learning_rate": 6.209055137693014e-06, + "loss": 0.4247, + "step": 1956 + }, + { + "epoch": 1.4410898379970545, + "grad_norm": 0.4254503548145294, + "learning_rate": 6.204897082763213e-06, + "loss": 0.3672, + "step": 1957 + }, + { + "epoch": 1.4418262150220913, + "grad_norm": 0.3664149343967438, + "learning_rate": 6.200738142993406e-06, + "loss": 0.3824, + "step": 1958 + }, + { + "epoch": 1.4425625920471281, + "grad_norm": 0.41897517442703247, + "learning_rate": 6.1965783214377895e-06, + "loss": 0.3947, + "step": 1959 + }, + { + "epoch": 1.443298969072165, + "grad_norm": 0.3825157582759857, + "learning_rate": 6.1924176211512145e-06, + "loss": 0.3827, + "step": 1960 + }, + { + "epoch": 1.4440353460972017, + "grad_norm": 0.42175137996673584, + "learning_rate": 6.1882560451891715e-06, + "loss": 0.4241, + "step": 1961 + }, + { + "epoch": 1.4447717231222386, + "grad_norm": 0.40888282656669617, + "learning_rate": 6.1840935966077985e-06, + "loss": 0.4041, + "step": 1962 + }, + { + "epoch": 1.4455081001472754, + "grad_norm": 0.42432424426078796, + "learning_rate": 6.179930278463868e-06, + "loss": 0.4155, + "step": 1963 + }, + { + "epoch": 1.4462444771723122, + "grad_norm": 0.4558820128440857, + "learning_rate": 6.175766093814798e-06, + "loss": 0.4044, + "step": 1964 + }, + { + "epoch": 1.446980854197349, + "grad_norm": 0.506184458732605, + "learning_rate": 6.1716010457186395e-06, + "loss": 0.4198, + "step": 1965 + }, + { + "epoch": 1.4477172312223858, + "grad_norm": 0.38299667835235596, + "learning_rate": 6.167435137234078e-06, + "loss": 0.4086, + "step": 1966 + }, + { + "epoch": 1.4484536082474226, + "grad_norm": 0.43511974811553955, + "learning_rate": 6.16326837142043e-06, + "loss": 0.4063, + "step": 1967 + }, + { + "epoch": 1.4491899852724595, + "grad_norm": 0.48710158467292786, + "learning_rate": 6.1591007513376425e-06, + "loss": 0.4157, + "step": 1968 + }, + { + "epoch": 1.4499263622974963, + "grad_norm": 0.41424429416656494, + "learning_rate": 6.15493228004629e-06, + "loss": 0.397, + "step": 1969 + }, + { + "epoch": 1.450662739322533, + "grad_norm": 0.4269460439682007, + "learning_rate": 6.1507629606075724e-06, + "loss": 0.4171, + "step": 1970 + }, + { + "epoch": 1.45139911634757, + "grad_norm": 0.4956137239933014, + "learning_rate": 6.14659279608331e-06, + "loss": 0.4108, + "step": 1971 + }, + { + "epoch": 1.4521354933726067, + "grad_norm": 0.4469659626483917, + "learning_rate": 6.142421789535948e-06, + "loss": 0.4267, + "step": 1972 + }, + { + "epoch": 1.4528718703976435, + "grad_norm": 0.3990239202976227, + "learning_rate": 6.138249944028547e-06, + "loss": 0.4151, + "step": 1973 + }, + { + "epoch": 1.4536082474226804, + "grad_norm": 0.4268157184123993, + "learning_rate": 6.134077262624783e-06, + "loss": 0.42, + "step": 1974 + }, + { + "epoch": 1.4543446244477172, + "grad_norm": 0.45455217361450195, + "learning_rate": 6.129903748388948e-06, + "loss": 0.3977, + "step": 1975 + }, + { + "epoch": 1.455081001472754, + "grad_norm": 0.3931489884853363, + "learning_rate": 6.125729404385946e-06, + "loss": 0.4218, + "step": 1976 + }, + { + "epoch": 1.4558173784977908, + "grad_norm": 0.4898168444633484, + "learning_rate": 6.121554233681286e-06, + "loss": 0.3984, + "step": 1977 + }, + { + "epoch": 1.4565537555228276, + "grad_norm": 0.44921234250068665, + "learning_rate": 6.11737823934109e-06, + "loss": 0.4049, + "step": 1978 + }, + { + "epoch": 1.4572901325478644, + "grad_norm": 0.4154300093650818, + "learning_rate": 6.11320142443208e-06, + "loss": 0.4296, + "step": 1979 + }, + { + "epoch": 1.4580265095729013, + "grad_norm": 0.46535080671310425, + "learning_rate": 6.109023792021586e-06, + "loss": 0.4384, + "step": 1980 + }, + { + "epoch": 1.458762886597938, + "grad_norm": 0.42891719937324524, + "learning_rate": 6.1048453451775305e-06, + "loss": 0.4186, + "step": 1981 + }, + { + "epoch": 1.4594992636229749, + "grad_norm": 0.4581204056739807, + "learning_rate": 6.100666086968441e-06, + "loss": 0.3773, + "step": 1982 + }, + { + "epoch": 1.4602356406480117, + "grad_norm": 0.3952482342720032, + "learning_rate": 6.09648602046344e-06, + "loss": 0.4141, + "step": 1983 + }, + { + "epoch": 1.4609720176730487, + "grad_norm": 0.4057900905609131, + "learning_rate": 6.0923051487322385e-06, + "loss": 0.4384, + "step": 1984 + }, + { + "epoch": 1.4617083946980856, + "grad_norm": 0.4145631194114685, + "learning_rate": 6.088123474845144e-06, + "loss": 0.3874, + "step": 1985 + }, + { + "epoch": 1.4624447717231224, + "grad_norm": 0.3992542624473572, + "learning_rate": 6.0839410018730515e-06, + "loss": 0.414, + "step": 1986 + }, + { + "epoch": 1.4631811487481592, + "grad_norm": 0.43685290217399597, + "learning_rate": 6.079757732887444e-06, + "loss": 0.3838, + "step": 1987 + }, + { + "epoch": 1.463917525773196, + "grad_norm": 0.3933916389942169, + "learning_rate": 6.075573670960385e-06, + "loss": 0.3845, + "step": 1988 + }, + { + "epoch": 1.4646539027982328, + "grad_norm": 0.4441211223602295, + "learning_rate": 6.071388819164525e-06, + "loss": 0.4222, + "step": 1989 + }, + { + "epoch": 1.4653902798232696, + "grad_norm": 0.4381551444530487, + "learning_rate": 6.067203180573094e-06, + "loss": 0.3978, + "step": 1990 + }, + { + "epoch": 1.4661266568483065, + "grad_norm": 0.40585675835609436, + "learning_rate": 6.063016758259896e-06, + "loss": 0.4072, + "step": 1991 + }, + { + "epoch": 1.4668630338733433, + "grad_norm": 0.358671098947525, + "learning_rate": 6.058829555299314e-06, + "loss": 0.4078, + "step": 1992 + }, + { + "epoch": 1.46759941089838, + "grad_norm": 0.39564085006713867, + "learning_rate": 6.054641574766304e-06, + "loss": 0.4084, + "step": 1993 + }, + { + "epoch": 1.468335787923417, + "grad_norm": 0.40318918228149414, + "learning_rate": 6.05045281973639e-06, + "loss": 0.3892, + "step": 1994 + }, + { + "epoch": 1.4690721649484537, + "grad_norm": 0.4056055247783661, + "learning_rate": 6.04626329328567e-06, + "loss": 0.4119, + "step": 1995 + }, + { + "epoch": 1.4698085419734905, + "grad_norm": 0.40646982192993164, + "learning_rate": 6.042072998490805e-06, + "loss": 0.4069, + "step": 1996 + }, + { + "epoch": 1.4705449189985274, + "grad_norm": 0.396869033575058, + "learning_rate": 6.0378819384290185e-06, + "loss": 0.4212, + "step": 1997 + }, + { + "epoch": 1.4712812960235642, + "grad_norm": 0.46342048048973083, + "learning_rate": 6.033690116178101e-06, + "loss": 0.401, + "step": 1998 + }, + { + "epoch": 1.472017673048601, + "grad_norm": 0.34812402725219727, + "learning_rate": 6.0294975348163985e-06, + "loss": 0.3684, + "step": 1999 + }, + { + "epoch": 1.4727540500736378, + "grad_norm": 0.4260614812374115, + "learning_rate": 6.025304197422819e-06, + "loss": 0.3867, + "step": 2000 + }, + { + "epoch": 1.4734904270986746, + "grad_norm": 0.4525951147079468, + "learning_rate": 6.0211101070768184e-06, + "loss": 0.3997, + "step": 2001 + }, + { + "epoch": 1.4742268041237114, + "grad_norm": 0.4634397625923157, + "learning_rate": 6.016915266858413e-06, + "loss": 0.4346, + "step": 2002 + }, + { + "epoch": 1.4749631811487482, + "grad_norm": 0.40612179040908813, + "learning_rate": 6.0127196798481645e-06, + "loss": 0.3975, + "step": 2003 + }, + { + "epoch": 1.475699558173785, + "grad_norm": 0.4513106048107147, + "learning_rate": 6.008523349127188e-06, + "loss": 0.3806, + "step": 2004 + }, + { + "epoch": 1.4764359351988219, + "grad_norm": 0.5352683067321777, + "learning_rate": 6.004326277777141e-06, + "loss": 0.4328, + "step": 2005 + }, + { + "epoch": 1.4771723122238587, + "grad_norm": 0.4118804931640625, + "learning_rate": 6.000128468880223e-06, + "loss": 0.4485, + "step": 2006 + }, + { + "epoch": 1.4779086892488955, + "grad_norm": 0.4650607109069824, + "learning_rate": 5.995929925519181e-06, + "loss": 0.4315, + "step": 2007 + }, + { + "epoch": 1.4786450662739323, + "grad_norm": 0.5012993216514587, + "learning_rate": 5.991730650777297e-06, + "loss": 0.3924, + "step": 2008 + }, + { + "epoch": 1.4793814432989691, + "grad_norm": 0.4178714156150818, + "learning_rate": 5.987530647738394e-06, + "loss": 0.42, + "step": 2009 + }, + { + "epoch": 1.480117820324006, + "grad_norm": 0.4390731453895569, + "learning_rate": 5.983329919486824e-06, + "loss": 0.4017, + "step": 2010 + }, + { + "epoch": 1.4808541973490428, + "grad_norm": 0.4657975137233734, + "learning_rate": 5.9791284691074765e-06, + "loss": 0.3949, + "step": 2011 + }, + { + "epoch": 1.4815905743740796, + "grad_norm": 0.4687153100967407, + "learning_rate": 5.974926299685772e-06, + "loss": 0.3857, + "step": 2012 + }, + { + "epoch": 1.4823269513991164, + "grad_norm": 0.41643956303596497, + "learning_rate": 5.970723414307652e-06, + "loss": 0.4026, + "step": 2013 + }, + { + "epoch": 1.4830633284241532, + "grad_norm": 0.4140578806400299, + "learning_rate": 5.966519816059591e-06, + "loss": 0.3904, + "step": 2014 + }, + { + "epoch": 1.48379970544919, + "grad_norm": 0.42964425683021545, + "learning_rate": 5.962315508028584e-06, + "loss": 0.3904, + "step": 2015 + }, + { + "epoch": 1.4845360824742269, + "grad_norm": 0.4009730815887451, + "learning_rate": 5.958110493302148e-06, + "loss": 0.4047, + "step": 2016 + }, + { + "epoch": 1.4852724594992637, + "grad_norm": 0.41644203662872314, + "learning_rate": 5.95390477496832e-06, + "loss": 0.4019, + "step": 2017 + }, + { + "epoch": 1.4860088365243005, + "grad_norm": 0.43529626727104187, + "learning_rate": 5.94969835611565e-06, + "loss": 0.4447, + "step": 2018 + }, + { + "epoch": 1.4867452135493373, + "grad_norm": 0.4235115051269531, + "learning_rate": 5.945491239833206e-06, + "loss": 0.4281, + "step": 2019 + }, + { + "epoch": 1.4874815905743741, + "grad_norm": 0.441995769739151, + "learning_rate": 5.941283429210568e-06, + "loss": 0.3974, + "step": 2020 + }, + { + "epoch": 1.488217967599411, + "grad_norm": 0.4208744764328003, + "learning_rate": 5.937074927337824e-06, + "loss": 0.3994, + "step": 2021 + }, + { + "epoch": 1.4889543446244478, + "grad_norm": 0.3799182176589966, + "learning_rate": 5.932865737305571e-06, + "loss": 0.4188, + "step": 2022 + }, + { + "epoch": 1.4896907216494846, + "grad_norm": 0.4479272961616516, + "learning_rate": 5.928655862204911e-06, + "loss": 0.4072, + "step": 2023 + }, + { + "epoch": 1.4904270986745214, + "grad_norm": 0.4461783766746521, + "learning_rate": 5.924445305127448e-06, + "loss": 0.4119, + "step": 2024 + }, + { + "epoch": 1.4911634756995582, + "grad_norm": 0.40956565737724304, + "learning_rate": 5.9202340691652895e-06, + "loss": 0.4073, + "step": 2025 + }, + { + "epoch": 1.491899852724595, + "grad_norm": 0.3731072247028351, + "learning_rate": 5.916022157411038e-06, + "loss": 0.4237, + "step": 2026 + }, + { + "epoch": 1.4926362297496318, + "grad_norm": 0.42282289266586304, + "learning_rate": 5.911809572957796e-06, + "loss": 0.415, + "step": 2027 + }, + { + "epoch": 1.4933726067746687, + "grad_norm": 0.41434556245803833, + "learning_rate": 5.907596318899157e-06, + "loss": 0.4238, + "step": 2028 + }, + { + "epoch": 1.4941089837997055, + "grad_norm": 0.4469594657421112, + "learning_rate": 5.9033823983292095e-06, + "loss": 0.4217, + "step": 2029 + }, + { + "epoch": 1.4948453608247423, + "grad_norm": 0.4168426990509033, + "learning_rate": 5.899167814342527e-06, + "loss": 0.4015, + "step": 2030 + }, + { + "epoch": 1.495581737849779, + "grad_norm": 0.3973442018032074, + "learning_rate": 5.8949525700341735e-06, + "loss": 0.3986, + "step": 2031 + }, + { + "epoch": 1.496318114874816, + "grad_norm": 0.5031341314315796, + "learning_rate": 5.890736668499696e-06, + "loss": 0.4217, + "step": 2032 + }, + { + "epoch": 1.4970544918998527, + "grad_norm": 0.42431867122650146, + "learning_rate": 5.886520112835128e-06, + "loss": 0.3919, + "step": 2033 + }, + { + "epoch": 1.4977908689248896, + "grad_norm": 0.4433683454990387, + "learning_rate": 5.8823029061369785e-06, + "loss": 0.393, + "step": 2034 + }, + { + "epoch": 1.4985272459499264, + "grad_norm": 0.3885188400745392, + "learning_rate": 5.878085051502236e-06, + "loss": 0.3966, + "step": 2035 + }, + { + "epoch": 1.4992636229749632, + "grad_norm": 0.4179115295410156, + "learning_rate": 5.873866552028367e-06, + "loss": 0.39, + "step": 2036 + }, + { + "epoch": 1.5, + "grad_norm": 0.409787118434906, + "learning_rate": 5.86964741081331e-06, + "loss": 0.4102, + "step": 2037 + }, + { + "epoch": 1.5007363770250368, + "grad_norm": 0.4352511465549469, + "learning_rate": 5.865427630955475e-06, + "loss": 0.4002, + "step": 2038 + }, + { + "epoch": 1.5014727540500736, + "grad_norm": 0.47778865694999695, + "learning_rate": 5.861207215553739e-06, + "loss": 0.3996, + "step": 2039 + }, + { + "epoch": 1.5022091310751104, + "grad_norm": 0.3906075954437256, + "learning_rate": 5.856986167707448e-06, + "loss": 0.3844, + "step": 2040 + }, + { + "epoch": 1.5029455081001473, + "grad_norm": 0.4253922402858734, + "learning_rate": 5.852764490516414e-06, + "loss": 0.4071, + "step": 2041 + }, + { + "epoch": 1.503681885125184, + "grad_norm": 0.39142486453056335, + "learning_rate": 5.8485421870809076e-06, + "loss": 0.3863, + "step": 2042 + }, + { + "epoch": 1.504418262150221, + "grad_norm": 0.4321049153804779, + "learning_rate": 5.8443192605016604e-06, + "loss": 0.4135, + "step": 2043 + }, + { + "epoch": 1.5051546391752577, + "grad_norm": 0.4090045988559723, + "learning_rate": 5.840095713879864e-06, + "loss": 0.4048, + "step": 2044 + }, + { + "epoch": 1.5058910162002945, + "grad_norm": 0.4378100037574768, + "learning_rate": 5.83587155031716e-06, + "loss": 0.4017, + "step": 2045 + }, + { + "epoch": 1.5066273932253313, + "grad_norm": 0.42427340149879456, + "learning_rate": 5.831646772915651e-06, + "loss": 0.4201, + "step": 2046 + }, + { + "epoch": 1.5073637702503682, + "grad_norm": 0.4345465898513794, + "learning_rate": 5.827421384777883e-06, + "loss": 0.3987, + "step": 2047 + }, + { + "epoch": 1.508100147275405, + "grad_norm": 0.4228680729866028, + "learning_rate": 5.823195389006853e-06, + "loss": 0.4231, + "step": 2048 + }, + { + "epoch": 1.5088365243004418, + "grad_norm": 0.3961130976676941, + "learning_rate": 5.818968788706006e-06, + "loss": 0.4138, + "step": 2049 + }, + { + "epoch": 1.5095729013254786, + "grad_norm": 0.3891131579875946, + "learning_rate": 5.814741586979228e-06, + "loss": 0.3979, + "step": 2050 + }, + { + "epoch": 1.5103092783505154, + "grad_norm": 0.44312554597854614, + "learning_rate": 5.810513786930849e-06, + "loss": 0.4171, + "step": 2051 + }, + { + "epoch": 1.5110456553755522, + "grad_norm": 0.36081069707870483, + "learning_rate": 5.806285391665639e-06, + "loss": 0.3745, + "step": 2052 + }, + { + "epoch": 1.511782032400589, + "grad_norm": 0.43281090259552, + "learning_rate": 5.8020564042888015e-06, + "loss": 0.408, + "step": 2053 + }, + { + "epoch": 1.5125184094256259, + "grad_norm": 0.4164915680885315, + "learning_rate": 5.7978268279059795e-06, + "loss": 0.3897, + "step": 2054 + }, + { + "epoch": 1.5132547864506627, + "grad_norm": 0.41813069581985474, + "learning_rate": 5.7935966656232434e-06, + "loss": 0.4268, + "step": 2055 + }, + { + "epoch": 1.5139911634756995, + "grad_norm": 0.4125601053237915, + "learning_rate": 5.789365920547098e-06, + "loss": 0.3895, + "step": 2056 + }, + { + "epoch": 1.5147275405007363, + "grad_norm": 0.4070327579975128, + "learning_rate": 5.785134595784473e-06, + "loss": 0.3862, + "step": 2057 + }, + { + "epoch": 1.5154639175257731, + "grad_norm": 0.40182122588157654, + "learning_rate": 5.780902694442727e-06, + "loss": 0.4227, + "step": 2058 + }, + { + "epoch": 1.51620029455081, + "grad_norm": 0.40783995389938354, + "learning_rate": 5.776670219629643e-06, + "loss": 0.3972, + "step": 2059 + }, + { + "epoch": 1.5169366715758468, + "grad_norm": 0.4468885064125061, + "learning_rate": 5.772437174453418e-06, + "loss": 0.384, + "step": 2060 + }, + { + "epoch": 1.5176730486008836, + "grad_norm": 0.4095504581928253, + "learning_rate": 5.768203562022674e-06, + "loss": 0.4271, + "step": 2061 + }, + { + "epoch": 1.5184094256259204, + "grad_norm": 0.4125244915485382, + "learning_rate": 5.7639693854464495e-06, + "loss": 0.429, + "step": 2062 + }, + { + "epoch": 1.5191458026509572, + "grad_norm": 0.3764050602912903, + "learning_rate": 5.7597346478341946e-06, + "loss": 0.4135, + "step": 2063 + }, + { + "epoch": 1.519882179675994, + "grad_norm": 0.3860231637954712, + "learning_rate": 5.755499352295772e-06, + "loss": 0.387, + "step": 2064 + }, + { + "epoch": 1.5206185567010309, + "grad_norm": 0.42220139503479004, + "learning_rate": 5.751263501941454e-06, + "loss": 0.385, + "step": 2065 + }, + { + "epoch": 1.5213549337260677, + "grad_norm": 0.4375130236148834, + "learning_rate": 5.747027099881925e-06, + "loss": 0.4225, + "step": 2066 + }, + { + "epoch": 1.5220913107511045, + "grad_norm": 0.3937590420246124, + "learning_rate": 5.742790149228268e-06, + "loss": 0.4172, + "step": 2067 + }, + { + "epoch": 1.5228276877761413, + "grad_norm": 0.49965283274650574, + "learning_rate": 5.738552653091971e-06, + "loss": 0.3976, + "step": 2068 + }, + { + "epoch": 1.5235640648011781, + "grad_norm": 0.4149169623851776, + "learning_rate": 5.734314614584924e-06, + "loss": 0.4299, + "step": 2069 + }, + { + "epoch": 1.524300441826215, + "grad_norm": 0.41810843348503113, + "learning_rate": 5.730076036819414e-06, + "loss": 0.3996, + "step": 2070 + }, + { + "epoch": 1.5250368188512518, + "grad_norm": 0.42622384428977966, + "learning_rate": 5.725836922908125e-06, + "loss": 0.387, + "step": 2071 + }, + { + "epoch": 1.5257731958762886, + "grad_norm": 0.40818580985069275, + "learning_rate": 5.7215972759641335e-06, + "loss": 0.4264, + "step": 2072 + }, + { + "epoch": 1.5265095729013254, + "grad_norm": 0.4584078788757324, + "learning_rate": 5.71735709910091e-06, + "loss": 0.4179, + "step": 2073 + }, + { + "epoch": 1.5272459499263622, + "grad_norm": 0.44075214862823486, + "learning_rate": 5.7131163954323085e-06, + "loss": 0.4033, + "step": 2074 + }, + { + "epoch": 1.527982326951399, + "grad_norm": 0.4163413643836975, + "learning_rate": 5.708875168072577e-06, + "loss": 0.4279, + "step": 2075 + }, + { + "epoch": 1.5287187039764358, + "grad_norm": 0.40220654010772705, + "learning_rate": 5.704633420136343e-06, + "loss": 0.3998, + "step": 2076 + }, + { + "epoch": 1.5294550810014726, + "grad_norm": 0.45772379636764526, + "learning_rate": 5.700391154738619e-06, + "loss": 0.3933, + "step": 2077 + }, + { + "epoch": 1.5301914580265095, + "grad_norm": 0.38176655769348145, + "learning_rate": 5.696148374994795e-06, + "loss": 0.4204, + "step": 2078 + }, + { + "epoch": 1.5309278350515463, + "grad_norm": 0.4314991533756256, + "learning_rate": 5.691905084020642e-06, + "loss": 0.3963, + "step": 2079 + }, + { + "epoch": 1.531664212076583, + "grad_norm": 0.42802858352661133, + "learning_rate": 5.687661284932306e-06, + "loss": 0.4353, + "step": 2080 + }, + { + "epoch": 1.53240058910162, + "grad_norm": 0.4672757089138031, + "learning_rate": 5.6834169808463e-06, + "loss": 0.439, + "step": 2081 + }, + { + "epoch": 1.5331369661266567, + "grad_norm": 0.42437198758125305, + "learning_rate": 5.679172174879516e-06, + "loss": 0.3863, + "step": 2082 + }, + { + "epoch": 1.5338733431516935, + "grad_norm": 0.404613196849823, + "learning_rate": 5.67492687014921e-06, + "loss": 0.4005, + "step": 2083 + }, + { + "epoch": 1.5346097201767304, + "grad_norm": 0.41044384241104126, + "learning_rate": 5.6706810697730095e-06, + "loss": 0.4065, + "step": 2084 + }, + { + "epoch": 1.5353460972017672, + "grad_norm": 0.4328327178955078, + "learning_rate": 5.666434776868895e-06, + "loss": 0.4264, + "step": 2085 + }, + { + "epoch": 1.536082474226804, + "grad_norm": 0.44456425309181213, + "learning_rate": 5.662187994555221e-06, + "loss": 0.3825, + "step": 2086 + }, + { + "epoch": 1.5368188512518408, + "grad_norm": 0.4055061638355255, + "learning_rate": 5.657940725950693e-06, + "loss": 0.3757, + "step": 2087 + }, + { + "epoch": 1.5375552282768776, + "grad_norm": 0.4078313112258911, + "learning_rate": 5.65369297417438e-06, + "loss": 0.414, + "step": 2088 + }, + { + "epoch": 1.5382916053019144, + "grad_norm": 0.4202408194541931, + "learning_rate": 5.6494447423457e-06, + "loss": 0.406, + "step": 2089 + }, + { + "epoch": 1.5390279823269513, + "grad_norm": 0.4479289650917053, + "learning_rate": 5.645196033584426e-06, + "loss": 0.384, + "step": 2090 + }, + { + "epoch": 1.539764359351988, + "grad_norm": 0.40371423959732056, + "learning_rate": 5.640946851010682e-06, + "loss": 0.3914, + "step": 2091 + }, + { + "epoch": 1.540500736377025, + "grad_norm": 0.3889717161655426, + "learning_rate": 5.636697197744941e-06, + "loss": 0.4069, + "step": 2092 + }, + { + "epoch": 1.5412371134020617, + "grad_norm": 0.4180935323238373, + "learning_rate": 5.6324470769080165e-06, + "loss": 0.4073, + "step": 2093 + }, + { + "epoch": 1.5419734904270985, + "grad_norm": 0.46457764506340027, + "learning_rate": 5.6281964916210715e-06, + "loss": 0.3946, + "step": 2094 + }, + { + "epoch": 1.5427098674521353, + "grad_norm": 0.41641589999198914, + "learning_rate": 5.6239454450056066e-06, + "loss": 0.4287, + "step": 2095 + }, + { + "epoch": 1.5434462444771722, + "grad_norm": 0.40457090735435486, + "learning_rate": 5.6196939401834625e-06, + "loss": 0.4061, + "step": 2096 + }, + { + "epoch": 1.544182621502209, + "grad_norm": 0.42012038826942444, + "learning_rate": 5.615441980276814e-06, + "loss": 0.4113, + "step": 2097 + }, + { + "epoch": 1.5449189985272458, + "grad_norm": 0.4231841564178467, + "learning_rate": 5.611189568408173e-06, + "loss": 0.4352, + "step": 2098 + }, + { + "epoch": 1.5456553755522826, + "grad_norm": 0.3875311017036438, + "learning_rate": 5.6069367077003835e-06, + "loss": 0.4069, + "step": 2099 + }, + { + "epoch": 1.5463917525773194, + "grad_norm": 0.38472917675971985, + "learning_rate": 5.6026834012766155e-06, + "loss": 0.3963, + "step": 2100 + }, + { + "epoch": 1.5471281296023565, + "grad_norm": 0.48356834053993225, + "learning_rate": 5.598429652260371e-06, + "loss": 0.4407, + "step": 2101 + }, + { + "epoch": 1.5478645066273933, + "grad_norm": 0.3837643563747406, + "learning_rate": 5.594175463775475e-06, + "loss": 0.3983, + "step": 2102 + }, + { + "epoch": 1.54860088365243, + "grad_norm": 0.4311898648738861, + "learning_rate": 5.5899208389460715e-06, + "loss": 0.4, + "step": 2103 + }, + { + "epoch": 1.549337260677467, + "grad_norm": 0.38184744119644165, + "learning_rate": 5.5856657808966315e-06, + "loss": 0.3998, + "step": 2104 + }, + { + "epoch": 1.5500736377025037, + "grad_norm": 0.4166722297668457, + "learning_rate": 5.581410292751941e-06, + "loss": 0.3881, + "step": 2105 + }, + { + "epoch": 1.5508100147275405, + "grad_norm": 0.44956570863723755, + "learning_rate": 5.577154377637101e-06, + "loss": 0.4095, + "step": 2106 + }, + { + "epoch": 1.5515463917525774, + "grad_norm": 0.3997005522251129, + "learning_rate": 5.572898038677526e-06, + "loss": 0.4035, + "step": 2107 + }, + { + "epoch": 1.5522827687776142, + "grad_norm": 0.4489591121673584, + "learning_rate": 5.5686412789989444e-06, + "loss": 0.4001, + "step": 2108 + }, + { + "epoch": 1.553019145802651, + "grad_norm": 0.40616336464881897, + "learning_rate": 5.5643841017273915e-06, + "loss": 0.3858, + "step": 2109 + }, + { + "epoch": 1.5537555228276878, + "grad_norm": 0.46725019812583923, + "learning_rate": 5.560126509989209e-06, + "loss": 0.3954, + "step": 2110 + }, + { + "epoch": 1.5544918998527246, + "grad_norm": 0.42120271921157837, + "learning_rate": 5.5558685069110444e-06, + "loss": 0.4105, + "step": 2111 + }, + { + "epoch": 1.5552282768777614, + "grad_norm": 0.361070454120636, + "learning_rate": 5.5516100956198445e-06, + "loss": 0.3762, + "step": 2112 + }, + { + "epoch": 1.5559646539027983, + "grad_norm": 0.4115205705165863, + "learning_rate": 5.547351279242861e-06, + "loss": 0.4182, + "step": 2113 + }, + { + "epoch": 1.556701030927835, + "grad_norm": 0.40333011746406555, + "learning_rate": 5.543092060907639e-06, + "loss": 0.3663, + "step": 2114 + }, + { + "epoch": 1.5574374079528719, + "grad_norm": 0.4221840500831604, + "learning_rate": 5.538832443742018e-06, + "loss": 0.4132, + "step": 2115 + }, + { + "epoch": 1.5581737849779087, + "grad_norm": 0.40356510877609253, + "learning_rate": 5.5345724308741326e-06, + "loss": 0.3685, + "step": 2116 + }, + { + "epoch": 1.5589101620029455, + "grad_norm": 0.47017496824264526, + "learning_rate": 5.5303120254324104e-06, + "loss": 0.4117, + "step": 2117 + }, + { + "epoch": 1.5596465390279823, + "grad_norm": 0.3889496624469757, + "learning_rate": 5.52605123054556e-06, + "loss": 0.4128, + "step": 2118 + }, + { + "epoch": 1.5603829160530192, + "grad_norm": 0.44326820969581604, + "learning_rate": 5.521790049342583e-06, + "loss": 0.4213, + "step": 2119 + }, + { + "epoch": 1.561119293078056, + "grad_norm": 0.4094356298446655, + "learning_rate": 5.5175284849527635e-06, + "loss": 0.4006, + "step": 2120 + }, + { + "epoch": 1.5618556701030928, + "grad_norm": 0.46180570125579834, + "learning_rate": 5.513266540505662e-06, + "loss": 0.4114, + "step": 2121 + }, + { + "epoch": 1.5625920471281296, + "grad_norm": 0.38660722970962524, + "learning_rate": 5.509004219131124e-06, + "loss": 0.3904, + "step": 2122 + }, + { + "epoch": 1.5633284241531664, + "grad_norm": 0.42590051889419556, + "learning_rate": 5.504741523959269e-06, + "loss": 0.4155, + "step": 2123 + }, + { + "epoch": 1.5640648011782032, + "grad_norm": 0.4073706269264221, + "learning_rate": 5.500478458120493e-06, + "loss": 0.4318, + "step": 2124 + }, + { + "epoch": 1.56480117820324, + "grad_norm": 0.39798110723495483, + "learning_rate": 5.49621502474546e-06, + "loss": 0.416, + "step": 2125 + }, + { + "epoch": 1.5655375552282769, + "grad_norm": 0.4096386432647705, + "learning_rate": 5.491951226965108e-06, + "loss": 0.4398, + "step": 2126 + }, + { + "epoch": 1.5662739322533137, + "grad_norm": 0.4136483669281006, + "learning_rate": 5.48768706791064e-06, + "loss": 0.4249, + "step": 2127 + }, + { + "epoch": 1.5670103092783505, + "grad_norm": 0.435358464717865, + "learning_rate": 5.4834225507135284e-06, + "loss": 0.3931, + "step": 2128 + }, + { + "epoch": 1.5677466863033873, + "grad_norm": 0.39365580677986145, + "learning_rate": 5.479157678505503e-06, + "loss": 0.419, + "step": 2129 + }, + { + "epoch": 1.5684830633284241, + "grad_norm": 0.4026069939136505, + "learning_rate": 5.474892454418559e-06, + "loss": 0.393, + "step": 2130 + }, + { + "epoch": 1.569219440353461, + "grad_norm": 0.44560348987579346, + "learning_rate": 5.470626881584948e-06, + "loss": 0.4176, + "step": 2131 + }, + { + "epoch": 1.5699558173784978, + "grad_norm": 0.4150668978691101, + "learning_rate": 5.466360963137175e-06, + "loss": 0.4245, + "step": 2132 + }, + { + "epoch": 1.5706921944035346, + "grad_norm": 0.3756403923034668, + "learning_rate": 5.462094702208004e-06, + "loss": 0.4057, + "step": 2133 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.45393243432044983, + "learning_rate": 5.4578281019304494e-06, + "loss": 0.411, + "step": 2134 + }, + { + "epoch": 1.5721649484536082, + "grad_norm": 0.4269341826438904, + "learning_rate": 5.453561165437771e-06, + "loss": 0.3946, + "step": 2135 + }, + { + "epoch": 1.572901325478645, + "grad_norm": 0.39662691950798035, + "learning_rate": 5.449293895863478e-06, + "loss": 0.3958, + "step": 2136 + }, + { + "epoch": 1.5736377025036818, + "grad_norm": 0.5086665153503418, + "learning_rate": 5.445026296341325e-06, + "loss": 0.4376, + "step": 2137 + }, + { + "epoch": 1.5743740795287187, + "grad_norm": 0.43581289052963257, + "learning_rate": 5.440758370005309e-06, + "loss": 0.4384, + "step": 2138 + }, + { + "epoch": 1.5751104565537555, + "grad_norm": 0.39093270897865295, + "learning_rate": 5.4364901199896655e-06, + "loss": 0.4093, + "step": 2139 + }, + { + "epoch": 1.5758468335787923, + "grad_norm": 0.4165237545967102, + "learning_rate": 5.432221549428867e-06, + "loss": 0.3921, + "step": 2140 + }, + { + "epoch": 1.576583210603829, + "grad_norm": 0.4502768814563751, + "learning_rate": 5.427952661457624e-06, + "loss": 0.4246, + "step": 2141 + }, + { + "epoch": 1.577319587628866, + "grad_norm": 0.42256587743759155, + "learning_rate": 5.42368345921088e-06, + "loss": 0.3885, + "step": 2142 + }, + { + "epoch": 1.5780559646539027, + "grad_norm": 0.3865189552307129, + "learning_rate": 5.419413945823806e-06, + "loss": 0.4251, + "step": 2143 + }, + { + "epoch": 1.5787923416789398, + "grad_norm": 0.405563086271286, + "learning_rate": 5.415144124431805e-06, + "loss": 0.4107, + "step": 2144 + }, + { + "epoch": 1.5795287187039766, + "grad_norm": 0.4749312102794647, + "learning_rate": 5.410873998170503e-06, + "loss": 0.4022, + "step": 2145 + }, + { + "epoch": 1.5802650957290134, + "grad_norm": 0.41955462098121643, + "learning_rate": 5.4066035701757535e-06, + "loss": 0.4113, + "step": 2146 + }, + { + "epoch": 1.5810014727540502, + "grad_norm": 0.34526562690734863, + "learning_rate": 5.402332843583631e-06, + "loss": 0.4176, + "step": 2147 + }, + { + "epoch": 1.581737849779087, + "grad_norm": 0.4131619334220886, + "learning_rate": 5.398061821530423e-06, + "loss": 0.398, + "step": 2148 + }, + { + "epoch": 1.5824742268041239, + "grad_norm": 0.38839566707611084, + "learning_rate": 5.393790507152645e-06, + "loss": 0.4031, + "step": 2149 + }, + { + "epoch": 1.5832106038291607, + "grad_norm": 0.35662680864334106, + "learning_rate": 5.389518903587016e-06, + "loss": 0.3957, + "step": 2150 + }, + { + "epoch": 1.5839469808541975, + "grad_norm": 0.417187362909317, + "learning_rate": 5.3852470139704786e-06, + "loss": 0.4268, + "step": 2151 + }, + { + "epoch": 1.5846833578792343, + "grad_norm": 0.4425226151943207, + "learning_rate": 5.380974841440173e-06, + "loss": 0.4115, + "step": 2152 + }, + { + "epoch": 1.5854197349042711, + "grad_norm": 0.41201913356781006, + "learning_rate": 5.376702389133458e-06, + "loss": 0.4056, + "step": 2153 + }, + { + "epoch": 1.586156111929308, + "grad_norm": 0.39344313740730286, + "learning_rate": 5.37242966018789e-06, + "loss": 0.4178, + "step": 2154 + }, + { + "epoch": 1.5868924889543448, + "grad_norm": 0.41077542304992676, + "learning_rate": 5.3681566577412355e-06, + "loss": 0.4046, + "step": 2155 + }, + { + "epoch": 1.5876288659793816, + "grad_norm": 0.40460658073425293, + "learning_rate": 5.363883384931456e-06, + "loss": 0.4302, + "step": 2156 + }, + { + "epoch": 1.5883652430044184, + "grad_norm": 0.43949198722839355, + "learning_rate": 5.359609844896717e-06, + "loss": 0.4148, + "step": 2157 + }, + { + "epoch": 1.5891016200294552, + "grad_norm": 0.43986040353775024, + "learning_rate": 5.355336040775373e-06, + "loss": 0.4092, + "step": 2158 + }, + { + "epoch": 1.589837997054492, + "grad_norm": 0.41094833612442017, + "learning_rate": 5.3510619757059775e-06, + "loss": 0.3936, + "step": 2159 + }, + { + "epoch": 1.5905743740795288, + "grad_norm": 0.456833153963089, + "learning_rate": 5.346787652827279e-06, + "loss": 0.416, + "step": 2160 + }, + { + "epoch": 1.5913107511045657, + "grad_norm": 0.43459081649780273, + "learning_rate": 5.3425130752782065e-06, + "loss": 0.4134, + "step": 2161 + }, + { + "epoch": 1.5920471281296025, + "grad_norm": 0.41692793369293213, + "learning_rate": 5.33823824619788e-06, + "loss": 0.3983, + "step": 2162 + }, + { + "epoch": 1.5927835051546393, + "grad_norm": 0.37748250365257263, + "learning_rate": 5.3339631687256085e-06, + "loss": 0.399, + "step": 2163 + }, + { + "epoch": 1.593519882179676, + "grad_norm": 0.40077048540115356, + "learning_rate": 5.3296878460008785e-06, + "loss": 0.3902, + "step": 2164 + }, + { + "epoch": 1.594256259204713, + "grad_norm": 0.4162245988845825, + "learning_rate": 5.325412281163356e-06, + "loss": 0.3972, + "step": 2165 + }, + { + "epoch": 1.5949926362297497, + "grad_norm": 0.41424018144607544, + "learning_rate": 5.321136477352887e-06, + "loss": 0.396, + "step": 2166 + }, + { + "epoch": 1.5957290132547866, + "grad_norm": 0.4058535099029541, + "learning_rate": 5.3168604377094945e-06, + "loss": 0.3772, + "step": 2167 + }, + { + "epoch": 1.5964653902798234, + "grad_norm": 0.44171908497810364, + "learning_rate": 5.312584165373372e-06, + "loss": 0.4135, + "step": 2168 + }, + { + "epoch": 1.5972017673048602, + "grad_norm": 0.46828219294548035, + "learning_rate": 5.308307663484884e-06, + "loss": 0.4229, + "step": 2169 + }, + { + "epoch": 1.597938144329897, + "grad_norm": 0.39885610342025757, + "learning_rate": 5.304030935184564e-06, + "loss": 0.4146, + "step": 2170 + }, + { + "epoch": 1.5986745213549338, + "grad_norm": 0.4212936460971832, + "learning_rate": 5.299753983613114e-06, + "loss": 0.4033, + "step": 2171 + }, + { + "epoch": 1.5994108983799706, + "grad_norm": 0.4296552836894989, + "learning_rate": 5.2954768119113975e-06, + "loss": 0.4278, + "step": 2172 + }, + { + "epoch": 1.6001472754050075, + "grad_norm": 0.4296092987060547, + "learning_rate": 5.291199423220438e-06, + "loss": 0.4141, + "step": 2173 + }, + { + "epoch": 1.6008836524300443, + "grad_norm": 0.3816363215446472, + "learning_rate": 5.286921820681421e-06, + "loss": 0.4128, + "step": 2174 + }, + { + "epoch": 1.601620029455081, + "grad_norm": 0.415924608707428, + "learning_rate": 5.28264400743569e-06, + "loss": 0.4133, + "step": 2175 + }, + { + "epoch": 1.602356406480118, + "grad_norm": 0.47628721594810486, + "learning_rate": 5.278365986624743e-06, + "loss": 0.4195, + "step": 2176 + }, + { + "epoch": 1.6030927835051547, + "grad_norm": 0.477327823638916, + "learning_rate": 5.274087761390224e-06, + "loss": 0.4116, + "step": 2177 + }, + { + "epoch": 1.6038291605301915, + "grad_norm": 0.42795825004577637, + "learning_rate": 5.269809334873939e-06, + "loss": 0.4033, + "step": 2178 + }, + { + "epoch": 1.6045655375552283, + "grad_norm": 0.4993734657764435, + "learning_rate": 5.2655307102178285e-06, + "loss": 0.4451, + "step": 2179 + }, + { + "epoch": 1.6053019145802652, + "grad_norm": 0.49656498432159424, + "learning_rate": 5.26125189056399e-06, + "loss": 0.4144, + "step": 2180 + }, + { + "epoch": 1.606038291605302, + "grad_norm": 0.4029616713523865, + "learning_rate": 5.256972879054659e-06, + "loss": 0.4113, + "step": 2181 + }, + { + "epoch": 1.6067746686303388, + "grad_norm": 0.46401447057724, + "learning_rate": 5.2526936788322106e-06, + "loss": 0.4219, + "step": 2182 + }, + { + "epoch": 1.6075110456553756, + "grad_norm": 0.49594005942344666, + "learning_rate": 5.248414293039159e-06, + "loss": 0.4095, + "step": 2183 + }, + { + "epoch": 1.6082474226804124, + "grad_norm": 0.5078505873680115, + "learning_rate": 5.244134724818158e-06, + "loss": 0.4391, + "step": 2184 + }, + { + "epoch": 1.6089837997054492, + "grad_norm": 0.3824053108692169, + "learning_rate": 5.2398549773119945e-06, + "loss": 0.391, + "step": 2185 + }, + { + "epoch": 1.609720176730486, + "grad_norm": 0.44562169909477234, + "learning_rate": 5.235575053663582e-06, + "loss": 0.4052, + "step": 2186 + }, + { + "epoch": 1.6104565537555229, + "grad_norm": 0.44519391655921936, + "learning_rate": 5.231294957015969e-06, + "loss": 0.4049, + "step": 2187 + }, + { + "epoch": 1.6111929307805597, + "grad_norm": 0.38943272829055786, + "learning_rate": 5.2270146905123285e-06, + "loss": 0.3984, + "step": 2188 + }, + { + "epoch": 1.6119293078055965, + "grad_norm": 0.4190487265586853, + "learning_rate": 5.222734257295963e-06, + "loss": 0.3971, + "step": 2189 + }, + { + "epoch": 1.6126656848306333, + "grad_norm": 0.5010129809379578, + "learning_rate": 5.218453660510287e-06, + "loss": 0.3993, + "step": 2190 + }, + { + "epoch": 1.6134020618556701, + "grad_norm": 0.4155033826828003, + "learning_rate": 5.214172903298843e-06, + "loss": 0.4258, + "step": 2191 + }, + { + "epoch": 1.614138438880707, + "grad_norm": 0.441825270652771, + "learning_rate": 5.209891988805292e-06, + "loss": 0.4368, + "step": 2192 + }, + { + "epoch": 1.6148748159057438, + "grad_norm": 0.49732720851898193, + "learning_rate": 5.205610920173408e-06, + "loss": 0.3995, + "step": 2193 + }, + { + "epoch": 1.6156111929307806, + "grad_norm": 0.39198291301727295, + "learning_rate": 5.201329700547077e-06, + "loss": 0.412, + "step": 2194 + }, + { + "epoch": 1.6163475699558174, + "grad_norm": 0.37358447909355164, + "learning_rate": 5.197048333070297e-06, + "loss": 0.4017, + "step": 2195 + }, + { + "epoch": 1.6170839469808542, + "grad_norm": 0.49914005398750305, + "learning_rate": 5.192766820887177e-06, + "loss": 0.4346, + "step": 2196 + }, + { + "epoch": 1.617820324005891, + "grad_norm": 0.45152711868286133, + "learning_rate": 5.188485167141929e-06, + "loss": 0.3845, + "step": 2197 + }, + { + "epoch": 1.6185567010309279, + "grad_norm": 0.3699303865432739, + "learning_rate": 5.1842033749788686e-06, + "loss": 0.3937, + "step": 2198 + }, + { + "epoch": 1.6192930780559647, + "grad_norm": 0.578906238079071, + "learning_rate": 5.179921447542417e-06, + "loss": 0.4365, + "step": 2199 + }, + { + "epoch": 1.6200294550810015, + "grad_norm": 0.3776850998401642, + "learning_rate": 5.175639387977091e-06, + "loss": 0.4229, + "step": 2200 + }, + { + "epoch": 1.6207658321060383, + "grad_norm": 0.5020015239715576, + "learning_rate": 5.171357199427507e-06, + "loss": 0.4087, + "step": 2201 + }, + { + "epoch": 1.6215022091310751, + "grad_norm": 0.447489857673645, + "learning_rate": 5.1670748850383734e-06, + "loss": 0.4046, + "step": 2202 + }, + { + "epoch": 1.622238586156112, + "grad_norm": 0.4035995602607727, + "learning_rate": 5.162792447954494e-06, + "loss": 0.3878, + "step": 2203 + }, + { + "epoch": 1.6229749631811488, + "grad_norm": 0.43608716130256653, + "learning_rate": 5.158509891320759e-06, + "loss": 0.4031, + "step": 2204 + }, + { + "epoch": 1.6237113402061856, + "grad_norm": 0.5111272931098938, + "learning_rate": 5.154227218282149e-06, + "loss": 0.3981, + "step": 2205 + }, + { + "epoch": 1.6244477172312224, + "grad_norm": 0.40405896306037903, + "learning_rate": 5.1499444319837326e-06, + "loss": 0.3946, + "step": 2206 + }, + { + "epoch": 1.6251840942562592, + "grad_norm": 0.4375455975532532, + "learning_rate": 5.145661535570656e-06, + "loss": 0.4045, + "step": 2207 + }, + { + "epoch": 1.625920471281296, + "grad_norm": 0.4746772348880768, + "learning_rate": 5.141378532188148e-06, + "loss": 0.4113, + "step": 2208 + }, + { + "epoch": 1.6266568483063328, + "grad_norm": 0.4946158826351166, + "learning_rate": 5.137095424981519e-06, + "loss": 0.4086, + "step": 2209 + }, + { + "epoch": 1.6273932253313697, + "grad_norm": 0.4054524600505829, + "learning_rate": 5.1328122170961534e-06, + "loss": 0.4218, + "step": 2210 + }, + { + "epoch": 1.6281296023564065, + "grad_norm": 0.5024380683898926, + "learning_rate": 5.128528911677509e-06, + "loss": 0.3965, + "step": 2211 + }, + { + "epoch": 1.6288659793814433, + "grad_norm": 0.409605473279953, + "learning_rate": 5.124245511871115e-06, + "loss": 0.4167, + "step": 2212 + }, + { + "epoch": 1.62960235640648, + "grad_norm": 0.44012802839279175, + "learning_rate": 5.119962020822572e-06, + "loss": 0.431, + "step": 2213 + }, + { + "epoch": 1.630338733431517, + "grad_norm": 0.41772133111953735, + "learning_rate": 5.115678441677546e-06, + "loss": 0.3957, + "step": 2214 + }, + { + "epoch": 1.6310751104565537, + "grad_norm": 0.3946409523487091, + "learning_rate": 5.111394777581769e-06, + "loss": 0.422, + "step": 2215 + }, + { + "epoch": 1.6318114874815906, + "grad_norm": 0.4510660767555237, + "learning_rate": 5.107111031681034e-06, + "loss": 0.4063, + "step": 2216 + }, + { + "epoch": 1.6325478645066274, + "grad_norm": 0.4569968581199646, + "learning_rate": 5.1028272071211916e-06, + "loss": 0.4193, + "step": 2217 + }, + { + "epoch": 1.6332842415316642, + "grad_norm": 0.45792651176452637, + "learning_rate": 5.098543307048158e-06, + "loss": 0.4073, + "step": 2218 + }, + { + "epoch": 1.634020618556701, + "grad_norm": 0.42194387316703796, + "learning_rate": 5.094259334607896e-06, + "loss": 0.3922, + "step": 2219 + }, + { + "epoch": 1.6347569955817378, + "grad_norm": 0.48982447385787964, + "learning_rate": 5.089975292946427e-06, + "loss": 0.4192, + "step": 2220 + }, + { + "epoch": 1.6354933726067746, + "grad_norm": 0.4341419041156769, + "learning_rate": 5.085691185209824e-06, + "loss": 0.428, + "step": 2221 + }, + { + "epoch": 1.6362297496318114, + "grad_norm": 0.4825488030910492, + "learning_rate": 5.081407014544202e-06, + "loss": 0.4034, + "step": 2222 + }, + { + "epoch": 1.6369661266568483, + "grad_norm": 0.4492826759815216, + "learning_rate": 5.07712278409573e-06, + "loss": 0.4292, + "step": 2223 + }, + { + "epoch": 1.637702503681885, + "grad_norm": 0.44333913922309875, + "learning_rate": 5.0728384970106135e-06, + "loss": 0.3939, + "step": 2224 + }, + { + "epoch": 1.638438880706922, + "grad_norm": 0.3804095387458801, + "learning_rate": 5.068554156435108e-06, + "loss": 0.3795, + "step": 2225 + }, + { + "epoch": 1.6391752577319587, + "grad_norm": 0.3924141228199005, + "learning_rate": 5.0642697655155e-06, + "loss": 0.3771, + "step": 2226 + }, + { + "epoch": 1.6399116347569955, + "grad_norm": 0.4596264660358429, + "learning_rate": 5.059985327398121e-06, + "loss": 0.4098, + "step": 2227 + }, + { + "epoch": 1.6406480117820323, + "grad_norm": 0.4128621220588684, + "learning_rate": 5.0557008452293275e-06, + "loss": 0.4235, + "step": 2228 + }, + { + "epoch": 1.6413843888070692, + "grad_norm": 0.44195055961608887, + "learning_rate": 5.051416322155519e-06, + "loss": 0.397, + "step": 2229 + }, + { + "epoch": 1.642120765832106, + "grad_norm": 0.4374493956565857, + "learning_rate": 5.047131761323115e-06, + "loss": 0.3965, + "step": 2230 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 0.39338651299476624, + "learning_rate": 5.0428471658785715e-06, + "loss": 0.3714, + "step": 2231 + }, + { + "epoch": 1.6435935198821796, + "grad_norm": 0.4788668155670166, + "learning_rate": 5.038562538968363e-06, + "loss": 0.4252, + "step": 2232 + }, + { + "epoch": 1.6443298969072164, + "grad_norm": 0.4350014328956604, + "learning_rate": 5.034277883738992e-06, + "loss": 0.3947, + "step": 2233 + }, + { + "epoch": 1.6450662739322532, + "grad_norm": 0.3968852460384369, + "learning_rate": 5.029993203336978e-06, + "loss": 0.4066, + "step": 2234 + }, + { + "epoch": 1.64580265095729, + "grad_norm": 0.3912464678287506, + "learning_rate": 5.025708500908864e-06, + "loss": 0.4011, + "step": 2235 + }, + { + "epoch": 1.6465390279823269, + "grad_norm": 0.4370138943195343, + "learning_rate": 5.021423779601202e-06, + "loss": 0.3789, + "step": 2236 + }, + { + "epoch": 1.6472754050073637, + "grad_norm": 0.39066141843795776, + "learning_rate": 5.017139042560564e-06, + "loss": 0.3597, + "step": 2237 + }, + { + "epoch": 1.6480117820324005, + "grad_norm": 0.45047131180763245, + "learning_rate": 5.01285429293353e-06, + "loss": 0.4255, + "step": 2238 + }, + { + "epoch": 1.6487481590574373, + "grad_norm": 0.4357801377773285, + "learning_rate": 5.008569533866693e-06, + "loss": 0.4201, + "step": 2239 + }, + { + "epoch": 1.6494845360824741, + "grad_norm": 0.3798350393772125, + "learning_rate": 5.00428476850665e-06, + "loss": 0.3872, + "step": 2240 + }, + { + "epoch": 1.650220913107511, + "grad_norm": 0.4238215386867523, + "learning_rate": 5e-06, + "loss": 0.4003, + "step": 2241 + }, + { + "epoch": 1.6509572901325478, + "grad_norm": 0.4555947482585907, + "learning_rate": 4.995715231493352e-06, + "loss": 0.4197, + "step": 2242 + }, + { + "epoch": 1.6516936671575846, + "grad_norm": 0.3676789700984955, + "learning_rate": 4.991430466133308e-06, + "loss": 0.3704, + "step": 2243 + }, + { + "epoch": 1.6524300441826214, + "grad_norm": 0.43429771065711975, + "learning_rate": 4.98714570706647e-06, + "loss": 0.4075, + "step": 2244 + }, + { + "epoch": 1.6531664212076582, + "grad_norm": 0.4155738055706024, + "learning_rate": 4.982860957439437e-06, + "loss": 0.4313, + "step": 2245 + }, + { + "epoch": 1.653902798232695, + "grad_norm": 0.38362860679626465, + "learning_rate": 4.978576220398801e-06, + "loss": 0.4021, + "step": 2246 + }, + { + "epoch": 1.6546391752577319, + "grad_norm": 0.39530959725379944, + "learning_rate": 4.97429149909114e-06, + "loss": 0.3926, + "step": 2247 + }, + { + "epoch": 1.6553755522827687, + "grad_norm": 0.36841920018196106, + "learning_rate": 4.970006796663023e-06, + "loss": 0.4198, + "step": 2248 + }, + { + "epoch": 1.6561119293078055, + "grad_norm": 0.34673774242401123, + "learning_rate": 4.965722116261009e-06, + "loss": 0.413, + "step": 2249 + }, + { + "epoch": 1.6568483063328423, + "grad_norm": 0.35781288146972656, + "learning_rate": 4.961437461031638e-06, + "loss": 0.4232, + "step": 2250 + }, + { + "epoch": 1.6575846833578791, + "grad_norm": 0.4127185046672821, + "learning_rate": 4.95715283412143e-06, + "loss": 0.4162, + "step": 2251 + }, + { + "epoch": 1.658321060382916, + "grad_norm": 0.3733428120613098, + "learning_rate": 4.952868238676885e-06, + "loss": 0.3892, + "step": 2252 + }, + { + "epoch": 1.6590574374079528, + "grad_norm": 0.46769607067108154, + "learning_rate": 4.948583677844482e-06, + "loss": 0.3946, + "step": 2253 + }, + { + "epoch": 1.6597938144329896, + "grad_norm": 0.39663955569267273, + "learning_rate": 4.944299154770673e-06, + "loss": 0.3957, + "step": 2254 + }, + { + "epoch": 1.6605301914580264, + "grad_norm": 0.3862704932689667, + "learning_rate": 4.940014672601881e-06, + "loss": 0.4042, + "step": 2255 + }, + { + "epoch": 1.6612665684830632, + "grad_norm": 0.36831381916999817, + "learning_rate": 4.9357302344845005e-06, + "loss": 0.388, + "step": 2256 + }, + { + "epoch": 1.6620029455081, + "grad_norm": 0.4497531056404114, + "learning_rate": 4.931445843564893e-06, + "loss": 0.4393, + "step": 2257 + }, + { + "epoch": 1.6627393225331368, + "grad_norm": 0.4758419692516327, + "learning_rate": 4.927161502989387e-06, + "loss": 0.3868, + "step": 2258 + }, + { + "epoch": 1.6634756995581736, + "grad_norm": 0.3863910734653473, + "learning_rate": 4.922877215904272e-06, + "loss": 0.4029, + "step": 2259 + }, + { + "epoch": 1.6642120765832105, + "grad_norm": 0.4318389892578125, + "learning_rate": 4.918592985455799e-06, + "loss": 0.4101, + "step": 2260 + }, + { + "epoch": 1.6649484536082473, + "grad_norm": 0.44303011894226074, + "learning_rate": 4.914308814790178e-06, + "loss": 0.3769, + "step": 2261 + }, + { + "epoch": 1.665684830633284, + "grad_norm": 0.4603565037250519, + "learning_rate": 4.910024707053573e-06, + "loss": 0.417, + "step": 2262 + }, + { + "epoch": 1.666421207658321, + "grad_norm": 0.3724450170993805, + "learning_rate": 4.905740665392106e-06, + "loss": 0.4129, + "step": 2263 + }, + { + "epoch": 1.6671575846833577, + "grad_norm": 0.4588872790336609, + "learning_rate": 4.901456692951844e-06, + "loss": 0.4192, + "step": 2264 + }, + { + "epoch": 1.6678939617083945, + "grad_norm": 0.36367759108543396, + "learning_rate": 4.89717279287881e-06, + "loss": 0.4206, + "step": 2265 + }, + { + "epoch": 1.6686303387334314, + "grad_norm": 0.3877205550670624, + "learning_rate": 4.892888968318968e-06, + "loss": 0.4204, + "step": 2266 + }, + { + "epoch": 1.6693667157584682, + "grad_norm": 0.4392288625240326, + "learning_rate": 4.888605222418232e-06, + "loss": 0.4133, + "step": 2267 + }, + { + "epoch": 1.670103092783505, + "grad_norm": 0.35620570182800293, + "learning_rate": 4.884321558322455e-06, + "loss": 0.3808, + "step": 2268 + }, + { + "epoch": 1.6708394698085418, + "grad_norm": 0.39953136444091797, + "learning_rate": 4.8800379791774285e-06, + "loss": 0.4151, + "step": 2269 + }, + { + "epoch": 1.6715758468335786, + "grad_norm": 0.46267053484916687, + "learning_rate": 4.875754488128885e-06, + "loss": 0.3956, + "step": 2270 + }, + { + "epoch": 1.6723122238586157, + "grad_norm": 0.401598185300827, + "learning_rate": 4.871471088322493e-06, + "loss": 0.4193, + "step": 2271 + }, + { + "epoch": 1.6730486008836525, + "grad_norm": 0.41850388050079346, + "learning_rate": 4.867187782903847e-06, + "loss": 0.3883, + "step": 2272 + }, + { + "epoch": 1.6737849779086893, + "grad_norm": 0.47216498851776123, + "learning_rate": 4.862904575018482e-06, + "loss": 0.4087, + "step": 2273 + }, + { + "epoch": 1.6745213549337261, + "grad_norm": 0.416972279548645, + "learning_rate": 4.8586214678118536e-06, + "loss": 0.3824, + "step": 2274 + }, + { + "epoch": 1.675257731958763, + "grad_norm": 0.41057777404785156, + "learning_rate": 4.854338464429346e-06, + "loss": 0.4095, + "step": 2275 + }, + { + "epoch": 1.6759941089837997, + "grad_norm": 0.41428229212760925, + "learning_rate": 4.850055568016268e-06, + "loss": 0.4096, + "step": 2276 + }, + { + "epoch": 1.6767304860088366, + "grad_norm": 0.5290152430534363, + "learning_rate": 4.845772781717852e-06, + "loss": 0.3914, + "step": 2277 + }, + { + "epoch": 1.6774668630338734, + "grad_norm": 0.41980114579200745, + "learning_rate": 4.841490108679242e-06, + "loss": 0.3946, + "step": 2278 + }, + { + "epoch": 1.6782032400589102, + "grad_norm": 0.3944283127784729, + "learning_rate": 4.837207552045509e-06, + "loss": 0.3936, + "step": 2279 + }, + { + "epoch": 1.678939617083947, + "grad_norm": 0.46467381715774536, + "learning_rate": 4.832925114961629e-06, + "loss": 0.4159, + "step": 2280 + }, + { + "epoch": 1.6796759941089838, + "grad_norm": 0.4479691982269287, + "learning_rate": 4.828642800572495e-06, + "loss": 0.3904, + "step": 2281 + }, + { + "epoch": 1.6804123711340206, + "grad_norm": 0.39899787306785583, + "learning_rate": 4.8243606120229095e-06, + "loss": 0.4172, + "step": 2282 + }, + { + "epoch": 1.6811487481590575, + "grad_norm": 0.4519674479961395, + "learning_rate": 4.820078552457584e-06, + "loss": 0.401, + "step": 2283 + }, + { + "epoch": 1.6818851251840943, + "grad_norm": 0.405024915933609, + "learning_rate": 4.815796625021132e-06, + "loss": 0.4126, + "step": 2284 + }, + { + "epoch": 1.682621502209131, + "grad_norm": 0.42666709423065186, + "learning_rate": 4.811514832858072e-06, + "loss": 0.4016, + "step": 2285 + }, + { + "epoch": 1.683357879234168, + "grad_norm": 0.49751371145248413, + "learning_rate": 4.8072331791128244e-06, + "loss": 0.4145, + "step": 2286 + }, + { + "epoch": 1.6840942562592047, + "grad_norm": 0.48239147663116455, + "learning_rate": 4.802951666929704e-06, + "loss": 0.4144, + "step": 2287 + }, + { + "epoch": 1.6848306332842415, + "grad_norm": 0.41547465324401855, + "learning_rate": 4.798670299452926e-06, + "loss": 0.4038, + "step": 2288 + }, + { + "epoch": 1.6855670103092784, + "grad_norm": 0.4863029420375824, + "learning_rate": 4.794389079826594e-06, + "loss": 0.3982, + "step": 2289 + }, + { + "epoch": 1.6863033873343152, + "grad_norm": 0.48596158623695374, + "learning_rate": 4.790108011194709e-06, + "loss": 0.4637, + "step": 2290 + }, + { + "epoch": 1.687039764359352, + "grad_norm": 0.3679700791835785, + "learning_rate": 4.785827096701159e-06, + "loss": 0.3796, + "step": 2291 + }, + { + "epoch": 1.6877761413843888, + "grad_norm": 0.39074385166168213, + "learning_rate": 4.781546339489716e-06, + "loss": 0.4275, + "step": 2292 + }, + { + "epoch": 1.6885125184094256, + "grad_norm": 0.47816234827041626, + "learning_rate": 4.777265742704039e-06, + "loss": 0.4048, + "step": 2293 + }, + { + "epoch": 1.6892488954344624, + "grad_norm": 0.4297253489494324, + "learning_rate": 4.7729853094876714e-06, + "loss": 0.3735, + "step": 2294 + }, + { + "epoch": 1.6899852724594993, + "grad_norm": 0.3920181095600128, + "learning_rate": 4.768705042984031e-06, + "loss": 0.4444, + "step": 2295 + }, + { + "epoch": 1.690721649484536, + "grad_norm": 0.35152071714401245, + "learning_rate": 4.7644249463364205e-06, + "loss": 0.4234, + "step": 2296 + }, + { + "epoch": 1.6914580265095729, + "grad_norm": 0.4087672233581543, + "learning_rate": 4.760145022688007e-06, + "loss": 0.4192, + "step": 2297 + }, + { + "epoch": 1.6921944035346097, + "grad_norm": 0.40535783767700195, + "learning_rate": 4.755865275181843e-06, + "loss": 0.4036, + "step": 2298 + }, + { + "epoch": 1.6929307805596465, + "grad_norm": 0.4112594425678253, + "learning_rate": 4.751585706960842e-06, + "loss": 0.4382, + "step": 2299 + }, + { + "epoch": 1.6936671575846833, + "grad_norm": 0.40678101778030396, + "learning_rate": 4.747306321167791e-06, + "loss": 0.4198, + "step": 2300 + }, + { + "epoch": 1.6944035346097202, + "grad_norm": 0.38269445300102234, + "learning_rate": 4.743027120945342e-06, + "loss": 0.4061, + "step": 2301 + }, + { + "epoch": 1.695139911634757, + "grad_norm": 0.47511664032936096, + "learning_rate": 4.73874810943601e-06, + "loss": 0.4329, + "step": 2302 + }, + { + "epoch": 1.6958762886597938, + "grad_norm": 0.3984917998313904, + "learning_rate": 4.7344692897821714e-06, + "loss": 0.4191, + "step": 2303 + }, + { + "epoch": 1.6966126656848306, + "grad_norm": 0.37280645966529846, + "learning_rate": 4.7301906651260634e-06, + "loss": 0.4067, + "step": 2304 + }, + { + "epoch": 1.6973490427098674, + "grad_norm": 0.3684954047203064, + "learning_rate": 4.725912238609779e-06, + "loss": 0.4016, + "step": 2305 + }, + { + "epoch": 1.6980854197349042, + "grad_norm": 0.3745935559272766, + "learning_rate": 4.7216340133752604e-06, + "loss": 0.3998, + "step": 2306 + }, + { + "epoch": 1.698821796759941, + "grad_norm": 0.360403448343277, + "learning_rate": 4.717355992564311e-06, + "loss": 0.3958, + "step": 2307 + }, + { + "epoch": 1.6995581737849779, + "grad_norm": 0.3953647017478943, + "learning_rate": 4.7130781793185805e-06, + "loss": 0.404, + "step": 2308 + }, + { + "epoch": 1.7002945508100147, + "grad_norm": 0.40129730105400085, + "learning_rate": 4.708800576779564e-06, + "loss": 0.4205, + "step": 2309 + }, + { + "epoch": 1.7010309278350515, + "grad_norm": 0.3984296917915344, + "learning_rate": 4.704523188088604e-06, + "loss": 0.3888, + "step": 2310 + }, + { + "epoch": 1.7017673048600883, + "grad_norm": 0.37107083201408386, + "learning_rate": 4.700246016386887e-06, + "loss": 0.3904, + "step": 2311 + }, + { + "epoch": 1.7025036818851251, + "grad_norm": 0.38005733489990234, + "learning_rate": 4.695969064815436e-06, + "loss": 0.4087, + "step": 2312 + }, + { + "epoch": 1.7032400589101622, + "grad_norm": 0.37871333956718445, + "learning_rate": 4.6916923365151185e-06, + "loss": 0.4028, + "step": 2313 + }, + { + "epoch": 1.703976435935199, + "grad_norm": 0.3813686668872833, + "learning_rate": 4.68741583462663e-06, + "loss": 0.3869, + "step": 2314 + }, + { + "epoch": 1.7047128129602358, + "grad_norm": 0.37767162919044495, + "learning_rate": 4.683139562290506e-06, + "loss": 0.4025, + "step": 2315 + }, + { + "epoch": 1.7054491899852726, + "grad_norm": 0.38888660073280334, + "learning_rate": 4.678863522647114e-06, + "loss": 0.3863, + "step": 2316 + }, + { + "epoch": 1.7061855670103094, + "grad_norm": 0.4239829182624817, + "learning_rate": 4.6745877188366464e-06, + "loss": 0.3987, + "step": 2317 + }, + { + "epoch": 1.7069219440353463, + "grad_norm": 0.38922736048698425, + "learning_rate": 4.670312153999123e-06, + "loss": 0.3987, + "step": 2318 + }, + { + "epoch": 1.707658321060383, + "grad_norm": 0.42677411437034607, + "learning_rate": 4.666036831274392e-06, + "loss": 0.3937, + "step": 2319 + }, + { + "epoch": 1.7083946980854199, + "grad_norm": 0.43811002373695374, + "learning_rate": 4.66176175380212e-06, + "loss": 0.4189, + "step": 2320 + }, + { + "epoch": 1.7091310751104567, + "grad_norm": 0.41332265734672546, + "learning_rate": 4.657486924721797e-06, + "loss": 0.4231, + "step": 2321 + }, + { + "epoch": 1.7098674521354935, + "grad_norm": 0.41495949029922485, + "learning_rate": 4.653212347172723e-06, + "loss": 0.3893, + "step": 2322 + }, + { + "epoch": 1.7106038291605303, + "grad_norm": 0.42686596512794495, + "learning_rate": 4.648938024294023e-06, + "loss": 0.4398, + "step": 2323 + }, + { + "epoch": 1.7113402061855671, + "grad_norm": 0.38408711552619934, + "learning_rate": 4.644663959224629e-06, + "loss": 0.4055, + "step": 2324 + }, + { + "epoch": 1.712076583210604, + "grad_norm": 0.4538039565086365, + "learning_rate": 4.640390155103285e-06, + "loss": 0.4302, + "step": 2325 + }, + { + "epoch": 1.7128129602356408, + "grad_norm": 0.40232333540916443, + "learning_rate": 4.636116615068545e-06, + "loss": 0.4076, + "step": 2326 + }, + { + "epoch": 1.7135493372606776, + "grad_norm": 0.45923569798469543, + "learning_rate": 4.631843342258765e-06, + "loss": 0.4284, + "step": 2327 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.390129953622818, + "learning_rate": 4.627570339812109e-06, + "loss": 0.4171, + "step": 2328 + }, + { + "epoch": 1.7150220913107512, + "grad_norm": 0.44369444251060486, + "learning_rate": 4.623297610866544e-06, + "loss": 0.4287, + "step": 2329 + }, + { + "epoch": 1.715758468335788, + "grad_norm": 0.42557501792907715, + "learning_rate": 4.619025158559829e-06, + "loss": 0.4052, + "step": 2330 + }, + { + "epoch": 1.7164948453608249, + "grad_norm": 0.3788171708583832, + "learning_rate": 4.614752986029524e-06, + "loss": 0.3884, + "step": 2331 + }, + { + "epoch": 1.7172312223858617, + "grad_norm": 0.3878194987773895, + "learning_rate": 4.610481096412985e-06, + "loss": 0.4091, + "step": 2332 + }, + { + "epoch": 1.7179675994108985, + "grad_norm": 0.40998604893684387, + "learning_rate": 4.606209492847356e-06, + "loss": 0.379, + "step": 2333 + }, + { + "epoch": 1.7187039764359353, + "grad_norm": 0.3821749985218048, + "learning_rate": 4.6019381784695774e-06, + "loss": 0.4005, + "step": 2334 + }, + { + "epoch": 1.7194403534609721, + "grad_norm": 0.35601040720939636, + "learning_rate": 4.597667156416371e-06, + "loss": 0.3966, + "step": 2335 + }, + { + "epoch": 1.720176730486009, + "grad_norm": 0.40410375595092773, + "learning_rate": 4.5933964298242465e-06, + "loss": 0.3946, + "step": 2336 + }, + { + "epoch": 1.7209131075110458, + "grad_norm": 0.37720221281051636, + "learning_rate": 4.589126001829497e-06, + "loss": 0.3987, + "step": 2337 + }, + { + "epoch": 1.7216494845360826, + "grad_norm": 0.3496411144733429, + "learning_rate": 4.584855875568198e-06, + "loss": 0.4229, + "step": 2338 + }, + { + "epoch": 1.7223858615611194, + "grad_norm": 0.3647906482219696, + "learning_rate": 4.580586054176196e-06, + "loss": 0.4183, + "step": 2339 + }, + { + "epoch": 1.7231222385861562, + "grad_norm": 0.36781540513038635, + "learning_rate": 4.576316540789122e-06, + "loss": 0.3892, + "step": 2340 + }, + { + "epoch": 1.723858615611193, + "grad_norm": 0.36236733198165894, + "learning_rate": 4.572047338542377e-06, + "loss": 0.4132, + "step": 2341 + }, + { + "epoch": 1.7245949926362298, + "grad_norm": 0.3683478832244873, + "learning_rate": 4.567778450571135e-06, + "loss": 0.4394, + "step": 2342 + }, + { + "epoch": 1.7253313696612667, + "grad_norm": 0.378169447183609, + "learning_rate": 4.563509880010336e-06, + "loss": 0.3853, + "step": 2343 + }, + { + "epoch": 1.7260677466863035, + "grad_norm": 0.3431897759437561, + "learning_rate": 4.559241629994693e-06, + "loss": 0.4018, + "step": 2344 + }, + { + "epoch": 1.7268041237113403, + "grad_norm": 0.3948942720890045, + "learning_rate": 4.554973703658676e-06, + "loss": 0.3987, + "step": 2345 + }, + { + "epoch": 1.727540500736377, + "grad_norm": 0.36369240283966064, + "learning_rate": 4.550706104136523e-06, + "loss": 0.3903, + "step": 2346 + }, + { + "epoch": 1.728276877761414, + "grad_norm": 0.40486857295036316, + "learning_rate": 4.546438834562232e-06, + "loss": 0.4185, + "step": 2347 + }, + { + "epoch": 1.7290132547864507, + "grad_norm": 0.3831326365470886, + "learning_rate": 4.542171898069553e-06, + "loss": 0.3756, + "step": 2348 + }, + { + "epoch": 1.7297496318114876, + "grad_norm": 0.40144047141075134, + "learning_rate": 4.537905297791997e-06, + "loss": 0.3718, + "step": 2349 + }, + { + "epoch": 1.7304860088365244, + "grad_norm": 0.38407352566719055, + "learning_rate": 4.5336390368628265e-06, + "loss": 0.4093, + "step": 2350 + }, + { + "epoch": 1.7312223858615612, + "grad_norm": 0.37621355056762695, + "learning_rate": 4.529373118415053e-06, + "loss": 0.4144, + "step": 2351 + }, + { + "epoch": 1.731958762886598, + "grad_norm": 0.34284254908561707, + "learning_rate": 4.525107545581442e-06, + "loss": 0.4234, + "step": 2352 + }, + { + "epoch": 1.7326951399116348, + "grad_norm": 0.42767757177352905, + "learning_rate": 4.5208423214944975e-06, + "loss": 0.4072, + "step": 2353 + }, + { + "epoch": 1.7334315169366716, + "grad_norm": 0.3716091215610504, + "learning_rate": 4.5165774492864715e-06, + "loss": 0.3886, + "step": 2354 + }, + { + "epoch": 1.7341678939617085, + "grad_norm": 0.35501131415367126, + "learning_rate": 4.512312932089361e-06, + "loss": 0.405, + "step": 2355 + }, + { + "epoch": 1.7349042709867453, + "grad_norm": 0.4200528562068939, + "learning_rate": 4.508048773034895e-06, + "loss": 0.4098, + "step": 2356 + }, + { + "epoch": 1.735640648011782, + "grad_norm": 0.3673626780509949, + "learning_rate": 4.503784975254543e-06, + "loss": 0.3913, + "step": 2357 + }, + { + "epoch": 1.736377025036819, + "grad_norm": 0.39019590616226196, + "learning_rate": 4.499521541879508e-06, + "loss": 0.4088, + "step": 2358 + }, + { + "epoch": 1.7371134020618557, + "grad_norm": 0.3743307888507843, + "learning_rate": 4.495258476040732e-06, + "loss": 0.4009, + "step": 2359 + }, + { + "epoch": 1.7378497790868925, + "grad_norm": 0.39364027976989746, + "learning_rate": 4.4909957808688765e-06, + "loss": 0.4161, + "step": 2360 + }, + { + "epoch": 1.7385861561119293, + "grad_norm": 0.3424508571624756, + "learning_rate": 4.486733459494338e-06, + "loss": 0.4315, + "step": 2361 + }, + { + "epoch": 1.7393225331369662, + "grad_norm": 0.3977791368961334, + "learning_rate": 4.482471515047237e-06, + "loss": 0.3828, + "step": 2362 + }, + { + "epoch": 1.740058910162003, + "grad_norm": 0.36530202627182007, + "learning_rate": 4.478209950657418e-06, + "loss": 0.399, + "step": 2363 + }, + { + "epoch": 1.7407952871870398, + "grad_norm": 0.40208595991134644, + "learning_rate": 4.4739487694544415e-06, + "loss": 0.4193, + "step": 2364 + }, + { + "epoch": 1.7415316642120766, + "grad_norm": 0.36320358514785767, + "learning_rate": 4.469687974567591e-06, + "loss": 0.3844, + "step": 2365 + }, + { + "epoch": 1.7422680412371134, + "grad_norm": 0.3930068016052246, + "learning_rate": 4.465427569125868e-06, + "loss": 0.4007, + "step": 2366 + }, + { + "epoch": 1.7430044182621502, + "grad_norm": 0.3515762984752655, + "learning_rate": 4.461167556257984e-06, + "loss": 0.422, + "step": 2367 + }, + { + "epoch": 1.743740795287187, + "grad_norm": 0.39239639043807983, + "learning_rate": 4.456907939092363e-06, + "loss": 0.4105, + "step": 2368 + }, + { + "epoch": 1.7444771723122239, + "grad_norm": 0.3900611102581024, + "learning_rate": 4.45264872075714e-06, + "loss": 0.3909, + "step": 2369 + }, + { + "epoch": 1.7452135493372607, + "grad_norm": 0.38840755820274353, + "learning_rate": 4.448389904380156e-06, + "loss": 0.4077, + "step": 2370 + }, + { + "epoch": 1.7459499263622975, + "grad_norm": 0.3633229732513428, + "learning_rate": 4.444131493088956e-06, + "loss": 0.4119, + "step": 2371 + }, + { + "epoch": 1.7466863033873343, + "grad_norm": 0.3752785325050354, + "learning_rate": 4.4398734900107935e-06, + "loss": 0.3841, + "step": 2372 + }, + { + "epoch": 1.7474226804123711, + "grad_norm": 0.42519140243530273, + "learning_rate": 4.43561589827261e-06, + "loss": 0.4221, + "step": 2373 + }, + { + "epoch": 1.748159057437408, + "grad_norm": 0.40567177534103394, + "learning_rate": 4.431358721001058e-06, + "loss": 0.401, + "step": 2374 + }, + { + "epoch": 1.7488954344624448, + "grad_norm": 0.335718035697937, + "learning_rate": 4.427101961322475e-06, + "loss": 0.3703, + "step": 2375 + }, + { + "epoch": 1.7496318114874816, + "grad_norm": 0.39199185371398926, + "learning_rate": 4.422845622362901e-06, + "loss": 0.3967, + "step": 2376 + }, + { + "epoch": 1.7503681885125184, + "grad_norm": 0.3723236918449402, + "learning_rate": 4.418589707248061e-06, + "loss": 0.3873, + "step": 2377 + }, + { + "epoch": 1.7511045655375552, + "grad_norm": 0.41921404004096985, + "learning_rate": 4.414334219103369e-06, + "loss": 0.3941, + "step": 2378 + }, + { + "epoch": 1.751840942562592, + "grad_norm": 0.37618371844291687, + "learning_rate": 4.4100791610539285e-06, + "loss": 0.3763, + "step": 2379 + }, + { + "epoch": 1.7525773195876289, + "grad_norm": 0.4034104645252228, + "learning_rate": 4.4058245362245276e-06, + "loss": 0.3917, + "step": 2380 + }, + { + "epoch": 1.7533136966126657, + "grad_norm": 0.43745455145835876, + "learning_rate": 4.401570347739631e-06, + "loss": 0.4249, + "step": 2381 + }, + { + "epoch": 1.7540500736377025, + "grad_norm": 0.3758779466152191, + "learning_rate": 4.397316598723385e-06, + "loss": 0.413, + "step": 2382 + }, + { + "epoch": 1.7547864506627393, + "grad_norm": 0.3742757737636566, + "learning_rate": 4.393063292299618e-06, + "loss": 0.4179, + "step": 2383 + }, + { + "epoch": 1.7555228276877761, + "grad_norm": 0.4073251783847809, + "learning_rate": 4.388810431591829e-06, + "loss": 0.4235, + "step": 2384 + }, + { + "epoch": 1.756259204712813, + "grad_norm": 0.4208831787109375, + "learning_rate": 4.384558019723188e-06, + "loss": 0.4119, + "step": 2385 + }, + { + "epoch": 1.7569955817378498, + "grad_norm": 0.41460785269737244, + "learning_rate": 4.380306059816539e-06, + "loss": 0.3966, + "step": 2386 + }, + { + "epoch": 1.7577319587628866, + "grad_norm": 0.4289226830005646, + "learning_rate": 4.376054554994394e-06, + "loss": 0.4164, + "step": 2387 + }, + { + "epoch": 1.7584683357879234, + "grad_norm": 0.41257333755493164, + "learning_rate": 4.371803508378929e-06, + "loss": 0.3822, + "step": 2388 + }, + { + "epoch": 1.7592047128129602, + "grad_norm": 0.44865962862968445, + "learning_rate": 4.367552923091985e-06, + "loss": 0.395, + "step": 2389 + }, + { + "epoch": 1.759941089837997, + "grad_norm": 0.37914708256721497, + "learning_rate": 4.363302802255062e-06, + "loss": 0.4037, + "step": 2390 + }, + { + "epoch": 1.7606774668630338, + "grad_norm": 0.3959915339946747, + "learning_rate": 4.359053148989319e-06, + "loss": 0.3997, + "step": 2391 + }, + { + "epoch": 1.7614138438880707, + "grad_norm": 0.5142814517021179, + "learning_rate": 4.354803966415576e-06, + "loss": 0.4012, + "step": 2392 + }, + { + "epoch": 1.7621502209131075, + "grad_norm": 0.4417358338832855, + "learning_rate": 4.350555257654302e-06, + "loss": 0.3962, + "step": 2393 + }, + { + "epoch": 1.7628865979381443, + "grad_norm": 0.41511955857276917, + "learning_rate": 4.346307025825621e-06, + "loss": 0.407, + "step": 2394 + }, + { + "epoch": 1.763622974963181, + "grad_norm": 0.3971770405769348, + "learning_rate": 4.342059274049308e-06, + "loss": 0.3975, + "step": 2395 + }, + { + "epoch": 1.764359351988218, + "grad_norm": 0.43736016750335693, + "learning_rate": 4.33781200544478e-06, + "loss": 0.3925, + "step": 2396 + }, + { + "epoch": 1.7650957290132547, + "grad_norm": 0.3964605927467346, + "learning_rate": 4.333565223131107e-06, + "loss": 0.3938, + "step": 2397 + }, + { + "epoch": 1.7658321060382915, + "grad_norm": 0.4375699758529663, + "learning_rate": 4.329318930226993e-06, + "loss": 0.4229, + "step": 2398 + }, + { + "epoch": 1.7665684830633284, + "grad_norm": 0.34781819581985474, + "learning_rate": 4.325073129850791e-06, + "loss": 0.3878, + "step": 2399 + }, + { + "epoch": 1.7673048600883652, + "grad_norm": 0.39587196707725525, + "learning_rate": 4.320827825120485e-06, + "loss": 0.4248, + "step": 2400 + }, + { + "epoch": 1.768041237113402, + "grad_norm": 0.47028040885925293, + "learning_rate": 4.3165830191537016e-06, + "loss": 0.3747, + "step": 2401 + }, + { + "epoch": 1.7687776141384388, + "grad_norm": 0.3784812390804291, + "learning_rate": 4.312338715067697e-06, + "loss": 0.3972, + "step": 2402 + }, + { + "epoch": 1.7695139911634756, + "grad_norm": 0.45428574085235596, + "learning_rate": 4.308094915979359e-06, + "loss": 0.4051, + "step": 2403 + }, + { + "epoch": 1.7702503681885124, + "grad_norm": 0.4495220184326172, + "learning_rate": 4.303851625005205e-06, + "loss": 0.4132, + "step": 2404 + }, + { + "epoch": 1.7709867452135493, + "grad_norm": 0.39823389053344727, + "learning_rate": 4.2996088452613835e-06, + "loss": 0.4036, + "step": 2405 + }, + { + "epoch": 1.771723122238586, + "grad_norm": 0.4087408483028412, + "learning_rate": 4.295366579863658e-06, + "loss": 0.3894, + "step": 2406 + }, + { + "epoch": 1.772459499263623, + "grad_norm": 0.5459381937980652, + "learning_rate": 4.291124831927425e-06, + "loss": 0.4391, + "step": 2407 + }, + { + "epoch": 1.7731958762886597, + "grad_norm": 0.4444357752799988, + "learning_rate": 4.286883604567693e-06, + "loss": 0.3851, + "step": 2408 + }, + { + "epoch": 1.7739322533136965, + "grad_norm": 0.4461228549480438, + "learning_rate": 4.282642900899092e-06, + "loss": 0.4251, + "step": 2409 + }, + { + "epoch": 1.7746686303387333, + "grad_norm": 0.41404518485069275, + "learning_rate": 4.278402724035868e-06, + "loss": 0.4339, + "step": 2410 + }, + { + "epoch": 1.7754050073637702, + "grad_norm": 0.4226773977279663, + "learning_rate": 4.274163077091876e-06, + "loss": 0.42, + "step": 2411 + }, + { + "epoch": 1.776141384388807, + "grad_norm": 0.44457805156707764, + "learning_rate": 4.269923963180587e-06, + "loss": 0.4236, + "step": 2412 + }, + { + "epoch": 1.7768777614138438, + "grad_norm": 0.4070783853530884, + "learning_rate": 4.265685385415077e-06, + "loss": 0.3761, + "step": 2413 + }, + { + "epoch": 1.7776141384388806, + "grad_norm": 0.3834463953971863, + "learning_rate": 4.261447346908032e-06, + "loss": 0.4151, + "step": 2414 + }, + { + "epoch": 1.7783505154639174, + "grad_norm": 0.41372165083885193, + "learning_rate": 4.257209850771734e-06, + "loss": 0.3919, + "step": 2415 + }, + { + "epoch": 1.7790868924889542, + "grad_norm": 0.4155184328556061, + "learning_rate": 4.2529729001180765e-06, + "loss": 0.4034, + "step": 2416 + }, + { + "epoch": 1.779823269513991, + "grad_norm": 0.3758399784564972, + "learning_rate": 4.248736498058547e-06, + "loss": 0.4108, + "step": 2417 + }, + { + "epoch": 1.7805596465390279, + "grad_norm": 0.38244372606277466, + "learning_rate": 4.24450064770423e-06, + "loss": 0.3716, + "step": 2418 + }, + { + "epoch": 1.7812960235640647, + "grad_norm": 0.4086960256099701, + "learning_rate": 4.240265352165806e-06, + "loss": 0.3858, + "step": 2419 + }, + { + "epoch": 1.7820324005891015, + "grad_norm": 0.37736374139785767, + "learning_rate": 4.236030614553552e-06, + "loss": 0.4, + "step": 2420 + }, + { + "epoch": 1.7827687776141383, + "grad_norm": 0.4141034185886383, + "learning_rate": 4.2317964379773265e-06, + "loss": 0.4311, + "step": 2421 + }, + { + "epoch": 1.7835051546391751, + "grad_norm": 0.37181559205055237, + "learning_rate": 4.2275628255465846e-06, + "loss": 0.3799, + "step": 2422 + }, + { + "epoch": 1.784241531664212, + "grad_norm": 0.3771205246448517, + "learning_rate": 4.223329780370359e-06, + "loss": 0.4282, + "step": 2423 + }, + { + "epoch": 1.7849779086892488, + "grad_norm": 0.4032301604747772, + "learning_rate": 4.219097305557274e-06, + "loss": 0.4305, + "step": 2424 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 0.39853084087371826, + "learning_rate": 4.214865404215528e-06, + "loss": 0.3973, + "step": 2425 + }, + { + "epoch": 1.7864506627393224, + "grad_norm": 0.35678479075431824, + "learning_rate": 4.210634079452904e-06, + "loss": 0.368, + "step": 2426 + }, + { + "epoch": 1.7871870397643592, + "grad_norm": 0.3927466869354248, + "learning_rate": 4.206403334376757e-06, + "loss": 0.4107, + "step": 2427 + }, + { + "epoch": 1.787923416789396, + "grad_norm": 0.3804020285606384, + "learning_rate": 4.202173172094022e-06, + "loss": 0.3749, + "step": 2428 + }, + { + "epoch": 1.7886597938144329, + "grad_norm": 0.3756907284259796, + "learning_rate": 4.1979435957111984e-06, + "loss": 0.4019, + "step": 2429 + }, + { + "epoch": 1.7893961708394697, + "grad_norm": 0.3835069239139557, + "learning_rate": 4.193714608334361e-06, + "loss": 0.383, + "step": 2430 + }, + { + "epoch": 1.7901325478645065, + "grad_norm": 0.41579073667526245, + "learning_rate": 4.189486213069152e-06, + "loss": 0.4195, + "step": 2431 + }, + { + "epoch": 1.7908689248895433, + "grad_norm": 0.39560478925704956, + "learning_rate": 4.1852584130207745e-06, + "loss": 0.4095, + "step": 2432 + }, + { + "epoch": 1.7916053019145801, + "grad_norm": 0.4340824782848358, + "learning_rate": 4.181031211293997e-06, + "loss": 0.4334, + "step": 2433 + }, + { + "epoch": 1.792341678939617, + "grad_norm": 0.41419485211372375, + "learning_rate": 4.176804610993149e-06, + "loss": 0.4022, + "step": 2434 + }, + { + "epoch": 1.7930780559646537, + "grad_norm": 0.40266767144203186, + "learning_rate": 4.17257861522212e-06, + "loss": 0.4214, + "step": 2435 + }, + { + "epoch": 1.7938144329896906, + "grad_norm": 0.35882508754730225, + "learning_rate": 4.1683532270843505e-06, + "loss": 0.3855, + "step": 2436 + }, + { + "epoch": 1.7945508100147274, + "grad_norm": 0.37283796072006226, + "learning_rate": 4.16412844968284e-06, + "loss": 0.3659, + "step": 2437 + }, + { + "epoch": 1.7952871870397642, + "grad_norm": 0.3643818199634552, + "learning_rate": 4.1599042861201375e-06, + "loss": 0.4015, + "step": 2438 + }, + { + "epoch": 1.796023564064801, + "grad_norm": 0.38770896196365356, + "learning_rate": 4.155680739498342e-06, + "loss": 0.379, + "step": 2439 + }, + { + "epoch": 1.7967599410898378, + "grad_norm": 0.3598364293575287, + "learning_rate": 4.151457812919094e-06, + "loss": 0.4056, + "step": 2440 + }, + { + "epoch": 1.7974963181148749, + "grad_norm": 0.39613017439842224, + "learning_rate": 4.147235509483587e-06, + "loss": 0.3952, + "step": 2441 + }, + { + "epoch": 1.7982326951399117, + "grad_norm": 0.4370924234390259, + "learning_rate": 4.1430138322925535e-06, + "loss": 0.4109, + "step": 2442 + }, + { + "epoch": 1.7989690721649485, + "grad_norm": 0.3646906912326813, + "learning_rate": 4.138792784446263e-06, + "loss": 0.3945, + "step": 2443 + }, + { + "epoch": 1.7997054491899853, + "grad_norm": 0.34723278880119324, + "learning_rate": 4.134572369044526e-06, + "loss": 0.3952, + "step": 2444 + }, + { + "epoch": 1.8004418262150221, + "grad_norm": 0.40729156136512756, + "learning_rate": 4.1303525891866905e-06, + "loss": 0.4143, + "step": 2445 + }, + { + "epoch": 1.801178203240059, + "grad_norm": 0.4514944851398468, + "learning_rate": 4.126133447971633e-06, + "loss": 0.392, + "step": 2446 + }, + { + "epoch": 1.8019145802650958, + "grad_norm": 0.3769078254699707, + "learning_rate": 4.121914948497764e-06, + "loss": 0.3999, + "step": 2447 + }, + { + "epoch": 1.8026509572901326, + "grad_norm": 0.39932695031166077, + "learning_rate": 4.117697093863023e-06, + "loss": 0.4274, + "step": 2448 + }, + { + "epoch": 1.8033873343151694, + "grad_norm": 0.3611906170845032, + "learning_rate": 4.113479887164873e-06, + "loss": 0.3788, + "step": 2449 + }, + { + "epoch": 1.8041237113402062, + "grad_norm": 0.38393762707710266, + "learning_rate": 4.109263331500305e-06, + "loss": 0.4218, + "step": 2450 + }, + { + "epoch": 1.804860088365243, + "grad_norm": 0.42800167202949524, + "learning_rate": 4.105047429965828e-06, + "loss": 0.3984, + "step": 2451 + }, + { + "epoch": 1.8055964653902798, + "grad_norm": 0.38685426115989685, + "learning_rate": 4.1008321856574745e-06, + "loss": 0.4097, + "step": 2452 + }, + { + "epoch": 1.8063328424153167, + "grad_norm": 0.3722650408744812, + "learning_rate": 4.096617601670793e-06, + "loss": 0.4073, + "step": 2453 + }, + { + "epoch": 1.8070692194403535, + "grad_norm": 0.3909144997596741, + "learning_rate": 4.092403681100844e-06, + "loss": 0.403, + "step": 2454 + }, + { + "epoch": 1.8078055964653903, + "grad_norm": 0.42803841829299927, + "learning_rate": 4.0881904270422045e-06, + "loss": 0.3873, + "step": 2455 + }, + { + "epoch": 1.8085419734904271, + "grad_norm": 0.42633092403411865, + "learning_rate": 4.083977842588963e-06, + "loss": 0.3925, + "step": 2456 + }, + { + "epoch": 1.809278350515464, + "grad_norm": 0.3902944028377533, + "learning_rate": 4.079765930834714e-06, + "loss": 0.4063, + "step": 2457 + }, + { + "epoch": 1.8100147275405007, + "grad_norm": 0.3900681436061859, + "learning_rate": 4.075554694872554e-06, + "loss": 0.406, + "step": 2458 + }, + { + "epoch": 1.8107511045655376, + "grad_norm": 0.412332147359848, + "learning_rate": 4.071344137795091e-06, + "loss": 0.4169, + "step": 2459 + }, + { + "epoch": 1.8114874815905744, + "grad_norm": 0.39725714921951294, + "learning_rate": 4.067134262694431e-06, + "loss": 0.4163, + "step": 2460 + }, + { + "epoch": 1.8122238586156112, + "grad_norm": 0.36094245314598083, + "learning_rate": 4.062925072662177e-06, + "loss": 0.3998, + "step": 2461 + }, + { + "epoch": 1.812960235640648, + "grad_norm": 0.35459452867507935, + "learning_rate": 4.0587165707894326e-06, + "loss": 0.4008, + "step": 2462 + }, + { + "epoch": 1.8136966126656848, + "grad_norm": 0.3944514989852905, + "learning_rate": 4.054508760166795e-06, + "loss": 0.4151, + "step": 2463 + }, + { + "epoch": 1.8144329896907216, + "grad_norm": 0.41395866870880127, + "learning_rate": 4.050301643884352e-06, + "loss": 0.4049, + "step": 2464 + }, + { + "epoch": 1.8151693667157585, + "grad_norm": 0.3743531405925751, + "learning_rate": 4.046095225031683e-06, + "loss": 0.4226, + "step": 2465 + }, + { + "epoch": 1.8159057437407953, + "grad_norm": 0.4096841812133789, + "learning_rate": 4.0418895066978536e-06, + "loss": 0.3879, + "step": 2466 + }, + { + "epoch": 1.816642120765832, + "grad_norm": 0.41024044156074524, + "learning_rate": 4.037684491971417e-06, + "loss": 0.4129, + "step": 2467 + }, + { + "epoch": 1.817378497790869, + "grad_norm": 0.36897313594818115, + "learning_rate": 4.033480183940412e-06, + "loss": 0.391, + "step": 2468 + }, + { + "epoch": 1.8181148748159057, + "grad_norm": 0.3991081416606903, + "learning_rate": 4.029276585692349e-06, + "loss": 0.4263, + "step": 2469 + }, + { + "epoch": 1.8188512518409425, + "grad_norm": 0.43920043110847473, + "learning_rate": 4.0250737003142294e-06, + "loss": 0.433, + "step": 2470 + }, + { + "epoch": 1.8195876288659794, + "grad_norm": 0.47898420691490173, + "learning_rate": 4.0208715308925235e-06, + "loss": 0.4085, + "step": 2471 + }, + { + "epoch": 1.8203240058910162, + "grad_norm": 0.37598884105682373, + "learning_rate": 4.016670080513176e-06, + "loss": 0.4126, + "step": 2472 + }, + { + "epoch": 1.821060382916053, + "grad_norm": 0.36710143089294434, + "learning_rate": 4.012469352261608e-06, + "loss": 0.3943, + "step": 2473 + }, + { + "epoch": 1.8217967599410898, + "grad_norm": 0.42525747418403625, + "learning_rate": 4.0082693492227035e-06, + "loss": 0.3982, + "step": 2474 + }, + { + "epoch": 1.8225331369661266, + "grad_norm": 0.39572814106941223, + "learning_rate": 4.004070074480821e-06, + "loss": 0.3859, + "step": 2475 + }, + { + "epoch": 1.8232695139911634, + "grad_norm": 0.38209137320518494, + "learning_rate": 3.999871531119779e-06, + "loss": 0.3998, + "step": 2476 + }, + { + "epoch": 1.8240058910162003, + "grad_norm": 0.36567169427871704, + "learning_rate": 3.995673722222861e-06, + "loss": 0.397, + "step": 2477 + }, + { + "epoch": 1.824742268041237, + "grad_norm": 0.4170894920825958, + "learning_rate": 3.991476650872813e-06, + "loss": 0.4181, + "step": 2478 + }, + { + "epoch": 1.8254786450662739, + "grad_norm": 0.43204647302627563, + "learning_rate": 3.987280320151835e-06, + "loss": 0.4415, + "step": 2479 + }, + { + "epoch": 1.8262150220913107, + "grad_norm": 0.39173027873039246, + "learning_rate": 3.983084733141588e-06, + "loss": 0.3853, + "step": 2480 + }, + { + "epoch": 1.8269513991163475, + "grad_norm": 0.4281696081161499, + "learning_rate": 3.978889892923183e-06, + "loss": 0.4305, + "step": 2481 + }, + { + "epoch": 1.8276877761413843, + "grad_norm": 0.44608938694000244, + "learning_rate": 3.974695802577184e-06, + "loss": 0.4322, + "step": 2482 + }, + { + "epoch": 1.8284241531664214, + "grad_norm": 0.4017941951751709, + "learning_rate": 3.970502465183602e-06, + "loss": 0.4158, + "step": 2483 + }, + { + "epoch": 1.8291605301914582, + "grad_norm": 0.4041925072669983, + "learning_rate": 3.966309883821901e-06, + "loss": 0.4031, + "step": 2484 + }, + { + "epoch": 1.829896907216495, + "grad_norm": 0.4443046748638153, + "learning_rate": 3.962118061570982e-06, + "loss": 0.3905, + "step": 2485 + }, + { + "epoch": 1.8306332842415318, + "grad_norm": 0.45764559507369995, + "learning_rate": 3.957927001509197e-06, + "loss": 0.399, + "step": 2486 + }, + { + "epoch": 1.8313696612665686, + "grad_norm": 0.4235036373138428, + "learning_rate": 3.953736706714331e-06, + "loss": 0.396, + "step": 2487 + }, + { + "epoch": 1.8321060382916055, + "grad_norm": 0.46383094787597656, + "learning_rate": 3.94954718026361e-06, + "loss": 0.4079, + "step": 2488 + }, + { + "epoch": 1.8328424153166423, + "grad_norm": 0.38650867342948914, + "learning_rate": 3.945358425233697e-06, + "loss": 0.4062, + "step": 2489 + }, + { + "epoch": 1.833578792341679, + "grad_norm": 0.40243032574653625, + "learning_rate": 3.941170444700688e-06, + "loss": 0.3963, + "step": 2490 + }, + { + "epoch": 1.834315169366716, + "grad_norm": 0.4477875530719757, + "learning_rate": 3.9369832417401055e-06, + "loss": 0.3711, + "step": 2491 + }, + { + "epoch": 1.8350515463917527, + "grad_norm": 0.4357861578464508, + "learning_rate": 3.9327968194269074e-06, + "loss": 0.4004, + "step": 2492 + }, + { + "epoch": 1.8357879234167895, + "grad_norm": 0.34190165996551514, + "learning_rate": 3.928611180835476e-06, + "loss": 0.4045, + "step": 2493 + }, + { + "epoch": 1.8365243004418264, + "grad_norm": 0.40084031224250793, + "learning_rate": 3.924426329039616e-06, + "loss": 0.3943, + "step": 2494 + }, + { + "epoch": 1.8372606774668632, + "grad_norm": 0.43317151069641113, + "learning_rate": 3.920242267112557e-06, + "loss": 0.4437, + "step": 2495 + }, + { + "epoch": 1.8379970544919, + "grad_norm": 0.42097267508506775, + "learning_rate": 3.916058998126949e-06, + "loss": 0.4135, + "step": 2496 + }, + { + "epoch": 1.8387334315169368, + "grad_norm": 0.43369096517562866, + "learning_rate": 3.911876525154857e-06, + "loss": 0.4216, + "step": 2497 + }, + { + "epoch": 1.8394698085419736, + "grad_norm": 0.4318563640117645, + "learning_rate": 3.907694851267764e-06, + "loss": 0.396, + "step": 2498 + }, + { + "epoch": 1.8402061855670104, + "grad_norm": 0.39135801792144775, + "learning_rate": 3.903513979536563e-06, + "loss": 0.4167, + "step": 2499 + }, + { + "epoch": 1.8409425625920472, + "grad_norm": 0.4389367997646332, + "learning_rate": 3.899333913031561e-06, + "loss": 0.412, + "step": 2500 + }, + { + "epoch": 1.841678939617084, + "grad_norm": 0.4623863995075226, + "learning_rate": 3.895154654822471e-06, + "loss": 0.3903, + "step": 2501 + }, + { + "epoch": 1.8424153166421209, + "grad_norm": 0.40219375491142273, + "learning_rate": 3.890976207978416e-06, + "loss": 0.4075, + "step": 2502 + }, + { + "epoch": 1.8431516936671577, + "grad_norm": 0.36119547486305237, + "learning_rate": 3.8867985755679206e-06, + "loss": 0.4064, + "step": 2503 + }, + { + "epoch": 1.8438880706921945, + "grad_norm": 0.4446427524089813, + "learning_rate": 3.882621760658911e-06, + "loss": 0.4002, + "step": 2504 + }, + { + "epoch": 1.8446244477172313, + "grad_norm": 0.38301101326942444, + "learning_rate": 3.878445766318714e-06, + "loss": 0.3946, + "step": 2505 + }, + { + "epoch": 1.8453608247422681, + "grad_norm": 0.39597806334495544, + "learning_rate": 3.874270595614057e-06, + "loss": 0.3955, + "step": 2506 + }, + { + "epoch": 1.846097201767305, + "grad_norm": 0.36354929208755493, + "learning_rate": 3.870096251611053e-06, + "loss": 0.415, + "step": 2507 + }, + { + "epoch": 1.8468335787923418, + "grad_norm": 0.4049472510814667, + "learning_rate": 3.865922737375219e-06, + "loss": 0.4134, + "step": 2508 + }, + { + "epoch": 1.8475699558173786, + "grad_norm": 0.42379382252693176, + "learning_rate": 3.861750055971455e-06, + "loss": 0.436, + "step": 2509 + }, + { + "epoch": 1.8483063328424154, + "grad_norm": 0.3723948001861572, + "learning_rate": 3.857578210464053e-06, + "loss": 0.3914, + "step": 2510 + }, + { + "epoch": 1.8490427098674522, + "grad_norm": 0.4040578007698059, + "learning_rate": 3.8534072039166915e-06, + "loss": 0.3967, + "step": 2511 + }, + { + "epoch": 1.849779086892489, + "grad_norm": 0.3998672664165497, + "learning_rate": 3.849237039392429e-06, + "loss": 0.3973, + "step": 2512 + }, + { + "epoch": 1.8505154639175259, + "grad_norm": 0.3948405086994171, + "learning_rate": 3.845067719953711e-06, + "loss": 0.4459, + "step": 2513 + }, + { + "epoch": 1.8512518409425627, + "grad_norm": 0.38502955436706543, + "learning_rate": 3.840899248662358e-06, + "loss": 0.4169, + "step": 2514 + }, + { + "epoch": 1.8519882179675995, + "grad_norm": 0.34823498129844666, + "learning_rate": 3.836731628579573e-06, + "loss": 0.4141, + "step": 2515 + }, + { + "epoch": 1.8527245949926363, + "grad_norm": 0.4114006459712982, + "learning_rate": 3.832564862765924e-06, + "loss": 0.4317, + "step": 2516 + }, + { + "epoch": 1.8534609720176731, + "grad_norm": 0.34312447905540466, + "learning_rate": 3.828398954281361e-06, + "loss": 0.3961, + "step": 2517 + }, + { + "epoch": 1.85419734904271, + "grad_norm": 0.3569032549858093, + "learning_rate": 3.8242339061852035e-06, + "loss": 0.4025, + "step": 2518 + }, + { + "epoch": 1.8549337260677468, + "grad_norm": 0.37142735719680786, + "learning_rate": 3.8200697215361336e-06, + "loss": 0.3778, + "step": 2519 + }, + { + "epoch": 1.8556701030927836, + "grad_norm": 0.39146125316619873, + "learning_rate": 3.815906403392203e-06, + "loss": 0.4004, + "step": 2520 + }, + { + "epoch": 1.8564064801178204, + "grad_norm": 0.3406389057636261, + "learning_rate": 3.8117439548108293e-06, + "loss": 0.4247, + "step": 2521 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.3523464500904083, + "learning_rate": 3.8075823788487863e-06, + "loss": 0.4202, + "step": 2522 + }, + { + "epoch": 1.857879234167894, + "grad_norm": 0.38543546199798584, + "learning_rate": 3.803421678562213e-06, + "loss": 0.4254, + "step": 2523 + }, + { + "epoch": 1.8586156111929308, + "grad_norm": 0.3914850056171417, + "learning_rate": 3.799261857006597e-06, + "loss": 0.402, + "step": 2524 + }, + { + "epoch": 1.8593519882179677, + "grad_norm": 0.3607421815395355, + "learning_rate": 3.7951029172367883e-06, + "loss": 0.396, + "step": 2525 + }, + { + "epoch": 1.8600883652430045, + "grad_norm": 0.3871685862541199, + "learning_rate": 3.790944862306988e-06, + "loss": 0.4304, + "step": 2526 + }, + { + "epoch": 1.8608247422680413, + "grad_norm": 0.4246087074279785, + "learning_rate": 3.786787695270743e-06, + "loss": 0.4147, + "step": 2527 + }, + { + "epoch": 1.861561119293078, + "grad_norm": 0.35566386580467224, + "learning_rate": 3.7826314191809522e-06, + "loss": 0.4074, + "step": 2528 + }, + { + "epoch": 1.862297496318115, + "grad_norm": 0.3580811321735382, + "learning_rate": 3.778476037089861e-06, + "loss": 0.4046, + "step": 2529 + }, + { + "epoch": 1.8630338733431517, + "grad_norm": 0.35247814655303955, + "learning_rate": 3.774321552049054e-06, + "loss": 0.3959, + "step": 2530 + }, + { + "epoch": 1.8637702503681886, + "grad_norm": 0.37611591815948486, + "learning_rate": 3.7701679671094602e-06, + "loss": 0.4283, + "step": 2531 + }, + { + "epoch": 1.8645066273932254, + "grad_norm": 0.3860566020011902, + "learning_rate": 3.7660152853213494e-06, + "loss": 0.4123, + "step": 2532 + }, + { + "epoch": 1.8652430044182622, + "grad_norm": 0.37459200620651245, + "learning_rate": 3.7618635097343225e-06, + "loss": 0.4377, + "step": 2533 + }, + { + "epoch": 1.865979381443299, + "grad_norm": 0.37669306993484497, + "learning_rate": 3.7577126433973176e-06, + "loss": 0.4092, + "step": 2534 + }, + { + "epoch": 1.8667157584683358, + "grad_norm": 0.3490535020828247, + "learning_rate": 3.7535626893586062e-06, + "loss": 0.4069, + "step": 2535 + }, + { + "epoch": 1.8674521354933726, + "grad_norm": 0.3674035370349884, + "learning_rate": 3.749413650665792e-06, + "loss": 0.4336, + "step": 2536 + }, + { + "epoch": 1.8681885125184094, + "grad_norm": 0.3696489930152893, + "learning_rate": 3.7452655303657993e-06, + "loss": 0.4025, + "step": 2537 + }, + { + "epoch": 1.8689248895434463, + "grad_norm": 0.3966639041900635, + "learning_rate": 3.7411183315048847e-06, + "loss": 0.4173, + "step": 2538 + }, + { + "epoch": 1.869661266568483, + "grad_norm": 0.3363037705421448, + "learning_rate": 3.736972057128626e-06, + "loss": 0.4027, + "step": 2539 + }, + { + "epoch": 1.87039764359352, + "grad_norm": 0.35173100233078003, + "learning_rate": 3.732826710281923e-06, + "loss": 0.3908, + "step": 2540 + }, + { + "epoch": 1.8711340206185567, + "grad_norm": 0.3510563373565674, + "learning_rate": 3.728682294008988e-06, + "loss": 0.3961, + "step": 2541 + }, + { + "epoch": 1.8718703976435935, + "grad_norm": 0.3618358373641968, + "learning_rate": 3.7245388113533596e-06, + "loss": 0.3936, + "step": 2542 + }, + { + "epoch": 1.8726067746686303, + "grad_norm": 0.33672353625297546, + "learning_rate": 3.7203962653578853e-06, + "loss": 0.4081, + "step": 2543 + }, + { + "epoch": 1.8733431516936672, + "grad_norm": 0.36822593212127686, + "learning_rate": 3.7162546590647254e-06, + "loss": 0.4108, + "step": 2544 + }, + { + "epoch": 1.874079528718704, + "grad_norm": 0.3691384196281433, + "learning_rate": 3.7121139955153497e-06, + "loss": 0.379, + "step": 2545 + }, + { + "epoch": 1.8748159057437408, + "grad_norm": 0.4163464307785034, + "learning_rate": 3.7079742777505373e-06, + "loss": 0.4063, + "step": 2546 + }, + { + "epoch": 1.8755522827687776, + "grad_norm": 0.3840535879135132, + "learning_rate": 3.7038355088103726e-06, + "loss": 0.403, + "step": 2547 + }, + { + "epoch": 1.8762886597938144, + "grad_norm": 0.3876000642776489, + "learning_rate": 3.699697691734243e-06, + "loss": 0.4056, + "step": 2548 + }, + { + "epoch": 1.8770250368188512, + "grad_norm": 0.3980104625225067, + "learning_rate": 3.695560829560832e-06, + "loss": 0.3986, + "step": 2549 + }, + { + "epoch": 1.877761413843888, + "grad_norm": 0.3940967917442322, + "learning_rate": 3.691424925328129e-06, + "loss": 0.3895, + "step": 2550 + }, + { + "epoch": 1.8784977908689249, + "grad_norm": 0.37271252274513245, + "learning_rate": 3.687289982073419e-06, + "loss": 0.4393, + "step": 2551 + }, + { + "epoch": 1.8792341678939617, + "grad_norm": 0.37549808621406555, + "learning_rate": 3.683156002833276e-06, + "loss": 0.4248, + "step": 2552 + }, + { + "epoch": 1.8799705449189985, + "grad_norm": 0.3900524079799652, + "learning_rate": 3.6790229906435706e-06, + "loss": 0.4255, + "step": 2553 + }, + { + "epoch": 1.8807069219440353, + "grad_norm": 0.36957311630249023, + "learning_rate": 3.674890948539463e-06, + "loss": 0.404, + "step": 2554 + }, + { + "epoch": 1.8814432989690721, + "grad_norm": 0.37756621837615967, + "learning_rate": 3.670759879555399e-06, + "loss": 0.4144, + "step": 2555 + }, + { + "epoch": 1.882179675994109, + "grad_norm": 0.4113319516181946, + "learning_rate": 3.666629786725111e-06, + "loss": 0.3993, + "step": 2556 + }, + { + "epoch": 1.8829160530191458, + "grad_norm": 0.4135149419307709, + "learning_rate": 3.6625006730816157e-06, + "loss": 0.4202, + "step": 2557 + }, + { + "epoch": 1.8836524300441826, + "grad_norm": 0.3912547826766968, + "learning_rate": 3.6583725416572093e-06, + "loss": 0.4029, + "step": 2558 + }, + { + "epoch": 1.8843888070692194, + "grad_norm": 0.36167773604393005, + "learning_rate": 3.6542453954834632e-06, + "loss": 0.4039, + "step": 2559 + }, + { + "epoch": 1.8851251840942562, + "grad_norm": 0.3650279641151428, + "learning_rate": 3.650119237591232e-06, + "loss": 0.4051, + "step": 2560 + }, + { + "epoch": 1.885861561119293, + "grad_norm": 0.358697772026062, + "learning_rate": 3.6459940710106414e-06, + "loss": 0.3967, + "step": 2561 + }, + { + "epoch": 1.8865979381443299, + "grad_norm": 0.41603848338127136, + "learning_rate": 3.6418698987710872e-06, + "loss": 0.3998, + "step": 2562 + }, + { + "epoch": 1.8873343151693667, + "grad_norm": 0.3741896450519562, + "learning_rate": 3.637746723901238e-06, + "loss": 0.4212, + "step": 2563 + }, + { + "epoch": 1.8880706921944035, + "grad_norm": 0.39252546429634094, + "learning_rate": 3.6336245494290305e-06, + "loss": 0.387, + "step": 2564 + }, + { + "epoch": 1.8888070692194403, + "grad_norm": 0.35629981756210327, + "learning_rate": 3.6295033783816636e-06, + "loss": 0.3794, + "step": 2565 + }, + { + "epoch": 1.8895434462444771, + "grad_norm": 0.3638868033885956, + "learning_rate": 3.6253832137856e-06, + "loss": 0.4164, + "step": 2566 + }, + { + "epoch": 1.890279823269514, + "grad_norm": 0.380978524684906, + "learning_rate": 3.621264058666564e-06, + "loss": 0.3887, + "step": 2567 + }, + { + "epoch": 1.8910162002945508, + "grad_norm": 0.3995063304901123, + "learning_rate": 3.6171459160495393e-06, + "loss": 0.408, + "step": 2568 + }, + { + "epoch": 1.8917525773195876, + "grad_norm": 0.4066588878631592, + "learning_rate": 3.6130287889587665e-06, + "loss": 0.4023, + "step": 2569 + }, + { + "epoch": 1.8924889543446244, + "grad_norm": 0.4209347069263458, + "learning_rate": 3.6089126804177373e-06, + "loss": 0.3898, + "step": 2570 + }, + { + "epoch": 1.8932253313696612, + "grad_norm": 0.41782402992248535, + "learning_rate": 3.6047975934491983e-06, + "loss": 0.4186, + "step": 2571 + }, + { + "epoch": 1.893961708394698, + "grad_norm": 0.41970953345298767, + "learning_rate": 3.6006835310751464e-06, + "loss": 0.4078, + "step": 2572 + }, + { + "epoch": 1.8946980854197348, + "grad_norm": 0.3769712746143341, + "learning_rate": 3.596570496316822e-06, + "loss": 0.4206, + "step": 2573 + }, + { + "epoch": 1.8954344624447717, + "grad_norm": 0.38179120421409607, + "learning_rate": 3.592458492194717e-06, + "loss": 0.3897, + "step": 2574 + }, + { + "epoch": 1.8961708394698085, + "grad_norm": 0.42509886622428894, + "learning_rate": 3.5883475217285592e-06, + "loss": 0.4142, + "step": 2575 + }, + { + "epoch": 1.8969072164948453, + "grad_norm": 0.38956883549690247, + "learning_rate": 3.5842375879373237e-06, + "loss": 0.3984, + "step": 2576 + }, + { + "epoch": 1.897643593519882, + "grad_norm": 0.34472212195396423, + "learning_rate": 3.5801286938392195e-06, + "loss": 0.4137, + "step": 2577 + }, + { + "epoch": 1.898379970544919, + "grad_norm": 0.39913445711135864, + "learning_rate": 3.5760208424516957e-06, + "loss": 0.3989, + "step": 2578 + }, + { + "epoch": 1.8991163475699557, + "grad_norm": 0.38616591691970825, + "learning_rate": 3.571914036791435e-06, + "loss": 0.384, + "step": 2579 + }, + { + "epoch": 1.8998527245949925, + "grad_norm": 0.35932883620262146, + "learning_rate": 3.5678082798743498e-06, + "loss": 0.3966, + "step": 2580 + }, + { + "epoch": 1.9005891016200294, + "grad_norm": 0.40238866209983826, + "learning_rate": 3.5637035747155835e-06, + "loss": 0.4105, + "step": 2581 + }, + { + "epoch": 1.9013254786450662, + "grad_norm": 0.39923685789108276, + "learning_rate": 3.5595999243295114e-06, + "loss": 0.4016, + "step": 2582 + }, + { + "epoch": 1.902061855670103, + "grad_norm": 0.428325891494751, + "learning_rate": 3.5554973317297255e-06, + "loss": 0.3744, + "step": 2583 + }, + { + "epoch": 1.9027982326951398, + "grad_norm": 0.3603360652923584, + "learning_rate": 3.5513957999290483e-06, + "loss": 0.3955, + "step": 2584 + }, + { + "epoch": 1.9035346097201766, + "grad_norm": 0.42727532982826233, + "learning_rate": 3.5472953319395196e-06, + "loss": 0.4234, + "step": 2585 + }, + { + "epoch": 1.9042709867452134, + "grad_norm": 0.39020195603370667, + "learning_rate": 3.5431959307724e-06, + "loss": 0.4095, + "step": 2586 + }, + { + "epoch": 1.9050073637702503, + "grad_norm": 0.3618599474430084, + "learning_rate": 3.539097599438167e-06, + "loss": 0.3776, + "step": 2587 + }, + { + "epoch": 1.905743740795287, + "grad_norm": 0.37120410799980164, + "learning_rate": 3.5350003409465085e-06, + "loss": 0.4031, + "step": 2588 + }, + { + "epoch": 1.906480117820324, + "grad_norm": 0.4274289309978485, + "learning_rate": 3.530904158306329e-06, + "loss": 0.3971, + "step": 2589 + }, + { + "epoch": 1.9072164948453607, + "grad_norm": 0.36874106526374817, + "learning_rate": 3.526809054525744e-06, + "loss": 0.3911, + "step": 2590 + }, + { + "epoch": 1.9079528718703975, + "grad_norm": 0.3638487756252289, + "learning_rate": 3.522715032612069e-06, + "loss": 0.405, + "step": 2591 + }, + { + "epoch": 1.9086892488954343, + "grad_norm": 0.3634064495563507, + "learning_rate": 3.518622095571831e-06, + "loss": 0.4145, + "step": 2592 + }, + { + "epoch": 1.9094256259204712, + "grad_norm": 0.3739914894104004, + "learning_rate": 3.5145302464107612e-06, + "loss": 0.3862, + "step": 2593 + }, + { + "epoch": 1.910162002945508, + "grad_norm": 0.39826780557632446, + "learning_rate": 3.510439488133789e-06, + "loss": 0.4123, + "step": 2594 + }, + { + "epoch": 1.9108983799705448, + "grad_norm": 0.37097451090812683, + "learning_rate": 3.506349823745043e-06, + "loss": 0.3913, + "step": 2595 + }, + { + "epoch": 1.9116347569955816, + "grad_norm": 0.3629193603992462, + "learning_rate": 3.5022612562478507e-06, + "loss": 0.4055, + "step": 2596 + }, + { + "epoch": 1.9123711340206184, + "grad_norm": 0.368109792470932, + "learning_rate": 3.498173788644732e-06, + "loss": 0.3793, + "step": 2597 + }, + { + "epoch": 1.9131075110456552, + "grad_norm": 0.418338418006897, + "learning_rate": 3.494087423937399e-06, + "loss": 0.3781, + "step": 2598 + }, + { + "epoch": 1.913843888070692, + "grad_norm": 0.35174235701560974, + "learning_rate": 3.4900021651267557e-06, + "loss": 0.405, + "step": 2599 + }, + { + "epoch": 1.9145802650957289, + "grad_norm": 0.3879396915435791, + "learning_rate": 3.485918015212891e-06, + "loss": 0.383, + "step": 2600 + }, + { + "epoch": 1.9153166421207657, + "grad_norm": 0.34615907073020935, + "learning_rate": 3.481834977195081e-06, + "loss": 0.3978, + "step": 2601 + }, + { + "epoch": 1.9160530191458025, + "grad_norm": 0.36179783940315247, + "learning_rate": 3.4777530540717875e-06, + "loss": 0.4361, + "step": 2602 + }, + { + "epoch": 1.9167893961708393, + "grad_norm": 0.37738144397735596, + "learning_rate": 3.4736722488406493e-06, + "loss": 0.409, + "step": 2603 + }, + { + "epoch": 1.9175257731958761, + "grad_norm": 0.37794360518455505, + "learning_rate": 3.4695925644984885e-06, + "loss": 0.4211, + "step": 2604 + }, + { + "epoch": 1.918262150220913, + "grad_norm": 0.40854412317276, + "learning_rate": 3.465514004041301e-06, + "loss": 0.4055, + "step": 2605 + }, + { + "epoch": 1.9189985272459498, + "grad_norm": 0.3971635103225708, + "learning_rate": 3.461436570464258e-06, + "loss": 0.3971, + "step": 2606 + }, + { + "epoch": 1.9197349042709866, + "grad_norm": 0.3623662292957306, + "learning_rate": 3.4573602667617056e-06, + "loss": 0.4185, + "step": 2607 + }, + { + "epoch": 1.9204712812960234, + "grad_norm": 0.4009900391101837, + "learning_rate": 3.453285095927154e-06, + "loss": 0.4067, + "step": 2608 + }, + { + "epoch": 1.9212076583210602, + "grad_norm": 0.34463950991630554, + "learning_rate": 3.4492110609532892e-06, + "loss": 0.4056, + "step": 2609 + }, + { + "epoch": 1.9219440353460973, + "grad_norm": 0.3612362742424011, + "learning_rate": 3.4451381648319573e-06, + "loss": 0.3852, + "step": 2610 + }, + { + "epoch": 1.922680412371134, + "grad_norm": 0.3724992275238037, + "learning_rate": 3.4410664105541703e-06, + "loss": 0.4193, + "step": 2611 + }, + { + "epoch": 1.923416789396171, + "grad_norm": 0.3696814775466919, + "learning_rate": 3.4369958011101035e-06, + "loss": 0.4328, + "step": 2612 + }, + { + "epoch": 1.9241531664212077, + "grad_norm": 0.3654807507991791, + "learning_rate": 3.4329263394890867e-06, + "loss": 0.3699, + "step": 2613 + }, + { + "epoch": 1.9248895434462445, + "grad_norm": 0.3380682170391083, + "learning_rate": 3.4288580286796106e-06, + "loss": 0.3954, + "step": 2614 + }, + { + "epoch": 1.9256259204712813, + "grad_norm": 0.3763212263584137, + "learning_rate": 3.424790871669321e-06, + "loss": 0.395, + "step": 2615 + }, + { + "epoch": 1.9263622974963182, + "grad_norm": 0.37345150113105774, + "learning_rate": 3.4207248714450157e-06, + "loss": 0.4032, + "step": 2616 + }, + { + "epoch": 1.927098674521355, + "grad_norm": 0.3797363340854645, + "learning_rate": 3.416660030992639e-06, + "loss": 0.4183, + "step": 2617 + }, + { + "epoch": 1.9278350515463918, + "grad_norm": 0.39499834179878235, + "learning_rate": 3.4125963532972878e-06, + "loss": 0.4318, + "step": 2618 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 0.41415366530418396, + "learning_rate": 3.4085338413432066e-06, + "loss": 0.3798, + "step": 2619 + }, + { + "epoch": 1.9293078055964654, + "grad_norm": 0.38226163387298584, + "learning_rate": 3.4044724981137787e-06, + "loss": 0.4323, + "step": 2620 + }, + { + "epoch": 1.9300441826215022, + "grad_norm": 0.41554006934165955, + "learning_rate": 3.4004123265915328e-06, + "loss": 0.3985, + "step": 2621 + }, + { + "epoch": 1.930780559646539, + "grad_norm": 0.40494853258132935, + "learning_rate": 3.3963533297581375e-06, + "loss": 0.3894, + "step": 2622 + }, + { + "epoch": 1.9315169366715759, + "grad_norm": 0.42371952533721924, + "learning_rate": 3.3922955105943953e-06, + "loss": 0.4125, + "step": 2623 + }, + { + "epoch": 1.9322533136966127, + "grad_norm": 0.4033835530281067, + "learning_rate": 3.3882388720802496e-06, + "loss": 0.4221, + "step": 2624 + }, + { + "epoch": 1.9329896907216495, + "grad_norm": 0.42939725518226624, + "learning_rate": 3.384183417194767e-06, + "loss": 0.4095, + "step": 2625 + }, + { + "epoch": 1.9337260677466863, + "grad_norm": 0.41357123851776123, + "learning_rate": 3.380129148916156e-06, + "loss": 0.3923, + "step": 2626 + }, + { + "epoch": 1.9344624447717231, + "grad_norm": 0.41408517956733704, + "learning_rate": 3.3760760702217477e-06, + "loss": 0.3849, + "step": 2627 + }, + { + "epoch": 1.93519882179676, + "grad_norm": 0.3655821681022644, + "learning_rate": 3.3720241840879992e-06, + "loss": 0.4033, + "step": 2628 + }, + { + "epoch": 1.9359351988217968, + "grad_norm": 0.40222102403640747, + "learning_rate": 3.367973493490494e-06, + "loss": 0.4061, + "step": 2629 + }, + { + "epoch": 1.9366715758468336, + "grad_norm": 0.37809598445892334, + "learning_rate": 3.363924001403939e-06, + "loss": 0.3973, + "step": 2630 + }, + { + "epoch": 1.9374079528718704, + "grad_norm": 0.3718864321708679, + "learning_rate": 3.3598757108021546e-06, + "loss": 0.4259, + "step": 2631 + }, + { + "epoch": 1.9381443298969072, + "grad_norm": 0.409039169549942, + "learning_rate": 3.355828624658087e-06, + "loss": 0.4057, + "step": 2632 + }, + { + "epoch": 1.938880706921944, + "grad_norm": 0.34290969371795654, + "learning_rate": 3.351782745943792e-06, + "loss": 0.3939, + "step": 2633 + }, + { + "epoch": 1.9396170839469808, + "grad_norm": 0.378534734249115, + "learning_rate": 3.3477380776304412e-06, + "loss": 0.3941, + "step": 2634 + }, + { + "epoch": 1.9403534609720177, + "grad_norm": 0.3930615782737732, + "learning_rate": 3.343694622688315e-06, + "loss": 0.4266, + "step": 2635 + }, + { + "epoch": 1.9410898379970545, + "grad_norm": 0.3948494791984558, + "learning_rate": 3.3396523840868065e-06, + "loss": 0.4136, + "step": 2636 + }, + { + "epoch": 1.9418262150220913, + "grad_norm": 0.3820838928222656, + "learning_rate": 3.3356113647944144e-06, + "loss": 0.403, + "step": 2637 + }, + { + "epoch": 1.9425625920471281, + "grad_norm": 0.3924919664859772, + "learning_rate": 3.3315715677787387e-06, + "loss": 0.4077, + "step": 2638 + }, + { + "epoch": 1.943298969072165, + "grad_norm": 0.3638979196548462, + "learning_rate": 3.3275329960064855e-06, + "loss": 0.4086, + "step": 2639 + }, + { + "epoch": 1.9440353460972017, + "grad_norm": 0.3670336902141571, + "learning_rate": 3.3234956524434615e-06, + "loss": 0.4018, + "step": 2640 + }, + { + "epoch": 1.9447717231222386, + "grad_norm": 0.38897061347961426, + "learning_rate": 3.319459540054567e-06, + "loss": 0.4143, + "step": 2641 + }, + { + "epoch": 1.9455081001472754, + "grad_norm": 0.403804749250412, + "learning_rate": 3.315424661803802e-06, + "loss": 0.3792, + "step": 2642 + }, + { + "epoch": 1.9462444771723122, + "grad_norm": 0.3691318929195404, + "learning_rate": 3.3113910206542595e-06, + "loss": 0.4074, + "step": 2643 + }, + { + "epoch": 1.946980854197349, + "grad_norm": 0.41701748967170715, + "learning_rate": 3.307358619568123e-06, + "loss": 0.4037, + "step": 2644 + }, + { + "epoch": 1.9477172312223858, + "grad_norm": 0.3514380156993866, + "learning_rate": 3.303327461506667e-06, + "loss": 0.4278, + "step": 2645 + }, + { + "epoch": 1.9484536082474226, + "grad_norm": 0.3677442669868469, + "learning_rate": 3.29929754943025e-06, + "loss": 0.4029, + "step": 2646 + }, + { + "epoch": 1.9491899852724595, + "grad_norm": 0.38052093982696533, + "learning_rate": 3.295268886298321e-06, + "loss": 0.3938, + "step": 2647 + }, + { + "epoch": 1.9499263622974963, + "grad_norm": 0.4505428671836853, + "learning_rate": 3.2912414750694064e-06, + "loss": 0.3936, + "step": 2648 + }, + { + "epoch": 1.950662739322533, + "grad_norm": 0.35540086030960083, + "learning_rate": 3.2872153187011175e-06, + "loss": 0.4038, + "step": 2649 + }, + { + "epoch": 1.95139911634757, + "grad_norm": 0.38356146216392517, + "learning_rate": 3.2831904201501376e-06, + "loss": 0.4276, + "step": 2650 + }, + { + "epoch": 1.9521354933726067, + "grad_norm": 0.38261398673057556, + "learning_rate": 3.2791667823722327e-06, + "loss": 0.3967, + "step": 2651 + }, + { + "epoch": 1.9528718703976435, + "grad_norm": 0.3730379343032837, + "learning_rate": 3.2751444083222418e-06, + "loss": 0.3942, + "step": 2652 + }, + { + "epoch": 1.9536082474226806, + "grad_norm": 0.41036689281463623, + "learning_rate": 3.271123300954074e-06, + "loss": 0.4031, + "step": 2653 + }, + { + "epoch": 1.9543446244477174, + "grad_norm": 0.37355658411979675, + "learning_rate": 3.2671034632207084e-06, + "loss": 0.4094, + "step": 2654 + }, + { + "epoch": 1.9550810014727542, + "grad_norm": 0.3889113962650299, + "learning_rate": 3.263084898074194e-06, + "loss": 0.4123, + "step": 2655 + }, + { + "epoch": 1.955817378497791, + "grad_norm": 0.3863618075847626, + "learning_rate": 3.2590676084656425e-06, + "loss": 0.3765, + "step": 2656 + }, + { + "epoch": 1.9565537555228278, + "grad_norm": 0.3829159438610077, + "learning_rate": 3.2550515973452295e-06, + "loss": 0.3963, + "step": 2657 + }, + { + "epoch": 1.9572901325478647, + "grad_norm": 0.3452022671699524, + "learning_rate": 3.251036867662195e-06, + "loss": 0.4252, + "step": 2658 + }, + { + "epoch": 1.9580265095729015, + "grad_norm": 0.3539165258407593, + "learning_rate": 3.247023422364831e-06, + "loss": 0.3867, + "step": 2659 + }, + { + "epoch": 1.9587628865979383, + "grad_norm": 0.3868674039840698, + "learning_rate": 3.243011264400494e-06, + "loss": 0.421, + "step": 2660 + }, + { + "epoch": 1.959499263622975, + "grad_norm": 0.39568665623664856, + "learning_rate": 3.2390003967155887e-06, + "loss": 0.4064, + "step": 2661 + }, + { + "epoch": 1.960235640648012, + "grad_norm": 0.3922138214111328, + "learning_rate": 3.2349908222555764e-06, + "loss": 0.3997, + "step": 2662 + }, + { + "epoch": 1.9609720176730487, + "grad_norm": 0.3632813096046448, + "learning_rate": 3.230982543964969e-06, + "loss": 0.3785, + "step": 2663 + }, + { + "epoch": 1.9617083946980856, + "grad_norm": 0.38700589537620544, + "learning_rate": 3.226975564787322e-06, + "loss": 0.4231, + "step": 2664 + }, + { + "epoch": 1.9624447717231224, + "grad_norm": 0.37696290016174316, + "learning_rate": 3.2229698876652415e-06, + "loss": 0.3819, + "step": 2665 + }, + { + "epoch": 1.9631811487481592, + "grad_norm": 0.3456031084060669, + "learning_rate": 3.218965515540377e-06, + "loss": 0.4044, + "step": 2666 + }, + { + "epoch": 1.963917525773196, + "grad_norm": 0.39847907423973083, + "learning_rate": 3.214962451353416e-06, + "loss": 0.4108, + "step": 2667 + }, + { + "epoch": 1.9646539027982328, + "grad_norm": 0.3617440462112427, + "learning_rate": 3.2109606980440887e-06, + "loss": 0.4052, + "step": 2668 + }, + { + "epoch": 1.9653902798232696, + "grad_norm": 0.37616562843322754, + "learning_rate": 3.2069602585511605e-06, + "loss": 0.4069, + "step": 2669 + }, + { + "epoch": 1.9661266568483065, + "grad_norm": 0.39469170570373535, + "learning_rate": 3.202961135812437e-06, + "loss": 0.3785, + "step": 2670 + }, + { + "epoch": 1.9668630338733433, + "grad_norm": 0.4196336269378662, + "learning_rate": 3.1989633327647485e-06, + "loss": 0.4297, + "step": 2671 + }, + { + "epoch": 1.96759941089838, + "grad_norm": 0.380405068397522, + "learning_rate": 3.1949668523439635e-06, + "loss": 0.3883, + "step": 2672 + }, + { + "epoch": 1.968335787923417, + "grad_norm": 0.37013182044029236, + "learning_rate": 3.190971697484977e-06, + "loss": 0.3756, + "step": 2673 + }, + { + "epoch": 1.9690721649484537, + "grad_norm": 0.34765076637268066, + "learning_rate": 3.186977871121708e-06, + "loss": 0.3944, + "step": 2674 + }, + { + "epoch": 1.9698085419734905, + "grad_norm": 0.3879580497741699, + "learning_rate": 3.182985376187105e-06, + "loss": 0.4227, + "step": 2675 + }, + { + "epoch": 1.9705449189985274, + "grad_norm": 0.3524974584579468, + "learning_rate": 3.178994215613131e-06, + "loss": 0.3805, + "step": 2676 + }, + { + "epoch": 1.9712812960235642, + "grad_norm": 0.3391323983669281, + "learning_rate": 3.1750043923307773e-06, + "loss": 0.4046, + "step": 2677 + }, + { + "epoch": 1.972017673048601, + "grad_norm": 0.3604549169540405, + "learning_rate": 3.1710159092700475e-06, + "loss": 0.3858, + "step": 2678 + }, + { + "epoch": 1.9727540500736378, + "grad_norm": 0.36499884724617004, + "learning_rate": 3.167028769359964e-06, + "loss": 0.4425, + "step": 2679 + }, + { + "epoch": 1.9734904270986746, + "grad_norm": 0.31244251132011414, + "learning_rate": 3.1630429755285623e-06, + "loss": 0.4021, + "step": 2680 + }, + { + "epoch": 1.9742268041237114, + "grad_norm": 0.3878641128540039, + "learning_rate": 3.1590585307028884e-06, + "loss": 0.4057, + "step": 2681 + }, + { + "epoch": 1.9749631811487482, + "grad_norm": 0.3576429784297943, + "learning_rate": 3.1550754378089976e-06, + "loss": 0.3675, + "step": 2682 + }, + { + "epoch": 1.975699558173785, + "grad_norm": 0.40466341376304626, + "learning_rate": 3.1510936997719557e-06, + "loss": 0.3959, + "step": 2683 + }, + { + "epoch": 1.9764359351988219, + "grad_norm": 0.3640960156917572, + "learning_rate": 3.1471133195158266e-06, + "loss": 0.3905, + "step": 2684 + }, + { + "epoch": 1.9771723122238587, + "grad_norm": 0.3552342653274536, + "learning_rate": 3.143134299963684e-06, + "loss": 0.4215, + "step": 2685 + }, + { + "epoch": 1.9779086892488955, + "grad_norm": 0.3452814817428589, + "learning_rate": 3.1391566440375987e-06, + "loss": 0.3995, + "step": 2686 + }, + { + "epoch": 1.9786450662739323, + "grad_norm": 0.3464387357234955, + "learning_rate": 3.1351803546586407e-06, + "loss": 0.4106, + "step": 2687 + }, + { + "epoch": 1.9793814432989691, + "grad_norm": 0.3663497567176819, + "learning_rate": 3.131205434746879e-06, + "loss": 0.3728, + "step": 2688 + }, + { + "epoch": 1.980117820324006, + "grad_norm": 0.3594132661819458, + "learning_rate": 3.1272318872213713e-06, + "loss": 0.4075, + "step": 2689 + }, + { + "epoch": 1.9808541973490428, + "grad_norm": 0.35411345958709717, + "learning_rate": 3.123259715000173e-06, + "loss": 0.3754, + "step": 2690 + }, + { + "epoch": 1.9815905743740796, + "grad_norm": 0.33865928649902344, + "learning_rate": 3.1192889210003285e-06, + "loss": 0.3723, + "step": 2691 + }, + { + "epoch": 1.9823269513991164, + "grad_norm": 0.3422465920448303, + "learning_rate": 3.115319508137866e-06, + "loss": 0.387, + "step": 2692 + }, + { + "epoch": 1.9830633284241532, + "grad_norm": 0.3784734010696411, + "learning_rate": 3.1113514793278037e-06, + "loss": 0.397, + "step": 2693 + }, + { + "epoch": 1.98379970544919, + "grad_norm": 0.41923490166664124, + "learning_rate": 3.1073848374841416e-06, + "loss": 0.4199, + "step": 2694 + }, + { + "epoch": 1.9845360824742269, + "grad_norm": 0.37350550293922424, + "learning_rate": 3.1034195855198622e-06, + "loss": 0.4023, + "step": 2695 + }, + { + "epoch": 1.9852724594992637, + "grad_norm": 0.3785882294178009, + "learning_rate": 3.0994557263469267e-06, + "loss": 0.4117, + "step": 2696 + }, + { + "epoch": 1.9860088365243005, + "grad_norm": 0.36809927225112915, + "learning_rate": 3.0954932628762723e-06, + "loss": 0.3928, + "step": 2697 + }, + { + "epoch": 1.9867452135493373, + "grad_norm": 0.39854827523231506, + "learning_rate": 3.0915321980178153e-06, + "loss": 0.3881, + "step": 2698 + }, + { + "epoch": 1.9874815905743741, + "grad_norm": 0.37375113368034363, + "learning_rate": 3.0875725346804385e-06, + "loss": 0.4052, + "step": 2699 + }, + { + "epoch": 1.988217967599411, + "grad_norm": 0.4068243205547333, + "learning_rate": 3.0836142757720034e-06, + "loss": 0.3912, + "step": 2700 + }, + { + "epoch": 1.9889543446244478, + "grad_norm": 0.3465851843357086, + "learning_rate": 3.0796574241993306e-06, + "loss": 0.4136, + "step": 2701 + }, + { + "epoch": 1.9896907216494846, + "grad_norm": 0.39400115609169006, + "learning_rate": 3.0757019828682145e-06, + "loss": 0.3915, + "step": 2702 + }, + { + "epoch": 1.9904270986745214, + "grad_norm": 0.33876892924308777, + "learning_rate": 3.0717479546834136e-06, + "loss": 0.3967, + "step": 2703 + }, + { + "epoch": 1.9911634756995582, + "grad_norm": 0.3738095462322235, + "learning_rate": 3.0677953425486435e-06, + "loss": 0.3866, + "step": 2704 + }, + { + "epoch": 1.991899852724595, + "grad_norm": 0.350265771150589, + "learning_rate": 3.063844149366585e-06, + "loss": 0.3789, + "step": 2705 + }, + { + "epoch": 1.9926362297496318, + "grad_norm": 0.3564341068267822, + "learning_rate": 3.0598943780388744e-06, + "loss": 0.3948, + "step": 2706 + }, + { + "epoch": 1.9933726067746687, + "grad_norm": 0.3786764442920685, + "learning_rate": 3.055946031466105e-06, + "loss": 0.4204, + "step": 2707 + }, + { + "epoch": 1.9941089837997055, + "grad_norm": 0.3561117649078369, + "learning_rate": 3.0519991125478244e-06, + "loss": 0.3828, + "step": 2708 + }, + { + "epoch": 1.9948453608247423, + "grad_norm": 0.40353891253471375, + "learning_rate": 3.0480536241825263e-06, + "loss": 0.3999, + "step": 2709 + }, + { + "epoch": 1.995581737849779, + "grad_norm": 0.3841717839241028, + "learning_rate": 3.0441095692676625e-06, + "loss": 0.4079, + "step": 2710 + }, + { + "epoch": 1.996318114874816, + "grad_norm": 0.4203062057495117, + "learning_rate": 3.040166950699626e-06, + "loss": 0.395, + "step": 2711 + }, + { + "epoch": 1.9970544918998527, + "grad_norm": 0.4298144280910492, + "learning_rate": 3.0362257713737552e-06, + "loss": 0.466, + "step": 2712 + }, + { + "epoch": 1.9977908689248896, + "grad_norm": 0.41384029388427734, + "learning_rate": 3.0322860341843365e-06, + "loss": 0.3908, + "step": 2713 + }, + { + "epoch": 1.9985272459499264, + "grad_norm": 0.35945290327072144, + "learning_rate": 3.028347742024591e-06, + "loss": 0.3889, + "step": 2714 + }, + { + "epoch": 1.9992636229749632, + "grad_norm": 0.3887770473957062, + "learning_rate": 3.024410897786682e-06, + "loss": 0.4116, + "step": 2715 + }, + { + "epoch": 2.0, + "grad_norm": 0.4592723250389099, + "learning_rate": 3.020475504361711e-06, + "loss": 0.4304, + "step": 2716 + }, + { + "epoch": 2.000736377025037, + "grad_norm": 0.4854089915752411, + "learning_rate": 3.01654156463971e-06, + "loss": 0.3914, + "step": 2717 + }, + { + "epoch": 2.0014727540500736, + "grad_norm": 0.40188172459602356, + "learning_rate": 3.0126090815096466e-06, + "loss": 0.3823, + "step": 2718 + }, + { + "epoch": 2.0022091310751104, + "grad_norm": 0.3685808777809143, + "learning_rate": 3.008678057859415e-06, + "loss": 0.3927, + "step": 2719 + }, + { + "epoch": 2.0029455081001473, + "grad_norm": 0.41367441415786743, + "learning_rate": 3.004748496575842e-06, + "loss": 0.3738, + "step": 2720 + }, + { + "epoch": 2.003681885125184, + "grad_norm": 0.3970171809196472, + "learning_rate": 3.0008204005446807e-06, + "loss": 0.3864, + "step": 2721 + }, + { + "epoch": 2.004418262150221, + "grad_norm": 0.38890549540519714, + "learning_rate": 2.996893772650602e-06, + "loss": 0.3532, + "step": 2722 + }, + { + "epoch": 2.0051546391752577, + "grad_norm": 0.39022096991539, + "learning_rate": 2.992968615777206e-06, + "loss": 0.3809, + "step": 2723 + }, + { + "epoch": 2.0058910162002945, + "grad_norm": 0.3781045973300934, + "learning_rate": 2.989044932807008e-06, + "loss": 0.3753, + "step": 2724 + }, + { + "epoch": 2.0066273932253313, + "grad_norm": 0.40674319863319397, + "learning_rate": 2.9851227266214444e-06, + "loss": 0.3748, + "step": 2725 + }, + { + "epoch": 2.007363770250368, + "grad_norm": 0.42673784494400024, + "learning_rate": 2.981202000100861e-06, + "loss": 0.371, + "step": 2726 + }, + { + "epoch": 2.008100147275405, + "grad_norm": 0.3645564615726471, + "learning_rate": 2.9772827561245223e-06, + "loss": 0.3759, + "step": 2727 + }, + { + "epoch": 2.008836524300442, + "grad_norm": 0.3563685715198517, + "learning_rate": 2.9733649975706035e-06, + "loss": 0.3681, + "step": 2728 + }, + { + "epoch": 2.0095729013254786, + "grad_norm": 0.35634204745292664, + "learning_rate": 2.969448727316188e-06, + "loss": 0.3784, + "step": 2729 + }, + { + "epoch": 2.0103092783505154, + "grad_norm": 0.3639596998691559, + "learning_rate": 2.9655339482372647e-06, + "loss": 0.3652, + "step": 2730 + }, + { + "epoch": 2.0110456553755522, + "grad_norm": 0.3867950141429901, + "learning_rate": 2.961620663208732e-06, + "loss": 0.3561, + "step": 2731 + }, + { + "epoch": 2.011782032400589, + "grad_norm": 0.3902873396873474, + "learning_rate": 2.957708875104386e-06, + "loss": 0.3777, + "step": 2732 + }, + { + "epoch": 2.012518409425626, + "grad_norm": 0.39293259382247925, + "learning_rate": 2.9537985867969277e-06, + "loss": 0.358, + "step": 2733 + }, + { + "epoch": 2.0132547864506627, + "grad_norm": 0.37629497051239014, + "learning_rate": 2.9498898011579514e-06, + "loss": 0.3884, + "step": 2734 + }, + { + "epoch": 2.0139911634756995, + "grad_norm": 0.3420172929763794, + "learning_rate": 2.9459825210579534e-06, + "loss": 0.3858, + "step": 2735 + }, + { + "epoch": 2.0147275405007363, + "grad_norm": 0.3435642123222351, + "learning_rate": 2.942076749366321e-06, + "loss": 0.3672, + "step": 2736 + }, + { + "epoch": 2.015463917525773, + "grad_norm": 0.36072492599487305, + "learning_rate": 2.938172488951336e-06, + "loss": 0.3591, + "step": 2737 + }, + { + "epoch": 2.01620029455081, + "grad_norm": 0.34583789110183716, + "learning_rate": 2.9342697426801693e-06, + "loss": 0.3348, + "step": 2738 + }, + { + "epoch": 2.0169366715758468, + "grad_norm": 0.3557235598564148, + "learning_rate": 2.9303685134188785e-06, + "loss": 0.3624, + "step": 2739 + }, + { + "epoch": 2.0176730486008836, + "grad_norm": 0.3426262140274048, + "learning_rate": 2.9264688040324098e-06, + "loss": 0.3786, + "step": 2740 + }, + { + "epoch": 2.0184094256259204, + "grad_norm": 0.33903318643569946, + "learning_rate": 2.922570617384591e-06, + "loss": 0.3707, + "step": 2741 + }, + { + "epoch": 2.0191458026509572, + "grad_norm": 0.32387295365333557, + "learning_rate": 2.918673956338136e-06, + "loss": 0.3661, + "step": 2742 + }, + { + "epoch": 2.019882179675994, + "grad_norm": 0.34796783328056335, + "learning_rate": 2.914778823754628e-06, + "loss": 0.3941, + "step": 2743 + }, + { + "epoch": 2.020618556701031, + "grad_norm": 0.3746124804019928, + "learning_rate": 2.9108852224945405e-06, + "loss": 0.3809, + "step": 2744 + }, + { + "epoch": 2.0213549337260677, + "grad_norm": 0.32552647590637207, + "learning_rate": 2.9069931554172155e-06, + "loss": 0.3749, + "step": 2745 + }, + { + "epoch": 2.0220913107511045, + "grad_norm": 0.36219868063926697, + "learning_rate": 2.9031026253808657e-06, + "loss": 0.3647, + "step": 2746 + }, + { + "epoch": 2.0228276877761413, + "grad_norm": 0.311967134475708, + "learning_rate": 2.899213635242585e-06, + "loss": 0.3714, + "step": 2747 + }, + { + "epoch": 2.023564064801178, + "grad_norm": 0.382050484418869, + "learning_rate": 2.8953261878583263e-06, + "loss": 0.3629, + "step": 2748 + }, + { + "epoch": 2.024300441826215, + "grad_norm": 0.3814060688018799, + "learning_rate": 2.8914402860829116e-06, + "loss": 0.3756, + "step": 2749 + }, + { + "epoch": 2.0250368188512518, + "grad_norm": 0.3468574285507202, + "learning_rate": 2.8875559327700376e-06, + "loss": 0.3673, + "step": 2750 + }, + { + "epoch": 2.0257731958762886, + "grad_norm": 0.3503856658935547, + "learning_rate": 2.8836731307722456e-06, + "loss": 0.3999, + "step": 2751 + }, + { + "epoch": 2.0265095729013254, + "grad_norm": 0.34949731826782227, + "learning_rate": 2.8797918829409553e-06, + "loss": 0.371, + "step": 2752 + }, + { + "epoch": 2.027245949926362, + "grad_norm": 0.4045935273170471, + "learning_rate": 2.8759121921264366e-06, + "loss": 0.3934, + "step": 2753 + }, + { + "epoch": 2.027982326951399, + "grad_norm": 0.3662734031677246, + "learning_rate": 2.8720340611778134e-06, + "loss": 0.3674, + "step": 2754 + }, + { + "epoch": 2.028718703976436, + "grad_norm": 0.3531613051891327, + "learning_rate": 2.8681574929430732e-06, + "loss": 0.3619, + "step": 2755 + }, + { + "epoch": 2.0294550810014726, + "grad_norm": 0.35535216331481934, + "learning_rate": 2.8642824902690482e-06, + "loss": 0.3672, + "step": 2756 + }, + { + "epoch": 2.0301914580265095, + "grad_norm": 0.39943090081214905, + "learning_rate": 2.860409056001421e-06, + "loss": 0.3925, + "step": 2757 + }, + { + "epoch": 2.0309278350515463, + "grad_norm": 0.39442721009254456, + "learning_rate": 2.8565371929847286e-06, + "loss": 0.3568, + "step": 2758 + }, + { + "epoch": 2.031664212076583, + "grad_norm": 0.35805344581604004, + "learning_rate": 2.852666904062351e-06, + "loss": 0.3424, + "step": 2759 + }, + { + "epoch": 2.03240058910162, + "grad_norm": 0.3514004051685333, + "learning_rate": 2.8487981920765044e-06, + "loss": 0.3695, + "step": 2760 + }, + { + "epoch": 2.0331369661266567, + "grad_norm": 0.3559187054634094, + "learning_rate": 2.844931059868261e-06, + "loss": 0.4159, + "step": 2761 + }, + { + "epoch": 2.0338733431516935, + "grad_norm": 0.3435509502887726, + "learning_rate": 2.841065510277523e-06, + "loss": 0.3477, + "step": 2762 + }, + { + "epoch": 2.0346097201767304, + "grad_norm": 0.3598922789096832, + "learning_rate": 2.8372015461430313e-06, + "loss": 0.3543, + "step": 2763 + }, + { + "epoch": 2.035346097201767, + "grad_norm": 0.4100363552570343, + "learning_rate": 2.833339170302369e-06, + "loss": 0.3485, + "step": 2764 + }, + { + "epoch": 2.036082474226804, + "grad_norm": 0.3745668828487396, + "learning_rate": 2.829478385591946e-06, + "loss": 0.3597, + "step": 2765 + }, + { + "epoch": 2.036818851251841, + "grad_norm": 0.32710880041122437, + "learning_rate": 2.8256191948470034e-06, + "loss": 0.3525, + "step": 2766 + }, + { + "epoch": 2.0375552282768776, + "grad_norm": 0.4090001583099365, + "learning_rate": 2.8217616009016203e-06, + "loss": 0.3994, + "step": 2767 + }, + { + "epoch": 2.0382916053019144, + "grad_norm": 0.35250866413116455, + "learning_rate": 2.81790560658869e-06, + "loss": 0.3575, + "step": 2768 + }, + { + "epoch": 2.0390279823269513, + "grad_norm": 0.3470795452594757, + "learning_rate": 2.8140512147399436e-06, + "loss": 0.3801, + "step": 2769 + }, + { + "epoch": 2.039764359351988, + "grad_norm": 0.39775145053863525, + "learning_rate": 2.8101984281859276e-06, + "loss": 0.3803, + "step": 2770 + }, + { + "epoch": 2.040500736377025, + "grad_norm": 0.38484060764312744, + "learning_rate": 2.8063472497560107e-06, + "loss": 0.3948, + "step": 2771 + }, + { + "epoch": 2.0412371134020617, + "grad_norm": 0.35866910219192505, + "learning_rate": 2.802497682278385e-06, + "loss": 0.3644, + "step": 2772 + }, + { + "epoch": 2.0419734904270985, + "grad_norm": 0.3320389688014984, + "learning_rate": 2.7986497285800564e-06, + "loss": 0.3525, + "step": 2773 + }, + { + "epoch": 2.0427098674521353, + "grad_norm": 0.3605724573135376, + "learning_rate": 2.7948033914868415e-06, + "loss": 0.3861, + "step": 2774 + }, + { + "epoch": 2.043446244477172, + "grad_norm": 0.35822010040283203, + "learning_rate": 2.7909586738233816e-06, + "loss": 0.3836, + "step": 2775 + }, + { + "epoch": 2.044182621502209, + "grad_norm": 0.38547834753990173, + "learning_rate": 2.787115578413113e-06, + "loss": 0.3731, + "step": 2776 + }, + { + "epoch": 2.044918998527246, + "grad_norm": 0.36046162247657776, + "learning_rate": 2.7832741080782944e-06, + "loss": 0.3705, + "step": 2777 + }, + { + "epoch": 2.0456553755522826, + "grad_norm": 0.36100590229034424, + "learning_rate": 2.7794342656399835e-06, + "loss": 0.3573, + "step": 2778 + }, + { + "epoch": 2.0463917525773194, + "grad_norm": 0.37001779675483704, + "learning_rate": 2.775596053918043e-06, + "loss": 0.3919, + "step": 2779 + }, + { + "epoch": 2.0471281296023562, + "grad_norm": 0.37465184926986694, + "learning_rate": 2.7717594757311435e-06, + "loss": 0.4072, + "step": 2780 + }, + { + "epoch": 2.047864506627393, + "grad_norm": 0.35224416851997375, + "learning_rate": 2.7679245338967497e-06, + "loss": 0.393, + "step": 2781 + }, + { + "epoch": 2.04860088365243, + "grad_norm": 0.38229209184646606, + "learning_rate": 2.764091231231125e-06, + "loss": 0.391, + "step": 2782 + }, + { + "epoch": 2.0493372606774667, + "grad_norm": 0.3451941907405853, + "learning_rate": 2.7602595705493353e-06, + "loss": 0.367, + "step": 2783 + }, + { + "epoch": 2.0500736377025035, + "grad_norm": 0.33509618043899536, + "learning_rate": 2.7564295546652366e-06, + "loss": 0.3684, + "step": 2784 + }, + { + "epoch": 2.0508100147275403, + "grad_norm": 0.38282108306884766, + "learning_rate": 2.7526011863914702e-06, + "loss": 0.3657, + "step": 2785 + }, + { + "epoch": 2.051546391752577, + "grad_norm": 0.38125553727149963, + "learning_rate": 2.748774468539481e-06, + "loss": 0.3667, + "step": 2786 + }, + { + "epoch": 2.052282768777614, + "grad_norm": 0.34986355900764465, + "learning_rate": 2.74494940391949e-06, + "loss": 0.4248, + "step": 2787 + }, + { + "epoch": 2.0530191458026508, + "grad_norm": 0.38790905475616455, + "learning_rate": 2.7411259953405143e-06, + "loss": 0.3802, + "step": 2788 + }, + { + "epoch": 2.0537555228276876, + "grad_norm": 0.3548150956630707, + "learning_rate": 2.737304245610346e-06, + "loss": 0.3833, + "step": 2789 + }, + { + "epoch": 2.0544918998527244, + "grad_norm": 0.3673829436302185, + "learning_rate": 2.7334841575355618e-06, + "loss": 0.3856, + "step": 2790 + }, + { + "epoch": 2.055228276877761, + "grad_norm": 0.363076388835907, + "learning_rate": 2.7296657339215227e-06, + "loss": 0.3783, + "step": 2791 + }, + { + "epoch": 2.055964653902798, + "grad_norm": 0.3587580621242523, + "learning_rate": 2.725848977572363e-06, + "loss": 0.386, + "step": 2792 + }, + { + "epoch": 2.056701030927835, + "grad_norm": 0.3971223831176758, + "learning_rate": 2.722033891290988e-06, + "loss": 0.3445, + "step": 2793 + }, + { + "epoch": 2.0574374079528717, + "grad_norm": 0.38008826971054077, + "learning_rate": 2.7182204778790878e-06, + "loss": 0.3613, + "step": 2794 + }, + { + "epoch": 2.0581737849779085, + "grad_norm": 0.3624361753463745, + "learning_rate": 2.714408740137115e-06, + "loss": 0.3675, + "step": 2795 + }, + { + "epoch": 2.0589101620029453, + "grad_norm": 0.33494141697883606, + "learning_rate": 2.7105986808642936e-06, + "loss": 0.3914, + "step": 2796 + }, + { + "epoch": 2.059646539027982, + "grad_norm": 0.36965957283973694, + "learning_rate": 2.7067903028586193e-06, + "loss": 0.3601, + "step": 2797 + }, + { + "epoch": 2.060382916053019, + "grad_norm": 0.3501898944377899, + "learning_rate": 2.702983608916849e-06, + "loss": 0.3996, + "step": 2798 + }, + { + "epoch": 2.0611192930780557, + "grad_norm": 0.41025105118751526, + "learning_rate": 2.6991786018345e-06, + "loss": 0.3741, + "step": 2799 + }, + { + "epoch": 2.0618556701030926, + "grad_norm": 0.3536095917224884, + "learning_rate": 2.69537528440586e-06, + "loss": 0.3831, + "step": 2800 + }, + { + "epoch": 2.0625920471281294, + "grad_norm": 0.369924396276474, + "learning_rate": 2.6915736594239676e-06, + "loss": 0.3879, + "step": 2801 + }, + { + "epoch": 2.063328424153166, + "grad_norm": 0.3758985996246338, + "learning_rate": 2.6877737296806217e-06, + "loss": 0.3922, + "step": 2802 + }, + { + "epoch": 2.064064801178203, + "grad_norm": 0.4005085527896881, + "learning_rate": 2.6839754979663752e-06, + "loss": 0.3852, + "step": 2803 + }, + { + "epoch": 2.0648011782032403, + "grad_norm": 0.3522018790245056, + "learning_rate": 2.6801789670705335e-06, + "loss": 0.3979, + "step": 2804 + }, + { + "epoch": 2.065537555228277, + "grad_norm": 0.36220279335975647, + "learning_rate": 2.6763841397811576e-06, + "loss": 0.3662, + "step": 2805 + }, + { + "epoch": 2.066273932253314, + "grad_norm": 0.37038183212280273, + "learning_rate": 2.6725910188850523e-06, + "loss": 0.364, + "step": 2806 + }, + { + "epoch": 2.0670103092783507, + "grad_norm": 0.3805944323539734, + "learning_rate": 2.668799607167769e-06, + "loss": 0.389, + "step": 2807 + }, + { + "epoch": 2.0677466863033875, + "grad_norm": 0.33866146206855774, + "learning_rate": 2.6650099074136095e-06, + "loss": 0.348, + "step": 2808 + }, + { + "epoch": 2.0684830633284244, + "grad_norm": 0.37582066655158997, + "learning_rate": 2.6612219224056133e-06, + "loss": 0.3865, + "step": 2809 + }, + { + "epoch": 2.069219440353461, + "grad_norm": 0.39235755801200867, + "learning_rate": 2.657435654925562e-06, + "loss": 0.3762, + "step": 2810 + }, + { + "epoch": 2.069955817378498, + "grad_norm": 0.3813096284866333, + "learning_rate": 2.6536511077539757e-06, + "loss": 0.3941, + "step": 2811 + }, + { + "epoch": 2.070692194403535, + "grad_norm": 0.35484588146209717, + "learning_rate": 2.6498682836701094e-06, + "loss": 0.388, + "step": 2812 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 0.37006160616874695, + "learning_rate": 2.6460871854519594e-06, + "loss": 0.4055, + "step": 2813 + }, + { + "epoch": 2.0721649484536084, + "grad_norm": 0.35195285081863403, + "learning_rate": 2.6423078158762473e-06, + "loss": 0.3766, + "step": 2814 + }, + { + "epoch": 2.0729013254786453, + "grad_norm": 0.3882370591163635, + "learning_rate": 2.638530177718427e-06, + "loss": 0.388, + "step": 2815 + }, + { + "epoch": 2.073637702503682, + "grad_norm": 0.33952176570892334, + "learning_rate": 2.6347542737526843e-06, + "loss": 0.3726, + "step": 2816 + }, + { + "epoch": 2.074374079528719, + "grad_norm": 0.37573882937431335, + "learning_rate": 2.6309801067519293e-06, + "loss": 0.3818, + "step": 2817 + }, + { + "epoch": 2.0751104565537557, + "grad_norm": 0.3656890094280243, + "learning_rate": 2.6272076794877915e-06, + "loss": 0.397, + "step": 2818 + }, + { + "epoch": 2.0758468335787925, + "grad_norm": 0.39007052779197693, + "learning_rate": 2.623436994730632e-06, + "loss": 0.3641, + "step": 2819 + }, + { + "epoch": 2.0765832106038293, + "grad_norm": 0.36530953645706177, + "learning_rate": 2.619668055249527e-06, + "loss": 0.3543, + "step": 2820 + }, + { + "epoch": 2.077319587628866, + "grad_norm": 0.3379687964916229, + "learning_rate": 2.6159008638122687e-06, + "loss": 0.3812, + "step": 2821 + }, + { + "epoch": 2.078055964653903, + "grad_norm": 0.3618925213813782, + "learning_rate": 2.6121354231853725e-06, + "loss": 0.4019, + "step": 2822 + }, + { + "epoch": 2.07879234167894, + "grad_norm": 0.39091405272483826, + "learning_rate": 2.608371736134063e-06, + "loss": 0.3696, + "step": 2823 + }, + { + "epoch": 2.0795287187039766, + "grad_norm": 0.3801884055137634, + "learning_rate": 2.6046098054222767e-06, + "loss": 0.3822, + "step": 2824 + }, + { + "epoch": 2.0802650957290134, + "grad_norm": 0.34748730063438416, + "learning_rate": 2.6008496338126643e-06, + "loss": 0.3625, + "step": 2825 + }, + { + "epoch": 2.0810014727540502, + "grad_norm": 0.3482074737548828, + "learning_rate": 2.5970912240665815e-06, + "loss": 0.3685, + "step": 2826 + }, + { + "epoch": 2.081737849779087, + "grad_norm": 0.39526116847991943, + "learning_rate": 2.59333457894409e-06, + "loss": 0.3852, + "step": 2827 + }, + { + "epoch": 2.082474226804124, + "grad_norm": 0.39386582374572754, + "learning_rate": 2.5895797012039576e-06, + "loss": 0.3473, + "step": 2828 + }, + { + "epoch": 2.0832106038291607, + "grad_norm": 0.40795037150382996, + "learning_rate": 2.5858265936036496e-06, + "loss": 0.394, + "step": 2829 + }, + { + "epoch": 2.0839469808541975, + "grad_norm": 0.40214234590530396, + "learning_rate": 2.582075258899339e-06, + "loss": 0.3657, + "step": 2830 + }, + { + "epoch": 2.0846833578792343, + "grad_norm": 0.35180026292800903, + "learning_rate": 2.578325699845892e-06, + "loss": 0.3508, + "step": 2831 + }, + { + "epoch": 2.085419734904271, + "grad_norm": 0.3568512201309204, + "learning_rate": 2.5745779191968686e-06, + "loss": 0.3434, + "step": 2832 + }, + { + "epoch": 2.086156111929308, + "grad_norm": 0.3792460262775421, + "learning_rate": 2.5708319197045297e-06, + "loss": 0.3691, + "step": 2833 + }, + { + "epoch": 2.0868924889543448, + "grad_norm": 0.36163848638534546, + "learning_rate": 2.567087704119821e-06, + "loss": 0.3726, + "step": 2834 + }, + { + "epoch": 2.0876288659793816, + "grad_norm": 0.3781639039516449, + "learning_rate": 2.5633452751923825e-06, + "loss": 0.3749, + "step": 2835 + }, + { + "epoch": 2.0883652430044184, + "grad_norm": 0.3682844340801239, + "learning_rate": 2.5596046356705418e-06, + "loss": 0.3525, + "step": 2836 + }, + { + "epoch": 2.089101620029455, + "grad_norm": 0.3800516128540039, + "learning_rate": 2.5558657883013078e-06, + "loss": 0.3867, + "step": 2837 + }, + { + "epoch": 2.089837997054492, + "grad_norm": 0.34438201785087585, + "learning_rate": 2.5521287358303814e-06, + "loss": 0.4029, + "step": 2838 + }, + { + "epoch": 2.090574374079529, + "grad_norm": 0.3582353889942169, + "learning_rate": 2.54839348100214e-06, + "loss": 0.35, + "step": 2839 + }, + { + "epoch": 2.0913107511045657, + "grad_norm": 0.35245490074157715, + "learning_rate": 2.544660026559639e-06, + "loss": 0.3818, + "step": 2840 + }, + { + "epoch": 2.0920471281296025, + "grad_norm": 0.3359908163547516, + "learning_rate": 2.5409283752446183e-06, + "loss": 0.3835, + "step": 2841 + }, + { + "epoch": 2.0927835051546393, + "grad_norm": 0.3465385437011719, + "learning_rate": 2.537198529797489e-06, + "loss": 0.3917, + "step": 2842 + }, + { + "epoch": 2.093519882179676, + "grad_norm": 0.3186095952987671, + "learning_rate": 2.533470492957335e-06, + "loss": 0.3766, + "step": 2843 + }, + { + "epoch": 2.094256259204713, + "grad_norm": 0.34522688388824463, + "learning_rate": 2.5297442674619153e-06, + "loss": 0.3691, + "step": 2844 + }, + { + "epoch": 2.0949926362297497, + "grad_norm": 0.3759070932865143, + "learning_rate": 2.526019856047656e-06, + "loss": 0.3514, + "step": 2845 + }, + { + "epoch": 2.0957290132547866, + "grad_norm": 0.37958410382270813, + "learning_rate": 2.5222972614496543e-06, + "loss": 0.3999, + "step": 2846 + }, + { + "epoch": 2.0964653902798234, + "grad_norm": 0.33293333649635315, + "learning_rate": 2.518576486401671e-06, + "loss": 0.3622, + "step": 2847 + }, + { + "epoch": 2.09720176730486, + "grad_norm": 0.39282292127609253, + "learning_rate": 2.514857533636128e-06, + "loss": 0.3799, + "step": 2848 + }, + { + "epoch": 2.097938144329897, + "grad_norm": 0.3422982394695282, + "learning_rate": 2.5111404058841155e-06, + "loss": 0.3917, + "step": 2849 + }, + { + "epoch": 2.098674521354934, + "grad_norm": 0.3647412359714508, + "learning_rate": 2.5074251058753783e-06, + "loss": 0.3606, + "step": 2850 + }, + { + "epoch": 2.0994108983799706, + "grad_norm": 0.38597914576530457, + "learning_rate": 2.5037116363383203e-06, + "loss": 0.3649, + "step": 2851 + }, + { + "epoch": 2.1001472754050075, + "grad_norm": 0.33769500255584717, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.37, + "step": 2852 + }, + { + "epoch": 2.1008836524300443, + "grad_norm": 0.3958123028278351, + "learning_rate": 2.4962901995861348e-06, + "loss": 0.3439, + "step": 2853 + }, + { + "epoch": 2.101620029455081, + "grad_norm": 0.34865254163742065, + "learning_rate": 2.4925822378210844e-06, + "loss": 0.3636, + "step": 2854 + }, + { + "epoch": 2.102356406480118, + "grad_norm": 0.3949892222881317, + "learning_rate": 2.488876117427869e-06, + "loss": 0.3538, + "step": 2855 + }, + { + "epoch": 2.1030927835051547, + "grad_norm": 0.39913707971572876, + "learning_rate": 2.4851718411281495e-06, + "loss": 0.368, + "step": 2856 + }, + { + "epoch": 2.1038291605301915, + "grad_norm": 0.3734777569770813, + "learning_rate": 2.4814694116422326e-06, + "loss": 0.3516, + "step": 2857 + }, + { + "epoch": 2.1045655375552283, + "grad_norm": 0.414492130279541, + "learning_rate": 2.477768831689074e-06, + "loss": 0.3833, + "step": 2858 + }, + { + "epoch": 2.105301914580265, + "grad_norm": 0.3294821083545685, + "learning_rate": 2.4740701039862663e-06, + "loss": 0.3678, + "step": 2859 + }, + { + "epoch": 2.106038291605302, + "grad_norm": 0.38018667697906494, + "learning_rate": 2.4703732312500438e-06, + "loss": 0.3991, + "step": 2860 + }, + { + "epoch": 2.106774668630339, + "grad_norm": 0.35948431491851807, + "learning_rate": 2.466678216195277e-06, + "loss": 0.361, + "step": 2861 + }, + { + "epoch": 2.1075110456553756, + "grad_norm": 0.3856082856655121, + "learning_rate": 2.462985061535472e-06, + "loss": 0.369, + "step": 2862 + }, + { + "epoch": 2.1082474226804124, + "grad_norm": 0.39372214674949646, + "learning_rate": 2.459293769982774e-06, + "loss": 0.3786, + "step": 2863 + }, + { + "epoch": 2.1089837997054492, + "grad_norm": 0.3526458144187927, + "learning_rate": 2.455604344247954e-06, + "loss": 0.3703, + "step": 2864 + }, + { + "epoch": 2.109720176730486, + "grad_norm": 0.3489939570426941, + "learning_rate": 2.4519167870404126e-06, + "loss": 0.3932, + "step": 2865 + }, + { + "epoch": 2.110456553755523, + "grad_norm": 0.37256160378456116, + "learning_rate": 2.4482311010681842e-06, + "loss": 0.3982, + "step": 2866 + }, + { + "epoch": 2.1111929307805597, + "grad_norm": 0.3323703110218048, + "learning_rate": 2.4445472890379233e-06, + "loss": 0.3791, + "step": 2867 + }, + { + "epoch": 2.1119293078055965, + "grad_norm": 0.36218979954719543, + "learning_rate": 2.4408653536549104e-06, + "loss": 0.4038, + "step": 2868 + }, + { + "epoch": 2.1126656848306333, + "grad_norm": 0.34477296471595764, + "learning_rate": 2.437185297623047e-06, + "loss": 0.391, + "step": 2869 + }, + { + "epoch": 2.11340206185567, + "grad_norm": 0.35292166471481323, + "learning_rate": 2.4335071236448536e-06, + "loss": 0.3793, + "step": 2870 + }, + { + "epoch": 2.114138438880707, + "grad_norm": 0.35486897826194763, + "learning_rate": 2.4298308344214745e-06, + "loss": 0.3849, + "step": 2871 + }, + { + "epoch": 2.1148748159057438, + "grad_norm": 0.38103657960891724, + "learning_rate": 2.4261564326526623e-06, + "loss": 0.3523, + "step": 2872 + }, + { + "epoch": 2.1156111929307806, + "grad_norm": 0.37378567457199097, + "learning_rate": 2.422483921036785e-06, + "loss": 0.3847, + "step": 2873 + }, + { + "epoch": 2.1163475699558174, + "grad_norm": 0.38318321108818054, + "learning_rate": 2.418813302270829e-06, + "loss": 0.3825, + "step": 2874 + }, + { + "epoch": 2.1170839469808542, + "grad_norm": 0.34228307008743286, + "learning_rate": 2.415144579050382e-06, + "loss": 0.3661, + "step": 2875 + }, + { + "epoch": 2.117820324005891, + "grad_norm": 0.3472125828266144, + "learning_rate": 2.411477754069645e-06, + "loss": 0.3491, + "step": 2876 + }, + { + "epoch": 2.118556701030928, + "grad_norm": 0.355693519115448, + "learning_rate": 2.4078128300214225e-06, + "loss": 0.3481, + "step": 2877 + }, + { + "epoch": 2.1192930780559647, + "grad_norm": 0.40808922052383423, + "learning_rate": 2.4041498095971253e-06, + "loss": 0.4022, + "step": 2878 + }, + { + "epoch": 2.1200294550810015, + "grad_norm": 0.3449529707431793, + "learning_rate": 2.4004886954867618e-06, + "loss": 0.361, + "step": 2879 + }, + { + "epoch": 2.1207658321060383, + "grad_norm": 0.35305774211883545, + "learning_rate": 2.3968294903789474e-06, + "loss": 0.3913, + "step": 2880 + }, + { + "epoch": 2.121502209131075, + "grad_norm": 0.3608260750770569, + "learning_rate": 2.393172196960891e-06, + "loss": 0.3521, + "step": 2881 + }, + { + "epoch": 2.122238586156112, + "grad_norm": 0.3713111877441406, + "learning_rate": 2.3895168179183947e-06, + "loss": 0.389, + "step": 2882 + }, + { + "epoch": 2.1229749631811488, + "grad_norm": 0.3446366488933563, + "learning_rate": 2.3858633559358635e-06, + "loss": 0.3765, + "step": 2883 + }, + { + "epoch": 2.1237113402061856, + "grad_norm": 0.36745747923851013, + "learning_rate": 2.3822118136962876e-06, + "loss": 0.38, + "step": 2884 + }, + { + "epoch": 2.1244477172312224, + "grad_norm": 0.3752921223640442, + "learning_rate": 2.378562193881248e-06, + "loss": 0.3714, + "step": 2885 + }, + { + "epoch": 2.125184094256259, + "grad_norm": 0.33874091506004333, + "learning_rate": 2.3749144991709174e-06, + "loss": 0.3857, + "step": 2886 + }, + { + "epoch": 2.125920471281296, + "grad_norm": 0.3628979027271271, + "learning_rate": 2.371268732244048e-06, + "loss": 0.3781, + "step": 2887 + }, + { + "epoch": 2.126656848306333, + "grad_norm": 0.36751553416252136, + "learning_rate": 2.367624895777987e-06, + "loss": 0.3642, + "step": 2888 + }, + { + "epoch": 2.1273932253313697, + "grad_norm": 0.3799552023410797, + "learning_rate": 2.3639829924486546e-06, + "loss": 0.3399, + "step": 2889 + }, + { + "epoch": 2.1281296023564065, + "grad_norm": 0.34371480345726013, + "learning_rate": 2.3603430249305532e-06, + "loss": 0.3956, + "step": 2890 + }, + { + "epoch": 2.1288659793814433, + "grad_norm": 0.3942258954048157, + "learning_rate": 2.356704995896768e-06, + "loss": 0.3714, + "step": 2891 + }, + { + "epoch": 2.12960235640648, + "grad_norm": 0.34906941652297974, + "learning_rate": 2.353068908018957e-06, + "loss": 0.3743, + "step": 2892 + }, + { + "epoch": 2.130338733431517, + "grad_norm": 0.3614822328090668, + "learning_rate": 2.3494347639673513e-06, + "loss": 0.3632, + "step": 2893 + }, + { + "epoch": 2.1310751104565537, + "grad_norm": 0.3540303111076355, + "learning_rate": 2.3458025664107587e-06, + "loss": 0.3442, + "step": 2894 + }, + { + "epoch": 2.1318114874815906, + "grad_norm": 0.3677811622619629, + "learning_rate": 2.342172318016552e-06, + "loss": 0.3421, + "step": 2895 + }, + { + "epoch": 2.1325478645066274, + "grad_norm": 0.3830873668193817, + "learning_rate": 2.33854402145068e-06, + "loss": 0.378, + "step": 2896 + }, + { + "epoch": 2.133284241531664, + "grad_norm": 0.405880868434906, + "learning_rate": 2.3349176793776523e-06, + "loss": 0.3765, + "step": 2897 + }, + { + "epoch": 2.134020618556701, + "grad_norm": 0.37011435627937317, + "learning_rate": 2.3312932944605433e-06, + "loss": 0.3852, + "step": 2898 + }, + { + "epoch": 2.134756995581738, + "grad_norm": 0.3674939274787903, + "learning_rate": 2.3276708693609947e-06, + "loss": 0.3663, + "step": 2899 + }, + { + "epoch": 2.1354933726067746, + "grad_norm": 0.3470740020275116, + "learning_rate": 2.324050406739205e-06, + "loss": 0.3652, + "step": 2900 + }, + { + "epoch": 2.1362297496318114, + "grad_norm": 0.3683933913707733, + "learning_rate": 2.32043190925393e-06, + "loss": 0.3959, + "step": 2901 + }, + { + "epoch": 2.1369661266568483, + "grad_norm": 0.3439938426017761, + "learning_rate": 2.316815379562491e-06, + "loss": 0.3843, + "step": 2902 + }, + { + "epoch": 2.137702503681885, + "grad_norm": 0.3798215389251709, + "learning_rate": 2.3132008203207508e-06, + "loss": 0.3494, + "step": 2903 + }, + { + "epoch": 2.138438880706922, + "grad_norm": 0.3750232458114624, + "learning_rate": 2.309588234183137e-06, + "loss": 0.388, + "step": 2904 + }, + { + "epoch": 2.1391752577319587, + "grad_norm": 0.34800633788108826, + "learning_rate": 2.3059776238026233e-06, + "loss": 0.3883, + "step": 2905 + }, + { + "epoch": 2.1399116347569955, + "grad_norm": 0.3639180660247803, + "learning_rate": 2.30236899183073e-06, + "loss": 0.3853, + "step": 2906 + }, + { + "epoch": 2.1406480117820323, + "grad_norm": 0.3714924156665802, + "learning_rate": 2.298762340917531e-06, + "loss": 0.415, + "step": 2907 + }, + { + "epoch": 2.141384388807069, + "grad_norm": 0.4161515235900879, + "learning_rate": 2.295157673711641e-06, + "loss": 0.3851, + "step": 2908 + }, + { + "epoch": 2.142120765832106, + "grad_norm": 0.35217177867889404, + "learning_rate": 2.2915549928602153e-06, + "loss": 0.3988, + "step": 2909 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.3613126575946808, + "learning_rate": 2.2879543010089613e-06, + "loss": 0.3599, + "step": 2910 + }, + { + "epoch": 2.1435935198821796, + "grad_norm": 0.34729936718940735, + "learning_rate": 2.2843556008021105e-06, + "loss": 0.3698, + "step": 2911 + }, + { + "epoch": 2.1443298969072164, + "grad_norm": 0.37736669182777405, + "learning_rate": 2.280758894882441e-06, + "loss": 0.3627, + "step": 2912 + }, + { + "epoch": 2.1450662739322532, + "grad_norm": 0.36949247121810913, + "learning_rate": 2.2771641858912684e-06, + "loss": 0.3889, + "step": 2913 + }, + { + "epoch": 2.14580265095729, + "grad_norm": 0.3440004587173462, + "learning_rate": 2.2735714764684368e-06, + "loss": 0.3574, + "step": 2914 + }, + { + "epoch": 2.146539027982327, + "grad_norm": 0.3486247658729553, + "learning_rate": 2.269980769252321e-06, + "loss": 0.3554, + "step": 2915 + }, + { + "epoch": 2.1472754050073637, + "grad_norm": 0.35198965668678284, + "learning_rate": 2.2663920668798316e-06, + "loss": 0.3627, + "step": 2916 + }, + { + "epoch": 2.1480117820324005, + "grad_norm": 0.37342336773872375, + "learning_rate": 2.262805371986402e-06, + "loss": 0.375, + "step": 2917 + }, + { + "epoch": 2.1487481590574373, + "grad_norm": 0.36178645491600037, + "learning_rate": 2.2592206872059913e-06, + "loss": 0.4036, + "step": 2918 + }, + { + "epoch": 2.149484536082474, + "grad_norm": 0.37667861580848694, + "learning_rate": 2.255638015171085e-06, + "loss": 0.3922, + "step": 2919 + }, + { + "epoch": 2.150220913107511, + "grad_norm": 0.3384239077568054, + "learning_rate": 2.2520573585126863e-06, + "loss": 0.3869, + "step": 2920 + }, + { + "epoch": 2.1509572901325478, + "grad_norm": 0.3617817759513855, + "learning_rate": 2.248478719860326e-06, + "loss": 0.3777, + "step": 2921 + }, + { + "epoch": 2.1516936671575846, + "grad_norm": 0.34681081771850586, + "learning_rate": 2.2449021018420454e-06, + "loss": 0.355, + "step": 2922 + }, + { + "epoch": 2.1524300441826214, + "grad_norm": 0.3603213131427765, + "learning_rate": 2.2413275070844026e-06, + "loss": 0.4205, + "step": 2923 + }, + { + "epoch": 2.153166421207658, + "grad_norm": 0.33105942606925964, + "learning_rate": 2.2377549382124767e-06, + "loss": 0.3259, + "step": 2924 + }, + { + "epoch": 2.153902798232695, + "grad_norm": 0.36263999342918396, + "learning_rate": 2.2341843978498525e-06, + "loss": 0.383, + "step": 2925 + }, + { + "epoch": 2.154639175257732, + "grad_norm": 0.34897348284721375, + "learning_rate": 2.230615888618624e-06, + "loss": 0.3873, + "step": 2926 + }, + { + "epoch": 2.1553755522827687, + "grad_norm": 0.3384336829185486, + "learning_rate": 2.2270494131394034e-06, + "loss": 0.3643, + "step": 2927 + }, + { + "epoch": 2.1561119293078055, + "grad_norm": 0.3632971942424774, + "learning_rate": 2.223484974031294e-06, + "loss": 0.3668, + "step": 2928 + }, + { + "epoch": 2.1568483063328423, + "grad_norm": 0.3339279592037201, + "learning_rate": 2.2199225739119184e-06, + "loss": 0.3488, + "step": 2929 + }, + { + "epoch": 2.157584683357879, + "grad_norm": 0.35420891642570496, + "learning_rate": 2.216362215397393e-06, + "loss": 0.3732, + "step": 2930 + }, + { + "epoch": 2.158321060382916, + "grad_norm": 0.35036933422088623, + "learning_rate": 2.2128039011023367e-06, + "loss": 0.3618, + "step": 2931 + }, + { + "epoch": 2.1590574374079528, + "grad_norm": 0.40435558557510376, + "learning_rate": 2.2092476336398706e-06, + "loss": 0.4052, + "step": 2932 + }, + { + "epoch": 2.1597938144329896, + "grad_norm": 0.32747867703437805, + "learning_rate": 2.2056934156216094e-06, + "loss": 0.3612, + "step": 2933 + }, + { + "epoch": 2.1605301914580264, + "grad_norm": 0.3270984888076782, + "learning_rate": 2.2021412496576598e-06, + "loss": 0.3827, + "step": 2934 + }, + { + "epoch": 2.161266568483063, + "grad_norm": 0.3557640314102173, + "learning_rate": 2.198591138356633e-06, + "loss": 0.3625, + "step": 2935 + }, + { + "epoch": 2.1620029455081, + "grad_norm": 0.3336459994316101, + "learning_rate": 2.195043084325616e-06, + "loss": 0.35, + "step": 2936 + }, + { + "epoch": 2.162739322533137, + "grad_norm": 0.3505702316761017, + "learning_rate": 2.191497090170193e-06, + "loss": 0.3816, + "step": 2937 + }, + { + "epoch": 2.1634756995581736, + "grad_norm": 0.37018442153930664, + "learning_rate": 2.1879531584944396e-06, + "loss": 0.3812, + "step": 2938 + }, + { + "epoch": 2.1642120765832105, + "grad_norm": 0.33809441328048706, + "learning_rate": 2.1844112919009087e-06, + "loss": 0.39, + "step": 2939 + }, + { + "epoch": 2.1649484536082473, + "grad_norm": 0.32393452525138855, + "learning_rate": 2.1808714929906394e-06, + "loss": 0.3462, + "step": 2940 + }, + { + "epoch": 2.165684830633284, + "grad_norm": 0.33284613490104675, + "learning_rate": 2.1773337643631565e-06, + "loss": 0.3845, + "step": 2941 + }, + { + "epoch": 2.166421207658321, + "grad_norm": 0.36973825097084045, + "learning_rate": 2.173798108616459e-06, + "loss": 0.3597, + "step": 2942 + }, + { + "epoch": 2.1671575846833577, + "grad_norm": 0.3738713264465332, + "learning_rate": 2.1702645283470238e-06, + "loss": 0.381, + "step": 2943 + }, + { + "epoch": 2.1678939617083945, + "grad_norm": 0.3623276352882385, + "learning_rate": 2.166733026149811e-06, + "loss": 0.3546, + "step": 2944 + }, + { + "epoch": 2.1686303387334314, + "grad_norm": 0.4014292061328888, + "learning_rate": 2.1632036046182416e-06, + "loss": 0.3645, + "step": 2945 + }, + { + "epoch": 2.169366715758468, + "grad_norm": 0.37799927592277527, + "learning_rate": 2.159676266344222e-06, + "loss": 0.3611, + "step": 2946 + }, + { + "epoch": 2.170103092783505, + "grad_norm": 0.379955917596817, + "learning_rate": 2.15615101391812e-06, + "loss": 0.3564, + "step": 2947 + }, + { + "epoch": 2.170839469808542, + "grad_norm": 0.36554086208343506, + "learning_rate": 2.1526278499287746e-06, + "loss": 0.3748, + "step": 2948 + }, + { + "epoch": 2.1715758468335786, + "grad_norm": 0.3485044538974762, + "learning_rate": 2.1491067769634927e-06, + "loss": 0.3738, + "step": 2949 + }, + { + "epoch": 2.1723122238586154, + "grad_norm": 0.4192858934402466, + "learning_rate": 2.145587797608043e-06, + "loss": 0.3907, + "step": 2950 + }, + { + "epoch": 2.1730486008836523, + "grad_norm": 0.3680093288421631, + "learning_rate": 2.1420709144466557e-06, + "loss": 0.3673, + "step": 2951 + }, + { + "epoch": 2.173784977908689, + "grad_norm": 0.3767206370830536, + "learning_rate": 2.1385561300620287e-06, + "loss": 0.3542, + "step": 2952 + }, + { + "epoch": 2.174521354933726, + "grad_norm": 0.3542313575744629, + "learning_rate": 2.1350434470353065e-06, + "loss": 0.3815, + "step": 2953 + }, + { + "epoch": 2.1752577319587627, + "grad_norm": 0.3587740957736969, + "learning_rate": 2.131532867946102e-06, + "loss": 0.3831, + "step": 2954 + }, + { + "epoch": 2.1759941089837995, + "grad_norm": 0.35472214221954346, + "learning_rate": 2.1280243953724784e-06, + "loss": 0.3649, + "step": 2955 + }, + { + "epoch": 2.1767304860088363, + "grad_norm": 0.37398767471313477, + "learning_rate": 2.1245180318909482e-06, + "loss": 0.3801, + "step": 2956 + }, + { + "epoch": 2.177466863033873, + "grad_norm": 0.3110312521457672, + "learning_rate": 2.121013780076483e-06, + "loss": 0.3539, + "step": 2957 + }, + { + "epoch": 2.17820324005891, + "grad_norm": 0.34271514415740967, + "learning_rate": 2.1175116425024978e-06, + "loss": 0.3796, + "step": 2958 + }, + { + "epoch": 2.178939617083947, + "grad_norm": 0.34827741980552673, + "learning_rate": 2.1140116217408554e-06, + "loss": 0.3739, + "step": 2959 + }, + { + "epoch": 2.1796759941089836, + "grad_norm": 0.327886700630188, + "learning_rate": 2.110513720361869e-06, + "loss": 0.3734, + "step": 2960 + }, + { + "epoch": 2.1804123711340204, + "grad_norm": 0.3339656889438629, + "learning_rate": 2.107017940934286e-06, + "loss": 0.3918, + "step": 2961 + }, + { + "epoch": 2.1811487481590572, + "grad_norm": 0.380188912153244, + "learning_rate": 2.1035242860253064e-06, + "loss": 0.3954, + "step": 2962 + }, + { + "epoch": 2.181885125184094, + "grad_norm": 0.33889806270599365, + "learning_rate": 2.100032758200562e-06, + "loss": 0.3591, + "step": 2963 + }, + { + "epoch": 2.182621502209131, + "grad_norm": 0.3376297056674957, + "learning_rate": 2.0965433600241247e-06, + "loss": 0.3968, + "step": 2964 + }, + { + "epoch": 2.1833578792341677, + "grad_norm": 0.3318977355957031, + "learning_rate": 2.093056094058506e-06, + "loss": 0.3602, + "step": 2965 + }, + { + "epoch": 2.184094256259205, + "grad_norm": 0.3732307255268097, + "learning_rate": 2.089570962864647e-06, + "loss": 0.3863, + "step": 2966 + }, + { + "epoch": 2.1848306332842418, + "grad_norm": 0.33816441893577576, + "learning_rate": 2.0860879690019216e-06, + "loss": 0.3826, + "step": 2967 + }, + { + "epoch": 2.1855670103092786, + "grad_norm": 0.38040101528167725, + "learning_rate": 2.0826071150281374e-06, + "loss": 0.3581, + "step": 2968 + }, + { + "epoch": 2.1863033873343154, + "grad_norm": 0.3592909276485443, + "learning_rate": 2.0791284034995296e-06, + "loss": 0.4081, + "step": 2969 + }, + { + "epoch": 2.187039764359352, + "grad_norm": 0.330105185508728, + "learning_rate": 2.0756518369707528e-06, + "loss": 0.3798, + "step": 2970 + }, + { + "epoch": 2.187776141384389, + "grad_norm": 0.34812307357788086, + "learning_rate": 2.0721774179948978e-06, + "loss": 0.4039, + "step": 2971 + }, + { + "epoch": 2.188512518409426, + "grad_norm": 0.39521628618240356, + "learning_rate": 2.0687051491234717e-06, + "loss": 0.3804, + "step": 2972 + }, + { + "epoch": 2.1892488954344627, + "grad_norm": 0.3390514552593231, + "learning_rate": 2.0652350329064012e-06, + "loss": 0.3691, + "step": 2973 + }, + { + "epoch": 2.1899852724594995, + "grad_norm": 0.38476595282554626, + "learning_rate": 2.061767071892039e-06, + "loss": 0.3884, + "step": 2974 + }, + { + "epoch": 2.1907216494845363, + "grad_norm": 0.35555726289749146, + "learning_rate": 2.0583012686271493e-06, + "loss": 0.3852, + "step": 2975 + }, + { + "epoch": 2.191458026509573, + "grad_norm": 0.3225259482860565, + "learning_rate": 2.0548376256569107e-06, + "loss": 0.3712, + "step": 2976 + }, + { + "epoch": 2.19219440353461, + "grad_norm": 0.3521542251110077, + "learning_rate": 2.051376145524924e-06, + "loss": 0.389, + "step": 2977 + }, + { + "epoch": 2.1929307805596467, + "grad_norm": 0.3622699975967407, + "learning_rate": 2.047916830773187e-06, + "loss": 0.3854, + "step": 2978 + }, + { + "epoch": 2.1936671575846836, + "grad_norm": 0.36842185258865356, + "learning_rate": 2.044459683942124e-06, + "loss": 0.3563, + "step": 2979 + }, + { + "epoch": 2.1944035346097204, + "grad_norm": 0.4047960042953491, + "learning_rate": 2.041004707570555e-06, + "loss": 0.3704, + "step": 2980 + }, + { + "epoch": 2.195139911634757, + "grad_norm": 0.3790772259235382, + "learning_rate": 2.037551904195709e-06, + "loss": 0.3918, + "step": 2981 + }, + { + "epoch": 2.195876288659794, + "grad_norm": 0.31942737102508545, + "learning_rate": 2.0341012763532243e-06, + "loss": 0.3562, + "step": 2982 + }, + { + "epoch": 2.196612665684831, + "grad_norm": 0.36221376061439514, + "learning_rate": 2.0306528265771357e-06, + "loss": 0.3856, + "step": 2983 + }, + { + "epoch": 2.1973490427098676, + "grad_norm": 0.37802380323410034, + "learning_rate": 2.0272065573998794e-06, + "loss": 0.3645, + "step": 2984 + }, + { + "epoch": 2.1980854197349045, + "grad_norm": 0.34147152304649353, + "learning_rate": 2.0237624713522945e-06, + "loss": 0.3898, + "step": 2985 + }, + { + "epoch": 2.1988217967599413, + "grad_norm": 0.3583637475967407, + "learning_rate": 2.020320570963612e-06, + "loss": 0.3768, + "step": 2986 + }, + { + "epoch": 2.199558173784978, + "grad_norm": 0.3441274166107178, + "learning_rate": 2.0168808587614584e-06, + "loss": 0.3712, + "step": 2987 + }, + { + "epoch": 2.200294550810015, + "grad_norm": 0.3442659080028534, + "learning_rate": 2.0134433372718565e-06, + "loss": 0.3423, + "step": 2988 + }, + { + "epoch": 2.2010309278350517, + "grad_norm": 0.34774288535118103, + "learning_rate": 2.010008009019215e-06, + "loss": 0.3854, + "step": 2989 + }, + { + "epoch": 2.2017673048600885, + "grad_norm": 0.3395615518093109, + "learning_rate": 2.0065748765263386e-06, + "loss": 0.3523, + "step": 2990 + }, + { + "epoch": 2.2025036818851254, + "grad_norm": 0.3748759627342224, + "learning_rate": 2.003143942314415e-06, + "loss": 0.3714, + "step": 2991 + }, + { + "epoch": 2.203240058910162, + "grad_norm": 0.34823179244995117, + "learning_rate": 1.999715208903017e-06, + "loss": 0.3958, + "step": 2992 + }, + { + "epoch": 2.203976435935199, + "grad_norm": 0.32306548953056335, + "learning_rate": 1.996288678810105e-06, + "loss": 0.3813, + "step": 2993 + }, + { + "epoch": 2.204712812960236, + "grad_norm": 0.3721083700656891, + "learning_rate": 1.9928643545520204e-06, + "loss": 0.3786, + "step": 2994 + }, + { + "epoch": 2.2054491899852726, + "grad_norm": 0.34985750913619995, + "learning_rate": 1.989442238643478e-06, + "loss": 0.3816, + "step": 2995 + }, + { + "epoch": 2.2061855670103094, + "grad_norm": 0.3577035367488861, + "learning_rate": 1.9860223335975815e-06, + "loss": 0.355, + "step": 2996 + }, + { + "epoch": 2.2069219440353463, + "grad_norm": 0.3292270302772522, + "learning_rate": 1.9826046419258037e-06, + "loss": 0.3322, + "step": 2997 + }, + { + "epoch": 2.207658321060383, + "grad_norm": 0.3422560393810272, + "learning_rate": 1.9791891661379926e-06, + "loss": 0.3514, + "step": 2998 + }, + { + "epoch": 2.20839469808542, + "grad_norm": 0.3499544560909271, + "learning_rate": 1.975775908742374e-06, + "loss": 0.3692, + "step": 2999 + }, + { + "epoch": 2.2091310751104567, + "grad_norm": 0.35650497674942017, + "learning_rate": 1.972364872245539e-06, + "loss": 0.382, + "step": 3000 + }, + { + "epoch": 2.2098674521354935, + "grad_norm": 0.3358071446418762, + "learning_rate": 1.9689560591524482e-06, + "loss": 0.3655, + "step": 3001 + }, + { + "epoch": 2.2106038291605303, + "grad_norm": 0.3230755925178528, + "learning_rate": 1.965549471966436e-06, + "loss": 0.3629, + "step": 3002 + }, + { + "epoch": 2.211340206185567, + "grad_norm": 0.33048614859580994, + "learning_rate": 1.96214511318919e-06, + "loss": 0.3856, + "step": 3003 + }, + { + "epoch": 2.212076583210604, + "grad_norm": 0.36040183901786804, + "learning_rate": 1.958742985320774e-06, + "loss": 0.3601, + "step": 3004 + }, + { + "epoch": 2.212812960235641, + "grad_norm": 0.3354277014732361, + "learning_rate": 1.955343090859606e-06, + "loss": 0.3607, + "step": 3005 + }, + { + "epoch": 2.2135493372606776, + "grad_norm": 0.3312050700187683, + "learning_rate": 1.9519454323024644e-06, + "loss": 0.3716, + "step": 3006 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 0.34683001041412354, + "learning_rate": 1.9485500121444896e-06, + "loss": 0.3692, + "step": 3007 + }, + { + "epoch": 2.2150220913107512, + "grad_norm": 0.3769376575946808, + "learning_rate": 1.945156832879174e-06, + "loss": 0.3664, + "step": 3008 + }, + { + "epoch": 2.215758468335788, + "grad_norm": 0.33873477578163147, + "learning_rate": 1.941765896998365e-06, + "loss": 0.3909, + "step": 3009 + }, + { + "epoch": 2.216494845360825, + "grad_norm": 0.3695991635322571, + "learning_rate": 1.938377206992266e-06, + "loss": 0.3717, + "step": 3010 + }, + { + "epoch": 2.2172312223858617, + "grad_norm": 0.3528919816017151, + "learning_rate": 1.934990765349427e-06, + "loss": 0.3708, + "step": 3011 + }, + { + "epoch": 2.2179675994108985, + "grad_norm": 0.3522799015045166, + "learning_rate": 1.931606574556749e-06, + "loss": 0.373, + "step": 3012 + }, + { + "epoch": 2.2187039764359353, + "grad_norm": 0.3132975101470947, + "learning_rate": 1.928224637099479e-06, + "loss": 0.35, + "step": 3013 + }, + { + "epoch": 2.219440353460972, + "grad_norm": 0.35459184646606445, + "learning_rate": 1.9248449554612076e-06, + "loss": 0.3724, + "step": 3014 + }, + { + "epoch": 2.220176730486009, + "grad_norm": 0.367097944021225, + "learning_rate": 1.9214675321238753e-06, + "loss": 0.3754, + "step": 3015 + }, + { + "epoch": 2.2209131075110458, + "grad_norm": 0.38313212990760803, + "learning_rate": 1.9180923695677565e-06, + "loss": 0.3632, + "step": 3016 + }, + { + "epoch": 2.2216494845360826, + "grad_norm": 0.3429620563983917, + "learning_rate": 1.9147194702714683e-06, + "loss": 0.3543, + "step": 3017 + }, + { + "epoch": 2.2223858615611194, + "grad_norm": 0.31693652272224426, + "learning_rate": 1.911348836711969e-06, + "loss": 0.3526, + "step": 3018 + }, + { + "epoch": 2.223122238586156, + "grad_norm": 0.33426809310913086, + "learning_rate": 1.907980471364548e-06, + "loss": 0.3795, + "step": 3019 + }, + { + "epoch": 2.223858615611193, + "grad_norm": 0.3561391532421112, + "learning_rate": 1.9046143767028309e-06, + "loss": 0.3824, + "step": 3020 + }, + { + "epoch": 2.22459499263623, + "grad_norm": 0.32854175567626953, + "learning_rate": 1.9012505551987764e-06, + "loss": 0.3357, + "step": 3021 + }, + { + "epoch": 2.2253313696612667, + "grad_norm": 0.34215104579925537, + "learning_rate": 1.897889009322672e-06, + "loss": 0.3826, + "step": 3022 + }, + { + "epoch": 2.2260677466863035, + "grad_norm": 0.3710014522075653, + "learning_rate": 1.8945297415431379e-06, + "loss": 0.3799, + "step": 3023 + }, + { + "epoch": 2.2268041237113403, + "grad_norm": 0.35444051027297974, + "learning_rate": 1.8911727543271174e-06, + "loss": 0.396, + "step": 3024 + }, + { + "epoch": 2.227540500736377, + "grad_norm": 0.3570059835910797, + "learning_rate": 1.8878180501398796e-06, + "loss": 0.3755, + "step": 3025 + }, + { + "epoch": 2.228276877761414, + "grad_norm": 0.34448522329330444, + "learning_rate": 1.88446563144502e-06, + "loss": 0.4058, + "step": 3026 + }, + { + "epoch": 2.2290132547864507, + "grad_norm": 0.3360097408294678, + "learning_rate": 1.8811155007044523e-06, + "loss": 0.3788, + "step": 3027 + }, + { + "epoch": 2.2297496318114876, + "grad_norm": 0.35772281885147095, + "learning_rate": 1.8777676603784122e-06, + "loss": 0.4028, + "step": 3028 + }, + { + "epoch": 2.2304860088365244, + "grad_norm": 0.31866273283958435, + "learning_rate": 1.8744221129254514e-06, + "loss": 0.3906, + "step": 3029 + }, + { + "epoch": 2.231222385861561, + "grad_norm": 0.3577582538127899, + "learning_rate": 1.871078860802439e-06, + "loss": 0.3542, + "step": 3030 + }, + { + "epoch": 2.231958762886598, + "grad_norm": 0.34226885437965393, + "learning_rate": 1.8677379064645567e-06, + "loss": 0.3399, + "step": 3031 + }, + { + "epoch": 2.232695139911635, + "grad_norm": 0.35378527641296387, + "learning_rate": 1.8643992523653043e-06, + "loss": 0.3938, + "step": 3032 + }, + { + "epoch": 2.2334315169366716, + "grad_norm": 0.35914602875709534, + "learning_rate": 1.8610629009564863e-06, + "loss": 0.3751, + "step": 3033 + }, + { + "epoch": 2.2341678939617085, + "grad_norm": 0.35784912109375, + "learning_rate": 1.8577288546882167e-06, + "loss": 0.3771, + "step": 3034 + }, + { + "epoch": 2.2349042709867453, + "grad_norm": 0.3862041234970093, + "learning_rate": 1.8543971160089213e-06, + "loss": 0.3834, + "step": 3035 + }, + { + "epoch": 2.235640648011782, + "grad_norm": 0.38658207654953003, + "learning_rate": 1.8510676873653278e-06, + "loss": 0.3619, + "step": 3036 + }, + { + "epoch": 2.236377025036819, + "grad_norm": 0.3452564775943756, + "learning_rate": 1.8477405712024671e-06, + "loss": 0.3496, + "step": 3037 + }, + { + "epoch": 2.2371134020618557, + "grad_norm": 0.37417715787887573, + "learning_rate": 1.8444157699636728e-06, + "loss": 0.3811, + "step": 3038 + }, + { + "epoch": 2.2378497790868925, + "grad_norm": 0.3347567319869995, + "learning_rate": 1.8410932860905767e-06, + "loss": 0.3752, + "step": 3039 + }, + { + "epoch": 2.2385861561119293, + "grad_norm": 0.3563539981842041, + "learning_rate": 1.8377731220231144e-06, + "loss": 0.3723, + "step": 3040 + }, + { + "epoch": 2.239322533136966, + "grad_norm": 0.33468982577323914, + "learning_rate": 1.834455280199512e-06, + "loss": 0.3565, + "step": 3041 + }, + { + "epoch": 2.240058910162003, + "grad_norm": 0.37427690625190735, + "learning_rate": 1.8311397630562905e-06, + "loss": 0.3703, + "step": 3042 + }, + { + "epoch": 2.24079528718704, + "grad_norm": 0.3921281099319458, + "learning_rate": 1.8278265730282696e-06, + "loss": 0.3775, + "step": 3043 + }, + { + "epoch": 2.2415316642120766, + "grad_norm": 0.3752121925354004, + "learning_rate": 1.824515712548553e-06, + "loss": 0.3651, + "step": 3044 + }, + { + "epoch": 2.2422680412371134, + "grad_norm": 0.34277763962745667, + "learning_rate": 1.821207184048538e-06, + "loss": 0.3717, + "step": 3045 + }, + { + "epoch": 2.2430044182621502, + "grad_norm": 0.36394500732421875, + "learning_rate": 1.8179009899579069e-06, + "loss": 0.3698, + "step": 3046 + }, + { + "epoch": 2.243740795287187, + "grad_norm": 0.3427956700325012, + "learning_rate": 1.8145971327046274e-06, + "loss": 0.377, + "step": 3047 + }, + { + "epoch": 2.244477172312224, + "grad_norm": 0.3212227523326874, + "learning_rate": 1.8112956147149558e-06, + "loss": 0.3747, + "step": 3048 + }, + { + "epoch": 2.2452135493372607, + "grad_norm": 0.3556148409843445, + "learning_rate": 1.8079964384134252e-06, + "loss": 0.3795, + "step": 3049 + }, + { + "epoch": 2.2459499263622975, + "grad_norm": 0.3533985912799835, + "learning_rate": 1.80469960622285e-06, + "loss": 0.3946, + "step": 3050 + }, + { + "epoch": 2.2466863033873343, + "grad_norm": 0.32733723521232605, + "learning_rate": 1.8014051205643268e-06, + "loss": 0.3854, + "step": 3051 + }, + { + "epoch": 2.247422680412371, + "grad_norm": 0.35768672823905945, + "learning_rate": 1.7981129838572248e-06, + "loss": 0.3638, + "step": 3052 + }, + { + "epoch": 2.248159057437408, + "grad_norm": 0.36508840322494507, + "learning_rate": 1.79482319851919e-06, + "loss": 0.3859, + "step": 3053 + }, + { + "epoch": 2.2488954344624448, + "grad_norm": 0.33305230736732483, + "learning_rate": 1.7915357669661409e-06, + "loss": 0.3635, + "step": 3054 + }, + { + "epoch": 2.2496318114874816, + "grad_norm": 0.324288934469223, + "learning_rate": 1.7882506916122683e-06, + "loss": 0.3887, + "step": 3055 + }, + { + "epoch": 2.2503681885125184, + "grad_norm": 0.3586910665035248, + "learning_rate": 1.7849679748700305e-06, + "loss": 0.3818, + "step": 3056 + }, + { + "epoch": 2.2511045655375552, + "grad_norm": 0.3524543344974518, + "learning_rate": 1.7816876191501587e-06, + "loss": 0.4012, + "step": 3057 + }, + { + "epoch": 2.251840942562592, + "grad_norm": 0.3519776463508606, + "learning_rate": 1.7784096268616453e-06, + "loss": 0.3727, + "step": 3058 + }, + { + "epoch": 2.252577319587629, + "grad_norm": 0.36303767561912537, + "learning_rate": 1.7751340004117468e-06, + "loss": 0.349, + "step": 3059 + }, + { + "epoch": 2.2533136966126657, + "grad_norm": 0.3571464717388153, + "learning_rate": 1.771860742205988e-06, + "loss": 0.3939, + "step": 3060 + }, + { + "epoch": 2.2540500736377025, + "grad_norm": 0.33723917603492737, + "learning_rate": 1.7685898546481495e-06, + "loss": 0.3779, + "step": 3061 + }, + { + "epoch": 2.2547864506627393, + "grad_norm": 0.3464319109916687, + "learning_rate": 1.7653213401402718e-06, + "loss": 0.36, + "step": 3062 + }, + { + "epoch": 2.255522827687776, + "grad_norm": 0.3434007167816162, + "learning_rate": 1.7620552010826535e-06, + "loss": 0.3766, + "step": 3063 + }, + { + "epoch": 2.256259204712813, + "grad_norm": 0.3645339608192444, + "learning_rate": 1.7587914398738466e-06, + "loss": 0.3689, + "step": 3064 + }, + { + "epoch": 2.2569955817378498, + "grad_norm": 0.33665645122528076, + "learning_rate": 1.7555300589106616e-06, + "loss": 0.397, + "step": 3065 + }, + { + "epoch": 2.2577319587628866, + "grad_norm": 0.312653124332428, + "learning_rate": 1.752271060588157e-06, + "loss": 0.3567, + "step": 3066 + }, + { + "epoch": 2.2584683357879234, + "grad_norm": 0.34965354204177856, + "learning_rate": 1.7490144472996412e-06, + "loss": 0.3739, + "step": 3067 + }, + { + "epoch": 2.25920471281296, + "grad_norm": 0.30937907099723816, + "learning_rate": 1.7457602214366754e-06, + "loss": 0.3815, + "step": 3068 + }, + { + "epoch": 2.259941089837997, + "grad_norm": 0.3516156077384949, + "learning_rate": 1.7425083853890628e-06, + "loss": 0.3779, + "step": 3069 + }, + { + "epoch": 2.260677466863034, + "grad_norm": 0.36380839347839355, + "learning_rate": 1.7392589415448546e-06, + "loss": 0.3853, + "step": 3070 + }, + { + "epoch": 2.2614138438880707, + "grad_norm": 0.3635159134864807, + "learning_rate": 1.736011892290343e-06, + "loss": 0.3666, + "step": 3071 + }, + { + "epoch": 2.2621502209131075, + "grad_norm": 0.3385698199272156, + "learning_rate": 1.732767240010062e-06, + "loss": 0.3674, + "step": 3072 + }, + { + "epoch": 2.2628865979381443, + "grad_norm": 0.3308032155036926, + "learning_rate": 1.7295249870867898e-06, + "loss": 0.3906, + "step": 3073 + }, + { + "epoch": 2.263622974963181, + "grad_norm": 0.3098700940608978, + "learning_rate": 1.726285135901536e-06, + "loss": 0.3532, + "step": 3074 + }, + { + "epoch": 2.264359351988218, + "grad_norm": 0.33167368173599243, + "learning_rate": 1.7230476888335484e-06, + "loss": 0.3623, + "step": 3075 + }, + { + "epoch": 2.2650957290132547, + "grad_norm": 0.3469424247741699, + "learning_rate": 1.7198126482603144e-06, + "loss": 0.3977, + "step": 3076 + }, + { + "epoch": 2.2658321060382915, + "grad_norm": 0.3413713574409485, + "learning_rate": 1.7165800165575475e-06, + "loss": 0.3831, + "step": 3077 + }, + { + "epoch": 2.2665684830633284, + "grad_norm": 0.3475739061832428, + "learning_rate": 1.7133497960991945e-06, + "loss": 0.3655, + "step": 3078 + }, + { + "epoch": 2.267304860088365, + "grad_norm": 0.3709220886230469, + "learning_rate": 1.7101219892574321e-06, + "loss": 0.3622, + "step": 3079 + }, + { + "epoch": 2.268041237113402, + "grad_norm": 0.3361673951148987, + "learning_rate": 1.706896598402663e-06, + "loss": 0.3978, + "step": 3080 + }, + { + "epoch": 2.268777614138439, + "grad_norm": 0.3467946946620941, + "learning_rate": 1.7036736259035197e-06, + "loss": 0.3752, + "step": 3081 + }, + { + "epoch": 2.2695139911634756, + "grad_norm": 0.35126227140426636, + "learning_rate": 1.7004530741268532e-06, + "loss": 0.394, + "step": 3082 + }, + { + "epoch": 2.2702503681885124, + "grad_norm": 0.36080968379974365, + "learning_rate": 1.697234945437739e-06, + "loss": 0.3753, + "step": 3083 + }, + { + "epoch": 2.2709867452135493, + "grad_norm": 0.35190704464912415, + "learning_rate": 1.6940192421994766e-06, + "loss": 0.3586, + "step": 3084 + }, + { + "epoch": 2.271723122238586, + "grad_norm": 0.31936824321746826, + "learning_rate": 1.6908059667735793e-06, + "loss": 0.3687, + "step": 3085 + }, + { + "epoch": 2.272459499263623, + "grad_norm": 0.33945798873901367, + "learning_rate": 1.6875951215197779e-06, + "loss": 0.3583, + "step": 3086 + }, + { + "epoch": 2.2731958762886597, + "grad_norm": 0.3654190003871918, + "learning_rate": 1.6843867087960252e-06, + "loss": 0.3991, + "step": 3087 + }, + { + "epoch": 2.2739322533136965, + "grad_norm": 0.33993858098983765, + "learning_rate": 1.6811807309584776e-06, + "loss": 0.3679, + "step": 3088 + }, + { + "epoch": 2.2746686303387333, + "grad_norm": 0.35775047540664673, + "learning_rate": 1.6779771903615083e-06, + "loss": 0.3821, + "step": 3089 + }, + { + "epoch": 2.27540500736377, + "grad_norm": 0.33738553524017334, + "learning_rate": 1.6747760893577037e-06, + "loss": 0.3891, + "step": 3090 + }, + { + "epoch": 2.276141384388807, + "grad_norm": 0.39476099610328674, + "learning_rate": 1.6715774302978544e-06, + "loss": 0.3794, + "step": 3091 + }, + { + "epoch": 2.276877761413844, + "grad_norm": 0.3633752167224884, + "learning_rate": 1.6683812155309577e-06, + "loss": 0.3904, + "step": 3092 + }, + { + "epoch": 2.2776141384388806, + "grad_norm": 0.31952276825904846, + "learning_rate": 1.665187447404219e-06, + "loss": 0.3837, + "step": 3093 + }, + { + "epoch": 2.2783505154639174, + "grad_norm": 0.33152905106544495, + "learning_rate": 1.6619961282630453e-06, + "loss": 0.3555, + "step": 3094 + }, + { + "epoch": 2.2790868924889542, + "grad_norm": 0.3404163420200348, + "learning_rate": 1.6588072604510435e-06, + "loss": 0.3725, + "step": 3095 + }, + { + "epoch": 2.279823269513991, + "grad_norm": 0.36314231157302856, + "learning_rate": 1.6556208463100226e-06, + "loss": 0.3619, + "step": 3096 + }, + { + "epoch": 2.280559646539028, + "grad_norm": 0.38192078471183777, + "learning_rate": 1.6524368881799863e-06, + "loss": 0.3697, + "step": 3097 + }, + { + "epoch": 2.2812960235640647, + "grad_norm": 0.3242843747138977, + "learning_rate": 1.6492553883991418e-06, + "loss": 0.3806, + "step": 3098 + }, + { + "epoch": 2.2820324005891015, + "grad_norm": 0.33095407485961914, + "learning_rate": 1.646076349303884e-06, + "loss": 0.382, + "step": 3099 + }, + { + "epoch": 2.2827687776141383, + "grad_norm": 0.371073454618454, + "learning_rate": 1.642899773228801e-06, + "loss": 0.4286, + "step": 3100 + }, + { + "epoch": 2.283505154639175, + "grad_norm": 0.35626521706581116, + "learning_rate": 1.6397256625066787e-06, + "loss": 0.3677, + "step": 3101 + }, + { + "epoch": 2.284241531664212, + "grad_norm": 0.3136880695819855, + "learning_rate": 1.6365540194684853e-06, + "loss": 0.3528, + "step": 3102 + }, + { + "epoch": 2.2849779086892488, + "grad_norm": 0.35781604051589966, + "learning_rate": 1.633384846443381e-06, + "loss": 0.3813, + "step": 3103 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.3532848358154297, + "learning_rate": 1.6302181457587092e-06, + "loss": 0.368, + "step": 3104 + }, + { + "epoch": 2.2864506627393224, + "grad_norm": 0.3363364040851593, + "learning_rate": 1.6270539197399988e-06, + "loss": 0.3679, + "step": 3105 + }, + { + "epoch": 2.287187039764359, + "grad_norm": 0.347859263420105, + "learning_rate": 1.6238921707109639e-06, + "loss": 0.371, + "step": 3106 + }, + { + "epoch": 2.287923416789396, + "grad_norm": 0.3195333480834961, + "learning_rate": 1.620732900993497e-06, + "loss": 0.3878, + "step": 3107 + }, + { + "epoch": 2.288659793814433, + "grad_norm": 0.35446897149086, + "learning_rate": 1.6175761129076673e-06, + "loss": 0.4139, + "step": 3108 + }, + { + "epoch": 2.2893961708394697, + "grad_norm": 0.35336872935295105, + "learning_rate": 1.614421808771729e-06, + "loss": 0.3528, + "step": 3109 + }, + { + "epoch": 2.2901325478645065, + "grad_norm": 0.3762224018573761, + "learning_rate": 1.6112699909021057e-06, + "loss": 0.3475, + "step": 3110 + }, + { + "epoch": 2.2908689248895433, + "grad_norm": 0.39090195298194885, + "learning_rate": 1.608120661613396e-06, + "loss": 0.3518, + "step": 3111 + }, + { + "epoch": 2.29160530191458, + "grad_norm": 0.33557429909706116, + "learning_rate": 1.604973823218376e-06, + "loss": 0.3848, + "step": 3112 + }, + { + "epoch": 2.292341678939617, + "grad_norm": 0.3590867519378662, + "learning_rate": 1.6018294780279848e-06, + "loss": 0.3553, + "step": 3113 + }, + { + "epoch": 2.2930780559646537, + "grad_norm": 0.3268950581550598, + "learning_rate": 1.598687628351334e-06, + "loss": 0.3818, + "step": 3114 + }, + { + "epoch": 2.2938144329896906, + "grad_norm": 0.3529122769832611, + "learning_rate": 1.5955482764957063e-06, + "loss": 0.3755, + "step": 3115 + }, + { + "epoch": 2.2945508100147274, + "grad_norm": 0.32063764333724976, + "learning_rate": 1.5924114247665457e-06, + "loss": 0.3589, + "step": 3116 + }, + { + "epoch": 2.295287187039764, + "grad_norm": 0.3151834309101105, + "learning_rate": 1.5892770754674596e-06, + "loss": 0.3749, + "step": 3117 + }, + { + "epoch": 2.296023564064801, + "grad_norm": 0.3227955996990204, + "learning_rate": 1.5861452309002219e-06, + "loss": 0.3449, + "step": 3118 + }, + { + "epoch": 2.296759941089838, + "grad_norm": 0.3374328315258026, + "learning_rate": 1.5830158933647638e-06, + "loss": 0.3702, + "step": 3119 + }, + { + "epoch": 2.2974963181148746, + "grad_norm": 0.34967485070228577, + "learning_rate": 1.5798890651591759e-06, + "loss": 0.3723, + "step": 3120 + }, + { + "epoch": 2.2982326951399115, + "grad_norm": 0.32275229692459106, + "learning_rate": 1.576764748579706e-06, + "loss": 0.3772, + "step": 3121 + }, + { + "epoch": 2.2989690721649483, + "grad_norm": 0.3539294898509979, + "learning_rate": 1.5736429459207569e-06, + "loss": 0.3473, + "step": 3122 + }, + { + "epoch": 2.299705449189985, + "grad_norm": 0.35446813702583313, + "learning_rate": 1.570523659474889e-06, + "loss": 0.3826, + "step": 3123 + }, + { + "epoch": 2.300441826215022, + "grad_norm": 0.3261672258377075, + "learning_rate": 1.5674068915328105e-06, + "loss": 0.3832, + "step": 3124 + }, + { + "epoch": 2.3011782032400587, + "grad_norm": 0.32315734028816223, + "learning_rate": 1.56429264438338e-06, + "loss": 0.3884, + "step": 3125 + }, + { + "epoch": 2.3019145802650955, + "grad_norm": 0.32102611660957336, + "learning_rate": 1.561180920313609e-06, + "loss": 0.401, + "step": 3126 + }, + { + "epoch": 2.3026509572901324, + "grad_norm": 0.3752593398094177, + "learning_rate": 1.5580717216086533e-06, + "loss": 0.3684, + "step": 3127 + }, + { + "epoch": 2.303387334315169, + "grad_norm": 0.331100732088089, + "learning_rate": 1.5549650505518115e-06, + "loss": 0.3856, + "step": 3128 + }, + { + "epoch": 2.304123711340206, + "grad_norm": 0.3341582417488098, + "learning_rate": 1.5518609094245351e-06, + "loss": 0.3876, + "step": 3129 + }, + { + "epoch": 2.304860088365243, + "grad_norm": 0.3470800817012787, + "learning_rate": 1.5487593005064038e-06, + "loss": 0.3762, + "step": 3130 + }, + { + "epoch": 2.3055964653902796, + "grad_norm": 0.3255835175514221, + "learning_rate": 1.5456602260751513e-06, + "loss": 0.3874, + "step": 3131 + }, + { + "epoch": 2.3063328424153164, + "grad_norm": 0.317374587059021, + "learning_rate": 1.5425636884066426e-06, + "loss": 0.4035, + "step": 3132 + }, + { + "epoch": 2.3070692194403533, + "grad_norm": 0.3365512788295746, + "learning_rate": 1.539469689774879e-06, + "loss": 0.3671, + "step": 3133 + }, + { + "epoch": 2.30780559646539, + "grad_norm": 0.34714415669441223, + "learning_rate": 1.5363782324520033e-06, + "loss": 0.3717, + "step": 3134 + }, + { + "epoch": 2.308541973490427, + "grad_norm": 0.3375949263572693, + "learning_rate": 1.5332893187082864e-06, + "loss": 0.3973, + "step": 3135 + }, + { + "epoch": 2.3092783505154637, + "grad_norm": 0.305399626493454, + "learning_rate": 1.5302029508121325e-06, + "loss": 0.38, + "step": 3136 + }, + { + "epoch": 2.3100147275405005, + "grad_norm": 0.3382294774055481, + "learning_rate": 1.5271191310300803e-06, + "loss": 0.3695, + "step": 3137 + }, + { + "epoch": 2.3107511045655373, + "grad_norm": 0.35040998458862305, + "learning_rate": 1.5240378616267887e-06, + "loss": 0.3784, + "step": 3138 + }, + { + "epoch": 2.311487481590574, + "grad_norm": 0.3383118510246277, + "learning_rate": 1.5209591448650535e-06, + "loss": 0.3911, + "step": 3139 + }, + { + "epoch": 2.312223858615611, + "grad_norm": 0.32253560423851013, + "learning_rate": 1.5178829830057883e-06, + "loss": 0.3547, + "step": 3140 + }, + { + "epoch": 2.312960235640648, + "grad_norm": 0.37971949577331543, + "learning_rate": 1.5148093783080337e-06, + "loss": 0.3687, + "step": 3141 + }, + { + "epoch": 2.3136966126656846, + "grad_norm": 0.34527388215065, + "learning_rate": 1.5117383330289542e-06, + "loss": 0.4067, + "step": 3142 + }, + { + "epoch": 2.3144329896907214, + "grad_norm": 0.3223811686038971, + "learning_rate": 1.5086698494238316e-06, + "loss": 0.3777, + "step": 3143 + }, + { + "epoch": 2.3151693667157582, + "grad_norm": 0.3294554054737091, + "learning_rate": 1.5056039297460656e-06, + "loss": 0.3806, + "step": 3144 + }, + { + "epoch": 2.315905743740795, + "grad_norm": 0.4138847291469574, + "learning_rate": 1.5025405762471795e-06, + "loss": 0.366, + "step": 3145 + }, + { + "epoch": 2.316642120765832, + "grad_norm": 0.3485645353794098, + "learning_rate": 1.4994797911768034e-06, + "loss": 0.4099, + "step": 3146 + }, + { + "epoch": 2.3173784977908687, + "grad_norm": 0.3449089527130127, + "learning_rate": 1.4964215767826846e-06, + "loss": 0.3593, + "step": 3147 + }, + { + "epoch": 2.3181148748159055, + "grad_norm": 0.3338415324687958, + "learning_rate": 1.4933659353106872e-06, + "loss": 0.363, + "step": 3148 + }, + { + "epoch": 2.3188512518409423, + "grad_norm": 0.33928152918815613, + "learning_rate": 1.4903128690047802e-06, + "loss": 0.3571, + "step": 3149 + }, + { + "epoch": 2.319587628865979, + "grad_norm": 0.3429686725139618, + "learning_rate": 1.4872623801070413e-06, + "loss": 0.3747, + "step": 3150 + }, + { + "epoch": 2.3203240058910164, + "grad_norm": 0.32575222849845886, + "learning_rate": 1.4842144708576606e-06, + "loss": 0.3772, + "step": 3151 + }, + { + "epoch": 2.321060382916053, + "grad_norm": 0.34492257237434387, + "learning_rate": 1.4811691434949293e-06, + "loss": 0.3755, + "step": 3152 + }, + { + "epoch": 2.32179675994109, + "grad_norm": 0.3562105596065521, + "learning_rate": 1.4781264002552425e-06, + "loss": 0.3717, + "step": 3153 + }, + { + "epoch": 2.322533136966127, + "grad_norm": 0.3609287440776825, + "learning_rate": 1.4750862433731028e-06, + "loss": 0.357, + "step": 3154 + }, + { + "epoch": 2.3232695139911637, + "grad_norm": 0.33014407753944397, + "learning_rate": 1.4720486750811035e-06, + "loss": 0.3536, + "step": 3155 + }, + { + "epoch": 2.3240058910162005, + "grad_norm": 0.3423437476158142, + "learning_rate": 1.4690136976099479e-06, + "loss": 0.3642, + "step": 3156 + }, + { + "epoch": 2.3247422680412373, + "grad_norm": 0.35252103209495544, + "learning_rate": 1.4659813131884304e-06, + "loss": 0.3835, + "step": 3157 + }, + { + "epoch": 2.325478645066274, + "grad_norm": 0.3483889400959015, + "learning_rate": 1.46295152404344e-06, + "loss": 0.3714, + "step": 3158 + }, + { + "epoch": 2.326215022091311, + "grad_norm": 0.3345074951648712, + "learning_rate": 1.4599243323999668e-06, + "loss": 0.365, + "step": 3159 + }, + { + "epoch": 2.3269513991163477, + "grad_norm": 0.334255188703537, + "learning_rate": 1.4568997404810858e-06, + "loss": 0.3851, + "step": 3160 + }, + { + "epoch": 2.3276877761413846, + "grad_norm": 0.33251067996025085, + "learning_rate": 1.4538777505079654e-06, + "loss": 0.3928, + "step": 3161 + }, + { + "epoch": 2.3284241531664214, + "grad_norm": 0.3211064040660858, + "learning_rate": 1.4508583646998674e-06, + "loss": 0.4159, + "step": 3162 + }, + { + "epoch": 2.329160530191458, + "grad_norm": 0.33041635155677795, + "learning_rate": 1.4478415852741328e-06, + "loss": 0.349, + "step": 3163 + }, + { + "epoch": 2.329896907216495, + "grad_norm": 0.31442853808403015, + "learning_rate": 1.4448274144461965e-06, + "loss": 0.3824, + "step": 3164 + }, + { + "epoch": 2.330633284241532, + "grad_norm": 0.3113677501678467, + "learning_rate": 1.4418158544295734e-06, + "loss": 0.3695, + "step": 3165 + }, + { + "epoch": 2.3313696612665686, + "grad_norm": 0.33691975474357605, + "learning_rate": 1.4388069074358612e-06, + "loss": 0.3742, + "step": 3166 + }, + { + "epoch": 2.3321060382916055, + "grad_norm": 0.33256834745407104, + "learning_rate": 1.4358005756747417e-06, + "loss": 0.341, + "step": 3167 + }, + { + "epoch": 2.3328424153166423, + "grad_norm": 0.33882421255111694, + "learning_rate": 1.4327968613539734e-06, + "loss": 0.3785, + "step": 3168 + }, + { + "epoch": 2.333578792341679, + "grad_norm": 0.33136385679244995, + "learning_rate": 1.429795766679391e-06, + "loss": 0.3777, + "step": 3169 + }, + { + "epoch": 2.334315169366716, + "grad_norm": 0.31647056341171265, + "learning_rate": 1.426797293854912e-06, + "loss": 0.3867, + "step": 3170 + }, + { + "epoch": 2.3350515463917527, + "grad_norm": 0.3487012982368469, + "learning_rate": 1.4238014450825227e-06, + "loss": 0.3682, + "step": 3171 + }, + { + "epoch": 2.3357879234167895, + "grad_norm": 0.3635872006416321, + "learning_rate": 1.4208082225622804e-06, + "loss": 0.385, + "step": 3172 + }, + { + "epoch": 2.3365243004418264, + "grad_norm": 0.35022327303886414, + "learning_rate": 1.4178176284923212e-06, + "loss": 0.4066, + "step": 3173 + }, + { + "epoch": 2.337260677466863, + "grad_norm": 0.35386213660240173, + "learning_rate": 1.4148296650688465e-06, + "loss": 0.3982, + "step": 3174 + }, + { + "epoch": 2.3379970544919, + "grad_norm": 0.3819175660610199, + "learning_rate": 1.4118443344861237e-06, + "loss": 0.3898, + "step": 3175 + }, + { + "epoch": 2.338733431516937, + "grad_norm": 0.36631712317466736, + "learning_rate": 1.408861638936493e-06, + "loss": 0.3976, + "step": 3176 + }, + { + "epoch": 2.3394698085419736, + "grad_norm": 0.3479391634464264, + "learning_rate": 1.4058815806103542e-06, + "loss": 0.3825, + "step": 3177 + }, + { + "epoch": 2.3402061855670104, + "grad_norm": 0.35590627789497375, + "learning_rate": 1.4029041616961703e-06, + "loss": 0.3747, + "step": 3178 + }, + { + "epoch": 2.3409425625920472, + "grad_norm": 0.3458181321620941, + "learning_rate": 1.3999293843804728e-06, + "loss": 0.3736, + "step": 3179 + }, + { + "epoch": 2.341678939617084, + "grad_norm": 0.33873867988586426, + "learning_rate": 1.3969572508478424e-06, + "loss": 0.373, + "step": 3180 + }, + { + "epoch": 2.342415316642121, + "grad_norm": 0.33975568413734436, + "learning_rate": 1.3939877632809279e-06, + "loss": 0.3431, + "step": 3181 + }, + { + "epoch": 2.3431516936671577, + "grad_norm": 0.3956592082977295, + "learning_rate": 1.3910209238604306e-06, + "loss": 0.3857, + "step": 3182 + }, + { + "epoch": 2.3438880706921945, + "grad_norm": 0.35657671093940735, + "learning_rate": 1.3880567347651052e-06, + "loss": 0.3717, + "step": 3183 + }, + { + "epoch": 2.3446244477172313, + "grad_norm": 0.34503045678138733, + "learning_rate": 1.3850951981717665e-06, + "loss": 0.3607, + "step": 3184 + }, + { + "epoch": 2.345360824742268, + "grad_norm": 0.31892770528793335, + "learning_rate": 1.3821363162552753e-06, + "loss": 0.3666, + "step": 3185 + }, + { + "epoch": 2.346097201767305, + "grad_norm": 0.3312123715877533, + "learning_rate": 1.3791800911885444e-06, + "loss": 0.3279, + "step": 3186 + }, + { + "epoch": 2.346833578792342, + "grad_norm": 0.37443795800209045, + "learning_rate": 1.3762265251425394e-06, + "loss": 0.3779, + "step": 3187 + }, + { + "epoch": 2.3475699558173786, + "grad_norm": 0.3352743685245514, + "learning_rate": 1.373275620286265e-06, + "loss": 0.3839, + "step": 3188 + }, + { + "epoch": 2.3483063328424154, + "grad_norm": 0.3558715879917145, + "learning_rate": 1.370327378786781e-06, + "loss": 0.3788, + "step": 3189 + }, + { + "epoch": 2.3490427098674522, + "grad_norm": 0.38027918338775635, + "learning_rate": 1.367381802809185e-06, + "loss": 0.4096, + "step": 3190 + }, + { + "epoch": 2.349779086892489, + "grad_norm": 0.31835508346557617, + "learning_rate": 1.3644388945166175e-06, + "loss": 0.4012, + "step": 3191 + }, + { + "epoch": 2.350515463917526, + "grad_norm": 0.31716689467430115, + "learning_rate": 1.3614986560702648e-06, + "loss": 0.3676, + "step": 3192 + }, + { + "epoch": 2.3512518409425627, + "grad_norm": 0.3417465090751648, + "learning_rate": 1.3585610896293472e-06, + "loss": 0.369, + "step": 3193 + }, + { + "epoch": 2.3519882179675995, + "grad_norm": 0.3349650800228119, + "learning_rate": 1.3556261973511236e-06, + "loss": 0.3702, + "step": 3194 + }, + { + "epoch": 2.3527245949926363, + "grad_norm": 0.38491392135620117, + "learning_rate": 1.3526939813908929e-06, + "loss": 0.3594, + "step": 3195 + }, + { + "epoch": 2.353460972017673, + "grad_norm": 0.3387065529823303, + "learning_rate": 1.349764443901984e-06, + "loss": 0.3557, + "step": 3196 + }, + { + "epoch": 2.35419734904271, + "grad_norm": 0.33061766624450684, + "learning_rate": 1.346837587035762e-06, + "loss": 0.3777, + "step": 3197 + }, + { + "epoch": 2.3549337260677468, + "grad_norm": 0.34983956813812256, + "learning_rate": 1.343913412941621e-06, + "loss": 0.392, + "step": 3198 + }, + { + "epoch": 2.3556701030927836, + "grad_norm": 0.3425159454345703, + "learning_rate": 1.3409919237669843e-06, + "loss": 0.3779, + "step": 3199 + }, + { + "epoch": 2.3564064801178204, + "grad_norm": 0.3214556574821472, + "learning_rate": 1.33807312165731e-06, + "loss": 0.3965, + "step": 3200 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 0.32517537474632263, + "learning_rate": 1.335157008756075e-06, + "loss": 0.3657, + "step": 3201 + }, + { + "epoch": 2.357879234167894, + "grad_norm": 0.3494378626346588, + "learning_rate": 1.3322435872047835e-06, + "loss": 0.3905, + "step": 3202 + }, + { + "epoch": 2.358615611192931, + "grad_norm": 0.39801037311553955, + "learning_rate": 1.329332859142967e-06, + "loss": 0.3801, + "step": 3203 + }, + { + "epoch": 2.3593519882179677, + "grad_norm": 0.34806138277053833, + "learning_rate": 1.326424826708177e-06, + "loss": 0.3711, + "step": 3204 + }, + { + "epoch": 2.3600883652430045, + "grad_norm": 0.37914684414863586, + "learning_rate": 1.3235194920359795e-06, + "loss": 0.3568, + "step": 3205 + }, + { + "epoch": 2.3608247422680413, + "grad_norm": 0.3150370419025421, + "learning_rate": 1.3206168572599692e-06, + "loss": 0.3577, + "step": 3206 + }, + { + "epoch": 2.361561119293078, + "grad_norm": 0.3247841000556946, + "learning_rate": 1.3177169245117522e-06, + "loss": 0.3998, + "step": 3207 + }, + { + "epoch": 2.362297496318115, + "grad_norm": 0.3235202729701996, + "learning_rate": 1.3148196959209491e-06, + "loss": 0.3604, + "step": 3208 + }, + { + "epoch": 2.3630338733431517, + "grad_norm": 0.3443073034286499, + "learning_rate": 1.3119251736152005e-06, + "loss": 0.3561, + "step": 3209 + }, + { + "epoch": 2.3637702503681886, + "grad_norm": 0.3384738266468048, + "learning_rate": 1.309033359720155e-06, + "loss": 0.3632, + "step": 3210 + }, + { + "epoch": 2.3645066273932254, + "grad_norm": 0.3304847478866577, + "learning_rate": 1.3061442563594718e-06, + "loss": 0.3745, + "step": 3211 + }, + { + "epoch": 2.365243004418262, + "grad_norm": 0.3340161144733429, + "learning_rate": 1.3032578656548228e-06, + "loss": 0.3665, + "step": 3212 + }, + { + "epoch": 2.365979381443299, + "grad_norm": 0.3465518355369568, + "learning_rate": 1.3003741897258864e-06, + "loss": 0.3678, + "step": 3213 + }, + { + "epoch": 2.366715758468336, + "grad_norm": 0.31364724040031433, + "learning_rate": 1.297493230690346e-06, + "loss": 0.3602, + "step": 3214 + }, + { + "epoch": 2.3674521354933726, + "grad_norm": 0.3540259599685669, + "learning_rate": 1.2946149906638905e-06, + "loss": 0.3733, + "step": 3215 + }, + { + "epoch": 2.3681885125184094, + "grad_norm": 0.34416845440864563, + "learning_rate": 1.2917394717602123e-06, + "loss": 0.3512, + "step": 3216 + }, + { + "epoch": 2.3689248895434463, + "grad_norm": 0.37289807200431824, + "learning_rate": 1.2888666760910074e-06, + "loss": 0.3755, + "step": 3217 + }, + { + "epoch": 2.369661266568483, + "grad_norm": 0.3461972773075104, + "learning_rate": 1.285996605765969e-06, + "loss": 0.4092, + "step": 3218 + }, + { + "epoch": 2.37039764359352, + "grad_norm": 0.36387526988983154, + "learning_rate": 1.283129262892789e-06, + "loss": 0.3788, + "step": 3219 + }, + { + "epoch": 2.3711340206185567, + "grad_norm": 0.32595691084861755, + "learning_rate": 1.2802646495771592e-06, + "loss": 0.3588, + "step": 3220 + }, + { + "epoch": 2.3718703976435935, + "grad_norm": 0.332273006439209, + "learning_rate": 1.2774027679227647e-06, + "loss": 0.3772, + "step": 3221 + }, + { + "epoch": 2.3726067746686303, + "grad_norm": 0.3550896942615509, + "learning_rate": 1.2745436200312844e-06, + "loss": 0.375, + "step": 3222 + }, + { + "epoch": 2.373343151693667, + "grad_norm": 0.3492611348628998, + "learning_rate": 1.2716872080023901e-06, + "loss": 0.3792, + "step": 3223 + }, + { + "epoch": 2.374079528718704, + "grad_norm": 0.33285483717918396, + "learning_rate": 1.2688335339337433e-06, + "loss": 0.3837, + "step": 3224 + }, + { + "epoch": 2.374815905743741, + "grad_norm": 0.3760431110858917, + "learning_rate": 1.2659825999209985e-06, + "loss": 0.3811, + "step": 3225 + }, + { + "epoch": 2.3755522827687776, + "grad_norm": 0.33756300806999207, + "learning_rate": 1.263134408057794e-06, + "loss": 0.3708, + "step": 3226 + }, + { + "epoch": 2.3762886597938144, + "grad_norm": 0.36119866371154785, + "learning_rate": 1.2602889604357548e-06, + "loss": 0.3681, + "step": 3227 + }, + { + "epoch": 2.3770250368188512, + "grad_norm": 0.33655065298080444, + "learning_rate": 1.257446259144494e-06, + "loss": 0.385, + "step": 3228 + }, + { + "epoch": 2.377761413843888, + "grad_norm": 0.3267086446285248, + "learning_rate": 1.2546063062716069e-06, + "loss": 0.3656, + "step": 3229 + }, + { + "epoch": 2.378497790868925, + "grad_norm": 0.3475969731807709, + "learning_rate": 1.2517691039026625e-06, + "loss": 0.3735, + "step": 3230 + }, + { + "epoch": 2.3792341678939617, + "grad_norm": 0.3716298043727875, + "learning_rate": 1.2489346541212226e-06, + "loss": 0.4026, + "step": 3231 + }, + { + "epoch": 2.3799705449189985, + "grad_norm": 0.39196276664733887, + "learning_rate": 1.2461029590088198e-06, + "loss": 0.3645, + "step": 3232 + }, + { + "epoch": 2.3807069219440353, + "grad_norm": 0.34049636125564575, + "learning_rate": 1.2432740206449629e-06, + "loss": 0.3597, + "step": 3233 + }, + { + "epoch": 2.381443298969072, + "grad_norm": 0.3242121636867523, + "learning_rate": 1.240447841107143e-06, + "loss": 0.397, + "step": 3234 + }, + { + "epoch": 2.382179675994109, + "grad_norm": 0.3249550759792328, + "learning_rate": 1.2376244224708183e-06, + "loss": 0.3732, + "step": 3235 + }, + { + "epoch": 2.3829160530191458, + "grad_norm": 0.3562273681163788, + "learning_rate": 1.2348037668094214e-06, + "loss": 0.3755, + "step": 3236 + }, + { + "epoch": 2.3836524300441826, + "grad_norm": 0.3847813904285431, + "learning_rate": 1.2319858761943598e-06, + "loss": 0.3614, + "step": 3237 + }, + { + "epoch": 2.3843888070692194, + "grad_norm": 0.32016924023628235, + "learning_rate": 1.2291707526950047e-06, + "loss": 0.3499, + "step": 3238 + }, + { + "epoch": 2.3851251840942562, + "grad_norm": 0.38904672861099243, + "learning_rate": 1.2263583983786986e-06, + "loss": 0.3994, + "step": 3239 + }, + { + "epoch": 2.385861561119293, + "grad_norm": 0.34415403008461, + "learning_rate": 1.2235488153107488e-06, + "loss": 0.3723, + "step": 3240 + }, + { + "epoch": 2.38659793814433, + "grad_norm": 0.3804604709148407, + "learning_rate": 1.2207420055544278e-06, + "loss": 0.3707, + "step": 3241 + }, + { + "epoch": 2.3873343151693667, + "grad_norm": 0.34501025080680847, + "learning_rate": 1.2179379711709738e-06, + "loss": 0.4066, + "step": 3242 + }, + { + "epoch": 2.3880706921944035, + "grad_norm": 0.35405653715133667, + "learning_rate": 1.2151367142195842e-06, + "loss": 0.376, + "step": 3243 + }, + { + "epoch": 2.3888070692194403, + "grad_norm": 0.3774116039276123, + "learning_rate": 1.212338236757415e-06, + "loss": 0.3763, + "step": 3244 + }, + { + "epoch": 2.389543446244477, + "grad_norm": 0.35209256410598755, + "learning_rate": 1.2095425408395873e-06, + "loss": 0.3543, + "step": 3245 + }, + { + "epoch": 2.390279823269514, + "grad_norm": 0.36879295110702515, + "learning_rate": 1.2067496285191743e-06, + "loss": 0.3765, + "step": 3246 + }, + { + "epoch": 2.3910162002945508, + "grad_norm": 0.3600074052810669, + "learning_rate": 1.2039595018472055e-06, + "loss": 0.377, + "step": 3247 + }, + { + "epoch": 2.3917525773195876, + "grad_norm": 0.3357848525047302, + "learning_rate": 1.2011721628726663e-06, + "loss": 0.3659, + "step": 3248 + }, + { + "epoch": 2.3924889543446244, + "grad_norm": 0.3500244617462158, + "learning_rate": 1.1983876136424926e-06, + "loss": 0.3847, + "step": 3249 + }, + { + "epoch": 2.393225331369661, + "grad_norm": 0.3420659303665161, + "learning_rate": 1.1956058562015766e-06, + "loss": 0.392, + "step": 3250 + }, + { + "epoch": 2.393961708394698, + "grad_norm": 0.33460474014282227, + "learning_rate": 1.192826892592755e-06, + "loss": 0.3708, + "step": 3251 + }, + { + "epoch": 2.394698085419735, + "grad_norm": 0.34583914279937744, + "learning_rate": 1.1900507248568128e-06, + "loss": 0.349, + "step": 3252 + }, + { + "epoch": 2.3954344624447717, + "grad_norm": 0.36606159806251526, + "learning_rate": 1.1872773550324873e-06, + "loss": 0.3739, + "step": 3253 + }, + { + "epoch": 2.3961708394698085, + "grad_norm": 0.34434834122657776, + "learning_rate": 1.1845067851564557e-06, + "loss": 0.3459, + "step": 3254 + }, + { + "epoch": 2.3969072164948453, + "grad_norm": 0.3151196539402008, + "learning_rate": 1.1817390172633402e-06, + "loss": 0.3635, + "step": 3255 + }, + { + "epoch": 2.397643593519882, + "grad_norm": 0.32296085357666016, + "learning_rate": 1.1789740533857075e-06, + "loss": 0.3863, + "step": 3256 + }, + { + "epoch": 2.398379970544919, + "grad_norm": 0.3849099576473236, + "learning_rate": 1.1762118955540609e-06, + "loss": 0.3943, + "step": 3257 + }, + { + "epoch": 2.3991163475699557, + "grad_norm": 0.3584287762641907, + "learning_rate": 1.1734525457968488e-06, + "loss": 0.3823, + "step": 3258 + }, + { + "epoch": 2.3998527245949925, + "grad_norm": 0.33477911353111267, + "learning_rate": 1.1706960061404527e-06, + "loss": 0.37, + "step": 3259 + }, + { + "epoch": 2.4005891016200294, + "grad_norm": 0.3298894762992859, + "learning_rate": 1.1679422786091909e-06, + "loss": 0.3887, + "step": 3260 + }, + { + "epoch": 2.401325478645066, + "grad_norm": 0.3459241986274719, + "learning_rate": 1.1651913652253199e-06, + "loss": 0.3546, + "step": 3261 + }, + { + "epoch": 2.402061855670103, + "grad_norm": 0.3287370502948761, + "learning_rate": 1.162443268009027e-06, + "loss": 0.3604, + "step": 3262 + }, + { + "epoch": 2.40279823269514, + "grad_norm": 0.33065786957740784, + "learning_rate": 1.1596979889784304e-06, + "loss": 0.384, + "step": 3263 + }, + { + "epoch": 2.4035346097201766, + "grad_norm": 0.3277767598628998, + "learning_rate": 1.1569555301495817e-06, + "loss": 0.3492, + "step": 3264 + }, + { + "epoch": 2.4042709867452134, + "grad_norm": 0.3225124478340149, + "learning_rate": 1.1542158935364584e-06, + "loss": 0.3909, + "step": 3265 + }, + { + "epoch": 2.4050073637702503, + "grad_norm": 0.30637234449386597, + "learning_rate": 1.1514790811509658e-06, + "loss": 0.4044, + "step": 3266 + }, + { + "epoch": 2.405743740795287, + "grad_norm": 0.34995532035827637, + "learning_rate": 1.148745095002939e-06, + "loss": 0.3824, + "step": 3267 + }, + { + "epoch": 2.406480117820324, + "grad_norm": 0.33643367886543274, + "learning_rate": 1.1460139371001339e-06, + "loss": 0.3694, + "step": 3268 + }, + { + "epoch": 2.4072164948453607, + "grad_norm": 0.3490047752857208, + "learning_rate": 1.1432856094482282e-06, + "loss": 0.384, + "step": 3269 + }, + { + "epoch": 2.4079528718703975, + "grad_norm": 0.3088352382183075, + "learning_rate": 1.1405601140508265e-06, + "loss": 0.3662, + "step": 3270 + }, + { + "epoch": 2.4086892488954343, + "grad_norm": 0.3435070514678955, + "learning_rate": 1.1378374529094494e-06, + "loss": 0.3965, + "step": 3271 + }, + { + "epoch": 2.409425625920471, + "grad_norm": 0.33832401037216187, + "learning_rate": 1.135117628023536e-06, + "loss": 0.3821, + "step": 3272 + }, + { + "epoch": 2.410162002945508, + "grad_norm": 0.335674524307251, + "learning_rate": 1.1324006413904437e-06, + "loss": 0.4041, + "step": 3273 + }, + { + "epoch": 2.410898379970545, + "grad_norm": 0.37129271030426025, + "learning_rate": 1.1296864950054443e-06, + "loss": 0.364, + "step": 3274 + }, + { + "epoch": 2.4116347569955816, + "grad_norm": 0.35193946957588196, + "learning_rate": 1.1269751908617277e-06, + "loss": 0.404, + "step": 3275 + }, + { + "epoch": 2.4123711340206184, + "grad_norm": 0.34441494941711426, + "learning_rate": 1.124266730950392e-06, + "loss": 0.3571, + "step": 3276 + }, + { + "epoch": 2.4131075110456552, + "grad_norm": 0.3434298634529114, + "learning_rate": 1.1215611172604468e-06, + "loss": 0.3831, + "step": 3277 + }, + { + "epoch": 2.413843888070692, + "grad_norm": 0.3040885925292969, + "learning_rate": 1.1188583517788165e-06, + "loss": 0.3555, + "step": 3278 + }, + { + "epoch": 2.414580265095729, + "grad_norm": 0.3662799596786499, + "learning_rate": 1.1161584364903287e-06, + "loss": 0.3932, + "step": 3279 + }, + { + "epoch": 2.4153166421207657, + "grad_norm": 0.3392142653465271, + "learning_rate": 1.1134613733777195e-06, + "loss": 0.3553, + "step": 3280 + }, + { + "epoch": 2.4160530191458025, + "grad_norm": 0.35965248942375183, + "learning_rate": 1.1107671644216305e-06, + "loss": 0.3529, + "step": 3281 + }, + { + "epoch": 2.4167893961708393, + "grad_norm": 0.37585723400115967, + "learning_rate": 1.1080758116006057e-06, + "loss": 0.3913, + "step": 3282 + }, + { + "epoch": 2.417525773195876, + "grad_norm": 0.3380414545536041, + "learning_rate": 1.1053873168910966e-06, + "loss": 0.3933, + "step": 3283 + }, + { + "epoch": 2.418262150220913, + "grad_norm": 0.33005619049072266, + "learning_rate": 1.1027016822674509e-06, + "loss": 0.3837, + "step": 3284 + }, + { + "epoch": 2.4189985272459498, + "grad_norm": 0.3498542904853821, + "learning_rate": 1.1000189097019164e-06, + "loss": 0.3881, + "step": 3285 + }, + { + "epoch": 2.4197349042709866, + "grad_norm": 0.37784454226493835, + "learning_rate": 1.0973390011646422e-06, + "loss": 0.3898, + "step": 3286 + }, + { + "epoch": 2.4204712812960234, + "grad_norm": 0.3332594335079193, + "learning_rate": 1.0946619586236711e-06, + "loss": 0.3571, + "step": 3287 + }, + { + "epoch": 2.42120765832106, + "grad_norm": 0.32110390067100525, + "learning_rate": 1.0919877840449428e-06, + "loss": 0.3904, + "step": 3288 + }, + { + "epoch": 2.421944035346097, + "grad_norm": 0.3465670645236969, + "learning_rate": 1.0893164793922894e-06, + "loss": 0.3595, + "step": 3289 + }, + { + "epoch": 2.422680412371134, + "grad_norm": 0.33734622597694397, + "learning_rate": 1.0866480466274377e-06, + "loss": 0.3817, + "step": 3290 + }, + { + "epoch": 2.4234167893961707, + "grad_norm": 0.3218252956867218, + "learning_rate": 1.0839824877100008e-06, + "loss": 0.3472, + "step": 3291 + }, + { + "epoch": 2.4241531664212075, + "grad_norm": 0.3475230634212494, + "learning_rate": 1.0813198045974888e-06, + "loss": 0.3629, + "step": 3292 + }, + { + "epoch": 2.4248895434462443, + "grad_norm": 0.32285812497138977, + "learning_rate": 1.0786599992452933e-06, + "loss": 0.3819, + "step": 3293 + }, + { + "epoch": 2.425625920471281, + "grad_norm": 0.34469369053840637, + "learning_rate": 1.0760030736066952e-06, + "loss": 0.3668, + "step": 3294 + }, + { + "epoch": 2.426362297496318, + "grad_norm": 0.3158765435218811, + "learning_rate": 1.0733490296328613e-06, + "loss": 0.3542, + "step": 3295 + }, + { + "epoch": 2.4270986745213547, + "grad_norm": 0.3300797641277313, + "learning_rate": 1.0706978692728416e-06, + "loss": 0.4024, + "step": 3296 + }, + { + "epoch": 2.4278350515463916, + "grad_norm": 0.31223219633102417, + "learning_rate": 1.0680495944735665e-06, + "loss": 0.3649, + "step": 3297 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.35305702686309814, + "learning_rate": 1.0654042071798498e-06, + "loss": 0.3598, + "step": 3298 + }, + { + "epoch": 2.429307805596465, + "grad_norm": 0.354264497756958, + "learning_rate": 1.0627617093343833e-06, + "loss": 0.3895, + "step": 3299 + }, + { + "epoch": 2.4300441826215025, + "grad_norm": 0.34128403663635254, + "learning_rate": 1.060122102877739e-06, + "loss": 0.3702, + "step": 3300 + }, + { + "epoch": 2.4307805596465393, + "grad_norm": 0.36130642890930176, + "learning_rate": 1.0574853897483634e-06, + "loss": 0.4276, + "step": 3301 + }, + { + "epoch": 2.431516936671576, + "grad_norm": 0.3270364999771118, + "learning_rate": 1.054851571882578e-06, + "loss": 0.3915, + "step": 3302 + }, + { + "epoch": 2.432253313696613, + "grad_norm": 0.33622244000434875, + "learning_rate": 1.052220651214581e-06, + "loss": 0.4009, + "step": 3303 + }, + { + "epoch": 2.4329896907216497, + "grad_norm": 0.3255276381969452, + "learning_rate": 1.0495926296764398e-06, + "loss": 0.3953, + "step": 3304 + }, + { + "epoch": 2.4337260677466865, + "grad_norm": 0.32380256056785583, + "learning_rate": 1.0469675091980946e-06, + "loss": 0.3766, + "step": 3305 + }, + { + "epoch": 2.4344624447717234, + "grad_norm": 0.32424572110176086, + "learning_rate": 1.0443452917073538e-06, + "loss": 0.3914, + "step": 3306 + }, + { + "epoch": 2.43519882179676, + "grad_norm": 0.3544393479824066, + "learning_rate": 1.041725979129894e-06, + "loss": 0.3624, + "step": 3307 + }, + { + "epoch": 2.435935198821797, + "grad_norm": 0.3281401991844177, + "learning_rate": 1.0391095733892614e-06, + "loss": 0.3437, + "step": 3308 + }, + { + "epoch": 2.436671575846834, + "grad_norm": 0.32801955938339233, + "learning_rate": 1.0364960764068643e-06, + "loss": 0.3675, + "step": 3309 + }, + { + "epoch": 2.4374079528718706, + "grad_norm": 0.3724784553050995, + "learning_rate": 1.033885490101974e-06, + "loss": 0.3646, + "step": 3310 + }, + { + "epoch": 2.4381443298969074, + "grad_norm": 0.35734328627586365, + "learning_rate": 1.0312778163917298e-06, + "loss": 0.3746, + "step": 3311 + }, + { + "epoch": 2.4388807069219443, + "grad_norm": 0.33092063665390015, + "learning_rate": 1.0286730571911264e-06, + "loss": 0.3573, + "step": 3312 + }, + { + "epoch": 2.439617083946981, + "grad_norm": 0.32385748624801636, + "learning_rate": 1.0260712144130192e-06, + "loss": 0.3724, + "step": 3313 + }, + { + "epoch": 2.440353460972018, + "grad_norm": 0.3588411808013916, + "learning_rate": 1.0234722899681265e-06, + "loss": 0.3727, + "step": 3314 + }, + { + "epoch": 2.4410898379970547, + "grad_norm": 0.32793402671813965, + "learning_rate": 1.020876285765015e-06, + "loss": 0.3588, + "step": 3315 + }, + { + "epoch": 2.4418262150220915, + "grad_norm": 0.37099236249923706, + "learning_rate": 1.018283203710116e-06, + "loss": 0.3759, + "step": 3316 + }, + { + "epoch": 2.4425625920471283, + "grad_norm": 0.35512179136276245, + "learning_rate": 1.0156930457077085e-06, + "loss": 0.3909, + "step": 3317 + }, + { + "epoch": 2.443298969072165, + "grad_norm": 0.32418885827064514, + "learning_rate": 1.0131058136599254e-06, + "loss": 0.3577, + "step": 3318 + }, + { + "epoch": 2.444035346097202, + "grad_norm": 0.33136194944381714, + "learning_rate": 1.0105215094667542e-06, + "loss": 0.3603, + "step": 3319 + }, + { + "epoch": 2.444771723122239, + "grad_norm": 0.35086795687675476, + "learning_rate": 1.0079401350260288e-06, + "loss": 0.3666, + "step": 3320 + }, + { + "epoch": 2.4455081001472756, + "grad_norm": 0.3403591215610504, + "learning_rate": 1.0053616922334307e-06, + "loss": 0.3901, + "step": 3321 + }, + { + "epoch": 2.4462444771723124, + "grad_norm": 0.3403468132019043, + "learning_rate": 1.0027861829824953e-06, + "loss": 0.3854, + "step": 3322 + }, + { + "epoch": 2.4469808541973492, + "grad_norm": 0.3459208011627197, + "learning_rate": 1.0002136091645936e-06, + "loss": 0.3169, + "step": 3323 + }, + { + "epoch": 2.447717231222386, + "grad_norm": 0.3217117190361023, + "learning_rate": 9.976439726689469e-07, + "loss": 0.3866, + "step": 3324 + }, + { + "epoch": 2.448453608247423, + "grad_norm": 0.37422022223472595, + "learning_rate": 9.95077275382621e-07, + "loss": 0.3865, + "step": 3325 + }, + { + "epoch": 2.4491899852724597, + "grad_norm": 0.37057486176490784, + "learning_rate": 9.925135191905194e-07, + "loss": 0.3944, + "step": 3326 + }, + { + "epoch": 2.4499263622974965, + "grad_norm": 0.33397674560546875, + "learning_rate": 9.89952705975386e-07, + "loss": 0.3958, + "step": 3327 + }, + { + "epoch": 2.4506627393225333, + "grad_norm": 0.3693501353263855, + "learning_rate": 9.873948376178073e-07, + "loss": 0.3158, + "step": 3328 + }, + { + "epoch": 2.45139911634757, + "grad_norm": 0.3479422330856323, + "learning_rate": 9.84839915996203e-07, + "loss": 0.3486, + "step": 3329 + }, + { + "epoch": 2.452135493372607, + "grad_norm": 0.3450046479701996, + "learning_rate": 9.822879429868304e-07, + "loss": 0.3718, + "step": 3330 + }, + { + "epoch": 2.4528718703976438, + "grad_norm": 0.33801180124282837, + "learning_rate": 9.79738920463782e-07, + "loss": 0.3553, + "step": 3331 + }, + { + "epoch": 2.4536082474226806, + "grad_norm": 0.3484099507331848, + "learning_rate": 9.771928502989802e-07, + "loss": 0.3978, + "step": 3332 + }, + { + "epoch": 2.4543446244477174, + "grad_norm": 0.34799516201019287, + "learning_rate": 9.746497343621857e-07, + "loss": 0.393, + "step": 3333 + }, + { + "epoch": 2.455081001472754, + "grad_norm": 0.3594471216201782, + "learning_rate": 9.721095745209847e-07, + "loss": 0.3844, + "step": 3334 + }, + { + "epoch": 2.455817378497791, + "grad_norm": 0.35299617052078247, + "learning_rate": 9.695723726407918e-07, + "loss": 0.3746, + "step": 3335 + }, + { + "epoch": 2.456553755522828, + "grad_norm": 0.309285432100296, + "learning_rate": 9.670381305848547e-07, + "loss": 0.3796, + "step": 3336 + }, + { + "epoch": 2.4572901325478647, + "grad_norm": 0.3416523039340973, + "learning_rate": 9.64506850214243e-07, + "loss": 0.3789, + "step": 3337 + }, + { + "epoch": 2.4580265095729015, + "grad_norm": 0.3610781729221344, + "learning_rate": 9.6197853338785e-07, + "loss": 0.3822, + "step": 3338 + }, + { + "epoch": 2.4587628865979383, + "grad_norm": 0.36001071333885193, + "learning_rate": 9.594531819624003e-07, + "loss": 0.4023, + "step": 3339 + }, + { + "epoch": 2.459499263622975, + "grad_norm": 0.3570241630077362, + "learning_rate": 9.569307977924304e-07, + "loss": 0.3697, + "step": 3340 + }, + { + "epoch": 2.460235640648012, + "grad_norm": 0.34070777893066406, + "learning_rate": 9.544113827303064e-07, + "loss": 0.3789, + "step": 3341 + }, + { + "epoch": 2.4609720176730487, + "grad_norm": 0.31999388337135315, + "learning_rate": 9.518949386262088e-07, + "loss": 0.3746, + "step": 3342 + }, + { + "epoch": 2.4617083946980856, + "grad_norm": 0.3107585608959198, + "learning_rate": 9.493814673281382e-07, + "loss": 0.3745, + "step": 3343 + }, + { + "epoch": 2.4624447717231224, + "grad_norm": 0.33291155099868774, + "learning_rate": 9.468709706819141e-07, + "loss": 0.3693, + "step": 3344 + }, + { + "epoch": 2.463181148748159, + "grad_norm": 0.35321640968322754, + "learning_rate": 9.443634505311671e-07, + "loss": 0.3677, + "step": 3345 + }, + { + "epoch": 2.463917525773196, + "grad_norm": 0.31461623311042786, + "learning_rate": 9.418589087173441e-07, + "loss": 0.4084, + "step": 3346 + }, + { + "epoch": 2.464653902798233, + "grad_norm": 0.311948299407959, + "learning_rate": 9.393573470797079e-07, + "loss": 0.3756, + "step": 3347 + }, + { + "epoch": 2.4653902798232696, + "grad_norm": 0.34136369824409485, + "learning_rate": 9.368587674553265e-07, + "loss": 0.3904, + "step": 3348 + }, + { + "epoch": 2.4661266568483065, + "grad_norm": 0.3355327546596527, + "learning_rate": 9.343631716790813e-07, + "loss": 0.3842, + "step": 3349 + }, + { + "epoch": 2.4668630338733433, + "grad_norm": 0.32075121998786926, + "learning_rate": 9.318705615836648e-07, + "loss": 0.3478, + "step": 3350 + }, + { + "epoch": 2.46759941089838, + "grad_norm": 0.31247884035110474, + "learning_rate": 9.293809389995734e-07, + "loss": 0.3843, + "step": 3351 + }, + { + "epoch": 2.468335787923417, + "grad_norm": 0.33083632588386536, + "learning_rate": 9.268943057551089e-07, + "loss": 0.3705, + "step": 3352 + }, + { + "epoch": 2.4690721649484537, + "grad_norm": 0.34602615237236023, + "learning_rate": 9.244106636763827e-07, + "loss": 0.3816, + "step": 3353 + }, + { + "epoch": 2.4698085419734905, + "grad_norm": 0.3348204493522644, + "learning_rate": 9.219300145873051e-07, + "loss": 0.4182, + "step": 3354 + }, + { + "epoch": 2.4705449189985274, + "grad_norm": 0.30488619208335876, + "learning_rate": 9.19452360309589e-07, + "loss": 0.3337, + "step": 3355 + }, + { + "epoch": 2.471281296023564, + "grad_norm": 0.34817051887512207, + "learning_rate": 9.169777026627514e-07, + "loss": 0.3839, + "step": 3356 + }, + { + "epoch": 2.472017673048601, + "grad_norm": 0.328900545835495, + "learning_rate": 9.145060434641017e-07, + "loss": 0.3649, + "step": 3357 + }, + { + "epoch": 2.472754050073638, + "grad_norm": 0.36652466654777527, + "learning_rate": 9.120373845287561e-07, + "loss": 0.3494, + "step": 3358 + }, + { + "epoch": 2.4734904270986746, + "grad_norm": 0.3472610116004944, + "learning_rate": 9.095717276696214e-07, + "loss": 0.3894, + "step": 3359 + }, + { + "epoch": 2.4742268041237114, + "grad_norm": 0.3122188448905945, + "learning_rate": 9.071090746973999e-07, + "loss": 0.3546, + "step": 3360 + }, + { + "epoch": 2.4749631811487482, + "grad_norm": 0.338001549243927, + "learning_rate": 9.046494274205924e-07, + "loss": 0.363, + "step": 3361 + }, + { + "epoch": 2.475699558173785, + "grad_norm": 0.32908204197883606, + "learning_rate": 9.021927876454883e-07, + "loss": 0.3786, + "step": 3362 + }, + { + "epoch": 2.476435935198822, + "grad_norm": 0.3352196216583252, + "learning_rate": 8.997391571761682e-07, + "loss": 0.3681, + "step": 3363 + }, + { + "epoch": 2.4771723122238587, + "grad_norm": 0.3196353316307068, + "learning_rate": 8.972885378145079e-07, + "loss": 0.3735, + "step": 3364 + }, + { + "epoch": 2.4779086892488955, + "grad_norm": 0.3419160544872284, + "learning_rate": 8.94840931360163e-07, + "loss": 0.3501, + "step": 3365 + }, + { + "epoch": 2.4786450662739323, + "grad_norm": 0.3332716226577759, + "learning_rate": 8.923963396105861e-07, + "loss": 0.3606, + "step": 3366 + }, + { + "epoch": 2.479381443298969, + "grad_norm": 0.3240850269794464, + "learning_rate": 8.899547643610102e-07, + "loss": 0.3774, + "step": 3367 + }, + { + "epoch": 2.480117820324006, + "grad_norm": 0.3325689136981964, + "learning_rate": 8.875162074044524e-07, + "loss": 0.3788, + "step": 3368 + }, + { + "epoch": 2.4808541973490428, + "grad_norm": 0.3567977547645569, + "learning_rate": 8.850806705317183e-07, + "loss": 0.3516, + "step": 3369 + }, + { + "epoch": 2.4815905743740796, + "grad_norm": 0.33295658230781555, + "learning_rate": 8.826481555313909e-07, + "loss": 0.3668, + "step": 3370 + }, + { + "epoch": 2.4823269513991164, + "grad_norm": 0.3270358443260193, + "learning_rate": 8.802186641898352e-07, + "loss": 0.3957, + "step": 3371 + }, + { + "epoch": 2.4830633284241532, + "grad_norm": 0.3151583671569824, + "learning_rate": 8.777921982911996e-07, + "loss": 0.4113, + "step": 3372 + }, + { + "epoch": 2.48379970544919, + "grad_norm": 0.3360562026500702, + "learning_rate": 8.753687596174021e-07, + "loss": 0.4078, + "step": 3373 + }, + { + "epoch": 2.484536082474227, + "grad_norm": 0.3369217813014984, + "learning_rate": 8.729483499481467e-07, + "loss": 0.383, + "step": 3374 + }, + { + "epoch": 2.4852724594992637, + "grad_norm": 0.3392618000507355, + "learning_rate": 8.705309710609078e-07, + "loss": 0.3923, + "step": 3375 + }, + { + "epoch": 2.4860088365243005, + "grad_norm": 0.3362715244293213, + "learning_rate": 8.681166247309348e-07, + "loss": 0.3598, + "step": 3376 + }, + { + "epoch": 2.4867452135493373, + "grad_norm": 0.34049803018569946, + "learning_rate": 8.65705312731252e-07, + "loss": 0.3656, + "step": 3377 + }, + { + "epoch": 2.487481590574374, + "grad_norm": 0.328102171421051, + "learning_rate": 8.632970368326537e-07, + "loss": 0.3924, + "step": 3378 + }, + { + "epoch": 2.488217967599411, + "grad_norm": 0.3466600775718689, + "learning_rate": 8.608917988037036e-07, + "loss": 0.3835, + "step": 3379 + }, + { + "epoch": 2.4889543446244478, + "grad_norm": 0.3212660551071167, + "learning_rate": 8.584896004107379e-07, + "loss": 0.382, + "step": 3380 + }, + { + "epoch": 2.4896907216494846, + "grad_norm": 0.3672233819961548, + "learning_rate": 8.56090443417859e-07, + "loss": 0.3818, + "step": 3381 + }, + { + "epoch": 2.4904270986745214, + "grad_norm": 0.3334003984928131, + "learning_rate": 8.536943295869315e-07, + "loss": 0.3788, + "step": 3382 + }, + { + "epoch": 2.491163475699558, + "grad_norm": 0.35568714141845703, + "learning_rate": 8.513012606775928e-07, + "loss": 0.3687, + "step": 3383 + }, + { + "epoch": 2.491899852724595, + "grad_norm": 0.34635692834854126, + "learning_rate": 8.489112384472386e-07, + "loss": 0.3541, + "step": 3384 + }, + { + "epoch": 2.492636229749632, + "grad_norm": 0.36689338088035583, + "learning_rate": 8.46524264651028e-07, + "loss": 0.3686, + "step": 3385 + }, + { + "epoch": 2.4933726067746687, + "grad_norm": 0.36320188641548157, + "learning_rate": 8.441403410418853e-07, + "loss": 0.3795, + "step": 3386 + }, + { + "epoch": 2.4941089837997055, + "grad_norm": 0.3351554274559021, + "learning_rate": 8.417594693704901e-07, + "loss": 0.365, + "step": 3387 + }, + { + "epoch": 2.4948453608247423, + "grad_norm": 0.3230125606060028, + "learning_rate": 8.393816513852815e-07, + "loss": 0.3867, + "step": 3388 + }, + { + "epoch": 2.495581737849779, + "grad_norm": 0.3341889977455139, + "learning_rate": 8.370068888324612e-07, + "loss": 0.3554, + "step": 3389 + }, + { + "epoch": 2.496318114874816, + "grad_norm": 0.3187844753265381, + "learning_rate": 8.346351834559784e-07, + "loss": 0.388, + "step": 3390 + }, + { + "epoch": 2.4970544918998527, + "grad_norm": 0.3333573639392853, + "learning_rate": 8.322665369975447e-07, + "loss": 0.3775, + "step": 3391 + }, + { + "epoch": 2.4977908689248896, + "grad_norm": 0.32489851117134094, + "learning_rate": 8.299009511966221e-07, + "loss": 0.3604, + "step": 3392 + }, + { + "epoch": 2.4985272459499264, + "grad_norm": 0.3213289976119995, + "learning_rate": 8.275384277904231e-07, + "loss": 0.3635, + "step": 3393 + }, + { + "epoch": 2.499263622974963, + "grad_norm": 0.341377854347229, + "learning_rate": 8.251789685139172e-07, + "loss": 0.3761, + "step": 3394 + }, + { + "epoch": 2.5, + "grad_norm": 0.34892818331718445, + "learning_rate": 8.228225750998176e-07, + "loss": 0.3652, + "step": 3395 + }, + { + "epoch": 2.500736377025037, + "grad_norm": 0.31448036432266235, + "learning_rate": 8.204692492785876e-07, + "loss": 0.3864, + "step": 3396 + }, + { + "epoch": 2.5014727540500736, + "grad_norm": 0.31953367590904236, + "learning_rate": 8.181189927784416e-07, + "loss": 0.364, + "step": 3397 + }, + { + "epoch": 2.5022091310751104, + "grad_norm": 0.33577442169189453, + "learning_rate": 8.157718073253351e-07, + "loss": 0.3808, + "step": 3398 + }, + { + "epoch": 2.5029455081001473, + "grad_norm": 0.3457181453704834, + "learning_rate": 8.134276946429703e-07, + "loss": 0.3665, + "step": 3399 + }, + { + "epoch": 2.503681885125184, + "grad_norm": 0.34147438406944275, + "learning_rate": 8.110866564527925e-07, + "loss": 0.3755, + "step": 3400 + }, + { + "epoch": 2.504418262150221, + "grad_norm": 0.3595358729362488, + "learning_rate": 8.087486944739886e-07, + "loss": 0.4067, + "step": 3401 + }, + { + "epoch": 2.5051546391752577, + "grad_norm": 0.32780689001083374, + "learning_rate": 8.064138104234897e-07, + "loss": 0.3573, + "step": 3402 + }, + { + "epoch": 2.5058910162002945, + "grad_norm": 0.3280833065509796, + "learning_rate": 8.040820060159621e-07, + "loss": 0.3645, + "step": 3403 + }, + { + "epoch": 2.5066273932253313, + "grad_norm": 0.3275584280490875, + "learning_rate": 8.017532829638119e-07, + "loss": 0.3812, + "step": 3404 + }, + { + "epoch": 2.507363770250368, + "grad_norm": 0.363805890083313, + "learning_rate": 7.994276429771857e-07, + "loss": 0.3802, + "step": 3405 + }, + { + "epoch": 2.508100147275405, + "grad_norm": 0.317568302154541, + "learning_rate": 7.971050877639624e-07, + "loss": 0.348, + "step": 3406 + }, + { + "epoch": 2.508836524300442, + "grad_norm": 0.3445456922054291, + "learning_rate": 7.947856190297538e-07, + "loss": 0.3498, + "step": 3407 + }, + { + "epoch": 2.5095729013254786, + "grad_norm": 0.3341994285583496, + "learning_rate": 7.924692384779098e-07, + "loss": 0.3574, + "step": 3408 + }, + { + "epoch": 2.5103092783505154, + "grad_norm": 0.368876188993454, + "learning_rate": 7.901559478095106e-07, + "loss": 0.4066, + "step": 3409 + }, + { + "epoch": 2.5110456553755522, + "grad_norm": 0.333023339509964, + "learning_rate": 7.878457487233643e-07, + "loss": 0.3624, + "step": 3410 + }, + { + "epoch": 2.511782032400589, + "grad_norm": 0.33965203166007996, + "learning_rate": 7.85538642916015e-07, + "loss": 0.379, + "step": 3411 + }, + { + "epoch": 2.512518409425626, + "grad_norm": 0.3300594091415405, + "learning_rate": 7.832346320817297e-07, + "loss": 0.3837, + "step": 3412 + }, + { + "epoch": 2.5132547864506627, + "grad_norm": 0.3793734312057495, + "learning_rate": 7.809337179125031e-07, + "loss": 0.3478, + "step": 3413 + }, + { + "epoch": 2.5139911634756995, + "grad_norm": 0.3749076724052429, + "learning_rate": 7.786359020980605e-07, + "loss": 0.3635, + "step": 3414 + }, + { + "epoch": 2.5147275405007363, + "grad_norm": 0.3277282118797302, + "learning_rate": 7.763411863258441e-07, + "loss": 0.3736, + "step": 3415 + }, + { + "epoch": 2.515463917525773, + "grad_norm": 0.3639467656612396, + "learning_rate": 7.740495722810271e-07, + "loss": 0.3579, + "step": 3416 + }, + { + "epoch": 2.51620029455081, + "grad_norm": 0.36798664927482605, + "learning_rate": 7.717610616464999e-07, + "loss": 0.3939, + "step": 3417 + }, + { + "epoch": 2.5169366715758468, + "grad_norm": 0.3840385675430298, + "learning_rate": 7.694756561028754e-07, + "loss": 0.3643, + "step": 3418 + }, + { + "epoch": 2.5176730486008836, + "grad_norm": 0.3123991787433624, + "learning_rate": 7.671933573284878e-07, + "loss": 0.3923, + "step": 3419 + }, + { + "epoch": 2.5184094256259204, + "grad_norm": 0.37913936376571655, + "learning_rate": 7.649141669993881e-07, + "loss": 0.3921, + "step": 3420 + }, + { + "epoch": 2.5191458026509572, + "grad_norm": 0.30917784571647644, + "learning_rate": 7.626380867893429e-07, + "loss": 0.3538, + "step": 3421 + }, + { + "epoch": 2.519882179675994, + "grad_norm": 0.3407180607318878, + "learning_rate": 7.603651183698396e-07, + "loss": 0.4078, + "step": 3422 + }, + { + "epoch": 2.520618556701031, + "grad_norm": 0.34745657444000244, + "learning_rate": 7.580952634100758e-07, + "loss": 0.387, + "step": 3423 + }, + { + "epoch": 2.5213549337260677, + "grad_norm": 0.33488914370536804, + "learning_rate": 7.558285235769647e-07, + "loss": 0.3858, + "step": 3424 + }, + { + "epoch": 2.5220913107511045, + "grad_norm": 0.3565431237220764, + "learning_rate": 7.535649005351309e-07, + "loss": 0.3824, + "step": 3425 + }, + { + "epoch": 2.5228276877761413, + "grad_norm": 0.3106916546821594, + "learning_rate": 7.513043959469107e-07, + "loss": 0.3919, + "step": 3426 + }, + { + "epoch": 2.523564064801178, + "grad_norm": 0.34295180439949036, + "learning_rate": 7.49047011472352e-07, + "loss": 0.398, + "step": 3427 + }, + { + "epoch": 2.524300441826215, + "grad_norm": 0.316560834646225, + "learning_rate": 7.467927487692089e-07, + "loss": 0.3448, + "step": 3428 + }, + { + "epoch": 2.5250368188512518, + "grad_norm": 0.33750441670417786, + "learning_rate": 7.445416094929426e-07, + "loss": 0.3661, + "step": 3429 + }, + { + "epoch": 2.5257731958762886, + "grad_norm": 0.34076401591300964, + "learning_rate": 7.422935952967236e-07, + "loss": 0.3744, + "step": 3430 + }, + { + "epoch": 2.5265095729013254, + "grad_norm": 0.3119591772556305, + "learning_rate": 7.40048707831425e-07, + "loss": 0.3562, + "step": 3431 + }, + { + "epoch": 2.527245949926362, + "grad_norm": 0.33838844299316406, + "learning_rate": 7.378069487456241e-07, + "loss": 0.3647, + "step": 3432 + }, + { + "epoch": 2.527982326951399, + "grad_norm": 0.3333272337913513, + "learning_rate": 7.355683196856006e-07, + "loss": 0.3804, + "step": 3433 + }, + { + "epoch": 2.528718703976436, + "grad_norm": 0.31554344296455383, + "learning_rate": 7.333328222953356e-07, + "loss": 0.3709, + "step": 3434 + }, + { + "epoch": 2.5294550810014726, + "grad_norm": 0.3351151645183563, + "learning_rate": 7.311004582165132e-07, + "loss": 0.3745, + "step": 3435 + }, + { + "epoch": 2.5301914580265095, + "grad_norm": 0.3532677888870239, + "learning_rate": 7.288712290885119e-07, + "loss": 0.3772, + "step": 3436 + }, + { + "epoch": 2.5309278350515463, + "grad_norm": 0.33610615134239197, + "learning_rate": 7.266451365484106e-07, + "loss": 0.37, + "step": 3437 + }, + { + "epoch": 2.531664212076583, + "grad_norm": 0.33613523840904236, + "learning_rate": 7.244221822309855e-07, + "loss": 0.3705, + "step": 3438 + }, + { + "epoch": 2.53240058910162, + "grad_norm": 0.3339458107948303, + "learning_rate": 7.222023677687062e-07, + "loss": 0.382, + "step": 3439 + }, + { + "epoch": 2.5331369661266567, + "grad_norm": 0.3765323758125305, + "learning_rate": 7.199856947917372e-07, + "loss": 0.3996, + "step": 3440 + }, + { + "epoch": 2.5338733431516935, + "grad_norm": 0.3355490565299988, + "learning_rate": 7.177721649279367e-07, + "loss": 0.3722, + "step": 3441 + }, + { + "epoch": 2.5346097201767304, + "grad_norm": 0.33720365166664124, + "learning_rate": 7.155617798028542e-07, + "loss": 0.3867, + "step": 3442 + }, + { + "epoch": 2.535346097201767, + "grad_norm": 0.3521978259086609, + "learning_rate": 7.133545410397274e-07, + "loss": 0.3626, + "step": 3443 + }, + { + "epoch": 2.536082474226804, + "grad_norm": 0.38076502084732056, + "learning_rate": 7.111504502594896e-07, + "loss": 0.3998, + "step": 3444 + }, + { + "epoch": 2.536818851251841, + "grad_norm": 0.3390524983406067, + "learning_rate": 7.089495090807564e-07, + "loss": 0.4084, + "step": 3445 + }, + { + "epoch": 2.5375552282768776, + "grad_norm": 0.3228042423725128, + "learning_rate": 7.067517191198314e-07, + "loss": 0.3477, + "step": 3446 + }, + { + "epoch": 2.5382916053019144, + "grad_norm": 0.373737633228302, + "learning_rate": 7.045570819907072e-07, + "loss": 0.3804, + "step": 3447 + }, + { + "epoch": 2.5390279823269513, + "grad_norm": 0.33321553468704224, + "learning_rate": 7.023655993050588e-07, + "loss": 0.3792, + "step": 3448 + }, + { + "epoch": 2.539764359351988, + "grad_norm": 0.35181403160095215, + "learning_rate": 7.001772726722439e-07, + "loss": 0.3664, + "step": 3449 + }, + { + "epoch": 2.540500736377025, + "grad_norm": 0.36001092195510864, + "learning_rate": 6.979921036993042e-07, + "loss": 0.4203, + "step": 3450 + }, + { + "epoch": 2.5412371134020617, + "grad_norm": 0.32719293236732483, + "learning_rate": 6.958100939909601e-07, + "loss": 0.3662, + "step": 3451 + }, + { + "epoch": 2.5419734904270985, + "grad_norm": 0.33018410205841064, + "learning_rate": 6.936312451496157e-07, + "loss": 0.3636, + "step": 3452 + }, + { + "epoch": 2.5427098674521353, + "grad_norm": 0.32865098118782043, + "learning_rate": 6.914555587753508e-07, + "loss": 0.3847, + "step": 3453 + }, + { + "epoch": 2.543446244477172, + "grad_norm": 0.324142187833786, + "learning_rate": 6.892830364659231e-07, + "loss": 0.3712, + "step": 3454 + }, + { + "epoch": 2.544182621502209, + "grad_norm": 0.3413415849208832, + "learning_rate": 6.871136798167693e-07, + "loss": 0.3993, + "step": 3455 + }, + { + "epoch": 2.544918998527246, + "grad_norm": 0.32384759187698364, + "learning_rate": 6.849474904209979e-07, + "loss": 0.3785, + "step": 3456 + }, + { + "epoch": 2.5456553755522826, + "grad_norm": 0.32682374119758606, + "learning_rate": 6.827844698693931e-07, + "loss": 0.3639, + "step": 3457 + }, + { + "epoch": 2.5463917525773194, + "grad_norm": 0.3174643814563751, + "learning_rate": 6.806246197504118e-07, + "loss": 0.3797, + "step": 3458 + }, + { + "epoch": 2.5471281296023562, + "grad_norm": 0.3432275056838989, + "learning_rate": 6.784679416501822e-07, + "loss": 0.3791, + "step": 3459 + }, + { + "epoch": 2.547864506627393, + "grad_norm": 0.35215064883232117, + "learning_rate": 6.763144371525048e-07, + "loss": 0.3873, + "step": 3460 + }, + { + "epoch": 2.54860088365243, + "grad_norm": 0.36379846930503845, + "learning_rate": 6.741641078388472e-07, + "loss": 0.3611, + "step": 3461 + }, + { + "epoch": 2.5493372606774667, + "grad_norm": 0.3444019854068756, + "learning_rate": 6.72016955288346e-07, + "loss": 0.3479, + "step": 3462 + }, + { + "epoch": 2.5500736377025035, + "grad_norm": 0.3353855311870575, + "learning_rate": 6.698729810778065e-07, + "loss": 0.3835, + "step": 3463 + }, + { + "epoch": 2.5508100147275403, + "grad_norm": 0.3298718333244324, + "learning_rate": 6.677321867816983e-07, + "loss": 0.4032, + "step": 3464 + }, + { + "epoch": 2.551546391752577, + "grad_norm": 0.3183014392852783, + "learning_rate": 6.655945739721548e-07, + "loss": 0.3547, + "step": 3465 + }, + { + "epoch": 2.552282768777614, + "grad_norm": 0.33290722966194153, + "learning_rate": 6.634601442189753e-07, + "loss": 0.3874, + "step": 3466 + }, + { + "epoch": 2.5530191458026508, + "grad_norm": 0.3307499289512634, + "learning_rate": 6.613288990896205e-07, + "loss": 0.3761, + "step": 3467 + }, + { + "epoch": 2.5537555228276876, + "grad_norm": 0.3292166590690613, + "learning_rate": 6.592008401492106e-07, + "loss": 0.3838, + "step": 3468 + }, + { + "epoch": 2.5544918998527244, + "grad_norm": 0.32803595066070557, + "learning_rate": 6.570759689605305e-07, + "loss": 0.3759, + "step": 3469 + }, + { + "epoch": 2.555228276877761, + "grad_norm": 0.36094194650650024, + "learning_rate": 6.549542870840203e-07, + "loss": 0.346, + "step": 3470 + }, + { + "epoch": 2.555964653902798, + "grad_norm": 0.35194647312164307, + "learning_rate": 6.528357960777776e-07, + "loss": 0.3795, + "step": 3471 + }, + { + "epoch": 2.556701030927835, + "grad_norm": 0.3414342701435089, + "learning_rate": 6.507204974975611e-07, + "loss": 0.3704, + "step": 3472 + }, + { + "epoch": 2.5574374079528717, + "grad_norm": 0.3302089273929596, + "learning_rate": 6.486083928967801e-07, + "loss": 0.3581, + "step": 3473 + }, + { + "epoch": 2.5581737849779085, + "grad_norm": 0.3138299286365509, + "learning_rate": 6.46499483826501e-07, + "loss": 0.3649, + "step": 3474 + }, + { + "epoch": 2.5589101620029453, + "grad_norm": 0.34113311767578125, + "learning_rate": 6.443937718354426e-07, + "loss": 0.3937, + "step": 3475 + }, + { + "epoch": 2.559646539027982, + "grad_norm": 0.33165475726127625, + "learning_rate": 6.422912584699753e-07, + "loss": 0.3952, + "step": 3476 + }, + { + "epoch": 2.560382916053019, + "grad_norm": 0.35434436798095703, + "learning_rate": 6.401919452741234e-07, + "loss": 0.3813, + "step": 3477 + }, + { + "epoch": 2.5611192930780557, + "grad_norm": 0.3147828280925751, + "learning_rate": 6.380958337895582e-07, + "loss": 0.378, + "step": 3478 + }, + { + "epoch": 2.5618556701030926, + "grad_norm": 0.34505346417427063, + "learning_rate": 6.360029255555994e-07, + "loss": 0.3648, + "step": 3479 + }, + { + "epoch": 2.5625920471281294, + "grad_norm": 0.3115389049053192, + "learning_rate": 6.339132221092181e-07, + "loss": 0.3581, + "step": 3480 + }, + { + "epoch": 2.563328424153166, + "grad_norm": 0.3120824694633484, + "learning_rate": 6.318267249850274e-07, + "loss": 0.3574, + "step": 3481 + }, + { + "epoch": 2.564064801178203, + "grad_norm": 0.3659776747226715, + "learning_rate": 6.297434357152882e-07, + "loss": 0.3897, + "step": 3482 + }, + { + "epoch": 2.56480117820324, + "grad_norm": 0.33508267998695374, + "learning_rate": 6.276633558299056e-07, + "loss": 0.3856, + "step": 3483 + }, + { + "epoch": 2.5655375552282766, + "grad_norm": 0.3201551139354706, + "learning_rate": 6.25586486856426e-07, + "loss": 0.3766, + "step": 3484 + }, + { + "epoch": 2.5662739322533135, + "grad_norm": 0.332343727350235, + "learning_rate": 6.23512830320041e-07, + "loss": 0.3716, + "step": 3485 + }, + { + "epoch": 2.5670103092783503, + "grad_norm": 0.346399188041687, + "learning_rate": 6.214423877435805e-07, + "loss": 0.3845, + "step": 3486 + }, + { + "epoch": 2.567746686303387, + "grad_norm": 0.3369497060775757, + "learning_rate": 6.193751606475141e-07, + "loss": 0.3658, + "step": 3487 + }, + { + "epoch": 2.568483063328424, + "grad_norm": 0.3280666768550873, + "learning_rate": 6.17311150549953e-07, + "loss": 0.3755, + "step": 3488 + }, + { + "epoch": 2.5692194403534607, + "grad_norm": 0.30463147163391113, + "learning_rate": 6.152503589666426e-07, + "loss": 0.3658, + "step": 3489 + }, + { + "epoch": 2.5699558173784975, + "grad_norm": 0.3326549828052521, + "learning_rate": 6.131927874109661e-07, + "loss": 0.3723, + "step": 3490 + }, + { + "epoch": 2.5706921944035344, + "grad_norm": 0.338368684053421, + "learning_rate": 6.111384373939416e-07, + "loss": 0.366, + "step": 3491 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.3462516963481903, + "learning_rate": 6.090873104242213e-07, + "loss": 0.3906, + "step": 3492 + }, + { + "epoch": 2.572164948453608, + "grad_norm": 0.33473119139671326, + "learning_rate": 6.070394080080921e-07, + "loss": 0.3634, + "step": 3493 + }, + { + "epoch": 2.572901325478645, + "grad_norm": 0.3650546073913574, + "learning_rate": 6.049947316494709e-07, + "loss": 0.3895, + "step": 3494 + }, + { + "epoch": 2.5736377025036816, + "grad_norm": 0.3254354000091553, + "learning_rate": 6.029532828499052e-07, + "loss": 0.3694, + "step": 3495 + }, + { + "epoch": 2.5743740795287184, + "grad_norm": 0.3232567012310028, + "learning_rate": 6.009150631085758e-07, + "loss": 0.3675, + "step": 3496 + }, + { + "epoch": 2.5751104565537553, + "grad_norm": 0.3334692120552063, + "learning_rate": 5.988800739222884e-07, + "loss": 0.3539, + "step": 3497 + }, + { + "epoch": 2.575846833578792, + "grad_norm": 0.32919347286224365, + "learning_rate": 5.968483167854761e-07, + "loss": 0.3744, + "step": 3498 + }, + { + "epoch": 2.576583210603829, + "grad_norm": 0.3350676894187927, + "learning_rate": 5.948197931902034e-07, + "loss": 0.3817, + "step": 3499 + }, + { + "epoch": 2.5773195876288657, + "grad_norm": 0.32519227266311646, + "learning_rate": 5.927945046261541e-07, + "loss": 0.3669, + "step": 3500 + }, + { + "epoch": 2.5780559646539025, + "grad_norm": 0.31917646527290344, + "learning_rate": 5.90772452580638e-07, + "loss": 0.3524, + "step": 3501 + }, + { + "epoch": 2.57879234167894, + "grad_norm": 0.3143576979637146, + "learning_rate": 5.887536385385917e-07, + "loss": 0.3696, + "step": 3502 + }, + { + "epoch": 2.5795287187039766, + "grad_norm": 0.32816874980926514, + "learning_rate": 5.867380639825698e-07, + "loss": 0.3613, + "step": 3503 + }, + { + "epoch": 2.5802650957290134, + "grad_norm": 0.3384314179420471, + "learning_rate": 5.847257303927484e-07, + "loss": 0.3812, + "step": 3504 + }, + { + "epoch": 2.5810014727540502, + "grad_norm": 0.34819257259368896, + "learning_rate": 5.827166392469269e-07, + "loss": 0.3749, + "step": 3505 + }, + { + "epoch": 2.581737849779087, + "grad_norm": 0.31272298097610474, + "learning_rate": 5.807107920205202e-07, + "loss": 0.3579, + "step": 3506 + }, + { + "epoch": 2.582474226804124, + "grad_norm": 0.3496764004230499, + "learning_rate": 5.78708190186561e-07, + "loss": 0.3594, + "step": 3507 + }, + { + "epoch": 2.5832106038291607, + "grad_norm": 0.3062533438205719, + "learning_rate": 5.767088352157002e-07, + "loss": 0.3901, + "step": 3508 + }, + { + "epoch": 2.5839469808541975, + "grad_norm": 0.4096584618091583, + "learning_rate": 5.747127285762027e-07, + "loss": 0.372, + "step": 3509 + }, + { + "epoch": 2.5846833578792343, + "grad_norm": 0.32007288932800293, + "learning_rate": 5.727198717339511e-07, + "loss": 0.366, + "step": 3510 + }, + { + "epoch": 2.585419734904271, + "grad_norm": 0.32174500823020935, + "learning_rate": 5.707302661524372e-07, + "loss": 0.3938, + "step": 3511 + }, + { + "epoch": 2.586156111929308, + "grad_norm": 0.3417740762233734, + "learning_rate": 5.687439132927674e-07, + "loss": 0.3665, + "step": 3512 + }, + { + "epoch": 2.5868924889543448, + "grad_norm": 0.32081058621406555, + "learning_rate": 5.66760814613661e-07, + "loss": 0.3919, + "step": 3513 + }, + { + "epoch": 2.5876288659793816, + "grad_norm": 0.32988831400871277, + "learning_rate": 5.647809715714442e-07, + "loss": 0.3802, + "step": 3514 + }, + { + "epoch": 2.5883652430044184, + "grad_norm": 0.3118469715118408, + "learning_rate": 5.628043856200543e-07, + "loss": 0.3748, + "step": 3515 + }, + { + "epoch": 2.589101620029455, + "grad_norm": 0.33467116951942444, + "learning_rate": 5.60831058211036e-07, + "loss": 0.4001, + "step": 3516 + }, + { + "epoch": 2.589837997054492, + "grad_norm": 0.3081044852733612, + "learning_rate": 5.588609907935405e-07, + "loss": 0.3654, + "step": 3517 + }, + { + "epoch": 2.590574374079529, + "grad_norm": 0.3295198976993561, + "learning_rate": 5.568941848143284e-07, + "loss": 0.3907, + "step": 3518 + }, + { + "epoch": 2.5913107511045657, + "grad_norm": 0.32055968046188354, + "learning_rate": 5.549306417177602e-07, + "loss": 0.3872, + "step": 3519 + }, + { + "epoch": 2.5920471281296025, + "grad_norm": 0.29698818922042847, + "learning_rate": 5.529703629458027e-07, + "loss": 0.3768, + "step": 3520 + }, + { + "epoch": 2.5927835051546393, + "grad_norm": 0.3199003040790558, + "learning_rate": 5.510133499380271e-07, + "loss": 0.359, + "step": 3521 + }, + { + "epoch": 2.593519882179676, + "grad_norm": 0.3205602169036865, + "learning_rate": 5.490596041316038e-07, + "loss": 0.3838, + "step": 3522 + }, + { + "epoch": 2.594256259204713, + "grad_norm": 0.32970884442329407, + "learning_rate": 5.471091269613033e-07, + "loss": 0.3974, + "step": 3523 + }, + { + "epoch": 2.5949926362297497, + "grad_norm": 0.3231904208660126, + "learning_rate": 5.451619198594998e-07, + "loss": 0.4017, + "step": 3524 + }, + { + "epoch": 2.5957290132547866, + "grad_norm": 0.3256881535053253, + "learning_rate": 5.432179842561614e-07, + "loss": 0.3776, + "step": 3525 + }, + { + "epoch": 2.5964653902798234, + "grad_norm": 0.3105742633342743, + "learning_rate": 5.412773215788547e-07, + "loss": 0.4146, + "step": 3526 + }, + { + "epoch": 2.59720176730486, + "grad_norm": 0.36772292852401733, + "learning_rate": 5.393399332527466e-07, + "loss": 0.3675, + "step": 3527 + }, + { + "epoch": 2.597938144329897, + "grad_norm": 0.34043705463409424, + "learning_rate": 5.374058207005945e-07, + "loss": 0.3676, + "step": 3528 + }, + { + "epoch": 2.598674521354934, + "grad_norm": 0.31147414445877075, + "learning_rate": 5.354749853427521e-07, + "loss": 0.3456, + "step": 3529 + }, + { + "epoch": 2.5994108983799706, + "grad_norm": 0.3361349105834961, + "learning_rate": 5.335474285971681e-07, + "loss": 0.3442, + "step": 3530 + }, + { + "epoch": 2.6001472754050075, + "grad_norm": 0.3142765164375305, + "learning_rate": 5.316231518793802e-07, + "loss": 0.3597, + "step": 3531 + }, + { + "epoch": 2.6008836524300443, + "grad_norm": 0.3216022253036499, + "learning_rate": 5.297021566025212e-07, + "loss": 0.3665, + "step": 3532 + }, + { + "epoch": 2.601620029455081, + "grad_norm": 0.32801806926727295, + "learning_rate": 5.277844441773105e-07, + "loss": 0.392, + "step": 3533 + }, + { + "epoch": 2.602356406480118, + "grad_norm": 0.3432222902774811, + "learning_rate": 5.258700160120567e-07, + "loss": 0.3727, + "step": 3534 + }, + { + "epoch": 2.6030927835051547, + "grad_norm": 0.3140788972377777, + "learning_rate": 5.239588735126611e-07, + "loss": 0.3747, + "step": 3535 + }, + { + "epoch": 2.6038291605301915, + "grad_norm": 0.323508083820343, + "learning_rate": 5.220510180826071e-07, + "loss": 0.416, + "step": 3536 + }, + { + "epoch": 2.6045655375552283, + "grad_norm": 0.3127088248729706, + "learning_rate": 5.201464511229659e-07, + "loss": 0.3921, + "step": 3537 + }, + { + "epoch": 2.605301914580265, + "grad_norm": 0.3234788179397583, + "learning_rate": 5.182451740323957e-07, + "loss": 0.3813, + "step": 3538 + }, + { + "epoch": 2.606038291605302, + "grad_norm": 0.30984804034233093, + "learning_rate": 5.163471882071352e-07, + "loss": 0.3887, + "step": 3539 + }, + { + "epoch": 2.606774668630339, + "grad_norm": 0.32947883009910583, + "learning_rate": 5.144524950410074e-07, + "loss": 0.3717, + "step": 3540 + }, + { + "epoch": 2.6075110456553756, + "grad_norm": 0.3143480718135834, + "learning_rate": 5.125610959254213e-07, + "loss": 0.3805, + "step": 3541 + }, + { + "epoch": 2.6082474226804124, + "grad_norm": 0.32195669412612915, + "learning_rate": 5.10672992249358e-07, + "loss": 0.3862, + "step": 3542 + }, + { + "epoch": 2.6089837997054492, + "grad_norm": 0.3320513367652893, + "learning_rate": 5.087881853993876e-07, + "loss": 0.3418, + "step": 3543 + }, + { + "epoch": 2.609720176730486, + "grad_norm": 0.3392578959465027, + "learning_rate": 5.069066767596542e-07, + "loss": 0.3469, + "step": 3544 + }, + { + "epoch": 2.610456553755523, + "grad_norm": 0.3689731955528259, + "learning_rate": 5.0502846771188e-07, + "loss": 0.3818, + "step": 3545 + }, + { + "epoch": 2.6111929307805597, + "grad_norm": 0.33059579133987427, + "learning_rate": 5.031535596353665e-07, + "loss": 0.3901, + "step": 3546 + }, + { + "epoch": 2.6119293078055965, + "grad_norm": 0.3137247860431671, + "learning_rate": 5.012819539069885e-07, + "loss": 0.3799, + "step": 3547 + }, + { + "epoch": 2.6126656848306333, + "grad_norm": 0.3432486057281494, + "learning_rate": 4.994136519011966e-07, + "loss": 0.3573, + "step": 3548 + }, + { + "epoch": 2.61340206185567, + "grad_norm": 0.33022403717041016, + "learning_rate": 4.975486549900177e-07, + "loss": 0.369, + "step": 3549 + }, + { + "epoch": 2.614138438880707, + "grad_norm": 0.33566609025001526, + "learning_rate": 4.956869645430451e-07, + "loss": 0.3732, + "step": 3550 + }, + { + "epoch": 2.6148748159057438, + "grad_norm": 0.3244359791278839, + "learning_rate": 4.938285819274507e-07, + "loss": 0.3276, + "step": 3551 + }, + { + "epoch": 2.6156111929307806, + "grad_norm": 0.31504350900650024, + "learning_rate": 4.919735085079746e-07, + "loss": 0.3572, + "step": 3552 + }, + { + "epoch": 2.6163475699558174, + "grad_norm": 0.32959744334220886, + "learning_rate": 4.901217456469248e-07, + "loss": 0.4114, + "step": 3553 + }, + { + "epoch": 2.6170839469808542, + "grad_norm": 0.30462411046028137, + "learning_rate": 4.882732947041818e-07, + "loss": 0.3974, + "step": 3554 + }, + { + "epoch": 2.617820324005891, + "grad_norm": 0.33037570118904114, + "learning_rate": 4.86428157037192e-07, + "loss": 0.3725, + "step": 3555 + }, + { + "epoch": 2.618556701030928, + "grad_norm": 0.3179605007171631, + "learning_rate": 4.845863340009671e-07, + "loss": 0.4015, + "step": 3556 + }, + { + "epoch": 2.6192930780559647, + "grad_norm": 0.332682341337204, + "learning_rate": 4.827478269480895e-07, + "loss": 0.3676, + "step": 3557 + }, + { + "epoch": 2.6200294550810015, + "grad_norm": 0.33066973090171814, + "learning_rate": 4.809126372286999e-07, + "loss": 0.3644, + "step": 3558 + }, + { + "epoch": 2.6207658321060383, + "grad_norm": 0.34845325350761414, + "learning_rate": 4.790807661905067e-07, + "loss": 0.3876, + "step": 3559 + }, + { + "epoch": 2.621502209131075, + "grad_norm": 0.3088925778865814, + "learning_rate": 4.772522151787822e-07, + "loss": 0.3701, + "step": 3560 + }, + { + "epoch": 2.622238586156112, + "grad_norm": 0.3307948708534241, + "learning_rate": 4.7542698553635856e-07, + "loss": 0.3932, + "step": 3561 + }, + { + "epoch": 2.6229749631811488, + "grad_norm": 0.33199846744537354, + "learning_rate": 4.7360507860362723e-07, + "loss": 0.362, + "step": 3562 + }, + { + "epoch": 2.6237113402061856, + "grad_norm": 0.3131435215473175, + "learning_rate": 4.7178649571854473e-07, + "loss": 0.3647, + "step": 3563 + }, + { + "epoch": 2.6244477172312224, + "grad_norm": 0.3339531719684601, + "learning_rate": 4.699712382166216e-07, + "loss": 0.3888, + "step": 3564 + }, + { + "epoch": 2.625184094256259, + "grad_norm": 0.3675123155117035, + "learning_rate": 4.6815930743092765e-07, + "loss": 0.3772, + "step": 3565 + }, + { + "epoch": 2.625920471281296, + "grad_norm": 0.34725135564804077, + "learning_rate": 4.663507046920929e-07, + "loss": 0.349, + "step": 3566 + }, + { + "epoch": 2.626656848306333, + "grad_norm": 0.31065019965171814, + "learning_rate": 4.6454543132829653e-07, + "loss": 0.3587, + "step": 3567 + }, + { + "epoch": 2.6273932253313697, + "grad_norm": 0.32565373182296753, + "learning_rate": 4.627434886652793e-07, + "loss": 0.3668, + "step": 3568 + }, + { + "epoch": 2.6281296023564065, + "grad_norm": 0.33230826258659363, + "learning_rate": 4.6094487802633315e-07, + "loss": 0.3763, + "step": 3569 + }, + { + "epoch": 2.6288659793814433, + "grad_norm": 0.31987035274505615, + "learning_rate": 4.591496007323021e-07, + "loss": 0.38, + "step": 3570 + }, + { + "epoch": 2.62960235640648, + "grad_norm": 0.31742241978645325, + "learning_rate": 4.573576581015854e-07, + "loss": 0.3474, + "step": 3571 + }, + { + "epoch": 2.630338733431517, + "grad_norm": 0.31509506702423096, + "learning_rate": 4.55569051450131e-07, + "loss": 0.3646, + "step": 3572 + }, + { + "epoch": 2.6310751104565537, + "grad_norm": 0.3257356584072113, + "learning_rate": 4.537837820914359e-07, + "loss": 0.3668, + "step": 3573 + }, + { + "epoch": 2.6318114874815906, + "grad_norm": 0.32111799716949463, + "learning_rate": 4.520018513365515e-07, + "loss": 0.3933, + "step": 3574 + }, + { + "epoch": 2.6325478645066274, + "grad_norm": 0.3151605725288391, + "learning_rate": 4.5022326049406986e-07, + "loss": 0.3676, + "step": 3575 + }, + { + "epoch": 2.633284241531664, + "grad_norm": 0.3598038852214813, + "learning_rate": 4.484480108701372e-07, + "loss": 0.4136, + "step": 3576 + }, + { + "epoch": 2.634020618556701, + "grad_norm": 0.2969145178794861, + "learning_rate": 4.4667610376844197e-07, + "loss": 0.3821, + "step": 3577 + }, + { + "epoch": 2.634756995581738, + "grad_norm": 0.33178281784057617, + "learning_rate": 4.449075404902187e-07, + "loss": 0.3559, + "step": 3578 + }, + { + "epoch": 2.6354933726067746, + "grad_norm": 0.3466475307941437, + "learning_rate": 4.4314232233424845e-07, + "loss": 0.3827, + "step": 3579 + }, + { + "epoch": 2.6362297496318114, + "grad_norm": 0.3220832645893097, + "learning_rate": 4.413804505968533e-07, + "loss": 0.367, + "step": 3580 + }, + { + "epoch": 2.6369661266568483, + "grad_norm": 0.32306790351867676, + "learning_rate": 4.3962192657189707e-07, + "loss": 0.3961, + "step": 3581 + }, + { + "epoch": 2.637702503681885, + "grad_norm": 0.32913732528686523, + "learning_rate": 4.378667515507895e-07, + "loss": 0.3701, + "step": 3582 + }, + { + "epoch": 2.638438880706922, + "grad_norm": 0.3247510492801666, + "learning_rate": 4.361149268224779e-07, + "loss": 0.3603, + "step": 3583 + }, + { + "epoch": 2.6391752577319587, + "grad_norm": 0.36141979694366455, + "learning_rate": 4.34366453673446e-07, + "loss": 0.3905, + "step": 3584 + }, + { + "epoch": 2.6399116347569955, + "grad_norm": 0.31596043705940247, + "learning_rate": 4.326213333877227e-07, + "loss": 0.3527, + "step": 3585 + }, + { + "epoch": 2.6406480117820323, + "grad_norm": 0.3106141984462738, + "learning_rate": 4.308795672468713e-07, + "loss": 0.3539, + "step": 3586 + }, + { + "epoch": 2.641384388807069, + "grad_norm": 0.30769434571266174, + "learning_rate": 4.291411565299902e-07, + "loss": 0.3495, + "step": 3587 + }, + { + "epoch": 2.642120765832106, + "grad_norm": 0.30311083793640137, + "learning_rate": 4.2740610251371826e-07, + "loss": 0.3988, + "step": 3588 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 0.3294108510017395, + "learning_rate": 4.256744064722246e-07, + "loss": 0.3609, + "step": 3589 + }, + { + "epoch": 2.6435935198821796, + "grad_norm": 0.3281387686729431, + "learning_rate": 4.2394606967721683e-07, + "loss": 0.3651, + "step": 3590 + }, + { + "epoch": 2.6443298969072164, + "grad_norm": 0.33501261472702026, + "learning_rate": 4.222210933979326e-07, + "loss": 0.3573, + "step": 3591 + }, + { + "epoch": 2.6450662739322532, + "grad_norm": 0.35645949840545654, + "learning_rate": 4.204994789011396e-07, + "loss": 0.4035, + "step": 3592 + }, + { + "epoch": 2.64580265095729, + "grad_norm": 0.3307779133319855, + "learning_rate": 4.187812274511427e-07, + "loss": 0.3763, + "step": 3593 + }, + { + "epoch": 2.646539027982327, + "grad_norm": 0.3367134630680084, + "learning_rate": 4.17066340309773e-07, + "loss": 0.3359, + "step": 3594 + }, + { + "epoch": 2.6472754050073637, + "grad_norm": 0.33751773834228516, + "learning_rate": 4.153548187363904e-07, + "loss": 0.3989, + "step": 3595 + }, + { + "epoch": 2.6480117820324005, + "grad_norm": 0.29782766103744507, + "learning_rate": 4.1364666398788613e-07, + "loss": 0.3574, + "step": 3596 + }, + { + "epoch": 2.6487481590574373, + "grad_norm": 0.3283192217350006, + "learning_rate": 4.1194187731867783e-07, + "loss": 0.3527, + "step": 3597 + }, + { + "epoch": 2.649484536082474, + "grad_norm": 0.32778191566467285, + "learning_rate": 4.102404599807075e-07, + "loss": 0.4059, + "step": 3598 + }, + { + "epoch": 2.650220913107511, + "grad_norm": 0.35166746377944946, + "learning_rate": 4.0854241322344665e-07, + "loss": 0.3882, + "step": 3599 + }, + { + "epoch": 2.6509572901325478, + "grad_norm": 0.34499865770339966, + "learning_rate": 4.0684773829388737e-07, + "loss": 0.3763, + "step": 3600 + }, + { + "epoch": 2.6516936671575846, + "grad_norm": 0.3195962607860565, + "learning_rate": 4.0515643643655014e-07, + "loss": 0.3616, + "step": 3601 + }, + { + "epoch": 2.6524300441826214, + "grad_norm": 0.34196287393569946, + "learning_rate": 4.034685088934737e-07, + "loss": 0.3798, + "step": 3602 + }, + { + "epoch": 2.653166421207658, + "grad_norm": 0.32536935806274414, + "learning_rate": 4.0178395690422143e-07, + "loss": 0.3623, + "step": 3603 + }, + { + "epoch": 2.653902798232695, + "grad_norm": 0.3281365633010864, + "learning_rate": 4.001027817058789e-07, + "loss": 0.3523, + "step": 3604 + }, + { + "epoch": 2.654639175257732, + "grad_norm": 0.32271289825439453, + "learning_rate": 3.9842498453304955e-07, + "loss": 0.3578, + "step": 3605 + }, + { + "epoch": 2.6553755522827687, + "grad_norm": 0.3313661515712738, + "learning_rate": 3.9675056661785563e-07, + "loss": 0.3415, + "step": 3606 + }, + { + "epoch": 2.6561119293078055, + "grad_norm": 0.31950780749320984, + "learning_rate": 3.950795291899412e-07, + "loss": 0.3595, + "step": 3607 + }, + { + "epoch": 2.6568483063328423, + "grad_norm": 0.34532180428504944, + "learning_rate": 3.934118734764647e-07, + "loss": 0.3879, + "step": 3608 + }, + { + "epoch": 2.657584683357879, + "grad_norm": 0.3165544271469116, + "learning_rate": 3.9174760070210204e-07, + "loss": 0.3682, + "step": 3609 + }, + { + "epoch": 2.658321060382916, + "grad_norm": 0.33208510279655457, + "learning_rate": 3.9008671208904503e-07, + "loss": 0.3667, + "step": 3610 + }, + { + "epoch": 2.6590574374079528, + "grad_norm": 0.31021976470947266, + "learning_rate": 3.8842920885699906e-07, + "loss": 0.3645, + "step": 3611 + }, + { + "epoch": 2.6597938144329896, + "grad_norm": 0.32414910197257996, + "learning_rate": 3.8677509222318557e-07, + "loss": 0.3607, + "step": 3612 + }, + { + "epoch": 2.6605301914580264, + "grad_norm": 0.33595502376556396, + "learning_rate": 3.8512436340233826e-07, + "loss": 0.3728, + "step": 3613 + }, + { + "epoch": 2.661266568483063, + "grad_norm": 0.31958842277526855, + "learning_rate": 3.8347702360670036e-07, + "loss": 0.3722, + "step": 3614 + }, + { + "epoch": 2.6620029455081, + "grad_norm": 0.3532390594482422, + "learning_rate": 3.8183307404603074e-07, + "loss": 0.3657, + "step": 3615 + }, + { + "epoch": 2.662739322533137, + "grad_norm": 0.3612724840641022, + "learning_rate": 3.8019251592759656e-07, + "loss": 0.388, + "step": 3616 + }, + { + "epoch": 2.6634756995581736, + "grad_norm": 0.3217339515686035, + "learning_rate": 3.785553504561712e-07, + "loss": 0.3428, + "step": 3617 + }, + { + "epoch": 2.6642120765832105, + "grad_norm": 0.2959875762462616, + "learning_rate": 3.769215788340419e-07, + "loss": 0.3665, + "step": 3618 + }, + { + "epoch": 2.6649484536082473, + "grad_norm": 0.3248257637023926, + "learning_rate": 3.752912022610006e-07, + "loss": 0.3725, + "step": 3619 + }, + { + "epoch": 2.665684830633284, + "grad_norm": 0.3298156261444092, + "learning_rate": 3.736642219343456e-07, + "loss": 0.3673, + "step": 3620 + }, + { + "epoch": 2.666421207658321, + "grad_norm": 0.32262980937957764, + "learning_rate": 3.720406390488834e-07, + "loss": 0.3928, + "step": 3621 + }, + { + "epoch": 2.6671575846833577, + "grad_norm": 0.30602526664733887, + "learning_rate": 3.7042045479692424e-07, + "loss": 0.3647, + "step": 3622 + }, + { + "epoch": 2.6678939617083945, + "grad_norm": 0.3311411142349243, + "learning_rate": 3.6880367036828124e-07, + "loss": 0.3779, + "step": 3623 + }, + { + "epoch": 2.6686303387334314, + "grad_norm": 0.32879799604415894, + "learning_rate": 3.671902869502736e-07, + "loss": 0.3925, + "step": 3624 + }, + { + "epoch": 2.669366715758468, + "grad_norm": 0.3125240206718445, + "learning_rate": 3.6558030572772075e-07, + "loss": 0.3984, + "step": 3625 + }, + { + "epoch": 2.670103092783505, + "grad_norm": 0.3394649624824524, + "learning_rate": 3.639737278829436e-07, + "loss": 0.3657, + "step": 3626 + }, + { + "epoch": 2.670839469808542, + "grad_norm": 0.34995949268341064, + "learning_rate": 3.623705545957651e-07, + "loss": 0.4049, + "step": 3627 + }, + { + "epoch": 2.6715758468335786, + "grad_norm": 0.33862966299057007, + "learning_rate": 3.607707870435062e-07, + "loss": 0.3941, + "step": 3628 + }, + { + "epoch": 2.672312223858616, + "grad_norm": 0.3330628573894501, + "learning_rate": 3.5917442640098997e-07, + "loss": 0.3516, + "step": 3629 + }, + { + "epoch": 2.6730486008836527, + "grad_norm": 0.30992215871810913, + "learning_rate": 3.575814738405331e-07, + "loss": 0.3642, + "step": 3630 + }, + { + "epoch": 2.6737849779086895, + "grad_norm": 0.3173280954360962, + "learning_rate": 3.559919305319526e-07, + "loss": 0.3858, + "step": 3631 + }, + { + "epoch": 2.6745213549337263, + "grad_norm": 0.32444649934768677, + "learning_rate": 3.544057976425619e-07, + "loss": 0.3647, + "step": 3632 + }, + { + "epoch": 2.675257731958763, + "grad_norm": 0.2939029932022095, + "learning_rate": 3.528230763371687e-07, + "loss": 0.359, + "step": 3633 + }, + { + "epoch": 2.6759941089838, + "grad_norm": 0.3513110876083374, + "learning_rate": 3.51243767778075e-07, + "loss": 0.3474, + "step": 3634 + }, + { + "epoch": 2.676730486008837, + "grad_norm": 0.3469097912311554, + "learning_rate": 3.49667873125078e-07, + "loss": 0.3416, + "step": 3635 + }, + { + "epoch": 2.6774668630338736, + "grad_norm": 0.3251095116138458, + "learning_rate": 3.480953935354658e-07, + "loss": 0.3585, + "step": 3636 + }, + { + "epoch": 2.6782032400589104, + "grad_norm": 0.31582942605018616, + "learning_rate": 3.4652633016402205e-07, + "loss": 0.3648, + "step": 3637 + }, + { + "epoch": 2.6789396170839472, + "grad_norm": 0.31770941615104675, + "learning_rate": 3.449606841630182e-07, + "loss": 0.3846, + "step": 3638 + }, + { + "epoch": 2.679675994108984, + "grad_norm": 0.3245222866535187, + "learning_rate": 3.433984566822163e-07, + "loss": 0.3573, + "step": 3639 + }, + { + "epoch": 2.680412371134021, + "grad_norm": 0.33965256810188293, + "learning_rate": 3.4183964886887135e-07, + "loss": 0.3547, + "step": 3640 + }, + { + "epoch": 2.6811487481590577, + "grad_norm": 0.34220483899116516, + "learning_rate": 3.4028426186772435e-07, + "loss": 0.3624, + "step": 3641 + }, + { + "epoch": 2.6818851251840945, + "grad_norm": 0.32621511816978455, + "learning_rate": 3.387322968210022e-07, + "loss": 0.3795, + "step": 3642 + }, + { + "epoch": 2.6826215022091313, + "grad_norm": 0.3251863121986389, + "learning_rate": 3.3718375486842314e-07, + "loss": 0.3535, + "step": 3643 + }, + { + "epoch": 2.683357879234168, + "grad_norm": 0.32561933994293213, + "learning_rate": 3.3563863714718927e-07, + "loss": 0.3637, + "step": 3644 + }, + { + "epoch": 2.684094256259205, + "grad_norm": 0.3464414179325104, + "learning_rate": 3.340969447919873e-07, + "loss": 0.3708, + "step": 3645 + }, + { + "epoch": 2.6848306332842418, + "grad_norm": 0.3460819721221924, + "learning_rate": 3.3255867893499105e-07, + "loss": 0.366, + "step": 3646 + }, + { + "epoch": 2.6855670103092786, + "grad_norm": 0.36896535754203796, + "learning_rate": 3.3102384070585523e-07, + "loss": 0.3482, + "step": 3647 + }, + { + "epoch": 2.6863033873343154, + "grad_norm": 0.3020630478858948, + "learning_rate": 3.2949243123171994e-07, + "loss": 0.3855, + "step": 3648 + }, + { + "epoch": 2.687039764359352, + "grad_norm": 0.3055335283279419, + "learning_rate": 3.279644516372049e-07, + "loss": 0.3841, + "step": 3649 + }, + { + "epoch": 2.687776141384389, + "grad_norm": 0.3210819363594055, + "learning_rate": 3.264399030444132e-07, + "loss": 0.3661, + "step": 3650 + }, + { + "epoch": 2.688512518409426, + "grad_norm": 0.3282409608364105, + "learning_rate": 3.2491878657292643e-07, + "loss": 0.3595, + "step": 3651 + }, + { + "epoch": 2.6892488954344627, + "grad_norm": 0.3092752695083618, + "learning_rate": 3.2340110333980656e-07, + "loss": 0.3499, + "step": 3652 + }, + { + "epoch": 2.6899852724594995, + "grad_norm": 0.31985411047935486, + "learning_rate": 3.218868544595938e-07, + "loss": 0.3618, + "step": 3653 + }, + { + "epoch": 2.6907216494845363, + "grad_norm": 0.33014747500419617, + "learning_rate": 3.20376041044308e-07, + "loss": 0.3551, + "step": 3654 + }, + { + "epoch": 2.691458026509573, + "grad_norm": 0.307456910610199, + "learning_rate": 3.18868664203445e-07, + "loss": 0.3722, + "step": 3655 + }, + { + "epoch": 2.69219440353461, + "grad_norm": 0.35932809114456177, + "learning_rate": 3.1736472504397485e-07, + "loss": 0.3935, + "step": 3656 + }, + { + "epoch": 2.6929307805596467, + "grad_norm": 0.3306938409805298, + "learning_rate": 3.1586422467034695e-07, + "loss": 0.379, + "step": 3657 + }, + { + "epoch": 2.6936671575846836, + "grad_norm": 0.3393386900424957, + "learning_rate": 3.143671641844831e-07, + "loss": 0.4004, + "step": 3658 + }, + { + "epoch": 2.6944035346097204, + "grad_norm": 0.3137757480144501, + "learning_rate": 3.128735446857784e-07, + "loss": 0.3594, + "step": 3659 + }, + { + "epoch": 2.695139911634757, + "grad_norm": 0.33764535188674927, + "learning_rate": 3.1138336727110307e-07, + "loss": 0.3994, + "step": 3660 + }, + { + "epoch": 2.695876288659794, + "grad_norm": 0.32632213830947876, + "learning_rate": 3.098966330347969e-07, + "loss": 0.3772, + "step": 3661 + }, + { + "epoch": 2.696612665684831, + "grad_norm": 0.3276118040084839, + "learning_rate": 3.0841334306867367e-07, + "loss": 0.3561, + "step": 3662 + }, + { + "epoch": 2.6973490427098676, + "grad_norm": 0.3348173201084137, + "learning_rate": 3.06933498462017e-07, + "loss": 0.357, + "step": 3663 + }, + { + "epoch": 2.6980854197349045, + "grad_norm": 0.372728168964386, + "learning_rate": 3.0545710030157824e-07, + "loss": 0.3482, + "step": 3664 + }, + { + "epoch": 2.6988217967599413, + "grad_norm": 0.34344926476478577, + "learning_rate": 3.039841496715823e-07, + "loss": 0.39, + "step": 3665 + }, + { + "epoch": 2.699558173784978, + "grad_norm": 0.3021678924560547, + "learning_rate": 3.0251464765371774e-07, + "loss": 0.3888, + "step": 3666 + }, + { + "epoch": 2.700294550810015, + "grad_norm": 0.3373548984527588, + "learning_rate": 3.010485953271425e-07, + "loss": 0.3803, + "step": 3667 + }, + { + "epoch": 2.7010309278350517, + "grad_norm": 0.31935036182403564, + "learning_rate": 2.9958599376848194e-07, + "loss": 0.3878, + "step": 3668 + }, + { + "epoch": 2.7017673048600885, + "grad_norm": 0.3091143071651459, + "learning_rate": 2.9812684405182536e-07, + "loss": 0.336, + "step": 3669 + }, + { + "epoch": 2.7025036818851254, + "grad_norm": 0.325328528881073, + "learning_rate": 2.9667114724872937e-07, + "loss": 0.4175, + "step": 3670 + }, + { + "epoch": 2.703240058910162, + "grad_norm": 0.31764811277389526, + "learning_rate": 2.9521890442821276e-07, + "loss": 0.3726, + "step": 3671 + }, + { + "epoch": 2.703976435935199, + "grad_norm": 0.3514196276664734, + "learning_rate": 2.9377011665675913e-07, + "loss": 0.3768, + "step": 3672 + }, + { + "epoch": 2.704712812960236, + "grad_norm": 0.3268696069717407, + "learning_rate": 2.923247849983146e-07, + "loss": 0.3878, + "step": 3673 + }, + { + "epoch": 2.7054491899852726, + "grad_norm": 0.3018622100353241, + "learning_rate": 2.908829105142874e-07, + "loss": 0.3717, + "step": 3674 + }, + { + "epoch": 2.7061855670103094, + "grad_norm": 0.28456223011016846, + "learning_rate": 2.89444494263546e-07, + "loss": 0.3918, + "step": 3675 + }, + { + "epoch": 2.7069219440353463, + "grad_norm": 0.35831522941589355, + "learning_rate": 2.8800953730242e-07, + "loss": 0.3505, + "step": 3676 + }, + { + "epoch": 2.707658321060383, + "grad_norm": 0.33213359117507935, + "learning_rate": 2.865780406846985e-07, + "loss": 0.3614, + "step": 3677 + }, + { + "epoch": 2.70839469808542, + "grad_norm": 0.3209593594074249, + "learning_rate": 2.85150005461628e-07, + "loss": 0.3597, + "step": 3678 + }, + { + "epoch": 2.7091310751104567, + "grad_norm": 0.3153224587440491, + "learning_rate": 2.8372543268191723e-07, + "loss": 0.3796, + "step": 3679 + }, + { + "epoch": 2.7098674521354935, + "grad_norm": 0.3322901129722595, + "learning_rate": 2.823043233917272e-07, + "loss": 0.3776, + "step": 3680 + }, + { + "epoch": 2.7106038291605303, + "grad_norm": 0.35406193137168884, + "learning_rate": 2.8088667863467754e-07, + "loss": 0.3886, + "step": 3681 + }, + { + "epoch": 2.711340206185567, + "grad_norm": 0.3339800536632538, + "learning_rate": 2.794724994518455e-07, + "loss": 0.3846, + "step": 3682 + }, + { + "epoch": 2.712076583210604, + "grad_norm": 0.33495986461639404, + "learning_rate": 2.7806178688175977e-07, + "loss": 0.3788, + "step": 3683 + }, + { + "epoch": 2.712812960235641, + "grad_norm": 0.3515438437461853, + "learning_rate": 2.7665454196040665e-07, + "loss": 0.3865, + "step": 3684 + }, + { + "epoch": 2.7135493372606776, + "grad_norm": 0.320209801197052, + "learning_rate": 2.752507657212228e-07, + "loss": 0.3725, + "step": 3685 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.3267118036746979, + "learning_rate": 2.738504591950991e-07, + "loss": 0.3785, + "step": 3686 + }, + { + "epoch": 2.7150220913107512, + "grad_norm": 0.34039753675460815, + "learning_rate": 2.724536234103792e-07, + "loss": 0.3622, + "step": 3687 + }, + { + "epoch": 2.715758468335788, + "grad_norm": 0.33974501490592957, + "learning_rate": 2.710602593928574e-07, + "loss": 0.3561, + "step": 3688 + }, + { + "epoch": 2.716494845360825, + "grad_norm": 0.3247467279434204, + "learning_rate": 2.6967036816577643e-07, + "loss": 0.3628, + "step": 3689 + }, + { + "epoch": 2.7172312223858617, + "grad_norm": 0.3339967131614685, + "learning_rate": 2.6828395074983195e-07, + "loss": 0.3635, + "step": 3690 + }, + { + "epoch": 2.7179675994108985, + "grad_norm": 0.31294745206832886, + "learning_rate": 2.6690100816316675e-07, + "loss": 0.3706, + "step": 3691 + }, + { + "epoch": 2.7187039764359353, + "grad_norm": 0.3254539370536804, + "learning_rate": 2.655215414213719e-07, + "loss": 0.3796, + "step": 3692 + }, + { + "epoch": 2.719440353460972, + "grad_norm": 0.32704704999923706, + "learning_rate": 2.6414555153748635e-07, + "loss": 0.3642, + "step": 3693 + }, + { + "epoch": 2.720176730486009, + "grad_norm": 0.31729063391685486, + "learning_rate": 2.627730395219941e-07, + "loss": 0.3979, + "step": 3694 + }, + { + "epoch": 2.7209131075110458, + "grad_norm": 0.3445037603378296, + "learning_rate": 2.6140400638282826e-07, + "loss": 0.3648, + "step": 3695 + }, + { + "epoch": 2.7216494845360826, + "grad_norm": 0.3214196562767029, + "learning_rate": 2.6003845312536526e-07, + "loss": 0.397, + "step": 3696 + }, + { + "epoch": 2.7223858615611194, + "grad_norm": 0.3424557149410248, + "learning_rate": 2.5867638075242454e-07, + "loss": 0.3801, + "step": 3697 + }, + { + "epoch": 2.723122238586156, + "grad_norm": 0.36900395154953003, + "learning_rate": 2.573177902642726e-07, + "loss": 0.3874, + "step": 3698 + }, + { + "epoch": 2.723858615611193, + "grad_norm": 0.33150961995124817, + "learning_rate": 2.5596268265861646e-07, + "loss": 0.3587, + "step": 3699 + }, + { + "epoch": 2.72459499263623, + "grad_norm": 0.35658320784568787, + "learning_rate": 2.5461105893060667e-07, + "loss": 0.3807, + "step": 3700 + }, + { + "epoch": 2.7253313696612667, + "grad_norm": 0.3134189248085022, + "learning_rate": 2.532629200728343e-07, + "loss": 0.3832, + "step": 3701 + }, + { + "epoch": 2.7260677466863035, + "grad_norm": 0.3331991136074066, + "learning_rate": 2.5191826707533173e-07, + "loss": 0.3653, + "step": 3702 + }, + { + "epoch": 2.7268041237113403, + "grad_norm": 0.3472428619861603, + "learning_rate": 2.505771009255714e-07, + "loss": 0.4057, + "step": 3703 + }, + { + "epoch": 2.727540500736377, + "grad_norm": 0.36920398473739624, + "learning_rate": 2.492394226084666e-07, + "loss": 0.396, + "step": 3704 + }, + { + "epoch": 2.728276877761414, + "grad_norm": 0.3479515314102173, + "learning_rate": 2.479052331063658e-07, + "loss": 0.3834, + "step": 3705 + }, + { + "epoch": 2.7290132547864507, + "grad_norm": 0.3214065730571747, + "learning_rate": 2.465745333990588e-07, + "loss": 0.3581, + "step": 3706 + }, + { + "epoch": 2.7297496318114876, + "grad_norm": 0.29844558238983154, + "learning_rate": 2.4524732446377154e-07, + "loss": 0.3715, + "step": 3707 + }, + { + "epoch": 2.7304860088365244, + "grad_norm": 0.2953944504261017, + "learning_rate": 2.439236072751644e-07, + "loss": 0.3972, + "step": 3708 + }, + { + "epoch": 2.731222385861561, + "grad_norm": 0.322376012802124, + "learning_rate": 2.426033828053381e-07, + "loss": 0.3785, + "step": 3709 + }, + { + "epoch": 2.731958762886598, + "grad_norm": 0.32461515069007874, + "learning_rate": 2.4128665202382327e-07, + "loss": 0.3594, + "step": 3710 + }, + { + "epoch": 2.732695139911635, + "grad_norm": 0.3040623664855957, + "learning_rate": 2.3997341589758694e-07, + "loss": 0.3436, + "step": 3711 + }, + { + "epoch": 2.7334315169366716, + "grad_norm": 0.32611605525016785, + "learning_rate": 2.3866367539103206e-07, + "loss": 0.3653, + "step": 3712 + }, + { + "epoch": 2.7341678939617085, + "grad_norm": 0.32935771346092224, + "learning_rate": 2.37357431465991e-07, + "loss": 0.3611, + "step": 3713 + }, + { + "epoch": 2.7349042709867453, + "grad_norm": 0.3235970139503479, + "learning_rate": 2.3605468508172968e-07, + "loss": 0.3786, + "step": 3714 + }, + { + "epoch": 2.735640648011782, + "grad_norm": 0.33503854274749756, + "learning_rate": 2.3475543719494676e-07, + "loss": 0.3587, + "step": 3715 + }, + { + "epoch": 2.736377025036819, + "grad_norm": 0.324275404214859, + "learning_rate": 2.3345968875977008e-07, + "loss": 0.3482, + "step": 3716 + }, + { + "epoch": 2.7371134020618557, + "grad_norm": 0.33510270714759827, + "learning_rate": 2.3216744072775797e-07, + "loss": 0.3949, + "step": 3717 + }, + { + "epoch": 2.7378497790868925, + "grad_norm": 0.35356804728507996, + "learning_rate": 2.3087869404789854e-07, + "loss": 0.3781, + "step": 3718 + }, + { + "epoch": 2.7385861561119293, + "grad_norm": 0.3195352554321289, + "learning_rate": 2.2959344966660802e-07, + "loss": 0.3679, + "step": 3719 + }, + { + "epoch": 2.739322533136966, + "grad_norm": 0.3420256972312927, + "learning_rate": 2.2831170852773198e-07, + "loss": 0.3873, + "step": 3720 + }, + { + "epoch": 2.740058910162003, + "grad_norm": 0.3301747441291809, + "learning_rate": 2.2703347157254142e-07, + "loss": 0.3652, + "step": 3721 + }, + { + "epoch": 2.74079528718704, + "grad_norm": 0.3048621714115143, + "learning_rate": 2.2575873973973485e-07, + "loss": 0.3808, + "step": 3722 + }, + { + "epoch": 2.7415316642120766, + "grad_norm": 0.31739065051078796, + "learning_rate": 2.2448751396543788e-07, + "loss": 0.3668, + "step": 3723 + }, + { + "epoch": 2.7422680412371134, + "grad_norm": 0.3357614278793335, + "learning_rate": 2.2321979518319992e-07, + "loss": 0.4031, + "step": 3724 + }, + { + "epoch": 2.7430044182621502, + "grad_norm": 0.3367345929145813, + "learning_rate": 2.21955584323994e-07, + "loss": 0.3808, + "step": 3725 + }, + { + "epoch": 2.743740795287187, + "grad_norm": 0.3369562029838562, + "learning_rate": 2.2069488231622083e-07, + "loss": 0.3616, + "step": 3726 + }, + { + "epoch": 2.744477172312224, + "grad_norm": 0.30571091175079346, + "learning_rate": 2.1943769008569927e-07, + "loss": 0.3605, + "step": 3727 + }, + { + "epoch": 2.7452135493372607, + "grad_norm": 0.32707011699676514, + "learning_rate": 2.1818400855567523e-07, + "loss": 0.3632, + "step": 3728 + }, + { + "epoch": 2.7459499263622975, + "grad_norm": 0.3019167482852936, + "learning_rate": 2.1693383864681394e-07, + "loss": 0.3751, + "step": 3729 + }, + { + "epoch": 2.7466863033873343, + "grad_norm": 0.32666751742362976, + "learning_rate": 2.1568718127720155e-07, + "loss": 0.3796, + "step": 3730 + }, + { + "epoch": 2.747422680412371, + "grad_norm": 0.3546668291091919, + "learning_rate": 2.1444403736234686e-07, + "loss": 0.3746, + "step": 3731 + }, + { + "epoch": 2.748159057437408, + "grad_norm": 0.31046637892723083, + "learning_rate": 2.132044078151768e-07, + "loss": 0.3346, + "step": 3732 + }, + { + "epoch": 2.7488954344624448, + "grad_norm": 0.3377501666545868, + "learning_rate": 2.119682935460371e-07, + "loss": 0.3892, + "step": 3733 + }, + { + "epoch": 2.7496318114874816, + "grad_norm": 0.3156897723674774, + "learning_rate": 2.1073569546269434e-07, + "loss": 0.3731, + "step": 3734 + }, + { + "epoch": 2.7503681885125184, + "grad_norm": 0.33260810375213623, + "learning_rate": 2.095066144703295e-07, + "loss": 0.3553, + "step": 3735 + }, + { + "epoch": 2.7511045655375552, + "grad_norm": 0.351136177778244, + "learning_rate": 2.0828105147154275e-07, + "loss": 0.3744, + "step": 3736 + }, + { + "epoch": 2.751840942562592, + "grad_norm": 0.3310754597187042, + "learning_rate": 2.07059007366352e-07, + "loss": 0.3813, + "step": 3737 + }, + { + "epoch": 2.752577319587629, + "grad_norm": 0.3328613340854645, + "learning_rate": 2.0584048305218874e-07, + "loss": 0.3609, + "step": 3738 + }, + { + "epoch": 2.7533136966126657, + "grad_norm": 0.31846439838409424, + "learning_rate": 2.0462547942389942e-07, + "loss": 0.3687, + "step": 3739 + }, + { + "epoch": 2.7540500736377025, + "grad_norm": 0.29258909821510315, + "learning_rate": 2.03413997373747e-07, + "loss": 0.3837, + "step": 3740 + }, + { + "epoch": 2.7547864506627393, + "grad_norm": 0.31229260563850403, + "learning_rate": 2.0220603779140759e-07, + "loss": 0.3641, + "step": 3741 + }, + { + "epoch": 2.755522827687776, + "grad_norm": 0.31942838430404663, + "learning_rate": 2.0100160156396986e-07, + "loss": 0.3746, + "step": 3742 + }, + { + "epoch": 2.756259204712813, + "grad_norm": 0.3302987515926361, + "learning_rate": 1.998006895759347e-07, + "loss": 0.4101, + "step": 3743 + }, + { + "epoch": 2.7569955817378498, + "grad_norm": 0.32852962613105774, + "learning_rate": 1.98603302709216e-07, + "loss": 0.3463, + "step": 3744 + }, + { + "epoch": 2.7577319587628866, + "grad_norm": 0.34027835726737976, + "learning_rate": 1.9740944184313882e-07, + "loss": 0.3922, + "step": 3745 + }, + { + "epoch": 2.7584683357879234, + "grad_norm": 0.33216574788093567, + "learning_rate": 1.9621910785443843e-07, + "loss": 0.3951, + "step": 3746 + }, + { + "epoch": 2.75920471281296, + "grad_norm": 0.3349026143550873, + "learning_rate": 1.950323016172595e-07, + "loss": 0.3729, + "step": 3747 + }, + { + "epoch": 2.759941089837997, + "grad_norm": 0.31592321395874023, + "learning_rate": 1.9384902400315764e-07, + "loss": 0.3497, + "step": 3748 + }, + { + "epoch": 2.760677466863034, + "grad_norm": 0.33262524008750916, + "learning_rate": 1.926692758810955e-07, + "loss": 0.3656, + "step": 3749 + }, + { + "epoch": 2.7614138438880707, + "grad_norm": 0.318095326423645, + "learning_rate": 1.9149305811744456e-07, + "loss": 0.3944, + "step": 3750 + }, + { + "epoch": 2.7621502209131075, + "grad_norm": 0.32870233058929443, + "learning_rate": 1.9032037157598494e-07, + "loss": 0.3801, + "step": 3751 + }, + { + "epoch": 2.7628865979381443, + "grad_norm": 0.30096235871315, + "learning_rate": 1.891512171178994e-07, + "loss": 0.3364, + "step": 3752 + }, + { + "epoch": 2.763622974963181, + "grad_norm": 0.33072760701179504, + "learning_rate": 1.8798559560178174e-07, + "loss": 0.3867, + "step": 3753 + }, + { + "epoch": 2.764359351988218, + "grad_norm": 0.34106144309043884, + "learning_rate": 1.8682350788362892e-07, + "loss": 0.3824, + "step": 3754 + }, + { + "epoch": 2.7650957290132547, + "grad_norm": 0.3276694416999817, + "learning_rate": 1.856649548168421e-07, + "loss": 0.3463, + "step": 3755 + }, + { + "epoch": 2.7658321060382915, + "grad_norm": 0.3228393793106079, + "learning_rate": 1.8450993725222856e-07, + "loss": 0.3746, + "step": 3756 + }, + { + "epoch": 2.7665684830633284, + "grad_norm": 0.3612479567527771, + "learning_rate": 1.8335845603799806e-07, + "loss": 0.3635, + "step": 3757 + }, + { + "epoch": 2.767304860088365, + "grad_norm": 0.31587934494018555, + "learning_rate": 1.8221051201976315e-07, + "loss": 0.3564, + "step": 3758 + }, + { + "epoch": 2.768041237113402, + "grad_norm": 0.3175513446331024, + "learning_rate": 1.810661060405411e-07, + "loss": 0.3706, + "step": 3759 + }, + { + "epoch": 2.768777614138439, + "grad_norm": 0.3018471896648407, + "learning_rate": 1.7992523894074688e-07, + "loss": 0.3492, + "step": 3760 + }, + { + "epoch": 2.7695139911634756, + "grad_norm": 0.3039242625236511, + "learning_rate": 1.7878791155819918e-07, + "loss": 0.3739, + "step": 3761 + }, + { + "epoch": 2.7702503681885124, + "grad_norm": 0.306921124458313, + "learning_rate": 1.776541247281177e-07, + "loss": 0.3801, + "step": 3762 + }, + { + "epoch": 2.7709867452135493, + "grad_norm": 0.3310092091560364, + "learning_rate": 1.7652387928311977e-07, + "loss": 0.381, + "step": 3763 + }, + { + "epoch": 2.771723122238586, + "grad_norm": 0.3500930368900299, + "learning_rate": 1.7539717605322527e-07, + "loss": 0.3488, + "step": 3764 + }, + { + "epoch": 2.772459499263623, + "grad_norm": 0.3258472681045532, + "learning_rate": 1.7427401586585068e-07, + "loss": 0.3674, + "step": 3765 + }, + { + "epoch": 2.7731958762886597, + "grad_norm": 0.30289342999458313, + "learning_rate": 1.731543995458096e-07, + "loss": 0.3477, + "step": 3766 + }, + { + "epoch": 2.7739322533136965, + "grad_norm": 0.29461464285850525, + "learning_rate": 1.7203832791531594e-07, + "loss": 0.3893, + "step": 3767 + }, + { + "epoch": 2.7746686303387333, + "grad_norm": 0.3397156298160553, + "learning_rate": 1.7092580179397856e-07, + "loss": 0.3684, + "step": 3768 + }, + { + "epoch": 2.77540500736377, + "grad_norm": 0.3199734687805176, + "learning_rate": 1.6981682199880167e-07, + "loss": 0.3924, + "step": 3769 + }, + { + "epoch": 2.776141384388807, + "grad_norm": 0.3441019654273987, + "learning_rate": 1.6871138934418884e-07, + "loss": 0.3684, + "step": 3770 + }, + { + "epoch": 2.776877761413844, + "grad_norm": 0.30693432688713074, + "learning_rate": 1.676095046419346e-07, + "loss": 0.3689, + "step": 3771 + }, + { + "epoch": 2.7776141384388806, + "grad_norm": 0.3343988358974457, + "learning_rate": 1.6651116870122997e-07, + "loss": 0.3952, + "step": 3772 + }, + { + "epoch": 2.7783505154639174, + "grad_norm": 0.3290632665157318, + "learning_rate": 1.654163823286603e-07, + "loss": 0.3819, + "step": 3773 + }, + { + "epoch": 2.7790868924889542, + "grad_norm": 0.3326128423213959, + "learning_rate": 1.6432514632820363e-07, + "loss": 0.3808, + "step": 3774 + }, + { + "epoch": 2.779823269513991, + "grad_norm": 0.3110780417919159, + "learning_rate": 1.6323746150123e-07, + "loss": 0.3911, + "step": 3775 + }, + { + "epoch": 2.780559646539028, + "grad_norm": 0.3054400682449341, + "learning_rate": 1.6215332864650434e-07, + "loss": 0.3856, + "step": 3776 + }, + { + "epoch": 2.7812960235640647, + "grad_norm": 0.3271074891090393, + "learning_rate": 1.6107274856017763e-07, + "loss": 0.3531, + "step": 3777 + }, + { + "epoch": 2.7820324005891015, + "grad_norm": 0.3214995563030243, + "learning_rate": 1.5999572203579783e-07, + "loss": 0.3729, + "step": 3778 + }, + { + "epoch": 2.7827687776141383, + "grad_norm": 0.3266492486000061, + "learning_rate": 1.5892224986430006e-07, + "loss": 0.384, + "step": 3779 + }, + { + "epoch": 2.783505154639175, + "grad_norm": 0.2639772295951843, + "learning_rate": 1.578523328340087e-07, + "loss": 0.3665, + "step": 3780 + }, + { + "epoch": 2.784241531664212, + "grad_norm": 0.36343199014663696, + "learning_rate": 1.5678597173064026e-07, + "loss": 0.3718, + "step": 3781 + }, + { + "epoch": 2.7849779086892488, + "grad_norm": 0.36233747005462646, + "learning_rate": 1.5572316733729775e-07, + "loss": 0.3556, + "step": 3782 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 0.33394989371299744, + "learning_rate": 1.5466392043447132e-07, + "loss": 0.3592, + "step": 3783 + }, + { + "epoch": 2.7864506627393224, + "grad_norm": 0.3197415769100189, + "learning_rate": 1.5360823180004146e-07, + "loss": 0.3903, + "step": 3784 + }, + { + "epoch": 2.787187039764359, + "grad_norm": 0.3251337707042694, + "learning_rate": 1.5255610220927252e-07, + "loss": 0.3953, + "step": 3785 + }, + { + "epoch": 2.787923416789396, + "grad_norm": 0.33090558648109436, + "learning_rate": 1.515075324348181e-07, + "loss": 0.3659, + "step": 3786 + }, + { + "epoch": 2.788659793814433, + "grad_norm": 0.31915581226348877, + "learning_rate": 1.504625232467155e-07, + "loss": 0.3764, + "step": 3787 + }, + { + "epoch": 2.7893961708394697, + "grad_norm": 0.34577444195747375, + "learning_rate": 1.4942107541238705e-07, + "loss": 0.3593, + "step": 3788 + }, + { + "epoch": 2.7901325478645065, + "grad_norm": 0.3278156816959381, + "learning_rate": 1.48383189696642e-07, + "loss": 0.3688, + "step": 3789 + }, + { + "epoch": 2.7908689248895433, + "grad_norm": 0.31039220094680786, + "learning_rate": 1.4734886686167182e-07, + "loss": 0.3732, + "step": 3790 + }, + { + "epoch": 2.79160530191458, + "grad_norm": 0.31084826588630676, + "learning_rate": 1.4631810766705112e-07, + "loss": 0.35, + "step": 3791 + }, + { + "epoch": 2.792341678939617, + "grad_norm": 0.29743996262550354, + "learning_rate": 1.4529091286973994e-07, + "loss": 0.3907, + "step": 3792 + }, + { + "epoch": 2.7930780559646537, + "grad_norm": 0.30235204100608826, + "learning_rate": 1.4426728322407822e-07, + "loss": 0.4054, + "step": 3793 + }, + { + "epoch": 2.7938144329896906, + "grad_norm": 0.31857481598854065, + "learning_rate": 1.4324721948178743e-07, + "loss": 0.377, + "step": 3794 + }, + { + "epoch": 2.7945508100147274, + "grad_norm": 0.3209660053253174, + "learning_rate": 1.4223072239197333e-07, + "loss": 0.3558, + "step": 3795 + }, + { + "epoch": 2.795287187039764, + "grad_norm": 0.3144548833370209, + "learning_rate": 1.412177927011199e-07, + "loss": 0.3617, + "step": 3796 + }, + { + "epoch": 2.796023564064801, + "grad_norm": 0.32282963395118713, + "learning_rate": 1.4020843115309213e-07, + "loss": 0.4057, + "step": 3797 + }, + { + "epoch": 2.796759941089838, + "grad_norm": 0.3219519555568695, + "learning_rate": 1.3920263848913484e-07, + "loss": 0.3917, + "step": 3798 + }, + { + "epoch": 2.7974963181148746, + "grad_norm": 0.32549378275871277, + "learning_rate": 1.3820041544787167e-07, + "loss": 0.3979, + "step": 3799 + }, + { + "epoch": 2.7982326951399115, + "grad_norm": 0.36029815673828125, + "learning_rate": 1.372017627653044e-07, + "loss": 0.36, + "step": 3800 + }, + { + "epoch": 2.7989690721649483, + "grad_norm": 0.29594072699546814, + "learning_rate": 1.3620668117481471e-07, + "loss": 0.3597, + "step": 3801 + }, + { + "epoch": 2.799705449189985, + "grad_norm": 0.30883079767227173, + "learning_rate": 1.3521517140715867e-07, + "loss": 0.3621, + "step": 3802 + }, + { + "epoch": 2.800441826215022, + "grad_norm": 0.29919102787971497, + "learning_rate": 1.3422723419047267e-07, + "loss": 0.3504, + "step": 3803 + }, + { + "epoch": 2.8011782032400587, + "grad_norm": 0.3039191663265228, + "learning_rate": 1.332428702502675e-07, + "loss": 0.37, + "step": 3804 + }, + { + "epoch": 2.8019145802650955, + "grad_norm": 0.31316202878952026, + "learning_rate": 1.3226208030942934e-07, + "loss": 0.3754, + "step": 3805 + }, + { + "epoch": 2.8026509572901324, + "grad_norm": 0.2829436957836151, + "learning_rate": 1.3128486508822202e-07, + "loss": 0.3824, + "step": 3806 + }, + { + "epoch": 2.803387334315169, + "grad_norm": 0.3167963922023773, + "learning_rate": 1.3031122530428264e-07, + "loss": 0.3699, + "step": 3807 + }, + { + "epoch": 2.804123711340206, + "grad_norm": 0.3057669997215271, + "learning_rate": 1.2934116167262145e-07, + "loss": 0.3758, + "step": 3808 + }, + { + "epoch": 2.804860088365243, + "grad_norm": 0.3250826895236969, + "learning_rate": 1.2837467490562583e-07, + "loss": 0.3437, + "step": 3809 + }, + { + "epoch": 2.8055964653902796, + "grad_norm": 0.3518430292606354, + "learning_rate": 1.274117657130536e-07, + "loss": 0.3829, + "step": 3810 + }, + { + "epoch": 2.8063328424153164, + "grad_norm": 0.32420122623443604, + "learning_rate": 1.2645243480203574e-07, + "loss": 0.3743, + "step": 3811 + }, + { + "epoch": 2.8070692194403533, + "grad_norm": 0.3204830586910248, + "learning_rate": 1.254966828770765e-07, + "loss": 0.3848, + "step": 3812 + }, + { + "epoch": 2.80780559646539, + "grad_norm": 0.3180425763130188, + "learning_rate": 1.2454451064005058e-07, + "loss": 0.39, + "step": 3813 + }, + { + "epoch": 2.808541973490427, + "grad_norm": 0.32775330543518066, + "learning_rate": 1.2359591879020528e-07, + "loss": 0.376, + "step": 3814 + }, + { + "epoch": 2.8092783505154637, + "grad_norm": 0.34739986062049866, + "learning_rate": 1.2265090802415724e-07, + "loss": 0.3723, + "step": 3815 + }, + { + "epoch": 2.8100147275405005, + "grad_norm": 0.3330659866333008, + "learning_rate": 1.217094790358936e-07, + "loss": 0.3697, + "step": 3816 + }, + { + "epoch": 2.8107511045655373, + "grad_norm": 0.30431681871414185, + "learning_rate": 1.2077163251677182e-07, + "loss": 0.3748, + "step": 3817 + }, + { + "epoch": 2.811487481590574, + "grad_norm": 0.33776891231536865, + "learning_rate": 1.1983736915551824e-07, + "loss": 0.3889, + "step": 3818 + }, + { + "epoch": 2.812223858615611, + "grad_norm": 0.3244916498661041, + "learning_rate": 1.1890668963822793e-07, + "loss": 0.379, + "step": 3819 + }, + { + "epoch": 2.812960235640648, + "grad_norm": 0.3201790750026703, + "learning_rate": 1.179795946483625e-07, + "loss": 0.3578, + "step": 3820 + }, + { + "epoch": 2.8136966126656846, + "grad_norm": 0.32924962043762207, + "learning_rate": 1.170560848667529e-07, + "loss": 0.3561, + "step": 3821 + }, + { + "epoch": 2.8144329896907214, + "grad_norm": 0.318311870098114, + "learning_rate": 1.1613616097159774e-07, + "loss": 0.3588, + "step": 3822 + }, + { + "epoch": 2.8151693667157582, + "grad_norm": 0.30944523215293884, + "learning_rate": 1.1521982363846051e-07, + "loss": 0.3534, + "step": 3823 + }, + { + "epoch": 2.815905743740795, + "grad_norm": 0.3113093078136444, + "learning_rate": 1.1430707354027182e-07, + "loss": 0.3749, + "step": 3824 + }, + { + "epoch": 2.816642120765832, + "grad_norm": 0.298728346824646, + "learning_rate": 1.1339791134732769e-07, + "loss": 0.3704, + "step": 3825 + }, + { + "epoch": 2.8173784977908687, + "grad_norm": 0.30721381306648254, + "learning_rate": 1.1249233772729018e-07, + "loss": 0.403, + "step": 3826 + }, + { + "epoch": 2.8181148748159055, + "grad_norm": 0.3090912997722626, + "learning_rate": 1.1159035334518343e-07, + "loss": 0.3721, + "step": 3827 + }, + { + "epoch": 2.8188512518409423, + "grad_norm": 0.30627208948135376, + "learning_rate": 1.1069195886339923e-07, + "loss": 0.3677, + "step": 3828 + }, + { + "epoch": 2.819587628865979, + "grad_norm": 0.32268771529197693, + "learning_rate": 1.0979715494169096e-07, + "loss": 0.3591, + "step": 3829 + }, + { + "epoch": 2.820324005891016, + "grad_norm": 0.33059263229370117, + "learning_rate": 1.089059422371741e-07, + "loss": 0.3579, + "step": 3830 + }, + { + "epoch": 2.8210603829160528, + "grad_norm": 0.3322978913784027, + "learning_rate": 1.0801832140433066e-07, + "loss": 0.3856, + "step": 3831 + }, + { + "epoch": 2.8217967599410896, + "grad_norm": 0.3347814977169037, + "learning_rate": 1.071342930950009e-07, + "loss": 0.3722, + "step": 3832 + }, + { + "epoch": 2.8225331369661264, + "grad_norm": 0.34562426805496216, + "learning_rate": 1.0625385795838883e-07, + "loss": 0.3687, + "step": 3833 + }, + { + "epoch": 2.823269513991163, + "grad_norm": 0.3513891100883484, + "learning_rate": 1.0537701664106003e-07, + "loss": 0.3737, + "step": 3834 + }, + { + "epoch": 2.8240058910162, + "grad_norm": 0.31909000873565674, + "learning_rate": 1.0450376978693999e-07, + "loss": 0.3695, + "step": 3835 + }, + { + "epoch": 2.824742268041237, + "grad_norm": 0.3165963292121887, + "learning_rate": 1.0363411803731404e-07, + "loss": 0.3876, + "step": 3836 + }, + { + "epoch": 2.8254786450662737, + "grad_norm": 0.32449856400489807, + "learning_rate": 1.0276806203082967e-07, + "loss": 0.3678, + "step": 3837 + }, + { + "epoch": 2.8262150220913105, + "grad_norm": 0.3418518006801605, + "learning_rate": 1.0190560240349035e-07, + "loss": 0.3594, + "step": 3838 + }, + { + "epoch": 2.8269513991163473, + "grad_norm": 0.32669028639793396, + "learning_rate": 1.0104673978866164e-07, + "loss": 0.3804, + "step": 3839 + }, + { + "epoch": 2.827687776141384, + "grad_norm": 0.2968961298465729, + "learning_rate": 1.0019147481706626e-07, + "loss": 0.3726, + "step": 3840 + }, + { + "epoch": 2.8284241531664214, + "grad_norm": 0.3117745816707611, + "learning_rate": 9.933980811678401e-08, + "loss": 0.3972, + "step": 3841 + }, + { + "epoch": 2.829160530191458, + "grad_norm": 0.3052343428134918, + "learning_rate": 9.84917403132546e-08, + "loss": 0.3743, + "step": 3842 + }, + { + "epoch": 2.829896907216495, + "grad_norm": 0.3451398015022278, + "learning_rate": 9.764727202927259e-08, + "loss": 0.3921, + "step": 3843 + }, + { + "epoch": 2.830633284241532, + "grad_norm": 0.3272383213043213, + "learning_rate": 9.680640388498974e-08, + "loss": 0.3752, + "step": 3844 + }, + { + "epoch": 2.8313696612665686, + "grad_norm": 0.3403890132904053, + "learning_rate": 9.596913649791484e-08, + "loss": 0.3564, + "step": 3845 + }, + { + "epoch": 2.8321060382916055, + "grad_norm": 0.3162234425544739, + "learning_rate": 9.51354704829105e-08, + "loss": 0.3515, + "step": 3846 + }, + { + "epoch": 2.8328424153166423, + "grad_norm": 0.31846678256988525, + "learning_rate": 9.430540645219755e-08, + "loss": 0.374, + "step": 3847 + }, + { + "epoch": 2.833578792341679, + "grad_norm": 0.32757052779197693, + "learning_rate": 9.347894501534949e-08, + "loss": 0.3537, + "step": 3848 + }, + { + "epoch": 2.834315169366716, + "grad_norm": 0.32147282361984253, + "learning_rate": 9.26560867792936e-08, + "loss": 0.3824, + "step": 3849 + }, + { + "epoch": 2.8350515463917527, + "grad_norm": 0.34874793887138367, + "learning_rate": 9.18368323483132e-08, + "loss": 0.3895, + "step": 3850 + }, + { + "epoch": 2.8357879234167895, + "grad_norm": 0.29230934381484985, + "learning_rate": 9.102118232404311e-08, + "loss": 0.391, + "step": 3851 + }, + { + "epoch": 2.8365243004418264, + "grad_norm": 0.33275964856147766, + "learning_rate": 9.020913730547309e-08, + "loss": 0.3728, + "step": 3852 + }, + { + "epoch": 2.837260677466863, + "grad_norm": 0.36540740728378296, + "learning_rate": 8.940069788894389e-08, + "loss": 0.3738, + "step": 3853 + }, + { + "epoch": 2.8379970544919, + "grad_norm": 0.29792320728302, + "learning_rate": 8.859586466814895e-08, + "loss": 0.3478, + "step": 3854 + }, + { + "epoch": 2.838733431516937, + "grad_norm": 0.35100606083869934, + "learning_rate": 8.77946382341327e-08, + "loss": 0.3724, + "step": 3855 + }, + { + "epoch": 2.8394698085419736, + "grad_norm": 0.31920912861824036, + "learning_rate": 8.699701917529335e-08, + "loss": 0.3789, + "step": 3856 + }, + { + "epoch": 2.8402061855670104, + "grad_norm": 0.3164528012275696, + "learning_rate": 8.62030080773768e-08, + "loss": 0.3715, + "step": 3857 + }, + { + "epoch": 2.8409425625920472, + "grad_norm": 0.3255048990249634, + "learning_rate": 8.541260552348107e-08, + "loss": 0.3844, + "step": 3858 + }, + { + "epoch": 2.841678939617084, + "grad_norm": 0.3301268517971039, + "learning_rate": 8.462581209405519e-08, + "loss": 0.4026, + "step": 3859 + }, + { + "epoch": 2.842415316642121, + "grad_norm": 0.3406490087509155, + "learning_rate": 8.384262836689472e-08, + "loss": 0.3798, + "step": 3860 + }, + { + "epoch": 2.8431516936671577, + "grad_norm": 0.30102699995040894, + "learning_rate": 8.306305491714683e-08, + "loss": 0.3826, + "step": 3861 + }, + { + "epoch": 2.8438880706921945, + "grad_norm": 0.30409371852874756, + "learning_rate": 8.228709231730747e-08, + "loss": 0.3754, + "step": 3862 + }, + { + "epoch": 2.8446244477172313, + "grad_norm": 0.30450791120529175, + "learning_rate": 8.151474113721803e-08, + "loss": 0.381, + "step": 3863 + }, + { + "epoch": 2.845360824742268, + "grad_norm": 0.3098084628582001, + "learning_rate": 8.074600194407257e-08, + "loss": 0.3558, + "step": 3864 + }, + { + "epoch": 2.846097201767305, + "grad_norm": 0.315789133310318, + "learning_rate": 7.998087530240784e-08, + "loss": 0.3601, + "step": 3865 + }, + { + "epoch": 2.846833578792342, + "grad_norm": 0.3192322552204132, + "learning_rate": 7.921936177411049e-08, + "loss": 0.4126, + "step": 3866 + }, + { + "epoch": 2.8475699558173786, + "grad_norm": 0.3339972198009491, + "learning_rate": 7.846146191841319e-08, + "loss": 0.4005, + "step": 3867 + }, + { + "epoch": 2.8483063328424154, + "grad_norm": 0.31077709794044495, + "learning_rate": 7.770717629189462e-08, + "loss": 0.3869, + "step": 3868 + }, + { + "epoch": 2.8490427098674522, + "grad_norm": 0.327124685049057, + "learning_rate": 7.695650544847888e-08, + "loss": 0.3628, + "step": 3869 + }, + { + "epoch": 2.849779086892489, + "grad_norm": 0.32845091819763184, + "learning_rate": 7.620944993943669e-08, + "loss": 0.37, + "step": 3870 + }, + { + "epoch": 2.850515463917526, + "grad_norm": 0.32456862926483154, + "learning_rate": 7.546601031338252e-08, + "loss": 0.3459, + "step": 3871 + }, + { + "epoch": 2.8512518409425627, + "grad_norm": 0.31279054284095764, + "learning_rate": 7.472618711627577e-08, + "loss": 0.3533, + "step": 3872 + }, + { + "epoch": 2.8519882179675995, + "grad_norm": 0.31263667345046997, + "learning_rate": 7.398998089142128e-08, + "loss": 0.3837, + "step": 3873 + }, + { + "epoch": 2.8527245949926363, + "grad_norm": 0.31096217036247253, + "learning_rate": 7.325739217946547e-08, + "loss": 0.4039, + "step": 3874 + }, + { + "epoch": 2.853460972017673, + "grad_norm": 0.31137025356292725, + "learning_rate": 7.252842151839967e-08, + "loss": 0.3707, + "step": 3875 + }, + { + "epoch": 2.85419734904271, + "grad_norm": 0.3208453953266144, + "learning_rate": 7.180306944355896e-08, + "loss": 0.3662, + "step": 3876 + }, + { + "epoch": 2.8549337260677468, + "grad_norm": 0.3515404164791107, + "learning_rate": 7.108133648761839e-08, + "loss": 0.3336, + "step": 3877 + }, + { + "epoch": 2.8556701030927836, + "grad_norm": 0.32757940888404846, + "learning_rate": 7.036322318059785e-08, + "loss": 0.3916, + "step": 3878 + }, + { + "epoch": 2.8564064801178204, + "grad_norm": 0.3318811357021332, + "learning_rate": 6.964873004985717e-08, + "loss": 0.339, + "step": 3879 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.28970828652381897, + "learning_rate": 6.893785762009942e-08, + "loss": 0.376, + "step": 3880 + }, + { + "epoch": 2.857879234167894, + "grad_norm": 0.30351996421813965, + "learning_rate": 6.823060641336809e-08, + "loss": 0.3666, + "step": 3881 + }, + { + "epoch": 2.858615611192931, + "grad_norm": 0.2939937710762024, + "learning_rate": 6.752697694904553e-08, + "loss": 0.3697, + "step": 3882 + }, + { + "epoch": 2.8593519882179677, + "grad_norm": 0.31340742111206055, + "learning_rate": 6.682696974385727e-08, + "loss": 0.3581, + "step": 3883 + }, + { + "epoch": 2.8600883652430045, + "grad_norm": 0.3075888156890869, + "learning_rate": 6.613058531186767e-08, + "loss": 0.4048, + "step": 3884 + }, + { + "epoch": 2.8608247422680413, + "grad_norm": 0.3178377151489258, + "learning_rate": 6.54378241644793e-08, + "loss": 0.3927, + "step": 3885 + }, + { + "epoch": 2.861561119293078, + "grad_norm": 0.31362468004226685, + "learning_rate": 6.474868681043578e-08, + "loss": 0.3953, + "step": 3886 + }, + { + "epoch": 2.862297496318115, + "grad_norm": 0.3346673250198364, + "learning_rate": 6.406317375581839e-08, + "loss": 0.3578, + "step": 3887 + }, + { + "epoch": 2.8630338733431517, + "grad_norm": 0.31339147686958313, + "learning_rate": 6.338128550404721e-08, + "loss": 0.377, + "step": 3888 + }, + { + "epoch": 2.8637702503681886, + "grad_norm": 0.31569039821624756, + "learning_rate": 6.270302255588112e-08, + "loss": 0.372, + "step": 3889 + }, + { + "epoch": 2.8645066273932254, + "grad_norm": 0.33136993646621704, + "learning_rate": 6.202838540941503e-08, + "loss": 0.4067, + "step": 3890 + }, + { + "epoch": 2.865243004418262, + "grad_norm": 0.3213663101196289, + "learning_rate": 6.135737456008207e-08, + "loss": 0.3936, + "step": 3891 + }, + { + "epoch": 2.865979381443299, + "grad_norm": 0.29569748044013977, + "learning_rate": 6.06899905006525e-08, + "loss": 0.3815, + "step": 3892 + }, + { + "epoch": 2.866715758468336, + "grad_norm": 0.31531018018722534, + "learning_rate": 6.002623372123373e-08, + "loss": 0.377, + "step": 3893 + }, + { + "epoch": 2.8674521354933726, + "grad_norm": 0.31117159128189087, + "learning_rate": 5.9366104709267515e-08, + "loss": 0.3674, + "step": 3894 + }, + { + "epoch": 2.8681885125184094, + "grad_norm": 0.3713977634906769, + "learning_rate": 5.8709603949533844e-08, + "loss": 0.3724, + "step": 3895 + }, + { + "epoch": 2.8689248895434463, + "grad_norm": 0.30198606848716736, + "learning_rate": 5.805673192414596e-08, + "loss": 0.4026, + "step": 3896 + }, + { + "epoch": 2.869661266568483, + "grad_norm": 0.30127283930778503, + "learning_rate": 5.740748911255367e-08, + "loss": 0.3914, + "step": 3897 + }, + { + "epoch": 2.87039764359352, + "grad_norm": 0.3306448757648468, + "learning_rate": 5.6761875991541704e-08, + "loss": 0.3497, + "step": 3898 + }, + { + "epoch": 2.8711340206185567, + "grad_norm": 0.3268984258174896, + "learning_rate": 5.611989303522858e-08, + "loss": 0.3916, + "step": 3899 + }, + { + "epoch": 2.8718703976435935, + "grad_norm": 0.3211970627307892, + "learning_rate": 5.5481540715066616e-08, + "loss": 0.365, + "step": 3900 + }, + { + "epoch": 2.8726067746686303, + "grad_norm": 0.336363822221756, + "learning_rate": 5.4846819499843605e-08, + "loss": 0.3842, + "step": 3901 + }, + { + "epoch": 2.873343151693667, + "grad_norm": 0.3143905997276306, + "learning_rate": 5.4215729855678914e-08, + "loss": 0.3844, + "step": 3902 + }, + { + "epoch": 2.874079528718704, + "grad_norm": 0.35480746626853943, + "learning_rate": 5.35882722460257e-08, + "loss": 0.3772, + "step": 3903 + }, + { + "epoch": 2.874815905743741, + "grad_norm": 0.3486427366733551, + "learning_rate": 5.296444713166981e-08, + "loss": 0.3757, + "step": 3904 + }, + { + "epoch": 2.8755522827687776, + "grad_norm": 0.3225955367088318, + "learning_rate": 5.234425497072981e-08, + "loss": 0.3417, + "step": 3905 + }, + { + "epoch": 2.8762886597938144, + "grad_norm": 0.31462594866752625, + "learning_rate": 5.172769621865637e-08, + "loss": 0.4036, + "step": 3906 + }, + { + "epoch": 2.8770250368188512, + "grad_norm": 0.3221474885940552, + "learning_rate": 5.1114771328230615e-08, + "loss": 0.3568, + "step": 3907 + }, + { + "epoch": 2.877761413843888, + "grad_norm": 0.319514662027359, + "learning_rate": 5.050548074956696e-08, + "loss": 0.359, + "step": 3908 + }, + { + "epoch": 2.878497790868925, + "grad_norm": 0.32837367057800293, + "learning_rate": 4.9899824930109694e-08, + "loss": 0.3702, + "step": 3909 + }, + { + "epoch": 2.8792341678939617, + "grad_norm": 0.30610325932502747, + "learning_rate": 4.9297804314633604e-08, + "loss": 0.3641, + "step": 3910 + }, + { + "epoch": 2.8799705449189985, + "grad_norm": 0.32444047927856445, + "learning_rate": 4.869941934524613e-08, + "loss": 0.3677, + "step": 3911 + }, + { + "epoch": 2.8807069219440353, + "grad_norm": 0.3461378216743469, + "learning_rate": 4.810467046138134e-08, + "loss": 0.3876, + "step": 3912 + }, + { + "epoch": 2.881443298969072, + "grad_norm": 0.32242852449417114, + "learning_rate": 4.75135580998054e-08, + "loss": 0.349, + "step": 3913 + }, + { + "epoch": 2.882179675994109, + "grad_norm": 0.31807756423950195, + "learning_rate": 4.69260826946133e-08, + "loss": 0.3642, + "step": 3914 + }, + { + "epoch": 2.8829160530191458, + "grad_norm": 0.31694477796554565, + "learning_rate": 4.634224467722992e-08, + "loss": 0.3758, + "step": 3915 + }, + { + "epoch": 2.8836524300441826, + "grad_norm": 0.343872994184494, + "learning_rate": 4.576204447640675e-08, + "loss": 0.3576, + "step": 3916 + }, + { + "epoch": 2.8843888070692194, + "grad_norm": 0.3494185209274292, + "learning_rate": 4.518548251822685e-08, + "loss": 0.3636, + "step": 3917 + }, + { + "epoch": 2.8851251840942562, + "grad_norm": 0.32793089747428894, + "learning_rate": 4.461255922609986e-08, + "loss": 0.3672, + "step": 3918 + }, + { + "epoch": 2.885861561119293, + "grad_norm": 0.36390119791030884, + "learning_rate": 4.4043275020762e-08, + "loss": 0.3504, + "step": 3919 + }, + { + "epoch": 2.88659793814433, + "grad_norm": 0.3462429344654083, + "learning_rate": 4.3477630320279405e-08, + "loss": 0.3632, + "step": 3920 + }, + { + "epoch": 2.8873343151693667, + "grad_norm": 0.3237389326095581, + "learning_rate": 4.291562554004369e-08, + "loss": 0.3802, + "step": 3921 + }, + { + "epoch": 2.8880706921944035, + "grad_norm": 0.3111920654773712, + "learning_rate": 4.235726109277527e-08, + "loss": 0.3409, + "step": 3922 + }, + { + "epoch": 2.8888070692194403, + "grad_norm": 0.2972390949726105, + "learning_rate": 4.180253738851947e-08, + "loss": 0.3955, + "step": 3923 + }, + { + "epoch": 2.889543446244477, + "grad_norm": 0.32099413871765137, + "learning_rate": 4.125145483464821e-08, + "loss": 0.3673, + "step": 3924 + }, + { + "epoch": 2.890279823269514, + "grad_norm": 0.31523367762565613, + "learning_rate": 4.070401383586109e-08, + "loss": 0.354, + "step": 3925 + }, + { + "epoch": 2.8910162002945508, + "grad_norm": 0.3093765079975128, + "learning_rate": 4.0160214794180976e-08, + "loss": 0.38, + "step": 3926 + }, + { + "epoch": 2.8917525773195876, + "grad_norm": 0.3105464577674866, + "learning_rate": 3.962005810895786e-08, + "loss": 0.3578, + "step": 3927 + }, + { + "epoch": 2.8924889543446244, + "grad_norm": 0.3610590994358063, + "learning_rate": 3.908354417686722e-08, + "loss": 0.3708, + "step": 3928 + }, + { + "epoch": 2.893225331369661, + "grad_norm": 0.328524112701416, + "learning_rate": 3.855067339190721e-08, + "loss": 0.3849, + "step": 3929 + }, + { + "epoch": 2.893961708394698, + "grad_norm": 0.3390013873577118, + "learning_rate": 3.802144614540315e-08, + "loss": 0.3637, + "step": 3930 + }, + { + "epoch": 2.894698085419735, + "grad_norm": 0.3347131907939911, + "learning_rate": 3.749586282600359e-08, + "loss": 0.3575, + "step": 3931 + }, + { + "epoch": 2.8954344624447717, + "grad_norm": 0.3055760860443115, + "learning_rate": 3.6973923819680344e-08, + "loss": 0.3632, + "step": 3932 + }, + { + "epoch": 2.8961708394698085, + "grad_norm": 0.31726714968681335, + "learning_rate": 3.645562950973014e-08, + "loss": 0.3791, + "step": 3933 + }, + { + "epoch": 2.8969072164948453, + "grad_norm": 0.34960147738456726, + "learning_rate": 3.5940980276772394e-08, + "loss": 0.3802, + "step": 3934 + }, + { + "epoch": 2.897643593519882, + "grad_norm": 0.3189311921596527, + "learning_rate": 3.5429976498749794e-08, + "loss": 0.3893, + "step": 3935 + }, + { + "epoch": 2.898379970544919, + "grad_norm": 0.3356260359287262, + "learning_rate": 3.492261855092938e-08, + "loss": 0.3665, + "step": 3936 + }, + { + "epoch": 2.8991163475699557, + "grad_norm": 0.31776583194732666, + "learning_rate": 3.441890680589754e-08, + "loss": 0.3619, + "step": 3937 + }, + { + "epoch": 2.8998527245949925, + "grad_norm": 0.3313503563404083, + "learning_rate": 3.391884163356618e-08, + "loss": 0.3877, + "step": 3938 + }, + { + "epoch": 2.9005891016200294, + "grad_norm": 0.38583609461784363, + "learning_rate": 3.3422423401167634e-08, + "loss": 0.3696, + "step": 3939 + }, + { + "epoch": 2.901325478645066, + "grad_norm": 0.33413827419281006, + "learning_rate": 3.292965247325641e-08, + "loss": 0.373, + "step": 3940 + }, + { + "epoch": 2.902061855670103, + "grad_norm": 0.331826388835907, + "learning_rate": 3.2440529211709146e-08, + "loss": 0.3597, + "step": 3941 + }, + { + "epoch": 2.90279823269514, + "grad_norm": 0.33604195713996887, + "learning_rate": 3.19550539757224e-08, + "loss": 0.3729, + "step": 3942 + }, + { + "epoch": 2.9035346097201766, + "grad_norm": 0.3193381130695343, + "learning_rate": 3.147322712181489e-08, + "loss": 0.351, + "step": 3943 + }, + { + "epoch": 2.9042709867452134, + "grad_norm": 0.30179837346076965, + "learning_rate": 3.0995049003826325e-08, + "loss": 0.3728, + "step": 3944 + }, + { + "epoch": 2.9050073637702503, + "grad_norm": 0.32953178882598877, + "learning_rate": 3.052051997291527e-08, + "loss": 0.3682, + "step": 3945 + }, + { + "epoch": 2.905743740795287, + "grad_norm": 0.3493231236934662, + "learning_rate": 3.0049640377561865e-08, + "loss": 0.36, + "step": 3946 + }, + { + "epoch": 2.906480117820324, + "grad_norm": 0.332707941532135, + "learning_rate": 2.9582410563565587e-08, + "loss": 0.3774, + "step": 3947 + }, + { + "epoch": 2.9072164948453607, + "grad_norm": 0.326961874961853, + "learning_rate": 2.9118830874046988e-08, + "loss": 0.3633, + "step": 3948 + }, + { + "epoch": 2.9079528718703975, + "grad_norm": 0.34812161326408386, + "learning_rate": 2.8658901649443183e-08, + "loss": 0.3605, + "step": 3949 + }, + { + "epoch": 2.9086892488954343, + "grad_norm": 0.32744961977005005, + "learning_rate": 2.8202623227513993e-08, + "loss": 0.3688, + "step": 3950 + }, + { + "epoch": 2.909425625920471, + "grad_norm": 0.3743055462837219, + "learning_rate": 2.7749995943335272e-08, + "loss": 0.3543, + "step": 3951 + }, + { + "epoch": 2.910162002945508, + "grad_norm": 0.3317681550979614, + "learning_rate": 2.730102012930336e-08, + "loss": 0.3662, + "step": 3952 + }, + { + "epoch": 2.910898379970545, + "grad_norm": 0.36964574456214905, + "learning_rate": 2.6855696115133388e-08, + "loss": 0.3509, + "step": 3953 + }, + { + "epoch": 2.9116347569955816, + "grad_norm": 0.3326326608657837, + "learning_rate": 2.6414024227855994e-08, + "loss": 0.3931, + "step": 3954 + }, + { + "epoch": 2.9123711340206184, + "grad_norm": 0.3078947961330414, + "learning_rate": 2.597600479182283e-08, + "loss": 0.3566, + "step": 3955 + }, + { + "epoch": 2.9131075110456552, + "grad_norm": 0.3276365399360657, + "learning_rate": 2.5541638128702694e-08, + "loss": 0.3809, + "step": 3956 + }, + { + "epoch": 2.913843888070692, + "grad_norm": 0.31601178646087646, + "learning_rate": 2.511092455747932e-08, + "loss": 0.4119, + "step": 3957 + }, + { + "epoch": 2.914580265095729, + "grad_norm": 0.34153667092323303, + "learning_rate": 2.4683864394458023e-08, + "loss": 0.3436, + "step": 3958 + }, + { + "epoch": 2.9153166421207657, + "grad_norm": 0.3269186019897461, + "learning_rate": 2.4260457953257377e-08, + "loss": 0.3722, + "step": 3959 + }, + { + "epoch": 2.9160530191458025, + "grad_norm": 0.3348276913166046, + "learning_rate": 2.3840705544815324e-08, + "loss": 0.3758, + "step": 3960 + }, + { + "epoch": 2.9167893961708393, + "grad_norm": 0.3003559708595276, + "learning_rate": 2.3424607477384176e-08, + "loss": 0.3738, + "step": 3961 + }, + { + "epoch": 2.917525773195876, + "grad_norm": 0.316893607378006, + "learning_rate": 2.3012164056534503e-08, + "loss": 0.3652, + "step": 3962 + }, + { + "epoch": 2.918262150220913, + "grad_norm": 0.3250206410884857, + "learning_rate": 2.260337558515291e-08, + "loss": 0.3752, + "step": 3963 + }, + { + "epoch": 2.9189985272459498, + "grad_norm": 0.34431540966033936, + "learning_rate": 2.2198242363439814e-08, + "loss": 0.3506, + "step": 3964 + }, + { + "epoch": 2.9197349042709866, + "grad_norm": 0.3138362765312195, + "learning_rate": 2.179676468891334e-08, + "loss": 0.3998, + "step": 3965 + }, + { + "epoch": 2.9204712812960234, + "grad_norm": 0.33245036005973816, + "learning_rate": 2.1398942856407646e-08, + "loss": 0.3787, + "step": 3966 + }, + { + "epoch": 2.92120765832106, + "grad_norm": 0.34872448444366455, + "learning_rate": 2.100477715806959e-08, + "loss": 0.3701, + "step": 3967 + }, + { + "epoch": 2.9219440353460975, + "grad_norm": 0.3229193091392517, + "learning_rate": 2.061426788336318e-08, + "loss": 0.3846, + "step": 3968 + }, + { + "epoch": 2.9226804123711343, + "grad_norm": 0.3165433406829834, + "learning_rate": 2.0227415319067355e-08, + "loss": 0.3617, + "step": 3969 + }, + { + "epoch": 2.923416789396171, + "grad_norm": 0.34778398275375366, + "learning_rate": 1.984421974927375e-08, + "loss": 0.3858, + "step": 3970 + }, + { + "epoch": 2.924153166421208, + "grad_norm": 0.31038519740104675, + "learning_rate": 1.946468145538949e-08, + "loss": 0.3647, + "step": 3971 + }, + { + "epoch": 2.9248895434462447, + "grad_norm": 0.2943412959575653, + "learning_rate": 1.908880071613717e-08, + "loss": 0.3628, + "step": 3972 + }, + { + "epoch": 2.9256259204712816, + "grad_norm": 0.29908487200737, + "learning_rate": 1.871657780755154e-08, + "loss": 0.3674, + "step": 3973 + }, + { + "epoch": 2.9263622974963184, + "grad_norm": 0.3493654131889343, + "learning_rate": 1.8348013002982278e-08, + "loss": 0.3899, + "step": 3974 + }, + { + "epoch": 2.927098674521355, + "grad_norm": 0.33574777841567993, + "learning_rate": 1.798310657309177e-08, + "loss": 0.3621, + "step": 3975 + }, + { + "epoch": 2.927835051546392, + "grad_norm": 0.3295884430408478, + "learning_rate": 1.7621858785856206e-08, + "loss": 0.3744, + "step": 3976 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 0.31094932556152344, + "learning_rate": 1.72642699065656e-08, + "loss": 0.3566, + "step": 3977 + }, + { + "epoch": 2.9293078055964656, + "grad_norm": 0.34367263317108154, + "learning_rate": 1.6910340197822116e-08, + "loss": 0.3691, + "step": 3978 + }, + { + "epoch": 2.9300441826215025, + "grad_norm": 0.32079145312309265, + "learning_rate": 1.6560069919541177e-08, + "loss": 0.3693, + "step": 3979 + }, + { + "epoch": 2.9307805596465393, + "grad_norm": 0.3374221920967102, + "learning_rate": 1.6213459328950355e-08, + "loss": 0.3858, + "step": 3980 + }, + { + "epoch": 2.931516936671576, + "grad_norm": 0.34414032101631165, + "learning_rate": 1.5870508680589923e-08, + "loss": 0.349, + "step": 3981 + }, + { + "epoch": 2.932253313696613, + "grad_norm": 0.3187008202075958, + "learning_rate": 1.5531218226312872e-08, + "loss": 0.3836, + "step": 3982 + }, + { + "epoch": 2.9329896907216497, + "grad_norm": 0.3022221624851227, + "learning_rate": 1.5195588215283773e-08, + "loss": 0.3954, + "step": 3983 + }, + { + "epoch": 2.9337260677466865, + "grad_norm": 0.29533877968788147, + "learning_rate": 1.4863618893979359e-08, + "loss": 0.3874, + "step": 3984 + }, + { + "epoch": 2.9344624447717234, + "grad_norm": 0.3319675624370575, + "learning_rate": 1.4535310506187394e-08, + "loss": 0.3379, + "step": 3985 + }, + { + "epoch": 2.93519882179676, + "grad_norm": 0.3372650444507599, + "learning_rate": 1.4210663293008353e-08, + "loss": 0.3825, + "step": 3986 + }, + { + "epoch": 2.935935198821797, + "grad_norm": 0.3170906901359558, + "learning_rate": 1.3889677492852083e-08, + "loss": 0.3742, + "step": 3987 + }, + { + "epoch": 2.936671575846834, + "grad_norm": 0.2827453315258026, + "learning_rate": 1.3572353341442246e-08, + "loss": 0.3538, + "step": 3988 + }, + { + "epoch": 2.9374079528718706, + "grad_norm": 0.31764787435531616, + "learning_rate": 1.3258691071811325e-08, + "loss": 0.357, + "step": 3989 + }, + { + "epoch": 2.9381443298969074, + "grad_norm": 0.31900376081466675, + "learning_rate": 1.2948690914303397e-08, + "loss": 0.3661, + "step": 3990 + }, + { + "epoch": 2.9388807069219443, + "grad_norm": 0.32145196199417114, + "learning_rate": 1.2642353096573578e-08, + "loss": 0.3865, + "step": 3991 + }, + { + "epoch": 2.939617083946981, + "grad_norm": 0.3384080231189728, + "learning_rate": 1.2339677843586917e-08, + "loss": 0.3477, + "step": 3992 + }, + { + "epoch": 2.940353460972018, + "grad_norm": 0.30081725120544434, + "learning_rate": 1.2040665377618944e-08, + "loss": 0.38, + "step": 3993 + }, + { + "epoch": 2.9410898379970547, + "grad_norm": 0.3142537772655487, + "learning_rate": 1.1745315918255118e-08, + "loss": 0.3647, + "step": 3994 + }, + { + "epoch": 2.9418262150220915, + "grad_norm": 0.3099748194217682, + "learning_rate": 1.1453629682391943e-08, + "loss": 0.3372, + "step": 3995 + }, + { + "epoch": 2.9425625920471283, + "grad_norm": 0.32146158814430237, + "learning_rate": 1.1165606884234182e-08, + "loss": 0.3711, + "step": 3996 + }, + { + "epoch": 2.943298969072165, + "grad_norm": 0.29835745692253113, + "learning_rate": 1.088124773529764e-08, + "loss": 0.3736, + "step": 3997 + }, + { + "epoch": 2.944035346097202, + "grad_norm": 0.32554247975349426, + "learning_rate": 1.0600552444406387e-08, + "loss": 0.3506, + "step": 3998 + }, + { + "epoch": 2.944771723122239, + "grad_norm": 0.3305593430995941, + "learning_rate": 1.032352121769553e-08, + "loss": 0.3846, + "step": 3999 + }, + { + "epoch": 2.9455081001472756, + "grad_norm": 0.3391202688217163, + "learning_rate": 1.0050154258607336e-08, + "loss": 0.3827, + "step": 4000 + }, + { + "epoch": 2.9462444771723124, + "grad_norm": 0.3027651309967041, + "learning_rate": 9.780451767895104e-09, + "loss": 0.358, + "step": 4001 + }, + { + "epoch": 2.9469808541973492, + "grad_norm": 0.3510773181915283, + "learning_rate": 9.514413943619849e-09, + "loss": 0.3837, + "step": 4002 + }, + { + "epoch": 2.947717231222386, + "grad_norm": 0.30630064010620117, + "learning_rate": 9.252040981151956e-09, + "loss": 0.3691, + "step": 4003 + }, + { + "epoch": 2.948453608247423, + "grad_norm": 0.31158211827278137, + "learning_rate": 8.993333073169519e-09, + "loss": 0.3981, + "step": 4004 + }, + { + "epoch": 2.9491899852724597, + "grad_norm": 0.31192636489868164, + "learning_rate": 8.738290409660566e-09, + "loss": 0.3855, + "step": 4005 + }, + { + "epoch": 2.9499263622974965, + "grad_norm": 0.33482906222343445, + "learning_rate": 8.486913177920275e-09, + "loss": 0.3726, + "step": 4006 + }, + { + "epoch": 2.9506627393225333, + "grad_norm": 0.32285335659980774, + "learning_rate": 8.239201562553201e-09, + "loss": 0.3276, + "step": 4007 + }, + { + "epoch": 2.95139911634757, + "grad_norm": 0.31974491477012634, + "learning_rate": 7.99515574546994e-09, + "loss": 0.4194, + "step": 4008 + }, + { + "epoch": 2.952135493372607, + "grad_norm": 0.33110713958740234, + "learning_rate": 7.754775905891576e-09, + "loss": 0.3572, + "step": 4009 + }, + { + "epoch": 2.9528718703976438, + "grad_norm": 0.3111792802810669, + "learning_rate": 7.518062220345235e-09, + "loss": 0.3705, + "step": 4010 + }, + { + "epoch": 2.9536082474226806, + "grad_norm": 0.29877275228500366, + "learning_rate": 7.285014862666862e-09, + "loss": 0.3592, + "step": 4011 + }, + { + "epoch": 2.9543446244477174, + "grad_norm": 0.3104023039340973, + "learning_rate": 7.055634003998446e-09, + "loss": 0.3524, + "step": 4012 + }, + { + "epoch": 2.955081001472754, + "grad_norm": 0.3165355920791626, + "learning_rate": 6.829919812790797e-09, + "loss": 0.3818, + "step": 4013 + }, + { + "epoch": 2.955817378497791, + "grad_norm": 0.32550013065338135, + "learning_rate": 6.607872454801878e-09, + "loss": 0.3589, + "step": 4014 + }, + { + "epoch": 2.956553755522828, + "grad_norm": 0.3074639141559601, + "learning_rate": 6.38949209309625e-09, + "loss": 0.3638, + "step": 4015 + }, + { + "epoch": 2.9572901325478647, + "grad_norm": 0.316501259803772, + "learning_rate": 6.174778888046184e-09, + "loss": 0.3775, + "step": 4016 + }, + { + "epoch": 2.9580265095729015, + "grad_norm": 0.33413442969322205, + "learning_rate": 5.963732997329996e-09, + "loss": 0.3577, + "step": 4017 + }, + { + "epoch": 2.9587628865979383, + "grad_norm": 0.3284623622894287, + "learning_rate": 5.756354575934265e-09, + "loss": 0.3684, + "step": 4018 + }, + { + "epoch": 2.959499263622975, + "grad_norm": 0.32929477095603943, + "learning_rate": 5.552643776150501e-09, + "loss": 0.3902, + "step": 4019 + }, + { + "epoch": 2.960235640648012, + "grad_norm": 0.3296683728694916, + "learning_rate": 5.352600747577929e-09, + "loss": 0.3649, + "step": 4020 + }, + { + "epoch": 2.9609720176730487, + "grad_norm": 0.32667848467826843, + "learning_rate": 5.1562256371229245e-09, + "loss": 0.3842, + "step": 4021 + }, + { + "epoch": 2.9617083946980856, + "grad_norm": 0.33496248722076416, + "learning_rate": 4.9635185889967966e-09, + "loss": 0.3862, + "step": 4022 + }, + { + "epoch": 2.9624447717231224, + "grad_norm": 0.2943912744522095, + "learning_rate": 4.774479744717453e-09, + "loss": 0.3649, + "step": 4023 + }, + { + "epoch": 2.963181148748159, + "grad_norm": 0.3493749499320984, + "learning_rate": 4.589109243109957e-09, + "loss": 0.3731, + "step": 4024 + }, + { + "epoch": 2.963917525773196, + "grad_norm": 0.35220539569854736, + "learning_rate": 4.4074072203048605e-09, + "loss": 0.387, + "step": 4025 + }, + { + "epoch": 2.964653902798233, + "grad_norm": 0.3216170370578766, + "learning_rate": 4.2293738097376465e-09, + "loss": 0.403, + "step": 4026 + }, + { + "epoch": 2.9653902798232696, + "grad_norm": 0.3366626799106598, + "learning_rate": 4.055009142152066e-09, + "loss": 0.3539, + "step": 4027 + }, + { + "epoch": 2.9661266568483065, + "grad_norm": 0.3413197994232178, + "learning_rate": 3.884313345595137e-09, + "loss": 0.3747, + "step": 4028 + }, + { + "epoch": 2.9668630338733433, + "grad_norm": 0.3060031831264496, + "learning_rate": 3.7172865454210282e-09, + "loss": 0.3525, + "step": 4029 + }, + { + "epoch": 2.96759941089838, + "grad_norm": 0.3200681507587433, + "learning_rate": 3.553928864289402e-09, + "loss": 0.3714, + "step": 4030 + }, + { + "epoch": 2.968335787923417, + "grad_norm": 0.3404831886291504, + "learning_rate": 3.394240422164852e-09, + "loss": 0.3466, + "step": 4031 + }, + { + "epoch": 2.9690721649484537, + "grad_norm": 0.3348984122276306, + "learning_rate": 3.238221336318015e-09, + "loss": 0.3704, + "step": 4032 + }, + { + "epoch": 2.9698085419734905, + "grad_norm": 0.3363956809043884, + "learning_rate": 3.0858717213250176e-09, + "loss": 0.3904, + "step": 4033 + }, + { + "epoch": 2.9705449189985274, + "grad_norm": 0.3241819441318512, + "learning_rate": 2.9371916890658105e-09, + "loss": 0.3679, + "step": 4034 + }, + { + "epoch": 2.971281296023564, + "grad_norm": 0.34104812145233154, + "learning_rate": 2.792181348726941e-09, + "loss": 0.3592, + "step": 4035 + }, + { + "epoch": 2.972017673048601, + "grad_norm": 0.3131769597530365, + "learning_rate": 2.6508408067998926e-09, + "loss": 0.3541, + "step": 4036 + }, + { + "epoch": 2.972754050073638, + "grad_norm": 0.3056182265281677, + "learning_rate": 2.5131701670805252e-09, + "loss": 0.3641, + "step": 4037 + }, + { + "epoch": 2.9734904270986746, + "grad_norm": 0.3001095652580261, + "learning_rate": 2.379169530670744e-09, + "loss": 0.3596, + "step": 4038 + }, + { + "epoch": 2.9742268041237114, + "grad_norm": 0.33647263050079346, + "learning_rate": 2.2488389959751666e-09, + "loss": 0.3457, + "step": 4039 + }, + { + "epoch": 2.9749631811487482, + "grad_norm": 0.31432968378067017, + "learning_rate": 2.12217865870612e-09, + "loss": 0.3862, + "step": 4040 + }, + { + "epoch": 2.975699558173785, + "grad_norm": 0.3569965362548828, + "learning_rate": 1.999188611878089e-09, + "loss": 0.3556, + "step": 4041 + }, + { + "epoch": 2.976435935198822, + "grad_norm": 0.31356170773506165, + "learning_rate": 1.8798689458116025e-09, + "loss": 0.3892, + "step": 4042 + }, + { + "epoch": 2.9771723122238587, + "grad_norm": 0.31526947021484375, + "learning_rate": 1.7642197481315682e-09, + "loss": 0.3534, + "step": 4043 + }, + { + "epoch": 2.9779086892488955, + "grad_norm": 0.29720044136047363, + "learning_rate": 1.6522411037667162e-09, + "loss": 0.3851, + "step": 4044 + }, + { + "epoch": 2.9786450662739323, + "grad_norm": 0.2992621064186096, + "learning_rate": 1.5439330949518216e-09, + "loss": 0.3677, + "step": 4045 + }, + { + "epoch": 2.979381443298969, + "grad_norm": 0.3420858681201935, + "learning_rate": 1.4392958012238167e-09, + "loss": 0.3943, + "step": 4046 + }, + { + "epoch": 2.980117820324006, + "grad_norm": 0.3347738981246948, + "learning_rate": 1.338329299425678e-09, + "loss": 0.3472, + "step": 4047 + }, + { + "epoch": 2.9808541973490428, + "grad_norm": 0.3168458640575409, + "learning_rate": 1.2410336637047604e-09, + "loss": 0.3886, + "step": 4048 + }, + { + "epoch": 2.9815905743740796, + "grad_norm": 0.3121738135814667, + "learning_rate": 1.147408965511132e-09, + "loss": 0.3921, + "step": 4049 + }, + { + "epoch": 2.9823269513991164, + "grad_norm": 0.30733251571655273, + "learning_rate": 1.0574552735997945e-09, + "loss": 0.3648, + "step": 4050 + }, + { + "epoch": 2.9830633284241532, + "grad_norm": 0.32336708903312683, + "learning_rate": 9.711726540312383e-10, + "loss": 0.3551, + "step": 4051 + }, + { + "epoch": 2.98379970544919, + "grad_norm": 0.3032035529613495, + "learning_rate": 8.885611701675567e-10, + "loss": 0.35, + "step": 4052 + }, + { + "epoch": 2.984536082474227, + "grad_norm": 0.33436474204063416, + "learning_rate": 8.09620882676887e-10, + "loss": 0.3755, + "step": 4053 + }, + { + "epoch": 2.9852724594992637, + "grad_norm": 0.3163788318634033, + "learning_rate": 7.343518495300794e-10, + "loss": 0.372, + "step": 4054 + }, + { + "epoch": 2.9860088365243005, + "grad_norm": 0.31104227900505066, + "learning_rate": 6.62754126002918e-10, + "loss": 0.3509, + "step": 4055 + }, + { + "epoch": 2.9867452135493373, + "grad_norm": 0.3354119062423706, + "learning_rate": 5.948277646744549e-10, + "loss": 0.3447, + "step": 4056 + }, + { + "epoch": 2.987481590574374, + "grad_norm": 0.32740065455436707, + "learning_rate": 5.305728154275658e-10, + "loss": 0.3559, + "step": 4057 + }, + { + "epoch": 2.988217967599411, + "grad_norm": 0.32162371277809143, + "learning_rate": 4.699893254495047e-10, + "loss": 0.3998, + "step": 4058 + }, + { + "epoch": 2.9889543446244478, + "grad_norm": 0.3500117063522339, + "learning_rate": 4.1307733923079407e-10, + "loss": 0.3904, + "step": 4059 + }, + { + "epoch": 2.9896907216494846, + "grad_norm": 0.34684303402900696, + "learning_rate": 3.5983689856522453e-10, + "loss": 0.3776, + "step": 4060 + }, + { + "epoch": 2.9904270986745214, + "grad_norm": 0.31505316495895386, + "learning_rate": 3.1026804255207544e-10, + "loss": 0.3636, + "step": 4061 + }, + { + "epoch": 2.991163475699558, + "grad_norm": 0.3074565529823303, + "learning_rate": 2.643708075922291e-10, + "loss": 0.3477, + "step": 4062 + }, + { + "epoch": 2.991899852724595, + "grad_norm": 0.3139492869377136, + "learning_rate": 2.2214522739205657e-10, + "loss": 0.3694, + "step": 4063 + }, + { + "epoch": 2.992636229749632, + "grad_norm": 0.33408764004707336, + "learning_rate": 1.835913329600869e-10, + "loss": 0.4098, + "step": 4064 + }, + { + "epoch": 2.9933726067746687, + "grad_norm": 0.33919352293014526, + "learning_rate": 1.487091526097828e-10, + "loss": 0.3638, + "step": 4065 + }, + { + "epoch": 2.9941089837997055, + "grad_norm": 0.30452626943588257, + "learning_rate": 1.174987119573201e-10, + "loss": 0.356, + "step": 4066 + }, + { + "epoch": 2.9948453608247423, + "grad_norm": 0.32732394337654114, + "learning_rate": 8.996003392214292e-11, + "loss": 0.374, + "step": 4067 + }, + { + "epoch": 2.995581737849779, + "grad_norm": 0.2973012626171112, + "learning_rate": 6.609313872862899e-11, + "loss": 0.3513, + "step": 4068 + }, + { + "epoch": 2.996318114874816, + "grad_norm": 0.30825671553611755, + "learning_rate": 4.5898043903314096e-11, + "loss": 0.3687, + "step": 4069 + }, + { + "epoch": 2.9970544918998527, + "grad_norm": 0.3054065406322479, + "learning_rate": 2.93747642771125e-11, + "loss": 0.3703, + "step": 4070 + }, + { + "epoch": 2.9977908689248896, + "grad_norm": 0.3413209617137909, + "learning_rate": 1.6523311984206757e-11, + "loss": 0.3665, + "step": 4071 + }, + { + "epoch": 2.9985272459499264, + "grad_norm": 0.3274330496788025, + "learning_rate": 7.343696462047689e-12, + "loss": 0.3562, + "step": 4072 + }, + { + "epoch": 2.999263622974963, + "grad_norm": 0.31068921089172363, + "learning_rate": 1.8359244524646103e-12, + "loss": 0.3785, + "step": 4073 + }, + { + "epoch": 3.0, + "grad_norm": 0.31857830286026, + "learning_rate": 0.0, + "loss": 0.344, + "step": 4074 + }, + { + "epoch": 3.0, + "step": 4074, + "total_flos": 4823744474644480.0, + "train_loss": 0.42331435592460115, + "train_runtime": 136035.2407, + "train_samples_per_second": 2.875, + "train_steps_per_second": 0.03 + } + ], + "logging_steps": 1.0, + "max_steps": 4074, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4823744474644480.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}